Migration is supported on KVM/Xen hypervisors only. Finalize methods are called in both success or suspected failure (the master node can't certainly know about failures - e.g. in case of timeouts). Thus these functions need to be as robust as possible in order to be able to recover from failures - in case success=False the functions should be conservative, trying to detect the actual state and idempotent. The cleanup workaround works for XM, on XL it is a no-op.
Signed-off-by: Viktor Bachraty <[email protected]> --- lib/hypervisor/hv_kvm/__init__.py | 16 ++++--- lib/hypervisor/hv_xen.py | 99 ++++++++++++++++++++++++++++++++------- 2 files changed, 92 insertions(+), 23 deletions(-) diff --git a/lib/hypervisor/hv_kvm/__init__.py b/lib/hypervisor/hv_kvm/__init__.py index 456f493..9192348 100644 --- a/lib/hypervisor/hv_kvm/__init__.py +++ b/lib/hypervisor/hv_kvm/__init__.py @@ -2467,24 +2467,28 @@ class KVMHypervisor(hv_base.BaseHypervisor): migrate_command = "migrate -d tcp:%s:%s" % (target, port) self._CallMonitorCommand(instance_name, migrate_command) - def FinalizeMigrationSource(self, instance, success, live): + def FinalizeMigrationSource(self, instance, success, _): """Finalize the instance migration on the source node. @type instance: L{objects.Instance} @param instance: the instance that was migrated @type success: bool @param success: whether the migration succeeded or not - @type live: bool - @param live: whether the user requested a live migration or not """ if success: pidfile, pid, _ = self._InstancePidAlive(instance.name) utils.KillProcess(pid) self._RemoveInstanceRuntimeFiles(pidfile, instance.name) - elif live: - self._CallMonitorCommand(instance.name, self._CONT_CMD) - self._ClearUserShutdown(instance.name) + self._ClearUserShutdown(instance.name) + else: + # Detect if PID is alive rather than deciding if we were to perform a live + # migration. + _, _, alive = self._InstancePidAlive(instance.name) + if alive: + self._CallMonitorCommand(instance.name, self._CONT_CMD) + else: + self.CleanupInstance(instance.name) def GetMigrationStatus(self, instance): """Get the migration status diff --git a/lib/hypervisor/hv_xen.py b/lib/hypervisor/hv_xen.py index 7f2c839..9ba2ae8 100644 --- a/lib/hypervisor/hv_xen.py +++ b/lib/hypervisor/hv_xen.py @@ -148,6 +148,7 @@ def _ParseInstanceList(lines, include_node): raise errors.HypervisorError("Can't parse instance list," " line: %s" % line) try: + # TODO: Cleanup this mess - introduce a namedtuple/dict/class data[1] = int(data[1]) data[2] = int(data[2]) data[3] = int(data[3]) @@ -168,6 +169,26 @@ def _ParseInstanceList(lines, include_node): return result +def _InstanceDomID(info): + """Get instance domain ID from instance info tuple. + @type info: tuple + @param info: instance info as parsed by _ParseInstanceList() + + @return: int, instance domain ID + """ + return info[1] + + +def _InstanceRuntime(info): + """Get instance runtime from instance info tuple. + @type info: tuple + @param info: instance info as parsed by _ParseInstanceList() + + @return: float value of instance runtime + """ + return info[5] + + def _GetAllInstanceList(fn, include_node, delays, timeout): """Return the list of instances including running and shutdown. @@ -941,7 +962,7 @@ class XenHypervisor(hv_base.BaseHypervisor): return self._RunXen(["shutdown", "-w", name], hvparams, timeout) def _DestroyInstance(self, name, hvparams): - """Destroy an instance if the instance if the instance exists. + """Destroy an instance if the instance exists. @type name: string @param name: name of the instance to destroy @@ -974,6 +995,19 @@ class XenHypervisor(hv_base.BaseHypervisor): else: self._DestroyInstance(name, hvparams) + def _RenameInstance(self, old_name, new_name, hvparams): + """Rename an instance (domain). + + @type old_name: string + @param old_name: current name of the instance + @type new_name: string + @param new_name: future (requested) name of the instace + @type hvparams: dict of string + @param hvparams: hypervisor parameters of the instance + + """ + return self._RunXen(["rename", old_name, new_name], hvparams) + def _StopInstance(self, name, force, hvparams, timeout): """Stop an instance. @@ -1034,7 +1068,8 @@ class XenHypervisor(hv_base.BaseHypervisor): # check if the domain ID has changed or the run time has decreased if (new_info is not None and - (new_info[1] != ini_info[1] or new_info[5] < ini_info[5])): + (_InstanceDomID(new_info) != _InstanceDomID(ini_info) or ( + _InstanceRuntime(new_info) < _InstanceRuntime(ini_info)))): return raise utils.RetryAgain() @@ -1208,24 +1243,39 @@ class XenHypervisor(hv_base.BaseHypervisor): " on port %d: check if port is available" % port) - def FinalizeMigrationDst(self, instance, info, success): + def FinalizeMigrationDst(self, instance, config, success): """Finalize an instance migration. - After a successful migration we write the xen config file. - We do nothing on a failure, as we did not change anything at accept time. + Write a config file if the instance is running on the destination node + regardles if we think the migration succeeded or not. This will cover cases, + when the migration succeeded but due to a timeout on the source node we + think it failed. If we think the migration failed and there is an unstarted + domain, clean it up. @type instance: L{objects.Instance} @param instance: instance whose migration is being finalized - @type info: string - @param info: content of the xen config file on the source node + @type config: string + @param config: content of the xen config file from the source node @type success: boolean - @param success: whether the migration was a success or a failure + @param success: whether the master node thinks the migration succeeded """ - if success: - self._WriteConfigFile(instance.name, info) - elif self._UseMigrationDaemon(instance.hvparams): - XenHypervisor._KillMigrationDaemon(instance) + + # We should recreate the config file if the domain is present and running, + # regardless if we think the migration succeeded or not. + info = self.GetInstanceInfo(instance.name, hvparams=instance.hvparams) + if info and _InstanceRuntime(info) != 0: + self._WriteConfigFile(instance.name, config) + + if not success: + if self._UseMigrationDaemon(instance.hvparams): + XenHypervisor._KillMigrationDaemon(instance) + + # Fix the common failure when the domain was created but never started: + # this happens if the memory transfer didn't complete and the instance + # is running on the source node. + if info and _InstanceRuntime(info) == 0: + self._DestroyInstance(instance.name, instance.hvparams) def MigrateInstance(self, _cluster_name, instance, target, live): """Migrate an instance to a target node. @@ -1291,24 +1341,39 @@ class XenHypervisor(hv_base.BaseHypervisor): raise errors.HypervisorError("Failed to migrate instance %s: %s" % (instance_name, result.output)) - def FinalizeMigrationSource(self, instance, success, live): + def FinalizeMigrationSource(self, instance, success, _): """Finalize the instance migration on the source node. @type instance: L{objects.Instance} @param instance: the instance that was migrated @type success: bool - @param success: whether the migration succeeded or not - @type live: bool - @param live: whether the user requested a live migration or not + @param success: whether the master thinks the migration succeeded """ # pylint: disable=W0613 if success: - # remove old xen file after migration succeeded + # Remove old xen file after migration succeeded + # Note that _RemoveConfigFile silently succeeds if the file is already + # deleted, that makes this function idempotent try: self._RemoveConfigFile(instance.name) except EnvironmentError: logging.exception("Failure while removing instance config file") + else: + # Cleanup the most common failure case when the source instance fails + # to freeze and remains running renamed: + # XM: renamed to 'migrating-${oldname}' + # XL: renamed to '${oldname}--migratedaway' + + temp_name, info = None, None + for n in ['migrating-' + instance.name, instance.name + '--migratedaway']: + info = self.GetInstanceInfo(n, hvparams=instance.hvparams) + if info: + temp_name = n + break + + if info: + self._RenameInstance(temp_name, instance.name, instance.hvparams) def GetMigrationStatus(self, instance): """Get the migration status -- 2.8.0.rc3.226.g39d4020
