Migration is supported on KVM/Xen hypervisors only. Finalize methods are
called in both success or suspected failure (the master node can't
certainly know about failures - e.g. in case of timeouts). Thus these
functions need to be as robust as possible in order to be able to
recover from failures - in case success=False the functions should be
conservative, trying to detect the actual state and idempotent. The
cleanup workaround works for XM, on XL it is a no-op.

Signed-off-by: Viktor Bachraty <[email protected]>
---
 lib/hypervisor/hv_kvm/__init__.py | 16 ++++---
 lib/hypervisor/hv_xen.py          | 99 ++++++++++++++++++++++++++++++++-------
 2 files changed, 92 insertions(+), 23 deletions(-)

diff --git a/lib/hypervisor/hv_kvm/__init__.py 
b/lib/hypervisor/hv_kvm/__init__.py
index 456f493..9192348 100644
--- a/lib/hypervisor/hv_kvm/__init__.py
+++ b/lib/hypervisor/hv_kvm/__init__.py
@@ -2467,24 +2467,28 @@ class KVMHypervisor(hv_base.BaseHypervisor):
     migrate_command = "migrate -d tcp:%s:%s" % (target, port)
     self._CallMonitorCommand(instance_name, migrate_command)
 
-  def FinalizeMigrationSource(self, instance, success, live):
+  def FinalizeMigrationSource(self, instance, success, _):
     """Finalize the instance migration on the source node.
 
     @type instance: L{objects.Instance}
     @param instance: the instance that was migrated
     @type success: bool
     @param success: whether the migration succeeded or not
-    @type live: bool
-    @param live: whether the user requested a live migration or not
 
     """
     if success:
       pidfile, pid, _ = self._InstancePidAlive(instance.name)
       utils.KillProcess(pid)
       self._RemoveInstanceRuntimeFiles(pidfile, instance.name)
-    elif live:
-      self._CallMonitorCommand(instance.name, self._CONT_CMD)
-    self._ClearUserShutdown(instance.name)
+      self._ClearUserShutdown(instance.name)
+    else:
+      # Detect if PID is alive rather than deciding if we were to perform a 
live
+      # migration.
+      _, _, alive = self._InstancePidAlive(instance.name)
+      if alive:
+        self._CallMonitorCommand(instance.name, self._CONT_CMD)
+      else:
+        self.CleanupInstance(instance.name)
 
   def GetMigrationStatus(self, instance):
     """Get the migration status
diff --git a/lib/hypervisor/hv_xen.py b/lib/hypervisor/hv_xen.py
index 7f2c839..9ba2ae8 100644
--- a/lib/hypervisor/hv_xen.py
+++ b/lib/hypervisor/hv_xen.py
@@ -148,6 +148,7 @@ def _ParseInstanceList(lines, include_node):
       raise errors.HypervisorError("Can't parse instance list,"
                                    " line: %s" % line)
     try:
+      # TODO: Cleanup this mess - introduce a namedtuple/dict/class
       data[1] = int(data[1])
       data[2] = int(data[2])
       data[3] = int(data[3])
@@ -168,6 +169,26 @@ def _ParseInstanceList(lines, include_node):
   return result
 
 
+def _InstanceDomID(info):
+  """Get instance domain ID from instance info tuple.
+  @type info: tuple
+  @param info: instance info as parsed by _ParseInstanceList()
+
+  @return: int, instance domain ID
+  """
+  return info[1]
+
+
+def _InstanceRuntime(info):
+  """Get instance runtime from instance info tuple.
+  @type info: tuple
+  @param info: instance info as parsed by _ParseInstanceList()
+
+  @return: float value of instance runtime
+  """
+  return info[5]
+
+
 def _GetAllInstanceList(fn, include_node, delays, timeout):
   """Return the list of instances including running and shutdown.
 
@@ -941,7 +962,7 @@ class XenHypervisor(hv_base.BaseHypervisor):
     return self._RunXen(["shutdown", "-w", name], hvparams, timeout)
 
   def _DestroyInstance(self, name, hvparams):
-    """Destroy an instance if the instance if the instance exists.
+    """Destroy an instance if the instance exists.
 
     @type name: string
     @param name: name of the instance to destroy
@@ -974,6 +995,19 @@ class XenHypervisor(hv_base.BaseHypervisor):
     else:
       self._DestroyInstance(name, hvparams)
 
+  def _RenameInstance(self, old_name, new_name, hvparams):
+    """Rename an instance (domain).
+
+    @type old_name: string
+    @param old_name: current name of the instance
+    @type new_name: string
+    @param new_name: future (requested) name of the instace
+    @type hvparams: dict of string
+    @param hvparams: hypervisor parameters of the instance
+
+    """
+    return self._RunXen(["rename", old_name, new_name], hvparams)
+
   def _StopInstance(self, name, force, hvparams, timeout):
     """Stop an instance.
 
@@ -1034,7 +1068,8 @@ class XenHypervisor(hv_base.BaseHypervisor):
 
       # check if the domain ID has changed or the run time has decreased
       if (new_info is not None and
-          (new_info[1] != ini_info[1] or new_info[5] < ini_info[5])):
+          (_InstanceDomID(new_info) != _InstanceDomID(ini_info) or (
+              _InstanceRuntime(new_info) < _InstanceRuntime(ini_info)))):
         return
 
       raise utils.RetryAgain()
@@ -1208,24 +1243,39 @@ class XenHypervisor(hv_base.BaseHypervisor):
                                      " on port %d: check if port is available" 
%
                                      port)
 
-  def FinalizeMigrationDst(self, instance, info, success):
+  def FinalizeMigrationDst(self, instance, config, success):
     """Finalize an instance migration.
 
-    After a successful migration we write the xen config file.
-    We do nothing on a failure, as we did not change anything at accept time.
+    Write a config file if the instance is running on the destination node
+    regardles if we think the migration succeeded or not. This will cover 
cases,
+    when the migration succeeded but due to a timeout on the source node we
+    think it failed. If we think the migration failed and there is an unstarted
+    domain, clean it up.
 
     @type instance: L{objects.Instance}
     @param instance: instance whose migration is being finalized
-    @type info: string
-    @param info: content of the xen config file on the source node
+    @type config: string
+    @param config: content of the xen config file from the source node
     @type success: boolean
-    @param success: whether the migration was a success or a failure
+    @param success: whether the master node thinks the migration succeeded
 
     """
-    if success:
-      self._WriteConfigFile(instance.name, info)
-    elif self._UseMigrationDaemon(instance.hvparams):
-      XenHypervisor._KillMigrationDaemon(instance)
+
+    # We should recreate the config file if the domain is present and running,
+    # regardless if we think the migration succeeded or not.
+    info = self.GetInstanceInfo(instance.name, hvparams=instance.hvparams)
+    if info and _InstanceRuntime(info) != 0:
+      self._WriteConfigFile(instance.name, config)
+
+    if not success:
+      if self._UseMigrationDaemon(instance.hvparams):
+        XenHypervisor._KillMigrationDaemon(instance)
+
+      # Fix the common failure when the domain was created but never started:
+      # this happens if the memory transfer didn't complete and the instance
+      # is running on the source node.
+      if info and _InstanceRuntime(info) == 0:
+        self._DestroyInstance(instance.name, instance.hvparams)
 
   def MigrateInstance(self, _cluster_name, instance, target, live):
     """Migrate an instance to a target node.
@@ -1291,24 +1341,39 @@ class XenHypervisor(hv_base.BaseHypervisor):
       raise errors.HypervisorError("Failed to migrate instance %s: %s" %
                                    (instance_name, result.output))
 
-  def FinalizeMigrationSource(self, instance, success, live):
+  def FinalizeMigrationSource(self, instance, success, _):
     """Finalize the instance migration on the source node.
 
     @type instance: L{objects.Instance}
     @param instance: the instance that was migrated
     @type success: bool
-    @param success: whether the migration succeeded or not
-    @type live: bool
-    @param live: whether the user requested a live migration or not
+    @param success: whether the master thinks the migration succeeded
 
     """
     # pylint: disable=W0613
     if success:
-      # remove old xen file after migration succeeded
+      # Remove old xen file after migration succeeded
+      # Note that _RemoveConfigFile silently succeeds if the file is already
+      # deleted, that makes this function idempotent
       try:
         self._RemoveConfigFile(instance.name)
       except EnvironmentError:
         logging.exception("Failure while removing instance config file")
+    else:
+      # Cleanup the most common failure case when the source instance fails
+      # to freeze and remains running renamed:
+      # XM: renamed to 'migrating-${oldname}'
+      # XL: renamed to '${oldname}--migratedaway'
+
+      temp_name, info = None, None
+      for n in ['migrating-' + instance.name, instance.name + 
'--migratedaway']:
+        info = self.GetInstanceInfo(n, hvparams=instance.hvparams)
+        if info:
+          temp_name = n
+          break
+
+      if info:
+        self._RenameInstance(temp_name, instance.name, instance.hvparams)
 
   def GetMigrationStatus(self, instance):
     """Get the migration status
-- 
2.8.0.rc3.226.g39d4020

Reply via email to