FinalizeMigrationSource and FinalizeMigrationDst should compose an
'atomic' operation consisting of 2 idempotent steps that can always
both be retried, similarly to how they are used in AbortMigration().
Otherwise we would have to remember at which step the migration failed
in order to be able to roll back/recover. Also, the state of the record
should be updated only if both steps succeeded; otherwise, in case of
recovery, we would not know whether the record corresponds to the
original state or the final state.

Signed-off-by: Viktor Bachraty <[email protected]>
---
 lib/cmdlib/instance_migration.py | 45 ++++++++++++++++++++++++----------------
 1 file changed, 27 insertions(+), 18 deletions(-)

diff --git a/lib/cmdlib/instance_migration.py b/lib/cmdlib/instance_migration.py
index b93b334..0d96dc0 100644
--- a/lib/cmdlib/instance_migration.py
+++ b/lib/cmdlib/instance_migration.py
@@ -872,30 +872,39 @@ class TLMigrateInstance(Tasklet):
 
       time.sleep(self._MIGRATION_POLL_INTERVAL)
 
-    result = self.rpc.call_instance_finalize_migration_src(
-               self.source_node_uuid, self.instance, True, self.live)
-    msg = result.fail_msg
-    if msg:
+    # Always call finalize on both source and destination; it should be seen
+    # as an 'atomic' operation running on two hosts (as in _AbortMigration).
+    result_src = self.rpc.call_instance_finalize_migration_src(
+        self.source_node_uuid, self.instance, True, self.live)
+
+    result_dst = self.rpc.call_instance_finalize_migration_dst(
+        self.target_node_uuid, self.instance, migration_info, True)
+
+    err_msg = []
+    if result_src.fail_msg:
       logging.error("Instance migration succeeded, but finalization failed"
-                    " on the source node: %s", msg)
-      raise errors.OpExecError("Could not finalize instance migration: %s" %
-                               msg)
+                    " on the source node: %s", result_src.fail_msg)
+      err_msg.append(self.cfg.GetNodeName(self.source_node_uuid) + ': '
+                     + result_src.fail_msg)
+
+    if result_dst.fail_msg:
+      logging.error("Instance migration succeeded, but finalization failed"
+                    " on the target node: %s", result_dst.fail_msg)
+      err_msg.append(self.cfg.GetNodeName(self.target_node_uuid) + ': '
+                     + result_dst.fail_msg)
+
+    if err_msg:
+      raise errors.OpExecError(
+          "Could not finalize instance migration: %s" % ' '.join(err_msg))
 
+    # Update instance location only after finalize completed. This way if the
+    # instance runs on both nodes we know which one was the primary before the
+    # migration in case we need to do cleanup.
     self.cfg.SetInstancePrimaryNode(self.instance.uuid, self.target_node_uuid)
     self.instance = self.cfg.GetInstanceInfo(self.instance_uuid)
-    disks = self.cfg.GetInstanceDisks(self.instance_uuid)
-
-    result = self.rpc.call_instance_finalize_migration_dst(
-               self.target_node_uuid, self.instance, migration_info, True)
-    msg = result.fail_msg
-    if msg:
-      logging.error("Instance migration succeeded, but finalization failed"
-                    " on the target node: %s", msg)
-      raise errors.OpExecError("Could not finalize instance migration: %s" %
-                               msg)
 
     self._CloseInstanceDisks(self.source_node_uuid)
-
+    disks = self.cfg.GetInstanceDisks(self.instance_uuid)
     if utils.AnyDiskOfType(disks, constants.DTS_INT_MIRROR):
       self._WaitUntilSync()
       self._GoStandalone()
-- 
2.8.0.rc3.226.g39d4020

Reply via email to