Federico Simoncelli has uploaded a new change for review.

Change subject: sp: improve masterMigrate safety
......................................................................

sp: improve masterMigrate safety

The goal of this patch is to clean, optimize and give robustness to
the masterMigrate method.

In this patch:
- acquire the cluster lock before start using the new master filesystem
- refactor out all the required metadata changes in switchMasterDomain
  (rename copyPoolMD to switchMasterDomain, and include the domain role
  changes)
- divde the method in two parts (operations required to switch to the
  new master and previous master cleanup)
- address the two parts exception handling (hard failure for the first,
  logging only for the second)

Change-Id: I155b36d41df43230f99ba610699bba05bee3f0c0
Signed-off-by: Federico Simoncelli <[email protected]>
---
M vdsm/storage/sp.py
1 file changed, 61 insertions(+), 78 deletions(-)


  git pull ssh://gerrit.ovirt.org:29418/vdsm refs/changes/19/22419/1

diff --git a/vdsm/storage/sp.py b/vdsm/storage/sp.py
index 19d07b3..c370b52 100644
--- a/vdsm/storage/sp.py
+++ b/vdsm/storage/sp.py
@@ -836,27 +836,29 @@
             else:
                 futureMaster.releaseClusterLock()
 
-    @unsecured
-    def copyPoolMD(self, prevMd, newMD):
-        prevPoolMD = self._getPoolMD(prevMd)
+    def switchMasterDomain(self, currentMasterDomain, newMasterDomain,
+                           newMasterVersion):
+        prevPoolMD = self._getPoolMD(currentMasterDomain)
         domains = prevPoolMD[PMDK_DOMAINS]
         pool_descr = prevPoolMD[PMDK_POOL_DESCRIPTION]
         lver = prevPoolMD[PMDK_LVER]
         spmId = prevPoolMD[PMDK_SPM_ID]
         # This is actually domain metadata, But I can't change this because of
         # backward compatibility
-        leaseParams = prevMd.getLeaseParams()
+        leaseParams = currentMasterDomain.getLeaseParams()
 
         # Now insert pool metadata into new mastersd metadata
 
-        newPoolMD = self._getPoolMD(newMD)
+        newPoolMD = self._getPoolMD(newMasterDomain)
         with newPoolMD.transaction():
             newPoolMD.update({
                 PMDK_DOMAINS: domains,
                 PMDK_POOL_DESCRIPTION: pool_descr,
                 PMDK_LVER: lver,
-                PMDK_SPM_ID: spmId})
-            newMD.changeLeaseParams(leaseParams)
+                PMDK_SPM_ID: spmId,
+                PMDK_MASTER_VER: newMasterVersion,
+            })
+            newMasterDomain.changeLeaseParams(leaseParams)
 
     @unsecured
     def _copyLeaseParameters(self, srcDomain, dstDomain):
@@ -865,7 +867,6 @@
                       srcDomain.sdUUID, leaseParams)
         dstDomain.changeLeaseParams(leaseParams)
 
-    @unsecured
     def masterMigrate(self, sdUUID, msdUUID, masterVersion):
         self.log.info("sdUUID=%s spUUID=%s msdUUID=%s", sdUUID, self.spUUID,
                       msdUUID)
@@ -895,95 +896,77 @@
         if newmsd.isBackup():
             raise se.BackupCannotBeMasterDomain(msdUUID)
 
-        # Copy master file system content to the new master
-        src = os.path.join(curmsd.domaindir, sd.MASTER_FS_DIR)
-        dst = os.path.join(newmsd.domaindir, sd.MASTER_FS_DIR)
+        # If the new master domain is using safelease (version < 3) then
+        # we can speed up the cluster lock acquirement by resetting the
+        # SPM lease.
+        # XXX: With SANLock there is no need to speed up the process (the
+        # acquirement will take a short time anyway since we already hold
+        # the host id) and more importantly resetting the lease is going
+        # to interfere with the regular SANLock behavior.
+        # @deprecated this is relevant only for domain version < 3
+        if not newmsd.hasVolumeLeases():
+            newmsd.initSPMlease()
 
-        # Mount new master file system
-        newmsd.mountMaster()
-        # Make sure there is no cruft left over
-        for dir in [newmsd.getVMsDir(), newmsd.getTasksDir()]:
-            fileUtils.cleanupdir(dir)
+        # Forcing to acquire the host id (if it's not acquired already)
+        # and acquiring the cluster lock on new master.
+        newmsd.acquireHostId(self.id)
+        newmsd.acquireClusterLock(self.id)
 
         try:
-            fileUtils.tarCopy(src, dst, exclude=("./lost+found",))
-        except fileUtils.TarCopyFailed:
-            self.log.error("tarCopy failed", exc_info=True)
-            # Failed to copy the master data
+            self.log.debug('migration to the new master %s begins',
+                           newmsd.sdUUID)
+
+            # Mount new master file system
+            newmsd.mountMaster()
+
+            # Make sure there is no cruft left over
+            for dir in [newmsd.getVMsDir(), newmsd.getTasksDir()]:
+                fileUtils.cleanupdir(dir)
+
+            # Copy master file system content to the new master
+            src = os.path.join(curmsd.domaindir, sd.MASTER_FS_DIR)
+            dst = os.path.join(newmsd.domaindir, sd.MASTER_FS_DIR)
+            fileUtils.tarCopy(src, dst, exclude=('./lost+found',))
+
+            if not newmsd.getMDPath():
+                raise se.StorageDomainLayoutError("domain", msdUUID)
+
+            newmsd.changeRole(sd.MASTER_DOMAIN)
+            self.switchMasterDomain(curmsd, newmsd, masterVersion)
+        except Exception:
+            self.log.exception('migration to new master failed')
             try:
                 newmsd.unmountMaster()
-            except Exception:
-                self.log.error("Unexpected error", exc_info=True)
-            raise se.StorageDomainMasterCopyError(msdUUID)
-
-        self.copyPoolMD(curmsd, newmsd)
-
-        path = newmsd.getMDPath()
-        if not path:
-            newmsd.unmountMaster()
-            raise se.StorageDomainLayoutError("domain", msdUUID)
-
-        # Acquire cluster lock on new master
-        try:
-            # If the new master domain is using safelease (version < 3) then
-            # we can speed up the cluster lock acquirement by resetting the
-            # SPM lease.
-            # XXX: With SANLock there is no need to speed up the process (the
-            # acquirement will take a short time anyway since we already hold
-            # the host id) and more importantly resetting the lease is going
-            # to interfere with the regular SANLock behavior.
-            # @deprecated this is relevant only for domain version < 3
-            if not newmsd.hasVolumeLeases():
-                newmsd.initSPMlease()
-            # Forcing to acquire the host id (if it's not acquired already)
-            newmsd.acquireHostId(self.id)
-            newmsd.acquireClusterLock(self.id)
-        except Exception:
-            self.log.error("Unexpected error", exc_info=True)
+            except:  # logging only
+                self.log.error('unable to unmount master filesystem '
+                               'for domain %s', newmsd.sdUUID)
             newmsd.releaseClusterLock()
-            newmsd.unmountMaster()
-            raise
-        self.log.debug("masterMigrate - lease acquired successfully")
-
-        try:
-            # Now mark new domain as 'master'
-            # if things break down here move the master back pronto
-            newPoolMD = self._getPoolMD(newmsd)
-            with newPoolMD.transaction():
-                newPoolMD[PMDK_MASTER_VER] = masterVersion
-                newmsd.changeRole(sd.MASTER_DOMAIN)
-            self._saveReconnectInformation(self.id, self.scsiKey,
-                                           newmsd.sdUUID, masterVersion)
-        except Exception:
-            self.log.error("Unexpected error", exc_info=True)
-            newmsd.releaseClusterLock()
-            newmsd.unmountMaster()
             raise
 
         # From this point on we have a new master and should not fail
         try:
-            # Now recreate 'mastersd' link
-            # we can use refresh() to do the job
             self.refresh(msdUUID, masterVersion)
+            self._saveReconnectInformation(
+                self.id, self.scsiKey, newmsd.sdUUID, masterVersion)
 
             # From this point on there is a new master domain in the pool
-            # Now that we are beyond the critical point we can clean up things
-            curmsd.changeRole(sd.REGULAR_DOMAIN)
+            # Now that we are beyond the critical point we can clean up
+            # things
+            self.setDomainRegularRole(curmsd)
 
             # Clean up the old data from previous master fs
             for directory in [curmsd.getVMsDir(), curmsd.getTasksDir()]:
                 fileUtils.cleanupdir(directory)
-
-            # NOTE: here we unmount the *previous* master file system !!!
-            curmsd.unmountMaster()
         except Exception:
-            self.log.error("Unexpected error", exc_info=True)
-
-        try:
-            # Release old lease
+            self.log.exception('ignoring old master cleanup failure')
+        finally:
+            # Unmoung old master filesystem and release old cluster lock
+            try:
+                curmsd.unmountMaster()
+            except:  # logging only
+                self.log.error('unable to unmount master filesystem '
+                               'for domain %s', curmsd.sdUUID)
             curmsd.releaseClusterLock()
-        except Exception:
-            self.log.error("Unexpected error", exc_info=True)
 
     def attachSD(self, sdUUID):
         """


-- 
To view, visit http://gerrit.ovirt.org/22419
To unsubscribe, visit http://gerrit.ovirt.org/settings

Gerrit-MessageType: newchange
Gerrit-Change-Id: I155b36d41df43230f99ba610699bba05bee3f0c0
Gerrit-PatchSet: 1
Gerrit-Project: vdsm
Gerrit-Branch: master
Gerrit-Owner: Federico Simoncelli <[email protected]>
_______________________________________________
vdsm-patches mailing list
[email protected]
https://lists.fedorahosted.org/mailman/listinfo/vdsm-patches

Reply via email to