Federico Simoncelli has uploaded a new change for review. Change subject: sp: improve masterMigrate safety ......................................................................
sp: improve masterMigrate safety The goal of this patch is to clean, optimize and give robustness to the masterMigrate method. In this patch: - acquire the cluster lock before start using the new master filesystem - refactor out all the required metadata changes in switchMasterDomain (rename copyPoolMD to switchMasterDomain, and include the domain role changes) - divde the method in two parts (operations required to switch to the new master and previous master cleanup) - address the two parts exception handling (hard failure for the first, logging only for the second) Change-Id: I155b36d41df43230f99ba610699bba05bee3f0c0 Signed-off-by: Federico Simoncelli <[email protected]> --- M vdsm/storage/sp.py 1 file changed, 61 insertions(+), 78 deletions(-) git pull ssh://gerrit.ovirt.org:29418/vdsm refs/changes/19/22419/1 diff --git a/vdsm/storage/sp.py b/vdsm/storage/sp.py index 19d07b3..c370b52 100644 --- a/vdsm/storage/sp.py +++ b/vdsm/storage/sp.py @@ -836,27 +836,29 @@ else: futureMaster.releaseClusterLock() - @unsecured - def copyPoolMD(self, prevMd, newMD): - prevPoolMD = self._getPoolMD(prevMd) + def switchMasterDomain(self, currentMasterDomain, newMasterDomain, + newMasterVersion): + prevPoolMD = self._getPoolMD(currentMasterDomain) domains = prevPoolMD[PMDK_DOMAINS] pool_descr = prevPoolMD[PMDK_POOL_DESCRIPTION] lver = prevPoolMD[PMDK_LVER] spmId = prevPoolMD[PMDK_SPM_ID] # This is actually domain metadata, But I can't change this because of # backward compatibility - leaseParams = prevMd.getLeaseParams() + leaseParams = currentMasterDomain.getLeaseParams() # Now insert pool metadata into new mastersd metadata - newPoolMD = self._getPoolMD(newMD) + newPoolMD = self._getPoolMD(newMasterDomain) with newPoolMD.transaction(): newPoolMD.update({ PMDK_DOMAINS: domains, PMDK_POOL_DESCRIPTION: pool_descr, PMDK_LVER: lver, - PMDK_SPM_ID: spmId}) - newMD.changeLeaseParams(leaseParams) + PMDK_SPM_ID: spmId, + PMDK_MASTER_VER: newMasterVersion, + }) + newMasterDomain.changeLeaseParams(leaseParams) @unsecured def _copyLeaseParameters(self, srcDomain, dstDomain): @@ -865,7 +867,6 @@ srcDomain.sdUUID, leaseParams) dstDomain.changeLeaseParams(leaseParams) - @unsecured def masterMigrate(self, sdUUID, msdUUID, masterVersion): self.log.info("sdUUID=%s spUUID=%s msdUUID=%s", sdUUID, self.spUUID, msdUUID) @@ -895,95 +896,77 @@ if newmsd.isBackup(): raise se.BackupCannotBeMasterDomain(msdUUID) - # Copy master file system content to the new master - src = os.path.join(curmsd.domaindir, sd.MASTER_FS_DIR) - dst = os.path.join(newmsd.domaindir, sd.MASTER_FS_DIR) + # If the new master domain is using safelease (version < 3) then + # we can speed up the cluster lock acquirement by resetting the + # SPM lease. + # XXX: With SANLock there is no need to speed up the process (the + # acquirement will take a short time anyway since we already hold + # the host id) and more importantly resetting the lease is going + # to interfere with the regular SANLock behavior. + # @deprecated this is relevant only for domain version < 3 + if not newmsd.hasVolumeLeases(): + newmsd.initSPMlease() - # Mount new master file system - newmsd.mountMaster() - # Make sure there is no cruft left over - for dir in [newmsd.getVMsDir(), newmsd.getTasksDir()]: - fileUtils.cleanupdir(dir) + # Forcing to acquire the host id (if it's not acquired already) + # and acquiring the cluster lock on new master. + newmsd.acquireHostId(self.id) + newmsd.acquireClusterLock(self.id) try: - fileUtils.tarCopy(src, dst, exclude=("./lost+found",)) - except fileUtils.TarCopyFailed: - self.log.error("tarCopy failed", exc_info=True) - # Failed to copy the master data + self.log.debug('migration to the new master %s begins', + newmsd.sdUUID) + + # Mount new master file system + newmsd.mountMaster() + + # Make sure there is no cruft left over + for dir in [newmsd.getVMsDir(), newmsd.getTasksDir()]: + fileUtils.cleanupdir(dir) + + # Copy master file system content to the new master + src = os.path.join(curmsd.domaindir, sd.MASTER_FS_DIR) + dst = os.path.join(newmsd.domaindir, sd.MASTER_FS_DIR) + fileUtils.tarCopy(src, dst, exclude=('./lost+found',)) + + if not newmsd.getMDPath(): + raise se.StorageDomainLayoutError("domain", msdUUID) + + newmsd.changeRole(sd.MASTER_DOMAIN) + self.switchMasterDomain(curmsd, newmsd, masterVersion) + except Exception: + self.log.exception('migration to new master failed') try: newmsd.unmountMaster() - except Exception: - self.log.error("Unexpected error", exc_info=True) - raise se.StorageDomainMasterCopyError(msdUUID) - - self.copyPoolMD(curmsd, newmsd) - - path = newmsd.getMDPath() - if not path: - newmsd.unmountMaster() - raise se.StorageDomainLayoutError("domain", msdUUID) - - # Acquire cluster lock on new master - try: - # If the new master domain is using safelease (version < 3) then - # we can speed up the cluster lock acquirement by resetting the - # SPM lease. - # XXX: With SANLock there is no need to speed up the process (the - # acquirement will take a short time anyway since we already hold - # the host id) and more importantly resetting the lease is going - # to interfere with the regular SANLock behavior. - # @deprecated this is relevant only for domain version < 3 - if not newmsd.hasVolumeLeases(): - newmsd.initSPMlease() - # Forcing to acquire the host id (if it's not acquired already) - newmsd.acquireHostId(self.id) - newmsd.acquireClusterLock(self.id) - except Exception: - self.log.error("Unexpected error", exc_info=True) + except: # logging only + self.log.error('unable to unmount master filesystem ' + 'for domain %s', newmsd.sdUUID) newmsd.releaseClusterLock() - newmsd.unmountMaster() - raise - self.log.debug("masterMigrate - lease acquired successfully") - - try: - # Now mark new domain as 'master' - # if things break down here move the master back pronto - newPoolMD = self._getPoolMD(newmsd) - with newPoolMD.transaction(): - newPoolMD[PMDK_MASTER_VER] = masterVersion - newmsd.changeRole(sd.MASTER_DOMAIN) - self._saveReconnectInformation(self.id, self.scsiKey, - newmsd.sdUUID, masterVersion) - except Exception: - self.log.error("Unexpected error", exc_info=True) - newmsd.releaseClusterLock() - newmsd.unmountMaster() raise # From this point on we have a new master and should not fail try: - # Now recreate 'mastersd' link - # we can use refresh() to do the job self.refresh(msdUUID, masterVersion) + self._saveReconnectInformation( + self.id, self.scsiKey, newmsd.sdUUID, masterVersion) # From this point on there is a new master domain in the pool - # Now that we are beyond the critical point we can clean up things - curmsd.changeRole(sd.REGULAR_DOMAIN) + # Now that we are beyond the critical point we can clean up + # things + self.setDomainRegularRole(curmsd) # Clean up the old data from previous master fs for directory in [curmsd.getVMsDir(), curmsd.getTasksDir()]: fileUtils.cleanupdir(directory) - - # NOTE: here we unmount the *previous* master file system !!! - curmsd.unmountMaster() except Exception: - self.log.error("Unexpected error", exc_info=True) - - try: - # Release old lease + self.log.exception('ignoring old master cleanup failure') + finally: + # Unmoung old master filesystem and release old cluster lock + try: + curmsd.unmountMaster() + except: # logging only + self.log.error('unable to unmount master filesystem ' + 'for domain %s', curmsd.sdUUID) curmsd.releaseClusterLock() - except Exception: - self.log.error("Unexpected error", exc_info=True) def attachSD(self, sdUUID): """ -- To view, visit http://gerrit.ovirt.org/22419 To unsubscribe, visit http://gerrit.ovirt.org/settings Gerrit-MessageType: newchange Gerrit-Change-Id: I155b36d41df43230f99ba610699bba05bee3f0c0 Gerrit-PatchSet: 1 Gerrit-Project: vdsm Gerrit-Branch: master Gerrit-Owner: Federico Simoncelli <[email protected]> _______________________________________________ vdsm-patches mailing list [email protected] https://lists.fedorahosted.org/mailman/listinfo/vdsm-patches
