Roy Golan has uploaded a new change for review. Change subject: core: prevent split brain caused by fence commands ......................................................................
core: prevent split brain caused by fence commands This fix aims to run fence command mutually exclusive. RestartVds is running Stop, FenceManually and Start. This flow is marking VMs as down and playing them again on other hosts. 2 parallel calls to this flow or interleacing RestartVds with other Stop or FenceManaually could end up marking again VMs whic currently starting up by flow 1 as down, causing them to start and new instance on other host and leading to 2 instances of the same VM. To Achieve this mutually exclusive run: * Added exclusive lock on VDS_FENCE + vdsId on RestartVds, Stop, FenceManually and Start commands * Shared the lock from parent RestartVds with childs Stop, FenceManually and start * overriden freeLock() method of Stop, FenceManually and Start to release the lock only if we have been invoked without RestartVds as a parent command Bug-Url: https://bugzilla.redhat.com/show_bug.cgi?id=1026811 Change-Id: Iaf021010188e3c3662fbf8a057da4c58f2600c02 Signed-off-by: Roy Golan <[email protected]> --- M backend/manager/modules/bll/src/main/java/org/ovirt/engine/core/bll/FenceVdsBaseCommand.java M backend/manager/modules/bll/src/main/java/org/ovirt/engine/core/bll/RestartVdsCommand.java M backend/manager/modules/bll/src/main/java/org/ovirt/engine/core/bll/StartVdsCommand.java M backend/manager/modules/bll/src/main/java/org/ovirt/engine/core/bll/StopVdsCommand.java M backend/manager/modules/bll/src/main/java/org/ovirt/engine/core/bll/storage/FenceVdsManualyCommand.java M backend/manager/modules/common/src/main/java/org/ovirt/engine/core/common/errors/VdcBllMessages.java M backend/manager/modules/dal/src/main/resources/bundles/AppErrors.properties M frontend/webadmin/modules/frontend/src/main/java/org/ovirt/engine/ui/frontend/AppErrors.java M frontend/webadmin/modules/userportal-gwtp/src/main/resources/org/ovirt/engine/ui/frontend/AppErrors.properties M frontend/webadmin/modules/webadmin/src/main/resources/org/ovirt/engine/ui/frontend/AppErrors.properties 10 files changed, 135 insertions(+), 9 deletions(-) git pull ssh://gerrit.ovirt.org:29418/ovirt-engine refs/changes/43/21143/1 diff --git a/backend/manager/modules/bll/src/main/java/org/ovirt/engine/core/bll/FenceVdsBaseCommand.java b/backend/manager/modules/bll/src/main/java/org/ovirt/engine/core/bll/FenceVdsBaseCommand.java index 9a2020a..b715c3a 100644 --- a/backend/manager/modules/bll/src/main/java/org/ovirt/engine/core/bll/FenceVdsBaseCommand.java +++ b/backend/manager/modules/bll/src/main/java/org/ovirt/engine/core/bll/FenceVdsBaseCommand.java @@ -1,7 +1,9 @@ package org.ovirt.engine.core.bll; import java.util.ArrayList; +import java.util.Collections; import java.util.List; +import java.util.Map; import java.util.concurrent.Callable; import java.util.concurrent.ExecutionException; import java.util.concurrent.ExecutorCompletionService; @@ -25,6 +27,8 @@ import org.ovirt.engine.core.common.errors.VdcBLLException; import org.ovirt.engine.core.common.errors.VdcBllErrors; import org.ovirt.engine.core.common.errors.VdcBllMessages; +import org.ovirt.engine.core.common.locks.LockingGroup; +import org.ovirt.engine.core.common.utils.Pair; import org.ovirt.engine.core.common.vdscommands.DestroyVmVDSCommandParameters; import org.ovirt.engine.core.common.vdscommands.SetVdsStatusVDSCommandParameters; import org.ovirt.engine.core.common.vdscommands.SetVmStatusVDSCommandParameters; @@ -597,4 +601,15 @@ this.succeeded = succeeded; } } + + @Override + protected Map<String, Pair<String, String>> getExclusiveLocks() { + return createFenceExclusiveLocksMap(getVdsId()); + } + + public static Map<String, Pair<String, String>> createFenceExclusiveLocksMap(Guid vdsId) { + return Collections.singletonMap(vdsId.toString(), LockMessagesMatchUtil.makeLockingPair( + LockingGroup.VDS_FENCE, + VdcBllMessages.POWER_MANAGEMENT_ACTION_ON_ENTITY_ALREADY_IN_PROGRESS)); + } } diff --git a/backend/manager/modules/bll/src/main/java/org/ovirt/engine/core/bll/RestartVdsCommand.java b/backend/manager/modules/bll/src/main/java/org/ovirt/engine/core/bll/RestartVdsCommand.java index 342b45e..2b18611 100644 --- a/backend/manager/modules/bll/src/main/java/org/ovirt/engine/core/bll/RestartVdsCommand.java +++ b/backend/manager/modules/bll/src/main/java/org/ovirt/engine/core/bll/RestartVdsCommand.java @@ -4,9 +4,13 @@ import static org.ovirt.engine.core.common.AuditLogType.SYSTEM_VDS_RESTART; import static org.ovirt.engine.core.common.AuditLogType.USER_FAILED_VDS_RESTART; import static org.ovirt.engine.core.common.AuditLogType.USER_VDS_RESTART; +import static org.ovirt.engine.core.common.errors.VdcBllMessages.VAR__ACTION__RESTART; +import static org.ovirt.engine.core.common.errors.VdcBllMessages.VAR__TYPE__HOST; +import static org.ovirt.engine.core.common.errors.VdcBllMessages.VDS_FENCE_OPERATION_FAILED; import java.util.List; +import org.ovirt.engine.core.bll.context.CommandContext; import org.ovirt.engine.core.common.AuditLogType; import org.ovirt.engine.core.common.action.FenceVdsActionParameters; import org.ovirt.engine.core.common.action.FenceVdsManualyParameters; @@ -19,9 +23,29 @@ import org.ovirt.engine.core.common.vdscommands.SetVdsStatusVDSCommandParameters; import org.ovirt.engine.core.common.vdscommands.VDSCommandType; import org.ovirt.engine.core.compat.Guid; +import org.ovirt.engine.core.utils.lock.EngineLock; +/** + * Send a Stop followed by Start action to a power management device. + * + * This command should be run exclusively on a host for it is assuming that when + * a host is down it can mark all VMs as DOWN and start them on other host. + * 2 parallel action like that on the same server can lead to a race where + * the 1st flow end by starting VMs and the 2nd flow marking them as down and + * starting another instance of VMs on other hosts, leading to split-brain where + * 2 exact instances of VMs running in 2 different hosts and writing to the same disk. + * + * In order to make this flow distinct the child commands, Start, FenceManually and Stop + * are under the same lock as the parent, preventing other Restart, Start, Stop,FenceVdsManually to interleave. + * + * @see FenceVdsBaseCommand#restartVdsVms() The critical section restaring the VMs + */ +@LockIdNameAttribute @NonTransactiveCommandAttribute public class RestartVdsCommand<T extends FenceVdsActionParameters> extends FenceVdsBaseCommand<T> { + + private CommandContext commandContext; + protected List<VM> getVmList() { return mVmList; } @@ -74,9 +98,10 @@ fenceVdsManuallyParams.setStoragePoolId(getVds().getStoragePoolId()); fenceVdsManuallyParams.setVdsId(vdsId); fenceVdsManuallyParams.setSessionId(sessionId); + fenceVdsManuallyParams.setParentCommand(VdcActionType.RestartVds); // if fencing succeeded, call to reset irs in order to try select new spm - Backend.getInstance().runInternalAction(VdcActionType.FenceVdsManualy, fenceVdsManuallyParams); + Backend.getInstance().runInternalAction(VdcActionType.FenceVdsManualy, fenceVdsManuallyParams, getContext()); } private VdcReturnValueBase executeVdsFenceAction(final Guid vdsId, @@ -86,14 +111,26 @@ FenceVdsActionParameters params = new FenceVdsActionParameters(vdsId, fenceAction); params.setParentCommand(VdcActionType.RestartVds); params.setSessionId(sessionId); - return Backend.getInstance().runInternalAction(action, params); + return Backend.getInstance().runInternalAction(action, params, getContext()); + } + + private CommandContext getContext() { + if (commandContext == null) { + commandContext = new CommandContext(getExecutionContext(), new EngineLock(getExclusiveLocks(), null)); + } + return commandContext; + } + + @Override + protected void setActionMessageParameters() { + addCanDoActionMessage(VdcBllMessages.VAR__ACTION__RESTART); } @Override protected void handleError() { - addCanDoActionMessage(VdcBllMessages.VDS_FENCE_OPERATION_FAILED); - addCanDoActionMessage(VdcBllMessages.VAR__TYPE__HOST); - addCanDoActionMessage(VdcBllMessages.VAR__ACTION__RESTART); + addCanDoActionMessage(VDS_FENCE_OPERATION_FAILED); + addCanDoActionMessage(VAR__TYPE__HOST); + addCanDoActionMessage(VAR__ACTION__RESTART); log.errorFormat("Failed to run RestartVdsCommand on vds :{0}", getVdsName()); } @@ -119,4 +156,5 @@ @Override protected void handleSpecificCommandActions() { } + } diff --git a/backend/manager/modules/bll/src/main/java/org/ovirt/engine/core/bll/StartVdsCommand.java b/backend/manager/modules/bll/src/main/java/org/ovirt/engine/core/bll/StartVdsCommand.java index edee396..7c83b87 100644 --- a/backend/manager/modules/bll/src/main/java/org/ovirt/engine/core/bll/StartVdsCommand.java +++ b/backend/manager/modules/bll/src/main/java/org/ovirt/engine/core/bll/StartVdsCommand.java @@ -2,12 +2,23 @@ import org.ovirt.engine.core.common.AuditLogType; import org.ovirt.engine.core.common.action.FenceVdsActionParameters; +import org.ovirt.engine.core.common.action.VdcActionType; import org.ovirt.engine.core.common.businessentities.VDS; import org.ovirt.engine.core.common.businessentities.VDSStatus; import org.ovirt.engine.core.common.config.Config; import org.ovirt.engine.core.common.config.ConfigValues; import org.ovirt.engine.core.common.errors.VdcBllMessages; +/** + * Send a Start action to a power control device. + * + * This command should be run mutually exclusive from other fence actions to prevent same action or other fence actions + * to clear the VMs and start them. + * + * @see RestartVdsCommand + * @see FenceVdsBaseCommand#restartVdsVms() + */ +@LockIdNameAttribute @NonTransactiveCommandAttribute public class StartVdsCommand<T extends FenceVdsActionParameters> extends FenceVdsBaseCommand<T> { public StartVdsCommand(T parameters) { @@ -43,6 +54,11 @@ } @Override + protected void setActionMessageParameters() { + addCanDoActionMessage(VdcBllMessages.VAR__ACTION__START); + } + + @Override protected void handleError() { log.errorFormat("Failed to run StartVdsCommand on vds :{0}", getVdsName()); } @@ -70,4 +86,11 @@ protected int getDelayInSeconds() { return Config.<Integer> GetValue(ConfigValues.FenceStartStatusDelayBetweenRetriesInSec); } + + @Override + protected void freeLock() { + if (getParameters().getParentCommand() != VdcActionType.RestartVds) { + super.freeLock(); + } + } } diff --git a/backend/manager/modules/bll/src/main/java/org/ovirt/engine/core/bll/StopVdsCommand.java b/backend/manager/modules/bll/src/main/java/org/ovirt/engine/core/bll/StopVdsCommand.java index c32fcc8..10ca107 100644 --- a/backend/manager/modules/bll/src/main/java/org/ovirt/engine/core/bll/StopVdsCommand.java +++ b/backend/manager/modules/bll/src/main/java/org/ovirt/engine/core/bll/StopVdsCommand.java @@ -1,5 +1,7 @@ package org.ovirt.engine.core.bll; +import static org.ovirt.engine.core.common.errors.VdcBllMessages.VAR__ACTION__STOP; + import org.ovirt.engine.core.common.AuditLogType; import org.ovirt.engine.core.common.action.FenceVdsActionParameters; import org.ovirt.engine.core.common.action.VdcActionType; @@ -12,6 +14,16 @@ import org.ovirt.engine.core.utils.log.Log; import org.ovirt.engine.core.utils.log.LogFactory; +/** + * Send a Stop action to a power control device. + * + * This command should be run mutually exclusive from other fence actions to prevent same action or other fence actions + * to clear the VMs and start them. + * + * @see RestartVdsCommand + * @see FenceVdsBaseCommand#restartVdsVms() + */ +@LockIdNameAttribute @NonTransactiveCommandAttribute public class StopVdsCommand<T extends FenceVdsActionParameters> extends FenceVdsBaseCommand<T> { public StopVdsCommand(T parameters) { @@ -49,6 +61,11 @@ } @Override + protected void setActionMessageParameters() { + addCanDoActionMessage(VAR__ACTION__STOP); + } + + @Override protected void handleError() { addCanDoActionMessage(VdcBllMessages.VDS_FENCE_OPERATION_FAILED); addCanDoActionMessage(VdcBllMessages.VAR__TYPE__HOST); @@ -83,4 +100,11 @@ protected int getDelayInSeconds() { return Config.<Integer> GetValue(ConfigValues.FenceStopStatusDelayBetweenRetriesInSec); } + + @Override + protected void freeLock() { + if (getParameters().getParentCommand() != VdcActionType.RestartVds) { + super.freeLock(); + } + } } diff --git a/backend/manager/modules/bll/src/main/java/org/ovirt/engine/core/bll/storage/FenceVdsManualyCommand.java b/backend/manager/modules/bll/src/main/java/org/ovirt/engine/core/bll/storage/FenceVdsManualyCommand.java index 8553a23..4a9cf09 100644 --- a/backend/manager/modules/bll/src/main/java/org/ovirt/engine/core/bll/storage/FenceVdsManualyCommand.java +++ b/backend/manager/modules/bll/src/main/java/org/ovirt/engine/core/bll/storage/FenceVdsManualyCommand.java @@ -5,8 +5,8 @@ import java.util.Map; import org.ovirt.engine.core.bll.Backend; +import org.ovirt.engine.core.bll.FenceVdsBaseCommand; import org.ovirt.engine.core.bll.LockIdNameAttribute; -import org.ovirt.engine.core.bll.LockMessagesMatchUtil; import org.ovirt.engine.core.bll.job.ExecutionHandler; import org.ovirt.engine.core.bll.utils.PermissionSubject; import org.ovirt.engine.core.common.AuditLogType; @@ -25,7 +25,6 @@ import org.ovirt.engine.core.common.businessentities.VDSStatus; import org.ovirt.engine.core.common.businessentities.VdsSpmStatus; import org.ovirt.engine.core.common.errors.VdcBllMessages; -import org.ovirt.engine.core.common.locks.LockingGroup; import org.ovirt.engine.core.common.utils.Pair; import org.ovirt.engine.core.common.vdscommands.FenceSpmStorageVDSCommandParameters; import org.ovirt.engine.core.common.vdscommands.ResetIrsVDSCommandParameters; @@ -37,9 +36,18 @@ import org.ovirt.engine.core.utils.linq.LinqUtils; import org.ovirt.engine.core.utils.linq.Predicate; +/** + * Confirm a host has been rebooted, clear spm flag, its VMs(optional) and alerts. + * + * This command should be run mutually exclusive from other fence actions to prevent same action or other fence actions + * to clear the VMs and start them. + * + * @see org.ovirt.engine.core.bll.RestartVdsCommand + */ @LockIdNameAttribute public class FenceVdsManualyCommand<T extends FenceVdsManualyParameters> extends StorageHandlingCommandBase<T> { private final VDS _problematicVds; + /** * Constructor for command creation when compensation is applied on startup * @@ -227,8 +235,7 @@ @Override protected Map<String, Pair<String, String>> getExclusiveLocks() { - return Collections.singletonMap(getProblematicVdsId().toString(), - LockMessagesMatchUtil.makeLockingPair(LockingGroup.VDS_FENCE, VdcBllMessages.ACTION_TYPE_FAILED_OBJECT_LOCKED)); + return FenceVdsBaseCommand.createFenceExclusiveLocksMap(getProblematicVdsId()); } @Override @@ -236,4 +243,16 @@ return Collections.singletonList(new PermissionSubject(getParameters().getVdsId(), VdcObjectType.VDS, getActionType().getActionGroup())); } + + protected void setActionMessageParameters() { + addCanDoActionMessage(VdcBllMessages.VAR__ACTION__MANUAL_FENCE); + addCanDoActionMessage(VdcBllMessages.VAR__TYPE__HOST); + } + + @Override + protected void freeLock() { + if (getParameters().getParentCommand() != VdcActionType.RestartVds) { + super.freeLock(); + } + } } diff --git a/backend/manager/modules/common/src/main/java/org/ovirt/engine/core/common/errors/VdcBllMessages.java b/backend/manager/modules/common/src/main/java/org/ovirt/engine/core/common/errors/VdcBllMessages.java index fc95611..5665c27 100644 --- a/backend/manager/modules/common/src/main/java/org/ovirt/engine/core/common/errors/VdcBllMessages.java +++ b/backend/manager/modules/common/src/main/java/org/ovirt/engine/core/common/errors/VdcBllMessages.java @@ -119,6 +119,7 @@ VAR__VM_STATUS__IMAGE_ILLEGAL, VAR__VM_STATUS__PREPARING_FOR_HIBERNATE, + POWER_MANAGEMENT_ACTION_ON_ENTITY_ALREADY_IN_PROGRESS(ErrorType.CONFLICT), ACTION_LIST_CANNOT_BE_EMPTY(ErrorType.BAD_PARAMETERS), ACTION_TYPE_FAILED_NAME_ALREADY_USED(ErrorType.BAD_PARAMETERS), ACTION_TYPE_FAILED_VM_IN_PREVIEW(ErrorType.CONFLICT), diff --git a/backend/manager/modules/dal/src/main/resources/bundles/AppErrors.properties b/backend/manager/modules/dal/src/main/resources/bundles/AppErrors.properties index d05c9b1..1814e73 100644 --- a/backend/manager/modules/dal/src/main/resources/bundles/AppErrors.properties +++ b/backend/manager/modules/dal/src/main/resources/bundles/AppErrors.properties @@ -1041,3 +1041,4 @@ # Alignment scan ERROR_CANNOT_RUN_ALIGNMENT_SCAN_VM_IS_RUNNING=Cannot ${action} ${type}. Alignment scan of a disk attached to a running VM is only supported with RAW virtual disks. +POWER_MANAGEMENT_ACTION_ON_ENTITY_ALREADY_IN_PROGRESS=Cannot perform ${action}. Another power management action is already in progress. diff --git a/frontend/webadmin/modules/frontend/src/main/java/org/ovirt/engine/ui/frontend/AppErrors.java b/frontend/webadmin/modules/frontend/src/main/java/org/ovirt/engine/ui/frontend/AppErrors.java index 06ca9fe..24e647f 100644 --- a/frontend/webadmin/modules/frontend/src/main/java/org/ovirt/engine/ui/frontend/AppErrors.java +++ b/frontend/webadmin/modules/frontend/src/main/java/org/ovirt/engine/ui/frontend/AppErrors.java @@ -2758,4 +2758,7 @@ @DefaultStringValue("Cannot ${action} ${type}. Disk ${DiskAlias} alignment is currently being determined.") String ACTION_TYPE_FAILED_DISK_IS_USED_BY_GET_ALIGNMENT(); + + @DefaultStringValue("Cannot perform ${action}. Another power management action is already in progress.") + String POWER_MANAGEMENT_ACTION_ON_ENTITY_ALREADY_IN_PROGRESS(); } diff --git a/frontend/webadmin/modules/userportal-gwtp/src/main/resources/org/ovirt/engine/ui/frontend/AppErrors.properties b/frontend/webadmin/modules/userportal-gwtp/src/main/resources/org/ovirt/engine/ui/frontend/AppErrors.properties index a2e3ae2..10d3649 100644 --- a/frontend/webadmin/modules/userportal-gwtp/src/main/resources/org/ovirt/engine/ui/frontend/AppErrors.properties +++ b/frontend/webadmin/modules/userportal-gwtp/src/main/resources/org/ovirt/engine/ui/frontend/AppErrors.properties @@ -913,3 +913,4 @@ # Alignment scan ERROR_CANNOT_RUN_ALIGNMENT_SCAN_VM_IS_RUNNING=Cannot ${action} ${type}. Alignment scan of a disk attached to a running VM is only supported with RAW virtual disks. +POWER_MANAGEMENT_ACTION_ON_ENTITY_ALREADY_IN_PROGRESS=Cannot perform ${action}. Another power management action is already in progress. diff --git a/frontend/webadmin/modules/webadmin/src/main/resources/org/ovirt/engine/ui/frontend/AppErrors.properties b/frontend/webadmin/modules/webadmin/src/main/resources/org/ovirt/engine/ui/frontend/AppErrors.properties index 5b065be..e11d1ec 100644 --- a/frontend/webadmin/modules/webadmin/src/main/resources/org/ovirt/engine/ui/frontend/AppErrors.properties +++ b/frontend/webadmin/modules/webadmin/src/main/resources/org/ovirt/engine/ui/frontend/AppErrors.properties @@ -1014,3 +1014,4 @@ # Alignment scan ERROR_CANNOT_RUN_ALIGNMENT_SCAN_VM_IS_RUNNING=Cannot ${action} ${type}. Alignment scan of a disk attached to a running VM is only supported with RAW virtual disks. +POWER_MANAGEMENT_ACTION_ON_ENTITY_ALREADY_IN_PROGRESS=Another power management action, ${action}, is already in progress. -- To view, visit http://gerrit.ovirt.org/21143 To unsubscribe, visit http://gerrit.ovirt.org/settings Gerrit-MessageType: newchange Gerrit-Change-Id: Iaf021010188e3c3662fbf8a057da4c58f2600c02 Gerrit-PatchSet: 1 Gerrit-Project: ovirt-engine Gerrit-Branch: ovirt-engine-3.3 Gerrit-Owner: Roy Golan <[email protected]> _______________________________________________ Engine-patches mailing list [email protected] http://lists.ovirt.org/mailman/listinfo/engine-patches
