Roy Golan has uploaded a new change for review.

Change subject: core: prevent split brain caused by fence commands
......................................................................

core: prevent split brain caused by fence commands

This fix aims to run fence command mutually exclusive.

RestartVds is running Stop, FenceManually and Start. This flow
is marking VMs as down and playing them again on other hosts.

2 parallel calls to this flow or interleacing RestartVds with other Stop
or FenceManaually could end up marking again VMs whic currently starting
up by flow 1 as down, causing them to start and new instance on other
host and leading to 2 instances of the same VM.

To Achieve this mutually exclusive run:

* Added exclusive lock on VDS_FENCE + vdsId on RestartVds, Stop, FenceManually 
and Start commands
* Shared the lock from parent RestartVds with childs Stop, FenceManually and 
start
* overriden freeLock() method of Stop, FenceManually and Start to release the 
lock only if
we have been invoked without RestartVds as a parent command

Bug-Url: https://bugzilla.redhat.com/show_bug.cgi?id=1026811
Change-Id: Iaf021010188e3c3662fbf8a057da4c58f2600c02
Signed-off-by: Roy Golan <[email protected]>
---
M 
backend/manager/modules/bll/src/main/java/org/ovirt/engine/core/bll/FenceVdsBaseCommand.java
M 
backend/manager/modules/bll/src/main/java/org/ovirt/engine/core/bll/RestartVdsCommand.java
M 
backend/manager/modules/bll/src/main/java/org/ovirt/engine/core/bll/StartVdsCommand.java
M 
backend/manager/modules/bll/src/main/java/org/ovirt/engine/core/bll/StopVdsCommand.java
M 
backend/manager/modules/bll/src/main/java/org/ovirt/engine/core/bll/storage/FenceVdsManualyCommand.java
M 
backend/manager/modules/common/src/main/java/org/ovirt/engine/core/common/errors/VdcBllMessages.java
M backend/manager/modules/dal/src/main/resources/bundles/AppErrors.properties
M 
frontend/webadmin/modules/frontend/src/main/java/org/ovirt/engine/ui/frontend/AppErrors.java
M 
frontend/webadmin/modules/userportal-gwtp/src/main/resources/org/ovirt/engine/ui/frontend/AppErrors.properties
M 
frontend/webadmin/modules/webadmin/src/main/resources/org/ovirt/engine/ui/frontend/AppErrors.properties
10 files changed, 135 insertions(+), 9 deletions(-)


  git pull ssh://gerrit.ovirt.org:29418/ovirt-engine refs/changes/43/21143/1

diff --git 
a/backend/manager/modules/bll/src/main/java/org/ovirt/engine/core/bll/FenceVdsBaseCommand.java
 
b/backend/manager/modules/bll/src/main/java/org/ovirt/engine/core/bll/FenceVdsBaseCommand.java
index 9a2020a..b715c3a 100644
--- 
a/backend/manager/modules/bll/src/main/java/org/ovirt/engine/core/bll/FenceVdsBaseCommand.java
+++ 
b/backend/manager/modules/bll/src/main/java/org/ovirt/engine/core/bll/FenceVdsBaseCommand.java
@@ -1,7 +1,9 @@
 package org.ovirt.engine.core.bll;
 
 import java.util.ArrayList;
+import java.util.Collections;
 import java.util.List;
+import java.util.Map;
 import java.util.concurrent.Callable;
 import java.util.concurrent.ExecutionException;
 import java.util.concurrent.ExecutorCompletionService;
@@ -25,6 +27,8 @@
 import org.ovirt.engine.core.common.errors.VdcBLLException;
 import org.ovirt.engine.core.common.errors.VdcBllErrors;
 import org.ovirt.engine.core.common.errors.VdcBllMessages;
+import org.ovirt.engine.core.common.locks.LockingGroup;
+import org.ovirt.engine.core.common.utils.Pair;
 import org.ovirt.engine.core.common.vdscommands.DestroyVmVDSCommandParameters;
 import 
org.ovirt.engine.core.common.vdscommands.SetVdsStatusVDSCommandParameters;
 import 
org.ovirt.engine.core.common.vdscommands.SetVmStatusVDSCommandParameters;
@@ -597,4 +601,15 @@
             this.succeeded = succeeded;
         }
     }
+
+    @Override
+    protected Map<String, Pair<String, String>> getExclusiveLocks() {
+        return createFenceExclusiveLocksMap(getVdsId());
+    }
+
+    public static Map<String, Pair<String, String>> 
createFenceExclusiveLocksMap(Guid vdsId) {
+        return Collections.singletonMap(vdsId.toString(), 
LockMessagesMatchUtil.makeLockingPair(
+                LockingGroup.VDS_FENCE,
+                
VdcBllMessages.POWER_MANAGEMENT_ACTION_ON_ENTITY_ALREADY_IN_PROGRESS));
+    }
 }
diff --git 
a/backend/manager/modules/bll/src/main/java/org/ovirt/engine/core/bll/RestartVdsCommand.java
 
b/backend/manager/modules/bll/src/main/java/org/ovirt/engine/core/bll/RestartVdsCommand.java
index 342b45e..2b18611 100644
--- 
a/backend/manager/modules/bll/src/main/java/org/ovirt/engine/core/bll/RestartVdsCommand.java
+++ 
b/backend/manager/modules/bll/src/main/java/org/ovirt/engine/core/bll/RestartVdsCommand.java
@@ -4,9 +4,13 @@
 import static org.ovirt.engine.core.common.AuditLogType.SYSTEM_VDS_RESTART;
 import static 
org.ovirt.engine.core.common.AuditLogType.USER_FAILED_VDS_RESTART;
 import static org.ovirt.engine.core.common.AuditLogType.USER_VDS_RESTART;
+import static 
org.ovirt.engine.core.common.errors.VdcBllMessages.VAR__ACTION__RESTART;
+import static 
org.ovirt.engine.core.common.errors.VdcBllMessages.VAR__TYPE__HOST;
+import static 
org.ovirt.engine.core.common.errors.VdcBllMessages.VDS_FENCE_OPERATION_FAILED;
 
 import java.util.List;
 
+import org.ovirt.engine.core.bll.context.CommandContext;
 import org.ovirt.engine.core.common.AuditLogType;
 import org.ovirt.engine.core.common.action.FenceVdsActionParameters;
 import org.ovirt.engine.core.common.action.FenceVdsManualyParameters;
@@ -19,9 +23,29 @@
 import 
org.ovirt.engine.core.common.vdscommands.SetVdsStatusVDSCommandParameters;
 import org.ovirt.engine.core.common.vdscommands.VDSCommandType;
 import org.ovirt.engine.core.compat.Guid;
+import org.ovirt.engine.core.utils.lock.EngineLock;
 
+/**
+ * Send a Stop followed by Start action to a power management device.
+ *
+ * This command should be run exclusively on a host for it is assuming that 
when
+ * a host is down it can mark all VMs as DOWN and start them on other host.
+ * 2 parallel action like that on the same server can lead to a race where
+ * the 1st flow end by starting VMs and the 2nd flow marking them as down and
+ * starting another instance of VMs on other hosts, leading to split-brain 
where
+ * 2 exact instances of VMs running in 2 different hosts and writing to the 
same disk.
+ *
+ * In order to make this flow distinct the child commands, Start, 
FenceManually and Stop
+ * are under the same lock as the parent, preventing other Restart, Start, 
Stop,FenceVdsManually to interleave.
+ *
+ * @see FenceVdsBaseCommand#restartVdsVms() The critical section restaring the 
VMs
+ */
+@LockIdNameAttribute
 @NonTransactiveCommandAttribute
 public class RestartVdsCommand<T extends FenceVdsActionParameters> extends 
FenceVdsBaseCommand<T> {
+
+    private CommandContext commandContext;
+
     protected List<VM> getVmList() {
         return mVmList;
     }
@@ -74,9 +98,10 @@
         fenceVdsManuallyParams.setStoragePoolId(getVds().getStoragePoolId());
         fenceVdsManuallyParams.setVdsId(vdsId);
         fenceVdsManuallyParams.setSessionId(sessionId);
+        fenceVdsManuallyParams.setParentCommand(VdcActionType.RestartVds);
 
         // if fencing succeeded, call to reset irs in order to try select new 
spm
-        Backend.getInstance().runInternalAction(VdcActionType.FenceVdsManualy, 
fenceVdsManuallyParams);
+        Backend.getInstance().runInternalAction(VdcActionType.FenceVdsManualy, 
fenceVdsManuallyParams, getContext());
     }
 
     private VdcReturnValueBase executeVdsFenceAction(final Guid vdsId,
@@ -86,14 +111,26 @@
         FenceVdsActionParameters params = new FenceVdsActionParameters(vdsId, 
fenceAction);
         params.setParentCommand(VdcActionType.RestartVds);
         params.setSessionId(sessionId);
-        return Backend.getInstance().runInternalAction(action, params);
+        return Backend.getInstance().runInternalAction(action, params, 
getContext());
+    }
+
+    private CommandContext getContext() {
+        if (commandContext == null) {
+            commandContext = new CommandContext(getExecutionContext(), new 
EngineLock(getExclusiveLocks(), null));
+        }
+        return commandContext;
+    }
+
+    @Override
+    protected void setActionMessageParameters() {
+        addCanDoActionMessage(VdcBllMessages.VAR__ACTION__RESTART);
     }
 
     @Override
     protected void handleError() {
-        addCanDoActionMessage(VdcBllMessages.VDS_FENCE_OPERATION_FAILED);
-        addCanDoActionMessage(VdcBllMessages.VAR__TYPE__HOST);
-        addCanDoActionMessage(VdcBllMessages.VAR__ACTION__RESTART);
+        addCanDoActionMessage(VDS_FENCE_OPERATION_FAILED);
+        addCanDoActionMessage(VAR__TYPE__HOST);
+        addCanDoActionMessage(VAR__ACTION__RESTART);
         log.errorFormat("Failed to run RestartVdsCommand on vds :{0}", 
getVdsName());
     }
 
@@ -119,4 +156,5 @@
     @Override
     protected void handleSpecificCommandActions() {
     }
+
 }
diff --git 
a/backend/manager/modules/bll/src/main/java/org/ovirt/engine/core/bll/StartVdsCommand.java
 
b/backend/manager/modules/bll/src/main/java/org/ovirt/engine/core/bll/StartVdsCommand.java
index edee396..7c83b87 100644
--- 
a/backend/manager/modules/bll/src/main/java/org/ovirt/engine/core/bll/StartVdsCommand.java
+++ 
b/backend/manager/modules/bll/src/main/java/org/ovirt/engine/core/bll/StartVdsCommand.java
@@ -2,12 +2,23 @@
 
 import org.ovirt.engine.core.common.AuditLogType;
 import org.ovirt.engine.core.common.action.FenceVdsActionParameters;
+import org.ovirt.engine.core.common.action.VdcActionType;
 import org.ovirt.engine.core.common.businessentities.VDS;
 import org.ovirt.engine.core.common.businessentities.VDSStatus;
 import org.ovirt.engine.core.common.config.Config;
 import org.ovirt.engine.core.common.config.ConfigValues;
 import org.ovirt.engine.core.common.errors.VdcBllMessages;
 
+/**
+ * Send a Start action to a power control device.
+ *
+ * This command should be run mutually exclusive from other fence actions to 
prevent same action or other fence actions
+ * to clear the VMs and start them.
+ *
+ * @see RestartVdsCommand
+ * @see FenceVdsBaseCommand#restartVdsVms()
+ */
+@LockIdNameAttribute
 @NonTransactiveCommandAttribute
 public class StartVdsCommand<T extends FenceVdsActionParameters> extends 
FenceVdsBaseCommand<T> {
     public StartVdsCommand(T parameters) {
@@ -43,6 +54,11 @@
     }
 
     @Override
+    protected void setActionMessageParameters() {
+        addCanDoActionMessage(VdcBllMessages.VAR__ACTION__START);
+    }
+
+    @Override
     protected void handleError() {
         log.errorFormat("Failed to run StartVdsCommand on vds :{0}", 
getVdsName());
     }
@@ -70,4 +86,11 @@
     protected int getDelayInSeconds() {
         return Config.<Integer> 
GetValue(ConfigValues.FenceStartStatusDelayBetweenRetriesInSec);
     }
+
+    @Override
+    protected void freeLock() {
+        if (getParameters().getParentCommand() != VdcActionType.RestartVds) {
+            super.freeLock();
+        }
+    }
 }
diff --git 
a/backend/manager/modules/bll/src/main/java/org/ovirt/engine/core/bll/StopVdsCommand.java
 
b/backend/manager/modules/bll/src/main/java/org/ovirt/engine/core/bll/StopVdsCommand.java
index c32fcc8..10ca107 100644
--- 
a/backend/manager/modules/bll/src/main/java/org/ovirt/engine/core/bll/StopVdsCommand.java
+++ 
b/backend/manager/modules/bll/src/main/java/org/ovirt/engine/core/bll/StopVdsCommand.java
@@ -1,5 +1,7 @@
 package org.ovirt.engine.core.bll;
 
+import static 
org.ovirt.engine.core.common.errors.VdcBllMessages.VAR__ACTION__STOP;
+
 import org.ovirt.engine.core.common.AuditLogType;
 import org.ovirt.engine.core.common.action.FenceVdsActionParameters;
 import org.ovirt.engine.core.common.action.VdcActionType;
@@ -12,6 +14,16 @@
 import org.ovirt.engine.core.utils.log.Log;
 import org.ovirt.engine.core.utils.log.LogFactory;
 
+/**
+ * Send a Stop action to a power control device.
+ *
+ * This command should be run mutually exclusive from other fence actions to 
prevent same action or other fence actions
+ * to clear the VMs and start them.
+ *
+ * @see RestartVdsCommand
+ * @see FenceVdsBaseCommand#restartVdsVms()
+ */
+@LockIdNameAttribute
 @NonTransactiveCommandAttribute
 public class StopVdsCommand<T extends FenceVdsActionParameters> extends 
FenceVdsBaseCommand<T> {
     public StopVdsCommand(T parameters) {
@@ -49,6 +61,11 @@
     }
 
     @Override
+    protected void setActionMessageParameters() {
+        addCanDoActionMessage(VAR__ACTION__STOP);
+    }
+
+    @Override
     protected void handleError() {
         addCanDoActionMessage(VdcBllMessages.VDS_FENCE_OPERATION_FAILED);
         addCanDoActionMessage(VdcBllMessages.VAR__TYPE__HOST);
@@ -83,4 +100,11 @@
     protected int getDelayInSeconds() {
         return Config.<Integer> 
GetValue(ConfigValues.FenceStopStatusDelayBetweenRetriesInSec);
     }
+
+    @Override
+    protected void freeLock() {
+        if (getParameters().getParentCommand() != VdcActionType.RestartVds) {
+            super.freeLock();
+        }
+    }
 }
diff --git 
a/backend/manager/modules/bll/src/main/java/org/ovirt/engine/core/bll/storage/FenceVdsManualyCommand.java
 
b/backend/manager/modules/bll/src/main/java/org/ovirt/engine/core/bll/storage/FenceVdsManualyCommand.java
index 8553a23..4a9cf09 100644
--- 
a/backend/manager/modules/bll/src/main/java/org/ovirt/engine/core/bll/storage/FenceVdsManualyCommand.java
+++ 
b/backend/manager/modules/bll/src/main/java/org/ovirt/engine/core/bll/storage/FenceVdsManualyCommand.java
@@ -5,8 +5,8 @@
 import java.util.Map;
 
 import org.ovirt.engine.core.bll.Backend;
+import org.ovirt.engine.core.bll.FenceVdsBaseCommand;
 import org.ovirt.engine.core.bll.LockIdNameAttribute;
-import org.ovirt.engine.core.bll.LockMessagesMatchUtil;
 import org.ovirt.engine.core.bll.job.ExecutionHandler;
 import org.ovirt.engine.core.bll.utils.PermissionSubject;
 import org.ovirt.engine.core.common.AuditLogType;
@@ -25,7 +25,6 @@
 import org.ovirt.engine.core.common.businessentities.VDSStatus;
 import org.ovirt.engine.core.common.businessentities.VdsSpmStatus;
 import org.ovirt.engine.core.common.errors.VdcBllMessages;
-import org.ovirt.engine.core.common.locks.LockingGroup;
 import org.ovirt.engine.core.common.utils.Pair;
 import 
org.ovirt.engine.core.common.vdscommands.FenceSpmStorageVDSCommandParameters;
 import org.ovirt.engine.core.common.vdscommands.ResetIrsVDSCommandParameters;
@@ -37,9 +36,18 @@
 import org.ovirt.engine.core.utils.linq.LinqUtils;
 import org.ovirt.engine.core.utils.linq.Predicate;
 
+/**
+ * Confirm a host has been rebooted, clear spm flag, its VMs(optional) and 
alerts.
+ *
+ * This command should be run mutually exclusive from other fence actions to 
prevent same action or other fence actions
+ * to clear the VMs and start them.
+ *
+ * @see org.ovirt.engine.core.bll.RestartVdsCommand
+ */
 @LockIdNameAttribute
 public class FenceVdsManualyCommand<T extends FenceVdsManualyParameters> 
extends StorageHandlingCommandBase<T> {
     private final VDS _problematicVds;
+
     /**
      * Constructor for command creation when compensation is applied on startup
      *
@@ -227,8 +235,7 @@
 
     @Override
     protected Map<String, Pair<String, String>> getExclusiveLocks() {
-        return Collections.singletonMap(getProblematicVdsId().toString(),
-                LockMessagesMatchUtil.makeLockingPair(LockingGroup.VDS_FENCE, 
VdcBllMessages.ACTION_TYPE_FAILED_OBJECT_LOCKED));
+        return 
FenceVdsBaseCommand.createFenceExclusiveLocksMap(getProblematicVdsId());
     }
 
     @Override
@@ -236,4 +243,16 @@
         return Collections.singletonList(new 
PermissionSubject(getParameters().getVdsId(), VdcObjectType.VDS,
                 getActionType().getActionGroup()));
     }
+
+    protected void setActionMessageParameters() {
+        addCanDoActionMessage(VdcBllMessages.VAR__ACTION__MANUAL_FENCE);
+        addCanDoActionMessage(VdcBllMessages.VAR__TYPE__HOST);
+     }
+
+    @Override
+    protected void freeLock() {
+        if (getParameters().getParentCommand() != VdcActionType.RestartVds) {
+            super.freeLock();
+        }
+    }
 }
diff --git 
a/backend/manager/modules/common/src/main/java/org/ovirt/engine/core/common/errors/VdcBllMessages.java
 
b/backend/manager/modules/common/src/main/java/org/ovirt/engine/core/common/errors/VdcBllMessages.java
index fc95611..5665c27 100644
--- 
a/backend/manager/modules/common/src/main/java/org/ovirt/engine/core/common/errors/VdcBllMessages.java
+++ 
b/backend/manager/modules/common/src/main/java/org/ovirt/engine/core/common/errors/VdcBllMessages.java
@@ -119,6 +119,7 @@
     VAR__VM_STATUS__IMAGE_ILLEGAL,
     VAR__VM_STATUS__PREPARING_FOR_HIBERNATE,
 
+    POWER_MANAGEMENT_ACTION_ON_ENTITY_ALREADY_IN_PROGRESS(ErrorType.CONFLICT),
     ACTION_LIST_CANNOT_BE_EMPTY(ErrorType.BAD_PARAMETERS),
     ACTION_TYPE_FAILED_NAME_ALREADY_USED(ErrorType.BAD_PARAMETERS),
     ACTION_TYPE_FAILED_VM_IN_PREVIEW(ErrorType.CONFLICT),
diff --git 
a/backend/manager/modules/dal/src/main/resources/bundles/AppErrors.properties 
b/backend/manager/modules/dal/src/main/resources/bundles/AppErrors.properties
index d05c9b1..1814e73 100644
--- 
a/backend/manager/modules/dal/src/main/resources/bundles/AppErrors.properties
+++ 
b/backend/manager/modules/dal/src/main/resources/bundles/AppErrors.properties
@@ -1041,3 +1041,4 @@
 
 # Alignment scan
 ERROR_CANNOT_RUN_ALIGNMENT_SCAN_VM_IS_RUNNING=Cannot ${action} ${type}. 
Alignment scan of a disk attached to a running VM is only supported with RAW 
virtual disks.
+POWER_MANAGEMENT_ACTION_ON_ENTITY_ALREADY_IN_PROGRESS=Cannot perform 
${action}. Another power management action is already in progress.
diff --git 
a/frontend/webadmin/modules/frontend/src/main/java/org/ovirt/engine/ui/frontend/AppErrors.java
 
b/frontend/webadmin/modules/frontend/src/main/java/org/ovirt/engine/ui/frontend/AppErrors.java
index 06ca9fe..24e647f 100644
--- 
a/frontend/webadmin/modules/frontend/src/main/java/org/ovirt/engine/ui/frontend/AppErrors.java
+++ 
b/frontend/webadmin/modules/frontend/src/main/java/org/ovirt/engine/ui/frontend/AppErrors.java
@@ -2758,4 +2758,7 @@
 
     @DefaultStringValue("Cannot ${action} ${type}. Disk ${DiskAlias} alignment 
is currently being determined.")
     String ACTION_TYPE_FAILED_DISK_IS_USED_BY_GET_ALIGNMENT();
+
+    @DefaultStringValue("Cannot perform ${action}. Another power management 
action is already in progress.")
+    String POWER_MANAGEMENT_ACTION_ON_ENTITY_ALREADY_IN_PROGRESS();
 }
diff --git 
a/frontend/webadmin/modules/userportal-gwtp/src/main/resources/org/ovirt/engine/ui/frontend/AppErrors.properties
 
b/frontend/webadmin/modules/userportal-gwtp/src/main/resources/org/ovirt/engine/ui/frontend/AppErrors.properties
index a2e3ae2..10d3649 100644
--- 
a/frontend/webadmin/modules/userportal-gwtp/src/main/resources/org/ovirt/engine/ui/frontend/AppErrors.properties
+++ 
b/frontend/webadmin/modules/userportal-gwtp/src/main/resources/org/ovirt/engine/ui/frontend/AppErrors.properties
@@ -913,3 +913,4 @@
 
 # Alignment scan
 ERROR_CANNOT_RUN_ALIGNMENT_SCAN_VM_IS_RUNNING=Cannot ${action} ${type}. 
Alignment scan of a disk attached to a running VM is only supported with RAW 
virtual disks.
+POWER_MANAGEMENT_ACTION_ON_ENTITY_ALREADY_IN_PROGRESS=Cannot perform 
${action}. Another power management action is already in progress.
diff --git 
a/frontend/webadmin/modules/webadmin/src/main/resources/org/ovirt/engine/ui/frontend/AppErrors.properties
 
b/frontend/webadmin/modules/webadmin/src/main/resources/org/ovirt/engine/ui/frontend/AppErrors.properties
index 5b065be..e11d1ec 100644
--- 
a/frontend/webadmin/modules/webadmin/src/main/resources/org/ovirt/engine/ui/frontend/AppErrors.properties
+++ 
b/frontend/webadmin/modules/webadmin/src/main/resources/org/ovirt/engine/ui/frontend/AppErrors.properties
@@ -1014,3 +1014,4 @@
 
 # Alignment scan
 ERROR_CANNOT_RUN_ALIGNMENT_SCAN_VM_IS_RUNNING=Cannot ${action} ${type}. 
Alignment scan of a disk attached to a running VM is only supported with RAW 
virtual disks.
+POWER_MANAGEMENT_ACTION_ON_ENTITY_ALREADY_IN_PROGRESS=Another power management 
action, ${action}, is already in progress.


-- 
To view, visit http://gerrit.ovirt.org/21143
To unsubscribe, visit http://gerrit.ovirt.org/settings

Gerrit-MessageType: newchange
Gerrit-Change-Id: Iaf021010188e3c3662fbf8a057da4c58f2600c02
Gerrit-PatchSet: 1
Gerrit-Project: ovirt-engine
Gerrit-Branch: ovirt-engine-3.3
Gerrit-Owner: Roy Golan <[email protected]>
_______________________________________________
Engine-patches mailing list
[email protected]
http://lists.ovirt.org/mailman/listinfo/engine-patches

Reply via email to