This is an automated email from the ASF dual-hosted git repository.
rohit pushed a commit to branch 4.20
in repository https://gitbox.apache.org/repos/asf/cloudstack.git
The following commit(s) were added to refs/heads/4.20 by this push:
new 33a37da9ec7 server: investigate pending HA work when executing in new
MS session (#10167)
33a37da9ec7 is described below
commit 33a37da9ec7ef75fa42b9c6f2ca3a8f0ebf072bc
Author: Abhishek Kumar <[email protected]>
AuthorDate: Tue Jan 28 14:39:31 2025 +0530
server: investigate pending HA work when executing in new MS session
(#10167)
For HA work items that are created for host state change, checks must be
done when execution is called in a new management server session.
A new column, reason, has been added to the cloud.op_ha_work table to
track the reason for the HA work.
When HighAvailabilityManager starts it finds and puts all pending HA
work items in Investigating state. During execution of the HA work, if it
is found in Investigating state, checks are done to verify if the work
is still valid. If the job is found to be invalid, it is cancelled.
Signed-off-by: Abhishek Kumar <[email protected]>
---
.../java/com/cloud/ha/HighAvailabilityManager.java | 21 ++++--
.../com/cloud/agent/manager/AgentManagerImpl.java | 2 +-
.../resources/META-INF/db/schema-42000to42010.sql | 3 +
server/src/main/java/com/cloud/ha/HaWorkVO.java | 15 ++++-
.../com/cloud/ha/HighAvailabilityManagerImpl.java | 74 ++++++++++++++++++----
.../java/com/cloud/ha/dao/HighAvailabilityDao.java | 2 +
.../com/cloud/ha/dao/HighAvailabilityDaoImpl.java | 30 ++++++++-
.../com/cloud/resource/ResourceManagerImpl.java | 30 ++++-----
.../ha/provider/host/HAAbstractHostProvider.java | 2 +-
.../cloud/ha/HighAvailabilityManagerImplTest.java | 66 +++++++++++++++++--
.../cloud/ha/dao/HighAvailabilityDaoImplTest.java | 59 ++++++++++++++++-
11 files changed, 260 insertions(+), 44 deletions(-)
diff --git
a/engine/components-api/src/main/java/com/cloud/ha/HighAvailabilityManager.java
b/engine/components-api/src/main/java/com/cloud/ha/HighAvailabilityManager.java
index 728f5a2b180..ddc8153d739 100644
---
a/engine/components-api/src/main/java/com/cloud/ha/HighAvailabilityManager.java
+++
b/engine/components-api/src/main/java/com/cloud/ha/HighAvailabilityManager.java
@@ -84,6 +84,13 @@ public interface HighAvailabilityManager extends Manager {
HA; // Restart a VM.
}
+ enum ReasonType {
+ Unknown,
+ HostMaintenance,
+ HostDown,
+ HostDegraded;
+ }
+
enum Step {
Scheduled, Investigating, Fencing, Stopping, Restarting, Migrating,
Cancelled, Done, Error,
}
@@ -92,7 +99,7 @@ public interface HighAvailabilityManager extends Manager {
* Investigate why a host has disconnected and migrate the VMs on it
* if necessary.
*
- * @param host - the host that has disconnected.
+ * @param hostId - the id of the host that has disconnected.
*/
Status investigate(long hostId);
@@ -109,17 +116,19 @@ public interface HighAvailabilityManager extends Manager {
* @param investigate must be investigated before we do anything with this
vm.
*/
void scheduleRestart(VMInstanceVO vm, boolean investigate);
+ void scheduleRestart(VMInstanceVO vm, boolean investigate, ReasonType
reasonType);
void cancelDestroy(VMInstanceVO vm, Long hostId);
- boolean scheduleDestroy(VMInstanceVO vm, long hostId);
+ boolean scheduleDestroy(VMInstanceVO vm, long hostId, ReasonType
reasonType);
/**
* Schedule restarts for all vms running on the host.
* @param host host.
- * @param investigate TODO
+ * @param investigate whether to investigate
+ * @param reasonType reason for HA work
*/
- void scheduleRestartForVmsOnHost(HostVO host, boolean investigate);
+ void scheduleRestartForVmsOnHost(HostVO host, boolean investigate,
ReasonType reasonType);
/**
* Schedule the vm for migration.
@@ -128,6 +137,7 @@ public interface HighAvailabilityManager extends Manager {
* @return true if schedule worked.
*/
boolean scheduleMigration(VMInstanceVO vm);
+ boolean scheduleMigration(VMInstanceVO vm, ReasonType reasonType);
List<VMInstanceVO> findTakenMigrationWork();
@@ -140,10 +150,11 @@ public interface HighAvailabilityManager extends Manager {
* 3. Check if a VM has been stopped: WorkType.CheckStop
*
* @param vm virtual machine to stop.
- * @param host host the virtual machine is on.
+ * @param hostId the id of the host the virtual machine is on.
* @param type which type of stop is requested.
*/
boolean scheduleStop(VMInstanceVO vm, long hostId, WorkType type);
+ boolean scheduleStop(VMInstanceVO vm, long hostId, WorkType type,
ReasonType reasonType);
void cancelScheduledMigrations(HostVO host);
diff --git
a/engine/orchestration/src/main/java/com/cloud/agent/manager/AgentManagerImpl.java
b/engine/orchestration/src/main/java/com/cloud/agent/manager/AgentManagerImpl.java
index 09fb211fedf..32180a91909 100644
---
a/engine/orchestration/src/main/java/com/cloud/agent/manager/AgentManagerImpl.java
+++
b/engine/orchestration/src/main/java/com/cloud/agent/manager/AgentManagerImpl.java
@@ -989,7 +989,7 @@ public class AgentManagerImpl extends ManagerBase
implements AgentManager, Handl
handleDisconnectWithoutInvestigation(attache, event, true, true);
host = _hostDao.findById(hostId); // Maybe the host magically
reappeared?
if (host != null && host.getStatus() == Status.Down) {
- _haMgr.scheduleRestartForVmsOnHost(host, true);
+ _haMgr.scheduleRestartForVmsOnHost(host, true,
HighAvailabilityManager.ReasonType.HostDown);
}
return true;
}
diff --git
a/engine/schema/src/main/resources/META-INF/db/schema-42000to42010.sql
b/engine/schema/src/main/resources/META-INF/db/schema-42000to42010.sql
index 8b70cce3404..976ef217832 100644
--- a/engine/schema/src/main/resources/META-INF/db/schema-42000to42010.sql
+++ b/engine/schema/src/main/resources/META-INF/db/schema-42000to42010.sql
@@ -35,3 +35,6 @@ CALL `cloud`.`IDEMPOTENT_ADD_COLUMN`('cloud.volumes',
'last_id', 'bigint(20) uns
-- Add used_iops column to support IOPS data in storage stats
CALL `cloud`.`IDEMPOTENT_ADD_COLUMN`('cloud.storage_pool', 'used_iops',
'bigint unsigned DEFAULT NULL COMMENT "IOPS currently in use for this storage
pool" ');
+
+-- Add reason column for op_ha_work
+CALL `cloud`.`IDEMPOTENT_ADD_COLUMN`('cloud.op_ha_work', 'reason',
'varchar(32) DEFAULT NULL COMMENT "Reason for the HA work"');
diff --git a/server/src/main/java/com/cloud/ha/HaWorkVO.java
b/server/src/main/java/com/cloud/ha/HaWorkVO.java
index f5a36e3bb1a..8307f4d9317 100644
--- a/server/src/main/java/com/cloud/ha/HaWorkVO.java
+++ b/server/src/main/java/com/cloud/ha/HaWorkVO.java
@@ -86,6 +86,10 @@ public class HaWorkVO implements InternalIdentity {
@Column(name = "tried")
int timesTried;
+ @Column(name = "reason")
+ @Enumerated(value = EnumType.STRING)
+ private HighAvailabilityManager.ReasonType reasonType;
+
protected HaWorkVO() {
}
@@ -179,7 +183,7 @@ public class HaWorkVO implements InternalIdentity {
}
public HaWorkVO(final long instanceId, final VirtualMachine.Type type,
final WorkType workType, final Step step, final long hostId, final State
previousState,
- final int timesTried, final long updated) {
+ final int timesTried, final long updated,
HighAvailabilityManager.ReasonType reasonType) {
this.workType = workType;
this.type = type;
this.instanceId = instanceId;
@@ -191,6 +195,7 @@ public class HaWorkVO implements InternalIdentity {
this.step = step;
this.timeToTry = System.currentTimeMillis() >> 10;
this.updateTime = updated;
+ this.reasonType = reasonType;
}
@Override
@@ -207,4 +212,12 @@ public class HaWorkVO implements InternalIdentity {
.append("]")
.toString();
}
+
+ public HighAvailabilityManager.ReasonType getReasonType() {
+ return reasonType;
+ }
+
+ public void setReasonType(HighAvailabilityManager.ReasonType reasonType) {
+ this.reasonType = reasonType;
+ }
}
diff --git a/server/src/main/java/com/cloud/ha/HighAvailabilityManagerImpl.java
b/server/src/main/java/com/cloud/ha/HighAvailabilityManagerImpl.java
index e10bd47a067..2ce803756fe 100644
--- a/server/src/main/java/com/cloud/ha/HighAvailabilityManagerImpl.java
+++ b/server/src/main/java/com/cloud/ha/HighAvailabilityManagerImpl.java
@@ -19,6 +19,7 @@ package com.cloud.ha;
import static org.apache.cloudstack.framework.config.ConfigKey.Scope.Zone;
import java.util.ArrayList;
+import java.util.Arrays;
import java.util.Date;
import java.util.HashMap;
import java.util.List;
@@ -43,6 +44,7 @@ import
org.apache.cloudstack.framework.config.dao.ConfigurationDao;
import org.apache.cloudstack.managed.context.ManagedContext;
import org.apache.cloudstack.managed.context.ManagedContextRunnable;
import org.apache.cloudstack.management.ManagementServerHost;
+import org.apache.logging.log4j.ThreadContext;
import com.cloud.agent.AgentManager;
import com.cloud.alert.AlertManager;
@@ -90,7 +92,6 @@ import com.cloud.vm.VirtualMachine;
import com.cloud.vm.VirtualMachineManager;
import com.cloud.vm.VirtualMachineProfile;
import com.cloud.vm.dao.VMInstanceDao;
-import org.apache.logging.log4j.ThreadContext;
/**
* HighAvailabilityManagerImpl coordinates the HA process. VMs are registered
with the HA Manager for HA. The request is stored
@@ -133,6 +134,9 @@ public class HighAvailabilityManagerImpl extends
ManagerBase implements Configur
protected static ConfigKey<Boolean> VmHaAlertsEnabled = new
ConfigKey<>("Advanced", Boolean.class, "vm.ha.alerts.enabled", "true",
"Enable/Disable alerts for the VM HA operations, it is enabled by
default.", true, Zone);
+ protected static final List<ReasonType> CancellableWorkReasonTypes =
+ Arrays.asList(ReasonType.HostMaintenance, ReasonType.HostDown,
ReasonType.HostDegraded);
+
WorkerThread[] _workers;
boolean _stopped;
long _timeToSleep;
@@ -269,8 +273,7 @@ public class HighAvailabilityManagerImpl extends
ManagerBase implements Configur
}
@Override
- public void scheduleRestartForVmsOnHost(final HostVO host, boolean
investigate) {
-
+ public void scheduleRestartForVmsOnHost(final HostVO host, boolean
investigate, ReasonType reasonType) {
if (host.getType() != Host.Type.Routing) {
return;
}
@@ -337,12 +340,12 @@ public class HighAvailabilityManagerImpl extends
ManagerBase implements Configur
logger.debug("VM {} is not on down host {} it is on other host
{} VM HA is done", vm, host, hostId);
continue;
}
- scheduleRestart(vm, investigate);
+ scheduleRestart(vm, investigate, reasonType);
}
}
@Override
- public boolean scheduleStop(VMInstanceVO vm, long hostId, WorkType type) {
+ public boolean scheduleStop(VMInstanceVO vm, long hostId, WorkType type,
ReasonType reasonType) {
assert (type == WorkType.CheckStop || type == WorkType.ForceStop ||
type == WorkType.Stop);
if (_haDao.hasBeenScheduled(vm.getId(), type)) {
@@ -359,7 +362,7 @@ public class HighAvailabilityManagerImpl extends
ManagerBase implements Configur
return false;
}
- HaWorkVO work = new HaWorkVO(vm.getId(), vm.getType(), type,
Step.Scheduled, hostId, vm.getState(), 0, vm.getUpdated());
+ HaWorkVO work = new HaWorkVO(vm.getId(), vm.getType(), type,
Step.Scheduled, hostId, vm.getState(), 0, vm.getUpdated(), reasonType);
_haDao.persist(work);
if (logger.isDebugEnabled()) {
logger.debug("Scheduled " + work);
@@ -368,6 +371,11 @@ public class HighAvailabilityManagerImpl extends
ManagerBase implements Configur
return true;
}
+ @Override
+ public boolean scheduleStop(VMInstanceVO vm, long hostId, WorkType type) {
+ return scheduleStop(vm, hostId, type, null);
+ }
+
protected void wakeupWorkers() {
logger.debug("Wakeup workers HA");
for (WorkerThread worker : _workers) {
@@ -376,7 +384,7 @@ public class HighAvailabilityManagerImpl extends
ManagerBase implements Configur
}
@Override
- public boolean scheduleMigration(final VMInstanceVO vm) {
+ public boolean scheduleMigration(final VMInstanceVO vm, ReasonType
reasonType) {
if (vm.getHostId() == null) {
return false;
}
@@ -390,7 +398,7 @@ public class HighAvailabilityManagerImpl extends
ManagerBase implements Configur
return false;
}
- final HaWorkVO work = new HaWorkVO(vm.getId(), vm.getType(),
WorkType.Migration, Step.Scheduled, vm.getHostId(), vm.getState(), 0,
vm.getUpdated());
+ final HaWorkVO work = new HaWorkVO(vm.getId(), vm.getType(),
WorkType.Migration, Step.Scheduled, vm.getHostId(), vm.getState(), 0,
vm.getUpdated(), reasonType);
_haDao.persist(work);
logger.info("Scheduled migration work of VM {} from host {} with
HAWork {}", vm, _hostDao.findById(vm.getHostId()), work);
wakeupWorkers();
@@ -398,7 +406,12 @@ public class HighAvailabilityManagerImpl extends
ManagerBase implements Configur
}
@Override
- public void scheduleRestart(VMInstanceVO vm, boolean investigate) {
+ public boolean scheduleMigration(final VMInstanceVO vm) {
+ return scheduleMigration(vm, null);
+ }
+
+ @Override
+ public void scheduleRestart(VMInstanceVO vm, boolean investigate,
ReasonType reasonType) {
if (!VmHaEnabled.valueIn(vm.getDataCenterId())) {
String message = String.format("Unable to schedule restart for the
VM %s (%d), VM high availability manager is disabled.", vm.getName(),
vm.getId());
if (logger.isDebugEnabled()) {
@@ -490,7 +503,7 @@ public class HighAvailabilityManagerImpl extends
ManagerBase implements Configur
}
HaWorkVO work = new HaWorkVO(vm.getId(), vm.getType(), WorkType.HA,
investigate ? Step.Investigating : Step.Scheduled,
- hostId != null ? hostId : 0L, vm.getState(), timesTried,
vm.getUpdated());
+ hostId != null ? hostId : 0L, vm.getState(), timesTried,
vm.getUpdated(), reasonType);
_haDao.persist(work);
if (logger.isInfoEnabled()) {
@@ -500,6 +513,11 @@ public class HighAvailabilityManagerImpl extends
ManagerBase implements Configur
wakeupWorkers();
}
+ @Override
+ public void scheduleRestart(VMInstanceVO vm, boolean investigate) {
+ scheduleRestart(vm, investigate, null);
+ }
+
private void startVm(VirtualMachine vm, Map<VirtualMachineProfile.Param,
Object> params,
DeploymentPlanner planner) throws InsufficientCapacityException,
ResourceUnavailableException,
ConcurrentOperationException, OperationTimedoutException {
@@ -561,6 +579,9 @@ public class HighAvailabilityManagerImpl extends
ManagerBase implements Configur
logger.info("Unable to find vm: " + vmId);
return null;
}
+ if (checkAndCancelWorkIfNeeded(work)) {
+ return null;
+ }
logger.info("HA on " + vm);
if (vm.getState() != work.getPreviousState() || vm.getUpdated() !=
work.getUpdateTime()) {
@@ -762,6 +783,22 @@ public class HighAvailabilityManagerImpl extends
ManagerBase implements Configur
return (System.currentTimeMillis() >> 10) + _restartRetryInterval;
}
+ protected boolean checkAndCancelWorkIfNeeded(final HaWorkVO work) {
+ if (!Step.Investigating.equals(work.getStep())) {
+ return false;
+ }
+ if (!CancellableWorkReasonTypes.contains(work.getReasonType())) {
+ return false;
+ }
+ Status hostStatus = investigate(work.getHostId());
+ if (!Status.Up.equals(hostStatus)) {
+ return false;
+ }
+ logger.debug("Cancelling {} as it is not needed anymore", () -> work);
+ work.setStep(Step.Cancelled);
+ return true;
+ }
+
public Long migrate(final HaWorkVO work) {
long vmId = work.getInstanceId();
long srcHostId = work.getHostId();
@@ -772,6 +809,9 @@ public class HighAvailabilityManagerImpl extends
ManagerBase implements Configur
logger.info("Unable to find vm: " + vmId + ", skipping migrate.");
return null;
}
+ if (checkAndCancelWorkIfNeeded(work)) {
+ return null;
+ }
logger.info("Migration attempt: for VM {}from host {}. Starting
attempt: {}/{} times.", vm, srcHost, 1 + work.getTimesTried(), _maxRetries);
try {
work.setStep(Step.Migrating);
@@ -791,7 +831,7 @@ public class HighAvailabilityManagerImpl extends
ManagerBase implements Configur
}
@Override
- public boolean scheduleDestroy(VMInstanceVO vm, long hostId) {
+ public boolean scheduleDestroy(VMInstanceVO vm, long hostId, ReasonType
reasonType) {
if (!VmHaEnabled.valueIn(vm.getDataCenterId())) {
String message = String.format("Unable to schedule destroy for the
VM %s (%d) on host %d, VM high availability manager is disabled.",
vm.getName(), vm.getId(), hostId);
if (logger.isDebugEnabled()) {
@@ -801,7 +841,7 @@ public class HighAvailabilityManagerImpl extends
ManagerBase implements Configur
return false;
}
- final HaWorkVO work = new HaWorkVO(vm.getId(), vm.getType(),
WorkType.Destroy, Step.Scheduled, hostId, vm.getState(), 0, vm.getUpdated());
+ final HaWorkVO work = new HaWorkVO(vm.getId(), vm.getType(),
WorkType.Destroy, Step.Scheduled, hostId, vm.getState(), 0, vm.getUpdated(),
reasonType);
_haDao.persist(work);
if (logger.isDebugEnabled()) {
logger.debug("Scheduled " + work.toString());
@@ -838,6 +878,9 @@ public class HighAvailabilityManagerImpl extends
ManagerBase implements Configur
logger.info("No longer can find VM " + work.getInstanceId() + ".
Throwing away " + work);
return null;
}
+ if (checkAndCancelWorkIfNeeded(work)) {
+ return null;
+ }
boolean expunge =
VirtualMachine.Type.SecondaryStorageVm.equals(vm.getType())
|| VirtualMachine.Type.ConsoleProxy.equals(vm.getType());
if (!expunge &&
VirtualMachine.State.Destroyed.equals(work.getPreviousState())) {
@@ -872,6 +915,9 @@ public class HighAvailabilityManagerImpl extends
ManagerBase implements Configur
work.setStep(Step.Done);
return null;
}
+ if (checkAndCancelWorkIfNeeded(work)) {
+ return null;
+ }
logger.info("Stopping " + vm);
try {
if (work.getWorkType() == WorkType.Stop) {
@@ -1057,6 +1103,8 @@ public class HighAvailabilityManagerImpl extends
ManagerBase implements Configur
public boolean start() {
_stopped = false;
+ _haDao.markPendingWorksAsInvestigating();
+
for (final WorkerThread thread : _workers) {
thread.start();
}
@@ -1074,6 +1122,8 @@ public class HighAvailabilityManagerImpl extends
ManagerBase implements Configur
_executor.shutdown();
+ _haDao.markServerPendingWorksAsInvestigating(_msServer.getId());
+
return true;
}
diff --git a/server/src/main/java/com/cloud/ha/dao/HighAvailabilityDao.java
b/server/src/main/java/com/cloud/ha/dao/HighAvailabilityDao.java
index f6539105d78..42c8aabe41a 100644
--- a/server/src/main/java/com/cloud/ha/dao/HighAvailabilityDao.java
+++ b/server/src/main/java/com/cloud/ha/dao/HighAvailabilityDao.java
@@ -86,4 +86,6 @@ public interface HighAvailabilityDao extends
GenericDao<HaWorkVO, Long> {
List<HaWorkVO> listPendingMigrationsForVm(long vmId);
int expungeByVmList(List<Long> vmIds, Long batchSize);
+ void markPendingWorksAsInvestigating();
+ void markServerPendingWorksAsInvestigating(long managementServerId);
}
diff --git a/server/src/main/java/com/cloud/ha/dao/HighAvailabilityDaoImpl.java
b/server/src/main/java/com/cloud/ha/dao/HighAvailabilityDaoImpl.java
index c722c6376c1..00b62e0d601 100644
--- a/server/src/main/java/com/cloud/ha/dao/HighAvailabilityDaoImpl.java
+++ b/server/src/main/java/com/cloud/ha/dao/HighAvailabilityDaoImpl.java
@@ -31,12 +31,13 @@ import com.cloud.utils.db.SearchBuilder;
import com.cloud.utils.db.SearchCriteria;
import com.cloud.utils.db.SearchCriteria.Op;
import com.cloud.utils.db.TransactionLegacy;
+import com.cloud.utils.db.UpdateBuilder;
import com.cloud.utils.exception.CloudRuntimeException;
@Component
public class HighAvailabilityDaoImpl extends GenericDaoBase<HaWorkVO, Long>
implements HighAvailabilityDao {
- private final SearchBuilder<HaWorkVO> TBASearch;
+ protected SearchBuilder<HaWorkVO> TBASearch;
private final SearchBuilder<HaWorkVO> PreviousInstanceSearch;
private final SearchBuilder<HaWorkVO> UntakenMigrationSearch;
private final SearchBuilder<HaWorkVO> CleanupSearch;
@@ -270,4 +271,31 @@ public class HighAvailabilityDaoImpl extends
GenericDaoBase<HaWorkVO, Long> impl
sc.setParameters("vmIds", vmIds.toArray());
return batchExpunge(sc, batchSize);
}
+
+ protected void updatePendingWorkToInvestigating(SearchCriteria<HaWorkVO>
sc) {
+ HaWorkVO haWorkVO = createForUpdate();
+ haWorkVO.setStep(Step.Investigating);
+ UpdateBuilder updateBuilder = getUpdateBuilder(haWorkVO);
+ update(updateBuilder, sc, null);
+ }
+
+ @Override
+ public void markPendingWorksAsInvestigating() {
+ final SearchCriteria<HaWorkVO> sc = TBASearch.create();
+ sc.setParameters("time", System.currentTimeMillis() >> 10);
+ sc.setParameters("step", Step.Done, Step.Cancelled);
+ updatePendingWorkToInvestigating(sc);
+ }
+
+ @Override
+ public void markServerPendingWorksAsInvestigating(long managementServerId)
{
+ SearchBuilder<HaWorkVO> sb = createSearchBuilder();
+ sb.and("server", sb.entity().getServerId(), Op.EQ);
+ sb.and("step", sb.entity().getStep(), Op.NIN);
+ sb.done();
+ SearchCriteria<HaWorkVO> sc = sb.create();
+ sc.setParameters("server", managementServerId);
+ sc.setParameters("step", Step.Done, Step.Cancelled);
+ updatePendingWorkToInvestigating(sc);
+ }
}
diff --git a/server/src/main/java/com/cloud/resource/ResourceManagerImpl.java
b/server/src/main/java/com/cloud/resource/ResourceManagerImpl.java
index 50116905bfe..1349e03f205 100755
--- a/server/src/main/java/com/cloud/resource/ResourceManagerImpl.java
+++ b/server/src/main/java/com/cloud/resource/ResourceManagerImpl.java
@@ -37,16 +37,6 @@ import java.util.stream.Collectors;
import javax.inject.Inject;
import javax.naming.ConfigurationException;
-import com.cloud.alert.AlertManager;
-import com.cloud.cpu.CPU;
-import com.cloud.exception.StorageConflictException;
-import com.cloud.exception.StorageUnavailableException;
-import com.cloud.ha.HighAvailabilityManagerImpl;
-import com.cloud.host.HostTagVO;
-import com.cloud.storage.Volume;
-import com.cloud.storage.VolumeVO;
-import com.cloud.storage.dao.VolumeDao;
-import com.cloud.hypervisor.HypervisorGuru;
import org.apache.cloudstack.alert.AlertService;
import org.apache.cloudstack.annotation.AnnotationService;
import org.apache.cloudstack.annotation.dao.AnnotationDao;
@@ -93,6 +83,7 @@ import com.cloud.agent.api.UpdateHostPasswordCommand;
import com.cloud.agent.api.VgpuTypesInfo;
import com.cloud.agent.api.to.GPUDeviceTO;
import com.cloud.agent.transport.Request;
+import com.cloud.alert.AlertManager;
import com.cloud.capacity.Capacity;
import com.cloud.capacity.CapacityManager;
import com.cloud.capacity.CapacityState;
@@ -101,6 +92,7 @@ import com.cloud.capacity.dao.CapacityDao;
import com.cloud.cluster.ClusterManager;
import com.cloud.configuration.Config;
import com.cloud.configuration.ConfigurationManager;
+import com.cloud.cpu.CPU;
import com.cloud.dc.ClusterDetailsDao;
import com.cloud.dc.ClusterDetailsVO;
import com.cloud.dc.ClusterVO;
@@ -134,6 +126,8 @@ import com.cloud.exception.InvalidParameterValueException;
import com.cloud.exception.PermissionDeniedException;
import com.cloud.exception.ResourceInUseException;
import com.cloud.exception.ResourceUnavailableException;
+import com.cloud.exception.StorageConflictException;
+import com.cloud.exception.StorageUnavailableException;
import com.cloud.gpu.GPU;
import com.cloud.gpu.HostGpuGroupsVO;
import com.cloud.gpu.VGPUTypesVO;
@@ -141,10 +135,12 @@ import com.cloud.gpu.dao.HostGpuGroupsDao;
import com.cloud.gpu.dao.VGPUTypesDao;
import com.cloud.ha.HighAvailabilityManager;
import com.cloud.ha.HighAvailabilityManager.WorkType;
+import com.cloud.ha.HighAvailabilityManagerImpl;
import com.cloud.host.DetailVO;
import com.cloud.host.Host;
import com.cloud.host.Host.Type;
import com.cloud.host.HostStats;
+import com.cloud.host.HostTagVO;
import com.cloud.host.HostVO;
import com.cloud.host.Status;
import com.cloud.host.Status.Event;
@@ -153,6 +149,7 @@ import com.cloud.host.dao.HostDetailsDao;
import com.cloud.host.dao.HostTagsDao;
import com.cloud.hypervisor.Hypervisor;
import com.cloud.hypervisor.Hypervisor.HypervisorType;
+import com.cloud.hypervisor.HypervisorGuru;
import com.cloud.hypervisor.kvm.discoverer.KvmDummyResourceBase;
import com.cloud.network.dao.IPAddressDao;
import com.cloud.network.dao.IPAddressVO;
@@ -170,10 +167,13 @@ import com.cloud.storage.StoragePoolHostVO;
import com.cloud.storage.StoragePoolStatus;
import com.cloud.storage.StorageService;
import com.cloud.storage.VMTemplateVO;
+import com.cloud.storage.Volume;
+import com.cloud.storage.VolumeVO;
import com.cloud.storage.dao.DiskOfferingDao;
import com.cloud.storage.dao.GuestOSCategoryDao;
import com.cloud.storage.dao.StoragePoolHostDao;
import com.cloud.storage.dao.VMTemplateDao;
+import com.cloud.storage.dao.VolumeDao;
import com.cloud.user.Account;
import com.cloud.user.AccountManager;
import com.cloud.utils.Ternary;
@@ -1348,7 +1348,7 @@ public class ResourceManagerImpl extends ManagerBase
implements ResourceManager,
if (VirtualMachine.Type.SecondaryStorageVm.equals(vm.getType())
|| VirtualMachine.Type.ConsoleProxy.equals(vm.getType())) {
logger.error("Maintenance: VM is of type {}. Destroying VM {}
immediately instead of migration.", vm.getType(), vm);
- _haMgr.scheduleDestroy(vm, host.getId());
+ _haMgr.scheduleDestroy(vm, host.getId(),
HighAvailabilityManager.ReasonType.HostMaintenance);
return;
}
logger.error("Maintenance: No hosts available for migrations.
Scheduling shutdown for VM {} instead of migration.", vm);
@@ -1405,10 +1405,10 @@ public class ResourceManagerImpl extends ManagerBase
implements ResourceManager,
handleVmForLastHostOrWithVGpu(host, vm);
} else if (HypervisorType.LXC.equals(host.getHypervisorType())
&& VirtualMachine.Type.User.equals(vm.getType())){
//Migration is not supported for LXC Vms. Schedule restart
instead.
- _haMgr.scheduleRestart(vm, false);
+ _haMgr.scheduleRestart(vm, false,
HighAvailabilityManager.ReasonType.HostMaintenance);
} else if (userVmManager.isVMUsingLocalStorage(vm)) {
if (isMaintenanceLocalStrategyForceStop()) {
- _haMgr.scheduleStop(vm, hostId, WorkType.ForceStop);
+ _haMgr.scheduleStop(vm, hostId, WorkType.ForceStop,
HighAvailabilityManager.ReasonType.HostMaintenance);
} else if (isMaintenanceLocalStrategyMigrate()) {
migrateAwayVmWithVolumes(host, vm);
} else if (!isMaintenanceLocalStrategyDefault()){
@@ -1421,7 +1421,7 @@ public class ResourceManagerImpl extends ManagerBase
implements ResourceManager,
}
} else {
logger.info("Maintenance: scheduling migration of VM {}
from host {}", vm, host);
- _haMgr.scheduleMigration(vm);
+ _haMgr.scheduleMigration(vm,
HighAvailabilityManager.ReasonType.HostMaintenance);
}
}
}
@@ -1637,7 +1637,7 @@ public class ResourceManagerImpl extends ManagerBase
implements ResourceManager,
for (VMInstanceVO vm : allVmsOnHost) {
State vmState = vm.getState();
if (vmState == State.Starting || vmState == State.Running ||
vmState == State.Stopping) {
- _haMgr.scheduleRestart(vm, false);
+ _haMgr.scheduleRestart(vm, false,
HighAvailabilityManager.ReasonType.HostDegraded);
}
}
}
diff --git
a/server/src/main/java/org/apache/cloudstack/ha/provider/host/HAAbstractHostProvider.java
b/server/src/main/java/org/apache/cloudstack/ha/provider/host/HAAbstractHostProvider.java
index 5907c1864ad..2d77e6f9d20 100644
---
a/server/src/main/java/org/apache/cloudstack/ha/provider/host/HAAbstractHostProvider.java
+++
b/server/src/main/java/org/apache/cloudstack/ha/provider/host/HAAbstractHostProvider.java
@@ -74,7 +74,7 @@ public abstract class HAAbstractHostProvider extends
AdapterBase implements HAPr
try {
logger.debug("Trying to disconnect the host without
investigation and scheduling HA for the VMs on host {}", host);
agentManager.disconnectWithoutInvestigation(host.getId(),
Event.HostDown);
-
oldHighAvailabilityManager.scheduleRestartForVmsOnHost((HostVO)host, true);
+
oldHighAvailabilityManager.scheduleRestartForVmsOnHost((HostVO)host, true,
HighAvailabilityManager.ReasonType.HostDown);
} catch (Exception e) {
logger.error("Failed to disconnect host and schedule HA
restart of VMs after fencing the host: ", e);
}
diff --git
a/server/src/test/java/com/cloud/ha/HighAvailabilityManagerImplTest.java
b/server/src/test/java/com/cloud/ha/HighAvailabilityManagerImplTest.java
index 53ae5d2279e..24714b72388 100644
--- a/server/src/test/java/com/cloud/ha/HighAvailabilityManagerImplTest.java
+++ b/server/src/test/java/com/cloud/ha/HighAvailabilityManagerImplTest.java
@@ -135,6 +135,9 @@ public class HighAvailabilityManagerImplTest {
@Mock
UserVmManager userVmManager;
+ @Mock
+ private HaWorkVO mockWork;
+
HighAvailabilityManagerImpl highAvailabilityManager;
HighAvailabilityManagerImpl highAvailabilityManagerSpy;
static Method processWorkMethod = null;
@@ -185,7 +188,7 @@ public class HighAvailabilityManagerImplTest {
highAvailabilityManager.VmHaEnabled = haEnabled;
Mockito.when(highAvailabilityManager.VmHaEnabled.valueIn(1L)).thenReturn(true);
- highAvailabilityManager.scheduleRestartForVmsOnHost(hostVO, true);
+ highAvailabilityManager.scheduleRestartForVmsOnHost(hostVO, true,
HighAvailabilityManager.ReasonType.HostDown);
}
@Test
@@ -193,7 +196,7 @@ public class HighAvailabilityManagerImplTest {
Mockito.when(hostVO.getType()).thenReturn(Host.Type.Routing);
Mockito.when(hostVO.getHypervisorType()).thenReturn(HypervisorType.VMware);
- highAvailabilityManager.scheduleRestartForVmsOnHost(hostVO, true);
+ highAvailabilityManager.scheduleRestartForVmsOnHost(hostVO, true,
HighAvailabilityManager.ReasonType.HostDown);
}
@Test
@@ -206,7 +209,7 @@ public class HighAvailabilityManagerImplTest {
highAvailabilityManager.VmHaEnabled = haEnabled;
Mockito.when(highAvailabilityManager.VmHaEnabled.valueIn(1L)).thenReturn(false);
- highAvailabilityManager.scheduleRestartForVmsOnHost(hostVO, true);
+ highAvailabilityManager.scheduleRestartForVmsOnHost(hostVO, true,
HighAvailabilityManager.ReasonType.HostDown);
}
@Test
@@ -240,7 +243,7 @@ public class HighAvailabilityManagerImplTest {
highAvailabilityManager.VmHaEnabled = haEnabled;
Mockito.when(highAvailabilityManager.VmHaEnabled.valueIn(1L)).thenReturn(true);
- highAvailabilityManager.scheduleRestartForVmsOnHost(hostVO, true);
+ highAvailabilityManager.scheduleRestartForVmsOnHost(hostVO, true,
HighAvailabilityManager.ReasonType.HostDown);
}
@Test
@@ -336,7 +339,7 @@ public class HighAvailabilityManagerImplTest {
Mockito.when(vm.getState()).thenReturn(VirtualMachine.State.Running);
Mockito.when(_haDao.persist((HaWorkVO)Mockito.any())).thenReturn(Mockito.mock(HaWorkVO.class));
- assertTrue(highAvailabilityManager.scheduleDestroy(vm, 1L));
+ assertTrue(highAvailabilityManager.scheduleDestroy(vm, 1L,
HighAvailabilityManager.ReasonType.HostMaintenance));
}
@Test
@@ -348,7 +351,7 @@ public class HighAvailabilityManagerImplTest {
highAvailabilityManager.VmHaEnabled = haEnabled;
Mockito.when(highAvailabilityManager.VmHaEnabled.valueIn(1L)).thenReturn(false);
- assertFalse(highAvailabilityManager.scheduleDestroy(vm, 1L));
+ assertFalse(highAvailabilityManager.scheduleDestroy(vm, 1L,
HighAvailabilityManager.ReasonType.HostMaintenance));
}
@Test
@@ -402,7 +405,7 @@ public class HighAvailabilityManagerImplTest {
private void processWorkWithRetryCount(int count, Step expectedStep) {
assertNotNull(processWorkMethod);
- HaWorkVO work = new HaWorkVO(1l, VirtualMachine.Type.User,
WorkType.Migration, Step.Scheduled, 1l, VirtualMachine.State.Running, count,
12345678l);
+ HaWorkVO work = new HaWorkVO(1l, VirtualMachine.Type.User,
WorkType.Migration, Step.Scheduled, 1l, VirtualMachine.State.Running, count,
12345678l, null);
Mockito.doReturn(12345678l).when(highAvailabilityManagerSpy).migrate(work);
try {
processWorkMethod.invoke(highAvailabilityManagerSpy, work);
@@ -425,4 +428,53 @@ public class HighAvailabilityManagerImplTest {
public void processWorkWithRetryCountNotExceeded() {
processWorkWithRetryCount(3, Step.Scheduled);
}
+
+ @Test
+ public void testCheckAndCancelWorkIfNeeded_Success() {
+ Mockito.when(mockWork.getStep()).thenReturn(Step.Investigating);
+ Mockito.when(mockWork.getReasonType()).thenReturn(HighAvailabilityManager.ReasonType.HostMaintenance);
+ Mockito.when(mockWork.getHostId()).thenReturn(1L);
+ Mockito.doReturn(Status.Up).when(highAvailabilityManagerSpy).investigate(1L);
+ Mockito.doNothing().when(mockWork).setStep(Step.Cancelled);
+ boolean result = highAvailabilityManagerSpy.checkAndCancelWorkIfNeeded(mockWork);
+ assertTrue(result);
+ Mockito.verify(mockWork).setStep(Step.Cancelled);
+ }
+
+ @Test
+ public void testCheckAndCancelWorkIfNeeded_StepNotInvestigating() {
+ Mockito.when(mockWork.getStep()).thenReturn(Step.Cancelled);
+ boolean result = highAvailabilityManagerSpy.checkAndCancelWorkIfNeeded(mockWork);
+ assertFalse(result);
+ Mockito.verify(mockWork, Mockito.never()).setStep(Mockito.any());
+ }
+
+ private void runInvalidReasonCheckAndCancelWorkIfNeeded(HighAvailabilityManager.ReasonType reasonType) {
+ Mockito.when(mockWork.getStep()).thenReturn(Step.Investigating);
+ Mockito.when(mockWork.getReasonType()).thenReturn(reasonType);
+ boolean result = highAvailabilityManagerSpy.checkAndCancelWorkIfNeeded(mockWork);
+ assertFalse(result);
+ Mockito.verify(mockWork, Mockito.never()).setStep(Mockito.any());
+ }
+
+ @Test
+ public void testCheckAndCancelWorkIfNeeded_InvalidReasonType() {
+ runInvalidReasonCheckAndCancelWorkIfNeeded(HighAvailabilityManager.ReasonType.Unknown);
+ }
+
+ @Test
+ public void testCheckAndCancelWorkIfNeeded_NullReasonType() {
+ runInvalidReasonCheckAndCancelWorkIfNeeded(null);
+ }
+
+ @Test
+ public void testCheckAndCancelWorkIfNeeded_HostStatusNotUp() {
+ Mockito.when(mockWork.getStep()).thenReturn(Step.Investigating);
+ Mockito.when(mockWork.getReasonType()).thenReturn(HighAvailabilityManager.ReasonType.HostDown);
+ Mockito.when(mockWork.getHostId()).thenReturn(1L);
+ Mockito.doReturn(Status.Down).when(highAvailabilityManagerSpy).investigate(1L);
+ boolean result = highAvailabilityManagerSpy.checkAndCancelWorkIfNeeded(mockWork);
+ assertFalse(result);
+ Mockito.verify(mockWork, Mockito.never()).setStep(Mockito.any());
+ }
}
diff --git a/server/src/test/java/com/cloud/ha/dao/HighAvailabilityDaoImplTest.java b/server/src/test/java/com/cloud/ha/dao/HighAvailabilityDaoImplTest.java
index 783497740fd..c2b95474d0b 100644
--- a/server/src/test/java/com/cloud/ha/dao/HighAvailabilityDaoImplTest.java
+++ b/server/src/test/java/com/cloud/ha/dao/HighAvailabilityDaoImplTest.java
@@ -22,20 +22,25 @@ import java.util.List;
import org.junit.Assert;
import org.junit.Test;
import org.junit.runner.RunWith;
+import org.mockito.MockedStatic;
import org.mockito.Mockito;
import org.mockito.Spy;
import org.mockito.junit.MockitoJUnitRunner;
import org.mockito.stubbing.Answer;
import com.cloud.ha.HaWorkVO;
+import com.cloud.ha.HighAvailabilityManager;
+import com.cloud.utils.db.GenericDaoBase;
import com.cloud.utils.db.SearchBuilder;
import com.cloud.utils.db.SearchCriteria;
+import com.cloud.utils.db.UpdateBuilder;
+import com.cloud.vm.VirtualMachine;
@RunWith(MockitoJUnitRunner.class)
public class HighAvailabilityDaoImplTest {
@Spy
- HighAvailabilityDaoImpl highAvailabilityDaoImpl;
+ HighAvailabilityDaoImpl highAvailabilityDaoImpl = new HighAvailabilityDaoImpl();
@Test
public void testExpungeByVmListNoVms() {
@@ -65,4 +70,56 @@ public class HighAvailabilityDaoImplTest {
Mockito.verify(highAvailabilityDaoImpl, Mockito.times(1))
.batchExpunge(sc, batchSize);
}
+
+ @Test
+ public void testMarkPendingWorksAsInvestigating() throws Exception {
+ SearchBuilder<HaWorkVO> mockTBASearch = Mockito.mock(SearchBuilder.class);
+ highAvailabilityDaoImpl.TBASearch = mockTBASearch;
+ SearchCriteria<HaWorkVO> mockSearchCriteria = Mockito.mock(SearchCriteria.class);
+ UpdateBuilder mockUpdateBuilder = Mockito.mock(UpdateBuilder.class);
+ Mockito.when(mockTBASearch.create()).thenReturn(mockSearchCriteria);
+ Mockito.doNothing().when(mockSearchCriteria).setParameters(Mockito.eq("time"), Mockito.anyLong());
+ Mockito.doNothing().when(mockSearchCriteria).setParameters(Mockito.eq("step"), Mockito.eq(HighAvailabilityManager.Step.Done), Mockito.eq(HighAvailabilityManager.Step.Cancelled));
+ HaWorkVO haWorkVO = new HaWorkVO(1L, VirtualMachine.Type.User, null,
+ null, 1L, null, 0, 0,
+ HighAvailabilityManager.ReasonType.HostMaintenance);
+ Mockito.when(highAvailabilityDaoImpl.createForUpdate()).thenReturn(haWorkVO);
+ try(MockedStatic<GenericDaoBase> genericDaoBaseMockedStatic = Mockito.mockStatic(GenericDaoBase.class)) {
+ genericDaoBaseMockedStatic.when(() -> GenericDaoBase.getUpdateBuilder(Mockito.any())).thenReturn(mockUpdateBuilder);
+ Mockito.doReturn(5).when(highAvailabilityDaoImpl).update(Mockito.any(UpdateBuilder.class), Mockito.any(), Mockito.nullable(Integer.class));
+ highAvailabilityDaoImpl.markPendingWorksAsInvestigating();
+ Mockito.verify(mockTBASearch).create();
+ Mockito.verify(mockSearchCriteria).setParameters(Mockito.eq("time"), Mockito.anyLong());
+ Mockito.verify(mockSearchCriteria).setParameters(Mockito.eq("step"), Mockito.eq(HighAvailabilityManager.Step.Done), Mockito.eq(HighAvailabilityManager.Step.Cancelled));
+ Assert.assertEquals(HighAvailabilityManager.Step.Investigating, haWorkVO.getStep()); // Ensure the step is set correctly
+ Mockito.verify(highAvailabilityDaoImpl).update(Mockito.eq(mockUpdateBuilder), Mockito.eq(mockSearchCriteria), Mockito.isNull());
+ }
+ }
+
+ @Test
+ public void testMarkServerPendingWorksAsInvestigating() {
+ SearchBuilder<HaWorkVO> mockSearch = Mockito.mock(SearchBuilder.class);
+ Mockito.doReturn(Mockito.mock(HaWorkVO.class)).when(mockSearch).entity();
+ Mockito.doReturn(mockSearch).when(highAvailabilityDaoImpl).createSearchBuilder();
+ SearchCriteria<HaWorkVO> mockSearchCriteria = Mockito.mock(SearchCriteria.class);
+ UpdateBuilder mockUpdateBuilder = Mockito.mock(UpdateBuilder.class);
+ Mockito.when(mockSearch.create()).thenReturn(mockSearchCriteria);
+ Mockito.doNothing().when(mockSearchCriteria).setParameters(Mockito.eq("server"), Mockito.eq(1L));
+ Mockito.doNothing().when(mockSearchCriteria).setParameters(Mockito.eq("step"), Mockito.eq(HighAvailabilityManager.Step.Done), Mockito.eq(HighAvailabilityManager.Step.Cancelled));
+ HaWorkVO haWorkVO = new HaWorkVO(1L, VirtualMachine.Type.User, null,
+ null, 1L, null, 0, 0,
+ HighAvailabilityManager.ReasonType.HostMaintenance);
+ Mockito.when(highAvailabilityDaoImpl.createForUpdate()).thenReturn(haWorkVO);
+ Mockito.when(highAvailabilityDaoImpl.createForUpdate()).thenReturn(haWorkVO);
+ try(MockedStatic<GenericDaoBase> genericDaoBaseMockedStatic = Mockito.mockStatic(GenericDaoBase.class)) {
+ genericDaoBaseMockedStatic.when(() -> GenericDaoBase.getUpdateBuilder(Mockito.any())).thenReturn(mockUpdateBuilder);
+ Mockito.doReturn(5).when(highAvailabilityDaoImpl).update(Mockito.any(UpdateBuilder.class), Mockito.any(), Mockito.nullable(Integer.class));
+ highAvailabilityDaoImpl.markServerPendingWorksAsInvestigating(1L);
+ Mockito.verify(mockSearch).create();
+ Mockito.verify(mockSearchCriteria).setParameters(Mockito.eq("server"), Mockito.eq(1L));
+ Mockito.verify(mockSearchCriteria).setParameters(Mockito.eq("step"), Mockito.eq(HighAvailabilityManager.Step.Done), Mockito.eq(HighAvailabilityManager.Step.Cancelled));
+ Assert.assertEquals(HighAvailabilityManager.Step.Investigating, haWorkVO.getStep()); // Ensure the step is set correctly
+ Mockito.verify(highAvailabilityDaoImpl).update(Mockito.eq(mockUpdateBuilder), Mockito.eq(mockSearchCriteria), Mockito.isNull());
+ }
+ }
}