Arik Hadas has uploaded a new change for review. Change subject: core: run HA VMs that went down right before engine stopped ......................................................................
core: run HA VMs that went down right before engine stopped This patch solves a known problem that existed before: when HA VMs went down and the engine (VdsUpdateRunTimeInfo) detected it (and updated the DB) but didn't manage to rerun the VMs before it stopped, the engine would not try to run those VMs when it started again. This problem is solved by fetching from the DB all the HA VMs which are down with exit status ERROR when initializing the AutoStartVmsRunner job, and those VMs are set as the first VMs the job will try to run on its first iteration. Change-Id: I2d5876f196819b2a69be0b71287c5325a8ff9dc9 Signed-off-by: Arik Hadas <[email protected]> --- M backend/manager/modules/bll/src/main/java/org/ovirt/engine/core/bll/AutoStartVmsRunner.java M backend/manager/modules/dal/src/main/java/org/ovirt/engine/core/dao/VmDAO.java M backend/manager/modules/dal/src/main/java/org/ovirt/engine/core/dao/VmDAODbFacadeImpl.java M packaging/dbscripts/vms_sp.sql 4 files changed, 47 insertions(+), 8 deletions(-) git pull ssh://gerrit.ovirt.org:29418/ovirt-engine refs/changes/02/19502/1 diff --git a/backend/manager/modules/bll/src/main/java/org/ovirt/engine/core/bll/AutoStartVmsRunner.java b/backend/manager/modules/bll/src/main/java/org/ovirt/engine/core/bll/AutoStartVmsRunner.java index 4261fa0..a481445 100644 --- a/backend/manager/modules/bll/src/main/java/org/ovirt/engine/core/bll/AutoStartVmsRunner.java +++ b/backend/manager/modules/bll/src/main/java/org/ovirt/engine/core/bll/AutoStartVmsRunner.java @@ -2,12 +2,14 @@ import java.util.Collections; import java.util.LinkedList; +import java.util.List; import java.util.concurrent.CopyOnWriteArraySet; import org.ovirt.engine.core.bll.job.ExecutionHandler; import org.ovirt.engine.core.common.AuditLogType; import org.ovirt.engine.core.common.action.RunVmParams; import org.ovirt.engine.core.common.action.VdcActionType; +import org.ovirt.engine.core.common.businessentities.VM; import org.ovirt.engine.core.common.businessentities.VMStatus; import 
org.ovirt.engine.core.common.config.Config; import org.ovirt.engine.core.common.config.ConfigValues; @@ -19,6 +21,7 @@ import org.ovirt.engine.core.dal.dbbroker.DbFacade; import org.ovirt.engine.core.dal.dbbroker.auditloghandling.AuditLogDirector; import org.ovirt.engine.core.dal.dbbroker.auditloghandling.AuditLogableBase; +import org.ovirt.engine.core.dao.VmDAO; import org.ovirt.engine.core.dao.VmDynamicDAO; import org.ovirt.engine.core.utils.lock.EngineLock; import org.ovirt.engine.core.utils.lock.LockManager; @@ -45,6 +48,15 @@ } private AutoStartVmsRunner() { + // There might be HA VMs which went down just before the engine stopped, we detected + // the failure and updated the DB but didn't made it to rerun the VM. So here we'll + // take all the HA VMs which are down because of an error and add them to the set + DateTime now = DateTime.getNow(); + List<Pair<Guid, DateTime>> initialFailedVms = new LinkedList<>(); + for (VM vm: getVmDao().getAllFailedAutoStartVms()) { + initialFailedVms.add(new Pair<>(vm.getId(), now)); + } + autoStartVmsToRun.addAll(initialFailedVms); } @OnTimerMethodAnnotation("startFailedAutoStartVms") @@ -54,6 +66,7 @@ DateTime nextTimeOfRetryToRun = now.AddSeconds(RETRY_TO_RUN_HA_VM_INTERVAL); for(Pair<Guid, DateTime> vmIdAndDateTime: autoStartVmsToRun) { + // if it is not the time to rerun this VM yet, skip for now if (now.compareTo(vmIdAndDateTime.getSecond()) < 0) { continue; } @@ -75,8 +88,8 @@ continue; } - // the VM reached WaitForLunch, so we can remove it from the set, - // from now on errors will be detected by VdsUpdateRuntimeInfo + // the VM reached status which is different from Down, therefore we can remove it from + // the set and from now on errors will be detected by VdsUpdateRuntimeInfo idsToRemove.add(vmIdAndDateTime); } @@ -93,10 +106,6 @@ null); } - protected VmDynamicDAO getVmDynamicDao() { - return DbFacade.getInstance().getVmDynamicDao(); - } - protected LockManager getLockManager() { return 
LockManagerFactory.getLockManager(); } @@ -106,11 +115,17 @@ } private boolean runVm(Guid vmId, EngineLock lock) { - boolean succeeded = Backend.getInstance().runInternalAction( + return Backend.getInstance().runInternalAction( VdcActionType.RunVm, new RunVmParams(vmId), ExecutionHandler.createInternalJobContext(lock)).getSucceeded(); + } - return succeeded; + protected VmDynamicDAO getVmDynamicDao() { + return DbFacade.getInstance().getVmDynamicDao(); + } + + protected VmDAO getVmDao() { + return DbFacade.getInstance().getVmDao(); } } diff --git a/backend/manager/modules/dal/src/main/java/org/ovirt/engine/core/dao/VmDAO.java b/backend/manager/modules/dal/src/main/java/org/ovirt/engine/core/dao/VmDAO.java index 3d7d35b..1e18092 100644 --- a/backend/manager/modules/dal/src/main/java/org/ovirt/engine/core/dao/VmDAO.java +++ b/backend/manager/modules/dal/src/main/java/org/ovirt/engine/core/dao/VmDAO.java @@ -256,4 +256,11 @@ * @return the list of VMs */ List<VM> getAllForVnicProfile(Guid vnicProfileId); + + /** + * Retrieves all auto started VMs that went down unintentionally + * + * @return the list of VMs + */ + List<VM> getAllFailedAutoStartVms(); } diff --git a/backend/manager/modules/dal/src/main/java/org/ovirt/engine/core/dao/VmDAODbFacadeImpl.java b/backend/manager/modules/dal/src/main/java/org/ovirt/engine/core/dao/VmDAODbFacadeImpl.java index 23928db..c04de12 100644 --- a/backend/manager/modules/dal/src/main/java/org/ovirt/engine/core/dao/VmDAODbFacadeImpl.java +++ b/backend/manager/modules/dal/src/main/java/org/ovirt/engine/core/dao/VmDAODbFacadeImpl.java @@ -237,6 +237,13 @@ } @Override + public List<VM> getAllFailedAutoStartVms() { + return getCallsHandler().executeReadList("GetFailedAutoStartVms", + VMRowMapper.instance, + getCustomMapSqlParameterSource()); + } + + @Override public List<VM> getAllMigratingToHost(Guid vdsId) { return getCallsHandler().executeReadList("GetVmsMigratingToVds", VMRowMapper.instance, diff --git a/packaging/dbscripts/vms_sp.sql 
b/packaging/dbscripts/vms_sp.sql index c97f60e..fb2d871 100644 --- a/packaging/dbscripts/vms_sp.sql +++ b/packaging/dbscripts/vms_sp.sql @@ -1090,3 +1090,13 @@ END; $procedure$ LANGUAGE plpgsql; + +Create or replace FUNCTION GetFailedAutoStartVms() RETURNS SETOF vms STABLE + AS $procedure$ +BEGIN + RETURN QUERY SELECT vms.* + FROM vms + WHERE auto_startup = TRUE and status = 0 and exit_status = 1; +END; $procedure$ +LANGUAGE plpgsql; + -- To view, visit http://gerrit.ovirt.org/19502 To unsubscribe, visit http://gerrit.ovirt.org/settings Gerrit-MessageType: newchange Gerrit-Change-Id: I2d5876f196819b2a69be0b71287c5325a8ff9dc9 Gerrit-PatchSet: 1 Gerrit-Project: ovirt-engine Gerrit-Branch: master Gerrit-Owner: Arik Hadas <[email protected]> _______________________________________________ Engine-patches mailing list [email protected] http://lists.ovirt.org/mailman/listinfo/engine-patches
