Arik Hadas has uploaded a new change for review. Change subject: core: change the retry to run failed HA VM frequency ......................................................................
core: change the retry to run failed HA VM frequency The AutoStartVmsRunner job runs at a default frequency of 1 second. In each iteration it goes over the down HA VMs in its set, tries to acquire the proper locks for running them, and then invokes RunVmCommand for each of them. This behavior is fine for the scenario where running the VM failed because the lock could not be acquired (because some other command(s) took the lock and are running in the background) - in that case we want to keep retrying at a high frequency so that the VM is run as soon as the background operation(s) finish. But if the VM cannot be run for some other reason, for example because of scheduling constraints, retrying after 1 second can have negative side effects and is not likely to succeed. So in that case we wait longer before trying to run the VM again. A new configuration value is added to define the interval (in seconds) to wait before retrying to run a down HA VM when the previous attempt failed for a reason other than a lock acquisition problem. 
Change-Id: I9f8914efc73ac1ea4f4a2ec4e588d778e9002451 Signed-off-by: Arik Hadas <[email protected]> --- M backend/manager/modules/bll/src/main/java/org/ovirt/engine/core/bll/AutoStartVmsRunner.java M backend/manager/modules/common/src/main/java/org/ovirt/engine/core/common/config/ConfigValues.java M packaging/dbscripts/upgrade/pre_upgrade/0000_config.sql 3 files changed, 49 insertions(+), 15 deletions(-) git pull ssh://gerrit.ovirt.org:29418/ovirt-engine refs/changes/00/19500/1 diff --git a/backend/manager/modules/bll/src/main/java/org/ovirt/engine/core/bll/AutoStartVmsRunner.java b/backend/manager/modules/bll/src/main/java/org/ovirt/engine/core/bll/AutoStartVmsRunner.java index 493fa1a..4261fa0 100644 --- a/backend/manager/modules/bll/src/main/java/org/ovirt/engine/core/bll/AutoStartVmsRunner.java +++ b/backend/manager/modules/bll/src/main/java/org/ovirt/engine/core/bll/AutoStartVmsRunner.java @@ -8,11 +8,18 @@ import org.ovirt.engine.core.common.AuditLogType; import org.ovirt.engine.core.common.action.RunVmParams; import org.ovirt.engine.core.common.action.VdcActionType; +import org.ovirt.engine.core.common.businessentities.VMStatus; +import org.ovirt.engine.core.common.config.Config; +import org.ovirt.engine.core.common.config.ConfigValues; import org.ovirt.engine.core.common.errors.VdcBllMessages; import org.ovirt.engine.core.common.locks.LockingGroup; +import org.ovirt.engine.core.common.utils.Pair; +import org.ovirt.engine.core.compat.DateTime; import org.ovirt.engine.core.compat.Guid; +import org.ovirt.engine.core.dal.dbbroker.DbFacade; import org.ovirt.engine.core.dal.dbbroker.auditloghandling.AuditLogDirector; import org.ovirt.engine.core.dal.dbbroker.auditloghandling.AuditLogableBase; +import org.ovirt.engine.core.dao.VmDynamicDAO; import org.ovirt.engine.core.utils.lock.EngineLock; import org.ovirt.engine.core.utils.lock.LockManager; import org.ovirt.engine.core.utils.lock.LockManagerFactory; @@ -20,11 +27,18 @@ import 
org.ovirt.engine.core.utils.log.LogFactory; import org.ovirt.engine.core.utils.timer.OnTimerMethodAnnotation; +/** + * This class represent a job which is responsible for running HA VMs + */ public class AutoStartVmsRunner { private static Log log = LogFactory.getLog(AutoStartVmsRunner.class); private static AutoStartVmsRunner instance = new AutoStartVmsRunner(); - private CopyOnWriteArraySet<Guid> autoStartVmsToRun = new CopyOnWriteArraySet<>(); + /** Pair of id of the VM to be rerun and the time when it should be rerun */ + private CopyOnWriteArraySet<Pair<Guid, DateTime>> autoStartVmsToRun = new CopyOnWriteArraySet<>(); + /** How long to wait before rerun HA VM that failed to start (not because of lock acquisition) */ + private static final double RETRY_TO_RUN_HA_VM_INTERVAL = + Config.<Integer> GetValue(ConfigValues.RetryToRunAutoStartVmIntervalInSeconds); public static AutoStartVmsRunner getInstance() { return instance; @@ -35,18 +49,35 @@ @OnTimerMethodAnnotation("startFailedAutoStartVms") public void startFailedAutoStartVms() { - LinkedList<Guid> idsToRemove = new LinkedList<>(); + LinkedList<Pair<Guid, DateTime>> idsToRemove = new LinkedList<>(); + DateTime now = DateTime.getNow(); + DateTime nextTimeOfRetryToRun = now.AddSeconds(RETRY_TO_RUN_HA_VM_INTERVAL); - for(Guid vmId: autoStartVmsToRun) { - EngineLock runVmLock = createLockForRunVmCommand(vmId); - - if (!getLockManager().acquireLock(runVmLock).getFirst()) { + for(Pair<Guid, DateTime> vmIdAndDateTime: autoStartVmsToRun) { + if (now.compareTo(vmIdAndDateTime.getSecond()) < 0) { continue; } - runVm(vmId, runVmLock); + Guid vmId = vmIdAndDateTime.getFirst(); + EngineLock runVmLock = createLockForRunVmCommand(vmId); - idsToRemove.add(vmId); + if (!getLockManager().acquireLock(runVmLock).getFirst()) { + log.infoFormat("Could not acquire log for running HA VM {0}", vmId); + continue; + } + + if (!runVm(vmId, runVmLock) && getVmDynamicDao().get(vmId).getStatus() == VMStatus.Down) { + AuditLogableBase event 
= new AuditLogableBase(); + event.setVmId(vmId); + AuditLogDirector.log(event, AuditLogType.HA_VM_RESTART_FAILED); + + vmIdAndDateTime.setSecond(nextTimeOfRetryToRun); + continue; + } + + // the VM reached WaitForLunch, so we can remove it from the set, + // from now on errors will be detected by VdsUpdateRuntimeInfo + idsToRemove.add(vmIdAndDateTime); } autoStartVmsToRun.removeAll(idsToRemove); @@ -62,12 +93,16 @@ null); } + protected VmDynamicDAO getVmDynamicDao() { + return DbFacade.getInstance().getVmDynamicDao(); + } + protected LockManager getLockManager() { return LockManagerFactory.getLockManager(); } public void addVmToRun(Guid vmId) { - autoStartVmsToRun.add(vmId); + autoStartVmsToRun.add(new Pair<>(vmId, DateTime.getNow())); } private boolean runVm(Guid vmId, EngineLock lock) { @@ -75,12 +110,6 @@ VdcActionType.RunVm, new RunVmParams(vmId), ExecutionHandler.createInternalJobContext(lock)).getSucceeded(); - - if (!succeeded) { - final AuditLogableBase event = new AuditLogableBase(); - event.setVmId(vmId); - AuditLogDirector.log(event, AuditLogType.HA_VM_RESTART_FAILED); - } return succeeded; } diff --git a/backend/manager/modules/common/src/main/java/org/ovirt/engine/core/common/config/ConfigValues.java b/backend/manager/modules/common/src/main/java/org/ovirt/engine/core/common/config/ConfigValues.java index 1b8a402..b8dd113 100644 --- a/backend/manager/modules/common/src/main/java/org/ovirt/engine/core/common/config/ConfigValues.java +++ b/backend/manager/modules/common/src/main/java/org/ovirt/engine/core/common/config/ConfigValues.java @@ -1513,6 +1513,10 @@ @DefaultValueAttribute("1") AutoStartVmsRunnerIntervalInSeconds(538), + @TypeConverterAttribute(Integer.class) + @DefaultValueAttribute("30") + RetryToRunAutoStartVmIntervalInSeconds(539), + Invalid(65535); private int intValue; diff --git a/packaging/dbscripts/upgrade/pre_upgrade/0000_config.sql b/packaging/dbscripts/upgrade/pre_upgrade/0000_config.sql index 2c8ff5d..6a28561 100644 --- 
a/packaging/dbscripts/upgrade/pre_upgrade/0000_config.sql +++ b/packaging/dbscripts/upgrade/pre_upgrade/0000_config.sql @@ -549,6 +549,7 @@ select fn_db_add_config_value('VmPriorityMaxValue','100','general'); --How often we'll try to run HA VM that we couldn't run before select fn_db_add_config_value('AutoStartVmsRunnerIntervalInSeconds','1','general'); +select fn_db_add_config_value('RetryToRunAutoStartVmIntervalInSeconds','30','general'); --Handling Keyboard Layout configuration for VNC select fn_db_add_config_value('VncKeyboardLayout','en-us','general'); select fn_db_add_config_value('VncKeyboardLayoutValidValues','ar,da,de,de-ch,en-gb,en-us,es,et,fi,fo,fr,fr-be,fr-ca,fr-ch,hr,hu,is,it,ja,lt,lv,mk,nl,nl-be,no,pl,pt,pt-br,ru,sl,sv,th,tr','general'); -- To view, visit http://gerrit.ovirt.org/19500 To unsubscribe, visit http://gerrit.ovirt.org/settings Gerrit-MessageType: newchange Gerrit-Change-Id: I9f8914efc73ac1ea4f4a2ec4e588d778e9002451 Gerrit-PatchSet: 1 Gerrit-Project: ovirt-engine Gerrit-Branch: master Gerrit-Owner: Arik Hadas <[email protected]> _______________________________________________ Engine-patches mailing list [email protected] http://lists.ovirt.org/mailman/listinfo/engine-patches
