Eli Mesika has uploaded a new change for review. Change subject: core: start PM enabled hosts after engine restart ......................................................................
core: start PM enabled hosts after engine restart This patch handles the following scenario host non-responding->Restart->Stop->off->engine restart In this case the host will not be started again after the engine is up. This patch handles such scenarios by searching for hosts that both have PM enabled and are in 'Reboot' status after the DisableFenceAtStartupInSec quiet time in which PM operations are skipped. If such hosts are found after the quiet time, an attempt to start the host via its PM card is executed. Since the status of a Host is moved to 'Reboot' when it is restarted just after the Stop command (Reboot=stop->wait for off->start->wait for on), if the engine fails or is stopped at this point the Host will be left in the Reboot status. Change-Id: Ibbfe83535364556df72eaac348e58d4b7146156e Bug-Url: https://bugzilla.redhat.com/show_bug.cgi?id=1005756 Signed-off-by: Eli Mesika <[email protected]> --- M backend/manager/modules/bll/src/main/java/org/ovirt/engine/core/bll/InitBackendServicesOnStartupBean.java M backend/manager/modules/bll/src/main/java/org/ovirt/engine/core/bll/VdsCommand.java M backend/manager/modules/bll/src/main/java/org/ovirt/engine/core/bll/pm/PmHealthCheckManager.java 3 files changed, 68 insertions(+), 3 deletions(-) git pull ssh://gerrit.ovirt.org:29418/ovirt-engine refs/changes/05/28305/1 diff --git a/backend/manager/modules/bll/src/main/java/org/ovirt/engine/core/bll/InitBackendServicesOnStartupBean.java b/backend/manager/modules/bll/src/main/java/org/ovirt/engine/core/bll/InitBackendServicesOnStartupBean.java index a51fd4c..dd4b16b 100644 --- a/backend/manager/modules/bll/src/main/java/org/ovirt/engine/core/bll/InitBackendServicesOnStartupBean.java +++ b/backend/manager/modules/bll/src/main/java/org/ovirt/engine/core/bll/InitBackendServicesOnStartupBean.java @@ -20,11 +20,19 @@ import org.ovirt.engine.core.bll.storage.StoragePoolStatusHandler; import org.ovirt.engine.core.common.action.MigrateVmParameters; import 
org.ovirt.engine.core.common.action.VdcActionType; +import org.ovirt.engine.core.common.businessentities.VDS; +import org.ovirt.engine.core.common.businessentities.VDSStatus; +import org.ovirt.engine.core.common.config.Config; +import org.ovirt.engine.core.common.config.ConfigValues; import org.ovirt.engine.core.common.utils.customprop.VmPropertiesUtils; import org.ovirt.engine.core.common.utils.exceptions.InitializationException; import org.ovirt.engine.core.compat.Guid; +import org.ovirt.engine.core.dal.dbbroker.DbFacade; +import org.ovirt.engine.core.utils.ThreadUtils; import org.ovirt.engine.core.utils.customprop.DevicePropertiesUtils; import org.ovirt.engine.core.utils.extensionsmgr.EngineExtensionsManager; +import org.ovirt.engine.core.utils.linq.LinqUtils; +import org.ovirt.engine.core.utils.linq.Predicate; import org.ovirt.engine.core.utils.threadpool.ThreadPoolUtil; import org.ovirt.engine.core.vdsbroker.ResourceManager; import org.slf4j.Logger; @@ -40,8 +48,9 @@ public class InitBackendServicesOnStartupBean implements InitBackendServicesOnStartup{ private static final Logger log = LoggerFactory.getLogger(InitBackendServicesOnStartupBean.class); + private List<VDS> hostsWithPMInReboot; - /** + /** * This method is called upon the bean creation as part * of the management Service bean life cycle. 
*/ @@ -50,6 +59,9 @@ public void create() { try { + // This must be done before starting to sample the hosts status from VDSM since the sampling will turn such host from Reboot to NonResponsive + setHostsWithPMInReboot(); + // Create authentication profiles for all the domains that exist in the database: // TODO: remove this later, and rely only on the custom and built in extensions directories configuration @@ -59,7 +71,6 @@ AsyncTaskManager.getInstance().initAsyncTaskManager(); ResourceManager.getInstance().init(); OvfDataUpdater.getInstance().initOvfDataUpdater(); - SchedulingManager.getInstance().setMigrationHandler(new MigrationHandler() { @Override @@ -104,11 +115,35 @@ // Initialize Power Management Health Check PmHealthCheckManager.getInstance().initialize(); + // Start hosts that were fenced in the middle of an engine restart (if exists) + startHostsWithPMInReboot(); } catch (Exception ex) { log.error("Failed to initialize backend", ex); throw ex; } } + private void startHostsWithPMInReboot() { + if (hostsWithPMInReboot.size() > 0) { + ThreadPoolUtil.execute(new Runnable() { + @Override + public void run() { + // wait the quiet time from engine start in which we skip fencing operations + int mSecToWait = Config.<Integer>getValue(ConfigValues.DisableFenceAtStartupInSec) * 1000; + ThreadUtils.sleep(mSecToWait); + PmHealthCheckManager.getInstance().startHostsWithPMInReboot(hostsWithPMInReboot); + } + }); + } + } + private void setHostsWithPMInReboot() { + hostsWithPMInReboot = LinqUtils.filter(DbFacade.getInstance().getVdsDao().getAll(), + new Predicate<VDS> () { + @Override + public boolean eval(VDS host) { + return (host.getpm_enabled() && host.getStatus() == VDSStatus.Reboot); + } + }); + } } diff --git a/backend/manager/modules/bll/src/main/java/org/ovirt/engine/core/bll/VdsCommand.java b/backend/manager/modules/bll/src/main/java/org/ovirt/engine/core/bll/VdsCommand.java index 4ff10c9..7a287e6 100644 --- 
a/backend/manager/modules/bll/src/main/java/org/ovirt/engine/core/bll/VdsCommand.java +++ b/backend/manager/modules/bll/src/main/java/org/ovirt/engine/core/bll/VdsCommand.java @@ -312,7 +312,7 @@ * @return * boolean */ - protected boolean isPmReportsStatusDown() { + public boolean isPmReportsStatusDown() { boolean result = false; VDS vds = getVds(); VDSReturnValue returnValue=null; diff --git a/backend/manager/modules/bll/src/main/java/org/ovirt/engine/core/bll/pm/PmHealthCheckManager.java b/backend/manager/modules/bll/src/main/java/org/ovirt/engine/core/bll/pm/PmHealthCheckManager.java index 86b1cde..7850b84 100644 --- a/backend/manager/modules/bll/src/main/java/org/ovirt/engine/core/bll/pm/PmHealthCheckManager.java +++ b/backend/manager/modules/bll/src/main/java/org/ovirt/engine/core/bll/pm/PmHealthCheckManager.java @@ -1,12 +1,18 @@ package org.ovirt.engine.core.bll.pm; +import org.ovirt.engine.core.bll.Backend; import org.ovirt.engine.core.bll.FenceExecutor; +import org.ovirt.engine.core.bll.RestartVdsCommand; import org.ovirt.engine.core.common.AuditLogType; +import org.ovirt.engine.core.common.action.FenceVdsActionParameters; +import org.ovirt.engine.core.common.action.VdcActionType; +import org.ovirt.engine.core.common.action.VdcReturnValueBase; import org.ovirt.engine.core.common.businessentities.FenceActionType; import org.ovirt.engine.core.common.businessentities.FenceAgentOrder; import org.ovirt.engine.core.common.businessentities.VDS; import org.ovirt.engine.core.common.config.Config; import org.ovirt.engine.core.common.config.ConfigValues; +import org.ovirt.engine.core.common.vdscommands.VDSReturnValue; import org.ovirt.engine.core.compat.Guid; import org.ovirt.engine.core.dal.dbbroker.DbFacade; import org.ovirt.engine.core.dal.dbbroker.auditloghandling.AlertDirector; @@ -105,6 +111,30 @@ } } + /** + * This method starts hosts remained in off status because of the following flow + * non-responding -> stop -> wait -> off -> engine restart + * Such hosts 
will stay DOWN while its status will show Reboot + * We should try to catch such hosts and attempt to restart it. + * @param hostWithPMInStatusReboot + */ + public void startHostsWithPMInReboot(List<VDS> hostWithPMInStatusReboot) { + VDSReturnValue returnValue = null; + for (VDS host : hostWithPMInStatusReboot) { + RestartVdsCommand restartVdsCommand = new RestartVdsCommand(new + FenceVdsActionParameters(host.getId(), FenceActionType.Status)); + if (restartVdsCommand.isPmReportsStatusDown()) { + VdcReturnValueBase retValue = Backend.getInstance().runInternalAction(VdcActionType.RestartVds, restartVdsCommand.getParameters()); + if (retValue!= null && retValue.getSucceeded()) { + log.infoFormat("Host {0} was started successfully by PM Health Check Manager", + host.getName()); + } + else { + log.infoFormat("PM Health Check Manager failed to start Host {0}", host.getName()); + } + } + } + } public static PmHealthCheckManager getInstance() { return instance; } -- To view, visit http://gerrit.ovirt.org/28305 To unsubscribe, visit http://gerrit.ovirt.org/settings Gerrit-MessageType: newchange Gerrit-Change-Id: Ibbfe83535364556df72eaac348e58d4b7146156e Gerrit-PatchSet: 1 Gerrit-Project: ovirt-engine Gerrit-Branch: master Gerrit-Owner: Eli Mesika <[email protected]> _______________________________________________ Engine-patches mailing list [email protected] http://lists.ovirt.org/mailman/listinfo/engine-patches
