Roy Golan has uploaded a new change for review. Change subject: core: Monitoring - refactor Host newtork error handling ......................................................................
core: Monitoring - refactor Host newtork error handling Host VDS network error should trigger a state change and a treatment for auto-fencing as outlined in [1] [1] http://www.ovirt.org/Automatic_Fencing This handling is done at a single place [2] and is only called from a single place, the VdsEventListener [3]. [2] VdsManager.handleNetworkException [3] void onError(@Observes VdsNetworkException networkException) *All* vds broker commands in action that throw VDS network exception is also firing and event with the Exception as payload. The event it caught by VdsEventListener and it invokes the handling on using the assigned VdsManager in a separate thread. Mo more VDSNetwork Exception throwing and re-throwing for flow control. Change-Id: I34004b370740eabbfd2563473eaab82615ceb250 Bug-Url: https://bugzilla.redhat.com/?????? Signed-off-by: Roy Golan <[email protected]> --- M backend/manager/modules/vdsbroker/src/main/java/org/ovirt/engine/core/vdsbroker/HostMonitoring.java M backend/manager/modules/vdsbroker/src/main/java/org/ovirt/engine/core/vdsbroker/VdsManager.java M backend/manager/modules/vdsbroker/src/test/java/org/ovirt/engine/core/vdsbroker/HostMonitoringTest.java 3 files changed, 60 insertions(+), 88 deletions(-) git pull ssh://gerrit.ovirt.org:29418/ovirt-engine refs/changes/28/35628/1 diff --git a/backend/manager/modules/vdsbroker/src/main/java/org/ovirt/engine/core/vdsbroker/HostMonitoring.java b/backend/manager/modules/vdsbroker/src/main/java/org/ovirt/engine/core/vdsbroker/HostMonitoring.java index cb0c6cd..dcd4ef0 100644 --- a/backend/manager/modules/vdsbroker/src/main/java/org/ovirt/engine/core/vdsbroker/HostMonitoring.java +++ b/backend/manager/modules/vdsbroker/src/main/java/org/ovirt/engine/core/vdsbroker/HostMonitoring.java @@ -43,7 +43,6 @@ import org.ovirt.engine.core.utils.transaction.TransactionMethod; import org.ovirt.engine.core.utils.transaction.TransactionSupport; import org.ovirt.engine.core.vdsbroker.irsbroker.IRSErrorException; -import org.ovirt.engine.core.vdsbroker.vdsbroker.VDSNetworkException; import org.ovirt.engine.core.vdsbroker.vdsbroker.VDSRecoveringException; import org.ovirt.engine.core.vdsbroker.vdsbroker.entities.VmInternalData; import org.slf4j.Logger; @@ -136,9 +135,6 @@ ResourceManager.getInstance().runVdsCommand(VDSCommandType.SetVdsStatus, new SetVdsStatusVDSCommandParameters(vds.getId(), VDSStatus.Error)); } - } catch (VDSNetworkException e) { - saveVdsDynamic = vdsManager.handleNetworkException(e); - throw e; } catch (Throwable t) { log.error("Failure to refresh Vds runtime info: {}", t.getMessage()); log.error("Exception", t); @@ -475,21 +471,9 @@ getResourceManager().getEventListener().updateSchedulingStats(vds); if (!statsReturnValue.getSucceeded() && statsReturnValue.getExceptionObject() != null) { - VDSNetworkException ex = - (VDSNetworkException) ((statsReturnValue.getExceptionObject() instanceof VDSNetworkException) - ? statsReturnValue.getExceptionObject() - : null); - if (ex != null) { - if (vdsManager.handleNetworkException(ex)) { - saveVdsDynamic = true; - } - log.error("vds::refreshVdsStats Failed getVdsStats, vds='{}'({}): {}", - vds.getName(), vds.getId(), ex.getMessage()); - } else { - log.error("vds::refreshVdsStats Failed getVdsStats, vds='{}'({}): {}", - vds.getName(), vds.getId(), statsReturnValue.getExceptionString()); - } - throw statsReturnValue.getExceptionObject(); + log.error(" Failed getting vds stats, vds='{}'({}): {}", + vds.getName(), vds.getId(), statsReturnValue.getExceptionString()); + return; } // save also dynamic because vm_count data and image_check getting with // statistics data diff --git a/backend/manager/modules/vdsbroker/src/main/java/org/ovirt/engine/core/vdsbroker/VdsManager.java b/backend/manager/modules/vdsbroker/src/main/java/org/ovirt/engine/core/vdsbroker/VdsManager.java index 32a24fd..ffaab89 100644 --- a/backend/manager/modules/vdsbroker/src/main/java/org/ovirt/engine/core/vdsbroker/VdsManager.java +++ b/backend/manager/modules/vdsbroker/src/main/java/org/ovirt/engine/core/vdsbroker/VdsManager.java @@ -661,14 +661,6 @@ return returnStatus; } else if (caps.getExceptionObject() != null) { - // if exception is VDSNetworkException then call to - // handleNetworkException - if (caps.getExceptionObject() instanceof VDSNetworkException - && handleNetworkException((VDSNetworkException) caps - .getExceptionObject())) { - updateDynamicData(vds.getDynamicData()); - updateStatisticsData(vds.getStatisticsData()); - } throw caps.getExceptionObject(); } else { log.error("refreshCapabilities:GetCapabilitiesVDSCommand failed with no exception!"); @@ -696,56 +688,73 @@ } /** - * Handle network exception, return true if save vdsDynamic to DB is needed. + * Handle network exception * - * @param ex + * @param ex exception to handle * @return */ - public boolean handleNetworkException(VDSNetworkException ex) { + public void handleNetworkException(VDSNetworkException ex) { + boolean saveToDb = true; if (cachedVds.getStatus() != VDSStatus.Down) { long timeoutToFence = calcTimeoutToFence(cachedVds.getVmCount(), cachedVds.getSpmStatus()); - log.warn("Host '{}' is not responding. It will stay in Connecting state for a grace period " + - "of {} seconds and after that an attempt to fence the host will be issued.", - cachedVds.getName(), - TimeUnit.MILLISECONDS.toSeconds(timeoutToFence)); - AuditLogableBase logable = new AuditLogableBase(); - logable.setVdsId(cachedVds.getId()); - logable.addCustomValue("Seconds", Long.toString(TimeUnit.MILLISECONDS.toSeconds(timeoutToFence))); - AuditLogDirector.log(logable, AuditLogType.VDS_HOST_NOT_RESPONDING_CONNECTING); - if (mUnrespondedAttempts.get() < Config.<Integer>getValue(ConfigValues.VDSAttemptsToResetCount) - || (lastUpdate + timeoutToFence) > System.currentTimeMillis()) { - boolean result = false; - if (cachedVds.getStatus() != VDSStatus.Connecting && cachedVds.getStatus() != VDSStatus.PreparingForMaintenance + logHostNonResponding(timeoutToFence); + if (dontFenceYet(timeoutToFence)) { + if (cachedVds.getStatus() != VDSStatus.Connecting + && cachedVds.getStatus() != VDSStatus.PreparingForMaintenance && cachedVds.getStatus() != VDSStatus.NonResponsive) { setStatus(VDSStatus.Connecting, cachedVds); - result = true; + } else { + saveToDb = false; } mUnrespondedAttempts.incrementAndGet(); - return result; + } else { + if (cachedVds.getStatus() == VDSStatus.NonResponsive + || cachedVds.getStatus() == VDSStatus.Maintenance) { + setStatus(VDSStatus.NonResponsive, cachedVds); + } else { + setStatus(VDSStatus.NonResponsive, cachedVds); + moveVMsToUnknown(); + logHostFailToResponde(ex, timeoutToFence); + ResourceManager.getInstance().getEventListener().vdsNotResponding( + cachedVds, + !sshSoftFencingExecuted.getAndSet(true), + lastUpdate); + } } - - if (cachedVds.getStatus() == VDSStatus.NonResponsive || cachedVds.getStatus() == VDSStatus.Maintenance) { - setStatus(VDSStatus.NonResponsive, cachedVds); - return true; - } - setStatus(VDSStatus.NonResponsive, cachedVds); - moveVMsToUnknown(); - log.info( - "Server failed to respond, vds_id='{}', vds_name='{}', vm_count={}, " + - "spm_status='{}', non-responsive_timeout (seconds)={}, error: {}", - cachedVds.getId(), cachedVds.getName(), cachedVds.getVmCount(), cachedVds.getSpmStatus(), - TimeUnit.MILLISECONDS.toSeconds(timeoutToFence), ex.getMessage()); - - logable = new AuditLogableBase(cachedVds.getId()); - logable.updateCallStackFromThrowable(ex); - AuditLogDirector.log(logable, AuditLogType.VDS_FAILURE); - boolean executeSshSoftFencing = false; - if (!sshSoftFencingExecuted.getAndSet(true)) { - executeSshSoftFencing = true; - } - ResourceManager.getInstance().getEventListener().vdsNotResponding(cachedVds, executeSshSoftFencing, lastUpdate); } - return true; + if (saveToDb) { + updateDynamicData(cachedVds.getDynamicData()); + updateStatisticsData(cachedVds.getStatisticsData()); + } + } + + private boolean dontFenceYet(long timeoutToFence) { + return mUnrespondedAttempts.get() < Config.<Integer>getValue(ConfigValues.VDSAttemptsToResetCount) + || (lastUpdate + timeoutToFence) > System.currentTimeMillis(); + } + + private void logHostFailToResponde(VDSNetworkException ex, long timeoutToFence) { + log.info( + "Server failed to respond, vds_id='{}', vds_name='{}', vm_count={}, " + + "spm_status='{}', non-responsive_timeout (seconds)={}, error: {}", + cachedVds.getId(), cachedVds.getName(), cachedVds.getVmCount(), cachedVds.getSpmStatus(), + TimeUnit.MILLISECONDS.toSeconds(timeoutToFence), ex.getMessage()); + + AuditLogableBase logable; + logable = new AuditLogableBase(cachedVds.getId()); + logable.updateCallStackFromThrowable(ex); + AuditLogDirector.log(logable, AuditLogType.VDS_FAILURE); + } + + private void logHostNonResponding(long timeoutToFence) { + log.warn("Host '{}' is not responding. It will stay in Connecting state for a grace period " + + "of {} seconds and after that an attempt to fence the host will be issued.", + cachedVds.getName(), + TimeUnit.MILLISECONDS.toSeconds(timeoutToFence)); + AuditLogableBase logable = new AuditLogableBase(); + logable.setVdsId(cachedVds.getId()); + logable.addCustomValue("Seconds", Long.toString(TimeUnit.MILLISECONDS.toSeconds(timeoutToFence))); + AuditLogDirector.log(logable, AuditLogType.VDS_HOST_NOT_RESPONDING_CONNECTING); } public void dispose() { diff --git a/backend/manager/modules/vdsbroker/src/test/java/org/ovirt/engine/core/vdsbroker/HostMonitoringTest.java b/backend/manager/modules/vdsbroker/src/test/java/org/ovirt/engine/core/vdsbroker/HostMonitoringTest.java index 651a6cc..a4c0ccf 100644 --- a/backend/manager/modules/vdsbroker/src/test/java/org/ovirt/engine/core/vdsbroker/HostMonitoringTest.java +++ b/backend/manager/modules/vdsbroker/src/test/java/org/ovirt/engine/core/vdsbroker/HostMonitoringTest.java @@ -12,6 +12,7 @@ import org.ovirt.engine.core.common.businessentities.IVdsEventListener; import org.ovirt.engine.core.common.businessentities.VDS; import org.ovirt.engine.core.common.businessentities.VDSGroup; +import org.ovirt.engine.core.common.businessentities.VDSStatus; import org.ovirt.engine.core.common.businessentities.network.VdsNetworkInterface; import org.ovirt.engine.core.common.config.ConfigValues; import org.ovirt.engine.core.common.vdscommands.VDSCommandType; @@ -111,27 +112,5 @@ vds = new VDS(); vds.setId(new Guid("00000000-0000-0000-0000-000000000012")); vds.setVdsGroupCompatibilityVersion(Version.v3_4); - } - - /** - * test that a network error is handled correctly and is triggering a state - * change and a save to the DB - */ - @Test - public void testErrorHandling() { - VDSReturnValue value = new VDSReturnValue(); - value.setSucceeded(false); - value.setExceptionObject(new VDSNetworkException("unknown host")); - when(resourceManager.getEventListener()).thenReturn(vdsEventlistener); - when(resourceManager.runVdsCommand(any(VDSCommandType.class), - any(VDSParametersBase.class))).thenReturn(value); - when(updater.getResourceManager()).thenReturn(resourceManager); - try { - updater.refreshVdsStats(); - } catch (Exception e) { - verify(vdsManager).handleNetworkException(any(VDSNetworkException.class)); - verify(vdsManager).updateDynamicData(vds.getDynamicData()); - } - } } -- To view, visit http://gerrit.ovirt.org/35628 To unsubscribe, visit http://gerrit.ovirt.org/settings Gerrit-MessageType: newchange Gerrit-Change-Id: I34004b370740eabbfd2563473eaab82615ceb250 Gerrit-PatchSet: 1 Gerrit-Project: ovirt-engine Gerrit-Branch: master Gerrit-Owner: Roy Golan <[email protected]> _______________________________________________ Engine-patches mailing list [email protected] http://lists.ovirt.org/mailman/listinfo/engine-patches
