anuragaw commented on a change in pull request #3575: [WIP DO NOT MERGE] Health check feature for virtual router URL: https://github.com/apache/cloudstack/pull/3575#discussion_r357993085
########## File path: server/src/main/java/com/cloud/network/router/VirtualNetworkApplianceManagerImpl.java ########## @@ -1186,6 +1219,290 @@ protected void pushToUpdateQueue(final List<NetworkVO> networks) throws Interrup } } + protected class AnalyseRouterMonitorResultsTask extends ManagedContextRunnable { + public AnalyseRouterMonitorResultsTask() { + } + + @Override + protected void runInContext() { + try { + final List<DomainRouterVO> routers = _routerDao.listByStateAndManagementServer(VirtualMachine.State.Running, mgmtSrvrId); + s_logger.debug("Found " + routers.size() + " running routers. "); + + for (final DomainRouterVO router : routers) { + GetRouterMonitorResultsAnswer answer = getMonitorResults(router, false); + String checkFailsToRestartVr = RouterHealthChecksFailuresToRestartVr.valueIn(router.getDataCenterId()); + if (answer != null && answer.getFailingChecks().size() > 0 && StringUtils.isNotBlank(checkFailsToRestartVr)) { + for (String failedCheck : answer.getFailingChecks()) { + if (checkFailsToRestartVr.contains(failedCheck)) { + rebootRouter(router.getId(), true); + } + } + } + } + } catch (final Exception ex) { + s_logger.error("Fail to complete the AnalyseRouterMonitorResultsTask! ", ex); + } + } + } + + // Returns null if health checks are not enabled + private GetRouterMonitorResultsAnswer getMonitorResults(DomainRouterVO router, boolean performFreshChecks) { + if (!RouterHealthChecksEnabled.valueIn(router.getDataCenterId())) { + return null; + } + + String controlIP = getRouterControlIP(router); + if (StringUtils.isNotBlank(controlIP) && !controlIP.equals("0.0.0.0")) { + final GetRouterMonitorResultsCommand command = new GetRouterMonitorResultsCommand(performFreshChecks); + command.setAccessDetail(NetworkElementCommand.ROUTER_IP, controlIP); + command.setAccessDetail(NetworkElementCommand.ROUTER_NAME, router.getInstanceName()); + try { + final Answer answer = _agentMgr.easySend(router.getHostId(), command); + + if (answer == null) { + s_logger.warn("Unable to fetch monitoring results data from router " + router.getHostName()); + return null; + } + if (answer instanceof GetRouterMonitorResultsAnswer) { + return (GetRouterMonitorResultsAnswer) answer; + } else { + s_logger.warn("Unable to fetch health checks results to router " + router.getHostName() + " Received answer " + answer.getDetails()); + return new GetRouterMonitorResultsAnswer(command, false, null, answer.getDetails()); + } + } catch (final Exception e) { + s_logger.warn("Error while collecting alerts from router: " + router.getInstanceName(), e); + return null; + } + } + + return null; + } + + @Override + public Map<String, String> getRouterHealthCheckResults(long routerId, boolean runChecks) { + DomainRouterVO router = _routerDao.findById(routerId); + Map<String, String> result = new HashMap<>(); + + if (router == null) { + result.put("success", "False"); + result.put("message", "Router not found"); + return result; + } + + if (!RouterHealthChecksEnabled.valueIn(router.getDataCenterId())) { + result.put("success", "False"); + result.put("message", "Router id not valid. Health checks are disabled in router's zone."); + return result; + } + + s_logger.info("Getting router health check results for router " + router.getUuid()); + + if (runChecks) { + boolean successfullyUpdatedData = updateRouterHealthCheckData(router); + s_logger.info("Updating health check data for fresh run successfully: " + successfullyUpdatedData); + } + + s_logger.info("Retrieving results for fresh health check execution for router " + router.getUuid()); + GetRouterMonitorResultsAnswer answer = getMonitorResults(router, runChecks); + if (answer == null) { + result.put("success", "False"); + result.put("message", "Router is unreachable."); + return result; + } + + result.put("success", String.valueOf(answer.getResult())); + result.put("message", answer.getDetails()); + + return result; + } + + protected class UpdateRouterHealthChecksConfigDataTask extends ManagedContextRunnable { + public UpdateRouterHealthChecksConfigDataTask() { + } + + @Override + protected void runInContext() { + try { + final List<DomainRouterVO> routers = _routerDao.listByStateAndManagementServer(VirtualMachine.State.Running, mgmtSrvrId); + s_logger.debug("Found " + routers.size() + " running routers. "); + + for (final DomainRouterVO router : routers) { + updateRouterHealthCheckData(router); + } + } catch (final Exception ex) { + s_logger.error("Fail to complete the UpdateRouterHealthChecksConfigDataTask! ", ex); + } + } + } + + private boolean updateRouterHealthCheckData(DomainRouterVO router) { + if (!RouterHealthChecksEnabled.valueIn(router.getDataCenterId())) { + return false; + } + + String controlIP = getRouterControlIP(router); + if (StringUtils.isNotBlank(controlIP) && !controlIP.equals("0.0.0.0")) { + s_logger.info("Updating data for router health checks for router " + router.getUuid()); + final SetMonitorServiceCommand command = new SetMonitorServiceCommand(); + command.setAccessDetail(NetworkElementCommand.ROUTER_IP, getRouterControlIP(router)); + command.setAccessDetail(NetworkElementCommand.ROUTER_NAME, router.getInstanceName()); + command.setAccessDetail(SetMonitorServiceCommand.ROUTER_HEALTH_CHECKS_ENABLED, RouterHealthChecksEnabled.valueIn(router.getDataCenterId()).toString()); + command.setAccessDetail(SetMonitorServiceCommand.ROUTER_HEALTH_CHECKS_BASIC_INTERVAL, RouterHealthChecksBasicInterval.value().toString()); + command.setAccessDetail(SetMonitorServiceCommand.ROUTER_HEALTH_CHECKS_ADVANCED_INTERVAL, RouterHealthChecksAdvancedInterval.value().toString()); + command.setAccessDetail(SetMonitorServiceCommand.ROUTER_HEALTH_CHECKS_EXCLUDED, RouterHealthChecksToExclude.valueIn(router.getDataCenterId())); + command.setAdditionalData(getAdditionalDataForRouterHealthChecks(router)); + command.setReconfigureAfterUpdate(true); + + Answer origAnswer = null; + try { + origAnswer = _agentMgr.easySend(router.getHostId(), command); + } catch (final Exception e) { + s_logger.warn("Error while collecting alerts from router: " + router.getInstanceName(), e); + return false; + } + + if (origAnswer == null) { + s_logger.warn("Unable to update health checks data to router " + router.getHostName()); + return false; + } + + GroupAnswer answer = null; + if (origAnswer instanceof GroupAnswer) { + answer = (GroupAnswer) origAnswer; + } else { + s_logger.warn("Unable to update health checks data to router " + router.getHostName() + " Received answer " + origAnswer.getDetails()); + return false; + } + + if (!answer.getResult()) { + s_logger.warn("Unable to update health checks data to router " + router.getHostName() + ", details : " + answer.getDetails()); + } + + return answer.getResult(); + } + s_logger.debug("Skipping update data on router " + router.getUuid() + " because controlIp is not correct."); + return false; + } + + private Map<String, String> getAdditionalDataForRouterHealthChecks(final DomainRouterVO router) { Review comment: Done ---------------------------------------------------------------- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org With regards, Apache Git Services