Greg Padgett has uploaded a new change for review.

Change subject: WIP agent, broker: propagate engine status via metadata
......................................................................
WIP agent, broker: propagate engine status via metadata Let each host find the status of its own engine (and vm) and propagate it through the shared metadata, rather than each host trying to query the status over the network. Change-Id: I791ae950db987c87679efb9c23ccb201895b20bc Signed-off-by: Greg Padgett <[email protected]> --- M ovirt_hosted_engine_ha/agent/config.py M ovirt_hosted_engine_ha/agent/hosted_engine.py M ovirt_hosted_engine_ha/broker/submonitor_util.py M ovirt_hosted_engine_ha/broker/submonitors/engine_health.py 4 files changed, 78 insertions(+), 42 deletions(-) git pull ssh://gerrit.ovirt.org:29418/ovirt-hosted-engine-ha refs/changes/01/17801/1 diff --git a/ovirt_hosted_engine_ha/agent/config.py b/ovirt_hosted_engine_ha/agent/config.py index f8aa587..5dd6969 100644 --- a/ovirt_hosted_engine_ha/agent/config.py +++ b/ovirt_hosted_engine_ha/agent/config.py @@ -30,6 +30,7 @@ # constants for vm.conf options VM = 'vm' BRIDGE_NAME = 'bridge' +VM_UUID = 'vmId' class Config(object): diff --git a/ovirt_hosted_engine_ha/agent/hosted_engine.py b/ovirt_hosted_engine_ha/agent/hosted_engine.py index 926b726..64c33f9 100644 --- a/ovirt_hosted_engine_ha/agent/hosted_engine.py +++ b/ovirt_hosted_engine_ha/agent/hosted_engine.py @@ -111,7 +111,9 @@ 'field': 'engine-health', 'monitor': 'engine-health', 'options': { - 'address': self._config.get(config.ENGINE, config.ENGINE_FQDN)} + 'address': '0', + 'use_ssl': self._config.get(config.ENGINE, config.VDSM_SSL), + 'vm_uuid': self._config.get(config.VM, config.VM_UUID)} }) return req @@ -431,6 +433,7 @@ 'last-update-host-ts': None, 'alive': False, 'score': 0, + 'engine-status': None, 'hostname': '(unknown)'} if len(data) < 512: @@ -440,7 +443,7 @@ continue data = data[:512].rstrip('\0') tokens = data.split('|') - if len(tokens) < 6: + if len(tokens) < 7: self._log.error("Malformed metadata for host %d:" " received %d of %d expected tokens", host_id, len(tokens), 6) @@ -463,16 +466,19 @@ host_ts = int(tokens[2]) score = 
int(tokens[4]) - hostname = str(tokens[5]) # convert from bytearray + engine_status = str(tokens[5]) # convert from bytearray + hostname = str(tokens[6]) # convert from bytearray if host_ts != self._all_host_stats[host_id]['last-update-host-ts']: # Track first update in order to accurately judge liveness if self._all_host_stats[host_id]['last-update-host-ts']: self._all_host_stats[host_id]['first-update'] = False + self._all_host_stats[host_id]['last-update-host-ts'] = host_ts self._all_host_stats[host_id]['last-update-local-ts'] = \ local_ts self._all_host_stats[host_id]['score'] = score + self._all_host_stats[host_id]['engine-status'] = engine_status self._all_host_stats[host_id]['hostname'] = hostname # All updated, now determine if hosts are alive/updating @@ -503,24 +509,40 @@ """ Start or stop engine on current host based on hosts' statistics. """ + local_host_id = int(self._config.get(config.ENGINE, config.HOST_ID)) engine_status = self._local_monitors['engine-health']['status'] - if engine_status == 'None': + engine_status_host_id = local_host_id + best_score = self._local_monitors['score'] + best_score_host_id = local_host_id + + if engine_status is None: self._log.info("Unknown engine vm status, no actions taken") return - if engine_status[:2] == 'up': - self._log.info("Engine vm is running", - extra=self._get_lf_args(self.LF_ENGINE_HEALTH)) + for host_id, stats in self._all_host_stats.iteritems(): + if stats['engine-health']['status'] != 'vm-down': + engine_status = stats['engine-health']['status'] + engine_status_host_id = host_id + # Score is updated only if it's better than local score + if stats['score'] > best_score: + best_score_host_id = host_id + + if engine_status[:5] == 'vm-up': + # FIXME timeout for bad-host-status: if up and no engine, try to + # migrate; if can't migrate, reduce local score and shut down + self._log.info( + "Engine vm is running on host %s (id %d)", + self._all_host_stats[engine_status_host_id]['hostname'], + 
engine_status_host_id, + extra=self._get_lf_args(self.LF_ENGINE_HEALTH) + ) return - # FIXME other statuses: bad health status, remote db down + # FIXME remote db down, other statuses # FIXME cluster-wide engine maintenance bit - best_score = max((self._all_host_stats[host_id]['score'] - for host_id in self._all_host_stats)) - local_host_id = int(self._config.get(config.ENGINE, config.HOST_ID)) - if self._all_host_stats[local_host_id]['score'] != best_score: + if best_score_host_id != local_host_id: self._log.info("Engine down, local host does not have best score", extra=self._get_lf_args(self.LF_ENGINE_HEALTH)) return @@ -561,3 +583,4 @@ raise Exception(output[1]) self._log.error("Engine VM started on localhost") + # FIXME record start time in order to track bad-health-status timeout diff --git a/ovirt_hosted_engine_ha/broker/submonitor_util.py b/ovirt_hosted_engine_ha/broker/submonitor_util.py index 7c20b41..d76bcfb 100644 --- a/ovirt_hosted_engine_ha/broker/submonitor_util.py +++ b/ovirt_hosted_engine_ha/broker/submonitor_util.py @@ -27,7 +27,7 @@ from . import constants -def run_vds_client_cmd(address, use_ssl, command): +def run_vds_client_cmd(address, use_ssl, command, *args): """ Run the passed in command name from the vdsClient library and either throw an exception with the error message or return the results. 
@@ -60,7 +60,7 @@ retry = 0 while retry < constants.VDS_CLIENT_MAX_RETRY: try: - response = method() + response = method(args) break except socket.error: log.debug("Error", exc_info=True) diff --git a/ovirt_hosted_engine_ha/broker/submonitors/engine_health.py b/ovirt_hosted_engine_ha/broker/submonitors/engine_health.py index fcd1b47..5ac7f61 100644 --- a/ovirt_hosted_engine_ha/broker/submonitors/engine_health.py +++ b/ovirt_hosted_engine_ha/broker/submonitors/engine_health.py @@ -22,6 +22,8 @@ from ovirt_hosted_engine_ha.broker import constants from ovirt_hosted_engine_ha.broker import submonitor_base +from ovirt_hosted_engine_ha.broker import submonitor_util as sm_util +from ovirt_hosted_engine_ha.lib import util as util def register(): @@ -29,41 +31,51 @@ class Submonitor(submonitor_base.SubmonitorBase): + def setup(self, options): + self._log = logging.getLogger("EngineHealth") + +# FIXME combine blocks + self._address = options.get('address') + self._use_ssl = util.to_bool(options.get('use_ssl')) + self._vm_uuid = options.get('vm_uuid') + if (self._address is None + or self._use_ssl is None + or self._vm_uuid is None): + raise Exception("mem-load requires address, use_ssl, and vm_uuid") + self._log.debug("address=%s, use_ssl=%r, vm_uuid=%s", + self._address, self._use_ssl, self._vm_uuid) + def action(self, options): - # Rely on hosted-engine for status log = logging.getLogger("EngineHealth") - # FIXME use this when `hosted-engine --vm-status` is implemented - """ - # First see if VM is holding a lock on its storage... 
- p = subprocess.Popen([constants.HOSTED_ENGINE_BINARY, '--vm-status'], - stdout=subprocess.PIPE, stderr=subprocess.PIPE) - output = p.communicate() - if p.returncode != 0: - log.warning("Engine VM not running: %s", output[0]) - self.update_result("down") + # First, see if vdsm tells us it's up + try: + stats = sm_util.run_vds_client_cmd(self._address, self._use_ssl, + 'getVmStats', self._vm_uuid) + except Exception as e: + if str(e) != "Virtual machine does not exist": + self._log.error("Failed to getVmStats: %s", str(e)) + self.update_result(None) + return + else: + # Not on this host + self._log.info("VM not on this host") + self.update_result('vm-down') + return + vm_status = stats['info']['status'] + if vm_status != 'up': + self._log.info("VM not running on this host, status %s", vm_status) + self.update_result('vm-down') return - # VM is up, see if the engine inside it is healthy + # VM is up, let's see if engine is up by polling health status page p = subprocess.Popen([constants.HOSTED_ENGINE_BINARY, '--check-liveliness'], stdout=subprocess.PIPE, stderr=subprocess.PIPE) output = p.communicate() - if p.returncode != 0: - log.warning("Engine VM up but bad health status: %s", output[0]) - self.update_result("up bad-health-status") + if p.returncode == 0: + self.update_result("vm-up good-health-status") return - else: - self.update_result("up good-health-status") - """ - # For now, just look at the health status page - p = subprocess.Popen([constants.HOSTED_ENGINE_BINARY, - '--check-liveliness'], - stdout=subprocess.PIPE, stderr=subprocess.PIPE) - output = p.communicate() - if p.returncode != 0: - log.warning("bad health status: %s", output[0]) - self.update_result("down") - return - else: - self.update_result("up good-health-status") + log.warning("bad health status: %s", output[0]) + self.update_result("vm-up bad-health-status") + # FIXME remote db down status -- To view, visit http://gerrit.ovirt.org/17801 To unsubscribe, visit 
http://gerrit.ovirt.org/settings Gerrit-MessageType: newchange Gerrit-Change-Id: I791ae950db987c87679efb9c23ccb201895b20bc Gerrit-PatchSet: 1 Gerrit-Project: ovirt-hosted-engine-ha Gerrit-Branch: master Gerrit-Owner: Greg Padgett <[email protected]> _______________________________________________ Engine-patches mailing list [email protected] http://lists.ovirt.org/mailman/listinfo/engine-patches
