[Engine-patches] Change in ovirt-hosted-engine-ha[master]: WIP agent, broker: propagate engine status via metadata

gpadgett Wed, 07 Aug 2013 18:15:00 -0700

Greg Padgett has uploaded a new change for review.

Change subject: WIP agent, broker: propagate engine status via metadata
......................................................................


WIP agent, broker: propagate engine status via metadata

Let each host find the status of its own engine (and vm) and propagate
it through the shared metadata, rather than each host trying to query
the status over the network.

Change-Id: I791ae950db987c87679efb9c23ccb201895b20bc
Signed-off-by: Greg Padgett <[email protected]>
---
M ovirt_hosted_engine_ha/agent/config.py
M ovirt_hosted_engine_ha/agent/hosted_engine.py
M ovirt_hosted_engine_ha/broker/submonitor_util.py
M ovirt_hosted_engine_ha/broker/submonitors/engine_health.py
4 files changed, 78 insertions(+), 42 deletions(-)


  git pull ssh://gerrit.ovirt.org:29418/ovirt-hosted-engine-ha refs/changes/01/17801/1

diff --git a/ovirt_hosted_engine_ha/agent/config.py b/ovirt_hosted_engine_ha/agent/config.py
index f8aa587..5dd6969 100644
--- a/ovirt_hosted_engine_ha/agent/config.py
+++ b/ovirt_hosted_engine_ha/agent/config.py
@@ -30,6 +30,7 @@
 # constants for vm.conf options
 VM = 'vm'
 BRIDGE_NAME = 'bridge'
+VM_UUID = 'vmId'
 
 
 class Config(object):
diff --git a/ovirt_hosted_engine_ha/agent/hosted_engine.py b/ovirt_hosted_engine_ha/agent/hosted_engine.py
index 926b726..64c33f9 100644
--- a/ovirt_hosted_engine_ha/agent/hosted_engine.py
+++ b/ovirt_hosted_engine_ha/agent/hosted_engine.py
@@ -111,7 +111,9 @@
             'field': 'engine-health',
             'monitor': 'engine-health',
             'options': {
-                'address': self._config.get(config.ENGINE, config.ENGINE_FQDN)}
+                'address': '0',
+                'use_ssl': self._config.get(config.ENGINE, config.VDSM_SSL),
+                'vm_uuid': self._config.get(config.VM, config.VM_UUID)}
         })
         return req
 
@@ -431,6 +433,7 @@
                     'last-update-host-ts': None,
                     'alive': False,
                     'score': 0,
+                    'engine-status': None,
                     'hostname': '(unknown)'}
 
             if len(data) < 512:
@@ -440,7 +443,7 @@
                 continue
             data = data[:512].rstrip('\0')
             tokens = data.split('|')
-            if len(tokens) < 6:
+            if len(tokens) < 7:
                 self._log.error("Malformed metadata for host %d:"
                                 " received %d of %d expected tokens",
                                 host_id, len(tokens), 6)
@@ -463,16 +466,19 @@
 
             host_ts = int(tokens[2])
             score = int(tokens[4])
-            hostname = str(tokens[5])  # convert from bytearray
+            engine_status = str(tokens[5])  # convert from bytearray
+            hostname = str(tokens[6])  # convert from bytearray
 
             if host_ts != self._all_host_stats[host_id]['last-update-host-ts']:
                 # Track first update in order to accurately judge liveness
                 if self._all_host_stats[host_id]['last-update-host-ts']:
                     self._all_host_stats[host_id]['first-update'] = False
+
                 self._all_host_stats[host_id]['last-update-host-ts'] = host_ts
                 self._all_host_stats[host_id]['last-update-local-ts'] = \
                     local_ts
                 self._all_host_stats[host_id]['score'] = score
+                self._all_host_stats[host_id]['engine-status'] = engine_status
                 self._all_host_stats[host_id]['hostname'] = hostname
 
         # All updated, now determine if hosts are alive/updating
@@ -503,24 +509,40 @@
         """
         Start or stop engine on current host based on hosts' statistics.
         """
+        local_host_id = int(self._config.get(config.ENGINE, config.HOST_ID))
         engine_status = self._local_monitors['engine-health']['status']
-        if engine_status == 'None':
+        engine_status_host_id = local_host_id
+        best_score = self._local_monitors['score']
+        best_score_host_id = local_host_id
+
+        if engine_status is None:
             self._log.info("Unknown engine vm status, no actions taken")
             return
 
-        if engine_status[:2] == 'up':
-            self._log.info("Engine vm is running",
-                           extra=self._get_lf_args(self.LF_ENGINE_HEALTH))
+        for host_id, stats in self._all_host_stats.iteritems():
+            if stats['engine-health']['status'] != 'vm-down':
+                engine_status = stats['engine-health']['status']
+                engine_status_host_id = host_id
+            # Score is updated only if it's better than local score
+            if stats['score'] > best_score:
+                best_score_host_id = host_id
+
+        if engine_status[:5] == 'vm-up':
+            # FIXME timeout for bad-host-status: if up and no engine, try to
+            # migrate; if can't migrate, reduce local score and shut down
+            self._log.info(
+                "Engine vm is running on host %s (id %d)",
+                self._all_host_stats[engine_status_host_id]['hostname'],
+                engine_status_host_id,
+                extra=self._get_lf_args(self.LF_ENGINE_HEALTH)
+            )
             return
 
-        # FIXME other statuses: bad health status, remote db down
+        # FIXME remote db down, other statuses
 
         # FIXME cluster-wide engine maintenance bit
 
-        best_score = max((self._all_host_stats[host_id]['score']
-                         for host_id in self._all_host_stats))
-        local_host_id = int(self._config.get(config.ENGINE, config.HOST_ID))
-        if self._all_host_stats[local_host_id]['score'] != best_score:
+        if best_score_host_id != local_host_id:
             self._log.info("Engine down, local host does not have best score",
                            extra=self._get_lf_args(self.LF_ENGINE_HEALTH))
             return
@@ -561,3 +583,4 @@
             raise Exception(output[1])
 
         self._log.error("Engine VM started on localhost")
+        # FIXME record start time in order to track bad-health-status timeout
diff --git a/ovirt_hosted_engine_ha/broker/submonitor_util.py b/ovirt_hosted_engine_ha/broker/submonitor_util.py
index 7c20b41..d76bcfb 100644
--- a/ovirt_hosted_engine_ha/broker/submonitor_util.py
+++ b/ovirt_hosted_engine_ha/broker/submonitor_util.py
@@ -27,7 +27,7 @@
 from . import constants
 
 
-def run_vds_client_cmd(address, use_ssl, command):
+def run_vds_client_cmd(address, use_ssl, command, *args):
     """
     Run the passed in command name from the vdsClient library and either
     throw an exception with the error message or return the results.
@@ -60,7 +60,7 @@
     retry = 0
     while retry < constants.VDS_CLIENT_MAX_RETRY:
         try:
-            response = method()
+            response = method(args)
             break
         except socket.error:
             log.debug("Error", exc_info=True)
diff --git a/ovirt_hosted_engine_ha/broker/submonitors/engine_health.py b/ovirt_hosted_engine_ha/broker/submonitors/engine_health.py
index fcd1b47..5ac7f61 100644
--- a/ovirt_hosted_engine_ha/broker/submonitors/engine_health.py
+++ b/ovirt_hosted_engine_ha/broker/submonitors/engine_health.py
@@ -22,6 +22,8 @@
 
 from ovirt_hosted_engine_ha.broker import constants
 from ovirt_hosted_engine_ha.broker import submonitor_base
+from ovirt_hosted_engine_ha.broker import submonitor_util as sm_util
+from ovirt_hosted_engine_ha.lib import util as util
 
 
 def register():
@@ -29,41 +31,51 @@
 
 
 class Submonitor(submonitor_base.SubmonitorBase):
+    def setup(self, options):
+        self._log = logging.getLogger("EngineHealth")
+
+# FIXME combine blocks
+        self._address = options.get('address')
+        self._use_ssl = util.to_bool(options.get('use_ssl'))
+        self._vm_uuid = options.get('vm_uuid')
+        if (self._address is None
+                or self._use_ssl is None
+                or self._vm_uuid is None):
+            raise Exception("mem-load requires address, use_ssl, and vm_uuid")
+        self._log.debug("address=%s, use_ssl=%r, vm_uuid=%s",
+                        self._address, self._use_ssl, self._vm_uuid)
+
     def action(self, options):
-        # Rely on hosted-engine for status
         log = logging.getLogger("EngineHealth")
 
-        # FIXME use this when `hosted-engine --vm-status` is implemented
-        """
-        # First see if VM is holding a lock on its storage...
-        p = subprocess.Popen([constants.HOSTED_ENGINE_BINARY, '--vm-status'],
-                             stdout=subprocess.PIPE, stderr=subprocess.PIPE)
-        output = p.communicate()
-        if p.returncode != 0:
-            log.warning("Engine VM not running: %s", output[0])
-            self.update_result("down")
+        # First, see if vdsm tells us it's up
+        try:
+            stats = sm_util.run_vds_client_cmd(self._address, self._use_ssl,
+                                               'getVmStats', self._vm_uuid)
+        except Exception as e:
+            if str(e) != "Virtual machine does not exist":
+                self._log.error("Failed to getVmStats: %s", str(e))
+                self.update_result(None)
+                return
+            else:
+                # Not on this host
+                self._log.info("VM not on this host")
+                self.update_result('vm-down')
+                return
+        vm_status = stats['info']['status']
+        if vm_status != 'up':
+            self._log.info("VM not running on this host, status %s", vm_status)
+            self.update_result('vm-down')
             return
 
-        # VM is up, see if the engine inside it is healthy
+        # VM is up, let's see if engine is up by polling health status page
         p = subprocess.Popen([constants.HOSTED_ENGINE_BINARY,
                               '--check-liveliness'],
                              stdout=subprocess.PIPE, stderr=subprocess.PIPE)
         output = p.communicate()
-        if p.returncode != 0:
-            log.warning("Engine VM up but bad health status: %s", output[0])
-            self.update_result("up bad-health-status")
+        if p.returncode == 0:
+            self.update_result("vm-up good-health-status")
             return
-        else:
-            self.update_result("up good-health-status")
-        """
-        # For now, just look at the health status page
-        p = subprocess.Popen([constants.HOSTED_ENGINE_BINARY,
-                              '--check-liveliness'],
-                             stdout=subprocess.PIPE, stderr=subprocess.PIPE)
-        output = p.communicate()
-        if p.returncode != 0:
-            log.warning("bad health status: %s", output[0])
-            self.update_result("down")
-            return
-        else:
-            self.update_result("up good-health-status")
+        log.warning("bad health status: %s", output[0])
+        self.update_result("vm-up bad-health-status")
+        # FIXME remote db down status


-- 
To view, visit http://gerrit.ovirt.org/17801
To unsubscribe, visit http://gerrit.ovirt.org/settings

Gerrit-MessageType: newchange
Gerrit-Change-Id: I791ae950db987c87679efb9c23ccb201895b20bc
Gerrit-PatchSet: 1
Gerrit-Project: ovirt-hosted-engine-ha
Gerrit-Branch: master
Gerrit-Owner: Greg Padgett <[email protected]>
_______________________________________________
Engine-patches mailing list
[email protected]
http://lists.ovirt.org/mailman/listinfo/engine-patches

[Engine-patches] Change in ovirt-hosted-engine-ha[master]: WIP agent, broker: propagate engine status via metadata

Reply via email to