Nir Soffer has uploaded a new change for review.

Change subject: health: Introduce Vdsm health monitoring
......................................................................

health: Introduce Vdsm health monitoring

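Debugging Vdsm in the field is very hard, because we don't have enough
information about Vdsm resource usage. This patch adds a new health
monitoring thread, checking Vdsm health every 60 seconds by default. The
monitor is disabled by default; it is enabled with health_monitor_enable
and the interval is set with health_check_interval in the [devel] section.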

For now, we monitor only uncollectible objects: objects that are
unreachable but that the garbage collector cannot free. In the future we
should report other information like used memory, number of active
threads, etc.

Health information is only logged; in the future we may post events to
engine.

Relates-To: https://gerrit.ovirt.org/51630
Change-Id: I2abbd753118cb212a298055138087ca2e48ede91
Signed-off-by: Nir Soffer <nsof...@redhat.com>
---
M debian/vdsm-python.install
M lib/vdsm/Makefile.am
M lib/vdsm/config.py.in
A lib/vdsm/health.py
M vdsm.spec.in
M vdsm/vdsm
6 files changed, 96 insertions(+), 0 deletions(-)


  git pull ssh://gerrit.ovirt.org:29418/vdsm refs/changes/08/51708/1

diff --git a/debian/vdsm-python.install b/debian/vdsm-python.install
index 1863918..e2dc6e8 100644
--- a/debian/vdsm-python.install
+++ b/debian/vdsm-python.install
@@ -11,6 +11,7 @@
 ./usr/lib/python2.7/dist-packages/vdsm/dmidecodeUtil.py
 ./usr/lib/python2.7/dist-packages/vdsm/exception.py
 ./usr/lib/python2.7/dist-packages/vdsm/executor.py
+./usr/lib/python2.7/dist-packages/vdsm/health.py
 ./usr/lib/python2.7/dist-packages/vdsm/hooks.py
 ./usr/lib/python2.7/dist-packages/vdsm/ipwrapper.py
 ./usr/lib/python2.7/dist-packages/vdsm/jsonrpcvdscli.py
diff --git a/lib/vdsm/Makefile.am b/lib/vdsm/Makefile.am
index b4be4fc..e9a9f9f 100644
--- a/lib/vdsm/Makefile.am
+++ b/lib/vdsm/Makefile.am
@@ -31,6 +31,7 @@
        dmidecodeUtil.py \
        exception.py \
        executor.py \
+       health.py \
        hooks.py \
        host.py \
        ipwrapper.py \
diff --git a/lib/vdsm/config.py.in b/lib/vdsm/config.py.in
index fe54318..43868fc 100644
--- a/lib/vdsm/config.py.in
+++ b/lib/vdsm/config.py.in
@@ -428,6 +428,12 @@
             'false by default. Use environment file /etc/sysconfig/vdsm to '
             'set COVERAGE_PROCESS_START and COVERAGE_FILE variables.'),
 
+        ('health_monitor_enable', 'false',
+            'Enable Vdsm health monitoring.'),
+
+        ('health_check_interval', '60',
+            'Number of seconds to wait between health checks.'),
+
     ]),
 
     # Section: [gluster]
diff --git a/lib/vdsm/health.py b/lib/vdsm/health.py
new file mode 100644
index 0000000..572a213
--- /dev/null
+++ b/lib/vdsm/health.py
@@ -0,0 +1,125 @@
+#
+# Copyright 2016 Red Hat, Inc.
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+#
+# Refer to the README and COPYING files for full details of the license
+#
+
+"""
+Vdsm health monitoring.
+
+When enabled in the configuration, a background thread periodically runs
+the garbage collector and logs any uncollectible objects it finds.
+"""
+
+from __future__ import absolute_import
+import gc
+import logging
+import threading
+
+from .config import config
+from . import concurrent
+
+# The running Monitor, or None when health monitoring is not running.
+_monitor = None
+
+
+def start():
+    """
+    Start the health monitor if it is enabled in the configuration.
+
+    Reads "health_monitor_enable" and "health_check_interval" from the
+    [devel] section. Does nothing when monitoring is disabled. Must not be
+    called while a monitor is already running.
+    """
+    global _monitor
+    assert _monitor is None
+    if config.getboolean("devel", "health_monitor_enable"):
+        interval = config.getint("devel", "health_check_interval")
+        _monitor = Monitor(interval)
+        _monitor.start()
+
+
+def stop():
+    """
+    Signal the running health monitor, if any, to stop.
+
+    Does not wait for the monitor thread to exit.
+    """
+    global _monitor
+    if _monitor is not None:
+        _monitor.stop()
+        _monitor = None
+
+
+class Monitor(object):
+    """
+    Periodically check process health in a background thread.
+
+    For now the only check is running the garbage collector and logging
+    uncollectible objects.
+    """
+
+    log = logging.getLogger("health")
+
+    def __init__(self, interval):
+        """
+        Arguments:
+            interval: number of seconds to wait between health checks.
+        """
+        self._interval = interval
+        self._thread = concurrent.thread(self._run)
+        self._done = threading.Event()
+
+    def start(self):
+        """Start the monitoring thread."""
+        self.log.info("Starting health monitor (interval=%d)", self._interval)
+        self._thread.start()
+
+    def stop(self):
+        """Ask the monitoring thread to exit; returns without waiting."""
+        self.log.info("Stopping health monitor")
+        self._done.set()
+
+    def wait(self):
+        """Block until the monitoring thread has exited."""
+        self.log.debug("Waiting for health monitor")
+        self._thread.join()
+
+    def _run(self):
+        """Thread body: run a health check every interval until stopped."""
+        self.log.info("Health monitor started")
+        # Do not use gc.DEBUG_LEAK: it includes gc.DEBUG_SAVEALL, which keeps
+        # every unreachable object in gc.garbage instead of freeing it, so
+        # the monitor itself would leak and report collectable objects.
+        saved_flags = gc.get_debug()
+        gc.set_debug(gc.DEBUG_UNCOLLECTABLE)
+        try:
+            # Event.wait() returns True once stop() sets the event, ending
+            # the loop; a timeout means it is time for the next check.
+            while not self._done.wait(self._interval):
+                self._check()
+        finally:
+            gc.set_debug(saved_flags)
+        self.log.info("Health monitor stopped")
+
+    def _check(self):
+        """Run a full collection and log uncollectible objects, if any."""
+        self.log.debug("Checking health")
+        collected = gc.collect()
+        self.log.debug("Collected %d objects", collected)
+        if gc.garbage:
+            self.log.warning("Uncollectible objects found: %s", gc.garbage)
diff --git a/vdsm.spec.in b/vdsm.spec.in
index 16039da..5a521d2 100644
--- a/vdsm.spec.in
+++ b/vdsm.spec.in
@@ -1076,6 +1076,7 @@
 %{python_sitelib}/%{vdsm_name}/dmidecodeUtil.py*
 %{python_sitelib}/%{vdsm_name}/exception.py*
 %{python_sitelib}/%{vdsm_name}/executor.py*
+%{python_sitelib}/%{vdsm_name}/health.py*
 %{python_sitelib}/%{vdsm_name}/hooks.py*
 %{python_sitelib}/%{vdsm_name}/host.py*
 %{python_sitelib}/%{vdsm_name}/ipwrapper.py*
diff --git a/vdsm/vdsm b/vdsm/vdsm
index 3b576a7..961567d 100755
--- a/vdsm/vdsm
+++ b/vdsm/vdsm
@@ -33,6 +33,7 @@
 from vdsm import commands
 from vdsm import constants
 from vdsm import dsaversion
+from vdsm import health
 from vdsm import schedule
 from vdsm import utils
 from vdsm.config import config
@@ -104,12 +105,14 @@
 
         cif.start()
         periodic.start(cif, scheduler)
+        health.start()
         try:
             while running[0]:
                 sigutils.wait_for_signal()
 
             profile.stop()
         finally:
+            health.stop()
             periodic.stop()
             cif.prepareForShutdown()
             scheduler.stop()


-- 
To view, visit https://gerrit.ovirt.org/51708
To unsubscribe, visit https://gerrit.ovirt.org/settings

Gerrit-MessageType: newchange
Gerrit-Change-Id: I2abbd753118cb212a298055138087ca2e48ede91
Gerrit-PatchSet: 1
Gerrit-Project: vdsm
Gerrit-Branch: master
Gerrit-Owner: Nir Soffer <nsof...@redhat.com>
_______________________________________________
vdsm-patches mailing list
vdsm-patches@lists.fedorahosted.org
https://lists.fedorahosted.org/mailman/listinfo/vdsm-patches

Reply via email to