Nir Soffer has uploaded a new change for review.

Change subject: health: Warn about delays in monitor threads
......................................................................

health: Warn about delays in monitor threads

We have a bug about lost heartbeats, caused by an unknown delay in the
reactor thread sending the heartbeats. This patch monitors delays in
waking up the health monitor, hopefully shedding more light on this
issue.

When the monitor is waking up normally, we log this debug message:

    Woke up - delay=0.00s

If there was a delay bigger than health_max_delay seconds, we log this
warning message:

    Abnormal delay - delay=0.62s

Change-Id: I6a0a1b1a375166a08443673a097706998bbdad57
Signed-off-by: Nir Soffer <nsof...@redhat.com>
---
M lib/vdsm/config.py.in
M lib/vdsm/health.py
2 files changed, 24 insertions(+), 5 deletions(-)


  git pull ssh://gerrit.ovirt.org:29418/vdsm refs/changes/24/52524/1

diff --git a/lib/vdsm/config.py.in b/lib/vdsm/config.py.in
index 43868fc..ea297df 100644
--- a/lib/vdsm/config.py.in
+++ b/lib/vdsm/config.py.in
@@ -434,6 +434,9 @@
         ('health_check_interval', '60',
             'Number of seconds to wait between health checks.'),
 
+        ('health_max_delay', '0.5',
+            'Warn if health checks are delayed longer than this interval.'),
+
     ]),
 
     # Section: [gluster]
diff --git a/lib/vdsm/health.py b/lib/vdsm/health.py
index 1b414a5..b7cc6b2 100644
--- a/lib/vdsm/health.py
+++ b/lib/vdsm/health.py
@@ -26,6 +26,7 @@
 
 from . config import config
 from . import concurrent
+from . import utils
 
 _monitor = None
 
@@ -35,7 +36,8 @@
     assert _monitor is None
     if config.getboolean("devel", "health_monitor_enable"):
         interval = config.getint("devel", "health_check_interval")
-        _monitor = Monitor(interval)
+        max_delay = config.getfloat("devel", "health_max_delay")
+        _monitor = Monitor(interval, max_delay=max_delay)
         _monitor.start()
 
 
@@ -50,14 +52,16 @@
 
     log = logging.getLogger("health")
 
-    def __init__(self, interval):
+    def __init__(self, interval, max_delay=0.5):
         self._interval = interval
+        self._max_delay = max_delay
         self._thread = concurrent.thread(self._run)
         self._done = threading.Event()
         self._last = ProcStat()
 
     def start(self):
-        self.log.info("Starting health monitor (interval=%d)", self._interval)
+        self.log.info("Starting health monitor (interval=%d, max_delay=%.2f)",
+                      self._interval, self._max_delay)
         self._thread.start()
 
     def stop(self):
@@ -73,8 +77,10 @@
         saved_flags = gc.get_debug()
         gc.set_debug(0)
         try:
-            while not self._done.wait(self._interval):
+            while True:
                 try:
+                    if self._wait():
+                        break
                     self._check()
                 except Exception:
                     self.log.exception("Error checking health")
@@ -82,8 +88,18 @@
             gc.set_debug(saved_flags)
         self.log.debug("Health monitor stopped")
 
+    def _wait(self):
+        deadline = utils.monotonic_time() + self._interval
+        if self._done.wait(self._interval):
+            return True
+        delay = utils.monotonic_time() - deadline
+        if delay <= self._max_delay:
+            self.log.debug("Woke up - delay=%.02fs", delay)
+        else:
+            self.log.warning("Abnormal delay - delay=%.02fs", delay)
+        return False
+
     def _check(self):
-        self.log.debug("Checking health")
         self._check_garbage()
         self._check_resources()
 


-- 
To view, visit https://gerrit.ovirt.org/52524
To unsubscribe, visit https://gerrit.ovirt.org/settings

Gerrit-MessageType: newchange
Gerrit-Change-Id: I6a0a1b1a375166a08443673a097706998bbdad57
Gerrit-PatchSet: 1
Gerrit-Project: vdsm
Gerrit-Branch: master
Gerrit-Owner: Nir Soffer <nsof...@redhat.com>
_______________________________________________
vdsm-patches mailing list
vdsm-patches@lists.fedorahosted.org
https://lists.fedorahosted.org/mailman/listinfo/vdsm-patches

Reply via email to