Nir Soffer has uploaded a new change for review. Change subject: health: Warn about delays in monitor threads ......................................................................
health: Warn about delays in monitor threads We have a bug about lost heartbeats, caused by an unknown delay in the reactor thread sending the heartbeats. This patch monitors delays in waking up the health monitor, hopefully shedding more light on this issue. When the monitor is waking up normally, we log this debug message: Woke up - delay=0.00s If there was a delay bigger than health_max_delay seconds, we log this warning message: Abnormal delay - delay=0.62s Change-Id: I6a0a1b1a375166a08443673a097706998bbdad57 Signed-off-by: Nir Soffer <nsof...@redhat.com> --- M lib/vdsm/config.py.in M lib/vdsm/health.py 2 files changed, 24 insertions(+), 5 deletions(-) git pull ssh://gerrit.ovirt.org:29418/vdsm refs/changes/24/52524/1 diff --git a/lib/vdsm/config.py.in b/lib/vdsm/config.py.in index 43868fc..ea297df 100644 --- a/lib/vdsm/config.py.in +++ b/lib/vdsm/config.py.in @@ -434,6 +434,9 @@ ('health_check_interval', '60', 'Number of seconds to wait between health checks.'), + ('health_max_delay', '0.5', + 'Warn if health checks are delayed longer than this interval.'), + ]), # Section: [gluster] diff --git a/lib/vdsm/health.py b/lib/vdsm/health.py index 1b414a5..b7cc6b2 100644 --- a/lib/vdsm/health.py +++ b/lib/vdsm/health.py @@ -26,6 +26,7 @@ from . config import config from . import concurrent +from . 
import utils _monitor = None @@ -35,7 +36,8 @@ assert _monitor is None if config.getboolean("devel", "health_monitor_enable"): interval = config.getint("devel", "health_check_interval") - _monitor = Monitor(interval) + max_delay = config.getfloat("devel", "health_max_delay") + _monitor = Monitor(interval, max_delay=max_delay) _monitor.start() @@ -50,14 +52,16 @@ log = logging.getLogger("health") - def __init__(self, interval): + def __init__(self, interval, max_delay=0.5): self._interval = interval + self._max_delay = max_delay self._thread = concurrent.thread(self._run) self._done = threading.Event() self._last = ProcStat() def start(self): - self.log.info("Starting health monitor (interval=%d)", self._interval) + self.log.info("Starting health monitor (interval=%d, max_delay=%.2f)", + self._interval, self._max_delay) self._thread.start() def stop(self): @@ -73,8 +77,10 @@ saved_flags = gc.get_debug() gc.set_debug(0) try: - while not self._done.wait(self._interval): + while True: try: + if self._wait(): + break self._check() except Exception: self.log.exception("Error checking health") @@ -82,8 +88,18 @@ gc.set_debug(saved_flags) self.log.debug("Health monitor stopped") + def _wait(self): + deadline = utils.monotonic_time() + self._interval + if self._done.wait(self._interval): + return True + delay = utils.monotonic_time() - deadline + if delay <= self._max_delay: + self.log.debug("Woke up - delay=%.02fs", delay) + else: + self.log.warning("Abnormal dealy - delay=%.02fs", delay) + return False + def _check(self): - self.log.debug("Checking health") self._check_garbage() self._check_resources() -- To view, visit https://gerrit.ovirt.org/52524 To unsubscribe, visit https://gerrit.ovirt.org/settings Gerrit-MessageType: newchange Gerrit-Change-Id: I6a0a1b1a375166a08443673a097706998bbdad57 Gerrit-PatchSet: 1 Gerrit-Project: vdsm Gerrit-Branch: master Gerrit-Owner: Nir Soffer <nsof...@redhat.com> _______________________________________________ vdsm-patches mailing list 
vdsm-patches@lists.fedorahosted.org https://lists.fedorahosted.org/mailman/listinfo/vdsm-patches