The ganeti-watcher holds the group file lock for too long, until after the execution of a group-verify-disk job. This locks for a long time if there are other jobs already running and blocking the verify from executing. When the lock is held, another ganeti-watcher run cannot be scheduled, so this prevents the ganeti-watcher from running for several minutes.
With this commit, the lock is released before running the VerifyDisks operation, so even if the submitted job gets stuck in the Job Queue, a subsequient ganeti-watcher run would still happen. As an additional change, an incorrect docstring was also removed. Signed-off-by: Federico Morg Pareschi <[email protected]> --- lib/watcher/__init__.py | 31 ++++++++++++++++--------------- lib/watcher/state.py | 2 +- 2 files changed, 17 insertions(+), 16 deletions(-) diff --git a/lib/watcher/__init__.py b/lib/watcher/__init__.py index 6b73dc0..4e946b3 100644 --- a/lib/watcher/__init__.py +++ b/lib/watcher/__init__.py @@ -844,7 +844,7 @@ def _GroupWatcher(opts): logging.debug("Using state file %s", state_path) - # Global watcher + # Group watcher file lock statefile = state.OpenStateFile(state_path) # pylint: disable=E0602 if not statefile: return constants.EXIT_FAILURE @@ -867,26 +867,27 @@ def _GroupWatcher(opts): started = _CheckInstances(client, notepad, instances, locks) _CheckDisks(client, notepad, nodes, instances, started) - - # Check if the nodegroup only has ext storage type - only_ext = compat.all(i.disk_template == constants.DT_EXT - for i in instances.values()) - - # We skip current NodeGroup verification if there are only external storage - # devices. Currently we provide an interface for external storage provider - # for disk verification implementations, however current ExtStorageDevice - # does not provide an API for this yet. - # - # This check needs to be revisited if ES_ACTION_VERIFY on ExtStorageDevice - # is implemented. - if not opts.no_verify_disks and not only_ext: - _VerifyDisks(client, group_uuid, nodes, instances) except Exception, err: logging.info("Not updating status file due to failure: %s", err) raise else: # Save changes for next run notepad.Save(state_path) + notepad.Close() + + # Check if the nodegroup only has ext storage type + only_ext = compat.all(i.disk_template == constants.DT_EXT + for i in instances.values()) + + # We skip current NodeGroup verification if there are only external storage + # devices. Currently we provide an interface for external storage provider + # for disk verification implementations, however current ExtStorageDevice + # does not provide an API for this yet. + # + # This check needs to be revisited if ES_ACTION_VERIFY on ExtStorageDevice + # is implemented. + if not opts.no_verify_disks and not only_ext: + _VerifyDisks(client, group_uuid, nodes, instances) return constants.EXIT_SUCCESS diff --git a/lib/watcher/state.py b/lib/watcher/state.py index 5c51b5b..b8ff4ef 100644 --- a/lib/watcher/state.py +++ b/lib/watcher/state.py @@ -111,7 +111,7 @@ class WatcherState(object): self._orig_data = serializer.Dump(self._data) def Save(self, filename): - """Save state to file, then unlock and close it. + """Save state to file. """ assert self.statefile -- 2.8.0.rc3.226.g39d4020
