The ganeti-watcher holds the group file lock for too long, until after
the execution of a group-verify-disk job. This locks for a long time if
there are other jobs already running and blocking the verify from
executing. When the lock is held, another ganeti-watcher run cannot be
scheduled, so this prevents the ganeti-watcher from running for several
minutes.

With this commit, the lock is released before running the VerifyDisks
operation, so even if the submitted job gets stuck in the Job Queue, a
subsequient ganeti-watcher run would still happen.

As an additional change, an incorrect docstring was also removed.

Signed-off-by: Federico Morg Pareschi <[email protected]>
---
 lib/watcher/__init__.py | 31 ++++++++++++++++---------------
 lib/watcher/state.py    |  2 +-
 2 files changed, 17 insertions(+), 16 deletions(-)

diff --git a/lib/watcher/__init__.py b/lib/watcher/__init__.py
index 6b73dc0..4e946b3 100644
--- a/lib/watcher/__init__.py
+++ b/lib/watcher/__init__.py
@@ -844,7 +844,7 @@ def _GroupWatcher(opts):
 
   logging.debug("Using state file %s", state_path)
 
-  # Global watcher
+  # Group watcher file lock
   statefile = state.OpenStateFile(state_path) # pylint: disable=E0602
   if not statefile:
     return constants.EXIT_FAILURE
@@ -867,26 +867,27 @@ def _GroupWatcher(opts):
 
     started = _CheckInstances(client, notepad, instances, locks)
     _CheckDisks(client, notepad, nodes, instances, started)
-
-    # Check if the nodegroup only has ext storage type
-    only_ext = compat.all(i.disk_template == constants.DT_EXT
-                          for i in instances.values())
-
-    # We skip current NodeGroup verification if there are only external storage
-    # devices. Currently we provide an interface for external storage provider
-    # for disk verification implementations, however current ExtStorageDevice
-    # does not provide an API for this yet.
-    #
-    # This check needs to be revisited if ES_ACTION_VERIFY on ExtStorageDevice
-    # is implemented.
-    if not opts.no_verify_disks and not only_ext:
-      _VerifyDisks(client, group_uuid, nodes, instances)
   except Exception, err:
     logging.info("Not updating status file due to failure: %s", err)
     raise
   else:
     # Save changes for next run
     notepad.Save(state_path)
+    notepad.Close()
+
+  # Check if the nodegroup only has ext storage type
+  only_ext = compat.all(i.disk_template == constants.DT_EXT
+                        for i in instances.values())
+
+  # We skip current NodeGroup verification if there are only external storage
+  # devices. Currently we provide an interface for external storage provider
+  # for disk verification implementations, however current ExtStorageDevice
+  # does not provide an API for this yet.
+  #
+  # This check needs to be revisited if ES_ACTION_VERIFY on ExtStorageDevice
+  # is implemented.
+  if not opts.no_verify_disks and not only_ext:
+    _VerifyDisks(client, group_uuid, nodes, instances)
 
   return constants.EXIT_SUCCESS
 
diff --git a/lib/watcher/state.py b/lib/watcher/state.py
index 5c51b5b..b8ff4ef 100644
--- a/lib/watcher/state.py
+++ b/lib/watcher/state.py
@@ -111,7 +111,7 @@ class WatcherState(object):
     self._orig_data = serializer.Dump(self._data)
 
   def Save(self, filename):
-    """Save state to file, then unlock and close it.
+    """Save state to file.
 
     """
     assert self.statefile
-- 
2.8.0.rc3.226.g39d4020

Reply via email to