When the watcher runs on a node group and can obtain the group lock, it
submits a GROUP_VERIFY_DISKS job for that group. This happens every 5
minutes, driven by a cron job. To prevent job congestion, this patch
stops the watcher from submitting a new verify-disks job for a group
when one is already running or pending in the queue.

Signed-off-by: Federico Morg Pareschi <[email protected]>
---
 lib/watcher/__init__.py | 21 +++++++++++++++++++++
 1 file changed, 21 insertions(+)

diff --git a/lib/watcher/__init__.py b/lib/watcher/__init__.py
index 4e946b3..cfc23e1 100644
--- a/lib/watcher/__init__.py
+++ b/lib/watcher/__init__.py
@@ -344,11 +344,32 @@ def _CheckForOfflineNodes(nodes, instance):
   """
   return compat.any(nodes[node_name].offline for node_name in instance.snodes)
 
+def _GetPendingVerifyDisks(cl, uuid):
+  """Returns the ids of running, queued or waiting jobs whose summary is
+  "GROUP_VERIFY_DISKS(<uuid>)"; each summary is unpacked as one element.
+
+  """
+  qfilter = qlang.MakeSimpleFilter("status",
+                                    frozenset([constants.JOB_STATUS_RUNNING,
+                                               constants.JOB_STATUS_QUEUED,
+                                               constants.JOB_STATUS_WAITING]))
+  qresult = cl.Query(constants.QR_JOB, ["id", "summary"], qfilter)
+
+  ids = [jobid for ((_, jobid), (_, (job, ))) in qresult.data
+         if job == ("GROUP_VERIFY_DISKS(%s)" % uuid)]
+  return ids
 
 def _VerifyDisks(cl, uuid, nodes, instances):
   """Run a per-group "gnt-cluster verify-disks".
 
   """
+
+  existing_jobs = _GetPendingVerifyDisks(cl, uuid)
+  if existing_jobs:
+    logging.info("There are verify disks jobs already pending (%s), skipping "
+                 "VerifyDisks step for %s." % (existing_jobs, uuid))
+    return
+
   op = opcodes.OpGroupVerifyDisks(
     group_name=uuid, priority=constants.OP_PRIO_LOW)
   op.reason = [(constants.OPCODE_REASON_SRC_WATCHER,
-- 
2.8.0.rc3.226.g39d4020

Reply via email to