When the watcher runs on a node group and can obtain the group lock, it submits a GROUP_VERIFY_DISKS job for that group. Because the watcher is started by a cron job, this happens every 5 minutes. To prevent job congestion, this patch stops the watcher from submitting another verify-disks job for a group when one is already queued, waiting, or running for that group.
Signed-off-by: Federico Morg Pareschi <[email protected]> --- lib/watcher/__init__.py | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/lib/watcher/__init__.py b/lib/watcher/__init__.py index 4e946b3..cfc23e1 100644 --- a/lib/watcher/__init__.py +++ b/lib/watcher/__init__.py @@ -344,11 +344,32 @@ def _CheckForOfflineNodes(nodes, instance): """ return compat.any(nodes[node_name].offline for node_name in instance.snodes) +def _GetPendingVerifyDisks(cl, uuid): + """Checks if there are any currently running or pending group verify jobs and + if so, returns their id. + + """ + qfilter = qlang.MakeSimpleFilter("status", + frozenset([constants.JOB_STATUS_RUNNING, + constants.JOB_STATUS_QUEUED, + constants.JOB_STATUS_WAITING])) + qresult = cl.Query(constants.QR_JOB, ["id", "summary"], qfilter) + + ids = [jobid for ((_, jobid), (_, (job, ))) in qresult.data + if job == ("GROUP_VERIFY_DISKS(%s)" % uuid)] + return ids def _VerifyDisks(cl, uuid, nodes, instances): """Run a per-group "gnt-cluster verify-disks". """ + + existing_jobs = _GetPendingVerifyDisks(cl, uuid) + if existing_jobs: + logging.info("There are verify disks jobs already pending (%s), skipping " + "VerifyDisks step for %s." % (existing_jobs, uuid)) + return + op = opcodes.OpGroupVerifyDisks( group_name=uuid, priority=constants.OP_PRIO_LOW) op.reason = [(constants.OPCODE_REASON_SRC_WATCHER, -- 2.8.0.rc3.226.g39d4020
