Our tests running via RunWithLocks strictly depend on no watcher jobs interfering. Therefore they pause the watcher; unfortunately, there is still a race: the watcher only checks the pause status upon its invocation, but submits jobs later during its run. Therefore, do not only pause the watcher (which doesn't hurt), but also add a filter that rejects all of its jobs, and then wait for all running jobs to terminate.
Signed-off-by: Klaus Aehlig <[email protected]> --- qa/qa_job_utils.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/qa/qa_job_utils.py b/qa/qa_job_utils.py index 23ff206..b0edd6f 100644 --- a/qa/qa_job_utils.py +++ b/qa/qa_job_utils.py @@ -365,7 +365,15 @@ def RunWithLocks(fn, locks, timeout, block, *args, **kwargs): "acquired in the course of a QA test.") # The watcher may interfere by issuing its own jobs - therefore pause it + # also reject all its jobs and wait for any running jobs to finish. AssertCommand(["gnt-cluster", "watcher", "pause", "12h"]) + filter_uuid = stdout_of([ + "gnt-filter", "add", + '--predicates=[["reason", ["=", "source", "gnt:watcher"]]]', + "--action=REJECT" + ]) + while stdout_of(["gnt-job", "list", "--no-header", "--running"]) != "": + time.sleep(1) # Find out the lock names prior to starting the delay function lock_name_map = _FindLockNames(locks) @@ -416,6 +424,7 @@ def RunWithLocks(fn, locks, timeout, block, *args, **kwargs): pass # Revive the watcher + AssertCommand(["gnt-filter", "delete", filter_uuid]) AssertCommand(["gnt-cluster", "watcher", "continue"]) -- 2.4.3.573.g4eafbef
