On Tue, Feb 18, 2014 at 3:39 PM, Hrvoje Ribicic <[email protected]> wrote:

> This patch adds a QA utility function that acquires a set of locks, and
> attempts to run a given function with the locks in place. Should the
> given function block, this function does not detect this - later
> patches will address the issue.
>
> An example of its use is provided by having the move-instance test
> modified to use it.
>
> Signed-off-by: Hrvoje Ribicic <[email protected]>
> ---
>  Makefile.am        |   1 +
>  qa/qa_job_utils.py | 149
> +++++++++++++++++++++++++++++++++++++++++++++++++++++
>  qa/qa_rapi.py      |  18 +++++--
>  3 files changed, 163 insertions(+), 5 deletions(-)
>  create mode 100644 qa/qa_job_utils.py
>
> diff --git a/Makefile.am b/Makefile.am
> index 8cc443e..4009eae 100644
> --- a/Makefile.am
> +++ b/Makefile.am
> @@ -1010,6 +1010,7 @@ qa_scripts = \
>         qa/qa_instance.py \
>         qa/qa_instance_utils.py \
>         qa/qa_job.py \
> +       qa/qa_job_utils.py \
>         qa/qa_monitoring.py \
>         qa/qa_node.py \
>         qa/qa_os.py \
> diff --git a/qa/qa_job_utils.py b/qa/qa_job_utils.py
> new file mode 100644
> index 0000000..bc4733c
> --- /dev/null
> +++ b/qa/qa_job_utils.py
> @@ -0,0 +1,149 @@
> +#
> +#
> +
> +# Copyright (C) 2014 Google Inc.
> +#
> +# This program is free software; you can redistribute it and/or modify
> +# it under the terms of the GNU General Public License as published by
> +# the Free Software Foundation; either version 2 of the License, or
> +# (at your option) any later version.
> +#
> +# This program is distributed in the hope that it will be useful, but
> +# WITHOUT ANY WARRANTY; without even the implied warranty of
> +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +# General Public License for more details.
> +#
> +# You should have received a copy of the GNU General Public License
> +# along with this program; if not, write to the Free Software
> +# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
> +# 02110-1301, USA.
> +
> +
> +"""QA utility functions for testing jobs
> +
> +"""
> +
> +import re
> +
> +from ganeti import constants
> +from ganeti import locking
> +from ganeti import utils
> +
> +import qa_config
> +import qa_error
> +
> +from qa_utils import AssertCommand, GetCommandOutput, GetObjectInfo
> +
> +
> +AVAILABLE_LOCKS = [locking.LEVEL_NODE, ]
> +
> +
> +def _GetOutputFromMaster(cmd):
> +  """ Gets the output of a command executed on master.
> +
> +  """
> +  if isinstance(cmd, basestring):
> +    cmdstr = cmd
> +  else:
> +    cmdstr = utils.ShellQuoteArgs(cmd)
> +
> +  # Necessary due to the stderr stream not being captured properly on the
> +  # buildbot
> +  cmdstr += " 2>&1"
> +
> +  return GetCommandOutput(qa_config.GetMasterNode().primary, cmdstr)
> +
> +
> +def ExecuteJobProducingCommand(cmd):
> +  """ Executes a --submit-using command producing a job, and returns an
> id.
>
This is somewhat confusing, I'd suggest to reword it to something like "a
command using --submit".


> +
> +  @type cmd: list of string
> +  @param cmd: The command to execute, broken into constituent components.
> +
> +  """
> +  job_id_output = _GetOutputFromMaster(cmd)
> +
> +  possible_job_ids = re.findall("JobID: ([0-9]+)", job_id_output)
> +  if len(possible_job_ids) != 1:
> +    raise qa_error.Error("Cannot parse command output to find job id:
> output "
> +                         "is %s" % job_id_output)
> +
> +  return int(possible_job_ids[0])
> +
> +
> +def _StartDelayFunction(locks, timeout):
> +  """ Starts the gnt-debug delay option with the given locks and timeout.
> +
> +  """
> +  # The interruptible switch must be used
> +  cmd = ["gnt-debug", "delay", "-i", "--submit", "--no-master"]
> +
> +  for node in locks.get(locking.LEVEL_NODE, []):
> +    cmd.append("-n%s" % node)
> +
> +  cmd.append(str(timeout))
> +
> +  job_id = ExecuteJobProducingCommand(cmd)
> +  job_info = GetObjectInfo(["gnt-job", "info", str(job_id)])
> +  execution_logs = job_info[0]["Opcodes"][0]["Execution log"]
> +
> +  is_termination_info_fn = \
> +    lambda e: e["Content"][1] == constants.ELOG_DELAY_TEST
> +  filtered_logs = filter(is_termination_info_fn, execution_logs)
> +
> +  if len(filtered_logs) != 1:
> +    raise qa_error.Error("Failure when trying to retrieve delay
> termination "
> +                         "information")
> +
> +  _, _, (socket_path, ) = filtered_logs[0]["Content"]
> +
> +  return socket_path
> +
> +
> +def _TerminateDelayFunction(termination_socket):
> +  """ Terminates the delay function by communicating with the domain
> socket.
> +
> +  """
> +  AssertCommand("echo a | socat -u stdin UNIX-CLIENT:%s" %
> termination_socket)
>

For the QA it's probably completely fine to call an external command. You
might also want to have a look at Transport in lib/rpc/transport.py, which
is used in Luxi and WConfd to send a EOM-terminated messages (0x03) over UD
sockets.


> +
> +
> +# TODO: Can this be done as a decorator? Implement as needed.
>

IDK if it makes sense here, but perhaps creating a ContextManager could be
one option:
http://docs.python.org/2/reference/compound_stmts.html#the-with-statement/


> +def RunWithLocks(fn, locks, timeout, *args, **kwargs):
> +  """ Runs the given function, acquiring a set of locks beforehand.
> +
> +  @type fn: function
> +  @param fn: The function to invoke.
> +  @type locks: dict of string to list of string
> +  @param locks: The locks to acquire, per lock category.
> +  @type timeout: number
> +  @param timeout: The number of seconds the locks should be held before
> +                  expiring.
> +
> +  This function allows a set of locks to be acquired in preparation for a
> QA
> +  test, to try and see if the function can run in parallel with other
> +  operations.
> +
> +  The current version simply creates the locks, which expire after a given
> +  timeout, and attempts to invoke the provided function.
> +
> +  This will probably block the QA, and future versions will address this.
> +
> +  A default timeout is not provided by design - the test creator must
> make a
> +  good conservative estimate.
> +
> +  """
> +  if filter(lambda l_type: l_type not in AVAILABLE_LOCKS, locks):
> +    raise qa_error.Error("Attempted to acquire locks that cannot yet be "
> +                         "acquired in the course of a QA test.")
> +
> +  # The watcher may interfere by issuing its own jobs - therefore pause it
> +  AssertCommand(["gnt-cluster", "watcher", "pause", "12h"])
> +
> +  termination_socket = _StartDelayFunction(locks, timeout)
> +
> +  fn(*args, **kwargs)
> +
> +  _TerminateDelayFunction(termination_socket)
> +
> +  # Revive the watcher
> +  AssertCommand(["gnt-cluster", "watcher", "continue"])
> diff --git a/qa/qa_rapi.py b/qa/qa_rapi.py
> index 3164cb5..9ad8afb 100644
> --- a/qa/qa_rapi.py
> +++ b/qa/qa_rapi.py
> @@ -52,6 +52,7 @@ import qa_utils
>  from qa_instance import IsDiskReplacingSupported
>  from qa_instance import IsFailoverSupported
>  from qa_instance import IsMigrationSupported
> +from qa_job_utils import RunWithLocks
>  from qa_utils import (AssertEqual, AssertIn, AssertMatch,
> StartLocalCommand)
>  from qa_utils import InstanceCheck, INST_DOWN, INST_UP, FIRST_ARG
>
> @@ -917,7 +918,10 @@ def _InvokeMoveInstance(current_dest_inst,
> current_src_inst, rapi_pw_filename,
>        "--dest-secondary-node=%s" % snode,
>        ])
>    else:
> -    cmd.append("--iallocator=%s" % constants.IALLOC_HAIL)
> +    cmd.extend([
> +      "--iallocator=%s" % constants.IALLOC_HAIL,
> +      "--opportunistic-tries=1",
> +      ])
>
>    cmd.extend([
>      "--net=0:mac=%s" % constants.VALUE_GENERATE,
> @@ -959,10 +963,14 @@ def TestInterClusterInstanceMove(src_instance,
> dest_instance,
>      snode = tnode
>    pnode = inodes[0]
>
> -  # pnode:snode are the *current* nodes, so we move it first to
> tnode:pnode
> -  _InvokeMoveInstance(dest_instance.name, src_instance.name,
> rapi_pw_file.name,
> -                      master.primary, perform_checks,
> -                      target_nodes=(tnode.primary, pnode.primary))
> +  # pnode:snode are the *current* nodes, and the first move is an
> +  # iallocator-guided move outside of pnode. The node lock for the pnode
> +  # assures that this happens, and while we cannot be sure where the
> instance
> +  # will land, it is a real move.
> +  locks = {locking.LEVEL_NODE: [pnode.primary]}
> +  RunWithLocks(_InvokeMoveInstance, locks, 600.0,
> +               dest_instance.name, src_instance.name, rapi_pw_file.name,
> +               master.primary, perform_checks)
>
>    # And then back to pnode:snode
>    _InvokeMoveInstance(src_instance.name, dest_instance.name,
> rapi_pw_file.name,
> --
> 1.9.0.rc1.175.g0b1dcb5
>
>
LGTM, thanks.

Reply via email to