On Mon, May 23, 2011 at 06:57:03PM +0200, Michael Hanselmann wrote:
> Until now LUNodeMigrate used multiple tasklets to evacuate all primary
> instances on a node. In some cases it would acquire all node locks,
> which isn't good on big clusters. With upcoming improvements to the LUs
> for instance failover and migration, switching to separate jobs looks
> like a better option. This patch changes LUNodeMigrate to use
> LU-generated jobs.
> 
> While working on this patch, I identified a race condition in
> LUNodeMigrate.ExpandNames. A node's instances were retrieved without a
> lock and no verification was done.
> 
> Commit aac4511a added CheckArguments to LUNodeMigrate with a call to
> _CheckIAllocatorOrNode. I haven't understood why this was done and can't
> see how it would've worked, at least on clusters with default settings.

What is not clear about that?

> For RAPI, a new feature string is added and can be used to detect
> clusters which support more parameters for node migration. The client
> is updated.
> 
> Signed-off-by: Michael Hanselmann <[email protected]>
> ---
>  lib/client/gnt_node.py              |   35 ++++++++++++++++----
>  lib/cmdlib.py                       |   59 ++++++++++++++--------------------
>  lib/opcodes.py                      |   11 ++++--
>  lib/rapi/client.py                  |   34 ++++++++++++++++++--
>  lib/rapi/rlib2.py                   |   34 ++++++++++++++------
>  test/ganeti.rapi.client_unittest.py |   34 ++++++++++++++++++++
>  6 files changed, 148 insertions(+), 59 deletions(-)
> 
> diff --git a/lib/client/gnt_node.py b/lib/client/gnt_node.py
> index 6ddc9e4..a56e449 100644
> --- a/lib/client/gnt_node.py
> +++ b/lib/client/gnt_node.py
> @@ -364,7 +364,7 @@ def MigrateNode(opts, args):
>    selected_fields = ["name", "pinst_list"]
>  
>    result = cl.QueryNodes(names=args, fields=selected_fields, 
> use_locking=False)
> -  node, pinst = result[0]
> +  ((node, pinst), ) = result
>  
>    if not pinst:
>      ToStdout("No primary instances on node %s, exiting." % node)
> @@ -372,9 +372,10 @@ def MigrateNode(opts, args):
>  
>    pinst = utils.NiceSort(pinst)
>  
> -  if not force and not AskUser("Migrate instance(s) %s?" %
> -                               (",".join("'%s'" % name for name in pinst))):
> -    return 2
> +  if not (force or
> +          AskUser("Migrate instance(s) %s?" %
> +                  utils.CommaJoin(utils.NiceSort(pinst)))):
> +    return constants.EXIT_CONFIRMATION
>  
>    # this should be removed once --non-live is deprecated
>    if not opts.live and opts.migration_mode is not None:
> @@ -385,9 +386,29 @@ def MigrateNode(opts, args):
>      mode = constants.HT_MIGRATION_NONLIVE
>    else:
>      mode = opts.migration_mode
> +
>    op = opcodes.OpNodeMigrate(node_name=args[0], mode=mode,
> -                             iallocator=opts.iallocator)
> -  SubmitOpCode(op, cl=cl, opts=opts)
> +                             iallocator=opts.iallocator,
> +                             target_node=opts.dst_node)
> +
> +  result = SubmitOpCode(op, cl=cl, opts=opts)
> +
> +  # Keep track of submitted jobs
> +  jex = JobExecutor(cl=cl, opts=opts)
> +
> +  for (status, job_id) in result[constants.JOB_IDS_KEY]:
> +    jex.AddJobId(None, status, job_id)
> +
> +  results = jex.GetResults()
> +  bad_cnt = len([row for row in results if not row[0]])
> +  if bad_cnt == 0:
> +    ToStdout("All instances migrated successfully.")
> +    rcode = constants.EXIT_SUCCESS
> +  else:
> +    ToStdout("There were %s errors during the node migration.", bad_cnt)
> +    rcode = constants.EXIT_FAILURE
> +
> +  return rcode
>  
>  
>  def ShowNodeConfig(opts, args):
> @@ -829,7 +850,7 @@ commands = {
>      " secondary node (only for instances with drbd disk template)"),
>    'migrate': (
>      MigrateNode, ARGS_ONE_NODE,
> -    [FORCE_OPT, NONLIVE_OPT, MIGRATION_MODE_OPT,
> +    [FORCE_OPT, NONLIVE_OPT, MIGRATION_MODE_OPT, DST_NODE_OPT,
>       IALLOCATOR_OPT, PRIORITY_OPT],
>      "[-f] <node>",
>      "Migrate all the primary instance on a node away from it"
> diff --git a/lib/cmdlib.py b/lib/cmdlib.py
> index 98a056c..7e7b90a 100644
> --- a/lib/cmdlib.py
> +++ b/lib/cmdlib.py
> @@ -6648,45 +6648,15 @@ class LUNodeMigrate(LogicalUnit):
>    REQ_BGL = False
>  
>    def CheckArguments(self):
> -    _CheckIAllocatorOrNode(self, "iallocator", "remote_node")
> +    pass

I would rather have this clarified and changed in a separate CL.
Basically, this check ensures that either an iallocator is specified or
that an explicit new remote node is passed.

>    def ExpandNames(self):
>      self.op.node_name = _ExpandNodeName(self.cfg, self.op.node_name)
>  
> -    self.needed_locks = {}
> -
> -    # Create tasklets for migrating instances for all instances on this node
> -    names = []
> -    tasklets = []
> -
> -    self.lock_all_nodes = False
> -
> -    for inst in _GetNodePrimaryInstances(self.cfg, self.op.node_name):
> -      logging.debug("Migrating instance %s", inst.name)
> -      names.append(inst.name)
> -
> -      tasklets.append(TLMigrateInstance(self, inst.name, cleanup=False))
> -
> -      if inst.disk_template in constants.DTS_EXT_MIRROR:
> -        # We need to lock all nodes, as the iallocator will choose the
> -        # destination nodes afterwards
> -        self.lock_all_nodes = True
> -
> -    self.tasklets = tasklets
> -
> -    # Declare node locks
> -    if self.lock_all_nodes:
> -      self.needed_locks[locking.LEVEL_NODE] = locking.ALL_SET
> -    else:
> -      self.needed_locks[locking.LEVEL_NODE] = [self.op.node_name]
> -      self.recalculate_locks[locking.LEVEL_NODE] = constants.LOCKS_APPEND
> -
> -    # Declare instance locks
> -    self.needed_locks[locking.LEVEL_INSTANCE] = names
> -
> -  def DeclareLocks(self, level):
> -    if level == locking.LEVEL_NODE and not self.lock_all_nodes:
> -      self._LockInstancesNodes()
> +    self.share_locks = dict.fromkeys(locking.LEVELS, 1)
> +    self.needed_locks = {
> +      locking.LEVEL_NODE: [self.op.node_name],
> +      }
>
>    def BuildHooksEnv(self):
>      """Build hooks env.
> @@ -6705,6 +6675,25 @@ class LUNodeMigrate(LogicalUnit):
>      nl = [self.cfg.GetMasterNode()]
>      return (nl, nl)
>  
> +  def CheckPrereq(self):
> +    pass
> +
> +  def Exec(self, feedback_fn):
> +    # Prepare jobs for migration instances
> +    jobs = [
> +      [opcodes.OpInstanceMigrate(instance_name=inst.name,
> +                                 mode=self.op.mode,
> +                                 live=self.op.live,
> +                                 iallocator=self.op.iallocator,
> +                                 target_node=self.op.target_node)]
> +      for inst in _GetNodePrimaryInstances(self.cfg, self.op.node_name)
> +      ]
> +
> +    assert (frozenset(self.glm.list_owned(locking.LEVEL_NODE)) ==
> +            frozenset([self.op.node_name]))
> +
> +    return ResultWithJobs(jobs)

Well well. This is completely suboptimal: in the case of iallocator runs
and externally mirrored storage, you will run the iallocator and acquire
the cluster-wide locks N times, once per submitted job.

The proper model is that this LU (LUNodeMigrate) takes an iallocator,
runs it once to compute the placement strategy for all instances, and
then submits N instance-migrate jobs with explicit target nodes, so that
no iallocator decisions need to be made in the individual jobs — thus
allowing more parallelism.

iustin

Reply via email to