On Mon, May 23, 2011 at 06:57:03PM +0200, Michael Hanselmann wrote:
> Until now LUNodeMigrate used multiple tasklets to evacuate all primary
> instances on a node. In some cases it would acquire all node locks,
> which isn't good on big clusters. With upcoming improvements to the LUs
> for instance failover and migration, switching to separate jobs looks
> like a better option. This patch changes LUNodeMigrate to use
> LU-generated jobs.
>
> While working on this patch, I identified a race condition in
> LUNodeMigrate.ExpandNames. A node's instances were retrieved without a
> lock and no verification was done.
>
> Commit aac4511a added CheckArguments to LUNodeMigrate with a call to
> _CheckIAllocatorOrNode. I haven't understood why this was done and can't
> see how it would've worked, at least on clusters with default settings.
What is not clear about that?

> For RAPI, a new feature string is added and can be used to detect
> clusters which support more parameters for node migration. The client
> is updated.
>
> Signed-off-by: Michael Hanselmann <[email protected]>
> ---
>  lib/client/gnt_node.py              |   35 ++++++++++++++++----
>  lib/cmdlib.py                       |   59 ++++++++++++++--------------------
>  lib/opcodes.py                      |   11 ++++--
>  lib/rapi/client.py                  |   34 ++++++++++++++++++--
>  lib/rapi/rlib2.py                   |   34 ++++++++++++++------
>  test/ganeti.rapi.client_unittest.py |   34 ++++++++++++++++++++
>  6 files changed, 148 insertions(+), 59 deletions(-)
>
> diff --git a/lib/client/gnt_node.py b/lib/client/gnt_node.py
> index 6ddc9e4..a56e449 100644
> --- a/lib/client/gnt_node.py
> +++ b/lib/client/gnt_node.py
> @@ -364,7 +364,7 @@ def MigrateNode(opts, args):
>    selected_fields = ["name", "pinst_list"]
>
>    result = cl.QueryNodes(names=args, fields=selected_fields,
>                           use_locking=False)
> -  node, pinst = result[0]
> +  ((node, pinst), ) = result
>
>    if not pinst:
>      ToStdout("No primary instances on node %s, exiting." % node)
> @@ -372,9 +372,10 @@ def MigrateNode(opts, args):
>
>    pinst = utils.NiceSort(pinst)
>
> -  if not force and not AskUser("Migrate instance(s) %s?" %
> -                               (",".join("'%s'" % name for name in pinst))):
> -    return 2
> +  if not (force or
> +          AskUser("Migrate instance(s) %s?" %
> +                  utils.CommaJoin(utils.NiceSort(pinst)))):
> +    return constants.EXIT_CONFIRMATION
>
>    # this should be removed once --non-live is deprecated
>    if not opts.live and opts.migration_mode is not None:
> @@ -385,9 +386,29 @@ def MigrateNode(opts, args):
>      mode = constants.HT_MIGRATION_NONLIVE
>    else:
>      mode = opts.migration_mode
> +
>    op = opcodes.OpNodeMigrate(node_name=args[0], mode=mode,
> -                             iallocator=opts.iallocator)
> -  SubmitOpCode(op, cl=cl, opts=opts)
> +                             iallocator=opts.iallocator,
> +                             target_node=opts.dst_node)
> +
> +  result = SubmitOpCode(op, cl=cl, opts=opts)
> +
> +  # Keep track of submitted jobs
> +  jex = JobExecutor(cl=cl, opts=opts)
> +
> +  for (status, job_id) in result[constants.JOB_IDS_KEY]:
> +    jex.AddJobId(None, status, job_id)
> +
> +  results = jex.GetResults()
> +  bad_cnt = len([row for row in results if not row[0]])
> +  if bad_cnt == 0:
> +    ToStdout("All instances migrated successfully.")
> +    rcode = constants.EXIT_SUCCESS
> +  else:
> +    ToStdout("There were %s errors during the node migration.", bad_cnt)
> +    rcode = constants.EXIT_FAILURE
> +
> +  return rcode
>
>
>  def ShowNodeConfig(opts, args):
> @@ -829,7 +850,7 @@ commands = {
>      " secondary node (only for instances with drbd disk template)"),
>    'migrate': (
>      MigrateNode, ARGS_ONE_NODE,
> -    [FORCE_OPT, NONLIVE_OPT, MIGRATION_MODE_OPT,
> +    [FORCE_OPT, NONLIVE_OPT, MIGRATION_MODE_OPT, DST_NODE_OPT,
>      IALLOCATOR_OPT, PRIORITY_OPT],
>      "[-f] <node>",
>      "Migrate all the primary instance on a node away from it"
> diff --git a/lib/cmdlib.py b/lib/cmdlib.py
> index 98a056c..7e7b90a 100644
> --- a/lib/cmdlib.py
> +++ b/lib/cmdlib.py
> @@ -6648,45 +6648,15 @@ class LUNodeMigrate(LogicalUnit):
>    REQ_BGL = False
>
>    def CheckArguments(self):
> -    _CheckIAllocatorOrNode(self, "iallocator", "remote_node")
> +    pass

I would rather have this clarified and changed in a separate CL. Basically
this tests that either there is an iallocator or that we pass a new remote
node.
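
To make the check being discussed concrete, here is a small, self-contained
sketch of the "either an iallocator or an explicit node" semantics described
above. The helper name, signature and error messages below are illustrative
only, not the actual ganeti.cmdlib._CheckIAllocatorOrNode implementation:

  class OpPrereqError(Exception):
    """Stand-in for ganeti.errors.OpPrereqError."""

  def check_iallocator_or_node(iallocator, remote_node):
    """Require exactly one of an iallocator name or an explicit target node."""
    if iallocator is not None and remote_node is not None:
      raise OpPrereqError("Give either an iallocator or a target node,"
                          " not both")
    if iallocator is None and remote_node is None:
      raise OpPrereqError("Either an iallocator or a target node must be"
                          " given")

This is also why the behaviour on clusters with default settings matters:
if neither option is passed on the command line, a check of this shape can
presumably only succeed if something (for example a cluster-wide default
iallocator) fills in the missing value first.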

>    def ExpandNames(self):
>      self.op.node_name = _ExpandNodeName(self.cfg, self.op.node_name)
>
> -    self.needed_locks = {}
> -
> -    # Create tasklets for migrating instances for all instances on this node
> -    names = []
> -    tasklets = []
> -
> -    self.lock_all_nodes = False
> -
> -    for inst in _GetNodePrimaryInstances(self.cfg, self.op.node_name):
> -      logging.debug("Migrating instance %s", inst.name)
> -      names.append(inst.name)
> -
> -      tasklets.append(TLMigrateInstance(self, inst.name, cleanup=False))
> -
> -      if inst.disk_template in constants.DTS_EXT_MIRROR:
> -        # We need to lock all nodes, as the iallocator will choose the
> -        # destination nodes afterwards
> -        self.lock_all_nodes = True
> -
> -    self.tasklets = tasklets
> -
> -    # Declare node locks
> -    if self.lock_all_nodes:
> -      self.needed_locks[locking.LEVEL_NODE] = locking.ALL_SET
> -    else:
> -      self.needed_locks[locking.LEVEL_NODE] = [self.op.node_name]
> -      self.recalculate_locks[locking.LEVEL_NODE] = constants.LOCKS_APPEND
> -
> -    # Declare instance locks
> -    self.needed_locks[locking.LEVEL_INSTANCE] = names
> -
> -  def DeclareLocks(self, level):
> -    if level == locking.LEVEL_NODE and not self.lock_all_nodes:
> -      self._LockInstancesNodes()
> +    self.share_locks = dict.fromkeys(locking.LEVELS, 1)
> +    self.needed_locks = {
> +      locking.LEVEL_NODE: [self.op.node_name],
> +      }
>
>    def BuildHooksEnv(self):
>      """Build hooks env.
> @@ -6705,6 +6675,25 @@ class LUNodeMigrate(LogicalUnit):
>      nl = [self.cfg.GetMasterNode()]
>      return (nl, nl)
>
> +  def CheckPrereq(self):
> +    pass
> +
> +  def Exec(self, feedback_fn):
> +    # Prepare jobs for migration instances
> +    jobs = [
> +      [opcodes.OpInstanceMigrate(instance_name=inst.name,
> +                                 mode=self.op.mode,
> +                                 live=self.op.live,
> +                                 iallocator=self.op.iallocator,
> +                                 target_node=self.op.target_node)]
> +      for inst in _GetNodePrimaryInstances(self.cfg, self.op.node_name)
> +      ]
> +
> +    assert (frozenset(self.glm.list_owned(locking.LEVEL_NODE)) ==
> +            frozenset([self.op.node_name]))
> +
> +    return ResultWithJobs(jobs)

Well well. This is completely suboptimal: in the case of iallocator runs
with external storage, you will run the iallocator and acquire the cluster
locks N times. The proper model is that this LU (LUNodeMigrate) takes an
iallocator, runs it once to compute the node placement strategy, and then
submits N instance migrate jobs that use explicit target nodes, with no
iallocator decisions left to the individual jobs, thus allowing more
parallelism.

iustin
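
To make the suggested model concrete, here is a self-contained Python sketch
of "plan once, then submit explicit-node jobs". Everything in it (function
names, the dict-shaped job descriptions, the toy allocator) is illustrative
and assumed rather than Ganeti code; the point is only that the placement
decision happens in a single pass and the per-instance migrate jobs carry
explicit target nodes instead of each running an iallocator:

  def plan_node_evacuation(instances, choose_target):
    """Map every primary instance to a destination node in ONE planning pass.

    "choose_target" stands in for a single iallocator invocation covering
    the whole node, so cluster-wide locks are needed only once.
    """
    return [(inst, choose_target(inst)) for inst in instances]

  def build_migrate_jobs(plan):
    """Build one migrate job per instance, each with an explicit target."""
    return [[{"OP_ID": "OP_INSTANCE_MIGRATE",
              "instance_name": inst,
              "target_node": target,  # explicit node, no per-job iallocator
              "iallocator": None}]
            for (inst, target) in plan]

  if __name__ == "__main__":
    # Toy usage: pretend the planner spread three instances over two nodes.
    targets = iter(["node2", "node3", "node2"])
    plan = plan_node_evacuation(["inst1", "inst2", "inst3"],
                                lambda _inst: next(targets))
    for job in build_migrate_jobs(plan):
      print(job)

Because each submitted job already knows its target node, the jobs can run
in parallel without re-running the allocator or re-acquiring cluster-wide
locks.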
