2010/2/23 René Nussbaumer <[email protected]>:
> --- /dev/null
> +++ b/tools/cluster-merge
> +LOG_CLUSTER_MERGE = constants.LOG_DIR + "cluster-merge.log"

Please add a slash before the filename, just in case the directory
path doesn't already end with one.

> +class Merger(object):
> +  ROLLBACK_STEPS = (
> +      "Remove our key from authorized_keys on nodes: %(nodes)s",
> +      "Start all instances again on the merging clusters: %(clusters)s",
> +      "Restore %s from another master candidate" % 
> constants.CLUSTER_CONF_FILE)

This is used as a list, not a tuple. Please make it a list.

> +  def Setup(self, remote_key_name="id_dsa"):
> +    remote_path = os.path.join("~", ".ssh", remote_key_name)

As discussed offline, use ssh.GetUserFiles:

(remote_path, _, _) = ssh.GetUserFiles("root")

> +    # Fetch remotes private key
> +    for cluster in self.clusters:
> +      data = self.ssh_map[cluster] = {}
> +
> +      result = self._RunCmd(hostname=cluster, command="cat %s" % remote_path,
> +                            batch=False, ask_key=False)
> +      if result.failed:
> +        raise errors.RemoteError("There was an error while grabbing key from"
> +                                 " %s." % cluster)
> +
> +      key_path = data["key"] = os.path.join(self.work_dir, cluster)
> +      utils.WriteFile(key_path, mode=0600, data=result.stdout)
> +
> +      result = self._RunCmd(hostname=cluster, private_key=key_path,
> +                            command="gnt-node list -o name --no-header")
> +      if not result.failed:
> +        data["nodes"] = result.stdout.splitlines()
> +      else:
> +        raise errors.RemoteError("Unable to retrieve list of nodes.")
> +
> +      result = self._RunCmd(hostname=cluster, private_key=key_path,
> +                            command="gnt-instance list -o name --no-header")
> +      if not result.failed:
> +        data["instances"] = result.stdout.splitlines()
> +      else:
> +        raise errors.RemoteError("Unable to retrieve list of instances.")

here and in a few other places:

if result.failed:
  raise …

data[…] = …

Don't put periods at the end of exception messages.

> +  def _PrepareAuthorizedKeys(self):
> +    """Prepare the authorized_keys on every merging node.
> +
> +    This method add our public key to remotes authorized_key for further
> +    communication.
> +
> +    """
> +    _, pub_key_file, _ = ssh.GetUserFiles("root")
> +    pub_key = utils.ReadFile(pub_key_file)
> +
> +    for cluster, data in self.ssh_map.iteritems():
> +      for node in data["nodes"]:
> +        result = self._RunCmd(hostname=node, private_key=data["key"],
> +                              command=("cat >> %s/.ssh/authorized_keys <<"

ssh.GetUserFiles for authorized_keys.

> +                                       " '!EOF.'\n%s!EOF.\n" %
> +                                       (utils.GetHomeDir("root"), pub_key)))

Don't pass positional arguments with keywords. Here and in other places.

> +        if result.failed:
> +          raise errors.RemoteError("Unable to add our public key to %s in 
> %s."
> +                                   % (node, cluster))
> +
> +  def _RunCmd(self, hostname, command, user="root", use_cluster_key=False,
> +              strict_host_check=False, private_key=None, batch=True,
> +              ask_key=False):

Yes, this looks a lot better.

> +    """Wrapping SshRunner.Run with default parameters.
> +
> +    For explanation of parameters see L{SshRunner.Run}.

Shouldn't the link be L{ssh.SshRunner.Run}?

> +
> +    """
> +    return self.ssh_runner.Run(hostname=hostname, command=command, user=user,
> +                               use_cluster_key=use_cluster_key,
> +                               strict_host_check=strict_host_check,
> +                               private_key=private_key, batch=batch,
> +                               ask_key=ask_key)
> +
> +  def _StopMergingInstances(self):
> +    """Stop instances on merging clusters.
> +
> +    """
> +    for cluster in self.clusters:
> +      self._RunCmd(hostname=cluster,
> +                   command="gnt-instance shutdown --all --force-multiple")
> +
> +  def _DisableWatcher(self):
> +    """Disable watch on all merging clusters, including ourself.
> +
> +    """
> +    for cluster in ["localhost"] + self.clusters:
> +      self._RunCmd(hostname=cluster,
> +                   command="gnt-cluster watcher pause 1800")

Could the duration be a command line parameter instead of a hardcoded constant?

> +  # R0201: Method could be a function
> +  def _EnableWatcher(self): # pylint: disable-msg=R0201
> +    """Reenable watcher (locally).
> +
> +    """
> +    utils.RunCmd(["gnt-cluster", "watcher", "continue"])
> +
> +  def _StopDaemons(self):
> +    """Stop all daemons on merging nodes.
> +
> +    """
> +    # FIXME: Worth to put this into constants?
> +    cmds = []
> +    for daemon in (constants.NODED, constants.CONFD, constants.RAPI,
> +                   constants.MASTERD):
> +      cmds.append("%s stop %s" % (constants.DAEMON_UTIL, daemon))
> +    for data in self.ssh_map.values():
> +      for node in data["nodes"]:
> +        self._RunCmd(hostname=node,
> +                     command=';'.join(cmds))

What if stopping fails?

> +  # R0201: Method could be a function
> +  def _StartMasterDaemon(self, no_vote=False): # pylint: disable-msg=R0201
> +    cmd = ["ganeti-masterd"]
> +    if no_vote:
> +      cmd.extend(["--no-voting", "--yes-do-it"])
> +
> +    if utils.RunCmd(cmd).failed:
> +      raise errors.ProgrammerError(
> +          "Couldn't start ganeti-masterd." % options)

daemon-util (see backend.StartMaster)

Hm, this seems to be the time when you start the master daemon again.
I believe you should use the RPC call “node_start_master” (which also
configures the master role).

> +  def _ReaddMergedNodesAndRedist(self):
> +    for _, data in self.ssh_map.iteritems():
> +      for node in data["nodes"]:
> +        result = utils.RunCmd(["gnt-node", "add", "--readd",
> +                               "--no-ssh-key-check", node])
> +        if result.failed:
> +          raise errors.ProgrammerError("Couldn't readd node %s." % node)
> +
> +    if utils.RunCmd(["gnt-cluster", "redist-conf"]).failed:
> +      raise errors.ProgrammerError("Redistribution failed.")

Output ignored (here and in other places).

It's not a ProgrammerError. CommandError maybe. Here and in other places.

> +  def Merge(self):
> +    """Does the actual merge.
> +
> +    It runs all the steps in the right order and updates the user about steps
> +    taken. Also it keeps track of rollback_steps to undo everything.
> +
> +    """
> +    logging.info("Pre cluster verification")
> +    self._VerifyCluster()
> +    logging.info("Prepare authorized_keys")
> +    self._PrepareAuthorizedKeys()
> +    self.rollback_step += 1

Everyone modifying this code has to keep this counter in sync with
the list above. Wouldn't it be more sensible to use a
try/except/finally structure here and build the rollback steps as you
go?

Something along these lines:

rbsteps = []
try:
  try:
    rbsteps.append("Remove our key from …")
    self._PrepareAuthorizedKeys()

    rbsteps.append("Start all instances again on …")
    …

    logging.warning("We are at the point of no return. …")
  finally:
    del rbsteps[:]

  self._ReaddMergedNodesAndRedist()
except Exception:
  logging.exception("Error while …")
  print rbsteps

> +    logging.info("Stopping merging instances (takes a while)")
> […]

> +  def Cleanup(self):
> +    for data in self.ssh_map.values():
> +      if "key" in data:
> +        os.unlink(data["key"])

Maybe using an actual class instead of a dict would be easier to handle.

> +      if "config" in data:
> +        os.unlink(data["config"])
> +    os.rmdir(self.work_dir)

shutil.rmtree and you don't have to remove every file.

> +  def RollbackDescription(self, step):
> +    nodes = Flatten([data["nodes"] for _, data in self.ssh_map.iteritems()])

dict.itervalues

> +    info = {"clusters": self.clusters,
> +            "nodes": nodes}
> +    return "  * %s" % (self.ROLLBACK_STEPS[step] % info)
> +
> +def SetupLogging(options):
> +  """Setting up logging infrastructure.
> +
> +  @param options: Parsed command line options
> +
> +  """
> +  utils.SetupLogging(LOG_CLUSTER_MERGE,
> +      stderr_logging=True, program="cluster-merge",
> +      debug=True)
> +
> +  # FIXME: Is there a better less hackish way?

Yes, don't use SetupLogging. This is an interactive tool and hence
shouldn't write to a log file at all. For the user it's easier if she
can just look at stdout/stderr instead of having to open a log file to
diagnose problems.

> +  root_logger = logging.getLogger("")
> +  hndl = None
> +  for hndl in root_logger.handlers:
> +    if isinstance(hndl, logging.StreamHandler):
> +      if hndl.stream.name == "<stderr>":
> +        break
> +    hndl = None
> +
> +  # W0631: Using possibly undefined loop variable 'hndl'
> +  if hndl: # pylint: disable-msg=W0631
> +    if options.verbose:
> +      hndl.setLevel(logging.INFO)
> +    elif not options.verbose and not options.debug:
> +      hndl.setLevel(logging.CRITICAL)
> […]

> +  cluster_merger = Merger(args)
> +  try:
> +    try:
> +      cluster_merger.Setup()
> +      cluster_merger.Merge()
> +    except errors.GenericError, e:
> +      logging.critical(e)

logging.exception will give you a nice backtrace.

Regards,
Michael

Reply via email to