2010/2/23 René Nussbaumer <[email protected]>:
> --- /dev/null
> +++ b/tools/cluster-merge
> +LOG_CLUSTER_MERGE = constants.LOG_DIR + "cluster-merge.log"
Please add a slash before the filename, in case the directory
constant doesn't end with one.
> +class Merger(object):
> + ROLLBACK_STEPS = (
> + "Remove our key from authorized_keys on nodes: %(nodes)s",
> + "Start all instances again on the merging clusters: %(clusters)s",
> + "Restore %s from another master candidate" %
> constants.CLUSTER_CONF_FILE)
This is used as a list, not a tuple. Please make it a list.
> + def Setup(self, remote_key_name="id_dsa"):
> + remote_path = os.path.join("~", ".ssh", remote_key_name)
As discussed offline, use ssh.GetUserFiles:
(remote_path, _, _) = ssh.GetUserFiles("root")
> + # Fetch remotes private key
> + for cluster in self.clusters:
> + data = self.ssh_map[cluster] = {}
> +
> + result = self._RunCmd(hostname=cluster, command="cat %s" % remote_path,
> + batch=False, ask_key=False)
> + if result.failed:
> + raise errors.RemoteError("There was an error while grabbing key from"
> + " %s." % cluster)
> +
> + key_path = data["key"] = os.path.join(self.work_dir, cluster)
> + utils.WriteFile(key_path, mode=0600, data=result.stdout)
> +
> + result = self._RunCmd(hostname=cluster, private_key=key_path,
> + command="gnt-node list -o name --no-header")
> + if not result.failed:
> + data["nodes"] = result.stdout.splitlines()
> + else:
> + raise errors.RemoteError("Unable to retrieve list of nodes.")
> +
> + result = self._RunCmd(hostname=cluster, private_key=key_path,
> + command="gnt-instance list -o name --no-header")
> + if not result.failed:
> + data["instances"] = result.stdout.splitlines()
> + else:
> + raise errors.RemoteError("Unable to retrieve list of instances.")
here and in a few other places:
if result.failed:
raise …
data[…] = …
Don't put periods at the end of exception messages.
> + def _PrepareAuthorizedKeys(self):
> + """Prepare the authorized_keys on every merging node.
> +
> + This method add our public key to remotes authorized_key for further
> + communication.
> +
> + """
> + _, pub_key_file, _ = ssh.GetUserFiles("root")
> + pub_key = utils.ReadFile(pub_key_file)
> +
> + for cluster, data in self.ssh_map.iteritems():
> + for node in data["nodes"]:
> + result = self._RunCmd(hostname=node, private_key=data["key"],
> + command=("cat >> %s/.ssh/authorized_keys <<"
Use ssh.GetUserFiles to get the authorized_keys path here as well.
> + " '!EOF.'\n%s!EOF.\n" %
> + (utils.GetHomeDir("root"), pub_key)))
Don't pass positional arguments with keywords. Here and in other places.
> + if result.failed:
> + raise errors.RemoteError("Unable to add our public key to %s in
> %s."
> + % (node, cluster))
> +
> + def _RunCmd(self, hostname, command, user="root", use_cluster_key=False,
> + strict_host_check=False, private_key=None, batch=True,
> + ask_key=False):
Yes, this looks a lot better.
> + """Wrapping SshRunner.Run with default parameters.
> +
> + For explanation of parameters see L{SshRunner.Run}.
Shouldn't the link be L{ssh.SshRunner.Run}?
> +
> + """
> + return self.ssh_runner.Run(hostname=hostname, command=command, user=user,
> + use_cluster_key=use_cluster_key,
> + strict_host_check=strict_host_check,
> + private_key=private_key, batch=batch,
> + ask_key=ask_key)
> +
> + def _StopMergingInstances(self):
> + """Stop instances on merging clusters.
> +
> + """
> + for cluster in self.clusters:
> + self._RunCmd(hostname=cluster,
> + command="gnt-instance shutdown --all --force-multiple")
> +
> + def _DisableWatcher(self):
> + """Disable watch on all merging clusters, including ourself.
> +
> + """
> + for cluster in ["localhost"] + self.clusters:
> + self._RunCmd(hostname=cluster,
> + command="gnt-cluster watcher pause 1800")
Could the duration be a command line parameter instead of a hardcoded constant?
> + # R0201: Method could be a function
> + def _EnableWatcher(self): # pylint: disable-msg=R0201
> + """Reenable watcher (locally).
> +
> + """
> + utils.RunCmd(["gnt-cluster", "watcher", "continue"])
> +
> + def _StopDaemons(self):
> + """Stop all daemons on merging nodes.
> +
> + """
> + # FIXME: Worth to put this into constants?
> + cmds = []
> + for daemon in (constants.NODED, constants.CONFD, constants.RAPI,
> + constants.MASTERD):
> + cmds.append("%s stop %s" % (constants.DAEMON_UTIL, daemon))
> + for data in self.ssh_map.values():
> + for node in data["nodes"]:
> + self._RunCmd(hostname=node,
> + command=';'.join(cmds))
What if stopping fails?
> + # R0201: Method could be a function
> + def _StartMasterDaemon(self, no_vote=False): # pylint: disable-msg=R0201
> + cmd = ["ganeti-masterd"]
> + if no_vote:
> + cmd.extend(["--no-voting", "--yes-do-it"])
> +
> + if utils.RunCmd(cmd).failed:
> + raise errors.ProgrammerError(
> + "Couldn't start ganeti-masterd." % options)
Use daemon-util to start the daemon (see backend.StartMaster).
Hm, this seems to be the time when you start the master daemon again.
I believe you should use the RPC call “node_start_master” (which also
configures the master role).
> + def _ReaddMergedNodesAndRedist(self):
> + for _, data in self.ssh_map.iteritems():
> + for node in data["nodes"]:
> + result = utils.RunCmd(["gnt-node", "add", "--readd",
> + "--no-ssh-key-check", node])
> + if result.failed:
> + raise errors.ProgrammerError("Couldn't readd node %s." % node)
> +
> + if utils.RunCmd(["gnt-cluster", "redist-conf"]).failed:
> + raise errors.ProgrammerError("Redistribution failed.")
Output ignored (here and in other places).
It's not a ProgrammerError. CommandError maybe. Here and in other places.
> + def Merge(self):
> + """Does the actual merge.
> +
> + It runs all the steps in the right order and updates the user about steps
> + taken. Also it keeps track of rollback_steps to undo everything.
> +
> + """
> + logging.info("Pre cluster verification")
> + self._VerifyCluster()
> + logging.info("Prepare authorized_keys")
> + self._PrepareAuthorizedKeys()
> + self.rollback_step += 1
Everyone modifying this code has to keep this counter in sync with
the list above. Wouldn't it be more sensible to use a
try/except/finally structure here and build the rollback steps as you
go?
Something along these lines:
rbsteps = []
try:
try:
rbsteps.append("Remove our key from …")
self._PrepareAuthorizedKeys()
rbsteps.append("Start all instances again on …")
…
logging.warning("We are at the point of no return. …")
finally:
del rbsteps[:]
self._ReaddMergedNodesAndRedist()
except Exception:
logging.exception("Error while …")
print rbsteps
> + logging.info("Stopping merging instances (takes a while)")
> […]
> + def Cleanup(self):
> + for data in self.ssh_map.values():
> + if "key" in data:
> + os.unlink(data["key"])
Maybe using an actual class instead of a dict would be easier to handle.
> + if "config" in data:
> + os.unlink(data["config"])
> + os.rmdir(self.work_dir)
shutil.rmtree and you don't have to remove every file.
> + def RollbackDescription(self, step):
> + nodes = Flatten([data["nodes"] for _, data in self.ssh_map.iteritems()])
dict.itervalues
> + info = {"clusters": self.clusters,
> + "nodes": nodes}
> + return " * %s" % (self.ROLLBACK_STEPS[step] % info)
> +
> +def SetupLogging(options):
> + """Setting up logging infrastructure.
> +
> + @param options: Parsed command line options
> +
> + """
> + utils.SetupLogging(LOG_CLUSTER_MERGE,
> + stderr_logging=True, program="cluster-merge",
> + debug=True)
> +
> + # FIXME: Is there a better less hackish way?
Yes, don't use SetupLogging. This is an interactive tool and hence
shouldn't write to a log file at all. For the user it's easier if she
can just look at stdout/stderr instead of having to open a file to
diagnose.
> + root_logger = logging.getLogger("")
> + hndl = None
> + for hndl in root_logger.handlers:
> + if isinstance(hndl, logging.StreamHandler):
> + if hndl.stream.name == "<stderr>":
> + break
> + hndl = None
> +
> + # W0631: Using possibly undefined loop variable 'hndl'
> + if hndl: # pylint: disable-msg=W0631
> + if options.verbose:
> + hndl.setLevel(logging.INFO)
> + elif not options.verbose and not options.debug:
> + hndl.setLevel(logging.CRITICAL)
> […]
> + cluster_merger = Merger(args)
> + try:
> + try:
> + cluster_merger.Setup()
> + cluster_merger.Merge()
> + except errors.GenericError, e:
> + logging.critical(e)
logging.exception will give you a nice backtrace.
Regards,
Michael