This is a convenience command to do an automated EPO within the limits of
what is possible in Ganeti.
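
To illustrate the concept, the intended invocations look roughly like this
(node and group names are just examples):

  gnt-cluster epo node1 node2      # power off the given nodes
  gnt-cluster epo --groups group1  # arguments are node groups
  gnt-cluster epo --all            # shut down the whole cluster
  gnt-cluster epo --on --all       # recover the cluster from the EPO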
---
Unittests/QA tests will follow as soon as I get approval on the _concept_,
so I don't put effort into something that is not wanted or does not work the
way you want it to.
lib/client/gnt_cluster.py | 279 +++++++++++++++++++++++++++++++++++++++++++++
man/gnt-cluster.rst | 17 +++
2 files changed, 296 insertions(+), 0 deletions(-)
diff --git a/lib/client/gnt_cluster.py b/lib/client/gnt_cluster.py
index 1755ef9..3a45b58 100644
--- a/lib/client/gnt_cluster.py
+++ b/lib/client/gnt_cluster.py
@@ -29,6 +29,7 @@
import os.path
import time
import OpenSSL
+import itertools
from ganeti.cli import *
from ganeti import opcodes
@@ -40,6 +41,18 @@ from ganeti import ssh
from ganeti import objects
from ganeti import uidpool
from ganeti import compat
+from ganeti import netutils
+
+
+ON_OPT = cli_option("--on", default=False,
+ action="store_true", dest="on",
+ help="Recover from an EPO")
+
+GROUPS_OPT = cli_option("--groups", default=False,
+ action="store_true", dest="groups",
+ help="Arguments are node groups instead of nodes")
+
+_PING_INTERVAL = 30 # 30 seconds between pings
@UsesRPC
@@ -882,6 +895,267 @@ def WatcherOps(opts, args):
return 0
+def _OobPower(opts, node_list, power):
+ """Puts the nodes in the list into the desired power state.
+
+ @param opts: The command line options selected by the user
+ @param node_list: The list of nodes to operate on
+ @param power: True if they should be powered on, False otherwise
+ @return: The success of the operation (none failed)
+
+ """
+ if power:
+ command = constants.OOB_POWER_ON
+ else:
+ command = constants.OOB_POWER_OFF
+
+ op = opcodes.OpOobCommand(node_names=node_list,
+ command=command,
+ ignore_status=True,
+ timeout=opts.oob_timeout)
+ result = SubmitOpCode(op, opts=opts)
+ errs = 0
+ for node_result in result:
+ (node_tuple, data_tuple) = node_result
+ (_, node_name) = node_tuple
+ (data_status, _) = data_tuple
+ if data_status != constants.RS_NORMAL:
+ assert data_status != constants.RS_UNAVAIL
+ errs += 1
+ ToStderr("There was a problem changing power for %s, please investigate",
+ node_name)
+
+ if errs > 0:
+ return False
+
+ return True
+
+
+def _InstanceStart(opts, inst_list, start):
+ """Puts the instances in the list into the desired state.
+
+ @param opts: The command line options selected by the user
+ @param inst_list: The list of instances to operate on
+ @param start: True if they should be started, False for shutdown
+ @return: The success of the operation (none failed)
+
+ """
+ if start:
+ opcls = opcodes.OpInstanceStartup
+ text_addition = ("startup", "started", "starting")
+ else:
+ opcls = opcodes.OpInstanceShutdown
+ text_addition = ("shutdown", "stopped", "stopping")
+
+ jex = JobExecutor(opts=opts)
+
+ for inst in inst_list:
+ ToStdout("Submit %s of instance %s", text_addition[0], inst)
+ op = opcls(instance_name=inst)
+ jex.QueueJob(inst, op)
+
+ results = jex.GetResults()
+ bad_cnt = len([row for row in results if not row[0]])
+
+ if bad_cnt == 0:
+ ToStdout("All instances have been %s successfully", text_addition[1])
+ else:
+ ToStderr("There were errors while %s instances:\n"
+ "%d error(s) out of %d instance(s)", text_addition[2], bad_cnt,
+ len(results))
+ return False
+
+ return True
+
+
+def _RunWhenNodesReachable(node_list, action_cb, interval,
+ ping_fn=netutils.TcpPing, sleep_fn=time.sleep):
+ """Run action_cb when nodes become reachable.
+
+ @param node_list: The list of nodes to be reachable
+ @param action_cb: Callback called when a new host is reachable
+ @param interval: The earliest time to retry
+ @param ping_fn: Function to check reachability (for unittest use only)
+ @param sleep_fn: Function to sleep (for unittest use only)
+
+ """
+ client = GetClient()
+ cluster_info = client.QueryClusterInfo()
+ if cluster_info["primary_ip_version"] == constants.IP4_VERSION:
+ family = netutils.IPAddress.family
+ else:
+ family = netutils.IP6Address.family
+
+ nodes = [(name, netutils.GetHostname(name, family=family))
+ for name in node_list]
+ node_states = dict((name, False) for name in node_list)
+ success = True
+
+ while nodes:
+ timeout = utils.RunningTimeout(interval, True)
+ new = False
+ # Iterate over a copy, as reachable nodes get removed from the list
+ for (node_name, node) in nodes[:]:
+ if ping_fn(node.ip, constants.DEFAULT_NODED_PORT, timeout=1,
+ live_port_needed=True):
+ ToStdout("Node %s became available", node.name)
+ node_states[node_name] = True
+ nodes.remove((node_name, node))
+ new = True
+
+ if new:
+ if not action_cb(node_states):
+ success = False
+
+ time_remaining = timeout.Remaining()
+ if nodes and time_remaining > 0:
+ sleep_fn(time_remaining)
+
+ return success
+
+
+def _MaybeInstanceStartup(opts, inst_map, node_states):
+ """Start instances conditionally, based on node_states.
+
+ @param opts: The command line options selected by the user
+ @param inst_map: A dict of inst -> nodes mapping
+ @param node_states: A dict of node -> on/offline (True/False)
+ @return: Success of the operation on all instances
+
+ """
+ nodes_online = set(node for (node, on) in node_states.items() if on)
+
+ start_inst_list = []
+ for (inst, nodes) in inst_map.items():
+ if not (nodes - nodes_online):
+ # All nodes the instance lives on are back online
+ start_inst_list.append(inst)
+ del inst_map[inst]
+
+ if start_inst_list:
+ return _InstanceStart(opts, start_inst_list, True)
+
+ return True
+
+
+def _EpoOn(opts, full_node_list, node_list, inst_map):
+ """Does the actual power on.
+
+ @param opts: The command line options selected by the user
+ @param full_node_list: All nodes to operate on (includes nodes not supporting
+ OOB)
+ @param node_list: The list of nodes to operate on (all need to support OOB)
+ @param inst_map: A dict of inst -> nodes mapping
+ @return: The desired exit status
+
+ """
+ if node_list and not _OobPower(opts, node_list, True):
+ ToStderr("Not all nodes could be powered on, investigate and power them on"
+ " manually if needed")
+
+ # Wait for the nodes to be back up
+ action_cb = compat.partial(_MaybeInstanceStartup, opts, dict(inst_map))
+
+ ToStdout("Waiting until all nodes are available again")
+ if not _RunWhenNodesReachable(full_node_list, action_cb, _PING_INTERVAL):
+ ToStderr("Please investigate and manually start instances that are not"
+ " running")
+ return constants.EXIT_FAILURE
+
+ return constants.EXIT_SUCCESS
+
+
+def _EpoOff(opts, node_list, inst_map):
+ """Does the actual power off.
+
+ @param opts: The command line options selected by the user
+ @param node_list: The list of nodes to operate on (all need to support OOB)
+ @param inst_map: A dict of inst -> nodes mapping
+ @return: The desired exit status
+
+ """
+ if not _InstanceStart(opts, inst_map.keys(), False):
+ ToStderr("Please investigate and stop instances manually before continuing")
+ return constants.EXIT_FAILURE
+
+ if node_list and _OobPower(opts, node_list, False):
+ return constants.EXIT_SUCCESS
+ else:
+ return constants.EXIT_FAILURE
+
+
+def Epo(opts, args):
+ """EPO operations.
+
+ @param opts: the command line options selected by the user
+ @type args: list
+ @param args: the node names (or node group names with --groups) to operate on
+ @rtype: int
+ @return: the desired exit code
+
+ """
+ if opts.groups and opts.show_all:
+ ToStderr("Only one of --groups or --all is allowed")
+ return constants.EXIT_FAILURE
+ elif args and opts.show_all:
+ ToStderr("Arguments in combination with --all are not allowed")
+ return constants.EXIT_FAILURE
+
+ client = GetClient()
+
+ if opts.groups:
+ node_query_list = list(itertools.chain(*client.QueryGroups(names=args,
+ fields=["node_list"],
+ use_locking=False)))
+ else:
+ node_query_list = args
+
+ result = client.QueryNodes(names=node_query_list,
+ fields=["name", "master", "pinst_list",
+ "sinst_list", "powered", "offline"],
+ use_locking=False)
+ all_nodes = []
+ node_list = []
+ inst_map = {}
+ for (node, master, pinsts, sinsts, powered, offline) in result:
+ # Collect the canonical node names as returned by the query (used for the
+ # confirmation prompt and for waiting on nodes during --on)
+ all_nodes.append(node)
+ if not offline:
+ for inst in (pinsts + sinsts):
+ if inst not in inst_map:
+ if master:
+ inst_map[inst] = set()
+ else:
+ inst_map[inst] = set([node])
+ else:
+ if not master:
+ inst_map[inst].add(node)
+
+ if master and opts.on:
+ # We ignore the master for turning on the machines, in fact we are
+ # already operating on the master at this point :)
+ continue
+ elif master and not opts.show_all:
+ ToStderr("%s is the master node, please do a master-failover to another"
+ " node not affected by the EPO or use --all if you intend to"
+ " shut down the whole cluster", node)
+ return constants.EXIT_FAILURE
+ elif powered is None:
+ ToStdout("Node %s does not support out-of-band handling, it cannot be"
+ " handled in a fully automated manner", node)
+ elif powered == opts.on:
+ ToStdout("Node %s is already in desired power state, skipping", node)
+ else:
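+ # The node is either online, or marked offline but still powered;
+ # in both cases include it in the OOB power state change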
+ if not offline or (offline and powered):
+ node_list.append(node)
+
+ if not opts.force and not ConfirmOperation(all_nodes, "nodes", "epo"):
+ return constants.EXIT_FAILURE
+
+ if opts.on:
+ return _EpoOn(opts, all_nodes, node_list, inst_map)
+ else:
+ return _EpoOff(opts, node_list, inst_map)
+
+
commands = {
'init': (
InitCluster, [ArgHost(min=1, max=1)],
@@ -977,6 +1251,11 @@ commands = {
NEW_CLUSTER_DOMAIN_SECRET_OPT, CLUSTER_DOMAIN_SECRET_OPT],
"[opts...]",
"Renews cluster certificates, keys and secrets"),
+ "epo": (
+ Epo, [ArgUnknown()],
+ [FORCE_OPT, ON_OPT, GROUPS_OPT, ALL_OPT, OOB_TIMEOUT_OPT],
+ "[opts...] [args]",
+ "Performs an EPO on given args"),
}
diff --git a/man/gnt-cluster.rst b/man/gnt-cluster.rst
index 5a65324..6dc52ee 100644
--- a/man/gnt-cluster.rst
+++ b/man/gnt-cluster.rst
@@ -93,6 +93,23 @@ Remove all configuration files related to the cluster, so that a
Since this is a dangerous command, you are required to pass the
argument *--yes-do-it.*
+EPO
+~~~
+
+**epo** [--on] [--groups|--all] *arguments*
+
+Performs an emergency power-off (EPO) on the nodes given as arguments. If
+``--groups`` is given, the arguments are node groups. If ``--all`` is
+provided, the whole cluster will be shut down.
+
+The ``--on`` flag recovers the cluster from an EPO.
+
+Please note that the master node will not be powered down or up
+automatically. It is left in a state from which you can manually shut down
+that one node. If the master is among the affected nodes and this is not a
+complete cluster EPO (i.e. ``--all`` was not given), you are required to do a
+master failover to an unaffected node first.
+
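+For example, to shut down the whole cluster and recover from the EPO
+afterwards::
+
+    # gnt-cluster epo --all
+    # gnt-cluster epo --on --all
+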
GETMASTER
~~~~~~~~~
--
1.7.3.1