This allows failing over in certain corner cases, such as a two-node cluster with one node down. The man page is also updated to document the shortcomings of this option (we cannot pass --no-voting to the master daemon ourselves, because that requires user interaction) and to explain how to make the cluster consistent again.
Signed-off-by: Guido Trotter <[email protected]> --- lib/bootstrap.py | 32 +++++++++++++++++++------------- man/gnt-cluster.sgml | 14 ++++++++++++++ scripts/gnt-cluster | 19 +++++++++++++++++-- 3 files changed, 50 insertions(+), 15 deletions(-) diff --git a/lib/bootstrap.py b/lib/bootstrap.py index 0308484..496a017 100644 --- a/lib/bootstrap.py +++ b/lib/bootstrap.py @@ -373,13 +373,17 @@ def SetupNodeDaemon(cluster_name, node, ssh_key_check): (node, result.fail_reason, result.output)) -def MasterFailover(): +def MasterFailover(skip_voting=False): """Failover the master node. This checks that we are not already the master, and will cause the current master to cease being master, and the non-master to become new master. + @type skip_voting: boolean + @param skip_voting: force the operation without remote nodes agreement + (dangerous) + """ sstore = ssconf.SimpleStore() @@ -401,18 +405,20 @@ def MasterFailover(): " master candidates is:\n" "%s" % ('\n'.join(mc_no_master))) - vote_list = GatherMasterVotes(node_list) - - if vote_list: - voted_master = vote_list[0][0] - if voted_master is None: - raise errors.OpPrereqError("Cluster is inconsistent, most nodes did not" - " respond.") - elif voted_master != old_master: - raise errors.OpPrereqError("I have wrong configuration, I believe the" - " master is %s but the other nodes voted for" - " %s. Please resync the configuration of" - " this node." % (old_master, voted_master)) + if not skip_voting: + vote_list = GatherMasterVotes(node_list) + + if vote_list: + voted_master = vote_list[0][0] + if voted_master is None: + raise errors.OpPrereqError("Cluster is inconsistent, most nodes did" + " not respond.") + elif voted_master != old_master: + raise errors.OpPrereqError("I have a wrong configuration, I believe" + " the master is %s but the other nodes" + " voted %s. Please resync the configuration" + " of this node." 
% + (old_master, voted_master)) # end checks rcode = 0 diff --git a/man/gnt-cluster.sgml b/man/gnt-cluster.sgml index e3fecbf..9467c00 100644 --- a/man/gnt-cluster.sgml +++ b/man/gnt-cluster.sgml @@ -442,11 +442,25 @@ <cmdsynopsis> <command>masterfailover</command> + <arg>--no-voting</arg> </cmdsynopsis> <para> Failover the master role to the current node. </para> + + <para> + The <option>--no-voting</option> option skips the remote node agreement + checks. This is dangerous, but necessary in some cases (for example + failing over the master role in a 2 node cluster with the second node + down). After a failover performed this way the master daemon will most + probably not start, and you will need to start it manually passing the + --no-voting option to ganeti-masterd as well. Be careful because the + second node will still believe to be the master, so when it comes up + you'll need to start just ganeti-noded there, and perform a gnt-cluster + redist-conf on the new master to make the cluster consistent again. + </para> + </refsect2> <refsect2> diff --git a/scripts/gnt-cluster b/scripts/gnt-cluster index 99cab31..d547ac3 100755 --- a/scripts/gnt-cluster +++ b/scripts/gnt-cluster @@ -424,7 +424,17 @@ def MasterFailover(opts, args): @return: the desired exit code """ - return bootstrap.MasterFailover() + if opts.skip_voting: + sys.stdout.write("The 'no voting' option has been selected.\n") + sys.stdout.write("This is dangerous, please confirm by" + " typing uppercase 'yes': ") + sys.stdout.flush() + confirmation = sys.stdin.readline().strip() + if confirmation != "YES": + print "Aborting." 
+ return + + return bootstrap.MasterFailover(skip_voting=opts.skip_voting) def SearchTags(opts, args): @@ -613,7 +623,12 @@ commands = { "", "Does a check on the cluster configuration"), 'verify-disks': (VerifyDisks, ARGS_NONE, [DEBUG_OPT], "", "Does a check on the cluster disk status"), - 'masterfailover': (MasterFailover, ARGS_NONE, [DEBUG_OPT], + 'masterfailover': (MasterFailover, ARGS_NONE, [DEBUG_OPT, + make_option("--no-voting", dest="skip_voting", + help="Skip node agreement check (dangerous)", + action="store_true", + default=False,), + ], "", "Makes the current node the master"), 'version': (ShowClusterVersion, ARGS_NONE, [DEBUG_OPT], "", "Shows the cluster version"), -- 1.5.6.5
