This allows failing over in certain corner cases, such as a 2 node
cluster with one node down. The man page is also updated to document
this dangerous option and how to recover from this situation.

Signed-off-by: Guido Trotter <[email protected]>
---
 lib/bootstrap.py     |   34 ++++++++++++++++++++--------------
 man/gnt-cluster.sgml |   13 +++++++++++++
 scripts/gnt-cluster  |   17 +++++++++++++++--
 3 files changed, 48 insertions(+), 16 deletions(-)

diff --git a/lib/bootstrap.py b/lib/bootstrap.py
index 1990fc4..3f96561 100644
--- a/lib/bootstrap.py
+++ b/lib/bootstrap.py
@@ -373,13 +373,17 @@ def SetupNodeDaemon(cluster_name, node, ssh_key_check):
                              (node, result.fail_reason, result.output))
 
 
-def MasterFailover():
+def MasterFailover(no_voting=False):
   """Failover the master node.
 
   This checks that we are not already the master, and will cause the
   current master to cease being master, and the non-master to become
   new master.
 
+  @type no_voting: boolean
+  @param no_voting: force the operation without remote nodes agreement
+                      (dangerous)
+
   """
   sstore = ssconf.SimpleStore()
 
@@ -401,18 +405,20 @@ def MasterFailover():
                                " master candidates is:\n"
                                "%s" % ('\n'.join(mc_no_master)))
 
-  vote_list = GatherMasterVotes(node_list)
-
-  if vote_list:
-    voted_master = vote_list[0][0]
-    if voted_master is None:
-      raise errors.OpPrereqError("Cluster is inconsistent, most nodes did not"
-                                 " respond.")
-    elif voted_master != old_master:
-      raise errors.OpPrereqError("I have wrong configuration, I believe the"
-                                 " master is %s but the other nodes voted for"
-                                 " %s. Please resync the configuration of"
-                                 " this node." % (old_master, voted_master))
+  if not no_voting:
+    vote_list = GatherMasterVotes(node_list)
+
+    if vote_list:
+      voted_master = vote_list[0][0]
+      if voted_master is None:
+        raise errors.OpPrereqError("Cluster is inconsistent, most nodes did"
+                                   " not respond.")
+      elif voted_master != old_master:
+        raise errors.OpPrereqError("I have a wrong configuration, I believe"
+                                   " the master is %s but the other nodes"
+                                   " voted %s. Please resync the configuration"
+                                   " of this node." %
+                                   (old_master, voted_master))
   # end checks
 
   rcode = 0
@@ -436,7 +442,7 @@ def MasterFailover():
   # cluster info
   cfg.Update(cluster_info)
 
-  result = rpc.RpcRunner.call_node_start_master(new_master, True, False)
+  result = rpc.RpcRunner.call_node_start_master(new_master, True, no_voting)
   if result.failed or not result.data:
     logging.error("Could not start the master role on the new master"
                   " %s, please check", new_master)
diff --git a/man/gnt-cluster.sgml b/man/gnt-cluster.sgml
index e3fecbf..f675fca 100644
--- a/man/gnt-cluster.sgml
+++ b/man/gnt-cluster.sgml
@@ -442,11 +442,24 @@
 
       <cmdsynopsis>
         <command>masterfailover</command>
+        <arg>--no-voting</arg>
       </cmdsynopsis>
 
       <para>
         Failover the master role to the current node.
       </para>
+
+      <para>
+        The <option>--no-voting</option> option skips the remote node agreement
+        checks. This is dangerous, but necessary in some cases (for example
+        failing over the master role in a 2 node cluster with the original master
+        down). If the original master then comes up, it won't be able to start
+        its master daemon because it won't have enough votes, but neither will the
+        new master, if the master daemon ever needs a restart. You can pass
+        --no-voting to ganeti-masterd on the new master to solve this problem,
+        and gnt-cluster redist-conf to make sure the cluster is consistent again.
+      </para>
+
     </refsect2>
 
     <refsect2>
diff --git a/scripts/gnt-cluster b/scripts/gnt-cluster
index 99cab31..3696b37 100755
--- a/scripts/gnt-cluster
+++ b/scripts/gnt-cluster
@@ -424,7 +424,15 @@ def MasterFailover(opts, args):
   @return: the desired exit code
 
   """
-  return bootstrap.MasterFailover()
+  if opts.no_voting:
+    usertext = ("This will perform the failover even if most other nodes"
+                " are down, or if this node is outdated. This is dangerous"
+                " as it can lead to a non-consistent cluster. Check the" 
+                " gnt-cluster(8) man page before proceeding. Continue?")
+    if not AskUser(usertext):
+      return 1
+
+  return bootstrap.MasterFailover(no_voting=opts.no_voting)
 
 
 def SearchTags(opts, args):
@@ -613,7 +621,12 @@ commands = {
              "", "Does a check on the cluster configuration"),
   'verify-disks': (VerifyDisks, ARGS_NONE, [DEBUG_OPT],
                    "", "Does a check on the cluster disk status"),
-  'masterfailover': (MasterFailover, ARGS_NONE, [DEBUG_OPT],
+  'masterfailover': (MasterFailover, ARGS_NONE, [DEBUG_OPT,
+                     make_option("--no-voting", dest="no_voting",
+                                 help="Skip node agreement check (dangerous)",
+                                 action="store_true",
+                                 default=False,),
+                     ],
                      "", "Makes the current node the master"),
   'version': (ShowClusterVersion, ARGS_NONE, [DEBUG_OPT],
               "", "Shows the cluster version"),
-- 
1.5.6.5

Reply via email to