This patch introduces retries for updating non-master nodes
when a new SSH key is added to the cluster. If an SSH update
fails even after the maximum number of retries is exhausted,
this does not raise an exception, but an error message is
added to the result of the method, which can be displayed
in the LU operations so that the admin sees which nodes
were the problem and need further attention.

Signed-off-by: Helga Velroyen <[email protected]>
---
 lib/backend.py                     |  73 +++++++++++++++++++--------
 lib/cmdlib/cluster.py              |   9 +++-
 lib/cmdlib/node.py                 |  14 ++++-
 test/py/ganeti.backend_unittest.py | 101 ++++++++++++++++++++++++++++++++++++-
 4 files changed, 169 insertions(+), 28 deletions(-)

diff --git a/lib/backend.py b/lib/backend.py
index 818f3fb..3b17306 100644
--- a/lib/backend.py
+++ b/lib/backend.py
@@ -1566,17 +1566,38 @@ def AddNodeSshKey(node_uuid, node_name,
   master_node = ssconf_store.GetMasterNode()
   online_nodes = ssconf_store.GetOnlineNodeList()
 
+  node_errors = {}
   for node in all_nodes:
     if node == master_node:
+      logging.debug("Skipping master node '%s'.", master_node)
       continue
     if node not in online_nodes:
       logging.debug("Skipping offline node '%s'.", node)
       continue
     if node in potential_master_candidates:
-      run_cmd_fn(cluster_name, node, pathutils.SSH_UPDATE,
-                 ssh_port_map.get(node), pot_mc_data,
-                 debug=False, verbose=False, use_cluster_key=False,
-                 ask_key=False, strict_host_check=False)
+      logging.debug("Updating SSH key files of node '%s'.", node)
+      for i in range(max_retries):
+        try:
+          run_cmd_fn(cluster_name, node, pathutils.SSH_UPDATE,
+                     ssh_port_map.get(node), pot_mc_data,
+                     debug=False, verbose=False, use_cluster_key=False,
+                     ask_key=False, strict_host_check=False)
+          break
+        except errors.OpExecError as e:
+          logging.error("Updating SSH key files of node '%s' failed"
+                        " at try no %s. Error: %s.", node, i, e)
+          last_exception = e
+      else:
+        if last_exception:
+          error_msg = ("When adding the key of node '%s', updating SSH key"
+                       " files of node '%s' failed after %s retries."
+                       " Not trying again. Last error was: %s." %
+                       (node_name, node, max_retries, last_exception))
+          node_errors[node] = error_msg
+          # We only log the error and don't throw an exception, because
+          # one unreachable node shall not abort the entire procedure.
+          logging.error(error_msg)
+
     else:
       if to_authorized_keys:
         run_cmd_fn(cluster_name, node, pathutils.SSH_UPDATE,
@@ -1584,6 +1605,8 @@ def AddNodeSshKey(node_uuid, node_name,
                    debug=False, verbose=False, use_cluster_key=False,
                    ask_key=False, strict_host_check=False)
 
+  return node_errors
+
 
 def RemoveNodeSshKey(node_uuid, node_name,
                      master_candidate_uuids,
@@ -1925,6 +1948,10 @@ def RenewSshKeys(node_uuids, node_names, ssh_port_map,
 
   master_node_name = ssconf_store.GetMasterNode()
   master_node_uuid = _GetMasterNodeUUID(node_uuid_name_map, master_node_name)
+  # List of all node errors that happened, but which did not abort the
+  # procedure as a whole. It is important that this is a list to have a
+  # somewhat chronological history of events.
+  all_node_errors = []
 
   # process non-master nodes
   for node_uuid, node_name in node_uuid_name_map:
@@ -1989,15 +2016,16 @@ def RenewSshKeys(node_uuids, node_names, ssh_port_map,
       ssh.AddPublicKey(node_uuid, pub_key, key_file=pub_key_file)
 
     logging.debug("Add ssh key of node '%s'.", node_name)
-    AddNodeSshKey(node_uuid, node_name,
-                  potential_master_candidates,
-                  ssh_port_map,
-                  to_authorized_keys=master_candidate,
-                  to_public_keys=potential_master_candidate,
-                  get_public_keys=True,
-                  pub_key_file=pub_key_file, ssconf_store=ssconf_store,
-                  noded_cert_file=noded_cert_file,
-                  run_cmd_fn=run_cmd_fn)
+    node_errors = AddNodeSshKey(
+        node_uuid, node_name, potential_master_candidates,
+        ssh_port_map, to_authorized_keys=master_candidate,
+        to_public_keys=potential_master_candidate,
+        get_public_keys=True,
+        pub_key_file=pub_key_file, ssconf_store=ssconf_store,
+        noded_cert_file=noded_cert_file,
+        run_cmd_fn=run_cmd_fn)
+    if node_errors:
+      all_node_errors = all_node_errors + node_errors.items()
 
   # Renewing the master node's key
 
@@ -2022,15 +2050,14 @@ def RenewSshKeys(node_uuids, node_names, ssh_port_map,
 
   # Add new master key to all node's public and authorized keys
   logging.debug("Add new master key to all nodes.")
-  AddNodeSshKey(master_node_uuid, master_node_name,
-                potential_master_candidates,
-                ssh_port_map,
-                to_authorized_keys=True,
-                to_public_keys=True,
-                get_public_keys=False,
-                pub_key_file=pub_key_file, ssconf_store=ssconf_store,
-                noded_cert_file=noded_cert_file,
-                run_cmd_fn=run_cmd_fn)
+  node_errors = AddNodeSshKey(
+      master_node_uuid, master_node_name, potential_master_candidates,
+      ssh_port_map, to_authorized_keys=True, to_public_keys=True,
+      get_public_keys=False, pub_key_file=pub_key_file,
+      ssconf_store=ssconf_store, noded_cert_file=noded_cert_file,
+      run_cmd_fn=run_cmd_fn)
+  if node_errors:
+    all_node_errors = all_node_errors + node_errors.items()
 
   # Remove the old key file and rename the new key to the non-temporary 
filename
   _ReplaceMasterKeyOnMaster(root_keyfiles)
@@ -2053,6 +2080,8 @@ def RenewSshKeys(node_uuids, node_names, ssh_port_map,
                    clear_authorized_keys=False,
                    clear_public_keys=False)
 
+  return all_node_errors
+
 
 def GetBlockDevSizes(devices):
   """Return the size of the given block devices
diff --git a/lib/cmdlib/cluster.py b/lib/cmdlib/cluster.py
index c504d60..14c18da 100644
--- a/lib/cmdlib/cluster.py
+++ b/lib/cmdlib/cluster.py
@@ -210,7 +210,7 @@ class LUClusterRenewCrypto(NoHooksLU):
     self.cfg.RemoveNodeFromCandidateCerts("%s-SERVER" % master_uuid)
     self.cfg.RemoveNodeFromCandidateCerts("%s-OLDMASTER" % master_uuid)
 
-  def _RenewSshKeys(self):
+  def _RenewSshKeys(self, feedback_fn):
     """Renew all nodes' SSH keys.
 
     """
@@ -230,6 +230,11 @@ class LUClusterRenewCrypto(NoHooksLU):
       master_candidate_uuids,
       potential_master_candidates)
     result[master_uuid].Raise("Could not renew the SSH keys of all nodes")
+    node_errors = result[master_uuid].payload
+    if node_errors:
+      feedback_fn("Some nodes' SSH key files could not be updated:")
+      for node_name, error_msg in node_errors:
+        feedback_fn("%s: %s" % (node_name, error_msg))
 
   def Exec(self, feedback_fn):
     if self.op.node_certificates:
@@ -237,7 +242,7 @@ class LUClusterRenewCrypto(NoHooksLU):
       self._RenewNodeSslCertificates(feedback_fn)
     if self.op.ssh_keys and not self._ssh_renewal_suppressed:
       feedback_fn("Renewing SSH keys")
-      self._RenewSshKeys()
+      self._RenewSshKeys(feedback_fn)
     elif self._ssh_renewal_suppressed:
       feedback_fn("Cannot renew SSH keys if the cluster is configured to not"
                   " modify the SSH setup.")
diff --git a/lib/cmdlib/node.py b/lib/cmdlib/node.py
index 19b630a..6a13d8a 100644
--- a/lib/cmdlib/node.py
+++ b/lib/cmdlib/node.py
@@ -339,7 +339,7 @@ class LUNodeAdd(LogicalUnit):
       result.Raise("Failed to initialize OpenVSwitch on new node")
 
   def _SshUpdate(self, new_node_uuid, new_node_name, is_master_candidate,
-                 is_potential_master_candidate, rpcrunner, readd):
+                 is_potential_master_candidate, rpcrunner, readd, feedback_fn):
     """Update the SSH setup of all nodes after adding a new node.
 
     @type readd: boolean
@@ -373,6 +373,11 @@ class LUNodeAdd(LogicalUnit):
       is_master_candidate, is_potential_master_candidate,
       is_potential_master_candidate)
     result[master_node].Raise("Could not update the node's SSH setup.")
+    node_errors = result[master_node].payload
+    if node_errors:
+      feedback_fn("Some nodes' SSH key files could not be updated:")
+      for node_name, error_msg in node_errors.items():
+        feedback_fn("%s: %s." % (node_name, error_msg))
 
   def Exec(self, feedback_fn):
     """Adds the new node to the cluster.
@@ -481,7 +486,7 @@ class LUNodeAdd(LogicalUnit):
       # FIXME: so far, all nodes are considered potential master candidates
       self._SshUpdate(self.new_node.uuid, self.new_node.name,
                       self.new_node.master_candidate, True,
-                      self.rpc, self.op.readd)
+                      self.rpc, self.op.readd, feedback_fn)
 
 
 class LUNodeSetParams(LogicalUnit):
@@ -897,6 +902,11 @@ class LUNodeSetParams(LogicalUnit):
           ssh_result[master_node].Raise(
             "Could not update the SSH setup of node '%s' after promotion"
             " (UUID: %s)." % (node.name, node.uuid))
+          node_errors = ssh_result[master_node].payload
+          if node_errors:
+            feedback_fn("Some nodes' SSH key files could not be updated:")
+            for node_name, error_msg in node_errors.items():
+              feedback_fn("%s: %s." % (node_name, error_msg))
 
     return result
 
diff --git a/test/py/ganeti.backend_unittest.py 
b/test/py/ganeti.backend_unittest.py
index cc65501..e05580d 100755
--- a/test/py/ganeti.backend_unittest.py
+++ b/test/py/ganeti.backend_unittest.py
@@ -1400,7 +1400,14 @@ class 
TestAddRemoveGenerateNodeSshKey(testutils.GanetiTestCase):
         self.assertTrue(self._ssh_file_manager.NodeHasAuthorizedKey(
             node, node_key))
 
-  def testAddKeySuccessfullyWithRetries(self):
+  def testAddKeySuccessfullyOnNewNodeWithRetries(self):
+    """Tests adding a new node's key when updating that node takes retries.
+
+    This test checks whether adding a new node's key successfully updates
+    all node's SSH key files, even if updating the new node's key files itself
+    takes a couple of retries to succeed.
+
+    """
     new_node_name = "new_node_name"
     new_node_uuid = "new_node_uuid"
     new_node_key = "new_node_key"
@@ -1430,7 +1437,14 @@ class 
TestAddRemoveGenerateNodeSshKey(testutils.GanetiTestCase):
     self.assertTrue(self._ssh_file_manager.AllNodesHaveAuthorizedKey(
         new_node_key))
 
-  def testAddKeyFailedWithRetries(self):
+  def testAddKeyFailedOnNewNodeWithRetries(self):
+    """Tests clean up if updating a new node's SSH setup fails.
+
+    If adding a new node's keys fails because updating the SSH key files of
+    that new node fails, check whether the operations already carried out
+    are successfully rolled back and thus the cluster's state is cleaned up.
+
+    """
     new_node_name = "new_node_name"
     new_node_uuid = "new_node_uuid"
     new_node_key = "new_node_key"
@@ -1466,6 +1480,89 @@ class 
TestAddRemoveGenerateNodeSshKey(testutils.GanetiTestCase):
     self.assertTrue(self._ssh_file_manager.NoNodeHasPublicKey(
         new_node_uuid, new_node_key))
 
+  def testAddKeySuccessfullyOnOldNodeWithRetries(self):
+    """Tests adding a new key even if updating nodes takes retries.
+
+    This tests whether adding a new node's key successfully finishes,
+    even if one of the other cluster nodes takes a couple of retries
+    to succeed.
+
+    """
+    new_node_name = "new_node_name"
+    new_node_uuid = "new_node_uuid"
+    new_node_key = "new_node_key"
+    is_master_candidate = True
+    is_potential_master_candidate = True
+    is_master = False
+
+    other_node_name, _, _, _, _, _ = self._ssh_file_manager \
+        .GetAllMasterCandidates()[0]
+    self._ssh_file_manager.SetMaxRetries(other_node_name, 3)
+    assert other_node_name != new_node_name
+    self._AddNewNodeToTestData(
+        new_node_name, new_node_uuid, new_node_key,
+        is_potential_master_candidate, is_master_candidate,
+        is_master)
+
+    backend.AddNodeSshKey(new_node_uuid, new_node_name,
+                          self._potential_master_candidates,
+                          self._ssh_port_map,
+                          to_authorized_keys=is_master_candidate,
+                          to_public_keys=is_potential_master_candidate,
+                          get_public_keys=is_potential_master_candidate,
+                          pub_key_file=self._pub_key_file,
+                          ssconf_store=self._ssconf_mock,
+                          noded_cert_file=self.noded_cert_file,
+                          run_cmd_fn=self._run_cmd_mock)
+
+    self.assertTrue(self._ssh_file_manager.AllNodesHaveAuthorizedKey(
+        new_node_key))
+
+  def testAddKeyFailedOnOldNodeWithRetries(self):
+    """Tests adding keys when updating one node's SSH setup fails.
+
+    This tests whether, when adding a new node's key while one node is
+    unreachable (but not marked as offline), the operation still finishes
+    properly and only that unreachable node's SSH key setup does not get
+    updated.
+
+    """
+    new_node_name = "new_node_name"
+    new_node_uuid = "new_node_uuid"
+    new_node_key = "new_node_key"
+    is_master_candidate = True
+    is_potential_master_candidate = True
+    is_master = False
+
+    other_node_name, _, _, _, _, _ = self._ssh_file_manager \
+        .GetAllMasterCandidates()[0]
+    self._ssh_file_manager.SetMaxRetries(other_node_name, 4)
+    assert other_node_name != new_node_name
+    self._AddNewNodeToTestData(
+        new_node_name, new_node_uuid, new_node_key,
+        is_potential_master_candidate, is_master_candidate,
+        is_master)
+
+    node_errors = backend.AddNodeSshKey(
+        new_node_uuid, new_node_name, self._potential_master_candidates,
+        self._ssh_port_map, to_authorized_keys=is_master_candidate,
+        to_public_keys=is_potential_master_candidate,
+        get_public_keys=is_potential_master_candidate,
+        pub_key_file=self._pub_key_file,
+        ssconf_store=self._ssconf_mock,
+        noded_cert_file=self.noded_cert_file,
+        run_cmd_fn=self._run_cmd_mock)
+
+    for node in self._all_nodes:
+      if node == other_node_name:
+        self.assertFalse(self._ssh_file_manager.NodeHasAuthorizedKey(
+          node, new_node_key))
+      else:
+        self.assertTrue(self._ssh_file_manager.NodeHasAuthorizedKey(
+          node, new_node_key))
+
+    self.assertTrue(node_errors[other_node_name])
+
 
 class TestVerifySshSetup(testutils.GanetiTestCase):
 
-- 
2.2.0.rc0.207.ga3a616c

Reply via email to