So far, if a SSH key of a node is added to the cluster
and updating the SSH setup of the node itself fails,
the entire operation fails quite disgraceful. This
patch introduces a retry mechanism and a proper cleanup
in case of a failure.

Note that there are more places where retries would
be a good idea. They'll be added in other patches.

Signed-off-by: Helga Velroyen <[email protected]>
---
 lib/backend.py                     | 28 +++++++++++++---
 test/py/ganeti.backend_unittest.py | 67 ++++++++++++++++++++++++++++++++++++++
 test/py/testutils_ssh.py           | 30 ++++++++++++++++-
 3 files changed, 120 insertions(+), 5 deletions(-)

diff --git a/lib/backend.py b/lib/backend.py
index bc1a570..818f3fb 100644
--- a/lib/backend.py
+++ b/lib/backend.py
@@ -1520,6 +1520,9 @@ def AddNodeSshKey(node_uuid, node_name,
   _InitSshUpdateData(base_data, noded_cert_file, ssconf_store)
   cluster_name = base_data[constants.SSHS_CLUSTER_NAME]
 
+  # number of retries for SSH connections
+  max_retries = 3
+
   # Update the target node itself
   if get_public_keys:
     node_data = {}
@@ -1527,10 +1530,27 @@ def AddNodeSshKey(node_uuid, node_name,
     all_keys = ssh.QueryPubKeyFile(None, key_file=pub_key_file)
     node_data[constants.SSHS_SSH_PUBLIC_KEYS] = \
       (constants.SSHS_OVERRIDE, all_keys)
-    run_cmd_fn(cluster_name, node_name, pathutils.SSH_UPDATE,
-               ssh_port_map.get(node_name), node_data,
-               debug=False, verbose=False, use_cluster_key=False,
-               ask_key=False, strict_host_check=False)
+
+    last_exception = None
+    for _ in range(max_retries):
+      try:
+        run_cmd_fn(cluster_name, node_name, pathutils.SSH_UPDATE,
+                   ssh_port_map.get(node_name), node_data,
+                   debug=False, verbose=False, use_cluster_key=False,
+                   ask_key=False, strict_host_check=False)
+        break
+      except errors.OpExecError as e:
+        logging.error("Updating SSH key files of node '%s' failed."
+                      " Error: %s", node_name, e)
+        last_exception = e
+    else:
+      if last_exception:
+        # Clean up the master's public key file if adding key fails
+        if to_public_keys:
+          ssh.RemovePublicKey(node_uuid)
+        raise errors.SshUpdateError(
+          "Could not update the SSH setup of node '%s' itself. Error: %s."
+          % (node_name, last_exception))
 
   # Update all nodes except master and the target node
   if to_authorized_keys:
diff --git a/test/py/ganeti.backend_unittest.py 
b/test/py/ganeti.backend_unittest.py
index 5305c15..cc65501 100755
--- a/test/py/ganeti.backend_unittest.py
+++ b/test/py/ganeti.backend_unittest.py
@@ -1400,6 +1400,73 @@ class 
TestAddRemoveGenerateNodeSshKey(testutils.GanetiTestCase):
         self.assertTrue(self._ssh_file_manager.NodeHasAuthorizedKey(
             node, node_key))
 
+  def testAddKeySuccessfullyWithRetries(self):
+    new_node_name = "new_node_name"
+    new_node_uuid = "new_node_uuid"
+    new_node_key = "new_node_key"
+    is_master_candidate = True
+    is_potential_master_candidate = True
+    is_master = False
+
+    self._AddNewNodeToTestData(
+        new_node_name, new_node_uuid, new_node_key,
+        is_potential_master_candidate, is_master_candidate,
+        is_master)
+    self._ssh_file_manager.SetMaxRetries(new_node_name, 3)
+
+    backend.AddNodeSshKey(new_node_uuid, new_node_name,
+                          self._potential_master_candidates,
+                          self._ssh_port_map,
+                          to_authorized_keys=is_master_candidate,
+                          to_public_keys=is_potential_master_candidate,
+                          get_public_keys=is_potential_master_candidate,
+                          pub_key_file=self._pub_key_file,
+                          ssconf_store=self._ssconf_mock,
+                          noded_cert_file=self.noded_cert_file,
+                          run_cmd_fn=self._run_cmd_mock)
+
+    self._ssh_file_manager.PotentialMasterCandidatesOnlyHavePublicKey(
+        new_node_name)
+    self.assertTrue(self._ssh_file_manager.AllNodesHaveAuthorizedKey(
+        new_node_key))
+
+  def testAddKeyFailedWithRetries(self):
+    new_node_name = "new_node_name"
+    new_node_uuid = "new_node_uuid"
+    new_node_key = "new_node_key"
+    is_master_candidate = True
+    is_potential_master_candidate = True
+    is_master = False
+
+    self._AddNewNodeToTestData(
+        new_node_name, new_node_uuid, new_node_key,
+        is_potential_master_candidate, is_master_candidate,
+        is_master)
+    self._ssh_file_manager.SetMaxRetries(new_node_name, 4)
+
+    self.assertRaises(
+        errors.SshUpdateError, backend.AddNodeSshKey, new_node_uuid,
+        new_node_name, self._potential_master_candidates, self._ssh_port_map,
+        to_authorized_keys=is_master_candidate,
+        to_public_keys=is_potential_master_candidate,
+        get_public_keys=is_potential_master_candidate,
+        pub_key_file=self._pub_key_file,
+        ssconf_store=self._ssconf_mock,
+        noded_cert_file=self.noded_cert_file,
+        run_cmd_fn=self._run_cmd_mock)
+
+    for node in self._all_nodes:
+      if node == new_node_name:
+        self.assertTrue(self._ssh_file_manager.NodeHasAuthorizedKey(
+          node, new_node_key))
+      else:
+        self.assertFalse(self._ssh_file_manager.NodeHasAuthorizedKey(
+          node, new_node_key))
+
+    self.assertTrue(self._ssh_file_manager.NoNodeHasPublicKey(
+        new_node_uuid, new_node_key))
+
+
 class TestVerifySshSetup(testutils.GanetiTestCase):
 
   _NODE1_UUID = "uuid1"
diff --git a/test/py/testutils_ssh.py b/test/py/testutils_ssh.py
index ff7db63..a180144 100644
--- a/test/py/testutils_ssh.py
+++ b/test/py/testutils_ssh.py
@@ -32,6 +32,7 @@
 
 from ganeti import constants
 from ganeti import pathutils
+from ganeti import errors
 
 
 class FakeSshFileManager(object):
@@ -66,6 +67,13 @@ class FakeSshFileManager(object):
     self._public_keys = {} # dict of dicts
     # Node name of the master node
     self._master_node_name = None
+    # Dictionary mapping nodes by name to number of retries where 'RunCommand'
+    # succeeds. For example if set to '3', RunCommand will fail two times when
+    # called for this node before it succeeds in the 3rd retry.
+    self._max_retries = {}
+    # Dictionary mapping nodes by name to number of retries which
+    # 'RunCommand' has already carried out.
+    self._retries = {}
 
   def _SetMasterNodeName(self):
     self._master_node_name = [name for name, (_, _, _, _, master)
@@ -124,6 +132,17 @@ class FakeSshFileManager(object):
       self._FillAuthorizedKeyOfOneNode(node)
     self._SetMasterNodeName()
 
+  def SetMaxRetries(self, node_name, retries):
+    """Set the number of unsuccessful retries of 'RunCommand' per node.
+
+    @type node_name: string
+    @param node_name: name of the node
+    @type retries: integer
+    @param retries: number of unsuccessful retries
+
+    """
+    self._max_retries[node_name] = retries
+
   def GetSshPortMap(self, port):
     """Creates a SSH port map with all nodes mapped to the given port.
 
@@ -281,7 +300,8 @@ class FakeSshFileManager(object):
     for name in self._all_node_data.keys():
       node_pub_keys = self._public_keys[name]
       if (uuid, key) in node_pub_keys.items():
-        return False
+        raise Exception("Node '%s' does have public key '%s' of node '%s'"
+                        % (name, key, uuid))
     return True
 
   def PotentialMasterCandidatesOnlyHavePublicKey(self, query_node_name):
@@ -326,6 +346,14 @@ class FakeSshFileManager(object):
     of SSH keys. No actual key files of any node is touched.
 
     """
+    if node in self._max_retries:
+      if node not in self._retries:
+        self._retries[node] = 0
+      self._retries[node] += 1
+      if self._retries[node] < self._max_retries[node]:
+        raise errors.OpExecError("(Fake) SSH connection to node '%s' failed."
+                                 % node)
+
     assert base_cmd == pathutils.SSH_UPDATE
 
     if constants.SSHS_SSH_AUTHORIZED_KEYS in data:
-- 
2.2.0.rc0.207.ga3a616c

Reply via email to