Interdiff for the max-retry constant, introduced in patch 8.

diff --git a/lib/backend.py b/lib/backend.py
index 5f4c99b..5a09efa 100644
--- a/lib/backend.py
+++ b/lib/backend.py
@@ -1574,7 +1574,7 @@ def AddNodeSshKey(node_uuid, node_name,
       continue
     if node in potential_master_candidates:
       logging.debug("Updating SSH key files of node '%s'.")
-      for i in range(max_retries):
+      for i in range(constants.SSHS_MAX_RETRIES):
         try:
           run_cmd_fn(cluster_name, node, pathutils.SSH_UPDATE,
                      ssh_port_map.get(node), pot_mc_data,
@@ -1590,7 +1590,8 @@ def AddNodeSshKey(node_uuid, node_name,
           error_msg = ("When adding the key of node '%s', updating SSH key"
                        " files of node '%s' failed after %s retries."
                        " Not trying again. Last errors was: %s." %
-                       (node, node_name, max_retries, last_exception))
+                       (node, node_name, constants.SSHS_MAX_RETRIES,
+                        last_exception))
           node_errors[node] = error_msg
           # We only log the error and don't throw an exception, because
           # one unreachable node shall not abort the entire procedure.
diff --git a/test/py/ganeti.backend_unittest.py b/test/py/
ganeti.backend_unittest.py
index 021972c..ec61f07 100755
--- a/test/py/ganeti.backend_unittest.py
+++ b/test/py/ganeti.backend_unittest.py
@@ -1499,7 +1499,8 @@ class
TestAddRemoveGenerateNodeSshKey(testutils.GanetiTestCase):

     other_node_name, _, _, _, _, _ = self._ssh_file_manager \
         .GetAllMasterCandidates()[0]
-    self._ssh_file_manager.SetMaxRetries(other_node_name, 3)
+    self._ssh_file_manager.SetMaxRetries(
+        other_node_name, constants.SSHS_MAX_RETRIES)
     assert other_node_name != new_node_name
     self._AddNewNodeToTestData(
         new_node_name, new_node_uuid, new_node_key,
@@ -1538,7 +1539,8 @@ class
TestAddRemoveGenerateNodeSshKey(testutils.GanetiTestCase):

     other_node_name, _, _, _, _, _ = self._ssh_file_manager \
         .GetAllMasterCandidates()[0]
-    self._ssh_file_manager.SetMaxRetries(other_node_name, 4)
+    self._ssh_file_manager.SetMaxRetries(
+        other_node_name, constants.SSHS_MAX_RETRIES + 1)
     assert other_node_name != new_node_name
     self._AddNewNodeToTestData(
         new_node_name, new_node_uuid, new_node_key,


On Fri, 17 Apr 2015 at 16:37 Helga Velroyen <[email protected]> wrote:

> So far, if a SSH key of a node is added to the cluster
> and updating the SSH setup of the node itself fails,
> the entire operation fails quite disgraceful. This
> patch introduces a retry mechanism and a proper cleanup
> in case of a failure.
>
> Note that there are more places where retries would
> be a good idea. They'll be added in other patches.
>
> Signed-off-by: Helga Velroyen <[email protected]>
> ---
>  lib/backend.py                     | 28 +++++++++++++---
>  test/py/ganeti.backend_unittest.py | 67
> ++++++++++++++++++++++++++++++++++++++
>  test/py/testutils_ssh.py           | 30 ++++++++++++++++-
>  3 files changed, 120 insertions(+), 5 deletions(-)
>
> diff --git a/lib/backend.py b/lib/backend.py
> index bc1a570..818f3fb 100644
> --- a/lib/backend.py
> +++ b/lib/backend.py
> @@ -1520,6 +1520,9 @@ def AddNodeSshKey(node_uuid, node_name,
>    _InitSshUpdateData(base_data, noded_cert_file, ssconf_store)
>    cluster_name = base_data[constants.SSHS_CLUSTER_NAME]
>
> +  # number of retries for SSH connections
> +  max_retries = 3
> +
>    # Update the target node itself
>    if get_public_keys:
>      node_data = {}
> @@ -1527,10 +1530,27 @@ def AddNodeSshKey(node_uuid, node_name,
>      all_keys = ssh.QueryPubKeyFile(None, key_file=pub_key_file)
>      node_data[constants.SSHS_SSH_PUBLIC_KEYS] = \
>        (constants.SSHS_OVERRIDE, all_keys)
> -    run_cmd_fn(cluster_name, node_name, pathutils.SSH_UPDATE,
> -               ssh_port_map.get(node_name), node_data,
> -               debug=False, verbose=False, use_cluster_key=False,
> -               ask_key=False, strict_host_check=False)
> +
> +    last_exception = None
> +    for _ in range(max_retries):
> +      try:
> +        run_cmd_fn(cluster_name, node_name, pathutils.SSH_UPDATE,
> +                   ssh_port_map.get(node_name), node_data,
> +                   debug=False, verbose=False, use_cluster_key=False,
> +                   ask_key=False, strict_host_check=False)
> +        break
> +      except errors.OpExecError as e:
> +        logging.error("Updating SSH key files of node '%s' failed."
> +                      " Error: %s", node_name, e)
> +        last_exception = e
> +    else:
> +      if last_exception:
> +        # Clean up the master's public key file if adding key fails
> +        if to_public_keys:
> +          ssh.RemovePublicKey(node_uuid)
> +        raise errors.SshUpdateError(
> +          "Could not update the SSH setup of node '%s' itself. Error: %s."
> +          % (node_name, last_exception))
>
>    # Update all nodes except master and the target node
>    if to_authorized_keys:
> diff --git a/test/py/ganeti.backend_unittest.py b/test/py/
> ganeti.backend_unittest.py
> index 5305c15..cc65501 100755
> --- a/test/py/ganeti.backend_unittest.py
> +++ b/test/py/ganeti.backend_unittest.py
> @@ -1400,6 +1400,73 @@ class
> TestAddRemoveGenerateNodeSshKey(testutils.GanetiTestCase):
>          self.assertTrue(self._ssh_file_manager.NodeHasAuthorizedKey(
>              node, node_key))
>
> +  def testAddKeySuccessfullyWithRetries(self):
> +    new_node_name = "new_node_name"
> +    new_node_uuid = "new_node_uuid"
> +    new_node_key = "new_node_key"
> +    is_master_candidate = True
> +    is_potential_master_candidate = True
> +    is_master = False
> +
> +    self._AddNewNodeToTestData(
> +        new_node_name, new_node_uuid, new_node_key,
> +        is_potential_master_candidate, is_master_candidate,
> +        is_master)
> +    self._ssh_file_manager.SetMaxRetries(new_node_name, 3)
> +
> +    backend.AddNodeSshKey(new_node_uuid, new_node_name,
> +                          self._potential_master_candidates,
> +                          self._ssh_port_map,
> +                          to_authorized_keys=is_master_candidate,
> +                          to_public_keys=is_potential_master_candidate,
> +                          get_public_keys=is_potential_master_candidate,
> +                          pub_key_file=self._pub_key_file,
> +                          ssconf_store=self._ssconf_mock,
> +                          noded_cert_file=self.noded_cert_file,
> +                          run_cmd_fn=self._run_cmd_mock)
> +
> +    self._ssh_file_manager.PotentialMasterCandidatesOnlyHavePublicKey(
> +        new_node_name)
> +    self.assertTrue(self._ssh_file_manager.AllNodesHaveAuthorizedKey(
> +        new_node_key))
> +
> +  def testAddKeyFailedWithRetries(self):
> +    new_node_name = "new_node_name"
> +    new_node_uuid = "new_node_uuid"
> +    new_node_key = "new_node_key"
> +    is_master_candidate = True
> +    is_potential_master_candidate = True
> +    is_master = False
> +
> +    self._AddNewNodeToTestData(
> +        new_node_name, new_node_uuid, new_node_key,
> +        is_potential_master_candidate, is_master_candidate,
> +        is_master)
> +    self._ssh_file_manager.SetMaxRetries(new_node_name, 4)
> +
> +    self.assertRaises(
> +        errors.SshUpdateError, backend.AddNodeSshKey, new_node_uuid,
> +        new_node_name, self._potential_master_candidates,
> self._ssh_port_map,
> +        to_authorized_keys=is_master_candidate,
> +        to_public_keys=is_potential_master_candidate,
> +        get_public_keys=is_potential_master_candidate,
> +        pub_key_file=self._pub_key_file,
> +        ssconf_store=self._ssconf_mock,
> +        noded_cert_file=self.noded_cert_file,
> +        run_cmd_fn=self._run_cmd_mock)
> +
> +    for node in self._all_nodes:
> +      if node == new_node_name:
> +        self.assertTrue(self._ssh_file_manager.NodeHasAuthorizedKey(
> +          node, new_node_key))
> +      else:
> +        self.assertFalse(self._ssh_file_manager.NodeHasAuthorizedKey(
> +          node, new_node_key))
> +
> +    self.assertTrue(self._ssh_file_manager.NoNodeHasPublicKey(
> +        new_node_uuid, new_node_key))
> +
> +
>  class TestVerifySshSetup(testutils.GanetiTestCase):
>
>    _NODE1_UUID = "uuid1"
> diff --git a/test/py/testutils_ssh.py b/test/py/testutils_ssh.py
> index ff7db63..a180144 100644
> --- a/test/py/testutils_ssh.py
> +++ b/test/py/testutils_ssh.py
> @@ -32,6 +32,7 @@
>
>  from ganeti import constants
>  from ganeti import pathutils
> +from ganeti import errors
>
>
>  class FakeSshFileManager(object):
> @@ -66,6 +67,13 @@ class FakeSshFileManager(object):
>      self._public_keys = {} # dict of dicts
>      # Node name of the master node
>      self._master_node_name = None
> +    # Dictionary mapping nodes by name to number of retries where
> 'RunCommand'
> +    # succeeds. For example if set to '3', RunCommand will fail two times
> when
> +    # called for this node before it succeeds in the 3rd retry.
> +    self._max_retries = {}
> +    # Dictionary mapping nodes by name to number of retries which
> +    # 'RunCommand' has already carried out.
> +    self._retries = {}
>
>    def _SetMasterNodeName(self):
>      self._master_node_name = [name for name, (_, _, _, _, master)
> @@ -124,6 +132,17 @@ class FakeSshFileManager(object):
>        self._FillAuthorizedKeyOfOneNode(node)
>      self._SetMasterNodeName()
>
> +  def SetMaxRetries(self, node_name, retries):
> +    """Set the number of unsuccessful retries of 'RunCommand' per node.
> +
> +    @type node_name: string
> +    @param node_name: name of the node
> +    @type retries: integer
> +    @param retries: number of unsuccessful retries
> +
> +    """
> +    self._max_retries[node_name] = retries
> +
>    def GetSshPortMap(self, port):
>      """Creates a SSH port map with all nodes mapped to the given port.
>
> @@ -281,7 +300,8 @@ class FakeSshFileManager(object):
>      for name in self._all_node_data.keys():
>        node_pub_keys = self._public_keys[name]
>        if (uuid, key) in node_pub_keys.items():
> -        return False
> +        raise Exception("Node '%s' does have public key '%s' of node '%s'"
> +                        % (name, key, uuid))
>      return True
>
>    def PotentialMasterCandidatesOnlyHavePublicKey(self, query_node_name):
> @@ -326,6 +346,14 @@ class FakeSshFileManager(object):
>      of SSH keys. No actual key files of any node is touched.
>
>      """
> +    if node in self._max_retries:
> +      if node not in self._retries:
> +        self._retries[node] = 0
> +      self._retries[node] += 1
> +      if self._retries[node] < self._max_retries[node]:
> +        raise errors.OpExecError("(Fake) SSH connection to node '%s'
> failed."
> +                                 % node)
> +
>      assert base_cmd == pathutils.SSH_UPDATE
>
>      if constants.SSHS_SSH_AUTHORIZED_KEYS in data:
> --
> 2.2.0.rc0.207.ga3a616c
>
>

Reply via email to