Interdiff for the max-retry constants introduced in patch 8:

diff --git a/lib/backend.py b/lib/backend.py
index 84c0ebc..1231592 100644
--- a/lib/backend.py
+++ b/lib/backend.py
@@ -1661,8 +1661,6 @@ def RemoveNodeSshKey(node_uuid, node_name,
   """
   # Non-disruptive error messages, mapped from node name to message
   result_msgs = {}
-  # maximum number of retries for SSH connections
-  max_retries = 3

   # Make sure at least one of these flags is true.
   if not (from_authorized_keys or from_public_keys or clear_authorized_keys
@@ -1740,7 +1738,7 @@ def RemoveNodeSshKey(node_uuid, node_name,
                              " SSH key files of node '%s' failed after %s"
                              " retries. Not trying again. Last error was:
%s.")
           last_exception = None
-          for i in range(max_retries):
+          for i in range(constants.SSHS_MAX_RETRIES):
             try:
               run_cmd_fn(cluster_name, node, pathutils.SSH_UPDATE,
                          ssh_port, pot_mc_data,
@@ -1752,15 +1750,15 @@ def RemoveNodeSshKey(node_uuid, node_name,
               last_exception = e
           else:
             if last_exception:
-              error_msg = error_msg_final % (node_name, node, max_retries,
-                                             last_exception)
+              error_msg = error_msg_final % (
+                  node_name, node, constants.SSHS_MAX_RETRIES,
last_exception)
               result_msgs[node] = error_msg
               logging.error(error_msg)

         else:
           last_exception = None
           if from_authorized_keys:
-            for i in range(max_retries):
+            for i in range(constants.SSHS_MAX_RETRIES):
               try:
                 run_cmd_fn(cluster_name, node, pathutils.SSH_UPDATE,
                            ssh_port, base_data,
@@ -1772,8 +1770,8 @@ def RemoveNodeSshKey(node_uuid, node_name,
                 last_exception = e
           else:
             if last_exception:
-              error_msg = error_msg_final % (node_name, node, max_retries,
-                                             last_exception)
+              error_msg = error_msg_final % (
+                  node_name, node, constants.SSHS_MAX_RETRIES,
last_exception)
               result_msgs[node] = error_msg
               logging.error(error_msg)

diff --git a/test/py/ganeti.backend_unittest.py b/test/py/
ganeti.backend_unittest.py
index 49e3f3c..d09e842 100755
--- a/test/py/ganeti.backend_unittest.py
+++ b/test/py/ganeti.backend_unittest.py
@@ -1581,7 +1581,8 @@ class
TestAddRemoveGenerateNodeSshKey(testutils.GanetiTestCase):
     assert node_name != self._master_node
     assert other_node_name != self._master_node
     assert node_name != other_node_name
-    self._ssh_file_manager.SetMaxRetries(other_node_name, 3)
+    self._ssh_file_manager.SetMaxRetries(
+        other_node_name, constants.SSHS_MAX_RETRIES)

     backend.RemoveNodeSshKey(node_uuid, node_name,
                              self._master_candidate_uuids,
@@ -1614,7 +1615,8 @@ class
TestAddRemoveGenerateNodeSshKey(testutils.GanetiTestCase):
     assert node_name != self._master_node
     assert other_node_name != self._master_node
     assert node_name != other_node_name
-    self._ssh_file_manager.SetMaxRetries(other_node_name, 4)
+    self._ssh_file_manager.SetMaxRetries(
+        other_node_name, constants.SSHS_MAX_RETRIES + 1)

     error_msgs = backend.RemoveNodeSshKey(
         node_uuid, node_name, self._master_candidate_uuids,


On Fri, 17 Apr 2015 at 16:37 Helga Velroyen <[email protected]> wrote:

> This patch introduces retries for updating non-master nodes
> when a new SSH key is added to the cluster. If an SSH update
> fails even after the maximum number of retries is exhausted,
> this does not raise an exception, but an error message is
> added to the result of the method, which can be displayed
> in the LU operations so that the admin sees which nodes
> were the problem and need further attention.
>
> Signed-off-by: Helga Velroyen <[email protected]>
> ---
>  lib/backend.py                     |  73 +++++++++++++++++++--------
>  lib/cmdlib/cluster.py              |   9 +++-
>  lib/cmdlib/node.py                 |  14 ++++-
>  test/py/ganeti.backend_unittest.py | 101
> ++++++++++++++++++++++++++++++++++++-
>  4 files changed, 169 insertions(+), 28 deletions(-)
>
> diff --git a/lib/backend.py b/lib/backend.py
> index 818f3fb..3b17306 100644
> --- a/lib/backend.py
> +++ b/lib/backend.py
> @@ -1566,17 +1566,38 @@ def AddNodeSshKey(node_uuid, node_name,
>    master_node = ssconf_store.GetMasterNode()
>    online_nodes = ssconf_store.GetOnlineNodeList()
>
> +  node_errors = {}
>    for node in all_nodes:
>      if node == master_node:
> +      logging.debug("Skipping master node '%s'.", master_node)
>        continue
>      if node not in online_nodes:
>        logging.debug("Skipping offline node '%s'.", node)
>        continue
>      if node in potential_master_candidates:
> -      run_cmd_fn(cluster_name, node, pathutils.SSH_UPDATE,
> -                 ssh_port_map.get(node), pot_mc_data,
> -                 debug=False, verbose=False, use_cluster_key=False,
> -                 ask_key=False, strict_host_check=False)
> +      logging.debug("Updating SSH key files of node '%s'.")
> +      for i in range(max_retries):
> +        try:
> +          run_cmd_fn(cluster_name, node, pathutils.SSH_UPDATE,
> +                     ssh_port_map.get(node), pot_mc_data,
> +                     debug=False, verbose=False, use_cluster_key=False,
> +                     ask_key=False, strict_host_check=False)
> +          break
> +        except errors.OpExecError as e:
> +          logging.error("Updating SSH key files of node '%s' failed"
> +                        " at try no %s. Error: %s.", node, i, e)
> +          last_exception = e
> +      else:
> +        if last_exception:
> +          error_msg = ("When adding the key of node '%s', updating SSH
> key"
> +                       " files of node '%s' failed after %s retries."
> +                       " Not trying again. Last error was: %s." %
> +                       (node, node_name, max_retries, last_exception))
> +          node_errors[node] = error_msg
> +          # We only log the error and don't throw an exception, because
> +          # one unreachable node shall not abort the entire procedure.
> +          logging.error(error_msg)
> +
>      else:
>        if to_authorized_keys:
>          run_cmd_fn(cluster_name, node, pathutils.SSH_UPDATE,
> @@ -1584,6 +1605,8 @@ def AddNodeSshKey(node_uuid, node_name,
>                     debug=False, verbose=False, use_cluster_key=False,
>                     ask_key=False, strict_host_check=False)
>
> +  return node_errors
> +
>
>  def RemoveNodeSshKey(node_uuid, node_name,
>                       master_candidate_uuids,
> @@ -1925,6 +1948,10 @@ def RenewSshKeys(node_uuids, node_names,
> ssh_port_map,
>
>    master_node_name = ssconf_store.GetMasterNode()
>    master_node_uuid = _GetMasterNodeUUID(node_uuid_name_map,
> master_node_name)
> +  # List of all node errors that happened, but which did not abort the
> +  # procedure as a whole. It is important that this is a list to have a
> +  # somewhat chronological history of events.
> +  all_node_errors = []
>
>    # process non-master nodes
>    for node_uuid, node_name in node_uuid_name_map:
> @@ -1989,15 +2016,16 @@ def RenewSshKeys(node_uuids, node_names,
> ssh_port_map,
>        ssh.AddPublicKey(node_uuid, pub_key, key_file=pub_key_file)
>
>      logging.debug("Add ssh key of node '%s'.", node_name)
> -    AddNodeSshKey(node_uuid, node_name,
> -                  potential_master_candidates,
> -                  ssh_port_map,
> -                  to_authorized_keys=master_candidate,
> -                  to_public_keys=potential_master_candidate,
> -                  get_public_keys=True,
> -                  pub_key_file=pub_key_file, ssconf_store=ssconf_store,
> -                  noded_cert_file=noded_cert_file,
> -                  run_cmd_fn=run_cmd_fn)
> +    node_errors = AddNodeSshKey(
> +        node_uuid, node_name, potential_master_candidates,
> +        ssh_port_map, to_authorized_keys=master_candidate,
> +        to_public_keys=potential_master_candidate,
> +        get_public_keys=True,
> +        pub_key_file=pub_key_file, ssconf_store=ssconf_store,
> +        noded_cert_file=noded_cert_file,
> +        run_cmd_fn=run_cmd_fn)
> +    if node_errors:
> +      all_node_errors = all_node_errors + node_errors.items()
>
>    # Renewing the master node's key
>
> @@ -2022,15 +2050,14 @@ def RenewSshKeys(node_uuids, node_names,
> ssh_port_map,
>
>    # Add new master key to all node's public and authorized keys
>    logging.debug("Add new master key to all nodes.")
> -  AddNodeSshKey(master_node_uuid, master_node_name,
> -                potential_master_candidates,
> -                ssh_port_map,
> -                to_authorized_keys=True,
> -                to_public_keys=True,
> -                get_public_keys=False,
> -                pub_key_file=pub_key_file, ssconf_store=ssconf_store,
> -                noded_cert_file=noded_cert_file,
> -                run_cmd_fn=run_cmd_fn)
> +  node_errors = AddNodeSshKey(
> +      master_node_uuid, master_node_name, potential_master_candidates,
> +      ssh_port_map, to_authorized_keys=True, to_public_keys=True,
> +      get_public_keys=False, pub_key_file=pub_key_file,
> +      ssconf_store=ssconf_store, noded_cert_file=noded_cert_file,
> +      run_cmd_fn=run_cmd_fn)
> +  if node_errors:
> +    all_node_errors = all_node_errors + node_errors.items()
>
>    # Remove the old key file and rename the new key to the non-temporary
> filename
>    _ReplaceMasterKeyOnMaster(root_keyfiles)
> @@ -2053,6 +2080,8 @@ def RenewSshKeys(node_uuids, node_names,
> ssh_port_map,
>                     clear_authorized_keys=False,
>                     clear_public_keys=False)
>
> +  return all_node_errors
> +
>
>  def GetBlockDevSizes(devices):
>    """Return the size of the given block devices
> diff --git a/lib/cmdlib/cluster.py b/lib/cmdlib/cluster.py
> index c504d60..14c18da 100644
> --- a/lib/cmdlib/cluster.py
> +++ b/lib/cmdlib/cluster.py
> @@ -210,7 +210,7 @@ class LUClusterRenewCrypto(NoHooksLU):
>      self.cfg.RemoveNodeFromCandidateCerts("%s-SERVER" % master_uuid)
>      self.cfg.RemoveNodeFromCandidateCerts("%s-OLDMASTER" % master_uuid)
>
> -  def _RenewSshKeys(self):
> +  def _RenewSshKeys(self, feedback_fn):
>      """Renew all nodes' SSH keys.
>
>      """
> @@ -230,6 +230,11 @@ class LUClusterRenewCrypto(NoHooksLU):
>        master_candidate_uuids,
>        potential_master_candidates)
>      result[master_uuid].Raise("Could not renew the SSH keys of all nodes")
> +    node_errors = result[master_uuid].payload
> +    if node_errors:
> +      feedback_fn("Some nodes' SSH key files could not be updated:")
> +      for node_name, error_msg in node_errors:
> +        feedback_fn("%s: %s" % (node_name, error_msg))
>
>    def Exec(self, feedback_fn):
>      if self.op.node_certificates:
> @@ -237,7 +242,7 @@ class LUClusterRenewCrypto(NoHooksLU):
>        self._RenewNodeSslCertificates(feedback_fn)
>      if self.op.ssh_keys and not self._ssh_renewal_suppressed:
>        feedback_fn("Renewing SSH keys")
> -      self._RenewSshKeys()
> +      self._RenewSshKeys(feedback_fn)
>      elif self._ssh_renewal_suppressed:
>        feedback_fn("Cannot renew SSH keys if the cluster is configured to
> not"
>                    " modify the SSH setup.")
> diff --git a/lib/cmdlib/node.py b/lib/cmdlib/node.py
> index 19b630a..6a13d8a 100644
> --- a/lib/cmdlib/node.py
> +++ b/lib/cmdlib/node.py
> @@ -339,7 +339,7 @@ class LUNodeAdd(LogicalUnit):
>        result.Raise("Failed to initialize OpenVSwitch on new node")
>
>    def _SshUpdate(self, new_node_uuid, new_node_name, is_master_candidate,
> -                 is_potential_master_candidate, rpcrunner, readd):
> +                 is_potential_master_candidate, rpcrunner, readd,
> feedback_fn):
>      """Update the SSH setup of all nodes after adding a new node.
>
>      @type readd: boolean
> @@ -373,6 +373,11 @@ class LUNodeAdd(LogicalUnit):
>        is_master_candidate, is_potential_master_candidate,
>        is_potential_master_candidate)
>      result[master_node].Raise("Could not update the node's SSH setup.")
> +    node_errors = result[master_node].payload
> +    if node_errors:
> +      feedback_fn("Some nodes' SSH key files could not be updated:")
> +      for node_name, error_msg in node_errors.items():
> +        feedback_fn("%s: %s." % (node_name, error_msg))
>
>    def Exec(self, feedback_fn):
>      """Adds the new node to the cluster.
> @@ -481,7 +486,7 @@ class LUNodeAdd(LogicalUnit):
>        # FIXME: so far, all nodes are considered potential master
> candidates
>        self._SshUpdate(self.new_node.uuid, self.new_node.name,
>                        self.new_node.master_candidate, True,
> -                      self.rpc, self.op.readd)
> +                      self.rpc, self.op.readd, feedback_fn)
>
>
>  class LUNodeSetParams(LogicalUnit):
> @@ -897,6 +902,11 @@ class LUNodeSetParams(LogicalUnit):
>            ssh_result[master_node].Raise(
>              "Could not update the SSH setup of node '%s' after promotion"
>              " (UUID: %s)." % (node.name, node.uuid))
> +          node_errors = ssh_result[master_node].payload
> +          if node_errors:
> +            feedback_fn("Some nodes' SSH key files could not be updated:")
> +            for node_name, error_msg in node_errors.items():
> +              feedback_fn("%s: %s." % (node_name, error_msg))
>
>      return result
>
> diff --git a/test/py/ganeti.backend_unittest.py b/test/py/
> ganeti.backend_unittest.py
> index cc65501..e05580d 100755
> --- a/test/py/ganeti.backend_unittest.py
> +++ b/test/py/ganeti.backend_unittest.py
> @@ -1400,7 +1400,14 @@ class
> TestAddRemoveGenerateNodeSshKey(testutils.GanetiTestCase):
>          self.assertTrue(self._ssh_file_manager.NodeHasAuthorizedKey(
>              node, node_key))
>
> -  def testAddKeySuccessfullyWithRetries(self):
> +  def testAddKeySuccessfullyOnNewNodeWithRetries(self):
> +    """Tests adding a new node's key when updating that node takes
> retries.
> +
> +    This test checks whether adding a new node's key successfully updates
> +    all nodes' SSH key files, even if updating the new node's key files
> itself
> +    takes a couple of retries to succeed.
> +
> +    """
>      new_node_name = "new_node_name"
>      new_node_uuid = "new_node_uuid"
>      new_node_key = "new_node_key"
> @@ -1430,7 +1437,14 @@ class
> TestAddRemoveGenerateNodeSshKey(testutils.GanetiTestCase):
>      self.assertTrue(self._ssh_file_manager.AllNodesHaveAuthorizedKey(
>          new_node_key))
>
> -  def testAddKeyFailedWithRetries(self):
> +  def testAddKeyFailedOnNewNodeWithRetries(self):
> +    """Tests clean up if updating a new node's SSH setup fails.
> +
> +    If adding a new node's keys fails because updating the SSH key files
> +    of that new node fails, check whether already carried out operations are
> +    successfully rolled back and thus the state of the cluster is cleaned
> up.
> +
> +    """
>      new_node_name = "new_node_name"
>      new_node_uuid = "new_node_uuid"
>      new_node_key = "new_node_key"
> @@ -1466,6 +1480,89 @@ class
> TestAddRemoveGenerateNodeSshKey(testutils.GanetiTestCase):
>      self.assertTrue(self._ssh_file_manager.NoNodeHasPublicKey(
>          new_node_uuid, new_node_key))
>
> +  def testAddKeySuccessfullyOnOldNodeWithRetries(self):
> +    """Tests adding a new key even if updating nodes takes retries.
> +
> +    This tests whether adding a new node's key successfully finishes,
> +    even if one of the other cluster nodes takes a couple of retries
> +    to succeed.
> +
> +    """
> +    new_node_name = "new_node_name"
> +    new_node_uuid = "new_node_uuid"
> +    new_node_key = "new_node_key"
> +    is_master_candidate = True
> +    is_potential_master_candidate = True
> +    is_master = False
> +
> +    other_node_name, _, _, _, _, _ = self._ssh_file_manager \
> +        .GetAllMasterCandidates()[0]
> +    self._ssh_file_manager.SetMaxRetries(other_node_name, 3)
> +    assert other_node_name != new_node_name
> +    self._AddNewNodeToTestData(
> +        new_node_name, new_node_uuid, new_node_key,
> +        is_potential_master_candidate, is_master_candidate,
> +        is_master)
> +
> +    backend.AddNodeSshKey(new_node_uuid, new_node_name,
> +                          self._potential_master_candidates,
> +                          self._ssh_port_map,
> +                          to_authorized_keys=is_master_candidate,
> +                          to_public_keys=is_potential_master_candidate,
> +                          get_public_keys=is_potential_master_candidate,
> +                          pub_key_file=self._pub_key_file,
> +                          ssconf_store=self._ssconf_mock,
> +                          noded_cert_file=self.noded_cert_file,
> +                          run_cmd_fn=self._run_cmd_mock)
> +
> +    self.assertTrue(self._ssh_file_manager.AllNodesHaveAuthorizedKey(
> +        new_node_key))
> +
> +  def testAddKeyFailedOnOldNodeWithRetries(self):
> +    """Tests adding keys when updating one node's SSH setup fails.
> +
> +    This tests whether, when adding a new node's key and one node is
> +    unreachable (but not marked as offline), the operation still finishes
> +    properly and only that unreachable node's SSH key setup did not get
> +    updated.
> +
> +    """
> +    new_node_name = "new_node_name"
> +    new_node_uuid = "new_node_uuid"
> +    new_node_key = "new_node_key"
> +    is_master_candidate = True
> +    is_potential_master_candidate = True
> +    is_master = False
> +
> +    other_node_name, _, _, _, _, _ = self._ssh_file_manager \
> +        .GetAllMasterCandidates()[0]
> +    self._ssh_file_manager.SetMaxRetries(other_node_name, 4)
> +    assert other_node_name != new_node_name
> +    self._AddNewNodeToTestData(
> +        new_node_name, new_node_uuid, new_node_key,
> +        is_potential_master_candidate, is_master_candidate,
> +        is_master)
> +
> +    node_errors = backend.AddNodeSshKey(
> +        new_node_uuid, new_node_name, self._potential_master_candidates,
> +        self._ssh_port_map, to_authorized_keys=is_master_candidate,
> +        to_public_keys=is_potential_master_candidate,
> +        get_public_keys=is_potential_master_candidate,
> +        pub_key_file=self._pub_key_file,
> +        ssconf_store=self._ssconf_mock,
> +        noded_cert_file=self.noded_cert_file,
> +        run_cmd_fn=self._run_cmd_mock)
> +
> +    for node in self._all_nodes:
> +      if node == other_node_name:
> +        self.assertFalse(self._ssh_file_manager.NodeHasAuthorizedKey(
> +          node, new_node_key))
> +      else:
> +        self.assertTrue(self._ssh_file_manager.NodeHasAuthorizedKey(
> +          node, new_node_key))
> +
> +    self.assertTrue(node_errors[other_node_name])
> +
>
>  class TestVerifySshSetup(testutils.GanetiTestCase):
>
> --
> 2.2.0.rc0.207.ga3a616c
>
>

Reply via email to