Re: [PATCH stable-2.13 8/8] AddNodeSshKey: retry when target node not reachable

'Petr Pudlak' via ganeti-devel Tue, 21 Apr 2015 06:17:32 -0700

On Thu, Apr 16, 2015 at 05:32:00PM +0200, 'Helga Velroyen' via ganeti-devel 
wrote:

So far, if a SSH key of a node is added to the cluster
and updating the SSH setup of the node itself fails,
the entire operation fails quite disgraceful. This
patch introduces a retry mechanism and a proper cleanup
in case of a failure.


Note that there are more places where retries would
be a good idea. They'll be added in other patches.

Signed-off-by: Helga Velroyen <[email protected]>
---
lib/backend.py                     | 28 +++++++++++++---
test/py/ganeti.backend_unittest.py | 67 ++++++++++++++++++++++++++++++++++++++
test/py/testutils_ssh.py           | 30 ++++++++++++++++-
3 files changed, 120 insertions(+), 5 deletions(-)

diff --git a/lib/backend.py b/lib/backend.py
index bc1a570..818f3fb 100644
--- a/lib/backend.py
+++ b/lib/backend.py
@@ -1520,6 +1520,9 @@ def AddNodeSshKey(node_uuid, node_name,
  _InitSshUpdateData(base_data, noded_cert_file, ssconf_store)
  cluster_name = base_data[constants.SSHS_CLUSTER_NAME]

+  # number of retries for SSH connections
+  max_retries = 3

Perhaps it'd be useful to have the number as some global constant (and alsoreference the constant in the tests).

+
  # Update the target node itself
  if get_public_keys:
    node_data = {}
@@ -1527,10 +1530,27 @@ def AddNodeSshKey(node_uuid, node_name,
    all_keys = ssh.QueryPubKeyFile(None, key_file=pub_key_file)
    node_data[constants.SSHS_SSH_PUBLIC_KEYS] = \
      (constants.SSHS_OVERRIDE, all_keys)
-    run_cmd_fn(cluster_name, node_name, pathutils.SSH_UPDATE,
-               ssh_port_map.get(node_name), node_data,
-               debug=False, verbose=False, use_cluster_key=False,
-               ask_key=False, strict_host_check=False)
+
+    last_exception = None
+    for _ in range(max_retries):
+      try:
+        run_cmd_fn(cluster_name, node_name, pathutils.SSH_UPDATE,
+                   ssh_port_map.get(node_name), node_data,
+                   debug=False, verbose=False, use_cluster_key=False,
+                   ask_key=False, strict_host_check=False)
+        break
+      except errors.OpExecError as e:
+        logging.error("Updating SSH key files of node '%s' failed."
+                      " Error: %s", node_name, e)
+        last_exception = e
+    else:
+      if last_exception:
+        # Clean up the master's public key file if adding key fails
+        if to_public_keys:
+          ssh.RemovePublicKey(node_uuid)
+        raise errors.SshUpdateError(
+          "Could not update the SSH setup of node '%s' itself. Error: %s."
+          % (node_name, last_exception))

Would there be somoe benefit from using utils.retry.Retry? The difference isthat Retry is based on timeout instead of the number of invocations (whichcan make tests a bit problematic under some circumstances), and that afailure should return RetryAgain - it could just wrap the OpExecError.

If not, it might make sense to have another utility function for this, witha similar interface.


  # Update all nodes except master and the target node
  if to_authorized_keys:
diff --git a/test/py/ganeti.backend_unittest.py 
b/test/py/ganeti.backend_unittest.py
index 5305c15..cc65501 100755
--- a/test/py/ganeti.backend_unittest.py
+++ b/test/py/ganeti.backend_unittest.py
@@ -1400,6 +1400,73 @@ class 
TestAddRemoveGenerateNodeSshKey(testutils.GanetiTestCase):
        self.assertTrue(self._ssh_file_manager.NodeHasAuthorizedKey(
            node, node_key))

+  def testAddKeySuccessfullyWithRetries(self):
+    new_node_name = "new_node_name"
+    new_node_uuid = "new_node_uuid"
+    new_node_key = "new_node_key"
+    is_master_candidate = True
+    is_potential_master_candidate = True
+    is_master = False
+
+    self._AddNewNodeToTestData(
+        new_node_name, new_node_uuid, new_node_key,
+        is_potential_master_candidate, is_master_candidate,
+        is_master)
+    self._ssh_file_manager.SetMaxRetries(new_node_name, 3)
+
+    backend.AddNodeSshKey(new_node_uuid, new_node_name,
+                          self._potential_master_candidates,
+                          self._ssh_port_map,
+                          to_authorized_keys=is_master_candidate,
+                          to_public_keys=is_potential_master_candidate,
+                          get_public_keys=is_potential_master_candidate,
+                          pub_key_file=self._pub_key_file,
+                          ssconf_store=self._ssconf_mock,
+                          noded_cert_file=self.noded_cert_file,
+                          run_cmd_fn=self._run_cmd_mock)
+
+    self._ssh_file_manager.PotentialMasterCandidatesOnlyHavePublicKey(
+        new_node_name)
+    self.assertTrue(self._ssh_file_manager.AllNodesHaveAuthorizedKey(
+        new_node_key))
+
+  def testAddKeyFailedWithRetries(self):
+    new_node_name = "new_node_name"
+    new_node_uuid = "new_node_uuid"
+    new_node_key = "new_node_key"
+    is_master_candidate = True
+    is_potential_master_candidate = True
+    is_master = False
+
+    self._AddNewNodeToTestData(
+        new_node_name, new_node_uuid, new_node_key,
+        is_potential_master_candidate, is_master_candidate,
+        is_master)
+    self._ssh_file_manager.SetMaxRetries(new_node_name, 4)
+
+    self.assertRaises(
+        errors.SshUpdateError, backend.AddNodeSshKey, new_node_uuid,
+        new_node_name, self._potential_master_candidates, self._ssh_port_map,
+        to_authorized_keys=is_master_candidate,
+        to_public_keys=is_potential_master_candidate,
+        get_public_keys=is_potential_master_candidate,
+        pub_key_file=self._pub_key_file,
+        ssconf_store=self._ssconf_mock,
+        noded_cert_file=self.noded_cert_file,
+        run_cmd_fn=self._run_cmd_mock)
+
+    for node in self._all_nodes:
+      if node == new_node_name:
+        self.assertTrue(self._ssh_file_manager.NodeHasAuthorizedKey(
+          node, new_node_key))
+      else:
+        self.assertFalse(self._ssh_file_manager.NodeHasAuthorizedKey(
+          node, new_node_key))

It might be useful to add a helper (probably in a later patch?) to the sshfile manager for checking that all nodes (and only those) in a given sethave a given authorized key - a similar loop is already used in a few otherplaces.

+
+    self.assertTrue(self._ssh_file_manager.NoNodeHasPublicKey(
+        new_node_uuid, new_node_key))
+
+
class TestVerifySshSetup(testutils.GanetiTestCase):

  _NODE1_UUID = "uuid1"
diff --git a/test/py/testutils_ssh.py b/test/py/testutils_ssh.py
index a65c44e..b7b4e82 100644
--- a/test/py/testutils_ssh.py
+++ b/test/py/testutils_ssh.py
@@ -32,6 +32,7 @@

from ganeti import constants
from ganeti import pathutils
+from ganeti import errors


class FakeSshFileManager(object):
@@ -66,6 +67,13 @@ class FakeSshFileManager(object):
    self._public_keys = {} # dict of dicts
    # Node name of the master node
    self._master_node_name = None
+    # Dictionary mapping nodes by name to number of retries where 'RunCommand'
+    # succeeds. For example if set to '3', RunCommand will fail two times when
+    # called for this node before it succeeds in the 3rd retry.
+    self._max_retries = {}
+    # Dictionary mapping nodes by name to number of retries which
+    # 'RunCommand' has already carried out.
+    self._retries = {}

Maybe only one dictionary would suffice. When setting a retry, we'd setself._retries[node] to a given number and let an operation succeed only ifthe number is positive (decrementing otherwise). Or do we need to keep themax. number of retries while resetting the counter at some point?


  def _SetMasterNodeName(self):
    self._master_node_name = [name for name, (_, _, _, _, master)
@@ -138,6 +146,17 @@ class FakeSshFileManager(object):
      self._FillAuthorizedKeyOfOneNode(node)
    self._SetMasterNodeName()

+  def SetMaxRetries(self, node_name, retries):
+    """Set the number of unsuccessful retries of 'RunCommand' per node.
+
+    @type node_name: string
+    @param node_name: name of the node
+    @type retries: integer
+    @param retries: number of unsuccessful retries
+
+    """
+    self._max_retries[node_name] = retries
+
  def GetSshPortMap(self, port):
    """Creates a SSH port map with all nodes mapped to the given port.

@@ -297,7 +316,8 @@ class FakeSshFileManager(object):
    for name in self._all_node_data.keys():
      node_pub_keys = self._public_keys[name]
      if (uuid, key) in node_pub_keys.items():
-        return False
+        raise Exception("Node '%s' does have public key '%s' of node '%s'"
+                        % (name, key, uuid))

This change is somewhat confusing, and seems to me unrelated to the patch.The method used to return True/False, but after that change it's either Trueor an exception. So I'd either be in favor of True/False or None/exception.And currently all the tests in the backend_unittest use assertTrue/False onthe result.

    return True

  def PotentialMasterCandidatesOnlyHavePublicKey(self, query_node_name):
@@ -342,6 +362,14 @@ class FakeSshFileManager(object):
    of SSH keys. No actual key files of any node is touched.

    """
+    if node in self._max_retries:
+      if node not in self._retries:
+        self._retries[node] = 0
+      self._retries[node] += 1
+      if self._retries[node] < self._max_retries[node]:
+        raise errors.OpExecError("(Fake) SSH connection to node '%s' failed."
+                                 % node)
+
    assert base_cmd == pathutils.SSH_UPDATE

    if constants.SSHS_SSH_AUTHORIZED_KEYS in data:
--
2.2.0.rc0.207.ga3a616c

Re: [PATCH stable-2.13 8/8] AddNodeSshKey: retry when target node not reachable

Reply via email to