On Tue, 21 Apr 2015 at 17:23 Petr Pudlak <[email protected]> wrote:

> On Tue, Apr 21, 2015 at 02:47:26PM +0000, Helga Velroyen wrote:
> >Hi!
> >
> >On Tue, 21 Apr 2015 at 15:16 Petr Pudlak <[email protected]> wrote:
> >
> >> On Thu, Apr 16, 2015 at 05:32:00PM +0200, 'Helga Velroyen' via
> >> ganeti-devel wrote:
> >> >So far, if a SSH key of a node is added to the cluster
> >> >and updating the SSH setup of the node itself fails,
> >> >the entire operation fails quite disgraceful. This
> >> >patch introduces a retry mechanism and a proper cleanup
> >> >in case of a failure.
> >> >
> >> >Note that there are more places where retries would
> >> >be a good idea. They'll be added in other patches.
> >> >
> >> >Signed-off-by: Helga Velroyen <[email protected]>
> >> >---
> >> > lib/backend.py                     | 28 +++++++++++++---
> >> > test/py/ganeti.backend_unittest.py | 67
> >> ++++++++++++++++++++++++++++++++++++++
> >> > test/py/testutils_ssh.py           | 30 ++++++++++++++++-
> >> > 3 files changed, 120 insertions(+), 5 deletions(-)
> >> >
> >> >diff --git a/lib/backend.py b/lib/backend.py
> >> >index bc1a570..818f3fb 100644
> >> >--- a/lib/backend.py
> >> >+++ b/lib/backend.py
> >> >@@ -1520,6 +1520,9 @@ def AddNodeSshKey(node_uuid, node_name,
> >> >   _InitSshUpdateData(base_data, noded_cert_file, ssconf_store)
> >> >   cluster_name = base_data[constants.SSHS_CLUSTER_NAME]
> >> >
> >> >+  # number of retries for SSH connections
> >> >+  max_retries = 3
> >>
> >> Perhaps it'd be useful to have the number as some global constant (and
> also
> >> reference the constant in the tests).
> >>
> >
> >True, thought about this as well. See interdiff below. Als note that this
> >results in some interdiffs in the other 7 patches of that series, I will
> >send them as reply to those respective patches.
> >
> >
> >>
> >> >+
> >> >   # Update the target node itself
> >> >   if get_public_keys:
> >> >     node_data = {}
> >> >@@ -1527,10 +1530,27 @@ def AddNodeSshKey(node_uuid, node_name,
> >> >     all_keys = ssh.QueryPubKeyFile(None, key_file=pub_key_file)
> >> >     node_data[constants.SSHS_SSH_PUBLIC_KEYS] = \
> >> >       (constants.SSHS_OVERRIDE, all_keys)
> >> >-    run_cmd_fn(cluster_name, node_name, pathutils.SSH_UPDATE,
> >> >-               ssh_port_map.get(node_name), node_data,
> >> >-               debug=False, verbose=False, use_cluster_key=False,
> >> >-               ask_key=False, strict_host_check=False)
> >> >+
> >> >+    last_exception = None
> >> >+    for _ in range(max_retries):
> >> >+      try:
> >> >+        run_cmd_fn(cluster_name, node_name, pathutils.SSH_UPDATE,
> >> >+                   ssh_port_map.get(node_name), node_data,
> >> >+                   debug=False, verbose=False, use_cluster_key=False,
> >> >+                   ask_key=False, strict_host_check=False)
> >> >+        break
> >> >+      except errors.OpExecError as e:
> >> >+        logging.error("Updating SSH key files of node '%s' failed."
> >> >+                      " Error: %s", node_name, e)
> >> >+        last_exception = e
> >> >+    else:
> >> >+      if last_exception:
> >> >+        # Clean up the master's public key file if adding key fails
> >> >+        if to_public_keys:
> >> >+          ssh.RemovePublicKey(node_uuid)
> >> >+        raise errors.SshUpdateError(
> >> >+          "Could not update the SSH setup of node '%s' itself. Error:
> >> %s."
> >> >+          % (node_name, last_exception))
> >>
> >> Would there be somoe benefit from using utils.retry.Retry? The
> difference
> >> is
> >> that Retry is based on timeout instead of the number of invocations
> (which
> >> can make tests a bit problematic under some circumstances), and that a
> >> failure should return RetryAgain - it could just wrap the OpExecError.
> >>
> >> If not, it might make sense to have another utility function for this,
> with
> >> a similar interface.
> >>
> >
> >I would not like to use utils.retry.Retry as it is based on a timeout and
> >that makes stuff (including the unit tests) awfully slow.
> >
> >I agree that having an extra utility function might be a good idea. Would
> >you mind if I add it as a separate patch at the end of the series? I
> >already have used this in the other 7 patches of this series, and I'd
> >rather not merge that in now.
>
> I agree on both, the usecase is somewhat different, and it'll be better to
> add the utility function as another patch.
>
> >
> >
> >>
> >> >
> >> >   # Update all nodes except master and the target node
> >> >   if to_authorized_keys:
> >> >diff --git a/test/py/ganeti.backend_unittest.py b/test/py/
> >> ganeti.backend_unittest.py
> >> >index 5305c15..cc65501 100755
> >> >--- a/test/py/ganeti.backend_unittest.py
> >> >+++ b/test/py/ganeti.backend_unittest.py
> >> >@@ -1400,6 +1400,73 @@ class
> >> TestAddRemoveGenerateNodeSshKey(testutils.GanetiTestCase):
> >> >         self.assertTrue(self._ssh_file_manager.NodeHasAuthorizedKey(
> >> >             node, node_key))
> >> >
> >> >+  def testAddKeySuccessfullyWithRetries(self):
> >> >+    new_node_name = "new_node_name"
> >> >+    new_node_uuid = "new_node_uuid"
> >> >+    new_node_key = "new_node_key"
> >> >+    is_master_candidate = True
> >> >+    is_potential_master_candidate = True
> >> >+    is_master = False
> >> >+
> >> >+    self._AddNewNodeToTestData(
> >> >+        new_node_name, new_node_uuid, new_node_key,
> >> >+        is_potential_master_candidate, is_master_candidate,
> >> >+        is_master)
> >> >+    self._ssh_file_manager.SetMaxRetries(new_node_name, 3)
> >> >+
> >> >+    backend.AddNodeSshKey(new_node_uuid, new_node_name,
> >> >+                          self._potential_master_candidates,
> >> >+                          self._ssh_port_map,
> >> >+                          to_authorized_keys=is_master_candidate,
> >> >+
> to_public_keys=is_potential_master_candidate,
> >> >+
> get_public_keys=is_potential_master_candidate,
> >> >+                          pub_key_file=self._pub_key_file,
> >> >+                          ssconf_store=self._ssconf_mock,
> >> >+                          noded_cert_file=self.noded_cert_file,
> >> >+                          run_cmd_fn=self._run_cmd_mock)
> >> >+
> >> >+    self._ssh_file_manager.PotentialMasterCandidatesOnlyHavePublicKey(
> >> >+        new_node_name)
> >> >+    self.assertTrue(self._ssh_file_manager.AllNodesHaveAuthorizedKey(
> >> >+        new_node_key))
> >> >+
> >> >+  def testAddKeyFailedWithRetries(self):
> >> >+    new_node_name = "new_node_name"
> >> >+    new_node_uuid = "new_node_uuid"
> >> >+    new_node_key = "new_node_key"
> >> >+    is_master_candidate = True
> >> >+    is_potential_master_candidate = True
> >> >+    is_master = False
> >> >+
> >> >+    self._AddNewNodeToTestData(
> >> >+        new_node_name, new_node_uuid, new_node_key,
> >> >+        is_potential_master_candidate, is_master_candidate,
> >> >+        is_master)
> >> >+    self._ssh_file_manager.SetMaxRetries(new_node_name, 4)
> >> >+
> >> >+    self.assertRaises(
> >> >+        errors.SshUpdateError, backend.AddNodeSshKey, new_node_uuid,
> >> >+        new_node_name, self._potential_master_candidates,
> >> self._ssh_port_map,
> >> >+        to_authorized_keys=is_master_candidate,
> >> >+        to_public_keys=is_potential_master_candidate,
> >> >+        get_public_keys=is_potential_master_candidate,
> >> >+        pub_key_file=self._pub_key_file,
> >> >+        ssconf_store=self._ssconf_mock,
> >> >+        noded_cert_file=self.noded_cert_file,
> >> >+        run_cmd_fn=self._run_cmd_mock)
> >> >+
> >> >+    for node in self._all_nodes:
> >> >+      if node == new_node_name:
> >> >+        self.assertTrue(self._ssh_file_manager.NodeHasAuthorizedKey(
> >> >+          node, new_node_key))
> >> >+      else:
> >> >+        self.assertFalse(self._ssh_file_manager.NodeHasAuthorizedKey(
> >> >+          node, new_node_key))
> >>
> >> It might be useful to add a helper (probably in a later patch?) to the
> ssh
> >> file manager for checking that all nodes (and only those) in a given set
> >> have a given authorized key - a similar loop is already used in a few
> other
> >> places.
> >>
> >
> >Good point. As you suggest, I'll add that in an additional patch.
>
> ACK
>
> >
> >
> >>
> >> >+
> >> >+    self.assertTrue(self._ssh_file_manager.NoNodeHasPublicKey(
> >> >+        new_node_uuid, new_node_key))
> >> >+
> >> >+
> >> > class TestVerifySshSetup(testutils.GanetiTestCase):
> >> >
> >> >   _NODE1_UUID = "uuid1"
> >> >diff --git a/test/py/testutils_ssh.py b/test/py/testutils_ssh.py
> >> >index a65c44e..b7b4e82 100644
> >> >--- a/test/py/testutils_ssh.py
> >> >+++ b/test/py/testutils_ssh.py
> >> >@@ -32,6 +32,7 @@
> >> >
> >> > from ganeti import constants
> >> > from ganeti import pathutils
> >> >+from ganeti import errors
> >> >
> >> >
> >> > class FakeSshFileManager(object):
> >> >@@ -66,6 +67,13 @@ class FakeSshFileManager(object):
> >> >     self._public_keys = {} # dict of dicts
> >> >     # Node name of the master node
> >> >     self._master_node_name = None
> >> >+    # Dictionary mapping nodes by name to number of retries where
> >> 'RunCommand'
> >> >+    # succeeds. For example if set to '3', RunCommand will fail two
> >> times when
> >> >+    # called for this node before it succeeds in the 3rd retry.
> >> >+    self._max_retries = {}
> >> >+    # Dictionary mapping nodes by name to number of retries which
> >> >+    # 'RunCommand' has already carried out.
> >> >+    self._retries = {}
> >>
> >> Maybe only one dictionary would suffice. When setting a retry, we'd set
> >> self._retries[node] to a given number and let an operation succeed only
> if
> >> the number is positive (decrementing otherwise). Or do we need to keep
> the
> >> max. number of retries while resetting the counter at some point?
> >>
> >
> >While I understand that your desire here is to get rid of one variable,
> I'd
> >rather not do that. The reason is that this would count down instead of up
> >like it is done in the implementation that this class is emulating. For
> the
> >sanity of any developer who has to work with this in the future, I'd
> rather
> >keep those similar to not add any additional confusion. Would that be okay
> >for you?
>
> OK, thanks for the explanation, let's leave it as it is.
>
> >
> >
> >>
> >> >
> >> >   def _SetMasterNodeName(self):
> >> >     self._master_node_name = [name for name, (_, _, _, _, master)
> >> >@@ -138,6 +146,17 @@ class FakeSshFileManager(object):
> >> >       self._FillAuthorizedKeyOfOneNode(node)
> >> >     self._SetMasterNodeName()
> >> >
> >> >+  def SetMaxRetries(self, node_name, retries):
> >> >+    """Set the number of unsuccessful retries of 'RunCommand' per
> node.
> >> >+
> >> >+    @type node_name: string
> >> >+    @param node_name: name of the node
> >> >+    @type retries: integer
> >> >+    @param retries: number of unsuccessful retries
> >> >+
> >> >+    """
> >> >+    self._max_retries[node_name] = retries
> >> >+
> >> >   def GetSshPortMap(self, port):
> >> >     """Creates a SSH port map with all nodes mapped to the given port.
> >> >
> >> >@@ -297,7 +316,8 @@ class FakeSshFileManager(object):
> >> >     for name in self._all_node_data.keys():
> >> >       node_pub_keys = self._public_keys[name]
> >> >       if (uuid, key) in node_pub_keys.items():
> >> >-        return False
> >> >+        raise Exception("Node '%s' does have public key '%s' of node
> >> '%s'"
> >> >+                        % (name, key, uuid))
> >>
> >> This change is somewhat confusing, and seems to me unrelated to the
> patch.
> >> The method used to return True/False, but after that change it's either
> >> True
> >> or an exception. So I'd either be in favor of True/False or
> None/exception.
> >> And currently all the tests in the backend_unittest use
> assertTrue/False on
> >> the result.
> >>
> >
> >I agree. I realized for debugging any problems, I not only need to know
> >that some node is not configured correctly, but which node. Therefore I
> >started throwing some exceptions to add additional information. I realized
> >later that this is kinda inconsistent. I fixed this for all occurrences in
> >patch:
> >
> >[PATCH stable-2.13 14/15] SSH file manager: properly name assertion
> function
> >
> >Therefore, can you let this slip here as it is not breaking anything and
> >just a little bending of semantics?
>
> OK, if it's fixed later in the series, that's fine.
>
> >
> >
> >>
> >> >     return True
> >> >
> >> >   def PotentialMasterCandidatesOnlyHavePublicKey(self,
> query_node_name):
> >> >@@ -342,6 +362,14 @@ class FakeSshFileManager(object):
> >> >     of SSH keys. No actual key files of any node is touched.
> >> >
> >> >     """
> >> >+    if node in self._max_retries:
> >> >+      if node not in self._retries:
> >> >+        self._retries[node] = 0
> >> >+      self._retries[node] += 1
> >> >+      if self._retries[node] < self._max_retries[node]:
> >> >+        raise errors.OpExecError("(Fake) SSH connection to node '%s'
> >> failed."
> >> >+                                 % node)
> >> >+
> >> >     assert base_cmd == pathutils.SSH_UPDATE
> >> >
> >> >     if constants.SSHS_SSH_AUTHORIZED_KEYS in data:
> >> >--
> >> >2.2.0.rc0.207.ga3a616c
> >> >
> >>
> >
> >Interdiff for the max_retries constant:
> >
> > diff --git a/lib/backend.py b/lib/backend.py
> >index 818f3fb..fd30bd8 100644
> >--- a/lib/backend.py
> >+++ b/lib/backend.py
> >@@ -1520,10 +1520,8 @@ def AddNodeSshKey(node_uuid, node_name,
> >   _InitSshUpdateData(base_data, noded_cert_file, ssconf_store)
> >   cluster_name = base_data[constants.SSHS_CLUSTER_NAME]
> >
> >-  # number of retries for SSH connections
> >-  max_retries = 3
> >-
> >   # Update the target node itself
> >+  logging.debug("Updating SSH key files of target node '%s'.", node_name)
> >   if get_public_keys:
> >     node_data = {}
> >     _InitSshUpdateData(node_data, noded_cert_file, ssconf_store)
> >@@ -1532,7 +1530,7 @@ def AddNodeSshKey(node_uuid, node_name,
> >       (constants.SSHS_OVERRIDE, all_keys)
> >
> >     last_exception = None
> >-    for _ in range(max_retries):
> >+    for _ in range(constants.SSHS_MAX_RETRIES):
> >       try:
> >         run_cmd_fn(cluster_name, node_name, pathutils.SSH_UPDATE,
> >                    ssh_port_map.get(node_name), node_data,
> >diff --git a/src/Ganeti/Constants.hs b/src/Ganeti/Constants.hs
> >index 01bf435..eddadf3 100644
> >--- a/src/Ganeti/Constants.hs
> >+++ b/src/Ganeti/Constants.hs
> >@@ -4563,6 +4563,9 @@ sshsSshPublicKeys = "public_keys"
> > sshsNodeDaemonCertificate :: String
> > sshsNodeDaemonCertificate = "node_daemon_certificate"
> >
> >+sshsMaxRetries :: Integer
> >+sshsMaxRetries = 3
>
> Just nitpicking, the name doesn't make it clear to which SSH communication
> the number applies. So perhaps name it something like
> renewCryptoSShMaxRetries? Perhaps also add a comment to the constant.
>

Actually, I wanted to use this constants for all operations that update the
SSH setup, hence the SSHS (SSH Setup) constants. Technically it is only
used when adding and removing keys and only indirectly when renewing all
keys. So unless there will be a reason to use different numbers of retries
I would go with this one to keep it simple.

I will, however, add a comment to make this more clear.

Interdiff to the interdiff:

diff --git a/src/Ganeti/Constants.hs b/src/Ganeti/Constants.hs
index eddadf3..e158471 100644
--- a/src/Ganeti/Constants.hs
+++ b/src/Ganeti/Constants.hs
@@ -4563,6 +4563,8 @@ sshsSshPublicKeys = "public_keys"
 sshsNodeDaemonCertificate :: String
 sshsNodeDaemonCertificate = "node_daemon_certificate"

+-- Number of maximum retries when contacting nodes per SSH
+-- during SSH update operations.
 sshsMaxRetries :: Integer
 sshsMaxRetries = 3




> No need to resend.
>
> >+
> > sshsAdd :: String
> > sshsAdd = "add"
> >
> >diff --git a/test/py/ganeti.backend_unittest.py b/test/py/
> >ganeti.backend_unittest.py
> >index cc65501..288ed15 100755
> >--- a/test/py/ganeti.backend_unittest.py
> >+++ b/test/py/ganeti.backend_unittest.py
> >@@ -1412,7 +1412,8 @@ class
> >TestAddRemoveGenerateNodeSshKey(testutils.GanetiTestCase):
> >         new_node_name, new_node_uuid, new_node_key,
> >         is_potential_master_candidate, is_master_candidate,
> >         is_master)
> >-    self._ssh_file_manager.SetMaxRetries(new_node_name, 3)
> >+    self._ssh_file_manager.SetMaxRetries(
> >+        new_node_name, constants.SSHS_MAX_RETRIES)
> >
> >     backend.AddNodeSshKey(new_node_uuid, new_node_name,
> >                           self._potential_master_candidates,
> >@@ -1442,7 +1443,8 @@ class
> >TestAddRemoveGenerateNodeSshKey(testutils.GanetiTestCase):
> >         new_node_name, new_node_uuid, new_node_key,
> >         is_potential_master_candidate, is_master_candidate,
> >         is_master)
> >-    self._ssh_file_manager.SetMaxRetries(new_node_name, 4)
> >+    self._ssh_file_manager.SetMaxRetries(
> >+        new_node_name, constants.SSHS_MAX_RETRIES + 1)
> >
> >     self.assertRaises(
> >         errors.SshUpdateError, backend.AddNodeSshKey, new_node_uuid,
>
> the interdiff LGTM, thanks
>

Thanks,
Helga

Reply via email to