Hi!

On Tue, Apr 21, 2015, 5:50 PM Petr Pudlak <[email protected]> wrote:

> On Tue, Apr 21, 2015 at 02:48:18PM +0000, 'Helga Velroyen' via
> ganeti-devel wrote:
> >Interdiff for the max-retry constant, introduced in patch 8.
> >
> >diff --git a/lib/backend.py b/lib/backend.py
> >index 5f4c99b..5a09efa 100644
> >--- a/lib/backend.py
> >+++ b/lib/backend.py
> >@@ -1574,7 +1574,7 @@ def AddNodeSshKey(node_uuid, node_name,
> >       continue
> >     if node in potential_master_candidates:
> >       logging.debug("Updating SSH key files of node '%s'.")
> >-      for i in range(max_retries):
> >+      for i in range(constants.SSHS_MAX_RETRIES):
> >         try:
> >           run_cmd_fn(cluster_name, node, pathutils.SSH_UPDATE,
> >                      ssh_port_map.get(node), pot_mc_data,
> >@@ -1590,7 +1590,8 @@ def AddNodeSshKey(node_uuid, node_name,
> >           error_msg = ("When adding the key of node '%s', updating SSH
> key"
> >                        " files of node '%s' failed after %s retries."
> >                        " Not trying again. Last errors was: %s." %
> >-                       (node, node_name, max_retries, last_exception))
> >+                       (node, node_name, constants.SSHS_MAX_RETRIES,
> >+                        last_exception))
> >           node_errors[node] = error_msg
> >           # We only log the error and don't throw an exception, because
> >           # one unreachable node shall not abort the entire procedure.
> >diff --git a/test/py/ganeti.backend_unittest.py b/test/py/
> >ganeti.backend_unittest.py
> >index 021972c..ec61f07 100755
> >--- a/test/py/ganeti.backend_unittest.py
> >+++ b/test/py/ganeti.backend_unittest.py
> >@@ -1499,7 +1499,8 @@ class
> >TestAddRemoveGenerateNodeSshKey(testutils.GanetiTestCase):
> >
> >     other_node_name, _, _, _, _, _ = self._ssh_file_manager \
> >         .GetAllMasterCandidates()[0]
> >-    self._ssh_file_manager.SetMaxRetries(other_node_name, 3)
> >+    self._ssh_file_manager.SetMaxRetries(
> >+        other_node_name, constants.SSHS_MAX_RETRIES)
> >     assert other_node_name != new_node_name
> >     self._AddNewNodeToTestData(
> >         new_node_name, new_node_uuid, new_node_key,
> >@@ -1538,7 +1539,8 @@ class
> >TestAddRemoveGenerateNodeSshKey(testutils.GanetiTestCase):
> >
> >     other_node_name, _, _, _, _, _ = self._ssh_file_manager \
> >         .GetAllMasterCandidates()[0]
> >-    self._ssh_file_manager.SetMaxRetries(other_node_name, 4)
> >+    self._ssh_file_manager.SetMaxRetries(
> >+        other_node_name, constants.SSHS_MAX_RETRIES + 1)
> >     assert other_node_name != new_node_name
> >     self._AddNewNodeToTestData(
> >         new_node_name, new_node_uuid, new_node_key,
> >
> >
> >On Fri, 17 Apr 2015 at 16:37 Helga Velroyen <[email protected]> wrote:
> >
> >> So far, if a SSH key of a node is added to the cluster
> >> and updating the SSH setup of the node itself fails,
> >> the entire operation fails quite disgraceful. This
> >> patch introduces a retry mechanism and a proper cleanup
> >> in case of a failure.
> >>
> >> Note that there are more places where retries would
> >> be a good idea. They'll be added in other patches.
> >>
> >> Signed-off-by: Helga Velroyen <[email protected]>
> >> ---
> >>  lib/backend.py                     | 28 +++++++++++++---
> >>  test/py/ganeti.backend_unittest.py | 67
> >> ++++++++++++++++++++++++++++++++++++++
> >>  test/py/testutils_ssh.py           | 30 ++++++++++++++++-
> >>  3 files changed, 120 insertions(+), 5 deletions(-)
> >>
> >> diff --git a/lib/backend.py b/lib/backend.py
> >> index bc1a570..818f3fb 100644
> >> --- a/lib/backend.py
> >> +++ b/lib/backend.py
> >> @@ -1520,6 +1520,9 @@ def AddNodeSshKey(node_uuid, node_name,
> >>    _InitSshUpdateData(base_data, noded_cert_file, ssconf_store)
> >>    cluster_name = base_data[constants.SSHS_CLUSTER_NAME]
> >>
> >> +  # number of retries for SSH connections
> >> +  max_retries = 3
> >> +
> >>    # Update the target node itself
> >>    if get_public_keys:
> >>      node_data = {}
> >> @@ -1527,10 +1530,27 @@ def AddNodeSshKey(node_uuid, node_name,
> >>      all_keys = ssh.QueryPubKeyFile(None, key_file=pub_key_file)
> >>      node_data[constants.SSHS_SSH_PUBLIC_KEYS] = \
> >>        (constants.SSHS_OVERRIDE, all_keys)
> >> -    run_cmd_fn(cluster_name, node_name, pathutils.SSH_UPDATE,
> >> -               ssh_port_map.get(node_name), node_data,
> >> -               debug=False, verbose=False, use_cluster_key=False,
> >> -               ask_key=False, strict_host_check=False)
> >> +
> >> +    last_exception = None
> >> +    for _ in range(max_retries):
> >> +      try:
> >> +        run_cmd_fn(cluster_name, node_name, pathutils.SSH_UPDATE,
> >> +                   ssh_port_map.get(node_name), node_data,
> >> +                   debug=False, verbose=False, use_cluster_key=False,
> >> +                   ask_key=False, strict_host_check=False)
> >> +        break
> >> +      except errors.OpExecError as e:
> >> +        logging.error("Updating SSH key files of node '%s' failed."
> >> +                      " Error: %s", node_name, e)
> >> +        last_exception = e
> >> +    else:
> >> +      if last_exception:
> >> +        # Clean up the master's public key file if adding key fails
> >> +        if to_public_keys:
> >> +          ssh.RemovePublicKey(node_uuid)
> >> +        raise errors.SshUpdateError(
> >> +          "Could not update the SSH setup of node '%s' itself. Error:
> %s."
> >> +          % (node_name, last_exception))
> >>
> >>    # Update all nodes except master and the target node
> >>    if to_authorized_keys:
> >> diff --git a/test/py/ganeti.backend_unittest.py b/test/py/
> >> ganeti.backend_unittest.py
> >> index 5305c15..cc65501 100755
> >> --- a/test/py/ganeti.backend_unittest.py
> >> +++ b/test/py/ganeti.backend_unittest.py
> >> @@ -1400,6 +1400,73 @@ class
> >> TestAddRemoveGenerateNodeSshKey(testutils.GanetiTestCase):
> >>          self.assertTrue(self._ssh_file_manager.NodeHasAuthorizedKey(
> >>              node, node_key))
> >>
> >> +  def testAddKeySuccessfullyWithRetries(self):
> >> +    new_node_name = "new_node_name"
> >> +    new_node_uuid = "new_node_uuid"
> >> +    new_node_key = "new_node_key"
> >> +    is_master_candidate = True
> >> +    is_potential_master_candidate = True
> >> +    is_master = False
> >> +
> >> +    self._AddNewNodeToTestData(
> >> +        new_node_name, new_node_uuid, new_node_key,
> >> +        is_potential_master_candidate, is_master_candidate,
> >> +        is_master)
> >> +    self._ssh_file_manager.SetMaxRetries(new_node_name, 3)
> >> +
> >> +    backend.AddNodeSshKey(new_node_uuid, new_node_name,
> >> +                          self._potential_master_candidates,
> >> +                          self._ssh_port_map,
> >> +                          to_authorized_keys=is_master_candidate,
> >> +                          to_public_keys=is_potential_master_candidate,
> >> +
> get_public_keys=is_potential_master_candidate,
> >> +                          pub_key_file=self._pub_key_file,
> >> +                          ssconf_store=self._ssconf_mock,
> >> +                          noded_cert_file=self.noded_cert_file,
> >> +                          run_cmd_fn=self._run_cmd_mock)
> >> +
> >> +    self._ssh_file_manager.PotentialMasterCandidatesOnlyHavePublicKey(
> >> +        new_node_name)
> >> +    self.assertTrue(self._ssh_file_manager.AllNodesHaveAuthorizedKey(
> >> +        new_node_key))
> >> +
> >> +  def testAddKeyFailedWithRetries(self):
> >> +    new_node_name = "new_node_name"
> >> +    new_node_uuid = "new_node_uuid"
> >> +    new_node_key = "new_node_key"
> >> +    is_master_candidate = True
> >> +    is_potential_master_candidate = True
> >> +    is_master = False
> >> +
> >> +    self._AddNewNodeToTestData(
> >> +        new_node_name, new_node_uuid, new_node_key,
> >> +        is_potential_master_candidate, is_master_candidate,
> >> +        is_master)
> >> +    self._ssh_file_manager.SetMaxRetries(new_node_name, 4)
> >> +
> >> +    self.assertRaises(
> >> +        errors.SshUpdateError, backend.AddNodeSshKey, new_node_uuid,
> >> +        new_node_name, self._potential_master_candidates,
> >> self._ssh_port_map,
> >> +        to_authorized_keys=is_master_candidate,
> >> +        to_public_keys=is_potential_master_candidate,
> >> +        get_public_keys=is_potential_master_candidate,
> >> +        pub_key_file=self._pub_key_file,
> >> +        ssconf_store=self._ssconf_mock,
> >> +        noded_cert_file=self.noded_cert_file,
> >> +        run_cmd_fn=self._run_cmd_mock)
> >> +
> >> +    for node in self._all_nodes:
> >> +      if node == new_node_name:
> >> +        self.assertTrue(self._ssh_file_manager.NodeHasAuthorizedKey(
> >> +          node, new_node_key))
> >> +      else:
> >> +        self.assertFalse(self._ssh_file_manager.NodeHasAuthorizedKey(
> >> +          node, new_node_key))
> >> +
> >> +    self.assertTrue(self._ssh_file_manager.NoNodeHasPublicKey(
> >> +        new_node_uuid, new_node_key))
> >> +
> >> +
> >>  class TestVerifySshSetup(testutils.GanetiTestCase):
> >>
> >>    _NODE1_UUID = "uuid1"
> >> diff --git a/test/py/testutils_ssh.py b/test/py/testutils_ssh.py
> >> index ff7db63..a180144 100644
> >> --- a/test/py/testutils_ssh.py
> >> +++ b/test/py/testutils_ssh.py
> >> @@ -32,6 +32,7 @@
> >>
> >>  from ganeti import constants
> >>  from ganeti import pathutils
> >> +from ganeti import errors
> >>
> >>
> >>  class FakeSshFileManager(object):
> >> @@ -66,6 +67,13 @@ class FakeSshFileManager(object):
> >>      self._public_keys = {} # dict of dicts
> >>      # Node name of the master node
> >>      self._master_node_name = None
> >> +    # Dictionary mapping nodes by name to number of retries where
> >> 'RunCommand'
> >> +    # succeeds. For example if set to '3', RunCommand will fail two
> times
> >> when
> >> +    # called for this node before it succeeds in the 3rd retry.
> >> +    self._max_retries = {}
> >> +    # Dictionary mapping nodes by name to number of retries which
> >> +    # 'RunCommand' has already carried out.
> >> +    self._retries = {}
> >>
> >>    def _SetMasterNodeName(self):
> >>      self._master_node_name = [name for name, (_, _, _, _, master)
> >> @@ -124,6 +132,17 @@ class FakeSshFileManager(object):
> >>        self._FillAuthorizedKeyOfOneNode(node)
> >>      self._SetMasterNodeName()
> >>
> >> +  def SetMaxRetries(self, node_name, retries):
> >> +    """Set the number of unsuccessful retries of 'RunCommand' per node.
> >> +
> >> +    @type node_name: string
> >> +    @param node_name: name of the node
> >> +    @type retries: integer
> >> +    @param retries: number of unsuccessful retries
> >> +
> >> +    """
> >> +    self._max_retries[node_name] = retries
> >> +
> >>    def GetSshPortMap(self, port):
> >>      """Creates a SSH port map with all nodes mapped to the given port.
> >>
> >> @@ -281,7 +300,8 @@ class FakeSshFileManager(object):
> >>      for name in self._all_node_data.keys():
> >>        node_pub_keys = self._public_keys[name]
> >>        if (uuid, key) in node_pub_keys.items():
> >> -        return False
> >> +        raise Exception("Node '%s' does have public key '%s' of node
> '%s'"
> >> +                        % (name, key, uuid))
> >>      return True
> >>
> >>    def PotentialMasterCandidatesOnlyHavePublicKey(self,
> query_node_name):
> >> @@ -326,6 +346,14 @@ class FakeSshFileManager(object):
> >>      of SSH keys. No actual key files of any node is touched.
> >>
> >>      """
> >> +    if node in self._max_retries:
> >> +      if node not in self._retries:
> >> +        self._retries[node] = 0
> >> +      self._retries[node] += 1
> >> +      if self._retries[node] < self._max_retries[node]:
> >> +        raise errors.OpExecError("(Fake) SSH connection to node '%s'
> >> failed."
> >> +                                 % node)
> >> +
> >>      assert base_cmd == pathutils.SSH_UPDATE
> >>
> >>      if constants.SSHS_SSH_AUTHORIZED_KEYS in data:
> >> --
> >> 2.2.0.rc0.207.ga3a616c
> >>
> >>
>
> I'm a bit confused, isn't this patch the same as number 8? Perhaps it just
> got to the second series by mistake.
>

You might think that, but no! AddNodeSshKey has two phases, in one all
nodes but the new node is updated and in a second one, the new node itself
is updated (as it for example needs to get all keys of the other nodes).
Thus the nodes in the two phases receive different data and thus different
things can happen.

I added retries to both situations in two different patches because I
thought smaller patches are easier to review. However, the fact that this
looks too similar to the other one is another reason to add a retry utility
function, which I will do, as promised, in a separate patch.

Btw. the same situation applies to RemoveNodeSshKey, there are also two
phases in the code which I updated with retries in two patches (later in
this series).

Cheers,
Helga

Reply via email to