With that as an explanation, LGTM.

On Wed, Apr 22, 2015 at 4:37 PM, Helga Velroyen <[email protected]> wrote:

>
>
> On Wed, 22 Apr 2015 at 16:33 Hrvoje Ribicic <[email protected]> wrote:
>
>> On Fri, Apr 17, 2015 at 4:37 PM, 'Helga Velroyen' via ganeti-devel <
>> [email protected]> wrote:
>>
>>> So far, if a SSH key of a node is added to the cluster
>>> and updating the SSH setup of the node itself fails,
>>> the entire operation fails quite disgraceful. This
>>> patch introduces a retry mechanism and a proper cleanup
>>> in case of a failure.
>>>
>>> Note that there are more places where retries would
>>> be a good idea. They'll be added in other patches.
>>>
>>> Signed-off-by: Helga Velroyen <[email protected]>
>>> ---
>>>  lib/backend.py                     | 28 +++++++++++++---
>>>  test/py/ganeti.backend_unittest.py | 67
>>> ++++++++++++++++++++++++++++++++++++++
>>>  test/py/testutils_ssh.py           | 30 ++++++++++++++++-
>>>  3 files changed, 120 insertions(+), 5 deletions(-)
>>>
>>> diff --git a/lib/backend.py b/lib/backend.py
>>> index bc1a570..818f3fb 100644
>>> --- a/lib/backend.py
>>> +++ b/lib/backend.py
>>> @@ -1520,6 +1520,9 @@ def AddNodeSshKey(node_uuid, node_name,
>>>    _InitSshUpdateData(base_data, noded_cert_file, ssconf_store)
>>>    cluster_name = base_data[constants.SSHS_CLUSTER_NAME]
>>>
>>> +  # number of retries for SSH connections
>>> +  max_retries = 3
>>
>> +
>>>    # Update the target node itself
>>>    if get_public_keys:
>>>      node_data = {}
>>> @@ -1527,10 +1530,27 @@ def AddNodeSshKey(node_uuid, node_name,
>>>      all_keys = ssh.QueryPubKeyFile(None, key_file=pub_key_file)
>>>      node_data[constants.SSHS_SSH_PUBLIC_KEYS] = \
>>>        (constants.SSHS_OVERRIDE, all_keys)
>>> -    run_cmd_fn(cluster_name, node_name, pathutils.SSH_UPDATE,
>>> -               ssh_port_map.get(node_name), node_data,
>>> -               debug=False, verbose=False, use_cluster_key=False,
>>> -               ask_key=False, strict_host_check=False)
>>> +
>>> +    last_exception = None
>>> +    for _ in range(max_retries):
>>> +      try:
>>> +        run_cmd_fn(cluster_name, node_name, pathutils.SSH_UPDATE,
>>> +                   ssh_port_map.get(node_name), node_data,
>>> +                   debug=False, verbose=False, use_cluster_key=False,
>>> +                   ask_key=False, strict_host_check=False)
>>> +        break
>>> +      except errors.OpExecError as e:
>>> +        logging.error("Updating SSH key files of node '%s' failed."
>>> +                      " Error: %s", node_name, e)
>>> +        last_exception = e
>>> +    else:
>>> +      if last_exception:
>>> +        # Clean up the master's public key file if adding key fails
>>> +        if to_public_keys:
>>> +          ssh.RemovePublicKey(node_uuid)
>>> +        raise errors.SshUpdateError(
>>> +          "Could not update the SSH setup of node '%s' itself. Error:
>>> %s."
>>> +          % (node_name, last_exception))
>>>
>>>    # Update all nodes except master and the target node
>>>    if to_authorized_keys:
>>> diff --git a/test/py/ganeti.backend_unittest.py b/test/py/
>>> ganeti.backend_unittest.py
>>> index 5305c15..cc65501 100755
>>> --- a/test/py/ganeti.backend_unittest.py
>>> +++ b/test/py/ganeti.backend_unittest.py
>>> @@ -1400,6 +1400,73 @@ class
>>> TestAddRemoveGenerateNodeSshKey(testutils.GanetiTestCase):
>>>          self.assertTrue(self._ssh_file_manager.NodeHasAuthorizedKey(
>>>              node, node_key))
>>>
>>> +  def testAddKeySuccessfullyWithRetries(self):
>>> +    new_node_name = "new_node_name"
>>> +    new_node_uuid = "new_node_uuid"
>>> +    new_node_key = "new_node_key"
>>> +    is_master_candidate = True
>>> +    is_potential_master_candidate = True
>>> +    is_master = False
>>> +
>>> +    self._AddNewNodeToTestData(
>>> +        new_node_name, new_node_uuid, new_node_key,
>>> +        is_potential_master_candidate, is_master_candidate,
>>> +        is_master)
>>> +    self._ssh_file_manager.SetMaxRetries(new_node_name, 3)
>>> +
>>> +    backend.AddNodeSshKey(new_node_uuid, new_node_name,
>>> +                          self._potential_master_candidates,
>>> +                          self._ssh_port_map,
>>> +                          to_authorized_keys=is_master_candidate,
>>> +                          to_public_keys=is_potential_master_candidate,
>>> +                          get_public_keys=is_potential_master_candidate,
>>> +                          pub_key_file=self._pub_key_file,
>>> +                          ssconf_store=self._ssconf_mock,
>>> +                          noded_cert_file=self.noded_cert_file,
>>> +                          run_cmd_fn=self._run_cmd_mock)
>>> +
>>> +    self._ssh_file_manager.PotentialMasterCandidatesOnlyHavePublicKey(
>>> +        new_node_name)
>>>
>>
>> This looks like it needs an assertTrue.
>>
>
> Actually, PotentialMasterCandidatesOnlyHavePublicKey is more an assertion
> function than a boolean one, because it throws an exception instead of
> returning false. I fixed that inconsistency for all occurrences in a later
> patch.
>
>
>>
>>
>>> +    self.assertTrue(self._ssh_file_manager.AllNodesHaveAuthorizedKey(
>>> +        new_node_key))
>>> +
>>> +  def testAddKeyFailedWithRetries(self):
>>> +    new_node_name = "new_node_name"
>>> +    new_node_uuid = "new_node_uuid"
>>> +    new_node_key = "new_node_key"
>>> +    is_master_candidate = True
>>> +    is_potential_master_candidate = True
>>> +    is_master = False
>>> +
>>> +    self._AddNewNodeToTestData(
>>> +        new_node_name, new_node_uuid, new_node_key,
>>> +        is_potential_master_candidate, is_master_candidate,
>>> +        is_master)
>>> +    self._ssh_file_manager.SetMaxRetries(new_node_name, 4)
>>> +
>>> +    self.assertRaises(
>>> +        errors.SshUpdateError, backend.AddNodeSshKey, new_node_uuid,
>>> +        new_node_name, self._potential_master_candidates,
>>> self._ssh_port_map,
>>> +        to_authorized_keys=is_master_candidate,
>>> +        to_public_keys=is_potential_master_candidate,
>>> +        get_public_keys=is_potential_master_candidate,
>>> +        pub_key_file=self._pub_key_file,
>>> +        ssconf_store=self._ssconf_mock,
>>> +        noded_cert_file=self.noded_cert_file,
>>> +        run_cmd_fn=self._run_cmd_mock)
>>> +
>>> +    for node in self._all_nodes:
>>> +      if node == new_node_name:
>>> +        self.assertTrue(self._ssh_file_manager.NodeHasAuthorizedKey(
>>> +          node, new_node_key))
>>> +      else:
>>> +        self.assertFalse(self._ssh_file_manager.NodeHasAuthorizedKey(
>>> +          node, new_node_key))
>>> +
>>> +    self.assertTrue(self._ssh_file_manager.NoNodeHasPublicKey(
>>> +        new_node_uuid, new_node_key))
>>> +
>>> +
>>>  class TestVerifySshSetup(testutils.GanetiTestCase):
>>>
>>>    _NODE1_UUID = "uuid1"
>>> diff --git a/test/py/testutils_ssh.py b/test/py/testutils_ssh.py
>>> index ff7db63..a180144 100644
>>> --- a/test/py/testutils_ssh.py
>>> +++ b/test/py/testutils_ssh.py
>>> @@ -32,6 +32,7 @@
>>>
>>>  from ganeti import constants
>>>  from ganeti import pathutils
>>> +from ganeti import errors
>>>
>>>
>>>  class FakeSshFileManager(object):
>>> @@ -66,6 +67,13 @@ class FakeSshFileManager(object):
>>>      self._public_keys = {} # dict of dicts
>>>      # Node name of the master node
>>>      self._master_node_name = None
>>> +    # Dictionary mapping nodes by name to number of retries where
>>> 'RunCommand'
>>> +    # succeeds. For example if set to '3', RunCommand will fail two
>>> times when
>>> +    # called for this node before it succeeds in the 3rd retry.
>>> +    self._max_retries = {}
>>> +    # Dictionary mapping nodes by name to number of retries which
>>> +    # 'RunCommand' has already carried out.
>>> +    self._retries = {}
>>>
>>>    def _SetMasterNodeName(self):
>>>      self._master_node_name = [name for name, (_, _, _, _, master)
>>> @@ -124,6 +132,17 @@ class FakeSshFileManager(object):
>>>        self._FillAuthorizedKeyOfOneNode(node)
>>>      self._SetMasterNodeName()
>>>
>>> +  def SetMaxRetries(self, node_name, retries):
>>> +    """Set the number of unsuccessful retries of 'RunCommand' per node.
>>> +
>>> +    @type node_name: string
>>> +    @param node_name: name of the node
>>> +    @type retries: integer
>>> +    @param retries: number of unsuccessful retries
>>> +
>>> +    """
>>> +    self._max_retries[node_name] = retries
>>> +
>>>    def GetSshPortMap(self, port):
>>>      """Creates a SSH port map with all nodes mapped to the given port.
>>>
>>> @@ -281,7 +300,8 @@ class FakeSshFileManager(object):
>>>      for name in self._all_node_data.keys():
>>>        node_pub_keys = self._public_keys[name]
>>>        if (uuid, key) in node_pub_keys.items():
>>> -        return False
>>> +        raise Exception("Node '%s' does have public key '%s' of node
>>> '%s'"
>>> +                        % (name, key, uuid))
>>>      return True
>>>
>>>    def PotentialMasterCandidatesOnlyHavePublicKey(self, query_node_name):
>>> @@ -326,6 +346,14 @@ class FakeSshFileManager(object):
>>>      of SSH keys. No actual key files of any node is touched.
>>>
>>>      """
>>> +    if node in self._max_retries:
>>> +      if node not in self._retries:
>>> +        self._retries[node] = 0
>>> +      self._retries[node] += 1
>>> +      if self._retries[node] < self._max_retries[node]:
>>> +        raise errors.OpExecError("(Fake) SSH connection to node '%s'
>>> failed."
>>> +                                 % node)
>>> +
>>>      assert base_cmd == pathutils.SSH_UPDATE
>>>
>>>      if constants.SSHS_SSH_AUTHORIZED_KEYS in data:
>>> --
>>> 2.2.0.rc0.207.ga3a616c
>>>
>>>
>> Hrvoje Ribicic
>> Ganeti Engineering
>> Google Germany GmbH
>> Dienerstr. 12, 80331, München
>>
>> Registergericht und -nummer: Hamburg, HRB 86891
>> Sitz der Gesellschaft: Hamburg
>> Geschäftsführer: Graham Law, Christine Elizabeth Flores
>> Steuernummer: 48/725/00206
>> Umsatzsteueridentifikationsnummer: DE813741370
>>
>
Hrvoje Ribicic
Ganeti Engineering
Google Germany GmbH
Dienerstr. 12, 80331, München

Registergericht und -nummer: Hamburg, HRB 86891
Sitz der Gesellschaft: Hamburg
Geschäftsführer: Graham Law, Christine Elizabeth Flores
Steuernummer: 48/725/00206
Umsatzsteueridentifikationsnummer: DE813741370

Reply via email to