LGTM, thanks

On Thu, Apr 23, 2015 at 12:56 PM, Helga Velroyen <[email protected]> wrote:

> Hi!
>
> On Thu, 23 Apr 2015 at 11:29 Hrvoje Ribicic <[email protected]> wrote:
>
>> On Fri, Apr 17, 2015 at 4:37 PM, 'Helga Velroyen' via ganeti-devel <
>> [email protected]> wrote:
>>
>>> When removing an SSH key, RemoveNodeSshKey so far did only
>>> try to contact each node once. This made the entire
>>> procedure not very resilient towards temporary network
>>> failures. This patch adds a couple of retries for contacting
>>> each node and only gives up after those are exhausted.
>>>
>>> Signed-off-by: Helga Velroyen <[email protected]>
>>> ---
>>>  lib/backend.py                     | 101
>>> ++++++++++++++++++++++++-------------
>>>  lib/cmdlib/node.py                 |  21 ++++++--
>>>  test/py/ganeti.backend_unittest.py |  65 ++++++++++++++++++++++++
>>>  3 files changed, 148 insertions(+), 39 deletions(-)
>>>
>>> diff --git a/lib/backend.py b/lib/backend.py
>>> index 8ec22d9..c3190d2 100644
>>> --- a/lib/backend.py
>>> +++ b/lib/backend.py
>>> @@ -1660,12 +1660,15 @@ def RemoveNodeSshKey(node_uuid, node_name,
>>>    @returns: list of feedback messages
>>>
>>>    """
>>> -  result_msgs = []
>>> +  # Non-disruptive error messages, mapped from node name to message
>>> +  result_msgs = {}
>>> +  # maximum number of retries for SSH connections
>>> +  max_retries = 3
>>>
>>>    # Make sure at least one of these flags is true.
>>>    if not (from_authorized_keys or from_public_keys or
>>> clear_authorized_keys
>>>            or clear_public_keys):
>>> -    result_msgs.append("No removal from any key file was requested.")
>>> +    raise errors.SshUpdateError("No removal from any key file was
>>> requested.")
>>>
>>>    if not ssconf_store:
>>>      ssconf_store = ssconf.SimpleStore()
>>> @@ -1717,8 +1720,11 @@ def RemoveNodeSshKey(node_uuid, node_name,
>>>
>>>        all_nodes = ssconf_store.GetNodeList()
>>>        online_nodes = ssconf_store.GetOnlineNodeList()
>>> +      logging.debug("Removing key of node '%s' from all nodes but
>>> itself and"
>>> +                    " master.", node_name)
>>>        for node in all_nodes:
>>>          if node == master_node:
>>> +          logging.debug("Skipping master node '%s'.", master_node)
>>>            continue
>>>          if node not in online_nodes:
>>>            logging.debug("Skipping offline node '%s'.", node)
>>> @@ -1729,24 +1735,48 @@ def RemoveNodeSshKey(node_uuid, node_name,
>>>                                     " node '%s', map: %s." %
>>>                                     (node, ssh_port_map))
>>>          if node in potential_master_candidates:
>>> -          try:
>>> -            run_cmd_fn(cluster_name, node, pathutils.SSH_UPDATE,
>>> -                       ssh_port, pot_mc_data,
>>> -                       debug=False, verbose=False,
>>> use_cluster_key=False,
>>> -                       ask_key=False, strict_host_check=False)
>>> -          except errors.OpExecError as e:
>>> -            result_msgs.append("Warning: the SSH setup of node '%s'
>>> could not"
>>> -                               " be adjusted." % node)
>>> -        else:
>>> -          if from_authorized_keys:
>>> +          error_msg_try = ("The SSH setup of node '%s' could not"
>>> +                           " be adjusted in try no %s. Error: %s.")
>>> +          error_msg_final = ("When removing the key of node '%s',
>>> updating the"
>>> +                             " SSH key files of node '%s' failed after
>>> %s"
>>> +                             " retries. Not trying again. Last error
>>> was: %s.")
>>>
>>
>> Looking at the code, it seems this has to go before the if as it is used
>> in the else as well.
>>
>
> Ack.
>
>
>>
>>
>>> +          last_exception = None
>>> +          for i in range(max_retries):
>>>              try:
>>>                run_cmd_fn(cluster_name, node, pathutils.SSH_UPDATE,
>>> -                         ssh_port, base_data,
>>> +                         ssh_port, pot_mc_data,
>>>                           debug=False, verbose=False,
>>> use_cluster_key=False,
>>>                           ask_key=False, strict_host_check=False)
>>> +              break
>>>              except errors.OpExecError as e:
>>> -              result_msgs.append("Warning: the SSH setup of node '%s'
>>> could"
>>> -                                 " not be adjusted." % node)
>>> +              logging.error(error_msg_try, node, i, e)
>>> +              last_exception = e
>>> +          else:
>>> +            if last_exception:
>>> +              error_msg = error_msg_final % (node_name, node,
>>> max_retries,
>>> +                                             last_exception)
>>> +              result_msgs[node] = error_msg
>>> +              logging.error(error_msg)
>>> +
>>> +        else:
>>> +          last_exception = None
>>> +          if from_authorized_keys:
>>> +            for i in range(max_retries):
>>> +              try:
>>> +                run_cmd_fn(cluster_name, node, pathutils.SSH_UPDATE,
>>> +                           ssh_port, base_data,
>>> +                           debug=False, verbose=False,
>>> use_cluster_key=False,
>>> +                           ask_key=False, strict_host_check=False)
>>> +                break
>>> +              except errors.OpExecError as e:
>>> +                logging.error(error_msg_try, node, i, e)
>>> +                last_exception = e
>>> +          else:
>>> +            if last_exception:
>>> +              error_msg = error_msg_final % (node_name, node,
>>> max_retries,
>>> +                                             last_exception)
>>> +              result_msgs[node] = error_msg
>>> +              logging.error(error_msg)
>>
>>
>>>    if clear_authorized_keys or from_public_keys or clear_public_keys:
>>>      data = {}
>>> @@ -1790,9 +1820,10 @@ def RemoveNodeSshKey(node_uuid, node_name,
>>>                   debug=False, verbose=False, use_cluster_key=False,
>>>                   ask_key=False, strict_host_check=False)
>>>      except errors.OpExecError as e:
>>> -      result_msgs.append("Removing SSH keys from node '%s' failed. This
>>> can"
>>> -                         " happen when the node is already unreachable."
>>> -                         " Error: %s" % (node_name, e))
>>> +      result_msgs[node_name] = \
>>> +          ("Removing SSH keys from node '%s' failed. This can"
>>> +           " happen when the node is already unreachable."
>>> +           " Error: %s" % (node_name, e))
>>>
>>>    return result_msgs
>>>
>>> @@ -1980,15 +2011,14 @@ def RenewSshKeys(node_uuids, node_names,
>>> ssh_port_map,
>>>          # and that would terminate all communication from the master to
>>> the
>>>          # node.
>>>          logging.debug("Removing SSH key of node '%s'.", node_name)
>>> -        RemoveNodeSshKey(node_uuid, node_name,
>>> -                         master_candidate_uuids,
>>> -                         potential_master_candidates,
>>> -                         ssh_port_map,
>>> -                         master_uuid=master_node_uuid,
>>> -                         from_authorized_keys=master_candidate,
>>> -                         from_public_keys=False,
>>> -                         clear_authorized_keys=False,
>>> -                         clear_public_keys=False)
>>> +        node_errors = RemoveNodeSshKey(
>>> +           node_uuid, node_name, master_candidate_uuids,
>>> +           potential_master_candidates, ssh_port_map,
>>> +           master_uuid=master_node_uuid,
>>> from_authorized_keys=master_candidate,
>>> +           from_public_keys=False, clear_authorized_keys=False,
>>> +           clear_public_keys=False)
>>> +        if node_errors:
>>> +          all_node_errors = all_node_errors + [node_errors.items()]
>>>        else:
>>>          logging.debug("Old key of node '%s' is the same as the current
>>> master"
>>>                        " key. Not deleting that key on the node.",
>>> node_name)
>>> @@ -2070,15 +2100,14 @@ def RenewSshKeys(node_uuids, node_names,
>>> ssh_port_map,
>>>
>>>    # Remove the old key from all node's authorized keys file
>>>    logging.debug("Remove the old master key from all nodes.")
>>> -  RemoveNodeSshKey(master_node_uuid, master_node_name,
>>> -                   master_candidate_uuids,
>>> -                   potential_master_candidates,
>>> -                   ssh_port_map,
>>> -                   keys_to_remove=old_master_keys_by_uuid,
>>> -                   from_authorized_keys=True,
>>> -                   from_public_keys=False,
>>> -                   clear_authorized_keys=False,
>>> -                   clear_public_keys=False)
>>> +  node_errors = RemoveNodeSshKey(
>>> +      master_node_uuid, master_node_name, master_candidate_uuids,
>>> +      potential_master_candidates, ssh_port_map,
>>> +      keys_to_remove=old_master_keys_by_uuid, from_authorized_keys=True,
>>> +      from_public_keys=False, clear_authorized_keys=False,
>>> +      clear_public_keys=False)
>>> +  if node_errors:
>>> +    all_node_errors = all_node_errors + node_errors.items()
>>>
>>>    return all_node_errors
>>>
>>> diff --git a/lib/cmdlib/node.py b/lib/cmdlib/node.py
>>> index 6a13d8a..8453b33 100644
>>> --- a/lib/cmdlib/node.py
>>> +++ b/lib/cmdlib/node.py
>>> @@ -366,6 +366,12 @@ class LUNodeAdd(LogicalUnit):
>>>        remove_result[master_node].Raise(
>>>          "Could not remove SSH keys of node %s before readding,"
>>>          " (UUID: %s)." % (new_node_name, new_node_uuid))
>>>
>>
>> As with previous patches, please insert space, use helper function, alter
>> message.
>>
>
> I added a helper function and used it here. I don't really get the request
> for spaces. It is calling the rpc, raising serious exceptions, then
> evaluating the other output. I think that pretty much belongs together as a
> block.
>
>

With the helper function, all is well.


>
>>
>>> +      result_msgs = remove_result[master_node].payload
>>> +      if result_msgs:
>>> +        feedback_fn("Removing old SSH key of node '%s' failed on some
>>> nodes."
>>> +                    % new_node_name)
>>> +        for node, error_msg in result_msgs.items():
>>> +          feedback_fn("%s: %s" % (node, error_msg))
>>>
>>>      result = rpcrunner.call_node_ssh_key_add(
>>>        [master_node], new_node_uuid, new_node_name,
>>> @@ -885,12 +891,15 @@ class LUNodeSetParams(LogicalUnit):
>>>              False, # currently, all nodes are potential master
>>> candidates
>>>              False, # do not clear node's 'authorized_keys'
>>>              False) # do not clear node's 'ganeti_pub_keys'
>>> -          if not ssh_result[master_node].fail_msg:
>>> -            for message in ssh_result[master_node].payload:
>>> -              feedback_fn(message)
>>>            ssh_result[master_node].Raise(
>>>              "Could not adjust the SSH setup after demoting node '%s'"
>>>              " (UUID: %s)." % (node.name, node.uuid))
>>>
>>
>> As above.
>>
>
> See above.
>
>
>>
>>
>>> +          if not ssh_result[master_node].fail_msg:
>>> +            if ssh_result[master_node].payload:
>>> +              feedback_fn("When demoting node '%s', updating SSH keys
>>> on"
>>> +                          " some nodes failed." % (node.name))
>>> +            for node, message in ssh_result[master_node].payload:
>>> +              feedback_fn("%s: %s" % (node, message))
>>>
>>>          if self.new_role == self._ROLE_CANDIDATE:
>>>            ssh_result = self.rpc.call_node_ssh_key_add(
>>> @@ -1603,6 +1612,12 @@ class LUNodeRemove(LogicalUnit):
>>>        result[master_node].Raise(
>>>          "Could not remove the SSH key of node '%s' (UUID: %s)." %
>>>          (self.op.node_name, self.node.uuid))
>>> +      result_msgs = result[master_node].payload
>>> +      if result_msgs:
>>> +        feedback_fn("Removing old SSH key of node '%s' failed on some
>>> nodes."
>>> +                    % self.op.node_name)
>>> +        for node, error_msg in result_msgs.items():
>>> +          feedback_fn("%s: %s" % (node, error_msg))
>>>
>>>      # Promote nodes to master candidate as needed
>>>      AdjustCandidatePool(self, [self.node.uuid])
>>> diff --git a/test/py/ganeti.backend_unittest.py b/test/py/
>>> ganeti.backend_unittest.py
>>> index e05580d..3c207f8 100755
>>> --- a/test/py/ganeti.backend_unittest.py
>>> +++ b/test/py/ganeti.backend_unittest.py
>>> @@ -1563,6 +1563,71 @@ class
>>> TestAddRemoveGenerateNodeSshKey(testutils.GanetiTestCase):
>>>
>>>      self.assertTrue(node_errors[other_node_name])
>>>
>>> +  def testRemoveKeySuccessfullyWithRetriesOnOtherNode(self):
>>> +    """Test removing keys even if one of the old nodes needs retries.
>>> +
>>> +    This tests checks whether a key can be removed successfully even
>>> +    when one of the other nodes needs to be contacted with several
>>> +    retries.
>>> +
>>> +    """
>>> +    all_master_candidates =
>>> self._ssh_file_manager.GetAllMasterCandidates()
>>> +    node_name, node_uuid, node_key, _, _, _ = all_master_candidates[0]
>>> +    other_node_name, _, _, _, _, _ = all_master_candidates[1]
>>> +    assert node_name != self._master_node
>>> +    assert other_node_name != self._master_node
>>> +    assert node_name != other_node_name
>>> +    self._ssh_file_manager.SetMaxRetries(other_node_name, 3)
>>> +
>>> +    backend.RemoveNodeSshKey(node_uuid, node_name,
>>> +                             self._master_candidate_uuids,
>>> +                             self._potential_master_candidates,
>>> +                             self._ssh_port_map,
>>> +                             from_authorized_keys=True,
>>> +                             from_public_keys=True,
>>> +                             clear_authorized_keys=True,
>>> +                             clear_public_keys=True,
>>> +                             pub_key_file=self._pub_key_file,
>>> +                             ssconf_store=self._ssconf_mock,
>>> +                             noded_cert_file=self.noded_cert_file,
>>> +                             run_cmd_fn=self._run_cmd_mock)
>>> +
>>> +    self.assertTrue(self._ssh_file_manager.NoNodeHasPublicKey(
>>> +        node_uuid, node_key))
>>> +
>>> self.assertTrue(self._ssh_file_manager.NoNodeHasAuthorizedKey(node_key))
>>> +
>>> +  def testRemoveKeyFailedWithRetriesOnOtherNode(self):
>>> +    """Test removing keys even if one of the old nodes fails even with
>>> retries.
>>> +
>>> +    This tests checks whether the removal of a key finishes properly,
>>> even if
>>> +    the update of the key files on one of the other nodes fails despite
>>> several
>>> +    retries.
>>> +
>>> +    """
>>> +    all_master_candidates =
>>> self._ssh_file_manager.GetAllMasterCandidates()
>>> +    node_name, node_uuid, node_key, _, _, _ = all_master_candidates[0]
>>> +    other_node_name, _, _, _, _, _ = all_master_candidates[1]
>>> +    assert node_name != self._master_node
>>> +    assert other_node_name != self._master_node
>>> +    assert node_name != other_node_name
>>> +    self._ssh_file_manager.SetMaxRetries(other_node_name, 4)
>>> +
>>> +    error_msgs = backend.RemoveNodeSshKey(
>>> +        node_uuid, node_name, self._master_candidate_uuids,
>>> +        self._potential_master_candidates, self._ssh_port_map,
>>> +        from_authorized_keys=True, from_public_keys=True,
>>> +        clear_authorized_keys=True, clear_public_keys=True,
>>> +        pub_key_file=self._pub_key_file, ssconf_store=self._ssconf_mock,
>>> +        noded_cert_file=self.noded_cert_file,
>>> run_cmd_fn=self._run_cmd_mock)
>>> +
>>> +    for node in self._all_nodes:
>>> +      if node == other_node_name:
>>> +        self.assertTrue(self._ssh_file_manager.NodeHasAuthorizedKey(
>>> +            node, node_key))
>>> +      else:
>>> +        self.assertFalse(self._ssh_file_manager.NodeHasAuthorizedKey(
>>> +            node, node_key))
>>> +    self.assertTrue(error_msgs[other_node_name])
>>>
>>>  class TestVerifySshSetup(testutils.GanetiTestCase):
>>>
>>> --
>>> 2.2.0.rc0.207.ga3a616c
>>>
>>>
> Interdiff:
>
> commit cdd4f00f57d5610dd467ba1c3562d3d89ff6eca0
> Author: Helga Velroyen <[email protected]>
> Date:   Thu Apr 23 12:29:06 2015 +0200
>
>     interdiff
>
> diff --git a/lib/backend.py b/lib/backend.py
> index 87c8c77..bdc7d3e 100644
> --- a/lib/backend.py
> +++ b/lib/backend.py
> @@ -1659,8 +1659,8 @@ def RemoveNodeSshKey(node_uuid, node_name,
>    @returns: list of feedback messages
>
>    """
> -  # Non-disruptive error messages, mapped from node name to message
> -  result_msgs = {}
> +  # Non-disruptive error messages, list of (node, msg) pairs
> +  result_msgs = []
>
>    # Make sure at least one of these flags is true.
>    if not (from_authorized_keys or from_public_keys or
> clear_authorized_keys
> @@ -1731,13 +1731,12 @@ def RemoveNodeSshKey(node_uuid, node_name,
>            raise errors.OpExecError("No SSH port information available for"
>                                     " node '%s', map: %s." %
>                                     (node, ssh_port_map))
> +        error_msg_try = ("The SSH setup of node '%s' could not"
> +                         " be adjusted in try no %s. Error: %s.")
> +        error_msg_final = ("When removing the key of node '%s', updating
> the"
> +                           " SSH key files of node '%s' failed after %s"
> +                           " retries. Not trying again. Last error was:
> %s.")
>          if node in potential_master_candidates:
> -          error_msg_try = ("The SSH setup of node '%s' could not"
> -                           " be adjusted in try no %s. Error: %s.")
> -          error_msg_final = ("When removing the key of node '%s',
> updating the"
> -                             " SSH key files of node '%s' failed after %s"
> -                             " retries. Not trying again. Last error was:
> %s.")
> -          last_exception = None
>            for i in range(constants.SSHS_MAX_RETRIES):
>              try:
>                run_cmd_fn(cluster_name, node, pathutils.SSH_UPDATE,
> @@ -1752,7 +1751,7 @@ def RemoveNodeSshKey(node_uuid, node_name,
>              if last_exception:
>                error_msg = error_msg_final % (
>                    node_name, node, constants.SSHS_MAX_RETRIES,
> last_exception)
> -              result_msgs[node] = error_msg
> +              result_msgs.append((node, error_msg))
>                logging.error(error_msg)
>
>          else:
> @@ -1772,7 +1771,7 @@ def RemoveNodeSshKey(node_uuid, node_name,
>              if last_exception:
>                error_msg = error_msg_final % (
>                    node_name, node, constants.SSHS_MAX_RETRIES,
> last_exception)
> -              result_msgs[node] = error_msg
> +              result_msgs.append((node, error_msg))
>                logging.error(error_msg)
>
>    if clear_authorized_keys or from_public_keys or clear_public_keys:
> @@ -2015,7 +2014,7 @@ def RenewSshKeys(node_uuids, node_names,
> ssh_port_map,
>             from_public_keys=False, clear_authorized_keys=False,
>             clear_public_keys=False)
>          if node_errors:
> -          all_node_errors = all_node_errors + [node_errors.items()]
> +          all_node_errors = all_node_errors + node_errors
>        else:
>          logging.debug("Old key of node '%s' is the same as the current
> master"
>                        " key. Not deleting that key on the node.",
> node_name)
> @@ -2104,7 +2103,7 @@ def RenewSshKeys(node_uuids, node_names,
> ssh_port_map,
>        from_public_keys=False, clear_authorized_keys=False,
>        clear_public_keys=False)
>    if node_errors:
> -    all_node_errors = all_node_errors + node_errors.items()
> +    all_node_errors = all_node_errors + node_errors
>
>    return all_node_errors
>
> diff --git a/lib/cmdlib/node.py b/lib/cmdlib/node.py
> index 337290e..41451b3 100644
> --- a/lib/cmdlib/node.py
> +++ b/lib/cmdlib/node.py
> @@ -366,12 +366,7 @@ class LUNodeAdd(LogicalUnit):
>        remove_result[master_node].Raise(
>          "Could not remove SSH keys of node %s before readding,"
>          " (UUID: %s)." % (new_node_name, new_node_uuid))
> -      result_msgs = remove_result[master_node].payload
> -      if result_msgs:
> -        feedback_fn("Removing old SSH key of node '%s' failed on some
> nodes."
> -                    % new_node_name)
> -        for node, error_msg in result_msgs.items():
> -          feedback_fn("%s: %s" % (node, error_msg))
> +      EvaluateSshUpdateRPC(remove_result, master_node, feedback_fn)
>
>      result = rpcrunner.call_node_ssh_key_add(
>        [master_node], new_node_uuid, new_node_name,
> @@ -891,12 +886,7 @@ class LUNodeSetParams(LogicalUnit):
>            ssh_result[master_node].Raise(
>              "Could not adjust the SSH setup after demoting node '%s'"
>              " (UUID: %s)." % (node.name, node.uuid))
> -          if not ssh_result[master_node].fail_msg:
> -            if ssh_result[master_node].payload:
> -              feedback_fn("When demoting node '%s', updating SSH keys on"
> -                          " some nodes failed." % (node.name))
> -            for node, message in ssh_result[master_node].payload:
> -              feedback_fn("%s: %s" % (node, message))
> +          EvaluateSshUpdateRPC(ssh_result, master_node, feedback_fn)
>
>          if self.new_role == self._ROLE_CANDIDATE:
>            ssh_result = self.rpc.call_node_ssh_key_add(
> @@ -1605,12 +1595,7 @@ class LUNodeRemove(LogicalUnit):
>        result[master_node].Raise(
>          "Could not remove the SSH key of node '%s' (UUID: %s)." %
>          (self.op.node_name, self.node.uuid))
> -      result_msgs = result[master_node].payload
> -      if result_msgs:
> -        feedback_fn("Removing old SSH key of node '%s' failed on some
> nodes."
> -                    % self.op.node_name)
> -        for node, error_msg in result_msgs.items():
> -          feedback_fn("%s: %s" % (node, error_msg))
> +      EvaluateSshUpdateRPC(result, master_node, feedback_fn)
>
>      # Promote nodes to master candidate as needed
>      AdjustCandidatePool(self, [self.node.uuid])
> diff --git a/test/py/ganeti.backend_unittest.py b/test/py/
> ganeti.backend_unittest.py
> index 3e54c38..181f495 100755
> --- a/test/py/ganeti.backend_unittest.py
> +++ b/test/py/ganeti.backend_unittest.py
> @@ -1634,7 +1634,8 @@ class
> TestAddRemoveGenerateNodeSshKey(testutils.GanetiTestCase):
>        else:
>          self.assertFalse(self._ssh_file_manager.NodeHasAuthorizedKey(
>              node, node_key))
> -    self.assertTrue(error_msgs[other_node_name])
> +    self.assertTrue([error_msg for (node, error_msg) in error_msgs
> +                     if node == other_node_name])
>
>  class TestVerifySshSetup(testutils.GanetiTestCase):
>
>
>
>>
>> Hrvoje Ribicic
>> Ganeti Engineering
>> Google Germany GmbH
>> Dienerstr. 12, 80331, München
>>
>> Registergericht und -nummer: Hamburg, HRB 86891
>> Sitz der Gesellschaft: Hamburg
>> Geschäftsführer: Graham Law, Christine Elizabeth Flores
>> Steuernummer: 48/725/00206
>> Umsatzsteueridentifikationsnummer: DE813741370
>>
>
Hrvoje Ribicic
Ganeti Engineering
Google Germany GmbH
Dienerstr. 12, 80331, München

Registergericht und -nummer: Hamburg, HRB 86891
Sitz der Gesellschaft: Hamburg
Geschäftsführer: Graham Law, Christine Elizabeth Flores
Steuernummer: 48/725/00206
Umsatzsteueridentifikationsnummer: DE813741370

Reply via email to