LGTM, thanks

On Tue, Apr 28, 2015 at 3:55 PM, Helga Velroyen <[email protected]> wrote:

> Hi!
>
> On Tue, 28 Apr 2015 at 15:42 Hrvoje Ribicic <[email protected]> wrote:
>
>> On Thu, Apr 23, 2015 at 5:30 PM, 'Helga Velroyen' via ganeti-devel <
>> [email protected]> wrote:
>>
>>> This patch adds the RetryByNumberOfTimes utility function
>>> which wraps around a given function and calls that one
>>> up to a maximum number of times before raising a final
>>> exception.
>>>
>>> This function is used in the SSH key updating code in
>>> backend.py.
>>>
>>> Signed-off-by: Helga Velroyen <[email protected]>
>>> ---
>>>  lib/backend.py     | 178
>>> +++++++++++++++++++++++------------------------------
>>>  lib/utils/retry.py |  28 +++++++++
>>>  2 files changed, 106 insertions(+), 100 deletions(-)
>>>
>>> diff --git a/lib/backend.py b/lib/backend.py
>>> index 41f8a37..5b2b558 100644
>>> --- a/lib/backend.py
>>> +++ b/lib/backend.py
>>> @@ -1529,26 +1529,19 @@ def AddNodeSshKey(node_uuid, node_name,
>>>      node_data[constants.SSHS_SSH_PUBLIC_KEYS] = \
>>>        (constants.SSHS_OVERRIDE, all_keys)
>>>
>>> -    last_exception = None
>>> -    for _ in range(constants.SSHS_MAX_RETRIES):
>>> -      try:
>>> -        run_cmd_fn(cluster_name, node_name, pathutils.SSH_UPDATE,
>>> -                   ssh_port_map.get(node_name), node_data,
>>> -                   debug=False, verbose=False, use_cluster_key=False,
>>> -                   ask_key=False, strict_host_check=False)
>>> -        break
>>> -      except errors.OpExecError as e:
>>> -        logging.error("Updating SSH key files of node '%s' failed."
>>> -                      " Error: %s", node_name, e)
>>> -        last_exception = e
>>> -    else:
>>> -      if last_exception:
>>> -        # Clean up the master's public key file if adding key fails
>>> -        if to_public_keys:
>>> -          ssh.RemovePublicKey(node_uuid)
>>> -        raise errors.SshUpdateError(
>>> -          "Could not update the SSH setup of node '%s' itself. Error:
>>> %s."
>>> -          % (node_name, last_exception))
>>> +    try:
>>> +      utils.RetryByNumberOfTimes(
>>> +          constants.SSHS_MAX_RETRIES,
>>> +          errors.SshUpdateError,
>>> +          run_cmd_fn, cluster_name, node_name, pathutils.SSH_UPDATE,
>>> +          ssh_port_map.get(node_name), node_data,
>>> +          debug=False, verbose=False, use_cluster_key=False,
>>> +          ask_key=False, strict_host_check=False)
>>> +    except errors.SshUpdateError as e:
>>> +      # Clean up the master's public key file if adding key fails
>>> +      if to_public_keys:
>>> +        ssh.RemovePublicKey(node_uuid)
>>> +      raise e
>>>
>>>    # Update all nodes except master and the target node
>>>    if to_authorized_keys:
>>> @@ -1573,29 +1566,25 @@ def AddNodeSshKey(node_uuid, node_name,
>>>        logging.debug("Skipping offline node '%s'.", node)
>>>        continue
>>>      if node in potential_master_candidates:
>>> -      logging.debug("Updating SSH key files of node '%s'.")
>>> -      for i in range(constants.SSHS_MAX_RETRIES):
>>> -        try:
>>> -          run_cmd_fn(cluster_name, node, pathutils.SSH_UPDATE,
>>> -                     ssh_port_map.get(node), pot_mc_data,
>>> -                     debug=False, verbose=False, use_cluster_key=False,
>>> -                     ask_key=False, strict_host_check=False)
>>> -          break
>>> -        except errors.OpExecError as e:
>>> -          logging.error("Updating SSH key files of node '%s' failed"
>>> -                        " at try no %s. Error: %s.", node, i, e)
>>> -          last_exception = e
>>> -      else:
>>> -        if last_exception:
>>> -          error_msg = ("When adding the key of node '%s', updating SSH
>>> key"
>>> -                       " files of node '%s' failed after %s retries."
>>> -                       " Not trying again. Last error was: %s." %
>>> -                       (node, node_name, constants.SSHS_MAX_RETRIES,
>>> -                        last_exception))
>>> -          node_errors.append((node, error_msg))
>>> -          # We only log the error and don't throw an exception, because
>>> -          # one unreachable node shall not abort the entire procedure.
>>> -          logging.error(error_msg)
>>> +      logging.debug("Updating SSH key files of node '%s'.", node)
>>> +      try:
>>> +        utils.RetryByNumberOfTimes(
>>> +            constants.SSHS_MAX_RETRIES,
>>> +            errors.SshUpdateError,
>>> +            run_cmd_fn, cluster_name, node, pathutils.SSH_UPDATE,
>>> +            ssh_port_map.get(node), pot_mc_data,
>>> +            debug=False, verbose=False, use_cluster_key=False,
>>> +            ask_key=False, strict_host_check=False)
>>> +      except errors.SshUpdateError as last_exception:
>>> +        error_msg = ("When adding the key of node '%s', updating SSH
>>> key"
>>> +                     " files of node '%s' failed after %s retries."
>>> +                     " Not trying again. Last errors was: %s." %
>>>
>>
>> s/errors/error/
>>
>
> ACK
>
>
>>
>>
>>> +                     (node, node_name, constants.SSHS_MAX_RETRIES,
>>> +                      last_exception))
>>> +        node_errors.append((node, error_msg))
>>> +        # We only log the error and don't throw an exception, because
>>> +        # one unreachable node shall not abort the entire procedure.
>>> +        logging.error(error_msg)
>>>
>>>      else:
>>>        if to_authorized_keys:
>>> @@ -1731,48 +1720,41 @@ def RemoveNodeSshKey(node_uuid, node_name,
>>>            raise errors.OpExecError("No SSH port information available
>>> for"
>>>                                     " node '%s', map: %s." %
>>>                                     (node, ssh_port_map))
>>> -        error_msg_try = ("The SSH setup of node '%s' could not"
>>> -                         " be adjusted in try no %s. Error: %s.")
>>>          error_msg_final = ("When removing the key of node '%s',
>>> updating the"
>>> -                           " SSH key files of node '%s' failed after %s"
>>> -                           " retries. Not trying again. Last error was:
>>> %s.")
>>> +                           " SSH key files of node '%s' failed. Last
>>> error"
>>> +                           " was: %s.")
>>>          if node in potential_master_candidates:
>>> -          for i in range(constants.SSHS_MAX_RETRIES):
>>> -            try:
>>> -              run_cmd_fn(cluster_name, node, pathutils.SSH_UPDATE,
>>> -                         ssh_port, pot_mc_data,
>>> -                         debug=False, verbose=False,
>>> use_cluster_key=False,
>>> -                         ask_key=False, strict_host_check=False)
>>> -              break
>>> -            except errors.OpExecError as e:
>>> -              logging.error(error_msg_try, node, i, e)
>>> -              last_exception = e
>>> -          else:
>>> -            if last_exception:
>>> -              error_msg = error_msg_final % (
>>> -                  node_name, node, constants.SSHS_MAX_RETRIES,
>>> last_exception)
>>> -              result_msgs.append((node, error_msg))
>>> -              logging.error(error_msg)
>>> +          logging.debug("Updating key setup of potential master
>>> candidate node"
>>> +                        " %s.", node)
>>> +          try:
>>> +            utils.RetryByNumberOfTimes(
>>> +                constants.SSHS_MAX_RETRIES,
>>> +                errors.SshUpdateError,
>>> +                run_cmd_fn, cluster_name, node, pathutils.SSH_UPDATE,
>>> +                ssh_port, pot_mc_data,
>>> +                debug=False, verbose=False, use_cluster_key=False,
>>> +                ask_key=False, strict_host_check=False)
>>> +          except errors.SshUpdateError as last_exception:
>>> +            error_msg = error_msg_final % (
>>> +                node_name, node, last_exception)
>>> +            result_msgs.append((node, error_msg))
>>> +            logging.error(error_msg)
>>>
>>>          else:
>>> -          last_exception = None
>>> -          if from_authorized_keys:
>>>
>>
>> This line appears to have gotten lost in the refactoring.
>>
>
> Good catch, thanks!
>
>
>>
>>
>>> -            for i in range(constants.SSHS_MAX_RETRIES):
>>> -              try:
>>> -                run_cmd_fn(cluster_name, node, pathutils.SSH_UPDATE,
>>> -                           ssh_port, base_data,
>>> -                           debug=False, verbose=False,
>>> use_cluster_key=False,
>>> -                           ask_key=False, strict_host_check=False)
>>> -                break
>>> -              except errors.OpExecError as e:
>>> -                logging.error(error_msg_try, node, i, e)
>>> -                last_exception = e
>>> -          else:
>>> -            if last_exception:
>>> -              error_msg = error_msg_final % (
>>> -                  node_name, node, constants.SSHS_MAX_RETRIES,
>>> last_exception)
>>> -              result_msgs.append((node, error_msg))
>>> -              logging.error(error_msg)
>>> +          logging.debug("Updating key setup of normal node %s.", node)
>>> +          try:
>>> +            utils.RetryByNumberOfTimes(
>>> +                constants.SSHS_MAX_RETRIES,
>>> +                errors.SshUpdateError,
>>> +                run_cmd_fn, cluster_name, node, pathutils.SSH_UPDATE,
>>> +                ssh_port, base_data,
>>> +                debug=False, verbose=False, use_cluster_key=False,
>>> +                ask_key=False, strict_host_check=False)
>>> +          except errors.SshUpdateError as last_exception:
>>> +            error_msg = error_msg_final % (
>>> +                node_name, node, last_exception)
>>> +            result_msgs.append((node, error_msg))
>>> +            logging.error(error_msg)
>>>
>>>    if clear_authorized_keys or from_public_keys or clear_public_keys:
>>>      data = {}
>>> @@ -1810,25 +1792,21 @@ def RemoveNodeSshKey(node_uuid, node_name,
>>>              constants.SSHS_SSH_AUTHORIZED_KEYS in data):
>>>        return
>>>
>>> -    last_exception = None
>>> -    for i in range(constants.SSHS_MAX_RETRIES):
>>> -      try:
>>> -        run_cmd_fn(cluster_name, node_name, pathutils.SSH_UPDATE,
>>> -                   ssh_port, data,
>>> -                   debug=False, verbose=False, use_cluster_key=False,
>>> -                   ask_key=False, strict_host_check=False)
>>> -      except errors.OpExecError as e:
>>> -        logging.error("Updating the SSH key files of node '%s', whose
>>> key"
>>> -                      " itself is being removed failed with try no %s."
>>> -                      " Error: %s", node_name, i, e)
>>> -        last_exception = e
>>> -    else:
>>> -      if last_exception:
>>> -        result_msgs.append(
>>> -            (node_name,
>>> -             ("Removing SSH keys from node '%s' failed after %s
>>> retries."
>>> -              " This can happen when the node is already unreachable."
>>> -              " Error: %s" % (node_name, constants.SSHS_MAX_RETRIES,
>>> e))))
>>> +    logging.debug("Updating SSH key setup of target node '%s'.",
>>> node_name)
>>> +    try:
>>> +      utils.RetryByNumberOfTimes(
>>> +          constants.SSHS_MAX_RETRIES,
>>> +          errors.SshUpdateError,
>>> +          run_cmd_fn, cluster_name, node_name, pathutils.SSH_UPDATE,
>>> +          ssh_port, data,
>>> +          debug=False, verbose=False, use_cluster_key=False,
>>> +          ask_key=False, strict_host_check=False)
>>> +    except errors.SshUpdateError as last_exception:
>>> +      result_msgs.append(
>>> +          (node_name,
>>> +           ("Removing SSH keys from node '%s' failed."
>>> +            " This can happen when the node is already unreachable."
>>> +            " Error: %s" % (node_name, last_exception))))
>>>
>>>    return result_msgs
>>>
>>> diff --git a/lib/utils/retry.py b/lib/utils/retry.py
>>> index c7567fe..382d0cb 100644
>>> --- a/lib/utils/retry.py
>>> +++ b/lib/utils/retry.py
>>> @@ -32,6 +32,7 @@
>>>  """
>>>
>>>
>>> +import logging
>>>  import time
>>>
>>>  from ganeti import errors
>>> @@ -230,3 +231,30 @@ def SimpleRetry(expected, fn, delay, timeout,
>>> args=None, wait_fn=time.sleep,
>>>      assert "result" in rdict
>>>      result = rdict["result"]
>>>    return result
>>> +
>>> +
>>> +def RetryByNumberOfTimes(max_retries, exception_class, fn, *args,
>>> **kwargs):
>>> +  """Retries calling a function up to the specified number of times.
>>> +
>>> +  @type max_retries: integer
>>> +  @param max_retries: maximum number of retries.
>>> +  @type exception_class: class
>>> +  @param exception_class: exception class which is used for throwing the
>>> +                          final exception.
>>> +  @type fn: callable
>>> +  @param fn: function to be called (up to the specified maximum number
>>> of
>>> +             retries.
>>>
>>
>> Very minor nitpick: doclint capitalization in this comment does not match
>> the rest of the file. Could you fix that up?
>>
>
> ACK
>
>
>>
>>
>>> +
>>> +  """
>>> +  last_exception = None
>>> +  for i in range(max_retries):
>>> +    try:
>>> +      fn(*args, **kwargs)
>>> +      break
>>> +    except errors.OpExecError as e:
>>> +      logging.error("Error after retry no. %s: %s.", i, e)
>>> +      last_exception = e
>>> +  else:
>>> +    if last_exception:
>>> +      raise exception_class("Error after %s retries. Last exception:
>>> %s."
>>> +                            % (max_retries, last_exception))
>>> --
>>> 2.2.0.rc0.207.ga3a616c
>>>
>>>
>> As discussed offline, overall it would still be good to have delays. The
>> retries currently assume that the operations performed take enough time
>> that whatever transient problems are present might fix themselves.
>> Please make an issue and refactor later.
>>
>
> Filed https://code.google.com/p/ganeti/issues/detail?id=1078
>
> Interdiff:
> diff --git a/lib/backend.py b/lib/backend.py
> index 5b2b558..52eee65 100644
> --- a/lib/backend.py
> +++ b/lib/backend.py
> @@ -1578,7 +1578,7 @@ def AddNodeSshKey(node_uuid, node_name,
>        except errors.SshUpdateError as last_exception:
>          error_msg = ("When adding the key of node '%s', updating SSH key"
>                       " files of node '%s' failed after %s retries."
> -                     " Not trying again. Last errors was: %s." %
> +                     " Not trying again. Last error was: %s." %
>                       (node, node_name, constants.SSHS_MAX_RETRIES,
>                        last_exception))
>          node_errors.append((node, error_msg))
> @@ -1741,20 +1741,21 @@ def RemoveNodeSshKey(node_uuid, node_name,
>              logging.error(error_msg)
>
>          else:
> -          logging.debug("Updating key setup of normal node %s.", node)
> -          try:
> -            utils.RetryByNumberOfTimes(
> -                constants.SSHS_MAX_RETRIES,
> -                errors.SshUpdateError,
> -                run_cmd_fn, cluster_name, node, pathutils.SSH_UPDATE,
> -                ssh_port, base_data,
> -                debug=False, verbose=False, use_cluster_key=False,
> -                ask_key=False, strict_host_check=False)
> -          except errors.SshUpdateError as last_exception:
> -            error_msg = error_msg_final % (
> -                node_name, node, last_exception)
> -            result_msgs.append((node, error_msg))
> -            logging.error(error_msg)
> +          if from_authorized_keys:
> +            logging.debug("Updating key setup of normal node %s.", node)
> +            try:
> +              utils.RetryByNumberOfTimes(
> +                  constants.SSHS_MAX_RETRIES,
> +                  errors.SshUpdateError,
> +                  run_cmd_fn, cluster_name, node, pathutils.SSH_UPDATE,
> +                  ssh_port, base_data,
> +                  debug=False, verbose=False, use_cluster_key=False,
> +                  ask_key=False, strict_host_check=False)
> +            except errors.SshUpdateError as last_exception:
> +              error_msg = error_msg_final % (
> +                  node_name, node, last_exception)
> +              result_msgs.append((node, error_msg))
> +              logging.error(error_msg)
>
>    if clear_authorized_keys or from_public_keys or clear_public_keys:
>      data = {}
> diff --git a/lib/utils/retry.py b/lib/utils/retry.py
> index 382d0cb..fda3fcd 100644
> --- a/lib/utils/retry.py
> +++ b/lib/utils/retry.py
> @@ -237,12 +237,12 @@ def RetryByNumberOfTimes(max_retries,
> exception_class, fn, *args, **kwargs):
>    """Retries calling a function up to the specified number of times.
>
>    @type max_retries: integer
> -  @param max_retries: maximum number of retries.
> +  @param max_retries: Maximum number of retries.
>    @type exception_class: class
> -  @param exception_class: exception class which is used for throwing the
> +  @param exception_class: Exception class which is used for throwing the
>                            final exception.
>    @type fn: callable
> -  @param fn: function to be called (up to the specified maximum number of
> +  @param fn: Function to be called (up to the specified maximum number of
>               retries.
>
>    """
>
>
>
>>
>>
>> Hrvoje Ribicic
>> Ganeti Engineering
>> Google Germany GmbH
>> Dienerstr. 12, 80331, München
>>
>> Registergericht und -nummer: Hamburg, HRB 86891
>> Sitz der Gesellschaft: Hamburg
>> Geschäftsführer: Graham Law, Christine Elizabeth Flores
>> Steuernummer: 48/725/00206
>> Umsatzsteueridentifikationsnummer: DE813741370
>>
>
Hrvoje Ribicic
Ganeti Engineering
Google Germany GmbH
Dienerstr. 12, 80331, München

Registergericht und -nummer: Hamburg, HRB 86891
Sitz der Gesellschaft: Hamburg
Geschäftsführer: Graham Law, Christine Elizabeth Flores
Steuernummer: 48/725/00206
Umsatzsteueridentifikationsnummer: DE813741370

Reply via email to