On Thu, Apr 23, 2015 at 5:30 PM, 'Helga Velroyen' via ganeti-devel <
[email protected]> wrote:

> This patch adds the RetryByNumberOfTimes utility function
> which wraps around a given function and calls that one
> up to a maximum number of times before raising a final
> exception.
>
> This function is used in the SSH key updating code in
> backend.py.
>
> Signed-off-by: Helga Velroyen <[email protected]>
> ---
>  lib/backend.py     | 178
> +++++++++++++++++++++++------------------------------
>  lib/utils/retry.py |  28 +++++++++
>  2 files changed, 106 insertions(+), 100 deletions(-)
>
> diff --git a/lib/backend.py b/lib/backend.py
> index 41f8a37..5b2b558 100644
> --- a/lib/backend.py
> +++ b/lib/backend.py
> @@ -1529,26 +1529,19 @@ def AddNodeSshKey(node_uuid, node_name,
>      node_data[constants.SSHS_SSH_PUBLIC_KEYS] = \
>        (constants.SSHS_OVERRIDE, all_keys)
>
> -    last_exception = None
> -    for _ in range(constants.SSHS_MAX_RETRIES):
> -      try:
> -        run_cmd_fn(cluster_name, node_name, pathutils.SSH_UPDATE,
> -                   ssh_port_map.get(node_name), node_data,
> -                   debug=False, verbose=False, use_cluster_key=False,
> -                   ask_key=False, strict_host_check=False)
> -        break
> -      except errors.OpExecError as e:
> -        logging.error("Updating SSH key files of node '%s' failed."
> -                      " Error: %s", node_name, e)
> -        last_exception = e
> -    else:
> -      if last_exception:
> -        # Clean up the master's public key file if adding key fails
> -        if to_public_keys:
> -          ssh.RemovePublicKey(node_uuid)
> -        raise errors.SshUpdateError(
> -          "Could not update the SSH setup of node '%s' itself. Error: %s."
> -          % (node_name, last_exception))
> +    try:
> +      utils.RetryByNumberOfTimes(
> +          constants.SSHS_MAX_RETRIES,
> +          errors.SshUpdateError,
> +          run_cmd_fn, cluster_name, node_name, pathutils.SSH_UPDATE,
> +          ssh_port_map.get(node_name), node_data,
> +          debug=False, verbose=False, use_cluster_key=False,
> +          ask_key=False, strict_host_check=False)
> +    except errors.SshUpdateError as e:
> +      # Clean up the master's public key file if adding key fails
> +      if to_public_keys:
> +        ssh.RemovePublicKey(node_uuid)
> +      raise e
>
>    # Update all nodes except master and the target node
>    if to_authorized_keys:
> @@ -1573,29 +1566,25 @@ def AddNodeSshKey(node_uuid, node_name,
>        logging.debug("Skipping offline node '%s'.", node)
>        continue
>      if node in potential_master_candidates:
> -      logging.debug("Updating SSH key files of node '%s'.")
> -      for i in range(constants.SSHS_MAX_RETRIES):
> -        try:
> -          run_cmd_fn(cluster_name, node, pathutils.SSH_UPDATE,
> -                     ssh_port_map.get(node), pot_mc_data,
> -                     debug=False, verbose=False, use_cluster_key=False,
> -                     ask_key=False, strict_host_check=False)
> -          break
> -        except errors.OpExecError as e:
> -          logging.error("Updating SSH key files of node '%s' failed"
> -                        " at try no %s. Error: %s.", node, i, e)
> -          last_exception = e
> -      else:
> -        if last_exception:
> -          error_msg = ("When adding the key of node '%s', updating SSH
> key"
> -                       " files of node '%s' failed after %s retries."
> -                       " Not trying again. Last error was: %s." %
> -                       (node, node_name, constants.SSHS_MAX_RETRIES,
> -                        last_exception))
> -          node_errors.append((node, error_msg))
> -          # We only log the error and don't throw an exception, because
> -          # one unreachable node shall not abort the entire procedure.
> -          logging.error(error_msg)
> +      logging.debug("Updating SSH key files of node '%s'.", node)
> +      try:
> +        utils.RetryByNumberOfTimes(
> +            constants.SSHS_MAX_RETRIES,
> +            errors.SshUpdateError,
> +            run_cmd_fn, cluster_name, node, pathutils.SSH_UPDATE,
> +            ssh_port_map.get(node), pot_mc_data,
> +            debug=False, verbose=False, use_cluster_key=False,
> +            ask_key=False, strict_host_check=False)
> +      except errors.SshUpdateError as last_exception:
> +        error_msg = ("When adding the key of node '%s', updating SSH key"
> +                     " files of node '%s' failed after %s retries."
> +                     " Not trying again. Last errors was: %s." %
>

s/errors/error/


> +                     (node, node_name, constants.SSHS_MAX_RETRIES,
> +                      last_exception))
> +        node_errors.append((node, error_msg))
> +        # We only log the error and don't throw an exception, because
> +        # one unreachable node shall not abort the entire procedure.
> +        logging.error(error_msg)
>
>      else:
>        if to_authorized_keys:
> @@ -1731,48 +1720,41 @@ def RemoveNodeSshKey(node_uuid, node_name,
>            raise errors.OpExecError("No SSH port information available for"
>                                     " node '%s', map: %s." %
>                                     (node, ssh_port_map))
> -        error_msg_try = ("The SSH setup of node '%s' could not"
> -                         " be adjusted in try no %s. Error: %s.")
>          error_msg_final = ("When removing the key of node '%s', updating
> the"
> -                           " SSH key files of node '%s' failed after %s"
> -                           " retries. Not trying again. Last error was:
> %s.")
> +                           " SSH key files of node '%s' failed. Last
> error"
> +                           " was: %s.")
>          if node in potential_master_candidates:
> -          for i in range(constants.SSHS_MAX_RETRIES):
> -            try:
> -              run_cmd_fn(cluster_name, node, pathutils.SSH_UPDATE,
> -                         ssh_port, pot_mc_data,
> -                         debug=False, verbose=False,
> use_cluster_key=False,
> -                         ask_key=False, strict_host_check=False)
> -              break
> -            except errors.OpExecError as e:
> -              logging.error(error_msg_try, node, i, e)
> -              last_exception = e
> -          else:
> -            if last_exception:
> -              error_msg = error_msg_final % (
> -                  node_name, node, constants.SSHS_MAX_RETRIES,
> last_exception)
> -              result_msgs.append((node, error_msg))
> -              logging.error(error_msg)
> +          logging.debug("Updating key setup of potential master candidate
> node"
> +                        " %s.", node)
> +          try:
> +            utils.RetryByNumberOfTimes(
> +                constants.SSHS_MAX_RETRIES,
> +                errors.SshUpdateError,
> +                run_cmd_fn, cluster_name, node, pathutils.SSH_UPDATE,
> +                ssh_port, pot_mc_data,
> +                debug=False, verbose=False, use_cluster_key=False,
> +                ask_key=False, strict_host_check=False)
> +          except errors.SshUpdateError as last_exception:
> +            error_msg = error_msg_final % (
> +                node_name, node, last_exception)
> +            result_msgs.append((node, error_msg))
> +            logging.error(error_msg)
>
>          else:
> -          last_exception = None
> -          if from_authorized_keys:
>

This line appears to have gotten lost in the refactoring.


> -            for i in range(constants.SSHS_MAX_RETRIES):
> -              try:
> -                run_cmd_fn(cluster_name, node, pathutils.SSH_UPDATE,
> -                           ssh_port, base_data,
> -                           debug=False, verbose=False,
> use_cluster_key=False,
> -                           ask_key=False, strict_host_check=False)
> -                break
> -              except errors.OpExecError as e:
> -                logging.error(error_msg_try, node, i, e)
> -                last_exception = e
> -          else:
> -            if last_exception:
> -              error_msg = error_msg_final % (
> -                  node_name, node, constants.SSHS_MAX_RETRIES,
> last_exception)
> -              result_msgs.append((node, error_msg))
> -              logging.error(error_msg)
> +          logging.debug("Updating key setup of normal node %s.", node)
> +          try:
> +            utils.RetryByNumberOfTimes(
> +                constants.SSHS_MAX_RETRIES,
> +                errors.SshUpdateError,
> +                run_cmd_fn, cluster_name, node, pathutils.SSH_UPDATE,
> +                ssh_port, base_data,
> +                debug=False, verbose=False, use_cluster_key=False,
> +                ask_key=False, strict_host_check=False)
> +          except errors.SshUpdateError as last_exception:
> +            error_msg = error_msg_final % (
> +                node_name, node, last_exception)
> +            result_msgs.append((node, error_msg))
> +            logging.error(error_msg)
>
>    if clear_authorized_keys or from_public_keys or clear_public_keys:
>      data = {}
> @@ -1810,25 +1792,21 @@ def RemoveNodeSshKey(node_uuid, node_name,
>              constants.SSHS_SSH_AUTHORIZED_KEYS in data):
>        return
>
> -    last_exception = None
> -    for i in range(constants.SSHS_MAX_RETRIES):
> -      try:
> -        run_cmd_fn(cluster_name, node_name, pathutils.SSH_UPDATE,
> -                   ssh_port, data,
> -                   debug=False, verbose=False, use_cluster_key=False,
> -                   ask_key=False, strict_host_check=False)
> -      except errors.OpExecError as e:
> -        logging.error("Updating the SSH key files of node '%s', whose key"
> -                      " itself is being removed failed with try no %s."
> -                      " Error: %s", node_name, i, e)
> -        last_exception = e
> -    else:
> -      if last_exception:
> -        result_msgs.append(
> -            (node_name,
> -             ("Removing SSH keys from node '%s' failed after %s retries."
> -              " This can happen when the node is already unreachable."
> -              " Error: %s" % (node_name, constants.SSHS_MAX_RETRIES, e))))
> +    logging.debug("Updating SSH key setup of target node '%s'.",
> node_name)
> +    try:
> +      utils.RetryByNumberOfTimes(
> +          constants.SSHS_MAX_RETRIES,
> +          errors.SshUpdateError,
> +          run_cmd_fn, cluster_name, node_name, pathutils.SSH_UPDATE,
> +          ssh_port, data,
> +          debug=False, verbose=False, use_cluster_key=False,
> +          ask_key=False, strict_host_check=False)
> +    except errors.SshUpdateError as last_exception:
> +      result_msgs.append(
> +          (node_name,
> +           ("Removing SSH keys from node '%s' failed."
> +            " This can happen when the node is already unreachable."
> +            " Error: %s" % (node_name, last_exception))))
>
>    return result_msgs
>
> diff --git a/lib/utils/retry.py b/lib/utils/retry.py
> index c7567fe..382d0cb 100644
> --- a/lib/utils/retry.py
> +++ b/lib/utils/retry.py
> @@ -32,6 +32,7 @@
>  """
>
>
> +import logging
>  import time
>
>  from ganeti import errors
> @@ -230,3 +231,30 @@ def SimpleRetry(expected, fn, delay, timeout,
> args=None, wait_fn=time.sleep,
>      assert "result" in rdict
>      result = rdict["result"]
>    return result
> +
> +
> +def RetryByNumberOfTimes(max_retries, exception_class, fn, *args,
> **kwargs):
> +  """Retries calling a function up to the specified number of times.
> +
> +  @type max_retries: integer
> +  @param max_retries: maximum number of retries.
> +  @type exception_class: class
> +  @param exception_class: exception class which is used for throwing the
> +                          final exception.
> +  @type fn: callable
> +  @param fn: function to be called (up to the specified maximum number of
> +             retries.
>

Very minor nitpick: doclint capitalization in this comment does not match
the rest of the file. Could you fix that up?


> +
> +  """
> +  last_exception = None
> +  for i in range(max_retries):
> +    try:
> +      fn(*args, **kwargs)
> +      break
> +    except errors.OpExecError as e:
> +      logging.error("Error after retry no. %s: %s.", i, e)
> +      last_exception = e
> +  else:
> +    if last_exception:
> +      raise exception_class("Error after %s retries. Last exception: %s."
> +                            % (max_retries, last_exception))
> --
> 2.2.0.rc0.207.ga3a616c
>
>
As discussed offline, overall it would still be good to have delays. The
retries currently assume that the operations performed take enough time
that whatever transient problems are present might fix themselves.
Please make an issue and refactor later.



Hrvoje Ribicic
Ganeti Engineering
Google Germany GmbH
Dienerstr. 12, 80331, München

Registergericht und -nummer: Hamburg, HRB 86891
Sitz der Gesellschaft: Hamburg
Geschäftsführer: Graham Law, Christine Elizabeth Flores
Steuernummer: 48/725/00206
Umsatzsteueridentifikationsnummer: DE813741370

Reply via email to