LGTM, thanks On Tue, Apr 28, 2015 at 3:55 PM, Helga Velroyen <[email protected]> wrote:
> Hi! > > On Tue, 28 Apr 2015 at 15:42 Hrvoje Ribicic <[email protected]> wrote: > >> On Thu, Apr 23, 2015 at 5:30 PM, 'Helga Velroyen' via ganeti-devel < >> [email protected]> wrote: >> >>> This patch adds the RetryByNumberOfTimes utility function >>> which wraps around a given function and calls that one >>> up to a maximum number of times before raising a final >>> exception. >>> >>> This function is used in the SSH key updating code in >>> backend.py. >>> >>> Signed-off-by: Helga Velroyen <[email protected]> >>> --- >>> lib/backend.py | 178 >>> +++++++++++++++++++++++------------------------------ >>> lib/utils/retry.py | 28 +++++++++ >>> 2 files changed, 106 insertions(+), 100 deletions(-) >>> >>> diff --git a/lib/backend.py b/lib/backend.py >>> index 41f8a37..5b2b558 100644 >>> --- a/lib/backend.py >>> +++ b/lib/backend.py >>> @@ -1529,26 +1529,19 @@ def AddNodeSshKey(node_uuid, node_name, >>> node_data[constants.SSHS_SSH_PUBLIC_KEYS] = \ >>> (constants.SSHS_OVERRIDE, all_keys) >>> >>> - last_exception = None >>> - for _ in range(constants.SSHS_MAX_RETRIES): >>> - try: >>> - run_cmd_fn(cluster_name, node_name, pathutils.SSH_UPDATE, >>> - ssh_port_map.get(node_name), node_data, >>> - debug=False, verbose=False, use_cluster_key=False, >>> - ask_key=False, strict_host_check=False) >>> - break >>> - except errors.OpExecError as e: >>> - logging.error("Updating SSH key files of node '%s' failed." >>> - " Error: %s", node_name, e) >>> - last_exception = e >>> - else: >>> - if last_exception: >>> - # Clean up the master's public key file if adding key fails >>> - if to_public_keys: >>> - ssh.RemovePublicKey(node_uuid) >>> - raise errors.SshUpdateError( >>> - "Could not update the SSH setup of node '%s' itself. Error: >>> %s." >>> - % (node_name, last_exception)) >>> + try: >>> + utils.RetryByNumberOfTimes( >>> + constants.SSHS_MAX_RETRIES, >>> + errors.SshUpdateError, >>> + run_cmd_fn, cluster_name, node_name, pathutils.SSH_UPDATE, >>> + ssh_port_map.get(node_name), node_data, >>> + debug=False, verbose=False, use_cluster_key=False, >>> + ask_key=False, strict_host_check=False) >>> + except errors.SshUpdateError as e: >>> + # Clean up the master's public key file if adding key fails >>> + if to_public_keys: >>> + ssh.RemovePublicKey(node_uuid) >>> + raise e >>> >>> # Update all nodes except master and the target node >>> if to_authorized_keys: >>> @@ -1573,29 +1566,25 @@ def AddNodeSshKey(node_uuid, node_name, >>> logging.debug("Skipping offline node '%s'.", node) >>> continue >>> if node in potential_master_candidates: >>> - logging.debug("Updating SSH key files of node '%s'.") >>> - for i in range(constants.SSHS_MAX_RETRIES): >>> - try: >>> - run_cmd_fn(cluster_name, node, pathutils.SSH_UPDATE, >>> - ssh_port_map.get(node), pot_mc_data, >>> - debug=False, verbose=False, use_cluster_key=False, >>> - ask_key=False, strict_host_check=False) >>> - break >>> - except errors.OpExecError as e: >>> - logging.error("Updating SSH key files of node '%s' failed" >>> - " at try no %s. Error: %s.", node, i, e) >>> - last_exception = e >>> - else: >>> - if last_exception: >>> - error_msg = ("When adding the key of node '%s', updating SSH >>> key" >>> - " files of node '%s' failed after %s retries." >>> - " Not trying again. Last error was: %s." % >>> - (node, node_name, constants.SSHS_MAX_RETRIES, >>> - last_exception)) >>> - node_errors.append((node, error_msg)) >>> - # We only log the error and don't throw an exception, because >>> - # one unreachable node shall not abort the entire procedure. >>> - logging.error(error_msg) >>> + logging.debug("Updating SSH key files of node '%s'.", node) >>> + try: >>> + utils.RetryByNumberOfTimes( >>> + constants.SSHS_MAX_RETRIES, >>> + errors.SshUpdateError, >>> + run_cmd_fn, cluster_name, node, pathutils.SSH_UPDATE, >>> + ssh_port_map.get(node), pot_mc_data, >>> + debug=False, verbose=False, use_cluster_key=False, >>> + ask_key=False, strict_host_check=False) >>> + except errors.SshUpdateError as last_exception: >>> + error_msg = ("When adding the key of node '%s', updating SSH >>> key" >>> + " files of node '%s' failed after %s retries." >>> + " Not trying again. Last errors was: %s." % >>> >> >> s/errors/error/ >> > > ACK > > >> >> >>> + (node, node_name, constants.SSHS_MAX_RETRIES, >>> + last_exception)) >>> + node_errors.append((node, error_msg)) >>> + # We only log the error and don't throw an exception, because >>> + # one unreachable node shall not abort the entire procedure. >>> + logging.error(error_msg) >>> >>> else: >>> if to_authorized_keys: >>> @@ -1731,48 +1720,41 @@ def RemoveNodeSshKey(node_uuid, node_name, >>> raise errors.OpExecError("No SSH port information available >>> for" >>> " node '%s', map: %s." % >>> (node, ssh_port_map)) >>> - error_msg_try = ("The SSH setup of node '%s' could not" >>> - " be adjusted in try no %s. Error: %s.") >>> error_msg_final = ("When removing the key of node '%s', >>> updating the" >>> - " SSH key files of node '%s' failed after %s" >>> - " retries. Not trying again. Last error was: >>> %s.") >>> + " SSH key files of node '%s' failed. Last >>> error" >>> + " was: %s.") >>> if node in potential_master_candidates: >>> - for i in range(constants.SSHS_MAX_RETRIES): >>> - try: >>> - run_cmd_fn(cluster_name, node, pathutils.SSH_UPDATE, >>> - ssh_port, pot_mc_data, >>> - debug=False, verbose=False, >>> use_cluster_key=False, >>> - ask_key=False, strict_host_check=False) >>> - break >>> - except errors.OpExecError as e: >>> - logging.error(error_msg_try, node, i, e) >>> - last_exception = e >>> - else: >>> - if last_exception: >>> - error_msg = error_msg_final % ( >>> - node_name, node, constants.SSHS_MAX_RETRIES, >>> last_exception) >>> - result_msgs.append((node, error_msg)) >>> - logging.error(error_msg) >>> + logging.debug("Updating key setup of potential master >>> candidate node" >>> + " %s.", node) >>> + try: >>> + utils.RetryByNumberOfTimes( >>> + constants.SSHS_MAX_RETRIES, >>> + errors.SshUpdateError, >>> + run_cmd_fn, cluster_name, node, pathutils.SSH_UPDATE, >>> + ssh_port, pot_mc_data, >>> + debug=False, verbose=False, use_cluster_key=False, >>> + ask_key=False, strict_host_check=False) >>> + except errors.SshUpdateError as last_exception: >>> + error_msg = error_msg_final % ( >>> + node_name, node, last_exception) >>> + result_msgs.append((node, error_msg)) >>> + logging.error(error_msg) >>> >>> else: >>> - last_exception = None >>> - if from_authorized_keys: >>> >> >> This line appears to have gotten lost in the refactoring. >> > > Good catch, thanks! > > >> >> >>> - for i in range(constants.SSHS_MAX_RETRIES): >>> - try: >>> - run_cmd_fn(cluster_name, node, pathutils.SSH_UPDATE, >>> - ssh_port, base_data, >>> - debug=False, verbose=False, >>> use_cluster_key=False, >>> - ask_key=False, strict_host_check=False) >>> - break >>> - except errors.OpExecError as e: >>> - logging.error(error_msg_try, node, i, e) >>> - last_exception = e >>> - else: >>> - if last_exception: >>> - error_msg = error_msg_final % ( >>> - node_name, node, constants.SSHS_MAX_RETRIES, >>> last_exception) >>> - result_msgs.append((node, error_msg)) >>> - logging.error(error_msg) >>> + logging.debug("Updating key setup of normal node %s.", node) >>> + try: >>> + utils.RetryByNumberOfTimes( >>> + constants.SSHS_MAX_RETRIES, >>> + errors.SshUpdateError, >>> + run_cmd_fn, cluster_name, node, pathutils.SSH_UPDATE, >>> + ssh_port, base_data, >>> + debug=False, verbose=False, use_cluster_key=False, >>> + ask_key=False, strict_host_check=False) >>> + except errors.SshUpdateError as last_exception: >>> + error_msg = error_msg_final % ( >>> + node_name, node, last_exception) >>> + result_msgs.append((node, error_msg)) >>> + logging.error(error_msg) >>> >>> if clear_authorized_keys or from_public_keys or clear_public_keys: >>> data = {} >>> @@ -1810,25 +1792,21 @@ def RemoveNodeSshKey(node_uuid, node_name, >>> constants.SSHS_SSH_AUTHORIZED_KEYS in data): >>> return >>> >>> - last_exception = None >>> - for i in range(constants.SSHS_MAX_RETRIES): >>> - try: >>> - run_cmd_fn(cluster_name, node_name, pathutils.SSH_UPDATE, >>> - ssh_port, data, >>> - debug=False, verbose=False, use_cluster_key=False, >>> - ask_key=False, strict_host_check=False) >>> - except errors.OpExecError as e: >>> - logging.error("Updating the SSH key files of node '%s', whose >>> key" >>> - " itself is being removed failed with try no %s." >>> - " Error: %s", node_name, i, e) >>> - last_exception = e >>> - else: >>> - if last_exception: >>> - result_msgs.append( >>> - (node_name, >>> - ("Removing SSH keys from node '%s' failed after %s >>> retries." >>> - " This can happen when the node is already unreachable." >>> - " Error: %s" % (node_name, constants.SSHS_MAX_RETRIES, >>> e)))) >>> + logging.debug("Updating SSH key setup of target node '%s'.", >>> node_name) >>> + try: >>> + utils.RetryByNumberOfTimes( >>> + constants.SSHS_MAX_RETRIES, >>> + errors.SshUpdateError, >>> + run_cmd_fn, cluster_name, node_name, pathutils.SSH_UPDATE, >>> + ssh_port, data, >>> + debug=False, verbose=False, use_cluster_key=False, >>> + ask_key=False, strict_host_check=False) >>> + except errors.SshUpdateError as last_exception: >>> + result_msgs.append( >>> + (node_name, >>> + ("Removing SSH keys from node '%s' failed." >>> + " This can happen when the node is already unreachable." >>> + " Error: %s" % (node_name, last_exception)))) >>> >>> return result_msgs >>> >>> diff --git a/lib/utils/retry.py b/lib/utils/retry.py >>> index c7567fe..382d0cb 100644 >>> --- a/lib/utils/retry.py >>> +++ b/lib/utils/retry.py >>> @@ -32,6 +32,7 @@ >>> """ >>> >>> >>> +import logging >>> import time >>> >>> from ganeti import errors >>> @@ -230,3 +231,30 @@ def SimpleRetry(expected, fn, delay, timeout, >>> args=None, wait_fn=time.sleep, >>> assert "result" in rdict >>> result = rdict["result"] >>> return result >>> + >>> + >>> +def RetryByNumberOfTimes(max_retries, exception_class, fn, *args, >>> **kwargs): >>> + """Retries calling a function up to the specified number of times. >>> + >>> + @type max_retries: integer >>> + @param max_retries: maximum number of retries. >>> + @type exception_class: class >>> + @param exception_class: exception class which is used for throwing the >>> + final exception. >>> + @type fn: callable >>> + @param fn: function to be called (up to the specified maximum number >>> of >>> + retries. >>> >> >> Very minor nitpick: doclint capitalization in this comment does not match >> the rest of the file. Could you fix that up? >> > > ACK > > >> >> >>> + >>> + """ >>> + last_exception = None >>> + for i in range(max_retries): >>> + try: >>> + fn(*args, **kwargs) >>> + break >>> + except errors.OpExecError as e: >>> + logging.error("Error after retry no. %s: %s.", i, e) >>> + last_exception = e >>> + else: >>> + if last_exception: >>> + raise exception_class("Error after %s retries. Last exception: >>> %s." >>> + % (max_retries, last_exception)) >>> -- >>> 2.2.0.rc0.207.ga3a616c >>> >>> >> As discussed offline, overall it would still be good to have delays. The >> retries currently assume that the operations performed take enough time >> that whatever transient problems are present might fix themselves. >> Please make an issue and refactor later. >> > > Filed https://code.google.com/p/ganeti/issues/detail?id=1078 > > Interdiff: > diff --git a/lib/backend.py b/lib/backend.py > index 5b2b558..52eee65 100644 > --- a/lib/backend.py > +++ b/lib/backend.py > @@ -1578,7 +1578,7 @@ def AddNodeSshKey(node_uuid, node_name, > except errors.SshUpdateError as last_exception: > error_msg = ("When adding the key of node '%s', updating SSH key" > " files of node '%s' failed after %s retries." > - " Not trying again. Last errors was: %s." % > + " Not trying again. Last error was: %s." % > (node, node_name, constants.SSHS_MAX_RETRIES, > last_exception)) > node_errors.append((node, error_msg)) > @@ -1741,20 +1741,21 @@ def RemoveNodeSshKey(node_uuid, node_name, > logging.error(error_msg) > > else: > - logging.debug("Updating key setup of normal node %s.", node) > - try: > - utils.RetryByNumberOfTimes( > - constants.SSHS_MAX_RETRIES, > - errors.SshUpdateError, > - run_cmd_fn, cluster_name, node, pathutils.SSH_UPDATE, > - ssh_port, base_data, > - debug=False, verbose=False, use_cluster_key=False, > - ask_key=False, strict_host_check=False) > - except errors.SshUpdateError as last_exception: > - error_msg = error_msg_final % ( > - node_name, node, last_exception) > - result_msgs.append((node, error_msg)) > - logging.error(error_msg) > + if from_authorized_keys: > + logging.debug("Updating key setup of normal node %s.", node) > + try: > + utils.RetryByNumberOfTimes( > + constants.SSHS_MAX_RETRIES, > + errors.SshUpdateError, > + run_cmd_fn, cluster_name, node, pathutils.SSH_UPDATE, > + ssh_port, base_data, > + debug=False, verbose=False, use_cluster_key=False, > + ask_key=False, strict_host_check=False) > + except errors.SshUpdateError as last_exception: > + error_msg = error_msg_final % ( > + node_name, node, last_exception) > + result_msgs.append((node, error_msg)) > + logging.error(error_msg) > > if clear_authorized_keys or from_public_keys or clear_public_keys: > data = {} > diff --git a/lib/utils/retry.py b/lib/utils/retry.py > index 382d0cb..fda3fcd 100644 > --- a/lib/utils/retry.py > +++ b/lib/utils/retry.py > @@ -237,12 +237,12 @@ def RetryByNumberOfTimes(max_retries, > exception_class, fn, *args, **kwargs): > """Retries calling a function up to the specified number of times. > > @type max_retries: integer > - @param max_retries: maximum number of retries. > + @param max_retries: Maximum number of retries. > @type exception_class: class > - @param exception_class: exception class which is used for throwing the > + @param exception_class: Exception class which is used for throwing the > final exception. > @type fn: callable > - @param fn: function to be called (up to the specified maximum number of > + @param fn: Function to be called (up to the specified maximum number of > retries. > > """ > > > >> >> >> Hrvoje Ribicic >> Ganeti Engineering >> Google Germany GmbH >> Dienerstr. 12, 80331, München >> >> Registergericht und -nummer: Hamburg, HRB 86891 >> Sitz der Gesellschaft: Hamburg >> Geschäftsführer: Graham Law, Christine Elizabeth Flores >> Steuernummer: 48/725/00206 >> Umsatzsteueridentifikationsnummer: DE813741370 >> > Hrvoje Ribicic Ganeti Engineering Google Germany GmbH Dienerstr. 12, 80331, München Registergericht und -nummer: Hamburg, HRB 86891 Sitz der Gesellschaft: Hamburg Geschäftsführer: Graham Law, Christine Elizabeth Flores Steuernummer: 48/725/00206 Umsatzsteueridentifikationsnummer: DE813741370
