Hi! On Tue, 28 Apr 2015 at 15:42 Hrvoje Ribicic <[email protected]> wrote:
> On Thu, Apr 23, 2015 at 5:30 PM, 'Helga Velroyen' via ganeti-devel < > [email protected]> wrote: > >> This patch adds the RetryByNumberOfTimes utility function >> which wraps around a given function and calls that one >> up to a maximum number of times before raising a final >> exception. >> >> This function is used in the SSH key updating code in >> backend.py. >> >> Signed-off-by: Helga Velroyen <[email protected]> >> --- >> lib/backend.py | 178 >> +++++++++++++++++++++++------------------------------ >> lib/utils/retry.py | 28 +++++++++ >> 2 files changed, 106 insertions(+), 100 deletions(-) >> >> diff --git a/lib/backend.py b/lib/backend.py >> index 41f8a37..5b2b558 100644 >> --- a/lib/backend.py >> +++ b/lib/backend.py >> @@ -1529,26 +1529,19 @@ def AddNodeSshKey(node_uuid, node_name, >> node_data[constants.SSHS_SSH_PUBLIC_KEYS] = \ >> (constants.SSHS_OVERRIDE, all_keys) >> >> - last_exception = None >> - for _ in range(constants.SSHS_MAX_RETRIES): >> - try: >> - run_cmd_fn(cluster_name, node_name, pathutils.SSH_UPDATE, >> - ssh_port_map.get(node_name), node_data, >> - debug=False, verbose=False, use_cluster_key=False, >> - ask_key=False, strict_host_check=False) >> - break >> - except errors.OpExecError as e: >> - logging.error("Updating SSH key files of node '%s' failed." >> - " Error: %s", node_name, e) >> - last_exception = e >> - else: >> - if last_exception: >> - # Clean up the master's public key file if adding key fails >> - if to_public_keys: >> - ssh.RemovePublicKey(node_uuid) >> - raise errors.SshUpdateError( >> - "Could not update the SSH setup of node '%s' itself. Error: >> %s." >> - % (node_name, last_exception)) >> + try: >> + utils.RetryByNumberOfTimes( >> + constants.SSHS_MAX_RETRIES, >> + errors.SshUpdateError, >> + run_cmd_fn, cluster_name, node_name, pathutils.SSH_UPDATE, >> + ssh_port_map.get(node_name), node_data, >> + debug=False, verbose=False, use_cluster_key=False, >> + ask_key=False, strict_host_check=False) >> + except errors.SshUpdateError as e: >> + # Clean up the master's public key file if adding key fails >> + if to_public_keys: >> + ssh.RemovePublicKey(node_uuid) >> + raise e >> >> # Update all nodes except master and the target node >> if to_authorized_keys: >> @@ -1573,29 +1566,25 @@ def AddNodeSshKey(node_uuid, node_name, >> logging.debug("Skipping offline node '%s'.", node) >> continue >> if node in potential_master_candidates: >> - logging.debug("Updating SSH key files of node '%s'.") >> - for i in range(constants.SSHS_MAX_RETRIES): >> - try: >> - run_cmd_fn(cluster_name, node, pathutils.SSH_UPDATE, >> - ssh_port_map.get(node), pot_mc_data, >> - debug=False, verbose=False, use_cluster_key=False, >> - ask_key=False, strict_host_check=False) >> - break >> - except errors.OpExecError as e: >> - logging.error("Updating SSH key files of node '%s' failed" >> - " at try no %s. Error: %s.", node, i, e) >> - last_exception = e >> - else: >> - if last_exception: >> - error_msg = ("When adding the key of node '%s', updating SSH >> key" >> - " files of node '%s' failed after %s retries." >> - " Not trying again. Last error was: %s." % >> - (node, node_name, constants.SSHS_MAX_RETRIES, >> - last_exception)) >> - node_errors.append((node, error_msg)) >> - # We only log the error and don't throw an exception, because >> - # one unreachable node shall not abort the entire procedure. >> - logging.error(error_msg) >> + logging.debug("Updating SSH key files of node '%s'.", node) >> + try: >> + utils.RetryByNumberOfTimes( >> + constants.SSHS_MAX_RETRIES, >> + errors.SshUpdateError, >> + run_cmd_fn, cluster_name, node, pathutils.SSH_UPDATE, >> + ssh_port_map.get(node), pot_mc_data, >> + debug=False, verbose=False, use_cluster_key=False, >> + ask_key=False, strict_host_check=False) >> + except errors.SshUpdateError as last_exception: >> + error_msg = ("When adding the key of node '%s', updating SSH key" >> + " files of node '%s' failed after %s retries." >> + " Not trying again. Last errors was: %s." % >> > > s/errors/error/ > ACK > > >> + (node, node_name, constants.SSHS_MAX_RETRIES, >> + last_exception)) >> + node_errors.append((node, error_msg)) >> + # We only log the error and don't throw an exception, because >> + # one unreachable node shall not abort the entire procedure. >> + logging.error(error_msg) >> >> else: >> if to_authorized_keys: >> @@ -1731,48 +1720,41 @@ def RemoveNodeSshKey(node_uuid, node_name, >> raise errors.OpExecError("No SSH port information available >> for" >> " node '%s', map: %s." % >> (node, ssh_port_map)) >> - error_msg_try = ("The SSH setup of node '%s' could not" >> - " be adjusted in try no %s. Error: %s.") >> error_msg_final = ("When removing the key of node '%s', updating >> the" >> - " SSH key files of node '%s' failed after %s" >> - " retries. Not trying again. Last error was: >> %s.") >> + " SSH key files of node '%s' failed. Last >> error" >> + " was: %s.") >> if node in potential_master_candidates: >> - for i in range(constants.SSHS_MAX_RETRIES): >> - try: >> - run_cmd_fn(cluster_name, node, pathutils.SSH_UPDATE, >> - ssh_port, pot_mc_data, >> - debug=False, verbose=False, >> use_cluster_key=False, >> - ask_key=False, strict_host_check=False) >> - break >> - except errors.OpExecError as e: >> - logging.error(error_msg_try, node, i, e) >> - last_exception = e >> - else: >> - if last_exception: >> - error_msg = error_msg_final % ( >> - node_name, node, constants.SSHS_MAX_RETRIES, >> last_exception) >> - result_msgs.append((node, error_msg)) >> - logging.error(error_msg) >> + logging.debug("Updating key setup of potential master >> candidate node" >> + " %s.", node) >> + try: >> + utils.RetryByNumberOfTimes( >> + constants.SSHS_MAX_RETRIES, >> + errors.SshUpdateError, >> + run_cmd_fn, cluster_name, node, pathutils.SSH_UPDATE, >> + ssh_port, pot_mc_data, >> + debug=False, verbose=False, use_cluster_key=False, >> + ask_key=False, strict_host_check=False) >> + except errors.SshUpdateError as last_exception: >> + error_msg = error_msg_final % ( >> + node_name, node, last_exception) >> + result_msgs.append((node, error_msg)) >> + logging.error(error_msg) >> >> else: >> - last_exception = None >> - if from_authorized_keys: >> > > This line appears to have gotten lost in the refactoring. > Good catch, thanks! > > >> - for i in range(constants.SSHS_MAX_RETRIES): >> - try: >> - run_cmd_fn(cluster_name, node, pathutils.SSH_UPDATE, >> - ssh_port, base_data, >> - debug=False, verbose=False, >> use_cluster_key=False, >> - ask_key=False, strict_host_check=False) >> - break >> - except errors.OpExecError as e: >> - logging.error(error_msg_try, node, i, e) >> - last_exception = e >> - else: >> - if last_exception: >> - error_msg = error_msg_final % ( >> - node_name, node, constants.SSHS_MAX_RETRIES, >> last_exception) >> - result_msgs.append((node, error_msg)) >> - logging.error(error_msg) >> + logging.debug("Updating key setup of normal node %s.", node) >> + try: >> + utils.RetryByNumberOfTimes( >> + constants.SSHS_MAX_RETRIES, >> + errors.SshUpdateError, >> + run_cmd_fn, cluster_name, node, pathutils.SSH_UPDATE, >> + ssh_port, base_data, >> + debug=False, verbose=False, use_cluster_key=False, >> + ask_key=False, strict_host_check=False) >> + except errors.SshUpdateError as last_exception: >> + error_msg = error_msg_final % ( >> + node_name, node, last_exception) >> + result_msgs.append((node, error_msg)) >> + logging.error(error_msg) >> >> if clear_authorized_keys or from_public_keys or clear_public_keys: >> data = {} >> @@ -1810,25 +1792,21 @@ def RemoveNodeSshKey(node_uuid, node_name, >> constants.SSHS_SSH_AUTHORIZED_KEYS in data): >> return >> >> - last_exception = None >> - for i in range(constants.SSHS_MAX_RETRIES): >> - try: >> - run_cmd_fn(cluster_name, node_name, pathutils.SSH_UPDATE, >> - ssh_port, data, >> - debug=False, verbose=False, use_cluster_key=False, >> - ask_key=False, strict_host_check=False) >> - except errors.OpExecError as e: >> - logging.error("Updating the SSH key files of node '%s', whose >> key" >> - " itself is being removed failed with try no %s." >> - " Error: %s", node_name, i, e) >> - last_exception = e >> - else: >> - if last_exception: >> - result_msgs.append( >> - (node_name, >> - ("Removing SSH keys from node '%s' failed after %s retries." >> - " This can happen when the node is already unreachable." >> - " Error: %s" % (node_name, constants.SSHS_MAX_RETRIES, >> e)))) >> + logging.debug("Updating SSH key setup of target node '%s'.", >> node_name) >> + try: >> + utils.RetryByNumberOfTimes( >> + constants.SSHS_MAX_RETRIES, >> + errors.SshUpdateError, >> + run_cmd_fn, cluster_name, node_name, pathutils.SSH_UPDATE, >> + ssh_port, data, >> + debug=False, verbose=False, use_cluster_key=False, >> + ask_key=False, strict_host_check=False) >> + except errors.SshUpdateError as last_exception: >> + result_msgs.append( >> + (node_name, >> + ("Removing SSH keys from node '%s' failed." >> + " This can happen when the node is already unreachable." >> + " Error: %s" % (node_name, last_exception)))) >> >> return result_msgs >> >> diff --git a/lib/utils/retry.py b/lib/utils/retry.py >> index c7567fe..382d0cb 100644 >> --- a/lib/utils/retry.py >> +++ b/lib/utils/retry.py >> @@ -32,6 +32,7 @@ >> """ >> >> >> +import logging >> import time >> >> from ganeti import errors >> @@ -230,3 +231,30 @@ def SimpleRetry(expected, fn, delay, timeout, >> args=None, wait_fn=time.sleep, >> assert "result" in rdict >> result = rdict["result"] >> return result >> + >> + >> +def RetryByNumberOfTimes(max_retries, exception_class, fn, *args, >> **kwargs): >> + """Retries calling a function up to the specified number of times. >> + >> + @type max_retries: integer >> + @param max_retries: maximum number of retries. >> + @type exception_class: class >> + @param exception_class: exception class which is used for throwing the >> + final exception. >> + @type fn: callable >> + @param fn: function to be called (up to the specified maximum number of >> + retries. >> > > Very minor nitpick: doclint capitalization in this comment does not match > the rest of the file. Could you fix that up? > ACK > > >> + >> + """ >> + last_exception = None >> + for i in range(max_retries): >> + try: >> + fn(*args, **kwargs) >> + break >> + except errors.OpExecError as e: >> + logging.error("Error after retry no. %s: %s.", i, e) >> + last_exception = e >> + else: >> + if last_exception: >> + raise exception_class("Error after %s retries. Last exception: %s." >> + % (max_retries, last_exception)) >> -- >> 2.2.0.rc0.207.ga3a616c >> >> > As discussed offline, overall it would still be good to have delays. The > retries currently assume that the operations performed take enough time > that whatever transient problems are present might fix themselves. > Please make an issue and refactor later. > Filed https://code.google.com/p/ganeti/issues/detail?id=1078 Interdiff: diff --git a/lib/backend.py b/lib/backend.py index 5b2b558..52eee65 100644 --- a/lib/backend.py +++ b/lib/backend.py @@ -1578,7 +1578,7 @@ def AddNodeSshKey(node_uuid, node_name, except errors.SshUpdateError as last_exception: error_msg = ("When adding the key of node '%s', updating SSH key" " files of node '%s' failed after %s retries." - " Not trying again. Last errors was: %s." % + " Not trying again. Last error was: %s." % (node, node_name, constants.SSHS_MAX_RETRIES, last_exception)) node_errors.append((node, error_msg)) @@ -1741,20 +1741,21 @@ def RemoveNodeSshKey(node_uuid, node_name, logging.error(error_msg) else: - logging.debug("Updating key setup of normal node %s.", node) - try: - utils.RetryByNumberOfTimes( - constants.SSHS_MAX_RETRIES, - errors.SshUpdateError, - run_cmd_fn, cluster_name, node, pathutils.SSH_UPDATE, - ssh_port, base_data, - debug=False, verbose=False, use_cluster_key=False, - ask_key=False, strict_host_check=False) - except errors.SshUpdateError as last_exception: - error_msg = error_msg_final % ( - node_name, node, last_exception) - result_msgs.append((node, error_msg)) - logging.error(error_msg) + if from_authorized_keys: + logging.debug("Updating key setup of normal node %s.", node) + try: + utils.RetryByNumberOfTimes( + constants.SSHS_MAX_RETRIES, + errors.SshUpdateError, + run_cmd_fn, cluster_name, node, pathutils.SSH_UPDATE, + ssh_port, base_data, + debug=False, verbose=False, use_cluster_key=False, + ask_key=False, strict_host_check=False) + except errors.SshUpdateError as last_exception: + error_msg = error_msg_final % ( + node_name, node, last_exception) + result_msgs.append((node, error_msg)) + logging.error(error_msg) if clear_authorized_keys or from_public_keys or clear_public_keys: data = {} diff --git a/lib/utils/retry.py b/lib/utils/retry.py index 382d0cb..fda3fcd 100644 --- a/lib/utils/retry.py +++ b/lib/utils/retry.py @@ -237,12 +237,12 @@ def RetryByNumberOfTimes(max_retries, exception_class, fn, *args, **kwargs): """Retries calling a function up to the specified number of times. @type max_retries: integer - @param max_retries: maximum number of retries. + @param max_retries: Maximum number of retries. @type exception_class: class - @param exception_class: exception class which is used for throwing the + @param exception_class: Exception class which is used for throwing the final exception. @type fn: callable - @param fn: function to be called (up to the specified maximum number of + @param fn: Function to be called (up to the specified maximum number of retries. """ > > > Hrvoje Ribicic > Ganeti Engineering > Google Germany GmbH > Dienerstr. 12, 80331, München > > Registergericht und -nummer: Hamburg, HRB 86891 > Sitz der Gesellschaft: Hamburg > Geschäftsführer: Graham Law, Christine Elizabeth Flores > Steuernummer: 48/725/00206 > Umsatzsteueridentifikationsnummer: DE813741370 >
