This patch adds a backing-off mechanism to the function RetryByNumberOfTimes. This is useful for example when SSH connections fail in a flaky network. The original version of RetryByNumberOfTimes immediately retried failed SSH calls, but that might not be enough to recover from a network problem.
The patch adds an additional parameter 'backoff' which specifies the base number of seconds of the backoff. That means after the first failed try, a delay is added as long as the backoff parameter specifies. With each additional failed try, the delay is doubled until the maximum number of retries is hit. Note that the backoff parameter is not a keyword argument, which might have been more convenient. That's because otherwise RetryByNumberOfTimes would no longer be able to propagate *args and **kwargs to the function to be called with retries. Also note that there is a function "Retry" in the same package, which already provides somewhat complicated timeout capabilities. However, we did not merge these two functions, because Retry also does not propagate *args and **kwargs properly which is something we depend on in backend.py. This patch also updates the unit tests and mocks the sleep function in the backend.py's unit tests to not slow down the tests. This fixes issue 1078. Signed-off-by: Helga Velroyen <[email protected]> --- lib/backend.py | 18 +++++++++-------- lib/utils/retry.py | 19 ++++++++++++++++- test/py/ganeti.backend_unittest.py | 7 +++++++ test/py/ganeti.utils.retry_unittest.py | 37 ++++++++++++++++++++++++++++++---- 4 files changed, 68 insertions(+), 13 deletions(-) diff --git a/lib/backend.py b/lib/backend.py index 67e82a0..346b900 100644 --- a/lib/backend.py +++ b/lib/backend.py @@ -1559,8 +1559,9 @@ def AddNodeSshKeyBulk(node_list, (constants.SSHS_OVERRIDE, all_keys) try: + backoff = 5 # seconds utils.RetryByNumberOfTimes( - constants.SSHS_MAX_RETRIES, + constants.SSHS_MAX_RETRIES, backoff, errors.SshUpdateError, run_cmd_fn, cluster_name, node_info.name, pathutils.SSH_UPDATE, ssh_port_map.get(node_info.name), node_data, @@ -1605,9 +1606,9 @@ def AddNodeSshKeyBulk(node_list, if node in potential_master_candidates: logging.debug("Updating SSH key files of node '%s'.", node) try: + backoff = 5 # seconds utils.RetryByNumberOfTimes( - 
constants.SSHS_MAX_RETRIES, - errors.SshUpdateError, + constants.SSHS_MAX_RETRIES, backoff, errors.SshUpdateError, run_cmd_fn, cluster_name, node, pathutils.SSH_UPDATE, ssh_port_map.get(node), pot_mc_data, debug=ssh_update_debug, verbose=ssh_update_verbose, @@ -1875,9 +1876,9 @@ def RemoveNodeSshKeyBulk(node_list, logging.debug("Updating key setup of potential master candidate node" " %s.", node) try: + backoff = 5 # seconds utils.RetryByNumberOfTimes( - constants.SSHS_MAX_RETRIES, - errors.SshUpdateError, + constants.SSHS_MAX_RETRIES, backoff, errors.SshUpdateError, run_cmd_fn, cluster_name, node, pathutils.SSH_UPDATE, ssh_port, pot_mc_data, debug=ssh_update_debug, verbose=ssh_update_verbose, @@ -1892,9 +1893,9 @@ def RemoveNodeSshKeyBulk(node_list, if from_authorized_keys: logging.debug("Updating key setup of normal node %s.", node) try: + backoff = 5 # seconds utils.RetryByNumberOfTimes( - constants.SSHS_MAX_RETRIES, - errors.SshUpdateError, + constants.SSHS_MAX_RETRIES, backoff, errors.SshUpdateError, run_cmd_fn, cluster_name, node, pathutils.SSH_UPDATE, ssh_port, base_data, debug=ssh_update_debug, verbose=ssh_update_verbose, @@ -1946,8 +1947,9 @@ def RemoveNodeSshKeyBulk(node_list, logging.debug("Updating SSH key setup of target node '%s'.", node_info.name) try: + backoff = 5 # seconds utils.RetryByNumberOfTimes( - constants.SSHS_MAX_RETRIES, + constants.SSHS_MAX_RETRIES, backoff, errors.SshUpdateError, run_cmd_fn, cluster_name, node_info.name, pathutils.SSH_UPDATE, ssh_port, data, diff --git a/lib/utils/retry.py b/lib/utils/retry.py index 8079303..895cc0e 100644 --- a/lib/utils/retry.py +++ b/lib/utils/retry.py @@ -253,7 +253,8 @@ def CountRetry(expected, fn, count, args=None): wait_fn=inc_tries, _time_fn=get_tries) -def RetryByNumberOfTimes(max_retries, exception_class, fn, *args, **kwargs): +def RetryByNumberOfTimes(max_retries, backoff, exception_class, fn, *args, + **kwargs): """Retries calling a function up to the specified number of times. 
@type max_retries: integer @@ -264,9 +265,23 @@ def RetryByNumberOfTimes(max_retries, exception_class, fn, *args, **kwargs): @type fn: callable @param fn: Function to be called (up to the specified maximum number of retries. + @type backoff: int + @param backoff: this enables and configures the back off behavior after + failed tries. If value is '0', there will be no delay between failed + tries. If the value is a positive integer, it is interpreted as the + base length of the back off delay (in seconds). That means there will be a + delay between failed tries of the length specified in this parameter. With + each next retry, the delay is increased by the factor of two. For example, + if the value is '2', the first delay is 2 seconds, the second 4 seconds, + the third 8 seconds (until the max_retries are hit or the function call + succeeds). """ + if backoff < 0: + raise exception_class("Backoff must be a non-negative integer.") + last_exception = None + delay = backoff for i in range(max_retries): try: fn(*args, **kwargs) @@ -274,6 +289,8 @@ def RetryByNumberOfTimes(max_retries, exception_class, fn, *args, **kwargs): except errors.OpExecError as e: logging.error("Error after retry no. %s: %s.", i, e) last_exception = e + time.sleep(delay) + delay *= 2 else: if last_exception: raise exception_class("Error after %s retries. Last exception: %s." 
diff --git a/test/py/ganeti.backend_unittest.py b/test/py/ganeti.backend_unittest.py index 17e7dc7..2b1d71e 100755 --- a/test/py/ganeti.backend_unittest.py +++ b/test/py/ganeti.backend_unittest.py @@ -32,6 +32,7 @@ import collections import copy +import time import mock import os import shutil @@ -986,6 +987,11 @@ class TestAddRemoveGenerateNodeSshKey(testutils.GanetiTestCase): self._ssh_replace_name_by_uuid_mock.side_effect = \ self._ssh_file_manager.ReplaceNameByUuid + self._time_sleep_patcher = testutils \ + .patch_object(time, "sleep") + self._time_sleep_mock = \ + self._time_sleep_patcher.start() + self.noded_cert_file = testutils.TestDataFilename("cert1.pem") self._SetupTestData() @@ -998,6 +1004,7 @@ class TestAddRemoveGenerateNodeSshKey(testutils.GanetiTestCase): self._ssh_remove_public_key_patcher.stop() self._ssh_query_pub_key_file_patcher.stop() self._ssh_replace_name_by_uuid_patcher.stop() + self._time_sleep_patcher.stop() self._TearDownTestData() def _SetupTestData(self, number_of_nodes=15, number_of_pot_mcs=5, diff --git a/test/py/ganeti.utils.retry_unittest.py b/test/py/ganeti.utils.retry_unittest.py index e796f84..93638cd 100755 --- a/test/py/ganeti.utils.retry_unittest.py +++ b/test/py/ganeti.utils.retry_unittest.py @@ -31,6 +31,7 @@ """Script for testing ganeti.utils.retry""" import mock +import time import unittest from ganeti import constants @@ -213,12 +214,12 @@ class TestRetryByNumberOfTimes(testutils.GanetiTestCase): def testSuccessOnFirst(self): test_fn = mock.Mock() - utils.RetryByNumberOfTimes(5, Exception, test_fn) + utils.RetryByNumberOfTimes(5, 0, Exception, test_fn) test_fn.assert_called_once() def testSuccessOnFirstWithArgs(self): test_fn = mock.Mock() - utils.RetryByNumberOfTimes(5, Exception, test_fn, + utils.RetryByNumberOfTimes(5, 0, Exception, test_fn, "arg1", "arg2", kwarg1_key="kwarg1_value", kwarg2_key="kwarg2_value") test_fn.assert_called_with( "arg1", "arg2", kwarg1_key="kwarg1_value", kwarg2_key="kwarg2_value") @@ -235,7 
+236,7 @@ class TestRetryByNumberOfTimes(testutils.GanetiTestCase): else: return "I succeed." - utils.RetryByNumberOfTimes(self.max_tries, Exception, test_fn) + utils.RetryByNumberOfTimes(self.max_tries, 0, Exception, test_fn) def testFailAllTries(self): self.max_tries = 5 @@ -244,7 +245,35 @@ class TestRetryByNumberOfTimes(testutils.GanetiTestCase): raise errors.OpExecError("I fail!") self.assertRaises(Exception, utils.RetryByNumberOfTimes, self.max_tries, - Exception, test_fn) + 0, Exception, test_fn) + + @testutils.patch_object(time, "sleep") + def testBackoffZero(self, mock_sleep): + self.max_tries = 5 + + def test_fn(): + raise errors.OpExecError("I fail!") + + self.assertRaises(Exception, utils.RetryByNumberOfTimes, self.max_tries, + backoff=0, exception_class=Exception, fn=test_fn) + for call in mock_sleep.mock_calls: + self.assertEqual(mock.call(0), call) + + @testutils.patch_object(time, "sleep") + def testBackoffPositive(self, mock_sleep): + self.max_tries = 5 + + def test_fn(): + raise errors.OpExecError("I fail!") + + backoff = 3 + self.assertRaises(Exception, utils.RetryByNumberOfTimes, self.max_tries, + backoff=backoff, exception_class=Exception, fn=test_fn) + + expected_calls = [3, 6, 12, 24, 48] + for call_idx in range(len(mock_sleep.mock_calls)): + self.assertEqual(mock.call(expected_calls[call_idx]), + mock_sleep.mock_calls[call_idx]) if __name__ == "__main__": -- 2.6.0.rc2.230.g3dd15c0
