Hi! On Tue, 21 Apr 2015 at 15:16 Petr Pudlak <[email protected]> wrote:
> On Thu, Apr 16, 2015 at 05:32:00PM +0200, 'Helga Velroyen' via > ganeti-devel wrote: > >So far, if a SSH key of a node is added to the cluster > >and updating the SSH setup of the node itself fails, > >the entire operation fails quite disgraceful. This > >patch introduces a retry mechanism and a proper cleanup > >in case of a failure. > > > >Note that there are more places where retries would > >be a good idea. They'll be added in other patches. > > > >Signed-off-by: Helga Velroyen <[email protected]> > >--- > > lib/backend.py | 28 +++++++++++++--- > > test/py/ganeti.backend_unittest.py | 67 > ++++++++++++++++++++++++++++++++++++++ > > test/py/testutils_ssh.py | 30 ++++++++++++++++- > > 3 files changed, 120 insertions(+), 5 deletions(-) > > > >diff --git a/lib/backend.py b/lib/backend.py > >index bc1a570..818f3fb 100644 > >--- a/lib/backend.py > >+++ b/lib/backend.py > >@@ -1520,6 +1520,9 @@ def AddNodeSshKey(node_uuid, node_name, > > _InitSshUpdateData(base_data, noded_cert_file, ssconf_store) > > cluster_name = base_data[constants.SSHS_CLUSTER_NAME] > > > >+ # number of retries for SSH connections > >+ max_retries = 3 > > Perhaps it'd be useful to have the number as some global constant (and also > reference the constant in the tests). > True, thought about this as well. See interdiff below. Als note that this results in some interdiffs in the other 7 patches of that series, I will send them as reply to those respective patches. > > >+ > > # Update the target node itself > > if get_public_keys: > > node_data = {} > >@@ -1527,10 +1530,27 @@ def AddNodeSshKey(node_uuid, node_name, > > all_keys = ssh.QueryPubKeyFile(None, key_file=pub_key_file) > > node_data[constants.SSHS_SSH_PUBLIC_KEYS] = \ > > (constants.SSHS_OVERRIDE, all_keys) > >- run_cmd_fn(cluster_name, node_name, pathutils.SSH_UPDATE, > >- ssh_port_map.get(node_name), node_data, > >- debug=False, verbose=False, use_cluster_key=False, > >- ask_key=False, strict_host_check=False) > >+ > >+ last_exception = None > >+ for _ in range(max_retries): > >+ try: > >+ run_cmd_fn(cluster_name, node_name, pathutils.SSH_UPDATE, > >+ ssh_port_map.get(node_name), node_data, > >+ debug=False, verbose=False, use_cluster_key=False, > >+ ask_key=False, strict_host_check=False) > >+ break > >+ except errors.OpExecError as e: > >+ logging.error("Updating SSH key files of node '%s' failed." > >+ " Error: %s", node_name, e) > >+ last_exception = e > >+ else: > >+ if last_exception: > >+ # Clean up the master's public key file if adding key fails > >+ if to_public_keys: > >+ ssh.RemovePublicKey(node_uuid) > >+ raise errors.SshUpdateError( > >+ "Could not update the SSH setup of node '%s' itself. Error: > %s." > >+ % (node_name, last_exception)) > > Would there be somoe benefit from using utils.retry.Retry? The difference > is > that Retry is based on timeout instead of the number of invocations (which > can make tests a bit problematic under some circumstances), and that a > failure should return RetryAgain - it could just wrap the OpExecError. > > If not, it might make sense to have another utility function for this, with > a similar interface. > I would not like to use utils.retry.Retry as it is based on a timeout and that makes stuff (including the unit tests) awfully slow. I agree that having an extra utility function might be a good idea. Would you mind if I add it as a separate patch at the end of the series? I already have used this in the other 7 patches of this series, and I'd rather not merge that in now. > > > > > # Update all nodes except master and the target node > > if to_authorized_keys: > >diff --git a/test/py/ganeti.backend_unittest.py b/test/py/ > ganeti.backend_unittest.py > >index 5305c15..cc65501 100755 > >--- a/test/py/ganeti.backend_unittest.py > >+++ b/test/py/ganeti.backend_unittest.py > >@@ -1400,6 +1400,73 @@ class > TestAddRemoveGenerateNodeSshKey(testutils.GanetiTestCase): > > self.assertTrue(self._ssh_file_manager.NodeHasAuthorizedKey( > > node, node_key)) > > > >+ def testAddKeySuccessfullyWithRetries(self): > >+ new_node_name = "new_node_name" > >+ new_node_uuid = "new_node_uuid" > >+ new_node_key = "new_node_key" > >+ is_master_candidate = True > >+ is_potential_master_candidate = True > >+ is_master = False > >+ > >+ self._AddNewNodeToTestData( > >+ new_node_name, new_node_uuid, new_node_key, > >+ is_potential_master_candidate, is_master_candidate, > >+ is_master) > >+ self._ssh_file_manager.SetMaxRetries(new_node_name, 3) > >+ > >+ backend.AddNodeSshKey(new_node_uuid, new_node_name, > >+ self._potential_master_candidates, > >+ self._ssh_port_map, > >+ to_authorized_keys=is_master_candidate, > >+ to_public_keys=is_potential_master_candidate, > >+ get_public_keys=is_potential_master_candidate, > >+ pub_key_file=self._pub_key_file, > >+ ssconf_store=self._ssconf_mock, > >+ noded_cert_file=self.noded_cert_file, > >+ run_cmd_fn=self._run_cmd_mock) > >+ > >+ self._ssh_file_manager.PotentialMasterCandidatesOnlyHavePublicKey( > >+ new_node_name) > >+ self.assertTrue(self._ssh_file_manager.AllNodesHaveAuthorizedKey( > >+ new_node_key)) > >+ > >+ def testAddKeyFailedWithRetries(self): > >+ new_node_name = "new_node_name" > >+ new_node_uuid = "new_node_uuid" > >+ new_node_key = "new_node_key" > >+ is_master_candidate = True > >+ is_potential_master_candidate = True > >+ is_master = False > >+ > >+ self._AddNewNodeToTestData( > >+ new_node_name, new_node_uuid, new_node_key, > >+ is_potential_master_candidate, is_master_candidate, > >+ is_master) > >+ self._ssh_file_manager.SetMaxRetries(new_node_name, 4) > >+ > >+ self.assertRaises( > >+ errors.SshUpdateError, backend.AddNodeSshKey, new_node_uuid, > >+ new_node_name, self._potential_master_candidates, > self._ssh_port_map, > >+ to_authorized_keys=is_master_candidate, > >+ to_public_keys=is_potential_master_candidate, > >+ get_public_keys=is_potential_master_candidate, > >+ pub_key_file=self._pub_key_file, > >+ ssconf_store=self._ssconf_mock, > >+ noded_cert_file=self.noded_cert_file, > >+ run_cmd_fn=self._run_cmd_mock) > >+ > >+ for node in self._all_nodes: > >+ if node == new_node_name: > >+ self.assertTrue(self._ssh_file_manager.NodeHasAuthorizedKey( > >+ node, new_node_key)) > >+ else: > >+ self.assertFalse(self._ssh_file_manager.NodeHasAuthorizedKey( > >+ node, new_node_key)) > > It might be useful to add a helper (probably in a later patch?) to the ssh > file manager for checking that all nodes (and only those) in a given set > have a given authorized key - a similar loop is already used in a few other > places. > Good point. As you suggest, I'll add that in an additional patch. > > >+ > >+ self.assertTrue(self._ssh_file_manager.NoNodeHasPublicKey( > >+ new_node_uuid, new_node_key)) > >+ > >+ > > class TestVerifySshSetup(testutils.GanetiTestCase): > > > > _NODE1_UUID = "uuid1" > >diff --git a/test/py/testutils_ssh.py b/test/py/testutils_ssh.py > >index a65c44e..b7b4e82 100644 > >--- a/test/py/testutils_ssh.py > >+++ b/test/py/testutils_ssh.py > >@@ -32,6 +32,7 @@ > > > > from ganeti import constants > > from ganeti import pathutils > >+from ganeti import errors > > > > > > class FakeSshFileManager(object): > >@@ -66,6 +67,13 @@ class FakeSshFileManager(object): > > self._public_keys = {} # dict of dicts > > # Node name of the master node > > self._master_node_name = None > >+ # Dictionary mapping nodes by name to number of retries where > 'RunCommand' > >+ # succeeds. For example if set to '3', RunCommand will fail two > times when > >+ # called for this node before it succeeds in the 3rd retry. > >+ self._max_retries = {} > >+ # Dictionary mapping nodes by name to number of retries which > >+ # 'RunCommand' has already carried out. > >+ self._retries = {} > > Maybe only one dictionary would suffice. When setting a retry, we'd set > self._retries[node] to a given number and let an operation succeed only if > the number is positive (decrementing otherwise). Or do we need to keep the > max. number of retries while resetting the counter at some point? > While I understand that your desire here is to get rid of one variable, I'd rather not do that. The reason is that this would count down instead of up like it is done in the implementation that this class is emulating. For the sanity of any developer who has to work with this in the future, I'd rather keep those similar to not add any additional confusion. Would that be okay for you? > > > > > def _SetMasterNodeName(self): > > self._master_node_name = [name for name, (_, _, _, _, master) > >@@ -138,6 +146,17 @@ class FakeSshFileManager(object): > > self._FillAuthorizedKeyOfOneNode(node) > > self._SetMasterNodeName() > > > >+ def SetMaxRetries(self, node_name, retries): > >+ """Set the number of unsuccessful retries of 'RunCommand' per node. > >+ > >+ @type node_name: string > >+ @param node_name: name of the node > >+ @type retries: integer > >+ @param retries: number of unsuccessful retries > >+ > >+ """ > >+ self._max_retries[node_name] = retries > >+ > > def GetSshPortMap(self, port): > > """Creates a SSH port map with all nodes mapped to the given port. > > > >@@ -297,7 +316,8 @@ class FakeSshFileManager(object): > > for name in self._all_node_data.keys(): > > node_pub_keys = self._public_keys[name] > > if (uuid, key) in node_pub_keys.items(): > >- return False > >+ raise Exception("Node '%s' does have public key '%s' of node > '%s'" > >+ % (name, key, uuid)) > > This change is somewhat confusing, and seems to me unrelated to the patch. > The method used to return True/False, but after that change it's either > True > or an exception. So I'd either be in favor of True/False or None/exception. > And currently all the tests in the backend_unittest use assertTrue/False on > the result. > I agree. I realized for debugging any problems, I not only need to know that some node is not configured correctly, but which node. Therefore I started throwing some exceptions to add additional information. I realized later that this is kinda inconsistent. I fixed this for all occurrences in patch: [PATCH stable-2.13 14/15] SSH file manager: properly name assertion function Therefore, can you let this slip here as it is not breaking anything and just a little bending of semantics? > > > return True > > > > def PotentialMasterCandidatesOnlyHavePublicKey(self, query_node_name): > >@@ -342,6 +362,14 @@ class FakeSshFileManager(object): > > of SSH keys. No actual key files of any node is touched. > > > > """ > >+ if node in self._max_retries: > >+ if node not in self._retries: > >+ self._retries[node] = 0 > >+ self._retries[node] += 1 > >+ if self._retries[node] < self._max_retries[node]: > >+ raise errors.OpExecError("(Fake) SSH connection to node '%s' > failed." > >+ % node) > >+ > > assert base_cmd == pathutils.SSH_UPDATE > > > > if constants.SSHS_SSH_AUTHORIZED_KEYS in data: > >-- > >2.2.0.rc0.207.ga3a616c > > > Interdiff for the max_retries constant: diff --git a/lib/backend.py b/lib/backend.py index 818f3fb..fd30bd8 100644 --- a/lib/backend.py +++ b/lib/backend.py @@ -1520,10 +1520,8 @@ def AddNodeSshKey(node_uuid, node_name, _InitSshUpdateData(base_data, noded_cert_file, ssconf_store) cluster_name = base_data[constants.SSHS_CLUSTER_NAME] - # number of retries for SSH connections - max_retries = 3 - # Update the target node itself + logging.debug("Updating SSH key files of target node '%s'.", node_name) if get_public_keys: node_data = {} _InitSshUpdateData(node_data, noded_cert_file, ssconf_store) @@ -1532,7 +1530,7 @@ def AddNodeSshKey(node_uuid, node_name, (constants.SSHS_OVERRIDE, all_keys) last_exception = None - for _ in range(max_retries): + for _ in range(constants.SSHS_MAX_RETRIES): try: run_cmd_fn(cluster_name, node_name, pathutils.SSH_UPDATE, ssh_port_map.get(node_name), node_data, diff --git a/src/Ganeti/Constants.hs b/src/Ganeti/Constants.hs index 01bf435..eddadf3 100644 --- a/src/Ganeti/Constants.hs +++ b/src/Ganeti/Constants.hs @@ -4563,6 +4563,9 @@ sshsSshPublicKeys = "public_keys" sshsNodeDaemonCertificate :: String sshsNodeDaemonCertificate = "node_daemon_certificate" +sshsMaxRetries :: Integer +sshsMaxRetries = 3 + sshsAdd :: String sshsAdd = "add" diff --git a/test/py/ganeti.backend_unittest.py b/test/py/ ganeti.backend_unittest.py index cc65501..288ed15 100755 --- a/test/py/ganeti.backend_unittest.py +++ b/test/py/ganeti.backend_unittest.py @@ -1412,7 +1412,8 @@ class TestAddRemoveGenerateNodeSshKey(testutils.GanetiTestCase): new_node_name, new_node_uuid, new_node_key, is_potential_master_candidate, is_master_candidate, is_master) - self._ssh_file_manager.SetMaxRetries(new_node_name, 3) + self._ssh_file_manager.SetMaxRetries( + new_node_name, constants.SSHS_MAX_RETRIES) backend.AddNodeSshKey(new_node_uuid, new_node_name, self._potential_master_candidates, @@ -1442,7 +1443,8 @@ class TestAddRemoveGenerateNodeSshKey(testutils.GanetiTestCase): new_node_name, new_node_uuid, new_node_key, is_potential_master_candidate, is_master_candidate, is_master) - self._ssh_file_manager.SetMaxRetries(new_node_name, 4) + self._ssh_file_manager.SetMaxRetries( + new_node_name, constants.SSHS_MAX_RETRIES + 1) self.assertRaises( errors.SshUpdateError, backend.AddNodeSshKey, new_node_uuid,
