So far, if a SSH key of a node is added to the cluster and updating the SSH setup of the node itself fails, the entire operation fails quite disgraceful. This patch introduces a retry mechanism and a proper cleanup in case of a failure.
Note that there are more places where retries would be a good idea. They'll be added in other patches. Signed-off-by: Helga Velroyen <[email protected]> --- lib/backend.py | 28 +++++++++++++--- test/py/ganeti.backend_unittest.py | 67 ++++++++++++++++++++++++++++++++++++++ test/py/testutils_ssh.py | 30 ++++++++++++++++- 3 files changed, 120 insertions(+), 5 deletions(-) diff --git a/lib/backend.py b/lib/backend.py index bc1a570..818f3fb 100644 --- a/lib/backend.py +++ b/lib/backend.py @@ -1520,6 +1520,9 @@ def AddNodeSshKey(node_uuid, node_name, _InitSshUpdateData(base_data, noded_cert_file, ssconf_store) cluster_name = base_data[constants.SSHS_CLUSTER_NAME] + # number of retries for SSH connections + max_retries = 3 + # Update the target node itself if get_public_keys: node_data = {} @@ -1527,10 +1530,27 @@ def AddNodeSshKey(node_uuid, node_name, all_keys = ssh.QueryPubKeyFile(None, key_file=pub_key_file) node_data[constants.SSHS_SSH_PUBLIC_KEYS] = \ (constants.SSHS_OVERRIDE, all_keys) - run_cmd_fn(cluster_name, node_name, pathutils.SSH_UPDATE, - ssh_port_map.get(node_name), node_data, - debug=False, verbose=False, use_cluster_key=False, - ask_key=False, strict_host_check=False) + + last_exception = None + for _ in range(max_retries): + try: + run_cmd_fn(cluster_name, node_name, pathutils.SSH_UPDATE, + ssh_port_map.get(node_name), node_data, + debug=False, verbose=False, use_cluster_key=False, + ask_key=False, strict_host_check=False) + break + except errors.OpExecError as e: + logging.error("Updating SSH key files of node '%s' failed." + " Error: %s", node_name, e) + last_exception = e + else: + if last_exception: + # Clean up the master's public key file if adding key fails + if to_public_keys: + ssh.RemovePublicKey(node_uuid) + raise errors.SshUpdateError( + "Could not update the SSH setup of node '%s' itself. Error: %s." + % (node_name, last_exception)) # Update all nodes except master and the target node if to_authorized_keys: diff --git a/test/py/ganeti.backend_unittest.py b/test/py/ganeti.backend_unittest.py index 5305c15..cc65501 100755 --- a/test/py/ganeti.backend_unittest.py +++ b/test/py/ganeti.backend_unittest.py @@ -1400,6 +1400,73 @@ class TestAddRemoveGenerateNodeSshKey(testutils.GanetiTestCase): self.assertTrue(self._ssh_file_manager.NodeHasAuthorizedKey( node, node_key)) + def testAddKeySuccessfullyWithRetries(self): + new_node_name = "new_node_name" + new_node_uuid = "new_node_uuid" + new_node_key = "new_node_key" + is_master_candidate = True + is_potential_master_candidate = True + is_master = False + + self._AddNewNodeToTestData( + new_node_name, new_node_uuid, new_node_key, + is_potential_master_candidate, is_master_candidate, + is_master) + self._ssh_file_manager.SetMaxRetries(new_node_name, 3) + + backend.AddNodeSshKey(new_node_uuid, new_node_name, + self._potential_master_candidates, + self._ssh_port_map, + to_authorized_keys=is_master_candidate, + to_public_keys=is_potential_master_candidate, + get_public_keys=is_potential_master_candidate, + pub_key_file=self._pub_key_file, + ssconf_store=self._ssconf_mock, + noded_cert_file=self.noded_cert_file, + run_cmd_fn=self._run_cmd_mock) + + self._ssh_file_manager.PotentialMasterCandidatesOnlyHavePublicKey( + new_node_name) + self.assertTrue(self._ssh_file_manager.AllNodesHaveAuthorizedKey( + new_node_key)) + + def testAddKeyFailedWithRetries(self): + new_node_name = "new_node_name" + new_node_uuid = "new_node_uuid" + new_node_key = "new_node_key" + is_master_candidate = True + is_potential_master_candidate = True + is_master = False + + self._AddNewNodeToTestData( + new_node_name, new_node_uuid, new_node_key, + is_potential_master_candidate, is_master_candidate, + is_master) + self._ssh_file_manager.SetMaxRetries(new_node_name, 4) + + self.assertRaises( + errors.SshUpdateError, backend.AddNodeSshKey, new_node_uuid, + new_node_name, self._potential_master_candidates, self._ssh_port_map, + to_authorized_keys=is_master_candidate, + to_public_keys=is_potential_master_candidate, + get_public_keys=is_potential_master_candidate, + pub_key_file=self._pub_key_file, + ssconf_store=self._ssconf_mock, + noded_cert_file=self.noded_cert_file, + run_cmd_fn=self._run_cmd_mock) + + for node in self._all_nodes: + if node == new_node_name: + self.assertTrue(self._ssh_file_manager.NodeHasAuthorizedKey( + node, new_node_key)) + else: + self.assertFalse(self._ssh_file_manager.NodeHasAuthorizedKey( + node, new_node_key)) + + self.assertTrue(self._ssh_file_manager.NoNodeHasPublicKey( + new_node_uuid, new_node_key)) + + class TestVerifySshSetup(testutils.GanetiTestCase): _NODE1_UUID = "uuid1" diff --git a/test/py/testutils_ssh.py b/test/py/testutils_ssh.py index a65c44e..b7b4e82 100644 --- a/test/py/testutils_ssh.py +++ b/test/py/testutils_ssh.py @@ -32,6 +32,7 @@ from ganeti import constants from ganeti import pathutils +from ganeti import errors class FakeSshFileManager(object): @@ -66,6 +67,13 @@ class FakeSshFileManager(object): self._public_keys = {} # dict of dicts # Node name of the master node self._master_node_name = None + # Dictionary mapping nodes by name to number of retries where 'RunCommand' + # succeeds. For example if set to '3', RunCommand will fail two times when + # called for this node before it succeeds in the 3rd retry. + self._max_retries = {} + # Dictionary mapping nodes by name to number of retries which + # 'RunCommand' has already carried out. + self._retries = {} def _SetMasterNodeName(self): self._master_node_name = [name for name, (_, _, _, _, master) @@ -138,6 +146,17 @@ class FakeSshFileManager(object): self._FillAuthorizedKeyOfOneNode(node) self._SetMasterNodeName() + def SetMaxRetries(self, node_name, retries): + """Set the number of unsuccessful retries of 'RunCommand' per node. + + @type node_name: string + @param node_name: name of the node + @type retries: integer + @param retries: number of unsuccessful retries + + """ + self._max_retries[node_name] = retries + def GetSshPortMap(self, port): """Creates a SSH port map with all nodes mapped to the given port. @@ -297,7 +316,8 @@ class FakeSshFileManager(object): for name in self._all_node_data.keys(): node_pub_keys = self._public_keys[name] if (uuid, key) in node_pub_keys.items(): - return False + raise Exception("Node '%s' does have public key '%s' of node '%s'" + % (name, key, uuid)) return True def PotentialMasterCandidatesOnlyHavePublicKey(self, query_node_name): @@ -342,6 +362,14 @@ class FakeSshFileManager(object): of SSH keys. No actual key files of any node is touched. """ + if node in self._max_retries: + if node not in self._retries: + self._retries[node] = 0 + self._retries[node] += 1 + if self._retries[node] < self._max_retries[node]: + raise errors.OpExecError("(Fake) SSH connection to node '%s' failed." + % node) + assert base_cmd == pathutils.SSH_UPDATE if constants.SSHS_SSH_AUTHORIZED_KEYS in data: -- 2.2.0.rc0.207.ga3a616c
