With that as an explanation, LGTM. On Wed, Apr 22, 2015 at 4:37 PM, Helga Velroyen <[email protected]> wrote:
> > > On Wed, 22 Apr 2015 at 16:33 Hrvoje Ribicic <[email protected]> wrote: > >> On Fri, Apr 17, 2015 at 4:37 PM, 'Helga Velroyen' via ganeti-devel < >> [email protected]> wrote: >> >>> So far, if a SSH key of a node is added to the cluster >>> and updating the SSH setup of the node itself fails, >>> the entire operation fails quite disgraceful. This >>> patch introduces a retry mechanism and a proper cleanup >>> in case of a failure. >>> >>> Note that there are more places where retries would >>> be a good idea. They'll be added in other patches. >>> >>> Signed-off-by: Helga Velroyen <[email protected]> >>> --- >>> lib/backend.py | 28 +++++++++++++--- >>> test/py/ganeti.backend_unittest.py | 67 >>> ++++++++++++++++++++++++++++++++++++++ >>> test/py/testutils_ssh.py | 30 ++++++++++++++++- >>> 3 files changed, 120 insertions(+), 5 deletions(-) >>> >>> diff --git a/lib/backend.py b/lib/backend.py >>> index bc1a570..818f3fb 100644 >>> --- a/lib/backend.py >>> +++ b/lib/backend.py >>> @@ -1520,6 +1520,9 @@ def AddNodeSshKey(node_uuid, node_name, >>> _InitSshUpdateData(base_data, noded_cert_file, ssconf_store) >>> cluster_name = base_data[constants.SSHS_CLUSTER_NAME] >>> >>> + # number of retries for SSH connections >>> + max_retries = 3 >> >> + >>> # Update the target node itself >>> if get_public_keys: >>> node_data = {} >>> @@ -1527,10 +1530,27 @@ def AddNodeSshKey(node_uuid, node_name, >>> all_keys = ssh.QueryPubKeyFile(None, key_file=pub_key_file) >>> node_data[constants.SSHS_SSH_PUBLIC_KEYS] = \ >>> (constants.SSHS_OVERRIDE, all_keys) >>> - run_cmd_fn(cluster_name, node_name, pathutils.SSH_UPDATE, >>> - ssh_port_map.get(node_name), node_data, >>> - debug=False, verbose=False, use_cluster_key=False, >>> - ask_key=False, strict_host_check=False) >>> + >>> + last_exception = None >>> + for _ in range(max_retries): >>> + try: >>> + run_cmd_fn(cluster_name, node_name, pathutils.SSH_UPDATE, >>> + ssh_port_map.get(node_name), node_data, >>> + debug=False, verbose=False, use_cluster_key=False, >>> + ask_key=False, strict_host_check=False) >>> + break >>> + except errors.OpExecError as e: >>> + logging.error("Updating SSH key files of node '%s' failed." >>> + " Error: %s", node_name, e) >>> + last_exception = e >>> + else: >>> + if last_exception: >>> + # Clean up the master's public key file if adding key fails >>> + if to_public_keys: >>> + ssh.RemovePublicKey(node_uuid) >>> + raise errors.SshUpdateError( >>> + "Could not update the SSH setup of node '%s' itself. Error: >>> %s." >>> + % (node_name, last_exception)) >>> >>> # Update all nodes except master and the target node >>> if to_authorized_keys: >>> diff --git a/test/py/ganeti.backend_unittest.py b/test/py/ >>> ganeti.backend_unittest.py >>> index 5305c15..cc65501 100755 >>> --- a/test/py/ganeti.backend_unittest.py >>> +++ b/test/py/ganeti.backend_unittest.py >>> @@ -1400,6 +1400,73 @@ class >>> TestAddRemoveGenerateNodeSshKey(testutils.GanetiTestCase): >>> self.assertTrue(self._ssh_file_manager.NodeHasAuthorizedKey( >>> node, node_key)) >>> >>> + def testAddKeySuccessfullyWithRetries(self): >>> + new_node_name = "new_node_name" >>> + new_node_uuid = "new_node_uuid" >>> + new_node_key = "new_node_key" >>> + is_master_candidate = True >>> + is_potential_master_candidate = True >>> + is_master = False >>> + >>> + self._AddNewNodeToTestData( >>> + new_node_name, new_node_uuid, new_node_key, >>> + is_potential_master_candidate, is_master_candidate, >>> + is_master) >>> + self._ssh_file_manager.SetMaxRetries(new_node_name, 3) >>> + >>> + backend.AddNodeSshKey(new_node_uuid, new_node_name, >>> + self._potential_master_candidates, >>> + self._ssh_port_map, >>> + to_authorized_keys=is_master_candidate, >>> + to_public_keys=is_potential_master_candidate, >>> + get_public_keys=is_potential_master_candidate, >>> + pub_key_file=self._pub_key_file, >>> + ssconf_store=self._ssconf_mock, >>> + noded_cert_file=self.noded_cert_file, >>> + run_cmd_fn=self._run_cmd_mock) >>> + >>> + self._ssh_file_manager.PotentialMasterCandidatesOnlyHavePublicKey( >>> + new_node_name) >>> >> >> This looks like it needs an assertTrue. >> > > Actually, PotentialMasterCandidatesOnlyHavePublicKey is more an assertion > function than a boolean one, because it throws an exception instead of > returning false. I fixed that inconsistency for all occurrences in a later > patch. > > >> >> >>> + self.assertTrue(self._ssh_file_manager.AllNodesHaveAuthorizedKey( >>> + new_node_key)) >>> + >>> + def testAddKeyFailedWithRetries(self): >>> + new_node_name = "new_node_name" >>> + new_node_uuid = "new_node_uuid" >>> + new_node_key = "new_node_key" >>> + is_master_candidate = True >>> + is_potential_master_candidate = True >>> + is_master = False >>> + >>> + self._AddNewNodeToTestData( >>> + new_node_name, new_node_uuid, new_node_key, >>> + is_potential_master_candidate, is_master_candidate, >>> + is_master) >>> + self._ssh_file_manager.SetMaxRetries(new_node_name, 4) >>> + >>> + self.assertRaises( >>> + errors.SshUpdateError, backend.AddNodeSshKey, new_node_uuid, >>> + new_node_name, self._potential_master_candidates, >>> self._ssh_port_map, >>> + to_authorized_keys=is_master_candidate, >>> + to_public_keys=is_potential_master_candidate, >>> + get_public_keys=is_potential_master_candidate, >>> + pub_key_file=self._pub_key_file, >>> + ssconf_store=self._ssconf_mock, >>> + noded_cert_file=self.noded_cert_file, >>> + run_cmd_fn=self._run_cmd_mock) >>> + >>> + for node in self._all_nodes: >>> + if node == new_node_name: >>> + self.assertTrue(self._ssh_file_manager.NodeHasAuthorizedKey( >>> + node, new_node_key)) >>> + else: >>> + self.assertFalse(self._ssh_file_manager.NodeHasAuthorizedKey( >>> + node, new_node_key)) >>> + >>> + self.assertTrue(self._ssh_file_manager.NoNodeHasPublicKey( >>> + new_node_uuid, new_node_key)) >>> + >>> + >>> class TestVerifySshSetup(testutils.GanetiTestCase): >>> >>> _NODE1_UUID = "uuid1" >>> diff --git a/test/py/testutils_ssh.py b/test/py/testutils_ssh.py >>> index ff7db63..a180144 100644 >>> --- a/test/py/testutils_ssh.py >>> +++ b/test/py/testutils_ssh.py >>> @@ -32,6 +32,7 @@ >>> >>> from ganeti import constants >>> from ganeti import pathutils >>> +from ganeti import errors >>> >>> >>> class FakeSshFileManager(object): >>> @@ -66,6 +67,13 @@ class FakeSshFileManager(object): >>> self._public_keys = {} # dict of dicts >>> # Node name of the master node >>> self._master_node_name = None >>> + # Dictionary mapping nodes by name to number of retries where >>> 'RunCommand' >>> + # succeeds. For example if set to '3', RunCommand will fail two >>> times when >>> + # called for this node before it succeeds in the 3rd retry. >>> + self._max_retries = {} >>> + # Dictionary mapping nodes by name to number of retries which >>> + # 'RunCommand' has already carried out. >>> + self._retries = {} >>> >>> def _SetMasterNodeName(self): >>> self._master_node_name = [name for name, (_, _, _, _, master) >>> @@ -124,6 +132,17 @@ class FakeSshFileManager(object): >>> self._FillAuthorizedKeyOfOneNode(node) >>> self._SetMasterNodeName() >>> >>> + def SetMaxRetries(self, node_name, retries): >>> + """Set the number of unsuccessful retries of 'RunCommand' per node. >>> + >>> + @type node_name: string >>> + @param node_name: name of the node >>> + @type retries: integer >>> + @param retries: number of unsuccessful retries >>> + >>> + """ >>> + self._max_retries[node_name] = retries >>> + >>> def GetSshPortMap(self, port): >>> """Creates a SSH port map with all nodes mapped to the given port. >>> >>> @@ -281,7 +300,8 @@ class FakeSshFileManager(object): >>> for name in self._all_node_data.keys(): >>> node_pub_keys = self._public_keys[name] >>> if (uuid, key) in node_pub_keys.items(): >>> - return False >>> + raise Exception("Node '%s' does have public key '%s' of node >>> '%s'" >>> + % (name, key, uuid)) >>> return True >>> >>> def PotentialMasterCandidatesOnlyHavePublicKey(self, query_node_name): >>> @@ -326,6 +346,14 @@ class FakeSshFileManager(object): >>> of SSH keys. No actual key files of any node is touched. >>> >>> """ >>> + if node in self._max_retries: >>> + if node not in self._retries: >>> + self._retries[node] = 0 >>> + self._retries[node] += 1 >>> + if self._retries[node] < self._max_retries[node]: >>> + raise errors.OpExecError("(Fake) SSH connection to node '%s' >>> failed." >>> + % node) >>> + >>> assert base_cmd == pathutils.SSH_UPDATE >>> >>> if constants.SSHS_SSH_AUTHORIZED_KEYS in data: >>> -- >>> 2.2.0.rc0.207.ga3a616c >>> >>> >> Hrvoje Ribicic >> Ganeti Engineering >> Google Germany GmbH >> Dienerstr. 12, 80331, München >> >> Registergericht und -nummer: Hamburg, HRB 86891 >> Sitz der Gesellschaft: Hamburg >> Geschäftsführer: Graham Law, Christine Elizabeth Flores >> Steuernummer: 48/725/00206 >> Umsatzsteueridentifikationsnummer: DE813741370 >> > Hrvoje Ribicic Ganeti Engineering Google Germany GmbH Dienerstr. 12, 80331, München Registergericht und -nummer: Hamburg, HRB 86891 Sitz der Gesellschaft: Hamburg Geschäftsführer: Graham Law, Christine Elizabeth Flores Steuernummer: 48/725/00206 Umsatzsteueridentifikationsnummer: DE813741370
