On Fri, Apr 17, 2015 at 4:37 PM, 'Helga Velroyen' via ganeti-devel < [email protected]> wrote:
> So far, if a SSH key of a node is added to the cluster > and updating the SSH setup of the node itself fails, > the entire operation fails quite disgraceful. This > patch introduces a retry mechanism and a proper cleanup > in case of a failure. > > Note that there are more places where retries would > be a good idea. They'll be added in other patches. > > Signed-off-by: Helga Velroyen <[email protected]> > --- > lib/backend.py | 28 +++++++++++++--- > test/py/ganeti.backend_unittest.py | 67 > ++++++++++++++++++++++++++++++++++++++ > test/py/testutils_ssh.py | 30 ++++++++++++++++- > 3 files changed, 120 insertions(+), 5 deletions(-) > > diff --git a/lib/backend.py b/lib/backend.py > index bc1a570..818f3fb 100644 > --- a/lib/backend.py > +++ b/lib/backend.py > @@ -1520,6 +1520,9 @@ def AddNodeSshKey(node_uuid, node_name, > _InitSshUpdateData(base_data, noded_cert_file, ssconf_store) > cluster_name = base_data[constants.SSHS_CLUSTER_NAME] > > + # number of retries for SSH connections > + max_retries = 3 + > # Update the target node itself > if get_public_keys: > node_data = {} > @@ -1527,10 +1530,27 @@ def AddNodeSshKey(node_uuid, node_name, > all_keys = ssh.QueryPubKeyFile(None, key_file=pub_key_file) > node_data[constants.SSHS_SSH_PUBLIC_KEYS] = \ > (constants.SSHS_OVERRIDE, all_keys) > - run_cmd_fn(cluster_name, node_name, pathutils.SSH_UPDATE, > - ssh_port_map.get(node_name), node_data, > - debug=False, verbose=False, use_cluster_key=False, > - ask_key=False, strict_host_check=False) > + > + last_exception = None > + for _ in range(max_retries): > + try: > + run_cmd_fn(cluster_name, node_name, pathutils.SSH_UPDATE, > + ssh_port_map.get(node_name), node_data, > + debug=False, verbose=False, use_cluster_key=False, > + ask_key=False, strict_host_check=False) > + break > + except errors.OpExecError as e: > + logging.error("Updating SSH key files of node '%s' failed." > + " Error: %s", node_name, e) > + last_exception = e > + else: > + if last_exception: > + # Clean up the master's public key file if adding key fails > + if to_public_keys: > + ssh.RemovePublicKey(node_uuid) > + raise errors.SshUpdateError( > + "Could not update the SSH setup of node '%s' itself. Error: %s." > + % (node_name, last_exception)) > > # Update all nodes except master and the target node > if to_authorized_keys: > diff --git a/test/py/ganeti.backend_unittest.py b/test/py/ > ganeti.backend_unittest.py > index 5305c15..cc65501 100755 > --- a/test/py/ganeti.backend_unittest.py > +++ b/test/py/ganeti.backend_unittest.py > @@ -1400,6 +1400,73 @@ class > TestAddRemoveGenerateNodeSshKey(testutils.GanetiTestCase): > self.assertTrue(self._ssh_file_manager.NodeHasAuthorizedKey( > node, node_key)) > > + def testAddKeySuccessfullyWithRetries(self): > + new_node_name = "new_node_name" > + new_node_uuid = "new_node_uuid" > + new_node_key = "new_node_key" > + is_master_candidate = True > + is_potential_master_candidate = True > + is_master = False > + > + self._AddNewNodeToTestData( > + new_node_name, new_node_uuid, new_node_key, > + is_potential_master_candidate, is_master_candidate, > + is_master) > + self._ssh_file_manager.SetMaxRetries(new_node_name, 3) > + > + backend.AddNodeSshKey(new_node_uuid, new_node_name, > + self._potential_master_candidates, > + self._ssh_port_map, > + to_authorized_keys=is_master_candidate, > + to_public_keys=is_potential_master_candidate, > + get_public_keys=is_potential_master_candidate, > + pub_key_file=self._pub_key_file, > + ssconf_store=self._ssconf_mock, > + noded_cert_file=self.noded_cert_file, > + run_cmd_fn=self._run_cmd_mock) > + > + self._ssh_file_manager.PotentialMasterCandidatesOnlyHavePublicKey( > + new_node_name) > This looks like it needs an assertTrue. > + self.assertTrue(self._ssh_file_manager.AllNodesHaveAuthorizedKey( > + new_node_key)) > + > + def testAddKeyFailedWithRetries(self): > + new_node_name = "new_node_name" > + new_node_uuid = "new_node_uuid" > + new_node_key = "new_node_key" > + is_master_candidate = True > + is_potential_master_candidate = True > + is_master = False > + > + self._AddNewNodeToTestData( > + new_node_name, new_node_uuid, new_node_key, > + is_potential_master_candidate, is_master_candidate, > + is_master) > + self._ssh_file_manager.SetMaxRetries(new_node_name, 4) > + > + self.assertRaises( > + errors.SshUpdateError, backend.AddNodeSshKey, new_node_uuid, > + new_node_name, self._potential_master_candidates, > self._ssh_port_map, > + to_authorized_keys=is_master_candidate, > + to_public_keys=is_potential_master_candidate, > + get_public_keys=is_potential_master_candidate, > + pub_key_file=self._pub_key_file, > + ssconf_store=self._ssconf_mock, > + noded_cert_file=self.noded_cert_file, > + run_cmd_fn=self._run_cmd_mock) > + > + for node in self._all_nodes: > + if node == new_node_name: > + self.assertTrue(self._ssh_file_manager.NodeHasAuthorizedKey( > + node, new_node_key)) > + else: > + self.assertFalse(self._ssh_file_manager.NodeHasAuthorizedKey( > + node, new_node_key)) > + > + self.assertTrue(self._ssh_file_manager.NoNodeHasPublicKey( > + new_node_uuid, new_node_key)) > + > + > class TestVerifySshSetup(testutils.GanetiTestCase): > > _NODE1_UUID = "uuid1" > diff --git a/test/py/testutils_ssh.py b/test/py/testutils_ssh.py > index ff7db63..a180144 100644 > --- a/test/py/testutils_ssh.py > +++ b/test/py/testutils_ssh.py > @@ -32,6 +32,7 @@ > > from ganeti import constants > from ganeti import pathutils > +from ganeti import errors > > > class FakeSshFileManager(object): > @@ -66,6 +67,13 @@ class FakeSshFileManager(object): > self._public_keys = {} # dict of dicts > # Node name of the master node > self._master_node_name = None > + # Dictionary mapping nodes by name to number of retries where > 'RunCommand' > + # succeeds. For example if set to '3', RunCommand will fail two times > when > + # called for this node before it succeeds in the 3rd retry. > + self._max_retries = {} > + # Dictionary mapping nodes by name to number of retries which > + # 'RunCommand' has already carried out. > + self._retries = {} > > def _SetMasterNodeName(self): > self._master_node_name = [name for name, (_, _, _, _, master) > @@ -124,6 +132,17 @@ class FakeSshFileManager(object): > self._FillAuthorizedKeyOfOneNode(node) > self._SetMasterNodeName() > > + def SetMaxRetries(self, node_name, retries): > + """Set the number of unsuccessful retries of 'RunCommand' per node. > + > + @type node_name: string > + @param node_name: name of the node > + @type retries: integer > + @param retries: number of unsuccessful retries > + > + """ > + self._max_retries[node_name] = retries > + > def GetSshPortMap(self, port): > """Creates a SSH port map with all nodes mapped to the given port. > > @@ -281,7 +300,8 @@ class FakeSshFileManager(object): > for name in self._all_node_data.keys(): > node_pub_keys = self._public_keys[name] > if (uuid, key) in node_pub_keys.items(): > - return False > + raise Exception("Node '%s' does have public key '%s' of node '%s'" > + % (name, key, uuid)) > return True > > def PotentialMasterCandidatesOnlyHavePublicKey(self, query_node_name): > @@ -326,6 +346,14 @@ class FakeSshFileManager(object): > of SSH keys. No actual key files of any node is touched. > > """ > + if node in self._max_retries: > + if node not in self._retries: > + self._retries[node] = 0 > + self._retries[node] += 1 > + if self._retries[node] < self._max_retries[node]: > + raise errors.OpExecError("(Fake) SSH connection to node '%s' > failed." > + % node) > + > assert base_cmd == pathutils.SSH_UPDATE > > if constants.SSHS_SSH_AUTHORIZED_KEYS in data: > -- > 2.2.0.rc0.207.ga3a616c > > Hrvoje Ribicic Ganeti Engineering Google Germany GmbH Dienerstr. 12, 80331, München Registergericht und -nummer: Hamburg, HRB 86891 Sitz der Gesellschaft: Hamburg Geschäftsführer: Graham Law, Christine Elizabeth Flores Steuernummer: 48/725/00206 Umsatzsteueridentifikationsnummer: DE813741370
