LGTM, thanks On Thu, Apr 23, 2015 at 12:56 PM, Helga Velroyen <[email protected]> wrote:
> Hi! > > On Thu, 23 Apr 2015 at 11:29 Hrvoje Ribicic <[email protected]> wrote: > >> On Fri, Apr 17, 2015 at 4:37 PM, 'Helga Velroyen' via ganeti-devel < >> [email protected]> wrote: >> >>> When removing an SSH key, RemoveNodeSshKey so far did only >>> try to contact each node once. This made the entire >>> procedure not very resilient towards temporary network >>> failures. This patch adds a couple of retries for contacting >>> each node and only gives up after those are exhausted. >>> >>> Signed-off-by: Helga Velroyen <[email protected]> >>> --- >>> lib/backend.py | 101 >>> ++++++++++++++++++++++++------------- >>> lib/cmdlib/node.py | 21 ++++++-- >>> test/py/ganeti.backend_unittest.py | 65 ++++++++++++++++++++++++ >>> 3 files changed, 148 insertions(+), 39 deletions(-) >>> >>> diff --git a/lib/backend.py b/lib/backend.py >>> index 8ec22d9..c3190d2 100644 >>> --- a/lib/backend.py >>> +++ b/lib/backend.py >>> @@ -1660,12 +1660,15 @@ def RemoveNodeSshKey(node_uuid, node_name, >>> @returns: list of feedback messages >>> >>> """ >>> - result_msgs = [] >>> + # Non-disruptive error messages, mapped from node name to message >>> + result_msgs = {} >>> + # maximum number of retries for SSH connections >>> + max_retries = 3 >>> >>> # Make sure at least one of these flags is true. >>> if not (from_authorized_keys or from_public_keys or >>> clear_authorized_keys >>> or clear_public_keys): >>> - result_msgs.append("No removal from any key file was requested.") >>> + raise errors.SshUpdateError("No removal from any key file was >>> requested.") >>> >>> if not ssconf_store: >>> ssconf_store = ssconf.SimpleStore() >>> @@ -1717,8 +1720,11 @@ def RemoveNodeSshKey(node_uuid, node_name, >>> >>> all_nodes = ssconf_store.GetNodeList() >>> online_nodes = ssconf_store.GetOnlineNodeList() >>> + logging.debug("Removing key of node '%s' from all nodes but >>> itself and" >>> + " master.", node_name) >>> for node in all_nodes: >>> if node == master_node: >>> + logging.debug("Skipping master node '%s'.", master_node) >>> continue >>> if node not in online_nodes: >>> logging.debug("Skipping offline node '%s'.", node) >>> @@ -1729,24 +1735,48 @@ def RemoveNodeSshKey(node_uuid, node_name, >>> " node '%s', map: %s." % >>> (node, ssh_port_map)) >>> if node in potential_master_candidates: >>> - try: >>> - run_cmd_fn(cluster_name, node, pathutils.SSH_UPDATE, >>> - ssh_port, pot_mc_data, >>> - debug=False, verbose=False, >>> use_cluster_key=False, >>> - ask_key=False, strict_host_check=False) >>> - except errors.OpExecError as e: >>> - result_msgs.append("Warning: the SSH setup of node '%s' >>> could not" >>> - " be adjusted." % node) >>> - else: >>> - if from_authorized_keys: >>> + error_msg_try = ("The SSH setup of node '%s' could not" >>> + " be adjusted in try no %s. Error: %s.") >>> + error_msg_final = ("When removing the key of node '%s', >>> updating the" >>> + " SSH key files of node '%s' failed after >>> %s" >>> + " retries. Not trying again. Last error >>> was: %s.") >>> >> >> Looking at the code, it seems this has to go before the if as it is used >> in the else as well. >> > > Ack. > > >> >> >>> + last_exception = None >>> + for i in range(max_retries): >>> try: >>> run_cmd_fn(cluster_name, node, pathutils.SSH_UPDATE, >>> - ssh_port, base_data, >>> + ssh_port, pot_mc_data, >>> debug=False, verbose=False, >>> use_cluster_key=False, >>> ask_key=False, strict_host_check=False) >>> + break >>> except errors.OpExecError as e: >>> - result_msgs.append("Warning: the SSH setup of node '%s' >>> could" >>> - " not be adjusted." % node) >>> + logging.error(error_msg_try, node, i, e) >>> + last_exception = e >>> + else: >>> + if last_exception: >>> + error_msg = error_msg_final % (node_name, node, >>> max_retries, >>> + last_exception) >>> + result_msgs[node] = error_msg >>> + logging.error(error_msg) >>> + >>> + else: >>> + last_exception = None >>> + if from_authorized_keys: >>> + for i in range(max_retries): >>> + try: >>> + run_cmd_fn(cluster_name, node, pathutils.SSH_UPDATE, >>> + ssh_port, base_data, >>> + debug=False, verbose=False, >>> use_cluster_key=False, >>> + ask_key=False, strict_host_check=False) >>> + break >>> + except errors.OpExecError as e: >>> + logging.error(error_msg_try, node, i, e) >>> + last_exception = e >>> + else: >>> + if last_exception: >>> + error_msg = error_msg_final % (node_name, node, >>> max_retries, >>> + last_exception) >>> + result_msgs[node] = error_msg >>> + logging.error(error_msg) >> >> >>> if clear_authorized_keys or from_public_keys or clear_public_keys: >>> data = {} >>> @@ -1790,9 +1820,10 @@ def RemoveNodeSshKey(node_uuid, node_name, >>> debug=False, verbose=False, use_cluster_key=False, >>> ask_key=False, strict_host_check=False) >>> except errors.OpExecError as e: >>> - result_msgs.append("Removing SSH keys from node '%s' failed. This >>> can" >>> - " happen when the node is already unreachable." >>> - " Error: %s" % (node_name, e)) >>> + result_msgs[node_name] = \ >>> + ("Removing SSH keys from node '%s' failed. This can" >>> + " happen when the node is already unreachable." >>> + " Error: %s" % (node_name, e)) >>> >>> return result_msgs >>> >>> @@ -1980,15 +2011,14 @@ def RenewSshKeys(node_uuids, node_names, >>> ssh_port_map, >>> # and that would terminate all communication from the master to >>> the >>> # node. >>> logging.debug("Removing SSH key of node '%s'.", node_name) >>> - RemoveNodeSshKey(node_uuid, node_name, >>> - master_candidate_uuids, >>> - potential_master_candidates, >>> - ssh_port_map, >>> - master_uuid=master_node_uuid, >>> - from_authorized_keys=master_candidate, >>> - from_public_keys=False, >>> - clear_authorized_keys=False, >>> - clear_public_keys=False) >>> + node_errors = RemoveNodeSshKey( >>> + node_uuid, node_name, master_candidate_uuids, >>> + potential_master_candidates, ssh_port_map, >>> + master_uuid=master_node_uuid, >>> from_authorized_keys=master_candidate, >>> + from_public_keys=False, clear_authorized_keys=False, >>> + clear_public_keys=False) >>> + if node_errors: >>> + all_node_errors = all_node_errors + [node_errors.items()] >>> else: >>> logging.debug("Old key of node '%s' is the same as the current >>> master" >>> " key. Not deleting that key on the node.", >>> node_name) >>> @@ -2070,15 +2100,14 @@ def RenewSshKeys(node_uuids, node_names, >>> ssh_port_map, >>> >>> # Remove the old key from all node's authorized keys file >>> logging.debug("Remove the old master key from all nodes.") >>> - RemoveNodeSshKey(master_node_uuid, master_node_name, >>> - master_candidate_uuids, >>> - potential_master_candidates, >>> - ssh_port_map, >>> - keys_to_remove=old_master_keys_by_uuid, >>> - from_authorized_keys=True, >>> - from_public_keys=False, >>> - clear_authorized_keys=False, >>> - clear_public_keys=False) >>> + node_errors = RemoveNodeSshKey( >>> + master_node_uuid, master_node_name, master_candidate_uuids, >>> + potential_master_candidates, ssh_port_map, >>> + keys_to_remove=old_master_keys_by_uuid, from_authorized_keys=True, >>> + from_public_keys=False, clear_authorized_keys=False, >>> + clear_public_keys=False) >>> + if node_errors: >>> + all_node_errors = all_node_errors + node_errors.items() >>> >>> return all_node_errors >>> >>> diff --git a/lib/cmdlib/node.py b/lib/cmdlib/node.py >>> index 6a13d8a..8453b33 100644 >>> --- a/lib/cmdlib/node.py >>> +++ b/lib/cmdlib/node.py >>> @@ -366,6 +366,12 @@ class LUNodeAdd(LogicalUnit): >>> remove_result[master_node].Raise( >>> "Could not remove SSH keys of node %s before readding," >>> " (UUID: %s)." % (new_node_name, new_node_uuid)) >>> >> >> As with previous patches, please insert space, use helper function, alter >> message. >> > > I added a helper function and used it here. I don't really get the request > for spaces. It is calling the rpc, raising serious exceptions, then > evaluating the other output. I think that pretty much belongs together as a > block. > > With the helper function, all is well. > >> >>> + result_msgs = remove_result[master_node].payload >>> + if result_msgs: >>> + feedback_fn("Removing old SSH key of node '%s' failed on some >>> nodes." >>> + % new_node_name) >>> + for node, error_msg in result_msgs.items(): >>> + feedback_fn("%s: %s" % (node, error_msg)) >>> >>> result = rpcrunner.call_node_ssh_key_add( >>> [master_node], new_node_uuid, new_node_name, >>> @@ -885,12 +891,15 @@ class LUNodeSetParams(LogicalUnit): >>> False, # currently, all nodes are potential master >>> candidates >>> False, # do not clear node's 'authorized_keys' >>> False) # do not clear node's 'ganeti_pub_keys' >>> - if not ssh_result[master_node].fail_msg: >>> - for message in ssh_result[master_node].payload: >>> - feedback_fn(message) >>> ssh_result[master_node].Raise( >>> "Could not adjust the SSH setup after demoting node '%s'" >>> " (UUID: %s)." % (node.name, node.uuid)) >>> >> >> As above. >> > > See above. > > >> >> >>> + if not ssh_result[master_node].fail_msg: >>> + if ssh_result[master_node].payload: >>> + feedback_fn("When demoting node '%s', updating SSH keys >>> on" >>> + " some nodes failed." % (node.name)) >>> + for node, message in ssh_result[master_node].payload: >>> + feedback_fn("%s: %s" % (node, message)) >>> >>> if self.new_role == self._ROLE_CANDIDATE: >>> ssh_result = self.rpc.call_node_ssh_key_add( >>> @@ -1603,6 +1612,12 @@ class LUNodeRemove(LogicalUnit): >>> result[master_node].Raise( >>> "Could not remove the SSH key of node '%s' (UUID: %s)." % >>> (self.op.node_name, self.node.uuid)) >>> + result_msgs = result[master_node].payload >>> + if result_msgs: >>> + feedback_fn("Removing old SSH key of node '%s' failed on some >>> nodes." >>> + % self.op.node_name) >>> + for node, error_msg in result_msgs.items(): >>> + feedback_fn("%s: %s" % (node, error_msg)) >>> >>> # Promote nodes to master candidate as needed >>> AdjustCandidatePool(self, [self.node.uuid]) >>> diff --git a/test/py/ganeti.backend_unittest.py b/test/py/ >>> ganeti.backend_unittest.py >>> index e05580d..3c207f8 100755 >>> --- a/test/py/ganeti.backend_unittest.py >>> +++ b/test/py/ganeti.backend_unittest.py >>> @@ -1563,6 +1563,71 @@ class >>> TestAddRemoveGenerateNodeSshKey(testutils.GanetiTestCase): >>> >>> self.assertTrue(node_errors[other_node_name]) >>> >>> + def testRemoveKeySuccessfullyWithRetriesOnOtherNode(self): >>> + """Test removing keys even if one of the old nodes needs retries. >>> + >>> + This tests checks whether a key can be removed successfully even >>> + when one of the other nodes needs to be contacted with several >>> + retries. >>> + >>> + """ >>> + all_master_candidates = >>> self._ssh_file_manager.GetAllMasterCandidates() >>> + node_name, node_uuid, node_key, _, _, _ = all_master_candidates[0] >>> + other_node_name, _, _, _, _, _ = all_master_candidates[1] >>> + assert node_name != self._master_node >>> + assert other_node_name != self._master_node >>> + assert node_name != other_node_name >>> + self._ssh_file_manager.SetMaxRetries(other_node_name, 3) >>> + >>> + backend.RemoveNodeSshKey(node_uuid, node_name, >>> + self._master_candidate_uuids, >>> + self._potential_master_candidates, >>> + self._ssh_port_map, >>> + from_authorized_keys=True, >>> + from_public_keys=True, >>> + clear_authorized_keys=True, >>> + clear_public_keys=True, >>> + pub_key_file=self._pub_key_file, >>> + ssconf_store=self._ssconf_mock, >>> + noded_cert_file=self.noded_cert_file, >>> + run_cmd_fn=self._run_cmd_mock) >>> + >>> + self.assertTrue(self._ssh_file_manager.NoNodeHasPublicKey( >>> + node_uuid, node_key)) >>> + >>> self.assertTrue(self._ssh_file_manager.NoNodeHasAuthorizedKey(node_key)) >>> + >>> + def testRemoveKeyFailedWithRetriesOnOtherNode(self): >>> + """Test removing keys even if one of the old nodes fails even with >>> retries. >>> + >>> + This tests checks whether the removal of a key finishes properly, >>> even if >>> + the update of the key files on one of the other nodes fails despite >>> several >>> + retries. >>> + >>> + """ >>> + all_master_candidates = >>> self._ssh_file_manager.GetAllMasterCandidates() >>> + node_name, node_uuid, node_key, _, _, _ = all_master_candidates[0] >>> + other_node_name, _, _, _, _, _ = all_master_candidates[1] >>> + assert node_name != self._master_node >>> + assert other_node_name != self._master_node >>> + assert node_name != other_node_name >>> + self._ssh_file_manager.SetMaxRetries(other_node_name, 4) >>> + >>> + error_msgs = backend.RemoveNodeSshKey( >>> + node_uuid, node_name, self._master_candidate_uuids, >>> + self._potential_master_candidates, self._ssh_port_map, >>> + from_authorized_keys=True, from_public_keys=True, >>> + clear_authorized_keys=True, clear_public_keys=True, >>> + pub_key_file=self._pub_key_file, ssconf_store=self._ssconf_mock, >>> + noded_cert_file=self.noded_cert_file, >>> run_cmd_fn=self._run_cmd_mock) >>> + >>> + for node in self._all_nodes: >>> + if node == other_node_name: >>> + self.assertTrue(self._ssh_file_manager.NodeHasAuthorizedKey( >>> + node, node_key)) >>> + else: >>> + self.assertFalse(self._ssh_file_manager.NodeHasAuthorizedKey( >>> + node, node_key)) >>> + self.assertTrue(error_msgs[other_node_name]) >>> >>> class TestVerifySshSetup(testutils.GanetiTestCase): >>> >>> -- >>> 2.2.0.rc0.207.ga3a616c >>> >>> > Interdiff: > > commit cdd4f00f57d5610dd467ba1c3562d3d89ff6eca0 > Author: Helga Velroyen <[email protected]> > Date: Thu Apr 23 12:29:06 2015 +0200 > > interdiff > > diff --git a/lib/backend.py b/lib/backend.py > index 87c8c77..bdc7d3e 100644 > --- a/lib/backend.py > +++ b/lib/backend.py > @@ -1659,8 +1659,8 @@ def RemoveNodeSshKey(node_uuid, node_name, > @returns: list of feedback messages > > """ > - # Non-disruptive error messages, mapped from node name to message > - result_msgs = {} > + # Non-disruptive error messages, list of (node, msg) pairs > + result_msgs = [] > > # Make sure at least one of these flags is true. > if not (from_authorized_keys or from_public_keys or > clear_authorized_keys > @@ -1731,13 +1731,12 @@ def RemoveNodeSshKey(node_uuid, node_name, > raise errors.OpExecError("No SSH port information available for" > " node '%s', map: %s." % > (node, ssh_port_map)) > + error_msg_try = ("The SSH setup of node '%s' could not" > + " be adjusted in try no %s. Error: %s.") > + error_msg_final = ("When removing the key of node '%s', updating > the" > + " SSH key files of node '%s' failed after %s" > + " retries. Not trying again. Last error was: > %s.") > if node in potential_master_candidates: > - error_msg_try = ("The SSH setup of node '%s' could not" > - " be adjusted in try no %s. Error: %s.") > - error_msg_final = ("When removing the key of node '%s', > updating the" > - " SSH key files of node '%s' failed after %s" > - " retries. Not trying again. Last error was: > %s.") > - last_exception = None > for i in range(constants.SSHS_MAX_RETRIES): > try: > run_cmd_fn(cluster_name, node, pathutils.SSH_UPDATE, > @@ -1752,7 +1751,7 @@ def RemoveNodeSshKey(node_uuid, node_name, > if last_exception: > error_msg = error_msg_final % ( > node_name, node, constants.SSHS_MAX_RETRIES, > last_exception) > - result_msgs[node] = error_msg > + result_msgs.append((node, error_msg)) > logging.error(error_msg) > > else: > @@ -1772,7 +1771,7 @@ def RemoveNodeSshKey(node_uuid, node_name, > if last_exception: > error_msg = error_msg_final % ( > node_name, node, constants.SSHS_MAX_RETRIES, > last_exception) > - result_msgs[node] = error_msg > + result_msgs.append((node, error_msg)) > logging.error(error_msg) > > if clear_authorized_keys or from_public_keys or clear_public_keys: > @@ -2015,7 +2014,7 @@ def RenewSshKeys(node_uuids, node_names, > ssh_port_map, > from_public_keys=False, clear_authorized_keys=False, > clear_public_keys=False) > if node_errors: > - all_node_errors = all_node_errors + [node_errors.items()] > + all_node_errors = all_node_errors + node_errors > else: > logging.debug("Old key of node '%s' is the same as the current > master" > " key. Not deleting that key on the node.", > node_name) > @@ -2104,7 +2103,7 @@ def RenewSshKeys(node_uuids, node_names, > ssh_port_map, > from_public_keys=False, clear_authorized_keys=False, > clear_public_keys=False) > if node_errors: > - all_node_errors = all_node_errors + node_errors.items() > + all_node_errors = all_node_errors + node_errors > > return all_node_errors > > diff --git a/lib/cmdlib/node.py b/lib/cmdlib/node.py > index 337290e..41451b3 100644 > --- a/lib/cmdlib/node.py > +++ b/lib/cmdlib/node.py > @@ -366,12 +366,7 @@ class LUNodeAdd(LogicalUnit): > remove_result[master_node].Raise( > "Could not remove SSH keys of node %s before readding," > " (UUID: %s)." % (new_node_name, new_node_uuid)) > - result_msgs = remove_result[master_node].payload > - if result_msgs: > - feedback_fn("Removing old SSH key of node '%s' failed on some > nodes." > - % new_node_name) > - for node, error_msg in result_msgs.items(): > - feedback_fn("%s: %s" % (node, error_msg)) > + EvaluateSshUpdateRPC(remove_result, master_node, feedback_fn) > > result = rpcrunner.call_node_ssh_key_add( > [master_node], new_node_uuid, new_node_name, > @@ -891,12 +886,7 @@ class LUNodeSetParams(LogicalUnit): > ssh_result[master_node].Raise( > "Could not adjust the SSH setup after demoting node '%s'" > " (UUID: %s)." % (node.name, node.uuid)) > - if not ssh_result[master_node].fail_msg: > - if ssh_result[master_node].payload: > - feedback_fn("When demoting node '%s', updating SSH keys on" > - " some nodes failed." % (node.name)) > - for node, message in ssh_result[master_node].payload: > - feedback_fn("%s: %s" % (node, message)) > + EvaluateSshUpdateRPC(ssh_result, master_node, feedback_fn) > > if self.new_role == self._ROLE_CANDIDATE: > ssh_result = self.rpc.call_node_ssh_key_add( > @@ -1605,12 +1595,7 @@ class LUNodeRemove(LogicalUnit): > result[master_node].Raise( > "Could not remove the SSH key of node '%s' (UUID: %s)." % > (self.op.node_name, self.node.uuid)) > - result_msgs = result[master_node].payload > - if result_msgs: > - feedback_fn("Removing old SSH key of node '%s' failed on some > nodes." > - % self.op.node_name) > - for node, error_msg in result_msgs.items(): > - feedback_fn("%s: %s" % (node, error_msg)) > + EvaluateSshUpdateRPC(result, master_node, feedback_fn) > > # Promote nodes to master candidate as needed > AdjustCandidatePool(self, [self.node.uuid]) > diff --git a/test/py/ganeti.backend_unittest.py b/test/py/ > ganeti.backend_unittest.py > index 3e54c38..181f495 100755 > --- a/test/py/ganeti.backend_unittest.py > +++ b/test/py/ganeti.backend_unittest.py > @@ -1634,7 +1634,8 @@ class > TestAddRemoveGenerateNodeSshKey(testutils.GanetiTestCase): > else: > self.assertFalse(self._ssh_file_manager.NodeHasAuthorizedKey( > node, node_key)) > - self.assertTrue(error_msgs[other_node_name]) > + self.assertTrue([error_msg for (node, error_msg) in error_msgs > + if node == other_node_name]) > > class TestVerifySshSetup(testutils.GanetiTestCase): > > > >> >> Hrvoje Ribicic >> Ganeti Engineering >> Google Germany GmbH >> Dienerstr. 12, 80331, München >> >> Registergericht und -nummer: Hamburg, HRB 86891 >> Sitz der Gesellschaft: Hamburg >> Geschäftsführer: Graham Law, Christine Elizabeth Flores >> Steuernummer: 48/725/00206 >> Umsatzsteueridentifikationsnummer: DE813741370 >> > Hrvoje Ribicic Ganeti Engineering Google Germany GmbH Dienerstr. 12, 80331, München Registergericht und -nummer: Hamburg, HRB 86891 Sitz der Gesellschaft: Hamburg Geschäftsführer: Graham Law, Christine Elizabeth Flores Steuernummer: 48/725/00206 Umsatzsteueridentifikationsnummer: DE813741370
