When removing an SSH key, RemoveNodeSshKey so far did only try to contact each node once. This made the entire procedure not very resilient towards temporary network failures. This patch adds a couple of retries for contacting each node and only gives up after those are exhausted.
Signed-off-by: Helga Velroyen <[email protected]> --- lib/backend.py | 101 ++++++++++++++++++++++++------------- lib/cmdlib/node.py | 21 ++++++-- test/py/ganeti.backend_unittest.py | 65 ++++++++++++++++++++++++ 3 files changed, 148 insertions(+), 39 deletions(-) diff --git a/lib/backend.py b/lib/backend.py index 8ec22d9..c3190d2 100644 --- a/lib/backend.py +++ b/lib/backend.py @@ -1660,12 +1660,15 @@ def RemoveNodeSshKey(node_uuid, node_name, @returns: list of feedback messages """ - result_msgs = [] + # Non-disruptive error messages, mapped from node name to message + result_msgs = {} + # maximum number of retries for SSH connections + max_retries = 3 # Make sure at least one of these flags is true. if not (from_authorized_keys or from_public_keys or clear_authorized_keys or clear_public_keys): - result_msgs.append("No removal from any key file was requested.") + raise errors.SshUpdateError("No removal from any key file was requested.") if not ssconf_store: ssconf_store = ssconf.SimpleStore() @@ -1717,8 +1720,11 @@ def RemoveNodeSshKey(node_uuid, node_name, all_nodes = ssconf_store.GetNodeList() online_nodes = ssconf_store.GetOnlineNodeList() + logging.debug("Removing key of node '%s' from all nodes but itself and" + " master.", node_name) for node in all_nodes: if node == master_node: + logging.debug("Skipping master node '%s'.", master_node) continue if node not in online_nodes: logging.debug("Skipping offline node '%s'.", node) @@ -1729,24 +1735,48 @@ def RemoveNodeSshKey(node_uuid, node_name, " node '%s', map: %s." % (node, ssh_port_map)) if node in potential_master_candidates: - try: - run_cmd_fn(cluster_name, node, pathutils.SSH_UPDATE, - ssh_port, pot_mc_data, - debug=False, verbose=False, use_cluster_key=False, - ask_key=False, strict_host_check=False) - except errors.OpExecError as e: - result_msgs.append("Warning: the SSH setup of node '%s' could not" - " be adjusted." % node) - else: - if from_authorized_keys: + error_msg_try = ("The SSH setup of node '%s' could not" + " be adjusted in try no %s. Error: %s.") + error_msg_final = ("When removing the key of node '%s', updating the" + " SSH key files of node '%s' failed after %s" + " retries. Not trying again. Last error was: %s.") + last_exception = None + for i in range(max_retries): try: run_cmd_fn(cluster_name, node, pathutils.SSH_UPDATE, - ssh_port, base_data, + ssh_port, pot_mc_data, debug=False, verbose=False, use_cluster_key=False, ask_key=False, strict_host_check=False) + break except errors.OpExecError as e: - result_msgs.append("Warning: the SSH setup of node '%s' could" - " not be adjusted." % node) + logging.error(error_msg_try, node, i, e) + last_exception = e + else: + if last_exception: + error_msg = error_msg_final % (node_name, node, max_retries, + last_exception) + result_msgs[node] = error_msg + logging.error(error_msg) + + else: + last_exception = None + if from_authorized_keys: + for i in range(max_retries): + try: + run_cmd_fn(cluster_name, node, pathutils.SSH_UPDATE, + ssh_port, base_data, + debug=False, verbose=False, use_cluster_key=False, + ask_key=False, strict_host_check=False) + break + except errors.OpExecError as e: + logging.error(error_msg_try, node, i, e) + last_exception = e + else: + if last_exception: + error_msg = error_msg_final % (node_name, node, max_retries, + last_exception) + result_msgs[node] = error_msg + logging.error(error_msg) if clear_authorized_keys or from_public_keys or clear_public_keys: data = {} @@ -1790,9 +1820,10 @@ def RemoveNodeSshKey(node_uuid, node_name, debug=False, verbose=False, use_cluster_key=False, ask_key=False, strict_host_check=False) except errors.OpExecError as e: - result_msgs.append("Removing SSH keys from node '%s' failed. This can" - " happen when the node is already unreachable." - " Error: %s" % (node_name, e)) + result_msgs[node_name] = \ + ("Removing SSH keys from node '%s' failed. This can" + " happen when the node is already unreachable." + " Error: %s" % (node_name, e)) return result_msgs @@ -1980,15 +2011,14 @@ def RenewSshKeys(node_uuids, node_names, ssh_port_map, # and that would terminate all communication from the master to the # node. logging.debug("Removing SSH key of node '%s'.", node_name) - RemoveNodeSshKey(node_uuid, node_name, - master_candidate_uuids, - potential_master_candidates, - ssh_port_map, - master_uuid=master_node_uuid, - from_authorized_keys=master_candidate, - from_public_keys=False, - clear_authorized_keys=False, - clear_public_keys=False) + node_errors = RemoveNodeSshKey( + node_uuid, node_name, master_candidate_uuids, + potential_master_candidates, ssh_port_map, + master_uuid=master_node_uuid, from_authorized_keys=master_candidate, + from_public_keys=False, clear_authorized_keys=False, + clear_public_keys=False) + if node_errors: + all_node_errors = all_node_errors + [node_errors.items()] else: logging.debug("Old key of node '%s' is the same as the current master" " key. Not deleting that key on the node.", node_name) @@ -2070,15 +2100,14 @@ def RenewSshKeys(node_uuids, node_names, ssh_port_map, # Remove the old key from all node's authorized keys file logging.debug("Remove the old master key from all nodes.") - RemoveNodeSshKey(master_node_uuid, master_node_name, - master_candidate_uuids, - potential_master_candidates, - ssh_port_map, - keys_to_remove=old_master_keys_by_uuid, - from_authorized_keys=True, - from_public_keys=False, - clear_authorized_keys=False, - clear_public_keys=False) + node_errors = RemoveNodeSshKey( + master_node_uuid, master_node_name, master_candidate_uuids, + potential_master_candidates, ssh_port_map, + keys_to_remove=old_master_keys_by_uuid, from_authorized_keys=True, + from_public_keys=False, clear_authorized_keys=False, + clear_public_keys=False) + if node_errors: + all_node_errors = all_node_errors + node_errors.items() return all_node_errors diff --git a/lib/cmdlib/node.py b/lib/cmdlib/node.py index 6a13d8a..8453b33 100644 --- a/lib/cmdlib/node.py +++ b/lib/cmdlib/node.py @@ -366,6 +366,12 @@ class LUNodeAdd(LogicalUnit): remove_result[master_node].Raise( "Could not remove SSH keys of node %s before readding," " (UUID: %s)." % (new_node_name, new_node_uuid)) + result_msgs = remove_result[master_node].payload + if result_msgs: + feedback_fn("Removing old SSH key of node '%s' failed on some nodes." + % new_node_name) + for node, error_msg in result_msgs.items(): + feedback_fn("%s: %s" % (node, error_msg)) result = rpcrunner.call_node_ssh_key_add( [master_node], new_node_uuid, new_node_name, @@ -885,12 +891,15 @@ class LUNodeSetParams(LogicalUnit): False, # currently, all nodes are potential master candidates False, # do not clear node's 'authorized_keys' False) # do not clear node's 'ganeti_pub_keys' - if not ssh_result[master_node].fail_msg: - for message in ssh_result[master_node].payload: - feedback_fn(message) ssh_result[master_node].Raise( "Could not adjust the SSH setup after demoting node '%s'" " (UUID: %s)." % (node.name, node.uuid)) + if not ssh_result[master_node].fail_msg: + if ssh_result[master_node].payload: + feedback_fn("When demoting node '%s', updating SSH keys on" + " some nodes failed." % (node.name)) + for node, message in ssh_result[master_node].payload: + feedback_fn("%s: %s" % (node, message)) if self.new_role == self._ROLE_CANDIDATE: ssh_result = self.rpc.call_node_ssh_key_add( @@ -1603,6 +1612,12 @@ class LUNodeRemove(LogicalUnit): result[master_node].Raise( "Could not remove the SSH key of node '%s' (UUID: %s)." % (self.op.node_name, self.node.uuid)) + result_msgs = result[master_node].payload + if result_msgs: + feedback_fn("Removing old SSH key of node '%s' failed on some nodes." + % self.op.node_name) + for node, error_msg in result_msgs.items(): + feedback_fn("%s: %s" % (node, error_msg)) # Promote nodes to master candidate as needed AdjustCandidatePool(self, [self.node.uuid]) diff --git a/test/py/ganeti.backend_unittest.py b/test/py/ganeti.backend_unittest.py index e05580d..3c207f8 100755 --- a/test/py/ganeti.backend_unittest.py +++ b/test/py/ganeti.backend_unittest.py @@ -1563,6 +1563,71 @@ class TestAddRemoveGenerateNodeSshKey(testutils.GanetiTestCase): self.assertTrue(node_errors[other_node_name]) + def testRemoveKeySuccessfullyWithRetriesOnOtherNode(self): + """Test removing keys even if one of the old nodes needs retries. + + This tests checks whether a key can be removed successfully even + when one of the other nodes needs to be contacted with several + retries. + + """ + all_master_candidates = self._ssh_file_manager.GetAllMasterCandidates() + node_name, node_uuid, node_key, _, _, _ = all_master_candidates[0] + other_node_name, _, _, _, _, _ = all_master_candidates[1] + assert node_name != self._master_node + assert other_node_name != self._master_node + assert node_name != other_node_name + self._ssh_file_manager.SetMaxRetries(other_node_name, 3) + + backend.RemoveNodeSshKey(node_uuid, node_name, + self._master_candidate_uuids, + self._potential_master_candidates, + self._ssh_port_map, + from_authorized_keys=True, + from_public_keys=True, + clear_authorized_keys=True, + clear_public_keys=True, + pub_key_file=self._pub_key_file, + ssconf_store=self._ssconf_mock, + noded_cert_file=self.noded_cert_file, + run_cmd_fn=self._run_cmd_mock) + + self.assertTrue(self._ssh_file_manager.NoNodeHasPublicKey( + node_uuid, node_key)) + self.assertTrue(self._ssh_file_manager.NoNodeHasAuthorizedKey(node_key)) + + def testRemoveKeyFailedWithRetriesOnOtherNode(self): + """Test removing keys even if one of the old nodes fails even with retries. + + This tests checks whether the removal of a key finishes properly, even if + the update of the key files on one of the other nodes fails despite several + retries. + + """ + all_master_candidates = self._ssh_file_manager.GetAllMasterCandidates() + node_name, node_uuid, node_key, _, _, _ = all_master_candidates[0] + other_node_name, _, _, _, _, _ = all_master_candidates[1] + assert node_name != self._master_node + assert other_node_name != self._master_node + assert node_name != other_node_name + self._ssh_file_manager.SetMaxRetries(other_node_name, 4) + + error_msgs = backend.RemoveNodeSshKey( + node_uuid, node_name, self._master_candidate_uuids, + self._potential_master_candidates, self._ssh_port_map, + from_authorized_keys=True, from_public_keys=True, + clear_authorized_keys=True, clear_public_keys=True, + pub_key_file=self._pub_key_file, ssconf_store=self._ssconf_mock, + noded_cert_file=self.noded_cert_file, run_cmd_fn=self._run_cmd_mock) + + for node in self._all_nodes: + if node == other_node_name: + self.assertTrue(self._ssh_file_manager.NodeHasAuthorizedKey( + node, node_key)) + else: + self.assertFalse(self._ssh_file_manager.NodeHasAuthorizedKey( + node, node_key)) + self.assertTrue(error_msgs[other_node_name]) class TestVerifySshSetup(testutils.GanetiTestCase): -- 2.2.0.rc0.207.ga3a616c
