On Fri, Apr 17, 2015 at 4:37 PM, 'Helga Velroyen' via ganeti-devel <[email protected]> wrote:
> When removing an SSH key, RemoveNodeSshKey so far did only > try to contact each node once. This made the entire > procedure not very resilient towards temporary network > failures. This patch adds a couple of retries for contacting > each node and only gives up after those are exhausted. > > Signed-off-by: Helga Velroyen <[email protected]> > --- > lib/backend.py | 101 > ++++++++++++++++++++++++------------- > lib/cmdlib/node.py | 21 ++++++-- > test/py/ganeti.backend_unittest.py | 65 ++++++++++++++++++++++++ > 3 files changed, 148 insertions(+), 39 deletions(-) > > diff --git a/lib/backend.py b/lib/backend.py > index 8ec22d9..c3190d2 100644 > --- a/lib/backend.py > +++ b/lib/backend.py > @@ -1660,12 +1660,15 @@ def RemoveNodeSshKey(node_uuid, node_name, > @returns: list of feedback messages > > """ > - result_msgs = [] > + # Non-disruptive error messages, mapped from node name to message > + result_msgs = {} > + # maximum number of retries for SSH connections > + max_retries = 3 > > # Make sure at least one of these flags is true. > if not (from_authorized_keys or from_public_keys or > clear_authorized_keys > or clear_public_keys): > - result_msgs.append("No removal from any key file was requested.") > + raise errors.SshUpdateError("No removal from any key file was > requested.") > > if not ssconf_store: > ssconf_store = ssconf.SimpleStore() > @@ -1717,8 +1720,11 @@ def RemoveNodeSshKey(node_uuid, node_name, > > all_nodes = ssconf_store.GetNodeList() > online_nodes = ssconf_store.GetOnlineNodeList() > + logging.debug("Removing key of node '%s' from all nodes but itself > and" > + " master.", node_name) > for node in all_nodes: > if node == master_node: > + logging.debug("Skipping master node '%s'.", master_node) > continue > if node not in online_nodes: > logging.debug("Skipping offline node '%s'.", node) > @@ -1729,24 +1735,48 @@ def RemoveNodeSshKey(node_uuid, node_name, > " node '%s', map: %s." 
% > (node, ssh_port_map)) > if node in potential_master_candidates: > - try: > - run_cmd_fn(cluster_name, node, pathutils.SSH_UPDATE, > - ssh_port, pot_mc_data, > - debug=False, verbose=False, use_cluster_key=False, > - ask_key=False, strict_host_check=False) > - except errors.OpExecError as e: > - result_msgs.append("Warning: the SSH setup of node '%s' could > not" > - " be adjusted." % node) > - else: > - if from_authorized_keys: > + error_msg_try = ("The SSH setup of node '%s' could not" > + " be adjusted in try no %s. Error: %s.") > + error_msg_final = ("When removing the key of node '%s', > updating the" > + " SSH key files of node '%s' failed after %s" > + " retries. Not trying again. Last error was: > %s.") > Looking at the code, it seems this has to go before the if as it is used in the else as well. > + last_exception = None > + for i in range(max_retries): > try: > run_cmd_fn(cluster_name, node, pathutils.SSH_UPDATE, > - ssh_port, base_data, > + ssh_port, pot_mc_data, > debug=False, verbose=False, > use_cluster_key=False, > ask_key=False, strict_host_check=False) > + break > except errors.OpExecError as e: > - result_msgs.append("Warning: the SSH setup of node '%s' > could" > - " not be adjusted." 
% node) > + logging.error(error_msg_try, node, i, e) > + last_exception = e > + else: > + if last_exception: > + error_msg = error_msg_final % (node_name, node, max_retries, > + last_exception) > + result_msgs[node] = error_msg > + logging.error(error_msg) > + > + else: > + last_exception = None > + if from_authorized_keys: > + for i in range(max_retries): > + try: > + run_cmd_fn(cluster_name, node, pathutils.SSH_UPDATE, > + ssh_port, base_data, > + debug=False, verbose=False, > use_cluster_key=False, > + ask_key=False, strict_host_check=False) > + break > + except errors.OpExecError as e: > + logging.error(error_msg_try, node, i, e) > + last_exception = e > + else: > + if last_exception: > + error_msg = error_msg_final % (node_name, node, max_retries, > + last_exception) > + result_msgs[node] = error_msg > + logging.error(error_msg) > if clear_authorized_keys or from_public_keys or clear_public_keys: > data = {} > @@ -1790,9 +1820,10 @@ def RemoveNodeSshKey(node_uuid, node_name, > debug=False, verbose=False, use_cluster_key=False, > ask_key=False, strict_host_check=False) > except errors.OpExecError as e: > - result_msgs.append("Removing SSH keys from node '%s' failed. This > can" > - " happen when the node is already unreachable." > - " Error: %s" % (node_name, e)) > + result_msgs[node_name] = \ > + ("Removing SSH keys from node '%s' failed. This can" > + " happen when the node is already unreachable." > + " Error: %s" % (node_name, e)) > > return result_msgs > > @@ -1980,15 +2011,14 @@ def RenewSshKeys(node_uuids, node_names, > ssh_port_map, > # and that would terminate all communication from the master to > the > # node. 
> logging.debug("Removing SSH key of node '%s'.", node_name) > - RemoveNodeSshKey(node_uuid, node_name, > - master_candidate_uuids, > - potential_master_candidates, > - ssh_port_map, > - master_uuid=master_node_uuid, > - from_authorized_keys=master_candidate, > - from_public_keys=False, > - clear_authorized_keys=False, > - clear_public_keys=False) > + node_errors = RemoveNodeSshKey( > + node_uuid, node_name, master_candidate_uuids, > + potential_master_candidates, ssh_port_map, > + master_uuid=master_node_uuid, > from_authorized_keys=master_candidate, > + from_public_keys=False, clear_authorized_keys=False, > + clear_public_keys=False) > + if node_errors: > + all_node_errors = all_node_errors + [node_errors.items()] > else: > logging.debug("Old key of node '%s' is the same as the current > master" > " key. Not deleting that key on the node.", > node_name) > @@ -2070,15 +2100,14 @@ def RenewSshKeys(node_uuids, node_names, > ssh_port_map, > > # Remove the old key from all node's authorized keys file > logging.debug("Remove the old master key from all nodes.") > - RemoveNodeSshKey(master_node_uuid, master_node_name, > - master_candidate_uuids, > - potential_master_candidates, > - ssh_port_map, > - keys_to_remove=old_master_keys_by_uuid, > - from_authorized_keys=True, > - from_public_keys=False, > - clear_authorized_keys=False, > - clear_public_keys=False) > + node_errors = RemoveNodeSshKey( > + master_node_uuid, master_node_name, master_candidate_uuids, > + potential_master_candidates, ssh_port_map, > + keys_to_remove=old_master_keys_by_uuid, from_authorized_keys=True, > + from_public_keys=False, clear_authorized_keys=False, > + clear_public_keys=False) > + if node_errors: > + all_node_errors = all_node_errors + node_errors.items() > > return all_node_errors > > diff --git a/lib/cmdlib/node.py b/lib/cmdlib/node.py > index 6a13d8a..8453b33 100644 > --- a/lib/cmdlib/node.py > +++ b/lib/cmdlib/node.py > @@ -366,6 +366,12 @@ class LUNodeAdd(LogicalUnit): > 
remove_result[master_node].Raise( > "Could not remove SSH keys of node %s before readding," > " (UUID: %s)." % (new_node_name, new_node_uuid)) > As with previous patches, please insert space, use helper function, alter message. > + result_msgs = remove_result[master_node].payload > + if result_msgs: > + feedback_fn("Removing old SSH key of node '%s' failed on some > nodes." > + % new_node_name) > + for node, error_msg in result_msgs.items(): > + feedback_fn("%s: %s" % (node, error_msg)) > > result = rpcrunner.call_node_ssh_key_add( > [master_node], new_node_uuid, new_node_name, > @@ -885,12 +891,15 @@ class LUNodeSetParams(LogicalUnit): > False, # currently, all nodes are potential master candidates > False, # do not clear node's 'authorized_keys' > False) # do not clear node's 'ganeti_pub_keys' > - if not ssh_result[master_node].fail_msg: > - for message in ssh_result[master_node].payload: > - feedback_fn(message) > ssh_result[master_node].Raise( > "Could not adjust the SSH setup after demoting node '%s'" > " (UUID: %s)." % (node.name, node.uuid)) > As above. > + if not ssh_result[master_node].fail_msg: > + if ssh_result[master_node].payload: > + feedback_fn("When demoting node '%s', updating SSH keys on" > + " some nodes failed." % (node.name)) > + for node, message in ssh_result[master_node].payload: > + feedback_fn("%s: %s" % (node, message)) > > if self.new_role == self._ROLE_CANDIDATE: > ssh_result = self.rpc.call_node_ssh_key_add( > @@ -1603,6 +1612,12 @@ class LUNodeRemove(LogicalUnit): > result[master_node].Raise( > "Could not remove the SSH key of node '%s' (UUID: %s)." % > (self.op.node_name, self.node.uuid)) > + result_msgs = result[master_node].payload > + if result_msgs: > + feedback_fn("Removing old SSH key of node '%s' failed on some > nodes." 
> + % self.op.node_name) > + for node, error_msg in result_msgs.items(): > + feedback_fn("%s: %s" % (node, error_msg)) > > # Promote nodes to master candidate as needed > AdjustCandidatePool(self, [self.node.uuid]) > diff --git a/test/py/ganeti.backend_unittest.py b/test/py/ > ganeti.backend_unittest.py > index e05580d..3c207f8 100755 > --- a/test/py/ganeti.backend_unittest.py > +++ b/test/py/ganeti.backend_unittest.py > @@ -1563,6 +1563,71 @@ class > TestAddRemoveGenerateNodeSshKey(testutils.GanetiTestCase): > > self.assertTrue(node_errors[other_node_name]) > > + def testRemoveKeySuccessfullyWithRetriesOnOtherNode(self): > + """Test removing keys even if one of the old nodes needs retries. > + > + This tests checks whether a key can be removed successfully even > + when one of the other nodes needs to be contacted with several > + retries. > + > + """ > + all_master_candidates = > self._ssh_file_manager.GetAllMasterCandidates() > + node_name, node_uuid, node_key, _, _, _ = all_master_candidates[0] > + other_node_name, _, _, _, _, _ = all_master_candidates[1] > + assert node_name != self._master_node > + assert other_node_name != self._master_node > + assert node_name != other_node_name > + self._ssh_file_manager.SetMaxRetries(other_node_name, 3) > + > + backend.RemoveNodeSshKey(node_uuid, node_name, > + self._master_candidate_uuids, > + self._potential_master_candidates, > + self._ssh_port_map, > + from_authorized_keys=True, > + from_public_keys=True, > + clear_authorized_keys=True, > + clear_public_keys=True, > + pub_key_file=self._pub_key_file, > + ssconf_store=self._ssconf_mock, > + noded_cert_file=self.noded_cert_file, > + run_cmd_fn=self._run_cmd_mock) > + > + self.assertTrue(self._ssh_file_manager.NoNodeHasPublicKey( > + node_uuid, node_key)) > + > self.assertTrue(self._ssh_file_manager.NoNodeHasAuthorizedKey(node_key)) > + > + def testRemoveKeyFailedWithRetriesOnOtherNode(self): > + """Test removing keys even if one of the old nodes fails even with > 
retries. > + > + This tests checks whether the removal of a key finishes properly, > even if > + the update of the key files on one of the other nodes fails despite > several > + retries. > + > + """ > + all_master_candidates = > self._ssh_file_manager.GetAllMasterCandidates() > + node_name, node_uuid, node_key, _, _, _ = all_master_candidates[0] > + other_node_name, _, _, _, _, _ = all_master_candidates[1] > + assert node_name != self._master_node > + assert other_node_name != self._master_node > + assert node_name != other_node_name > + self._ssh_file_manager.SetMaxRetries(other_node_name, 4) > + > + error_msgs = backend.RemoveNodeSshKey( > + node_uuid, node_name, self._master_candidate_uuids, > + self._potential_master_candidates, self._ssh_port_map, > + from_authorized_keys=True, from_public_keys=True, > + clear_authorized_keys=True, clear_public_keys=True, > + pub_key_file=self._pub_key_file, ssconf_store=self._ssconf_mock, > + noded_cert_file=self.noded_cert_file, > run_cmd_fn=self._run_cmd_mock) > + > + for node in self._all_nodes: > + if node == other_node_name: > + self.assertTrue(self._ssh_file_manager.NodeHasAuthorizedKey( > + node, node_key)) > + else: > + self.assertFalse(self._ssh_file_manager.NodeHasAuthorizedKey( > + node, node_key)) > + self.assertTrue(error_msgs[other_node_name]) > > class TestVerifySshSetup(testutils.GanetiTestCase): > > -- > 2.2.0.rc0.207.ga3a616c > > Hrvoje Ribicic Ganeti Engineering Google Germany GmbH Dienerstr. 12, 80331, München Registergericht und -nummer: Hamburg, HRB 86891 Sitz der Gesellschaft: Hamburg Geschäftsführer: Graham Law, Christine Elizabeth Flores Steuernummer: 48/725/00206 Umsatzsteueridentifikationsnummer: DE813741370
