On Thu, Apr 16, 2015 at 05:32:00PM +0200, 'Helga Velroyen' via ganeti-devel
wrote:
So far, if a SSH key of a node is added to the cluster
and updating the SSH setup of the node itself fails,
the entire operation fails quite disgraceful. This
patch introduces a retry mechanism and a proper cleanup
in case of a failure.
Note that there are more places where retries would
be a good idea. They'll be added in other patches.
Signed-off-by: Helga Velroyen <[email protected]>
---
lib/backend.py | 28 +++++++++++++---
test/py/ganeti.backend_unittest.py | 67 ++++++++++++++++++++++++++++++++++++++
test/py/testutils_ssh.py | 30 ++++++++++++++++-
3 files changed, 120 insertions(+), 5 deletions(-)
diff --git a/lib/backend.py b/lib/backend.py
index bc1a570..818f3fb 100644
--- a/lib/backend.py
+++ b/lib/backend.py
@@ -1520,6 +1520,9 @@ def AddNodeSshKey(node_uuid, node_name,
_InitSshUpdateData(base_data, noded_cert_file, ssconf_store)
cluster_name = base_data[constants.SSHS_CLUSTER_NAME]
+ # number of retries for SSH connections
+ max_retries = 3
Perhaps it'd be useful to have the number as some global constant (and also
reference the constant in the tests).
+
# Update the target node itself
if get_public_keys:
node_data = {}
@@ -1527,10 +1530,27 @@ def AddNodeSshKey(node_uuid, node_name,
all_keys = ssh.QueryPubKeyFile(None, key_file=pub_key_file)
node_data[constants.SSHS_SSH_PUBLIC_KEYS] = \
(constants.SSHS_OVERRIDE, all_keys)
- run_cmd_fn(cluster_name, node_name, pathutils.SSH_UPDATE,
- ssh_port_map.get(node_name), node_data,
- debug=False, verbose=False, use_cluster_key=False,
- ask_key=False, strict_host_check=False)
+
+ last_exception = None
+ for _ in range(max_retries):
+ try:
+ run_cmd_fn(cluster_name, node_name, pathutils.SSH_UPDATE,
+ ssh_port_map.get(node_name), node_data,
+ debug=False, verbose=False, use_cluster_key=False,
+ ask_key=False, strict_host_check=False)
+ break
+ except errors.OpExecError as e:
+ logging.error("Updating SSH key files of node '%s' failed."
+ " Error: %s", node_name, e)
+ last_exception = e
+ else:
+ if last_exception:
+ # Clean up the master's public key file if adding key fails
+ if to_public_keys:
+ ssh.RemovePublicKey(node_uuid)
+ raise errors.SshUpdateError(
+ "Could not update the SSH setup of node '%s' itself. Error: %s."
+ % (node_name, last_exception))
Would there be somoe benefit from using utils.retry.Retry? The difference is
that Retry is based on timeout instead of the number of invocations (which
can make tests a bit problematic under some circumstances), and that a
failure should return RetryAgain - it could just wrap the OpExecError.
If not, it might make sense to have another utility function for this, with
a similar interface.
# Update all nodes except master and the target node
if to_authorized_keys:
diff --git a/test/py/ganeti.backend_unittest.py
b/test/py/ganeti.backend_unittest.py
index 5305c15..cc65501 100755
--- a/test/py/ganeti.backend_unittest.py
+++ b/test/py/ganeti.backend_unittest.py
@@ -1400,6 +1400,73 @@ class
TestAddRemoveGenerateNodeSshKey(testutils.GanetiTestCase):
self.assertTrue(self._ssh_file_manager.NodeHasAuthorizedKey(
node, node_key))
+ def testAddKeySuccessfullyWithRetries(self):
+ new_node_name = "new_node_name"
+ new_node_uuid = "new_node_uuid"
+ new_node_key = "new_node_key"
+ is_master_candidate = True
+ is_potential_master_candidate = True
+ is_master = False
+
+ self._AddNewNodeToTestData(
+ new_node_name, new_node_uuid, new_node_key,
+ is_potential_master_candidate, is_master_candidate,
+ is_master)
+ self._ssh_file_manager.SetMaxRetries(new_node_name, 3)
+
+ backend.AddNodeSshKey(new_node_uuid, new_node_name,
+ self._potential_master_candidates,
+ self._ssh_port_map,
+ to_authorized_keys=is_master_candidate,
+ to_public_keys=is_potential_master_candidate,
+ get_public_keys=is_potential_master_candidate,
+ pub_key_file=self._pub_key_file,
+ ssconf_store=self._ssconf_mock,
+ noded_cert_file=self.noded_cert_file,
+ run_cmd_fn=self._run_cmd_mock)
+
+ self._ssh_file_manager.PotentialMasterCandidatesOnlyHavePublicKey(
+ new_node_name)
+ self.assertTrue(self._ssh_file_manager.AllNodesHaveAuthorizedKey(
+ new_node_key))
+
+ def testAddKeyFailedWithRetries(self):
+ new_node_name = "new_node_name"
+ new_node_uuid = "new_node_uuid"
+ new_node_key = "new_node_key"
+ is_master_candidate = True
+ is_potential_master_candidate = True
+ is_master = False
+
+ self._AddNewNodeToTestData(
+ new_node_name, new_node_uuid, new_node_key,
+ is_potential_master_candidate, is_master_candidate,
+ is_master)
+ self._ssh_file_manager.SetMaxRetries(new_node_name, 4)
+
+ self.assertRaises(
+ errors.SshUpdateError, backend.AddNodeSshKey, new_node_uuid,
+ new_node_name, self._potential_master_candidates, self._ssh_port_map,
+ to_authorized_keys=is_master_candidate,
+ to_public_keys=is_potential_master_candidate,
+ get_public_keys=is_potential_master_candidate,
+ pub_key_file=self._pub_key_file,
+ ssconf_store=self._ssconf_mock,
+ noded_cert_file=self.noded_cert_file,
+ run_cmd_fn=self._run_cmd_mock)
+
+ for node in self._all_nodes:
+ if node == new_node_name:
+ self.assertTrue(self._ssh_file_manager.NodeHasAuthorizedKey(
+ node, new_node_key))
+ else:
+ self.assertFalse(self._ssh_file_manager.NodeHasAuthorizedKey(
+ node, new_node_key))
It might be useful to add a helper (probably in a later patch?) to the ssh
file manager for checking that all nodes (and only those) in a given set
have a given authorized key - a similar loop is already used in a few other
places.
+
+ self.assertTrue(self._ssh_file_manager.NoNodeHasPublicKey(
+ new_node_uuid, new_node_key))
+
+
class TestVerifySshSetup(testutils.GanetiTestCase):
_NODE1_UUID = "uuid1"
diff --git a/test/py/testutils_ssh.py b/test/py/testutils_ssh.py
index a65c44e..b7b4e82 100644
--- a/test/py/testutils_ssh.py
+++ b/test/py/testutils_ssh.py
@@ -32,6 +32,7 @@
from ganeti import constants
from ganeti import pathutils
+from ganeti import errors
class FakeSshFileManager(object):
@@ -66,6 +67,13 @@ class FakeSshFileManager(object):
self._public_keys = {} # dict of dicts
# Node name of the master node
self._master_node_name = None
+ # Dictionary mapping nodes by name to number of retries where 'RunCommand'
+ # succeeds. For example if set to '3', RunCommand will fail two times when
+ # called for this node before it succeeds in the 3rd retry.
+ self._max_retries = {}
+ # Dictionary mapping nodes by name to number of retries which
+ # 'RunCommand' has already carried out.
+ self._retries = {}
Maybe only one dictionary would suffice. When setting a retry, we'd set
self._retries[node] to a given number and let an operation succeed only if
the number is positive (decrementing otherwise). Or do we need to keep the
max. number of retries while resetting the counter at some point?
def _SetMasterNodeName(self):
self._master_node_name = [name for name, (_, _, _, _, master)
@@ -138,6 +146,17 @@ class FakeSshFileManager(object):
self._FillAuthorizedKeyOfOneNode(node)
self._SetMasterNodeName()
+ def SetMaxRetries(self, node_name, retries):
+ """Set the number of unsuccessful retries of 'RunCommand' per node.
+
+ @type node_name: string
+ @param node_name: name of the node
+ @type retries: integer
+ @param retries: number of unsuccessful retries
+
+ """
+ self._max_retries[node_name] = retries
+
def GetSshPortMap(self, port):
"""Creates a SSH port map with all nodes mapped to the given port.
@@ -297,7 +316,8 @@ class FakeSshFileManager(object):
for name in self._all_node_data.keys():
node_pub_keys = self._public_keys[name]
if (uuid, key) in node_pub_keys.items():
- return False
+ raise Exception("Node '%s' does have public key '%s' of node '%s'"
+ % (name, key, uuid))
This change is somewhat confusing, and seems to me unrelated to the patch.
The method used to return True/False, but after that change it's either True
or an exception. So I'd either be in favor of True/False or None/exception.
And currently all the tests in the backend_unittest use assertTrue/False on
the result.
return True
def PotentialMasterCandidatesOnlyHavePublicKey(self, query_node_name):
@@ -342,6 +362,14 @@ class FakeSshFileManager(object):
of SSH keys. No actual key files of any node is touched.
"""
+ if node in self._max_retries:
+ if node not in self._retries:
+ self._retries[node] = 0
+ self._retries[node] += 1
+ if self._retries[node] < self._max_retries[node]:
+ raise errors.OpExecError("(Fake) SSH connection to node '%s' failed."
+ % node)
+
assert base_cmd == pathutils.SSH_UPDATE
if constants.SSHS_SSH_AUTHORIZED_KEYS in data:
--
2.2.0.rc0.207.ga3a616c