Interdiff for the max-retry constant, introduced in patch 8.
diff --git a/lib/backend.py b/lib/backend.py
index 5f4c99b..5a09efa 100644
--- a/lib/backend.py
+++ b/lib/backend.py
@@ -1574,7 +1574,7 @@ def AddNodeSshKey(node_uuid, node_name,
continue
if node in potential_master_candidates:
logging.debug("Updating SSH key files of node '%s'.")
- for i in range(max_retries):
+ for i in range(constants.SSHS_MAX_RETRIES):
try:
run_cmd_fn(cluster_name, node, pathutils.SSH_UPDATE,
ssh_port_map.get(node), pot_mc_data,
@@ -1590,7 +1590,8 @@ def AddNodeSshKey(node_uuid, node_name,
error_msg = ("When adding the key of node '%s', updating SSH key"
" files of node '%s' failed after %s retries."
" Not trying again. Last errors was: %s." %
- (node, node_name, max_retries, last_exception))
+ (node, node_name, constants.SSHS_MAX_RETRIES,
+ last_exception))
node_errors[node] = error_msg
# We only log the error and don't throw an exception, because
# one unreachable node shall not abort the entire procedure.
diff --git a/test/py/ganeti.backend_unittest.py b/test/py/
ganeti.backend_unittest.py
index 021972c..ec61f07 100755
--- a/test/py/ganeti.backend_unittest.py
+++ b/test/py/ganeti.backend_unittest.py
@@ -1499,7 +1499,8 @@ class
TestAddRemoveGenerateNodeSshKey(testutils.GanetiTestCase):
other_node_name, _, _, _, _, _ = self._ssh_file_manager \
.GetAllMasterCandidates()[0]
- self._ssh_file_manager.SetMaxRetries(other_node_name, 3)
+ self._ssh_file_manager.SetMaxRetries(
+ other_node_name, constants.SSHS_MAX_RETRIES)
assert other_node_name != new_node_name
self._AddNewNodeToTestData(
new_node_name, new_node_uuid, new_node_key,
@@ -1538,7 +1539,8 @@ class
TestAddRemoveGenerateNodeSshKey(testutils.GanetiTestCase):
other_node_name, _, _, _, _, _ = self._ssh_file_manager \
.GetAllMasterCandidates()[0]
- self._ssh_file_manager.SetMaxRetries(other_node_name, 4)
+ self._ssh_file_manager.SetMaxRetries(
+ other_node_name, constants.SSHS_MAX_RETRIES + 1)
assert other_node_name != new_node_name
self._AddNewNodeToTestData(
new_node_name, new_node_uuid, new_node_key,
On Fri, 17 Apr 2015 at 16:37 Helga Velroyen <[email protected]> wrote:
So far, if a SSH key of a node is added to the cluster
and updating the SSH setup of the node itself fails,
the entire operation fails quite disgraceful. This
patch introduces a retry mechanism and a proper cleanup
in case of a failure.
Note that there are more places where retries would
be a good idea. They'll be added in other patches.
Signed-off-by: Helga Velroyen <[email protected]>
---
lib/backend.py | 28 +++++++++++++---
test/py/ganeti.backend_unittest.py | 67
++++++++++++++++++++++++++++++++++++++
test/py/testutils_ssh.py | 30 ++++++++++++++++-
3 files changed, 120 insertions(+), 5 deletions(-)
diff --git a/lib/backend.py b/lib/backend.py
index bc1a570..818f3fb 100644
--- a/lib/backend.py
+++ b/lib/backend.py
@@ -1520,6 +1520,9 @@ def AddNodeSshKey(node_uuid, node_name,
_InitSshUpdateData(base_data, noded_cert_file, ssconf_store)
cluster_name = base_data[constants.SSHS_CLUSTER_NAME]
+ # number of retries for SSH connections
+ max_retries = 3
+
# Update the target node itself
if get_public_keys:
node_data = {}
@@ -1527,10 +1530,27 @@ def AddNodeSshKey(node_uuid, node_name,
all_keys = ssh.QueryPubKeyFile(None, key_file=pub_key_file)
node_data[constants.SSHS_SSH_PUBLIC_KEYS] = \
(constants.SSHS_OVERRIDE, all_keys)
- run_cmd_fn(cluster_name, node_name, pathutils.SSH_UPDATE,
- ssh_port_map.get(node_name), node_data,
- debug=False, verbose=False, use_cluster_key=False,
- ask_key=False, strict_host_check=False)
+
+ last_exception = None
+ for _ in range(max_retries):
+ try:
+ run_cmd_fn(cluster_name, node_name, pathutils.SSH_UPDATE,
+ ssh_port_map.get(node_name), node_data,
+ debug=False, verbose=False, use_cluster_key=False,
+ ask_key=False, strict_host_check=False)
+ break
+ except errors.OpExecError as e:
+ logging.error("Updating SSH key files of node '%s' failed."
+ " Error: %s", node_name, e)
+ last_exception = e
+ else:
+ if last_exception:
+ # Clean up the master's public key file if adding key fails
+ if to_public_keys:
+ ssh.RemovePublicKey(node_uuid)
+ raise errors.SshUpdateError(
+ "Could not update the SSH setup of node '%s' itself. Error: %s."
+ % (node_name, last_exception))
# Update all nodes except master and the target node
if to_authorized_keys:
diff --git a/test/py/ganeti.backend_unittest.py b/test/py/
ganeti.backend_unittest.py
index 5305c15..cc65501 100755
--- a/test/py/ganeti.backend_unittest.py
+++ b/test/py/ganeti.backend_unittest.py
@@ -1400,6 +1400,73 @@ class
TestAddRemoveGenerateNodeSshKey(testutils.GanetiTestCase):
self.assertTrue(self._ssh_file_manager.NodeHasAuthorizedKey(
node, node_key))
+ def testAddKeySuccessfullyWithRetries(self):
+ new_node_name = "new_node_name"
+ new_node_uuid = "new_node_uuid"
+ new_node_key = "new_node_key"
+ is_master_candidate = True
+ is_potential_master_candidate = True
+ is_master = False
+
+ self._AddNewNodeToTestData(
+ new_node_name, new_node_uuid, new_node_key,
+ is_potential_master_candidate, is_master_candidate,
+ is_master)
+ self._ssh_file_manager.SetMaxRetries(new_node_name, 3)
+
+ backend.AddNodeSshKey(new_node_uuid, new_node_name,
+ self._potential_master_candidates,
+ self._ssh_port_map,
+ to_authorized_keys=is_master_candidate,
+ to_public_keys=is_potential_master_candidate,
+ get_public_keys=is_potential_master_candidate,
+ pub_key_file=self._pub_key_file,
+ ssconf_store=self._ssconf_mock,
+ noded_cert_file=self.noded_cert_file,
+ run_cmd_fn=self._run_cmd_mock)
+
+ self._ssh_file_manager.PotentialMasterCandidatesOnlyHavePublicKey(
+ new_node_name)
+ self.assertTrue(self._ssh_file_manager.AllNodesHaveAuthorizedKey(
+ new_node_key))
+
+ def testAddKeyFailedWithRetries(self):
+ new_node_name = "new_node_name"
+ new_node_uuid = "new_node_uuid"
+ new_node_key = "new_node_key"
+ is_master_candidate = True
+ is_potential_master_candidate = True
+ is_master = False
+
+ self._AddNewNodeToTestData(
+ new_node_name, new_node_uuid, new_node_key,
+ is_potential_master_candidate, is_master_candidate,
+ is_master)
+ self._ssh_file_manager.SetMaxRetries(new_node_name, 4)
+
+ self.assertRaises(
+ errors.SshUpdateError, backend.AddNodeSshKey, new_node_uuid,
+ new_node_name, self._potential_master_candidates,
self._ssh_port_map,
+ to_authorized_keys=is_master_candidate,
+ to_public_keys=is_potential_master_candidate,
+ get_public_keys=is_potential_master_candidate,
+ pub_key_file=self._pub_key_file,
+ ssconf_store=self._ssconf_mock,
+ noded_cert_file=self.noded_cert_file,
+ run_cmd_fn=self._run_cmd_mock)
+
+ for node in self._all_nodes:
+ if node == new_node_name:
+ self.assertTrue(self._ssh_file_manager.NodeHasAuthorizedKey(
+ node, new_node_key))
+ else:
+ self.assertFalse(self._ssh_file_manager.NodeHasAuthorizedKey(
+ node, new_node_key))
+
+ self.assertTrue(self._ssh_file_manager.NoNodeHasPublicKey(
+ new_node_uuid, new_node_key))
+
+
class TestVerifySshSetup(testutils.GanetiTestCase):
_NODE1_UUID = "uuid1"
diff --git a/test/py/testutils_ssh.py b/test/py/testutils_ssh.py
index ff7db63..a180144 100644
--- a/test/py/testutils_ssh.py
+++ b/test/py/testutils_ssh.py
@@ -32,6 +32,7 @@
from ganeti import constants
from ganeti import pathutils
+from ganeti import errors
class FakeSshFileManager(object):
@@ -66,6 +67,13 @@ class FakeSshFileManager(object):
self._public_keys = {} # dict of dicts
# Node name of the master node
self._master_node_name = None
+ # Dictionary mapping nodes by name to number of retries where
'RunCommand'
+ # succeeds. For example if set to '3', RunCommand will fail two times
when
+ # called for this node before it succeeds in the 3rd retry.
+ self._max_retries = {}
+ # Dictionary mapping nodes by name to number of retries which
+ # 'RunCommand' has already carried out.
+ self._retries = {}
def _SetMasterNodeName(self):
self._master_node_name = [name for name, (_, _, _, _, master)
@@ -124,6 +132,17 @@ class FakeSshFileManager(object):
self._FillAuthorizedKeyOfOneNode(node)
self._SetMasterNodeName()
+ def SetMaxRetries(self, node_name, retries):
+ """Set the number of unsuccessful retries of 'RunCommand' per node.
+
+ @type node_name: string
+ @param node_name: name of the node
+ @type retries: integer
+ @param retries: number of unsuccessful retries
+
+ """
+ self._max_retries[node_name] = retries
+
def GetSshPortMap(self, port):
"""Creates a SSH port map with all nodes mapped to the given port.
@@ -281,7 +300,8 @@ class FakeSshFileManager(object):
for name in self._all_node_data.keys():
node_pub_keys = self._public_keys[name]
if (uuid, key) in node_pub_keys.items():
- return False
+ raise Exception("Node '%s' does have public key '%s' of node '%s'"
+ % (name, key, uuid))
return True
def PotentialMasterCandidatesOnlyHavePublicKey(self, query_node_name):
@@ -326,6 +346,14 @@ class FakeSshFileManager(object):
of SSH keys. No actual key files of any node is touched.
"""
+ if node in self._max_retries:
+ if node not in self._retries:
+ self._retries[node] = 0
+ self._retries[node] += 1
+ if self._retries[node] < self._max_retries[node]:
+ raise errors.OpExecError("(Fake) SSH connection to node '%s'
failed."
+ % node)
+
assert base_cmd == pathutils.SSH_UPDATE
if constants.SSHS_SSH_AUTHORIZED_KEYS in data:
--
2.2.0.rc0.207.ga3a616c