On Tue, Apr 21, 2015 at 02:47:26PM +0000, Helga Velroyen wrote:
>Hi!
>
>On Tue, 21 Apr 2015 at 15:16 Petr Pudlak <[email protected]> wrote:
>
>> On Thu, Apr 16, 2015 at 05:32:00PM +0200, 'Helga Velroyen' via
>> ganeti-devel wrote:
>> >So far, if a SSH key of a node is added to the cluster
>> >and updating the SSH setup of the node itself fails,
>> >the entire operation fails quite disgracefully. This
>> >patch introduces a retry mechanism and a proper cleanup
>> >in case of a failure.
>> >
>> >Note that there are more places where retries would
>> >be a good idea. They'll be added in other patches.
>> >
>> >Signed-off-by: Helga Velroyen <[email protected]>
>> >---
>> > lib/backend.py | 28 +++++++++++++---
>> > test/py/ganeti.backend_unittest.py | 67
>> ++++++++++++++++++++++++++++++++++++++
>> > test/py/testutils_ssh.py | 30 ++++++++++++++++-
>> > 3 files changed, 120 insertions(+), 5 deletions(-)
>> >
>> >diff --git a/lib/backend.py b/lib/backend.py
>> >index bc1a570..818f3fb 100644
>> >--- a/lib/backend.py
>> >+++ b/lib/backend.py
>> >@@ -1520,6 +1520,9 @@ def AddNodeSshKey(node_uuid, node_name,
>> > _InitSshUpdateData(base_data, noded_cert_file, ssconf_store)
>> > cluster_name = base_data[constants.SSHS_CLUSTER_NAME]
>> >
>> >+ # number of retries for SSH connections
>> >+ max_retries = 3
>>
>> Perhaps it'd be useful to have the number as some global constant (and
also
>> reference the constant in the tests).
>>
>
>True, thought about this as well. See interdiff below. Also note that this
>results in some interdiffs in the other 7 patches of that series, I will
>send them as reply to those respective patches.
>
>
>>
>> >+
>> > # Update the target node itself
>> > if get_public_keys:
>> > node_data = {}
>> >@@ -1527,10 +1530,27 @@ def AddNodeSshKey(node_uuid, node_name,
>> > all_keys = ssh.QueryPubKeyFile(None, key_file=pub_key_file)
>> > node_data[constants.SSHS_SSH_PUBLIC_KEYS] = \
>> > (constants.SSHS_OVERRIDE, all_keys)
>> >- run_cmd_fn(cluster_name, node_name, pathutils.SSH_UPDATE,
>> >- ssh_port_map.get(node_name), node_data,
>> >- debug=False, verbose=False, use_cluster_key=False,
>> >- ask_key=False, strict_host_check=False)
>> >+
>> >+ last_exception = None
>> >+ for _ in range(max_retries):
>> >+ try:
>> >+ run_cmd_fn(cluster_name, node_name, pathutils.SSH_UPDATE,
>> >+ ssh_port_map.get(node_name), node_data,
>> >+ debug=False, verbose=False, use_cluster_key=False,
>> >+ ask_key=False, strict_host_check=False)
>> >+ break
>> >+ except errors.OpExecError as e:
>> >+ logging.error("Updating SSH key files of node '%s' failed."
>> >+ " Error: %s", node_name, e)
>> >+ last_exception = e
>> >+ else:
>> >+ if last_exception:
>> >+ # Clean up the master's public key file if adding key fails
>> >+ if to_public_keys:
>> >+ ssh.RemovePublicKey(node_uuid)
>> >+ raise errors.SshUpdateError(
>> >+ "Could not update the SSH setup of node '%s' itself. Error:
>> %s."
>> >+ % (node_name, last_exception))
>>
>> Would there be some benefit from using utils.retry.Retry? The
difference
>> is
>> that Retry is based on timeout instead of the number of invocations
(which
>> can make tests a bit problematic under some circumstances), and that a
>> failure should return RetryAgain - it could just wrap the OpExecError.
>>
>> If not, it might make sense to have another utility function for this,
with
>> a similar interface.
>>
>
>I would not like to use utils.retry.Retry as it is based on a timeout and
>that makes stuff (including the unit tests) awfully slow.
>
>I agree that having an extra utility function might be a good idea. Would
>you mind if I add it as a separate patch at the end of the series? I
>already have used this in the other 7 patches of this series, and I'd
>rather not merge that in now.
I agree on both; the use case is somewhat different, and it'll be better to
add the utility function as another patch.
>
>
>>
>> >
>> > # Update all nodes except master and the target node
>> > if to_authorized_keys:
>> >diff --git a/test/py/ganeti.backend_unittest.py b/test/py/
>> ganeti.backend_unittest.py
>> >index 5305c15..cc65501 100755
>> >--- a/test/py/ganeti.backend_unittest.py
>> >+++ b/test/py/ganeti.backend_unittest.py
>> >@@ -1400,6 +1400,73 @@ class
>> TestAddRemoveGenerateNodeSshKey(testutils.GanetiTestCase):
>> > self.assertTrue(self._ssh_file_manager.NodeHasAuthorizedKey(
>> > node, node_key))
>> >
>> >+ def testAddKeySuccessfullyWithRetries(self):
>> >+ new_node_name = "new_node_name"
>> >+ new_node_uuid = "new_node_uuid"
>> >+ new_node_key = "new_node_key"
>> >+ is_master_candidate = True
>> >+ is_potential_master_candidate = True
>> >+ is_master = False
>> >+
>> >+ self._AddNewNodeToTestData(
>> >+ new_node_name, new_node_uuid, new_node_key,
>> >+ is_potential_master_candidate, is_master_candidate,
>> >+ is_master)
>> >+ self._ssh_file_manager.SetMaxRetries(new_node_name, 3)
>> >+
>> >+ backend.AddNodeSshKey(new_node_uuid, new_node_name,
>> >+ self._potential_master_candidates,
>> >+ self._ssh_port_map,
>> >+ to_authorized_keys=is_master_candidate,
>> >+
to_public_keys=is_potential_master_candidate,
>> >+
get_public_keys=is_potential_master_candidate,
>> >+ pub_key_file=self._pub_key_file,
>> >+ ssconf_store=self._ssconf_mock,
>> >+ noded_cert_file=self.noded_cert_file,
>> >+ run_cmd_fn=self._run_cmd_mock)
>> >+
>> >+ self._ssh_file_manager.PotentialMasterCandidatesOnlyHavePublicKey(
>> >+ new_node_name)
>> >+ self.assertTrue(self._ssh_file_manager.AllNodesHaveAuthorizedKey(
>> >+ new_node_key))
>> >+
>> >+ def testAddKeyFailedWithRetries(self):
>> >+ new_node_name = "new_node_name"
>> >+ new_node_uuid = "new_node_uuid"
>> >+ new_node_key = "new_node_key"
>> >+ is_master_candidate = True
>> >+ is_potential_master_candidate = True
>> >+ is_master = False
>> >+
>> >+ self._AddNewNodeToTestData(
>> >+ new_node_name, new_node_uuid, new_node_key,
>> >+ is_potential_master_candidate, is_master_candidate,
>> >+ is_master)
>> >+ self._ssh_file_manager.SetMaxRetries(new_node_name, 4)
>> >+
>> >+ self.assertRaises(
>> >+ errors.SshUpdateError, backend.AddNodeSshKey, new_node_uuid,
>> >+ new_node_name, self._potential_master_candidates,
>> self._ssh_port_map,
>> >+ to_authorized_keys=is_master_candidate,
>> >+ to_public_keys=is_potential_master_candidate,
>> >+ get_public_keys=is_potential_master_candidate,
>> >+ pub_key_file=self._pub_key_file,
>> >+ ssconf_store=self._ssconf_mock,
>> >+ noded_cert_file=self.noded_cert_file,
>> >+ run_cmd_fn=self._run_cmd_mock)
>> >+
>> >+ for node in self._all_nodes:
>> >+ if node == new_node_name:
>> >+ self.assertTrue(self._ssh_file_manager.NodeHasAuthorizedKey(
>> >+ node, new_node_key))
>> >+ else:
>> >+ self.assertFalse(self._ssh_file_manager.NodeHasAuthorizedKey(
>> >+ node, new_node_key))
>>
>> It might be useful to add a helper (probably in a later patch?) to the
ssh
>> file manager for checking that all nodes (and only those) in a given set
>> have a given authorized key - a similar loop is already used in a few
other
>> places.
>>
>
>Good point. As you suggest, I'll add that in an additional patch.
ACK
>
>
>>
>> >+
>> >+ self.assertTrue(self._ssh_file_manager.NoNodeHasPublicKey(
>> >+ new_node_uuid, new_node_key))
>> >+
>> >+
>> > class TestVerifySshSetup(testutils.GanetiTestCase):
>> >
>> > _NODE1_UUID = "uuid1"
>> >diff --git a/test/py/testutils_ssh.py b/test/py/testutils_ssh.py
>> >index a65c44e..b7b4e82 100644
>> >--- a/test/py/testutils_ssh.py
>> >+++ b/test/py/testutils_ssh.py
>> >@@ -32,6 +32,7 @@
>> >
>> > from ganeti import constants
>> > from ganeti import pathutils
>> >+from ganeti import errors
>> >
>> >
>> > class FakeSshFileManager(object):
>> >@@ -66,6 +67,13 @@ class FakeSshFileManager(object):
>> > self._public_keys = {} # dict of dicts
>> > # Node name of the master node
>> > self._master_node_name = None
>> >+ # Dictionary mapping nodes by name to number of retries where
>> 'RunCommand'
>> >+ # succeeds. For example if set to '3', RunCommand will fail two
>> times when
>> >+ # called for this node before it succeeds in the 3rd retry.
>> >+ self._max_retries = {}
>> >+ # Dictionary mapping nodes by name to number of retries which
>> >+ # 'RunCommand' has already carried out.
>> >+ self._retries = {}
>>
>> Maybe only one dictionary would suffice. When setting a retry, we'd set
>> self._retries[node] to a given number and let an operation succeed only
if
>> the number is positive (decrementing otherwise). Or do we need to keep
the
>> max. number of retries while resetting the counter at some point?
>>
>
>While I understand that your desire here is to get rid of one variable,
I'd
>rather not do that. The reason is that this would count down instead of up
>like it is done in the implementation that this class is emulating. For
the
>sanity of any developer who has to work with this in the future, I'd
rather
>keep those similar to not add any additional confusion. Would that be okay
>for you?
OK, thanks for the explanation, let's leave it as it is.
>
>
>>
>> >
>> > def _SetMasterNodeName(self):
>> > self._master_node_name = [name for name, (_, _, _, _, master)
>> >@@ -138,6 +146,17 @@ class FakeSshFileManager(object):
>> > self._FillAuthorizedKeyOfOneNode(node)
>> > self._SetMasterNodeName()
>> >
>> >+ def SetMaxRetries(self, node_name, retries):
>> >+ """Set the number of unsuccessful retries of 'RunCommand' per
node.
>> >+
>> >+ @type node_name: string
>> >+ @param node_name: name of the node
>> >+ @type retries: integer
>> >+ @param retries: number of unsuccessful retries
>> >+
>> >+ """
>> >+ self._max_retries[node_name] = retries
>> >+
>> > def GetSshPortMap(self, port):
>> > """Creates a SSH port map with all nodes mapped to the given port.
>> >
>> >@@ -297,7 +316,8 @@ class FakeSshFileManager(object):
>> > for name in self._all_node_data.keys():
>> > node_pub_keys = self._public_keys[name]
>> > if (uuid, key) in node_pub_keys.items():
>> >- return False
>> >+ raise Exception("Node '%s' does have public key '%s' of node
>> '%s'"
>> >+ % (name, key, uuid))
>>
>> This change is somewhat confusing, and seems to me unrelated to the
patch.
>> The method used to return True/False, but after that change it's either
>> True
>> or an exception. So I'd either be in favor of True/False or
None/exception.
>> And currently all the tests in the backend_unittest use
assertTrue/False on
>> the result.
>>
>
>I agree. I realized for debugging any problems, I not only need to know
>that some node is not configured correctly, but which node. Therefore I
>started throwing some exceptions to add additional information. I realized
>later that this is kinda inconsistent. I fixed this for all occurrences in
>patch:
>
>[PATCH stable-2.13 14/15] SSH file manager: properly name assertion
function
>
>Therefore, can you let this slip here as it is not breaking anything and
>just a little bending of semantics?
OK, if it's fixed later in the series, that's fine.
>
>
>>
>> > return True
>> >
>> > def PotentialMasterCandidatesOnlyHavePublicKey(self,
query_node_name):
>> >@@ -342,6 +362,14 @@ class FakeSshFileManager(object):
>> > of SSH keys. No actual key files of any node is touched.
>> >
>> > """
>> >+ if node in self._max_retries:
>> >+ if node not in self._retries:
>> >+ self._retries[node] = 0
>> >+ self._retries[node] += 1
>> >+ if self._retries[node] < self._max_retries[node]:
>> >+ raise errors.OpExecError("(Fake) SSH connection to node '%s'
>> failed."
>> >+ % node)
>> >+
>> > assert base_cmd == pathutils.SSH_UPDATE
>> >
>> > if constants.SSHS_SSH_AUTHORIZED_KEYS in data:
>> >--
>> >2.2.0.rc0.207.ga3a616c
>> >
>>
>
>Interdiff for the max_retries constant:
>
> diff --git a/lib/backend.py b/lib/backend.py
>index 818f3fb..fd30bd8 100644
>--- a/lib/backend.py
>+++ b/lib/backend.py
>@@ -1520,10 +1520,8 @@ def AddNodeSshKey(node_uuid, node_name,
> _InitSshUpdateData(base_data, noded_cert_file, ssconf_store)
> cluster_name = base_data[constants.SSHS_CLUSTER_NAME]
>
>- # number of retries for SSH connections
>- max_retries = 3
>-
> # Update the target node itself
>+ logging.debug("Updating SSH key files of target node '%s'.", node_name)
> if get_public_keys:
> node_data = {}
> _InitSshUpdateData(node_data, noded_cert_file, ssconf_store)
>@@ -1532,7 +1530,7 @@ def AddNodeSshKey(node_uuid, node_name,
> (constants.SSHS_OVERRIDE, all_keys)
>
> last_exception = None
>- for _ in range(max_retries):
>+ for _ in range(constants.SSHS_MAX_RETRIES):
> try:
> run_cmd_fn(cluster_name, node_name, pathutils.SSH_UPDATE,
> ssh_port_map.get(node_name), node_data,
>diff --git a/src/Ganeti/Constants.hs b/src/Ganeti/Constants.hs
>index 01bf435..eddadf3 100644
>--- a/src/Ganeti/Constants.hs
>+++ b/src/Ganeti/Constants.hs
>@@ -4563,6 +4563,9 @@ sshsSshPublicKeys = "public_keys"
> sshsNodeDaemonCertificate :: String
> sshsNodeDaemonCertificate = "node_daemon_certificate"
>
>+sshsMaxRetries :: Integer
>+sshsMaxRetries = 3
Just nitpicking, the name doesn't make it clear to which SSH communication
the number applies. So perhaps name it something like
renewCryptoSShMaxRetries? Perhaps also add a comment to the constant.
No need to resend.
>+
> sshsAdd :: String
> sshsAdd = "add"
>
>diff --git a/test/py/ganeti.backend_unittest.py b/test/py/
>ganeti.backend_unittest.py
>index cc65501..288ed15 100755
>--- a/test/py/ganeti.backend_unittest.py
>+++ b/test/py/ganeti.backend_unittest.py
>@@ -1412,7 +1412,8 @@ class
>TestAddRemoveGenerateNodeSshKey(testutils.GanetiTestCase):
> new_node_name, new_node_uuid, new_node_key,
> is_potential_master_candidate, is_master_candidate,
> is_master)
>- self._ssh_file_manager.SetMaxRetries(new_node_name, 3)
>+ self._ssh_file_manager.SetMaxRetries(
>+ new_node_name, constants.SSHS_MAX_RETRIES)
>
> backend.AddNodeSshKey(new_node_uuid, new_node_name,
> self._potential_master_candidates,
>@@ -1442,7 +1443,8 @@ class
>TestAddRemoveGenerateNodeSshKey(testutils.GanetiTestCase):
> new_node_name, new_node_uuid, new_node_key,
> is_potential_master_candidate, is_master_candidate,
> is_master)
>- self._ssh_file_manager.SetMaxRetries(new_node_name, 4)
>+ self._ssh_file_manager.SetMaxRetries(
>+ new_node_name, constants.SSHS_MAX_RETRIES + 1)
>
> self.assertRaises(
> errors.SshUpdateError, backend.AddNodeSshKey, new_node_uuid,
the interdiff LGTM, thanks