LGTM overall; your call whether to apply my comment regarding the message
prefix used when raising master errors.

On Thu, Apr 23, 2015 at 12:49 PM, Helga Velroyen <[email protected]> wrote:

> Hi!
>
> On Wed, 22 Apr 2015 at 18:41 Hrvoje Ribicic <[email protected]> wrote:
>
>> On Fri, Apr 17, 2015 at 4:37 PM, 'Helga Velroyen' via ganeti-devel <
>> [email protected]> wrote:
>>
>>> This patch introduces retries for updating non-master nodes
>>> when a new SSH key is added to the cluster. If an SSH update
>>> fails even after the maximum number of retries is exhausted,
>>> this does not raise an exception, but an error message is
>>> added to the result of the method, which can be displayed
>>> in the LU operations so that the admin sees which nodes
>>> were the problem and need further attention.
>>>
>>> Signed-off-by: Helga Velroyen <[email protected]>
>>> ---
>>>  lib/backend.py                     |  73 +++++++++++++++++++--------
>>>  lib/cmdlib/cluster.py              |   9 +++-
>>>  lib/cmdlib/node.py                 |  14 ++++-
>>>  test/py/ganeti.backend_unittest.py | 101
>>> ++++++++++++++++++++++++++++++++++++-
>>>  4 files changed, 169 insertions(+), 28 deletions(-)
>>>
>>> diff --git a/lib/backend.py b/lib/backend.py
>>> index 818f3fb..3b17306 100644
>>> --- a/lib/backend.py
>>> +++ b/lib/backend.py
>>> @@ -1566,17 +1566,38 @@ def AddNodeSshKey(node_uuid, node_name,
>>>    master_node = ssconf_store.GetMasterNode()
>>>    online_nodes = ssconf_store.GetOnlineNodeList()
>>>
>>> +  node_errors = {}
>>
>>    for node in all_nodes:
>>>      if node == master_node:
>>> +      logging.debug("Skipping master node '%s'.", master_node)
>>>        continue
>>>      if node not in online_nodes:
>>>        logging.debug("Skipping offline node '%s'.", node)
>>>        continue
>>>      if node in potential_master_candidates:
>>> -      run_cmd_fn(cluster_name, node, pathutils.SSH_UPDATE,
>>> -                 ssh_port_map.get(node), pot_mc_data,
>>> -                 debug=False, verbose=False, use_cluster_key=False,
>>> -                 ask_key=False, strict_host_check=False)
>>> +      logging.debug("Updating SSH key files of node '%s'.")
>>> +      for i in range(max_retries):
>>> +        try:
>>> +          run_cmd_fn(cluster_name, node, pathutils.SSH_UPDATE,
>>> +                     ssh_port_map.get(node), pot_mc_data,
>>> +                     debug=False, verbose=False, use_cluster_key=False,
>>> +                     ask_key=False, strict_host_check=False)
>>> +          break
>>> +        except errors.OpExecError as e:
>>> +          logging.error("Updating SSH key files of node '%s' failed"
>>> +                        " at try no %s. Error: %s.", node, i, e)
>>> +          last_exception = e
>>> +      else:
>>> +        if last_exception:
>>> +          error_msg = ("When adding the key of node '%s', updating SSH
>>> key"
>>> +                       " files of node '%s' failed after %s retries."
>>> +                       " Not trying again. Last errors was: %s." %
>>>
>>
>> Nitpick: s/errors/error/
>>
>
> ACK.
>
>
>>
>>
>>> +                       (node, node_name, max_retries, last_exception))
>>
>> +          node_errors[node] = error_msg
>>> +          # We only log the error and don't throw an exception, because
>>> +          # one unreachable node shall not abort the entire procedure.
>>> +          logging.error(error_msg)
>>> +
>>>      else:
>>>        if to_authorized_keys:
>>>          run_cmd_fn(cluster_name, node, pathutils.SSH_UPDATE,
>>> @@ -1584,6 +1605,8 @@ def AddNodeSshKey(node_uuid, node_name,
>>>                     debug=False, verbose=False, use_cluster_key=False,
>>>                     ask_key=False, strict_host_check=False)
>>>
>>> +  return node_errors
>>> +
>>>
>>>  def RemoveNodeSshKey(node_uuid, node_name,
>>>                       master_candidate_uuids,
>>> @@ -1925,6 +1948,10 @@ def RenewSshKeys(node_uuids, node_names,
>>> ssh_port_map,
>>>
>>>    master_node_name = ssconf_store.GetMasterNode()
>>>    master_node_uuid = _GetMasterNodeUUID(node_uuid_name_map,
>>> master_node_name)
>>> +  # List of all node errors that happened, but which did not abort the
>>> +  # procedure as a whole. It is important that this is a list to have a
>>> +  # somewhat chronological history of events.
>>> +  all_node_errors = []
>>>
>>>    # process non-master nodes
>>>    for node_uuid, node_name in node_uuid_name_map:
>>> @@ -1989,15 +2016,16 @@ def RenewSshKeys(node_uuids, node_names,
>>> ssh_port_map,
>>>        ssh.AddPublicKey(node_uuid, pub_key, key_file=pub_key_file)
>>>
>>>      logging.debug("Add ssh key of node '%s'.", node_name)
>>> -    AddNodeSshKey(node_uuid, node_name,
>>> -                  potential_master_candidates,
>>> -                  ssh_port_map,
>>> -                  to_authorized_keys=master_candidate,
>>> -                  to_public_keys=potential_master_candidate,
>>> -                  get_public_keys=True,
>>> -                  pub_key_file=pub_key_file, ssconf_store=ssconf_store,
>>> -                  noded_cert_file=noded_cert_file,
>>> -                  run_cmd_fn=run_cmd_fn)
>>> +    node_errors = AddNodeSshKey(
>>> +        node_uuid, node_name, potential_master_candidates,
>>> +        ssh_port_map, to_authorized_keys=master_candidate,
>>> +        to_public_keys=potential_master_candidate,
>>> +        get_public_keys=True,
>>> +        pub_key_file=pub_key_file, ssconf_store=ssconf_store,
>>> +        noded_cert_file=noded_cert_file,
>>> +        run_cmd_fn=run_cmd_fn)
>>> +    if node_errors:
>>> +      all_node_errors = all_node_errors + node_errors.items()
>>>
>>>    # Renewing the master node's key
>>>
>>> @@ -2022,15 +2050,14 @@ def RenewSshKeys(node_uuids, node_names,
>>> ssh_port_map,
>>>
>>>    # Add new master key to all node's public and authorized keys
>>>    logging.debug("Add new master key to all nodes.")
>>> -  AddNodeSshKey(master_node_uuid, master_node_name,
>>> -                potential_master_candidates,
>>> -                ssh_port_map,
>>> -                to_authorized_keys=True,
>>> -                to_public_keys=True,
>>> -                get_public_keys=False,
>>> -                pub_key_file=pub_key_file, ssconf_store=ssconf_store,
>>> -                noded_cert_file=noded_cert_file,
>>> -                run_cmd_fn=run_cmd_fn)
>>> +  node_errors = AddNodeSshKey(
>>> +      master_node_uuid, master_node_name, potential_master_candidates,
>>> +      ssh_port_map, to_authorized_keys=True, to_public_keys=True,
>>> +      get_public_keys=False, pub_key_file=pub_key_file,
>>> +      ssconf_store=ssconf_store, noded_cert_file=noded_cert_file,
>>> +      run_cmd_fn=run_cmd_fn)
>>> +  if node_errors:
>>> +    all_node_errors = all_node_errors + node_errors.items()
>>>
>>>    # Remove the old key file and rename the new key to the non-temporary
>>> filename
>>>    _ReplaceMasterKeyOnMaster(root_keyfiles)
>>> @@ -2053,6 +2080,8 @@ def RenewSshKeys(node_uuids, node_names,
>>> ssh_port_map,
>>>                     clear_authorized_keys=False,
>>>                     clear_public_keys=False)
>>>
>>> +  return all_node_errors
>>> +
>>>
>>>  def GetBlockDevSizes(devices):
>>>    """Return the size of the given block devices
>>> diff --git a/lib/cmdlib/cluster.py b/lib/cmdlib/cluster.py
>>> index c504d60..14c18da 100644
>>> --- a/lib/cmdlib/cluster.py
>>> +++ b/lib/cmdlib/cluster.py
>>> @@ -210,7 +210,7 @@ class LUClusterRenewCrypto(NoHooksLU):
>>>      self.cfg.RemoveNodeFromCandidateCerts("%s-SERVER" % master_uuid)
>>>      self.cfg.RemoveNodeFromCandidateCerts("%s-OLDMASTER" % master_uuid)
>>>
>>> -  def _RenewSshKeys(self):
>>> +  def _RenewSshKeys(self, feedback_fn):
>>>      """Renew all nodes' SSH keys.
>>>
>>>      """
>>> @@ -230,6 +230,11 @@ class LUClusterRenewCrypto(NoHooksLU):
>>>        master_candidate_uuids,
>>>        potential_master_candidates)
>>>      result[master_uuid].Raise("Could not renew the SSH keys of all
>>> nodes")
>>>
>>
>> Just out of curiosity: under which conditions would this actually be
>> raised?
>> If my observation has merit, maybe you can change the message to reflect
>> the unexpectedness of the failure.
>>
>
> Well, that can have several reasons, the most obvious ones are:
> - key files on the master not reachable,
> - the keys that are supposed to be renewed are not in the public key file
> (that is a security mechanism),
> - generating a new key of a node worked, but then master was not able to
> fetch it.
>

The core of my objection here was that the error message raised here should
be different given the changes in the code.
It makes it seem like there is a problem with the nodes whereas the master
is at fault in one way or another - we should be ignoring node failures.

That said, additional information should be present, so it's your call
whether you modify this or not.


>
>
>>
>> Also, a comment and/or an empty line would be useful here. Otherwise it
>> looks like a bit of a bug.
>> The same is true for the next two entries.
>>
>
> What do you mean by "next two entries"?
>

In the patch series, of a similar structure. Refactoring took care of that.


>
>
>>
>>
>>> +    node_errors = result[master_uuid].payload
>>> +    if node_errors:
>>> +      feedback_fn("Some nodes' SSH key files could not be updated:")
>>> +      for node_name, error_msg in node_errors:
>>> +        feedback_fn("%s: %s" % (node_name, error_msg))
>>>
>>>    def Exec(self, feedback_fn):
>>>      if self.op.node_certificates:
>>> @@ -237,7 +242,7 @@ class LUClusterRenewCrypto(NoHooksLU):
>>>        self._RenewNodeSslCertificates(feedback_fn)
>>>      if self.op.ssh_keys and not self._ssh_renewal_suppressed:
>>>        feedback_fn("Renewing SSH keys")
>>> -      self._RenewSshKeys()
>>> +      self._RenewSshKeys(feedback_fn)
>>>      elif self._ssh_renewal_suppressed:
>>>        feedback_fn("Cannot renew SSH keys if the cluster is configured
>>> to not"
>>>                    " modify the SSH setup.")
>>> diff --git a/lib/cmdlib/node.py b/lib/cmdlib/node.py
>>> index 19b630a..6a13d8a 100644
>>> --- a/lib/cmdlib/node.py
>>> +++ b/lib/cmdlib/node.py
>>> @@ -339,7 +339,7 @@ class LUNodeAdd(LogicalUnit):
>>>        result.Raise("Failed to initialize OpenVSwitch on new node")
>>>
>>>    def _SshUpdate(self, new_node_uuid, new_node_name,
>>> is_master_candidate,
>>> -                 is_potential_master_candidate, rpcrunner, readd):
>>> +                 is_potential_master_candidate, rpcrunner, readd,
>>> feedback_fn):
>>>      """Update the SSH setup of all nodes after adding a new node.
>>>
>>>      @type readd: boolean
>>> @@ -373,6 +373,11 @@ class LUNodeAdd(LogicalUnit):
>>>        is_master_candidate, is_potential_master_candidate,
>>>        is_potential_master_candidate)
>>>      result[master_node].Raise("Could not update the node's SSH setup.")
>>> +    node_errors = result[master_node].payload
>>> +    if node_errors:
>>> +      feedback_fn("Some nodes' SSH key files could not be updated:")
>>> +      for node_name, error_msg in node_errors.items():
>>> +        feedback_fn("%s: %s." % (node_name, error_msg))
>>>
>>
>> Between SSL and the multiple placements, this really is calling out to be
>> a helper function.
>>
>
> Added it. Interdiff see below. This also has an effect on other patches.
> Will send interdiffs there too.
>
>
>>
>>
>>>
>>>    def Exec(self, feedback_fn):
>>>      """Adds the new node to the cluster.
>>> @@ -481,7 +486,7 @@ class LUNodeAdd(LogicalUnit):
>>>        # FIXME: so far, all nodes are considered potential master
>>> candidates
>>>        self._SshUpdate(self.new_node.uuid, self.new_node.name,
>>>                        self.new_node.master_candidate, True,
>>> -                      self.rpc, self.op.readd)
>>> +                      self.rpc, self.op.readd, feedback_fn)
>>>
>>>
>>>  class LUNodeSetParams(LogicalUnit):
>>> @@ -897,6 +902,11 @@ class LUNodeSetParams(LogicalUnit):
>>>            ssh_result[master_node].Raise(
>>>              "Could not update the SSH setup of node '%s' after
>>> promotion"
>>>              " (UUID: %s)." % (node.name, node.uuid))
>>
>> +          node_errors = ssh_result[master_node].payload
>>> +          if node_errors:
>>> +            feedback_fn("Some nodes' SSH key files could not be
>>> updated:")
>>> +            for node_name, error_msg in node_errors.items():
>>> +              feedback_fn("%s: %s." % (node_name, error_msg))
>>
>>
>>>      return result
>>>
>>> diff --git a/test/py/ganeti.backend_unittest.py b/test/py/
>>> ganeti.backend_unittest.py
>>> index cc65501..e05580d 100755
>>> --- a/test/py/ganeti.backend_unittest.py
>>> +++ b/test/py/ganeti.backend_unittest.py
>>> @@ -1400,7 +1400,14 @@ class
>>> TestAddRemoveGenerateNodeSshKey(testutils.GanetiTestCase):
>>>          self.assertTrue(self._ssh_file_manager.NodeHasAuthorizedKey(
>>>              node, node_key))
>>>
>>> -  def testAddKeySuccessfullyWithRetries(self):
>>> +  def testAddKeySuccessfullyOnNewNodeWithRetries(self):
>>> +    """Tests adding a new node's key when updating that node takes
>>> retries.
>>> +
>>> +    This test checks whether adding a new node's key successfully
>>> updates
>>> +    all node's SSH key files, even if updating the new node's key files
>>> itself
>>>
>>
>> Nit: s/node's/nodes'/ for the first one. The second one stays as-is.
>> Possible better formulation:
>>
>
> I think you actually meant the second one. However, I picked this
> alternative formulation:
>

We may have differed in lines.


>
>
>>
>> .. updates the SSH key files of all nodes, even if updating the new
>> node's files ...
>>
>
> See interdiff.
>
>
>
>>
>>
>>> +    takes a couple of retries to succeed.
>>> +
>>> +    """
>>>      new_node_name = "new_node_name"
>>>      new_node_uuid = "new_node_uuid"
>>>      new_node_key = "new_node_key"
>>> @@ -1430,7 +1437,14 @@ class
>>> TestAddRemoveGenerateNodeSshKey(testutils.GanetiTestCase):
>>>      self.assertTrue(self._ssh_file_manager.AllNodesHaveAuthorizedKey(
>>>          new_node_key))
>>>
>>> -  def testAddKeyFailedWithRetries(self):
>>> +  def testAddKeyFailedOnNewNodeWithRetries(self):
>>> +    """Tests clean up if updating a new node's SSH setup fails.
>>> +
>>> +    If adding a new node's keys fails, because the SSH key files of that
>>> +    new node fails, check whether already carried out operations are
>>>
>>
>> Nit: Better formulation: If adding the keys of a new node fails because
>> the SSH key files ...
>>
>
> Ack. See interdiff.
>
>
>>
>>
>>> +    successfully rolled back and thus the state of the cluster is
>>> cleaned up.
>>> +
>>> +    """
>>>      new_node_name = "new_node_name"
>>>      new_node_uuid = "new_node_uuid"
>>>      new_node_key = "new_node_key"
>>> @@ -1466,6 +1480,89 @@ class
>>> TestAddRemoveGenerateNodeSshKey(testutils.GanetiTestCase):
>>>      self.assertTrue(self._ssh_file_manager.NoNodeHasPublicKey(
>>>          new_node_uuid, new_node_key))
>>>
>>> +  def testAddKeySuccessfullyOnOldNodeWithRetries(self):
>>> +    """Tests adding a new key even if updating nodes takes retries.
>>> +
>>> +    This tests whether adding a new node's key successfully finishes,
>>> +    even if one of the other cluster nodes takes a couple of retries
>>> +    to succeed.
>>> +
>>> +    """
>>> +    new_node_name = "new_node_name"
>>> +    new_node_uuid = "new_node_uuid"
>>> +    new_node_key = "new_node_key"
>>> +    is_master_candidate = True
>>> +    is_potential_master_candidate = True
>>> +    is_master = False
>>> +
>>> +    other_node_name, _, _, _, _, _ = self._ssh_file_manager \
>>> +        .GetAllMasterCandidates()[0]
>>>
>>
>> Afaik, this will be fixed in a follow-up patch, right?
>> I second Petr's suggestion of named tuples.
>>
>
> Yes.
>
>
>>
>>
>>> +    self._ssh_file_manager.SetMaxRetries(other_node_name, 3)
>>> +    assert other_node_name != new_node_name
>>> +    self._AddNewNodeToTestData(
>>> +        new_node_name, new_node_uuid, new_node_key,
>>> +        is_potential_master_candidate, is_master_candidate,
>>> +        is_master)
>>> +
>>> +    backend.AddNodeSshKey(new_node_uuid, new_node_name,
>>> +                          self._potential_master_candidates,
>>> +                          self._ssh_port_map,
>>> +                          to_authorized_keys=is_master_candidate,
>>> +                          to_public_keys=is_potential_master_candidate,
>>> +                          get_public_keys=is_potential_master_candidate,
>>> +                          pub_key_file=self._pub_key_file,
>>> +                          ssconf_store=self._ssconf_mock,
>>> +                          noded_cert_file=self.noded_cert_file,
>>> +                          run_cmd_fn=self._run_cmd_mock)
>>> +
>>> +    self.assertTrue(self._ssh_file_manager.AllNodesHaveAuthorizedKey(
>>> +        new_node_key))
>>> +
>>> +  def testAddKeyFailedOnOldNodeWithRetries(self):
>>> +    """Tests adding keys when updating one node's SSH setup fails.
>>> +
>>> +    This tests, whether when adding a new node's key and one node is
>>>
>>
>> Nit: s/,//
>>
>>
>>> +    unreachable (but not marked as offline) the operation still finishes
>>> +    properly and only that unreachable node's SSH key setup did not get
>>> +    updated.
>>> +
>>> +    """
>>> +    new_node_name = "new_node_name"
>>> +    new_node_uuid = "new_node_uuid"
>>> +    new_node_key = "new_node_key"
>>> +    is_master_candidate = True
>>> +    is_potential_master_candidate = True
>>> +    is_master = False
>>> +
>>> +    other_node_name, _, _, _, _, _ = self._ssh_file_manager \
>>> +        .GetAllMasterCandidates()[0]
>>> +    self._ssh_file_manager.SetMaxRetries(other_node_name, 4)
>>> +    assert other_node_name != new_node_name
>>> +    self._AddNewNodeToTestData(
>>> +        new_node_name, new_node_uuid, new_node_key,
>>> +        is_potential_master_candidate, is_master_candidate,
>>> +        is_master)
>>> +
>>> +    node_errors = backend.AddNodeSshKey(
>>> +        new_node_uuid, new_node_name, self._potential_master_candidates,
>>> +        self._ssh_port_map, to_authorized_keys=is_master_candidate,
>>> +        to_public_keys=is_potential_master_candidate,
>>> +        get_public_keys=is_potential_master_candidate,
>>> +        pub_key_file=self._pub_key_file,
>>> +        ssconf_store=self._ssconf_mock,
>>> +        noded_cert_file=self.noded_cert_file,
>>> +        run_cmd_fn=self._run_cmd_mock)
>>> +
>>> +    for node in self._all_nodes:
>>> +      if node == other_node_name:
>>> +        self.assertFalse(self._ssh_file_manager.NodeHasAuthorizedKey(
>>> +          node, new_node_key))
>>> +      else:
>>> +        self.assertTrue(self._ssh_file_manager.NodeHasAuthorizedKey(
>>> +          node, new_node_key))
>>> +
>>> +    self.assertTrue(node_errors[other_node_name])
>>> +
>>>
>>>  class TestVerifySshSetup(testutils.GanetiTestCase):
>>>
>>> --
>>> 2.2.0.rc0.207.ga3a616c
>>>
>>>
> Interdiff:
>
> commit 49ced9fa7adefd1865ca9933d0a5bb5252fc1726
> Author: Helga Velroyen <[email protected]>
> Date:   Thu Apr 23 11:48:40 2015 +0200
>
>     interdiff
>
> diff --git a/lib/backend.py b/lib/backend.py
> index 5a09efa..6a17edb 100644
> --- a/lib/backend.py
> +++ b/lib/backend.py
> @@ -1564,7 +1564,7 @@ def AddNodeSshKey(node_uuid, node_name,
>    master_node = ssconf_store.GetMasterNode()
>    online_nodes = ssconf_store.GetOnlineNodeList()
>
> -  node_errors = {}
> +  node_errors = []
>    for node in all_nodes:
>      if node == master_node:
>        logging.debug("Skipping master node '%s'.", master_node)
> @@ -1589,10 +1589,10 @@ def AddNodeSshKey(node_uuid, node_name,
>          if last_exception:
>            error_msg = ("When adding the key of node '%s', updating SSH
> key"
>                         " files of node '%s' failed after %s retries."
> -                       " Not trying again. Last errors was: %s." %
> +                       " Not trying again. Last error was: %s." %
>                         (node, node_name, constants.SSHS_MAX_RETRIES,
>                          last_exception))
> -          node_errors[node] = error_msg
> +          node_errors.append((node, error_msg))
>            # We only log the error and don't throw an exception, because
>            # one unreachable node shall not abort the entire procedure.
>            logging.error(error_msg)
> @@ -2024,7 +2024,7 @@ def RenewSshKeys(node_uuids, node_names,
> ssh_port_map,
>          noded_cert_file=noded_cert_file,
>          run_cmd_fn=run_cmd_fn)
>      if node_errors:
> -      all_node_errors = all_node_errors + node_errors.items()
> +      all_node_errors = all_node_errors + node_errors
>
>    # Renewing the master node's key
>
> @@ -2056,7 +2056,7 @@ def RenewSshKeys(node_uuids, node_names,
> ssh_port_map,
>        ssconf_store=ssconf_store, noded_cert_file=noded_cert_file,
>        run_cmd_fn=run_cmd_fn)
>    if node_errors:
> -    all_node_errors = all_node_errors + node_errors.items()
> +    all_node_errors = all_node_errors + node_errors
>
>    # Remove the old key file and rename the new key to the non-temporary
> filename
>    _ReplaceMasterKeyOnMaster(root_keyfiles)
> diff --git a/lib/cmdlib/cluster.py b/lib/cmdlib/cluster.py
> index 14c18da..b56ca61 100644
> --- a/lib/cmdlib/cluster.py
> +++ b/lib/cmdlib/cluster.py
> @@ -68,7 +68,8 @@ from ganeti.cmdlib.common import ShareAll, RunPostHook, \
>    CheckDiskAccessModeConsistency, CreateNewClientCert, \
>    AddInstanceCommunicationNetworkOp,
> ConnectInstanceCommunicationNetworkOp, \
>    CheckImageValidity, \
> -  CheckDiskAccessModeConsistency, CreateNewClientCert, EnsureKvmdOnNodes
> +  CheckDiskAccessModeConsistency, CreateNewClientCert, EnsureKvmdOnNodes,
> \
> +  EvaluateSshUpdateRPC
>
>  import ganeti.masterd.instance
>
> @@ -224,17 +225,19 @@ class LUClusterRenewCrypto(NoHooksLU):
>      port_map = ssh.GetSshPortMap(node_names, self.cfg)
>      potential_master_candidates = self.cfg.GetPotentialMasterCandidates()
>      master_candidate_uuids = self.cfg.GetMasterCandidateUuids()
> +
>      result = self.rpc.call_node_ssh_keys_renew(
>        [master_uuid],
>        node_uuids, node_names, port_map,
>        master_candidate_uuids,
>        potential_master_candidates)
> +
> +    # Check if there were serious errors (for example master key files not
> +    # writable).
>      result[master_uuid].Raise("Could not renew the SSH keys of all nodes")
> -    node_errors = result[master_uuid].payload
> -    if node_errors:
> -      feedback_fn("Some nodes' SSH key files could not be updated:")
> -      for node_name, error_msg in node_errors:
> -        feedback_fn("%s: %s" % (node_name, error_msg))
> +
> +    # Process any non-disruptive errors (a few nodes unreachable etc.)
> +    EvaluateSshUpdateRPC(result, master_uuid, feedback_fn)
>
>    def Exec(self, feedback_fn):
>      if self.op.node_certificates:
> diff --git a/lib/cmdlib/common.py b/lib/cmdlib/common.py
> index 21948af..5e5f573 100644
> --- a/lib/cmdlib/common.py
> +++ b/lib/cmdlib/common.py
> @@ -1554,3 +1554,11 @@ def EnsureKvmdOnNodes(lu, feedback_fn, nodes=None):
>      for node_uuid in stop_nodes:
>        results[node_uuid].Warn("Failed to stop KVM daemon in node '%s'" %
>                                node_uuid, feedback_fn)
> +
> +
> +def EvaluateSshUpdateRPC(result, master_uuid, feedback_fn):
> +  node_errors = result[master_uuid].payload
> +  if node_errors:
> +    feedback_fn("Some nodes' SSH key files could not be updated:")
> +    for node_name, error_msg in node_errors:
> +      feedback_fn("%s: %s" % (node_name, error_msg))
> diff --git a/lib/cmdlib/node.py b/lib/cmdlib/node.py
> index 6a13d8a..c90965e 100644
> --- a/lib/cmdlib/node.py
> +++ b/lib/cmdlib/node.py
> @@ -53,7 +53,7 @@ from ganeti.cmdlib.common import CheckParamsNotGlobal, \
>    GetWantedNodes, MapInstanceLvsToNodes, RunPostHook, \
>    FindFaultyInstanceDisks, CheckStorageTypeEnabled, CreateNewClientCert, \
>    AddNodeCertToCandidateCerts, RemoveNodeCertFromCandidateCerts, \
> -  EnsureKvmdOnNodes
> +  EnsureKvmdOnNodes, EvaluateSshUpdateRPC
>  from ganeti.ssh import GetSshPortMap
>
>
> @@ -372,12 +372,9 @@ class LUNodeAdd(LogicalUnit):
>        potential_master_candidates, port_map,
>        is_master_candidate, is_potential_master_candidate,
>        is_potential_master_candidate)
> +
>      result[master_node].Raise("Could not update the node's SSH setup.")
> -    node_errors = result[master_node].payload
> -    if node_errors:
> -      feedback_fn("Some nodes' SSH key files could not be updated:")
> -      for node_name, error_msg in node_errors.items():
> -        feedback_fn("%s: %s." % (node_name, error_msg))
> +    EvaluateSshUpdateRPC(result, master_node, feedback_fn)
>
>    def Exec(self, feedback_fn):
>      """Adds the new node to the cluster.
> @@ -902,11 +899,7 @@ class LUNodeSetParams(LogicalUnit):
>            ssh_result[master_node].Raise(
>              "Could not update the SSH setup of node '%s' after promotion"
>              " (UUID: %s)." % (node.name, node.uuid))
> -          node_errors = ssh_result[master_node].payload
> -          if node_errors:
> -            feedback_fn("Some nodes' SSH key files could not be updated:")
> -            for node_name, error_msg in node_errors.items():
> -              feedback_fn("%s: %s." % (node_name, error_msg))
> +          EvaluateSshUpdateRPC(ssh_result, master_node, feedback_fn)
>
>      return result
>
> diff --git a/test/py/ganeti.backend_unittest.py b/test/py/
> ganeti.backend_unittest.py
> index ec61f07..77b4f46 100755
> --- a/test/py/ganeti.backend_unittest.py
> +++ b/test/py/ganeti.backend_unittest.py
> @@ -1404,8 +1404,8 @@ class
> TestAddRemoveGenerateNodeSshKey(testutils.GanetiTestCase):
>      """Tests adding a new node's key when updating that node takes
> retries.
>
>      This test checks whether adding a new node's key successfully updates
> -    all node's SSH key files, even if updating the new node's key files
> itself
> -    takes a couple of retries to succeed.
> +    the SSH key files of all nodes, even if updating the new node's key
> files
> +    itself takes a couple of retries to succeed.
>
>      """
>      new_node_name = "new_node_name"
> @@ -1441,8 +1441,8 @@ class
> TestAddRemoveGenerateNodeSshKey(testutils.GanetiTestCase):
>    def testAddKeyFailedOnNewNodeWithRetries(self):
>      """Tests clean up if updating a new node's SSH setup fails.
>
> -    If adding a new node's keys fails, because the SSH key files of that
> -    new node fails, check whether already carried out operations are
> +    If adding the keys of a new node fails, because updating the SSH key
> files
> +    of that new node fails, check whether already carried out operations
> are
>      successfully rolled back and thus the state of the cluster is cleaned
> up.
>
>      """
> @@ -1524,7 +1524,7 @@ class
> TestAddRemoveGenerateNodeSshKey(testutils.GanetiTestCase):
>    def testAddKeyFailedOnOldNodeWithRetries(self):
>      """Tests adding keys when updating one node's SSH setup fails.
>
> -    This tests, whether when adding a new node's key and one node is
> +    This tests whether when adding a new node's key and one node is
>      unreachable (but not marked as offline) the operation still finishes
>      properly and only that unreachable node's SSH key setup did not get
>      updated.
> @@ -1565,7 +1565,8 @@ class
> TestAddRemoveGenerateNodeSshKey(testutils.GanetiTestCase):
>          self.assertTrue(self._ssh_file_manager.NodeHasAuthorizedKey(
>            node, new_node_key))
>
> -    self.assertTrue(node_errors[other_node_name])
> +    self.assertTrue([error_msg for (node, error_msg) in node_errors
> +                     if node == other_node_name])
>
>
>  class TestVerifySshSetup(testutils.GanetiTestCase):
>
>
>>
>> Hrvoje Ribicic
>> Ganeti Engineering
>> Google Germany GmbH
>> Dienerstr. 12, 80331, München
>>
>> Registergericht und -nummer: Hamburg, HRB 86891
>> Sitz der Gesellschaft: Hamburg
>> Geschäftsführer: Graham Law, Christine Elizabeth Flores
>> Steuernummer: 48/725/00206
>> Umsatzsteueridentifikationsnummer: DE813741370
>>
>
Hrvoje Ribicic
Ganeti Engineering
Google Germany GmbH
Dienerstr. 12, 80331, München

Registergericht und -nummer: Hamburg, HRB 86891
Sitz der Gesellschaft: Hamburg
Geschäftsführer: Graham Law, Christine Elizabeth Flores
Steuernummer: 48/725/00206
Umsatzsteueridentifikationsnummer: DE813741370

Reply via email to