On Thu, Sep 17, 2009 at 9:42 AM, Iustin Pop <[email protected]> wrote:
>
> One of the issues we have in ganeti is that it's very hard to test the
> error-handling paths; QA and burnin only test the OK code-path, since
> it's hard to simulate errors.
>
> LUVerifyCluster is special amongst the LUs in the fact that a) it has a
> lot of error paths and b) the error paths only log the error, they don't
> do any rollback or other similar actions. Thus, it's enough for this LU
> to separate the testing of the error condition from the logging of the
> error condition.
>
> This patch does this by replacing code blocks of the form:
>
>  if x:
>    log_error()
>    [y]
>
> into:
>
>  log_error_if(x)
>  [if x:
>    y
>  ]
>
> After this change, it's simple enough to turn on logging of all errors
> by adding a special case inside log_error_if such that if the incoming
> opcode has a special ‘debug_simulate_errors’ attribute and it's true, it
> will log the error unconditionally.
>
> Surprisingly this also turns into an absolute code reduction, since some
> of the if blocks were simplified. The only downside to this patch is
> that the various _VerifyX() functions are now stateful (modifying an
> attribute on the LU instance) instead of returning a boolean result.
>
> Last note: yes, this discovered some error cases in the logging.
> ---
>  lib/cli.py          |    6 +
>  lib/cmdlib.py       |  357 
> ++++++++++++++++++++++++---------------------------
>  lib/opcodes.py      |    3 +-
>  scripts/gnt-cluster |    5 +-
>  4 files changed, 177 insertions(+), 194 deletions(-)
>
> diff --git a/lib/cli.py b/lib/cli.py
> index 46510c3..26a4b4d 100644
> --- a/lib/cli.py
> +++ b/lib/cli.py
> @@ -46,6 +46,7 @@ __all__ = [
>   # Command line options
>   "CONFIRM_OPT",
>   "DEBUG_OPT",
> +  "DEBUG_SIMERR_OPT",
>   "FIELDS_OPT",
>   "FORCE_OPT",
>   "NOHDR_OPT",
> @@ -464,6 +465,11 @@ VERBOSE_OPT = cli_option("-v", "--verbose", 
> default=False,
>                          action="store_true",
>                          help="Increase the verbosity of the operation")
>
> +DEBUG_SIMERR_OPT = cli_option("--debug-simulate-errors", default=False,
> +                              action="store_true", dest="simulate_errors",
> +                              help="Debugging option that makes the 
> operation"
> +                              " treat most runtime checks as failed")
> +
>
>  def _ParseArgs(argv, commands, aliases):
>   """Parser for the command line arguments.
> diff --git a/lib/cmdlib.py b/lib/cmdlib.py
> index 0309a75..eeae2b9 100644
> --- a/lib/cmdlib.py
> +++ b/lib/cmdlib.py
> @@ -858,7 +858,7 @@ class LUVerifyCluster(LogicalUnit):
>   """
>   HPATH = "cluster-verify"
>   HTYPE = constants.HTYPE_CLUSTER
> -  _OP_REQP = ["skip_checks", "verbose", "error_codes"]
> +  _OP_REQP = ["skip_checks", "verbose", "error_codes", 
> "debug_simulate_errors"]
>   REQ_BGL = False
>
>   TCLUSTER = "cluster"
> @@ -918,6 +918,17 @@ class LUVerifyCluster(LogicalUnit):
>     # and finally report it via the feedback_fn
>     self._feedback_fn("  - %s" % msg)
>
> +  def _ErrorIf(self, cond, *args, **kwargs):
> +    """Log an error message if the passed condition is True.
> +
> +    """
> +    cond = bool(cond) or self.op.debug_simulate_errors
> +    if cond:
> +      self._Error(*args, **kwargs)
> +    # do not mark the operation as failed for WARN cases only
> +    if kwargs.get("code", "ERROR") == "ERROR":

Can you use (even internal) constants here?

> +      self.bad = self.bad or cond
> +
>   def _VerifyNode(self, nodeinfo, file_list, local_cksum,
>                   node_result, master_files, drbd_map, vg_name):
>     """Run multiple tests against a node.
> @@ -942,138 +953,125 @@ class LUVerifyCluster(LogicalUnit):
>
>     """
>     node = nodeinfo.name
> +    _ErrorIf = self._ErrorIf
>
>     # main result, node_result should be a non-empty dict
> -    if not node_result or not isinstance(node_result, dict):
> -      self._Error(self.ENODERPC, node,
> +    test = not node_result or not isinstance(node_result, dict)
> +    _ErrorIf(test, self.ENODERPC, node,
>                   "unable to verify node: no data returned")
> -      return True
> +    if test:
> +      return
>
>     # compares ganeti version
>     local_version = constants.PROTOCOL_VERSION
>     remote_version = node_result.get('version', None)
> -    if not (remote_version and isinstance(remote_version, (list, tuple)) and
> -            len(remote_version) == 2):
> -      self._Error(self.ENODERPC, node,
> -                  "connection to node returned invalid data")
> -      return True
> -
> -    if local_version != remote_version[0]:
> -      self._Error(self.ENODEVERSION, node,
> -                  "incompatible protocol versions: master %s,"
> -                  " node %s", local_version, remote_version[0])
> -      return True
> +    test = not (remote_version and
> +                isinstance(remote_version, (list, tuple)) and
> +                len(remote_version) == 2)
> +    _ErrorIf(test, self.ENODERPC, node,
> +             "connection to node returned invalid data")
> +    if test:
> +      return
>
> -    # node seems compatible, we can actually try to look into its results
> +    test = local_version != remote_version[0]
> +    _ErrorIf(test, self.ENODEVERSION, node,
> +             "incompatible protocol versions: master %s,"
> +             " node %s", local_version, remote_version[0])
> +    if test:
> +      return
>
> -    bad = False
> +    # node seems compatible, we can actually try to look into its results
>
>     # full package version
> -    if constants.RELEASE_VERSION != remote_version[1]:
> -      self._Error(self.ENODEVERSION, node,
> +    self._ErrorIf(constants.RELEASE_VERSION != remote_version[1],
> +                  self.ENODEVERSION, node,
>                   "software version mismatch: master %s, node %s",
> -                  constants.RELEASE_VERSION, remote_version[1],
> -                  code="WARNING")
> +                  constants.RELEASE_VERSION, remote_version[1], 
> code="WARNING")
>
>     # checks vg existence and size > 20G
>     if vg_name is not None:
>       vglist = node_result.get(constants.NV_VGLIST, None)
> -      if not vglist:
> -        self._Error(self.ENODELVM, node, "unable to check volume groups")
> -        bad = True
> -      else:
> +      test = not vglist
> +      _ErrorIf(test, self.ENODELVM, node, "unable to check volume groups")
> +      if not test:
>         vgstatus = utils.CheckVolumeGroupSize(vglist, vg_name,
>                                               constants.MIN_VG_SIZE)
> -        if vgstatus:
> -          self._Error(self.ENODELVM, self.TNODE, node, vgstatus)
> -          bad = True
> +        _ErrorIf(vgstatus, self.ENODELVM, node, vgstatus)
>
>     # checks config file checksum
>
>     remote_cksum = node_result.get(constants.NV_FILELIST, None)
> -    if not isinstance(remote_cksum, dict):
> -      bad = True
> -      self._Error(self.ENODEFILECHECK, node,
> -                  "node hasn't returned file checksum data")
> -    else:
> +    test = not isinstance(remote_cksum, dict)
> +    _ErrorIf(test, self.ENODEFILECHECK, node,
> +             "node hasn't returned file checksum data")
> +    if not test:
>       for file_name in file_list:
>         node_is_mc = nodeinfo.master_candidate
> -        must_have_file = file_name not in master_files
> -        if file_name not in remote_cksum:
> -          if node_is_mc or must_have_file:
> -            bad = True
> -            self._Error(self.ENODEFILECHECK, node,
> -                        "file '%s' missing", file_name)
> -        elif remote_cksum[file_name] != local_cksum[file_name]:
> -          if node_is_mc or must_have_file:
> -            bad = True
> -            self._Error(self.ENODEFILECHECK, node,
> -                        "file '%s' has wrong checksum", file_name)
> -          else:
> -            # not candidate and this is not a must-have file
> -            bad = True
> -            self._Error(self.ENODEFILECHECK, node,
> -                        "file '%s' should not exist on non master"
> -                        " candidates (and the file is outdated)", file_name)
> -        else:
> -          # all good, except non-master/non-must have combination
> -          if not node_is_mc and not must_have_file:
> -            self._Error(self.ENODEFILECHECK, node, "file '%s' should not 
> exist"
> -                        " on non master candidates", file_name)
> +        must_have = (file_name not in master_files) or node_is_mc
> +        # missing
> +        test1 = file_name not in remote_cksum
> +        # invalid checksum
> +        test2 = not test1 and remote_cksum[file_name] != 
> local_cksum[file_name]
> +        # existing and good
> +        test3 = not test1 and remote_cksum[file_name] == 
> local_cksum[file_name]
> +        _ErrorIf(test1 and must_have, self.ENODEFILECHECK, node,
> +                 "file '%s' missing", file_name)
> +        _ErrorIf(test2 and must_have, self.ENODEFILECHECK, node,
> +                 "file '%s' has wrong checksum", file_name)
> +        # not candidate and this is not a must-have file
> +        _ErrorIf(test2 and not must_have, self.ENODEFILECHECK, node,
> +                 "file '%s' should not exist on non master"
> +                 " candidates (and the file is outdated)", file_name)
> +        # all good, except non-master/non-must have combination
> +        _ErrorIf(test3 and not must_have, self.ENODEFILECHECK, node,
> +                 "file '%s' should not exist"
> +                 " on non master candidates", file_name)
>
>     # checks ssh to any
>
> -    if constants.NV_NODELIST not in node_result:
> -      bad = True
> -      self._Error(self.ENODESSH, node,
> -                  "node hasn't returned node ssh connectivity data")
> -    else:
> +    test = constants.NV_NODELIST not in node_result
> +    _ErrorIf(test, self.ENODESSH, node,
> +             "node hasn't returned node ssh connectivity data")
> +    if not test:
>       if node_result[constants.NV_NODELIST]:
> -        bad = True
>         for a_node, a_msg in node_result[constants.NV_NODELIST].items():
> -          self._Error(self.ENODESSH, node,
> -                      "ssh communication with node '%s': %s", a_node, a_msg)
> +          _ErrorIf(True, self.ENODESSH, node,
> +                   "ssh communication with node '%s': %s", a_node, a_msg)
>
> -    if constants.NV_NODENETTEST not in node_result:
> -      bad = True
> -      self._Error(self.ENODENET, node,
> -                  "node hasn't returned node tcp connectivity data")
> -    else:
> +    test = constants.NV_NODENETTEST not in node_result
> +    _ErrorIf(test, self.ENODENET, node,
> +             "node hasn't returned node tcp connectivity data")
> +    if not test:
>       if node_result[constants.NV_NODENETTEST]:
> -        bad = True
>         nlist = utils.NiceSort(node_result[constants.NV_NODENETTEST].keys())
>         for anode in nlist:
> -          self._Error(self.ENODENET, node,
> -                      "tcp communication with node '%s': %s",
> -                      anode, node_result[constants.NV_NODENETTEST][anode])
> +          _ErrorIf(True, self.ENODENET, node,
> +                   "tcp communication with node '%s': %s",
> +                   anode, node_result[constants.NV_NODENETTEST][anode])
>
>     hyp_result = node_result.get(constants.NV_HYPERVISOR, None)
>     if isinstance(hyp_result, dict):
>       for hv_name, hv_result in hyp_result.iteritems():
> -        if hv_result is not None:
> -          self._Error(self.ENODEHV, node,
> -                      "hypervisor %s verify failure: '%s'", hv_name, 
> hv_result)
> +        test = hv_result is not None
> +        _ErrorIf(test, self.ENODEHV, node,
> +                 "hypervisor %s verify failure: '%s'", hv_name, hv_result)
>
>     # check used drbd list
>     if vg_name is not None:
>       used_minors = node_result.get(constants.NV_DRBDLIST, [])
> -      if not isinstance(used_minors, (tuple, list)):
> -        self._Error(self.ENODEDRBD, node,
> -                    "cannot parse drbd status file: %s", str(used_minors))
> -      else:
> +      test = not isinstance(used_minors, (tuple, list))
> +      _ErrorIf(test, self.ENODEDRBD, node,
> +               "cannot parse drbd status file: %s", str(used_minors))
> +      if not test:
>         for minor, (iname, must_exist) in drbd_map.items():
> -          if minor not in used_minors and must_exist:
> -            self._Error(self.ENODEDRBD, node,
> -                        "drbd minor %d of instance %s is not active",
> -                        minor, iname)
> -            bad = True
> +          test = minor not in used_minors and must_exist
> +          _ErrorIf(test, self.ENODEDRBD, node,
> +                   "drbd minor %d of instance %s is not active",
> +                   minor, iname)
>         for minor in used_minors:
> -          if minor not in drbd_map:
> -            self._Error(self.ENODEDRBD, node,
> -                        "unallocated drbd minor %d is in use", minor)
> -            bad = True
> -
> -    return bad
> +          test = minor not in drbd_map
> +          _ErrorIf(test, self.ENODEDRBD, node,
> +                   "unallocated drbd minor %d is in use", minor)
>
>   def _VerifyInstance(self, instance, instanceconfig, node_vol_is,
>                       node_instance, n_offline):
> @@ -1083,8 +1081,7 @@ class LUVerifyCluster(LogicalUnit):
>     available on the instance's node.
>
>     """
> -    bad = False
> -
> +    _ErrorIf = self._ErrorIf
>     node_current = instanceconfig.primary_node
>
>     node_vol_should = {}
> @@ -1095,28 +1092,23 @@ class LUVerifyCluster(LogicalUnit):
>         # ignore missing volumes on offline nodes
>         continue
>       for volume in node_vol_should[node]:
> -        if node not in node_vol_is or volume not in node_vol_is[node]:
> -          self._Error(self.EINSTANCEMISSINGDISK, instance,
> -                      "volume %s missing on node %s", volume, node)
> -          bad = True
> +        test = node not in node_vol_is or volume not in node_vol_is[node]
> +        _ErrorIf(test, self.EINSTANCEMISSINGDISK, instance,
> +                 "volume %s missing on node %s", volume, node)
>
>     if instanceconfig.admin_up:
> -      if ((node_current not in node_instance or
> -          not instance in node_instance[node_current]) and
> -          node_current not in n_offline):
> -        self._Error(self.EINSTANCEDOWN, instance,
> -                    "instance not running on its primary node %s",
> -                    node_current)
> -        bad = True
> +      test = ((node_current not in node_instance or
> +               not instance in node_instance[node_current]) and
> +              node_current not in n_offline)
> +      _ErrorIf(test, self.EINSTANCEDOWN, instance,
> +               "instance not running on its primary node %s",
> +               node_current)
>
>     for node in node_instance:
>       if (not node == node_current):
> -        if instance in node_instance[node]:
> -          self._Error(self.EINSTANCEWRONGNODE, instance,
> -                      "instance should not run on node %s", node)
> -          bad = True
> -
> -    return bad
> +        test = instance in node_instance[node]
> +        _ErrorIf(test, self.EINSTANCEWRONGNODE, instance,
> +                 "instance should not run on node %s", node)
>
>   def _VerifyOrphanVolumes(self, node_vol_should, node_vol_is):
>     """Verify if there are any unknown volumes in the cluster.
> @@ -1125,15 +1117,12 @@ class LUVerifyCluster(LogicalUnit):
>     reported as unknown.
>
>     """
> -    bad = False
> -
>     for node in node_vol_is:
>       for volume in node_vol_is[node]:
> -        if node not in node_vol_should or volume not in 
> node_vol_should[node]:
> -          self._Error(self.ENODEORPHANLV, node,
> +        test = (node not in node_vol_should or
> +                volume not in node_vol_should[node])
> +        self._ErrorIf(test, self.ENODEORPHANLV, node,
>                       "volume %s is unknown", volume)
> -          bad = True
> -    return bad
>
>   def _VerifyOrphanInstances(self, instancelist, node_instance):
>     """Verify the list of running instances.
> @@ -1141,14 +1130,11 @@ class LUVerifyCluster(LogicalUnit):
>     This checks what instances are running but unknown to the cluster.
>
>     """
> -    bad = False
>     for node in node_instance:
>       for o_inst in node_instance[node]:
> -        if o_inst not in instancelist:
> -          self._Error(self.ENODEORPHANINSTANCE, node,
> +        test = o_inst not in instancelist
> +        self._ErrorIf(test, self.ENODEORPHANINSTANCE, node,
>                       "instance %s on node %s should not exist", o_inst, node)
> -          bad = True
> -    return bad
>
>   def _VerifyNPlusOneMemory(self, node_info, instance_cfg):
>     """Verify N+1 Memory Resilience.
> @@ -1157,8 +1143,6 @@ class LUVerifyCluster(LogicalUnit):
>     was primary for.
>
>     """
> -    bad = False
> -
>     for node, nodeinfo in node_info.iteritems():
>       # This code checks that every node which is now listed as secondary has
>       # enough memory to host all instances it is supposed to should a single
> @@ -1174,12 +1158,10 @@ class LUVerifyCluster(LogicalUnit):
>           bep = self.cfg.GetClusterInfo().FillBE(instance_cfg[instance])
>           if bep[constants.BE_AUTO_BALANCE]:
>             needed_mem += bep[constants.BE_MEMORY]
> -        if nodeinfo['mfree'] < needed_mem:
> -          self._Error(self.ENODEN1, node,
> +        test = nodeinfo['mfree'] < needed_mem
> +        self._ErrorIf(test, self.ENODEN1, node,
>                       "not enough memory on to accommodate"
>                       " failovers should peer node %s fail", prinode)
> -          bad = True
> -    return bad
>
>   def CheckPrereq(self):
>     """Check prerequisites.
> @@ -1212,12 +1194,13 @@ class LUVerifyCluster(LogicalUnit):
>     """Verify integrity of cluster, performing various test on nodes.
>
>     """
> -    bad = False
> +    self.bad = False
> +    _ErrorIf = self._ErrorIf
>     verbose = self.op.verbose
>     self._feedback_fn = feedback_fn
>     feedback_fn("* Verifying global settings")
>     for msg in self.cfg.VerifyConfig():
> -      self._Error(self.ECLUSTERCFG, None, msg)
> +      _ErrorIf(True, self.ECLUSTERCFG, None, msg)
>
>     vg_name = self.cfg.GetVGName()
>     hypervisors = self.cfg.GetClusterInfo().enabled_hypervisors
> @@ -1293,57 +1276,55 @@ class LUVerifyCluster(LogicalUnit):
>         feedback_fn("* Verifying node %s (%s)" % (node, ntype))
>
>       msg = all_nvinfo[node].fail_msg
> +      _ErrorIf(msg, self.ENODERPC, node, "while contacting node: %s", msg)
>       if msg:
> -        self._Error(self.ENODERPC, node, "while contacting node: %s", msg)
> -        bad = True
>         continue
>
>       nresult = all_nvinfo[node].payload
>       node_drbd = {}
>       for minor, instance in all_drbd_map[node].items():
> -        if instance not in instanceinfo:
> -          self._Error(self.ECLUSTERCFG, None,
> -                      "ghost instance '%s' in temporary DRBD map", instance)
> +        test = instance not in instanceinfo
> +        _ErrorIf(test, self.ECLUSTERCFG, None,
> +                 "ghost instance '%s' in temporary DRBD map", instance)
>           # ghost instance should not be running, but otherwise we
>           # don't give double warnings (both ghost instance and
>           # unallocated minor in use)
> +        if test:
>           node_drbd[minor] = (instance, False)
>         else:
>           instance = instanceinfo[instance]
>           node_drbd[minor] = (instance.name, instance.admin_up)
> -      result = self._VerifyNode(node_i, file_names, local_checksums,
> -                                nresult, master_files, node_drbd, vg_name)
> -      bad = bad or result
> +      self._VerifyNode(node_i, file_names, local_checksums,
> +                       nresult, master_files, node_drbd, vg_name)
>
>       lvdata = nresult.get(constants.NV_LVLIST, "Missing LV data")
>       if vg_name is None:
>         node_volume[node] = {}
>       elif isinstance(lvdata, basestring):
> -        self._Error(self.ENODELVM, node, "LVM problem on node: %s",
> -                    utils.SafeEncode(lvdata))
> -        bad = True
> +        _ErrorIf(True, self.ENODELVM, node, "LVM problem on node: %s",
> +                 utils.SafeEncode(lvdata))
>         node_volume[node] = {}
>       elif not isinstance(lvdata, dict):
> -        self._Error(self.ENODELVM, node, "rpc call to node failed (lvlist)")
> -        bad = True
> +        _ErrorIf(True, self.ENODELVM, node, "rpc call to node failed 
> (lvlist)")
>         continue
>       else:
>         node_volume[node] = lvdata
>
>       # node_instance
>       idata = nresult.get(constants.NV_INSTANCELIST, None)
> -      if not isinstance(idata, list):
> -        self._Error(self.ENODEHV, "rpc call to node failed (instancelist)")
> -        bad = True
> +      test = not isinstance(idata, list)
> +      _ErrorIf(test, self.ENODEHV, node,
> +               "rpc call to node failed (instancelist)")
> +      if test:
>         continue
>
>       node_instance[node] = idata
>
>       # node_info
>       nodeinfo = nresult.get(constants.NV_HVINFO, None)
> -      if not isinstance(nodeinfo, dict):
> -        self._Error(self.ENODEHV, node, "rpc call to node failed (hvinfo)")
> -        bad = True
> +      test = not isinstance(nodeinfo, dict)
> +      _ErrorIf(test, self.ENODEHV, node, "rpc call to node failed (hvinfo)")
> +      if test:
>         continue
>
>       try:
> @@ -1361,18 +1342,17 @@ class LUVerifyCluster(LogicalUnit):
>         }
>         # FIXME: devise a free space model for file based instances as well
>         if vg_name is not None:
> -          if (constants.NV_VGLIST not in nresult or
> -              vg_name not in nresult[constants.NV_VGLIST]):
> -            self._Error(self.ENODELVM, node,
> -                        "node didn't return data for the volume group '%s'"
> -                        " - it is either missing or broken", vg_name)
> -            bad = True
> +          test = (constants.NV_VGLIST not in nresult or
> +                  vg_name not in nresult[constants.NV_VGLIST])
> +          _ErrorIf(test, self.ENODELVM, node,
> +                   "node didn't return data for the volume group '%s'"
> +                   " - it is either missing or broken", vg_name)
> +          if test:
>             continue
>           node_info[node]["dfree"] = 
> int(nresult[constants.NV_VGLIST][vg_name])
>       except (ValueError, KeyError):
> -        self._Error(self.ENODERPC, node,
> -                    "node returned invalid nodeinfo, check lvm/hypervisor")
> -        bad = True
> +        _ErrorIf(True, self.ENODERPC, node,
> +                 "node returned invalid nodeinfo, check lvm/hypervisor")
>         continue
>
>     node_vol_should = {}
> @@ -1382,9 +1362,8 @@ class LUVerifyCluster(LogicalUnit):
>       if verbose:
>         feedback_fn("* Verifying instance %s" % instance)
>       inst_config = instanceinfo[instance]
> -      result =  self._VerifyInstance(instance, inst_config, node_volume,
> -                                     node_instance, n_offline)
> -      bad = bad or result
> +      self._VerifyInstance(instance, inst_config, node_volume,
> +                           node_instance, n_offline)
>       inst_nodes_offline = []
>
>       inst_config.MapLVsByNode(node_vol_should)
> @@ -1392,12 +1371,11 @@ class LUVerifyCluster(LogicalUnit):
>       instance_cfg[instance] = inst_config
>
>       pnode = inst_config.primary_node
> +      _ErrorIf(pnode not in node_info and pnode not in n_offline,
> +               self.ENODERPC, pnode, "instance %s, connection to"
> +               " primary node failed", instance)
>       if pnode in node_info:
>         node_info[pnode]['pinst'].append(instance)
> -      elif pnode not in n_offline:
> -        self._Error(self.ENODERPC, pnode, "instance %s, connection to"
> -                    " primary node failed", instance)
> -        bad = True
>
>       if pnode in n_offline:
>         inst_nodes_offline.append(pnode)
> @@ -1409,46 +1387,42 @@ class LUVerifyCluster(LogicalUnit):
>       # FIXME: does not support file-backed instances
>       if len(inst_config.secondary_nodes) == 0:
>         i_non_redundant.append(instance)
> -      elif len(inst_config.secondary_nodes) > 1:
> -        self._Error(self.EINSTANCELAYOUT, instance,
> -                    "instance has multiple secondary nodes", code="WARNING")
> +      _ErrorIf(len(inst_config.secondary_nodes) > 1,
> +               self.EINSTANCELAYOUT, instance,
> +               "instance has multiple secondary nodes", code="WARNING")
>
>       if not cluster.FillBE(inst_config)[constants.BE_AUTO_BALANCE]:
>         i_non_a_balanced.append(instance)
>
>       for snode in inst_config.secondary_nodes:
> +        _ErrorIf(snode not in node_info and snode not in n_offline,
> +                 self.ENODERPC, snode,
> +                 "instance %s, connection to secondary node"
> +                 "failed", instance)
> +
>         if snode in node_info:
>           node_info[snode]['sinst'].append(instance)
>           if pnode not in node_info[snode]['sinst-by-pnode']:
>             node_info[snode]['sinst-by-pnode'][pnode] = []
>           node_info[snode]['sinst-by-pnode'][pnode].append(instance)
> -        elif snode not in n_offline:
> -          self._Error(self.ENODERPC, snode,
> -                      "instance %s, connection to secondary node"
> -                      "failed", instance)
> -          bad = True
> +
>         if snode in n_offline:
>           inst_nodes_offline.append(snode)
>
> -      if inst_nodes_offline:
> -        # warn that the instance lives on offline nodes, and set bad=True
> -        self._Error(self.EINSTANCEBADNODE, instance,
> -                    "instance lives on offline node(s) %s",
> -                    ", ".join(inst_nodes_offline))
> -        bad = True
> +      # warn that the instance lives on offline nodes
> +      _ErrorIf(inst_nodes_offline, self.EINSTANCEBADNODE, instance,
> +               "instance lives on offline node(s) %s",
> +               ", ".join(inst_nodes_offline))
>
>     feedback_fn("* Verifying orphan volumes")
> -    result = self._VerifyOrphanVolumes(node_vol_should, node_volume)
> -    bad = bad or result
> +    self._VerifyOrphanVolumes(node_vol_should, node_volume)
>
>     feedback_fn("* Verifying remaining instances")
> -    result = self._VerifyOrphanInstances(instancelist, node_instance)
> -    bad = bad or result
> +    self._VerifyOrphanInstances(instancelist, node_instance)
>
>     if constants.VERIFY_NPLUSONE_MEM not in self.skip_set:
>       feedback_fn("* Verifying N+1 Memory redundancy")
> -      result = self._VerifyNPlusOneMemory(node_info, instance_cfg)
> -      bad = bad or result
> +      self._VerifyNPlusOneMemory(node_info, instance_cfg)
>
>     feedback_fn("* Other Notes")
>     if i_non_redundant:
> @@ -1465,7 +1439,7 @@ class LUVerifyCluster(LogicalUnit):
>     if n_drained:
>       feedback_fn("  - NOTICE: %d drained node(s) found." % len(n_drained))
>
> -    return not bad
> +    return not self.bad
>
>   def HooksCallBack(self, phase, hooks_results, feedback_fn, lu_result):
>     """Analyze the post-hooks' result
> @@ -1494,18 +1468,19 @@ class LUVerifyCluster(LogicalUnit):
>         show_node_header = True
>         res = hooks_results[node_name]
>         msg = res.fail_msg
> -        if msg:
> -          if res.offline:
> -            # no need to warn or set fail return value
> -            continue
> -          self._Error(self.ENODEHOOKS, node_name,
> +        test = msg and not res.offline
> +        self._ErrorIf(test, self.ENODEHOOKS, node_name,
>                       "Communication failure in hooks execution: %s", msg)
> +        if test:
> +          # override manually lu_result here as _ErrorIf only
> +          # overrides self.bad
>           lu_result = 1
>           continue
>         for script, hkr, output in res.payload:
> -          if hkr == constants.HKR_FAIL:
> -            self._Error(self.ENODEHOOKS, node_name,
> +          test = hkr == constants.HKR_FAIL
> +          self._ErrorIf(test, self.ENODEHOOKS, node_name,
>                         "Script %s failed, output:", script)
> +          if test:
>             output = indent_re.sub('      ', output)
>             feedback_fn("%s" % output)
>             lu_result = 1
> diff --git a/lib/opcodes.py b/lib/opcodes.py
> index 24d7bd2..9fea1a1 100644
> --- a/lib/opcodes.py
> +++ b/lib/opcodes.py
> @@ -209,7 +209,8 @@ class OpVerifyCluster(OpCode):
>
>   """
>   OP_ID = "OP_CLUSTER_VERIFY"
> -  __slots__ = OpCode.__slots__ + ["skip_checks", "verbose", "error_codes"]
> +  __slots__ = OpCode.__slots__ + ["skip_checks", "verbose", "error_codes",
> +                                  "debug_simulate_errors"]
>
>
>  class OpVerifyDisks(OpCode):
> diff --git a/scripts/gnt-cluster b/scripts/gnt-cluster
> index 8dfa179..fb34cca 100755
> --- a/scripts/gnt-cluster
> +++ b/scripts/gnt-cluster
> @@ -340,7 +340,8 @@ def VerifyCluster(opts, args):
>     skip_checks.append(constants.VERIFY_NPLUSONE_MEM)
>   op = opcodes.OpVerifyCluster(skip_checks=skip_checks,
>                                verbose=opts.verbose,
> -                               error_codes=opts.error_codes)
> +                               error_codes=opts.error_codes,
> +                               debug_simulate_errors=opts.simulate_errors)
>   if SubmitOpCode(op):
>     return 0
>   else:
> @@ -669,7 +670,7 @@ commands = {
>                   "Forces a push of the configuration file and ssconf files"
>                   " to the nodes in the cluster"),
>   'verify': (VerifyCluster, ARGS_NONE,
> -             [DEBUG_OPT, VERBOSE_OPT,
> +             [DEBUG_OPT, VERBOSE_OPT, DEBUG_SIMERR_OPT,
>               cli_option("--error-codes", dest="error_codes",
>                          help="Enable parseable error messages",
>                          action="store_true", default=False),

LGTM for the rest,

Thanks,

Guido

Reply via email to