From: Adeodato Simo <[email protected]>

With this change, LUClusterVerify is split in two: LUClusterVerifyConfig, a
"light" LU that only verifies the global config and other master-only
settings (such as the cluster certificates), and LUClusterVerifyGroup, which
does the bulk of node/instance verification but only acts on the nodes and
instances of a given group.

To ensure that `gnt-cluster verify` continues to operate on the whole
cluster, the client creates an OpClusterVerifyGroup job per node group; for
convenience, the list of node groups is returned by LUClusterVerifyConfig.

Signed-off-by: Adeodato Simo <[email protected]>
---
 lib/client/gnt_cluster.py      |   32 +++++++++---
 lib/cmdlib.py                  |  110 ++++++++++++++++++++++++++++++++--------
 lib/opcodes.py                 |   17 ++++++-
 test/ganeti.cmdlib_unittest.py |    4 +-
 4 files changed, 130 insertions(+), 33 deletions(-)

diff --git a/lib/client/gnt_cluster.py b/lib/client/gnt_cluster.py
index 3baabf0..c5eebf7 100644
--- a/lib/client/gnt_cluster.py
+++ b/lib/client/gnt_cluster.py
@@ -453,17 +453,33 @@ def VerifyCluster(opts, args):
   @return: the desired exit code
 
   """
+  simulate = opts.simulate_errors
   skip_checks = []
+
+  # Verify cluster config.
+  op = opcodes.OpClusterVerifyConfig(verbose=opts.verbose,
+                                     error_codes=opts.error_codes,
+                                     debug_simulate_errors=simulate)
+
+  success, all_groups = SubmitOpCode(op, opts=opts)
+
   if opts.skip_nplusone_mem:
     skip_checks.append(constants.VERIFY_NPLUSONE_MEM)
-  op = opcodes.OpClusterVerify(skip_checks=skip_checks,
-                               verbose=opts.verbose,
-                               error_codes=opts.error_codes,
-                               debug_simulate_errors=opts.simulate_errors)
-  if SubmitOpCode(op, opts=opts):
-    return 0
-  else:
-    return 1
+
+  jex = JobExecutor(opts=opts, verbose=False)
+
+  for group in all_groups:
+    op = opcodes.OpClusterVerifyGroup(group_name=group,
+                                      skip_checks=skip_checks,
+                                      verbose=opts.verbose,
+                                      error_codes=opts.error_codes,
+                                      debug_simulate_errors=simulate)
+    jex.QueueJob('group ' + group, op)
+
+  results = jex.GetResults()
+  success &= compat.all(r[1][0] for r in results)
+
+  return (not success and 1 or 0)
 
 
 def VerifyDisks(opts, args):
diff --git a/lib/cmdlib.py b/lib/cmdlib.py
index c26411f..808e26a 100644
--- a/lib/cmdlib.py
+++ b/lib/cmdlib.py
@@ -1254,7 +1254,7 @@ class LUClusterDestroy(LogicalUnit):
 
 
 def _VerifyCertificate(filename):
-  """Verifies a certificate for LUClusterVerify.
+  """Verifies a certificate for LUClusterVerifyConfig.
 
   @type filename: string
   @param filename: Path to PEM file
@@ -1264,7 +1264,7 @@ def _VerifyCertificate(filename):
     cert = OpenSSL.crypto.load_certificate(OpenSSL.crypto.FILETYPE_PEM,
                                            utils.ReadFile(filename))
   except Exception, err: # pylint: disable-msg=W0703
-    return (LUClusterVerify.ETYPE_ERROR,
+    return (LUClusterVerifyConfig.ETYPE_ERROR,
             "Failed to load X509 certificate %s: %s" % (filename, err))
 
   (errcode, msg) = \
@@ -1279,9 +1279,9 @@ def _VerifyCertificate(filename):
   if errcode is None:
     return (None, fnamemsg)
   elif errcode == utils.CERT_WARNING:
-    return (LUClusterVerify.ETYPE_WARNING, fnamemsg)
+    return (LUClusterVerifyConfig.ETYPE_WARNING, fnamemsg)
   elif errcode == utils.CERT_ERROR:
-    return (LUClusterVerify.ETYPE_ERROR, fnamemsg)
+    return (LUClusterVerifyConfig.ETYPE_ERROR, fnamemsg)
 
   raise errors.ProgrammerError("Unhandled certificate error code %r" % errcode)
 
@@ -1368,10 +1368,43 @@ class _VerifyErrors(object):
       self.bad = self.bad or cond
 
 
-class LUClusterVerify(LogicalUnit, _VerifyErrors):
-  """Verifies the cluster status.
+class LUClusterVerifyConfig(NoHooksLU, _VerifyErrors):
+  """Verifies the cluster config.
 
   """
+
+  REQ_BGL = False
+
+  def ExpandNames(self):
+    self.all_group_info = self.cfg.GetAllNodeGroupsInfo()
+    self.needed_locks = {}
+
+  def Exec(self, feedback_fn):
+    """Verify the cluster config and the cluster certificate files.
+
+    """
+    self.bad = False
+    self._feedback_fn = feedback_fn
+
+    feedback_fn("* Verifying cluster config")
+
+    for msg in self.cfg.VerifyConfig():
+      self._ErrorIf(True, self.ECLUSTERCFG, None, msg)
+
+    feedback_fn("* Verifying cluster certificate files")
+
+    for cert_filename in constants.ALL_CERT_FILES:
+      (errcode, msg) = _VerifyCertificate(cert_filename)
+      self._ErrorIf(errcode, self.ECLUSTERCERT, None, msg, code=errcode)
+
+    return (not self.bad, [g.name for g in self.all_group_info.values()])
+
+
+class LUClusterVerifyGroup(LogicalUnit, _VerifyErrors):
+  """Verifies the status of a node group.
+
+  """
+
   HPATH = "cluster-verify"
   HTYPE = constants.HTYPE_CLUSTER
   REQ_BGL = False
@@ -1429,19 +1462,62 @@ class LUClusterVerify(LogicalUnit, _VerifyErrors):
       self.oslist = {}
 
   def ExpandNames(self):
+    # This raises errors.OpPrereqError on its own:
+    self.group_uuid = self.cfg.LookupNodeGroup(self.op.group_name)
+
+    all_node_info = self.cfg.GetAllNodesInfo()
+    all_inst_info = self.cfg.GetAllInstancesInfo()
+
+    node_names = set(node.name
+                     for node in all_node_info.values()
+                     if node.group == self.group_uuid)
+
+    inst_names = [inst.name
+                  for inst in all_inst_info.values()
+                  if inst.primary_node in node_names]
+
     self.needed_locks = {
-      locking.LEVEL_NODE: locking.ALL_SET,
-      locking.LEVEL_INSTANCE: locking.ALL_SET,
+      locking.LEVEL_NODEGROUP: [self.group_uuid],
+      locking.LEVEL_NODE: list(node_names),
+      locking.LEVEL_INSTANCE: inst_names,
     }
+
     self.share_locks = dict.fromkeys(locking.LEVELS, 1)
 
   def CheckPrereq(self):
     self.all_node_info = self.cfg.GetAllNodesInfo()
     self.all_inst_info = self.cfg.GetAllInstancesInfo()
-    self.my_node_names = utils.NiceSort(list(self.all_node_info))
-    self.my_node_info = self.all_node_info
-    self.my_inst_names = utils.NiceSort(list(self.all_inst_info))
-    self.my_inst_info = self.all_inst_info
+
+    group_nodes = set(node.name
+                      for node in self.all_node_info.values()
+                      if node.group == self.group_uuid)
+
+    group_instances = set(inst.name
+                          for inst in self.all_inst_info.values()
+                          if inst.primary_node in group_nodes)
+
+    unlocked_nodes = \
+        group_nodes.difference(self.glm.list_owned(locking.LEVEL_NODE))
+
+    unlocked_instances = \
+        group_instances.difference(self.glm.list_owned(locking.LEVEL_INSTANCE))
+
+    if unlocked_nodes:
+      raise errors.OpPrereqError("missing lock for nodes: %s" %
+                                 utils.CommaJoin(unlocked_nodes))
+
+    if unlocked_instances:
+      raise errors.OpPrereqError("missing lock for instances: %s" %
+                                 utils.CommaJoin(unlocked_instances))
+
+    self.my_node_names = utils.NiceSort(group_nodes)
+    self.my_inst_names = utils.NiceSort(group_instances)
+
+    self.my_node_info = dict((name, self.all_node_info[name])
+                             for name in self.my_node_names)
+
+    self.my_inst_info = dict((name, self.all_inst_info[name])
+                             for name in self.my_inst_names)
 
   def _VerifyNode(self, ninfo, nresult):
     """Perform some basic validation on data returned from a node.
@@ -2241,7 +2317,7 @@ class LUClusterVerify(LogicalUnit, _VerifyErrors):
     return ([], self.my_node_names)
 
   def Exec(self, feedback_fn):
-    """Verify integrity of cluster, performing various test on nodes.
+    """Verify integrity of the node group, performing various tests on nodes.
 
     """
     # This method has too many local variables. pylint: disable-msg=R0914
@@ -2249,14 +2325,6 @@ class LUClusterVerify(LogicalUnit, _VerifyErrors):
     _ErrorIf = self._ErrorIf # pylint: disable-msg=C0103
     verbose = self.op.verbose
     self._feedback_fn = feedback_fn
-    feedback_fn("* Verifying global settings")
-    for msg in self.cfg.VerifyConfig():
-      _ErrorIf(True, self.ECLUSTERCFG, None, msg)
-
-    # Check the cluster certificates
-    for cert_filename in constants.ALL_CERT_FILES:
-      (errcode, msg) = _VerifyCertificate(cert_filename)
-      _ErrorIf(errcode, self.ECLUSTERCERT, None, msg, code=errcode)
 
     vg_name = self.cfg.GetVGName()
     drbd_helper = self.cfg.GetDRBDHelper()
diff --git a/lib/opcodes.py b/lib/opcodes.py
index 1946284..976288e 100644
--- a/lib/opcodes.py
+++ b/lib/opcodes.py
@@ -509,8 +509,19 @@ class OpClusterQuery(OpCode):
   """Query cluster information."""
 
 
-class OpClusterVerify(OpCode):
-  """Verify the cluster state.
+class OpClusterVerifyConfig(OpCode):
+  """Verify the cluster config.
+
+  """
+  OP_PARAMS = [
+    ("verbose", False, ht.TBool, None),
+    ("error_codes", False, ht.TBool, None),
+    ("debug_simulate_errors", False, ht.TBool, None),
+    ]
+
+
+class OpClusterVerifyGroup(OpCode):
+  """Run verify on a node group from the cluster.
 
   @type skip_checks: C{list}
   @ivar skip_checks: steps to be skipped from the verify process; this
@@ -519,7 +530,9 @@ class OpClusterVerify(OpCode):
                      only L{constants.VERIFY_NPLUSONE_MEM} can be passed
 
   """
+  OP_DSC_FIELD = "group_name"
   OP_PARAMS = [
+    ("group_name", ht.NoDefault, ht.TNonEmptyString, None),
     ("skip_checks", ht.EmptyList,
      ht.TListOf(ht.TElemOf(constants.VERIFY_OPTIONAL_CHECKS)), None),
     ("verbose", False, ht.TBool, None),
diff --git a/test/ganeti.cmdlib_unittest.py b/test/ganeti.cmdlib_unittest.py
index 65ac193..b44c547 100755
--- a/test/ganeti.cmdlib_unittest.py
+++ b/test/ganeti.cmdlib_unittest.py
@@ -57,12 +57,12 @@ class TestCertVerification(testutils.GanetiTestCase):
     nonexist_filename = os.path.join(self.tmpdir, "does-not-exist")
 
     (errcode, msg) = cmdlib._VerifyCertificate(nonexist_filename)
-    self.assertEqual(errcode, cmdlib.LUClusterVerify.ETYPE_ERROR)
+    self.assertEqual(errcode, cmdlib.LUClusterVerifyConfig.ETYPE_ERROR)
 
     # Try to load non-certificate file
     invalid_cert = self._TestDataFilename("bdev-net.txt")
     (errcode, msg) = cmdlib._VerifyCertificate(invalid_cert)
-    self.assertEqual(errcode, cmdlib.LUClusterVerify.ETYPE_ERROR)
+    self.assertEqual(errcode, cmdlib.LUClusterVerifyConfig.ETYPE_ERROR)
 
 
 class TestOpcodeParams(testutils.GanetiTestCase):
-- 
1.7.2.5

Reply via email to