Run these tests by master candidates only. In larger clusters, these
test were effectively making a DoS attack on the master node.
LUClusterVerify() calls node_verify on each node, that would run
TCPPing on noded's port for everyone in the nodegroup + master ip. This
was causing timeouts on the master node being unable to receive
node_verify RPC. Reducing this test to master candidates can reduce the
number of connections to noded on the master node by a factor of 8.

Signed-off-by: Viktor Bachraty <[email protected]>
---
 lib/backend.py                     | 103 ++++++++++++++++++++++++++-----------
 lib/cmdlib/cluster/verify.py       |  59 +++++++++++----------
 test/py/ganeti.backend_unittest.py |  53 +++++++++++++++++--
 3 files changed, 155 insertions(+), 60 deletions(-)

diff --git a/lib/backend.py b/lib/backend.py
index 6c61ebe..c770e33 100644
--- a/lib/backend.py
+++ b/lib/backend.py
@@ -1104,6 +1104,74 @@ def _VerifySshClutter(node_status_list, my_name):
 
   return result
 
+def VerifyNodeNetTest(my_name, test_config):
+  """Verify nodes are reachable.
+
+  @type my_name: string
+  @param my_name: name of the node this test is running on
+
+  @type test_config: tuple (node_list, master_candidate_list)
+  @param test_config: configuration for test as passed from
+      LUClusterVerify() in what[constants.NV_NODENETTEST]
+
+  @rtype: dict
+  @return: a dictionary with node names as keys and error messages
+      as values
+  """
+  result = {}
+  nodes, master_candidates = test_config
+  port = netutils.GetDaemonPort(constants.NODED)
+
+  if my_name not in master_candidates:
+    return result
+
+  my_pip = my_sip = None
+  for name, pip, sip in nodes:
+    if name == my_name:
+      my_pip = pip
+      my_sip = sip
+      break
+  if not my_pip:
+    result[my_name] = ("Can't find my own primary/secondary IP"
+                       " in the node list")
+    return result
+
+  for name, pip, sip in nodes:
+    fail = []
+    if not netutils.TcpPing(pip, port, source=my_pip):
+      fail.append("primary")
+      if sip != pip:
+        if not netutils.TcpPing(sip, port, source=my_sip):
+          fail.append("secondary")
+      if fail:
+        result[name] = ("failure using the %s interface(s)" %
+                        " and ".join(fail))
+  return result
+
+def VerifyMasterIP(my_name, test_config):
+  """Verify master IP is reachable.
+
+  @type my_name: string
+  @param my_name: name of the node this test is running on
+
+  @type test_config: tuple (master_name, master_up, master_candidates)
+  @param test_config: configuration for test as passed from
+      LUClusterVerify() in what[constants.NV_MASTERIP]
+
+  @rtype: bool or None
+  @return: Boolean test result, None if skipped
+  """
+  master_name, master_ip, master_candidates = test_config
+  port = netutils.GetDaemonPort(constants.NODED)
+  if my_name not in master_candidates:
+    return None
+
+  if master_name == my_name:
+    source = constants.IP4_ADDRESS_LOCALHOST
+  else:
+    source = None
+  return netutils.TcpPing(master_ip, port, source=source)
+
 
 def VerifyNode(what, cluster_name, all_hvparams):
   """Verify the status of the local node.
@@ -1140,7 +1208,6 @@ def VerifyNode(what, cluster_name, all_hvparams):
   """
   result = {}
   my_name = netutils.Hostname.GetSysName()
-  port = netutils.GetDaemonPort(constants.NODED)
   vm_capable = my_name not in what.get(constants.NV_NONVMNODES, [])
 
   _VerifyHypervisors(what, vm_capable, result, all_hvparams)
@@ -1193,38 +1260,12 @@ def VerifyNode(what, cluster_name, all_hvparams):
     result[constants.NV_NODELIST] = val
 
   if constants.NV_NODENETTEST in what:
-    result[constants.NV_NODENETTEST] = tmp = {}
-    my_pip = my_sip = None
-    for name, pip, sip in what[constants.NV_NODENETTEST]:
-      if name == my_name:
-        my_pip = pip
-        my_sip = sip
-        break
-    if not my_pip:
-      tmp[my_name] = ("Can't find my own primary/secondary IP"
-                      " in the node list")
-    else:
-      for name, pip, sip in what[constants.NV_NODENETTEST]:
-        fail = []
-        if not netutils.TcpPing(pip, port, source=my_pip):
-          fail.append("primary")
-        if sip != pip:
-          if not netutils.TcpPing(sip, port, source=my_sip):
-            fail.append("secondary")
-        if fail:
-          tmp[name] = ("failure using the %s interface(s)" %
-                       " and ".join(fail))
+    result[constants.NV_NODENETTEST] = VerifyNodeNetTest(
+        my_name, what[constants.NV_NODENETTEST])
 
   if constants.NV_MASTERIP in what:
-    # FIXME: add checks on incoming data structures (here and in the
-    # rest of the function)
-    master_name, master_ip = what[constants.NV_MASTERIP]
-    if master_name == my_name:
-      source = constants.IP4_ADDRESS_LOCALHOST
-    else:
-      source = None
-    result[constants.NV_MASTERIP] = netutils.TcpPing(master_ip, port,
-                                                     source=source)
+    result[constants.NV_MASTERIP] = VerifyMasterIP(
+        my_name, what[constants.NV_MASTERIP])
 
   if constants.NV_USERSCRIPTS in what:
     result[constants.NV_USERSCRIPTS] = \
diff --git a/lib/cmdlib/cluster/verify.py b/lib/cmdlib/cluster/verify.py
index 4ef5a2e..868f7a6 100644
--- a/lib/cmdlib/cluster/verify.py
+++ b/lib/cmdlib/cluster/verify.py
@@ -139,7 +139,7 @@ class _VerifyErrors(object):
             log_type, object_type, object_name, msg))
 
     # Report messages via the feedback_fn
-    self._feedback_fn(constants.ELOG_MESSAGE_LIST, prefixed_list) # pylint: 
disable=E1101,C0302
+    self._feedback_fn(constants.ELOG_MESSAGE_LIST, prefixed_list) # pylint: 
disable=E1101,C0301
 
     # do not mark the operation as failed for WARN cases only
     if log_type == self.ETYPE_ERROR:
@@ -748,27 +748,26 @@ class LUClusterVerifyGroup(LogicalUnit, _VerifyErrors):
           self._ErrorIf(True, constants.CV_ENODESSH, ninfo.name,
                         "ssh communication with node '%s': %s", a_node, a_msg)
 
-    test = constants.NV_NODENETTEST not in nresult
-    self._ErrorIf(test, constants.CV_ENODENET, ninfo.name,
-                  "node hasn't returned node tcp connectivity data")
-    if not test:
-      if nresult[constants.NV_NODENETTEST]:
-        nlist = utils.NiceSort(nresult[constants.NV_NODENETTEST].keys())
-        for anode in nlist:
-          self._ErrorIf(True, constants.CV_ENODENET, ninfo.name,
-                        "tcp communication with node '%s': %s",
-                        anode, nresult[constants.NV_NODENETTEST][anode])
-
-    test = constants.NV_MASTERIP not in nresult
-    self._ErrorIf(test, constants.CV_ENODENET, ninfo.name,
-                  "node hasn't returned node master IP reachability data")
-    if not test:
-      if not nresult[constants.NV_MASTERIP]:
-        if ninfo.uuid == self.master_node:
-          msg = "the master node cannot reach the master IP (not configured?)"
-        else:
-          msg = "cannot reach the master IP"
-        self._ErrorIf(True, constants.CV_ENODENET, ninfo.name, msg)
+    if constants.NV_NODENETTEST not in nresult:
+      self._ErrorMsg(constants.CV_ENODENET, ninfo.name,
+                     "node hasn't returned node tcp connectivity data")
+    elif nresult[constants.NV_NODENETTEST]:
+      nlist = utils.NiceSort(nresult[constants.NV_NODENETTEST].keys())
+      msglist = []
+      for node in nlist:
+        msglist.append("tcp communication with node '%s': %s" %
+                       (node, nresult[constants.NV_NODENETTEST][node]))
+      self._ErrorMsgList(constants.CV_ENODENET, ninfo.name, msglist)
+
+    if constants.NV_MASTERIP not in nresult:
+      self._ErrorMsg(constants.CV_ENODENET, ninfo.name,
+                    "node hasn't returned node master IP reachability data")
+    elif nresult[constants.NV_MASTERIP] == False: # be explicit, could be None
+      if ninfo.uuid == self.master_node:
+        msg = "the master node cannot reach the master IP (not configured?)"
+      else:
+        msg = "cannot reach the master IP"
+      self._ErrorMsg(constants.CV_ENODENET, ninfo.name, msg)
 
   def _VerifyInstance(self, instance, node_image, diskstatus):
     """Verify an instance.
@@ -1874,12 +1873,20 @@ class LUClusterVerifyGroup(LogicalUnit, _VerifyErrors):
     master_node_uuid = self.master_node = self.cfg.GetMasterNode()
     master_ip = self.cfg.GetMasterIP()
 
+    online_master_candidates = sorted(
+        node.name for node in node_data_list
+        if (node.master_candidate and not node.offline))
+
     feedback_fn("* Gathering data (%d nodes)" % len(self.my_node_uuids))
 
     user_scripts = []
     if self.cfg.GetUseExternalMipScript():
       user_scripts.append(pathutils.EXTERNAL_MASTER_SETUP_SCRIPT)
 
+    online_nodes = [(node.name, node.primary_ip, node.secondary_ip)
+                    for node in node_data_list if not node.offline]
+    node_nettest_params = (online_nodes, online_master_candidates)
+
     node_verify_param = {
       constants.NV_FILELIST:
         map(vcluster.MakeVirtualPath,
@@ -1892,15 +1899,14 @@ class LUClusterVerifyGroup(LogicalUnit, _VerifyErrors):
       constants.NV_HYPERVISOR: hypervisors,
       constants.NV_HVPARAMS:
         _GetAllHypervisorParameters(cluster, self.all_inst_info.values()),
-      constants.NV_NODENETTEST: [(node.name, node.primary_ip, 
node.secondary_ip)
-                                 for node in node_data_list
-                                 if not node.offline],
+      constants.NV_NODENETTEST: node_nettest_params,
       constants.NV_INSTANCELIST: hypervisors,
       constants.NV_VERSION: None,
       constants.NV_HVINFO: self.cfg.GetHypervisorType(),
       constants.NV_NODESETUP: None,
       constants.NV_TIME: None,
-      constants.NV_MASTERIP: (self.cfg.GetMasterNodeName(), master_ip),
+      constants.NV_MASTERIP: (self.cfg.GetMasterNodeName(), master_ip,
+                              online_master_candidates),
       constants.NV_OSLIST: None,
       constants.NV_NONVMNODES: self.cfg.GetNonVmCapableNodeNameList(),
       constants.NV_USERSCRIPTS: user_scripts,
@@ -2029,6 +2035,7 @@ class LUClusterVerifyGroup(LogicalUnit, _VerifyErrors):
       # locking the configuration for something but very fast, pure operations.
       cluster_name = self.cfg.GetClusterName()
       hvparams = self.cfg.GetClusterInfo().hvparams
+
       all_nvinfo = self.rpc.call_node_verify(self.my_node_uuids,
                                              node_verify_param,
                                              cluster_name,
diff --git a/test/py/ganeti.backend_unittest.py 
b/test/py/ganeti.backend_unittest.py
index e1a599c..897fcba 100755
--- a/test/py/ganeti.backend_unittest.py
+++ b/test/py/ganeti.backend_unittest.py
@@ -133,18 +133,31 @@ class TestNodeVerify(testutils.GanetiTestCase):
 
   def testMasterIPLocalhost(self):
     # this a real functional test, but requires localhost to be reachable
+    my_name = netutils.Hostname.GetSysName()
+    local_data = (my_name,constants.IP4_ADDRESS_LOCALHOST, [my_name])
+    result = backend.VerifyNode({constants.NV_MASTERIP: local_data},
+                                None, {})
+    self.failUnless(constants.NV_MASTERIP in result,
+                    "Master IP data not returned")
+    self.failUnless(result[constants.NV_MASTERIP],
+                    "Cannot reach localhost")
+
+  def testMasterIPSkipTest(self):
+    # this a real functional test, but requires localhost to be reachable
     local_data = (netutils.Hostname.GetSysName(),
-                  constants.IP4_ADDRESS_LOCALHOST)
+                  constants.IP4_ADDRESS_LOCALHOST, [])
     result = backend.VerifyNode({constants.NV_MASTERIP: local_data},
                                 None, {})
     self.failUnless(constants.NV_MASTERIP in result,
                     "Master IP data not returned")
-    self.failUnless(result[constants.NV_MASTERIP], "Cannot reach localhost")
+    self.failUnless(result[constants.NV_MASTERIP] == None,
+                    "Test ran by non master candidate")
 
   def testMasterIPUnreachable(self):
     # Network 192.0.2.0/24 is reserved for test/documentation as per
     # RFC 5737
-    bad_data =  ("master.example.com", "192.0.2.1")
+    my_name = "master.example.com"
+    bad_data =  (my_name, "192.0.2.1", [my_name])
     # we just test that whatever TcpPing returns, VerifyNode returns too
     netutils.TcpPing = lambda a, b, source=None: False
     result = backend.VerifyNode({constants.NV_MASTERIP: bad_data},
@@ -154,6 +167,40 @@ class TestNodeVerify(testutils.GanetiTestCase):
     self.failIf(result[constants.NV_MASTERIP],
                 "Result from netutils.TcpPing corrupted")
 
+  def testVerifyNodeNetTestMissingSelf(self):
+    my_name = netutils.Hostname.GetSysName()
+    local_data = ([('n1.test.com', "any", "any")], [my_name])
+    result = backend.VerifyNode({constants.NV_NODENETTEST: local_data},
+                                None, {})
+
+    self.failUnless(constants.NV_NODENETTEST in result,
+                    "NodeNetTest data not returned")
+    self.failUnless(my_name in result[constants.NV_NODENETTEST],
+                    "Missing failure in net test")
+
+  def testVerifyNodeNetTest(self):
+    my_name = netutils.Hostname.GetSysName()
+    local_data = ([(my_name, "any", "any")], [my_name])
+
+    # we just test that whatever TcpPing returns, VerifyNode returns too
+    netutils.TcpPing = lambda a, b, source=None: True
+    result = backend.VerifyNode({constants.NV_NODENETTEST: local_data},
+                                None, {})
+
+    self.failUnless(constants.NV_NODENETTEST in result,
+                    "NodeNetTest data not returned")
+    self.failUnless(result[constants.NV_NODENETTEST] == {},
+                    "NodeNetTest failed")
+
+  def testVerifyNodeNetSkipTest(self):
+    local_data = ([('n1.test.com', "any", "any")], [])
+    result = backend.VerifyNode({constants.NV_NODENETTEST: local_data},
+                                None, {})
+    self.failUnless(constants.NV_NODENETTEST in result,
+                    "NodeNetTest data not returned")
+    self.failUnless(result[constants.NV_NODENETTEST] == {},
+                    "Test ran by non master candidate")
+
   def testVerifyHvparams(self):
     test_hvparams = {constants.HV_XEN_CMD: constants.XEN_CMD_XL}
     test_what = {constants.NV_HVPARAMS: \
-- 
2.8.0.rc3.226.g39d4020

Reply via email to