Run these tests by master candidates only. In larger clusters, these test were effectively making a DoS attack on the master node. LUClusterVerify() calls node_verify on each node, that would run TCPPing on noded's port for everyone in the nodegroup + master ip. This was causing timeouts on the master node being unable to receive node_verify RPC. Reducing this test to master candidates can reduce the number of connections to noded on the master node by a factor of 8.
Signed-off-by: Viktor Bachraty <[email protected]> --- lib/backend.py | 103 ++++++++++++++++++++++++++----------- lib/cmdlib/cluster/verify.py | 59 +++++++++++---------- test/py/ganeti.backend_unittest.py | 53 +++++++++++++++++-- 3 files changed, 155 insertions(+), 60 deletions(-) diff --git a/lib/backend.py b/lib/backend.py index 6c61ebe..c770e33 100644 --- a/lib/backend.py +++ b/lib/backend.py @@ -1104,6 +1104,74 @@ def _VerifySshClutter(node_status_list, my_name): return result +def VerifyNodeNetTest(my_name, test_config): + """Verify nodes are reachable. + + @type my_name: string + @param my_name: name of the node this test is running on + + @type test_config: tuple (node_list, master_candidate_list) + @param test_config: configuration for test as passed from + LUClusterVerify() in what[constants.NV_NODENETTEST] + + @rtype: dict + @return: a dictionary with node names as keys and error messages + as values + """ + result = {} + nodes, master_candidates = test_config + port = netutils.GetDaemonPort(constants.NODED) + + if my_name not in master_candidates: + return result + + my_pip = my_sip = None + for name, pip, sip in nodes: + if name == my_name: + my_pip = pip + my_sip = sip + break + if not my_pip: + result[my_name] = ("Can't find my own primary/secondary IP" + " in the node list") + return result + + for name, pip, sip in nodes: + fail = [] + if not netutils.TcpPing(pip, port, source=my_pip): + fail.append("primary") + if sip != pip: + if not netutils.TcpPing(sip, port, source=my_sip): + fail.append("secondary") + if fail: + result[name] = ("failure using the %s interface(s)" % + " and ".join(fail)) + return result + +def VerifyMasterIP(my_name, test_config): + """Verify master IP is reachable. + + @type my_name: string + @param my_name: name of the node this test is running on + + @type test_config: tuple (master_name, master_up, master_candidates) + @param test_config: configuration for test as passed from + LUClusterVerify() in what[constants.NV_MASTERIP] + + @rtype: bool or None + @return: Boolean test result, None if skipped + """ + master_name, master_ip, master_candidates = test_config + port = netutils.GetDaemonPort(constants.NODED) + if my_name not in master_candidates: + return None + + if master_name == my_name: + source = constants.IP4_ADDRESS_LOCALHOST + else: + source = None + return netutils.TcpPing(master_ip, port, source=source) + def VerifyNode(what, cluster_name, all_hvparams): """Verify the status of the local node. @@ -1140,7 +1208,6 @@ def VerifyNode(what, cluster_name, all_hvparams): """ result = {} my_name = netutils.Hostname.GetSysName() - port = netutils.GetDaemonPort(constants.NODED) vm_capable = my_name not in what.get(constants.NV_NONVMNODES, []) _VerifyHypervisors(what, vm_capable, result, all_hvparams) @@ -1193,38 +1260,12 @@ def VerifyNode(what, cluster_name, all_hvparams): result[constants.NV_NODELIST] = val if constants.NV_NODENETTEST in what: - result[constants.NV_NODENETTEST] = tmp = {} - my_pip = my_sip = None - for name, pip, sip in what[constants.NV_NODENETTEST]: - if name == my_name: - my_pip = pip - my_sip = sip - break - if not my_pip: - tmp[my_name] = ("Can't find my own primary/secondary IP" - " in the node list") - else: - for name, pip, sip in what[constants.NV_NODENETTEST]: - fail = [] - if not netutils.TcpPing(pip, port, source=my_pip): - fail.append("primary") - if sip != pip: - if not netutils.TcpPing(sip, port, source=my_sip): - fail.append("secondary") - if fail: - tmp[name] = ("failure using the %s interface(s)" % - " and ".join(fail)) + result[constants.NV_NODENETTEST] = VerifyNodeNetTest( + my_name, what[constants.NV_NODENETTEST]) if constants.NV_MASTERIP in what: - # FIXME: add checks on incoming data structures (here and in the - # rest of the function) - master_name, master_ip = what[constants.NV_MASTERIP] - if master_name == my_name: - source = constants.IP4_ADDRESS_LOCALHOST - else: - source = None - result[constants.NV_MASTERIP] = netutils.TcpPing(master_ip, port, - source=source) + result[constants.NV_MASTERIP] = VerifyMasterIP( + my_name, what[constants.NV_MASTERIP]) if constants.NV_USERSCRIPTS in what: result[constants.NV_USERSCRIPTS] = \ diff --git a/lib/cmdlib/cluster/verify.py b/lib/cmdlib/cluster/verify.py index 4ef5a2e..868f7a6 100644 --- a/lib/cmdlib/cluster/verify.py +++ b/lib/cmdlib/cluster/verify.py @@ -139,7 +139,7 @@ class _VerifyErrors(object): log_type, object_type, object_name, msg)) # Report messages via the feedback_fn - self._feedback_fn(constants.ELOG_MESSAGE_LIST, prefixed_list) # pylint: disable=E1101,C0302 + self._feedback_fn(constants.ELOG_MESSAGE_LIST, prefixed_list) # pylint: disable=E1101,C0301 # do not mark the operation as failed for WARN cases only if log_type == self.ETYPE_ERROR: @@ -748,27 +748,26 @@ class LUClusterVerifyGroup(LogicalUnit, _VerifyErrors): self._ErrorIf(True, constants.CV_ENODESSH, ninfo.name, "ssh communication with node '%s': %s", a_node, a_msg) - test = constants.NV_NODENETTEST not in nresult - self._ErrorIf(test, constants.CV_ENODENET, ninfo.name, - "node hasn't returned node tcp connectivity data") - if not test: - if nresult[constants.NV_NODENETTEST]: - nlist = utils.NiceSort(nresult[constants.NV_NODENETTEST].keys()) - for anode in nlist: - self._ErrorIf(True, constants.CV_ENODENET, ninfo.name, - "tcp communication with node '%s': %s", - anode, nresult[constants.NV_NODENETTEST][anode]) - - test = constants.NV_MASTERIP not in nresult - self._ErrorIf(test, constants.CV_ENODENET, ninfo.name, - "node hasn't returned node master IP reachability data") - if not test: - if not nresult[constants.NV_MASTERIP]: - if ninfo.uuid == self.master_node: - msg = "the master node cannot reach the master IP (not configured?)" - else: - msg = "cannot reach the master IP" - self._ErrorIf(True, constants.CV_ENODENET, ninfo.name, msg) + if constants.NV_NODENETTEST not in nresult: + self._ErrorMsg(constants.CV_ENODENET, ninfo.name, + "node hasn't returned node tcp connectivity data") + elif nresult[constants.NV_NODENETTEST]: + nlist = utils.NiceSort(nresult[constants.NV_NODENETTEST].keys()) + msglist = [] + for node in nlist: + msglist.append("tcp communication with node '%s': %s" % + (node, nresult[constants.NV_NODENETTEST][node])) + self._ErrorMsgList(constants.CV_ENODENET, ninfo.name, msglist) + + if constants.NV_MASTERIP not in nresult: + self._ErrorMsg(constants.CV_ENODENET, ninfo.name, + "node hasn't returned node master IP reachability data") + elif nresult[constants.NV_MASTERIP] == False: # be explicit, could be None + if ninfo.uuid == self.master_node: + msg = "the master node cannot reach the master IP (not configured?)" + else: + msg = "cannot reach the master IP" + self._ErrorMsg(constants.CV_ENODENET, ninfo.name, msg) def _VerifyInstance(self, instance, node_image, diskstatus): """Verify an instance. @@ -1874,12 +1873,20 @@ class LUClusterVerifyGroup(LogicalUnit, _VerifyErrors): master_node_uuid = self.master_node = self.cfg.GetMasterNode() master_ip = self.cfg.GetMasterIP() + online_master_candidates = sorted( + node.name for node in node_data_list + if (node.master_candidate and not node.offline)) + feedback_fn("* Gathering data (%d nodes)" % len(self.my_node_uuids)) user_scripts = [] if self.cfg.GetUseExternalMipScript(): user_scripts.append(pathutils.EXTERNAL_MASTER_SETUP_SCRIPT) + online_nodes = [(node.name, node.primary_ip, node.secondary_ip) + for node in node_data_list if not node.offline] + node_nettest_params = (online_nodes, online_master_candidates) + node_verify_param = { constants.NV_FILELIST: map(vcluster.MakeVirtualPath, @@ -1892,15 +1899,14 @@ class LUClusterVerifyGroup(LogicalUnit, _VerifyErrors): constants.NV_HYPERVISOR: hypervisors, constants.NV_HVPARAMS: _GetAllHypervisorParameters(cluster, self.all_inst_info.values()), - constants.NV_NODENETTEST: [(node.name, node.primary_ip, node.secondary_ip) - for node in node_data_list - if not node.offline], + constants.NV_NODENETTEST: node_nettest_params, constants.NV_INSTANCELIST: hypervisors, constants.NV_VERSION: None, constants.NV_HVINFO: self.cfg.GetHypervisorType(), constants.NV_NODESETUP: None, constants.NV_TIME: None, - constants.NV_MASTERIP: (self.cfg.GetMasterNodeName(), master_ip), + constants.NV_MASTERIP: (self.cfg.GetMasterNodeName(), master_ip, + online_master_candidates), constants.NV_OSLIST: None, constants.NV_NONVMNODES: self.cfg.GetNonVmCapableNodeNameList(), constants.NV_USERSCRIPTS: user_scripts, @@ -2029,6 +2035,7 @@ class LUClusterVerifyGroup(LogicalUnit, _VerifyErrors): # locking the configuration for something but very fast, pure operations. cluster_name = self.cfg.GetClusterName() hvparams = self.cfg.GetClusterInfo().hvparams + all_nvinfo = self.rpc.call_node_verify(self.my_node_uuids, node_verify_param, cluster_name, diff --git a/test/py/ganeti.backend_unittest.py b/test/py/ganeti.backend_unittest.py index e1a599c..897fcba 100755 --- a/test/py/ganeti.backend_unittest.py +++ b/test/py/ganeti.backend_unittest.py @@ -133,18 +133,31 @@ class TestNodeVerify(testutils.GanetiTestCase): def testMasterIPLocalhost(self): # this a real functional test, but requires localhost to be reachable + my_name = netutils.Hostname.GetSysName() + local_data = (my_name,constants.IP4_ADDRESS_LOCALHOST, [my_name]) + result = backend.VerifyNode({constants.NV_MASTERIP: local_data}, + None, {}) + self.failUnless(constants.NV_MASTERIP in result, + "Master IP data not returned") + self.failUnless(result[constants.NV_MASTERIP], + "Cannot reach localhost") + + def testMasterIPSkipTest(self): + # this a real functional test, but requires localhost to be reachable local_data = (netutils.Hostname.GetSysName(), - constants.IP4_ADDRESS_LOCALHOST) + constants.IP4_ADDRESS_LOCALHOST, []) result = backend.VerifyNode({constants.NV_MASTERIP: local_data}, None, {}) self.failUnless(constants.NV_MASTERIP in result, "Master IP data not returned") - self.failUnless(result[constants.NV_MASTERIP], "Cannot reach localhost") + self.failUnless(result[constants.NV_MASTERIP] == None, + "Test ran by non master candidate") def testMasterIPUnreachable(self): # Network 192.0.2.0/24 is reserved for test/documentation as per # RFC 5737 - bad_data = ("master.example.com", "192.0.2.1") + my_name = "master.example.com" + bad_data = (my_name, "192.0.2.1", [my_name]) # we just test that whatever TcpPing returns, VerifyNode returns too netutils.TcpPing = lambda a, b, source=None: False result = backend.VerifyNode({constants.NV_MASTERIP: bad_data}, @@ -154,6 +167,40 @@ class TestNodeVerify(testutils.GanetiTestCase): self.failIf(result[constants.NV_MASTERIP], "Result from netutils.TcpPing corrupted") + def testVerifyNodeNetTestMissingSelf(self): + my_name = netutils.Hostname.GetSysName() + local_data = ([('n1.test.com', "any", "any")], [my_name]) + result = backend.VerifyNode({constants.NV_NODENETTEST: local_data}, + None, {}) + + self.failUnless(constants.NV_NODENETTEST in result, + "NodeNetTest data not returned") + self.failUnless(my_name in result[constants.NV_NODENETTEST], + "Missing failure in net test") + + def testVerifyNodeNetTest(self): + my_name = netutils.Hostname.GetSysName() + local_data = ([(my_name, "any", "any")], [my_name]) + + # we just test that whatever TcpPing returns, VerifyNode returns too + netutils.TcpPing = lambda a, b, source=None: True + result = backend.VerifyNode({constants.NV_NODENETTEST: local_data}, + None, {}) + + self.failUnless(constants.NV_NODENETTEST in result, + "NodeNetTest data not returned") + self.failUnless(result[constants.NV_NODENETTEST] == {}, + "NodeNetTest failed") + + def testVerifyNodeNetSkipTest(self): + local_data = ([('n1.test.com', "any", "any")], []) + result = backend.VerifyNode({constants.NV_NODENETTEST: local_data}, + None, {}) + self.failUnless(constants.NV_NODENETTEST in result, + "NodeNetTest data not returned") + self.failUnless(result[constants.NV_NODENETTEST] == {}, + "Test ran by non master candidate") + def testVerifyHvparams(self): test_hvparams = {constants.HV_XEN_CMD: constants.XEN_CMD_XL} test_what = {constants.NV_HVPARAMS: \ -- 2.8.0.rc3.226.g39d4020
