There was only one memory check during cluster verification: the N+1 consistency check. It is modified to take memory over-commitment into account. In addition, hv_state is used instead of mem_dom0 in cluster verify. The corresponding test mock object is updated accordingly.
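
For illustration only, a tiny standalone sketch of the new check with made-up numbers (the variable names mirror the patch, but none of the values come from a real cluster):

  # all values in MiB; mtotal/mfree as reported by the hypervisor, mnode is
  # the dom0 (or hv_state mem_node) reservation, memory_ratio is the group's
  # IPOLICY_MEMORY_RATIO
  mtotal, mnode, mfree = 16384, 1024, 2048
  needed_mem = 4096      # sum of BE_MINMEM of auto-balanced secondary instances
  memory_ratio = 1.0

  mem_treshold = (mtotal - mnode) * (memory_ratio - 1)
  n1_failure = mfree - needed_mem < mem_treshold
  # with memory_ratio == 1 the threshold is 0 and this reduces to the old
  # test "mfree < needed_mem"; other ratios shift the allowed margin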
Signed-off-by: Oleg Ponomarev <[email protected]>
---
 lib/cmdlib/cluster/verify.py       | 27 +++++++++++++++++++++++----
 test/py/cmdlib/cluster_unittest.py |  8 ++++++--
 2 files changed, 29 insertions(+), 6 deletions(-)

diff --git a/lib/cmdlib/cluster/verify.py b/lib/cmdlib/cluster/verify.py
index d2c8d85..3dcf487 100644
--- a/lib/cmdlib/cluster/verify.py
+++ b/lib/cmdlib/cluster/verify.py
@@ -335,6 +335,8 @@ class LUClusterVerifyGroup(LogicalUnit, _VerifyErrors):
     @ivar sbp: dictionary of {primary-node: list of instances} for all
         instances for which this node is secondary (config)
     @ivar mfree: free memory, as reported by hypervisor (runtime)
+    @ivar mtotal: total memory, as reported by hypervisor (runtime)
+    @ivar mdom0: domain0 memory, as reported by hypervisor (runtime)
     @ivar dfree: free disk, as reported by the node (runtime)
     @ivar offline: the offline status (config)
     @type rpc_fail: boolean
@@ -366,6 +368,8 @@ class LUClusterVerifyGroup(LogicalUnit, _VerifyErrors):
       self.sinst = []
       self.sbp = {}
       self.mfree = 0
+      self.mtotal = 0
+      self.mdom0 = 0
       self.dfree = 0
       self.offline = offline
       self.vm_capable = vm_capable
@@ -929,6 +933,10 @@ class LUClusterVerifyGroup(LogicalUnit, _VerifyErrors):
     """
     cluster_info = self.cfg.GetClusterInfo()
+    ipolicy = ganeti.masterd.instance.CalculateGroupIPolicy(cluster_info,
+                                                            self.group_info)
+    memory_ratio = ipolicy[constants.IPOLICY_MEMORY_RATIO]
+
     for node_uuid, n_img in node_image.items():
       # This code checks that every node which is now listed as
       # secondary has enough memory to host all instances it is
       # supposed to should a single other node in the cluster fail
@@ -938,8 +946,9 @@ class LUClusterVerifyGroup(LogicalUnit, _VerifyErrors):
       # WARNING: we currently take into account down instances as well
      # as up ones, considering that even if they're down someone
       # might want to start them even in the event of a node failure.
+      node_cfg = self.all_node_info[node_uuid]
       if n_img.offline or \
-         self.all_node_info[node_uuid].group != self.group_uuid:
+         node_cfg.group != self.group_uuid:
         # we're skipping nodes marked offline and nodes in other groups from
         # the N+1 warning, since most likely we don't have good memory
         # information from them; we already list instances living on such
@@ -952,7 +961,13 @@ class LUClusterVerifyGroup(LogicalUnit, _VerifyErrors):
           bep = cluster_info.FillBE(all_insts[inst_uuid])
           if bep[constants.BE_AUTO_BALANCE]:
             needed_mem += bep[constants.BE_MINMEM]
-        test = n_img.mfree < needed_mem
+        mnode = n_img.mdom0
+        (hv, hv_state) = self.cfg.GetFilledHvStateParams(node_cfg).items()[0]
+        if hv != constants.HT_XEN_PVM and hv != constants.HT_XEN_HVM:
+          mnode = hv_state["mem_node"]
+        # minimum allowed free memory (it's negative due to over-commitment)
+        mem_treshold = (n_img.mtotal - mnode) * (memory_ratio - 1)
+        test = n_img.mfree - needed_mem < mem_treshold
         self._ErrorIf(test, constants.CV_ENODEN1,
                       self.cfg.GetNodeName(node_uuid),
                       "not enough memory to accomodate instance failovers"
@@ -1540,12 +1555,16 @@ class LUClusterVerifyGroup(LogicalUnit, _VerifyErrors):
     """
     # try to read free memory (from the hypervisor)
     hv_info = nresult.get(constants.NV_HVINFO, None)
-    test = not isinstance(hv_info, dict) or "memory_free" not in hv_info
+    test = not isinstance(hv_info, dict) or "memory_free" not in hv_info \
+           or "memory_total" not in hv_info \
+           or "memory_dom0" not in hv_info
     self._ErrorIf(test, constants.CV_ENODEHV, ninfo.name,
                   "rpc call to node failed (hvinfo)")
     if not test:
       try:
-        nimg.mfree = int(hv_info["memory_free"])
+        nimg.mfree = int(hv_info["memory_free"])
+        nimg.mtotal = int(hv_info["memory_total"])
+        nimg.mdom0 = int(hv_info["memory_dom0"])
       except (ValueError, TypeError):
         self._ErrorIf(True, constants.CV_ENODERPC, ninfo.name,
                       "node returned invalid nodeinfo, check hypervisor")
diff --git a/test/py/cmdlib/cluster_unittest.py b/test/py/cmdlib/cluster_unittest.py
index 90099a0..24c1ca9 100644
--- a/test/py/cmdlib/cluster_unittest.py
+++ b/test/py/cmdlib/cluster_unittest.py
@@ -2136,7 +2136,9 @@ class TestLUClusterVerifyGroupUpdateNodeInfo(TestLUClusterVerifyGroupMethods):
   def setUp(self):
     super(TestLUClusterVerifyGroupUpdateNodeInfo, self).setUp()
     self.nimg = verify.LUClusterVerifyGroup.NodeImage(uuid=self.master_uuid)
-    self.valid_hvresult = {constants.NV_HVINFO: {"memory_free": 1024}}
+    self.valid_hvresult = {constants.NV_HVINFO: {"memory_free": 1024,
+                                                 "memory_total": 4096,
+                                                 "memory_dom0": 3072}}
 
   @withLockedLU
   def testInvalidHvNodeResult(self, lu):
@@ -2148,7 +2150,9 @@ class TestLUClusterVerifyGroupUpdateNodeInfo(TestLUClusterVerifyGroupMethods):
   @withLockedLU
   def testInvalidMemoryFreeHvNodeResult(self, lu):
     lu._UpdateNodeInfo(self.master,
-                       {constants.NV_HVINFO: {"memory_free": "abc"}},
+                       {constants.NV_HVINFO: {"memory_free": 'abc',
+                                              "memory_total": 1024,
+                                              "memory_dom0": 2048}},
                        self.nimg, None)
     self.mcpu.assertLogContainsRegex(
       "node returned invalid nodeinfo, check hypervisor")
--
2.6.0.rc2.230.g3dd15c0
