There was only one memory check during cluster verification: the N+1 consistency check. It is modified to take memory over-commitment into account. In addition, hv_state is used instead of mem_dom0 in cluster verify. The corresponding test mock object is updated accordingly.
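
For illustration only, a tiny standalone sketch of the new check with made-up numbers (the variable names mirror the patch, but none of the values come from a real cluster):

  # all values in MiB; mtotal/mfree as reported by the hypervisor, mnode is
  # the dom0 (or hv_state mem_node) reservation, memory_ratio is the group's
  # IPOLICY_MEMORY_RATIO
  mtotal, mnode, mfree = 16384, 1024, 2048
  needed_mem = 4096      # sum of BE_MINMEM of auto-balanced secondary instances
  memory_ratio = 1.0

  mem_treshold = (mtotal - mnode) * (memory_ratio - 1)
  n1_failure = mfree - needed_mem < mem_treshold
  # with memory_ratio == 1 the threshold is 0 and this reduces to the old
  # test "mfree < needed_mem"; other ratios shift the allowed margin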
Signed-off-by: Oleg Ponomarev <[email protected]>
---
 lib/cmdlib/cluster/verify.py       | 27 +++++++++++++++++++++++----
 test/py/cmdlib/cluster_unittest.py |  8 ++++++--
 2 files changed, 29 insertions(+), 6 deletions(-)

diff --git a/lib/cmdlib/cluster/verify.py b/lib/cmdlib/cluster/verify.py
index d2c8d85..3dcf487 100644
--- a/lib/cmdlib/cluster/verify.py
+++ b/lib/cmdlib/cluster/verify.py
@@ -335,6 +335,8 @@ class LUClusterVerifyGroup(LogicalUnit, _VerifyErrors):
     @ivar sbp: dictionary of {primary-node: list of instances} for all
         instances for which this node is secondary (config)
     @ivar mfree: free memory, as reported by hypervisor (runtime)
+    @ivar mtotal: total memory, as reported by hypervisor (runtime)
+    @ivar mdom0: domain0 memory, as reported by hypervisor (runtime)
     @ivar dfree: free disk, as reported by the node (runtime)
     @ivar offline: the offline status (config)
     @type rpc_fail: boolean
@@ -366,6 +368,8 @@ class LUClusterVerifyGroup(LogicalUnit, _VerifyErrors):
       self.sinst = []
       self.sbp = {}
       self.mfree = 0
+      self.mtotal = 0
+      self.mdom0 = 0
       self.dfree = 0
       self.offline = offline
       self.vm_capable = vm_capable
@@ -929,6 +933,10 @@ class LUClusterVerifyGroup(LogicalUnit, _VerifyErrors):
     """
     cluster_info = self.cfg.GetClusterInfo()
+    ipolicy = ganeti.masterd.instance.CalculateGroupIPolicy(cluster_info,
+                                                            self.group_info)
+    memory_ratio = ipolicy[constants.IPOLICY_MEMORY_RATIO]
+
     for node_uuid, n_img in node_image.items():
       # This code checks that every node which is now listed as
       # secondary has enough memory to host all instances it is
       # supposed to should a single other node in the cluster fail
@@ -938,8 +946,9 @@ class LUClusterVerifyGroup(LogicalUnit, _VerifyErrors):
       # WARNING: we currently take into account down instances as well
      # as up ones, considering that even if they're down someone
       # might want to start them even in the event of a node failure.
+      node_cfg = self.all_node_info[node_uuid]
       if n_img.offline or \
-         self.all_node_info[node_uuid].group != self.group_uuid:
+         node_cfg.group != self.group_uuid:
         # we're skipping nodes marked offline and nodes in other groups from
         # the N+1 warning, since most likely we don't have good memory
         # information from them; we already list instances living on such
@@ -952,7 +961,13 @@ class LUClusterVerifyGroup(LogicalUnit, _VerifyErrors):
           bep = cluster_info.FillBE(all_insts[inst_uuid])
           if bep[constants.BE_AUTO_BALANCE]:
             needed_mem += bep[constants.BE_MINMEM]
-        test = n_img.mfree < needed_mem
+        mnode = n_img.mdom0
+        (hv, hv_state) = self.cfg.GetFilledHvStateParams(node_cfg).items()[0]
+        if hv != constants.HT_XEN_PVM and hv != constants.HT_XEN_HVM:
+          mnode = hv_state["mem_node"]
+        # minimum allowed free memory (it's negative due to over-commitment)
+        mem_treshold = (n_img.mtotal - mnode) * (memory_ratio - 1)
+        test = n_img.mfree - needed_mem < mem_treshold
         self._ErrorIf(test, constants.CV_ENODEN1,
                       self.cfg.GetNodeName(node_uuid),
                       "not enough memory to accomodate instance failovers"
@@ -1540,12 +1555,16 @@ class LUClusterVerifyGroup(LogicalUnit, _VerifyErrors):
     """
     # try to read free memory (from the hypervisor)
     hv_info = nresult.get(constants.NV_HVINFO, None)
-    test = not isinstance(hv_info, dict) or "memory_free" not in hv_info
+    test = not isinstance(hv_info, dict) or "memory_free" not in hv_info \
+           or "memory_total" not in hv_info \
+           or "memory_dom0" not in hv_info
     self._ErrorIf(test, constants.CV_ENODEHV, ninfo.name,
                   "rpc call to node failed (hvinfo)")
     if not test:
       try:
-        nimg.mfree = int(hv_info["memory_free"])
+        nimg.mfree = int(hv_info["memory_free"])
+        nimg.mtotal = int(hv_info["memory_total"])
+        nimg.mdom0 = int(hv_info["memory_dom0"])
       except (ValueError, TypeError):
         self._ErrorIf(True, constants.CV_ENODERPC, ninfo.name,
                       "node returned invalid nodeinfo, check hypervisor")
diff --git a/test/py/cmdlib/cluster_unittest.py b/test/py/cmdlib/cluster_unittest.py
index 90099a0..24c1ca9 100644
--- a/test/py/cmdlib/cluster_unittest.py
+++ b/test/py/cmdlib/cluster_unittest.py
@@ -2136,7 +2136,9 @@ class TestLUClusterVerifyGroupUpdateNodeInfo(TestLUClusterVerifyGroupMethods):
   def setUp(self):
     super(TestLUClusterVerifyGroupUpdateNodeInfo, self).setUp()
     self.nimg = verify.LUClusterVerifyGroup.NodeImage(uuid=self.master_uuid)
-    self.valid_hvresult = {constants.NV_HVINFO: {"memory_free": 1024}}
+    self.valid_hvresult = {constants.NV_HVINFO: {"memory_free": 1024,
+                                                 "memory_total": 4096,
+                                                 "memory_dom0": 3072}}
 
   @withLockedLU
   def testInvalidHvNodeResult(self, lu):
@@ -2148,7 +2150,9 @@ class TestLUClusterVerifyGroupUpdateNodeInfo(TestLUClusterVerifyGroupMethods):
   @withLockedLU
   def testInvalidMemoryFreeHvNodeResult(self, lu):
     lu._UpdateNodeInfo(self.master,
-                       {constants.NV_HVINFO: {"memory_free": "abc"}},
+                       {constants.NV_HVINFO: {"memory_free": 'abc',
+                                              "memory_total": 1024,
+                                              "memory_dom0": 2048}},
                        self.nimg, None)
     self.mcpu.assertLogContainsRegex(
       "node returned invalid nodeinfo, check hypervisor")
--
2.6.0.rc2.230.g3dd15c0
