From: Johannes Weiner <han...@cmpxchg.org>

Dave Hansen reports a massive scalability regression in an uncontained
page fault benchmark with more than 30 concurrent threads, which he
bisected down to 05b843012335 ("mm: memcontrol: use root_mem_cgroup
res_counter") and pinpointed to res_counter spinlock contention.

That change relied on the per-cpu charge caches to mostly swallow the
res_counter costs, but it's apparent that the caches don't scale yet.

Revert memcg back to bypassing res_counters on the root level in order
to restore performance for uncontained workloads.
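
For context, the batching scheme that was expected to absorb the
res_counter cost works roughly like the following. This is a minimal
userspace sketch of the idea only, not the kernel code: charge_pages,
stock_pages, CHARGE_BATCH and the pthread mutex are illustrative
stand-ins for the per-cpu stock and the res_counter spinlock.

	#include <pthread.h>
	#include <stdint.h>

	#define PAGE_SIZE    4096UL
	#define CHARGE_BATCH 32UL  /* pages taken from the shared counter at once */

	static pthread_mutex_t counter_lock = PTHREAD_MUTEX_INITIALIZER;
	static uint64_t charged_bytes;              /* shared counter usage */

	static _Thread_local unsigned long stock_pages; /* per-thread cache */

	static void charge_pages(unsigned long nr_pages)
	{
		/* fast path: consume the local cache, no shared lock taken */
		if (stock_pages >= nr_pages) {
			stock_pages -= nr_pages;
			return;
		}
		/* slow path: refill from the shared counter
		 * (sketch assumes nr_pages <= CHARGE_BATCH) */
		pthread_mutex_lock(&counter_lock);
		charged_bytes += CHARGE_BATCH * PAGE_SIZE;
		pthread_mutex_unlock(&counter_lock);
		stock_pages += CHARGE_BATCH - nr_pages;
	}

The fast path avoids the shared lock entirely, but every refill still
serializes on the one counter, which is where the contention shows up
once enough threads fault concurrently.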

Reported-by: Dave Hansen <d...@sr71.net>
Signed-off-by: Johannes Weiner <han...@cmpxchg.org>
Tested-by: Dave Hansen <dave.han...@intel.com>
Acked-by: Michal Hocko <mho...@suse.cz>
Acked-by: Vladimir Davydov <vdavy...@parallels.com>
Signed-off-by: Linus Torvalds <torva...@linux-foundation.org>

https://jira.sw.ru/browse/PSBM-51558
(cherry picked from commit ce00a967377baadf2481521e131771adc7652856)
Signed-off-by: Andrey Ryabinin <aryabi...@virtuozzo.com>
---
 mm/memcontrol.c | 82 ++++++++++++++++++++++++++++++---------------------------
 1 file changed, 43 insertions(+), 39 deletions(-)

diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 11c06d3..cb7657e 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -2814,13 +2814,14 @@ done:
 
 static void cancel_charge(struct mem_cgroup *memcg, unsigned int nr_pages)
 {
-       if (!mem_cgroup_is_root(memcg)) {
-               unsigned long bytes = nr_pages * PAGE_SIZE;
+       unsigned long bytes = nr_pages * PAGE_SIZE;
 
-               res_counter_uncharge(&memcg->res, bytes);
-               if (do_swap_account)
-                       res_counter_uncharge(&memcg->memsw, bytes);
-       }
+       if (mem_cgroup_is_root(memcg))
+               return;
+
+       res_counter_uncharge(&memcg->res, bytes);
+       if (do_swap_account)
+               res_counter_uncharge(&memcg->memsw, bytes);
 }
 
 /*
@@ -4183,7 +4184,6 @@ out:
        return retval;
 }
 
-
 static unsigned long mem_cgroup_recursive_stat(struct mem_cgroup *memcg,
                                               enum mem_cgroup_stat_index idx)
 {
@@ -4199,6 +4199,30 @@ static unsigned long mem_cgroup_recursive_stat(struct mem_cgroup *memcg,
        return val;
 }
 
+static inline u64 mem_cgroup_usage(struct mem_cgroup *memcg, bool swap)
+{
+       u64 val;
+
+       if (!mem_cgroup_is_root(memcg)) {
+               if (!swap)
+                       return res_counter_read_u64(&memcg->res, RES_USAGE);
+               else
+                       return res_counter_read_u64(&memcg->memsw, RES_USAGE);
+       }
+
+       /*
+        * Transparent hugepages are still accounted for in MEM_CGROUP_STAT_RSS
+        * as well as in MEM_CGROUP_STAT_RSS_HUGE.
+        */
+       val = mem_cgroup_recursive_stat(memcg, MEM_CGROUP_STAT_CACHE);
+       val += mem_cgroup_recursive_stat(memcg, MEM_CGROUP_STAT_RSS);
+
+       if (swap)
+               val += mem_cgroup_recursive_stat(memcg, MEM_CGROUP_STAT_SWAP);
+
+       return val << PAGE_SHIFT;
+}
+
 void mem_cgroup_fill_meminfo(struct mem_cgroup *memcg, struct meminfo *mi)
 {
        int nid;
@@ -4236,30 +4260,6 @@ int mem_cgroup_enough_memory(struct mem_cgroup *memcg, long pages)
        return free < pages ? -ENOMEM : 0;
 }
 
-static inline u64 mem_cgroup_usage(struct mem_cgroup *memcg, bool swap)
-{
-       u64 val;
-
-       if (!mem_cgroup_is_root(memcg)) {
-               if (!swap)
-                       return res_counter_read_u64(&memcg->res, RES_USAGE);
-               else
-                       return res_counter_read_u64(&memcg->memsw, RES_USAGE);
-       }
-
-       /*
-        * Transparent hugepages are still accounted for in MEM_CGROUP_STAT_RSS
-        * as well as in MEM_CGROUP_STAT_RSS_HUGE.
-        */
-       val = mem_cgroup_recursive_stat(memcg, MEM_CGROUP_STAT_CACHE);
-       val += mem_cgroup_recursive_stat(memcg, MEM_CGROUP_STAT_RSS);
-
-       if (swap)
-               val += mem_cgroup_recursive_stat(memcg, MEM_CGROUP_STAT_SWAP);
-
-       return val << PAGE_SHIFT;
-}
-
 static ssize_t mem_cgroup_read(struct cgroup *cont, struct cftype *cft,
                               struct file *file, char __user *buf,
                               size_t nbytes, loff_t *ppos)
@@ -6429,7 +6429,7 @@ static void __mem_cgroup_clear_mc(void)
                /* uncharge swap account from the old cgroup */
                if (!mem_cgroup_is_root(mc.from))
                        res_counter_uncharge(&mc.from->memsw,
-                                               PAGE_SIZE * mc.moved_swap);
+                                            PAGE_SIZE * mc.moved_swap);
 
                for (i = 0; i < mc.moved_swap; i++)
                        css_put(&mc.from->css);
@@ -6811,7 +6811,8 @@ void mem_cgroup_uncharge_swap(swp_entry_t entry)
        rcu_read_lock();
        memcg = mem_cgroup_lookup(id);
        if (memcg) {
-               res_counter_uncharge(&memcg->memsw, PAGE_SIZE);
+               if (!mem_cgroup_is_root(memcg))
+                       res_counter_uncharge(&memcg->memsw, PAGE_SIZE);
                mem_cgroup_swap_statistics(memcg, false);
                css_put(&memcg->css);
        }
@@ -6970,12 +6971,15 @@ static void uncharge_batch(struct mem_cgroup *memcg, unsigned long pgpgout,
 {
        unsigned long flags;
 
-       if (nr_mem)
-               res_counter_uncharge(&memcg->res, nr_mem * PAGE_SIZE);
-       if (nr_memsw)
-               res_counter_uncharge(&memcg->memsw, nr_memsw * PAGE_SIZE);
-
-       memcg_oom_recover(memcg);
+       if (!mem_cgroup_is_root(memcg)) {
+               if (nr_mem)
+                       res_counter_uncharge(&memcg->res,
+                                            nr_mem * PAGE_SIZE);
+               if (nr_memsw)
+                       res_counter_uncharge(&memcg->memsw,
+                                            nr_memsw * PAGE_SIZE);
+               memcg_oom_recover(memcg);
+       }
 
        local_irq_save(flags);
        __this_cpu_sub(memcg->stat->count[MEM_CGROUP_STAT_RSS], nr_anon);
-- 
2.7.3

_______________________________________________
Devel mailing list
Devel@openvz.org
https://lists.openvz.org/mailman/listinfo/devel
