From: Johannes Weiner <han...@cmpxchg.org>

Dave Hansen reports a massive scalability regression in an uncontained
page fault benchmark with more than 30 concurrent threads, which he
bisected down to 05b843012335 ("mm: memcontrol: use root_mem_cgroup
res_counter") and pin-pointed on res_counter spinlock contention.

That change relied on the per-cpu charge caches to mostly swallow the
res_counter costs, but it's apparent that the caches don't scale yet.
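[ Editor's illustration: the "per-cpu charge caches" above batch charges
  so that the shared counter lock is taken once per batch instead of once
  per page.  What follows is a minimal userspace sketch of that idea, not
  the kernel code (the real cache is the consume_stock()/refill_stock()
  machinery in mm/memcontrol.c); the constants and names are invented for
  the example.  Note that every cache refill still hits the same lock,
  which is why contention reappears once enough threads fault
  concurrently. ]

#include <pthread.h>
#include <stdio.h>

#define CHARGE_BATCH	32		/* pages pre-charged per lock round-trip */
#define NR_THREADS	32
#define NR_PAGES	100000

static pthread_mutex_t counter_lock = PTHREAD_MUTEX_INITIALIZER;
static unsigned long charged_pages;	/* shared, contended counter */

static __thread unsigned long stock;	/* per-thread charge cache */

/* Charge one page; touch the shared lock only when the cache runs dry. */
static void charge_page(void)
{
	if (stock == 0) {
		pthread_mutex_lock(&counter_lock);
		charged_pages += CHARGE_BATCH;
		pthread_mutex_unlock(&counter_lock);
		stock = CHARGE_BATCH;
	}
	stock--;
}

static void *worker(void *arg)
{
	(void)arg;
	for (int i = 0; i < NR_PAGES; i++)
		charge_page();
	return NULL;
}

int main(void)
{
	pthread_t threads[NR_THREADS];

	for (int i = 0; i < NR_THREADS; i++)
		pthread_create(&threads[i], NULL, worker, NULL);
	for (int i = 0; i < NR_THREADS; i++)
		pthread_join(threads[i], NULL);

	/* NR_PAGES divides evenly by CHARGE_BATCH, so this prints 3200000 */
	printf("charged %lu pages\n", charged_pages);
	return 0;
}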
Revert memcg back to bypassing res_counters on the root level in order
to restore performance for uncontained workloads.

Reported-by: Dave Hansen <d...@sr71.net>
Signed-off-by: Johannes Weiner <han...@cmpxchg.org>
Tested-by: Dave Hansen <dave.han...@intel.com>
Acked-by: Michal Hocko <mho...@suse.cz>
Acked-by: Vladimir Davydov <vdavy...@parallels.com>
Signed-off-by: Linus Torvalds <torva...@linux-foundation.org>

https://jira.sw.ru/browse/PSBM-51558
(cherry picked from commit ce00a967377baadf2481521e131771adc7652856)
Signed-off-by: Andrey Ryabinin <aryabi...@virtuozzo.com>
---
 mm/memcontrol.c | 82 ++++++++++++++++++++++++++++++---------------------------
 1 file changed, 43 insertions(+), 39 deletions(-)

diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 11c06d3..cb7657e 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -2814,13 +2814,14 @@ done:
 
 static void cancel_charge(struct mem_cgroup *memcg, unsigned int nr_pages)
 {
-	if (!mem_cgroup_is_root(memcg)) {
-		unsigned long bytes = nr_pages * PAGE_SIZE;
+	unsigned long bytes = nr_pages * PAGE_SIZE;
 
-		res_counter_uncharge(&memcg->res, bytes);
-		if (do_swap_account)
-			res_counter_uncharge(&memcg->memsw, bytes);
-	}
+	if (mem_cgroup_is_root(memcg))
+		return;
+
+	res_counter_uncharge(&memcg->res, bytes);
+	if (do_swap_account)
+		res_counter_uncharge(&memcg->memsw, bytes);
 }
 
 /*
@@ -4183,7 +4184,6 @@ out:
 	return retval;
 }
 
-
 static unsigned long mem_cgroup_recursive_stat(struct mem_cgroup *memcg,
 					       enum mem_cgroup_stat_index idx)
 {
@@ -4199,6 +4199,30 @@ static unsigned long mem_cgroup_recursive_stat(struct mem_cgroup *memcg,
 	return val;
 }
 
+static inline u64 mem_cgroup_usage(struct mem_cgroup *memcg, bool swap)
+{
+	u64 val;
+
+	if (!mem_cgroup_is_root(memcg)) {
+		if (!swap)
+			return res_counter_read_u64(&memcg->res, RES_USAGE);
+		else
+			return res_counter_read_u64(&memcg->memsw, RES_USAGE);
+	}
+
+	/*
+	 * Transparent hugepages are still accounted for in MEM_CGROUP_STAT_RSS
+	 * as well as in MEM_CGROUP_STAT_RSS_HUGE.
+	 */
+	val = mem_cgroup_recursive_stat(memcg, MEM_CGROUP_STAT_CACHE);
+	val += mem_cgroup_recursive_stat(memcg, MEM_CGROUP_STAT_RSS);
+
+	if (swap)
+		val += mem_cgroup_recursive_stat(memcg, MEM_CGROUP_STAT_SWAP);
+
+	return val << PAGE_SHIFT;
+}
+
 void mem_cgroup_fill_meminfo(struct mem_cgroup *memcg, struct meminfo *mi)
 {
 	int nid;
@@ -4236,30 +4260,6 @@ int mem_cgroup_enough_memory(struct mem_cgroup *memcg, long pages)
 	return free < pages ? -ENOMEM : 0;
 }
 
-static inline u64 mem_cgroup_usage(struct mem_cgroup *memcg, bool swap)
-{
-	u64 val;
-
-	if (!mem_cgroup_is_root(memcg)) {
-		if (!swap)
-			return res_counter_read_u64(&memcg->res, RES_USAGE);
-		else
-			return res_counter_read_u64(&memcg->memsw, RES_USAGE);
-	}
-
-	/*
-	 * Transparent hugepages are still accounted for in MEM_CGROUP_STAT_RSS
-	 * as well as in MEM_CGROUP_STAT_RSS_HUGE.
-	 */
-	val = mem_cgroup_recursive_stat(memcg, MEM_CGROUP_STAT_CACHE);
-	val += mem_cgroup_recursive_stat(memcg, MEM_CGROUP_STAT_RSS);
-
-	if (swap)
-		val += mem_cgroup_recursive_stat(memcg, MEM_CGROUP_STAT_SWAP);
-
-	return val << PAGE_SHIFT;
-}
-
 static ssize_t mem_cgroup_read(struct cgroup *cont, struct cftype *cft,
 			       struct file *file, char __user *buf,
 			       size_t nbytes, loff_t *ppos)
@@ -6429,7 +6429,7 @@ static void __mem_cgroup_clear_mc(void)
 		/* uncharge swap account from the old cgroup */
 		if (!mem_cgroup_is_root(mc.from))
 			res_counter_uncharge(&mc.from->memsw,
-						PAGE_SIZE * mc.moved_swap);
+					     PAGE_SIZE * mc.moved_swap);
 
 		for (i = 0; i < mc.moved_swap; i++)
 			css_put(&mc.from->css);
@@ -6811,7 +6811,8 @@ void mem_cgroup_uncharge_swap(swp_entry_t entry)
 	rcu_read_lock();
 	memcg = mem_cgroup_lookup(id);
 	if (memcg) {
-		res_counter_uncharge(&memcg->memsw, PAGE_SIZE);
+		if (!mem_cgroup_is_root(memcg))
+			res_counter_uncharge(&memcg->memsw, PAGE_SIZE);
 		mem_cgroup_swap_statistics(memcg, false);
 		css_put(&memcg->css);
 	}
@@ -6970,12 +6971,15 @@ static void uncharge_batch(struct mem_cgroup *memcg, unsigned long pgpgout,
 {
 	unsigned long flags;
 
-	if (nr_mem)
-		res_counter_uncharge(&memcg->res, nr_mem * PAGE_SIZE);
-	if (nr_memsw)
-		res_counter_uncharge(&memcg->memsw, nr_memsw * PAGE_SIZE);
-
-	memcg_oom_recover(memcg);
+	if (!mem_cgroup_is_root(memcg)) {
+		if (nr_mem)
+			res_counter_uncharge(&memcg->res,
+					     nr_mem * PAGE_SIZE);
+		if (nr_memsw)
+			res_counter_uncharge(&memcg->memsw,
+					     nr_memsw * PAGE_SIZE);
+		memcg_oom_recover(memcg);
+	}
 
 	local_irq_save(flags);
 	__this_cpu_sub(memcg->stat->count[MEM_CGROUP_STAT_RSS], nr_anon);
-- 
2.7.3
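[ Editor's illustration: with the res_counter bypass restored, root-level
  usage reads are assembled from the recursively summed per-page
  statistics and converted to bytes, as in the reinstated
  mem_cgroup_usage() above.  Below is a standalone sketch of just that
  arithmetic; the stat values are invented for the example. ]

#include <stdio.h>
#include <stdint.h>

#define PAGE_SHIFT 12	/* 4 KiB pages, as on x86 */

/*
 * Mirrors the root-memcg branch of mem_cgroup_usage(): sum the
 * aggregated page counts (cache + rss, plus swap for memsw) and
 * shift by PAGE_SHIFT to convert pages to bytes.
 */
static uint64_t root_usage(uint64_t cache, uint64_t rss,
			   uint64_t swap_pages, int swap)
{
	uint64_t val = cache + rss;

	if (swap)
		val += swap_pages;

	return val << PAGE_SHIFT;
}

int main(void)
{
	/* hypothetical recursive stats: 1000 cache, 500 rss, 20 swap pages */
	printf("mem usage:   %llu bytes\n",
	       (unsigned long long)root_usage(1000, 500, 20, 0));
	printf("memsw usage: %llu bytes\n",
	       (unsigned long long)root_usage(1000, 500, 20, 1));
	return 0;
}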