mem_cgroup_dcache_is_low() is called during memory reclaim for every
mem cgroup, but it's awfully slow: it iterates over every possible
cpu to collect the anon, file and reclaimable slab counters.

Switch the anon, file and reclaimable slab counters to percpu_counter.
This allows reading them with a single load instead of iterating
over all cpus.
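
To illustrate the change, the read path before and after looks roughly
like the sketch below. The helper names read_stat_sum() and
read_stat_single() are made up for the example; the real functions are
mem_cgroup_read_stat() and the new mem_cgroup_read_stat2() in the diff.

  /* Before (sketch): every read sums a per-cpu array, O(nr_cpu_ids). */
  static unsigned long read_stat_sum(struct mem_cgroup *memcg,
                                     enum mem_cgroup_stat_index idx)
  {
          long val = 0;
          int cpu;

          for_each_possible_cpu(cpu)
                  val += per_cpu_ptr(memcg->stat, cpu)->count[idx];
          return val < 0 ? 0 : val;
  }

  /* After (sketch): percpu_counter maintains an approximate global
   * value, so a read is a single load of that value. */
  static unsigned long read_stat_single(struct mem_cgroup *memcg,
                                        enum mem_cgroup_stat2_index idx)
  {
          return percpu_counter_read_positive(&memcg->stat2.counters[idx]);
  }

The trade-off is that writers now go through percpu_counter batching
(an occasional spinlock when the per-cpu delta exceeds the batch size),
in exchange for O(1) reads in mem_cgroup_dcache_is_low().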

https://jira.sw.ru/browse/PSBM-68644
Signed-off-by: Andrey Ryabinin <aryabi...@virtuozzo.com>
---
 mm/memcontrol.c | 108 +++++++++++++++++++++++++++++++++++++++++++-------------
 1 file changed, 84 insertions(+), 24 deletions(-)

diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 9678957bf22..11f9bc07e41 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -95,28 +95,38 @@ enum mem_cgroup_stat_index {
        /*
         * For MEM_CONTAINER_TYPE_ALL, usage = pagecache + rss.
         */
-       MEM_CGROUP_STAT_CACHE,          /* # of pages charged as cache */
-       MEM_CGROUP_STAT_RSS,            /* # of pages charged as anon rss */
        MEM_CGROUP_STAT_RSS_HUGE,       /* # of pages charged as anon huge */
        MEM_CGROUP_STAT_FILE_MAPPED,    /* # of pages charged as file rss */
        MEM_CGROUP_STAT_SHMEM,          /* # of charged shmem pages */
-       MEM_CGROUP_STAT_SLAB_RECLAIMABLE, /* # of reclaimable slab pages */
        MEM_CGROUP_STAT_SLAB_UNRECLAIMABLE, /* # of unreclaimable slab pages */
        MEM_CGROUP_STAT_SWAP,           /* # of pages, swapped out */
        MEM_CGROUP_STAT_NSTATS,
 };
 
+enum mem_cgroup_stat2_index {
+       /*
+        * For MEM_CONTAINER_TYPE_ALL, usage = pagecache + rss.
+        */
+       MEM_CGROUP_STAT_CACHE,          /* # of pages charged as cache */
+       MEM_CGROUP_STAT_RSS,            /* # of pages charged as anon rss */
+       MEM_CGROUP_STAT_SLAB_RECLAIMABLE, /* # of reclaimable slab pages */
+       MEM_CGROUP_STAT2_NSTATS,
+};
+
 static const char * const mem_cgroup_stat_names[] = {
-       "cache",
-       "rss",
        "rss_huge",
        "mapped_file",
        "shmem",
-       "slab_reclaimable",
        "slab_unreclaimable",
        "swap",
 };
 
+static const char * const mem_cgroup_stat2_names[] = {
+       "cache",
+       "rss",
+       "slab_reclaimable",
+};
+
 enum mem_cgroup_events_index {
        MEM_CGROUP_EVENTS_PGPGIN,       /* # of pages paged in */
        MEM_CGROUP_EVENTS_PGPGOUT,      /* # of pages paged out */
@@ -167,6 +177,10 @@ struct mem_cgroup_stat_cpu {
        unsigned long targets[MEM_CGROUP_NTARGETS];
 };
 
+struct mem_cgroup_stat2_cpu {
+       struct percpu_counter counters[MEM_CGROUP_STAT2_NSTATS];
+};
+
 struct mem_cgroup_reclaim_iter {
        /*
         * last scanned hierarchy member. Valid only if last_dead_count
@@ -368,6 +382,7 @@ struct mem_cgroup {
         * percpu counter.
         */
        struct mem_cgroup_stat_cpu __percpu *stat;
+       struct mem_cgroup_stat2_cpu stat2;
        spinlock_t pcp_counter_lock;
 
        atomic_t        dead_count;
@@ -956,6 +971,11 @@ mem_cgroup_read_stat(struct mem_cgroup *memcg, enum mem_cgroup_stat_index idx)
                val = 0;
        return val;
 }
+static inline unsigned long
+mem_cgroup_read_stat2(struct mem_cgroup *memcg, enum mem_cgroup_stat2_index idx)
+{
+       return percpu_counter_read_positive(&memcg->stat2.counters[idx]);
+}
 
 static void mem_cgroup_update_swap_max(struct mem_cgroup *memcg)
 {
@@ -1013,10 +1033,10 @@ static void mem_cgroup_charge_statistics(struct mem_cgroup *memcg,
         * counted as CACHE even if it's on ANON LRU.
         */
        if (PageAnon(page))
-               __this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_RSS],
+               percpu_counter_add(&memcg->stat2.counters[MEM_CGROUP_STAT_RSS],
                                nr_pages);
        else {
-               __this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_CACHE],
+               percpu_counter_add(&memcg->stat2.counters[MEM_CGROUP_STAT_CACHE],
                                nr_pages);
                if (PageSwapBacked(page))
                        __this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_SHMEM],
@@ -1593,9 +1613,9 @@ bool mem_cgroup_dcache_is_low(struct mem_cgroup *memcg)
        if (vfs_cache_min_ratio <= 0)
                return false;
 
-       anon = mem_cgroup_read_stat(memcg, MEM_CGROUP_STAT_RSS);
-       file = mem_cgroup_read_stat(memcg, MEM_CGROUP_STAT_CACHE);
-       dcache = mem_cgroup_read_stat(memcg, MEM_CGROUP_STAT_SLAB_RECLAIMABLE);
+       anon = mem_cgroup_read_stat2(memcg, MEM_CGROUP_STAT_RSS);
+       file = mem_cgroup_read_stat2(memcg, MEM_CGROUP_STAT_CACHE);
+       dcache = mem_cgroup_read_stat2(memcg, MEM_CGROUP_STAT_SLAB_RECLAIMABLE);
 
        return dcache / vfs_cache_min_ratio <
                        (anon + file + dcache) / 100;
@@ -1979,6 +1999,10 @@ done:
                        pr_cont(" %s:%luKB", mem_cgroup_stat_names[i],
                                K(mem_cgroup_read_stat(iter, i)));
                }
+               for (i = 0; i < MEM_CGROUP_STAT2_NSTATS; i++) {
+                       pr_cont(" %s:%luKB", mem_cgroup_stat2_names[i],
+                               K(mem_cgroup_read_stat2(iter, i)));
+               }
 
                for (i = 0; i < NR_LRU_LISTS; i++)
                        pr_cont(" %s:%luKB", mem_cgroup_lru_names[i],
@@ -3120,10 +3144,11 @@ int __memcg_charge_slab(struct kmem_cache *s, gfp_t gfp, unsigned int nr_pages)
        if (s->flags & SLAB_RECLAIM_ACCOUNT) {
                page_counter_charge(&memcg->dcache, nr_pages);
                idx = MEM_CGROUP_STAT_SLAB_RECLAIMABLE;
-       } else
+               percpu_counter_add(&memcg->stat2.counters[idx], nr_pages);
+       } else {
                idx = MEM_CGROUP_STAT_SLAB_UNRECLAIMABLE;
-
-       this_cpu_add(memcg->stat->count[idx], nr_pages);
+               this_cpu_add(memcg->stat->count[idx], nr_pages);
+       }
        return 0;
 }
 
@@ -3139,10 +3164,11 @@ void __memcg_uncharge_slab(struct kmem_cache *s, unsigned int nr_pages)
        if (s->flags & SLAB_RECLAIM_ACCOUNT) {
                page_counter_uncharge(&memcg->dcache, nr_pages);
                idx = MEM_CGROUP_STAT_SLAB_RECLAIMABLE;
-       } else
+               percpu_counter_sub(&memcg->stat2.counters[idx], nr_pages);
+       } else {
                idx = MEM_CGROUP_STAT_SLAB_UNRECLAIMABLE;
-
-       this_cpu_sub(memcg->stat->count[idx], nr_pages);
+               this_cpu_sub(memcg->stat->count[idx], nr_pages);
+       }
 }
 
 /*
@@ -4195,6 +4221,17 @@ static unsigned long mem_cgroup_recursive_stat(struct mem_cgroup *memcg,
 
        return val;
 }
+static unsigned long mem_cgroup_recursive_stat2(struct mem_cgroup *memcg,
+                                              enum mem_cgroup_stat2_index idx)
+{
+       struct mem_cgroup *iter;
+       unsigned long val = 0;
+
+       for_each_mem_cgroup_tree(iter, memcg)
+               val += mem_cgroup_read_stat2(iter, idx);
+
+       return val;
+}
 
 static inline u64 mem_cgroup_usage(struct mem_cgroup *memcg, bool swap)
 {
@@ -4211,8 +4248,8 @@ static inline u64 mem_cgroup_usage(struct mem_cgroup *memcg, bool swap)
         * Transparent hugepages are still accounted for in MEM_CGROUP_STAT_RSS
         * as well as in MEM_CGROUP_STAT_RSS_HUGE.
         */
-       val = mem_cgroup_recursive_stat(memcg, MEM_CGROUP_STAT_CACHE);
-       val += mem_cgroup_recursive_stat(memcg, MEM_CGROUP_STAT_RSS);
+       val = mem_cgroup_recursive_stat2(memcg, MEM_CGROUP_STAT_CACHE);
+       val += mem_cgroup_recursive_stat2(memcg, MEM_CGROUP_STAT_RSS);
 
        if (swap)
                val += mem_cgroup_recursive_stat(memcg, MEM_CGROUP_STAT_SWAP);
@@ -4228,11 +4265,11 @@ void mem_cgroup_fill_meminfo(struct mem_cgroup *memcg, struct meminfo *mi)
        for_each_online_node(nid)
                mem_cgroup_get_nr_pages(memcg, nid, mi->pages);
 
-       mi->slab_reclaimable = mem_cgroup_recursive_stat(memcg,
+       mi->slab_reclaimable = mem_cgroup_recursive_stat2(memcg,
                                        MEM_CGROUP_STAT_SLAB_RECLAIMABLE);
        mi->slab_unreclaimable = mem_cgroup_recursive_stat(memcg,
                                        MEM_CGROUP_STAT_SLAB_UNRECLAIMABLE);
-       mi->cached = mem_cgroup_recursive_stat(memcg, MEM_CGROUP_STAT_CACHE);
+       mi->cached = mem_cgroup_recursive_stat2(memcg, MEM_CGROUP_STAT_CACHE);
        mi->shmem = mem_cgroup_recursive_stat(memcg, MEM_CGROUP_STAT_SHMEM);
 }
 
@@ -4247,7 +4284,7 @@ int mem_cgroup_enough_memory(struct mem_cgroup *memcg, long pages)
        free += page_counter_read(&memcg->dcache);
 
        /* assume file cache is reclaimable */
-       free += mem_cgroup_recursive_stat(memcg, MEM_CGROUP_STAT_CACHE);
+       free += mem_cgroup_recursive_stat2(memcg, MEM_CGROUP_STAT_CACHE);
 
        /* but do not count shmem pages as they can't be purged,
         * only swapped out */
@@ -5094,6 +5131,10 @@ static int memcg_stat_show(struct cgroup *cont, struct cftype *cft,
                seq_printf(m, "%s %lu\n", mem_cgroup_stat_names[i],
                           mem_cgroup_read_stat(memcg, i) * PAGE_SIZE);
        }
+       for (i = 0; i < MEM_CGROUP_STAT2_NSTATS; i++) {
+               seq_printf(m, "%s %lu\n", mem_cgroup_stat2_names[i],
+                          mem_cgroup_read_stat2(memcg, i) * PAGE_SIZE);
+       }
 
        for (i = 0; i < MEM_CGROUP_EVENTS_NSTATS; i++)
                seq_printf(m, "%s %lu\n", mem_cgroup_events_names[i],
@@ -5124,6 +5165,13 @@ static int memcg_stat_show(struct cgroup *cont, struct cftype *cft,
                        val += mem_cgroup_read_stat(mi, i) * PAGE_SIZE;
                seq_printf(m, "total_%s %llu\n", mem_cgroup_stat_names[i], val);
        }
+       for (i = 0; i < MEM_CGROUP_STAT2_NSTATS; i++) {
+               unsigned long long val = 0;
+
+               for_each_mem_cgroup_tree(mi, memcg)
+                       val += mem_cgroup_read_stat2(mi, i) * PAGE_SIZE;
+               seq_printf(m, "total_%s %llu\n", mem_cgroup_stat2_names[i], val);
+       }
 
        for (i = 0; i < MEM_CGROUP_EVENTS_NSTATS; i++) {
                unsigned long long val = 0;
@@ -5858,6 +5906,7 @@ static struct mem_cgroup *mem_cgroup_alloc(void)
 {
        struct mem_cgroup *memcg;
        size_t size;
+       int i, ret;
 
        size = sizeof(struct mem_cgroup);
        size += nr_node_ids * sizeof(struct mem_cgroup_per_node *);
@@ -5869,9 +5918,20 @@ static struct mem_cgroup *mem_cgroup_alloc(void)
        memcg->stat = alloc_percpu(struct mem_cgroup_stat_cpu);
        if (!memcg->stat)
                goto out_free;
+
+       for (i = 0; i < MEM_CGROUP_STAT2_NSTATS; i++) {
+               ret = percpu_counter_init(&memcg->stat2.counters[i], 0, GFP_KERNEL);
+               if (ret)
+                       goto out_pcpu_free;
+       }
        spin_lock_init(&memcg->pcp_counter_lock);
        return memcg;
 
+out_pcpu_free:
+       while (--i >= 0)
+               percpu_counter_destroy(&memcg->stat2.counters[i]);
+
+       free_percpu(memcg->stat);
 out_free:
        kfree(memcg);
        return NULL;
@@ -7017,8 +7077,8 @@ static void uncharge_batch(struct mem_cgroup *memcg, unsigned long pgpgout,
        }
 
        local_irq_save(flags);
-       __this_cpu_sub(memcg->stat->count[MEM_CGROUP_STAT_RSS], nr_anon);
-       __this_cpu_sub(memcg->stat->count[MEM_CGROUP_STAT_CACHE], nr_file);
+       percpu_counter_sub(&memcg->stat2.counters[MEM_CGROUP_STAT_RSS], nr_anon);
+       percpu_counter_sub(&memcg->stat2.counters[MEM_CGROUP_STAT_CACHE], nr_file);
        __this_cpu_sub(memcg->stat->count[MEM_CGROUP_STAT_RSS_HUGE], nr_huge);
        __this_cpu_sub(memcg->stat->count[MEM_CGROUP_STAT_SHMEM], nr_shmem);
        __this_cpu_add(memcg->stat->events[MEM_CGROUP_EVENTS_PGPGOUT], pgpgout);
-- 
2.13.0
