In the next patch I need a quick way to get the value of
MEM_CGROUP_STAT_RSS. The current helper (mem_cgroup_read_stat) is slow
(it iterates over all cpus) and may sleep (it uses
get/put_online_cpus), so it's a no-go.

This patch converts the memory cgroup statistics and event counters to
percpu_counter so that a plain, non-blocking percpu_counter_read is
enough.

Signed-off-by: Vladimir Davydov <[email protected]>
---
 mm/memcontrol.c |  217 ++++++++++++++++++-------------------------------------
 1 file changed, 69 insertions(+), 148 deletions(-)
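
For illustration only (not part of this patch, and the helper name below
is made up): once the conversion is in place, a fast reader of
MEM_CGROUP_STAT_RSS could look roughly like this sketch:

	/* Approximate, non-blocking read: no cpu walk, no get_online_cpus(). */
	static long mem_cgroup_rss_fast(struct mem_cgroup *memcg)
	{
		return percpu_counter_read(&memcg->stat[MEM_CGROUP_STAT_RSS]);
	}

The value is approximate (per-cpu deltas below the percpu_counter batch
are not folded in), which is the usual percpu_counter trade-off.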

diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 085dc6d2f876..7e8d65e0608a 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -136,9 +136,7 @@ enum mem_cgroup_events_target {
 #define SOFTLIMIT_EVENTS_TARGET 1024
 #define NUMAINFO_EVENTS_TARGET 1024
 
-struct mem_cgroup_stat_cpu {
-       long count[MEM_CGROUP_STAT_NSTATS];
-       unsigned long events[MEM_CGROUP_EVENTS_NSTATS];
+struct mem_cgroup_ratelimit_state {
        unsigned long nr_page_events;
        unsigned long targets[MEM_CGROUP_NTARGETS];
 };
@@ -341,16 +339,10 @@ struct mem_cgroup {
        atomic_t        moving_account;
        /* taken only while moving_account > 0 */
        spinlock_t      move_lock;
-       /*
-        * percpu counter.
-        */
-       struct mem_cgroup_stat_cpu __percpu *stat;
-       /*
-        * used when a cpu is offlined or other synchronizations
-        * See mem_cgroup_read_stat().
-        */
-       struct mem_cgroup_stat_cpu nocpu_base;
-       spinlock_t pcp_counter_lock;
+
+       struct percpu_counter stat[MEM_CGROUP_STAT_NSTATS];
+       struct percpu_counter events[MEM_CGROUP_EVENTS_NSTATS];
+       struct mem_cgroup_ratelimit_state __percpu *ratelimit;
 
        atomic_t        dead_count;
 #if defined(CONFIG_MEMCG_KMEM) && defined(CONFIG_INET)
@@ -849,59 +841,16 @@ mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_zone *mctz)
        return mz;
 }
 
-/*
- * Implementation Note: reading percpu statistics for memcg.
- *
- * Both of vmstat[] and percpu_counter has threshold and do periodic
- * synchronization to implement "quick" read. There are trade-off between
- * reading cost and precision of value. Then, we may have a chance to implement
- * a periodic synchronizion of counter in memcg's counter.
- *
- * But this _read() function is used for user interface now. The user accounts
- * memory usage by memory cgroup and he _always_ requires exact value because
- * he accounts memory. Even if we provide quick-and-fuzzy read, we always
- * have to visit all online cpus and make sum. So, for now, unnecessary
- * synchronization is not implemented. (just implemented for cpu hotplug)
- *
- * If there are kernel internal actions which can make use of some not-exact
- * value, and reading all cpu value can be performance bottleneck in some
- * common workload, threashold and synchonization as vmstat[] should be
- * implemented.
- */
 static long mem_cgroup_read_stat(struct mem_cgroup *memcg,
                                 enum mem_cgroup_stat_index idx)
 {
-       long val = 0;
-       int cpu;
-
-       get_online_cpus();
-       for_each_online_cpu(cpu)
-               val += per_cpu(memcg->stat->count[idx], cpu);
-#ifdef CONFIG_HOTPLUG_CPU
-       spin_lock(&memcg->pcp_counter_lock);
-       val += memcg->nocpu_base.count[idx];
-       spin_unlock(&memcg->pcp_counter_lock);
-#endif
-       put_online_cpus();
-       return val;
+       return percpu_counter_read(&memcg->stat[idx]);
 }
 
 static unsigned long mem_cgroup_read_events(struct mem_cgroup *memcg,
                                            enum mem_cgroup_events_index idx)
 {
-       unsigned long val = 0;
-       int cpu;
-
-       get_online_cpus();
-       for_each_online_cpu(cpu)
-               val += per_cpu(memcg->stat->events[idx], cpu);
-#ifdef CONFIG_HOTPLUG_CPU
-       spin_lock(&memcg->pcp_counter_lock);
-       val += memcg->nocpu_base.events[idx];
-       spin_unlock(&memcg->pcp_counter_lock);
-#endif
-       put_online_cpus();
-       return val;
+       return percpu_counter_read(&memcg->events[idx]);
 }
 
 static void mem_cgroup_charge_statistics(struct mem_cgroup *memcg,
@@ -913,25 +862,21 @@ static void mem_cgroup_charge_statistics(struct mem_cgroup *memcg,
         * counted as CACHE even if it's on ANON LRU.
         */
        if (PageAnon(page))
-               __this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_RSS],
+               percpu_counter_add(&memcg->stat[MEM_CGROUP_STAT_RSS],
                                nr_pages);
        else
-               __this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_CACHE],
+               percpu_counter_add(&memcg->stat[MEM_CGROUP_STAT_CACHE],
                                nr_pages);
 
        if (PageTransHuge(page))
-               __this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_RSS_HUGE],
+               percpu_counter_add(&memcg->stat[MEM_CGROUP_STAT_RSS_HUGE],
                                nr_pages);
 
        /* pagein of a big page is an event. So, ignore page size */
        if (nr_pages > 0)
-               __this_cpu_inc(memcg->stat->events[MEM_CGROUP_EVENTS_PGPGIN]);
-       else {
-               __this_cpu_inc(memcg->stat->events[MEM_CGROUP_EVENTS_PGPGOUT]);
-               nr_pages = -nr_pages; /* for event */
-       }
-
-       __this_cpu_add(memcg->stat->nr_page_events, nr_pages);
+               percpu_counter_inc(&memcg->events[MEM_CGROUP_EVENTS_PGPGIN]);
+       else
+               percpu_counter_inc(&memcg->events[MEM_CGROUP_EVENTS_PGPGOUT]);
 }
 
 unsigned long mem_cgroup_get_lru_size(struct lruvec *lruvec, enum lru_list lru)
@@ -981,8 +926,8 @@ static bool mem_cgroup_event_ratelimit(struct mem_cgroup *memcg,
 {
        unsigned long val, next;
 
-       val = __this_cpu_read(memcg->stat->nr_page_events);
-       next = __this_cpu_read(memcg->stat->targets[target]);
+       val = __this_cpu_read(memcg->ratelimit->nr_page_events);
+       next = __this_cpu_read(memcg->ratelimit->targets[target]);
        /* from time_after() in jiffies.h */
        if ((long)next - (long)val < 0) {
                switch (target) {
@@ -998,7 +943,7 @@ static bool mem_cgroup_event_ratelimit(struct mem_cgroup *memcg,
                default:
                        break;
                }
-               __this_cpu_write(memcg->stat->targets[target], next);
+               __this_cpu_write(memcg->ratelimit->targets[target], next);
                return true;
        }
        return false;
@@ -1006,10 +951,15 @@ static bool mem_cgroup_event_ratelimit(struct mem_cgroup *memcg,
 
 /*
  * Check events in order.
- *
  */
-static void memcg_check_events(struct mem_cgroup *memcg, struct page *page)
+static void memcg_check_events(struct mem_cgroup *memcg, struct page *page,
+                              unsigned long nr_pages)
 {
+       unsigned long flags;
+
+       local_irq_save(flags);
+       __this_cpu_add(memcg->ratelimit->nr_page_events, nr_pages);
+
        /* threshold event is triggered in finer grain than soft limit */
        if (unlikely(mem_cgroup_event_ratelimit(memcg,
                                                MEM_CGROUP_TARGET_THRESH))) {
@@ -1030,6 +980,7 @@ static void memcg_check_events(struct mem_cgroup *memcg, struct page *page)
                        atomic_inc(&memcg->numainfo_events);
 #endif
        }
+       local_irq_restore(flags);
 }
 
 struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p)
@@ -1294,10 +1245,10 @@ void __mem_cgroup_count_vm_event(struct mm_struct *mm, enum vm_event_item idx)
 
        switch (idx) {
        case PGFAULT:
-               this_cpu_inc(memcg->stat->events[MEM_CGROUP_EVENTS_PGFAULT]);
+               percpu_counter_inc(&memcg->events[MEM_CGROUP_EVENTS_PGFAULT]);
                break;
        case PGMAJFAULT:
-               this_cpu_inc(memcg->stat->events[MEM_CGROUP_EVENTS_PGMAJFAULT]);
+               percpu_counter_inc(&memcg->events[MEM_CGROUP_EVENTS_PGMAJFAULT]);
                break;
        default:
                BUG();
@@ -2306,7 +2257,7 @@ void mem_cgroup_update_page_stat(struct page *page,
        if (unlikely(!memcg || !PageCgroupUsed(pc)))
                return;
 
-       this_cpu_add(memcg->stat->count[idx], val);
+       percpu_counter_add(&memcg->stat[idx], val);
 }
 
 /*
@@ -2476,37 +2427,12 @@ static void drain_all_stock_sync(struct mem_cgroup *root_memcg)
        mutex_unlock(&percpu_charge_mutex);
 }
 
-/*
- * This function drains percpu counter value from DEAD cpu and
- * move it to local cpu. Note that this function can be preempted.
- */
-static void mem_cgroup_drain_pcp_counter(struct mem_cgroup *memcg, int cpu)
-{
-       int i;
-
-       spin_lock(&memcg->pcp_counter_lock);
-       for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) {
-               long x = per_cpu(memcg->stat->count[i], cpu);
-
-               per_cpu(memcg->stat->count[i], cpu) = 0;
-               memcg->nocpu_base.count[i] += x;
-       }
-       for (i = 0; i < MEM_CGROUP_EVENTS_NSTATS; i++) {
-               unsigned long x = per_cpu(memcg->stat->events[i], cpu);
-
-               per_cpu(memcg->stat->events[i], cpu) = 0;
-               memcg->nocpu_base.events[i] += x;
-       }
-       spin_unlock(&memcg->pcp_counter_lock);
-}
-
 static int memcg_cpu_hotplug_callback(struct notifier_block *nb,
                                        unsigned long action,
                                        void *hcpu)
 {
        int cpu = (unsigned long)hcpu;
        struct memcg_stock_pcp *stock;
-       struct mem_cgroup *iter;
 
        if (action == CPU_ONLINE)
                return NOTIFY_OK;
@@ -2514,9 +2440,6 @@ static int memcg_cpu_hotplug_callback(struct notifier_block *nb,
        if (action != CPU_DEAD && action != CPU_DEAD_FROZEN)
                return NOTIFY_OK;
 
-       for_each_mem_cgroup(iter)
-               mem_cgroup_drain_pcp_counter(iter, cpu);
-
        stock = &per_cpu(memcg_stock, cpu);
        drain_stock(stock);
        return NOTIFY_OK;
@@ -3419,8 +3342,8 @@ void mem_cgroup_split_huge_fixup(struct page *head)
                pc->mem_cgroup = memcg;
                pc->flags = head_pc->flags;
        }
-       __this_cpu_sub(memcg->stat->count[MEM_CGROUP_STAT_RSS_HUGE],
-                      HPAGE_PMD_NR);
+       percpu_counter_sub(&memcg->stat[MEM_CGROUP_STAT_RSS_HUGE],
+                          HPAGE_PMD_NR);
 }
 #endif /* CONFIG_TRANSPARENT_HUGEPAGE */
 
@@ -3475,17 +3398,17 @@ static int mem_cgroup_move_account(struct page *page,
        move_lock_mem_cgroup(from, &flags);
 
        if (!PageAnon(page) && page_mapped(page)) {
-               __this_cpu_sub(from->stat->count[MEM_CGROUP_STAT_FILE_MAPPED],
-                              nr_pages);
-               __this_cpu_add(to->stat->count[MEM_CGROUP_STAT_FILE_MAPPED],
-                              nr_pages);
+               percpu_counter_sub(&from->stat[MEM_CGROUP_STAT_FILE_MAPPED],
+                                  nr_pages);
+               percpu_counter_add(&to->stat[MEM_CGROUP_STAT_FILE_MAPPED],
+                                  nr_pages);
        }
 
        if (PageWriteback(page)) {
-               __this_cpu_sub(from->stat->count[MEM_CGROUP_STAT_WRITEBACK],
-                              nr_pages);
-               __this_cpu_add(to->stat->count[MEM_CGROUP_STAT_WRITEBACK],
-                              nr_pages);
+               percpu_counter_sub(&from->stat[MEM_CGROUP_STAT_WRITEBACK],
+                                  nr_pages);
+               percpu_counter_add(&to->stat[MEM_CGROUP_STAT_WRITEBACK],
+                                  nr_pages);
        }
 
        /*
@@ -3499,12 +3422,10 @@ static int mem_cgroup_move_account(struct page *page,
        move_unlock_mem_cgroup(from, &flags);
        ret = 0;
 
-       local_irq_disable();
        mem_cgroup_charge_statistics(to, page, nr_pages);
-       memcg_check_events(to, page);
+       memcg_check_events(to, page, nr_pages);
        mem_cgroup_charge_statistics(from, page, -nr_pages);
-       memcg_check_events(from, page);
-       local_irq_enable();
+       memcg_check_events(from, page, nr_pages);
 out_unlock:
        unlock_page(page);
 out:
@@ -3582,7 +3503,7 @@ static void mem_cgroup_swap_statistics(struct mem_cgroup *memcg,
                                         bool charge)
 {
        int val = (charge) ? 1 : -1;
-       this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_SWAP], val);
+       percpu_counter_add(&memcg->stat[MEM_CGROUP_STAT_SWAP], val);
 }
 
 /**
@@ -5413,25 +5334,11 @@ static void free_mem_cgroup_per_zone_info(struct mem_cgroup *memcg, int node)
 
 static struct mem_cgroup *mem_cgroup_alloc(void)
 {
-       struct mem_cgroup *memcg;
        size_t size;
 
        size = sizeof(struct mem_cgroup);
        size += nr_node_ids * sizeof(struct mem_cgroup_per_node *);
-
-       memcg = kzalloc(size, GFP_KERNEL);
-       if (!memcg)
-               return NULL;
-
-       memcg->stat = alloc_percpu(struct mem_cgroup_stat_cpu);
-       if (!memcg->stat)
-               goto out_free;
-       spin_lock_init(&memcg->pcp_counter_lock);
-       return memcg;
-
-out_free:
-       kfree(memcg);
-       return NULL;
+       return kzalloc(size, GFP_KERNEL);
 }
 
 /*
@@ -5448,13 +5355,20 @@ out_free:
 static void __mem_cgroup_free(struct mem_cgroup *memcg)
 {
        int node;
+       int i;
 
        mem_cgroup_remove_from_trees(memcg);
 
        for_each_node(node)
                free_mem_cgroup_per_zone_info(memcg, node);
 
-       free_percpu(memcg->stat);
+       for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++)
+               percpu_counter_destroy(&memcg->stat[i]);
+
+       for (i = 0; i < MEM_CGROUP_EVENTS_NSTATS; i++)
+               percpu_counter_destroy(&memcg->events[i]);
+
+       free_percpu(memcg->ratelimit);
 
        /*
         * We need to make sure that (at least for now), the jump label
@@ -5511,11 +5425,24 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
        struct mem_cgroup *memcg;
        long error = -ENOMEM;
        int node;
+       int i;
 
        memcg = mem_cgroup_alloc();
        if (!memcg)
                return ERR_PTR(error);
 
+       for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++)
+               if (percpu_counter_init(&memcg->stat[i], 0, GFP_KERNEL))
+                       goto free_out;
+
+       for (i = 0; i < MEM_CGROUP_EVENTS_NSTATS; i++)
+               if (percpu_counter_init(&memcg->events[i], 0, GFP_KERNEL))
+                       goto free_out;
+
+       memcg->ratelimit = alloc_percpu(struct mem_cgroup_ratelimit_state);
+       if (!memcg->ratelimit)
+               goto free_out;
+
        for_each_node(node)
                if (alloc_mem_cgroup_per_zone_info(memcg, node))
                        goto free_out;
@@ -6507,10 +6434,8 @@ void mem_cgroup_commit_charge(struct page *page, struct mem_cgroup *memcg,
                VM_BUG_ON_PAGE(!PageTransHuge(page), page);
        }
 
-       local_irq_disable();
        mem_cgroup_charge_statistics(memcg, page, nr_pages);
-       memcg_check_events(memcg, page);
-       local_irq_enable();
+       memcg_check_events(memcg, page, nr_pages);
 
        if (do_swap_account && PageSwapCache(page)) {
                swp_entry_t entry = { .val = page_private(page) };
@@ -6557,8 +6482,6 @@ static void uncharge_batch(struct mem_cgroup *memcg, unsigned long pgpgout,
                           unsigned long nr_anon, unsigned long nr_file,
                           unsigned long nr_huge, struct page *dummy_page)
 {
-       unsigned long flags;
-
        if (!mem_cgroup_is_root(memcg)) {
                if (nr_mem)
                        res_counter_uncharge(&memcg->res,
@@ -6569,14 +6492,12 @@ static void uncharge_batch(struct mem_cgroup *memcg, unsigned long pgpgout,
                memcg_oom_recover(memcg);
        }
 
-       local_irq_save(flags);
-       __this_cpu_sub(memcg->stat->count[MEM_CGROUP_STAT_RSS], nr_anon);
-       __this_cpu_sub(memcg->stat->count[MEM_CGROUP_STAT_CACHE], nr_file);
-       __this_cpu_sub(memcg->stat->count[MEM_CGROUP_STAT_RSS_HUGE], nr_huge);
-       __this_cpu_add(memcg->stat->events[MEM_CGROUP_EVENTS_PGPGOUT], pgpgout);
-       __this_cpu_add(memcg->stat->nr_page_events, nr_anon + nr_file);
-       memcg_check_events(memcg, dummy_page);
-       local_irq_restore(flags);
+       percpu_counter_sub(&memcg->stat[MEM_CGROUP_STAT_RSS], nr_anon);
+       percpu_counter_sub(&memcg->stat[MEM_CGROUP_STAT_CACHE], nr_file);
+       percpu_counter_sub(&memcg->stat[MEM_CGROUP_STAT_RSS_HUGE], nr_huge);
+       percpu_counter_add(&memcg->events[MEM_CGROUP_EVENTS_PGPGOUT], pgpgout);
+
+       memcg_check_events(memcg, dummy_page, nr_anon + nr_file);
 }
 
 static void uncharge_list(struct list_head *page_list)
-- 
1.7.10.4
