(2014/09/12 0:41), Vladimir Davydov wrote:
> In the next patch I need a quick way to get a value of
> MEM_CGROUP_STAT_RSS. The current procedure (mem_cgroup_read_stat) is
> slow (iterates over all cpus) and may sleep (uses get/put_online_cpus),
> so it's a no-go.
> 
> This patch converts memory cgroup statistics to use percpu_counter so
> that percpu_counter_read will do the trick.
> 
> Signed-off-by: Vladimir Davydov <[email protected]>


I have no strong objections, but you need a performance comparison to go with this.
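
To be concrete about what such a comparison would measure: percpu_counter_read()
just returns the central count, so it is O(1) but can lag the exact value by
roughly percpu_counter_batch * num_online_cpus() per counter, while an exact
read still has to fold in every cpu's delta. A rough sketch of the two read
paths (the helper names here are made up for illustration, they are not in the
patch):

#include <linux/percpu_counter.h>

/* Fast but approximate: central count only, no locks, no cpu walk. */
static inline s64 memcg_stat_read_fast(struct percpu_counter *fbc)
{
        return percpu_counter_read(fbc);
}

/*
 * Exact: folds in each cpu's delta under the counter lock, i.e. about
 * the same amount of work as the old mem_cgroup_read_stat() loop.
 */
static inline s64 memcg_stat_read_exact(struct percpu_counter *fbc)
{
        return percpu_counter_sum(fbc);
}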

I thought percpu_counter was messy to use as an "array" like this.
I also can't understand why you started by fixing a future performance problem
before the new feature is merged.
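
On the "array" point: every counter in such an array needs its own percpu
allocation and its own init/destroy call, with unwinding when one of them
fails, roughly like the sketch below (illustrative only, not code from the
patch):

#include <linux/gfp.h>
#include <linux/percpu_counter.h>

/* Set up an array of counters, tearing down the ones already done on error. */
static int init_counter_array(struct percpu_counter *ctr, int nr)
{
        int i, err;

        for (i = 0; i < nr; i++) {
                err = percpu_counter_init(&ctr[i], 0, GFP_KERNEL);
                if (err)
                        goto undo;
        }
        return 0;
undo:
        while (--i >= 0)
                percpu_counter_destroy(&ctr[i]);
        return err;
}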

Thanks,
-Kame


> ---
>   mm/memcontrol.c |  217 ++++++++++++++++++-------------------------------------
>   1 file changed, 69 insertions(+), 148 deletions(-)
> 
> diff --git a/mm/memcontrol.c b/mm/memcontrol.c
> index 085dc6d2f876..7e8d65e0608a 100644
> --- a/mm/memcontrol.c
> +++ b/mm/memcontrol.c
> @@ -136,9 +136,7 @@ enum mem_cgroup_events_target {
>   #define SOFTLIMIT_EVENTS_TARGET 1024
>   #define NUMAINFO_EVENTS_TARGET      1024
>   
> -struct mem_cgroup_stat_cpu {
> -     long count[MEM_CGROUP_STAT_NSTATS];
> -     unsigned long events[MEM_CGROUP_EVENTS_NSTATS];
> +struct mem_cgroup_ratelimit_state {
>       unsigned long nr_page_events;
>       unsigned long targets[MEM_CGROUP_NTARGETS];
>   };
> @@ -341,16 +339,10 @@ struct mem_cgroup {
>       atomic_t        moving_account;
>       /* taken only while moving_account > 0 */
>       spinlock_t      move_lock;
> -     /*
> -      * percpu counter.
> -      */
> -     struct mem_cgroup_stat_cpu __percpu *stat;
> -     /*
> -      * used when a cpu is offlined or other synchronizations
> -      * See mem_cgroup_read_stat().
> -      */
> -     struct mem_cgroup_stat_cpu nocpu_base;
> -     spinlock_t pcp_counter_lock;
> +
> +     struct percpu_counter stat[MEM_CGROUP_STAT_NSTATS];
> +     struct percpu_counter events[MEM_CGROUP_EVENTS_NSTATS];
> +     struct mem_cgroup_ratelimit_state __percpu *ratelimit;
>   
>       atomic_t        dead_count;
>   #if defined(CONFIG_MEMCG_KMEM) && defined(CONFIG_INET)
> @@ -849,59 +841,16 @@ mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_zone *mctz)
>       return mz;
>   }
>   
> -/*
> - * Implementation Note: reading percpu statistics for memcg.
> - *
> - * Both of vmstat[] and percpu_counter has threshold and do periodic
> - * synchronization to implement "quick" read. There are trade-off between
> - * reading cost and precision of value. Then, we may have a chance to implement
> - * a periodic synchronizion of counter in memcg's counter.
> - *
> - * But this _read() function is used for user interface now. The user accounts
> - * memory usage by memory cgroup and he _always_ requires exact value because
> - * he accounts memory. Even if we provide quick-and-fuzzy read, we always
> - * have to visit all online cpus and make sum. So, for now, unnecessary
> - * synchronization is not implemented. (just implemented for cpu hotplug)
> - *
> - * If there are kernel internal actions which can make use of some not-exact
> - * value, and reading all cpu value can be performance bottleneck in some
> - * common workload, threashold and synchonization as vmstat[] should be
> - * implemented.
> - */
>   static long mem_cgroup_read_stat(struct mem_cgroup *memcg,
>                                enum mem_cgroup_stat_index idx)
>   {
> -     long val = 0;
> -     int cpu;
> -
> -     get_online_cpus();
> -     for_each_online_cpu(cpu)
> -             val += per_cpu(memcg->stat->count[idx], cpu);
> -#ifdef CONFIG_HOTPLUG_CPU
> -     spin_lock(&memcg->pcp_counter_lock);
> -     val += memcg->nocpu_base.count[idx];
> -     spin_unlock(&memcg->pcp_counter_lock);
> -#endif
> -     put_online_cpus();
> -     return val;
> +     return percpu_counter_read(&memcg->stat[idx]);
>   }
>   
>   static unsigned long mem_cgroup_read_events(struct mem_cgroup *memcg,
>                                           enum mem_cgroup_events_index idx)
>   {
> -     unsigned long val = 0;
> -     int cpu;
> -
> -     get_online_cpus();
> -     for_each_online_cpu(cpu)
> -             val += per_cpu(memcg->stat->events[idx], cpu);
> -#ifdef CONFIG_HOTPLUG_CPU
> -     spin_lock(&memcg->pcp_counter_lock);
> -     val += memcg->nocpu_base.events[idx];
> -     spin_unlock(&memcg->pcp_counter_lock);
> -#endif
> -     put_online_cpus();
> -     return val;
> +     return percpu_counter_read(&memcg->events[idx]);
>   }
>   
>   static void mem_cgroup_charge_statistics(struct mem_cgroup *memcg,
> @@ -913,25 +862,21 @@ static void mem_cgroup_charge_statistics(struct mem_cgroup *memcg,
>        * counted as CACHE even if it's on ANON LRU.
>        */
>       if (PageAnon(page))
> -             __this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_RSS],
> +             percpu_counter_add(&memcg->stat[MEM_CGROUP_STAT_RSS],
>                               nr_pages);
>       else
> -             __this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_CACHE],
> +             percpu_counter_add(&memcg->stat[MEM_CGROUP_STAT_CACHE],
>                               nr_pages);
>   
>       if (PageTransHuge(page))
> -             __this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_RSS_HUGE],
> +             percpu_counter_add(&memcg->stat[MEM_CGROUP_STAT_RSS_HUGE],
>                               nr_pages);
>   
>       /* pagein of a big page is an event. So, ignore page size */
>       if (nr_pages > 0)
> -             __this_cpu_inc(memcg->stat->events[MEM_CGROUP_EVENTS_PGPGIN]);
> -     else {
> -             __this_cpu_inc(memcg->stat->events[MEM_CGROUP_EVENTS_PGPGOUT]);
> -             nr_pages = -nr_pages; /* for event */
> -     }
> -
> -     __this_cpu_add(memcg->stat->nr_page_events, nr_pages);
> +             percpu_counter_inc(&memcg->events[MEM_CGROUP_EVENTS_PGPGIN]);
> +     else
> +             percpu_counter_inc(&memcg->events[MEM_CGROUP_EVENTS_PGPGOUT]);
>   }
>   
>   unsigned long mem_cgroup_get_lru_size(struct lruvec *lruvec, enum lru_list lru)
> @@ -981,8 +926,8 @@ static bool mem_cgroup_event_ratelimit(struct mem_cgroup *memcg,
>   {
>       unsigned long val, next;
>   
> -     val = __this_cpu_read(memcg->stat->nr_page_events);
> -     next = __this_cpu_read(memcg->stat->targets[target]);
> +     val = __this_cpu_read(memcg->ratelimit->nr_page_events);
> +     next = __this_cpu_read(memcg->ratelimit->targets[target]);
>       /* from time_after() in jiffies.h */
>       if ((long)next - (long)val < 0) {
>               switch (target) {
> @@ -998,7 +943,7 @@ static bool mem_cgroup_event_ratelimit(struct mem_cgroup *memcg,
>               default:
>                       break;
>               }
> -             __this_cpu_write(memcg->stat->targets[target], next);
> +             __this_cpu_write(memcg->ratelimit->targets[target], next);
>               return true;
>       }
>       return false;
> @@ -1006,10 +951,15 @@ static bool mem_cgroup_event_ratelimit(struct mem_cgroup *memcg,
>   
>   /*
>    * Check events in order.
> - *
>    */
> -static void memcg_check_events(struct mem_cgroup *memcg, struct page *page)
> +static void memcg_check_events(struct mem_cgroup *memcg, struct page *page,
> +                            unsigned long nr_pages)
>   {
> +     unsigned long flags;
> +
> +     local_irq_save(flags);
> +     __this_cpu_add(memcg->ratelimit->nr_page_events, nr_pages);
> +
>       /* threshold event is triggered in finer grain than soft limit */
>       if (unlikely(mem_cgroup_event_ratelimit(memcg,
>                                               MEM_CGROUP_TARGET_THRESH))) {
> @@ -1030,6 +980,7 @@ static void memcg_check_events(struct mem_cgroup *memcg, struct page *page)
>                       atomic_inc(&memcg->numainfo_events);
>   #endif
>       }
> +     local_irq_restore(flags);
>   }
>   
>   struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p)
> @@ -1294,10 +1245,10 @@ void __mem_cgroup_count_vm_event(struct mm_struct *mm, enum vm_event_item idx)
>   
>       switch (idx) {
>       case PGFAULT:
> -             this_cpu_inc(memcg->stat->events[MEM_CGROUP_EVENTS_PGFAULT]);
> +             percpu_counter_inc(&memcg->events[MEM_CGROUP_EVENTS_PGFAULT]);
>               break;
>       case PGMAJFAULT:
> -             this_cpu_inc(memcg->stat->events[MEM_CGROUP_EVENTS_PGMAJFAULT]);
> +             percpu_counter_inc(&memcg->events[MEM_CGROUP_EVENTS_PGMAJFAULT]);
>               break;
>       default:
>               BUG();
> @@ -2306,7 +2257,7 @@ void mem_cgroup_update_page_stat(struct page *page,
>       if (unlikely(!memcg || !PageCgroupUsed(pc)))
>               return;
>   
> -     this_cpu_add(memcg->stat->count[idx], val);
> +     percpu_counter_add(&memcg->stat[idx], val);
>   }
>   
>   /*
> @@ -2476,37 +2427,12 @@ static void drain_all_stock_sync(struct mem_cgroup *root_memcg)
>       mutex_unlock(&percpu_charge_mutex);
>   }
>   
> -/*
> - * This function drains percpu counter value from DEAD cpu and
> - * move it to local cpu. Note that this function can be preempted.
> - */
> -static void mem_cgroup_drain_pcp_counter(struct mem_cgroup *memcg, int cpu)
> -{
> -     int i;
> -
> -     spin_lock(&memcg->pcp_counter_lock);
> -     for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) {
> -             long x = per_cpu(memcg->stat->count[i], cpu);
> -
> -             per_cpu(memcg->stat->count[i], cpu) = 0;
> -             memcg->nocpu_base.count[i] += x;
> -     }
> -     for (i = 0; i < MEM_CGROUP_EVENTS_NSTATS; i++) {
> -             unsigned long x = per_cpu(memcg->stat->events[i], cpu);
> -
> -             per_cpu(memcg->stat->events[i], cpu) = 0;
> -             memcg->nocpu_base.events[i] += x;
> -     }
> -     spin_unlock(&memcg->pcp_counter_lock);
> -}
> -
>   static int memcg_cpu_hotplug_callback(struct notifier_block *nb,
>                                       unsigned long action,
>                                       void *hcpu)
>   {
>       int cpu = (unsigned long)hcpu;
>       struct memcg_stock_pcp *stock;
> -     struct mem_cgroup *iter;
>   
>       if (action == CPU_ONLINE)
>               return NOTIFY_OK;
> @@ -2514,9 +2440,6 @@ static int memcg_cpu_hotplug_callback(struct notifier_block *nb,
>       if (action != CPU_DEAD && action != CPU_DEAD_FROZEN)
>               return NOTIFY_OK;
>   
> -     for_each_mem_cgroup(iter)
> -             mem_cgroup_drain_pcp_counter(iter, cpu);
> -
>       stock = &per_cpu(memcg_stock, cpu);
>       drain_stock(stock);
>       return NOTIFY_OK;
> @@ -3419,8 +3342,8 @@ void mem_cgroup_split_huge_fixup(struct page *head)
>               pc->mem_cgroup = memcg;
>               pc->flags = head_pc->flags;
>       }
> -     __this_cpu_sub(memcg->stat->count[MEM_CGROUP_STAT_RSS_HUGE],
> -                    HPAGE_PMD_NR);
> +     percpu_counter_sub(&memcg->stat[MEM_CGROUP_STAT_RSS_HUGE],
> +                        HPAGE_PMD_NR);
>   }
>   #endif /* CONFIG_TRANSPARENT_HUGEPAGE */
>   
> @@ -3475,17 +3398,17 @@ static int mem_cgroup_move_account(struct page *page,
>       move_lock_mem_cgroup(from, &flags);
>   
>       if (!PageAnon(page) && page_mapped(page)) {
> -             __this_cpu_sub(from->stat->count[MEM_CGROUP_STAT_FILE_MAPPED],
> -                            nr_pages);
> -             __this_cpu_add(to->stat->count[MEM_CGROUP_STAT_FILE_MAPPED],
> -                            nr_pages);
> +             percpu_counter_sub(&from->stat[MEM_CGROUP_STAT_FILE_MAPPED],
> +                                nr_pages);
> +             percpu_counter_add(&to->stat[MEM_CGROUP_STAT_FILE_MAPPED],
> +                                nr_pages);
>       }
>   
>       if (PageWriteback(page)) {
> -             __this_cpu_sub(from->stat->count[MEM_CGROUP_STAT_WRITEBACK],
> -                            nr_pages);
> -             __this_cpu_add(to->stat->count[MEM_CGROUP_STAT_WRITEBACK],
> -                            nr_pages);
> +             percpu_counter_sub(&from->stat[MEM_CGROUP_STAT_WRITEBACK],
> +                                nr_pages);
> +             percpu_counter_add(&to->stat[MEM_CGROUP_STAT_WRITEBACK],
> +                                nr_pages);
>       }
>   
>       /*
> @@ -3499,12 +3422,10 @@ static int mem_cgroup_move_account(struct page *page,
>       move_unlock_mem_cgroup(from, &flags);
>       ret = 0;
>   
> -     local_irq_disable();
>       mem_cgroup_charge_statistics(to, page, nr_pages);
> -     memcg_check_events(to, page);
> +     memcg_check_events(to, page, nr_pages);
>       mem_cgroup_charge_statistics(from, page, -nr_pages);
> -     memcg_check_events(from, page);
> -     local_irq_enable();
> +     memcg_check_events(from, page, nr_pages);
>   out_unlock:
>       unlock_page(page);
>   out:
> @@ -3582,7 +3503,7 @@ static void mem_cgroup_swap_statistics(struct mem_cgroup *memcg,
>                                        bool charge)
>   {
>       int val = (charge) ? 1 : -1;
> -     this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_SWAP], val);
> +     percpu_counter_add(&memcg->stat[MEM_CGROUP_STAT_SWAP], val);
>   }
>   
>   /**
> @@ -5413,25 +5334,11 @@ static void free_mem_cgroup_per_zone_info(struct mem_cgroup *memcg, int node)
>   
>   static struct mem_cgroup *mem_cgroup_alloc(void)
>   {
> -     struct mem_cgroup *memcg;
>       size_t size;
>   
>       size = sizeof(struct mem_cgroup);
>       size += nr_node_ids * sizeof(struct mem_cgroup_per_node *);
> -
> -     memcg = kzalloc(size, GFP_KERNEL);
> -     if (!memcg)
> -             return NULL;
> -
> -     memcg->stat = alloc_percpu(struct mem_cgroup_stat_cpu);
> -     if (!memcg->stat)
> -             goto out_free;
> -     spin_lock_init(&memcg->pcp_counter_lock);
> -     return memcg;
> -
> -out_free:
> -     kfree(memcg);
> -     return NULL;
> +     return kzalloc(size, GFP_KERNEL);
>   }
>   
>   /*
> @@ -5448,13 +5355,20 @@ out_free:
>   static void __mem_cgroup_free(struct mem_cgroup *memcg)
>   {
>       int node;
> +     int i;
>   
>       mem_cgroup_remove_from_trees(memcg);
>   
>       for_each_node(node)
>               free_mem_cgroup_per_zone_info(memcg, node);
>   
> -     free_percpu(memcg->stat);
> +     for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++)
> +             percpu_counter_destroy(&memcg->stat[i]);
> +
> +     for (i = 0; i < MEM_CGROUP_EVENTS_NSTATS; i++)
> +             percpu_counter_destroy(&memcg->events[i]);
> +
> +     free_percpu(memcg->ratelimit);
>   
>       /*
>        * We need to make sure that (at least for now), the jump label
> @@ -5511,11 +5425,24 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
>       struct mem_cgroup *memcg;
>       long error = -ENOMEM;
>       int node;
> +     int i;
>   
>       memcg = mem_cgroup_alloc();
>       if (!memcg)
>               return ERR_PTR(error);
>   
> +     for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++)
> +             if (percpu_counter_init(&memcg->stat[i], 0, GFP_KERNEL))
> +                     goto free_out;
> +
> +     for (i = 0; i < MEM_CGROUP_EVENTS_NSTATS; i++)
> +             if (percpu_counter_init(&memcg->events[i], 0, GFP_KERNEL))
> +                     goto free_out;
> +
> +     memcg->ratelimit = alloc_percpu(struct mem_cgroup_ratelimit_state);
> +     if (!memcg->ratelimit)
> +             goto free_out;
> +
>       for_each_node(node)
>               if (alloc_mem_cgroup_per_zone_info(memcg, node))
>                       goto free_out;
> @@ -6507,10 +6434,8 @@ void mem_cgroup_commit_charge(struct page *page, struct mem_cgroup *memcg,
>               VM_BUG_ON_PAGE(!PageTransHuge(page), page);
>       }
>   
> -     local_irq_disable();
>       mem_cgroup_charge_statistics(memcg, page, nr_pages);
> -     memcg_check_events(memcg, page);
> -     local_irq_enable();
> +     memcg_check_events(memcg, page, nr_pages);
>   
>       if (do_swap_account && PageSwapCache(page)) {
>               swp_entry_t entry = { .val = page_private(page) };
> @@ -6557,8 +6482,6 @@ static void uncharge_batch(struct mem_cgroup *memcg, unsigned long pgpgout,
>                          unsigned long nr_anon, unsigned long nr_file,
>                          unsigned long nr_huge, struct page *dummy_page)
>   {
> -     unsigned long flags;
> -
>       if (!mem_cgroup_is_root(memcg)) {
>               if (nr_mem)
>                       res_counter_uncharge(&memcg->res,
> @@ -6569,14 +6492,12 @@ static void uncharge_batch(struct mem_cgroup *memcg, unsigned long pgpgout,
>               memcg_oom_recover(memcg);
>       }
>   
> -     local_irq_save(flags);
> -     __this_cpu_sub(memcg->stat->count[MEM_CGROUP_STAT_RSS], nr_anon);
> -     __this_cpu_sub(memcg->stat->count[MEM_CGROUP_STAT_CACHE], nr_file);
> -     __this_cpu_sub(memcg->stat->count[MEM_CGROUP_STAT_RSS_HUGE], nr_huge);
> -     __this_cpu_add(memcg->stat->events[MEM_CGROUP_EVENTS_PGPGOUT], pgpgout);
> -     __this_cpu_add(memcg->stat->nr_page_events, nr_anon + nr_file);
> -     memcg_check_events(memcg, dummy_page);
> -     local_irq_restore(flags);
> +     percpu_counter_sub(&memcg->stat[MEM_CGROUP_STAT_RSS], nr_anon);
> +     percpu_counter_sub(&memcg->stat[MEM_CGROUP_STAT_CACHE], nr_file);
> +     percpu_counter_sub(&memcg->stat[MEM_CGROUP_STAT_RSS_HUGE], nr_huge);
> +     percpu_counter_add(&memcg->events[MEM_CGROUP_EVENTS_PGPGOUT], pgpgout);
> +
> +     memcg_check_events(memcg, dummy_page, nr_anon + nr_file);
>   }
>   
>   static void uncharge_list(struct list_head *page_list)
> 


