Re: [patch 1/3] mm: embed the memcg pointer directly into struct page

Michal Hocko Mon, 03 Nov 2014 08:53:09 -0800

On Sat 01-11-14 23:15:54, Johannes Weiner wrote:
> Memory cgroups used to have 5 per-page pointers.  To allow users to
> disable that amount of overhead during runtime, those pointers were
> allocated in a separate array, with a translation layer between them
> and struct page.
> 
> There is now only one page pointer remaining: the memcg pointer, that
> indicates which cgroup the page is associated with when charged.  The
> complexity of runtime allocation and the runtime translation overhead
> is no longer justified to save that *potential* 0.19% of memory.  With
> CONFIG_SLUB, page->mem_cgroup actually sits in the doubleword padding
> after the page->private member and doesn't even increase struct page,
> and then this patch actually saves space.  Remaining users that care
> can still compile their kernels without CONFIG_MEMCG.


CONFIG_SLAB should be OK as well because there should be one slot for
pointer left if no special debugging is enabled.

>    text    data     bss     dec     hex     filename
> 8828345 1725264  983040 11536649 b00909  vmlinux.old
> 8827425 1725264  966656 11519345 afc571  vmlinux.new
> 
> Signed-off-by: Johannes Weiner <han...@cmpxchg.org>

Very nice!

Acked-by: Michal Hocko <mho...@suse.cz>

Thanks!

> ---
>  include/linux/memcontrol.h  |   6 +-
>  include/linux/mm_types.h    |   5 +
>  include/linux/mmzone.h      |  12 --
>  include/linux/page_cgroup.h |  53 --------
>  init/main.c                 |   7 -
>  mm/memcontrol.c             | 124 +++++------------
>  mm/page_alloc.c             |   2 -
>  mm/page_cgroup.c            | 319 
> --------------------------------------------
>  8 files changed, 41 insertions(+), 487 deletions(-)
> 
> diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
> index d4575a1d6e99..dafba59b31b4 100644
> --- a/include/linux/memcontrol.h
> +++ b/include/linux/memcontrol.h
> @@ -25,7 +25,6 @@
>  #include <linux/jump_label.h>
>  
>  struct mem_cgroup;
> -struct page_cgroup;
>  struct page;
>  struct mm_struct;
>  struct kmem_cache;
> @@ -466,8 +465,6 @@ memcg_kmem_newpage_charge(gfp_t gfp, struct mem_cgroup 
> **memcg, int order)
>   * memcg_kmem_uncharge_pages: uncharge pages from memcg
>   * @page: pointer to struct page being freed
>   * @order: allocation order.
> - *
> - * there is no need to specify memcg here, since it is embedded in 
> page_cgroup
>   */
>  static inline void
>  memcg_kmem_uncharge_pages(struct page *page, int order)
> @@ -484,8 +481,7 @@ memcg_kmem_uncharge_pages(struct page *page, int order)
>   *
>   * Needs to be called after memcg_kmem_newpage_charge, regardless of success 
> or
>   * failure of the allocation. if @page is NULL, this function will revert the
> - * charges. Otherwise, it will commit the memcg given by @memcg to the
> - * corresponding page_cgroup.
> + * charges. Otherwise, it will commit @page to @memcg.
>   */
>  static inline void
>  memcg_kmem_commit_charge(struct page *page, struct mem_cgroup *memcg, int 
> order)
> diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
> index de183328abb0..57e47dffbdda 100644
> --- a/include/linux/mm_types.h
> +++ b/include/linux/mm_types.h
> @@ -23,6 +23,7 @@
>  #define AT_VECTOR_SIZE (2*(AT_VECTOR_SIZE_ARCH + AT_VECTOR_SIZE_BASE + 1))
>  
>  struct address_space;
> +struct mem_cgroup;
>  
>  #define USE_SPLIT_PTE_PTLOCKS        (NR_CPUS >= CONFIG_SPLIT_PTLOCK_CPUS)
>  #define USE_SPLIT_PMD_PTLOCKS        (USE_SPLIT_PTE_PTLOCKS && \
> @@ -168,6 +169,10 @@ struct page {
>               struct page *first_page;        /* Compound tail pages */
>       };
>  
> +#ifdef CONFIG_MEMCG
> +     struct mem_cgroup *mem_cgroup;
> +#endif
> +
>       /*
>        * On machines where all RAM is mapped into kernel address space,
>        * we can simply calculate the virtual address. On machines with
> diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
> index 48bf12ef6620..de32d9344446 100644
> --- a/include/linux/mmzone.h
> +++ b/include/linux/mmzone.h
> @@ -713,9 +713,6 @@ typedef struct pglist_data {
>       int nr_zones;
>  #ifdef CONFIG_FLAT_NODE_MEM_MAP      /* means !SPARSEMEM */
>       struct page *node_mem_map;
> -#ifdef CONFIG_MEMCG
> -     struct page_cgroup *node_page_cgroup;
> -#endif
>  #endif
>  #ifndef CONFIG_NO_BOOTMEM
>       struct bootmem_data *bdata;
> @@ -1069,7 +1066,6 @@ static inline unsigned long early_pfn_to_nid(unsigned 
> long pfn)
>  #define SECTION_ALIGN_DOWN(pfn)      ((pfn) & PAGE_SECTION_MASK)
>  
>  struct page;
> -struct page_cgroup;
>  struct mem_section {
>       /*
>        * This is, logically, a pointer to an array of struct
> @@ -1087,14 +1083,6 @@ struct mem_section {
>  
>       /* See declaration of similar field in struct zone */
>       unsigned long *pageblock_flags;
> -#ifdef CONFIG_MEMCG
> -     /*
> -      * If !SPARSEMEM, pgdat doesn't have page_cgroup pointer. We use
> -      * section. (see memcontrol.h/page_cgroup.h about this.)
> -      */
> -     struct page_cgroup *page_cgroup;
> -     unsigned long pad;
> -#endif
>       /*
>        * WARNING: mem_section must be a power-of-2 in size for the
>        * calculation and use of SECTION_ROOT_MASK to make sense.
> diff --git a/include/linux/page_cgroup.h b/include/linux/page_cgroup.h
> index 1289be6b436c..65be35785c86 100644
> --- a/include/linux/page_cgroup.h
> +++ b/include/linux/page_cgroup.h
> @@ -1,59 +1,6 @@
>  #ifndef __LINUX_PAGE_CGROUP_H
>  #define __LINUX_PAGE_CGROUP_H
>  
> -struct pglist_data;
> -
> -#ifdef CONFIG_MEMCG
> -struct mem_cgroup;
> -
> -/*
> - * Page Cgroup can be considered as an extended mem_map.
> - * A page_cgroup page is associated with every page descriptor. The
> - * page_cgroup helps us identify information about the cgroup
> - * All page cgroups are allocated at boot or memory hotplug event,
> - * then the page cgroup for pfn always exists.
> - */
> -struct page_cgroup {
> -     struct mem_cgroup *mem_cgroup;
> -};
> -
> -extern void pgdat_page_cgroup_init(struct pglist_data *pgdat);
> -
> -#ifdef CONFIG_SPARSEMEM
> -static inline void page_cgroup_init_flatmem(void)
> -{
> -}
> -extern void page_cgroup_init(void);
> -#else
> -extern void page_cgroup_init_flatmem(void);
> -static inline void page_cgroup_init(void)
> -{
> -}
> -#endif
> -
> -struct page_cgroup *lookup_page_cgroup(struct page *page);
> -
> -#else /* !CONFIG_MEMCG */
> -struct page_cgroup;
> -
> -static inline void pgdat_page_cgroup_init(struct pglist_data *pgdat)
> -{
> -}
> -
> -static inline struct page_cgroup *lookup_page_cgroup(struct page *page)
> -{
> -     return NULL;
> -}
> -
> -static inline void page_cgroup_init(void)
> -{
> -}
> -
> -static inline void page_cgroup_init_flatmem(void)
> -{
> -}
> -#endif /* CONFIG_MEMCG */
> -
>  #include <linux/swap.h>
>  
>  #ifdef CONFIG_MEMCG_SWAP
> diff --git a/init/main.c b/init/main.c
> index c4912d9abaee..a091bbcadd33 100644
> --- a/init/main.c
> +++ b/init/main.c
> @@ -51,7 +51,6 @@
>  #include <linux/mempolicy.h>
>  #include <linux/key.h>
>  #include <linux/buffer_head.h>
> -#include <linux/page_cgroup.h>
>  #include <linux/debug_locks.h>
>  #include <linux/debugobjects.h>
>  #include <linux/lockdep.h>
> @@ -485,11 +484,6 @@ void __init __weak thread_info_cache_init(void)
>   */
>  static void __init mm_init(void)
>  {
> -     /*
> -      * page_cgroup requires contiguous pages,
> -      * bigger than MAX_ORDER unless SPARSEMEM.
> -      */
> -     page_cgroup_init_flatmem();
>       mem_init();
>       kmem_cache_init();
>       percpu_init_late();
> @@ -627,7 +621,6 @@ asmlinkage __visible void __init start_kernel(void)
>               initrd_start = 0;
>       }
>  #endif
> -     page_cgroup_init();
>       debug_objects_mem_init();
>       kmemleak_init();
>       setup_per_cpu_pageset();
> diff --git a/mm/memcontrol.c b/mm/memcontrol.c
> index 19bac2ce827c..dc5e0abb18cb 100644
> --- a/mm/memcontrol.c
> +++ b/mm/memcontrol.c
> @@ -1274,7 +1274,6 @@ struct lruvec *mem_cgroup_page_lruvec(struct page 
> *page, struct zone *zone)
>  {
>       struct mem_cgroup_per_zone *mz;
>       struct mem_cgroup *memcg;
> -     struct page_cgroup *pc;
>       struct lruvec *lruvec;
>  
>       if (mem_cgroup_disabled()) {
> @@ -1282,8 +1281,7 @@ struct lruvec *mem_cgroup_page_lruvec(struct page 
> *page, struct zone *zone)
>               goto out;
>       }
>  
> -     pc = lookup_page_cgroup(page);
> -     memcg = pc->mem_cgroup;
> +     memcg = page->mem_cgroup;
>       /*
>        * Swapcache readahead pages are added to the LRU - and
>        * possibly migrated - before they are charged.
> @@ -2020,16 +2018,13 @@ struct mem_cgroup *mem_cgroup_begin_page_stat(struct 
> page *page,
>                                             unsigned long *flags)
>  {
>       struct mem_cgroup *memcg;
> -     struct page_cgroup *pc;
>  
>       rcu_read_lock();
>  
>       if (mem_cgroup_disabled())
>               return NULL;
> -
> -     pc = lookup_page_cgroup(page);
>  again:
> -     memcg = pc->mem_cgroup;
> +     memcg = page->mem_cgroup;
>       if (unlikely(!memcg))
>               return NULL;
>  
> @@ -2038,7 +2033,7 @@ again:
>               return memcg;
>  
>       spin_lock_irqsave(&memcg->move_lock, *flags);
> -     if (memcg != pc->mem_cgroup) {
> +     if (memcg != page->mem_cgroup) {
>               spin_unlock_irqrestore(&memcg->move_lock, *flags);
>               goto again;
>       }
> @@ -2405,15 +2400,12 @@ static struct mem_cgroup *mem_cgroup_lookup(unsigned 
> short id)
>  struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page)
>  {
>       struct mem_cgroup *memcg;
> -     struct page_cgroup *pc;
>       unsigned short id;
>       swp_entry_t ent;
>  
>       VM_BUG_ON_PAGE(!PageLocked(page), page);
>  
> -     pc = lookup_page_cgroup(page);
> -     memcg = pc->mem_cgroup;
> -
> +     memcg = page->mem_cgroup;
>       if (memcg) {
>               if (!css_tryget_online(&memcg->css))
>                       memcg = NULL;
> @@ -2463,10 +2455,9 @@ static void unlock_page_lru(struct page *page, int 
> isolated)
>  static void commit_charge(struct page *page, struct mem_cgroup *memcg,
>                         bool lrucare)
>  {
> -     struct page_cgroup *pc = lookup_page_cgroup(page);
>       int isolated;
>  
> -     VM_BUG_ON_PAGE(pc->mem_cgroup, page);
> +     VM_BUG_ON_PAGE(page->mem_cgroup, page);
>  
>       /*
>        * In some cases, SwapCache and FUSE(splice_buf->radixtree), the page
> @@ -2477,7 +2468,7 @@ static void commit_charge(struct page *page, struct 
> mem_cgroup *memcg,
>  
>       /*
>        * Nobody should be changing or seriously looking at
> -      * pc->mem_cgroup at this point:
> +      * page->mem_cgroup at this point:
>        *
>        * - the page is uncharged
>        *
> @@ -2489,7 +2480,7 @@ static void commit_charge(struct page *page, struct 
> mem_cgroup *memcg,
>        * - a page cache insertion, a swapin fault, or a migration
>        *   have the page locked
>        */
> -     pc->mem_cgroup = memcg;
> +     page->mem_cgroup = memcg;
>  
>       if (lrucare)
>               unlock_page_lru(page, isolated);
> @@ -2972,8 +2963,6 @@ __memcg_kmem_newpage_charge(gfp_t gfp, struct 
> mem_cgroup **_memcg, int order)
>  void __memcg_kmem_commit_charge(struct page *page, struct mem_cgroup *memcg,
>                             int order)
>  {
> -     struct page_cgroup *pc;
> -
>       VM_BUG_ON(mem_cgroup_is_root(memcg));
>  
>       /* The page allocation failed. Revert */
> @@ -2981,14 +2970,12 @@ void __memcg_kmem_commit_charge(struct page *page, 
> struct mem_cgroup *memcg,
>               memcg_uncharge_kmem(memcg, 1 << order);
>               return;
>       }
> -     pc = lookup_page_cgroup(page);
> -     pc->mem_cgroup = memcg;
> +     page->mem_cgroup = memcg;
>  }
>  
>  void __memcg_kmem_uncharge_pages(struct page *page, int order)
>  {
> -     struct page_cgroup *pc = lookup_page_cgroup(page);
> -     struct mem_cgroup *memcg = pc->mem_cgroup;
> +     struct mem_cgroup *memcg = page->mem_cgroup;
>  
>       if (!memcg)
>               return;
> @@ -2996,7 +2983,7 @@ void __memcg_kmem_uncharge_pages(struct page *page, int 
> order)
>       VM_BUG_ON_PAGE(mem_cgroup_is_root(memcg), page);
>  
>       memcg_uncharge_kmem(memcg, 1 << order);
> -     pc->mem_cgroup = NULL;
> +     page->mem_cgroup = NULL;
>  }
>  #else
>  static inline void memcg_unregister_all_caches(struct mem_cgroup *memcg)
> @@ -3014,16 +3001,15 @@ static inline void memcg_unregister_all_caches(struct 
> mem_cgroup *memcg)
>   */
>  void mem_cgroup_split_huge_fixup(struct page *head)
>  {
> -     struct page_cgroup *pc = lookup_page_cgroup(head);
>       int i;
>  
>       if (mem_cgroup_disabled())
>               return;
>  
>       for (i = 1; i < HPAGE_PMD_NR; i++)
> -             pc[i].mem_cgroup = pc[0].mem_cgroup;
> +             head[i].mem_cgroup = head->mem_cgroup;
>  
> -     __this_cpu_sub(pc[0].mem_cgroup->stat->count[MEM_CGROUP_STAT_RSS_HUGE],
> +     __this_cpu_sub(head->mem_cgroup->stat->count[MEM_CGROUP_STAT_RSS_HUGE],
>                      HPAGE_PMD_NR);
>  }
>  #endif /* CONFIG_TRANSPARENT_HUGEPAGE */
> @@ -3032,7 +3018,6 @@ void mem_cgroup_split_huge_fixup(struct page *head)
>   * mem_cgroup_move_account - move account of the page
>   * @page: the page
>   * @nr_pages: number of regular pages (>1 for huge pages)
> - * @pc:      page_cgroup of the page.
>   * @from: mem_cgroup which the page is moved from.
>   * @to:      mem_cgroup which the page is moved to. @from != @to.
>   *
> @@ -3045,7 +3030,6 @@ void mem_cgroup_split_huge_fixup(struct page *head)
>   */
>  static int mem_cgroup_move_account(struct page *page,
>                                  unsigned int nr_pages,
> -                                struct page_cgroup *pc,
>                                  struct mem_cgroup *from,
>                                  struct mem_cgroup *to)
>  {
> @@ -3065,7 +3049,7 @@ static int mem_cgroup_move_account(struct page *page,
>               goto out;
>  
>       /*
> -      * Prevent mem_cgroup_migrate() from looking at pc->mem_cgroup
> +      * Prevent mem_cgroup_migrate() from looking at page->mem_cgroup
>        * of its source page while we change it: page migration takes
>        * both pages off the LRU, but page cache replacement doesn't.
>        */
> @@ -3073,7 +3057,7 @@ static int mem_cgroup_move_account(struct page *page,
>               goto out;
>  
>       ret = -EINVAL;
> -     if (pc->mem_cgroup != from)
> +     if (page->mem_cgroup != from)
>               goto out_unlock;
>  
>       spin_lock_irqsave(&from->move_lock, flags);
> @@ -3093,13 +3077,13 @@ static int mem_cgroup_move_account(struct page *page,
>       }
>  
>       /*
> -      * It is safe to change pc->mem_cgroup here because the page
> +      * It is safe to change page->mem_cgroup here because the page
>        * is referenced, charged, and isolated - we can't race with
>        * uncharging, charging, migration, or LRU putback.
>        */
>  
>       /* caller should have done css_get */
> -     pc->mem_cgroup = to;
> +     page->mem_cgroup = to;
>       spin_unlock_irqrestore(&from->move_lock, flags);
>  
>       ret = 0;
> @@ -3174,36 +3158,17 @@ static inline int 
> mem_cgroup_move_swap_account(swp_entry_t entry,
>  #endif
>  
>  #ifdef CONFIG_DEBUG_VM
> -static struct page_cgroup *lookup_page_cgroup_used(struct page *page)
> -{
> -     struct page_cgroup *pc;
> -
> -     pc = lookup_page_cgroup(page);
> -     /*
> -      * Can be NULL while feeding pages into the page allocator for
> -      * the first time, i.e. during boot or memory hotplug;
> -      * or when mem_cgroup_disabled().
> -      */
> -     if (likely(pc) && pc->mem_cgroup)
> -             return pc;
> -     return NULL;
> -}
> -
>  bool mem_cgroup_bad_page_check(struct page *page)
>  {
>       if (mem_cgroup_disabled())
>               return false;
>  
> -     return lookup_page_cgroup_used(page) != NULL;
> +     return page->mem_cgroup != NULL;
>  }
>  
>  void mem_cgroup_print_bad_page(struct page *page)
>  {
> -     struct page_cgroup *pc;
> -
> -     pc = lookup_page_cgroup_used(page);
> -     if (pc)
> -             pr_alert("pc:%p pc->mem_cgroup:%p\n", pc, pc->mem_cgroup);
> +     pr_alert("page->mem_cgroup:%p\n", page->mem_cgroup);
>  }
>  #endif
>  
> @@ -5123,7 +5088,6 @@ static enum mc_target_type get_mctgt_type(struct 
> vm_area_struct *vma,
>               unsigned long addr, pte_t ptent, union mc_target *target)
>  {
>       struct page *page = NULL;
> -     struct page_cgroup *pc;
>       enum mc_target_type ret = MC_TARGET_NONE;
>       swp_entry_t ent = { .val = 0 };
>  
> @@ -5137,13 +5101,12 @@ static enum mc_target_type get_mctgt_type(struct 
> vm_area_struct *vma,
>       if (!page && !ent.val)
>               return ret;
>       if (page) {
> -             pc = lookup_page_cgroup(page);
>               /*
>                * Do only loose check w/o serialization.
> -              * mem_cgroup_move_account() checks the pc is valid or
> +              * mem_cgroup_move_account() checks the page is valid or
>                * not under LRU exclusion.
>                */
> -             if (pc->mem_cgroup == mc.from) {
> +             if (page->mem_cgroup == mc.from) {
>                       ret = MC_TARGET_PAGE;
>                       if (target)
>                               target->page = page;
> @@ -5171,15 +5134,13 @@ static enum mc_target_type get_mctgt_type_thp(struct 
> vm_area_struct *vma,
>               unsigned long addr, pmd_t pmd, union mc_target *target)
>  {
>       struct page *page = NULL;
> -     struct page_cgroup *pc;
>       enum mc_target_type ret = MC_TARGET_NONE;
>  
>       page = pmd_page(pmd);
>       VM_BUG_ON_PAGE(!page || !PageHead(page), page);
>       if (!move_anon())
>               return ret;
> -     pc = lookup_page_cgroup(page);
> -     if (pc->mem_cgroup == mc.from) {
> +     if (page->mem_cgroup == mc.from) {
>               ret = MC_TARGET_PAGE;
>               if (target) {
>                       get_page(page);
> @@ -5378,7 +5339,6 @@ static int mem_cgroup_move_charge_pte_range(pmd_t *pmd,
>       enum mc_target_type target_type;
>       union mc_target target;
>       struct page *page;
> -     struct page_cgroup *pc;
>  
>       /*
>        * We don't take compound_lock() here but no race with splitting thp
> @@ -5399,9 +5359,8 @@ static int mem_cgroup_move_charge_pte_range(pmd_t *pmd,
>               if (target_type == MC_TARGET_PAGE) {
>                       page = target.page;
>                       if (!isolate_lru_page(page)) {
> -                             pc = lookup_page_cgroup(page);
>                               if (!mem_cgroup_move_account(page, HPAGE_PMD_NR,
> -                                                     pc, mc.from, mc.to)) {
> +                                                          mc.from, mc.to)) {
>                                       mc.precharge -= HPAGE_PMD_NR;
>                                       mc.moved_charge += HPAGE_PMD_NR;
>                               }
> @@ -5429,9 +5388,7 @@ retry:
>                       page = target.page;
>                       if (isolate_lru_page(page))
>                               goto put;
> -                     pc = lookup_page_cgroup(page);
> -                     if (!mem_cgroup_move_account(page, 1, pc,
> -                                                  mc.from, mc.to)) {
> +                     if (!mem_cgroup_move_account(page, 1, mc.from, mc.to)) {
>                               mc.precharge--;
>                               /* we uncharge from mc.from later. */
>                               mc.moved_charge++;
> @@ -5623,7 +5580,6 @@ static void __init enable_swap_cgroup(void)
>  void mem_cgroup_swapout(struct page *page, swp_entry_t entry)
>  {
>       struct mem_cgroup *memcg;
> -     struct page_cgroup *pc;
>       unsigned short oldid;
>  
>       VM_BUG_ON_PAGE(PageLRU(page), page);
> @@ -5632,8 +5588,7 @@ void mem_cgroup_swapout(struct page *page, swp_entry_t 
> entry)
>       if (!do_swap_account)
>               return;
>  
> -     pc = lookup_page_cgroup(page);
> -     memcg = pc->mem_cgroup;
> +     memcg = page->mem_cgroup;
>  
>       /* Readahead page, never charged */
>       if (!memcg)
> @@ -5643,7 +5598,7 @@ void mem_cgroup_swapout(struct page *page, swp_entry_t 
> entry)
>       VM_BUG_ON_PAGE(oldid, page);
>       mem_cgroup_swap_statistics(memcg, true);
>  
> -     pc->mem_cgroup = NULL;
> +     page->mem_cgroup = NULL;
>  
>       if (!mem_cgroup_is_root(memcg))
>               page_counter_uncharge(&memcg->memory, 1);
> @@ -5710,7 +5665,6 @@ int mem_cgroup_try_charge(struct page *page, struct 
> mm_struct *mm,
>               goto out;
>  
>       if (PageSwapCache(page)) {
> -             struct page_cgroup *pc = lookup_page_cgroup(page);
>               /*
>                * Every swap fault against a single page tries to charge the
>                * page, bail as early as possible.  shmem_unuse() encounters
> @@ -5718,7 +5672,7 @@ int mem_cgroup_try_charge(struct page *page, struct 
> mm_struct *mm,
>                * the page lock, which serializes swap cache removal, which
>                * in turn serializes uncharging.
>                */
> -             if (pc->mem_cgroup)
> +             if (page->mem_cgroup)
>                       goto out;
>       }
>  
> @@ -5871,7 +5825,6 @@ static void uncharge_list(struct list_head *page_list)
>       next = page_list->next;
>       do {
>               unsigned int nr_pages = 1;
> -             struct page_cgroup *pc;
>  
>               page = list_entry(next, struct page, lru);
>               next = page->lru.next;
> @@ -5879,23 +5832,22 @@ static void uncharge_list(struct list_head *page_list)
>               VM_BUG_ON_PAGE(PageLRU(page), page);
>               VM_BUG_ON_PAGE(page_count(page), page);
>  
> -             pc = lookup_page_cgroup(page);
> -             if (!pc->mem_cgroup)
> +             if (!page->mem_cgroup)
>                       continue;
>  
>               /*
>                * Nobody should be changing or seriously looking at
> -              * pc->mem_cgroup at this point, we have fully
> +              * page->mem_cgroup at this point, we have fully
>                * exclusive access to the page.
>                */
>  
> -             if (memcg != pc->mem_cgroup) {
> +             if (memcg != page->mem_cgroup) {
>                       if (memcg) {
>                               uncharge_batch(memcg, pgpgout, nr_anon, nr_file,
>                                              nr_huge, page);
>                               pgpgout = nr_anon = nr_file = nr_huge = 0;
>                       }
> -                     memcg = pc->mem_cgroup;
> +                     memcg = page->mem_cgroup;
>               }
>  
>               if (PageTransHuge(page)) {
> @@ -5909,7 +5861,7 @@ static void uncharge_list(struct list_head *page_list)
>               else
>                       nr_file += nr_pages;
>  
> -             pc->mem_cgroup = NULL;
> +             page->mem_cgroup = NULL;
>  
>               pgpgout++;
>       } while (next != page_list);
> @@ -5928,14 +5880,11 @@ static void uncharge_list(struct list_head *page_list)
>   */
>  void mem_cgroup_uncharge(struct page *page)
>  {
> -     struct page_cgroup *pc;
> -
>       if (mem_cgroup_disabled())
>               return;
>  
>       /* Don't touch page->lru of any random page, pre-check: */
> -     pc = lookup_page_cgroup(page);
> -     if (!pc->mem_cgroup)
> +     if (!page->mem_cgroup)
>               return;
>  
>       INIT_LIST_HEAD(&page->lru);
> @@ -5972,7 +5921,6 @@ void mem_cgroup_migrate(struct page *oldpage, struct 
> page *newpage,
>                       bool lrucare)
>  {
>       struct mem_cgroup *memcg;
> -     struct page_cgroup *pc;
>       int isolated;
>  
>       VM_BUG_ON_PAGE(!PageLocked(oldpage), oldpage);
> @@ -5987,8 +5935,7 @@ void mem_cgroup_migrate(struct page *oldpage, struct 
> page *newpage,
>               return;
>  
>       /* Page cache replacement: new page already charged? */
> -     pc = lookup_page_cgroup(newpage);
> -     if (pc->mem_cgroup)
> +     if (newpage->mem_cgroup)
>               return;
>  
>       /*
> @@ -5997,15 +5944,14 @@ void mem_cgroup_migrate(struct page *oldpage, struct 
> page *newpage,
>        * uncharged page when the PFN walker finds a page that
>        * reclaim just put back on the LRU but has not released yet.
>        */
> -     pc = lookup_page_cgroup(oldpage);
> -     memcg = pc->mem_cgroup;
> +     memcg = oldpage->mem_cgroup;
>       if (!memcg)
>               return;
>  
>       if (lrucare)
>               lock_page_lru(oldpage, &isolated);
>  
> -     pc->mem_cgroup = NULL;
> +     oldpage->mem_cgroup = NULL;
>  
>       if (lrucare)
>               unlock_page_lru(oldpage, isolated);
> diff --git a/mm/page_alloc.c b/mm/page_alloc.c
> index fa9426389c6c..6a952237a677 100644
> --- a/mm/page_alloc.c
> +++ b/mm/page_alloc.c
> @@ -48,7 +48,6 @@
>  #include <linux/backing-dev.h>
>  #include <linux/fault-inject.h>
>  #include <linux/page-isolation.h>
> -#include <linux/page_cgroup.h>
>  #include <linux/debugobjects.h>
>  #include <linux/kmemleak.h>
>  #include <linux/compaction.h>
> @@ -4899,7 +4898,6 @@ static void __paginginit free_area_init_core(struct 
> pglist_data *pgdat,
>  #endif
>       init_waitqueue_head(&pgdat->kswapd_wait);
>       init_waitqueue_head(&pgdat->pfmemalloc_wait);
> -     pgdat_page_cgroup_init(pgdat);
>  
>       for (j = 0; j < MAX_NR_ZONES; j++) {
>               struct zone *zone = pgdat->node_zones + j;
> diff --git a/mm/page_cgroup.c b/mm/page_cgroup.c
> index 5331c2bd85a2..f0f31c1d4d0c 100644
> --- a/mm/page_cgroup.c
> +++ b/mm/page_cgroup.c
> @@ -1,326 +1,7 @@
>  #include <linux/mm.h>
> -#include <linux/mmzone.h>
> -#include <linux/bootmem.h>
> -#include <linux/bit_spinlock.h>
>  #include <linux/page_cgroup.h>
> -#include <linux/hash.h>
> -#include <linux/slab.h>
> -#include <linux/memory.h>
>  #include <linux/vmalloc.h>
> -#include <linux/cgroup.h>
>  #include <linux/swapops.h>
> -#include <linux/kmemleak.h>
> -
> -static unsigned long total_usage;
> -
> -#if !defined(CONFIG_SPARSEMEM)
> -
> -
> -void __meminit pgdat_page_cgroup_init(struct pglist_data *pgdat)
> -{
> -     pgdat->node_page_cgroup = NULL;
> -}
> -
> -struct page_cgroup *lookup_page_cgroup(struct page *page)
> -{
> -     unsigned long pfn = page_to_pfn(page);
> -     unsigned long offset;
> -     struct page_cgroup *base;
> -
> -     base = NODE_DATA(page_to_nid(page))->node_page_cgroup;
> -#ifdef CONFIG_DEBUG_VM
> -     /*
> -      * The sanity checks the page allocator does upon freeing a
> -      * page can reach here before the page_cgroup arrays are
> -      * allocated when feeding a range of pages to the allocator
> -      * for the first time during bootup or memory hotplug.
> -      */
> -     if (unlikely(!base))
> -             return NULL;
> -#endif
> -     offset = pfn - NODE_DATA(page_to_nid(page))->node_start_pfn;
> -     return base + offset;
> -}
> -
> -static int __init alloc_node_page_cgroup(int nid)
> -{
> -     struct page_cgroup *base;
> -     unsigned long table_size;
> -     unsigned long nr_pages;
> -
> -     nr_pages = NODE_DATA(nid)->node_spanned_pages;
> -     if (!nr_pages)
> -             return 0;
> -
> -     table_size = sizeof(struct page_cgroup) * nr_pages;
> -
> -     base = memblock_virt_alloc_try_nid_nopanic(
> -                     table_size, PAGE_SIZE, __pa(MAX_DMA_ADDRESS),
> -                     BOOTMEM_ALLOC_ACCESSIBLE, nid);
> -     if (!base)
> -             return -ENOMEM;
> -     NODE_DATA(nid)->node_page_cgroup = base;
> -     total_usage += table_size;
> -     return 0;
> -}
> -
> -void __init page_cgroup_init_flatmem(void)
> -{
> -
> -     int nid, fail;
> -
> -     if (mem_cgroup_disabled())
> -             return;
> -
> -     for_each_online_node(nid)  {
> -             fail = alloc_node_page_cgroup(nid);
> -             if (fail)
> -                     goto fail;
> -     }
> -     printk(KERN_INFO "allocated %ld bytes of page_cgroup\n", total_usage);
> -     printk(KERN_INFO "please try 'cgroup_disable=memory' option if you"
> -     " don't want memory cgroups\n");
> -     return;
> -fail:
> -     printk(KERN_CRIT "allocation of page_cgroup failed.\n");
> -     printk(KERN_CRIT "please try 'cgroup_disable=memory' boot option\n");
> -     panic("Out of memory");
> -}
> -
> -#else /* CONFIG_FLAT_NODE_MEM_MAP */
> -
> -struct page_cgroup *lookup_page_cgroup(struct page *page)
> -{
> -     unsigned long pfn = page_to_pfn(page);
> -     struct mem_section *section = __pfn_to_section(pfn);
> -#ifdef CONFIG_DEBUG_VM
> -     /*
> -      * The sanity checks the page allocator does upon freeing a
> -      * page can reach here before the page_cgroup arrays are
> -      * allocated when feeding a range of pages to the allocator
> -      * for the first time during bootup or memory hotplug.
> -      */
> -     if (!section->page_cgroup)
> -             return NULL;
> -#endif
> -     return section->page_cgroup + pfn;
> -}
> -
> -static void *__meminit alloc_page_cgroup(size_t size, int nid)
> -{
> -     gfp_t flags = GFP_KERNEL | __GFP_ZERO | __GFP_NOWARN;
> -     void *addr = NULL;
> -
> -     addr = alloc_pages_exact_nid(nid, size, flags);
> -     if (addr) {
> -             kmemleak_alloc(addr, size, 1, flags);
> -             return addr;
> -     }
> -
> -     if (node_state(nid, N_HIGH_MEMORY))
> -             addr = vzalloc_node(size, nid);
> -     else
> -             addr = vzalloc(size);
> -
> -     return addr;
> -}
> -
> -static int __meminit init_section_page_cgroup(unsigned long pfn, int nid)
> -{
> -     struct mem_section *section;
> -     struct page_cgroup *base;
> -     unsigned long table_size;
> -
> -     section = __pfn_to_section(pfn);
> -
> -     if (section->page_cgroup)
> -             return 0;
> -
> -     table_size = sizeof(struct page_cgroup) * PAGES_PER_SECTION;
> -     base = alloc_page_cgroup(table_size, nid);
> -
> -     /*
> -      * The value stored in section->page_cgroup is (base - pfn)
> -      * and it does not point to the memory block allocated above,
> -      * causing kmemleak false positives.
> -      */
> -     kmemleak_not_leak(base);
> -
> -     if (!base) {
> -             printk(KERN_ERR "page cgroup allocation failure\n");
> -             return -ENOMEM;
> -     }
> -
> -     /*
> -      * The passed "pfn" may not be aligned to SECTION.  For the calculation
> -      * we need to apply a mask.
> -      */
> -     pfn &= PAGE_SECTION_MASK;
> -     section->page_cgroup = base - pfn;
> -     total_usage += table_size;
> -     return 0;
> -}
> -#ifdef CONFIG_MEMORY_HOTPLUG
> -static void free_page_cgroup(void *addr)
> -{
> -     if (is_vmalloc_addr(addr)) {
> -             vfree(addr);
> -     } else {
> -             struct page *page = virt_to_page(addr);
> -             size_t table_size =
> -                     sizeof(struct page_cgroup) * PAGES_PER_SECTION;
> -
> -             BUG_ON(PageReserved(page));
> -             kmemleak_free(addr);
> -             free_pages_exact(addr, table_size);
> -     }
> -}
> -
> -static void __free_page_cgroup(unsigned long pfn)
> -{
> -     struct mem_section *ms;
> -     struct page_cgroup *base;
> -
> -     ms = __pfn_to_section(pfn);
> -     if (!ms || !ms->page_cgroup)
> -             return;
> -     base = ms->page_cgroup + pfn;
> -     free_page_cgroup(base);
> -     ms->page_cgroup = NULL;
> -}
> -
> -static int __meminit online_page_cgroup(unsigned long start_pfn,
> -                             unsigned long nr_pages,
> -                             int nid)
> -{
> -     unsigned long start, end, pfn;
> -     int fail = 0;
> -
> -     start = SECTION_ALIGN_DOWN(start_pfn);
> -     end = SECTION_ALIGN_UP(start_pfn + nr_pages);
> -
> -     if (nid == -1) {
> -             /*
> -              * In this case, "nid" already exists and contains valid memory.
> -              * "start_pfn" passed to us is a pfn which is an arg for
> -              * online__pages(), and start_pfn should exist.
> -              */
> -             nid = pfn_to_nid(start_pfn);
> -             VM_BUG_ON(!node_state(nid, N_ONLINE));
> -     }
> -
> -     for (pfn = start; !fail && pfn < end; pfn += PAGES_PER_SECTION) {
> -             if (!pfn_present(pfn))
> -                     continue;
> -             fail = init_section_page_cgroup(pfn, nid);
> -     }
> -     if (!fail)
> -             return 0;
> -
> -     /* rollback */
> -     for (pfn = start; pfn < end; pfn += PAGES_PER_SECTION)
> -             __free_page_cgroup(pfn);
> -
> -     return -ENOMEM;
> -}
> -
> -static int __meminit offline_page_cgroup(unsigned long start_pfn,
> -                             unsigned long nr_pages, int nid)
> -{
> -     unsigned long start, end, pfn;
> -
> -     start = SECTION_ALIGN_DOWN(start_pfn);
> -     end = SECTION_ALIGN_UP(start_pfn + nr_pages);
> -
> -     for (pfn = start; pfn < end; pfn += PAGES_PER_SECTION)
> -             __free_page_cgroup(pfn);
> -     return 0;
> -
> -}
> -
> -static int __meminit page_cgroup_callback(struct notifier_block *self,
> -                            unsigned long action, void *arg)
> -{
> -     struct memory_notify *mn = arg;
> -     int ret = 0;
> -     switch (action) {
> -     case MEM_GOING_ONLINE:
> -             ret = online_page_cgroup(mn->start_pfn,
> -                                mn->nr_pages, mn->status_change_nid);
> -             break;
> -     case MEM_OFFLINE:
> -             offline_page_cgroup(mn->start_pfn,
> -                             mn->nr_pages, mn->status_change_nid);
> -             break;
> -     case MEM_CANCEL_ONLINE:
> -             offline_page_cgroup(mn->start_pfn,
> -                             mn->nr_pages, mn->status_change_nid);
> -             break;
> -     case MEM_GOING_OFFLINE:
> -             break;
> -     case MEM_ONLINE:
> -     case MEM_CANCEL_OFFLINE:
> -             break;
> -     }
> -
> -     return notifier_from_errno(ret);
> -}
> -
> -#endif
> -
> -void __init page_cgroup_init(void)
> -{
> -     unsigned long pfn;
> -     int nid;
> -
> -     if (mem_cgroup_disabled())
> -             return;
> -
> -     for_each_node_state(nid, N_MEMORY) {
> -             unsigned long start_pfn, end_pfn;
> -
> -             start_pfn = node_start_pfn(nid);
> -             end_pfn = node_end_pfn(nid);
> -             /*
> -              * start_pfn and end_pfn may not be aligned to SECTION and the
> -              * page->flags of out of node pages are not initialized.  So we
> -              * scan [start_pfn, the biggest section's pfn < end_pfn) here.
> -              */
> -             for (pfn = start_pfn;
> -                  pfn < end_pfn;
> -                     pfn = ALIGN(pfn + 1, PAGES_PER_SECTION)) {
> -
> -                     if (!pfn_valid(pfn))
> -                             continue;
> -                     /*
> -                      * Nodes's pfns can be overlapping.
> -                      * We know some arch can have a nodes layout such as
> -                      * -------------pfn-------------->
> -                      * N0 | N1 | N2 | N0 | N1 | N2|....
> -                      */
> -                     if (pfn_to_nid(pfn) != nid)
> -                             continue;
> -                     if (init_section_page_cgroup(pfn, nid))
> -                             goto oom;
> -             }
> -     }
> -     hotplug_memory_notifier(page_cgroup_callback, 0);
> -     printk(KERN_INFO "allocated %ld bytes of page_cgroup\n", total_usage);
> -     printk(KERN_INFO "please try 'cgroup_disable=memory' option if you "
> -                      "don't want memory cgroups\n");
> -     return;
> -oom:
> -     printk(KERN_CRIT "try 'cgroup_disable=memory' boot option\n");
> -     panic("Out of memory");
> -}
> -
> -void __meminit pgdat_page_cgroup_init(struct pglist_data *pgdat)
> -{
> -     return;
> -}
> -
> -#endif
> -
>  
>  #ifdef CONFIG_MEMCG_SWAP
>  
> -- 
> 2.1.3
> 

-- 
Michal Hocko
SUSE Labs
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [patch 1/3] mm: embed the memcg pointer directly into struct page

Reply via email to