On Tue, 12 Aug 2008 21:35:33 +0900 (JST)
Ryo Tsuruta <[EMAIL PROTECTED]> wrote:

> This patch splits the cgroup memory subsystem into two parts.
> One is for tracking pages to find out the owners. The other is
> for controlling how much amount of memory should be assigned to
> each cgroup.
> 
> With this patch, you can use the page tracking mechanism even if
> the memory subsystem is off.
> 

I'm now writing remove-lock-page-cgroup patches. They work well.
Please wait for a while...

Thanks,
-Kame


> Based on 2.6.27-rc1-mm1
> Signed-off-by: Ryo Tsuruta <[EMAIL PROTECTED]>
> Signed-off-by: Hirokazu Takahashi <[EMAIL PROTECTED]>
> 
> diff -Ndupr linux-2.6.27-rc1-mm1.ioband/include/linux/memcontrol.h 
> linux-2.6.27-rc1-mm1.cg0/include/linux/memcontrol.h
> --- linux-2.6.27-rc1-mm1.ioband/include/linux/memcontrol.h    2008-08-12 
> 14:30:19.000000000 +0900
> +++ linux-2.6.27-rc1-mm1.cg0/include/linux/memcontrol.h       2008-08-12 
> 14:47:11.000000000 +0900
> @@ -20,12 +20,62 @@
>  #ifndef _LINUX_MEMCONTROL_H
>  #define _LINUX_MEMCONTROL_H
>  
> +#include <linux/rcupdate.h>
> +#include <linux/mm.h>
> +#include <linux/smp.h>
> +#include <linux/bit_spinlock.h>
> +
>  struct mem_cgroup;
>  struct page_cgroup;
>  struct page;
>  struct mm_struct;
>  
> +#ifdef CONFIG_CGROUP_PAGE
> +/*
> + * We use the lower bit of the page->page_cgroup pointer as a bit spin
> + * lock.  We need to ensure that page->page_cgroup is at least two
> + * byte aligned (based on comments from Nick Piggin).  But since
> + * bit_spin_lock doesn't actually set that lock bit in a non-debug
> + * uniprocessor kernel, we should avoid setting it here too.
> + */
> +#define PAGE_CGROUP_LOCK_BIT    0x0
> +#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK)
> +#define PAGE_CGROUP_LOCK        (1 << PAGE_CGROUP_LOCK_BIT)
> +#else
> +#define PAGE_CGROUP_LOCK        0x0
> +#endif
> +
> +/*
> + * A page_cgroup page is associated with every page descriptor. The
> + * page_cgroup helps us identify information about the cgroup
> + */
> +struct page_cgroup {
>  #ifdef CONFIG_CGROUP_MEM_RES_CTLR
> +     struct list_head lru;           /* per cgroup LRU list */
> +     struct mem_cgroup *mem_cgroup;
> +#endif /* CONFIG_CGROUP_MEM_RES_CTLR */
> +     struct page *page;
> +     int flags;
> +};
> +#define PAGE_CGROUP_FLAG_CACHE       (0x1)   /* charged as cache */
> +#define PAGE_CGROUP_FLAG_ACTIVE      (0x2)   /* page is active in this 
> cgroup */
> +#define PAGE_CGROUP_FLAG_FILE        (0x4)   /* page is file system backed */
> +#define PAGE_CGROUP_FLAG_UNEVICTABLE (0x8)   /* page is unevictableable */
> +
> +static inline void lock_page_cgroup(struct page *page)
> +{
> +     bit_spin_lock(PAGE_CGROUP_LOCK_BIT, &page->page_cgroup);
> +}
> +
> +static inline int try_lock_page_cgroup(struct page *page)
> +{
> +     return bit_spin_trylock(PAGE_CGROUP_LOCK_BIT, &page->page_cgroup);
> +}
> +
> +static inline void unlock_page_cgroup(struct page *page)
> +{
> +     bit_spin_unlock(PAGE_CGROUP_LOCK_BIT, &page->page_cgroup);
> +}
>  
>  #define page_reset_bad_cgroup(page)  ((page)->page_cgroup = 0)
>  
> @@ -34,45 +84,15 @@ extern int mem_cgroup_charge(struct page
>                               gfp_t gfp_mask);
>  extern int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm,
>                                       gfp_t gfp_mask);
> -extern void mem_cgroup_move_lists(struct page *page, enum lru_list lru);
>  extern void mem_cgroup_uncharge_page(struct page *page);
>  extern void mem_cgroup_uncharge_cache_page(struct page *page);
> -extern int mem_cgroup_shrink_usage(struct mm_struct *mm, gfp_t gfp_mask);
> -
> -extern unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan,
> -                                     struct list_head *dst,
> -                                     unsigned long *scanned, int order,
> -                                     int mode, struct zone *z,
> -                                     struct mem_cgroup *mem_cont,
> -                                     int active, int file);
> -extern void mem_cgroup_out_of_memory(struct mem_cgroup *mem, gfp_t gfp_mask);
> -int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup 
> *mem);
> -
> -extern struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p);
> -
> -#define mm_match_cgroup(mm, cgroup)  \
> -     ((cgroup) == mem_cgroup_from_task((mm)->owner))
>  
>  extern int
>  mem_cgroup_prepare_migration(struct page *page, struct page *newpage);
>  extern void mem_cgroup_end_migration(struct page *page);
> +extern void page_cgroup_init(void);
>  
> -/*
> - * For memory reclaim.
> - */
> -extern int mem_cgroup_calc_mapped_ratio(struct mem_cgroup *mem);
> -extern long mem_cgroup_reclaim_imbalance(struct mem_cgroup *mem);
> -
> -extern int mem_cgroup_get_reclaim_priority(struct mem_cgroup *mem);
> -extern void mem_cgroup_note_reclaim_priority(struct mem_cgroup *mem,
> -                                                     int priority);
> -extern void mem_cgroup_record_reclaim_priority(struct mem_cgroup *mem,
> -                                                     int priority);
> -
> -extern long mem_cgroup_calc_reclaim(struct mem_cgroup *mem, struct zone 
> *zone,
> -                                     int priority, enum lru_list lru);
> -
> -#else /* CONFIG_CGROUP_MEM_RES_CTLR */
> +#else /* CONFIG_CGROUP_PAGE */
>  static inline void page_reset_bad_cgroup(struct page *page)
>  {
>  }
> @@ -102,6 +122,53 @@ static inline void mem_cgroup_uncharge_c
>  {
>  }
>  
> +static inline int
> +mem_cgroup_prepare_migration(struct page *page, struct page *newpage)
> +{
> +     return 0;
> +}
> +
> +static inline void mem_cgroup_end_migration(struct page *page)
> +{
> +}
> +#endif /* CONFIG_CGROUP_PAGE */
> +
> +#ifdef CONFIG_CGROUP_MEM_RES_CTLR
> +
> +extern void mem_cgroup_move_lists(struct page *page, enum lru_list lru);
> +extern int mem_cgroup_shrink_usage(struct mm_struct *mm, gfp_t gfp_mask);
> +
> +extern unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan,
> +                                     struct list_head *dst,
> +                                     unsigned long *scanned, int order,
> +                                     int mode, struct zone *z,
> +                                     struct mem_cgroup *mem_cont,
> +                                     int active, int file);
> +extern void mem_cgroup_out_of_memory(struct mem_cgroup *mem, gfp_t gfp_mask);
> +int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup 
> *mem);
> +
> +extern struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p);
> +
> +#define mm_match_cgroup(mm, cgroup)  \
> +     ((cgroup) == mem_cgroup_from_task((mm)->owner))
> +
> +/*
> + * For memory reclaim.
> + */
> +extern int mem_cgroup_calc_mapped_ratio(struct mem_cgroup *mem);
> +extern long mem_cgroup_reclaim_imbalance(struct mem_cgroup *mem);
> +
> +extern int mem_cgroup_get_reclaim_priority(struct mem_cgroup *mem);
> +extern void mem_cgroup_note_reclaim_priority(struct mem_cgroup *mem,
> +                                                     int priority);
> +extern void mem_cgroup_record_reclaim_priority(struct mem_cgroup *mem,
> +                                                     int priority);
> +
> +extern long mem_cgroup_calc_reclaim(struct mem_cgroup *mem, struct zone 
> *zone,
> +                                     int priority, enum lru_list lru);
> +
> +#else /* CONFIG_CGROUP_MEM_RES_CTLR */
> +
>  static inline int mem_cgroup_shrink_usage(struct mm_struct *mm, gfp_t 
> gfp_mask)
>  {
>       return 0;
> @@ -122,16 +189,6 @@ static inline int task_in_mem_cgroup(str
>       return 1;
>  }
>  
> -static inline int
> -mem_cgroup_prepare_migration(struct page *page, struct page *newpage)
> -{
> -     return 0;
> -}
> -
> -static inline void mem_cgroup_end_migration(struct page *page)
> -{
> -}
> -
>  static inline int mem_cgroup_calc_mapped_ratio(struct mem_cgroup *mem)
>  {
>       return 0;
> @@ -163,7 +220,8 @@ static inline long mem_cgroup_calc_recla
>  {
>       return 0;
>  }
> -#endif /* CONFIG_CGROUP_MEM_CONT */
> +
> +#endif /* CONFIG_CGROUP_MEM_RES_CTLR */
>  
>  #endif /* _LINUX_MEMCONTROL_H */
>  
> diff -Ndupr linux-2.6.27-rc1-mm1.ioband/include/linux/mm_types.h 
> linux-2.6.27-rc1-mm1.cg0/include/linux/mm_types.h
> --- linux-2.6.27-rc1-mm1.ioband/include/linux/mm_types.h      2008-08-12 
> 14:30:19.000000000 +0900
> +++ linux-2.6.27-rc1-mm1.cg0/include/linux/mm_types.h 2008-08-12 
> 14:47:11.000000000 +0900
> @@ -92,7 +92,7 @@ struct page {
>       void *virtual;                  /* Kernel virtual address (NULL if
>                                          not kmapped, ie. highmem) */
>  #endif /* WANT_PAGE_VIRTUAL */
> -#ifdef CONFIG_CGROUP_MEM_RES_CTLR
> +#ifdef CONFIG_CGROUP_PAGE
>       unsigned long page_cgroup;
>  #endif
>  
> diff -Ndupr linux-2.6.27-rc1-mm1.ioband/init/Kconfig 
> linux-2.6.27-rc1-mm1.cg0/init/Kconfig
> --- linux-2.6.27-rc1-mm1.ioband/init/Kconfig  2008-08-12 14:30:19.000000000 
> +0900
> +++ linux-2.6.27-rc1-mm1.cg0/init/Kconfig     2008-08-12 14:47:11.000000000 
> +0900
> @@ -418,6 +418,10 @@ config CGROUP_MEMRLIMIT_CTLR
>         memory RSS and Page Cache control. Virtual address space control
>         is provided by this controller.
>  
> +config CGROUP_PAGE
> +     def_bool y
> +     depends on CGROUP_MEM_RES_CTLR
> +
>  config SYSFS_DEPRECATED
>       bool
>  
> diff -Ndupr linux-2.6.27-rc1-mm1.ioband/mm/Makefile 
> linux-2.6.27-rc1-mm1.cg0/mm/Makefile
> --- linux-2.6.27-rc1-mm1.ioband/mm/Makefile   2008-08-12 14:30:19.000000000 
> +0900
> +++ linux-2.6.27-rc1-mm1.cg0/mm/Makefile      2008-08-12 14:47:11.000000000 
> +0900
> @@ -34,5 +34,5 @@ obj-$(CONFIG_FS_XIP) += filemap_xip.o
>  obj-$(CONFIG_MIGRATION) += migrate.o
>  obj-$(CONFIG_SMP) += allocpercpu.o
>  obj-$(CONFIG_QUICKLIST) += quicklist.o
> -obj-$(CONFIG_CGROUP_MEM_RES_CTLR) += memcontrol.o
> +obj-$(CONFIG_CGROUP_PAGE) += memcontrol.o
>  obj-$(CONFIG_CGROUP_MEMRLIMIT_CTLR) += memrlimitcgroup.o
> diff -Ndupr linux-2.6.27-rc1-mm1.ioband/mm/memcontrol.c 
> linux-2.6.27-rc1-mm1.cg0/mm/memcontrol.c
> --- linux-2.6.27-rc1-mm1.ioband/mm/memcontrol.c       2008-08-12 
> 14:30:19.000000000 +0900
> +++ linux-2.6.27-rc1-mm1.cg0/mm/memcontrol.c  2008-08-12 14:47:11.000000000 
> +0900
> @@ -36,10 +36,25 @@
>  
>  #include <asm/uaccess.h>
>  
> -struct cgroup_subsys mem_cgroup_subsys __read_mostly;
> +enum charge_type {
> +     MEM_CGROUP_CHARGE_TYPE_CACHE = 0,
> +     MEM_CGROUP_CHARGE_TYPE_MAPPED,
> +     MEM_CGROUP_CHARGE_TYPE_FORCE,   /* used by force_empty */
> +};
> +
> +static void __mem_cgroup_uncharge_common(struct page *, enum charge_type);
> +
>  static struct kmem_cache *page_cgroup_cache __read_mostly;
> +
> +#ifdef CONFIG_CGROUP_MEM_RES_CTLR
> +struct cgroup_subsys mem_cgroup_subsys __read_mostly;
>  #define MEM_CGROUP_RECLAIM_RETRIES   5
>  
> +static inline int mem_cgroup_disabled(void)
> +{
> +     return mem_cgroup_subsys.disabled;
> +}
> +
>  /*
>   * Statistics for memory cgroup.
>   */
> @@ -136,35 +151,6 @@ struct mem_cgroup {
>  };
>  static struct mem_cgroup init_mem_cgroup;
>  
> -/*
> - * We use the lower bit of the page->page_cgroup pointer as a bit spin
> - * lock.  We need to ensure that page->page_cgroup is at least two
> - * byte aligned (based on comments from Nick Piggin).  But since
> - * bit_spin_lock doesn't actually set that lock bit in a non-debug
> - * uniprocessor kernel, we should avoid setting it here too.
> - */
> -#define PAGE_CGROUP_LOCK_BIT         0x0
> -#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK)
> -#define PAGE_CGROUP_LOCK     (1 << PAGE_CGROUP_LOCK_BIT)
> -#else
> -#define PAGE_CGROUP_LOCK     0x0
> -#endif
> -
> -/*
> - * A page_cgroup page is associated with every page descriptor. The
> - * page_cgroup helps us identify information about the cgroup
> - */
> -struct page_cgroup {
> -     struct list_head lru;           /* per cgroup LRU list */
> -     struct page *page;
> -     struct mem_cgroup *mem_cgroup;
> -     int flags;
> -};
> -#define PAGE_CGROUP_FLAG_CACHE          (0x1)        /* charged as cache */
> -#define PAGE_CGROUP_FLAG_ACTIVE    (0x2)     /* page is active in this 
> cgroup */
> -#define PAGE_CGROUP_FLAG_FILE           (0x4)        /* page is file system 
> backed */
> -#define PAGE_CGROUP_FLAG_UNEVICTABLE (0x8)   /* page is unevictableable */
> -
>  static int page_cgroup_nid(struct page_cgroup *pc)
>  {
>       return page_to_nid(pc->page);
> @@ -175,12 +161,6 @@ static enum zone_type page_cgroup_zid(st
>       return page_zonenum(pc->page);
>  }
>  
> -enum charge_type {
> -     MEM_CGROUP_CHARGE_TYPE_CACHE = 0,
> -     MEM_CGROUP_CHARGE_TYPE_MAPPED,
> -     MEM_CGROUP_CHARGE_TYPE_FORCE,   /* used by force_empty */
> -};
> -
>  /*
>   * Always modified under lru lock. Then, not necessary to preempt_disable()
>   */
> @@ -248,37 +228,6 @@ struct mem_cgroup *mem_cgroup_from_task(
>                               struct mem_cgroup, css);
>  }
>  
> -static inline int page_cgroup_locked(struct page *page)
> -{
> -     return bit_spin_is_locked(PAGE_CGROUP_LOCK_BIT, &page->page_cgroup);
> -}
> -
> -static void page_assign_page_cgroup(struct page *page, struct page_cgroup 
> *pc)
> -{
> -     VM_BUG_ON(!page_cgroup_locked(page));
> -     page->page_cgroup = ((unsigned long)pc | PAGE_CGROUP_LOCK);
> -}
> -
> -struct page_cgroup *page_get_page_cgroup(struct page *page)
> -{
> -     return (struct page_cgroup *) (page->page_cgroup & ~PAGE_CGROUP_LOCK);
> -}
> -
> -static void lock_page_cgroup(struct page *page)
> -{
> -     bit_spin_lock(PAGE_CGROUP_LOCK_BIT, &page->page_cgroup);
> -}
> -
> -static int try_lock_page_cgroup(struct page *page)
> -{
> -     return bit_spin_trylock(PAGE_CGROUP_LOCK_BIT, &page->page_cgroup);
> -}
> -
> -static void unlock_page_cgroup(struct page *page)
> -{
> -     bit_spin_unlock(PAGE_CGROUP_LOCK_BIT, &page->page_cgroup);
> -}
> -
>  static void __mem_cgroup_remove_list(struct mem_cgroup_per_zone *mz,
>                       struct page_cgroup *pc)
>  {
> @@ -367,7 +316,7 @@ void mem_cgroup_move_lists(struct page *
>       struct mem_cgroup_per_zone *mz;
>       unsigned long flags;
>  
> -     if (mem_cgroup_subsys.disabled)
> +     if (mem_cgroup_disabled())
>               return;
>  
>       /*
> @@ -506,273 +455,6 @@ unsigned long mem_cgroup_isolate_pages(u
>  }
>  
>  /*
> - * Charge the memory controller for page usage.
> - * Return
> - * 0 if the charge was successful
> - * < 0 if the cgroup is over its limit
> - */
> -static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm,
> -                             gfp_t gfp_mask, enum charge_type ctype,
> -                             struct mem_cgroup *memcg)
> -{
> -     struct mem_cgroup *mem;
> -     struct page_cgroup *pc;
> -     unsigned long flags;
> -     unsigned long nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
> -     struct mem_cgroup_per_zone *mz;
> -
> -     pc = kmem_cache_alloc(page_cgroup_cache, gfp_mask);
> -     if (unlikely(pc == NULL))
> -             goto err;
> -
> -     /*
> -      * We always charge the cgroup the mm_struct belongs to.
> -      * The mm_struct's mem_cgroup changes on task migration if the
> -      * thread group leader migrates. It's possible that mm is not
> -      * set, if so charge the init_mm (happens for pagecache usage).
> -      */
> -     if (likely(!memcg)) {
> -             rcu_read_lock();
> -             mem = mem_cgroup_from_task(rcu_dereference(mm->owner));
> -             /*
> -              * For every charge from the cgroup, increment reference count
> -              */
> -             css_get(&mem->css);
> -             rcu_read_unlock();
> -     } else {
> -             mem = memcg;
> -             css_get(&memcg->css);
> -     }
> -
> -     while (res_counter_charge(&mem->res, PAGE_SIZE)) {
> -             if (!(gfp_mask & __GFP_WAIT))
> -                     goto out;
> -
> -             if (try_to_free_mem_cgroup_pages(mem, gfp_mask))
> -                     continue;
> -
> -             /*
> -              * try_to_free_mem_cgroup_pages() might not give us a full
> -              * picture of reclaim. Some pages are reclaimed and might be
> -              * moved to swap cache or just unmapped from the cgroup.
> -              * Check the limit again to see if the reclaim reduced the
> -              * current usage of the cgroup before giving up
> -              */
> -             if (res_counter_check_under_limit(&mem->res))
> -                     continue;
> -
> -             if (!nr_retries--) {
> -                     mem_cgroup_out_of_memory(mem, gfp_mask);
> -                     goto out;
> -             }
> -     }
> -
> -     pc->mem_cgroup = mem;
> -     pc->page = page;
> -     /*
> -      * If a page is accounted as a page cache, insert to inactive list.
> -      * If anon, insert to active list.
> -      */
> -     if (ctype == MEM_CGROUP_CHARGE_TYPE_CACHE) {
> -             pc->flags = PAGE_CGROUP_FLAG_CACHE;
> -             if (page_is_file_cache(page))
> -                     pc->flags |= PAGE_CGROUP_FLAG_FILE;
> -             else
> -                     pc->flags |= PAGE_CGROUP_FLAG_ACTIVE;
> -     } else
> -             pc->flags = PAGE_CGROUP_FLAG_ACTIVE;
> -
> -     lock_page_cgroup(page);
> -     if (unlikely(page_get_page_cgroup(page))) {
> -             unlock_page_cgroup(page);
> -             res_counter_uncharge(&mem->res, PAGE_SIZE);
> -             css_put(&mem->css);
> -             kmem_cache_free(page_cgroup_cache, pc);
> -             goto done;
> -     }
> -     page_assign_page_cgroup(page, pc);
> -
> -     mz = page_cgroup_zoneinfo(pc);
> -     spin_lock_irqsave(&mz->lru_lock, flags);
> -     __mem_cgroup_add_list(mz, pc);
> -     spin_unlock_irqrestore(&mz->lru_lock, flags);
> -
> -     unlock_page_cgroup(page);
> -done:
> -     return 0;
> -out:
> -     css_put(&mem->css);
> -     kmem_cache_free(page_cgroup_cache, pc);
> -err:
> -     return -ENOMEM;
> -}
> -
> -int mem_cgroup_charge(struct page *page, struct mm_struct *mm, gfp_t 
> gfp_mask)
> -{
> -     if (mem_cgroup_subsys.disabled)
> -             return 0;
> -
> -     /*
> -      * If already mapped, we don't have to account.
> -      * If page cache, page->mapping has address_space.
> -      * But page->mapping may have out-of-use anon_vma pointer,
> -      * detecit it by PageAnon() check. newly-mapped-anon's page->mapping
> -      * is NULL.
> -      */
> -     if (page_mapped(page) || (page->mapping && !PageAnon(page)))
> -             return 0;
> -     if (unlikely(!mm))
> -             mm = &init_mm;
> -     return mem_cgroup_charge_common(page, mm, gfp_mask,
> -                             MEM_CGROUP_CHARGE_TYPE_MAPPED, NULL);
> -}
> -
> -int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm,
> -                             gfp_t gfp_mask)
> -{
> -     if (mem_cgroup_subsys.disabled)
> -             return 0;
> -
> -     /*
> -      * Corner case handling. This is called from add_to_page_cache()
> -      * in usual. But some FS (shmem) precharges this page before calling it
> -      * and call add_to_page_cache() with GFP_NOWAIT.
> -      *
> -      * For GFP_NOWAIT case, the page may be pre-charged before calling
> -      * add_to_page_cache(). (See shmem.c) check it here and avoid to call
> -      * charge twice. (It works but has to pay a bit larger cost.)
> -      */
> -     if (!(gfp_mask & __GFP_WAIT)) {
> -             struct page_cgroup *pc;
> -
> -             lock_page_cgroup(page);
> -             pc = page_get_page_cgroup(page);
> -             if (pc) {
> -                     VM_BUG_ON(pc->page != page);
> -                     VM_BUG_ON(!pc->mem_cgroup);
> -                     unlock_page_cgroup(page);
> -                     return 0;
> -             }
> -             unlock_page_cgroup(page);
> -     }
> -
> -     if (unlikely(!mm))
> -             mm = &init_mm;
> -
> -     return mem_cgroup_charge_common(page, mm, gfp_mask,
> -                             MEM_CGROUP_CHARGE_TYPE_CACHE, NULL);
> -}
> -
> -/*
> - * uncharge if !page_mapped(page)
> - */
> -static void
> -__mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
> -{
> -     struct page_cgroup *pc;
> -     struct mem_cgroup *mem;
> -     struct mem_cgroup_per_zone *mz;
> -     unsigned long flags;
> -
> -     if (mem_cgroup_subsys.disabled)
> -             return;
> -
> -     /*
> -      * Check if our page_cgroup is valid
> -      */
> -     lock_page_cgroup(page);
> -     pc = page_get_page_cgroup(page);
> -     if (unlikely(!pc))
> -             goto unlock;
> -
> -     VM_BUG_ON(pc->page != page);
> -
> -     if ((ctype == MEM_CGROUP_CHARGE_TYPE_MAPPED)
> -         && ((pc->flags & PAGE_CGROUP_FLAG_CACHE)
> -             || page_mapped(page)))
> -             goto unlock;
> -
> -     mz = page_cgroup_zoneinfo(pc);
> -     spin_lock_irqsave(&mz->lru_lock, flags);
> -     __mem_cgroup_remove_list(mz, pc);
> -     spin_unlock_irqrestore(&mz->lru_lock, flags);
> -
> -     page_assign_page_cgroup(page, NULL);
> -     unlock_page_cgroup(page);
> -
> -     mem = pc->mem_cgroup;
> -     res_counter_uncharge(&mem->res, PAGE_SIZE);
> -     css_put(&mem->css);
> -
> -     kmem_cache_free(page_cgroup_cache, pc);
> -     return;
> -unlock:
> -     unlock_page_cgroup(page);
> -}
> -
> -void mem_cgroup_uncharge_page(struct page *page)
> -{
> -     __mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_MAPPED);
> -}
> -
> -void mem_cgroup_uncharge_cache_page(struct page *page)
> -{
> -     VM_BUG_ON(page_mapped(page));
> -     __mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_CACHE);
> -}
> -
> -/*
> - * Before starting migration, account against new page.
> - */
> -int mem_cgroup_prepare_migration(struct page *page, struct page *newpage)
> -{
> -     struct page_cgroup *pc;
> -     struct mem_cgroup *mem = NULL;
> -     enum charge_type ctype = MEM_CGROUP_CHARGE_TYPE_MAPPED;
> -     int ret = 0;
> -
> -     if (mem_cgroup_subsys.disabled)
> -             return 0;
> -
> -     lock_page_cgroup(page);
> -     pc = page_get_page_cgroup(page);
> -     if (pc) {
> -             mem = pc->mem_cgroup;
> -             css_get(&mem->css);
> -             if (pc->flags & PAGE_CGROUP_FLAG_CACHE)
> -                     ctype = MEM_CGROUP_CHARGE_TYPE_CACHE;
> -     }
> -     unlock_page_cgroup(page);
> -     if (mem) {
> -             ret = mem_cgroup_charge_common(newpage, NULL, GFP_KERNEL,
> -                     ctype, mem);
> -             css_put(&mem->css);
> -     }
> -     return ret;
> -}
> -
> -/* remove redundant charge if migration failed*/
> -void mem_cgroup_end_migration(struct page *newpage)
> -{
> -     /*
> -      * At success, page->mapping is not NULL.
> -      * special rollback care is necessary when
> -      * 1. at migration failure. (newpage->mapping is cleared in this case)
> -      * 2. the newpage was moved but not remapped again because the task
> -      *    exits and the newpage is obsolete. In this case, the new page
> -      *    may be a swapcache. So, we just call mem_cgroup_uncharge_page()
> -      *    always for avoiding mess. The  page_cgroup will be removed if
> -      *    unnecessary. File cache pages is still on radix-tree. Don't
> -      *    care it.
> -      */
> -     if (!newpage->mapping)
> -             __mem_cgroup_uncharge_common(newpage,
> -                                      MEM_CGROUP_CHARGE_TYPE_FORCE);
> -     else if (PageAnon(newpage))
> -             mem_cgroup_uncharge_page(newpage);
> -}
> -
> -/*
>   * A call to try to shrink memory usage under specified resource controller.
>   * This is typically used for page reclaiming for shmem for reducing side
>   * effect of page allocation from shmem, which is used by some mem_cgroup.
> @@ -783,7 +465,7 @@ int mem_cgroup_shrink_usage(struct mm_st
>       int progress = 0;
>       int retry = MEM_CGROUP_RECLAIM_RETRIES;
>  
> -     if (mem_cgroup_subsys.disabled)
> +     if (mem_cgroup_disabled())
>               return 0;
>  
>       rcu_read_lock();
> @@ -1104,7 +786,7 @@ mem_cgroup_create(struct cgroup_subsys *
>  
>       if (unlikely((cont->parent) == NULL)) {
>               mem = &init_mem_cgroup;
> -             page_cgroup_cache = KMEM_CACHE(page_cgroup, SLAB_PANIC);
> +             page_cgroup_init();
>       } else {
>               mem = mem_cgroup_alloc();
>               if (!mem)
> @@ -1188,3 +870,325 @@ struct cgroup_subsys mem_cgroup_subsys =
>       .attach = mem_cgroup_move_task,
>       .early_init = 0,
>  };
> +
> +#else /* CONFIG_CGROUP_MEM_RES_CTLR */
> +
> +static inline int mem_cgroup_disabled(void)
> +{
> +     return 1;
> +}
> +#endif /* CONFIG_CGROUP_MEM_RES_CTLR */
> +
> +static inline int page_cgroup_locked(struct page *page)
> +{
> +     return bit_spin_is_locked(PAGE_CGROUP_LOCK_BIT, &page->page_cgroup);
> +}
> +
> +static void page_assign_page_cgroup(struct page *page, struct page_cgroup 
> *pc)
> +{
> +     VM_BUG_ON(!page_cgroup_locked(page));
> +     page->page_cgroup = ((unsigned long)pc | PAGE_CGROUP_LOCK);
> +}
> +
> +struct page_cgroup *page_get_page_cgroup(struct page *page)
> +{
> +     return (struct page_cgroup *) (page->page_cgroup & ~PAGE_CGROUP_LOCK);
> +}
> +
> +/*
> + * Charge the memory controller for page usage.
> + * Return
> + * 0 if the charge was successful
> + * < 0 if the cgroup is over its limit
> + */
> +static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm,
> +                             gfp_t gfp_mask, enum charge_type ctype,
> +                             struct mem_cgroup *memcg)
> +{
> +     struct page_cgroup *pc;
> +#ifdef CONFIG_CGROUP_MEM_RES_CTLR
> +     struct mem_cgroup *mem;
> +     unsigned long flags;
> +     unsigned long nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
> +     struct mem_cgroup_per_zone *mz;
> +#endif /* CONFIG_CGROUP_MEM_RES_CTLR */
> +
> +     pc = kmem_cache_alloc(page_cgroup_cache, gfp_mask);
> +     if (unlikely(pc == NULL))
> +             goto err;
> +
> +     /*
> +      * We always charge the cgroup the mm_struct belongs to.
> +      * The mm_struct's mem_cgroup changes on task migration if the
> +      * thread group leader migrates. It's possible that mm is not
> +      * set, if so charge the init_mm (happens for pagecache usage).
> +      */
> +     if (likely(!memcg)) {
> +             rcu_read_lock();
> +#ifdef CONFIG_CGROUP_MEM_RES_CTLR
> +             mem = mem_cgroup_from_task(rcu_dereference(mm->owner));
> +             /*
> +              * For every charge from the cgroup, increment reference count
> +              */
> +             css_get(&mem->css);
> +#endif /* CONFIG_CGROUP_MEM_RES_CTLR */
> +             rcu_read_unlock();
> +     } else {
> +#ifdef CONFIG_CGROUP_MEM_RES_CTLR
> +             mem = memcg;
> +             css_get(&memcg->css);
> +#endif /* CONFIG_CGROUP_MEM_RES_CTLR */
> +     }
> +
> +#ifdef CONFIG_CGROUP_MEM_RES_CTLR
> +     while (res_counter_charge(&mem->res, PAGE_SIZE)) {
> +             if (!(gfp_mask & __GFP_WAIT))
> +                     goto out;
> +
> +             if (try_to_free_mem_cgroup_pages(mem, gfp_mask))
> +                     continue;
> +
> +             /*
> +              * try_to_free_mem_cgroup_pages() might not give us a full
> +              * picture of reclaim. Some pages are reclaimed and might be
> +              * moved to swap cache or just unmapped from the cgroup.
> +              * Check the limit again to see if the reclaim reduced the
> +              * current usage of the cgroup before giving up
> +              */
> +             if (res_counter_check_under_limit(&mem->res))
> +                     continue;
> +
> +             if (!nr_retries--) {
> +                     mem_cgroup_out_of_memory(mem, gfp_mask);
> +                     goto out;
> +             }
> +     }
> +     pc->mem_cgroup = mem;
> +#endif /* CONFIG_CGROUP_MEM_RES_CTLR */
> +
> +     pc->page = page;
> +     /*
> +      * If a page is accounted as a page cache, insert to inactive list.
> +      * If anon, insert to active list.
> +      */
> +     if (ctype == MEM_CGROUP_CHARGE_TYPE_CACHE) {
> +             pc->flags = PAGE_CGROUP_FLAG_CACHE;
> +             if (page_is_file_cache(page))
> +                     pc->flags |= PAGE_CGROUP_FLAG_FILE;
> +             else
> +                     pc->flags |= PAGE_CGROUP_FLAG_ACTIVE;
> +     } else
> +             pc->flags = PAGE_CGROUP_FLAG_ACTIVE;
> +
> +     lock_page_cgroup(page);
> +     if (unlikely(page_get_page_cgroup(page))) {
> +             unlock_page_cgroup(page);
> +#ifdef CONFIG_CGROUP_MEM_RES_CTLR
> +             res_counter_uncharge(&mem->res, PAGE_SIZE);
> +             css_put(&mem->css);
> +#endif /* CONFIG_CGROUP_MEM_RES_CTLR */
> +             kmem_cache_free(page_cgroup_cache, pc);
> +             goto done;
> +     }
> +     page_assign_page_cgroup(page, pc);
> +
> +#ifdef CONFIG_CGROUP_MEM_RES_CTLR
> +     mz = page_cgroup_zoneinfo(pc);
> +     spin_lock_irqsave(&mz->lru_lock, flags);
> +     __mem_cgroup_add_list(mz, pc);
> +     spin_unlock_irqrestore(&mz->lru_lock, flags);
> +#endif /* CONFIG_CGROUP_MEM_RES_CTLR */
> +
> +     unlock_page_cgroup(page);
> +done:
> +     return 0;
> +out:
> +#ifdef CONFIG_CGROUP_MEM_RES_CTLR
> +     css_put(&mem->css);
> +#endif /* CONFIG_CGROUP_MEM_RES_CTLR */
> +     kmem_cache_free(page_cgroup_cache, pc);
> +err:
> +     return -ENOMEM;
> +}
> +
> +int mem_cgroup_charge(struct page *page, struct mm_struct *mm, gfp_t 
> gfp_mask)
> +{
> +     if (mem_cgroup_disabled())
> +             return 0;
> +
> +     /*
> +      * If already mapped, we don't have to account.
> +      * If page cache, page->mapping has address_space.
> +      * But page->mapping may have out-of-use anon_vma pointer,
> +      * detect it by PageAnon() check. newly-mapped-anon's page->mapping
> +      * is NULL.
> +      */
> +     if (page_mapped(page) || (page->mapping && !PageAnon(page)))
> +             return 0;
> +     if (unlikely(!mm))
> +             mm = &init_mm;
> +     return mem_cgroup_charge_common(page, mm, gfp_mask,
> +                             MEM_CGROUP_CHARGE_TYPE_MAPPED, NULL);
> +}
> +
> +int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm,
> +                             gfp_t gfp_mask)
> +{
> +     if (mem_cgroup_disabled())
> +             return 0;
> +
> +     /*
> +      * Corner case handling. This is called from add_to_page_cache()
> +      * in usual. But some FS (shmem) precharges this page before calling it
> +      * and call add_to_page_cache() with GFP_NOWAIT.
> +      *
> +      * For GFP_NOWAIT case, the page may be pre-charged before calling
> +      * add_to_page_cache(). (See shmem.c) check it here and avoid to call
> +      * charge twice. (It works but has to pay a bit larger cost.)
> +      */
> +     if (!(gfp_mask & __GFP_WAIT)) {
> +             struct page_cgroup *pc;
> +
> +             lock_page_cgroup(page);
> +             pc = page_get_page_cgroup(page);
> +             if (pc) {
> +                     VM_BUG_ON(pc->page != page);
> +                     VM_BUG_ON(!pc->mem_cgroup);
> +                     unlock_page_cgroup(page);
> +                     return 0;
> +             }
> +             unlock_page_cgroup(page);
> +     }
> +
> +     if (unlikely(!mm))
> +             mm = &init_mm;
> +
> +     return mem_cgroup_charge_common(page, mm, gfp_mask,
> +                             MEM_CGROUP_CHARGE_TYPE_CACHE, NULL);
> +}
> +
> +/*
> + * uncharge if !page_mapped(page)
> + */
> +static void
> +__mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
> +{
> +     struct page_cgroup *pc;
> +#ifdef CONFIG_CGROUP_MEM_RES_CTLR
> +     struct mem_cgroup *mem;
> +     struct mem_cgroup_per_zone *mz;
> +     unsigned long flags;
> +#endif /* CONFIG_CGROUP_MEM_RES_CTLR */
> +
> +     if (mem_cgroup_disabled())
> +             return;
> +
> +     /*
> +      * Check if our page_cgroup is valid
> +      */
> +     lock_page_cgroup(page);
> +     pc = page_get_page_cgroup(page);
> +     if (unlikely(!pc))
> +             goto unlock;
> +
> +     VM_BUG_ON(pc->page != page);
> +
> +     if ((ctype == MEM_CGROUP_CHARGE_TYPE_MAPPED)
> +         && ((pc->flags & PAGE_CGROUP_FLAG_CACHE)
> +             || page_mapped(page)
> +             || PageSwapCache(page)))
> +             goto unlock;
> +
> +#ifdef CONFIG_CGROUP_MEM_RES_CTLR
> +     mz = page_cgroup_zoneinfo(pc);
> +     spin_lock_irqsave(&mz->lru_lock, flags);
> +     __mem_cgroup_remove_list(mz, pc);
> +     spin_unlock_irqrestore(&mz->lru_lock, flags);
> +#endif /* CONFIG_CGROUP_MEM_RES_CTLR */
> +
> +     page_assign_page_cgroup(page, NULL);
> +     unlock_page_cgroup(page);
> +
> +#ifdef CONFIG_CGROUP_MEM_RES_CTLR
> +     mem = pc->mem_cgroup;
> +     res_counter_uncharge(&mem->res, PAGE_SIZE);
> +     css_put(&mem->css);
> +#endif /* CONFIG_CGROUP_MEM_RES_CTLR */
> +
> +     kmem_cache_free(page_cgroup_cache, pc);
> +     return;
> +unlock:
> +     unlock_page_cgroup(page);
> +}
> +
> +void mem_cgroup_uncharge_page(struct page *page)
> +{
> +     __mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_MAPPED);
> +}
> +
> +void mem_cgroup_uncharge_cache_page(struct page *page)
> +{
> +     VM_BUG_ON(page_mapped(page));
> +     __mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_CACHE);
> +}
> +
> +/*
> + * Before starting migration, account against new page.
> + */
> +int mem_cgroup_prepare_migration(struct page *page, struct page *newpage)
> +{
> +     struct page_cgroup *pc;
> +     struct mem_cgroup *mem = NULL;
> +     enum charge_type ctype = MEM_CGROUP_CHARGE_TYPE_MAPPED;
> +     int ret = 0;
> +
> +     if (mem_cgroup_disabled())
> +             return 0;
> +
> +     lock_page_cgroup(page);
> +     pc = page_get_page_cgroup(page);
> +     if (pc) {
> +#ifdef CONFIG_CGROUP_MEM_RES_CTLR
> +             mem = pc->mem_cgroup;
> +             css_get(&mem->css);
> +#endif /* CONFIG_CGROUP_MEM_RES_CTLR */
> +             if (pc->flags & PAGE_CGROUP_FLAG_CACHE)
> +                     ctype = MEM_CGROUP_CHARGE_TYPE_CACHE;
> +     }
> +     unlock_page_cgroup(page);
> +     if (mem) {
> +             ret = mem_cgroup_charge_common(newpage, NULL, GFP_KERNEL,
> +                     ctype, mem);
> +#ifdef CONFIG_CGROUP_MEM_RES_CTLR
> +             css_put(&mem->css);
> +#endif /* CONFIG_CGROUP_MEM_RES_CTLR */
> +     }
> +     return ret;
> +}
> +
> +/* remove redundant charge if migration failed*/
> +void mem_cgroup_end_migration(struct page *newpage)
> +{
> +     /*
> +      * At success, page->mapping is not NULL.
> +      * special rollback care is necessary when
> +      * 1. at migration failure. (newpage->mapping is cleared in this case)
> +      * 2. the newpage was moved but not remapped again because the task
> +      *    exits and the newpage is obsolete. In this case, the new page
> +      *    may be a swapcache. So, we just call mem_cgroup_uncharge_page()
> +      *    always for avoiding mess. The  page_cgroup will be removed if
> +      *    unnecessary. File cache pages are still on the radix-tree. Don't
> +      *    care it.
> +      */
> +     if (!newpage->mapping)
> +             __mem_cgroup_uncharge_common(newpage,
> +                                      MEM_CGROUP_CHARGE_TYPE_FORCE);
> +     else if (PageAnon(newpage))
> +             mem_cgroup_uncharge_page(newpage);
> +}
> +
> +void page_cgroup_init()
> +{
> +     if (!page_cgroup_cache)
> +             page_cgroup_cache = KMEM_CACHE(page_cgroup, SLAB_PANIC);
> +}

_______________________________________________
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linux-foundation.org/mailman/listinfo/virtualization

Reply via email to