Hi Andrew, Mel and all,

Any comments on this V3 patch?
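
For reference, a minimal sketch of how a long-term pinning user (such as the
aio ring setup mentioned in the changelog) might call the new helper. The
surrounding context (info->mmap_base, nr_pages, the cleanup label) is made up
for illustration only; the actual aio conversion is a separate patch:

	down_read(&mm->mmap_sem);
	/* Pin from non-movable zones so a long-lived pin cannot block
	 * memory hot-remove.
	 */
	ret = get_user_pages_non_movable(current, mm, info->mmap_base,
					 nr_pages, 1 /* write */,
					 0 /* force */, pages, NULL);
	up_read(&mm->mmap_sem);
	if (ret < nr_pages)
		goto cleanup;	/* partial pin: release what we did get */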

thanks,
linfeng

On 02/21/2013 07:01 PM, Lin Feng wrote:
> get_user_pages() always tries to allocate pages from the movable zone, which
> is not reliable for the memory hot-remove framework in some cases.
> 
> This patch introduces a new library function called
> get_user_pages_non_movable() to pin pages only from non-movable zones.
> It is a wrapper of get_user_pages(), but it makes sure that all pages come
> from non-movable zones via additional page migration. If the migration fails,
> it at least keeps the base functionality of get_user_pages().
> 
> Cc: Andrew Morton <a...@linux-foundation.org>
> Cc: Mel Gorman <mgor...@suse.de>
> Cc: KAMEZAWA Hiroyuki <kamezawa.hir...@jp.fujitsu.com>
> Cc: Yasuaki Ishimatsu <isimatu.yasu...@jp.fujitsu.com>
> Cc: Jeff Moyer <jmo...@redhat.com>
> Cc: Minchan Kim <minc...@kernel.org>
> Cc: Zach Brown <z...@redhat.com>
> Reviewed-by: Tang Chen <tangc...@cn.fujitsu.com>
> Reviewed-by: Gu Zheng <guz.f...@cn.fujitsu.com>
> Signed-off-by: Lin Feng <linf...@cn.fujitsu.com>
> ---
>  include/linux/mm.h     |   14 ++++++
>  include/linux/mmzone.h |    4 ++
>  mm/memory.c            |  103 ++++++++++++++++++++++++++++++++++++++++++++++++
>  mm/page_isolation.c    |    8 ++++
>  4 files changed, 129 insertions(+), 0 deletions(-)
> 
> diff --git a/include/linux/mm.h b/include/linux/mm.h
> index 5625c1c..737dc39 100644
> --- a/include/linux/mm.h
> +++ b/include/linux/mm.h
> @@ -1025,6 +1025,20 @@ long get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
>                   struct vm_area_struct **vmas);
>  int get_user_pages_fast(unsigned long start, int nr_pages, int write,
>                       struct page **pages);
> +#ifdef CONFIG_MEMORY_HOTREMOVE
> +int get_user_pages_non_movable(struct task_struct *tsk, struct mm_struct *mm,
> +             unsigned long start, int nr_pages, int write, int force,
> +             struct page **pages, struct vm_area_struct **vmas);
> +#else
> +static inline
> +int get_user_pages_non_movable(struct task_struct *tsk, struct mm_struct *mm,
> +             unsigned long start, int nr_pages, int write, int force,
> +             struct page **pages, struct vm_area_struct **vmas)
> +{
> +     return get_user_pages(tsk, mm, start, nr_pages, write, force, pages,
> +                             vmas);
> +}
> +#endif
>  struct kvec;
>  int get_kernel_pages(const struct kvec *iov, int nr_pages, int write,
>                       struct page **pages);
> diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
> index ab20a60..c31007e 100644
> --- a/include/linux/mmzone.h
> +++ b/include/linux/mmzone.h
> @@ -851,6 +851,10 @@ static inline int is_normal_idx(enum zone_type idx)
>       return (idx == ZONE_NORMAL);
>  }
>  
> +static inline int zone_is_movable(struct zone *zone)
> +{
> +     return zone_idx(zone) == ZONE_MOVABLE;
> +}
>  /**
>   * is_highmem - helper function to quickly check if a struct zone is a 
>   *              highmem zone or not.  This is an attempt to keep references
> diff --git a/mm/memory.c b/mm/memory.c
> index 16ca5d0..83db7dd 100644
> --- a/mm/memory.c
> +++ b/mm/memory.c
> @@ -58,6 +58,8 @@
>  #include <linux/elf.h>
>  #include <linux/gfp.h>
>  #include <linux/migrate.h>
> +#include <linux/page-isolation.h>
> +#include <linux/mm_inline.h>
>  #include <linux/string.h>
>  
>  #include <asm/io.h>
> @@ -2014,6 +2016,107 @@ long get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
>  }
>  EXPORT_SYMBOL(get_user_pages);
>  
> +#ifdef CONFIG_MEMORY_HOTREMOVE
> +/**
> + * get_user_pages_non_movable() - pin user pages from non-movable zones
> + *
> + * It is a wrapper of get_user_pages(), but it makes sure that all returned
> + * pages come from non-movable zones via additional page migration. It is
> + * designed for the memory hot-remove framework.
> + *
> + * Currently get_user_pages() always tries to allocate pages from the movable
> + * zone, and some users of get_user_pages() can easily pin user pages for a
> + * long time (so far, pages pinned as aio ring pages are a known case), which
> + * is fatal for the memory hot-remove framework.
> + *
> + * This function first calls get_user_pages() to get the candidate pages, then
> + * checks that all of them come from non-movable zones. Otherwise it migrates
> + * the movable pages to non-movable zones and retries, at most once. If the
> + * migration fails, it keeps the base functionality of get_user_pages() and
> + * issues a WARN message for the memory hot-remove people.
> + *
> + * Fixme: the non-movable version of GUP does not support hugepages yet.
> + */
> +int get_user_pages_non_movable(struct task_struct *tsk, struct mm_struct *mm,
> +             unsigned long start, int nr_pages, int write, int force,
> +             struct page **pages, struct vm_area_struct **vmas)
> +{
> +     int ret, i, tried = 0;
> +     bool isolate_err, migrate_prepped;
> +     LIST_HEAD(pagelist);
> +
> +retry:
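> +     /* At most one migrate-and-retry pass: hitting this a third time
> +      * would mean migration keeps succeeding while get_user_pages()
> +      * still returns movable pages.
> +      */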
> +     BUG_ON(tried == 2);
> +     ret = get_user_pages(tsk, mm, start, nr_pages, write, force, pages,
> +                             vmas);
> +     /* No ZONE_MOVABLE populated, so all pages come from non-movable zones */
> +     if (movable_zone == ZONE_MOVABLE || ret <= 0)
> +             return ret;
> +
> +     isolate_err = false;
> +     migrate_prepped = false;
> +
> +     for (i = 0; i < ret; i++) {
> +             if (zone_is_movable(page_zone(pages[i]))) {
> +                     /* Fixme: improve for hugepage non-movable support */
> +                     if (PageHuge(pages[i])) {
> +                             WARN_ONCE(1, "Non-movable GUP for hugepages "
> +                                     "hasn't been implemented yet, it may "
> +                                     "lead to memory hot-remove failure.\n");
> +                             continue;
> +                     }
> +
> +                     /* The head page of a hugepage/THP covers its tail pages */
> +                     if (PageTail(pages[i]) && (page_count(pages[i]) == 1))
> +                             continue;
> +
> +                     if (!migrate_prepped) {
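> +                             /* migrate_prep() drains the per-cpu LRU caches
> +                              * so the pages can be isolated below; it always
> +                              * returns 0 at present.
> +                              */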
> +                             BUG_ON(migrate_prep());
> +                             migrate_prepped = true;
> +                     }
> +
> +                     /* Fixme: isolate_lru_page() takes the LRU lock every
> +                      * time, batching the lock could avoid potential lock
> +                      * contention problems. -Mel Gorman
> +                      */
> +                     if (!isolate_lru_page(pages[i])) {
> +                             inc_zone_page_state(pages[i], NR_ISOLATED_ANON +
> +                                              page_is_file_cache(pages[i]));
> +                             list_add(&pages[i]->lru, &pagelist);
> +                     } else {
> +                             isolate_err = true;
> +                             break;
> +                     }
> +             }
> +     }
> +
> +     /* All pages are in non-movable zones; we are done :) */
> +     if (i == ret && list_empty(&pagelist))
> +             return ret;
> +
> +     /* Undo the effects of former get_user_pages(), ready for another try */
> +     release_pages(pages, ret, 1);
> +
> +     if (!isolate_err) {
> +             ret = migrate_pages(&pagelist, alloc_migrate_target, 1,
> +                                     MIGRATE_SYNC, MR_SYSCALL);
> +             /* Stole pages from the non-movable zones successfully? */
> +             if (!ret) {
> +                     tried++;
> +                     goto retry;
> +             }
> +     }
> +
> +     putback_lru_pages(&pagelist);
> +     /* Migration failed; to keep at least the base functionality of
> +      * get_user_pages(), we pin the pages again but issue a WARN to
> +      * remind the memory hot-remove people, which is a trade-off.
> +      */
> +     WARN_ONCE(1, "Non-movable zone migration failed, "
> +             "it may lead to memory hot-remove failure.\n");
> +     return get_user_pages(tsk, mm, start, nr_pages, write, force, pages,
> +                             vmas);
> +}
> +EXPORT_SYMBOL(get_user_pages_non_movable);
> +#endif
>  /**
>   * get_dump_page() - pin user page in memory while writing it to core dump
>   * @addr: user address
> diff --git a/mm/page_isolation.c b/mm/page_isolation.c
> index 383bdbb..7823ea5 100644
> --- a/mm/page_isolation.c
> +++ b/mm/page_isolation.c
> @@ -247,6 +247,9 @@ int test_pages_isolated(unsigned long start_pfn, unsigned long end_pfn,
>       return ret ? 0 : -EBUSY;
>  }
>  
> +/**
> + * @private: 0 means the page may be allocated from the movable zone, otherwise forbidden
> + */
>  struct page *alloc_migrate_target(struct page *page, unsigned long private,
>                                 int **resultp)
>  {
> @@ -254,6 +257,11 @@ struct page *alloc_migrate_target(struct page *page, unsigned long private,
>  
>       if (PageHighMem(page))
>               gfp_mask |= __GFP_HIGHMEM;
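> +     /*
> +      * Clearing __GFP_HIGHMEM below cannot give a correct migration
> +      * target for highmem pages, so reject a kernel that enables both
> +      * memory hot-remove and highmem at build time.
> +      */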
> +#if defined(CONFIG_MEMORY_HOTREMOVE) && defined(CONFIG_HIGHMEM)
> +     BUILD_BUG_ON(1);
> +#endif
> +     if (unlikely(private != 0))
> +             gfp_mask &= ~__GFP_HIGHMEM;
>  
>       return alloc_page(gfp_mask);
>  }
> 