Andrea Arcangeli <aarca...@redhat.com> writes:

> Hello,
>
> On Tue, Jan 08, 2019 at 10:21:09AM +0530, Aneesh Kumar K.V wrote:
>> @@ -187,41 +149,25 @@ static long mm_iommu_do_alloc(struct mm_struct *mm, unsigned long ua,
>>              goto unlock_exit;
>>      }
>>  
>> +    ret = get_user_pages_cma_migrate(ua, entries, 1, mem->hpages);
>
> In terms of gup APIs, I've been wondering if this shall become
> get_user_pages_longterm(FOLL_CMA_MIGRATE). So basically moving this
> CMA migrate logic inside get_user_pages_longterm.

Do we need the FOLL_CMA_MIGRATE flag? Wouldn't a long-term pin already
imply a CMA migrate? What is the benefit of a separate FOLL_CMA_MIGRATE
flag? We can do better by collecting a list of pages for migration, and I
guess it is much simpler if we limit that migration logic to
get_user_pages_longterm().

I ended up with something like the diff below. Are you suggesting that we
should do the isolate_lru_page() and related steps under a FOLL_CMA_MIGRATE
flag, at the time we take the page reference, instead of iterating over the
page array in get_user_pages_longterm() as the diff below does?

diff --git a/mm/gup.c b/mm/gup.c
index 05acd7e2eb22..6e8152594e83 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -13,6 +13,9 @@
 #include <linux/sched/signal.h>
 #include <linux/rwsem.h>
 #include <linux/hugetlb.h>
+#include <linux/migrate.h>
+#include <linux/mm_inline.h>
+#include <linux/sched/mm.h>
 
 #include <asm/mmu_context.h>
 #include <asm/pgtable.h>
@@ -1126,7 +1129,167 @@ long get_user_pages(unsigned long start, unsigned long nr_pages,
 }
 EXPORT_SYMBOL(get_user_pages);
 
+#if defined(CONFIG_FS_DAX) || defined (CONFIG_CMA)
+
 #ifdef CONFIG_FS_DAX
+static bool check_dax_vmas(struct vm_area_struct **vmas, long nr_pages)
+{
+       long i;
+       struct vm_area_struct *vma_prev = NULL;
+
+       for (i = 0; i < nr_pages; i++) {
+               struct vm_area_struct *vma = vmas[i];
+
+               if (vma == vma_prev)
+                       continue;
+
+               vma_prev = vma;
+
+               if (vma_is_fsdax(vma))
+                       return true;
+       }
+       return false;
+}
+#else
+static inline bool check_dax_vmas(struct vm_area_struct **vmas, long nr_pages)
+{
+       return false;
+}
+#endif
+
+#ifdef CONFIG_CMA
+static struct page *new_non_cma_page(struct page *page, unsigned long private)
+{
+       /*
+        * We want to make sure we allocate the new page from the same node
+        * as the source page.
+        */
+       int nid = page_to_nid(page);
+       /*
+        * Trying to allocate a page for migration. Ignore allocation
+        * failure warnings. We don't force __GFP_THISNODE here because
+        * this node here is the node where we have CMA reservation and
+        * in some case these nodes will have really less non movable
+        * allocation memory.
+        */
+       gfp_t gfp_mask = GFP_USER | __GFP_NOWARN;
+
+       if (PageHighMem(page))
+               gfp_mask |= __GFP_HIGHMEM;
+
+#ifdef CONFIG_HUGETLB_PAGE
+       if (PageHuge(page)) {
+               struct hstate *h = page_hstate(page);
+               /*
+                * We don't want to dequeue from the pool because pool pages will
+                * mostly be from the CMA region.
+                */
+               return alloc_migrate_huge_page(h, gfp_mask, nid, NULL);
+       }
+#endif
+       if (PageTransHuge(page)) {
+               struct page *thp;
+               /*
+                * ignore allocation failure warnings
+                */
+               gfp_t thp_gfpmask = GFP_TRANSHUGE | __GFP_NOWARN;
+
+               /*
+                * Remove the movable mask so that we don't allocate from
+                * CMA area again.
+                */
+               thp_gfpmask &= ~__GFP_MOVABLE;
+               thp = __alloc_pages_node(nid, thp_gfpmask, HPAGE_PMD_ORDER);
+               if (!thp)
+                       return NULL;
+               prep_transhuge_page(thp);
+               return thp;
+       }
+
+       return __alloc_pages_node(nid, gfp_mask, 0);
+}
+
+static long check_and_migrate_cma_pages(unsigned long start, long nr_pages,
+                                       unsigned int gup_flags,
+                                       struct page **pages,
+                                       struct vm_area_struct **vmas)
+{
+       long i;
+       bool drain_allow = true;
+       bool migrate_allow = true;
+       LIST_HEAD(cma_page_list);
+
+check_again:
+       for (i = 0; i < nr_pages; i++) {
+               /*
+                * If we get a page from the CMA zone, since we are going to
+                * be pinning these entries, we might as well move them out
+                * of the CMA zone if possible.
+                */
+               if (is_migrate_cma_page(pages[i])) {
+
+                       struct page *head = compound_head(pages[i]);
+
+                       if (PageHuge(head)) {
+                               isolate_huge_page(head, &cma_page_list);
+                       } else {
+                               if (!PageLRU(head) && drain_allow) {
+                                       lru_add_drain_all();
+                                       drain_allow = false;
+                               }
+
+                               if (!isolate_lru_page(head)) {
+                                       list_add_tail(&head->lru, &cma_page_list);
+                                       mod_node_page_state(page_pgdat(head),
+                                                           NR_ISOLATED_ANON +
+                                                           page_is_file_cache(head),
+                                                           hpage_nr_pages(head));
+                               }
+                       }
+               }
+       }
+
+       if (!list_empty(&cma_page_list)) {
+               /*
+                * drop the above get_user_pages reference.
+                */
+               for (i = 0; i < nr_pages; i++)
+                       put_page(pages[i]);
+
+               if (migrate_pages(&cma_page_list, new_non_cma_page,
+                                 NULL, 0, MIGRATE_SYNC, MR_CONTIG_RANGE)) {
+                       /*
+                        * some of the pages failed migration. Do get_user_pages
+                        * without migration.
+                        */
+                       migrate_allow = false;
+
+                       if (!list_empty(&cma_page_list))
+                               putback_movable_pages(&cma_page_list);
+               }
+               /*
+                * We did migrate all the pages, Try to get the page references again
+                * migrating any new CMA pages which we failed to isolate earlier.
+                */
+               nr_pages = get_user_pages(start, nr_pages, gup_flags, pages, vmas);
+               if ((nr_pages > 0) && migrate_allow) {
+                       drain_allow = true;
+                       goto check_again;
+               }
+       }
+
+       return nr_pages;
+}
+#else
+static inline long check_and_migrate_cma_pages(unsigned long start, long nr_pages,
+                                              unsigned int gup_flags,
+                                              struct page **pages,
+                                              struct vm_area_struct **vmas)
+{
+       return nr_pages;
+}
+#endif
+
 /*
  * This is the same as get_user_pages() in that it assumes we are
  * operating on the current task's mm, but it goes further to validate
@@ -1140,11 +1303,11 @@ EXPORT_SYMBOL(get_user_pages);
  * Contrast this to iov_iter_get_pages() usages which are transient.
  */
 long get_user_pages_longterm(unsigned long start, unsigned long nr_pages,
-               unsigned int gup_flags, struct page **pages,
-               struct vm_area_struct **vmas_arg)
+                            unsigned int gup_flags, struct page **pages,
+                            struct vm_area_struct **vmas_arg)
 {
        struct vm_area_struct **vmas = vmas_arg;
-       struct vm_area_struct *vma_prev = NULL;
+       unsigned long flags;
        long rc, i;
 
        if (!pages)
@@ -1157,31 +1320,20 @@ long get_user_pages_longterm(unsigned long start, unsigned long nr_pages,
                        return -ENOMEM;
        }
 
+       flags = memalloc_nocma_save();
        rc = get_user_pages(start, nr_pages, gup_flags, pages, vmas);
+       memalloc_nocma_restore(flags);
+       if (rc < 0)
+               goto out;
 
-       for (i = 0; i < rc; i++) {
-               struct vm_area_struct *vma = vmas[i];
-
-               if (vma == vma_prev)
-                       continue;
-
-               vma_prev = vma;
-
-               if (vma_is_fsdax(vma))
-                       break;
-       }
-
-       /*
-        * Either get_user_pages() failed, or the vma validation
-        * succeeded, in either case we don't need to put_page() before
-        * returning.
-        */
-       if (i >= rc)
+       if (check_dax_vmas(vmas, rc)) {
+               for (i = 0; i < rc; i++)
+                       put_page(pages[i]);
+               rc = -EOPNOTSUPP;
                goto out;
+       }
 
-       for (i = 0; i < rc; i++)
-               put_page(pages[i]);
-       rc = -EOPNOTSUPP;
+       rc = check_and_migrate_cma_pages(start, rc, gup_flags, pages, vmas);
 out:
        if (vmas != vmas_arg)
                kfree(vmas);

Reply via email to