On 12/8/20 9:29 AM, Joao Martins wrote:
Similar to follow_hugetlb_page(), add a follow_devmap_page() which,
rather than calling follow_page() per 4K page in a PMD/PUD, does so for
the entire PMD/PUD: we lock the pmd/pud, get all the pages, then unlock.

While doing so, we only change the refcount once per huge page when
PGMAP_COMPOUND is passed in.
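
To make the refcount batching concrete, here is a rough sketch (not the
patch's code; only try_grab_page() and try_grab_compound_head() are real)
of what GUP ends up doing for one write-mapped 2M PMD (512 subpages):

	/* before: one atomic op per 4K subpage, all hitting the head page */
	for (i = 0; i < 512; i++)
		if (WARN_ON_ONCE(!try_grab_page(pages[i], flags)))
			return -ENOMEM;

	/* after, with PGMAP_COMPOUND: a single batched reference grab */
	if (WARN_ON_ONCE(!try_grab_compound_head(head, 512, flags)))
		return -ENOMEM;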

This lets us improve {pin,get}_user_pages{,_longterm}() considerably:

$ gup_benchmark -f /dev/dax0.2 -m 16384 -r 10 -S [-U,-b,-L] -n 512 -w

(<test>) [before] -> [after]
(get_user_pages 2M pages) ~150k us -> ~8.9k us
(pin_user_pages 2M pages) ~192k us -> ~9k us
(pin_user_pages_longterm 2M pages) ~200k us -> ~19k us


Yes, this is a massive improvement.


Signed-off-by: Joao Martins <joao.m.mart...@oracle.com>
---
I've special-cased this to device-dax VMAs, given their page size
guarantees similar to hugetlbfs's, but I feel this is a bit wrong. I am
replicating follow_hugetlb_page(), and this RFC ought to seek feedback on
whether this should be generalized, if no fundamental issues exist. In
that case, should I change follow_page_mask() to take either an array of
pages, or a function pointer and opaque arguments that would let the
caller pick its own structure?
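
For instance, the function-pointer variant could look something like this
(hypothetical sketch, names made up; not part of this patch):

	/*
	 * The caller supplies a callback invoked once per (sub)page, so
	 * it can fill a struct page array, a pfn array, or whatever else
	 * it likes.
	 */
	typedef void (*gup_record_t)(struct page *page, unsigned long i,
				     void *data);

	long follow_page_range(struct mm_struct *mm,
			       struct vm_area_struct *vma,
			       unsigned long *position,
			       unsigned long *nr_pages,
			       long i, unsigned int flags, int *locked,
			       gup_record_t record, void *data);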

I don't know which approach is better because I haven't yet attempted to
find the common elements in these routines. But if there is *any way* to
avoid this copy-paste creation of yet more following of pages, then it's
*very* good to do.
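
Just eyeballing it, the fault-retry logic looks like the obvious common
core. Something along these lines (a very rough, untested sketch of a
shared helper, mirroring the flag translation both routines already do)
might be factor-out-able:

	static vm_fault_t gup_fault_huge(struct vm_area_struct *vma,
					 unsigned long vaddr,
					 unsigned int flags, int *locked)
	{
		unsigned int fault_flags = 0;

		if (flags & FOLL_WRITE)
			fault_flags |= FAULT_FLAG_WRITE;
		if (locked)
			fault_flags |= FAULT_FLAG_ALLOW_RETRY |
				       FAULT_FLAG_KILLABLE;
		if (flags & FOLL_NOWAIT)
			fault_flags |= FAULT_FLAG_ALLOW_RETRY |
				       FAULT_FLAG_RETRY_NOWAIT;
		if (flags & FOLL_TRIED)
			fault_flags |= FAULT_FLAG_TRIED;

		return handle_mm_fault(vma, vaddr, fault_flags, NULL);
	}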

thanks,
--
John Hubbard
NVIDIA

---
  include/linux/huge_mm.h |   4 +
  include/linux/mm.h      |   2 +
  mm/gup.c                |  22 ++++-
  mm/huge_memory.c        | 202 ++++++++++++++++++++++++++++++++++++++++
  4 files changed, 227 insertions(+), 3 deletions(-)

diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
index 0365aa97f8e7..da87ecea19e6 100644
--- a/include/linux/huge_mm.h
+++ b/include/linux/huge_mm.h
@@ -293,6 +293,10 @@ struct page *follow_devmap_pmd(struct vm_area_struct *vma, unsigned long addr,
                pmd_t *pmd, int flags, struct dev_pagemap **pgmap);
  struct page *follow_devmap_pud(struct vm_area_struct *vma, unsigned long addr,
                pud_t *pud, int flags, struct dev_pagemap **pgmap);
+long follow_devmap_page(struct mm_struct *mm, struct vm_area_struct *vma,
+                       struct page **pages, struct vm_area_struct **vmas,
+                       unsigned long *position, unsigned long *nr_pages,
+                       long i, unsigned int flags, int *locked);
  extern vm_fault_t do_huge_pmd_numa_page(struct vm_fault *vmf, pmd_t orig_pmd);
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 8b0155441835..466c88679628 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1164,6 +1164,8 @@ static inline void get_page(struct page *page)
        page_ref_inc(page);
  }
+__maybe_unused struct page *try_grab_compound_head(struct page *page, int refs,
+                                                  unsigned int flags);
  bool __must_check try_grab_page(struct page *page, unsigned int flags);
static inline __must_check bool try_get_page(struct page *page)
diff --git a/mm/gup.c b/mm/gup.c
index 3a9a7229f418..50effb9cc349 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -78,7 +78,7 @@ static inline struct page *try_get_compound_head(struct page *page, int refs)
   * considered failure, and furthermore, a likely bug in the caller, so a warning
   * is also emitted.
   */
-static __maybe_unused struct page *try_grab_compound_head(struct page *page,
+__maybe_unused struct page *try_grab_compound_head(struct page *page,
                                                          int refs,
                                                          unsigned int flags)
  {
@@ -880,8 +880,8 @@ static int get_gate_page(struct mm_struct *mm, unsigned long address,
   * does not include FOLL_NOWAIT, the mmap_lock may be released.  If it
   * is, *@locked will be set to 0 and -EBUSY returned.
   */
-static int faultin_page(struct vm_area_struct *vma,
-               unsigned long address, unsigned int *flags, int *locked)
+int faultin_page(struct vm_area_struct *vma,
+                unsigned long address, unsigned int *flags, int *locked)
  {
        unsigned int fault_flags = 0;
        vm_fault_t ret;
@@ -1103,6 +1103,22 @@ static long __get_user_pages(struct mm_struct *mm,
                                }
                                continue;
                        }
+                       if (vma_is_dax(vma)) {
+                               i = follow_devmap_page(mm, vma, pages, vmas,
+                                                      &start, &nr_pages, i,
+                                                      gup_flags, locked);
+                               if (locked && *locked == 0) {
+                                       /*
+                                        * We've got a VM_FAULT_RETRY
+                                        * and we've lost mmap_lock.
+                                        * We must stop here.
+                                        */
+                                       BUG_ON(gup_flags & FOLL_NOWAIT);
+                                       BUG_ON(ret != 0);
+                                       goto out;
+                               }
+                               continue;
+                       }
                }
  retry:
                /*
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index ec2bb93f7431..20bfbf211dc3 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -1168,6 +1168,208 @@ struct page *follow_devmap_pud(struct vm_area_struct *vma, unsigned long addr,
        return page;
  }
+long follow_devmap_page(struct mm_struct *mm, struct vm_area_struct *vma,
+                       struct page **pages, struct vm_area_struct **vmas,
+                       unsigned long *position, unsigned long *nr_pages,
+                       long i, unsigned int flags, int *locked)
+{
+       unsigned long pfn_offset;
+       unsigned long vaddr = *position;
+       unsigned long remainder = *nr_pages;
+       unsigned long align = vma_kernel_pagesize(vma);
+       unsigned long align_nr_pages = align >> PAGE_SHIFT;
+       unsigned long mask = ~(align-1);
+       unsigned long nr_pages_hpage = 0;
+       struct dev_pagemap *pgmap = NULL;
+       int err = -EFAULT;
+
+       if (align == PAGE_SIZE)
+               return i;
+
+       while (vaddr < vma->vm_end && remainder) {
+               pte_t *pte;
+               spinlock_t *ptl = NULL;
+               int absent;
+               struct page *page;
+
+               /*
+                * If we have a pending SIGKILL, don't keep faulting pages and
+                * potentially allocating memory.
+                */
+               if (fatal_signal_pending(current)) {
+                       remainder = 0;
+                       break;
+               }
+
+               /*
+                * Some archs (sparc64, sh*) have multiple pte_ts to
+                * each hugepage.  We have to make sure we get the
+                * first, for the page indexing below to work.
+                *
+                * Note that page table lock is not held when pte is null.
+                */
+               pte = huge_pte_offset(mm, vaddr & mask, align);
+               if (pte) {
+                       if (align == PMD_SIZE)
+                               ptl = pmd_lockptr(mm, (pmd_t *) pte);
+                       else if (align == PUD_SIZE)
+                               ptl = pud_lockptr(mm, (pud_t *) pte);
+                       spin_lock(ptl);
+               }
+               absent = !pte || pte_none(ptep_get(pte));
+
+               if (absent && (flags & FOLL_DUMP)) {
+                       if (pte)
+                               spin_unlock(ptl);
+                       remainder = 0;
+                       break;
+               }
+
+               if (absent ||
+                   ((flags & FOLL_WRITE) &&
+                     !pte_write(ptep_get(pte)))) {
+                       vm_fault_t ret;
+                       unsigned int fault_flags = 0;
+
+                       if (pte)
+                               spin_unlock(ptl);
+                       if (flags & FOLL_WRITE)
+                               fault_flags |= FAULT_FLAG_WRITE;
+                       if (locked)
+                               fault_flags |= FAULT_FLAG_ALLOW_RETRY |
+                                       FAULT_FLAG_KILLABLE;
+                       if (flags & FOLL_NOWAIT)
+                               fault_flags |= FAULT_FLAG_ALLOW_RETRY |
+                                       FAULT_FLAG_RETRY_NOWAIT;
+                       if (flags & FOLL_TRIED) {
+                               /*
+                                * Note: FAULT_FLAG_ALLOW_RETRY and
+                                * FAULT_FLAG_TRIED can co-exist
+                                */
+                               fault_flags |= FAULT_FLAG_TRIED;
+                       }
+                       ret = handle_mm_fault(vma, vaddr, fault_flags, NULL);
+                       if (ret & VM_FAULT_ERROR) {
+                               err = vm_fault_to_errno(ret, flags);
+                               remainder = 0;
+                               break;
+                       }
+                       if (ret & VM_FAULT_RETRY) {
+                               if (locked &&
+                                   !(fault_flags & FAULT_FLAG_RETRY_NOWAIT))
+                                       *locked = 0;
+                               *nr_pages = 0;
+                               /*
+                                * VM_FAULT_RETRY must not return an
+                                * error, it will return zero
+                                * instead.
+                                *
+                                * No need to update "position" as the
+                                * caller will not check it after
+                                * *nr_pages is set to 0.
+                                */
+                               return i;
+                       }
+                       continue;
+               }
+
+               pfn_offset = (vaddr & ~mask) >> PAGE_SHIFT;
+               page = pte_page(ptep_get(pte));
+
+               pgmap = get_dev_pagemap(page_to_pfn(page), pgmap);
+               if (!pgmap) {
+                       spin_unlock(ptl);
+                       remainder = 0;
+                       err = -EFAULT;
+                       break;
+               }
+
+               /*
+                * If subpage information not requested, update counters
+                * and skip the same_page loop below.
+                */
+               if (!pages && !vmas && !pfn_offset &&
+                   (vaddr + align < vma->vm_end) &&
+                   (remainder >= (align_nr_pages))) {
+                       vaddr += align;
+                       remainder -= align_nr_pages;
+                       i += align_nr_pages;
+                       spin_unlock(ptl);
+                       continue;
+               }
+
+               nr_pages_hpage = 0;
+
+same_page:
+               if (pages) {
+                       pages[i] = mem_map_offset(page, pfn_offset);
+
+                       /*
+                        * try_grab_page() should always succeed here, because:
+                        * a) we hold the ptl lock, and b) we've just checked
+                        * that the huge page is present in the page tables.
+                        */
+                       if (!(pgmap->flags & PGMAP_COMPOUND) &&
+                           WARN_ON_ONCE(!try_grab_page(pages[i], flags))) {
+                               spin_unlock(ptl);
+                               remainder = 0;
+                               err = -ENOMEM;
+                               break;
+                       }
+
+               }
+
+               if (vmas)
+                       vmas[i] = vma;
+
+               vaddr += PAGE_SIZE;
+               ++pfn_offset;
+               --remainder;
+               ++i;
+               nr_pages_hpage++;
+               if (vaddr < vma->vm_end && remainder &&
+                               pfn_offset < align_nr_pages) {
+                       /*
+                        * We use pfn_offset to avoid touching the pageframes
+                        * of this compound page.
+                        */
+                       goto same_page;
+               } else {
+                       /*
+                        * try_grab_compound_head() should always succeed here,
+                        * because: a) we hold the ptl lock, and b) we've just
+                        * checked that the huge page is present in the page
+                        * tables. If the huge page is present, then the tail
+                        * pages must also be present. The ptl prevents the
+                        * head page and tail pages from being rearranged in
+                        * any way. So this page must be available at this
+                        * point, unless the page refcount overflowed:
+                        */
+                       if (pages && (pgmap->flags & PGMAP_COMPOUND) &&
+                           WARN_ON_ONCE(!try_grab_compound_head(pages[i-1],
+                                                                nr_pages_hpage,
+                                                                flags))) {
+                               put_dev_pagemap(pgmap);
+                               spin_unlock(ptl);
+                               remainder = 0;
+                               err = -ENOMEM;
+                               break;
+                       }
+                       put_dev_pagemap(pgmap);
+               }
+               spin_unlock(ptl);
+       }
+       *nr_pages = remainder;
+       /*
+        * setting position is actually required only if remainder is
+        * not zero but it's faster not to add a "if (remainder)"
+        * branch.
+        */
+       *position = vaddr;
+
+       return i ? i : err;
+}
+
  int copy_huge_pud(struct mm_struct *dst_mm, struct mm_struct *src_mm,
                  pud_t *dst_pud, pud_t *src_pud, unsigned long addr,
                  struct vm_area_struct *vma)
