On Thu, Dec 17, 2020 at 10:22:33AM -0800, Linus Torvalds wrote:
> +       head = xas_find(&xas, end_pgoff);
> +       for (; ; head = xas_next_entry(&xas, end_pgoff)) {
> +               if (!head) {
> +                       rcu_read_unlock();
> +                       return;
> +               }
> +               if (likely(!xas_retry(&xas, head)))
> +                       break;
> +       }
> 
> instead. So that if we don't find any cached entries, we won't do
> anything, rather than take locks and then not do anything.

This should do. See below.

> Then that second loop very naturally becomes a "do { } while ()" one.

I don't see it. I haven't found a reasonable way to rework it into a do-while.
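
For what it's worth, the nearest do-while shape I could come up with is
something like this (illustrative only, it is not part of the patch below):
the advance has to move into the while condition so that every "continue"
still steps to the next entry:

        /* head is known to be non-NULL after the first loop */
        do {
                if (xas_retry(&xas, head))
                        continue;
                if (xa_is_value(head))
                        continue;
                /* ... the rest of the per-page work from the for-loop ... */
        } while ((head = xas_next_entry(&xas, end_pgoff)) != NULL);

It should behave the same, but burying the iterator advance in the loop
condition doesn't read any better to me than the plain for-loop, so I kept
the latter.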

diff --git a/include/linux/mm.h b/include/linux/mm.h
index db6ae4d3fb4e..2825153ad0d6 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -534,8 +534,8 @@ struct vm_fault {
                                         * is not NULL, otherwise pmd.
                                         */
        pgtable_t prealloc_pte;         /* Pre-allocated pte page table.
-                                        * vm_ops->map_pages() calls
-                                        * alloc_set_pte() from atomic context.
+                                        * vm_ops->map_pages() sets up a page
+                                        * table from atomic context.
                                         * do_fault_around() pre-allocates
                                         * page table to avoid allocation from
                                         * atomic context.
@@ -972,7 +972,9 @@ static inline pte_t maybe_mkwrite(pte_t pte, struct vm_area_struct *vma)
        return pte;
 }
 
-vm_fault_t alloc_set_pte(struct vm_fault *vmf, struct page *page);
+vm_fault_t do_set_pmd(struct vm_fault *vmf, struct page *page);
+void do_set_pte(struct vm_fault *vmf, struct page *page);
+
 vm_fault_t finish_fault(struct vm_fault *vmf);
 vm_fault_t finish_mkwrite_fault(struct vm_fault *vmf);
 #endif
diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h
index e237004d498d..869c1921ceda 100644
--- a/include/linux/pgtable.h
+++ b/include/linux/pgtable.h
@@ -1259,6 +1259,17 @@ static inline int pmd_trans_unstable(pmd_t *pmd)
 #endif
 }
 
+/*
+ * The ordering of these checks is important for pmds with _PAGE_DEVMAP set.
+ * If we check pmd_trans_unstable() first we will trip the bad_pmd() check
+ * inside of pmd_none_or_trans_huge_or_clear_bad(). This will end up correctly
+ * returning 1 but not before it spams dmesg with the pmd_clear_bad() output.
+ */
+static inline int pmd_devmap_trans_unstable(pmd_t *pmd)
+{
+       return pmd_devmap(*pmd) || pmd_trans_unstable(pmd);
+}
+
 #ifndef CONFIG_NUMA_BALANCING
 /*
  * Technically a PTE can be PROTNONE even when not doing NUMA balancing but
diff --git a/mm/filemap.c b/mm/filemap.c
index 0b2067b3c328..04975682296d 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -42,6 +42,7 @@
 #include <linux/psi.h>
 #include <linux/ramfs.h>
 #include <linux/page_idle.h>
+#include <asm/pgalloc.h>
 #include "internal.h"
 
 #define CREATE_TRACE_POINTS
@@ -2831,10 +2832,74 @@ vm_fault_t filemap_fault(struct vm_fault *vmf)
 }
 EXPORT_SYMBOL(filemap_fault);
 
+static bool filemap_map_pmd(struct vm_fault *vmf, struct page *page,
+                                 struct xa_state *xas)
+{
+       struct vm_area_struct *vma = vmf->vma;
+       struct address_space *mapping = vma->vm_file->f_mapping;
+
+       /* Huge page is mapped? No need to proceed. */
+       if (pmd_trans_huge(*vmf->pmd))
+               return true;
+
+       if (xa_is_value(page))
+               goto nohuge;
+
+       if (!pmd_none(*vmf->pmd))
+               goto nohuge;
+
+       if (!PageTransHuge(page) || PageLocked(page))
+               goto nohuge;
+
+       if (!page_cache_get_speculative(page))
+               goto nohuge;
+
+       if (page != xas_reload(xas))
+               goto unref;
+
+       if (!PageTransHuge(page))
+               goto unref;
+
+       if (!PageUptodate(page) || PageReadahead(page) || PageHWPoison(page))
+               goto unref;
+
+       if (!trylock_page(page))
+               goto unref;
+
+       if (page->mapping != mapping || !PageUptodate(page))
+               goto unlock;
+
+       if (xas->xa_index >= DIV_ROUND_UP(i_size_read(mapping->host), PAGE_SIZE))
+               goto unlock;
+
+       do_set_pmd(vmf, page);
+       unlock_page(page);
+       return true;
+unlock:
+       unlock_page(page);
+unref:
+       put_page(page);
+nohuge:
+       vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd);
+       if (likely(pmd_none(*vmf->pmd))) {
+               mm_inc_nr_ptes(vma->vm_mm);
+               pmd_populate(vma->vm_mm, vmf->pmd, vmf->prealloc_pte);
+               vmf->prealloc_pte = NULL;
+       }
+       spin_unlock(vmf->ptl);
+
+       /* See comment in handle_pte_fault() */
+       if (pmd_devmap_trans_unstable(vmf->pmd))
+               return true;
+
+       return false;
+}
+
 void filemap_map_pages(struct vm_fault *vmf,
                pgoff_t start_pgoff, pgoff_t end_pgoff)
 {
-       struct file *file = vmf->vma->vm_file;
+       struct vm_area_struct *vma = vmf->vma;
+       struct file *file = vma->vm_file;
        struct address_space *mapping = file->f_mapping;
        pgoff_t last_pgoff = start_pgoff;
        unsigned long max_idx;
@@ -2843,20 +2908,37 @@ void filemap_map_pages(struct vm_fault *vmf,
        unsigned int mmap_miss = READ_ONCE(file->f_ra.mmap_miss);
 
        rcu_read_lock();
-       xas_for_each(&xas, head, end_pgoff) {
+       head = xas_find(&xas, end_pgoff);
+       for (; ; head = xas_next_entry(&xas, end_pgoff)) {
+               if (!head) {
+                       rcu_read_unlock();
+                       return;
+               }
+               if (likely(!xas_retry(&xas, head)))
+                       break;
+       }
+
+       if (filemap_map_pmd(vmf, head, &xas)) {
+               rcu_read_unlock();
+               return;
+       }
+
+       vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd,
+                                      vmf->address, &vmf->ptl);
+
+       for (; head; head = xas_next_entry(&xas, end_pgoff)) {
                if (xas_retry(&xas, head))
                        continue;
                if (xa_is_value(head))
-                       goto next;
-
+                       continue;
                /*
                 * Check for a locked page first, as a speculative
                 * reference may adversely influence page migration.
                 */
                if (PageLocked(head))
-                       goto next;
+                       continue;
                if (!page_cache_get_speculative(head))
-                       goto next;
+                       continue;
 
                /* Has the page moved or been split? */
                if (unlikely(head != xas_reload(&xas)))
@@ -2884,19 +2966,18 @@ void filemap_map_pages(struct vm_fault *vmf,
                if (vmf->pte)
                        vmf->pte += xas.xa_index - last_pgoff;
                last_pgoff = xas.xa_index;
-               if (alloc_set_pte(vmf, page))
-                       goto unlock;
+               if (pte_none(*vmf->pte))
+                       do_set_pte(vmf, page);
+               /* no need to invalidate: a not-present page won't be cached */
+               update_mmu_cache(vma, vmf->address, vmf->pte);
                unlock_page(head);
-               goto next;
+               continue;
 unlock:
                unlock_page(head);
 skip:
                put_page(head);
-next:
-               /* Huge page is mapped? No need to proceed. */
-               if (pmd_trans_huge(*vmf->pmd))
-                       break;
        }
+       pte_unmap_unlock(vmf->pte, vmf->ptl);
        rcu_read_unlock();
        WRITE_ONCE(file->f_ra.mmap_miss, mmap_miss);
 }
diff --git a/mm/memory.c b/mm/memory.c
index c48f8df6e502..96d62774096a 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -3490,7 +3490,7 @@ static vm_fault_t do_anonymous_page(struct vm_fault *vmf)
        if (pte_alloc(vma->vm_mm, vmf->pmd))
                return VM_FAULT_OOM;
 
-       /* See the comment in pte_alloc_one_map() */
+       /* See comment in handle_pte_fault() */
        if (unlikely(pmd_trans_unstable(vmf->pmd)))
                return 0;
 
@@ -3630,66 +3630,6 @@ static vm_fault_t __do_fault(struct vm_fault *vmf)
        return ret;
 }
 
-/*
- * The ordering of these checks is important for pmds with _PAGE_DEVMAP set.
- * If we check pmd_trans_unstable() first we will trip the bad_pmd() check
- * inside of pmd_none_or_trans_huge_or_clear_bad(). This will end up correctly
- * returning 1 but not before it spams dmesg with the pmd_clear_bad() output.
- */
-static int pmd_devmap_trans_unstable(pmd_t *pmd)
-{
-       return pmd_devmap(*pmd) || pmd_trans_unstable(pmd);
-}
-
-static vm_fault_t pte_alloc_one_map(struct vm_fault *vmf)
-{
-       struct vm_area_struct *vma = vmf->vma;
-
-       if (!pmd_none(*vmf->pmd))
-               goto map_pte;
-       if (vmf->prealloc_pte) {
-               vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd);
-               if (unlikely(!pmd_none(*vmf->pmd))) {
-                       spin_unlock(vmf->ptl);
-                       goto map_pte;
-               }
-
-               mm_inc_nr_ptes(vma->vm_mm);
-               pmd_populate(vma->vm_mm, vmf->pmd, vmf->prealloc_pte);
-               spin_unlock(vmf->ptl);
-               vmf->prealloc_pte = NULL;
-       } else if (unlikely(pte_alloc(vma->vm_mm, vmf->pmd))) {
-               return VM_FAULT_OOM;
-       }
-map_pte:
-       /*
-        * If a huge pmd materialized under us just retry later.  Use
-        * pmd_trans_unstable() via pmd_devmap_trans_unstable() instead of
-        * pmd_trans_huge() to ensure the pmd didn't become pmd_trans_huge
-        * under us and then back to pmd_none, as a result of MADV_DONTNEED
-        * running immediately after a huge pmd fault in a different thread of
-        * this mm, in turn leading to a misleading pmd_trans_huge() retval.
-        * All we have to ensure is that it is a regular pmd that we can walk
-        * with pte_offset_map() and we can do that through an atomic read in
-        * C, which is what pmd_trans_unstable() provides.
-        */
-       if (pmd_devmap_trans_unstable(vmf->pmd))
-               return VM_FAULT_NOPAGE;
-
-       /*
-        * At this point we know that our vmf->pmd points to a page of ptes
-        * and it cannot become pmd_none(), pmd_devmap() or pmd_trans_huge()
-        * for the duration of the fault.  If a racing MADV_DONTNEED runs and
-        * we zap the ptes pointed to by our vmf->pmd, the vmf->ptl will still
-        * be valid and we will re-check to make sure the vmf->pte isn't
-        * pte_none() under vmf->ptl protection when we return to
-        * alloc_set_pte().
-        */
-       vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, vmf->address,
-                       &vmf->ptl);
-       return 0;
-}
-
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
 static void deposit_prealloc_pte(struct vm_fault *vmf)
 {
@@ -3704,7 +3644,7 @@ static void deposit_prealloc_pte(struct vm_fault *vmf)
        vmf->prealloc_pte = NULL;
 }
 
-static vm_fault_t do_set_pmd(struct vm_fault *vmf, struct page *page)
+vm_fault_t do_set_pmd(struct vm_fault *vmf, struct page *page)
 {
        struct vm_area_struct *vma = vmf->vma;
        bool write = vmf->flags & FAULT_FLAG_WRITE;
@@ -3769,45 +3709,11 @@ static vm_fault_t do_set_pmd(struct vm_fault *vmf, struct page *page)
 }
 #endif
 
-/**
- * alloc_set_pte - setup new PTE entry for given page and add reverse page
- * mapping. If needed, the function allocates page table or use pre-allocated.
- *
- * @vmf: fault environment
- * @page: page to map
- *
- * Caller must take care of unlocking vmf->ptl, if vmf->pte is non-NULL on
- * return.
- *
- * Target users are page handler itself and implementations of
- * vm_ops->map_pages.
- *
- * Return: %0 on success, %VM_FAULT_ code in case of error.
- */
-vm_fault_t alloc_set_pte(struct vm_fault *vmf, struct page *page)
+void do_set_pte(struct vm_fault *vmf, struct page *page)
 {
        struct vm_area_struct *vma = vmf->vma;
        bool write = vmf->flags & FAULT_FLAG_WRITE;
        pte_t entry;
-       vm_fault_t ret;
-
-       if (pmd_none(*vmf->pmd) && PageTransCompound(page)) {
-               ret = do_set_pmd(vmf, page);
-               if (ret != VM_FAULT_FALLBACK)
-                       return ret;
-       }
-
-       if (!vmf->pte) {
-               ret = pte_alloc_one_map(vmf);
-               if (ret)
-                       return ret;
-       }
-
-       /* Re-check under ptl */
-       if (unlikely(!pte_none(*vmf->pte))) {
-               update_mmu_tlb(vma, vmf->address, vmf->pte);
-               return VM_FAULT_NOPAGE;
-       }
 
        flush_icache_page(vma, page);
        entry = mk_pte(page, vma->vm_page_prot);
@@ -3824,14 +3730,8 @@ vm_fault_t alloc_set_pte(struct vm_fault *vmf, struct page *page)
                page_add_file_rmap(page, false);
        }
        set_pte_at(vma->vm_mm, vmf->address, vmf->pte, entry);
-
-       /* no need to invalidate: a not-present page won't be cached */
-       update_mmu_cache(vma, vmf->address, vmf->pte);
-
-       return 0;
 }
 
-
 /**
  * finish_fault - finish page fault once we have prepared the page to fault
  *
@@ -3849,12 +3749,12 @@ vm_fault_t alloc_set_pte(struct vm_fault *vmf, struct page *page)
  */
 vm_fault_t finish_fault(struct vm_fault *vmf)
 {
+       struct vm_area_struct *vma = vmf->vma;
        struct page *page;
-       vm_fault_t ret = 0;
+       vm_fault_t ret;
 
        /* Did we COW the page? */
-       if ((vmf->flags & FAULT_FLAG_WRITE) &&
-           !(vmf->vma->vm_flags & VM_SHARED))
+       if ((vmf->flags & FAULT_FLAG_WRITE) && !(vma->vm_flags & VM_SHARED))
                page = vmf->cow_page;
        else
                page = vmf->page;
@@ -3863,13 +3763,36 @@ vm_fault_t finish_fault(struct vm_fault *vmf)
         * check even for read faults because we might have lost our CoWed
         * page
         */
-       if (!(vmf->vma->vm_flags & VM_SHARED))
-               ret = check_stable_address_space(vmf->vma->vm_mm);
-       if (!ret)
-               ret = alloc_set_pte(vmf, page);
-       if (vmf->pte)
-               pte_unmap_unlock(vmf->pte, vmf->ptl);
-       return ret;
+       if (!(vma->vm_flags & VM_SHARED)) {
+               ret = check_stable_address_space(vma->vm_mm);
+               if (ret)
+                       return ret;
+       }
+
+       if (pmd_none(*vmf->pmd)) {
+               if (PageTransCompound(page)) {
+                       ret = do_set_pmd(vmf, page);
+                       if (ret != VM_FAULT_FALLBACK)
+                               return ret;
+               }
+
+               if (unlikely(pte_alloc(vma->vm_mm, vmf->pmd)))
+                       return VM_FAULT_OOM;
+       }
+
+       /* See comment in handle_pte_fault() */
+       if (pmd_devmap_trans_unstable(vmf->pmd))
+               return 0;
+
+       vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd,
+                                     vmf->address, &vmf->ptl);
+       /* Re-check under ptl */
+       if (likely(pte_none(*vmf->pte)))
+               do_set_pte(vmf, page);
+
+       update_mmu_tlb(vma, vmf->address, vmf->pte);
+       pte_unmap_unlock(vmf->pte, vmf->ptl);
+       return 0;
 }
 
 static unsigned long fault_around_bytes __read_mostly =
@@ -3980,7 +3902,6 @@ static vm_fault_t do_fault_around(struct vm_fault *vmf)
        vmf->pte -= (vmf->address >> PAGE_SHIFT) - (address >> PAGE_SHIFT);
        if (!pte_none(*vmf->pte))
                ret = VM_FAULT_NOPAGE;
-       pte_unmap_unlock(vmf->pte, vmf->ptl);
 out:
        vmf->address = address;
        vmf->pte = NULL;
@@ -4340,7 +4261,18 @@ static vm_fault_t handle_pte_fault(struct vm_fault *vmf)
                 */
                vmf->pte = NULL;
        } else {
-               /* See comment in pte_alloc_one_map() */
+               /*
+                * If a huge pmd materialized under us just retry later.  Use
+                * pmd_trans_unstable() via pmd_devmap_trans_unstable() instead
+                * of pmd_trans_huge() to ensure the pmd didn't become
+                * pmd_trans_huge under us and then back to pmd_none, as a
+                * result of MADV_DONTNEED running immediately after a huge pmd
+                * fault in a different thread of this mm, in turn leading to a
+                * misleading pmd_trans_huge() retval. All we have to ensure is
+                * that it is a regular pmd that we can walk with
+                * pte_offset_map() and we can do that through an atomic read
+                * in C, which is what pmd_trans_unstable() provides.
+                */
                if (pmd_devmap_trans_unstable(vmf->pmd))
                        return 0;
                /*
-- 
 Kirill A. Shutemov
