From: Jason Gunthorpe <[email protected]>

The pagewalker runs while only holding the mmap_sem for read. The pud can
be set asynchronously, while also holding the mmap_sem for read, e.g. from:

 handle_mm_fault()
  __handle_mm_fault()
   create_huge_pmd()
    dev_dax_huge_fault()
     __dev_dax_pud_fault()
      vmf_insert_pfn_pud()
       insert_pfn_pud()
        pud_lock()
        set_pud_at()

At least x86 sets the PUD using WRITE_ONCE(), so an unlocked read of this
unstable data should be paired with READ_ONCE().

For the pagewalker to work locklessly the PUD must work similarly to the
PMD: once the PUD entry becomes a pointer to a PMD, it must be stable, and
safe to pass to pmd_offset().

Passing the value from READ_ONCE into the callbacks prevents the callers
from seeing inconsistencies after they re-read, such as seeing pud_none().

If a callback does obtain the pud_lock then it should trigger ACTION_AGAIN
if a data race caused the original value to change.

Use the same pattern as gup_pmd_range() and pass in the address of the
local READ_ONCE stack variable to pmd_offset() to avoid reading it again.

Signed-off-by: Jason Gunthorpe <[email protected]>
---
 include/linux/pagewalk.h   |  2 +-
 mm/hmm.c                   | 16 +++++++---------
 mm/mapping_dirty_helpers.c |  6 ++----
 mm/pagewalk.c              | 28 ++++++++++++++++------------
 mm/ptdump.c                |  3 +--
 5 files changed, 27 insertions(+), 28 deletions(-)

diff --git a/include/linux/pagewalk.h b/include/linux/pagewalk.h
index b1cb6b753abb..6caf28aadafb 100644
--- a/include/linux/pagewalk.h
+++ b/include/linux/pagewalk.h
@@ -39,7 +39,7 @@ struct mm_walk_ops {
                         unsigned long next, struct mm_walk *walk);
        int (*p4d_entry)(p4d_t *p4d, unsigned long addr,
                         unsigned long next, struct mm_walk *walk);
-       int (*pud_entry)(pud_t *pud, unsigned long addr,
+       int (*pud_entry)(pud_t pud, pud_t *pudp, unsigned long addr,
                         unsigned long next, struct mm_walk *walk);
        int (*pmd_entry)(pmd_t *pmd, unsigned long addr,
                         unsigned long next, struct mm_walk *walk);
diff --git a/mm/hmm.c b/mm/hmm.c
index 943cb2ba4442..419e9e50fd51 100644
--- a/mm/hmm.c
+++ b/mm/hmm.c
@@ -402,28 +402,26 @@ static inline unsigned long pud_to_hmm_pfn_flags(struct hmm_range *range,
               hmm_pfn_flags_order(PUD_SHIFT - PAGE_SHIFT);
 }
 
-static int hmm_vma_walk_pud(pud_t *pudp, unsigned long start, unsigned long end,
-               struct mm_walk *walk)
+static int hmm_vma_walk_pud(pud_t pud, pud_t *pudp, unsigned long start,
+                           unsigned long end, struct mm_walk *walk)
 {
        struct hmm_vma_walk *hmm_vma_walk = walk->private;
        struct hmm_range *range = hmm_vma_walk->range;
        unsigned long addr = start;
-       pud_t pud;
        int ret = 0;
        spinlock_t *ptl = pud_trans_huge_lock(pudp, walk->vma);
 
        if (!ptl)
                return 0;
+       if (memcmp(pudp, &pud, sizeof(pud)) != 0) {
+               walk->action = ACTION_AGAIN;
+               spin_unlock(ptl);
+               return 0;
+       }
 
        /* Normally we don't want to split the huge page */
        walk->action = ACTION_CONTINUE;
 
-       pud = READ_ONCE(*pudp);
-       if (pud_none(pud)) {
-               spin_unlock(ptl);
-               return hmm_vma_walk_hole(start, end, -1, walk);
-       }
-
        if (pud_huge(pud) && pud_devmap(pud)) {
                unsigned long i, npages, pfn;
                unsigned int required_fault;
diff --git a/mm/mapping_dirty_helpers.c b/mm/mapping_dirty_helpers.c
index 2c7d03675903..9fc46ebef497 100644
--- a/mm/mapping_dirty_helpers.c
+++ b/mm/mapping_dirty_helpers.c
@@ -150,11 +150,9 @@ static int wp_clean_pmd_entry(pmd_t *pmd, unsigned long addr, unsigned long end,
  * causes dirty info loss. The pagefault handler should do
  * that if needed.
  */
-static int wp_clean_pud_entry(pud_t *pud, unsigned long addr, unsigned long end,
-                             struct mm_walk *walk)
+static int wp_clean_pud_entry(pud_t pudval, pud_t *pudp, unsigned long addr,
+                             unsigned long end, struct mm_walk *walk)
 {
-       pud_t pudval = READ_ONCE(*pud);
-
        if (!pud_trans_unstable(&pudval))
                return 0;
 
diff --git a/mm/pagewalk.c b/mm/pagewalk.c
index e81640d9f177..15d1e423b4a3 100644
--- a/mm/pagewalk.c
+++ b/mm/pagewalk.c
@@ -58,7 +58,7 @@ static int walk_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
        return err;
 }
 
-static int walk_pmd_range(pud_t *pud, unsigned long addr, unsigned long end,
+static int walk_pmd_range(pud_t pud, unsigned long addr, unsigned long end,
                          struct mm_walk *walk)
 {
        pmd_t *pmd;
@@ -67,7 +67,7 @@ static int walk_pmd_range(pud_t *pud, unsigned long addr, unsigned long end,
        int err = 0;
        int depth = real_depth(3);
 
-       pmd = pmd_offset(pud, addr);
+       pmd = pmd_offset(&pud, addr);
        do {
 again:
                next = pmd_addr_end(addr, end);
@@ -119,17 +119,19 @@ static int walk_pmd_range(pud_t *pud, unsigned long addr, unsigned long end,
 static int walk_pud_range(p4d_t *p4d, unsigned long addr, unsigned long end,
                          struct mm_walk *walk)
 {
-       pud_t *pud;
+       pud_t *pudp;
+       pud_t pud;
        unsigned long next;
        const struct mm_walk_ops *ops = walk->ops;
        int err = 0;
        int depth = real_depth(2);
 
-       pud = pud_offset(p4d, addr);
+       pudp = pud_offset(p4d, addr);
        do {
  again:
+               pud = READ_ONCE(*pudp);
                next = pud_addr_end(addr, end);
-               if (pud_none(*pud) || (!walk->vma && !walk->no_vma)) {
+               if (pud_none(pud) || (!walk->vma && !walk->no_vma)) {
                        if (ops->pte_hole)
                                err = ops->pte_hole(addr, next, depth, walk);
                        if (err)
@@ -140,27 +142,29 @@ static int walk_pud_range(p4d_t *p4d, unsigned long addr, unsigned long end,
                walk->action = ACTION_SUBTREE;
 
                if (ops->pud_entry)
-                       err = ops->pud_entry(pud, addr, next, walk);
+                       err = ops->pud_entry(pud, pudp, addr, next, walk);
                if (err)
                        break;
 
                if (walk->action == ACTION_AGAIN)
                        goto again;
 
-               if ((!walk->vma && (pud_leaf(*pud) || !pud_present(*pud))) ||
+               if ((!walk->vma && (pud_leaf(pud) || !pud_present(pud))) ||
                    walk->action == ACTION_CONTINUE ||
                    !(ops->pmd_entry || ops->pte_entry))
                        continue;
 
-               if (walk->vma)
-                       split_huge_pud(walk->vma, pud, addr);
-               if (pud_none(*pud))
-                       goto again;
+               if (walk->vma) {
+                       split_huge_pud(walk->vma, pudp, addr);
+                       pud = READ_ONCE(*pudp);
+                       if (pud_none(pud))
+                               goto again;
+               }
 
                err = walk_pmd_range(pud, addr, next, walk);
                if (err)
                        break;
-       } while (pud++, addr = next, addr != end);
+       } while (pudp++, addr = next, addr != end);
 
        return err;
 }
diff --git a/mm/ptdump.c b/mm/ptdump.c
index ba88ec43ff21..2055b940408e 100644
--- a/mm/ptdump.c
+++ b/mm/ptdump.c
@@ -65,11 +65,10 @@ static int ptdump_p4d_entry(p4d_t *p4d, unsigned long addr,
        return 0;
 }
 
-static int ptdump_pud_entry(pud_t *pud, unsigned long addr,
+static int ptdump_pud_entry(pud_t val, pud_t *pudp, unsigned long addr,
                            unsigned long next, struct mm_walk *walk)
 {
        struct ptdump_state *st = walk->private;
-       pud_t val = READ_ONCE(*pud);
 
 #if CONFIG_PGTABLE_LEVELS > 2 && defined(CONFIG_KASAN)
        if (pud_page(val) == virt_to_page(lm_alias(kasan_early_shadow_pmd)))
-- 
2.28.0

Reply via email to