Allow UFFDIO_CONTINUE on anonymous VMAs with VM_UFFD_MINOR. For shmem,
CONTINUE installs a PTE from page cache. For anonymous memory, the
page is already mapped via a protnone PTE — CONTINUE restores the
original VMA permissions.

PTE level: mfill_atomic_pte_continue_anon() walks to the PTE, verifies
protnone, restores permissions. Rename the shmem path to
mfill_atomic_pte_continue_shmem() for clarity.

PMD/THP level: mfill_atomic_pmd_continue_anon() restores protnone PMD
permissions in place without splitting. Handles PMD races with EAGAIN
retry in the mfill_atomic loop.

Add protnone PTE/PMD checks in userfaultfd_must_wait() so sync minor
faults properly block until resolved.

Signed-off-by: Kiryl Shutsemau (Meta) <[email protected]>
Assisted-by: Claude:claude-opus-4-6
---
 fs/userfaultfd.c |  9 +++++-
 mm/userfaultfd.c | 82 ++++++++++++++++++++++++++++++++++++++++++++----
 2 files changed, 84 insertions(+), 7 deletions(-)

diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c
index b317c9854b86..43064238fd8d 100644
--- a/fs/userfaultfd.c
+++ b/fs/userfaultfd.c
@@ -340,8 +340,11 @@ static inline bool userfaultfd_must_wait(struct userfaultfd_ctx *ctx,
        if (!pmd_present(_pmd))
                return false;
 
-       if (pmd_trans_huge(_pmd))
+       if (pmd_trans_huge(_pmd)) {
+               if (pmd_protnone(_pmd) && (reason & VM_UFFD_MINOR))
+                       return true;
                return !pmd_write(_pmd) && (reason & VM_UFFD_WP);
+       }
 
        pte = pte_offset_map(pmd, address);
        if (!pte)
@@ -366,6 +369,9 @@ static inline bool userfaultfd_must_wait(struct userfaultfd_ctx *ctx,
         */
        if (!pte_write(ptent) && (reason & VM_UFFD_WP))
                goto out;
+       /* PTE is still protnone (deactivated), wait for userspace to resolve. */
+       if (pte_protnone(ptent) && (reason & VM_UFFD_MINOR))
+               goto out;
 
        ret = false;
 out:
@@ -1820,6 +1826,7 @@ static int userfaultfd_deactivate(struct userfaultfd_ctx *ctx,
        return ret;
 }
 
+
 static int userfaultfd_continue(struct userfaultfd_ctx *ctx, unsigned long arg)
 {
        __s64 ret;
diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c
index 3373b11b9d83..4c52fa5d1608 100644
--- a/mm/userfaultfd.c
+++ b/mm/userfaultfd.c
@@ -380,8 +380,61 @@ static int mfill_atomic_pte_zeropage(pmd_t *dst_pmd,
        return ret;
 }
 
-/* Handles UFFDIO_CONTINUE for all shmem VMAs (shared or private). */
-static int mfill_atomic_pte_continue(pmd_t *dst_pmd,
+static int mfill_atomic_pte_continue_anon(pmd_t *dst_pmd,
+                                         struct vm_area_struct *dst_vma,
+                                         unsigned long dst_addr,
+                                         uffd_flags_t flags)
+{
+       pte_t *ptep, pte;
+       spinlock_t *ptl;
+       int ret = -EFAULT;
+
+       ptep = pte_offset_map_lock(dst_vma->vm_mm, dst_pmd, dst_addr, &ptl);
+       if (!ptep)
+               return ret;
+
+       pte = ptep_get(ptep);
+       if (!pte_protnone(pte))
+               goto out_unlock;
+
+       pte = pte_modify(pte, dst_vma->vm_page_prot);
+       pte = pte_mkyoung(pte);
+       if (flags & MFILL_ATOMIC_WP)
+               pte = pte_wrprotect(pte);
+       set_pte_at(dst_vma->vm_mm, dst_addr, ptep, pte);
+       update_mmu_cache(dst_vma, dst_addr, ptep);
+       ret = 0;
+out_unlock:
+       pte_unmap_unlock(ptep, ptl);
+       return ret;
+}
+
+static int mfill_atomic_pmd_continue_anon(struct mm_struct *mm,
+                                         struct vm_area_struct *vma,
+                                         unsigned long addr,
+                                         pmd_t *pmd, pmd_t orig_pmd,
+                                         uffd_flags_t flags)
+{
+       spinlock_t *ptl;
+       pmd_t entry;
+
+       ptl = pmd_lock(mm, pmd);
+       if (unlikely(!pmd_same(pmdp_get(pmd), orig_pmd))) {
+               spin_unlock(ptl);
+               return -EAGAIN;
+       }
+
+       entry = pmd_modify(orig_pmd, vma->vm_page_prot);
+       entry = pmd_mkyoung(entry);
+       if (flags & MFILL_ATOMIC_WP)
+               entry = pmd_wrprotect(entry);
+       set_pmd_at(mm, addr & HPAGE_PMD_MASK, pmd, entry);
+       update_mmu_cache_pmd(vma, addr, pmd);
+       spin_unlock(ptl);
+       return 0;
+}
+
+static int mfill_atomic_pte_continue_shmem(pmd_t *dst_pmd,
                                     struct vm_area_struct *dst_vma,
                                     unsigned long dst_addr,
                                     uffd_flags_t flags)
@@ -667,7 +720,10 @@ static __always_inline ssize_t mfill_atomic_pte(pmd_t *dst_pmd,
        ssize_t err;
 
        if (uffd_flags_mode_is(flags, MFILL_ATOMIC_CONTINUE)) {
-               return mfill_atomic_pte_continue(dst_pmd, dst_vma,
+               if (vma_is_anonymous(dst_vma))
+                       return mfill_atomic_pte_continue_anon(dst_pmd, dst_vma,
+                                                             dst_addr, flags);
+               return mfill_atomic_pte_continue_shmem(dst_pmd, dst_vma,
                                                 dst_addr, flags);
        } else if (uffd_flags_mode_is(flags, MFILL_ATOMIC_POISON)) {
                return mfill_atomic_pte_poison(dst_pmd, dst_vma,
@@ -802,11 +858,25 @@ static __always_inline ssize_t mfill_atomic(struct userfaultfd_ctx *ctx,
                        break;
                }
                /*
-                * If the dst_pmd is THP don't override it and just be strict.
-                * (This includes the case where the PMD used to be THP and
-                * changed back to none after __pte_alloc().)
+                * THP PMD: for anon CONTINUE, restore protnone PMD
+                * permissions in place. For other operations, reject.
                 */
                if (unlikely(pmd_trans_huge(dst_pmdval))) {
+                       if (uffd_flags_mode_is(flags, MFILL_ATOMIC_CONTINUE) &&
+                           vma_is_anonymous(dst_vma) &&
+                           pmd_protnone(dst_pmdval)) {
+                               err = mfill_atomic_pmd_continue_anon(
+                                       dst_mm, dst_vma, dst_addr,
+                                       dst_pmd, dst_pmdval, flags);
+                               if (err == -EAGAIN)
+                                       continue; /* PMD changed, re-read it */
+                               if (err)
+                                       break;
+                               dst_addr += HPAGE_PMD_SIZE;
+                               src_addr += HPAGE_PMD_SIZE;
+                               copied += HPAGE_PMD_SIZE;
+                               continue;
+                       }
                        err = -EEXIST;
                        break;
                }
-- 
2.51.2


Reply via email to