UFFDIO_DEACTIVATE marks pages as deactivated within a VM_UFFD_MINOR
range:

- Anonymous memory: set protnone via
  change_protection(MM_CP_UFFD_DEACTIVATE). Pages stay resident with
  PFNs preserved; only the permissions are removed.
  MM_CP_UFFD_DEACTIVATE is handled independently from MM_CP_PROT_NUMA,
  bypassing folio_can_map_prot_numa() and the CONFIG_NUMA_BALANCING
  guards.
- Shared shmem/hugetlbfs: zap PTEs via zap_page_range_single(). Pages
  stay in the page cache.

- Private hugetlb: rejected with -EINVAL (zapping would destroy the
  page contents).

Cleanup on unregister/close: restore protnone PTEs to normal
permissions in userfaultfd_clear_vma(), preventing permanently
inaccessible pages.

Signed-off-by: Kiryl Shutsemau (Meta) <[email protected]>
Assisted-by: Claude:claude-opus-4-6
---
(Two hedged userspace sketches for reviewers follow the diff.)

 fs/userfaultfd.c              | 35 ++++++++++++++++
 include/linux/mm.h            |  2 +
 include/linux/userfaultfd_k.h |  2 +
 mm/huge_memory.c              |  9 ++--
 mm/mprotect.c                 |  9 +++-
 mm/userfaultfd.c              | 78 +++++++++++++++++++++++++++++++++--
 6 files changed, 127 insertions(+), 8 deletions(-)

diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c
index 8d508ad19e89..b317c9854b86 100644
--- a/fs/userfaultfd.c
+++ b/fs/userfaultfd.c
@@ -1441,6 +1441,10 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx,
 	if (!(uffdio_register.mode & UFFDIO_REGISTER_MODE_MINOR))
 		ioctls_out &= ~((__u64)1 << _UFFDIO_CONTINUE);
 
+	/* DEACTIVATE is only supported for MINOR ranges. */
+	if (!(uffdio_register.mode & UFFDIO_REGISTER_MODE_MINOR))
+		ioctls_out &= ~((__u64)1 << _UFFDIO_DEACTIVATE);
+
 	/*
 	 * Now that we scanned all vmas we can already tell
 	 * userland which ioctls methods are guaranteed to
@@ -1788,6 +1792,34 @@ static int userfaultfd_writeprotect(struct userfaultfd_ctx *ctx,
 	return ret;
 }
 
+static int userfaultfd_deactivate(struct userfaultfd_ctx *ctx,
+				  unsigned long arg)
+{
+	int ret;
+	struct uffdio_range uffdio_range;
+
+	if (atomic_read(&ctx->mmap_changing))
+		return -EAGAIN;
+
+	if (copy_from_user(&uffdio_range, (void __user *)arg,
+			   sizeof(uffdio_range)))
+		return -EFAULT;
+
+	ret = validate_range(ctx->mm, uffdio_range.start, uffdio_range.len);
+	if (ret)
+		return ret;
+
+	if (mmget_not_zero(ctx->mm)) {
+		ret = mdeactivate_range(ctx, uffdio_range.start,
+					uffdio_range.len);
+		mmput(ctx->mm);
+	} else {
+		return -ESRCH;
+	}
+
+	return ret;
+}
+
 static int userfaultfd_continue(struct userfaultfd_ctx *ctx, unsigned long arg)
 {
 	__s64 ret;
@@ -2108,6 +2140,9 @@ static long userfaultfd_ioctl(struct file *file, unsigned cmd,
 	case UFFDIO_POISON:
 		ret = userfaultfd_poison(ctx, arg);
 		break;
+	case UFFDIO_DEACTIVATE:
+		ret = userfaultfd_deactivate(ctx, arg);
+		break;
 	}
 	return ret;
 }
diff --git a/include/linux/mm.h b/include/linux/mm.h
index abb4963c1f06..fc2841264d56 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -3036,6 +3036,8 @@ int get_cmdline(struct task_struct *task, char *buffer, int buflen);
 #define MM_CP_UFFD_WP_RESOLVE		(1UL << 3) /* Resolve wp */
 #define MM_CP_UFFD_WP_ALL		(MM_CP_UFFD_WP | \
 					 MM_CP_UFFD_WP_RESOLVE)
+/* Whether this change is for uffd deactivation */
+#define MM_CP_UFFD_DEACTIVATE		(1UL << 4)
 
 bool can_change_pte_writable(struct vm_area_struct *vma, unsigned long addr,
 			     pte_t pte);
diff --git a/include/linux/userfaultfd_k.h b/include/linux/userfaultfd_k.h
index d1d4ed4a08b0..c94b5c5b5f24 100644
--- a/include/linux/userfaultfd_k.h
+++ b/include/linux/userfaultfd_k.h
@@ -130,6 +130,8 @@
 extern int mwriteprotect_range(struct userfaultfd_ctx *ctx,
 			       unsigned long start, unsigned long len,
 			       bool enable_wp);
 extern long uffd_wp_range(struct vm_area_struct *vma, unsigned long start,
 			  unsigned long len, bool enable_wp);
+extern int mdeactivate_range(struct userfaultfd_ctx *ctx, unsigned long start,
+			     unsigned long len);
 
 /* move_pages */
 void double_pt_lock(spinlock_t *ptl1, spinlock_t *ptl2);
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index b298cba853ab..2ad736ff007c 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -2563,6 +2563,7 @@ int change_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
 	spinlock_t *ptl;
 	pmd_t oldpmd, entry;
 	bool prot_numa = cp_flags & MM_CP_PROT_NUMA;
+	bool uffd_deactivate = cp_flags & MM_CP_UFFD_DEACTIVATE;
 	bool uffd_wp = cp_flags & MM_CP_UFFD_WP;
 	bool uffd_wp_resolve = cp_flags & MM_CP_UFFD_WP_RESOLVE;
 	int ret = 1;
@@ -2582,8 +2583,11 @@ int change_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
 		goto unlock;
 	}
 
-	if (prot_numa) {
+	/* Already protnone — nothing to do for either NUMA or uffd */
+	if ((prot_numa || uffd_deactivate) && pmd_protnone(*pmd))
+		goto unlock;
+
+	if (prot_numa) {
 		/*
 		 * Avoid trapping faults against the zero page. The read-only
 		 * data is likely to be read-cached on the local CPU and
@@ -2592,9 +2596,6 @@ int change_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
 		if (is_huge_zero_pmd(*pmd))
 			goto unlock;
 
-		if (pmd_protnone(*pmd))
-			goto unlock;
-
 		if (!folio_can_map_prot_numa(pmd_folio(*pmd), vma,
 					     vma_is_single_threaded_private(vma)))
 			goto unlock;
diff --git a/mm/mprotect.c b/mm/mprotect.c
index c0571445bef7..7c612a680014 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -220,6 +220,7 @@ static long change_pte_range(struct mmu_gather *tlb,
 	long pages = 0;
 	bool is_private_single_threaded;
 	bool prot_numa = cp_flags & MM_CP_PROT_NUMA;
+	bool uffd_deactivate = cp_flags & MM_CP_UFFD_DEACTIVATE;
 	bool uffd_wp = cp_flags & MM_CP_UFFD_WP;
 	bool uffd_wp_resolve = cp_flags & MM_CP_UFFD_WP_RESOLVE;
 	int nr_ptes;
@@ -245,7 +246,8 @@ static long change_pte_range(struct mmu_gather *tlb,
 			pte_t ptent;
 
 			/* Already in the desired state. */
-			if (prot_numa && pte_protnone(oldpte))
+			if ((prot_numa || uffd_deactivate) &&
+			    pte_protnone(oldpte))
 				continue;
 
 			page = vm_normal_page(vma, addr, oldpte);
@@ -255,6 +257,8 @@ static long change_pte_range(struct mmu_gather *tlb,
 			/*
 			 * Avoid trapping faults against the zero or KSM
 			 * pages. See similar comment in change_huge_pmd.
+			 * Skip this filter for uffd deactivation which
+			 * must set protnone regardless of NUMA placement.
 			 */
 			if (prot_numa &&
 			    !folio_can_map_prot_numa(folio, vma,
@@ -651,6 +655,9 @@ long change_protection(struct mmu_gather *tlb,
 	WARN_ON_ONCE(cp_flags & MM_CP_PROT_NUMA);
 #endif
 
+	if (cp_flags & MM_CP_UFFD_DEACTIVATE)
+		newprot = PAGE_NONE;
+
 	if (is_vm_hugetlb_page(vma))
 		pages = hugetlb_change_protection(vma, start, end, newprot,
 						  cp_flags);
diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c
index dba1ea26fdfe..3373b11b9d83 100644
--- a/mm/userfaultfd.c
+++ b/mm/userfaultfd.c
@@ -775,7 +775,7 @@ static __always_inline ssize_t mfill_atomic(struct userfaultfd_ctx *ctx,
 	if (!vma_is_anonymous(dst_vma) && !vma_is_shmem(dst_vma))
 		goto out_unlock;
 
-	if (!vma_is_shmem(dst_vma) &&
+	if (!vma_is_shmem(dst_vma) && !vma_is_anonymous(dst_vma) &&
 	    uffd_flags_mode_is(flags, MFILL_ATOMIC_CONTINUE))
 		goto out_unlock;
 
@@ -797,13 +797,16 @@ static __always_inline ssize_t mfill_atomic(struct userfaultfd_ctx *ctx,
 			break;
 		}
 		dst_pmdval = pmdp_get_lockless(dst_pmd);
+		if (unlikely(!pmd_present(dst_pmdval))) {
+			err = -EEXIST;
+			break;
+		}
 		/*
 		 * If the dst_pmd is THP don't override it and just be strict.
 		 * (This includes the case where the PMD used to be THP and
 		 * changed back to none after __pte_alloc().)
 		 */
-		if (unlikely(!pmd_present(dst_pmdval) ||
-			     pmd_trans_huge(dst_pmdval))) {
+		if (unlikely(pmd_trans_huge(dst_pmdval))) {
 			err = -EEXIST;
 			break;
 		}
@@ -996,6 +999,65 @@ int mwriteprotect_range(struct userfaultfd_ctx *ctx, unsigned long start,
 	return err;
 }
 
+int mdeactivate_range(struct userfaultfd_ctx *ctx, unsigned long start,
+		      unsigned long len)
+{
+	struct mm_struct *dst_mm = ctx->mm;
+	unsigned long end = start + len;
+	struct vm_area_struct *dst_vma;
+	long err;
+	VMA_ITERATOR(vmi, dst_mm, start);
+
+	VM_WARN_ON_ONCE(start & ~PAGE_MASK);
+	VM_WARN_ON_ONCE(len & ~PAGE_MASK);
+	VM_WARN_ON_ONCE(start + len <= start);
+
+	guard(mmap_read_lock)(dst_mm);
+	guard(rwsem_read)(&ctx->map_changing_lock);
+
+	if (atomic_read(&ctx->mmap_changing))
+		return -EAGAIN;
+
+	err = -ENOENT;
+	for_each_vma_range(vmi, dst_vma, end) {
+		unsigned long vma_start = max(dst_vma->vm_start, start);
+		unsigned long vma_end = min(dst_vma->vm_end, end);
+
+		if (!userfaultfd_minor(dst_vma)) {
+			err = -ENOENT;
+			break;
+		}
+
+		/*
+		 * Private hugetlb has no page cache to fall back on —
+		 * zapping PTEs would destroy page content.
+		 */
+		if (is_vm_hugetlb_page(dst_vma) &&
+		    !(dst_vma->vm_flags & VM_SHARED)) {
+			err = -EINVAL;
+			break;
+		}
+
+		if (vma_is_anonymous(dst_vma)) {
+			/* Anonymous: set protnone, pages stay resident */
+			struct mmu_gather tlb;
+
+			tlb_gather_mmu(&tlb, dst_mm);
+			err = change_protection(&tlb, dst_vma, vma_start,
+						vma_end,
+						MM_CP_UFFD_DEACTIVATE);
+			tlb_finish_mmu(&tlb);
+			if (err < 0)
+				break;
+		} else {
+			/* Shared shmem/hugetlb: zap PTEs, pages stay in page cache */
+			zap_page_range_single(dst_vma, vma_start,
+					      vma_end - vma_start, NULL);
+		}
+		err = 0;
+	}
+	return err;
+}
 
 void double_pt_lock(spinlock_t *ptl1, spinlock_t *ptl2)
@@ -1988,6 +2050,16 @@ struct vm_area_struct *userfaultfd_clear_vma(struct vma_iterator *vmi,
 	if (userfaultfd_wp(vma))
 		uffd_wp_range(vma, start, end - start, false);
 
+	/* Restore protnone PTEs to normal permissions */
+	if (userfaultfd_minor(vma) && vma_is_anonymous(vma)) {
+		struct mmu_gather tlb;
+
+		tlb_gather_mmu(&tlb, vma->vm_mm);
+		change_protection(&tlb, vma, start, end,
+				  MM_CP_TRY_CHANGE_WRITABLE);
+		tlb_finish_mmu(&tlb);
+	}
+
 	ret = vma_modify_flags_uffd(vmi, prev, vma, start, end,
 				    vma->vm_flags & ~__VM_UFFD_FLAGS,
 				    NULL_VM_UFFD_CTX, give_up_on_oom);
-- 
2.51.2
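
For reviewers, a minimal userspace sketch of the intended call flow.
It assumes a kernel with the full series applied, a uapi header that
exports UFFDIO_DEACTIVATE (the uapi change is not part of this patch),
and that an earlier patch in the series permits MINOR-mode registration
of anonymous memory; error handling is abbreviated:

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <sys/syscall.h>
#include <unistd.h>
#include <linux/userfaultfd.h>

int main(void)
{
	long page = sysconf(_SC_PAGESIZE);
	size_t len = 16 * page;
	struct uffdio_api api = { .api = UFFD_API };
	int uffd = syscall(SYS_userfaultfd, O_CLOEXEC);
	char *area;

	if (uffd < 0 || ioctl(uffd, UFFDIO_API, &api))
		return 1;

	area = mmap(NULL, len, PROT_READ | PROT_WRITE,
		    MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	memset(area, 1, len);	/* populate so there is state to deactivate */

	/* DEACTIVATE is advertised only for MINOR-mode registrations. */
	struct uffdio_register reg = {
		.range = { .start = (unsigned long)area, .len = len },
		.mode = UFFDIO_REGISTER_MODE_MINOR,
	};
	if (ioctl(uffd, UFFDIO_REGISTER, &reg))
		return 1;

	/*
	 * Anonymous pages stay resident (PFNs preserved) but are mapped
	 * protnone; for shared shmem/hugetlbfs the kernel zaps the PTEs
	 * instead and relies on the page cache.
	 */
	struct uffdio_range range = {
		.start = (unsigned long)area,
		.len = len,
	};
	if (ioctl(uffd, UFFDIO_DEACTIVATE, &range))
		perror("UFFDIO_DEACTIVATE");

	return 0;
}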

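After deactivation, the next touch of a deactivated anonymous page is
presumably delivered to the handler as a minor fault (the fault
delivery itself is not in this patch, so that part is an assumption
here); it can then be resolved in place with the existing
UFFDIO_CONTINUE ioctl, which the mfill_atomic() hunk above extends to
anonymous VMAs. A sketch of the resolving side, reusing the uffd set
up in the previous example:

#include <poll.h>
#include <sys/ioctl.h>
#include <unistd.h>
#include <linux/userfaultfd.h>

/*
 * Resolve one deactivation fault. UFFDIO_CONTINUE reinstates the
 * still-resident page, so the old contents reappear without a copy.
 */
static int handle_one_fault(int uffd, long page)
{
	struct pollfd pfd = { .fd = uffd, .events = POLLIN };
	struct uffd_msg msg;

	if (poll(&pfd, 1, -1) < 0 ||
	    read(uffd, &msg, sizeof(msg)) != sizeof(msg))
		return -1;

	if (msg.event != UFFD_EVENT_PAGEFAULT ||
	    !(msg.arg.pagefault.flags & UFFD_PAGEFAULT_FLAG_MINOR))
		return -1;

	struct uffdio_continue cont = {
		.range = {
			.start = msg.arg.pagefault.address & ~(page - 1),
			.len = page,
		},
	};
	return ioctl(uffd, UFFDIO_CONTINUE, &cont);
}

Since anonymous PFNs are preserved, this pairing gives a reclaim-style
deactivate/fault-back-in cycle with no data copied, which appears to be
the point of routing the feature through MINOR mode.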
