When a protnone PTE/PMD fault occurs on a VMA with VM_UFFD_MINOR set,
dispatch to the userfaultfd minor fault path instead of NUMA balancing.
In async mode, resolve the fault inline by restoring the page
permissions; in sync mode, deliver it to the userspace handler via
handle_userfault().

Feed NUMA locality stats from the fault path via task_numa_fault() so
that the scheduler retains placement data even though NUMA scanning is
skipped on these VMAs.

Signed-off-by: Kiryl Shutsemau (Meta) <[email protected]>
Assisted-by: Claude:claude-opus-4-6
---
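For reviewers less familiar with the minor-fault flow, below is a
minimal monitor-side sketch of what sync delivery looks like from
userspace. It uses the existing shmem minor-fault API
(UFFDIO_REGISTER_MODE_MINOR, resolved with UFFDIO_CONTINUE); how
registration and resolution are exposed for the async and anon
protnone-driven paths added by this patch is not visible in this diff,
so treat it purely as an illustration of the handle_userfault() round
trip. Error handling is omitted and the memfd setup is illustrative.

#define _GNU_SOURCE
#include <fcntl.h>
#include <linux/userfaultfd.h>
#include <string.h>
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <sys/syscall.h>
#include <unistd.h>

int main(void)
{
	size_t len = 4096;
	int uffd = syscall(SYS_userfaultfd, O_CLOEXEC);
	struct uffdio_api api = { .api = UFFD_API,
				  .features = UFFD_FEATURE_MINOR_SHMEM };
	struct uffdio_register reg = { 0 };
	struct uffdio_continue cont = { 0 };
	struct uffd_msg msg;
	int memfd = memfd_create("uffd-minor-demo", 0);
	char *mapped;

	ioctl(uffd, UFFDIO_API, &api);
	ftruncate(memfd, len);

	/* Populate the page cache through a first, unregistered mapping. */
	mapped = mmap(NULL, len, PROT_READ | PROT_WRITE, MAP_SHARED, memfd, 0);
	memset(mapped, 0xaa, len);

	/* Second mapping of the same pages, registered for minor faults. */
	mapped = mmap(NULL, len, PROT_READ | PROT_WRITE, MAP_SHARED, memfd, 0);
	reg.range.start = (unsigned long)mapped;
	reg.range.len = len;
	reg.mode = UFFDIO_REGISTER_MODE_MINOR;
	ioctl(uffd, UFFDIO_REGISTER, &reg);

	/*
	 * A thread touching the registered mapping now blocks in
	 * handle_userfault() until the monitor resolves the fault below.
	 */
	read(uffd, &msg, sizeof(msg));
	if (msg.event == UFFD_EVENT_PAGEFAULT &&
	    (msg.arg.pagefault.flags & UFFD_PAGEFAULT_FLAG_MINOR)) {
		cont.range.start = msg.arg.pagefault.address & ~(len - 1);
		cont.range.len = len;
		/* Map the existing page-cache page and wake the faulter. */
		ioctl(uffd, UFFDIO_CONTINUE, &cont);
	}
	return 0;
}

In the async case added by this patch there is no such round trip: the
kernel restores the page permissions inline (see do_uffd_minor_anon()
and do_huge_pmd_uffd_minor() below), feeds task_numa_fault(), and lets
the faulting thread continue without waking a monitor.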
 include/linux/huge_mm.h |  6 +++++
 mm/huge_memory.c        | 24 +++++++++++++++++++
 mm/memory.c             | 51 +++++++++++++++++++++++++++++++++++++++--
 3 files changed, 79 insertions(+), 2 deletions(-)

diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
index a4d9f964dfde..a900bb530998 100644
--- a/include/linux/huge_mm.h
+++ b/include/linux/huge_mm.h
@@ -519,6 +519,7 @@ static inline bool folio_test_pmd_mappable(struct folio *folio)
 }
 
 vm_fault_t do_huge_pmd_numa_page(struct vm_fault *vmf);
+vm_fault_t do_huge_pmd_uffd_minor(struct vm_fault *vmf);
 
 vm_fault_t do_huge_pmd_device_private(struct vm_fault *vmf);
 
@@ -707,6 +708,11 @@ static inline vm_fault_t do_huge_pmd_numa_page(struct vm_fault *vmf)
 	return 0;
 }
 
+static inline vm_fault_t do_huge_pmd_uffd_minor(struct vm_fault *vmf)
+{
+	return 0;
+}
+
 static inline vm_fault_t do_huge_pmd_device_private(struct vm_fault *vmf)
 {
 	return 0;
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 2ad736ff007c..264c646a8573 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -2181,6 +2181,30 @@ static inline bool can_change_pmd_writable(struct vm_area_struct *vma,
 	return pmd_dirty(pmd);
 }
 
+vm_fault_t do_huge_pmd_uffd_minor(struct vm_fault *vmf)
+{
+	struct vm_area_struct *vma = vmf->vma;
+
+	if (userfaultfd_minor_async(vma)) {
+		pmd_t pmd;
+
+		vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd);
+		if (unlikely(!pmd_same(pmdp_get(vmf->pmd), vmf->orig_pmd))) {
+			spin_unlock(vmf->ptl);
+			return 0;
+		}
+		pmd = pmd_modify(vmf->orig_pmd, vma->vm_page_prot);
+		pmd = pmd_mkyoung(pmd);
+		set_pmd_at(vma->vm_mm, vmf->address & HPAGE_PMD_MASK,
+			   vmf->pmd, pmd);
+		update_mmu_cache_pmd(vma, vmf->address, vmf->pmd);
+		spin_unlock(vmf->ptl);
+		return 0;
+	}
+
+	return handle_userfault(vmf, VM_UFFD_MINOR);
+}
+
 /* NUMA hinting page fault entry point for trans huge pmds */
 vm_fault_t do_huge_pmd_numa_page(struct vm_fault *vmf)
 {
diff --git a/mm/memory.c b/mm/memory.c
index c65e82c86fed..f068ff4027e8 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -6045,6 +6045,47 @@ static void numa_rebuild_large_mapping(struct vm_fault *vmf, struct vm_area_stru
 	}
 }
 
+static void uffd_minor_feed_numa_fault(struct vm_fault *vmf)
+{
+	struct folio *folio;
+
+	folio = vm_normal_folio(vmf->vma, vmf->address, vmf->orig_pte);
+	if (folio) {
+		int nid = folio_nid(folio);
+		int flags = 0;
+
+		if (nid == numa_node_id())
+			flags |= TNF_FAULT_LOCAL;
+		task_numa_fault(folio_last_cpupid(folio), nid, 1, flags);
+	}
+}
+
+static vm_fault_t do_uffd_minor_anon(struct vm_fault *vmf)
+{
+	/* Feed NUMA stats even though we skip NUMA scanning on this VMA */
+	uffd_minor_feed_numa_fault(vmf);
+
+	if (userfaultfd_minor_async(vmf->vma)) {
+		pte_t pte;
+
+		spin_lock(vmf->ptl);
+		if (unlikely(!pte_same(ptep_get(vmf->pte), vmf->orig_pte))) {
+			pte_unmap_unlock(vmf->pte, vmf->ptl);
+			return 0;
+		}
+		pte = pte_modify(vmf->orig_pte, vmf->vma->vm_page_prot);
+		pte = pte_mkyoung(pte);
+		set_pte_at(vmf->vma->vm_mm, vmf->address, vmf->pte, pte);
+		update_mmu_cache(vmf->vma, vmf->address, vmf->pte);
+		pte_unmap_unlock(vmf->pte, vmf->ptl);
+		return 0;
+	}
+
+	/* Sync mode: unmap PTE and deliver to userfaultfd handler */
+	pte_unmap(vmf->pte);
+	return handle_userfault(vmf, VM_UFFD_MINOR);
+}
+
 static vm_fault_t do_numa_page(struct vm_fault *vmf)
 {
 	struct vm_area_struct *vma = vmf->vma;
@@ -6319,8 +6360,11 @@ static vm_fault_t handle_pte_fault(struct vm_fault *vmf)
 	if (!pte_present(vmf->orig_pte))
 		return do_swap_page(vmf);
 
-	if (pte_protnone(vmf->orig_pte) && vma_is_accessible(vmf->vma))
+	if (pte_protnone(vmf->orig_pte) && vma_is_accessible(vmf->vma)) {
+		if (userfaultfd_minor(vmf->vma))
+			return do_uffd_minor_anon(vmf);
 		return do_numa_page(vmf);
+	}
 
 	spin_lock(vmf->ptl);
 	entry = vmf->orig_pte;
@@ -6434,8 +6478,11 @@ static vm_fault_t __handle_mm_fault(struct vm_area_struct *vma,
 			return 0;
 		}
 		if (pmd_trans_huge(vmf.orig_pmd)) {
-			if (pmd_protnone(vmf.orig_pmd) && vma_is_accessible(vma))
+			if (pmd_protnone(vmf.orig_pmd) && vma_is_accessible(vma)) {
+				if (userfaultfd_minor(vma))
+					return do_huge_pmd_uffd_minor(&vmf);
 				return do_huge_pmd_numa_page(&vmf);
+			}
 
 			if ((flags & (FAULT_FLAG_WRITE|FAULT_FLAG_UNSHARE)) &&
 			    !pmd_write(vmf.orig_pmd)) {
-- 
2.51.2

