Services that intercept write faults (e.g., for promotion tracking) need
PTEs to stay read-only. This requires preventing mprotect from silently
upgrading the PTE, which would bypass the service's handle_fault
callback.
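For illustration only (not part of this patch), a service's callback
could look roughly like the sketch below, following the locking contract
documented for @handle_fault; the my_*() helpers are hypothetical
placeholders:

	/* Illustrative sketch; my_*() helpers are placeholders. */
	static vm_fault_t my_handle_fault(struct folio *folio,
					  struct vm_fault *vmf,
					  enum pgtable_level level)
	{
		/* Note the write access while the PTL still pins the entry. */
		my_record_write(folio);

		/* The callback must release the PTL on every path. */
		if (level == PGTABLE_LEVEL_PTE)
			pte_unmap_unlock(vmf->pte, vmf->ptl);
		else
			spin_unlock(vmf->ptl);

		/*
		 * Queue promotion outside the lock; returning 0 lets the
		 * access retry (and fault again) until the folio has moved.
		 */
		my_queue_promotion(folio);
		return 0;
	}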
Add NP_OPS_PROTECT_WRITE and folio_managed_wrprotect(). In
change_pte_range() and change_huge_pmd(), suppress the PTE write-upgrade
when MM_CP_TRY_CHANGE_WRITABLE is set but the folio is write-protected.
In handle_pte_fault() and do_huge_pmd_wp_page(), dispatch to the node's
ops->handle_fault callback when set, allowing the service to handle
write faults with promotion or other custom logic.

NP_OPS_MEMPOLICY is incompatible with NP_OPS_PROTECT_WRITE to avoid the
footgun of binding a writable VMA to a write-protected node.

Signed-off-by: Gregory Price <[email protected]>
---
 drivers/base/node.c          |  4 ++
 include/linux/node_private.h | 22 ++++++++
 mm/huge_memory.c             | 17 ++++++-
 mm/internal.h                | 98 +++++++++++++++++++++++++++++++++++
 mm/memory.c                  | 15 ++++++
 mm/migrate.c                 | 14 +----
 mm/mprotect.c                |  4 +-
 7 files changed, 158 insertions(+), 16 deletions(-)

diff --git a/drivers/base/node.c b/drivers/base/node.c
index c08b5a948779..a4955b9b5b93 100644
--- a/drivers/base/node.c
+++ b/drivers/base/node.c
@@ -957,6 +957,10 @@ int node_private_set_ops(int nid, const struct node_private_ops *ops)
 	    !(ops->flags & NP_OPS_MIGRATION))
 		return -EINVAL;
 
+	if ((ops->flags & NP_OPS_MEMPOLICY) &&
+	    (ops->flags & NP_OPS_PROTECT_WRITE))
+		return -EINVAL;
+
 	mutex_lock(&node_private_lock);
 	np = rcu_dereference_protected(NODE_DATA(nid)->node_private,
 				       lockdep_is_held(&node_private_lock));
diff --git a/include/linux/node_private.h b/include/linux/node_private.h
index e254e36056cd..27d6e5d84e61 100644
--- a/include/linux/node_private.h
+++ b/include/linux/node_private.h
@@ -70,6 +70,24 @@ struct vm_fault;
  * PFN-based metadata (compression tables, device page tables, DMA
  * mappings, etc.) before any access through the page tables.
  *
+ * @handle_fault: Handle a fault on a folio on this private node.
+ * [folio-referenced callback, PTL held on entry]
+ *
+ * Called from handle_pte_fault() (PTE level) or do_huge_pmd_wp_page()
+ * (PMD level) after lock acquisition and entry verification.
+ * @folio is the faulting folio, @level indicates the page table level.
+ *
+ * For PGTABLE_LEVEL_PTE: vmf->pte is mapped and vmf->ptl is the
+ * PTE lock. Release via pte_unmap_unlock(vmf->pte, vmf->ptl).
+ *
+ * For PGTABLE_LEVEL_PMD: vmf->pte is NULL and vmf->ptl is the
+ * PMD lock. Release via spin_unlock(vmf->ptl).
+ *
+ * The callback MUST release the PTL on ALL paths.
+ * The caller will NOT touch the page table entry after this returns.
+ *
+ * Returns: vm_fault_t result (0, VM_FAULT_RETRY, etc.)
+ *
  * @flags: Operation exclusion flags (NP_OPS_* constants).
 *
 */
@@ -81,6 +99,8 @@ struct node_private_ops {
 			      enum migrate_reason reason,
 			      unsigned int *nr_succeeded);
 	void (*folio_migrate)(struct folio *src, struct folio *dst);
+	vm_fault_t (*handle_fault)(struct folio *folio, struct vm_fault *vmf,
+				   enum pgtable_level level);
 	unsigned long flags;
 };
 
@@ -90,6 +110,8 @@ struct node_private_ops {
 #define NP_OPS_MEMPOLICY BIT(1)
 /* Node participates as a demotion target in memory-tiers */
 #define NP_OPS_DEMOTION BIT(2)
+/* Prevent mprotect/NUMA from upgrading PTEs to writable on this node */
+#define NP_OPS_PROTECT_WRITE BIT(3)
 
 /**
  * struct node_private - Per-node container for N_MEMORY_PRIVATE nodes
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 2ecae494291a..d9ba6593244d 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -2063,12 +2063,14 @@ vm_fault_t do_huge_pmd_wp_page(struct vm_fault *vmf)
 	struct page *page;
 	unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
 	pmd_t orig_pmd = vmf->orig_pmd;
+	vm_fault_t ret;
+
 	vmf->ptl = pmd_lockptr(vma->vm_mm, vmf->pmd);
 	VM_BUG_ON_VMA(!vma->anon_vma, vma);
 
 	if (is_huge_zero_pmd(orig_pmd)) {
-		vm_fault_t ret = do_huge_zero_wp_pmd(vmf);
+		ret = do_huge_zero_wp_pmd(vmf);
 
 		if (!(ret & VM_FAULT_FALLBACK))
 			return ret;
@@ -2088,6 +2090,13 @@ vm_fault_t do_huge_pmd_wp_page(struct vm_fault *vmf)
 	folio = page_folio(page);
 	VM_BUG_ON_PAGE(!PageHead(page), page);
 
+	/* Private-managed write-protect: let the service handle the fault */
+	if (unlikely(folio_is_private_managed(folio))) {
+		if (folio_managed_handle_fault(folio, vmf,
+					       PGTABLE_LEVEL_PMD, &ret))
+			return ret;
+	}
+
 	/* Early check when only holding the PT lock. */
 	if (PageAnonExclusive(page))
 		goto reuse;
@@ -2633,7 +2642,8 @@ int change_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
 	/* See change_pte_range(). */
 	if ((cp_flags & MM_CP_TRY_CHANGE_WRITABLE) &&
 	    !pmd_write(entry) &&
-	    can_change_pmd_writable(vma, addr, entry))
+	    can_change_pmd_writable(vma, addr, entry) &&
+	    !folio_managed_wrprotect(pmd_folio(entry)))
 		entry = pmd_mkwrite(entry, vma);
 
 	ret = HPAGE_PMD_NR;
@@ -4943,6 +4953,9 @@ void remove_migration_pmd(struct page_vma_mapped_walk *pvmw, struct page *new)
 	if (folio_test_dirty(folio) && softleaf_is_migration_dirty(entry))
 		pmde = pmd_mkdirty(pmde);
 
+	if (folio_managed_wrprotect(folio))
+		pmde = pmd_wrprotect(pmde);
+
 	if (folio_is_device_private(folio)) {
 		swp_entry_t entry;
diff --git a/mm/internal.h b/mm/internal.h
index 5950e20d4023..ae4ff86e8dc6 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -11,6 +11,7 @@
 #include <linux/khugepaged.h>
 #include <linux/mm.h>
 #include <linux/mm_inline.h>
+#include <linux/node_private.h>
 #include <linux/pagemap.h>
 #include <linux/pagewalk.h>
 #include <linux/rmap.h>
@@ -1449,6 +1450,103 @@ static inline bool folio_managed_on_free(struct folio *folio)
 	return false;
 }
 
+/**
+ * folio_managed_handle_fault - Dispatch fault on managed-memory folio
+ * @folio: the faulting folio (must not be NULL)
+ * @vmf: the vm_fault descriptor (PTL held: vmf->ptl locked)
+ * @level: page table level (PGTABLE_LEVEL_PTE or PGTABLE_LEVEL_PMD)
+ * @ret: output fault result if handled
+ *
+ * Called with PTL held. If a handle_fault callback exists, it is invoked
+ * with PTL still held. The callback is responsible for releasing PTL on
+ * all paths.
+ *
+ * Returns true if the service handled the fault (PTL released by callback,
+ * caller returns *ret). Returns false if no handler exists (PTL still held,
+ * caller continues with normal fault handling).
+ */
+static inline bool folio_managed_handle_fault(struct folio *folio,
+					      struct vm_fault *vmf,
+					      enum pgtable_level level,
+					      vm_fault_t *ret)
+{
+	/* Zone device pages use swap entries; handled in do_swap_page */
+	if (folio_is_zone_device(folio))
+		return false;
+
+	if (folio_is_private_node(folio)) {
+		const struct node_private_ops *ops =
+			folio_node_private_ops(folio);
+
+		if (ops && ops->handle_fault) {
+			*ret = ops->handle_fault(folio, vmf, level);
+			return true;
+		}
+	}
+	return false;
+}
+
+/**
+ * folio_managed_wrprotect - Should this folio's mappings stay write-protected?
+ * @folio: the folio to check
+ *
+ * Returns true if the folio is on a private node with NP_OPS_PROTECT_WRITE,
+ * meaning page table entries (PTE or PMD) should not be made writable.
+ * Write faults are intercepted by the service's handle_fault callback
+ * to promote the folio to DRAM.
+ *
+ * Used by:
+ * - change_pte_range() / change_huge_pmd(): prevent mprotect write-upgrade
+ * - remove_migration_pte() / remove_migration_pmd(): strip write after migration
+ * - do_huge_pmd_wp_page(): dispatch to fault handler instead of reuse
+ */
+static inline bool folio_managed_wrprotect(struct folio *folio)
+{
+	return unlikely(folio_is_private_node(folio) &&
+			folio_private_flags(folio, NP_OPS_PROTECT_WRITE));
+}
+
+/**
+ * folio_managed_fixup_migration_pte - Fixup PTE after migration for
+ *				       managed memory pages.
+ * @new: the destination page
+ * @pte: the PTE being installed (normal PTE built by caller)
+ * @old_pte: the original PTE (before migration, for swap entry flags)
+ * @vma: the VMA
+ *
+ * For MEMORY_DEVICE_PRIVATE pages: replaces the PTE with a device-private
+ * swap entry, preserving soft_dirty and uffd_wp from old_pte.
+ *
+ * For N_MEMORY_PRIVATE pages with NP_OPS_PROTECT_WRITE: strips the write
+ * bit so the next write triggers the fault handler for promotion.
+ *
+ * For normal pages: returns pte unmodified.
+ */
+static inline pte_t folio_managed_fixup_migration_pte(struct page *new,
+						      pte_t pte,
+						      pte_t old_pte,
+						      struct vm_area_struct *vma)
+{
+	if (unlikely(is_device_private_page(new))) {
+		softleaf_t entry;
+
+		if (pte_write(pte))
+			entry = make_writable_device_private_entry(
+						page_to_pfn(new));
+		else
+			entry = make_readable_device_private_entry(
+						page_to_pfn(new));
+		pte = softleaf_to_pte(entry);
+		if (pte_swp_soft_dirty(old_pte))
+			pte = pte_swp_mksoft_dirty(pte);
+		if (pte_swp_uffd_wp(old_pte))
+			pte = pte_swp_mkuffd_wp(pte);
+	} else if (folio_managed_wrprotect(page_folio(new))) {
+		pte = pte_wrprotect(pte);
+	}
+	return pte;
+}
+
 /**
  * folio_managed_migrate_notify - Notify service that a folio changed location
  * @src: the old folio (about to be freed)
diff --git a/mm/memory.c b/mm/memory.c
index 2a55edc48a65..0f78988befef 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -6079,6 +6079,10 @@ static vm_fault_t do_numa_page(struct vm_fault *vmf)
 	 * Make it present again, depending on how arch implements
 	 * non-accessible ptes, some can allow access by kernel mode.
 	 */
+	if (unlikely(folio && folio_managed_wrprotect(folio))) {
+		writable = false;
+		ignore_writable = true;
+	}
 	if (folio && folio_test_large(folio))
 		numa_rebuild_large_mapping(vmf, vma, folio, pte,
 					   ignore_writable, pte_write_upgrade);
@@ -6228,6 +6232,7 @@ static void fix_spurious_fault(struct vm_fault *vmf,
  */
 static vm_fault_t handle_pte_fault(struct vm_fault *vmf)
 {
+	struct folio *folio;
 	pte_t entry;
 
 	if (unlikely(pmd_none(*vmf->pmd))) {
@@ -6284,6 +6289,16 @@ static vm_fault_t handle_pte_fault(struct vm_fault *vmf)
 		update_mmu_tlb(vmf->vma, vmf->address, vmf->pte);
 		goto unlock;
 	}
+
+	folio = vm_normal_folio(vmf->vma, vmf->address, entry);
+	if (unlikely(folio && folio_is_private_managed(folio))) {
+		vm_fault_t fault_ret;
+
+		if (folio_managed_handle_fault(folio, vmf, PGTABLE_LEVEL_PTE,
+					       &fault_ret))
+			return fault_ret;
+	}
+
 	if (vmf->flags & (FAULT_FLAG_WRITE|FAULT_FLAG_UNSHARE)) {
 		if (!pte_write(entry))
 			return do_wp_page(vmf);
diff --git a/mm/migrate.c b/mm/migrate.c
index a54d4af04df3..f632e8b03504 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -398,19 +398,7 @@ static bool remove_migration_pte(struct folio *folio,
 		if (folio_test_anon(folio) && !softleaf_is_migration_read(entry))
 			rmap_flags |= RMAP_EXCLUSIVE;
 
-		if (unlikely(is_device_private_page(new))) {
-			if (pte_write(pte))
-				entry = make_writable_device_private_entry(
-							page_to_pfn(new));
-			else
-				entry = make_readable_device_private_entry(
-							page_to_pfn(new));
-			pte = softleaf_to_pte(entry);
-			if (pte_swp_soft_dirty(old_pte))
-				pte = pte_swp_mksoft_dirty(pte);
-			if (pte_swp_uffd_wp(old_pte))
-				pte = pte_swp_mkuffd_wp(pte);
-		}
+		pte = folio_managed_fixup_migration_pte(new, pte, old_pte, vma);
 
 #ifdef CONFIG_HUGETLB_PAGE
 	if (folio_test_hugetlb(folio)) {
diff --git a/mm/mprotect.c b/mm/mprotect.c
index 283889e4f1ce..830be609bc24 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -30,6 +30,7 @@
 #include <linux/mm_inline.h>
 #include <linux/pgtable.h>
 #include <linux/userfaultfd_k.h>
+#include <linux/node_private.h>
 #include <uapi/linux/mman.h>
 #include <asm/cacheflush.h>
 #include <asm/mmu_context.h>
@@ -290,7 +291,8 @@ static long change_pte_range(struct mmu_gather *tlb,
 			 * COW or special handling is required.
 			 */
 			if ((cp_flags & MM_CP_TRY_CHANGE_WRITABLE) &&
-			    !pte_write(ptent))
+			    !pte_write(ptent) &&
+			    !(folio && folio_managed_wrprotect(folio)))
 				set_write_prot_commit_flush_ptes(vma, folio, page,
 						addr, pte, oldpte, ptent, nr_ptes, tlb);
 			else
-- 
2.53.0
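As a usage sketch (illustrative only, not part of this patch; the my_*
names are placeholders and other required callbacks are elided), a
service would pair NP_OPS_PROTECT_WRITE with NP_OPS_MIGRATION and leave
NP_OPS_MEMPOLICY clear, since node_private_set_ops() rejects that
combination:

	static const struct node_private_ops my_service_ops = {
		/* ... migration callbacks elided ... */
		.handle_fault	= my_handle_fault,
		/* NP_OPS_MEMPOLICY must stay clear with PROTECT_WRITE */
		.flags		= NP_OPS_MIGRATION | NP_OPS_PROTECT_WRITE,
	};

	static int my_service_attach(int nid)
	{
		/* Returns -EINVAL for invalid flag combinations. */
		return node_private_set_ops(nid, &my_service_ops);
	}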
