Services that intercept write faults (e.g., for promotion tracking)
need PTEs to stay read-only. This requires preventing mprotect (and
NUMA hint faults) from silently upgrading the PTE and bypassing the
service's handle_fault callback.

Add NP_OPS_PROTECT_WRITE and folio_managed_wrprotect().

In change_pte_range() and change_huge_pmd(), suppress the PTE/PMD
write-upgrade when MM_CP_TRY_CHANGE_WRITABLE is set and the folio is
write-protected. Likewise, do_numa_page() skips the write upgrade, and
remove_migration_pte()/remove_migration_pmd() strip the write bit when
restoring migration entries.

In handle_pte_fault() and do_huge_pmd_wp_page(), dispatch to the node's
ops->handle_fault callback when set, allowing the service to handle write
faults with promotion or other custom logic.
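
As an illustration (not part of this patch), a minimal callback that
honors the locking contract might look like the sketch below; the
callback name and the promotion helper are hypothetical:

  static vm_fault_t my_handle_fault(struct folio *folio,
                                    struct vm_fault *vmf,
                                    enum pgtable_level level)
  {
          folio_get(folio);

          /* The callback must release PTL on all paths. */
          if (level == PGTABLE_LEVEL_PTE)
                  pte_unmap_unlock(vmf->pte, vmf->ptl);
          else
                  spin_unlock(vmf->ptl);

          my_promote_folio(folio);  /* hypothetical: migrate to DRAM */
          folio_put(folio);

          /* Fault is considered handled; the access retries against
           * whatever mapping the promotion installed. */
          return 0;
  }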

NP_OPS_MEMPOLICY is incompatible with NP_OPS_PROTECT_WRITE to avoid the
footgun of binding a writable VMA to a write-protected node.
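
A service opting in would register roughly as follows (a sketch; it
assumes the ops structure is otherwise filled in as in the preceding
patches, and reuses the hypothetical my_handle_fault from above):

  static const struct node_private_ops my_ops = {
          /* ... migration callbacks from earlier patches ... */
          .handle_fault = my_handle_fault,
          /* adding NP_OPS_MEMPOLICY here would be rejected (-EINVAL) */
          .flags        = NP_OPS_MIGRATION | NP_OPS_PROTECT_WRITE,
  };

  int err = node_private_set_ops(nid, &my_ops);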

Signed-off-by: Gregory Price <[email protected]>
---
 drivers/base/node.c          |  4 ++
 include/linux/node_private.h | 22 ++++++++
 mm/huge_memory.c             | 16 +++++-
 mm/internal.h                | 98 ++++++++++++++++++++++++++++++++++++
 mm/memory.c                  | 15 ++++++
 mm/migrate.c                 | 14 +----
 mm/mprotect.c                |  4 +-
 7 files changed, 157 insertions(+), 16 deletions(-)

diff --git a/drivers/base/node.c b/drivers/base/node.c
index c08b5a948779..a4955b9b5b93 100644
--- a/drivers/base/node.c
+++ b/drivers/base/node.c
@@ -957,6 +957,10 @@ int node_private_set_ops(int nid, const struct node_private_ops *ops)
            !(ops->flags & NP_OPS_MIGRATION))
                return -EINVAL;
 
+       if ((ops->flags & NP_OPS_MEMPOLICY) &&
+           (ops->flags & NP_OPS_PROTECT_WRITE))
+               return -EINVAL;
+
        mutex_lock(&node_private_lock);
        np = rcu_dereference_protected(NODE_DATA(nid)->node_private,
                                       lockdep_is_held(&node_private_lock));
diff --git a/include/linux/node_private.h b/include/linux/node_private.h
index e254e36056cd..27d6e5d84e61 100644
--- a/include/linux/node_private.h
+++ b/include/linux/node_private.h
@@ -70,6 +70,24 @@ struct vm_fault;
  *     PFN-based metadata (compression tables, device page tables, DMA
  *     mappings, etc.) before any access through the page tables.
  *
+ * @handle_fault: Handle a fault on a folio on this private node.
+ *   [folio-referenced callback, PTL held on entry]
+ *
+ *   Called from handle_pte_fault() (PTE level) or do_huge_pmd_wp_page()
+ *   (PMD level) after lock acquisition and entry verification.
+ *   @folio is the faulting folio, @level indicates the page table level.
+ *
+ *   For PGTABLE_LEVEL_PTE: vmf->pte is mapped and vmf->ptl is the
+ *   PTE lock.  Release via pte_unmap_unlock(vmf->pte, vmf->ptl).
+ *
+ *   For PGTABLE_LEVEL_PMD: vmf->pte is NULL and vmf->ptl is the
+ *   PMD lock.  Release via spin_unlock(vmf->ptl).
+ *
+ *   The callback MUST release PTL on ALL paths.
+ *   The caller will NOT touch the page table entry after this returns.
+ *
+ *   Returns: vm_fault_t result (0, VM_FAULT_RETRY, etc.)
+ *
  * @flags: Operation exclusion flags (NP_OPS_* constants).
  *
  */
@@ -81,6 +99,8 @@ struct node_private_ops {
                                  enum migrate_reason reason,
                                  unsigned int *nr_succeeded);
        void (*folio_migrate)(struct folio *src, struct folio *dst);
+       vm_fault_t (*handle_fault)(struct folio *folio, struct vm_fault *vmf,
+                                  enum pgtable_level level);
        unsigned long flags;
 };
 
@@ -90,6 +110,8 @@ struct node_private_ops {
 #define NP_OPS_MEMPOLICY               BIT(1)
 /* Node participates as a demotion target in memory-tiers */
 #define NP_OPS_DEMOTION                        BIT(2)
+/* Prevent mprotect/NUMA from upgrading PTEs to writable on this node */
+#define NP_OPS_PROTECT_WRITE           BIT(3)
 
 /**
  * struct node_private - Per-node container for N_MEMORY_PRIVATE nodes
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 2ecae494291a..d9ba6593244d 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -2063,12 +2063,13 @@ vm_fault_t do_huge_pmd_wp_page(struct vm_fault *vmf)
        struct page *page;
        unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
        pmd_t orig_pmd = vmf->orig_pmd;
+       vm_fault_t ret;
 
        vmf->ptl = pmd_lockptr(vma->vm_mm, vmf->pmd);
        VM_BUG_ON_VMA(!vma->anon_vma, vma);
 
        if (is_huge_zero_pmd(orig_pmd)) {
-               vm_fault_t ret = do_huge_zero_wp_pmd(vmf);
+               ret = do_huge_zero_wp_pmd(vmf);
 
                if (!(ret & VM_FAULT_FALLBACK))
                        return ret;
@@ -2088,6 +2089,13 @@ vm_fault_t do_huge_pmd_wp_page(struct vm_fault *vmf)
        folio = page_folio(page);
        VM_BUG_ON_PAGE(!PageHead(page), page);
 
+       /* Private-managed write-protect: let the service handle the fault */
+       if (unlikely(folio_is_private_managed(folio))) {
+               if (folio_managed_handle_fault(folio, vmf,
+                                             PGTABLE_LEVEL_PMD, &ret))
+                       return ret;
+       }
+
        /* Early check when only holding the PT lock. */
        if (PageAnonExclusive(page))
                goto reuse;
@@ -2633,7 +2641,8 @@ int change_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
 
        /* See change_pte_range(). */
        if ((cp_flags & MM_CP_TRY_CHANGE_WRITABLE) && !pmd_write(entry) &&
-           can_change_pmd_writable(vma, addr, entry))
+           can_change_pmd_writable(vma, addr, entry) &&
+           !folio_managed_wrprotect(pmd_folio(entry)))
                entry = pmd_mkwrite(entry, vma);
 
        ret = HPAGE_PMD_NR;
@@ -4943,6 +4952,9 @@ void remove_migration_pmd(struct page_vma_mapped_walk *pvmw, struct page *new)
        if (folio_test_dirty(folio) && softleaf_is_migration_dirty(entry))
                pmde = pmd_mkdirty(pmde);
 
+       if (folio_managed_wrprotect(folio))
+               pmde = pmd_wrprotect(pmde);
+
        if (folio_is_device_private(folio)) {
                swp_entry_t entry;
 
diff --git a/mm/internal.h b/mm/internal.h
index 5950e20d4023..ae4ff86e8dc6 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -11,6 +11,7 @@
 #include <linux/khugepaged.h>
 #include <linux/mm.h>
 #include <linux/mm_inline.h>
+#include <linux/node_private.h>
 #include <linux/pagemap.h>
 #include <linux/pagewalk.h>
 #include <linux/rmap.h>
@@ -1449,6 +1450,103 @@ static inline bool folio_managed_on_free(struct folio *folio)
        return false;
 }
 
+/**
+ * folio_managed_handle_fault - Dispatch fault on managed-memory folio
+ * @folio: the faulting folio (must not be NULL)
+ * @vmf: the vm_fault descriptor (PTL held: vmf->ptl locked)
+ * @level: page table level (PGTABLE_LEVEL_PTE or PGTABLE_LEVEL_PMD)
+ * @ret: output fault result if handled
+ *
+ * Called with PTL held.  If a handle_fault callback exists, it is invoked
+ * with PTL still held.  The callback is responsible for releasing PTL on
+ * all paths.
+ *
+ * Returns true if the service handled the fault (PTL released by callback,
+ * caller returns *ret).  Returns false if no handler exists (PTL still held,
+ * caller continues with normal fault handling).
+ */
+static inline bool folio_managed_handle_fault(struct folio *folio,
+                                             struct vm_fault *vmf,
+                                             enum pgtable_level level,
+                                             vm_fault_t *ret)
+{
+       /* Zone device pages use swap entries; handled in do_swap_page */
+       if (folio_is_zone_device(folio))
+               return false;
+
+       if (folio_is_private_node(folio)) {
+               const struct node_private_ops *ops =
+                       folio_node_private_ops(folio);
+
+               if (ops && ops->handle_fault) {
+                       *ret = ops->handle_fault(folio, vmf, level);
+                       return true;
+               }
+       }
+       return false;
+}
+
+/**
+ * folio_managed_wrprotect - Should this folio's mappings stay write-protected?
+ * @folio: the folio to check
+ *
+ * Returns true if the folio is on a private node with NP_OPS_PROTECT_WRITE,
+ * meaning page table entries (PTE or PMD) should not be made writable.
+ * Write faults are intercepted by the service's handle_fault callback
+ * to promote the folio to DRAM.
+ *
+ * Used by:
+ *   - change_pte_range() / change_huge_pmd(): prevent mprotect write-upgrade
+ *   - remove_migration_pte() / remove_migration_pmd(): strip write after migration
+ *   - do_huge_pmd_wp_page(): dispatch to fault handler instead of reuse
+ */
+static inline bool folio_managed_wrprotect(struct folio *folio)
+{
+       return unlikely(folio_is_private_node(folio) &&
+                       folio_private_flags(folio, NP_OPS_PROTECT_WRITE));
+}
+
+/**
+ * folio_managed_fixup_migration_pte - Fixup PTE after migration for
+ *                                     managed memory pages.
+ * @new: the destination page
+ * @pte: the PTE being installed (normal PTE built by caller)
+ * @old_pte: the original PTE (before migration, for swap entry flags)
+ * @vma: the VMA
+ *
+ * For MEMORY_DEVICE_PRIVATE pages: replaces the PTE with a device-private
+ * swap entry, preserving soft_dirty and uffd_wp from old_pte.
+ *
+ * For N_MEMORY_PRIVATE pages with NP_OPS_PROTECT_WRITE: strips the write
+ * bit so the next write triggers the fault handler for promotion.
+ *
+ * For normal pages: returns pte unmodified.
+ */
+static inline pte_t folio_managed_fixup_migration_pte(struct page *new,
+                                                     pte_t pte,
+                                                     pte_t old_pte,
+                                                     struct vm_area_struct *vma)
+{
+       if (unlikely(is_device_private_page(new))) {
+               softleaf_t entry;
+
+               if (pte_write(pte))
+                       entry = make_writable_device_private_entry(
+                                               page_to_pfn(new));
+               else
+                       entry = make_readable_device_private_entry(
+                                               page_to_pfn(new));
+               pte = softleaf_to_pte(entry);
+               if (pte_swp_soft_dirty(old_pte))
+                       pte = pte_swp_mksoft_dirty(pte);
+               if (pte_swp_uffd_wp(old_pte))
+                       pte = pte_swp_mkuffd_wp(pte);
+       } else if (folio_managed_wrprotect(page_folio(new))) {
+               pte = pte_wrprotect(pte);
+       }
+       return pte;
+}
+
 /**
  * folio_managed_migrate_notify - Notify service that a folio changed location
  * @src: the old folio (about to be freed)
diff --git a/mm/memory.c b/mm/memory.c
index 2a55edc48a65..0f78988befef 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -6079,6 +6079,10 @@ static vm_fault_t do_numa_page(struct vm_fault *vmf)
         * Make it present again, depending on how arch implements
         * non-accessible ptes, some can allow access by kernel mode.
         */
+       if (unlikely(folio && folio_managed_wrprotect(folio))) {
+               writable = false;
+               ignore_writable = true;
+       }
        if (folio && folio_test_large(folio))
                numa_rebuild_large_mapping(vmf, vma, folio, pte, ignore_writable,
                                           pte_write_upgrade);
@@ -6228,6 +6232,7 @@ static void fix_spurious_fault(struct vm_fault *vmf,
  */
 static vm_fault_t handle_pte_fault(struct vm_fault *vmf)
 {
+       struct folio *folio;
        pte_t entry;
 
        if (unlikely(pmd_none(*vmf->pmd))) {
@@ -6284,6 +6289,16 @@ static vm_fault_t handle_pte_fault(struct vm_fault *vmf)
                update_mmu_tlb(vmf->vma, vmf->address, vmf->pte);
                goto unlock;
        }
+
+       folio = vm_normal_folio(vmf->vma, vmf->address, entry);
+       if (unlikely(folio && folio_is_private_managed(folio))) {
+               vm_fault_t fault_ret;
+
+               if (folio_managed_handle_fault(folio, vmf, PGTABLE_LEVEL_PTE,
+                                              &fault_ret))
+                       return fault_ret;
+       }
+
        if (vmf->flags & (FAULT_FLAG_WRITE|FAULT_FLAG_UNSHARE)) {
                if (!pte_write(entry))
                        return do_wp_page(vmf);
diff --git a/mm/migrate.c b/mm/migrate.c
index a54d4af04df3..f632e8b03504 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -398,19 +398,7 @@ static bool remove_migration_pte(struct folio *folio,
                if (folio_test_anon(folio) && !softleaf_is_migration_read(entry))
                        rmap_flags |= RMAP_EXCLUSIVE;
 
-               if (unlikely(is_device_private_page(new))) {
-                       if (pte_write(pte))
-                               entry = make_writable_device_private_entry(
-                                                       page_to_pfn(new));
-                       else
-                               entry = make_readable_device_private_entry(
-                                                       page_to_pfn(new));
-                       pte = softleaf_to_pte(entry);
-                       if (pte_swp_soft_dirty(old_pte))
-                               pte = pte_swp_mksoft_dirty(pte);
-                       if (pte_swp_uffd_wp(old_pte))
-                               pte = pte_swp_mkuffd_wp(pte);
-               }
+               pte = folio_managed_fixup_migration_pte(new, pte, old_pte, vma);
 
 #ifdef CONFIG_HUGETLB_PAGE
                if (folio_test_hugetlb(folio)) {
diff --git a/mm/mprotect.c b/mm/mprotect.c
index 283889e4f1ce..830be609bc24 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -30,6 +30,7 @@
 #include <linux/mm_inline.h>
 #include <linux/pgtable.h>
 #include <linux/userfaultfd_k.h>
+#include <linux/node_private.h>
 #include <uapi/linux/mman.h>
 #include <asm/cacheflush.h>
 #include <asm/mmu_context.h>
@@ -290,7 +291,8 @@ static long change_pte_range(struct mmu_gather *tlb,
                         * COW or special handling is required.
                         */
                        if ((cp_flags & MM_CP_TRY_CHANGE_WRITABLE) &&
-                            !pte_write(ptent))
+                            !pte_write(ptent) &&
+                            !(folio && folio_managed_wrprotect(folio)))
                                set_write_prot_commit_flush_ptes(vma, folio, page,
                                addr, pte, oldpte, ptent, nr_ptes, tlb);
                        else
-- 
2.53.0

