Not all private nodes want to participate in NUMA balancing hint faults
and migration.

Add the NP_OPS_NUMA_BALANCING flag (BIT(5)) as an opt-in mechanism. Since
NUMA balancing may migrate folios, the flag requires NP_OPS_MIGRATION;
node_private_set_ops() rejects it with -EINVAL otherwise.
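
A minimal opt-in sketch (the ops instance, "err", and the registration
site are hypothetical; node_private_set_ops() and the NP_OPS_* flags are
the ones added by this series):

    static const struct node_private_ops my_private_ops = {
            .flags = NP_OPS_MIGRATION | NP_OPS_NUMA_BALANCING,
            /* other callbacks (e.g. handle_fault) as needed */
    };

    /* fails with -EINVAL if NUMA_BALANCING is set without MIGRATION */
    err = node_private_set_ops(nid, &my_private_ops);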

Introduce a folio_managed_allows_numa() helper:
 - ZONE_DEVICE folios always return false (never NUMA-scanned)
 - private-node folios return true only if NP_OPS_NUMA_BALANCING is set
 - all other folios return true
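
Illustratively, a caller checks it where a bare folio_is_zone_device()
test used to sit (this mirrors the mm/mempolicy.c hunk below):

    if (unlikely(!folio_managed_allows_numa(folio)))
            return false;   /* keep the folio out of PROT_NUMA scanning */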

In do_numa_page(), if a private-node folio with NP_OPS_PROTECT_WRITE is
still on its node after a failed or skipped migration, re-apply
write-protection so that the next write triggers handle_fault.
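
Sketched out (hedged: handle_fault refers to the per-node callback from
earlier in the series, and the do_numa_page() locals are abbreviated):

    /* migration failed or was skipped; folio remains on its private node */
    if (unlikely(folio && folio_managed_wrprotect(folio))) {
            /*
             * Keep the mapping readable but not writable so that the
             * next write faults back into the node's handle_fault path.
             */
            writable = false;
    }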

Signed-off-by: Gregory Price <[email protected]>
---
 drivers/base/node.c          |  4 ++++
 include/linux/node_private.h | 16 ++++++++++++++++
 mm/memory.c                  | 11 +++++++++++
 mm/mempolicy.c               |  5 ++++-
 4 files changed, 35 insertions(+), 1 deletion(-)

diff --git a/drivers/base/node.c b/drivers/base/node.c
index a4955b9b5b93..88aaac45e814 100644
--- a/drivers/base/node.c
+++ b/drivers/base/node.c
@@ -961,6 +961,10 @@ int node_private_set_ops(int nid, const struct node_private_ops *ops)
            (ops->flags & NP_OPS_PROTECT_WRITE))
                return -EINVAL;
 
+       if ((ops->flags & NP_OPS_NUMA_BALANCING) &&
+           !(ops->flags & NP_OPS_MIGRATION))
+               return -EINVAL;
+
        mutex_lock(&node_private_lock);
        np = rcu_dereference_protected(NODE_DATA(nid)->node_private,
                                       lockdep_is_held(&node_private_lock));
diff --git a/include/linux/node_private.h b/include/linux/node_private.h
index 34d862f09e24..5ac60db1f044 100644
--- a/include/linux/node_private.h
+++ b/include/linux/node_private.h
@@ -140,6 +140,8 @@ struct node_private_ops {
 #define NP_OPS_PROTECT_WRITE           BIT(3)
 /* Kernel reclaim (kswapd, direct reclaim, OOM) operates on this node */
 #define NP_OPS_RECLAIM                 BIT(4)
+/* Allow NUMA balancing to scan and migrate folios on this node */
+#define NP_OPS_NUMA_BALANCING          BIT(5)
 
 /* Private node is OOM-eligible: reclaim can run and pages can be demoted here */
 #define NP_OPS_OOM_ELIGIBLE            (NP_OPS_RECLAIM | NP_OPS_DEMOTION)
@@ -263,6 +265,15 @@ static inline void folio_managed_split_cb(struct folio *original_folio,
 }
 
 #ifdef CONFIG_MEMORY_HOTPLUG
+static inline bool folio_managed_allows_numa(struct folio *folio)
+{
+       if (folio_is_zone_device(folio))
+               return false;
+       if (!folio_is_private_managed(folio))
+               return true;
+       return folio_private_flags(folio, NP_OPS_NUMA_BALANCING);
+}
+
 static inline int folio_managed_allows_user_migrate(struct folio *folio)
 {
        if (folio_is_zone_device(folio))
@@ -443,6 +454,11 @@ int node_private_clear_ops(int nid, const struct node_private_ops *ops);
 
 #else /* !CONFIG_NUMA || !CONFIG_MEMORY_HOTPLUG */
 
+static inline bool folio_managed_allows_numa(struct folio *folio)
+{
+       return !folio_is_zone_device(folio);
+}
+
 static inline int folio_managed_allows_user_migrate(struct folio *folio)
 {
        return -ENOENT;
diff --git a/mm/memory.c b/mm/memory.c
index 0f78988befef..88a581baae40 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -78,6 +78,7 @@
 #include <linux/sched/sysctl.h>
 #include <linux/pgalloc.h>
 #include <linux/uaccess.h>
+#include <linux/node_private.h>
 
 #include <trace/events/kmem.h>
 
@@ -6041,6 +6042,12 @@ static vm_fault_t do_numa_page(struct vm_fault *vmf)
        if (!folio || folio_is_zone_device(folio))
                goto out_map;
 
+       /*
+        * No private-node check is needed here: a non-opted-in node never
+        * gets PROT_NUMA mappings (so we cannot reach this point), while
+        * an opted-in node needs its PTE controls restored on failure.
+        */
+
        nid = folio_nid(folio);
        nr_pages = folio_nr_pages(folio);
 
@@ -6078,6 +6085,10 @@ static vm_fault_t do_numa_page(struct vm_fault *vmf)
        /*
         * Make it present again, depending on how arch implements
         * non-accessible ptes, some can allow access by kernel mode.
+        *
+        * If the folio is still on a private node with NP_OPS_PROTECT_WRITE,
+        * enforce write-protection so the next write triggers handle_fault.
+        * This covers migration-failed and migration-skipped paths.
         */
        if (unlikely(folio && folio_managed_wrprotect(folio))) {
                writable = false;
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 8ac014950e88..8a3a9916ab59 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -861,7 +861,10 @@ bool folio_can_map_prot_numa(struct folio *folio, struct vm_area_struct *vma,
 {
        int nid;
 
-       if (!folio || folio_is_zone_device(folio) || folio_test_ksm(folio))
+       if (!folio || folio_test_ksm(folio))
+               return false;
+
+       if (unlikely(!folio_managed_allows_numa(folio)))
                return false;
 
        /* Also skip shared copy-on-write folios */
-- 
2.53.0

