Some private nodes want to let userland allocate from them directly
via set_mempolicy() and mbind(), but should not appear as normal
allocatable system memory in the zonelist fallback lists.

Add an NP_OPS_MEMPOLICY flag, which requires NP_OPS_MIGRATION (since
mbind() can drive migrations).  Private nodes are only allowed in a
policy nodemask if every private node in the mask supports
NP_OPS_MEMPOLICY.  This prevents __GFP_PRIVATE from unlocking nodes
that lack NP_OPS_MEMPOLICY support.
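
A driver opting a node in might register ops along these lines (a
minimal sketch; the my_dev_* callbacks and the nid variable are
hypothetical stand-ins):

    static const struct node_private_ops my_dev_ops = {
            .flags         = NP_OPS_MIGRATION | NP_OPS_MEMPOLICY,
            .migrate_to    = my_dev_migrate_to,     /* hypothetical */
            .folio_migrate = my_dev_folio_migrate,  /* hypothetical */
    };

    /* NP_OPS_MEMPOLICY without NP_OPS_MIGRATION is rejected with -EINVAL. */
    ret = node_private_set_ops(nid, &my_dev_ops);
    if (ret)
            return ret;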

Add __GFP_PRIVATE to mempolicy migration sites so moves to opted-in
private nodes succeed.

Update the sysfs "has_memory" attribute to also report
N_MEMORY_PRIVATE nodes that set NP_OPS_MEMPOLICY, so existing numactl
userland tools work without modification.
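
With this in place, a hypothetical machine whose private node 4 set
NP_OPS_MEMPOLICY would report something like:

    # cat /sys/devices/system/node/has_memory
    0-1,4
    # cat /sys/devices/system/node/has_private_memory
    4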

Signed-off-by: Gregory Price <[email protected]>
---
 drivers/base/node.c            | 22 +++++++++++++-
 include/linux/node_private.h   | 40 +++++++++++++++++++++++++
 include/uapi/linux/mempolicy.h |  1 +
 mm/mempolicy.c                 | 54 ++++++++++++++++++++++++++++++----
 mm/page_alloc.c                |  5 ++++
 5 files changed, 116 insertions(+), 6 deletions(-)

diff --git a/drivers/base/node.c b/drivers/base/node.c
index e587f5781135..c08b5a948779 100644
--- a/drivers/base/node.c
+++ b/drivers/base/node.c
@@ -953,6 +953,10 @@ int node_private_set_ops(int nid, const struct node_private_ops *ops)
            (!ops->migrate_to || !ops->folio_migrate))
                return -EINVAL;
 
+       if ((ops->flags & NP_OPS_MEMPOLICY) &&
+           !(ops->flags & NP_OPS_MIGRATION))
+               return -EINVAL;
+
        mutex_lock(&node_private_lock);
        np = rcu_dereference_protected(NODE_DATA(nid)->node_private,
                                       lockdep_is_held(&node_private_lock));
@@ -1145,6 +1149,21 @@ static ssize_t show_node_state(struct device *dev,
                          nodemask_pr_args(&node_states[na->state]));
 }
 
+/* has_memory includes N_MEMORY + N_MEMORY_PRIVATE that support mempolicy. */
+static ssize_t show_has_memory(struct device *dev,
+                              struct device_attribute *attr, char *buf)
+{
+       nodemask_t mask = node_states[N_MEMORY];
+       int nid;
+
+       for_each_node_state(nid, N_MEMORY_PRIVATE) {
+               if (node_private_has_flag(nid, NP_OPS_MEMPOLICY))
+                       node_set(nid, mask);
+       }
+
+       return sysfs_emit(buf, "%*pbl\n", nodemask_pr_args(&mask));
+}
+
 #define _NODE_ATTR(name, state) \
        { __ATTR(name, 0444, show_node_state, NULL), state }
 
@@ -1155,7 +1174,8 @@ static struct node_attr node_state_attr[] = {
 #ifdef CONFIG_HIGHMEM
        [N_HIGH_MEMORY] = _NODE_ATTR(has_high_memory, N_HIGH_MEMORY),
 #endif
-       [N_MEMORY] = _NODE_ATTR(has_memory, N_MEMORY),
+       [N_MEMORY] = { __ATTR(has_memory, 0444, show_has_memory, NULL),
+                      N_MEMORY },
        [N_MEMORY_PRIVATE] = _NODE_ATTR(has_private_memory, N_MEMORY_PRIVATE),
        [N_CPU] = _NODE_ATTR(has_cpu, N_CPU),
        [N_GENERIC_INITIATOR] = _NODE_ATTR(has_generic_initiator,
diff --git a/include/linux/node_private.h b/include/linux/node_private.h
index 0c5be1ee6e60..e9b58afa366b 100644
--- a/include/linux/node_private.h
+++ b/include/linux/node_private.h
@@ -86,6 +86,8 @@ struct node_private_ops {
 
 /* Allow user/kernel migration; requires migrate_to and folio_migrate */
 #define NP_OPS_MIGRATION               BIT(0)
+/* Allow mempolicy-directed allocation and mbind migration to this node */
+#define NP_OPS_MEMPOLICY               BIT(1)
 
 /**
  * struct node_private - Per-node container for N_MEMORY_PRIVATE nodes
@@ -276,6 +278,34 @@ static inline int node_private_migrate_to(struct list_head *folios, int nid,
 
        return ret;
 }
+
+static inline bool node_mpol_eligible(int nid)
+{
+       bool ret;
+
+       if (!node_state(nid, N_MEMORY_PRIVATE))
+               return node_state(nid, N_MEMORY);
+
+       rcu_read_lock();
+       ret = node_private_has_flag(nid, NP_OPS_MEMPOLICY);
+       rcu_read_unlock();
+       return ret;
+}
+
+static inline bool nodes_private_mpol_allowed(const nodemask_t *nodes)
+{
+       int nid;
+       bool eligible = false;
+
+       for_each_node_mask(nid, *nodes) {
+               if (!node_state(nid, N_MEMORY_PRIVATE))
+                       continue;
+               if (!node_mpol_eligible(nid))
+                       return false;
+               eligible = true;
+       }
+       return eligible;
+}
 #endif /* CONFIG_MEMORY_HOTPLUG */
 
 #else /* !CONFIG_NUMA */
@@ -364,6 +394,16 @@ static inline int node_private_migrate_to(struct list_head *folios, int nid,
        return -ENODEV;
 }
 
+static inline bool node_mpol_eligible(int nid)
+{
+       return false;
+}
+
+static inline bool nodes_private_mpol_allowed(const nodemask_t *nodes)
+{
+       return false;
+}
+
 static inline int node_private_register(int nid, struct node_private *np)
 {
        return -ENODEV;
diff --git a/include/uapi/linux/mempolicy.h b/include/uapi/linux/mempolicy.h
index 8fbbe613611a..b606eae983c8 100644
--- a/include/uapi/linux/mempolicy.h
+++ b/include/uapi/linux/mempolicy.h
@@ -64,6 +64,7 @@ enum {
 #define MPOL_F_SHARED  (1 << 0)        /* identify shared policies */
 #define MPOL_F_MOF     (1 << 3) /* this policy wants migrate on fault */
 #define MPOL_F_MORON   (1 << 4) /* Migrate On protnone Reference On Node */
+#define MPOL_F_PRIVATE (1 << 5) /* policy targets private node; use __GFP_PRIVATE */
 
 /*
  * Enabling zone reclaim means the page allocator will attempt to fulfill
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 2b0f9762d171..8ac014950e88 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -406,8 +406,6 @@ static int mpol_new_preferred(struct mempolicy *pol, const nodemask_t *nodes)
 static int mpol_set_nodemask(struct mempolicy *pol,
                     const nodemask_t *nodes, struct nodemask_scratch *nsc)
 {
-       int ret;
-
        /*
         * Default (pol==NULL) resp. local memory policies are not a
         * subject of any remapping. They also do not need any special
@@ -416,9 +414,12 @@ static int mpol_set_nodemask(struct mempolicy *pol,
        if (!pol || pol->mode == MPOL_LOCAL)
                return 0;
 
-       /* Check N_MEMORY */
+       /* Check N_MEMORY and N_MEMORY_PRIVATE */
        nodes_and(nsc->mask1,
                  cpuset_current_mems_allowed, node_states[N_MEMORY]);
+       nodes_and(nsc->mask2, cpuset_current_mems_allowed,
+                 node_states[N_MEMORY_PRIVATE]);
+       nodes_or(nsc->mask1, nsc->mask1, nsc->mask2);
 
        VM_BUG_ON(!nodes);
 
@@ -432,8 +433,13 @@ static int mpol_set_nodemask(struct mempolicy *pol,
        else
                pol->w.cpuset_mems_allowed = cpuset_current_mems_allowed;
 
-       ret = mpol_ops[pol->mode].create(pol, &nsc->mask2);
-       return ret;
+       /* All private nodes in the mask must have NP_OPS_MEMPOLICY. */
+       if (nodes_private_mpol_allowed(&nsc->mask2))
+               pol->flags |= MPOL_F_PRIVATE;
+       else if (nodes_intersects(nsc->mask2, node_states[N_MEMORY_PRIVATE]))
+               return -EINVAL;
+
+       return mpol_ops[pol->mode].create(pol, &nsc->mask2);
 }
 
 /*
@@ -500,6 +506,7 @@ static void mpol_rebind_default(struct mempolicy *pol, const nodemask_t *nodes)
 static void mpol_rebind_nodemask(struct mempolicy *pol, const nodemask_t *nodes)
 {
 {
        nodemask_t tmp;
+       int nid;
 
        if (pol->flags & MPOL_F_STATIC_NODES)
                nodes_and(tmp, pol->w.user_nodemask, *nodes);
@@ -514,6 +521,21 @@ static void mpol_rebind_nodemask(struct mempolicy *pol, const nodemask_t *nodes)
        if (nodes_empty(tmp))
                tmp = *nodes;
 
+       /*
+        * Drop private nodes that don't have mempolicy support.
+        * cpusets guarantees at least one N_MEMORY node in effective_mems
+        * and mems_allowed, so dropping private nodes here is safe.
+        */
+       for_each_node_mask(nid, tmp) {
+               if (node_state(nid, N_MEMORY_PRIVATE) &&
+                   !node_private_has_flag(nid, NP_OPS_MEMPOLICY))
+                       node_clear(nid, tmp);
+       }
+       if (nodes_intersects(tmp, node_states[N_MEMORY_PRIVATE]))
+               pol->flags |= MPOL_F_PRIVATE;
+       else
+               pol->flags &= ~MPOL_F_PRIVATE;
+
        pol->nodes = tmp;
 }
 
@@ -661,6 +683,9 @@ static void queue_folios_pmd(pmd_t *pmd, struct mm_walk *walk)
        }
        if (!queue_folio_required(folio, qp))
                return;
+       if (folio_is_private_node(folio) &&
+           !folio_private_flags(folio, NP_OPS_MIGRATION))
+               return;
        if (!(qp->flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) ||
            !vma_migratable(walk->vma) ||
            !migrate_folio_add(folio, qp->pagelist, qp->flags))
@@ -717,6 +742,9 @@ static int queue_folios_pte_range(pmd_t *pmd, unsigned long addr,
                folio = vm_normal_folio(vma, addr, ptent);
                if (!folio || folio_is_zone_device(folio))
                        continue;
+               if (folio_is_private_node(folio) &&
+                   !folio_private_flags(folio, NP_OPS_MIGRATION))
+                       continue;
                if (folio_test_large(folio) && max_nr != 1)
                        nr = folio_pte_batch(folio, pte, ptent, max_nr);
                /*
@@ -1451,6 +1479,9 @@ static struct folio *alloc_migration_target_by_mpol(struct folio *src,
        else
                gfp = GFP_HIGHUSER_MOVABLE | __GFP_RETRY_MAYFAIL | __GFP_COMP;
 
+       if (pol->flags & MPOL_F_PRIVATE)
+               gfp |= __GFP_PRIVATE;
+
        return folio_alloc_mpol(gfp, order, pol, ilx, nid);
 }
 #else
@@ -2280,6 +2311,15 @@ static nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *pol,
                        nodemask = &pol->nodes;
                if (pol->home_node != NUMA_NO_NODE)
                        *nid = pol->home_node;
+               else if ((pol->flags & MPOL_F_PRIVATE) &&
+                        !node_isset(*nid, pol->nodes)) {
+                       /*
+                        * Private nodes are not in N_MEMORY nodes' zonelists.
+                        * When the preferred nid (usually numa_node_id()) can't
+                        * reach the policy nodes, start from a policy node.
+                        */
+                       *nid = first_node(pol->nodes);
+               }
                /*
                 * __GFP_THISNODE shouldn't even be used with the bind policy
                 * because we might easily break the expectation to stay on the
@@ -2533,6 +2573,10 @@ struct folio *vma_alloc_folio_noprof(gfp_t gfp, int order, struct vm_area_struct
                gfp |= __GFP_NOWARN;
 
        pol = get_vma_policy(vma, addr, order, &ilx);
+
+       if (pol->flags & MPOL_F_PRIVATE)
+               gfp |= __GFP_PRIVATE;
+
        folio = folio_alloc_mpol_noprof(gfp, order, pol, ilx, numa_node_id());
        mpol_cond_put(pol);
        return folio;
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 5a1b35421d78..ec6c1f8e85d8 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -3849,8 +3849,13 @@ get_page_from_freelist(gfp_t gfp_mask, unsigned int order, int alloc_flags,
                 * if another process has NUMA bindings and is causing
                 * kswapd wakeups on only some nodes. Avoid accidental
                 * "node_reclaim_mode"-like behavior in this case.
+                *
+                * Nodes without kswapd (some private nodes) are never
+                * skipped, since skipping them would make mempolicies
+                * silently fall back to DRAM even when the node is eligible.
                 */
                if (skip_kswapd_nodes &&
+                   zone->zone_pgdat->kswapd &&
                    !waitqueue_active(&zone->zone_pgdat->kswapd_wait)) {
                        skipped_kswapd_nodes = true;
                        continue;
-- 
2.53.0

