Some private nodes want userland to directly allocate from the node via set_mempolicy() and mbind() - but don't want that node as normal allocable system memory in the fallback lists.
Add NP_OPS_MEMPOLICY flag requiring NP_OPS_MIGRATION (since mbind can drive migrations). Only allow private nodes in policy nodemasks if all private nodes in the mask support NP_OPS_MEMPOLICY. This prevents __GFP_PRIVATE from unlocking nodes without NP_OPS_MEMPOLICY support. Add __GFP_PRIVATE to mempolicy migration sites so moves to opted-in private nodes succeed. Update the sysfs "has_memory" attribute to include N_MEMORY_PRIVATE nodes with NP_OPS_MEMPOLICY set, allowing existing numactl userland tools to work without modification. Signed-off-by: Gregory Price <[email protected]> --- drivers/base/node.c | 22 +++++++++++++- include/linux/node_private.h | 40 +++++++++++++++++++++++++ include/uapi/linux/mempolicy.h | 1 + mm/mempolicy.c | 54 ++++++++++++++++++++++++++++++---- mm/page_alloc.c | 5 ++++ 5 files changed, 116 insertions(+), 6 deletions(-) diff --git a/drivers/base/node.c b/drivers/base/node.c index e587f5781135..c08b5a948779 100644 --- a/drivers/base/node.c +++ b/drivers/base/node.c @@ -953,6 +953,10 @@ int node_private_set_ops(int nid, const struct node_private_ops *ops) (!ops->migrate_to || !ops->folio_migrate)) return -EINVAL; + if ((ops->flags & NP_OPS_MEMPOLICY) && + !(ops->flags & NP_OPS_MIGRATION)) + return -EINVAL; + mutex_lock(&node_private_lock); np = rcu_dereference_protected(NODE_DATA(nid)->node_private, lockdep_is_held(&node_private_lock)); @@ -1145,6 +1149,21 @@ static ssize_t show_node_state(struct device *dev, nodemask_pr_args(&node_states[na->state])); } +/* has_memory includes N_MEMORY + N_MEMORY_PRIVATE that support mempolicy. 
*/ +static ssize_t show_has_memory(struct device *dev, + struct device_attribute *attr, char *buf) +{ + nodemask_t mask = node_states[N_MEMORY]; + int nid; + + for_each_node_state(nid, N_MEMORY_PRIVATE) { + if (node_private_has_flag(nid, NP_OPS_MEMPOLICY)) + node_set(nid, mask); + } + + return sysfs_emit(buf, "%*pbl\n", nodemask_pr_args(&mask)); +} + #define _NODE_ATTR(name, state) \ { __ATTR(name, 0444, show_node_state, NULL), state } @@ -1155,7 +1174,8 @@ static struct node_attr node_state_attr[] = { #ifdef CONFIG_HIGHMEM [N_HIGH_MEMORY] = _NODE_ATTR(has_high_memory, N_HIGH_MEMORY), #endif - [N_MEMORY] = _NODE_ATTR(has_memory, N_MEMORY), + [N_MEMORY] = { __ATTR(has_memory, 0444, show_has_memory, NULL), + N_MEMORY }, [N_MEMORY_PRIVATE] = _NODE_ATTR(has_private_memory, N_MEMORY_PRIVATE), [N_CPU] = _NODE_ATTR(has_cpu, N_CPU), [N_GENERIC_INITIATOR] = _NODE_ATTR(has_generic_initiator, diff --git a/include/linux/node_private.h b/include/linux/node_private.h index 0c5be1ee6e60..e9b58afa366b 100644 --- a/include/linux/node_private.h +++ b/include/linux/node_private.h @@ -86,6 +86,8 @@ struct node_private_ops { /* Allow user/kernel migration; requires migrate_to and folio_migrate */ #define NP_OPS_MIGRATION BIT(0) +/* Allow mempolicy-directed allocation and mbind migration to this node */ +#define NP_OPS_MEMPOLICY BIT(1) /** * struct node_private - Per-node container for N_MEMORY_PRIVATE nodes @@ -276,6 +278,34 @@ static inline int node_private_migrate_to(struct list_head *folios, int nid, return ret; } + +static inline bool node_mpol_eligible(int nid) +{ + bool ret; + + if (!node_state(nid, N_MEMORY_PRIVATE)) + return node_state(nid, N_MEMORY); + + rcu_read_lock(); + ret = node_private_has_flag(nid, NP_OPS_MEMPOLICY); + rcu_read_unlock(); + return ret; +} + +static inline bool nodes_private_mpol_allowed(const nodemask_t *nodes) +{ + int nid; + bool eligible = false; + + for_each_node_mask(nid, *nodes) { + if (!node_state(nid, N_MEMORY_PRIVATE)) + continue; + if 
(!node_mpol_eligible(nid)) + return false; + eligible = true; + } + return eligible; +} #endif /* CONFIG_MEMORY_HOTPLUG */ #else /* !CONFIG_NUMA */ @@ -364,6 +394,16 @@ static inline int node_private_migrate_to(struct list_head *folios, int nid, return -ENODEV; } +static inline bool node_mpol_eligible(int nid) +{ + return false; +} + +static inline bool nodes_private_mpol_allowed(const nodemask_t *nodes) +{ + return false; +} + static inline int node_private_register(int nid, struct node_private *np) { return -ENODEV; diff --git a/include/uapi/linux/mempolicy.h b/include/uapi/linux/mempolicy.h index 8fbbe613611a..b606eae983c8 100644 --- a/include/uapi/linux/mempolicy.h +++ b/include/uapi/linux/mempolicy.h @@ -64,6 +64,7 @@ enum { #define MPOL_F_SHARED (1 << 0) /* identify shared policies */ #define MPOL_F_MOF (1 << 3) /* this policy wants migrate on fault */ #define MPOL_F_MORON (1 << 4) /* Migrate On protnone Reference On Node */ +#define MPOL_F_PRIVATE (1 << 5) /* policy targets private node; use __GFP_PRIVATE */ /* * Enabling zone reclaim means the page allocator will attempt to fulfill diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 2b0f9762d171..8ac014950e88 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -406,8 +406,6 @@ static int mpol_new_preferred(struct mempolicy *pol, const nodemask_t *nodes) static int mpol_set_nodemask(struct mempolicy *pol, const nodemask_t *nodes, struct nodemask_scratch *nsc) { - int ret; - /* * Default (pol==NULL) resp. local memory policies are not a * subject of any remapping. 
They also do not need any special @@ -416,9 +414,12 @@ static int mpol_set_nodemask(struct mempolicy *pol, if (!pol || pol->mode == MPOL_LOCAL) return 0; - /* Check N_MEMORY */ + /* Check N_MEMORY and N_MEMORY_PRIVATE */ nodes_and(nsc->mask1, cpuset_current_mems_allowed, node_states[N_MEMORY]); + nodes_and(nsc->mask2, cpuset_current_mems_allowed, + node_states[N_MEMORY_PRIVATE]); + nodes_or(nsc->mask1, nsc->mask1, nsc->mask2); VM_BUG_ON(!nodes); @@ -432,8 +433,13 @@ static int mpol_set_nodemask(struct mempolicy *pol, else pol->w.cpuset_mems_allowed = cpuset_current_mems_allowed; - ret = mpol_ops[pol->mode].create(pol, &nsc->mask2); - return ret; + /* All private nodes in the mask must have NP_OPS_MEMPOLICY. */ + if (nodes_private_mpol_allowed(&nsc->mask2)) + pol->flags |= MPOL_F_PRIVATE; + else if (nodes_intersects(nsc->mask2, node_states[N_MEMORY_PRIVATE])) + return -EINVAL; + + return mpol_ops[pol->mode].create(pol, &nsc->mask2); } /* @@ -500,6 +506,7 @@ static void mpol_rebind_default(struct mempolicy *pol, const nodemask_t *nodes) static void mpol_rebind_nodemask(struct mempolicy *pol, const nodemask_t *nodes) { nodemask_t tmp; + int nid; if (pol->flags & MPOL_F_STATIC_NODES) nodes_and(tmp, pol->w.user_nodemask, *nodes); @@ -514,6 +521,21 @@ static void mpol_rebind_nodemask(struct mempolicy *pol, const nodemask_t *nodes) if (nodes_empty(tmp)) tmp = *nodes; + /* + * Drop private nodes that don't have mempolicy support. + * cpusets guarantees at least one N_MEMORY node in effective_mems + * and mems_allowed, so dropping private nodes here is safe. 
+ */ + for_each_node_mask(nid, tmp) { + if (node_state(nid, N_MEMORY_PRIVATE) && + !node_private_has_flag(nid, NP_OPS_MEMPOLICY)) + node_clear(nid, tmp); + } + if (nodes_intersects(tmp, node_states[N_MEMORY_PRIVATE])) + pol->flags |= MPOL_F_PRIVATE; + else + pol->flags &= ~MPOL_F_PRIVATE; + pol->nodes = tmp; } @@ -661,6 +683,9 @@ static void queue_folios_pmd(pmd_t *pmd, struct mm_walk *walk) } if (!queue_folio_required(folio, qp)) return; + if (folio_is_private_node(folio) && + !folio_private_flags(folio, NP_OPS_MIGRATION)) + return; if (!(qp->flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) || !vma_migratable(walk->vma) || !migrate_folio_add(folio, qp->pagelist, qp->flags)) @@ -717,6 +742,9 @@ static int queue_folios_pte_range(pmd_t *pmd, unsigned long addr, folio = vm_normal_folio(vma, addr, ptent); if (!folio || folio_is_zone_device(folio)) continue; + if (folio_is_private_node(folio) && + !folio_private_flags(folio, NP_OPS_MIGRATION)) + continue; if (folio_test_large(folio) && max_nr != 1) nr = folio_pte_batch(folio, pte, ptent, max_nr); /* @@ -1451,6 +1479,9 @@ static struct folio *alloc_migration_target_by_mpol(struct folio *src, else gfp = GFP_HIGHUSER_MOVABLE | __GFP_RETRY_MAYFAIL | __GFP_COMP; + if (pol->flags & MPOL_F_PRIVATE) + gfp |= __GFP_PRIVATE; + return folio_alloc_mpol(gfp, order, pol, ilx, nid); } #else @@ -2280,6 +2311,15 @@ static nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *pol, nodemask = &pol->nodes; if (pol->home_node != NUMA_NO_NODE) *nid = pol->home_node; + else if ((pol->flags & MPOL_F_PRIVATE) && + !node_isset(*nid, pol->nodes)) { + /* + * Private nodes are not in N_MEMORY nodes' zonelists. + * When the preferred nid (usually numa_node_id()) can't + * reach the policy nodes, start from a policy node. 
+ */ + *nid = first_node(pol->nodes); + } /* * __GFP_THISNODE shouldn't even be used with the bind policy * because we might easily break the expectation to stay on the @@ -2533,6 +2573,10 @@ struct folio *vma_alloc_folio_noprof(gfp_t gfp, int order, struct vm_area_struct gfp |= __GFP_NOWARN; pol = get_vma_policy(vma, addr, order, &ilx); + + if (pol->flags & MPOL_F_PRIVATE) + gfp |= __GFP_PRIVATE; + folio = folio_alloc_mpol_noprof(gfp, order, pol, ilx, numa_node_id()); mpol_cond_put(pol); return folio; diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 5a1b35421d78..ec6c1f8e85d8 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -3849,8 +3849,13 @@ get_page_from_freelist(gfp_t gfp_mask, unsigned int order, int alloc_flags, * if another process has NUMA bindings and is causing * kswapd wakeups on only some nodes. Avoid accidental * "node_reclaim_mode"-like behavior in this case. + * + * Nodes without kswapd (some private nodes) are never + * skipped - this causes some mempolicies to silently + * fall back to DRAM even if the node is eligible. */ if (skip_kswapd_nodes && + zone->zone_pgdat->kswapd && !waitqueue_active(&zone->zone_pgdat->kswapd_wait)) { skipped_kswapd_nodes = true; continue; -- 2.53.0
