The memory-tier subsystem needs to know which private nodes should
appear as demotion targets.

Add NP_OPS_DEMOTION (BIT(2)):
   Node can be added as a demotion target by memory-tiers.

Add demotion backpressure support so private nodes can reject
new demotions cleanly, allowing vmscan to fall back to swap.

In the demotion path, try demotion to private nodes individually,
then clear private nodes from the demotion target mask until a
non-private node is found, then fall back to the remaining mask.
This prevents LRU inversion while still allowing forward progress.

This is the closest match to the current behavior without making
private nodes inaccessible or preventing forward progress. We
should probably completely re-do the demotion logic to allow less
fallback and kick kswapd instead - right now we induce LRU
inversions by simply falling back to any node in the demotion list.

Add memory_tier_refresh_demotion() export for services to trigger
re-evaluation of demotion targets after changing their flags.

Signed-off-by: Gregory Price <[email protected]>
---
 include/linux/memory-tiers.h |  9 +++++++
 include/linux/node_private.h | 22 +++++++++++++++++
 mm/internal.h                |  7 ++++++
 mm/memory-tiers.c            | 46 ++++++++++++++++++++++++++++++++----
 mm/page_alloc.c              | 12 +++++++---
 mm/vmscan.c                  | 30 ++++++++++++++++++++++-
 6 files changed, 117 insertions(+), 9 deletions(-)

diff --git a/include/linux/memory-tiers.h b/include/linux/memory-tiers.h
index 3e1159f6762c..e1476432e359 100644
--- a/include/linux/memory-tiers.h
+++ b/include/linux/memory-tiers.h
@@ -58,6 +58,7 @@ struct memory_dev_type *mt_get_memory_type(int adist);
 int next_demotion_node(int node);
 void node_get_allowed_targets(pg_data_t *pgdat, nodemask_t *targets);
 bool node_is_toptier(int node);
+void memory_tier_refresh_demotion(void);
 #else
 static inline int next_demotion_node(int node)
 {
@@ -73,6 +74,10 @@ static inline bool node_is_toptier(int node)
 {
        return true;
 }
+
+static inline void memory_tier_refresh_demotion(void)
+{
+}
 #endif
 
 #else
@@ -106,6 +111,10 @@ static inline bool node_is_toptier(int node)
        return true;
 }
 
+static inline void memory_tier_refresh_demotion(void)
+{
+}
+
 static inline int register_mt_adistance_algorithm(struct notifier_block *nb)
 {
        return 0;
diff --git a/include/linux/node_private.h b/include/linux/node_private.h
index e9b58afa366b..e254e36056cd 100644
--- a/include/linux/node_private.h
+++ b/include/linux/node_private.h
@@ -88,6 +88,8 @@ struct node_private_ops {
 #define NP_OPS_MIGRATION               BIT(0)
 /* Allow mempolicy-directed allocation and mbind migration to this node */
 #define NP_OPS_MEMPOLICY               BIT(1)
+/* Node participates as a demotion target in memory-tiers */
+#define NP_OPS_DEMOTION                        BIT(2)
 
 /**
  * struct node_private - Per-node container for N_MEMORY_PRIVATE nodes
@@ -101,12 +103,14 @@ struct node_private_ops {
  *             callbacks that may sleep; 0 = fully released)
  * @released: Signaled when refcount drops to 0; unregister waits on this
  * @ops: Service callbacks and exclusion flags (NULL until service registers)
+ * @migration_blocked: Service signals migrations should pause
  */
 struct node_private {
        void *owner;
        refcount_t refcount;
        struct completion released;
        const struct node_private_ops *ops;
+       bool migration_blocked;
 };
 
 #ifdef CONFIG_NUMA
@@ -306,6 +310,19 @@ static inline bool nodes_private_mpol_allowed(const nodemask_t *nodes)
        }
        return eligible;
 }
+
+static inline bool node_private_migration_blocked(int nid)
+{
+       struct node_private *np;
+       bool blocked;
+
+       rcu_read_lock();
+       np = rcu_dereference(NODE_DATA(nid)->node_private);
+       blocked = np && READ_ONCE(np->migration_blocked);
+       rcu_read_unlock();
+
+       return blocked;
+}
 #endif /* CONFIG_MEMORY_HOTPLUG */
 
 #else /* !CONFIG_NUMA */
@@ -404,6 +421,11 @@ static inline bool nodes_private_mpol_allowed(const nodemask_t *nodes)
        return false;
 }
 
+static inline bool node_private_migration_blocked(int nid)
+{
+       return false;
+}
+
 static inline int node_private_register(int nid, struct node_private *np)
 {
        return -ENODEV;
diff --git a/mm/internal.h b/mm/internal.h
index 6ab4679fe943..5950e20d4023 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -1206,6 +1206,8 @@ extern int node_reclaim_mode;
 
 extern int node_reclaim(struct pglist_data *, gfp_t, unsigned int);
 extern int find_next_best_node(int node, nodemask_t *used_node_mask);
+extern int find_next_best_node_in(int node, nodemask_t *used_node_mask,
+                                 const nodemask_t *candidates);
 extern bool numa_zone_alloc_allowed(int alloc_flags, struct zone *zone,
                              gfp_t gfp_mask);
 #else
@@ -1220,6 +1222,11 @@ static inline int find_next_best_node(int node, nodemask_t *used_node_mask)
 {
        return NUMA_NO_NODE;
 }
+static inline int find_next_best_node_in(int node, nodemask_t *used_node_mask,
+                                        const nodemask_t *candidates)
+{
+       return NUMA_NO_NODE;
+}
 static inline bool numa_zone_alloc_allowed(int alloc_flags, struct zone *zone,
                                     gfp_t gfp_mask)
 {
diff --git a/mm/memory-tiers.c b/mm/memory-tiers.c
index 9c742e18e48f..434190fdc078 100644
--- a/mm/memory-tiers.c
+++ b/mm/memory-tiers.c
@@ -3,6 +3,7 @@
 #include <linux/lockdep.h>
 #include <linux/sysfs.h>
 #include <linux/kobject.h>
+#include <linux/node_private.h>
 #include <linux/memory.h>
 #include <linux/memory-tiers.h>
 #include <linux/notifier.h>
@@ -380,6 +381,8 @@ static void disable_all_demotion_targets(void)
                if (memtier)
                        memtier->lower_tier_mask = NODE_MASK_NONE;
        }
+       for_each_node_state(node, N_MEMORY_PRIVATE)
+               node_demotion[node].preferred = NODE_MASK_NONE;
        /*
         * Ensure that the "disable" is visible across the system.
         * Readers will see either a combination of before+disable
@@ -421,6 +424,7 @@ static void establish_demotion_targets(void)
        int target = NUMA_NO_NODE, node;
        int distance, best_distance;
        nodemask_t tier_nodes, lower_tier;
+       nodemask_t all_memory;
 
        lockdep_assert_held_once(&memory_tier_lock);
 
@@ -429,6 +433,13 @@ static void establish_demotion_targets(void)
 
        disable_all_demotion_targets();
 
+       /* Include private nodes that have opted in to demotion. */
+       all_memory = node_states[N_MEMORY];
+       for_each_node_state(node, N_MEMORY_PRIVATE) {
+               if (node_private_has_flag(node, NP_OPS_DEMOTION))
+                       node_set(node, all_memory);
+       }
+
        for_each_node_state(node, N_MEMORY) {
                best_distance = -1;
                nd = &node_demotion[node];
@@ -442,12 +453,12 @@ static void establish_demotion_targets(void)
                memtier = list_next_entry(memtier, list);
                tier_nodes = get_memtier_nodemask(memtier);
                /*
-                * find_next_best_node, use 'used' nodemask as a skip list.
+                * find_next_best_node_in, use 'used' nodemask as a skip list.
                 * Add all memory nodes except the selected memory tier
                 * nodelist to skip list so that we find the best node from the
                 * memtier nodelist.
                 */
-               nodes_andnot(tier_nodes, node_states[N_MEMORY], tier_nodes);
+               nodes_andnot(tier_nodes, all_memory, tier_nodes);
 
                /*
                 * Find all the nodes in the memory tier node list of same best distance.
@@ -455,7 +466,8 @@ static void establish_demotion_targets(void)
                 * in the preferred mask when allocating pages during demotion.
                 */
                do {
-                       target = find_next_best_node(node, &tier_nodes);
+                       target = find_next_best_node_in(node, &tier_nodes,
+                                                       &all_memory);
                        if (target == NUMA_NO_NODE)
                                break;
 
@@ -495,7 +507,7 @@ static void establish_demotion_targets(void)
         * allocation to a set of nodes that is closer the above selected
         * preferred node.
         */
-       lower_tier = node_states[N_MEMORY];
+       lower_tier = all_memory;
        list_for_each_entry(memtier, &memory_tiers, list) {
                /*
                 * Keep removing current tier from lower_tier nodes,
@@ -542,7 +554,7 @@ static struct memory_tier *set_node_memory_tier(int node)
 
        lockdep_assert_held_once(&memory_tier_lock);
 
-       if (!node_state(node, N_MEMORY))
+       if (!node_state(node, N_MEMORY) && !node_state(node, N_MEMORY_PRIVATE))
                return ERR_PTR(-EINVAL);
 
        mt_calc_adistance(node, &adist);
@@ -865,6 +877,30 @@ int mt_calc_adistance(int node, int *adist)
 }
 EXPORT_SYMBOL_GPL(mt_calc_adistance);
 
+/**
+ * memory_tier_refresh_demotion() - Re-establish demotion targets
+ *
+ * Called by services after registering or unregistering ops->migrate_to on
+ * a private node, so that establish_demotion_targets() picks up the change.
+ */
+void memory_tier_refresh_demotion(void)
+{
+       int nid;
+
+       mutex_lock(&memory_tier_lock);
+       /*
+        * Ensure private nodes are registered with a tier, otherwise
+        * they won't show up in any node's demotion targets nodemask.
+        */
+       for_each_node_state(nid, N_MEMORY_PRIVATE) {
+               if (!__node_get_memory_tier(nid))
+                       set_node_memory_tier(nid);
+       }
+       establish_demotion_targets();
+       mutex_unlock(&memory_tier_lock);
+}
+EXPORT_SYMBOL_GPL(memory_tier_refresh_demotion);
+
 static int __meminit memtier_hotplug_callback(struct notifier_block *self,
                                              unsigned long action, void *_arg)
 {
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index ec6c1f8e85d8..e272dfdc6b00 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -5589,7 +5589,8 @@ static int node_load[MAX_NUMNODES];
  *
  * Return: node id of the found node or %NUMA_NO_NODE if no node is found.
  */
-int find_next_best_node(int node, nodemask_t *used_node_mask)
+int find_next_best_node_in(int node, nodemask_t *used_node_mask,
+                          const nodemask_t *candidates)
 {
        int n, val;
        int min_val = INT_MAX;
@@ -5599,12 +5600,12 @@ int find_next_best_node(int node, nodemask_t *used_node_mask)
         * Use the local node if we haven't already, but for memoryless local
         * node, we should skip it and fall back to other nodes.
         */
-       if (!node_isset(node, *used_node_mask) && node_state(node, N_MEMORY)) {
+       if (!node_isset(node, *used_node_mask) && node_isset(node, *candidates)) {
                node_set(node, *used_node_mask);
                return node;
        }
 
-       for_each_node_state(n, N_MEMORY) {
+       for_each_node_mask(n, *candidates) {
 
                /* Don't want a node to appear more than once */
                if (node_isset(n, *used_node_mask))
@@ -5636,6 +5637,11 @@ int find_next_best_node(int node, nodemask_t *used_node_mask)
        return best_node;
 }
 
+int find_next_best_node(int node, nodemask_t *used_node_mask)
+{
+       return find_next_best_node_in(node, used_node_mask,
+                                     &node_states[N_MEMORY]);
+}
 
 /*
  * Build zonelists ordered by node and zones within node.
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 6113be4d3519..0f534428ea88 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -58,6 +58,7 @@
 #include <linux/random.h>
 #include <linux/mmu_notifier.h>
 #include <linux/parser.h>
+#include <linux/node_private.h>
 
 #include <asm/tlbflush.h>
 #include <asm/div64.h>
@@ -355,6 +356,10 @@ static bool can_demote(int nid, struct scan_control *sc,
        if (demotion_nid == NUMA_NO_NODE)
                return false;
 
+       /* Don't demote when the target's service signals backpressure */
+       if (node_private_migration_blocked(demotion_nid))
+               return false;
+
        /* If demotion node isn't in the cgroup's mems_allowed, fall back */
        return mem_cgroup_node_allowed(memcg, demotion_nid);
 }
@@ -1022,8 +1027,10 @@ static unsigned int demote_folio_list(struct list_head *demote_folios,
                                     struct pglist_data *pgdat)
 {
        int target_nid = next_demotion_node(pgdat->node_id);
-       unsigned int nr_succeeded;
+       int first_nid = target_nid;
+       unsigned int nr_succeeded = 0;
        nodemask_t allowed_mask;
+       int ret;
 
        struct migration_target_control mtc = {
                /*
@@ -1046,6 +1053,27 @@ static unsigned int demote_folio_list(struct list_head *demote_folios,
 
        node_get_allowed_targets(pgdat, &allowed_mask);
 
+       /* Try private node targets until we find non-private node */
+       while (node_state(target_nid, N_MEMORY_PRIVATE)) {
+               unsigned int nr = 0;
+
+               ret = node_private_migrate_to(demote_folios, target_nid,
+                                             MIGRATE_ASYNC, MR_DEMOTION,
+                                             &nr);
+               nr_succeeded += nr;
+               if (ret == 0 || list_empty(demote_folios))
+                       return nr_succeeded;
+
+               target_nid = next_node_in(target_nid, allowed_mask);
+               if (target_nid == first_nid)
+                       return nr_succeeded;
+               if (!node_state(target_nid, N_MEMORY_PRIVATE))
+                       break;
+       }
+
+       /* target_nid is a non-private node; use standard migration */
+       mtc.nid = target_nid;
+
        /* Demotion ignores all cpuset and mempolicy settings */
        migrate_pages(demote_folios, alloc_demote_folio, NULL,
                      (unsigned long)&mtc, MIGRATE_ASYNC, MR_DEMOTION,
-- 
2.53.0


Reply via email to