Private node services may want to support user-driven migration (migrate_pages syscall, mbind) to allow data movement between regular and private nodes.
ZONE_DEVICE always rejects user migration, but private nodes should be able to opt in. Add NP_OPS_MIGRATION flag and folio_managed_user_migrate() wrapper that dispatches migration requests. Private nodes can set the flag and provide a custom migrate_to callback for driver-managed migration. In migrate_to_node(), allow GFP_PRIVATE when the destination node supports NP_OPS_MIGRATION, enabling the migrate_pages syscall to target private nodes. Signed-off-by: Gregory Price <[email protected]> --- drivers/base/node.c | 4 ++ include/linux/migrate.h | 10 +++ include/linux/node_private.h | 122 +++++++++++++++++++++++++++++++++++ mm/damon/paddr.c | 3 + mm/internal.h | 24 +++++++ mm/mempolicy.c | 10 +-- mm/migrate.c | 49 ++++++++++---- mm/rmap.c | 4 +- 8 files changed, 206 insertions(+), 20 deletions(-) diff --git a/drivers/base/node.c b/drivers/base/node.c index 646dc48a23b5..e587f5781135 100644 --- a/drivers/base/node.c +++ b/drivers/base/node.c @@ -949,6 +949,10 @@ int node_private_set_ops(int nid, const struct node_private_ops *ops) if (!node_possible(nid)) return -EINVAL; + if ((ops->flags & NP_OPS_MIGRATION) && + (!ops->migrate_to || !ops->folio_migrate)) + return -EINVAL; + mutex_lock(&node_private_lock); np = rcu_dereference_protected(NODE_DATA(nid)->node_private, lockdep_is_held(&node_private_lock)); diff --git a/include/linux/migrate.h b/include/linux/migrate.h index 26ca00c325d9..7b2da3875ff2 100644 --- a/include/linux/migrate.h +++ b/include/linux/migrate.h @@ -71,6 +71,9 @@ void folio_migrate_flags(struct folio *newfolio, struct folio *folio); int folio_migrate_mapping(struct address_space *mapping, struct folio *newfolio, struct folio *folio, int extra_count); int set_movable_ops(const struct movable_operations *ops, enum pagetype type); +int migrate_folios_to_node(struct list_head *folios, int nid, + enum migrate_mode mode, + enum migrate_reason reason); #else @@ -96,6 +99,13 @@ static inline int set_movable_ops(const struct movable_operations *ops, enum 
pag { return -ENOSYS; } +static inline int migrate_folios_to_node(struct list_head *folios, + int nid, + enum migrate_mode mode, + enum migrate_reason reason) +{ + return -ENOSYS; +} #endif /* CONFIG_MIGRATION */ diff --git a/include/linux/node_private.h b/include/linux/node_private.h index f9dd2d25c8a5..0c5be1ee6e60 100644 --- a/include/linux/node_private.h +++ b/include/linux/node_private.h @@ -4,6 +4,7 @@ #include <linux/completion.h> #include <linux/memremap.h> +#include <linux/migrate_mode.h> #include <linux/mm.h> #include <linux/nodemask.h> #include <linux/rcupdate.h> @@ -52,15 +53,40 @@ struct vm_fault; * or NULL when called for the final (original) folio after all sub-folios * have been split off. * + * @migrate_to: Migrate folios TO this node. + * [refcounted callback] + * Returns: 0 on full success, >0 = number of folios that failed to + * migrate, <0 = error. Matches migrate_pages() semantics. + * @nr_succeeded is set to the number of successfully migrated + * folios (may be NULL if caller doesn't need it). + * + * @folio_migrate: Post-migration notification that a folio on this private node + * changed physical location (on the same node or a different node). + * [folio-referenced callback] + * Called from migrate_folio_move() after data has been copied but before + * migration entries are replaced with real PTEs. Both @src and @dst are + * locked. Faults block in migration_entry_wait() until + * remove_migration_ptes() runs, so the service can safely update + * PFN-based metadata (compression tables, device page tables, DMA + * mappings, etc.) before any access through the page tables. + * * @flags: Operation exclusion flags (NP_OPS_* constants). 
* */ struct node_private_ops { bool (*free_folio)(struct folio *folio); void (*folio_split)(struct folio *folio, struct folio *new_folio); + int (*migrate_to)(struct list_head *folios, int nid, + enum migrate_mode mode, + enum migrate_reason reason, + unsigned int *nr_succeeded); + void (*folio_migrate)(struct folio *src, struct folio *dst); unsigned long flags; }; +/* Allow user/kernel migration; requires migrate_to and folio_migrate */ +#define NP_OPS_MIGRATION BIT(0) + /** * struct node_private - Per-node container for N_MEMORY_PRIVATE nodes * @@ -177,6 +203,81 @@ static inline void folio_managed_split_cb(struct folio *original_folio, node_private_split_cb(original_folio, new_folio); } +#ifdef CONFIG_MEMORY_HOTPLUG +static inline int folio_managed_allows_user_migrate(struct folio *folio) +{ + if (folio_is_zone_device(folio)) + return -ENOENT; + return node_private_has_flag(folio_nid(folio), NP_OPS_MIGRATION) ? + folio_nid(folio) : -ENOENT; +} + +/** + * folio_managed_allows_migrate - Check if a managed folio supports migration + * @folio: The folio to check + * + * Returns true if the folio can be migrated. For zone_device folios, only + * device_private and device_coherent support migration. For private node + * folios, migration requires NP_OPS_MIGRATION. Normal folios always + * return true. + */ +static inline bool folio_managed_allows_migrate(struct folio *folio) +{ + if (folio_is_zone_device(folio)) + return folio_is_device_private(folio) || + folio_is_device_coherent(folio); + if (folio_is_private_node(folio)) + return folio_private_flags(folio, NP_OPS_MIGRATION); + return true; +} + +/** + * node_private_migrate_to - Attempt service-specific migration to a private node + * @folios: list of folios to migrate (may sleep) + * @nid: target node + * @mode: migration mode (MIGRATE_ASYNC, MIGRATE_SYNC, etc.) + * @reason: migration reason (MR_DEMOTION, MR_SYSCALL, etc.) 
+ * @nr_succeeded: optional output for number of successfully migrated folios + * + * If @nid is an N_MEMORY_PRIVATE node with a migrate_to callback, + * invokes the callback and returns the result with migrate_pages() + * semantics (0 = full success, >0 = failure count, <0 = error). + * Returns -ENODEV if the node is not private or the service is being + * torn down. + * + * The source folios are on other nodes, so they do not pin the target + * node's node_private. A temporary refcount is taken under rcu_read_lock + * to keep node_private (and the service module) alive across the callback. + */ +static inline int node_private_migrate_to(struct list_head *folios, int nid, + enum migrate_mode mode, + enum migrate_reason reason, + unsigned int *nr_succeeded) +{ + int (*fn)(struct list_head *, int, enum migrate_mode, + enum migrate_reason, unsigned int *); + struct node_private *np; + int ret; + + rcu_read_lock(); + np = rcu_dereference(NODE_DATA(nid)->node_private); + if (!np || !np->ops || !np->ops->migrate_to || + !refcount_inc_not_zero(&np->refcount)) { + rcu_read_unlock(); + return -ENODEV; + } + fn = np->ops->migrate_to; + rcu_read_unlock(); + + ret = fn(folios, nid, mode, reason, nr_succeeded); + + if (refcount_dec_and_test(&np->refcount)) + complete(&np->released); + + return ret; +} +#endif /* CONFIG_MEMORY_HOTPLUG */ + #else /* !CONFIG_NUMA */ static inline bool folio_is_private_node(struct folio *folio) @@ -242,6 +343,27 @@ int node_private_clear_ops(int nid, const struct node_private_ops *ops); #else /* !CONFIG_NUMA || !CONFIG_MEMORY_HOTPLUG */ +static inline int folio_managed_allows_user_migrate(struct folio *folio) +{ + return -ENOENT; +} + +static inline bool folio_managed_allows_migrate(struct folio *folio) +{ + if (folio_is_zone_device(folio)) + return folio_is_device_private(folio) || + folio_is_device_coherent(folio); + return true; +} + +static inline int node_private_migrate_to(struct list_head *folios, int nid, + enum migrate_mode mode, + enum 
migrate_reason reason, + unsigned int *nr_succeeded) +{ + return -ENODEV; +} + static inline int node_private_register(int nid, struct node_private *np) { return -ENODEV; diff --git a/mm/damon/paddr.c b/mm/damon/paddr.c index 07a8aead439e..532b8e2c62b0 100644 --- a/mm/damon/paddr.c +++ b/mm/damon/paddr.c @@ -277,6 +277,9 @@ static unsigned long damon_pa_migrate(struct damon_region *r, else *sz_filter_passed += folio_size(folio) / addr_unit; + if (!folio_managed_allows_migrate(folio)) + goto put_folio; + if (!folio_isolate_lru(folio)) goto put_folio; list_add(&folio->lru, &folio_list); diff --git a/mm/internal.h b/mm/internal.h index 658da41cdb8e..6ab4679fe943 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -1442,6 +1442,30 @@ static inline bool folio_managed_on_free(struct folio *folio) return false; } +/** + * folio_managed_migrate_notify - Notify service that a folio changed location + * @src: the old folio (about to be freed) + * @dst: the new folio (data already copied, migration entries still in place) + * + * Called from migrate_folio_move() after data has been copied but before + * remove_migration_ptes() installs real PTEs pointing to @dst. While + * migration entries are in place, faults block in migration_entry_wait(), + * so the service can safely update PFN-based metadata before any access + * through the page tables. Both @src and @dst are locked. 
+ */ +static inline void folio_managed_migrate_notify(struct folio *src, + struct folio *dst) +{ + const struct node_private_ops *ops; + + if (!folio_is_private_node(src)) + return; + + ops = folio_node_private_ops(src); + if (ops && ops->folio_migrate) + ops->folio_migrate(src, dst); +} + struct vm_struct *__get_vm_area_node(unsigned long size, unsigned long align, unsigned long shift, unsigned long vm_flags, unsigned long start, diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 68a98ba57882..2b0f9762d171 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -111,6 +111,7 @@ #include <linux/mmu_notifier.h> #include <linux/printk.h> #include <linux/leafops.h> +#include <linux/node_private.h> #include <linux/gcd.h> #include <asm/tlbflush.h> @@ -1282,11 +1283,6 @@ static long migrate_to_node(struct mm_struct *mm, int source, int dest, LIST_HEAD(pagelist); long nr_failed; long err = 0; - struct migration_target_control mtc = { - .nid = dest, - .gfp_mask = GFP_HIGHUSER_MOVABLE | __GFP_THISNODE, - .reason = MR_SYSCALL, - }; nodes_clear(nmask); node_set(source, nmask); @@ -1311,8 +1307,8 @@ static long migrate_to_node(struct mm_struct *mm, int source, int dest, mmap_read_unlock(mm); if (!list_empty(&pagelist)) { - err = migrate_pages(&pagelist, alloc_migration_target, NULL, - (unsigned long)&mtc, MIGRATE_SYNC, MR_SYSCALL, NULL); + err = migrate_folios_to_node(&pagelist, dest, MIGRATE_SYNC, + MR_SYSCALL); if (err) putback_movable_pages(&pagelist); } diff --git a/mm/migrate.c b/mm/migrate.c index 5169f9717f60..a54d4af04df3 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -43,6 +43,7 @@ #include <linux/sched/sysctl.h> #include <linux/memory-tiers.h> #include <linux/pagewalk.h> +#include <linux/node_private.h> #include <asm/tlbflush.h> @@ -1387,6 +1388,8 @@ static int migrate_folio_move(free_folio_t put_new_folio, unsigned long private, if (old_page_state & PAGE_WAS_MLOCKED) lru_add_drain(); + folio_managed_migrate_notify(src, dst); + if (old_page_state & PAGE_WAS_MAPPED) 
remove_migration_ptes(src, dst, 0); @@ -2165,6 +2168,7 @@ int migrate_pages(struct list_head *from, new_folio_t get_new_folio, return rc_gather; } +EXPORT_SYMBOL_GPL(migrate_pages); struct folio *alloc_migration_target(struct folio *src, unsigned long private) { @@ -2204,6 +2208,31 @@ struct folio *alloc_migration_target(struct folio *src, unsigned long private) return __folio_alloc(gfp_mask, order, nid, mtc->nmask); } +EXPORT_SYMBOL_GPL(alloc_migration_target); + +static int __migrate_folios_to_node(struct list_head *folios, int nid, + enum migrate_mode mode, + enum migrate_reason reason) +{ + struct migration_target_control mtc = { + .nid = nid, + .gfp_mask = GFP_HIGHUSER_MOVABLE | __GFP_THISNODE, + .reason = reason, + }; + + return migrate_pages(folios, alloc_migration_target, NULL, + (unsigned long)&mtc, mode, reason, NULL); +} + +int migrate_folios_to_node(struct list_head *folios, int nid, + enum migrate_mode mode, + enum migrate_reason reason) +{ + if (node_state(nid, N_MEMORY_PRIVATE)) + return node_private_migrate_to(folios, nid, mode, + reason, NULL); + return __migrate_folios_to_node(folios, nid, mode, reason); +} #ifdef CONFIG_NUMA @@ -2221,14 +2250,8 @@ static int store_status(int __user *status, int start, int value, int nr) static int do_move_pages_to_node(struct list_head *pagelist, int node) { int err; - struct migration_target_control mtc = { - .nid = node, - .gfp_mask = GFP_HIGHUSER_MOVABLE | __GFP_THISNODE, - .reason = MR_SYSCALL, - }; - err = migrate_pages(pagelist, alloc_migration_target, NULL, - (unsigned long)&mtc, MIGRATE_SYNC, MR_SYSCALL, NULL); + err = migrate_folios_to_node(pagelist, node, MIGRATE_SYNC, MR_SYSCALL); if (err) putback_movable_pages(pagelist); return err; @@ -2240,7 +2263,7 @@ static int __add_folio_for_migration(struct folio *folio, int node, if (is_zero_folio(folio) || is_huge_zero_folio(folio)) return -EFAULT; - if (folio_is_zone_device(folio)) + if (!folio_managed_allows_migrate(folio)) return -ENOENT; if 
(folio_nid(folio) == node) @@ -2364,7 +2387,8 @@ static int do_pages_move(struct mm_struct *mm, nodemask_t task_nodes, err = -ENODEV; if (node < 0 || node >= MAX_NUMNODES) goto out_flush; - if (!node_state(node, N_MEMORY)) + if (!node_state(node, N_MEMORY) && + !node_state(node, N_MEMORY_PRIVATE)) goto out_flush; err = -EACCES; @@ -2449,8 +2473,8 @@ static void do_pages_stat_array(struct mm_struct *mm, unsigned long nr_pages, if (folio) { if (is_zero_folio(folio) || is_huge_zero_folio(folio)) err = -EFAULT; - else if (folio_is_zone_device(folio)) - err = -ENOENT; + else if (unlikely(folio_is_private_managed(folio))) + err = folio_managed_allows_user_migrate(folio); else err = folio_nid(folio); folio_walk_end(&fw, vma); @@ -2660,6 +2684,9 @@ int migrate_misplaced_folio_prepare(struct folio *folio, int nr_pages = folio_nr_pages(folio); pg_data_t *pgdat = NODE_DATA(node); + if (!folio_managed_allows_migrate(folio)) + return -ENOENT; + if (folio_is_file_lru(folio)) { /* * Do not migrate file folios that are mapped in multiple diff --git a/mm/rmap.c b/mm/rmap.c index f955f02d570e..805f9ceb82f3 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -72,6 +72,7 @@ #include <linux/backing-dev.h> #include <linux/page_idle.h> #include <linux/memremap.h> +#include <linux/node_private.h> #include <linux/userfaultfd_k.h> #include <linux/mm_inline.h> #include <linux/oom.h> @@ -2616,8 +2617,7 @@ void try_to_migrate(struct folio *folio, enum ttu_flags flags) TTU_SYNC | TTU_BATCH_FLUSH))) return; - if (folio_is_zone_device(folio) && - (!folio_is_device_private(folio) && !folio_is_device_coherent(folio))) + if (!folio_managed_allows_migrate(folio)) return; /* -- 2.53.0
