Add a new function for drivers to hotplug memory as N_MEMORY_PRIVATE. This function combines node_private_register() with __add_memory_driver_managed() to ensure proper ordering:
1. Register the private region first (sets private node context) 2. Then hotplug the memory (sets N_MEMORY_PRIVATE) 3. On failure, unregister the private region to avoid leaving the node in an inconsistent state. When the last of memory is removed, hotplug also removes the private node context. If migration is not supported and the node is still online, fire a warning (likely bug in the driver). Signed-off-by: Gregory Price <[email protected]> --- include/linux/memory_hotplug.h | 11 +++ include/linux/mmzone.h | 12 ++++ mm/memory_hotplug.c | 122 ++++++++++++++++++++++++++++++--- 3 files changed, 135 insertions(+), 10 deletions(-) diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h index 1f19f08552ea..e5abade9450a 100644 --- a/include/linux/memory_hotplug.h +++ b/include/linux/memory_hotplug.h @@ -293,6 +293,7 @@ extern int offline_pages(unsigned long start_pfn, unsigned long nr_pages, extern int remove_memory(u64 start, u64 size); extern void __remove_memory(u64 start, u64 size); extern int offline_and_remove_memory(u64 start, u64 size); +extern int offline_and_remove_private_memory(int nid, u64 start, u64 size); #else static inline void try_offline_node(int nid) {} @@ -309,6 +310,12 @@ static inline int remove_memory(u64 start, u64 size) } static inline void __remove_memory(u64 start, u64 size) {} + +static inline int offline_and_remove_private_memory(int nid, u64 start, + u64 size) +{ + return -EOPNOTSUPP; +} #endif /* CONFIG_MEMORY_HOTREMOVE */ #ifdef CONFIG_MEMORY_HOTPLUG @@ -326,6 +333,10 @@ int __add_memory_driver_managed(int nid, u64 start, u64 size, extern int add_memory_driver_managed(int nid, u64 start, u64 size, const char *resource_name, mhp_t mhp_flags); +int add_private_memory_driver_managed(int nid, u64 start, u64 size, + const char *resource_name, + mhp_t mhp_flags, enum mmop online_type, + struct node_private *np); extern void move_pfn_range_to_zone(struct zone *zone, unsigned long start_pfn, unsigned long nr_pages, struct 
vmem_altmap *altmap, int migratetype, diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 992eb1c5a2c6..cc532b67ad3f 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -1524,6 +1524,18 @@ typedef struct pglist_data { #endif } pg_data_t; +#ifdef CONFIG_NUMA +static inline bool pgdat_is_private(pg_data_t *pgdat) +{ + return pgdat->private; +} +#else +static inline bool pgdat_is_private(pg_data_t *pgdat) +{ + return false; +} +#endif + #define node_present_pages(nid) (NODE_DATA(nid)->node_present_pages) #define node_spanned_pages(nid) (NODE_DATA(nid)->node_spanned_pages) diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index d2dc527bd5b0..9d72f44a30dc 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -36,6 +36,7 @@ #include <linux/rmap.h> #include <linux/module.h> #include <linux/node.h> +#include <linux/node_private.h> #include <asm/tlbflush.h> @@ -1173,8 +1174,7 @@ int online_pages(unsigned long pfn, unsigned long nr_pages, move_pfn_range_to_zone(zone, pfn, nr_pages, NULL, MIGRATE_MOVABLE, true); - if (!node_state(nid, N_MEMORY)) { - /* Adding memory to the node for the first time */ + if (!node_state(nid, N_MEMORY) && !node_state(nid, N_MEMORY_PRIVATE)) { node_arg.nid = nid; ret = node_notify(NODE_ADDING_FIRST_MEMORY, &node_arg); ret = notifier_to_errno(ret); @@ -1208,8 +1208,12 @@ int online_pages(unsigned long pfn, unsigned long nr_pages, online_pages_range(pfn, nr_pages); adjust_present_page_count(pfn_to_page(pfn), group, nr_pages); - if (node_arg.nid >= 0) - node_set_state(nid, N_MEMORY); + if (node_arg.nid >= 0) { + if (pgdat_is_private(NODE_DATA(nid))) + node_set_state(nid, N_MEMORY_PRIVATE); + else + node_set_state(nid, N_MEMORY); + } if (need_zonelists_rebuild) build_all_zonelists(NULL); @@ -1227,8 +1231,14 @@ int online_pages(unsigned long pfn, unsigned long nr_pages, /* reinitialise watermarks and update pcp limits */ init_per_zone_wmark_min(); - kswapd_run(nid); - kcompactd_run(nid); + /* + * Don't start 
reclaim/compaction daemons for private nodes. + * Private node services will decide whether to start these services. + */ + if (!pgdat_is_private(NODE_DATA(nid))) { + kswapd_run(nid); + kcompactd_run(nid); + } if (node_arg.nid >= 0) /* First memory added successfully. Notify consumers. */ @@ -1722,6 +1732,54 @@ int add_memory_driver_managed(int nid, u64 start, u64 size, } EXPORT_SYMBOL_GPL(add_memory_driver_managed); +/** + * add_private_memory_driver_managed - add driver-managed N_MEMORY_PRIVATE memory + * @nid: NUMA node ID (or memory group ID when MHP_NID_IS_MGID is set) + * @start: Start physical address + * @size: Size in bytes + * @resource_name: "System RAM ($DRIVER)" format + * @mhp_flags: Memory hotplug flags + * @online_type: MMOP_* online type + * @np: Driver-owned node_private structure (owner, refcount) + * + * Registers node_private first, then hotplugs the memory. + * + * On failure, unregisters the node_private. + */ +int add_private_memory_driver_managed(int nid, u64 start, u64 size, + const char *resource_name, + mhp_t mhp_flags, enum mmop online_type, + struct node_private *np) +{ + struct memory_group *group; + int real_nid = nid; + int rc; + + if (!np) + return -EINVAL; + + if (mhp_flags & MHP_NID_IS_MGID) { + group = memory_group_find_by_id(nid); + if (!group) + return -EINVAL; + real_nid = group->nid; + } + + rc = node_private_register(real_nid, np); + if (rc) + return rc; + + rc = __add_memory_driver_managed(nid, start, size, resource_name, + mhp_flags, online_type); + if (rc) { + node_private_unregister(real_nid); + return rc; + } + + return 0; +} +EXPORT_SYMBOL_GPL(add_private_memory_driver_managed); + /* * Platforms should define arch_get_mappable_range() that provides * maximum possible addressable physical memory range for which the @@ -1872,6 +1930,15 @@ static void do_migrate_range(unsigned long start_pfn, unsigned long end_pfn) goto put_folio; } + /* Private nodes w/o migration must ensure folios are offline */ + if 
(folio_is_private_node(folio) && + !folio_private_flags(folio, NP_OPS_MIGRATION)) { + WARN_ONCE(1, "hot-unplug on non-migratable node %d pfn %lx\n", + folio_nid(folio), pfn); + pfn = folio_pfn(folio) + folio_nr_pages(folio) - 1; + goto put_folio; + } + if (!isolate_folio_to_list(folio, &source)) { if (__ratelimit(&migrate_rs)) { pr_warn("failed to isolate pfn %lx\n", @@ -2014,8 +2081,8 @@ int offline_pages(unsigned long start_pfn, unsigned long nr_pages, /* * Check whether the node will have no present pages after we offline - * 'nr_pages' more. If so, we know that the node will become empty, and - * so we will clear N_MEMORY for it. + * 'nr_pages' more. If so, send pre-notification for last memory removal. + * We will clear N_MEMORY(_PRIVATE) if this is the case. */ if (nr_pages >= pgdat->node_present_pages) { node_arg.nid = node; @@ -2108,8 +2175,12 @@ int offline_pages(unsigned long start_pfn, unsigned long nr_pages, * Make sure to mark the node as memory-less before rebuilding the zone * list. Otherwise this node would still appear in the fallback lists. */ - if (node_arg.nid >= 0) - node_clear_state(node, N_MEMORY); + if (node_arg.nid >= 0) { + if (node_state(node, N_MEMORY)) + node_clear_state(node, N_MEMORY); + else if (node_state(node, N_MEMORY_PRIVATE)) + node_clear_state(node, N_MEMORY_PRIVATE); + } if (!populated_zone(zone)) { zone_pcp_reset(zone); build_all_zonelists(NULL); @@ -2461,4 +2532,35 @@ int offline_and_remove_memory(u64 start, u64 size) return rc; } EXPORT_SYMBOL_GPL(offline_and_remove_memory); + +/** + * offline_and_remove_private_memory - offline, remove, and unregister private memory + * @nid: NUMA node ID of the private memory + * @start: Start physical address + * @size: Size in bytes + * + * Counterpart to add_private_memory_driver_managed(). Offlines and removes + * the memory range, then attempts to unregister the node_private. 
+ * + * offline_and_remove_memory() clears N_MEMORY_PRIVATE when the last block + * is offlined, which allows node_private_unregister() to clear the + * pgdat->node_private pointer. If other private memory ranges remain on + * the node, node_private_unregister() returns -EBUSY (N_MEMORY_PRIVATE + * is still set) and the node_private remains registered. + * + * Return: 0 on full success (memory removed and node_private unregistered), + * -EBUSY if memory was removed but node still has other private memory, + * other negative error code if offline/remove failed. + */ +int offline_and_remove_private_memory(int nid, u64 start, u64 size) +{ + int rc; + + rc = offline_and_remove_memory(start, size); + if (rc) + return rc; + + return node_private_unregister(nid); +} +EXPORT_SYMBOL_GPL(offline_and_remove_private_memory); #endif /* CONFIG_MEMORY_HOTREMOVE */ -- 2.53.0
