On Fri, Apr 16, 2021 at 12:51:31PM +0200, David Hildenbrand wrote:
> The thing is: move_pfn_range_to_zone() in case of ordinary online_pages()
> won't touch the pages but only the memmap. The memmap has a proper kasan
> shadow already. Pages won't be touched before exposing them to the page
> allocator via generic_online_pages().
> 
> This is different in this case :)

Meh, yeah, I missed that.

My brain was already in Friday mode, but this should be the complete diff then:


diff --git a/drivers/base/memory.c b/drivers/base/memory.c
index f209925a5d4e..2e2b2f654f0a 100644
--- a/drivers/base/memory.c
+++ b/drivers/base/memory.c
@@ -173,16 +173,72 @@ static int memory_block_online(struct memory_block *mem)
 {
        unsigned long start_pfn = section_nr_to_pfn(mem->start_section_nr);
        unsigned long nr_pages = PAGES_PER_SECTION * sections_per_block;
+       unsigned long nr_vmemmap_pages = mem->nr_vmemmap_pages;
+       struct zone *zone;
+       int ret;
+
+       zone = zone_for_pfn_range(mem->online_type, mem->nid, start_pfn, nr_pages);
+
+       /*
+        * Although vmemmap pages have a different lifecycle than the pages
+        * they describe (they remain until the memory is unplugged), doing
+        * their initialization and accounting at memory onlining/offlining
+        * stage simplifies things a lot.
+        */
+       if (nr_vmemmap_pages) {
+               ret = mhp_init_memmap_on_memory(start_pfn, nr_vmemmap_pages, zone);
+               if (ret)
+                       return ret;
+       }
+
+       ret = online_pages(start_pfn + nr_vmemmap_pages,
+                          nr_pages - nr_vmemmap_pages, zone);
+       if (ret) {
+               if (nr_vmemmap_pages)
+                       mhp_deinit_memmap_on_memory(start_pfn, nr_vmemmap_pages);
+               return ret;
+       }
+
+       /*
+        * Account once onlining succeeded. If the zone was unpopulated, it is
+        * now already properly populated.
+        */
+       if (nr_vmemmap_pages)
+               adjust_present_page_count(zone, nr_vmemmap_pages);

-       return online_pages(start_pfn, nr_pages, mem->online_type, mem->nid);
+       return ret;
 }

 static int memory_block_offline(struct memory_block *mem)
 {
        unsigned long start_pfn = section_nr_to_pfn(mem->start_section_nr);
        unsigned long nr_pages = PAGES_PER_SECTION * sections_per_block;
+       unsigned long nr_vmemmap_pages = mem->nr_vmemmap_pages;
+       struct zone *zone;
+       int ret;
+
+       zone = page_zone(pfn_to_page(start_pfn));
+
+       /*
+        * Unaccount before offlining, such that unpopulated zone and kthreads
+        * can properly be torn down in offline_pages().
+        */
+       if (nr_vmemmap_pages)
+               adjust_present_page_count(zone, -nr_vmemmap_pages);

-       return offline_pages(start_pfn, nr_pages);
+       ret = offline_pages(start_pfn + nr_vmemmap_pages,
+                           nr_pages - nr_vmemmap_pages);
+       if (ret) {
+               /* offline_pages() failed. Account back. */
+               if (nr_vmemmap_pages)
+                       adjust_present_page_count(zone, nr_vmemmap_pages);
+               return ret;
+       }
+
+       if (nr_vmemmap_pages)
+               mhp_deinit_memmap_on_memory(start_pfn, nr_vmemmap_pages);
+
+       return ret;
 }

 /*
@@ -576,7 +632,8 @@ int register_memory(struct memory_block *memory)
        return ret;
 }

-static int init_memory_block(unsigned long block_id, unsigned long state)
+static int init_memory_block(unsigned long block_id, unsigned long state,
+                            unsigned long nr_vmemmap_pages)
 {
        struct memory_block *mem;
        int ret = 0;
@@ -593,6 +650,7 @@ static int init_memory_block(unsigned long block_id, unsigned long state)
        mem->start_section_nr = block_id * sections_per_block;
        mem->state = state;
        mem->nid = NUMA_NO_NODE;
+       mem->nr_vmemmap_pages = nr_vmemmap_pages;

        ret = register_memory(mem);

@@ -612,7 +670,7 @@ static int add_memory_block(unsigned long base_section_nr)
        if (section_count == 0)
                return 0;
        return init_memory_block(memory_block_id(base_section_nr),
-                                MEM_ONLINE);
+                                MEM_ONLINE, 0);
 }

 static void unregister_memory(struct memory_block *memory)
@@ -634,7 +692,8 @@ static void unregister_memory(struct memory_block *memory)
  *
  * Called under device_hotplug_lock.
  */
-int create_memory_block_devices(unsigned long start, unsigned long size)
+int create_memory_block_devices(unsigned long start, unsigned long size,
+                               unsigned long vmemmap_pages)
 {
        const unsigned long start_block_id = pfn_to_block_id(PFN_DOWN(start));
        unsigned long end_block_id = pfn_to_block_id(PFN_DOWN(start + size));
@@ -647,7 +706,7 @@ int create_memory_block_devices(unsigned long start, unsigned long size)
                return -EINVAL;

        for (block_id = start_block_id; block_id != end_block_id; block_id++) {
-               ret = init_memory_block(block_id, MEM_OFFLINE);
+               ret = init_memory_block(block_id, MEM_OFFLINE, vmemmap_pages);
                if (ret)
                        break;
        }
diff --git a/include/linux/memory.h b/include/linux/memory.h
index 4da95e684e20..97e92e8b556a 100644
--- a/include/linux/memory.h
+++ b/include/linux/memory.h
@@ -29,6 +29,11 @@ struct memory_block {
        int online_type;                /* for passing data to online routine */
        int nid;                        /* NID for this memory block */
        struct device dev;
+       /*
+        * Number of vmemmap pages. These pages
+        * lie at the beginning of the memory block.
+        */
+       unsigned long nr_vmemmap_pages;
 };

 int arch_get_memory_phys_device(unsigned long start_pfn);
@@ -80,7 +85,8 @@ static inline int memory_notify(unsigned long val, void *v)
 #else
 extern int register_memory_notifier(struct notifier_block *nb);
 extern void unregister_memory_notifier(struct notifier_block *nb);
-int create_memory_block_devices(unsigned long start, unsigned long size);
+int create_memory_block_devices(unsigned long start, unsigned long size,
+                               unsigned long vmemmap_pages);
 void remove_memory_block_devices(unsigned long start, unsigned long size);
 extern void memory_dev_init(void);
 extern int memory_notify(unsigned long val, void *v);
diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h
index 7288aa5ef73b..28f32fd00fe9 100644
--- a/include/linux/memory_hotplug.h
+++ b/include/linux/memory_hotplug.h
@@ -55,6 +55,14 @@ typedef int __bitwise mhp_t;
  */
 #define MHP_MERGE_RESOURCE     ((__force mhp_t)BIT(0))

+/*
+ * We want memmap (struct page array) to be self contained.
+ * To do so, we will use the beginning of the hot-added range to build
+ * the page tables for the memmap array that describes the entire range.
+ * Only selected architectures support it with SPARSE_VMEMMAP.
+ */
+#define MHP_MEMMAP_ON_MEMORY   ((__force mhp_t)BIT(1))
+
 /*
  * Extended parameters for memory hotplug:
  * altmap: alternative allocator for memmap array (optional)
@@ -99,9 +107,13 @@ static inline void zone_seqlock_init(struct zone *zone)
 extern int zone_grow_free_lists(struct zone *zone, unsigned long new_nr_pages);
 extern int zone_grow_waitqueues(struct zone *zone, unsigned long nr_pages);
 extern int add_one_highpage(struct page *page, int pfn, int bad_ppro);
+extern void adjust_present_page_count(struct zone *zone, long nr_pages);
 /* VM interface that may be used by firmware interface */
+extern int mhp_init_memmap_on_memory(unsigned long pfn, unsigned long nr_pages,
+                                    struct zone *zone);
+extern void mhp_deinit_memmap_on_memory(unsigned long pfn, unsigned long nr_pages);
 extern int online_pages(unsigned long pfn, unsigned long nr_pages,
-                       int online_type, int nid);
+                       struct zone *zone);
 extern struct zone *test_pages_in_a_zone(unsigned long start_pfn,
                                         unsigned long end_pfn);
 extern void __offline_isolated_pages(unsigned long start_pfn,
@@ -359,6 +371,7 @@ extern struct zone *zone_for_pfn_range(int online_type, int nid, unsigned start_
 extern int arch_create_linear_mapping(int nid, u64 start, u64 size,
                                      struct mhp_params *params);
 void arch_remove_linear_mapping(u64 start, u64 size);
+extern bool mhp_supports_memmap_on_memory(unsigned long size);
 #endif /* CONFIG_MEMORY_HOTPLUG */

 #endif /* __LINUX_MEMORY_HOTPLUG_H */
diff --git a/include/linux/memremap.h b/include/linux/memremap.h
index f5b464daeeca..45a79da89c5f 100644
--- a/include/linux/memremap.h
+++ b/include/linux/memremap.h
@@ -17,7 +17,7 @@ struct device;
  * @alloc: track pages consumed, private to vmemmap_populate()
  */
 struct vmem_altmap {
-       const unsigned long base_pfn;
+       unsigned long base_pfn;
        const unsigned long end_pfn;
        const unsigned long reserve;
        unsigned long free;
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 47946cec7584..76f4ca5ed230 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -427,6 +427,11 @@ enum zone_type {
         *    techniques might use alloc_contig_range() to hide previously
         *    exposed pages from the buddy again (e.g., to implement some sort
         *    of memory unplug in virtio-mem).
+        * 6. Memory-hotplug: when using memmap_on_memory and onlining the memory
+        *    to the MOVABLE zone, the vmemmap pages are also placed in such
+        *    zone. Such pages cannot be really moved around as they are
+        *    self-stored in the range, but they are treated as movable when
+        *    the range they describe is about to be offlined.
         *
         * In general, no unmovable allocations that degrade memory offlining
         * should end up in ZONE_MOVABLE. Allocators (like alloc_contig_range())
@@ -1378,10 +1383,8 @@ static inline int online_section_nr(unsigned long nr)

 #ifdef CONFIG_MEMORY_HOTPLUG
 void online_mem_sections(unsigned long start_pfn, unsigned long end_pfn);
-#ifdef CONFIG_MEMORY_HOTREMOVE
 void offline_mem_sections(unsigned long start_pfn, unsigned long end_pfn);
 #endif
-#endif

 static inline struct mem_section *__pfn_to_section(unsigned long pfn)
 {
diff --git a/mm/Kconfig b/mm/Kconfig
index 24c045b24b95..febf805000f8 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -183,6 +183,11 @@ config MEMORY_HOTREMOVE
        depends on MEMORY_HOTPLUG && ARCH_ENABLE_MEMORY_HOTREMOVE
        depends on MIGRATION

+config MHP_MEMMAP_ON_MEMORY
+       def_bool y
+       depends on MEMORY_HOTPLUG && SPARSEMEM_VMEMMAP
+       depends on ARCH_MHP_MEMMAP_ON_MEMORY_ENABLE
+
 # Heavily threaded applications may benefit from splitting the mm-wide
 # page_table_lock, so that faults on different parts of the user address
 # space can be handled with less contention: split it at this NR_CPUS.
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index d05056b3c173..489e2b538ff8 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -42,6 +42,8 @@
 #include "internal.h"
 #include "shuffle.h"

+static bool memmap_on_memory;
+
 /*
  * online_page_callback contains pointer to current page onlining function.
  * Initially it is generic_online_page(). If it is required it could be
@@ -641,7 +643,12 @@ EXPORT_SYMBOL_GPL(generic_online_page);
 static void online_pages_range(unsigned long start_pfn, unsigned long nr_pages)
 {
        const unsigned long end_pfn = start_pfn + nr_pages;
-       unsigned long pfn;
+       unsigned long pfn = start_pfn;
+
+       while (!IS_ALIGNED(pfn, MAX_ORDER_NR_PAGES)) {
+               (*online_page_callback)(pfn_to_page(pfn), pageblock_order);
+               pfn += pageblock_nr_pages;
+       }

        /*
         * Online the pages in MAX_ORDER - 1 aligned chunks. The callback might
@@ -649,7 +656,7 @@ static void online_pages_range(unsigned long start_pfn, unsigned long nr_pages)
         * later). We account all pages as being online and belonging to this
         * zone ("present").
         */
-       for (pfn = start_pfn; pfn < end_pfn; pfn += MAX_ORDER_NR_PAGES)
+       for (; pfn < end_pfn; pfn += MAX_ORDER_NR_PAGES)
                (*online_page_callback)(pfn_to_page(pfn), MAX_ORDER - 1);

        /* mark all involved sections as online */
@@ -829,7 +836,11 @@ struct zone * zone_for_pfn_range(int online_type, int nid, unsigned start_pfn,
        return default_zone_for_pfn(nid, start_pfn, nr_pages);
 }

-static void adjust_present_page_count(struct zone *zone, long nr_pages)
+/*
+ * This function should only be called by memory_block_{online,offline},
+ * and {online,offline}_pages.
+ */
+void adjust_present_page_count(struct zone *zone, long nr_pages)
 {
        unsigned long flags;

@@ -839,12 +850,56 @@ static void adjust_present_page_count(struct zone *zone, long nr_pages)
        pgdat_resize_unlock(zone->zone_pgdat, &flags);
 }

-int __ref online_pages(unsigned long pfn, unsigned long nr_pages,
-                      int online_type, int nid)
+int mhp_init_memmap_on_memory(unsigned long pfn, unsigned long nr_pages,
+                             struct zone *zone)
+{
+       unsigned long end_pfn = pfn + nr_pages;
+       int ret;
+
+       ret = kasan_add_zero_shadow(__va(PFN_PHYS(pfn)), PFN_PHYS(nr_pages));
+       if (ret)
+               return ret;
+
+       /*
+        * Initialize vmemmap pages with the corresponding node, zone links set.
+        */
+       move_pfn_range_to_zone(zone, pfn, nr_pages, NULL, MIGRATE_UNMOVABLE);
+
+       /*
+        * It might be that the vmemmap_pages fully span sections. If that is
+        * the case, mark those sections online here as otherwise they will be
+        * left offline.
+        */
+       if (nr_pages >= PAGES_PER_SECTION)
+               online_mem_sections(pfn, ALIGN_DOWN(end_pfn, PAGES_PER_SECTION));
+
+       return ret;
+}
+
+void mhp_deinit_memmap_on_memory(unsigned long pfn, unsigned long nr_pages)
+{
+       unsigned long end_pfn = pfn + nr_pages;
+       /*
+        * The pages associated with this vmemmap have been offlined, so
+        * we can reset its state here.
+        */
+       remove_pfn_range_from_zone(page_zone(pfn_to_page(pfn)), pfn, nr_pages);
+       kasan_remove_zero_shadow(__va(PFN_PHYS(pfn)), PFN_PHYS(nr_pages));
+
+       /*
+        * It might be that the vmemmap_pages fully span sections. If that is
+        * the case, mark those sections offline here as otherwise they will be
+        * left online.
+        */
+       if (nr_pages >= PAGES_PER_SECTION)
+               offline_mem_sections(pfn, ALIGN_DOWN(end_pfn, PAGES_PER_SECTION));
+}
+
+int __ref online_pages(unsigned long pfn, unsigned long nr_pages, struct zone *zone)
 {
        unsigned long flags;
-       struct zone *zone;
        int need_zonelists_rebuild = 0;
+       const int nid = zone_to_nid(zone);
        int ret;
        struct memory_notify arg;

@@ -861,7 +916,6 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages,
        mem_hotplug_begin();

        /* associate pfn range with the zone */
-       zone = zone_for_pfn_range(online_type, nid, pfn, nr_pages);
        move_pfn_range_to_zone(zone, pfn, nr_pages, NULL, MIGRATE_ISOLATE);

        arg.start_pfn = pfn;
@@ -1075,6 +1129,45 @@ static int online_memory_block(struct memory_block *mem, void *arg)
        return device_online(&mem->dev);
 }

+bool mhp_supports_memmap_on_memory(unsigned long size)
+{
+       unsigned long nr_vmemmap_pages = size / PAGE_SIZE;
+       unsigned long vmemmap_size = nr_vmemmap_pages * sizeof(struct page);
+       unsigned long remaining_size = size - vmemmap_size;
+
+       /*
+        * Besides having arch support and the feature enabled at runtime, we
+        * need a few more assumptions to hold true:
+        *
+        * a) We span a single memory block: memory onlining/offlining happens
+        *    in memory block granularity. We don't want the vmemmap of online
+        *    memory blocks to reside on offline memory blocks. In the future,
+        *    we might want to support variable-sized memory blocks to make the
+        *    feature more versatile.
+        *
+        * b) The vmemmap pages span complete PMDs: We don't want vmemmap code
+        *    to populate memory from the altmap for unrelated parts (i.e.,
+        *    other memory blocks)
+        *
+        * c) The vmemmap pages (and thereby the pages that will be exposed to
+        *    the buddy) have to cover full pageblocks: memory onlining/offlining
+        *    code requires applicable ranges to be page-aligned, for example, to
+        *    set the migratetypes properly.
+        *
+        * TODO: Although we have a check here to make sure that vmemmap pages
+        *       fully populate a PMD, it is not the right place to check for
+        *       this. A much better solution involves improving vmemmap code
+        *       to fallback to base pages when trying to populate vmemmap using
+        *       altmap as an alternative source of memory, and we do not exactly
+        *       populate a single PMD.
+        */
+       return memmap_on_memory &&
+              IS_ENABLED(CONFIG_MHP_MEMMAP_ON_MEMORY) &&
+              size == memory_block_size_bytes() &&
+              IS_ALIGNED(vmemmap_size, PMD_SIZE) &&
+              IS_ALIGNED(remaining_size, (pageblock_nr_pages << PAGE_SHIFT));
+}
+
 /*
  * NOTE: The caller must call lock_device_hotplug() to serialize hotplug
  * and online/offline operations (triggered e.g. by sysfs).
@@ -1084,6 +1177,7 @@ static int online_memory_block(struct memory_block *mem, void *arg)
 int __ref add_memory_resource(int nid, struct resource *res, mhp_t mhp_flags)
 {
        struct mhp_params params = { .pgprot = pgprot_mhp(PAGE_KERNEL) };
+       struct vmem_altmap mhp_altmap = {};
        u64 start, size;
        bool new_node = false;
        int ret;
@@ -1110,13 +1204,26 @@ int __ref add_memory_resource(int nid, struct resource *res, mhp_t mhp_flags)
                goto error;
        new_node = ret;

+       /*
+        * Self hosted memmap array
+        */
+       if (mhp_flags & MHP_MEMMAP_ON_MEMORY) {
+               if (!mhp_supports_memmap_on_memory(size)) {
+                       ret = -EINVAL;
+                       goto error;
+               }
+               mhp_altmap.free = PHYS_PFN(size);
+               mhp_altmap.base_pfn = PHYS_PFN(start);
+               params.altmap = &mhp_altmap;
+       }
+
        /* call arch's memory hotadd */
        ret = arch_add_memory(nid, start, size, &params);
        if (ret < 0)
                goto error;

        /* create memory block devices after memory was added */
-       ret = create_memory_block_devices(start, size);
+       ret = create_memory_block_devices(start, size, mhp_altmap.alloc);
        if (ret) {
                arch_remove_memory(nid, start, size, NULL);
                goto error;
@@ -1762,6 +1869,14 @@ static int check_memblock_offlined_cb(struct memory_block *mem, void *arg)
        return 0;
 }

+static int get_nr_vmemmap_pages_cb(struct memory_block *mem, void *arg)
+{
+       /*
+        * If not set, continue with the next block.
+        */
+       return mem->nr_vmemmap_pages;
+}
+
 static int check_cpu_on_node(pg_data_t *pgdat)
 {
        int cpu;
@@ -1836,6 +1951,9 @@ EXPORT_SYMBOL(try_offline_node);
 static int __ref try_remove_memory(int nid, u64 start, u64 size)
 {
        int rc = 0;
+       struct vmem_altmap mhp_altmap = {};
+       struct vmem_altmap *altmap = NULL;
+       unsigned long nr_vmemmap_pages;

        BUG_ON(check_hotplug_memory_range(start, size));

@@ -1848,6 +1966,31 @@ static int __ref try_remove_memory(int nid, u64 start, u64 size)
        if (rc)
                return rc;

+       /*
+        * We only support removing memory added with MHP_MEMMAP_ON_MEMORY in
+        * the same granularity it was added - a single memory block.
+        */
+       if (memmap_on_memory) {
+               nr_vmemmap_pages = walk_memory_blocks(start, size, NULL,
+                                                     get_nr_vmemmap_pages_cb);
+               if (nr_vmemmap_pages) {
+                       if (size != memory_block_size_bytes()) {
+                               pr_warn("Refuse to remove %#llx - %#llx, "
+                                       "wrong granularity\n",
+                                       start, start + size);
+                               return -EINVAL;
+                       }
+
+                       /*
+                        * Let remove_pmd_table->free_hugepage_table do the
+                        * right thing if we used vmem_altmap when hot-adding
+                        * the range.
+                        */
+                       mhp_altmap.alloc = nr_vmemmap_pages;
+                       altmap = &mhp_altmap;
+               }
+       }
+
        /* remove memmap entry */
        firmware_map_remove(start, start + size, "System RAM");

@@ -1859,7 +2002,7 @@ static int __ref try_remove_memory(int nid, u64 start, u64 size)

        mem_hotplug_begin();

-       arch_remove_memory(nid, start, size, NULL);
+       arch_remove_memory(nid, start, size, altmap);

        if (IS_ENABLED(CONFIG_ARCH_KEEP_MEMBLOCK)) {
                memblock_free(start, size);
diff --git a/mm/sparse.c b/mm/sparse.c
index 7bd23f9d6cef..8e96cf00536b 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
@@ -623,7 +623,6 @@ void online_mem_sections(unsigned long start_pfn, unsigned long end_pfn)
        }
 }

-#ifdef CONFIG_MEMORY_HOTREMOVE
 /* Mark all memory sections within the pfn range as offline */
 void offline_mem_sections(unsigned long start_pfn, unsigned long end_pfn)
 {
@@ -644,7 +643,6 @@ void offline_mem_sections(unsigned long start_pfn, unsigned long end_pfn)
                ms->section_mem_map &= ~SECTION_IS_ONLINE;
        }
 }
-#endif

 #ifdef CONFIG_SPARSEMEM_VMEMMAP
 static struct page * __meminit populate_section_memmap(unsigned long pfn,
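
A side note on how the checks in mhp_supports_memmap_on_memory() work out
numerically: the sketch below is a standalone userspace program, not kernel
code, and it assumes typical x86-64 values (4 KiB base pages, 64-byte
struct page, 2 MiB PMDs and pageblocks, 128 MiB memory blocks).

/*
 * Standalone sketch mirroring the size checks in
 * mhp_supports_memmap_on_memory(), with assumed x86-64 constants.
 */
#include <stdio.h>

#define PAGE_SIZE_B          (4UL << 10)    /* assumed 4 KiB base pages */
#define STRUCT_PAGE_SIZE     64UL           /* assumed sizeof(struct page) */
#define PMD_SIZE_B           (2UL << 20)    /* assumed 2 MiB PMDs */
#define PAGEBLOCK_SIZE_B     (2UL << 20)    /* pageblock_nr_pages << PAGE_SHIFT */
#define MEMORY_BLOCK_SIZE_B  (128UL << 20)  /* typical x86-64 memory block */

int main(void)
{
        unsigned long size = MEMORY_BLOCK_SIZE_B;
        unsigned long nr_pages = size / PAGE_SIZE_B;                /* 32768 */
        unsigned long vmemmap_size = nr_pages * STRUCT_PAGE_SIZE;   /* 2 MiB */
        unsigned long remaining_size = size - vmemmap_size;         /* 126 MiB */

        printf("vmemmap_size = %lu MiB, spans %lu PMD(s), %lu pageblock(s)\n",
               vmemmap_size >> 20, vmemmap_size / PMD_SIZE_B,
               vmemmap_size / PAGEBLOCK_SIZE_B);
        printf("vmemmap PMD-aligned:         %s\n",
               (vmemmap_size % PMD_SIZE_B) ? "no" : "yes");
        printf("remaining pageblock-aligned: %s\n",
               (remaining_size % PAGEBLOCK_SIZE_B) ? "no" : "yes");
        return 0;
}

With those assumptions the vmemmap consumes exactly the first 2 MiB of the
128 MiB block (one PMD, one pageblock), and the remaining 126 MiB is what
gets handed to online_pages() and exposed to the buddy.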

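This diff introduces MHP_MEMMAP_ON_MEMORY and mhp_supports_memmap_on_memory()
but contains no user of the flag. For illustration only, a hypothetical caller
could look roughly like the sketch below; everything apart from
add_memory_resource(), MHP_NONE, MHP_MEMMAP_ON_MEMORY, resource_size() and
mhp_supports_memmap_on_memory() is made up.

/* Hypothetical hot-add path; not part of this patch. */
static int example_hot_add(int nid, struct resource *res)
{
        mhp_t mhp_flags = MHP_NONE;

        /*
         * Only request the self-hosted memmap when the range qualifies;
         * passing the flag for an unsupported range makes
         * add_memory_resource() return -EINVAL.
         */
        if (mhp_supports_memmap_on_memory(resource_size(res)))
                mhp_flags |= MHP_MEMMAP_ON_MEMORY;

        /* The caller must hold device_hotplug_lock, as usual. */
        return add_memory_resource(nid, res, mhp_flags);
}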
 

-- 
Oscar Salvador
SUSE L3
