Towards enabling memory hotplug to track partial population of a
section, introduce 'struct mem_section_usage'.

A pointer to a 'struct mem_section_usage' instance replaces the existing
pointer to a 'pageblock_flags' bitmap. Effectively it adds one more
'unsigned long' beyond the 'pageblock_flags' (usemap) allocation to
house a new 'subsection_map' bitmap.  The new bitmap enables the memory
hot{plug,remove} implementation to act on incremental sub-divisions of a
section.

The default SUBSECTION_SHIFT is chosen to keep the 'subsection_map' no
larger than a single 'unsigned long' on the major architectures.
Alternatively, an architecture can define ARCH_SUBSECTION_SHIFT to
override the default of PMD_SHIFT. Note that PowerPC needs to use
ARCH_SUBSECTION_SHIFT to work around PMD_SHIFT being a non-constant
expression on that architecture.

The primary motivation for this functionality is to support platforms
that mix "System RAM" and "Persistent Memory" within a single section,
or multiple PMEM ranges with different mapping lifetimes within a single
section. The section restriction for hotplug has caused an ongoing saga
of hacks and bugs for devm_memremap_pages() users.

Beyond the fixups to teach existing paths how to retrieve the 'usemap'
from a section, and updates to the usemap allocation path, there are no
expected behavior changes.

Cc: Michal Hocko <mho...@suse.com>
Cc: Vlastimil Babka <vba...@suse.cz>
Cc: Logan Gunthorpe <log...@deltatee.com>
Cc: Oscar Salvador <osalva...@suse.de>
Cc: Pavel Tatashin <pasha.tatas...@soleen.com>
Cc: Benjamin Herrenschmidt <b...@kernel.crashing.org>
Cc: Paul Mackerras <pau...@samba.org>
Cc: Michael Ellerman <m...@ellerman.id.au>
Signed-off-by: Dan Williams <dan.j.willi...@intel.com>
---
 arch/powerpc/include/asm/sparsemem.h |    3 +
 include/linux/mmzone.h               |   48 +++++++++++++++++++-
 mm/memory_hotplug.c                  |   18 ++++----
 mm/page_alloc.c                      |    2 -
 mm/sparse.c                          |   81 +++++++++++++++++-----------------
 5 files changed, 99 insertions(+), 53 deletions(-)

diff --git a/arch/powerpc/include/asm/sparsemem.h b/arch/powerpc/include/asm/sparsemem.h
index 3192d454a733..1aa3c9303bf8 100644
--- a/arch/powerpc/include/asm/sparsemem.h
+++ b/arch/powerpc/include/asm/sparsemem.h
@@ -10,6 +10,9 @@
  */
 #define SECTION_SIZE_BITS       24
 
+/* Reflect the largest possible PMD-size as the subsection-size constant */
+#define ARCH_SUBSECTION_SHIFT 24
+
 #endif /* CONFIG_SPARSEMEM */
 
 #ifdef CONFIG_MEMORY_HOTPLUG
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 427b79c39b3c..ac163f2f274f 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -1161,6 +1161,44 @@ static inline unsigned long section_nr_to_pfn(unsigned long sec)
 #define SECTION_ALIGN_UP(pfn)  (((pfn) + PAGES_PER_SECTION - 1) & PAGE_SECTION_MASK)
 #define SECTION_ALIGN_DOWN(pfn)        ((pfn) & PAGE_SECTION_MASK)
 
+/*
+ * SUBSECTION_SHIFT must be constant since it is used to declare
+ * subsection_map and related bitmaps without triggering the generation
+ * of variable-length arrays. The most natural size for a subsection is
+ * a PMD-page. For architectures that do not have a constant PMD-size
+ * ARCH_SUBSECTION_SHIFT can be set to a constant max size, or otherwise
+ * fall back to 2MB.
+ */
+#if defined(ARCH_SUBSECTION_SHIFT)
+#define SUBSECTION_SHIFT (ARCH_SUBSECTION_SHIFT)
+#elif defined(PMD_SHIFT)
+#define SUBSECTION_SHIFT (PMD_SHIFT)
+#else
+/*
+ * Memory hotplug enabled platforms avoid this default because they
+ * either define ARCH_SUBSECTION_SHIFT, or PMD_SHIFT is a constant, but
+ * this is kept as a backstop to allow compilation on
+ * !ARCH_ENABLE_MEMORY_HOTPLUG archs.
+ */
+#define SUBSECTION_SHIFT 21
+#endif
+
+#define PFN_SUBSECTION_SHIFT (SUBSECTION_SHIFT - PAGE_SHIFT)
+#define PAGES_PER_SUBSECTION (1UL << PFN_SUBSECTION_SHIFT)
+#define PAGE_SUBSECTION_MASK ((~(PAGES_PER_SUBSECTION-1)))
+
+#if SUBSECTION_SHIFT > SECTION_SIZE_BITS
+#error Subsection size exceeds section size
+#else
+#define SUBSECTIONS_PER_SECTION (1UL << (SECTION_SIZE_BITS - SUBSECTION_SHIFT))
+#endif
+
+struct mem_section_usage {
+       DECLARE_BITMAP(subsection_map, SUBSECTIONS_PER_SECTION);
+       /* See declaration of similar field in struct zone */
+       unsigned long pageblock_flags[0];
+};
+
 struct page;
 struct page_ext;
 struct mem_section {
@@ -1178,8 +1216,7 @@ struct mem_section {
         */
        unsigned long section_mem_map;
 
-       /* See declaration of similar field in struct zone */
-       unsigned long *pageblock_flags;
+       struct mem_section_usage *usage;
 #ifdef CONFIG_PAGE_EXTENSION
        /*
         * If SPARSEMEM, pgdat doesn't have page_ext pointer. We use
@@ -1210,6 +1247,11 @@ extern struct mem_section **mem_section;
 extern struct mem_section mem_section[NR_SECTION_ROOTS][SECTIONS_PER_ROOT];
 #endif
 
+static inline unsigned long *section_to_usemap(struct mem_section *ms)
+{
+       return ms->usage->pageblock_flags;
+}
+
 static inline struct mem_section *__nr_to_section(unsigned long nr)
 {
 #ifdef CONFIG_SPARSEMEM_EXTREME
@@ -1221,7 +1263,7 @@ static inline struct mem_section *__nr_to_section(unsigned long nr)
        return &mem_section[SECTION_NR_TO_ROOT(nr)][nr & SECTION_ROOT_MASK];
 }
 extern int __section_nr(struct mem_section* ms);
-extern unsigned long usemap_size(void);
+extern size_t mem_section_usage_size(void);
 
 /*
  * We use the lower bits of the mem_map pointer to store
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index a88c5f334e5a..7b963c2d3a0d 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -166,9 +166,10 @@ void put_page_bootmem(struct page *page)
 #ifndef CONFIG_SPARSEMEM_VMEMMAP
 static void register_page_bootmem_info_section(unsigned long start_pfn)
 {
-       unsigned long *usemap, mapsize, section_nr, i;
+       unsigned long mapsize, section_nr, i;
        struct mem_section *ms;
        struct page *page, *memmap;
+       struct mem_section_usage *usage;
 
        section_nr = pfn_to_section_nr(start_pfn);
        ms = __nr_to_section(section_nr);
@@ -188,10 +189,10 @@ static void register_page_bootmem_info_section(unsigned long start_pfn)
        for (i = 0; i < mapsize; i++, page++)
                get_page_bootmem(section_nr, page, SECTION_INFO);
 
-       usemap = ms->pageblock_flags;
-       page = virt_to_page(usemap);
+       usage = ms->usage;
+       page = virt_to_page(usage);
 
-       mapsize = PAGE_ALIGN(usemap_size()) >> PAGE_SHIFT;
+       mapsize = PAGE_ALIGN(mem_section_usage_size()) >> PAGE_SHIFT;
 
        for (i = 0; i < mapsize; i++, page++)
                get_page_bootmem(section_nr, page, MIX_SECTION_INFO);
@@ -200,9 +201,10 @@ static void register_page_bootmem_info_section(unsigned long start_pfn)
 #else /* CONFIG_SPARSEMEM_VMEMMAP */
 static void register_page_bootmem_info_section(unsigned long start_pfn)
 {
-       unsigned long *usemap, mapsize, section_nr, i;
+       unsigned long mapsize, section_nr, i;
        struct mem_section *ms;
        struct page *page, *memmap;
+       struct mem_section_usage *usage;
 
        section_nr = pfn_to_section_nr(start_pfn);
        ms = __nr_to_section(section_nr);
@@ -211,10 +213,10 @@ static void register_page_bootmem_info_section(unsigned long start_pfn)
 
        register_page_bootmem_memmap(section_nr, memmap, PAGES_PER_SECTION);
 
-       usemap = ms->pageblock_flags;
-       page = virt_to_page(usemap);
+       usage = ms->usage;
+       page = virt_to_page(usage);
 
-       mapsize = PAGE_ALIGN(usemap_size()) >> PAGE_SHIFT;
+       mapsize = PAGE_ALIGN(mem_section_usage_size()) >> PAGE_SHIFT;
 
        for (i = 0; i < mapsize; i++, page++)
                get_page_bootmem(section_nr, page, MIX_SECTION_INFO);
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index c061f66c2d0c..c6d8224d792e 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -404,7 +404,7 @@ static inline unsigned long *get_pageblock_bitmap(struct page *page,
                                                        unsigned long pfn)
 {
 #ifdef CONFIG_SPARSEMEM
-       return __pfn_to_section(pfn)->pageblock_flags;
+       return section_to_usemap(__pfn_to_section(pfn));
 #else
        return page_zone(page)->pageblock_flags;
 #endif /* CONFIG_SPARSEMEM */
diff --git a/mm/sparse.c b/mm/sparse.c
index 1552c855d62a..71da15cc7432 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
@@ -288,33 +288,31 @@ struct page *sparse_decode_mem_map(unsigned long coded_mem_map, unsigned long pn
 
 static void __meminit sparse_init_one_section(struct mem_section *ms,
                unsigned long pnum, struct page *mem_map,
-               unsigned long *pageblock_bitmap)
+               struct mem_section_usage *usage)
 {
        ms->section_mem_map &= ~SECTION_MAP_MASK;
        ms->section_mem_map |= sparse_encode_mem_map(mem_map, pnum) |
                                                        SECTION_HAS_MEM_MAP;
-       ms->pageblock_flags = pageblock_bitmap;
+       ms->usage = usage;
 }
 
-unsigned long usemap_size(void)
+static unsigned long usemap_size(void)
 {
        return BITS_TO_LONGS(SECTION_BLOCKFLAGS_BITS) * sizeof(unsigned long);
 }
 
-#ifdef CONFIG_MEMORY_HOTPLUG
-static unsigned long *__kmalloc_section_usemap(void)
+size_t mem_section_usage_size(void)
 {
-       return kmalloc(usemap_size(), GFP_KERNEL);
+       return sizeof(struct mem_section_usage) + usemap_size();
 }
-#endif /* CONFIG_MEMORY_HOTPLUG */
 
 #ifdef CONFIG_MEMORY_HOTREMOVE
-static unsigned long * __init
+static struct mem_section_usage * __init
 sparse_early_usemaps_alloc_pgdat_section(struct pglist_data *pgdat,
                                         unsigned long size)
 {
+       struct mem_section_usage *usage;
        unsigned long goal, limit;
-       unsigned long *p;
        int nid;
        /*
         * A page may contain usemaps for other sections preventing the
@@ -330,15 +328,16 @@ sparse_early_usemaps_alloc_pgdat_section(struct pglist_data *pgdat,
        limit = goal + (1UL << PA_SECTION_SHIFT);
        nid = early_pfn_to_nid(goal >> PAGE_SHIFT);
 again:
-       p = memblock_alloc_try_nid(size, SMP_CACHE_BYTES, goal, limit, nid);
-       if (!p && limit) {
+       usage = memblock_alloc_try_nid(size, SMP_CACHE_BYTES, goal, limit, nid);
+       if (!usage && limit) {
                limit = 0;
                goto again;
        }
-       return p;
+       return usage;
 }
 
-static void __init check_usemap_section_nr(int nid, unsigned long *usemap)
+static void __init check_usemap_section_nr(int nid,
+               struct mem_section_usage *usage)
 {
        unsigned long usemap_snr, pgdat_snr;
        static unsigned long old_usemap_snr;
@@ -352,7 +351,7 @@ static void __init check_usemap_section_nr(int nid, unsigned long *usemap)
                old_pgdat_snr = NR_MEM_SECTIONS;
        }
 
-       usemap_snr = pfn_to_section_nr(__pa(usemap) >> PAGE_SHIFT);
+       usemap_snr = pfn_to_section_nr(__pa(usage) >> PAGE_SHIFT);
        pgdat_snr = pfn_to_section_nr(__pa(pgdat) >> PAGE_SHIFT);
        if (usemap_snr == pgdat_snr)
                return;
@@ -380,14 +379,15 @@ static void __init check_usemap_section_nr(int nid, unsigned long *usemap)
                usemap_snr, pgdat_snr, nid);
 }
 #else
-static unsigned long * __init
+static struct mem_section_usage * __init
 sparse_early_usemaps_alloc_pgdat_section(struct pglist_data *pgdat,
                                         unsigned long size)
 {
        return memblock_alloc_node(size, SMP_CACHE_BYTES, pgdat->node_id);
 }
 
-static void __init check_usemap_section_nr(int nid, unsigned long *usemap)
+static void __init check_usemap_section_nr(int nid,
+               struct mem_section_usage *usage)
 {
 }
 #endif /* CONFIG_MEMORY_HOTREMOVE */
@@ -474,14 +474,13 @@ static void __init sparse_init_nid(int nid, unsigned long pnum_begin,
                                   unsigned long pnum_end,
                                   unsigned long map_count)
 {
-       unsigned long pnum, usemap_longs, *usemap;
+       struct mem_section_usage *usage;
+       unsigned long pnum;
        struct page *map;
 
-       usemap_longs = BITS_TO_LONGS(SECTION_BLOCKFLAGS_BITS);
-       usemap = sparse_early_usemaps_alloc_pgdat_section(NODE_DATA(nid),
-                                                         usemap_size() *
-                                                         map_count);
-       if (!usemap) {
+       usage = sparse_early_usemaps_alloc_pgdat_section(NODE_DATA(nid),
+                       mem_section_usage_size() * map_count);
+       if (!usage) {
                pr_err("%s: node[%d] usemap allocation failed", __func__, nid);
                goto failed;
        }
@@ -497,9 +496,9 @@ static void __init sparse_init_nid(int nid, unsigned long pnum_begin,
                        pnum_begin = pnum;
                        goto failed;
                }
-               check_usemap_section_nr(nid, usemap);
-               sparse_init_one_section(__nr_to_section(pnum), pnum, map, usemap);
-               usemap += usemap_longs;
+               check_usemap_section_nr(nid, usage);
+               sparse_init_one_section(__nr_to_section(pnum), pnum, map, usage);
+               usage = (void *) usage + mem_section_usage_size();
        }
        sparse_buffer_fini();
        return;
@@ -697,9 +696,9 @@ int __meminit sparse_add_one_section(int nid, unsigned long start_pfn,
                                     struct vmem_altmap *altmap)
 {
        unsigned long section_nr = pfn_to_section_nr(start_pfn);
+       struct mem_section_usage *usage;
        struct mem_section *ms;
        struct page *memmap;
-       unsigned long *usemap;
        int ret;
 
        /*
@@ -713,8 +712,8 @@ int __meminit sparse_add_one_section(int nid, unsigned long start_pfn,
        memmap = kmalloc_section_memmap(section_nr, nid, altmap);
        if (!memmap)
                return -ENOMEM;
-       usemap = __kmalloc_section_usemap();
-       if (!usemap) {
+       usage = kzalloc(mem_section_usage_size(), GFP_KERNEL);
+       if (!usage) {
                __kfree_section_memmap(memmap, altmap);
                return -ENOMEM;
        }
@@ -732,11 +731,11 @@ int __meminit sparse_add_one_section(int nid, unsigned long start_pfn,
        page_init_poison(memmap, sizeof(struct page) * PAGES_PER_SECTION);
 
        section_mark_present(ms);
-       sparse_init_one_section(ms, section_nr, memmap, usemap);
+       sparse_init_one_section(ms, section_nr, memmap, usage);
 
 out:
        if (ret < 0) {
-               kfree(usemap);
+               kfree(usage);
                __kfree_section_memmap(memmap, altmap);
        }
        return ret;
@@ -772,20 +771,20 @@ static inline void clear_hwpoisoned_pages(struct page *memmap, int nr_pages)
 }
 #endif
 
-static void free_section_usemap(struct page *memmap, unsigned long *usemap,
-               struct vmem_altmap *altmap)
+static void free_section_usage(struct page *memmap,
+               struct mem_section_usage *usage, struct vmem_altmap *altmap)
 {
-       struct page *usemap_page;
+       struct page *usage_page;
 
-       if (!usemap)
+       if (!usage)
                return;
 
-       usemap_page = virt_to_page(usemap);
+       usage_page = virt_to_page(usage);
        /*
         * Check to see if allocation came from hot-plug-add
         */
-       if (PageSlab(usemap_page) || PageCompound(usemap_page)) {
-               kfree(usemap);
+       if (PageSlab(usage_page) || PageCompound(usage_page)) {
+               kfree(usage);
                if (memmap)
                        __kfree_section_memmap(memmap, altmap);
                return;
@@ -804,18 +803,18 @@ void sparse_remove_one_section(struct mem_section *ms, unsigned long map_offset,
                               struct vmem_altmap *altmap)
 {
        struct page *memmap = NULL;
-       unsigned long *usemap = NULL;
+       struct mem_section_usage *usage = NULL;
 
        if (ms->section_mem_map) {
-               usemap = ms->pageblock_flags;
+               usage = ms->usage;
                memmap = sparse_decode_mem_map(ms->section_mem_map,
                                                __section_nr(ms));
                ms->section_mem_map = 0;
-               ms->pageblock_flags = NULL;
+               ms->usage = NULL;
        }
 
        clear_hwpoisoned_pages(memmap + map_offset,
                        PAGES_PER_SECTION - map_offset);
-       free_section_usemap(memmap, usemap, altmap);
+       free_section_usage(memmap, usage, altmap);
 }
 #endif /* CONFIG_MEMORY_HOTPLUG */

Reply via email to