Currently, memmap_init_range() unconditionally initializes all struct pages
within a section. However, when HugeTLB Vmemmap Optimization (HVO) is enabled,
shared vmemmap tail pages are allocated during the vmemmap population phase
(e.g., via vmemmap_get_tail()). These shared tail pages are left intentionally
uninitialized at that time because the subsequent memmap_init() would simply
overwrite them.

If memmap_init_range() continues to initialize these shared tail pages, it
will overwrite the carefully constructed HVO mappings and metadata. This forces
subsystems like HugeTLB to implement workarounds (like re-initializing or
compensating for the overwritten data in their own init routines, as seen
in hugetlb_vmemmap_init()).

Therefore, the primary motivation of this patch is to prevent
memmap_init_range() from incorrectly overwriting the shared vmemmap tail
pages. By detecting whether a page is an optimizable compound vmemmap page
(using the newly introduced section order), we can safely skip its redundant
initialization.

As a significant side effect, skipping the initialization of these shared tail
pages also saves substantial CPU cycles during the early boot stage.

Signed-off-by: Muchun Song <[email protected]>
---
 mm/internal.h | 11 +++++++++++
 mm/mm_init.c  | 19 +++++++++++++++----
 2 files changed, 26 insertions(+), 4 deletions(-)

diff --git a/mm/internal.h b/mm/internal.h
index a8acabcd1d93..1060d7c07f5b 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -1011,6 +1011,17 @@ static inline void sparse_init_subsection_map(void)
 }
 #endif /* CONFIG_SPARSEMEM_VMEMMAP */
 
+static inline bool vmemmap_page_optimizable(const struct page *page)
+{
+       unsigned long pfn = page_to_pfn(page);
+       unsigned int order = section_order(__pfn_to_section(pfn));
+
+       if (!is_power_of_2(sizeof(struct page)))
+               return false;
+
+       return (pfn & ((1L << order) - 1)) >= OPTIMIZED_FOLIO_VMEMMAP_PAGE_STRUCTS;
+}
+
 #if defined CONFIG_COMPACTION || defined CONFIG_CMA
 
 /*
diff --git a/mm/mm_init.c b/mm/mm_init.c
index 977a837b7ef6..7f5b326e9298 100644
--- a/mm/mm_init.c
+++ b/mm/mm_init.c
@@ -676,12 +676,13 @@ static inline void fixup_hashdist(void) {}
 
 static __meminit void pageblock_migratetype_init_range(unsigned long pfn,
                                                       unsigned long nr_pages,
-                                                      int migratetype)
+                                                      int migratetype,
+                                                      bool isolate)
 {
        unsigned long end = pfn + nr_pages;
 
        for (pfn = pageblock_align(pfn); pfn < end; pfn += pageblock_nr_pages) {
-               init_pageblock_migratetype(pfn_to_page(pfn), migratetype, false);
+               init_pageblock_migratetype(pfn_to_page(pfn), migratetype, isolate);
                cond_resched();
        }
 }
@@ -912,6 +913,16 @@ void __meminit memmap_init_range(unsigned long size, int nid, unsigned long zone
                }
 
                page = pfn_to_page(pfn);
+               if (vmemmap_page_optimizable(page)) {
+                       struct mem_section *ms = __pfn_to_section(pfn);
+                       unsigned long start = pfn;
+
+                       pfn = min(ALIGN(start, 1L << section_order(ms)), end_pfn);
+                       pageblock_migratetype_init_range(start, pfn - start, migratetype,
+                                                        isolate_pageblock);
+                       continue;
+               }
+
                __init_single_page(page, pfn, zone, nid);
                if (context == MEMINIT_HOTPLUG) {
 #ifdef CONFIG_ZONE_DEVICE
@@ -1138,7 +1149,7 @@ void __ref memmap_init_zone_device(struct zone *zone,
         * Please note that MEMINIT_HOTPLUG path doesn't clear memmap
         * because this is done early in section_activate()
         */
-       pageblock_migratetype_init_range(start_pfn, nr_pages, MIGRATE_MOVABLE);
+       pageblock_migratetype_init_range(start_pfn, nr_pages, MIGRATE_MOVABLE, false);
 
        pr_debug("%s initialised %lu pages in %ums\n", __func__,
                nr_pages, jiffies_to_msecs(jiffies - start));
@@ -1963,7 +1974,7 @@ static void __init deferred_free_pages(unsigned long pfn,
        if (!nr_pages)
                return;
 
-       pageblock_migratetype_init_range(pfn, nr_pages, MIGRATE_MOVABLE);
+       pageblock_migratetype_init_range(pfn, nr_pages, MIGRATE_MOVABLE, false);
 
        page = pfn_to_page(pfn);
 
-- 
2.20.1


Reply via email to