Now that hugetlb reservation runs after zone initialization, bootmem
gigantic page allocation can detect pages that span multiple zones.

Keep those cross-zone pages separate during allocation and free them
after allocation completes, so later hugetlb initialization only sees
zone-valid gigantic pages.

This chooses to free cross-zone gigantic pages directly instead of
retrying allocation. In practice, such cross-zone cases are expected to
be very rare, so adding retry logic does not seem justified at this
point. Keeping the handling simple also preserves the previous behavior.
If similar real-world reports show up later, retry support can be
reconsidered then.

Signed-off-by: Muchun Song <[email protected]>
---
 mm/hugetlb.c | 75 ++++++++++++++++++++++++++++++++++++++++++++--------
 1 file changed, 64 insertions(+), 11 deletions(-)

diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index e9ba0be2eb17..d5d324f69d7a 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -3077,12 +3077,15 @@ void *__init __alloc_bootmem_huge_page(struct hstate 
*h, int nid)
 
 static bool __init alloc_bootmem_huge_page(struct hstate *h, int nid)
 {
+       unsigned long pfn;
+       unsigned int nid_request = nid;
        struct huge_bootmem_page *m = arch_alloc_bootmem_huge_page(h, nid);
 
        if (!m)
                return false;
 
-       nid = early_pfn_to_nid(PHYS_PFN(__pa(m)));
+       pfn = PHYS_PFN(__pa(m));
+       nid = early_pfn_to_nid(pfn);
        /*
         * Use the beginning of the huge page to store the huge_bootmem_page
         * struct (until gather_bootmem puts them into the mem_map).
@@ -3090,22 +3093,38 @@ static bool __init alloc_bootmem_huge_page(struct 
hstate *h, int nid)
         * Put them into a private list first because mem_map is not up yet.
         */
        INIT_LIST_HEAD(&m->list);
-       list_add(&m->list, &huge_boot_pages[nid]);
        m->hstate = h;
        if (!hugetlb_early_cma(h)) {
                m->cma = NULL;
                m->flags = 0;
        }
 
-       /*
-        * Only initialize the head struct page in memmap_init_reserved_pages,
-        * rest of the struct pages will be initialized by the HugeTLB
-        * subsystem itself.
-        * The head struct page is used to get folio information by the HugeTLB
-        * subsystem like zone id and node id.
-        */
-       memblock_reserved_mark_noinit(__pa((void *)m + PAGE_SIZE),
-               huge_page_size(h) - PAGE_SIZE);
+       /* CMA pages: zone-crossing is validated in hugetlb_cma_reserve(). */
+       if (!hugetlb_early_cma(h) &&
+           pfn_range_intersects_zones(nid, pfn, pages_per_huge_page(h))) {
+               /*
+                * If the allocated page is on a different node than requested
+                * (e.g. on PowerPC LPARs), put it on the requested node's list.
+                * Otherwise, the cross-zone page will be stranded and never
+                * freed, as the cleanup code only operates on the requested 
node.
+                */
+               if (WARN_ON_ONCE(nid_request != NUMA_NO_NODE && nid != 
nid_request))
+                       list_add(&m->list, &huge_boot_pages[nid_request]);
+               else
+                       list_add(&m->list, &huge_boot_pages[nid]);
+       } else {
+               list_add_tail(&m->list, &huge_boot_pages[nid]);
+               m->flags |= HUGE_BOOTMEM_ZONES_VALID;
+               /*
+                * Only initialize the head struct page in 
memmap_init_reserved_pages,
+                * rest of the struct pages will be initialized by the HugeTLB
+                * subsystem itself.
+                * The head struct page is used to get folio information by the 
HugeTLB
+                * subsystem like zone id and node id.
+                */
+               memblock_reserved_mark_noinit(__pa((void *)m + PAGE_SIZE),
+                               huge_page_size(h) - PAGE_SIZE);
+       }
 
        return true;
 }
@@ -3384,6 +3403,34 @@ void __init hugetlb_struct_page_init(void)
        padata_do_multithreaded(&job);
 }
 
+static unsigned long __init hugetlb_free_cross_zone_pages(struct hstate *h, 
int nid)
+{
+       unsigned long freed = 0;
+       struct huge_bootmem_page *m, *tmp;
+
+       if (!hstate_is_gigantic(h))
+               return freed;
+
+       list_for_each_entry_safe(m, tmp, &huge_boot_pages[nid], list) {
+               if (m->flags & HUGE_BOOTMEM_ZONES_VALID)
+                       break;
+
+               list_del(&m->list);
+               memblock_free(m, huge_page_size(h));
+               freed++;
+       }
+
+       if (freed) {
+               char buf[32];
+
+               string_get_size(huge_page_size(h), 1, STRING_UNITS_2, buf, 
sizeof(buf));
+               pr_warn("HugeTLB: freed %lu cross-zone hugepages of size %s on 
node %d.\n",
+                       freed, buf, nid);
+       }
+
+       return freed;
+}
+
 static void __init hugetlb_hstate_alloc_pages_onenode(struct hstate *h, int 
nid)
 {
        unsigned long i;
@@ -3414,6 +3461,8 @@ static void __init 
hugetlb_hstate_alloc_pages_onenode(struct hstate *h, int nid)
                cond_resched();
        }
 
+       i -= hugetlb_free_cross_zone_pages(h, nid);
+
        if (!list_empty(&folio_list))
                prep_and_add_allocated_folios(h, &folio_list);
 
@@ -3487,6 +3536,7 @@ static void __init hugetlb_pages_alloc_boot_node(unsigned 
long start, unsigned l
 
 static unsigned long __init hugetlb_gigantic_pages_alloc_boot(struct hstate *h)
 {
+       int nid;
        unsigned long i;
 
        for (i = 0; i < h->max_huge_pages; ++i) {
@@ -3495,6 +3545,9 @@ static unsigned long __init 
hugetlb_gigantic_pages_alloc_boot(struct hstate *h)
                cond_resched();
        }
 
+       for_each_node(nid)
+               i -= hugetlb_free_cross_zone_pages(h, nid);
+
        return i;
 }
 
-- 
2.54.0


Reply via email to