Recent refactoring introduced common vmemmap optimization logic via
CONFIG_SPARSEMEM_VMEMMAP_OPTIMIZATION. While HugeTLB already uses it,
DAX requires slightly different handling because it needs to preserve
2 vmemmap pages, instead of the 1 page HugeTLB preserves.

Update the DAX vmemmap optimization to allocate the second vmemmap page
manually, and update DAX memory setup to set the compound order
correctly and to allocate/reuse the shared vmemmap tail page.

Note that manually allocating the vmemmap page is a temporary solution;
it will later be unified with the logic that HugeTLB relies on.

Signed-off-by: Muchun Song <[email protected]>
---
 arch/powerpc/mm/book3s64/radix_pgtable.c |  5 +-
 mm/memory_hotplug.c                      |  5 +-
 mm/mm_init.c                             |  8 ++-
 mm/sparse-vmemmap.c                      | 82 ++++++++++++++----------
 4 files changed, 58 insertions(+), 42 deletions(-)

diff --git a/arch/powerpc/mm/book3s64/radix_pgtable.c 
b/arch/powerpc/mm/book3s64/radix_pgtable.c
index dfa2f7dc7e15..ad44883b1030 100644
--- a/arch/powerpc/mm/book3s64/radix_pgtable.c
+++ b/arch/powerpc/mm/book3s64/radix_pgtable.c
@@ -1124,9 +1124,10 @@ int __meminit radix__vmemmap_populate(unsigned long 
start, unsigned long end, in
        pud_t *pud;
        pmd_t *pmd;
        pte_t *pte;
+       unsigned long pfn = page_to_pfn((struct page *)start);
 
-       if (vmemmap_can_optimize(altmap, pgmap))
-               return vmemmap_populate_compound_pages(page_to_pfn((struct page 
*)start), start, end, node, pgmap);
+       if (vmemmap_can_optimize(altmap, pgmap) && 
section_vmemmap_optimizable(__pfn_to_section(pfn)))
+               return vmemmap_populate_compound_pages(pfn, start, end, node, 
pgmap);
        /*
         * If altmap is present, Make sure we align the start vmemmap addr
         * to PAGE_SIZE so that we calculate the correct start_pfn in
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 05f5df12d843..28306196c0fe 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -551,8 +551,9 @@ void remove_pfn_range_from_zone(struct zone *zone,
                /* Select all remaining pages up to the next section boundary */
                cur_nr_pages =
                        min(end_pfn - pfn, SECTION_ALIGN_UP(pfn + 1) - pfn);
-               page_init_poison(pfn_to_page(pfn),
-                                sizeof(struct page) * cur_nr_pages);
+               if (!section_vmemmap_optimizable(__pfn_to_section(pfn)))
+                       page_init_poison(pfn_to_page(pfn),
+                                        sizeof(struct page) * cur_nr_pages);
        }
 
        /*
diff --git a/mm/mm_init.c b/mm/mm_init.c
index e47d08b63154..636a0f9644f6 100644
--- a/mm/mm_init.c
+++ b/mm/mm_init.c
@@ -1069,9 +1069,10 @@ static void __ref __init_zone_device_page(struct page 
*page, unsigned long pfn,
  * of an altmap. See vmemmap_populate_compound_pages().
  */
 static inline unsigned long compound_nr_pages(struct vmem_altmap *altmap,
-                                             struct dev_pagemap *pgmap)
+                                             struct dev_pagemap *pgmap,
+                                             const struct mem_section *ms)
 {
-       if (!vmemmap_can_optimize(altmap, pgmap))
+       if (!section_vmemmap_optimizable(ms))
                return pgmap_vmemmap_nr(pgmap);
 
        return VMEMMAP_RESERVE_NR * (PAGE_SIZE / sizeof(struct page));
@@ -1140,7 +1141,8 @@ void __ref memmap_init_zone_device(struct zone *zone,
                        continue;
 
                memmap_init_compound(page, pfn, zone_idx, nid, pgmap,
-                                    compound_nr_pages(altmap, pgmap));
+                                    compound_nr_pages(altmap, pgmap,
+                                                      __pfn_to_section(pfn)));
        }
 
        /*
diff --git a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c
index 309d935fb05e..6f959a999d5b 100644
--- a/mm/sparse-vmemmap.c
+++ b/mm/sparse-vmemmap.c
@@ -353,8 +353,12 @@ struct page *vmemmap_shared_tail_page(unsigned int order, 
struct zone *zone)
        if (!addr)
                return NULL;
 
-       for (int i = 0; i < PAGE_SIZE / sizeof(struct page); i++)
-               init_compound_tail((struct page *)addr + i, NULL, order, zone);
+       for (int i = 0; i < PAGE_SIZE / sizeof(struct page); i++) {
+               page = (struct page *)addr + i;
+               if (zone_is_zone_device(zone))
+                       __SetPageReserved(page);
+               init_compound_tail(page, NULL, order, zone);
+       }
 
        page = virt_to_page(addr);
        if (cmpxchg(&zone->vmemmap_tails[idx], NULL, page) != NULL) {
@@ -458,23 +462,6 @@ static bool __meminit reuse_compound_section(unsigned long 
start_pfn,
        return !IS_ALIGNED(offset, nr_pages) && nr_pages > PAGES_PER_SUBSECTION;
 }
 
-static pte_t * __meminit compound_section_tail_page(unsigned long addr)
-{
-       pte_t *pte;
-
-       addr -= PAGE_SIZE;
-
-       /*
-        * Assuming sections are populated sequentially, the previous section's
-        * page data can be reused.
-        */
-       pte = pte_offset_kernel(pmd_off_k(addr), addr);
-       if (!pte)
-               return NULL;
-
-       return pte;
-}
-
 static int __meminit vmemmap_populate_compound_pages(unsigned long start,
                                                     unsigned long end, int 
node,
                                                     struct dev_pagemap *pgmap)
@@ -483,42 +470,62 @@ static int __meminit 
vmemmap_populate_compound_pages(unsigned long start,
        pte_t *pte;
        int rc;
        unsigned long start_pfn = page_to_pfn((struct page *)start);
+       const struct mem_section *ms = __pfn_to_section(start_pfn);
+       struct page *tail = NULL;
 
-       if (reuse_compound_section(start_pfn, pgmap)) {
-               pte = compound_section_tail_page(start);
-               if (!pte)
-                       return -ENOMEM;
+       /* This may occur in sub-section scenarios. */
+       if (!section_vmemmap_optimizable(ms))
+               return vmemmap_populate_range(start, end, node, NULL, -1);
 
-               /*
-                * Reuse the page that was populated in the prior iteration
-                * with just tail struct pages.
-                */
+#ifdef CONFIG_ZONE_DEVICE
+       tail = vmemmap_shared_tail_page(section_order(ms),
+                                       
&NODE_DATA(node)->node_zones[ZONE_DEVICE]);
+#endif
+       if (!tail)
+               return -ENOMEM;
+
+       if (reuse_compound_section(start_pfn, pgmap))
                return vmemmap_populate_range(start, end, node, NULL,
-                                             pte_pfn(ptep_get(pte)));
-       }
+                                             page_to_pfn(tail));
 
        size = min(end - start, pgmap_vmemmap_nr(pgmap) * sizeof(struct page));
        for (addr = start; addr < end; addr += size) {
                unsigned long next, last = addr + size;
+               void *p;
 
                /* Populate the head page vmemmap page */
                pte = vmemmap_populate_address(addr, node, NULL, -1);
                if (!pte)
                        return -ENOMEM;
 
+               /*
+                * Allocate manually since vmemmap_populate_address() will 
assume DAX
+                * only needs 1 vmemmap page to be reserved, however DAX now 
needs 2
+                * vmemmap pages. This is a temporary solution and will be 
unified
+                * with HugeTLB in the future.
+                */
+               p = vmemmap_alloc_block_buf(PAGE_SIZE, node, NULL);
+               if (!p)
+                       return -ENOMEM;
+
                /* Populate the tail pages vmemmap page */
                next = addr + PAGE_SIZE;
-               pte = vmemmap_populate_address(next, node, NULL, -1);
+               pte = vmemmap_populate_address(next, node, NULL, 
PHYS_PFN(__pa(p)));
+               /*
+                * get_page() is called above. Since we are not actually
+                * reusing it, to avoid a memory leak, we call put_page() here.
+                */
+               put_page(virt_to_page(p));
                if (!pte)
                        return -ENOMEM;
 
                /*
-                * Reuse the previous page for the rest of tail pages
+                * Reuse the shared vmemmap page for the rest of tail pages
                 * See layout diagram in Documentation/mm/vmemmap_dedup.rst
                 */
                next += PAGE_SIZE;
                rc = vmemmap_populate_range(next, last, node, NULL,
-                                           pte_pfn(ptep_get(pte)));
+                                           page_to_pfn(tail));
                if (rc)
                        return -ENOMEM;
        }
@@ -744,8 +751,10 @@ static void section_deactivate(unsigned long pfn, unsigned 
long nr_pages,
                free_map_bootmem(memmap);
        }
 
-       if (empty)
+       if (empty) {
                ms->section_mem_map = (unsigned long)NULL;
+               section_set_order(ms, 0);
+       }
 }
 
 static struct page * __meminit section_activate(int nid, unsigned long pfn,
@@ -824,6 +833,9 @@ int __meminit sparse_add_section(int nid, unsigned long 
start_pfn,
        if (ret < 0)
                return ret;
 
+       ms = __nr_to_section(section_nr);
+       if (vmemmap_can_optimize(altmap, pgmap) && nr_pages == 
PAGES_PER_SECTION)
+               section_set_order(ms, pgmap->vmemmap_shift);
        memmap = section_activate(nid, start_pfn, nr_pages, altmap, pgmap);
        if (IS_ERR(memmap))
                return PTR_ERR(memmap);
@@ -832,9 +844,9 @@ int __meminit sparse_add_section(int nid, unsigned long 
start_pfn,
         * Poison uninitialized struct pages in order to catch invalid flags
         * combinations.
         */
-       page_init_poison(memmap, sizeof(struct page) * nr_pages);
+       if (!section_vmemmap_optimizable(ms))
+               page_init_poison(memmap, sizeof(struct page) * nr_pages);
 
-       ms = __nr_to_section(section_nr);
        __section_mark_present(ms, section_nr);
 
        /* Align memmap to section boundary in the subsection case */
-- 
2.20.1


Reply via email to