When we free a HugeTLB page back to the buddy allocator, we need to
allocate the vmemmap pages associated with it again. We can do that in
__free_hugepage() before the page is released to the buddy allocator.
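
As a rough illustration (assuming a 2 MB HugeTLB page and a 64-byte
struct page): such a page is described by 512 * 64 bytes = 32 KB of
vmemmap, i.e. 8 vmemmap pages, of which RESERVE_VMEMMAP_NR (2) stay
mapped. Before the HugeTLB page can go back to buddy, the other 6 pages
must be allocated, filled with the contents of the reuse page, and
mapped again, roughly:

	start = vmemmap_addr + RESERVE_VMEMMAP_SIZE;
	end   = vmemmap_addr + vmemmap_pages_size_per_hpage(h);
	vmemmap_remap_range(start, end, vmemmap_restore_pte_range,
			    &vmemmap_pages);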

Signed-off-by: Muchun Song <songmuc...@bytedance.com>
---
 mm/hugetlb.c         |  2 ++
 mm/hugetlb_vmemmap.c | 89 +++++++++++++++++++++++++++++++++++++++++++++++++---
 mm/hugetlb_vmemmap.h |  5 +++
 3 files changed, 91 insertions(+), 5 deletions(-)

diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 0ff9b90e524f..542e6cb81321 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -1362,6 +1362,8 @@ static void __free_hugepage(struct hstate *h, struct page *page)
 {
        int i;
 
+       alloc_huge_page_vmemmap(h, page);
+
        for (i = 0; i < pages_per_huge_page(h); i++) {
                page[i].flags &= ~(1 << PG_locked | 1 << PG_error |
                                1 << PG_referenced | 1 << PG_dirty |
diff --git a/mm/hugetlb_vmemmap.c b/mm/hugetlb_vmemmap.c
index d080488cde16..4587a0062808 100644
--- a/mm/hugetlb_vmemmap.c
+++ b/mm/hugetlb_vmemmap.c
@@ -169,6 +169,7 @@
 #define pr_fmt(fmt)    "HugeTLB vmemmap: " fmt
 
 #include <linux/bootmem_info.h>
+#include <linux/delay.h>
 #include "hugetlb_vmemmap.h"
 
 /*
@@ -181,6 +182,8 @@
 #define RESERVE_VMEMMAP_NR             2U
 #define RESERVE_VMEMMAP_SIZE           (RESERVE_VMEMMAP_NR << PAGE_SHIFT)
 #define VMEMMAP_TAIL_PAGE_REUSE                -1
+#define GFP_VMEMMAP_PAGE               \
+       (GFP_KERNEL | __GFP_RETRY_MAYFAIL | __GFP_HIGH | __GFP_NOWARN)
 
 #ifndef VMEMMAP_HPAGE_SHIFT
 #define VMEMMAP_HPAGE_SHIFT            HPAGE_SHIFT
@@ -197,6 +200,11 @@
        (__boundary - 1 < (end) - 1) ? __boundary : (end);               \
 })
 
+typedef void (*vmemmap_remap_pte_func_t)(struct page *reuse, pte_t *pte,
+                                        unsigned long start, unsigned long end,
+                                        void *priv);
+
+
 static inline unsigned int vmemmap_pages_per_hpage(struct hstate *h)
 {
        return free_vmemmap_pages_per_hpage(h) + RESERVE_VMEMMAP_NR;
@@ -236,9 +244,39 @@ static pmd_t *vmemmap_to_pmd(unsigned long addr)
        return pmd;
 }
 
+static void vmemmap_restore_pte_range(struct page *reuse, pte_t *pte,
+                                     unsigned long start, unsigned long end,
+                                     void *priv)
+{
+       pgprot_t pgprot = PAGE_KERNEL;
+       void *from = page_to_virt(reuse);
+       unsigned long addr;
+       struct list_head *pages = priv;
+
+       for (addr = start; addr < end; addr += PAGE_SIZE) {
+               void *to;
+               struct page *page;
+
+               VM_BUG_ON(pte_none(*pte) || pte_page(*pte) != reuse);
+
+               page = list_first_entry(pages, struct page, lru);
+               list_del(&page->lru);
+               to = page_to_virt(page);
+               copy_page(to, from);
+
+               /*
+                * Make sure that any data written through @to is made
+                * visible to the underlying physical page.
+                */
+               flush_kernel_vmap_range(to, PAGE_SIZE);
+
+               set_pte_at(&init_mm, addr, pte++, mk_pte(page, pgprot));
+       }
+}
+
 static void vmemmap_reuse_pte_range(struct page *reuse, pte_t *pte,
                                    unsigned long start, unsigned long end,
-                                   struct list_head *vmemmap_pages)
+                                   void *priv)
 {
        /*
         * Make the tail pages are mapped with read-only to catch
@@ -247,6 +285,7 @@ static void vmemmap_reuse_pte_range(struct page *reuse, pte_t *pte,
        pgprot_t pgprot = PAGE_KERNEL_RO;
        pte_t entry = mk_pte(reuse, pgprot);
        unsigned long addr;
+       struct list_head *pages = priv;
 
        for (addr = start; addr < end; addr += PAGE_SIZE, pte++) {
                struct page *page;
@@ -254,14 +293,14 @@ static void vmemmap_reuse_pte_range(struct page *reuse, pte_t *pte,
                VM_BUG_ON(pte_none(*pte));
 
                page = pte_page(*pte);
-               list_add(&page->lru, vmemmap_pages);
+               list_add(&page->lru, pages);
 
                set_pte_at(&init_mm, addr, pte, entry);
        }
 }
 
 static void vmemmap_remap_range(unsigned long start, unsigned long end,
-                               struct list_head *vmemmap_pages)
+                               vmemmap_remap_pte_func_t func, void *priv)
 {
        pmd_t *pmd;
        unsigned long next, addr = start;
@@ -281,12 +320,52 @@ static void vmemmap_remap_range(unsigned long start, unsigned long end,
                        reuse = pte_page(pte[VMEMMAP_TAIL_PAGE_REUSE]);
 
                next = vmemmap_hpage_addr_end(addr, end);
-               vmemmap_reuse_pte_range(reuse, pte, addr, next, vmemmap_pages);
+               func(reuse, pte, addr, next, priv);
        } while (pmd++, addr = next, addr != end);
 
        flush_tlb_kernel_range(start, end);
 }
 
+static inline void alloc_vmemmap_pages(struct hstate *h, struct list_head *list)
+{
+       unsigned int nr = free_vmemmap_pages_per_hpage(h);
+
+       while (nr--) {
+               struct page *page;
+
+retry:
+               page = alloc_page(GFP_VMEMMAP_PAGE);
+               if (unlikely(!page)) {
+                       msleep(100);
+                       /*
+                        * Retry until we succeed, because there is no
+                        * way to handle allocation failures here. The
+                        * HugeTLB page can only be freed after all of
+                        * its vmemmap pages have been allocated.
+                        */
+                       goto retry;
+               }
+               list_add_tail(&page->lru, list);
+       }
+}
+
+void alloc_huge_page_vmemmap(struct hstate *h, struct page *head)
+{
+       unsigned long start, end;
+       unsigned long vmemmap_addr = (unsigned long)head;
+       LIST_HEAD(vmemmap_pages);
+
+       if (!free_vmemmap_pages_per_hpage(h))
+               return;
+
+       alloc_vmemmap_pages(h, &vmemmap_pages);
+
+       start = vmemmap_addr + RESERVE_VMEMMAP_SIZE;
+       end = vmemmap_addr + vmemmap_pages_size_per_hpage(h);
+       vmemmap_remap_range(start, end, vmemmap_restore_pte_range,
+                           &vmemmap_pages);
+}
+
 /*
  * Free a vmemmap page. A vmemmap page can be allocated from the memblock
  * allocator or buddy allocator. If the PG_reserved flag is set, it means
@@ -322,7 +401,7 @@ void free_huge_page_vmemmap(struct hstate *h, struct page *head)
 
        start = vmemmap_addr + RESERVE_VMEMMAP_SIZE;
        end = vmemmap_addr + vmemmap_pages_size_per_hpage(h);
-       vmemmap_remap_range(start, end, &vmemmap_pages);
+       vmemmap_remap_range(start, end, vmemmap_reuse_pte_range,
+                           &vmemmap_pages);
 
        free_vmemmap_page_list(&vmemmap_pages);
 }
diff --git a/mm/hugetlb_vmemmap.h b/mm/hugetlb_vmemmap.h
index bf22cd003acb..8fd57c49e230 100644
--- a/mm/hugetlb_vmemmap.h
+++ b/mm/hugetlb_vmemmap.h
@@ -11,6 +11,7 @@
 #include <linux/hugetlb.h>
 
 #ifdef CONFIG_HUGETLB_PAGE_FREE_VMEMMAP
+void alloc_huge_page_vmemmap(struct hstate *h, struct page *head);
 void free_huge_page_vmemmap(struct hstate *h, struct page *head);
 
 /*
@@ -25,6 +26,10 @@ static inline unsigned int free_vmemmap_pages_per_hpage(struct hstate *h)
        return 0;
 }
 #else
+static inline void alloc_huge_page_vmemmap(struct hstate *h, struct page *head)
+{
+}
+
 static inline void free_huge_page_vmemmap(struct hstate *h, struct page *head)
 {
 }
-- 
2.11.0
