From: Zi Yan <z...@nvidia.com>

First promote 512 contiguous PMD-mapped THPs in place into a single
PMD-mapped PUD THP, then promote that PMD-mapped PUD THP to a
PUD-mapped PUD THP by collapsing its PMD entries into one PUD entry.

Signed-off-by: Zi Yan <z...@nvidia.com>
---
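A minimal sketch of how the two new helpers are intended to be combined
for one PUD-aligned, PUD-sized anonymous range. The madvise plumbing
that would drive this is not part of this patch, and promote_pud_range()
below is only illustrative:

static int promote_pud_range(struct vm_area_struct *vma, unsigned long haddr)
{
        int ret;

        /*
         * Step 1: merge the 512 PMD-mapped THPs in the range into one
         * PUD-sized compound page, still mapped by PMD entries.
         */
        ret = promote_huge_pud_page_address(vma, haddr);
        if (ret)
                return ret;

        /* Step 2: collapse the PMD entries into a single PUD entry. */
        return promote_huge_pud_address(vma, haddr);
}

Both helpers expect mmap_sem held for write and haddr aligned to
HPAGE_PUD_SIZE, and return 0 on success or a negative errno.
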
 arch/x86/include/asm/pgalloc.h |   2 +
 include/asm-generic/pgtable.h  |  10 +
 mm/huge_memory.c               | 497 ++++++++++++++++++++++++++++++++-
 mm/internal.h                  |   2 +
 mm/pgtable-generic.c           |  20 ++
 mm/rmap.c                      |  23 +-
 6 files changed, 540 insertions(+), 14 deletions(-)

diff --git a/arch/x86/include/asm/pgalloc.h b/arch/x86/include/asm/pgalloc.h
index ebcb022f6bb9..153a6749f92b 100644
--- a/arch/x86/include/asm/pgalloc.h
+++ b/arch/x86/include/asm/pgalloc.h
@@ -119,6 +119,8 @@ static inline void pud_populate_with_pgtable(struct mm_struct *mm, pud_t *pud,
        set_pud(pud, __pud(((pteval_t)pfn << PAGE_SHIFT) | _PAGE_TABLE));
 }
 
+#define pud_pgtable(pud) pud_page(pud)
+
 #if CONFIG_PGTABLE_LEVELS > 2
 static inline pmd_t *pmd_alloc_one(struct mm_struct *mm, unsigned long addr)
 {
diff --git a/include/asm-generic/pgtable.h b/include/asm-generic/pgtable.h
index 1ae33b6590b8..9984c75d64ce 100644
--- a/include/asm-generic/pgtable.h
+++ b/include/asm-generic/pgtable.h
@@ -302,6 +302,8 @@ static inline void pudp_set_wrprotect(struct mm_struct *mm,
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
 extern pmd_t pmdp_collapse_flush(struct vm_area_struct *vma,
                                 unsigned long address, pmd_t *pmdp);
+extern pud_t pudp_collapse_flush(struct vm_area_struct *vma,
+                                unsigned long address, pud_t *pudp);
 #else
 static inline pmd_t pmdp_collapse_flush(struct vm_area_struct *vma,
                                        unsigned long address,
@@ -310,7 +312,15 @@ static inline pmd_t pmdp_collapse_flush(struct vm_area_struct *vma,
        BUILD_BUG();
        return *pmdp;
 }
+static inline pud_t pudp_collapse_flush(struct vm_area_struct *vma,
+                                       unsigned long address,
+                                       pud_t *pudp)
+{
+       BUILD_BUG();
+       return *pudp;
+}
 #define pmdp_collapse_flush pmdp_collapse_flush
+#define pudp_collapse_flush pudp_collapse_flush
 #endif /* CONFIG_TRANSPARENT_HUGEPAGE */
 #endif
 
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index f856f7e39095..67fd1821f4dc 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -2958,7 +2958,7 @@ void split_huge_pud_address(struct vm_area_struct *vma, unsigned long address,
        __split_huge_pud(vma, pud, address, freeze, page);
 }
 
-static void freeze_pud_page(struct page *page)
+static void unmap_pud_page(struct page *page)
 {
        enum ttu_flags ttu_flags = TTU_IGNORE_MLOCK | TTU_IGNORE_ACCESS |
                TTU_RMAP_LOCKED | TTU_SPLIT_HUGE_PUD;
@@ -2973,7 +2973,7 @@ static void freeze_pud_page(struct page *page)
        VM_BUG_ON_PAGE(!unmap_success, page);
 }
 
-static void unfreeze_pud_page(struct page *page)
+static void remap_pud_page(struct page *page)
 {
        int i;
 
@@ -3109,7 +3109,7 @@ static void __split_huge_pud_page(struct page *page, struct list_head *list,
 
        spin_unlock_irqrestore(zone_lru_lock(page_zone(head)), flags);
 
-       unfreeze_pud_page(head);
+       remap_pud_page(head);
 
        for (i = 0; i < HPAGE_PUD_NR; i += HPAGE_PMD_NR) {
                struct page *subpage = head + i;
@@ -3210,7 +3210,7 @@ int split_huge_pud_page_to_list(struct page *page, struct list_head *list)
        }
 
        /*
-        * Racy check if we can split the page, before freeze_pud_page() will
+        * Racy check if we can split the page, before unmap_pud_page() will
         * split PUDs
         */
        if (!can_split_huge_pud_page(head, &extra_pins)) {
@@ -3219,7 +3219,7 @@ int split_huge_pud_page_to_list(struct page *page, struct list_head *list)
        }
 
        mlocked = PageMlocked(page);
-       freeze_pud_page(head);
+       unmap_pud_page(head);
        VM_BUG_ON_PAGE(compound_mapcount(head), head);
 
        /* Make sure the page is not on per-CPU pagevec as it takes pin */
@@ -3285,7 +3285,7 @@ int split_huge_pud_page_to_list(struct page *page, struct list_head *list)
                        xa_unlock(&mapping->i_pages);
                }
                spin_unlock_irqrestore(zone_lru_lock(page_zone(head)), flags);
-               unfreeze_pud_page(head);
+               remap_pud_page(head);
                ret = -EBUSY;
        }
 
@@ -4703,3 +4703,488 @@ int promote_huge_page_address(struct vm_area_struct *vma, unsigned long haddr)
 
        return promote_list_to_huge_page(head, &subpage_list);
 }
+
+static pud_t *mm_find_pud(struct mm_struct *mm, unsigned long address)
+{
+       pgd_t *pgd;
+       p4d_t *p4d;
+       pud_t *pud = NULL;
+       pud_t pude;
+
+       pgd = pgd_offset(mm, address);
+       if (!pgd_present(*pgd))
+               goto out;
+
+       p4d = p4d_offset(pgd, address);
+       if (!p4d_present(*p4d))
+               goto out;
+
+       pud = pud_offset(p4d, address);
+
+       pude = *pud;
+       barrier();
+       if (!pud_present(pude) || pud_trans_huge(pude))
+               pud = NULL;
+out:
+       return pud;
+}
+
+/*
+ * Promote an HPAGE_PUD_SIZE range into a PUD mapping.
+ * mmap_sem must be held for write.
+ */
+int promote_huge_pud_address(struct vm_area_struct *vma, unsigned long haddr)
+{
+       struct mm_struct *mm = vma->vm_mm;
+       pud_t *pud, _pud;
+       pmd_t *pmd, *_pmd;
+       spinlock_t *pud_ptl, *pmd_ptl;
+       struct mmu_notifier_range range;
+       pgtable_t pgtable;
+       struct page *page, *head;
+       unsigned long address = haddr;
+       int ret = -EBUSY;
+
+       VM_BUG_ON(haddr & ~HPAGE_PUD_MASK);
+
+       if (haddr < vma->vm_start || (haddr + HPAGE_PUD_SIZE) > vma->vm_end)
+               return -EINVAL;
+
+       pud = mm_find_pud(mm, haddr);
+       if (!pud)
+               goto out;
+
+       anon_vma_lock_write(vma->anon_vma);
+
+       pmd = pmd_offset(pud, haddr);
+       pmd_ptl = pmd_lockptr(mm, pmd);
+
+       head = page = vm_normal_page_pmd(vma, haddr, *pmd);
+       if (!page || !PageTransCompound(page) ||
+               compound_order(page) != HPAGE_PUD_ORDER)
+               goto out_unlock;
+       VM_BUG_ON(head != compound_head(page));
+       lock_page(head);
+
+       mmu_notifier_range_init(&range, mm, haddr, haddr + HPAGE_PUD_SIZE);
+       mmu_notifier_invalidate_range_start(&range);
+       pud_ptl = pud_lock(mm, pud);
+       /*
+        * After this gup_fast can't run anymore. This also removes
+        * any huge TLB entry from the CPU so we won't allow
+        * huge and small TLB entries for the same virtual address
+        * to avoid the risk of CPU bugs in that area.
+        */
+
+       _pud = pudp_collapse_flush(vma, haddr, pud);
+       spin_unlock(pud_ptl);
+       mmu_notifier_invalidate_range_end(&range);
+
+       /* remove ptes */
+       for (_pmd = pmd; _pmd < pmd + (1<<(HPAGE_PUD_ORDER-HPAGE_PMD_ORDER));
+                               _pmd++, page += HPAGE_PMD_NR, address += HPAGE_PMD_SIZE) {
+               pmd_t pmdval = *_pmd;
+
+               if (pmd_none(pmdval) || is_zero_pfn(pmd_pfn(pmdval))) {
+                       if (is_zero_pfn(pmd_pfn(pmdval))) {
+                               /*
+                                * ptl mostly unnecessary.
+                                */
+                               spin_lock(pmd_ptl);
+                               /*
+                                * paravirt calls inside pte_clear here are
+                                * superfluous.
+                                */
+                               pmd_clear(_pmd);
+                               spin_unlock(pmd_ptl);
+                       }
+               } else {
+                       /*
+                        * ptl mostly unnecessary, but preempt has to
+                        * be disabled to update the per-cpu stats
+                        * inside page_remove_rmap().
+                        */
+                       spin_lock(pmd_ptl);
+                       /*
+                        * paravirt calls inside pte_clear here are
+                        * superfluous.
+                        */
+                       pmd_clear(_pmd);
+                       atomic_dec(sub_compound_mapcount_ptr(page, 1));
+                       __dec_node_page_state(page, NR_ANON_THPS);
+                       spin_unlock(pmd_ptl);
+               }
+       }
+       page_ref_sub(head, (1<<(HPAGE_PUD_ORDER-HPAGE_PMD_ORDER)) - 1);
+
+       pgtable = pud_pgtable(_pud);
+
+       _pud = mk_huge_pud(head, vma->vm_page_prot);
+       _pud = maybe_pud_mkwrite(pud_mkdirty(_pud), vma);
+
+       /*
+        * spin_lock() below is not the equivalent of smp_wmb(), so
+        * this is needed to avoid the preceding stores becoming
+        * visible after the set_pud_at() write.
+        */
+       smp_wmb();
+
+       spin_lock(pud_ptl);
+       BUG_ON(!pud_none(*pud));
+       pgtable_trans_huge_pud_deposit(mm, pud, pgtable);
+       set_pud_at(mm, haddr, pud, _pud);
+       update_mmu_cache_pud(vma, haddr, pud);
+       __inc_node_page_state(head, NR_ANON_THPS_PUD);
+       atomic_inc(compound_mapcount_ptr(head));
+       spin_unlock(pud_ptl);
+       unlock_page(head);
+       ret = 0;
+
+out_unlock:
+       anon_vma_unlock_write(vma->anon_vma);
+out:
+       return ret;
+}
+
+/* Racy check whether the huge page can be promoted */
+static bool can_promote_huge_pud_page(struct page *page)
+{
+       int extra_pins;
+
+       /* Additional pins from radix tree */
+       if (PageAnon(page))
+               extra_pins = PageSwapCache(page) ? 1 : 0;
+       else
+               return false;
+       if (PageSwapCache(page))
+               return false;
+       if (PageWriteback(page))
+               return false;
+       return total_mapcount(page) == page_count(page) - extra_pins - 1;
+}
+
+
+static void release_pmd_page(struct page *page)
+{
+       mod_node_page_state(page_pgdat(page),
+               NR_ISOLATED_ANON + page_is_file_cache(page),
+               -hpage_nr_pages(page));
+       unlock_page(page);
+       putback_lru_page(page);
+}
+
+void release_pmd_pages(pmd_t *pmd, pmd_t *_pmd)
+{
+       while (--_pmd >= pmd) {
+               pmd_t pmdval = *_pmd;
+
+               if (!pmd_none(pmdval) && !is_zero_pfn(pmd_pfn(pmdval)))
+                       release_pmd_page(pmd_page(pmdval));
+       }
+}
+
+/*
+ * Isolate all PMD-mapped subpages of the HPAGE_PUD_SIZE range at @haddr onto
+ * @subpage_list, so that promote_list_to_huge_pud_page() can promote them
+ * in place.
+ */
+
+static int __promote_huge_pud_page_isolate(struct vm_area_struct *vma,
+                                       unsigned long haddr, pmd_t *pmd,
+                                       struct page **head, struct list_head *subpage_list)
+{
+       struct page *page = NULL;
+       pmd_t *_pmd;
+       bool writable = false;
+       unsigned long address = haddr;
+
+       *head = NULL;
+
+       lru_add_drain();
+       for (_pmd = pmd; _pmd < pmd+PTRS_PER_PMD;
+            _pmd++, address += HPAGE_PMD_SIZE) {
+               pmd_t pmdval = *_pmd;
+
+               if (pmd_none(pmdval) || (pmd_trans_huge(pmdval) &&
+                               is_zero_pfn(pmd_pfn(pmdval))))
+                       goto out;
+               if (!pmd_present(pmdval))
+                       goto out;
+               page = vm_normal_page_pmd(vma, address, pmdval);
+               if (unlikely(!page))
+                       goto out;
+
+               if (address == haddr) {
+                       *head = page;
+                       if (page_to_pfn(page) & ((1<<HPAGE_PUD_ORDER) - 1))
+                               goto out;
+               }
+
+               if ((*head + (address - haddr)/PAGE_SIZE) != page)
+                       goto out;
+
+               if (!PageCompound(page) || compound_order(page) != HPAGE_PMD_ORDER)
+                       goto out;
+
+               if (PageMlocked(page))
+                       goto out;
+
+               VM_BUG_ON_PAGE(!PageAnon(page), page);
+
+               /*
+                * We can do it before isolate_lru_page because the
+                * page can't be freed from under us. NOTE: PG_lock
+                * is needed to serialize against split_huge_page
+                * when invoked from the VM.
+                */
+               if (!trylock_page(page))
+                       goto out;
+
+               /*
+                * cannot use mapcount: can't collapse if there's a gup pin.
+                * The page must only be referenced by the scanned process
+                * and page swap cache.
+                */
+               if (page_count(page) != page_mapcount(page) + PageSwapCache(page)) {
+                       unlock_page(page);
+                       goto out;
+               }
+               if (pmd_write(pmdval)) {
+                       writable = true;
+               } else {
+                       if (PageSwapCache(page) &&
+                           !reuse_swap_page(page, NULL)) {
+                               unlock_page(page);
+                               goto out;
+                       }
+                       /*
+                        * Page is not in the swap cache. It can be collapsed
+                        * into a THP.
+                        */
+               }
+
+               /*
+                * Isolate the page to avoid collapsing an hugepage
+                * currently in use by the VM.
+                */
+               if (isolate_lru_page(page)) {
+                       unlock_page(page);
+                       goto out;
+               }
+
+               mod_node_page_state(page_pgdat(page),
+                               NR_ISOLATED_ANON + page_is_file_cache(page),
+                               hpage_nr_pages(page));
+               VM_BUG_ON_PAGE(!PageLocked(page), page);
+               VM_BUG_ON_PAGE(PageLRU(page), page);
+       }
+       if (likely(writable)) {
+               int i;
+
+               for (i = 0; i < HPAGE_PUD_NR; i += HPAGE_PMD_NR) {
+                       struct page *p = *head + i;
+
+                       list_add_tail(&p->lru, subpage_list);
+                       VM_BUG_ON_PAGE(!PageLocked(p), p);
+               }
+               return 1;
+       } else {
+               /*result = SCAN_PAGE_RO;*/
+       }
+
+out:
+       release_pmd_pages(pmd, _pmd);
+       return 0;
+}
+
+static int promote_huge_pud_page_isolate(struct vm_area_struct *vma,
+                                       unsigned long haddr,
+                                       struct page **head, struct list_head *subpage_list)
+{
+       struct mm_struct *mm = vma->vm_mm;
+       pud_t *pud;
+       pmd_t *pmd;
+       spinlock_t *pmd_ptl;
+       int ret = -EBUSY;
+
+       pud = mm_find_pud(mm, haddr);
+       if (!pud)
+               goto out;
+
+       anon_vma_lock_write(vma->anon_vma);
+
+       pmd = pmd_offset(pud, haddr);
+       if (!pmd)
+               goto out_unlock;
+       pmd_ptl = pmd_lockptr(mm, pmd);
+
+       spin_lock(pmd_ptl);
+       ret = __promote_huge_pud_page_isolate(vma, haddr, pmd, head, subpage_list);
+       spin_unlock(pmd_ptl);
+
+       if (unlikely(!ret)) {
+               ret = -EBUSY;
+               goto out_unlock;
+       }
+       ret = 0;
+       /*
+        * All pages are isolated and locked so anon_vma rmap
+        * can't run anymore.
+        */
+out_unlock:
+       anon_vma_unlock_write(vma->anon_vma);
+out:
+       return ret;
+}
+
+/*
+ * This function promotes normal pages into a huge page. @list points to all
+ * subpages of the huge page to promote, @head points to the head page.
+ *
+ * The caller must hold the only pin on the pages on @list, otherwise promotion
+ * fails with -EBUSY. All subpages must be locked.
+ *
+ * The resulting huge page inherits mapping, flags, and so on from the head
+ * page.
+ *
+ * GUP pins and PG_locked are transferred to @head.
+ *
+ * Returns 0 if the hugepage is promoted successfully.
+ * Returns -EBUSY if any subpage is pinned or if anon_vma disappeared from
+ * under us.
+ */
+int promote_list_to_huge_pud_page(struct page *head, struct list_head *list)
+{
+       struct anon_vma *anon_vma = NULL;
+       int ret = 0;
+       DECLARE_BITMAP(subpage_bitmap, HPAGE_PMD_NR);
+       struct page *subpage;
+       int i;
+
+       /* no file-backed page support yet */
+       if (PageAnon(head)) {
+               /*
+                * The caller does not necessarily hold an mmap_sem that would
+                * prevent the anon_vma disappearing so we first we take a
+                * reference to it and then lock the anon_vma for write. This
+                * is similar to page_lock_anon_vma_read except the write lock
+                * is taken to serialise against parallel split or collapse
+                * operations.
+                */
+               anon_vma = page_get_anon_vma(head);
+               if (!anon_vma) {
+                       ret = -EBUSY;
+                       goto out;
+               }
+               anon_vma_lock_write(anon_vma);
+       } else {
+               ret = -EBUSY;
+               goto out;
+       }
+
+       /* Racy check each subpage to see if any has extra pin */
+       list_for_each_entry(subpage, list, lru) {
+               if (can_promote_huge_pud_page(subpage))
+                       bitmap_set(subpage_bitmap, (subpage - head)/HPAGE_PMD_NR, 1);
+       }
+       /* Proceed only if none of subpages has extra pin.  */
+       if (!bitmap_full(subpage_bitmap, HPAGE_PMD_NR)) {
+               ret = -EBUSY;
+               goto out_unlock;
+       }
+
+       list_for_each_entry(subpage, list, lru) {
+               enum ttu_flags ttu_flags = TTU_IGNORE_MLOCK | TTU_IGNORE_ACCESS |
+                       TTU_RMAP_LOCKED;
+               bool unmap_success;
+               struct pglist_data *pgdata = NULL;
+
+               if (PageAnon(subpage))
+                       ttu_flags |= TTU_SPLIT_FREEZE;
+
+               unmap_success = try_to_unmap(subpage, ttu_flags);
+               VM_BUG_ON_PAGE(!unmap_success, subpage);
+
+               /* remove subpages from page_deferred_list */
+               pgdata = NODE_DATA(page_to_nid(subpage));
+               spin_lock(&pgdata->split_queue_lock);
+               if (!list_empty(page_deferred_list(subpage))) {
+                       pgdata->split_queue_len--;
+                       list_del_init(page_deferred_list(subpage));
+               }
+               spin_unlock(&pgdata->split_queue_lock);
+       }
+
+       /*first_compound_mapcount = compound_mapcount(head);*/
+       /*
+        * Take care of the migration wait list: make the compound page
+        * first, since it is impossible to move waiting processes from
+        * subpage queues to the head page queue.
+        */
+       set_compound_page_dtor(head, COMPOUND_PAGE_DTOR);
+       set_compound_order(head, HPAGE_PUD_ORDER);
+       __SetPageHead(head);
+       list_del(&head->lru);
+       for (i = 1; i < HPAGE_PUD_NR; i++) {
+               struct page *p = head + i;
+
+               if (i % HPAGE_PMD_NR == 0) {
+                       list_del(&p->lru);
+                       /* move subpage refcount to head page */
+                       page_ref_add(head, page_count(p) - 1);
+               }
+               p->index = 0;
+               p->mapping = TAIL_MAPPING;
+               p->mem_cgroup = NULL;
+               ClearPageActive(p);
+               set_page_count(p, 0);
+               set_compound_head(p, head);
+       }
+       atomic_set(compound_mapcount_ptr(head), -1);
+       for (i = 0; i < HPAGE_PUD_NR; i += HPAGE_PMD_NR)
+               atomic_set(sub_compound_mapcount_ptr(&head[i], 1), -1);
+       prep_transhuge_page(head);
+       /* Set first PMD-mapped page sub_compound_mapcount */
+
+       remap_pud_page(head);
+
+       for (i = HPAGE_PMD_NR; i < HPAGE_PUD_NR; i += HPAGE_PMD_NR) {
+               struct page *subpage = head + i;
+
+               __unlock_page(subpage);
+       }
+
+       INIT_LIST_HEAD(&head->lru);
+       unlock_page(head);
+       putback_lru_page(head);
+
+       mod_node_page_state(page_pgdat(head),
+                       NR_ISOLATED_ANON + page_is_file_cache(head), -HPAGE_PUD_NR);
+out_unlock:
+       if (anon_vma) {
+               anon_vma_unlock_write(anon_vma);
+               put_anon_vma(anon_vma);
+       }
+out:
+       while (!list_empty(list)) {
+               struct page *p = list_first_entry(list, struct page, lru);
+               list_del(&p->lru);
+               unlock_page(p);
+               putback_lru_page(p);
+       }
+       return ret;
+}
+
+/* Assumes mmap_sem is held for write; wrapper for madvise. */
+int promote_huge_pud_page_address(struct vm_area_struct *vma, unsigned long haddr)
+{
+       LIST_HEAD(subpage_list);
+       struct page *head;
+
+       if (haddr & (HPAGE_PUD_SIZE - 1))
+               return -EINVAL;
+       if (haddr < vma->vm_start || (haddr + HPAGE_PUD_SIZE) > vma->vm_end)
+               return -EINVAL;
+
+       if (promote_huge_pud_page_isolate(vma, haddr, &head, &subpage_list))
+               return -EBUSY;
+
+       return promote_list_to_huge_pud_page(head, &subpage_list);
+}
diff --git a/mm/internal.h b/mm/internal.h
index c5e5a0f1cc58..6d5ebcdcde4c 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -584,7 +584,9 @@ void prep_new_page(struct page *page, unsigned int order, gfp_t gfp_flags,
 void __unlock_page(struct page *page);
 
 int promote_huge_pmd_address(struct vm_area_struct *vma, unsigned long haddr);
+int promote_huge_pud_address(struct vm_area_struct *vma, unsigned long haddr);
 
 int promote_huge_page_address(struct vm_area_struct *vma, unsigned long haddr);
+int promote_huge_pud_page_address(struct vm_area_struct *vma, unsigned long haddr);
 
 #endif /* __MM_INTERNAL_H */
diff --git a/mm/pgtable-generic.c b/mm/pgtable-generic.c
index 95af1d67f209..99c4fb526c04 100644
--- a/mm/pgtable-generic.c
+++ b/mm/pgtable-generic.c
@@ -266,4 +266,24 @@ pmd_t pmdp_collapse_flush(struct vm_area_struct *vma, unsigned long address,
        return pmd;
 }
 #endif
+
+#ifndef pudp_collapse_flush
+pud_t pudp_collapse_flush(struct vm_area_struct *vma, unsigned long address,
+                         pud_t *pudp)
+{
+       /*
+        * A pud pointing to a pmd table and a huge pud have the same
+        * format, so pudp_huge_get_and_clear() can be used here.
+        */
+       pud_t pud;
+
+       VM_BUG_ON(address & ~HPAGE_PUD_MASK);
+       VM_BUG_ON(pud_trans_huge(*pudp));
+       pud = pudp_huge_get_and_clear(vma->vm_mm, address, pudp);
+
+       /* collapse entails shooting down pmd entries, not the pud */
+       flush_tlb_range(vma, address, address + HPAGE_PUD_SIZE);
+       return pud;
+}
+#endif
 #endif /* CONFIG_TRANSPARENT_HUGEPAGE */
diff --git a/mm/rmap.c b/mm/rmap.c
index 39f446a6775d..49ccbf0cfe4d 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -1112,12 +1112,13 @@ void do_page_add_anon_rmap(struct page *page,
 {
        bool compound = flags & RMAP_COMPOUND;
        bool first;
+       struct page *head = compound_head(page);
 
        if (compound) {
                atomic_t *mapcount;
                VM_BUG_ON_PAGE(!PageLocked(page), page);
-               VM_BUG_ON_PAGE(!PageTransHuge(page), page);
-               if (compound_order(page) == HPAGE_PUD_ORDER) {
+               VM_BUG_ON_PAGE(!PMDPageInPUD(page) && !PageTransHuge(page), page);
+               if (compound_order(head) == HPAGE_PUD_ORDER) {
                        if (order == HPAGE_PUD_ORDER) {
                                mapcount = compound_mapcount_ptr(page);
                        } else if (order == HPAGE_PMD_ORDER) {
@@ -1125,7 +1126,7 @@ void do_page_add_anon_rmap(struct page *page,
                                mapcount = sub_compound_mapcount_ptr(page, 1);
                        } else
                                VM_BUG_ON(1);
-               } else if (compound_order(page) == HPAGE_PMD_ORDER) {
+               } else if (compound_order(head) == HPAGE_PMD_ORDER) {
                        mapcount = compound_mapcount_ptr(page);
                } else
                        VM_BUG_ON(1);
@@ -1135,7 +1136,8 @@ void do_page_add_anon_rmap(struct page *page,
        }
 
        if (first) {
-               int nr = compound ? hpage_nr_pages(page) : 1;
+               /*int nr = compound ? hpage_nr_pages(page) : 1;*/
+               int nr = 1<<order;
                /*
                 * We use the irq-unsafe __{inc|mod}_zone_page_stat because
                 * these counters are not modified in interrupt context, and
@@ -1429,6 +1431,7 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
        bool ret = true;
        struct mmu_notifier_range range;
        enum ttu_flags flags = (enum ttu_flags)arg;
+       int order = 0;
 
        /* munlock has nothing to gain from examining un-locked vmas */
        if ((flags & TTU_MUNLOCK) && !(vma->vm_flags & VM_LOCKED))
@@ -1505,12 +1508,16 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
 
                /* Unexpected PMD-mapped THP? */
 
-               if (pvmw.pte)
+               if (pvmw.pte) {
                        subpage = page - page_to_pfn(page) + pte_pfn(*pvmw.pte);
-               else if (!pvmw.pte && pvmw.pmd)
+                       order = 0;
+               } else if (!pvmw.pte && pvmw.pmd) {
                        subpage = page - page_to_pfn(page) + pmd_pfn(*pvmw.pmd);
-               else if (!pvmw.pte && !pvmw.pmd && pvmw.pud)
+                       order = HPAGE_PMD_ORDER;
+               } else if (!pvmw.pte && !pvmw.pmd && pvmw.pud) {
                        subpage = page - page_to_pfn(page) + pud_pfn(*pvmw.pud);
+                       order = HPAGE_PUD_ORDER;
+               }
                VM_BUG_ON(!subpage);
                address = pvmw.address;
 
@@ -1794,7 +1801,7 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
                 *
                 * See Documentation/vm/mmu_notifier.rst
                 */
-               page_remove_rmap(subpage, PageHuge(page), 0);
+               page_remove_rmap(subpage, PageHuge(page) || order >= HPAGE_PMD_ORDER, order);
                put_page(page);
        }
 
-- 
2.20.1
