Pass an order and a start address to collapse_huge_page() to support
collapsing anon memory to arbitrary orders within a PMD. order indicates
the mTHP size we are attempting to collapse to, and start_addr indicates
where in the PMD to start the collapse attempt.
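
For reference, the collapse range implied by the new parameters can be
sketched as follows (using the names this patch introduces; the
VM_WARN_ON is illustrative only and not part of the diff):

	/* Collapse [start_addr, start_addr + (PAGE_SIZE << order)); the
	 * range must lie entirely within the PMD containing start_addr.
	 */
	const unsigned long pmd_addr = start_addr & HPAGE_PMD_MASK;
	const unsigned long end_addr = start_addr + (PAGE_SIZE << order);
	VM_WARN_ON(end_addr > pmd_addr + HPAGE_PMD_SIZE);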

For non-PMD collapse we must leave the anon VMA write locked until after
the mTHP is collapsed: in the PMD case all the pages are isolated and
locked by that point, but in the mTHP case they are not, so we must keep
the lock to prevent access to, and changes of, the page tables.
Without it, rmap walkers could hit pmd_none() while the PMD entry is
temporarily cleared during the collapse phase.
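
The resulting locking pattern, roughly (a simplified sketch of the code
below, with error paths omitted):

	anon_vma_lock_write(vma->anon_vma);
	/* Clear the PMD entry; concurrent rmap walks now see pmd_none(). */
	_pmd = pmdp_collapse_flush(vma, pmd_addr, pmd);
	...
	if (is_pmd_order(order))
		/* PMD case: all pages are isolated and locked, rmap
		 * cannot run anymore, so drop the lock early. */
		anon_vma_unlock_write(vma->anon_vma);
	...
	/* mTHP case: reinstall the PMD entry, then drop the lock. */
	pmd_populate(mm, pmd, pmd_pgtable(_pmd));
	anon_vma_unlock_write(vma->anon_vma);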

Signed-off-by: Nico Pache <[email protected]>
---
 mm/khugepaged.c | 93 +++++++++++++++++++++++++++++--------------------
 1 file changed, 55 insertions(+), 38 deletions(-)

diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index 37a5f6791816..f49bef78cf51 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -1207,34 +1207,36 @@ static enum scan_result alloc_charge_folio(struct folio **foliop, struct mm_stru
  * the mmap_lock during that. We must recheck the vma after taking it again in
  * write mode.
  */
-static enum scan_result collapse_huge_page(struct mm_struct *mm, unsigned long address,
-               int referenced, int unmapped, struct collapse_control *cc)
+static enum scan_result collapse_huge_page(struct mm_struct *mm, unsigned long start_addr,
+               int referenced, int unmapped, struct collapse_control *cc,
+               unsigned int order)
 {
+       const unsigned long pmd_addr = start_addr & HPAGE_PMD_MASK;
+       const unsigned long end_addr = start_addr + (PAGE_SIZE << order);
        LIST_HEAD(compound_pagelist);
        pmd_t *pmd, _pmd;
-       pte_t *pte;
+       pte_t *pte = NULL;
        pgtable_t pgtable;
        struct folio *folio;
        spinlock_t *pmd_ptl, *pte_ptl;
        enum scan_result result = SCAN_FAIL;
        struct vm_area_struct *vma;
        struct mmu_notifier_range range;
+       bool anon_vma_locked = false;
 
-       VM_BUG_ON(address & ~HPAGE_PMD_MASK);
-
-       result = alloc_charge_folio(&folio, mm, cc, HPAGE_PMD_ORDER);
+       result = alloc_charge_folio(&folio, mm, cc, order);
        if (result != SCAN_SUCCEED)
                goto out_nolock;
 
        mmap_read_lock(mm);
-       result = hugepage_vma_revalidate(mm, address, true, &vma, cc,
-                                        HPAGE_PMD_ORDER);
+       result = hugepage_vma_revalidate(mm, pmd_addr, /*expect_anon=*/ true,
+                                        &vma, cc, order);
        if (result != SCAN_SUCCEED) {
                mmap_read_unlock(mm);
                goto out_nolock;
        }
 
-       result = find_pmd_or_thp_or_none(mm, address, &pmd);
+       result = find_pmd_or_thp_or_none(mm, pmd_addr, &pmd);
        if (result != SCAN_SUCCEED) {
                mmap_read_unlock(mm);
                goto out_nolock;
@@ -1246,8 +1248,8 @@ static enum scan_result collapse_huge_page(struct mm_struct *mm, unsigned long a
                 * released when it fails. So we jump out_nolock directly in
                 * that case.  Continuing to collapse causes inconsistency.
                 */
-               result = __collapse_huge_page_swapin(mm, vma, address, pmd,
-                                                    referenced, HPAGE_PMD_ORDER);
+               result = __collapse_huge_page_swapin(mm, vma, start_addr, pmd,
+                                                    referenced, order);
                if (result != SCAN_SUCCEED)
                        goto out_nolock;
        }
@@ -1262,20 +1264,21 @@ static enum scan_result collapse_huge_page(struct mm_struct *mm, unsigned long a
         * mmap_lock.
         */
        mmap_write_lock(mm);
-       result = hugepage_vma_revalidate(mm, address, true, &vma, cc,
-                                        HPAGE_PMD_ORDER);
+       result = hugepage_vma_revalidate(mm, pmd_addr, /*expect_anon=*/ true,
+                                        &vma, cc, order);
        if (result != SCAN_SUCCEED)
                goto out_up_write;
        /* check if the pmd is still valid */
        vma_start_write(vma);
-       result = check_pmd_still_valid(mm, address, pmd);
+       result = check_pmd_still_valid(mm, pmd_addr, pmd);
        if (result != SCAN_SUCCEED)
                goto out_up_write;
 
        anon_vma_lock_write(vma->anon_vma);
+       anon_vma_locked = true;
 
-       mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm, address,
-                               address + HPAGE_PMD_SIZE);
+       mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm, start_addr,
+                               end_addr);
        mmu_notifier_invalidate_range_start(&range);
 
        pmd_ptl = pmd_lock(mm, pmd); /* probably unnecessary */
@@ -1287,26 +1290,23 @@ static enum scan_result collapse_huge_page(struct mm_struct *mm, unsigned long a
         * Parallel GUP-fast is fine since GUP-fast will back off when
         * it detects PMD is changed.
         */
-       _pmd = pmdp_collapse_flush(vma, address, pmd);
+       _pmd = pmdp_collapse_flush(vma, pmd_addr, pmd);
        spin_unlock(pmd_ptl);
        mmu_notifier_invalidate_range_end(&range);
        tlb_remove_table_sync_one();
 
-       pte = pte_offset_map_lock(mm, &_pmd, address, &pte_ptl);
+       pte = pte_offset_map_lock(mm, &_pmd, start_addr, &pte_ptl);
        if (pte) {
-               result = __collapse_huge_page_isolate(vma, address, pte, cc,
-                                                     HPAGE_PMD_ORDER,
-                                                     &compound_pagelist);
+               result = __collapse_huge_page_isolate(vma, start_addr, pte, cc,
+                                                     order, &compound_pagelist);
                spin_unlock(pte_ptl);
        } else {
                result = SCAN_NO_PTE_TABLE;
        }
 
        if (unlikely(result != SCAN_SUCCEED)) {
-               if (pte)
-                       pte_unmap(pte);
                spin_lock(pmd_ptl);
-               BUG_ON(!pmd_none(*pmd));
+               WARN_ON_ONCE(!pmd_none(*pmd));
                /*
                 * We can only use set_pmd_at when establishing
                 * hugepmds and never for establishing regular pmds that
@@ -1314,21 +1314,24 @@ static enum scan_result collapse_huge_page(struct mm_struct *mm, unsigned long a
                 */
                pmd_populate(mm, pmd, pmd_pgtable(_pmd));
                spin_unlock(pmd_ptl);
-               anon_vma_unlock_write(vma->anon_vma);
                goto out_up_write;
        }
 
        /*
-        * All pages are isolated and locked so anon_vma rmap
-        * can't run anymore.
+        * For PMD collapse all pages are isolated and locked so anon_vma
+        * rmap can't run anymore. For mTHP collapse the PMD entry has been
+        * removed and not all pages are isolated and locked, so we must hold
+        * the lock to prevent neighboring folios from attempting to access
+        * this PMD until it's reinstalled.
         */
-       anon_vma_unlock_write(vma->anon_vma);
+       if (is_pmd_order(order)) {
+               anon_vma_unlock_write(vma->anon_vma);
+               anon_vma_locked = false;
+       }
 
        result = __collapse_huge_page_copy(pte, folio, pmd, _pmd,
-                                          vma, address, pte_ptl,
-                                          HPAGE_PMD_ORDER,
-                                          &compound_pagelist);
-       pte_unmap(pte);
+                                          vma, start_addr, pte_ptl,
+                                          order, &compound_pagelist);
        if (unlikely(result != SCAN_SUCCEED))
                goto out_up_write;
 
@@ -1338,18 +1341,32 @@ static enum scan_result collapse_huge_page(struct mm_struct *mm, unsigned long a
         * write.
         */
        __folio_mark_uptodate(folio);
-       pgtable = pmd_pgtable(_pmd);
-
        spin_lock(pmd_ptl);
-       BUG_ON(!pmd_none(*pmd));
-       pgtable_trans_huge_deposit(mm, pmd, pgtable);
-       map_anon_folio_pmd_nopf(folio, pmd, vma, address);
+       WARN_ON_ONCE(!pmd_none(*pmd));
+       if (is_pmd_order(order)) {
+               pgtable = pmd_pgtable(_pmd);
+               pgtable_trans_huge_deposit(mm, pmd, pgtable);
+               map_anon_folio_pmd_nopf(folio, pmd, vma, pmd_addr);
+       } else {
+               /*
+                * set_ptes is called in map_anon_folio_pte_nopf with the
+                * pmd_ptl lock still held; this is safe as the PMD is expected
+                * to be none. The pmd entry is then repopulated below.
+                */
+               map_anon_folio_pte_nopf(folio, pte, vma, start_addr, /*uffd_wp=*/ false);
+               smp_wmb(); /* make PTEs visible before PMD. See pmd_install() */
+               pmd_populate(mm, pmd, pmd_pgtable(_pmd));
+       }
        spin_unlock(pmd_ptl);
 
        folio = NULL;
 
        result = SCAN_SUCCEED;
 out_up_write:
+       if (anon_vma_locked)
+               anon_vma_unlock_write(vma->anon_vma);
+       if (pte)
+               pte_unmap(pte);
        mmap_write_unlock(mm);
 out_nolock:
        if (folio)
@@ -1529,7 +1546,7 @@ static enum scan_result collapse_scan_pmd(struct mm_struct *mm,
                /* collapse_huge_page expects the lock to be dropped before calling */
                mmap_read_unlock(mm);
                result = collapse_huge_page(mm, start_addr, referenced,
-                                           unmapped, cc);
+                                           unmapped, cc, HPAGE_PMD_ORDER);
                /* collapse_huge_page will return with the mmap_lock released */
                *lock_dropped = true;
        }
-- 
2.54.0

