Pass an order and offset to collapse_huge_page to support collapsing anon
memory to arbitrary orders within a PMD. order indicates what mTHP size we
are attempting to collapse to, and offset indicates where in the PMD to
start the collapse attempt.

For non-PMD collapse we must leave the anon VMA write locked until after
we collapse the mTHP -- in the PMD case all the pages are isolated, but in
the mTHP case this is not true, and we must keep the lock to prevent
access/changes to the page tables. Without it, rmap walkers could hit a
pmd_none entry while the PMD entry is temporarily unavailable, having been
removed for the duration of the collapse phase.

Signed-off-by: Nico Pache <[email protected]>
---
 mm/khugepaged.c | 103 +++++++++++++++++++++++++++---------------------
 1 file changed, 57 insertions(+), 46 deletions(-)

diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index 283bb63854a5..ff6f9f1883ed 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -1198,42 +1198,36 @@ static enum scan_result alloc_charge_folio(struct folio 
**foliop, struct mm_stru
        return SCAN_SUCCEED;
 }
 
-static enum scan_result collapse_huge_page(struct mm_struct *mm, unsigned long 
address,
-               int referenced, int unmapped, struct collapse_control *cc)
+static enum scan_result collapse_huge_page(struct mm_struct *mm, unsigned long 
start_addr,
+               int referenced, int unmapped, struct collapse_control *cc,
+               unsigned int order)
 {
        LIST_HEAD(compound_pagelist);
        pmd_t *pmd, _pmd;
-       pte_t *pte;
+       pte_t *pte = NULL;
        pgtable_t pgtable;
        struct folio *folio;
        spinlock_t *pmd_ptl, *pte_ptl;
        enum scan_result result = SCAN_FAIL;
        struct vm_area_struct *vma;
        struct mmu_notifier_range range;
+       bool anon_vma_locked = false;
+       const unsigned long pmd_addr = start_addr & HPAGE_PMD_MASK;
+       const unsigned long end_addr = start_addr + (PAGE_SIZE << order);
 
-       VM_BUG_ON(address & ~HPAGE_PMD_MASK);
-
-       /*
-        * Before allocating the hugepage, release the mmap_lock read lock.
-        * The allocation can take potentially a long time if it involves
-        * sync compaction, and we do not need to hold the mmap_lock during
-        * that. We will recheck the vma after taking it again in write mode.
-        */
-       mmap_read_unlock(mm);
-
-       result = alloc_charge_folio(&folio, mm, cc, HPAGE_PMD_ORDER);
+       result = alloc_charge_folio(&folio, mm, cc, order);
        if (result != SCAN_SUCCEED)
                goto out_nolock;
 
        mmap_read_lock(mm);
-       result = hugepage_vma_revalidate(mm, address, true, &vma, cc,
-                                        HPAGE_PMD_ORDER);
+       result = hugepage_vma_revalidate(mm, pmd_addr, /*expect_anon=*/ true,
+                                        &vma, cc, order);
        if (result != SCAN_SUCCEED) {
                mmap_read_unlock(mm);
                goto out_nolock;
        }
 
-       result = find_pmd_or_thp_or_none(mm, address, &pmd);
+       result = find_pmd_or_thp_or_none(mm, pmd_addr, &pmd);
        if (result != SCAN_SUCCEED) {
                mmap_read_unlock(mm);
                goto out_nolock;
@@ -1245,8 +1239,8 @@ static enum scan_result collapse_huge_page(struct 
mm_struct *mm, unsigned long a
                 * released when it fails. So we jump out_nolock directly in
                 * that case.  Continuing to collapse causes inconsistency.
                 */
-               result = __collapse_huge_page_swapin(mm, vma, address, pmd,
-                                                    referenced, 
HPAGE_PMD_ORDER);
+               result = __collapse_huge_page_swapin(mm, vma, start_addr, pmd,
+                                                    referenced, order);
                if (result != SCAN_SUCCEED)
                        goto out_nolock;
        }
@@ -1261,20 +1255,21 @@ static enum scan_result collapse_huge_page(struct 
mm_struct *mm, unsigned long a
         * mmap_lock.
         */
        mmap_write_lock(mm);
-       result = hugepage_vma_revalidate(mm, address, true, &vma, cc,
-                                        HPAGE_PMD_ORDER);
+       result = hugepage_vma_revalidate(mm, pmd_addr, /*expect_anon=*/ true,
+                                        &vma, cc, order);
        if (result != SCAN_SUCCEED)
                goto out_up_write;
        /* check if the pmd is still valid */
        vma_start_write(vma);
-       result = check_pmd_still_valid(mm, address, pmd);
+       result = check_pmd_still_valid(mm, pmd_addr, pmd);
        if (result != SCAN_SUCCEED)
                goto out_up_write;
 
        anon_vma_lock_write(vma->anon_vma);
+       anon_vma_locked = true;
 
-       mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm, address,
-                               address + HPAGE_PMD_SIZE);
+       mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm, start_addr,
+                               end_addr);
        mmu_notifier_invalidate_range_start(&range);
 
        pmd_ptl = pmd_lock(mm, pmd); /* probably unnecessary */
@@ -1286,26 +1281,23 @@ static enum scan_result collapse_huge_page(struct 
mm_struct *mm, unsigned long a
         * Parallel GUP-fast is fine since GUP-fast will back off when
         * it detects PMD is changed.
         */
-       _pmd = pmdp_collapse_flush(vma, address, pmd);
+       _pmd = pmdp_collapse_flush(vma, pmd_addr, pmd);
        spin_unlock(pmd_ptl);
        mmu_notifier_invalidate_range_end(&range);
        tlb_remove_table_sync_one();
 
-       pte = pte_offset_map_lock(mm, &_pmd, address, &pte_ptl);
+       pte = pte_offset_map_lock(mm, &_pmd, start_addr, &pte_ptl);
        if (pte) {
-               result = __collapse_huge_page_isolate(vma, address, pte, cc,
-                                                     HPAGE_PMD_ORDER,
-                                                     &compound_pagelist);
+               result = __collapse_huge_page_isolate(vma, start_addr, pte, cc,
+                                                     order, 
&compound_pagelist);
                spin_unlock(pte_ptl);
        } else {
                result = SCAN_NO_PTE_TABLE;
        }
 
        if (unlikely(result != SCAN_SUCCEED)) {
-               if (pte)
-                       pte_unmap(pte);
                spin_lock(pmd_ptl);
-               BUG_ON(!pmd_none(*pmd));
+               WARN_ON_ONCE(!pmd_none(*pmd));
                /*
                 * We can only use set_pmd_at when establishing
                 * hugepmds and never for establishing regular pmds that
@@ -1313,21 +1305,24 @@ static enum scan_result collapse_huge_page(struct 
mm_struct *mm, unsigned long a
                 */
                pmd_populate(mm, pmd, pmd_pgtable(_pmd));
                spin_unlock(pmd_ptl);
-               anon_vma_unlock_write(vma->anon_vma);
                goto out_up_write;
        }
 
        /*
-        * All pages are isolated and locked so anon_vma rmap
-        * can't run anymore.
+        * For PMD collapse all pages are isolated and locked so anon_vma
+        * rmap can't run anymore. For mTHP collapse the PMD entry has been
+        * removed and not all pages are isolated and locked, so we must hold
+        * the lock to prevent neighboring folios from attempting to access
+        * this PMD until it's reinstalled.
         */
-       anon_vma_unlock_write(vma->anon_vma);
+       if (is_pmd_order(order)) {
+               anon_vma_unlock_write(vma->anon_vma);
+               anon_vma_locked = false;
+       }
 
        result = __collapse_huge_page_copy(pte, folio, pmd, _pmd,
-                                          vma, address, pte_ptl,
-                                          HPAGE_PMD_ORDER,
-                                          &compound_pagelist);
-       pte_unmap(pte);
+                                          vma, start_addr, pte_ptl,
+                                          order, &compound_pagelist);
        if (unlikely(result != SCAN_SUCCEED))
                goto out_up_write;
 
@@ -1337,18 +1332,27 @@ static enum scan_result collapse_huge_page(struct 
mm_struct *mm, unsigned long a
         * write.
         */
        __folio_mark_uptodate(folio);
-       pgtable = pmd_pgtable(_pmd);
-
        spin_lock(pmd_ptl);
-       BUG_ON(!pmd_none(*pmd));
-       pgtable_trans_huge_deposit(mm, pmd, pgtable);
-       map_anon_folio_pmd_nopf(folio, pmd, vma, address);
+       WARN_ON_ONCE(!pmd_none(*pmd));
+       if (is_pmd_order(order)) { /* PMD collapse */
+               pgtable = pmd_pgtable(_pmd);
+               pgtable_trans_huge_deposit(mm, pmd, pgtable);
+               map_anon_folio_pmd_nopf(folio, pmd, vma, pmd_addr);
+       } else { /* mTHP collapse */
+               map_anon_folio_pte_nopf(folio, pte, vma, start_addr, 
/*uffd_wp=*/ false);
+               smp_wmb(); /* make PTEs visible before PMD. See pmd_install() */
+               pmd_populate(mm, pmd, pmd_pgtable(_pmd));
+       }
        spin_unlock(pmd_ptl);
 
        folio = NULL;
 
        result = SCAN_SUCCEED;
 out_up_write:
+       if (anon_vma_locked)
+               anon_vma_unlock_write(vma->anon_vma);
+       if (pte)
+               pte_unmap(pte);
        mmap_write_unlock(mm);
 out_nolock:
        if (folio)
@@ -1525,8 +1529,15 @@ static enum scan_result collapse_scan_pmd(struct 
mm_struct *mm,
 out_unmap:
        pte_unmap_unlock(pte, ptl);
        if (result == SCAN_SUCCEED) {
+               /*
+                * Before allocating the hugepage, release the mmap_lock read lock.
+                * The allocation can take potentially a long time if it involves
+                * sync compaction, and we do not need to hold the mmap_lock during
+                * that. We will recheck the vma after taking it again in write mode.
+                */
+               mmap_read_unlock(mm);
                result = collapse_huge_page(mm, start_addr, referenced,
-                                           unmapped, cc);
+                                           unmapped, cc, HPAGE_PMD_ORDER);
                /* collapse_huge_page will return with the mmap_lock released */
                *lock_dropped = true;
        }
-- 
2.53.0


Reply via email to