On 4/20/26 8:20 AM, Usama Arif wrote:
> On Sun, 19 Apr 2026 12:57:42 -0600 Nico Pache <[email protected]> wrote:
>
>> Pass an order and offset to collapse_huge_page to support collapsing anon
>> memory to arbitrary orders within a PMD. order indicates what mTHP size we
>> are attempting to collapse to, and offset indicates where in the PMD to
>> start the collapse attempt.
>>
>> For non-PMD collapse we must leave the anon VMA write locked until after
>> we collapse the mTHP: in the PMD case all the pages are isolated, but in
>> the mTHP case this is not true, and we must keep the lock to prevent
>> access/changes to the page tables. Without it, rmap walkers could hit a
>> pmd_none while the PMD entry is temporarily removed during the collapse
>> phase.
>>
>> Signed-off-by: Nico Pache <[email protected]>
>> ---
>> mm/khugepaged.c | 103 +++++++++++++++++++++++++++---------------------
>> 1 file changed, 57 insertions(+), 46 deletions(-)
>>
>> diff --git a/mm/khugepaged.c b/mm/khugepaged.c
>> index 283bb63854a5..ff6f9f1883ed 100644
>> --- a/mm/khugepaged.c
>> +++ b/mm/khugepaged.c
>> @@ -1198,42 +1198,36 @@ static enum scan_result alloc_charge_folio(struct folio **foliop, struct mm_stru
>> return SCAN_SUCCEED;
>> }
>>
>> -static enum scan_result collapse_huge_page(struct mm_struct *mm, unsigned long address,
>> - int referenced, int unmapped, struct collapse_control *cc)
>> +static enum scan_result collapse_huge_page(struct mm_struct *mm, unsigned long start_addr,
>> + int referenced, int unmapped, struct collapse_control *cc,
>> + unsigned int order)
>> {
>> LIST_HEAD(compound_pagelist);
>> pmd_t *pmd, _pmd;
>> - pte_t *pte;
>> + pte_t *pte = NULL;
>> pgtable_t pgtable;
>> struct folio *folio;
>> spinlock_t *pmd_ptl, *pte_ptl;
>> enum scan_result result = SCAN_FAIL;
>> struct vm_area_struct *vma;
>> struct mmu_notifier_range range;
>> + bool anon_vma_locked = false;
>> + const unsigned long pmd_addr = start_addr & HPAGE_PMD_MASK;
>> + const unsigned long end_addr = start_addr + (PAGE_SIZE << order);
>>
>> - VM_BUG_ON(address & ~HPAGE_PMD_MASK);
>> -
>> - /*
>> - * Before allocating the hugepage, release the mmap_lock read lock.
>> - * The allocation can take potentially a long time if it involves
>> - * sync compaction, and we do not need to hold the mmap_lock during
>> - * that. We will recheck the vma after taking it again in write mode.
>> - */
>> - mmap_read_unlock(mm);
>> -
>
> My understanding now is that the caller will need to drop the mmap read lock?
>
> This needs explicit documentation at the start of the function IMO. I think
> there are callers in later patches and it's very easy to miss this.
Yeah, as David suggested I will probably separate this out into its own
patch, and add some proper comments to describe the requirement. I didn't
even realize I had forgotten to update the patch description to mention this
change (apart from the cover letter). Thank you :)
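
Something roughly like this above the function, maybe (exact wording TBD):

	/*
	 * NOTE: the caller must drop the mmap read lock before calling;
	 * the folio allocation can take a long time under sync compaction
	 * and must not happen under mmap_lock. collapse_huge_page()
	 * retakes the lock itself and always returns with the mmap_lock
	 * released.
	 */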
>
>
>> - result = alloc_charge_folio(&folio, mm, cc, HPAGE_PMD_ORDER);
>> + result = alloc_charge_folio(&folio, mm, cc, order);
>> if (result != SCAN_SUCCEED)
>> goto out_nolock;
>>
>> mmap_read_lock(mm);
>> - result = hugepage_vma_revalidate(mm, address, true, &vma, cc,
>> - HPAGE_PMD_ORDER);
>> + result = hugepage_vma_revalidate(mm, pmd_addr, /*expect_anon=*/ true,
>> + &vma, cc, order);
>> if (result != SCAN_SUCCEED) {
>> mmap_read_unlock(mm);
>> goto out_nolock;
>> }
>>
>> - result = find_pmd_or_thp_or_none(mm, address, &pmd);
>> + result = find_pmd_or_thp_or_none(mm, pmd_addr, &pmd);
>> if (result != SCAN_SUCCEED) {
>> mmap_read_unlock(mm);
>> goto out_nolock;
>> @@ -1245,8 +1239,8 @@ static enum scan_result collapse_huge_page(struct mm_struct *mm, unsigned long a
>> * released when it fails. So we jump out_nolock directly in
>> * that case. Continuing to collapse causes inconsistency.
>> */
>> - result = __collapse_huge_page_swapin(mm, vma, address, pmd,
>> - referenced, HPAGE_PMD_ORDER);
>> + result = __collapse_huge_page_swapin(mm, vma, start_addr, pmd,
>> + referenced, order);
>> if (result != SCAN_SUCCEED)
>> goto out_nolock;
>> }
>> @@ -1261,20 +1255,21 @@ static enum scan_result collapse_huge_page(struct mm_struct *mm, unsigned long a
>> * mmap_lock.
>> */
>> mmap_write_lock(mm);
>> - result = hugepage_vma_revalidate(mm, address, true, &vma, cc,
>> - HPAGE_PMD_ORDER);
>> + result = hugepage_vma_revalidate(mm, pmd_addr, /*expect_anon=*/ true,
>> + &vma, cc, order);
>> if (result != SCAN_SUCCEED)
>> goto out_up_write;
>> /* check if the pmd is still valid */
>> vma_start_write(vma);
>> - result = check_pmd_still_valid(mm, address, pmd);
>> + result = check_pmd_still_valid(mm, pmd_addr, pmd);
>> if (result != SCAN_SUCCEED)
>> goto out_up_write;
>>
>> anon_vma_lock_write(vma->anon_vma);
>> + anon_vma_locked = true;
>>
>> - mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm, address,
>> - address + HPAGE_PMD_SIZE);
>> + mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm, start_addr,
>> + end_addr);
>> mmu_notifier_invalidate_range_start(&range);
>>
>> pmd_ptl = pmd_lock(mm, pmd); /* probably unnecessary */
>> @@ -1286,26 +1281,23 @@ static enum scan_result collapse_huge_page(struct mm_struct *mm, unsigned long a
>> * Parallel GUP-fast is fine since GUP-fast will back off when
>> * it detects PMD is changed.
>> */
>> - _pmd = pmdp_collapse_flush(vma, address, pmd);
>> + _pmd = pmdp_collapse_flush(vma, pmd_addr, pmd);
>
>
> For an mTHP collapse covering, say, 64KiB of a 2MiB PMD, the patch still
> flushes the entire PMD via pmdp_collapse_flush and tlb_remove_table_sync_one.
> That triggers cross-CPU TLB shootdowns for ~1.94MiB of unrelated mappings on
> every successful sub-PMD collapse. Probably acceptable as a first cut?
Hmm, interesting consideration. I believe this is necessary from a GUP
perspective, as the comment suggests. Dev, David, and I had discussed that
once this lands we want to play around with a lot of the locking and
previously defined "requirements" to see what is actually needed. So yeah, I
think for now we leave it as is. Hopefully we can make more improvements in
the future.
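
Just to make the idea concrete, a hypothetical narrower shootdown for the
mTHP case could look like the sketch below. To be clear, this is untested
and almost certainly not safe as-is, since clearing the whole PMD is exactly
what fences off GUP-fast today:

	/* hypothetical: only invalidate the sub-PMD range being collapsed */
	flush_tlb_range(vma, start_addr, end_addr);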
>
>
>> spin_unlock(pmd_ptl);
>> mmu_notifier_invalidate_range_end(&range);
>> tlb_remove_table_sync_one();
>>
>> - pte = pte_offset_map_lock(mm, &_pmd, address, &pte_ptl);
>> + pte = pte_offset_map_lock(mm, &_pmd, start_addr, &pte_ptl);
>> if (pte) {
>> - result = __collapse_huge_page_isolate(vma, address, pte, cc,
>> - HPAGE_PMD_ORDER,
>> - &compound_pagelist);
>> + result = __collapse_huge_page_isolate(vma, start_addr, pte, cc,
>> + order, &compound_pagelist);
>> spin_unlock(pte_ptl);
>> } else {
>> result = SCAN_NO_PTE_TABLE;
>> }
>>
>> if (unlikely(result != SCAN_SUCCEED)) {
>> - if (pte)
>> - pte_unmap(pte);
>> spin_lock(pmd_ptl);
>> - BUG_ON(!pmd_none(*pmd));
>> + WARN_ON_ONCE(!pmd_none(*pmd));
>
> Why was this turned into WARN_ON_ONCE? It would be good to add the reason
> to the commit message if it has been discussed earlier.
>
> The next line writes a PMD entry over an existing one — that leaks the
> previous page table or PMD-mapped folio and can corrupt VA mappings. BUG_ON
> failed loudly and safely; WARN_ON_ONCE continues into the corruption and falls
> silent after the first hit.
Discussed here:
https://lore.kernel.org/lkml/[email protected]/
But yes, I'm more on your side here: I'd rather crash the system than warn
if we are potentially corrupting things in memory.
@lorenzo and @david, can we come to a definitive consensus on this?
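
For reference, one possible middle ground (just a sketch, not what the patch
currently does) would be to warn once and bail out rather than repopulating
over a non-none PMD, trading a leaked page table for not overwriting a live
entry:

	spin_lock(pmd_ptl);
	if (WARN_ON_ONCE(!pmd_none(*pmd))) {
		/* leaks the detached page table, but avoids corrupting *pmd */
		spin_unlock(pmd_ptl);
		result = SCAN_FAIL;
		goto out_up_write;
	}
	pmd_populate(mm, pmd, pmd_pgtable(_pmd));
	spin_unlock(pmd_ptl);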
>
>
>> /*
>> * We can only use set_pmd_at when establishing
>> * hugepmds and never for establishing regular pmds that
>> @@ -1313,21 +1305,24 @@ static enum scan_result collapse_huge_page(struct mm_struct *mm, unsigned long a
>> */
>> pmd_populate(mm, pmd, pmd_pgtable(_pmd));
>> spin_unlock(pmd_ptl);
>> - anon_vma_unlock_write(vma->anon_vma);
>> goto out_up_write;
>> }
>>
>> /*
>> - * All pages are isolated and locked so anon_vma rmap
>> - * can't run anymore.
>> + * For PMD collapse all pages are isolated and locked so anon_vma
>> + * rmap can't run anymore. For mTHP collapse the PMD entry has been
>> + * removed and not all pages are isolated and locked, so we must hold
>> + * the lock to prevent neighboring folios from attempting to access
>> + * this PMD until it's reinstalled.
>> */
>> - anon_vma_unlock_write(vma->anon_vma);
>> + if (is_pmd_order(order)) {
>> + anon_vma_unlock_write(vma->anon_vma);
>> + anon_vma_locked = false;
>> + }
>>
>> result = __collapse_huge_page_copy(pte, folio, pmd, _pmd,
>> - vma, address, pte_ptl,
>> - HPAGE_PMD_ORDER,
>> - &compound_pagelist);
>> - pte_unmap(pte);
>> + vma, start_addr, pte_ptl,
>> + order, &compound_pagelist);
>> if (unlikely(result != SCAN_SUCCEED))
>> goto out_up_write;
>>
>> @@ -1337,18 +1332,27 @@ static enum scan_result collapse_huge_page(struct mm_struct *mm, unsigned long a
>> * write.
>> */
>> __folio_mark_uptodate(folio);
>> - pgtable = pmd_pgtable(_pmd);
>> -
>> spin_lock(pmd_ptl);
>> - BUG_ON(!pmd_none(*pmd));
>> - pgtable_trans_huge_deposit(mm, pmd, pgtable);
>> - map_anon_folio_pmd_nopf(folio, pmd, vma, address);
>> + WARN_ON_ONCE(!pmd_none(*pmd));
>> + if (is_pmd_order(order)) { /* PMD collapse */
>> + pgtable = pmd_pgtable(_pmd);
>> + pgtable_trans_huge_deposit(mm, pmd, pgtable);
>> + map_anon_folio_pmd_nopf(folio, pmd, vma, pmd_addr);
>> + } else { /* mTHP collapse */
>> + map_anon_folio_pte_nopf(folio, pte, vma, start_addr, /*uffd_wp=*/ false);
>
> map_anon_folio_pte_nopf calls set_ptes and modifies the page table while
> holding only pmd_ptl. It should be safe since we expect pmd_none, but I
> think you should put a comment about this?
Yeah, doesn't hurt. I'll try to clarify that.
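
Maybe something along these lines (exact wording TBD):

	/*
	 * The PTE page is not reachable by anyone else: the PMD entry was
	 * cleared by pmdp_collapse_flush() and is only reinstalled below,
	 * and we still hold pmd_ptl, so writing the PTEs with set_ptes()
	 * without taking pte_ptl is safe here.
	 */
	map_anon_folio_pte_nopf(folio, pte, vma, start_addr, /*uffd_wp=*/ false);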
>
>
>> + smp_wmb(); /* make PTEs visible before PMD. See pmd_install() */
>> + pmd_populate(mm, pmd, pmd_pgtable(_pmd));
>> + }
>> spin_unlock(pmd_ptl);
>>
>> folio = NULL;
>>
>> result = SCAN_SUCCEED;
>> out_up_write:
>> + if (anon_vma_locked)
>> + anon_vma_unlock_write(vma->anon_vma);
>> + if (pte)
>> + pte_unmap(pte);
>> mmap_write_unlock(mm);
>> out_nolock:
>> if (folio)
>> @@ -1525,8 +1529,15 @@ static enum scan_result collapse_scan_pmd(struct mm_struct *mm,
>> out_unmap:
>> pte_unmap_unlock(pte, ptl);
>> if (result == SCAN_SUCCEED) {
>> + /*
>> + * Before allocating the hugepage, release the mmap_lock read lock.
>> + * The allocation can take potentially a long time if it involves
>> + * sync compaction, and we do not need to hold the mmap_lock during
>> + * that. We will recheck the vma after taking it again in write mode.
>> + */
>> + mmap_read_unlock(mm);
>> result = collapse_huge_page(mm, start_addr, referenced,
>> - unmapped, cc);
>> + unmapped, cc, HPAGE_PMD_ORDER);
>> /* collapse_huge_page will return with the mmap_lock released */
>> *lock_dropped = true;
>> }
>> --
>> 2.53.0
>>
>>
>