On 4/20/26 8:20 AM, Usama Arif wrote:
> On Sun, 19 Apr 2026 12:57:42 -0600 Nico Pache <[email protected]> wrote:
>
>> Pass an order and offset to collapse_huge_page to support collapsing anon
>> memory to arbitrary orders within a PMD. order indicates what mTHP size we
>> are attempting to collapse to, and offset indicates where in the PMD to
>> start the collapse attempt.
>>
>> For non-PMD collapse we must leave the anon VMA write locked until after
>> we collapse the mTHP: in the PMD case all the pages are isolated, but in
>> the mTHP case this is not true, and we must keep the lock to prevent
>> access/changes to the page tables. Without it, rmap walkers could hit a
>> pmd_none while the PMD entry is temporarily removed during the collapse
>> phase.
>>
>> Signed-off-by: Nico Pache <[email protected]>
>> ---
>> mm/khugepaged.c | 103 +++++++++++++++++++++++++++---------------------
>> 1 file changed, 57 insertions(+), 46 deletions(-)
>>
>> diff --git a/mm/khugepaged.c b/mm/khugepaged.c
>> index 283bb63854a5..ff6f9f1883ed 100644
>> --- a/mm/khugepaged.c
>> +++ b/mm/khugepaged.c
>> @@ -1198,42 +1198,36 @@ static enum scan_result alloc_charge_folio(struct folio **foliop, struct mm_stru
>> return SCAN_SUCCEED;
>> }
>>
>> -static enum scan_result collapse_huge_page(struct mm_struct *mm, unsigned long address,
>> - int referenced, int unmapped, struct collapse_control *cc)
>> +static enum scan_result collapse_huge_page(struct mm_struct *mm, unsigned long start_addr,
>> + int referenced, int unmapped, struct collapse_control *cc,
>> + unsigned int order)
>> {
>> LIST_HEAD(compound_pagelist);
>> pmd_t *pmd, _pmd;
>> - pte_t *pte;
>> + pte_t *pte = NULL;
>> pgtable_t pgtable;
>> struct folio *folio;
>> spinlock_t *pmd_ptl, *pte_ptl;
>> enum scan_result result = SCAN_FAIL;
>> struct vm_area_struct *vma;
>> struct mmu_notifier_range range;
>> + bool anon_vma_locked = false;
>> + const unsigned long pmd_addr = start_addr & HPAGE_PMD_MASK;
>> + const unsigned long end_addr = start_addr + (PAGE_SIZE << order);
>>
>> - VM_BUG_ON(address & ~HPAGE_PMD_MASK);
>> -
>> - /*
>> - * Before allocating the hugepage, release the mmap_lock read lock.
>> - * The allocation can take potentially a long time if it involves
>> - * sync compaction, and we do not need to hold the mmap_lock during
>> - * that. We will recheck the vma after taking it again in write mode.
>> - */
>> - mmap_read_unlock(mm);
>> -
>
> My understanding now is that the caller will need to drop the mmap read lock?
>
> This needs explicit documentation at the start of the function IMO. I think
> there are callers in later patches and it's very easy to miss this.
Yeah, as David suggested I will probably separate this out into its own
patch, and add some proper comments to describe the requirement. I didn't
even realize I had forgotten to update the patch description to mention this
change (apart from the cover letter). Thank you :)
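
Something roughly like this above the function, maybe (exact wording TBD):

	/*
	 * NOTE: the caller must drop the mmap read lock before calling;
	 * the folio allocation can take a long time under sync compaction
	 * and must not happen under mmap_lock. collapse_huge_page()
	 * retakes the lock itself and always returns with the mmap_lock
	 * released.
	 */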
>
>
>> - result = alloc_charge_folio(&folio, mm, cc, HPAGE_PMD_ORDER);
>> + result = alloc_charge_folio(&folio, mm, cc, order);
>> if (result != SCAN_SUCCEED)
>> goto out_nolock;
>>
>> mmap_read_lock(mm);
>> - result = hugepage_vma_revalidate(mm, address, true, &vma, cc,
>> - HPAGE_PMD_ORDER);
>> + result = hugepage_vma_revalidate(mm, pmd_addr, /*expect_anon=*/ true,
>> + &vma, cc, order);
>> if (result != SCAN_SUCCEED) {
>> mmap_read_unlock(mm);
>> goto out_nolock;
>> }
>>
>> - result = find_pmd_or_thp_or_none(mm, address, &pmd);
>> + result = find_pmd_or_thp_or_none(mm, pmd_addr, &pmd);
>> if (result != SCAN_SUCCEED) {
>> mmap_read_unlock(mm);
>> goto out_nolock;
>> @@ -1245,8 +1239,8 @@ static enum scan_result collapse_huge_page(struct mm_struct *mm, unsigned long a
>> * released when it fails. So we jump out_nolock directly in
>> * that case. Continuing to collapse causes inconsistency.
>> */
>> - result = __collapse_huge_page_swapin(mm, vma, address, pmd,
>> - referenced, HPAGE_PMD_ORDER);
>> + result = __collapse_huge_page_swapin(mm, vma, start_addr, pmd,
>> + referenced, order);
>> if (result != SCAN_SUCCEED)
>> goto out_nolock;
>> }
>> @@ -1261,20 +1255,21 @@ static enum scan_result collapse_huge_page(struct mm_struct *mm, unsigned long a
>> * mmap_lock.
>> */
>> mmap_write_lock(mm);
>> - result = hugepage_vma_revalidate(mm, address, true, &vma, cc,
>> - HPAGE_PMD_ORDER);
>> + result = hugepage_vma_revalidate(mm, pmd_addr, /*expect_anon=*/ true,
>> + &vma, cc, order);
>> if (result != SCAN_SUCCEED)
>> goto out_up_write;
>> /* check if the pmd is still valid */
>> vma_start_write(vma);
>> - result = check_pmd_still_valid(mm, address, pmd);
>> + result = check_pmd_still_valid(mm, pmd_addr, pmd);
>> if (result != SCAN_SUCCEED)
>> goto out_up_write;
>>
>> anon_vma_lock_write(vma->anon_vma);
>> + anon_vma_locked = true;
>>
>> - mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm, address,
>> - address + HPAGE_PMD_SIZE);
>> + mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm, start_addr,
>> + end_addr);
>> mmu_notifier_invalidate_range_start(&range);
>>
>> pmd_ptl = pmd_lock(mm, pmd); /* probably unnecessary */
>> @@ -1286,26 +1281,23 @@ static enum scan_result collapse_huge_page(struct mm_struct *mm, unsigned long a
>> * Parallel GUP-fast is fine since GUP-fast will back off when
>> * it detects PMD is changed.
>> */
>> - _pmd = pmdp_collapse_flush(vma, address, pmd);
>> + _pmd = pmdp_collapse_flush(vma, pmd_addr, pmd);
>
>
> For an mTHP collapse covering, say, 64KiB of a 2MiB PMD, the patch still
> flushes the entire PMD via pmdp_collapse_flush and tlb_remove_table_sync_one.
> That triggers cross-CPU TLB shootdowns for ~1.94MiB of unrelated mappings on
> every successful sub-PMD collapse. Probably acceptable as a first cut?
Hmm, interesting consideration. I believe this is necessary from a GUP
perspective, as the comment suggests. Dev, David, and I had discussed that
once this lands we want to play around with a lot of the locking and
previously defined "requirements" to see what is actually needed. So yeah, I
think for now we leave it as is. Hopefully we can make more improvements in
the future.
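
Just to make the idea concrete, a hypothetical narrower shootdown for the
mTHP case could look like the sketch below. To be clear, this is untested
and almost certainly not safe as-is, since clearing the whole PMD is exactly
what fences off GUP-fast today:

	/* hypothetical: only invalidate the sub-PMD range being collapsed */
	flush_tlb_range(vma, start_addr, end_addr);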
>
>
>> spin_unlock(pmd_ptl);
>> mmu_notifier_invalidate_range_end(&range);
>> tlb_remove_table_sync_one();
>>
>> - pte = pte_offset_map_lock(mm, &_pmd, address, &pte_ptl);
>> + pte = pte_offset_map_lock(mm, &_pmd, start_addr, &pte_ptl);
>> if (pte) {
>> - result = __collapse_huge_page_isolate(vma, address, pte, cc,
>> - HPAGE_PMD_ORDER,
>> - &compound_pagelist);
>> + result = __collapse_huge_page_isolate(vma, start_addr, pte, cc,
>> + order, &compound_pagelist);
>> spin_unlock(pte_ptl);
>> } else {
>> result = SCAN_NO_PTE_TABLE;
>> }
>>
>> if (unlikely(result != SCAN_SUCCEED)) {
>> - if (pte)
>> - pte_unmap(pte);
>> spin_lock(pmd_ptl);
>> - BUG_ON(!pmd_none(*pmd));
>> + WARN_ON_ONCE(!pmd_none(*pmd));
>
> Why was this turned into WARN_ON_ONCE? It would be good to add the reason
> to the commit message if it has been discussed earlier.
>
> The next line writes a PMD entry over an existing one — that leaks the
> previous page table or PMD-mapped folio and can corrupt VA mappings. BUG_ON
> failed loudly and safely; WARN_ON_ONCE continues into the corruption and falls
> silent after the first hit.
Discussed here:
https://lore.kernel.org/lkml/[email protected]/
But yes, I'm more on your side here: I'd rather crash the system than warn
if we are potentially corrupting things in memory.
@lorenzo and @david, can we come to a definitive consensus on this?
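
For reference, one possible middle ground (just a sketch, not what the patch
currently does) would be to warn once and bail out rather than repopulating
over a non-none PMD, trading a leaked page table for not overwriting a live
entry:

	spin_lock(pmd_ptl);
	if (WARN_ON_ONCE(!pmd_none(*pmd))) {
		/* leaks the detached page table, but avoids corrupting *pmd */
		spin_unlock(pmd_ptl);
		result = SCAN_FAIL;
		goto out_up_write;
	}
	pmd_populate(mm, pmd, pmd_pgtable(_pmd));
	spin_unlock(pmd_ptl);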
>
>
>> /*
>> * We can only use set_pmd_at when establishing
>> * hugepmds and never for establishing regular pmds that
>> @@ -1313,21 +1305,24 @@ static enum scan_result collapse_huge_page(struct mm_struct *mm, unsigned long a
>> */
>> pmd_populate(mm, pmd, pmd_pgtable(_pmd));
>> spin_unlock(pmd_ptl);
>> - anon_vma_unlock_write(vma->anon_vma);
>> goto out_up_write;
>> }
>>
>> /*
>> - * All pages are isolated and locked so anon_vma rmap
>> - * can't run anymore.
>> + * For PMD collapse all pages are isolated and locked so anon_vma
>> + * rmap can't run anymore. For mTHP collapse the PMD entry has been
>> + * removed and not all pages are isolated and locked, so we must hold
>> + * the lock to prevent neighboring folios from attempting to access
>> + * this PMD until it's reinstalled.
>> */
>> - anon_vma_unlock_write(vma->anon_vma);
>> + if (is_pmd_order(order)) {
>> + anon_vma_unlock_write(vma->anon_vma);
>> + anon_vma_locked = false;
>> + }
>>
>> result = __collapse_huge_page_copy(pte, folio, pmd, _pmd,
>> - vma, address, pte_ptl,
>> - HPAGE_PMD_ORDER,
>> - &compound_pagelist);
>> - pte_unmap(pte);
>> + vma, start_addr, pte_ptl,
>> + order, &compound_pagelist);
>> if (unlikely(result != SCAN_SUCCEED))
>> goto out_up_write;
>>
>> @@ -1337,18 +1332,27 @@ static enum scan_result collapse_huge_page(struct mm_struct *mm, unsigned long a
>> * write.
>> */
>> __folio_mark_uptodate(folio);
>> - pgtable = pmd_pgtable(_pmd);
>> -
>> spin_lock(pmd_ptl);
>> - BUG_ON(!pmd_none(*pmd));
>> - pgtable_trans_huge_deposit(mm, pmd, pgtable);
>> - map_anon_folio_pmd_nopf(folio, pmd, vma, address);
>> + WARN_ON_ONCE(!pmd_none(*pmd));
>> + if (is_pmd_order(order)) { /* PMD collapse */
>> + pgtable = pmd_pgtable(_pmd);
>> + pgtable_trans_huge_deposit(mm, pmd, pgtable);
>> + map_anon_folio_pmd_nopf(folio, pmd, vma, pmd_addr);
>> + } else { /* mTHP collapse */
>> + map_anon_folio_pte_nopf(folio, pte, vma, start_addr, /*uffd_wp=*/ false);
>
> map_anon_folio_pte_nopf calls set_ptes and modifies the page table while
> holding only pmd_ptl. It should be safe since we expect pmd_none, but I
> think you should put a comment about this?
Yeah, doesn't hurt. I'll try to clarify that.
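
Maybe something along these lines (exact wording TBD):

	/*
	 * The PTE page is not reachable by anyone else: the PMD entry was
	 * cleared by pmdp_collapse_flush() and is only reinstalled below,
	 * and we still hold pmd_ptl, so writing the PTEs with set_ptes()
	 * without taking pte_ptl is safe here.
	 */
	map_anon_folio_pte_nopf(folio, pte, vma, start_addr, /*uffd_wp=*/ false);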
>
>
>> + smp_wmb(); /* make PTEs visible before PMD. See pmd_install() */
>> + pmd_populate(mm, pmd, pmd_pgtable(_pmd));
>> + }
>> spin_unlock(pmd_ptl);
>>
>> folio = NULL;
>>
>> result = SCAN_SUCCEED;
>> out_up_write:
>> + if (anon_vma_locked)
>> + anon_vma_unlock_write(vma->anon_vma);
>> + if (pte)
>> + pte_unmap(pte);
>> mmap_write_unlock(mm);
>> out_nolock:
>> if (folio)
>> @@ -1525,8 +1529,15 @@ static enum scan_result collapse_scan_pmd(struct mm_struct *mm,
>> out_unmap:
>> pte_unmap_unlock(pte, ptl);
>> if (result == SCAN_SUCCEED) {
>> + /*
>> + * Before allocating the hugepage, release the mmap_lock read lock.
>> + * The allocation can take potentially a long time if it involves
>> + * sync compaction, and we do not need to hold the mmap_lock during
>> + * that. We will recheck the vma after taking it again in write mode.
>> + */
>> + mmap_read_unlock(mm);
>> result = collapse_huge_page(mm, start_addr, referenced,
>> - unmapped, cc);
>> + unmapped, cc, HPAGE_PMD_ORDER);
>> /* collapse_huge_page will return with the mmap_lock released */
>> *lock_dropped = true;
>> }
>> --
>> 2.53.0
>>
>>
>