Classify the source PMD via pmd_present() and vm_normal_folio_pmd(), matching the way the PTE path uses pte_present() and vm_normal_page(). This moves the present-PMD decision from VMA identity checks to the actual PMD/folio state.
Drop the defensive "if (!pmd_trans_huge(pmd)) goto out_unlock" branch: with mmap_write_lock held during fork, it should not occur. Extract the present-PMD side of copy_huge_pmd() into copy_present_huge_pmd(). The helper owns the child pgtable passed by the caller: it either deposits the pgtable when installing a copied PMD, or frees it on paths that do not install one. The child pgtable is now allocated once up front and freed on every skip path. This makes file/shmem and PFNMAP/special skip paths take the PMD locks and free the preallocated pgtable before returning. These are not expected to be hot paths, and the PFNMAP case is reused by the follow-up PMD PFNMAP copy support. Signed-off-by: Yin Tirui <[email protected]> --- mm/huge_memory.c | 175 +++++++++++++++++++++++++---------------------- 1 file changed, 95 insertions(+), 80 deletions(-) diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 9832ee910d5e..3964258ff91d 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1879,6 +1879,82 @@ bool touch_pmd(struct vm_area_struct *vma, unsigned long addr, return false; } +static int copy_present_huge_pmd( + struct mm_struct *dst_mm, struct mm_struct *src_mm, + pmd_t *dst_pmd, pmd_t *src_pmd, unsigned long addr, + struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma, + pmd_t pmd, pgtable_t pgtable, bool *need_split) +{ + struct folio *src_folio; + bool wrprotect = true; + + src_folio = vm_normal_folio_pmd(src_vma, addr, pmd); + if (!src_folio) { + /* + * When page table lock is held, the huge zero pmd should not be + * under splitting since we don't split the page itself, only pmd to + * a page table. + */ + if (is_huge_zero_pmd(pmd)) { + /* + * mm_get_huge_zero_folio() will never allocate a new + * folio here, since we already have a zero page to + * copy. It just takes a reference. + */ + mm_get_huge_zero_folio(dst_mm); + goto set_pmd; + } + + /* + * Making sure it's not a CoW VMA with writable + * mapping, otherwise it means either the anon page wrongly + * applied special bit, or we made the PRIVATE mapping be + * able to wrongly write to the backend MMIO. + */ + VM_WARN_ON_ONCE(is_cow_mapping(src_vma->vm_flags) && pmd_write(pmd)); + pte_free(dst_mm, pgtable); + pgtable = NULL; + wrprotect = false; + goto set_pmd; + } + + /* File THPs are copied lazily by refaulting. */ + if (!folio_test_anon(src_folio)) { + pte_free(dst_mm, pgtable); + return 0; + } + + folio_get(src_folio); + if (unlikely(folio_try_dup_anon_rmap_pmd(src_folio, + &src_folio->page, + dst_vma, src_vma))) { + /* Page maybe pinned: split and retry the fault on PTEs. */ + folio_put(src_folio); + pte_free(dst_mm, pgtable); + *need_split = true; + return -EAGAIN; + } + add_mm_counter(dst_mm, MM_ANONPAGES, HPAGE_PMD_NR); + +set_pmd: + if (pgtable) { + mm_inc_nr_ptes(dst_mm); + pgtable_trans_huge_deposit(dst_mm, dst_pmd, pgtable); + } + + if (wrprotect) { + pmdp_set_wrprotect(src_mm, addr, src_pmd); + if (!userfaultfd_wp(dst_vma)) + pmd = pmd_clear_uffd_wp(pmd); + pmd = pmd_wrprotect(pmd); + } + + pmd = pmd_mkold(pmd); + set_pmd_at(dst_mm, addr, dst_pmd, pmd); + + return 0; +} + static void copy_huge_non_present_pmd( struct mm_struct *dst_mm, struct mm_struct *src_mm, pmd_t *dst_pmd, pmd_t *src_pmd, unsigned long addr, @@ -1940,104 +2016,43 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm, struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma) { spinlock_t *dst_ptl, *src_ptl; - struct page *src_page; - struct folio *src_folio; - pmd_t pmd; pgtable_t pgtable = NULL; - int ret = -ENOMEM; - - pmd = pmdp_get_lockless(src_pmd); - if (unlikely(pmd_present(pmd) && pmd_special(pmd) && - !is_huge_zero_pmd(pmd))) { - dst_ptl = pmd_lock(dst_mm, dst_pmd); - src_ptl = pmd_lockptr(src_mm, src_pmd); - spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING); - /* - * No need to recheck the pmd, it can't change with write - * mmap lock held here. - * - * Meanwhile, making sure it's not a CoW VMA with writable - * mapping, otherwise it means either the anon page wrongly - * applied special bit, or we made the PRIVATE mapping be - * able to wrongly write to the backend MMIO. - */ - VM_WARN_ON_ONCE(is_cow_mapping(src_vma->vm_flags) && pmd_write(pmd)); - goto set_pmd; - } - - /* Skip if can be re-fill on fault */ - if (!vma_is_anonymous(dst_vma)) - return 0; + bool need_split = false; + int ret = 0; + pmd_t pmd; pgtable = pte_alloc_one(dst_mm); if (unlikely(!pgtable)) - goto out; + return -ENOMEM; dst_ptl = pmd_lock(dst_mm, dst_pmd); src_ptl = pmd_lockptr(src_mm, src_pmd); spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING); - ret = -EAGAIN; pmd = *src_pmd; - if (unlikely(thp_migration_supported() && - pmd_is_valid_softleaf(pmd))) { - copy_huge_non_present_pmd(dst_mm, src_mm, dst_pmd, src_pmd, addr, + if (likely(pmd_present(pmd))) { + ret = copy_present_huge_pmd(dst_mm, src_mm, dst_pmd, src_pmd, addr, + dst_vma, src_vma, pmd, pgtable, &need_split); + } else if (unlikely(thp_migration_supported() && pmd_is_valid_softleaf(pmd))) { + if (unlikely(!vma_is_anonymous(dst_vma))) + pte_free(dst_mm, pgtable); + else + copy_huge_non_present_pmd(dst_mm, src_mm, dst_pmd, src_pmd, addr, dst_vma, src_vma, pmd, pgtable); - ret = 0; - goto out_unlock; - } - - if (unlikely(!pmd_trans_huge(pmd))) { + } else { + VM_WARN_ONCE(1, "unexpected non-present PMD %llx\n", + (unsigned long long)pmd_val(pmd)); pte_free(dst_mm, pgtable); - goto out_unlock; - } - /* - * When page table lock is held, the huge zero pmd should not be - * under splitting since we don't split the page itself, only pmd to - * a page table. - */ - if (is_huge_zero_pmd(pmd)) { - /* - * mm_get_huge_zero_folio() will never allocate a new - * folio here, since we already have a zero page to - * copy. It just takes a reference. - */ - mm_get_huge_zero_folio(dst_mm); - goto out_zero_page; + ret = -EAGAIN; } - src_page = pmd_page(pmd); - VM_BUG_ON_PAGE(!PageHead(src_page), src_page); - src_folio = page_folio(src_page); + spin_unlock(src_ptl); + spin_unlock(dst_ptl); - folio_get(src_folio); - if (unlikely(folio_try_dup_anon_rmap_pmd(src_folio, src_page, dst_vma, src_vma))) { - /* Page maybe pinned: split and retry the fault on PTEs. */ - folio_put(src_folio); - pte_free(dst_mm, pgtable); - spin_unlock(src_ptl); - spin_unlock(dst_ptl); + if (unlikely(need_split)) __split_huge_pmd(src_vma, src_pmd, addr, false); - return -EAGAIN; - } - add_mm_counter(dst_mm, MM_ANONPAGES, HPAGE_PMD_NR); -out_zero_page: - mm_inc_nr_ptes(dst_mm); - pgtable_trans_huge_deposit(dst_mm, dst_pmd, pgtable); - pmdp_set_wrprotect(src_mm, addr, src_pmd); - if (!userfaultfd_wp(dst_vma)) - pmd = pmd_clear_uffd_wp(pmd); - pmd = pmd_wrprotect(pmd); -set_pmd: - pmd = pmd_mkold(pmd); - set_pmd_at(dst_mm, addr, dst_pmd, pmd); - ret = 0; -out_unlock: - spin_unlock(src_ptl); - spin_unlock(dst_ptl); -out: return ret; } -- 2.43.0
