On Wed, Jun 11, 2025 at 02:06:54PM +0200, David Hildenbrand wrote:
> Marking PUDs that map "normal" refcounted folios as special is against
> our rules documented for vm_normal_page().
Might be worth referring to specifically which rule. I'm guessing it's the
general one of special == don't touch (from the vm_normal_page() comment):

/*
 * vm_normal_page -- This function gets the "struct page" associated with a pte.
 *
 * "Special" mappings do not wish to be associated with a "struct page" (either
 * it doesn't exist, or it exists but they don't want to touch it). In this
 * case, NULL is returned here. "Normal" mappings do have a struct page.
 *
 * ...
 */

But don't we already violate this? E.g.:

	if (vma->vm_ops && vma->vm_ops->find_special_page)
		return vma->vm_ops->find_special_page(vma, addr);

I mean, this in itself perhaps means we should update that comment to say
'except when file-backed and there is a find_special_page() hook'.

> Fortunately, there are not that many pud_special() checks that can be
> misled, and they are right now rather harmless: e.g., none so far bases
> a decision on whether to grab a folio reference on that check.
>
> Well, and GUP-fast will fall back to GUP-slow. All in all, no big
> implications so far, it seems.
>
> Getting this right will become more important as we introduce
> folio_normal_page_pud() and start using it in more places where we
> currently special-case based on other VMA flags.
>
> Fix it just like we fixed vmf_insert_folio_pmd().
>
> Add folio_mk_pud() to mimic what we do with folio_mk_pmd().
>
> Fixes: dbe54153296d ("mm/huge_memory: add vmf_insert_folio_pud()")
> Signed-off-by: David Hildenbrand <da...@redhat.com>

LGTM, so:

Reviewed-by: Lorenzo Stoakes <lorenzo.stoa...@oracle.com>

A couple of nits/comments below.

> ---
>  include/linux/mm.h | 19 ++++++++++++++++-
>  mm/huge_memory.c   | 51 +++++++++++++++++++++++++---------------------
>  2 files changed, 46 insertions(+), 24 deletions(-)
>
> diff --git a/include/linux/mm.h b/include/linux/mm.h
> index fa538feaa8d95..912b6d40a12d6 100644
> --- a/include/linux/mm.h
> +++ b/include/linux/mm.h
> @@ -1816,7 +1816,24 @@ static inline pmd_t folio_mk_pmd(struct folio *folio, pgprot_t pgprot)
>  {
>  	return pmd_mkhuge(pfn_pmd(folio_pfn(folio), pgprot));
>  }
> -#endif
> +
> +#ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
> +/**
> + * folio_mk_pud - Create a PUD for this folio
> + * @folio: The folio to create a PUD for
> + * @pgprot: The page protection bits to use
> + *
> + * Create a page table entry for the first page of this folio.
> + * This is suitable for passing to set_pud_at().
> + *
> + * Return: A page table entry suitable for mapping this folio.
> + */
> +static inline pud_t folio_mk_pud(struct folio *folio, pgprot_t pgprot)

Nice to have some consistency around PUD; it seems so often we do a PMD
version of the relevant functions, then with PUD we go 'meh, whatever' :)

> +{
> +	return pud_mkhuge(pfn_pud(folio_pfn(folio), pgprot));
> +}
> +#endif /* CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */
> +#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
>  #endif /* CONFIG_MMU */
>
>  static inline bool folio_has_pincount(const struct folio *folio)
> diff --git a/mm/huge_memory.c b/mm/huge_memory.c
> index 7e3e9028873e5..4734de1dc0ae4 100644
> --- a/mm/huge_memory.c
> +++ b/mm/huge_memory.c
> @@ -1535,15 +1535,18 @@ static pud_t maybe_pud_mkwrite(pud_t pud, struct vm_area_struct *vma)
>  	return pud;
>  }
>
> -static void insert_pfn_pud(struct vm_area_struct *vma, unsigned long addr,
> -		pud_t *pud, pfn_t pfn, pgprot_t prot, bool write)
> +static void insert_pud(struct vm_area_struct *vma, unsigned long addr,
> +		pud_t *pud, struct folio_or_pfn fop, pgprot_t prot, bool write)
> {
>  	struct mm_struct *mm = vma->vm_mm;
>  	pud_t entry;
>
>  	if (!pud_none(*pud)) {
> +		const unsigned long pfn = fop.is_folio ? folio_pfn(fop.folio) :
> +					  pfn_t_to_pfn(fop.pfn);
> +
>  		if (write) {
> -			if (WARN_ON_ONCE(pud_pfn(*pud) != pfn_t_to_pfn(pfn)))
> +			if (WARN_ON_ONCE(pud_pfn(*pud) != pfn))
>  				return;
>  			entry = pud_mkyoung(*pud);
>  			entry = maybe_pud_mkwrite(pud_mkdirty(entry), vma);
> @@ -1553,11 +1556,19 @@ static void insert_pfn_pud(struct vm_area_struct *vma, unsigned long addr,
>  		return;
>  	}
>
> -	entry = pud_mkhuge(pfn_t_pud(pfn, prot));
> -	if (pfn_t_devmap(pfn))
> -		entry = pud_mkdevmap(entry);
> -	else
> -		entry = pud_mkspecial(entry);
> +	if (fop.is_folio) {
> +		entry = folio_mk_pud(fop.folio, vma->vm_page_prot);
> +
> +		folio_get(fop.folio);
> +		folio_add_file_rmap_pud(fop.folio, &fop.folio->page, vma);
> +		add_mm_counter(mm, mm_counter_file(fop.folio), HPAGE_PUD_NR);

Nit, but it might be nice to abstract this for PMD/PUD, see the sketch
below.
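Something like this, perhaps - a completely untested sketch, helper name
invented, and it would need to live somewhere both the PMD and PUD rmap
helpers are visible:

/*
 * Hypothetical helper, only to illustrate the nit: grab the folio
 * reference and do the file rmap + counter accounting for a huge folio
 * mapped at either PMD or PUD level.
 */
static void folio_add_file_huge(struct folio *folio,
		struct vm_area_struct *vma, bool pud)
{
	folio_get(folio);
	if (pud)
		folio_add_file_rmap_pud(folio, &folio->page, vma);
	else
		folio_add_file_rmap_pmd(folio, &folio->page, vma);
	add_mm_counter(vma->vm_mm, mm_counter_file(folio),
		       pud ? HPAGE_PUD_NR : HPAGE_PMD_NR);
}

Then the folio branches in insert_pmd() and insert_pud() would reduce to
folio_mk_pmd()/folio_mk_pud() plus one call to the helper. Anyway, not a
blocker.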
> +	} else {
> +		entry = pud_mkhuge(pfn_t_pud(fop.pfn, prot));

Same incredibly pedantic whitespace comment from previous patch :)

> +		if (pfn_t_devmap(fop.pfn))
> +			entry = pud_mkdevmap(entry);
> +		else
> +			entry = pud_mkspecial(entry);
> +	}
>  	if (write) {
>  		entry = pud_mkyoung(pud_mkdirty(entry));
>  		entry = maybe_pud_mkwrite(entry, vma);
> @@ -1581,6 +1592,9 @@ vm_fault_t vmf_insert_pfn_pud(struct vm_fault *vmf, pfn_t pfn, bool write)
>  	unsigned long addr = vmf->address & PUD_MASK;
>  	struct vm_area_struct *vma = vmf->vma;
>  	pgprot_t pgprot = vma->vm_page_prot;
> +	struct folio_or_pfn fop = {
> +		.pfn = pfn,
> +	};
>  	spinlock_t *ptl;
>
>  	/*
> @@ -1600,7 +1614,7 @@ vm_fault_t vmf_insert_pfn_pud(struct vm_fault *vmf, pfn_t pfn, bool write)
>  	pfnmap_setup_cachemode_pfn(pfn_t_to_pfn(pfn), &pgprot);
>
>  	ptl = pud_lock(vma->vm_mm, vmf->pud);
> -	insert_pfn_pud(vma, addr, vmf->pud, pfn, pgprot, write);
> +	insert_pud(vma, addr, vmf->pud, fop, pgprot, write);
>  	spin_unlock(ptl);
>
>  	return VM_FAULT_NOPAGE;
> @@ -1622,6 +1636,10 @@ vm_fault_t vmf_insert_folio_pud(struct vm_fault *vmf, struct folio *folio,
>  	unsigned long addr = vmf->address & PUD_MASK;
>  	pud_t *pud = vmf->pud;
>  	struct mm_struct *mm = vma->vm_mm;
> +	struct folio_or_pfn fop = {
> +		.folio = folio,
> +		.is_folio = true,
> +	};
>  	spinlock_t *ptl;
>
>  	if (addr < vma->vm_start || addr >= vma->vm_end)
> @@ -1631,20 +1649,7 @@ vm_fault_t vmf_insert_folio_pud(struct vm_fault *vmf, struct folio *folio,
>  		return VM_FAULT_SIGBUS;
>
>  	ptl = pud_lock(mm, pud);
> -
> -	/*
> -	 * If there is already an entry present we assume the folio is
> -	 * already mapped, hence no need to take another reference. We
> -	 * still call insert_pfn_pud() though in case the mapping needs
> -	 * upgrading to writeable.
> -	 */
> -	if (pud_none(*vmf->pud)) {
> -		folio_get(folio);
> -		folio_add_file_rmap_pud(folio, &folio->page, vma);
> -		add_mm_counter(mm, mm_counter_file(folio), HPAGE_PUD_NR);
> -	}
> -	insert_pfn_pud(vma, addr, vmf->pud, pfn_to_pfn_t(folio_pfn(folio)),
> -			vma->vm_page_prot, write);
> +	insert_pud(vma, addr, vmf->pud, fop, vma->vm_page_prot, write);
>  	spin_unlock(ptl);
>
>  	return VM_FAULT_NOPAGE;
> --
> 2.49.0
>
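As an aside, for anyone reading along without the earlier patch in this
series to hand - IIRC the struct folio_or_pfn that insert_pud() now takes
was introduced there in mm/huge_memory.c as roughly:

struct folio_or_pfn {
	union {
		struct folio *folio;
		pfn_t pfn;
	};
	bool is_folio;
};

i.e. a small tagged union, so a single insert_pud() can serve both the
vmf_insert_pfn_pud() and vmf_insert_folio_pud() paths without the callers
having to care which flavour they are passing down.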