Re: [RFC PATCH v4 14/16] powerpc/64s: Use contiguous PMD/PUD instead of HUGEPD
On Wed, May 29, 2024 at 10:07:55AM +, Christophe Leroy wrote: > We can't but I didn't want to leave nb undefined or with a value that > might lead to writing in the weed. Value 1 seems a safe default. Might be worth to throw a WARN_ON? > >> diff --git a/arch/powerpc/mm/book3s64/hugetlbpage.c > >> b/arch/powerpc/mm/book3s64/hugetlbpage.c > >> index 5a2e512e96db..83c3361b358b 100644 > >> --- a/arch/powerpc/mm/book3s64/hugetlbpage.c > >> +++ b/arch/powerpc/mm/book3s64/hugetlbpage.c > >> @@ -53,6 +53,16 @@ int __hash_page_huge(unsigned long ea, unsigned long > >> access, unsigned long vsid, > >>/* If PTE permissions don't match, take page fault */ > >>if (unlikely(!check_pte_access(access, old_pte))) > >>return 1; > >> + /* > >> + * If hash-4k, hugepages use seeral contiguous PxD entries > > 'several' > >> + * so bail out and let mm make the page young or dirty > >> + */ > >> + if (IS_ENABLED(CONFIG_PPC_4K_PAGES)) { > >> + if (!(old_pte & _PAGE_ACCESSED)) > >> + return 1; > >> + if ((access & _PAGE_WRITE) && !(old_pte & _PAGE_DIRTY)) > >> + return 1; > > > > I have 0 clue about this code. What would happen if we do not bail out? > > > > In that case the pte_xchg() in the while () will only set ACCESS or > DIRTY bit on the first PxD entry, not on all cont-PxD entries. I see, thanks for explaining. -- Oscar Salvador SUSE Labs
Re: [RFC PATCH v4 14/16] powerpc/64s: Use contiguous PMD/PUD instead of HUGEPD
Le 29/05/2024 à 11:23, Oscar Salvador a écrit : > On Mon, May 27, 2024 at 03:30:12PM +0200, Christophe Leroy wrote: >> On book3s/64, the only user of hugepd is hash in 4k mode. >> >> All other setups (hash-64, radix-4, radix-64) use leaf PMD/PUD. >> >> Rework hash-4k to use contiguous PMD and PUD instead. >> >> In that setup there are only two huge page sizes: 16M and 16G. >> >> 16M sits at PMD level and 16G at PUD level. > > > On 4k mode, PMD_SIZE is 2MB and PUD_SIZE is 256MB, right? Correct, as documented in arch/powerpc/include/asm/book3s/64/hash-4k.h > >> +static inline unsigned long hash__pte_update(struct mm_struct *mm, >> + unsigned long addr, >> + pte_t *ptep, unsigned long clr, >> + unsigned long set, >> + int huge) >> +{ >> +unsigned long old; >> + >> +old = hash__pte_update_one(ptep, clr, set); >> + >> +if (IS_ENABLED(CONFIG_PPC_4K_PAGES) && huge) { >> +unsigned int psize = get_slice_psize(mm, addr); >> +int nb, i; >> + >> +if (psize == MMU_PAGE_16M) >> +nb = SZ_16M / PMD_SIZE; >> +else if (psize == MMU_PAGE_16G) >> +nb = SZ_16G / PUD_SIZE; >> +else >> +nb = 1; > > On 4K, hugepages are either 16M or 16G. How can we end up in a situation > whwere the is pte is huge, but is is neither MMU_PAGE_16G nor MMU_PAGE_16M? We can't but I didn't want to leave nb undefined or with a value that might lead to writing in the weed. Value 1 seems a safe default. 
> >> diff --git a/arch/powerpc/mm/book3s64/hugetlbpage.c >> b/arch/powerpc/mm/book3s64/hugetlbpage.c >> index 5a2e512e96db..83c3361b358b 100644 >> --- a/arch/powerpc/mm/book3s64/hugetlbpage.c >> +++ b/arch/powerpc/mm/book3s64/hugetlbpage.c >> @@ -53,6 +53,16 @@ int __hash_page_huge(unsigned long ea, unsigned long >> access, unsigned long vsid, >> /* If PTE permissions don't match, take page fault */ >> if (unlikely(!check_pte_access(access, old_pte))) >> return 1; >> +/* >> + * If hash-4k, hugepages use seeral contiguous PxD entries > 'several' >> + * so bail out and let mm make the page young or dirty >> + */ >> +if (IS_ENABLED(CONFIG_PPC_4K_PAGES)) { >> +if (!(old_pte & _PAGE_ACCESSED)) >> +return 1; >> +if ((access & _PAGE_WRITE) && !(old_pte & _PAGE_DIRTY)) >> +return 1; > > I have 0 clue about this code. What would happen if we do not bail out? > In that case the pte_xchg() in the while () will only set ACCESS or DIRTY bit on the first PxD entry, not on all cont-PxD entries. Christophe
Re: [RFC PATCH v4 14/16] powerpc/64s: Use contiguous PMD/PUD instead of HUGEPD
On Mon, May 27, 2024 at 03:30:12PM +0200, Christophe Leroy wrote: > On book3s/64, the only user of hugepd is hash in 4k mode. > > All other setups (hash-64, radix-4, radix-64) use leaf PMD/PUD. > > Rework hash-4k to use contiguous PMD and PUD instead. > > In that setup there are only two huge page sizes: 16M and 16G. > > 16M sits at PMD level and 16G at PUD level. On 4k mode, PMD_SIZE is 2MB and PUD_SIZE is 256MB, right? > +static inline unsigned long hash__pte_update(struct mm_struct *mm, > + unsigned long addr, > + pte_t *ptep, unsigned long clr, > + unsigned long set, > + int huge) > +{ > + unsigned long old; > + > + old = hash__pte_update_one(ptep, clr, set); > + > + if (IS_ENABLED(CONFIG_PPC_4K_PAGES) && huge) { > + unsigned int psize = get_slice_psize(mm, addr); > + int nb, i; > + > + if (psize == MMU_PAGE_16M) > + nb = SZ_16M / PMD_SIZE; > + else if (psize == MMU_PAGE_16G) > + nb = SZ_16G / PUD_SIZE; > + else > + nb = 1; On 4K, hugepages are either 16M or 16G. How can we end up in a situation where the pte is huge, but it is neither MMU_PAGE_16G nor MMU_PAGE_16M? > diff --git a/arch/powerpc/mm/book3s64/hugetlbpage.c > b/arch/powerpc/mm/book3s64/hugetlbpage.c > index 5a2e512e96db..83c3361b358b 100644 > --- a/arch/powerpc/mm/book3s64/hugetlbpage.c > +++ b/arch/powerpc/mm/book3s64/hugetlbpage.c > @@ -53,6 +53,16 @@ int __hash_page_huge(unsigned long ea, unsigned long > access, unsigned long vsid, > /* If PTE permissions don't match, take page fault */ > if (unlikely(!check_pte_access(access, old_pte))) > return 1; > + /* > + * If hash-4k, hugepages use seeral contiguous PxD entries 'several' > + * so bail out and let mm make the page young or dirty > + */ > + if (IS_ENABLED(CONFIG_PPC_4K_PAGES)) { > + if (!(old_pte & _PAGE_ACCESSED)) > + return 1; > + if ((access & _PAGE_WRITE) && !(old_pte & _PAGE_DIRTY)) > + return 1; I have 0 clue about this code. What would happen if we do not bail out? -- Oscar Salvador SUSE Labs
[RFC PATCH v4 14/16] powerpc/64s: Use contiguous PMD/PUD instead of HUGEPD
On book3s/64, the only user of hugepd is hash in 4k mode. All other setups (hash-64, radix-4, radix-64) use leaf PMD/PUD. Rework hash-4k to use contiguous PMD and PUD instead. In that setup there are only two huge page sizes: 16M and 16G. 16M sits at PMD level and 16G at PUD level. pte_update doesn't know page size, lets use the same trick as hpte_need_flush() to get page size from segment properties. That's not the most efficient way but let's do that until callers of pte_update() provide page size instead of just a huge flag. Signed-off-by: Christophe Leroy --- v3: - Add missing pmd_leaf_size() and pud_leaf_size() - More cleanup in hugetlbpage_init() - Take a page fault when DIRTY or ACCESSED is missing on hash-4 hugepage v4: Rebased on v6.10-rc1 --- arch/powerpc/include/asm/book3s/64/hash-4k.h | 15 -- arch/powerpc/include/asm/book3s/64/hash.h | 38 --- arch/powerpc/include/asm/book3s/64/hugetlb.h | 38 --- .../include/asm/book3s/64/pgtable-4k.h| 47 --- .../include/asm/book3s/64/pgtable-64k.h | 20 arch/powerpc/include/asm/book3s/64/pgtable.h | 22 +++-- arch/powerpc/include/asm/hugetlb.h| 4 ++ .../powerpc/include/asm/nohash/hugetlb-e500.h | 4 -- arch/powerpc/include/asm/page.h | 8 arch/powerpc/mm/book3s64/hash_utils.c | 11 +++-- arch/powerpc/mm/book3s64/hugetlbpage.c| 10 arch/powerpc/mm/book3s64/pgtable.c| 12 - arch/powerpc/mm/hugetlbpage.c | 27 --- arch/powerpc/mm/pgtable.c | 2 +- arch/powerpc/platforms/Kconfig.cputype| 1 - 15 files changed, 72 insertions(+), 187 deletions(-) delete mode 100644 arch/powerpc/include/asm/book3s/64/pgtable-4k.h diff --git a/arch/powerpc/include/asm/book3s/64/hash-4k.h b/arch/powerpc/include/asm/book3s/64/hash-4k.h index 6472b08fa1b0..c654c376ef8b 100644 --- a/arch/powerpc/include/asm/book3s/64/hash-4k.h +++ b/arch/powerpc/include/asm/book3s/64/hash-4k.h @@ -74,21 +74,6 @@ #define remap_4k_pfn(vma, addr, pfn, prot) \ remap_pfn_range((vma), (addr), (pfn), PAGE_SIZE, (prot)) -#ifdef CONFIG_HUGETLB_PAGE -static inline int 
hash__hugepd_ok(hugepd_t hpd) -{ - unsigned long hpdval = hpd_val(hpd); - /* -* if it is not a pte and have hugepd shift mask -* set, then it is a hugepd directory pointer -*/ - if (!(hpdval & _PAGE_PTE) && (hpdval & _PAGE_PRESENT) && - ((hpdval & HUGEPD_SHIFT_MASK) != 0)) - return true; - return false; -} -#endif - /* * 4K PTE format is different from 64K PTE format. Saving the hash_slot is just * a matter of returning the PTE bits that need to be modified. On 64K PTE, diff --git a/arch/powerpc/include/asm/book3s/64/hash.h b/arch/powerpc/include/asm/book3s/64/hash.h index faf3e3b4e4b2..8202c27afe23 100644 --- a/arch/powerpc/include/asm/book3s/64/hash.h +++ b/arch/powerpc/include/asm/book3s/64/hash.h @@ -4,6 +4,7 @@ #ifdef __KERNEL__ #include +#include /* * Common bits between 4K and 64K pages in a linux-style PTE. @@ -161,14 +162,10 @@ extern void hpte_need_flush(struct mm_struct *mm, unsigned long addr, pte_t *ptep, unsigned long pte, int huge); unsigned long htab_convert_pte_flags(unsigned long pteflags, unsigned long flags); /* Atomic PTE updates */ -static inline unsigned long hash__pte_update(struct mm_struct *mm, -unsigned long addr, -pte_t *ptep, unsigned long clr, -unsigned long set, -int huge) +static inline unsigned long hash__pte_update_one(pte_t *ptep, unsigned long clr, +unsigned long set) { __be64 old_be, tmp_be; - unsigned long old; __asm__ __volatile__( "1: ldarx %0,0,%3 # pte_update\n\ @@ -182,11 +179,38 @@ static inline unsigned long hash__pte_update(struct mm_struct *mm, : "r" (ptep), "r" (cpu_to_be64(clr)), "m" (*ptep), "r" (cpu_to_be64(H_PAGE_BUSY)), "r" (cpu_to_be64(set)) : "cc" ); + + return be64_to_cpu(old_be); +} + +static inline unsigned long hash__pte_update(struct mm_struct *mm, +unsigned long addr, +pte_t *ptep, unsigned long clr, +unsigned long set, +int huge) +{ + unsigned long old; + + old = hash__pte_update_one(ptep, clr, set); + + if (IS_ENABLED(CONFIG_PPC_4K_PAGES) && huge) { + unsigned int psize = get_slice_psize(mm, addr); + 
int nb, i; + + if (psize == MMU_PAGE_16M) + nb = SZ_16M / PMD_SIZE; + else if (psize == MMU_PAGE_16G) +