Re: [RFC PATCH v4 14/16] powerpc/64s: Use contiguous PMD/PUD instead of HUGEPD

2024-05-29 Thread Oscar Salvador
On Wed, May 29, 2024 at 10:07:55AM +, Christophe Leroy wrote:
> We can't, but I didn't want to leave nb undefined or with a value that
> might lead to writing in the weeds. Value 1 seems a safe default.

Might be worth throwing a WARN_ON?
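
Something like the below, maybe (untested, just to sketch the idea, reusing
the names from your hunk):

	if (psize == MMU_PAGE_16M) {
		nb = SZ_16M / PMD_SIZE;
	} else if (psize == MMU_PAGE_16G) {
		nb = SZ_16G / PUD_SIZE;
	} else {
		WARN_ON_ONCE(1);	/* no other huge psize expected on hash-4k */
		nb = 1;
	}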

> >> diff --git a/arch/powerpc/mm/book3s64/hugetlbpage.c 
> >> b/arch/powerpc/mm/book3s64/hugetlbpage.c
> >> index 5a2e512e96db..83c3361b358b 100644
> >> --- a/arch/powerpc/mm/book3s64/hugetlbpage.c
> >> +++ b/arch/powerpc/mm/book3s64/hugetlbpage.c
> >> @@ -53,6 +53,16 @@ int __hash_page_huge(unsigned long ea, unsigned long 
> >> access, unsigned long vsid,
> >>/* If PTE permissions don't match, take page fault */
> >>if (unlikely(!check_pte_access(access, old_pte)))
> >>return 1;
> >> +  /*
> >> +   * If hash-4k, hugepages use seeral contiguous PxD entries
> > 'several'
> >> +   * so bail out and let mm make the page young or dirty
> >> +   */
> >> +  if (IS_ENABLED(CONFIG_PPC_4K_PAGES)) {
> >> +  if (!(old_pte & _PAGE_ACCESSED))
> >> +  return 1;
> >> +  if ((access & _PAGE_WRITE) && !(old_pte & _PAGE_DIRTY))
> >> +  return 1;
> > 
> > I have 0 clue about this code. What would happen if we do not bail out?
> > 
> 
> In that case, the pte_xchg() in the while() loop will only set the
> ACCESSED or DIRTY bit on the first PxD entry, not on all the cont-PxD
> entries.

I see, thanks for explaining.


-- 
Oscar Salvador
SUSE Labs


Re: [RFC PATCH v4 14/16] powerpc/64s: Use contiguous PMD/PUD instead of HUGEPD

2024-05-29 Thread Christophe Leroy


On 29/05/2024 at 11:23, Oscar Salvador wrote:
> On Mon, May 27, 2024 at 03:30:12PM +0200, Christophe Leroy wrote:
>> On book3s/64, the only user of hugepd is hash in 4k mode.
>>
>> All other setups (hash-64, radix-4, radix-64) use leaf PMD/PUD.
>>
>> Rework hash-4k to use contiguous PMD and PUD instead.
>>
>> In that setup there are only two huge page sizes: 16M and 16G.
>>
>> 16M sits at PMD level and 16G at PUD level.
> 
> 
> On 4k mode, PMD_SIZE is 2MB and PUD_SIZE is 256MB, right?

Correct, as documented in arch/powerpc/include/asm/book3s/64/hash-4k.h
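
For reference, those two sizes give the following cont-PxD geometry
(just the arithmetic, for readers following along):

	16M hugepage: SZ_16M / PMD_SIZE = 16M / 2M   =  8 contiguous PMD entries
	16G hugepage: SZ_16G / PUD_SIZE = 16G / 256M = 64 contiguous PUD entries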

> 
>> +static inline unsigned long hash__pte_update(struct mm_struct *mm,
>> + unsigned long addr,
>> + pte_t *ptep, unsigned long clr,
>> + unsigned long set,
>> + int huge)
>> +{
>> +unsigned long old;
>> +
>> +old = hash__pte_update_one(ptep, clr, set);
>> +
>> +if (IS_ENABLED(CONFIG_PPC_4K_PAGES) && huge) {
>> +unsigned int psize = get_slice_psize(mm, addr);
>> +int nb, i;
>> +
>> +if (psize == MMU_PAGE_16M)
>> +nb = SZ_16M / PMD_SIZE;
>> +else if (psize == MMU_PAGE_16G)
>> +nb = SZ_16G / PUD_SIZE;
>> +else
>> +nb = 1;
> 
> On 4K, hugepages are either 16M or 16G. How can we end up in a situation
> where the pte is huge, but it is neither MMU_PAGE_16G nor MMU_PAGE_16M?

We can't, but I didn't want to leave nb undefined or with a value that
might lead to writing in the weeds. Value 1 seems a safe default.

> 
>> diff --git a/arch/powerpc/mm/book3s64/hugetlbpage.c 
>> b/arch/powerpc/mm/book3s64/hugetlbpage.c
>> index 5a2e512e96db..83c3361b358b 100644
>> --- a/arch/powerpc/mm/book3s64/hugetlbpage.c
>> +++ b/arch/powerpc/mm/book3s64/hugetlbpage.c
>> @@ -53,6 +53,16 @@ int __hash_page_huge(unsigned long ea, unsigned long 
>> access, unsigned long vsid,
>>  /* If PTE permissions don't match, take page fault */
>>  if (unlikely(!check_pte_access(access, old_pte)))
>>  return 1;
>> +/*
>> + * If hash-4k, hugepages use seeral contiguous PxD entries
> 'several'
>> + * so bail out and let mm make the page young or dirty
>> + */
>> +if (IS_ENABLED(CONFIG_PPC_4K_PAGES)) {
>> +if (!(old_pte & _PAGE_ACCESSED))
>> +return 1;
>> +if ((access & _PAGE_WRITE) && !(old_pte & _PAGE_DIRTY))
>> +return 1;
> 
> I have 0 clue about this code. What would happen if we do not bail out?
> 

In that case, the pte_xchg() in the while() loop will only set the
ACCESSED or DIRTY bit on the first PxD entry, not on all the cont-PxD
entries.
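
The update therefore has to be replayed on each entry of the contiguous
group, roughly like this (a simplified sketch, assuming the cont-PxD
entries sit at consecutive ptep slots):

	/* the first entry was already updated by hash__pte_update_one() above */
	for (i = 1; i < nb; i++)
		hash__pte_update_one(ptep + i, clr, set);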

Christophe


Re: [RFC PATCH v4 14/16] powerpc/64s: Use contiguous PMD/PUD instead of HUGEPD

2024-05-29 Thread Oscar Salvador
On Mon, May 27, 2024 at 03:30:12PM +0200, Christophe Leroy wrote:
> On book3s/64, the only user of hugepd is hash in 4k mode.
> 
> All other setups (hash-64, radix-4, radix-64) use leaf PMD/PUD.
> 
> Rework hash-4k to use contiguous PMD and PUD instead.
> 
> In that setup there are only two huge page sizes: 16M and 16G.
> 
> 16M sits at PMD level and 16G at PUD level.


On 4k mode, PMD_SIZE is 2MB and PUD_SIZE is 256MB, right?

> +static inline unsigned long hash__pte_update(struct mm_struct *mm,
> +  unsigned long addr,
> +  pte_t *ptep, unsigned long clr,
> +  unsigned long set,
> +  int huge)
> +{
> + unsigned long old;
> +
> + old = hash__pte_update_one(ptep, clr, set);
> +
> + if (IS_ENABLED(CONFIG_PPC_4K_PAGES) && huge) {
> + unsigned int psize = get_slice_psize(mm, addr);
> + int nb, i;
> +
> + if (psize == MMU_PAGE_16M)
> + nb = SZ_16M / PMD_SIZE;
> + else if (psize == MMU_PAGE_16G)
> + nb = SZ_16G / PUD_SIZE;
> + else
> + nb = 1;

On 4K, hugepages are either 16M or 16G. How can we end up in a situation
where the pte is huge, but it is neither MMU_PAGE_16G nor MMU_PAGE_16M?

> diff --git a/arch/powerpc/mm/book3s64/hugetlbpage.c 
> b/arch/powerpc/mm/book3s64/hugetlbpage.c
> index 5a2e512e96db..83c3361b358b 100644
> --- a/arch/powerpc/mm/book3s64/hugetlbpage.c
> +++ b/arch/powerpc/mm/book3s64/hugetlbpage.c
> @@ -53,6 +53,16 @@ int __hash_page_huge(unsigned long ea, unsigned long 
> access, unsigned long vsid,
>   /* If PTE permissions don't match, take page fault */
>   if (unlikely(!check_pte_access(access, old_pte)))
>   return 1;
> + /*
> +  * If hash-4k, hugepages use seeral contiguous PxD entries
'several'
> +  * so bail out and let mm make the page young or dirty
> +  */
> + if (IS_ENABLED(CONFIG_PPC_4K_PAGES)) {
> + if (!(old_pte & _PAGE_ACCESSED))
> + return 1;
> + if ((access & _PAGE_WRITE) && !(old_pte & _PAGE_DIRTY))
> + return 1;

I have 0 clue about this code. What would happen if we do not bail out?


-- 
Oscar Salvador
SUSE Labs


[RFC PATCH v4 14/16] powerpc/64s: Use contiguous PMD/PUD instead of HUGEPD

2024-05-27 Thread Christophe Leroy
On book3s/64, the only user of hugepd is hash in 4k mode.

All other setups (hash-64, radix-4, radix-64) use leaf PMD/PUD.

Rework hash-4k to use contiguous PMD and PUD instead.

In that setup there are only two huge page sizes: 16M and 16G.

16M sits at PMD level and 16G at PUD level.

pte_update() doesn't know the page size, so let's use the same trick as
hpte_need_flush() to get the page size from the segment properties. That's
not the most efficient way, but let's do that until callers of
pte_update() provide the page size instead of just a huge flag.

Signed-off-by: Christophe Leroy 
---
v3:
- Add missing pmd_leaf_size() and pud_leaf_size()
- More cleanup in hugetlbpage_init()
- Take a page fault when DIRTY or ACCESSED is missing on hash-4 hugepage

v4: Rebased on v6.10-rc1
---
 arch/powerpc/include/asm/book3s/64/hash-4k.h  | 15 --
 arch/powerpc/include/asm/book3s/64/hash.h | 38 ---
 arch/powerpc/include/asm/book3s/64/hugetlb.h  | 38 ---
 .../include/asm/book3s/64/pgtable-4k.h| 47 ---
 .../include/asm/book3s/64/pgtable-64k.h   | 20 
 arch/powerpc/include/asm/book3s/64/pgtable.h  | 22 +++--
 arch/powerpc/include/asm/hugetlb.h|  4 ++
 .../powerpc/include/asm/nohash/hugetlb-e500.h |  4 --
 arch/powerpc/include/asm/page.h   |  8 
 arch/powerpc/mm/book3s64/hash_utils.c | 11 +++--
 arch/powerpc/mm/book3s64/hugetlbpage.c| 10 
 arch/powerpc/mm/book3s64/pgtable.c| 12 -
 arch/powerpc/mm/hugetlbpage.c | 27 ---
 arch/powerpc/mm/pgtable.c |  2 +-
 arch/powerpc/platforms/Kconfig.cputype|  1 -
 15 files changed, 72 insertions(+), 187 deletions(-)
 delete mode 100644 arch/powerpc/include/asm/book3s/64/pgtable-4k.h

diff --git a/arch/powerpc/include/asm/book3s/64/hash-4k.h 
b/arch/powerpc/include/asm/book3s/64/hash-4k.h
index 6472b08fa1b0..c654c376ef8b 100644
--- a/arch/powerpc/include/asm/book3s/64/hash-4k.h
+++ b/arch/powerpc/include/asm/book3s/64/hash-4k.h
@@ -74,21 +74,6 @@
 #define remap_4k_pfn(vma, addr, pfn, prot) \
remap_pfn_range((vma), (addr), (pfn), PAGE_SIZE, (prot))
 
-#ifdef CONFIG_HUGETLB_PAGE
-static inline int hash__hugepd_ok(hugepd_t hpd)
-{
-   unsigned long hpdval = hpd_val(hpd);
-   /*
-* if it is not a pte and have hugepd shift mask
-* set, then it is a hugepd directory pointer
-*/
-   if (!(hpdval & _PAGE_PTE) && (hpdval & _PAGE_PRESENT) &&
-   ((hpdval & HUGEPD_SHIFT_MASK) != 0))
-   return true;
-   return false;
-}
-#endif
-
 /*
  * 4K PTE format is different from 64K PTE format. Saving the hash_slot is just
  * a matter of returning the PTE bits that need to be modified. On 64K PTE,
diff --git a/arch/powerpc/include/asm/book3s/64/hash.h 
b/arch/powerpc/include/asm/book3s/64/hash.h
index faf3e3b4e4b2..8202c27afe23 100644
--- a/arch/powerpc/include/asm/book3s/64/hash.h
+++ b/arch/powerpc/include/asm/book3s/64/hash.h
@@ -4,6 +4,7 @@
 #ifdef __KERNEL__
 
 #include 
+#include 
 
 /*
  * Common bits between 4K and 64K pages in a linux-style PTE.
@@ -161,14 +162,10 @@ extern void hpte_need_flush(struct mm_struct *mm, 
unsigned long addr,
pte_t *ptep, unsigned long pte, int huge);
 unsigned long htab_convert_pte_flags(unsigned long pteflags, unsigned long 
flags);
 /* Atomic PTE updates */
-static inline unsigned long hash__pte_update(struct mm_struct *mm,
-unsigned long addr,
-pte_t *ptep, unsigned long clr,
-unsigned long set,
-int huge)
+static inline unsigned long hash__pte_update_one(pte_t *ptep, unsigned long 
clr,
+unsigned long set)
 {
__be64 old_be, tmp_be;
-   unsigned long old;
 
__asm__ __volatile__(
"1: ldarx   %0,0,%3 # pte_update\n\
@@ -182,11 +179,38 @@ static inline unsigned long hash__pte_update(struct 
mm_struct *mm,
: "r" (ptep), "r" (cpu_to_be64(clr)), "m" (*ptep),
  "r" (cpu_to_be64(H_PAGE_BUSY)), "r" (cpu_to_be64(set))
: "cc" );
+
+   return be64_to_cpu(old_be);
+}
+
+static inline unsigned long hash__pte_update(struct mm_struct *mm,
+unsigned long addr,
+pte_t *ptep, unsigned long clr,
+unsigned long set,
+int huge)
+{
+   unsigned long old;
+
+   old = hash__pte_update_one(ptep, clr, set);
+
+   if (IS_ENABLED(CONFIG_PPC_4K_PAGES) && huge) {
+   unsigned int psize = get_slice_psize(mm, addr);
+   int nb, i;
+
+   if (psize == MMU_PAGE_16M)
+   nb = SZ_16M / PMD_SIZE;
+   else if (psize == MMU_PAGE_16G)
+