Re: [RFC PATCH v2 07/20] powerpc/8xx: Rework support for 8M pages using contiguous PTE entries

2024-05-24 Thread Christophe Leroy


Le 24/05/2024 à 12:02, Oscar Salvador a écrit :
> On Fri, May 17, 2024 at 09:00:01PM +0200, Christophe Leroy wrote:
>> In order to fit better with standard Linux page tables layout, add
>> support for 8M pages using contiguous PTE entries in a standard
>> page table. Page tables will then be populated with 1024 similar
>> entries and two PMD entries will point to that page table.
>>
>> The PMD entries also get a flag to tell it is addressing an 8M page,
>> this is required for the HW tablewalk assistance.
>>
>> Signed-off-by: Christophe Leroy 
> 
> I guess that this will slightly change if you remove patch#1 and patch#2
> as you said you will.
> So I will not comment on the overall design because I do not know how it will
> look afterwards, but just some things that caught my eye

Sure. I should send-out a v3 today or tomorrow, once I've done a few 
more tests.


> 
>> --- a/arch/powerpc/include/asm/hugetlb.h
>> +++ b/arch/powerpc/include/asm/hugetlb.h
>> @@ -41,7 +41,16 @@ void hugetlb_free_pgd_range(struct mmu_gather *tlb, 
>> unsigned long addr,
>>   static inline pte_t huge_ptep_get_and_clear(struct mm_struct *mm,
>>  unsigned long addr, pte_t *ptep)
>>   {
>> -return __pte(pte_update(mm, addr, ptep, ~0UL, 0, 1));
>> +pmd_t *pmdp = (pmd_t *)ptep;
>> +pte_t pte;
>> +
>> +if (IS_ENABLED(CONFIG_PPC_8xx) && pmdp == pmd_off(mm, ALIGN_DOWN(addr, 
>> SZ_8M))) {
> 
> There are quite some places where you do the "pmd_off" to check whether that
> is a 8MB entry.

I refactored the code, now I have only two places with it: pte_update() 
and huge_ptep_get()

By the way it doesn't check that PMD is 8M, it checks that the ptep 
points to the first PMD entry matching the said address.

> I think it would make somse sense to have some kind of macro/function to make
> more clear what we are checking against.
> e.g:
> 
>   #define pmd_is_SZ_8M(mm, addr, pmdp) (pmdp == pmd_off(mm, ALIGN_DOWN(addr, 
> SZ_8M)))
>   (or whatever name you see fit)
>   
> then you would just need
> 
>   if (IS_ENABLED(CONFIG_PPC_8xx && pmd_is_SZ_8M(mm, addr, pdmp))
> 
> Because I see that is also scaterred in 8xx code.
> 
> 
>> +pte = __pte(pte_update(mm, addr, pte_offset_kernel(pmdp, 0), 
>> ~0UL, 0, 1));
>> +pte_update(mm, addr, pte_offset_kernel(pmdp + 1, 0), ~0UL, 0, 
>> 1);
> 
> I have this fresh one because I recently read about 8xx pagetables, but not 
> sure
> how my memory will survive this, so maybe throw a little comment in there that
> we are pointing the two pmds to the area.

The two PMD are now pointing to there own areas, we are not anymore in 
the hugepd case where the PMD was pointing to a single HUGEPD containing 
a single HUGEPTE.

> 
> Also, the way we pass the parameters here to pte_update() is a bit awkward.
> Ideally we should be using some meaningful names?
> 
>   clr_all_bits = ~0UL
>   set_bits = 0
>   bool is_huge = true
> 
>   pte_update(mm, addr, pte_offset_kernel(pmdp + 1, 0), clr_all_bits, 
> set_bits, is_huge)
> 
> or something along those lines

Well, with my refactoring those functions are not modified anymore so I 
won't change them.

> 
>> -static inline int check_and_get_huge_psize(int shift)
>> -{
>> -return shift_to_mmu_psize(shift);
>> +if (pmdp == pmd_off(mm, ALIGN_DOWN(addr, SZ_8M)))
> 
> Here you could also use the pmd_is_SZ_8M()

Yes, may do that.

> 
>> +ptep = pte_offset_kernel(pmdp, 0);
>> +return ptep_get(ptep);
>>   }
>>   
>>   #define __HAVE_ARCH_HUGE_SET_HUGE_PTE_AT
>> @@ -53,7 +33,14 @@ void set_huge_pte_at(struct mm_struct *mm, unsigned long 
>> addr, pte_t *ptep,
>>   static inline void huge_pte_clear(struct mm_struct *mm, unsigned long addr,
>>pte_t *ptep, unsigned long sz)
>>   {
>> -pte_update(mm, addr, ptep, ~0UL, 0, 1);
>> +pmd_t *pmdp = (pmd_t *)ptep;
>> +
>> +if (pmdp == pmd_off(mm, ALIGN_DOWN(addr, SZ_8M))) {
>> +pte_update(mm, addr, pte_offset_kernel(pmdp, 0), ~0UL, 0, 1);
>> +pte_update(mm, addr, pte_offset_kernel(pmdp + 1, 0), ~0UL, 0, 
>> 1);
>> +} else {
>> +pte_update(mm, addr, ptep, ~0UL, 0, 1);
>> +}
> 
> Could we not leverage this in huge_ptep_get_and_clear()?

I'm not modifying that anymore

> AFAICS,
> 
>   huge_pet_get_and_clear(mm, addr, pte_t *p)
>   {
>pte_t pte = pte_val(*p);
> 
>huge_pte_clear(mm, addr, p);
>return pte;
>   }
> 
> Or maybe it is not that easy if different powerpc platforms provide their own.
> It might be worth checking though.
> 
>>   }
>>   
>>   #define __HAVE_ARCH_HUGE_PTEP_SET_WRPROTECT
>> @@ -63,7 +50,14 @@ static inline void huge_ptep_set_wrprotect(struct 
>> mm_struct *mm,
>>  unsigned long clr = ~pte_val(pte_wrprotect(__pte(~0)));
>>  unsigned long set = pte_val(pte_wrprotect(__pte(0)));
>>   
>> -pte_update(mm, addr, ptep, clr, set, 1);
>> +pmd_t *pmdp = (pmd_t *)ptep;
>> +
>> +if (pmdp == pmd_off(mm, 

Re: [RFC PATCH v2 07/20] powerpc/8xx: Rework support for 8M pages using contiguous PTE entries

2024-05-24 Thread Oscar Salvador
On Fri, May 17, 2024 at 09:00:01PM +0200, Christophe Leroy wrote:
> In order to fit better with standard Linux page tables layout, add
> support for 8M pages using contiguous PTE entries in a standard
> page table. Page tables will then be populated with 1024 similar
> entries and two PMD entries will point to that page table.
> 
> The PMD entries also get a flag to tell it is addressing an 8M page,
> this is required for the HW tablewalk assistance.
> 
> Signed-off-by: Christophe Leroy 

I guess that this will slightly change if you remove patch#1 and patch#2
as you said you will.
So I will not comment on the overall design because I do not know how it will
look afterwards, but just some things that caught my eye

> --- a/arch/powerpc/include/asm/hugetlb.h
> +++ b/arch/powerpc/include/asm/hugetlb.h
> @@ -41,7 +41,16 @@ void hugetlb_free_pgd_range(struct mmu_gather *tlb, 
> unsigned long addr,
>  static inline pte_t huge_ptep_get_and_clear(struct mm_struct *mm,
>   unsigned long addr, pte_t *ptep)
>  {
> - return __pte(pte_update(mm, addr, ptep, ~0UL, 0, 1));
> + pmd_t *pmdp = (pmd_t *)ptep;
> + pte_t pte;
> +
> + if (IS_ENABLED(CONFIG_PPC_8xx) && pmdp == pmd_off(mm, ALIGN_DOWN(addr, 
> SZ_8M))) {

There are quite some places where you do the "pmd_off" to check whether that
is a 8MB entry.
I think it would make somse sense to have some kind of macro/function to make
more clear what we are checking against.
e.g:

 #define pmd_is_SZ_8M(mm, addr, pmdp) (pmdp == pmd_off(mm, ALIGN_DOWN(addr, 
SZ_8M)))
 (or whatever name you see fit)
 
then you would just need

 if (IS_ENABLED(CONFIG_PPC_8xx && pmd_is_SZ_8M(mm, addr, pdmp))

Because I see that is also scaterred in 8xx code.


> + pte = __pte(pte_update(mm, addr, pte_offset_kernel(pmdp, 0), 
> ~0UL, 0, 1));
> + pte_update(mm, addr, pte_offset_kernel(pmdp + 1, 0), ~0UL, 0, 
> 1);

I have this fresh one because I recently read about 8xx pagetables, but not sure
how my memory will survive this, so maybe throw a little comment in there that
we are pointing the two pmds to the area.

Also, the way we pass the parameters here to pte_update() is a bit awkward.
Ideally we should be using some meaningful names?

 clr_all_bits = ~0UL
 set_bits = 0
 bool is_huge = true

 pte_update(mm, addr, pte_offset_kernel(pmdp + 1, 0), clr_all_bits, set_bits, 
is_huge)

or something along those lines

> -static inline int check_and_get_huge_psize(int shift)
> -{
> - return shift_to_mmu_psize(shift);
> + if (pmdp == pmd_off(mm, ALIGN_DOWN(addr, SZ_8M)))

Here you could also use the pmd_is_SZ_8M()

> + ptep = pte_offset_kernel(pmdp, 0);
> + return ptep_get(ptep);
>  }
>  
>  #define __HAVE_ARCH_HUGE_SET_HUGE_PTE_AT
> @@ -53,7 +33,14 @@ void set_huge_pte_at(struct mm_struct *mm, unsigned long 
> addr, pte_t *ptep,
>  static inline void huge_pte_clear(struct mm_struct *mm, unsigned long addr,
> pte_t *ptep, unsigned long sz)
>  {
> - pte_update(mm, addr, ptep, ~0UL, 0, 1);
> + pmd_t *pmdp = (pmd_t *)ptep;
> +
> + if (pmdp == pmd_off(mm, ALIGN_DOWN(addr, SZ_8M))) {
> + pte_update(mm, addr, pte_offset_kernel(pmdp, 0), ~0UL, 0, 1);
> + pte_update(mm, addr, pte_offset_kernel(pmdp + 1, 0), ~0UL, 0, 
> 1);
> + } else {
> + pte_update(mm, addr, ptep, ~0UL, 0, 1);
> + }

Could we not leverage this in huge_ptep_get_and_clear()?
AFAICS,

 huge_pet_get_and_clear(mm, addr, pte_t *p) 
 {
  pte_t pte = pte_val(*p);

  huge_pte_clear(mm, addr, p);
  return pte;
 }

Or maybe it is not that easy if different powerpc platforms provide their own.
It might be worth checking though.

>  }
>  
>  #define __HAVE_ARCH_HUGE_PTEP_SET_WRPROTECT
> @@ -63,7 +50,14 @@ static inline void huge_ptep_set_wrprotect(struct 
> mm_struct *mm,
>   unsigned long clr = ~pte_val(pte_wrprotect(__pte(~0)));
>   unsigned long set = pte_val(pte_wrprotect(__pte(0)));
>  
> - pte_update(mm, addr, ptep, clr, set, 1);
> + pmd_t *pmdp = (pmd_t *)ptep;
> +
> + if (pmdp == pmd_off(mm, ALIGN_DOWN(addr, SZ_8M))) {
> + pte_update(mm, addr, pte_offset_kernel(pmdp, 0), clr, set, 1);
> + pte_update(mm, addr, pte_offset_kernel(pmdp + 1, 0), clr, set, 
> 1);
> + } else {
> + pte_update(mm, addr, ptep, clr, set, 1);

I would replace the "1" with "is_huge" or "huge", as being done in
__ptep_set_access_flags , something that makes it more clear without the need
to check pte_update().

  
>  #endif /* _ASM_POWERPC_PGALLOC_32_H */
> diff --git a/arch/powerpc/include/asm/nohash/32/pte-8xx.h 
> b/arch/powerpc/include/asm/nohash/32/pte-8xx.h
> index 07df6b664861..b05cc4f87713 100644
> --- a/arch/powerpc/include/asm/nohash/32/pte-8xx.h
> +++ b/arch/powerpc/include/asm/nohash/32/pte-8xx.h
...
> - * For other page sizes, we have a single entry in the table.
> + * For 8M pages, we have 1024 entries as 

[RFC PATCH v2 07/20] powerpc/8xx: Rework support for 8M pages using contiguous PTE entries

2024-05-17 Thread Christophe Leroy
In order to fit better with standard Linux page tables layout, add
support for 8M pages using contiguous PTE entries in a standard
page table. Page tables will then be populated with 1024 similar
entries and two PMD entries will point to that page table.

The PMD entries also get a flag to tell it is addressing an 8M page,
this is required for the HW tablewalk assistance.

Signed-off-by: Christophe Leroy 
---
 arch/powerpc/Kconfig  |  1 -
 arch/powerpc/include/asm/hugetlb.h| 11 +++-
 .../include/asm/nohash/32/hugetlb-8xx.h   | 54 --
 arch/powerpc/include/asm/nohash/32/pgalloc.h  |  2 +
 arch/powerpc/include/asm/nohash/32/pte-8xx.h  | 57 +--
 arch/powerpc/include/asm/nohash/pgtable.h |  4 --
 arch/powerpc/include/asm/page.h   |  5 --
 arch/powerpc/include/asm/pgtable.h|  3 +
 arch/powerpc/kernel/head_8xx.S| 10 +---
 arch/powerpc/mm/hugetlbpage.c | 18 +++---
 arch/powerpc/mm/kasan/8xx.c   | 15 +++--
 arch/powerpc/mm/nohash/8xx.c  | 43 +++---
 arch/powerpc/mm/pgtable.c | 24 +---
 arch/powerpc/mm/pgtable_32.c  |  2 +-
 arch/powerpc/platforms/Kconfig.cputype|  2 +
 15 files changed, 139 insertions(+), 112 deletions(-)

diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index a1a3b3363008..6a4ea7dad23f 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -135,7 +135,6 @@ config PPC
select ARCH_HAS_DMA_MAP_DIRECT  if PPC_PSERIES
select ARCH_HAS_FORTIFY_SOURCE
select ARCH_HAS_GCOV_PROFILE_ALL
-   select ARCH_HAS_HUGEPD  if HUGETLB_PAGE
select ARCH_HAS_KCOV
select ARCH_HAS_MEMBARRIER_CALLBACKS
select ARCH_HAS_MEMBARRIER_SYNC_CORE
diff --git a/arch/powerpc/include/asm/hugetlb.h 
b/arch/powerpc/include/asm/hugetlb.h
index 79176a499763..36ed6d976cf9 100644
--- a/arch/powerpc/include/asm/hugetlb.h
+++ b/arch/powerpc/include/asm/hugetlb.h
@@ -41,7 +41,16 @@ void hugetlb_free_pgd_range(struct mmu_gather *tlb, unsigned 
long addr,
 static inline pte_t huge_ptep_get_and_clear(struct mm_struct *mm,
unsigned long addr, pte_t *ptep)
 {
-   return __pte(pte_update(mm, addr, ptep, ~0UL, 0, 1));
+   pmd_t *pmdp = (pmd_t *)ptep;
+   pte_t pte;
+
+   if (IS_ENABLED(CONFIG_PPC_8xx) && pmdp == pmd_off(mm, ALIGN_DOWN(addr, 
SZ_8M))) {
+   pte = __pte(pte_update(mm, addr, pte_offset_kernel(pmdp, 0), 
~0UL, 0, 1));
+   pte_update(mm, addr, pte_offset_kernel(pmdp + 1, 0), ~0UL, 0, 
1);
+   } else {
+   pte = __pte(pte_update(mm, addr, ptep, ~0UL, 0, 1));
+   }
+   return pte;
 }
 
 #define __HAVE_ARCH_HUGE_PTEP_CLEAR_FLUSH
diff --git a/arch/powerpc/include/asm/nohash/32/hugetlb-8xx.h 
b/arch/powerpc/include/asm/nohash/32/hugetlb-8xx.h
index 92df40c6cc6b..1414cfd28987 100644
--- a/arch/powerpc/include/asm/nohash/32/hugetlb-8xx.h
+++ b/arch/powerpc/include/asm/nohash/32/hugetlb-8xx.h
@@ -4,45 +4,25 @@
 
 #define PAGE_SHIFT_8M  23
 
-static inline pte_t *hugepd_page(hugepd_t hpd)
-{
-   BUG_ON(!hugepd_ok(hpd));
-
-   return (pte_t *)__va(hpd_val(hpd) & ~HUGEPD_SHIFT_MASK);
-}
-
-static inline unsigned int hugepd_shift(hugepd_t hpd)
-{
-   return PAGE_SHIFT_8M;
-}
-
-static inline pte_t *hugepte_offset(hugepd_t hpd, unsigned long addr,
-   unsigned int pdshift)
-{
-   unsigned long idx = (addr & (SZ_4M - 1)) >> PAGE_SHIFT;
-
-   return hugepd_page(hpd) + idx;
-}
-
 static inline void flush_hugetlb_page(struct vm_area_struct *vma,
  unsigned long vmaddr)
 {
flush_tlb_page(vma, vmaddr);
 }
 
-static inline void hugepd_populate(hugepd_t *hpdp, pte_t *new, unsigned int 
pshift)
+static inline int check_and_get_huge_psize(int shift)
 {
-   *hpdp = __hugepd(__pa(new) | _PMD_USER | _PMD_PRESENT | _PMD_PAGE_8M);
+   return shift_to_mmu_psize(shift);
 }
 
-static inline void hugepd_populate_kernel(hugepd_t *hpdp, pte_t *new, unsigned 
int pshift)
+#define __HAVE_ARCH_HUGE_PTEP_GET
+static inline pte_t huge_ptep_get(struct mm_struct *mm, unsigned long addr, 
pte_t *ptep)
 {
-   *hpdp = __hugepd(__pa(new) | _PMD_PRESENT | _PMD_PAGE_8M);
-}
+   pmd_t *pmdp = (pmd_t *)ptep;
 
-static inline int check_and_get_huge_psize(int shift)
-{
-   return shift_to_mmu_psize(shift);
+   if (pmdp == pmd_off(mm, ALIGN_DOWN(addr, SZ_8M)))
+   ptep = pte_offset_kernel(pmdp, 0);
+   return ptep_get(ptep);
 }
 
 #define __HAVE_ARCH_HUGE_SET_HUGE_PTE_AT
@@ -53,7 +33,14 @@ void set_huge_pte_at(struct mm_struct *mm, unsigned long 
addr, pte_t *ptep,
 static inline void huge_pte_clear(struct mm_struct *mm, unsigned long addr,
  pte_t *ptep, unsigned long sz)
 {
-   pte_update(mm, addr,