Re: [PATCH v2] mm/gup: fix gup_fast with dynamic page table folding

2020-09-15 Thread John Hubbard

On 9/11/20 1:36 PM, Vasily Gorbik wrote:

> Currently, to make sure that every page table entry is read just once,
> gup_fast walks perform a READ_ONCE and pass the pXd value down to the
> next gup_pXd_range function by value, e.g.:
...snip...

Looks cleaner than I'd dared hope for. :)

Reviewed-by: John Hubbard 


thanks,
--
John Hubbard
NVIDIA



Re: [PATCH v2] mm/gup: fix gup_fast with dynamic page table folding

2020-09-15 Thread Mike Rapoport
On Fri, Sep 11, 2020 at 10:36:43PM +0200, Vasily Gorbik wrote:
> Currently, to make sure that every page table entry is read just once,
> gup_fast walks perform a READ_ONCE and pass the pXd value down to the
> next gup_pXd_range function by value, e.g.:
...snip...
> Signed-off-by: Vasily Gorbik 

Reviewed-by: Mike Rapoport 


Re: [PATCH v2] mm/gup: fix gup_fast with dynamic page table folding

2020-09-15 Thread Jason Gunthorpe
On Fri, Sep 11, 2020 at 10:36:43PM +0200, Vasily Gorbik wrote:
> Currently, to make sure that every page table entry is read just once,
> gup_fast walks perform a READ_ONCE and pass the pXd value down to the
> next gup_pXd_range function by value, e.g.:
...snip...
> Signed-off-by: Vasily Gorbik 
> ---
> v2: added brackets &pgd -> &(pgd)

Reviewed-by: Jason Gunthorpe 

Regards,
Jason


Re: [PATCH v2] mm/gup: fix gup_fast with dynamic page table folding

2020-09-15 Thread Vasily Gorbik
On Fri, Sep 11, 2020 at 10:36:43PM +0200, Vasily Gorbik wrote:
> Currently, to make sure that every page table entry is read just once,
> gup_fast walks perform a READ_ONCE and pass the pXd value down to the
> next gup_pXd_range function by value, e.g.:
...snip...
> ---
> v2: added brackets &pgd -> &(pgd)
> 
>  arch/s390/include/asm/pgtable.h | 42 +++--
>  include/linux/pgtable.h | 10 
>  mm/gup.c| 18 +++---
>  3 files changed, 49 insertions(+), 21 deletions(-)

Andrew, any chance you would pick this up?

There is an Ack from Linus. And I haven't seen any objections from Jason or John.
This seems to be as safe for other architectures as possible.

@Jason and John
Any acks/nacks?

Thank you,
Vasily


[PATCH v2] mm/gup: fix gup_fast with dynamic page table folding

2020-09-11 Thread Vasily Gorbik
Currently, to make sure that every page table entry is read just once,
gup_fast walks perform a READ_ONCE and pass the pXd value down to the
next gup_pXd_range function by value, e.g.:

static int gup_pud_range(p4d_t p4d, unsigned long addr, unsigned long end,
 unsigned int flags, struct page **pages, int *nr)
...
pudp = pud_offset(&p4d, addr);

This function passes a reference to that local value copy down to
pXd_offset, and might get the very same pointer back in return. This
happens when the level is folded (on most architectures), and such a
pointer must not be iterated.
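
For comparison, with static folding the generic fallback simply hands the
pointer back. A minimal sketch, along the lines of
include/asm-generic/pgtable-nopud.h:

static inline pud_t *pud_offset(p4d_t *p4d, unsigned long address)
{
	/* pud is folded into p4d, so reinterpret the p4d entry as a pud */
	return (pud_t *)p4d;
}

With static folding this is harmless: PUD_SIZE equals P4D_SIZE, so
pud_addr_end() returns end on the first pass and the walk never
increments the returned pointer.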

On s390, each task may use 5-, 4- or 3-level address translation, so
which levels are folded differs from task to task. This makes the logic
more complex, and a non-iterable pointer to a local copy leads to severe
problems.

Here is an example of what happens with gup_fast on s390, for a task
with 3-level paging, crossing a 2 GB pud boundary:

// addr = 0x1007000, end = 0x10080001000
static int gup_pud_range(p4d_t p4d, unsigned long addr, unsigned long end,
			 unsigned int flags, struct page **pages, int *nr)
{
	unsigned long next;
	pud_t *pudp;

	// pud_offset returns &p4d itself (a pointer to a value on stack)
	pudp = pud_offset(&p4d, addr);
	do {
		// on the second iteration this reads a "random" stack value
		pud_t pud = READ_ONCE(*pudp);

		// next = 0x1008000, due to PUD_SIZE/MASK != PGDIR_SIZE/MASK on s390
		next = pud_addr_end(addr, end);
		...
	} while (pudp++, addr = next, addr != end); // pudp++ iterates over the stack

	return 1;
}

This has been broken since s390 moved to the common gup code with
commit d1874a0c2805 ("s390/mm: make the pxd_offset functions more robust")
and commit 1a42010cdc26 ("s390/mm: convert to the generic
get_user_pages_fast code"). s390 tried to mimic static level folding by
changing its pXd_offset primitives to always calculate the top-level page
table offset in pgd_offset and to simply return the passed-in value
whenever a pXd_offset has to act as folded.

What is crucial for gup_fast, and what has been overlooked, is that
PxD_SIZE/MASK and thus pXd_addr_end should also change correspondingly.
And the latter is not possible with dynamic folding, because those are
compile-time constants.
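
For reference, the generic pXd_addr_end() helpers are built purely from
those compile-time constants; a sketch of pud_addr_end() as defined in
include/linux/pgtable.h:

#define pud_addr_end(addr, end)						\
({	unsigned long __boundary = ((addr) + PUD_SIZE) & PUD_MASK;	\
	(__boundary - 1 < (end) - 1) ? __boundary : (end);		\
})

Since PUD_SIZE and PUD_MASK cannot vary per task, a dynamically folded
pud level on s390 still advances in 2 GB PUD_SIZE steps, which is what
makes gup_pud_range() iterate over the on-stack copy in the example above.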

To fix the issue, pass the original pXdp pointers down to the
gup_pXd_range functions in addition to the pXd values, and introduce
pXd_offset_lockless helpers which take an additional pXd entry value
parameter. This has already been discussed in
https://lkml.kernel.org/r/20190418100218.0a4afd51@mschwideX1
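
For architectures that provide no special variant, the new helpers can
fall back to the existing primitives by taking the address of the value
copy. A sketch of the generic fallback (the bracketing matches the v2
note below) and of the corresponding gup_fast call site:

#ifndef pud_offset_lockless
#define pud_offset_lockless(p4dp, p4d, address) pud_offset(&(p4d), address)
#endif

static int gup_pud_range(p4d_t *p4dp, p4d_t p4d, unsigned long addr,
			 unsigned long end, unsigned int flags,
			 struct page **pages, int *nr)
{
	...
	/* s390 can now return the real p4dp when the level acts as folded */
	pudp = pud_offset_lockless(p4dp, p4d, addr);
	...
}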

Cc:  # 5.2+
Fixes: 1a42010cdc26 ("s390/mm: convert to the generic get_user_pages_fast code")
Reviewed-by: Gerald Schaefer 
Reviewed-by: Alexander Gordeev 
Signed-off-by: Vasily Gorbik 
---
v2: added brackets &pgd -> &(pgd)

 arch/s390/include/asm/pgtable.h | 42 +++--
 include/linux/pgtable.h | 10 
 mm/gup.c| 18 +++---
 3 files changed, 49 insertions(+), 21 deletions(-)

diff --git a/arch/s390/include/asm/pgtable.h b/arch/s390/include/asm/pgtable.h
index 7eb01a5459cd..b55561cc8786 100644
--- a/arch/s390/include/asm/pgtable.h
+++ b/arch/s390/include/asm/pgtable.h
@@ -1260,26 +1260,44 @@ static inline pgd_t *pgd_offset_raw(pgd_t *pgd, unsigned long address)
 
 #define pgd_offset(mm, address) pgd_offset_raw(READ_ONCE((mm)->pgd), address)
 
-static inline p4d_t *p4d_offset(pgd_t *pgd, unsigned long address)
+static inline p4d_t *p4d_offset_lockless(pgd_t *pgdp, pgd_t pgd, unsigned long address)
 {
-   if ((pgd_val(*pgd) & _REGION_ENTRY_TYPE_MASK) >= _REGION_ENTRY_TYPE_R1)
-   return (p4d_t *) pgd_deref(*pgd) + p4d_index(address);
-   return (p4d_t *) pgd;
+   if ((pgd_val(pgd) & _REGION_ENTRY_TYPE_MASK) >= _REGION_ENTRY_TYPE_R1)
+   return (p4d_t *) pgd_deref(pgd) + p4d_index(address);
+   return (p4d_t *) pgdp;
 }
+#define p4d_offset_lockless p4d_offset_lockless
 
-static inline pud_t *pud_offset(p4d_t *p4d, unsigned long address)
+static inline p4d_t *p4d_offset(pgd_t *pgdp, unsigned long address)
 {
-   if ((p4d_val(*p4d) & _REGION_ENTRY_TYPE_MASK) >= _REGION_ENTRY_TYPE_R2)
-   return (pud_t *) p4d_deref(*p4d) + pud_index(address);
-   return (pud_t *) p4d;
+   return p4d_offset_lockless(pgdp, *pgdp, address);
+}
+
+static inline pud_t *pud_offset_lockless(p4d_t *p4dp, p4d_t p4d, unsigned long address)
+{
+   if ((p4d_val(p4d) & _REGION_ENTRY_TYPE_MASK) >= _REGION_ENTRY_TYPE_R2)
+   return (pud_t *) p4d_deref(p4d) + pud_index(address);
+   return (pud_t *) p4dp;
+}
+#define pud_offset_lockless pud_offset_lockless
+
+static inline pud_t *pud_offset(p4d_t *p4dp, unsigned long address)
+{
+   return pud_offset_lockless(p4dp, *p4dp, address);
 }
 #define pud_offset pud_offset
 
-static inline pmd_t *pmd_offset(pud_t *pud, unsigned long address)
+static inline pmd_t *pmd_offset_lockless(pud_t *pudp, pud_t pud, unsigned long address)
 {
-   if ((pud_val(*pud) & _REGION_ENTRY_TYPE_MASK) >= _REGION_ENTRY_TYPE_R3)
-   return (pmd_t *) pud_deref(*pud) + pmd_index(address);
-   return (pmd_t *) pud;
+   if ((pud_val(pud) & _REGION_ENTRY_TYPE_MASK) >= _REGION_ENTRY_TYPE_R3)
+   return (pmd_t *) pud_deref(pud) + pmd_index(address);
+   return (pmd_t *) pudp;
+}
+#define pmd_offset_lockless pmd_offset_lockless
+
+static inline pmd_t *pmd_offset(pud_t *pudp, unsigned long address)
+{
+   return pmd_offset_lockless(pudp, *pudp, address);
 }
 #define pmd_offset pmd_offset