Re: [PATCH v2] mm/gup: fix gup_fast with dynamic page table folding
On 9/11/20 1:36 PM, Vasily Gorbik wrote:
> Currently, to make sure that every page table entry is read just once,
> gup_fast walks perform READ_ONCE and pass the pXd value down to the next
> gup_pXd_range function by value, e.g.:
>
> static int gup_pud_range(p4d_t p4d, unsigned long addr, unsigned long end,
>			 unsigned int flags, struct page **pages, int *nr)
> ...
>	pudp = pud_offset(&p4d, addr);
>
> This function passes a reference to that local value copy to pXd_offset,
> and might get the very same pointer in return. This happens when the
> level is folded (on most arches), and such a pointer must not be iterated.
>
> On s390, because each task might use 5-, 4- or 3-level address
> translation, and hence have different levels folded, the logic is more
> complex, and a non-iterable pointer to a local copy leads to severe
> problems.
>
> Here is an example of what happens with gup_fast on s390, for a task
> with 3-level paging, crossing a 2 GB pud boundary:
>
> // addr = 0x1007ffff000, end = 0x10080001000
> static int gup_pud_range(p4d_t p4d, unsigned long addr, unsigned long end,
>			 unsigned int flags, struct page **pages, int *nr)
> {
>	unsigned long next;
>	pud_t *pudp;
>
>	// pud_offset returns &p4d itself (a pointer to a value on the stack)
>	pudp = pud_offset(&p4d, addr);
>	do {
>		// on the second iteration this reads a "random" stack value
>		pud_t pud = READ_ONCE(*pudp);
>
>		// next = 0x10080000000, since PUD_SIZE/MASK != PGDIR_SIZE/MASK on s390
>		next = pud_addr_end(addr, end);
>		...
>	} while (pudp++, addr = next, addr != end); // pudp++ iterates over the stack
>
>	return 1;
> }
>
> This happens since s390 moved to the common gup code with
> commit d1874a0c2805 ("s390/mm: make the pxd_offset functions more robust")
> and commit 1a42010cdc26 ("s390/mm: convert to the generic
> get_user_pages_fast code"). s390 tried to mimic static level folding by
> changing the pXd_offset primitives to always calculate the top-level page
> table offset in pgd_offset, and to simply return the passed-in value when
> pXd_offset has to act as folded.
>
> What is crucial for gup_fast, and what has been overlooked, is that
> PxD_SIZE/MASK and thus pXd_addr_end should also change correspondingly.
> And the latter is not possible with dynamic folding.
>
> To fix the issue, in addition to the pXd values, pass the original pXdp
> pointers down to the gup_pXd_range functions, and introduce
> pXd_offset_lockless helpers, which take an additional pXd entry value
> parameter. This has already been discussed in
> https://lkml.kernel.org/r/20190418100218.0a4afd51@mschwideX1
>
> Cc: # 5.2+
> Fixes: 1a42010cdc26 ("s390/mm: convert to the generic get_user_pages_fast code")
> Reviewed-by: Gerald Schaefer
> Reviewed-by: Alexander Gordeev
> Signed-off-by: Vasily Gorbik
> ---

Looks cleaner than I'd dared hope for. :)

Reviewed-by: John Hubbard

thanks,
--
John Hubbard
NVIDIA

> v2: added brackets &pgd -> &(pgd)
>
>  arch/s390/include/asm/pgtable.h | 42 +++--
>  include/linux/pgtable.h         | 10
>  mm/gup.c                        | 18 +++---
>  3 files changed, 49 insertions(+), 21 deletions(-)
>
> diff --git a/arch/s390/include/asm/pgtable.h b/arch/s390/include/asm/pgtable.h
> index 7eb01a5459cd..b55561cc8786 100644
> --- a/arch/s390/include/asm/pgtable.h
> +++ b/arch/s390/include/asm/pgtable.h
> @@ -1260,26 +1260,44 @@ static inline pgd_t *pgd_offset_raw(pgd_t *pgd, unsigned long address)
>
>  #define pgd_offset(mm, address) pgd_offset_raw(READ_ONCE((mm)->pgd), address)
>
> -static inline p4d_t *p4d_offset(pgd_t *pgd, unsigned long address)
> +static inline p4d_t *p4d_offset_lockless(pgd_t *pgdp, pgd_t pgd, unsigned long address)
>  {
> -	if ((pgd_val(*pgd) & _REGION_ENTRY_TYPE_MASK) >= _REGION_ENTRY_TYPE_R1)
> -		return (p4d_t *) pgd_deref(*pgd) + p4d_index(address);
> -	return (p4d_t *) pgd;
> +	if ((pgd_val(pgd) & _REGION_ENTRY_TYPE_MASK) >= _REGION_ENTRY_TYPE_R1)
> +		return (p4d_t *) pgd_deref(pgd) + p4d_index(address);
> +	return (p4d_t *) pgdp;
>  }
> +#define p4d_offset_lockless p4d_offset_lockless
>
> -static inline pud_t *pud_offset(p4d_t *p4d, unsigned long address)
> +static inline p4d_t *p4d_offset(pgd_t *pgdp, unsigned long address)
>  {
> -	if ((p4d_val(*p4d) & _REGION_ENTRY_TYPE_MASK) >= _REGION_ENTRY_TYPE_R2)
> -		return (pud_t *) p4d_deref(*p4d) + pud_index(address);
> -	return (pud_t *) p4d;
> +	return p4d_offset_lockless(pgdp, *pgdp, address);
> +}
> +
> +static inline pud_t *pud_offset_lockless(p4d_t *p4dp, p4d_t p4d, unsigned long address)
> +{
> +	if ((p4d_val(p4d) & _REGION_ENTRY_TYPE_MASK) >= _REGION_ENTRY_TYPE_R2)
> +		return (pud_t *) p4d_deref(p4d) + pud_index(address);
> +	return (pud_t *) p4dp;
> +}
> +#define pud_offset_lockless pud_offset_lockless
> +
> +static inline pud_t *pud_offset(p4d_t *p4dp, unsigned long address)
> +{
> +	return pud_offset_lockless(p4
Re: [PATCH v2] mm/gup: fix gup_fast with dynamic page table folding
On Fri, Sep 11, 2020 at 10:36:43PM +0200, Vasily Gorbik wrote:
> Currently, to make sure that every page table entry is read just once,
> gup_fast walks perform READ_ONCE and pass the pXd value down to the next
> gup_pXd_range function by value, e.g.:
>
> static int gup_pud_range(p4d_t p4d, unsigned long addr, unsigned long end,
>			 unsigned int flags, struct page **pages, int *nr)
> ...
>	pudp = pud_offset(&p4d, addr);
>
> This function passes a reference to that local value copy to pXd_offset,
> and might get the very same pointer in return. This happens when the
> level is folded (on most arches), and such a pointer must not be iterated.
>
> On s390, because each task might use 5-, 4- or 3-level address
> translation, and hence have different levels folded, the logic is more
> complex, and a non-iterable pointer to a local copy leads to severe
> problems.
>
> Here is an example of what happens with gup_fast on s390, for a task
> with 3-level paging, crossing a 2 GB pud boundary:
>
> // addr = 0x1007ffff000, end = 0x10080001000
> static int gup_pud_range(p4d_t p4d, unsigned long addr, unsigned long end,
>			 unsigned int flags, struct page **pages, int *nr)
> {
>	unsigned long next;
>	pud_t *pudp;
>
>	// pud_offset returns &p4d itself (a pointer to a value on the stack)
>	pudp = pud_offset(&p4d, addr);
>	do {
>		// on the second iteration this reads a "random" stack value
>		pud_t pud = READ_ONCE(*pudp);
>
>		// next = 0x10080000000, since PUD_SIZE/MASK != PGDIR_SIZE/MASK on s390
>		next = pud_addr_end(addr, end);
>		...
>	} while (pudp++, addr = next, addr != end); // pudp++ iterates over the stack
>
>	return 1;
> }
>
> This happens since s390 moved to the common gup code with
> commit d1874a0c2805 ("s390/mm: make the pxd_offset functions more robust")
> and commit 1a42010cdc26 ("s390/mm: convert to the generic
> get_user_pages_fast code"). s390 tried to mimic static level folding by
> changing the pXd_offset primitives to always calculate the top-level page
> table offset in pgd_offset, and to simply return the passed-in value when
> pXd_offset has to act as folded.
>
> What is crucial for gup_fast, and what has been overlooked, is that
> PxD_SIZE/MASK and thus pXd_addr_end should also change correspondingly.
> And the latter is not possible with dynamic folding.
>
> To fix the issue, in addition to the pXd values, pass the original pXdp
> pointers down to the gup_pXd_range functions, and introduce
> pXd_offset_lockless helpers, which take an additional pXd entry value
> parameter. This has already been discussed in
> https://lkml.kernel.org/r/20190418100218.0a4afd51@mschwideX1
>
> Cc: # 5.2+
> Fixes: 1a42010cdc26 ("s390/mm: convert to the generic get_user_pages_fast code")
> Reviewed-by: Gerald Schaefer
> Reviewed-by: Alexander Gordeev
> Signed-off-by: Vasily Gorbik

Reviewed-by: Mike Rapoport

> ---
> v2: added brackets &pgd -> &(pgd)
>
>  arch/s390/include/asm/pgtable.h | 42 +++--
>  include/linux/pgtable.h         | 10
>  mm/gup.c                        | 18 +++---
>  3 files changed, 49 insertions(+), 21 deletions(-)
>
> diff --git a/arch/s390/include/asm/pgtable.h b/arch/s390/include/asm/pgtable.h
> index 7eb01a5459cd..b55561cc8786 100644
> --- a/arch/s390/include/asm/pgtable.h
> +++ b/arch/s390/include/asm/pgtable.h
> @@ -1260,26 +1260,44 @@ static inline pgd_t *pgd_offset_raw(pgd_t *pgd, unsigned long address)
>
>  #define pgd_offset(mm, address) pgd_offset_raw(READ_ONCE((mm)->pgd), address)
>
> -static inline p4d_t *p4d_offset(pgd_t *pgd, unsigned long address)
> +static inline p4d_t *p4d_offset_lockless(pgd_t *pgdp, pgd_t pgd, unsigned long address)
>  {
> -	if ((pgd_val(*pgd) & _REGION_ENTRY_TYPE_MASK) >= _REGION_ENTRY_TYPE_R1)
> -		return (p4d_t *) pgd_deref(*pgd) + p4d_index(address);
> -	return (p4d_t *) pgd;
> +	if ((pgd_val(pgd) & _REGION_ENTRY_TYPE_MASK) >= _REGION_ENTRY_TYPE_R1)
> +		return (p4d_t *) pgd_deref(pgd) + p4d_index(address);
> +	return (p4d_t *) pgdp;
> }
> +#define p4d_offset_lockless p4d_offset_lockless
>
> -static inline pud_t *pud_offset(p4d_t *p4d, unsigned long address)
> +static inline p4d_t *p4d_offset(pgd_t *pgdp, unsigned long address)
> {
> -	if ((p4d_val(*p4d) & _REGION_ENTRY_TYPE_MASK) >= _REGION_ENTRY_TYPE_R2)
> -		return (pud_t *) p4d_deref(*p4d) + pud_index(address);
> -	return (pud_t *) p4d;
> +	return p4d_offset_lockless(pgdp, *pgdp, address);
> +}
> +
> +static inline pud_t *pud_offset_lockless(p4d_t *p4dp, p4d_t p4d, unsigned long address)
> +{
> +	if ((p4d_val(p4d) & _REGION_ENTRY_TYPE_MASK) >= _REGION_ENTRY_TYPE_R2)
> +		return (pud_t *) p4d_deref(p4d) + pud_index(address);
> +	return (pud_t *) p4dp;
> +}
> +#define pud_offset_lockless pud_offset_
Re: [PATCH v2] mm/gup: fix gup_fast with dynamic page table folding
On Fri, Sep 11, 2020 at 10:36:43PM +0200, Vasily Gorbik wrote:
> Currently, to make sure that every page table entry is read just once,
> gup_fast walks perform READ_ONCE and pass the pXd value down to the next
> gup_pXd_range function by value, e.g.:
>
> static int gup_pud_range(p4d_t p4d, unsigned long addr, unsigned long end,
>			 unsigned int flags, struct page **pages, int *nr)
> ...
>	pudp = pud_offset(&p4d, addr);
>
> This function passes a reference to that local value copy to pXd_offset,
> and might get the very same pointer in return. This happens when the
> level is folded (on most arches), and such a pointer must not be iterated.
>
> On s390, because each task might use 5-, 4- or 3-level address
> translation, and hence have different levels folded, the logic is more
> complex, and a non-iterable pointer to a local copy leads to severe
> problems.
>
> Here is an example of what happens with gup_fast on s390, for a task
> with 3-level paging, crossing a 2 GB pud boundary:
>
> // addr = 0x1007ffff000, end = 0x10080001000
> static int gup_pud_range(p4d_t p4d, unsigned long addr, unsigned long end,
>			 unsigned int flags, struct page **pages, int *nr)
> {
>	unsigned long next;
>	pud_t *pudp;
>
>	// pud_offset returns &p4d itself (a pointer to a value on the stack)
>	pudp = pud_offset(&p4d, addr);
>	do {
>		// on the second iteration this reads a "random" stack value
>		pud_t pud = READ_ONCE(*pudp);
>
>		// next = 0x10080000000, since PUD_SIZE/MASK != PGDIR_SIZE/MASK on s390
>		next = pud_addr_end(addr, end);
>		...
>	} while (pudp++, addr = next, addr != end); // pudp++ iterates over the stack
>
>	return 1;
> }
>
> This happens since s390 moved to the common gup code with
> commit d1874a0c2805 ("s390/mm: make the pxd_offset functions more robust")
> and commit 1a42010cdc26 ("s390/mm: convert to the generic
> get_user_pages_fast code"). s390 tried to mimic static level folding by
> changing the pXd_offset primitives to always calculate the top-level page
> table offset in pgd_offset, and to simply return the passed-in value when
> pXd_offset has to act as folded.
>
> What is crucial for gup_fast, and what has been overlooked, is that
> PxD_SIZE/MASK and thus pXd_addr_end should also change correspondingly.
> And the latter is not possible with dynamic folding.
>
> To fix the issue, in addition to the pXd values, pass the original pXdp
> pointers down to the gup_pXd_range functions, and introduce
> pXd_offset_lockless helpers, which take an additional pXd entry value
> parameter. This has already been discussed in
> https://lkml.kernel.org/r/20190418100218.0a4afd51@mschwideX1
>
> Cc: # 5.2+
> Fixes: 1a42010cdc26 ("s390/mm: convert to the generic get_user_pages_fast code")
> Reviewed-by: Gerald Schaefer
> Reviewed-by: Alexander Gordeev
> Signed-off-by: Vasily Gorbik
> ---
> v2: added brackets &pgd -> &(pgd)

Reviewed-by: Jason Gunthorpe

Regards,
Jason
Re: [PATCH v2] mm/gup: fix gup_fast with dynamic page table folding
On Fri, Sep 11, 2020 at 10:36:43PM +0200, Vasily Gorbik wrote:
> Currently, to make sure that every page table entry is read just once,
> gup_fast walks perform READ_ONCE and pass the pXd value down to the next
> gup_pXd_range function by value, e.g.:
...snip...
> ---
> v2: added brackets &pgd -> &(pgd)
>
>  arch/s390/include/asm/pgtable.h | 42 +++--
>  include/linux/pgtable.h         | 10
>  mm/gup.c                        | 18 +++---
>  3 files changed, 49 insertions(+), 21 deletions(-)

Andrew, any chance you would pick this up? There is an Ack from Linus,
and I haven't seen any objections from Jason or John. This seems to be
as safe for other architectures as possible.

@Jason and John: any acks/nacks?

Thank you,
Vasily
[PATCH v2] mm/gup: fix gup_fast with dynamic page table folding
Currently, to make sure that every page table entry is read just once,
gup_fast walks perform READ_ONCE and pass the pXd value down to the next
gup_pXd_range function by value, e.g.:

static int gup_pud_range(p4d_t p4d, unsigned long addr, unsigned long end,
			 unsigned int flags, struct page **pages, int *nr)
...
	pudp = pud_offset(&p4d, addr);

This function passes a reference to that local value copy to pXd_offset,
and might get the very same pointer in return. This happens when the
level is folded (on most arches), and such a pointer must not be iterated.

On s390, because each task might use 5-, 4- or 3-level address
translation, and hence have different levels folded, the logic is more
complex, and a non-iterable pointer to a local copy leads to severe
problems.

Here is an example of what happens with gup_fast on s390, for a task
with 3-level paging, crossing a 2 GB pud boundary:

// addr = 0x1007ffff000, end = 0x10080001000
static int gup_pud_range(p4d_t p4d, unsigned long addr, unsigned long end,
			 unsigned int flags, struct page **pages, int *nr)
{
	unsigned long next;
	pud_t *pudp;

	// pud_offset returns &p4d itself (a pointer to a value on the stack)
	pudp = pud_offset(&p4d, addr);
	do {
		// on the second iteration this reads a "random" stack value
		pud_t pud = READ_ONCE(*pudp);

		// next = 0x10080000000, since PUD_SIZE/MASK != PGDIR_SIZE/MASK on s390
		next = pud_addr_end(addr, end);
		...
	} while (pudp++, addr = next, addr != end); // pudp++ iterates over the stack

	return 1;
}

This happens since s390 moved to the common gup code with
commit d1874a0c2805 ("s390/mm: make the pxd_offset functions more robust")
and commit 1a42010cdc26 ("s390/mm: convert to the generic
get_user_pages_fast code"). s390 tried to mimic static level folding by
changing the pXd_offset primitives to always calculate the top-level page
table offset in pgd_offset, and to simply return the passed-in value when
pXd_offset has to act as folded.

What is crucial for gup_fast, and what has been overlooked, is that
PxD_SIZE/MASK and thus pXd_addr_end should also change correspondingly.
And the latter is not possible with dynamic folding.

To fix the issue, in addition to the pXd values, pass the original pXdp
pointers down to the gup_pXd_range functions, and introduce
pXd_offset_lockless helpers, which take an additional pXd entry value
parameter.
This has already been discussed in
https://lkml.kernel.org/r/20190418100218.0a4afd51@mschwideX1

Cc: # 5.2+
Fixes: 1a42010cdc26 ("s390/mm: convert to the generic get_user_pages_fast code")
Reviewed-by: Gerald Schaefer
Reviewed-by: Alexander Gordeev
Signed-off-by: Vasily Gorbik
---
v2: added brackets &pgd -> &(pgd)

 arch/s390/include/asm/pgtable.h | 42 +++--
 include/linux/pgtable.h         | 10
 mm/gup.c                        | 18 +++---
 3 files changed, 49 insertions(+), 21 deletions(-)

diff --git a/arch/s390/include/asm/pgtable.h b/arch/s390/include/asm/pgtable.h
index 7eb01a5459cd..b55561cc8786 100644
--- a/arch/s390/include/asm/pgtable.h
+++ b/arch/s390/include/asm/pgtable.h
@@ -1260,26 +1260,44 @@ static inline pgd_t *pgd_offset_raw(pgd_t *pgd, unsigned long address)

 #define pgd_offset(mm, address) pgd_offset_raw(READ_ONCE((mm)->pgd), address)

-static inline p4d_t *p4d_offset(pgd_t *pgd, unsigned long address)
+static inline p4d_t *p4d_offset_lockless(pgd_t *pgdp, pgd_t pgd, unsigned long address)
 {
-	if ((pgd_val(*pgd) & _REGION_ENTRY_TYPE_MASK) >= _REGION_ENTRY_TYPE_R1)
-		return (p4d_t *) pgd_deref(*pgd) + p4d_index(address);
-	return (p4d_t *) pgd;
+	if ((pgd_val(pgd) & _REGION_ENTRY_TYPE_MASK) >= _REGION_ENTRY_TYPE_R1)
+		return (p4d_t *) pgd_deref(pgd) + p4d_index(address);
+	return (p4d_t *) pgdp;
 }
+#define p4d_offset_lockless p4d_offset_lockless

-static inline pud_t *pud_offset(p4d_t *p4d, unsigned long address)
+static inline p4d_t *p4d_offset(pgd_t *pgdp, unsigned long address)
 {
-	if ((p4d_val(*p4d) & _REGION_ENTRY_TYPE_MASK) >= _REGION_ENTRY_TYPE_R2)
-		return (pud_t *) p4d_deref(*p4d) + pud_index(address);
-	return (pud_t *) p4d;
+	return p4d_offset_lockless(pgdp, *pgdp, address);
+}
+
+static inline pud_t *pud_offset_lockless(p4d_t *p4dp, p4d_t p4d, unsigned long address)
+{
+	if ((p4d_val(p4d) & _REGION_ENTRY_TYPE_MASK) >= _REGION_ENTRY_TYPE_R2)
+		return (pud_t *) p4d_deref(p4d) + pud_index(address);
+	return (pud_t *) p4dp;
+}
+#define pud_offset_lockless pud_offset_lockless
+
+static inline pud_t *pud_offset(p4d_t *p4dp, unsigned long address)
+{
+	return pud_offset_lockless(p4dp, *p4dp, address);
 }
 #define pud_offset pud_offset

-static inline pmd_t *pmd_offset(pud_t *pud, unsigned long address)
+static inline pmd_t *pmd_offset_lockless(pud_t *pudp
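
[The include/linux/pgtable.h and mm/gup.c hunks of the patch are cut off
above. For readers following along, here is a rough sketch of what the
generic side would look like, based on the commit message rather than the
verbatim hunks: architectures that do not define their own helpers (unlike
s390 above) fall back to macros that take the address of the local entry
value. This is safe with static folding, because a folded level has
PxD_SIZE == PGDIR_SIZE, so pXd_addr_end makes the loop run exactly once
and the pointer to the local copy is never incremented. The brackets in
&(pgd) are the v2 change, so the macro parameter is parenthesized before
taking its address.]

/* Sketch of the generic fallbacks, assuming the names from the commit
 * message; not the verbatim include/linux/pgtable.h hunk. */
#ifndef p4d_offset_lockless
#define p4d_offset_lockless(pgdp, pgd, address) p4d_offset(&(pgd), address)
#endif
#ifndef pud_offset_lockless
#define pud_offset_lockless(p4dp, p4d, address) pud_offset(&(p4d), address)
#endif
#ifndef pmd_offset_lockless
#define pmd_offset_lockless(pudp, pud, address) pmd_offset(&(pud), address)
#endif

[On the mm/gup.c side, each gup_pXd_range function would then receive the
original pointer alongside the READ_ONCE'd value and use the _lockless
helper, so that on s390 the returned pudp points into the real page table
and can be iterated. A sketch of the fixed walker, again assuming the
signatures implied by the commit message:]

static int gup_pud_range(p4d_t *p4dp, p4d_t p4d, unsigned long addr,
			 unsigned long end, unsigned int flags,
			 struct page **pages, int *nr)
{
	unsigned long next;
	pud_t *pudp;

	/* pass the original p4dp along with the local value copy */
	pudp = pud_offset_lockless(p4dp, p4d, addr);
	do {
		pud_t pud = READ_ONCE(*pudp);

		next = pud_addr_end(addr, end);
		if (pud_none(pud))
			return 0;
		...
	} while (pudp++, addr = next, addr != end); /* iterates the real table */

	return 1;
}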