Re: [PATCH 07/13] mm: close race in generic_access_phys

2020-10-08 Thread Daniel Vetter
On Thu, Oct 8, 2020 at 2:44 AM John Hubbard  wrote:
>
> On 10/7/20 9:44 AM, Daniel Vetter wrote:
> > Way back it was a reasonable assumptions that iomem mappings never
> > change the pfn range they point at. But this has changed:
> >
> > - gpu drivers dynamically manage their memory nowadays, invalidating
> >ptes with unmap_mapping_range when buffers get moved
> >
> > - contiguous dma allocations have moved from dedicated carvetouts to
>
> s/carvetouts/carveouts/
>
> >cma regions. This means if we miss the unmap the pfn might contain
> >pagecache or anon memory (well anything allocated with GFP_MOVEABLE)
> >
> > - even /dev/mem now invalidates mappings when the kernel requests that
> >iomem region when CONFIG_IO_STRICT_DEVMEM is set, see 3234ac664a87
> >("/dev/mem: Revoke mappings when a driver claims the region")
>
> Thanks for putting these references into the log, it's very helpful.
> ...
> > diff --git a/mm/memory.c b/mm/memory.c
> > index fcfc4ca36eba..8d467e23b44e 100644
> > --- a/mm/memory.c
> > +++ b/mm/memory.c
> > @@ -4873,28 +4873,68 @@ int follow_phys(struct vm_area_struct *vma,
> >   return ret;
> >   }
> >
> > +/**
> > + * generic_access_phys - generic implementation for iomem mmap access
> > + * @vma: the vma to access
> > + * @addr: userspace addres, not relative offset within @vma
> > + * @buf: buffer to read/write
> > + * @len: length of transfer
> > + * @write: set to FOLL_WRITE when writing, otherwise reading
> > + *
> > + * This is a generic implementation for _operations_struct.access for an
> > + * iomem mapping. This callback is used by access_process_vm() when the 
> > @vma is
> > + * not page based.
> > + */
> >   int generic_access_phys(struct vm_area_struct *vma, unsigned long addr,
> >   void *buf, int len, int write)
> >   {
> >   resource_size_t phys_addr;
> >   unsigned long prot = 0;
> >   void __iomem *maddr;
> > + pte_t *ptep, pte;
> > + spinlock_t *ptl;
> >   int offset = addr & (PAGE_SIZE-1);
> > + int ret = -EINVAL;
> > +
> > + if (!(vma->vm_flags & (VM_IO | VM_PFNMAP)))
> > + return -EINVAL;
> > +
> > +retry:
> > + if (follow_pte(vma->vm_mm, addr, , ))
> > + return -EINVAL;
> > + pte = *ptep;
> > + pte_unmap_unlock(ptep, ptl);
> >
> > - if (follow_phys(vma, addr, write, , _addr))
> > + prot = pgprot_val(pte_pgprot(pte));
> > + phys_addr = (resource_size_t)pte_pfn(pte) << PAGE_SHIFT;
> > +
> > + if ((write & FOLL_WRITE) && !pte_write(pte))
> >   return -EINVAL;
> >
> >   maddr = ioremap_prot(phys_addr, PAGE_ALIGN(len + offset), prot);
> >   if (!maddr)
> >   return -ENOMEM;
> >
> > + if (follow_pte(vma->vm_mm, addr, , ))
> > + goto out_unmap;
> > +
> > + if (pte_same(pte, *ptep)) {
>
>
> The ioremap area is something I'm sorta new to, so a newbie question:
> is it possible for the same pte to already be there, ever? If so, we
> be stuck in an infinite loop here.  I'm sure that's not the case, but
> it's not yet obvious to me why it's impossible. Resource reservations
> maybe?

It's just buggy, it should be !pte_same. And I need to figure out how
to test this I guess.
-Daniel
-- 
Daniel Vetter
Software Engineer, Intel Corporation
http://blog.ffwll.ch


Re: [PATCH 07/13] mm: close race in generic_access_phys

2020-10-07 Thread John Hubbard

On 10/7/20 9:44 AM, Daniel Vetter wrote:

Way back it was a reasonable assumptions that iomem mappings never
change the pfn range they point at. But this has changed:

- gpu drivers dynamically manage their memory nowadays, invalidating
   ptes with unmap_mapping_range when buffers get moved

- contiguous dma allocations have moved from dedicated carvetouts to


s/carvetouts/carveouts/


   cma regions. This means if we miss the unmap the pfn might contain
   pagecache or anon memory (well anything allocated with GFP_MOVEABLE)

- even /dev/mem now invalidates mappings when the kernel requests that
   iomem region when CONFIG_IO_STRICT_DEVMEM is set, see 3234ac664a87
   ("/dev/mem: Revoke mappings when a driver claims the region")


Thanks for putting these references into the log, it's very helpful.
...

diff --git a/mm/memory.c b/mm/memory.c
index fcfc4ca36eba..8d467e23b44e 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -4873,28 +4873,68 @@ int follow_phys(struct vm_area_struct *vma,
return ret;
  }
  
+/**

+ * generic_access_phys - generic implementation for iomem mmap access
+ * @vma: the vma to access
+ * @addr: userspace addres, not relative offset within @vma
+ * @buf: buffer to read/write
+ * @len: length of transfer
+ * @write: set to FOLL_WRITE when writing, otherwise reading
+ *
+ * This is a generic implementation for _operations_struct.access for an
+ * iomem mapping. This callback is used by access_process_vm() when the @vma is
+ * not page based.
+ */
  int generic_access_phys(struct vm_area_struct *vma, unsigned long addr,
void *buf, int len, int write)
  {
resource_size_t phys_addr;
unsigned long prot = 0;
void __iomem *maddr;
+   pte_t *ptep, pte;
+   spinlock_t *ptl;
int offset = addr & (PAGE_SIZE-1);
+   int ret = -EINVAL;
+
+   if (!(vma->vm_flags & (VM_IO | VM_PFNMAP)))
+   return -EINVAL;
+
+retry:
+   if (follow_pte(vma->vm_mm, addr, , ))
+   return -EINVAL;
+   pte = *ptep;
+   pte_unmap_unlock(ptep, ptl);
  
-	if (follow_phys(vma, addr, write, , _addr))

+   prot = pgprot_val(pte_pgprot(pte));
+   phys_addr = (resource_size_t)pte_pfn(pte) << PAGE_SHIFT;
+
+   if ((write & FOLL_WRITE) && !pte_write(pte))
return -EINVAL;
  
  	maddr = ioremap_prot(phys_addr, PAGE_ALIGN(len + offset), prot);

if (!maddr)
return -ENOMEM;
  
+	if (follow_pte(vma->vm_mm, addr, , ))

+   goto out_unmap;
+
+   if (pte_same(pte, *ptep)) {



The ioremap area is something I'm sorta new to, so a newbie question:
is it possible for the same pte to already be there, ever? If so, we
be stuck in an infinite loop here.  I'm sure that's not the case, but
it's not yet obvious to me why it's impossible. Resource reservations
maybe?


thanks,
--
John Hubbard
NVIDIA


Re: [PATCH 07/13] mm: close race in generic_access_phys

2020-10-07 Thread Jason Gunthorpe
On Wed, Oct 07, 2020 at 08:01:42PM +0200, Daniel Vetter wrote:
> I think it'd fix the bug, until someone wires ->access up for
> drivers/gpu, or the next subsystem. This is also just for ptrace, so
> we really don't care when we stall the vm badly and other silly
> things. So I figured the somewhat ugly, but full generic solution is
> the better one, so that people who want to be able to ptrace
> read/write their iomem mmaps can just sprinkle this wherever they feel
> like.
> 
> But yeah if we go with most minimal fix, i.e. only trying to fix the
> current users, then your thing should work and is simpler. But it
> leaves the door open for future problems.

The only other idea I had was to fully make the 'vma of __iomem
memory' some generic utility, completely take over the vm_ops.

We did something like this in RDMA, what I found was even just
implementing mmap() using the kernel helpers turned out to be pretty
tricky, many drivers did it wrong in small ways.

Jason


Re: [PATCH 07/13] mm: close race in generic_access_phys

2020-10-07 Thread Daniel Vetter
On Wed, Oct 7, 2020 at 7:27 PM Jason Gunthorpe  wrote:
>
> On Wed, Oct 07, 2020 at 06:44:20PM +0200, Daniel Vetter wrote:
> > Way back it was a reasonable assumptions that iomem mappings never
> > change the pfn range they point at. But this has changed:
> >
> > - gpu drivers dynamically manage their memory nowadays, invalidating
> >   ptes with unmap_mapping_range when buffers get moved
> >
> > - contiguous dma allocations have moved from dedicated carvetouts to
> >   cma regions. This means if we miss the unmap the pfn might contain
> >   pagecache or anon memory (well anything allocated with GFP_MOVEABLE)
> >
> > - even /dev/mem now invalidates mappings when the kernel requests that
> >   iomem region when CONFIG_IO_STRICT_DEVMEM is set, see 3234ac664a87
> >   ("/dev/mem: Revoke mappings when a driver claims the region")
> >
> > Accessing pfns obtained from ptes without holding all the locks is
> > therefore no longer a good idea. Fix this.
> >
> > Since ioremap might need to manipulate pagetables too we need to drop
> > the pt lock and have a retry loop if we raced.
> >
> > While at it, also add kerneldoc and improve the comment for the
> > vma_ops->access function. It's for accessing, not for moving the
> > memory from iomem to system memory, as the old comment seemed to
> > suggest.
> >
> > References: 28b2ee20c7cb ("access_process_vm device memory infrastructure")
> > Cc: Jason Gunthorpe 
> > Cc: Dan Williams 
> > Cc: Kees Cook 
> > Cc: Rik van Riel 
> > Cc: Benjamin Herrensmidt 
> > Cc: Dave Airlie 
> > Cc: Hugh Dickins 
> > Cc: Andrew Morton 
> > Cc: John Hubbard 
> > Cc: Jérôme Glisse 
> > Cc: Jan Kara 
> > Cc: Dan Williams 
> > Cc: linux...@kvack.org
> > Cc: linux-arm-ker...@lists.infradead.org
> > Cc: linux-samsung-...@vger.kernel.org
> > Cc: linux-me...@vger.kernel.org
> > Signed-off-by: Daniel Vetter 
> > ---
> >  include/linux/mm.h |  3 ++-
> >  mm/memory.c| 44 ++--
> >  2 files changed, 44 insertions(+), 3 deletions(-)
>
> This does seem to solve the race with revoke_devmem(), but it is really ugly.
>
> It would be much nicer to wrap a rwsem around this access and the unmap.
>
> Any place using it has a nice linear translation from vm_off to pfn,
> so I don't think there is a such a good reason to use follow_pte in
> the first place.
>
> ie why not the helper be this:
>
>  int generic_access_phys(unsigned long pfn, unsigned long pgprot,
>   void *buf, size_t len, bool write)
>
> Then something like dev/mem would compute pfn and obtain the lock:
>
> dev_access(struct vm_area_struct *vma, unsigned long addr, void *buf, int 
> len, int write)
> {
>  cpu_addr = vma->vm_pgoff*PAGE_SIZE + (addr - vma->vm_start));
>
>  /* FIXME: Has to be over each page of len */
>  if (!devmem_is_allowed_access(PHYS_PFN(cpu_addr/4096)))
>return -EPERM;
>
>  down_read(_sem);
>  generic_access_phys(cpu_addr/4096, pgprot_val(vma->vm_page_prot),
>  buf, len, write);
>  up_read(_sem);
> }
>
> The other cases looked simpler because they don't revoke, here the
> mmap_sem alone should be enough protection, they would just need to
> provide the linear translation to pfn.
>
> What do you think?

I think it'd fix the bug, until someone wires ->access up for
drivers/gpu, or the next subsystem. This is also just for ptrace, so
we really don't care when we stall the vm badly and other silly
things. So I figured the somewhat ugly, but full generic solution is
the better one, so that people who want to be able to ptrace
read/write their iomem mmaps can just sprinkle this wherever they feel
like.

But yeah if we go with most minimal fix, i.e. only trying to fix the
current users, then your thing should work and is simpler. But it
leaves the door open for future problems.
-Daniel
-- 
Daniel Vetter
Software Engineer, Intel Corporation
http://blog.ffwll.ch


Re: [PATCH 07/13] mm: close race in generic_access_phys

2020-10-07 Thread Jason Gunthorpe
On Wed, Oct 07, 2020 at 06:44:20PM +0200, Daniel Vetter wrote:
> Way back it was a reasonable assumptions that iomem mappings never
> change the pfn range they point at. But this has changed:
> 
> - gpu drivers dynamically manage their memory nowadays, invalidating
>   ptes with unmap_mapping_range when buffers get moved
> 
> - contiguous dma allocations have moved from dedicated carvetouts to
>   cma regions. This means if we miss the unmap the pfn might contain
>   pagecache or anon memory (well anything allocated with GFP_MOVEABLE)
> 
> - even /dev/mem now invalidates mappings when the kernel requests that
>   iomem region when CONFIG_IO_STRICT_DEVMEM is set, see 3234ac664a87
>   ("/dev/mem: Revoke mappings when a driver claims the region")
> 
> Accessing pfns obtained from ptes without holding all the locks is
> therefore no longer a good idea. Fix this.
> 
> Since ioremap might need to manipulate pagetables too we need to drop
> the pt lock and have a retry loop if we raced.
> 
> While at it, also add kerneldoc and improve the comment for the
> vma_ops->access function. It's for accessing, not for moving the
> memory from iomem to system memory, as the old comment seemed to
> suggest.
> 
> References: 28b2ee20c7cb ("access_process_vm device memory infrastructure")
> Cc: Jason Gunthorpe 
> Cc: Dan Williams 
> Cc: Kees Cook 
> Cc: Rik van Riel 
> Cc: Benjamin Herrensmidt 
> Cc: Dave Airlie 
> Cc: Hugh Dickins 
> Cc: Andrew Morton 
> Cc: John Hubbard 
> Cc: Jérôme Glisse 
> Cc: Jan Kara 
> Cc: Dan Williams 
> Cc: linux...@kvack.org
> Cc: linux-arm-ker...@lists.infradead.org
> Cc: linux-samsung-...@vger.kernel.org
> Cc: linux-me...@vger.kernel.org
> Signed-off-by: Daniel Vetter 
> ---
>  include/linux/mm.h |  3 ++-
>  mm/memory.c| 44 ++--
>  2 files changed, 44 insertions(+), 3 deletions(-)

This does seem to solve the race with revoke_devmem(), but it is really ugly.

It would be much nicer to wrap a rwsem around this access and the unmap.

Any place using it has a nice linear translation from vm_off to pfn,
so I don't think there is a such a good reason to use follow_pte in
the first place.

ie why not the helper be this:

 int generic_access_phys(unsigned long pfn, unsigned long pgprot,
  void *buf, size_t len, bool write)

Then something like dev/mem would compute pfn and obtain the lock:

dev_access(struct vm_area_struct *vma, unsigned long addr, void *buf, int len, 
int write)
{
 cpu_addr = vma->vm_pgoff*PAGE_SIZE + (addr - vma->vm_start));

 /* FIXME: Has to be over each page of len */
 if (!devmem_is_allowed_access(PHYS_PFN(cpu_addr/4096)))
   return -EPERM;

 down_read(_sem);
 generic_access_phys(cpu_addr/4096, pgprot_val(vma->vm_page_prot),
 buf, len, write);
 up_read(_sem);
}

The other cases looked simpler because they don't revoke, here the
mmap_sem alone should be enough protection, they would just need to
provide the linear translation to pfn.

What do you think?

Jason


[PATCH 07/13] mm: close race in generic_access_phys

2020-10-07 Thread Daniel Vetter
Way back it was a reasonable assumptions that iomem mappings never
change the pfn range they point at. But this has changed:

- gpu drivers dynamically manage their memory nowadays, invalidating
  ptes with unmap_mapping_range when buffers get moved

- contiguous dma allocations have moved from dedicated carvetouts to
  cma regions. This means if we miss the unmap the pfn might contain
  pagecache or anon memory (well anything allocated with GFP_MOVEABLE)

- even /dev/mem now invalidates mappings when the kernel requests that
  iomem region when CONFIG_IO_STRICT_DEVMEM is set, see 3234ac664a87
  ("/dev/mem: Revoke mappings when a driver claims the region")

Accessing pfns obtained from ptes without holding all the locks is
therefore no longer a good idea. Fix this.

Since ioremap might need to manipulate pagetables too we need to drop
the pt lock and have a retry loop if we raced.

While at it, also add kerneldoc and improve the comment for the
vma_ops->access function. It's for accessing, not for moving the
memory from iomem to system memory, as the old comment seemed to
suggest.

References: 28b2ee20c7cb ("access_process_vm device memory infrastructure")
Cc: Jason Gunthorpe 
Cc: Dan Williams 
Cc: Kees Cook 
Cc: Rik van Riel 
Cc: Benjamin Herrensmidt 
Cc: Dave Airlie 
Cc: Hugh Dickins 
Cc: Andrew Morton 
Cc: John Hubbard 
Cc: Jérôme Glisse 
Cc: Jan Kara 
Cc: Dan Williams 
Cc: linux...@kvack.org
Cc: linux-arm-ker...@lists.infradead.org
Cc: linux-samsung-...@vger.kernel.org
Cc: linux-me...@vger.kernel.org
Signed-off-by: Daniel Vetter 
---
 include/linux/mm.h |  3 ++-
 mm/memory.c| 44 ++--
 2 files changed, 44 insertions(+), 3 deletions(-)

diff --git a/include/linux/mm.h b/include/linux/mm.h
index acd60fbf1a5a..2a16631c1fda 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -566,7 +566,8 @@ struct vm_operations_struct {
vm_fault_t (*pfn_mkwrite)(struct vm_fault *vmf);
 
/* called by access_process_vm when get_user_pages() fails, typically
-* for use by special VMAs that can switch between memory and hardware
+* for use by special VMAs. See also generic_access_phys() for a generic
+* implementation useful for any iomem mapping.
 */
int (*access)(struct vm_area_struct *vma, unsigned long addr,
  void *buf, int len, int write);
diff --git a/mm/memory.c b/mm/memory.c
index fcfc4ca36eba..8d467e23b44e 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -4873,28 +4873,68 @@ int follow_phys(struct vm_area_struct *vma,
return ret;
 }
 
+/**
+ * generic_access_phys - generic implementation for iomem mmap access
+ * @vma: the vma to access
+ * @addr: userspace addres, not relative offset within @vma
+ * @buf: buffer to read/write
+ * @len: length of transfer
+ * @write: set to FOLL_WRITE when writing, otherwise reading
+ *
+ * This is a generic implementation for _operations_struct.access for an
+ * iomem mapping. This callback is used by access_process_vm() when the @vma is
+ * not page based.
+ */
 int generic_access_phys(struct vm_area_struct *vma, unsigned long addr,
void *buf, int len, int write)
 {
resource_size_t phys_addr;
unsigned long prot = 0;
void __iomem *maddr;
+   pte_t *ptep, pte;
+   spinlock_t *ptl;
int offset = addr & (PAGE_SIZE-1);
+   int ret = -EINVAL;
+
+   if (!(vma->vm_flags & (VM_IO | VM_PFNMAP)))
+   return -EINVAL;
+
+retry:
+   if (follow_pte(vma->vm_mm, addr, , ))
+   return -EINVAL;
+   pte = *ptep;
+   pte_unmap_unlock(ptep, ptl);
 
-   if (follow_phys(vma, addr, write, , _addr))
+   prot = pgprot_val(pte_pgprot(pte));
+   phys_addr = (resource_size_t)pte_pfn(pte) << PAGE_SHIFT;
+
+   if ((write & FOLL_WRITE) && !pte_write(pte))
return -EINVAL;
 
maddr = ioremap_prot(phys_addr, PAGE_ALIGN(len + offset), prot);
if (!maddr)
return -ENOMEM;
 
+   if (follow_pte(vma->vm_mm, addr, , ))
+   goto out_unmap;
+
+   if (pte_same(pte, *ptep)) {
+   pte_unmap_unlock(ptep, ptl);
+   iounmap(maddr);
+
+   goto retry;
+   }
+
if (write)
memcpy_toio(maddr + offset, buf, len);
else
memcpy_fromio(buf, maddr + offset, len);
+   ret = len;
+   pte_unmap_unlock(ptep, ptl);
+out_unmap:
iounmap(maddr);
 
-   return len;
+   return ret;
 }
 EXPORT_SYMBOL_GPL(generic_access_phys);
 #endif
-- 
2.28.0