Re: [PATCH RFC 07/12] iommufd: Data structure to provide IOVA to PFN mapping

2022-04-18 Thread Yi Liu

Hi Jason,

On 2022/4/17 22:56, Yi Liu wrote:

On 2022/4/13 22:49, Yi Liu wrote:

On 2022/4/13 22:36, Jason Gunthorpe wrote:

On Wed, Apr 13, 2022 at 10:02:58PM +0800, Yi Liu wrote:

+/**
+ * iopt_unmap_iova() - Remove a range of iova
+ * @iopt: io_pagetable to act on
+ * @iova: Starting iova to unmap
+ * @length: Number of bytes to unmap
+ *
+ * The requested range must exactly match an existing range.
+ * Splitting/truncating IOVA mappings is not allowed.
+ */
+int iopt_unmap_iova(struct io_pagetable *iopt, unsigned long iova,
+    unsigned long length)
+{
+    struct iopt_pages *pages;
+    struct iopt_area *area;
+    unsigned long iova_end;
+    int rc;
+
+    if (!length)
+    return -EINVAL;
+
+    if (check_add_overflow(iova, length - 1, &iova_end))
+    return -EOVERFLOW;
+
+    down_read(&iopt->domains_rwsem);
+    down_write(&iopt->iova_rwsem);
+    area = iopt_find_exact_area(iopt, iova, iova_end);


When testing vIOMMU with QEMU using iommufd, I hit a problem as log #3
shows. QEMU failed when trying to map because an IOVA was still in use.
After debugging, the 0xf000 IOVA is mapped but not unmapped. Yet per log
#2, QEMU had issued an unmap with a larger range (0xff00 -
0x1) which includes 0xf000, but iopt_find_exact_area() doesn't find
any area, so 0xf000 is not unmapped. Is this correct? The same test
passes with the vfio iommu type1 driver. Any idea?


There are a couple of good reasons why iopt_unmap_iova() should
process any contiguous range of fully contained areas, so I would
consider this something worth fixing. Can you send a small patch and
a test case and I'll fold it in?


Sure. I just spotted it, so I haven't got a fix patch yet. I may work on
it tomorrow.


Hi Jason,

I've got the below patch for it. It's also pushed to the exploration branch.

https://github.com/luxis1999/iommufd/commit/d764f3288de0fd52c578684788a437701ec31b2d 


0-day reported a use-without-initialization issue to me, so I updated it.
Please take the change from the commit below. Sorry for the noise.

https://github.com/luxis1999/iommufd/commit/10674417c235cb4a4caf2202fffb078611441da2

--
Regards,
Yi Liu

Re: [PATCH RFC 07/12] iommufd: Data structure to provide IOVA to PFN mapping

2022-04-17 Thread Yi Liu

On 2022/4/13 22:49, Yi Liu wrote:

On 2022/4/13 22:36, Jason Gunthorpe wrote:

On Wed, Apr 13, 2022 at 10:02:58PM +0800, Yi Liu wrote:

+/**
+ * iopt_unmap_iova() - Remove a range of iova
+ * @iopt: io_pagetable to act on
+ * @iova: Starting iova to unmap
+ * @length: Number of bytes to unmap
+ *
+ * The requested range must exactly match an existing range.
+ * Splitting/truncating IOVA mappings is not allowed.
+ */
+int iopt_unmap_iova(struct io_pagetable *iopt, unsigned long iova,
+    unsigned long length)
+{
+    struct iopt_pages *pages;
+    struct iopt_area *area;
+    unsigned long iova_end;
+    int rc;
+
+    if (!length)
+    return -EINVAL;
+
+    if (check_add_overflow(iova, length - 1, &iova_end))
+    return -EOVERFLOW;
+
+    down_read(&iopt->domains_rwsem);
+    down_write(&iopt->iova_rwsem);
+    area = iopt_find_exact_area(iopt, iova, iova_end);


When testing vIOMMU with QEMU using iommufd, I hit a problem as log #3
shows. QEMU failed when trying to map because an IOVA was still in use.
After debugging, the 0xf000 IOVA is mapped but not unmapped. Yet per log
#2, QEMU had issued an unmap with a larger range (0xff00 -
0x1) which includes 0xf000, but iopt_find_exact_area() doesn't find
any area, so 0xf000 is not unmapped. Is this correct? The same test
passes with the vfio iommu type1 driver. Any idea?


There are a couple of good reasons why iopt_unmap_iova() should
process any contiguous range of fully contained areas, so I would
consider this something worth fixing. Can you send a small patch and
a test case and I'll fold it in?


Sure. I just spotted it, so I haven't got a fix patch yet. I may work on
it tomorrow.


Hi Jason,

I've got the below patch for it. It's also pushed to the exploration branch.

https://github.com/luxis1999/iommufd/commit/d764f3288de0fd52c578684788a437701ec31b2d

From 22a758c401a1c7f6656625013bb87204c9ea65fe Mon Sep 17 00:00:00 2001
From: Yi Liu 
Date: Sun, 17 Apr 2022 07:39:03 -0700
Subject: [PATCH] iommufd/io_pagetable: Support unmap fully contained areas

Changes:
- return the unmapped bytes to the caller
- support unmapping fully contained contiguous areas
- add a test case to the selftest

Signed-off-by: Yi Liu 
---
 drivers/iommu/iommufd/io_pagetable.c| 90 -
 drivers/iommu/iommufd/ioas.c|  8 ++-
 drivers/iommu/iommufd/iommufd_private.h |  4 +-
 drivers/iommu/iommufd/vfio_compat.c |  8 ++-
 include/uapi/linux/iommufd.h|  2 +-
 tools/testing/selftests/iommu/iommufd.c | 40 +++
 6 files changed, 99 insertions(+), 53 deletions(-)

diff --git a/drivers/iommu/iommufd/io_pagetable.c b/drivers/iommu/iommufd/io_pagetable.c

index f9f3b06946bf..5142f797a812 100644
--- a/drivers/iommu/iommufd/io_pagetable.c
+++ b/drivers/iommu/iommufd/io_pagetable.c
@@ -315,61 +315,26 @@ static int __iopt_unmap_iova(struct io_pagetable *iopt, struct iopt_area *area,

return 0;
 }

-/**
- * iopt_unmap_iova() - Remove a range of iova
- * @iopt: io_pagetable to act on
- * @iova: Starting iova to unmap
- * @length: Number of bytes to unmap
- *
- * The requested range must exactly match an existing range.
- * Splitting/truncating IOVA mappings is not allowed.
- */
-int iopt_unmap_iova(struct io_pagetable *iopt, unsigned long iova,
-   unsigned long length)
-{
-   struct iopt_pages *pages;
-   struct iopt_area *area;
-   unsigned long iova_end;
-   int rc;
-
-   if (!length)
-   return -EINVAL;
-
-   if (check_add_overflow(iova, length - 1, &iova_end))
-   return -EOVERFLOW;
-
-   down_read(&iopt->domains_rwsem);
-   down_write(&iopt->iova_rwsem);
-   area = iopt_find_exact_area(iopt, iova, iova_end);
-   if (!area) {
-   up_write(&iopt->iova_rwsem);
-   up_read(&iopt->domains_rwsem);
-   return -ENOENT;
-   }
-   pages = area->pages;
-   area->pages = NULL;
-   up_write(&iopt->iova_rwsem);
-
-   rc = __iopt_unmap_iova(iopt, area, pages);
-   up_read(&iopt->domains_rwsem);
-   return rc;
-}
-
-int iopt_unmap_all(struct io_pagetable *iopt)
+static int __iopt_unmap_iova_range(struct io_pagetable *iopt,
+  unsigned long start,
+  unsigned long end,
+  unsigned long *unmapped)
 {
struct iopt_area *area;
+   unsigned long unmapped_bytes = 0;
int rc;

down_read(&iopt->domains_rwsem);
down_write(&iopt->iova_rwsem);
-   while ((area = iopt_area_iter_first(iopt, 0, ULONG_MAX))) {
+   while ((area = iopt_area_iter_first(iopt, start, end))) {
struct iopt_pages *pages;

-   /* Userspace should not race unmap all and map */
-   if (!area->pages) {
-   rc = -EBUSY;
+   if (!area->pages || iopt_area_iova(area) < start ||
+   iopt_area_last_iova(area) > end) {
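
The archived diff is cut off above. For context, the callers can then sit
on top of the new range helper roughly like this; a sketch consistent with
the commit's stated changes (returning the unmapped byte count to the
caller), not the exact patch:

	/* Sketch only: iopt_unmap_iova()/iopt_unmap_all() rebuilt on top of
	 * __iopt_unmap_iova_range(); the 'unmapped' out-parameter carries the
	 * number of bytes actually unmapped back to the caller. */
	int iopt_unmap_iova(struct io_pagetable *iopt, unsigned long iova,
			    unsigned long length, unsigned long *unmapped)
	{
		unsigned long iova_end;

		if (!length)
			return -EINVAL;
		if (check_add_overflow(iova, length - 1, &iova_end))
			return -EOVERFLOW;

		return __iopt_unmap_iova_range(iopt, iova, iova_end, unmapped);
	}

	int iopt_unmap_all(struct io_pagetable *iopt, unsigned long *unmapped)
	{
		return __iopt_unmap_iova_range(iopt, 0, ULONG_MAX, unmapped);
	}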

Re: [PATCH RFC 07/12] iommufd: Data structure to provide IOVA to PFN mapping

2022-04-13 Thread Yi Liu

On 2022/4/13 22:36, Jason Gunthorpe wrote:

On Wed, Apr 13, 2022 at 10:02:58PM +0800, Yi Liu wrote:

+/**
+ * iopt_unmap_iova() - Remove a range of iova
+ * @iopt: io_pagetable to act on
+ * @iova: Starting iova to unmap
+ * @length: Number of bytes to unmap
+ *
+ * The requested range must exactly match an existing range.
+ * Splitting/truncating IOVA mappings is not allowed.
+ */
+int iopt_unmap_iova(struct io_pagetable *iopt, unsigned long iova,
+   unsigned long length)
+{
+   struct iopt_pages *pages;
+   struct iopt_area *area;
+   unsigned long iova_end;
+   int rc;
+
+   if (!length)
+   return -EINVAL;
+
+   if (check_add_overflow(iova, length - 1, &iova_end))
+   return -EOVERFLOW;
+
+   down_read(&iopt->domains_rwsem);
+   down_write(&iopt->iova_rwsem);
+   area = iopt_find_exact_area(iopt, iova, iova_end);


When testing vIOMMU with QEMU using iommufd, I hit a problem as log #3
shows. QEMU failed when trying to map because an IOVA was still in use.
After debugging, the 0xf000 IOVA is mapped but not unmapped. Yet per log
#2, QEMU had issued an unmap with a larger range (0xff00 -
0x1) which includes 0xf000, but iopt_find_exact_area() doesn't find
any area, so 0xf000 is not unmapped. Is this correct? The same test
passes with the vfio iommu type1 driver. Any idea?


There are a couple of good reasons why iopt_unmap_iova() should
process any contiguous range of fully contained areas, so I would
consider this something worth fixing. Can you send a small patch and
a test case and I'll fold it in?


Sure. I just spotted it, so I haven't got a fix patch yet. I may work on
it tomorrow.

--
Regards,
Yi Liu


Re: [PATCH RFC 07/12] iommufd: Data structure to provide IOVA to PFN mapping

2022-04-13 Thread Jason Gunthorpe via iommu
On Wed, Apr 13, 2022 at 10:02:58PM +0800, Yi Liu wrote:
> > +/**
> > + * iopt_unmap_iova() - Remove a range of iova
> > + * @iopt: io_pagetable to act on
> > + * @iova: Starting iova to unmap
> > + * @length: Number of bytes to unmap
> > + *
> > + * The requested range must exactly match an existing range.
> > + * Splitting/truncating IOVA mappings is not allowed.
> > + */
> > +int iopt_unmap_iova(struct io_pagetable *iopt, unsigned long iova,
> > +   unsigned long length)
> > +{
> > +   struct iopt_pages *pages;
> > +   struct iopt_area *area;
> > +   unsigned long iova_end;
> > +   int rc;
> > +
> > +   if (!length)
> > +   return -EINVAL;
> > +
> > +   if (check_add_overflow(iova, length - 1, &iova_end))
> > +   return -EOVERFLOW;
> > +
> > +   down_read(&iopt->domains_rwsem);
> > +   down_write(&iopt->iova_rwsem);
> > +   area = iopt_find_exact_area(iopt, iova, iova_end);
> 
> When testing vIOMMU with QEMU using iommufd, I hit a problem as log #3
> shows. QEMU failed when trying to map because an IOVA was still in use.
> After debugging, the 0xf000 IOVA is mapped but not unmapped. Yet per log
> #2, QEMU had issued an unmap with a larger range (0xff00 -
> 0x1) which includes 0xf000, but iopt_find_exact_area() doesn't find
> any area, so 0xf000 is not unmapped. Is this correct? The same test
> passes with the vfio iommu type1 driver. Any idea?

There are a couple of good reasons why iopt_unmap_iova() should
process any contiguous range of fully contained areas, so I would
consider this something worth fixing. Can you send a small patch and
a test case and I'll fold it in?

Thanks,
Jason


Re: [PATCH RFC 07/12] iommufd: Data structure to provide IOVA to PFN mapping

2022-04-13 Thread Yi Liu

Hi Jason,

On 2022/3/19 01:27, Jason Gunthorpe wrote:

This is the remainder of the IOAS data structure. Provide an object called
an io_pagetable that is composed of iopt_areas pointing at iopt_pages,
along with a list of iommu_domains that mirror the IOVA to PFN map.

At the top this is a simple interval tree of iopt_areas indicating the map
of IOVA to iopt_pages. An xarray keeps track of a list of domains. Based
on the attached domains there is a minimum alignment for areas (which may
be smaller than PAGE_SIZE) and an interval tree of reserved IOVA that
can't be mapped.

The concept of a 'user' refers to something like a VFIO mdev that is
accessing the IOVA and using a 'struct page *' for CPU based access.

Externally an API is provided that matches the requirements of the IOCTL
interface for map/unmap and domain attachment.

The API provides a 'copy' primitive to establish a new IOVA map in a
different IOAS from an existing mapping.

This is designed to support a pre-registration flow where userspace would
set up a dummy IOAS with no domains, map in memory, and then establish a
user to pin all PFNs into the xarray.

Copy can then be used to create new IOVA mappings in a different IOAS,
with iommu_domains attached. Upon copy the PFNs will be read out of the
xarray and mapped into the iommu_domains, avoiding any pin_user_pages()
overheads.

Signed-off-by: Jason Gunthorpe 
---
  drivers/iommu/iommufd/Makefile  |   1 +
  drivers/iommu/iommufd/io_pagetable.c| 890 
  drivers/iommu/iommufd/iommufd_private.h |  35 +
  3 files changed, 926 insertions(+)
  create mode 100644 drivers/iommu/iommufd/io_pagetable.c

diff --git a/drivers/iommu/iommufd/Makefile b/drivers/iommu/iommufd/Makefile
index 05a0e91e30afad..b66a8c47ff55ec 100644
--- a/drivers/iommu/iommufd/Makefile
+++ b/drivers/iommu/iommufd/Makefile
@@ -1,5 +1,6 @@
  # SPDX-License-Identifier: GPL-2.0-only
  iommufd-y := \
+   io_pagetable.o \
main.o \
pages.o
  
diff --git a/drivers/iommu/iommufd/io_pagetable.c b/drivers/iommu/iommufd/io_pagetable.c

new file mode 100644
index 00..f9f3b06946bfb9
--- /dev/null
+++ b/drivers/iommu/iommufd/io_pagetable.c
@@ -0,0 +1,890 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2021-2022, NVIDIA CORPORATION & AFFILIATES.
+ *
+ * The io_pagetable is the top of datastructure that maps IOVA's to PFNs. The
+ * PFNs can be placed into an iommu_domain, or returned to the caller as a page
+ * list for access by an in-kernel user.
+ *
+ * The datastructure uses the iopt_pages to optimize the storage of the PFNs
+ * between the domains and xarray.
+ */
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+#include "io_pagetable.h"
+
+static unsigned long iopt_area_iova_to_index(struct iopt_area *area,
+unsigned long iova)
+{
+   if (IS_ENABLED(CONFIG_IOMMUFD_TEST))
+   WARN_ON(iova < iopt_area_iova(area) ||
+   iova > iopt_area_last_iova(area));
+   return (iova - (iopt_area_iova(area) & PAGE_MASK)) / PAGE_SIZE;
+}
+
+static struct iopt_area *iopt_find_exact_area(struct io_pagetable *iopt,
+ unsigned long iova,
+ unsigned long last_iova)
+{
+   struct iopt_area *area;
+
+   area = iopt_area_iter_first(iopt, iova, last_iova);
+   if (!area || !area->pages || iopt_area_iova(area) != iova ||
+   iopt_area_last_iova(area) != last_iova)
+   return NULL;
+   return area;
+}
+
+static bool __alloc_iova_check_hole(struct interval_tree_span_iter *span,
+   unsigned long length,
+   unsigned long iova_alignment,
+   unsigned long page_offset)
+{
+   if (!span->is_hole || span->last_hole - span->start_hole < length - 1)
+   return false;
+
+   span->start_hole =
+   ALIGN(span->start_hole, iova_alignment) | page_offset;
+   if (span->start_hole > span->last_hole ||
+   span->last_hole - span->start_hole < length - 1)
+   return false;
+   return true;
+}
+
+/*
+ * Automatically find a block of IOVA that is not being used and not reserved.
+ * Does not return a 0 IOVA even if it is valid.
+ */
+static int iopt_alloc_iova(struct io_pagetable *iopt, unsigned long *iova,
+  unsigned long uptr, unsigned long length)
+{
+   struct interval_tree_span_iter reserved_span;
+   unsigned long page_offset = uptr % PAGE_SIZE;
+   struct interval_tree_span_iter area_span;
+   unsigned long iova_alignment;
+
+   lockdep_assert_held(&iopt->iova_rwsem);
+
+   /* Protect roundup_pow-of_two() from overflow */
+   if (length == 0 || length >= ULONG_MAX / 2)
+   return -EOVERFLOW;
+
+   /*
+* Keep alignment present in the uptr when building the IOVA, this
+* increases the chance we can map a THP.

Re: [PATCH RFC 07/12] iommufd: Data structure to provide IOVA to PFN mapping

2022-03-25 Thread Jason Gunthorpe via iommu
On Fri, Mar 25, 2022 at 09:34:08PM +0800, zhangfei@foxmail.com wrote:

> > +   iopt->iova_alignment = new_iova_alignment;
> > +   xa_store(&iopt->domains, iopt->next_domain_id, domain, GFP_KERNEL);
> > +   iopt->next_domain_id++;
> I don't understand here.
> 
> Do we get the domain via xa_load(&iopt->domains, iopt->next_domain_id - 1)?
> Then how do we get the domain after next_domain_id++?
> For example, if iopt_table_add_domain() is called 3 times with 3 domains,
> how do we know which next_domain_id is the correct one?

There is no "correct one"; this is just a simple list of domains. The
algorithms either need to pick any single domain or iterate over every
domain.

Basically this bit of code is building a vector with the operations
'push_back', 'front', 'erase' and 'for each'
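
As an illustration only (not code from the patch), this is roughly how
those vector operations map onto the xarray used here; do_something() is a
placeholder for whatever per-domain work an algorithm needs:

	struct iommu_domain *domain;
	unsigned long index;

	/* 'push_back': store at the current size, then grow it */
	xa_store(&iopt->domains, iopt->next_domain_id++, domain, GFP_KERNEL);

	/* 'front': any single domain will do, so take index 0 */
	domain = xa_load(&iopt->domains, 0);

	/* 'for each': walk every attached domain */
	xa_for_each(&iopt->domains, index, domain)
		do_something(domain);

	/* 'erase': drop the last element and shrink the size */
	xa_erase(&iopt->domains, --iopt->next_domain_id);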

Jason


Re: [PATCH RFC 07/12] iommufd: Data structure to provide IOVA to PFN mapping

2022-03-25 Thread zhangfei....@foxmail.com

Hi, Jason

On 2022/3/19 1:27 AM, Jason Gunthorpe via iommu wrote:

This is the remainder of the IOAS data structure. Provide an object called
an io_pagetable that is composed of iopt_areas pointing at iopt_pages,
along with a list of iommu_domains that mirror the IOVA to PFN map.

At the top this is a simple interval tree of iopt_areas indicating the map
of IOVA to iopt_pages. An xarray keeps track of a list of domains. Based
on the attached domains there is a minimum alignment for areas (which may
be smaller than PAGE_SIZE) and an interval tree of reserved IOVA that
can't be mapped.

The concept of a 'user' refers to something like a VFIO mdev that is
accessing the IOVA and using a 'struct page *' for CPU based access.

Externally an API is provided that matches the requirements of the IOCTL
interface for map/unmap and domain attachment.

The API provides a 'copy' primitive to establish a new IOVA map in a
different IOAS from an existing mapping.

This is designed to support a pre-registration flow where userspace would
set up a dummy IOAS with no domains, map in memory, and then establish a
user to pin all PFNs into the xarray.

Copy can then be used to create new IOVA mappings in a different IOAS,
with iommu_domains attached. Upon copy the PFNs will be read out of the
xarray and mapped into the iommu_domains, avoiding any pin_user_pages()
overheads.

Signed-off-by: Jason Gunthorpe 
---
  drivers/iommu/iommufd/Makefile  |   1 +
  drivers/iommu/iommufd/io_pagetable.c| 890 
  drivers/iommu/iommufd/iommufd_private.h |  35 +
  3 files changed, 926 insertions(+)
  create mode 100644 drivers/iommu/iommufd/io_pagetable.c

diff --git a/drivers/iommu/iommufd/Makefile b/drivers/iommu/iommufd/Makefile
index 05a0e91e30afad..b66a8c47ff55ec 100644
--- a/drivers/iommu/iommufd/Makefile
+++ b/drivers/iommu/iommufd/Makefile
@@ -1,5 +1,6 @@
  # SPDX-License-Identifier: GPL-2.0-only
  iommufd-y := \
+   io_pagetable.o \
main.o \
pages.o
  
diff --git a/drivers/iommu/iommufd/io_pagetable.c b/drivers/iommu/iommufd/io_pagetable.c

new file mode 100644
index 00..f9f3b06946bfb9
--- /dev/null
+++ b/drivers/iommu/iommufd/io_pagetable.c
@@ -0,0 +1,890 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2021-2022, NVIDIA CORPORATION & AFFILIATES.
+ *
+ * The io_pagetable is the top of datastructure that maps IOVA's to PFNs. The
+ * PFNs can be placed into an iommu_domain, or returned to the caller as a page
+ * list for access by an in-kernel user.
+ *
+ * The datastructure uses the iopt_pages to optimize the storage of the PFNs
+ * between the domains and xarray.
+ */
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+#include "io_pagetable.h"
+
+static unsigned long iopt_area_iova_to_index(struct iopt_area *area,
+unsigned long iova)
+{
+   if (IS_ENABLED(CONFIG_IOMMUFD_TEST))
+   WARN_ON(iova < iopt_area_iova(area) ||
+   iova > iopt_area_last_iova(area));
+   return (iova - (iopt_area_iova(area) & PAGE_MASK)) / PAGE_SIZE;
+}
+
+static struct iopt_area *iopt_find_exact_area(struct io_pagetable *iopt,
+ unsigned long iova,
+ unsigned long last_iova)
+{
+   struct iopt_area *area;
+
+   area = iopt_area_iter_first(iopt, iova, last_iova);
+   if (!area || !area->pages || iopt_area_iova(area) != iova ||
+   iopt_area_last_iova(area) != last_iova)
+   return NULL;
+   return area;
+}
+
+static bool __alloc_iova_check_hole(struct interval_tree_span_iter *span,
+   unsigned long length,
+   unsigned long iova_alignment,
+   unsigned long page_offset)
+{
+   if (!span->is_hole || span->last_hole - span->start_hole < length - 1)
+   return false;
+
+   span->start_hole =
+   ALIGN(span->start_hole, iova_alignment) | page_offset;
+   if (span->start_hole > span->last_hole ||
+   span->last_hole - span->start_hole < length - 1)
+   return false;
+   return true;
+}
+
+/*
+ * Automatically find a block of IOVA that is not being used and not reserved.
+ * Does not return a 0 IOVA even if it is valid.
+ */
+static int iopt_alloc_iova(struct io_pagetable *iopt, unsigned long *iova,
+  unsigned long uptr, unsigned long length)
+{
+   struct interval_tree_span_iter reserved_span;
+   unsigned long page_offset = uptr % PAGE_SIZE;
+   struct interval_tree_span_iter area_span;
+   unsigned long iova_alignment;
+
+   lockdep_assert_held(&iopt->iova_rwsem);
+
+   /* Protect roundup_pow-of_two() from overflow */
+   if (length == 0 || length >= ULONG_MAX / 2)
+   return -EOVERFLOW;
+
+   /*
+* Keep alignment present in the uptr when building the IOVA, this
+* increases the chance we can map a THP.

Re: [PATCH RFC 07/12] iommufd: Data structure to provide IOVA to PFN mapping

2022-03-24 Thread Jason Gunthorpe via iommu
On Thu, Mar 24, 2022 at 03:09:46AM +, Tian, Kevin wrote:
> + /*
> +  * Can't cross areas that are not aligned to the system page
> +  * size with this API.
> +  */
> + if (cur_iova % PAGE_SIZE) {
> + rc = -EINVAL;
> + goto out_remove;
> + }
> 
> Currently it's done after iopt_pages_add_user() but before cur_iova 
> is adjusted, which implies the last add_user() will not be reverted in
> case of failed check here.

Oh, yes, that's right too.

The above is even wrong; it didn't get fixed when page_offset was
done.

So more like this:

diff --git a/drivers/iommu/iommufd/io_pagetable.c b/drivers/iommu/iommufd/io_pagetable.c
index 1c08ae9b848fcf..9505f119df982e 100644
--- a/drivers/iommu/iommufd/io_pagetable.c
+++ b/drivers/iommu/iommufd/io_pagetable.c
@@ -23,7 +23,7 @@ static unsigned long iopt_area_iova_to_index(struct iopt_area *area,
if (IS_ENABLED(CONFIG_IOMMUFD_TEST))
WARN_ON(iova < iopt_area_iova(area) ||
iova > iopt_area_last_iova(area));
-   return (iova - (iopt_area_iova(area) & PAGE_MASK)) / PAGE_SIZE;
+   return (iova - (iopt_area_iova(area) - area->page_offset)) / PAGE_SIZE;
 }
 
 static struct iopt_area *iopt_find_exact_area(struct io_pagetable *iopt,
@@ -436,31 +436,45 @@ int iopt_access_pages(struct io_pagetable *iopt, unsigned long iova,
unsigned long index;
 
/* Need contiguous areas in the access */
-   if (iopt_area_iova(area) < cur_iova || !area->pages) {
+   if (iopt_area_iova(area) > cur_iova || !area->pages) {
rc = -EINVAL;
goto out_remove;
}
 
index = iopt_area_iova_to_index(area, cur_iova);
last_index = iopt_area_iova_to_index(area, last);
+
+   /*
+* The API can only return aligned pages, so the starting point
+* must be at a page boundary.
+*/
+   if ((cur_iova - (iopt_area_iova(area) - area->page_offset)) %
+   PAGE_SIZE) {
+   rc = -EINVAL;
+   goto out_remove;
+   }
+
+   /*
+* and an interior ending point must be at a page boundary
+*/
+   if (last != last_iova &&
+   (iopt_area_last_iova(area) - cur_iova + 1) % PAGE_SIZE) {
+   rc = -EINVAL;
+   goto out_remove;
+   }
+
rc = iopt_pages_add_user(area->pages, index, last_index,
 out_pages, write);
if (rc)
goto out_remove;
if (last == last_iova)
break;
-   /*
-* Can't cross areas that are not aligned to the system page
-* size with this API.
-*/
-   if (cur_iova % PAGE_SIZE) {
-   rc = -EINVAL;
-   goto out_remove;
-   }
cur_iova = last + 1;
out_pages += last_index - index;
atomic_inc(&area->num_users);
}
+   if (cur_iova != last_iova)
+   goto out_remove;
 
up_read(&iopt->iova_rwsem);
return 0;
diff --git a/tools/testing/selftests/iommu/iommufd.c b/tools/testing/selftests/iommu/iommufd.c
index 5c47d706ed9449..a46e0f0ae82553 100644
--- a/tools/testing/selftests/iommu/iommufd.c
+++ b/tools/testing/selftests/iommu/iommufd.c
@@ -1221,5 +1221,6 @@ TEST_F(vfio_compat_mock_domain, get_info)
 /* FIXME test VFIO_IOMMU_MAP_DMA */
 /* FIXME test VFIO_IOMMU_UNMAP_DMA */
 /* FIXME test 2k iova alignment */
+/* FIXME cover boundary cases for iopt_access_pages()  */
 
 TEST_HARNESS_MAIN


RE: [PATCH RFC 07/12] iommufd: Data structure to provide IOVA to PFN mapping

2022-03-23 Thread Tian, Kevin
> From: Jason Gunthorpe 
> Sent: Thursday, March 24, 2022 2:16 AM
> 
> On Tue, Mar 22, 2022 at 04:15:44PM -0600, Alex Williamson wrote:
> 
> > > +int iopt_access_pages(struct io_pagetable *iopt, unsigned long iova,
> > > +   unsigned long length, struct page **out_pages, bool write)
> > > +{
> > > + unsigned long cur_iova = iova;
> > > + unsigned long last_iova;
> > > + struct iopt_area *area;
> > > + int rc;
> > > +
> > > + if (!length)
> > > + return -EINVAL;
> > > + if (check_add_overflow(iova, length - 1, &last_iova))
> > > + return -EOVERFLOW;
> > > +
> > > + down_read(&iopt->iova_rwsem);
> > > + for (area = iopt_area_iter_first(iopt, iova, last_iova); area;
> > > +  area = iopt_area_iter_next(area, iova, last_iova)) {
> > > + unsigned long last = min(last_iova, iopt_area_last_iova(area));
> > > + unsigned long last_index;
> > > + unsigned long index;
> > > +
> > > + /* Need contiguous areas in the access */
> > > + if (iopt_area_iova(area) < cur_iova || !area->pages) {
> > ^^^
> > Should this be (cur_iova != iova && iopt_area_iova(area) < cur_iova)?
> 
> Oh good eye
> 
> That is a typo < should be >:
> 
>   if (iopt_area_iova(area) > cur_iova || !area->pages) {
> 
> There are three boundary conditions here to worry about
>  - interval trees return any nodes that intersect the queried range
>so the first found node can start after iova
> 
>  - There can be a gap in the intervals
> 
>  - The final area can end short of last_iova
> 
> The last one is botched too and needs this:
> for ... { ...
>   }
> + if (cur_iova != last_iova)
> + goto out_remove;
> 
> The test suite isn't covering the boundary cases here yet, I added a
> FIXME for this for now.
> 

Another nit about below:

+   /*
+* Can't cross areas that are not aligned to the system page
+* size with this API.
+*/
+   if (cur_iova % PAGE_SIZE) {
+   rc = -EINVAL;
+   goto out_remove;
+   }

Currently it's done after iopt_pages_add_user() but before cur_iova
is adjusted, which implies the last add_user() will not be reverted in
case of a failed check here.

I suppose this should be checked at the start of the loop.

Thanks
Kevin


Re: [PATCH RFC 07/12] iommufd: Data structure to provide IOVA to PFN mapping

2022-03-23 Thread Jason Gunthorpe via iommu
On Tue, Mar 22, 2022 at 04:15:44PM -0600, Alex Williamson wrote:

> > +static struct iopt_area *
> > +iopt_alloc_area(struct io_pagetable *iopt, struct iopt_pages *pages,
> > +   unsigned long iova, unsigned long start_byte,
> > +   unsigned long length, int iommu_prot, unsigned int flags)
> > +{
> > +   struct iopt_area *area;
> > +   int rc;
> > +
> > +   area = kzalloc(sizeof(*area), GFP_KERNEL);
> > +   if (!area)
> > +   return ERR_PTR(-ENOMEM);
> > +
> > +   area->iopt = iopt;
> > +   area->iommu_prot = iommu_prot;
> > +   area->page_offset = start_byte % PAGE_SIZE;
> > +   area->pages_node.start = start_byte / PAGE_SIZE;
> > +   if (check_add_overflow(start_byte, length - 1, &area->pages_node.last))
> > +   return ERR_PTR(-EOVERFLOW);
> > +   area->pages_node.last = area->pages_node.last / PAGE_SIZE;
> > +   if (WARN_ON(area->pages_node.last >= pages->npages))
> > +   return ERR_PTR(-EOVERFLOW);
> 
> @area leaked in the above two error cases.

Yes, thanks
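
For readers: the straightforward fix is to free the allocation on those two
error paths; a sketch of the shape, not necessarily how the next revision
did it:

	if (check_add_overflow(start_byte, length - 1, &area->pages_node.last)) {
		kfree(area);
		return ERR_PTR(-EOVERFLOW);
	}
	area->pages_node.last = area->pages_node.last / PAGE_SIZE;
	if (WARN_ON(area->pages_node.last >= pages->npages)) {
		kfree(area);
		return ERR_PTR(-EOVERFLOW);
	}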

> > +int iopt_access_pages(struct io_pagetable *iopt, unsigned long iova,
> > + unsigned long length, struct page **out_pages, bool write)
> > +{
> > +   unsigned long cur_iova = iova;
> > +   unsigned long last_iova;
> > +   struct iopt_area *area;
> > +   int rc;
> > +
> > +   if (!length)
> > +   return -EINVAL;
> > +   if (check_add_overflow(iova, length - 1, &last_iova))
> > +   return -EOVERFLOW;
> > +
> > +   down_read(&iopt->iova_rwsem);
> > +   for (area = iopt_area_iter_first(iopt, iova, last_iova); area;
> > +area = iopt_area_iter_next(area, iova, last_iova)) {
> > +   unsigned long last = min(last_iova, iopt_area_last_iova(area));
> > +   unsigned long last_index;
> > +   unsigned long index;
> > +
> > +   /* Need contiguous areas in the access */
> > +   if (iopt_area_iova(area) < cur_iova || !area->pages) {
> ^^^
> Should this be (cur_iova != iova && iopt_area_iova(area) < cur_iova)?

Oh good eye

That is a typo < should be >:

if (iopt_area_iova(area) > cur_iova || !area->pages) {

There are three boundary conditions here to worry about
 - interval trees return any nodes that intersect the queried range
   so the first found node can start after iova

 - There can be a gap in the intervals

 - The final area can end short of last_iova

The last one is botched too and needs this:
for ... { ...
}
+   if (cur_iova != last_iova)
+   goto out_remove;

The test suite isn't covering the boundary cases here yet, I added a
FIXME for this for now.
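
Putting those three conditions together, the access loop ends up with
roughly this shape; a paraphrase of the fixes above (the 'reached_end'
flag is mine, for clarity), not the exact final code:

	unsigned long cur_iova = iova;
	bool reached_end = false;

	for (area = iopt_area_iter_first(iopt, iova, last_iova); area;
	     area = iopt_area_iter_next(area, iova, last_iova)) {
		unsigned long last = min(last_iova, iopt_area_last_iova(area));

		/* Conditions 1 and 2: the area must cover cur_iova, i.e. the
		 * first area may not start past iova and there may be no gap
		 * between consecutive areas. */
		if (iopt_area_iova(area) > cur_iova || !area->pages)
			goto out_remove;

		/* ... per-area pinning work on [cur_iova, last] ... */

		if (last == last_iova) {
			reached_end = true;
			break;
		}
		cur_iova = last + 1;
	}

	/* Condition 3: the final area must not end short of last_iova */
	if (!reached_end)
		goto out_remove;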

Thanks,
Jason


Re: [PATCH RFC 07/12] iommufd: Data structure to provide IOVA to PFN mapping

2022-03-22 Thread Alex Williamson
On Fri, 18 Mar 2022 14:27:32 -0300
Jason Gunthorpe  wrote:
> +/*
> + * The area takes a slice of the pages from start_bytes to start_byte + 
> length
> + */
> +static struct iopt_area *
> +iopt_alloc_area(struct io_pagetable *iopt, struct iopt_pages *pages,
> + unsigned long iova, unsigned long start_byte,
> + unsigned long length, int iommu_prot, unsigned int flags)
> +{
> + struct iopt_area *area;
> + int rc;
> +
> + area = kzalloc(sizeof(*area), GFP_KERNEL);
> + if (!area)
> + return ERR_PTR(-ENOMEM);
> +
> + area->iopt = iopt;
> + area->iommu_prot = iommu_prot;
> + area->page_offset = start_byte % PAGE_SIZE;
> + area->pages_node.start = start_byte / PAGE_SIZE;
> + if (check_add_overflow(start_byte, length - 1, &area->pages_node.last))
> + return ERR_PTR(-EOVERFLOW);
> + area->pages_node.last = area->pages_node.last / PAGE_SIZE;
> + if (WARN_ON(area->pages_node.last >= pages->npages))
> + return ERR_PTR(-EOVERFLOW);

@area leaked in the above two error cases.

> +
> + down_write(&iopt->iova_rwsem);
> + if (flags & IOPT_ALLOC_IOVA) {
> + rc = iopt_alloc_iova(iopt, &iova,
> +  (uintptr_t)pages->uptr + start_byte,
> +  length);
> + if (rc)
> + goto out_unlock;
> + }
> +
> + if (check_add_overflow(iova, length - 1, &area->node.last)) {
> + rc = -EOVERFLOW;
> + goto out_unlock;
> + }
> +
> + if (!(flags & IOPT_ALLOC_IOVA)) {
> + if ((iova & (iopt->iova_alignment - 1)) ||
> + (length & (iopt->iova_alignment - 1)) || !length) {
> + rc = -EINVAL;
> + goto out_unlock;
> + }
> +
> + /* No reserved IOVA intersects the range */
> + if (interval_tree_iter_first(&iopt->reserved_iova_itree, iova,
> +  area->node.last)) {
> + rc = -ENOENT;
> + goto out_unlock;
> + }
> +
> + /* Check that there is not already a mapping in the range */
> + if (iopt_area_iter_first(iopt, iova, area->node.last)) {
> + rc = -EADDRINUSE;
> + goto out_unlock;
> + }
> + }
> +
> + /*
> +  * The area is inserted with a NULL pages indicating it is not fully
> +  * initialized yet.
> +  */
> + area->node.start = iova;
> + interval_tree_insert(&area->node, &area->iopt->area_itree);
> + up_write(&iopt->iova_rwsem);
> + return area;
> +
> +out_unlock:
> + up_write(&iopt->iova_rwsem);
> + kfree(area);
> + return ERR_PTR(rc);
> +}
...
> +/**
> + * iopt_access_pages() - Return a list of pages under the iova
> + * @iopt: io_pagetable to act on
> + * @iova: Starting IOVA
> + * @length: Number of bytes to access
> + * @out_pages: Output page list
> + * @write: True if access is for writing
> + *
> + * Reads @npages starting at iova and returns the struct page * pointers. 
> These
> + * can be kmap'd by the caller for CPU access.
> + *
> + * The caller must perform iopt_unaccess_pages() when done to balance this.
> + *
> + * iova can be unaligned from PAGE_SIZE. The first returned byte starts at
> + * page_to_phys(out_pages[0]) + (iova % PAGE_SIZE). The caller promises not 
> to
> + * touch memory outside the requested iova slice.
> + *
> + * FIXME: callers that need a DMA mapping via a sgl should create another
> + * interface to build the SGL efficiently
> + */
> +int iopt_access_pages(struct io_pagetable *iopt, unsigned long iova,
> +   unsigned long length, struct page **out_pages, bool write)
> +{
> + unsigned long cur_iova = iova;
> + unsigned long last_iova;
> + struct iopt_area *area;
> + int rc;
> +
> + if (!length)
> + return -EINVAL;
> + if (check_add_overflow(iova, length - 1, &last_iova))
> + return -EOVERFLOW;
> +
> + down_read(&iopt->iova_rwsem);
> + for (area = iopt_area_iter_first(iopt, iova, last_iova); area;
> +  area = iopt_area_iter_next(area, iova, last_iova)) {
> + unsigned long last = min(last_iova, iopt_area_last_iova(area));
> + unsigned long last_index;
> + unsigned long index;
> +
> + /* Need contiguous areas in the access */
> + if (iopt_area_iova(area) < cur_iova || !area->pages) {
^^^
Should this be (cur_iova != iova && iopt_area_iova(area) < cur_iova)?

I can't see how we'd require in-kernel page users to know the iopt_area
alignment from userspace, so I think this needs to skip the first
iteration.  Thanks,

Alex

> + rc = -EINVAL;
> + goto out_remove;
> + }
> +
> + index = iopt_area_iova_to_index(area, cur_i
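
The kernel-doc for iopt_access_pages() above spells out the caller contract
(pages kmap'd by the caller, balanced with iopt_unaccess_pages(), iova may
be sub-page aligned). As an illustration only, a hypothetical in-kernel
user touching a small single-page range might look like this; the
driver-side variables are invented and the iopt_unaccess_pages() argument
list is assumed from the doc:

	struct page *pages[1];
	void *va;
	int rc;

	/* 'iopt', 'iova' and a sub-page 'length' come from the surrounding
	 * driver context in this sketch. */
	rc = iopt_access_pages(iopt, iova, length, pages, true);
	if (rc)
		return rc;

	/* First valid byte is at page offset (iova % PAGE_SIZE) */
	va = kmap_local_page(pages[0]);
	memset(va + (iova % PAGE_SIZE), 0, length);
	kunmap_local(va);

	/* Balance the access when done */
	iopt_unaccess_pages(iopt, iova, length);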

[PATCH RFC 07/12] iommufd: Data structure to provide IOVA to PFN mapping

2022-03-18 Thread Jason Gunthorpe via iommu
This is the remainder of the IOAS data structure. Provide an object called
an io_pagetable that is composed of iopt_areas pointing at iopt_pages,
along with a list of iommu_domains that mirror the IOVA to PFN map.

At the top this is a simple interval tree of iopt_areas indicating the map
of IOVA to iopt_pages. An xarray keeps track of a list of domains. Based
on the attached domains there is a minimum alignment for areas (which may
be smaller than PAGE_SIZE) and an interval tree of reserved IOVA that
can't be mapped.

The concept of a 'user' refers to something like a VFIO mdev that is
accessing the IOVA and using a 'struct page *' for CPU based access.

Externally an API is provided that matches the requirements of the IOCTL
interface for map/unmap and domain attachment.

The API provides a 'copy' primitive to establish a new IOVA map in a
different IOAS from an existing mapping.

This is designed to support a pre-registration flow where userspace would
set up a dummy IOAS with no domains, map in memory, and then establish a
user to pin all PFNs into the xarray.

Copy can then be used to create new IOVA mappings in a different IOAS,
with iommu_domains attached. Upon copy the PFNs will be read out of the
xarray and mapped into the iommu_domains, avoiding any pin_user_pages()
overheads.
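
To help readers keep the pieces straight, a rough sketch of the
relationships described above, using field names that appear in this patch
and the review below; the real definitions live in io_pagetable.h and
iommufd_private.h and are not reproduced verbatim here:

	struct io_pagetable {
		struct rw_semaphore	domains_rwsem;
		struct xarray		domains;	/* attached iommu_domains */
		unsigned int		next_domain_id;

		struct rw_semaphore	iova_rwsem;
		struct rb_root_cached	area_itree;	/* interval tree of iopt_area */
		struct rb_root_cached	reserved_iova_itree; /* IOVA that can't be mapped */
		unsigned long		iova_alignment;
	};

	struct iopt_area {
		struct interval_tree_node node;		/* IOVA range in area_itree */
		struct interval_tree_node pages_node;	/* slice of the iopt_pages */
		struct io_pagetable	*iopt;
		struct iopt_pages	*pages;		/* PFN storage (xarray/domains) */
		unsigned int		page_offset;
		int			iommu_prot;
		atomic_t		num_users;
	};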

Signed-off-by: Jason Gunthorpe 
---
 drivers/iommu/iommufd/Makefile  |   1 +
 drivers/iommu/iommufd/io_pagetable.c| 890 
 drivers/iommu/iommufd/iommufd_private.h |  35 +
 3 files changed, 926 insertions(+)
 create mode 100644 drivers/iommu/iommufd/io_pagetable.c

diff --git a/drivers/iommu/iommufd/Makefile b/drivers/iommu/iommufd/Makefile
index 05a0e91e30afad..b66a8c47ff55ec 100644
--- a/drivers/iommu/iommufd/Makefile
+++ b/drivers/iommu/iommufd/Makefile
@@ -1,5 +1,6 @@
 # SPDX-License-Identifier: GPL-2.0-only
 iommufd-y := \
+   io_pagetable.o \
main.o \
pages.o
 
diff --git a/drivers/iommu/iommufd/io_pagetable.c b/drivers/iommu/iommufd/io_pagetable.c
new file mode 100644
index 00..f9f3b06946bfb9
--- /dev/null
+++ b/drivers/iommu/iommufd/io_pagetable.c
@@ -0,0 +1,890 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2021-2022, NVIDIA CORPORATION & AFFILIATES.
+ *
+ * The io_pagetable is the top of datastructure that maps IOVA's to PFNs. The
+ * PFNs can be placed into an iommu_domain, or returned to the caller as a page
+ * list for access by an in-kernel user.
+ *
+ * The datastructure uses the iopt_pages to optimize the storage of the PFNs
+ * between the domains and xarray.
+ */
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+#include "io_pagetable.h"
+
+static unsigned long iopt_area_iova_to_index(struct iopt_area *area,
+unsigned long iova)
+{
+   if (IS_ENABLED(CONFIG_IOMMUFD_TEST))
+   WARN_ON(iova < iopt_area_iova(area) ||
+   iova > iopt_area_last_iova(area));
+   return (iova - (iopt_area_iova(area) & PAGE_MASK)) / PAGE_SIZE;
+}
+
+static struct iopt_area *iopt_find_exact_area(struct io_pagetable *iopt,
+ unsigned long iova,
+ unsigned long last_iova)
+{
+   struct iopt_area *area;
+
+   area = iopt_area_iter_first(iopt, iova, last_iova);
+   if (!area || !area->pages || iopt_area_iova(area) != iova ||
+   iopt_area_last_iova(area) != last_iova)
+   return NULL;
+   return area;
+}
+
+static bool __alloc_iova_check_hole(struct interval_tree_span_iter *span,
+   unsigned long length,
+   unsigned long iova_alignment,
+   unsigned long page_offset)
+{
+   if (!span->is_hole || span->last_hole - span->start_hole < length - 1)
+   return false;
+
+   span->start_hole =
+   ALIGN(span->start_hole, iova_alignment) | page_offset;
+   if (span->start_hole > span->last_hole ||
+   span->last_hole - span->start_hole < length - 1)
+   return false;
+   return true;
+}
+
+/*
+ * Automatically find a block of IOVA that is not being used and not reserved.
+ * Does not return a 0 IOVA even if it is valid.
+ */
+static int iopt_alloc_iova(struct io_pagetable *iopt, unsigned long *iova,
+  unsigned long uptr, unsigned long length)
+{
+   struct interval_tree_span_iter reserved_span;
+   unsigned long page_offset = uptr % PAGE_SIZE;
+   struct interval_tree_span_iter area_span;
+   unsigned long iova_alignment;
+
+   lockdep_assert_held(&iopt->iova_rwsem);
+
+   /* Protect roundup_pow-of_two() from overflow */
+   if (length == 0 || length >= ULONG_MAX / 2)
+   return -EOVERFLOW;
+
+   /*
+* Keep alignment present in the uptr when building the IOVA, this
+* increases the chance we can map a THP.
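
The archive cuts the function off here. One plausible shape for that
alignment choice, sketched under the assumption that the alignment is
derived from the mapping length and the low bits of uptr; not necessarily
the RFC's exact code:

	/* Sketch: pick the largest power-of-two alignment the mapping can
	 * use, bounded by the alignment already present in uptr, and never
	 * below the minimum required by the attached domains. */
	iova_alignment = roundup_pow_of_two(length);
	if (uptr)
		iova_alignment = min_t(unsigned long, iova_alignment,
				       1UL << __ffs64(uptr));
	if (iova_alignment < iopt->iova_alignment)
		iova_alignment = iopt->iova_alignment;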