[PATCH 2/2] vfio: hugepage support for vfio_iommu_type1

2013-05-24 Thread Alex Williamson
We currently send all mappings to the iommu in PAGE_SIZE chunks,
which prevents the iommu from enabling support for larger page sizes.
We still need to pin pages, which means we step through them in
PAGE_SIZE chunks, but we can batch up contiguous physical memory
chunks to allow the iommu the opportunity to use larger pages.  The
approach here is a bit different from the one currently used for
legacy KVM device assignment.  Rather than looking at the vma page
size and using that as the maximum size to pass to the iommu, we
instead simply look at whether the next page is physically
contiguous.  This means we might ask the iommu to map a 4MB region,
while legacy KVM might limit itself to a maximum of 2MB.
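
Roughly, the mapping idea looks like the sketch below (illustrative
only, not the patch code; vaddr_get_pfn() and put_pfn() are the
pinning helpers already in this file, signatures assumed, and locked
memory accounting and error unwind are omitted):

static int map_contiguous_runs(struct vfio_iommu *iommu, dma_addr_t iova,
                               unsigned long vaddr, long npage, int prot)
{
    long i, run;

    for (i = 0; i < npage; i += run) {
        unsigned long pfn, next;
        int ret;

        ret = vaddr_get_pfn(vaddr + (i << PAGE_SHIFT), prot, &pfn);
        if (ret)
            return ret;

        /* Extend the run while the next page is physically contiguous */
        for (run = 1; i + run < npage; run++) {
            if (vaddr_get_pfn(vaddr + ((i + run) << PAGE_SHIFT),
                              prot, &next))
                break;
            if (next != pfn + run) {
                put_pfn(next, prot);    /* not contiguous, unpin it */
                break;
            }
        }

        /* One iommu_map() per contiguous chunk rather than per page */
        ret = iommu_map(iommu->domain, iova + (i << PAGE_SHIFT),
                        (phys_addr_t)pfn << PAGE_SHIFT,
                        (size_t)run << PAGE_SHIFT, prot);
        if (ret)
            return ret;    /* unwind of already pinned pages omitted */
    }

    return 0;
}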

Splitting our mapping path also allows us to be smarter about locked
memory because we can more easily unwind if the user attempts to
exceed the limit.  Therefore, rather than assuming that a mapping
will result in locked memory, we test each page as it is pinned to
determine whether it locks RAM or is an mmap'd MMIO region.  This
should result in better locking granularity and fewer locked-page
fudge factors in userspace.
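
As a hedged fragment of how that accounting might look inside the
pinning loop (pfn and a local 'locked' counter assumed in scope;
is_invalid_reserved_pfn() and put_pfn() are existing helpers here):

    unsigned long limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;

    /* Reserved pages (e.g. an mmap'd MMIO BAR) don't lock RAM */
    if (!is_invalid_reserved_pfn(pfn)) {
        if (current->mm->locked_vm + locked + 1 > limit) {
            put_pfn(pfn, prot);   /* would exceed RLIMIT_MEMLOCK */
            return -ENOMEM;       /* caller unwinds earlier pins */
        }
        locked++;                 /* only pages that lock RAM count */
    }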

The unmap path uses the same algorithm as legacy KVM.  We don't want
to track the pfn for each mapping ourselves, but we need the pfn in
order to unpin pages.  We therefore ask the iommu for the iova to
physical address translation, ask it to unmap a page, and see how many
pages were actually unmapped.  iommus supporting large pages will
often return something bigger than a page here, which we know is
physically contiguous, so we can unpin a batch of pfns.  iommus that
don't support large mappings won't see an improvement in batching here
as they only unmap a page at a time.
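
In sketch form (domain, iova, size and prot assumed in scope,
accounting omitted, and size assumed to cover whole original mappings
per the API note below):

    while (size) {
        phys_addr_t phys = iommu_iova_to_phys(domain, iova);
        size_t unmapped;
        long i;

        /* Ask for one page; the iommu may unmap a whole large page */
        unmapped = iommu_unmap(domain, iova, PAGE_SIZE);
        if (!unmapped)
            break;

        /* Whatever was unmapped is physically contiguous: unpin it all */
        for (i = 0; i < (long)(unmapped >> PAGE_SHIFT); i++)
            put_pfn((phys >> PAGE_SHIFT) + i, prot);

        iova += unmapped;
        size -= unmapped;
    }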

With this change, we also make a clarification to the API for mapping
and unmapping DMA.  We can only guarantee unmaps at the same
granularity as used for the original mapping.  In other words,
unmapping a subregion of a previous mapping is not guaranteed and may
result in a larger or smaller unmapping than requested.  The size
field in the unmapping structure is updated to reflect this.
Previously this field was left unmodified, always reflecting the
requested unmap size.  It is now updated to return the actual unmap
size on success, allowing userspace to appropriately track mappings.
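
From userspace the updated contract looks like this sketch (assumes a
container fd with the type1 iommu enabled; the function name is made
up for illustration):

    #include <string.h>
    #include <sys/ioctl.h>
    #include <linux/vfio.h>

    static int unmap_region(int container, __u64 iova, __u64 size)
    {
        struct vfio_iommu_type1_dma_unmap unmap;

        memset(&unmap, 0, sizeof(unmap));
        unmap.argsz = sizeof(unmap);
        unmap.iova = iova;
        unmap.size = size;

        if (ioctl(container, VFIO_IOMMU_UNMAP_DMA, &unmap))
            return -1;

        /*
         * unmap.size now holds what was actually unmapped, which may
         * differ from the request if it hit a larger original mapping.
         */
        return unmap.size == size ? 0 : 1;
    }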

Signed-off-by: Alex Williamson 
---
 drivers/vfio/vfio_iommu_type1.c |  523 +--
 include/uapi/linux/vfio.h   |8 -
 2 files changed, 344 insertions(+), 187 deletions(-)

diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
index 0e863b3..6654a7e 100644
--- a/drivers/vfio/vfio_iommu_type1.c
+++ b/drivers/vfio/vfio_iommu_type1.c
@@ -60,7 +60,7 @@ struct vfio_dma {
struct rb_node  node;
dma_addr_t  iova;   /* Device address */
unsigned long   vaddr;  /* Process virtual addr */
-   long                npage;  /* Number of pages */
+   size_t  size;   /* Map size (bytes) */
int prot;   /* IOMMU_READ/WRITE */
 };
 
@@ -74,8 +74,6 @@ struct vfio_group {
  * into DMA'ble space using the IOMMU
  */
 
-#define NPAGE_TO_SIZE(npage)   ((size_t)(npage) << PAGE_SHIFT)
-
 static struct vfio_dma *vfio_find_dma(struct vfio_iommu *iommu,
  dma_addr_t start, size_t size)
 {
@@ -86,7 +84,7 @@ static struct vfio_dma *vfio_find_dma(struct vfio_iommu *iommu,
 
if (start + size <= dma->iova)
node = node->rb_left;
-   else if (start >= dma->iova + NPAGE_TO_SIZE(dma->npage))
+   else if (start >= dma->iova + dma->size)
node = node->rb_right;
else
return dma;
@@ -104,7 +102,7 @@ static void vfio_insert_dma(struct vfio_iommu *iommu, struct vfio_dma *new)
parent = *link;
dma = rb_entry(parent, struct vfio_dma, node);
 
-   if (new->iova + NPAGE_TO_SIZE(new->npage) <= dma->iova)
+   if (new->iova + new->size <= dma->iova)
link = &(*link)->rb_left;
else
link = &(*link)->rb_right;
@@ -144,8 +142,8 @@ static void vfio_lock_acct(long npage)
struct vwork *vwork;
struct mm_struct *mm;
 
-   if (!current->mm)
-   return; /* process exited */
+   if (!current->mm || !npage)
+   return; /* process exited or nothing to do */
 
if (down_write_trylock(&current->mm->mmap_sem)) {
current->mm->locked_vm += npage;
@@ -217,33 +215,6 @@ static int put_pfn(unsigned long pfn, int prot)
return 0;
 }
 
-/* Unmap DMA region */
-static long __vfio_dma_do_unmap(struct vfio_iommu *iommu, dma_addr_t iova,
-long npage, int prot)
-{
-   long i, unlocked = 0;
-
-   for (i = 0; i < npage; 

[PATCH 1/2] vfio: Convert type1 iommu to use rbtree

2013-05-24 Thread Alex Williamson
We need to keep track of all the DMA mappings of an iommu container so
that it can be automatically unmapped when the user releases the file
descriptor.  We currently do this using a simple list, where we merge
entries with contiguous iovas and virtual addresses.  Using a tree for
this is a bit more efficient and allows us to use common code instead
of inventing our own.

Signed-off-by: Alex Williamson 
---
 drivers/vfio/vfio_iommu_type1.c |  190 ---
 1 file changed, 96 insertions(+), 94 deletions(-)

diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
index 6f3fbc4..0e863b3 100644
--- a/drivers/vfio/vfio_iommu_type1.c
+++ b/drivers/vfio/vfio_iommu_type1.c
@@ -31,6 +31,7 @@
 #include 
 #include 
 #include  /* pci_bus_type */
+#include <linux/rbtree.h>
 #include 
 #include 
 #include 
@@ -50,13 +51,13 @@ MODULE_PARM_DESC(allow_unsafe_interrupts,
 struct vfio_iommu {
struct iommu_domain *domain;
struct mutex            lock;
-   struct list_head        dma_list;
+   struct rb_root          dma_list;
struct list_head        group_list;
bool                    cache;
 };
 
 struct vfio_dma {
-   struct list_head        next;
+   struct rb_node  node;
dma_addr_t  iova;   /* Device address */
unsigned long   vaddr;  /* Process virtual addr */
long                    npage;  /* Number of pages */
@@ -75,6 +76,49 @@ struct vfio_group {
 
 #define NPAGE_TO_SIZE(npage)   ((size_t)(npage) << PAGE_SHIFT)
 
+static struct vfio_dma *vfio_find_dma(struct vfio_iommu *iommu,
+ dma_addr_t start, size_t size)
+{
+   struct rb_node *node = iommu->dma_list.rb_node;
+
+   while (node) {
+   struct vfio_dma *dma = rb_entry(node, struct vfio_dma, node);
+
+   if (start + size <= dma->iova)
+   node = node->rb_left;
+   else if (start >= dma->iova + NPAGE_TO_SIZE(dma->npage))
+   node = node->rb_right;
+   else
+   return dma;
+   }
+
+   return NULL;
+}
+
+static void vfio_insert_dma(struct vfio_iommu *iommu, struct vfio_dma *new)
+{
+   struct rb_node **link = &iommu->dma_list.rb_node, *parent = NULL;
+   struct vfio_dma *dma;
+
+   while (*link) {
+   parent = *link;
+   dma = rb_entry(parent, struct vfio_dma, node);
+
+   if (new->iova + NPAGE_TO_SIZE(new->npage) <= dma->iova)
+   link = &(*link)->rb_left;
+   else
+   link = &(*link)->rb_right;
+   }
+
+   rb_link_node(&new->node, parent, link);
+   rb_insert_color(&new->node, &iommu->dma_list);
+}
+
+static void vfio_remove_dma(struct vfio_iommu *iommu, struct vfio_dma *old)
+{
+   rb_erase(&old->node, &iommu->dma_list);
+}
+
 struct vwork {
struct mm_struct        *mm;
long                    npage;
@@ -289,31 +333,8 @@ static int __vfio_dma_map(struct vfio_iommu *iommu, dma_addr_t iova,
return 0;
 }
 
-static inline bool ranges_overlap(dma_addr_t start1, size_t size1,
- dma_addr_t start2, size_t size2)
-{
-   if (start1 < start2)
-   return (start2 - start1 < size1);
-   else if (start2 < start1)
-   return (start1 - start2 < size2);
-   return (size1 > 0 && size2 > 0);
-}
-
-static struct vfio_dma *vfio_find_dma(struct vfio_iommu *iommu,
-   dma_addr_t start, size_t size)
-{
-   struct vfio_dma *dma;
-
-   list_for_each_entry(dma, &iommu->dma_list, next) {
-   if (ranges_overlap(dma->iova, NPAGE_TO_SIZE(dma->npage),
-  start, size))
-   return dma;
-   }
-   return NULL;
-}
-
-static long vfio_remove_dma_overlap(struct vfio_iommu *iommu, dma_addr_t start,
-   size_t size, struct vfio_dma *dma)
+static int vfio_remove_dma_overlap(struct vfio_iommu *iommu, dma_addr_t start,
+  size_t size, struct vfio_dma *dma)
 {
struct vfio_dma *split;
long npage_lo, npage_hi;
@@ -322,10 +343,9 @@ static long vfio_remove_dma_overlap(struct vfio_iommu *iommu, dma_addr_t start,
if (start <= dma->iova &&
start + size >= dma->iova + NPAGE_TO_SIZE(dma->npage)) {
vfio_dma_unmap(iommu, dma->iova, dma->npage, dma->prot);
-   list_del(&dma->next);
-   npage_lo = dma->npage;
+   vfio_remove_dma(iommu, dma);
kfree(dma);
-   return npage_lo;
+   return 0;
}
 
/* Overlap low address of existing range */
@@ -339,7 +359,7 @@ static long vfio_remove_dma_overlap(struct vfio_iommu *iommu, dma_addr_t start,
dma->iova += ov

[PATCH 0/2] vfio: type1 iommu hugepage support

2013-05-24 Thread Alex Williamson
This series lets the vfio type1 iommu backend take advantage of iommu
large page support.  See patch 2/2 for the details.  This has been
tested on both amd_iommu and intel_iommu, but only my AMD system has
large page support.  I'd appreciate any testing and feedback on other
systems, particularly vt-d systems supporting large pages.  Mapping
efficiency should be improved a bit without iommu hugepages, but I
hope that it's much more noticeable with huge pages, especially for
very large QEMU guests.

This change includes a clarification to the mapping expectations for
users of the type1 iommu, but is compatible with known users and works
with existing QEMU userspace supporting vfio.  Thanks,

Alex

---

Alex Williamson (2):
  vfio: Convert type1 iommu to use rbtree
  vfio: hugepage support for vfio_iommu_type1


 drivers/vfio/vfio_iommu_type1.c |  607 ---
 include/uapi/linux/vfio.h   |8 -
 2 files changed, 387 insertions(+), 228 deletions(-)


[PATCH] iommu: Split iommu_unmaps

2013-05-24 Thread Alex Williamson
iommu_map splits requests into pages that the iommu driver reports
that it can handle.  The iommu_unmap path does not do the same.  This
can cause problems not only from callers that might expect the same
behavior as the map path, but even from the failure path of iommu_map,
should it fail at a point where it has mapped and needs to unwind a
set of pages that the iommu driver cannot handle directly.  amd_iommu,
for example, will BUG_ON if asked to unmap a non power of 2 size.

Fix this by extracting and generalizing the sizing code from the
iommu_map path and use it for both map and unmap.
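
For illustration only (not part of the patch): a caller such as the
iommu_map failure path can now pass a non power of 2 size and let the
core split it, e.g.:

    /* Unwind three 4k pages in one call; the core now breaks this into
     * sizes the driver advertises instead of passing 12k straight down. */
    size_t unmapped = iommu_unmap(domain, iova, 3 * PAGE_SIZE);

    if (unmapped != 3 * PAGE_SIZE)
        pr_warn("only unmapped %zu bytes\n", unmapped);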

Signed-off-by: Alex Williamson 
---
 drivers/iommu/iommu.c |   63 +++--
 1 file changed, 35 insertions(+), 28 deletions(-)

diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c
index d8f98b1..4b0b56b 100644
--- a/drivers/iommu/iommu.c
+++ b/drivers/iommu/iommu.c
@@ -754,6 +754,38 @@ int iommu_domain_has_cap(struct iommu_domain *domain,
 }
 EXPORT_SYMBOL_GPL(iommu_domain_has_cap);
 
+static size_t iommu_pgsize(struct iommu_domain *domain,
+  unsigned long addr_merge, size_t size)
+{
+   unsigned int pgsize_idx;
+   size_t pgsize;
+
+   /* Max page size that still fits into 'size' */
+   pgsize_idx = __fls(size);
+
+   /* need to consider alignment requirements ? */
+   if (likely(addr_merge)) {
+   /* Max page size allowed by address */
+   unsigned int align_pgsize_idx = __ffs(addr_merge);
+   pgsize_idx = min(pgsize_idx, align_pgsize_idx);
+   }
+
+   /* build a mask of acceptable page sizes */
+   pgsize = (1UL << (pgsize_idx + 1)) - 1;
+
+   /* throw away page sizes not supported by the hardware */
+   pgsize &= domain->ops->pgsize_bitmap;
+
+   /* make sure we're still sane */
+   BUG_ON(!pgsize);
+
+   /* pick the biggest page */
+   pgsize_idx = __fls(pgsize);
+   pgsize = 1UL << pgsize_idx;
+
+   return pgsize;
+}
+
 int iommu_map(struct iommu_domain *domain, unsigned long iova,
  phys_addr_t paddr, size_t size, int prot)
 {
@@ -785,32 +817,7 @@ int iommu_map(struct iommu_domain *domain, unsigned long iova,
(unsigned long)paddr, (unsigned long)size);
 
while (size) {
-   unsigned long pgsize, addr_merge = iova | paddr;
-   unsigned int pgsize_idx;
-
-   /* Max page size that still fits into 'size' */
-   pgsize_idx = __fls(size);
-
-   /* need to consider alignment requirements ? */
-   if (likely(addr_merge)) {
-   /* Max page size allowed by both iova and paddr */
-   unsigned int align_pgsize_idx = __ffs(addr_merge);
-
-   pgsize_idx = min(pgsize_idx, align_pgsize_idx);
-   }
-
-   /* build a mask of acceptable page sizes */
-   pgsize = (1UL << (pgsize_idx + 1)) - 1;
-
-   /* throw away page sizes not supported by the hardware */
-   pgsize &= domain->ops->pgsize_bitmap;
-
-   /* make sure we're still sane */
-   BUG_ON(!pgsize);
-
-   /* pick the biggest page */
-   pgsize_idx = __fls(pgsize);
-   pgsize = 1UL << pgsize_idx;
+   size_t pgsize = iommu_pgsize(domain, iova | paddr, size);
 
pr_debug("mapping: iova 0x%lx pa 0x%lx pgsize %lu\n", iova,
(unsigned long)paddr, pgsize);
@@ -863,9 +870,9 @@ size_t iommu_unmap(struct iommu_domain *domain, unsigned long iova, size_t size)
 * or we hit an area that isn't mapped.
 */
while (unmapped < size) {
-   size_t left = size - unmapped;
+   size_t pgsize = iommu_pgsize(domain, iova, size - unmapped);
 
-   unmapped_page = domain->ops->unmap(domain, iova, left);
+   unmapped_page = domain->ops->unmap(domain, iova, pgsize);
if (!unmapped_page)
break;
 
