[PATCH 2/2] vfio: hugepage support for vfio_iommu_type1
We currently send all mappings to the iommu in PAGE_SIZE chunks, which prevents the iommu from enabling support for larger page sizes. We still need to pin pages, which means we step through them in PAGE_SIZE chunks, but we can batch up contiguous physical memory chunks to allow the iommu the opportunity to use larger pages. The approach here is a bit different that the one currently used for legacy KVM device assignment. Rather than looking at the vma page size and using that as the maximum size to pass to the iommu, we instead simply look at whether the next page is physically contiguous. This means we might ask the iommu to map a 4MB region, while legacy KVM might limit itself to a maximum of 2MB. Splitting our mapping path also allows us to be smarter about locked memory because we can more easily unwind if the user attempts to exceed the limit. Therefore, rather than assuming that a mapping will result in locked memory, we test each page as it is pinned to determine whether it locks RAM vs an mmap'd MMIO region. This should result in better locking granularity and less locked page fudge factors in userspace. The unmap path uses the same algorithm as legacy KVM. We don't want to track the pfn for each mapping ourselves, but we need the pfn in order to unpin pages. We therefore ask the iommu for the iova to physical address translation, ask it to unpin a page, and see how many pages were actually unpinned. iommus supporting large pages will often return something bigger than a page here, which we know will be physically contiguous and we can unpin a batch of pfns. iommus not supporting large mappings won't see an improvement in batching here as they only unmap a page at a time. With this change, we also make a clarification to the API for mapping and unmapping DMA. We can only guarantee unmaps at the same granularity as used for the original mapping. In other words, unmapping a subregion of a previous mapping is not guaranteed and may result in a larger or smaller unmapping than requested. The size field in the unmapping structure is updated to reflect this. Previously this was unmodified on mapping, always returning the the requested unmap size. This is now updated to return the actual unmap size on success, allowing userspace to appropriately track mappings. Signed-off-by: Alex Williamson --- drivers/vfio/vfio_iommu_type1.c | 523 +-- include/uapi/linux/vfio.h |8 - 2 files changed, 344 insertions(+), 187 deletions(-) diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c index 0e863b3..6654a7e 100644 --- a/drivers/vfio/vfio_iommu_type1.c +++ b/drivers/vfio/vfio_iommu_type1.c @@ -60,7 +60,7 @@ struct vfio_dma { struct rb_node node; dma_addr_t iova; /* Device address */ unsigned long vaddr; /* Process virtual addr */ - longnpage; /* Number of pages */ + size_t size; /* Map size (bytes) */ int prot; /* IOMMU_READ/WRITE */ }; @@ -74,8 +74,6 @@ struct vfio_group { * into DMA'ble space using the IOMMU */ -#define NPAGE_TO_SIZE(npage) ((size_t)(npage) << PAGE_SHIFT) - static struct vfio_dma *vfio_find_dma(struct vfio_iommu *iommu, dma_addr_t start, size_t size) { @@ -86,7 +84,7 @@ static struct vfio_dma *vfio_find_dma(struct vfio_iommu *iommu, if (start + size <= dma->iova) node = node->rb_left; - else if (start >= dma->iova + NPAGE_TO_SIZE(dma->npage)) + else if (start >= dma->iova + dma->size) node = node->rb_right; else return dma; @@ -104,7 +102,7 @@ static void vfio_insert_dma(struct vfio_iommu *iommu, struct vfio_dma *new) parent = *link; dma = rb_entry(parent, struct vfio_dma, node); - if (new->iova + NPAGE_TO_SIZE(new->npage) <= dma->iova) + if (new->iova + new->size <= dma->iova) link = &(*link)->rb_left; else link = &(*link)->rb_right; @@ -144,8 +142,8 @@ static void vfio_lock_acct(long npage) struct vwork *vwork; struct mm_struct *mm; - if (!current->mm) - return; /* process exited */ + if (!current->mm || !npage) + return; /* process exited or nothing to do */ if (down_write_trylock(¤t->mm->mmap_sem)) { current->mm->locked_vm += npage; @@ -217,33 +215,6 @@ static int put_pfn(unsigned long pfn, int prot) return 0; } -/* Unmap DMA region */ -static long __vfio_dma_do_unmap(struct vfio_iommu *iommu, dma_addr_t iova, -long npage, int prot) -{ - long i, unlocked = 0; - - for (i = 0; i < npage;
[PATCH 1/2] vfio: Convert type1 iommu to use rbtree
We need to keep track of all the DMA mappings of an iommu container so that it can be automatically unmapped when the user releases the file descriptor. We currently do this using a simple list, where we merge entries with contiguous iovas and virtual addresses. Using a tree for this is a bit more efficient and allows us to use common code instead of inventing our own. Signed-off-by: Alex Williamson --- drivers/vfio/vfio_iommu_type1.c | 190 --- 1 file changed, 96 insertions(+), 94 deletions(-) diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c index 6f3fbc4..0e863b3 100644 --- a/drivers/vfio/vfio_iommu_type1.c +++ b/drivers/vfio/vfio_iommu_type1.c @@ -31,6 +31,7 @@ #include #include #include /* pci_bus_type */ +#include #include #include #include @@ -50,13 +51,13 @@ MODULE_PARM_DESC(allow_unsafe_interrupts, struct vfio_iommu { struct iommu_domain *domain; struct mutexlock; - struct list_headdma_list; + struct rb_root dma_list; struct list_headgroup_list; boolcache; }; struct vfio_dma { - struct list_headnext; + struct rb_node node; dma_addr_t iova; /* Device address */ unsigned long vaddr; /* Process virtual addr */ longnpage; /* Number of pages */ @@ -75,6 +76,49 @@ struct vfio_group { #define NPAGE_TO_SIZE(npage) ((size_t)(npage) << PAGE_SHIFT) +static struct vfio_dma *vfio_find_dma(struct vfio_iommu *iommu, + dma_addr_t start, size_t size) +{ + struct rb_node *node = iommu->dma_list.rb_node; + + while (node) { + struct vfio_dma *dma = rb_entry(node, struct vfio_dma, node); + + if (start + size <= dma->iova) + node = node->rb_left; + else if (start >= dma->iova + NPAGE_TO_SIZE(dma->npage)) + node = node->rb_right; + else + return dma; + } + + return NULL; +} + +static void vfio_insert_dma(struct vfio_iommu *iommu, struct vfio_dma *new) +{ + struct rb_node **link = &iommu->dma_list.rb_node, *parent = NULL; + struct vfio_dma *dma; + + while (*link) { + parent = *link; + dma = rb_entry(parent, struct vfio_dma, node); + + if (new->iova + NPAGE_TO_SIZE(new->npage) <= dma->iova) + link = &(*link)->rb_left; + else + link = &(*link)->rb_right; + } + + rb_link_node(&new->node, parent, link); + rb_insert_color(&new->node, &iommu->dma_list); +} + +static void vfio_remove_dma(struct vfio_iommu *iommu, struct vfio_dma *old) +{ + rb_erase(&old->node, &iommu->dma_list); +} + struct vwork { struct mm_struct*mm; longnpage; @@ -289,31 +333,8 @@ static int __vfio_dma_map(struct vfio_iommu *iommu, dma_addr_t iova, return 0; } -static inline bool ranges_overlap(dma_addr_t start1, size_t size1, - dma_addr_t start2, size_t size2) -{ - if (start1 < start2) - return (start2 - start1 < size1); - else if (start2 < start1) - return (start1 - start2 < size2); - return (size1 > 0 && size2 > 0); -} - -static struct vfio_dma *vfio_find_dma(struct vfio_iommu *iommu, - dma_addr_t start, size_t size) -{ - struct vfio_dma *dma; - - list_for_each_entry(dma, &iommu->dma_list, next) { - if (ranges_overlap(dma->iova, NPAGE_TO_SIZE(dma->npage), - start, size)) - return dma; - } - return NULL; -} - -static long vfio_remove_dma_overlap(struct vfio_iommu *iommu, dma_addr_t start, - size_t size, struct vfio_dma *dma) +static int vfio_remove_dma_overlap(struct vfio_iommu *iommu, dma_addr_t start, + size_t size, struct vfio_dma *dma) { struct vfio_dma *split; long npage_lo, npage_hi; @@ -322,10 +343,9 @@ static long vfio_remove_dma_overlap(struct vfio_iommu *iommu, dma_addr_t start, if (start <= dma->iova && start + size >= dma->iova + NPAGE_TO_SIZE(dma->npage)) { vfio_dma_unmap(iommu, dma->iova, dma->npage, dma->prot); - list_del(&dma->next); - npage_lo = dma->npage; + vfio_remove_dma(iommu, dma); kfree(dma); - return npage_lo; + return 0; } /* Overlap low address of existing range */ @@ -339,7 +359,7 @@ static long vfio_remove_dma_overlap(struct vfio_iommu *iommu, dma_addr_t start, dma->iova += ov
[PATCH 0/2] vfio: type1 iommu hugepage support
This series let's the vfio type1 iommu backend take advantage of iommu large page support. See patch 2/2 for the details. This has been tested on both amd_iommu and intel_iommu, but only my AMD system has large page support. I'd appreciate any testing and feedback on other systems, particularly vt-d systems supporting large pages. Mapping efficiency should be improved a bit without iommu hugepages, but I hope that it's much more noticeable with huge pages, especially for very large QEMU guests. This change includes a clarification to the mapping expectations for users of the type1 iommu, but is compatible with known users and works with existing QEMU userspace supporting vfio. Thanks, Alex --- Alex Williamson (2): vfio: Convert type1 iommu to use rbtree vfio: hugepage support for vfio_iommu_type1 drivers/vfio/vfio_iommu_type1.c | 607 --- include/uapi/linux/vfio.h |8 - 2 files changed, 387 insertions(+), 228 deletions(-) ___ iommu mailing list iommu@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/iommu
[PATCH] iommu: Split iommu_unmaps
iommu_map splits requests into pages that the iommu driver reports that it can handle. The iommu_unmap path does not do the same. This can cause problems not only from callers that might expect the same behavior as the map path, but even from the failure path of iommu_map, should it fail at a point where it has mapped and needs to unwind a set of pages that the iommu driver cannot handle directly. amd_iommu, for example, will BUG_ON if asked to unmap a non power of 2 size. Fix this by extracting and generalizing the sizing code from the iommu_map path and use it for both map and unmap. Signed-off-by: Alex Williamson --- drivers/iommu/iommu.c | 63 +++-- 1 file changed, 35 insertions(+), 28 deletions(-) diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c index d8f98b1..4b0b56b 100644 --- a/drivers/iommu/iommu.c +++ b/drivers/iommu/iommu.c @@ -754,6 +754,38 @@ int iommu_domain_has_cap(struct iommu_domain *domain, } EXPORT_SYMBOL_GPL(iommu_domain_has_cap); +static size_t iommu_pgsize(struct iommu_domain *domain, + unsigned long addr_merge, size_t size) +{ + unsigned int pgsize_idx; + size_t pgsize; + + /* Max page size that still fits into 'size' */ + pgsize_idx = __fls(size); + + /* need to consider alignment requirements ? */ + if (likely(addr_merge)) { + /* Max page size allowed by address */ + unsigned int align_pgsize_idx = __ffs(addr_merge); + pgsize_idx = min(pgsize_idx, align_pgsize_idx); + } + + /* build a mask of acceptable page sizes */ + pgsize = (1UL << (pgsize_idx + 1)) - 1; + + /* throw away page sizes not supported by the hardware */ + pgsize &= domain->ops->pgsize_bitmap; + + /* make sure we're still sane */ + BUG_ON(!pgsize); + + /* pick the biggest page */ + pgsize_idx = __fls(pgsize); + pgsize = 1UL << pgsize_idx; + + return pgsize; +} + int iommu_map(struct iommu_domain *domain, unsigned long iova, phys_addr_t paddr, size_t size, int prot) { @@ -785,32 +817,7 @@ int iommu_map(struct iommu_domain *domain, unsigned long iova, (unsigned long)paddr, (unsigned long)size); while (size) { - unsigned long pgsize, addr_merge = iova | paddr; - unsigned int pgsize_idx; - - /* Max page size that still fits into 'size' */ - pgsize_idx = __fls(size); - - /* need to consider alignment requirements ? */ - if (likely(addr_merge)) { - /* Max page size allowed by both iova and paddr */ - unsigned int align_pgsize_idx = __ffs(addr_merge); - - pgsize_idx = min(pgsize_idx, align_pgsize_idx); - } - - /* build a mask of acceptable page sizes */ - pgsize = (1UL << (pgsize_idx + 1)) - 1; - - /* throw away page sizes not supported by the hardware */ - pgsize &= domain->ops->pgsize_bitmap; - - /* make sure we're still sane */ - BUG_ON(!pgsize); - - /* pick the biggest page */ - pgsize_idx = __fls(pgsize); - pgsize = 1UL << pgsize_idx; + size_t pgsize = iommu_pgsize(domain, iova | paddr, size); pr_debug("mapping: iova 0x%lx pa 0x%lx pgsize %lu\n", iova, (unsigned long)paddr, pgsize); @@ -863,9 +870,9 @@ size_t iommu_unmap(struct iommu_domain *domain, unsigned long iova, size_t size) * or we hit an area that isn't mapped. */ while (unmapped < size) { - size_t left = size - unmapped; + size_t pgsize = iommu_pgsize(domain, iova, size - unmapped); - unmapped_page = domain->ops->unmap(domain, iova, left); + unmapped_page = domain->ops->unmap(domain, iova, pgsize); if (!unmapped_page) break; ___ iommu mailing list iommu@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/iommu