The patch introduces VFIO support on POWER.

The patch consists of:

1. An IOMMU backend driver for VFIO.
It does not use the generic IOMMU API at all; instead it calls the
POWER IOMMU API (the ppc_md callbacks) directly.

2. A piece of code (module_init) which creates IOMMU groups.
TBD: what is a better place for it?
A sketch of the intended userspace usage is included below.

The patch applies on top of
git://github.com/awilliam/linux-vfio.git iommu-group-vfio-20120523
(which is iommu-group-vfio-20120521 + some fixes)
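
For reference, here is a minimal sketch of how userspace is expected to
drive this interface. The container/group chardevs and the
VFIO_GROUP_SET_CONTAINER/VFIO_SET_IOMMU ioctls come from the generic
VFIO core in that tree; the group number and the buffer setup are made
up for illustration, and error checking is omitted. The tce_iommu_*
structures and POWERPC_IOMMU_* ioctl numbers are defined in the patch
itself and would have to be duplicated in userspace until they move
into a uapi header:

	#include <fcntl.h>
	#include <sys/ioctl.h>
	#include <sys/mman.h>
	#include <linux/vfio.h>

	int main(void)
	{
		int container = open("/dev/vfio/vfio", O_RDWR);
		int group = open("/dev/vfio/26", O_RDWR); /* example group */

		/* Bind the group to a container, select the POWER backend */
		ioctl(group, VFIO_GROUP_SET_CONTAINER, &container);
		ioctl(container, VFIO_SET_IOMMU, POWERPC_IOMMU);

		/* Query the 32-bit DMA window of the attached group */
		struct tce_iommu_info info = { .argsz = sizeof(info) };
		ioctl(container, POWERPC_IOMMU_GET_INFO, &info);

		/* Map one page of our memory at the start of the window */
		void *va = mmap(NULL, 4096, PROT_READ | PROT_WRITE,
				MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
		struct tce_iommu_dma_map map = {
			.argsz = sizeof(map),
			.va = (__u64)(unsigned long)va,
			.dmaaddr = info.dma32_window_start,
		};
		ioctl(container, POWERPC_IOMMU_MAP_DMA, &map);

		/* ... program the device to DMA to map.dmaaddr ... */

		ioctl(container, POWERPC_IOMMU_UNMAP_DMA, &map);
		return 0;
	}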

Signed-off-by: Alexey Kardashevskiy <a...@ozlabs.ru>
---
 arch/powerpc/Kconfig             |    6 +
 arch/powerpc/include/asm/iommu.h |    3 +
 arch/powerpc/kernel/Makefile     |    1 +
 arch/powerpc/kernel/iommu_vfio.c |  371 ++++++++++++++++++++++++++++++++++++++
 4 files changed, 381 insertions(+), 0 deletions(-)
 create mode 100644 arch/powerpc/kernel/iommu_vfio.c

diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index feab3ba..13d12ac 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -319,6 +319,12 @@ config 8XX_MINIMAL_FPEMU
 config IOMMU_HELPER
        def_bool PPC64
 
+config IOMMU_VFIO
+       tristate "Enable IOMMU chardev to support user-space PCI"
+       depends on PPC64
+       select IOMMU_API
+       default n
+
 config SWIOTLB
        bool "SWIOTLB support"
        default n
diff --git a/arch/powerpc/include/asm/iommu.h b/arch/powerpc/include/asm/iommu.h
index 957a83f..c64bce7 100644
--- a/arch/powerpc/include/asm/iommu.h
+++ b/arch/powerpc/include/asm/iommu.h
@@ -66,6 +66,9 @@ struct iommu_table {
        unsigned long  it_halfpoint; /* Breaking point for small/large allocs */
        spinlock_t     it_lock;      /* Protects it_map */
        unsigned long *it_map;       /* A simple allocation bitmap for now */
+#ifdef CONFIG_IOMMU_API
+       struct iommu_group *it_group;
+#endif
 };
 
 struct scatterlist;
diff --git a/arch/powerpc/kernel/Makefile b/arch/powerpc/kernel/Makefile
index f5808a3..7cfd68e 100644
--- a/arch/powerpc/kernel/Makefile
+++ b/arch/powerpc/kernel/Makefile
@@ -90,6 +90,7 @@ obj-$(CONFIG_RELOCATABLE_PPC32)       += reloc_32.o
 
 obj-$(CONFIG_PPC32)            += entry_32.o setup_32.o
 obj-$(CONFIG_PPC64)            += dma-iommu.o iommu.o
+obj-$(CONFIG_IOMMU_VFIO)       += iommu_vfio.o
 obj-$(CONFIG_KGDB)             += kgdb.o
 obj-$(CONFIG_PPC_OF_BOOT_TRAMPOLINE)   += prom_init.o
 obj-$(CONFIG_MODULES)          += ppc_ksyms.o
diff --git a/arch/powerpc/kernel/iommu_vfio.c b/arch/powerpc/kernel/iommu_vfio.c
new file mode 100644
index 0000000..68a93dd
--- /dev/null
+++ b/arch/powerpc/kernel/iommu_vfio.c
@@ -0,0 +1,371 @@
+/*
+ * VFIO: IOMMU DMA mapping support for TCE on POWER
+ *
+ * Copyright (C) 2012 IBM Corp.  All rights reserved.
+ *     Author: Alexey Kardashevskiy <a...@ozlabs.ru>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * Derived from original vfio_iommu_x86.c:
+ * Copyright (C) 2012 Red Hat, Inc.  All rights reserved.
+ *     Author: Alex Williamson <alex.william...@redhat.com>
+ */
+
+#include <linux/module.h>
+#include <linux/pci.h>
+#include <linux/slab.h>
+#include <linux/uaccess.h>
+#include <linux/vfio.h>
+#include <linux/err.h>
+#include <linux/spinlock.h>
+#include <asm/iommu.h>
+
+#define DRIVER_VERSION  "0.1"
+#define DRIVER_AUTHOR   "a...@ozlabs.ru"
+#define DRIVER_DESC     "POWER IOMMU chardev for VFIO"
+
+#define IOMMU_CHECK_EXTENSION  _IO(VFIO_TYPE, VFIO_BASE + 1)
+
+/* -------- API for POWERPC IOMMU -------- */
+
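+/* The IOMMU type value userspace passes via VFIO_SET_IOMMU to pick this backend */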
+#define POWERPC_IOMMU          2
+
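+/*
+ * ioctl(POWERPC_IOMMU_GET_INFO) payload: reports the 32-bit DMA window
+ * (bus address offset and size) of the attached group's TCE table.
+ */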
+struct tce_iommu_info {
+       __u32 argsz;
+       __u32 dma32_window_start;
+       __u32 dma32_window_size;
+};
+
+#define POWERPC_IOMMU_GET_INFO _IO(VFIO_TYPE, VFIO_BASE + 12)
+
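+/*
+ * ioctl(POWERPC_IOMMU_MAP_DMA/UNMAP_DMA) payload: maps/unmaps a single
+ * IOMMU page; va is the process virtual address, dmaaddr the bus address
+ * within the DMA window. UNMAP_DMA only looks at dmaaddr.
+ */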
+struct tce_iommu_dma_map {
+       __u32 argsz;
+       __u64 va;
+       __u64 dmaaddr;
+};
+
+#define POWERPC_IOMMU_MAP_DMA  _IO(VFIO_TYPE, VFIO_BASE + 13)
+#define POWERPC_IOMMU_UNMAP_DMA        _IO(VFIO_TYPE, VFIO_BASE + 14)
+
+/* ***************************************************************** */
+
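+/* Per-container driver state: the TCE table of the (single) attached group */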
+struct tce_iommu {
+       struct iommu_table *tbl;
+};
+
+static int tce_iommu_attach_group(void *iommu_data,
+               struct iommu_group *iommu_group)
+{
+       struct tce_iommu *tceiommu = iommu_data;
+
+       if (tceiommu->tbl) {
+               printk(KERN_ERR "tce_iommu: only one group per IOMMU "
+                               "instance is allowed\n");
+               return -EBUSY;
+       }
+       tceiommu->tbl = iommu_group_get_iommudata(iommu_group);
+
+       return 0;
+}
+
+static void tce_iommu_detach_group(void *iommu_data,
+               struct iommu_group *iommu_group)
+{
+       struct tce_iommu *tceiommu = iommu_data;
+
+       if (!tceiommu->tbl) {
+               printk(KERN_ERR "tce_iommu: no group attached\n");
+               return;
+       }
+       tceiommu->tbl = NULL;
+}
+
+static void *tce_iommu_open(unsigned long arg)
+{
+       struct tce_iommu *tceiommu;
+
+       if (arg != POWERPC_IOMMU)
+               return ERR_PTR(-EINVAL);
+
+       tceiommu = kzalloc(sizeof(*tceiommu), GFP_KERNEL);
+       if (!tceiommu)
+               return ERR_PTR(-ENOMEM);
+
+       return tceiommu;
+}
+
+static void tce_iommu_release(void *iommu_data)
+{
+       struct tce_iommu *tceiommu = iommu_data;
+       kfree(tceiommu);
+}
+
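+/*
+ * Program a single TCE entry via the platform callbacks and flush the
+ * hardware TLB if required; serialized against other users of the
+ * table by tbl->it_lock.
+ */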
+static int tce_iommu_map(struct iommu_table *tbl, unsigned long iova,
+                  phys_addr_t paddr)
+{
+       unsigned long entry, flags;
+       int build_fail;
+
+       spin_lock_irqsave(&(tbl->it_lock), flags);
+       entry = iova >> IOMMU_PAGE_SHIFT;
+       build_fail = ppc_md.tce_build(tbl, entry, 1/*pages*/,
+                       (unsigned long)paddr & IOMMU_PAGE_MASK,
+                       DMA_BIDIRECTIONAL, NULL/*attrs*/);
+
+       /* ppc_md.tce_build() only returns non-zero for transient errors.
+        * Unlike the kernel DMA API path there is no allocation bitmap
+        * to clean up here, so simply fail the ioctl.
+        */
+       if (unlikely(build_fail)) {
+               printk(KERN_ERR "tce_iommu: failed to add TCE\n");
+               spin_unlock_irqrestore(&(tbl->it_lock), flags);
+               return -EFAULT;
+       }
+       /* Flush/invalidate TLB caches if necessary */
+       if (ppc_md.tce_flush)
+               ppc_md.tce_flush(tbl);
+
+       spin_unlock_irqrestore(&(tbl->it_lock), flags);
+
+       /* Make sure updates are seen by hardware */
+       mb();
+
+       return 0;
+}
+
+static void tce_iommu_unmap(struct iommu_table *tbl, unsigned long iova)
+{
+       unsigned long entry, flags;
+       entry = iova >> IOMMU_PAGE_SHIFT;
+
+       spin_lock_irqsave(&(tbl->it_lock), flags);
+       ppc_md.tce_free(tbl, entry, 1);
+       /* Flush/invalidate TLB caches if necessary */
+       if (ppc_md.tce_flush)
+               ppc_md.tce_flush(tbl);
+
+       spin_unlock_irqrestore(&(tbl->it_lock), flags);
+
+       /* Make sure updates are seen by hardware */
+       mb();
+}
+
+static phys_addr_t tce_iommu_iova_to_va(struct iommu_table *tbl,
+               unsigned long iova)
+{
+       unsigned long entry = iova >> IOMMU_PAGE_SHIFT;
+       phys_addr_t ret = 0;
+
+       if (ppc_md.tce_get)
+               ret = ppc_md.tce_get(tbl, entry);
+
+       return ret;
+}
+
+static struct page *tceaddr_to_page(void *addr)
+{
+       return pfn_to_page(__pa(addr) >> PAGE_SHIFT);
+}
+
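+/*
+ * Pin one page of the calling process' memory with get_user_pages_fast()
+ * and program its TCE entry; the page stays pinned until DMAUNMAP.
+ */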
+static long tce_dmamap_page(struct iommu_table *tbl,
+               uint64_t va, uint64_t dmaaddr)
+{
+       int ret = -EFAULT;
+       phys_addr_t addr;
+       struct page *page[1];
+       int iswrite = 1;
+       void *kva;
+
+       if (NULL == tbl) {
+               printk(KERN_ERR"tce_iommu: (map) IOMMU table has not "
+                               "been initialized yet!\n");
+               return -EFAULT;
+       }
+       addr = tce_iommu_iova_to_va(tbl, dmaaddr);
+       if (addr) {
+               printk(KERN_WARNING"tce_iommu: already mapped va=%llx "
+                               "da=%llx addr=%llx\n",
+                               va, dmaaddr, addr);
+               /*TODO: unmap! */
+       }
+
+       ret = get_user_pages_fast(va, 1, iswrite, page);
+       if (1 != ret) {
+               printk(KERN_ERR"tce_iommu: get_user_pages_fast failed "
+                               "va=%llx da=%llx addr=%llx ret=%d\n",
+                               va, dmaaddr, addr, ret);
+               return -EFAULT;
+       }
+       ret = -EFAULT;
+       kva = (void *) page_address(page[0]);
+       if (kva) {
+               ret = tce_iommu_map(tbl, dmaaddr, (phys_addr_t) kva);
+       }
+       if (ret) {
+               printk(KERN_ERR"tce_iommu: tce_iommu_map failed: va=%llx "
+                               "da=%llx kva=%p\n",
+                               va, dmaaddr, kva);
+               if (iswrite)
+                       SetPageDirty(page[0]);
+               put_page(page[0]);
+       }
+
+       return ret;
+}
+
+static long tce_dmaunmap_page(struct iommu_table *tbl, uint64_t dmaaddr)
+{
+       int ret = 0;
+       phys_addr_t addr;
+       struct page *page;
+
+       if (NULL == tbl) {
+               printk(KERN_ERR"tce_iommu: (unmap) IOMMU table has not been "
+                               "initialized yet!\n");
+               return -EFAULT;
+       }
+       addr = tce_iommu_iova_to_va(tbl, dmaaddr);
+       if (addr) {
+               page = tceaddr_to_page((void*)addr);
+               if (!page) {
+                       printk(KERN_ERR"DMAUNMAP error: pfn_to_page(%llx) "
+                                       "failed\n", addr);
+                       ret = -EFAULT;
+               } else {
+                       SetPageDirty(page);
+                       put_page(page);
+               }
+       }
+       tce_iommu_unmap(tbl, dmaaddr);
+       if (ret)
+               printk(KERN_ERR"Failed to DMAUNMAP: da=%llx pfn=%llx\n",
+                               dmaaddr, addr);
+       return ret;
+}
+
+
+static long tce_iommu_ioctl(void *iommu_data,
+                                unsigned int cmd, unsigned long arg)
+{
+       struct tce_iommu *tceiommu = iommu_data;
+       unsigned long minsz;
+
+       if (cmd == IOMMU_CHECK_EXTENSION) {
+               switch (arg) {
+               case POWERPC_IOMMU:
+                       return 1;
+               default:
+                       return 0;
+               }
+       } else if (cmd == POWERPC_IOMMU_GET_INFO) {
+               struct tce_iommu_info info;
+
+               minsz = offsetofend(struct tce_iommu_info, dma32_window_size);
+
+               if (copy_from_user(&info, (void __user *)arg, minsz))
+                       return -EFAULT;
+
+               if (info.argsz < minsz)
+                       return -EINVAL;
+
+               if (!tceiommu->tbl)
+                       return -ENXIO;
+
+               info.dma32_window_start =
+                               tceiommu->tbl->it_offset << IOMMU_PAGE_SHIFT;
+               info.dma32_window_size =
+                               tceiommu->tbl->it_size << IOMMU_PAGE_SHIFT;
+
+               if (copy_to_user((void __user *)arg, &info, minsz))
+                       return -EFAULT;
+
+               return 0;
+
+       } else if (cmd == POWERPC_IOMMU_MAP_DMA) {
+               struct tce_iommu_dma_map map;
+
+               minsz = offsetofend(struct tce_iommu_dma_map, dmaaddr);
+
+               if (copy_from_user(&map, (void __user *)arg, minsz))
+                       return -EFAULT;
+
+               if (map.argsz < minsz)
+                       return -EINVAL;
+
+               return tce_dmamap_page(tceiommu->tbl, map.va, map.dmaaddr);
+
+       } else if (cmd == POWERPC_IOMMU_UNMAP_DMA) {
+               struct tce_iommu_dma_map unmap;
+
+               minsz = offsetofend(struct tce_iommu_dma_map, dmaaddr);
+
+               if (copy_from_user(&unmap, (void __user *)arg, minsz))
+                       return -EFAULT;
+
+               if (unmap.argsz < minsz)
+                       return -EINVAL;
+
+               return tce_dmaunmap_page(tceiommu->tbl, unmap.dmaaddr);
+       }
+
+       return -ENOTTY;
+}
+
+static const struct vfio_iommu_driver_ops tce_iommu_driver_ops = {
+       .name           = "vfio-iommu-powerpc",
+       .owner          = THIS_MODULE,
+       .open           = tce_iommu_open,
+       .release        = tce_iommu_release,
+       .ioctl          = tce_iommu_ioctl,
+       .attach_group   = tce_iommu_attach_group,
+       .detach_group   = tce_iommu_detach_group,
+};
+
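+/*
+ * Module init: walk all PCI devices, allocate one IOMMU group per TCE
+ * table and add every device sharing that table to the table's group,
+ * then register the VFIO backend.
+ */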
+static int __init tce_iommu_init(void)
+{
+       struct pci_dev *pdev = NULL;
+       struct iommu_table *tbl = NULL;
+       struct iommu_group *grp = NULL;
+       int ret = 0;
+
+       /* TODO: Do this for all devices, not just for PCI */
+       for_each_pci_dev(pdev) {
+
+               tbl = get_iommu_table_base(&pdev->dev);
+               if (!tbl) {
+                       printk(KERN_DEBUG "tce_iommu: skipping device %s\n",
+                                       dev_name(&pdev->dev));
+                       continue;
+               }
+               if (!tbl->it_group) {
+                       struct iommu_group *tmp = iommu_group_alloc();
+                       if (IS_ERR(tmp)) {
+                               printk("Failed to create new IOMMU group, "
+                                               "ret = %ld\n", PTR_ERR(tmp));
+                               break;
+                       }
+                       tbl->it_group = tmp;
+                       iommu_group_set_iommudata(tmp, tbl, NULL);
+               }
+
+               grp = iommu_group_get(&pdev->dev);
+               if (!grp) {
+                       ret = iommu_group_add_device(tbl->it_group, &pdev->dev);
+                       if (ret < 0)
+                               printk(KERN_ERR "tce_iommu: "
+                                               "iommu_group_add_device(%s) "
+                                               "failed with %d\n",
+                                               dev_name(&pdev->dev), ret);
+               } else {
+                       /* iommu_group_get() took a reference, drop it */
+                       iommu_group_put(grp);
+               }
+       }
+
+       return vfio_register_iommu_driver(&tce_iommu_driver_ops);
+}
+
+static void __exit tce_iommu_cleanup(void)
+{
+       vfio_unregister_iommu_driver(&tce_iommu_driver_ops);
+}
+
+module_init(tce_iommu_init);
+module_exit(tce_iommu_cleanup);
+
+MODULE_VERSION(DRIVER_VERSION);
+MODULE_LICENSE("GPL v2");
+MODULE_AUTHOR(DRIVER_AUTHOR);
+MODULE_DESCRIPTION(DRIVER_DESC);
+

-- 
1.7.7.3
