The patch adds the following functionality: 1. Implements VFIO-IOMMU host kernel driver support;
2. Implements an interface between SPAPR TCE and VFIO via sPAPRVFIOData's map/unmap hooks; 3. Implements a PHB scan for devices within the same IOMMU group. To use VFIO on the spapr platform, the "spapr-pci-host-bridge" device needs to be created with the mandatory "index" and "iommu" properties, such as: -device spapr-pci-host-bridge,busname=USB,iommu=4,index=5 where: "index" - PHB number which is used to build all other PHB properties such as MMIO window, BUID, etc; "iommu" - IOMMU ID which represents a Partitionable Endpoint (PE). Optional parameters are: "forceaddr" - forces QEMU to assign device:function from the host address; "multifunction" (registered by this patch under the property name "mf") - enables multifunction, which makes sense if the user wants to reuse the host configuration in the guest, e.g. for a NEC USB PCI adapter which is visible as a single device with 3 PCI functions; without this switch QEMU will create 3 devices with 1 function each; "scan" - controls the automatic scan: 0 disables it and lets the user pass only selected devices from the PE to QEMU, 1 (the default) scans devices but skips bridges, 2 scans everything; "busname" - name of the bus, it is used to connect vfio-pci devices with a PHB when scan is disabled.
If scan is disabled, no PCI device is automatically added and the user has to add them manually, as in the example below which adds a PHB and 3 PCI devices: -device spapr-pci-host-bridge,busname=USB,iommu=4,scan=0,index=5 \ -device vfio-pci,host=4:0:1.0,addr=1.0,bus=USB,multifunction=true \ -device vfio-pci,host=4:0:1.1,addr=1.1 \ -device vfio-pci,host=4:0:1.2,addr=1.2 Cc: David Gibson <da...@gibson.dropbear.id.au> Signed-off-by: Alexey Kardashevskiy <a...@ozlabs.ru> --- hw/spapr.h | 4 ++ hw/spapr_iommu.c | 111 ++++++++++++++++++++++++++++++++++++++ hw/spapr_iommu_vfio.h | 34 ++++++++++++ hw/spapr_pci.c | 129 +++++++++++++++++++++++++++++++++++++++++--- hw/spapr_pci.h | 6 +++ hw/vfio_pci.c | 62 +++++++++++++++++++++ linux-headers/linux/vfio.h | 27 ++++++++++ trace-events | 6 ++- 8 files changed, 370 insertions(+), 9 deletions(-) create mode 100644 hw/spapr_iommu_vfio.h diff --git a/hw/spapr.h b/hw/spapr.h index bc0cd27..0ecfae2 100644 --- a/hw/spapr.h +++ b/hw/spapr.h @@ -3,6 +3,7 @@ #include "dma.h" #include "hw/xics.h" +#include "hw/spapr_iommu_vfio.h" struct VIOsPAPRBus; struct sPAPRPHBState; @@ -406,4 +407,7 @@ int spapr_dma_dt(void *fdt, int node_off, const char *propname, int spapr_tcet_dma_dt(void *fdt, int node_off, const char *propname, DMAContext *dma); +DMAContext *spapr_vfio_init_dma(uint32_t liobn, int iommu_id, + sPAPRVFIOData *data); + #endif /* !defined (__HW_SPAPR_H__) */ diff --git a/hw/spapr_iommu.c b/hw/spapr_iommu.c index 94630c1..462f593 100644 --- a/hw/spapr_iommu.c +++ b/hw/spapr_iommu.c @@ -22,8 +22,10 @@ #include "kvm_ppc.h" #include "dma.h" #include "exec-memory.h" +#include "trace.h" #include "hw/spapr.h" +#include "hw/spapr_iommu_vfio.h" #include <libfdt.h> @@ -234,6 +236,101 @@ static target_ulong put_tce_emu(sPAPRTCETable *tcet, target_ulong ioba, return H_SUCCESS; } +typedef struct sPAPRVFIOTable { + DMAContext dma; + sPAPRVFIOData *data; + uint32_t liobn; + QLIST_ENTRY(sPAPRVFIOTable) list; +} sPAPRVFIOTable; + 
+QLIST_HEAD(vfio_tce_tables, sPAPRVFIOTable) vfio_tce_tables; + +DMAContext *spapr_vfio_init_dma(uint32_t liobn, int iommu_id, + sPAPRVFIOData *data) +{ + sPAPRVFIOTable *t; + + if (kvmppc_create_spapr_tce_iommu(liobn, iommu_id)) + return NULL; + + t = g_malloc0(sizeof(*t)); + t->data = data; + t->liobn = liobn; + + QLIST_INSERT_HEAD(&vfio_tce_tables, t, list); + + return &t->dma; +} + +static int put_tce_vfio(uint32_t liobn, target_ulong ioba, target_ulong *tces, + target_ulong tce_value, target_ulong npages) +{ + int i, ret; + bool found = false; + __u64 size = SPAPR_TCE_PAGE_SIZE; + sPAPRVFIOTable *t; + + QLIST_FOREACH(t, &vfio_tce_tables, list) { + if (t->liobn == liobn) { + found = true; + break; + } + } + if (!found) { + return H_CONTINUE; /* positive non-zero value */ + } + + for (i = 0; i < npages; ++i, ioba += SPAPR_TCE_PAGE_SIZE) { + target_ulong tce = tces ? tces[i] : tce_value; + + if (tce & SPAPR_TCE_PAGE_MASK) { + struct vfio_iommu_type1_dma_map param = { + .argsz = sizeof(param), + .iova = ioba, + .vaddr = (__u64)(uintptr_t) + qemu_get_ram_ptr(tce & ~SPAPR_TCE_PAGE_MASK), + .flags = 0, + .size = size + }; + + switch (tce & SPAPR_TCE_PAGE_MASK) { + case SPAPR_TCE_RO: + param.flags = VFIO_DMA_MAP_FLAG_READ; + break; + case SPAPR_TCE_WO: + param.flags = VFIO_DMA_MAP_FLAG_WRITE; + break; + case SPAPR_TCE_RW: + param.flags = VFIO_DMA_MAP_FLAG_READ | VFIO_DMA_MAP_FLAG_WRITE; + break; + } + + ret = t->data->map(t->data, ¶m); + trace_spapr_iommu("vfio map", liobn, ioba, tce, ret); + if (ret < 0) { + perror("spapr_tce map"); + return H_PARAMETER; + } + } else { + struct vfio_iommu_type1_dma_unmap param = { + .argsz = sizeof(param), + .iova = ioba, + .flags = 0, + .size = size + }; + + ret = t->data->unmap(t->data, ¶m); + trace_spapr_iommu("vfio unmap", liobn, ioba, 0, ret); + if (ret < 0) { + perror("spapr_tce unmap"); + return H_PARAMETER; + } + } + } + + return H_SUCCESS; +} + static target_ulong h_put_tce_indirect(PowerPCCPU *cpu, sPAPREnvironment *spapr, 
target_ulong opcode, target_ulong *args) @@ -260,6 +357,11 @@ static target_ulong h_put_tce_indirect(PowerPCCPU *cpu, } return ret; } + ret = put_tce_vfio(liobn, ioba, tces, -1, npages); + if (ret != H_CONTINUE) { + return ret; + } + #ifdef DEBUG_TCE fprintf(stderr, "%s on liobn=" TARGET_FMT_lx /*%s*/ " ioba 0x" TARGET_FMT_lx " TCE 0x" TARGET_FMT_lx "\n", @@ -294,6 +396,10 @@ static target_ulong h_stuff_tce(PowerPCCPU *cpu, sPAPREnvironment *spapr, } return ret; } + ret = put_tce_vfio(liobn, ioba, NULL, tce_value, npages); + if (ret != H_CONTINUE) { + return ret; + } #ifdef DEBUG_TCE fprintf(stderr, "%s on liobn=" TARGET_FMT_lx /*%s*/ " ioba 0x" TARGET_FMT_lx " TCE 0x" TARGET_FMT_lx "\n", @@ -310,6 +416,7 @@ static target_ulong h_put_tce(PowerPCCPU *cpu, sPAPREnvironment *spapr, target_ulong ioba = args[1]; target_ulong tce = args[2]; sPAPRTCETable *tcet = spapr_tce_find_by_liobn(liobn); + int ret; if (liobn & 0xFFFFFFFF00000000ULL) { hcall_dprintf("spapr_vio_put_tce on out-of-boundsw LIOBN " @@ -322,6 +429,10 @@ static target_ulong h_put_tce(PowerPCCPU *cpu, sPAPREnvironment *spapr, if (tcet) { return put_tce_emu(tcet, ioba, tce); } + ret = put_tce_vfio(liobn, ioba, &tce, -1, 1); + if (ret != H_CONTINUE) { + return ret; + } #ifdef DEBUG_TCE fprintf(stderr, "%s on liobn=" TARGET_FMT_lx /*%s*/ " ioba 0x" TARGET_FMT_lx " TCE 0x" TARGET_FMT_lx "\n", diff --git a/hw/spapr_iommu_vfio.h b/hw/spapr_iommu_vfio.h new file mode 100644 index 0000000..9c2fff3 --- /dev/null +++ b/hw/spapr_iommu_vfio.h @@ -0,0 +1,34 @@ +/* + * Definitions for VFIO IOMMU implementation for SPAPR TCE. + * + * Copyright (c) 2012 Alexey Kardashevskiy <a...@olabs.ru> + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. 
+ * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, see <http://www.gnu.org/licenses/>. + */ + +#if !defined(__HW_SPAPR_IOMMU_VFIO_H__) +#define __HW_SPAPR_IOMMU_VFIO_H__ + +#include <linux/vfio.h> + +typedef struct sPAPRVFIOData sPAPRVFIOData; +typedef struct sPAPRVFIOData { + struct vfio_iommu_spapr_tce_info info; + int (*map)(sPAPRVFIOData *data, struct vfio_iommu_type1_dma_map *par); + int (*unmap)(sPAPRVFIOData *data, struct vfio_iommu_type1_dma_unmap *par); +} sPAPRVFIOData; + +void spapr_register_vfio_container(int groupid, sPAPRVFIOData *data); + +#endif diff --git a/hw/spapr_pci.c b/hw/spapr_pci.c index a6885c4..2631332 100644 --- a/hw/spapr_pci.c +++ b/hw/spapr_pci.c @@ -22,6 +22,9 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN * THE SOFTWARE. 
*/ +#include <sys/types.h> +#include <dirent.h> + #include "hw.h" #include "pci.h" #include "msi.h" @@ -514,6 +517,94 @@ static DMAContext *spapr_pci_dma_context_fn(PCIBus *bus, void *opaque, return phb->dma; } +void spapr_register_vfio_container(int groupid, sPAPRVFIOData *data) +{ + sPAPRPHBState *phb; + + QLIST_FOREACH(phb, &spapr->phbs, list) { + if (phb->iommugroupid == groupid) { + phb->vfio_data = data; + phb->dma_window_start = phb->vfio_data->info.dma32_window_start; + phb->dma_window_size = phb->vfio_data->info.dma32_window_size; + phb->dma = spapr_vfio_init_dma(phb->dma_liobn, groupid, + phb->vfio_data); + return; + } + } +} + +static int spapr_pci_scan_vfio(sPAPRPHBState *sphb) +{ + PCIHostState *phb = PCI_HOST_BRIDGE(sphb); + char iommupath[256]; + DIR *dirp; + struct dirent *entry; + + if (!sphb->scan) { + trace_spapr_pci("autoscan disabled for ", sphb->dtbusname); + return 0; + } + + snprintf(iommupath, sizeof(iommupath), + "/sys/kernel/iommu_groups/%d/devices/", sphb->iommugroupid); + dirp = opendir(iommupath); + if (!dirp) { + fprintf(stderr, "failed to scan group=%d\n", sphb->iommugroupid); + return -1; + } + + while ((entry = readdir(dirp)) != NULL) { + char *tmp; + FILE *deviceclassfile; + unsigned deviceclass = 0, domainid, busid, devid, fnid; + char addr[32]; + DeviceState *dev; + + if (sscanf(entry->d_name, "%X:%X:%X.%x", + &domainid, &busid, &devid, &fnid) != 4) { + continue; + } + + tmp = g_strdup_printf("%s%s/class", iommupath, entry->d_name); + trace_spapr_pci("Reading device class from ", tmp); + + deviceclassfile = fopen(tmp, "r"); + if (deviceclassfile) { + fscanf(deviceclassfile, "%x", &deviceclass); + fclose(deviceclassfile); + } + g_free(tmp); + + if (!deviceclass) { + continue; + } + if ((sphb->scan < 2) && + ((deviceclass >> 16) == (PCI_CLASS_BRIDGE_OTHER >> 8))) { + /* Skip _any_ bridge */ + continue; + } + trace_spapr_pci("Creating device from ", entry->d_name); + + dev = qdev_create(&phb->bus->qbus, "vfio-pci"); + if (!dev) { + 
fprintf(stderr, "failed to create vfio-pci\n"); + continue; + } + qdev_prop_parse(dev, "host", entry->d_name); + if (sphb->force_addr) { + snprintf(addr, sizeof(addr), "%x.%x", devid, fnid); + qdev_prop_parse(dev, "addr", addr); + } + if (sphb->enable_multifunction) { + qdev_prop_set_bit(dev, "multifunction", 1); + } + qdev_init_nofail(dev); + } + closedir(dirp); + + return 0; +} + static int spapr_phb_init(SysBusDevice *s) { sPAPRPHBState *sphb = SPAPR_PCI_HOST_BRIDGE(s); @@ -627,13 +718,6 @@ static int spapr_phb_init(SysBusDevice *s) PCI_DEVFN(0, 0), PCI_NUM_PINS); phb->bus = bus; - sphb->dma_window_start = 0; - sphb->dma_window_size = 0x40000000; - sphb->dma = spapr_tce_new_dma_context(sphb->dma_liobn, sphb->dma_window_size); - if (!sphb->dma) { - fprintf(stderr, "Unable to create TCE table for %s\n", sphb->dtbusname); - return -1; - } pci_setup_iommu(bus, spapr_pci_dma_context_fn, sphb); QLIST_INSERT_HEAD(&spapr->phbs, sphb, list); @@ -650,6 +734,25 @@ static int spapr_phb_init(SysBusDevice *s) sphb->lsi_table[i].irq = irq; } + if (sphb->iommugroupid >= 0) { + if (spapr_pci_scan_vfio(sphb) < 0) { + return -1; + } + /* dma_window_xxxx will be initialized from + spapr_register_vfio_container() when VFIO will create the very first + device in the group */ + return 0; + } + + sphb->dma_window_start = 0; + sphb->dma_window_size = 0x40000000; + sphb->dma = spapr_tce_new_dma_context(sphb->dma_liobn, + sphb->dma_window_size); + if (!sphb->dma) { + fprintf(stderr, "Unable to create TCE table for %s\n", sphb->dtbusname); + return -1; + } + return 0; } @@ -659,7 +762,9 @@ static void spapr_phb_reset(DeviceState *qdev) sPAPRPHBState *sphb = SPAPR_PCI_HOST_BRIDGE(s); /* Reset the IOMMU state */ - spapr_tce_reset(sphb->dma); + if (sphb->iommugroupid == -1) { + spapr_tce_reset(sphb->dma); + } } static Property spapr_phb_properties[] = { @@ -674,6 +779,10 @@ static Property spapr_phb_properties[] = { DEFINE_PROP_HEX64("io_win_size", sPAPRPHBState, io_win_size, 
SPAPR_PCI_IO_WIN_SIZE), DEFINE_PROP_HEX64("msi_win_addr", sPAPRPHBState, msi_win_addr, -1), + DEFINE_PROP_INT32("iommu", sPAPRPHBState, iommugroupid, -1), + DEFINE_PROP_UINT8("scan", sPAPRPHBState, scan, 1), + DEFINE_PROP_UINT8("mf", sPAPRPHBState, enable_multifunction, 0), + DEFINE_PROP_UINT8("forceaddr", sPAPRPHBState, force_addr, 0), DEFINE_PROP_END_OF_LIST(), }; @@ -846,6 +955,10 @@ int spapr_populate_pci_dt(sPAPRPHBState *phb, _FDT(fdt_setprop(fdt, bus_off, "interrupt-map", &interrupt_map, sizeof(interrupt_map))); + if (!phb->dma_window_size) { + fprintf(stderr, "Unexpected error: DMA window is zero, exiting\n"); + exit(1); + } spapr_dma_dt(fdt, bus_off, "ibm,dma-window", phb->dma_liobn, phb->dma_window_start, phb->dma_window_size); diff --git a/hw/spapr_pci.h b/hw/spapr_pci.h index b05241d..41a9cb1 100644 --- a/hw/spapr_pci.h +++ b/hw/spapr_pci.h @@ -26,6 +26,7 @@ #include "hw/pci.h" #include "hw/pci_host.h" #include "hw/xics.h" +#include "hw/spapr_iommu_vfio.h" #define SPAPR_MSIX_MAX_DEVS 32 @@ -62,6 +63,11 @@ typedef struct sPAPRPHBState { uint32_t nvec; } msi_table[SPAPR_MSIX_MAX_DEVS]; + struct sPAPRVFIOData *vfio_data; + int32_t iommugroupid; + uint8_t scan; /* 0 don't scan 1 scan only devices 2 scan everything */ + uint8_t enable_multifunction, force_addr; + QLIST_ENTRY(sPAPRPHBState) list; } sPAPRPHBState; diff --git a/hw/vfio_pci.c b/hw/vfio_pci.c index 7c27834..7862193 100644 --- a/hw/vfio_pci.c +++ b/hw/vfio_pci.c @@ -39,6 +39,8 @@ #include "qemu-queue.h" #include "range.h" +#include "spapr_iommu_vfio.h" + /* #define DEBUG_VFIO */ #ifdef DEBUG_VFIO #define DPRINTF(fmt, ...) 
\ @@ -94,6 +96,7 @@ typedef struct VFIOContainer { /* enable abstraction to support various iommu backends */ union { MemoryListener listener; /* Used by type1 iommu */ + sPAPRVFIOData spapr; /* Used by SPAPR TCE (POWERPC) iommu */ }; void (*release)(struct VFIOContainer *); } iommu_data; @@ -1193,6 +1196,25 @@ static void vfio_listener_release(VFIOContainer *container) } /* + * sPAPR TCE DMA interface + */ +static int spapr_tce_map(sPAPRVFIOData *data, + struct vfio_iommu_type1_dma_map *param) +{ + VFIOContainer *container = container_of(data, VFIOContainer, + iommu_data.spapr); + return ioctl(container->fd, VFIO_IOMMU_MAP_DMA, param); +} + +static int spapr_tce_unmap(sPAPRVFIOData *data, + struct vfio_iommu_type1_dma_unmap *param) +{ + VFIOContainer *container = container_of(data, VFIOContainer, + iommu_data.spapr); + return ioctl(container->fd, VFIO_IOMMU_UNMAP_DMA, param); +} + +/* * Interrupt setup */ static void vfio_disable_interrupts(VFIODevice *vdev) @@ -1670,6 +1692,46 @@ static int vfio_connect_container(VFIOGroup *group) container->iommu_data.release = vfio_listener_release; memory_listener_register(&container->iommu_data.listener, &address_space_memory); + + } else if (ioctl(fd, VFIO_CHECK_EXTENSION, VFIO_SPAPR_TCE_IOMMU)) { + ret = ioctl(group->fd, VFIO_GROUP_SET_CONTAINER, &fd); + if (ret) { + error_report("vfio: failed to set group container: %s\n", + strerror(errno)); + g_free(container); + close(fd); + return -1; + } + + ret = ioctl(fd, VFIO_SET_IOMMU, VFIO_SPAPR_TCE_IOMMU); + if (ret) { + error_report("vfio: failed to set iommu for container: %s\n", + strerror(errno)); + g_free(container); + close(fd); + return -1; + } + + container->iommu_data.spapr.info.argsz = + sizeof(container->iommu_data.spapr.info); + ret = ioctl(fd, VFIO_IOMMU_SPAPR_TCE_GET_INFO, + &container->iommu_data.spapr.info); + if (ret) { + error_report("vfio: failed to get iommu info for container: %s\n", + strerror(errno)); + g_free(container); + close(fd); + return -1; + } + + 
/* + * At the moment of adding VFIO for SPAPR (server POWERPC), only one + * group per container is supported. This may change later. + */ + container->iommu_data.spapr.map = spapr_tce_map; + container->iommu_data.spapr.unmap = spapr_tce_unmap; + spapr_register_vfio_container(group->groupid, + &container->iommu_data.spapr); } else { error_report("vfio: No available IOMMU models\n"); g_free(container); diff --git a/linux-headers/linux/vfio.h b/linux-headers/linux/vfio.h index 4758d1b..92dc88b 100644 --- a/linux-headers/linux/vfio.h +++ b/linux-headers/linux/vfio.h @@ -22,6 +22,7 @@ /* Extensions */ #define VFIO_TYPE1_IOMMU 1 +#define VFIO_SPAPR_TCE_IOMMU 2 /* * The IOCTL interface is designed for extensibility by embedding the @@ -365,4 +366,30 @@ struct vfio_iommu_type1_dma_unmap { #define VFIO_IOMMU_UNMAP_DMA _IO(VFIO_TYPE, VFIO_BASE + 14) +/* -------- Additional API for SPAPR TCE (Server POWERPC) IOMMU -------- */ + +/* + * The SPAPR TCE info struct provides the information about the PCI bus + * address ranges available for DMA, these values are programmed into + * the hardware so the guest has to know that information. + * + * The IOMMU page size is always 4K. 
+ */ + +struct vfio_iommu_spapr_tce_info { + __u32 argsz; + __u32 flags; /* reserved for future use */ + __u32 dma32_window_start; /* 32 bit window start (bytes) */ + __u32 dma32_window_size; /* 32 bit window size (bytes) */ +}; + +#define VFIO_IOMMU_SPAPR_TCE_GET_INFO _IO(VFIO_TYPE, VFIO_BASE + 12) + +/* Reuse type1 map/unmap structs as they are the same at the moment */ +typedef struct vfio_iommu_type1_dma_map vfio_iommu_spapr_tce_dma_map; +typedef struct vfio_iommu_type1_dma_unmap vfio_iommu_spapr_tce_dma_unmap; + +/* ***************************************************************** */ + + #endif /* _UAPIVFIO_H */ diff --git a/trace-events b/trace-events index e280fba..388a107 100644 --- a/trace-events +++ b/trace-events @@ -1016,6 +1016,7 @@ qxl_render_guest_primary_resized(int32_t width, int32_t height, int32_t stride, qxl_render_update_area_done(void *cookie) "%p" # hw/spapr_pci.c +spapr_pci(const char *msg1, const char *msg2) "%s%s" spapr_pci_msi(const char *msg, uint32_t n, uint32_t ca) "%s (device#%d, cfg=%x)" spapr_pci_msi_setup(const char *name, unsigned vector, uint64_t addr) "dev\"%s\" vector %u, addr=%"PRIx64 spapr_pci_rtas_ibm_change_msi(unsigned func, unsigned req) "func %u, requested %u" @@ -1034,4 +1035,7 @@ xics_masked_pending(void) "set_irq_msi: masked pending" xics_set_irq_lsi(int srcno, int nr) "set_irq_lsi: srcno %d [irq %#x]" xics_ics_write_xive(int nr, int srcno, int server, uint8_t priority) "ics_write_xive: irq %#x [src %d] server %#x prio %#x" xics_ics_reject(int nr, int srcno) "reject irq %#x [src %d]" -xics_ics_eoi(int nr) "ics_eoi: irq %#x" \ No newline at end of file +xics_ics_eoi(int nr) "ics_eoi: irq %#x" + +# hw/spapr_iommu.c +spapr_iommu(const char *op, uint32_t liobn, uint64_t ioba, uint64_t tce, int ret) "%s %x ioba=%"PRIx64" tce=%"PRIx64" ret=%d" -- 1.7.10.4