Amit Shah wrote:
This patch has been contributed to by the following people:

From: Or Sagi <[EMAIL PROTECTED]>
From: Nir Peleg <[EMAIL PROTECTED]>
From: Amit Shah <[EMAIL PROTECTED]>
From: Ben-Ami Yassour <[EMAIL PROTECTED]>
From: Weidong Han <[EMAIL PROTECTED]>
From: Glauber de Oliveira Costa <[EMAIL PROTECTED]>

With this patch, we can assign a device on the host machine to a
guest.

A new command-line option, -pcidevice is added.
To invoke it for a device sitting at PCI bus:dev.fn 04:08.0, use this:

        -pcidevice host=04:08.0

* The host driver for the device, if any, is to be removed before
assigning the device (else device assignment will fail).

* A device that shares IRQ with another host device cannot currently
be assigned.

* The RAW_IO capability is needed for this to work

This works only with the in-kernel irqchip method; to use the
userspace irqchip, a kernel module (irqhook) and some extra changes
are needed.

Signed-off-by: Amit Shah <[EMAIL PROTECTED]>
---
 qemu/Makefile.target        |    1 +
 qemu/hw/device-assignment.c |  619 +++++++++++++++++++++++++++++++++++++++++++
 qemu/hw/device-assignment.h |   98 +++++++
 qemu/hw/pc.c                |    6 +
 qemu/hw/pci.c               |    7 +
 qemu/vl.c                   |   18 ++
 6 files changed, 749 insertions(+), 0 deletions(-)
 create mode 100644 qemu/hw/device-assignment.c
 create mode 100644 qemu/hw/device-assignment.h

diff --git a/qemu/Makefile.target b/qemu/Makefile.target
index d9bdeca..05a1d84 100644
--- a/qemu/Makefile.target
+++ b/qemu/Makefile.target
@@ -621,6 +621,7 @@ OBJS+= ide.o pckbd.o ps2.o vga.o $(SOUND_HW) dma.o
 OBJS+= fdc.o mc146818rtc.o serial.o i8259.o i8254.o pcspk.o pc.o
 OBJS+= cirrus_vga.o apic.o parallel.o acpi.o piix_pci.o
 OBJS+= usb-uhci.o vmmouse.o vmport.o vmware_vga.o extboot.o
+OBJS+= device-assignment.o
 ifeq ($(USE_KVM_PIT), 1)
 OBJS+= i8254-kvm.o
 endif
diff --git a/qemu/hw/device-assignment.c b/qemu/hw/device-assignment.c
new file mode 100644
index 0000000..5ba21a0
--- /dev/null
+++ b/qemu/hw/device-assignment.c
@@ -0,0 +1,619 @@
+/*
+ * Copyright (c) 2007, Neocleus Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+ * Place - Suite 330, Boston, MA 02111-1307 USA.
+ *
+ *
+ *  Assign a PCI device from the host to a guest VM.
+ *
+ *  Adapted for KVM by Qumranet.
+ *
+ *  Copyright (c) 2007, Neocleus, Alex Novik ([EMAIL PROTECTED])
+ *  Copyright (c) 2007, Neocleus, Guy Zana ([EMAIL PROTECTED])
+ *  Copyright (C) 2008, Qumranet, Amit Shah ([EMAIL PROTECTED])
+ *  Copyright (C) 2008, Red Hat, Amit Shah ([EMAIL PROTECTED])
+ */
+#include <stdio.h>
+#include <sys/io.h>
+#include "qemu-kvm.h"
+#include "hw.h"
+#include "pc.h"
+#include "sysemu.h"
+#include "console.h"
+#include <linux/kvm_para.h>

Is this header really necessary?

+#include "device-assignment.h"
+
+/* From linux/ioport.h */
+#define IORESOURCE_IO       0x00000100  /* Resource type */
+#define IORESOURCE_MEM      0x00000200
+#define IORESOURCE_IRQ      0x00000400
+#define IORESOURCE_DMA      0x00000800
+#define IORESOURCE_PREFETCH 0x00001000  /* No side effects */
+
+/* #define DEVICE_ASSIGNMENT_DEBUG 1 */
+
+#ifdef DEVICE_ASSIGNMENT_DEBUG
+#define DEBUG(fmt, args...)                               \

Please use C99-style variadic macros.

+    do {                                                  \
+      fprintf(stderr, "%s: " fmt, __func__ , ## args);    \
+    } while (0)
+#else
+#define DEBUG(fmt, args...) do { } while(0)
+#endif
+
+static void assigned_dev_ioport_writeb(void *opaque, uint32_t addr,
+                                       uint32_t value)
+{
+    AssignedDevRegion *r_access = (AssignedDevRegion *)opaque;

Cast is unnecessary.

+    uint32_t r_pio = (unsigned long)r_access->r_virtbase
+        + (addr - r_access->e_physbase);

It would be nice to make this a function, to make it more obvious that you are translating from guest to host regions. The cast to unsigned long should probably be target_ulong too.

+    DEBUG(stderr, "%s: r_pio=%08x e_physbase=%08x"
+          " r_virtbase=%08lx value=%08x\n",
+          __func__, r_pio, (int)r_access->e_physbase,
+          (unsigned long)r_access->r_virtbase, value);

This debug statement looks wrong to me. You're passing stderr (and __func__) as arguments, but the DEBUG() macro already supplies both. The same is true for all of these functions.

+static void assigned_dev_iomem_map(PCIDevice *pci_dev, int region_num,
+                                   uint32_t e_phys, uint32_t e_size, int type)
+{
+    AssignedDevice *r_dev = (AssignedDevice *) pci_dev;
+    AssignedDevRegion *region = &r_dev->v_addrs[region_num];
+    int first_map = (region->e_size == 0);
+    int ret = 0;
+
+    DEBUG("%s: e_phys=%08x r_virt=%x type=%d len=%08x region_num=%d \n",
+          __func__, e_phys, (uint32_t)region->r_virtbase, type, e_size,
+          region_num);

You already have __func__ in your debug printf().

+    region->e_physbase = e_phys;
+    region->e_size = e_size;
+
+    /* FIXME: Add support for emulated MMIO for non-kvm guests */
+    if (kvm_enabled()) {

I don't think having a kvm_enabled() check here is very useful. I think device-assignment.c should be conditional on USE_KVM, and the only kvm_enabled() check should be when creating the initial device assignment. Practically speaking, QEMU is never going to support device assignment outside of the context of KVM because I strongly doubt anything like irqhook will make it upstream.

+        if (!first_map)
+            kvm_destroy_phys_mem(kvm_context, e_phys, e_size);
+        if (e_size > 0)
+            ret = kvm_register_phys_mem(kvm_context, e_phys,
+                                        region->r_virtbase, e_size, 0);
+        if (ret != 0)
+            fprintf(stderr, "%s: Error: create new mapping failed\n", 
__func__);

If we do get an error here, we shouldn't keep going. This error is probably going to happen in practice if a guest tries to pass through too many devices and we run out of slots.

+    }
+}
+
+static void assigned_dev_ioport_map(PCIDevice *pci_dev, int region_num,
+                                    uint32_t addr, uint32_t size, int type)
+{
+    AssignedDevice *r_dev = (AssignedDevice *) pci_dev;
+    AssignedDevRegion *region = &r_dev->v_addrs[region_num];
+    int r;
+
+    region->e_physbase = addr;
+    region->e_size = size;
+
+    DEBUG("%s: e_phys=0x%x r_virt=%x type=0x%x len=%d region_num=%d \n",
+          __func__, addr, (uint32_t)region->r_virtbase, type, size, 
region_num);

Need to fix this DEBUG().

+    r = ioperm((uint32_t)region->r_virtbase, size, 1);

I don't think this is enough for KVM. This will only do the ioperm in the thread that triggered the IO. If you have an SMP guest, ioperm needs to be done on each VCPU's thread.

+    if (r < 0) {
+        perror("assigned_dev_ioport_map: ioperm");
+        return;
+    }

Again, if we fail, we have to exit QEMU gracefully.

+    register_ioport_read(addr, size, 1, assigned_dev_ioport_readb,
+                         (void *) (r_dev->v_addrs + region_num));
+    register_ioport_read(addr, size, 2, assigned_dev_ioport_readw,
+                         (void *) (r_dev->v_addrs + region_num));
+    register_ioport_read(addr, size, 4, assigned_dev_ioport_readl,
+                         (void *) (r_dev->v_addrs + region_num));
+    register_ioport_write(addr, size, 1, assigned_dev_ioport_writeb,
+                          (void *) (r_dev->v_addrs + region_num));
+    register_ioport_write(addr, size, 2, assigned_dev_ioport_writew,
+                          (void *) (r_dev->v_addrs + region_num));
+    register_ioport_write(addr, size, 4, assigned_dev_ioport_writel,
+                          (void *) (r_dev->v_addrs + region_num));
+}

You never need to explicitly cast a pointer to void *.

+static void assigned_dev_pci_write_config(PCIDevice *d, uint32_t address,
+                                          uint32_t val, int len)
+{
+    int fd, r;
+
+    DEBUG("%s: (%x.%x): address=%04x val=0x%08x len=%d\n",
+          __func__, ((d->devfn >> 3) & 0x1F), (d->devfn & 0x7),
+          (uint16_t) address, val, len);

bad DEBUG()

+    if (address == 0x4) {
+        pci_default_write_config(d, address, val, len);
+        /* Continue to program the card */
+    }
+
+    if ((address >= 0x10 && address <= 0x24) || address == 0x34 ||
+        address == 0x3c || address == 0x3d) {
+        /* used for update-mappings (BAR emulation) */
+        pci_default_write_config(d, address, val, len);
+        return;
+    }
+    DEBUG("%s: NON BAR (%x.%x): address=%04x val=0x%08x len=%d\n",
+          __func__, ((d->devfn >> 3) & 0x1F), (d->devfn & 0x7),
+          (uint16_t) address, val, len);
+    fd = ((AssignedDevice *)d)->real_device.config_fd;
+    r = lseek(fd, address, SEEK_SET);
+    if (r < 0) {
+        fprintf(stderr, "%s: bad seek, errno = %d\n", __func__, errno);
+        return;
+    }
+again:
+    r = write(fd, &val, len);

Can you just do a pwrite()?  That'll make things simpler.

+    if (r < 0) {
+        if (errno == EINTR || errno == EAGAIN)
+            goto again;
+        fprintf(stderr, "%s: write failed, errno = %d\n", __func__, errno);
+    }
+}
+
+static uint32_t assigned_dev_pci_read_config(PCIDevice *d, uint32_t address,
+                                             int len)
+{
+    uint32_t val = 0;
+    int fd, r;
+
+    if ((address >= 0x10 && address <= 0x24) || address == 0x34 ||
+        address == 0x3c || address == 0x3d) {
+        val = pci_default_read_config(d, address, len);
+        DEBUG("(%x.%x): address=%04x val=0x%08x len=%d\n",
+              (d->devfn >> 3) & 0x1F, (d->devfn & 0x7), address, val, len);
+        return val;
+    }
+
+    /* vga specific, remove later */
+    if (address == 0xFC)
+        goto do_log;

Can you explain the point of this?

+    fd = ((AssignedDevice *)d)->real_device.config_fd;
+    r = lseek(fd, address, SEEK_SET);
+    if (r < 0) {
+        fprintf(stderr, "%s: bad seek, errno = %d\n", __func__, errno);
+        return val;
+    }
+again:
+    r = read(fd, &val, len);

pread().

+    if (r < 0) {
+        if (errno == EINTR || errno == EAGAIN)
+            goto again;
+        fprintf(stderr, "%s: read failed, errno = %d\n",
+            __func__, errno);

Should bail out gracefully.

+static int assigned_dev_register_regions(PCIRegion *io_regions,
+                                         unsigned long regions_num,
+                                         AssignedDevice *pci_dev)
+{
+    uint32_t i;
+    PCIRegion *cur_region = io_regions;
+
+    for (i = 0; i < regions_num; i++, cur_region++) {
+        if (!cur_region->valid)
+            continue;
+        pci_dev->v_addrs[i].num = i;
+
+        /* handle memory io regions */
+        if (cur_region->type & IORESOURCE_MEM) {
+            int t = cur_region->type & IORESOURCE_PREFETCH
+                ? PCI_ADDRESS_SPACE_MEM_PREFETCH
+                : PCI_ADDRESS_SPACE_MEM;
+
+            /* map physical memory */
+            pci_dev->v_addrs[i].e_physbase = cur_region->base_addr;
+            pci_dev->v_addrs[i].r_virtbase =
+                mmap(NULL,
+                     (cur_region->size + 0xFFF) & 0xFFFFF000,
+                     PROT_WRITE | PROT_READ, MAP_SHARED,
+                     cur_region->resource_fd, (off_t) 0);
+
+            if ((void *) -1 == pci_dev->v_addrs[i].r_virtbase) {

Please use MAP_FAILED and don't use a defensive if.

+                fprintf(stderr, "%s: Error: Couldn't mmap 0x%x!"
+                        "\n", __func__,
+                        (uint32_t) (cur_region->base_addr));
+                return -1;
+            }
+            pci_dev->v_addrs[i].r_size = cur_region->size;
+            pci_dev->v_addrs[i].e_size = 0;
+
+            /* add offset */
+            pci_dev->v_addrs[i].r_virtbase +=
+                (cur_region->base_addr & 0xFFF);
+
+            pci_register_io_region((PCIDevice *) pci_dev, i,
+                                   cur_region->size, t,
+                                   assigned_dev_iomem_map);
+            continue;
+        }
+        /* handle port io regions */
+        pci_register_io_region((PCIDevice *) pci_dev, i,
+                               cur_region->size, PCI_ADDRESS_SPACE_IO,
+                               assigned_dev_ioport_map);
+
+        pci_dev->v_addrs[i].e_physbase = cur_region->base_addr;
+        pci_dev->v_addrs[i].r_virtbase =
+            (void *)(long)cur_region->base_addr;

I think virtbase would make more sense as a target_ulong.

+        /* not relevant for port io */
+        pci_dev->v_addrs[i].memory_index = 0;
+    }
+
+    /* success */
+    return 0;
+}
+
+static int get_real_device(AssignedDevice *pci_dev, uint8_t r_bus,
+                           uint8_t r_dev, uint8_t r_func)
+{
+    char dir[128], name[128], comp[16];
+    int fd, r = 0;
+    FILE *f;
+    unsigned long long start, end, size, flags;
+    PCIRegion *rp;
+    PCIDevRegions *dev = &pci_dev->real_device;
+
+    dev->region_number = 0;
+
+    snprintf(dir, 128, "/sys/bus/pci/devices/0000:%02x:%02x.%x/",
+            r_bus, r_dev, r_func);

just use sizeof().

+    strncpy(name, dir, 128);
+    strncat(name, "config", 6);

strncpy() doesn't do what you think it does. Why not just snprintf(name, sizeof(name), "%sconfig", dir)?

+    fd = open(name, O_RDWR);
+    if (fd == -1) {
+        fprintf(stderr, "%s: %s: %m\n", __func__, name);
+        return 1;
+    }
+    dev->config_fd = fd;
+again:
+    r = read(fd, pci_dev->dev.config, sizeof(pci_dev->dev.config));
+    if (r < 0) {
+        if (errno == EINTR || errno == EAGAIN)
+            goto again;
+        fprintf(stderr, "%s: read failed, errno = %d\n", __func__, errno);
+    }
+    strncpy(name, dir, 128);
+    strncat(name, "resource", 8);

Just use snprintf().

+    f = fopen(name, "r");
+    if (f == NULL) {
+        fprintf(stderr, "%s: %s: %m\n", __func__, name);
+        return 1;
+    }
+    r = -1;
+    while (fscanf(f, "%lli %lli %lli\n", &start, &end, &flags) == 3) {
+        r++;
+        rp = dev->regions + r;
+        rp->valid = 0;
+        size = end - start + 1;
+        flags &= IORESOURCE_IO | IORESOURCE_MEM | IORESOURCE_PREFETCH;
+        if (size == 0 || (flags & ~IORESOURCE_PREFETCH) == 0)
+            continue;
+        if (flags & IORESOURCE_MEM) {
+            flags &= ~IORESOURCE_IO;
+            snprintf(comp, 16, "resource%d", r);
+            strncpy(name, dir, 128);
+            strncat(name, comp, 16);
snprintf(name, sizeof(name), "%sresource%d", dir, r).
+/*
+ * Syntax to assign device:
+ *
+ * -pcidevice dev=bus:dev.func,dma=dma
+ *
+ * Example:
+ * -pcidevice host=00:13.0,dma=pvdma
+ *
+ * dma can currently only be 'none' to disable iommu support.
+ */
+AssignedDevInfo *add_assigned_device(const char *arg)
+{
+    char *cp, *cp1;
+    char device[8];
+    char dma[6];
+    int r;
+    AssignedDevInfo *adev;
+
+    adev = qemu_mallocz(sizeof(AssignedDevInfo));
+    if (adev == NULL) {
+        fprintf(stderr, "%s: Out of memory\n", __func__);
+        return NULL;
+    }
+    r = get_param_value(device, sizeof(device), "host", arg);
+    r = get_param_value(adev->name, sizeof(adev->name), "name", arg);
+    if (!r)
+        strncpy(adev->name, device, 8);
+
+#ifdef KVM_CAP_IOMMU
+    r = get_param_value(dma, sizeof(dma), "dma", arg);
+    if (r && !strncmp(dma, "none", 4))
+        disable_iommu = 1;
+#endif
+    cp = device;
+    adev->bus = strtoul(cp, &cp1, 16);
+    if (*cp1 != ':')
+        goto bad;
+    cp = cp1 + 1;
+
+    adev->dev = strtoul(cp, &cp1, 16);
+    if (*cp1 != '.')
+        goto bad;
+    cp = cp1 + 1;
+
+    adev->func = strtoul(cp, &cp1, 16);
+
+    nr_assigned_devices++;
+    LIST_INSERT_HEAD(&adev_head, adev, next);
+    return adev;
+bad:
+    fprintf(stderr, "pcidevice argument parse error; "
+            "please check the help text for usage\n");
+    qemu_free(adev);
+    return NULL;
+}

diff --git a/qemu/vl.c b/qemu/vl.c
index 388e79d..5a39d12 100644
--- a/qemu/vl.c
+++ b/qemu/vl.c
@@ -38,6 +38,7 @@
 #include "qemu-char.h"
 #include "block.h"
 #include "audio/audio.h"
+#include "hw/device-assignment.h"
 #include "migration.h"
 #include "balloon.h"
 #include "qemu-kvm.h"
@@ -8692,6 +8693,12 @@ static void help(int exitcode)
 #endif
           "-no-kvm-irqchip disable KVM kernel mode PIC/IOAPIC/LAPIC\n"
           "-no-kvm-pit        disable KVM kernel mode PIT\n"
+#if defined(TARGET_I386) || defined(TARGET_X86_64) || defined(__linux__)
+           "-pcidevice host=bus:dev.func[,dma=none][,name=\"string\"]\n"
+           "                expose a PCI device to the guest OS.\n"
+           "                dma=none: don't perform any dma translations (default 
is to use an iommu)\n"
+           "                'string' is used in log output.\n"
+#endif
 #endif
 #ifdef TARGET_I386
            "-no-acpi        disable ACPI\n"
@@ -8811,6 +8818,9 @@ enum {
     QEMU_OPTION_no_kvm,
     QEMU_OPTION_no_kvm_irqchip,
     QEMU_OPTION_no_kvm_pit,
+#if defined(TARGET_I386) || defined(TARGET_X86_64) || defined(__linux__)
+    QEMU_OPTION_pcidevice,
+#endif
     QEMU_OPTION_no_reboot,
     QEMU_OPTION_no_shutdown,
     QEMU_OPTION_show_cursor,
@@ -8900,6 +8910,9 @@ static const QEMUOption qemu_options[] = {
 #endif
     { "no-kvm-irqchip", 0, QEMU_OPTION_no_kvm_irqchip },
     { "no-kvm-pit", 0, QEMU_OPTION_no_kvm_pit },
+#if defined(TARGET_I386) || defined(TARGET_X86_64) || defined(__linux__)
+    { "pcidevice", HAS_ARG, QEMU_OPTION_pcidevice },
+#endif
 #endif
 #if defined(TARGET_PPC) || defined(TARGET_SPARC)
     { "g", 1, QEMU_OPTION_g },
@@ -9844,6 +9857,11 @@ int main(int argc, char **argv)
                kvm_pit = 0;
                break;
            }
+#if defined(TARGET_I386) || defined(TARGET_X86_64) || defined(__linux__)
+            case QEMU_OPTION_pcidevice:
+                add_assigned_device(optarg);
+                break;
+#endif
 #endif
             case QEMU_OPTION_usb:
                 usb_enabled = 1;

This is the wrong general model for doing this. The way the rest of QEMU works is to maintain an array of strings representing the assigned devices. The option handling just saves the name of the option. Then in pc.c, you iterate through the list of assigned devices, and then add them. Other architectures may have a completely different implementation of device assignment so it's better to let the individual architectures decide what to do with the assigned devices.

Regards,

Anthony Liguori
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Reply via email to