Re: [PATCH 5/7] KVM/userspace: Device Assignment: Support for assigning PCI devices to guests
On Tuesday 23 September 2008 22:54:53 Amit Shah wrote: > +static uint32_t assigned_dev_pci_read_config(PCIDevice *d, uint32_t > address, +int len) > +{ > + uint32_t val = 0; > + int fd, r; > + > + if ((address >= 0x10 && address <= 0x24) || address == 0x34 || > + address == 0x3c || address == 0x3d) { > + val = pci_default_read_config(d, address, len); > + DEBUG("(%x.%x): address=%04x val=0x%08x len=%d\n", > + (d->devfn >> 3) & 0x1F, (d->devfn & 0x7), address, > val, + len); > + return val; > + } > + > + /* vga specific, remove later */ > + if (address == 0xFC) > + goto do_log; > + > + fd = ((AssignedDevice *)d)->real_device.config_fd; > + r = lseek(fd, address, SEEK_SET); > + if (r < 0) { > + fprintf(stderr, "%s: bad seek, errno = %d\n", > + __func__, errno); > + return val; > + } This read from configuration space method got a little trouble: vender id and device id read from configuration space directly rather than "vender" and "device" file in the sysfs. That's cause trouble with some device that configuration space inconsistent with "vender" and "device" file, e.g. some fix up by host PCI subsystem in kernel. Maybe it can be delay a little for a following patch, but we should address this issue... Maybe we can use libpci? There are more fields than vender and device got this problem, like "irq". -- regards Yang, Sheng -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 5/7] KVM/userspace: Device Assignment: Support for assigning PCI devices to guests
On Thursday 25 September 2008 12:54:46 Yang, Sheng wrote: > On Tuesday 23 September 2008 22:54:53 Amit Shah wrote: > > From: Or Sagi <[EMAIL PROTECTED]> > > From: Nir Peleg <[EMAIL PROTECTED]> > > From: Amit Shah <[EMAIL PROTECTED]> > > From: Ben-Ami Yassour <[EMAIL PROTECTED]> > > From: Weidong Han <[EMAIL PROTECTED]> > > From: Glauber de Oliveira Costa <[EMAIL PROTECTED]> > > > > With this patch, we can assign a device on the host machine to a > > guest. > > > > A new command-line option, -pcidevice is added. > > For example, to invoke it for a device sitting at PCI bus:dev.fn > > 04:08.0, use this: > > > > -pcidevice host=04:08.0 > > > > * The host driver for the device, if any, is to be removed before > > assigning the device (else device assignment will fail). > > > > * A device that shares IRQ with another host device cannot currently > > be assigned. > > > > This works only with the in-kernel irqchip method; to use the > > userspace irqchip, a kernel module (irqhook) and some extra changes > > are needed. > > Hi Amit > > I am afraid I got this when try to enable VT-d. > > create_userspace_phys_mem: Invalid argument > assigned_dev_iomem_map: Error: create new mapping failed > > Can you have a look at it? (and the patch you sent to Weidong don't got > this problem.) Oh, Weidong's patch "[PATCH] VT-d: Fix iommu map page for mmio pages" fix it. -- regards Yang, Sheng > > Thanks. > -- > regards > Yang, Sheng > > > Signed-off-by: Amit Shah <[EMAIL PROTECTED]> > > --- > > qemu/Makefile.target|1 + > > qemu/hw/device-assignment.c | 665 > > +++ qemu/hw/device-assignment.h | > > 93 ++ > > qemu/hw/pc.c|9 + > > qemu/hw/pci.c |7 + > > qemu/vl.c | 18 ++ > > 6 files changed, 793 insertions(+), 0 deletions(-) > > create mode 100644 qemu/hw/device-assignment.c > > create mode 100644 qemu/hw/device-assignment.h > > > > diff --git a/qemu/Makefile.target b/qemu/Makefile.target > > index 72f3db8..40eb273 100644 > > --- a/qemu/Makefile.target > > +++ b/qemu/Makefile.target > > @@ -616,6 +616,7 @@ OBJS+= ide.o pckbd.o ps2.o vga.o $(SOUND_HW) dma.o > > OBJS+= fdc.o mc146818rtc.o serial.o i8259.o i8254.o pcspk.o pc.o > > OBJS+= cirrus_vga.o apic.o parallel.o acpi.o piix_pci.o > > OBJS+= usb-uhci.o vmmouse.o vmport.o vmware_vga.o extboot.o > > +OBJS+= device-assignment.o > > ifeq ($(USE_KVM_PIT), 1) > > OBJS+= i8254-kvm.o > > endif > > diff --git a/qemu/hw/device-assignment.c b/qemu/hw/device-assignment.c > > new file mode 100644 > > index 000..e70daf2 > > --- /dev/null > > +++ b/qemu/hw/device-assignment.c > > @@ -0,0 +1,665 @@ > > +/* > > + * Copyright (c) 2007, Neocleus Corporation. > > + * > > + * This program is free software; you can redistribute it and/or modify > > it + * under the terms and conditions of the GNU General Public License, > > + * version 2, as published by the Free Software Foundation. > > + * > > + * This program is distributed in the hope it will be useful, but > > WITHOUT + * ANY WARRANTY; without even the implied warranty of > > MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU > > General Public License for + * more details. > > + * > > + * You should have received a copy of the GNU General Public License > > along with + * this program; if not, write to the Free Software > > Foundation, Inc., 59 Temple + * Place - Suite 330, Boston, MA 02111-1307 > > USA. > > + * > > + * > > + * Assign a PCI device from the host to a guest VM. > > + * > > + * Adapted for KVM by Qumranet. > > + * > > + * Copyright (c) 2007, Neocleus, Alex Novik ([EMAIL PROTECTED]) > > + * Copyright (c) 2007, Neocleus, Guy Zana ([EMAIL PROTECTED]) > > + * Copyright (C) 2008, Qumranet, Amit Shah ([EMAIL PROTECTED]) > > + * Copyright (C) 2008, Red Hat, Amit Shah ([EMAIL PROTECTED]) > > + */ > > +#include > > +#include > > +#include "qemu-kvm.h" > > +#include > > +#include "device-assignment.h" > > + > > +/* From linux/ioport.h */ > > +#define IORESOURCE_IO 0x0100 /* Resource type */ > > +#define IORESOURCE_MEM 0x0200 > > +#define IORESOURCE_IRQ 0x0400 > > +#define IORESOURCE_DMA 0x0800 > > +#define IORESOURCE_PREFETCH0x1000 /* No side effects */ > > + > > +/* #define DEVICE_ASSIGNMENT_DEBUG */ > > + > > +#ifdef DEVICE_ASSIGNMENT_DEBUG > > +#define DEBUG(fmt, args...) fprintf(stderr, "%s: " fmt, __func__ , ## > > args) +#else > > +#define DEBUG(fmt, args...) > > +#endif > > + > > +static void assigned_dev_ioport_writeb(void *opaque, uint32_t addr, > > + uint32_t value) > > +{ > > + AssignedDevRegion *r_access = (AssignedDevRegion *)opaque; > > + uint32_t r_pio = (unsigned long)r_access->r_virtbase > > + + (addr - r_access->e_physbase); > > + > > + if (r_access->debug & DEVICE_ASSIGNMENT_DEBUG_PIO) { > > + fprintf(stderr
Re: [PATCH 5/7] KVM/userspace: Device Assignment: Support for assigning PCI devices to guests
On Tuesday 23 September 2008 22:54:53 Amit Shah wrote: > From: Or Sagi <[EMAIL PROTECTED]> > From: Nir Peleg <[EMAIL PROTECTED]> > From: Amit Shah <[EMAIL PROTECTED]> > From: Ben-Ami Yassour <[EMAIL PROTECTED]> > From: Weidong Han <[EMAIL PROTECTED]> > From: Glauber de Oliveira Costa <[EMAIL PROTECTED]> > > With this patch, we can assign a device on the host machine to a > guest. > > A new command-line option, -pcidevice is added. > For example, to invoke it for a device sitting at PCI bus:dev.fn > 04:08.0, use this: > > -pcidevice host=04:08.0 > > * The host driver for the device, if any, is to be removed before > assigning the device (else device assignment will fail). > > * A device that shares IRQ with another host device cannot currently > be assigned. > > This works only with the in-kernel irqchip method; to use the > userspace irqchip, a kernel module (irqhook) and some extra changes > are needed. > Hi Amit I am afraid I got this when try to enable VT-d. create_userspace_phys_mem: Invalid argument assigned_dev_iomem_map: Error: create new mapping failed Can you have a look at it? (and the patch you sent to Weidong don't got this problem.) Thanks. -- regards Yang, Sheng > Signed-off-by: Amit Shah <[EMAIL PROTECTED]> > --- > qemu/Makefile.target|1 + > qemu/hw/device-assignment.c | 665 > +++ qemu/hw/device-assignment.h | > 93 ++ > qemu/hw/pc.c|9 + > qemu/hw/pci.c |7 + > qemu/vl.c | 18 ++ > 6 files changed, 793 insertions(+), 0 deletions(-) > create mode 100644 qemu/hw/device-assignment.c > create mode 100644 qemu/hw/device-assignment.h > > diff --git a/qemu/Makefile.target b/qemu/Makefile.target > index 72f3db8..40eb273 100644 > --- a/qemu/Makefile.target > +++ b/qemu/Makefile.target > @@ -616,6 +616,7 @@ OBJS+= ide.o pckbd.o ps2.o vga.o $(SOUND_HW) dma.o > OBJS+= fdc.o mc146818rtc.o serial.o i8259.o i8254.o pcspk.o pc.o > OBJS+= cirrus_vga.o apic.o parallel.o acpi.o piix_pci.o > OBJS+= usb-uhci.o vmmouse.o vmport.o vmware_vga.o extboot.o > +OBJS+= device-assignment.o > ifeq ($(USE_KVM_PIT), 1) > OBJS+= i8254-kvm.o > endif > diff --git a/qemu/hw/device-assignment.c b/qemu/hw/device-assignment.c > new file mode 100644 > index 000..e70daf2 > --- /dev/null > +++ b/qemu/hw/device-assignment.c > @@ -0,0 +1,665 @@ > +/* > + * Copyright (c) 2007, Neocleus Corporation. > + * > + * This program is free software; you can redistribute it and/or modify it > + * under the terms and conditions of the GNU General Public License, > + * version 2, as published by the Free Software Foundation. > + * > + * This program is distributed in the hope it will be useful, but WITHOUT > + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or > + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License > for + * more details. > + * > + * You should have received a copy of the GNU General Public License along > with + * this program; if not, write to the Free Software Foundation, Inc., > 59 Temple + * Place - Suite 330, Boston, MA 02111-1307 USA. > + * > + * > + * Assign a PCI device from the host to a guest VM. > + * > + * Adapted for KVM by Qumranet. > + * > + * Copyright (c) 2007, Neocleus, Alex Novik ([EMAIL PROTECTED]) > + * Copyright (c) 2007, Neocleus, Guy Zana ([EMAIL PROTECTED]) > + * Copyright (C) 2008, Qumranet, Amit Shah ([EMAIL PROTECTED]) > + * Copyright (C) 2008, Red Hat, Amit Shah ([EMAIL PROTECTED]) > + */ > +#include > +#include > +#include "qemu-kvm.h" > +#include > +#include "device-assignment.h" > + > +/* From linux/ioport.h */ > +#define IORESOURCE_IO 0x0100 /* Resource type */ > +#define IORESOURCE_MEM 0x0200 > +#define IORESOURCE_IRQ 0x0400 > +#define IORESOURCE_DMA 0x0800 > +#define IORESOURCE_PREFETCH0x1000 /* No side effects */ > + > +/* #define DEVICE_ASSIGNMENT_DEBUG */ > + > +#ifdef DEVICE_ASSIGNMENT_DEBUG > +#define DEBUG(fmt, args...) fprintf(stderr, "%s: " fmt, __func__ , ## > args) +#else > +#define DEBUG(fmt, args...) > +#endif > + > +static void assigned_dev_ioport_writeb(void *opaque, uint32_t addr, > + uint32_t value) > +{ > + AssignedDevRegion *r_access = (AssignedDevRegion *)opaque; > + uint32_t r_pio = (unsigned long)r_access->r_virtbase > + + (addr - r_access->e_physbase); > + > + if (r_access->debug & DEVICE_ASSIGNMENT_DEBUG_PIO) { > + fprintf(stderr, "%s: r_pio=%08x e_physbase=%08x" > + " r_virtbase=%08lx value=%08x\n", > + __func__, r_pio, (int)r_access->e_physbase, > + (unsigned long)r_access->r_virtbase, value); > + } > + iopl(3); > + outb(value, r_pio); > +} > + > +static void assigned_dev_ioport_writew(void *opaque, uint32_t addr, > +
Re: [PATCH 5/7] KVM/userspace: Device Assignment: Support for assigning PCI devices to guests
* On Wednesday 24 Sep 2008 20:37:24 Anthony Liguori wrote: > Amit Shah wrote: > > * On Tuesday 23 Sep 2008 22:00:32 Anthony Liguori wrote: > >> Amit Shah wrote: > >>> diff --git a/qemu/Makefile.target b/qemu/Makefile.target > >>> index 72f3db8..40eb273 100644 > >>> --- a/qemu/Makefile.target > >>> +++ b/qemu/Makefile.target > >>> @@ -616,6 +616,7 @@ OBJS+= ide.o pckbd.o ps2.o vga.o $(SOUND_HW) dma.o > >>> OBJS+= fdc.o mc146818rtc.o serial.o i8259.o i8254.o pcspk.o pc.o > >>> OBJS+= cirrus_vga.o apic.o parallel.o acpi.o piix_pci.o > >>> OBJS+= usb-uhci.o vmmouse.o vmport.o vmware_vga.o extboot.o > >>> +OBJS+= device-assignment.o > >> > >> This needs to be conditional on at least linux hosts, but probably also > >> kvm support. > > > > I didn't see any other file that's doing it. So I added this conditional > > in vl.c by having a #if defined(__linux__). That's how usb-linux.c does > > it as well. Is there a better way? > > aio and compatfd currently do it this way. block-raw-win32 and > block-raw-posix are this way. We're slowly moving things away from > #ifdef #else #endif to conditional compilation. So is ifdef(CONFIG_LINUX) OBJS+=.. endif supposed to work? (or is it CONFIG_LINUX_USER?) > > Not the whole functionality needs kvm support. This should be able to > > work even without kvm (for example, when the guest is 1:1 mapped in the > > host address space). > > KVM is needed for interrupt remapping though. That's something I don't > see happening for normal userspace any time soon. We already have an implementation that I've not submitted. I remember at least one user asking to support that functionality (because he wanted to try this on powerpc). > >>> + /* FIXME: Add support for emulated MMIO for non-kvm guests */ > >>> + if (kvm_enabled()) { > >> > >> This doesn't work at all if kvm isn't enabled right? You should > >> probably bail out in the init if kvm isn't enabled. If this whole file > >> is included conditionally based on KVM support, then you don't have to > >> worry about using kvm_enabled() guards to conditionally compile out > >> code. > > > > Non-kvm support is currently broken and should be fixed, but that can > > happen after we get this merged. > > But it would take bouncing interrupts to userspace? I don't think that > will ever happen upstream personally. At any rate, there's no point in > even trying to support something like that until progress is made > upstream on this front. With 1:1 mapping (Andrea's patches) and --no-kvm-irqchip (or indeed --no-kvm) and the userspace interrupt bouncing patches, we can support this. > > I can temporarily add a check for kvm_enabled and bail out. > > > >>> + sprintf(dir, "/sys/bus/pci/devices/:%02x:%02x.%x/", > >>> + r_bus, r_dev, r_func); > >> > >> snprintf() > > > > It's guarded by the %02x modifiers; so this doesn't depend on user input. > > strcpy or sprintf should never be used. It doesn't matter if it's safe > in a particular instance. There are safer functions to use (like > snprintf). Hmm, qemu is littered with such strcpy()s though. I'll change this. > All it takes is for someone to come along and change the /sys/bus path > to be larger without adjusting the buffer size and everything goes to > hell. It's inherently brittle. > > >>> + fprintf(stderr, "Registered host PCI device %02x:%02x.%1x " > >>> + "(\"%s\") as guest device %02x:%02x.%1x\n", > >>> + r_bus, r_dev, r_func, e_dev_name, > >>> + pci_bus_num(e_bus), e_device, r_func); > >> > >> Please don't fprintf() unconditionally. > > > > OK; however, a vmdk file open does that so I though it was alright to do > > it. > > I obviously don't use vmdk or else I would have removed that by now :-) > > >> A lot more checks are needed here to see if things can succeed. We > >> definitely should bail out if they can't. > > > > Bailing out is done in the out: label below. What else do you think can > > fail? I've taken care of all the cases that do fail IMO. > > > >>> + return pci_dev; > >>> +out: > >>> + pci_unregister_device(&pci_dev->dev); > >>> + return NULL; > >>> +} > >>> > >>> > >>> > >>> +/* > >>> + * Syntax to assign device: > >>> + * > >>> + * -pcidevice dev=bus:dev.func,dma=dma > >>> + * > >>> + * Example: > >>> + * -pcidevice host=00:13.0,dma=pvdma > >>> + * > >>> + * dma can currently only be 'none' to disable iommu support. > >> > >> Does it actually work if you disable iommu support? > > > > If the guest is 1:1 mapped. > > You mean with Andrea's reserved ram patches? Yes. > >>> +#include > >> > >> Don't think this is needed here. > > > > We use mmap(), so this is needed. > > Ah. > > >>> +/* Initialize assigned devices */ > >>> +if (pci_enabled) { > >>> +int r = -1; > >>> +do { > >>> +init_assigned_device(pci_bus, &r); > >> > >> Why pass r by reference instead of just returning it? At any rate, you > >> should detect when this fails and gracefully terminate QEMU. > > > > 'r' is the
Re: [PATCH 5/7] KVM/userspace: Device Assignment: Support for assigning PCI devices to guests
Amit Shah wrote: * On Tuesday 23 Sep 2008 22:00:32 Anthony Liguori wrote: Amit Shah wrote: diff --git a/qemu/Makefile.target b/qemu/Makefile.target index 72f3db8..40eb273 100644 --- a/qemu/Makefile.target +++ b/qemu/Makefile.target @@ -616,6 +616,7 @@ OBJS+= ide.o pckbd.o ps2.o vga.o $(SOUND_HW) dma.o OBJS+= fdc.o mc146818rtc.o serial.o i8259.o i8254.o pcspk.o pc.o OBJS+= cirrus_vga.o apic.o parallel.o acpi.o piix_pci.o OBJS+= usb-uhci.o vmmouse.o vmport.o vmware_vga.o extboot.o +OBJS+= device-assignment.o This needs to be conditional on at least linux hosts, but probably also kvm support. I didn't see any other file that's doing it. So I added this conditional in vl.c by having a #if defined(__linux__). That's how usb-linux.c does it as well. Is there a better way? aio and compatfd currently do it this way. block-raw-win32 and block-raw-posix are this way. We're slowly moving things away from #ifdef #else #endif to conditional compilation. Not the whole functionality needs kvm support. This should be able to work even without kvm (for example, when the guest is 1:1 mapped in the host address space). KVM is needed for interrupt remapping though. That's something I don't see happening for normal userspace any time soon. + /* FIXME: Add support for emulated MMIO for non-kvm guests */ + if (kvm_enabled()) { This doesn't work at all if kvm isn't enabled right? You should probably bail out in the init if kvm isn't enabled. If this whole file is included conditionally based on KVM support, then you don't have to worry about using kvm_enabled() guards to conditionally compile out code. Non-kvm support is currently broken and should be fixed, but that can happen after we get this merged. But it would take bouncing interrupts to userspace? I don't think that will ever happen upstream personally. At any rate, there's no point in even trying to support something like that until progress is made upstream on this front. I can temporarily add a check for kvm_enabled and bail out. + sprintf(dir, "/sys/bus/pci/devices/:%02x:%02x.%x/", + r_bus, r_dev, r_func); snprintf() It's guarded by the %02x modifiers; so this doesn't depend on user input. strcpy or sprintf should never be used. It doesn't matter if it's safe in a particular instance. There are safer functions to use (like snprintf). All it takes is for someone to come along and change the /sys/bus path to be larger without adjusting the buffer size and everything goes to hell. It's inherently brittle. + fprintf(stderr, "Registered host PCI device %02x:%02x.%1x " + "(\"%s\") as guest device %02x:%02x.%1x\n", + r_bus, r_dev, r_func, e_dev_name, + pci_bus_num(e_bus), e_device, r_func); Please don't fprintf() unconditionally. OK; however, a vmdk file open does that so I though it was alright to do it. I obviously don't use vmdk or else I would have removed that by now :-) A lot more checks are needed here to see if things can succeed. We definitely should bail out if they can't. Bailing out is done in the out: label below. What else do you think can fail? I've taken care of all the cases that do fail IMO. + return pci_dev; +out: + pci_unregister_device(&pci_dev->dev); + return NULL; +} +/* + * Syntax to assign device: + * + * -pcidevice dev=bus:dev.func,dma=dma + * + * Example: + * -pcidevice host=00:13.0,dma=pvdma + * + * dma can currently only be 'none' to disable iommu support. Does it actually work if you disable iommu support? If the guest is 1:1 mapped. You mean with Andrea's reserved ram patches? +#include Don't think this is needed here. We use mmap(), so this is needed. Ah. +/* Initialize assigned devices */ +if (pci_enabled) { +int r = -1; +do { +init_assigned_device(pci_bus, &r); Why pass r by reference instead of just returning it? At any rate, you should detect when this fails and gracefully terminate QEMU. 'r' is the count of the number of assigned devices -- mostly needed because we have the data stored in an array. If we migrate to a list, this can be relaxed. ATM, I start the guest without assigning the device. I haven't figured out a way to gracefully terminate qemu yet. In the case of hot plug, you fail the hot plug. If you start with device assignment, just doing an "exit" would be sufficient. +#if defined(TARGET_I386) || defined(TARGET_X86_64) || defined(__linux__) + case QEMU_OPTION_pcidevice: + add_assigned_device(optarg); You should copy into an array, then in pc.c, iterate through the array and call into add_assigned_device. Is there any benefit in doing this? We're moving the iterate out of vl.c to pc.c and both will happe
Re: [PATCH 5/7] KVM/userspace: Device Assignment: Support for assigning PCI devices to guests
* On Tuesday 23 Sep 2008 22:00:32 Anthony Liguori wrote: > Amit Shah wrote: > > diff --git a/qemu/Makefile.target b/qemu/Makefile.target > > index 72f3db8..40eb273 100644 > > --- a/qemu/Makefile.target > > +++ b/qemu/Makefile.target > > @@ -616,6 +616,7 @@ OBJS+= ide.o pckbd.o ps2.o vga.o $(SOUND_HW) dma.o > > OBJS+= fdc.o mc146818rtc.o serial.o i8259.o i8254.o pcspk.o pc.o > > OBJS+= cirrus_vga.o apic.o parallel.o acpi.o piix_pci.o > > OBJS+= usb-uhci.o vmmouse.o vmport.o vmware_vga.o extboot.o > > +OBJS+= device-assignment.o > > This needs to be conditional on at least linux hosts, but probably also > kvm support. I didn't see any other file that's doing it. So I added this conditional in vl.c by having a #if defined(__linux__). That's how usb-linux.c does it as well. Is there a better way? Not the whole functionality needs kvm support. This should be able to work even without kvm (for example, when the guest is 1:1 mapped in the host address space). > > + /* FIXME: Add support for emulated MMIO for non-kvm guests */ > > + if (kvm_enabled()) { > > This doesn't work at all if kvm isn't enabled right? You should > probably bail out in the init if kvm isn't enabled. If this whole file > is included conditionally based on KVM support, then you don't have to > worry about using kvm_enabled() guards to conditionally compile out code. Non-kvm support is currently broken and should be fixed, but that can happen after we get this merged. I can temporarily add a check for kvm_enabled and bail out. > > + sprintf(dir, "/sys/bus/pci/devices/:%02x:%02x.%x/", > > + r_bus, r_dev, r_func); > > snprintf() It's guarded by the %02x modifiers; so this doesn't depend on user input. > > + strcpy(name, dir); > > + strcat(name, "config"); > > snprintf() ... and as a result, all these don't depend on user input. > > +#defineMAX_ASSIGNED_DEVS 4 > > +struct { > > + char name[15]; > > + int bus; > > + int dev; > > + int func; > > + AssignedDevice *assigned_dev; > > +} assigned_devices[MAX_ASSIGNED_DEVS]; > > Any reason not to just use a list here? sys-queue.h makes that very easy. > > + fprintf(stderr, "Registered host PCI device %02x:%02x.%1x " > > + "(\"%s\") as guest device %02x:%02x.%1x\n", > > + r_bus, r_dev, r_func, e_dev_name, > > + pci_bus_num(e_bus), e_device, r_func); > > Please don't fprintf() unconditionally. OK; however, a vmdk file open does that so I though it was alright to do it. > A lot more checks are needed here to see if things can succeed. We > definitely should bail out if they can't. Bailing out is done in the out: label below. What else do you think can fail? I've taken care of all the cases that do fail IMO. > > + return pci_dev; > > +out: > > + pci_unregister_device(&pci_dev->dev); > > + return NULL; > > +} > > +/* > > + * Syntax to assign device: > > + * > > + * -pcidevice dev=bus:dev.func,dma=dma > > + * > > + * Example: > > + * -pcidevice host=00:13.0,dma=pvdma > > + * > > + * dma can currently only be 'none' to disable iommu support. > > Does it actually work if you disable iommu support? If the guest is 1:1 mapped. > > diff --git a/qemu/hw/device-assignment.h b/qemu/hw/device-assignment.h > > new file mode 100644 > > index 000..b77e484 > > --- /dev/null > > +++ b/qemu/hw/device-assignment.h > > +#include > > Don't think this is needed here. We use mmap(), so this is needed. > > +#include "qemu-common.h" > > +#include "pci.h" > > +#include > > Nor this. This isn't. > > diff --git a/qemu/hw/pc.c b/qemu/hw/pc.c > > index 6053103..4a611cc 100644 > > --- a/qemu/hw/pc.c > > +++ b/qemu/hw/pc.c > > +/* Initialize assigned devices */ > > +if (pci_enabled) { > > +int r = -1; > > +do { > > +init_assigned_device(pci_bus, &r); > > Why pass r by reference instead of just returning it? At any rate, you > should detect when this fails and gracefully terminate QEMU. 'r' is the count of the number of assigned devices -- mostly needed because we have the data stored in an array. If we migrate to a list, this can be relaxed. ATM, I start the guest without assigning the device. I haven't figured out a way to gracefully terminate qemu yet. > > --- a/qemu/vl.c > > +++ b/qemu/vl.c > > +#if defined(TARGET_I386) || defined(TARGET_X86_64) || defined(__linux__) > > + case QEMU_OPTION_pcidevice: > > + add_assigned_device(optarg); > > You should copy into an array, then in pc.c, iterate through the array > and call into add_assigned_device. Is there any benefit in doing this? We're moving the iterate out of vl.c to pc.c and both will happen at the same time. > > Regards, > > Anthony Liguori Thanks! Amit. -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 5/7] KVM/userspace: Device Assignment: Support for assigning PCI devices to guests
Muli Ben-Yehuda wrote: On Tue, Sep 23, 2008 at 02:18:51PM -0500, Anthony Liguori wrote: Make sure you issue iopl() before any of the VCPU threads are spawned. Otherwise, you may be running into issues when something other than VCPU-0 is doing PIO/MMIO and you haven't iopl()'d for that thread. Yeah, we thought of that, but as far as I can recall this happened with a single VCPU. In any case, we'll look into it again. The io thread runs in a separate thread than the VCPU. So if you were doing iopl(3) in the io thread (for instance, when called from machine_init), then the VCPU thread wouldn't necessarily have inherited the iopl(). You could also do it explicitly on vcpu thread construction. Regards, Anthony Liguori Cheers, Muli -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 5/7] KVM/userspace: Device Assignment: Support for assigning PCI devices to guests
On Tue, Sep 23, 2008 at 02:18:51PM -0500, Anthony Liguori wrote: > Make sure you issue iopl() before any of the VCPU threads are > spawned. Otherwise, you may be running into issues when something > other than VCPU-0 is doing PIO/MMIO and you haven't iopl()'d for > that thread. Yeah, we thought of that, but as far as I can recall this happened with a single VCPU. In any case, we'll look into it again. Cheers, Muli -- The First Workshop on I/O Virtualization (WIOV '08) Dec 2008, San Diego, CA, http://www.usenix.org/wiov08/ xxx SYSTOR 2009---The Israeli Experimental Systems Conference http://www.haifa.il.ibm.com/conferences/systor2009/ -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 5/7] KVM/userspace: Device Assignment: Support for assigning PCI devices to guests
Muli Ben-Yehuda wrote: On Tue, Sep 23, 2008 at 11:30:32AM -0500, Anthony Liguori wrote: + + (addr - r_access->e_physbase); + + if (r_access->debug & DEVICE_ASSIGNMENT_DEBUG_PIO) { + fprintf(stderr, "%s: r_pio=%08x e_physbase=%08x" + " r_virtbase=%08lx value=%08x\n", + __func__, r_pio, (int)r_access->e_physbase, + (unsigned long)r_access->r_virtbase, value); + } + iopl(3); + outb(value, r_pio); The formatting is wrong for this entire file. Also, you shouldn't have device specific debug. Should probably error check iopl(3). It's not necessary to call it every time you do an outb, just once when initialized. We tried that at first, but ran into cases where even after iopl() ran, pio's from qemu still failed. Does qemu do anything to drop iopl() privileges? In any case calling iopl() unconditionally on every pio fixed it, but is obviously not the right long-term solution. Make sure you issue iopl() before any of the VCPU threads are spawned. Otherwise, you may be running into issues when something other than VCPU-0 is doing PIO/MMIO and you haven't iopl()'d for that thread. Regards, Anthony Liguori Cheers, Muli -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 5/7] KVM/userspace: Device Assignment: Support for assigning PCI devices to guests
On Tue, Sep 23, 2008 at 11:30:32AM -0500, Anthony Liguori wrote: > >> ++ (addr - r_access->e_physbase); >> + >> +if (r_access->debug & DEVICE_ASSIGNMENT_DEBUG_PIO) { >> +fprintf(stderr, "%s: r_pio=%08x e_physbase=%08x" >> +" r_virtbase=%08lx value=%08x\n", >> +__func__, r_pio, (int)r_access->e_physbase, >> +(unsigned long)r_access->r_virtbase, value); >> +} >> +iopl(3); >> +outb(value, r_pio); >> > > The formatting is wrong for this entire file. Also, you shouldn't > have device specific debug. Should probably error check iopl(3). > It's not necessary to call it every time you do an outb, just once > when initialized. We tried that at first, but ran into cases where even after iopl() ran, pio's from qemu still failed. Does qemu do anything to drop iopl() privileges? In any case calling iopl() unconditionally on every pio fixed it, but is obviously not the right long-term solution. Cheers, Muli -- The First Workshop on I/O Virtualization (WIOV '08) Dec 2008, San Diego, CA, http://www.usenix.org/wiov08/ xxx SYSTOR 2009---The Israeli Experimental Systems Conference http://www.haifa.il.ibm.com/conferences/systor2009/ -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 5/7] KVM/userspace: Device Assignment: Support for assigning PCI devices to guests
Amit Shah wrote: From: Or Sagi <[EMAIL PROTECTED]> From: Nir Peleg <[EMAIL PROTECTED]> From: Amit Shah <[EMAIL PROTECTED]> From: Ben-Ami Yassour <[EMAIL PROTECTED]> From: Weidong Han <[EMAIL PROTECTED]> From: Glauber de Oliveira Costa <[EMAIL PROTECTED]> With this patch, we can assign a device on the host machine to a guest. A new command-line option, -pcidevice is added. For example, to invoke it for a device sitting at PCI bus:dev.fn 04:08.0, use this: -pcidevice host=04:08.0 * The host driver for the device, if any, is to be removed before assigning the device (else device assignment will fail). * A device that shares IRQ with another host device cannot currently be assigned. This works only with the in-kernel irqchip method; to use the userspace irqchip, a kernel module (irqhook) and some extra changes are needed. Signed-off-by: Amit Shah <[EMAIL PROTECTED]> --- qemu/Makefile.target|1 + qemu/hw/device-assignment.c | 665 +++ qemu/hw/device-assignment.h | 93 ++ qemu/hw/pc.c|9 + qemu/hw/pci.c |7 + qemu/vl.c | 18 ++ 6 files changed, 793 insertions(+), 0 deletions(-) create mode 100644 qemu/hw/device-assignment.c create mode 100644 qemu/hw/device-assignment.h diff --git a/qemu/Makefile.target b/qemu/Makefile.target index 72f3db8..40eb273 100644 --- a/qemu/Makefile.target +++ b/qemu/Makefile.target @@ -616,6 +616,7 @@ OBJS+= ide.o pckbd.o ps2.o vga.o $(SOUND_HW) dma.o OBJS+= fdc.o mc146818rtc.o serial.o i8259.o i8254.o pcspk.o pc.o OBJS+= cirrus_vga.o apic.o parallel.o acpi.o piix_pci.o OBJS+= usb-uhci.o vmmouse.o vmport.o vmware_vga.o extboot.o +OBJS+= device-assignment.o This needs to be conditional on at least linux hosts, but probably also kvm support. ifeq ($(USE_KVM_PIT), 1) OBJS+= i8254-kvm.o endif diff --git a/qemu/hw/device-assignment.c b/qemu/hw/device-assignment.c new file mode 100644 index 000..e70daf2 --- /dev/null +++ b/qemu/hw/device-assignment.c @@ -0,0 +1,665 @@ +/* + * Copyright (c) 2007, Neocleus Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., 59 Temple + * Place - Suite 330, Boston, MA 02111-1307 USA. + * + * + * Assign a PCI device from the host to a guest VM. + * + * Adapted for KVM by Qumranet. + * + * Copyright (c) 2007, Neocleus, Alex Novik ([EMAIL PROTECTED]) + * Copyright (c) 2007, Neocleus, Guy Zana ([EMAIL PROTECTED]) + * Copyright (C) 2008, Qumranet, Amit Shah ([EMAIL PROTECTED]) + * Copyright (C) 2008, Red Hat, Amit Shah ([EMAIL PROTECTED]) + */ +#include +#include +#include "qemu-kvm.h" +#include +#include "device-assignment.h" + +/* From linux/ioport.h */ +#define IORESOURCE_IO 0x0100 /* Resource type */ +#define IORESOURCE_MEM 0x0200 +#define IORESOURCE_IRQ 0x0400 +#define IORESOURCE_DMA 0x0800 +#define IORESOURCE_PREFETCH0x1000 /* No side effects */ + +/* #define DEVICE_ASSIGNMENT_DEBUG */ + +#ifdef DEVICE_ASSIGNMENT_DEBUG +#define DEBUG(fmt, args...) fprintf(stderr, "%s: " fmt, __func__ , ## args) +#else +#define DEBUG(fmt, args...) +#endif Both should be in do { } while(0) to preserve statement semantics. Please use C99 variadic macros too. +static void assigned_dev_ioport_writeb(void *opaque, uint32_t addr, + uint32_t value) +{ + AssignedDevRegion *r_access = (AssignedDevRegion *)opaque; + uint32_t r_pio = (unsigned long)r_access->r_virtbase Should be target_ulong if it's a target virtual address. + + (addr - r_access->e_physbase); + + if (r_access->debug & DEVICE_ASSIGNMENT_DEBUG_PIO) { + fprintf(stderr, "%s: r_pio=%08x e_physbase=%08x" + " r_virtbase=%08lx value=%08x\n", + __func__, r_pio, (int)r_access->e_physbase, + (unsigned long)r_access->r_virtbase, value); + } + iopl(3); + outb(value, r_pio); The formatting is wrong for this entire file. Also, you shouldn't have device specific debug. Should probably error check iopl(3). It's not necessary to call it every time you do an outb, just once when initialized. +static void assigned_dev_iomem_map(PCIDevice *pci_dev, int region_num, +uint32_t e_phys, uint32_t e_size, int type) +{ + AssignedDevi
[PATCH 5/7] KVM/userspace: Device Assignment: Support for assigning PCI devices to guests
From: Or Sagi <[EMAIL PROTECTED]> From: Nir Peleg <[EMAIL PROTECTED]> From: Amit Shah <[EMAIL PROTECTED]> From: Ben-Ami Yassour <[EMAIL PROTECTED]> From: Weidong Han <[EMAIL PROTECTED]> From: Glauber de Oliveira Costa <[EMAIL PROTECTED]> With this patch, we can assign a device on the host machine to a guest. A new command-line option, -pcidevice is added. For example, to invoke it for a device sitting at PCI bus:dev.fn 04:08.0, use this: -pcidevice host=04:08.0 * The host driver for the device, if any, is to be removed before assigning the device (else device assignment will fail). * A device that shares IRQ with another host device cannot currently be assigned. This works only with the in-kernel irqchip method; to use the userspace irqchip, a kernel module (irqhook) and some extra changes are needed. Signed-off-by: Amit Shah <[EMAIL PROTECTED]> --- qemu/Makefile.target|1 + qemu/hw/device-assignment.c | 665 +++ qemu/hw/device-assignment.h | 93 ++ qemu/hw/pc.c|9 + qemu/hw/pci.c |7 + qemu/vl.c | 18 ++ 6 files changed, 793 insertions(+), 0 deletions(-) create mode 100644 qemu/hw/device-assignment.c create mode 100644 qemu/hw/device-assignment.h diff --git a/qemu/Makefile.target b/qemu/Makefile.target index 72f3db8..40eb273 100644 --- a/qemu/Makefile.target +++ b/qemu/Makefile.target @@ -616,6 +616,7 @@ OBJS+= ide.o pckbd.o ps2.o vga.o $(SOUND_HW) dma.o OBJS+= fdc.o mc146818rtc.o serial.o i8259.o i8254.o pcspk.o pc.o OBJS+= cirrus_vga.o apic.o parallel.o acpi.o piix_pci.o OBJS+= usb-uhci.o vmmouse.o vmport.o vmware_vga.o extboot.o +OBJS+= device-assignment.o ifeq ($(USE_KVM_PIT), 1) OBJS+= i8254-kvm.o endif diff --git a/qemu/hw/device-assignment.c b/qemu/hw/device-assignment.c new file mode 100644 index 000..e70daf2 --- /dev/null +++ b/qemu/hw/device-assignment.c @@ -0,0 +1,665 @@ +/* + * Copyright (c) 2007, Neocleus Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., 59 Temple + * Place - Suite 330, Boston, MA 02111-1307 USA. + * + * + * Assign a PCI device from the host to a guest VM. + * + * Adapted for KVM by Qumranet. + * + * Copyright (c) 2007, Neocleus, Alex Novik ([EMAIL PROTECTED]) + * Copyright (c) 2007, Neocleus, Guy Zana ([EMAIL PROTECTED]) + * Copyright (C) 2008, Qumranet, Amit Shah ([EMAIL PROTECTED]) + * Copyright (C) 2008, Red Hat, Amit Shah ([EMAIL PROTECTED]) + */ +#include +#include +#include "qemu-kvm.h" +#include +#include "device-assignment.h" + +/* From linux/ioport.h */ +#define IORESOURCE_IO 0x0100 /* Resource type */ +#define IORESOURCE_MEM 0x0200 +#define IORESOURCE_IRQ 0x0400 +#define IORESOURCE_DMA 0x0800 +#define IORESOURCE_PREFETCH0x1000 /* No side effects */ + +/* #define DEVICE_ASSIGNMENT_DEBUG */ + +#ifdef DEVICE_ASSIGNMENT_DEBUG +#define DEBUG(fmt, args...) fprintf(stderr, "%s: " fmt, __func__ , ## args) +#else +#define DEBUG(fmt, args...) +#endif + +static void assigned_dev_ioport_writeb(void *opaque, uint32_t addr, + uint32_t value) +{ + AssignedDevRegion *r_access = (AssignedDevRegion *)opaque; + uint32_t r_pio = (unsigned long)r_access->r_virtbase + + (addr - r_access->e_physbase); + + if (r_access->debug & DEVICE_ASSIGNMENT_DEBUG_PIO) { + fprintf(stderr, "%s: r_pio=%08x e_physbase=%08x" + " r_virtbase=%08lx value=%08x\n", + __func__, r_pio, (int)r_access->e_physbase, + (unsigned long)r_access->r_virtbase, value); + } + iopl(3); + outb(value, r_pio); +} + +static void assigned_dev_ioport_writew(void *opaque, uint32_t addr, + uint32_t value) +{ + AssignedDevRegion *r_access = (AssignedDevRegion *)opaque; + uint32_t r_pio = (unsigned long)r_access->r_virtbase + + (addr - r_access->e_physbase); + + if (r_access->debug & DEVICE_ASSIGNMENT_DEBUG_PIO) { + fprintf(stderr, "%s: r_pio=%08x e_physbase=%08x" + " r_virtbase=%08lx value=%08x\n", + __func__, r_pio, (int)r_access->e_physbase, + (unsigned long)r_access->r_virtbase, value); + } + iopl(3); + outw(value, r_