Re: [PATCH 5/7] KVM/userspace: Device Assignment: Support for assigning PCI devices to guests

2008-09-25 Thread Yang, Sheng
On Tuesday 23 September 2008 22:54:53 Amit Shah wrote:
> +static uint32_t assigned_dev_pci_read_config(PCIDevice *d, uint32_t
> address, +int len)
> +{
> +   uint32_t val = 0;
> +   int fd, r;
> +
> +   if ((address >= 0x10 && address <= 0x24) || address == 0x34 ||
> +   address == 0x3c || address == 0x3d) {
> +   val = pci_default_read_config(d, address, len);
> +   DEBUG("(%x.%x): address=%04x val=0x%08x len=%d\n",
> + (d->devfn >> 3) & 0x1F, (d->devfn & 0x7), address,
> val, + len);
> +   return val;
> +   }
> +
> +   /* vga specific, remove later */
> +   if (address == 0xFC)
> +   goto do_log;
> +
> +   fd = ((AssignedDevice *)d)->real_device.config_fd;
> +   r = lseek(fd, address, SEEK_SET);
> +   if (r < 0) {
> +   fprintf(stderr, "%s: bad seek, errno = %d\n",
> +   __func__, errno);
> +   return val;
> +   }

This read from configuration space method got a little trouble: vender id and 
device id read from configuration space directly rather than "vender" 
and "device" file in the sysfs. That's cause trouble with some device that 
configuration space inconsistent with "vender" and "device" file, e.g. some 
fix up by host PCI subsystem in kernel. 

Maybe it can be delay a little for a following patch, but we should address 
this issue... Maybe we can use libpci? There are more fields than vender and 
device got this problem, like "irq".

--
regards
Yang, Sheng
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 5/7] KVM/userspace: Device Assignment: Support for assigning PCI devices to guests

2008-09-24 Thread Yang, Sheng
On Thursday 25 September 2008 12:54:46 Yang, Sheng wrote:
> On Tuesday 23 September 2008 22:54:53 Amit Shah wrote:
> > From: Or Sagi <[EMAIL PROTECTED]>
> > From: Nir Peleg <[EMAIL PROTECTED]>
> > From: Amit Shah <[EMAIL PROTECTED]>
> > From: Ben-Ami Yassour <[EMAIL PROTECTED]>
> > From: Weidong Han <[EMAIL PROTECTED]>
> > From: Glauber de Oliveira Costa <[EMAIL PROTECTED]>
> >
> > With this patch, we can assign a device on the host machine to a
> > guest.
> >
> > A new command-line option, -pcidevice is added.
> > For example, to invoke it for a device sitting at PCI bus:dev.fn
> > 04:08.0, use this:
> >
> > -pcidevice host=04:08.0
> >
> > * The host driver for the device, if any, is to be removed before
> > assigning the device (else device assignment will fail).
> >
> > * A device that shares IRQ with another host device cannot currently
> > be assigned.
> >
> > This works only with the in-kernel irqchip method; to use the
> > userspace irqchip, a kernel module (irqhook) and some extra changes
> > are needed.
>
> Hi Amit
>
> I am afraid I got this when try to enable VT-d.
>
> create_userspace_phys_mem: Invalid argument
> assigned_dev_iomem_map: Error: create new mapping failed
>
> Can you have a look at it? (and the patch you sent to Weidong don't got
> this problem.)

Oh, Weidong's patch "[PATCH] VT-d: Fix iommu map page for mmio pages" fix it. 
--
regards
Yang, Sheng
>
> Thanks.
> --
> regards
> Yang, Sheng
>
> > Signed-off-by: Amit Shah <[EMAIL PROTECTED]>
> > ---
> >  qemu/Makefile.target|1 +
> >  qemu/hw/device-assignment.c |  665
> > +++ qemu/hw/device-assignment.h |
> > 93 ++
> >  qemu/hw/pc.c|9 +
> >  qemu/hw/pci.c   |7 +
> >  qemu/vl.c   |   18 ++
> >  6 files changed, 793 insertions(+), 0 deletions(-)
> >  create mode 100644 qemu/hw/device-assignment.c
> >  create mode 100644 qemu/hw/device-assignment.h
> >
> > diff --git a/qemu/Makefile.target b/qemu/Makefile.target
> > index 72f3db8..40eb273 100644
> > --- a/qemu/Makefile.target
> > +++ b/qemu/Makefile.target
> > @@ -616,6 +616,7 @@ OBJS+= ide.o pckbd.o ps2.o vga.o $(SOUND_HW) dma.o
> >  OBJS+= fdc.o mc146818rtc.o serial.o i8259.o i8254.o pcspk.o pc.o
> >  OBJS+= cirrus_vga.o apic.o parallel.o acpi.o piix_pci.o
> >  OBJS+= usb-uhci.o vmmouse.o vmport.o vmware_vga.o extboot.o
> > +OBJS+= device-assignment.o
> >  ifeq ($(USE_KVM_PIT), 1)
> >  OBJS+= i8254-kvm.o
> >  endif
> > diff --git a/qemu/hw/device-assignment.c b/qemu/hw/device-assignment.c
> > new file mode 100644
> > index 000..e70daf2
> > --- /dev/null
> > +++ b/qemu/hw/device-assignment.c
> > @@ -0,0 +1,665 @@
> > +/*
> > + * Copyright (c) 2007, Neocleus Corporation.
> > + *
> > + * This program is free software; you can redistribute it and/or modify
> > it + * under the terms and conditions of the GNU General Public License,
> > + * version 2, as published by the Free Software Foundation.
> > + *
> > + * This program is distributed in the hope it will be useful, but
> > WITHOUT + * ANY WARRANTY; without even the implied warranty of
> > MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> > General Public License for + * more details.
> > + *
> > + * You should have received a copy of the GNU General Public License
> > along with + * this program; if not, write to the Free Software
> > Foundation, Inc., 59 Temple + * Place - Suite 330, Boston, MA 02111-1307
> > USA.
> > + *
> > + *
> > + *  Assign a PCI device from the host to a guest VM.
> > + *
> > + *  Adapted for KVM by Qumranet.
> > + *
> > + *  Copyright (c) 2007, Neocleus, Alex Novik ([EMAIL PROTECTED])
> > + *  Copyright (c) 2007, Neocleus, Guy Zana ([EMAIL PROTECTED])
> > + *  Copyright (C) 2008, Qumranet, Amit Shah ([EMAIL PROTECTED])
> > + *  Copyright (C) 2008, Red Hat, Amit Shah ([EMAIL PROTECTED])
> > + */
> > +#include 
> > +#include 
> > +#include "qemu-kvm.h"
> > +#include 
> > +#include "device-assignment.h"
> > +
> > +/* From linux/ioport.h */
> > +#define IORESOURCE_IO  0x0100  /* Resource type */
> > +#define IORESOURCE_MEM 0x0200
> > +#define IORESOURCE_IRQ 0x0400
> > +#define IORESOURCE_DMA 0x0800
> > +#define IORESOURCE_PREFETCH0x1000  /* No side effects */
> > +
> > +/* #define DEVICE_ASSIGNMENT_DEBUG */
> > +
> > +#ifdef DEVICE_ASSIGNMENT_DEBUG
> > +#define DEBUG(fmt, args...) fprintf(stderr, "%s: " fmt, __func__ , ##
> > args) +#else
> > +#define DEBUG(fmt, args...)
> > +#endif
> > +
> > +static void assigned_dev_ioport_writeb(void *opaque, uint32_t addr,
> > +  uint32_t value)
> > +{
> > +   AssignedDevRegion *r_access = (AssignedDevRegion *)opaque;
> > +   uint32_t r_pio = (unsigned long)r_access->r_virtbase
> > +   + (addr - r_access->e_physbase);
> > +
> > +   if (r_access->debug & DEVICE_ASSIGNMENT_DEBUG_PIO) {
> > +   fprintf(stderr

Re: [PATCH 5/7] KVM/userspace: Device Assignment: Support for assigning PCI devices to guests

2008-09-24 Thread Yang, Sheng
On Tuesday 23 September 2008 22:54:53 Amit Shah wrote:
> From: Or Sagi <[EMAIL PROTECTED]>
> From: Nir Peleg <[EMAIL PROTECTED]>
> From: Amit Shah <[EMAIL PROTECTED]>
> From: Ben-Ami Yassour <[EMAIL PROTECTED]>
> From: Weidong Han <[EMAIL PROTECTED]>
> From: Glauber de Oliveira Costa <[EMAIL PROTECTED]>
>
> With this patch, we can assign a device on the host machine to a
> guest.
>
> A new command-line option, -pcidevice is added.
> For example, to invoke it for a device sitting at PCI bus:dev.fn
> 04:08.0, use this:
>
> -pcidevice host=04:08.0
>
> * The host driver for the device, if any, is to be removed before
> assigning the device (else device assignment will fail).
>
> * A device that shares IRQ with another host device cannot currently
> be assigned.
>
> This works only with the in-kernel irqchip method; to use the
> userspace irqchip, a kernel module (irqhook) and some extra changes
> are needed.
>

Hi Amit

I am afraid I got this when try to enable VT-d.

create_userspace_phys_mem: Invalid argument
assigned_dev_iomem_map: Error: create new mapping failed

Can you have a look at it? (and the patch you sent to Weidong don't got this 
problem.)

Thanks.
--
regards
Yang, Sheng

> Signed-off-by: Amit Shah <[EMAIL PROTECTED]>
> ---
>  qemu/Makefile.target|1 +
>  qemu/hw/device-assignment.c |  665
> +++ qemu/hw/device-assignment.h |  
> 93 ++
>  qemu/hw/pc.c|9 +
>  qemu/hw/pci.c   |7 +
>  qemu/vl.c   |   18 ++
>  6 files changed, 793 insertions(+), 0 deletions(-)
>  create mode 100644 qemu/hw/device-assignment.c
>  create mode 100644 qemu/hw/device-assignment.h
>
> diff --git a/qemu/Makefile.target b/qemu/Makefile.target
> index 72f3db8..40eb273 100644
> --- a/qemu/Makefile.target
> +++ b/qemu/Makefile.target
> @@ -616,6 +616,7 @@ OBJS+= ide.o pckbd.o ps2.o vga.o $(SOUND_HW) dma.o
>  OBJS+= fdc.o mc146818rtc.o serial.o i8259.o i8254.o pcspk.o pc.o
>  OBJS+= cirrus_vga.o apic.o parallel.o acpi.o piix_pci.o
>  OBJS+= usb-uhci.o vmmouse.o vmport.o vmware_vga.o extboot.o
> +OBJS+= device-assignment.o
>  ifeq ($(USE_KVM_PIT), 1)
>  OBJS+= i8254-kvm.o
>  endif
> diff --git a/qemu/hw/device-assignment.c b/qemu/hw/device-assignment.c
> new file mode 100644
> index 000..e70daf2
> --- /dev/null
> +++ b/qemu/hw/device-assignment.c
> @@ -0,0 +1,665 @@
> +/*
> + * Copyright (c) 2007, Neocleus Corporation.
> + *
> + * This program is free software; you can redistribute it and/or modify it
> + * under the terms and conditions of the GNU General Public License,
> + * version 2, as published by the Free Software Foundation.
> + *
> + * This program is distributed in the hope it will be useful, but WITHOUT
> + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
> + * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
> for + * more details.
> + *
> + * You should have received a copy of the GNU General Public License along
> with + * this program; if not, write to the Free Software Foundation, Inc.,
> 59 Temple + * Place - Suite 330, Boston, MA 02111-1307 USA.
> + *
> + *
> + *  Assign a PCI device from the host to a guest VM.
> + *
> + *  Adapted for KVM by Qumranet.
> + *
> + *  Copyright (c) 2007, Neocleus, Alex Novik ([EMAIL PROTECTED])
> + *  Copyright (c) 2007, Neocleus, Guy Zana ([EMAIL PROTECTED])
> + *  Copyright (C) 2008, Qumranet, Amit Shah ([EMAIL PROTECTED])
> + *  Copyright (C) 2008, Red Hat, Amit Shah ([EMAIL PROTECTED])
> + */
> +#include 
> +#include 
> +#include "qemu-kvm.h"
> +#include 
> +#include "device-assignment.h"
> +
> +/* From linux/ioport.h */
> +#define IORESOURCE_IO  0x0100  /* Resource type */
> +#define IORESOURCE_MEM 0x0200
> +#define IORESOURCE_IRQ 0x0400
> +#define IORESOURCE_DMA 0x0800
> +#define IORESOURCE_PREFETCH0x1000  /* No side effects */
> +
> +/* #define DEVICE_ASSIGNMENT_DEBUG */
> +
> +#ifdef DEVICE_ASSIGNMENT_DEBUG
> +#define DEBUG(fmt, args...) fprintf(stderr, "%s: " fmt, __func__ , ##
> args) +#else
> +#define DEBUG(fmt, args...)
> +#endif
> +
> +static void assigned_dev_ioport_writeb(void *opaque, uint32_t addr,
> +  uint32_t value)
> +{
> +   AssignedDevRegion *r_access = (AssignedDevRegion *)opaque;
> +   uint32_t r_pio = (unsigned long)r_access->r_virtbase
> +   + (addr - r_access->e_physbase);
> +
> +   if (r_access->debug & DEVICE_ASSIGNMENT_DEBUG_PIO) {
> +   fprintf(stderr, "%s: r_pio=%08x e_physbase=%08x"
> +   " r_virtbase=%08lx value=%08x\n",
> +   __func__, r_pio, (int)r_access->e_physbase,
> +   (unsigned long)r_access->r_virtbase, value);
> +   }
> +   iopl(3);
> +   outb(value, r_pio);
> +}
> +
> +static void assigned_dev_ioport_writew(void *opaque, uint32_t addr,
> +

Re: [PATCH 5/7] KVM/userspace: Device Assignment: Support for assigning PCI devices to guests

2008-09-24 Thread Amit Shah
* On Wednesday 24 Sep 2008 20:37:24 Anthony Liguori wrote:
> Amit Shah wrote:
> > * On Tuesday 23 Sep 2008 22:00:32 Anthony Liguori wrote:
> >> Amit Shah wrote:
> >>> diff --git a/qemu/Makefile.target b/qemu/Makefile.target
> >>> index 72f3db8..40eb273 100644
> >>> --- a/qemu/Makefile.target
> >>> +++ b/qemu/Makefile.target
> >>> @@ -616,6 +616,7 @@ OBJS+= ide.o pckbd.o ps2.o vga.o $(SOUND_HW) dma.o
> >>>  OBJS+= fdc.o mc146818rtc.o serial.o i8259.o i8254.o pcspk.o pc.o
> >>>  OBJS+= cirrus_vga.o apic.o parallel.o acpi.o piix_pci.o
> >>>  OBJS+= usb-uhci.o vmmouse.o vmport.o vmware_vga.o extboot.o
> >>> +OBJS+= device-assignment.o
> >>
> >> This needs to be conditional on at least linux hosts, but probably also
> >> kvm support.
> >
> > I didn't see any other file that's doing it. So I added this conditional
> > in vl.c by having a #if defined(__linux__). That's how usb-linux.c does
> > it as well. Is there a better way?
>
> aio and compatfd currently do it this way.  block-raw-win32 and
> block-raw-posix are this way.  We're slowly moving things away from
> #ifdef #else #endif to conditional compilation.

So is

ifdef(CONFIG_LINUX)
OBJS+=..
endif

supposed to work?
(or is it CONFIG_LINUX_USER?)

> > Not the whole functionality needs kvm support. This should be able to
> > work even without kvm (for example, when the guest is 1:1 mapped in the
> > host address space).
>
> KVM is needed for interrupt remapping though.  That's something I don't
> see happening for normal userspace any time soon.

We already have an implementation that I've not submitted. I remember at least 
one user asking to support that functionality (because he wanted to try this 
on powerpc).

> >>> + /* FIXME: Add support for emulated MMIO for non-kvm guests */
> >>> + if (kvm_enabled()) {
> >>
> >> This doesn't work at all if kvm isn't enabled right?  You should
> >> probably bail out in the init if kvm isn't enabled.  If this whole file
> >> is included conditionally based on KVM support, then you don't have to
> >> worry about using kvm_enabled() guards to conditionally compile out
> >> code.
> >
> > Non-kvm support is currently broken and should be fixed, but that can
> > happen after we get this merged.
>
> But it would take bouncing interrupts to userspace?  I don't think that
> will ever happen upstream personally.  At any rate, there's no point in
> even trying to support something like that until progress is made
> upstream on this front.

With 1:1 mapping (Andrea's patches) and --no-kvm-irqchip (or indeed --no-kvm) 
and the userspace interrupt bouncing patches, we can support this.

> > I can temporarily add a check for kvm_enabled and bail out.
> >
> >>> + sprintf(dir, "/sys/bus/pci/devices/:%02x:%02x.%x/",
> >>> + r_bus, r_dev, r_func);
> >>
> >> snprintf()
> >
> > It's guarded by the %02x modifiers; so this doesn't depend on user input.
>
> strcpy or sprintf should never be used.  It doesn't matter if it's safe
> in a particular instance.  There are safer functions to use (like
> snprintf).

Hmm, qemu is littered with such strcpy()s though. I'll change this.

> All it takes is for someone to come along and change the /sys/bus path
> to be larger without adjusting the buffer size and everything goes to
> hell.  It's inherently brittle.
>
> >>> + fprintf(stderr, "Registered host PCI device %02x:%02x.%1x "
> >>> + "(\"%s\") as guest device %02x:%02x.%1x\n",
> >>> + r_bus, r_dev, r_func, e_dev_name,
> >>> + pci_bus_num(e_bus), e_device, r_func);
> >>
> >> Please don't fprintf() unconditionally.
> >
> > OK; however, a vmdk file open does that so I though it was alright to do
> > it.
>
> I obviously don't use vmdk or else I would have removed that by now :-)
>
> >> A lot more checks are needed here to see if things can succeed.  We
> >> definitely should bail out if they can't.
> >
> > Bailing out is done in the out: label below. What else do  you think can
> > fail? I've taken care of all the cases that do fail IMO.
> >
> >>> + return pci_dev;
> >>> +out:
> >>> + pci_unregister_device(&pci_dev->dev);
> >>> + return NULL;
> >>> +}
> >>>
> >>>
> >>>
> >>> +/*
> >>> + * Syntax to assign device:
> >>> + *
> >>> + * -pcidevice dev=bus:dev.func,dma=dma
> >>> + *
> >>> + * Example:
> >>> + * -pcidevice host=00:13.0,dma=pvdma
> >>> + *
> >>> + * dma can currently only be 'none' to disable iommu support.
> >>
> >> Does it actually work if you disable iommu support?
> >
> > If the guest is 1:1 mapped.
>
> You mean with Andrea's reserved ram patches?

Yes.

> >>> +#include 
> >>
> >> Don't think this is needed here.
> >
> > We use mmap(), so this is needed.
>
> Ah.
>
> >>> +/* Initialize assigned devices */
> >>> +if (pci_enabled) {
> >>> +int r = -1;
> >>> +do {
> >>> +init_assigned_device(pci_bus, &r);
> >>
> >> Why pass r by reference instead of just returning it?  At any rate, you
> >> should detect when this fails and gracefully terminate QEMU.
> >
> > 'r' is the

Re: [PATCH 5/7] KVM/userspace: Device Assignment: Support for assigning PCI devices to guests

2008-09-24 Thread Anthony Liguori

Amit Shah wrote:

* On Tuesday 23 Sep 2008 22:00:32 Anthony Liguori wrote:
  

Amit Shah wrote:



  

diff --git a/qemu/Makefile.target b/qemu/Makefile.target
index 72f3db8..40eb273 100644
--- a/qemu/Makefile.target
+++ b/qemu/Makefile.target
@@ -616,6 +616,7 @@ OBJS+= ide.o pckbd.o ps2.o vga.o $(SOUND_HW) dma.o
 OBJS+= fdc.o mc146818rtc.o serial.o i8259.o i8254.o pcspk.o pc.o
 OBJS+= cirrus_vga.o apic.o parallel.o acpi.o piix_pci.o
 OBJS+= usb-uhci.o vmmouse.o vmport.o vmware_vga.o extboot.o
+OBJS+= device-assignment.o
  

This needs to be conditional on at least linux hosts, but probably also
kvm support.



I didn't see any other file that's doing it. So I added this conditional in 
vl.c by having a #if defined(__linux__). That's how usb-linux.c does it as 
well. Is there a better way?
  


aio and compatfd currently do it this way.  block-raw-win32 and 
block-raw-posix are this way.  We're slowly moving things away from 
#ifdef #else #endif to conditional compilation.


Not the whole functionality needs kvm support. This should be able to work 
even without kvm (for example, when the guest is 1:1 mapped in the host 
address space).
  


KVM is needed for interrupt remapping though.  That's something I don't 
see happening for normal userspace any time soon.



+   /* FIXME: Add support for emulated MMIO for non-kvm guests */
+   if (kvm_enabled()) {
  

This doesn't work at all if kvm isn't enabled right?  You should
probably bail out in the init if kvm isn't enabled.  If this whole file
is included conditionally based on KVM support, then you don't have to
worry about using kvm_enabled() guards to conditionally compile out code.



Non-kvm support is currently broken and should be fixed, but that can happen 
after we get this merged.
  


But it would take bouncing interrupts to userspace?  I don't think that 
will ever happen upstream personally.  At any rate, there's no point in 
even trying to support something like that until progress is made 
upstream on this front.



I can temporarily add a check for kvm_enabled and bail out.

  

+   sprintf(dir, "/sys/bus/pci/devices/:%02x:%02x.%x/",
+   r_bus, r_dev, r_func);
  

snprintf()



It's guarded by the %02x modifiers; so this doesn't depend on user input.
  


strcpy or sprintf should never be used.  It doesn't matter if it's safe 
in a particular instance.  There are safer functions to use (like snprintf).


All it takes is for someone to come along and change the /sys/bus path 
to be larger without adjusting the buffer size and everything goes to 
hell.  It's inherently brittle.



+   fprintf(stderr, "Registered host PCI device %02x:%02x.%1x "
+   "(\"%s\") as guest device %02x:%02x.%1x\n",
+   r_bus, r_dev, r_func, e_dev_name,
+   pci_bus_num(e_bus), e_device, r_func);
  

Please don't fprintf() unconditionally.



OK; however, a vmdk file open does that so I though it was alright to do it.
  


I obviously don't use vmdk or else I would have removed that by now :-)


A lot more checks are needed here to see if things can succeed.  We
definitely should bail out if they can't.



Bailing out is done in the out: label below. What else do  you think can fail? 
I've taken care of all the cases that do fail IMO.


  

+   return pci_dev;
+out:
+   pci_unregister_device(&pci_dev->dev);
+   return NULL;
+}
  


  

+/*
+ * Syntax to assign device:
+ *
+ * -pcidevice dev=bus:dev.func,dma=dma
+ *
+ * Example:
+ * -pcidevice host=00:13.0,dma=pvdma
+ *
+ * dma can currently only be 'none' to disable iommu support.
  

Does it actually work if you disable iommu support?



If the guest is 1:1 mapped.
  


You mean with Andrea's reserved ram patches?


+#include 
  

Don't think this is needed here.



We use mmap(), so this is needed.
  


Ah.


+/* Initialize assigned devices */
+if (pci_enabled) {
+int r = -1;
+do {
+init_assigned_device(pci_bus, &r);
  

Why pass r by reference instead of just returning it?  At any rate, you
should detect when this fails and gracefully terminate QEMU.



'r' is the count of the number of assigned devices -- mostly needed because we 
have the data stored in an array. If we migrate to a list, this can be 
relaxed.


ATM, I start the guest without assigning the device. I haven't figured out a 
way to gracefully terminate qemu yet.
  


In the case of hot plug, you fail the hot plug.  If you start with 
device assignment, just doing an "exit" would be sufficient.



+#if defined(TARGET_I386) || defined(TARGET_X86_64) || defined(__linux__)
+   case QEMU_OPTION_pcidevice:
+   add_assigned_device(optarg);
  

You should copy into an array, then in pc.c, iterate through the array
and call into add_assigned_device.



Is there any benefit in doing this? We're moving the iterate out of vl.c to 
pc.c and both will happe

Re: [PATCH 5/7] KVM/userspace: Device Assignment: Support for assigning PCI devices to guests

2008-09-23 Thread Amit Shah
* On Tuesday 23 Sep 2008 22:00:32 Anthony Liguori wrote:
> Amit Shah wrote:

> > diff --git a/qemu/Makefile.target b/qemu/Makefile.target
> > index 72f3db8..40eb273 100644
> > --- a/qemu/Makefile.target
> > +++ b/qemu/Makefile.target
> > @@ -616,6 +616,7 @@ OBJS+= ide.o pckbd.o ps2.o vga.o $(SOUND_HW) dma.o
> >  OBJS+= fdc.o mc146818rtc.o serial.o i8259.o i8254.o pcspk.o pc.o
> >  OBJS+= cirrus_vga.o apic.o parallel.o acpi.o piix_pci.o
> >  OBJS+= usb-uhci.o vmmouse.o vmport.o vmware_vga.o extboot.o
> > +OBJS+= device-assignment.o
>
> This needs to be conditional on at least linux hosts, but probably also
> kvm support.

I didn't see any other file that's doing it. So I added this conditional in 
vl.c by having a #if defined(__linux__). That's how usb-linux.c does it as 
well. Is there a better way?

Not the whole functionality needs kvm support. This should be able to work 
even without kvm (for example, when the guest is 1:1 mapped in the host 
address space).

> > +   /* FIXME: Add support for emulated MMIO for non-kvm guests */
> > +   if (kvm_enabled()) {
>
> This doesn't work at all if kvm isn't enabled right?  You should
> probably bail out in the init if kvm isn't enabled.  If this whole file
> is included conditionally based on KVM support, then you don't have to
> worry about using kvm_enabled() guards to conditionally compile out code.

Non-kvm support is currently broken and should be fixed, but that can happen 
after we get this merged.

I can temporarily add a check for kvm_enabled and bail out.

> > +   sprintf(dir, "/sys/bus/pci/devices/:%02x:%02x.%x/",
> > +   r_bus, r_dev, r_func);
>
> snprintf()

It's guarded by the %02x modifiers; so this doesn't depend on user input.

> > +   strcpy(name, dir);
> > +   strcat(name, "config");
>
> snprintf()

... and as a result, all these don't depend on user input.

> > +#defineMAX_ASSIGNED_DEVS 4
> > +struct {
> > +   char name[15];
> > +   int bus;
> > +   int dev;
> > +   int func;
> > +   AssignedDevice *assigned_dev;
> > +} assigned_devices[MAX_ASSIGNED_DEVS];
>
> Any reason not to just use a list here?  sys-queue.h makes that very easy.

> > +   fprintf(stderr, "Registered host PCI device %02x:%02x.%1x "
> > +   "(\"%s\") as guest device %02x:%02x.%1x\n",
> > +   r_bus, r_dev, r_func, e_dev_name,
> > +   pci_bus_num(e_bus), e_device, r_func);
>
> Please don't fprintf() unconditionally.

OK; however, a vmdk file open does that so I though it was alright to do it.

> A lot more checks are needed here to see if things can succeed.  We
> definitely should bail out if they can't.

Bailing out is done in the out: label below. What else do  you think can fail? 
I've taken care of all the cases that do fail IMO.

> > +   return pci_dev;
> > +out:
> > +   pci_unregister_device(&pci_dev->dev);
> > +   return NULL;
> > +}

> > +/*
> > + * Syntax to assign device:
> > + *
> > + * -pcidevice dev=bus:dev.func,dma=dma
> > + *
> > + * Example:
> > + * -pcidevice host=00:13.0,dma=pvdma
> > + *
> > + * dma can currently only be 'none' to disable iommu support.
>
> Does it actually work if you disable iommu support?

If the guest is 1:1 mapped.


> > diff --git a/qemu/hw/device-assignment.h b/qemu/hw/device-assignment.h
> > new file mode 100644
> > index 000..b77e484
> > --- /dev/null
> > +++ b/qemu/hw/device-assignment.h

> > +#include 
>
> Don't think this is needed here.

We use mmap(), so this is needed.

> > +#include "qemu-common.h"
> > +#include "pci.h"
> > +#include 
>
> Nor this.

This isn't.

> > diff --git a/qemu/hw/pc.c b/qemu/hw/pc.c
> > index 6053103..4a611cc 100644
> > --- a/qemu/hw/pc.c
> > +++ b/qemu/hw/pc.c

> > +/* Initialize assigned devices */
> > +if (pci_enabled) {
> > +int r = -1;
> > +do {
> > +init_assigned_device(pci_bus, &r);
>
> Why pass r by reference instead of just returning it?  At any rate, you
> should detect when this fails and gracefully terminate QEMU.

'r' is the count of the number of assigned devices -- mostly needed because we 
have the data stored in an array. If we migrate to a list, this can be 
relaxed.

ATM, I start the guest without assigning the device. I haven't figured out a 
way to gracefully terminate qemu yet.


> > --- a/qemu/vl.c
> > +++ b/qemu/vl.c

> > +#if defined(TARGET_I386) || defined(TARGET_X86_64) || defined(__linux__)
> > +   case QEMU_OPTION_pcidevice:
> > +   add_assigned_device(optarg);
>
> You should copy into an array, then in pc.c, iterate through the array
> and call into add_assigned_device.

Is there any benefit in doing this? We're moving the iterate out of vl.c to 
pc.c and both will happen at the same time.

>
> Regards,
>
> Anthony Liguori

Thanks!

Amit.
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 5/7] KVM/userspace: Device Assignment: Support for assigning PCI devices to guests

2008-09-23 Thread Anthony Liguori

Muli Ben-Yehuda wrote:

On Tue, Sep 23, 2008 at 02:18:51PM -0500, Anthony Liguori wrote:

  

Make sure you issue iopl() before any of the VCPU threads are
spawned.  Otherwise, you may be running into issues when something
other than VCPU-0 is doing PIO/MMIO and you haven't iopl()'d for
that thread.



Yeah, we thought of that, but as far as I can recall this happened
with a single VCPU. In any case, we'll look into it again.
  


The io thread runs in a separate thread than the VCPU.  So if you were 
doing iopl(3) in the io thread (for instance, when called from 
machine_init), then the VCPU thread wouldn't necessarily have inherited 
the iopl().


You could also do it explicitly on vcpu thread construction.

Regards,

Anthony Liguori


Cheers,
Muli
  


--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 5/7] KVM/userspace: Device Assignment: Support for assigning PCI devices to guests

2008-09-23 Thread Muli Ben-Yehuda
On Tue, Sep 23, 2008 at 02:18:51PM -0500, Anthony Liguori wrote:

> Make sure you issue iopl() before any of the VCPU threads are
> spawned.  Otherwise, you may be running into issues when something
> other than VCPU-0 is doing PIO/MMIO and you haven't iopl()'d for
> that thread.

Yeah, we thought of that, but as far as I can recall this happened
with a single VCPU. In any case, we'll look into it again.

Cheers,
Muli
-- 
The First Workshop on I/O Virtualization (WIOV '08)
Dec 2008, San Diego, CA, http://www.usenix.org/wiov08/
  xxx
SYSTOR 2009---The Israeli Experimental Systems Conference
http://www.haifa.il.ibm.com/conferences/systor2009/
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 5/7] KVM/userspace: Device Assignment: Support for assigning PCI devices to guests

2008-09-23 Thread Anthony Liguori

Muli Ben-Yehuda wrote:

On Tue, Sep 23, 2008 at 11:30:32AM -0500, Anthony Liguori wrote:
  

+   + (addr - r_access->e_physbase);
+
+   if (r_access->debug & DEVICE_ASSIGNMENT_DEBUG_PIO) {
+   fprintf(stderr, "%s: r_pio=%08x e_physbase=%08x"
+   " r_virtbase=%08lx value=%08x\n",
+   __func__, r_pio, (int)r_access->e_physbase,
+   (unsigned long)r_access->r_virtbase, value);
+   }
+   iopl(3);
+   outb(value, r_pio);
  
  

The formatting is wrong for this entire file.  Also, you shouldn't
have device specific debug.  Should probably error check iopl(3).
It's not necessary to call it every time you do an outb, just once
when initialized.



We tried that at first, but ran into cases where even after iopl()
ran, pio's from qemu still failed. Does qemu do anything to drop
iopl() privileges? In any case calling iopl() unconditionally on every
pio fixed it, but is obviously not the right long-term solution.
  


Make sure you issue iopl() before any of the VCPU threads are spawned.  
Otherwise, you may be running into issues when something other than 
VCPU-0 is doing PIO/MMIO and you haven't iopl()'d for that thread.


Regards,

Anthony Liguori


Cheers,
Muli
  


--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 5/7] KVM/userspace: Device Assignment: Support for assigning PCI devices to guests

2008-09-23 Thread Muli Ben-Yehuda
On Tue, Sep 23, 2008 at 11:30:32AM -0500, Anthony Liguori wrote:
>
>> ++ (addr - r_access->e_physbase);
>> +
>> +if (r_access->debug & DEVICE_ASSIGNMENT_DEBUG_PIO) {
>> +fprintf(stderr, "%s: r_pio=%08x e_physbase=%08x"
>> +" r_virtbase=%08lx value=%08x\n",
>> +__func__, r_pio, (int)r_access->e_physbase,
>> +(unsigned long)r_access->r_virtbase, value);
>> +}
>> +iopl(3);
>> +outb(value, r_pio);
>>   
>
> The formatting is wrong for this entire file.  Also, you shouldn't
> have device specific debug.  Should probably error check iopl(3).
> It's not necessary to call it every time you do an outb, just once
> when initialized.

We tried that at first, but ran into cases where even after iopl()
ran, pio's from qemu still failed. Does qemu do anything to drop
iopl() privileges? In any case calling iopl() unconditionally on every
pio fixed it, but is obviously not the right long-term solution.

Cheers,
Muli
-- 
The First Workshop on I/O Virtualization (WIOV '08)
Dec 2008, San Diego, CA, http://www.usenix.org/wiov08/
  xxx
SYSTOR 2009---The Israeli Experimental Systems Conference
http://www.haifa.il.ibm.com/conferences/systor2009/
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 5/7] KVM/userspace: Device Assignment: Support for assigning PCI devices to guests

2008-09-23 Thread Anthony Liguori

Amit Shah wrote:

From: Or Sagi <[EMAIL PROTECTED]>
From: Nir Peleg <[EMAIL PROTECTED]>
From: Amit Shah <[EMAIL PROTECTED]>
From: Ben-Ami Yassour <[EMAIL PROTECTED]>
From: Weidong Han <[EMAIL PROTECTED]>
From: Glauber de Oliveira Costa <[EMAIL PROTECTED]>

With this patch, we can assign a device on the host machine to a
guest.

A new command-line option, -pcidevice is added.
For example, to invoke it for a device sitting at PCI bus:dev.fn
04:08.0, use this:

-pcidevice host=04:08.0

* The host driver for the device, if any, is to be removed before
assigning the device (else device assignment will fail).

* A device that shares IRQ with another host device cannot currently
be assigned.

This works only with the in-kernel irqchip method; to use the
userspace irqchip, a kernel module (irqhook) and some extra changes
are needed.

Signed-off-by: Amit Shah <[EMAIL PROTECTED]>
---
 qemu/Makefile.target|1 +
 qemu/hw/device-assignment.c |  665 +++
 qemu/hw/device-assignment.h |   93 ++
 qemu/hw/pc.c|9 +
 qemu/hw/pci.c   |7 +
 qemu/vl.c   |   18 ++
 6 files changed, 793 insertions(+), 0 deletions(-)
 create mode 100644 qemu/hw/device-assignment.c
 create mode 100644 qemu/hw/device-assignment.h

diff --git a/qemu/Makefile.target b/qemu/Makefile.target
index 72f3db8..40eb273 100644
--- a/qemu/Makefile.target
+++ b/qemu/Makefile.target
@@ -616,6 +616,7 @@ OBJS+= ide.o pckbd.o ps2.o vga.o $(SOUND_HW) dma.o
 OBJS+= fdc.o mc146818rtc.o serial.o i8259.o i8254.o pcspk.o pc.o
 OBJS+= cirrus_vga.o apic.o parallel.o acpi.o piix_pci.o
 OBJS+= usb-uhci.o vmmouse.o vmport.o vmware_vga.o extboot.o
+OBJS+= device-assignment.o
  


This needs to be conditional on at least linux hosts, but probably also 
kvm support.



 ifeq ($(USE_KVM_PIT), 1)
 OBJS+= i8254-kvm.o
 endif
diff --git a/qemu/hw/device-assignment.c b/qemu/hw/device-assignment.c
new file mode 100644
index 000..e70daf2
--- /dev/null
+++ b/qemu/hw/device-assignment.c
@@ -0,0 +1,665 @@
+/*
+ * Copyright (c) 2007, Neocleus Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+ * Place - Suite 330, Boston, MA 02111-1307 USA.
+ *
+ *
+ *  Assign a PCI device from the host to a guest VM.
+ *
+ *  Adapted for KVM by Qumranet.
+ *
+ *  Copyright (c) 2007, Neocleus, Alex Novik ([EMAIL PROTECTED])
+ *  Copyright (c) 2007, Neocleus, Guy Zana ([EMAIL PROTECTED])
+ *  Copyright (C) 2008, Qumranet, Amit Shah ([EMAIL PROTECTED])
+ *  Copyright (C) 2008, Red Hat, Amit Shah ([EMAIL PROTECTED])
+ */
+#include 
+#include 
+#include "qemu-kvm.h"
+#include 
+#include "device-assignment.h"
+
+/* From linux/ioport.h */
+#define IORESOURCE_IO  0x0100  /* Resource type */
+#define IORESOURCE_MEM 0x0200
+#define IORESOURCE_IRQ 0x0400
+#define IORESOURCE_DMA 0x0800
+#define IORESOURCE_PREFETCH0x1000  /* No side effects */
+
+/* #define DEVICE_ASSIGNMENT_DEBUG */
+
+#ifdef DEVICE_ASSIGNMENT_DEBUG
+#define DEBUG(fmt, args...) fprintf(stderr, "%s: " fmt, __func__ , ## args)
+#else
+#define DEBUG(fmt, args...)
+#endif
  


Both should be in do { } while(0) to preserve statement semantics.  
Please use C99 variadic macros too.



+static void assigned_dev_ioport_writeb(void *opaque, uint32_t addr,
+  uint32_t value)
+{
+   AssignedDevRegion *r_access = (AssignedDevRegion *)opaque;
+   uint32_t r_pio = (unsigned long)r_access->r_virtbase
  


Should be target_ulong if it's a target virtual address.


+   + (addr - r_access->e_physbase);
+
+   if (r_access->debug & DEVICE_ASSIGNMENT_DEBUG_PIO) {
+   fprintf(stderr, "%s: r_pio=%08x e_physbase=%08x"
+   " r_virtbase=%08lx value=%08x\n",
+   __func__, r_pio, (int)r_access->e_physbase,
+   (unsigned long)r_access->r_virtbase, value);
+   }
+   iopl(3);
+   outb(value, r_pio);
  


The formatting is wrong for this entire file.  Also, you shouldn't have 
device specific debug.  Should probably error check iopl(3).  It's not 
necessary to call it every time you do an outb, just once when initialized.



+static void assigned_dev_iomem_map(PCIDevice *pci_dev, int region_num,
+uint32_t e_phys, uint32_t e_size, int type)
+{
+   AssignedDevi

[PATCH 5/7] KVM/userspace: Device Assignment: Support for assigning PCI devices to guests

2008-09-23 Thread Amit Shah
From: Or Sagi <[EMAIL PROTECTED]>
From: Nir Peleg <[EMAIL PROTECTED]>
From: Amit Shah <[EMAIL PROTECTED]>
From: Ben-Ami Yassour <[EMAIL PROTECTED]>
From: Weidong Han <[EMAIL PROTECTED]>
From: Glauber de Oliveira Costa <[EMAIL PROTECTED]>

With this patch, we can assign a device on the host machine to a
guest.

A new command-line option, -pcidevice is added.
For example, to invoke it for a device sitting at PCI bus:dev.fn
04:08.0, use this:

-pcidevice host=04:08.0

* The host driver for the device, if any, is to be removed before
assigning the device (else device assignment will fail).

* A device that shares IRQ with another host device cannot currently
be assigned.

This works only with the in-kernel irqchip method; to use the
userspace irqchip, a kernel module (irqhook) and some extra changes
are needed.

Signed-off-by: Amit Shah <[EMAIL PROTECTED]>
---
 qemu/Makefile.target|1 +
 qemu/hw/device-assignment.c |  665 +++
 qemu/hw/device-assignment.h |   93 ++
 qemu/hw/pc.c|9 +
 qemu/hw/pci.c   |7 +
 qemu/vl.c   |   18 ++
 6 files changed, 793 insertions(+), 0 deletions(-)
 create mode 100644 qemu/hw/device-assignment.c
 create mode 100644 qemu/hw/device-assignment.h

diff --git a/qemu/Makefile.target b/qemu/Makefile.target
index 72f3db8..40eb273 100644
--- a/qemu/Makefile.target
+++ b/qemu/Makefile.target
@@ -616,6 +616,7 @@ OBJS+= ide.o pckbd.o ps2.o vga.o $(SOUND_HW) dma.o
 OBJS+= fdc.o mc146818rtc.o serial.o i8259.o i8254.o pcspk.o pc.o
 OBJS+= cirrus_vga.o apic.o parallel.o acpi.o piix_pci.o
 OBJS+= usb-uhci.o vmmouse.o vmport.o vmware_vga.o extboot.o
+OBJS+= device-assignment.o
 ifeq ($(USE_KVM_PIT), 1)
 OBJS+= i8254-kvm.o
 endif
diff --git a/qemu/hw/device-assignment.c b/qemu/hw/device-assignment.c
new file mode 100644
index 000..e70daf2
--- /dev/null
+++ b/qemu/hw/device-assignment.c
@@ -0,0 +1,665 @@
+/*
+ * Copyright (c) 2007, Neocleus Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+ * Place - Suite 330, Boston, MA 02111-1307 USA.
+ *
+ *
+ *  Assign a PCI device from the host to a guest VM.
+ *
+ *  Adapted for KVM by Qumranet.
+ *
+ *  Copyright (c) 2007, Neocleus, Alex Novik ([EMAIL PROTECTED])
+ *  Copyright (c) 2007, Neocleus, Guy Zana ([EMAIL PROTECTED])
+ *  Copyright (C) 2008, Qumranet, Amit Shah ([EMAIL PROTECTED])
+ *  Copyright (C) 2008, Red Hat, Amit Shah ([EMAIL PROTECTED])
+ */
+#include 
+#include 
+#include "qemu-kvm.h"
+#include 
+#include "device-assignment.h"
+
+/* From linux/ioport.h */
+#define IORESOURCE_IO  0x0100  /* Resource type */
+#define IORESOURCE_MEM 0x0200
+#define IORESOURCE_IRQ 0x0400
+#define IORESOURCE_DMA 0x0800
+#define IORESOURCE_PREFETCH0x1000  /* No side effects */
+
+/* #define DEVICE_ASSIGNMENT_DEBUG */
+
+#ifdef DEVICE_ASSIGNMENT_DEBUG
+#define DEBUG(fmt, args...) fprintf(stderr, "%s: " fmt, __func__ , ## args)
+#else
+#define DEBUG(fmt, args...)
+#endif
+
+static void assigned_dev_ioport_writeb(void *opaque, uint32_t addr,
+  uint32_t value)
+{
+   AssignedDevRegion *r_access = (AssignedDevRegion *)opaque;
+   uint32_t r_pio = (unsigned long)r_access->r_virtbase
+   + (addr - r_access->e_physbase);
+
+   if (r_access->debug & DEVICE_ASSIGNMENT_DEBUG_PIO) {
+   fprintf(stderr, "%s: r_pio=%08x e_physbase=%08x"
+   " r_virtbase=%08lx value=%08x\n",
+   __func__, r_pio, (int)r_access->e_physbase,
+   (unsigned long)r_access->r_virtbase, value);
+   }
+   iopl(3);
+   outb(value, r_pio);
+}
+
+static void assigned_dev_ioport_writew(void *opaque, uint32_t addr,
+  uint32_t value)
+{
+   AssignedDevRegion *r_access = (AssignedDevRegion *)opaque;
+   uint32_t r_pio = (unsigned long)r_access->r_virtbase
+   + (addr - r_access->e_physbase);
+
+   if (r_access->debug & DEVICE_ASSIGNMENT_DEBUG_PIO) {
+   fprintf(stderr, "%s: r_pio=%08x e_physbase=%08x"
+   " r_virtbase=%08lx value=%08x\n",
+   __func__, r_pio, (int)r_access->e_physbase,
+   (unsigned long)r_access->r_virtbase, value);
+   }
+   iopl(3);
+   outw(value, r_