On Fri, Oct 24, 2008 at 11:22:48AM -0500, Anthony Liguori wrote:

> Amit Shah wrote:

>> +#include <linux/kvm_para.h>
>>   
>
> Is this header really necessary?

No, removed.

>
>> +#include "device-assignment.h"
>> +
>> +/* From linux/ioport.h */
>> +#define IORESOURCE_IO       0x00000100  /* Resource type */
>> +#define IORESOURCE_MEM      0x00000200
>> +#define IORESOURCE_IRQ      0x00000400
>> +#define IORESOURCE_DMA      0x00000800
>> +#define IORESOURCE_PREFETCH 0x00001000  /* No side effects */
>> +
>> +/* #define DEVICE_ASSIGNMENT_DEBUG 1 */
>> +
>> +#ifdef DEVICE_ASSIGNMENT_DEBUG
>> +#define DEBUG(fmt, args...)                               \
>>   
>
> Please use C99 style varidacs.

Done.

>
>> +    do {                                                  \
>> +      fprintf(stderr, "%s: " fmt, __func__ , ## args);    \
>> +    } while (0)
>> +#else
>> +#define DEBUG(fmt, args...) do { } while(0)
>> +#endif
>> +
>> +static void assigned_dev_ioport_writeb(void *opaque, uint32_t addr,
>> +                                       uint32_t value)
>> +{
>> +    AssignedDevRegion *r_access = (AssignedDevRegion *)opaque;
>>   
>
> Cast is unnecessary.

Removed.

>
>> +    uint32_t r_pio = (unsigned long)r_access->r_virtbase
>> +        + (addr - r_access->e_physbase);
>>   
>
> It would be nice to make this a function to make it more obvious that you 
> were translated from guest to host regions.  The cast to unsigned long 
> should probably be target_ulong too.

Done.

>
>> +    DEBUG(stderr, "%s: r_pio=%08x e_physbase=%08x"
>> +          " r_virtbase=%08lx value=%08x\n",
>> +          __func__, r_pio, (int)r_access->e_physbase,
>> +          (unsigned long)r_access->r_virtbase, value);
>>   
>
> This debug statement looks wrong to me.  You're passing stderr.
> It's true for all of these functions.

Fixed.

>
>> +static void assigned_dev_iomem_map(PCIDevice *pci_dev, int region_num,
>> +                                   uint32_t e_phys, uint32_t e_size, int 
>> type)
>> +{
>> +    AssignedDevice *r_dev = (AssignedDevice *) pci_dev;
>> +    AssignedDevRegion *region = &r_dev->v_addrs[region_num];
>> +    int first_map = (region->e_size == 0);
>> +    int ret = 0;
>> +
>> +    DEBUG("%s: e_phys=%08x r_virt=%x type=%d len=%08x region_num=%d \n",
>> +          __func__, e_phys, (uint32_t)region->r_virtbase, type, e_size,
>> +          region_num);
>>   
>
> You already have __func__ in your debug printf().

Fixed.

>
>> +    region->e_physbase = e_phys;
>> +    region->e_size = e_size;
>> +
>> +    /* FIXME: Add support for emulated MMIO for non-kvm guests */
>> +    if (kvm_enabled()) {
>>   
>
> I don't think having a kvm_enabled() check here is very useful.  I
> think device-assignment.c should be conditional on USE_KVM, and the
> only kvm_enabled() check should be when creating the initial device
> assignment.  Practically speaking, QEMU is never going to support
> device assignment outside of the context of KVM because I strongly
> doubt anything like irqhook will make it upstream.

Reworked along your suggestions, please let me know if you have
further comments.

>> +        if (!first_map)
>> +            kvm_destroy_phys_mem(kvm_context, e_phys, e_size);
>> +        if (e_size > 0)
>> +            ret = kvm_register_phys_mem(kvm_context, e_phys,
>> +                                        region->r_virtbase, e_size, 0);
>> +        if (ret != 0)
>> +            fprintf(stderr, "%s: Error: create new mapping failed\n", 
>> __func__);
>>   
>
> If we do get an error here, we shouldn't keep going.  This error is
> probably going to happen in practice if a guest tries to pass
> through too many devices and we run out of slots.

Fixed, we exit(1) now (is there a more graceful to bail out?).

>> +    }
>> +}
>> +
>> +static void assigned_dev_ioport_map(PCIDevice *pci_dev, int region_num,
>> +                                    uint32_t addr, uint32_t size, int 
>> type)
>> +{
>> +    AssignedDevice *r_dev = (AssignedDevice *) pci_dev;
>> +    AssignedDevRegion *region = &r_dev->v_addrs[region_num];
>> +    int r;
>> +
>> +    region->e_physbase = addr;
>> +    region->e_size = size;
>> +
>> +    DEBUG("%s: e_phys=0x%x r_virt=%x type=0x%x len=%d region_num=%d \n",
>> +          __func__, addr, (uint32_t)region->r_virtbase, type, size, 
>> region_num);
>>   
>
> Need to fix this DEBUG().

Fixed.

>
>> +    r = ioperm((uint32_t)region->r_virtbase, size, 1);
>>   
>
> I don't think this is enough for KVM.  This will only do the ioperm
> in the thread that triggered the IO.  If you have an SMP guest,
> ioperm needs to be done on each VCPU's thread.

Fixed.

>> +    if (r < 0) {
>> +        perror("assigned_dev_ioport_map: ioperm");
>> +        return;
>> +    }
>>   
>
> Again, if we fail, we have to exit QEMU gracefully.

Fixed.

>
>> +    register_ioport_read(addr, size, 1, assigned_dev_ioport_readb,
>> +                         (void *) (r_dev->v_addrs + region_num));
>> +    register_ioport_read(addr, size, 2, assigned_dev_ioport_readw,
>> +                         (void *) (r_dev->v_addrs + region_num));
>> +    register_ioport_read(addr, size, 4, assigned_dev_ioport_readl,
>> +                         (void *) (r_dev->v_addrs + region_num));
>> +    register_ioport_write(addr, size, 1, assigned_dev_ioport_writeb,
>> +                          (void *) (r_dev->v_addrs + region_num));
>> +    register_ioport_write(addr, size, 2, assigned_dev_ioport_writew,
>> +                          (void *) (r_dev->v_addrs + region_num));
>> +    register_ioport_write(addr, size, 4, assigned_dev_ioport_writel,
>> +                          (void *) (r_dev->v_addrs + region_num));
>> +}
>>   
>
> You never need to explicitly cast a pointer to void *.

Fixed.

>
>> +static void assigned_dev_pci_write_config(PCIDevice *d, uint32_t address,
>> +                                          uint32_t val, int len)
>> +{
>> +    int fd, r;
>> +
>> +    DEBUG("%s: (%x.%x): address=%04x val=0x%08x len=%d\n",
>> +          __func__, ((d->devfn >> 3) & 0x1F), (d->devfn & 0x7),
>> +          (uint16_t) address, val, len);
>>   
>
> bad DEBUG()

Fixed.

>
>> +    if (address == 0x4) {
>> +        pci_default_write_config(d, address, val, len);
>> +        /* Continue to program the card */
>> +    }
>> +
>> +    if ((address >= 0x10 && address <= 0x24) || address == 0x34 ||
>> +        address == 0x3c || address == 0x3d) {
>> +        /* used for update-mappings (BAR emulation) */
>> +        pci_default_write_config(d, address, val, len);
>> +        return;
>> +    }
>> +    DEBUG("%s: NON BAR (%x.%x): address=%04x val=0x%08x len=%d\n",
>> +          __func__, ((d->devfn >> 3) & 0x1F), (d->devfn & 0x7),
>> +          (uint16_t) address, val, len);
>> +    fd = ((AssignedDevice *)d)->real_device.config_fd;
>> +    r = lseek(fd, address, SEEK_SET);
>> +    if (r < 0) {
>> +        fprintf(stderr, "%s: bad seek, errno = %d\n", __func__, errno);
>> +        return;
>> +    }
>> +again:
>> +    r = write(fd, &val, len);
>>   
>
> Can you just do a pwrite()?  That'll make things simpler.

Fixed.

>
>> +    if (r < 0) {
>> +        if (errno == EINTR || errno == EAGAIN)
>> +            goto again;
>> +        fprintf(stderr, "%s: write failed, errno = %d\n", __func__, 
>> errno);
>> +    }
>> +}
>> +
>> +static uint32_t assigned_dev_pci_read_config(PCIDevice *d, uint32_t 
>> address,
>> +                                             int len)
>> +{
>> +    uint32_t val = 0;
>> +    int fd, r;
>> +
>> +    if ((address >= 0x10 && address <= 0x24) || address == 0x34 ||
>> +        address == 0x3c || address == 0x3d) {
>> +        val = pci_default_read_config(d, address, len);
>> +        DEBUG("(%x.%x): address=%04x val=0x%08x len=%d\n",
>> +              (d->devfn >> 3) & 0x1F, (d->devfn & 0x7), address, val, 
>> len);
>> +        return val;
>> +    }
>> +
>> +    /* vga specific, remove later */
>> +    if (address == 0xFC)
>> +        goto do_log;
>>   
>
> Can you explain the point of this?

No. It appears to exist since the earliest versions of the
patch. Since removing it does modify the behavior, I kept it in for
now pending further investigation.

>> +    fd = ((AssignedDevice *)d)->real_device.config_fd;
>> +    r = lseek(fd, address, SEEK_SET);
>> +    if (r < 0) {
>> +        fprintf(stderr, "%s: bad seek, errno = %d\n", __func__, errno);
>> +        return val;
>> +    }
>> +again:
>> +    r = read(fd, &val, len);
>>   
>
> pread().

Fixed.

>
>> +    if (r < 0) {
>> +        if (errno == EINTR || errno == EAGAIN)
>> +            goto again;
>> +        fprintf(stderr, "%s: read failed, errno = %d\n",
>> +            __func__, errno);
>>   
>
> Should bail out gracefully.

Done.

>
>> +static int assigned_dev_register_regions(PCIRegion *io_regions,
>> +                                         unsigned long regions_num,
>> +                                         AssignedDevice *pci_dev)
>> +{
>> +    uint32_t i;
>> +    PCIRegion *cur_region = io_regions;
>> +
>> +    for (i = 0; i < regions_num; i++, cur_region++) {
>> +        if (!cur_region->valid)
>> +            continue;
>> +        pci_dev->v_addrs[i].num = i;
>> +
>> +        /* handle memory io regions */
>> +        if (cur_region->type & IORESOURCE_MEM) {
>> +            int t = cur_region->type & IORESOURCE_PREFETCH
>> +                ? PCI_ADDRESS_SPACE_MEM_PREFETCH
>> +                : PCI_ADDRESS_SPACE_MEM;
>> +
>> +            /* map physical memory */
>> +            pci_dev->v_addrs[i].e_physbase = cur_region->base_addr;
>> +            pci_dev->v_addrs[i].r_virtbase =
>> +                mmap(NULL,
>> +                     (cur_region->size + 0xFFF) & 0xFFFFF000,
>> +                     PROT_WRITE | PROT_READ, MAP_SHARED,
>> +                     cur_region->resource_fd, (off_t) 0);
>> +
>> +            if ((void *) -1 == pci_dev->v_addrs[i].r_virtbase) {
>>   
>
> Please use MAP_FAILED and don't use a defensive if.

Fixed.

>
>> +                fprintf(stderr, "%s: Error: Couldn't mmap 0x%x!"
>> +                        "\n", __func__,
>> +                        (uint32_t) (cur_region->base_addr));
>> +                return -1;
>> +            }
>> +            pci_dev->v_addrs[i].r_size = cur_region->size;
>> +            pci_dev->v_addrs[i].e_size = 0;
>> +
>> +            /* add offset */
>> +            pci_dev->v_addrs[i].r_virtbase +=
>> +                (cur_region->base_addr & 0xFFF);
>> +
>> +            pci_register_io_region((PCIDevice *) pci_dev, i,
>> +                                   cur_region->size, t,
>> +                                   assigned_dev_iomem_map);
>> +            continue;
>> +        }
>> +        /* handle port io regions */
>> +        pci_register_io_region((PCIDevice *) pci_dev, i,
>> +                               cur_region->size, PCI_ADDRESS_SPACE_IO,
>> +                               assigned_dev_ioport_map);
>> +
>> +        pci_dev->v_addrs[i].e_physbase = cur_region->base_addr;
>> +        pci_dev->v_addrs[i].r_virtbase =
>> +            (void *)(long)cur_region->base_addr;
>>   
>
> I think virtbase would make more sense as a target_ulong.

I split r_virtbase into a union of void* for memory regions and a
ulong32_t for port numbers.

>
>> +        /* not relevant for port io */
>> +        pci_dev->v_addrs[i].memory_index = 0;
>> +    }
>> +
>> +    /* success */
>> +    return 0;
>> +}
>> +
>> +static int get_real_device(AssignedDevice *pci_dev, uint8_t r_bus,
>> +                           uint8_t r_dev, uint8_t r_func)
>> +{
>> +    char dir[128], name[128], comp[16];
>> +    int fd, r = 0;
>> +    FILE *f;
>> +    unsigned long long start, end, size, flags;
>> +    PCIRegion *rp;
>> +    PCIDevRegions *dev = &pci_dev->real_device;
>> +
>> +    dev->region_number = 0;
>> +
>> +    snprintf(dir, 128, "/sys/bus/pci/devices/0000:%02x:%02x.%x/",
>> +         r_bus, r_dev, r_func);
>>   
>
> just use sizeof().

Done.

>
>> +    strncpy(name, dir, 128);
>> +    strncat(name, "config", 6);
>>   
>
> strncpy() doesn't do what you think it does.  Why not just snprintf(name, 
> sizeof(name), "%sconfig", dir)?

Fixed to use snprintf.

>
>> +    fd = open(name, O_RDWR);
>> +    if (fd == -1) {
>> +        fprintf(stderr, "%s: %s: %m\n", __func__, name);
>> +        return 1;
>> +    }
>> +    dev->config_fd = fd;
>> +again:
>> +    r = read(fd, pci_dev->dev.config, sizeof(pci_dev->dev.config));
>> +    if (r < 0) {
>> +        if (errno == EINTR || errno == EAGAIN)
>> +            goto again;
>> +        fprintf(stderr, "%s: read failed, errno = %d\n", __func__, 
>> errno);
>> +    }
>> +    strncpy(name, dir, 128);
>> +    strncat(name, "resource", 8);
>>   
>
> Just use snprintf().

Done.

>
>> +    f = fopen(name, "r");
>> +    if (f == NULL) {
>> +        fprintf(stderr, "%s: %s: %m\n", __func__, name);
>> +        return 1;
>> +    }
>> +    r = -1;
>> +    while (fscanf(f, "%lli %lli %lli\n", &start, &end, &flags) == 3) {
>> +        r++;
>> +        rp = dev->regions + r;
>> +        rp->valid = 0;
>> +        size = end - start + 1;
>> +        flags &= IORESOURCE_IO | IORESOURCE_MEM | IORESOURCE_PREFETCH;
>> +        if (size == 0 || (flags & ~IORESOURCE_PREFETCH) == 0)
>> +            continue;
>> +        if (flags & IORESOURCE_MEM) {
>> +            flags &= ~IORESOURCE_IO;
>> +            snprintf(comp, 16, "resource%d", r);
>> +            strncpy(name, dir, 128);
>> +            strncat(name, comp, 16);
>>   
> snprintf(name, sizeof(name), "%sresource%d", dir, r).

Done.


> This is the wrong general model for doing this.  The way the rest of
> QEMU works is to maintain an array of strings representing the
> assigned devices. The option handling just saves the name of the
> option.  Then in pc.c, you iterate through the list of assigned
> devices, and then add them.  Other architectures may have a
> completely different implementation of device assignment so it's
> better to let the individual architectures decide what to do with
> the assigned devices.

Split option parsing and initialization in two parts, as you
suggested.

Thanks for the detailed review comments!

Cheers,
Muli
-- 
The First Workshop on I/O Virtualization (WIOV '08)
Dec 2008, San Diego, CA, http://www.usenix.org/wiov08/
                       <->
SYSTOR 2009---The Israeli Experimental Systems Conference
http://www.haifa.il.ibm.com/conferences/systor2009/
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Reply via email to