On Fri, Oct 24, 2008 at 11:22:48AM -0500, Anthony Liguori wrote: > Amit Shah wrote:
>> +#include <linux/kvm_para.h> >> > > Is this header really necessary? No, removed. > >> +#include "device-assignment.h" >> + >> +/* From linux/ioport.h */ >> +#define IORESOURCE_IO 0x00000100 /* Resource type */ >> +#define IORESOURCE_MEM 0x00000200 >> +#define IORESOURCE_IRQ 0x00000400 >> +#define IORESOURCE_DMA 0x00000800 >> +#define IORESOURCE_PREFETCH 0x00001000 /* No side effects */ >> + >> +/* #define DEVICE_ASSIGNMENT_DEBUG 1 */ >> + >> +#ifdef DEVICE_ASSIGNMENT_DEBUG >> +#define DEBUG(fmt, args...) \ >> > > Please use C99 style varidacs. Done. > >> + do { \ >> + fprintf(stderr, "%s: " fmt, __func__ , ## args); \ >> + } while (0) >> +#else >> +#define DEBUG(fmt, args...) do { } while(0) >> +#endif >> + >> +static void assigned_dev_ioport_writeb(void *opaque, uint32_t addr, >> + uint32_t value) >> +{ >> + AssignedDevRegion *r_access = (AssignedDevRegion *)opaque; >> > > Cast is unnecessary. Removed. > >> + uint32_t r_pio = (unsigned long)r_access->r_virtbase >> + + (addr - r_access->e_physbase); >> > > It would be nice to make this a function to make it more obvious that you > were translated from guest to host regions. The cast to unsigned long > should probably be target_ulong too. Done. > >> + DEBUG(stderr, "%s: r_pio=%08x e_physbase=%08x" >> + " r_virtbase=%08lx value=%08x\n", >> + __func__, r_pio, (int)r_access->e_physbase, >> + (unsigned long)r_access->r_virtbase, value); >> > > This debug statement looks wrong to me. You're passing stderr. > It's true for all of these functions. Fixed. > >> +static void assigned_dev_iomem_map(PCIDevice *pci_dev, int region_num, >> + uint32_t e_phys, uint32_t e_size, int >> type) >> +{ >> + AssignedDevice *r_dev = (AssignedDevice *) pci_dev; >> + AssignedDevRegion *region = &r_dev->v_addrs[region_num]; >> + int first_map = (region->e_size == 0); >> + int ret = 0; >> + >> + DEBUG("%s: e_phys=%08x r_virt=%x type=%d len=%08x region_num=%d \n", >> + __func__, e_phys, (uint32_t)region->r_virtbase, type, e_size, >> + region_num); >> > > You already have __func__ in your debug printf(). Fixed. > >> + region->e_physbase = e_phys; >> + region->e_size = e_size; >> + >> + /* FIXME: Add support for emulated MMIO for non-kvm guests */ >> + if (kvm_enabled()) { >> > > I don't think having a kvm_enabled() check here is very useful. I > think device-assignment.c should be conditional on USE_KVM, and the > only kvm_enabled() check should be when creating the initial device > assignment. Practically speaking, QEMU is never going to support > device assignment outside of the context of KVM because I strongly > doubt anything like irqhook will make it upstream. Reworked along your suggestions, please let me know if you have further comments. >> + if (!first_map) >> + kvm_destroy_phys_mem(kvm_context, e_phys, e_size); >> + if (e_size > 0) >> + ret = kvm_register_phys_mem(kvm_context, e_phys, >> + region->r_virtbase, e_size, 0); >> + if (ret != 0) >> + fprintf(stderr, "%s: Error: create new mapping failed\n", >> __func__); >> > > If we do get an error here, we shouldn't keep going. This error is > probably going to happen in practice if a guest tries to pass > through too many devices and we run out of slots. Fixed, we exit(1) now (is there a more graceful to bail out?). >> + } >> +} >> + >> +static void assigned_dev_ioport_map(PCIDevice *pci_dev, int region_num, >> + uint32_t addr, uint32_t size, int >> type) >> +{ >> + AssignedDevice *r_dev = (AssignedDevice *) pci_dev; >> + AssignedDevRegion *region = &r_dev->v_addrs[region_num]; >> + int r; >> + >> + region->e_physbase = addr; >> + region->e_size = size; >> + >> + DEBUG("%s: e_phys=0x%x r_virt=%x type=0x%x len=%d region_num=%d \n", >> + __func__, addr, (uint32_t)region->r_virtbase, type, size, >> region_num); >> > > Need to fix this DEBUG(). Fixed. > >> + r = ioperm((uint32_t)region->r_virtbase, size, 1); >> > > I don't think this is enough for KVM. This will only do the ioperm > in the thread that triggered the IO. If you have an SMP guest, > ioperm needs to be done on each VCPU's thread. Fixed. >> + if (r < 0) { >> + perror("assigned_dev_ioport_map: ioperm"); >> + return; >> + } >> > > Again, if we fail, we have to exit QEMU gracefully. Fixed. > >> + register_ioport_read(addr, size, 1, assigned_dev_ioport_readb, >> + (void *) (r_dev->v_addrs + region_num)); >> + register_ioport_read(addr, size, 2, assigned_dev_ioport_readw, >> + (void *) (r_dev->v_addrs + region_num)); >> + register_ioport_read(addr, size, 4, assigned_dev_ioport_readl, >> + (void *) (r_dev->v_addrs + region_num)); >> + register_ioport_write(addr, size, 1, assigned_dev_ioport_writeb, >> + (void *) (r_dev->v_addrs + region_num)); >> + register_ioport_write(addr, size, 2, assigned_dev_ioport_writew, >> + (void *) (r_dev->v_addrs + region_num)); >> + register_ioport_write(addr, size, 4, assigned_dev_ioport_writel, >> + (void *) (r_dev->v_addrs + region_num)); >> +} >> > > You never need to explicitly cast a pointer to void *. Fixed. > >> +static void assigned_dev_pci_write_config(PCIDevice *d, uint32_t address, >> + uint32_t val, int len) >> +{ >> + int fd, r; >> + >> + DEBUG("%s: (%x.%x): address=%04x val=0x%08x len=%d\n", >> + __func__, ((d->devfn >> 3) & 0x1F), (d->devfn & 0x7), >> + (uint16_t) address, val, len); >> > > bad DEBUG() Fixed. > >> + if (address == 0x4) { >> + pci_default_write_config(d, address, val, len); >> + /* Continue to program the card */ >> + } >> + >> + if ((address >= 0x10 && address <= 0x24) || address == 0x34 || >> + address == 0x3c || address == 0x3d) { >> + /* used for update-mappings (BAR emulation) */ >> + pci_default_write_config(d, address, val, len); >> + return; >> + } >> + DEBUG("%s: NON BAR (%x.%x): address=%04x val=0x%08x len=%d\n", >> + __func__, ((d->devfn >> 3) & 0x1F), (d->devfn & 0x7), >> + (uint16_t) address, val, len); >> + fd = ((AssignedDevice *)d)->real_device.config_fd; >> + r = lseek(fd, address, SEEK_SET); >> + if (r < 0) { >> + fprintf(stderr, "%s: bad seek, errno = %d\n", __func__, errno); >> + return; >> + } >> +again: >> + r = write(fd, &val, len); >> > > Can you just do a pwrite()? That'll make things simpler. Fixed. > >> + if (r < 0) { >> + if (errno == EINTR || errno == EAGAIN) >> + goto again; >> + fprintf(stderr, "%s: write failed, errno = %d\n", __func__, >> errno); >> + } >> +} >> + >> +static uint32_t assigned_dev_pci_read_config(PCIDevice *d, uint32_t >> address, >> + int len) >> +{ >> + uint32_t val = 0; >> + int fd, r; >> + >> + if ((address >= 0x10 && address <= 0x24) || address == 0x34 || >> + address == 0x3c || address == 0x3d) { >> + val = pci_default_read_config(d, address, len); >> + DEBUG("(%x.%x): address=%04x val=0x%08x len=%d\n", >> + (d->devfn >> 3) & 0x1F, (d->devfn & 0x7), address, val, >> len); >> + return val; >> + } >> + >> + /* vga specific, remove later */ >> + if (address == 0xFC) >> + goto do_log; >> > > Can you explain the point of this? No. It appears to exist since the earliest versions of the patch. Since removing it does modify the behavior, I kept it in for now pending further investigation. >> + fd = ((AssignedDevice *)d)->real_device.config_fd; >> + r = lseek(fd, address, SEEK_SET); >> + if (r < 0) { >> + fprintf(stderr, "%s: bad seek, errno = %d\n", __func__, errno); >> + return val; >> + } >> +again: >> + r = read(fd, &val, len); >> > > pread(). Fixed. > >> + if (r < 0) { >> + if (errno == EINTR || errno == EAGAIN) >> + goto again; >> + fprintf(stderr, "%s: read failed, errno = %d\n", >> + __func__, errno); >> > > Should bail out gracefully. Done. > >> +static int assigned_dev_register_regions(PCIRegion *io_regions, >> + unsigned long regions_num, >> + AssignedDevice *pci_dev) >> +{ >> + uint32_t i; >> + PCIRegion *cur_region = io_regions; >> + >> + for (i = 0; i < regions_num; i++, cur_region++) { >> + if (!cur_region->valid) >> + continue; >> + pci_dev->v_addrs[i].num = i; >> + >> + /* handle memory io regions */ >> + if (cur_region->type & IORESOURCE_MEM) { >> + int t = cur_region->type & IORESOURCE_PREFETCH >> + ? PCI_ADDRESS_SPACE_MEM_PREFETCH >> + : PCI_ADDRESS_SPACE_MEM; >> + >> + /* map physical memory */ >> + pci_dev->v_addrs[i].e_physbase = cur_region->base_addr; >> + pci_dev->v_addrs[i].r_virtbase = >> + mmap(NULL, >> + (cur_region->size + 0xFFF) & 0xFFFFF000, >> + PROT_WRITE | PROT_READ, MAP_SHARED, >> + cur_region->resource_fd, (off_t) 0); >> + >> + if ((void *) -1 == pci_dev->v_addrs[i].r_virtbase) { >> > > Please use MAP_FAILED and don't use a defensive if. Fixed. > >> + fprintf(stderr, "%s: Error: Couldn't mmap 0x%x!" >> + "\n", __func__, >> + (uint32_t) (cur_region->base_addr)); >> + return -1; >> + } >> + pci_dev->v_addrs[i].r_size = cur_region->size; >> + pci_dev->v_addrs[i].e_size = 0; >> + >> + /* add offset */ >> + pci_dev->v_addrs[i].r_virtbase += >> + (cur_region->base_addr & 0xFFF); >> + >> + pci_register_io_region((PCIDevice *) pci_dev, i, >> + cur_region->size, t, >> + assigned_dev_iomem_map); >> + continue; >> + } >> + /* handle port io regions */ >> + pci_register_io_region((PCIDevice *) pci_dev, i, >> + cur_region->size, PCI_ADDRESS_SPACE_IO, >> + assigned_dev_ioport_map); >> + >> + pci_dev->v_addrs[i].e_physbase = cur_region->base_addr; >> + pci_dev->v_addrs[i].r_virtbase = >> + (void *)(long)cur_region->base_addr; >> > > I think virtbase would make more sense as a target_ulong. I split r_virtbase into a union of void* for memory regions and a ulong32_t for port numbers. > >> + /* not relevant for port io */ >> + pci_dev->v_addrs[i].memory_index = 0; >> + } >> + >> + /* success */ >> + return 0; >> +} >> + >> +static int get_real_device(AssignedDevice *pci_dev, uint8_t r_bus, >> + uint8_t r_dev, uint8_t r_func) >> +{ >> + char dir[128], name[128], comp[16]; >> + int fd, r = 0; >> + FILE *f; >> + unsigned long long start, end, size, flags; >> + PCIRegion *rp; >> + PCIDevRegions *dev = &pci_dev->real_device; >> + >> + dev->region_number = 0; >> + >> + snprintf(dir, 128, "/sys/bus/pci/devices/0000:%02x:%02x.%x/", >> + r_bus, r_dev, r_func); >> > > just use sizeof(). Done. > >> + strncpy(name, dir, 128); >> + strncat(name, "config", 6); >> > > strncpy() doesn't do what you think it does. Why not just snprintf(name, > sizeof(name), "%sconfig", dir)? Fixed to use snprintf. > >> + fd = open(name, O_RDWR); >> + if (fd == -1) { >> + fprintf(stderr, "%s: %s: %m\n", __func__, name); >> + return 1; >> + } >> + dev->config_fd = fd; >> +again: >> + r = read(fd, pci_dev->dev.config, sizeof(pci_dev->dev.config)); >> + if (r < 0) { >> + if (errno == EINTR || errno == EAGAIN) >> + goto again; >> + fprintf(stderr, "%s: read failed, errno = %d\n", __func__, >> errno); >> + } >> + strncpy(name, dir, 128); >> + strncat(name, "resource", 8); >> > > Just use snprintf(). Done. > >> + f = fopen(name, "r"); >> + if (f == NULL) { >> + fprintf(stderr, "%s: %s: %m\n", __func__, name); >> + return 1; >> + } >> + r = -1; >> + while (fscanf(f, "%lli %lli %lli\n", &start, &end, &flags) == 3) { >> + r++; >> + rp = dev->regions + r; >> + rp->valid = 0; >> + size = end - start + 1; >> + flags &= IORESOURCE_IO | IORESOURCE_MEM | IORESOURCE_PREFETCH; >> + if (size == 0 || (flags & ~IORESOURCE_PREFETCH) == 0) >> + continue; >> + if (flags & IORESOURCE_MEM) { >> + flags &= ~IORESOURCE_IO; >> + snprintf(comp, 16, "resource%d", r); >> + strncpy(name, dir, 128); >> + strncat(name, comp, 16); >> > snprintf(name, sizeof(name), "%sresource%d", dir, r). Done. > This is the wrong general model for doing this. The way the rest of > QEMU works is to maintain an array of strings representing the > assigned devices. The option handling just saves the name of the > option. Then in pc.c, you iterate through the list of assigned > devices, and then add them. Other architectures may have a > completely different implementation of device assignment so it's > better to let the individual architectures decide what to do with > the assigned devices. Split option parsing and initialization in two parts, as you suggested. Thanks for the detailed review comments! Cheers, Muli -- The First Workshop on I/O Virtualization (WIOV '08) Dec 2008, San Diego, CA, http://www.usenix.org/wiov08/ <-> SYSTOR 2009---The Israeli Experimental Systems Conference http://www.haifa.il.ibm.com/conferences/systor2009/ -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html