Amit,

Below is a patch for the PCI passthrough tree. It enables a guest to access a
device's memory-mapped I/O regions directly, without requiring the host to trap
and emulate every MMIO access.

This patch contains only userspace changes, but it relies on the kernel patch by Anthony: "Handle vma regions with no backing page". Note that this patch requires CONFIG_NUMA to be set. It also requires a change to the VT-d patch that Allen sent a while ago, to avoid mapping memory slots with no backing page.

This patch was tested with the pci-passthrough VT-d using an e1000 NIC.

Regards,
Ben

From 8fe13bcea014a3b896a79fca5d15ddd32050694c Mon Sep 17 00:00:00 2001
From: Ben-Ami Yassour <[EMAIL PROTECTED]>
Date: Tue, 3 Jun 2008 12:34:51 +0300
Subject: [PATCH] KVM: PCIPT: direct mmio

This patch for PCI passthrough devices enables a guest to access a
device's memory mapped I/O regions directly, without requiring the host to trap
and emulate every MMIO access.

Signed-off-by: Ben-Ami Yassour <[EMAIL PROTECTED]>
Signed-off-by: Muli Ben-Yehuda <[EMAIL PROTECTED]>
---
 libkvm/libkvm.c           |   24 ++++++++----
 qemu/hw/pci-passthrough.c |   89 +++++++++++----------------------------------
 qemu/hw/pci-passthrough.h |    2 +
 3 files changed, 40 insertions(+), 75 deletions(-)

diff --git a/libkvm/libkvm.c b/libkvm/libkvm.c
index d1e95a4..ce062cb 100644
--- a/libkvm/libkvm.c
+++ b/libkvm/libkvm.c
@@ -400,7 +400,7 @@ void *kvm_create_userspace_phys_mem(kvm_context_t kvm, 
unsigned long phys_start,
 {
        int r;
        int prot = PROT_READ;
-       void *ptr;
+       void *ptr = NULL;
        struct kvm_userspace_memory_region memory = {
                .memory_size = len,
                .guest_phys_addr = phys_start,
@@ -410,16 +410,24 @@ void *kvm_create_userspace_phys_mem(kvm_context_t kvm, 
unsigned long phys_start,
        if (writable)
                prot |= PROT_WRITE;

-       ptr = mmap(NULL, len, prot, MAP_ANONYMOUS | MAP_SHARED, -1, 0);
-       if (ptr == MAP_FAILED) {
-               fprintf(stderr, "create_userspace_phys_mem: %s", 
strerror(errno));
-               return 0;
-       }
+       if (len > 0) {
+               ptr = mmap(NULL, len, prot, MAP_ANONYMOUS | MAP_SHARED, -1, 0);
+               if (ptr == MAP_FAILED) {
+                       fprintf(stderr, "create_userspace_phys_mem: %s",
+                               strerror(errno));
+                       return 0;
+               }

-       memset(ptr, 0, len);
+               memset(ptr, 0, len);
+       }

        memory.userspace_addr = (unsigned long)ptr;
-       memory.slot = get_free_slot(kvm);
+
+       if (len > 0)
+               memory.slot = get_free_slot(kvm);
+       else
+               memory.slot = get_slot(phys_start);
+
        r = ioctl(kvm->vm_fd, KVM_SET_USER_MEMORY_REGION, &memory);
        if (r == -1) {
                fprintf(stderr, "create_userspace_phys_mem: %s", 
strerror(errno));
diff --git a/qemu/hw/pci-passthrough.c b/qemu/hw/pci-passthrough.c
index 62953ae..1a3e50c 100644
--- a/qemu/hw/pci-passthrough.c
+++ b/qemu/hw/pci-passthrough.c
@@ -25,18 +25,6 @@ typedef __u64 resource_size_t;
 extern kvm_context_t kvm_context;
 extern FILE *logfile;

-CPUReadMemoryFunc *pt_mmio_read_cb[3] = {
-       pt_mmio_readb,
-       pt_mmio_readw,
-       pt_mmio_readl
-};
-
-CPUWriteMemoryFunc *pt_mmio_write_cb[3] = {
-       pt_mmio_writeb,
-       pt_mmio_writew,
-       pt_mmio_writel
-};
-
 //#define PT_DEBUG

 #ifdef PT_DEBUG
@@ -45,47 +33,6 @@ CPUWriteMemoryFunc *pt_mmio_write_cb[3] = {
 #define DEBUG(fmt, args...)
 #endif

-#define pt_mmio_write(suffix, type)                                    \
-void pt_mmio_write##suffix(void *opaque, target_phys_addr_t e_phys,    \
-                               uint32_t value)                         \
-{                                                                      \
-       pt_region_t *r_access = (pt_region_t *)opaque;                  \
-       void *r_virt = (u8 *)r_access->r_virtbase +                  \
-                       (e_phys - r_access->e_physbase);             \
-       if (r_access->debug & PT_DEBUG_MMIO) {                           \
-               fprintf(logfile, "pt_mmio_write" #suffix              \
-                       ": e_physbase=%p e_phys=%p r_virt=%p value=%08x\n", \
-                       (void *)r_access->e_physbase, (void *)e_phys,        \
-                       r_virt, value);                                 \
-       }                                                               \
-       *(type *)r_virt = (type)value;                                  \
-}
-
-pt_mmio_write(b, u8)
-pt_mmio_write(w, u16)
-pt_mmio_write(l, u32)
-
-#define pt_mmio_read(suffix, type)                                     \
-uint32_t pt_mmio_read##suffix(void *opaque, target_phys_addr_t e_phys) \
-{                                                                      \
-       pt_region_t *r_access = (pt_region_t *)opaque;                  \
-       void *r_virt = (u8 *)r_access->r_virtbase +                  \
-                       (e_phys - r_access->e_physbase);             \
-       uint32_t value = (u32) (*(type *) r_virt);                      \
-       if (r_access->debug & PT_DEBUG_MMIO) {                           \
-               fprintf(logfile,                                        \
-                       "pt_mmio_read" #suffix ": e_physbase=%p "   \
-                       "e_phys=%p r_virt=%p value=%08x\n",           \
-                       (void *)r_access->e_physbase,                        \
-                       (void *)e_phys, r_virt, value);                 \
-       }                                                               \
-       return value;                                                   \
-}
-
-pt_mmio_read(b, u8)
-pt_mmio_read(w, u16)
-pt_mmio_read(l, u32)
-
 #define pt_ioport_write(suffix)                                                
\
 void pt_ioport_write##suffix(void *opaque, uint32_t addr, uint32_t value) \
 {                                                                      \
@@ -127,22 +74,33 @@ pt_ioport_read(b)
 pt_ioport_read(w)
 pt_ioport_read(l)

-static void pt_iomem_map(PCIDevice * d, int region_num,
-                        uint32_t e_phys, uint32_t e_size, int type)
+void pt_iomem_map(PCIDevice * pci_dev, int region_num, uint32_t e_phys,
+                 uint32_t e_size, int type)
 {
-       pt_dev_t *r_dev = (pt_dev_t *) d;
-
-       r_dev->v_addrs[region_num].e_physbase = e_phys;
+       pt_dev_t *r_dev = (pt_dev_t *) pci_dev;
+       pt_region_t *region = &r_dev->v_addrs[region_num];
+       int first_map = (region->e_size == 0);
+       int ret = 0;

        DEBUG("e_phys=%08x r_virt=%p type=%d len=%08x region_num=%d \n",
              e_phys, r_dev->v_addrs[region_num].r_virtbase, type, e_size,
              region_num);

-       cpu_register_physical_memory(e_phys,
-                                    r_dev->dev.io_regions[region_num].size,
-                                    r_dev->v_addrs[region_num].memory_index);
-}
+       region->e_physbase = e_phys;
+       region->e_size = e_size;
+
+       if (!first_map)
+               kvm_destroy_phys_mem(kvm_context, e_phys, e_size);

+       if (e_size > 0)
+               ret = kvm_register_userspace_phys_mem(kvm_context,
+                                                     e_phys,
+                                                     region->r_virtbase,
+                                                     e_size,
+                                                     0);
+       if (ret != 0)
+               fprintf(logfile, "Error: create new mapping failed\n");
+}

 static void pt_ioport_map(PCIDevice * pci_dev, int region_num,
                          uint32_t addr, uint32_t size, int type)
@@ -265,6 +223,8 @@ static int pt_register_regions(pci_region_t * io_regions,
                                        (uint32_t) (cur_region->base_addr));
                                return (-1);
                        }
+                       pci_dev->v_addrs[i].r_size = cur_region->size;
+                       pci_dev->v_addrs[i].e_size = 0;

                        /* add offset */
                        pci_dev->v_addrs[i].r_virtbase +=
@@ -274,11 +234,6 @@ static int pt_register_regions(pci_region_t * io_regions,
                                               cur_region->size, t,
                                               pt_iomem_map);

-                       pci_dev->v_addrs[i].memory_index =
-                           cpu_register_io_memory(0, pt_mmio_read_cb,
-                                                  pt_mmio_write_cb,
-                                                  (void *) 
&(pci_dev->v_addrs[i]));
-
                        continue;
                }
                /* handle port io regions */
diff --git a/qemu/hw/pci-passthrough.h b/qemu/hw/pci-passthrough.h
index 1dcf752..67ac4dc 100644
--- a/qemu/hw/pci-passthrough.h
+++ b/qemu/hw/pci-passthrough.h
@@ -54,6 +54,8 @@ typedef struct pt_region_s {
        uint32_t memory_index;
        void *r_virtbase;       /* mmapped access address */
        int num;                /* our index within v_addrs[] */
+       uint32_t e_size;          /* emulated size of region in bytes */
+       uint32_t r_size;          /* real size of region in bytes */
        uint32_t debug;
 } pt_region_t;

--
1.5.5.1

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Reply via email to