[Qemu-devel] [RFC PATCH V4 4/4] vfio: introduce new VFIO ioctl VFIO_IOMMU_GET_DIRTY_BITMAP

2018-04-10 Thread Yulei Zhang
New VFIO ioctl VFIO_IOMMU_GET_DIRTY_BITMAP is used to fetch the
bitmap of pinned memory in the iommu container; we need to copy that
memory to the target during the migration as it is dirtied by
mdev devices.

Signed-off-by: Yulei Zhang <yulei.zh...@intel.com>
---
 hw/vfio/common.c   | 34 ++
 linux-headers/linux/vfio.h | 14 ++
 2 files changed, 48 insertions(+)

diff --git a/hw/vfio/common.c b/hw/vfio/common.c
index 7007878..460b186 100644
--- a/hw/vfio/common.c
+++ b/hw/vfio/common.c
@@ -35,6 +35,7 @@
 #include "sysemu/kvm.h"
 #include "trace.h"
 #include "qapi/error.h"
+#include "exec/ram_addr.h"
 
 struct vfio_group_head vfio_group_list =
 QLIST_HEAD_INITIALIZER(vfio_group_list);
@@ -624,9 +625,42 @@ static void vfio_listener_region_del(MemoryListener 
*listener,
 }
 }
 
+static void vfio_log_sync(MemoryListener *listener,
+  MemoryRegionSection *section)
+{
+VFIOContainer *container = container_of(listener, VFIOContainer, listener);
+VFIOGroup *group = QLIST_FIRST(>group_list);
+VFIODevice *vbasedev;
+QLIST_FOREACH(vbasedev, >device_list, next) {
+if (vbasedev->device_state == VFIO_DEVICE_START) {
+return;
+}
+}
+
+struct vfio_iommu_get_dirty_bitmap *d;
+ram_addr_t size = int128_get64(section->size);
+unsigned long page_nr = size >> TARGET_PAGE_BITS;
+unsigned long bitmap_size =
+(BITS_TO_LONGS(page_nr) + 1) * sizeof(unsigned long);
+d = g_malloc0(sizeof(*d) + bitmap_size);
+d->start_addr = section->offset_within_address_space;
+d->page_nr = page_nr;
+
+if (ioctl(container->fd, VFIO_IOMMU_GET_DIRTY_BITMAP, d)) {
+error_report("vfio: Failed to fetch dirty pages for migration");
+goto exit;
+}
+
+cpu_physical_memory_set_dirty_lebitmap((unsigned long *)>dirty_bitmap,
+   d->start_addr, d->page_nr);
+exit:
+g_free(d);
+}
+
 static const MemoryListener vfio_memory_listener = {
 .region_add = vfio_listener_region_add,
 .region_del = vfio_listener_region_del,
+.log_sync = vfio_log_sync,
 };
 
 static void vfio_listener_release(VFIOContainer *container)
diff --git a/linux-headers/linux/vfio.h b/linux-headers/linux/vfio.h
index 2c911d9..56bf76f 100644
--- a/linux-headers/linux/vfio.h
+++ b/linux-headers/linux/vfio.h
@@ -589,6 +589,20 @@ struct vfio_iommu_type1_dma_unmap {
 #define VFIO_IOMMU_ENABLE  _IO(VFIO_TYPE, VFIO_BASE + 15)
 #define VFIO_IOMMU_DISABLE _IO(VFIO_TYPE, VFIO_BASE + 16)
 
+/**
+ * VFIO_IOMMU_GET_DIRTY_BITMAP - _IOW(VFIO_TYPE, VFIO_BASE + 17,
+ * struct vfio_iommu_get_dirty_bitmap)
+ *
+ * Return: 0 on success, -errno on failure.
+ */
+struct vfio_iommu_get_dirty_bitmap {
+   __u64  start_addr;
+   __u64  page_nr;
+   __u8   dirty_bitmap[];
+};
+
+#define VFIO_IOMMU_GET_DIRTY_BITMAP _IO(VFIO_TYPE, VFIO_BASE + 17)
+
 /*  Additional API for SPAPR TCE (Server POWERPC) IOMMU  */
 
 /*
-- 
2.7.4




[Qemu-devel] [RFC PATCH V4 2/4] vfio: Add vm status change callback to stop/restart the mdev device

2018-04-09 Thread Yulei Zhang
A VM status change handler is added to change the vfio pci device
status during the migration: write the demanded device status
to the DEVICE STATUS subregion to stop the device on the source side
before fetching its status, and start the device on the target side
after restoring its status.

Signed-off-by: Yulei Zhang <yulei.zh...@intel.com>
---
 hw/vfio/pci.c | 20 
 include/hw/vfio/vfio-common.h |  1 +
 linux-headers/linux/vfio.h|  6 ++
 roms/seabios  |  2 +-
 4 files changed, 28 insertions(+), 1 deletion(-)

diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
index f98a9dd..13d8c73 100644
--- a/hw/vfio/pci.c
+++ b/hw/vfio/pci.c
@@ -38,6 +38,7 @@
 
 static void vfio_disable_interrupts(VFIOPCIDevice *vdev);
 static void vfio_mmap_set_enabled(VFIOPCIDevice *vdev, bool enabled);
+static void vfio_vm_change_state_handler(void *pv, int running, RunState 
state);
 
 /*
  * Disabling BAR mmaping can be slow, but toggling it around INTx can
@@ -2896,6 +2897,7 @@ static void vfio_realize(PCIDevice *pdev, Error **errp)
 vfio_register_err_notifier(vdev);
 vfio_register_req_notifier(vdev);
 vfio_setup_resetfn_quirk(vdev);
+qemu_add_vm_change_state_handler(vfio_vm_change_state_handler, vdev);
 
 return;
 
@@ -2982,6 +2984,24 @@ post_reset:
 vfio_pci_post_reset(vdev);
 }
 
+static void vfio_vm_change_state_handler(void *pv, int running, RunState state)
+{
+VFIOPCIDevice *vdev = pv;
+VFIODevice *vbasedev = >vbasedev;
+uint8_t dev_state;
+uint8_t sz = 1;
+
+dev_state = running ? VFIO_DEVICE_START : VFIO_DEVICE_STOP;
+
+if (pwrite(vdev->vbasedev.fd, _state,
+   sz, vdev->device_state.offset) != sz) {
+error_report("vfio: Failed to %s device", running ? "start" : "stop");
+return;
+}
+
+vbasedev->device_state = dev_state;
+}
+
 static void vfio_instance_init(Object *obj)
 {
 PCIDevice *pci_dev = PCI_DEVICE(obj);
diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h
index f3a2ac9..9c14a8f 100644
--- a/include/hw/vfio/vfio-common.h
+++ b/include/hw/vfio/vfio-common.h
@@ -125,6 +125,7 @@ typedef struct VFIODevice {
 unsigned int num_irqs;
 unsigned int num_regions;
 unsigned int flags;
+bool device_state;
 } VFIODevice;
 
 struct VFIODeviceOps {
diff --git a/linux-headers/linux/vfio.h b/linux-headers/linux/vfio.h
index e3380ad..8f02f2f 100644
--- a/linux-headers/linux/vfio.h
+++ b/linux-headers/linux/vfio.h
@@ -304,6 +304,12 @@ struct vfio_region_info_cap_type {
 /* Mdev sub-type for device state save and restore */
 #define VFIO_REGION_SUBTYPE_DEVICE_STATE   (4)
 
+/* Offset in region to save device state */
+#define VFIO_DEVICE_STATE_OFFSET   1
+
+#define VFIO_DEVICE_START  0
+#define VFIO_DEVICE_STOP   1
+
 /**
  * VFIO_DEVICE_GET_IRQ_INFO - _IOWR(VFIO_TYPE, VFIO_BASE + 9,
  * struct vfio_irq_info)
diff --git a/roms/seabios b/roms/seabios
index 63451fc..5f4c7b1 16
--- a/roms/seabios
+++ b/roms/seabios
@@ -1 +1 @@
-Subproject commit 63451fca13c75870e1703eb3e20584d91179aebc
+Subproject commit 5f4c7b13cdf9c450eb55645f4362ea58fa61b79b
-- 
2.7.4




[Qemu-devel] [RFC PATCH V4 3/4] vfio: Add SaveVMHandlers for VFIO device to support live migration

2018-04-09 Thread Yulei Zhang
Instead of using a vm state description, add SaveVMHandlers for the VFIO
device to support live migration.

Introduce new ioctl VFIO_DEVICE_GET_DIRTY_BITMAP to fetch the memory
bitmap dirtied by the vfio device during the iterative precopy stage,
to shorten the system downtime afterward.

For vfio pci device status migrate, during the system downtime, it will
save the following states
1. pci configuration space addr0~addr5
2. pci configuration space msi_addr msi_data
3. pci device status fetch from device driver

And on the target side the vfio_load will restore the same states
1. re-setup the pci bar configuration
2. re-setup the pci device msi configuration
3. restore the pci device status

Signed-off-by: Yulei Zhang <yulei.zh...@intel.com>
---
 hw/vfio/pci.c  | 195 +++--
 linux-headers/linux/vfio.h |  14 
 2 files changed, 204 insertions(+), 5 deletions(-)

diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
index 13d8c73..ac6a9c7 100644
--- a/hw/vfio/pci.c
+++ b/hw/vfio/pci.c
@@ -33,9 +33,14 @@
 #include "trace.h"
 #include "qapi/error.h"
 #include "migration/blocker.h"
+#include "migration/register.h"
+#include "exec/ram_addr.h"
 
 #define MSIX_CAP_LENGTH 12
 
+#define VFIO_SAVE_FLAG_SETUP 0
+#define VFIO_SAVE_FLAG_DEV_STATE 1
+
 static void vfio_disable_interrupts(VFIOPCIDevice *vdev);
 static void vfio_mmap_set_enabled(VFIOPCIDevice *vdev, bool enabled);
 static void vfio_vm_change_state_handler(void *pv, int running, RunState 
state);
@@ -2639,6 +2644,190 @@ static void vfio_unregister_req_notifier(VFIOPCIDevice 
*vdev)
 vdev->req_enabled = false;
 }
 
+static uint64_t vfio_dirty_log_sync(VFIOPCIDevice *vdev)
+{
+RAMBlock *block;
+struct vfio_device_get_dirty_bitmap *d;
+uint64_t page = 0;
+ram_addr_t size;
+unsigned long nr, bitmap;
+
+RAMBLOCK_FOREACH(block) {
+size = block->used_length;
+nr = size >> TARGET_PAGE_BITS;
+bitmap = (BITS_TO_LONGS(nr) + 1) * sizeof(unsigned long);
+d = g_malloc0(sizeof(*d) +  bitmap);
+d->start_addr = block->offset;
+d->page_nr = nr;
+if (ioctl(vdev->vbasedev.fd, VFIO_DEVICE_GET_DIRTY_BITMAP, d)) {
+error_report("vfio: Failed to get device dirty bitmap");
+g_free(d);
+goto exit;
+}
+
+if (d->page_nr) {
+cpu_physical_memory_set_dirty_lebitmap(
+ (unsigned long *)>dirty_bitmap,
+ d->start_addr, d->page_nr);
+page += d->page_nr;
+}
+g_free(d);
+}
+
+exit:
+return page;
+}
+
+static void vfio_save_live_pending(QEMUFile *f, void *opaque, uint64_t 
max_size,
+   uint64_t *non_postcopiable_pending,
+   uint64_t *postcopiable_pending)
+{
+VFIOPCIDevice *vdev = opaque;
+uint64_t pending;
+
+qemu_mutex_lock_iothread();
+rcu_read_lock();
+pending = vfio_dirty_log_sync(vdev);
+rcu_read_unlock();
+qemu_mutex_unlock_iothread();
+*non_postcopiable_pending += pending;
+}
+
+static int vfio_load(QEMUFile *f, void *opaque, int version_id)
+{
+VFIOPCIDevice *vdev = opaque;
+PCIDevice *pdev = >pdev;
+int sz = vdev->device_state.size - VFIO_DEVICE_STATE_OFFSET;
+uint8_t *buf = NULL;
+uint32_t ctl, msi_lo, msi_hi, msi_data, bar_cfg, i;
+bool msi_64bit;
+
+if (qemu_get_byte(f) == VFIO_SAVE_FLAG_SETUP) {
+goto exit;
+}
+
+/* retore pci bar configuration */
+ctl = pci_default_read_config(pdev, PCI_COMMAND, 2);
+vfio_pci_write_config(pdev, PCI_COMMAND,
+  ctl & (!(PCI_COMMAND_IO | PCI_COMMAND_MEMORY)), 2);
+for (i = 0; i < PCI_ROM_SLOT; i++) {
+bar_cfg = qemu_get_be32(f);
+vfio_pci_write_config(pdev, PCI_BASE_ADDRESS_0 + i * 4, bar_cfg, 4);
+}
+vfio_pci_write_config(pdev, PCI_COMMAND,
+  ctl | PCI_COMMAND_IO | PCI_COMMAND_MEMORY, 2);
+
+/* restore msi configuration */
+ctl = pci_default_read_config(pdev, pdev->msi_cap + PCI_MSI_FLAGS, 2);
+msi_64bit = !!(ctl & PCI_MSI_FLAGS_64BIT);
+
+vfio_pci_write_config(>pdev,
+  pdev->msi_cap + PCI_MSI_FLAGS,
+  ctl & (!PCI_MSI_FLAGS_ENABLE), 2);
+
+msi_lo = qemu_get_be32(f);
+vfio_pci_write_config(pdev, pdev->msi_cap + PCI_MSI_ADDRESS_LO, msi_lo, 4);
+
+if (msi_64bit) {
+msi_hi = qemu_get_be32(f);
+vfio_pci_write_config(pdev, pdev->msi_cap + PCI_MSI_ADDRESS_HI,
+  msi_hi, 4);
+}
+msi_data = qemu_get_be32(f);
+vfio_pci_write_config(pdev,
+  pdev->msi_cap + (msi_64bit ? PCI_MSI_DATA_64 : PCI_MSI_DATA_32),
+  msi_data, 2);
+
+vfio_pci_write_config(>pdev, pdev->msi_cap + PCI_MSI_FL

[Qemu-devel] [RFC V4 PATCH 0/4] vfio: Introduce live migration capability to

2018-04-09 Thread Yulei Zhang
Summary

This series RFC would like to resume the discussion about how to
introduce the live migration capability to vfio mdev device. 

A new subtype region VFIO_REGION_SUBTYPE_DEVICE_STATE is introduced
for vfio device status migrate, during the initialization it will
check if the region is supported by the vfio device, otherwise it 
will remain non-migratable.

The intention to add the new region is using it for mdev device status
save and restore during the migration. The access to this region
will be trapped and forward to the mdev device driver, it also uses 
the first byte in the new region to control the running state of mdev
device, so during the migration after stop the mdev driver, qemu could
retrieve the specific device status from this region and transfer to 
the target VM side for the mdev device restore.

In addition, during the pre-copy period, it will be able to fetch the
dirty bitmap of vfio device through ioctl VFIO_DEVICE_GET_DIRTY_BITMAP
iteratively, which will be able to shorten the system downtime during
the static copy.

Below is the vfio mdev device migration sequence
Source VM side:
start migration
|
V
 in pre-copy stage, fetch the device dirty bitmap
 and add into qemu dirty list for migrate iteratively.
|
V
 get the cpu state change callback, write to the
 subregion's first byte to stop the mdev device
|
V
 query the dirty page bitmap from the iommu container
 and add it into the qemu dirty list for the last synchronization
|
V
 save the device status into Qemufile which is
 read from the vfio device subregion

Target VM side:
 restore the mdev device after get the
 saved status context from Qemufile
|
V
  get the cpu state change callback write to 
  subregion's first byte to start the mdev device
  to put it in running status
|
V
finish migration

V3->V4:
1. add migration_blocker if device state region is not supported.
2. instead of using vmsd, register SaveVMHandlers for VFIO device
   to leverage the pre-copy facility, and add new ioctl for VFIO
   device to fetch dirty bitmap during pre-copy.
3. remove the intel vendor ID dependence for the device state
   subregion.

V2->V3:
1. rebase the patch to Qemu stable 2.10 branch.
2. use a common name for the subregion instead of specific for 
   intel IGD.

V1->V2:
Per Alex's suggestion:
1. use device subtype region instead of VFIO PCI fixed region.
2. remove unnecessary ioctl, use the first byte of subregion to 
   control the running state of mdev device.  
3. for dirty page synchronization, implement the interface with
   VFIOContainer instead of vfio pci device.

Yulei Zhang (4):
  vfio: introduce a new VFIO subregion for mdev device migration support
  vfio: Add vm status change callback to stop/restart the mdev device
  vfio: Add SaveVMHandlers for VFIO device to support live migration
  vfio: introduce new VFIO ioctl VFIO_IOMMU_GET_DIRTY_BITMAP

 hw/vfio/common.c  |  34 ++
 hw/vfio/pci.c | 240 --
 hw/vfio/pci.h |   2 +
 include/hw/vfio/vfio-common.h |   1 +
 linux-headers/linux/vfio.h|  43 +++-
 roms/seabios  |   2 +-
 6 files changed, 312 insertions(+), 10 deletions(-)

-- 
2.7.4




[Qemu-devel] [RFC PATCH V4 1/4] vfio: introduce a new VFIO subregion for mdev device migration support

2018-04-09 Thread Yulei Zhang
New VFIO sub region VFIO_REGION_SUBTYPE_DEVICE_STATE is added
to fetch and restore the status of mdev device vGPU during the
live migration.

Signed-off-by: Yulei Zhang <yulei.zh...@intel.com>
---
 hw/vfio/pci.c  | 25 -
 hw/vfio/pci.h  |  2 ++
 linux-headers/linux/vfio.h |  9 ++---
 3 files changed, 32 insertions(+), 4 deletions(-)

diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
index c977ee3..f98a9dd 100644
--- a/hw/vfio/pci.c
+++ b/hw/vfio/pci.c
@@ -32,6 +32,7 @@
 #include "pci.h"
 #include "trace.h"
 #include "qapi/error.h"
+#include "migration/blocker.h"
 
 #define MSIX_CAP_LENGTH 12
 
@@ -2821,6 +2822,25 @@ static void vfio_realize(PCIDevice *pdev, Error **errp)
 vfio_vga_quirk_setup(vdev);
 }
 
+struct vfio_region_info *device_state;
+/* device state region setup */
+if (!vfio_get_dev_region_info(>vbasedev,
+VFIO_REGION_TYPE_PCI_VENDOR_TYPE,
+VFIO_REGION_SUBTYPE_DEVICE_STATE, _state)) {
+memcpy(>device_state, device_state,
+   sizeof(struct vfio_region_info));
+g_free(device_state);
+} else {
+error_setg(>migration_blocker,
+"Migration disabled: cannot support device state region");
+migrate_add_blocker(vdev->migration_blocker, );
+if (err) {
+error_propagate(errp, err);
+error_free(vdev->migration_blocker);
+goto error;
+}
+}
+
 for (i = 0; i < PCI_ROM_SLOT; i++) {
 vfio_bar_quirk_setup(vdev, i);
 }
@@ -2884,6 +2904,10 @@ out_teardown:
 vfio_teardown_msi(vdev);
 vfio_bars_exit(vdev);
 error:
+if (vdev->migration_blocker) {
+migrate_del_blocker(vdev->migration_blocker);
+error_free(vdev->migration_blocker);
+}
 error_prepend(errp, ERR_PREFIX, vdev->vbasedev.name);
 }
 
@@ -3009,7 +3033,6 @@ static Property vfio_pci_dev_properties[] = {
 
 static const VMStateDescription vfio_pci_vmstate = {
 .name = "vfio-pci",
-.unmigratable = 1,
 };
 
 static void vfio_pci_dev_class_init(ObjectClass *klass, void *data)
diff --git a/hw/vfio/pci.h b/hw/vfio/pci.h
index 502a575..0ee1724 100644
--- a/hw/vfio/pci.h
+++ b/hw/vfio/pci.h
@@ -116,6 +116,8 @@ typedef struct VFIOPCIDevice {
 VFIOBAR bars[PCI_NUM_REGIONS - 1]; /* No ROM */
 VFIOVGA *vga; /* 0xa, 0x3b0, 0x3c0 */
 void *igd_opregion;
+struct vfio_region_info device_state;
+Error *migration_blocker;
 PCIHostDeviceAddress host;
 EventNotifier err_notifier;
 EventNotifier req_notifier;
diff --git a/linux-headers/linux/vfio.h b/linux-headers/linux/vfio.h
index 4312e96..e3380ad 100644
--- a/linux-headers/linux/vfio.h
+++ b/linux-headers/linux/vfio.h
@@ -297,9 +297,12 @@ struct vfio_region_info_cap_type {
 #define VFIO_REGION_TYPE_PCI_VENDOR_MASK   (0x)
 
 /* 8086 Vendor sub-types */
-#define VFIO_REGION_SUBTYPE_INTEL_IGD_OPREGION (1)
-#define VFIO_REGION_SUBTYPE_INTEL_IGD_HOST_CFG (2)
-#define VFIO_REGION_SUBTYPE_INTEL_IGD_LPC_CFG  (3)
+#define VFIO_REGION_SUBTYPE_INTEL_IGD_OPREGION (1)
+#define VFIO_REGION_SUBTYPE_INTEL_IGD_HOST_CFG (2)
+#define VFIO_REGION_SUBTYPE_INTEL_IGD_LPC_CFG  (3)
+
+/* Mdev sub-type for device state save and restore */
+#define VFIO_REGION_SUBTYPE_DEVICE_STATE   (4)
 
 /**
  * VFIO_DEVICE_GET_IRQ_INFO - _IOWR(VFIO_TYPE, VFIO_BASE + 9,
-- 
2.7.4




[Qemu-devel] [PATCH V3 3/4] vfio: Add struct vfio_vmstate_info to introduce put/get callback function for vfio device status save/restore

2018-03-04 Thread Yulei Zhang
Introduce vfio_device_put/vfio_device_get functions for vfio device state
save/restore usage.

For VFIO pci device status migration, on the source side the
function vfio_device_put saves the following states
1. pci configuration space addr0~addr5
2. pci configuration space msi_addr msi_data
3. pci device status fetch from device driver

And on the target side the function vfio_device_get restores
the same states
1. re-setup the pci bar configuration
2. re-setup the pci device msi configuration
3. restore the pci device status

Signed-off-by: Yulei Zhang <yulei.zh...@intel.com>
---
 hw/vfio/pci.c  | 137 +
 linux-headers/linux/vfio.h |   3 +
 2 files changed, 140 insertions(+)

diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
index 3e2289c..c1676cf 100644
--- a/hw/vfio/pci.c
+++ b/hw/vfio/pci.c
@@ -2982,6 +2982,123 @@ static void vfio_vm_change_state_handler(void *pv, int 
running, RunState state)
 vbasedev->device_state = dev_state;
 }
 
+static int vfio_device_put(QEMUFile *f, void *pv, size_t size,
+   VMStateField *field, QJSON *vmdesc)
+{
+VFIOPCIDevice *vdev = pv;
+PCIDevice *pdev = >pdev;
+int sz = vdev->device_state.size - VFIO_DEVICE_STATE_OFFSET;
+uint8_t *buf = NULL;
+uint32_t msi_cfg, msi_lo, msi_hi, msi_data, bar_cfg, i;
+bool msi_64bit;
+
+for (i = 0; i < PCI_ROM_SLOT; i++) {
+bar_cfg = pci_default_read_config(pdev, PCI_BASE_ADDRESS_0 + i * 4, 4);
+qemu_put_be32(f, bar_cfg);
+}
+
+msi_cfg = pci_default_read_config(pdev, pdev->msi_cap + PCI_MSI_FLAGS, 2);
+msi_64bit = !!(msi_cfg & PCI_MSI_FLAGS_64BIT);
+
+msi_lo = pci_default_read_config(pdev,
+ pdev->msi_cap + PCI_MSI_ADDRESS_LO, 4);
+qemu_put_be32(f, msi_lo);
+
+if (msi_64bit) {
+msi_hi = pci_default_read_config(pdev,
+ pdev->msi_cap + PCI_MSI_ADDRESS_HI,
+ 4);
+qemu_put_be32(f, msi_hi);
+}
+
+msi_data = pci_default_read_config(pdev,
+   pdev->msi_cap + (msi_64bit ? PCI_MSI_DATA_64 : PCI_MSI_DATA_32),
+   2);
+qemu_put_be32(f, msi_data);
+
+buf = g_malloc(sz);
+if (buf == NULL) {
+error_report("vfio: Failed to allocate memory for migrate");
+goto exit;
+}
+
+if (pread(vdev->vbasedev.fd, buf, sz,
+  vdev->device_state.offset + VFIO_DEVICE_STATE_OFFSET) != sz) {
+error_report("vfio: Failed to read Device State Region");
+goto exit;
+}
+
+qemu_put_buffer(f, buf, sz);
+
+exit:
+g_free(buf);
+
+return 0;
+}
+
+static int vfio_device_get(QEMUFile *f, void *pv,
+   size_t size, VMStateField *field)
+{
+VFIOPCIDevice *vdev = pv;
+PCIDevice *pdev = >pdev;
+int sz = vdev->device_state.size - VFIO_DEVICE_STATE_OFFSET;
+uint8_t *buf = NULL;
+uint32_t ctl, msi_lo, msi_hi, msi_data, bar_cfg, i;
+bool msi_64bit;
+
+/* retore pci bar configuration */
+ctl = pci_default_read_config(pdev, PCI_COMMAND, 2);
+vfio_pci_write_config(pdev, PCI_COMMAND,
+  ctl & (!(PCI_COMMAND_IO | PCI_COMMAND_MEMORY)), 2);
+for (i = 0; i < PCI_ROM_SLOT; i++) {
+bar_cfg = qemu_get_be32(f);
+vfio_pci_write_config(pdev, PCI_BASE_ADDRESS_0 + i * 4, bar_cfg, 4);
+}
+vfio_pci_write_config(pdev, PCI_COMMAND,
+  ctl | PCI_COMMAND_IO | PCI_COMMAND_MEMORY, 2);
+
+/* restore msi configuration */
+ctl = pci_default_read_config(pdev, pdev->msi_cap + PCI_MSI_FLAGS, 2);
+msi_64bit = !!(ctl & PCI_MSI_FLAGS_64BIT);
+
+vfio_pci_write_config(>pdev,
+  pdev->msi_cap + PCI_MSI_FLAGS,
+  ctl & (!PCI_MSI_FLAGS_ENABLE), 2);
+
+msi_lo = qemu_get_be32(f);
+vfio_pci_write_config(pdev, pdev->msi_cap + PCI_MSI_ADDRESS_LO, msi_lo, 4);
+
+if (msi_64bit) {
+msi_hi = qemu_get_be32(f);
+vfio_pci_write_config(pdev, pdev->msi_cap + PCI_MSI_ADDRESS_HI,
+  msi_hi, 4);
+}
+msi_data = qemu_get_be32(f);
+vfio_pci_write_config(pdev,
+  pdev->msi_cap + (msi_64bit ? PCI_MSI_DATA_64 : PCI_MSI_DATA_32),
+  msi_data, 2);
+
+vfio_pci_write_config(>pdev, pdev->msi_cap + PCI_MSI_FLAGS,
+  ctl | PCI_MSI_FLAGS_ENABLE, 2);
+
+buf = g_malloc(sz);
+if (buf == NULL) {
+error_report("vfio: Failed to allocate memory for migrate");
+return -1;
+}
+
+qemu_get_buffer(f, buf, sz);
+if (pwrite(vdev->vbasedev.fd, buf, sz,
+   vdev->device_state.offset + VFIO_DEVICE_STATE_OFFSET) != sz) {
+error_report("vfio: Failed to write Device State Region");
+

[Qemu-devel] [PATCH V3 4/4] vfio: introduce new VFIO ioctl VFIO_IOMMU_GET_DIRTY_BITMAP

2018-03-04 Thread Yulei Zhang
New VFIO ioctl VFIO_IOMMU_GET_DIRTY_BITMAP is used to fetch the
bitmap of pinned memory in the iommu container; we need to copy that
memory to the target during the migration as it is dirtied by
mdev devices.

Signed-off-by: Yulei Zhang <yulei.zh...@intel.com>
---
 hw/vfio/common.c   | 34 ++
 linux-headers/linux/vfio.h | 14 ++
 2 files changed, 48 insertions(+)

diff --git a/hw/vfio/common.c b/hw/vfio/common.c
index 7b2924c..a952554 100644
--- a/hw/vfio/common.c
+++ b/hw/vfio/common.c
@@ -35,6 +35,7 @@
 #include "sysemu/kvm.h"
 #include "trace.h"
 #include "qapi/error.h"
+#include "exec/ram_addr.h"
 
 struct vfio_group_head vfio_group_list =
 QLIST_HEAD_INITIALIZER(vfio_group_list);
@@ -624,9 +625,42 @@ static void vfio_listener_region_del(MemoryListener 
*listener,
 }
 }
 
+static void vfio_log_sync(MemoryListener *listener,
+  MemoryRegionSection *section)
+{
+VFIOContainer *container = container_of(listener, VFIOContainer, listener);
+VFIOGroup *group = QLIST_FIRST(>group_list);
+VFIODevice *vbasedev;
+QLIST_FOREACH(vbasedev, >device_list, next) {
+if (vbasedev->device_state == VFIO_DEVICE_START) {
+return;
+}
+}
+
+struct vfio_iommu_get_dirty_bitmap *d;
+ram_addr_t size = int128_get64(section->size);
+unsigned long page_nr = size >> TARGET_PAGE_BITS;
+unsigned long bitmap_size =
+(BITS_TO_LONGS(page_nr) + 1) * sizeof(unsigned long);
+d = g_malloc0(sizeof(*d) + bitmap_size);
+d->start_addr = section->offset_within_address_space;
+d->page_nr = page_nr;
+
+if (ioctl(container->fd, VFIO_IOMMU_GET_DIRTY_BITMAP, d)) {
+error_report("vfio: Failed to fetch dirty pages for migration");
+goto exit;
+}
+
+cpu_physical_memory_set_dirty_lebitmap((unsigned long *)>dirty_bitmap,
+   d->start_addr, d->page_nr);
+exit:
+g_free(d);
+}
+
 static const MemoryListener vfio_memory_listener = {
 .region_add = vfio_listener_region_add,
 .region_del = vfio_listener_region_del,
+.log_sync = vfio_log_sync,
 };
 
 static void vfio_listener_release(VFIOContainer *container)
diff --git a/linux-headers/linux/vfio.h b/linux-headers/linux/vfio.h
index 4451a8f..a41f73b 100644
--- a/linux-headers/linux/vfio.h
+++ b/linux-headers/linux/vfio.h
@@ -574,6 +574,20 @@ struct vfio_iommu_type1_dma_unmap {
 #define VFIO_IOMMU_ENABLE  _IO(VFIO_TYPE, VFIO_BASE + 15)
 #define VFIO_IOMMU_DISABLE _IO(VFIO_TYPE, VFIO_BASE + 16)
 
+/**
+ * VFIO_IOMMU_GET_DIRTY_BITMAP - _IOW(VFIO_TYPE, VFIO_BASE + 17,
+ * struct vfio_iommu_get_dirty_bitmap)
+ *
+ * Return: 0 on success, -errno on failure.
+ */
+struct vfio_iommu_get_dirty_bitmap {
+   __u64  start_addr;
+   __u64  page_nr;
+   __u8   dirty_bitmap[];
+};
+
+#define VFIO_IOMMU_GET_DIRTY_BITMAP _IO(VFIO_TYPE, VFIO_BASE + 17)
+
 /*  Additional API for SPAPR TCE (Server POWERPC) IOMMU  */
 
 /*
-- 
2.7.4




[Qemu-devel] [PATCH V3 2/4] vfio: Add vm status change callback to stop/restart the mdev device

2018-03-04 Thread Yulei Zhang
A VM status change handler is added to change the vfio pci device
status during the migration: write the demanded device status
to the DEVICE STATUS subregion to stop the device on the source side
before fetching its status, and start the device on the target side
after restoring its status.

Signed-off-by: Yulei Zhang <yulei.zh...@intel.com>
---
 hw/vfio/pci.c | 20 
 include/hw/vfio/vfio-common.h |  1 +
 linux-headers/linux/vfio.h|  3 +++
 3 files changed, 24 insertions(+)

diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
index 2fe20e4..3e2289c 100644
--- a/hw/vfio/pci.c
+++ b/hw/vfio/pci.c
@@ -38,6 +38,7 @@
 static void vfio_disable_interrupts(VFIOPCIDevice *vdev);
 static void vfio_mmap_set_enabled(VFIOPCIDevice *vdev, bool enabled);
 static VMStateDescription vfio_pci_vmstate;
+static void vfio_vm_change_state_handler(void *pv, int running, RunState 
state);
 
 /*
  * Disabling BAR mmaping can be slow, but toggling it around INTx can
@@ -2880,6 +2881,7 @@ static void vfio_realize(PCIDevice *pdev, Error **errp)
 vfio_register_err_notifier(vdev);
 vfio_register_req_notifier(vdev);
 vfio_setup_resetfn_quirk(vdev);
+qemu_add_vm_change_state_handler(vfio_vm_change_state_handler, vdev);
 
 return;
 
@@ -2962,6 +2964,24 @@ post_reset:
 vfio_pci_post_reset(vdev);
 }
 
+static void vfio_vm_change_state_handler(void *pv, int running, RunState state)
+{
+VFIOPCIDevice *vdev = pv;
+VFIODevice *vbasedev = >vbasedev;
+uint8_t dev_state;
+uint8_t sz = 1;
+
+dev_state = running ? VFIO_DEVICE_START : VFIO_DEVICE_STOP;
+
+if (pwrite(vdev->vbasedev.fd, _state,
+   sz, vdev->device_state.offset) != sz) {
+error_report("vfio: Failed to %s device", running ? "start" : "stop");
+return;
+}
+
+vbasedev->device_state = dev_state;
+}
+
 static void vfio_instance_init(Object *obj)
 {
 PCIDevice *pci_dev = PCI_DEVICE(obj);
diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h
index f3a2ac9..9c14a8f 100644
--- a/include/hw/vfio/vfio-common.h
+++ b/include/hw/vfio/vfio-common.h
@@ -125,6 +125,7 @@ typedef struct VFIODevice {
 unsigned int num_irqs;
 unsigned int num_regions;
 unsigned int flags;
+bool device_state;
 } VFIODevice;
 
 struct VFIODeviceOps {
diff --git a/linux-headers/linux/vfio.h b/linux-headers/linux/vfio.h
index c3b8e4a..4ddeebc 100644
--- a/linux-headers/linux/vfio.h
+++ b/linux-headers/linux/vfio.h
@@ -303,6 +303,9 @@ struct vfio_region_info_cap_type {
 /* Mdev sub-type for device state save and restore */
 #define VFIO_REGION_SUBTYPE_DEVICE_STATE   (4)
 
+#define VFIO_DEVICE_START  0
+#define VFIO_DEVICE_STOP   1
+
 /**
  * VFIO_DEVICE_GET_IRQ_INFO - _IOWR(VFIO_TYPE, VFIO_BASE + 9,
  * struct vfio_irq_info)
-- 
2.7.4




[Qemu-devel] [PATCH V3 1/4] vfio: introduce a new VFIO subregion for mdev device migration support

2018-03-04 Thread Yulei Zhang
New VFIO sub region VFIO_REGION_SUBTYPE_DEVICE_STATE is added
to fetch and restore the status of mdev device vGPU during the
live migration.

Signed-off-by: Yulei Zhang <yulei.zh...@intel.com>
---
 hw/vfio/pci.c  | 14 +-
 hw/vfio/pci.h  |  1 +
 linux-headers/linux/vfio.h |  9 ++---
 3 files changed, 20 insertions(+), 4 deletions(-)

diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
index 31e1edf..2fe20e4 100644
--- a/hw/vfio/pci.c
+++ b/hw/vfio/pci.c
@@ -37,6 +37,7 @@
 
 static void vfio_disable_interrupts(VFIOPCIDevice *vdev);
 static void vfio_mmap_set_enabled(VFIOPCIDevice *vdev, bool enabled);
+static VMStateDescription vfio_pci_vmstate;
 
 /*
  * Disabling BAR mmaping can be slow, but toggling it around INTx can
@@ -2813,6 +2814,17 @@ static void vfio_realize(PCIDevice *pdev, Error **errp)
 vfio_vga_quirk_setup(vdev);
 }
 
+struct vfio_region_info *device_state;
+/* device state region setup */
+if (!vfio_get_dev_region_info(>vbasedev,
+VFIO_REGION_TYPE_PCI_VENDOR_TYPE | PCI_VENDOR_ID_INTEL,
+VFIO_REGION_SUBTYPE_DEVICE_STATE, _state)) {
+memcpy(>device_state, device_state,
+   sizeof(struct vfio_region_info));
+g_free(device_state);
+vfio_pci_vmstate.unmigratable = 0;
+}
+
 for (i = 0; i < PCI_ROM_SLOT; i++) {
 vfio_bar_quirk_setup(vdev, i);
 }
@@ -2994,7 +3006,7 @@ static Property vfio_pci_dev_properties[] = {
 DEFINE_PROP_END_OF_LIST(),
 };
 
-static const VMStateDescription vfio_pci_vmstate = {
+static VMStateDescription vfio_pci_vmstate = {
 .name = "vfio-pci",
 .unmigratable = 1,
 };
diff --git a/hw/vfio/pci.h b/hw/vfio/pci.h
index a8366bb..6a1d26e 100644
--- a/hw/vfio/pci.h
+++ b/hw/vfio/pci.h
@@ -116,6 +116,7 @@ typedef struct VFIOPCIDevice {
 VFIOBAR bars[PCI_NUM_REGIONS - 1]; /* No ROM */
 VFIOVGA *vga; /* 0xa, 0x3b0, 0x3c0 */
 void *igd_opregion;
+struct vfio_region_info device_state;
 PCIHostDeviceAddress host;
 EventNotifier err_notifier;
 EventNotifier req_notifier;
diff --git a/linux-headers/linux/vfio.h b/linux-headers/linux/vfio.h
index 4e7ab4c..c3b8e4a 100644
--- a/linux-headers/linux/vfio.h
+++ b/linux-headers/linux/vfio.h
@@ -296,9 +296,12 @@ struct vfio_region_info_cap_type {
 #define VFIO_REGION_TYPE_PCI_VENDOR_MASK   (0x)
 
 /* 8086 Vendor sub-types */
-#define VFIO_REGION_SUBTYPE_INTEL_IGD_OPREGION (1)
-#define VFIO_REGION_SUBTYPE_INTEL_IGD_HOST_CFG (2)
-#define VFIO_REGION_SUBTYPE_INTEL_IGD_LPC_CFG  (3)
+#define VFIO_REGION_SUBTYPE_INTEL_IGD_OPREGION (1)
+#define VFIO_REGION_SUBTYPE_INTEL_IGD_HOST_CFG (2)
+#define VFIO_REGION_SUBTYPE_INTEL_IGD_LPC_CFG  (3)
+
+/* Mdev sub-type for device state save and restore */
+#define VFIO_REGION_SUBTYPE_DEVICE_STATE   (4)
 
 /**
  * VFIO_DEVICE_GET_IRQ_INFO - _IOWR(VFIO_TYPE, VFIO_BASE + 9,
-- 
2.7.4




[Qemu-devel] [PATCH V3 0/4] vfio: Introduce Live migration capability to vfio_mdev device

2018-03-04 Thread Yulei Zhang
Summary

This series RFC would like to resume the discussion about how to
introduce the live migration capability to vfio mdev device. 

By adding a new vfio subtype region VFIO_REGION_SUBTYPE_DEVICE_STATE,
the mdev device will be set to migratable if the new region exist
during the initialization.  

The intention to add the new region is using it for mdev device status
save and restore during the migration. The access to this region
will be trapped and forward to the mdev device driver, it also uses 
the first byte in the new region to control the running state of mdev
device, so during the migration after stop the mdev driver, qemu could
retrieve the specific device status from this region and transfer to 
the target VM side for the mdev device restore.

In addition,  we add one new ioctl VFIO_IOMMU_GET_DIRTY_BITMAP to help do 
the mdev device dirty page synchronization during the migration, currently
it is just for static copy, in the future we would like to add new interface
for the pre-copy.

Below is the vfio_mdev device migration sequence
Source VM side:
start migration
|
V
 get the cpu state change callback, write to the
 subregion's first byte to stop the mdev device
|
V
 query the dirty page bitmap from the iommu container
 and add it into the qemu dirty list for synchronization
|
V
 save the device status into Qemufile which is
 read from the vfio device subregion

Target VM side:
   restore the mdev device after get the
 saved status context from Qemufile
|
V
 get the cpu state change callback
 write to subregion's first byte to 
  start the mdev device to put it in 
  running status
|
V
finish migration

V2->V3:
1. rebase the patch to Qemu stable 2.10 branch.
2. use a common name for the subregion instead of specific for 
   intel IGD.

V1->V2:
Per Alex's suggestion:
1. use device subtype region instead of VFIO PCI fixed region.
2. remove unnecessary ioctl, use the first byte of subregion to 
   control the running state of mdev device.  
3. for dirty page synchronization, implement the interface with
   VFIOContainer instead of vfio pci device.

Yulei Zhang (4):
  vfio: introduce a new VFIO subregion for mdev device migration support
  vfio: Add vm status change callback to stop/restart the mdev device
  vfio: Add struct vfio_vmstate_info to introduce put/get callback
function for vfio device status save/restore
  vfio: introduce new VFIO ioctl VFIO_IOMMU_GET_DIRTY_BITMAP

 hw/vfio/common.c  |  34 +
 hw/vfio/pci.c | 171 +-
 hw/vfio/pci.h |   1 +
 include/hw/vfio/vfio-common.h |   1 +
 linux-headers/linux/vfio.h|  29 ++-
 5 files changed, 232 insertions(+), 4 deletions(-)

-- 
2.7.4



[Qemu-devel] [RFC V2 4/4] vfio: introduce new VFIO ioctl VFIO_IOMMU_GET_DIRTY_BITMAP

2017-07-31 Thread Yulei Zhang
New VFIO ioctl VFIO_IOMMU_GET_DIRTY_BITMAP is used to fetch the
bitmap of pinned memory in iommu container, we need copy those
memory to the target during the migration as they are dirtied by
mdev devices.

Signed-off-by: Yulei Zhang <yulei.zh...@intel.com>
---
 hw/vfio/common.c   | 32 
 linux-headers/linux/vfio.h | 14 ++
 2 files changed, 46 insertions(+)

diff --git a/hw/vfio/common.c b/hw/vfio/common.c
index f3ba9b9..54d43d5 100644
--- a/hw/vfio/common.c
+++ b/hw/vfio/common.c
@@ -35,6 +35,7 @@
 #include "sysemu/kvm.h"
 #include "trace.h"
 #include "qapi/error.h"
+#include "exec/ram_addr.h"
 
 struct vfio_group_head vfio_group_list =
 QLIST_HEAD_INITIALIZER(vfio_group_list);
@@ -603,9 +604,40 @@ static void vfio_listener_region_del(MemoryListener 
*listener,
 }
 }
 
+static void vfio_log_sync(MemoryListener *listener,
+  MemoryRegionSection *section)
+{
+VFIOContainer *container = container_of(listener, VFIOContainer, listener);
+VFIOGroup *group = QLIST_FIRST(>group_list);
+VFIODevice *vbasedev;
+QLIST_FOREACH(vbasedev, >device_list, next) {
+   if (vbasedev->device_state == VFIO_DEVICE_START)
+   return;
+}
+
+struct vfio_iommu_get_dirty_bitmap *d;
+ram_addr_t size = int128_get64(section->size);
+unsigned long page_nr = size >> TARGET_PAGE_BITS;
+unsigned long bitmap_size = (BITS_TO_LONGS(page_nr) + 1) * sizeof(unsigned 
long);
+d = g_malloc0(sizeof(*d) + bitmap_size);
+d->start_addr = section->offset_within_address_space;
+d->page_nr = page_nr;
+
+if (ioctl(container->fd, VFIO_IOMMU_GET_DIRTY_BITMAP, d)) {
+error_report("vfio: Failed to fetch dirty pages for migration\n");
+goto exit;
+}
+
+cpu_physical_memory_set_dirty_lebitmap((unsigned long*)>dirty_bitmap, 
d->start_addr, d->page_nr);
+
+exit:
+g_free(d);
+}
+
 static const MemoryListener vfio_memory_listener = {
 .region_add = vfio_listener_region_add,
 .region_del = vfio_listener_region_del,
+.log_sync = vfio_log_sync,
 };
 
 static void vfio_listener_release(VFIOContainer *container)
diff --git a/linux-headers/linux/vfio.h b/linux-headers/linux/vfio.h
index dbbe7e1..cf3d163 100644
--- a/linux-headers/linux/vfio.h
+++ b/linux-headers/linux/vfio.h
@@ -553,6 +553,20 @@ struct vfio_iommu_type1_dma_unmap {
 #define VFIO_IOMMU_ENABLE  _IO(VFIO_TYPE, VFIO_BASE + 15)
 #define VFIO_IOMMU_DISABLE _IO(VFIO_TYPE, VFIO_BASE + 16)
 
+/**
+ * VFIO_IOMMU_GET_DIRTY_BITMAP - _IOW(VFIO_TYPE, VFIO_BASE + 17,
+ * struct vfio_iommu_get_dirty_bitmap)
+ *
+ * Return: 0 on success, -errno on failure.
+ */
+struct vfio_iommu_get_dirty_bitmap{
+   __u64  start_addr;
+   __u64  page_nr;
+   __u8   dirty_bitmap[];
+};
+
+#define VFIO_IOMMU_GET_DIRTY_BITMAP _IO(VFIO_TYPE, VFIO_BASE + 17)
+
 /*  Additional API for SPAPR TCE (Server POWERPC) IOMMU  */
 
 /*
-- 
2.7.4




[Qemu-devel] [RFC V2 2/4] vfio: Add vm status change callback to stop/restart the mdev device

2017-07-31 Thread Yulei Zhang
A VM status change handler is added to change the vfio pci device
status during the migration: write the demanded device status
to the DEVICE STATUS subregion to stop the device on the source side
before fetching its status, and start the device on the target side
after restoring its status.

Signed-off-by: Yulei Zhang <yulei.zh...@intel.com>
---
 hw/vfio/pci.c | 19 +++
 include/hw/vfio/vfio-common.h |  1 +
 linux-headers/linux/vfio.h|  3 +++
 3 files changed, 23 insertions(+)

diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
index 21a5cef..753da80 100644
--- a/hw/vfio/pci.c
+++ b/hw/vfio/pci.c
@@ -38,6 +38,7 @@
 static void vfio_disable_interrupts(VFIOPCIDevice *vdev);
 static void vfio_mmap_set_enabled(VFIOPCIDevice *vdev, bool enabled);
 static VMStateDescription vfio_pci_vmstate;
+static void vfio_vm_change_state_handler(void *pv, int running, RunState 
state);
 
 /*
  * Disabling BAR mmaping can be slow, but toggling it around INTx can
@@ -2858,6 +2859,7 @@ static void vfio_realize(PCIDevice *pdev, Error **errp)
 vfio_register_err_notifier(vdev);
 vfio_register_req_notifier(vdev);
 vfio_setup_resetfn_quirk(vdev);
+qemu_add_vm_change_state_handler(vfio_vm_change_state_handler, vdev);
 
 return;
 
@@ -2940,6 +2942,23 @@ post_reset:
 vfio_pci_post_reset(vdev);
 }
 
+static void vfio_vm_change_state_handler(void *pv, int running, RunState state)
+{
+VFIOPCIDevice *vdev = pv;
+VFIODevice *vbasedev = >vbasedev;
+uint8_t dev_state;
+uint8_t sz = 1;
+
+dev_state = running ? VFIO_DEVICE_START : VFIO_DEVICE_STOP;
+
+if (pwrite(vdev->vbasedev.fd, _state, sz, vdev->device_state.offset) 
!= sz) {
+error_report("vfio: Failed to %s device\n", running ? "start" : 
"stop");
+return;
+}
+
+vbasedev->device_state = dev_state;
+}
+
 static void vfio_instance_init(Object *obj)
 {
 PCIDevice *pci_dev = PCI_DEVICE(obj);
diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h
index c582de1..c4bab97 100644
--- a/include/hw/vfio/vfio-common.h
+++ b/include/hw/vfio/vfio-common.h
@@ -123,6 +123,7 @@ typedef struct VFIODevice {
 unsigned int num_irqs;
 unsigned int num_regions;
 unsigned int flags;
+bool device_state;
 } VFIODevice;
 
 struct VFIODeviceOps {
diff --git a/linux-headers/linux/vfio.h b/linux-headers/linux/vfio.h
index e2c53bf..ae1b953 100644
--- a/linux-headers/linux/vfio.h
+++ b/linux-headers/linux/vfio.h
@@ -299,6 +299,9 @@ struct vfio_region_info_cap_type {
 #define VFIO_REGION_SUBTYPE_INTEL_IGD_LPC_CFG  (3)
 #define VFIO_REGION_SUBTYPE_INTEL_IGD_DEVICE_STATE (4)
 
+#define VFIO_DEVICE_START  0
+#define VFIO_DEVICE_STOP   1
+
 /**
  * VFIO_DEVICE_GET_IRQ_INFO - _IOWR(VFIO_TYPE, VFIO_BASE + 9,
  * struct vfio_irq_info)
-- 
2.7.4




[Qemu-devel] [RFC V2 3/4] vfio: Add struct vfio_vmstate_info to introduce put/get callback function for vfio device status save/restore

2017-07-31 Thread Yulei Zhang
Introduce the vfio_device_put/vfio_device_get functions for vfio device state
save/restore usage.

For VFIO pci device status migration, the source side uses
function vfio_device_put to save the following states
1. pci configuration space addr0~addr5
2. pci configuration space msi_addr msi_data
3. pci device status fetch from device driver

And the target side uses function vfio_device_get to restore
the same states
1. re-setup the pci bar configuration
2. re-setup the pci device msi configuration
3. restore the pci device status

Signed-off-by: Yulei Zhang <yulei.zh...@intel.com>
---
 hw/vfio/pci.c  | 132 +
 linux-headers/linux/vfio.h |   2 +
 2 files changed, 134 insertions(+)

diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
index 753da80..c0fc1d2 100644
--- a/hw/vfio/pci.c
+++ b/hw/vfio/pci.c
@@ -2959,6 +2959,118 @@ static void vfio_vm_change_state_handler(void *pv, int 
running, RunState state)
 vbasedev->device_state = dev_state;
 }
 
+static int vfio_device_put(QEMUFile *f, void *pv, size_t size, VMStateField 
*field,
+QJSON *vmdesc)
+{
+VFIOPCIDevice *vdev = pv;
+PCIDevice *pdev = >pdev;
+int sz = vdev->device_state.size - VFIO_DEVICE_STATE_OFFSET;
+uint8_t *buf = NULL;
+uint32_t msi_cfg, msi_lo, msi_hi, msi_data, bar_cfg, i;
+bool msi_64bit;
+
+for (i = 0; i < PCI_ROM_SLOT; i++) {
+bar_cfg = pci_default_read_config(pdev, PCI_BASE_ADDRESS_0 + i*4, 4);
+qemu_put_be32(f, bar_cfg);
+}
+
+msi_cfg = pci_default_read_config(pdev, pdev->msi_cap + PCI_MSI_FLAGS, 2);
+msi_64bit = !!(msi_cfg & PCI_MSI_FLAGS_64BIT);
+
+msi_lo = pci_default_read_config(pdev, pdev->msi_cap + PCI_MSI_ADDRESS_LO, 
4);
+qemu_put_be32(f, msi_lo);
+
+if (msi_64bit) {
+msi_hi = pci_default_read_config(pdev, pdev->msi_cap + 
PCI_MSI_ADDRESS_HI, 4);
+qemu_put_be32(f, msi_hi);
+}
+
+msi_data = pci_default_read_config(pdev,
+ pdev->msi_cap + (msi_64bit ? PCI_MSI_DATA_64 : 
PCI_MSI_DATA_32), 2);
+qemu_put_be32(f, msi_data);
+
+buf = g_malloc(sz);
+if (buf == NULL) {
+error_report("vfio: Failed to allocate memory for migrate\n");
+goto exit;
+}
+
+if (pread(vdev->vbasedev.fd, buf, sz,
+  vdev->device_state.offset + VFIO_DEVICE_STATE_OFFSET) != sz) {
+error_report("vfio: Failed to read Device State Region\n");
+goto exit;
+}
+
+qemu_put_buffer(f, buf, sz);
+
+exit:
+if (buf)
+g_free(buf);
+
+return 0;
+}
+
+static int vfio_device_get(QEMUFile *f, void *pv, size_t size, VMStateField 
*field)
+{
+VFIOPCIDevice *vdev = pv;
+PCIDevice *pdev = >pdev;
+int sz = vdev->device_state.size - VFIO_DEVICE_STATE_OFFSET;
+uint8_t *buf = NULL;
+uint32_t ctl, msi_lo, msi_hi, msi_data, bar_cfg, i;
+bool msi_64bit;
+
+/* retore pci bar configuration */
+ctl = pci_default_read_config(pdev, PCI_COMMAND, 2);
+vfio_pci_write_config(pdev, PCI_COMMAND,
+  ctl & (!(PCI_COMMAND_IO | PCI_COMMAND_MEMORY)), 2);
+for (i = 0; i < PCI_ROM_SLOT; i++) {
+bar_cfg = qemu_get_be32(f);
+vfio_pci_write_config(pdev, PCI_BASE_ADDRESS_0 + i*4, bar_cfg, 4);
+}
+vfio_pci_write_config(pdev, PCI_COMMAND,
+  ctl | PCI_COMMAND_IO | PCI_COMMAND_MEMORY, 2);
+
+/* restore msi configuration */
+ctl = pci_default_read_config(pdev, pdev->msi_cap + PCI_MSI_FLAGS, 2);
+msi_64bit = !!(ctl & PCI_MSI_FLAGS_64BIT);
+
+vfio_pci_write_config(>pdev,
+  pdev->msi_cap + PCI_MSI_FLAGS,
+  ctl & (!PCI_MSI_FLAGS_ENABLE), 2);
+
+msi_lo = qemu_get_be32(f);
+vfio_pci_write_config(pdev, pdev->msi_cap + PCI_MSI_ADDRESS_LO, msi_lo, 4);
+
+if (msi_64bit) {
+msi_hi = qemu_get_be32(f);
+vfio_pci_write_config(pdev, pdev->msi_cap + PCI_MSI_ADDRESS_HI, 
msi_hi, 4);
+}
+msi_data = qemu_get_be32(f);
+vfio_pci_write_config(pdev,
+  pdev->msi_cap + (msi_64bit ? PCI_MSI_DATA_64 : 
PCI_MSI_DATA_32),
+  msi_data, 2);
+
+vfio_pci_write_config(>pdev, pdev->msi_cap + PCI_MSI_FLAGS,
+  ctl | PCI_MSI_FLAGS_ENABLE, 2);
+
+buf = g_malloc(sz);
+if (buf == NULL) {
+error_report("vfio: Failed to allocate memory for migrate\n");
+return -1;
+}
+
+qemu_get_buffer(f, buf, sz);
+if (pwrite(vdev->vbasedev.fd, buf, sz,
+   vdev->device_state.offset + VFIO_DEVICE_STATE_OFFSET) != sz) {
+error_report("vfio: Failed to write Device State Region\n");
+return -1;
+}
+
+if (buf)
+   g_free(buf);
+return 0;
+}
+
 static void vfio_instance_init(Object *obj)
 {
 PCIDevice *pci_dev = 

[Qemu-devel] [RFC V2 1/4] vfio: introduce a new VFIO sub region for mdev device migration support

2017-07-31 Thread Yulei Zhang
New VFIO sub region VFIO_REGION_SUBTYPE_INTEL_IGD_DEVICE_STATE is added
to fetch and restore the status of mdev device vGPU during the live migration.

Signed-off-by: Yulei Zhang <yulei.zh...@intel.com>
---
 hw/vfio/pci.c  | 13 -
 hw/vfio/pci.h  |  1 +
 linux-headers/linux/vfio.h |  7 ---
 3 files changed, 17 insertions(+), 4 deletions(-)

diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
index 03a3d01..21a5cef 100644
--- a/hw/vfio/pci.c
+++ b/hw/vfio/pci.c
@@ -37,6 +37,7 @@
 
 static void vfio_disable_interrupts(VFIOPCIDevice *vdev);
 static void vfio_mmap_set_enabled(VFIOPCIDevice *vdev, bool enabled);
+static VMStateDescription vfio_pci_vmstate;
 
 /*
  * Disabling BAR mmaping can be slow, but toggling it around INTx can
@@ -2792,6 +2793,16 @@ static void vfio_realize(PCIDevice *pdev, Error **errp)
 vfio_vga_quirk_setup(vdev);
 }
 
+struct vfio_region_info *device_state;
+/* device state region setup */
+if (!vfio_get_dev_region_info(>vbasedev,
+VFIO_REGION_TYPE_PCI_VENDOR_TYPE | PCI_VENDOR_ID_INTEL,
+VFIO_REGION_SUBTYPE_INTEL_IGD_DEVICE_STATE, _state)) {
+   memcpy(>device_state, device_state, sizeof(struct 
vfio_region_info));
+   g_free(device_state);
+vfio_pci_vmstate.unmigratable = 0;
+}
+
 for (i = 0; i < PCI_ROM_SLOT; i++) {
 vfio_bar_quirk_setup(vdev, i);
 }
@@ -2973,7 +2984,7 @@ static Property vfio_pci_dev_properties[] = {
 DEFINE_PROP_END_OF_LIST(),
 };
 
-static const VMStateDescription vfio_pci_vmstate = {
+static VMStateDescription vfio_pci_vmstate = {
 .name = "vfio-pci",
 .unmigratable = 1,
 };
diff --git a/hw/vfio/pci.h b/hw/vfio/pci.h
index a8366bb..6a1d26e 100644
--- a/hw/vfio/pci.h
+++ b/hw/vfio/pci.h
@@ -116,6 +116,7 @@ typedef struct VFIOPCIDevice {
 VFIOBAR bars[PCI_NUM_REGIONS - 1]; /* No ROM */
 VFIOVGA *vga; /* 0xa, 0x3b0, 0x3c0 */
 void *igd_opregion;
+struct vfio_region_info device_state;
 PCIHostDeviceAddress host;
 EventNotifier err_notifier;
 EventNotifier req_notifier;
diff --git a/linux-headers/linux/vfio.h b/linux-headers/linux/vfio.h
index 531cb2e..e2c53bf 100644
--- a/linux-headers/linux/vfio.h
+++ b/linux-headers/linux/vfio.h
@@ -294,9 +294,10 @@ struct vfio_region_info_cap_type {
 #define VFIO_REGION_TYPE_PCI_VENDOR_MASK   (0x)
 
 /* 8086 Vendor sub-types */
-#define VFIO_REGION_SUBTYPE_INTEL_IGD_OPREGION (1)
-#define VFIO_REGION_SUBTYPE_INTEL_IGD_HOST_CFG (2)
-#define VFIO_REGION_SUBTYPE_INTEL_IGD_LPC_CFG  (3)
+#define VFIO_REGION_SUBTYPE_INTEL_IGD_OPREGION (1)
+#define VFIO_REGION_SUBTYPE_INTEL_IGD_HOST_CFG (2)
+#define VFIO_REGION_SUBTYPE_INTEL_IGD_LPC_CFG  (3)
+#define VFIO_REGION_SUBTYPE_INTEL_IGD_DEVICE_STATE (4)
 
 /**
  * VFIO_DEVICE_GET_IRQ_INFO - _IOWR(VFIO_TYPE, VFIO_BASE + 9,
-- 
2.7.4




[Qemu-devel] [RFC V2 0/4] vfio: Introduce Live migration capability to vfio_mdev device

2017-07-31 Thread Yulei Zhang
Summary

This series RFC would like to introduce the live migration capability
to vfio_mdev device. 

As vfio_mdev devices currently don't support migration, we introduce
a new vfio subtype region VFIO_REGION_SUBTYPE_INTEL_IGD_DEVICE_STATE
for the Intel vGPU device; during the vfio device initialization, the mdev
device will be set to migratable if the new region exists.

The intention to add the new region is using it for vfio_mdev device
status save and restore during the migration. The access to this region
will be trapped and forward to the vfio_mdev device driver. And we use 
the first byte in the new region to control the running state of mdev
device.

Meanwhile we add one new ioctl VFIO_IOMMU_GET_DIRTY_BITMAP to help do 
the mdev device dirty page synchronization.

So the vfio_mdev device migration sequence would be
Source VM side:
start migration
|
V
 get the cpu state change callback, write to the
 subregion's first byte to stop the mdev device
|
V
 query the dirty page bitmap from the iommu container
 and add it into the qemu dirty list for synchronization
|
V
 save the device status into Qemufile which is
 read from the vfio device subregion

Target VM side:
   restore the mdev device after getting the
 saved status context from Qemufile
|
V
 get the cpu state change callback
 write to subregion's first byte to 
  start the mdev device to put it in 
  running status
|
V
finish migration

V1->V2:
Per Alex's suggestion:
1. use device subtype region instead of VFIO PCI fixed region.
2. remove unnecessary ioctl, use the first byte of subregion to 
   control the running state of mdev device.  
3. for dirty page synchronization, implement the interface with
   VFIOContainer instead of vfio pci device.

Yulei Zhang (4):
  vfio: introduce a new VFIO sub region for mdev device migration
support
  vfio: Add vm status change callback to stop/restart the mdev device
  vfio: Add struct vfio_vmstate_info to introduce put/get callback
function for vfio device status save/restore
  vfio: introduce new VFIO ioctl VFIO_IOMMU_GET_DIRTY_BITMAP

 hw/vfio/common.c  |  32 +
 hw/vfio/pci.c | 164 +-
 hw/vfio/pci.h |   1 +
 include/hw/vfio/vfio-common.h |   1 +
 linux-headers/linux/vfio.h|  26 ++-
 5 files changed, 220 insertions(+), 4 deletions(-)

-- 
2.7.4




[Qemu-devel] [Intel-gfx][RFC 7/9] drm/i915/gvt: Introduce new VFIO ioctl for device status control

2017-06-26 Thread Yulei Zhang
Add handling for the new VFIO ioctl VFIO_DEVICE_PCI_STATUS_SET to control
the status of the mdev device vGPU. The vGPU will stop/start rendering
according to the command that comes along with the ioctl.

Signed-off-by: Yulei Zhang <yulei.zh...@intel.com>
---
 drivers/gpu/drm/i915/gvt/kvmgt.c |  9 +
 drivers/gpu/drm/i915/gvt/vgpu.c  |  1 +
 include/uapi/linux/vfio.h| 15 +++
 3 files changed, 25 insertions(+)

diff --git a/drivers/gpu/drm/i915/gvt/kvmgt.c b/drivers/gpu/drm/i915/gvt/kvmgt.c
index c44b319..ac327f7 100644
--- a/drivers/gpu/drm/i915/gvt/kvmgt.c
+++ b/drivers/gpu/drm/i915/gvt/kvmgt.c
@@ -1147,6 +1147,15 @@ static long intel_vgpu_ioctl(struct mdev_device *mdev, 
unsigned int cmd,
} else if (cmd == VFIO_DEVICE_RESET) {
intel_gvt_ops->vgpu_reset(vgpu);
return 0;
+   } else if (cmd == VFIO_DEVICE_PCI_STATUS_SET) {
+   struct vfio_pci_status_set status;
+   minsz = offsetofend(struct vfio_pci_status_set, flags);
+   if (copy_from_user(, (void __user *)arg, minsz))
+   return -EFAULT;
+   if (status.flags == VFIO_DEVICE_PCI_STOP)
+   intel_gvt_ops->vgpu_deactivate(vgpu);
+   else
+   intel_gvt_ops->vgpu_activate(vgpu);
}
 
return 0;
diff --git a/drivers/gpu/drm/i915/gvt/vgpu.c b/drivers/gpu/drm/i915/gvt/vgpu.c
index 989f353..542bde9 100644
--- a/drivers/gpu/drm/i915/gvt/vgpu.c
+++ b/drivers/gpu/drm/i915/gvt/vgpu.c
@@ -205,6 +205,7 @@ void intel_gvt_activate_vgpu(struct intel_vgpu *vgpu)
 {
mutex_lock(>gvt->lock);
vgpu->active = true;
+   intel_vgpu_start_schedule(vgpu);
mutex_unlock(>gvt->lock);
 }
 
diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h
index 9ad9ce1..4bb057d 100644
--- a/include/uapi/linux/vfio.h
+++ b/include/uapi/linux/vfio.h
@@ -503,6 +503,21 @@ struct vfio_pci_hot_reset {
 
 #define VFIO_DEVICE_PCI_HOT_RESET  _IO(VFIO_TYPE, VFIO_BASE + 13)
 
+/**
+ * VFIO_DEVICE_PCI_STATUS_SET - _IOW(VFIO_TYPE, VFIO_BASE + 14,
+ * struct vfio_pci_status_set)
+ *
+ * Return: 0 on success, -errno on failure.
+ */
+struct vfio_pci_status_set{
+   __u32   argsz;
+   __u32   flags;
+#define VFIO_DEVICE_PCI_STOP  (1 << 0)
+#define VFIO_DEVICE_PCI_START (1 << 1)
+};
+
+#define VFIO_DEVICE_PCI_STATUS_SET _IO(VFIO_TYPE, VFIO_BASE + 14)
+
 /*  API for Type1 VFIO IOMMU  */
 
 /**
-- 
2.7.4




[Qemu-devel] [Intel-gfx][RFC 5/9] drm/i915/gvt: Align the guest gm aperture start offset for live migration

2017-06-26 Thread Yulei Zhang
As the guest gm aperture region start offset is initialized when the vGPU
is created, in order to make sure that the start offset remains the same
after migration, align the aperture start offset to 0 for the guest.

Signed-off-by: Yulei Zhang <yulei.zh...@intel.com>
---
 drivers/gpu/drm/i915/gvt/kvmgt.c | 3 +--
 drivers/gpu/drm/i915/gvt/vgpu.c  | 7 +--
 2 files changed, 6 insertions(+), 4 deletions(-)

diff --git a/drivers/gpu/drm/i915/gvt/kvmgt.c b/drivers/gpu/drm/i915/gvt/kvmgt.c
index 1ae0b40..d2b13ae 100644
--- a/drivers/gpu/drm/i915/gvt/kvmgt.c
+++ b/drivers/gpu/drm/i915/gvt/kvmgt.c
@@ -1002,8 +1002,7 @@ static long intel_vgpu_ioctl(struct mdev_device *mdev, 
unsigned int cmd,
 
sparse->nr_areas = nr_areas;
cap_type_id = VFIO_REGION_INFO_CAP_SPARSE_MMAP;
-   sparse->areas[0].offset =
-   PAGE_ALIGN(vgpu_aperture_offset(vgpu));
+   sparse->areas[0].offset = 0;
sparse->areas[0].size = vgpu_aperture_sz(vgpu);
break;
 
diff --git a/drivers/gpu/drm/i915/gvt/vgpu.c b/drivers/gpu/drm/i915/gvt/vgpu.c
index 90c14e6..989f353 100644
--- a/drivers/gpu/drm/i915/gvt/vgpu.c
+++ b/drivers/gpu/drm/i915/gvt/vgpu.c
@@ -43,8 +43,7 @@ void populate_pvinfo_page(struct intel_vgpu *vgpu)
vgpu_vreg(vgpu, vgtif_reg(version_minor)) = 0;
vgpu_vreg(vgpu, vgtif_reg(display_ready)) = 0;
vgpu_vreg(vgpu, vgtif_reg(vgt_id)) = vgpu->id;
-   vgpu_vreg(vgpu, vgtif_reg(avail_rs.mappable_gmadr.base)) =
-   vgpu_aperture_gmadr_base(vgpu);
+   vgpu_vreg(vgpu, vgtif_reg(avail_rs.mappable_gmadr.base)) = 0;
vgpu_vreg(vgpu, vgtif_reg(avail_rs.mappable_gmadr.size)) =
vgpu_aperture_sz(vgpu);
vgpu_vreg(vgpu, vgtif_reg(avail_rs.nonmappable_gmadr.base)) =
@@ -480,6 +479,8 @@ void intel_gvt_reset_vgpu_locked(struct intel_vgpu *vgpu, 
bool dmlr,
 {
struct intel_gvt *gvt = vgpu->gvt;
struct intel_gvt_workload_scheduler *scheduler = >scheduler;
+   u64 maddr = vgpu_vreg(vgpu, vgtif_reg(avail_rs.mappable_gmadr.base));
+   u64 unmaddr = vgpu_vreg(vgpu, 
vgtif_reg(avail_rs.nonmappable_gmadr.base));
 
gvt_dbg_core("--\n");
gvt_dbg_core("resseting vgpu%d, dmlr %d, engine_mask %08x\n",
@@ -510,6 +511,8 @@ void intel_gvt_reset_vgpu_locked(struct intel_vgpu *vgpu, 
bool dmlr,
 
intel_vgpu_reset_mmio(vgpu, dmlr);
populate_pvinfo_page(vgpu);
+   vgpu_vreg(vgpu, vgtif_reg(avail_rs.mappable_gmadr.base)) = 
maddr;
+   vgpu_vreg(vgpu, vgtif_reg(avail_rs.nonmappable_gmadr.base)) = 
unmaddr;
intel_vgpu_reset_display(vgpu);
 
if (dmlr) {
-- 
2.7.4




[Qemu-devel] [Intel-gfx][RFC 9/9] drm/i915/gvt: Add support to VFIO region VFIO_PCI_DEVICE_STATE_REGION_INDEX

2017-06-26 Thread Yulei Zhang
Add support for the new VFIO region VFIO_PCI_DEVICE_STATE_REGION_INDEX in
the vGPU; through this new region it can fetch the status from the mdev
device for migration, and on the target side it can retrieve the device
status and reconfigure the device to continue running after resuming the guest.

Signed-off-by: Yulei Zhang <yulei.zh...@intel.com>
---
 drivers/gpu/drm/i915/gvt/Makefile  |   2 +-
 drivers/gpu/drm/i915/gvt/gvt.c |   1 +
 drivers/gpu/drm/i915/gvt/gvt.h |   5 +
 drivers/gpu/drm/i915/gvt/kvmgt.c   |  19 +
 drivers/gpu/drm/i915/gvt/migrate.c | 715 +
 drivers/gpu/drm/i915/gvt/migrate.h |  82 +
 drivers/gpu/drm/i915/gvt/mmio.c|  14 +
 drivers/gpu/drm/i915/gvt/mmio.h|   1 +
 include/uapi/linux/vfio.h  |   3 +-
 9 files changed, 840 insertions(+), 2 deletions(-)
 create mode 100644 drivers/gpu/drm/i915/gvt/migrate.c
 create mode 100644 drivers/gpu/drm/i915/gvt/migrate.h

diff --git a/drivers/gpu/drm/i915/gvt/Makefile 
b/drivers/gpu/drm/i915/gvt/Makefile
index f5486cb9..a7e2e34 100644
--- a/drivers/gpu/drm/i915/gvt/Makefile
+++ b/drivers/gpu/drm/i915/gvt/Makefile
@@ -1,7 +1,7 @@
 GVT_DIR := gvt
 GVT_SOURCE := gvt.o aperture_gm.o handlers.o vgpu.o trace_points.o firmware.o \
interrupt.o gtt.o cfg_space.o opregion.o mmio.o display.o edid.o \
-   execlist.o scheduler.o sched_policy.o render.o cmd_parser.o
+   execlist.o scheduler.o sched_policy.o render.o cmd_parser.o migrate.o
 
 ccflags-y  += -I$(src) -I$(src)/$(GVT_DIR)
 i915-y += $(addprefix $(GVT_DIR)/, 
$(GVT_SOURCE))
diff --git a/drivers/gpu/drm/i915/gvt/gvt.c b/drivers/gpu/drm/i915/gvt/gvt.c
index c27c683..e40af70 100644
--- a/drivers/gpu/drm/i915/gvt/gvt.c
+++ b/drivers/gpu/drm/i915/gvt/gvt.c
@@ -54,6 +54,7 @@ static const struct intel_gvt_ops intel_gvt_ops = {
.vgpu_reset = intel_gvt_reset_vgpu,
.vgpu_activate = intel_gvt_activate_vgpu,
.vgpu_deactivate = intel_gvt_deactivate_vgpu,
+   .vgpu_save_restore = intel_gvt_save_restore,
 };
 
 /**
diff --git a/drivers/gpu/drm/i915/gvt/gvt.h b/drivers/gpu/drm/i915/gvt/gvt.h
index 23eeb7c..12aa3b8 100644
--- a/drivers/gpu/drm/i915/gvt/gvt.h
+++ b/drivers/gpu/drm/i915/gvt/gvt.h
@@ -46,6 +46,7 @@
 #include "sched_policy.h"
 #include "render.h"
 #include "cmd_parser.h"
+#include "migrate.h"
 
 #define GVT_MAX_VGPU 8
 
@@ -431,6 +432,8 @@ void intel_gvt_reset_vgpu_locked(struct intel_vgpu *vgpu, 
bool dmlr,
 void intel_gvt_reset_vgpu(struct intel_vgpu *vgpu);
 void intel_gvt_activate_vgpu(struct intel_vgpu *vgpu);
 void intel_gvt_deactivate_vgpu(struct intel_vgpu *vgpu);
+int intel_gvt_save_restore(struct intel_vgpu *vgpu, char *buf,
+   size_t count, uint64_t off, bool restore);
 
 /* validating GM functions */
 #define vgpu_gmadr_is_aperture(vgpu, gmadr) \
@@ -513,6 +516,8 @@ struct intel_gvt_ops {
void (*vgpu_reset)(struct intel_vgpu *);
void (*vgpu_activate)(struct intel_vgpu *);
void (*vgpu_deactivate)(struct intel_vgpu *);
+   int  (*vgpu_save_restore)(struct intel_vgpu *, char *buf,
+ size_t count, uint64_t off, bool restore);
 };
 
 
diff --git a/drivers/gpu/drm/i915/gvt/kvmgt.c b/drivers/gpu/drm/i915/gvt/kvmgt.c
index e9f11a9..d4ede29 100644
--- a/drivers/gpu/drm/i915/gvt/kvmgt.c
+++ b/drivers/gpu/drm/i915/gvt/kvmgt.c
@@ -670,6 +670,9 @@ static ssize_t intel_vgpu_rw(struct mdev_device *mdev, char 
*buf,
bar0_start + pos, buf, count);
}
break;
+   case VFIO_PCI_DEVICE_STATE_REGION_INDEX:
+   ret = intel_gvt_ops->vgpu_save_restore(vgpu, buf, count, pos, 
is_write);
+   break;
case VFIO_PCI_BAR2_REGION_INDEX:
case VFIO_PCI_BAR3_REGION_INDEX:
case VFIO_PCI_BAR4_REGION_INDEX:
@@ -688,6 +691,10 @@ static ssize_t intel_vgpu_read(struct mdev_device *mdev, 
char __user *buf,
 {
unsigned int done = 0;
int ret;
+   unsigned int index = VFIO_PCI_OFFSET_TO_INDEX(*ppos);
+
+   if (index == VFIO_PCI_DEVICE_STATE_REGION_INDEX)
+   return intel_vgpu_rw(mdev, (char *)buf, count, ppos, false);
 
while (count) {
size_t filled;
@@ -748,6 +755,10 @@ static ssize_t intel_vgpu_write(struct mdev_device *mdev,
 {
unsigned int done = 0;
int ret;
+   unsigned int index = VFIO_PCI_OFFSET_TO_INDEX(*ppos);
+
+   if (index == VFIO_PCI_DEVICE_STATE_REGION_INDEX)
+   return intel_vgpu_rw(mdev, (char *)buf, count, ppos, true);
 
while (count) {
size_t filled;
@@ -1037,6 +1048,14 @@ static long intel_vgpu_ioctl(struct mdev_device *mdev, 
unsigned int cmd,
case VFIO_PCI_VGA_REGION_INDEX:
gvt_dbg_core("get region info index:%d\n", info.

[Qemu-devel] [Intel-gfx][RFC 3/9] drm/i915/gvt: Adjust the gma parameter in gpu commands during command parser

2017-06-26 Thread Yulei Zhang
Adjust the gma parameter in gpu commands according to the shift
offset in the guests' aperture and hidden gm address, and patch
the commands before submitting them for execution.

Signed-off-by: Yulei Zhang <yulei.zh...@intel.com>
---
 drivers/gpu/drm/i915/gvt/cmd_parser.c | 26 ++
 1 file changed, 18 insertions(+), 8 deletions(-)

diff --git a/drivers/gpu/drm/i915/gvt/cmd_parser.c 
b/drivers/gpu/drm/i915/gvt/cmd_parser.c
index 51241de5..540ee42 100644
--- a/drivers/gpu/drm/i915/gvt/cmd_parser.c
+++ b/drivers/gpu/drm/i915/gvt/cmd_parser.c
@@ -922,7 +922,7 @@ static int cmd_handler_lrr(struct parser_exec_state *s)
 }
 
 static inline int cmd_address_audit(struct parser_exec_state *s,
-   unsigned long guest_gma, int op_size, bool index_mode);
+   unsigned long guest_gma, int op_size, bool index_mode, int 
offset);
 
 static int cmd_handler_lrm(struct parser_exec_state *s)
 {
@@ -942,7 +942,7 @@ static int cmd_handler_lrm(struct parser_exec_state *s)
gma = cmd_gma(s, i + 1);
if (gmadr_bytes == 8)
gma |= (cmd_gma_hi(s, i + 2)) << 32;
-   ret |= cmd_address_audit(s, gma, sizeof(u32), false);
+   ret |= cmd_address_audit(s, gma, sizeof(u32), false, i 
+ 1);
}
i += gmadr_dw_number(s) + 1;
}
@@ -962,7 +962,7 @@ static int cmd_handler_srm(struct parser_exec_state *s)
gma = cmd_gma(s, i + 1);
if (gmadr_bytes == 8)
gma |= (cmd_gma_hi(s, i + 2)) << 32;
-   ret |= cmd_address_audit(s, gma, sizeof(u32), false);
+   ret |= cmd_address_audit(s, gma, sizeof(u32), false, i 
+ 1);
}
i += gmadr_dw_number(s) + 1;
}
@@ -1032,7 +1032,7 @@ static int cmd_handler_pipe_control(struct 
parser_exec_state *s)
if (cmd_val(s, 1) & (1 << 21))
index_mode = true;
ret |= cmd_address_audit(s, gma, sizeof(u64),
-   index_mode);
+   index_mode, 2);
}
}
}
@@ -1364,10 +1364,12 @@ static unsigned long get_gma_bb_from_cmd(struct 
parser_exec_state *s, int index)
 }
 
 static inline int cmd_address_audit(struct parser_exec_state *s,
-   unsigned long guest_gma, int op_size, bool index_mode)
+   unsigned long guest_gma, int op_size, bool index_mode, int 
offset)
 {
struct intel_vgpu *vgpu = s->vgpu;
u32 max_surface_size = vgpu->gvt->device_info.max_surface_size;
+   int gmadr_bytes = vgpu->gvt->device_info.gmadr_bytes_in_cmd;
+   u64 host_gma;
int i;
int ret;
 
@@ -1387,6 +1389,14 @@ static inline int cmd_address_audit(struct 
parser_exec_state *s,
  guest_gma + op_size - 1))) {
ret = -EINVAL;
goto err;
+   } else
+   intel_gvt_ggtt_gmadr_g2h(vgpu, guest_gma, _gma);
+
+   if (offset > 0) {
+   patch_value(s, cmd_ptr(s, offset), host_gma & GENMASK(31, 2));
+   if (gmadr_bytes == 8)
+   patch_value(s, cmd_ptr(s, offset + 1),
+   (host_gma >> 32) & GENMASK(15, 0));
}
return 0;
 err:
@@ -1429,7 +1439,7 @@ static int cmd_handler_mi_store_data_imm(struct 
parser_exec_state *s)
gma = (gma_high << 32) | gma_low;
core_id = (cmd_val(s, 1) & (1 << 0)) ? 1 : 0;
}
-   ret = cmd_address_audit(s, gma + op_size * core_id, op_size, false);
+   ret = cmd_address_audit(s, gma + op_size * core_id, op_size, false, 1);
return ret;
 }
 
@@ -1473,7 +1483,7 @@ static int cmd_handler_mi_op_2f(struct parser_exec_state 
*s)
gma_high = cmd_val(s, 2) & GENMASK(15, 0);
gma = (gma_high << 32) | gma;
}
-   ret = cmd_address_audit(s, gma, op_size, false);
+   ret = cmd_address_audit(s, gma, op_size, false, 1);
return ret;
 }
 
@@ -1513,7 +1523,7 @@ static int cmd_handler_mi_flush_dw(struct 
parser_exec_state *s)
/* Store Data Index */
if (cmd_val(s, 0) & (1 << 21))
index_mode = true;
-   ret = cmd_address_audit(s, gma, sizeof(u64), index_mode);
+   ret = cmd_address_audit(s, (gma | (1 << 2)), sizeof(u64), 
index_mode, 1);
}
/* Check notify bit */
if ((cmd_val(s, 0) & (1 << 8)))
-- 
2.7.4




[Qemu-devel] [Intel-gfx][RFC 4/9] drm/i915/gvt: Retrieve the guest gm base address from PVINFO

2017-06-26 Thread Yulei Zhang
As the host gm base address will change after migration due to
resource re-allocation, retrieve the guest gm base address from
PVINFO to make sure the guest gm address doesn't change with it.

Signed-off-by: Yulei Zhang <yulei.zh...@intel.com>
---
 drivers/gpu/drm/i915/gvt/cfg_space.c |  3 ++-
 drivers/gpu/drm/i915/gvt/gtt.c   |  8 
 drivers/gpu/drm/i915/gvt/gvt.h   | 22 ++
 3 files changed, 24 insertions(+), 9 deletions(-)

diff --git a/drivers/gpu/drm/i915/gvt/cfg_space.c 
b/drivers/gpu/drm/i915/gvt/cfg_space.c
index 40af17e..b57ae44 100644
--- a/drivers/gpu/drm/i915/gvt/cfg_space.c
+++ b/drivers/gpu/drm/i915/gvt/cfg_space.c
@@ -33,6 +33,7 @@
 
 #include "i915_drv.h"
 #include "gvt.h"
+#include "i915_pvinfo.h"
 
 enum {
INTEL_GVT_PCI_BAR_GTTMMIO = 0,
@@ -123,7 +124,7 @@ static int map_aperture(struct intel_vgpu *vgpu, bool map)
else
val = *(u32 *)(vgpu_cfg_space(vgpu) + PCI_BASE_ADDRESS_2);
 
-   first_gfn = (val + vgpu_aperture_offset(vgpu)) >> PAGE_SHIFT;
+   first_gfn = (val + vgpu_guest_aperture_offset(vgpu)) >> PAGE_SHIFT;
first_mfn = vgpu_aperture_pa_base(vgpu) >> PAGE_SHIFT;
 
ret = intel_gvt_hypervisor_map_gfn_to_mfn(vgpu, first_gfn,
diff --git a/drivers/gpu/drm/i915/gvt/gtt.c b/drivers/gpu/drm/i915/gvt/gtt.c
index df596a6..e9a127c 100644
--- a/drivers/gpu/drm/i915/gvt/gtt.c
+++ b/drivers/gpu/drm/i915/gvt/gtt.c
@@ -64,10 +64,10 @@ int intel_gvt_ggtt_gmadr_g2h(struct intel_vgpu *vgpu, u64 
g_addr, u64 *h_addr)
 
if (vgpu_gmadr_is_aperture(vgpu, g_addr))
*h_addr = vgpu_aperture_gmadr_base(vgpu)
- + (g_addr - vgpu_aperture_offset(vgpu));
+ + (g_addr - vgpu_guest_aperture_gmadr_base(vgpu));
else
*h_addr = vgpu_hidden_gmadr_base(vgpu)
- + (g_addr - vgpu_hidden_offset(vgpu));
+ + (g_addr - vgpu_guest_hidden_gmadr_base(vgpu));
return 0;
 }
 
@@ -79,10 +79,10 @@ int intel_gvt_ggtt_gmadr_h2g(struct intel_vgpu *vgpu, u64 
h_addr, u64 *g_addr)
return -EACCES;
 
if (gvt_gmadr_is_aperture(vgpu->gvt, h_addr))
-   *g_addr = vgpu_aperture_gmadr_base(vgpu)
+   *g_addr = vgpu_guest_aperture_gmadr_base(vgpu)
+ (h_addr - gvt_aperture_gmadr_base(vgpu->gvt));
else
-   *g_addr = vgpu_hidden_gmadr_base(vgpu)
+   *g_addr = vgpu_guest_hidden_gmadr_base(vgpu)
+ (h_addr - gvt_hidden_gmadr_base(vgpu->gvt));
return 0;
 }
diff --git a/drivers/gpu/drm/i915/gvt/gvt.h b/drivers/gpu/drm/i915/gvt/gvt.h
index 71c00b2..23eeb7c 100644
--- a/drivers/gpu/drm/i915/gvt/gvt.h
+++ b/drivers/gpu/drm/i915/gvt/gvt.h
@@ -343,6 +343,20 @@ int intel_gvt_load_firmware(struct intel_gvt *gvt);
 #define vgpu_fence_base(vgpu) (vgpu->fence.base)
 #define vgpu_fence_sz(vgpu) (vgpu->fence.size)
 
+/* Aperture/GM space definitions for vGPU Guest view point */
+#define vgpu_guest_aperture_offset(vgpu) \
+   vgpu_vreg(vgpu, vgtif_reg(avail_rs.mappable_gmadr.base))
+#define vgpu_guest_hidden_offset(vgpu) \
+   vgpu_vreg(vgpu, vgtif_reg(avail_rs.nonmappable_gmadr.base))
+
+#define vgpu_guest_aperture_gmadr_base(vgpu) (vgpu_guest_aperture_offset(vgpu))
+#define vgpu_guest_aperture_gmadr_end(vgpu) \
+   (vgpu_guest_aperture_gmadr_base(vgpu) + vgpu_aperture_sz(vgpu) - 1)
+
+#define vgpu_guest_hidden_gmadr_base(vgpu) (vgpu_guest_hidden_offset(vgpu))
+#define vgpu_guest_hidden_gmadr_end(vgpu) \
+   (vgpu_guest_hidden_gmadr_base(vgpu) + vgpu_hidden_sz(vgpu) - 1)
+
 struct intel_vgpu_creation_params {
__u64 handle;
__u64 low_gm_sz;  /* in MB */
@@ -420,12 +434,12 @@ void intel_gvt_deactivate_vgpu(struct intel_vgpu *vgpu);
 
 /* validating GM functions */
 #define vgpu_gmadr_is_aperture(vgpu, gmadr) \
-   ((gmadr >= vgpu_aperture_gmadr_base(vgpu)) && \
-(gmadr <= vgpu_aperture_gmadr_end(vgpu)))
+   ((gmadr >= vgpu_guest_aperture_gmadr_base(vgpu)) && \
+(gmadr <= vgpu_guest_aperture_gmadr_end(vgpu)))
 
 #define vgpu_gmadr_is_hidden(vgpu, gmadr) \
-   ((gmadr >= vgpu_hidden_gmadr_base(vgpu)) && \
-(gmadr <= vgpu_hidden_gmadr_end(vgpu)))
+   ((gmadr >= vgpu_guest_hidden_gmadr_base(vgpu)) && \
+(gmadr <= vgpu_guest_hidden_gmadr_end(vgpu)))
 
 #define vgpu_gmadr_is_valid(vgpu, gmadr) \
 ((vgpu_gmadr_is_aperture(vgpu, gmadr) || \
-- 
2.7.4




[Qemu-devel] [Intel-gfx][RFC 0/9] drm/i915/gvt: Add the live migration support to VFIO mdev device - Intel vGPU

2017-06-26 Thread Yulei Zhang
This RFC patch series gives a sample of how to enable live migration
on a vfio mdev device with the newly introduced vfio interface and vfio device
status region.

In order to fulfill the migration requirement we add the following
modifications to the mdev device driver.
1. Add the guest to host graphics address adjustment when the guest 
   tries to access gma through mmio or graphics commands, so after 
   migration the guest view of the graphics address will remain the same.
2. Add handlers for the new VFIO ioctls to control the device stop/start and
   fetch the dirty page bitmap from the device model.
3. Implement the function to save/restore the device context, which 
   is accessed through the new VFIO region VFIO_PCI_DEVICE_STATE_REGION_INDEX
   to transfer device status during the migration.

Yulei Zhang (9):
  drm/i915/gvt: Apply g2h adjust for GTT mmio access
  drm/i915/gvt: Apply g2h adjustment during fence mmio access
  drm/i915/gvt: Adjust the gma parameter in gpu commands during command
parser
  drm/i915/gvt: Retrieve the guest gm base address from PVINFO
  drm/i915/gvt: Align the guest gm aperture start offset for live
migration
  drm/i915/gvt: Introduce new flag to indicate migration capability
  drm/i915/gvt: Introduce new VFIO ioctl for device status control
  drm/i915/gvt: Introduce new VFIO ioctl for mdev device dirty page sync
  drm/i915/gvt: Add support to VFIO region
VFIO_PCI_DEVICE_STATE_REGION_INDEX

 drivers/gpu/drm/i915/gvt/Makefile  |   2 +-
 drivers/gpu/drm/i915/gvt/aperture_gm.c |   6 +-
 drivers/gpu/drm/i915/gvt/cfg_space.c   |   3 +-
 drivers/gpu/drm/i915/gvt/cmd_parser.c  |  26 +-
 drivers/gpu/drm/i915/gvt/gtt.c |  19 +-
 drivers/gpu/drm/i915/gvt/gvt.c |   1 +
 drivers/gpu/drm/i915/gvt/gvt.h |  41 +-
 drivers/gpu/drm/i915/gvt/kvmgt.c   |  65 ++-
 drivers/gpu/drm/i915/gvt/migrate.c | 715 +
 drivers/gpu/drm/i915/gvt/migrate.h |  82 
 drivers/gpu/drm/i915/gvt/mmio.c|  14 +
 drivers/gpu/drm/i915/gvt/mmio.h|   1 +
 drivers/gpu/drm/i915/gvt/vgpu.c|   8 +-
 include/uapi/linux/vfio.h  |  33 +-
 14 files changed, 984 insertions(+), 32 deletions(-)
 create mode 100644 drivers/gpu/drm/i915/gvt/migrate.c
 create mode 100644 drivers/gpu/drm/i915/gvt/migrate.h

-- 
2.7.4




[Qemu-devel] [Intel-gfx][RFC 6/9] drm/i915/gvt: Introduce new flag to indicate migration capability

2017-06-26 Thread Yulei Zhang
New device flag VFIO_DEVICE_FLAGS_MIGRATABLE is added for vfio mdev
device vGPU to claim the capability for live migration.

Signed-off-by: Yulei Zhang <yulei.zh...@intel.com>
---
 drivers/gpu/drm/i915/gvt/kvmgt.c | 1 +
 include/uapi/linux/vfio.h| 1 +
 2 files changed, 2 insertions(+)

diff --git a/drivers/gpu/drm/i915/gvt/kvmgt.c b/drivers/gpu/drm/i915/gvt/kvmgt.c
index d2b13ae..c44b319 100644
--- a/drivers/gpu/drm/i915/gvt/kvmgt.c
+++ b/drivers/gpu/drm/i915/gvt/kvmgt.c
@@ -940,6 +940,7 @@ static long intel_vgpu_ioctl(struct mdev_device *mdev, 
unsigned int cmd,
 
info.flags = VFIO_DEVICE_FLAGS_PCI;
info.flags |= VFIO_DEVICE_FLAGS_RESET;
+   info.flags |= VFIO_DEVICE_FLAGS_MIGRATABLE;
info.num_regions = VFIO_PCI_NUM_REGIONS;
info.num_irqs = VFIO_PCI_NUM_IRQS;
 
diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h
index ae46105..9ad9ce1 100644
--- a/include/uapi/linux/vfio.h
+++ b/include/uapi/linux/vfio.h
@@ -199,6 +199,7 @@ struct vfio_device_info {
 #define VFIO_DEVICE_FLAGS_PLATFORM (1 << 2)/* vfio-platform device */
 #define VFIO_DEVICE_FLAGS_AMBA  (1 << 3)   /* vfio-amba device */
 #define VFIO_DEVICE_FLAGS_CCW  (1 << 4)/* vfio-ccw device */
+#define VFIO_DEVICE_FLAGS_MIGRATABLE (1 << 5)   /* Device supports migrate */
__u32   num_regions;/* Max region index + 1 */
__u32   num_irqs;   /* Max IRQ index + 1 */
 };
-- 
2.7.4




[Qemu-devel] [Intel-gfx][RFC 1/9] drm/i915/gvt: Apply g2h adjust for GTT mmio access

2017-06-26 Thread Yulei Zhang
Apply guest to host gma conversion while the guest tries to access the
GTT mmio registers; after live migration is enabled the host gma
may change due to resource re-allocation, but the guest
gma should remain unchanged, thus g2h conversion is required
for it.

Signed-off-by: Yulei Zhang <yulei.zh...@intel.com>
---
 drivers/gpu/drm/i915/gvt/gtt.c | 11 ---
 1 file changed, 4 insertions(+), 7 deletions(-)

diff --git a/drivers/gpu/drm/i915/gvt/gtt.c b/drivers/gpu/drm/i915/gvt/gtt.c
index 66374db..df596a6 100644
--- a/drivers/gpu/drm/i915/gvt/gtt.c
+++ b/drivers/gpu/drm/i915/gvt/gtt.c
@@ -59,8 +59,7 @@ bool intel_gvt_ggtt_validate_range(struct intel_vgpu *vgpu, 
u64 addr, u32 size)
 /* translate a guest gmadr to host gmadr */
 int intel_gvt_ggtt_gmadr_g2h(struct intel_vgpu *vgpu, u64 g_addr, u64 *h_addr)
 {
-   if (WARN(!vgpu_gmadr_is_valid(vgpu, g_addr),
-"invalid guest gmadr %llx\n", g_addr))
+   if (!vgpu_gmadr_is_valid(vgpu, g_addr))
return -EACCES;
 
if (vgpu_gmadr_is_aperture(vgpu, g_addr))
@@ -1819,17 +1818,15 @@ static int emulate_gtt_mmio_write(struct intel_vgpu 
*vgpu, unsigned int off,
struct intel_vgpu_mm *ggtt_mm = vgpu->gtt.ggtt_mm;
struct intel_gvt_gtt_pte_ops *ops = gvt->gtt.pte_ops;
unsigned long g_gtt_index = off >> info->gtt_entry_size_shift;
-   unsigned long gma;
+   unsigned long h_gtt_index;
struct intel_gvt_gtt_entry e, m;
int ret;
 
if (bytes != 4 && bytes != 8)
return -EINVAL;
 
-   gma = g_gtt_index << GTT_PAGE_SHIFT;
-
/* the VM may configure the whole GM space when ballooning is used */
-   if (!vgpu_gmadr_is_valid(vgpu, gma))
+   if (intel_gvt_ggtt_index_g2h(vgpu, g_gtt_index, _gtt_index))
return 0;
 
ggtt_get_guest_entry(ggtt_mm, , g_gtt_index);
@@ -1852,7 +1849,7 @@ static int emulate_gtt_mmio_write(struct intel_vgpu 
*vgpu, unsigned int off,
ops->set_pfn(, gvt->gtt.scratch_ggtt_mfn);
}
 
-   ggtt_set_shadow_entry(ggtt_mm, , g_gtt_index);
+   ggtt_set_shadow_entry(ggtt_mm, , h_gtt_index);
gtt_invalidate(gvt->dev_priv);
ggtt_set_guest_entry(ggtt_mm, , g_gtt_index);
return 0;
-- 
2.7.4




[Qemu-devel] [Intel-gfx][RFC 8/9] drm/i915/gvt: Introduce new VFIO ioctl for mdev device dirty page sync

2017-06-26 Thread Yulei Zhang
Add new vfio ioctl VFIO_DEVICE_PCI_GET_DIRTY_BITMAP to fetch the dirty
page bitmap from mdev device driver for data sync during live migration.

Signed-off-by: Yulei Zhang <yulei.zh...@intel.com>
---
 drivers/gpu/drm/i915/gvt/kvmgt.c | 33 +
 include/uapi/linux/vfio.h| 14 ++
 2 files changed, 47 insertions(+)

diff --git a/drivers/gpu/drm/i915/gvt/kvmgt.c b/drivers/gpu/drm/i915/gvt/kvmgt.c
index ac327f7..e9f11a9 100644
--- a/drivers/gpu/drm/i915/gvt/kvmgt.c
+++ b/drivers/gpu/drm/i915/gvt/kvmgt.c
@@ -919,6 +919,24 @@ static int intel_vgpu_set_irqs(struct intel_vgpu *vgpu, 
uint32_t flags,
return func(vgpu, index, start, count, flags, data);
 }
 
+static void intel_vgpu_update_dirty_bitmap(struct intel_vgpu *vgpu, u64 
start_addr,
+   u64 page_nr, void *bitmap)
+{
+   u64 gfn = start_addr >> GTT_PAGE_SHIFT;
+   struct intel_vgpu_guest_page *p;
+   int i;
+
+   for (i = 0; i < page_nr; i++) {
+   hash_for_each_possible(vgpu->gtt.guest_page_hash_table,
+  p, node, gfn) {
+   if (p->gfn == gfn)
+   set_bit(i, bitmap);
+   }
+   gfn++;
+   }
+
+}
+
 static long intel_vgpu_ioctl(struct mdev_device *mdev, unsigned int cmd,
 unsigned long arg)
 {
@@ -1156,6 +1174,21 @@ static long intel_vgpu_ioctl(struct mdev_device *mdev, 
unsigned int cmd,
intel_gvt_ops->vgpu_deactivate(vgpu);
else
intel_gvt_ops->vgpu_activate(vgpu);
+   } else if (cmd == VFIO_DEVICE_PCI_GET_DIRTY_BITMAP) {
+   struct vfio_pci_get_dirty_bitmap d;
+   unsigned long bitmap_sz;
+   unsigned *bitmap;
+   minsz = offsetofend(struct vfio_pci_get_dirty_bitmap, page_nr);
+   if (copy_from_user(, (void __user *)arg, minsz))
+   return -EFAULT;
+   bitmap_sz = (BITS_TO_LONGS(d.page_nr) + 1) * sizeof(unsigned 
long);
+   bitmap = vzalloc(bitmap_sz);
+   intel_vgpu_update_dirty_bitmap(vgpu, d.start_addr, d.page_nr, 
bitmap);
+   if (copy_to_user((void __user*)arg + minsz, bitmap, bitmap_sz)) 
{
+   vfree(bitmap);
+   return -EFAULT;
+   }
+   vfree(bitmap);
}
 
return 0;
diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h
index 4bb057d..544cf93 100644
--- a/include/uapi/linux/vfio.h
+++ b/include/uapi/linux/vfio.h
@@ -518,6 +518,20 @@ struct vfio_pci_status_set{
 
 #define VFIO_DEVICE_PCI_STATUS_SET _IO(VFIO_TYPE, VFIO_BASE + 14)
 
+/**
+ * VFIO_DEVICE_PCI_GET_DIRTY_BITMAP - _IOW(VFIO_TYPE, VFIO_BASE + 15,
+ * struct vfio_pci_get_dirty_bitmap)
+ *
+ * Return: 0 on success, -errno on failure.
+ */
+struct vfio_pci_get_dirty_bitmap{
+   __u64  start_addr;
+   __u64  page_nr;
+   __u8   dirty_bitmap[];
+};
+
+#define VFIO_DEVICE_PCI_GET_DIRTY_BITMAP _IO(VFIO_TYPE, VFIO_BASE + 15)
+
 /*  API for Type1 VFIO IOMMU  */
 
 /**
-- 
2.7.4




[Qemu-devel] [Intel-gfx][RFC 2/9] drm/i915/gvt: Apply g2h adjustment during fence mmio access

2017-06-26 Thread Yulei Zhang
Apply the guest to host gma conversion while the guest configures the
fence mmio registers, since the host gma may change after the migration.

Signed-off-by: Yulei Zhang <yulei.zh...@intel.com>
---
 drivers/gpu/drm/i915/gvt/aperture_gm.c |  6 --
 drivers/gpu/drm/i915/gvt/gvt.h | 14 ++
 2 files changed, 18 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/i915/gvt/aperture_gm.c 
b/drivers/gpu/drm/i915/gvt/aperture_gm.c
index ca3d192..cd68ec6 100644
--- a/drivers/gpu/drm/i915/gvt/aperture_gm.c
+++ b/drivers/gpu/drm/i915/gvt/aperture_gm.c
@@ -144,8 +144,10 @@ void intel_vgpu_write_fence(struct intel_vgpu *vgpu,
I915_WRITE(fence_reg_lo, 0);
POSTING_READ(fence_reg_lo);
 
-   I915_WRITE(fence_reg_hi, upper_32_bits(value));
-   I915_WRITE(fence_reg_lo, lower_32_bits(value));
+   I915_WRITE(fence_reg_hi,
+   intel_gvt_reg_g2h(vgpu, upper_32_bits(value), 
0xF000));
+   I915_WRITE(fence_reg_lo,
+   intel_gvt_reg_g2h(vgpu, lower_32_bits(value), 
0xF000));
POSTING_READ(fence_reg_lo);
 }
 
diff --git a/drivers/gpu/drm/i915/gvt/gvt.h b/drivers/gpu/drm/i915/gvt/gvt.h
index 3a74e79..71c00b2 100644
--- a/drivers/gpu/drm/i915/gvt/gvt.h
+++ b/drivers/gpu/drm/i915/gvt/gvt.h
@@ -451,6 +451,20 @@ int intel_gvt_ggtt_index_g2h(struct intel_vgpu *vgpu, 
unsigned long g_index,
 int intel_gvt_ggtt_h2g_index(struct intel_vgpu *vgpu, unsigned long h_index,
 unsigned long *g_index);
 
+/* apply guest to host gma convertion in GM registers setting */
+static inline u64 intel_gvt_reg_g2h(struct intel_vgpu *vgpu,
+   u32 addr, u32 mask)
+{
+   u64 gma;
+
+   if (addr) {
+   intel_gvt_ggtt_gmadr_g2h(vgpu,
+   addr & mask, );
+   addr = gma | (addr & (~mask));
+   }
+   return addr;
+}
+
 void intel_vgpu_init_cfg_space(struct intel_vgpu *vgpu,
bool primary);
 void intel_vgpu_reset_cfg_space(struct intel_vgpu *vgpu);
-- 
2.7.4




[Qemu-devel] [RFC 5/5] vfio: introduce new VFIO ioctl VFIO_DEVICE_PCI_GET_DIRTY_BITMAP

2017-06-26 Thread Yulei Zhang
New VFIO ioctl VFIO_DEVICE_PCI_GET_DIRTY_BITMAP is used to sync the
pci device dirty pages during the migration.

Signed-off-by: Yulei Zhang <yulei.zh...@intel.com>
---
 hw/vfio/pci.c  | 32 
 hw/vfio/pci.h  |  2 ++
 linux-headers/linux/vfio.h | 14 ++
 3 files changed, 48 insertions(+)

diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
index 833cd90..64c851f 100644
--- a/hw/vfio/pci.c
+++ b/hw/vfio/pci.c
@@ -32,6 +32,7 @@
 #include "pci.h"
 #include "trace.h"
 #include "qapi/error.h"
+#include "exec/ram_addr.h"
 
 #define MSIX_CAP_LENGTH 12
 
@@ -39,6 +40,7 @@ static void vfio_disable_interrupts(VFIOPCIDevice *vdev);
 static void vfio_mmap_set_enabled(VFIOPCIDevice *vdev, bool enabled);
 static VMStateDescription vfio_pci_vmstate;
 static void vfio_vm_change_state_handler(void *pv, int running, RunState 
state);
+static void vfio_log_sync(MemoryListener *listener, MemoryRegionSection 
*section);
 
 /*
  * Disabling BAR mmaping can be slow, but toggling it around INTx can
@@ -2869,6 +2871,11 @@ static void vfio_realize(PCIDevice *pdev, Error **errp)
 vfio_setup_resetfn_quirk(vdev);
 qemu_add_vm_change_state_handler(vfio_vm_change_state_handler, vdev);
 
+vdev->vfio_memory_listener = (MemoryListener) {
+   .log_sync = vfio_log_sync,
+};
+memory_listener_register(>vfio_memory_listener, 
_space_memory);
+
 return;
 
 out_teardown:
@@ -2964,6 +2971,7 @@ static void vfio_vm_change_state_handler(void *pv, int 
running, RunState state)
 if (ioctl(vdev->vbasedev.fd, VFIO_DEVICE_PCI_STATUS_SET, vfio_status)) {
 error_report("vfio: Failed to %s device\n", running ? "start" : 
"stop");
 }
+vdev->device_stop = running ? false : true;
 g_free(vfio_status);
 }
 
@@ -3079,6 +3087,30 @@ static int vfio_device_get(QEMUFile *f, void *pv, size_t 
size, VMStateField *fie
 return 0;
 }
 
+static void vfio_log_sync(MemoryListener *listener, MemoryRegionSection 
*section)
+{
+VFIOPCIDevice *vdev = container_of(listener, struct VFIOPCIDevice, 
vfio_memory_listener);
+
+if (vdev->device_stop) {
+struct vfio_pci_get_dirty_bitmap *d;
+ram_addr_t size = int128_get64(section->size);
+unsigned long page_nr = size >> TARGET_PAGE_BITS;
+unsigned long bitmap_size = (BITS_TO_LONGS(page_nr) + 1) * 
sizeof(unsigned long);
+d = g_malloc0(sizeof(*d) + bitmap_size);
+d->start_addr = section->offset_within_address_space;
+d->page_nr = page_nr;
+
+if (ioctl(vdev->vbasedev.fd, VFIO_DEVICE_PCI_GET_DIRTY_BITMAP, d)) {
+error_report("vfio: Failed to fetch dirty pages for migration\n");
+goto exit;
+}
+cpu_physical_memory_set_dirty_lebitmap((unsigned 
long*)>dirty_bitmap, d->start_addr, d->page_nr);
+
+exit:
+g_free(d);
+}
+}
+
 static void vfio_instance_init(Object *obj)
 {
 PCIDevice *pci_dev = PCI_DEVICE(obj);
diff --git a/hw/vfio/pci.h b/hw/vfio/pci.h
index bd98618..984391d 100644
--- a/hw/vfio/pci.h
+++ b/hw/vfio/pci.h
@@ -144,6 +144,8 @@ typedef struct VFIOPCIDevice {
 bool no_kvm_intx;
 bool no_kvm_msi;
 bool no_kvm_msix;
+bool device_stop;
+MemoryListener vfio_memory_listener;
 } VFIOPCIDevice;
 
 uint32_t vfio_pci_read_config(PCIDevice *pdev, uint32_t addr, int len);
diff --git a/linux-headers/linux/vfio.h b/linux-headers/linux/vfio.h
index fa17848..aa73ee1 100644
--- a/linux-headers/linux/vfio.h
+++ b/linux-headers/linux/vfio.h
@@ -502,6 +502,20 @@ struct vfio_pci_status_set{
 
 #define VFIO_DEVICE_PCI_STATUS_SET _IO(VFIO_TYPE, VFIO_BASE + 14)
 
+/**
+ * VFIO_DEVICE_PCI_GET_DIRTY_BITMAP - _IOW(VFIO_TYPE, VFIO_BASE + 15,
+ * struct vfio_pci_get_dirty_bitmap)
+ *
+ * Return: 0 on success, -errno on failure.
+ */
+struct vfio_pci_get_dirty_bitmap{
+   __u64  start_addr;
+   __u64  page_nr;
+   __u8   dirty_bitmap[];
+};
+
+#define VFIO_DEVICE_PCI_GET_DIRTY_BITMAP _IO(VFIO_TYPE, VFIO_BASE + 15)
+
 /*  API for Type1 VFIO IOMMU  */
 
 /**
-- 
2.7.4




[Qemu-devel] [RFC 4/5] vfio: use vfio_device_put/vfio_device_get for device status save/restore

2017-06-26 Thread Yulei Zhang
For VFIO pci device status migration, on the source side use
function vfio_device_put to save the following states
1. pci configuration space addr0~addr5
2. pci configuration space msi_addr msi_data
3. pci device status fetched from the device driver

And on the target side use function vfio_device_get to restore
the same states
1. re-setup the pci bar configuration
2. re-setup the pci device msi configuration
3. restore the pci device status

Signed-off-by: Yulei Zhang <yulei.zh...@intel.com>
---
 hw/vfio/pci.c | 105 +-
 1 file changed, 104 insertions(+), 1 deletion(-)

diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
index 605a473..833cd90 100644
--- a/hw/vfio/pci.c
+++ b/hw/vfio/pci.c
@@ -2961,18 +2961,121 @@ static void vfio_vm_change_state_handler(void *pv, int 
running, RunState state)
 vfio_status->flags = running ? VFIO_DEVICE_PCI_START :
  VFIO_DEVICE_PCI_STOP;
 
-ioctl(vdev->vbasedev.fd, VFIO_DEVICE_PCI_STATUS_SET, vfio_status);
+if (ioctl(vdev->vbasedev.fd, VFIO_DEVICE_PCI_STATUS_SET, vfio_status)) {
+error_report("vfio: Failed to %s device\n", running ? "start" : 
"stop");
+}
 g_free(vfio_status);
 }
 
 static int vfio_device_put(QEMUFile *f, void *pv, size_t size, VMStateField 
*field,
 QJSON *vmdesc)
 {
+VFIOPCIDevice *vdev = pv;
+PCIDevice *pdev = >pdev;
+VFIORegion *region = >device_state.region;
+int sz = region->size;
+uint8_t *buf = NULL;
+uint32_t msi_cfg, msi_lo, msi_hi, msi_data, bar_cfg, i;
+bool msi_64bit;
+
+for (i = 0; i < PCI_ROM_SLOT; i++) {
+bar_cfg = pci_default_read_config(pdev, PCI_BASE_ADDRESS_0 + i*4, 4);
+qemu_put_be32(f, bar_cfg);
+}
+
+msi_cfg = pci_default_read_config(pdev, pdev->msi_cap + PCI_MSI_FLAGS, 2);
+msi_64bit = !!(msi_cfg & PCI_MSI_FLAGS_64BIT);
+
+msi_lo = pci_default_read_config(pdev, pdev->msi_cap + PCI_MSI_ADDRESS_LO, 
4);
+qemu_put_be32(f, msi_lo);
+
+if (msi_64bit) {
+msi_hi = pci_default_read_config(pdev, pdev->msi_cap + 
PCI_MSI_ADDRESS_HI, 4);
+qemu_put_be32(f, msi_hi);
+}
+
+msi_data = pci_default_read_config(pdev,
+ pdev->msi_cap + (msi_64bit ? PCI_MSI_DATA_64 : 
PCI_MSI_DATA_32), 2);
+qemu_put_be32(f, msi_data);
+
+buf = g_malloc(sz);
+if (buf == NULL) {
+error_report("vfio: Failed to allocate memory for migrate\n");
+goto exit;
+}
+
+if (pread(vdev->vbasedev.fd, buf, sz, region->fd_offset) != sz) {
+error_report("vfio: Failed to read Device State Region\n");
+goto exit;
+}
+
+qemu_put_buffer(f, buf, sz);
+
+exit:
+if (buf)
+g_free(buf);
+
 return 0;
 }
 
 static int vfio_device_get(QEMUFile *f, void *pv, size_t size, VMStateField 
*field)
 {
+VFIOPCIDevice *vdev = pv;
+PCIDevice *pdev = >pdev;
+VFIORegion *region = >device_state.region;
+int sz = region->size;
+uint8_t *buf = NULL;
+uint32_t ctl, msi_lo, msi_hi, msi_data, bar_cfg, i;
+bool msi_64bit;
+
+/* retore pci bar configuration */
+ctl = pci_default_read_config(pdev, PCI_COMMAND, 2);
+vfio_pci_write_config(pdev, PCI_COMMAND,
+  ctl & (!(PCI_COMMAND_IO | PCI_COMMAND_MEMORY)), 2);
+for (i = 0; i < PCI_ROM_SLOT; i++) {
+bar_cfg = qemu_get_be32(f);
+vfio_pci_write_config(pdev, PCI_BASE_ADDRESS_0 + i*4, bar_cfg, 4);
+}
+vfio_pci_write_config(pdev, PCI_COMMAND,
+  ctl | PCI_COMMAND_IO | PCI_COMMAND_MEMORY, 2);
+
+/* restore msi configuration */
+ctl = pci_default_read_config(pdev, pdev->msi_cap + PCI_MSI_FLAGS, 2);
+msi_64bit = !!(ctl & PCI_MSI_FLAGS_64BIT);
+
+vfio_pci_write_config(>pdev,
+  pdev->msi_cap + PCI_MSI_FLAGS,
+  ctl & (!PCI_MSI_FLAGS_ENABLE), 2);
+
+msi_lo = qemu_get_be32(f);
+vfio_pci_write_config(pdev, pdev->msi_cap + PCI_MSI_ADDRESS_LO, msi_lo, 4);
+
+if (msi_64bit) {
+msi_hi = qemu_get_be32(f);
+vfio_pci_write_config(pdev, pdev->msi_cap + PCI_MSI_ADDRESS_HI, 
msi_hi, 4);
+}
+msi_data = qemu_get_be32(f);
+vfio_pci_write_config(pdev,
+  pdev->msi_cap + (msi_64bit ? PCI_MSI_DATA_64 : 
PCI_MSI_DATA_32),
+  msi_data, 2);
+
+vfio_pci_write_config(>pdev, pdev->msi_cap + PCI_MSI_FLAGS,
+  ctl | PCI_MSI_FLAGS_ENABLE, 2);
+
+buf = g_malloc(sz);
+if (buf == NULL) {
+error_report("vfio: Failed to allocate memory for migrate\n");
+return -1;
+}
+
+qemu_get_buffer(f, buf, sz);
+if (pwrite(vdev->vbasedev.fd, buf, sz, region->fd_offset) != sz) {
+error_report("vfio: Failed to write Device State Region\n");
+return -1;
+}
+
+if (buf)
+   g_free(buf);
 return 0;
 }
 
-- 
2.7.4




[Qemu-devel] [RFC 3/5] vfio: introduce new VFIO ioctl VFIO_DEVICE_PCI_STATUS_SET

2017-06-26 Thread Yulei Zhang
New VFIO ioctl VFIO_DEVICE_PCI_STATUS_SET is added to change the
vfio pci device status during the migration: stop the device on
the source side before fetching its status, and start the device on
the target side after restoring its status.

Signed-off-by: Yulei Zhang <yulei.zh...@intel.com>
---
 hw/vfio/pci.c  | 17 +
 linux-headers/linux/vfio.h | 15 +++
 2 files changed, 32 insertions(+)

diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
index 7de4eb4..605a473 100644
--- a/hw/vfio/pci.c
+++ b/hw/vfio/pci.c
@@ -38,6 +38,7 @@
 static void vfio_disable_interrupts(VFIOPCIDevice *vdev);
 static void vfio_mmap_set_enabled(VFIOPCIDevice *vdev, bool enabled);
 static VMStateDescription vfio_pci_vmstate;
+static void vfio_vm_change_state_handler(void *pv, int running, RunState 
state);
 
 /*
  * Disabling BAR mmaping can be slow, but toggling it around INTx can
@@ -2866,6 +2867,7 @@ static void vfio_realize(PCIDevice *pdev, Error **errp)
 vfio_register_err_notifier(vdev);
 vfio_register_req_notifier(vdev);
 vfio_setup_resetfn_quirk(vdev);
+qemu_add_vm_change_state_handler(vfio_vm_change_state_handler, vdev);
 
 return;
 
@@ -2948,6 +2950,21 @@ post_reset:
 vfio_pci_post_reset(vdev);
 }
 
+static void vfio_vm_change_state_handler(void *pv, int running, RunState state)
+{
+VFIOPCIDevice *vdev = pv;
+struct vfio_pci_status_set *vfio_status;
+int argsz = sizeof(*vfio_status);
+
+vfio_status = g_malloc0(argsz);
+vfio_status->argsz = argsz;
+vfio_status->flags = running ? VFIO_DEVICE_PCI_START :
+ VFIO_DEVICE_PCI_STOP;
+
+ioctl(vdev->vbasedev.fd, VFIO_DEVICE_PCI_STATUS_SET, vfio_status);
+g_free(vfio_status);
+}
+
 static int vfio_device_put(QEMUFile *f, void *pv, size_t size, VMStateField 
*field,
 QJSON *vmdesc)
 {
diff --git a/linux-headers/linux/vfio.h b/linux-headers/linux/vfio.h
index c87d05c..fa17848 100644
--- a/linux-headers/linux/vfio.h
+++ b/linux-headers/linux/vfio.h
@@ -487,6 +487,21 @@ struct vfio_pci_hot_reset {
 
 #define VFIO_DEVICE_PCI_HOT_RESET  _IO(VFIO_TYPE, VFIO_BASE + 13)
 
+/**
+ * VFIO_DEVICE_PCI_STATUS_SET - _IOW(VFIO_TYPE, VFIO_BASE + 14,
+ * struct vfio_pci_status_set)
+ *
+ * Return: 0 on success, -errno on failure.
+ */
+struct vfio_pci_status_set{
+   __u32   argsz;
+   __u32   flags;
+#define VFIO_DEVICE_PCI_STOP  (1 << 0)
+#define VFIO_DEVICE_PCI_START (1 << 1)
+};
+
+#define VFIO_DEVICE_PCI_STATUS_SET _IO(VFIO_TYPE, VFIO_BASE + 14)
+
 /*  API for Type1 VFIO IOMMU  */
 
 /**
-- 
2.7.4




[Qemu-devel] [RFC 2/5] vfio: Add struct vfio_vmstate_info to introduce vfio device put/get funtion

2017-06-26 Thread Yulei Zhang
Introduce the vfio_device_put/vfio_device_get functions for vfio device state
save/restore usage. The vfio device unmigratable flag will be set to false
during initialization if the device flag VFIO_DEVICE_FLAGS_MIGRATABLE is set.

Signed-off-by: Yulei Zhang <yulei.zh...@intel.com>
---
 hw/vfio/pci.c | 35 ++-
 1 file changed, 34 insertions(+), 1 deletion(-)

diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
index bf2e0ff..7de4eb4 100644
--- a/hw/vfio/pci.c
+++ b/hw/vfio/pci.c
@@ -37,6 +37,7 @@
 
 static void vfio_disable_interrupts(VFIOPCIDevice *vdev);
 static void vfio_mmap_set_enabled(VFIOPCIDevice *vdev, bool enabled);
+static VMStateDescription vfio_pci_vmstate;
 
 /*
  * Disabling BAR mmaping can be slow, but toggling it around INTx can
@@ -2375,6 +2376,7 @@ static void vfio_populate_device(VFIOPCIDevice *vdev, 
Error **errp)
 }
 
 QLIST_INIT(>device_state.quirks);
+vfio_pci_vmstate.unmigratable = 0;
 }
 
 ret = vfio_get_region_info(vbasedev,
@@ -2946,6 +2948,17 @@ post_reset:
 vfio_pci_post_reset(vdev);
 }
 
+static int vfio_device_put(QEMUFile *f, void *pv, size_t size, VMStateField 
*field,
+QJSON *vmdesc)
+{
+return 0;
+}
+
+static int vfio_device_get(QEMUFile *f, void *pv, size_t size, VMStateField 
*field)
+{
+return 0;
+}
+
 static void vfio_instance_init(Object *obj)
 {
 PCIDevice *pci_dev = PCI_DEVICE(obj);
@@ -2990,9 +3003,29 @@ static Property vfio_pci_dev_properties[] = {
 DEFINE_PROP_END_OF_LIST(),
 };
 
-static const VMStateDescription vfio_pci_vmstate = {
+static VMStateInfo vfio_vmstate_info = {
+.name = "vfio-state",
+.get = vfio_device_get,
+.put = vfio_device_put,
+};
+
+static VMStateDescription vfio_pci_vmstate = {
 .name = "vfio-pci",
 .unmigratable = 1,
+.version_id = 1,
+.minimum_version_id = 1,
+.fields = (VMStateField[]) {
+{
+.name = "vfio dev",
+.version_id   = 0,
+.field_exists = NULL,
+.size = 0,
+.info = _vmstate_info,
+.flags= VMS_SINGLE,
+.offset   = 0,
+ },
+VMSTATE_END_OF_LIST()
+},
 };
 
 static void vfio_pci_dev_class_init(ObjectClass *klass, void *data)
-- 
2.7.4




[Qemu-devel] [RFC 1/5] vfio: introduce a new VFIO region for migration support

2017-06-26 Thread Yulei Zhang
New VFIO region VFIO_PCI_DEVICE_STATE_REGION_INDEX is added to fetch
and restore the pci device status during the live migration.

Signed-off-by: Yulei Zhang <yulei.zh...@intel.com>
---
 hw/vfio/pci.c  | 17 +
 hw/vfio/pci.h  |  1 +
 linux-headers/linux/vfio.h |  5 -
 3 files changed, 22 insertions(+), 1 deletion(-)

diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
index 03a3d01..bf2e0ff 100644
--- a/hw/vfio/pci.c
+++ b/hw/vfio/pci.c
@@ -2360,6 +2360,23 @@ static void vfio_populate_device(VFIOPCIDevice *vdev, 
Error **errp)
 QLIST_INIT(>bars[i].quirks);
 }
 
+/* device state region setup */
+if (vbasedev->flags & VFIO_DEVICE_FLAGS_MIGRATABLE) {
+char *name = g_strdup_printf("%s BAR %d", vbasedev->name, 
VFIO_PCI_DEVICE_STATE_REGION_INDEX);
+
+ret = vfio_region_setup(OBJECT(vdev), vbasedev,
+>device_state.region, 
VFIO_PCI_DEVICE_STATE_REGION_INDEX, name);
+g_free(name);
+
+if (ret) {
+error_setg_errno(errp, -ret, "failed to get region %d info",
+ VFIO_PCI_DEVICE_STATE_REGION_INDEX);
+return;
+}
+
+QLIST_INIT(>device_state.quirks);
+}
+
 ret = vfio_get_region_info(vbasedev,
VFIO_PCI_CONFIG_REGION_INDEX, _info);
 if (ret) {
diff --git a/hw/vfio/pci.h b/hw/vfio/pci.h
index a8366bb..bd98618 100644
--- a/hw/vfio/pci.h
+++ b/hw/vfio/pci.h
@@ -115,6 +115,7 @@ typedef struct VFIOPCIDevice {
 int interrupt; /* Current interrupt type */
 VFIOBAR bars[PCI_NUM_REGIONS - 1]; /* No ROM */
 VFIOVGA *vga; /* 0xa, 0x3b0, 0x3c0 */
+VFIOBAR device_state;
 void *igd_opregion;
 PCIHostDeviceAddress host;
 EventNotifier err_notifier;
diff --git a/linux-headers/linux/vfio.h b/linux-headers/linux/vfio.h
index 531cb2e..c87d05c 100644
--- a/linux-headers/linux/vfio.h
+++ b/linux-headers/linux/vfio.h
@@ -198,6 +198,8 @@ struct vfio_device_info {
 #define VFIO_DEVICE_FLAGS_PCI  (1 << 1)/* vfio-pci device */
 #define VFIO_DEVICE_FLAGS_PLATFORM (1 << 2)/* vfio-platform device */
 #define VFIO_DEVICE_FLAGS_AMBA  (1 << 3)   /* vfio-amba device */
+#define VFIO_DEVICE_FLAGS_CCW   (1 << 4)/* vfio-ccw device */
+#define VFIO_DEVICE_FLAGS_MIGRATABLE (1 << 5)  /* Device supports migrate */
__u32   num_regions;/* Max region index + 1 */
__u32   num_irqs;   /* Max IRQ index + 1 */
 };
@@ -433,7 +435,8 @@ enum {
 * between described ranges are unimplemented.
 */
VFIO_PCI_VGA_REGION_INDEX,
-   VFIO_PCI_NUM_REGIONS = 9 /* Fixed user ABI, region indexes >=9 use */
+   VFIO_PCI_DEVICE_STATE_REGION_INDEX,
+   VFIO_PCI_NUM_REGIONS = 10 /* Fixed user ABI, region indexes >=9 use */
 /* device specific cap to define content. */
 };
 
-- 
2.7.4




[Qemu-devel] [RFC 0/5] vfio: Introduce Live migration capability to vfio_mdev device

2017-06-26 Thread Yulei Zhang
Summary

This series RFC would like to introduce the live migration capability
to vfio_mdev device. 

As the vfio_mdev device currently doesn't support migration, we introduce a
device flag VFIO_DEVICE_FLAGS_MIGRATABLE to help determine whether the
mdev device can be migrated or not; the flag is checked during 
device initialization to decide whether to init the new vfio region 
VFIO_PCI_DEVICE_STATE_REGION_INDEX. 

The intention of adding the new region is to use it for vfio_mdev device
status save and restore during the migration. Accesses to this region
will be trapped and forwarded to the vfio_mdev device driver. An 
alternative way to achieve this is to add a new vfio ioctl to help fetch
and save the device status.

Also this series include two new vfio ioctl 
#define VFIO_DEVICE_PCI_STATUS_SET  _IO(VFIO_TYPE, VFIO_BASE + 14)
#define VFIO_DEVICE_PCI_GET_DIRTY_BITMAP _IO(VFIO_TYPE, VFIO_BASE + 15)

The first one is used to control the device running status: we want to
stop the mdev device before querying the status from its device driver and
restart the device after migration.
The second one is used to do the mdev device dirty page synchronization.

So the vfio_mdev device migration sequence would be
Source VM side:
start migration
|
V
 get the cpu state change callback
use status set ioctl to stop the mdev device
|
V
 save the device status into Qemufile which is 
 read from the new vfio device status region
|
V
   query the dirty page bitmap from device
and add into qemu dirty list for sync

Target VM side:
   restore the mdev device after getting the
 saved status context from Qemufile
|
V
 get the cpu state change callback
 use status set ioctl to start the mdev 
 device to put it in running status
|
V
finish migration

Yulei Zhang (5):
  vfio: introduce a new VFIO region for migration support
  vfio: Add struct vfio_vmstate_info to introduce vfio device put/get
funtion
  vfio: introduce new VFIO ioctl VFIO_DEVICE_PCI_STATUS_SET
  vfio: use vfio_device_put/vfio_device_get for device status
save/restore
  vfio: introduce new VFIO ioctl VFIO_DEVICE_PCI_GET_DIRTY_BITMAP

 hw/vfio/pci.c  | 204 -
 hw/vfio/pci.h  |   3 +
 linux-headers/linux/vfio.h |  34 +++-
 3 files changed, 239 insertions(+), 2 deletions(-)

-- 
2.7.4