[Qemu-devel] [RFC PATCH V4 4/4] vfio: introduce new VFIO ioctl VFIO_IOMMU_GET_DIRTY_BITMAP
New VFIO ioctl VFIO_IOMMU_GET_DIRTY_BITMAP is used to fetch the bitmap of pinned memory in iommu container, we need copy those memory to the target during the migration as they are dirtied by mdev devices. Signed-off-by: Yulei Zhang <yulei.zh...@intel.com> --- hw/vfio/common.c | 34 ++ linux-headers/linux/vfio.h | 14 ++ 2 files changed, 48 insertions(+) diff --git a/hw/vfio/common.c b/hw/vfio/common.c index 7007878..460b186 100644 --- a/hw/vfio/common.c +++ b/hw/vfio/common.c @@ -35,6 +35,7 @@ #include "sysemu/kvm.h" #include "trace.h" #include "qapi/error.h" +#include "exec/ram_addr.h" struct vfio_group_head vfio_group_list = QLIST_HEAD_INITIALIZER(vfio_group_list); @@ -624,9 +625,42 @@ static void vfio_listener_region_del(MemoryListener *listener, } } +static void vfio_log_sync(MemoryListener *listener, + MemoryRegionSection *section) +{ +VFIOContainer *container = container_of(listener, VFIOContainer, listener); +VFIOGroup *group = QLIST_FIRST(>group_list); +VFIODevice *vbasedev; +QLIST_FOREACH(vbasedev, >device_list, next) { +if (vbasedev->device_state == VFIO_DEVICE_START) { +return; +} +} + +struct vfio_iommu_get_dirty_bitmap *d; +ram_addr_t size = int128_get64(section->size); +unsigned long page_nr = size >> TARGET_PAGE_BITS; +unsigned long bitmap_size = +(BITS_TO_LONGS(page_nr) + 1) * sizeof(unsigned long); +d = g_malloc0(sizeof(*d) + bitmap_size); +d->start_addr = section->offset_within_address_space; +d->page_nr = page_nr; + +if (ioctl(container->fd, VFIO_IOMMU_GET_DIRTY_BITMAP, d)) { +error_report("vfio: Failed to fetch dirty pages for migration"); +goto exit; +} + +cpu_physical_memory_set_dirty_lebitmap((unsigned long *)>dirty_bitmap, + d->start_addr, d->page_nr); +exit: +g_free(d); +} + static const MemoryListener vfio_memory_listener = { .region_add = vfio_listener_region_add, .region_del = vfio_listener_region_del, +.log_sync = vfio_log_sync, }; static void vfio_listener_release(VFIOContainer *container) diff --git a/linux-headers/linux/vfio.h 
b/linux-headers/linux/vfio.h index 2c911d9..56bf76f 100644 --- a/linux-headers/linux/vfio.h +++ b/linux-headers/linux/vfio.h @@ -589,6 +589,20 @@ struct vfio_iommu_type1_dma_unmap { #define VFIO_IOMMU_ENABLE _IO(VFIO_TYPE, VFIO_BASE + 15) #define VFIO_IOMMU_DISABLE _IO(VFIO_TYPE, VFIO_BASE + 16) +/** + * VFIO_IOMMU_GET_DIRTY_BITMAP - _IOW(VFIO_TYPE, VFIO_BASE + 17, + * struct vfio_iommu_get_dirty_bitmap) + * + * Return: 0 on success, -errno on failure. + */ +struct vfio_iommu_get_dirty_bitmap { + __u64 start_addr; + __u64 page_nr; + __u8 dirty_bitmap[]; +}; + +#define VFIO_IOMMU_GET_DIRTY_BITMAP _IO(VFIO_TYPE, VFIO_BASE + 17) + /* Additional API for SPAPR TCE (Server POWERPC) IOMMU */ /* -- 2.7.4
[Qemu-devel] [RFC PATCH V4 2/4] vfio: Add vm status change callback to stop/restart the mdev device
VM status change handler is added to change the vfio pci device status during the migration, write the demanded device status to the DEVICE STATUS subregion to stop the device on the source side before fetch its status and start the deivce on the target side after restore its status. Signed-off-by: Yulei Zhang <yulei.zh...@intel.com> --- hw/vfio/pci.c | 20 include/hw/vfio/vfio-common.h | 1 + linux-headers/linux/vfio.h| 6 ++ roms/seabios | 2 +- 4 files changed, 28 insertions(+), 1 deletion(-) diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c index f98a9dd..13d8c73 100644 --- a/hw/vfio/pci.c +++ b/hw/vfio/pci.c @@ -38,6 +38,7 @@ static void vfio_disable_interrupts(VFIOPCIDevice *vdev); static void vfio_mmap_set_enabled(VFIOPCIDevice *vdev, bool enabled); +static void vfio_vm_change_state_handler(void *pv, int running, RunState state); /* * Disabling BAR mmaping can be slow, but toggling it around INTx can @@ -2896,6 +2897,7 @@ static void vfio_realize(PCIDevice *pdev, Error **errp) vfio_register_err_notifier(vdev); vfio_register_req_notifier(vdev); vfio_setup_resetfn_quirk(vdev); +qemu_add_vm_change_state_handler(vfio_vm_change_state_handler, vdev); return; @@ -2982,6 +2984,24 @@ post_reset: vfio_pci_post_reset(vdev); } +static void vfio_vm_change_state_handler(void *pv, int running, RunState state) +{ +VFIOPCIDevice *vdev = pv; +VFIODevice *vbasedev = >vbasedev; +uint8_t dev_state; +uint8_t sz = 1; + +dev_state = running ? VFIO_DEVICE_START : VFIO_DEVICE_STOP; + +if (pwrite(vdev->vbasedev.fd, _state, + sz, vdev->device_state.offset) != sz) { +error_report("vfio: Failed to %s device", running ? 
"start" : "stop"); +return; +} + +vbasedev->device_state = dev_state; +} + static void vfio_instance_init(Object *obj) { PCIDevice *pci_dev = PCI_DEVICE(obj); diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h index f3a2ac9..9c14a8f 100644 --- a/include/hw/vfio/vfio-common.h +++ b/include/hw/vfio/vfio-common.h @@ -125,6 +125,7 @@ typedef struct VFIODevice { unsigned int num_irqs; unsigned int num_regions; unsigned int flags; +bool device_state; } VFIODevice; struct VFIODeviceOps { diff --git a/linux-headers/linux/vfio.h b/linux-headers/linux/vfio.h index e3380ad..8f02f2f 100644 --- a/linux-headers/linux/vfio.h +++ b/linux-headers/linux/vfio.h @@ -304,6 +304,12 @@ struct vfio_region_info_cap_type { /* Mdev sub-type for device state save and restore */ #define VFIO_REGION_SUBTYPE_DEVICE_STATE (4) +/* Offset in region to save device state */ +#define VFIO_DEVICE_STATE_OFFSET 1 + +#define VFIO_DEVICE_START 0 +#define VFIO_DEVICE_STOP 1 + /** * VFIO_DEVICE_GET_IRQ_INFO - _IOWR(VFIO_TYPE, VFIO_BASE + 9, * struct vfio_irq_info) diff --git a/roms/seabios b/roms/seabios index 63451fc..5f4c7b1 16 --- a/roms/seabios +++ b/roms/seabios @@ -1 +1 @@ -Subproject commit 63451fca13c75870e1703eb3e20584d91179aebc +Subproject commit 5f4c7b13cdf9c450eb55645f4362ea58fa61b79b -- 2.7.4
[Qemu-devel] [RFC PATCH V4 3/4] vfio: Add SaveVMHandlers for VFIO device to support live migration
Instead of using vm state description, add SaveVMHandlers for VFIO device to support live migration. Introduce new Ioctl VFIO_DEVICE_GET_DIRTY_BITMAP to fetch the memory bitmap that dirtied by vfio device during the iterative precopy stage to shorten the system downtime afterward. For vfio pci device status migrate, during the system downtime, it will save the following states 1. pci configuration space addr0~addr5 2. pci configuration space msi_addr msi_data 3. pci device status fetch from device driver And on the target side the vfio_load will restore the same states 1. re-setup the pci bar configuration 2. re-setup the pci device msi configuration 3. restore the pci device status Signed-off-by: Yulei Zhang <yulei.zh...@intel.com> --- hw/vfio/pci.c | 195 +++-- linux-headers/linux/vfio.h | 14 2 files changed, 204 insertions(+), 5 deletions(-) diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c index 13d8c73..ac6a9c7 100644 --- a/hw/vfio/pci.c +++ b/hw/vfio/pci.c @@ -33,9 +33,14 @@ #include "trace.h" #include "qapi/error.h" #include "migration/blocker.h" +#include "migration/register.h" +#include "exec/ram_addr.h" #define MSIX_CAP_LENGTH 12 +#define VFIO_SAVE_FLAG_SETUP 0 +#define VFIO_SAVE_FLAG_DEV_STATE 1 + static void vfio_disable_interrupts(VFIOPCIDevice *vdev); static void vfio_mmap_set_enabled(VFIOPCIDevice *vdev, bool enabled); static void vfio_vm_change_state_handler(void *pv, int running, RunState state); @@ -2639,6 +2644,190 @@ static void vfio_unregister_req_notifier(VFIOPCIDevice *vdev) vdev->req_enabled = false; } +static uint64_t vfio_dirty_log_sync(VFIOPCIDevice *vdev) +{ +RAMBlock *block; +struct vfio_device_get_dirty_bitmap *d; +uint64_t page = 0; +ram_addr_t size; +unsigned long nr, bitmap; + +RAMBLOCK_FOREACH(block) { +size = block->used_length; +nr = size >> TARGET_PAGE_BITS; +bitmap = (BITS_TO_LONGS(nr) + 1) * sizeof(unsigned long); +d = g_malloc0(sizeof(*d) + bitmap); +d->start_addr = block->offset; +d->page_nr = nr; +if (ioctl(vdev->vbasedev.fd, 
VFIO_DEVICE_GET_DIRTY_BITMAP, d)) { +error_report("vfio: Failed to get device dirty bitmap"); +g_free(d); +goto exit; +} + +if (d->page_nr) { +cpu_physical_memory_set_dirty_lebitmap( + (unsigned long *)>dirty_bitmap, + d->start_addr, d->page_nr); +page += d->page_nr; +} +g_free(d); +} + +exit: +return page; +} + +static void vfio_save_live_pending(QEMUFile *f, void *opaque, uint64_t max_size, + uint64_t *non_postcopiable_pending, + uint64_t *postcopiable_pending) +{ +VFIOPCIDevice *vdev = opaque; +uint64_t pending; + +qemu_mutex_lock_iothread(); +rcu_read_lock(); +pending = vfio_dirty_log_sync(vdev); +rcu_read_unlock(); +qemu_mutex_unlock_iothread(); +*non_postcopiable_pending += pending; +} + +static int vfio_load(QEMUFile *f, void *opaque, int version_id) +{ +VFIOPCIDevice *vdev = opaque; +PCIDevice *pdev = >pdev; +int sz = vdev->device_state.size - VFIO_DEVICE_STATE_OFFSET; +uint8_t *buf = NULL; +uint32_t ctl, msi_lo, msi_hi, msi_data, bar_cfg, i; +bool msi_64bit; + +if (qemu_get_byte(f) == VFIO_SAVE_FLAG_SETUP) { +goto exit; +} + +/* retore pci bar configuration */ +ctl = pci_default_read_config(pdev, PCI_COMMAND, 2); +vfio_pci_write_config(pdev, PCI_COMMAND, + ctl & (!(PCI_COMMAND_IO | PCI_COMMAND_MEMORY)), 2); +for (i = 0; i < PCI_ROM_SLOT; i++) { +bar_cfg = qemu_get_be32(f); +vfio_pci_write_config(pdev, PCI_BASE_ADDRESS_0 + i * 4, bar_cfg, 4); +} +vfio_pci_write_config(pdev, PCI_COMMAND, + ctl | PCI_COMMAND_IO | PCI_COMMAND_MEMORY, 2); + +/* restore msi configuration */ +ctl = pci_default_read_config(pdev, pdev->msi_cap + PCI_MSI_FLAGS, 2); +msi_64bit = !!(ctl & PCI_MSI_FLAGS_64BIT); + +vfio_pci_write_config(>pdev, + pdev->msi_cap + PCI_MSI_FLAGS, + ctl & (!PCI_MSI_FLAGS_ENABLE), 2); + +msi_lo = qemu_get_be32(f); +vfio_pci_write_config(pdev, pdev->msi_cap + PCI_MSI_ADDRESS_LO, msi_lo, 4); + +if (msi_64bit) { +msi_hi = qemu_get_be32(f); +vfio_pci_write_config(pdev, pdev->msi_cap + PCI_MSI_ADDRESS_HI, + msi_hi, 4); +} +msi_data = qemu_get_be32(f); 
+vfio_pci_write_config(pdev, + pdev->msi_cap + (msi_64bit ? PCI_MSI_DATA_64 : PCI_MSI_DATA_32), + msi_data, 2); + +vfio_pci_write_config(>pdev, pdev->msi_cap + PCI_MSI_FL
[Qemu-devel] [RFC V4 PATCH 0/4] vfio: Introduce live migration capability to
Summary This series RFC would like to resume the discussion about how to introduce the live migration capability to vfio mdev device. A new subtype region VFIO_REGION_SUBTYPE_DEVICE_STATE is introduced for vfio device status migrate, during the initialization it will check if the region is supported by the vfio device, otherwise it will remain non-migratable. The intention to add the new region is using it for mdev device status save and restore during the migration. The access to this region will be trapped and forward to the mdev device driver, it also uses the first byte in the new region to control the running state of mdev device, so during the migration after stop the mdev driver, qemu could retrieve the specific device status from this region and transfer to the target VM side for the mdev device restore. In addition, during the pre-copy period, it will be able to fetch the dirty bitmap of vfio device through ioctl VFIO_DEVICE_GET_DIRTY_BITMAP iteratively, which will be able to shorten the system downtime during the static copy. Below is the vfio mdev device migration sequence Source VM side: start migration | V in pre-copy stage, fetch the device dirty bitmap and add into qemu dirty list for migrate iteratively. | V get the cpu state change callback, write to the subregion's first byte to stop the mdev device | V quary the dirty page bitmap from iommu container and add into qemu dirty list for last synchronization | V save the deivce status into Qemufile which is read from the vfio device subregion Target VM side: restore the mdev device after get the saved status context from Qemufile | V get the cpu state change callback write to subregion's first byte to start the mdev device to put it in running status | V finish migration V3->V4: 1. add migration_blocker if device state region isnot supported. 2. 
instead of using vmsd, register SaveVMHandlers for VFIO device to leverage the pro-copy facility, and add new ioctl for VFIO device to fetch dirty bitmap during pro-copy. 3. remove the intel vendor ID dependence for the device state subregion. V2->V3: 1. rebase the patch to Qemu stable 2.10 branch. 2. use a common name for the subregion instead of specific for intel IGD. V1->V2: Per Alex's suggestion: 1. use device subtype region instead of VFIO PCI fixed region. 2. remove unnecessary ioctl, use the first byte of subregion to control the running state of mdev device. 3. for dirty page synchronization, implement the interface with VFIOContainer instead of vfio pci device. Yulei Zhang (4): vfio: introduce a new VFIO subregion for mdev device migration support vfio: Add vm status change callback to stop/restart the mdev device vfio: Add SaveVMHanlders for VFIO device to support live migration vifo: introduce new VFIO ioctl VFIO_IOMMU_GET_DIRTY_BITMAP hw/vfio/common.c | 34 ++ hw/vfio/pci.c | 240 -- hw/vfio/pci.h | 2 + include/hw/vfio/vfio-common.h | 1 + linux-headers/linux/vfio.h| 43 +++- roms/seabios | 2 +- 6 files changed, 312 insertions(+), 10 deletions(-) -- 2.7.4
[Qemu-devel] [RFC PATCH V4 1/4] vfio: introduce a new VFIO subregion for mdev device migration support
New VFIO sub region VFIO_REGION_SUBTYPE_DEVICE_STATE is added to fetch and restore the status of mdev device vGPU during the live migration. Signed-off-by: Yulei Zhang <yulei.zh...@intel.com> --- hw/vfio/pci.c | 25 - hw/vfio/pci.h | 2 ++ linux-headers/linux/vfio.h | 9 ++--- 3 files changed, 32 insertions(+), 4 deletions(-) diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c index c977ee3..f98a9dd 100644 --- a/hw/vfio/pci.c +++ b/hw/vfio/pci.c @@ -32,6 +32,7 @@ #include "pci.h" #include "trace.h" #include "qapi/error.h" +#include "migration/blocker.h" #define MSIX_CAP_LENGTH 12 @@ -2821,6 +2822,25 @@ static void vfio_realize(PCIDevice *pdev, Error **errp) vfio_vga_quirk_setup(vdev); } +struct vfio_region_info *device_state; +/* device state region setup */ +if (!vfio_get_dev_region_info(>vbasedev, +VFIO_REGION_TYPE_PCI_VENDOR_TYPE, +VFIO_REGION_SUBTYPE_DEVICE_STATE, _state)) { +memcpy(>device_state, device_state, + sizeof(struct vfio_region_info)); +g_free(device_state); +} else { +error_setg(>migration_blocker, +"Migration disabled: cannot support device state region"); +migrate_add_blocker(vdev->migration_blocker, ); +if (err) { +error_propagate(errp, err); +error_free(vdev->migration_blocker); +goto error; +} +} + for (i = 0; i < PCI_ROM_SLOT; i++) { vfio_bar_quirk_setup(vdev, i); } @@ -2884,6 +2904,10 @@ out_teardown: vfio_teardown_msi(vdev); vfio_bars_exit(vdev); error: +if (vdev->migration_blocker) { +migrate_del_blocker(vdev->migration_blocker); +error_free(vdev->migration_blocker); +} error_prepend(errp, ERR_PREFIX, vdev->vbasedev.name); } @@ -3009,7 +3033,6 @@ static Property vfio_pci_dev_properties[] = { static const VMStateDescription vfio_pci_vmstate = { .name = "vfio-pci", -.unmigratable = 1, }; static void vfio_pci_dev_class_init(ObjectClass *klass, void *data) diff --git a/hw/vfio/pci.h b/hw/vfio/pci.h index 502a575..0ee1724 100644 --- a/hw/vfio/pci.h +++ b/hw/vfio/pci.h @@ -116,6 +116,8 @@ typedef struct VFIOPCIDevice { VFIOBAR bars[PCI_NUM_REGIONS - 1]; /* 
No ROM */ VFIOVGA *vga; /* 0xa, 0x3b0, 0x3c0 */ void *igd_opregion; +struct vfio_region_info device_state; +Error *migration_blocker; PCIHostDeviceAddress host; EventNotifier err_notifier; EventNotifier req_notifier; diff --git a/linux-headers/linux/vfio.h b/linux-headers/linux/vfio.h index 4312e96..e3380ad 100644 --- a/linux-headers/linux/vfio.h +++ b/linux-headers/linux/vfio.h @@ -297,9 +297,12 @@ struct vfio_region_info_cap_type { #define VFIO_REGION_TYPE_PCI_VENDOR_MASK (0x) /* 8086 Vendor sub-types */ -#define VFIO_REGION_SUBTYPE_INTEL_IGD_OPREGION (1) -#define VFIO_REGION_SUBTYPE_INTEL_IGD_HOST_CFG (2) -#define VFIO_REGION_SUBTYPE_INTEL_IGD_LPC_CFG (3) +#define VFIO_REGION_SUBTYPE_INTEL_IGD_OPREGION (1) +#define VFIO_REGION_SUBTYPE_INTEL_IGD_HOST_CFG (2) +#define VFIO_REGION_SUBTYPE_INTEL_IGD_LPC_CFG (3) + +/* Mdev sub-type for device state save and restore */ +#define VFIO_REGION_SUBTYPE_DEVICE_STATE (4) /** * VFIO_DEVICE_GET_IRQ_INFO - _IOWR(VFIO_TYPE, VFIO_BASE + 9, -- 2.7.4
[Qemu-devel] [PATCH V3 3/4] vfio: Add struct vfio_vmstate_info to introduce put/get callback function for vfio device status save/restore
Introduce vfio_device_put/vfio_device_get funtion for vfio device state save/restore usage. For VFIO pci device status migrate, on the source side with funtion vfio_device_put to save the following states 1. pci configuration space addr0~addr5 2. pci configuration space msi_addr msi_data 3. pci device status fetch from device driver And on the target side with funtion vfio_device_get to restore the same states 1. re-setup the pci bar configuration 2. re-setup the pci device msi configuration 3. restore the pci device status Signed-off-by: Yulei Zhang <yulei.zh...@intel.com> --- hw/vfio/pci.c | 137 + linux-headers/linux/vfio.h | 3 + 2 files changed, 140 insertions(+) diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c index 3e2289c..c1676cf 100644 --- a/hw/vfio/pci.c +++ b/hw/vfio/pci.c @@ -2982,6 +2982,123 @@ static void vfio_vm_change_state_handler(void *pv, int running, RunState state) vbasedev->device_state = dev_state; } +static int vfio_device_put(QEMUFile *f, void *pv, size_t size, + VMStateField *field, QJSON *vmdesc) +{ +VFIOPCIDevice *vdev = pv; +PCIDevice *pdev = >pdev; +int sz = vdev->device_state.size - VFIO_DEVICE_STATE_OFFSET; +uint8_t *buf = NULL; +uint32_t msi_cfg, msi_lo, msi_hi, msi_data, bar_cfg, i; +bool msi_64bit; + +for (i = 0; i < PCI_ROM_SLOT; i++) { +bar_cfg = pci_default_read_config(pdev, PCI_BASE_ADDRESS_0 + i * 4, 4); +qemu_put_be32(f, bar_cfg); +} + +msi_cfg = pci_default_read_config(pdev, pdev->msi_cap + PCI_MSI_FLAGS, 2); +msi_64bit = !!(msi_cfg & PCI_MSI_FLAGS_64BIT); + +msi_lo = pci_default_read_config(pdev, + pdev->msi_cap + PCI_MSI_ADDRESS_LO, 4); +qemu_put_be32(f, msi_lo); + +if (msi_64bit) { +msi_hi = pci_default_read_config(pdev, + pdev->msi_cap + PCI_MSI_ADDRESS_HI, + 4); +qemu_put_be32(f, msi_hi); +} + +msi_data = pci_default_read_config(pdev, + pdev->msi_cap + (msi_64bit ? 
PCI_MSI_DATA_64 : PCI_MSI_DATA_32), + 2); +qemu_put_be32(f, msi_data); + +buf = g_malloc(sz); +if (buf == NULL) { +error_report("vfio: Failed to allocate memory for migrate"); +goto exit; +} + +if (pread(vdev->vbasedev.fd, buf, sz, + vdev->device_state.offset + VFIO_DEVICE_STATE_OFFSET) != sz) { +error_report("vfio: Failed to read Device State Region"); +goto exit; +} + +qemu_put_buffer(f, buf, sz); + +exit: +g_free(buf); + +return 0; +} + +static int vfio_device_get(QEMUFile *f, void *pv, + size_t size, VMStateField *field) +{ +VFIOPCIDevice *vdev = pv; +PCIDevice *pdev = >pdev; +int sz = vdev->device_state.size - VFIO_DEVICE_STATE_OFFSET; +uint8_t *buf = NULL; +uint32_t ctl, msi_lo, msi_hi, msi_data, bar_cfg, i; +bool msi_64bit; + +/* retore pci bar configuration */ +ctl = pci_default_read_config(pdev, PCI_COMMAND, 2); +vfio_pci_write_config(pdev, PCI_COMMAND, + ctl & (!(PCI_COMMAND_IO | PCI_COMMAND_MEMORY)), 2); +for (i = 0; i < PCI_ROM_SLOT; i++) { +bar_cfg = qemu_get_be32(f); +vfio_pci_write_config(pdev, PCI_BASE_ADDRESS_0 + i * 4, bar_cfg, 4); +} +vfio_pci_write_config(pdev, PCI_COMMAND, + ctl | PCI_COMMAND_IO | PCI_COMMAND_MEMORY, 2); + +/* restore msi configuration */ +ctl = pci_default_read_config(pdev, pdev->msi_cap + PCI_MSI_FLAGS, 2); +msi_64bit = !!(ctl & PCI_MSI_FLAGS_64BIT); + +vfio_pci_write_config(>pdev, + pdev->msi_cap + PCI_MSI_FLAGS, + ctl & (!PCI_MSI_FLAGS_ENABLE), 2); + +msi_lo = qemu_get_be32(f); +vfio_pci_write_config(pdev, pdev->msi_cap + PCI_MSI_ADDRESS_LO, msi_lo, 4); + +if (msi_64bit) { +msi_hi = qemu_get_be32(f); +vfio_pci_write_config(pdev, pdev->msi_cap + PCI_MSI_ADDRESS_HI, + msi_hi, 4); +} +msi_data = qemu_get_be32(f); +vfio_pci_write_config(pdev, + pdev->msi_cap + (msi_64bit ? 
PCI_MSI_DATA_64 : PCI_MSI_DATA_32), + msi_data, 2); + +vfio_pci_write_config(>pdev, pdev->msi_cap + PCI_MSI_FLAGS, + ctl | PCI_MSI_FLAGS_ENABLE, 2); + +buf = g_malloc(sz); +if (buf == NULL) { +error_report("vfio: Failed to allocate memory for migrate"); +return -1; +} + +qemu_get_buffer(f, buf, sz); +if (pwrite(vdev->vbasedev.fd, buf, sz, + vdev->device_state.offset + VFIO_DEVICE_STATE_OFFSET) != sz) { +error_report("vfio: Failed to write Device State Region"); +
[Qemu-devel] [PATCH V3 4/4] vfio: introduce new VFIO ioctl VFIO_IOMMU_GET_DIRTY_BITMAP
New VFIO ioctl VFIO_IOMMU_GET_DIRTY_BITMAP is used to fetch the bitmap of pinned memory in iommu container, we need copy those memory to the target during the migration as they are dirtied by mdev devices. Signed-off-by: Yulei Zhang <yulei.zh...@intel.com> --- hw/vfio/common.c | 34 ++ linux-headers/linux/vfio.h | 14 ++ 2 files changed, 48 insertions(+) diff --git a/hw/vfio/common.c b/hw/vfio/common.c index 7b2924c..a952554 100644 --- a/hw/vfio/common.c +++ b/hw/vfio/common.c @@ -35,6 +35,7 @@ #include "sysemu/kvm.h" #include "trace.h" #include "qapi/error.h" +#include "exec/ram_addr.h" struct vfio_group_head vfio_group_list = QLIST_HEAD_INITIALIZER(vfio_group_list); @@ -624,9 +625,42 @@ static void vfio_listener_region_del(MemoryListener *listener, } } +static void vfio_log_sync(MemoryListener *listener, + MemoryRegionSection *section) +{ +VFIOContainer *container = container_of(listener, VFIOContainer, listener); +VFIOGroup *group = QLIST_FIRST(>group_list); +VFIODevice *vbasedev; +QLIST_FOREACH(vbasedev, >device_list, next) { +if (vbasedev->device_state == VFIO_DEVICE_START) { +return; +} +} + +struct vfio_iommu_get_dirty_bitmap *d; +ram_addr_t size = int128_get64(section->size); +unsigned long page_nr = size >> TARGET_PAGE_BITS; +unsigned long bitmap_size = +(BITS_TO_LONGS(page_nr) + 1) * sizeof(unsigned long); +d = g_malloc0(sizeof(*d) + bitmap_size); +d->start_addr = section->offset_within_address_space; +d->page_nr = page_nr; + +if (ioctl(container->fd, VFIO_IOMMU_GET_DIRTY_BITMAP, d)) { +error_report("vfio: Failed to fetch dirty pages for migration"); +goto exit; +} + +cpu_physical_memory_set_dirty_lebitmap((unsigned long *)>dirty_bitmap, + d->start_addr, d->page_nr); +exit: +g_free(d); +} + static const MemoryListener vfio_memory_listener = { .region_add = vfio_listener_region_add, .region_del = vfio_listener_region_del, +.log_sync = vfio_log_sync, }; static void vfio_listener_release(VFIOContainer *container) diff --git a/linux-headers/linux/vfio.h 
b/linux-headers/linux/vfio.h index 4451a8f..a41f73b 100644 --- a/linux-headers/linux/vfio.h +++ b/linux-headers/linux/vfio.h @@ -574,6 +574,20 @@ struct vfio_iommu_type1_dma_unmap { #define VFIO_IOMMU_ENABLE _IO(VFIO_TYPE, VFIO_BASE + 15) #define VFIO_IOMMU_DISABLE _IO(VFIO_TYPE, VFIO_BASE + 16) +/** + * VFIO_IOMMU_GET_DIRTY_BITMAP - _IOW(VFIO_TYPE, VFIO_BASE + 17, + * struct vfio_iommu_get_dirty_bitmap) + * + * Return: 0 on success, -errno on failure. + */ +struct vfio_iommu_get_dirty_bitmap { + __u64 start_addr; + __u64 page_nr; + __u8 dirty_bitmap[]; +}; + +#define VFIO_IOMMU_GET_DIRTY_BITMAP _IO(VFIO_TYPE, VFIO_BASE + 17) + /* Additional API for SPAPR TCE (Server POWERPC) IOMMU */ /* -- 2.7.4
[Qemu-devel] [PATCH V3 2/4] vfio: Add vm status change callback to stop/restart the mdev device
VM status change handler is added to change the vfio pci device status during the migration, write the demanded device status to the DEVICE STATUS subregion to stop the device on the source side before fetch its status and start the deivce on the target side after restore its status. Signed-off-by: Yulei Zhang <yulei.zh...@intel.com> --- hw/vfio/pci.c | 20 include/hw/vfio/vfio-common.h | 1 + linux-headers/linux/vfio.h| 3 +++ 3 files changed, 24 insertions(+) diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c index 2fe20e4..3e2289c 100644 --- a/hw/vfio/pci.c +++ b/hw/vfio/pci.c @@ -38,6 +38,7 @@ static void vfio_disable_interrupts(VFIOPCIDevice *vdev); static void vfio_mmap_set_enabled(VFIOPCIDevice *vdev, bool enabled); static VMStateDescription vfio_pci_vmstate; +static void vfio_vm_change_state_handler(void *pv, int running, RunState state); /* * Disabling BAR mmaping can be slow, but toggling it around INTx can @@ -2880,6 +2881,7 @@ static void vfio_realize(PCIDevice *pdev, Error **errp) vfio_register_err_notifier(vdev); vfio_register_req_notifier(vdev); vfio_setup_resetfn_quirk(vdev); +qemu_add_vm_change_state_handler(vfio_vm_change_state_handler, vdev); return; @@ -2962,6 +2964,24 @@ post_reset: vfio_pci_post_reset(vdev); } +static void vfio_vm_change_state_handler(void *pv, int running, RunState state) +{ +VFIOPCIDevice *vdev = pv; +VFIODevice *vbasedev = >vbasedev; +uint8_t dev_state; +uint8_t sz = 1; + +dev_state = running ? VFIO_DEVICE_START : VFIO_DEVICE_STOP; + +if (pwrite(vdev->vbasedev.fd, _state, + sz, vdev->device_state.offset) != sz) { +error_report("vfio: Failed to %s device", running ? 
"start" : "stop"); +return; +} + +vbasedev->device_state = dev_state; +} + static void vfio_instance_init(Object *obj) { PCIDevice *pci_dev = PCI_DEVICE(obj); diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h index f3a2ac9..9c14a8f 100644 --- a/include/hw/vfio/vfio-common.h +++ b/include/hw/vfio/vfio-common.h @@ -125,6 +125,7 @@ typedef struct VFIODevice { unsigned int num_irqs; unsigned int num_regions; unsigned int flags; +bool device_state; } VFIODevice; struct VFIODeviceOps { diff --git a/linux-headers/linux/vfio.h b/linux-headers/linux/vfio.h index c3b8e4a..4ddeebc 100644 --- a/linux-headers/linux/vfio.h +++ b/linux-headers/linux/vfio.h @@ -303,6 +303,9 @@ struct vfio_region_info_cap_type { /* Mdev sub-type for device state save and restore */ #define VFIO_REGION_SUBTYPE_DEVICE_STATE (4) +#define VFIO_DEVICE_START 0 +#define VFIO_DEVICE_STOP 1 + /** * VFIO_DEVICE_GET_IRQ_INFO - _IOWR(VFIO_TYPE, VFIO_BASE + 9, * struct vfio_irq_info) -- 2.7.4
[Qemu-devel] [PATCH V3 1/4] vfio: introduce a new VFIO subregion for mdev device migration support
New VFIO sub region VFIO_REGION_SUBTYPE_DEVICE_STATE is added to fetch and restore the status of mdev device vGPU during the live migration. Signed-off-by: Yulei Zhang <yulei.zh...@intel.com> --- hw/vfio/pci.c | 14 +- hw/vfio/pci.h | 1 + linux-headers/linux/vfio.h | 9 ++--- 3 files changed, 20 insertions(+), 4 deletions(-) diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c index 31e1edf..2fe20e4 100644 --- a/hw/vfio/pci.c +++ b/hw/vfio/pci.c @@ -37,6 +37,7 @@ static void vfio_disable_interrupts(VFIOPCIDevice *vdev); static void vfio_mmap_set_enabled(VFIOPCIDevice *vdev, bool enabled); +static VMStateDescription vfio_pci_vmstate; /* * Disabling BAR mmaping can be slow, but toggling it around INTx can @@ -2813,6 +2814,17 @@ static void vfio_realize(PCIDevice *pdev, Error **errp) vfio_vga_quirk_setup(vdev); } +struct vfio_region_info *device_state; +/* device state region setup */ +if (!vfio_get_dev_region_info(>vbasedev, +VFIO_REGION_TYPE_PCI_VENDOR_TYPE | PCI_VENDOR_ID_INTEL, +VFIO_REGION_SUBTYPE_DEVICE_STATE, _state)) { +memcpy(>device_state, device_state, + sizeof(struct vfio_region_info)); +g_free(device_state); +vfio_pci_vmstate.unmigratable = 0; +} + for (i = 0; i < PCI_ROM_SLOT; i++) { vfio_bar_quirk_setup(vdev, i); } @@ -2994,7 +3006,7 @@ static Property vfio_pci_dev_properties[] = { DEFINE_PROP_END_OF_LIST(), }; -static const VMStateDescription vfio_pci_vmstate = { +static VMStateDescription vfio_pci_vmstate = { .name = "vfio-pci", .unmigratable = 1, }; diff --git a/hw/vfio/pci.h b/hw/vfio/pci.h index a8366bb..6a1d26e 100644 --- a/hw/vfio/pci.h +++ b/hw/vfio/pci.h @@ -116,6 +116,7 @@ typedef struct VFIOPCIDevice { VFIOBAR bars[PCI_NUM_REGIONS - 1]; /* No ROM */ VFIOVGA *vga; /* 0xa, 0x3b0, 0x3c0 */ void *igd_opregion; +struct vfio_region_info device_state; PCIHostDeviceAddress host; EventNotifier err_notifier; EventNotifier req_notifier; diff --git a/linux-headers/linux/vfio.h b/linux-headers/linux/vfio.h index 4e7ab4c..c3b8e4a 100644 --- 
a/linux-headers/linux/vfio.h +++ b/linux-headers/linux/vfio.h @@ -296,9 +296,12 @@ struct vfio_region_info_cap_type { #define VFIO_REGION_TYPE_PCI_VENDOR_MASK (0x) /* 8086 Vendor sub-types */ -#define VFIO_REGION_SUBTYPE_INTEL_IGD_OPREGION (1) -#define VFIO_REGION_SUBTYPE_INTEL_IGD_HOST_CFG (2) -#define VFIO_REGION_SUBTYPE_INTEL_IGD_LPC_CFG (3) +#define VFIO_REGION_SUBTYPE_INTEL_IGD_OPREGION (1) +#define VFIO_REGION_SUBTYPE_INTEL_IGD_HOST_CFG (2) +#define VFIO_REGION_SUBTYPE_INTEL_IGD_LPC_CFG (3) + +/* Mdev sub-type for device state save and restore */ +#define VFIO_REGION_SUBTYPE_DEVICE_STATE (4) /** * VFIO_DEVICE_GET_IRQ_INFO - _IOWR(VFIO_TYPE, VFIO_BASE + 9, -- 2.7.4
[Qemu-devel] [PATCH V3 0/4] vfio: Introduce Live migration capability to vfio_mdev device
Summary This series RFC would like to resume the discussion about how to introduce the live migration capability to vfio mdev device. By adding a new vfio subtype region VFIO_REGION_SUBTYPE_DEVICE_STATE, the mdev device will be set to migratable if the new region exist during the initialization. The intention to add the new region is using it for mdev device status save and restore during the migration. The access to this region will be trapped and forward to the mdev device driver, it also uses the first byte in the new region to control the running state of mdev device, so during the migration after stop the mdev driver, qemu could retrieve the specific device status from this region and transfer to the target VM side for the mdev device restore. In addition, we add one new ioctl VFIO_IOMMU_GET_DIRTY_BITMAP to help do the mdev device dirty page synchronization during the migration, currently it is just for static copy, in the future we would like to add new interface for the pre-copy. Below is the vfio_mdev device migration sequence Source VM side: start migration | V get the cpu state change callback, write to the subregion's first byte to stop the mdev device | V quary the dirty page bitmap from iommu container and add into qemu dirty list for synchronization | V save the deivce status into Qemufile which is read from the vfio device subregion Target VM side: restore the mdev device after get the saved status context from Qemufile | V get the cpu state change callback write to subregion's first byte to start the mdev device to put it in running status | V finish migration V3->V2: 1. rebase the patch to Qemu stable 2.10 branch. 2. use a common name for the subregion instead of specific for intel IGD. V1->V2: Per Alex's suggestion: 1. use device subtype region instead of VFIO PCI fixed region. 2. remove unnecessary ioctl, use the first byte of subregion to control the running state of mdev device. 3. 
for dirty page synchronization, implement the interface with VFIOContainer instead of vfio pci device. Yulei Zhang (4): vfio: introduce a new VFIO subregion for mdev device migration support vfio: Add vm status change callback to stop/restart the mdev device vfio: Add struct vfio_vmstate_info to introduce put/get callback funtion for vfio device status save/restore vifo: introduce new VFIO ioctl VFIO_IOMMU_GET_DIRTY_BITMAP hw/vfio/common.c | 34 + hw/vfio/pci.c | 171 +- hw/vfio/pci.h | 1 + include/hw/vfio/vfio-common.h | 1 + linux-headers/linux/vfio.h| 29 ++- 5 files changed, 232 insertions(+), 4 deletions(-) -- 2.7.4
[Qemu-devel] [RFC V2 4/4] vfio: introduce new VFIO ioctl VFIO_IOMMU_GET_DIRTY_BITMAP
New VFIO ioctl VFIO_IOMMU_GET_DIRTY_BITMAP is used to fetch the bitmap of pinned memory in iommu container, we need copy those memory to the target during the migration as they are dirtied by mdev devices. Signed-off-by: Yulei Zhang <yulei.zh...@intel.com> --- hw/vfio/common.c | 32 linux-headers/linux/vfio.h | 14 ++ 2 files changed, 46 insertions(+) diff --git a/hw/vfio/common.c b/hw/vfio/common.c index f3ba9b9..54d43d5 100644 --- a/hw/vfio/common.c +++ b/hw/vfio/common.c @@ -35,6 +35,7 @@ #include "sysemu/kvm.h" #include "trace.h" #include "qapi/error.h" +#include "exec/ram_addr.h" struct vfio_group_head vfio_group_list = QLIST_HEAD_INITIALIZER(vfio_group_list); @@ -603,9 +604,40 @@ static void vfio_listener_region_del(MemoryListener *listener, } } +static void vfio_log_sync(MemoryListener *listener, + MemoryRegionSection *section) +{ +VFIOContainer *container = container_of(listener, VFIOContainer, listener); +VFIOGroup *group = QLIST_FIRST(>group_list); +VFIODevice *vbasedev; +QLIST_FOREACH(vbasedev, >device_list, next) { + if (vbasedev->device_state == VFIO_DEVICE_START) + return; +} + +struct vfio_iommu_get_dirty_bitmap *d; +ram_addr_t size = int128_get64(section->size); +unsigned long page_nr = size >> TARGET_PAGE_BITS; +unsigned long bitmap_size = (BITS_TO_LONGS(page_nr) + 1) * sizeof(unsigned long); +d = g_malloc0(sizeof(*d) + bitmap_size); +d->start_addr = section->offset_within_address_space; +d->page_nr = page_nr; + +if (ioctl(container->fd, VFIO_IOMMU_GET_DIRTY_BITMAP, d)) { +error_report("vfio: Failed to fetch dirty pages for migration\n"); +goto exit; +} + +cpu_physical_memory_set_dirty_lebitmap((unsigned long*)>dirty_bitmap, d->start_addr, d->page_nr); + +exit: +g_free(d); +} + static const MemoryListener vfio_memory_listener = { .region_add = vfio_listener_region_add, .region_del = vfio_listener_region_del, +.log_sync = vfio_log_sync, }; static void vfio_listener_release(VFIOContainer *container) diff --git a/linux-headers/linux/vfio.h 
b/linux-headers/linux/vfio.h index dbbe7e1..cf3d163 100644 --- a/linux-headers/linux/vfio.h +++ b/linux-headers/linux/vfio.h @@ -553,6 +553,20 @@ struct vfio_iommu_type1_dma_unmap { #define VFIO_IOMMU_ENABLE _IO(VFIO_TYPE, VFIO_BASE + 15) #define VFIO_IOMMU_DISABLE _IO(VFIO_TYPE, VFIO_BASE + 16) +/** + * VFIO_IOMMU_GET_DIRTY_BITMAP - _IOW(VFIO_TYPE, VFIO_BASE + 17, + * struct vfio_iommu_get_dirty_bitmap) + * + * Return: 0 on success, -errno on failure. + */ +struct vfio_iommu_get_dirty_bitmap{ + __u64 start_addr; + __u64 page_nr; + __u8 dirty_bitmap[]; +}; + +#define VFIO_IOMMU_GET_DIRTY_BITMAP _IO(VFIO_TYPE, VFIO_BASE + 17) + /* Additional API for SPAPR TCE (Server POWERPC) IOMMU */ /* -- 2.7.4
[Qemu-devel] [RFC V2 2/4] vfio: Add vm status change callback to stop/restart the mdev device
VM status change handler is added to change the vfio pci device status during the migration, write the demanded device status to the DEVICE STATUS subregion to stop the device on the source side before fetch its status and start the deivce on the target side after restore its status. Signed-off-by: Yulei Zhang <yulei.zh...@intel.com> --- hw/vfio/pci.c | 19 +++ include/hw/vfio/vfio-common.h | 1 + linux-headers/linux/vfio.h| 3 +++ 3 files changed, 23 insertions(+) diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c index 21a5cef..753da80 100644 --- a/hw/vfio/pci.c +++ b/hw/vfio/pci.c @@ -38,6 +38,7 @@ static void vfio_disable_interrupts(VFIOPCIDevice *vdev); static void vfio_mmap_set_enabled(VFIOPCIDevice *vdev, bool enabled); static VMStateDescription vfio_pci_vmstate; +static void vfio_vm_change_state_handler(void *pv, int running, RunState state); /* * Disabling BAR mmaping can be slow, but toggling it around INTx can @@ -2858,6 +2859,7 @@ static void vfio_realize(PCIDevice *pdev, Error **errp) vfio_register_err_notifier(vdev); vfio_register_req_notifier(vdev); vfio_setup_resetfn_quirk(vdev); +qemu_add_vm_change_state_handler(vfio_vm_change_state_handler, vdev); return; @@ -2940,6 +2942,23 @@ post_reset: vfio_pci_post_reset(vdev); } +static void vfio_vm_change_state_handler(void *pv, int running, RunState state) +{ +VFIOPCIDevice *vdev = pv; +VFIODevice *vbasedev = >vbasedev; +uint8_t dev_state; +uint8_t sz = 1; + +dev_state = running ? VFIO_DEVICE_START : VFIO_DEVICE_STOP; + +if (pwrite(vdev->vbasedev.fd, _state, sz, vdev->device_state.offset) != sz) { +error_report("vfio: Failed to %s device\n", running ? 
"start" : "stop"); +return; +} + +vbasedev->device_state = dev_state; +} + static void vfio_instance_init(Object *obj) { PCIDevice *pci_dev = PCI_DEVICE(obj); diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h index c582de1..c4bab97 100644 --- a/include/hw/vfio/vfio-common.h +++ b/include/hw/vfio/vfio-common.h @@ -123,6 +123,7 @@ typedef struct VFIODevice { unsigned int num_irqs; unsigned int num_regions; unsigned int flags; +bool device_state; } VFIODevice; struct VFIODeviceOps { diff --git a/linux-headers/linux/vfio.h b/linux-headers/linux/vfio.h index e2c53bf..ae1b953 100644 --- a/linux-headers/linux/vfio.h +++ b/linux-headers/linux/vfio.h @@ -299,6 +299,9 @@ struct vfio_region_info_cap_type { #define VFIO_REGION_SUBTYPE_INTEL_IGD_LPC_CFG (3) #define VFIO_REGION_SUBTYPE_INTEL_IGD_DEVICE_STATE (4) +#define VFIO_DEVICE_START 0 +#define VFIO_DEVICE_STOP 1 + /** * VFIO_DEVICE_GET_IRQ_INFO - _IOWR(VFIO_TYPE, VFIO_BASE + 9, * struct vfio_irq_info) -- 2.7.4
[Qemu-devel] [RFC V2 3/4] vfio: Add struct vfio_vmstate_info to introduce put/get callback function for vfio device status save/restore
Introduce vfio_device_put/vfio_device_get funtion for vfio device state save/restore usage. For VFIO pci device status migrate, on the source side with funtion vfio_device_put to save the following states 1. pci configuration space addr0~addr5 2. pci configuration space msi_addr msi_data 3. pci device status fetch from device driver And on the target side with funtion vfio_device_get to restore the same states 1. re-setup the pci bar configuration 2. re-setup the pci device msi configuration 3. restore the pci device status Signed-off-by: Yulei Zhang <yulei.zh...@intel.com> --- hw/vfio/pci.c | 132 + linux-headers/linux/vfio.h | 2 + 2 files changed, 134 insertions(+) diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c index 753da80..c0fc1d2 100644 --- a/hw/vfio/pci.c +++ b/hw/vfio/pci.c @@ -2959,6 +2959,118 @@ static void vfio_vm_change_state_handler(void *pv, int running, RunState state) vbasedev->device_state = dev_state; } +static int vfio_device_put(QEMUFile *f, void *pv, size_t size, VMStateField *field, +QJSON *vmdesc) +{ +VFIOPCIDevice *vdev = pv; +PCIDevice *pdev = >pdev; +int sz = vdev->device_state.size - VFIO_DEVICE_STATE_OFFSET; +uint8_t *buf = NULL; +uint32_t msi_cfg, msi_lo, msi_hi, msi_data, bar_cfg, i; +bool msi_64bit; + +for (i = 0; i < PCI_ROM_SLOT; i++) { +bar_cfg = pci_default_read_config(pdev, PCI_BASE_ADDRESS_0 + i*4, 4); +qemu_put_be32(f, bar_cfg); +} + +msi_cfg = pci_default_read_config(pdev, pdev->msi_cap + PCI_MSI_FLAGS, 2); +msi_64bit = !!(msi_cfg & PCI_MSI_FLAGS_64BIT); + +msi_lo = pci_default_read_config(pdev, pdev->msi_cap + PCI_MSI_ADDRESS_LO, 4); +qemu_put_be32(f, msi_lo); + +if (msi_64bit) { +msi_hi = pci_default_read_config(pdev, pdev->msi_cap + PCI_MSI_ADDRESS_HI, 4); +qemu_put_be32(f, msi_hi); +} + +msi_data = pci_default_read_config(pdev, + pdev->msi_cap + (msi_64bit ? 
PCI_MSI_DATA_64 : PCI_MSI_DATA_32), 2); +qemu_put_be32(f, msi_data); + +buf = g_malloc(sz); +if (buf == NULL) { +error_report("vfio: Failed to allocate memory for migrate\n"); +goto exit; +} + +if (pread(vdev->vbasedev.fd, buf, sz, + vdev->device_state.offset + VFIO_DEVICE_STATE_OFFSET) != sz) { +error_report("vfio: Failed to read Device State Region\n"); +goto exit; +} + +qemu_put_buffer(f, buf, sz); + +exit: +if (buf) +g_free(buf); + +return 0; +} + +static int vfio_device_get(QEMUFile *f, void *pv, size_t size, VMStateField *field) +{ +VFIOPCIDevice *vdev = pv; +PCIDevice *pdev = >pdev; +int sz = vdev->device_state.size - VFIO_DEVICE_STATE_OFFSET; +uint8_t *buf = NULL; +uint32_t ctl, msi_lo, msi_hi, msi_data, bar_cfg, i; +bool msi_64bit; + +/* retore pci bar configuration */ +ctl = pci_default_read_config(pdev, PCI_COMMAND, 2); +vfio_pci_write_config(pdev, PCI_COMMAND, + ctl & (!(PCI_COMMAND_IO | PCI_COMMAND_MEMORY)), 2); +for (i = 0; i < PCI_ROM_SLOT; i++) { +bar_cfg = qemu_get_be32(f); +vfio_pci_write_config(pdev, PCI_BASE_ADDRESS_0 + i*4, bar_cfg, 4); +} +vfio_pci_write_config(pdev, PCI_COMMAND, + ctl | PCI_COMMAND_IO | PCI_COMMAND_MEMORY, 2); + +/* restore msi configuration */ +ctl = pci_default_read_config(pdev, pdev->msi_cap + PCI_MSI_FLAGS, 2); +msi_64bit = !!(ctl & PCI_MSI_FLAGS_64BIT); + +vfio_pci_write_config(>pdev, + pdev->msi_cap + PCI_MSI_FLAGS, + ctl & (!PCI_MSI_FLAGS_ENABLE), 2); + +msi_lo = qemu_get_be32(f); +vfio_pci_write_config(pdev, pdev->msi_cap + PCI_MSI_ADDRESS_LO, msi_lo, 4); + +if (msi_64bit) { +msi_hi = qemu_get_be32(f); +vfio_pci_write_config(pdev, pdev->msi_cap + PCI_MSI_ADDRESS_HI, msi_hi, 4); +} +msi_data = qemu_get_be32(f); +vfio_pci_write_config(pdev, + pdev->msi_cap + (msi_64bit ? 
PCI_MSI_DATA_64 : PCI_MSI_DATA_32), + msi_data, 2); + +vfio_pci_write_config(>pdev, pdev->msi_cap + PCI_MSI_FLAGS, + ctl | PCI_MSI_FLAGS_ENABLE, 2); + +buf = g_malloc(sz); +if (buf == NULL) { +error_report("vfio: Failed to allocate memory for migrate\n"); +return -1; +} + +qemu_get_buffer(f, buf, sz); +if (pwrite(vdev->vbasedev.fd, buf, sz, + vdev->device_state.offset + VFIO_DEVICE_STATE_OFFSET) != sz) { +error_report("vfio: Failed to write Device State Region\n"); +return -1; +} + +if (buf) + g_free(buf); +return 0; +} + static void vfio_instance_init(Object *obj) { PCIDevice *pci_dev =
[Qemu-devel] [RFC V2 1/4] vfio: introduce a new VFIO sub region for mdev device migration support
New VFIO sub region VFIO_REGION_SUBTYPE_INTEL_IGD_DEVICE_STATE is added to fetch and restore the status of mdev device vGPU during the live migration. Signed-off-by: Yulei Zhang <yulei.zh...@intel.com> --- hw/vfio/pci.c | 13 - hw/vfio/pci.h | 1 + linux-headers/linux/vfio.h | 7 --- 3 files changed, 17 insertions(+), 4 deletions(-) diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c index 03a3d01..21a5cef 100644 --- a/hw/vfio/pci.c +++ b/hw/vfio/pci.c @@ -37,6 +37,7 @@ static void vfio_disable_interrupts(VFIOPCIDevice *vdev); static void vfio_mmap_set_enabled(VFIOPCIDevice *vdev, bool enabled); +static VMStateDescription vfio_pci_vmstate; /* * Disabling BAR mmaping can be slow, but toggling it around INTx can @@ -2792,6 +2793,16 @@ static void vfio_realize(PCIDevice *pdev, Error **errp) vfio_vga_quirk_setup(vdev); } +struct vfio_region_info *device_state; +/* device state region setup */ +if (!vfio_get_dev_region_info(>vbasedev, +VFIO_REGION_TYPE_PCI_VENDOR_TYPE | PCI_VENDOR_ID_INTEL, +VFIO_REGION_SUBTYPE_INTEL_IGD_DEVICE_STATE, _state)) { + memcpy(>device_state, device_state, sizeof(struct vfio_region_info)); + g_free(device_state); +vfio_pci_vmstate.unmigratable = 0; +} + for (i = 0; i < PCI_ROM_SLOT; i++) { vfio_bar_quirk_setup(vdev, i); } @@ -2973,7 +2984,7 @@ static Property vfio_pci_dev_properties[] = { DEFINE_PROP_END_OF_LIST(), }; -static const VMStateDescription vfio_pci_vmstate = { +static VMStateDescription vfio_pci_vmstate = { .name = "vfio-pci", .unmigratable = 1, }; diff --git a/hw/vfio/pci.h b/hw/vfio/pci.h index a8366bb..6a1d26e 100644 --- a/hw/vfio/pci.h +++ b/hw/vfio/pci.h @@ -116,6 +116,7 @@ typedef struct VFIOPCIDevice { VFIOBAR bars[PCI_NUM_REGIONS - 1]; /* No ROM */ VFIOVGA *vga; /* 0xa, 0x3b0, 0x3c0 */ void *igd_opregion; +struct vfio_region_info device_state; PCIHostDeviceAddress host; EventNotifier err_notifier; EventNotifier req_notifier; diff --git a/linux-headers/linux/vfio.h b/linux-headers/linux/vfio.h index 531cb2e..e2c53bf 100644 --- 
a/linux-headers/linux/vfio.h +++ b/linux-headers/linux/vfio.h @@ -294,9 +294,10 @@ struct vfio_region_info_cap_type { #define VFIO_REGION_TYPE_PCI_VENDOR_MASK (0x) /* 8086 Vendor sub-types */ -#define VFIO_REGION_SUBTYPE_INTEL_IGD_OPREGION (1) -#define VFIO_REGION_SUBTYPE_INTEL_IGD_HOST_CFG (2) -#define VFIO_REGION_SUBTYPE_INTEL_IGD_LPC_CFG (3) +#define VFIO_REGION_SUBTYPE_INTEL_IGD_OPREGION (1) +#define VFIO_REGION_SUBTYPE_INTEL_IGD_HOST_CFG (2) +#define VFIO_REGION_SUBTYPE_INTEL_IGD_LPC_CFG (3) +#define VFIO_REGION_SUBTYPE_INTEL_IGD_DEVICE_STATE (4) /** * VFIO_DEVICE_GET_IRQ_INFO - _IOWR(VFIO_TYPE, VFIO_BASE + 9, -- 2.7.4
[Qemu-devel] [RFC V2 0/4] vfio: Introduce Live migration capability to vfio_mdev device
Summary This series RFC would like to introduce the live migration capability to vfio_mdev device. As currently vfio_mdev device don't support migration, we introduce a new vfio subtype region VFIO_REGION_SUBTYPE_INTEL_IGD_DEVICE_STATE for Intel vGPU device, during the vfio device initialization, the mdev device will be set to migratable if the new region exist. The intention to add the new region is using it for vfio_mdev device status save and restore during the migration. The access to this region will be trapped and forward to the vfio_mdev device driver. And we use the first byte in the new region to control the running state of mdev device. Meanwhile we add one new ioctl VFIO_IOMMU_GET_DIRTY_BITMAP to help do the mdev device dirty page synchronization. So the vfio_mdev device migration sequence would be Source VM side: start migration | V get the cpu state change callback, write to the subregion's first byte to stop the mdev device | V quary the dirty page bitmap from iommu container and add into qemu dirty list for synchronization | V save the deivce status into Qemufile which is read from the vfio device subregion Target VM side: restore the mdev device after get the saved status context from Qemufile | V get the cpu state change callback write to subregion's first byte to start the mdev device to put it in running status | V finish migration V1->V2: Per Alex's suggestion: 1. use device subtype region instead of VFIO PCI fixed region. 2. remove unnecessary ioctl, use the first byte of subregion to control the running state of mdev device. 3. for dirty page synchronization, implement the interface with VFIOContainer instead of vfio pci device. 
Yulei Zhang (4): vfio: introduce a new VFIO sub region for mdev device migration support vfio: Add vm status change callback to stop/restart the mdev device vfio: Add struct vfio_vmstate_info to introduce put/get callback funtion for vfio device status save/restore vifo: introduce new VFIO ioctl VFIO_IOMMU_GET_DIRTY_BITMAP hw/vfio/common.c | 32 + hw/vfio/pci.c | 164 +- hw/vfio/pci.h | 1 + include/hw/vfio/vfio-common.h | 1 + linux-headers/linux/vfio.h| 26 ++- 5 files changed, 220 insertions(+), 4 deletions(-) -- 2.7.4
[Qemu-devel] [Intel-gfx][RFC 7/9] drm/i915/gvt: Introduce new VFIO ioctl for device status control
Add handling for new VFIO ioctl VFIO_DEVICE_PCI_STATUS_SET to control the status of mdev device vGPU. vGPU will stop/start rendering according to the command comes along with the ioctl. Signed-off-by: Yulei Zhang <yulei.zh...@intel.com> --- drivers/gpu/drm/i915/gvt/kvmgt.c | 9 + drivers/gpu/drm/i915/gvt/vgpu.c | 1 + include/uapi/linux/vfio.h| 15 +++ 3 files changed, 25 insertions(+) diff --git a/drivers/gpu/drm/i915/gvt/kvmgt.c b/drivers/gpu/drm/i915/gvt/kvmgt.c index c44b319..ac327f7 100644 --- a/drivers/gpu/drm/i915/gvt/kvmgt.c +++ b/drivers/gpu/drm/i915/gvt/kvmgt.c @@ -1147,6 +1147,15 @@ static long intel_vgpu_ioctl(struct mdev_device *mdev, unsigned int cmd, } else if (cmd == VFIO_DEVICE_RESET) { intel_gvt_ops->vgpu_reset(vgpu); return 0; + } else if (cmd == VFIO_DEVICE_PCI_STATUS_SET) { + struct vfio_pci_status_set status; + minsz = offsetofend(struct vfio_pci_status_set, flags); + if (copy_from_user(, (void __user *)arg, minsz)) + return -EFAULT; + if (status.flags == VFIO_DEVICE_PCI_STOP) + intel_gvt_ops->vgpu_deactivate(vgpu); + else + intel_gvt_ops->vgpu_activate(vgpu); } return 0; diff --git a/drivers/gpu/drm/i915/gvt/vgpu.c b/drivers/gpu/drm/i915/gvt/vgpu.c index 989f353..542bde9 100644 --- a/drivers/gpu/drm/i915/gvt/vgpu.c +++ b/drivers/gpu/drm/i915/gvt/vgpu.c @@ -205,6 +205,7 @@ void intel_gvt_activate_vgpu(struct intel_vgpu *vgpu) { mutex_lock(>gvt->lock); vgpu->active = true; + intel_vgpu_start_schedule(vgpu); mutex_unlock(>gvt->lock); } diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h index 9ad9ce1..4bb057d 100644 --- a/include/uapi/linux/vfio.h +++ b/include/uapi/linux/vfio.h @@ -503,6 +503,21 @@ struct vfio_pci_hot_reset { #define VFIO_DEVICE_PCI_HOT_RESET _IO(VFIO_TYPE, VFIO_BASE + 13) +/** + * VFIO_DEVICE_PCI_STATUS_SET - _IOW(VFIO_TYPE, VFIO_BASE + 14, + * struct vfio_pci_status_set) + * + * Return: 0 on success, -errno on failure. 
+ */ +struct vfio_pci_status_set{ + __u32 argsz; + __u32 flags; +#define VFIO_DEVICE_PCI_STOP (1 << 0) +#define VFIO_DEVICE_PCI_START (1 << 1) +}; + +#define VFIO_DEVICE_PCI_STATUS_SET _IO(VFIO_TYPE, VFIO_BASE + 14) + /* API for Type1 VFIO IOMMU */ /** -- 2.7.4
[Qemu-devel] [Intel-gfx][RFC 5/9] drm/i915/gvt: Align the guest gm aperture start offset for live migration
As guest gm aperture region start offset is initialized when vGPU created, in order to make sure that start offset is remain the same after migration, align the aperture start offset to 0 for guest. Signed-off-by: Yulei Zhang <yulei.zh...@intel.com> --- drivers/gpu/drm/i915/gvt/kvmgt.c | 3 +-- drivers/gpu/drm/i915/gvt/vgpu.c | 7 +-- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/drivers/gpu/drm/i915/gvt/kvmgt.c b/drivers/gpu/drm/i915/gvt/kvmgt.c index 1ae0b40..d2b13ae 100644 --- a/drivers/gpu/drm/i915/gvt/kvmgt.c +++ b/drivers/gpu/drm/i915/gvt/kvmgt.c @@ -1002,8 +1002,7 @@ static long intel_vgpu_ioctl(struct mdev_device *mdev, unsigned int cmd, sparse->nr_areas = nr_areas; cap_type_id = VFIO_REGION_INFO_CAP_SPARSE_MMAP; - sparse->areas[0].offset = - PAGE_ALIGN(vgpu_aperture_offset(vgpu)); + sparse->areas[0].offset = 0; sparse->areas[0].size = vgpu_aperture_sz(vgpu); break; diff --git a/drivers/gpu/drm/i915/gvt/vgpu.c b/drivers/gpu/drm/i915/gvt/vgpu.c index 90c14e6..989f353 100644 --- a/drivers/gpu/drm/i915/gvt/vgpu.c +++ b/drivers/gpu/drm/i915/gvt/vgpu.c @@ -43,8 +43,7 @@ void populate_pvinfo_page(struct intel_vgpu *vgpu) vgpu_vreg(vgpu, vgtif_reg(version_minor)) = 0; vgpu_vreg(vgpu, vgtif_reg(display_ready)) = 0; vgpu_vreg(vgpu, vgtif_reg(vgt_id)) = vgpu->id; - vgpu_vreg(vgpu, vgtif_reg(avail_rs.mappable_gmadr.base)) = - vgpu_aperture_gmadr_base(vgpu); + vgpu_vreg(vgpu, vgtif_reg(avail_rs.mappable_gmadr.base)) = 0; vgpu_vreg(vgpu, vgtif_reg(avail_rs.mappable_gmadr.size)) = vgpu_aperture_sz(vgpu); vgpu_vreg(vgpu, vgtif_reg(avail_rs.nonmappable_gmadr.base)) = @@ -480,6 +479,8 @@ void intel_gvt_reset_vgpu_locked(struct intel_vgpu *vgpu, bool dmlr, { struct intel_gvt *gvt = vgpu->gvt; struct intel_gvt_workload_scheduler *scheduler = >scheduler; + u64 maddr = vgpu_vreg(vgpu, vgtif_reg(avail_rs.mappable_gmadr.base)); + u64 unmaddr = vgpu_vreg(vgpu, vgtif_reg(avail_rs.nonmappable_gmadr.base)); gvt_dbg_core("--\n"); gvt_dbg_core("resseting vgpu%d, dmlr %d, 
engine_mask %08x\n", @@ -510,6 +511,8 @@ void intel_gvt_reset_vgpu_locked(struct intel_vgpu *vgpu, bool dmlr, intel_vgpu_reset_mmio(vgpu, dmlr); populate_pvinfo_page(vgpu); + vgpu_vreg(vgpu, vgtif_reg(avail_rs.mappable_gmadr.base)) = maddr; + vgpu_vreg(vgpu, vgtif_reg(avail_rs.nonmappable_gmadr.base)) = unmaddr; intel_vgpu_reset_display(vgpu); if (dmlr) { -- 2.7.4
[Qemu-devel] [Intel-gfx][RFC 9/9] drm/i915/gvt: Add support to VFIO region VFIO_PCI_DEVICE_STATE_REGION_INDEX
Add new VFIO region VFIO_PCI_DEVICE_STATE_REGION_INDEX support in vGPU, through this new region it can fetch the status from mdev device for migration, on the target side it can retrieve the device status and reconfigure the device to continue running after resume the guest. Signed-off-by: Yulei Zhang <yulei.zh...@intel.com> --- drivers/gpu/drm/i915/gvt/Makefile | 2 +- drivers/gpu/drm/i915/gvt/gvt.c | 1 + drivers/gpu/drm/i915/gvt/gvt.h | 5 + drivers/gpu/drm/i915/gvt/kvmgt.c | 19 + drivers/gpu/drm/i915/gvt/migrate.c | 715 + drivers/gpu/drm/i915/gvt/migrate.h | 82 + drivers/gpu/drm/i915/gvt/mmio.c| 14 + drivers/gpu/drm/i915/gvt/mmio.h| 1 + include/uapi/linux/vfio.h | 3 +- 9 files changed, 840 insertions(+), 2 deletions(-) create mode 100644 drivers/gpu/drm/i915/gvt/migrate.c create mode 100644 drivers/gpu/drm/i915/gvt/migrate.h diff --git a/drivers/gpu/drm/i915/gvt/Makefile b/drivers/gpu/drm/i915/gvt/Makefile index f5486cb9..a7e2e34 100644 --- a/drivers/gpu/drm/i915/gvt/Makefile +++ b/drivers/gpu/drm/i915/gvt/Makefile @@ -1,7 +1,7 @@ GVT_DIR := gvt GVT_SOURCE := gvt.o aperture_gm.o handlers.o vgpu.o trace_points.o firmware.o \ interrupt.o gtt.o cfg_space.o opregion.o mmio.o display.o edid.o \ - execlist.o scheduler.o sched_policy.o render.o cmd_parser.o + execlist.o scheduler.o sched_policy.o render.o cmd_parser.o migrate.o ccflags-y += -I$(src) -I$(src)/$(GVT_DIR) i915-y += $(addprefix $(GVT_DIR)/, $(GVT_SOURCE)) diff --git a/drivers/gpu/drm/i915/gvt/gvt.c b/drivers/gpu/drm/i915/gvt/gvt.c index c27c683..e40af70 100644 --- a/drivers/gpu/drm/i915/gvt/gvt.c +++ b/drivers/gpu/drm/i915/gvt/gvt.c @@ -54,6 +54,7 @@ static const struct intel_gvt_ops intel_gvt_ops = { .vgpu_reset = intel_gvt_reset_vgpu, .vgpu_activate = intel_gvt_activate_vgpu, .vgpu_deactivate = intel_gvt_deactivate_vgpu, + .vgpu_save_restore = intel_gvt_save_restore, }; /** diff --git a/drivers/gpu/drm/i915/gvt/gvt.h b/drivers/gpu/drm/i915/gvt/gvt.h index 23eeb7c..12aa3b8 100644 --- 
a/drivers/gpu/drm/i915/gvt/gvt.h +++ b/drivers/gpu/drm/i915/gvt/gvt.h @@ -46,6 +46,7 @@ #include "sched_policy.h" #include "render.h" #include "cmd_parser.h" +#include "migrate.h" #define GVT_MAX_VGPU 8 @@ -431,6 +432,8 @@ void intel_gvt_reset_vgpu_locked(struct intel_vgpu *vgpu, bool dmlr, void intel_gvt_reset_vgpu(struct intel_vgpu *vgpu); void intel_gvt_activate_vgpu(struct intel_vgpu *vgpu); void intel_gvt_deactivate_vgpu(struct intel_vgpu *vgpu); +int intel_gvt_save_restore(struct intel_vgpu *vgpu, char *buf, + size_t count, uint64_t off, bool restore); /* validating GM functions */ #define vgpu_gmadr_is_aperture(vgpu, gmadr) \ @@ -513,6 +516,8 @@ struct intel_gvt_ops { void (*vgpu_reset)(struct intel_vgpu *); void (*vgpu_activate)(struct intel_vgpu *); void (*vgpu_deactivate)(struct intel_vgpu *); + int (*vgpu_save_restore)(struct intel_vgpu *, char *buf, + size_t count, uint64_t off, bool restore); }; diff --git a/drivers/gpu/drm/i915/gvt/kvmgt.c b/drivers/gpu/drm/i915/gvt/kvmgt.c index e9f11a9..d4ede29 100644 --- a/drivers/gpu/drm/i915/gvt/kvmgt.c +++ b/drivers/gpu/drm/i915/gvt/kvmgt.c @@ -670,6 +670,9 @@ static ssize_t intel_vgpu_rw(struct mdev_device *mdev, char *buf, bar0_start + pos, buf, count); } break; + case VFIO_PCI_DEVICE_STATE_REGION_INDEX: + ret = intel_gvt_ops->vgpu_save_restore(vgpu, buf, count, pos, is_write); + break; case VFIO_PCI_BAR2_REGION_INDEX: case VFIO_PCI_BAR3_REGION_INDEX: case VFIO_PCI_BAR4_REGION_INDEX: @@ -688,6 +691,10 @@ static ssize_t intel_vgpu_read(struct mdev_device *mdev, char __user *buf, { unsigned int done = 0; int ret; + unsigned int index = VFIO_PCI_OFFSET_TO_INDEX(*ppos); + + if (index == VFIO_PCI_DEVICE_STATE_REGION_INDEX) + return intel_vgpu_rw(mdev, (char *)buf, count, ppos, false); while (count) { size_t filled; @@ -748,6 +755,10 @@ static ssize_t intel_vgpu_write(struct mdev_device *mdev, { unsigned int done = 0; int ret; + unsigned int index = VFIO_PCI_OFFSET_TO_INDEX(*ppos); + + if (index == 
VFIO_PCI_DEVICE_STATE_REGION_INDEX) + return intel_vgpu_rw(mdev, (char *)buf, count, ppos, true); while (count) { size_t filled; @@ -1037,6 +1048,14 @@ static long intel_vgpu_ioctl(struct mdev_device *mdev, unsigned int cmd, case VFIO_PCI_VGA_REGION_INDEX: gvt_dbg_core("get region info index:%d\n", info.
[Qemu-devel] [Intel-gfx][RFC 3/9] drm/i915/gvt: Adjust the gma parameter in gpu commands during command parser
Adjust the gma parameter in gpu commands according to the shift offset in guests' aperture and hidden gm address, and patch the commands before submit to execute. Signed-off-by: Yulei Zhang <yulei.zh...@intel.com> --- drivers/gpu/drm/i915/gvt/cmd_parser.c | 26 ++ 1 file changed, 18 insertions(+), 8 deletions(-) diff --git a/drivers/gpu/drm/i915/gvt/cmd_parser.c b/drivers/gpu/drm/i915/gvt/cmd_parser.c index 51241de5..540ee42 100644 --- a/drivers/gpu/drm/i915/gvt/cmd_parser.c +++ b/drivers/gpu/drm/i915/gvt/cmd_parser.c @@ -922,7 +922,7 @@ static int cmd_handler_lrr(struct parser_exec_state *s) } static inline int cmd_address_audit(struct parser_exec_state *s, - unsigned long guest_gma, int op_size, bool index_mode); + unsigned long guest_gma, int op_size, bool index_mode, int offset); static int cmd_handler_lrm(struct parser_exec_state *s) { @@ -942,7 +942,7 @@ static int cmd_handler_lrm(struct parser_exec_state *s) gma = cmd_gma(s, i + 1); if (gmadr_bytes == 8) gma |= (cmd_gma_hi(s, i + 2)) << 32; - ret |= cmd_address_audit(s, gma, sizeof(u32), false); + ret |= cmd_address_audit(s, gma, sizeof(u32), false, i + 1); } i += gmadr_dw_number(s) + 1; } @@ -962,7 +962,7 @@ static int cmd_handler_srm(struct parser_exec_state *s) gma = cmd_gma(s, i + 1); if (gmadr_bytes == 8) gma |= (cmd_gma_hi(s, i + 2)) << 32; - ret |= cmd_address_audit(s, gma, sizeof(u32), false); + ret |= cmd_address_audit(s, gma, sizeof(u32), false, i + 1); } i += gmadr_dw_number(s) + 1; } @@ -1032,7 +1032,7 @@ static int cmd_handler_pipe_control(struct parser_exec_state *s) if (cmd_val(s, 1) & (1 << 21)) index_mode = true; ret |= cmd_address_audit(s, gma, sizeof(u64), - index_mode); + index_mode, 2); } } } @@ -1364,10 +1364,12 @@ static unsigned long get_gma_bb_from_cmd(struct parser_exec_state *s, int index) } static inline int cmd_address_audit(struct parser_exec_state *s, - unsigned long guest_gma, int op_size, bool index_mode) + unsigned long guest_gma, int op_size, bool index_mode, int offset) { 
struct intel_vgpu *vgpu = s->vgpu; u32 max_surface_size = vgpu->gvt->device_info.max_surface_size; + int gmadr_bytes = vgpu->gvt->device_info.gmadr_bytes_in_cmd; + u64 host_gma; int i; int ret; @@ -1387,6 +1389,14 @@ static inline int cmd_address_audit(struct parser_exec_state *s, guest_gma + op_size - 1))) { ret = -EINVAL; goto err; + } else + intel_gvt_ggtt_gmadr_g2h(vgpu, guest_gma, _gma); + + if (offset > 0) { + patch_value(s, cmd_ptr(s, offset), host_gma & GENMASK(31, 2)); + if (gmadr_bytes == 8) + patch_value(s, cmd_ptr(s, offset + 1), + (host_gma >> 32) & GENMASK(15, 0)); } return 0; err: @@ -1429,7 +1439,7 @@ static int cmd_handler_mi_store_data_imm(struct parser_exec_state *s) gma = (gma_high << 32) | gma_low; core_id = (cmd_val(s, 1) & (1 << 0)) ? 1 : 0; } - ret = cmd_address_audit(s, gma + op_size * core_id, op_size, false); + ret = cmd_address_audit(s, gma + op_size * core_id, op_size, false, 1); return ret; } @@ -1473,7 +1483,7 @@ static int cmd_handler_mi_op_2f(struct parser_exec_state *s) gma_high = cmd_val(s, 2) & GENMASK(15, 0); gma = (gma_high << 32) | gma; } - ret = cmd_address_audit(s, gma, op_size, false); + ret = cmd_address_audit(s, gma, op_size, false, 1); return ret; } @@ -1513,7 +1523,7 @@ static int cmd_handler_mi_flush_dw(struct parser_exec_state *s) /* Store Data Index */ if (cmd_val(s, 0) & (1 << 21)) index_mode = true; - ret = cmd_address_audit(s, gma, sizeof(u64), index_mode); + ret = cmd_address_audit(s, (gma | (1 << 2)), sizeof(u64), index_mode, 1); } /* Check notify bit */ if ((cmd_val(s, 0) & (1 << 8))) -- 2.7.4
[Qemu-devel] [Intel-gfx][RFC 4/9] drm/i915/gvt: Retrieve the guest gm base address from PVINFO
As after migration the host gm base address will be changed due to resource re-allocation, in order to make sure the guest gm address doesn't change with that to retrieve the guest gm base address from PVINFO. Signed-off-by: Yulei Zhang <yulei.zh...@intel.com> --- drivers/gpu/drm/i915/gvt/cfg_space.c | 3 ++- drivers/gpu/drm/i915/gvt/gtt.c | 8 drivers/gpu/drm/i915/gvt/gvt.h | 22 ++ 3 files changed, 24 insertions(+), 9 deletions(-) diff --git a/drivers/gpu/drm/i915/gvt/cfg_space.c b/drivers/gpu/drm/i915/gvt/cfg_space.c index 40af17e..b57ae44 100644 --- a/drivers/gpu/drm/i915/gvt/cfg_space.c +++ b/drivers/gpu/drm/i915/gvt/cfg_space.c @@ -33,6 +33,7 @@ #include "i915_drv.h" #include "gvt.h" +#include "i915_pvinfo.h" enum { INTEL_GVT_PCI_BAR_GTTMMIO = 0, @@ -123,7 +124,7 @@ static int map_aperture(struct intel_vgpu *vgpu, bool map) else val = *(u32 *)(vgpu_cfg_space(vgpu) + PCI_BASE_ADDRESS_2); - first_gfn = (val + vgpu_aperture_offset(vgpu)) >> PAGE_SHIFT; + first_gfn = (val + vgpu_guest_aperture_offset(vgpu)) >> PAGE_SHIFT; first_mfn = vgpu_aperture_pa_base(vgpu) >> PAGE_SHIFT; ret = intel_gvt_hypervisor_map_gfn_to_mfn(vgpu, first_gfn, diff --git a/drivers/gpu/drm/i915/gvt/gtt.c b/drivers/gpu/drm/i915/gvt/gtt.c index df596a6..e9a127c 100644 --- a/drivers/gpu/drm/i915/gvt/gtt.c +++ b/drivers/gpu/drm/i915/gvt/gtt.c @@ -64,10 +64,10 @@ int intel_gvt_ggtt_gmadr_g2h(struct intel_vgpu *vgpu, u64 g_addr, u64 *h_addr) if (vgpu_gmadr_is_aperture(vgpu, g_addr)) *h_addr = vgpu_aperture_gmadr_base(vgpu) - + (g_addr - vgpu_aperture_offset(vgpu)); + + (g_addr - vgpu_guest_aperture_gmadr_base(vgpu)); else *h_addr = vgpu_hidden_gmadr_base(vgpu) - + (g_addr - vgpu_hidden_offset(vgpu)); + + (g_addr - vgpu_guest_hidden_gmadr_base(vgpu)); return 0; } @@ -79,10 +79,10 @@ int intel_gvt_ggtt_gmadr_h2g(struct intel_vgpu *vgpu, u64 h_addr, u64 *g_addr) return -EACCES; if (gvt_gmadr_is_aperture(vgpu->gvt, h_addr)) - *g_addr = vgpu_aperture_gmadr_base(vgpu) + *g_addr = 
vgpu_guest_aperture_gmadr_base(vgpu) + (h_addr - gvt_aperture_gmadr_base(vgpu->gvt)); else - *g_addr = vgpu_hidden_gmadr_base(vgpu) + *g_addr = vgpu_guest_hidden_gmadr_base(vgpu) + (h_addr - gvt_hidden_gmadr_base(vgpu->gvt)); return 0; } diff --git a/drivers/gpu/drm/i915/gvt/gvt.h b/drivers/gpu/drm/i915/gvt/gvt.h index 71c00b2..23eeb7c 100644 --- a/drivers/gpu/drm/i915/gvt/gvt.h +++ b/drivers/gpu/drm/i915/gvt/gvt.h @@ -343,6 +343,20 @@ int intel_gvt_load_firmware(struct intel_gvt *gvt); #define vgpu_fence_base(vgpu) (vgpu->fence.base) #define vgpu_fence_sz(vgpu) (vgpu->fence.size) +/* Aperture/GM space definitions for vGPU Guest view point */ +#define vgpu_guest_aperture_offset(vgpu) \ + vgpu_vreg(vgpu, vgtif_reg(avail_rs.mappable_gmadr.base)) +#define vgpu_guest_hidden_offset(vgpu) \ + vgpu_vreg(vgpu, vgtif_reg(avail_rs.nonmappable_gmadr.base)) + +#define vgpu_guest_aperture_gmadr_base(vgpu) (vgpu_guest_aperture_offset(vgpu)) +#define vgpu_guest_aperture_gmadr_end(vgpu) \ + (vgpu_guest_aperture_gmadr_base(vgpu) + vgpu_aperture_sz(vgpu) - 1) + +#define vgpu_guest_hidden_gmadr_base(vgpu) (vgpu_guest_hidden_offset(vgpu)) +#define vgpu_guest_hidden_gmadr_end(vgpu) \ + (vgpu_guest_hidden_gmadr_base(vgpu) + vgpu_hidden_sz(vgpu) - 1) + struct intel_vgpu_creation_params { __u64 handle; __u64 low_gm_sz; /* in MB */ @@ -420,12 +434,12 @@ void intel_gvt_deactivate_vgpu(struct intel_vgpu *vgpu); /* validating GM functions */ #define vgpu_gmadr_is_aperture(vgpu, gmadr) \ - ((gmadr >= vgpu_aperture_gmadr_base(vgpu)) && \ -(gmadr <= vgpu_aperture_gmadr_end(vgpu))) + ((gmadr >= vgpu_guest_aperture_gmadr_base(vgpu)) && \ +(gmadr <= vgpu_guest_aperture_gmadr_end(vgpu))) #define vgpu_gmadr_is_hidden(vgpu, gmadr) \ - ((gmadr >= vgpu_hidden_gmadr_base(vgpu)) && \ -(gmadr <= vgpu_hidden_gmadr_end(vgpu))) + ((gmadr >= vgpu_guest_hidden_gmadr_base(vgpu)) && \ +(gmadr <= vgpu_guest_hidden_gmadr_end(vgpu))) #define vgpu_gmadr_is_valid(vgpu, gmadr) \ ((vgpu_gmadr_is_aperture(vgpu, gmadr) || 
\ -- 2.7.4
[Qemu-devel] [Intel-gfx][RFC 0/9] drm/i915/gvt: Add the live migration support to VFIO mdev device - Intel vGPU
This series RFC patches give a sample about how to enable the live migration on vfio mdev deivce with the new introduced vfio interface and vfio device status region. In order to fulfill the migration requirement we add the following modifications to the mdev device driver. 1. Add the guest to host graphics address adjustment when guest try to access gma through mmio or graphics commands, so after migraiton the guest view of graphics address will remain the same. 2. Add handler for VFIO new ioctls to contorl the device stop/start and fetch the dirty page bitmap from device model. 3. Implement the function to save/retore the device context, which is accessed through VFIO new region VFIO_PCI_DEVICE_STATE_REGION_INDEX to transfer device status during the migration. Yulei Zhang (9): drm/i915/gvt: Apply g2h adjust for GTT mmio access drm/i915/gvt: Apply g2h adjustment during fence mmio access drm/i915/gvt: Adjust the gma parameter in gpu commands during command parser drm/i915/gvt: Retrieve the guest gm base address from PVINFO drm/i915/gvt: Align the guest gm aperture start offset for live migration drm/i915/gvt: Introduce new flag to indicate migration capability drm/i915/gvt: Introduce new VFIO ioctl for device status control drm/i915/gvt: Introduce new VFIO ioctl for mdev device dirty page sync drm/i915/gvt: Add support to VFIO region VFIO_PCI_DEVICE_STATE_REGION_INDEX drivers/gpu/drm/i915/gvt/Makefile | 2 +- drivers/gpu/drm/i915/gvt/aperture_gm.c | 6 +- drivers/gpu/drm/i915/gvt/cfg_space.c | 3 +- drivers/gpu/drm/i915/gvt/cmd_parser.c | 26 +- drivers/gpu/drm/i915/gvt/gtt.c | 19 +- drivers/gpu/drm/i915/gvt/gvt.c | 1 + drivers/gpu/drm/i915/gvt/gvt.h | 41 +- drivers/gpu/drm/i915/gvt/kvmgt.c | 65 ++- drivers/gpu/drm/i915/gvt/migrate.c | 715 + drivers/gpu/drm/i915/gvt/migrate.h | 82 drivers/gpu/drm/i915/gvt/mmio.c| 14 + drivers/gpu/drm/i915/gvt/mmio.h| 1 + drivers/gpu/drm/i915/gvt/vgpu.c| 8 +- include/uapi/linux/vfio.h | 33 +- 14 files changed, 984 insertions(+), 32 
deletions(-) create mode 100644 drivers/gpu/drm/i915/gvt/migrate.c create mode 100644 drivers/gpu/drm/i915/gvt/migrate.h -- 2.7.4
[Qemu-devel] [Intel-gfx][RFC 6/9] drm/i915/gvt: Introduce new flag to indicate migration capability
New device flag VFIO_DEVICE_FLAGS_MIGRATABLE is added for vfio mdev device vGPU to claim the capability for live migration. Signed-off-by: Yulei Zhang <yulei.zh...@intel.com> --- drivers/gpu/drm/i915/gvt/kvmgt.c | 1 + include/uapi/linux/vfio.h| 1 + 2 files changed, 2 insertions(+) diff --git a/drivers/gpu/drm/i915/gvt/kvmgt.c b/drivers/gpu/drm/i915/gvt/kvmgt.c index d2b13ae..c44b319 100644 --- a/drivers/gpu/drm/i915/gvt/kvmgt.c +++ b/drivers/gpu/drm/i915/gvt/kvmgt.c @@ -940,6 +940,7 @@ static long intel_vgpu_ioctl(struct mdev_device *mdev, unsigned int cmd, info.flags = VFIO_DEVICE_FLAGS_PCI; info.flags |= VFIO_DEVICE_FLAGS_RESET; + info.flags |= VFIO_DEVICE_FLAGS_MIGRATABLE; info.num_regions = VFIO_PCI_NUM_REGIONS; info.num_irqs = VFIO_PCI_NUM_IRQS; diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h index ae46105..9ad9ce1 100644 --- a/include/uapi/linux/vfio.h +++ b/include/uapi/linux/vfio.h @@ -199,6 +199,7 @@ struct vfio_device_info { #define VFIO_DEVICE_FLAGS_PLATFORM (1 << 2)/* vfio-platform device */ #define VFIO_DEVICE_FLAGS_AMBA (1 << 3) /* vfio-amba device */ #define VFIO_DEVICE_FLAGS_CCW (1 << 4)/* vfio-ccw device */ +#define VFIO_DEVICE_FLAGS_MIGRATABLE (1 << 5) /* Device supports migrate */ __u32 num_regions;/* Max region index + 1 */ __u32 num_irqs; /* Max IRQ index + 1 */ }; -- 2.7.4
[Qemu-devel] [Intel-gfx][RFC 1/9] drm/i915/gvt: Apply g2h adjust for GTT mmio access
Apply guest to host gma conversion while guest try to access the GTT mmio registers, as after enable live migration the host gma will be changed due to the resourece re-allocation, but guest gma should be remaining unchanged, thus g2h conversion is request for it. Signed-off-by: Yulei Zhang <yulei.zh...@intel.com> --- drivers/gpu/drm/i915/gvt/gtt.c | 11 --- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/drivers/gpu/drm/i915/gvt/gtt.c b/drivers/gpu/drm/i915/gvt/gtt.c index 66374db..df596a6 100644 --- a/drivers/gpu/drm/i915/gvt/gtt.c +++ b/drivers/gpu/drm/i915/gvt/gtt.c @@ -59,8 +59,7 @@ bool intel_gvt_ggtt_validate_range(struct intel_vgpu *vgpu, u64 addr, u32 size) /* translate a guest gmadr to host gmadr */ int intel_gvt_ggtt_gmadr_g2h(struct intel_vgpu *vgpu, u64 g_addr, u64 *h_addr) { - if (WARN(!vgpu_gmadr_is_valid(vgpu, g_addr), -"invalid guest gmadr %llx\n", g_addr)) + if (!vgpu_gmadr_is_valid(vgpu, g_addr)) return -EACCES; if (vgpu_gmadr_is_aperture(vgpu, g_addr)) @@ -1819,17 +1818,15 @@ static int emulate_gtt_mmio_write(struct intel_vgpu *vgpu, unsigned int off, struct intel_vgpu_mm *ggtt_mm = vgpu->gtt.ggtt_mm; struct intel_gvt_gtt_pte_ops *ops = gvt->gtt.pte_ops; unsigned long g_gtt_index = off >> info->gtt_entry_size_shift; - unsigned long gma; + unsigned long h_gtt_index; struct intel_gvt_gtt_entry e, m; int ret; if (bytes != 4 && bytes != 8) return -EINVAL; - gma = g_gtt_index << GTT_PAGE_SHIFT; - /* the VM may configure the whole GM space when ballooning is used */ - if (!vgpu_gmadr_is_valid(vgpu, gma)) + if (intel_gvt_ggtt_index_g2h(vgpu, g_gtt_index, _gtt_index)) return 0; ggtt_get_guest_entry(ggtt_mm, , g_gtt_index); @@ -1852,7 +1849,7 @@ static int emulate_gtt_mmio_write(struct intel_vgpu *vgpu, unsigned int off, ops->set_pfn(, gvt->gtt.scratch_ggtt_mfn); } - ggtt_set_shadow_entry(ggtt_mm, , g_gtt_index); + ggtt_set_shadow_entry(ggtt_mm, , h_gtt_index); gtt_invalidate(gvt->dev_priv); ggtt_set_guest_entry(ggtt_mm, , g_gtt_index); 
return 0; -- 2.7.4
[Qemu-devel] [Intel-gfx][RFC 8/9] drm/i915/gvt: Introduce new VFIO ioctl for mdev device dirty page sync
Add new vfio ioctl VFIO_DEVICE_PCI_GET_DIRTY_BITMAP to fetch the dirty page bitmap from mdev device driver for data sync during live migration. Signed-off-by: Yulei Zhang <yulei.zh...@intel.com> --- drivers/gpu/drm/i915/gvt/kvmgt.c | 33 + include/uapi/linux/vfio.h| 14 ++ 2 files changed, 47 insertions(+) diff --git a/drivers/gpu/drm/i915/gvt/kvmgt.c b/drivers/gpu/drm/i915/gvt/kvmgt.c index ac327f7..e9f11a9 100644 --- a/drivers/gpu/drm/i915/gvt/kvmgt.c +++ b/drivers/gpu/drm/i915/gvt/kvmgt.c @@ -919,6 +919,24 @@ static int intel_vgpu_set_irqs(struct intel_vgpu *vgpu, uint32_t flags, return func(vgpu, index, start, count, flags, data); } +static void intel_vgpu_update_dirty_bitmap(struct intel_vgpu *vgpu, u64 start_addr, + u64 page_nr, void *bitmap) +{ + u64 gfn = start_addr >> GTT_PAGE_SHIFT; + struct intel_vgpu_guest_page *p; + int i; + + for (i = 0; i < page_nr; i++) { + hash_for_each_possible(vgpu->gtt.guest_page_hash_table, + p, node, gfn) { + if (p->gfn == gfn) + set_bit(i, bitmap); + } + gfn++; + } + +} + static long intel_vgpu_ioctl(struct mdev_device *mdev, unsigned int cmd, unsigned long arg) { @@ -1156,6 +1174,21 @@ static long intel_vgpu_ioctl(struct mdev_device *mdev, unsigned int cmd, intel_gvt_ops->vgpu_deactivate(vgpu); else intel_gvt_ops->vgpu_activate(vgpu); + } else if (cmd == VFIO_DEVICE_PCI_GET_DIRTY_BITMAP) { + struct vfio_pci_get_dirty_bitmap d; + unsigned long bitmap_sz; + unsigned *bitmap; + minsz = offsetofend(struct vfio_pci_get_dirty_bitmap, page_nr); + if (copy_from_user(, (void __user *)arg, minsz)) + return -EFAULT; + bitmap_sz = (BITS_TO_LONGS(d.page_nr) + 1) * sizeof(unsigned long); + bitmap = vzalloc(bitmap_sz); + intel_vgpu_update_dirty_bitmap(vgpu, d.start_addr, d.page_nr, bitmap); + if (copy_to_user((void __user*)arg + minsz, bitmap, bitmap_sz)) { + vfree(bitmap); + return -EFAULT; + } + vfree(bitmap); } return 0; diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h index 4bb057d..544cf93 100644 --- 
a/include/uapi/linux/vfio.h +++ b/include/uapi/linux/vfio.h @@ -518,6 +518,20 @@ struct vfio_pci_status_set{ #define VFIO_DEVICE_PCI_STATUS_SET _IO(VFIO_TYPE, VFIO_BASE + 14) +/** + * VFIO_DEVICE_PCI_GET_DIRTY_BITMAP - _IOW(VFIO_TYPE, VFIO_BASE + 15, + * struct vfio_pci_get_dirty_bitmap) + * + * Return: 0 on success, -errno on failure. + */ +struct vfio_pci_get_dirty_bitmap{ + __u64 start_addr; + __u64 page_nr; + __u8 dirty_bitmap[]; +}; + +#define VFIO_DEVICE_PCI_GET_DIRTY_BITMAP _IO(VFIO_TYPE, VFIO_BASE + 15) + /* API for Type1 VFIO IOMMU */ /** -- 2.7.4
[Qemu-devel] [Intel-gfx][RFC 2/9] drm/i915/gvt: Apply g2h adjustment during fence mmio access
Apply the guest to host gma conversion while guest config the fence mmio registers due to the host gma change after the migration. Signed-off-by: Yulei Zhang <yulei.zh...@intel.com> --- drivers/gpu/drm/i915/gvt/aperture_gm.c | 6 -- drivers/gpu/drm/i915/gvt/gvt.h | 14 ++ 2 files changed, 18 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/i915/gvt/aperture_gm.c b/drivers/gpu/drm/i915/gvt/aperture_gm.c index ca3d192..cd68ec6 100644 --- a/drivers/gpu/drm/i915/gvt/aperture_gm.c +++ b/drivers/gpu/drm/i915/gvt/aperture_gm.c @@ -144,8 +144,10 @@ void intel_vgpu_write_fence(struct intel_vgpu *vgpu, I915_WRITE(fence_reg_lo, 0); POSTING_READ(fence_reg_lo); - I915_WRITE(fence_reg_hi, upper_32_bits(value)); - I915_WRITE(fence_reg_lo, lower_32_bits(value)); + I915_WRITE(fence_reg_hi, + intel_gvt_reg_g2h(vgpu, upper_32_bits(value), 0xF000)); + I915_WRITE(fence_reg_lo, + intel_gvt_reg_g2h(vgpu, lower_32_bits(value), 0xF000)); POSTING_READ(fence_reg_lo); } diff --git a/drivers/gpu/drm/i915/gvt/gvt.h b/drivers/gpu/drm/i915/gvt/gvt.h index 3a74e79..71c00b2 100644 --- a/drivers/gpu/drm/i915/gvt/gvt.h +++ b/drivers/gpu/drm/i915/gvt/gvt.h @@ -451,6 +451,20 @@ int intel_gvt_ggtt_index_g2h(struct intel_vgpu *vgpu, unsigned long g_index, int intel_gvt_ggtt_h2g_index(struct intel_vgpu *vgpu, unsigned long h_index, unsigned long *g_index); +/* apply guest to host gma convertion in GM registers setting */ +static inline u64 intel_gvt_reg_g2h(struct intel_vgpu *vgpu, + u32 addr, u32 mask) +{ + u64 gma; + + if (addr) { + intel_gvt_ggtt_gmadr_g2h(vgpu, + addr & mask, ); + addr = gma | (addr & (~mask)); + } + return addr; +} + void intel_vgpu_init_cfg_space(struct intel_vgpu *vgpu, bool primary); void intel_vgpu_reset_cfg_space(struct intel_vgpu *vgpu); -- 2.7.4
[Qemu-devel] [RFC 5/5] vfio: introduce new VFIO ioctl VFIO_DEVICE_PCI_GET_DIRTY_BITMAP
New VFIO ioctl VFIO_DEVICE_PCI_GET_DIRTY_BITMAP is used to sync the pci device dirty pages during the migration. Signed-off-by: Yulei Zhang <yulei.zh...@intel.com> --- hw/vfio/pci.c | 32 hw/vfio/pci.h | 2 ++ linux-headers/linux/vfio.h | 14 ++ 3 files changed, 48 insertions(+) diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c index 833cd90..64c851f 100644 --- a/hw/vfio/pci.c +++ b/hw/vfio/pci.c @@ -32,6 +32,7 @@ #include "pci.h" #include "trace.h" #include "qapi/error.h" +#include "exec/ram_addr.h" #define MSIX_CAP_LENGTH 12 @@ -39,6 +40,7 @@ static void vfio_disable_interrupts(VFIOPCIDevice *vdev); static void vfio_mmap_set_enabled(VFIOPCIDevice *vdev, bool enabled); static VMStateDescription vfio_pci_vmstate; static void vfio_vm_change_state_handler(void *pv, int running, RunState state); +static void vfio_log_sync(MemoryListener *listener, MemoryRegionSection *section); /* * Disabling BAR mmaping can be slow, but toggling it around INTx can @@ -2869,6 +2871,11 @@ static void vfio_realize(PCIDevice *pdev, Error **errp) vfio_setup_resetfn_quirk(vdev); qemu_add_vm_change_state_handler(vfio_vm_change_state_handler, vdev); +vdev->vfio_memory_listener = (MemoryListener) { + .log_sync = vfio_log_sync, +}; +memory_listener_register(>vfio_memory_listener, _space_memory); + return; out_teardown: @@ -2964,6 +2971,7 @@ static void vfio_vm_change_state_handler(void *pv, int running, RunState state) if (ioctl(vdev->vbasedev.fd, VFIO_DEVICE_PCI_STATUS_SET, vfio_status)) { error_report("vfio: Failed to %s device\n", running ? "start" : "stop"); } +vdev->device_stop = running ? 
false : true; g_free(vfio_status); } @@ -3079,6 +3087,30 @@ static int vfio_device_get(QEMUFile *f, void *pv, size_t size, VMStateField *fie return 0; } +static void vfio_log_sync(MemoryListener *listener, MemoryRegionSection *section) +{ +VFIOPCIDevice *vdev = container_of(listener, struct VFIOPCIDevice, vfio_memory_listener); + +if (vdev->device_stop) { +struct vfio_pci_get_dirty_bitmap *d; +ram_addr_t size = int128_get64(section->size); +unsigned long page_nr = size >> TARGET_PAGE_BITS; +unsigned long bitmap_size = (BITS_TO_LONGS(page_nr) + 1) * sizeof(unsigned long); +d = g_malloc0(sizeof(*d) + bitmap_size); +d->start_addr = section->offset_within_address_space; +d->page_nr = page_nr; + +if (ioctl(vdev->vbasedev.fd, VFIO_DEVICE_PCI_GET_DIRTY_BITMAP, d)) { +error_report("vfio: Failed to fetch dirty pages for migration\n"); +goto exit; +} +cpu_physical_memory_set_dirty_lebitmap((unsigned long*)>dirty_bitmap, d->start_addr, d->page_nr); + +exit: +g_free(d); +} +} + static void vfio_instance_init(Object *obj) { PCIDevice *pci_dev = PCI_DEVICE(obj); diff --git a/hw/vfio/pci.h b/hw/vfio/pci.h index bd98618..984391d 100644 --- a/hw/vfio/pci.h +++ b/hw/vfio/pci.h @@ -144,6 +144,8 @@ typedef struct VFIOPCIDevice { bool no_kvm_intx; bool no_kvm_msi; bool no_kvm_msix; +bool device_stop; +MemoryListener vfio_memory_listener; } VFIOPCIDevice; uint32_t vfio_pci_read_config(PCIDevice *pdev, uint32_t addr, int len); diff --git a/linux-headers/linux/vfio.h b/linux-headers/linux/vfio.h index fa17848..aa73ee1 100644 --- a/linux-headers/linux/vfio.h +++ b/linux-headers/linux/vfio.h @@ -502,6 +502,20 @@ struct vfio_pci_status_set{ #define VFIO_DEVICE_PCI_STATUS_SET _IO(VFIO_TYPE, VFIO_BASE + 14) +/** + * VFIO_DEVICE_PCI_GET_DIRTY_BITMAP - _IOW(VFIO_TYPE, VFIO_BASE + 15, + * struct vfio_pci_get_dirty_bitmap) + * + * Return: 0 on success, -errno on failure. 
+ */ +struct vfio_pci_get_dirty_bitmap{ + __u64 start_addr; + __u64 page_nr; + __u8 dirty_bitmap[]; +}; + +#define VFIO_DEVICE_PCI_GET_DIRTY_BITMAP _IO(VFIO_TYPE, VFIO_BASE + 15) + /* API for Type1 VFIO IOMMU */ /** -- 2.7.4
[Qemu-devel] [RFC 4/5] vfio: use vfio_device_put/vfio_device_get for device status save/restore
For VFIO pci device status migrate, on the source side with funtion vfio_device_put to save the following states 1. pci configuration space addr0~addr5 2. pci configuration space msi_addr msi_data 3. pci device status fetch from device driver And on the target side with funtion vfio_device_get to restore the same states 1. re-setup the pci bar configuration 2. re-setup the pci device msi configuration 3. restore the pci device status Signed-off-by: Yulei Zhang <yulei.zh...@intel.com> --- hw/vfio/pci.c | 105 +- 1 file changed, 104 insertions(+), 1 deletion(-) diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c index 605a473..833cd90 100644 --- a/hw/vfio/pci.c +++ b/hw/vfio/pci.c @@ -2961,18 +2961,121 @@ static void vfio_vm_change_state_handler(void *pv, int running, RunState state) vfio_status->flags = running ? VFIO_DEVICE_PCI_START : VFIO_DEVICE_PCI_STOP; -ioctl(vdev->vbasedev.fd, VFIO_DEVICE_PCI_STATUS_SET, vfio_status); +if (ioctl(vdev->vbasedev.fd, VFIO_DEVICE_PCI_STATUS_SET, vfio_status)) { +error_report("vfio: Failed to %s device\n", running ? 
"start" : "stop"); +} g_free(vfio_status); } static int vfio_device_put(QEMUFile *f, void *pv, size_t size, VMStateField *field, QJSON *vmdesc) { +VFIOPCIDevice *vdev = pv; +PCIDevice *pdev = >pdev; +VFIORegion *region = >device_state.region; +int sz = region->size; +uint8_t *buf = NULL; +uint32_t msi_cfg, msi_lo, msi_hi, msi_data, bar_cfg, i; +bool msi_64bit; + +for (i = 0; i < PCI_ROM_SLOT; i++) { +bar_cfg = pci_default_read_config(pdev, PCI_BASE_ADDRESS_0 + i*4, 4); +qemu_put_be32(f, bar_cfg); +} + +msi_cfg = pci_default_read_config(pdev, pdev->msi_cap + PCI_MSI_FLAGS, 2); +msi_64bit = !!(msi_cfg & PCI_MSI_FLAGS_64BIT); + +msi_lo = pci_default_read_config(pdev, pdev->msi_cap + PCI_MSI_ADDRESS_LO, 4); +qemu_put_be32(f, msi_lo); + +if (msi_64bit) { +msi_hi = pci_default_read_config(pdev, pdev->msi_cap + PCI_MSI_ADDRESS_HI, 4); +qemu_put_be32(f, msi_hi); +} + +msi_data = pci_default_read_config(pdev, + pdev->msi_cap + (msi_64bit ? PCI_MSI_DATA_64 : PCI_MSI_DATA_32), 2); +qemu_put_be32(f, msi_data); + +buf = g_malloc(sz); +if (buf == NULL) { +error_report("vfio: Failed to allocate memory for migrate\n"); +goto exit; +} + +if (pread(vdev->vbasedev.fd, buf, sz, region->fd_offset) != sz) { +error_report("vfio: Failed to read Device State Region\n"); +goto exit; +} + +qemu_put_buffer(f, buf, sz); + +exit: +if (buf) +g_free(buf); + return 0; } static int vfio_device_get(QEMUFile *f, void *pv, size_t size, VMStateField *field) { +VFIOPCIDevice *vdev = pv; +PCIDevice *pdev = >pdev; +VFIORegion *region = >device_state.region; +int sz = region->size; +uint8_t *buf = NULL; +uint32_t ctl, msi_lo, msi_hi, msi_data, bar_cfg, i; +bool msi_64bit; + +/* retore pci bar configuration */ +ctl = pci_default_read_config(pdev, PCI_COMMAND, 2); +vfio_pci_write_config(pdev, PCI_COMMAND, + ctl & (!(PCI_COMMAND_IO | PCI_COMMAND_MEMORY)), 2); +for (i = 0; i < PCI_ROM_SLOT; i++) { +bar_cfg = qemu_get_be32(f); +vfio_pci_write_config(pdev, PCI_BASE_ADDRESS_0 + i*4, bar_cfg, 4); +} 
+vfio_pci_write_config(pdev, PCI_COMMAND, + ctl | PCI_COMMAND_IO | PCI_COMMAND_MEMORY, 2); + +/* restore msi configuration */ +ctl = pci_default_read_config(pdev, pdev->msi_cap + PCI_MSI_FLAGS, 2); +msi_64bit = !!(ctl & PCI_MSI_FLAGS_64BIT); + +vfio_pci_write_config(>pdev, + pdev->msi_cap + PCI_MSI_FLAGS, + ctl & (!PCI_MSI_FLAGS_ENABLE), 2); + +msi_lo = qemu_get_be32(f); +vfio_pci_write_config(pdev, pdev->msi_cap + PCI_MSI_ADDRESS_LO, msi_lo, 4); + +if (msi_64bit) { +msi_hi = qemu_get_be32(f); +vfio_pci_write_config(pdev, pdev->msi_cap + PCI_MSI_ADDRESS_HI, msi_hi, 4); +} +msi_data = qemu_get_be32(f); +vfio_pci_write_config(pdev, + pdev->msi_cap + (msi_64bit ? PCI_MSI_DATA_64 : PCI_MSI_DATA_32), + msi_data, 2); + +vfio_pci_write_config(>pdev, pdev->msi_cap + PCI_MSI_FLAGS, + ctl | PCI_MSI_FLAGS_ENABLE, 2); + +buf = g_malloc(sz); +if (buf == NULL) { +error_report("vfio: Failed to allocate memory for migrate\n"); +return -1; +} + +qemu_get_buffer(f, buf, sz); +if (pwrite(vdev->vbasedev.fd, buf, sz, region->fd_offset) != sz) { +error_report("vfio: Failed to write Device State Region\n"); +return -1; +} + +if (buf) + g_free(buf); return 0; } -- 2.7.4
[Qemu-devel] [RFC 3/5] vfio: introduce new VFIO ioctl VFIO_DEVICE_PCI_STATUS_SET
New VFIO ioctl VFIO_DEVICE_PCI_STATUS_SET is added to change the vfio pci device status during the migration, stop the device on the source side before fetch its status and start the deivce on the target side after restore its status. Signed-off-by: Yulei Zhang <yulei.zh...@intel.com> --- hw/vfio/pci.c | 17 + linux-headers/linux/vfio.h | 15 +++ 2 files changed, 32 insertions(+) diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c index 7de4eb4..605a473 100644 --- a/hw/vfio/pci.c +++ b/hw/vfio/pci.c @@ -38,6 +38,7 @@ static void vfio_disable_interrupts(VFIOPCIDevice *vdev); static void vfio_mmap_set_enabled(VFIOPCIDevice *vdev, bool enabled); static VMStateDescription vfio_pci_vmstate; +static void vfio_vm_change_state_handler(void *pv, int running, RunState state); /* * Disabling BAR mmaping can be slow, but toggling it around INTx can @@ -2866,6 +2867,7 @@ static void vfio_realize(PCIDevice *pdev, Error **errp) vfio_register_err_notifier(vdev); vfio_register_req_notifier(vdev); vfio_setup_resetfn_quirk(vdev); +qemu_add_vm_change_state_handler(vfio_vm_change_state_handler, vdev); return; @@ -2948,6 +2950,21 @@ post_reset: vfio_pci_post_reset(vdev); } +static void vfio_vm_change_state_handler(void *pv, int running, RunState state) +{ +VFIOPCIDevice *vdev = pv; +struct vfio_pci_status_set *vfio_status; +int argsz = sizeof(*vfio_status); + +vfio_status = g_malloc0(argsz); +vfio_status->argsz = argsz; +vfio_status->flags = running ? 
VFIO_DEVICE_PCI_START : + VFIO_DEVICE_PCI_STOP; + +ioctl(vdev->vbasedev.fd, VFIO_DEVICE_PCI_STATUS_SET, vfio_status); +g_free(vfio_status); +} + static int vfio_device_put(QEMUFile *f, void *pv, size_t size, VMStateField *field, QJSON *vmdesc) { diff --git a/linux-headers/linux/vfio.h b/linux-headers/linux/vfio.h index c87d05c..fa17848 100644 --- a/linux-headers/linux/vfio.h +++ b/linux-headers/linux/vfio.h @@ -487,6 +487,21 @@ struct vfio_pci_hot_reset { #define VFIO_DEVICE_PCI_HOT_RESET _IO(VFIO_TYPE, VFIO_BASE + 13) +/** + * VFIO_DEVICE_PCI_STATUS_SET - _IOW(VFIO_TYPE, VFIO_BASE + 14, + * struct vfio_pci_status_set) + * + * Return: 0 on success, -errno on failure. + */ +struct vfio_pci_status_set{ + __u32 argsz; + __u32 flags; +#define VFIO_DEVICE_PCI_STOP (1 << 0) +#define VFIO_DEVICE_PCI_START (1 << 1) +}; + +#define VFIO_DEVICE_PCI_STATUS_SET _IO(VFIO_TYPE, VFIO_BASE + 14) + /* API for Type1 VFIO IOMMU */ /** -- 2.7.4
[Qemu-devel] [RFC 2/5] vfio: Add struct vfio_vmstate_info to introduce vfio device put/get function
Introduce vfio_device_put/vfio_device_get funtion for vfio device state save/restore usage. And vfio device unmigratable flag will be set to false during initialization if device flag VFIO_DEVICE_FLAGS_MIGRATABLE is set. Signed-off-by: Yulei Zhang <yulei.zh...@intel.com> --- hw/vfio/pci.c | 35 ++- 1 file changed, 34 insertions(+), 1 deletion(-) diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c index bf2e0ff..7de4eb4 100644 --- a/hw/vfio/pci.c +++ b/hw/vfio/pci.c @@ -37,6 +37,7 @@ static void vfio_disable_interrupts(VFIOPCIDevice *vdev); static void vfio_mmap_set_enabled(VFIOPCIDevice *vdev, bool enabled); +static VMStateDescription vfio_pci_vmstate; /* * Disabling BAR mmaping can be slow, but toggling it around INTx can @@ -2375,6 +2376,7 @@ static void vfio_populate_device(VFIOPCIDevice *vdev, Error **errp) } QLIST_INIT(>device_state.quirks); +vfio_pci_vmstate.unmigratable = 0; } ret = vfio_get_region_info(vbasedev, @@ -2946,6 +2948,17 @@ post_reset: vfio_pci_post_reset(vdev); } +static int vfio_device_put(QEMUFile *f, void *pv, size_t size, VMStateField *field, +QJSON *vmdesc) +{ +return 0; +} + +static int vfio_device_get(QEMUFile *f, void *pv, size_t size, VMStateField *field) +{ +return 0; +} + static void vfio_instance_init(Object *obj) { PCIDevice *pci_dev = PCI_DEVICE(obj); @@ -2990,9 +3003,29 @@ static Property vfio_pci_dev_properties[] = { DEFINE_PROP_END_OF_LIST(), }; -static const VMStateDescription vfio_pci_vmstate = { +static VMStateInfo vfio_vmstate_info = { +.name = "vfio-state", +.get = vfio_device_get, +.put = vfio_device_put, +}; + +static VMStateDescription vfio_pci_vmstate = { .name = "vfio-pci", .unmigratable = 1, +.version_id = 1, +.minimum_version_id = 1, +.fields = (VMStateField[]) { +{ +.name = "vfio dev", +.version_id = 0, +.field_exists = NULL, +.size = 0, +.info = _vmstate_info, +.flags= VMS_SINGLE, +.offset = 0, + }, +VMSTATE_END_OF_LIST() +}, }; static void vfio_pci_dev_class_init(ObjectClass *klass, void *data) -- 2.7.4
[Qemu-devel] [RFC 1/5] vfio: introduce a new VFIO region for migration support
New VFIO region VFIO_PCI_DEVICE_STATE_REGION_INDEX is added to fetch and restore the pci device status during the live migration. Signed-off-by: Yulei Zhang <yulei.zh...@intel.com> --- hw/vfio/pci.c | 17 + hw/vfio/pci.h | 1 + linux-headers/linux/vfio.h | 5 - 3 files changed, 22 insertions(+), 1 deletion(-) diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c index 03a3d01..bf2e0ff 100644 --- a/hw/vfio/pci.c +++ b/hw/vfio/pci.c @@ -2360,6 +2360,23 @@ static void vfio_populate_device(VFIOPCIDevice *vdev, Error **errp) QLIST_INIT(>bars[i].quirks); } +/* device state region setup */ +if (vbasedev->flags & VFIO_DEVICE_FLAGS_MIGRATABLE) { +char *name = g_strdup_printf("%s BAR %d", vbasedev->name, VFIO_PCI_DEVICE_STATE_REGION_INDEX); + +ret = vfio_region_setup(OBJECT(vdev), vbasedev, +>device_state.region, VFIO_PCI_DEVICE_STATE_REGION_INDEX, name); +g_free(name); + +if (ret) { +error_setg_errno(errp, -ret, "failed to get region %d info", + VFIO_PCI_DEVICE_STATE_REGION_INDEX); +return; +} + +QLIST_INIT(>device_state.quirks); +} + ret = vfio_get_region_info(vbasedev, VFIO_PCI_CONFIG_REGION_INDEX, _info); if (ret) { diff --git a/hw/vfio/pci.h b/hw/vfio/pci.h index a8366bb..bd98618 100644 --- a/hw/vfio/pci.h +++ b/hw/vfio/pci.h @@ -115,6 +115,7 @@ typedef struct VFIOPCIDevice { int interrupt; /* Current interrupt type */ VFIOBAR bars[PCI_NUM_REGIONS - 1]; /* No ROM */ VFIOVGA *vga; /* 0xa, 0x3b0, 0x3c0 */ +VFIOBAR device_state; void *igd_opregion; PCIHostDeviceAddress host; EventNotifier err_notifier; diff --git a/linux-headers/linux/vfio.h b/linux-headers/linux/vfio.h index 531cb2e..c87d05c 100644 --- a/linux-headers/linux/vfio.h +++ b/linux-headers/linux/vfio.h @@ -198,6 +198,8 @@ struct vfio_device_info { #define VFIO_DEVICE_FLAGS_PCI (1 << 1)/* vfio-pci device */ #define VFIO_DEVICE_FLAGS_PLATFORM (1 << 2)/* vfio-platform device */ #define VFIO_DEVICE_FLAGS_AMBA (1 << 3) /* vfio-amba device */ +#define VFIO_DEVICE_FLAGS_CCW (1 << 4)/* vfio-ccw device */ +#define 
VFIO_DEVICE_FLAGS_MIGRATABLE (1 << 5) /* Device supports migrate */ __u32 num_regions;/* Max region index + 1 */ __u32 num_irqs; /* Max IRQ index + 1 */ }; @@ -433,7 +435,8 @@ enum { * between described ranges are unimplemented. */ VFIO_PCI_VGA_REGION_INDEX, - VFIO_PCI_NUM_REGIONS = 9 /* Fixed user ABI, region indexes >=9 use */ + VFIO_PCI_DEVICE_STATE_REGION_INDEX, + VFIO_PCI_NUM_REGIONS = 10 /* Fixed user ABI, region indexes >=9 use */ /* device specific cap to define content. */ }; -- 2.7.4
[Qemu-devel] [RFC 0/5] vfio: Introduce Live migration capability to vfio_mdev device
Summary This series RFC would like to introduce the live migration capability to vfio_mdev device. As currently vfio_mdev device doesn't support migration, we introduce a device flag VFIO_DEVICE_FLAGS_MIGRATABLE to help determine whether the mdev device can be migrated or not, it will check the flag during the device initialization and decide to init the new vfio region VFIO_PCI_DEVICE_STATE_REGION_INDEX. The intention to add the new region is using it for vfio_mdev device status save and restore during the migration. The access to this region will be trapped and forwarded to the vfio_mdev device driver. There is an alternative way to achieve it is to add a new vfio ioctl to help fetch and save the device status. Also this series includes two new vfio ioctls #define VFIO_DEVICE_PCI_STATUS_SET _IO(VFIO_TYPE, VFIO_BASE + 14) #define VFIO_DEVICE_PCI_GET_DIRTY_BITMAP _IO(VFIO_TYPE, VFIO_BASE + 15) The first one is used to control the device running status, we want to stop the mdev device before querying the status from its device driver and restart the device after migration. The second one is used to do the mdev device dirty page synchronization. 
So the vfio_mdev device migration sequence would be Source VM side: start migration | V get the cpu state change callback use status set ioctl to stop the mdev device | V save the device status into Qemufile which is read from the new vfio device status region | V query the dirty page bitmap from device and add into qemu dirty list for sync Target VM side: restore the mdev device after getting the saved status context from Qemufile | V get the cpu state change callback use status set ioctl to start the mdev device to put it in running status | V finish migration Yulei Zhang (5): vfio: introduce a new VFIO region for migration support vfio: Add struct vfio_vmstate_info to introduce vfio device put/get function vfio: introduce new VFIO ioctl VFIO_DEVICE_PCI_STATUS_SET vfio: use vfio_device_put/vfio_device_get for device status save/restore vfio: introduce new VFIO ioctl VFIO_DEVICE_PCI_GET_DIRTY_BITMAP hw/vfio/pci.c | 204 - hw/vfio/pci.h | 3 + linux-headers/linux/vfio.h | 34 +++- 3 files changed, 239 insertions(+), 2 deletions(-) -- 2.7.4