[PATCH QEMU v23 05/18] vfio: Add migration region initialization and finalize function

2020-05-20 Thread Kirti Wankhede
- Migration functions are implemented for VFIO_DEVICE_TYPE_PCI devices in this
  patch series.
- Whether a VFIO device supports migration is decided based on the migration
  region query. If the migration region query and the migration region
  initialization succeed, migration is supported; otherwise migration is
  blocked.

Signed-off-by: Kirti Wankhede 
Reviewed-by: Neo Jia 
---
 hw/vfio/Makefile.objs |   2 +-
 hw/vfio/migration.c   | 138 ++
 hw/vfio/trace-events  |   3 +
 include/hw/vfio/vfio-common.h |   9 +++
 4 files changed, 151 insertions(+), 1 deletion(-)
 create mode 100644 hw/vfio/migration.c

diff --git a/hw/vfio/Makefile.objs b/hw/vfio/Makefile.objs
index 9bb1c09e8477..8b296c889ed9 100644
--- a/hw/vfio/Makefile.objs
+++ b/hw/vfio/Makefile.objs
@@ -1,4 +1,4 @@
-obj-y += common.o spapr.o
+obj-y += common.o spapr.o migration.o
 obj-$(CONFIG_VFIO_PCI) += pci.o pci-quirks.o display.o
 obj-$(CONFIG_VFIO_CCW) += ccw.o
 obj-$(CONFIG_VFIO_PLATFORM) += platform.o
diff --git a/hw/vfio/migration.c b/hw/vfio/migration.c
new file mode 100644
index 000000000000..bf9384907ec0
--- /dev/null
+++ b/hw/vfio/migration.c
@@ -0,0 +1,138 @@
+/*
+ * Migration support for VFIO devices
+ *
+ * Copyright NVIDIA, Inc. 2020
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2. See
+ * the COPYING file in the top-level directory.
+ */
+
+#include "qemu/osdep.h"
+#include <linux/vfio.h>
+
+#include "hw/vfio/vfio-common.h"
+#include "cpu.h"
+#include "migration/migration.h"
+#include "migration/qemu-file.h"
+#include "migration/register.h"
+#include "migration/blocker.h"
+#include "migration/misc.h"
+#include "qapi/error.h"
+#include "exec/ramlist.h"
+#include "exec/ram_addr.h"
+#include "pci.h"
+#include "trace.h"
+
+static void vfio_migration_region_exit(VFIODevice *vbasedev)
+{
+VFIOMigration *migration = vbasedev->migration;
+
+if (!migration) {
+return;
+}
+
+if (migration->region.size) {
+vfio_region_exit(&migration->region);
+vfio_region_finalize(&migration->region);
+}
+}
+
+static int vfio_migration_region_init(VFIODevice *vbasedev, int index)
+{
+VFIOMigration *migration = vbasedev->migration;
+Object *obj = NULL;
+int ret = -EINVAL;
+
+if (!vbasedev->ops->vfio_get_object) {
+return ret;
+}
+
+obj = vbasedev->ops->vfio_get_object(vbasedev);
+if (!obj) {
+return ret;
+}
+
+ret = vfio_region_setup(obj, vbasedev, &migration->region, index,
+"migration");
+if (ret) {
+error_report("%s: Failed to setup VFIO migration region %d: %s",
+ vbasedev->name, index, strerror(-ret));
+goto err;
+}
+
+if (!migration->region.size) {
+ret = -EINVAL;
+error_report("%s: Invalid region size of VFIO migration region %d: %s",
+ vbasedev->name, index, strerror(-ret));
+goto err;
+}
+
+return 0;
+
+err:
+vfio_migration_region_exit(vbasedev);
+return ret;
+}
+
+static int vfio_migration_init(VFIODevice *vbasedev,
+   struct vfio_region_info *info)
+{
+int ret;
+
+vbasedev->migration = g_new0(VFIOMigration, 1);
+
+ret = vfio_migration_region_init(vbasedev, info->index);
+if (ret) {
+error_report("%s: Failed to initialise migration region",
+ vbasedev->name);
+g_free(vbasedev->migration);
+vbasedev->migration = NULL;
+return ret;
+}
+
+return 0;
+}
+
+/* -- */
+
+int vfio_migration_probe(VFIODevice *vbasedev, Error **errp)
+{
+struct vfio_region_info *info;
+Error *local_err = NULL;
+int ret;
+
+ret = vfio_get_dev_region_info(vbasedev, VFIO_REGION_TYPE_MIGRATION,
+   VFIO_REGION_SUBTYPE_MIGRATION, &info);
+if (ret) {
+goto add_blocker;
+}
+
+ret = vfio_migration_init(vbasedev, info);
+if (ret) {
+goto add_blocker;
+}
+
+trace_vfio_migration_probe(vbasedev->name, info->index);
+return 0;
+
+add_blocker:
+error_setg(&vbasedev->migration_blocker,
+   "VFIO device doesn't support migration");
+ret = migrate_add_blocker(vbasedev->migration_blocker, &local_err);
+if (local_err) {
+error_propagate(errp, local_err);
+error_free(vbasedev->migration_blocker);
+}
+return ret;
+}
+
+void vfio_migration_finalize(VFIODevice *vbasedev)
+{
+if (vbasedev->migration_blocker) {
+migrate_del_blocker(vbasedev->migration_blocker);
+error_free(vbasedev->migration_blocker);
+}
+
+vfio_migration_region_exit(vbasedev);
+g_free(vbasedev->migration);
+}
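
A brief usage sketch (not part of this patch): the probe/finalize pair above is
expected to be wired into the vfio-pci realize and exit paths by a later patch
in this series ("vfio: Make vfio-pci device migration capable"). The helper
names below, other than vfio_migration_probe() and vfio_migration_finalize(),
are illustrative placeholders, and the sketch assumes QEMU's vfio headers.

static void example_realize_tail(VFIOPCIDevice *vdev, Error **errp)
{
    /*
     * Query the migration region; when the device does not support
     * migration, vfio_migration_probe() adds a migration blocker rather
     * than failing the device.
     */
    vfio_migration_probe(&vdev->vbasedev, errp);
}

static void example_instance_finalize(VFIOPCIDevice *vdev)
{
    /* Drop the migration blocker (if any) and release the migration region. */
    vfio_migration_finalize(&vdev->vbasedev);
}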

[PATCH QEMU v23 03/18] vfio: Add vfio_get_object callback to VFIODeviceOps

2020-05-20 Thread Kirti Wankhede
Hook vfio_get_object callback for PCI devices.

Signed-off-by: Kirti Wankhede 
Reviewed-by: Neo Jia 
Suggested-by: Cornelia Huck 
Reviewed-by: Cornelia Huck 
---
 hw/vfio/pci.c | 8 
 include/hw/vfio/vfio-common.h | 1 +
 2 files changed, 9 insertions(+)

diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
index 342dd6b9129d..0514ba373d1c 100644
--- a/hw/vfio/pci.c
+++ b/hw/vfio/pci.c
@@ -2407,10 +2407,18 @@ static void vfio_pci_compute_needs_reset(VFIODevice *vbasedev)
 }
 }
 
+static Object *vfio_pci_get_object(VFIODevice *vbasedev)
+{
+VFIOPCIDevice *vdev = container_of(vbasedev, VFIOPCIDevice, vbasedev);
+
+return OBJECT(vdev);
+}
+
 static VFIODeviceOps vfio_pci_ops = {
 .vfio_compute_needs_reset = vfio_pci_compute_needs_reset,
 .vfio_hot_reset_multi = vfio_pci_hot_reset_multi,
 .vfio_eoi = vfio_intx_eoi,
+.vfio_get_object = vfio_pci_get_object,
 };
 
 int vfio_populate_vga(VFIOPCIDevice *vdev, Error **errp)
diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h
index 8d7a0fbb1046..74261feaeac9 100644
--- a/include/hw/vfio/vfio-common.h
+++ b/include/hw/vfio/vfio-common.h
@@ -119,6 +119,7 @@ struct VFIODeviceOps {
 void (*vfio_compute_needs_reset)(VFIODevice *vdev);
 int (*vfio_hot_reset_multi)(VFIODevice *vdev);
 void (*vfio_eoi)(VFIODevice *vdev);
+Object *(*vfio_get_object)(VFIODevice *vdev);
 };
 
 typedef struct VFIOGroup {
-- 
2.7.0




[PATCH QEMU v23 01/18] vfio: KABI for migration interface - Kernel header placeholder

2020-05-20 Thread Kirti Wankhede
Kernel header patches are being reviewed along with the kernel side changes.
This patch is only a placeholder.

This patch includes all changes to vfio.h from the above patch set.

Signed-off-by: Kirti Wankhede 
Reviewed-by: Neo Jia 
---
 linux-headers/linux/vfio.h | 318 -
 1 file changed, 316 insertions(+), 2 deletions(-)

diff --git a/linux-headers/linux/vfio.h b/linux-headers/linux/vfio.h
index a41c45286511..7076d48e1ec0 100644
--- a/linux-headers/linux/vfio.h
+++ b/linux-headers/linux/vfio.h
@@ -305,6 +305,7 @@ struct vfio_region_info_cap_type {
 #define VFIO_REGION_TYPE_PCI_VENDOR_MASK   (0x)
 #define VFIO_REGION_TYPE_GFX(1)
 #define VFIO_REGION_TYPE_CCW   (2)
+#define VFIO_REGION_TYPE_MIGRATION  (3)
 
 /* sub-types for VFIO_REGION_TYPE_PCI_* */
 
@@ -379,6 +380,232 @@ struct vfio_region_gfx_edid {
 /* sub-types for VFIO_REGION_TYPE_CCW */
 #define VFIO_REGION_SUBTYPE_CCW_ASYNC_CMD  (1)
 
+/* sub-types for VFIO_REGION_TYPE_MIGRATION */
+#define VFIO_REGION_SUBTYPE_MIGRATION   (1)
+
+/*
+ * The structure vfio_device_migration_info is placed at the 0th offset of
+ * the VFIO_REGION_SUBTYPE_MIGRATION region to get and set VFIO device related
+ * migration information. Field accesses from this structure are only supported
+ * at their native width and alignment. Otherwise, the result is undefined and
+ * vendor drivers should return an error.
+ *
+ * device_state: (read/write)
+ *  - The user application writes to this field to inform the vendor driver
+ *about the device state to be transitioned to.
+ *  - The vendor driver should take the necessary actions to change the
+ *device state. After successful transition to a given state, the
+ *vendor driver should return success on write(device_state, state)
+ *system call. If the device state transition fails, the vendor driver
+ *should return an appropriate -errno for the fault condition.
+ *  - On the user application side, if the device state transition fails,
+ *   that is, if write(device_state, state) returns an error, read
+ *   device_state again to determine the current state of the device from
+ *   the vendor driver.
+ *  - The vendor driver should return previous state of the device unless
+ *the vendor driver has encountered an internal error, in which case
+ *the vendor driver may report the device_state VFIO_DEVICE_STATE_ERROR.
+ *  - The user application must use the device reset ioctl to recover the
+ *device from VFIO_DEVICE_STATE_ERROR state. If the device is
+ *indicated to be in a valid device state by reading device_state, the
+ *user application may attempt to transition the device to any valid
+ *state reachable from the current state or terminate itself.
+ *
+ *  device_state consists of 3 bits:
+ *  - If bit 0 is set, it indicates the _RUNNING state. If bit 0 is clear,
+ *it indicates the _STOP state. When the device state is changed to
+ *_STOP, driver should stop the device before write() returns.
+ *  - If bit 1 is set, it indicates the _SAVING state, which means that the
+ *driver should start gathering device state information that will be
+ *provided to the VFIO user application to save the device's state.
+ *  - If bit 2 is set, it indicates the _RESUMING state, which means that
+ *the driver should prepare to resume the device. Data provided through
+ *the migration region should be used to resume the device.
+ *  Bits 3 - 31 are reserved for future use. To preserve them, the user
+ *  application should perform a read-modify-write operation on this
+ *  field when modifying the specified bits.
+ *
+ *  +--- _RESUMING
+ *  |+-- _SAVING
+ *  ||+- _RUNNING
+ *  |||
+ *  000b => Device Stopped, not saving or resuming
+ *  001b => Device running, which is the default state
+ *  010b => Stop the device & save the device state, stop-and-copy state
+ *  011b => Device running and save the device state, pre-copy state
+ *  100b => Device stopped and the device state is resuming
+ *  101b => Invalid state
+ *  110b => Error state
+ *  111b => Invalid state
+ *
+ * State transitions:
+ *
+ *  _RESUMING  _RUNNINGPre-copyStop-and-copy   _STOP
+ *(100b) (001b) (011b)(010b)   (000b)
+ * 0. Running or default state
+ * |
+ *
+ * 1. Normal Shutdown (optional)
+ * |->|
+ *
+ * 2. Save the state or suspend
+ * |->|-->|
+ *
+ * 3. Save the state during live migration
+ * |--->|>|-->|
+ *
+ * 4. Resuming
+ *   
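
A minimal user-space sketch (an illustration, not part of the proposed header)
of the read-modify-write described above: enter the pre-copy phase by setting
the _SAVING bit while leaving _RUNNING and the reserved bits untouched. It
assumes the migration region's file offset has been obtained via
VFIO_DEVICE_GET_REGION_INFO and that the _SAVING bit macro comes from this
header; error handling is abbreviated.

#include <stddef.h>
#include <stdint.h>
#include <unistd.h>
#include <linux/vfio.h>

static int enter_precopy(int device_fd, uint64_t migration_offset)
{
    uint32_t state;
    off_t off = migration_offset +
                offsetof(struct vfio_device_migration_info, device_state);

    if (pread(device_fd, &state, sizeof(state), off) != sizeof(state))
        return -1;

    state |= VFIO_DEVICE_STATE_SAVING;   /* _RUNNING stays set: pre-copy */

    if (pwrite(device_fd, &state, sizeof(state), off) != sizeof(state))
        return -1;                       /* on error, re-read device_state */

    return 0;
}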

[PATCH QEMU v23 04/18] vfio: Add save and load functions for VFIO PCI devices

2020-05-20 Thread Kirti Wankhede
These functions save and restore PCI device specific data, i.e. the config
space of the PCI device.
Save and restore were tested with MSI and MSI-X interrupt types.

Signed-off-by: Kirti Wankhede 
Reviewed-by: Neo Jia 
---
 hw/vfio/pci.c | 163 ++
 include/hw/vfio/vfio-common.h |   2 +
 2 files changed, 165 insertions(+)

diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
index 0514ba373d1c..94535f0e27cd 100644
--- a/hw/vfio/pci.c
+++ b/hw/vfio/pci.c
@@ -41,6 +41,7 @@
 #include "trace.h"
 #include "qapi/error.h"
 #include "migration/blocker.h"
+#include "migration/qemu-file.h"
 
 #define TYPE_VFIO_PCI "vfio-pci"
 #define PCI_VFIO(obj)OBJECT_CHECK(VFIOPCIDevice, obj, TYPE_VFIO_PCI)
@@ -1632,6 +1633,50 @@ static void vfio_bars_prepare(VFIOPCIDevice *vdev)
 }
 }
 
+static int vfio_bar_validate(VFIOPCIDevice *vdev, int nr)
+{
+PCIDevice *pdev = &vdev->pdev;
+VFIOBAR *bar = &vdev->bars[nr];
+uint64_t addr;
+uint32_t addr_lo, addr_hi = 0;
+
+/* Skip unimplemented BARs and the upper half of 64bit BARS. */
+if (!bar->size) {
+return 0;
+}
+
+addr_lo = pci_default_read_config(pdev, PCI_BASE_ADDRESS_0 + nr * 4, 4);
+
+addr_lo &= (bar->ioport ? PCI_BASE_ADDRESS_IO_MASK :
+  PCI_BASE_ADDRESS_MEM_MASK);
+if (bar->type == PCI_BASE_ADDRESS_MEM_TYPE_64) {
+addr_hi = pci_default_read_config(pdev,
+ PCI_BASE_ADDRESS_0 + (nr + 1) * 4, 4);
+}
+
+addr = ((uint64_t)addr_hi << 32) | addr_lo;
+
+if (!QEMU_IS_ALIGNED(addr, bar->size)) {
+return -EINVAL;
+}
+
+return 0;
+}
+
+static int vfio_bars_validate(VFIOPCIDevice *vdev)
+{
+int i, ret;
+
+for (i = 0; i < PCI_ROM_SLOT; i++) {
+ret = vfio_bar_validate(vdev, i);
+if (ret) {
+error_report("vfio: BAR address %d validation failed", i);
+return ret;
+}
+}
+return 0;
+}
+
 static void vfio_bar_register(VFIOPCIDevice *vdev, int nr)
 {
 VFIOBAR *bar = &vdev->bars[nr];
@@ -2414,11 +2459,129 @@ static Object *vfio_pci_get_object(VFIODevice *vbasedev)
 return OBJECT(vdev);
 }
 
+static void vfio_pci_save_config(VFIODevice *vbasedev, QEMUFile *f)
+{
+VFIOPCIDevice *vdev = container_of(vbasedev, VFIOPCIDevice, vbasedev);
+PCIDevice *pdev = &vdev->pdev;
+uint16_t pci_cmd;
+int i;
+
+for (i = 0; i < PCI_ROM_SLOT; i++) {
+uint32_t bar;
+
+bar = pci_default_read_config(pdev, PCI_BASE_ADDRESS_0 + i * 4, 4);
+qemu_put_be32(f, bar);
+}
+
+qemu_put_be32(f, vdev->interrupt);
+if (vdev->interrupt == VFIO_INT_MSI) {
+uint32_t msi_flags, msi_addr_lo, msi_addr_hi = 0, msi_data;
+bool msi_64bit;
+
+msi_flags = pci_default_read_config(pdev, pdev->msi_cap + PCI_MSI_FLAGS,
+2);
+msi_64bit = (msi_flags & PCI_MSI_FLAGS_64BIT);
+
+msi_addr_lo = pci_default_read_config(pdev,
+ pdev->msi_cap + PCI_MSI_ADDRESS_LO, 4);
+qemu_put_be32(f, msi_addr_lo);
+
+if (msi_64bit) {
+msi_addr_hi = pci_default_read_config(pdev,
+ pdev->msi_cap + PCI_MSI_ADDRESS_HI,
+ 4);
+}
+qemu_put_be32(f, msi_addr_hi);
+
+msi_data = pci_default_read_config(pdev,
+pdev->msi_cap + (msi_64bit ? PCI_MSI_DATA_64 : PCI_MSI_DATA_32),
+2);
+qemu_put_be16(f, msi_data);
+} else if (vdev->interrupt == VFIO_INT_MSIX) {
+uint16_t offset;
+
+/* save enable bit and maskall bit */
+offset = pci_default_read_config(pdev,
+   pdev->msix_cap + PCI_MSIX_FLAGS + 1, 2);
+qemu_put_be16(f, offset);
+msix_save(pdev, f);
+}
+pci_cmd = pci_default_read_config(pdev, PCI_COMMAND, 2);
+qemu_put_be16(f, pci_cmd);
+}
+
+static int vfio_pci_load_config(VFIODevice *vbasedev, QEMUFile *f)
+{
+VFIOPCIDevice *vdev = container_of(vbasedev, VFIOPCIDevice, vbasedev);
+PCIDevice *pdev = >pdev;
+uint32_t interrupt_type;
+uint32_t msi_flags, msi_addr_lo, msi_addr_hi = 0, msi_data;
+uint16_t pci_cmd;
+bool msi_64bit;
+int i, ret;
+
+/* restore PCI BAR configuration */
+pci_cmd = pci_default_read_config(pdev, PCI_COMMAND, 2);
+vfio_pci_write_config(pdev, PCI_COMMAND,
+pci_cmd & ~(PCI_COMMAND_IO | PCI_COMMAND_MEMORY), 2);
+for (i = 0; i < PCI_ROM_SLOT; i++) {
+uint32_t bar = qemu_get_be32(f);
+
+vfio_pci_write_config(pdev, PCI_BASE_ADDRESS_0 + i * 4, bar, 4);
+}
+
+ret = vfio_bars_validate(vdev);
+if (ret) {
+return ret;
+}
+
+interrupt_type = qemu_get_be32(f);

[PATCH QEMU v23 02/18] vfio: Add function to unmap VFIO region

2020-05-20 Thread Kirti Wankhede
This function will be used for the migration region.
The migration region is mmapped when migration starts and will be unmapped
when migration is complete.

Signed-off-by: Kirti Wankhede 
Reviewed-by: Neo Jia 
Reviewed-by: Cornelia Huck 
---
 hw/vfio/common.c  | 32 
 hw/vfio/trace-events  |  1 +
 include/hw/vfio/vfio-common.h |  1 +
 3 files changed, 30 insertions(+), 4 deletions(-)

diff --git a/hw/vfio/common.c b/hw/vfio/common.c
index 0b3593b3c0c4..90e9a854d82c 100644
--- a/hw/vfio/common.c
+++ b/hw/vfio/common.c
@@ -925,6 +925,18 @@ int vfio_region_setup(Object *obj, VFIODevice *vbasedev, VFIORegion *region,
 return 0;
 }
 
+static void vfio_subregion_unmap(VFIORegion *region, int index)
+{
+trace_vfio_region_unmap(memory_region_name(&region->mmaps[index].mem),
+region->mmaps[index].offset,
+region->mmaps[index].offset +
+region->mmaps[index].size - 1);
+memory_region_del_subregion(region->mem, &region->mmaps[index].mem);
+munmap(region->mmaps[index].mmap, region->mmaps[index].size);
+object_unparent(OBJECT(&region->mmaps[index].mem));
+region->mmaps[index].mmap = NULL;
+}
+
 int vfio_region_mmap(VFIORegion *region)
 {
 int i, prot = 0;
@@ -955,10 +967,7 @@ int vfio_region_mmap(VFIORegion *region)
 region->mmaps[i].mmap = NULL;
 
 for (i--; i >= 0; i--) {
-memory_region_del_subregion(region->mem, &region->mmaps[i].mem);
-munmap(region->mmaps[i].mmap, region->mmaps[i].size);
-object_unparent(OBJECT(&region->mmaps[i].mem));
-region->mmaps[i].mmap = NULL;
+vfio_subregion_unmap(region, i);
 }
 
 return ret;
@@ -983,6 +992,21 @@ int vfio_region_mmap(VFIORegion *region)
 return 0;
 }
 
+void vfio_region_unmap(VFIORegion *region)
+{
+int i;
+
+if (!region->mem) {
+return;
+}
+
+for (i = 0; i < region->nr_mmaps; i++) {
+if (region->mmaps[i].mmap) {
+vfio_subregion_unmap(region, i);
+}
+}
+}
+
 void vfio_region_exit(VFIORegion *region)
 {
 int i;
diff --git a/hw/vfio/trace-events b/hw/vfio/trace-events
index b1ef55a33ffd..8cdc27946cb8 100644
--- a/hw/vfio/trace-events
+++ b/hw/vfio/trace-events
@@ -111,6 +111,7 @@ vfio_region_mmap(const char *name, unsigned long offset, unsigned long end) "Reg
 vfio_region_exit(const char *name, int index) "Device %s, region %d"
 vfio_region_finalize(const char *name, int index) "Device %s, region %d"
 vfio_region_mmaps_set_enabled(const char *name, bool enabled) "Region %s mmaps enabled: %d"
+vfio_region_unmap(const char *name, unsigned long offset, unsigned long end) "Region %s unmap [0x%lx - 0x%lx]"
 vfio_region_sparse_mmap_header(const char *name, int index, int nr_areas) "Device %s region %d: %d sparse mmap entries"
 vfio_region_sparse_mmap_entry(int i, unsigned long start, unsigned long end) "sparse entry %d [0x%lx - 0x%lx]"
 vfio_get_dev_region(const char *name, int index, uint32_t type, uint32_t subtype) "%s index %d, %08x/%0x8"
diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h
index fd564209ac71..8d7a0fbb1046 100644
--- a/include/hw/vfio/vfio-common.h
+++ b/include/hw/vfio/vfio-common.h
@@ -171,6 +171,7 @@ int vfio_region_setup(Object *obj, VFIODevice *vbasedev, VFIORegion *region,
   int index, const char *name);
 int vfio_region_mmap(VFIORegion *region);
 void vfio_region_mmaps_set_enabled(VFIORegion *region, bool enabled);
+void vfio_region_unmap(VFIORegion *region);
 void vfio_region_exit(VFIORegion *region);
 void vfio_region_finalize(VFIORegion *region);
 void vfio_reset_handler(void *opaque);
-- 
2.7.0




[PATCH QEMU v23 00/18] Add migration support for VFIO devices

2020-05-20 Thread Kirti Wankhede
ate is called for that device section data
|
At the end, called .load_cleanup for each device and vCPUs are started.
|
(RUNNING, _NONE, _RUNNING)

Note that:
- Migration post copy is not supported.

v22 -> v23
- Fixed issue reported by Yan
https://lore.kernel.org/kvm/97977ede-3c5b-c5a5-7858-7eecd7dd5...@nvidia.com/
- Sending this version to test v23 kernel version patches:
https://lore.kernel.org/kvm/1589998088-3250-1-git-send-email-kwankh...@nvidia.com/

v18 -> v22
- Few fixes from v18 review. But not yet fixed all concerns. I'll address those
  concerns in subsequent iterations.
- Sending this version to test v22 kernel version patches:
https://lore.kernel.org/kvm/1589781397-28368-1-git-send-email-kwankh...@nvidia.com/

v16 -> v18
- Nit fixes
- Get migration capability flags from container
- Added VFIO stats to MigrationInfo
- Fixed bug reported by Yan
https://lists.gnu.org/archive/html/qemu-devel/2020-04/msg4.html

v9 -> v16
- KABI almost finalised on kernel patches.
- Added support for migration with vIOMMU enabled.

v8 -> v9:
- Split patch set in 2 sets, Kernel and QEMU sets.
- Dirty pages bitmap is queried from IOMMU container rather than from
  vendor driver for per device. Added 2 ioctls to achieve this.

v7 -> v8:
- Updated comments for KABI
- Added BAR address validation check during PCI device's config space load as
  suggested by Dr. David Alan Gilbert.
- Changed vfio_migration_set_state() to set or clear device state flags.
- Some nit fixes.

v6 -> v7:
- Fix build failures.

v5 -> v6:
- Fix build failure.

v4 -> v5:
- Added descriptive comment about the sequence of access of members of structure
  vfio_device_migration_info to be followed based on Alex's suggestion
- Updated get dirty pages sequence.
- As per Cornelia Huck's suggestion, added callbacks to VFIODeviceOps to
  get_object, save_config and load_config.
- Fixed multiple nit picks.
- Tested live migration with multiple vfio device assigned to a VM.

v3 -> v4:
- Added one more bit for _RESUMING flag to be set explicitly.
- data_offset field is read-only for user space application.
- data_size is read for every iteration before reading data from migration, that
  is removed assumption that data will be till end of migration region.
- If vendor driver supports mappable sparsed region, map those region during
  setup state of save/load, similarly unmap those from cleanup routines.
- Handles race condition that causes data corruption in migration region during
  save device state by adding mutex and serializing save_buffer and
  get_dirty_pages routines.
- Skip called get_dirty_pages routine for mapped MMIO region of device.
- Added trace events.
- Split into multiple functional patches.

v2 -> v3:
- Removed enum of VFIO device states. Defined VFIO device state with 2 bits.
- Re-structured vfio_device_migration_info to keep it minimal and defined action
  on read and write access on its members.

v1 -> v2:
- Defined MIGRATION region type and sub-type which should be used with region
  type capability.
- Re-structured vfio_device_migration_info. This structure will be placed at 0th
  offset of migration region.
- Replaced ioctl with read/write for trapped part of migration region.
- Added both type of access support, trapped or mmapped, for data section of the
  region.
- Moved PCI device functions to pci file.
- Added iteration to get dirty page bitmap until bitmap for all requested pages
  are copied.

Thanks,
Kirti




Kirti Wankhede (18):
  vfio: KABI for migration interface - Kernel header placeholder
  vfio: Add function to unmap VFIO region
  vfio: Add vfio_get_object callback to VFIODeviceOps
  vfio: Add save and load functions for VFIO PCI devices
  vfio: Add migration region initialization and finalize function
  vfio: Add VM state change handler to know state of VM
  vfio: Add migration state change notifier
  vfio: Register SaveVMHandlers for VFIO device
  vfio: Add save state functions to SaveVMHandlers
  vfio: Add load state functions to SaveVMHandlers
  iommu: add callback to get address limit IOMMU supports
  memory: Set DIRTY_MEMORY_MIGRATION when IOMMU is enabled
  vfio: Get migration capability flags for container
  vfio: Add function to start and stop dirty pages tracking
  vfio: Add vfio_listener_log_sync to mark dirty pages
  vfio: Add ioctl to get dirty pages bitmap during dma unmap.
  vfio: Make vfio-pci device migration capable
  qapi: Add VFIO devices migration stats in Migration stats

 hw/i386/intel_iommu.c |   9 +
 hw/vfio/Makefile.objs |   2 +-
 hw/vfio/common.c  | 411 +++--
 hw/vfio/migration.c   | 804 ++
 hw/vfio/pci.c | 203 +--
 hw/vfio/pci.h |   1 -
 hw/vfio/trace-events  |  19 +
 include/exec/memory.h |  18 +
 include/hw/vfio/vfio-common.h |  22 ++
 inclu

[PATCH Kernel v23 7/8] vfio iommu: Add migration capability to report supported features

2020-05-20 Thread Kirti Wankhede
Added migration capability in the IOMMU info chain.
The user application should check the IOMMU info chain for the migration
capability in order to use the dirty page tracking feature provided by the
kernel module. The user application must check the supported page sizes and
the maximum dirty bitmap size returned by this capability structure for the
ioctls used to get the dirty bitmap.

Signed-off-by: Kirti Wankhede 
Reviewed-by: Cornelia Huck 
---
 drivers/vfio/vfio_iommu_type1.c | 23 ++-
 include/uapi/linux/vfio.h   | 23 +++
 2 files changed, 45 insertions(+), 1 deletion(-)

diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
index 948436ad004c..9ed240358d5d 100644
--- a/drivers/vfio/vfio_iommu_type1.c
+++ b/drivers/vfio/vfio_iommu_type1.c
@@ -2392,6 +2392,22 @@ static int vfio_iommu_iova_build_caps(struct vfio_iommu *iommu,
return ret;
 }
 
+static int vfio_iommu_migration_build_caps(struct vfio_iommu *iommu,
+  struct vfio_info_cap *caps)
+{
+   struct vfio_iommu_type1_info_cap_migration cap_mig;
+
+   cap_mig.header.id = VFIO_IOMMU_TYPE1_INFO_CAP_MIGRATION;
+   cap_mig.header.version = 1;
+
+   cap_mig.flags = 0;
+   /* support minimum pgsize */
+   cap_mig.pgsize_bitmap = (size_t)1 << __ffs(iommu->pgsize_bitmap);
+   cap_mig.max_dirty_bitmap_size = DIRTY_BITMAP_SIZE_MAX;
+
+   return vfio_info_add_capability(caps, &cap_mig.header, sizeof(cap_mig));
+}
+
 static long vfio_iommu_type1_ioctl(void *iommu_data,
   unsigned int cmd, unsigned long arg)
 {
@@ -2438,8 +2454,13 @@ static long vfio_iommu_type1_ioctl(void *iommu_data,
 
info.iova_pgsizes = iommu->pgsize_bitmap;
 
-   ret = vfio_iommu_iova_build_caps(iommu, &caps);
+   ret = vfio_iommu_migration_build_caps(iommu, &caps);
+
+   if (!ret)
+   ret = vfio_iommu_iova_build_caps(iommu, &caps);
+
	mutex_unlock(&iommu->lock);
+
if (ret)
return ret;
 
diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h
index 5e3f6f73e997..864b8a7d21e2 100644
--- a/include/uapi/linux/vfio.h
+++ b/include/uapi/linux/vfio.h
@@ -1013,6 +1013,29 @@ struct vfio_iommu_type1_info_cap_iova_range {
struct  vfio_iova_range iova_ranges[];
 };
 
+/*
+ * The migration capability allows to report supported features for migration.
+ *
+ * The structures below define version 1 of this capability.
+ *
+ * The existence of this capability indicates that IOMMU kernel driver supports
+ * dirty page logging.
+ *
+ * pgsize_bitmap: Kernel driver returns bitmap of supported page sizes for dirty
+ * page logging.
+ * max_dirty_bitmap_size: Kernel driver returns maximum supported dirty bitmap
+ * size in bytes that can be used by user applications when getting the dirty
+ * bitmap.
+ */
+#define VFIO_IOMMU_TYPE1_INFO_CAP_MIGRATION  1
+
+struct vfio_iommu_type1_info_cap_migration {
+   struct  vfio_info_cap_header header;
+   __u32   flags;
+   __u64   pgsize_bitmap;
+   __u64   max_dirty_bitmap_size;  /* in bytes */
+};
+
 #define VFIO_IOMMU_GET_INFO _IO(VFIO_TYPE, VFIO_BASE + 12)
 
 /**
-- 
2.7.0
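
A user-space sketch (an assumption, not taken from the patch) of how an
application might locate this capability with the usual two-call
VFIO_IOMMU_GET_INFO pattern; error handling is abbreviated and the caller
frees the returned info buffer.

#include <stdlib.h>
#include <sys/ioctl.h>
#include <linux/vfio.h>

static struct vfio_iommu_type1_info_cap_migration *
get_migration_cap(int container_fd, struct vfio_iommu_type1_info **out)
{
    struct vfio_iommu_type1_info probe = { .argsz = sizeof(probe) };
    struct vfio_iommu_type1_info *info;
    struct vfio_info_cap_header *hdr;
    __u32 off;

    /* First call only reports the argsz needed to hold all capabilities. */
    if (ioctl(container_fd, VFIO_IOMMU_GET_INFO, &probe))
        return NULL;

    info = calloc(1, probe.argsz);
    info->argsz = probe.argsz;
    if (ioctl(container_fd, VFIO_IOMMU_GET_INFO, info) ||
        !(info->flags & VFIO_IOMMU_INFO_CAPS)) {
        free(info);
        return NULL;
    }

    *out = info;                             /* caller frees */
    for (off = info->cap_offset; off; off = hdr->next) {
        hdr = (struct vfio_info_cap_header *)((char *)info + off);
        if (hdr->id == VFIO_IOMMU_TYPE1_INFO_CAP_MIGRATION)
            return (struct vfio_iommu_type1_info_cap_migration *)hdr;
    }
    return NULL;                             /* no dirty page tracking */
}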




[PATCH Kernel v23 6/8] vfio iommu: Update UNMAP_DMA ioctl to get dirty bitmap before unmap

2020-05-20 Thread Kirti Wankhede
DMA mapped pages, including those pinned by mdev vendor drivers, might
get unpinned and unmapped while migration is active and the device is still
running. For example, in the pre-copy phase, while the guest driver can access
those pages, the host device or vendor driver can dirty these mapped pages.
Such pages should be marked dirty so as to maintain memory consistency
for a user making use of dirty page tracking.

To get the bitmap during unmap, the user should allocate memory for the
bitmap, zero it, set the size of the allocated memory and the page size to be
considered for the bitmap, and set the flag VFIO_DMA_UNMAP_FLAG_GET_DIRTY_BITMAP.

Signed-off-by: Kirti Wankhede 
Reviewed-by: Neo Jia 
Reviewed-by: Cornelia Huck 
---
 drivers/vfio/vfio_iommu_type1.c | 61 +
 include/uapi/linux/vfio.h   | 11 
 2 files changed, 61 insertions(+), 11 deletions(-)

diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
index aa535c84d2e6..948436ad004c 100644
--- a/drivers/vfio/vfio_iommu_type1.c
+++ b/drivers/vfio/vfio_iommu_type1.c
@@ -1018,23 +1018,25 @@ static int verify_bitmap_size(uint64_t npages, uint64_t bitmap_size)
 }
 
 static int vfio_dma_do_unmap(struct vfio_iommu *iommu,
-struct vfio_iommu_type1_dma_unmap *unmap)
+struct vfio_iommu_type1_dma_unmap *unmap,
+struct vfio_bitmap *bitmap)
 {
-   uint64_t mask;
struct vfio_dma *dma, *dma_last = NULL;
-   size_t unmapped = 0;
+   size_t unmapped = 0, pgsize;
int ret = 0, retries = 0;
+   unsigned long pgshift;
 
	mutex_lock(&iommu->lock);
 
-   mask = ((uint64_t)1 << __ffs(iommu->pgsize_bitmap)) - 1;
+   pgshift = __ffs(iommu->pgsize_bitmap);
+   pgsize = (size_t)1 << pgshift;
 
-   if (unmap->iova & mask) {
+   if (unmap->iova & (pgsize - 1)) {
ret = -EINVAL;
goto unlock;
}
 
-   if (!unmap->size || unmap->size & mask) {
+   if (!unmap->size || unmap->size & (pgsize - 1)) {
ret = -EINVAL;
goto unlock;
}
@@ -1045,9 +1047,15 @@ static int vfio_dma_do_unmap(struct vfio_iommu *iommu,
goto unlock;
}
 
-   WARN_ON(mask & PAGE_MASK);
-again:
+   /* When dirty tracking is enabled, allow only min supported pgsize */
+   if ((unmap->flags & VFIO_DMA_UNMAP_FLAG_GET_DIRTY_BITMAP) &&
+   (!iommu->dirty_page_tracking || (bitmap->pgsize != pgsize))) {
+   ret = -EINVAL;
+   goto unlock;
+   }
 
+   WARN_ON((pgsize - 1) & PAGE_MASK);
+again:
/*
 * vfio-iommu-type1 (v1) - User mappings were coalesced together to
 * avoid tracking individual mappings.  This means that the granularity
@@ -1128,6 +1136,14 @@ static int vfio_dma_do_unmap(struct vfio_iommu *iommu,
	mutex_lock(&iommu->lock);
goto again;
}
+
+   if (unmap->flags & VFIO_DMA_UNMAP_FLAG_GET_DIRTY_BITMAP) {
+   ret = update_user_bitmap(bitmap->data, dma,
+unmap->iova, pgsize);
+   if (ret)
+   break;
+   }
+
unmapped += dma->size;
vfio_remove_dma(iommu, dma);
}
@@ -2466,17 +2482,40 @@ static long vfio_iommu_type1_ioctl(void *iommu_data,
 
} else if (cmd == VFIO_IOMMU_UNMAP_DMA) {
struct vfio_iommu_type1_dma_unmap unmap;
-   long ret;
+   struct vfio_bitmap bitmap = { 0 };
+   int ret;
 
minsz = offsetofend(struct vfio_iommu_type1_dma_unmap, size);
 
	if (copy_from_user(&unmap, (void __user *)arg, minsz))
return -EFAULT;
 
-   if (unmap.argsz < minsz || unmap.flags)
+   if (unmap.argsz < minsz ||
+   unmap.flags & ~VFIO_DMA_UNMAP_FLAG_GET_DIRTY_BITMAP)
return -EINVAL;
 
-   ret = vfio_dma_do_unmap(iommu, &unmap);
+   if (unmap.flags & VFIO_DMA_UNMAP_FLAG_GET_DIRTY_BITMAP) {
+   unsigned long pgshift;
+
+   if (unmap.argsz < (minsz + sizeof(bitmap)))
+   return -EINVAL;
+
+   if (copy_from_user(&bitmap,
+  (void __user *)(arg + minsz),
+  sizeof(bitmap)))
+   return -EFAULT;
+
+   if (!access_ok((void __user *)bitmap.data, bitmap.size))
+   return -EINVAL;
+
+   pgshift = __ffs(bitmap.pgsize);
+   ret = verify_bitmap_size(unmap.size >> pgshift,
+   
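
A user-space sketch of the sequence the commit message describes (an
illustration, not part of the patch): append a struct vfio_bitmap to the unmap
request, point it at zeroed, user-allocated memory, and set the
GET_DIRTY_BITMAP flag. The flag and structure layout are the ones proposed in
this series; iova, size and pgsize are caller-chosen and error handling is
abbreviated.

#include <stdlib.h>
#include <sys/ioctl.h>
#include <linux/vfio.h>

static int unmap_and_get_dirty(int container_fd, __u64 iova, __u64 size,
                               __u64 pgsize, __u64 **dirty_out)
{
    size_t argsz = sizeof(struct vfio_iommu_type1_dma_unmap) +
                   sizeof(struct vfio_bitmap);
    struct vfio_iommu_type1_dma_unmap *unmap = calloc(1, argsz);
    struct vfio_bitmap *bitmap = (struct vfio_bitmap *)unmap->data;
    __u64 npages = size / pgsize;
    int ret;

    bitmap->pgsize = pgsize;                   /* minimum supported page size */
    bitmap->size = ((npages + 63) / 64) * 8;   /* one bit per page, u64 aligned */
    bitmap->data = calloc(1, bitmap->size);    /* must be zeroed by the user */

    unmap->argsz = argsz;
    unmap->flags = VFIO_DMA_UNMAP_FLAG_GET_DIRTY_BITMAP;
    unmap->iova = iova;
    unmap->size = size;

    ret = ioctl(container_fd, VFIO_IOMMU_UNMAP_DMA, unmap);
    *dirty_out = bitmap->data;                 /* caller consumes and frees */
    free(unmap);
    return ret;
}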

[PATCH Kernel v23 8/8] vfio: Selective dirty page tracking if IOMMU backed device pins pages

2020-05-20 Thread Kirti Wankhede
Added a check such that only singleton IOMMU groups can pin pages.
From the point when vendor driver pins any pages, consider IOMMU group
dirty page scope to be limited to pinned pages.

To avoid walking the list often, added a flag, pinned_page_dirty_scope, to
indicate whether the dirty page scope of all the vfio_groups for each
vfio_domain in the domain_list is limited to pinned pages. This flag is
updated on the first pinned pages request for that IOMMU group and on
attaching/detaching a group.

Signed-off-by: Kirti Wankhede 
Reviewed-by: Neo Jia 
---
 drivers/vfio/vfio.c |  13 +++--
 drivers/vfio/vfio_iommu_type1.c | 103 +---
 include/linux/vfio.h|   4 +-
 3 files changed, 109 insertions(+), 11 deletions(-)

diff --git a/drivers/vfio/vfio.c b/drivers/vfio/vfio.c
index 765e0e5d83ed..580099afeaff 100644
--- a/drivers/vfio/vfio.c
+++ b/drivers/vfio/vfio.c
@@ -85,6 +85,7 @@ struct vfio_group {
atomic_topened;
wait_queue_head_t   container_q;
boolnoiommu;
+   unsigned intdev_counter;
struct kvm  *kvm;
struct blocking_notifier_head   notifier;
 };
@@ -555,6 +556,7 @@ struct vfio_device *vfio_group_create_device(struct vfio_group *group,
 
	mutex_lock(&group->device_lock);
	list_add(&device->group_next, &group->device_list);
+   group->dev_counter++;
	mutex_unlock(&group->device_lock);
 
return device;
@@ -567,6 +569,7 @@ static void vfio_device_release(struct kref *kref)
struct vfio_group *group = device->group;
 
	list_del(&device->group_next);
+   group->dev_counter--;
	mutex_unlock(&group->device_lock);
 
dev_set_drvdata(device->dev, NULL);
@@ -1945,6 +1948,9 @@ int vfio_pin_pages(struct device *dev, unsigned long *user_pfn, int npage,
if (!group)
return -ENODEV;
 
+   if (group->dev_counter > 1)
+   return -EINVAL;
+
ret = vfio_group_add_container_user(group);
if (ret)
goto err_pin_pages;
@@ -1952,7 +1958,8 @@ int vfio_pin_pages(struct device *dev, unsigned long *user_pfn, int npage,
container = group->container;
driver = container->iommu_driver;
if (likely(driver && driver->ops->pin_pages))
-   ret = driver->ops->pin_pages(container->iommu_data, user_pfn,
+   ret = driver->ops->pin_pages(container->iommu_data,
+group->iommu_group, user_pfn,
 npage, prot, phys_pfn);
else
ret = -ENOTTY;
@@ -2050,8 +2057,8 @@ int vfio_group_pin_pages(struct vfio_group *group,
driver = container->iommu_driver;
if (likely(driver && driver->ops->pin_pages))
ret = driver->ops->pin_pages(container->iommu_data,
-user_iova_pfn, npage,
-prot, phys_pfn);
+group->iommu_group, user_iova_pfn,
+npage, prot, phys_pfn);
else
ret = -ENOTTY;
 
diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
index 9ed240358d5d..e6aa3bd244c5 100644
--- a/drivers/vfio/vfio_iommu_type1.c
+++ b/drivers/vfio/vfio_iommu_type1.c
@@ -73,6 +73,7 @@ struct vfio_iommu {
boolv2;
boolnesting;
booldirty_page_tracking;
+   boolpinned_page_dirty_scope;
 };
 
 struct vfio_domain {
@@ -100,6 +101,7 @@ struct vfio_group {
struct iommu_group  *iommu_group;
struct list_headnext;
boolmdev_group; /* An mdev group */
+   boolpinned_page_dirty_scope;
 };
 
 struct vfio_iova {
@@ -143,6 +145,10 @@ struct vfio_regions {
 
 static int put_pfn(unsigned long pfn, int prot);
 
+static struct vfio_group *vfio_iommu_find_iommu_group(struct vfio_iommu *iommu,
+  struct iommu_group *iommu_group);
+
+static void update_pinned_page_dirty_scope(struct vfio_iommu *iommu);
 /*
  * This code handles mapping and unmapping of user data buffers
  * into DMA'ble space using the IOMMU
@@ -592,11 +598,13 @@ static int vfio_unpin_page_external(struct vfio_dma *dma, dma_addr_t iova,
 }
 
 static int vfio_iommu_type1_pin_pages(void *iommu_data,
+ struct iommu_group *iommu_group,
  unsigned long *user_pfn,
  int npage, int prot,
  unsigned long *phys_pfn)
 {
struct vfio_iommu *iommu = iommu_data;
+   stru

[PATCH Kernel v23 2/8] vfio iommu: Remove atomicity of ref_count of pinned pages

2020-05-20 Thread Kirti Wankhede
vfio_pfn.ref_count is always updated while holding iommu->lock, so using an
atomic variable is overkill.

Signed-off-by: Kirti Wankhede 
Reviewed-by: Neo Jia 
Reviewed-by: Eric Auger 
Reviewed-by: Cornelia Huck 
---
 drivers/vfio/vfio_iommu_type1.c | 9 +
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
index a0c60f895b24..fa735047b04d 100644
--- a/drivers/vfio/vfio_iommu_type1.c
+++ b/drivers/vfio/vfio_iommu_type1.c
@@ -112,7 +112,7 @@ struct vfio_pfn {
struct rb_node  node;
dma_addr_t  iova;   /* Device address */
unsigned long   pfn;/* Host pfn */
-   atomic_tref_count;
+   unsigned intref_count;
 };
 
 struct vfio_regions {
@@ -233,7 +233,7 @@ static int vfio_add_to_pfn_list(struct vfio_dma *dma, dma_addr_t iova,
 
vpfn->iova = iova;
vpfn->pfn = pfn;
-   atomic_set(&vpfn->ref_count, 1);
+   vpfn->ref_count = 1;
vfio_link_pfn(dma, vpfn);
return 0;
 }
@@ -251,7 +251,7 @@ static struct vfio_pfn *vfio_iova_get_vfio_pfn(struct vfio_dma *dma,
struct vfio_pfn *vpfn = vfio_find_vpfn(dma, iova);
 
if (vpfn)
-   atomic_inc(&vpfn->ref_count);
+   vpfn->ref_count++;
return vpfn;
 }
 
@@ -259,7 +259,8 @@ static int vfio_iova_put_vfio_pfn(struct vfio_dma *dma, struct vfio_pfn *vpfn)
 {
int ret = 0;
 
-   if (atomic_dec_and_test(&vpfn->ref_count)) {
+   vpfn->ref_count--;
+   if (!vpfn->ref_count) {
ret = put_pfn(vpfn->pfn, dma->prot);
vfio_remove_from_pfn_list(dma, vpfn);
}
-- 
2.7.0




[PATCH Kernel v23 4/8] vfio iommu: Add ioctl definition for dirty pages tracking

2020-05-20 Thread Kirti Wankhede
The IOMMU container maintains a list of all pages pinned by the vfio_pin_pages
API. All pages pinned by a vendor driver through this API should be considered
as dirty during migration. When the container consists of an IOMMU capable
device and all pages are pinned and mapped, then all pages are marked dirty.
Added support to start/stop dirty pages tracking and to get the bitmap of all
dirtied pages for the requested IO virtual address range.

Signed-off-by: Kirti Wankhede 
Reviewed-by: Neo Jia 
---
 include/uapi/linux/vfio.h | 56 +++
 1 file changed, 56 insertions(+)

diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h
index ad9bb5af3463..c33a641c2a3d 100644
--- a/include/uapi/linux/vfio.h
+++ b/include/uapi/linux/vfio.h
@@ -1033,6 +1033,12 @@ struct vfio_iommu_type1_dma_map {
 
 #define VFIO_IOMMU_MAP_DMA _IO(VFIO_TYPE, VFIO_BASE + 13)
 
+struct vfio_bitmap {
+   __u64pgsize;/* page size for bitmap in bytes */
+   __u64size;  /* in bytes */
+   __u64 __user *data; /* one bit per page */
+};
+
 /**
  * VFIO_IOMMU_UNMAP_DMA - _IOWR(VFIO_TYPE, VFIO_BASE + 14,
  * struct vfio_dma_unmap)
@@ -1059,6 +1065,56 @@ struct vfio_iommu_type1_dma_unmap {
 #define VFIO_IOMMU_ENABLE  _IO(VFIO_TYPE, VFIO_BASE + 15)
 #define VFIO_IOMMU_DISABLE _IO(VFIO_TYPE, VFIO_BASE + 16)
 
+/**
+ * VFIO_IOMMU_DIRTY_PAGES - _IOWR(VFIO_TYPE, VFIO_BASE + 17,
+ * struct vfio_iommu_type1_dirty_bitmap)
+ * IOCTL is used for dirty pages logging.
+ * Caller should set flag depending on which operation to perform, details as
+ * below:
+ *
+ * Calling the IOCTL with VFIO_IOMMU_DIRTY_PAGES_FLAG_START flag set, instructs
+ * the IOMMU driver to log pages that are dirtied or potentially dirtied by
+ * device; designed to be used when a migration is in progress. Dirty pages are
+ * logged until logging is disabled by user application by calling the IOCTL
+ * with VFIO_IOMMU_DIRTY_PAGES_FLAG_STOP flag.
+ *
+ * Calling the IOCTL with VFIO_IOMMU_DIRTY_PAGES_FLAG_STOP flag set, instructs
+ * the IOMMU driver to stop logging dirtied pages.
+ *
+ * Calling the IOCTL with VFIO_IOMMU_DIRTY_PAGES_FLAG_GET_BITMAP flag set
+ * returns the dirty pages bitmap for IOMMU container for a given IOVA range.
+ * User must specify the IOVA range and the pgsize through the structure
+ * vfio_iommu_type1_dirty_bitmap_get in the data[] portion. This interface
+ * supports to get bitmap of smallest supported pgsize only and can be modified
+ * in future to get bitmap of specified pgsize. The user must provide a zeroed
+ * memory area for the bitmap memory and specify its size in bitmap.size.
+ * One bit is used to represent one page consecutively starting from iova
+ * offset. The user should provide page size in bitmap.pgsize field. A bit set
+ * in the bitmap indicates that the page at that offset from iova is dirty.
+ * The caller must set argsz to a value including the size of structure
+ * vfio_iommu_type1_dirty_bitmap_get, but excluding the size of the actual
+ * bitmap. If dirty pages logging is not enabled, an error will be returned.
+ *
+ * Only one of the flags _START, _STOP and _GET may be specified at a time.
+ *
+ */
+struct vfio_iommu_type1_dirty_bitmap {
+   __u32argsz;
+   __u32flags;
+#define VFIO_IOMMU_DIRTY_PAGES_FLAG_START  (1 << 0)
+#define VFIO_IOMMU_DIRTY_PAGES_FLAG_STOP   (1 << 1)
+#define VFIO_IOMMU_DIRTY_PAGES_FLAG_GET_BITMAP (1 << 2)
+   __u8 data[];
+};
+
+struct vfio_iommu_type1_dirty_bitmap_get {
+   __u64  iova;/* IO virtual address */
+   __u64  size;/* Size of iova range */
+   struct vfio_bitmap bitmap;
+};
+
+#define VFIO_IOMMU_DIRTY_PAGES _IO(VFIO_TYPE, VFIO_BASE + 17)
+
 /*  Additional API for SPAPR TCE (Server POWERPC) IOMMU  */
 
 /*
-- 
2.7.0
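
A user-space sketch (an assumption, not taken from the patch) of a GET_BITMAP
request as documented above: argsz covers the header plus the _get structure
but excludes the bitmap memory itself, and bitmap.data points to zeroed,
user-allocated memory. Error handling is abbreviated.

#include <stdlib.h>
#include <sys/ioctl.h>
#include <linux/vfio.h>

static int get_dirty_bitmap(int container_fd, __u64 iova, __u64 size,
                            __u64 pgsize, __u64 *data, __u64 data_bytes)
{
    size_t argsz = sizeof(struct vfio_iommu_type1_dirty_bitmap) +
                   sizeof(struct vfio_iommu_type1_dirty_bitmap_get);
    struct vfio_iommu_type1_dirty_bitmap *db = calloc(1, argsz);
    struct vfio_iommu_type1_dirty_bitmap_get *range =
        (struct vfio_iommu_type1_dirty_bitmap_get *)db->data;
    int ret;

    db->argsz = argsz;
    db->flags = VFIO_IOMMU_DIRTY_PAGES_FLAG_GET_BITMAP;
    range->iova = iova;
    range->size = size;
    range->bitmap.pgsize = pgsize;     /* must be the smallest supported pgsize */
    range->bitmap.size = data_bytes;
    range->bitmap.data = data;         /* zeroed, user-allocated bitmap memory */

    ret = ioctl(container_fd, VFIO_IOMMU_DIRTY_PAGES, db);
    free(db);
    return ret;
}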




[PATCH Kernel v23 3/8] vfio iommu: Cache pgsize_bitmap in struct vfio_iommu

2020-05-20 Thread Kirti Wankhede
Calculate and cache pgsize_bitmap when iommu->domain_list is updated
and iommu->external_domain is set for mdev device.
Add iommu->lock protection when cached pgsize_bitmap is accessed.

Signed-off-by: Kirti Wankhede 
Reviewed-by: Neo Jia 
Reviewed-by: Cornelia Huck 
---
 drivers/vfio/vfio_iommu_type1.c | 88 +++--
 1 file changed, 49 insertions(+), 39 deletions(-)

diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
index fa735047b04d..d41bc7557adb 100644
--- a/drivers/vfio/vfio_iommu_type1.c
+++ b/drivers/vfio/vfio_iommu_type1.c
@@ -69,6 +69,7 @@ struct vfio_iommu {
struct rb_root  dma_list;
struct blocking_notifier_head notifier;
unsigned intdma_avail;
+   uint64_tpgsize_bitmap;
boolv2;
boolnesting;
 };
@@ -805,15 +806,14 @@ static void vfio_remove_dma(struct vfio_iommu *iommu, struct vfio_dma *dma)
iommu->dma_avail++;
 }
 
-static unsigned long vfio_pgsize_bitmap(struct vfio_iommu *iommu)
+static void vfio_update_pgsize_bitmap(struct vfio_iommu *iommu)
 {
struct vfio_domain *domain;
-   unsigned long bitmap = ULONG_MAX;
 
-   mutex_lock(&iommu->lock);
+   iommu->pgsize_bitmap = ULONG_MAX;
+
	list_for_each_entry(domain, &iommu->domain_list, next)
-   bitmap &= domain->domain->pgsize_bitmap;
-   mutex_unlock(&iommu->lock);
+   iommu->pgsize_bitmap &= domain->domain->pgsize_bitmap;
 
/*
 * In case the IOMMU supports page sizes smaller than PAGE_SIZE
@@ -823,12 +823,10 @@ static unsigned long vfio_pgsize_bitmap(struct vfio_iommu *iommu)
 * granularity while iommu driver can use the sub-PAGE_SIZE size
 * to map the buffer.
 */
-   if (bitmap & ~PAGE_MASK) {
-   bitmap &= PAGE_MASK;
-   bitmap |= PAGE_SIZE;
+   if (iommu->pgsize_bitmap & ~PAGE_MASK) {
+   iommu->pgsize_bitmap &= PAGE_MASK;
+   iommu->pgsize_bitmap |= PAGE_SIZE;
}
-
-   return bitmap;
 }
 
 static int vfio_dma_do_unmap(struct vfio_iommu *iommu,
@@ -839,19 +837,28 @@ static int vfio_dma_do_unmap(struct vfio_iommu *iommu,
size_t unmapped = 0;
int ret = 0, retries = 0;
 
-   mask = ((uint64_t)1 << __ffs(vfio_pgsize_bitmap(iommu))) - 1;
+   mutex_lock(&iommu->lock);
+
+   mask = ((uint64_t)1 << __ffs(iommu->pgsize_bitmap)) - 1;
+
+   if (unmap->iova & mask) {
+   ret = -EINVAL;
+   goto unlock;
+   }
+
+   if (!unmap->size || unmap->size & mask) {
+   ret = -EINVAL;
+   goto unlock;
+   }
 
-   if (unmap->iova & mask)
-   return -EINVAL;
-   if (!unmap->size || unmap->size & mask)
-   return -EINVAL;
if (unmap->iova + unmap->size - 1 < unmap->iova ||
-   unmap->size > SIZE_MAX)
-   return -EINVAL;
+   unmap->size > SIZE_MAX) {
+   ret = -EINVAL;
+   goto unlock;
+   }
 
WARN_ON(mask & PAGE_MASK);
 again:
-   mutex_lock(&iommu->lock);
 
/*
 * vfio-iommu-type1 (v1) - User mappings were coalesced together to
@@ -930,6 +937,7 @@ static int vfio_dma_do_unmap(struct vfio_iommu *iommu,
	blocking_notifier_call_chain(&iommu->notifier,
	VFIO_IOMMU_NOTIFY_DMA_UNMAP,
	&nb_unmap);
+   mutex_lock(&iommu->lock);
goto again;
}
unmapped += dma->size;
@@ -1045,24 +1053,28 @@ static int vfio_dma_do_map(struct vfio_iommu *iommu,
if (map->size != size || map->vaddr != vaddr || map->iova != iova)
return -EINVAL;
 
-   mask = ((uint64_t)1 << __ffs(vfio_pgsize_bitmap(iommu))) - 1;
-
-   WARN_ON(mask & PAGE_MASK);
-
/* READ/WRITE from device perspective */
if (map->flags & VFIO_DMA_MAP_FLAG_WRITE)
prot |= IOMMU_WRITE;
if (map->flags & VFIO_DMA_MAP_FLAG_READ)
prot |= IOMMU_READ;
 
-   if (!prot || !size || (size | iova | vaddr) & mask)
-   return -EINVAL;
+   mutex_lock(&iommu->lock);
 
-   /* Don't allow IOVA or virtual address wrap */
-   if (iova + size - 1 < iova || vaddr + size - 1 < vaddr)
-   return -EINVAL;
+   mask = ((uint64_t)1 << __ffs(iommu->pgsize_bitmap)) - 1;
 
-   mutex_lock(&iommu->lock);
+   WARN_ON(mask & PAGE_MASK);
+
+   if (!prot || !size || (size | iova | vaddr) & mask) {
+   ret = -EINVAL;
+   goto out_unlock;
+   }
+
+   /* Don't allow IOVA or vir

[PATCH Kernel v23 5/8] vfio iommu: Implementation of ioctl for dirty pages tracking

2020-05-20 Thread Kirti Wankhede
VFIO_IOMMU_DIRTY_PAGES ioctl performs three operations:
- Start dirty pages tracking while migration is active
- Stop dirty pages tracking.
- Get dirty pages bitmap. It is the user space application's responsibility to
  copy the content of dirty pages from source to destination during migration.

To prevent a DoS attack, memory for the bitmap is allocated per vfio_dma
structure. Bitmap size is calculated considering the smallest supported page
size. The bitmap is allocated for all vfio_dmas when dirty logging is enabled.

The bitmap is populated for already pinned pages when it is allocated for
a vfio_dma, using the smallest supported page size. The bitmap is updated from
the pinning functions when tracking is enabled. When the user application
queries the bitmap, check whether the requested page size is the same as the
page size used to populate the bitmap. If it is equal, copy the bitmap;
otherwise return an error.

Signed-off-by: Kirti Wankhede 
Reviewed-by: Neo Jia 

Fixed error reported by build bot by changing pgsize type from uint64_t
to size_t.
Reported-by: kbuild test robot 
---
 drivers/vfio/vfio_iommu_type1.c | 313 +++-
 1 file changed, 307 insertions(+), 6 deletions(-)

diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
index d41bc7557adb..aa535c84d2e6 100644
--- a/drivers/vfio/vfio_iommu_type1.c
+++ b/drivers/vfio/vfio_iommu_type1.c
@@ -72,6 +72,7 @@ struct vfio_iommu {
uint64_tpgsize_bitmap;
boolv2;
boolnesting;
+   booldirty_page_tracking;
 };
 
 struct vfio_domain {
@@ -92,6 +93,7 @@ struct vfio_dma {
boollock_cap;   /* capable(CAP_IPC_LOCK) */
struct task_struct  *task;
struct rb_root  pfn_list;   /* Ex-user pinned pfn list */
+   unsigned long   *bitmap;
 };
 
 struct vfio_group {
@@ -126,6 +128,19 @@ struct vfio_regions {
 #define IS_IOMMU_CAP_DOMAIN_IN_CONTAINER(iommu)\
	(!list_empty(&iommu->domain_list))
 
+#define DIRTY_BITMAP_BYTES(n)  (ALIGN(n, BITS_PER_TYPE(u64)) / BITS_PER_BYTE)
+
+/*
+ * Input argument of number of bits to bitmap_set() is unsigned integer, which
+ * further casts to signed integer for unaligned multi-bit operation,
+ * __bitmap_set().
+ * Then maximum bitmap size supported is 2^31 bits divided by 2^3 bits/byte,
+ * that is 2^28 (256 MB) which maps to 2^31 * 2^12 = 2^43 (8TB) on 4K page
+ * system.
+ */
+#define DIRTY_BITMAP_PAGES_MAX  ((u64)INT_MAX)
+#define DIRTY_BITMAP_SIZE_MAX   DIRTY_BITMAP_BYTES(DIRTY_BITMAP_PAGES_MAX)
+
 static int put_pfn(unsigned long pfn, int prot);
 
 /*
@@ -176,6 +191,80 @@ static void vfio_unlink_dma(struct vfio_iommu *iommu, struct vfio_dma *old)
	rb_erase(&old->node, &iommu->dma_list);
 }
 
+
+static int vfio_dma_bitmap_alloc(struct vfio_dma *dma, size_t pgsize)
+{
+   uint64_t npages = dma->size / pgsize;
+
+   if (npages > DIRTY_BITMAP_PAGES_MAX)
+   return -EINVAL;
+
+   /*
+* Allocate extra 64 bits that are used to calculate shift required for
+* bitmap_shift_left() to manipulate and club unaligned number of pages
+* in adjacent vfio_dma ranges.
+*/
+   dma->bitmap = kvzalloc(DIRTY_BITMAP_BYTES(npages) + sizeof(u64),
+  GFP_KERNEL);
+   if (!dma->bitmap)
+   return -ENOMEM;
+
+   return 0;
+}
+
+static void vfio_dma_bitmap_free(struct vfio_dma *dma)
+{
+   kfree(dma->bitmap);
+   dma->bitmap = NULL;
+}
+
+static void vfio_dma_populate_bitmap(struct vfio_dma *dma, size_t pgsize)
+{
+   struct rb_node *p;
+
+   for (p = rb_first(&dma->pfn_list); p; p = rb_next(p)) {
+   struct vfio_pfn *vpfn = rb_entry(p, struct vfio_pfn, node);
+
+   bitmap_set(dma->bitmap, (vpfn->iova - dma->iova) / pgsize, 1);
+   }
+}
+
+static int vfio_dma_bitmap_alloc_all(struct vfio_iommu *iommu, size_t pgsize)
+{
+   struct rb_node *n;
+
+   for (n = rb_first(&iommu->dma_list); n; n = rb_next(n)) {
+   struct vfio_dma *dma = rb_entry(n, struct vfio_dma, node);
+   int ret;
+
+   ret = vfio_dma_bitmap_alloc(dma, pgsize);
+   if (ret) {
+   struct rb_node *p;
+
+   for (p = rb_prev(n); p; p = rb_prev(p)) {
+   struct vfio_dma *dma = rb_entry(p,
+   struct vfio_dma, node);
+
+   vfio_dma_bitmap_free(dma);
+   }
+   return ret;
+   }
+   vfio_dma_populate_bitmap(dma, pgsize);
+   }
+   return 0;
+}
+
+static void vfio_dma_bitmap_free_all(struct vfio_iommu *iommu)
+{
+   struct rb_node *n;
+
+   for (n = rb_first(&iommu->dma_list); n; n = rb_next(n)) {
+   struct vfio
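
As a quick worked check of the sizing described above (assuming a 4 KiB
minimum IOMMU page size): DIRTY_BITMAP_PAGES_MAX is INT_MAX, roughly 2^31
pages, so the largest per-vfio_dma bitmap is about 2^31 / 8 = 2^28 bytes
(256 MB), which covers 2^31 * 4 KiB = 8 TB of IOVA space, matching the comment
in the patch. A single 1 GiB vfio_dma needs 1 GiB / 4 KiB = 262144 bits, i.e.
32 KiB of bitmap, plus the extra u64 allocated for the bitmap_shift_left()
handling of unaligned adjacent ranges.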

[PATCH Kernel v23 0/8] Add UAPIs to support migration for VFIO devices

2020-05-20 Thread Kirti Wankhede
rty pages bitmap is queried from IOMMU container rather than from
  vendor driver for per device. Added 2 ioctls to achieve this.

v7 -> v8:
- Updated comments for KABI
- Added BAR address validation check during PCI device's config space load
  as suggested by Dr. David Alan Gilbert.
- Changed vfio_migration_set_state() to set or clear device state flags.
- Some nit fixes.

v6 -> v7:
- Fix build failures.

v5 -> v6:
- Fix build failure.

v4 -> v5:
- Added descriptive comment about the sequence of access of members of
  structure vfio_device_migration_info to be followed based on Alex's
  suggestion
- Updated get dirty pages sequence.
- As per Cornelia Huck's suggestion, added callbacks to VFIODeviceOps to
  get_object, save_config and load_config.
- Fixed multiple nit picks.
- Tested live migration with multiple vfio device assigned to a VM.

v3 -> v4:
- Added one more bit for _RESUMING flag to be set explicitly.
- data_offset field is read-only for user space application.
- data_size is read for every iteration before reading data from migration,
  that is removed assumption that data will be till end of migration
  region.
- If vendor driver supports mappable sparsed region, map those region
  during setup state of save/load, similarly unmap those from cleanup
  routines.
- Handles race condition that causes data corruption in migration region
  during save device state by adding mutex and serializing save_buffer and
  get_dirty_pages routines.
- Skip called get_dirty_pages routine for mapped MMIO region of device.
- Added trace events.
- Split into multiple functional patches.

v2 -> v3:
- Removed enum of VFIO device states. Defined VFIO device state with 2
  bits.
- Re-structured vfio_device_migration_info to keep it minimal and defined
  action on read and write access on its members.

v1 -> v2:
- Defined MIGRATION region type and sub-type which should be used with
  region type capability.
- Re-structured vfio_device_migration_info. This structure will be placed
  at 0th offset of migration region.
- Replaced ioctl with read/write for trapped part of migration region.
- Added both type of access support, trapped or mmapped, for data section
  of the region.
- Moved PCI device functions to pci file.
- Added iteration to get dirty page bitmap until bitmap for all requested
  pages are copied.

Thanks,
Kirti


Kirti Wankhede (8):
  vfio: UAPI for migration interface for device state
  vfio iommu: Remove atomicity of ref_count of pinned pages
  vfio iommu: Cache pgsize_bitmap in struct vfio_iommu
  vfio iommu: Add ioctl definition for dirty pages tracking
  vfio iommu: Implementation of ioctl for dirty pages tracking
  vfio iommu: Update UNMAP_DMA ioctl to get dirty bitmap before unmap
  vfio iommu: Add migration capability to report supported features
  vfio: Selective dirty page tracking if IOMMU backed device pins pages

 drivers/vfio/vfio.c |  13 +-
 drivers/vfio/vfio_iommu_type1.c | 571 
 include/linux/vfio.h|   4 +-
 include/uapi/linux/vfio.h   | 318 ++
 4 files changed, 847 insertions(+), 59 deletions(-)

-- 
2.7.0




[PATCH Kernel v23 1/8] vfio: UAPI for migration interface for device state

2020-05-20 Thread Kirti Wankhede
- Defined MIGRATION region type and sub-type.

- Defined vfio_device_migration_info structure which will be placed at the
  0th offset of migration region to get/set VFIO device related
  information. Defined members of structure and usage on read/write access.

- Defined device states and state transition details.

- Defined sequence to be followed while saving and resuming VFIO device.

Signed-off-by: Kirti Wankhede 
Reviewed-by: Neo Jia 
---
 include/uapi/linux/vfio.h | 228 ++
 1 file changed, 228 insertions(+)

diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h
index 015516bcfaa3..ad9bb5af3463 100644
--- a/include/uapi/linux/vfio.h
+++ b/include/uapi/linux/vfio.h
@@ -305,6 +305,7 @@ struct vfio_region_info_cap_type {
 #define VFIO_REGION_TYPE_PCI_VENDOR_MASK   (0x)
 #define VFIO_REGION_TYPE_GFX(1)
 #define VFIO_REGION_TYPE_CCW   (2)
+#define VFIO_REGION_TYPE_MIGRATION  (3)
 
 /* sub-types for VFIO_REGION_TYPE_PCI_* */
 
@@ -379,6 +380,233 @@ struct vfio_region_gfx_edid {
 /* sub-types for VFIO_REGION_TYPE_CCW */
 #define VFIO_REGION_SUBTYPE_CCW_ASYNC_CMD  (1)
 
+/* sub-types for VFIO_REGION_TYPE_MIGRATION */
+#define VFIO_REGION_SUBTYPE_MIGRATION   (1)
+
+/*
+ * The structure vfio_device_migration_info is placed at the 0th offset of
+ * the VFIO_REGION_SUBTYPE_MIGRATION region to get and set VFIO device related
+ * migration information. Field accesses from this structure are only supported
+ * at their native width and alignment. Otherwise, the result is undefined and
+ * vendor drivers should return an error.
+ *
+ * device_state: (read/write)
+ *  - The user application writes to this field to inform the vendor driver
+ *about the device state to be transitioned to.
+ *  - The vendor driver should take the necessary actions to change the
+ *device state. After successful transition to a given state, the
+ *vendor driver should return success on write(device_state, state)
+ *system call. If the device state transition fails, the vendor driver
+ *should return an appropriate -errno for the fault condition.
+ *  - On the user application side, if the device state transition fails,
+ *   that is, if write(device_state, state) returns an error, read
+ *   device_state again to determine the current state of the device from
+ *   the vendor driver.
+ *  - The vendor driver should return previous state of the device unless
+ *the vendor driver has encountered an internal error, in which case
+ *the vendor driver may report the device_state VFIO_DEVICE_STATE_ERROR.
+ *  - The user application must use the device reset ioctl to recover the
+ *device from VFIO_DEVICE_STATE_ERROR state. If the device is
+ *indicated to be in a valid device state by reading device_state, the
+ *user application may attempt to transition the device to any valid
+ *state reachable from the current state or terminate itself.
+ *
+ *  device_state consists of 3 bits:
+ *  - If bit 0 is set, it indicates the _RUNNING state. If bit 0 is clear,
+ *it indicates the _STOP state. When the device state is changed to
+ *_STOP, driver should stop the device before write() returns.
+ *  - If bit 1 is set, it indicates the _SAVING state, which means that the
+ *driver should start gathering device state information that will be
+ *provided to the VFIO user application to save the device's state.
+ *  - If bit 2 is set, it indicates the _RESUMING state, which means that
+ *the driver should prepare to resume the device. Data provided through
+ *the migration region should be used to resume the device.
+ *  Bits 3 - 31 are reserved for future use. To preserve them, the user
+ *  application should perform a read-modify-write operation on this
+ *  field when modifying the specified bits.
+ *
+ *  +--- _RESUMING
+ *  |+-- _SAVING
+ *  ||+- _RUNNING
+ *  |||
+ *  000b => Device Stopped, not saving or resuming
+ *  001b => Device running, which is the default state
+ *  010b => Stop the device & save the device state, stop-and-copy state
+ *  011b => Device running and save the device state, pre-copy state
+ *  100b => Device stopped and the device state is resuming
+ *  101b => Invalid state
+ *  110b => Error state
+ *  111b => Invalid state
+ *
+ * State transitions:
+ *
+ *  _RESUMING  _RUNNINGPre-copyStop-and-copy   _STOP
+ *(100b) (001b) (011b)(010b)   (000b)
+ * 0. Running or default state
+ * |
+ *
+ * 1. Normal Shutdown (optional)
+ * |->|
+ *
+ * 2. Save the state or suspend
+ * |--

Re: [PATCH Kernel v22 7/8] vfio iommu: Add migration capability to report supported features

2020-05-20 Thread Kirti Wankhede




On 5/20/2020 4:12 PM, Cornelia Huck wrote:

On Mon, 18 May 2020 11:26:36 +0530
Kirti Wankhede  wrote:


Added migration capability in IOMMU info chain.
User application should check IOMMU info chain for migration capability
to use dirty page tracking feature provided by kernel module.
User application must check page sizes supported and maximum dirty
bitmap size returned by this capability structure for ioctls used to get
dirty bitmap.

Signed-off-by: Kirti Wankhede 
---
  drivers/vfio/vfio_iommu_type1.c | 23 ++-
  include/uapi/linux/vfio.h   | 22 ++
  2 files changed, 44 insertions(+), 1 deletion(-)


(...)


diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h
index a1dd2150971e..aa8aa9dcf02a 100644
--- a/include/uapi/linux/vfio.h
+++ b/include/uapi/linux/vfio.h
@@ -1013,6 +1013,28 @@ struct vfio_iommu_type1_info_cap_iova_range {
struct  vfio_iova_range iova_ranges[];
  };
  
+/*

+ * The migration capability allows to report supported features for migration.
+ *
+ * The structures below define version 1 of this capability.
+ *
+ * The existence of this capability indicates IOMMU kernel driver supports


s/indicates/indicates that/


+ * dirty page tracking.
+ *
+ * pgsize_bitmap: Kernel driver returns supported page sizes bitmap for dirty
+ * page tracking.


"bitmap of supported page sizes for dirty page tracking" ?


+ * max_dirty_bitmap_size: Kernel driver returns maximum supported dirty bitmap
+ * size in bytes to be used by user application for ioctls to get dirty bitmap.


"maximum supported dirty bitmap size in bytes that can be used by user
applications when getting the dirty bitmap" ?



Done.


+ */
+#define VFIO_IOMMU_TYPE1_INFO_CAP_MIGRATION  1
+
+struct vfio_iommu_type1_info_cap_migration {
+   struct  vfio_info_cap_header header;
+   __u32   flags;
+   __u64   pgsize_bitmap;
+   __u64   max_dirty_bitmap_size;  /* in bytes */
+};
+
  #define VFIO_IOMMU_GET_INFO _IO(VFIO_TYPE, VFIO_BASE + 12)
  
  /**


Reviewed-by: Cornelia Huck 



Thanks.

Kirti



Re: [PATCH Kernel v22 6/8] vfio iommu: Update UNMAP_DMA ioctl to get dirty bitmap before unmap

2020-05-20 Thread Kirti Wankhede




On 5/20/2020 3:57 PM, Cornelia Huck wrote:

On Tue, 19 May 2020 12:24:13 +0530
Kirti Wankhede  wrote:


DMA mapped pages, including those pinned by mdev vendor drivers, might
get unpinned and unmapped while migration is active and device is still
running. For example, in pre-copy phase while guest driver could access
those pages, host device or vendor driver can dirty these mapped pages.
Such pages should be marked dirty so as to maintain memory consistency
for a user making use of dirty page tracking.

To get bitmap during unmap, user should allocate memory for bitmap, set
it all zeros, set size of allocated memory, set page size to be
considered for bitmap and set flag VFIO_DMA_UNMAP_FLAG_GET_DIRTY_BITMAP.

Signed-off-by: Kirti Wankhede 
Reviewed-by: Neo Jia 
---
  drivers/vfio/vfio_iommu_type1.c | 62 +
  include/uapi/linux/vfio.h   | 10 +++
  2 files changed, 61 insertions(+), 11 deletions(-)


(...)


@@ -1085,6 +1093,7 @@ static int vfio_dma_do_unmap(struct vfio_iommu *iommu,
ret = -EINVAL;
goto unlock;
}
+


Nit: unrelated whitespace change.


dma = vfio_find_dma(iommu, unmap->iova + unmap->size - 1, 0);
if (dma && dma->iova + dma->size != unmap->iova + unmap->size) {
ret = -EINVAL;


(...)


diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h
index 4850c1fef1f8..a1dd2150971e 100644
--- a/include/uapi/linux/vfio.h
+++ b/include/uapi/linux/vfio.h
@@ -1048,12 +1048,22 @@ struct vfio_bitmap {
   * field.  No guarantee is made to the user that arbitrary unmaps of iova
   * or size different from those used in the original mapping call will
   * succeed.
+ * VFIO_DMA_UNMAP_FLAG_GET_DIRTY_BITMAP should be set to get dirty bitmap


s/dirty bitmap/the dirty bitmap/


+ * before unmapping IO virtual addresses. When this flag is set, user must


s/user/the user/


+ * provide data[] as structure vfio_bitmap. User must allocate memory to get


"provide a struct vfio_bitmap in data[]" ?



+ * bitmap, zero the bitmap memory and must set size of allocated memory in
+ * vfio_bitmap.size field.


"The user must provide zero-allocated memory via vfio_bitmap.data and
its size in the vfio_bitmap.size field." ?



A bit in bitmap represents one page of user provided


s/bitmap/the bitmap/


+ * page size in 'pgsize', consecutively starting from iova offset. Bit set


s/Bit set/A set bit/


+ * indicates page at that offset from iova is dirty. Bitmap of pages in the


s/indicates page/indicates that the page/


+ * range of unmapped size is returned in vfio_bitmap.data


"A bitmap of the pages in the range of the unmapped size is returned in
the user-provided vfio_bitmap.data." ?


   */
  struct vfio_iommu_type1_dma_unmap {
__u32   argsz;
__u32   flags;
+#define VFIO_DMA_UNMAP_FLAG_GET_DIRTY_BITMAP (1 << 0)
__u64   iova;   /* IO virtual address */
__u64   size;   /* Size of mapping (bytes) */
+   __u8data[];
  };
  
  #define VFIO_IOMMU_UNMAP_DMA _IO(VFIO_TYPE, VFIO_BASE + 14)


With the nits addressed,


Done.


Reviewed-by: Cornelia Huck 



Thanks.

Kirti



Re: [PATCH Kernel v22 3/8] vfio iommu: Cache pgsize_bitmap in struct vfio_iommu

2020-05-20 Thread Kirti Wankhede




On 5/20/2020 3:38 PM, Cornelia Huck wrote:

On Mon, 18 May 2020 11:26:32 +0530
Kirti Wankhede  wrote:


Calculate and cache pgsize_bitmap when iommu->domain_list is updated
and iommu->external_domain is set for mdev device.
Add iommu->lock protection when cached pgsize_bitmap is accessed.

Signed-off-by: Kirti Wankhede 
Reviewed-by: Neo Jia 
---
  drivers/vfio/vfio_iommu_type1.c | 88 +++--
  1 file changed, 49 insertions(+), 39 deletions(-)



(...)


@@ -805,15 +806,14 @@ static void vfio_remove_dma(struct vfio_iommu *iommu, 
struct vfio_dma *dma)
iommu->dma_avail++;
  }
  
-static unsigned long vfio_pgsize_bitmap(struct vfio_iommu *iommu)

+static void vfio_pgsize_bitmap(struct vfio_iommu *iommu)


Minor nit: I'd have renamed this function to
vfio_update_pgsize_bitmap().



Done.


  {
struct vfio_domain *domain;
-   unsigned long bitmap = ULONG_MAX;
  
-	mutex_lock(&iommu->lock);

+   iommu->pgsize_bitmap = ULONG_MAX;
+
	list_for_each_entry(domain, &iommu->domain_list, next)
-   bitmap &= domain->domain->pgsize_bitmap;
-   mutex_unlock(&iommu->lock);
+   iommu->pgsize_bitmap &= domain->domain->pgsize_bitmap;
  
  	/*

 * In case the IOMMU supports page sizes smaller than PAGE_SIZE


(...)

Reviewed-by: Cornelia Huck 



Thanks.

Kirti



Re: [PATCH Kernel v22 0/8] Add UAPIs to support migration for VFIO devices

2020-05-20 Thread Kirti Wankhede




On 5/20/2020 8:25 AM, Yan Zhao wrote:

On Tue, May 19, 2020 at 10:58:04AM -0600, Alex Williamson wrote:

Hi folks,

My impression is that we're getting pretty close to a workable
implementation here with v22 plus respins of patches 5, 6, and 8.  We
also have a matching QEMU series and a proposal for a new i40e
consumer, as well as I assume GVT-g updates happening internally at
Intel.  I expect all of the latter needs further review and discussion,
but we should be at the point where we can validate these proposed
kernel interfaces.  Therefore I'd like to make a call for reviews so
that we can get this wrapped up for the v5.8 merge window.  I know
Connie has some outstanding documentation comments and I'd like to make
sure everyone has an opportunity to check that their comments have been
addressed and we don't discover any new blocking issues.  Please send
your Acked-by/Reviewed-by/Tested-by tags if you're satisfied with this
interface and implementation.  Thanks!


hi Alex and Kirti,
after porting to qemu v22 and kernel v22, it is found out that
it can not even pass basic live migration test with error like

"Failed to get dirty bitmap for iova: 0xca000 size: 0x3000 err: 22"



Thanks for testing, Yan.
I think the last-moment change below caused this failure:

https://lore.kernel.org/kvm/1589871178-8282-1-git-send-email-kwankh...@nvidia.com/

>if (dma->iova > iova + size)
>break;

Surprisingly, with my basic testing with 2G of system memory QEMU didn't raise 
an abort on g_free, but I do hit this with larger system memory.
With the above change, that function iterated through the next vfio_dma as well. 
The check should be as below:


-   if (dma->iova > iova + size)
+   if (dma->iova > iova + size -1)
break;

Another fix is in QEMU.
https://lists.gnu.org/archive/html/qemu-devel/2020-05/msg04751.html

> > +range->bitmap.size = ROUND_UP(pages, 64) / 8;
>
> ROUND_UP(npages/8, sizeof(u64))?
>

If npages < 8, npages/8 is 0 and ROUND_UP(0, 8) returns 0.

Changing it as below

-range->bitmap.size = ROUND_UP(pages / 8, sizeof(uint64_t));
+range->bitmap.size = ROUND_UP(pages, sizeof(__u64) * BITS_PER_BYTE) /
+                     BITS_PER_BYTE;
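A tiny standalone check makes the difference concrete for a small mapping (illustrative only, not part of the posted fix; it uses an equivalent ROUND_UP definition since this snippet lives outside QEMU):

#include <stdio.h>
#include <stdint.h>

#define ROUND_UP(n, d)   ((((n) + (d) - 1) / (d)) * (d))   /* same result as QEMU's for these operands */
#define BITS_PER_BYTE    8

int main(void)
{
    uint64_t pages = 3;   /* fewer than 8 pages in the range */

    /* suggested expression: ROUND_UP(0, 8) = 0 bytes, too small */
    printf("%llu\n", (unsigned long long)ROUND_UP(pages / 8, sizeof(uint64_t)));
    /* fixed expression: ROUND_UP(3, 64) / 8 = 8 bytes, one u64 per 64 pages */
    printf("%llu\n", (unsigned long long)(ROUND_UP(pages, sizeof(uint64_t) * BITS_PER_BYTE) / BITS_PER_BYTE));
    return 0;
}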

I'm updating patches with these fixes and Cornelia's suggestion soon.

Due to shortage of time I may not be able to address all the concerns raised 
on previous versions of QEMU; I'm trying to make the QEMU-side code available 
for others to test with the latest kernel changes. Don't worry, I will revisit 
the comments on the QEMU patches. Right now the first priority is to test the 
kernel UAPI and prepare the kernel patches for 5.8.


Thanks,
Kirti



[PATCH Kernel v22 6/8] vfio iommu: Update UNMAP_DMA ioctl to get dirty bitmap before unmap

2020-05-19 Thread Kirti Wankhede
DMA mapped pages, including those pinned by mdev vendor drivers, might
get unpinned and unmapped while migration is active and the device is still
running. For example, in the pre-copy phase, while the guest driver can still
access those pages, the host device or vendor driver can dirty these mapped
pages. Such pages should be marked dirty so as to maintain memory consistency
for a user making use of dirty page tracking.

To get the bitmap during unmap, the user should allocate memory for the
bitmap, zero it, set the size of the allocated memory, set the page size to be
considered for the bitmap, and set the VFIO_DMA_UNMAP_FLAG_GET_DIRTY_BITMAP flag.
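To make that flow concrete, a minimal user-space sketch of the steps above might look as follows (illustrative only; it assumes the vfio_bitmap layout and flag name proposed in this series and a type1 container fd obtained elsewhere):

#include <stdint.h>
#include <stdlib.h>
#include <sys/ioctl.h>
#include <linux/vfio.h>

static int unmap_and_get_dirty(int container_fd, uint64_t iova, uint64_t size,
                               uint64_t pgsize)
{
    struct vfio_iommu_type1_dma_unmap *unmap;
    struct vfio_bitmap *bitmap;
    uint64_t npages = size / pgsize;
    int ret;

    unmap = calloc(1, sizeof(*unmap) + sizeof(*bitmap));
    if (!unmap)
        return -1;

    unmap->argsz = sizeof(*unmap) + sizeof(*bitmap);
    unmap->flags = VFIO_DMA_UNMAP_FLAG_GET_DIRTY_BITMAP;
    unmap->iova = iova;
    unmap->size = size;

    bitmap = (struct vfio_bitmap *)&unmap->data;
    bitmap->pgsize = pgsize;
    bitmap->size = ((npages + 63) / 64) * sizeof(uint64_t); /* one bit per page, u64 aligned */
    bitmap->data = calloc(1, bitmap->size);                 /* zeroed, as required */
    if (!bitmap->data) {
        free(unmap);
        return -1;
    }

    ret = ioctl(container_fd, VFIO_IOMMU_UNMAP_DMA, unmap);
    /* On success, set bits in bitmap->data mark pages dirtied in the range. */

    free(bitmap->data);
    free(unmap);
    return ret;
}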

Signed-off-by: Kirti Wankhede 
Reviewed-by: Neo Jia 
---
 drivers/vfio/vfio_iommu_type1.c | 62 +
 include/uapi/linux/vfio.h   | 10 +++
 2 files changed, 61 insertions(+), 11 deletions(-)

diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
index 0a420594483a..963ae4348b3c 100644
--- a/drivers/vfio/vfio_iommu_type1.c
+++ b/drivers/vfio/vfio_iommu_type1.c
@@ -1018,23 +1018,25 @@ static int verify_bitmap_size(uint64_t npages, uint64_t 
bitmap_size)
 }
 
 static int vfio_dma_do_unmap(struct vfio_iommu *iommu,
-struct vfio_iommu_type1_dma_unmap *unmap)
+struct vfio_iommu_type1_dma_unmap *unmap,
+struct vfio_bitmap *bitmap)
 {
-   uint64_t mask;
struct vfio_dma *dma, *dma_last = NULL;
-   size_t unmapped = 0;
+   size_t unmapped = 0, pgsize;
int ret = 0, retries = 0;
+   unsigned long pgshift;
 
	mutex_lock(&iommu->lock);
 
-   mask = ((uint64_t)1 << __ffs(iommu->pgsize_bitmap)) - 1;
+   pgshift = __ffs(iommu->pgsize_bitmap);
+   pgsize = (size_t)1 << pgshift;
 
-   if (unmap->iova & mask) {
+   if (unmap->iova & (pgsize - 1)) {
ret = -EINVAL;
goto unlock;
}
 
-   if (!unmap->size || unmap->size & mask) {
+   if (!unmap->size || unmap->size & (pgsize - 1)) {
ret = -EINVAL;
goto unlock;
}
@@ -1045,9 +1047,15 @@ static int vfio_dma_do_unmap(struct vfio_iommu *iommu,
goto unlock;
}
 
-   WARN_ON(mask & PAGE_MASK);
-again:
+   /* When dirty tracking is enabled, allow only min supported pgsize */
+   if ((unmap->flags & VFIO_DMA_UNMAP_FLAG_GET_DIRTY_BITMAP) &&
+   (!iommu->dirty_page_tracking || (bitmap->pgsize != pgsize))) {
+   ret = -EINVAL;
+   goto unlock;
+   }
 
+   WARN_ON((pgsize - 1) & PAGE_MASK);
+again:
/*
 * vfio-iommu-type1 (v1) - User mappings were coalesced together to
 * avoid tracking individual mappings.  This means that the granularity
@@ -1085,6 +1093,7 @@ static int vfio_dma_do_unmap(struct vfio_iommu *iommu,
ret = -EINVAL;
goto unlock;
}
+
dma = vfio_find_dma(iommu, unmap->iova + unmap->size - 1, 0);
if (dma && dma->iova + dma->size != unmap->iova + unmap->size) {
ret = -EINVAL;
@@ -1128,6 +1137,14 @@ static int vfio_dma_do_unmap(struct vfio_iommu *iommu,
	mutex_lock(&iommu->lock);
goto again;
}
+
+   if (unmap->flags & VFIO_DMA_UNMAP_FLAG_GET_DIRTY_BITMAP) {
+   ret = update_user_bitmap(bitmap->data, dma,
+unmap->iova, pgsize);
+   if (ret)
+   break;
+   }
+
unmapped += dma->size;
vfio_remove_dma(iommu, dma);
}
@@ -2466,17 +2483,40 @@ static long vfio_iommu_type1_ioctl(void *iommu_data,
 
} else if (cmd == VFIO_IOMMU_UNMAP_DMA) {
struct vfio_iommu_type1_dma_unmap unmap;
-   long ret;
+   struct vfio_bitmap bitmap = { 0 };
+   int ret;
 
minsz = offsetofend(struct vfio_iommu_type1_dma_unmap, size);
 
	if (copy_from_user(&unmap, (void __user *)arg, minsz))
return -EFAULT;
 
-   if (unmap.argsz < minsz || unmap.flags)
+   if (unmap.argsz < minsz ||
+   unmap.flags & ~VFIO_DMA_UNMAP_FLAG_GET_DIRTY_BITMAP)
return -EINVAL;
 
-   ret = vfio_dma_do_unmap(iommu, &unmap);
+   if (unmap.flags & VFIO_DMA_UNMAP_FLAG_GET_DIRTY_BITMAP) {
+   unsigned long pgshift;
+
+   if (unmap.argsz < (minsz + sizeof(bitmap)))
+   return -EINVAL;
+
+   if (copy_from_user(&bitmap,
+  (void __user *)(arg + minsz),
+  

[PATCH Kernel v22 8/8] vfio: Selective dirty page tracking if IOMMU backed device pins pages

2020-05-19 Thread Kirti Wankhede
Added a check such that only singleton IOMMU groups can pin pages.
From the point when the vendor driver pins any pages, consider the IOMMU
group's dirty page scope to be limited to pinned pages.

To avoid walking the list often, added a flag, pinned_page_dirty_scope, to
indicate whether the dirty page scope of all vfio_groups for each vfio_domain
in the domain_list is limited to pinned pages. This flag is updated on the
first pinned pages request for that IOMMU group and on attaching/detaching a
group.

Signed-off-by: Kirti Wankhede 
Reviewed-by: Neo Jia 
---
 drivers/vfio/vfio.c |  13 +++--
 drivers/vfio/vfio_iommu_type1.c | 103 +---
 include/linux/vfio.h|   4 +-
 3 files changed, 109 insertions(+), 11 deletions(-)

diff --git a/drivers/vfio/vfio.c b/drivers/vfio/vfio.c
index 765e0e5d83ed..580099afeaff 100644
--- a/drivers/vfio/vfio.c
+++ b/drivers/vfio/vfio.c
@@ -85,6 +85,7 @@ struct vfio_group {
atomic_topened;
wait_queue_head_t   container_q;
boolnoiommu;
+   unsigned intdev_counter;
struct kvm  *kvm;
struct blocking_notifier_head   notifier;
 };
@@ -555,6 +556,7 @@ struct vfio_device *vfio_group_create_device(struct 
vfio_group *group,
 
	mutex_lock(&group->device_lock);
	list_add(&device->group_next, &group->device_list);
+   group->dev_counter++;
	mutex_unlock(&group->device_lock);
 
return device;
@@ -567,6 +569,7 @@ static void vfio_device_release(struct kref *kref)
struct vfio_group *group = device->group;
 
	list_del(&device->group_next);
+   group->dev_counter--;
	mutex_unlock(&group->device_lock);
 
dev_set_drvdata(device->dev, NULL);
@@ -1945,6 +1948,9 @@ int vfio_pin_pages(struct device *dev, unsigned long 
*user_pfn, int npage,
if (!group)
return -ENODEV;
 
+   if (group->dev_counter > 1)
+   return -EINVAL;
+
ret = vfio_group_add_container_user(group);
if (ret)
goto err_pin_pages;
@@ -1952,7 +1958,8 @@ int vfio_pin_pages(struct device *dev, unsigned long 
*user_pfn, int npage,
container = group->container;
driver = container->iommu_driver;
if (likely(driver && driver->ops->pin_pages))
-   ret = driver->ops->pin_pages(container->iommu_data, user_pfn,
+   ret = driver->ops->pin_pages(container->iommu_data,
+group->iommu_group, user_pfn,
 npage, prot, phys_pfn);
else
ret = -ENOTTY;
@@ -2050,8 +2057,8 @@ int vfio_group_pin_pages(struct vfio_group *group,
driver = container->iommu_driver;
if (likely(driver && driver->ops->pin_pages))
ret = driver->ops->pin_pages(container->iommu_data,
-user_iova_pfn, npage,
-prot, phys_pfn);
+group->iommu_group, user_iova_pfn,
+npage, prot, phys_pfn);
else
ret = -ENOTTY;
 
diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
index d74b76919cbb..f5b79a71e9f7 100644
--- a/drivers/vfio/vfio_iommu_type1.c
+++ b/drivers/vfio/vfio_iommu_type1.c
@@ -73,6 +73,7 @@ struct vfio_iommu {
boolv2;
boolnesting;
booldirty_page_tracking;
+   boolpinned_page_dirty_scope;
 };
 
 struct vfio_domain {
@@ -100,6 +101,7 @@ struct vfio_group {
struct iommu_group  *iommu_group;
struct list_headnext;
boolmdev_group; /* An mdev group */
+   boolpinned_page_dirty_scope;
 };
 
 struct vfio_iova {
@@ -143,6 +145,10 @@ struct vfio_regions {
 
 static int put_pfn(unsigned long pfn, int prot);
 
+static struct vfio_group *vfio_iommu_find_iommu_group(struct vfio_iommu *iommu,
+  struct iommu_group *iommu_group);
+
+static void update_pinned_page_dirty_scope(struct vfio_iommu *iommu);
 /*
  * This code handles mapping and unmapping of user data buffers
  * into DMA'ble space using the IOMMU
@@ -592,11 +598,13 @@ static int vfio_unpin_page_external(struct vfio_dma *dma, 
dma_addr_t iova,
 }
 
 static int vfio_iommu_type1_pin_pages(void *iommu_data,
+ struct iommu_group *iommu_group,
  unsigned long *user_pfn,
  int npage, int prot,
  unsigned long *phys_pfn)
 {
struct vfio_iommu *iommu = iommu_data;
+   stru

[PATCH Kernel v22 5/8] vfio iommu: Implementation of ioctl for dirty pages tracking

2020-05-19 Thread Kirti Wankhede
The VFIO_IOMMU_DIRTY_PAGES ioctl performs three operations:
- Start dirty pages tracking while migration is active.
- Stop dirty pages tracking.
- Get the dirty pages bitmap. It is the user space application's responsibility
  to copy the content of dirty pages from source to destination during
  migration.

To prevent a DoS attack, memory for the bitmap is allocated per vfio_dma
structure. The bitmap size is calculated considering the smallest supported
page size. The bitmap is allocated for all vfio_dmas when dirty logging is
enabled.

The bitmap is populated for already pinned pages when it is allocated for a
vfio_dma, using the smallest supported page size. The bitmap is updated from
the pinning functions when tracking is enabled. When the user application
queries the bitmap, check whether the requested page size is the same as the
page size used to populate the bitmap: if it is equal, copy the bitmap;
otherwise, return an error.
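For reference, the start/stop half of this interface can be driven from user space with something like the sketch below (a hedged illustration only, assuming the VFIO_IOMMU_DIRTY_PAGES UAPI and flag names proposed in this series, mirroring what the QEMU patches elsewhere in this thread do):

#include <stdbool.h>
#include <sys/ioctl.h>
#include <linux/vfio.h>

static int set_dirty_tracking(int container_fd, bool start)
{
    struct vfio_iommu_type1_dirty_bitmap dirty = {
        .argsz = sizeof(dirty),
        .flags = start ? VFIO_IOMMU_DIRTY_PAGES_FLAG_START
                       : VFIO_IOMMU_DIRTY_PAGES_FLAG_STOP,
    };

    return ioctl(container_fd, VFIO_IOMMU_DIRTY_PAGES, &dirty);
}

The get-bitmap operation takes an additional payload in data[]; a sketch of that path is shown alongside the QEMU log_sync patch later in this series.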

Signed-off-by: Kirti Wankhede 
Reviewed-by: Neo Jia 

Fixed error reported by build bot by changing pgsize type from uint64_t
to size_t.
Reported-by: kbuild test robot 
---
 drivers/vfio/vfio_iommu_type1.c | 313 +++-
 1 file changed, 307 insertions(+), 6 deletions(-)

diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
index de17787ffece..0a420594483a 100644
--- a/drivers/vfio/vfio_iommu_type1.c
+++ b/drivers/vfio/vfio_iommu_type1.c
@@ -72,6 +72,7 @@ struct vfio_iommu {
uint64_tpgsize_bitmap;
boolv2;
boolnesting;
+   booldirty_page_tracking;
 };
 
 struct vfio_domain {
@@ -92,6 +93,7 @@ struct vfio_dma {
boollock_cap;   /* capable(CAP_IPC_LOCK) */
struct task_struct  *task;
struct rb_root  pfn_list;   /* Ex-user pinned pfn list */
+   unsigned long   *bitmap;
 };
 
 struct vfio_group {
@@ -126,6 +128,19 @@ struct vfio_regions {
 #define IS_IOMMU_CAP_DOMAIN_IN_CONTAINER(iommu)\
	(!list_empty(&iommu->domain_list))
 
+#define DIRTY_BITMAP_BYTES(n)  (ALIGN(n, BITS_PER_TYPE(u64)) / BITS_PER_BYTE)
+
+/*
+ * Input argument of number of bits to bitmap_set() is unsigned integer, which
+ * further casts to signed integer for unaligned multi-bit operation,
+ * __bitmap_set().
+ * Then maximum bitmap size supported is 2^31 bits divided by 2^3 bits/byte,
+ * that is 2^28 (256 MB) which maps to 2^31 * 2^12 = 2^43 (8TB) on 4K page
+ * system.
+ */
+#define DIRTY_BITMAP_PAGES_MAX  ((u64)INT_MAX)
+#define DIRTY_BITMAP_SIZE_MAX   DIRTY_BITMAP_BYTES(DIRTY_BITMAP_PAGES_MAX)
+
 static int put_pfn(unsigned long pfn, int prot);
 
 /*
@@ -176,6 +191,80 @@ static void vfio_unlink_dma(struct vfio_iommu *iommu, 
struct vfio_dma *old)
	rb_erase(&old->node, &iommu->dma_list);
 }
 
+
+static int vfio_dma_bitmap_alloc(struct vfio_dma *dma, size_t pgsize)
+{
+   uint64_t npages = dma->size / pgsize;
+
+   if (npages > DIRTY_BITMAP_PAGES_MAX)
+   return -EINVAL;
+
+   /*
+* Allocate extra 64 bits that are used to calculate shift required for
+* bitmap_shift_left() to manipulate and club unaligned number of pages
+* in adjacent vfio_dma ranges.
+*/
+   dma->bitmap = kvzalloc(DIRTY_BITMAP_BYTES(npages) + sizeof(u64),
+  GFP_KERNEL);
+   if (!dma->bitmap)
+   return -ENOMEM;
+
+   return 0;
+}
+
+static void vfio_dma_bitmap_free(struct vfio_dma *dma)
+{
+   kfree(dma->bitmap);
+   dma->bitmap = NULL;
+}
+
+static void vfio_dma_populate_bitmap(struct vfio_dma *dma, size_t pgsize)
+{
+   struct rb_node *p;
+
+   for (p = rb_first(&dma->pfn_list); p; p = rb_next(p)) {
+   struct vfio_pfn *vpfn = rb_entry(p, struct vfio_pfn, node);
+
+   bitmap_set(dma->bitmap, (vpfn->iova - dma->iova) / pgsize, 1);
+   }
+}
+
+static int vfio_dma_bitmap_alloc_all(struct vfio_iommu *iommu, size_t pgsize)
+{
+   struct rb_node *n;
+
+   for (n = rb_first(&iommu->dma_list); n; n = rb_next(n)) {
+   struct vfio_dma *dma = rb_entry(n, struct vfio_dma, node);
+   int ret;
+
+   ret = vfio_dma_bitmap_alloc(dma, pgsize);
+   if (ret) {
+   struct rb_node *p;
+
+   for (p = rb_prev(n); p; p = rb_prev(p)) {
+   struct vfio_dma *dma = rb_entry(n,
+   struct vfio_dma, node);
+
+   vfio_dma_bitmap_free(dma);
+   }
+   return ret;
+   }
+   vfio_dma_populate_bitmap(dma, pgsize);
+   }
+   return 0;
+}
+
+static void vfio_dma_bitmap_free_all(struct vfio_iommu *iommu)
+{
+   struct rb_node *n;
+
+   for (n = rb_first(&iommu->dma_list); n; n = rb_next(n)) {
+   struct vfio

Re: [PATCH Kernel v22 5/8] vfio iommu: Implementation of ioctl for dirty pages tracking

2020-05-19 Thread Kirti Wankhede




On 5/19/2020 3:23 AM, Alex Williamson wrote:

On Mon, 18 May 2020 11:26:34 +0530
Kirti Wankhede  wrote:


VFIO_IOMMU_DIRTY_PAGES ioctl performs three operations:
- Start dirty pages tracking while migration is active
- Stop dirty pages tracking.
- Get dirty pages bitmap. Its user space application's responsibility to
   copy content of dirty pages from source to destination during migration.

To prevent DoS attack, memory for bitmap is allocated per vfio_dma
structure. Bitmap size is calculated considering smallest supported page
size. Bitmap is allocated for all vfio_dmas when dirty logging is enabled

Bitmap is populated for already pinned pages when bitmap is allocated for
a vfio_dma with the smallest supported page size. Update bitmap from
pinning functions when tracking is enabled. When user application queries
bitmap, check if requested page size is same as page size used to
populated bitmap. If it is equal, copy bitmap, but if not equal, return
error.

Signed-off-by: Kirti Wankhede 
Reviewed-by: Neo Jia 

Fixed error reported by build bot by changing pgsize type from uint64_t
to size_t.
Reported-by: kbuild test robot 
---
  drivers/vfio/vfio_iommu_type1.c | 313 +++-
  1 file changed, 307 insertions(+), 6 deletions(-)

diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
index de17787ffece..bf740fef196f 100644
--- a/drivers/vfio/vfio_iommu_type1.c
+++ b/drivers/vfio/vfio_iommu_type1.c
@@ -72,6 +72,7 @@ struct vfio_iommu {
uint64_tpgsize_bitmap;
boolv2;
boolnesting;
+   booldirty_page_tracking;
  };
  
  struct vfio_domain {

@@ -92,6 +93,7 @@ struct vfio_dma {
boollock_cap;   /* capable(CAP_IPC_LOCK) */
struct task_struct  *task;
struct rb_root  pfn_list;   /* Ex-user pinned pfn list */
+   unsigned long   *bitmap;
  };
  
  struct vfio_group {

@@ -126,6 +128,19 @@ struct vfio_regions {
  #define IS_IOMMU_CAP_DOMAIN_IN_CONTAINER(iommu)   \
	(!list_empty(&iommu->domain_list))
  
+#define DIRTY_BITMAP_BYTES(n)	(ALIGN(n, BITS_PER_TYPE(u64)) / BITS_PER_BYTE)

+
+/*
+ * Input argument of number of bits to bitmap_set() is unsigned integer, which
+ * further casts to signed integer for unaligned multi-bit operation,
+ * __bitmap_set().
+ * Then maximum bitmap size supported is 2^31 bits divided by 2^3 bits/byte,
+ * that is 2^28 (256 MB) which maps to 2^31 * 2^12 = 2^43 (8TB) on 4K page
+ * system.
+ */
+#define DIRTY_BITMAP_PAGES_MAX  ((u64)INT_MAX)
+#define DIRTY_BITMAP_SIZE_MAX   DIRTY_BITMAP_BYTES(DIRTY_BITMAP_PAGES_MAX)
+
  static int put_pfn(unsigned long pfn, int prot);
  
  /*

@@ -176,6 +191,74 @@ static void vfio_unlink_dma(struct vfio_iommu *iommu, 
struct vfio_dma *old)
	rb_erase(&old->node, &iommu->dma_list);
  }
  
+

+static int vfio_dma_bitmap_alloc(struct vfio_dma *dma, size_t pgsize)
+{
+   uint64_t npages = dma->size / pgsize;
+
+   if (npages > DIRTY_BITMAP_PAGES_MAX)
+   return -EINVAL;
+
+   dma->bitmap = kvzalloc(DIRTY_BITMAP_BYTES(npages), GFP_KERNEL);


Curious that the extra 8-bytes are added in the next patch, but they're
just as necessary here.



Yes, moving it in this patch.
While resolving patches, I had to update 6/8 and 8/8 patches also. So 
updating 3 patches.



We also have the explanation above about why we have the signed int
size limitation, but we sort of ignore that when adding the bytes here.
That limitation is derived from __bitmap_set(), whereas we only need
these extra bits for bitmap_shift_left(), where I can't spot a signed
int limitation.  Do you come to the same conclusion?  


That's right.


Maybe worth a
comment why we think we can exceed DIRTY_BITMAP_PAGES_MAX for that
extra padding.



ok.


+   if (!dma->bitmap)
+   return -ENOMEM;
+
+   return 0;
+}
+
+static void vfio_dma_bitmap_free(struct vfio_dma *dma)
+{
+   kfree(dma->bitmap);
+   dma->bitmap = NULL;
+}
+
+static void vfio_dma_populate_bitmap(struct vfio_dma *dma, size_t pgsize)
+{
+   struct rb_node *p;
+
+   for (p = rb_first(&dma->pfn_list); p; p = rb_next(p)) {
+   struct vfio_pfn *vpfn = rb_entry(p, struct vfio_pfn, node);
+
+   bitmap_set(dma->bitmap, (vpfn->iova - dma->iova) / pgsize, 1);
+   }
+}
+
+static int vfio_dma_bitmap_alloc_all(struct vfio_iommu *iommu, size_t pgsize)
+{
+   struct rb_node *n = rb_first(&iommu->dma_list);
+
+   for (; n; n = rb_next(n)) {


Nit, the previous function above sets the initial value in the for()
statement, it looks like it would fit in 80 columns here too.  We have
examples either way in the code, so not a must fix.


+   struct vfio_dma *dma = rb_entry(n, struct vfio_dma, node);
+   int ret;
+
+

[PATCH QEMU v22 18/18] qapi: Add VFIO devices migration stats in Migration stats

2020-05-18 Thread Kirti Wankhede
Added the number of bytes transferred to the target VM by all VFIO devices.

Signed-off-by: Kirti Wankhede 
---
 hw/vfio/common.c| 20 
 hw/vfio/migration.c | 10 +-
 include/qemu/vfio-helpers.h |  3 +++
 migration/migration.c   | 12 
 monitor/hmp-cmds.c  |  6 ++
 qapi/migration.json | 19 ++-
 6 files changed, 68 insertions(+), 2 deletions(-)

diff --git a/hw/vfio/common.c b/hw/vfio/common.c
index 9b29de654c7f..3e2ae742575e 100644
--- a/hw/vfio/common.c
+++ b/hw/vfio/common.c
@@ -40,6 +40,7 @@
 #include "trace.h"
 #include "qapi/error.h"
 #include "migration/migration.h"
+#include "qemu/vfio-helpers.h"
 
 VFIOGroupList vfio_group_list =
 QLIST_HEAD_INITIALIZER(vfio_group_list);
@@ -293,6 +294,25 @@ const MemoryRegionOps vfio_region_ops = {
  * Device state interfaces
  */
 
+bool vfio_mig_active(void)
+{
+VFIOGroup *group;
+VFIODevice *vbasedev;
+
+if (QLIST_EMPTY(&vfio_group_list)) {
+return false;
+}
+
+QLIST_FOREACH(group, &vfio_group_list, next) {
+QLIST_FOREACH(vbasedev, &group->device_list, next) {
+if (vbasedev->migration_blocker) {
+return false;
+}
+}
+}
+return true;
+}
+
 static bool vfio_devices_are_stopped_and_saving(void)
 {
 VFIOGroup *group;
diff --git a/hw/vfio/migration.c b/hw/vfio/migration.c
index 7729c90782bd..2eef38fe5c65 100644
--- a/hw/vfio/migration.c
+++ b/hw/vfio/migration.c
@@ -26,7 +26,7 @@
 #include "exec/ram_addr.h"
 #include "pci.h"
 #include "trace.h"
-
+#include "qemu/vfio-helpers.h"
 /*
  * Flags used as delimiter:
  * 0x => MSB 32-bit all 1s
@@ -38,6 +38,8 @@
 #define VFIO_MIG_FLAG_DEV_SETUP_STATE   (0xef13ULL)
 #define VFIO_MIG_FLAG_DEV_DATA_STATE(0xef14ULL)
 
+static int64_t bytes_transferred;
+
 static void vfio_migration_region_exit(VFIODevice *vbasedev)
 {
 VFIOMigration *migration = vbasedev->migration;
@@ -229,6 +231,7 @@ static int vfio_save_buffer(QEMUFile *f, VFIODevice 
*vbasedev)
 return ret;
 }
 
+bytes_transferred += data_size;
 return data_size;
 }
 
@@ -744,6 +747,11 @@ static int vfio_migration_init(VFIODevice *vbasedev,
 
 /* -- */
 
+int64_t vfio_mig_bytes_transferred(void)
+{
+return bytes_transferred;
+}
+
 int vfio_migration_probe(VFIODevice *vbasedev, Error **errp)
 {
 struct vfio_region_info *info;
diff --git a/include/qemu/vfio-helpers.h b/include/qemu/vfio-helpers.h
index 1f057c2b9e40..26a7df0767b1 100644
--- a/include/qemu/vfio-helpers.h
+++ b/include/qemu/vfio-helpers.h
@@ -29,4 +29,7 @@ void qemu_vfio_pci_unmap_bar(QEMUVFIOState *s, int index, 
void *bar,
 int qemu_vfio_pci_init_irq(QEMUVFIOState *s, EventNotifier *e,
int irq_type, Error **errp);
 
+bool vfio_mig_active(void);
+int64_t vfio_mig_bytes_transferred(void);
+
 #endif
diff --git a/migration/migration.c b/migration/migration.c
index 187ac0410c2d..9d763447261c 100644
--- a/migration/migration.c
+++ b/migration/migration.c
@@ -54,6 +54,7 @@
 #include "net/announce.h"
 #include "qemu/queue.h"
 #include "multifd.h"
+#include "qemu/vfio-helpers.h"
 
 #define MAX_THROTTLE  (32 << 20)  /* Migration transfer speed throttling */
 
@@ -967,6 +968,15 @@ static void populate_disk_info(MigrationInfo *info)
 }
 }
 
+static void populate_vfio_info(MigrationInfo *info)
+{
+if (vfio_mig_active()) {
+info->has_vfio = true;
+info->vfio = g_malloc0(sizeof(*info->vfio));
+info->vfio->bytes = vfio_mig_bytes_transferred();
+}
+}
+
 static void fill_source_migration_info(MigrationInfo *info)
 {
 MigrationState *s = migrate_get_current();
@@ -992,6 +1002,7 @@ static void fill_source_migration_info(MigrationInfo *info)
 populate_time_info(info, s);
 populate_ram_info(info, s);
 populate_disk_info(info);
+populate_vfio_info(info);
 break;
 case MIGRATION_STATUS_COLO:
 info->has_status = true;
@@ -1000,6 +1011,7 @@ static void fill_source_migration_info(MigrationInfo 
*info)
 case MIGRATION_STATUS_COMPLETED:
 populate_time_info(info, s);
 populate_ram_info(info, s);
+populate_vfio_info(info);
 break;
 case MIGRATION_STATUS_FAILED:
 info->has_status = true;
diff --git a/monitor/hmp-cmds.c b/monitor/hmp-cmds.c
index 7f6e982dc834..d04bc042f2fe 100644
--- a/monitor/hmp-cmds.c
+++ b/monitor/hmp-cmds.c
@@ -353,6 +353,12 @@ void hmp_info_migrate(Monitor *mon, const QDict *qdict)
 }
 monitor_printf(mon, "]\n");
 }
+
+if (info->has_vfio) {
+monitor_printf(mon, "vfio device bytes: %" PRIu64 " kbytes\n",
+

[PATCH QEMU v22 17/18] vfio: Make vfio-pci device migration capable

2020-05-18 Thread Kirti Wankhede
If the device is not a failover primary device, call the vfio_migration_probe()
and vfio_migration_finalize() functions for the vfio-pci device to enable
migration for VFIO PCI devices which support it.
Removed the vfio_pci_vmstate structure.
Removed the migration blocker from the VFIO PCI device-specific structure and
use the migration blocker from the generic VFIO device structure.

Signed-off-by: Kirti Wankhede 
Reviewed-by: Neo Jia 
---
 hw/vfio/pci.c | 32 +++-
 hw/vfio/pci.h |  1 -
 2 files changed, 11 insertions(+), 22 deletions(-)

diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
index 36b1e08f84d8..70ab0e4623b8 100644
--- a/hw/vfio/pci.c
+++ b/hw/vfio/pci.c
@@ -2916,22 +2916,11 @@ static void vfio_realize(PCIDevice *pdev, Error **errp)
 return;
 }
 
-if (!pdev->failover_pair_id) {
-error_setg(&vdev->migration_blocker,
-"VFIO device doesn't support migration");
-ret = migrate_add_blocker(vdev->migration_blocker, &err);
-if (ret) {
-error_propagate(errp, err);
-error_free(vdev->migration_blocker);
-vdev->migration_blocker = NULL;
-return;
-}
-}
-
 vdev->vbasedev.name = g_path_get_basename(vdev->vbasedev.sysfsdev);
 vdev->vbasedev.ops = _pci_ops;
 vdev->vbasedev.type = VFIO_DEVICE_TYPE_PCI;
 vdev->vbasedev.dev = DEVICE(vdev);
+vdev->vbasedev.device_state = 0;
 
 tmp = g_strdup_printf("%s/iommu_group", vdev->vbasedev.sysfsdev);
 len = readlink(tmp, group_path, sizeof(group_path));
@@ -3195,6 +3184,14 @@ static void vfio_realize(PCIDevice *pdev, Error **errp)
 }
 }
 
+if (!pdev->failover_pair_id) {
+ret = vfio_migration_probe(&vdev->vbasedev, errp);
+if (ret) {
+error_report("%s: Failed to setup for migration",
+ vdev->vbasedev.name);
+}
+}
+
 vfio_register_err_notifier(vdev);
 vfio_register_req_notifier(vdev);
 vfio_setup_resetfn_quirk(vdev);
@@ -3209,11 +3206,6 @@ out_teardown:
 vfio_bars_exit(vdev);
 error:
 error_prepend(errp, VFIO_MSG_PREFIX, vdev->vbasedev.name);
-if (vdev->migration_blocker) {
-migrate_del_blocker(vdev->migration_blocker);
-error_free(vdev->migration_blocker);
-vdev->migration_blocker = NULL;
-}
 }
 
 static void vfio_instance_finalize(Object *obj)
@@ -3225,10 +3217,7 @@ static void vfio_instance_finalize(Object *obj)
 vfio_bars_finalize(vdev);
 g_free(vdev->emulated_config_bits);
 g_free(vdev->rom);
-if (vdev->migration_blocker) {
-migrate_del_blocker(vdev->migration_blocker);
-error_free(vdev->migration_blocker);
-}
+
 /*
  * XXX Leaking igd_opregion is not an oversight, we can't remove the
  * fw_cfg entry therefore leaking this allocation seems like the safest
@@ -3256,6 +3245,7 @@ static void vfio_exitfn(PCIDevice *pdev)
 }
 vfio_teardown_msi(vdev);
 vfio_bars_exit(vdev);
+vfio_migration_finalize(&vdev->vbasedev);
 }
 
 static void vfio_pci_reset(DeviceState *dev)
diff --git a/hw/vfio/pci.h b/hw/vfio/pci.h
index 0da7a20a7ec2..b148c937ef72 100644
--- a/hw/vfio/pci.h
+++ b/hw/vfio/pci.h
@@ -168,7 +168,6 @@ typedef struct VFIOPCIDevice {
 bool no_vfio_ioeventfd;
 bool enable_ramfb;
 VFIODisplay *dpy;
-Error *migration_blocker;
 Notifier irqchip_change_notifier;
 } VFIOPCIDevice;
 
-- 
2.7.0




[PATCH QEMU v22 14/18] vfio: Add function to start and stop dirty pages tracking

2020-05-18 Thread Kirti Wankhede
Call VFIO_IOMMU_DIRTY_PAGES ioctl to start and stop dirty pages tracking
for VFIO devices.

Signed-off-by: Kirti Wankhede 
---
 hw/vfio/migration.c | 36 
 1 file changed, 36 insertions(+)

diff --git a/hw/vfio/migration.c b/hw/vfio/migration.c
index b9bbe38e539c..7729c90782bd 100644
--- a/hw/vfio/migration.c
+++ b/hw/vfio/migration.c
@@ -9,6 +9,7 @@
 
 #include "qemu/osdep.h"
 #include "qemu/main-loop.h"
+#include <sys/ioctl.h>
 #include <linux/vfio.h>
 
 #include "sysemu/runstate.h"
@@ -297,6 +298,32 @@ static int vfio_load_device_config_state(QEMUFile *f, void 
*opaque)
 return qemu_file_get_error(f);
 }
 
+static int vfio_start_dirty_page_tracking(VFIODevice *vbasedev, bool start)
+{
+int ret;
+VFIOContainer *container = vbasedev->group->container;
+struct vfio_iommu_type1_dirty_bitmap dirty = {
+.argsz = sizeof(dirty),
+};
+
+if (start) {
+if (vbasedev->device_state & VFIO_DEVICE_STATE_SAVING) {
+dirty.flags = VFIO_IOMMU_DIRTY_PAGES_FLAG_START;
+} else {
+return -EINVAL;
+}
+} else {
+dirty.flags = VFIO_IOMMU_DIRTY_PAGES_FLAG_STOP;
+}
+
+ret = ioctl(container->fd, VFIO_IOMMU_DIRTY_PAGES, &dirty);
+if (ret) {
+error_report("Failed to set dirty tracking flag 0x%x errno: %d",
+ dirty.flags, errno);
+}
+return ret;
+}
+
 /* -- */
 
 static int vfio_save_setup(QEMUFile *f, void *opaque)
@@ -327,6 +354,11 @@ static int vfio_save_setup(QEMUFile *f, void *opaque)
 return ret;
 }
 
+ret = vfio_start_dirty_page_tracking(vbasedev, true);
+if (ret) {
+return ret;
+}
+
 qemu_put_be64(f, VFIO_MIG_FLAG_END_OF_STATE);
 
 ret = qemu_file_get_error(f);
@@ -342,6 +374,8 @@ static void vfio_save_cleanup(void *opaque)
 VFIODevice *vbasedev = opaque;
 VFIOMigration *migration = vbasedev->migration;
 
+vfio_start_dirty_page_tracking(vbasedev, false);
+
 if (migration->region.mmaps) {
 vfio_region_unmap(&migration->region);
 }
@@ -676,6 +710,8 @@ static void vfio_migration_state_notifier(Notifier 
*notifier, void *data)
 if (ret) {
 error_report("%s: Failed to set state RUNNING", vbasedev->name);
 }
+
+vfio_start_dirty_page_tracking(vbasedev, false);
 }
 }
 
-- 
2.7.0




[PATCH QEMU v22 15/18] vfio: Add vfio_listener_log_sync to mark dirty pages

2020-05-18 Thread Kirti Wankhede
vfio_listener_log_sync gets the list of dirty pages from the container using
the VFIO_IOMMU_DIRTY_PAGES ioctl and marks those pages dirty when all devices
are stopped and saving state.
Return early for the RAM block section of a mapped MMIO region.
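A hedged user-space sketch of the bitmap query underlying this listener (it assumes the GET_BITMAP flag and the vfio_iommu_type1_dirty_bitmap_get layout proposed in the kernel series; the caller supplies a zero-filled buffer sized as discussed for the unmap path):

#include <stdint.h>
#include <stdlib.h>
#include <sys/ioctl.h>
#include <linux/vfio.h>

static int query_dirty_bitmap(int container_fd, uint64_t iova, uint64_t size,
                              uint64_t pgsize, void *bitmap_buf,
                              uint64_t bitmap_bytes)
{
    struct vfio_iommu_type1_dirty_bitmap *dbitmap;
    struct vfio_iommu_type1_dirty_bitmap_get *range;
    int ret;

    dbitmap = calloc(1, sizeof(*dbitmap) + sizeof(*range));
    if (!dbitmap)
        return -1;

    dbitmap->argsz = sizeof(*dbitmap) + sizeof(*range);
    dbitmap->flags = VFIO_IOMMU_DIRTY_PAGES_FLAG_GET_BITMAP;

    range = (struct vfio_iommu_type1_dirty_bitmap_get *)&dbitmap->data;
    range->iova = iova;
    range->size = size;
    range->bitmap.pgsize = pgsize;      /* must match the tracking page size */
    range->bitmap.size = bitmap_bytes;
    range->bitmap.data = bitmap_buf;    /* zero-filled, one bit per page */

    ret = ioctl(container_fd, VFIO_IOMMU_DIRTY_PAGES, dbitmap);

    free(dbitmap);
    return ret;
}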

Signed-off-by: Kirti Wankhede 
Reviewed-by: Neo Jia 
---
 hw/vfio/common.c | 183 +--
 hw/vfio/trace-events |   1 +
 2 files changed, 179 insertions(+), 5 deletions(-)

diff --git a/hw/vfio/common.c b/hw/vfio/common.c
index 8e9d1431178c..4cad6fc20c35 100644
--- a/hw/vfio/common.c
+++ b/hw/vfio/common.c
@@ -29,6 +29,7 @@
 #include "hw/vfio/vfio.h"
 #include "exec/address-spaces.h"
 #include "exec/memory.h"
+#include "exec/ram_addr.h"
 #include "hw/hw.h"
 #include "qemu/error-report.h"
 #include "qemu/main-loop.h"
@@ -38,6 +39,7 @@
 #include "sysemu/reset.h"
 #include "trace.h"
 #include "qapi/error.h"
+#include "migration/migration.h"
 
 VFIOGroupList vfio_group_list =
 QLIST_HEAD_INITIALIZER(vfio_group_list);
@@ -288,6 +290,28 @@ const MemoryRegionOps vfio_region_ops = {
 };
 
 /*
+ * Device state interfaces
+ */
+
+static bool vfio_devices_are_stopped_and_saving(void)
+{
+VFIOGroup *group;
+VFIODevice *vbasedev;
+
+QLIST_FOREACH(group, &vfio_group_list, next) {
+QLIST_FOREACH(vbasedev, &group->device_list, next) {
+if ((vbasedev->device_state & VFIO_DEVICE_STATE_SAVING) &&
+!(vbasedev->device_state & VFIO_DEVICE_STATE_RUNNING)) {
+continue;
+} else {
+return false;
+}
+}
+}
+return true;
+}
+
+/*
  * DMA - Mapping and unmapping for the "type1" IOMMU interface used on x86
  */
 static int vfio_dma_unmap(VFIOContainer *container,
@@ -408,8 +432,8 @@ static bool 
vfio_listener_skipped_section(MemoryRegionSection *section)
 }
 
 /* Called with rcu_read_lock held.  */
-static bool vfio_get_vaddr(IOMMUTLBEntry *iotlb, void **vaddr,
-   bool *read_only)
+static bool vfio_get_xlat_addr(IOMMUTLBEntry *iotlb, void **vaddr,
+   ram_addr_t *ram_addr, bool *read_only)
 {
 MemoryRegion *mr;
 hwaddr xlat;
@@ -440,9 +464,17 @@ static bool vfio_get_vaddr(IOMMUTLBEntry *iotlb, void 
**vaddr,
 return false;
 }
 
-*vaddr = memory_region_get_ram_ptr(mr) + xlat;
-*read_only = !writable || mr->readonly;
+if (vaddr) {
+*vaddr = memory_region_get_ram_ptr(mr) + xlat;
+}
 
+if (ram_addr) {
+*ram_addr = memory_region_get_ram_addr(mr) + xlat;
+}
+
+if (read_only) {
+*read_only = !writable || mr->readonly;
+}
 return true;
 }
 
@@ -467,7 +499,7 @@ static void vfio_iommu_map_notify(IOMMUNotifier *n, 
IOMMUTLBEntry *iotlb)
 rcu_read_lock();
 
 if ((iotlb->perm & IOMMU_RW) != IOMMU_NONE) {
-if (!vfio_get_vaddr(iotlb, &vaddr, &read_only)) {
+if (!vfio_get_xlat_addr(iotlb, &vaddr, NULL, &read_only)) {
 goto out;
 }
 /*
@@ -813,9 +845,150 @@ static void vfio_listener_region_del(MemoryListener 
*listener,
 }
 }
 
+static int vfio_get_dirty_bitmap(MemoryListener *listener,
+ MemoryRegionSection *section)
+{
+VFIOContainer *container = container_of(listener, VFIOContainer, listener);
+VFIOGuestIOMMU *giommu;
+IOMMUTLBEntry iotlb;
+hwaddr granularity, iova, iova_end;
+int ret;
+
+if (memory_region_is_iommu(section->mr)) {
+QLIST_FOREACH(giommu, &container->giommu_list, giommu_next) {
+if (MEMORY_REGION(giommu->iommu) == section->mr &&
+giommu->n.start == section->offset_within_region) {
+break;
+}
+}
+
+if (!giommu) {
+return -EINVAL;
+}
+}
+
+iova = TARGET_PAGE_ALIGN(section->offset_within_address_space);
+
+if (memory_region_is_iommu(section->mr)) {
+hwaddr iova_size;
+
+granularity = memory_region_iommu_get_min_page_size(giommu->iommu);
+iova_end = memory_region_iommu_get_address_limit(giommu->iommu);
+
+if (iova_end) {
+iova_size = MIN(int128_get64(section->size), iova_end - iova + 1);
+} else {
+iova_size = int128_get64(section->size);
+}
+
+iova_end = iova + iova_size - 1;
+} else {
+granularity = memory_region_size(section->mr);
+iova_end = iova + int128_get64(section->size) - 1;
+}
+
+RCU_READ_LOCK_GUARD();
+
+while (iova < iova_end) {
+struct vfio_iommu_type1_dirty_bitmap *dbitmap;
+struct vfio_iommu_type1_dirty_bitmap_get *range;
+ram_addr_t start, pages;
+uint64_t iova_xlat, size;
+
+if (memory_region_is_iommu(section->mr)) {
+

[PATCH QEMU v22 12/18] memory: Set DIRTY_MEMORY_MIGRATION when IOMMU is enabled

2020-05-18 Thread Kirti Wankhede
Signed-off-by: Kirti Wankhede 
---
 memory.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/memory.c b/memory.c
index 52f1a4cd37f0..5b868fe5eab3 100644
--- a/memory.c
+++ b/memory.c
@@ -1788,7 +1788,7 @@ bool memory_region_is_ram_device(MemoryRegion *mr)
 uint8_t memory_region_get_dirty_log_mask(MemoryRegion *mr)
 {
 uint8_t mask = mr->dirty_log_mask;
-if (global_dirty_log && mr->ram_block) {
+if (global_dirty_log && (mr->ram_block || memory_region_is_iommu(mr))) {
 mask |= (1 << DIRTY_MEMORY_MIGRATION);
 }
 return mask;
-- 
2.7.0




[PATCH QEMU v22 16/18] vfio: Add ioctl to get dirty pages bitmap during dma unmap.

2020-05-18 Thread Kirti Wankhede
With vIOMMU, an IO virtual address range can get unmapped while in the
pre-copy phase of migration. In that case, the unmap ioctl should return the
pages pinned in that range and QEMU should find their corresponding guest
physical addresses and report those dirty.

Note: This patch is not yet tested. I'm trying to see how I can test this
code path.

Suggested-by: Alex Williamson 
Signed-off-by: Kirti Wankhede 
Reviewed-by: Neo Jia 
---
 hw/vfio/common.c | 87 +---
 1 file changed, 83 insertions(+), 4 deletions(-)

diff --git a/hw/vfio/common.c b/hw/vfio/common.c
index 4cad6fc20c35..9b29de654c7f 100644
--- a/hw/vfio/common.c
+++ b/hw/vfio/common.c
@@ -311,11 +311,85 @@ static bool vfio_devices_are_stopped_and_saving(void)
 return true;
 }
 
+static bool vfio_devices_are_running_and_saving(void)
+{
+VFIOGroup *group;
+VFIODevice *vbasedev;
+
+QLIST_FOREACH(group, &vfio_group_list, next) {
+QLIST_FOREACH(vbasedev, &group->device_list, next) {
+if ((vbasedev->device_state & VFIO_DEVICE_STATE_SAVING) &&
+(vbasedev->device_state & VFIO_DEVICE_STATE_RUNNING)) {
+continue;
+} else {
+return false;
+}
+}
+}
+return true;
+}
+
+static int vfio_dma_unmap_bitmap(VFIOContainer *container,
+ hwaddr iova, ram_addr_t size,
+ IOMMUTLBEntry *iotlb)
+{
+struct vfio_iommu_type1_dma_unmap *unmap;
+struct vfio_bitmap *bitmap;
+uint64_t pages = TARGET_PAGE_ALIGN(size) >> TARGET_PAGE_BITS;
+int ret;
+
+unmap = g_malloc0(sizeof(*unmap) + sizeof(*bitmap));
+if (!unmap) {
+return -ENOMEM;
+}
+
+unmap->argsz = sizeof(*unmap) + sizeof(*bitmap);
+unmap->flags |= VFIO_DMA_UNMAP_FLAG_GET_DIRTY_BITMAP;
+bitmap = (struct vfio_bitmap *)&unmap->data;
+
+/*
+ * cpu_physical_memory_set_dirty_lebitmap() expects pages in bitmap of
+ * TARGET_PAGE_SIZE to mark those dirty. Hence set bitmap_pgsize to
+ * TARGET_PAGE_SIZE.
+ */
+
+bitmap->pgsize = TARGET_PAGE_SIZE;
+bitmap->size = ROUND_UP(pages / 8, sizeof(uint64_t));
+
+if (bitmap->size > container->max_dirty_bitmap_size) {
+error_report("UNMAP: Size of bitmap too big 0x%llx", bitmap->size);
+ret = -E2BIG;
+goto unmap_exit;
+}
+
+bitmap->data = g_malloc0(bitmap->size);
+if (!bitmap->data) {
+error_report("UNMAP: Error allocating bitmap of size 0x%llx",
+ bitmap->size);
+ret = -ENOMEM;
+goto unmap_exit;
+}
+
+ret = ioctl(container->fd, VFIO_IOMMU_UNMAP_DMA, unmap);
+if (!ret) {
+cpu_physical_memory_set_dirty_lebitmap((uint64_t *)bitmap->data,
+iotlb->translated_addr, pages);
+} else {
+error_report("VFIO_UNMAP_DMA with DIRTY_BITMAP : %d", -errno);
+}
+
+g_free(bitmap->data);
+unmap_exit:
+g_free(unmap);
+return ret;
+}
+
 /*
  * DMA - Mapping and unmapping for the "type1" IOMMU interface used on x86
  */
 static int vfio_dma_unmap(VFIOContainer *container,
-  hwaddr iova, ram_addr_t size)
+  hwaddr iova, ram_addr_t size,
+  IOMMUTLBEntry *iotlb)
 {
 struct vfio_iommu_type1_dma_unmap unmap = {
 .argsz = sizeof(unmap),
@@ -324,6 +398,11 @@ static int vfio_dma_unmap(VFIOContainer *container,
 .size = size,
 };
 
+if (iotlb && container->dirty_pages_supported &&
+vfio_devices_are_running_and_saving()) {
+return vfio_dma_unmap_bitmap(container, iova, size, iotlb);
+}
+
 while (ioctl(container->fd, VFIO_IOMMU_UNMAP_DMA, &unmap)) {
 /*
  * The type1 backend has an off-by-one bug in the kernel (71a7d3d78e3c
@@ -371,7 +450,7 @@ static int vfio_dma_map(VFIOContainer *container, hwaddr 
iova,
  * the VGA ROM space.
  */
 if (ioctl(container->fd, VFIO_IOMMU_MAP_DMA, &map) == 0 ||
-(errno == EBUSY && vfio_dma_unmap(container, iova, size) == 0 &&
+(errno == EBUSY && vfio_dma_unmap(container, iova, size, NULL) == 0 &&
  ioctl(container->fd, VFIO_IOMMU_MAP_DMA, &map) == 0)) {
 return 0;
 }
@@ -519,7 +598,7 @@ static void vfio_iommu_map_notify(IOMMUNotifier *n, 
IOMMUTLBEntry *iotlb)
  iotlb->addr_mask + 1, vaddr, ret);
 }
 } else {
-ret = vfio_dma_unmap(container, iova, iotlb->addr_mask + 1);
+ret = vfio_dma_unmap(container, iova, iotlb->addr_mask + 1, iotlb);
 if (ret) {
 error_report("vfio_dma_unmap(%p, 0x%"HWADDR_PRIx", "
  "0x%"HWADDR_PRIx") = %d (%m)",
@@ -822,7 +901,7 @@ static 

[PATCH QEMU v22 07/18] vfio: Add migration state change notifier

2020-05-18 Thread Kirti Wankhede
Added migration state change notifier to get notification on migration state
change. These states are translated to VFIO device state and conveyed to vendor
driver.

Signed-off-by: Kirti Wankhede 
Reviewed-by: Neo Jia 
---
 hw/vfio/migration.c   | 30 ++
 hw/vfio/trace-events  |  5 +++--
 include/hw/vfio/vfio-common.h |  1 +
 3 files changed, 34 insertions(+), 2 deletions(-)

diff --git a/hw/vfio/migration.c b/hw/vfio/migration.c
index e79b34003079..c2f5564b51c3 100644
--- a/hw/vfio/migration.c
+++ b/hw/vfio/migration.c
@@ -154,6 +154,28 @@ static void vfio_vmstate_change(void *opaque, int running, 
RunState state)
 }
 }
 
+static void vfio_migration_state_notifier(Notifier *notifier, void *data)
+{
+MigrationState *s = data;
+VFIODevice *vbasedev = container_of(notifier, VFIODevice, migration_state);
+int ret;
+
+trace_vfio_migration_state_notifier(vbasedev->name,
+MigrationStatus_str(s->state));
+
+switch (s->state) {
+case MIGRATION_STATUS_CANCELLING:
+case MIGRATION_STATUS_CANCELLED:
+case MIGRATION_STATUS_FAILED:
+ret = vfio_migration_set_state(vbasedev,
+  ~(VFIO_DEVICE_STATE_SAVING | VFIO_DEVICE_STATE_RESUMING),
+  VFIO_DEVICE_STATE_RUNNING);
+if (ret) {
+error_report("%s: Failed to set state RUNNING", vbasedev->name);
+}
+}
+}
+
 static int vfio_migration_init(VFIODevice *vbasedev,
struct vfio_region_info *info)
 {
@@ -173,6 +195,9 @@ static int vfio_migration_init(VFIODevice *vbasedev,
 vbasedev->vm_state = qemu_add_vm_change_state_handler(vfio_vmstate_change,
   vbasedev);
 
+vbasedev->migration_state.notify = vfio_migration_state_notifier;
+add_migration_state_change_notifier(&vbasedev->migration_state);
+
 return 0;
 }
 
@@ -211,6 +236,11 @@ add_blocker:
 
 void vfio_migration_finalize(VFIODevice *vbasedev)
 {
+
+if (vbasedev->migration_state.notify) {
+remove_migration_state_change_notifier(&vbasedev->migration_state);
+}
+
 if (vbasedev->vm_state) {
 qemu_del_vm_change_state_handler(vbasedev->vm_state);
 }
diff --git a/hw/vfio/trace-events b/hw/vfio/trace-events
index 14b0a86c0035..bd3d47b005cb 100644
--- a/hw/vfio/trace-events
+++ b/hw/vfio/trace-events
@@ -146,5 +146,6 @@ vfio_display_edid_write_error(void) ""
 
 # migration.c
 vfio_migration_probe(const char *name, uint32_t index) " (%s) Region %d"
-vfio_migration_set_state(char *name, uint32_t state) " (%s) state %d"
-vfio_vmstate_change(char *name, int running, const char *reason, uint32_t 
dev_state) " (%s) running %d reason %s device state %d"
+vfio_migration_set_state(const char *name, uint32_t state) " (%s) state %d"
+vfio_vmstate_change(const char *name, int running, const char *reason, 
uint32_t dev_state) " (%s) running %d reason %s device state %d"
+vfio_migration_state_notifier(const char *name, const char *state) " (%s) 
state %s"
diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h
index 3d18eb146b33..28f55f66d019 100644
--- a/include/hw/vfio/vfio-common.h
+++ b/include/hw/vfio/vfio-common.h
@@ -123,6 +123,7 @@ typedef struct VFIODevice {
 VMChangeStateEntry *vm_state;
 uint32_t device_state;
 int vm_running;
+Notifier migration_state;
 } VFIODevice;
 
 struct VFIODeviceOps {
-- 
2.7.0




[PATCH QEMU v22 13/18] vfio: Get migration capability flags for container

2020-05-18 Thread Kirti Wankhede
Added helper functions to get IOMMU info capability chain.
Added function to get migration capability information from that
capability chain for IOMMU container.

Similar change was proposed earlier:
https://lists.gnu.org/archive/html/qemu-devel/2018-05/msg03759.html

Signed-off-by: Kirti Wankhede 
Cc: Shameer Kolothum 
Cc: Eric Auger 
---
 hw/vfio/common.c  | 91 +++
 include/hw/vfio/vfio-common.h |  3 ++
 2 files changed, 86 insertions(+), 8 deletions(-)

diff --git a/hw/vfio/common.c b/hw/vfio/common.c
index 2c2db4bcba20..8e9d1431178c 100644
--- a/hw/vfio/common.c
+++ b/hw/vfio/common.c
@@ -1227,6 +1227,75 @@ static int vfio_init_container(VFIOContainer *container, 
int group_fd,
 return 0;
 }
 
+static int vfio_get_iommu_info(VFIOContainer *container,
+   struct vfio_iommu_type1_info **info)
+{
+
+size_t argsz = sizeof(struct vfio_iommu_type1_info);
+
+*info = g_new0(struct vfio_iommu_type1_info, 1);
+again:
+(*info)->argsz = argsz;
+
+if (ioctl(container->fd, VFIO_IOMMU_GET_INFO, *info)) {
+g_free(*info);
+*info = NULL;
+return -errno;
+}
+
+if (((*info)->argsz > argsz)) {
+argsz = (*info)->argsz;
+*info = g_realloc(*info, argsz);
+goto again;
+}
+
+return 0;
+}
+
+static struct vfio_info_cap_header *
+vfio_get_iommu_info_cap(struct vfio_iommu_type1_info *info, uint16_t id)
+{
+struct vfio_info_cap_header *hdr;
+void *ptr = info;
+
+if (!(info->flags & VFIO_IOMMU_INFO_CAPS)) {
+return NULL;
+}
+
+for (hdr = ptr + info->cap_offset; hdr != ptr; hdr = ptr + hdr->next) {
+if (hdr->id == id) {
+return hdr;
+}
+}
+
+return NULL;
+}
+
+static void vfio_get_iommu_info_migration(VFIOContainer *container,
+ struct vfio_iommu_type1_info *info)
+{
+struct vfio_info_cap_header *hdr;
+struct vfio_iommu_type1_info_cap_migration *cap_mig;
+
+hdr = vfio_get_iommu_info_cap(info, VFIO_IOMMU_TYPE1_INFO_CAP_MIGRATION);
+if (!hdr) {
+return;
+}
+
+cap_mig = container_of(hdr, struct vfio_iommu_type1_info_cap_migration,
+header);
+
+container->dirty_pages_supported = true;
+container->max_dirty_bitmap_size = cap_mig->max_dirty_bitmap_size;
+container->dirty_pgsizes = cap_mig->pgsize_bitmap;
+
+/*
+ * cpu_physical_memory_set_dirty_lebitmap() expects pages in bitmap of
+ * TARGET_PAGE_SIZE to mark those dirty.
+ */
+assert(container->dirty_pgsizes & TARGET_PAGE_SIZE);
+}
+
 static int vfio_connect_container(VFIOGroup *group, AddressSpace *as,
   Error **errp)
 {
@@ -1291,6 +1360,7 @@ static int vfio_connect_container(VFIOGroup *group, 
AddressSpace *as,
 container->space = space;
 container->fd = fd;
 container->error = NULL;
+container->dirty_pages_supported = false;
 QLIST_INIT(&container->giommu_list);
 QLIST_INIT(&container->hostwin_list);
 
@@ -1303,7 +1373,7 @@ static int vfio_connect_container(VFIOGroup *group, 
AddressSpace *as,
 case VFIO_TYPE1v2_IOMMU:
 case VFIO_TYPE1_IOMMU:
 {
-struct vfio_iommu_type1_info info;
+struct vfio_iommu_type1_info *info;
 
 /*
  * FIXME: This assumes that a Type1 IOMMU can map any 64-bit
@@ -1312,15 +1382,20 @@ static int vfio_connect_container(VFIOGroup *group, 
AddressSpace *as,
  * existing Type1 IOMMUs generally support any IOVA we're
  * going to actually try in practice.
  */
-info.argsz = sizeof(info);
-ret = ioctl(fd, VFIO_IOMMU_GET_INFO, &info);
-/* Ignore errors */
-if (ret || !(info.flags & VFIO_IOMMU_INFO_PGSIZES)) {
+ret = vfio_get_iommu_info(container, &info);
+if (ret) {
+goto free_container_exit;
+}
+
+if (!(info->flags & VFIO_IOMMU_INFO_PGSIZES)) {
 /* Assume 4k IOVA page size */
-info.iova_pgsizes = 4096;
+info->iova_pgsizes = 4096;
 }
-vfio_host_win_add(container, 0, (hwaddr)-1, info.iova_pgsizes);
-container->pgsizes = info.iova_pgsizes;
+vfio_host_win_add(container, 0, (hwaddr)-1, info->iova_pgsizes);
+container->pgsizes = info->iova_pgsizes;
+
+vfio_get_iommu_info_migration(container, info);
+g_free(info);
 break;
 }
 case VFIO_SPAPR_TCE_v2_IOMMU:
diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h
index c78033e4149d..5a57a78ec517 100644
--- a/include/hw/vfio/vfio-common.h
+++ b/include/hw/vfio/vfio-common.h
@@ -79,6 +79,9 @@ typedef struct VFIOContainer {
 unsigned iommu_type;
 Error *error;
 bool initialized;
+bool dirty_pages_supported;
+uint64_t dirty_pgsizes;
+uint64_t max_dirty_bitmap

[PATCH QEMU v22 11/18] iommu: add callback to get address limit IOMMU supports

2020-05-18 Thread Kirti Wankhede
Add optional method to get address limit IOMMU supports

Signed-off-by: Kirti Wankhede 
---
 hw/i386/intel_iommu.c |  9 +
 include/exec/memory.h | 18 ++
 memory.c  | 11 +++
 3 files changed, 38 insertions(+)

diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c
index df7ad254ac15..d0b88c20c31e 100644
--- a/hw/i386/intel_iommu.c
+++ b/hw/i386/intel_iommu.c
@@ -3577,6 +3577,14 @@ static void vtd_iommu_replay(IOMMUMemoryRegion 
*iommu_mr, IOMMUNotifier *n)
 return;
 }
 
+static hwaddr vtd_iommu_get_address_limit(IOMMUMemoryRegion *iommu_mr)
+{
+VTDAddressSpace *vtd_as = container_of(iommu_mr, VTDAddressSpace, iommu);
+IntelIOMMUState *s = vtd_as->iommu_state;
+
+return VTD_ADDRESS_SIZE(s->aw_bits) - 1;
+}
+
 /* Do the initialization. It will also be called when reset, so pay
  * attention when adding new initialization stuff.
  */
@@ -3878,6 +3886,7 @@ static void 
vtd_iommu_memory_region_class_init(ObjectClass *klass,
 imrc->translate = vtd_iommu_translate;
 imrc->notify_flag_changed = vtd_iommu_notify_flag_changed;
 imrc->replay = vtd_iommu_replay;
+imrc->get_address_limit = vtd_iommu_get_address_limit;
 }
 
 static const TypeInfo vtd_iommu_memory_region_info = {
diff --git a/include/exec/memory.h b/include/exec/memory.h
index e000bd2f97b2..2d0cbd46d2a6 100644
--- a/include/exec/memory.h
+++ b/include/exec/memory.h
@@ -355,6 +355,16 @@ typedef struct IOMMUMemoryRegionClass {
  * @iommu: the IOMMUMemoryRegion
  */
 int (*num_indexes)(IOMMUMemoryRegion *iommu);
+
+/*
+ * Return address limit this IOMMU supports.
+ *
+ * Optional method: if this method is not provided, then
+ * memory_region_iommu_get_address_limit() will return 0.
+ *
+ * @iommu: the IOMMUMemoryRegion
+ */
+hwaddr (*get_address_limit)(IOMMUMemoryRegion *iommu);
 } IOMMUMemoryRegionClass;
 
 typedef struct CoalescedMemoryRange CoalescedMemoryRange;
@@ -1364,6 +1374,14 @@ int memory_region_iommu_attrs_to_index(IOMMUMemoryRegion 
*iommu_mr,
 int memory_region_iommu_num_indexes(IOMMUMemoryRegion *iommu_mr);
 
 /**
+ * memory_region_iommu_get_address_limit : return the maximum address limit
+ * that this IOMMU supports.
+ *
+ * @iommu_mr: the memory region
+ */
+hwaddr memory_region_iommu_get_address_limit(IOMMUMemoryRegion *iommu_mr);
+
+/**
  * memory_region_name: get a memory region's name
  *
  * Returns the string that was used to initialize the memory region.
diff --git a/memory.c b/memory.c
index 601b74990620..52f1a4cd37f0 100644
--- a/memory.c
+++ b/memory.c
@@ -1887,6 +1887,17 @@ void memory_region_iommu_replay(IOMMUMemoryRegion 
*iommu_mr, IOMMUNotifier *n)
 }
 }
 
+hwaddr memory_region_iommu_get_address_limit(IOMMUMemoryRegion *iommu_mr)
+{
+IOMMUMemoryRegionClass *imrc = IOMMU_MEMORY_REGION_GET_CLASS(iommu_mr);
+
+if (imrc->get_address_limit) {
+return imrc->get_address_limit(iommu_mr);
+}
+
+return 0;
+}
+
 void memory_region_unregister_iommu_notifier(MemoryRegion *mr,
  IOMMUNotifier *n)
 {
-- 
2.7.0




[PATCH QEMU v22 10/18] vfio: Add load state functions to SaveVMHandlers

2020-05-18 Thread Kirti Wankhede
Sequence during the _RESUMING device state:
While data for this device is available, repeat the steps below:
a. read data_offset, which tells where the user application should write data.
b. write data of data_size to the migration region starting at data_offset.
c. write data_size, which indicates to the vendor driver that data has been
   written to the staging buffer.

For the user the data is opaque. The user should write data in the same order
as it was received.
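A rough user-space illustration of one iteration of this sequence (a hypothetical helper, not part of the posted patch; it assumes the migration-region layout from the kernel series, with region_fd_offset being the start of the migration region within the device fd, and uses pread/pwrite although the data area may also be mmap'ed):

#include <stddef.h>
#include <stdint.h>
#include <sys/types.h>
#include <unistd.h>
#include <linux/vfio.h>

static int resume_write_chunk(int device_fd, off_t region_fd_offset,
                              const void *buf, uint64_t data_size)
{
    uint64_t data_offset;

    /* a. read data_offset: where the vendor driver wants this chunk */
    if (pread(device_fd, &data_offset, sizeof(data_offset),
              region_fd_offset +
              offsetof(struct vfio_device_migration_info, data_offset))
        != sizeof(data_offset))
        return -1;

    /* b. write data_size bytes of opaque data into the region at data_offset */
    if (pwrite(device_fd, buf, data_size, region_fd_offset + data_offset)
        != (ssize_t)data_size)
        return -1;

    /* c. report data_size so the vendor driver consumes the staging buffer */
    if (pwrite(device_fd, &data_size, sizeof(data_size),
               region_fd_offset +
               offsetof(struct vfio_device_migration_info, data_size))
        != sizeof(data_size))
        return -1;

    return 0;
}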

Signed-off-by: Kirti Wankhede 
Reviewed-by: Neo Jia 
---
 hw/vfio/migration.c  | 180 +++
 hw/vfio/trace-events |   3 +
 2 files changed, 183 insertions(+)

diff --git a/hw/vfio/migration.c b/hw/vfio/migration.c
index d90bd2296afd..b9bbe38e539c 100644
--- a/hw/vfio/migration.c
+++ b/hw/vfio/migration.c
@@ -270,6 +270,33 @@ static int vfio_save_device_config_state(QEMUFile *f, void 
*opaque)
 return qemu_file_get_error(f);
 }
 
+static int vfio_load_device_config_state(QEMUFile *f, void *opaque)
+{
+VFIODevice *vbasedev = opaque;
+uint64_t data;
+
+if (vbasedev->ops && vbasedev->ops->vfio_load_config) {
+int ret;
+
+ret = vbasedev->ops->vfio_load_config(vbasedev, f);
+if (ret) {
+error_report("%s: Failed to load device config space",
+ vbasedev->name);
+return ret;
+}
+}
+
+data = qemu_get_be64(f);
+if (data != VFIO_MIG_FLAG_END_OF_STATE) {
+error_report("%s: Failed loading device config space, "
+ "end flag incorrect 0x%"PRIx64, vbasedev->name, data);
+return -EINVAL;
+}
+
+trace_vfio_load_device_config_state(vbasedev->name);
+return qemu_file_get_error(f);
+}
+
 /* -- */
 
 static int vfio_save_setup(QEMUFile *f, void *opaque)
@@ -439,12 +466,165 @@ static int vfio_save_complete_precopy(QEMUFile *f, void 
*opaque)
 return ret;
 }
 
+static int vfio_load_setup(QEMUFile *f, void *opaque)
+{
+VFIODevice *vbasedev = opaque;
+VFIOMigration *migration = vbasedev->migration;
+int ret = 0;
+
+if (migration->region.mmaps) {
+ret = vfio_region_mmap(>region);
+if (ret) {
+error_report("%s: Failed to mmap VFIO migration region %d: %s",
+ vbasedev->name, migration->region.nr,
+ strerror(-ret));
+return ret;
+}
+}
+
+ret = vfio_migration_set_state(vbasedev, ~VFIO_DEVICE_STATE_MASK,
+   VFIO_DEVICE_STATE_RESUMING);
+if (ret) {
+error_report("%s: Failed to set state RESUMING", vbasedev->name);
+}
+return ret;
+}
+
+static int vfio_load_cleanup(void *opaque)
+{
+vfio_save_cleanup(opaque);
+return 0;
+}
+
+static int vfio_load_state(QEMUFile *f, void *opaque, int version_id)
+{
+VFIODevice *vbasedev = opaque;
+VFIOMigration *migration = vbasedev->migration;
+int ret = 0;
+uint64_t data, data_size;
+
+data = qemu_get_be64(f);
+while (data != VFIO_MIG_FLAG_END_OF_STATE) {
+
+trace_vfio_load_state(vbasedev->name, data);
+
+switch (data) {
+case VFIO_MIG_FLAG_DEV_CONFIG_STATE:
+{
+ret = vfio_load_device_config_state(f, opaque);
+if (ret) {
+return ret;
+}
+break;
+}
+case VFIO_MIG_FLAG_DEV_SETUP_STATE:
+{
+data = qemu_get_be64(f);
+if (data == VFIO_MIG_FLAG_END_OF_STATE) {
+return ret;
+} else {
+error_report("%s: SETUP STATE: EOS not found 0x%"PRIx64,
+ vbasedev->name, data);
+return -EINVAL;
+}
+break;
+}
+case VFIO_MIG_FLAG_DEV_DATA_STATE:
+{
+VFIORegion *region = >region;
+void *buf = NULL;
+bool buffer_mmaped = false;
+uint64_t data_offset = 0;
+
+data_size = qemu_get_be64(f);
+if (data_size == 0) {
+break;
+}
+
+ret = pread(vbasedev->fd, &data_offset, sizeof(data_offset),
+region->fd_offset +
+offsetof(struct vfio_device_migration_info,
+data_offset));
+if (ret != sizeof(data_offset)) {
+error_report("%s:Failed to get migration buffer data offset %d",
+ vbasedev->name, ret);
+return -EINVAL;
+}
+
+if (region->mmaps) {
+buf = find_data_region(region, data_offset, data_size);
+}
+
+buffer_mmaped = (buf != NULL) ? true : false;
+
+if (!buffer_mmaped) {
+buf = g_try_malloc0(data_size

[PATCH QEMU v22 05/18] vfio: Add migration region initialization and finalize function

2020-05-18 Thread Kirti Wankhede
- Migration functions are implemented for VFIO_DEVICE_TYPE_PCI device in this
  patch series.
- VFIO device supports migration or not is decided based of migration region
  query. If migration region query is successful and migration region
  initialization is successful then migration is supported else migration is
  blocked.

Signed-off-by: Kirti Wankhede 
Reviewed-by: Neo Jia 
---
 hw/vfio/Makefile.objs |   2 +-
 hw/vfio/migration.c   | 138 ++
 hw/vfio/trace-events  |   3 +
 include/hw/vfio/vfio-common.h |   9 +++
 4 files changed, 151 insertions(+), 1 deletion(-)
 create mode 100644 hw/vfio/migration.c

diff --git a/hw/vfio/Makefile.objs b/hw/vfio/Makefile.objs
index 9bb1c09e8477..8b296c889ed9 100644
--- a/hw/vfio/Makefile.objs
+++ b/hw/vfio/Makefile.objs
@@ -1,4 +1,4 @@
-obj-y += common.o spapr.o
+obj-y += common.o spapr.o migration.o
 obj-$(CONFIG_VFIO_PCI) += pci.o pci-quirks.o display.o
 obj-$(CONFIG_VFIO_CCW) += ccw.o
 obj-$(CONFIG_VFIO_PLATFORM) += platform.o
diff --git a/hw/vfio/migration.c b/hw/vfio/migration.c
new file mode 100644
index ..bf9384907ec0
--- /dev/null
+++ b/hw/vfio/migration.c
@@ -0,0 +1,138 @@
+/*
+ * Migration support for VFIO devices
+ *
+ * Copyright NVIDIA, Inc. 2020
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2. See
+ * the COPYING file in the top-level directory.
+ */
+
+#include "qemu/osdep.h"
+#include 
+
+#include "hw/vfio/vfio-common.h"
+#include "cpu.h"
+#include "migration/migration.h"
+#include "migration/qemu-file.h"
+#include "migration/register.h"
+#include "migration/blocker.h"
+#include "migration/misc.h"
+#include "qapi/error.h"
+#include "exec/ramlist.h"
+#include "exec/ram_addr.h"
+#include "pci.h"
+#include "trace.h"
+
+static void vfio_migration_region_exit(VFIODevice *vbasedev)
+{
+VFIOMigration *migration = vbasedev->migration;
+
+if (!migration) {
+return;
+}
+
+if (migration->region.size) {
+vfio_region_exit(>region);
+vfio_region_finalize(>region);
+}
+}
+
+static int vfio_migration_region_init(VFIODevice *vbasedev, int index)
+{
+VFIOMigration *migration = vbasedev->migration;
+Object *obj = NULL;
+int ret = -EINVAL;
+
+if (!vbasedev->ops->vfio_get_object) {
+return ret;
+}
+
+obj = vbasedev->ops->vfio_get_object(vbasedev);
+if (!obj) {
+return ret;
+}
+
+ret = vfio_region_setup(obj, vbasedev, >region, index,
+"migration");
+if (ret) {
+error_report("%s: Failed to setup VFIO migration region %d: %s",
+ vbasedev->name, index, strerror(-ret));
+goto err;
+}
+
+if (!migration->region.size) {
+ret = -EINVAL;
+error_report("%s: Invalid region size of VFIO migration region %d: %s",
+ vbasedev->name, index, strerror(-ret));
+goto err;
+}
+
+return 0;
+
+err:
+vfio_migration_region_exit(vbasedev);
+return ret;
+}
+
+static int vfio_migration_init(VFIODevice *vbasedev,
+   struct vfio_region_info *info)
+{
+int ret;
+
+vbasedev->migration = g_new0(VFIOMigration, 1);
+
+ret = vfio_migration_region_init(vbasedev, info->index);
+if (ret) {
+error_report("%s: Failed to initialise migration region",
+ vbasedev->name);
+g_free(vbasedev->migration);
+vbasedev->migration = NULL;
+return ret;
+}
+
+return 0;
+}
+
+/* -- */
+
+int vfio_migration_probe(VFIODevice *vbasedev, Error **errp)
+{
+struct vfio_region_info *info;
+Error *local_err = NULL;
+int ret;
+
+ret = vfio_get_dev_region_info(vbasedev, VFIO_REGION_TYPE_MIGRATION,
+   VFIO_REGION_SUBTYPE_MIGRATION, );
+if (ret) {
+goto add_blocker;
+}
+
+ret = vfio_migration_init(vbasedev, info);
+if (ret) {
+goto add_blocker;
+}
+
+trace_vfio_migration_probe(vbasedev->name, info->index);
+return 0;
+
+add_blocker:
+error_setg(>migration_blocker,
+   "VFIO device doesn't support migration");
+ret = migrate_add_blocker(vbasedev->migration_blocker, _err);
+if (local_err) {
+error_propagate(errp, local_err);
+error_free(vbasedev->migration_blocker);
+}
+return ret;
+}
+
+void vfio_migration_finalize(VFIODevice *vbasedev)
+{
+if (vbasedev->migration_blocker) {
+migrate_del_blocker(vbasedev->migration_blocker);
+error_free(vbasedev->migration_blocker);
+}
+
+vfio_migration_region_exit(vbasedev);
+g_free(v

[PATCH QEMU v22 09/18] vfio: Add save state functions to SaveVMHandlers

2020-05-18 Thread Kirti Wankhede
Added .save_live_pending, .save_live_iterate and .save_live_complete_precopy
functions. These functions handle the pre-copy and stop-and-copy phases.

In the _SAVING|_RUNNING device state, i.e. the pre-copy phase:
- read pending_bytes. If pending_bytes > 0, go through the steps below.
- read data_offset - this instructs the kernel driver to write data to the
  staging buffer.
- read data_size - the amount of data in bytes written by the vendor driver
  in the migration region.
- read data_size bytes of data from data_offset in the migration region.
- write the data packet to the file stream as below:
{VFIO_MIG_FLAG_DEV_DATA_STATE, data_size, actual data,
VFIO_MIG_FLAG_END_OF_STATE}

In the _SAVING device state, i.e. the stop-and-copy phase:
a. read the device's config space and save it to the migration file stream.
   This doesn't need to come from the vendor driver; any other special config
   state from the driver can be saved as data in the following iterations.
b. read pending_bytes. If pending_bytes > 0, go through the steps below.
c. read data_offset - this instructs the kernel driver to write data to the
   staging buffer.
d. read data_size - the amount of data in bytes written by the vendor driver
   in the migration region.
e. read data_size bytes of data from data_offset in the migration region.
f. write the data packet as below:
   {VFIO_MIG_FLAG_DEV_DATA_STATE, data_size, actual data}
g. iterate through steps b to f while (pending_bytes > 0)
h. write {VFIO_MIG_FLAG_END_OF_STATE}

When the data region is mapped, it is the user's responsibility to read
data_size bytes of data from data_offset before moving on to the next step.
A simplified sketch of this sequence follows below.
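
As a reading aid, a simplified sketch of the stop-and-copy loop (steps b to h)
follows. It is not the exact code from this patch: it reuses vfio_save_buffer()
and the VFIO_MIG_FLAG_* delimiters from the diff below, and assumes the
pending_bytes field of struct vfio_device_migration_info and the usual
hw/vfio/migration.c includes.

static int vfio_save_stop_and_copy_sketch(QEMUFile *f, VFIODevice *vbasedev)
{
    VFIOMigration *migration = vbasedev->migration;
    VFIORegion *region = &migration->region;
    uint64_t pending_bytes;
    int ret;

    do {
        /* step b: re-read pending_bytes from the migration region */
        ret = pread(vbasedev->fd, &pending_bytes, sizeof(pending_bytes),
                    region->fd_offset +
                    offsetof(struct vfio_device_migration_info,
                             pending_bytes));
        if (ret != sizeof(pending_bytes)) {
            return -EINVAL;
        }

        if (pending_bytes) {
            /* steps c-f: one {DEV_DATA_STATE, data_size, data} packet */
            qemu_put_be64(f, VFIO_MIG_FLAG_DEV_DATA_STATE);
            ret = vfio_save_buffer(f, vbasedev);
            if (ret < 0) {
                return ret;
            }
        }
    } while (pending_bytes > 0);                   /* step g */

    qemu_put_be64(f, VFIO_MIG_FLAG_END_OF_STATE);  /* step h */
    return qemu_file_get_error(f);
}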

Signed-off-by: Kirti Wankhede 
Reviewed-by: Neo Jia 
---
 hw/vfio/migration.c   | 254 +-
 hw/vfio/trace-events  |   6 +
 include/hw/vfio/vfio-common.h |   1 +
 3 files changed, 260 insertions(+), 1 deletion(-)

diff --git a/hw/vfio/migration.c b/hw/vfio/migration.c
index 773c8d16b1c1..d90bd2296afd 100644
--- a/hw/vfio/migration.c
+++ b/hw/vfio/migration.c
@@ -139,6 +139,137 @@ static int vfio_migration_set_state(VFIODevice *vbasedev, 
uint32_t mask,
 return 0;
 }
 
+static void *find_data_region(VFIORegion *region,
+  uint64_t data_offset,
+  uint64_t data_size)
+{
+void *ptr = NULL;
+int i;
+
+for (i = 0; i < region->nr_mmaps; i++) {
+if ((data_offset >= region->mmaps[i].offset) &&
+(data_offset < region->mmaps[i].offset + region->mmaps[i].size) &&
+(data_size <= region->mmaps[i].size)) {
+ptr = region->mmaps[i].mmap + (data_offset -
+   region->mmaps[i].offset);
+break;
+}
+}
+return ptr;
+}
+
+static int vfio_save_buffer(QEMUFile *f, VFIODevice *vbasedev)
+{
+VFIOMigration *migration = vbasedev->migration;
+VFIORegion *region = &migration->region;
+uint64_t data_offset = 0, data_size = 0;
+int ret;
+
+ret = pread(vbasedev->fd, &data_offset, sizeof(data_offset),
+region->fd_offset + offsetof(struct vfio_device_migration_info,
+ data_offset));
+if (ret != sizeof(data_offset)) {
+error_report("%s: Failed to get migration buffer data offset %d",
+ vbasedev->name, ret);
+return -EINVAL;
+}
+
+ret = pread(vbasedev->fd, &data_size, sizeof(data_size),
+region->fd_offset + offsetof(struct vfio_device_migration_info,
+ data_size));
+if (ret != sizeof(data_size)) {
+error_report("%s: Failed to get migration buffer data size %d",
+ vbasedev->name, ret);
+return -EINVAL;
+}
+
+if (data_size > 0) {
+void *buf = NULL;
+bool buffer_mmaped;
+
+if (region->mmaps) {
+buf = find_data_region(region, data_offset, data_size);
+}
+
+buffer_mmaped = (buf != NULL);
+
+if (!buffer_mmaped) {
+buf = g_try_malloc(data_size);
+if (!buf) {
+error_report("%s: Error allocating buffer ", __func__);
+return -ENOMEM;
+}
+
+ret = pread(vbasedev->fd, buf, data_size,
+region->fd_offset + data_offset);
+if (ret != data_size) {
+error_report("%s: Failed to get migration data %d",
+ vbasedev->name, ret);
+g_free(buf);
+return -EINVAL;
+}
+}
+
+qemu_put_be64(f, data_size);
+qemu_put_buffer(f, buf, data_size);
+
+if (!buffer_mmaped) {
+g_free(buf);
+}
+} else {
+qemu_put_be64(f, data_size);
+}
+
+trace_vfio_save_buffer(vbasedev->name, data_offset, data_size,
+   migration->pending_bytes);
+
+ret = qemu_fi

[PATCH QEMU v22 04/18] vfio: Add save and load functions for VFIO PCI devices

2020-05-18 Thread Kirti Wankhede
These functions save and restore PCI device specific data, i.e. the config
space of the PCI device.
Save and restore were tested with the MSI and MSI-X interrupt types.

Signed-off-by: Kirti Wankhede 
Reviewed-by: Neo Jia 
---
 hw/vfio/pci.c | 163 ++
 include/hw/vfio/vfio-common.h |   2 +
 2 files changed, 165 insertions(+)

diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
index 6c77c12e44b9..36b1e08f84d8 100644
--- a/hw/vfio/pci.c
+++ b/hw/vfio/pci.c
@@ -41,6 +41,7 @@
 #include "trace.h"
 #include "qapi/error.h"
 #include "migration/blocker.h"
+#include "migration/qemu-file.h"
 
 #define TYPE_VFIO_PCI "vfio-pci"
 #define PCI_VFIO(obj)OBJECT_CHECK(VFIOPCIDevice, obj, TYPE_VFIO_PCI)
@@ -1632,6 +1633,50 @@ static void vfio_bars_prepare(VFIOPCIDevice *vdev)
 }
 }
 
+static int vfio_bar_validate(VFIOPCIDevice *vdev, int nr)
+{
+PCIDevice *pdev = &vdev->pdev;
+VFIOBAR *bar = &vdev->bars[nr];
+uint64_t addr;
+uint32_t addr_lo, addr_hi = 0;
+
+/* Skip unimplemented BARs and the upper half of 64bit BARS. */
+if (!bar->size) {
+return 0;
+}
+
+addr_lo = pci_default_read_config(pdev, PCI_BASE_ADDRESS_0 + nr * 4, 4);
+
+addr_lo &= (bar->ioport ? PCI_BASE_ADDRESS_IO_MASK :
+  PCI_BASE_ADDRESS_MEM_MASK);
+if (bar->type == PCI_BASE_ADDRESS_MEM_TYPE_64) {
+addr_hi = pci_default_read_config(pdev,
+ PCI_BASE_ADDRESS_0 + (nr + 1) * 4, 4);
+}
+
+addr = ((uint64_t)addr_hi << 32) | addr_lo;
+
+if (!QEMU_IS_ALIGNED(addr, bar->size)) {
+return -EINVAL;
+}
+
+return 0;
+}
+
+static int vfio_bars_validate(VFIOPCIDevice *vdev)
+{
+int i, ret;
+
+for (i = 0; i < PCI_ROM_SLOT; i++) {
+ret = vfio_bar_validate(vdev, i);
+if (ret) {
+error_report("vfio: BAR address %d validation failed", i);
+return ret;
+}
+}
+return 0;
+}
+
 static void vfio_bar_register(VFIOPCIDevice *vdev, int nr)
 {
 VFIOBAR *bar = &vdev->bars[nr];
@@ -2414,11 +2459,129 @@ static Object *vfio_pci_get_object(VFIODevice 
*vbasedev)
 return OBJECT(vdev);
 }
 
+static void vfio_pci_save_config(VFIODevice *vbasedev, QEMUFile *f)
+{
+VFIOPCIDevice *vdev = container_of(vbasedev, VFIOPCIDevice, vbasedev);
+PCIDevice *pdev = &vdev->pdev;
+uint16_t pci_cmd;
+int i;
+
+for (i = 0; i < PCI_ROM_SLOT; i++) {
+uint32_t bar;
+
+bar = pci_default_read_config(pdev, PCI_BASE_ADDRESS_0 + i * 4, 4);
+qemu_put_be32(f, bar);
+}
+
+qemu_put_be32(f, vdev->interrupt);
+if (vdev->interrupt == VFIO_INT_MSI) {
+uint32_t msi_flags, msi_addr_lo, msi_addr_hi = 0, msi_data;
+bool msi_64bit;
+
+msi_flags = pci_default_read_config(pdev, pdev->msi_cap + PCI_MSI_FLAGS, 2);
+msi_64bit = (msi_flags & PCI_MSI_FLAGS_64BIT);
+
+msi_addr_lo = pci_default_read_config(pdev, pdev->msi_cap + PCI_MSI_ADDRESS_LO, 4);
+qemu_put_be32(f, msi_addr_lo);
+
+if (msi_64bit) {
+msi_addr_hi = pci_default_read_config(pdev, pdev->msi_cap + PCI_MSI_ADDRESS_HI, 4);
+}
+qemu_put_be32(f, msi_addr_hi);
+
+msi_data = pci_default_read_config(pdev, pdev->msi_cap + (msi_64bit ? PCI_MSI_DATA_64 : PCI_MSI_DATA_32), 2);
+qemu_put_be16(f, msi_data);
+} else if (vdev->interrupt == VFIO_INT_MSIX) {
+uint16_t offset;
+
+/* save enable bit and maskall bit */
+offset = pci_default_read_config(pdev,
+   pdev->msix_cap + PCI_MSIX_FLAGS + 1, 2);
+qemu_put_be16(f, offset);
+msix_save(pdev, f);
+}
+pci_cmd = pci_default_read_config(pdev, PCI_COMMAND, 2);
+qemu_put_be16(f, pci_cmd);
+}
+
+static int vfio_pci_load_config(VFIODevice *vbasedev, QEMUFile *f)
+{
+VFIOPCIDevice *vdev = container_of(vbasedev, VFIOPCIDevice, vbasedev);
+PCIDevice *pdev = &vdev->pdev;
+uint32_t interrupt_type;
+uint32_t msi_flags, msi_addr_lo, msi_addr_hi = 0, msi_data;
+uint16_t pci_cmd;
+bool msi_64bit;
+int i, ret;
+
+/* restore pci bar configuration */
+pci_cmd = pci_default_read_config(pdev, PCI_COMMAND, 2);
+vfio_pci_write_config(pdev, PCI_COMMAND,
+pci_cmd & ~(PCI_COMMAND_IO | PCI_COMMAND_MEMORY), 2);
+for (i = 0; i < PCI_ROM_SLOT; i++) {
+uint32_t bar = qemu_get_be32(f);
+
+vfio_pci_write_config(pdev, PCI_BASE_ADDRESS_0 + i * 4, bar, 4);
+}
+
+ret = vfio_bars_validate(vdev);
+if (ret) {
+return ret;
+}
+
+interrupt_type = qemu_get_be32(f

[PATCH QEMU v22 02/18] vfio: Add function to unmap VFIO region

2020-05-18 Thread Kirti Wankhede
This function will be used for the migration region.
The migration region is mmapped when migration starts and is unmapped when
migration completes, as sketched below.
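
For illustration, a sketch of how later patches in this series pair
vfio_region_mmap() with the new vfio_region_unmap() around migration. This is
a usage sketch only, using the VFIOMigration state introduced later in the
series, not code from this patch.

static int example_migration_setup(VFIODevice *vbasedev)
{
    VFIOMigration *migration = vbasedev->migration;

    /* map the region's sparse mmap areas when migration starts */
    if (migration->region.mmaps) {
        return vfio_region_mmap(&migration->region);
    }
    return 0;
}

static void example_migration_cleanup(VFIODevice *vbasedev)
{
    VFIOMigration *migration = vbasedev->migration;

    /* unmap them again once migration is complete */
    if (migration->region.mmaps) {
        vfio_region_unmap(&migration->region);
    }
}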

Signed-off-by: Kirti Wankhede 
Reviewed-by: Neo Jia 
Reviewed-by: Cornelia Huck 
---
 hw/vfio/common.c  | 30 ++
 hw/vfio/trace-events  |  1 +
 include/hw/vfio/vfio-common.h |  1 +
 3 files changed, 28 insertions(+), 4 deletions(-)

diff --git a/hw/vfio/common.c b/hw/vfio/common.c
index 0b3593b3c0c4..2c2db4bcba20 100644
--- a/hw/vfio/common.c
+++ b/hw/vfio/common.c
@@ -925,6 +925,18 @@ int vfio_region_setup(Object *obj, VFIODevice *vbasedev, 
VFIORegion *region,
 return 0;
 }
 
+static void vfio_subregion_unmap(VFIORegion *region, int index)
+{
+trace_vfio_region_unmap(memory_region_name(&region->mmaps[index].mem),
+region->mmaps[index].offset,
+region->mmaps[index].offset +
+region->mmaps[index].size - 1);
+memory_region_del_subregion(region->mem, &region->mmaps[index].mem);
+munmap(region->mmaps[index].mmap, region->mmaps[index].size);
+object_unparent(OBJECT(&region->mmaps[index].mem));
+region->mmaps[index].mmap = NULL;
+}
+
 int vfio_region_mmap(VFIORegion *region)
 {
 int i, prot = 0;
@@ -955,10 +967,7 @@ int vfio_region_mmap(VFIORegion *region)
 region->mmaps[i].mmap = NULL;
 
 for (i--; i >= 0; i--) {
-memory_region_del_subregion(region->mem, &region->mmaps[i].mem);
-munmap(region->mmaps[i].mmap, region->mmaps[i].size);
-object_unparent(OBJECT(&region->mmaps[i].mem));
-region->mmaps[i].mmap = NULL;
+vfio_subregion_unmap(region, i);
 }
 
 return ret;
@@ -983,6 +992,19 @@ int vfio_region_mmap(VFIORegion *region)
 return 0;
 }
 
+void vfio_region_unmap(VFIORegion *region)
+{
+int i;
+
+if (!region->mem) {
+return;
+}
+
+for (i = 0; i < region->nr_mmaps; i++) {
+vfio_subregion_unmap(region, i);
+}
+}
+
 void vfio_region_exit(VFIORegion *region)
 {
 int i;
diff --git a/hw/vfio/trace-events b/hw/vfio/trace-events
index b1ef55a33ffd..8cdc27946cb8 100644
--- a/hw/vfio/trace-events
+++ b/hw/vfio/trace-events
@@ -111,6 +111,7 @@ vfio_region_mmap(const char *name, unsigned long offset, 
unsigned long end) "Reg
 vfio_region_exit(const char *name, int index) "Device %s, region %d"
 vfio_region_finalize(const char *name, int index) "Device %s, region %d"
 vfio_region_mmaps_set_enabled(const char *name, bool enabled) "Region %s mmaps enabled: %d"
+vfio_region_unmap(const char *name, unsigned long offset, unsigned long end) "Region %s unmap [0x%lx - 0x%lx]"
 vfio_region_sparse_mmap_header(const char *name, int index, int nr_areas) "Device %s region %d: %d sparse mmap entries"
 vfio_region_sparse_mmap_entry(int i, unsigned long start, unsigned long end) "sparse entry %d [0x%lx - 0x%lx]"
 vfio_get_dev_region(const char *name, int index, uint32_t type, uint32_t subtype) "%s index %d, %08x/%0x8"
diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h
index fd564209ac71..8d7a0fbb1046 100644
--- a/include/hw/vfio/vfio-common.h
+++ b/include/hw/vfio/vfio-common.h
@@ -171,6 +171,7 @@ int vfio_region_setup(Object *obj, VFIODevice *vbasedev, 
VFIORegion *region,
   int index, const char *name);
 int vfio_region_mmap(VFIORegion *region);
 void vfio_region_mmaps_set_enabled(VFIORegion *region, bool enabled);
+void vfio_region_unmap(VFIORegion *region);
 void vfio_region_exit(VFIORegion *region);
 void vfio_region_finalize(VFIORegion *region);
 void vfio_reset_handler(void *opaque);
-- 
2.7.0




[PATCH QEMU v22 08/18] vfio: Register SaveVMHandlers for VFIO device

2020-05-18 Thread Kirti Wankhede
Define flags to be used as delimiters in the migration file stream; the
resulting stream layout is sketched below.
Added .save_setup and .save_cleanup functions. The migration region is mapped
and unmapped from these functions at the source during the saving/pre-copy
phase.
The VFIO device state is set depending on the VM's state: during live
migration the VM is running when .save_setup is called, so the _SAVING |
_RUNNING state is set for the VFIO device; during save-restore the VM is
paused, so only the _SAVING state is set.
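
To make the framing easier to follow, the layout of the VFIO portion of the
migration stream produced by these handlers together with the later save
patches is sketched below; every delimiter and data_size is a big-endian
64-bit value written with qemu_put_be64(). This is an approximate reading
aid, not code from the patch.

/*
 * .save_setup:
 *     VFIO_MIG_FLAG_DEV_SETUP_STATE
 *     VFIO_MIG_FLAG_END_OF_STATE
 *
 * .save_live_iterate (pre-copy, one call per iteration):
 *     VFIO_MIG_FLAG_DEV_DATA_STATE
 *     data_size
 *     <data_size opaque bytes from the migration region>
 *     VFIO_MIG_FLAG_END_OF_STATE
 *
 * .save_live_complete_precopy (stop-and-copy):
 *     VFIO_MIG_FLAG_DEV_CONFIG_STATE
 *     <device config data>
 *     VFIO_MIG_FLAG_END_OF_STATE
 *     {VFIO_MIG_FLAG_DEV_DATA_STATE, data_size, <data>}  (repeated while
 *                                                         pending_bytes > 0)
 *     VFIO_MIG_FLAG_END_OF_STATE
 */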

Signed-off-by: Kirti Wankhede 
Reviewed-by: Neo Jia 
---
 hw/vfio/migration.c  | 73 
 hw/vfio/trace-events |  2 ++
 2 files changed, 75 insertions(+)

diff --git a/hw/vfio/migration.c b/hw/vfio/migration.c
index c2f5564b51c3..773c8d16b1c1 100644
--- a/hw/vfio/migration.c
+++ b/hw/vfio/migration.c
@@ -8,12 +8,14 @@
  */
 
 #include "qemu/osdep.h"
+#include "qemu/main-loop.h"
 #include 
 
 #include "sysemu/runstate.h"
 #include "hw/vfio/vfio-common.h"
 #include "cpu.h"
 #include "migration/migration.h"
+#include "migration/vmstate.h"
 #include "migration/qemu-file.h"
 #include "migration/register.h"
 #include "migration/blocker.h"
@@ -24,6 +26,17 @@
 #include "pci.h"
 #include "trace.h"
 
+/*
+ * Flags used as delimiter:
+ * 0x => MSB 32-bit all 1s
+ * 0xef10 => emulated (virtual) function IO
+ * 0x => 16-bits reserved for flags
+ */
+#define VFIO_MIG_FLAG_END_OF_STATE  (0xef11ULL)
+#define VFIO_MIG_FLAG_DEV_CONFIG_STATE  (0xef12ULL)
+#define VFIO_MIG_FLAG_DEV_SETUP_STATE   (0xef13ULL)
+#define VFIO_MIG_FLAG_DEV_DATA_STATE(0xef14ULL)
+
 static void vfio_migration_region_exit(VFIODevice *vbasedev)
 {
 VFIOMigration *migration = vbasedev->migration;
@@ -126,6 +139,64 @@ static int vfio_migration_set_state(VFIODevice *vbasedev, 
uint32_t mask,
 return 0;
 }
 
+/* -- */
+
+static int vfio_save_setup(QEMUFile *f, void *opaque)
+{
+VFIODevice *vbasedev = opaque;
+VFIOMigration *migration = vbasedev->migration;
+int ret;
+
+trace_vfio_save_setup(vbasedev->name);
+
+qemu_put_be64(f, VFIO_MIG_FLAG_DEV_SETUP_STATE);
+
+if (migration->region.mmaps) {
+qemu_mutex_lock_iothread();
+ret = vfio_region_mmap(&migration->region);
+qemu_mutex_unlock_iothread();
+if (ret) {
+error_report("%s: Failed to mmap VFIO migration region %d: %s",
+ vbasedev->name, migration->region.index,
+ strerror(-ret));
+return ret;
+}
+}
+
+ret = vfio_migration_set_state(vbasedev, ~0, VFIO_DEVICE_STATE_SAVING);
+if (ret) {
+error_report("%s: Failed to set state SAVING", vbasedev->name);
+return ret;
+}
+
+qemu_put_be64(f, VFIO_MIG_FLAG_END_OF_STATE);
+
+ret = qemu_file_get_error(f);
+if (ret) {
+return ret;
+}
+
+return 0;
+}
+
+static void vfio_save_cleanup(void *opaque)
+{
+VFIODevice *vbasedev = opaque;
+VFIOMigration *migration = vbasedev->migration;
+
+if (migration->region.mmaps) {
+vfio_region_unmap(&migration->region);
+}
+trace_vfio_save_cleanup(vbasedev->name);
+}
+
+static SaveVMHandlers savevm_vfio_handlers = {
+.save_setup = vfio_save_setup,
+.save_cleanup = vfio_save_cleanup,
+};
+
+/* -- */
+
 static void vfio_vmstate_change(void *opaque, int running, RunState state)
 {
 VFIODevice *vbasedev = opaque;
@@ -192,6 +263,8 @@ static int vfio_migration_init(VFIODevice *vbasedev,
 return ret;
 }
 
+register_savevm_live("vfio", VMSTATE_INSTANCE_ID_ANY, 1,
+ &savevm_vfio_handlers, vbasedev);
 vbasedev->vm_state = qemu_add_vm_change_state_handler(vfio_vmstate_change,
   vbasedev);
 
diff --git a/hw/vfio/trace-events b/hw/vfio/trace-events
index bd3d47b005cb..86c18def016e 100644
--- a/hw/vfio/trace-events
+++ b/hw/vfio/trace-events
@@ -149,3 +149,5 @@ vfio_migration_probe(const char *name, uint32_t index) " (%s) Region %d"
 vfio_migration_set_state(const char *name, uint32_t state) " (%s) state %d"
 vfio_vmstate_change(const char *name, int running, const char *reason, uint32_t dev_state) " (%s) running %d reason %s device state %d"
 vfio_migration_state_notifier(const char *name, const char *state) " (%s) state %s"
+vfio_save_setup(const char *name) " (%s)"
+vfio_save_cleanup(const char *name) " (%s)"
-- 
2.7.0




[PATCH QEMU v22 06/18] vfio: Add VM state change handler to know state of VM

2020-05-18 Thread Kirti Wankhede
The VM state change handler gets called on a change in the VM's state. It is
used to set the VFIO device state to _RUNNING.

Signed-off-by: Kirti Wankhede 
Reviewed-by: Neo Jia 
---
 hw/vfio/migration.c   | 87 +++
 hw/vfio/trace-events  |  2 +
 include/hw/vfio/vfio-common.h |  4 ++
 3 files changed, 93 insertions(+)

diff --git a/hw/vfio/migration.c b/hw/vfio/migration.c
index bf9384907ec0..e79b34003079 100644
--- a/hw/vfio/migration.c
+++ b/hw/vfio/migration.c
@@ -10,6 +10,7 @@
 #include "qemu/osdep.h"
 #include 
 
+#include "sysemu/runstate.h"
 #include "hw/vfio/vfio-common.h"
 #include "cpu.h"
 #include "migration/migration.h"
@@ -74,6 +75,85 @@ err:
 return ret;
 }
 
+static int vfio_migration_set_state(VFIODevice *vbasedev, uint32_t mask,
+uint32_t value)
+{
+VFIOMigration *migration = vbasedev->migration;
+VFIORegion *region = &migration->region;
+uint32_t device_state;
+int ret;
+
+ret = pread(vbasedev->fd, &device_state, sizeof(device_state),
+region->fd_offset + offsetof(struct vfio_device_migration_info,
+  device_state));
+if (ret < 0) {
+error_report("%s: Failed to read device state %d %s",
+ vbasedev->name, ret, strerror(errno));
+return ret;
+}
+
+device_state = (device_state & mask) | value;
+
+if (!VFIO_DEVICE_STATE_VALID(device_state)) {
+return -EINVAL;
+}
+
+ret = pwrite(vbasedev->fd, &device_state, sizeof(device_state),
+ region->fd_offset + offsetof(struct vfio_device_migration_info,
+  device_state));
+if (ret < 0) {
+error_report("%s: Failed to set device state %d %s",
+ vbasedev->name, ret, strerror(errno));
+
+ret = pread(vbasedev->fd, &device_state, sizeof(device_state),
+region->fd_offset + offsetof(struct vfio_device_migration_info,
+device_state));
+if (ret < 0) {
+error_report("%s: On failure, failed to read device state %d %s",
+vbasedev->name, ret, strerror(errno));
+return ret;
+}
+
+if (VFIO_DEVICE_STATE_IS_ERROR(device_state)) {
+error_report("%s: Device is in error state 0x%x",
+ vbasedev->name, device_state);
+return -EFAULT;
+}
+}
+
+vbasedev->device_state = device_state;
+trace_vfio_migration_set_state(vbasedev->name, device_state);
+return 0;
+}
+
+static void vfio_vmstate_change(void *opaque, int running, RunState state)
+{
+VFIODevice *vbasedev = opaque;
+
+if ((vbasedev->vm_running != running)) {
+int ret;
+uint32_t value = 0, mask = 0;
+
+if (running) {
+value = VFIO_DEVICE_STATE_RUNNING;
+if (vbasedev->device_state & VFIO_DEVICE_STATE_RESUMING) {
+mask = ~VFIO_DEVICE_STATE_RESUMING;
+}
+} else {
+mask = ~VFIO_DEVICE_STATE_RUNNING;
+}
+
+ret = vfio_migration_set_state(vbasedev, mask, value);
+if (ret) {
+error_report("%s: Failed to set device state 0x%x",
+ vbasedev->name, value & mask);
+}
+vbasedev->vm_running = running;
+trace_vfio_vmstate_change(vbasedev->name, running, RunState_str(state),
+  value & mask);
+}
+}
+
 static int vfio_migration_init(VFIODevice *vbasedev,
struct vfio_region_info *info)
 {
@@ -90,6 +170,9 @@ static int vfio_migration_init(VFIODevice *vbasedev,
 return ret;
 }
 
+vbasedev->vm_state = qemu_add_vm_change_state_handler(vfio_vmstate_change,
+  vbasedev);
+
 return 0;
 }
 
@@ -128,6 +211,10 @@ add_blocker:
 
 void vfio_migration_finalize(VFIODevice *vbasedev)
 {
+if (vbasedev->vm_state) {
+qemu_del_vm_change_state_handler(vbasedev->vm_state);
+}
+
 if (vbasedev->migration_blocker) {
 migrate_del_blocker(vbasedev->migration_blocker);
 error_free(vbasedev->migration_blocker);
diff --git a/hw/vfio/trace-events b/hw/vfio/trace-events
index fd034ac53684..14b0a86c0035 100644
--- a/hw/vfio/trace-events
+++ b/hw/vfio/trace-events
@@ -146,3 +146,5 @@ vfio_display_edid_write_error(void) ""
 
 # migration.c
 vfio_migration_probe(const char *name, uint32_t index) " (%s) Region %d"
+vfio_migration_set_state(char *name, uint32_t state) " (%s) state %d"
+vfio_vmstate_change(char *name, int running, const char *reason, uint32_t dev_state) " (%s) running %d reason %s device state %d"
diff

[PATCH QEMU v22 03/18] vfio: Add vfio_get_object callback to VFIODeviceOps

2020-05-18 Thread Kirti Wankhede
Hook vfio_get_object callback for PCI devices.

Signed-off-by: Kirti Wankhede 
Reviewed-by: Neo Jia 
Suggested-by: Cornelia Huck 
Reviewed-by: Cornelia Huck 
---
 hw/vfio/pci.c | 8 
 include/hw/vfio/vfio-common.h | 1 +
 2 files changed, 9 insertions(+)

diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
index 5e75a95129ac..6c77c12e44b9 100644
--- a/hw/vfio/pci.c
+++ b/hw/vfio/pci.c
@@ -2407,10 +2407,18 @@ static void vfio_pci_compute_needs_reset(VFIODevice 
*vbasedev)
 }
 }
 
+static Object *vfio_pci_get_object(VFIODevice *vbasedev)
+{
+VFIOPCIDevice *vdev = container_of(vbasedev, VFIOPCIDevice, vbasedev);
+
+return OBJECT(vdev);
+}
+
 static VFIODeviceOps vfio_pci_ops = {
 .vfio_compute_needs_reset = vfio_pci_compute_needs_reset,
 .vfio_hot_reset_multi = vfio_pci_hot_reset_multi,
 .vfio_eoi = vfio_intx_eoi,
+.vfio_get_object = vfio_pci_get_object,
 };
 
 int vfio_populate_vga(VFIOPCIDevice *vdev, Error **errp)
diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h
index 8d7a0fbb1046..74261feaeac9 100644
--- a/include/hw/vfio/vfio-common.h
+++ b/include/hw/vfio/vfio-common.h
@@ -119,6 +119,7 @@ struct VFIODeviceOps {
 void (*vfio_compute_needs_reset)(VFIODevice *vdev);
 int (*vfio_hot_reset_multi)(VFIODevice *vdev);
 void (*vfio_eoi)(VFIODevice *vdev);
+Object *(*vfio_get_object)(VFIODevice *vdev);
 };
 
 typedef struct VFIOGroup {
-- 
2.7.0




[PATCH QEMU v22 00/18] Add migration support for VFIO devices

2020-05-18 Thread Kirti Wankhede
opy is not supported.

v18 -> v22
- A few fixes from the v18 review; not all concerns are addressed yet. I'll
  address the remaining ones in subsequent iterations.
- Sending this version to test v22 kernel version patches:
https://lore.kernel.org/kvm/1589781397-28368-1-git-send-email-kwankh...@nvidia.com/

v16 -> v18
- Nit fixes
- Get migration capability flags from container
- Added VFIO stats to MigrationInfo
- Fixed bug reported by Yan
https://lists.gnu.org/archive/html/qemu-devel/2020-04/msg4.html

v9 -> v16
- KABI almost finalised on kernel patches.
- Added support for migration with vIOMMU enabled.

v8 -> v9:
- Split patch set in 2 sets, Kernel and QEMU sets.
- Dirty pages bitmap is queried from IOMMU container rather than from
  vendor driver for per device. Added 2 ioctls to achieve this.

v7 -> v8:
- Updated comments for KABI
- Added BAR address validation check during PCI device's config space load as
  suggested by Dr. David Alan Gilbert.
- Changed vfio_migration_set_state() to set or clear device state flags.
- Some nit fixes.

v6 -> v7:
- Fix build failures.

v5 -> v6:
- Fix build failure.

v4 -> v5:
- Added a descriptive comment about the sequence of access of members of the
  structure vfio_device_migration_info to be followed, based on Alex's
  suggestion.
- Updated the get dirty pages sequence.
- As per Cornelia Huck's suggestion, added callbacks to VFIODeviceOps for
  get_object, save_config and load_config.
- Fixed multiple nit picks.
- Tested live migration with multiple vfio devices assigned to a VM.

v3 -> v4:
- Added one more bit so that the _RESUMING flag is set explicitly.
- The data_offset field is read-only for the user space application.
- data_size is read on every iteration before reading data from the migration
  region; this removes the assumption that the data extends to the end of the
  migration region.
- If the vendor driver supports mappable sparse regions, map those regions
  during the setup state of save/load, and similarly unmap them from the
  cleanup routines.
- Handled a race condition that caused data corruption in the migration
  region while saving device state, by adding a mutex and serializing the
  save_buffer and get_dirty_pages routines.
- Skip calling the get_dirty_pages routine for the device's mapped MMIO
  region.
- Added trace events.
- Split into multiple functional patches.

v2 -> v3:
- Removed enum of VFIO device states. Defined VFIO device state with 2 bits.
- Re-structured vfio_device_migration_info to keep it minimal and defined action
  on read and write access on its members.

v1 -> v2:
- Defined MIGRATION region type and sub-type which should be used with region
  type capability.
- Re-structured vfio_device_migration_info. This structure will be placed at 0th
  offset of migration region.
- Replaced ioctl with read/write for trapped part of migration region.
- Added both type of access support, trapped or mmapped, for data section of the
  region.
- Moved PCI device functions to pci file.
- Added iteration to get dirty page bitmap until bitmap for all requested pages
  are copied.

Thanks,
Kirti



Kirti Wankhede (18):
  vfio: KABI for migration interface - Kernel header placeholder
  vfio: Add function to unmap VFIO region
  vfio: Add vfio_get_object callback to VFIODeviceOps
  vfio: Add save and load functions for VFIO PCI devices
  vfio: Add migration region initialization and finalize function
  vfio: Add VM state change handler to know state of VM
  vfio: Add migration state change notifier
  vfio: Register SaveVMHandlers for VFIO device
  vfio: Add save state functions to SaveVMHandlers
  vfio: Add load state functions to SaveVMHandlers
  iommu: add callback to get address limit IOMMU supports
  memory: Set DIRTY_MEMORY_MIGRATION when IOMMU is enabled
  vfio: Get migration capability flags for container
  vfio: Add function to start and stop dirty pages tracking
  vfio: Add vfio_listener_log_sync to mark dirty pages
  vfio: Add ioctl to get dirty pages bitmap during dma unmap.
  vfio: Make vfio-pci device migration capable
  qapi: Add VFIO devices migration stats in Migration stats

 hw/i386/intel_iommu.c |   9 +
 hw/vfio/Makefile.objs |   2 +-
 hw/vfio/common.c  | 411 +++--
 hw/vfio/migration.c   | 804 ++
 hw/vfio/pci.c | 203 +--
 hw/vfio/pci.h |   1 -
 hw/vfio/trace-events  |  19 +
 include/exec/memory.h |  18 +
 include/hw/vfio/vfio-common.h |  22 ++
 include/qemu/vfio-helpers.h   |   3 +
 linux-headers/linux/vfio.h| 318 -
 memory.c  |  13 +-
 migration/migration.c |  12 +
 monitor/hmp-cmds.c|   6 +
 qapi/migration.json   |  19 +-
 15 files changed, 1812 insertions(+), 48 deletions(-)
 create mode 100644 hw/vfio/migration.c

-- 
2.7.0




[PATCH QEMU v22 01/18] vfio: KABI for migration interface - Kernel header placeholder

2020-05-18 Thread Kirti Wankhede
Kernel header patches are being reviewed along with the kernel-side changes.
This patch is only a placeholder.

This patch includes all changes to vfio.h from the above patch set.

Signed-off-by: Kirti Wankhede 
Reviewed-by: Neo Jia 
---
 linux-headers/linux/vfio.h | 318 -
 1 file changed, 316 insertions(+), 2 deletions(-)

diff --git a/linux-headers/linux/vfio.h b/linux-headers/linux/vfio.h
index a41c45286511..7076d48e1ec0 100644
--- a/linux-headers/linux/vfio.h
+++ b/linux-headers/linux/vfio.h
@@ -305,6 +305,7 @@ struct vfio_region_info_cap_type {
 #define VFIO_REGION_TYPE_PCI_VENDOR_MASK   (0x)
 #define VFIO_REGION_TYPE_GFX(1)
 #define VFIO_REGION_TYPE_CCW   (2)
+#define VFIO_REGION_TYPE_MIGRATION  (3)
 
 /* sub-types for VFIO_REGION_TYPE_PCI_* */
 
@@ -379,6 +380,232 @@ struct vfio_region_gfx_edid {
 /* sub-types for VFIO_REGION_TYPE_CCW */
 #define VFIO_REGION_SUBTYPE_CCW_ASYNC_CMD  (1)
 
+/* sub-types for VFIO_REGION_TYPE_MIGRATION */
+#define VFIO_REGION_SUBTYPE_MIGRATION   (1)
+
+/*
+ * The structure vfio_device_migration_info is placed at the 0th offset of
+ * the VFIO_REGION_SUBTYPE_MIGRATION region to get and set VFIO device related
+ * migration information. Field accesses from this structure are only supported
+ * at their native width and alignment. Otherwise, the result is undefined and
+ * vendor drivers should return an error.
+ *
+ * device_state: (read/write)
+ *  - The user application writes to this field to inform the vendor driver
+ *about the device state to be transitioned to.
+ *  - The vendor driver should take the necessary actions to change the
+ *device state. After successful transition to a given state, the
+ *vendor driver should return success on write(device_state, state)
+ *system call. If the device state transition fails, the vendor driver
+ *should return an appropriate -errno for the fault condition.
+ *  - On the user application side, if the device state transition fails,
+ *   that is, if write(device_state, state) returns an error, read
+ *   device_state again to determine the current state of the device from
+ *   the vendor driver.
+ *  - The vendor driver should return previous state of the device unless
+ *the vendor driver has encountered an internal error, in which case
+ *the vendor driver may report the device_state 
VFIO_DEVICE_STATE_ERROR.
+ *  - The user application must use the device reset ioctl to recover the
+ *device from VFIO_DEVICE_STATE_ERROR state. If the device is
+ *indicated to be in a valid device state by reading device_state, the
+ *user application may attempt to transition the device to any valid
+ *state reachable from the current state or terminate itself.
+ *
+ *  device_state consists of 3 bits:
+ *  - If bit 0 is set, it indicates the _RUNNING state. If bit 0 is clear,
+ *it indicates the _STOP state. When the device state is changed to
+ *_STOP, driver should stop the device before write() returns.
+ *  - If bit 1 is set, it indicates the _SAVING state, which means that the
+ *driver should start gathering device state information that will be
+ *provided to the VFIO user application to save the device's state.
+ *  - If bit 2 is set, it indicates the _RESUMING state, which means that
+ *the driver should prepare to resume the device. Data provided through
+ *the migration region should be used to resume the device.
+ *  Bits 3 - 31 are reserved for future use. To preserve them, the user
+ *  application should perform a read-modify-write operation on this
+ *  field when modifying the specified bits.
+ *
+ *  +--- _RESUMING
+ *  |+-- _SAVING
+ *  ||+- _RUNNING
+ *  |||
+ *  000b => Device Stopped, not saving or resuming
+ *  001b => Device running, which is the default state
+ *  010b => Stop the device & save the device state, stop-and-copy state
+ *  011b => Device running and save the device state, pre-copy state
+ *  100b => Device stopped and the device state is resuming
+ *  101b => Invalid state
+ *  110b => Error state
+ *  111b => Invalid state
+ *
+ * State transitions:
+ *
+ *  _RESUMING  _RUNNINGPre-copyStop-and-copy   _STOP
+ *(100b) (001b) (011b)(010b)   (000b)
+ * 0. Running or default state
+ * |
+ *
+ * 1. Normal Shutdown (optional)
+ * |->|
+ *
+ * 2. Save the state or suspend
+ * |->|-->|
+ *
+ * 3. Save the state during live migration
+ * |--->|>|-->|
+ *
+ * 4. Resuming
+ *   

[PATCH Kernel v22 7/8] vfio iommu: Add migration capability to report supported features

2020-05-18 Thread Kirti Wankhede
Added a migration capability to the IOMMU info chain.
The user application should check the IOMMU info chain for the migration
capability before using the dirty page tracking feature provided by the
kernel module (a lookup sketch follows below).
The user application must check the supported page sizes and the maximum
dirty bitmap size returned by this capability structure for the ioctls used
to get the dirty bitmap.
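
For illustration, a hedged userspace sketch (not part of this patch) of
walking the VFIO_IOMMU_GET_INFO capability chain to find this capability. It
assumes a kernel that already exposes the info capability chain
(VFIO_IOMMU_INFO_CAPS and cap_offset in struct vfio_iommu_type1_info); error
handling is abbreviated.

#include <stdlib.h>
#include <string.h>
#include <sys/ioctl.h>
#include <linux/vfio.h>

static int query_migration_cap(int container_fd,
                               struct vfio_iommu_type1_info_cap_migration *out)
{
    struct vfio_iommu_type1_info probe = { .argsz = sizeof(probe) };
    struct vfio_iommu_type1_info *info;
    __u32 off;

    /* The first call reports the full size needed for the capability chain. */
    if (ioctl(container_fd, VFIO_IOMMU_GET_INFO, &probe))
        return -1;

    info = calloc(1, probe.argsz);
    if (!info)
        return -1;
    info->argsz = probe.argsz;

    if (ioctl(container_fd, VFIO_IOMMU_GET_INFO, info) ||
        !(info->flags & VFIO_IOMMU_INFO_CAPS) || !info->cap_offset)
        goto out_free;

    /* Walk the chain; 'next' is an offset from the start of the info buffer. */
    for (off = info->cap_offset; off; ) {
        struct vfio_info_cap_header *hdr =
            (struct vfio_info_cap_header *)((char *)info + off);

        if (hdr->id == VFIO_IOMMU_TYPE1_INFO_CAP_MIGRATION) {
            memcpy(out, hdr, sizeof(*out));
            free(info);
            return 0;
        }
        off = hdr->next;
    }

out_free:
    free(info);
    return -1;
}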

Signed-off-by: Kirti Wankhede 
---
 drivers/vfio/vfio_iommu_type1.c | 23 ++-
 include/uapi/linux/vfio.h   | 22 ++
 2 files changed, 44 insertions(+), 1 deletion(-)

diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
index b9ee78a615a4..5c3dc5863893 100644
--- a/drivers/vfio/vfio_iommu_type1.c
+++ b/drivers/vfio/vfio_iommu_type1.c
@@ -2397,6 +2397,22 @@ static int vfio_iommu_iova_build_caps(struct vfio_iommu 
*iommu,
return ret;
 }
 
+static int vfio_iommu_migration_build_caps(struct vfio_iommu *iommu,
+  struct vfio_info_cap *caps)
+{
+   struct vfio_iommu_type1_info_cap_migration cap_mig;
+
+   cap_mig.header.id = VFIO_IOMMU_TYPE1_INFO_CAP_MIGRATION;
+   cap_mig.header.version = 1;
+
+   cap_mig.flags = 0;
+   /* support minimum pgsize */
+   cap_mig.pgsize_bitmap = (size_t)1 << __ffs(iommu->pgsize_bitmap);
+   cap_mig.max_dirty_bitmap_size = DIRTY_BITMAP_SIZE_MAX;
+
+   return vfio_info_add_capability(caps, &cap_mig.header, sizeof(cap_mig));
+}
+
 static long vfio_iommu_type1_ioctl(void *iommu_data,
   unsigned int cmd, unsigned long arg)
 {
@@ -2443,8 +2459,13 @@ static long vfio_iommu_type1_ioctl(void *iommu_data,
 
info.iova_pgsizes = iommu->pgsize_bitmap;
 
-   ret = vfio_iommu_iova_build_caps(iommu, &caps);
+   ret = vfio_iommu_migration_build_caps(iommu, &caps);
+
+   if (!ret)
+   ret = vfio_iommu_iova_build_caps(iommu, &caps);
+
	mutex_unlock(&iommu->lock);
+
if (ret)
return ret;
 
diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h
index a1dd2150971e..aa8aa9dcf02a 100644
--- a/include/uapi/linux/vfio.h
+++ b/include/uapi/linux/vfio.h
@@ -1013,6 +1013,28 @@ struct vfio_iommu_type1_info_cap_iova_range {
struct  vfio_iova_range iova_ranges[];
 };
 
+/*
+ * The migration capability allows to report supported features for migration.
+ *
+ * The structures below define version 1 of this capability.
+ *
+ * The existence of this capability indicates IOMMU kernel driver supports
+ * dirty page tracking.
+ *
+ * pgsize_bitmap: Kernel driver returns supported page sizes bitmap for dirty
+ * page tracking.
+ * max_dirty_bitmap_size: Kernel driver returns maximum supported dirty bitmap
+ * size in bytes to be used by user application for ioctls to get dirty bitmap.
+ */
+#define VFIO_IOMMU_TYPE1_INFO_CAP_MIGRATION  1
+
+struct vfio_iommu_type1_info_cap_migration {
+   struct  vfio_info_cap_header header;
+   __u32   flags;
+   __u64   pgsize_bitmap;
+   __u64   max_dirty_bitmap_size;  /* in bytes */
+};
+
 #define VFIO_IOMMU_GET_INFO _IO(VFIO_TYPE, VFIO_BASE + 12)
 
 /**
-- 
2.7.0




[PATCH Kernel v22 8/8] vfio: Selective dirty page tracking if IOMMU backed device pins pages

2020-05-18 Thread Kirti Wankhede
Added a check such that only singleton IOMMU groups can pin pages.
From the point when the vendor driver pins any pages, consider the IOMMU
group's dirty page scope to be limited to pinned pages.

To avoid walking the list often, added a flag, pinned_page_dirty_scope, to
indicate whether the dirty page scope of all vfio_groups of every vfio_domain
in the domain_list is limited to pinned pages (see the sketch below). This
flag is updated on the first pinned-pages request for that IOMMU group and on
attaching/detaching a group.
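
An illustrative sketch (not the exact code in this patch) of how the
per-iommu flag described above can be recomputed; it assumes the existing
domain_list/group_list layout of vfio_iommu_type1.c and ignores the mdev-only
external domain for brevity.

static void update_pinned_page_dirty_scope(struct vfio_iommu *iommu)
{
	struct vfio_domain *domain;
	struct vfio_group *group;

	list_for_each_entry(domain, &iommu->domain_list, next) {
		list_for_each_entry(group, &domain->group_list, next) {
			if (!group->pinned_page_dirty_scope) {
				/* at least one group has full dirty scope */
				iommu->pinned_page_dirty_scope = false;
				return;
			}
		}
	}

	/* every group limits its dirty scope to pinned pages */
	iommu->pinned_page_dirty_scope = true;
}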

Signed-off-by: Kirti Wankhede 
Reviewed-by: Neo Jia 
---
 drivers/vfio/vfio.c |  13 +++--
 drivers/vfio/vfio_iommu_type1.c | 103 +---
 include/linux/vfio.h|   4 +-
 3 files changed, 109 insertions(+), 11 deletions(-)

diff --git a/drivers/vfio/vfio.c b/drivers/vfio/vfio.c
index 765e0e5d83ed..580099afeaff 100644
--- a/drivers/vfio/vfio.c
+++ b/drivers/vfio/vfio.c
@@ -85,6 +85,7 @@ struct vfio_group {
atomic_topened;
wait_queue_head_t   container_q;
boolnoiommu;
+   unsigned intdev_counter;
struct kvm  *kvm;
struct blocking_notifier_head   notifier;
 };
@@ -555,6 +556,7 @@ struct vfio_device *vfio_group_create_device(struct 
vfio_group *group,
 
	mutex_lock(&group->device_lock);
	list_add(&device->group_next, &group->device_list);
+   group->dev_counter++;
mutex_unlock(>device_lock);
 
return device;
@@ -567,6 +569,7 @@ static void vfio_device_release(struct kref *kref)
struct vfio_group *group = device->group;
 
	list_del(&device->group_next);
	group->dev_counter--;
	mutex_unlock(&group->device_lock);
 
dev_set_drvdata(device->dev, NULL);
@@ -1945,6 +1948,9 @@ int vfio_pin_pages(struct device *dev, unsigned long 
*user_pfn, int npage,
if (!group)
return -ENODEV;
 
+   if (group->dev_counter > 1)
+   return -EINVAL;
+
ret = vfio_group_add_container_user(group);
if (ret)
goto err_pin_pages;
@@ -1952,7 +1958,8 @@ int vfio_pin_pages(struct device *dev, unsigned long 
*user_pfn, int npage,
container = group->container;
driver = container->iommu_driver;
if (likely(driver && driver->ops->pin_pages))
-   ret = driver->ops->pin_pages(container->iommu_data, user_pfn,
+   ret = driver->ops->pin_pages(container->iommu_data,
+group->iommu_group, user_pfn,
 npage, prot, phys_pfn);
else
ret = -ENOTTY;
@@ -2050,8 +2057,8 @@ int vfio_group_pin_pages(struct vfio_group *group,
driver = container->iommu_driver;
if (likely(driver && driver->ops->pin_pages))
ret = driver->ops->pin_pages(container->iommu_data,
-user_iova_pfn, npage,
-prot, phys_pfn);
+group->iommu_group, user_iova_pfn,
+npage, prot, phys_pfn);
else
ret = -ENOTTY;
 
diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
index 5c3dc5863893..a6b9e13ef57c 100644
--- a/drivers/vfio/vfio_iommu_type1.c
+++ b/drivers/vfio/vfio_iommu_type1.c
@@ -73,6 +73,7 @@ struct vfio_iommu {
boolv2;
boolnesting;
booldirty_page_tracking;
+   boolpinned_page_dirty_scope;
 };
 
 struct vfio_domain {
@@ -100,6 +101,7 @@ struct vfio_group {
struct iommu_group  *iommu_group;
struct list_headnext;
boolmdev_group; /* An mdev group */
+   boolpinned_page_dirty_scope;
 };
 
 struct vfio_iova {
@@ -143,6 +145,10 @@ struct vfio_regions {
 
 static int put_pfn(unsigned long pfn, int prot);
 
+static struct vfio_group *vfio_iommu_find_iommu_group(struct vfio_iommu *iommu,
+  struct iommu_group *iommu_group);
+
+static void update_pinned_page_dirty_scope(struct vfio_iommu *iommu);
 /*
  * This code handles mapping and unmapping of user data buffers
  * into DMA'ble space using the IOMMU
@@ -590,11 +596,13 @@ static int vfio_unpin_page_external(struct vfio_dma *dma, 
dma_addr_t iova,
 }
 
 static int vfio_iommu_type1_pin_pages(void *iommu_data,
+ struct iommu_group *iommu_group,
  unsigned long *user_pfn,
  int npage, int prot,
  unsigned long *phys_pfn)
 {
struct vfio_iommu *iommu = iommu_data;
+   stru

[PATCH Kernel v22 4/8] vfio iommu: Add ioctl definition for dirty pages tracking

2020-05-18 Thread Kirti Wankhede
The IOMMU container maintains a list of all pages pinned through the
vfio_pin_pages API. All pages pinned by the vendor driver through this API
should be considered dirty during migration. When the container contains an
IOMMU-capable device and all pages are pinned and mapped, then all pages are
marked dirty.
Added support to start/stop tracking of dirtied pages and to get a bitmap of
all dirtied pages for a requested IO virtual address range; a usage sketch
follows below.
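
A hedged userspace sketch (not part of this patch) of the
VFIO_IOMMU_DIRTY_PAGES_FLAG_GET_BITMAP usage described above. iova, size and
pgsize are assumed to satisfy the constraints documented in the header
comment below (pgsize being the smallest supported page size); error handling
is abbreviated.

#include <stdint.h>
#include <stdlib.h>
#include <sys/ioctl.h>
#include <linux/vfio.h>

static uint64_t *get_dirty_bitmap(int container_fd, uint64_t iova,
                                  uint64_t size, uint64_t pgsize)
{
    uint64_t npages = size / pgsize;
    uint64_t bitmap_bytes = ((npages + 63) / 64) * 8;   /* one bit per page */
    struct vfio_iommu_type1_dirty_bitmap *db;
    struct vfio_iommu_type1_dirty_bitmap_get *get;
    uint64_t *bitmap = calloc(1, bitmap_bytes);         /* must be zeroed */

    db = calloc(1, sizeof(*db) + sizeof(*get));
    if (!db || !bitmap) {
        goto fail;
    }

    db->argsz = sizeof(*db) + sizeof(*get);
    db->flags = VFIO_IOMMU_DIRTY_PAGES_FLAG_GET_BITMAP;

    /* The _get structure is carried in the data[] portion of the argument. */
    get = (struct vfio_iommu_type1_dirty_bitmap_get *)db->data;
    get->iova = iova;
    get->size = size;
    get->bitmap.pgsize = pgsize;
    get->bitmap.size = bitmap_bytes;
    get->bitmap.data = (__u64 *)bitmap;

    if (ioctl(container_fd, VFIO_IOMMU_DIRTY_PAGES, db)) {
        goto fail;
    }

    free(db);
    /* caller frees; bit N set means the page at iova + N * pgsize is dirty */
    return bitmap;

fail:
    free(db);
    free(bitmap);
    return NULL;
}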

Signed-off-by: Kirti Wankhede 
Reviewed-by: Neo Jia 
---
 include/uapi/linux/vfio.h | 55 +++
 1 file changed, 55 insertions(+)

diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h
index ad9bb5af3463..4850c1fef1f8 100644
--- a/include/uapi/linux/vfio.h
+++ b/include/uapi/linux/vfio.h
@@ -1033,6 +1033,12 @@ struct vfio_iommu_type1_dma_map {
 
 #define VFIO_IOMMU_MAP_DMA _IO(VFIO_TYPE, VFIO_BASE + 13)
 
+struct vfio_bitmap {
+   __u64pgsize;/* page size for bitmap in bytes */
+   __u64size;  /* in bytes */
+   __u64 __user *data; /* one bit per page */
+};
+
 /**
  * VFIO_IOMMU_UNMAP_DMA - _IOWR(VFIO_TYPE, VFIO_BASE + 14,
  * struct vfio_dma_unmap)
@@ -1059,6 +1065,55 @@ struct vfio_iommu_type1_dma_unmap {
 #define VFIO_IOMMU_ENABLE  _IO(VFIO_TYPE, VFIO_BASE + 15)
 #define VFIO_IOMMU_DISABLE _IO(VFIO_TYPE, VFIO_BASE + 16)
 
+/**
+ * VFIO_IOMMU_DIRTY_PAGES - _IOWR(VFIO_TYPE, VFIO_BASE + 17,
+ * struct vfio_iommu_type1_dirty_bitmap)
+ * IOCTL is used for dirty pages tracking.
+ * Caller should set flag depending on which operation to perform, details as
+ * below:
+ *
+ * Calling the IOCTL with VFIO_IOMMU_DIRTY_PAGES_FLAG_START flag set, instructs
+ * the IOMMU driver to track pages that are dirtied or potentially dirtied by
+ * device; designed to be used when a migration is in progress. Dirty pages are
+ * tracked until tracking is stopped by user application by calling the IOCTL
+ * with VFIO_IOMMU_DIRTY_PAGES_FLAG_STOP flag.
+ *
+ * Calling the IOCTL with VFIO_IOMMU_DIRTY_PAGES_FLAG_STOP flag set, instructs
+ * the IOMMU driver to stop tracking dirtied pages.
+ *
+ * Calling the  IOCTL with VFIO_IOMMU_DIRTY_PAGES_FLAG_GET_BITMAP flag set
+ * returns the dirty pages bitmap for IOMMU container for a given IOVA range.
+ * User must specify the IOVA range and the pgsize through the structure
+ * vfio_iommu_type1_dirty_bitmap_get in the data[] portion. This interface
+ * supports to get bitmap of smallest supported pgsize only and can be modified
+ * in future to get bitmap of specified pgsize. The user must provide a zeroed
+ * memory area for the bitmap memory and specify its size in bitmap.size.
+ * One bit is used to represent one page consecutively starting from iova
+ * offset. The user should provide page size in bitmap.pgsize field. A bit set
+ * in the bitmap indicates that the page at that offset from iova is dirty.
+ * The caller must set argsz including size of structure
+ * vfio_iommu_type1_dirty_bitmap_get.
+ *
+ * Only one of the flags _START, _STOP and _GET may be specified at a time.
+ *
+ */
+struct vfio_iommu_type1_dirty_bitmap {
+   __u32argsz;
+   __u32flags;
+#define VFIO_IOMMU_DIRTY_PAGES_FLAG_START  (1 << 0)
+#define VFIO_IOMMU_DIRTY_PAGES_FLAG_STOP   (1 << 1)
+#define VFIO_IOMMU_DIRTY_PAGES_FLAG_GET_BITMAP (1 << 2)
+   __u8 data[];
+};
+
+struct vfio_iommu_type1_dirty_bitmap_get {
+   __u64  iova;/* IO virtual address */
+   __u64  size;/* Size of iova range */
+   struct vfio_bitmap bitmap;
+};
+
+#define VFIO_IOMMU_DIRTY_PAGES _IO(VFIO_TYPE, VFIO_BASE + 17)
+
 /*  Additional API for SPAPR TCE (Server POWERPC) IOMMU  */
 
 /*
-- 
2.7.0




[PATCH Kernel v22 6/8] vfio iommu: Update UNMAP_DMA ioctl to get dirty bitmap before unmap

2020-05-18 Thread Kirti Wankhede
DMA-mapped pages, including those pinned by mdev vendor drivers, might get
unpinned and unmapped while migration is active and the device is still
running. For example, in the pre-copy phase, while the guest driver can still
access those pages, the host device or vendor driver can dirty these mapped
pages. Such pages should be marked dirty so as to maintain memory consistency
for a user making use of dirty page tracking.

To get the bitmap during unmap, the user should allocate memory for the
bitmap, zero it, set the size of the allocated memory, set the page size to
be considered for the bitmap, and set the VFIO_DMA_UNMAP_FLAG_GET_DIRTY_BITMAP
flag; a usage sketch follows below.
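
A hedged userspace sketch (not part of this patch) of unmapping a range while
retrieving its dirty bitmap, as described above. It assumes the vfio.h
additions from this patch, i.e. the VFIO_DMA_UNMAP_FLAG_GET_DIRTY_BITMAP flag
and a struct vfio_bitmap carried in a data[] member appended to struct
vfio_iommu_type1_dma_unmap; error handling is abbreviated.

#include <stdint.h>
#include <stdlib.h>
#include <sys/ioctl.h>
#include <linux/vfio.h>

static int unmap_and_get_dirty(int container_fd, uint64_t iova, uint64_t size,
                               uint64_t pgsize, uint64_t *bitmap /* zeroed */)
{
    struct vfio_iommu_type1_dma_unmap *unmap;
    struct vfio_bitmap *vbmp;
    int ret;

    unmap = calloc(1, sizeof(*unmap) + sizeof(*vbmp));
    if (!unmap) {
        return -1;
    }

    unmap->argsz = sizeof(*unmap) + sizeof(*vbmp);
    unmap->flags = VFIO_DMA_UNMAP_FLAG_GET_DIRTY_BITMAP;
    unmap->iova = iova;
    unmap->size = size;

    /* bitmap parameters travel in the data[] portion of the unmap argument */
    vbmp = (struct vfio_bitmap *)unmap->data;
    vbmp->pgsize = pgsize;                          /* smallest supported page size */
    vbmp->size = ((size / pgsize + 63) / 64) * 8;   /* bitmap size in bytes */
    vbmp->data = (__u64 *)bitmap;

    ret = ioctl(container_fd, VFIO_IOMMU_UNMAP_DMA, unmap);
    free(unmap);
    return ret;
}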

Signed-off-by: Kirti Wankhede 
Reviewed-by: Neo Jia 
---
 drivers/vfio/vfio_iommu_type1.c | 68 +
 include/uapi/linux/vfio.h   | 10 ++
 2 files changed, 66 insertions(+), 12 deletions(-)

diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
index bf740fef196f..b9ee78a615a4 100644
--- a/drivers/vfio/vfio_iommu_type1.c
+++ b/drivers/vfio/vfio_iommu_type1.c
@@ -195,11 +195,15 @@ static void vfio_unlink_dma(struct vfio_iommu *iommu, 
struct vfio_dma *old)
 static int vfio_dma_bitmap_alloc(struct vfio_dma *dma, size_t pgsize)
 {
uint64_t npages = dma->size / pgsize;
+   size_t bitmap_size;
 
if (npages > DIRTY_BITMAP_PAGES_MAX)
return -EINVAL;
 
-   dma->bitmap = kvzalloc(DIRTY_BITMAP_BYTES(npages), GFP_KERNEL);
+   /* Allocate extra 64 bits which are used for bitmap manipulation */
+   bitmap_size = DIRTY_BITMAP_BYTES(npages) + sizeof(u64);
+
+   dma->bitmap = kvzalloc(bitmap_size, GFP_KERNEL);
if (!dma->bitmap)
return -ENOMEM;
 
@@ -1018,23 +1022,25 @@ static int verify_bitmap_size(uint64_t npages, uint64_t 
bitmap_size)
 }
 
 static int vfio_dma_do_unmap(struct vfio_iommu *iommu,
-struct vfio_iommu_type1_dma_unmap *unmap)
+struct vfio_iommu_type1_dma_unmap *unmap,
+struct vfio_bitmap *bitmap)
 {
-   uint64_t mask;
struct vfio_dma *dma, *dma_last = NULL;
-   size_t unmapped = 0;
+   size_t unmapped = 0, pgsize;
int ret = 0, retries = 0;
+   unsigned long pgshift;
 
	mutex_lock(&iommu->lock);
 
-   mask = ((uint64_t)1 << __ffs(iommu->pgsize_bitmap)) - 1;
+   pgshift = __ffs(iommu->pgsize_bitmap);
+   pgsize = (size_t)1 << pgshift;
 
-   if (unmap->iova & mask) {
+   if (unmap->iova & (pgsize - 1)) {
ret = -EINVAL;
goto unlock;
}
 
-   if (!unmap->size || unmap->size & mask) {
+   if (!unmap->size || unmap->size & (pgsize - 1)) {
ret = -EINVAL;
goto unlock;
}
@@ -1045,9 +1051,15 @@ static int vfio_dma_do_unmap(struct vfio_iommu *iommu,
goto unlock;
}
 
-   WARN_ON(mask & PAGE_MASK);
-again:
+   /* When dirty tracking is enabled, allow only min supported pgsize */
+   if ((unmap->flags & VFIO_DMA_UNMAP_FLAG_GET_DIRTY_BITMAP) &&
+   (!iommu->dirty_page_tracking || (bitmap->pgsize != pgsize))) {
+   ret = -EINVAL;
+   goto unlock;
+   }
 
+   WARN_ON((pgsize - 1) & PAGE_MASK);
+again:
/*
 * vfio-iommu-type1 (v1) - User mappings were coalesced together to
 * avoid tracking individual mappings.  This means that the granularity
@@ -1085,6 +1097,7 @@ static int vfio_dma_do_unmap(struct vfio_iommu *iommu,
ret = -EINVAL;
goto unlock;
}
+
dma = vfio_find_dma(iommu, unmap->iova + unmap->size - 1, 0);
if (dma && dma->iova + dma->size != unmap->iova + unmap->size) {
ret = -EINVAL;
@@ -1128,6 +1141,14 @@ static int vfio_dma_do_unmap(struct vfio_iommu *iommu,
	mutex_lock(&iommu->lock);
goto again;
}
+
+   if (unmap->flags & VFIO_DMA_UNMAP_FLAG_GET_DIRTY_BITMAP) {
+   ret = update_user_bitmap(bitmap->data, dma,
+unmap->iova, pgsize);
+   if (ret)
+   break;
+   }
+
unmapped += dma->size;
vfio_remove_dma(iommu, dma);
}
@@ -2466,17 +2487,40 @@ static long vfio_iommu_type1_ioctl(void *iommu_data,
 
} else if (cmd == VFIO_IOMMU_UNMAP_DMA) {
struct vfio_iommu_type1_dma_unmap unmap;
-   long ret;
+   struct vfio_bitmap bitmap = { 0 };
+   int ret;
 
minsz = offsetofend(struct vfio_iommu_type1_dma_unmap, size);
 
if (copy_from_user(, (void __user *)arg, m

[PATCH Kernel v22 5/8] vfio iommu: Implementation of ioctl for dirty pages tracking

2020-05-18 Thread Kirti Wankhede
The VFIO_IOMMU_DIRTY_PAGES ioctl performs three operations:
- Start dirty page tracking while migration is active.
- Stop dirty page tracking.
- Get the dirty pages bitmap. It is the user space application's
  responsibility to copy the content of dirty pages from source to
  destination during migration.

To prevent a DoS attack, memory for the bitmap is allocated per vfio_dma
structure. The bitmap size is calculated using the smallest supported page
size (a worked example follows below). A bitmap is allocated for all
vfio_dmas when dirty logging is enabled.

The bitmap is populated for already-pinned pages when the bitmap is allocated
for a vfio_dma, using the smallest supported page size. The bitmap is updated
from the pinning functions when tracking is enabled. When the user
application queries the bitmap, check whether the requested page size is the
same as the page size used to populate the bitmap: if it is equal, copy the
bitmap; if not, return an error.
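
A worked example of the sizing rule above, matching the DIRTY_BITMAP_BYTES()
macro added in the diff below and assuming a 4 KiB minimum supported page
size; the _EX names are local to this example.

#define BITS_PER_BYTE_EX   8
#define BITS_PER_U64_EX    64
/* one bit per page, rounded up to a whole number of u64 words */
#define DIRTY_BITMAP_BYTES_EX(n) \
        ((((n) + BITS_PER_U64_EX - 1) / BITS_PER_U64_EX) * \
         (BITS_PER_U64_EX / BITS_PER_BYTE_EX))

/* 1 GiB / 4 KiB pages = 262144 pages -> 262144 bits -> 32 KiB of bitmap */
_Static_assert(DIRTY_BITMAP_BYTES_EX((1ULL << 30) >> 12) == 32 * 1024,
               "1 GiB at 4 KiB pages needs a 32 KiB bitmap");

/*
 * The INT_MAX page cap therefore bounds a single bitmap at 2^31 bits
 * = 256 MiB, which at a 4 KiB page size covers 2^31 * 2^12 = 8 TiB of IOVA
 * space, as noted in the comment added below.
 */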

Signed-off-by: Kirti Wankhede 
Reviewed-by: Neo Jia 

Fixed error reported by build bot by changing pgsize type from uint64_t
to size_t.
Reported-by: kbuild test robot 
---
 drivers/vfio/vfio_iommu_type1.c | 313 +++-
 1 file changed, 307 insertions(+), 6 deletions(-)

diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
index de17787ffece..bf740fef196f 100644
--- a/drivers/vfio/vfio_iommu_type1.c
+++ b/drivers/vfio/vfio_iommu_type1.c
@@ -72,6 +72,7 @@ struct vfio_iommu {
uint64_tpgsize_bitmap;
boolv2;
boolnesting;
+   booldirty_page_tracking;
 };
 
 struct vfio_domain {
@@ -92,6 +93,7 @@ struct vfio_dma {
boollock_cap;   /* capable(CAP_IPC_LOCK) */
struct task_struct  *task;
struct rb_root  pfn_list;   /* Ex-user pinned pfn list */
+   unsigned long   *bitmap;
 };
 
 struct vfio_group {
@@ -126,6 +128,19 @@ struct vfio_regions {
 #define IS_IOMMU_CAP_DOMAIN_IN_CONTAINER(iommu)\
	(!list_empty(&iommu->domain_list))
 
+#define DIRTY_BITMAP_BYTES(n)  (ALIGN(n, BITS_PER_TYPE(u64)) / BITS_PER_BYTE)
+
+/*
+ * Input argument of number of bits to bitmap_set() is unsigned integer, which
+ * further casts to signed integer for unaligned multi-bit operation,
+ * __bitmap_set().
+ * Then maximum bitmap size supported is 2^31 bits divided by 2^3 bits/byte,
+ * that is 2^28 (256 MB) which maps to 2^31 * 2^12 = 2^43 (8TB) on 4K page
+ * system.
+ */
+#define DIRTY_BITMAP_PAGES_MAX  ((u64)INT_MAX)
+#define DIRTY_BITMAP_SIZE_MAX   DIRTY_BITMAP_BYTES(DIRTY_BITMAP_PAGES_MAX)
+
 static int put_pfn(unsigned long pfn, int prot);
 
 /*
@@ -176,6 +191,74 @@ static void vfio_unlink_dma(struct vfio_iommu *iommu, 
struct vfio_dma *old)
	rb_erase(&old->node, &iommu->dma_list);
 }
 
+
+static int vfio_dma_bitmap_alloc(struct vfio_dma *dma, size_t pgsize)
+{
+   uint64_t npages = dma->size / pgsize;
+
+   if (npages > DIRTY_BITMAP_PAGES_MAX)
+   return -EINVAL;
+
+   dma->bitmap = kvzalloc(DIRTY_BITMAP_BYTES(npages), GFP_KERNEL);
+   if (!dma->bitmap)
+   return -ENOMEM;
+
+   return 0;
+}
+
+static void vfio_dma_bitmap_free(struct vfio_dma *dma)
+{
+   kfree(dma->bitmap);
+   dma->bitmap = NULL;
+}
+
+static void vfio_dma_populate_bitmap(struct vfio_dma *dma, size_t pgsize)
+{
+   struct rb_node *p;
+
+   for (p = rb_first(&dma->pfn_list); p; p = rb_next(p)) {
+   struct vfio_pfn *vpfn = rb_entry(p, struct vfio_pfn, node);
+
+   bitmap_set(dma->bitmap, (vpfn->iova - dma->iova) / pgsize, 1);
+   }
+}
+
+static int vfio_dma_bitmap_alloc_all(struct vfio_iommu *iommu, size_t pgsize)
+{
+   struct rb_node *n = rb_first(&iommu->dma_list);
+
+   for (; n; n = rb_next(n)) {
+   struct vfio_dma *dma = rb_entry(n, struct vfio_dma, node);
+   int ret;
+
+   ret = vfio_dma_bitmap_alloc(dma, pgsize);
+   if (ret) {
+   struct rb_node *p = rb_prev(n);
+
+   for (; p; p = rb_prev(p)) {
+   struct vfio_dma *dma = rb_entry(p,
+   struct vfio_dma, node);
+
+   vfio_dma_bitmap_free(dma);
+   }
+   return ret;
+   }
+   vfio_dma_populate_bitmap(dma, pgsize);
+   }
+   return 0;
+}
+
+static void vfio_dma_bitmap_free_all(struct vfio_iommu *iommu)
+{
+   struct rb_node *n = rb_first(&iommu->dma_list);
+
+   for (; n; n = rb_next(n)) {
+   struct vfio_dma *dma = rb_entry(n, struct vfio_dma, node);
+
+   vfio_dma_bitmap_free(dma);
+   }
+}
+
 /*
  * Helper Functions for host iova-pfn list
  */
@@ -568,6 +651,17 @@ static int vfio_iommu_type1_pin_pages(void *iommu_data,
vfio

[PATCH Kernel v22 2/8] vfio iommu: Remove atomicity of ref_count of pinned pages

2020-05-18 Thread Kirti Wankhede
vfio_pfn.ref_count is always updated while holding iommu->lock, so using an
atomic variable is overkill.

Signed-off-by: Kirti Wankhede 
Reviewed-by: Neo Jia 
Reviewed-by: Eric Auger 
Reviewed-by: Cornelia Huck 
---
 drivers/vfio/vfio_iommu_type1.c | 9 +
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
index a0c60f895b24..fa735047b04d 100644
--- a/drivers/vfio/vfio_iommu_type1.c
+++ b/drivers/vfio/vfio_iommu_type1.c
@@ -112,7 +112,7 @@ struct vfio_pfn {
struct rb_node  node;
dma_addr_t  iova;   /* Device address */
unsigned long   pfn;/* Host pfn */
-   atomic_tref_count;
+   unsigned intref_count;
 };
 
 struct vfio_regions {
@@ -233,7 +233,7 @@ static int vfio_add_to_pfn_list(struct vfio_dma *dma, 
dma_addr_t iova,
 
vpfn->iova = iova;
vpfn->pfn = pfn;
-   atomic_set(&vpfn->ref_count, 1);
+   vpfn->ref_count = 1;
vfio_link_pfn(dma, vpfn);
return 0;
 }
@@ -251,7 +251,7 @@ static struct vfio_pfn *vfio_iova_get_vfio_pfn(struct 
vfio_dma *dma,
struct vfio_pfn *vpfn = vfio_find_vpfn(dma, iova);
 
if (vpfn)
-   atomic_inc(&vpfn->ref_count);
+   vpfn->ref_count++;
return vpfn;
 }
 
@@ -259,7 +259,8 @@ static int vfio_iova_put_vfio_pfn(struct vfio_dma *dma, 
struct vfio_pfn *vpfn)
 {
int ret = 0;
 
-   if (atomic_dec_and_test(&vpfn->ref_count)) {
+   vpfn->ref_count--;
+   if (!vpfn->ref_count) {
ret = put_pfn(vpfn->pfn, dma->prot);
vfio_remove_from_pfn_list(dma, vpfn);
}
-- 
2.7.0




[PATCH Kernel v22 0/8] Add UAPIs to support migration for VFIO devices

2020-05-18 Thread Kirti Wankhede
 vendor driver for per device. Added 2 ioctls to achieve this.

v7 -> v8:
- Updated comments for KABI
- Added BAR address validation check during PCI device's config space load
  as suggested by Dr. David Alan Gilbert.
- Changed vfio_migration_set_state() to set or clear device state flags.
- Some nit fixes.

v6 -> v7:
- Fix build failures.

v5 -> v6:
- Fix build failure.

v4 -> v5:
- Added a descriptive comment about the sequence of access of members of
  structure vfio_device_migration_info to be followed, based on Alex's
  suggestion.
- Updated the get dirty pages sequence.
- As per Cornelia Huck's suggestion, added callbacks to VFIODeviceOps for
  get_object, save_config and load_config.
- Fixed multiple nit picks.
- Tested live migration with multiple vfio devices assigned to a VM.

v3 -> v4:
- Added one more bit so that the _RESUMING flag is set explicitly.
- The data_offset field is read-only for the user space application.
- data_size is read on every iteration before reading data from the
  migration region; this removes the assumption that the data extends to
  the end of the migration region.
- If the vendor driver supports mappable sparse regions, map those regions
  during the setup state of save/load, and similarly unmap them from the
  cleanup routines.
- Handled a race condition that caused data corruption in the migration
  region while saving device state, by adding a mutex and serializing the
  save_buffer and get_dirty_pages routines.
- Skip calling the get_dirty_pages routine for the device's mapped MMIO
  region.
- Added trace events.
- Split into multiple functional patches.

v2 -> v3:
- Removed enum of VFIO device states. Defined VFIO device state with 2
  bits.
- Re-structured vfio_device_migration_info to keep it minimal and defined
  action on read and write access on its members.

v1 -> v2:
- Defined MIGRATION region type and sub-type which should be used with
  region type capability.
- Re-structured vfio_device_migration_info. This structure will be placed
  at 0th offset of migration region.
- Replaced ioctl with read/write for trapped part of migration region.
- Added both type of access support, trapped or mmapped, for data section
  of the region.
- Moved PCI device functions to pci file.
- Added iteration to get dirty page bitmap until the bitmap for all requested
  pages is copied.

Thanks,
Kirti




Kirti Wankhede (8):
  vfio: UAPI for migration interface for device state
  vfio iommu: Remove atomicity of ref_count of pinned pages
  vfio iommu: Cache pgsize_bitmap in struct vfio_iommu
  vfio iommu: Add ioctl definition for dirty pages tracking
  vfio iommu: Implementation of ioctl for dirty pages tracking
  vfio iommu: Update UNMAP_DMA ioctl to get dirty bitmap before unmap
  vfio iommu: Add migration capability to report supported features
  vfio: Selective dirty page tracking if IOMMU backed device pins pages

 drivers/vfio/vfio.c |  13 +-
 drivers/vfio/vfio_iommu_type1.c | 576 
 include/linux/vfio.h|   4 +-
 include/uapi/linux/vfio.h   | 315 ++
 4 files changed, 849 insertions(+), 59 deletions(-)

-- 
2.7.0




[PATCH Kernel v22 1/8] vfio: UAPI for migration interface for device state

2020-05-18 Thread Kirti Wankhede
- Defined MIGRATION region type and sub-type.

- Defined vfio_device_migration_info structure which will be placed at the
  0th offset of migration region to get/set VFIO device related
  information. Defined members of structure and usage on read/write access.

- Defined device states and state transition details.

- Defined sequence to be followed while saving and resuming VFIO device.

Signed-off-by: Kirti Wankhede 
Reviewed-by: Neo Jia 
---
 include/uapi/linux/vfio.h | 228 ++
 1 file changed, 228 insertions(+)

diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h
index 015516bcfaa3..ad9bb5af3463 100644
--- a/include/uapi/linux/vfio.h
+++ b/include/uapi/linux/vfio.h
@@ -305,6 +305,7 @@ struct vfio_region_info_cap_type {
#define VFIO_REGION_TYPE_PCI_VENDOR_MASK   (0xffff)
 #define VFIO_REGION_TYPE_GFX(1)
 #define VFIO_REGION_TYPE_CCW   (2)
+#define VFIO_REGION_TYPE_MIGRATION  (3)
 
 /* sub-types for VFIO_REGION_TYPE_PCI_* */
 
@@ -379,6 +380,233 @@ struct vfio_region_gfx_edid {
 /* sub-types for VFIO_REGION_TYPE_CCW */
 #define VFIO_REGION_SUBTYPE_CCW_ASYNC_CMD  (1)
 
+/* sub-types for VFIO_REGION_TYPE_MIGRATION */
+#define VFIO_REGION_SUBTYPE_MIGRATION   (1)
+
+/*
+ * The structure vfio_device_migration_info is placed at the 0th offset of
+ * the VFIO_REGION_SUBTYPE_MIGRATION region to get and set VFIO device related
+ * migration information. Field accesses from this structure are only supported
+ * at their native width and alignment. Otherwise, the result is undefined and
+ * vendor drivers should return an error.
+ *
+ * device_state: (read/write)
+ *  - The user application writes to this field to inform the vendor driver
+ *about the device state to be transitioned to.
+ *  - The vendor driver should take the necessary actions to change the
+ *device state. After successful transition to a given state, the
+ *vendor driver should return success on write(device_state, state)
+ *system call. If the device state transition fails, the vendor driver
+ *should return an appropriate -errno for the fault condition.
+ *  - On the user application side, if the device state transition fails,
+ *   that is, if write(device_state, state) returns an error, read
+ *   device_state again to determine the current state of the device from
+ *   the vendor driver.
+ *  - The vendor driver should return previous state of the device unless
+ *the vendor driver has encountered an internal error, in which case
+ *the vendor driver may report the device_state 
VFIO_DEVICE_STATE_ERROR.
+ *  - The user application must use the device reset ioctl to recover the
+ *device from VFIO_DEVICE_STATE_ERROR state. If the device is
+ *indicated to be in a valid device state by reading device_state, the
+ *user application may attempt to transition the device to any valid
+ *state reachable from the current state or terminate itself.
+ *
+ *  device_state consists of 3 bits:
+ *  - If bit 0 is set, it indicates the _RUNNING state. If bit 0 is clear,
+ *it indicates the _STOP state. When the device state is changed to
+ *_STOP, driver should stop the device before write() returns.
+ *  - If bit 1 is set, it indicates the _SAVING state, which means that the
+ *driver should start gathering device state information that will be
+ *provided to the VFIO user application to save the device's state.
+ *  - If bit 2 is set, it indicates the _RESUMING state, which means that
+ *the driver should prepare to resume the device. Data provided through
+ *the migration region should be used to resume the device.
+ *  Bits 3 - 31 are reserved for future use. To preserve them, the user
+ *  application should perform a read-modify-write operation on this
+ *  field when modifying the specified bits.
+ *
+ *  +--- _RESUMING
+ *  |+-- _SAVING
+ *  ||+- _RUNNING
+ *  |||
+ *  000b => Device Stopped, not saving or resuming
+ *  001b => Device running, which is the default state
+ *  010b => Stop the device & save the device state, stop-and-copy state
+ *  011b => Device running and save the device state, pre-copy state
+ *  100b => Device stopped and the device state is resuming
+ *  101b => Invalid state
+ *  110b => Error state
+ *  111b => Invalid state
+ *
+ * State transitions:
+ *
+ *  _RESUMING  _RUNNINGPre-copyStop-and-copy   _STOP
+ *(100b) (001b) (011b)(010b)   (000b)
+ * 0. Running or default state
+ * |
+ *
+ * 1. Normal Shutdown (optional)
+ * |->|
+ *
+ * 2. Save the state or suspend
+ * |--
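
For illustration, here is a minimal user-space sketch of the read-modify-write
that the device_state description above asks for. It is not part of the patch:
device_fd and device_state_offset are placeholders for the VFIO device file
descriptor and the offset of device_state within the migration region (both
obtained via VFIO_DEVICE_GET_REGION_INFO), and the state constants are local
stand-ins for the UAPI defines introduced by this patch.

#include <stdint.h>
#include <unistd.h>

#define STATE_RUNNING (1u << 0)   /* bit 0: _RUNNING */
#define STATE_SAVING  (1u << 1)   /* bit 1: _SAVING  */

/* Enter pre-copy (_RUNNING | _SAVING) while preserving reserved bits 3-31. */
static int enter_precopy(int device_fd, off_t device_state_offset)
{
    uint32_t state;

    /* Read the current device_state so reserved bits are preserved. */
    if (pread(device_fd, &state, sizeof(state), device_state_offset)
        != sizeof(state))
        return -1;

    state |= STATE_SAVING;    /* keep _RUNNING set -> 011b, pre-copy */

    if (pwrite(device_fd, &state, sizeof(state), device_state_offset)
        != sizeof(state))
        return -1;

    return 0;
}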

[PATCH Kernel v22 3/8] vfio iommu: Cache pgsize_bitmap in struct vfio_iommu

2020-05-18 Thread Kirti Wankhede
Calculate and cache pgsize_bitmap when iommu->domain_list is updated
and iommu->external_domain is set for mdev device.
Add iommu->lock protection when cached pgsize_bitmap is accessed.

Signed-off-by: Kirti Wankhede 
Reviewed-by: Neo Jia 
---
 drivers/vfio/vfio_iommu_type1.c | 88 +++--
 1 file changed, 49 insertions(+), 39 deletions(-)

diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
index fa735047b04d..de17787ffece 100644
--- a/drivers/vfio/vfio_iommu_type1.c
+++ b/drivers/vfio/vfio_iommu_type1.c
@@ -69,6 +69,7 @@ struct vfio_iommu {
struct rb_root  dma_list;
struct blocking_notifier_head notifier;
unsigned intdma_avail;
+   uint64_tpgsize_bitmap;
boolv2;
boolnesting;
 };
@@ -805,15 +806,14 @@ static void vfio_remove_dma(struct vfio_iommu *iommu, 
struct vfio_dma *dma)
iommu->dma_avail++;
 }
 
-static unsigned long vfio_pgsize_bitmap(struct vfio_iommu *iommu)
+static void vfio_pgsize_bitmap(struct vfio_iommu *iommu)
 {
struct vfio_domain *domain;
-   unsigned long bitmap = ULONG_MAX;
 
-   mutex_lock(&iommu->lock);
+   iommu->pgsize_bitmap = ULONG_MAX;
+
list_for_each_entry(domain, >domain_list, next)
-   bitmap &= domain->domain->pgsize_bitmap;
-   mutex_unlock(&iommu->lock);
+   iommu->pgsize_bitmap &= domain->domain->pgsize_bitmap;
 
/*
 * In case the IOMMU supports page sizes smaller than PAGE_SIZE
@@ -823,12 +823,10 @@ static unsigned long vfio_pgsize_bitmap(struct vfio_iommu 
*iommu)
 * granularity while iommu driver can use the sub-PAGE_SIZE size
 * to map the buffer.
 */
-   if (bitmap & ~PAGE_MASK) {
-   bitmap &= PAGE_MASK;
-   bitmap |= PAGE_SIZE;
+   if (iommu->pgsize_bitmap & ~PAGE_MASK) {
+   iommu->pgsize_bitmap &= PAGE_MASK;
+   iommu->pgsize_bitmap |= PAGE_SIZE;
}
-
-   return bitmap;
 }
 
 static int vfio_dma_do_unmap(struct vfio_iommu *iommu,
@@ -839,19 +837,28 @@ static int vfio_dma_do_unmap(struct vfio_iommu *iommu,
size_t unmapped = 0;
int ret = 0, retries = 0;
 
-   mask = ((uint64_t)1 << __ffs(vfio_pgsize_bitmap(iommu))) - 1;
+   mutex_lock(&iommu->lock);
+
+   mask = ((uint64_t)1 << __ffs(iommu->pgsize_bitmap)) - 1;
+
+   if (unmap->iova & mask) {
+   ret = -EINVAL;
+   goto unlock;
+   }
+
+   if (!unmap->size || unmap->size & mask) {
+   ret = -EINVAL;
+   goto unlock;
+   }
 
-   if (unmap->iova & mask)
-   return -EINVAL;
-   if (!unmap->size || unmap->size & mask)
-   return -EINVAL;
if (unmap->iova + unmap->size - 1 < unmap->iova ||
-   unmap->size > SIZE_MAX)
-   return -EINVAL;
+   unmap->size > SIZE_MAX) {
+   ret = -EINVAL;
+   goto unlock;
+   }
 
WARN_ON(mask & PAGE_MASK);
 again:
-   mutex_lock(&iommu->lock);
 
/*
 * vfio-iommu-type1 (v1) - User mappings were coalesced together to
@@ -930,6 +937,7 @@ static int vfio_dma_do_unmap(struct vfio_iommu *iommu,
blocking_notifier_call_chain(>notifier,
VFIO_IOMMU_NOTIFY_DMA_UNMAP,
_unmap);
+   mutex_lock(&iommu->lock);
goto again;
}
unmapped += dma->size;
@@ -1045,24 +1053,28 @@ static int vfio_dma_do_map(struct vfio_iommu *iommu,
if (map->size != size || map->vaddr != vaddr || map->iova != iova)
return -EINVAL;
 
-   mask = ((uint64_t)1 << __ffs(vfio_pgsize_bitmap(iommu))) - 1;
-
-   WARN_ON(mask & PAGE_MASK);
-
/* READ/WRITE from device perspective */
if (map->flags & VFIO_DMA_MAP_FLAG_WRITE)
prot |= IOMMU_WRITE;
if (map->flags & VFIO_DMA_MAP_FLAG_READ)
prot |= IOMMU_READ;
 
-   if (!prot || !size || (size | iova | vaddr) & mask)
-   return -EINVAL;
+   mutex_lock(&iommu->lock);
 
-   /* Don't allow IOVA or virtual address wrap */
-   if (iova + size - 1 < iova || vaddr + size - 1 < vaddr)
-   return -EINVAL;
+   mask = ((uint64_t)1 << __ffs(iommu->pgsize_bitmap)) - 1;
 
-   mutex_lock(&iommu->lock);
+   WARN_ON(mask & PAGE_MASK);
+
+   if (!prot || !size || (size | iova | vaddr) & mask) {
+   ret = -EINVAL;
+   goto out_unlock;
+   }
+
+   /* Don't allow IOVA or virtual address wrap */
+   if (iova + 
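
As a side note on how the cached bitmap is consumed: the map/unmap paths above
derive the minimum supported page size and alignment mask with __ffs(). A rough
equivalent, purely illustrative and not from the patch:

#include <stdint.h>

static inline uint64_t min_pgsize(uint64_t pgsize_bitmap)
{
        /* lowest set bit == smallest supported IOMMU page size */
        return pgsize_bitmap & -pgsize_bitmap;
}

static inline uint64_t pgsize_mask(uint64_t pgsize_bitmap)
{
        /* e.g. a 4 KiB minimum page size gives mask 0xfff, used above to
         * check iova/size/vaddr alignment */
        return min_pgsize(pgsize_bitmap) - 1;
}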

Re: [PATCH Kernel v21 0/8] Add UAPIs to support migration for VFIO devices

2020-05-17 Thread Kirti Wankhede




On 5/18/2020 8:09 AM, Xiang Zheng wrote:

Hi Kirti and Yan,

How can I test this patch series on my SR-IOV devices?
I have looked through Yan's patches for i40e VF live migration support:
 https://patchwork.kernel.org/patch/11375177/

However, I cannot find the detailed implementation about device state
saving/restoring and dirty page logging.


Details of device state transitions, how the user application should use the
UAPI, and how the vendor driver should behave are described in the UAPI
comment:

https://lore.kernel.org/kvm/1589577203-20640-2-git-send-email-kwankh...@nvidia.com/


Has the i40e hardware already supported
these two features?

And once a device supports both features, how is live
migration implemented for this device via this patch series?



Here is the patch set which adds migration support to the mtty sample
device; you can refer to it to see how to implement the driver side:

https://lore.kernel.org/kvm/1588614860-16330-1-git-send-email-kwankh...@nvidia.com/

I'm working on preparing QEMU patches for v21 and will be sending them out soon.

Thanks,
Kirti


On 2020/5/16 5:13, Kirti Wankhede wrote:

Hi,

This patch set adds:
* IOCTL VFIO_IOMMU_DIRTY_PAGES to get dirty pages bitmap with
   respect to IOMMU container rather than per device. All pages pinned by
   vendor driver through the vfio_pin_pages external API have to be marked as
   dirty during  migration. When IOMMU capable device is present in the
   container and all pages are pinned and mapped, then all pages are marked
   dirty.
   When there are CPU writes, CPU dirty page tracking can identify dirtied
   pages, but any page pinned by vendor driver can also be written by
   device. As of now there is no device which has hardware support for
   dirty page tracking. So all pages which are pinned should be considered
   as dirty.
   This ioctl is also used to start/stop dirty pages tracking for pinned and
   unpinned pages while migration is active.

* Updated IOCTL VFIO_IOMMU_UNMAP_DMA to get dirty pages bitmap before
   unmapping IO virtual address range.
   With vIOMMU, during pre-copy phase of migration, while CPUs are still
   running, IO virtual address unmap can happen while device still keeping
   reference of guest pfns. Those pages should be reported as dirty before
   unmap, so that VFIO user space application can copy content of those
   pages from source to destination.

* Patch 8 detects whether an IOMMU-backed device's driver is smart enough to
   report the pages to be marked dirty by pinning them using the
   vfio_pin_pages() API.


Yet TODO:
Since there is no device which has hardware support for system memory
dirty bitmap tracking, right now there is no other API from the vendor driver
to the VFIO IOMMU module to report dirty pages. In the future, when such
hardware support is implemented, an API will be required so that the vendor
driver can report dirty pages to the VFIO module during migration phases.

Adding revision history from previous QEMU patch set to understand KABI
changes done till now

v20 -> v21
- Added check in GET_BITMAP ioctl for vfio_dma boundaries.
- Updated unmap ioctl function - as suggested by Alex.
- Updated comments in DIRTY_TRACKING ioctl definition - as suggested by
   Cornelia.

v19 -> v20
- Fixed ioctl to get dirty bitmap to get bitmap of multiple vfio_dmas
- Fixed unmap ioctl to get dirty bitmap of multiple vfio_dmas.
- Removed flag definition from migration capability.

v18 -> v19
- Updated migration capability with supported page sizes bitmap for dirty
   page tracking and  maximum bitmap size supported by kernel module.
- Added patch to calculate and cache pgsize_bitmap when iommu->domain_list
   is updated.
- Removed extra buffers added in previous version for bitmap manipulation
   and optimised the code.

v17 -> v18
- Add migration capability to the capability chain for VFIO_IOMMU_GET_INFO
   ioctl
- Updated UMAP_DMA ioctl to return bitmap of multiple vfio_dma

v16 -> v17
- Fixed errors reported by kbuild test robot  on i386

v15 -> v16
- Minor edits and nit picks (Auger Eric)
- On copying bitmap to user, re-populated bitmap only for pinned pages,
   excluding unmapped pages and CPU dirtied pages.
- Patches are on tag: next-20200318 and 1-3 patches from Yan's series
   https://lkml.org/lkml/2020/3/12/1255

v14 -> v15
- Minor edits and nit picks.
- In the verification of user allocated bitmap memory, added check of
maximum size.
- Patches are on tag: next-20200318 and 1-3 patches from Yan's series
   https://lkml.org/lkml/2020/3/12/1255

v13 -> v14
- Added struct vfio_bitmap to kabi. updated structure
   vfio_iommu_type1_dirty_bitmap_get and vfio_iommu_type1_dma_unmap.
- All small changes suggested by Alex.
- Patches are on tag: next-20200318 and 1-3 patches from Yan's series
   https://lkml.org/lkml/2020/3/12/1255

v12 -> v13
- Changed bitmap allocation in vfio_iommu_type1 to per vfio_dma
- Changed VFIO_IOMMU_DIRTY_PAGES ioctl behaviour to be per vfio_dma range.
- Changed vfio_iommu_type1_

Re: [PATCH Kernel v21 0/8] Add UAPIs to support migration for VFIO devices

2020-05-16 Thread Kirti Wankhede



On 5/16/2020 5:17 AM, Tian, Kevin wrote:

Hi, Kirti,

Will you send out a new version in Qemu side, or previous v16 still applies?



v16 doesn't work now that the migration capability is added to the iommu info
chain. I'll send out a new version of the QEMU side tomorrow, though I haven't
been able to update the QEMU patches with all of the review comments on those
patches. Still, I'll send out QEMU patches which are compatible with v21 and
will cover the rest of the comments in a later revision.


Thanks,
Kirti


Thanks
Kevin


From: Kirti Wankhede
Sent: Saturday, May 16, 2020 5:13 AM

Hi,

This patch set adds:
* IOCTL VFIO_IOMMU_DIRTY_PAGES to get dirty pages bitmap with
   respect to IOMMU container rather than per device. All pages pinned by
   vendor driver through the vfio_pin_pages external API have to be marked as
   dirty during  migration. When IOMMU capable device is present in the
   container and all pages are pinned and mapped, then all pages are marked
   dirty.
   When there are CPU writes, CPU dirty page tracking can identify dirtied
   pages, but any page pinned by vendor driver can also be written by
   device. As of now there is no device which has hardware support for
   dirty page tracking. So all pages which are pinned should be considered
   as dirty.
   This ioctl is also used to start/stop dirty pages tracking for pinned and
   unpinned pages while migration is active.

* Updated IOCTL VFIO_IOMMU_UNMAP_DMA to get dirty pages bitmap
before
   unmapping IO virtual address range.
   With vIOMMU, during pre-copy phase of migration, while CPUs are still
   running, IO virtual address unmap can happen while device still keeping
   reference of guest pfns. Those pages should be reported as dirty before
   unmap, so that VFIO user space application can copy content of those
   pages from source to destination.

* Patch 8 detects whether an IOMMU-backed device's driver is smart enough to
   report the pages to be marked dirty by pinning them using the
   vfio_pin_pages() API.


Yet TODO:
Since there is no device which has hardware support for system memory
dirty bitmap tracking, right now there is no other API from the vendor driver
to the VFIO IOMMU module to report dirty pages. In the future, when such
hardware support is implemented, an API will be required so that the vendor
driver can report dirty pages to the VFIO module during migration phases.

Adding revision history from previous QEMU patch set to understand KABI
changes done till now

v20 -> v21
- Added check in GET_BITMAP ioctl for vfio_dma boundaries.
- Updated unmap ioctl function - as suggested by Alex.
- Updated comments in DIRTY_TRACKING ioctl definition - as suggested by
   Cornelia.

v19 -> v20
- Fixed ioctl to get dirty bitmap to get bitmap of multiple vfio_dmas
- Fixed unmap ioctl to get dirty bitmap of multiple vfio_dmas.
- Removed flag definition from migration capability.

v18 -> v19
- Updated migration capability with supported page sizes bitmap for dirty
   page tracking and  maximum bitmap size supported by kernel module.
- Added patch to calculate and cache pgsize_bitmap when iommu->domain_list
   is updated.
- Removed extra buffers added in previous version for bitmap manipulation
   and optimised the code.

v17 -> v18
- Add migration capability to the capability chain for
VFIO_IOMMU_GET_INFO
   ioctl
- Updated UMAP_DMA ioctl to return bitmap of multiple vfio_dma

v16 -> v17
- Fixed errors reported by kbuild test robot  on i386

v15 -> v16
- Minor edits and nit picks (Auger Eric)
- On copying bitmap to user, re-populated bitmap only for pinned pages,
   excluding unmapped pages and CPU dirtied pages.
- Patches are on tag: next-20200318 and 1-3 patches from Yan's series
   https://lkml.org/lkml/2020/3/12/1255

v14 -> v15
- Minor edits and nit picks.
- In the verification of user allocated bitmap memory, added check of
maximum size.
- Patches are on tag: next-20200318 and 1-3 patches from Yan's series
   https://lkml.org/lkml/2020/3/12/1255

v13 -> v14
- Added struct vfio_bitmap to kabi. updated structure
   vfio_iommu_type1_dirty_bitmap_get and vfio_iommu_type1_dma_unmap.
- All small changes suggested by Alex.
- Patches are on tag: next-20200318 and 1-3 patches from Yan's series
   https://lkml.org/lkml/2020/3/12/1255

v12 -> v13
- Changed bitmap allocation in vfio_iommu_type1 to per vfio_dma
- Changed VFIO_IOMMU_DIRTY_PAGES ioctl behaviour to be per vfio_dma
range.
- Changed vfio_iommu_type1_dirty_bitmap structure to have separate data
   field.

v11 -> v12
- Changed bitmap allocation in vfio_iommu_type1.
- Remove atomicity of ref_count.
- Updated comments for migration device state structure about error
   reporting.
- Nit picks from v11 reviews

v10 -> v11
- Fix pin pages API to free vpfn if it is marked as unpinned tracking page.
- Added proposal to detect if IOMMU capable device calls external pin pages
   API to mark pages dirty.
- Nit picks from v10 reviews

v9 -> v10:
- Updated existing VFIO_IOMMU_UNMAP_DMA ioctl to get dirt

Re: [PATCH Kernel v21 5/8] vfio iommu: Implementation of ioctl for dirty pages tracking

2020-05-16 Thread Kirti Wankhede



On 5/16/2020 4:03 AM, Alex Williamson wrote:

On Sat, 16 May 2020 02:43:20 +0530
Kirti Wankhede  wrote:






+static int vfio_iova_dirty_bitmap(u64 __user *bitmap, struct vfio_iommu *iommu,
+ dma_addr_t iova, size_t size, size_t pgsize)
+{
+   struct vfio_dma *dma;
+   unsigned long pgshift = __ffs(pgsize);
+   int ret;
+
+   /*
+* GET_BITMAP request must fully cover vfio_dma mappings.  Multiple
+* vfio_dma mappings may be clubbed by specifying large ranges, but
+* there must not be any previous mappings bisected by the range.
+* An error will be returned if these conditions are not met.
+*/
+   dma = vfio_find_dma(iommu, iova, 1);
+   if (dma && dma->iova != iova)
+   return -EINVAL;
+
+   dma = vfio_find_dma(iommu, iova + size - 1, 0);
+   if (dma && dma->iova + dma->size != iova + size)
+   return -EINVAL;
+
+   dma = vfio_find_dma(iommu, iova, size);
+
+   while (dma && (dma->iova >= iova) &&
+   (dma->iova + dma->size <= iova + size)) {


Thanks for doing this!  Unfortunately I think I've mislead you :(
But I think there was a bug here in the last version as well, so maybe
it's all for the better ;)

vfio_find_dma() does not guarantee to find the first dma in the range
(ie. the lowest iova), it only guarantees to find a dma in the range.
Since we have a tree structure, as we descend the nodes we might find
multiple nodes within the range.  vfio_find_dma() only returns the first
occurrence it finds, so we can't assume that other matching nodes are
next in the tree or that their iovas are greater than the iova of the
node we found.

All the other use cases of vfio_find_dma() are looking for specific
pages or boundaries or checking for the existence of a conflict or are
removing all of the instances within the range, which is probably the
example that was used in the v20 version of this patch, since it was
quite similar to vfio_dma_do_unmap() but tried to adjust the size to
get the next match rather than removing the entry.  That could
potentially lead to an entire unexplored half of the tree making our
bitmap incomplete.

So I think my initial suggestion[1] on the previous version is probably
the way we should go.  Sorry!  OTOH, it would have been a nasty bug to
find later, it's a subtle semantic that's easy to overlook.  Thanks,

Alex

[1]https://lore.kernel.org/kvm/20200514212720.479cc...@x1.home/



Ok. Got your point.

Replacing
dma = vfio_find_dma(iommu, iova, size);

with below should work

for (n = rb_first(&iommu->dma_list); n; n = rb_next(n)) {
struct vfio_dma *ldma = rb_entry(n, struct vfio_dma, node);

if (ldma->iova >= iova)
break;
}

dma = n ? rb_entry(n, struct vfio_dma, node) : NULL;

Should I update all patches with a v22 version, or is it fine to update
just this patch within v21?


Thanks,
Kirti
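
For context, a rough sketch of the traversal being discussed (not the final
patch): walk the rb-tree from rb_first() so every vfio_dma intersecting
[iova, iova + size) is visited in iova order. update_user_bitmap() stands in
for the per-vfio_dma bitmap copy done elsewhere in the patch, and the boundary
checks from the quoted hunk are assumed to have run already.

static int vfio_iova_dirty_bitmap_sketch(u64 __user *bitmap,
                                         struct vfio_iommu *iommu,
                                         dma_addr_t iova, size_t size,
                                         size_t pgsize)
{
        struct rb_node *n;
        int ret = 0;

        for (n = rb_first(&iommu->dma_list); n; n = rb_next(n)) {
                struct vfio_dma *dma = rb_entry(n, struct vfio_dma, node);

                if (dma->iova + dma->size <= iova)      /* below the range */
                        continue;
                if (dma->iova >= iova + size)           /* past the range */
                        break;

                ret = update_user_bitmap(bitmap, dma, iova, pgsize);
                if (ret)
                        break;
        }

        return ret;
}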




[PATCH Kernel v21 7/8] vfio iommu: Add migration capability to report supported features

2020-05-15 Thread Kirti Wankhede
Added migration capability to the IOMMU info chain.
The user application should check the IOMMU info chain for the migration
capability before using the dirty page tracking feature provided by the kernel
module. The user application must check the supported page sizes and the
maximum dirty bitmap size returned by this capability structure for the ioctls
used to get the dirty bitmap.

Signed-off-by: Kirti Wankhede 
---
 drivers/vfio/vfio_iommu_type1.c | 23 ++-
 include/uapi/linux/vfio.h   | 22 ++
 2 files changed, 44 insertions(+), 1 deletion(-)

diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
index d687794c721c..5c8d20974a2a 100644
--- a/drivers/vfio/vfio_iommu_type1.c
+++ b/drivers/vfio/vfio_iommu_type1.c
@@ -2390,6 +2390,22 @@ static int vfio_iommu_iova_build_caps(struct vfio_iommu 
*iommu,
return ret;
 }
 
+static int vfio_iommu_migration_build_caps(struct vfio_iommu *iommu,
+  struct vfio_info_cap *caps)
+{
+   struct vfio_iommu_type1_info_cap_migration cap_mig;
+
+   cap_mig.header.id = VFIO_IOMMU_TYPE1_INFO_CAP_MIGRATION;
+   cap_mig.header.version = 1;
+
+   cap_mig.flags = 0;
+   /* support minimum pgsize */
+   cap_mig.pgsize_bitmap = (size_t)1 << __ffs(iommu->pgsize_bitmap);
+   cap_mig.max_dirty_bitmap_size = DIRTY_BITMAP_SIZE_MAX;
+
+   return vfio_info_add_capability(caps, &cap_mig.header, sizeof(cap_mig));
+}
+
 static long vfio_iommu_type1_ioctl(void *iommu_data,
   unsigned int cmd, unsigned long arg)
 {
@@ -2436,8 +2452,13 @@ static long vfio_iommu_type1_ioctl(void *iommu_data,
 
info.iova_pgsizes = iommu->pgsize_bitmap;
 
-   ret = vfio_iommu_iova_build_caps(iommu, &caps);
+   ret = vfio_iommu_migration_build_caps(iommu, &caps);
+
+   if (!ret)
+   ret = vfio_iommu_iova_build_caps(iommu, &caps);
+
mutex_unlock(&iommu->lock);
+
if (ret)
return ret;
 
diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h
index a1dd2150971e..aa8aa9dcf02a 100644
--- a/include/uapi/linux/vfio.h
+++ b/include/uapi/linux/vfio.h
@@ -1013,6 +1013,28 @@ struct vfio_iommu_type1_info_cap_iova_range {
struct  vfio_iova_range iova_ranges[];
 };
 
+/*
+ * The migration capability allows to report supported features for migration.
+ *
+ * The structures below define version 1 of this capability.
+ *
+ * The existence of this capability indicates IOMMU kernel driver supports
+ * dirty page tracking.
+ *
+ * pgsize_bitmap: Kernel driver returns supported page sizes bitmap for dirty
+ * page tracking.
+ * max_dirty_bitmap_size: Kernel driver returns maximum supported dirty bitmap
+ * size in bytes to be used by user application for ioctls to get dirty bitmap.
+ */
+#define VFIO_IOMMU_TYPE1_INFO_CAP_MIGRATION  1
+
+struct vfio_iommu_type1_info_cap_migration {
+   struct  vfio_info_cap_header header;
+   __u32   flags;
+   __u64   pgsize_bitmap;
+   __u64   max_dirty_bitmap_size;  /* in bytes */
+};
+
 #define VFIO_IOMMU_GET_INFO _IO(VFIO_TYPE, VFIO_BASE + 12)
 
 /**
-- 
2.7.0
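
A hedged user-space sketch of how the capability added above could be located
in the VFIO_IOMMU_GET_INFO result. It assumes the generic vfio_info_cap_header
chain where 'next' is an offset from the start of the info buffer, a vfio.h
with this patch applied, and a caller that passed a buffer large enough to
hold the whole chain.

#include <linux/vfio.h>
#include <stddef.h>

static struct vfio_iommu_type1_info_cap_migration *
find_migration_cap(struct vfio_iommu_type1_info *info)
{
        struct vfio_info_cap_header *hdr;
        __u32 off;

        if (!(info->flags & VFIO_IOMMU_INFO_CAPS))
                return NULL;

        /* Walk the capability chain; offsets are relative to 'info'. */
        for (off = info->cap_offset; off; off = hdr->next) {
                hdr = (struct vfio_info_cap_header *)((char *)info + off);
                if (hdr->id == VFIO_IOMMU_TYPE1_INFO_CAP_MIGRATION)
                        return (struct vfio_iommu_type1_info_cap_migration *)hdr;
        }

        return NULL;
}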




[PATCH Kernel v21 8/8] vfio: Selective dirty page tracking if IOMMU backed device pins pages

2020-05-15 Thread Kirti Wankhede
Added a check such that only singleton IOMMU groups can pin pages.
From the point when the vendor driver pins any pages, consider the IOMMU
group's dirty page scope to be limited to pinned pages.

To avoid walking the list often, added a flag pinned_page_dirty_scope to
indicate whether the dirty page scope of all vfio_groups for each vfio_domain
in the domain_list is limited to pinned pages. This flag is updated on the
first pinned pages request for that IOMMU group and on attaching/detaching a
group.

Signed-off-by: Kirti Wankhede 
Reviewed-by: Neo Jia 
---
 drivers/vfio/vfio.c |  13 +++--
 drivers/vfio/vfio_iommu_type1.c | 103 +---
 include/linux/vfio.h|   4 +-
 3 files changed, 109 insertions(+), 11 deletions(-)

diff --git a/drivers/vfio/vfio.c b/drivers/vfio/vfio.c
index 765e0e5d83ed..580099afeaff 100644
--- a/drivers/vfio/vfio.c
+++ b/drivers/vfio/vfio.c
@@ -85,6 +85,7 @@ struct vfio_group {
atomic_topened;
wait_queue_head_t   container_q;
boolnoiommu;
+   unsigned intdev_counter;
struct kvm  *kvm;
struct blocking_notifier_head   notifier;
 };
@@ -555,6 +556,7 @@ struct vfio_device *vfio_group_create_device(struct 
vfio_group *group,
 
mutex_lock(&group->device_lock);
list_add(&device->group_next, &group->device_list);
+   group->dev_counter++;
mutex_unlock(&group->device_lock);
 
return device;
@@ -567,6 +569,7 @@ static void vfio_device_release(struct kref *kref)
struct vfio_group *group = device->group;
 
list_del(&device->group_next);
+   group->dev_counter--;
mutex_unlock(&group->device_lock);
 
dev_set_drvdata(device->dev, NULL);
@@ -1945,6 +1948,9 @@ int vfio_pin_pages(struct device *dev, unsigned long 
*user_pfn, int npage,
if (!group)
return -ENODEV;
 
+   if (group->dev_counter > 1)
+   return -EINVAL;
+
ret = vfio_group_add_container_user(group);
if (ret)
goto err_pin_pages;
@@ -1952,7 +1958,8 @@ int vfio_pin_pages(struct device *dev, unsigned long 
*user_pfn, int npage,
container = group->container;
driver = container->iommu_driver;
if (likely(driver && driver->ops->pin_pages))
-   ret = driver->ops->pin_pages(container->iommu_data, user_pfn,
+   ret = driver->ops->pin_pages(container->iommu_data,
+group->iommu_group, user_pfn,
 npage, prot, phys_pfn);
else
ret = -ENOTTY;
@@ -2050,8 +2057,8 @@ int vfio_group_pin_pages(struct vfio_group *group,
driver = container->iommu_driver;
if (likely(driver && driver->ops->pin_pages))
ret = driver->ops->pin_pages(container->iommu_data,
-user_iova_pfn, npage,
-prot, phys_pfn);
+group->iommu_group, user_iova_pfn,
+npage, prot, phys_pfn);
else
ret = -ENOTTY;
 
diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
index 5c8d20974a2a..05637b51e7c9 100644
--- a/drivers/vfio/vfio_iommu_type1.c
+++ b/drivers/vfio/vfio_iommu_type1.c
@@ -73,6 +73,7 @@ struct vfio_iommu {
boolv2;
boolnesting;
booldirty_page_tracking;
+   boolpinned_page_dirty_scope;
 };
 
 struct vfio_domain {
@@ -100,6 +101,7 @@ struct vfio_group {
struct iommu_group  *iommu_group;
struct list_headnext;
boolmdev_group; /* An mdev group */
+   boolpinned_page_dirty_scope;
 };
 
 struct vfio_iova {
@@ -143,6 +145,10 @@ struct vfio_regions {
 
 static int put_pfn(unsigned long pfn, int prot);
 
+static struct vfio_group *vfio_iommu_find_iommu_group(struct vfio_iommu *iommu,
+  struct iommu_group *iommu_group);
+
+static void update_pinned_page_dirty_scope(struct vfio_iommu *iommu);
 /*
  * This code handles mapping and unmapping of user data buffers
  * into DMA'ble space using the IOMMU
@@ -590,11 +596,13 @@ static int vfio_unpin_page_external(struct vfio_dma *dma, 
dma_addr_t iova,
 }
 
 static int vfio_iommu_type1_pin_pages(void *iommu_data,
+ struct iommu_group *iommu_group,
  unsigned long *user_pfn,
  int npage, int prot,
  unsigned long *phys_pfn)
 {
struct vfio_iommu *iommu = iommu_data;
+   stru
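
A rough sketch of what the update_pinned_page_dirty_scope() helper declared
above does, based on the commit message rather than the (truncated) patch
body; handling of the external/mdev domain is omitted for brevity.

static void update_pinned_page_dirty_scope_sketch(struct vfio_iommu *iommu)
{
        struct vfio_domain *domain;
        struct vfio_group *group;

        /* The iommu-wide flag holds only while every group is limited
         * to pinned-page dirty scope. */
        list_for_each_entry(domain, &iommu->domain_list, next) {
                list_for_each_entry(group, &domain->group_list, next) {
                        if (!group->pinned_page_dirty_scope) {
                                iommu->pinned_page_dirty_scope = false;
                                return;
                        }
                }
        }

        iommu->pinned_page_dirty_scope = true;
}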

[PATCH Kernel v21 2/8] vfio iommu: Remove atomicity of ref_count of pinned pages

2020-05-15 Thread Kirti Wankhede
vfio_pfn.ref_count is always updated while holding iommu->lock, so using an
atomic variable is overkill.

Signed-off-by: Kirti Wankhede 
Reviewed-by: Neo Jia 
Reviewed-by: Eric Auger 
Reviewed-by: Cornelia Huck 
---
 drivers/vfio/vfio_iommu_type1.c | 9 +
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
index a0c60f895b24..fa735047b04d 100644
--- a/drivers/vfio/vfio_iommu_type1.c
+++ b/drivers/vfio/vfio_iommu_type1.c
@@ -112,7 +112,7 @@ struct vfio_pfn {
struct rb_node  node;
dma_addr_t  iova;   /* Device address */
unsigned long   pfn;/* Host pfn */
-   atomic_tref_count;
+   unsigned intref_count;
 };
 
 struct vfio_regions {
@@ -233,7 +233,7 @@ static int vfio_add_to_pfn_list(struct vfio_dma *dma, 
dma_addr_t iova,
 
vpfn->iova = iova;
vpfn->pfn = pfn;
-   atomic_set(&vpfn->ref_count, 1);
+   vpfn->ref_count = 1;
vfio_link_pfn(dma, vpfn);
return 0;
 }
@@ -251,7 +251,7 @@ static struct vfio_pfn *vfio_iova_get_vfio_pfn(struct 
vfio_dma *dma,
struct vfio_pfn *vpfn = vfio_find_vpfn(dma, iova);
 
if (vpfn)
-   atomic_inc(&vpfn->ref_count);
+   vpfn->ref_count++;
return vpfn;
 }
 
@@ -259,7 +259,8 @@ static int vfio_iova_put_vfio_pfn(struct vfio_dma *dma, 
struct vfio_pfn *vpfn)
 {
int ret = 0;
 
-   if (atomic_dec_and_test(&vpfn->ref_count)) {
+   vpfn->ref_count--;
+   if (!vpfn->ref_count) {
ret = put_pfn(vpfn->pfn, dma->prot);
vfio_remove_from_pfn_list(dma, vpfn);
}
-- 
2.7.0




[PATCH Kernel v21 1/8] vfio: UAPI for migration interface for device state

2020-05-15 Thread Kirti Wankhede
- Defined MIGRATION region type and sub-type.

- Defined vfio_device_migration_info structure which will be placed at the
  0th offset of migration region to get/set VFIO device related
  information. Defined members of structure and usage on read/write access.

- Defined device states and state transition details.

- Defined sequence to be followed while saving and resuming VFIO device.

Signed-off-by: Kirti Wankhede 
Reviewed-by: Neo Jia 
---
 include/uapi/linux/vfio.h | 228 ++
 1 file changed, 228 insertions(+)

diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h
index 015516bcfaa3..ad9bb5af3463 100644
--- a/include/uapi/linux/vfio.h
+++ b/include/uapi/linux/vfio.h
@@ -305,6 +305,7 @@ struct vfio_region_info_cap_type {
#define VFIO_REGION_TYPE_PCI_VENDOR_MASK   (0xffff)
 #define VFIO_REGION_TYPE_GFX(1)
 #define VFIO_REGION_TYPE_CCW   (2)
+#define VFIO_REGION_TYPE_MIGRATION  (3)
 
 /* sub-types for VFIO_REGION_TYPE_PCI_* */
 
@@ -379,6 +380,233 @@ struct vfio_region_gfx_edid {
 /* sub-types for VFIO_REGION_TYPE_CCW */
 #define VFIO_REGION_SUBTYPE_CCW_ASYNC_CMD  (1)
 
+/* sub-types for VFIO_REGION_TYPE_MIGRATION */
+#define VFIO_REGION_SUBTYPE_MIGRATION   (1)
+
+/*
+ * The structure vfio_device_migration_info is placed at the 0th offset of
+ * the VFIO_REGION_SUBTYPE_MIGRATION region to get and set VFIO device related
+ * migration information. Field accesses from this structure are only supported
+ * at their native width and alignment. Otherwise, the result is undefined and
+ * vendor drivers should return an error.
+ *
+ * device_state: (read/write)
+ *  - The user application writes to this field to inform the vendor driver
+ *about the device state to be transitioned to.
+ *  - The vendor driver should take the necessary actions to change the
+ *device state. After successful transition to a given state, the
+ *vendor driver should return success on write(device_state, state)
+ *system call. If the device state transition fails, the vendor driver
+ *should return an appropriate -errno for the fault condition.
+ *  - On the user application side, if the device state transition fails,
+ *   that is, if write(device_state, state) returns an error, read
+ *   device_state again to determine the current state of the device from
+ *   the vendor driver.
+ *  - The vendor driver should return previous state of the device unless
+ *the vendor driver has encountered an internal error, in which case
+ *the vendor driver may report the device_state 
VFIO_DEVICE_STATE_ERROR.
+ *  - The user application must use the device reset ioctl to recover the
+ *device from VFIO_DEVICE_STATE_ERROR state. If the device is
+ *indicated to be in a valid device state by reading device_state, the
+ *user application may attempt to transition the device to any valid
+ *state reachable from the current state or terminate itself.
+ *
+ *  device_state consists of 3 bits:
+ *  - If bit 0 is set, it indicates the _RUNNING state. If bit 0 is clear,
+ *it indicates the _STOP state. When the device state is changed to
+ *_STOP, driver should stop the device before write() returns.
+ *  - If bit 1 is set, it indicates the _SAVING state, which means that the
+ *driver should start gathering device state information that will be
+ *provided to the VFIO user application to save the device's state.
+ *  - If bit 2 is set, it indicates the _RESUMING state, which means that
+ *the driver should prepare to resume the device. Data provided through
+ *the migration region should be used to resume the device.
+ *  Bits 3 - 31 are reserved for future use. To preserve them, the user
+ *  application should perform a read-modify-write operation on this
+ *  field when modifying the specified bits.
+ *
+ *  +--- _RESUMING
+ *  |+-- _SAVING
+ *  ||+- _RUNNING
+ *  |||
+ *  000b => Device Stopped, not saving or resuming
+ *  001b => Device running, which is the default state
+ *  010b => Stop the device & save the device state, stop-and-copy state
+ *  011b => Device running and save the device state, pre-copy state
+ *  100b => Device stopped and the device state is resuming
+ *  101b => Invalid state
+ *  110b => Error state
+ *  111b => Invalid state
+ *
+ * State transitions:
+ *
+ *  _RESUMING  _RUNNINGPre-copyStop-and-copy   _STOP
+ *(100b) (001b) (011b)(010b)   (000b)
+ * 0. Running or default state
+ * |
+ *
+ * 1. Normal Shutdown (optional)
+ * |->|
+ *
+ * 2. Save the state or suspend
+ * |--

[PATCH Kernel v21 6/8] vfio iommu: Update UNMAP_DMA ioctl to get dirty bitmap before unmap

2020-05-15 Thread Kirti Wankhede
DMA mapped pages, including those pinned by mdev vendor drivers, might
get unpinned and unmapped while migration is active and device is still
running. For example, in pre-copy phase while guest driver could access
those pages, host device or vendor driver can dirty these mapped pages.
Such pages should be marked dirty so as to maintain memory consistency
for a user making use of dirty page tracking.

To get the bitmap during unmap, the user should allocate memory for the
bitmap, zero it, set the size of the allocated memory, set the page size to be
considered for the bitmap, and set the flag VFIO_DMA_UNMAP_FLAG_GET_DIRTY_BITMAP.

Signed-off-by: Kirti Wankhede 
Reviewed-by: Neo Jia 
---
 drivers/vfio/vfio_iommu_type1.c | 68 +
 include/uapi/linux/vfio.h   | 10 ++
 2 files changed, 66 insertions(+), 12 deletions(-)

diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
index 8b8fe171f4c2..d687794c721c 100644
--- a/drivers/vfio/vfio_iommu_type1.c
+++ b/drivers/vfio/vfio_iommu_type1.c
@@ -195,11 +195,15 @@ static void vfio_unlink_dma(struct vfio_iommu *iommu, 
struct vfio_dma *old)
 static int vfio_dma_bitmap_alloc(struct vfio_dma *dma, size_t pgsize)
 {
uint64_t npages = dma->size / pgsize;
+   size_t bitmap_size;
 
if (npages > DIRTY_BITMAP_PAGES_MAX)
return -EINVAL;
 
-   dma->bitmap = kvzalloc(DIRTY_BITMAP_BYTES(npages), GFP_KERNEL);
+   /* Allocate extra 64 bits which are used for bitmap manipulation */
+   bitmap_size = DIRTY_BITMAP_BYTES(npages) + sizeof(u64);
+
+   dma->bitmap = kvzalloc(bitmap_size, GFP_KERNEL);
if (!dma->bitmap)
return -ENOMEM;
 
@@ -1011,23 +1015,25 @@ static int verify_bitmap_size(uint64_t npages, uint64_t 
bitmap_size)
 }
 
 static int vfio_dma_do_unmap(struct vfio_iommu *iommu,
-struct vfio_iommu_type1_dma_unmap *unmap)
+struct vfio_iommu_type1_dma_unmap *unmap,
+struct vfio_bitmap *bitmap)
 {
-   uint64_t mask;
struct vfio_dma *dma, *dma_last = NULL;
-   size_t unmapped = 0;
+   size_t unmapped = 0, pgsize;
int ret = 0, retries = 0;
+   unsigned long pgshift;
 
mutex_lock(&iommu->lock);
 
-   mask = ((uint64_t)1 << __ffs(iommu->pgsize_bitmap)) - 1;
+   pgshift = __ffs(iommu->pgsize_bitmap);
+   pgsize = (size_t)1 << pgshift;
 
-   if (unmap->iova & mask) {
+   if (unmap->iova & (pgsize - 1)) {
ret = -EINVAL;
goto unlock;
}
 
-   if (!unmap->size || unmap->size & mask) {
+   if (!unmap->size || unmap->size & (pgsize - 1)) {
ret = -EINVAL;
goto unlock;
}
@@ -1038,9 +1044,15 @@ static int vfio_dma_do_unmap(struct vfio_iommu *iommu,
goto unlock;
}
 
-   WARN_ON(mask & PAGE_MASK);
-again:
+   /* When dirty tracking is enabled, allow only min supported pgsize */
+   if ((unmap->flags & VFIO_DMA_UNMAP_FLAG_GET_DIRTY_BITMAP) &&
+   (!iommu->dirty_page_tracking || (bitmap->pgsize != pgsize))) {
+   ret = -EINVAL;
+   goto unlock;
+   }
 
+   WARN_ON((pgsize - 1) & PAGE_MASK);
+again:
/*
 * vfio-iommu-type1 (v1) - User mappings were coalesced together to
 * avoid tracking individual mappings.  This means that the granularity
@@ -1078,6 +1090,7 @@ static int vfio_dma_do_unmap(struct vfio_iommu *iommu,
ret = -EINVAL;
goto unlock;
}
+
dma = vfio_find_dma(iommu, unmap->iova + unmap->size - 1, 0);
if (dma && dma->iova + dma->size != unmap->iova + unmap->size) {
ret = -EINVAL;
@@ -1121,6 +1134,14 @@ static int vfio_dma_do_unmap(struct vfio_iommu *iommu,
mutex_lock(&iommu->lock);
goto again;
}
+
+   if (unmap->flags & VFIO_DMA_UNMAP_FLAG_GET_DIRTY_BITMAP) {
+   ret = update_user_bitmap(bitmap->data, dma,
+unmap->iova, pgsize);
+   if (ret)
+   break;
+   }
+
unmapped += dma->size;
vfio_remove_dma(iommu, dma);
}
@@ -2459,17 +2480,40 @@ static long vfio_iommu_type1_ioctl(void *iommu_data,
 
} else if (cmd == VFIO_IOMMU_UNMAP_DMA) {
struct vfio_iommu_type1_dma_unmap unmap;
-   long ret;
+   struct vfio_bitmap bitmap = { 0 };
+   int ret;
 
minsz = offsetofend(struct vfio_iommu_type1_dma_unmap, size);
 
if (copy_from_user(, (void __user *)arg, m
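
An illustrative user-space sketch of the flow the commit message describes
(not from the patch): the vfio_bitmap is assumed to sit immediately after the
fixed unmap fields, a vfio.h with this patch applied is assumed, the dirty
buffer must be zeroed by the caller, and container_fd is a placeholder for the
VFIO container file descriptor.

#include <linux/vfio.h>
#include <stdlib.h>
#include <sys/ioctl.h>

static int unmap_and_get_dirty(int container_fd, __u64 iova, __u64 size,
                               __u64 pgsize, __u64 *dirty /* zeroed by caller */)
{
        size_t argsz = sizeof(struct vfio_iommu_type1_dma_unmap) +
                       sizeof(struct vfio_bitmap);
        struct vfio_iommu_type1_dma_unmap *unmap = calloc(1, argsz);
        struct vfio_bitmap *bitmap;
        int ret;

        if (!unmap)
                return -1;

        /* vfio_bitmap placed right after the fixed unmap fields */
        bitmap = (struct vfio_bitmap *)(unmap + 1);
        unmap->argsz = argsz;
        unmap->flags = VFIO_DMA_UNMAP_FLAG_GET_DIRTY_BITMAP;
        unmap->iova = iova;
        unmap->size = size;
        bitmap->pgsize = pgsize;                      /* must match tracking pgsize */
        bitmap->size = (size / pgsize + 63) / 64 * 8; /* bitmap bytes */
        bitmap->data = dirty;

        ret = ioctl(container_fd, VFIO_IOMMU_UNMAP_DMA, unmap);
        free(unmap);
        /* on success, set bits in 'dirty' mark pages dirtied before unmap */
        return ret;
}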

[PATCH Kernel v21 5/8] vfio iommu: Implementation of ioctl for dirty pages tracking

2020-05-15 Thread Kirti Wankhede
VFIO_IOMMU_DIRTY_PAGES ioctl performs three operations:
- Start dirty pages tracking while migration is active
- Stop dirty pages tracking.
- Get dirty pages bitmap. It is the user space application's responsibility to
  copy content of dirty pages from source to destination during migration.

To prevent DoS attack, memory for bitmap is allocated per vfio_dma
structure. Bitmap size is calculated considering smallest supported page
size. Bitmap is allocated for all vfio_dmas when dirty logging is enabled.

Bitmap is populated for already pinned pages when bitmap is allocated for
a vfio_dma with the smallest supported page size. Update bitmap from
pinning functions when tracking is enabled. When user application queries
bitmap, check if the requested page size is the same as the page size used to
populate the bitmap. If it is equal, copy the bitmap, but if not equal, return
error.

Signed-off-by: Kirti Wankhede 
Reviewed-by: Neo Jia 

Fixed error reported by build bot by changing pgsize type from uint64_t
to size_t.
Reported-by: kbuild test robot 
---
 drivers/vfio/vfio_iommu_type1.c | 306 +++-
 1 file changed, 300 insertions(+), 6 deletions(-)

diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
index de17787ffece..8b8fe171f4c2 100644
--- a/drivers/vfio/vfio_iommu_type1.c
+++ b/drivers/vfio/vfio_iommu_type1.c
@@ -72,6 +72,7 @@ struct vfio_iommu {
uint64_tpgsize_bitmap;
boolv2;
boolnesting;
+   booldirty_page_tracking;
 };
 
 struct vfio_domain {
@@ -92,6 +93,7 @@ struct vfio_dma {
boollock_cap;   /* capable(CAP_IPC_LOCK) */
struct task_struct  *task;
struct rb_root  pfn_list;   /* Ex-user pinned pfn list */
+   unsigned long   *bitmap;
 };
 
 struct vfio_group {
@@ -126,6 +128,19 @@ struct vfio_regions {
 #define IS_IOMMU_CAP_DOMAIN_IN_CONTAINER(iommu)\
(!list_empty(>domain_list))
 
+#define DIRTY_BITMAP_BYTES(n)  (ALIGN(n, BITS_PER_TYPE(u64)) / BITS_PER_BYTE)
+
+/*
+ * Input argument of number of bits to bitmap_set() is unsigned integer, which
+ * further casts to signed integer for unaligned multi-bit operation,
+ * __bitmap_set().
+ * Then maximum bitmap size supported is 2^31 bits divided by 2^3 bits/byte,
+ * that is 2^28 (256 MB) which maps to 2^31 * 2^12 = 2^43 (8TB) on 4K page
+ * system.
+ */
+#define DIRTY_BITMAP_PAGES_MAX  ((u64)INT_MAX)
+#define DIRTY_BITMAP_SIZE_MAX   DIRTY_BITMAP_BYTES(DIRTY_BITMAP_PAGES_MAX)
+
 static int put_pfn(unsigned long pfn, int prot);
 
 /*
@@ -176,6 +191,74 @@ static void vfio_unlink_dma(struct vfio_iommu *iommu, 
struct vfio_dma *old)
rb_erase(&old->node, &iommu->dma_list);
 }
 
+
+static int vfio_dma_bitmap_alloc(struct vfio_dma *dma, size_t pgsize)
+{
+   uint64_t npages = dma->size / pgsize;
+
+   if (npages > DIRTY_BITMAP_PAGES_MAX)
+   return -EINVAL;
+
+   dma->bitmap = kvzalloc(DIRTY_BITMAP_BYTES(npages), GFP_KERNEL);
+   if (!dma->bitmap)
+   return -ENOMEM;
+
+   return 0;
+}
+
+static void vfio_dma_bitmap_free(struct vfio_dma *dma)
+{
+   kfree(dma->bitmap);
+   dma->bitmap = NULL;
+}
+
+static void vfio_dma_populate_bitmap(struct vfio_dma *dma, size_t pgsize)
+{
+   struct rb_node *p;
+
+   for (p = rb_first(&dma->pfn_list); p; p = rb_next(p)) {
+   struct vfio_pfn *vpfn = rb_entry(p, struct vfio_pfn, node);
+
+   bitmap_set(dma->bitmap, (vpfn->iova - dma->iova) / pgsize, 1);
+   }
+}
+
+static int vfio_dma_bitmap_alloc_all(struct vfio_iommu *iommu, size_t pgsize)
+{
+   struct rb_node *n = rb_first(&iommu->dma_list);
+
+   for (; n; n = rb_next(n)) {
+   struct vfio_dma *dma = rb_entry(n, struct vfio_dma, node);
+   int ret;
+
+   ret = vfio_dma_bitmap_alloc(dma, pgsize);
+   if (ret) {
+   struct rb_node *p = rb_prev(n);
+
+   for (; p; p = rb_prev(p)) {
+   struct vfio_dma *dma = rb_entry(n,
+   struct vfio_dma, node);
+
+   vfio_dma_bitmap_free(dma);
+   }
+   return ret;
+   }
+   vfio_dma_populate_bitmap(dma, pgsize);
+   }
+   return 0;
+}
+
+static void vfio_dma_bitmap_free_all(struct vfio_iommu *iommu)
+{
+   struct rb_node *n = rb_first(&iommu->dma_list);
+
+   for (; n; n = rb_next(n)) {
+   struct vfio_dma *dma = rb_entry(n, struct vfio_dma, node);
+
+   vfio_dma_bitmap_free(dma);
+   }
+}
+
 /*
  * Helper Functions for host iova-pfn list
  */
@@ -568,6 +651,17 @@ static int vfio_iommu_type1_pin_pages(void *iommu_data,
vfio
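
To make the sizing above concrete, an illustrative helper (not from the patch)
mirroring DIRTY_BITMAP_BYTES(): the bitmap for one vfio_dma holds one bit per
smallest-supported page, rounded up to whole u64 words.

#include <stdint.h>

static inline uint64_t dirty_bitmap_bytes(uint64_t dma_size, uint64_t pgsize)
{
        uint64_t npages = dma_size / pgsize;

        /* round up to whole 64-bit words, 8 bytes each;
         * e.g. a 1 GiB mapping at 4 KiB pages -> 262144 bits -> 32 KiB */
        return (npages + 63) / 64 * 8;
}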

[PATCH Kernel v21 3/8] vfio iommu: Cache pgsize_bitmap in struct vfio_iommu

2020-05-15 Thread Kirti Wankhede
Calculate and cache pgsize_bitmap when iommu->domain_list is updated
and iommu->external_domain is set for mdev device.
Add iommu->lock protection when cached pgsize_bitmap is accessed.

Signed-off-by: Kirti Wankhede 
Reviewed-by: Neo Jia 
---
 drivers/vfio/vfio_iommu_type1.c | 88 +++--
 1 file changed, 49 insertions(+), 39 deletions(-)

diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
index fa735047b04d..de17787ffece 100644
--- a/drivers/vfio/vfio_iommu_type1.c
+++ b/drivers/vfio/vfio_iommu_type1.c
@@ -69,6 +69,7 @@ struct vfio_iommu {
struct rb_root  dma_list;
struct blocking_notifier_head notifier;
unsigned intdma_avail;
+   uint64_tpgsize_bitmap;
boolv2;
boolnesting;
 };
@@ -805,15 +806,14 @@ static void vfio_remove_dma(struct vfio_iommu *iommu, 
struct vfio_dma *dma)
iommu->dma_avail++;
 }
 
-static unsigned long vfio_pgsize_bitmap(struct vfio_iommu *iommu)
+static void vfio_pgsize_bitmap(struct vfio_iommu *iommu)
 {
struct vfio_domain *domain;
-   unsigned long bitmap = ULONG_MAX;
 
-   mutex_lock(&iommu->lock);
+   iommu->pgsize_bitmap = ULONG_MAX;
+
list_for_each_entry(domain, >domain_list, next)
-   bitmap &= domain->domain->pgsize_bitmap;
-   mutex_unlock(&iommu->lock);
+   iommu->pgsize_bitmap &= domain->domain->pgsize_bitmap;
 
/*
 * In case the IOMMU supports page sizes smaller than PAGE_SIZE
@@ -823,12 +823,10 @@ static unsigned long vfio_pgsize_bitmap(struct vfio_iommu 
*iommu)
 * granularity while iommu driver can use the sub-PAGE_SIZE size
 * to map the buffer.
 */
-   if (bitmap & ~PAGE_MASK) {
-   bitmap &= PAGE_MASK;
-   bitmap |= PAGE_SIZE;
+   if (iommu->pgsize_bitmap & ~PAGE_MASK) {
+   iommu->pgsize_bitmap &= PAGE_MASK;
+   iommu->pgsize_bitmap |= PAGE_SIZE;
}
-
-   return bitmap;
 }
 
 static int vfio_dma_do_unmap(struct vfio_iommu *iommu,
@@ -839,19 +837,28 @@ static int vfio_dma_do_unmap(struct vfio_iommu *iommu,
size_t unmapped = 0;
int ret = 0, retries = 0;
 
-   mask = ((uint64_t)1 << __ffs(vfio_pgsize_bitmap(iommu))) - 1;
+   mutex_lock(&iommu->lock);
+
+   mask = ((uint64_t)1 << __ffs(iommu->pgsize_bitmap)) - 1;
+
+   if (unmap->iova & mask) {
+   ret = -EINVAL;
+   goto unlock;
+   }
+
+   if (!unmap->size || unmap->size & mask) {
+   ret = -EINVAL;
+   goto unlock;
+   }
 
-   if (unmap->iova & mask)
-   return -EINVAL;
-   if (!unmap->size || unmap->size & mask)
-   return -EINVAL;
if (unmap->iova + unmap->size - 1 < unmap->iova ||
-   unmap->size > SIZE_MAX)
-   return -EINVAL;
+   unmap->size > SIZE_MAX) {
+   ret = -EINVAL;
+   goto unlock;
+   }
 
WARN_ON(mask & PAGE_MASK);
 again:
-   mutex_lock(&iommu->lock);
 
/*
 * vfio-iommu-type1 (v1) - User mappings were coalesced together to
@@ -930,6 +937,7 @@ static int vfio_dma_do_unmap(struct vfio_iommu *iommu,
blocking_notifier_call_chain(>notifier,
VFIO_IOMMU_NOTIFY_DMA_UNMAP,
_unmap);
+   mutex_lock(&iommu->lock);
goto again;
}
unmapped += dma->size;
@@ -1045,24 +1053,28 @@ static int vfio_dma_do_map(struct vfio_iommu *iommu,
if (map->size != size || map->vaddr != vaddr || map->iova != iova)
return -EINVAL;
 
-   mask = ((uint64_t)1 << __ffs(vfio_pgsize_bitmap(iommu))) - 1;
-
-   WARN_ON(mask & PAGE_MASK);
-
/* READ/WRITE from device perspective */
if (map->flags & VFIO_DMA_MAP_FLAG_WRITE)
prot |= IOMMU_WRITE;
if (map->flags & VFIO_DMA_MAP_FLAG_READ)
prot |= IOMMU_READ;
 
-   if (!prot || !size || (size | iova | vaddr) & mask)
-   return -EINVAL;
+   mutex_lock(&iommu->lock);
 
-   /* Don't allow IOVA or virtual address wrap */
-   if (iova + size - 1 < iova || vaddr + size - 1 < vaddr)
-   return -EINVAL;
+   mask = ((uint64_t)1 << __ffs(iommu->pgsize_bitmap)) - 1;
 
-   mutex_lock(&iommu->lock);
+   WARN_ON(mask & PAGE_MASK);
+
+   if (!prot || !size || (size | iova | vaddr) & mask) {
+   ret = -EINVAL;
+   goto out_unlock;
+   }
+
+   /* Don't allow IOVA or virtual address wrap */
+   if (iova + 

[PATCH Kernel v21 0/8] Add UAPIs to support migration for VFIO devices

2020-05-15 Thread Kirti Wankhede
Added BAR address validation check during PCI device's config space load
  as suggested by Dr. David Alan Gilbert.
- Changed vfio_migration_set_state() to set or clear device state flags.
- Some nit fixes.

v6 -> v7:
- Fix build failures.

v5 -> v6:
- Fix build failure.

v4 -> v5:
- Added descriptive comment about the sequence of access of members of
  structure vfio_device_migration_info to be followed, based on Alex's
  suggestion.
- Updated get dirty pages sequence.
- As per Cornelia Huck's suggestion, added callbacks to VFIODeviceOps to
  get_object, save_config and load_config.
- Fixed multiple nit picks.
- Tested live migration with multiple vfio devices assigned to a VM.

v3 -> v4:
- Added one more bit for _RESUMING flag to be set explicitly.
- data_offset field is read-only for user space application.
- data_size is read on every iteration before reading data from the migration
  region; this removes the assumption that data extends to the end of the
  migration region.
- If the vendor driver supports mappable sparse regions, map those regions
  during the setup state of save/load, and similarly unmap them from the
  cleanup routines.
- Handled a race condition that caused data corruption in the migration region
  during device state save by adding a mutex and serializing the save_buffer
  and get_dirty_pages routines.
- Skip calling the get_dirty_pages routine for the mapped MMIO region of the
  device.
- Added trace events.
- Split into multiple functional patches.

v2 -> v3:
- Removed enum of VFIO device states. Defined VFIO device state with 2
  bits.
- Re-structured vfio_device_migration_info to keep it minimal and defined
  action on read and write access on its members.

v1 -> v2:
- Defined MIGRATION region type and sub-type which should be used with
  region type capability.
- Re-structured vfio_device_migration_info. This structure will be placed
  at 0th offset of migration region.
- Replaced ioctl with read/write for trapped part of migration region.
- Added both type of access support, trapped or mmapped, for data section
  of the region.
- Moved PCI device functions to pci file.
- Added iteration to get dirty page bitmap until the bitmap for all requested
  pages is copied.

Thanks,
Kirti



Kirti Wankhede (8):
  vfio: UAPI for migration interface for device state
  vfio iommu: Remove atomicity of ref_count of pinned pages
  vfio iommu: Cache pgsize_bitmap in struct vfio_iommu
  vfio iommu: Add ioctl definition for dirty pages tracking
  vfio iommu: Implementation of ioctl for dirty pages tracking
  vfio iommu: Update UNMAP_DMA ioctl to get dirty bitmap before unmap
  vfio iommu: Add migration capability to report supported features
  vfio: Selective dirty page tracking if IOMMU backed device pins pages

 drivers/vfio/vfio.c |  13 +-
 drivers/vfio/vfio_iommu_type1.c | 569 
 include/linux/vfio.h|   4 +-
 include/uapi/linux/vfio.h   | 315 ++
 4 files changed, 842 insertions(+), 59 deletions(-)

-- 
2.7.0




[PATCH Kernel v21 4/8] vfio iommu: Add ioctl definition for dirty pages tracking

2020-05-15 Thread Kirti Wankhede
IOMMU container maintains a list of all pages pinned by vfio_pin_pages API.
All pages pinned by vendor driver through this API should be considered as
dirty during migration. When container consists of IOMMU capable device and
all pages are pinned and mapped, then all pages are marked dirty.
Added support to start/stop dirtied pages tracking and to get bitmap of all
dirtied pages for requested IO virtual address range.

Signed-off-by: Kirti Wankhede 
Reviewed-by: Neo Jia 
---
 include/uapi/linux/vfio.h | 55 +++
 1 file changed, 55 insertions(+)

diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h
index ad9bb5af3463..4850c1fef1f8 100644
--- a/include/uapi/linux/vfio.h
+++ b/include/uapi/linux/vfio.h
@@ -1033,6 +1033,12 @@ struct vfio_iommu_type1_dma_map {
 
 #define VFIO_IOMMU_MAP_DMA _IO(VFIO_TYPE, VFIO_BASE + 13)
 
+struct vfio_bitmap {
+   __u64pgsize;/* page size for bitmap in bytes */
+   __u64size;  /* in bytes */
+   __u64 __user *data; /* one bit per page */
+};
+
 /**
  * VFIO_IOMMU_UNMAP_DMA - _IOWR(VFIO_TYPE, VFIO_BASE + 14,
  * struct vfio_dma_unmap)
@@ -1059,6 +1065,55 @@ struct vfio_iommu_type1_dma_unmap {
 #define VFIO_IOMMU_ENABLE  _IO(VFIO_TYPE, VFIO_BASE + 15)
 #define VFIO_IOMMU_DISABLE _IO(VFIO_TYPE, VFIO_BASE + 16)
 
+/**
+ * VFIO_IOMMU_DIRTY_PAGES - _IOWR(VFIO_TYPE, VFIO_BASE + 17,
+ * struct vfio_iommu_type1_dirty_bitmap)
+ * IOCTL is used for dirty pages tracking.
+ * Caller should set flag depending on which operation to perform, details as
+ * below:
+ *
+ * Calling the IOCTL with VFIO_IOMMU_DIRTY_PAGES_FLAG_START flag set, instructs
+ * the IOMMU driver to track pages that are dirtied or potentially dirtied by
+ * device; designed to be used when a migration is in progress. Dirty pages are
+ * tracked until tracking is stopped by user application by calling the IOCTL
+ * with VFIO_IOMMU_DIRTY_PAGES_FLAG_STOP flag.
+ *
+ * Calling the IOCTL with VFIO_IOMMU_DIRTY_PAGES_FLAG_STOP flag set, instructs
+ * the IOMMU driver to stop tracking dirtied pages.
+ *
+ * Calling the  IOCTL with VFIO_IOMMU_DIRTY_PAGES_FLAG_GET_BITMAP flag set
+ * returns the dirty pages bitmap for IOMMU container for a given IOVA range.
+ * User must specify the IOVA range and the pgsize through the structure
+ * vfio_iommu_type1_dirty_bitmap_get in the data[] portion. This interface
+ * supports to get bitmap of smallest supported pgsize only and can be modified
+ * in future to get bitmap of specified pgsize. The user must provide a zeroed
+ * memory area for the bitmap memory and specify its size in bitmap.size.
+ * One bit is used to represent one page consecutively starting from iova
+ * offset. The user should provide page size in bitmap.pgsize field. A bit set
+ * in the bitmap indicates that the page at that offset from iova is dirty.
+ * The caller must set argsz including size of structure
+ * vfio_iommu_type1_dirty_bitmap_get.
+ *
+ * Only one of the flags _START, _STOP and _GET may be specified at a time.
+ *
+ */
+struct vfio_iommu_type1_dirty_bitmap {
+   __u32argsz;
+   __u32flags;
+#define VFIO_IOMMU_DIRTY_PAGES_FLAG_START  (1 << 0)
+#define VFIO_IOMMU_DIRTY_PAGES_FLAG_STOP   (1 << 1)
+#define VFIO_IOMMU_DIRTY_PAGES_FLAG_GET_BITMAP (1 << 2)
+   __u8 data[];
+};
+
+struct vfio_iommu_type1_dirty_bitmap_get {
+   __u64  iova;/* IO virtual address */
+   __u64  size;/* Size of iova range */
+   struct vfio_bitmap bitmap;
+};
+
+#define VFIO_IOMMU_DIRTY_PAGES _IO(VFIO_TYPE, VFIO_BASE + 17)
+
 /*  Additional API for SPAPR TCE (Server POWERPC) IOMMU  */
 
 /*
-- 
2.7.0
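
A hedged user-space sketch of the GET_BITMAP operation defined above (not from
the patch): the _get structure is assumed to sit immediately after the fixed
header fields in data[], a vfio.h with this patch applied is assumed, pgsize
would come from the migration capability reported via VFIO_IOMMU_GET_INFO, and
out_bitmap must be a zeroed buffer sized for the requested range.

#include <linux/vfio.h>
#include <stdlib.h>
#include <sys/ioctl.h>

static int get_dirty_bitmap(int container_fd, __u64 iova, __u64 size,
                            __u64 pgsize, __u64 *out_bitmap)
{
        size_t argsz = sizeof(struct vfio_iommu_type1_dirty_bitmap) +
                       sizeof(struct vfio_iommu_type1_dirty_bitmap_get);
        struct vfio_iommu_type1_dirty_bitmap *hdr = calloc(1, argsz);
        struct vfio_iommu_type1_dirty_bitmap_get *get;
        int ret;

        if (!hdr)
                return -1;

        /* the _get payload follows the fixed header in data[] */
        get = (struct vfio_iommu_type1_dirty_bitmap_get *)(hdr + 1);
        hdr->argsz = argsz;
        hdr->flags = VFIO_IOMMU_DIRTY_PAGES_FLAG_GET_BITMAP;
        get->iova = iova;
        get->size = size;
        get->bitmap.pgsize = pgsize;
        get->bitmap.size = (size / pgsize + 63) / 64 * 8; /* bytes, caller-zeroed */
        get->bitmap.data = out_bitmap;

        /* Fails with -EINVAL unless tracking was started with _START first
         * (see the review discussion later in this thread). */
        ret = ioctl(container_fd, VFIO_IOMMU_DIRTY_PAGES, hdr);
        free(hdr);
        return ret;
}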




Re: [PATCH Kernel v20 4/8] vfio iommu: Add ioctl definition for dirty pages tracking

2020-05-15 Thread Kirti Wankhede




On 5/15/2020 4:29 PM, Cornelia Huck wrote:

On Fri, 15 May 2020 02:07:43 +0530
Kirti Wankhede  wrote:


IOMMU container maintains a list of all pages pinned by vfio_pin_pages API.
All pages pinned by vendor driver through this API should be considered as
dirty during migration. When container consists of IOMMU capable device and
all pages are pinned and mapped, then all pages are marked dirty.
Added support to start/stop dirtied pages tracking and to get bitmap of all
dirtied pages for requested IO virtual address range.

Signed-off-by: Kirti Wankhede 
Reviewed-by: Neo Jia 
---
  include/uapi/linux/vfio.h | 55 +++
  1 file changed, 55 insertions(+)

diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h
index ad9bb5af3463..123de3bc2dce 100644
--- a/include/uapi/linux/vfio.h
+++ b/include/uapi/linux/vfio.h
@@ -1033,6 +1033,12 @@ struct vfio_iommu_type1_dma_map {
  
  #define VFIO_IOMMU_MAP_DMA _IO(VFIO_TYPE, VFIO_BASE + 13)
  
+struct vfio_bitmap {

+   __u64pgsize;/* page size for bitmap in bytes */
+   __u64size;  /* in bytes */
+   __u64 __user *data; /* one bit per page */
+};
+
  /**
   * VFIO_IOMMU_UNMAP_DMA - _IOWR(VFIO_TYPE, VFIO_BASE + 14,
   *struct vfio_dma_unmap)
@@ -1059,6 +1065,55 @@ struct vfio_iommu_type1_dma_unmap {
  #define VFIO_IOMMU_ENABLE _IO(VFIO_TYPE, VFIO_BASE + 15)
  #define VFIO_IOMMU_DISABLE_IO(VFIO_TYPE, VFIO_BASE + 16)
  
+/**

+ * VFIO_IOMMU_DIRTY_PAGES - _IOWR(VFIO_TYPE, VFIO_BASE + 17,
+ * struct vfio_iommu_type1_dirty_bitmap)
+ * IOCTL is used for dirty pages tracking.
+ * Caller should set flag depending on which operation to perform, details as
+ * below:
+ *
+ * When IOCTL is called with VFIO_IOMMU_DIRTY_PAGES_FLAG_START set, indicates
+ * migration is active and IOMMU module should track pages which are dirtied or
+ * potentially dirtied by device.


"Calling the IOCTL with VFIO_IOMMU_DIRTY_PAGES_FLAG_START instructs the
IOMMU driver to track pages that are dirtied or potentially dirtied by
the device; designed to be used when a migration is in progress."

?



Ok, updating.


+ * Dirty pages are tracked until tracking is stopped by user application by
+ * setting VFIO_IOMMU_DIRTY_PAGES_FLAG_STOP flag.


"...by calling the IOCTL with VFIO_IOMMU_DIRTY_PAGES_FLAG_STOP." ?


+ *
+ * When IOCTL is called with VFIO_IOMMU_DIRTY_PAGES_FLAG_STOP set, indicates
+ * IOMMU should stop tracking dirtied pages.


"Calling the IOCTL with VFIO_IOMMU_DIRTY_PAGES_FLAG_STOP instructs the
IOMMU driver to stop tracking dirtied pages."

?



Ok.


+ *
+ * When IOCTL is called with VFIO_IOMMU_DIRTY_PAGES_FLAG_GET_BITMAP flag set,
+ * IOCTL returns dirty pages bitmap for IOMMU container during migration for
+ * given IOVA range.


"Calling the IOCTL with VFIO_IOMMU_DIRTY_PAGES_GET_BITMAP returns the
dirty pages bitmap for the IOMMU container for a given IOVA range." ?

Q: How does this interact with the two other operations? I imagine
getting an empty bitmap before _START 


No, if dirty page tracking is not started, get_bitmap IOCTL will fail 
with -EINVAL.



and a bitmap-in-progress between
_START and _STOP. > After _STOP, will subsequent calls always give the
same bitmap?



No, return -EINVAL.



User must provide data[] as the structure
+ * vfio_iommu_type1_dirty_bitmap_get through which user provides IOVA range and
+ * pgsize.


"The user must specify the IOVA range and the pgsize through the
vfio_iommu_type1_dirty_bitmap_get structure in the data[] portion."

?


This interface supports to get bitmap of smallest supported pgsize
+ * only and can be modified in future to get bitmap of specified pgsize.


That's a current restriction? How can the user find out whether it has
been lifted (or, more generally, find out which pgsize values are
supported)?


Migration capability is added to IOMMU info chain. That gives supported 
pgsize bitmap by IOMMU driver.
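
For illustration only, a user application could discover that pgsize bitmap
(and the maximum dirty bitmap size) by walking the VFIO_IOMMU_GET_INFO
capability chain, roughly as below. This sketch assumes the
VFIO_IOMMU_TYPE1_INFO_CAP_MIGRATION capability added later in this series, an
existing container_fd, and skips error handling.

	struct vfio_iommu_type1_info *info;
	struct vfio_info_cap_header *hdr;
	uint64_t min_pgsize = 0, max_bitmap_size = 0;
	uint32_t argsz = sizeof(*info), off;

	info = calloc(1, argsz);
	info->argsz = argsz;
	ioctl(container_fd, VFIO_IOMMU_GET_INFO, info);	/* kernel reports needed argsz */
	argsz = info->argsz;
	info = realloc(info, argsz);
	memset(info, 0, argsz);
	info->argsz = argsz;
	ioctl(container_fd, VFIO_IOMMU_GET_INFO, info);

	if (info->flags & VFIO_IOMMU_INFO_CAPS) {
		for (off = info->cap_offset; off; off = hdr->next) {
			hdr = (struct vfio_info_cap_header *)((char *)info + off);
			if (hdr->id == VFIO_IOMMU_TYPE1_INFO_CAP_MIGRATION) {
				struct vfio_iommu_type1_info_cap_migration *mig =
					(struct vfio_iommu_type1_info_cap_migration *)hdr;
				/* lowest set bit = smallest pgsize GET_BITMAP accepts */
				min_pgsize = mig->pgsize_bitmap & -mig->pgsize_bitmap;
				max_bitmap_size = mig->max_dirty_bitmap_size;
			}
		}
	}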





+ * User must allocate memory for bitmap, zero the bitmap memory  and set size
+ * of allocated memory in bitmap.size field.


"The user must provide a zeroed memory area for the bitmap memory and
specify its size in bitmap.size."

?


One bit is used to represent one
+ * page consecutively starting from iova offset. User should provide page size
+ * in bitmap.pgsize field.


s/User/The user/

Is that the 'pgsize' the comment above talks about?



By specifying pgsize here user can ask for bitmap of specific pgsize.


Bit set in bitmap indicates page at that offset from
+ * iova is dirty.


"A bit set in the bitmap indicates that the page at that offset from
iova is dirty." ?


Caller must set argsz including size of structure
+ * vfio_iommu_type1_dirty_bitmap_get.


s/Caller/The caller/

Does argz also include the size of the bitmap?


No.




+ *
+ * Only one of t

Re: [PATCH Kernel v20 5/8] vfio iommu: Implementation of ioctl for dirty pages tracking

2020-05-15 Thread Kirti Wankhede




On 5/15/2020 8:45 PM, Alex Williamson wrote:

On Fri, 15 May 2020 16:44:38 +0530
Kirti Wankhede  wrote:


On 5/15/2020 3:35 PM, Yan Zhao wrote:

On Fri, May 15, 2020 at 02:07:44AM +0530, Kirti Wankhede wrote:

VFIO_IOMMU_DIRTY_PAGES ioctl performs three operations:
- Start dirty pages tracking while migration is active
- Stop dirty pages tracking.
- Get dirty pages bitmap. Its user space application's responsibility to
copy content of dirty pages from source to destination during migration.

To prevent DoS attack, memory for bitmap is allocated per vfio_dma
structure. Bitmap size is calculated considering smallest supported page
size. Bitmap is allocated for all vfio_dmas when dirty logging is enabled

Bitmap is populated for already pinned pages when bitmap is allocated for
a vfio_dma with the smallest supported page size. Update bitmap from
pinning functions when tracking is enabled. When user application queries
bitmap, check if requested page size is same as page size used to
populated bitmap. If it is equal, copy bitmap, but if not equal, return
error.

Signed-off-by: Kirti Wankhede 
Reviewed-by: Neo Jia 

Fixed error reported by build bot by changing pgsize type from uint64_t
to size_t.
Reported-by: kbuild test robot 
---
   drivers/vfio/vfio_iommu_type1.c | 294 
+++-
   1 file changed, 288 insertions(+), 6 deletions(-)

diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
index de17787ffece..b76d3b14abfd 100644
--- a/drivers/vfio/vfio_iommu_type1.c
+++ b/drivers/vfio/vfio_iommu_type1.c
@@ -72,6 +72,7 @@ struct vfio_iommu {
uint64_tpgsize_bitmap;
boolv2;
boolnesting;
+   booldirty_page_tracking;
   };
   
   struct vfio_domain {

@@ -92,6 +93,7 @@ struct vfio_dma {
boollock_cap;   /* capable(CAP_IPC_LOCK) */
struct task_struct  *task;
struct rb_root  pfn_list;   /* Ex-user pinned pfn list */
+   unsigned long   *bitmap;
   };
   
   struct vfio_group {

@@ -126,6 +128,19 @@ struct vfio_regions {
   #define IS_IOMMU_CAP_DOMAIN_IN_CONTAINER(iommu)  \
(!list_empty(>domain_list))
   
+#define DIRTY_BITMAP_BYTES(n)	(ALIGN(n, BITS_PER_TYPE(u64)) / BITS_PER_BYTE)

+
+/*
+ * Input argument of number of bits to bitmap_set() is unsigned integer, which
+ * further casts to signed integer for unaligned multi-bit operation,
+ * __bitmap_set().
+ * Then maximum bitmap size supported is 2^31 bits divided by 2^3 bits/byte,
+ * that is 2^28 (256 MB) which maps to 2^31 * 2^12 = 2^43 (8TB) on 4K page
+ * system.
+ */
+#define DIRTY_BITMAP_PAGES_MAX  ((u64)INT_MAX)
+#define DIRTY_BITMAP_SIZE_MAX   DIRTY_BITMAP_BYTES(DIRTY_BITMAP_PAGES_MAX)
+
   static int put_pfn(unsigned long pfn, int prot);
   
   /*

@@ -176,6 +191,74 @@ static void vfio_unlink_dma(struct vfio_iommu *iommu, 
struct vfio_dma *old)
rb_erase(>node, >dma_list);
   }
   
+

+static int vfio_dma_bitmap_alloc(struct vfio_dma *dma, size_t pgsize)
+{
+   uint64_t npages = dma->size / pgsize;
+
+   if (npages > DIRTY_BITMAP_PAGES_MAX)
+   return -EINVAL;
+
+   dma->bitmap = kvzalloc(DIRTY_BITMAP_BYTES(npages), GFP_KERNEL);
+   if (!dma->bitmap)
+   return -ENOMEM;
+
+   return 0;
+}
+
+static void vfio_dma_bitmap_free(struct vfio_dma *dma)
+{
+   kfree(dma->bitmap);
+   dma->bitmap = NULL;
+}
+
+static void vfio_dma_populate_bitmap(struct vfio_dma *dma, size_t pgsize)
+{
+   struct rb_node *p;
+
+   for (p = rb_first(>pfn_list); p; p = rb_next(p)) {
+   struct vfio_pfn *vpfn = rb_entry(p, struct vfio_pfn, node);
+
+   bitmap_set(dma->bitmap, (vpfn->iova - dma->iova) / pgsize, 1);
+   }
+}
+
+static int vfio_dma_bitmap_alloc_all(struct vfio_iommu *iommu, size_t pgsize)
+{
+   struct rb_node *n = rb_first(>dma_list);
+
+   for (; n; n = rb_next(n)) {
+   struct vfio_dma *dma = rb_entry(n, struct vfio_dma, node);
+   int ret;
+
+   ret = vfio_dma_bitmap_alloc(dma, pgsize);
+   if (ret) {
+   struct rb_node *p = rb_prev(n);
+
+   for (; p; p = rb_prev(p)) {
+   struct vfio_dma *dma = rb_entry(n,
+   struct vfio_dma, node);
+
+   vfio_dma_bitmap_free(dma);
+   }
+   return ret;
+   }
+   vfio_dma_populate_bitmap(dma, pgsize);
+   }
+   return 0;
+}
+
+static void vfio_dma_bitmap_free_all(struct vfio_iommu *iommu)
+{
+   struct rb_node *n = rb_first(>dma_list);
+
+   for (; n; n = rb_next(n)) {
+   struct vfio

Re: [PATCH Kernel v20 6/8] vfio iommu: Update UNMAP_DMA ioctl to get dirty bitmap before unmap

2020-05-15 Thread Kirti Wankhede




On 5/15/2020 7:01 PM, Alex Williamson wrote:

On Fri, 15 May 2020 12:17:03 +0530
Kirti Wankhede  wrote:


On 5/15/2020 11:17 AM, Alex Williamson wrote:

On Fri, 15 May 2020 09:46:43 +0530
Kirti Wankhede  wrote:
   

On 5/15/2020 8:57 AM, Alex Williamson wrote:

On Fri, 15 May 2020 02:07:45 +0530
Kirti Wankhede  wrote:
  

DMA mapped pages, including those pinned by mdev vendor drivers, might
get unpinned and unmapped while migration is active and device is still
running. For example, in pre-copy phase while guest driver could access
those pages, host device or vendor driver can dirty these mapped pages.
Such pages should be marked dirty so as to maintain memory consistency
for a user making use of dirty page tracking.

To get bitmap during unmap, user should allocate memory for bitmap, set
it all zeros, set size of allocated memory, set page size to be
considered for bitmap and set flag VFIO_DMA_UNMAP_FLAG_GET_DIRTY_BITMAP.

Signed-off-by: Kirti Wankhede 
Reviewed-by: Neo Jia 
---
drivers/vfio/vfio_iommu_type1.c | 77 
++---
include/uapi/linux/vfio.h   | 10 ++
2 files changed, 75 insertions(+), 12 deletions(-)

diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
index b76d3b14abfd..a1dc57bcece5 100644
--- a/drivers/vfio/vfio_iommu_type1.c
+++ b/drivers/vfio/vfio_iommu_type1.c
@@ -195,11 +195,15 @@ static void vfio_unlink_dma(struct vfio_iommu *iommu, 
struct vfio_dma *old)
static int vfio_dma_bitmap_alloc(struct vfio_dma *dma, size_t pgsize)
{
uint64_t npages = dma->size / pgsize;
+   size_t bitmap_size;

	if (npages > DIRTY_BITMAP_PAGES_MAX)

return -EINVAL;

-	dma->bitmap = kvzalloc(DIRTY_BITMAP_BYTES(npages), GFP_KERNEL);

+   /* Allocate extra 64 bits which are used for bitmap manipulation */
+   bitmap_size = DIRTY_BITMAP_BYTES(npages) + sizeof(u64);
+
+   dma->bitmap = kvzalloc(bitmap_size, GFP_KERNEL);
if (!dma->bitmap)
return -ENOMEM;

@@ -999,23 +1003,25 @@ static int verify_bitmap_size(uint64_t npages, uint64_t bitmap_size)

}

static int vfio_dma_do_unmap(struct vfio_iommu *iommu,

-struct vfio_iommu_type1_dma_unmap *unmap)
+struct vfio_iommu_type1_dma_unmap *unmap,
+struct vfio_bitmap *bitmap)
{
-   uint64_t mask;
struct vfio_dma *dma, *dma_last = NULL;
-   size_t unmapped = 0;
+   size_t unmapped = 0, pgsize;
int ret = 0, retries = 0;
+   unsigned long pgshift;

	mutex_lock(>lock);

-	mask = ((uint64_t)1 << __ffs(iommu->pgsize_bitmap)) - 1;

+   pgshift = __ffs(iommu->pgsize_bitmap);
+   pgsize = (size_t)1 << pgshift;

-	if (unmap->iova & mask) {

+   if (unmap->iova & (pgsize - 1)) {
ret = -EINVAL;
goto unlock;
}

-	if (!unmap->size || unmap->size & mask) {

+   if (!unmap->size || unmap->size & (pgsize - 1)) {
ret = -EINVAL;
goto unlock;
}
@@ -1026,9 +1032,15 @@ static int vfio_dma_do_unmap(struct vfio_iommu *iommu,
goto unlock;
}

-	WARN_ON(mask & PAGE_MASK);

-again:
+   /* When dirty tracking is enabled, allow only min supported pgsize */
+   if ((unmap->flags & VFIO_DMA_UNMAP_FLAG_GET_DIRTY_BITMAP) &&
+   (!iommu->dirty_page_tracking || (bitmap->pgsize != pgsize))) {
+   ret = -EINVAL;
+   goto unlock;
+   }

+	WARN_ON((pgsize - 1) & PAGE_MASK);

+again:
/*
 * vfio-iommu-type1 (v1) - User mappings were coalesced together to
 * avoid tracking individual mappings.  This means that the granularity
@@ -1066,6 +1078,7 @@ static int vfio_dma_do_unmap(struct vfio_iommu *iommu,
ret = -EINVAL;
goto unlock;
}
+
dma = vfio_find_dma(iommu, unmap->iova + unmap->size - 1, 0);
if (dma && dma->iova + dma->size != unmap->iova + unmap->size) {
ret = -EINVAL;
@@ -1083,6 +1096,23 @@ static int vfio_dma_do_unmap(struct vfio_iommu *iommu,
if (dma->task->mm != current->mm)
break;

+		if ((unmap->flags & VFIO_DMA_UNMAP_FLAG_GET_DIRTY_BITMAP) &&

+   (dma_last != dma)) {
+
+   /*
+* mark all pages dirty if all pages are pinned and
+* mapped
+*/
+   if (dma->iommu_mapped)
+   bitmap_set(dma->bitmap, 0,
+  dma->size >> pgshift);


Nit, all the callers of update_user_bitmap() precede the call with this
identical operation, we should probably push it into the function to do
it.

Re: [PATCH Kernel v20 5/8] vfio iommu: Implementation of ioctl for dirty pages tracking

2020-05-15 Thread Kirti Wankhede




On 5/15/2020 3:35 PM, Yan Zhao wrote:

On Fri, May 15, 2020 at 02:07:44AM +0530, Kirti Wankhede wrote:

VFIO_IOMMU_DIRTY_PAGES ioctl performs three operations:
- Start dirty pages tracking while migration is active
- Stop dirty pages tracking.
- Get dirty pages bitmap. Its user space application's responsibility to
   copy content of dirty pages from source to destination during migration.

To prevent DoS attack, memory for bitmap is allocated per vfio_dma
structure. Bitmap size is calculated considering smallest supported page
size. Bitmap is allocated for all vfio_dmas when dirty logging is enabled

Bitmap is populated for already pinned pages when bitmap is allocated for
a vfio_dma with the smallest supported page size. Update bitmap from
pinning functions when tracking is enabled. When user application queries
bitmap, check if requested page size is same as page size used to
populated bitmap. If it is equal, copy bitmap, but if not equal, return
error.

Signed-off-by: Kirti Wankhede 
Reviewed-by: Neo Jia 

Fixed error reported by build bot by changing pgsize type from uint64_t
to size_t.
Reported-by: kbuild test robot 
---
  drivers/vfio/vfio_iommu_type1.c | 294 +++-
  1 file changed, 288 insertions(+), 6 deletions(-)

diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
index de17787ffece..b76d3b14abfd 100644
--- a/drivers/vfio/vfio_iommu_type1.c
+++ b/drivers/vfio/vfio_iommu_type1.c
@@ -72,6 +72,7 @@ struct vfio_iommu {
uint64_tpgsize_bitmap;
boolv2;
boolnesting;
+   booldirty_page_tracking;
  };
  
  struct vfio_domain {

@@ -92,6 +93,7 @@ struct vfio_dma {
boollock_cap;   /* capable(CAP_IPC_LOCK) */
struct task_struct  *task;
struct rb_root  pfn_list;   /* Ex-user pinned pfn list */
+   unsigned long   *bitmap;
  };
  
  struct vfio_group {

@@ -126,6 +128,19 @@ struct vfio_regions {
  #define IS_IOMMU_CAP_DOMAIN_IN_CONTAINER(iommu)   \
(!list_empty(>domain_list))
  
+#define DIRTY_BITMAP_BYTES(n)	(ALIGN(n, BITS_PER_TYPE(u64)) / BITS_PER_BYTE)

+
+/*
+ * Input argument of number of bits to bitmap_set() is unsigned integer, which
+ * further casts to signed integer for unaligned multi-bit operation,
+ * __bitmap_set().
+ * Then maximum bitmap size supported is 2^31 bits divided by 2^3 bits/byte,
+ * that is 2^28 (256 MB) which maps to 2^31 * 2^12 = 2^43 (8TB) on 4K page
+ * system.
+ */
+#define DIRTY_BITMAP_PAGES_MAX  ((u64)INT_MAX)
+#define DIRTY_BITMAP_SIZE_MAX   DIRTY_BITMAP_BYTES(DIRTY_BITMAP_PAGES_MAX)
+
  static int put_pfn(unsigned long pfn, int prot);
  
  /*

@@ -176,6 +191,74 @@ static void vfio_unlink_dma(struct vfio_iommu *iommu, 
struct vfio_dma *old)
rb_erase(>node, >dma_list);
  }
  
+

+static int vfio_dma_bitmap_alloc(struct vfio_dma *dma, size_t pgsize)
+{
+   uint64_t npages = dma->size / pgsize;
+
+   if (npages > DIRTY_BITMAP_PAGES_MAX)
+   return -EINVAL;
+
+   dma->bitmap = kvzalloc(DIRTY_BITMAP_BYTES(npages), GFP_KERNEL);
+   if (!dma->bitmap)
+   return -ENOMEM;
+
+   return 0;
+}
+
+static void vfio_dma_bitmap_free(struct vfio_dma *dma)
+{
+   kfree(dma->bitmap);
+   dma->bitmap = NULL;
+}
+
+static void vfio_dma_populate_bitmap(struct vfio_dma *dma, size_t pgsize)
+{
+   struct rb_node *p;
+
+   for (p = rb_first(>pfn_list); p; p = rb_next(p)) {
+   struct vfio_pfn *vpfn = rb_entry(p, struct vfio_pfn, node);
+
+   bitmap_set(dma->bitmap, (vpfn->iova - dma->iova) / pgsize, 1);
+   }
+}
+
+static int vfio_dma_bitmap_alloc_all(struct vfio_iommu *iommu, size_t pgsize)
+{
+   struct rb_node *n = rb_first(>dma_list);
+
+   for (; n; n = rb_next(n)) {
+   struct vfio_dma *dma = rb_entry(n, struct vfio_dma, node);
+   int ret;
+
+   ret = vfio_dma_bitmap_alloc(dma, pgsize);
+   if (ret) {
+   struct rb_node *p = rb_prev(n);
+
+   for (; p; p = rb_prev(p)) {
+   struct vfio_dma *dma = rb_entry(n,
+   struct vfio_dma, node);
+
+   vfio_dma_bitmap_free(dma);
+   }
+   return ret;
+   }
+   vfio_dma_populate_bitmap(dma, pgsize);
+   }
+   return 0;
+}
+
+static void vfio_dma_bitmap_free_all(struct vfio_iommu *iommu)
+{
+   struct rb_node *n = rb_first(>dma_list);
+
+   for (; n; n = rb_next(n)) {
+   struct vfio_dma *dma = rb_entry(n, struct vfio_dma, node);
+
+   vfio_dma_bitmap_free(dma);
+   }
+}
+
  /*
   * Helper Functions for host iova-p

Re: [PATCH Kernel v20 6/8] vfio iommu: Update UNMAP_DMA ioctl to get dirty bitmap before unmap

2020-05-15 Thread Kirti Wankhede




On 5/15/2020 11:17 AM, Alex Williamson wrote:

On Fri, 15 May 2020 09:46:43 +0530
Kirti Wankhede  wrote:


On 5/15/2020 8:57 AM, Alex Williamson wrote:

On Fri, 15 May 2020 02:07:45 +0530
Kirti Wankhede  wrote:
   

DMA mapped pages, including those pinned by mdev vendor drivers, might
get unpinned and unmapped while migration is active and device is still
running. For example, in pre-copy phase while guest driver could access
those pages, host device or vendor driver can dirty these mapped pages.
Such pages should be marked dirty so as to maintain memory consistency
for a user making use of dirty page tracking.

To get bitmap during unmap, user should allocate memory for bitmap, set
it all zeros, set size of allocated memory, set page size to be
considered for bitmap and set flag VFIO_DMA_UNMAP_FLAG_GET_DIRTY_BITMAP.

Signed-off-by: Kirti Wankhede 
Reviewed-by: Neo Jia 
---
   drivers/vfio/vfio_iommu_type1.c | 77 
++---
   include/uapi/linux/vfio.h   | 10 ++
   2 files changed, 75 insertions(+), 12 deletions(-)

diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
index b76d3b14abfd..a1dc57bcece5 100644
--- a/drivers/vfio/vfio_iommu_type1.c
+++ b/drivers/vfio/vfio_iommu_type1.c
@@ -195,11 +195,15 @@ static void vfio_unlink_dma(struct vfio_iommu *iommu, 
struct vfio_dma *old)
   static int vfio_dma_bitmap_alloc(struct vfio_dma *dma, size_t pgsize)
   {
uint64_t npages = dma->size / pgsize;
+   size_t bitmap_size;
   
   	if (npages > DIRTY_BITMAP_PAGES_MAX)

return -EINVAL;
   
-	dma->bitmap = kvzalloc(DIRTY_BITMAP_BYTES(npages), GFP_KERNEL);

+   /* Allocate extra 64 bits which are used for bitmap manipulation */
+   bitmap_size = DIRTY_BITMAP_BYTES(npages) + sizeof(u64);
+
+   dma->bitmap = kvzalloc(bitmap_size, GFP_KERNEL);
if (!dma->bitmap)
return -ENOMEM;
   
@@ -999,23 +1003,25 @@ static int verify_bitmap_size(uint64_t npages, uint64_t bitmap_size)

   }
   
   static int vfio_dma_do_unmap(struct vfio_iommu *iommu,

-struct vfio_iommu_type1_dma_unmap *unmap)
+struct vfio_iommu_type1_dma_unmap *unmap,
+struct vfio_bitmap *bitmap)
   {
-   uint64_t mask;
struct vfio_dma *dma, *dma_last = NULL;
-   size_t unmapped = 0;
+   size_t unmapped = 0, pgsize;
int ret = 0, retries = 0;
+   unsigned long pgshift;
   
   	mutex_lock(>lock);
   
-	mask = ((uint64_t)1 << __ffs(iommu->pgsize_bitmap)) - 1;

+   pgshift = __ffs(iommu->pgsize_bitmap);
+   pgsize = (size_t)1 << pgshift;
   
-	if (unmap->iova & mask) {

+   if (unmap->iova & (pgsize - 1)) {
ret = -EINVAL;
goto unlock;
}
   
-	if (!unmap->size || unmap->size & mask) {

+   if (!unmap->size || unmap->size & (pgsize - 1)) {
ret = -EINVAL;
goto unlock;
}
@@ -1026,9 +1032,15 @@ static int vfio_dma_do_unmap(struct vfio_iommu *iommu,
goto unlock;
}
   
-	WARN_ON(mask & PAGE_MASK);

-again:
+   /* When dirty tracking is enabled, allow only min supported pgsize */
+   if ((unmap->flags & VFIO_DMA_UNMAP_FLAG_GET_DIRTY_BITMAP) &&
+   (!iommu->dirty_page_tracking || (bitmap->pgsize != pgsize))) {
+   ret = -EINVAL;
+   goto unlock;
+   }
   
+	WARN_ON((pgsize - 1) & PAGE_MASK);

+again:
/*
 * vfio-iommu-type1 (v1) - User mappings were coalesced together to
 * avoid tracking individual mappings.  This means that the granularity
@@ -1066,6 +1078,7 @@ static int vfio_dma_do_unmap(struct vfio_iommu *iommu,
ret = -EINVAL;
goto unlock;
}
+
dma = vfio_find_dma(iommu, unmap->iova + unmap->size - 1, 0);
if (dma && dma->iova + dma->size != unmap->iova + unmap->size) {
ret = -EINVAL;
@@ -1083,6 +1096,23 @@ static int vfio_dma_do_unmap(struct vfio_iommu *iommu,
if (dma->task->mm != current->mm)
break;
   
+		if ((unmap->flags & VFIO_DMA_UNMAP_FLAG_GET_DIRTY_BITMAP) &&

+   (dma_last != dma)) {
+
+   /*
+* mark all pages dirty if all pages are pinned and
+* mapped
+*/
+   if (dma->iommu_mapped)
+   bitmap_set(dma->bitmap, 0,
+  dma->size >> pgshift);


Nit, all the callers of update_user_bitmap() precede the call with this
identical operation, we should probably push it into the function to do
it.
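
For reference, a rough sketch of that refactor is below. It is not from the
series: the signature is inferred from the call sites quoted in this thread,
and the copy-out is simplified, assuming the requested range starts on a byte
boundary of the user-supplied bitmap.

	static int update_user_bitmap(u64 __user *bitmap, struct vfio_dma *dma,
				      dma_addr_t base_iova, size_t pgsize)
	{
		unsigned long pgshift = __ffs(pgsize);
		unsigned long nbits = dma->size >> pgshift;
		unsigned long bit_offset = (dma->iova - base_iova) >> pgshift;

		/* mark all pages dirty if all pages are pinned and mapped */
		if (dma->iommu_mapped)
			bitmap_set(dma->bitmap, 0, nbits);

		/* simplified copy-out; assumes bit_offset is a multiple of 8 */
		if (copy_to_user((void __user *)bitmap + bit_offset / BITS_PER_BYTE,
				 dma->bitmap, DIRTY_BITMAP_BYTES(nbits)))
			return -EFAULT;

		return 0;
	}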
   

+
+ 

Re: [PATCH Kernel v20 6/8] vfio iommu: Update UNMAP_DMA ioctl to get dirty bitmap before unmap

2020-05-14 Thread Kirti Wankhede




On 5/15/2020 8:57 AM, Alex Williamson wrote:

On Fri, 15 May 2020 02:07:45 +0530
Kirti Wankhede  wrote:


DMA mapped pages, including those pinned by mdev vendor drivers, might
get unpinned and unmapped while migration is active and device is still
running. For example, in pre-copy phase while guest driver could access
those pages, host device or vendor driver can dirty these mapped pages.
Such pages should be marked dirty so as to maintain memory consistency
for a user making use of dirty page tracking.

To get bitmap during unmap, user should allocate memory for bitmap, set
it all zeros, set size of allocated memory, set page size to be
considered for bitmap and set flag VFIO_DMA_UNMAP_FLAG_GET_DIRTY_BITMAP.

Signed-off-by: Kirti Wankhede 
Reviewed-by: Neo Jia 
---
  drivers/vfio/vfio_iommu_type1.c | 77 ++---
  include/uapi/linux/vfio.h   | 10 ++
  2 files changed, 75 insertions(+), 12 deletions(-)

diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
index b76d3b14abfd..a1dc57bcece5 100644
--- a/drivers/vfio/vfio_iommu_type1.c
+++ b/drivers/vfio/vfio_iommu_type1.c
@@ -195,11 +195,15 @@ static void vfio_unlink_dma(struct vfio_iommu *iommu, 
struct vfio_dma *old)
  static int vfio_dma_bitmap_alloc(struct vfio_dma *dma, size_t pgsize)
  {
uint64_t npages = dma->size / pgsize;
+   size_t bitmap_size;
  
  	if (npages > DIRTY_BITMAP_PAGES_MAX)

return -EINVAL;
  
-	dma->bitmap = kvzalloc(DIRTY_BITMAP_BYTES(npages), GFP_KERNEL);

+   /* Allocate extra 64 bits which are used for bitmap manipulation */
+   bitmap_size = DIRTY_BITMAP_BYTES(npages) + sizeof(u64);
+
+   dma->bitmap = kvzalloc(bitmap_size, GFP_KERNEL);
if (!dma->bitmap)
return -ENOMEM;
  
@@ -999,23 +1003,25 @@ static int verify_bitmap_size(uint64_t npages, uint64_t bitmap_size)

  }
  
  static int vfio_dma_do_unmap(struct vfio_iommu *iommu,

-struct vfio_iommu_type1_dma_unmap *unmap)
+struct vfio_iommu_type1_dma_unmap *unmap,
+struct vfio_bitmap *bitmap)
  {
-   uint64_t mask;
struct vfio_dma *dma, *dma_last = NULL;
-   size_t unmapped = 0;
+   size_t unmapped = 0, pgsize;
int ret = 0, retries = 0;
+   unsigned long pgshift;
  
  	mutex_lock(>lock);
  
-	mask = ((uint64_t)1 << __ffs(iommu->pgsize_bitmap)) - 1;

+   pgshift = __ffs(iommu->pgsize_bitmap);
+   pgsize = (size_t)1 << pgshift;
  
-	if (unmap->iova & mask) {

+   if (unmap->iova & (pgsize - 1)) {
ret = -EINVAL;
goto unlock;
}
  
-	if (!unmap->size || unmap->size & mask) {

+   if (!unmap->size || unmap->size & (pgsize - 1)) {
ret = -EINVAL;
goto unlock;
}
@@ -1026,9 +1032,15 @@ static int vfio_dma_do_unmap(struct vfio_iommu *iommu,
goto unlock;
}
  
-	WARN_ON(mask & PAGE_MASK);

-again:
+   /* When dirty tracking is enabled, allow only min supported pgsize */
+   if ((unmap->flags & VFIO_DMA_UNMAP_FLAG_GET_DIRTY_BITMAP) &&
+   (!iommu->dirty_page_tracking || (bitmap->pgsize != pgsize))) {
+   ret = -EINVAL;
+   goto unlock;
+   }
  
+	WARN_ON((pgsize - 1) & PAGE_MASK);

+again:
/*
 * vfio-iommu-type1 (v1) - User mappings were coalesced together to
 * avoid tracking individual mappings.  This means that the granularity
@@ -1066,6 +1078,7 @@ static int vfio_dma_do_unmap(struct vfio_iommu *iommu,
ret = -EINVAL;
goto unlock;
}
+
dma = vfio_find_dma(iommu, unmap->iova + unmap->size - 1, 0);
if (dma && dma->iova + dma->size != unmap->iova + unmap->size) {
ret = -EINVAL;
@@ -1083,6 +1096,23 @@ static int vfio_dma_do_unmap(struct vfio_iommu *iommu,
if (dma->task->mm != current->mm)
break;
  
+		if ((unmap->flags & VFIO_DMA_UNMAP_FLAG_GET_DIRTY_BITMAP) &&

+   (dma_last != dma)) {
+
+   /*
+* mark all pages dirty if all pages are pinned and
+* mapped
+*/
+   if (dma->iommu_mapped)
+   bitmap_set(dma->bitmap, 0,
+  dma->size >> pgshift);


Nit, all the callers of update_user_bitmap() precede the call with this
identical operation, we should probably push it into the function to do
it.


+
+   ret = update_user_bitmap(bitmap->data, dma,
+unmap->iova, pgsize);
+

[PATCH Kernel v20 8/8] vfio: Selective dirty page tracking if IOMMU backed device pins pages

2020-05-14 Thread Kirti Wankhede
Added a check such that only singleton IOMMU groups can pin pages.
From the point when the vendor driver pins any pages, consider the IOMMU
group's dirty page scope to be limited to pinned pages.

To avoid walking the list often, added a flag,
pinned_page_dirty_scope, to indicate whether the dirty page scope of all
vfio_groups, for each vfio_domain in the domain_list, is limited to pinned
pages. This flag is updated on the first pinned pages request for that IOMMU
group and on attaching/detaching a group.
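
A sketch of how that flag could be recomputed is shown here; it is illustrative
only, assumes each vfio_domain keeps its groups on a group_list (as in
vfio_iommu_type1.c), and leaves out the external mdev-only domain for brevity.

	static void update_pinned_page_dirty_scope(struct vfio_iommu *iommu)
	{
		struct vfio_domain *domain;
		struct vfio_group *group;

		list_for_each_entry(domain, &iommu->domain_list, next) {
			list_for_each_entry(group, &domain->group_list, next) {
				if (!group->pinned_page_dirty_scope) {
					iommu->pinned_page_dirty_scope = false;
					return;
				}
			}
		}

		/* the external (mdev-only) domain, if any, would be checked here too */
		iommu->pinned_page_dirty_scope = true;
	}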

Signed-off-by: Kirti Wankhede 
Reviewed-by: Neo Jia 
---
 drivers/vfio/vfio.c |  13 +++--
 drivers/vfio/vfio_iommu_type1.c | 104 
 include/linux/vfio.h|   4 +-
 3 files changed, 109 insertions(+), 12 deletions(-)

diff --git a/drivers/vfio/vfio.c b/drivers/vfio/vfio.c
index 765e0e5d83ed..580099afeaff 100644
--- a/drivers/vfio/vfio.c
+++ b/drivers/vfio/vfio.c
@@ -85,6 +85,7 @@ struct vfio_group {
atomic_topened;
wait_queue_head_t   container_q;
boolnoiommu;
+   unsigned intdev_counter;
struct kvm  *kvm;
struct blocking_notifier_head   notifier;
 };
@@ -555,6 +556,7 @@ struct vfio_device *vfio_group_create_device(struct 
vfio_group *group,
 
mutex_lock(&group->device_lock);
list_add(&device->group_next, &group->device_list);
+   group->dev_counter++;
mutex_unlock(&group->device_lock);
 
return device;
@@ -567,6 +569,7 @@ static void vfio_device_release(struct kref *kref)
struct vfio_group *group = device->group;
 
list_del(&device->group_next);
+   group->dev_counter--;
mutex_unlock(&group->device_lock);
 
dev_set_drvdata(device->dev, NULL);
@@ -1945,6 +1948,9 @@ int vfio_pin_pages(struct device *dev, unsigned long 
*user_pfn, int npage,
if (!group)
return -ENODEV;
 
+   if (group->dev_counter > 1)
+   return -EINVAL;
+
ret = vfio_group_add_container_user(group);
if (ret)
goto err_pin_pages;
@@ -1952,7 +1958,8 @@ int vfio_pin_pages(struct device *dev, unsigned long 
*user_pfn, int npage,
container = group->container;
driver = container->iommu_driver;
if (likely(driver && driver->ops->pin_pages))
-   ret = driver->ops->pin_pages(container->iommu_data, user_pfn,
+   ret = driver->ops->pin_pages(container->iommu_data,
+group->iommu_group, user_pfn,
 npage, prot, phys_pfn);
else
ret = -ENOTTY;
@@ -2050,8 +2057,8 @@ int vfio_group_pin_pages(struct vfio_group *group,
driver = container->iommu_driver;
if (likely(driver && driver->ops->pin_pages))
ret = driver->ops->pin_pages(container->iommu_data,
-user_iova_pfn, npage,
-prot, phys_pfn);
+group->iommu_group, user_iova_pfn,
+npage, prot, phys_pfn);
else
ret = -ENOTTY;
 
diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
index 3edb3c3e6170..a52c4ae8907b 100644
--- a/drivers/vfio/vfio_iommu_type1.c
+++ b/drivers/vfio/vfio_iommu_type1.c
@@ -73,6 +73,7 @@ struct vfio_iommu {
boolv2;
boolnesting;
booldirty_page_tracking;
+   boolpinned_page_dirty_scope;
 };
 
 struct vfio_domain {
@@ -100,6 +101,7 @@ struct vfio_group {
struct iommu_group  *iommu_group;
struct list_headnext;
boolmdev_group; /* An mdev group */
+   boolpinned_page_dirty_scope;
 };
 
 struct vfio_iova {
@@ -143,6 +145,10 @@ struct vfio_regions {
 
 static int put_pfn(unsigned long pfn, int prot);
 
+static struct vfio_group *vfio_iommu_find_iommu_group(struct vfio_iommu *iommu,
+  struct iommu_group *iommu_group);
+
+static void update_pinned_page_dirty_scope(struct vfio_iommu *iommu);
 /*
  * This code handles mapping and unmapping of user data buffers
  * into DMA'ble space using the IOMMU
@@ -590,11 +596,13 @@ static int vfio_unpin_page_external(struct vfio_dma *dma, 
dma_addr_t iova,
 }
 
 static int vfio_iommu_type1_pin_pages(void *iommu_data,
+ struct iommu_group *iommu_group,
  unsigned long *user_pfn,
  int npage, int prot,
  unsigned long *phys_pfn)
 {
struct vfio_iommu *iommu = iommu_data;
+   stru

[PATCH Kernel v20 3/8] vfio iommu: Cache pgsize_bitmap in struct vfio_iommu

2020-05-14 Thread Kirti Wankhede
Calculate and cache pgsize_bitmap when iommu->domain_list is updated
and when iommu->external_domain is set for an mdev device.
Add iommu->lock protection when the cached pgsize_bitmap is accessed.

Signed-off-by: Kirti Wankhede 
Reviewed-by: Neo Jia 
---
 drivers/vfio/vfio_iommu_type1.c | 88 +++--
 1 file changed, 49 insertions(+), 39 deletions(-)

diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
index fa735047b04d..de17787ffece 100644
--- a/drivers/vfio/vfio_iommu_type1.c
+++ b/drivers/vfio/vfio_iommu_type1.c
@@ -69,6 +69,7 @@ struct vfio_iommu {
struct rb_root  dma_list;
struct blocking_notifier_head notifier;
unsigned intdma_avail;
+   uint64_tpgsize_bitmap;
boolv2;
boolnesting;
 };
@@ -805,15 +806,14 @@ static void vfio_remove_dma(struct vfio_iommu *iommu, 
struct vfio_dma *dma)
iommu->dma_avail++;
 }
 
-static unsigned long vfio_pgsize_bitmap(struct vfio_iommu *iommu)
+static void vfio_pgsize_bitmap(struct vfio_iommu *iommu)
 {
struct vfio_domain *domain;
-   unsigned long bitmap = ULONG_MAX;
 
-   mutex_lock(&iommu->lock);
+   iommu->pgsize_bitmap = ULONG_MAX;
+
list_for_each_entry(domain, &iommu->domain_list, next)
-   bitmap &= domain->domain->pgsize_bitmap;
-   mutex_unlock(&iommu->lock);
+   iommu->pgsize_bitmap &= domain->domain->pgsize_bitmap;
 
/*
 * In case the IOMMU supports page sizes smaller than PAGE_SIZE
@@ -823,12 +823,10 @@ static unsigned long vfio_pgsize_bitmap(struct vfio_iommu 
*iommu)
 * granularity while iommu driver can use the sub-PAGE_SIZE size
 * to map the buffer.
 */
-   if (bitmap & ~PAGE_MASK) {
-   bitmap &= PAGE_MASK;
-   bitmap |= PAGE_SIZE;
+   if (iommu->pgsize_bitmap & ~PAGE_MASK) {
+   iommu->pgsize_bitmap &= PAGE_MASK;
+   iommu->pgsize_bitmap |= PAGE_SIZE;
}
-
-   return bitmap;
 }
 
 static int vfio_dma_do_unmap(struct vfio_iommu *iommu,
@@ -839,19 +837,28 @@ static int vfio_dma_do_unmap(struct vfio_iommu *iommu,
size_t unmapped = 0;
int ret = 0, retries = 0;
 
-   mask = ((uint64_t)1 << __ffs(vfio_pgsize_bitmap(iommu))) - 1;
+   mutex_lock(&iommu->lock);
+
+   mask = ((uint64_t)1 << __ffs(iommu->pgsize_bitmap)) - 1;
+
+   if (unmap->iova & mask) {
+   ret = -EINVAL;
+   goto unlock;
+   }
+
+   if (!unmap->size || unmap->size & mask) {
+   ret = -EINVAL;
+   goto unlock;
+   }
 
-   if (unmap->iova & mask)
-   return -EINVAL;
-   if (!unmap->size || unmap->size & mask)
-   return -EINVAL;
if (unmap->iova + unmap->size - 1 < unmap->iova ||
-   unmap->size > SIZE_MAX)
-   return -EINVAL;
+   unmap->size > SIZE_MAX) {
+   ret = -EINVAL;
+   goto unlock;
+   }
 
WARN_ON(mask & PAGE_MASK);
 again:
-   mutex_lock(&iommu->lock);
 
/*
 * vfio-iommu-type1 (v1) - User mappings were coalesced together to
@@ -930,6 +937,7 @@ static int vfio_dma_do_unmap(struct vfio_iommu *iommu,
blocking_notifier_call_chain(&iommu->notifier,
VFIO_IOMMU_NOTIFY_DMA_UNMAP,
&nb_unmap);
+   mutex_lock(&iommu->lock);
goto again;
}
unmapped += dma->size;
@@ -1045,24 +1053,28 @@ static int vfio_dma_do_map(struct vfio_iommu *iommu,
if (map->size != size || map->vaddr != vaddr || map->iova != iova)
return -EINVAL;
 
-   mask = ((uint64_t)1 << __ffs(vfio_pgsize_bitmap(iommu))) - 1;
-
-   WARN_ON(mask & PAGE_MASK);
-
/* READ/WRITE from device perspective */
if (map->flags & VFIO_DMA_MAP_FLAG_WRITE)
prot |= IOMMU_WRITE;
if (map->flags & VFIO_DMA_MAP_FLAG_READ)
prot |= IOMMU_READ;
 
-   if (!prot || !size || (size | iova | vaddr) & mask)
-   return -EINVAL;
+   mutex_lock(&iommu->lock);
 
-   /* Don't allow IOVA or virtual address wrap */
-   if (iova + size - 1 < iova || vaddr + size - 1 < vaddr)
-   return -EINVAL;
+   mask = ((uint64_t)1 << __ffs(iommu->pgsize_bitmap)) - 1;
 
-   mutex_lock(&iommu->lock);
+   WARN_ON(mask & PAGE_MASK);
+
+   if (!prot || !size || (size | iova | vaddr) & mask) {
+   ret = -EINVAL;
+   goto out_unlock;
+   }
+
+   /* Don't allow IOVA or virtual address wrap */
+   if (iova + 

[PATCH Kernel v20 6/8] vfio iommu: Update UNMAP_DMA ioctl to get dirty bitmap before unmap

2020-05-14 Thread Kirti Wankhede
DMA mapped pages, including those pinned by mdev vendor drivers, might
get unpinned and unmapped while migration is active and the device is still
running. For example, in the pre-copy phase, while the guest driver can access
those pages, the host device or vendor driver can dirty these mapped pages.
Such pages should be marked dirty so as to maintain memory consistency
for a user making use of dirty page tracking.

To get the bitmap during unmap, the user should allocate memory for the bitmap,
zero it, set the size of the allocated memory, set the page size to be
considered for the bitmap, and set the VFIO_DMA_UNMAP_FLAG_GET_DIRTY_BITMAP flag.

Signed-off-by: Kirti Wankhede 
Reviewed-by: Neo Jia 
---
 drivers/vfio/vfio_iommu_type1.c | 77 ++---
 include/uapi/linux/vfio.h   | 10 ++
 2 files changed, 75 insertions(+), 12 deletions(-)

diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
index b76d3b14abfd..a1dc57bcece5 100644
--- a/drivers/vfio/vfio_iommu_type1.c
+++ b/drivers/vfio/vfio_iommu_type1.c
@@ -195,11 +195,15 @@ static void vfio_unlink_dma(struct vfio_iommu *iommu, 
struct vfio_dma *old)
 static int vfio_dma_bitmap_alloc(struct vfio_dma *dma, size_t pgsize)
 {
uint64_t npages = dma->size / pgsize;
+   size_t bitmap_size;
 
if (npages > DIRTY_BITMAP_PAGES_MAX)
return -EINVAL;
 
-   dma->bitmap = kvzalloc(DIRTY_BITMAP_BYTES(npages), GFP_KERNEL);
+   /* Allocate extra 64 bits which are used for bitmap manipulation */
+   bitmap_size = DIRTY_BITMAP_BYTES(npages) + sizeof(u64);
+
+   dma->bitmap = kvzalloc(bitmap_size, GFP_KERNEL);
if (!dma->bitmap)
return -ENOMEM;
 
@@ -999,23 +1003,25 @@ static int verify_bitmap_size(uint64_t npages, uint64_t 
bitmap_size)
 }
 
 static int vfio_dma_do_unmap(struct vfio_iommu *iommu,
-struct vfio_iommu_type1_dma_unmap *unmap)
+struct vfio_iommu_type1_dma_unmap *unmap,
+struct vfio_bitmap *bitmap)
 {
-   uint64_t mask;
struct vfio_dma *dma, *dma_last = NULL;
-   size_t unmapped = 0;
+   size_t unmapped = 0, pgsize;
int ret = 0, retries = 0;
+   unsigned long pgshift;
 
mutex_lock(&iommu->lock);
 
-   mask = ((uint64_t)1 << __ffs(iommu->pgsize_bitmap)) - 1;
+   pgshift = __ffs(iommu->pgsize_bitmap);
+   pgsize = (size_t)1 << pgshift;
 
-   if (unmap->iova & mask) {
+   if (unmap->iova & (pgsize - 1)) {
ret = -EINVAL;
goto unlock;
}
 
-   if (!unmap->size || unmap->size & mask) {
+   if (!unmap->size || unmap->size & (pgsize - 1)) {
ret = -EINVAL;
goto unlock;
}
@@ -1026,9 +1032,15 @@ static int vfio_dma_do_unmap(struct vfio_iommu *iommu,
goto unlock;
}
 
-   WARN_ON(mask & PAGE_MASK);
-again:
+   /* When dirty tracking is enabled, allow only min supported pgsize */
+   if ((unmap->flags & VFIO_DMA_UNMAP_FLAG_GET_DIRTY_BITMAP) &&
+   (!iommu->dirty_page_tracking || (bitmap->pgsize != pgsize))) {
+   ret = -EINVAL;
+   goto unlock;
+   }
 
+   WARN_ON((pgsize - 1) & PAGE_MASK);
+again:
/*
 * vfio-iommu-type1 (v1) - User mappings were coalesced together to
 * avoid tracking individual mappings.  This means that the granularity
@@ -1066,6 +1078,7 @@ static int vfio_dma_do_unmap(struct vfio_iommu *iommu,
ret = -EINVAL;
goto unlock;
}
+
dma = vfio_find_dma(iommu, unmap->iova + unmap->size - 1, 0);
if (dma && dma->iova + dma->size != unmap->iova + unmap->size) {
ret = -EINVAL;
@@ -1083,6 +1096,23 @@ static int vfio_dma_do_unmap(struct vfio_iommu *iommu,
if (dma->task->mm != current->mm)
break;
 
+   if ((unmap->flags & VFIO_DMA_UNMAP_FLAG_GET_DIRTY_BITMAP) &&
+   (dma_last != dma)) {
+
+   /*
+* mark all pages dirty if all pages are pinned and
+* mapped
+*/
+   if (dma->iommu_mapped)
+   bitmap_set(dma->bitmap, 0,
+  dma->size >> pgshift);
+
+   ret = update_user_bitmap(bitmap->data, dma,
+unmap->iova, pgsize);
+   if (ret)
+   break;
+   }
+
if (!RB_EMPTY_ROOT(&dma->pfn_list)) {
struct vfio_iommu_type1_dma_unmap nb_unmap;
 
@@ -2447,17 +2477,40 @@ static long v

[PATCH Kernel v20 2/8] vfio iommu: Remove atomicity of ref_count of pinned pages

2020-05-14 Thread Kirti Wankhede
vfio_pfn.ref_count is always updated while holding iommu->lock, so using an
atomic variable is overkill.

Signed-off-by: Kirti Wankhede 
Reviewed-by: Neo Jia 
Reviewed-by: Eric Auger 
Reviewed-by: Cornelia Huck 
---
 drivers/vfio/vfio_iommu_type1.c | 9 +
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
index a0c60f895b24..fa735047b04d 100644
--- a/drivers/vfio/vfio_iommu_type1.c
+++ b/drivers/vfio/vfio_iommu_type1.c
@@ -112,7 +112,7 @@ struct vfio_pfn {
struct rb_node  node;
dma_addr_t  iova;   /* Device address */
unsigned long   pfn;/* Host pfn */
-   atomic_tref_count;
+   unsigned intref_count;
 };
 
 struct vfio_regions {
@@ -233,7 +233,7 @@ static int vfio_add_to_pfn_list(struct vfio_dma *dma, 
dma_addr_t iova,
 
vpfn->iova = iova;
vpfn->pfn = pfn;
-   atomic_set(&vpfn->ref_count, 1);
+   vpfn->ref_count = 1;
vfio_link_pfn(dma, vpfn);
return 0;
 }
@@ -251,7 +251,7 @@ static struct vfio_pfn *vfio_iova_get_vfio_pfn(struct 
vfio_dma *dma,
struct vfio_pfn *vpfn = vfio_find_vpfn(dma, iova);
 
if (vpfn)
-   atomic_inc(&vpfn->ref_count);
+   vpfn->ref_count++;
return vpfn;
 }
 
@@ -259,7 +259,8 @@ static int vfio_iova_put_vfio_pfn(struct vfio_dma *dma, 
struct vfio_pfn *vpfn)
 {
int ret = 0;
 
-   if (atomic_dec_and_test(&vpfn->ref_count)) {
+   vpfn->ref_count--;
+   if (!vpfn->ref_count) {
ret = put_pfn(vpfn->pfn, dma->prot);
vfio_remove_from_pfn_list(dma, vpfn);
}
-- 
2.7.0




[PATCH Kernel v20 5/8] vfio iommu: Implementation of ioctl for dirty pages tracking

2020-05-14 Thread Kirti Wankhede
VFIO_IOMMU_DIRTY_PAGES ioctl performs three operations:
- Start dirty pages tracking while migration is active
- Stop dirty pages tracking.
- Get dirty pages bitmap. It is the user space application's responsibility to
  copy the content of dirty pages from source to destination during migration.

To prevent a DoS attack, memory for the bitmap is allocated per vfio_dma
structure. The bitmap size is calculated considering the smallest supported
page size. Bitmaps are allocated for all vfio_dmas when dirty logging is enabled.

The bitmap is populated for already pinned pages when it is allocated for
a vfio_dma, using the smallest supported page size. The bitmap is updated from
the pinning functions when tracking is enabled. When the user application
queries the bitmap, check whether the requested page size is the same as the
page size used to populate the bitmap. If it is equal, copy the bitmap;
if not, return an error.
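
For scale, a worked example (assuming a 4 KB minimum page size): a 1 GB
vfio_dma spans 262144 pages, so its per-vfio_dma bitmap is 262144 bits,
i.e. 32 KB, well below DIRTY_BITMAP_SIZE_MAX.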

Signed-off-by: Kirti Wankhede 
Reviewed-by: Neo Jia 

Fixed error reported by build bot by changing pgsize type from uint64_t
to size_t.
Reported-by: kbuild test robot 
---
 drivers/vfio/vfio_iommu_type1.c | 294 +++-
 1 file changed, 288 insertions(+), 6 deletions(-)

diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
index de17787ffece..b76d3b14abfd 100644
--- a/drivers/vfio/vfio_iommu_type1.c
+++ b/drivers/vfio/vfio_iommu_type1.c
@@ -72,6 +72,7 @@ struct vfio_iommu {
uint64_tpgsize_bitmap;
boolv2;
boolnesting;
+   booldirty_page_tracking;
 };
 
 struct vfio_domain {
@@ -92,6 +93,7 @@ struct vfio_dma {
boollock_cap;   /* capable(CAP_IPC_LOCK) */
struct task_struct  *task;
struct rb_root  pfn_list;   /* Ex-user pinned pfn list */
+   unsigned long   *bitmap;
 };
 
 struct vfio_group {
@@ -126,6 +128,19 @@ struct vfio_regions {
 #define IS_IOMMU_CAP_DOMAIN_IN_CONTAINER(iommu)\
(!list_empty(&iommu->domain_list))
 
+#define DIRTY_BITMAP_BYTES(n)  (ALIGN(n, BITS_PER_TYPE(u64)) / BITS_PER_BYTE)
+
+/*
+ * Input argument of number of bits to bitmap_set() is unsigned integer, which
+ * further casts to signed integer for unaligned multi-bit operation,
+ * __bitmap_set().
+ * Then maximum bitmap size supported is 2^31 bits divided by 2^3 bits/byte,
+ * that is 2^28 (256 MB) which maps to 2^31 * 2^12 = 2^43 (8TB) on 4K page
+ * system.
+ */
+#define DIRTY_BITMAP_PAGES_MAX  ((u64)INT_MAX)
+#define DIRTY_BITMAP_SIZE_MAX   DIRTY_BITMAP_BYTES(DIRTY_BITMAP_PAGES_MAX)
+
 static int put_pfn(unsigned long pfn, int prot);
 
 /*
@@ -176,6 +191,74 @@ static void vfio_unlink_dma(struct vfio_iommu *iommu, 
struct vfio_dma *old)
rb_erase(&old->node, &iommu->dma_list);
 }
 
+
+static int vfio_dma_bitmap_alloc(struct vfio_dma *dma, size_t pgsize)
+{
+   uint64_t npages = dma->size / pgsize;
+
+   if (npages > DIRTY_BITMAP_PAGES_MAX)
+   return -EINVAL;
+
+   dma->bitmap = kvzalloc(DIRTY_BITMAP_BYTES(npages), GFP_KERNEL);
+   if (!dma->bitmap)
+   return -ENOMEM;
+
+   return 0;
+}
+
+static void vfio_dma_bitmap_free(struct vfio_dma *dma)
+{
+   kfree(dma->bitmap);
+   dma->bitmap = NULL;
+}
+
+static void vfio_dma_populate_bitmap(struct vfio_dma *dma, size_t pgsize)
+{
+   struct rb_node *p;
+
+   for (p = rb_first(&dma->pfn_list); p; p = rb_next(p)) {
+   struct vfio_pfn *vpfn = rb_entry(p, struct vfio_pfn, node);
+
+   bitmap_set(dma->bitmap, (vpfn->iova - dma->iova) / pgsize, 1);
+   }
+}
+
+static int vfio_dma_bitmap_alloc_all(struct vfio_iommu *iommu, size_t pgsize)
+{
+   struct rb_node *n = rb_first(&iommu->dma_list);
+
+   for (; n; n = rb_next(n)) {
+   struct vfio_dma *dma = rb_entry(n, struct vfio_dma, node);
+   int ret;
+
+   ret = vfio_dma_bitmap_alloc(dma, pgsize);
+   if (ret) {
+   struct rb_node *p = rb_prev(n);
+
+   for (; p; p = rb_prev(p)) {
+   struct vfio_dma *dma = rb_entry(n,
+   struct vfio_dma, node);
+
+   vfio_dma_bitmap_free(dma);
+   }
+   return ret;
+   }
+   vfio_dma_populate_bitmap(dma, pgsize);
+   }
+   return 0;
+}
+
+static void vfio_dma_bitmap_free_all(struct vfio_iommu *iommu)
+{
+   struct rb_node *n = rb_first(&iommu->dma_list);
+
+   for (; n; n = rb_next(n)) {
+   struct vfio_dma *dma = rb_entry(n, struct vfio_dma, node);
+
+   vfio_dma_bitmap_free(dma);
+   }
+}
+
 /*
  * Helper Functions for host iova-pfn list
  */
@@ -568,6 +651,17 @@ static int vfio_iommu_type1_pin_pages(void *iommu_data,
vfio

[PATCH Kernel v20 4/8] vfio iommu: Add ioctl definition for dirty pages tracking

2020-05-14 Thread Kirti Wankhede
The IOMMU container maintains a list of all pages pinned by the vfio_pin_pages
API. All pages pinned by a vendor driver through this API should be considered
dirty during migration. When the container contains an IOMMU capable device,
all pages are pinned and mapped, so all pages are marked dirty.
Added support to start/stop tracking of dirtied pages and to get a bitmap of
all dirtied pages for a requested IO virtual address range.

Signed-off-by: Kirti Wankhede 
Reviewed-by: Neo Jia 
---
 include/uapi/linux/vfio.h | 55 +++
 1 file changed, 55 insertions(+)

diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h
index ad9bb5af3463..123de3bc2dce 100644
--- a/include/uapi/linux/vfio.h
+++ b/include/uapi/linux/vfio.h
@@ -1033,6 +1033,12 @@ struct vfio_iommu_type1_dma_map {
 
 #define VFIO_IOMMU_MAP_DMA _IO(VFIO_TYPE, VFIO_BASE + 13)
 
+struct vfio_bitmap {
+   __u64pgsize;/* page size for bitmap in bytes */
+   __u64size;  /* in bytes */
+   __u64 __user *data; /* one bit per page */
+};
+
 /**
  * VFIO_IOMMU_UNMAP_DMA - _IOWR(VFIO_TYPE, VFIO_BASE + 14,
  * struct vfio_dma_unmap)
@@ -1059,6 +1065,55 @@ struct vfio_iommu_type1_dma_unmap {
 #define VFIO_IOMMU_ENABLE  _IO(VFIO_TYPE, VFIO_BASE + 15)
 #define VFIO_IOMMU_DISABLE _IO(VFIO_TYPE, VFIO_BASE + 16)
 
+/**
+ * VFIO_IOMMU_DIRTY_PAGES - _IOWR(VFIO_TYPE, VFIO_BASE + 17,
+ * struct vfio_iommu_type1_dirty_bitmap)
+ * IOCTL is used for dirty pages tracking.
+ * Caller should set flag depending on which operation to perform, details as
+ * below:
+ *
+ * When IOCTL is called with VFIO_IOMMU_DIRTY_PAGES_FLAG_START set, indicates
+ * migration is active and IOMMU module should track pages which are dirtied or
+ * potentially dirtied by device.
+ * Dirty pages are tracked until tracking is stopped by user application by
+ * setting VFIO_IOMMU_DIRTY_PAGES_FLAG_STOP flag.
+ *
+ * When IOCTL is called with VFIO_IOMMU_DIRTY_PAGES_FLAG_STOP set, indicates
+ * IOMMU should stop tracking dirtied pages.
+ *
+ * When IOCTL is called with VFIO_IOMMU_DIRTY_PAGES_FLAG_GET_BITMAP flag set,
+ * IOCTL returns dirty pages bitmap for IOMMU container during migration for
+ * given IOVA range. User must provide data[] as the structure
+ * vfio_iommu_type1_dirty_bitmap_get through which user provides IOVA range and
+ * pgsize. This interface supports to get bitmap of smallest supported pgsize
+ * only and can be modified in future to get bitmap of specified pgsize.
+ * User must allocate memory for bitmap, zero the bitmap memory  and set size
+ * of allocated memory in bitmap.size field. One bit is used to represent one
+ * page consecutively starting from iova offset. User should provide page size
+ * in bitmap.pgsize field. Bit set in bitmap indicates page at that offset from
+ * iova is dirty. Caller must set argsz including size of structure
+ * vfio_iommu_type1_dirty_bitmap_get.
+ *
+ * Only one of the flags _START, _STOP and _GET may be specified at a time.
+ *
+ */
+struct vfio_iommu_type1_dirty_bitmap {
+   __u32argsz;
+   __u32flags;
+#define VFIO_IOMMU_DIRTY_PAGES_FLAG_START  (1 << 0)
+#define VFIO_IOMMU_DIRTY_PAGES_FLAG_STOP   (1 << 1)
+#define VFIO_IOMMU_DIRTY_PAGES_FLAG_GET_BITMAP (1 << 2)
+   __u8 data[];
+};
+
+struct vfio_iommu_type1_dirty_bitmap_get {
+   __u64  iova;/* IO virtual address */
+   __u64  size;/* Size of iova range */
+   struct vfio_bitmap bitmap;
+};
+
+#define VFIO_IOMMU_DIRTY_PAGES _IO(VFIO_TYPE, VFIO_BASE + 17)
+
 /*  Additional API for SPAPR TCE (Server POWERPC) IOMMU  */
 
 /*
-- 
2.7.0




[PATCH Kernel v20 7/8] vfio iommu: Add migration capability to report supported features

2020-05-14 Thread Kirti Wankhede
Added a migration capability to the IOMMU info chain.
The user application should check the IOMMU info chain for the migration
capability before using the dirty page tracking feature provided by the
kernel module. The user application must check the supported page sizes and
the maximum dirty bitmap size returned by this capability structure for the
ioctls used to get the dirty bitmap.

Signed-off-by: Kirti Wankhede 
---
 drivers/vfio/vfio_iommu_type1.c | 23 ++-
 include/uapi/linux/vfio.h   | 22 ++
 2 files changed, 44 insertions(+), 1 deletion(-)

diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
index a1dc57bcece5..3edb3c3e6170 100644
--- a/drivers/vfio/vfio_iommu_type1.c
+++ b/drivers/vfio/vfio_iommu_type1.c
@@ -2387,6 +2387,22 @@ static int vfio_iommu_iova_build_caps(struct vfio_iommu 
*iommu,
return ret;
 }
 
+static int vfio_iommu_migration_build_caps(struct vfio_iommu *iommu,
+  struct vfio_info_cap *caps)
+{
+   struct vfio_iommu_type1_info_cap_migration cap_mig;
+
+   cap_mig.header.id = VFIO_IOMMU_TYPE1_INFO_CAP_MIGRATION;
+   cap_mig.header.version = 1;
+
+   cap_mig.flags = 0;
+   /* support minimum pgsize */
+   cap_mig.pgsize_bitmap = (size_t)1 << __ffs(iommu->pgsize_bitmap);
+   cap_mig.max_dirty_bitmap_size = DIRTY_BITMAP_SIZE_MAX;
+
+   return vfio_info_add_capability(caps, &cap_mig.header, sizeof(cap_mig));
+}
+
 static long vfio_iommu_type1_ioctl(void *iommu_data,
   unsigned int cmd, unsigned long arg)
 {
@@ -2433,8 +2449,13 @@ static long vfio_iommu_type1_ioctl(void *iommu_data,
 
info.iova_pgsizes = iommu->pgsize_bitmap;
 
-   ret = vfio_iommu_iova_build_caps(iommu, &caps);
+   ret = vfio_iommu_migration_build_caps(iommu, &caps);
+
+   if (!ret)
+   ret = vfio_iommu_iova_build_caps(iommu, &caps);
+
mutex_unlock(&iommu->lock);
+
if (ret)
return ret;
 
diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h
index 0a0c7315ddd6..ab1d0150bbd3 100644
--- a/include/uapi/linux/vfio.h
+++ b/include/uapi/linux/vfio.h
@@ -1013,6 +1013,28 @@ struct vfio_iommu_type1_info_cap_iova_range {
struct  vfio_iova_range iova_ranges[];
 };
 
+/*
+ * The migration capability allows to report supported features for migration.
+ *
+ * The structures below define version 1 of this capability.
+ *
+ * The existence of this capability indicates IOMMU kernel driver supports
+ * dirty page tracking.
+ *
+ * pgsize_bitmap: Kernel driver returns supported page sizes bitmap for dirty
+ * page tracking.
+ * max_dirty_bitmap_size: Kernel driver returns maximum supported dirty bitmap
+ * size in bytes to be used by user application for ioctls to get dirty bitmap.
+ */
+#define VFIO_IOMMU_TYPE1_INFO_CAP_MIGRATION  1
+
+struct vfio_iommu_type1_info_cap_migration {
+   struct  vfio_info_cap_header header;
+   __u32   flags;
+   __u64   pgsize_bitmap;
+   __u64   max_dirty_bitmap_size;  /* in bytes */
+};
+
 #define VFIO_IOMMU_GET_INFO _IO(VFIO_TYPE, VFIO_BASE + 12)
 
 /**
-- 
2.7.0




[PATCH Kernel v20 0/8] Add UAPIs to support migration for VFIO devices

2020-05-14 Thread Kirti Wankhede
- Fix build failures.

v5 -> v6:
- Fix build failure.

v4 -> v5:
- Added a descriptive comment about the sequence in which members of
  structure vfio_device_migration_info should be accessed, based on Alex's
  suggestion.
- Updated get dirty pages sequence.
- As per Cornelia Huck's suggestion, added callbacks to VFIODeviceOps to
  get_object, save_config and load_config.
- Fixed multiple nit picks.
- Tested live migration with multiple vfio device assigned to a VM.

v3 -> v4:
- Added one more bit for _RESUMING flag to be set explicitly.
- data_offset field is read-only for user space application.
- data_size is read on every iteration before reading data from the migration
  region; this removes the assumption that the data extends to the end of the
  migration region.
- If the vendor driver supports mappable sparse regions, map those regions
  during the setup state of save/load; similarly, unmap them in the cleanup
  routines.
- Handled a race condition that causes data corruption in the migration region
  during save device state, by adding a mutex and serializing the save_buffer
  and get_dirty_pages routines.
- Skip calling the get_dirty_pages routine for the device's mapped MMIO region.
- Added trace events.
- Split into multiple functional patches.

v2 -> v3:
- Removed enum of VFIO device states. Defined VFIO device state with 2
  bits.
- Re-structured vfio_device_migration_info to keep it minimal and defined
  action on read and write access on its members.

v1 -> v2:
- Defined MIGRATION region type and sub-type which should be used with
  region type capability.
- Re-structured vfio_device_migration_info. This structure will be placed
  at 0th offset of migration region.
- Replaced ioctl with read/write for trapped part of migration region.
- Added both type of access support, trapped or mmapped, for data section
  of the region.
- Moved PCI device functions to pci file.
- Added iteration to get dirty page bitmap until bitmap for all requested
  pages are copied.

Thanks,
Kirti



Kirti Wankhede (8):
  vfio: UAPI for migration interface for device state
  vfio iommu: Remove atomicity of ref_count of pinned pages
  vfio iommu: Cache pgsize_bitmap in struct vfio_iommu
  vfio iommu: Add ioctl definition for dirty pages tracking
  vfio iommu: Implementation of ioctl for dirty pages tracking
  vfio iommu: Update UNMAP_DMA ioctl to get dirty bitmap before unmap
  vfio iommu: Add migration capability to report supported features
  vfio: Selective dirty page tracking if IOMMU backed device pins pages

 drivers/vfio/vfio.c |  13 +-
 drivers/vfio/vfio_iommu_type1.c | 565 
 include/linux/vfio.h|   4 +-
 include/uapi/linux/vfio.h   | 315 ++
 4 files changed, 838 insertions(+), 59 deletions(-)

-- 
2.7.0




[PATCH Kernel v20 1/8] vfio: UAPI for migration interface for device state

2020-05-14 Thread Kirti Wankhede
- Defined MIGRATION region type and sub-type.

- Defined vfio_device_migration_info structure which will be placed at the
  0th offset of migration region to get/set VFIO device related
  information. Defined members of structure and usage on read/write access.

- Defined device states and state transition details.

- Defined sequence to be followed while saving and resuming VFIO device.

Signed-off-by: Kirti Wankhede 
Reviewed-by: Neo Jia 
---
 include/uapi/linux/vfio.h | 228 ++
 1 file changed, 228 insertions(+)

diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h
index 015516bcfaa3..ad9bb5af3463 100644
--- a/include/uapi/linux/vfio.h
+++ b/include/uapi/linux/vfio.h
@@ -305,6 +305,7 @@ struct vfio_region_info_cap_type {
 #define VFIO_REGION_TYPE_PCI_VENDOR_MASK   (0x)
 #define VFIO_REGION_TYPE_GFX(1)
 #define VFIO_REGION_TYPE_CCW   (2)
+#define VFIO_REGION_TYPE_MIGRATION  (3)
 
 /* sub-types for VFIO_REGION_TYPE_PCI_* */
 
@@ -379,6 +380,233 @@ struct vfio_region_gfx_edid {
 /* sub-types for VFIO_REGION_TYPE_CCW */
 #define VFIO_REGION_SUBTYPE_CCW_ASYNC_CMD  (1)
 
+/* sub-types for VFIO_REGION_TYPE_MIGRATION */
+#define VFIO_REGION_SUBTYPE_MIGRATION   (1)
+
+/*
+ * The structure vfio_device_migration_info is placed at the 0th offset of
+ * the VFIO_REGION_SUBTYPE_MIGRATION region to get and set VFIO device related
+ * migration information. Field accesses from this structure are only supported
+ * at their native width and alignment. Otherwise, the result is undefined and
+ * vendor drivers should return an error.
+ *
+ * device_state: (read/write)
+ *  - The user application writes to this field to inform the vendor driver
+ *about the device state to be transitioned to.
+ *  - The vendor driver should take the necessary actions to change the
+ *device state. After successful transition to a given state, the
+ *vendor driver should return success on write(device_state, state)
+ *system call. If the device state transition fails, the vendor driver
+ *should return an appropriate -errno for the fault condition.
+ *  - On the user application side, if the device state transition fails,
+ *   that is, if write(device_state, state) returns an error, read
+ *   device_state again to determine the current state of the device from
+ *   the vendor driver.
+ *  - The vendor driver should return previous state of the device unless
+ *the vendor driver has encountered an internal error, in which case
+ *the vendor driver may report the device_state 
VFIO_DEVICE_STATE_ERROR.
+ *  - The user application must use the device reset ioctl to recover the
+ *device from VFIO_DEVICE_STATE_ERROR state. If the device is
+ *indicated to be in a valid device state by reading device_state, the
+ *user application may attempt to transition the device to any valid
+ *state reachable from the current state or terminate itself.
+ *
+ *  device_state consists of 3 bits:
+ *  - If bit 0 is set, it indicates the _RUNNING state. If bit 0 is clear,
+ *it indicates the _STOP state. When the device state is changed to
+ *_STOP, driver should stop the device before write() returns.
+ *  - If bit 1 is set, it indicates the _SAVING state, which means that the
+ *driver should start gathering device state information that will be
+ *provided to the VFIO user application to save the device's state.
+ *  - If bit 2 is set, it indicates the _RESUMING state, which means that
+ *the driver should prepare to resume the device. Data provided through
+ *the migration region should be used to resume the device.
+ *  Bits 3 - 31 are reserved for future use. To preserve them, the user
+ *  application should perform a read-modify-write operation on this
+ *  field when modifying the specified bits.
+ *
+ *  +--- _RESUMING
+ *  |+-- _SAVING
+ *  ||+- _RUNNING
+ *  |||
+ *  000b => Device Stopped, not saving or resuming
+ *  001b => Device running, which is the default state
+ *  010b => Stop the device & save the device state, stop-and-copy state
+ *  011b => Device running and save the device state, pre-copy state
+ *  100b => Device stopped and the device state is resuming
+ *  101b => Invalid state
+ *  110b => Error state
+ *  111b => Invalid state
+ *
+ * State transitions:
+ *
+ *  _RESUMING  _RUNNINGPre-copyStop-and-copy   _STOP
+ *(100b) (001b) (011b)(010b)   (000b)
+ * 0. Running or default state
+ * |
+ *
+ * 1. Normal Shutdown (optional)
+ * |->|
+ *
+ * 2. Save the state or suspend
+ * |--

Re: [PATCH Kernel v19 5/8] vfio iommu: Implementation of ioctl for dirty pages tracking

2020-05-14 Thread Kirti Wankhede




On 5/14/2020 10:32 AM, Alex Williamson wrote:

On Thu, 14 May 2020 01:34:36 +0530
Kirti Wankhede  wrote:


VFIO_IOMMU_DIRTY_PAGES ioctl performs three operations:
- Start dirty pages tracking while migration is active
- Stop dirty pages tracking.
- Get dirty pages bitmap. It is the user space application's responsibility
   to copy the content of dirty pages from source to destination during
   migration.

To prevent a DoS attack, memory for the bitmap is allocated per vfio_dma
structure. The bitmap size is calculated considering the smallest supported
page size. Bitmaps are allocated for all vfio_dmas when dirty logging is
enabled.

The bitmap is populated for already pinned pages when the bitmap is allocated
for a vfio_dma with the smallest supported page size. The bitmap is also
updated from the pinning functions when tracking is enabled. When the user
application queries the bitmap, check whether the requested page size is the
same as the page size used to populate the bitmap. If it is equal, copy the
bitmap; if not, return an error.

Signed-off-by: Kirti Wankhede 
Reviewed-by: Neo Jia 

Fixed error reported by build bot by changing pgsize type from uint64_t
to size_t.
Reported-by: kbuild test robot 
---
  drivers/vfio/vfio_iommu_type1.c | 274 +++-
  1 file changed, 268 insertions(+), 6 deletions(-)

diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
index 6f09fbabed12..469b09185b83 100644
--- a/drivers/vfio/vfio_iommu_type1.c
+++ b/drivers/vfio/vfio_iommu_type1.c
@@ -72,6 +72,7 @@ struct vfio_iommu {
uint64_tpgsize_bitmap;
boolv2;
boolnesting;
+   booldirty_page_tracking;
  };
  
  struct vfio_domain {

@@ -92,6 +93,7 @@ struct vfio_dma {
boollock_cap;   /* capable(CAP_IPC_LOCK) */
struct task_struct  *task;
struct rb_root  pfn_list;   /* Ex-user pinned pfn list */
+   unsigned long   *bitmap;
  };
  
  struct vfio_group {

@@ -126,6 +128,19 @@ struct vfio_regions {
  #define IS_IOMMU_CAP_DOMAIN_IN_CONTAINER(iommu)   \
	(!list_empty(&iommu->domain_list))
  
+#define DIRTY_BITMAP_BYTES(n)	(ALIGN(n, BITS_PER_TYPE(u64)) / BITS_PER_BYTE)

+
+/*
+ * The number-of-bits argument to bitmap_set() is an unsigned integer, which
+ * is further cast to a signed integer for the unaligned multi-bit operation,
+ * __bitmap_set().
+ * The maximum supported bitmap size is therefore 2^31 bits, i.e. 2^31 / 2^3
+ * bits/byte = 2^28 bytes (256 MB), which covers 2^31 * 2^12 = 2^43 bytes
+ * (8 TB) on a 4K page system.
+ */
+#define DIRTY_BITMAP_PAGES_MAX  ((u64)INT_MAX)
+#define DIRTY_BITMAP_SIZE_MAX   DIRTY_BITMAP_BYTES(DIRTY_BITMAP_PAGES_MAX)
+
  static int put_pfn(unsigned long pfn, int prot);
  
  /*

@@ -176,6 +191,77 @@ static void vfio_unlink_dma(struct vfio_iommu *iommu, 
struct vfio_dma *old)
	rb_erase(&old->node, &iommu->dma_list);
  }
  
+

+static int vfio_dma_bitmap_alloc(struct vfio_dma *dma, size_t pgsize)
+{
+   uint64_t npages = dma->size / pgsize;
+
+   if (npages > DIRTY_BITMAP_PAGES_MAX)
+   return -EINVAL;
+
+   dma->bitmap = kvzalloc(DIRTY_BITMAP_BYTES(npages), GFP_KERNEL);
+   if (!dma->bitmap)
+   return -ENOMEM;
+
+   return 0;
+}
+
+static void vfio_dma_bitmap_free(struct vfio_dma *dma)
+{
+   kfree(dma->bitmap);
+   dma->bitmap = NULL;
+}
+
+static void vfio_dma_populate_bitmap(struct vfio_dma *dma, size_t pgsize)
+{
+   struct rb_node *p;
+
+   if (RB_EMPTY_ROOT(&dma->pfn_list))
+   return;


I don't think this is optimizing anything:

#define RB_EMPTY_ROOT(root)  (READ_ONCE((root)->rb_node) == NULL)

struct rb_node *rb_first(const struct rb_root *root)
{
 struct rb_node  *n;

 n = root->rb_node;
 if (!n)
 return NULL;

So the loop below won't be entered if the tree is empty.



Removing empty check.


+
+   for (p = rb_first(&dma->pfn_list); p; p = rb_next(p)) {
+   struct vfio_pfn *vpfn = rb_entry(p, struct vfio_pfn, node);
+
+   bitmap_set(dma->bitmap, (vpfn->iova - dma->iova) / pgsize, 1);
+   }
+}
+
+static int vfio_dma_bitmap_alloc_all(struct vfio_iommu *iommu, size_t pgsize)
+{
+   struct rb_node *n = rb_first(&iommu->dma_list);
+
+   for (; n; n = rb_next(n)) {
+   struct vfio_dma *dma = rb_entry(n, struct vfio_dma, node);
+   int ret;
+
+   ret = vfio_dma_bitmap_alloc(dma, pgsize);
+   if (ret) {
+   struct rb_node *p = rb_prev(n);
+
+   for (; p; p = rb_prev(p)) {
+   struct vfio_dma *dma = rb_entry(p,
+   struct vfio_dma, node);
+
+   vfio_dma_bitmap_free(dma);
+   }
+   return ret;
+  

Re: [PATCH Kernel v19 7/8] vfio iommu: Add migration capability to report supported features

2020-05-14 Thread Kirti Wankhede




On 5/14/2020 10:31 AM, Alex Williamson wrote:

On Thu, 14 May 2020 01:34:38 +0530
Kirti Wankhede  wrote:


Added a migration capability to the IOMMU info chain.
The user application should check the IOMMU info chain for the migration
capability before using the dirty page tracking feature provided by the
kernel module.
The user application must check the supported page sizes and the maximum
dirty bitmap size returned by this capability structure for the ioctls used
to get the dirty bitmap.

Signed-off-by: Kirti Wankhede 
---
  drivers/vfio/vfio_iommu_type1.c | 24 +++-
  include/uapi/linux/vfio.h   | 21 +
  2 files changed, 44 insertions(+), 1 deletion(-)

diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
index 4358be26ff80..77351497a9c2 100644
--- a/drivers/vfio/vfio_iommu_type1.c
+++ b/drivers/vfio/vfio_iommu_type1.c
@@ -2389,6 +2389,22 @@ static int vfio_iommu_iova_build_caps(struct vfio_iommu 
*iommu,
return ret;
  }
  
+static int vfio_iommu_migration_build_caps(struct vfio_iommu *iommu,

+  struct vfio_info_cap *caps)
+{
+   struct vfio_iommu_type1_info_cap_migration cap_mig;
+
+   cap_mig.header.id = VFIO_IOMMU_TYPE1_INFO_CAP_MIGRATION;
+   cap_mig.header.version = 1;
+   cap_mig.flags = VFIO_IOMMU_INFO_CAPS_MIGRATION_DIRTY_PAGE_TRACK;
+
+   /* support minimum pgsize */
+   cap_mig.pgsize_bitmap = (size_t)1 << __ffs(iommu->pgsize_bitmap);
+   cap_mig.max_dirty_bitmap_size = DIRTY_BITMAP_SIZE_MAX;
+
+   return vfio_info_add_capability(caps, &cap_mig.header, sizeof(cap_mig));
+}
+
  static long vfio_iommu_type1_ioctl(void *iommu_data,
   unsigned int cmd, unsigned long arg)
  {
@@ -2433,10 +2449,16 @@ static long vfio_iommu_type1_ioctl(void *iommu_data,
	mutex_lock(&iommu->lock);
info.flags = VFIO_IOMMU_INFO_PGSIZES;
  
+		vfio_pgsize_bitmap(iommu);



Why is it necessary to rebuild the bitmap here?  The user can't get to
this ioctl until they've added a group to the container and set the
IOMMU model.


For an mdev device, the domain is not added to domain_list, so
vfio_pgsize_bitmap() doesn't get called when only an mdev device is
attached.
Your concern is right though: pgsize_bitmap should get populated at
attach_group time, so I'm fixing it by calling vfio_pgsize_bitmap() for
mdev devices when iommu->external_domain is set.



info.iova_pgsizes = iommu->pgsize_bitmap;
  
-		ret = vfio_iommu_iova_build_caps(iommu, &caps);

+   ret = vfio_iommu_migration_build_caps(iommu, &caps);
+
+   if (!ret)
+   ret = vfio_iommu_iova_build_caps(iommu, &caps);
+
	mutex_unlock(&iommu->lock);
+
if (ret)
return ret;
  
diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h

index e3cbf8b78623..c90604322798 100644
--- a/include/uapi/linux/vfio.h
+++ b/include/uapi/linux/vfio.h
@@ -1013,6 +1013,27 @@ struct vfio_iommu_type1_info_cap_iova_range {
struct  vfio_iova_range iova_ranges[];
  };
  
+/*

+ * The migration capability allows reporting of the features supported for migration.
+ *
+ * The structures below define version 1 of this capability.
+ *
+ * pgsize_bitmap: Kernel driver returns supported page sizes bitmap for dirty
+ * page tracking.
+ * max_dirty_bitmap_size: Kernel driver returns maximum supported dirty bitmap
+ * size in bytes to be used by user application for ioctls to get dirty bitmap.
+ */
+#define VFIO_IOMMU_TYPE1_INFO_CAP_MIGRATION  1
+
+struct vfio_iommu_type1_info_cap_migration {
+   struct  vfio_info_cap_header header;
+   __u32   flags;
+   /* supports dirty page tracking */
+#define VFIO_IOMMU_INFO_CAPS_MIGRATION_DIRTY_PAGE_TRACK(1 << 0)


This flag is a bit redundant to the purpose of this capability, isn't
it?  I think exposing the capability itself is indicating support for
dirty page tracking.  We should probably be explicit in the comment
about exactly what interface this capability implies.  Thanks,



The capability was added to provide provision for feature flags that the
kernel driver supports; that's where we started, right?

Later I added pgsize_bitmap and the maximum supported bitmap size as you
suggested.
I'm confused now: should I keep this flag here?
Even if the flag is removed, the 'flags' field is still required so that
whenever a new feature is added, a new flag can be added. That's the whole
purpose of adding this capability. Can we add a field which is not used and
whose future use we don't yet know?


Thanks,
Kirti


Alex


+   __u64   pgsize_bitmap;
+   __u64   max_dirty_bitmap_size;  /* in bytes */
+};
+
  #define VFIO_IOMMU_GET_INFO _IO(VFIO_TYPE, VFIO_BASE + 12)
  
  /**






Re: [PATCH Kernel v19 6/8] vfio iommu: Update UNMAP_DMA ioctl to get dirty bitmap before unmap

2020-05-13 Thread Kirti Wankhede




On 5/14/2020 10:37 AM, Alex Williamson wrote:

On Thu, 14 May 2020 01:34:37 +0530
Kirti Wankhede  wrote:


DMA mapped pages, including those pinned by mdev vendor drivers, might
get unpinned and unmapped while migration is active and the device is still
running. For example, in the pre-copy phase, while the guest driver can
access those pages, the host device or vendor driver can dirty these mapped
pages. Such pages should be marked dirty so as to maintain memory
consistency for a user making use of dirty page tracking.

To get the bitmap during unmap, the user should allocate memory for the
bitmap, set the size of the allocated memory, set the page size to be
considered for the bitmap, and set the flag
VFIO_DMA_UNMAP_FLAG_GET_DIRTY_BITMAP.

Signed-off-by: Kirti Wankhede 
Reviewed-by: Neo Jia 
---
  drivers/vfio/vfio_iommu_type1.c | 102 +++-
  include/uapi/linux/vfio.h   |  10 
  2 files changed, 99 insertions(+), 13 deletions(-)

diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
index 469b09185b83..4358be26ff80 100644
--- a/drivers/vfio/vfio_iommu_type1.c
+++ b/drivers/vfio/vfio_iommu_type1.c
@@ -195,11 +195,15 @@ static void vfio_unlink_dma(struct vfio_iommu *iommu, 
struct vfio_dma *old)
  static int vfio_dma_bitmap_alloc(struct vfio_dma *dma, size_t pgsize)
  {
uint64_t npages = dma->size / pgsize;
+   size_t bitmap_size;
  
  	if (npages > DIRTY_BITMAP_PAGES_MAX)

return -EINVAL;
  
-	dma->bitmap = kvzalloc(DIRTY_BITMAP_BYTES(npages), GFP_KERNEL);

+   /* Allocate extra 64 bits which are used for bitmap manipulation */
+   bitmap_size = DIRTY_BITMAP_BYTES(npages) + sizeof(u64);
+
+   dma->bitmap = kvzalloc(bitmap_size, GFP_KERNEL);
if (!dma->bitmap)
return -ENOMEM;
  
@@ -979,23 +983,25 @@ static int verify_bitmap_size(uint64_t npages, uint64_t bitmap_size)

  }
  
  static int vfio_dma_do_unmap(struct vfio_iommu *iommu,

-struct vfio_iommu_type1_dma_unmap *unmap)
+struct vfio_iommu_type1_dma_unmap *unmap,
+struct vfio_bitmap *bitmap)
  {
-   uint64_t mask;
struct vfio_dma *dma, *dma_last = NULL;
-   size_t unmapped = 0;
-   int ret = 0, retries = 0;
+   size_t unmapped = 0, pgsize;
+   int ret = 0, retries = 0, cnt = 0;
+   unsigned long pgshift, shift = 0, leftover;
  
  	mutex_lock(&iommu->lock);
  
-	mask = ((uint64_t)1 << __ffs(iommu->pgsize_bitmap)) - 1;

+   pgshift = __ffs(iommu->pgsize_bitmap);
+   pgsize = (size_t)1 << pgshift;
  
-	if (unmap->iova & mask) {

+   if (unmap->iova & (pgsize - 1)) {
ret = -EINVAL;
goto unlock;
}
  
-	if (!unmap->size || unmap->size & mask) {

+   if (!unmap->size || unmap->size & (pgsize - 1)) {
ret = -EINVAL;
goto unlock;
}
@@ -1006,9 +1012,15 @@ static int vfio_dma_do_unmap(struct vfio_iommu *iommu,
goto unlock;
}
  
-	WARN_ON(mask & PAGE_MASK);

-again:
+   /* When dirty tracking is enabled, allow only min supported pgsize */
+   if ((unmap->flags & VFIO_DMA_UNMAP_FLAG_GET_DIRTY_BITMAP) &&
+   (!iommu->dirty_page_tracking || (bitmap->pgsize != pgsize))) {
+   ret = -EINVAL;
+   goto unlock;
+   }
  
+	WARN_ON((pgsize - 1) & PAGE_MASK);

+again:
/*
 * vfio-iommu-type1 (v1) - User mappings were coalesced together to
 * avoid tracking individual mappings.  This means that the granularity
@@ -1046,6 +1058,7 @@ static int vfio_dma_do_unmap(struct vfio_iommu *iommu,
ret = -EINVAL;
goto unlock;
}
+
dma = vfio_find_dma(iommu, unmap->iova + unmap->size - 1, 0);
if (dma && dma->iova + dma->size != unmap->iova + unmap->size) {
ret = -EINVAL;
@@ -1063,6 +1076,39 @@ static int vfio_dma_do_unmap(struct vfio_iommu *iommu,
if (dma->task->mm != current->mm)
break;
  
+		if ((unmap->flags & VFIO_DMA_UNMAP_FLAG_GET_DIRTY_BITMAP) &&

+   (dma_last != dma)) {
+   unsigned int nbits = dma->size >> pgshift;
+   int curr_lcnt = nbits / BITS_PER_LONG;
+
+   /*
+* mark all pages dirty if all pages are pinned and
+* mapped.
+*/
+   if (dma->iommu_mapped)
+   bitmap_set(dma->bitmap, 0, nbits);
+
+   if (shift) {
+   bitmap_shift_left(dma->bitmap, dma->bitmap,
+ shift, nbits + shif

[PATCH Kernel v19 8/8] vfio: Selective dirty page tracking if IOMMU backed device pins pages

2020-05-13 Thread Kirti Wankhede
Added a check such that only singleton IOMMU groups can pin pages.
From the point when the vendor driver pins any pages, consider the IOMMU
group's dirty page scope to be limited to pinned pages.

To avoid walking the list often, added a flag, pinned_page_dirty_scope, to
indicate whether the dirty page scope of all vfio_groups, for each
vfio_domain in the domain_list, is limited to pinned pages. This flag is
updated on the first pinned-pages request for an IOMMU group and on
attaching/detaching a group; a rough sketch of that update is shown below.
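A rough sketch of the scope update described above (this is not the patch
code, which is truncated below; it assumes the domain->group_list and
iommu->external_domain members already present in the type1 driver):

/*
 * Sketch: iommu->pinned_page_dirty_scope stays true only while every
 * vfio_group in every domain (including the external/mdev domain) has
 * reported that its dirty scope is limited to pinned pages.
 */
static void update_pinned_page_dirty_scope(struct vfio_iommu *iommu)
{
	struct vfio_domain *domain;
	struct vfio_group *group;

	list_for_each_entry(domain, &iommu->domain_list, next) {
		list_for_each_entry(group, &domain->group_list, next) {
			if (!group->pinned_page_dirty_scope) {
				iommu->pinned_page_dirty_scope = false;
				return;
			}
		}
	}

	if (iommu->external_domain) {
		domain = iommu->external_domain;
		list_for_each_entry(group, &domain->group_list, next) {
			if (!group->pinned_page_dirty_scope) {
				iommu->pinned_page_dirty_scope = false;
				return;
			}
		}
	}

	iommu->pinned_page_dirty_scope = true;
}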

Signed-off-by: Kirti Wankhede 
Reviewed-by: Neo Jia 
---
 drivers/vfio/vfio.c | 13 --
 drivers/vfio/vfio_iommu_type1.c | 94 +++--
 include/linux/vfio.h|  4 +-
 3 files changed, 104 insertions(+), 7 deletions(-)

diff --git a/drivers/vfio/vfio.c b/drivers/vfio/vfio.c
index 765e0e5d83ed..580099afeaff 100644
--- a/drivers/vfio/vfio.c
+++ b/drivers/vfio/vfio.c
@@ -85,6 +85,7 @@ struct vfio_group {
atomic_topened;
wait_queue_head_t   container_q;
boolnoiommu;
+   unsigned intdev_counter;
struct kvm  *kvm;
struct blocking_notifier_head   notifier;
 };
@@ -555,6 +556,7 @@ struct vfio_device *vfio_group_create_device(struct 
vfio_group *group,
 
	mutex_lock(&group->device_lock);
	list_add(&device->group_next, &group->device_list);
+   group->dev_counter++;
	mutex_unlock(&group->device_lock);
 
return device;
@@ -567,6 +569,7 @@ static void vfio_device_release(struct kref *kref)
struct vfio_group *group = device->group;
 
	list_del(&device->group_next);
+   group->dev_counter--;
	mutex_unlock(&group->device_lock);
 
dev_set_drvdata(device->dev, NULL);
@@ -1945,6 +1948,9 @@ int vfio_pin_pages(struct device *dev, unsigned long 
*user_pfn, int npage,
if (!group)
return -ENODEV;
 
+   if (group->dev_counter > 1)
+   return -EINVAL;
+
ret = vfio_group_add_container_user(group);
if (ret)
goto err_pin_pages;
@@ -1952,7 +1958,8 @@ int vfio_pin_pages(struct device *dev, unsigned long 
*user_pfn, int npage,
container = group->container;
driver = container->iommu_driver;
if (likely(driver && driver->ops->pin_pages))
-   ret = driver->ops->pin_pages(container->iommu_data, user_pfn,
+   ret = driver->ops->pin_pages(container->iommu_data,
+group->iommu_group, user_pfn,
 npage, prot, phys_pfn);
else
ret = -ENOTTY;
@@ -2050,8 +2057,8 @@ int vfio_group_pin_pages(struct vfio_group *group,
driver = container->iommu_driver;
if (likely(driver && driver->ops->pin_pages))
ret = driver->ops->pin_pages(container->iommu_data,
-user_iova_pfn, npage,
-prot, phys_pfn);
+group->iommu_group, user_iova_pfn,
+npage, prot, phys_pfn);
else
ret = -ENOTTY;
 
diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
index 77351497a9c2..24384c4f5844 100644
--- a/drivers/vfio/vfio_iommu_type1.c
+++ b/drivers/vfio/vfio_iommu_type1.c
@@ -73,6 +73,7 @@ struct vfio_iommu {
boolv2;
boolnesting;
booldirty_page_tracking;
+   boolpinned_page_dirty_scope;
 };
 
 struct vfio_domain {
@@ -100,6 +101,7 @@ struct vfio_group {
struct iommu_group  *iommu_group;
struct list_headnext;
boolmdev_group; /* An mdev group */
+   boolpinned_page_dirty_scope;
 };
 
 struct vfio_iova {
@@ -143,6 +145,10 @@ struct vfio_regions {
 
 static int put_pfn(unsigned long pfn, int prot);
 
+static struct vfio_group *vfio_iommu_find_iommu_group(struct vfio_iommu *iommu,
+  struct iommu_group *iommu_group);
+
+static void update_pinned_page_dirty_scope(struct vfio_iommu *iommu);
 /*
  * This code handles mapping and unmapping of user data buffers
  * into DMA'ble space using the IOMMU
@@ -593,11 +599,13 @@ static int vfio_unpin_page_external(struct vfio_dma *dma, 
dma_addr_t iova,
 }
 
 static int vfio_iommu_type1_pin_pages(void *iommu_data,
+ struct iommu_group *iommu_group,
  unsigned long *user_pfn,
  int npage, int prot,
  unsigned long *phys_pfn)
 {
struct vfio_iommu *iommu = iommu_data;
+   stru

[PATCH Kernel v19 7/8] vfio iommu: Add migration capability to report supported features

2020-05-13 Thread Kirti Wankhede
Added a migration capability to the IOMMU info chain.
The user application should check the IOMMU info chain for the migration
capability before using the dirty page tracking feature provided by the
kernel module.
The user application must check the supported page sizes and the maximum
dirty bitmap size returned by this capability structure for the ioctls used
to get the dirty bitmap.
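For illustration, a user-space sketch of locating this capability via the
standard VFIO_IOMMU_GET_INFO capability chain (cap_offset /
vfio_info_cap_header). Error handling is abbreviated and the function name
is a placeholder, not part of the patch:

#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#include <sys/ioctl.h>
#include <linux/vfio.h>

/* Sketch: find VFIO_IOMMU_TYPE1_INFO_CAP_MIGRATION in the info cap chain. */
static int query_migration_cap(int container_fd,
                               struct vfio_iommu_type1_info_cap_migration *out)
{
    struct vfio_iommu_type1_info probe = { .argsz = sizeof(probe) };
    struct vfio_iommu_type1_info *info;
    struct vfio_info_cap_header *hdr;
    uint32_t off;
    int ret = -1;

    /* First call learns how much space the kernel needs for capabilities. */
    if (ioctl(container_fd, VFIO_IOMMU_GET_INFO, &probe))
        return -1;

    info = calloc(1, probe.argsz);
    if (!info)
        return -1;
    info->argsz = probe.argsz;

    if (ioctl(container_fd, VFIO_IOMMU_GET_INFO, info))
        goto out;

    if (!(info->flags & VFIO_IOMMU_INFO_CAPS))
        goto out;

    for (off = info->cap_offset; off; off = hdr->next) {
        hdr = (struct vfio_info_cap_header *)((char *)info + off);
        if (hdr->id == VFIO_IOMMU_TYPE1_INFO_CAP_MIGRATION) {
            memcpy(out, hdr, sizeof(*out));
            ret = 0;
            break;
        }
    }
out:
    free(info);
    return ret;
}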

Signed-off-by: Kirti Wankhede 
---
 drivers/vfio/vfio_iommu_type1.c | 24 +++-
 include/uapi/linux/vfio.h   | 21 +
 2 files changed, 44 insertions(+), 1 deletion(-)

diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
index 4358be26ff80..77351497a9c2 100644
--- a/drivers/vfio/vfio_iommu_type1.c
+++ b/drivers/vfio/vfio_iommu_type1.c
@@ -2389,6 +2389,22 @@ static int vfio_iommu_iova_build_caps(struct vfio_iommu 
*iommu,
return ret;
 }
 
+static int vfio_iommu_migration_build_caps(struct vfio_iommu *iommu,
+  struct vfio_info_cap *caps)
+{
+   struct vfio_iommu_type1_info_cap_migration cap_mig;
+
+   cap_mig.header.id = VFIO_IOMMU_TYPE1_INFO_CAP_MIGRATION;
+   cap_mig.header.version = 1;
+   cap_mig.flags = VFIO_IOMMU_INFO_CAPS_MIGRATION_DIRTY_PAGE_TRACK;
+
+   /* support minimum pgsize */
+   cap_mig.pgsize_bitmap = (size_t)1 << __ffs(iommu->pgsize_bitmap);
+   cap_mig.max_dirty_bitmap_size = DIRTY_BITMAP_SIZE_MAX;
+
+   return vfio_info_add_capability(caps, &cap_mig.header, sizeof(cap_mig));
+}
+
 static long vfio_iommu_type1_ioctl(void *iommu_data,
   unsigned int cmd, unsigned long arg)
 {
@@ -2433,10 +2449,16 @@ static long vfio_iommu_type1_ioctl(void *iommu_data,
	mutex_lock(&iommu->lock);
	info.flags = VFIO_IOMMU_INFO_PGSIZES;

+   vfio_pgsize_bitmap(iommu);
	info.iova_pgsizes = iommu->pgsize_bitmap;

-   ret = vfio_iommu_iova_build_caps(iommu, &caps);
+   ret = vfio_iommu_migration_build_caps(iommu, &caps);
+
+   if (!ret)
+   ret = vfio_iommu_iova_build_caps(iommu, &caps);

	mutex_unlock(&iommu->lock);
+
if (ret)
return ret;
 
diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h
index e3cbf8b78623..c90604322798 100644
--- a/include/uapi/linux/vfio.h
+++ b/include/uapi/linux/vfio.h
@@ -1013,6 +1013,27 @@ struct vfio_iommu_type1_info_cap_iova_range {
struct  vfio_iova_range iova_ranges[];
 };
 
+/*
+ * The migration capability allows reporting of the features supported for migration.
+ *
+ * The structures below define version 1 of this capability.
+ *
+ * pgsize_bitmap: Kernel driver returns supported page sizes bitmap for dirty
+ * page tracking.
+ * max_dirty_bitmap_size: Kernel driver returns maximum supported dirty bitmap
+ * size in bytes to be used by user application for ioctls to get dirty bitmap.
+ */
+#define VFIO_IOMMU_TYPE1_INFO_CAP_MIGRATION  1
+
+struct vfio_iommu_type1_info_cap_migration {
+   struct  vfio_info_cap_header header;
+   __u32   flags;
+   /* supports dirty page tracking */
+#define VFIO_IOMMU_INFO_CAPS_MIGRATION_DIRTY_PAGE_TRACK(1 << 0)
+   __u64   pgsize_bitmap;
+   __u64   max_dirty_bitmap_size;  /* in bytes */
+};
+
 #define VFIO_IOMMU_GET_INFO _IO(VFIO_TYPE, VFIO_BASE + 12)
 
 /**
-- 
2.7.0




[PATCH Kernel v19 6/8] vfio iommu: Update UNMAP_DMA ioctl to get dirty bitmap before unmap

2020-05-13 Thread Kirti Wankhede
DMA mapped pages, including those pinned by mdev vendor drivers, might
get unpinned and unmapped while migration is active and the device is still
running. For example, in the pre-copy phase, while the guest driver can
access those pages, the host device or vendor driver can dirty these mapped
pages. Such pages should be marked dirty so as to maintain memory
consistency for a user making use of dirty page tracking.

To get the bitmap during unmap, the user should allocate memory for the
bitmap, set the size of the allocated memory, set the page size to be
considered for the bitmap, and set the flag
VFIO_DMA_UNMAP_FLAG_GET_DIRTY_BITMAP.
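A hedged user-space sketch of such an unmap call is below. It assumes that
struct vfio_iommu_type1_dma_unmap gains a data[] area carrying a struct
vfio_bitmap when the new flag is set, as described above (the uapi hunk is
truncated in this mail), and 'container_fd' and the helper name are
placeholders:

#include <stdint.h>
#include <stdlib.h>
#include <sys/ioctl.h>
#include <linux/vfio.h>

/*
 * Sketch: unmap [iova, iova + size) and collect the dirty bitmap for that
 * range in one VFIO_IOMMU_UNMAP_DMA call.  One bit per 'pgsize' page,
 * bitmap size rounded up to a 64-bit multiple as required by the series.
 */
static int unmap_and_get_dirty(int container_fd, uint64_t iova, uint64_t size,
                               uint64_t pgsize, uint64_t *bitmap_buf)
{
    size_t argsz = sizeof(struct vfio_iommu_type1_dma_unmap) +
                   sizeof(struct vfio_bitmap);
    struct vfio_iommu_type1_dma_unmap *unmap = calloc(1, argsz);
    struct vfio_bitmap *bitmap;
    int ret;

    if (!unmap)
        return -1;

    unmap->argsz = argsz;
    unmap->flags = VFIO_DMA_UNMAP_FLAG_GET_DIRTY_BITMAP;
    unmap->iova = iova;
    unmap->size = size;

    bitmap = (struct vfio_bitmap *)unmap->data;   /* data[] assumed, see above */
    bitmap->pgsize = pgsize;
    bitmap->size = ((size / pgsize + 63) / 64) * sizeof(uint64_t);
    bitmap->data = bitmap_buf;    /* caller-allocated, bitmap->size bytes */

    ret = ioctl(container_fd, VFIO_IOMMU_UNMAP_DMA, unmap);
    free(unmap);
    return ret;
}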

Signed-off-by: Kirti Wankhede 
Reviewed-by: Neo Jia 
---
 drivers/vfio/vfio_iommu_type1.c | 102 +++-
 include/uapi/linux/vfio.h   |  10 
 2 files changed, 99 insertions(+), 13 deletions(-)

diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
index 469b09185b83..4358be26ff80 100644
--- a/drivers/vfio/vfio_iommu_type1.c
+++ b/drivers/vfio/vfio_iommu_type1.c
@@ -195,11 +195,15 @@ static void vfio_unlink_dma(struct vfio_iommu *iommu, 
struct vfio_dma *old)
 static int vfio_dma_bitmap_alloc(struct vfio_dma *dma, size_t pgsize)
 {
uint64_t npages = dma->size / pgsize;
+   size_t bitmap_size;
 
if (npages > DIRTY_BITMAP_PAGES_MAX)
return -EINVAL;
 
-   dma->bitmap = kvzalloc(DIRTY_BITMAP_BYTES(npages), GFP_KERNEL);
+   /* Allocate extra 64 bits which are used for bitmap manipulation */
+   bitmap_size = DIRTY_BITMAP_BYTES(npages) + sizeof(u64);
+
+   dma->bitmap = kvzalloc(bitmap_size, GFP_KERNEL);
if (!dma->bitmap)
return -ENOMEM;
 
@@ -979,23 +983,25 @@ static int verify_bitmap_size(uint64_t npages, uint64_t 
bitmap_size)
 }
 
 static int vfio_dma_do_unmap(struct vfio_iommu *iommu,
-struct vfio_iommu_type1_dma_unmap *unmap)
+struct vfio_iommu_type1_dma_unmap *unmap,
+struct vfio_bitmap *bitmap)
 {
-   uint64_t mask;
struct vfio_dma *dma, *dma_last = NULL;
-   size_t unmapped = 0;
-   int ret = 0, retries = 0;
+   size_t unmapped = 0, pgsize;
+   int ret = 0, retries = 0, cnt = 0;
+   unsigned long pgshift, shift = 0, leftover;
 
	mutex_lock(&iommu->lock);
 
-   mask = ((uint64_t)1 << __ffs(iommu->pgsize_bitmap)) - 1;
+   pgshift = __ffs(iommu->pgsize_bitmap);
+   pgsize = (size_t)1 << pgshift;
 
-   if (unmap->iova & mask) {
+   if (unmap->iova & (pgsize - 1)) {
ret = -EINVAL;
goto unlock;
}
 
-   if (!unmap->size || unmap->size & mask) {
+   if (!unmap->size || unmap->size & (pgsize - 1)) {
ret = -EINVAL;
goto unlock;
}
@@ -1006,9 +1012,15 @@ static int vfio_dma_do_unmap(struct vfio_iommu *iommu,
goto unlock;
}
 
-   WARN_ON(mask & PAGE_MASK);
-again:
+   /* When dirty tracking is enabled, allow only min supported pgsize */
+   if ((unmap->flags & VFIO_DMA_UNMAP_FLAG_GET_DIRTY_BITMAP) &&
+   (!iommu->dirty_page_tracking || (bitmap->pgsize != pgsize))) {
+   ret = -EINVAL;
+   goto unlock;
+   }
 
+   WARN_ON((pgsize - 1) & PAGE_MASK);
+again:
/*
 * vfio-iommu-type1 (v1) - User mappings were coalesced together to
 * avoid tracking individual mappings.  This means that the granularity
@@ -1046,6 +1058,7 @@ static int vfio_dma_do_unmap(struct vfio_iommu *iommu,
ret = -EINVAL;
goto unlock;
}
+
dma = vfio_find_dma(iommu, unmap->iova + unmap->size - 1, 0);
if (dma && dma->iova + dma->size != unmap->iova + unmap->size) {
ret = -EINVAL;
@@ -1063,6 +1076,39 @@ static int vfio_dma_do_unmap(struct vfio_iommu *iommu,
if (dma->task->mm != current->mm)
break;
 
+   if ((unmap->flags & VFIO_DMA_UNMAP_FLAG_GET_DIRTY_BITMAP) &&
+   (dma_last != dma)) {
+   unsigned int nbits = dma->size >> pgshift;
+   int curr_lcnt = nbits / BITS_PER_LONG;
+
+   /*
+* mark all pages dirty if all pages are pinned and
+* mapped.
+*/
+   if (dma->iommu_mapped)
+   bitmap_set(dma->bitmap, 0, nbits);
+
+   if (shift) {
+   bitmap_shift_left(dma->bitmap, dma->bitmap,
+ shift, nbits + shift);
+   bitmap_or(dma->bitmap, dma-&

[PATCH Kernel v19 5/8] vfio iommu: Implementation of ioctl for dirty pages tracking

2020-05-13 Thread Kirti Wankhede
VFIO_IOMMU_DIRTY_PAGES ioctl performs three operations:
- Start dirty pages tracking while migration is active
- Stop dirty pages tracking.
- Get dirty pages bitmap. It is the user space application's responsibility
  to copy the content of dirty pages from source to destination during
  migration.

To prevent a DoS attack, memory for the bitmap is allocated per vfio_dma
structure. The bitmap size is calculated considering the smallest supported
page size. Bitmaps are allocated for all vfio_dmas when dirty logging is
enabled.

The bitmap is populated for already pinned pages when the bitmap is allocated
for a vfio_dma with the smallest supported page size. The bitmap is also
updated from the pinning functions when tracking is enabled. When the user
application queries the bitmap, check whether the requested page size is the
same as the page size used to populate the bitmap. If it is equal, copy the
bitmap; if not, return an error.
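For a sense of scale, a small stand-alone sketch of the per-vfio_dma bitmap
overhead, mirroring the DIRTY_BITMAP_BYTES() rounding in the hunk below
('dirty_bitmap_bytes' is an illustrative helper, not part of the patch):

#include <stdint.h>
#include <stdio.h>

/*
 * Sketch: one bit per smallest-supported page, rounded up to a 64-bit
 * multiple, exactly like DIRTY_BITMAP_BYTES() in the patch.
 */
static uint64_t dirty_bitmap_bytes(uint64_t mapping_size, uint64_t pgsize)
{
    uint64_t npages = mapping_size / pgsize;

    return ((npages + 63) / 64) * 8;
}

int main(void)
{
    /* A 1 GiB mapping with 4 KiB pages needs only 32 KiB of bitmap. */
    printf("%llu bytes\n",
           (unsigned long long)dirty_bitmap_bytes(1ULL << 30, 4096));
    return 0;
}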

Signed-off-by: Kirti Wankhede 
Reviewed-by: Neo Jia 

Fixed error reported by build bot by changing pgsize type from uint64_t
to size_t.
Reported-by: kbuild test robot 
---
 drivers/vfio/vfio_iommu_type1.c | 274 +++-
 1 file changed, 268 insertions(+), 6 deletions(-)

diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
index 6f09fbabed12..469b09185b83 100644
--- a/drivers/vfio/vfio_iommu_type1.c
+++ b/drivers/vfio/vfio_iommu_type1.c
@@ -72,6 +72,7 @@ struct vfio_iommu {
uint64_tpgsize_bitmap;
boolv2;
boolnesting;
+   booldirty_page_tracking;
 };
 
 struct vfio_domain {
@@ -92,6 +93,7 @@ struct vfio_dma {
boollock_cap;   /* capable(CAP_IPC_LOCK) */
struct task_struct  *task;
struct rb_root  pfn_list;   /* Ex-user pinned pfn list */
+   unsigned long   *bitmap;
 };
 
 struct vfio_group {
@@ -126,6 +128,19 @@ struct vfio_regions {
 #define IS_IOMMU_CAP_DOMAIN_IN_CONTAINER(iommu)\
	(!list_empty(&iommu->domain_list))
 
+#define DIRTY_BITMAP_BYTES(n)  (ALIGN(n, BITS_PER_TYPE(u64)) / BITS_PER_BYTE)
+
+/*
+ * The number-of-bits argument to bitmap_set() is an unsigned integer, which
+ * is further cast to a signed integer for the unaligned multi-bit operation,
+ * __bitmap_set().
+ * The maximum supported bitmap size is therefore 2^31 bits, i.e. 2^31 / 2^3
+ * bits/byte = 2^28 bytes (256 MB), which covers 2^31 * 2^12 = 2^43 bytes
+ * (8 TB) on a 4K page system.
+ */
+#define DIRTY_BITMAP_PAGES_MAX  ((u64)INT_MAX)
+#define DIRTY_BITMAP_SIZE_MAX   DIRTY_BITMAP_BYTES(DIRTY_BITMAP_PAGES_MAX)
+
 static int put_pfn(unsigned long pfn, int prot);
 
 /*
@@ -176,6 +191,77 @@ static void vfio_unlink_dma(struct vfio_iommu *iommu, 
struct vfio_dma *old)
	rb_erase(&old->node, &iommu->dma_list);
 }
 
+
+static int vfio_dma_bitmap_alloc(struct vfio_dma *dma, size_t pgsize)
+{
+   uint64_t npages = dma->size / pgsize;
+
+   if (npages > DIRTY_BITMAP_PAGES_MAX)
+   return -EINVAL;
+
+   dma->bitmap = kvzalloc(DIRTY_BITMAP_BYTES(npages), GFP_KERNEL);
+   if (!dma->bitmap)
+   return -ENOMEM;
+
+   return 0;
+}
+
+static void vfio_dma_bitmap_free(struct vfio_dma *dma)
+{
+   kfree(dma->bitmap);
+   dma->bitmap = NULL;
+}
+
+static void vfio_dma_populate_bitmap(struct vfio_dma *dma, size_t pgsize)
+{
+   struct rb_node *p;
+
+   if (RB_EMPTY_ROOT(&dma->pfn_list))
+   return;
+
+   for (p = rb_first(&dma->pfn_list); p; p = rb_next(p)) {
+   struct vfio_pfn *vpfn = rb_entry(p, struct vfio_pfn, node);
+
+   bitmap_set(dma->bitmap, (vpfn->iova - dma->iova) / pgsize, 1);
+   }
+}
+
+static int vfio_dma_bitmap_alloc_all(struct vfio_iommu *iommu, size_t pgsize)
+{
+   struct rb_node *n = rb_first(&iommu->dma_list);
+
+   for (; n; n = rb_next(n)) {
+   struct vfio_dma *dma = rb_entry(n, struct vfio_dma, node);
+   int ret;
+
+   ret = vfio_dma_bitmap_alloc(dma, pgsize);
+   if (ret) {
+   struct rb_node *p = rb_prev(n);
+
+   for (; p; p = rb_prev(p)) {
+   struct vfio_dma *dma = rb_entry(p,
+   struct vfio_dma, node);
+
+   vfio_dma_bitmap_free(dma);
+   }
+   return ret;
+   }
+   vfio_dma_populate_bitmap(dma, pgsize);
+   }
+   return 0;
+}
+
+static void vfio_dma_bitmap_free_all(struct vfio_iommu *iommu)
+{
+   struct rb_node *n = rb_first(&iommu->dma_list);
+
+   for (; n; n = rb_next(n)) {
+   struct vfio_dma *dma = rb_entry(n, struct vfio_dma, node);
+
+   vfio_dma_bitmap_free(dma);
+   }
+}
+
 /*
  * Helper Functions for host iova-pfn list
  */
@@ -568,6 +654,17 @@ static

[PATCH Kernel v19 3/8] vfio iommu: Cache pgsize_bitmap in struct vfio_iommu

2020-05-13 Thread Kirti Wankhede
Calculate and cache pgsize_bitmap when iommu->domain_list is updated.
Add iommu->lock protection when cached pgsize_bitmap is accessed.

Signed-off-by: Kirti Wankhede 
Reviewed-by: Neo Jia 
---
 drivers/vfio/vfio_iommu_type1.c | 87 +++--
 1 file changed, 48 insertions(+), 39 deletions(-)

diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
index fa735047b04d..6f09fbabed12 100644
--- a/drivers/vfio/vfio_iommu_type1.c
+++ b/drivers/vfio/vfio_iommu_type1.c
@@ -69,6 +69,7 @@ struct vfio_iommu {
struct rb_root  dma_list;
struct blocking_notifier_head notifier;
unsigned intdma_avail;
+   uint64_tpgsize_bitmap;
boolv2;
boolnesting;
 };
@@ -805,15 +806,14 @@ static void vfio_remove_dma(struct vfio_iommu *iommu, 
struct vfio_dma *dma)
iommu->dma_avail++;
 }
 
-static unsigned long vfio_pgsize_bitmap(struct vfio_iommu *iommu)
+static void vfio_pgsize_bitmap(struct vfio_iommu *iommu)
 {
struct vfio_domain *domain;
-   unsigned long bitmap = ULONG_MAX;
 
-   mutex_lock(&iommu->lock);
+   iommu->pgsize_bitmap = ULONG_MAX;
+
	list_for_each_entry(domain, &iommu->domain_list, next)
-   bitmap &= domain->domain->pgsize_bitmap;
-   mutex_unlock(&iommu->lock);
+   iommu->pgsize_bitmap &= domain->domain->pgsize_bitmap;
 
/*
 * In case the IOMMU supports page sizes smaller than PAGE_SIZE
@@ -823,12 +823,10 @@ static unsigned long vfio_pgsize_bitmap(struct vfio_iommu 
*iommu)
 * granularity while iommu driver can use the sub-PAGE_SIZE size
 * to map the buffer.
 */
-   if (bitmap & ~PAGE_MASK) {
-   bitmap &= PAGE_MASK;
-   bitmap |= PAGE_SIZE;
+   if (iommu->pgsize_bitmap & ~PAGE_MASK) {
+   iommu->pgsize_bitmap &= PAGE_MASK;
+   iommu->pgsize_bitmap |= PAGE_SIZE;
}
-
-   return bitmap;
 }
 
 static int vfio_dma_do_unmap(struct vfio_iommu *iommu,
@@ -839,19 +837,28 @@ static int vfio_dma_do_unmap(struct vfio_iommu *iommu,
size_t unmapped = 0;
int ret = 0, retries = 0;
 
-   mask = ((uint64_t)1 << __ffs(vfio_pgsize_bitmap(iommu))) - 1;
+   mutex_lock(&iommu->lock);
+
+   mask = ((uint64_t)1 << __ffs(iommu->pgsize_bitmap)) - 1;
+
+   if (unmap->iova & mask) {
+   ret = -EINVAL;
+   goto unlock;
+   }
+
+   if (!unmap->size || unmap->size & mask) {
+   ret = -EINVAL;
+   goto unlock;
+   }
 
-   if (unmap->iova & mask)
-   return -EINVAL;
-   if (!unmap->size || unmap->size & mask)
-   return -EINVAL;
if (unmap->iova + unmap->size - 1 < unmap->iova ||
-   unmap->size > SIZE_MAX)
-   return -EINVAL;
+   unmap->size > SIZE_MAX) {
+   ret = -EINVAL;
+   goto unlock;
+   }
 
WARN_ON(mask & PAGE_MASK);
 again:
-   mutex_lock(&iommu->lock);
 
/*
 * vfio-iommu-type1 (v1) - User mappings were coalesced together to
@@ -930,6 +937,7 @@ static int vfio_dma_do_unmap(struct vfio_iommu *iommu,
	blocking_notifier_call_chain(&iommu->notifier,
VFIO_IOMMU_NOTIFY_DMA_UNMAP,
_unmap);
+   mutex_lock(&iommu->lock);
goto again;
}
unmapped += dma->size;
@@ -1045,24 +1053,28 @@ static int vfio_dma_do_map(struct vfio_iommu *iommu,
if (map->size != size || map->vaddr != vaddr || map->iova != iova)
return -EINVAL;
 
-   mask = ((uint64_t)1 << __ffs(vfio_pgsize_bitmap(iommu))) - 1;
-
-   WARN_ON(mask & PAGE_MASK);
-
/* READ/WRITE from device perspective */
if (map->flags & VFIO_DMA_MAP_FLAG_WRITE)
prot |= IOMMU_WRITE;
if (map->flags & VFIO_DMA_MAP_FLAG_READ)
prot |= IOMMU_READ;
 
-   if (!prot || !size || (size | iova | vaddr) & mask)
-   return -EINVAL;
+   mutex_lock(&iommu->lock);
 
-   /* Don't allow IOVA or virtual address wrap */
-   if (iova + size - 1 < iova || vaddr + size - 1 < vaddr)
-   return -EINVAL;
+   mask = ((uint64_t)1 << __ffs(iommu->pgsize_bitmap)) - 1;
 
-   mutex_lock(&iommu->lock);
+   WARN_ON(mask & PAGE_MASK);
+
+   if (!prot || !size || (size | iova | vaddr) & mask) {
+   ret = -EINVAL;
+   goto out_unlock;
+   }
+
+   /* Don't allow IOVA or virtual address wrap */
+   if (iova + size - 1 < iova || vaddr + size - 1 < vaddr) {
+

[PATCH Kernel v19 2/8] vfio iommu: Remove atomicity of ref_count of pinned pages

2020-05-13 Thread Kirti Wankhede
vfio_pfn.ref_count is always updated while holding iommu->lock, so using an
atomic variable is overkill.

Signed-off-by: Kirti Wankhede 
Reviewed-by: Neo Jia 
Reviewed-by: Eric Auger 
Reviewed-by: Cornelia Huck 
---
 drivers/vfio/vfio_iommu_type1.c | 9 +
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
index a0c60f895b24..fa735047b04d 100644
--- a/drivers/vfio/vfio_iommu_type1.c
+++ b/drivers/vfio/vfio_iommu_type1.c
@@ -112,7 +112,7 @@ struct vfio_pfn {
struct rb_node  node;
dma_addr_t  iova;   /* Device address */
unsigned long   pfn;/* Host pfn */
-   atomic_tref_count;
+   unsigned intref_count;
 };
 
 struct vfio_regions {
@@ -233,7 +233,7 @@ static int vfio_add_to_pfn_list(struct vfio_dma *dma, 
dma_addr_t iova,
 
vpfn->iova = iova;
vpfn->pfn = pfn;
-   atomic_set(&vpfn->ref_count, 1);
+   vpfn->ref_count = 1;
vfio_link_pfn(dma, vpfn);
return 0;
 }
@@ -251,7 +251,7 @@ static struct vfio_pfn *vfio_iova_get_vfio_pfn(struct 
vfio_dma *dma,
struct vfio_pfn *vpfn = vfio_find_vpfn(dma, iova);
 
if (vpfn)
-   atomic_inc(&vpfn->ref_count);
+   vpfn->ref_count++;
return vpfn;
 }
 
@@ -259,7 +259,8 @@ static int vfio_iova_put_vfio_pfn(struct vfio_dma *dma, 
struct vfio_pfn *vpfn)
 {
int ret = 0;
 
-   if (atomic_dec_and_test(&vpfn->ref_count)) {
+   vpfn->ref_count--;
+   if (!vpfn->ref_count) {
ret = put_pfn(vpfn->pfn, dma->prot);
vfio_remove_from_pfn_list(dma, vpfn);
}
-- 
2.7.0




[PATCH Kernel v19 4/8] vfio iommu: Add ioctl definition for dirty pages tracking.

2020-05-13 Thread Kirti Wankhede
The IOMMU container maintains a list of all pages pinned by the
vfio_pin_pages API. All pages pinned by a vendor driver through this API
should be considered dirty during migration. When the container contains an
IOMMU-capable device and all pages are pinned and mapped, then all pages are
marked dirty.
Added support to start/stop tracking of dirtied pages and to get a bitmap of
all dirtied pages for a requested IO virtual address range.
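For illustration, a minimal user-space sketch of the start/get-bitmap
sequence using the structures defined in the hunk below (error handling is
omitted; 'container_fd' and the function name are placeholders):

#include <stdint.h>
#include <stdlib.h>
#include <sys/ioctl.h>
#include <linux/vfio.h>

/* Sketch: start tracking, then fetch the dirty bitmap for one IOVA range. */
static void dirty_pages_example(int container_fd, uint64_t iova, uint64_t size,
                                uint64_t pgsize, uint64_t *bitmap_buf)
{
    struct vfio_iommu_type1_dirty_bitmap start = {
        .argsz = sizeof(start),
        .flags = VFIO_IOMMU_DIRTY_PAGES_FLAG_START,
    };
    size_t argsz = sizeof(struct vfio_iommu_type1_dirty_bitmap) +
                   sizeof(struct vfio_iommu_type1_dirty_bitmap_get);
    struct vfio_iommu_type1_dirty_bitmap *get = calloc(1, argsz);
    struct vfio_iommu_type1_dirty_bitmap_get *range = (void *)get->data;

    ioctl(container_fd, VFIO_IOMMU_DIRTY_PAGES, &start);

    get->argsz = argsz;
    get->flags = VFIO_IOMMU_DIRTY_PAGES_FLAG_GET_BITMAP;
    range->iova = iova;
    range->size = size;
    range->bitmap.pgsize = pgsize;
    /* one bit per page, rounded up to a 64-bit multiple */
    range->bitmap.size = ((size / pgsize + 63) / 64) * sizeof(uint64_t);
    range->bitmap.data = bitmap_buf;    /* caller-allocated buffer */

    ioctl(container_fd, VFIO_IOMMU_DIRTY_PAGES, get);
    free(get);
}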

Signed-off-by: Kirti Wankhede 
Reviewed-by: Neo Jia 
---
 include/uapi/linux/vfio.h | 56 +++
 1 file changed, 56 insertions(+)

diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h
index ad9bb5af3463..5f359c63f5ef 100644
--- a/include/uapi/linux/vfio.h
+++ b/include/uapi/linux/vfio.h
@@ -1033,6 +1033,12 @@ struct vfio_iommu_type1_dma_map {
 
 #define VFIO_IOMMU_MAP_DMA _IO(VFIO_TYPE, VFIO_BASE + 13)
 
+struct vfio_bitmap {
+   __u64pgsize;/* page size for bitmap in bytes */
+   __u64size;  /* in bytes */
+   __u64 __user *data; /* one bit per page */
+};
+
 /**
  * VFIO_IOMMU_UNMAP_DMA - _IOWR(VFIO_TYPE, VFIO_BASE + 14,
  * struct vfio_dma_unmap)
@@ -1059,6 +1065,56 @@ struct vfio_iommu_type1_dma_unmap {
 #define VFIO_IOMMU_ENABLE  _IO(VFIO_TYPE, VFIO_BASE + 15)
 #define VFIO_IOMMU_DISABLE _IO(VFIO_TYPE, VFIO_BASE + 16)
 
+/**
+ * VFIO_IOMMU_DIRTY_PAGES - _IOWR(VFIO_TYPE, VFIO_BASE + 17,
+ * struct vfio_iommu_type1_dirty_bitmap)
+ * IOCTL is used for dirty pages tracking.
+ * Caller should set flag depending on which operation to perform, details as
+ * below:
+ *
+ * When IOCTL is called with VFIO_IOMMU_DIRTY_PAGES_FLAG_START set, indicates
+ * migration is active and IOMMU module should track pages which are dirtied or
+ * potentially dirtied by device.
+ * Dirty pages are tracked until tracking is stopped by user application by
+ * setting VFIO_IOMMU_DIRTY_PAGES_FLAG_STOP flag.
+ *
+ * When IOCTL is called with VFIO_IOMMU_DIRTY_PAGES_FLAG_STOP set, indicates
+ * IOMMU should stop tracking dirtied pages.
+ *
+ * When the IOCTL is called with the VFIO_IOMMU_DIRTY_PAGES_FLAG_GET_BITMAP
+ * flag set, the IOCTL returns the dirty pages bitmap of the IOMMU container
+ * during migration for the given IOVA range. The user must provide data[] as
+ * the structure vfio_iommu_type1_dirty_bitmap_get, through which the user
+ * provides the IOVA range and pgsize. The IOVA range must match that used in
+ * the original mapping call. This interface only supports getting the bitmap
+ * at the smallest supported pgsize; it can be extended in the future to get
+ * the bitmap at a specified pgsize.
+ * The user must allocate memory for the bitmap and set the size of the
+ * allocated memory in the bitmap.size field. One bit is used to represent one
+ * page, consecutively starting from the iova offset. The user should provide
+ * the page size in the bitmap.pgsize field. A bit set in the bitmap indicates
+ * that the page at that offset from iova is dirty. The caller must set argsz
+ * to include the size of structure vfio_iommu_type1_dirty_bitmap_get.
+ *
+ * Only one of the flags _START, _STOP and _GET_BITMAP may be specified at a
+ * time.
+ *
+ */
+struct vfio_iommu_type1_dirty_bitmap {
+   __u32argsz;
+   __u32flags;
+#define VFIO_IOMMU_DIRTY_PAGES_FLAG_START  (1 << 0)
+#define VFIO_IOMMU_DIRTY_PAGES_FLAG_STOP   (1 << 1)
+#define VFIO_IOMMU_DIRTY_PAGES_FLAG_GET_BITMAP (1 << 2)
+   __u8 data[];
+};
+
+struct vfio_iommu_type1_dirty_bitmap_get {
+   __u64  iova;/* IO virtual address */
+   __u64  size;/* Size of iova range */
+   struct vfio_bitmap bitmap;
+};
+
+#define VFIO_IOMMU_DIRTY_PAGES _IO(VFIO_TYPE, VFIO_BASE + 17)
+
 /*  Additional API for SPAPR TCE (Server POWERPC) IOMMU  */
 
 /*
-- 
2.7.0




[PATCH Kernel v19 1/8] vfio: UAPI for migration interface for device state

2020-05-13 Thread Kirti Wankhede
- Defined MIGRATION region type and sub-type.

- Defined vfio_device_migration_info structure which will be placed at the
  0th offset of migration region to get/set VFIO device related
  information. Defined members of structure and usage on read/write access.

- Defined device states and state transition details.

- Defined sequence to be followed while saving and resuming VFIO device.

Signed-off-by: Kirti Wankhede 
Reviewed-by: Neo Jia 
---
 include/uapi/linux/vfio.h | 228 ++
 1 file changed, 228 insertions(+)

diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h
index 015516bcfaa3..ad9bb5af3463 100644
--- a/include/uapi/linux/vfio.h
+++ b/include/uapi/linux/vfio.h
@@ -305,6 +305,7 @@ struct vfio_region_info_cap_type {
 #define VFIO_REGION_TYPE_PCI_VENDOR_MASK   (0x)
 #define VFIO_REGION_TYPE_GFX(1)
 #define VFIO_REGION_TYPE_CCW   (2)
+#define VFIO_REGION_TYPE_MIGRATION  (3)
 
 /* sub-types for VFIO_REGION_TYPE_PCI_* */
 
@@ -379,6 +380,233 @@ struct vfio_region_gfx_edid {
 /* sub-types for VFIO_REGION_TYPE_CCW */
 #define VFIO_REGION_SUBTYPE_CCW_ASYNC_CMD  (1)
 
+/* sub-types for VFIO_REGION_TYPE_MIGRATION */
+#define VFIO_REGION_SUBTYPE_MIGRATION   (1)
+
+/*
+ * The structure vfio_device_migration_info is placed at the 0th offset of
+ * the VFIO_REGION_SUBTYPE_MIGRATION region to get and set VFIO device related
+ * migration information. Field accesses from this structure are only supported
+ * at their native width and alignment. Otherwise, the result is undefined and
+ * vendor drivers should return an error.
+ *
+ * device_state: (read/write)
+ *  - The user application writes to this field to inform the vendor driver
+ *about the device state to be transitioned to.
+ *  - The vendor driver should take the necessary actions to change the
+ *device state. After successful transition to a given state, the
+ *vendor driver should return success on write(device_state, state)
+ *system call. If the device state transition fails, the vendor driver
+ *should return an appropriate -errno for the fault condition.
+ *  - On the user application side, if the device state transition fails,
+ *   that is, if write(device_state, state) returns an error, read
+ *   device_state again to determine the current state of the device from
+ *   the vendor driver.
+ *  - The vendor driver should return previous state of the device unless
+ *the vendor driver has encountered an internal error, in which case
+ *the vendor driver may report the device_state VFIO_DEVICE_STATE_ERROR.
+ *  - The user application must use the device reset ioctl to recover the
+ *device from VFIO_DEVICE_STATE_ERROR state. If the device is
+ *indicated to be in a valid device state by reading device_state, the
+ *user application may attempt to transition the device to any valid
+ *state reachable from the current state or terminate itself.
+ *
+ *  device_state consists of 3 bits:
+ *  - If bit 0 is set, it indicates the _RUNNING state. If bit 0 is clear,
+ *it indicates the _STOP state. When the device state is changed to
+ *_STOP, driver should stop the device before write() returns.
+ *  - If bit 1 is set, it indicates the _SAVING state, which means that the
+ *driver should start gathering device state information that will be
+ *provided to the VFIO user application to save the device's state.
+ *  - If bit 2 is set, it indicates the _RESUMING state, which means that
+ *the driver should prepare to resume the device. Data provided through
+ *the migration region should be used to resume the device.
+ *  Bits 3 - 31 are reserved for future use. To preserve them, the user
+ *  application should perform a read-modify-write operation on this
+ *  field when modifying the specified bits.
+ *
+ *  +--- _RESUMING
+ *  |+-- _SAVING
+ *  ||+- _RUNNING
+ *  |||
+ *  000b => Device Stopped, not saving or resuming
+ *  001b => Device running, which is the default state
+ *  010b => Stop the device & save the device state, stop-and-copy state
+ *  011b => Device running and save the device state, pre-copy state
+ *  100b => Device stopped and the device state is resuming
+ *  101b => Invalid state
+ *  110b => Error state
+ *  111b => Invalid state
+ *
+ * State transitions:
+ *
+ *  _RESUMING  _RUNNINGPre-copyStop-and-copy   _STOP
+ *(100b) (001b) (011b)(010b)   (000b)
+ * 0. Running or default state
+ * |
+ *
+ * 1. Normal Shutdown (optional)
+ * |->|
+ *
+ * 2. Save the state or suspend
+ * |--

[PATCH Kernel v19 0/8] Add UAPIs to support migration for VFIO devices

2020-05-13 Thread Kirti Wankhede
on Alex's
  suggestion
- Updated get dirty pages sequence.
- As per Cornelia Huck's suggestion, added callbacks to VFIODeviceOps to
  get_object, save_config and load_config.
- Fixed multiple nit picks.
- Tested live migration with multiple vfio device assigned to a VM.

v3 -> v4:
- Added one more bit for _RESUMING flag to be set explicitly.
- data_offset field is read-only for user space application.
- data_size is read on every iteration before reading data from the
  migration region; this removes the assumption that data extends to the
  end of the migration region.
- If the vendor driver supports mappable sparse regions, map those regions
  during the setup state of save/load, and similarly unmap them from the
  cleanup routines.
- Handled a race condition that caused data corruption in the migration
  region during save of the device state, by adding a mutex and serializing
  the save_buffer and get_dirty_pages routines.
- Skipped calling the get_dirty_pages routine for the mapped MMIO region of
  the device.
- Added trace events.
- Split into multiple functional patches.

v2 -> v3:
- Removed enum of VFIO device states. Defined VFIO device state with 2
  bits.
- Re-structured vfio_device_migration_info to keep it minimal and defined
  action on read and write access on its members.

v1 -> v2:
- Defined MIGRATION region type and sub-type which should be used with
  region type capability.
- Re-structured vfio_device_migration_info. This structure will be placed
  at 0th offset of migration region.
- Replaced ioctl with read/write for trapped part of migration region.
- Added both type of access support, trapped or mmapped, for data section
  of the region.
- Moved PCI device functions to pci file.
- Added iteration to get dirty page bitmap until bitmap for all requested
  pages are copied.

Thanks,
Kirti



Kirti Wankhede (8):
  vfio: UAPI for migration interface for device state
  vfio iommu: Remove atomicity of ref_count of pinned pages
  vfio iommu: Cache pgsize_bitmap in struct vfio_iommu
  vfio iommu: Add ioctl definition for dirty pages tracking.
  vfio iommu: Implementation of ioctl for dirty pages tracking
  vfio iommu: Update UNMAP_DMA ioctl to get dirty bitmap before unmap
  vfio iommu: Add migration capability to report supported features
  vfio: Selective dirty page tracking if IOMMU backed device pins pages

 drivers/vfio/vfio.c |  13 +-
 drivers/vfio/vfio_iommu_type1.c | 570 
 include/linux/vfio.h|   4 +-
 include/uapi/linux/vfio.h   | 315 ++
 4 files changed, 842 insertions(+), 60 deletions(-)

-- 
2.7.0




Re: [PATCH Kernel v18 4/7] vfio iommu: Implementation of ioctl for dirty pages tracking.

2020-05-13 Thread Kirti Wankhede




On 5/6/2020 4:24 PM, Cornelia Huck wrote:

On Mon, 4 May 2020 21:28:56 +0530
Kirti Wankhede  wrote:


VFIO_IOMMU_DIRTY_PAGES ioctl performs three operations:
- Start dirty pages tracking while migration is active
- Stop dirty pages tracking.
- Get dirty pages bitmap. It is the user space application's responsibility
   to copy the content of dirty pages from source to destination during
   migration.

To prevent a DoS attack, memory for the bitmap is allocated per vfio_dma
structure. The bitmap size is calculated considering the smallest supported
page size. Bitmaps are allocated for all vfio_dmas when dirty logging is
enabled.

The bitmap is populated for already pinned pages when the bitmap is allocated
for a vfio_dma with the smallest supported page size. The bitmap is also
updated from the pinning functions when tracking is enabled. When the user
application queries the bitmap, check whether the requested page size is the
same as the page size used to populate the bitmap. If it is equal, copy the
bitmap; if not, return an error.

Fixed below error by changing pgsize type from uint64_t to size_t.
Reported-by: kbuild test robot 

All errors:
drivers/vfio/vfio_iommu_type1.c:197: undefined reference to `__udivdi3'

drivers/vfio/vfio_iommu_type1.c:225: undefined reference to `__udivdi3'


Move that below the '---' delimiter so that it does not end up in the
commit? (Crediting the build bot is fine, but the details are not
really useful when you look at the code later.)



ok, removing errors.



Signed-off-by: Kirti Wankhede 
Reviewed-by: Neo Jia 
---
  drivers/vfio/vfio_iommu_type1.c | 266 +++-
  1 file changed, 260 insertions(+), 6 deletions(-)



@@ -2278,6 +2435,93 @@ static long vfio_iommu_type1_ioctl(void *iommu_data,
  
		return copy_to_user((void __user *)arg, &info, minsz) ?

-EFAULT : 0;
+   } else if (cmd == VFIO_IOMMU_DIRTY_PAGES) {
+   struct vfio_iommu_type1_dirty_bitmap dirty;
+   uint32_t mask = VFIO_IOMMU_DIRTY_PAGES_FLAG_START |
+   VFIO_IOMMU_DIRTY_PAGES_FLAG_STOP |
+   VFIO_IOMMU_DIRTY_PAGES_FLAG_GET_BITMAP;
+   int ret = 0;
+
+   if (!iommu->v2)
+   return -EACCES;
+
+   minsz = offsetofend(struct vfio_iommu_type1_dirty_bitmap,
+   flags);
+
+   if (copy_from_user(&dirty, (void __user *)arg, minsz))
+   return -EFAULT;
+
+   if (dirty.argsz < minsz || dirty.flags & ~mask)
+   return -EINVAL;
+
+   /* only one flag should be set at a time */
+   if (__ffs(dirty.flags) != __fls(dirty.flags))
+   return -EINVAL;
+


Shouldn't you also check whether the flag that is set is actually
valid? (maybe dirty.flags & ~VFIO_IOMMU_DIRTY_PAGES_FLAG_MASK and do a
switch/case over dirty.flags & VFIO_IOMMU_DIRTY_PAGES_FLAG_MASK)



There is a check above this check, dirty.flags & ~mask, which makes sure 
that flag is valid.


Thanks,
Kirti




Re: [PATCH Kernel v18 5/7] vfio iommu: Update UNMAP_DMA ioctl to get dirty bitmap before unmap

2020-05-12 Thread Kirti Wankhede




On 5/7/2020 3:55 AM, Alex Williamson wrote:

On Mon, 4 May 2020 21:28:57 +0530
Kirti Wankhede  wrote:


DMA mapped pages, including those pinned by mdev vendor drivers, might
get unpinned and unmapped while migration is active and device is still
running. For example, in pre-copy phase while guest driver could access
those pages, host device or vendor driver can dirty these mapped pages.
Such pages should be marked dirty so as to maintain memory consistency
for a user making use of dirty page tracking.

To get bitmap during unmap, user should allocate memory for bitmap, set
size of allocated memory, set page size to be considered for bitmap and
set flag VFIO_DMA_UNMAP_FLAG_GET_DIRTY_BITMAP.

Signed-off-by: Kirti Wankhede 
Reviewed-by: Neo Jia 
---
  drivers/vfio/vfio_iommu_type1.c | 84 +++--
  include/uapi/linux/vfio.h   | 10 +
  2 files changed, 90 insertions(+), 4 deletions(-)

diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
index 01dcb417836f..8b27faf1ec38 100644
--- a/drivers/vfio/vfio_iommu_type1.c
+++ b/drivers/vfio/vfio_iommu_type1.c
@@ -983,12 +983,14 @@ static int verify_bitmap_size(uint64_t npages, uint64_t 
bitmap_size)
  }
  
  static int vfio_dma_do_unmap(struct vfio_iommu *iommu,

-struct vfio_iommu_type1_dma_unmap *unmap)
+struct vfio_iommu_type1_dma_unmap *unmap,
+struct vfio_bitmap *bitmap)
  {
uint64_t mask;
struct vfio_dma *dma, *dma_last = NULL;
size_t unmapped = 0;
int ret = 0, retries = 0;
+   unsigned long *final_bitmap = NULL, *temp_bitmap = NULL;
  
  	mask = ((uint64_t)1 << __ffs(vfio_pgsize_bitmap(iommu))) - 1;
  
@@ -1041,6 +1043,7 @@ static int vfio_dma_do_unmap(struct vfio_iommu *iommu,

ret = -EINVAL;
goto unlock;
}
+
dma = vfio_find_dma(iommu, unmap->iova + unmap->size - 1, 0);
if (dma && dma->iova + dma->size != unmap->iova + unmap->size) {
ret = -EINVAL;
@@ -1048,6 +1051,22 @@ static int vfio_dma_do_unmap(struct vfio_iommu *iommu,
}
}
  
+	if ((unmap->flags & VFIO_DMA_UNMAP_FLAG_GET_DIRTY_BITMAP) &&

+iommu->dirty_page_tracking) {


Why do we even accept VFIO_DMA_UNMAP_FLAG_GET_DIRTY_BITMAP when not
dirty page tracking rather than returning -EINVAL?  It would simplify
things here to reject it at the ioctl and silently ignoring a flag is
rarely if ever the right approach.


+   final_bitmap = kvzalloc(bitmap->size, GFP_KERNEL);
+   if (!final_bitmap) {
+   ret = -ENOMEM;
+   goto unlock;
+   }
+
+   temp_bitmap = kvzalloc(bitmap->size, GFP_KERNEL);
+   if (!temp_bitmap) {
+   ret = -ENOMEM;
+   kfree(final_bitmap);
+   goto unlock;
+   }


YIKES!  So the user can instantly trigger the kernel to internally
allocate 2 x 256MB, regardless of how much they can actually map.



That is the worst-case scenario. I don't think it will ever be hit in
practice. More comments below regarding this.



+   }
+
while ((dma = vfio_find_dma(iommu, unmap->iova, unmap->size))) {
if (!iommu->v2 && unmap->iova > dma->iova)
break;
@@ -1058,6 +1077,24 @@ static int vfio_dma_do_unmap(struct vfio_iommu *iommu,
if (dma->task->mm != current->mm)
break;
  
+		if ((unmap->flags & VFIO_DMA_UNMAP_FLAG_GET_DIRTY_BITMAP) &&

+iommu->dirty_page_tracking) {
+   unsigned long pgshift = __ffs(bitmap->pgsize);
+   unsigned int npages = dma->size >> pgshift;
+   unsigned int shift;
+
+   vfio_iova_dirty_bitmap(iommu, dma->iova, dma->size,
+   bitmap->pgsize, (u64 *)temp_bitmap);


vfio_iova_dirty_bitmap() takes a __user bitmap, we're doing
copy_to_user() on a kernel allocated buffer???



Actually, there is no need to call vfio_iova_dirty_bitmap(); the dma pointer
is known here and, since it is getting unmapped, there is no need to
repopulate the bitmap. Removing vfio_iova_dirty_bitmap() and changing it as
below:


if (unmap->flags & VFIO_DMA_UNMAP_FLAG_GET_DIRTY_BITMAP) {
unsigned long pgshift = __ffs(bitmap->pgsize);
unsigned int npages = dma->size >> pgshift;
unsigned int bitmap_size = DIRTY_BITMAP_BYTES(npages);
unsigned int shift = (dma->iova - unmap->iova) >>
pgshift;
/*
 * mark all pages dirty if all pages are pinned and
 * mapp

Re: [PATCH v16 QEMU 09/16] vfio: Add save state functions to SaveVMHandlers

2020-05-11 Thread Kirti Wankhede




On 5/9/2020 11:01 AM, Yan Zhao wrote:

On Wed, Mar 25, 2020 at 05:09:07AM +0800, Kirti Wankhede wrote:

Added .save_live_pending, .save_live_iterate and .save_live_complete_precopy
functions. These functions handles pre-copy and stop-and-copy phase.

In _SAVING|_RUNNING device state or pre-copy phase:
- read pending_bytes. If pending_bytes > 0, go through below steps.
- read data_offset - indicates kernel driver to write data to staging
   buffer.
- read data_size - amount of data in bytes written by vendor driver in
   migration region.

I think we should change the sequence of reading data_size and
data_offset. See the next comment below.


- read data_size bytes of data from data_offset in the migration region.
- Write data packet to file stream as below:
{VFIO_MIG_FLAG_DEV_DATA_STATE, data_size, actual data,
VFIO_MIG_FLAG_END_OF_STATE }

In _SAVING device state or stop-and-copy phase
a. read config space of device and save to migration file stream. This
doesn't need to be from vendor driver. Any other special config state
from driver can be saved as data in following iteration.
b. read pending_bytes. If pending_bytes > 0, go through below steps.
c. read data_offset - indicates kernel driver to write data to staging
buffer.
d. read data_size - amount of data in bytes written by vendor driver in
migration region.
e. read data_size bytes of data from data_offset in the migration region.
f. Write data packet as below:
{VFIO_MIG_FLAG_DEV_DATA_STATE, data_size, actual data}
g. iterate through steps b to f while (pending_bytes > 0)
h. Write {VFIO_MIG_FLAG_END_OF_STATE}

When the data region is mapped, it is the user's responsibility to read
data_size bytes of data from data_offset before moving to the next steps.
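Condensed into a sketch, the stop-and-copy loop in steps a-h above amounts
to something like the following. This is pseudocode that would live in
hw/vfio/migration.c, not the patch itself: vfio_update_pending(),
vfio_save_device_config_state() and the migration->pending_bytes field are
assumed helpers/fields from this series that are not quoted here, and error
handling is abbreviated.

/* Sketch only: the stop-and-copy sequence of steps a-h above. */
static int vfio_save_complete_sketch(QEMUFile *f, VFIODevice *vbasedev)
{
    int ret;

    /* a. save device config space from QEMU, not from the vendor driver */
    vfio_save_device_config_state(f, vbasedev);          /* assumed helper */

    for (;;) {
        /* b. read pending_bytes from the migration region */
        ret = vfio_update_pending(vbasedev);             /* assumed helper */
        if (ret) {
            return ret;
        }
        if (vbasedev->migration->pending_bytes == 0) {   /* g. loop while > 0 */
            break;
        }

        /* c-f. read data_offset/data_size, copy data, emit one data packet */
        ret = vfio_save_buffer(f, vbasedev);
        if (ret < 0) {
            return ret;
        }
    }

    /* h. terminate the device state stream */
    qemu_put_be64(f, VFIO_MIG_FLAG_END_OF_STATE);
    return qemu_file_get_error(f);
}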

Signed-off-by: Kirti Wankhede 
Reviewed-by: Neo Jia 
---
  hw/vfio/migration.c   | 245 +-
  hw/vfio/trace-events  |   6 ++
  include/hw/vfio/vfio-common.h |   1 +
  3 files changed, 251 insertions(+), 1 deletion(-)

diff --git a/hw/vfio/migration.c b/hw/vfio/migration.c
index 033f76526e49..ecbeed5182c2 100644
--- a/hw/vfio/migration.c
+++ b/hw/vfio/migration.c
@@ -138,6 +138,137 @@ static int vfio_migration_set_state(VFIODevice *vbasedev, 
uint32_t mask,
  return 0;
  }
  
+static void *find_data_region(VFIORegion *region,

+  uint64_t data_offset,
+  uint64_t data_size)
+{
+void *ptr = NULL;
+int i;
+
+for (i = 0; i < region->nr_mmaps; i++) {
+if ((data_offset >= region->mmaps[i].offset) &&
+(data_offset < region->mmaps[i].offset + region->mmaps[i].size) &&
+(data_size <= region->mmaps[i].size)) {
+ptr = region->mmaps[i].mmap + (data_offset -
+   region->mmaps[i].offset);
+break;
+}
+}
+return ptr;
+}
+
+static int vfio_save_buffer(QEMUFile *f, VFIODevice *vbasedev)
+{
+VFIOMigration *migration = vbasedev->migration;
+VFIORegion *region = &migration->region;
+uint64_t data_offset = 0, data_size = 0;
+int ret;
+
+ret = pread(vbasedev->fd, &data_offset, sizeof(data_offset),
+region->fd_offset + offsetof(struct vfio_device_migration_info,
+ data_offset));
+if (ret != sizeof(data_offset)) {
+error_report("%s: Failed to get migration buffer data offset %d",
+ vbasedev->name, ret);
+return -EINVAL;
+}
+
+ret = pread(vbasedev->fd, &data_size, sizeof(data_size),
+region->fd_offset + offsetof(struct vfio_device_migration_info,
+ data_size));
+if (ret != sizeof(data_size)) {
+error_report("%s: Failed to get migration buffer data size %d",
+ vbasedev->name, ret);
+return -EINVAL;
+}

data_size should be read first, and if it is 0, data_offset should not
be read further.

The reasons are below:
1. If there's no data region provided by the vendor driver, there's no
reason to get a valid data_offset, so reading/writing of data_offset
should fail. And this should not be treated as a migration error.

2. Even if pending_bytes is 0, vfio_save_iterate() may still be called,
and therefore vfio_save_buffer() is called.



As I mentioned in reply to Alex in:
https://lists.nongnu.org/archive/html/qemu-devel/2020-05/msg02476.html

With that, vfio_save_iterate() will read pending_bytes if it is 0, and then
call vfio_save_buffer() only if pending_bytes is not 0. That should resolve
your concerns above.


Thanks,
Kirti



Re: [PATCH v16 QEMU 09/16] vfio: Add save state functions to SaveVMHandlers

2020-05-11 Thread Kirti Wankhede




On 5/5/2020 10:07 AM, Alex Williamson wrote:

On Tue, 5 May 2020 04:48:14 +0530
Kirti Wankhede  wrote:


On 3/26/2020 3:33 AM, Alex Williamson wrote:

On Wed, 25 Mar 2020 02:39:07 +0530
Kirti Wankhede  wrote:
   

Added .save_live_pending, .save_live_iterate and .save_live_complete_precopy
functions. These functions handles pre-copy and stop-and-copy phase.

In _SAVING|_RUNNING device state or pre-copy phase:
- read pending_bytes. If pending_bytes > 0, go through below steps.
- read data_offset - indicates kernel driver to write data to staging
buffer.
- read data_size - amount of data in bytes written by vendor driver in
migration region.
- read data_size bytes of data from data_offset in the migration region.
- Write data packet to file stream as below:
{VFIO_MIG_FLAG_DEV_DATA_STATE, data_size, actual data,
VFIO_MIG_FLAG_END_OF_STATE }

In _SAVING device state or stop-and-copy phase
a. read config space of device and save to migration file stream. This
 doesn't need to be from vendor driver. Any other special config state
 from driver can be saved as data in following iteration.
b. read pending_bytes. If pending_bytes > 0, go through below steps.
c. read data_offset - indicates kernel driver to write data to staging
 buffer.
d. read data_size - amount of data in bytes written by vendor driver in
 migration region.
e. read data_size bytes of data from data_offset in the migration region.
f. Write data packet as below:
 {VFIO_MIG_FLAG_DEV_DATA_STATE, data_size, actual data}
g. iterate through steps b to f while (pending_bytes > 0)
h. Write {VFIO_MIG_FLAG_END_OF_STATE}

When the data region is mapped, it is the user's responsibility to read
data_size bytes of data starting at data_offset before moving to the next step.
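
A condensed sketch of the stop-and-copy loop in steps b to g, inside
.save_live_complete_precopy (assuming the vfio_update_pending() and
vfio_save_buffer() helpers from this series, with migration =
vbasedev->migration in scope):

    /* Step b: refresh pending_bytes, then loop until it drops to zero. */
    ret = vfio_update_pending(vbasedev);
    if (ret) {
        return ret;
    }

    while (migration->pending_bytes > 0) {
        ret = vfio_save_buffer(f, vbasedev);    /* steps c to f */
        if (ret < 0) {
            return ret;
        }

        ret = vfio_update_pending(vbasedev);    /* step b again */
        if (ret) {
            return ret;
        }
    }

    qemu_put_be64(f, VFIO_MIG_FLAG_END_OF_STATE);   /* step h */
    return qemu_file_get_error(f);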

Signed-off-by: Kirti Wankhede 
Reviewed-by: Neo Jia 
---
   hw/vfio/migration.c   | 245 
+-
   hw/vfio/trace-events  |   6 ++
   include/hw/vfio/vfio-common.h |   1 +
   3 files changed, 251 insertions(+), 1 deletion(-)

diff --git a/hw/vfio/migration.c b/hw/vfio/migration.c
index 033f76526e49..ecbeed5182c2 100644
--- a/hw/vfio/migration.c
+++ b/hw/vfio/migration.c
@@ -138,6 +138,137 @@ static int vfio_migration_set_state(VFIODevice *vbasedev, 
uint32_t mask,
   return 0;
   }
   
+static void *find_data_region(VFIORegion *region,
+  uint64_t data_offset,
+  uint64_t data_size)
+{
+void *ptr = NULL;
+int i;
+
+for (i = 0; i < region->nr_mmaps; i++) {
+if ((data_offset >= region->mmaps[i].offset) &&
+(data_offset < region->mmaps[i].offset + region->mmaps[i].size) &&
+(data_size <= region->mmaps[i].size)) {


(data_offset - region->mmaps[i].offset) can be non-zero, so this test
is invalid.  Additionally the uapi does not require that a give data
chunk fits exclusively within an mmap'd area, it may overlap one or
more mmap'd sections of the region, possibly with non-mmap'd areas
included.
   


What's the advantage of having mmap and non-mmap regions overlap?
Isn't it better to have the data section either mapped or trapped?


The spec allows for it, therefore we need to support it.  A vendor
driver might choose to include a header with sequence and checksum
information for each transaction, they might accomplish this by setting
data_offset to a trapped area backed by kernel memory followed by an
area supporting direct mmap to the device.  The target end could then
fault on writing the header if the sequence information is incorrect.
A trapped area at the end of the transaction could allow the vendor
driver to validate a checksum.



If overlapping mmap and non-mmap regions are allowed, then read() has to
be used here, which means a buffer is allocated, the data is copied into
that buffer (first memcpy), and then qemu_put_buffer(f, buf, data_size)
is called (second memcpy).


The advantage of a fully mmapped data region is that
qemu_put_buffer(f, buf, data_size) can use the pointer into the mmapped
region directly, so we save one memcpy.
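
If overlap does have to be handled, a rough sketch of the mixed path
could look like the following (vfio_mig_chunk_len() is a hypothetical
helper returning how many bytes starting at 'off' stay within a single
mmap'd or trapped range; the rest follows the patch):

    uint64_t off = data_offset, remaining = data_size;

    while (remaining) {
        uint64_t len = MIN(remaining, vfio_mig_chunk_len(region, off));
        void *ptr = find_data_region(region, off, len);

        if (ptr) {
            /* mmap'd area: hand the pointer straight to the stream */
            qemu_put_buffer(f, ptr, len);
        } else {
            /* trapped area: bounce through a temporary buffer */
            uint8_t buf[4096];

            len = MIN(len, sizeof(buf));
            if (pread(vbasedev->fd, buf, len,
                      region->fd_offset + off) != (ssize_t)len) {
                return -EINVAL;
            }
            qemu_put_buffer(f, buf, len);
        }

        off += len;
        remaining -= len;
    }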



+ptr = region->mmaps[i].mmap + (data_offset -
+   region->mmaps[i].offset);
+break;
+}
+}
+return ptr;
+}
+
+static int vfio_save_buffer(QEMUFile *f, VFIODevice *vbasedev)
+{
+VFIOMigration *migration = vbasedev->migration;
+VFIORegion *region = &migration->region;
+uint64_t data_offset = 0, data_size = 0;
+int ret;
+
+ret = pread(vbasedev->fd, &data_offset, sizeof(data_offset),
+region->fd_offset + offsetof(struct vfio_device_migration_info,
+ data_offset));
+if (ret != sizeof(data_offset)) {
+error_report("%s: Failed to get migration buffer data offset %d",
+ vbasedev->name, ret);
+return -EINVAL;
+}
+
+ret = pread(vbasedev->fd, &data_size, sizeof(da

Re: [PATCH v1 2/2] Sample mtty: Add migration capability to mtty module

2020-05-06 Thread Kirti Wankhede




On 5/7/2020 6:31 AM, Yan Zhao wrote:

On Tue, May 05, 2020 at 01:54:20AM +0800, Kirti Wankhede wrote:

This patch makes the mtty device migration capable. The purpose of this
code is to test the migration interface. Only the stop-and-copy phase is
implemented; postcopy migration is not supported.

The actual data for mtty device migration is very small, so dummy data is
appended to the migration data stream, 100 MBytes by default. A sysfs
file 'dummy_data_size_MB' is added to take the dummy data size from the
user, which can be used to check performance as a function of data size.
During resume the dummy data is read and discarded.
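
A minimal sketch of what such a sysfs knob could look like (hypothetical
store handler; how the patch actually registers the attribute may differ):

static ssize_t dummy_data_size_MB_store(struct device *dev,
					struct device_attribute *attr,
					const char *buf, size_t count)
{
	unsigned long size_mb;

	if (kstrtoul(buf, 0, &size_mb))
		return -EINVAL;

	/* Cached globally, picked up by the next save_setup() */
	user_dummy_data_size = size_mb * 1024 * 1024;
	return count;
}
static DEVICE_ATTR_WO(dummy_data_size_MB);

With that, e.g. "echo 200 > .../dummy_data_size_MB" before starting the
migration would make the source append 200 MB of dummy data.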

Signed-off-by: Kirti Wankhede 
---
  samples/vfio-mdev/mtty.c | 602 ---
  1 file changed, 574 insertions(+), 28 deletions(-)

diff --git a/samples/vfio-mdev/mtty.c b/samples/vfio-mdev/mtty.c
index bf666cce5bb7..f9194234fc6a 100644
--- a/samples/vfio-mdev/mtty.c
+++ b/samples/vfio-mdev/mtty.c
@@ -44,9 +44,23 @@
  
  #define MTTY_STRING_LEN		16
  
-#define MTTY_CONFIG_SPACE_SIZE  0xff
-#define MTTY_IO_BAR_SIZE    0x8
-#define MTTY_MMIO_BAR_SIZE  0x10
+#define MTTY_CONFIG_SPACE_SIZE 0xff
+#define MTTY_IO_BAR_SIZE   0x8
+#define MTTY_MMIO_BAR_SIZE 0x10
+#define MTTY_MIGRATION_REGION_SIZE 0x1000000   // 16M
+
+#define MTTY_MIGRATION_REGION_INDEXVFIO_PCI_NUM_REGIONS
+#define MTTY_REGIONS_MAX   (MTTY_MIGRATION_REGION_INDEX + 1)
+
+/* Data section start from page aligned offset */
+#define MTTY_MIGRATION_REGION_DATA_OFFSET  (0x1000)
+
+/* First page is used for struct vfio_device_migration_info */
+#define MTTY_MIGRATION_REGION_SIZE_MMAP \
+   (MTTY_MIGRATION_REGION_SIZE - MTTY_MIGRATION_REGION_DATA_OFFSET)
+
+#define MIGRATION_INFO_OFFSET(MEMBER)  \
+   offsetof(struct vfio_device_migration_info, MEMBER)
  
  #define STORE_LE16(addr, val)   (*(u16 *)addr = val)

  #define STORE_LE32(addr, val)   (*(u32 *)addr = val)
@@ -129,6 +143,28 @@ struct serial_port {
u8 intr_trigger_level;  /* interrupt trigger level */
  };
  
+/* Migration packet */

+#define PACKET_ID  (u16)(0xfeedbaba)
+
+#define PACKET_FLAGS_ACTUAL_DATA   (1 << 0)
+#define PACKET_FLAGS_DUMMY_DATA(1 << 1)
+
+#define PACKET_DATA_SIZE_MAX   (8 * 1024 * 1024)
+
+struct packet {
+   u16 id;
+   u16 flags;
+   u32 data_size;
+   u8 data[];
+};
+
+enum {
+   PACKET_STATE_NONE = 0,
+   PACKET_STATE_PREPARED,
+   PACKET_STATE_COPIED,
+   PACKET_STATE_LAST,
+};
+
  /* State of each mdev device */
  struct mdev_state {
int irq_fd;
@@ -138,22 +174,37 @@ struct mdev_state {
u8 *vconfig;
struct mutex ops_lock;
struct mdev_device *mdev;
-   struct mdev_region_info region_info[VFIO_PCI_NUM_REGIONS];
-   u32 bar_mask[VFIO_PCI_NUM_REGIONS];
+   struct mdev_region_info region_info[MTTY_REGIONS_MAX];
+   u32 bar_mask[MTTY_REGIONS_MAX];
struct list_head next;
struct serial_port s[2];
struct mutex rxtx_lock;
struct vfio_device_info dev_info;
-   int nr_ports;
+   u32 nr_ports;
  
  	/* List of pinned gpfns, gpfn as index and content is translated hpfn */

unsigned long *gpfn_to_hpfn;
struct notifier_block nb;
+
+   u32 device_state;
+   u64 saved_size;
+   void *mig_region_base;
+   bool is_actual_data_sent;
+   struct packet *pkt;
+   u32 packet_state;
+   u64 dummy_data_size;
  };
  
  static struct mutex mdev_list_lock;

  static struct list_head mdev_devices_list;
  
+/*
+ * Default dummy data size set to 100 MB. To change value of dummy data size at
+ * runtime but before migration write size in MB to sysfs file
+ * dummy_data_size_MB
+ */
+static unsigned long user_dummy_data_size = (100 * 1024 * 1024);
+
  static const struct file_operations vd_fops = {
.owner  = THIS_MODULE,
  };
@@ -639,6 +690,288 @@ static void mdev_read_base(struct mdev_state *mdev_state)
}
  }
  
+static int save_setup(struct mdev_state *mdev_state)
+{
+   mdev_state->is_actual_data_sent = false;
+
+   memset(mdev_state->pkt, 0, sizeof(struct packet) +
+  PACKET_DATA_SIZE_MAX);
+
+   return 0;
+}
+
+static int set_device_state(struct mdev_state *mdev_state, u32 device_state)
+{
+   int ret = 0;
+
+   if (mdev_state->device_state == device_state)
+   return 0;
+
+   if (device_state & VFIO_DEVICE_STATE_RUNNING) {
+#if defined(DEBUG)
+   if (device_state & VFIO_DEVICE_STATE_SAVING) {
+   pr_info("%s: %s Pre-copy\n", __func__,
+   dev_name(mdev_dev(mdev_state->mdev)));
+   } else
+   pr_info("%s: %s Running\n", __func__,
+   dev_name(mdev_dev(mdev_state->mdev)));
+#endif
+   } else {
+   if (

Re: [PATCH v16 QEMU 04/16] vfio: Add save and load functions for VFIO PCI devices

2020-05-06 Thread Kirti Wankhede




On 5/7/2020 1:33 AM, Alex Williamson wrote:

On Thu, 7 May 2020 01:18:19 +0530
Kirti Wankhede  wrote:


On 5/6/2020 11:41 AM, Yan Zhao wrote:

On Tue, May 05, 2020 at 12:37:11PM +0800, Alex Williamson wrote:

On Tue, 5 May 2020 04:48:37 +0530
Kirti Wankhede  wrote:
  

On 3/26/2020 1:26 AM, Alex Williamson wrote:

On Wed, 25 Mar 2020 02:39:02 +0530
Kirti Wankhede  wrote:
  

These functions save and restore PCI device specific data - config
space of PCI device.
Tested save and restore with MSI and MSIX type.

Signed-off-by: Kirti Wankhede 
Reviewed-by: Neo Jia 
---
hw/vfio/pci.c | 163 
++
include/hw/vfio/vfio-common.h |   2 +
2 files changed, 165 insertions(+)

diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
index 6c77c12e44b9..8deb11e87ef7 100644
--- a/hw/vfio/pci.c
+++ b/hw/vfio/pci.c
@@ -41,6 +41,7 @@
#include "trace.h"
#include "qapi/error.h"
#include "migration/blocker.h"
+#include "migration/qemu-file.h"

#define TYPE_VFIO_PCI "vfio-pci"

#define PCI_VFIO(obj)OBJECT_CHECK(VFIOPCIDevice, obj, TYPE_VFIO_PCI)
@@ -1632,6 +1633,50 @@ static void vfio_bars_prepare(VFIOPCIDevice *vdev)
}
}

+static int vfio_bar_validate(VFIOPCIDevice *vdev, int nr)
+{
+PCIDevice *pdev = &vdev->pdev;
+VFIOBAR *bar = &vdev->bars[nr];
+uint64_t addr;
+uint32_t addr_lo, addr_hi = 0;
+
+/* Skip unimplemented BARs and the upper half of 64bit BARS. */
+if (!bar->size) {
+return 0;
+}
+
+addr_lo = pci_default_read_config(pdev, PCI_BASE_ADDRESS_0 + nr * 4, 4);
+
+addr_lo = addr_lo & (bar->ioport ? PCI_BASE_ADDRESS_IO_MASK :
+   PCI_BASE_ADDRESS_MEM_MASK);


Nit, &= or combine with previous set.
  

+if (bar->type == PCI_BASE_ADDRESS_MEM_TYPE_64) {
+addr_hi = pci_default_read_config(pdev,
+ PCI_BASE_ADDRESS_0 + (nr + 1) * 4, 4);
+}
+
+addr = ((uint64_t)addr_hi << 32) | addr_lo;


Could we use a union?
  

+
+if (!QEMU_IS_ALIGNED(addr, bar->size)) {
+return -EINVAL;
+}


What specifically are we validating here?  This should be true no
matter what we wrote to the BAR or else BAR emulation is broken.  The
bits that could make this unaligned are not implemented in the BAR.
  

+
+return 0;
+}
+
+static int vfio_bars_validate(VFIOPCIDevice *vdev)
+{
+int i, ret;
+
+for (i = 0; i < PCI_ROM_SLOT; i++) {
+ret = vfio_bar_validate(vdev, i);
+if (ret) {
+error_report("vfio: BAR address %d validation failed", i);
+return ret;
+}
+}
+return 0;
+}
+
static void vfio_bar_register(VFIOPCIDevice *vdev, int nr)
{
VFIOBAR *bar = &vdev->bars[nr];
@@ -2414,11 +2459,129 @@ static Object *vfio_pci_get_object(VFIODevice 
*vbasedev)
return OBJECT(vdev);
}

+static void vfio_pci_save_config(VFIODevice *vbasedev, QEMUFile *f)
+{
+VFIOPCIDevice *vdev = container_of(vbasedev, VFIOPCIDevice, vbasedev);
+PCIDevice *pdev = &vdev->pdev;
+uint16_t pci_cmd;
+int i;
+
+for (i = 0; i < PCI_ROM_SLOT; i++) {
+uint32_t bar;
+
+bar = pci_default_read_config(pdev, PCI_BASE_ADDRESS_0 + i * 4, 4);
+qemu_put_be32(f, bar);
+}
+
+qemu_put_be32(f, vdev->interrupt);
+if (vdev->interrupt == VFIO_INT_MSI) {
+uint32_t msi_flags, msi_addr_lo, msi_addr_hi = 0, msi_data;
+bool msi_64bit;
+
+msi_flags = pci_default_read_config(pdev, pdev->msi_cap + PCI_MSI_FLAGS,
+2);
+msi_64bit = (msi_flags & PCI_MSI_FLAGS_64BIT);
+
+msi_addr_lo = pci_default_read_config(pdev,
+ pdev->msi_cap + PCI_MSI_ADDRESS_LO, 4);
+qemu_put_be32(f, msi_addr_lo);
+
+if (msi_64bit) {
+msi_addr_hi = pci_default_read_config(pdev,
+ pdev->msi_cap + PCI_MSI_ADDRESS_HI,
+ 4);
+}
+qemu_put_be32(f, msi_addr_hi);
+
+msi_data = pci_default_read_config(pdev,
+pdev->msi_cap + (msi_64bit ? PCI_MSI_DATA_64 : PCI_MSI_DATA_32),
+2);
+qemu_put_be32(f, msi_data);


Isn't the data field only a u16?
  


Yes, fixing it.
  

+} else if (vdev->interrupt == VFIO_INT_MSIX) {
+uint16_t offset;
+
+/* save enable bit and maskall bit */
+offset = pci_default_read_config(pdev,
+   pdev->msix_cap + PCI_MSIX_FLAGS + 1, 2);
+qemu_put_be16(f, offset);
+msix_save(pdev, f);
+}
+pci_cmd = pci_default_read_config(pdev, PCI_COMMAND, 2);
+qemu_put_be16(f, pci_cmd);
+}
+
+static int vfio_pci_load_config(VFIODevice

Re: [PATCH Kernel v18 6/7] vfio iommu: Add migration capability to report supported features

2020-05-06 Thread Kirti Wankhede




On 5/7/2020 3:57 AM, Alex Williamson wrote:

On Mon, 4 May 2020 21:28:58 +0530
Kirti Wankhede  wrote:


Added migration capability in IOMMU info chain.
User application should check IOMMU info chain for migration capability
to use dirty page tracking feature provided by kernel module.

Signed-off-by: Kirti Wankhede 
---
  drivers/vfio/vfio_iommu_type1.c | 15 +++
  include/uapi/linux/vfio.h   | 14 ++
  2 files changed, 29 insertions(+)

diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
index 8b27faf1ec38..b38d278d7bff 100644
--- a/drivers/vfio/vfio_iommu_type1.c
+++ b/drivers/vfio/vfio_iommu_type1.c
@@ -2378,6 +2378,17 @@ static int vfio_iommu_iova_build_caps(struct vfio_iommu 
*iommu,
return ret;
  }
  
+static int vfio_iommu_migration_build_caps(struct vfio_info_cap *caps)
+{
+   struct vfio_iommu_type1_info_cap_migration cap_mig;
+
+   cap_mig.header.id = VFIO_IOMMU_TYPE1_INFO_CAP_MIGRATION;
+   cap_mig.header.version = 1;
+   cap_mig.flags = VFIO_IOMMU_INFO_CAPS_MIGRATION_DIRTY_PAGE_TRACK;
+
+   return vfio_info_add_capability(caps, &cap_mig.header, sizeof(cap_mig));
+}
+
  static long vfio_iommu_type1_ioctl(void *iommu_data,
   unsigned int cmd, unsigned long arg)
  {
@@ -2427,6 +2438,10 @@ static long vfio_iommu_type1_ioctl(void *iommu_data,
if (ret)
return ret;
  
+		ret = vfio_iommu_migration_build_caps(&caps);
+   if (ret)
+   return ret;
+
if (caps.size) {
info.flags |= VFIO_IOMMU_INFO_CAPS;
  
diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h

index e3cbf8b78623..df9ce8aaafab 100644
--- a/include/uapi/linux/vfio.h
+++ b/include/uapi/linux/vfio.h
@@ -1013,6 +1013,20 @@ struct vfio_iommu_type1_info_cap_iova_range {
struct  vfio_iova_range iova_ranges[];
  };
  
+/*
+ * The migration capability allows to report supported features for migration.
+ *
+ * The structures below define version 1 of this capability.
+ */
+#define VFIO_IOMMU_TYPE1_INFO_CAP_MIGRATION  1
+
+struct vfio_iommu_type1_info_cap_migration {
+   struct  vfio_info_cap_header header;
+   __u32   flags;
+   /* supports dirty page tracking */
+#define VFIO_IOMMU_INFO_CAPS_MIGRATION_DIRTY_PAGE_TRACK(1 << 0)
+};
+


What about exposing the maximum supported dirty bitmap size and the
supported page sizes?  Thanks,



How should the user application use that?

Thanks,
Kirti


Alex


  #define VFIO_IOMMU_GET_INFO _IO(VFIO_TYPE, VFIO_BASE + 12)
  
  /**






Re: [PATCH v16 QEMU 04/16] vfio: Add save and load functions for VFIO PCI devices

2020-05-06 Thread Kirti Wankhede




On 5/6/2020 11:41 AM, Yan Zhao wrote:

On Tue, May 05, 2020 at 12:37:11PM +0800, Alex Williamson wrote:

On Tue, 5 May 2020 04:48:37 +0530
Kirti Wankhede  wrote:


On 3/26/2020 1:26 AM, Alex Williamson wrote:

On Wed, 25 Mar 2020 02:39:02 +0530
Kirti Wankhede  wrote:
   

These functions save and restore PCI device specific data - config
space of PCI device.
Tested save and restore with MSI and MSIX type.

Signed-off-by: Kirti Wankhede 
Reviewed-by: Neo Jia 
---
   hw/vfio/pci.c | 163 
++
   include/hw/vfio/vfio-common.h |   2 +
   2 files changed, 165 insertions(+)

diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
index 6c77c12e44b9..8deb11e87ef7 100644
--- a/hw/vfio/pci.c
+++ b/hw/vfio/pci.c
@@ -41,6 +41,7 @@
   #include "trace.h"
   #include "qapi/error.h"
   #include "migration/blocker.h"
+#include "migration/qemu-file.h"
   
   #define TYPE_VFIO_PCI "vfio-pci"

   #define PCI_VFIO(obj)OBJECT_CHECK(VFIOPCIDevice, obj, TYPE_VFIO_PCI)
@@ -1632,6 +1633,50 @@ static void vfio_bars_prepare(VFIOPCIDevice *vdev)
   }
   }
   
+static int vfio_bar_validate(VFIOPCIDevice *vdev, int nr)
+{
+PCIDevice *pdev = &vdev->pdev;
+VFIOBAR *bar = &vdev->bars[nr];
+uint64_t addr;
+uint32_t addr_lo, addr_hi = 0;
+
+/* Skip unimplemented BARs and the upper half of 64bit BARS. */
+if (!bar->size) {
+return 0;
+}
+
+addr_lo = pci_default_read_config(pdev, PCI_BASE_ADDRESS_0 + nr * 4, 4);
+
+addr_lo = addr_lo & (bar->ioport ? PCI_BASE_ADDRESS_IO_MASK :
+   PCI_BASE_ADDRESS_MEM_MASK);


Nit, &= or combine with previous set.
   

+if (bar->type == PCI_BASE_ADDRESS_MEM_TYPE_64) {
+addr_hi = pci_default_read_config(pdev,
+ PCI_BASE_ADDRESS_0 + (nr + 1) * 4, 4);
+}
+
+addr = ((uint64_t)addr_hi << 32) | addr_lo;


Could we use a union?
   

+
+if (!QEMU_IS_ALIGNED(addr, bar->size)) {
+return -EINVAL;
+}


What specifically are we validating here?  This should be true no
matter what we wrote to the BAR or else BAR emulation is broken.  The
bits that could make this unaligned are not implemented in the BAR.
   

+
+return 0;
+}
+
+static int vfio_bars_validate(VFIOPCIDevice *vdev)
+{
+int i, ret;
+
+for (i = 0; i < PCI_ROM_SLOT; i++) {
+ret = vfio_bar_validate(vdev, i);
+if (ret) {
+error_report("vfio: BAR address %d validation failed", i);
+return ret;
+}
+}
+return 0;
+}
+
   static void vfio_bar_register(VFIOPCIDevice *vdev, int nr)
   {
VFIOBAR *bar = &vdev->bars[nr];
@@ -2414,11 +2459,129 @@ static Object *vfio_pci_get_object(VFIODevice 
*vbasedev)
   return OBJECT(vdev);
   }
   
+static void vfio_pci_save_config(VFIODevice *vbasedev, QEMUFile *f)
+{
+VFIOPCIDevice *vdev = container_of(vbasedev, VFIOPCIDevice, vbasedev);
+PCIDevice *pdev = &vdev->pdev;
+uint16_t pci_cmd;
+int i;
+
+for (i = 0; i < PCI_ROM_SLOT; i++) {
+uint32_t bar;
+
+bar = pci_default_read_config(pdev, PCI_BASE_ADDRESS_0 + i * 4, 4);
+qemu_put_be32(f, bar);
+}
+
+qemu_put_be32(f, vdev->interrupt);
+if (vdev->interrupt == VFIO_INT_MSI) {
+uint32_t msi_flags, msi_addr_lo, msi_addr_hi = 0, msi_data;
+bool msi_64bit;
+
+msi_flags = pci_default_read_config(pdev, pdev->msi_cap + PCI_MSI_FLAGS,
+2);
+msi_64bit = (msi_flags & PCI_MSI_FLAGS_64BIT);
+
+msi_addr_lo = pci_default_read_config(pdev,
+ pdev->msi_cap + PCI_MSI_ADDRESS_LO, 4);
+qemu_put_be32(f, msi_addr_lo);
+
+if (msi_64bit) {
+msi_addr_hi = pci_default_read_config(pdev,
+ pdev->msi_cap + PCI_MSI_ADDRESS_HI,
+ 4);
+}
+qemu_put_be32(f, msi_addr_hi);
+
+msi_data = pci_default_read_config(pdev,
+pdev->msi_cap + (msi_64bit ? PCI_MSI_DATA_64 : PCI_MSI_DATA_32),
+2);
+qemu_put_be32(f, msi_data);


Isn't the data field only a u16?
   


Yes, fixing it.


+} else if (vdev->interrupt == VFIO_INT_MSIX) {
+uint16_t offset;
+
+/* save enable bit and maskall bit */
+offset = pci_default_read_config(pdev,
+   pdev->msix_cap + PCI_MSIX_FLAGS + 1, 2);
+qemu_put_be16(f, offset);
+msix_save(pdev, f);
+}
+pci_cmd = pci_default_read_config(pdev, PCI_COMMAND, 2);
+qemu_put_be16(f, pci_cmd);
+}
+
+static int vfio_pci_load_config(VFIODevice *vbasedev, QEMUFile *f)
+{
+VFIOPCIDevice *vdev = container_of(vbasedev, VFIOPCIDevice, vbasedev);
+PCIDevice *pdev = &vdev->pdev;
+ 

Re: [PATCH Kernel v18 4/7] vfio iommu: Implementation of ioctl for dirty pages tracking.

2020-05-06 Thread Kirti Wankhede




On 5/6/2020 1:45 PM, Yan Zhao wrote:

On Mon, May 04, 2020 at 11:58:56PM +0800, Kirti Wankhede wrote:





  /*
   * Helper Functions for host iova-pfn list
   */
@@ -567,6 +654,18 @@ static int vfio_iommu_type1_pin_pages(void *iommu_data,
vfio_unpin_page_external(dma, iova, do_accounting);
goto pin_unwind;
}
+
+   if (iommu->dirty_page_tracking) {
+   unsigned long pgshift =
+__ffs(vfio_pgsize_bitmap(iommu));
+

hi Kirti,
may I know if there's any vfio_pin_pages() happening during NVIDIA's vGPU
migration?
the code would enter a deadlock, as I reported against the last version.



Hm, you are right, and the same is the case in vfio_iommu_type1_dma_rw_chunk().

Instead of calling vfio_pgsize_bitmap() from lots of places, I'm thinking
of saving pgsize_bitmap in struct vfio_iommu and populating it whenever
domain_list is updated. Alex, will that be fine?
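
As a sketch, the cached bitmap could be recomputed wherever domain_list
changes (field and helper names are illustrative, mirroring the existing
vfio_pgsize_bitmap() logic; callers would then read iommu->pgsize_bitmap
under iommu->lock instead of walking the domain list):

static void vfio_update_pgsize_bitmap(struct vfio_iommu *iommu)
{
	struct vfio_domain *domain;

	iommu->pgsize_bitmap = ULONG_MAX;

	list_for_each_entry(domain, &iommu->domain_list, next)
		iommu->pgsize_bitmap &= domain->domain->pgsize_bitmap;
}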


Thanks,
Kirti



Thanks
Yan





Re: [PATCH v16 QEMU 08/16] vfio: Register SaveVMHandlers for VFIO device

2020-05-06 Thread Kirti Wankhede




On 5/6/2020 10:23 PM, Dr. David Alan Gilbert wrote:

* Cornelia Huck (coh...@redhat.com) wrote:

On Wed, 6 May 2020 02:38:46 -0400
Yan Zhao  wrote:


On Tue, May 05, 2020 at 12:37:26PM +0800, Alex Williamson wrote:

It's been a long time, but that doesn't seem like what I was asking.
The sysfs version checking is used to select a target that is likely to
succeed, but the migration stream is still generated by a user and the
vendor driver is still ultimately responsible for validating that
stream.  I would hope that a vendor migration stream therefore starts
with information similar to that found in the sysfs interface, allowing
the receiving vendor driver to validate the source device and vendor
software version, such that we can fail an incoming migration that the
vendor driver deems incompatible.  Ideally the vendor driver might also
include consistency and sequence checking throughout the stream to
prevent a malicious user from exploiting the internal operation of the
vendor driver.  Thanks,


Some kind of somewhat standardized marker for driver/version seems like
a good idea. Further checking is also a good idea, but I think the
details of that need to be left to the individual drivers.


Standardised markers like that would be useful, although the rules for
comparing them might be a bit vendor specific; still, it would be good
for us to be able to dump something out when it all goes wrong.



Such checking should already be there in the vendor driver. The vendor
driver might also support cross-version migration. I think checking again
in QEMU would be redundant; let the vendor driver handle version checks.


Thanks,
Kirti

   

maybe we can add a rw field migration_version in
struct vfio_device_migration_info besides the sysfs interface?

when reading it on the source, it returns the same string as the sysfs
attribute; when writing it on the target, it returns success or failure,
so compatibility can be checked and the migration failed early, in the
setup phase.
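
Purely for illustration, that proposal would amount to something like the
following (the migration_version field is hypothetical and not part of
the posted uapi; the existing fields match the series):

struct vfio_device_migration_info {
	__u32 device_state;
	__u32 reserved;
	__u64 pending_bytes;
	__u64 data_offset;
	__u64 data_size;
	/* proposed: same string as the sysfs migration_version attribute;
	 * read on the source, written and checked on the target
	 */
	char  migration_version[64];
};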


Getting both populated from the same source seems like a good idea.

Not sure if a string is the best value to put into a migration stream;
maybe the sysfs interface can derive a human-readable string from a
more compact value to be put into the migration region (and ultimately
the stream)? Might be overengineering, just thinking out aloud here.


A string might be OK if you specify a little about it.

Dave

--
Dr. David Alan Gilbert / dgilb...@redhat.com / Manchester, UK




