"Maciej S. Szmigiero" <[email protected]> writes: > From: "Maciej S. Szmigiero" <[email protected]> > > When multiple VFIO devices are present in a VM the fact that their state > transitions on migration happen sequentially has a visible impact on > migration downtime. > > This is because both PRE_COPY -> PRE_COPY_P2P -> STOP_COPY transitions on > the source and RESUMING -> RUNNING_P2P -> RUNNING transitions on the target > happen during the switchover phase. > During this phase the VM is stopped so the downtime is ticking. > > These device state transitions are performed by VM state change handlers > registered by the VFIO device migration code. > > Instead of performing such state transition synchronously use the priority > adjustment mechanism from the previous patch to launch a thread performing > the state change at the priority level *before* all other VM state change > handlers at the particular VFIO device qdev tree depth. > Only wait for this thread to finish at the priority level ordered *after* > all other handlers at this tree depth. > > This way these state transitions can happen in parallel not only with > respect to other VFIO device instances but also ordinary (serialized) > handlers for other devices at this qdev tree depth while still being > properly ordered with respect to handlers registered at other tree depths. > > Unfortunately, the order in which VM state handlers are called depends > on whether the VM is starting or stopping. > Because of this, one extra layer of indirection is necessary to make the > (first, second) ordering of these handlers constant. > > Enable the feature by default since it has no impact on the migration bit > stream protocol - it shouldn't need disabling for anything else but > debugging scenarios. > > Signed-off-by: Maciej S. Szmigiero <[email protected]> > --- > hw/vfio/migration.c | 174 ++++++++++++++++++++++++++++-- > hw/vfio/pci.c | 2 + > hw/vfio/vfio-migration-internal.h | 4 +- > include/hw/vfio/vfio-device.h | 1 + > 4 files changed, 173 insertions(+), 8 deletions(-) > > diff --git a/hw/vfio/migration.c b/hw/vfio/migration.c > index 9889b20ad7dd..fd2b37a85f4b 100644 > --- a/hw/vfio/migration.c > +++ b/hw/vfio/migration.c > @@ -1047,6 +1047,135 @@ static void vfio_vmstate_change(void *opaque, bool > running, RunState state) > mig_state_to_str(new_state)); > } > > +typedef struct VMStateChangeThreadData { > + VFIODevice *vbasedev; > + bool is_prepare; > + bool running; > + RunState state; > +} VMStateChangeThreadData; > + > +static void *vfio_vmstate_change_thread(void *opaque) > +{ > + g_autofree VMStateChangeThreadData *data = opaque; > + > + if (data->is_prepare) { > + vfio_vmstate_change_prepare(data->vbasedev, data->running, > data->state); > + } else { > + vfio_vmstate_change(data->vbasedev, data->running, data->state); > + } > + > + return NULL; > +} > + > +static void vfio_vmstate_change_thread_launch(VFIODevice *vbasedev, > + bool is_prepare, > + bool running, > + RunState state) > +{ > + VFIOMigration *migration = vbasedev->migration; > + VMStateChangeThreadData *data = g_new(VMStateChangeThreadData, 1); > + > + data->vbasedev = vbasedev; > + data->is_prepare = is_prepare; > + data->running = running; > + data->state = state; > + > + assert(!migration->vm_state_thread_running); > + migration->vm_state_thread_running = true; > + > + qemu_thread_create(&migration->vm_state_thread, > + is_prepare ? "vfio_vmstate_change_prepare" : > + "vfio_vmstate_change", > + vfio_vmstate_change_thread, data, > + QEMU_THREAD_JOINABLE); > +} > + > +static void vfio_vmstate_change_thread_join(VFIODevice *vbasedev) > +{ > + VFIOMigration *migration = vbasedev->migration; > + > + assert(migration->vm_state_thread_running); > + > + qemu_thread_join(&migration->vm_state_thread); > + > + migration->vm_state_thread_running = false; > +} > + > +/* > + * The first handler called during a vmstate change at a particular depth - > + * launch the VFIO device state change thread. > + */ > +static void vfio_vmstate_change_first(VFIODevice *vbasedev, > + bool is_prepare, > + bool running, RunState state) > +{ > + vfio_vmstate_change_thread_launch(vbasedev, > + is_prepare, > + running, > + state); > +} > + > +/* > + * The last handler called during a vmstate change at a particular depth - > + * wait for the VFIO device state change thread to finish. > + */ > +static void vfio_vmstate_change_second(VFIODevice *vbasedev) > +{ > + vfio_vmstate_change_thread_join(vbasedev); > +} > + > +/* > + * Lower priority number handler: > + * Called before higher number handler when VM is starting > + * but after higher number handler when VM is stopping. > + */ > +static void vfio_vmstate_change_prepare_lower_prio(void *opaque, bool > running, > + RunState state) > +{ > + if (running) { > + vfio_vmstate_change_first(opaque, true, running, state); > + } else { > + vfio_vmstate_change_second(opaque); > + } > +} > + > +/* > + * Higher priority number handler: > + * Called after lower number handler when VM is starting > + * but before lower number handler when VM is stopping. > + */ > +static void vfio_vmstate_change_prepare_higher_prio(void *opaque, bool > running, > + RunState state) > +{ > + if (running) { > + vfio_vmstate_change_second(opaque); > + } else { > + vfio_vmstate_change_first(opaque, true, running, state); > + } > +} > + > +/* Same ordering issues as for vfio_vmstate_change_prepare_lower_prio() */ > +static void vfio_vmstate_change_lower_prio(void *opaque, bool running, > + RunState state) > +{ > + if (running) { > + vfio_vmstate_change_first(opaque, false, running, state); > + } else { > + vfio_vmstate_change_second(opaque); > + } > +} > + > +/* Same ordering issues as for vfio_vmstate_change_prepare_higher_prio() */ > +static void vfio_vmstate_change_higher_prio(void *opaque, bool running, > + RunState state) > +{ > + if (running) { > + vfio_vmstate_change_second(opaque); > + } else { > + vfio_vmstate_change_first(opaque, false, running, state); > + } > +} > + > static int vfio_migration_state_notifier(NotifierWithReturn *notifier, > MigrationEvent *e, Error **errp) > { > @@ -1063,6 +1192,8 @@ static int > vfio_migration_state_notifier(NotifierWithReturn *notifier, > * MigrationNotifyFunc may not return an error code and an Error > * object for MIG_EVENT_FAILED. Hence, report the error > * locally and ignore the errp argument. > + * This state change is not parallelized as it is not expected to be > + * performance critical. > */ > ret = vfio_migration_set_state_or_reset(vbasedev, > VFIO_DEVICE_STATE_RUNNING, > @@ -1143,7 +1274,7 @@ static int vfio_migration_init(VFIODevice *vbasedev, > Error **errp) > char id[256] = ""; > g_autofree char *path = NULL, *oid = NULL; > uint64_t mig_flags = 0; > - VMChangeStateHandler *prepare_cb; > + VMChangeStateHandler *prepare_cb, *prepare_cb_lower, *prepare_cb_higher; > > if (!vbasedev->ops->vfio_get_object) { > ret = -EINVAL; > @@ -1223,11 +1354,34 @@ static int vfio_migration_init(VFIODevice *vbasedev, > Error **errp) > register_savevm_live(id, VMSTATE_INSTANCE_ID_ANY, 1, > &savevm_vfio_handlers, > vbasedev); > > - prepare_cb = migration->mig_flags & VFIO_MIGRATION_P2P ? > - vfio_vmstate_change_prepare : > - NULL; > - migration->vm_state = qdev_add_vm_change_state_handler_full( > - vbasedev->dev, vfio_vmstate_change, prepare_cb, NULL, vbasedev, 0); > + if (vbasedev->migration_parallel_states) { > + /* > + * Unfortunately, the order in which vmstate handlers are called > depends > + * on whether the VM is starting or stopping. > + * Because of this, one extra layer of indirection is necessary > + * to make the (first, second) ordering of these handlers constant. > + */ > + prepare_cb_lower = migration->mig_flags & VFIO_MIGRATION_P2P ? > + vfio_vmstate_change_prepare_lower_prio : NULL; > + prepare_cb_higher = migration->mig_flags & VFIO_MIGRATION_P2P ? > + vfio_vmstate_change_prepare_higher_prio : NULL; > + migration->vm_state_lower_prio = > qdev_add_vm_change_state_handler_full( > + vbasedev->dev, vfio_vmstate_change_lower_prio, prepare_cb_lower, > + NULL, vbasedev, -1); > + migration->vm_state_higher_prio = > qdev_add_vm_change_state_handler_full( > + vbasedev->dev, vfio_vmstate_change_higher_prio, > prepare_cb_higher, > + NULL, vbasedev, 1); > + } else { > + prepare_cb = migration->mig_flags & VFIO_MIGRATION_P2P ? > + vfio_vmstate_change_prepare : NULL; > + /* Arbitrarily use lower_prio field to store non-parallel handler */ > + migration->vm_state_lower_prio = > + qdev_add_vm_change_state_handler_full(vbasedev->dev, > + vfio_vmstate_change, > + prepare_cb, NULL, > + vbasedev, 0); > + } > + > migration_add_notifier(&migration->migration_state, > vfio_migration_state_notifier); > > @@ -1302,7 +1456,13 @@ static void vfio_migration_deinit(VFIODevice *vbasedev) > VFIOMigration *migration = vbasedev->migration; > > migration_remove_notifier(&migration->migration_state); > - qemu_del_vm_change_state_handler(migration->vm_state); > + > + if (vbasedev->migration_parallel_states) { > + qemu_del_vm_change_state_handler(migration->vm_state_higher_prio); > + } > + /* Non-parallel state change uses lower_prio field to store its handler > */ > + qemu_del_vm_change_state_handler(migration->vm_state_lower_prio); > + > unregister_savevm(VMSTATE_IF(vbasedev->dev), "vfio", vbasedev); > vfio_migration_free(vbasedev); > vfio_unblock_multiple_devices_migration(); > diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c > index b2a07f6bb421..fa2411474c9b 100644 > --- a/hw/vfio/pci.c > +++ b/hw/vfio/pci.c > @@ -3777,6 +3777,8 @@ static const Property vfio_pci_properties[] = { > ON_OFF_AUTO_AUTO), > DEFINE_PROP_SIZE("x-migration-max-queued-buffers-size", VFIOPCIDevice, > vbasedev.migration_max_queued_buffers_size, UINT64_MAX), > + DEFINE_PROP_BOOL("x-migration-parallel-states", VFIOPCIDevice, > + vbasedev.migration_parallel_states, true), > DEFINE_PROP_BOOL("migration-events", VFIOPCIDevice, > vbasedev.migration_events, false), > DEFINE_PROP_BOOL("x-no-mmap", VFIOPCIDevice, vbasedev.no_mmap, false), > diff --git a/hw/vfio/vfio-migration-internal.h > b/hw/vfio/vfio-migration-internal.h > index a1c58b112685..9fb00f9f4d7d 100644 > --- a/hw/vfio/vfio-migration-internal.h > +++ b/hw/vfio/vfio-migration-internal.h > @@ -38,7 +38,9 @@ typedef struct VFIOMultifd VFIOMultifd; > > typedef struct VFIOMigration { > struct VFIODevice *vbasedev; > - VMChangeStateEntry *vm_state; > + VMChangeStateEntry *vm_state_lower_prio, *vm_state_higher_prio; > + QemuThread vm_state_thread; > + bool vm_state_thread_running; > NotifierWithReturn migration_state; > uint32_t device_state; > int data_fd; > diff --git a/include/hw/vfio/vfio-device.h b/include/hw/vfio/vfio-device.h > index 380a55d6e5ea..28004e1e99f4 100644 > --- a/include/hw/vfio/vfio-device.h > +++ b/include/hw/vfio/vfio-device.h > @@ -69,6 +69,7 @@ typedef struct VFIODevice { > OnOffAuto migration_multifd_transfer; > OnOffAuto migration_load_config_after_iter; > uint64_t migration_max_queued_buffers_size; > + bool migration_parallel_states; > bool migration_events; > bool use_region_fds; > VFIODeviceOps *ops;
A bit idiosyncratic, but I don't have any better suggestions. Also, pre-existing but maybe you'll want to fix it in this patch, 'vmstate_change' is the wrong term. This is vm_change_state or vm_state_change. Acked-by: Fabiano Rosas <[email protected]>
