On Wed, 22 Feb 2023 19:49:06 +0200 Avihai Horon <avih...@nvidia.com> wrote:
> From: Joao Martins <joao.m.mart...@oracle.com>
>
> Add device dirty page tracking start/stop functionality. This uses the
> device DMA logging uAPI to start and stop dirty page tracking by device.
>
> Device dirty page tracking is used only if all devices within a
> container support device dirty page tracking.
>
> Signed-off-by: Joao Martins <joao.m.mart...@oracle.com>
> Signed-off-by: Avihai Horon <avih...@nvidia.com>
> ---
>  include/hw/vfio/vfio-common.h |   2 +
>  hw/vfio/common.c              | 211 +++++++++++++++++++++++++++++++++-
>  2 files changed, 211 insertions(+), 2 deletions(-)
>
> diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h
> index 6f36876ce0..1f21e1fa43 100644
> --- a/include/hw/vfio/vfio-common.h
> +++ b/include/hw/vfio/vfio-common.h
> @@ -149,6 +149,8 @@ typedef struct VFIODevice {
>      VFIOMigration *migration;
>      Error *migration_blocker;
>      OnOffAuto pre_copy_dirty_page_tracking;
> +    bool dirty_pages_supported;
> +    bool dirty_tracking;
>  } VFIODevice;
>
>  struct VFIODeviceOps {
> diff --git a/hw/vfio/common.c b/hw/vfio/common.c
> index 6041da6c7e..740153e7d7 100644
> --- a/hw/vfio/common.c
> +++ b/hw/vfio/common.c
> @@ -473,6 +473,22 @@ static bool vfio_devices_all_dirty_tracking(VFIOContainer *container)
>      return true;
>  }
>
> +static bool vfio_devices_all_device_dirty_tracking(VFIOContainer *container)
> +{
> +    VFIOGroup *group;
> +    VFIODevice *vbasedev;
> +
> +    QLIST_FOREACH(group, &container->group_list, container_next) {
> +        QLIST_FOREACH(vbasedev, &group->device_list, next) {
> +            if (!vbasedev->dirty_pages_supported) {
> +                return false;
> +            }
> +        }
> +    }
> +
> +    return true;
> +}
> +
>  /*
>   * Check if all VFIO devices are running and migration is active, which is
>   * essentially equivalent to the migration being in pre-copy phase.
> @@ -1404,13 +1420,192 @@ static int vfio_set_dirty_page_tracking(VFIOContainer *container, bool start)
>      return ret;
>  }
>
> +static int vfio_devices_dma_logging_set(VFIOContainer *container,
> +                                        struct vfio_device_feature *feature)
> +{
> +    bool status = (feature->flags & VFIO_DEVICE_FEATURE_MASK) ==
> +                  VFIO_DEVICE_FEATURE_DMA_LOGGING_START;
> +    VFIODevice *vbasedev;
> +    VFIOGroup *group;
> +    int ret = 0;
> +
> +    QLIST_FOREACH(group, &container->group_list, container_next) {
> +        QLIST_FOREACH(vbasedev, &group->device_list, next) {
> +            if (vbasedev->dirty_tracking == status) {
> +                continue;
> +            }
> +
> +            ret = ioctl(vbasedev->fd, VFIO_DEVICE_FEATURE, feature);
> +            if (ret) {
> +                ret = -errno;
> +                error_report("%s: Failed to set DMA logging %s, err %d (%s)",
> +                             vbasedev->name, status ? "start" : "stop", ret,
"start" : "stop", ret, > + strerror(errno)); > + goto out; > + } > + vbasedev->dirty_tracking = status; > + } > + } > + > +out: > + return ret; > +} > + > +static int vfio_devices_dma_logging_stop(VFIOContainer *container) > +{ > + uint64_t buf[DIV_ROUND_UP(sizeof(struct vfio_device_feature), > + sizeof(uint64_t))] = {}; > + struct vfio_device_feature *feature = (struct vfio_device_feature *)buf; > + > + feature->argsz = sizeof(buf); > + feature->flags = VFIO_DEVICE_FEATURE_SET; > + feature->flags |= VFIO_DEVICE_FEATURE_DMA_LOGGING_STOP; > + > + return vfio_devices_dma_logging_set(container, feature); > +} > + > +static gboolean vfio_device_dma_logging_range_add(DMAMap *map, gpointer data) > +{ > + struct vfio_device_feature_dma_logging_range **out = data; > + struct vfio_device_feature_dma_logging_range *range = *out; > + > + range->iova = map->iova; > + /* IOVATree is inclusive, DMA logging uAPI isn't, so add 1 to length */ > + range->length = map->size + 1; > + > + *out = ++range; > + > + return false; > +} > + > +static gboolean vfio_iova_tree_get_first(DMAMap *map, gpointer data) > +{ > + DMAMap *first = data; > + > + first->iova = map->iova; > + first->size = map->size; > + > + return true; > +} > + > +static gboolean vfio_iova_tree_get_last(DMAMap *map, gpointer data) > +{ > + DMAMap *last = data; > + > + last->iova = map->iova; > + last->size = map->size; > + > + return false; > +} > + > +static struct vfio_device_feature * > +vfio_device_feature_dma_logging_start_create(VFIOContainer *container) > +{ > + struct vfio_device_feature *feature; > + size_t feature_size; > + struct vfio_device_feature_dma_logging_control *control; > + struct vfio_device_feature_dma_logging_range *ranges; > + unsigned int max_ranges; > + unsigned int cur_ranges; > + > + feature_size = sizeof(struct vfio_device_feature) + > + sizeof(struct vfio_device_feature_dma_logging_control); > + feature = g_malloc0(feature_size); > + feature->argsz = feature_size; > + feature->flags = VFIO_DEVICE_FEATURE_SET; > + feature->flags |= VFIO_DEVICE_FEATURE_DMA_LOGGING_START; > + > + control = (struct vfio_device_feature_dma_logging_control > *)feature->data; > + control->page_size = qemu_real_host_page_size(); > + > + QEMU_LOCK_GUARD(&container->mappings_mutex); > + > + /* > + * DMA logging uAPI guarantees to support at least num_ranges that fits > into > + * a single host kernel page. To be on the safe side, use this as a limit > + * from which to merge to a single range. > + */ > + max_ranges = qemu_real_host_page_size() / sizeof(*ranges); > + cur_ranges = iova_tree_nnodes(container->mappings); > + control->num_ranges = (cur_ranges <= max_ranges) ? cur_ranges : 1; This makes me suspicious that we're implementing to the characteristics of a specific device rather than strictly to the vfio migration API. Are we just trying to avoid the error handling to support the try and fall back to a single range behavior? If we want to make a simplification, then document it as such. The "[t]o be on the safe side" phrasing above could later be interpreted as avoiding an issue and might discourage a more complete implementation. 
Thanks,
Alex

> +    ranges = g_try_new0(struct vfio_device_feature_dma_logging_range,
> +                        control->num_ranges);
> +    if (!ranges) {
> +        g_free(feature);
> +        errno = ENOMEM;
> +
> +        return NULL;
> +    }
> +
> +    control->ranges = (uint64_t)ranges;
> +    if (cur_ranges <= max_ranges) {
> +        iova_tree_foreach(container->mappings,
> +                          vfio_device_dma_logging_range_add, &ranges);
> +    } else {
> +        DMAMap first, last;
> +
> +        iova_tree_foreach(container->mappings, vfio_iova_tree_get_first,
> +                          &first);
> +        iova_tree_foreach(container->mappings, vfio_iova_tree_get_last,
> +                          &last);
> +        ranges->iova = first.iova;
> +        /* IOVATree is inclusive, DMA logging uAPI isn't, so add 1 to length */
> +        ranges->length = (last.iova - first.iova) + last.size + 1;
> +    }
> +
> +    return feature;
> +}
> +
> +static void vfio_device_feature_dma_logging_start_destroy(
> +    struct vfio_device_feature *feature)
> +{
> +    struct vfio_device_feature_dma_logging_control *control =
> +        (struct vfio_device_feature_dma_logging_control *)feature->data;
> +    struct vfio_device_feature_dma_logging_range *ranges =
> +        (struct vfio_device_feature_dma_logging_range *)control->ranges;
> +
> +    g_free(ranges);
> +    g_free(feature);
> +}
> +
> +static int vfio_devices_dma_logging_start(VFIOContainer *container)
> +{
> +    struct vfio_device_feature *feature;
> +    int ret;
> +
> +    feature = vfio_device_feature_dma_logging_start_create(container);
> +    if (!feature) {
> +        return -errno;
> +    }
> +
> +    ret = vfio_devices_dma_logging_set(container, feature);
> +    if (ret) {
> +        vfio_devices_dma_logging_stop(container);
> +    }
> +
> +    vfio_device_feature_dma_logging_start_destroy(feature);
> +
> +    return ret;
> +}
> +
>  static void vfio_listener_log_global_start(MemoryListener *listener)
>  {
>      VFIOContainer *container = container_of(listener, VFIOContainer, listener);
>      int ret;
>
> -    ret = vfio_set_dirty_page_tracking(container, true);
> +    if (vfio_devices_all_device_dirty_tracking(container)) {
> +        if (vfio_have_giommu(container)) {
> +            /* Device dirty page tracking currently doesn't support vIOMMU */
> +            return;
> +        }
> +
> +        ret = vfio_devices_dma_logging_start(container);
> +    } else {
> +        ret = vfio_set_dirty_page_tracking(container, true);
> +    }
> +
>      if (ret) {
> +        error_report("vfio: Could not start dirty page tracking, err: %d (%s)",
> +                     ret, strerror(-ret));
>          vfio_set_migration_error(ret);
>      }
>  }
> @@ -1420,8 +1615,20 @@ static void vfio_listener_log_global_stop(MemoryListener *listener)
>      VFIOContainer *container = container_of(listener, VFIOContainer, listener);
>      int ret;
>
> -    ret = vfio_set_dirty_page_tracking(container, false);
> +    if (vfio_devices_all_device_dirty_tracking(container)) {
> +        if (vfio_have_giommu(container)) {
> +            /* Device dirty page tracking currently doesn't support vIOMMU */
> +            return;
> +        }
> +
> +        ret = vfio_devices_dma_logging_stop(container);
> +    } else {
> +        ret = vfio_set_dirty_page_tracking(container, false);
> +    }
> +
>      if (ret) {
> +        error_report("vfio: Could not stop dirty page tracking, err: %d (%s)",
> +                     ret, strerror(-ret));
>          vfio_set_migration_error(ret);
>      }
>  }