On Sat, 4 Mar 2023 01:43:37 +0000 Joao Martins <joao.m.mart...@oracle.com> wrote:
> According to the device DMA logging uAPI, IOVA ranges to be logged by > the device must be provided all at once upon DMA logging start. > > As preparation for the following patches which will add device dirty > page tracking, keep a record of all DMA mapped IOVA ranges so later they > can be used for DMA logging start. > > Note that when vIOMMU is enabled DMA mapped IOVA ranges are not tracked. > This is due to the dynamic nature of vIOMMU DMA mapping/unmapping. Commit log is outdated for this version. > Signed-off-by: Avihai Horon <avih...@nvidia.com> > Signed-off-by: Joao Martins <joao.m.mart...@oracle.com> > --- > hw/vfio/common.c | 84 +++++++++++++++++++++++++++++++++++ > hw/vfio/trace-events | 1 + > include/hw/vfio/vfio-common.h | 11 +++++ > 3 files changed, 96 insertions(+) > > diff --git a/hw/vfio/common.c b/hw/vfio/common.c > index ed908e303dbb..d84e5fd86bb4 100644 > --- a/hw/vfio/common.c > +++ b/hw/vfio/common.c > @@ -44,6 +44,7 @@ > #include "migration/blocker.h" > #include "migration/qemu-file.h" > #include "sysemu/tpm.h" > +#include "qemu/iova-tree.h" Unnecessary > > VFIOGroupList vfio_group_list = > QLIST_HEAD_INITIALIZER(vfio_group_list); > @@ -1313,11 +1314,94 @@ static int vfio_set_dirty_page_tracking(VFIOContainer > *container, bool start) > return ret; > } > > +/* > + * Called for the dirty tracking memory listener to calculate the iova/end > + * for a given memory region section. The checks here, replicate the logic > + * in vfio_listener_region_{add,del}() used for the same purpose. And thus > + * both listener should be kept in sync. > + */ > +static bool vfio_get_section_iova_range(VFIOContainer *container, > + MemoryRegionSection *section, > + hwaddr *out_iova, hwaddr *out_end) > +{ > + Int128 llend; > + hwaddr iova; > + > + iova = REAL_HOST_PAGE_ALIGN(section->offset_within_address_space); > + llend = int128_make64(section->offset_within_address_space); > + llend = int128_add(llend, section->size); > + llend = int128_and(llend, int128_exts64(qemu_real_host_page_mask())); > + > + if (int128_ge(int128_make64(iova), llend)) { > + return false; > + } > + > + *out_iova = iova; > + *out_end = int128_get64(llend) - 1; > + return true; > +} Not sure why this isn't turned into a helper here to avoid the issue noted in the comment. Also why do both of the existing listener implementations resolve the end address as: int128_get64(int128_sub(llend, int128_one())); While here we use: int128_get64(llend) - 1; We're already out of sync. > + > +static void vfio_dirty_tracking_update(MemoryListener *listener, > + MemoryRegionSection *section) > +{ > + VFIOContainer *container = container_of(listener, VFIOContainer, > + tracking_listener); > + VFIODirtyTrackingRange *range = &container->tracking_range; > + hwaddr max32 = (1ULL << 32) - 1ULL; UINT32_MAX > + hwaddr iova, end; > + > + if (!vfio_listener_valid_section(section) || > + !vfio_get_section_iova_range(container, section, &iova, &end)) { > + return; > + } > + > + WITH_QEMU_LOCK_GUARD(&container->tracking_mutex) { > + if (iova < max32 && end <= max32) { > + if (range->min32 > iova) { > + range->min32 = iova; > + } > + if (range->max32 < end) { > + range->max32 = end; > + } > + trace_vfio_device_dirty_tracking_update(iova, end, > + range->min32, range->max32); > + } else { > + if (!range->min64 || range->min64 > iova) { > + range->min64 = iova; > + } I think this improperly handles a range starting at zero, min64 should be UINT64_MAX initially. For example, if we have ranges 0-8GB and 12-16GB, this effectively ignores the first range. Likewise I think range->min32 has a similar problem, it's initialized to zero, it will never be updated to match a non-zero lowest range. It needs to be initialized to UINT32_MAX. A comment describing the purpose of the 32/64 split tracking would be useful too. > + if (range->max64 < end) { > + range->max64 = end; > + } > + trace_vfio_device_dirty_tracking_update(iova, end, > + range->min64, range->max64); > + } > + } > + return; > +} > + > +static const MemoryListener vfio_dirty_tracking_listener = { > + .name = "vfio-tracking", > + .region_add = vfio_dirty_tracking_update, > +}; > + > +static void vfio_dirty_tracking_init(VFIOContainer *container) > +{ > + memset(&container->tracking_range, 0, sizeof(container->tracking_range)); > + qemu_mutex_init(&container->tracking_mutex); As noted in other thread, this mutex seems unnecessary. The listener needs to be embedded in an object, but it doesn't need to be the container. Couldn't we create: typedef struct VFIODirtyRanges { VFIOContainer *container; VFIODirtyTrackingRange ranges; MemoryListener listener; } VFIODirectRanges; For use here? Caller could provide VFIODirtyTrackingRange pointer for the resulting ranges, which then gets passed to vfio_device_feature_dma_logging_start_create() > + container->tracking_listener = vfio_dirty_tracking_listener; > + memory_listener_register(&container->tracking_listener, > + container->space->as); > + memory_listener_unregister(&container->tracking_listener); It's sufficiently subtle that the listener callback is synchronous and we're done with it here for a comment. > + qemu_mutex_destroy(&container->tracking_mutex); > +} > + > static void vfio_listener_log_global_start(MemoryListener *listener) > { > VFIOContainer *container = container_of(listener, VFIOContainer, > listener); > int ret; > > + vfio_dirty_tracking_init(container); > + > ret = vfio_set_dirty_page_tracking(container, true); > if (ret) { > vfio_set_migration_error(ret); > diff --git a/hw/vfio/trace-events b/hw/vfio/trace-events > index 669d9fe07cd9..d97a6de17921 100644 > --- a/hw/vfio/trace-events > +++ b/hw/vfio/trace-events > @@ -104,6 +104,7 @@ vfio_known_safe_misalignment(const char *name, uint64_t > iova, uint64_t offset_wi > vfio_listener_region_add_no_dma_map(const char *name, uint64_t iova, > uint64_t size, uint64_t page_size) "Region \"%s\" 0x%"PRIx64" > size=0x%"PRIx64" is not aligned to 0x%"PRIx64" and cannot be mapped for DMA" > vfio_listener_region_del_skip(uint64_t start, uint64_t end) "SKIPPING > region_del 0x%"PRIx64" - 0x%"PRIx64 > vfio_listener_region_del(uint64_t start, uint64_t end) "region_del > 0x%"PRIx64" - 0x%"PRIx64 > +vfio_device_dirty_tracking_update(uint64_t start, uint64_t end, uint64_t > min, uint64_t max) "section 0x%"PRIx64" - 0x%"PRIx64" -> update [0x%"PRIx64" > - 0x%"PRIx64"]" > vfio_disconnect_container(int fd) "close container->fd=%d" > vfio_put_group(int fd) "close group->fd=%d" > vfio_get_device(const char * name, unsigned int flags, unsigned int > num_regions, unsigned int num_irqs) "Device %s flags: %u, regions: %u, irqs: > %u" > diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h > index 87524c64a443..96791add2719 100644 > --- a/include/hw/vfio/vfio-common.h > +++ b/include/hw/vfio/vfio-common.h > @@ -23,6 +23,7 @@ > > #include "exec/memory.h" > #include "qemu/queue.h" > +#include "qemu/iova-tree.h" Unused. > #include "qemu/notify.h" > #include "ui/console.h" > #include "hw/display/ramfb.h" > @@ -68,6 +69,13 @@ typedef struct VFIOMigration { > size_t data_buffer_size; > } VFIOMigration; > > +typedef struct VFIODirtyTrackingRange { > + hwaddr min32; > + hwaddr max32; > + hwaddr min64; > + hwaddr max64; > +} VFIODirtyTrackingRange; > + > typedef struct VFIOAddressSpace { > AddressSpace *as; > QLIST_HEAD(, VFIOContainer) containers; > @@ -89,6 +97,9 @@ typedef struct VFIOContainer { > uint64_t max_dirty_bitmap_size; > unsigned long pgsizes; > unsigned int dma_max_mappings; > + VFIODirtyTrackingRange tracking_range; > + QemuMutex tracking_mutex; > + MemoryListener tracking_listener; Let's make this unnecessary too. Thanks, Alex > QLIST_HEAD(, VFIOGuestIOMMU) giommu_list; > QLIST_HEAD(, VFIOHostDMAWindow) hostwin_list; > QLIST_HEAD(, VFIOGroup) group_list;