Re: [PATCH v2 2/2] vhost: Perform memory section dirty scans once per iteration
On Wed, Feb 14, 2024 at 03:50:19AM -0800, Si-Wei Liu wrote: > On setups with one or more virtio-net devices with vhost on, > dirty tracking iteration increases cost the bigger the number > amount of queues are set up e.g. on idle guests migration the > following is observed with virtio-net with vhost=on: > > 48 queues -> 78.11% [.] vhost_dev_sync_region.isra.13 > 8 queues -> 40.50% [.] vhost_dev_sync_region.isra.13 > 1 queue -> 6.89% [.] vhost_dev_sync_region.isra.13 > 2 devices, 1 queue -> 18.60% [.] vhost_dev_sync_region.isra.14 Given the drastic slowdown I am prepared to treat this as a bugfix if a version addressing all comments and rebased is sent early during the freeze. > With high memory rates the symptom is lack of convergence as soon > as it has a vhost device with a sufficiently high number of queues, > the sufficient number of vhost devices. > > On every migration iteration (every 100msecs) it will redundantly > query the *shared log* the number of queues configured with vhost > that exist in the guest. For the virtqueue data, this is necessary, > but not for the memory sections which are the same. So > essentially we end up scanning the dirty log too often. > > To fix that, select a vhost device responsible for scanning the > log with regards to memory sections dirty tracking. It is selected > when we enable the logger (during migration) and cleared when we > disable the logger. If the vhost logger device goes away for some > reason, the logger will be re-selected from the rest of vhost > devices. 
> > Co-developed-by: Joao Martins > Signed-off-by: Joao Martins > Signed-off-by: Si-Wei Liu > --- > hw/virtio/vhost.c | 75 > +++ > include/hw/virtio/vhost.h | 1 + > 2 files changed, 70 insertions(+), 6 deletions(-) > > diff --git a/hw/virtio/vhost.c b/hw/virtio/vhost.c > index ef6d9b5..997d560 100644 > --- a/hw/virtio/vhost.c > +++ b/hw/virtio/vhost.c > @@ -45,6 +45,9 @@ > > static struct vhost_log *vhost_log[VHOST_BACKEND_TYPE_MAX]; > static struct vhost_log *vhost_log_shm[VHOST_BACKEND_TYPE_MAX]; > +static struct vhost_dev *vhost_mem_logger[VHOST_BACKEND_TYPE_MAX]; > +static QLIST_HEAD(, vhost_dev) vhost_mlog_devices = > +QLIST_HEAD_INITIALIZER(vhost_mlog_devices); > > /* Memslots used by backends that support private memslots (without an fd). > */ > static unsigned int used_memslots; > @@ -149,6 +152,53 @@ bool vhost_dev_has_iommu(struct vhost_dev *dev) > } > } > > +static bool vhost_log_dev_enabled(struct vhost_dev *dev) > +{ > +assert(dev->vhost_ops); > +assert(dev->vhost_ops->backend_type > VHOST_BACKEND_TYPE_NONE); > +assert(dev->vhost_ops->backend_type < VHOST_BACKEND_TYPE_MAX); > + > +return dev == vhost_mem_logger[dev->vhost_ops->backend_type]; > +} > + > +static void vhost_mlog_set_dev(struct vhost_dev *hdev, bool enable) > +{ > +struct vhost_dev *logdev = NULL; > +VhostBackendType backend_type; > +bool reelect = false; > + > +assert(hdev->vhost_ops); > +assert(hdev->vhost_ops->backend_type > VHOST_BACKEND_TYPE_NONE); > +assert(hdev->vhost_ops->backend_type < VHOST_BACKEND_TYPE_MAX); > + > +backend_type = hdev->vhost_ops->backend_type; > + > +if (enable && !QLIST_IS_INSERTED(hdev, logdev_entry)) { > +reelect = !vhost_mem_logger[backend_type]; > +QLIST_INSERT_HEAD(&vhost_mlog_devices, hdev, logdev_entry); > +} else if (!enable && QLIST_IS_INSERTED(hdev, logdev_entry)) { > +reelect = vhost_mem_logger[backend_type] == hdev; > +QLIST_REMOVE(hdev, logdev_entry); > +} > + > +if (!reelect) > +return; > + > +QLIST_FOREACH(hdev, &vhost_mlog_devices, logdev_entry) { > +if 
(!hdev->vhost_ops || > +hdev->vhost_ops->backend_type == VHOST_BACKEND_TYPE_NONE || > +hdev->vhost_ops->backend_type >= VHOST_BACKEND_TYPE_MAX) > +continue; > + > +if (hdev->vhost_ops->backend_type == backend_type) { > +logdev = hdev; > +break; > +} > +} > + > +vhost_mem_logger[backend_type] = logdev; > +} > + > static int vhost_sync_dirty_bitmap(struct vhost_dev *dev, > MemoryRegionSection *section, > hwaddr first, > @@ -166,12 +216,14 @@ static int vhost_sync_dirty_bitmap(struct vhost_dev > *dev, > start_addr = MAX(first, start_addr); > end_addr = MIN(last, end_addr); > > -for (i = 0; i < dev->mem->nregions; ++i) { > -struct vhost_memory_region *reg = dev->mem->regions + i; > -vhost_dev_sync_region(dev, section, start_addr, end_addr, > - reg->guest_phys_addr, > - range_get_last(reg->guest_phys_addr, > - reg->memory_size)); > +if (vhost_log_dev_enabled(dev)) { > +for (i = 0; i < dev->mem->nregions; ++i) { > +struct
Re: [PATCH v2 2/2] vhost: Perform memory section dirty scans once per iteration
On Wed, Feb 14, 2024 at 2:02 PM Si-Wei Liu wrote: > > On setups with one or more virtio-net devices with vhost on, > dirty tracking iteration increases cost the bigger the number > amount of queues are set up e.g. on idle guests migration the > following is observed with virtio-net with vhost=on: > > 48 queues -> 78.11% [.] vhost_dev_sync_region.isra.13 > 8 queues -> 40.50% [.] vhost_dev_sync_region.isra.13 > 1 queue -> 6.89% [.] vhost_dev_sync_region.isra.13 > 2 devices, 1 queue -> 18.60% [.] vhost_dev_sync_region.isra.14 > I think the after benchmark should also be included. > With high memory rates the symptom is lack of convergence as soon > as it has a vhost device with a sufficiently high number of queues, > the sufficient number of vhost devices. > > On every migration iteration (every 100msecs) it will redundantly > query the *shared log* the number of queues configured with vhost > that exist in the guest. For the virtqueue data, this is necessary, > but not for the memory sections which are the same. So > essentially we end up scanning the dirty log too often. > > To fix that, select a vhost device responsible for scanning the > log with regards to memory sections dirty tracking. It is selected > when we enable the logger (during migration) and cleared when we > disable the logger. If the vhost logger device goes away for some > reason, the logger will be re-selected from the rest of vhost > devices. 
> > Co-developed-by: Joao Martins > Signed-off-by: Joao Martins > Signed-off-by: Si-Wei Liu > --- > hw/virtio/vhost.c | 75 > +++ > include/hw/virtio/vhost.h | 1 + > 2 files changed, 70 insertions(+), 6 deletions(-) > > diff --git a/hw/virtio/vhost.c b/hw/virtio/vhost.c > index ef6d9b5..997d560 100644 > --- a/hw/virtio/vhost.c > +++ b/hw/virtio/vhost.c > @@ -45,6 +45,9 @@ > > static struct vhost_log *vhost_log[VHOST_BACKEND_TYPE_MAX]; > static struct vhost_log *vhost_log_shm[VHOST_BACKEND_TYPE_MAX]; > +static struct vhost_dev *vhost_mem_logger[VHOST_BACKEND_TYPE_MAX]; > +static QLIST_HEAD(, vhost_dev) vhost_mlog_devices = > +QLIST_HEAD_INITIALIZER(vhost_mlog_devices); > > /* Memslots used by backends that support private memslots (without an fd). > */ > static unsigned int used_memslots; > @@ -149,6 +152,53 @@ bool vhost_dev_has_iommu(struct vhost_dev *dev) > } > } > > +static bool vhost_log_dev_enabled(struct vhost_dev *dev) "Enabled" sounds misleading to me. Maybe vhost_dev_should_log? More suggestions below. 
> +{ > +assert(dev->vhost_ops); > +assert(dev->vhost_ops->backend_type > VHOST_BACKEND_TYPE_NONE); > +assert(dev->vhost_ops->backend_type < VHOST_BACKEND_TYPE_MAX); > + > +return dev == vhost_mem_logger[dev->vhost_ops->backend_type]; > +} > + > +static void vhost_mlog_set_dev(struct vhost_dev *hdev, bool enable) > +{ > +struct vhost_dev *logdev = NULL; > +VhostBackendType backend_type; > +bool reelect = false; > + > +assert(hdev->vhost_ops); > +assert(hdev->vhost_ops->backend_type > VHOST_BACKEND_TYPE_NONE); > +assert(hdev->vhost_ops->backend_type < VHOST_BACKEND_TYPE_MAX); > + > +backend_type = hdev->vhost_ops->backend_type; > + > +if (enable && !QLIST_IS_INSERTED(hdev, logdev_entry)) { > +reelect = !vhost_mem_logger[backend_type]; > +QLIST_INSERT_HEAD(&vhost_mlog_devices, hdev, logdev_entry); > +} else if (!enable && QLIST_IS_INSERTED(hdev, logdev_entry)) { > +reelect = vhost_mem_logger[backend_type] == hdev; > +QLIST_REMOVE(hdev, logdev_entry); > +} > + > +if (!reelect) > +return; > + > +QLIST_FOREACH(hdev, &vhost_mlog_devices, logdev_entry) { > +if (!hdev->vhost_ops || > +hdev->vhost_ops->backend_type == VHOST_BACKEND_TYPE_NONE || > +hdev->vhost_ops->backend_type >= VHOST_BACKEND_TYPE_MAX) Aren't comparisons with ops->backend_type already contained in the following "hdev->vhost_ops->backend_type == backend_type" ? > +continue; > + > +if (hdev->vhost_ops->backend_type == backend_type) { > +logdev = hdev; > +break; > +} Why not use VHOST_BACKEND_TYPE_MAX QLISTs, and then simply check if *dev is the head at vhost_log_dev_enabled? That way we can remove this foreach and vhost_log_dev_enabled entirely, as the check is simpler. I think it could even remove this function entirely and inline QLIST_INSERT / QLIST_REMOVE at callers. What do you think? Thanks! 
> +} > + > +vhost_mem_logger[backend_type] = logdev; > +} > + > static int vhost_sync_dirty_bitmap(struct vhost_dev *dev, > MemoryRegionSection *section, > hwaddr first, > @@ -166,12 +216,14 @@ static int vhost_sync_dirty_bitmap(struct vhost_dev > *dev, > start_addr = MAX(first, start_addr); > end_addr = MIN(last, end_addr); > > -for (i = 0; i < dev->mem->nregions; ++i) { > -
[PATCH v2 2/2] vhost: Perform memory section dirty scans once per iteration
On setups with one or more virtio-net devices with vhost on, dirty tracking iteration increases cost the bigger the number of queues that are set up e.g. on idle guests migration the following is observed with virtio-net with vhost=on: 48 queues -> 78.11% [.] vhost_dev_sync_region.isra.13 8 queues -> 40.50% [.] vhost_dev_sync_region.isra.13 1 queue -> 6.89% [.] vhost_dev_sync_region.isra.13 2 devices, 1 queue -> 18.60% [.] vhost_dev_sync_region.isra.14 With high memory rates the symptom is lack of convergence as soon as it has a vhost device with a sufficiently high number of queues, or a sufficient number of vhost devices. On every migration iteration (every 100msecs) it will redundantly query the *shared log* as many times as the number of queues configured with vhost that exist in the guest. For the virtqueue data, this is necessary, but not for the memory sections which are the same. So essentially we end up scanning the dirty log too often. To fix that, select a vhost device responsible for scanning the log with regards to memory sections dirty tracking. It is selected when we enable the logger (during migration) and cleared when we disable the logger. If the vhost logger device goes away for some reason, the logger will be re-selected from the rest of vhost devices. Co-developed-by: Joao Martins Signed-off-by: Joao Martins Signed-off-by: Si-Wei Liu --- hw/virtio/vhost.c | 75 +++ include/hw/virtio/vhost.h | 1 + 2 files changed, 70 insertions(+), 6 deletions(-) diff --git a/hw/virtio/vhost.c b/hw/virtio/vhost.c index ef6d9b5..997d560 100644 --- a/hw/virtio/vhost.c +++ b/hw/virtio/vhost.c @@ -45,6 +45,9 @@ static struct vhost_log *vhost_log[VHOST_BACKEND_TYPE_MAX]; static struct vhost_log *vhost_log_shm[VHOST_BACKEND_TYPE_MAX]; +static struct vhost_dev *vhost_mem_logger[VHOST_BACKEND_TYPE_MAX]; +static QLIST_HEAD(, vhost_dev) vhost_mlog_devices = +QLIST_HEAD_INITIALIZER(vhost_mlog_devices); /* Memslots used by backends that support private memslots (without an fd). 
*/ static unsigned int used_memslots; @@ -149,6 +152,53 @@ bool vhost_dev_has_iommu(struct vhost_dev *dev) } } +static bool vhost_log_dev_enabled(struct vhost_dev *dev) +{ +assert(dev->vhost_ops); +assert(dev->vhost_ops->backend_type > VHOST_BACKEND_TYPE_NONE); +assert(dev->vhost_ops->backend_type < VHOST_BACKEND_TYPE_MAX); + +return dev == vhost_mem_logger[dev->vhost_ops->backend_type]; +} + +static void vhost_mlog_set_dev(struct vhost_dev *hdev, bool enable) +{ +struct vhost_dev *logdev = NULL; +VhostBackendType backend_type; +bool reelect = false; + +assert(hdev->vhost_ops); +assert(hdev->vhost_ops->backend_type > VHOST_BACKEND_TYPE_NONE); +assert(hdev->vhost_ops->backend_type < VHOST_BACKEND_TYPE_MAX); + +backend_type = hdev->vhost_ops->backend_type; + +if (enable && !QLIST_IS_INSERTED(hdev, logdev_entry)) { +reelect = !vhost_mem_logger[backend_type]; +QLIST_INSERT_HEAD(&vhost_mlog_devices, hdev, logdev_entry); +} else if (!enable && QLIST_IS_INSERTED(hdev, logdev_entry)) { +reelect = vhost_mem_logger[backend_type] == hdev; +QLIST_REMOVE(hdev, logdev_entry); +} + +if (!reelect) +return; + +QLIST_FOREACH(hdev, &vhost_mlog_devices, logdev_entry) { +if (!hdev->vhost_ops || +hdev->vhost_ops->backend_type == VHOST_BACKEND_TYPE_NONE || +hdev->vhost_ops->backend_type >= VHOST_BACKEND_TYPE_MAX) +continue; + +if (hdev->vhost_ops->backend_type == backend_type) { +logdev = hdev; +break; +} +} + +vhost_mem_logger[backend_type] = logdev; +} + static int vhost_sync_dirty_bitmap(struct vhost_dev *dev, MemoryRegionSection *section, hwaddr first, @@ -166,12 +216,14 @@ static int vhost_sync_dirty_bitmap(struct vhost_dev *dev, start_addr = MAX(first, start_addr); end_addr = MIN(last, end_addr); -for (i = 0; i < dev->mem->nregions; ++i) { -struct vhost_memory_region *reg = dev->mem->regions + i; -vhost_dev_sync_region(dev, section, start_addr, end_addr, - reg->guest_phys_addr, - range_get_last(reg->guest_phys_addr, - reg->memory_size)); +if (vhost_log_dev_enabled(dev)) { +for (i = 0; i < 
dev->mem->nregions; ++i) { +struct vhost_memory_region *reg = dev->mem->regions + i; +vhost_dev_sync_region(dev, section, start_addr, end_addr, + reg->guest_phys_addr, + range_get_last(reg->guest_phys_addr, + reg->memory_size)); +} } for (i = 0; i < dev->nvqs; ++i) { struct vhost_virtqueue *vq = dev->vqs + i; @@ -382,6 +434,7 @@ static void