Re: [PATCH v2 2/2] vhost: Perform memory section dirty scans once per iteration

2024-03-12 Thread Michael S. Tsirkin
On Wed, Feb 14, 2024 at 03:50:19AM -0800, Si-Wei Liu wrote:
> On setups with one or more virtio-net devices with vhost on,
> dirty tracking iteration increases in cost with the number of
> queues that are set up, e.g. on idle guest migration the
> following is observed with virtio-net with vhost=on:
> 
> 48 queues -> 78.11%  [.] vhost_dev_sync_region.isra.13
> 8 queues -> 40.50%   [.] vhost_dev_sync_region.isra.13
> 1 queue -> 6.89% [.] vhost_dev_sync_region.isra.13
> 2 devices, 1 queue -> 18.60%  [.] vhost_dev_sync_region.isra.14

Given the drastic slowdown I am prepared to treat this as
a bugfix, provided a rebased version addressing all comments
is sent early during the freeze.

> With high memory dirtying rates the symptom is lack of convergence
> as soon as there is a vhost device with a sufficiently high number
> of queues, or a sufficient number of vhost devices.
> 
> On every migration iteration (every 100msecs) the *shared log* is
> redundantly queried as many times as there are queues configured
> with vhost in the guest. For the virtqueue data this is necessary,
> but not for the memory sections, which are the same across devices.
> So essentially we end up scanning the dirty log too often.
> 
> To fix that, select one vhost device to be responsible for scanning
> the log with regard to memory section dirty tracking. It is selected
> when we enable the logger (during migration) and cleared when we
> disable the logger. If the vhost logger device goes away for some
> reason, the logger is re-elected from the remaining vhost devices.
> 
> Co-developed-by: Joao Martins 
> Signed-off-by: Joao Martins 
> Signed-off-by: Si-Wei Liu 
> ---
>  hw/virtio/vhost.c         | 75 +++
>  include/hw/virtio/vhost.h |  1 +
>  2 files changed, 70 insertions(+), 6 deletions(-)
> 
> diff --git a/hw/virtio/vhost.c b/hw/virtio/vhost.c
> index ef6d9b5..997d560 100644
> --- a/hw/virtio/vhost.c
> +++ b/hw/virtio/vhost.c
> @@ -45,6 +45,9 @@
>  
>  static struct vhost_log *vhost_log[VHOST_BACKEND_TYPE_MAX];
>  static struct vhost_log *vhost_log_shm[VHOST_BACKEND_TYPE_MAX];
> +static struct vhost_dev *vhost_mem_logger[VHOST_BACKEND_TYPE_MAX];
> +static QLIST_HEAD(, vhost_dev) vhost_mlog_devices =
> +    QLIST_HEAD_INITIALIZER(vhost_mlog_devices);
>  
>  /* Memslots used by backends that support private memslots (without an fd). */
>  static unsigned int used_memslots;
> @@ -149,6 +152,53 @@ bool vhost_dev_has_iommu(struct vhost_dev *dev)
>  }
>  }
>  
> +static bool vhost_log_dev_enabled(struct vhost_dev *dev)
> +{
> +    assert(dev->vhost_ops);
> +    assert(dev->vhost_ops->backend_type > VHOST_BACKEND_TYPE_NONE);
> +    assert(dev->vhost_ops->backend_type < VHOST_BACKEND_TYPE_MAX);
> +
> +    return dev == vhost_mem_logger[dev->vhost_ops->backend_type];
> +}
> +
> +static void vhost_mlog_set_dev(struct vhost_dev *hdev, bool enable)
> +{
> +    struct vhost_dev *logdev = NULL;
> +    VhostBackendType backend_type;
> +    bool reelect = false;
> +
> +    assert(hdev->vhost_ops);
> +    assert(hdev->vhost_ops->backend_type > VHOST_BACKEND_TYPE_NONE);
> +    assert(hdev->vhost_ops->backend_type < VHOST_BACKEND_TYPE_MAX);
> +
> +    backend_type = hdev->vhost_ops->backend_type;
> +
> +    if (enable && !QLIST_IS_INSERTED(hdev, logdev_entry)) {
> +        reelect = !vhost_mem_logger[backend_type];
> +        QLIST_INSERT_HEAD(&vhost_mlog_devices, hdev, logdev_entry);
> +    } else if (!enable && QLIST_IS_INSERTED(hdev, logdev_entry)) {
> +        reelect = vhost_mem_logger[backend_type] == hdev;
> +        QLIST_REMOVE(hdev, logdev_entry);
> +    }
> +
> +    if (!reelect)
> +        return;
> +
> +    QLIST_FOREACH(hdev, &vhost_mlog_devices, logdev_entry) {
> +        if (!hdev->vhost_ops ||
> +            hdev->vhost_ops->backend_type == VHOST_BACKEND_TYPE_NONE ||
> +            hdev->vhost_ops->backend_type >= VHOST_BACKEND_TYPE_MAX)
> +            continue;
> +
> +        if (hdev->vhost_ops->backend_type == backend_type) {
> +            logdev = hdev;
> +            break;
> +        }
> +    }
> +
> +    vhost_mem_logger[backend_type] = logdev;
> +}
> +
>  static int vhost_sync_dirty_bitmap(struct vhost_dev *dev,
>                                     MemoryRegionSection *section,
>                                     hwaddr first,
> @@ -166,12 +216,14 @@ static int vhost_sync_dirty_bitmap(struct vhost_dev *dev,
>      start_addr = MAX(first, start_addr);
>      end_addr = MIN(last, end_addr);
>  
> -    for (i = 0; i < dev->mem->nregions; ++i) {
> -        struct vhost_memory_region *reg = dev->mem->regions + i;
> -        vhost_dev_sync_region(dev, section, start_addr, end_addr,
> -                              reg->guest_phys_addr,
> -                              range_get_last(reg->guest_phys_addr,
> -                                             reg->memory_size));
> +    if (vhost_log_dev_enabled(dev)) {
> +        for (i = 0; i < dev->mem->nregions; ++i) {
> +            struct vhost_memory_region *reg = dev->mem->regions + i;

Re: [PATCH v2 2/2] vhost: Perform memory section dirty scans once per iteration

2024-03-06 Thread Eugenio Perez Martin
On Wed, Feb 14, 2024 at 2:02 PM Si-Wei Liu  wrote:
>
> On setups with one or more virtio-net devices with vhost on,
> dirty tracking iteration increases in cost with the number of
> queues that are set up, e.g. on idle guest migration the
> following is observed with virtio-net with vhost=on:
>
> 48 queues -> 78.11%  [.] vhost_dev_sync_region.isra.13
> 8 queues -> 40.50%   [.] vhost_dev_sync_region.isra.13
> 1 queue -> 6.89% [.] vhost_dev_sync_region.isra.13
> 2 devices, 1 queue -> 18.60%  [.] vhost_dev_sync_region.isra.14
>

I think the "after" numbers from the same benchmark should also be included.

> With high memory dirtying rates the symptom is lack of convergence
> as soon as there is a vhost device with a sufficiently high number
> of queues, or a sufficient number of vhost devices.
>
> On every migration iteration (every 100msecs) the *shared log* is
> redundantly queried as many times as there are queues configured
> with vhost in the guest. For the virtqueue data this is necessary,
> but not for the memory sections, which are the same across devices.
> So essentially we end up scanning the dirty log too often.
>
> To fix that, select one vhost device to be responsible for scanning
> the log with regard to memory section dirty tracking. It is selected
> when we enable the logger (during migration) and cleared when we
> disable the logger. If the vhost logger device goes away for some
> reason, the logger is re-elected from the remaining vhost devices.
>
> Co-developed-by: Joao Martins 
> Signed-off-by: Joao Martins 
> Signed-off-by: Si-Wei Liu 
> ---
>  hw/virtio/vhost.c         | 75 +++
>  include/hw/virtio/vhost.h |  1 +
>  2 files changed, 70 insertions(+), 6 deletions(-)
>
> diff --git a/hw/virtio/vhost.c b/hw/virtio/vhost.c
> index ef6d9b5..997d560 100644
> --- a/hw/virtio/vhost.c
> +++ b/hw/virtio/vhost.c
> @@ -45,6 +45,9 @@
>
>  static struct vhost_log *vhost_log[VHOST_BACKEND_TYPE_MAX];
>  static struct vhost_log *vhost_log_shm[VHOST_BACKEND_TYPE_MAX];
> +static struct vhost_dev *vhost_mem_logger[VHOST_BACKEND_TYPE_MAX];
> +static QLIST_HEAD(, vhost_dev) vhost_mlog_devices =
> +    QLIST_HEAD_INITIALIZER(vhost_mlog_devices);
>
>  /* Memslots used by backends that support private memslots (without an fd). */
>  static unsigned int used_memslots;
> @@ -149,6 +152,53 @@ bool vhost_dev_has_iommu(struct vhost_dev *dev)
>  }
>  }
>
> +static bool vhost_log_dev_enabled(struct vhost_dev *dev)

"Enabled" sounds misleading to me. Maybe vhost_dev_should_log? More
suggestions below.

> +{
> +    assert(dev->vhost_ops);
> +    assert(dev->vhost_ops->backend_type > VHOST_BACKEND_TYPE_NONE);
> +    assert(dev->vhost_ops->backend_type < VHOST_BACKEND_TYPE_MAX);
> +
> +    return dev == vhost_mem_logger[dev->vhost_ops->backend_type];
> +}
> +
> +static void vhost_mlog_set_dev(struct vhost_dev *hdev, bool enable)
> +{
> +    struct vhost_dev *logdev = NULL;
> +    VhostBackendType backend_type;
> +    bool reelect = false;
> +
> +    assert(hdev->vhost_ops);
> +    assert(hdev->vhost_ops->backend_type > VHOST_BACKEND_TYPE_NONE);
> +    assert(hdev->vhost_ops->backend_type < VHOST_BACKEND_TYPE_MAX);
> +
> +    backend_type = hdev->vhost_ops->backend_type;
> +
> +    if (enable && !QLIST_IS_INSERTED(hdev, logdev_entry)) {
> +        reelect = !vhost_mem_logger[backend_type];
> +        QLIST_INSERT_HEAD(&vhost_mlog_devices, hdev, logdev_entry);
> +    } else if (!enable && QLIST_IS_INSERTED(hdev, logdev_entry)) {
> +        reelect = vhost_mem_logger[backend_type] == hdev;
> +        QLIST_REMOVE(hdev, logdev_entry);
> +    }
> +
> +    if (!reelect)
> +        return;
> +
> +    QLIST_FOREACH(hdev, &vhost_mlog_devices, logdev_entry) {
> +        if (!hdev->vhost_ops ||
> +            hdev->vhost_ops->backend_type == VHOST_BACKEND_TYPE_NONE ||
> +            hdev->vhost_ops->backend_type >= VHOST_BACKEND_TYPE_MAX)

Aren't comparisons with ops->backend_type already contained in the
following "hdev->vhost_ops->backend_type == backend_type" ?
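
I.e., assuming every device on the list keeps a valid vhost_ops while
inserted, the loop could reduce to (sketch):

    QLIST_FOREACH(hdev, &vhost_mlog_devices, logdev_entry) {
        if (hdev->vhost_ops->backend_type == backend_type) {
            logdev = hdev;
            break;
        }
    }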

> +            continue;
> +
> +        if (hdev->vhost_ops->backend_type == backend_type) {
> +            logdev = hdev;
> +            break;
> +        }

Why not use VHOST_BACKEND_TYPE_MAX QLISTs, and then simply check if
*dev is the head at vhost_log_dev_enabled?

That way we can remove this foreach and vhost_log_dev_enabled
entirely, as the check is simpler. I think it could even remove this
function entirely and inline QLIST_INSERT / QLIST_REMOVE at callers.
What do you think?
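
Something along these lines, as a rough sketch (untested; borrowing
the vhost_dev_should_log name suggested above):

    static QLIST_HEAD(, vhost_dev) vhost_mlog_devices[VHOST_BACKEND_TYPE_MAX];

    static bool vhost_dev_should_log(struct vhost_dev *dev)
    {
        VhostBackendType backend_type = dev->vhost_ops->backend_type;

        /* The head of the per-backend list is the current logger. */
        return dev == QLIST_FIRST(&vhost_mlog_devices[backend_type]);
    }

Enabling then becomes QLIST_INSERT_HEAD() and disabling QLIST_REMOVE();
removing the current head implicitly promotes the next device, so no
explicit re-election pass is needed.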

Thanks!

> +    }
> +
> +    vhost_mem_logger[backend_type] = logdev;
> +}
> +
>  static int vhost_sync_dirty_bitmap(struct vhost_dev *dev,
>                                     MemoryRegionSection *section,
>                                     hwaddr first,
> @@ -166,12 +216,14 @@ static int vhost_sync_dirty_bitmap(struct vhost_dev *dev,
>      start_addr = MAX(first, start_addr);
>      end_addr = MIN(last, end_addr);
>
> -    for (i = 0; i < dev->mem->nregions; ++i) {
> -        struct vhost_memory_region *reg = dev->mem->regions + i;

[PATCH v2 2/2] vhost: Perform memory section dirty scans once per iteration

2024-02-14 Thread Si-Wei Liu
On setups with one or more virtio-net devices with vhost on,
dirty tracking iteration increases in cost with the number of
queues that are set up, e.g. on idle guest migration the
following is observed with virtio-net with vhost=on:

48 queues -> 78.11%  [.] vhost_dev_sync_region.isra.13
8 queues -> 40.50%   [.] vhost_dev_sync_region.isra.13
1 queue -> 6.89% [.] vhost_dev_sync_region.isra.13
2 devices, 1 queue -> 18.60%  [.] vhost_dev_sync_region.isra.14

With high memory dirtying rates the symptom is lack of convergence
as soon as there is a vhost device with a sufficiently high number
of queues, or a sufficient number of vhost devices.

On every migration iteration (every 100msecs) the *shared log* is
redundantly queried as many times as there are queues configured
with vhost in the guest. For the virtqueue data this is necessary,
but not for the memory sections, which are the same across devices.
So essentially we end up scanning the dirty log too often.

To fix that, select one vhost device to be responsible for scanning
the log with regard to memory section dirty tracking. It is selected
when we enable the logger (during migration) and cleared when we
disable the logger. If the vhost logger device goes away for some
reason, the logger is re-elected from the remaining vhost devices.

Co-developed-by: Joao Martins 
Signed-off-by: Joao Martins 
Signed-off-by: Si-Wei Liu 
---
 hw/virtio/vhost.c         | 75 +++
 include/hw/virtio/vhost.h |  1 +
 2 files changed, 70 insertions(+), 6 deletions(-)

diff --git a/hw/virtio/vhost.c b/hw/virtio/vhost.c
index ef6d9b5..997d560 100644
--- a/hw/virtio/vhost.c
+++ b/hw/virtio/vhost.c
@@ -45,6 +45,9 @@
 
 static struct vhost_log *vhost_log[VHOST_BACKEND_TYPE_MAX];
 static struct vhost_log *vhost_log_shm[VHOST_BACKEND_TYPE_MAX];
+static struct vhost_dev *vhost_mem_logger[VHOST_BACKEND_TYPE_MAX];
+static QLIST_HEAD(, vhost_dev) vhost_mlog_devices =
+    QLIST_HEAD_INITIALIZER(vhost_mlog_devices);
 
 /* Memslots used by backends that support private memslots (without an fd). */
 static unsigned int used_memslots;
@@ -149,6 +152,53 @@ bool vhost_dev_has_iommu(struct vhost_dev *dev)
 }
 }
 
+static bool vhost_log_dev_enabled(struct vhost_dev *dev)
+{
+    assert(dev->vhost_ops);
+    assert(dev->vhost_ops->backend_type > VHOST_BACKEND_TYPE_NONE);
+    assert(dev->vhost_ops->backend_type < VHOST_BACKEND_TYPE_MAX);
+
+    return dev == vhost_mem_logger[dev->vhost_ops->backend_type];
+}
+
+static void vhost_mlog_set_dev(struct vhost_dev *hdev, bool enable)
+{
+    struct vhost_dev *logdev = NULL;
+    VhostBackendType backend_type;
+    bool reelect = false;
+
+    assert(hdev->vhost_ops);
+    assert(hdev->vhost_ops->backend_type > VHOST_BACKEND_TYPE_NONE);
+    assert(hdev->vhost_ops->backend_type < VHOST_BACKEND_TYPE_MAX);
+
+    backend_type = hdev->vhost_ops->backend_type;
+
+    if (enable && !QLIST_IS_INSERTED(hdev, logdev_entry)) {
+        reelect = !vhost_mem_logger[backend_type];
+        QLIST_INSERT_HEAD(&vhost_mlog_devices, hdev, logdev_entry);
+    } else if (!enable && QLIST_IS_INSERTED(hdev, logdev_entry)) {
+        reelect = vhost_mem_logger[backend_type] == hdev;
+        QLIST_REMOVE(hdev, logdev_entry);
+    }
+
+    if (!reelect)
+        return;
+
+    QLIST_FOREACH(hdev, &vhost_mlog_devices, logdev_entry) {
+        if (!hdev->vhost_ops ||
+            hdev->vhost_ops->backend_type == VHOST_BACKEND_TYPE_NONE ||
+            hdev->vhost_ops->backend_type >= VHOST_BACKEND_TYPE_MAX)
+            continue;
+
+        if (hdev->vhost_ops->backend_type == backend_type) {
+            logdev = hdev;
+            break;
+        }
+    }
+
+    vhost_mem_logger[backend_type] = logdev;
+}
+
 static int vhost_sync_dirty_bitmap(struct vhost_dev *dev,
                                    MemoryRegionSection *section,
                                    hwaddr first,
@@ -166,12 +216,14 @@ static int vhost_sync_dirty_bitmap(struct vhost_dev *dev,
     start_addr = MAX(first, start_addr);
     end_addr = MIN(last, end_addr);
 
-    for (i = 0; i < dev->mem->nregions; ++i) {
-        struct vhost_memory_region *reg = dev->mem->regions + i;
-        vhost_dev_sync_region(dev, section, start_addr, end_addr,
-                              reg->guest_phys_addr,
-                              range_get_last(reg->guest_phys_addr,
-                                             reg->memory_size));
+    if (vhost_log_dev_enabled(dev)) {
+        for (i = 0; i < dev->mem->nregions; ++i) {
+            struct vhost_memory_region *reg = dev->mem->regions + i;
+            vhost_dev_sync_region(dev, section, start_addr, end_addr,
+                                  reg->guest_phys_addr,
+                                  range_get_last(reg->guest_phys_addr,
+                                                 reg->memory_size));
+        }
     }
     for (i = 0; i < dev->nvqs; ++i) {
         struct vhost_virtqueue *vq = dev->vqs + i;
@@ -382,6 +434,7 @@ static void
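
For illustration only (this is not part of the patch), the election
scheme the patch implements boils down to the following self-contained
sketch, using the BSD <sys/queue.h> macros that QEMU's QLIST mirrors.
The per-backend-type dimension is collapsed and all names (set_logger,
should_log) are invented for brevity:

    #include <assert.h>
    #include <stdbool.h>
    #include <stddef.h>
    #include <sys/queue.h>

    struct dev {
        int id;
        LIST_ENTRY(dev) entry;
    };

    static LIST_HEAD(, dev) loggers = LIST_HEAD_INITIALIZER(loggers);
    static struct dev *elected;     /* the one device that scans the log */

    /* Register (enable) or unregister (disable) a candidate logger;
     * re-elect only when the elected device is affected. Unlike the
     * patch, no double-insert guard (QLIST_IS_INSERTED) is modeled. */
    static void set_logger(struct dev *d, bool enable)
    {
        bool reelect;

        if (enable) {
            reelect = !elected;             /* first candidate wins */
            LIST_INSERT_HEAD(&loggers, d, entry);
        } else {
            reelect = (elected == d);       /* elected device going away */
            LIST_REMOVE(d, entry);
        }
        if (reelect) {
            elected = LIST_FIRST(&loggers); /* NULL when none are left */
        }
    }

    /* Only the elected device performs the memory-section dirty scan. */
    static bool should_log(struct dev *d)
    {
        return d == elected;
    }

    int main(void)
    {
        struct dev a = { .id = 1 }, b = { .id = 2 };

        set_logger(&a, true);
        set_logger(&b, true);
        assert(should_log(&a) && !should_log(&b)); /* a elected first */

        set_logger(&a, false);                     /* a goes away ...  */
        assert(should_log(&b));                    /* ... b takes over */

        set_logger(&b, false);
        assert(elected == NULL);
        return 0;
    }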