On Mon, Dec 22, 2025 at 6:10 AM Wafer <[email protected]> wrote:
>
>
>
> > -----Original Message-----
> > From: Eugenio Perez Martin <[email protected]>
> > Sent: 19 December 2025 16:12
> > To: Wafer <[email protected]>
> > Cc: [email protected]; [email protected]; [email protected];
> > [email protected]; [email protected]; Angus Chen
> > <[email protected]>
> > Subject: Re: [PATCH v3 2/4] vdpa: implement a statically allocated buffer
> > for
> > SVQ
> >
> >
> >
> > On Tue, Dec 16, 2025 at 2:55 AM Wafer Xie <[email protected]> wrote:
> > >
> > > From: wafer Xie <[email protected]>
> > >
> > > The indirect descriptor buffers are allocated and initialized when
> > > creating the vhost-vdpa device, and released when vhost-vdpa is stopped.
> > >
> > > Signed-off-by: wafer Xie <[email protected]>
> > > ---
> > > hw/virtio/vhost-shadow-virtqueue.c | 25 +++++
> > > hw/virtio/vhost-vdpa.c | 163 ++++++++++++++++++++++++++++-
> > > 2 files changed, 187 insertions(+), 1 deletion(-)
> > >
> > > diff --git a/hw/virtio/vhost-shadow-virtqueue.c b/hw/virtio/vhost-shadow-virtqueue.c
> > > index 2481d49345..85eff1d841 100644
> > > --- a/hw/virtio/vhost-shadow-virtqueue.c
> > > +++ b/hw/virtio/vhost-shadow-virtqueue.c
> > > @@ -708,6 +708,25 @@ void vhost_svq_start(VhostShadowVirtqueue *svq, VirtIODevice *vdev,
> > > for (unsigned i = 0; i < svq->vring.num - 1; i++) {
> > > svq->desc_next[i] = i + 1;
> > > }
> > > +
> > > + /* Initialize indirect descriptor state */
> > > + svq->indirect_enabled = false;
> > > + svq->current_indirect_buf = -1;
> > > + for (int i = 0; i < SVQ_NUM_INDIRECT_BUFS; i++) {
> > > + svq->indirect_bufs[i].desc = NULL;
> > > + svq->indirect_bufs[i].iova = 0;
> > > + svq->indirect_bufs[i].size = 0;
> > > + svq->indirect_bufs[i].state = SVQ_INDIRECT_BUF_FREED;
> > > + svq->indirect_bufs[i].num_descs = 0;
> > > + svq->indirect_bufs[i].freed_descs = 0;
> > > + svq->indirect_bufs[i].freeing_descs = 0;
> > > + svq->indirect_bufs[i].freed_head = 0;
> > > + }
> > > +
> > > + /* Initialize desc_state indirect_buf_idx to -1 */
> > > + for (unsigned i = 0; i < svq->vring.num; i++) {
> > > + svq->desc_state[i].indirect_buf_idx = -1;
> > > + }
> > > }
> > >
> > > /**
> > > @@ -748,6 +767,10 @@ void vhost_svq_stop(VhostShadowVirtqueue *svq)
> > > munmap(svq->vring.desc, vhost_svq_driver_area_size(svq));
> > > munmap(svq->vring.used, vhost_svq_device_area_size(svq));
> > > event_notifier_set_handler(&svq->hdev_call, NULL);
> > > +
> > > + /* Reset indirect descriptor state (memory is freed by vhost-vdpa) */
> > > + svq->indirect_enabled = false;
> > > + svq->current_indirect_buf = -1;
> > > }
> > >
> > > /**
> > > @@ -765,6 +788,8 @@ VhostShadowVirtqueue *vhost_svq_new(const VhostShadowVirtqueueOps *ops,
> > > event_notifier_init_fd(&svq->svq_kick, VHOST_FILE_UNBIND);
> > > svq->ops = ops;
> > > svq->ops_opaque = ops_opaque;
> > > + svq->indirect_enabled = false;
> > > + svq->current_indirect_buf = -1;
> > > return svq;
> > > }
> > >
> > > diff --git a/hw/virtio/vhost-vdpa.c b/hw/virtio/vhost-vdpa.c
> > > index 7061b6e1a3..816868d153 100644
> > > --- a/hw/virtio/vhost-vdpa.c
> > > +++ b/hw/virtio/vhost-vdpa.c
> > > @@ -584,6 +584,155 @@ static int vhost_vdpa_get_dev_features(struct vhost_dev *dev,
> > > return ret;
> > > }
> > >
> > > +/**
> > > + * Allocate a single indirect descriptor buffer for SVQ
> > > + *
> > > + * @v: vhost_vdpa instance
> > > + * @num_descs: Number of descriptors (should be vring.num)
> > > + * @buf: Output buffer structure to fill
> > > + *
> > > + * Returns true on success, false on failure.
> > > + */
> > > +static bool vhost_vdpa_alloc_indirect_buf(struct vhost_vdpa *v,
> > > + uint16_t num_descs,
> > > + SVQIndirectDescBuf *buf) {
> > > + size_t desc_size = sizeof(vring_desc_t) * num_descs;
> > > + size_t alloc_size = ROUND_UP(desc_size, qemu_real_host_page_size());
> > > + DMAMap needle = {
> > > + .size = alloc_size - 1,
> > > + .perm = IOMMU_RO,
> > > + };
> > > + vring_desc_t *indirect_desc;
> > > + int r;
> > > +
> > > + /* Allocate memory for indirect descriptors */
> > > + indirect_desc = mmap(NULL, alloc_size, PROT_READ | PROT_WRITE,
> > > + MAP_SHARED | MAP_ANONYMOUS, -1, 0);
> > > + if (indirect_desc == MAP_FAILED) {
> > > + error_report("Cannot allocate indirect descriptor buffer");
> > > + return false;
> > > + }
> > > +
> > > + /* Allocate IOVA for the indirect descriptor buffer */
> > > + r = vhost_iova_tree_map_alloc(v->shared->iova_tree, &needle,
> > > + (hwaddr)(uintptr_t)indirect_desc);
> > > + if (unlikely(r != IOVA_OK)) {
> > > + error_report("Cannot allocate iova for indirect descriptors
> > > (%d)", r);
> > > + munmap(indirect_desc, alloc_size);
> > > + return false;
> > > + }
> > > +
> > > + /* Map to device */
> > > + r = vhost_vdpa_dma_map(v->shared, v->address_space_id, needle.iova,
> > > + alloc_size, indirect_desc, true);
> > > + if (unlikely(r != 0)) {
> > > + error_report("Cannot map indirect descriptors to device: %s
> > > (%d)",
> > > + g_strerror(-r), -r);
> > > + vhost_iova_tree_remove(v->shared->iova_tree, needle);
> > > + munmap(indirect_desc, alloc_size);
> > > + return false;
> > > + }
> > > +
> > > + buf->desc = indirect_desc;
> > > + buf->iova = needle.iova;
> > > + buf->size = alloc_size;
> > > + buf->state = SVQ_INDIRECT_BUF_FREED;
> > > + buf->num_descs = num_descs;
> > > + buf->freed_descs = num_descs; /* All descriptors are free initially */
> > > + buf->freeing_descs = 0;
> > > + buf->freed_head = 0;
> > > +
> > > + return true;
> > > +}
> > > +
> > > +/**
> > > + * Free a single indirect descriptor buffer
> > > + *
> > > + * @v: vhost_vdpa instance
> > > + * @buf: Buffer to free
> > > + */
> > > +static void vhost_vdpa_free_indirect_buf(struct vhost_vdpa *v,
> > > + SVQIndirectDescBuf *buf) {
> > > + DMAMap needle;
> > > + const DMAMap *result;
> > > + int r;
> > > +
> > > + if (!buf->desc) {
> > > + return;
> > > + }
> > > +
> > > + needle = (DMAMap) {
> > > + .translated_addr = (uint64_t)(uintptr_t)buf->desc,
> > > + };
> > > + result = vhost_iova_tree_find_iova(v->shared->iova_tree,
> > > + &needle);
> > > +
> > > + if (result) {
> > > + r = vhost_vdpa_dma_unmap(v->shared, v->address_space_id,
> > > + result->iova, buf->size);
> > > + if (unlikely(r != 0)) {
> > > + error_report("Cannot unmap indirect descriptors: %s (%d)",
> > > + g_strerror(-r), -r);
> > > + }
> > > +
> > > + vhost_iova_tree_remove(v->shared->iova_tree, *result);
> > > + }
> > > +
> > > + munmap(buf->desc, buf->size);
> > > + buf->desc = NULL;
> > > + buf->iova = 0;
> > > + buf->size = 0;
> > > +}
> > > +
> > > +/**
> > > + * Initialize indirect descriptor buffers for a single SVQ
> > > + *
> > > + * @v: vhost_vdpa instance
> > > + * @svq: Shadow virtqueue to initialize
> > > + *
> > > + * Returns true on success, false on failure.
> > > + */
> > > +static bool vhost_vdpa_svq_init_indirect(struct vhost_vdpa *v,
> > > + VhostShadowVirtqueue *svq)
> > > +{
> > > + if (!svq->vring.num) {
> > > + return true;
> > > + }
> > > +
> > > + /* Allocate indirect descriptor buffers for this SVQ */
> > > + for (int j = 0; j < SVQ_NUM_INDIRECT_BUFS; j++) {
> > > + if (!vhost_vdpa_alloc_indirect_buf(v, svq->vring.num * 4,
> > > + &svq->indirect_bufs[j])) {
> >
> > Why not allocate a single array of size "SVQ_NUM_INDIRECT_BUFS *
> > svq->vring.num * 4"? It should help management and maybe even
> > performance as memory accesses are more predictable.
> >
>
> We cannot assume or rely on the device to populate the used ring in the same
> order as the available ring.
> With a single large array, this may introduce holes, which makes managing
> freed descriptors difficult.
> In addition, the virtio specification requires indirect descriptor addresses
> to be contiguous, which further complicates management with a single array.
>
Actually you made me review QEMU's virtqueue_pop and vhost code, and it
does not agree with the standard, so thanks for that :). As you
correctly point out, the standard says:

  The first descriptor is located at the start of the indirect
  descriptor table, additional indirect descriptors come immediately
  afterwards. The VIRTQ_DESC_F_WRITE flags bit is the only valid flag
  for descriptors in the indirect table. Others are reserved and are
  ignored by the device. Buffer ID is also reserved and is ignored by
  the device.

But the do {} while loop after the "/* Collect all the descriptors */"
comment in virtqueue_pop follows the _F_NEXT flag and desc->next, as if
it were traversing a descriptor chain, and it should ignore them when it
is processing the indirect table. I think that should be fixed, but
maybe we would break some driver? Not Linux's, for sure. Jason, MST,
what do you think? If we should fix the device's code, would you like to
submit a patch, Wafer?
Now, regarding the patch: we need the indirect descriptors to be
contiguous in memory, but my point is that having many different tables
makes them harder to manage. A good first step would be to allocate a
single buffer of the total size and then divide it into the
svq->indirect_bufs[i].
Instead of:
svq->indirect_buf[0].iova = mmap(size) && vhost_dma_map(...) ...
svq->indirect_buf[1].iova = mmap(size) && vhost_dma_map(...) ...
At least:
buf = mmap(size*2) && vhost_dma_map(...) ...
svq->indirect_buf[0].iova = buf
svq->indirect_buf[1].iova = buf + size
That way the translation tree is smaller and QEMU needs to issue fewer
ioctls, fewer memory transactions, etc.
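In vhost_vdpa_svq_init_indirect that could look roughly like this
(untested sketch, error handling omitted, reusing only the calls this
patch already makes):

/* Untested sketch: a single mmap + a single DMA map, sliced per table */
size_t one_size = ROUND_UP(sizeof(vring_desc_t) * svq->vring.num * 4,
                           qemu_real_host_page_size());
size_t total_size = one_size * SVQ_NUM_INDIRECT_BUFS;
DMAMap needle = { .size = total_size - 1, .perm = IOMMU_RO };
vring_desc_t *base = mmap(NULL, total_size, PROT_READ | PROT_WRITE,
                          MAP_SHARED | MAP_ANONYMOUS, -1, 0);

/* one IOVA allocation and one map ioctl cover all the tables */
vhost_iova_tree_map_alloc(v->shared->iova_tree, &needle,
                          (hwaddr)(uintptr_t)base);
vhost_vdpa_dma_map(v->shared, v->address_space_id, needle.iova,
                   total_size, base, true);

for (int j = 0; j < SVQ_NUM_INDIRECT_BUFS; j++) {
    svq->indirect_bufs[j].desc = (void *)((char *)base + j * one_size);
    svq->indirect_bufs[j].iova = needle.iova + j * one_size;
    svq->indirect_bufs[j].size = one_size;
    /* the remaining fields are initialized as in this patch */
}

Freeing then becomes a single unmap + munmap of the whole region instead
of one per SVQIndirectDescBuf.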
Even with that, we lose opportunities to find contiguous free
descriptors by having many SVQIndirectDescBuf instead of a single,
bigger one that can reuse the free holes. Say each SVQIndirectDescBuf is
4 descriptors long, for simplicity. Now the driver makes available one
buffer that spans 3 descriptors (step 1), another one that spans 3
descriptors (step 2), and another buffer that spans 2 descriptors
(step 3).
In this series the SVQ places the first indirect table in the first
SVQIndirectDescBuf and the second table in the second
SVQIndirectDescBuf, and then the third one must resort to chained
descriptors. If we make just one big buffer of size 8, all of them fit
in the indirect table; we just need to mark the entries as free.
Actually, as long as the device does not mark one descriptor as used, we
lose the whole SVQIndirectDescBuf from the moment buf->freed_head
reaches the end.
Some ideas for that:
* Reuse IOVATree to handle the indirect table. It already does a linear
search, but the allocation and free may hurt performance. Still, it
should be able to find contiguous buffers if there are any.
* Use the buffer in a circular way, but store some data in desc_state so
we can coalesce the free entries. I can expand on that if needed.
If we still find that this approach works better, I think we should have
many SVQIndirectDescBuf of queue length, which is the maximum indirect
table length. We can handle them as a free / used stack, so looking for
a free entry would be cheaper, as in the sketch below. What do you think
about that?
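To make the stack idea concrete, here is a rough sketch; the
SVQIndirectStack struct and its helpers are made-up names, not part of
this patch:

/* Hypothetical free stack: indices of the free tables sit on top, so
 * taking and returning a whole queue-length indirect table is O(1). */
typedef struct SVQIndirectStack {
    uint16_t free_idx[SVQ_NUM_INDIRECT_BUFS]; /* indices into indirect_bufs */
    uint16_t n_free;                          /* number of free tables */
} SVQIndirectStack;

/* Take a free table when making buffers available, or -1 if none left */
static int svq_indirect_table_get(SVQIndirectStack *s)
{
    return s->n_free ? s->free_idx[--s->n_free] : -1;
}

/* Return the table once the device marks its buffer as used */
static void svq_indirect_table_put(SVQIndirectStack *s, uint16_t idx)
{
    s->free_idx[s->n_free++] = idx;
}

Because each table is queue-length, a single get() always has room for
any indirect table, so there is no need to search for contiguous free
descriptors at all.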
> > > + /* Cleanup already allocated buffers */
> > > + for (int k = 0; k < j; k++) {
> > > + vhost_vdpa_free_indirect_buf(v, &svq->indirect_bufs[k]);
> > > + }
> > > + return false;
> > > + }
> > > + }
> > > +
> > > + svq->indirect_enabled = true;
> > > + svq->current_indirect_buf = 0;
> > > + return true;
> > > +}
> > > +
> > > +/**
> > > + * Cleanup indirect descriptor buffers for a single SVQ
> > > + *
> > > + * @v: vhost_vdpa instance
> > > + * @svq: Shadow virtqueue to cleanup
> > > + */
> > > +static void vhost_vdpa_svq_cleanup_indirect(struct vhost_vdpa *v,
> > > + VhostShadowVirtqueue *svq)
> > > +{
> > > + for (int j = 0; j < SVQ_NUM_INDIRECT_BUFS; j++) {
> > > + vhost_vdpa_free_indirect_buf(v, &svq->indirect_bufs[j]);
> > > + }
> > > + svq->indirect_enabled = false;
> > > + svq->current_indirect_buf = -1;
> > > +}
> > > +
> > > static void vhost_vdpa_init_svq(struct vhost_dev *hdev, struct vhost_vdpa *v)
> > > {
> > > g_autoptr(GPtrArray) shadow_vqs = NULL;
> > > @@ -791,8 +940,11 @@ static void vhost_vdpa_svq_cleanup(struct vhost_dev *dev)
> > > size_t idx;
> > >
> > > for (idx = 0; idx < v->shadow_vqs->len; ++idx) {
> > > - vhost_svq_stop(g_ptr_array_index(v->shadow_vqs, idx));
> > > + VhostShadowVirtqueue *svq = g_ptr_array_index(v->shadow_vqs, idx);
> > > + vhost_vdpa_svq_cleanup_indirect(v, svq);
> >
> > All usages of vhost_vdpa_svq_cleanup_indirect go with vhost_svq_stop.
> > We can call vhost_vdpa_svq_cleanup_indirect from vhost_svq_stop or even
> > embed it.
> >
>
> Thank you for the suggestion. I will modify this in the next version of the
> patch.
>
> However, I have one question: in
> hw/virtio/vhost-vdpa.c:vhost_vdpa_svq_cleanup,
> why does it only call vhost_svq_stop(svq) without calling
> vhost_vdpa_svq_unmap_rings(dev, svq)?
>
We could call it, but the device is about to be destroyed completely,
either because it has been deleted from QEMU or because QEMU itself is
shutting down. The only action QEMU can take afterwards is to close the
file descriptor, so the device should clean up the maps anyway. But
maybe I'm missing something here.
> >
> > > + vhost_svq_stop(svq);
> > > }
> > > +
> > > g_ptr_array_free(v->shadow_vqs, true);
> > > }
> > >
> > > @@ -1299,6 +1451,13 @@ static bool vhost_vdpa_svqs_start(struct vhost_dev *dev)
> > > error_setg_errno(&err, -r, "Cannot set device address");
> > > goto err_set_addr;
> > > }
> > > +
> > > + /* Initialize indirect descriptor buffers for this SVQ */
> > > + if (!vhost_vdpa_svq_init_indirect(v, svq)) {
> > > + /* Non-fatal: will fallback to chain mode */
> > > + warn_report("Cannot initialize indirect descriptor for SVQ
> > > %u",
> > > + virtio_get_queue_index(vq));
> > > + }
> > > }
> > >
> > > return true;
> > > @@ -1313,6 +1472,7 @@ err:
> > > error_reportf_err(err, "Cannot setup SVQ %u: ", i);
> > > for (unsigned j = 0; j < i; ++j) {
> > > VhostShadowVirtqueue *svq = g_ptr_array_index(v->shadow_vqs, j);
> > > + vhost_vdpa_svq_cleanup_indirect(v, svq);
> > > vhost_vdpa_svq_unmap_rings(dev, svq);
> > > vhost_svq_stop(svq);
> > > }
> > > @@ -1331,6 +1491,7 @@ static void vhost_vdpa_svqs_stop(struct vhost_dev *dev)
> > > for (unsigned i = 0; i < v->shadow_vqs->len; ++i) {
> > > VhostShadowVirtqueue *svq = g_ptr_array_index(v->shadow_vqs, i);
> > >
> > > + vhost_vdpa_svq_cleanup_indirect(v, svq);
> > > vhost_svq_stop(svq);
> > > vhost_vdpa_svq_unmap_rings(dev, svq);
> > >
> > > --
> > > 2.48.1
> > >
>