> -----Original Message-----
> From: Eugenio Perez Martin <[email protected]>
> Sent: December 19, 2025 16:12
> To: Wafer <[email protected]>
> Cc: [email protected]; [email protected]; [email protected];
> [email protected]; [email protected]; Angus Chen
> <[email protected]>
> Subject: Re: [PATCH v3 4/4] vhost: SVQ add the indirect descriptors to
> available ring
>
> On Tue, Dec 16, 2025 at 2:59 AM Wafer Xie <[email protected]> wrote:
> >
> > From: wafer Xie <[email protected]>
> >
> > Look up the target buffer in the indirect buffer array by index, write
> > the elements sent by the guest into that buffer's indirect descriptor
> > table, and update freed_head and freed_descs. If freed_descs reaches
> > zero, or the current buffer's freed_descs is less than the number of
> > elements, switch the buffer state to SVQ_INDIRECT_BUF_FREEING.
> >
> > Signed-off-by: wafer Xie <[email protected]>
> > ---
> >  hw/virtio/vhost-shadow-virtqueue.c | 257 ++++++++++++++++++++++++++---
> >  1 file changed, 235 insertions(+), 22 deletions(-)
> >
> > diff --git a/hw/virtio/vhost-shadow-virtqueue.c b/hw/virtio/vhost-shadow-virtqueue.c
> > index adee52f50b..c6064fb839 100644
> > --- a/hw/virtio/vhost-shadow-virtqueue.c
> > +++ b/hw/virtio/vhost-shadow-virtqueue.c
> > @@ -189,36 +189,246 @@ static bool vhost_svq_vring_write_descs(VhostShadowVirtqueue *svq, hwaddr *sg,
> > return true;
> > }
> >
> > -static bool vhost_svq_add_split(VhostShadowVirtqueue *svq,
> > +/**
> > + * Write descriptors to indirect descriptor table
> > + *
> > + * @svq: The shadow virtqueue
> > + * @sg: Cache for hwaddr
> > + * @iovec: The iovec from the guest
> > + * @num: iovec length
> > + * @addr: Descriptors' GPAs, if backed by guest memory
> > + * @buf: The indirect descriptor buffer
> > + * @offset_idx: Offset for write position and next field
> > + *              (0 for out, out_num for in)
> > + * @more_descs: True if more descriptors come in the chain
> > + * @write: True if they are writeable descriptors
> > + *
> > + * Return true if success, false otherwise and print error.
> > + */
> > +static bool vhost_svq_vring_write_indirect_descs(VhostShadowVirtqueue *svq,
> > +                                                  hwaddr *sg,
> > +                                                  const struct iovec *iovec,
> > +                                                  size_t num,
> > +                                                  const hwaddr *addr,
> > +                                                  SVQIndirectDescBuf *buf,
> > +                                                  size_t offset_idx,
> > +                                                  bool more_descs,
> > +                                                  bool write)
> > +{
> > + bool ok;
> > + uint16_t flags = write ? cpu_to_le16(VRING_DESC_F_WRITE) : 0;
> > + vring_desc_t *descs = buf->desc;
> > + uint16_t i = buf->freed_head + offset_idx;
> > +
> > + if (num == 0) {
> > + return true;
> > + }
> > +
> > + ok = vhost_svq_translate_addr(svq, sg, iovec, num, addr);
> > + if (unlikely(!ok)) {
> > + return false;
> > + }
> > +
> > + for (size_t n = 0; n < num; n++) {
> > + descs[i].addr = cpu_to_le64(sg[n]);
> > + descs[i].len = cpu_to_le32(iovec[n].iov_len);
> > + if (more_descs || (n + 1 < num)) {
> > + descs[i].flags = flags | cpu_to_le16(VRING_DESC_F_NEXT);
> > + descs[i].next = cpu_to_le16(offset_idx + n + 1);
> > + } else {
> > + descs[i].flags = flags;
> > + }
> > + i++;
> > + }
> > +
> > + return true;
>
> Most of this function is the same as current vhost_svq_vring_write_descs in
> master. Why not make a common one that accepts vring_desc_t * as a
> parameter and make the two of them call it?
>
Thank you for the suggestion.
I will improve this by following the approach used in
linux/drivers/virtio/virtio_ring.c, function virtqueue_add_desc_split, and
refactor the code accordingly.
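
Roughly like the following untested sketch; the helper name, parameter
list, and the desc_next[] endianness handling are all provisional:

static uint16_t vhost_svq_add_desc_split(VhostShadowVirtqueue *svq,
                                         vring_desc_t *descs, uint16_t i,
                                         hwaddr addr, uint32_t len,
                                         uint16_t flags, bool indirect)
{
    /*
     * Write one descriptor into @descs and return the index of the next
     * free slot. @flags is host-order and converted exactly once here.
     */
    descs[i].addr = cpu_to_le64(addr);
    descs[i].len = cpu_to_le32(len);
    descs[i].flags = cpu_to_le16(flags);

    if (!(flags & VRING_DESC_F_NEXT)) {
        return i;
    }

    /*
     * An indirect table is laid out sequentially, while the main vring
     * chains through the desc_next[] free list (assumed host-order here).
     */
    if (indirect) {
        descs[i].next = cpu_to_le16(i + 1);
        return i + 1;
    }
    descs[i].next = cpu_to_le16(svq->desc_next[i]);
    return svq->desc_next[i];
}

Both vhost_svq_vring_write_descs() and the indirect variant would then
loop over this helper, passing svq->vring.desc or buf->desc respectively.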
> > +}
> > +
> > +/**
> > + * Add descriptors to SVQ vring using indirect descriptors (dual-buffer)
> > + *
> > + * @svq: The shadow virtqueue
> > + * @out_sg: The out iovec from the guest
> > + * @out_num: The out iovec length
> > + * @out_addr: The out descriptors' GPAs
> > + * @in_sg: The in iovec from the guest
> > + * @in_num: The in iovec length
> > + * @in_addr: The in descriptors' GPAs
> > + * @sgs: Cache for hwaddr
> > + * @buf_idx: Index of the indirect buffer to use
> > + *
> > + * Return true if success, false otherwise and print error.
> > + */
> > +static bool vhost_svq_add_split_indirect(VhostShadowVirtqueue *svq,
> > + const struct iovec *out_sg,
> > + size_t out_num,
> > + const hwaddr *out_addr,
> > + const struct iovec *in_sg,
> > + size_t in_num,
> > + const hwaddr *in_addr,
> > +                                          hwaddr *sgs, int buf_idx)
> > +{
> > + SVQIndirectDescBuf *buf = &svq->indirect_bufs[buf_idx];
> > + uint16_t start_idx = buf->freed_head;
> > + size_t total_descs = out_num + in_num;
> > + hwaddr indirect_iova;
> > + bool ok;
> > +
> > + /* Populate indirect descriptor table for out descriptors */
> > + ok = vhost_svq_vring_write_indirect_descs(svq, sgs, out_sg, out_num,
> > + out_addr, buf, 0,
> > + in_num > 0, false);
> > + if (unlikely(!ok)) {
> > + return false;
> > + }
> > +
> > + /* Populate indirect descriptor table for in descriptors */
> > + ok = vhost_svq_vring_write_indirect_descs(svq, sgs, in_sg, in_num,
> > + in_addr, buf, out_num,
> > + false, true);
> > + if (unlikely(!ok)) {
> > + return false;
> > + }
> > +
> > + /* Calculate IOVA for this indirect descriptor range */
> > + indirect_iova = buf->iova + start_idx * sizeof(vring_desc_t);
> > +
> > + /* Add a single descriptor pointing to the indirect table */
> > + svq->vring.desc[svq->free_head].addr = cpu_to_le64(indirect_iova);
> > + svq->vring.desc[svq->free_head].len =
> > + cpu_to_le32(total_descs * sizeof(vring_desc_t));
> > + svq->vring.desc[svq->free_head].flags =
> > + cpu_to_le16(VRING_DESC_F_INDIRECT);
> > +
> > + /* Store indirect descriptor info in desc_state */
> > + svq->desc_state[svq->free_head].indirect_buf_idx = buf_idx;
> > +
> > + /* Update buffer state */
> > + buf->freed_head = start_idx + total_descs;
> > + buf->freed_descs -= total_descs;
> > +
> > + /* Move free_head forward */
> > + svq->free_head = svq->desc_next[svq->free_head];
> > +
> > + return true;
>
> Same here, most logic is duplicated in vhost_svq_add_split even when
> vhost_svq_add_split calls vhost_svq_add_split_indirect.
>
Thank you for the suggestion. I will modify this in the next version of the
patch.
> > +}
> > +
> > +/**
> > + * Try to get a freed indirect buffer for use
> > + *
> > + * @svq: The shadow virtqueue
> > + * @total_descs: Number of descriptors needed
> > + *
> > + * Returns buffer index (0 to SVQ_NUM_INDIRECT_BUFS-1)
> > + * if available, -1 if none available.
> > + */
> > +static int vhost_svq_get_indirect_buf(VhostShadowVirtqueue *svq,
> > +                                       size_t total_descs)
> > +{
> > + int cur = svq->current_indirect_buf;
> > + SVQIndirectDescBuf *buf;
> > +
> > + if (!svq->indirect_enabled) {
> > + return -1;
> > + }
> > +
> > + /* Try current buffer first if it's in FREED state */
> > + if (cur >= 0) {
> > + buf = &svq->indirect_bufs[cur];
> > + if (buf->state == SVQ_INDIRECT_BUF_FREED) {
> > + /* Check if we have enough free descriptors */
> > + if (buf->freed_descs >= total_descs) {
> > + return cur;
> > + }
> > + /* Not enough space, switch to FREEING and try next buffer */
> > + buf->state = SVQ_INDIRECT_BUF_FREEING;
> > + }
> > + }
> > +
> > + /* Try all other buffers */
> > + for (int i = 0; i < SVQ_NUM_INDIRECT_BUFS; i++) {
> > + if (i == cur) {
> > + continue;
> > + }
> > + buf = &svq->indirect_bufs[i];
> > + if (buf->state == SVQ_INDIRECT_BUF_FREED &&
> > + buf->freed_descs >= total_descs) {
> > + svq->current_indirect_buf = i;
> > + return i;
> > + }
> > + }
> > +
>
> Why not store the free indirect descriptors as a linked list instead of a
> circular buffer, the same way as regular descriptors? That way we don't
> need to traverse the whole circular buffer. Also, we reduce the complexity
> as you don't need to keep the state in each descriptor or the
> SVQ_INDIRECT_BUF_FREEING state.
>
For indirect descriptors, the virtio specification requires the descriptor
table to be contiguous in memory: the device reads it through the single
(addr, len) pair of the main-ring descriptor. If the free slots were kept
in a linked list, they would be scattered across the buffer and would have
to be collected and compacted into a contiguous range each time an indirect
table is built, which adds extra complexity on the insertion path.
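
That is why the patch hands out slots with a simple bump allocator: the
total_descs slots for one element are adjacent by construction, so the main
ring can describe them with a single (iova, len) pair. Restating the
patch's own arithmetic (indirect_len is just an illustrative name):

    indirect_iova = buf->iova + buf->freed_head * sizeof(vring_desc_t);
    indirect_len  = total_descs * sizeof(vring_desc_t);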
>
> > + /* All buffers unavailable, fallback to chain mode */
> > + return -1;
> > +}
> > +
> > +static int vhost_svq_add_split(VhostShadowVirtqueue *svq,
> > const struct iovec *out_sg, size_t out_num,
> > const hwaddr *out_addr,
> > const struct iovec *in_sg, size_t in_num,
> > - const hwaddr *in_addr, unsigned *head)
> > + const hwaddr *in_addr, unsigned *head,
> > + bool *used_indirect)
> > {
> > unsigned avail_idx;
> > vring_avail_t *avail = svq->vring.avail;
> > bool ok;
> > g_autofree hwaddr *sgs = g_new(hwaddr, MAX(out_num, in_num));
> > + size_t total_descs = out_num + in_num;
> > + int indirect_buf_idx = -1;
> >
> > *head = svq->free_head;
> > + *used_indirect = false;
> >
> > /* We need some descriptors here */
> > if (unlikely(!out_num && !in_num)) {
> > qemu_log_mask(LOG_GUEST_ERROR,
> > "Guest provided element with no descriptors");
> > - return false;
> > + return -EINVAL;
> > }
> >
> > - ok = vhost_svq_vring_write_descs(svq, sgs, out_sg, out_num, out_addr,
> > - in_num > 0, false);
> > - if (unlikely(!ok)) {
> > - return false;
> > +    /* Use indirect descriptors if negotiated and total_descs > 1 */
> > +    if (virtio_vdev_has_feature(svq->vdev, VIRTIO_RING_F_INDIRECT_DESC) &&
> > +        total_descs > 1) {
> > +        indirect_buf_idx = vhost_svq_get_indirect_buf(svq, total_descs);
> > }
> >
> > -    ok = vhost_svq_vring_write_descs(svq, sgs, in_sg, in_num, in_addr, false,
> > -                                     true);
> > - if (unlikely(!ok)) {
> > - return false;
> > + if (indirect_buf_idx >= 0) {
> > + /* Indirect mode: only need 1 main descriptor slot */
> > + if (unlikely(vhost_svq_available_slots(svq) < 1)) {
> > + return -ENOSPC;
> > + }
> > +
> > + /* Use indirect mode */
> > + ok = vhost_svq_add_split_indirect(svq, out_sg, out_num, out_addr,
> > + in_sg, in_num, in_addr,
> > + sgs, indirect_buf_idx);
> > + if (unlikely(!ok)) {
> > + error_report("indirect error, out_num %zu in_num %zu "
> > + "avail index %u head %u",
> > + out_num, in_num, svq->shadow_avail_idx, *head);
> > + return -EINVAL;
> > + }
> > + *used_indirect = true;
> > + } else {
> > + /* Chain mode: need total_descs descriptor slots */
> > + if (unlikely(vhost_svq_available_slots(svq) < total_descs)) {
> > + return -ENOSPC;
> > + }
> > +
> > + /* Use direct (chain) mode */
> > + svq->desc_state[svq->free_head].indirect_buf_idx = -1;
> > +
> > +        ok = vhost_svq_vring_write_descs(svq, sgs, out_sg, out_num, out_addr,
> > +                                         in_num > 0, false);
> > + if (unlikely(!ok)) {
> > + return -EINVAL;
> > + }
> > +
> > +        ok = vhost_svq_vring_write_descs(svq, sgs, in_sg, in_num, in_addr,
> > +                                         false, true);
> > + if (unlikely(!ok)) {
> > + return -EINVAL;
> > + }
> > }
> >
> > /*
> > @@ -233,7 +443,7 @@ static bool vhost_svq_add_split(VhostShadowVirtqueue *svq,
> > smp_wmb();
> > avail->idx = cpu_to_le16(svq->shadow_avail_idx);
> >
> > - return true;
> > + return 0;
> > }
> >
> >  static void vhost_svq_kick(VhostShadowVirtqueue *svq)
> > @@ -249,7 +459,8 @@ static void vhost_svq_kick(VhostShadowVirtqueue *svq)
> > if (virtio_vdev_has_feature(svq->vdev, VIRTIO_RING_F_EVENT_IDX)) {
> > uint16_t avail_event = le16_to_cpu(
> > *(uint16_t *)(&svq->vring.used->ring[svq->vring.num]));
> > -        needs_kick = vring_need_event(avail_event, svq->shadow_avail_idx, svq->shadow_avail_idx - 1);
> > +        needs_kick = vring_need_event(avail_event, svq->shadow_avail_idx,
> > +                                      svq->shadow_avail_idx - 1);
> > } else {
> >         needs_kick =
> >             !(svq->vring.used->flags & cpu_to_le16(VRING_USED_F_NO_NOTIFY));
> > @@ -274,19 +485,21 @@ int vhost_svq_add(VhostShadowVirtqueue *svq, const struct iovec *out_sg,
> >  {
> > unsigned qemu_head;
> > unsigned ndescs = in_num + out_num;
> > - bool ok;
> > + int r;
> > + bool used_indirect = false;
> >
> > - if (unlikely(ndescs > vhost_svq_available_slots(svq))) {
> > - return -ENOSPC;
> > +    r = vhost_svq_add_split(svq, out_sg, out_num, out_addr, in_sg, in_num,
> > +                            in_addr, &qemu_head, &used_indirect);
> > + if (unlikely(r != 0)) {
> > + return r;
> > }
> >
> > -    ok = vhost_svq_add_split(svq, out_sg, out_num, out_addr, in_sg, in_num,
> > -                             in_addr, &qemu_head);
> > - if (unlikely(!ok)) {
> > - return -EINVAL;
> > +    /* If using indirect, only 1 main descriptor is used; otherwise ndescs */
> > + if (used_indirect) {
> > + svq->num_free -= 1;
> > + } else {
> > + svq->num_free -= ndescs;
> > }
> > -
> > - svq->num_free -= ndescs;
> > svq->desc_state[qemu_head].elem = elem;
> > svq->desc_state[qemu_head].ndescs = ndescs;
> > vhost_svq_kick(svq);
> > --
> > 2.48.1
> >