On Wed, Dec 31, 2025 at 12:01 PM Wafer Xie <[email protected]> wrote:
>
> From: wafer Xie <[email protected]>
>
> Retrieve the target buffer from the indirect buffers by index, add the
> elements sent by the guest to that buffer's indirect descriptors, and
> update freed_head and freed_descs. If freed_descs reaches zero, or if
> the current buffer's freed_descs is less than the number of elements,
> set the buffer state to SVQ_INDIRECT_BUF_FREEING.
>
> If the current indirect buffer does not have enough freed descriptors
> to accommodate the SVQ descriptors,
> descriptors can be borrowed from the next indirect buffer.
>
> Suggested-by: Eugenio Pérez <[email protected]>
> Signed-off-by: wafer Xie <[email protected]>
> ---
>  hw/virtio/vhost-shadow-virtqueue.c | 341 +++++++++++++++++++++++++----
>  1 file changed, 299 insertions(+), 42 deletions(-)
>
> diff --git a/hw/virtio/vhost-shadow-virtqueue.c b/hw/virtio/vhost-shadow-virtqueue.c
> index 4f564f514c..02a238548c 100644
> --- a/hw/virtio/vhost-shadow-virtqueue.c
> +++ b/hw/virtio/vhost-shadow-virtqueue.c
> @@ -139,86 +139,340 @@ static bool vhost_svq_translate_addr(const VhostShadowVirtqueue *svq,
>  }
>
>  /**
> - * Write descriptors to SVQ vring
> + * Add a single descriptor to a descriptor table
> + *
> + * @desc: The descriptor to write to
> + * @addr: IOVA address
> + * @len: Length of the buffer
> + * @flags: Descriptor flags (VRING_DESC_F_WRITE, VRING_DESC_F_NEXT)
> + * @next: Next descriptor index (only used if VRING_DESC_F_NEXT is set)
> + */
> +static void vhost_svq_vring_add_desc(vring_desc_t *desc,
> +                                       hwaddr addr,
> +                                       uint32_t len,
> +                                       uint16_t flags,
> +                                       uint16_t next)
> +{
> +    desc->addr = cpu_to_le64(addr);
> +    desc->len = cpu_to_le32(len);
> +    desc->flags = flags;
> +    if (flags & cpu_to_le16(VRING_DESC_F_NEXT)) {
> +        desc->next = cpu_to_le16(next);
> +    }
> +}
> +
> +/**
> + * Write descriptors to a descriptor table (chain or indirect)
>   *
>   * @svq: The shadow virtqueue
>   * @sg: Cache for hwaddr
>   * @iovec: The iovec from the guest
>   * @num: iovec length
>   * @addr: Descriptors' GPAs, if backed by guest memory
> + * @descs: The descriptor table to write to
> + * @start_idx: Starting index in the descriptor table
> + * @offset_idx: Offset for next field calculation (used for indirect)
>   * @more_descs: True if more descriptors come in the chain
>   * @write: True if they are writeable descriptors
> + * @indirect: True if writing to indirect descriptor table
>   *
> - * Return true if success, false otherwise and print error.
> + * Return the next free descriptor index if success, -1 on error.
>   */
> -static bool vhost_svq_vring_write_descs(VhostShadowVirtqueue *svq, hwaddr *sg,
> -                                        const struct iovec *iovec, size_t num,
> -                                        const hwaddr *addr, bool more_descs,
> -                                        bool write)
> +static int vhost_svq_vring_write_descs(VhostShadowVirtqueue *svq,
> +                                       hwaddr *sg,
> +                                       const struct iovec *iovec,
> +                                       size_t num,
> +                                       const hwaddr *addr,
> +                                       vring_desc_t *descs,
> +                                       uint16_t start_idx,
> +                                       size_t offset_idx,
> +                                       bool more_descs,
> +                                       bool write,
> +                                       bool indirect)
>  {
> -    uint16_t i = svq->free_head, last = svq->free_head;
> -    unsigned n;
> +    uint16_t i = start_idx, last = start_idx;
>      uint16_t flags = write ? cpu_to_le16(VRING_DESC_F_WRITE) : 0;
> -    vring_desc_t *descs = svq->vring.desc;
>      bool ok;
>
>      if (num == 0) {
> -        return true;
> +        return start_idx;
>      }
>
>      ok = vhost_svq_translate_addr(svq, sg, iovec, num, addr);
>      if (unlikely(!ok)) {
> -        return false;
> +        return -1;
>      }
>
> -    for (n = 0; n < num; n++) {
> -        if (more_descs || (n + 1 < num)) {
> -            descs[i].flags = flags | cpu_to_le16(VRING_DESC_F_NEXT);
> -            descs[i].next = cpu_to_le16(svq->desc_next[i]);
> +    for (size_t n = 0; n < num; n++) {
> +        uint16_t desc_flags = flags;
> +        uint16_t next;
> +
> +        if (indirect) {
> +            next = offset_idx + n + 1;
>          } else {
> -            descs[i].flags = flags;
> +            next = svq->desc_next[i];
>          }
> -        descs[i].addr = cpu_to_le64(sg[n]);
> -        descs[i].len = cpu_to_le32(iovec[n].iov_len);
>
> +        if (more_descs || (n + 1 < num)) {
> +            desc_flags |= cpu_to_le16(VRING_DESC_F_NEXT);
> +        }
> +        vhost_svq_vring_add_desc(&descs[i], sg[n],
> +                                   iovec[n].iov_len, desc_flags, next);
>          last = i;
> -        i = svq->desc_next[i];
> +        if (indirect) {
> +            i++;
> +        } else {
> +            i = next;
> +        }
> +    }
> +
> +    /* Return the next free index */
> +    if (!indirect) {
> +        i = svq->desc_next[last];
> +    }
> +    return i;
> +}
> +
> +/**
> + * Add descriptors to the SVQ vring using an indirect descriptor table
> + *
> + * @svq: The shadow virtqueue
> + * @out_sg: The out iovec from the guest
> + * @out_num: The out iovec length
> + * @out_addr: The out descriptors' GPAs
> + * @in_sg: The in iovec from the guest
> + * @in_num: The in iovec length
> + * @in_addr: The in descriptors' GPAs
> + * @sgs: Cache for hwaddr
> + * @buf_idx: Index of the indirect buffer to use
> + *
> + * Return true if success, false otherwise and print error.
> + */
> +static bool vhost_svq_add_split_indirect(VhostShadowVirtqueue *svq,
> +                                         const struct iovec *out_sg,
> +                                         size_t out_num,
> +                                         const hwaddr *out_addr,
> +                                         const struct iovec *in_sg,
> +                                         size_t in_num,
> +                                         const hwaddr *in_addr,
> +                                         hwaddr *sgs, int buf_idx)
> +{
> +    SVQIndirectDescBuf *buf = &svq->indirect.bufs[buf_idx];
> +    uint16_t start_idx = buf->start_idx + buf->freed_head;
> +    size_t total_descs = out_num + in_num;
> +    hwaddr indirect_iova;
> +    int ret;
> +
> +    /* Populate indirect descriptor table for out descriptors */
> +    ret = vhost_svq_vring_write_descs(svq, sgs, out_sg, out_num, out_addr,
> +                                      svq->indirect.desc, start_idx,
> +                                      0, in_num > 0, false, true);
> +    if (unlikely(ret < 0)) {
> +        return false;
> +    }
> +
> +    /* Populate indirect descriptor table for in descriptors */
> +    ret = vhost_svq_vring_write_descs(svq, sgs, in_sg, in_num, in_addr,
> +                                      svq->indirect.desc, start_idx + out_num,
> +                                      out_num, false, true, true);
> +    if (unlikely(ret < 0)) {
> +        return false;
>      }
>
> -    svq->free_head = svq->desc_next[last];
> +    /* Calculate IOVA for this indirect descriptor range */
> +    indirect_iova = svq->indirect.iova + start_idx * sizeof(vring_desc_t);
> +
> +    /* Add a single descriptor pointing to the indirect table */
> +    svq->vring.desc[svq->free_head].addr = cpu_to_le64(indirect_iova);
> +    svq->vring.desc[svq->free_head].len =
> +            cpu_to_le32(total_descs * sizeof(vring_desc_t));
> +    svq->vring.desc[svq->free_head].flags =
> +            cpu_to_le16(VRING_DESC_F_INDIRECT);
> +
> +    /* Store indirect descriptor info in desc_state */
> +    svq->desc_state[svq->free_head].indirect_buf_idx = buf_idx;
> +
> +    /* Update buffer state */
> +    buf->freed_head += total_descs;
> +    buf->freed_descs -= total_descs;
> +
> +    /* Move free_head forward */
> +    svq->free_head = svq->desc_next[svq->free_head];
> +
>      return true;
>  }
>
> -static bool vhost_svq_add_split(VhostShadowVirtqueue *svq,
> +/**
> + * Try to borrow descriptors from the next buffer segment
> + *
> + * @svq: The shadow virtqueue
> + * @buf_idx: Current buffer index
> + * @needed: Number of additional descriptors needed
> + *
> + * Returns true if successfully borrowed, false otherwise.
> + * Note: Last buffer cannot borrow from first buffer (IOVA not contiguous).
> + */
> +static bool vhost_svq_borrow_from_next(VhostShadowVirtqueue *svq,
> +                                       int buf_idx, size_t needed)
> +{
> +    SVQIndirectDescBuf *cur_buf = &svq->indirect.bufs[buf_idx];
> +    SVQIndirectDescBuf *next_buf;
> +    int next_idx;
> +
> +    /* Last buffer cannot borrow from first - IOVA would not be contiguous */
> +    if (buf_idx == SVQ_NUM_INDIRECT_BUFS - 1) {
> +        return false;
> +    }
> +
> +    next_idx = buf_idx + 1;
> +    next_buf = &svq->indirect.bufs[next_idx];
> +
> +    /* Can borrow if next buffer is in FREED state and has freed_head at 0 */
> +    if (next_buf->state != SVQ_INDIRECT_BUF_FREED ||
> +        next_buf->freed_head != 0) {
> +        return false;
> +    }
> +
> +    /* Check if next buffer has enough free descriptors to lend */
> +    if (next_buf->freed_descs < needed) {
> +        return false;
> +    }
> +    /* Borrow descriptors: expand current buffer, shrink next buffer */
> +    cur_buf->num_descs += needed;
> +    cur_buf->borrowed_descs += needed;
> +    cur_buf->freed_descs += needed;
> +
> +    next_buf->start_idx += needed;
> +    next_buf->num_descs -= needed;
> +    next_buf->freed_descs -= needed;
> +

I don't think I understand the algorithm for indirect tables
allocation and freeing. Let me know what I'm missing.

We can have at most SVQ_NUM_INDIRECT_BUFS (4) indirect tables, as the
only information we use to locate an indirect table is
svq->desc_state[used_elem.id].indirect_buf_idx, which indexes
svq->indirect.bufs, an array of SVQ_NUM_INDIRECT_BUFS entries.

Each table could grow up to the maximum number of descriptors allowed,
which is the size of the vring per the VirtIO standard; the function
virtqueue_pop already checks this. But this code never allows more
than 4 indirect tables anyway, even if they are smaller than the vq
size and there is contiguous space left in the mapped buffer for more
indirect tables.

The code allocates a buffer big enough for 8 indirect tables in
vhost_vdpa_svq_init_indirect, but since each table can be at most
svq->vring.num descriptors long and only 4 tables are allowed, half of
that buffer will never be used under any circumstance.

Now let's say the driver allocates 4 indirect tables (IT). The shared
buffer now looks like:

[
  [1st IT],
  [2nd IT],
  [3rd IT],
  [4th IT],
  [free size for at least 4 indirect tables more]
]

Now the device starts freeing indirect descriptors and the dance of
reallocating could start. The condition for borrowing and returning
descriptors may be met at 2*vq_size boundaries. But, by that time, it
is guaranteed that you can allocate an indirect table of vq_num
descriptors at some position N*vq_num, with N between 0 and 7.

Why complicate the code with this dance of resizing
svq->indirect.bufs[j], then? Why not divide the whole mapped buffer
into 8 chunks of vq size and never move them?
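
Just to illustrate, here is a minimal sketch of what I mean, assuming a
hypothetical per-chunk chunk_in_use flag instead of the
SVQIndirectDescBuf bookkeeping of this series (all names made up):

#define SVQ_NUM_INDIRECT_CHUNKS 8

/* Chunk i always covers descriptors [i * vring.num, (i + 1) * vring.num) */
static int vhost_svq_indirect_chunk_get(VhostShadowVirtqueue *svq)
{
    for (int i = 0; i < SVQ_NUM_INDIRECT_CHUNKS; i++) {
        if (!svq->indirect.chunk_in_use[i]) {
            svq->indirect.chunk_in_use[i] = true;
            return i;
        }
    }

    return -1; /* all chunks in flight, fall back to chained descriptors */
}

static void vhost_svq_indirect_chunk_put(VhostShadowVirtqueue *svq, int i)
{
    svq->indirect.chunk_in_use[i] = false;
}

The chunk's IOVA is then simply
svq->indirect.iova + i * svq->vring.num * sizeof(vring_desc_t), any
element fits because it can never have more than vring.num descriptors,
and all the freed_head / freed_descs / borrowed_descs accounting goes
away.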

Though what I would really prefer is to remove the 4 (or any fixed
number) indirect-table limitation altogether and use something like
the IOVA allocator or a buddy allocator to find contiguous space.
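
As a very rough example of that direction (again just a sketch,
assuming a per-descriptor bitmap over the whole mapped indirect area
instead of the fixed bufs[] array), a first-fit search with a helper
like bitmap_find_next_zero_area() would already do:

/* total: number of descriptors backing the whole mapped indirect area */
static int64_t vhost_svq_indirect_alloc(unsigned long *bitmap, size_t total,
                                        size_t ndescs)
{
    unsigned long start;

    start = bitmap_find_next_zero_area(bitmap, total, 0, ndescs, 0);
    if (start >= total) {
        return -1;
    }

    bitmap_set(bitmap, start, ndescs);
    return start; /* descriptor offset into the shared indirect table */
}

static void vhost_svq_indirect_free(unsigned long *bitmap,
                                    unsigned long start, size_t ndescs)
{
    bitmap_clear(bitmap, start, ndescs);
}

desc_state would then store the offset and length instead of a 0..3
buffer index, and the "borrowing" between buffers becomes just what the
allocator does naturally.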

Thanks!

> +    return true;
> +}
> +
> +/**
> + * Try to get a freed indirect buffer for use
> + *
> + * @svq: The shadow virtqueue
> + * @total_descs: Number of descriptors needed
> + *
> + * Returns buffer index (0 to SVQ_NUM_INDIRECT_BUFS-1)
> + * if available, -1 if none available.
> + */
> +static int vhost_svq_get_indirect_buf(VhostShadowVirtqueue *svq,
> +                                      size_t total_descs)
> +{
> +    int cur = svq->indirect.current_buf;
> +    SVQIndirectDescBuf *buf;
> +
> +    if (!svq->indirect.enabled) {
> +        return -1;
> +    }
> +
> +    if (cur < 0) {
> +        cur = 0;
> +    }
> +    /* Start from current or first buffer, try all buffers in order */
> +    for (int i = 0; i < SVQ_NUM_INDIRECT_BUFS; i++) {
> +        int idx = (cur + i) % SVQ_NUM_INDIRECT_BUFS;
> +        buf = &svq->indirect.bufs[idx];
> +
> +        if (buf->state != SVQ_INDIRECT_BUF_FREED) {
> +            continue;
> +        }
> +
> +        /* Check if we have enough free descriptors */
> +        if (buf->freed_descs >= total_descs) {
> +            svq->indirect.current_buf = idx;
> +            return idx;
> +        }
> +
> +        /* Try to borrow from next buffer */
> +        size_t needed = total_descs - buf->freed_descs;
> +        if ((buf->freed_descs > 0) &&
> +            vhost_svq_borrow_from_next(svq, idx, needed)) {
> +            svq->indirect.current_buf = idx + 1;
> +            return idx;
> +        }
> +
> +        /* Not enough free descriptors, mark this buffer as FREEING */
> +        buf->state = SVQ_INDIRECT_BUF_FREEING;
> +    }
> +
> +    /* All buffers unavailable, fallback to chain mode */
> +    return -1;
> +}
> +
> +static int vhost_svq_add_split(VhostShadowVirtqueue *svq,
>                                  const struct iovec *out_sg, size_t out_num,
>                                  const hwaddr *out_addr,
>                                  const struct iovec *in_sg, size_t in_num,
> -                                const hwaddr *in_addr, unsigned *head)
> +                                const hwaddr *in_addr, unsigned *head,
> +                                bool *used_indirect)
>  {
>      unsigned avail_idx;
>      vring_avail_t *avail = svq->vring.avail;
>      bool ok;
> +    int ret;
>      g_autofree hwaddr *sgs = g_new(hwaddr, MAX(out_num, in_num));
> +    size_t total_descs = out_num + in_num;
> +    int indirect_buf_idx = -1;
>
>      *head = svq->free_head;
> +    *used_indirect = false;
>
>      /* We need some descriptors here */
>      if (unlikely(!out_num && !in_num)) {
>          qemu_log_mask(LOG_GUEST_ERROR,
>                        "Guest provided element with no descriptors");
> -        return false;
> +        return -EINVAL;
>      }
>
> -    ok = vhost_svq_vring_write_descs(svq, sgs, out_sg, out_num, out_addr,
> -                                     in_num > 0, false);
> -    if (unlikely(!ok)) {
> -        return false;
> +    /* Try to use indirect descriptors if feature is negotiated and total > 1 */
> +    if (virtio_vdev_has_feature(svq->vdev, VIRTIO_RING_F_INDIRECT_DESC) &&
> +        total_descs > 1) {
> +        indirect_buf_idx = vhost_svq_get_indirect_buf(svq, total_descs);
>      }
>
> -    ok = vhost_svq_vring_write_descs(svq, sgs, in_sg, in_num, in_addr, false,
> -                                     true);
> -    if (unlikely(!ok)) {
> -        return false;
> +    if (indirect_buf_idx >= 0) {
> +        /* Indirect mode: only need 1 main descriptor slot */
> +        if (unlikely(vhost_svq_available_slots(svq) < 1)) {
> +            return -ENOSPC;
> +        }
> +
> +        /* Use indirect mode */
> +        ok = vhost_svq_add_split_indirect(svq, out_sg, out_num, out_addr,
> +                                          in_sg, in_num, in_addr,
> +                                          sgs, indirect_buf_idx);
> +        if (unlikely(!ok)) {
> +            error_report("indirect error, out_num %zu in_num %zu "
> +                         "avail index %u head %u",
> +                         out_num, in_num, svq->shadow_avail_idx, *head);
> +            return -EINVAL;
> +        }
> +        *used_indirect = true;
> +    } else {
> +        /* Chain mode: need total_descs descriptor slots */
> +        if (unlikely(vhost_svq_available_slots(svq) < total_descs)) {
> +            return -ENOSPC;
> +        }
> +
> +        /* Use direct (chain) mode */
> +        svq->desc_state[svq->free_head].indirect_buf_idx = -1;
> +
> +        ret = vhost_svq_vring_write_descs(svq, sgs, out_sg, out_num, out_addr,
> +                                          svq->vring.desc, svq->free_head, 0,
> +                                          in_num > 0, false, false);
> +        if (unlikely(ret < 0)) {
> +            return -EINVAL;
> +        }
> +        svq->free_head = ret;
> +
> +        ret = vhost_svq_vring_write_descs(svq, sgs, in_sg, in_num, in_addr,
> +                                          svq->vring.desc, svq->free_head, 0,
> +                                          false, true, false);
> +        if (unlikely(ret < 0)) {
> +            return -EINVAL;
> +        }
> +        svq->free_head = ret;
>      }
>
>      /*
> @@ -233,7 +487,7 @@ static bool vhost_svq_add_split(VhostShadowVirtqueue *svq,
>      smp_wmb();
>      avail->idx = cpu_to_le16(svq->shadow_avail_idx);
>
> -    return true;
> +    return 0;
>  }
>
>  static void vhost_svq_kick(VhostShadowVirtqueue *svq)
> @@ -249,7 +503,8 @@ static void vhost_svq_kick(VhostShadowVirtqueue *svq)
>      if (virtio_vdev_has_feature(svq->vdev, VIRTIO_RING_F_EVENT_IDX)) {
>          uint16_t avail_event = le16_to_cpu(
>                  *(uint16_t *)(&svq->vring.used->ring[svq->vring.num]));
> -        needs_kick = vring_need_event(avail_event, svq->shadow_avail_idx, svq->shadow_avail_idx - 1);
> +        needs_kick = vring_need_event(avail_event, svq->shadow_avail_idx,
> +                                      svq->shadow_avail_idx - 1);
>      } else {
>          needs_kick =
>                  !(svq->vring.used->flags & cpu_to_le16(VRING_USED_F_NO_NOTIFY));
> @@ -274,19 +529,21 @@ int vhost_svq_add(VhostShadowVirtqueue *svq, const struct iovec *out_sg,
>  {
>      unsigned qemu_head;
>      unsigned ndescs = in_num + out_num;
> -    bool ok;
> +    int r;
> +    bool used_indirect = false;
>
> -    if (unlikely(ndescs > vhost_svq_available_slots(svq))) {
> -        return -ENOSPC;
> +    r = vhost_svq_add_split(svq, out_sg, out_num, out_addr, in_sg, in_num,
> +                             in_addr, &qemu_head, &used_indirect);
> +    if (unlikely(r != 0)) {
> +        return r;
>      }
>
> -    ok = vhost_svq_add_split(svq, out_sg, out_num, out_addr, in_sg, in_num,
> -                             in_addr, &qemu_head);
> -    if (unlikely(!ok)) {
> -        return -EINVAL;
> +    /* If using indirect, only 1 main descriptor is used; otherwise ndescs */
> +    if (used_indirect) {
> +        svq->num_free -= 1;
> +    } else {
> +        svq->num_free -= ndescs;
>      }
> -
> -    svq->num_free -= ndescs;
>      svq->desc_state[qemu_head].elem = elem;
>      svq->desc_state[qemu_head].ndescs = ndescs;
>      vhost_svq_kick(svq);
> --
> 2.34.1
>

