On Wed, Dec 31, 2025 at 12:01 PM Wafer Xie <[email protected]> wrote:
>
> From: wafer Xie <[email protected]>
>
> Retrieve the target buffer from the indirect buffers by index, write
> the descriptors of the element sent by the guest into that buffer's
> indirect descriptor table, and update freed_head and freed_descs. If
> freed_descs is zero, or if the current buffer's freed_descs is less
> than the number of descriptors needed, update the buffer state to
> SVQ_INDIRECT_BUF_FREEING.
>
> If the current indirect buffer does not have enough freed descriptors
> to accommodate the SVQ descriptors,
> descriptors can be borrowed from the next indirect buffer.
>
> Suggested-by: Eugenio Pérez <[email protected]>
> Signed-off-by: wafer Xie <[email protected]>
> ---
> hw/virtio/vhost-shadow-virtqueue.c | 341 +++++++++++++++++++++++++----
> 1 file changed, 299 insertions(+), 42 deletions(-)
>
> diff --git a/hw/virtio/vhost-shadow-virtqueue.c b/hw/virtio/vhost-shadow-virtqueue.c
> index 4f564f514c..02a238548c 100644
> --- a/hw/virtio/vhost-shadow-virtqueue.c
> +++ b/hw/virtio/vhost-shadow-virtqueue.c
> @@ -139,86 +139,340 @@ static bool vhost_svq_translate_addr(const VhostShadowVirtqueue *svq,
> }
>
> /**
> - * Write descriptors to SVQ vring
> + * Add a single descriptor to a descriptor table
> + *
> + * @desc: The descriptor to write to
> + * @addr: IOVA address
> + * @len: Length of the buffer
> + * @flags: Descriptor flags (VRING_DESC_F_WRITE, VRING_DESC_F_NEXT)
> + * @next: Next descriptor index (only used if VRING_DESC_F_NEXT is set)
> + */
> +static void vhost_svq_vring_add_desc(vring_desc_t *desc,
> + hwaddr addr,
> + uint32_t len,
> + uint16_t flags,
> + uint16_t next)
> +{
> + desc->addr = cpu_to_le64(addr);
> + desc->len = cpu_to_le32(len);
> + desc->flags = flags;
> + if (flags & cpu_to_le16(VRING_DESC_F_NEXT)) {
> + desc->next = cpu_to_le16(next);
> + }
> +}
> +
> +/**
> + * Write descriptors to a descriptor table (chain or indirect)
> *
> * @svq: The shadow virtqueue
> * @sg: Cache for hwaddr
> * @iovec: The iovec from the guest
> * @num: iovec length
> * @addr: Descriptors' GPAs, if backed by guest memory
> + * @descs: The descriptor table to write to
> + * @start_idx: Starting index in the descriptor table
> + * @offset_idx: Offset for next field calculation (used for indirect)
> * @more_descs: True if more descriptors come in the chain
> * @write: True if they are writeable descriptors
> + * @indirect: True if writing to indirect descriptor table
> *
> - * Return true if success, false otherwise and print error.
> + * Return the next free descriptor index if success, -1 on error.
> */
> -static bool vhost_svq_vring_write_descs(VhostShadowVirtqueue *svq, hwaddr *sg,
> -                                        const struct iovec *iovec, size_t num,
> - const hwaddr *addr, bool more_descs,
> - bool write)
> +static int vhost_svq_vring_write_descs(VhostShadowVirtqueue *svq,
> + hwaddr *sg,
> + const struct iovec *iovec,
> + size_t num,
> + const hwaddr *addr,
> + vring_desc_t *descs,
> + uint16_t start_idx,
> + size_t offset_idx,
> + bool more_descs,
> + bool write,
> + bool indirect)
> {
> - uint16_t i = svq->free_head, last = svq->free_head;
> - unsigned n;
> + uint16_t i = start_idx, last = start_idx;
> uint16_t flags = write ? cpu_to_le16(VRING_DESC_F_WRITE) : 0;
> - vring_desc_t *descs = svq->vring.desc;
> bool ok;
>
> if (num == 0) {
> - return true;
> + return start_idx;
> }
>
> ok = vhost_svq_translate_addr(svq, sg, iovec, num, addr);
> if (unlikely(!ok)) {
> - return false;
> + return -1;
> }
>
> - for (n = 0; n < num; n++) {
> - if (more_descs || (n + 1 < num)) {
> - descs[i].flags = flags | cpu_to_le16(VRING_DESC_F_NEXT);
> - descs[i].next = cpu_to_le16(svq->desc_next[i]);
> + for (size_t n = 0; n < num; n++) {
> + uint16_t desc_flags = flags;
> + uint16_t next;
> +
> + if (indirect) {
> + next = offset_idx + n + 1;
> } else {
> - descs[i].flags = flags;
> + next = svq->desc_next[i];
> }
> - descs[i].addr = cpu_to_le64(sg[n]);
> - descs[i].len = cpu_to_le32(iovec[n].iov_len);
>
> + if (more_descs || (n + 1 < num)) {
> + desc_flags |= cpu_to_le16(VRING_DESC_F_NEXT);
> + }
> + vhost_svq_vring_add_desc(&descs[i], sg[n],
> + iovec[n].iov_len, desc_flags, next);
> last = i;
> - i = svq->desc_next[i];
> + if (indirect) {
> + i++;
> + } else {
> + i = next;
> + }
> + }
> +
> + /* Return the next free index */
> + if (!indirect) {
> + i = svq->desc_next[last];
> + }
> + return i;
> +}
> +
> +/**
> + * Add descriptors to SVQ vring using indirect descriptors (dual-buffer)
> + *
> + * @svq: The shadow virtqueue
> + * @out_sg: The out iovec from the guest
> + * @out_num: The out iovec length
> + * @out_addr: The out descriptors' GPAs
> + * @in_sg: The in iovec from the guest
> + * @in_num: The in iovec length
> + * @in_addr: The in descriptors' GPAs
> + * @sgs: Cache for hwaddr
> + * @buf_idx: Index of the indirect buffer to use
> + *
> + * Return true if success, false otherwise and print error.
> + */
> +static bool vhost_svq_add_split_indirect(VhostShadowVirtqueue *svq,
> + const struct iovec *out_sg,
> + size_t out_num,
> + const hwaddr *out_addr,
> + const struct iovec *in_sg,
> + size_t in_num,
> + const hwaddr *in_addr,
> + hwaddr *sgs, int buf_idx)
> +{
> + SVQIndirectDescBuf *buf = &svq->indirect.bufs[buf_idx];
> + uint16_t start_idx = buf->start_idx + buf->freed_head;
> + size_t total_descs = out_num + in_num;
> + hwaddr indirect_iova;
> + int ret;
> +
> + /* Populate indirect descriptor table for out descriptors */
> + ret = vhost_svq_vring_write_descs(svq, sgs, out_sg, out_num, out_addr,
> + svq->indirect.desc, start_idx,
> + 0, in_num > 0, false, true);
> + if (unlikely(ret < 0)) {
> + return false;
> + }
> +
> + /* Populate indirect descriptor table for in descriptors */
> + ret = vhost_svq_vring_write_descs(svq, sgs, in_sg, in_num, in_addr,
> +                                      svq->indirect.desc, start_idx + out_num,
> + out_num, false, true, true);
> + if (unlikely(ret < 0)) {
> + return false;
> }
>
> - svq->free_head = svq->desc_next[last];
> + /* Calculate IOVA for this indirect descriptor range */
> + indirect_iova = svq->indirect.iova + start_idx * sizeof(vring_desc_t);
> +
> + /* Add a single descriptor pointing to the indirect table */
> + svq->vring.desc[svq->free_head].addr = cpu_to_le64(indirect_iova);
> + svq->vring.desc[svq->free_head].len =
> + cpu_to_le32(total_descs * sizeof(vring_desc_t));
> +    svq->vring.desc[svq->free_head].flags = cpu_to_le16(VRING_DESC_F_INDIRECT);
> +
> + /* Store indirect descriptor info in desc_state */
> + svq->desc_state[svq->free_head].indirect_buf_idx = buf_idx;
> +
> + /* Update buffer state */
> + buf->freed_head += total_descs;
> + buf->freed_descs -= total_descs;
> +
> + /* Move free_head forward */
> + svq->free_head = svq->desc_next[svq->free_head];
> +
> +    return true;
> }
>
> -static bool vhost_svq_add_split(VhostShadowVirtqueue *svq,
> +/**
> + * Try to borrow descriptors from the next buffer segment
> + *
> + * @svq: The shadow virtqueue
> + * @buf_idx: Current buffer index
> + * @needed: Number of additional descriptors needed
> + *
> + * Returns true if successfully borrowed, false otherwise.
> + * Note: Last buffer cannot borrow from first buffer (IOVA not contiguous).
> + */
> +static bool vhost_svq_borrow_from_next(VhostShadowVirtqueue *svq,
> + int buf_idx, size_t needed)
> +{
> + SVQIndirectDescBuf *cur_buf = &svq->indirect.bufs[buf_idx];
> + SVQIndirectDescBuf *next_buf;
> + int next_idx;
> +
> + /* Last buffer cannot borrow from first - IOVA would not be contiguous */
> + if (buf_idx == SVQ_NUM_INDIRECT_BUFS - 1) {
> + return false;
> + }
> +
> + next_idx = buf_idx + 1;
> + next_buf = &svq->indirect.bufs[next_idx];
> +
> + /* Can borrow if next buffer is in FREED state and has freed_head at 0 */
> + if (next_buf->state != SVQ_INDIRECT_BUF_FREED ||
> + next_buf->freed_head != 0) {
> + return false;
> + }
> +
> + /* Check if next buffer has enough free descriptors to lend */
> + if (next_buf->freed_descs < needed) {
> + return false;
> + }
> + /* Borrow descriptors: expand current buffer, shrink next buffer */
> + cur_buf->num_descs += needed;
> + cur_buf->borrowed_descs += needed;
> + cur_buf->freed_descs += needed;
> +
> + next_buf->start_idx += needed;
> + next_buf->num_descs -= needed;
> + next_buf->freed_descs -= needed;
> +

I don't think I understand the algorithm for indirect table allocation
and freeing. Let me know what I'm missing.

We can have at most SVQ_NUM_INDIRECT_BUFS (4) indirect tables, because
the only information we keep to locate an indirect table is
svq->desc_state[used_elem.id].indirect_buf_idx, which indexes
svq->indirect.bufs, whose size is SVQ_NUM_INDIRECT_BUFS.
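
For reference, this is the lookup I mean on the completion path (just a
sketch to show why the table count is bounded, not code from the
patch):

/* Sketch: a used element points back to at most one of the 4 tables;
 * indirect_buf_idx is -1 for elements added as a plain chain. */
int idx = svq->desc_state[used_elem.id].indirect_buf_idx;
if (idx >= 0) {
    SVQIndirectDescBuf *buf = &svq->indirect.bufs[idx];
    /* ... return buf's descriptors, idx < SVQ_NUM_INDIRECT_BUFS ... */
}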

Each of these tables could grow up to the maximum number of descriptors
allowed, which per the VirtIO standard is the size of the vring, and
virtqueue_pop already checks that limit. But this code does not allow
more than 4 indirect tables anyway, even if they are smaller than the
vq size and there is contiguous space left in the mapped buffer for
more indirect tables.

The code allocates a buffer big enough for 8 indirect tables in
vhost_vdpa_svq_init_indirect, but since each table can only be
svq->vring.num descriptors big and at most 4 tables can exist, half of
that buffer will never be allocated under any circumstance.

Now let's say the driver allocates 4 indirect tables (IT). The shared
buffer now looks like:
[
[1st IT],
[2nd IT],
[3rd IT],
[4th IT],
[free size for at least 4 indirect tables more]
]

Now the device starts freeing indirect descriptors and the dance of
reallocating can start. The condition for borrowing and returning
descriptors may be met at 2*vq_size boundaries. But, by that time, it
is guaranteed that an indirect table of vq_num descriptors can be
allocated at some position N*vq_num, with N between 0 and 7. Why
complicate the code with this dance of resizing svq->indirect.bufs[j],
then? Why not divide the whole mapped buffer into 8 chunks of vq size
and never move them?
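
Something along these lines would be enough (a rough sketch only; the
chunk_busy flags are made up, and 2 * SVQ_NUM_INDIRECT_BUFS stands for
the 8 chunks the mapped area already has room for):

/* Fixed chunks of vring.num descriptors, never resized or moved */
static hwaddr vhost_svq_chunk_iova(const VhostShadowVirtqueue *svq, int chunk)
{
    return svq->indirect.iova +
           (hwaddr)chunk * svq->vring.num * sizeof(vring_desc_t);
}

static int vhost_svq_get_free_chunk(VhostShadowVirtqueue *svq)
{
    for (int i = 0; i < 2 * SVQ_NUM_INDIRECT_BUFS; i++) {
        if (!svq->indirect.chunk_busy[i]) { /* hypothetical in-use flag */
            svq->indirect.chunk_busy[i] = true;
            return i;
        }
    }
    return -1; /* no chunk free: fall back to a descriptor chain */
}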

I would prefer to remove the limitation of 4 (or any fixed number of)
indirect tables and use something like the IOVA allocator or a buddy
allocator to find contiguous space, though.
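
In other words, an interface more or less like this (names invented),
where each element just reserves the contiguous run of descriptors it
needs from the mapped area and gives it back when the element is used:

/* Sketch only: offset is in descriptors into the mapped indirect area,
 * negative errno if no contiguous run of n_descs is available. */
int svq_indirect_alloc(VhostShadowVirtqueue *svq, size_t n_descs);
void svq_indirect_free(VhostShadowVirtqueue *svq, int offset, size_t n_descs);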
Thanks!

> +    return true;
> +}
> +
> +/**
> + * Try to get a freed indirect buffer for use
> + *
> + * @svq: The shadow virtqueue
> + * @total_descs: Number of descriptors needed
> + *
> + * Returns buffer index (0 to SVQ_NUM_INDIRECT_BUFS-1)
> + * if available, -1 if none available.
> + */
> +static int vhost_svq_get_indirect_buf(VhostShadowVirtqueue *svq,
> + size_t total_descs)
> +{
> + int cur = svq->indirect.current_buf;
> + SVQIndirectDescBuf *buf;
> +
> + if (!svq->indirect.enabled) {
> + return -1;
> + }
> +
> +    if (cur < 0) {
> + cur = 0;
> + }
> + /* Start from current or first buffer, try all buffers in order */
> + for (int i = 0; i < SVQ_NUM_INDIRECT_BUFS; i++) {
> + int idx = (cur + i) % SVQ_NUM_INDIRECT_BUFS;
> + buf = &svq->indirect.bufs[idx];
> +
> + if (buf->state != SVQ_INDIRECT_BUF_FREED) {
> + continue;
> + }
> +
> + /* Check if we have enough free descriptors */
> + if (buf->freed_descs >= total_descs) {
> + svq->indirect.current_buf = idx;
> + return idx;
> + }
> +
> + /* Try to borrow from next buffer */
> + size_t needed = total_descs - buf->freed_descs;
> + if ((buf->freed_descs > 0) &&
> + vhost_svq_borrow_from_next(svq, idx, needed)) {
> + svq->indirect.current_buf = idx + 1;
> + return idx;
> + }
> +
> +        /* Not enough space in this buffer, mark it as FREEING */
> + buf->state = SVQ_INDIRECT_BUF_FREEING;
> + }
> +
> + /* All buffers unavailable, fallback to chain mode */
> + return -1;
> +}
> +
> +static int vhost_svq_add_split(VhostShadowVirtqueue *svq,
> const struct iovec *out_sg, size_t out_num,
> const hwaddr *out_addr,
> const struct iovec *in_sg, size_t in_num,
> - const hwaddr *in_addr, unsigned *head)
> + const hwaddr *in_addr, unsigned *head,
> + bool *used_indirect)
> {
> unsigned avail_idx;
> vring_avail_t *avail = svq->vring.avail;
> bool ok;
> + int ret;
> g_autofree hwaddr *sgs = g_new(hwaddr, MAX(out_num, in_num));
> + size_t total_descs = out_num + in_num;
> + int indirect_buf_idx = -1;
>
> *head = svq->free_head;
> + *used_indirect = false;
>
> /* We need some descriptors here */
> if (unlikely(!out_num && !in_num)) {
> qemu_log_mask(LOG_GUEST_ERROR,
> "Guest provided element with no descriptors");
> - return false;
> + return -EINVAL;
> }
>
> - ok = vhost_svq_vring_write_descs(svq, sgs, out_sg, out_num, out_addr,
> - in_num > 0, false);
> - if (unlikely(!ok)) {
> - return false;
> +    /* Try to use indirect descriptors if feature is negotiated and total > 1 */
> + if (virtio_vdev_has_feature(svq->vdev, VIRTIO_RING_F_INDIRECT_DESC) &&
> + total_descs > 1) {
> + indirect_buf_idx = vhost_svq_get_indirect_buf(svq, total_descs);
> }
>
> - ok = vhost_svq_vring_write_descs(svq, sgs, in_sg, in_num, in_addr, false,
> - true);
> - if (unlikely(!ok)) {
> - return false;
> + if (indirect_buf_idx >= 0) {
> + /* Indirect mode: only need 1 main descriptor slot */
> + if (unlikely(vhost_svq_available_slots(svq) < 1)) {
> + return -ENOSPC;
> + }
> +
> + /* Use indirect mode */
> + ok = vhost_svq_add_split_indirect(svq, out_sg, out_num, out_addr,
> + in_sg, in_num, in_addr,
> + sgs, indirect_buf_idx);
> + if (unlikely(!ok)) {
> + error_report("indirect error, out_num %zu in_num %zu "
> + "avail index %u head %u",
> + out_num, in_num, svq->shadow_avail_idx, *head);
> + return -EINVAL;
> + }
> + *used_indirect = true;
> + } else {
> + /* Chain mode: need total_descs descriptor slots */
> + if (unlikely(vhost_svq_available_slots(svq) < total_descs)) {
> + return -ENOSPC;
> + }
> +
> + /* Use direct (chain) mode */
> + svq->desc_state[svq->free_head].indirect_buf_idx = -1;
> +
> +        ret = vhost_svq_vring_write_descs(svq, sgs, out_sg, out_num, out_addr,
> + svq->vring.desc, svq->free_head, 0,
> + in_num > 0, false, false);
> + if (unlikely(ret < 0)) {
> + return -EINVAL;
> + }
> + svq->free_head = ret;
> +
> + ret = vhost_svq_vring_write_descs(svq, sgs, in_sg, in_num, in_addr,
> + svq->vring.desc, svq->free_head, 0,
> + false, true, false);
> + if (unlikely(ret < 0)) {
> + return -EINVAL;
> + }
> + svq->free_head = ret;
> }
>
> /*
> @@ -233,7 +487,7 @@ static bool vhost_svq_add_split(VhostShadowVirtqueue *svq,
> smp_wmb();
> avail->idx = cpu_to_le16(svq->shadow_avail_idx);
>
> - return true;
> + return 0;
> }
>
> static void vhost_svq_kick(VhostShadowVirtqueue *svq)
> @@ -249,7 +503,8 @@ static void vhost_svq_kick(VhostShadowVirtqueue *svq)
> if (virtio_vdev_has_feature(svq->vdev, VIRTIO_RING_F_EVENT_IDX)) {
> uint16_t avail_event = le16_to_cpu(
> *(uint16_t *)(&svq->vring.used->ring[svq->vring.num]));
> -    needs_kick = vring_need_event(avail_event, svq->shadow_avail_idx, svq->shadow_avail_idx - 1);
> + needs_kick = vring_need_event(avail_event, svq->shadow_avail_idx,
> + svq->shadow_avail_idx - 1);
> } else {
> needs_kick =
> !(svq->vring.used->flags &
> cpu_to_le16(VRING_USED_F_NO_NOTIFY));
> @@ -274,19 +529,21 @@ int vhost_svq_add(VhostShadowVirtqueue *svq, const struct iovec *out_sg,
> {
> unsigned qemu_head;
> unsigned ndescs = in_num + out_num;
> - bool ok;
> + int r;
> + bool used_indirect = false;
>
> - if (unlikely(ndescs > vhost_svq_available_slots(svq))) {
> - return -ENOSPC;
> + r = vhost_svq_add_split(svq, out_sg, out_num, out_addr, in_sg, in_num,
> + in_addr, &qemu_head, &used_indirect);
> + if (unlikely(r != 0)) {
> + return r;
> }
>
> - ok = vhost_svq_add_split(svq, out_sg, out_num, out_addr, in_sg, in_num,
> - in_addr, &qemu_head);
> - if (unlikely(!ok)) {
> - return -EINVAL;
> + /* If using indirect, only 1 main descriptor is used; otherwise ndescs */
> + if (used_indirect) {
> + svq->num_free -= 1;
> + } else {
> + svq->num_free -= ndescs;
> }
> -
> - svq->num_free -= ndescs;
> svq->desc_state[qemu_head].elem = elem;
> svq->desc_state[qemu_head].ndescs = ndescs;
> vhost_svq_kick(svq);
> --
> 2.34.1
>