From: wafer Xie <[email protected]>

Retrieve the target buffer from the indirect buffers by index, add the
elements sent by the guest to that buffer's indirect descriptors, and
update freed_head and freed_descs. If freed_descs is zero, or if the
buffer's freed_descs is less than the number of elements, update the
buffer state to SVQ_INDIRECT_BUF_FREEING.
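As a rough illustration of the bookkeeping above, the following
standalone sketch consumes descriptors from one buffer segment and
flips it to FREEING when it runs short. The struct is only a
simplified stand-in for SVQIndirectDescBuf (defined earlier in this
series), so the exact field layout here is an assumption, not the
real definition:

/* Simplified stand-in for SVQIndirectDescBuf; field layout is assumed. */
#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

enum buf_state {
    SVQ_INDIRECT_BUF_FREED,   /* every descriptor in the segment is free */
    SVQ_INDIRECT_BUF_FREEING, /* waiting for in-flight descriptors to drain */
};

struct indirect_buf {
    uint16_t start_idx;   /* first descriptor of this segment */
    uint16_t freed_head;  /* next free slot, relative to start_idx */
    size_t freed_descs;   /* free descriptors remaining in the segment */
    enum buf_state state;
};

/* Consume total_descs slots, or flip the segment to FREEING on shortage. */
static bool buf_consume(struct indirect_buf *buf, size_t total_descs)
{
    if (buf->freed_descs == 0 || buf->freed_descs < total_descs) {
        buf->state = SVQ_INDIRECT_BUF_FREEING;
        return false;
    }
    buf->freed_head += total_descs;
    buf->freed_descs -= total_descs;
    return true;
}

int main(void)
{
    struct indirect_buf buf = {
        .freed_descs = 4, .state = SVQ_INDIRECT_BUF_FREED,
    };

    buf_consume(&buf, 3); /* ok: freed_head = 3, freed_descs = 1 */
    buf_consume(&buf, 2); /* shortage: segment flips to FREEING */
    printf("head %u free %zu state %d\n",
           buf.freed_head, buf.freed_descs, (int)buf.state);
    return 0;
}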
If the current indirect buffer does not have enough freed descriptors
to accommodate the SVQ descriptors, descriptors can be borrowed from
the next indirect buffer (a standalone sketch of the borrowing
arithmetic follows the patch).

Suggested-by: Eugenio Pérez <[email protected]>
Signed-off-by: wafer Xie <[email protected]>
---
 hw/virtio/vhost-shadow-virtqueue.c | 341 +++++++++++++++++++++++++----
 1 file changed, 299 insertions(+), 42 deletions(-)

diff --git a/hw/virtio/vhost-shadow-virtqueue.c b/hw/virtio/vhost-shadow-virtqueue.c
index 4f564f514c..02a238548c 100644
--- a/hw/virtio/vhost-shadow-virtqueue.c
+++ b/hw/virtio/vhost-shadow-virtqueue.c
@@ -139,86 +139,340 @@ static bool vhost_svq_translate_addr(const VhostShadowVirtqueue *svq,
 }
 
 /**
- * Write descriptors to SVQ vring
+ * Add a single descriptor to a descriptor table
+ *
+ * @desc: The descriptor to write to
+ * @addr: IOVA address
+ * @len: Length of the buffer
+ * @flags: Descriptor flags (VRING_DESC_F_WRITE, VRING_DESC_F_NEXT)
+ * @next: Next descriptor index (only used if VRING_DESC_F_NEXT is set)
+ */
+static void vhost_svq_vring_add_desc(vring_desc_t *desc,
+                                     hwaddr addr,
+                                     uint32_t len,
+                                     uint16_t flags,
+                                     uint16_t next)
+{
+    desc->addr = cpu_to_le64(addr);
+    desc->len = cpu_to_le32(len);
+    desc->flags = flags;
+    if (flags & cpu_to_le16(VRING_DESC_F_NEXT)) {
+        desc->next = cpu_to_le16(next);
+    }
+}
+
+/**
+ * Write descriptors to a descriptor table (chain or indirect)
  *
  * @svq: The shadow virtqueue
  * @sg: Cache for hwaddr
  * @iovec: The iovec from the guest
  * @num: iovec length
  * @addr: Descriptors' GPAs, if backed by guest memory
+ * @descs: The descriptor table to write to
+ * @start_idx: Starting index in the descriptor table
+ * @offset_idx: Offset for next field calculation (used for indirect)
  * @more_descs: True if more descriptors come in the chain
  * @write: True if they are writeable descriptors
+ * @indirect: True if writing to indirect descriptor table
  *
- * Return true if success, false otherwise and print error.
+ * Return the next free descriptor index if success, -1 on error.
  */
-static bool vhost_svq_vring_write_descs(VhostShadowVirtqueue *svq, hwaddr *sg,
-                                        const struct iovec *iovec, size_t num,
-                                        const hwaddr *addr, bool more_descs,
-                                        bool write)
+static int vhost_svq_vring_write_descs(VhostShadowVirtqueue *svq,
+                                       hwaddr *sg,
+                                       const struct iovec *iovec,
+                                       size_t num,
+                                       const hwaddr *addr,
+                                       vring_desc_t *descs,
+                                       uint16_t start_idx,
+                                       size_t offset_idx,
+                                       bool more_descs,
+                                       bool write,
+                                       bool indirect)
 {
-    uint16_t i = svq->free_head, last = svq->free_head;
-    unsigned n;
+    uint16_t i = start_idx, last = start_idx;
     uint16_t flags = write ? cpu_to_le16(VRING_DESC_F_WRITE) : 0;
-    vring_desc_t *descs = svq->vring.desc;
     bool ok;
 
     if (num == 0) {
-        return true;
+        return start_idx;
     }
 
     ok = vhost_svq_translate_addr(svq, sg, iovec, num, addr);
     if (unlikely(!ok)) {
-        return false;
+        return -1;
     }
 
-    for (n = 0; n < num; n++) {
-        if (more_descs || (n + 1 < num)) {
-            descs[i].flags = flags | cpu_to_le16(VRING_DESC_F_NEXT);
-            descs[i].next = cpu_to_le16(svq->desc_next[i]);
+    for (size_t n = 0; n < num; n++) {
+        uint16_t desc_flags = flags;
+        uint16_t next;
+
+        if (indirect) {
+            next = offset_idx + n + 1;
         } else {
-            descs[i].flags = flags;
+            next = svq->desc_next[i];
         }
-        descs[i].addr = cpu_to_le64(sg[n]);
-        descs[i].len = cpu_to_le32(iovec[n].iov_len);
+        if (more_descs || (n + 1 < num)) {
+            desc_flags |= cpu_to_le16(VRING_DESC_F_NEXT);
+        }
+        vhost_svq_vring_add_desc(&descs[i], sg[n],
+                                 iovec[n].iov_len, desc_flags, next);
 
         last = i;
-        i = svq->desc_next[i];
+        if (indirect) {
+            i++;
+        } else {
+            i = next;
+        }
+    }
+
+    /* Return the next free index */
+    if (!indirect) {
+        i = svq->desc_next[last];
+    }
+    return i;
+}
+
+/**
+ * Add descriptors to SVQ vring using indirect descriptors (dual-buffer)
+ *
+ * @svq: The shadow virtqueue
+ * @out_sg: The out iovec from the guest
+ * @out_num: The out iovec length
+ * @out_addr: The out descriptors' GPAs
+ * @in_sg: The in iovec from the guest
+ * @in_num: The in iovec length
+ * @in_addr: The in descriptors' GPAs
+ * @sgs: Cache for hwaddr
+ * @buf_idx: Index of the indirect buffer to use
+ *
+ * Return true if success, false otherwise and print error.
+ */
+static bool vhost_svq_add_split_indirect(VhostShadowVirtqueue *svq,
+                                         const struct iovec *out_sg,
+                                         size_t out_num,
+                                         const hwaddr *out_addr,
+                                         const struct iovec *in_sg,
+                                         size_t in_num,
+                                         const hwaddr *in_addr,
+                                         hwaddr *sgs, int buf_idx)
+{
+    SVQIndirectDescBuf *buf = &svq->indirect.bufs[buf_idx];
+    uint16_t start_idx = buf->start_idx + buf->freed_head;
+    size_t total_descs = out_num + in_num;
+    hwaddr indirect_iova;
+    int ret;
+
+    /* Populate indirect descriptor table for out descriptors */
+    ret = vhost_svq_vring_write_descs(svq, sgs, out_sg, out_num, out_addr,
+                                      svq->indirect.desc, start_idx,
+                                      0, in_num > 0, false, true);
+    if (unlikely(ret < 0)) {
+        return false;
+    }
+
+    /* Populate indirect descriptor table for in descriptors */
+    ret = vhost_svq_vring_write_descs(svq, sgs, in_sg, in_num, in_addr,
+                                      svq->indirect.desc, start_idx + out_num,
+                                      out_num, false, true, true);
+    if (unlikely(ret < 0)) {
+        return false;
     }
 
-    svq->free_head = svq->desc_next[last];
+    /* Calculate IOVA for this indirect descriptor range */
+    indirect_iova = svq->indirect.iova + start_idx * sizeof(vring_desc_t);
+
+    /* Add a single descriptor pointing to the indirect table */
+    svq->vring.desc[svq->free_head].addr = cpu_to_le64(indirect_iova);
+    svq->vring.desc[svq->free_head].len =
+        cpu_to_le32(total_descs * sizeof(vring_desc_t));
+    svq->vring.desc[svq->free_head].flags = cpu_to_le16(VRING_DESC_F_INDIRECT);
+
+    /* Store indirect descriptor info in desc_state */
+    svq->desc_state[svq->free_head].indirect_buf_idx = buf_idx;
+
+    /* Update buffer state */
+    buf->freed_head += total_descs;
+    buf->freed_descs -= total_descs;
+
+    /* Move free_head forward */
+    svq->free_head = svq->desc_next[svq->free_head];
+
     return true;
 }
 
-static bool vhost_svq_add_split(VhostShadowVirtqueue *svq,
+/**
+ * Try to borrow descriptors from the next buffer segment
+ *
+ * @svq: The shadow virtqueue
+ * @buf_idx: Current buffer index
+ * @needed: Number of additional descriptors needed
+ *
+ * Returns true if successfully borrowed, false otherwise.
+ * Note: Last buffer cannot borrow from first buffer (IOVA not contiguous).
+ */
+static bool vhost_svq_borrow_from_next(VhostShadowVirtqueue *svq,
+                                       int buf_idx, size_t needed)
+{
+    SVQIndirectDescBuf *cur_buf = &svq->indirect.bufs[buf_idx];
+    SVQIndirectDescBuf *next_buf;
+    int next_idx;
+
+    /* Last buffer cannot borrow from first - IOVA would not be contiguous */
+    if (buf_idx == SVQ_NUM_INDIRECT_BUFS - 1) {
+        return false;
+    }
+
+    next_idx = buf_idx + 1;
+    next_buf = &svq->indirect.bufs[next_idx];
+
+    /* Can borrow if next buffer is in FREED state and has freed_head at 0 */
+    if (next_buf->state != SVQ_INDIRECT_BUF_FREED ||
+        next_buf->freed_head != 0) {
+        return false;
+    }
+
+    /* Check if next buffer has enough free descriptors to lend */
+    if (next_buf->freed_descs < needed) {
+        return false;
+    }
+
+    /* Borrow descriptors: expand current buffer, shrink next buffer */
+    cur_buf->num_descs += needed;
+    cur_buf->borrowed_descs += needed;
+    cur_buf->freed_descs += needed;
+
+    next_buf->start_idx += needed;
+    next_buf->num_descs -= needed;
+    next_buf->freed_descs -= needed;
+
+    return true;
+}
+
+/**
+ * Try to get a freed indirect buffer for use
+ *
+ * @svq: The shadow virtqueue
+ * @total_descs: Number of descriptors needed
+ *
+ * Returns the buffer index (0 to SVQ_NUM_INDIRECT_BUFS - 1) if one is
+ * available, -1 if none is available.
+ */
+static int vhost_svq_get_indirect_buf(VhostShadowVirtqueue *svq,
+                                      size_t total_descs)
+{
+    int cur = svq->indirect.current_buf;
+    SVQIndirectDescBuf *buf;
+
+    if (!svq->indirect.enabled) {
+        return -1;
+    }
+
+    if (cur < 0) {
+        cur = 0;
+    }
+
+    /* Start from current or first buffer, try all buffers in order */
+    for (int i = 0; i < SVQ_NUM_INDIRECT_BUFS; i++) {
+        int idx = (cur + i) % SVQ_NUM_INDIRECT_BUFS;
+        buf = &svq->indirect.bufs[idx];
+
+        if (buf->state != SVQ_INDIRECT_BUF_FREED) {
+            continue;
+        }
+
+        /* Check if we have enough free descriptors */
+        if (buf->freed_descs >= total_descs) {
+            svq->indirect.current_buf = idx;
+            return idx;
+        }
+
+        /* Try to borrow from next buffer */
+        size_t needed = total_descs - buf->freed_descs;
+        if ((buf->freed_descs > 0) &&
+            vhost_svq_borrow_from_next(svq, idx, needed)) {
+            svq->indirect.current_buf = idx + 1;
+            return idx;
+        }

+        /* Not enough space; mark the buffer as FREEING */
+        buf->state = SVQ_INDIRECT_BUF_FREEING;
+    }
+
+    /* All buffers unavailable, fall back to chain mode */
+    return -1;
+}
+
+static int vhost_svq_add_split(VhostShadowVirtqueue *svq,
                                 const struct iovec *out_sg, size_t out_num,
                                 const hwaddr *out_addr,
                                 const struct iovec *in_sg, size_t in_num,
-                                const hwaddr *in_addr, unsigned *head)
+                                const hwaddr *in_addr, unsigned *head,
+                                bool *used_indirect)
 {
     unsigned avail_idx;
     vring_avail_t *avail = svq->vring.avail;
     bool ok;
+    int ret;
     g_autofree hwaddr *sgs = g_new(hwaddr, MAX(out_num, in_num));
+    size_t total_descs = out_num + in_num;
+    int indirect_buf_idx = -1;
 
     *head = svq->free_head;
+    *used_indirect = false;
 
     /* We need some descriptors here */
     if (unlikely(!out_num && !in_num)) {
         qemu_log_mask(LOG_GUEST_ERROR,
                       "Guest provided element with no descriptors");
-        return false;
+        return -EINVAL;
     }
 
-    ok = vhost_svq_vring_write_descs(svq, sgs, out_sg, out_num, out_addr,
-                                     in_num > 0, false);
-    if (unlikely(!ok)) {
-        return false;
+    /* Try indirect descriptors if the feature is negotiated and total > 1 */
+    if (virtio_vdev_has_feature(svq->vdev, VIRTIO_RING_F_INDIRECT_DESC) &&
+        total_descs > 1) {
+        indirect_buf_idx = vhost_svq_get_indirect_buf(svq, total_descs);
     }
 
-    ok = vhost_svq_vring_write_descs(svq, sgs, in_sg, in_num, in_addr, false,
-                                     true);
-    if (unlikely(!ok)) {
-        return false;
+    if (indirect_buf_idx >= 0) {
+        /* Indirect mode: only need 1 main descriptor slot */
+        if (unlikely(vhost_svq_available_slots(svq) < 1)) {
+            return -ENOSPC;
+        }
+
+        /* Use indirect mode */
+        ok = vhost_svq_add_split_indirect(svq, out_sg, out_num, out_addr,
+                                          in_sg, in_num, in_addr,
+                                          sgs, indirect_buf_idx);
+        if (unlikely(!ok)) {
+            error_report("indirect error, out_num %zu in_num %zu "
+                         "avail index %u head %u",
+                         out_num, in_num, svq->shadow_avail_idx, *head);
+            return -EINVAL;
+        }
+        *used_indirect = true;
+    } else {
+        /* Chain mode: need total_descs descriptor slots */
+        if (unlikely(vhost_svq_available_slots(svq) < total_descs)) {
+            return -ENOSPC;
+        }
+
+        /* Use direct (chain) mode */
+        svq->desc_state[svq->free_head].indirect_buf_idx = -1;
+
+        ret = vhost_svq_vring_write_descs(svq, sgs, out_sg, out_num, out_addr,
+                                          svq->vring.desc, svq->free_head, 0,
+                                          in_num > 0, false, false);
+        if (unlikely(ret < 0)) {
+            return -EINVAL;
+        }
+        svq->free_head = ret;
+
+        ret = vhost_svq_vring_write_descs(svq, sgs, in_sg, in_num, in_addr,
+                                          svq->vring.desc, svq->free_head, 0,
+                                          false, true, false);
+        if (unlikely(ret < 0)) {
+            return -EINVAL;
+        }
+        svq->free_head = ret;
     }
 
     /*
@@ -233,7 +487,7 @@ static bool vhost_svq_add_split(VhostShadowVirtqueue *svq,
     smp_wmb();
     avail->idx = cpu_to_le16(svq->shadow_avail_idx);
 
-    return true;
+    return 0;
 }
 
 static void vhost_svq_kick(VhostShadowVirtqueue *svq)
@@ -249,7 +503,8 @@ static void vhost_svq_kick(VhostShadowVirtqueue *svq)
     if (virtio_vdev_has_feature(svq->vdev, VIRTIO_RING_F_EVENT_IDX)) {
         uint16_t avail_event = le16_to_cpu(
                 *(uint16_t *)(&svq->vring.used->ring[svq->vring.num]));
-        needs_kick = vring_need_event(avail_event, svq->shadow_avail_idx, svq->shadow_avail_idx - 1);
+        needs_kick = vring_need_event(avail_event, svq->shadow_avail_idx,
+                                      svq->shadow_avail_idx - 1);
     } else {
         needs_kick =
                 !(svq->vring.used->flags & cpu_to_le16(VRING_USED_F_NO_NOTIFY));
@@ -274,19 +529,21 @@ int vhost_svq_add(VhostShadowVirtqueue *svq, const struct iovec *out_sg,
 {
     unsigned qemu_head;
     unsigned ndescs = in_num + out_num;
-    bool ok;
+    int r;
+    bool used_indirect = false;
 
-    if (unlikely(ndescs > vhost_svq_available_slots(svq))) {
-        return -ENOSPC;
+    r = vhost_svq_add_split(svq, out_sg, out_num, out_addr, in_sg, in_num,
+                            in_addr, &qemu_head, &used_indirect);
+    if (unlikely(r != 0)) {
+        return r;
     }
 
-    ok = vhost_svq_add_split(svq, out_sg, out_num, out_addr, in_sg, in_num,
-                             in_addr, &qemu_head);
-    if (unlikely(!ok)) {
-        return -EINVAL;
+    /* If using indirect, only 1 main descriptor is used; otherwise ndescs */
+    if (used_indirect) {
+        svq->num_free -= 1;
+    } else {
+        svq->num_free -= ndescs;
    }
-
-    svq->num_free -= ndescs;
     svq->desc_state[qemu_head].elem = elem;
     svq->desc_state[qemu_head].ndescs = ndescs;
     vhost_svq_kick(svq);
-- 
2.34.1
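For reference, the segment arithmetic in vhost_svq_borrow_from_next()
can be exercised in isolation. The sketch below reuses the simplified
segment layout from the note above (an assumption, not the series'
actual SVQIndirectDescBuf) and checks that borrowing keeps the two
adjacent descriptor ranges contiguous:

/* Borrowing sketch; same assumed segment layout as the earlier note. */
#include <assert.h>
#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>

#define NUM_BUFS 2 /* stand-in for SVQ_NUM_INDIRECT_BUFS */

struct segment {
    uint16_t start_idx;
    uint16_t freed_head;
    size_t num_descs;
    size_t freed_descs;
    size_t borrowed_descs;
    bool freed; /* stand-in for state == SVQ_INDIRECT_BUF_FREED */
};

static bool borrow_from_next(struct segment *bufs, int idx, size_t needed)
{
    struct segment *cur = &bufs[idx], *next;

    /* The last segment has no contiguous successor to borrow from. */
    if (idx == NUM_BUFS - 1) {
        return false;
    }
    next = &bufs[idx + 1];

    /* Next must be fully freed, untouched, and able to lend `needed`. */
    if (!next->freed || next->freed_head != 0 || next->freed_descs < needed) {
        return false;
    }

    cur->num_descs += needed;
    cur->borrowed_descs += needed;
    cur->freed_descs += needed;

    next->start_idx += needed;
    next->num_descs -= needed;
    next->freed_descs -= needed;
    return true;
}

int main(void)
{
    struct segment bufs[NUM_BUFS] = {
        { .start_idx = 0, .freed_head = 14, .num_descs = 16,
          .freed_descs = 2, .freed = true },
        { .start_idx = 16, .freed_head = 0, .num_descs = 16,
          .freed_descs = 16, .freed = true },
    };

    /* bufs[0] needs 5 slots but has 2 free: borrow the missing 3. */
    assert(borrow_from_next(bufs, 0, 3));
    assert(bufs[0].num_descs == 19 && bufs[0].freed_descs == 5);
    assert(bufs[1].start_idx == 19 && bufs[1].num_descs == 13);
    return 0;
}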
