On Sun, Jan 18, 2026 at 9:31 AM Wafer <[email protected]> wrote:
>
>
>
> > -----Original Message-----
> > From: Eugenio Perez Martin <[email protected]>
> > Sent: January 15, 2026 18:56
> > To: Wafer <[email protected]>
> > Cc: [email protected]; [email protected]; [email protected];
> > [email protected]; [email protected]; Angus Chen
> > <[email protected]>
> > Subject: Re: [PATCH v4 4/4] vhost: SVQ add the indirect descriptors to
> > available ring
> >
> > On Wed, Dec 31, 2025 at 12:01 PM Wafer Xie <[email protected]>
> > wrote:
> > >
> > > From: wafer Xie <[email protected]>
> > >
> > > Retrieve the target buffer from the indirect buffers by index, add the
> > > elements sent by the guest into the buffer's indirect descriptors, and
> > > update freed_head and freed_descs. If freed_descs is zero, or if the
> > > current buffer's freed_descs is less than the number of elements,
> > > update the buffer state to SVQ_INDIRECT_BUF_FREEING.
> > >
> > > If the current indirect buffer does not have enough freed descriptors
> > > to accommodate the SVQ descriptors, descriptors can be borrowed from
> > > the next indirect buffer.
> > >
> > > Suggested-by: Eugenio Pérez <[email protected]>
> > > Signed-off-by: wafer Xie <[email protected]>
> > > ---
> > >  hw/virtio/vhost-shadow-virtqueue.c | 341 +++++++++++++++++++++++++----
> > >  1 file changed, 299 insertions(+), 42 deletions(-)
> > >
> > > diff --git a/hw/virtio/vhost-shadow-virtqueue.c b/hw/virtio/vhost-shadow-virtqueue.c
> > > index 4f564f514c..02a238548c 100644
> > > --- a/hw/virtio/vhost-shadow-virtqueue.c
> > > +++ b/hw/virtio/vhost-shadow-virtqueue.c
> > > @@ -139,86 +139,340 @@ static bool vhost_svq_translate_addr(const VhostShadowVirtqueue *svq,
> > >  }
> > >
> > >  /**
> > > - * Write descriptors to SVQ vring
> > > + * Add a single descriptor to a descriptor table
> > > + *
> > > + * @desc: The descriptor to write to
> > > + * @addr: IOVA address
> > > + * @len: Length of the buffer
> > > + * @flags: Descriptor flags (VRING_DESC_F_WRITE, VRING_DESC_F_NEXT)
> > > + * @next: Next descriptor index (only used if VRING_DESC_F_NEXT is set)
> > > + */
> > > +static void vhost_svq_vring_add_desc(vring_desc_t *desc,
> > > +                                       hwaddr addr,
> > > +                                       uint32_t len,
> > > +                                       uint16_t flags,
> > > +                                       uint16_t next)
> > > +{
> > > +    desc->addr = cpu_to_le64(addr);
> > > +    desc->len = cpu_to_le32(len);
> > > +    desc->flags = flags;
> > > +    if (flags & cpu_to_le16(VRING_DESC_F_NEXT)) {
> > > +        desc->next = cpu_to_le16(next);
> > > +    }
> > > +}
> > > +
> > > +/**
> > > + * Write descriptors to a descriptor table (chain or indirect)
> > >   *
> > >   * @svq: The shadow virtqueue
> > >   * @sg: Cache for hwaddr
> > >   * @iovec: The iovec from the guest
> > >   * @num: iovec length
> > >   * @addr: Descriptors' GPAs, if backed by guest memory
> > > + * @descs: The descriptor table to write to
> > > + * @start_idx: Starting index in the descriptor table
> > > + * @offset_idx: Offset for next field calculation (used for indirect)
> > >   * @more_descs: True if more descriptors come in the chain
> > >   * @write: True if they are writeable descriptors
> > > + * @indirect: True if writing to indirect descriptor table
> > >   *
> > > - * Return true if success, false otherwise and print error.
> > > + * Return the next free descriptor index if success, -1 on error.
> > >   */
> > > -static bool vhost_svq_vring_write_descs(VhostShadowVirtqueue *svq, hwaddr *sg,
> > > -                                        const struct iovec *iovec, size_t num,
> > > -                                        const hwaddr *addr, bool more_descs,
> > > -                                        bool write)
> > > +static int vhost_svq_vring_write_descs(VhostShadowVirtqueue *svq,
> > > +                                       hwaddr *sg,
> > > +                                       const struct iovec *iovec,
> > > +                                       size_t num,
> > > +                                       const hwaddr *addr,
> > > +                                       vring_desc_t *descs,
> > > +                                       uint16_t start_idx,
> > > +                                       size_t offset_idx,
> > > +                                       bool more_descs,
> > > +                                       bool write,
> > > +                                       bool indirect)
> > >  {
> > > -    uint16_t i = svq->free_head, last = svq->free_head;
> > > -    unsigned n;
> > > +    uint16_t i = start_idx, last = start_idx;
> > >      uint16_t flags = write ? cpu_to_le16(VRING_DESC_F_WRITE) : 0;
> > > -    vring_desc_t *descs = svq->vring.desc;
> > >      bool ok;
> > >
> > >      if (num == 0) {
> > > -        return true;
> > > +        return start_idx;
> > >      }
> > >
> > >      ok = vhost_svq_translate_addr(svq, sg, iovec, num, addr);
> > >      if (unlikely(!ok)) {
> > > -        return false;
> > > +        return -1;
> > >      }
> > >
> > > -    for (n = 0; n < num; n++) {
> > > -        if (more_descs || (n + 1 < num)) {
> > > -            descs[i].flags = flags | cpu_to_le16(VRING_DESC_F_NEXT);
> > > -            descs[i].next = cpu_to_le16(svq->desc_next[i]);
> > > +    for (size_t n = 0; n < num; n++) {
> > > +        uint16_t desc_flags = flags;
> > > +        uint16_t next;
> > > +
> > > +        if (indirect) {
> > > +            next = offset_idx + n + 1;
> > >          } else {
> > > -            descs[i].flags = flags;
> > > +            next = svq->desc_next[i];
> > >          }
> > > -        descs[i].addr = cpu_to_le64(sg[n]);
> > > -        descs[i].len = cpu_to_le32(iovec[n].iov_len);
> > >
> > > +        if (more_descs || (n + 1 < num)) {
> > > +            desc_flags |= cpu_to_le16(VRING_DESC_F_NEXT);
> > > +        }
> > > +        vhost_svq_vring_add_desc(&descs[i], sg[n],
> > > +                                 iovec[n].iov_len, desc_flags, next);
> > >          last = i;
> > > -        i = svq->desc_next[i];
> > > +        if (indirect) {
> > > +            i++;
> > > +        } else {
> > > +            i = next;
> > > +        }
> > > +    }
> > > +
> > > +    /* Return the next free index */
> > > +    if (!indirect) {
> > > +        i = svq->desc_next[last];
> > > +    }
> > > +    return i;
> > > +}
> > > +
> > > +/**
> > > + * Add descriptors to SVQ vring using indirect descriptors (dual-buffer)
> > > + *
> > > + * @svq: The shadow virtqueue
> > > + * @out_sg: The out iovec from the guest
> > > + * @out_num: The out iovec length
> > > + * @out_addr: The out descriptors' GPAs
> > > + * @in_sg: The in iovec from the guest
> > > + * @in_num: The in iovec length
> > > + * @in_addr: The in descriptors' GPAs
> > > + * @sgs: Cache for hwaddr
> > > + * @buf_idx: Index of the indirect buffer to use
> > > + *
> > > + * Return true if success, false otherwise and print error.
> > > + */
> > > +static bool vhost_svq_add_split_indirect(VhostShadowVirtqueue *svq,
> > > +                                         const struct iovec *out_sg,
> > > +                                         size_t out_num,
> > > +                                         const hwaddr *out_addr,
> > > +                                         const struct iovec *in_sg,
> > > +                                         size_t in_num,
> > > +                                         const hwaddr *in_addr,
> > > +                                         hwaddr *sgs, int buf_idx)
> > > +{
> > > +    SVQIndirectDescBuf *buf = &svq->indirect.bufs[buf_idx];
> > > +    uint16_t start_idx = buf->start_idx + buf->freed_head;
> > > +    size_t total_descs = out_num + in_num;
> > > +    hwaddr indirect_iova;
> > > +    int ret;
> > > +
> > > +    /* Populate indirect descriptor table for out descriptors */
> > > +    ret = vhost_svq_vring_write_descs(svq, sgs, out_sg, out_num, out_addr,
> > > +                                      svq->indirect.desc, start_idx,
> > > +                                      0, in_num > 0, false, true);
> > > +    if (unlikely(ret < 0)) {
> > > +        return false;
> > > +    }
> > > +
> > > +    /* Populate indirect descriptor table for in descriptors */
> > > +    ret = vhost_svq_vring_write_descs(svq, sgs, in_sg, in_num, in_addr,
> > > +                                      svq->indirect.desc, start_idx + out_num,
> > > +                                      out_num, false, true, true);
> > > +    if (unlikely(ret < 0)) {
> > > +        return false;
> > >      }
> > >
> > > -    svq->free_head = svq->desc_next[last];
> > > +    /* Calculate IOVA for this indirect descriptor range */
> > > +    indirect_iova = svq->indirect.iova + start_idx * sizeof(vring_desc_t);
> > > +
> > > +    /* Add a single descriptor pointing to the indirect table */
> > > +    svq->vring.desc[svq->free_head].addr = cpu_to_le64(indirect_iova);
> > > +    svq->vring.desc[svq->free_head].len =
> > > +            cpu_to_le32(total_descs * sizeof(vring_desc_t));
> > > +    svq->vring.desc[svq->free_head].flags =
> > > +            cpu_to_le16(VRING_DESC_F_INDIRECT);
> > > +
> > > +    /* Store indirect descriptor info in desc_state */
> > > +    svq->desc_state[svq->free_head].indirect_buf_idx = buf_idx;
> > > +
> > > +    /* Update buffer state */
> > > +    buf->freed_head += total_descs;
> > > +    buf->freed_descs -= total_descs;
> > > +
> > > +    /* Move free_head forward */
> > > +    svq->free_head = svq->desc_next[svq->free_head];
> > > +
> > >      return true;
> > >  }
> > >
> > > -static bool vhost_svq_add_split(VhostShadowVirtqueue *svq,
> > > +/**
> > > + * Try to borrow descriptors from the next buffer segment
> > > + *
> > > + * @svq: The shadow virtqueue
> > > + * @buf_idx: Current buffer index
> > > + * @needed: Number of additional descriptors needed
> > > + *
> > > + * Returns true if successfully borrowed, false otherwise.
> > > + * Note: Last buffer cannot borrow from first buffer (IOVA not contiguous).
> > > + */
> > > +static bool vhost_svq_borrow_from_next(VhostShadowVirtqueue *svq,
> > > +                                       int buf_idx, size_t needed)
> > > +{
> > > +    SVQIndirectDescBuf *cur_buf = &svq->indirect.bufs[buf_idx];
> > > +    SVQIndirectDescBuf *next_buf;
> > > +    int next_idx;
> > > +
> > > +    /* Last buffer cannot borrow from first - IOVA would not be contiguous */
> > > +    if (buf_idx == SVQ_NUM_INDIRECT_BUFS - 1) {
> > > +        return false;
> > > +    }
> > > +
> > > +    next_idx = buf_idx + 1;
> > > +    next_buf = &svq->indirect.bufs[next_idx];
> > > +
> > > +    /* Can borrow if next buffer is in FREED state and has freed_head at 0 */
> > > +    if (next_buf->state != SVQ_INDIRECT_BUF_FREED ||
> > > +        next_buf->freed_head != 0) {
> > > +        return false;
> > > +    }
> > > +
> > > +    /* Check if next buffer has enough free descriptors to lend */
> > > +    if (next_buf->freed_descs < needed) {
> > > +        return false;
> > > +    }
> > > +    /* Borrow descriptors: expand current buffer, shrink next buffer */
> > > +    cur_buf->num_descs += needed;
> > > +    cur_buf->borrowed_descs += needed;
> > > +    cur_buf->freed_descs += needed;
> > > +
> > > +    next_buf->start_idx += needed;
> > > +    next_buf->num_descs -= needed;
> > > +    next_buf->freed_descs -= needed;
> > > +
> >
> > I don't think I understand the algorithm for indirect tables allocation and
> > freeing. Let me know what I'm missing.
> >
> > We can have at max SVQ_NUM_INDIRECT_BUFS (4) indirect tables, as the
> > information we use to locate the indirect table itself is
> > svq->desc_state[used_elem.id].indirect_buf_idx. This is used to access
> > the svq->indirect.bufs, whose size is SVQ_NUM_INDIRECT_BUFS.
> >
> > These tables could grow until the maximum descriptors allowed, which is the
> > size of the vring per VirtIO standard. The function virtqueue_pop already
> > checks this. But this code does not allow more than 4 indirect tables anyway,
> > even if they are smaller than vq size and there is contiguous space in the
> > mapped buffer for the indirect tables.
> >
>
>      Thank you for the detailed review! Let me clarify the design as I think
>      there may be some misunderstanding.
>
>      BUFFER ALLOCATION:
>      ------------------
>      In the Linux kernel, virtio-net limits the maximum scatterlist size
>      to 17 entries (# define CONFIG_MAX_SKB_FRAGS 17). With a vq_size of
>      256 and indirect descriptors enabled, the virtio-net driver can
>      therefore submit up to 256 * 17 scatter-gather entries.
>
>      Based on this, the implementation allocates an indirect descriptor
>      capacity of about half that theoretical maximum, specifically
>      vq_size * 2 * 4 descriptors.
>
>      The total allocation is:
>
>        SVQ_NUM_INDIRECT_BUFS(4) * vq_size * 2 descriptors
>
>      Each buf[i] has a capacity of 2*vq_size, not vq_size. The full buffer
>      is divided into 4 segments, each capable of holding 2x the vring size,
>      so there is no wasted space.
>
>      +---------------------------------------------------------------------+
>      |         Total: 4 bufs * 2 * vq_size = 8 * vq_size descriptors       |
>      +----------------+----------------+----------------+------------------+
>      |     buf[0]     |     buf[1]     |     buf[2]     |      buf[3]      |
>      |  2*vq_size cap |  2*vq_size cap |  2*vq_size cap |   2*vq_size cap  |
>      +----------------+----------------+----------------+------------------+
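>
>      As a rough standalone sketch of that layout (hypothetical model code:
>      it reuses this patch's field names, but init_segments and the exact
>      types are made up for illustration):
>
>        #include <stdint.h>
>
>        #define SVQ_NUM_INDIRECT_BUFS 4
>
>        typedef struct {
>            uint16_t start_idx;   /* base slot in the shared desc table */
>            uint16_t num_descs;   /* capacity of this segment */
>            uint16_t freed_head;  /* next free slot within the segment */
>            uint16_t freed_descs; /* free slots remaining in the segment */
>        } SVQIndirectDescBuf;
>
>        /* Carve one table of 4 * 2 * vq_size descriptors into 4 segments */
>        static void init_segments(SVQIndirectDescBuf bufs[], uint16_t vq_size)
>        {
>            for (int i = 0; i < SVQ_NUM_INDIRECT_BUFS; i++) {
>                bufs[i].start_idx = i * 2 * vq_size;
>                bufs[i].num_descs = 2 * vq_size;
>                bufs[i].freed_head = 0;
>                bufs[i].freed_descs = 2 * vq_size;
>            }
>        }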
>
>      WHY 2*vq_size PER BUFFER:
>      -------------------------
>      The reason each buffer has 2*vq_size capacity (instead of 1*vq_size) is
>      to REDUCE the frequency of borrowing scenarios and improve performance.
>
>      With 2*vq_size capacity per buffer:
>      - Most requests can be satisfied within a single buffer
>      - Borrowing from next buffer only happens in rare edge cases
>      - Better cache locality and fewer cross-buffer operations
>
>      If we used 1*vq_size per buffer, borrowing would occur much more
>      frequently, adding overhead to the common path.
>
>      WHY BORROWING IS STILL NEEDED:
>      ------------------------------
>      Even with 2*vq_size capacity, the borrowing mechanism addresses the
>      edge cases you mentioned in your previous review about reusing the
>      free holes:
>
>      https://lists.gnu.org/archive/html/qemu-devel/2025-12/msg02779.html
>
>      Consider this situation:
>
>        buf[0] state after heavy usage:
>        +------------------------------------------------------------+
>        | freed_descs = 4  (only 4 free at the tail)                 |
>        | freeing_descs = 100  (waiting for device to return)        |
>        | freed_head = 508                                           |
>        |                                                            |
>        | [0.....507: IN_USE/FREEING] [508,509,510,511: FREE]        |
>        +------------------------------------------------------------+
>
>      Now driver sends a request needing 8 descriptors:
>
>      WITHOUT borrowing:
>        - buf[0] has only 4 freed_descs < 8 needed
>        - buf[0] transitions to FREEING state
>        - Those 4 free descriptors are WASTED
>        - Must use buf[1] from scratch
>
>      WITH borrowing:
>        - buf[0] has 4 freed_descs, needs 4 more
>        - Borrow 4 from buf[1]'s head (IOVA is contiguous!)
>        - Now have 8 contiguous descriptors: [508-511 from buf[0]] +
>          [0-3 from buf[1]]
>
>        buf[0]:                              buf[1]:
>        +--------------------------------+   +----------------------+
>        | [508,509,510,511,|0,1,2,3]     |   | start_idx: 4         |
>        |  <-- original --> <--borrowed->|   | num_descs: 2*vq-4    |
>        |      8 contiguous descs        |   |                      |
>        +--------------------------------+   +----------------------+
>               Contiguous IOVA!
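>
>      In code, that borrow is plain offset arithmetic on two adjacent
>      segments. A sketch with the example's numbers (mirroring
>      vhost_svq_borrow_from_next from this patch; slot numbers relative
>      to each buffer as in the diagram, vq_size = 256 assumed):
>
>        size_t needed = 8 - cur_buf->freed_descs;    /* 8 - 4 = 4 */
>
>        /* Expand buf[0] over buf[1]'s head; the IOVA stays contiguous */
>        cur_buf->num_descs      += needed;           /* 512 -> 516 */
>        cur_buf->borrowed_descs += needed;           /*   0 ->   4 */
>        cur_buf->freed_descs    += needed;           /*   4 ->   8 */
>
>        /* Shrink buf[1] from its front */
>        next_buf->start_idx     += needed;           /*   0 ->   4 */
>        next_buf->num_descs     -= needed;           /* 512 -> 508 */
>        next_buf->freed_descs   -= needed;
>
>        /* buf[0] now owns slots [508..511] plus buf[1]'s old [0..3]:
>         * 8 IOVA-contiguous descriptors for the request. */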
>

I understand that algorithm, but I don't get what it buys. As you're
only allowing 4 indirect tables in flight at max, you are just moving
the descriptors you tag as "wasted" from the tail of the queue to the
inter-buf space. And this edge case carries considerably more
complexity than the regular code path.

Let's try from a different angle: can you describe a buf[0,1,2,3]
state where forwarding an indirect table is not possible without
splitting it between many buffers?

>      BORROWING RULES:
>      ----------------
>      To keep the design simple and predictable:
>
>      1. Each buf[i] can only borrow ONCE from buf[i+1]
>      2. After borrowing, svq->indirect.current_buf is set to buf[i+1]
>      3. Subsequent requests will use buf[i+1] as the starting point
>
>      This ensures:
>      - No complex multi-level borrowing chains
>      - Predictable buffer rotation
>      - Simple bookkeeping (only track borrowed_descs per buffer)
>
>      Example flow:
>
>        +-------------------------------------------------------------+
>        | current_buf = 0                                             |
>        | buf[0] needs to borrow from buf[1]                          |
>        |   -> borrow succeeds                                        |
>        |   -> current_buf = 1  (next requests start from buf[1])     |
>        |                                                             |
>        | Later, buf[1] needs to borrow from buf[2]                   |
>        |   -> borrow succeeds                                        |
>        |   -> current_buf = 2  (next requests start from buf[2])     |
>        +-------------------------------------------------------------+
>
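>      In code, this rotation is the selection path of
>      vhost_svq_get_indirect_buf in this patch (abridged, not verbatim):
>
>        if (buf->freed_descs >= total_descs) {
>            svq->indirect.current_buf = idx;      /* keep using buf[idx] */
>            return idx;
>        }
>
>        needed = total_descs - buf->freed_descs;
>        if (buf->freed_descs > 0 &&
>            vhost_svq_borrow_from_next(svq, idx, needed)) {
>            /* One-time borrow: later requests start from buf[idx + 1] */
>            svq->indirect.current_buf = idx + 1;
>            return idx;
>        }
>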
>      DESIGN SUMMARY:
>      ---------------
>        +-------------------------------------------------------------+
>        |  2*vq_size per buffer  ->  Reduces borrowing frequency      |
>        |                            (performance optimization)       |
>        +-------------------------------------------------------------+
>        |  Borrowing mechanism   ->  Handles edge cases when buffer   |
>        |                            is almost full but not empty     |
>        |                            (avoids descriptor waste)        |
>        +-------------------------------------------------------------+
>        |  One-time borrow rule  ->  Simple state management          |
>        |                            (current_buf moves to next)      |
>        +-------------------------------------------------------------+
>
>      WHY NOT FIXED CHUNKS:
>      ---------------------
>      Fixed vq_size chunks would waste descriptors in the scenario above.
>      The 2*vq_size capacity + borrowing allows:
>
>      1. Single request can use up to 2*vq_size descs (exceeding vq_size)
>      2. Partially-used buffers can still contribute to new requests
>      3. Maximum utilization of the pre-allocated IOVA space
>
>      REGARDING BUDDY ALLOCATOR:
>      --------------------------
>      A buddy allocator would be more flexible, but adds complexity for
>      managing splits/merges. The current design with 4 large segments +
>      borrowing provides a good balance:
>      - Simple allocation (round-robin through 4 bufs)
>      - Efficient utilization (borrowing avoids waste)
>      - Predictable performance (no fragmentation)
>
>      Does this clarify the design? Please let me know if you have
>      further questions.
>
>      Thanks,
>
> > The code allocates a buffer big enough for 8 indirect tables in
> > vhost_vdpa_svq_init_indirect, as they can only be svq->vring.num big, so
> > half of it will never be allocated under any circumstance.
> >
> > Now let's say the driver allocates 4 indirect tables (IT). The shared
> > buffer now looks like:
> >
> > [
> >   [1st IT],
> >   [2nd IT],
> >   [3rd IT],
> >   [4th IT],
> >   [free size for at least 4 indirect tables more] ]
> >
> > Now the device starts freeing indirect descriptors and the dance of
> > reallocating could start. The condition for borrowing and returning
> > descriptors may be met at 2*vq_size boundaries. But, by that time, it is
> > guaranteed that you can allocate an indirect table of vq_num descriptors at
> > some position N*vq_num, with N between 0 and 7.
> >
> > Why complicate the code with this dance of resizing
> > svq->indirect.bufs[j], then? Why not divide the whole mapped buffer into 8
> > chunks of vq size and never move them?
> >
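> > (A sketch of that fixed-chunk alternative, with made-up names;
> > allocation reduces to finding any free chunk:)
> >
> >   #define N_CHUNKS 8
> >
> >   /* Returns a chunk index, or -1 if all chunks are in flight.
> >    * Chunk i covers IOVA base + i * vq_size * sizeof(vring_desc_t). */
> >   static int alloc_chunk(bool in_use[N_CHUNKS])
> >   {
> >       for (int i = 0; i < N_CHUNKS; i++) {
> >           if (!in_use[i]) {
> >               in_use[i] = true;
> >               return i;
> >           }
> >       }
> >       return -1;
> >   }
> >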
> > I would prefer to remove the 4 (or any number) indirect tables limitation and
> > use something like the IOVA allocator or the buddy allocator to find
> > contiguous space though.
> >
> > Thanks!
> >
> > > +    return true;
> > > +}
> > > +
> > > +/**
> > > + * Try to get a freed indirect buffer for use
> > > + *
> > > + * @svq: The shadow virtqueue
> > > + * @total_descs: Number of descriptors needed
> > > + *
> > > + * Returns buffer index (0 to SVQ_NUM_INDIRECT_BUFS-1)
> > > + * if available, -1 if none available.
> > > + */
> > > +static int vhost_svq_get_indirect_buf(VhostShadowVirtqueue *svq,
> > > +                                      size_t total_descs)
> > > +{
> > > +    int cur = svq->indirect.current_buf;
> > > +    SVQIndirectDescBuf *buf;
> > > +
> > > +    if (!svq->indirect.enabled) {
> > > +        return -1;
> > > +    }
> > > +
> > > +    if (cur < 0) {
> > > +        cur = 0;
> > > +    }
> > > +    /* Start from current or first buffer, try all buffers in order */
> > > +    for (int i = 0; i < SVQ_NUM_INDIRECT_BUFS; i++) {
> > > +        int idx = (cur + i) % SVQ_NUM_INDIRECT_BUFS;
> > > +        buf = &svq->indirect.bufs[idx];
> > > +
> > > +        if (buf->state != SVQ_INDIRECT_BUF_FREED) {
> > > +            continue;
> > > +        }
> > > +
> > > +        /* Check if we have enough free descriptors */
> > > +        if (buf->freed_descs >= total_descs) {
> > > +            svq->indirect.current_buf = idx;
> > > +            return idx;
> > > +        }
> > > +
> > > +        /* Try to borrow from next buffer */
> > > +        size_t needed = total_descs - buf->freed_descs;
> > > +        if ((buf->freed_descs > 0) &&
> > > +            vhost_svq_borrow_from_next(svq, idx, needed)) {
> > > +            svq->indirect.current_buf = idx + 1;
> > > +            return idx;
> > > +        }
> > > +
> > > +        /* Not enough space, mark as FREEING if it's the current buffer */
> > > +        buf->state = SVQ_INDIRECT_BUF_FREEING;
> > > +    }
> > > +
> > > +    /* All buffers unavailable, fallback to chain mode */
> > > +    return -1;
> > > +}
> > > +
> > > +static int vhost_svq_add_split(VhostShadowVirtqueue *svq,
> > >                                  const struct iovec *out_sg, size_t out_num,
> > >                                  const hwaddr *out_addr,
> > >                                  const struct iovec *in_sg, size_t in_num,
> > > -                                const hwaddr *in_addr, unsigned *head)
> > > +                                const hwaddr *in_addr, unsigned *head,
> > > +                                bool *used_indirect)
> > >  {
> > >      unsigned avail_idx;
> > >      vring_avail_t *avail = svq->vring.avail;
> > >      bool ok;
> > > +    int ret;
> > >      g_autofree hwaddr *sgs = g_new(hwaddr, MAX(out_num, in_num));
> > > +    size_t total_descs = out_num + in_num;
> > > +    int indirect_buf_idx = -1;
> > >
> > >      *head = svq->free_head;
> > > +    *used_indirect = false;
> > >
> > >      /* We need some descriptors here */
> > >      if (unlikely(!out_num && !in_num)) {
> > >          qemu_log_mask(LOG_GUEST_ERROR,
> > >                        "Guest provided element with no descriptors");
> > > -        return false;
> > > +        return -EINVAL;
> > >      }
> > >
> > > -    ok = vhost_svq_vring_write_descs(svq, sgs, out_sg, out_num, out_addr,
> > > -                                     in_num > 0, false);
> > > -    if (unlikely(!ok)) {
> > > -        return false;
> > > +    /* Try to use indirect descriptors if feature is negotiated and total > 1 */
> > > +    if (virtio_vdev_has_feature(svq->vdev, VIRTIO_RING_F_INDIRECT_DESC) &&
> > > +        total_descs > 1) {
> > > +        indirect_buf_idx = vhost_svq_get_indirect_buf(svq, total_descs);
> > >      }
> > >
> > > -    ok = vhost_svq_vring_write_descs(svq, sgs, in_sg, in_num, in_addr, false,
> > > -                                     true);
> > > -    if (unlikely(!ok)) {
> > > -        return false;
> > > +    if (indirect_buf_idx >= 0) {
> > > +        /* Indirect mode: only need 1 main descriptor slot */
> > > +        if (unlikely(vhost_svq_available_slots(svq) < 1)) {
> > > +            return -ENOSPC;
> > > +        }
> > > +
> > > +        /* Use indirect mode */
> > > +        ok = vhost_svq_add_split_indirect(svq, out_sg, out_num, out_addr,
> > > +                                          in_sg, in_num, in_addr,
> > > +                                          sgs, indirect_buf_idx);
> > > +        if (unlikely(!ok)) {
> > > +            error_report("indirect error, out_num %zu in_num %zu "
> > > +                         "avail index %u head %u",
> > > +                         out_num, in_num, svq->shadow_avail_idx, *head);
> > > +            return -EINVAL;
> > > +        }
> > > +        *used_indirect = true;
> > > +    } else {
> > > +        /* Chain mode: need total_descs descriptor slots */
> > > +        if (unlikely(vhost_svq_available_slots(svq) < total_descs)) {
> > > +            return -ENOSPC;
> > > +        }
> > > +
> > > +        /* Use direct (chain) mode */
> > > +        svq->desc_state[svq->free_head].indirect_buf_idx = -1;
> > > +
> > > +        ret = vhost_svq_vring_write_descs(svq, sgs, out_sg, out_num, out_addr,
> > > +                                          svq->vring.desc, svq->free_head, 0,
> > > +                                          in_num > 0, false, false);
> > > +        if (unlikely(ret < 0)) {
> > > +            return -EINVAL;
> > > +        }
> > > +        svq->free_head = ret;
> > > +
> > > +        ret = vhost_svq_vring_write_descs(svq, sgs, in_sg, in_num, in_addr,
> > > +                                          svq->vring.desc, svq->free_head, 0,
> > > +                                          false, true, false);
> > > +        if (unlikely(ret < 0)) {
> > > +            return -EINVAL;
> > > +        }
> > > +        svq->free_head = ret;
> > >      }
> > >
> > >      /*
> > > @@ -233,7 +487,7 @@ static bool vhost_svq_add_split(VhostShadowVirtqueue *svq,
> > >      smp_wmb();
> > >      avail->idx = cpu_to_le16(svq->shadow_avail_idx);
> > >
> > > -    return true;
> > > +    return 0;
> > >  }
> > >
> > >  static void vhost_svq_kick(VhostShadowVirtqueue *svq)
> > > @@ -249,7 +503,8 @@ static void vhost_svq_kick(VhostShadowVirtqueue *svq)
> > >      if (virtio_vdev_has_feature(svq->vdev, VIRTIO_RING_F_EVENT_IDX)) {
> > >          uint16_t avail_event = le16_to_cpu(
> > >                  *(uint16_t *)(&svq->vring.used->ring[svq->vring.num]));
> > > -        needs_kick = vring_need_event(avail_event, svq->shadow_avail_idx, svq->shadow_avail_idx - 1);
> > > +        needs_kick = vring_need_event(avail_event, svq->shadow_avail_idx,
> > > +                                      svq->shadow_avail_idx - 1);
> > >      } else {
> > >          needs_kick =
> > >                  !(svq->vring.used->flags & cpu_to_le16(VRING_USED_F_NO_NOTIFY));
> > > @@ -274,19 +529,21 @@ int vhost_svq_add(VhostShadowVirtqueue *svq, const struct iovec *out_sg,
> > >  {
> > >      unsigned qemu_head;
> > >      unsigned ndescs = in_num + out_num;
> > > -    bool ok;
> > > +    int r;
> > > +    bool used_indirect = false;
> > >
> > > -    if (unlikely(ndescs > vhost_svq_available_slots(svq))) {
> > > -        return -ENOSPC;
> > > +    r = vhost_svq_add_split(svq, out_sg, out_num, out_addr, in_sg, in_num,
> > > +                             in_addr, &qemu_head, &used_indirect);
> > > +    if (unlikely(r != 0)) {
> > > +        return r;
> > >      }
> > >
> > > -    ok = vhost_svq_add_split(svq, out_sg, out_num, out_addr, in_sg, in_num,
> > > -                             in_addr, &qemu_head);
> > > -    if (unlikely(!ok)) {
> > > -        return -EINVAL;
> > > +    /* If using indirect, only 1 main descriptor is used; otherwise ndescs */
> > > +    if (used_indirect) {
> > > +        svq->num_free -= 1;
> > > +    } else {
> > > +        svq->num_free -= ndescs;
> > >      }
> > > -
> > > -    svq->num_free -= ndescs;
> > >      svq->desc_state[qemu_head].elem = elem;
> > >      svq->desc_state[qemu_head].ndescs = ndescs;
> > >      vhost_svq_kick(svq);
> > > --
> > > 2.34.1
> > >
>
>

