Re: [PATCH vhost v6 07/11] virtio_ring: update document for virtqueue_add_*

2023-03-27 Thread Jason Wang
On Mon, Mar 27, 2023 at 12:05 PM Xuan Zhuo  wrote:
>
> Update the documentation of the virtqueue_add_* family of APIs, allowing
> the callers to use sg->dma_address to pass the DMA address to the virtio
> core.
>
> Signed-off-by: Xuan Zhuo 

Acked-by: Jason Wang 

Thanks

> ---
>  drivers/virtio/virtio_ring.c | 16 
>  1 file changed, 16 insertions(+)
>
> diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c
> index d5dffbe50070..0b198ec3ff92 100644
> --- a/drivers/virtio/virtio_ring.c
> +++ b/drivers/virtio/virtio_ring.c
> @@ -2202,6 +2202,10 @@ static inline int virtqueue_add(struct virtqueue *_vq,
>   * Caller must ensure we don't call this with other virtqueue operations
>   * at the same time (except where noted).
>   *
> + * If the caller has done dma map then use sg->dma_address to pass dma address.
> + * If one sg->dma_address is used, then all sgs must use sg->dma_address;
> + * otherwise all sg->dma_address must be NULL.
> + *
>   * Returns zero or a negative error (ie. ENOSPC, ENOMEM, EIO).
>   */
>  int virtqueue_add_sgs(struct virtqueue *_vq,
> @@ -2236,6 +2240,10 @@ EXPORT_SYMBOL_GPL(virtqueue_add_sgs);
>   * Caller must ensure we don't call this with other virtqueue operations
>   * at the same time (except where noted).
>   *
> + * If the caller has done dma map then use sg->dma_address to pass dma address.
> + * If one sg->dma_address is used, then all sgs must use sg->dma_address;
> + * otherwise all sg->dma_address must be NULL.
> + *
>   * Returns zero or a negative error (ie. ENOSPC, ENOMEM, EIO).
>   */
>  int virtqueue_add_outbuf(struct virtqueue *vq,
> @@ -2258,6 +2266,10 @@ EXPORT_SYMBOL_GPL(virtqueue_add_outbuf);
>   * Caller must ensure we don't call this with other virtqueue operations
>   * at the same time (except where noted).
>   *
> + * If the caller has done dma map then use sg->dma_address to pass dma address.
> + * If one sg->dma_address is used, then all sgs must use sg->dma_address;
> + * otherwise all sg->dma_address must be NULL.
> + *
>   * Returns zero or a negative error (ie. ENOSPC, ENOMEM, EIO).
>   */
>  int virtqueue_add_inbuf(struct virtqueue *vq,
> @@ -2281,6 +2293,10 @@ EXPORT_SYMBOL_GPL(virtqueue_add_inbuf);
>   * Caller must ensure we don't call this with other virtqueue operations
>   * at the same time (except where noted).
>   *
> + * If the caller has done dma map then use sg->dma_address to pass dma address.
> + * If one sg->dma_address is used, then all sgs must use sg->dma_address;
> + * otherwise all sg->dma_address must be NULL.
> + *
>   * Returns zero or a negative error (ie. ENOSPC, ENOMEM, EIO).
>   */
>  int virtqueue_add_inbuf_ctx(struct virtqueue *vq,
> --
> 2.32.0.3.g01195cf9f
>
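For illustration, a minimal sketch (not from the patch) of how a driver that
pre-maps its own buffers might use the rule documented above; dev, vq, buf and
len are placeholder names:

	/* Needs <linux/dma-mapping.h>, <linux/scatterlist.h>, <linux/virtio.h>. */
	struct scatterlist sg;
	dma_addr_t addr;
	int err;

	/* The driver performs the DMA mapping itself, up front. */
	addr = dma_map_single(dev, buf, len, DMA_TO_DEVICE);
	if (dma_mapping_error(dev, addr))
		return -ENOMEM;

	sg_init_one(&sg, buf, len);
	sg.dma_address = addr;	/* tell the virtio core this sg is premapped */

	/* Either every sg in a call carries a dma_address, or none of them does. */
	err = virtqueue_add_outbuf(vq, &sg, 1, buf, GFP_ATOMIC);
	if (err)
		dma_unmap_single(dev, addr, len, DMA_TO_DEVICE);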


Re: [PATCH vhost v6 06/11] virtio_ring: packed-indirect: support premapped

2023-03-27 Thread Jason Wang
On Mon, Mar 27, 2023 at 12:05 PM Xuan Zhuo  wrote:
>
> The virtio core only supports virtual addresses; DMA mapping is completed
> inside the virtio core.
>
> In some scenarios (such as AF_XDP), the memory is allocated and the DMA
> mapping is completed in advance, so it is necessary to support passing
> the DMA address to the virtio core.
>
> Drivers can use sg->dma_address to pass the mapped DMA address to the
> virtio core. If one sg->dma_address is used, then all sgs must use
> sg->dma_address; otherwise all dma_address fields must be NULL when
> passing the sgs to the virtio APIs.
>
> Signed-off-by: Xuan Zhuo 

Acked-by: Jason Wang 

Thanks

> ---
>  drivers/virtio/virtio_ring.c | 11 ---
>  1 file changed, 8 insertions(+), 3 deletions(-)
>
> diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c
> index 2afff1dc6c74..d5dffbe50070 100644
> --- a/drivers/virtio/virtio_ring.c
> +++ b/drivers/virtio/virtio_ring.c
> @@ -1338,6 +1338,7 @@ static int virtqueue_add_indirect_packed(struct 
> vring_virtqueue *vq,
> unsigned int i, n;
> u16 head, id;
> dma_addr_t addr;
> +   bool dma_map_internal;
>
> head = vq->packed.next_avail_idx;
> desc = alloc_indirect_packed(total_sg, gfp);
> @@ -1355,7 +1356,8 @@ static int virtqueue_add_indirect_packed(struct 
> vring_virtqueue *vq,
> id = vq->free_head;
> BUG_ON(id == vq->packed.vring.num);
>
> -   if (virtqueue_map_sgs(vq, sgs, total_sg, out_sgs, in_sgs))
> +   dma_map_internal = !sgs[0]->dma_address;
> +   if (dma_map_internal && virtqueue_map_sgs(vq, sgs, total_sg, out_sgs, 
> in_sgs))
> goto err_map;
>
> for (n = 0; n < out_sgs + in_sgs; n++) {
> @@ -1417,6 +1419,8 @@ static int virtqueue_add_indirect_packed(struct 
> vring_virtqueue *vq,
> vq->packed.desc_state[id].data = data;
> vq->packed.desc_state[id].indir_desc = desc;
> vq->packed.desc_state[id].last = id;
> +   vq->packed.desc_state[id].flags = dma_map_internal ? 
> VRING_STATE_F_MAP_INTERNAL : 0;
> +
>
> vq->num_added += 1;
>
> @@ -1426,7 +1430,8 @@ static int virtqueue_add_indirect_packed(struct 
> vring_virtqueue *vq,
> return 0;
>
>  unmap_release:
> -   virtqueue_unmap_sgs(vq, sgs, total_sg, out_sgs, in_sgs);
> +   if (dma_map_internal)
> +   virtqueue_unmap_sgs(vq, sgs, total_sg, out_sgs, in_sgs);
>
>  err_map:
> kfree(desc);
> @@ -1653,7 +1658,7 @@ static void detach_buf_packed(struct vring_virtqueue 
> *vq,
> if (!desc)
> return;
>
> -   if (vq->use_dma_api) {
> +   if (vq->use_dma_api && dma_map_internal) {
> len = vq->packed.desc_extra[id].len;
> for (i = 0; i < len / sizeof(struct 
> vring_packed_desc);
> i++)
> --
> 2.32.0.3.g01195cf9f
>


Re: [PATCH vhost v6 05/11] virtio_ring: packed: support premapped

2023-03-27 Thread Jason Wang
On Mon, Mar 27, 2023 at 12:05 PM Xuan Zhuo  wrote:
>
> The virtio core only supports virtual addresses; DMA mapping is completed
> inside the virtio core.
>
> In some scenarios (such as AF_XDP), the memory is allocated and the DMA
> mapping is completed in advance, so it is necessary to support passing
> the DMA address to the virtio core.
>
> Drivers can use sg->dma_address to pass the mapped DMA address to the
> virtio core. If one sg->dma_address is used, then all sgs must use
> sg->dma_address; otherwise all dma_address fields must be NULL when
> passing the sgs to the virtio APIs.
>
> Signed-off-by: Xuan Zhuo 

Acked-by: Jason Wang 

Thanks

> ---
>  drivers/virtio/virtio_ring.c | 17 ++---
>  1 file changed, 14 insertions(+), 3 deletions(-)
>
> diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c
> index 9d6acd59e3e0..2afff1dc6c74 100644
> --- a/drivers/virtio/virtio_ring.c
> +++ b/drivers/virtio/virtio_ring.c
> @@ -81,6 +81,7 @@ struct vring_desc_state_packed {
> struct vring_packed_desc *indir_desc; /* Indirect descriptor, if any. 
> */
> u16 num;/* Descriptor list length. */
> u16 last;   /* The last desc state in a list. */
> +   u32 flags;  /* State flags. */
>  };
>
>  struct vring_desc_extra {
> @@ -1264,7 +1265,8 @@ static inline u16 packed_last_used(u16 last_used_idx)
>  }
>
>  static void vring_unmap_extra_packed(const struct vring_virtqueue *vq,
> -struct vring_desc_extra *extra)
> +struct vring_desc_extra *extra,
> +bool dma_map_internal)
>  {
> u16 flags;
>
> @@ -1279,6 +1281,9 @@ static void vring_unmap_extra_packed(const struct 
> vring_virtqueue *vq,
>  (flags & VRING_DESC_F_WRITE) ?
>  DMA_FROM_DEVICE : DMA_TO_DEVICE);
> } else {
> +   if (!dma_map_internal)
> +   return;
> +
> dma_unmap_page(vring_dma_dev(vq),
>extra->addr, extra->len,
>(flags & VRING_DESC_F_WRITE) ?
> @@ -1445,6 +1450,7 @@ static inline int virtqueue_add_packed(struct virtqueue 
> *_vq,
> unsigned int i, n, c, descs_used;
> __le16 head_flags, flags;
> u16 head, id, prev, curr;
> +   bool dma_map_internal;
> int err;
>
> START_USE(vq);
> @@ -1490,7 +1496,8 @@ static inline int virtqueue_add_packed(struct virtqueue 
> *_vq,
> id = vq->free_head;
> BUG_ON(id == vq->packed.vring.num);
>
> -   if (virtqueue_map_sgs(vq, sgs, total_sg, out_sgs, in_sgs)) {
> +   dma_map_internal = !sgs[0]->dma_address;
> +   if (dma_map_internal && virtqueue_map_sgs(vq, sgs, total_sg, out_sgs, 
> in_sgs)) {
> END_USE(vq);
> return -EIO;
> }
> @@ -1544,6 +1551,7 @@ static inline int virtqueue_add_packed(struct virtqueue 
> *_vq,
> vq->packed.desc_state[id].data = data;
> vq->packed.desc_state[id].indir_desc = ctx;
> vq->packed.desc_state[id].last = prev;
> +   vq->packed.desc_state[id].flags = dma_map_internal ? 
> VRING_STATE_F_MAP_INTERNAL : 0;
>
> /*
>  * A driver MUST NOT make the first descriptor in the list
> @@ -1615,8 +1623,10 @@ static void detach_buf_packed(struct vring_virtqueue 
> *vq,
> struct vring_desc_state_packed *state = NULL;
> struct vring_packed_desc *desc;
> unsigned int i, curr;
> +   bool dma_map_internal;
>
> state = &vq->packed.desc_state[id];
> +   dma_map_internal = !!(state->flags & VRING_STATE_F_MAP_INTERNAL);
>
> /* Clear data ptr. */
> state->data = NULL;
> @@ -1629,7 +1639,8 @@ static void detach_buf_packed(struct vring_virtqueue 
> *vq,
> curr = id;
> for (i = 0; i < state->num; i++) {
> vring_unmap_extra_packed(vq,
> -
> &vq->packed.desc_extra[curr]);
> +&vq->packed.desc_extra[curr],
> +dma_map_internal);
> curr = vq->packed.desc_extra[curr].next;
> }
> }
> --
> 2.32.0.3.g01195cf9f
>


Re: [PATCH vhost v6 04/11] virtio_ring: split: support premapped

2023-03-27 Thread Jason Wang
On Mon, Mar 27, 2023 at 12:05 PM Xuan Zhuo  wrote:
>
> The virtio core only supports virtual addresses; DMA mapping is completed
> inside the virtio core.
>
> In some scenarios (such as AF_XDP), the memory is allocated and the DMA
> mapping is completed in advance, so it is necessary to support passing
> the DMA address to the virtio core.
>
> Drivers can use sg->dma_address to pass the mapped DMA address to the
> virtio core. If one sg->dma_address is used, then all sgs must use
> sg->dma_address; otherwise all dma_address fields must be NULL when
> passing the sgs to the virtio APIs.
>
> Signed-off-by: Xuan Zhuo 

Acked-by: Jason Wang 

Thanks

> ---
>  drivers/virtio/virtio_ring.c | 31 +++
>  1 file changed, 23 insertions(+), 8 deletions(-)
>
> diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c
> index 3ada30b475d2..9d6acd59e3e0 100644
> --- a/drivers/virtio/virtio_ring.c
> +++ b/drivers/virtio/virtio_ring.c
> @@ -67,9 +67,13 @@
>  #define LAST_ADD_TIME_INVALID(vq)
>  #endif
>
> +#define VRING_STATE_F_MAP_INTERNAL BIT(0)
> +
>  struct vring_desc_state_split {
> void *data; /* Data for callback. */
> struct vring_desc *indir_desc;  /* Indirect descriptor, if any. */
> +   u32 flags;  /* State flags. */
> +   u32 padding;
>  };
>
>  struct vring_desc_state_packed {
> @@ -448,7 +452,7 @@ static void vring_unmap_one_split_indirect(const struct 
> vring_virtqueue *vq,
>  }
>
>  static unsigned int vring_unmap_one_split(const struct vring_virtqueue *vq,
> - unsigned int i)
> + unsigned int i, bool 
> dma_map_internal)
>  {
> struct vring_desc_extra *extra = vq->split.desc_extra;
> u16 flags;
> @@ -465,6 +469,9 @@ static unsigned int vring_unmap_one_split(const struct 
> vring_virtqueue *vq,
>  (flags & VRING_DESC_F_WRITE) ?
>  DMA_FROM_DEVICE : DMA_TO_DEVICE);
> } else {
> +   if (!dma_map_internal)
> +   goto out;
> +
> dma_unmap_page(vring_dma_dev(vq),
>extra[i].addr,
>extra[i].len,
> @@ -615,7 +622,7 @@ static inline int virtqueue_add_split(struct virtqueue 
> *_vq,
> struct scatterlist *sg;
> struct vring_desc *desc;
> unsigned int i, n, avail, descs_used, prev;
> -   bool indirect;
> +   bool indirect, dma_map_internal;
> int head;
>
> START_USE(vq);
> @@ -668,7 +675,8 @@ static inline int virtqueue_add_split(struct virtqueue 
> *_vq,
> return -ENOSPC;
> }
>
> -   if (virtqueue_map_sgs(vq, sgs, total_sg, out_sgs, in_sgs))
> +   dma_map_internal = !sgs[0]->dma_address;
> +   if (dma_map_internal && virtqueue_map_sgs(vq, sgs, total_sg, out_sgs, 
> in_sgs))
> goto err_map;
>
> for (n = 0; n < out_sgs; n++) {
> @@ -735,6 +743,8 @@ static inline int virtqueue_add_split(struct virtqueue 
> *_vq,
> else
> vq->split.desc_state[head].indir_desc = ctx;
>
> +   vq->split.desc_state[head].flags = dma_map_internal ? 
> VRING_STATE_F_MAP_INTERNAL : 0;
> +
> /* Put entry in available array (but don't update avail->idx until 
> they
>  * do sync). */
> avail = vq->split.avail_idx_shadow & (vq->split.vring.num - 1);
> @@ -759,7 +769,8 @@ static inline int virtqueue_add_split(struct virtqueue 
> *_vq,
> return 0;
>
>  unmap_release:
> -   virtqueue_unmap_sgs(vq, sgs, total_sg, out_sgs, in_sgs);
> +   if (dma_map_internal)
> +   virtqueue_unmap_sgs(vq, sgs, total_sg, out_sgs, in_sgs);
>
>  err_map:
> if (indirect)
> @@ -805,20 +816,22 @@ static void detach_buf_split(struct vring_virtqueue 
> *vq, unsigned int head,
>  {
> unsigned int i, j;
> __virtio16 nextflag = cpu_to_virtio16(vq->vq.vdev, VRING_DESC_F_NEXT);
> +   bool dma_map_internal;
>
> /* Clear data ptr. */
> vq->split.desc_state[head].data = NULL;
> +   dma_map_internal = !!(vq->split.desc_state[head].flags & 
> VRING_STATE_F_MAP_INTERNAL);
>
> /* Put back on free list: unmap first-level descriptors and find end 
> */
> i = head;
>
> while (vq->split.vring.desc[i].flags & nextflag) {
> -   vring_unmap_one_split(vq, i);
> +   vring_unmap_one_split(vq, i, dma_map_internal);
> i = vq->split.desc_extra[i].next;
> vq->vq.num_free++;
> }
>
> -   vring_unmap_one_split(vq, i);
> +   vring_unmap_one_split(vq, i, dma_map_internal);
> vq->split.desc_extra[i].next = vq->free_head;
> vq->free_head = head;
>
> @@ -840,8 +853,10 @@ static void detach_buf_split(struct vring_virtqueue *vq, 
> unsigned int head,
> VRING_DESC_F_INDIRECT));
> BUG_ON(l

Re: [PATCH vhost v6 01/11] virtio_ring: split: separate dma codes

2023-03-27 Thread Jason Wang
On Mon, Mar 27, 2023 at 12:05 PM Xuan Zhuo  wrote:
>
> DMA-related logic is separated out of virtqueue_add_split() into a new
> function. The DMA address is saved as sg->dma_address if use_dma_api is
> true, and virtqueue_add_split() then uses it directly, which also makes
> the unmap operation simpler.
>
> The purpose of this is to facilitate subsequent support for receiving
> DMA addresses mapped by drivers.
>
> Signed-off-by: Xuan Zhuo 

Acked-by: Jason Wang 

Thanks

> ---
>  drivers/virtio/virtio_ring.c | 122 +++
>  1 file changed, 94 insertions(+), 28 deletions(-)
>
> diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c
> index 41144b5246a8..2aafb7da793d 100644
> --- a/drivers/virtio/virtio_ring.c
> +++ b/drivers/virtio/virtio_ring.c
> @@ -379,6 +379,14 @@ static dma_addr_t vring_map_one_sg(const struct 
> vring_virtqueue *vq,
> direction);
>  }
>
> +static dma_addr_t vring_sg_address(struct scatterlist *sg)
> +{
> +   if (sg->dma_address)
> +   return sg->dma_address;
> +
> +   return (dma_addr_t)sg_phys(sg);
> +}
> +
>  static dma_addr_t vring_map_single(const struct vring_virtqueue *vq,
>void *cpu_addr, size_t size,
>enum dma_data_direction direction)
> @@ -520,6 +528,80 @@ static inline unsigned int 
> virtqueue_add_desc_split(struct virtqueue *vq,
> return next;
>  }
>
> +static void virtqueue_unmap_sgs(struct vring_virtqueue *vq,
> +   struct scatterlist *sgs[],
> +   unsigned int total_sg,
> +   unsigned int out_sgs,
> +   unsigned int in_sgs)
> +{
> +   struct scatterlist *sg;
> +   unsigned int n;
> +
> +   if (!vq->use_dma_api)
> +   return;
> +
> +   for (n = 0; n < out_sgs; n++) {
> +   for (sg = sgs[n]; sg; sg = sg_next(sg)) {
> +   if (!sg->dma_address)
> +   return;
> +
> +   dma_unmap_page(vring_dma_dev(vq), sg->dma_address,
> +  sg->length, DMA_TO_DEVICE);
> +   }
> +   }
> +
> +   for (; n < (out_sgs + in_sgs); n++) {
> +   for (sg = sgs[n]; sg; sg = sg_next(sg)) {
> +   if (!sg->dma_address)
> +   return;
> +
> +   dma_unmap_page(vring_dma_dev(vq), sg->dma_address,
> +  sg->length, DMA_FROM_DEVICE);
> +   }
> +   }
> +}
> +
> +static int virtqueue_map_sgs(struct vring_virtqueue *vq,
> +struct scatterlist *sgs[],
> +unsigned int total_sg,
> +unsigned int out_sgs,
> +unsigned int in_sgs)
> +{
> +   struct scatterlist *sg;
> +   unsigned int n;
> +
> +   if (!vq->use_dma_api)
> +   return 0;
> +
> +   for (n = 0; n < out_sgs; n++) {
> +   for (sg = sgs[n]; sg; sg = sg_next(sg)) {
> +   dma_addr_t addr = vring_map_one_sg(vq, sg, 
> DMA_TO_DEVICE);
> +
> +   if (vring_mapping_error(vq, addr))
> +   goto err;
> +
> +   sg->dma_address = addr;
> +   }
> +   }
> +
> +   for (; n < (out_sgs + in_sgs); n++) {
> +   for (sg = sgs[n]; sg; sg = sg_next(sg)) {
> +   dma_addr_t addr = vring_map_one_sg(vq, sg, 
> DMA_FROM_DEVICE);
> +
> +   if (vring_mapping_error(vq, addr))
> +   goto err;
> +
> +   sg->dma_address = addr;
> +   }
> +   }
> +
> +   return 0;
> +
> +err:
> +   virtqueue_unmap_sgs(vq, sgs, total_sg, out_sgs, in_sgs);
> +   return -ENOMEM;
> +}
> +
>  static inline int virtqueue_add_split(struct virtqueue *_vq,
>   struct scatterlist *sgs[],
>   unsigned int total_sg,
> @@ -532,9 +614,9 @@ static inline int virtqueue_add_split(struct virtqueue 
> *_vq,
> struct vring_virtqueue *vq = to_vvq(_vq);
> struct scatterlist *sg;
> struct vring_desc *desc;
> -   unsigned int i, n, avail, descs_used, prev, err_idx;
> -   int head;
> +   unsigned int i, n, avail, descs_used, prev;
> bool indirect;
> +   int head;
>
> START_USE(vq);
>
> @@ -586,32 +668,30 @@ static inline int virtqueue_add_split(struct virtqueue 
> *_vq,
> return -ENOSPC;
> }
>
> +   if (virtqueue_map_sgs(vq, sgs, total_sg, out_sgs, in_sgs))
> +   goto err_map;
> +
> for (n = 0; n < out_sgs; n++) {
> for (sg = sgs[n]; sg; sg = sg_next(sg)) {
> -   dma_addr_t addr = vring_map_on

Re: [PATCH vhost v4 01/11] virtio_ring: split: separate dma codes

2023-03-27 Thread Jason Wang
On Tue, Mar 28, 2023 at 2:24 PM Jason Wang  wrote:
>
>
> On 2023/3/22 10:56, Xuan Zhuo wrote:
> > DMA-related logic is separated out of virtqueue_add_split() into a new
> > function. The DMA address is saved as sg->dma_address if use_dma_api is
> > true, and virtqueue_add_split() then uses it directly, which also makes
> > the unmap operation simpler.
> >
> > The purpose of this is to facilitate subsequent support for receiving
> > DMA addresses mapped by drivers.
> >
> > Signed-off-by: Xuan Zhuo 
>
>
> Acked-by: Jason Wang 
>
> Thanks

Please ignore this, hit the button accidentally.

Thanks

>
>
> > ---
> >   drivers/virtio/virtio_ring.c | 121 +++
> >   1 file changed, 93 insertions(+), 28 deletions(-)
> >
> > diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c
> > index 41144b5246a8..fe704ca6c813 100644
> > --- a/drivers/virtio/virtio_ring.c
> > +++ b/drivers/virtio/virtio_ring.c
> > @@ -379,6 +379,14 @@ static dma_addr_t vring_map_one_sg(const struct 
> > vring_virtqueue *vq,
> >   direction);
> >   }
> >
> > +static dma_addr_t vring_sg_address(struct scatterlist *sg)
> > +{
> > + if (sg->dma_address)
> > + return sg->dma_address;
> > +
> > + return (dma_addr_t)sg_phys(sg);
> > +}
> > +
> >   static dma_addr_t vring_map_single(const struct vring_virtqueue *vq,
> >  void *cpu_addr, size_t size,
> >  enum dma_data_direction direction)
> > @@ -520,6 +528,80 @@ static inline unsigned int 
> > virtqueue_add_desc_split(struct virtqueue *vq,
> >   return next;
> >   }
> >
> > +static void virtqueue_unmap_sgs(struct vring_virtqueue *vq,
> > + struct scatterlist *sgs[],
> > + unsigned int total_sg,
> > + unsigned int out_sgs,
> > + unsigned int in_sgs)
> > +{
> > + struct scatterlist *sg;
> > + unsigned int n;
> > +
> > + if (!vq->use_dma_api)
> > + return;
> > +
> > + for (n = 0; n < out_sgs; n++) {
> > + for (sg = sgs[n]; sg; sg = sg_next(sg)) {
> > + if (!sg->dma_address)
> > + return;
> > +
> > + dma_unmap_page(vring_dma_dev(vq), sg->dma_address,
> > +sg->length, DMA_TO_DEVICE);
> > + }
> > + }
> > +
> > + for (; n < (out_sgs + in_sgs); n++) {
> > + for (sg = sgs[n]; sg; sg = sg_next(sg)) {
> > + if (!sg->dma_address)
> > + return;
> > +
> > + dma_unmap_page(vring_dma_dev(vq), sg->dma_address,
> > +sg->length, DMA_FROM_DEVICE);
> > + }
> > + }
> > +}
> > +
> > +static int virtqueue_map_sgs(struct vring_virtqueue *vq,
> > +  struct scatterlist *sgs[],
> > +  unsigned int total_sg,
> > +  unsigned int out_sgs,
> > +  unsigned int in_sgs)
> > +{
> > + struct scatterlist *sg;
> > + unsigned int n;
> > +
> > + if (!vq->use_dma_api)
> > + return 0;
> > +
> > + for (n = 0; n < out_sgs; n++) {
> > + for (sg = sgs[n]; sg; sg = sg_next(sg)) {
> > + dma_addr_t addr = vring_map_one_sg(vq, sg, 
> > DMA_TO_DEVICE);
> > +
> > + if (vring_mapping_error(vq, addr))
> > + goto err;
> > +
> > + sg->dma_address = addr;
> > + }
> > + }
> > +
> > + for (; n < (out_sgs + in_sgs); n++) {
> > + for (sg = sgs[n]; sg; sg = sg_next(sg)) {
> > + dma_addr_t addr = vring_map_one_sg(vq, sg, 
> > DMA_FROM_DEVICE);
> > +
> > + if (vring_mapping_error(vq, addr))
> > + goto err;
> > +
> > + sg->dma_address = addr;
> > + }
> > + }
> > +
> > + return 0;
> > +
> > +err:
> > + virtqueue_unmap_sgs(vq, sgs, total_sg, out_sgs, in_sgs);
> > + return -ENOMEM;
> > +}
> > +
> >   static inline int virtqueue_add_split(struct virtqueue *_vq,
> > struct scatterlist *sgs[],
> > unsigned int total_sg,
> > @@ -532,9 +614,9 @@ static inline int virtqueue_add_split(struct virtqueue 
> > *_vq,
> >   struct vring_virtqueue *vq = to_vvq(_vq);
> >   struct scatterlist *sg;
> >   struct vring_desc *desc;
> > - unsigned int i, n, avail, descs_used, prev, err_idx;
> > - int head;
> > + unsigned int i, n, avail, descs_used, prev;
> >   bool indirect;
> > + int head;
> >
> >   START_USE(vq);
> >
> > @@ -586,32 +668,30 @@ static inline int virtqueue_add_split(struct 
> > virtqueue *_vq,
> >   return -ENOSPC;
> >   }
> >
> > + if (virtqueu

Re: [PATCH vhost v4 01/11] virtio_ring: split: separate dma codes

2023-03-27 Thread Jason Wang


On 2023/3/22 10:56, Xuan Zhuo wrote:

DMA-related logic is separated out of virtqueue_add_split() into a new
function. The DMA address is saved as sg->dma_address if use_dma_api is
true, and virtqueue_add_split() then uses it directly, which also makes
the unmap operation simpler.

The purpose of this is to facilitate subsequent support for receiving
DMA addresses mapped by drivers.

Signed-off-by: Xuan Zhuo 



Acked-by: Jason Wang 

Thanks



---
  drivers/virtio/virtio_ring.c | 121 +++
  1 file changed, 93 insertions(+), 28 deletions(-)

diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c
index 41144b5246a8..fe704ca6c813 100644
--- a/drivers/virtio/virtio_ring.c
+++ b/drivers/virtio/virtio_ring.c
@@ -379,6 +379,14 @@ static dma_addr_t vring_map_one_sg(const struct 
vring_virtqueue *vq,
direction);
  }
  
+static dma_addr_t vring_sg_address(struct scatterlist *sg)

+{
+   if (sg->dma_address)
+   return sg->dma_address;
+
+   return (dma_addr_t)sg_phys(sg);
+}
+
  static dma_addr_t vring_map_single(const struct vring_virtqueue *vq,
   void *cpu_addr, size_t size,
   enum dma_data_direction direction)
@@ -520,6 +528,80 @@ static inline unsigned int virtqueue_add_desc_split(struct 
virtqueue *vq,
return next;
  }
  
+static void virtqueue_unmap_sgs(struct vring_virtqueue *vq,

+   struct scatterlist *sgs[],
+   unsigned int total_sg,
+   unsigned int out_sgs,
+   unsigned int in_sgs)
+{
+   struct scatterlist *sg;
+   unsigned int n;
+
+   if (!vq->use_dma_api)
+   return;
+
+   for (n = 0; n < out_sgs; n++) {
+   for (sg = sgs[n]; sg; sg = sg_next(sg)) {
+   if (!sg->dma_address)
+   return;
+
+   dma_unmap_page(vring_dma_dev(vq), sg->dma_address,
+  sg->length, DMA_TO_DEVICE);
+   }
+   }
+
+   for (; n < (out_sgs + in_sgs); n++) {
+   for (sg = sgs[n]; sg; sg = sg_next(sg)) {
+   if (!sg->dma_address)
+   return;
+
+   dma_unmap_page(vring_dma_dev(vq), sg->dma_address,
+  sg->length, DMA_FROM_DEVICE);
+   }
+   }
+}
+
+static int virtqueue_map_sgs(struct vring_virtqueue *vq,
+struct scatterlist *sgs[],
+unsigned int total_sg,
+unsigned int out_sgs,
+unsigned int in_sgs)
+{
+   struct scatterlist *sg;
+   unsigned int n;
+
+   if (!vq->use_dma_api)
+   return 0;
+
+   for (n = 0; n < out_sgs; n++) {
+   for (sg = sgs[n]; sg; sg = sg_next(sg)) {
+   dma_addr_t addr = vring_map_one_sg(vq, sg, 
DMA_TO_DEVICE);
+
+   if (vring_mapping_error(vq, addr))
+   goto err;
+
+   sg->dma_address = addr;
+   }
+   }
+
+   for (; n < (out_sgs + in_sgs); n++) {
+   for (sg = sgs[n]; sg; sg = sg_next(sg)) {
+   dma_addr_t addr = vring_map_one_sg(vq, sg, 
DMA_FROM_DEVICE);
+
+   if (vring_mapping_error(vq, addr))
+   goto err;
+
+   sg->dma_address = addr;
+   }
+   }
+
+   return 0;
+
+err:
+   virtqueue_unmap_sgs(vq, sgs, total_sg, out_sgs, in_sgs);
+   return -ENOMEM;
+}
+
  static inline int virtqueue_add_split(struct virtqueue *_vq,
  struct scatterlist *sgs[],
  unsigned int total_sg,
@@ -532,9 +614,9 @@ static inline int virtqueue_add_split(struct virtqueue *_vq,
struct vring_virtqueue *vq = to_vvq(_vq);
struct scatterlist *sg;
struct vring_desc *desc;
-   unsigned int i, n, avail, descs_used, prev, err_idx;
-   int head;
+   unsigned int i, n, avail, descs_used, prev;
bool indirect;
+   int head;
  
  	START_USE(vq);
  
@@ -586,32 +668,30 @@ static inline int virtqueue_add_split(struct virtqueue *_vq,

return -ENOSPC;
}
  
+	if (virtqueue_map_sgs(vq, sgs, total_sg, out_sgs, in_sgs))

+   return -ENOMEM;
+
for (n = 0; n < out_sgs; n++) {
for (sg = sgs[n]; sg; sg = sg_next(sg)) {
-   dma_addr_t addr = vring_map_one_sg(vq, sg, 
DMA_TO_DEVICE);
-   if (vring_mapping_error(vq, addr))
-   goto unmap_release;
-
prev = i;
/* Note that we trust indirect descriptor
 * table si

Re: [PATCH v4 08/11] vdpa: Add eventfd for the vdpa callback

2023-03-27 Thread Jason Wang


On 2023/3/23 13:30, Xie Yongji wrote:

Add an eventfd for the vdpa callback so that userspace can signal it
directly instead of triggering the callback. It will be used for the
vhost-vdpa case.

Signed-off-by: Xie Yongji 



Acked-by: Jason Wang 

Thanks



---
  drivers/vhost/vdpa.c | 2 ++
  drivers/virtio/virtio_vdpa.c | 1 +
  include/linux/vdpa.h | 6 ++
  3 files changed, 9 insertions(+)

diff --git a/drivers/vhost/vdpa.c b/drivers/vhost/vdpa.c
index 7be9d9d8f01c..9cd878e25cff 100644
--- a/drivers/vhost/vdpa.c
+++ b/drivers/vhost/vdpa.c
@@ -599,9 +599,11 @@ static long vhost_vdpa_vring_ioctl(struct vhost_vdpa *v, 
unsigned int cmd,
if (vq->call_ctx.ctx) {
cb.callback = vhost_vdpa_virtqueue_cb;
cb.private = vq;
+   cb.trigger = vq->call_ctx.ctx;
} else {
cb.callback = NULL;
cb.private = NULL;
+   cb.trigger = NULL;
}
ops->set_vq_cb(vdpa, idx, &cb);
vhost_vdpa_setup_vq_irq(v, idx);
diff --git a/drivers/virtio/virtio_vdpa.c b/drivers/virtio/virtio_vdpa.c
index f3826f42b704..2a095f37f26b 100644
--- a/drivers/virtio/virtio_vdpa.c
+++ b/drivers/virtio/virtio_vdpa.c
@@ -196,6 +196,7 @@ virtio_vdpa_setup_vq(struct virtio_device *vdev, unsigned 
int index,
/* Setup virtqueue callback */
cb.callback = callback ? virtio_vdpa_virtqueue_cb : NULL;
cb.private = info;
+   cb.trigger = NULL;
ops->set_vq_cb(vdpa, index, &cb);
ops->set_vq_num(vdpa, index, virtqueue_get_vring_size(vq));
  
diff --git a/include/linux/vdpa.h b/include/linux/vdpa.h

index e52c9a37999c..0372b2e3d38a 100644
--- a/include/linux/vdpa.h
+++ b/include/linux/vdpa.h
@@ -13,10 +13,16 @@
   * struct vdpa_calllback - vDPA callback definition.
   * @callback: interrupt callback function
   * @private: the data passed to the callback function
+ * @trigger: the eventfd for the callback (Optional).
+ *   When it is set, the vDPA driver must guarantee that
+ *   signaling it is functionally equivalent to triggering
+ *   the callback. Then vDPA parent can signal it directly
+ *   instead of triggering the callback.
   */
  struct vdpa_callback {
irqreturn_t (*callback)(void *data);
void *private;
+   struct eventfd_ctx *trigger;
  };
  
  /**
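For illustration, a rough sketch (not part of this patch) of how a vDPA parent
could use the new trigger field when injecting a virtqueue interrupt; cb is
assumed to be the stored struct vdpa_callback:

	/* Prefer signaling the eventfd directly when one was provided. */
	if (cb->trigger)
		eventfd_signal(cb->trigger, 1);
	else if (cb->callback)
		cb->callback(cb->private);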



Re: [PATCH v4 07/11] vduse: Add sysfs interface for irq callback affinity

2023-03-27 Thread Jason Wang


On 2023/3/23 13:30, Xie Yongji wrote:

Add a sysfs interface for each vduse virtqueue to get/set the affinity
of its irq callback. This might be useful for performance tuning when
the irq callback affinity mask contains more than one CPU.

Signed-off-by: Xie Yongji 



Acked-by: Jason Wang 

Thanks



---
  drivers/vdpa/vdpa_user/vduse_dev.c | 124 ++---
  1 file changed, 113 insertions(+), 11 deletions(-)

diff --git a/drivers/vdpa/vdpa_user/vduse_dev.c 
b/drivers/vdpa/vdpa_user/vduse_dev.c
index cefabd0dab9c..77da3685568a 100644
--- a/drivers/vdpa/vdpa_user/vduse_dev.c
+++ b/drivers/vdpa/vdpa_user/vduse_dev.c
@@ -61,6 +61,7 @@ struct vduse_virtqueue {
struct work_struct kick;
int irq_effective_cpu;
struct cpumask irq_affinity;
+   struct kobject kobj;
  };
  
  struct vduse_dev;

@@ -1387,6 +1388,96 @@ static const struct file_operations vduse_dev_fops = {
.llseek = noop_llseek,
  };
  
+static ssize_t irq_cb_affinity_show(struct vduse_virtqueue *vq, char *buf)

+{
+   return sprintf(buf, "%*pb\n", cpumask_pr_args(&vq->irq_affinity));
+}
+
+static ssize_t irq_cb_affinity_store(struct vduse_virtqueue *vq,
+const char *buf, size_t count)
+{
+   cpumask_var_t new_value;
+   int ret;
+
+   if (!zalloc_cpumask_var(&new_value, GFP_KERNEL))
+   return -ENOMEM;
+
+   ret = cpumask_parse(buf, new_value);
+   if (ret)
+   goto free_mask;
+
+   ret = -EINVAL;
+   if (!cpumask_intersects(new_value, cpu_online_mask))
+   goto free_mask;
+
+   cpumask_copy(&vq->irq_affinity, new_value);
+   ret = count;
+free_mask:
+   free_cpumask_var(new_value);
+   return ret;
+}
+
+struct vq_sysfs_entry {
+   struct attribute attr;
+   ssize_t (*show)(struct vduse_virtqueue *vq, char *buf);
+   ssize_t (*store)(struct vduse_virtqueue *vq, const char *buf,
+size_t count);
+};
+
+static struct vq_sysfs_entry irq_cb_affinity_attr = __ATTR_RW(irq_cb_affinity);
+
+static struct attribute *vq_attrs[] = {
+   &irq_cb_affinity_attr.attr,
+   NULL,
+};
+ATTRIBUTE_GROUPS(vq);
+
+static ssize_t vq_attr_show(struct kobject *kobj, struct attribute *attr,
+   char *buf)
+{
+   struct vduse_virtqueue *vq = container_of(kobj,
+   struct vduse_virtqueue, kobj);
+   struct vq_sysfs_entry *entry = container_of(attr,
+   struct vq_sysfs_entry, attr);
+
+   if (!entry->show)
+   return -EIO;
+
+   return entry->show(vq, buf);
+}
+
+static ssize_t vq_attr_store(struct kobject *kobj, struct attribute *attr,
+const char *buf, size_t count)
+{
+   struct vduse_virtqueue *vq = container_of(kobj,
+   struct vduse_virtqueue, kobj);
+   struct vq_sysfs_entry *entry = container_of(attr,
+   struct vq_sysfs_entry, attr);
+
+   if (!entry->store)
+   return -EIO;
+
+   return entry->store(vq, buf, count);
+}
+
+static const struct sysfs_ops vq_sysfs_ops = {
+   .show = vq_attr_show,
+   .store = vq_attr_store,
+};
+
+static void vq_release(struct kobject *kobj)
+{
+   struct vduse_virtqueue *vq = container_of(kobj,
+   struct vduse_virtqueue, kobj);
+   kfree(vq);
+}
+
+static const struct kobj_type vq_type = {
+   .release= vq_release,
+   .sysfs_ops  = &vq_sysfs_ops,
+   .default_groups = vq_groups,
+};
+
  static void vduse_dev_deinit_vqs(struct vduse_dev *dev)
  {
int i;
@@ -1395,13 +1486,13 @@ static void vduse_dev_deinit_vqs(struct vduse_dev *dev)
return;
  
  	for (i = 0; i < dev->vq_num; i++)

-   kfree(dev->vqs[i]);
+   kobject_put(&dev->vqs[i]->kobj);
kfree(dev->vqs);
  }
  
  static int vduse_dev_init_vqs(struct vduse_dev *dev, u32 vq_align, u32 vq_num)

  {
-   int i;
+   int ret, i;
  
  	dev->vq_align = vq_align;

dev->vq_num = vq_num;
@@ -1411,8 +1502,10 @@ static int vduse_dev_init_vqs(struct vduse_dev *dev, u32 
vq_align, u32 vq_num)
  
  	for (i = 0; i < vq_num; i++) {

dev->vqs[i] = kzalloc(sizeof(*dev->vqs[i]), GFP_KERNEL);
-   if (!dev->vqs[i])
+   if (!dev->vqs[i]) {
+   ret = -ENOMEM;
goto err;
+   }
  
  		dev->vqs[i]->index = i;

dev->vqs[i]->irq_effective_cpu = IRQ_UNBOUND;
@@ -1421,15 +1514,23 @@ static int vduse_dev_init_vqs(struct vduse_dev *dev, 
u32 vq_align, u32 vq_num)
spin_lock_init(&dev->vqs[i]->kick_lock);
spin_lock_init(&dev->vqs[i]->irq_lock);
cpumask_setall(&dev->vqs[i]->irq_affinity);
+
+   kobject_init(&dev->vqs[i]->kobj, &vq_type);
+   

Re: [PATCH v4 06/11] vduse: Support get_vq_affinity callback

2023-03-27 Thread Jason Wang


On 2023/3/23 13:30, Xie Yongji wrote:

This implements the get_vq_affinity callback so that the virtio-blk
driver can build the blk-mq queues based on the irq callback affinity.

Signed-off-by: Xie Yongji 



Acked-by: Jason Wang 

Thanks



---
  drivers/vdpa/vdpa_user/vduse_dev.c | 9 +
  1 file changed, 9 insertions(+)

diff --git a/drivers/vdpa/vdpa_user/vduse_dev.c 
b/drivers/vdpa/vdpa_user/vduse_dev.c
index 45aa8703c4b5..cefabd0dab9c 100644
--- a/drivers/vdpa/vdpa_user/vduse_dev.c
+++ b/drivers/vdpa/vdpa_user/vduse_dev.c
@@ -722,6 +722,14 @@ static int vduse_vdpa_set_vq_affinity(struct vdpa_device 
*vdpa, u16 idx,
return 0;
  }
  
+static const struct cpumask *

+vduse_vdpa_get_vq_affinity(struct vdpa_device *vdpa, u16 idx)
+{
+   struct vduse_dev *dev = vdpa_to_vduse(vdpa);
+
+   return &dev->vqs[idx]->irq_affinity;
+}
+
  static int vduse_vdpa_set_map(struct vdpa_device *vdpa,
unsigned int asid,
struct vhost_iotlb *iotlb)
@@ -773,6 +781,7 @@ static const struct vdpa_config_ops vduse_vdpa_config_ops = 
{
.set_config = vduse_vdpa_set_config,
.get_generation = vduse_vdpa_get_generation,
.set_vq_affinity= vduse_vdpa_set_vq_affinity,
+   .get_vq_affinity= vduse_vdpa_get_vq_affinity,
.reset  = vduse_vdpa_reset,
.set_map= vduse_vdpa_set_map,
.free   = vduse_vdpa_free,



Re: [PATCH v4 05/11] vduse: Support set_vq_affinity callback

2023-03-27 Thread Jason Wang


On 2023/3/23 13:30, Xie Yongji wrote:

Since the virtio-vdpa bus driver already supports the interrupt
affinity spreading mechanism, let's implement the set_vq_affinity
callback to bring it to the vduse device. After we get the virtqueue's
affinity, we can spread IRQs between the CPUs in the affinity mask, in
a round-robin manner, to run the irq callback.

Signed-off-by: Xie Yongji 



Acked-by: Jason Wang 

Thanks



---
  drivers/vdpa/vdpa_user/vduse_dev.c | 61 ++
  1 file changed, 54 insertions(+), 7 deletions(-)

diff --git a/drivers/vdpa/vdpa_user/vduse_dev.c 
b/drivers/vdpa/vdpa_user/vduse_dev.c
index 98359d87a06f..45aa8703c4b5 100644
--- a/drivers/vdpa/vdpa_user/vduse_dev.c
+++ b/drivers/vdpa/vdpa_user/vduse_dev.c
@@ -41,6 +41,8 @@
  #define VDUSE_IOVA_SIZE (128 * 1024 * 1024)
  #define VDUSE_MSG_DEFAULT_TIMEOUT 30
  
+#define IRQ_UNBOUND -1

+
  struct vduse_virtqueue {
u16 index;
u16 num_max;
@@ -57,6 +59,8 @@ struct vduse_virtqueue {
struct vdpa_callback cb;
struct work_struct inject;
struct work_struct kick;
+   int irq_effective_cpu;
+   struct cpumask irq_affinity;
  };
  
  struct vduse_dev;

@@ -128,6 +132,7 @@ static struct class *vduse_class;
  static struct cdev vduse_ctrl_cdev;
  static struct cdev vduse_cdev;
  static struct workqueue_struct *vduse_irq_wq;
+static struct workqueue_struct *vduse_irq_bound_wq;
  
  static u32 allowed_device_id[] = {

VIRTIO_ID_BLOCK,
@@ -708,6 +713,15 @@ static u32 vduse_vdpa_get_generation(struct vdpa_device 
*vdpa)
return dev->generation;
  }
  
+static int vduse_vdpa_set_vq_affinity(struct vdpa_device *vdpa, u16 idx,

+ const struct cpumask *cpu_mask)
+{
+   struct vduse_dev *dev = vdpa_to_vduse(vdpa);
+
+   cpumask_copy(&dev->vqs[idx]->irq_affinity, cpu_mask);
+   return 0;
+}
+
  static int vduse_vdpa_set_map(struct vdpa_device *vdpa,
unsigned int asid,
struct vhost_iotlb *iotlb)
@@ -758,6 +772,7 @@ static const struct vdpa_config_ops vduse_vdpa_config_ops = 
{
.get_config = vduse_vdpa_get_config,
.set_config = vduse_vdpa_set_config,
.get_generation = vduse_vdpa_get_generation,
+   .set_vq_affinity= vduse_vdpa_set_vq_affinity,
.reset  = vduse_vdpa_reset,
.set_map= vduse_vdpa_set_map,
.free   = vduse_vdpa_free,
@@ -917,7 +932,8 @@ static void vduse_vq_irq_inject(struct work_struct *work)
  }
  
  static int vduse_dev_queue_irq_work(struct vduse_dev *dev,

-   struct work_struct *irq_work)
+   struct work_struct *irq_work,
+   int irq_effective_cpu)
  {
int ret = -EINVAL;
  
@@ -926,7 +942,11 @@ static int vduse_dev_queue_irq_work(struct vduse_dev *dev,

goto unlock;
  
  	ret = 0;

-   queue_work(vduse_irq_wq, irq_work);
+   if (irq_effective_cpu == IRQ_UNBOUND)
+   queue_work(vduse_irq_wq, irq_work);
+   else
+   queue_work_on(irq_effective_cpu,
+ vduse_irq_bound_wq, irq_work);
  unlock:
up_read(&dev->rwsem);
  
@@ -1029,6 +1049,22 @@ static int vduse_dev_reg_umem(struct vduse_dev *dev,

return ret;
  }
  
+static void vduse_vq_update_effective_cpu(struct vduse_virtqueue *vq)

+{
+   int curr_cpu = vq->irq_effective_cpu;
+
+   while (true) {
+   curr_cpu = cpumask_next(curr_cpu, &vq->irq_affinity);
+   if (cpu_online(curr_cpu))
+   break;
+
+   if (curr_cpu >= nr_cpu_ids)
+   curr_cpu = IRQ_UNBOUND;
+   }
+
+   vq->irq_effective_cpu = curr_cpu;
+}
+
  static long vduse_dev_ioctl(struct file *file, unsigned int cmd,
unsigned long arg)
  {
@@ -,7 +1147,7 @@ static long vduse_dev_ioctl(struct file *file, unsigned 
int cmd,
break;
}
case VDUSE_DEV_INJECT_CONFIG_IRQ:
-   ret = vduse_dev_queue_irq_work(dev, &dev->inject);
+   ret = vduse_dev_queue_irq_work(dev, &dev->inject, IRQ_UNBOUND);
break;
case VDUSE_VQ_SETUP: {
struct vduse_vq_config config;
@@ -1198,7 +1234,10 @@ static long vduse_dev_ioctl(struct file *file, unsigned 
int cmd,
break;
  
  		index = array_index_nospec(index, dev->vq_num);

-   ret = vduse_dev_queue_irq_work(dev, &dev->vqs[index]->inject);
+
+   vduse_vq_update_effective_cpu(dev->vqs[index]);
+   ret = vduse_dev_queue_irq_work(dev, &dev->vqs[index]->inject,
+   dev->vqs[index]->irq_effective_cpu);
break;
}
case VDUSE_IOTLB_REG_UMEM: {
@@ -1367,10 +1406,12 @@ static int vduse_d

Re: [PATCH v4 03/11] virtio-vdpa: Support interrupt affinity spreading mechanism

2023-03-27 Thread Jason Wang


On 2023/3/24 17:12, Michael S. Tsirkin wrote:

On Fri, Mar 24, 2023 at 02:27:52PM +0800, Jason Wang wrote:

On Thu, Mar 23, 2023 at 1:31 PM Xie Yongji  wrote:

To support interrupt affinity spreading mechanism,
this makes use of group_cpus_evenly() to create
an irq callback affinity mask for each virtqueue
of vdpa device. Then we will unify set_vq_affinity
callback to pass the affinity to the vdpa device driver.

Signed-off-by: Xie Yongji 

Thinking hard of all the logics, I think I've found something interesting.

Commit ad71473d9c437 ("virtio_blk: use virtio IRQ affinity") tries to
pass irq_affinity to transport specific find_vqs().  This seems a
layer violation since driver has no knowledge of

1) whether or not the callback is based on an IRQ
2) whether or not the device is a PCI or not (the details are hided by
the transport driver)
3) how many vectors could be used by a device

This means the driver can't actually pass a real affinity masks so the
commit passes a zero irq affinity structure as a hint in fact, so the
PCI layer can build a default affinity based that groups cpus evenly
based on the number of MSI-X vectors (the core logic is the
group_cpus_evenly). I think we should fix this by replacing the
irq_affinity structure with

1) a boolean like auto_cb_spreading

or

2) queue to cpu mapping

So each transport can do its own logic based on that. Then virtio-vDPA
can pass that policy to VDUSE where we only need a group_cpus_evenly()
and avoid duplicating irq_create_affinity_masks()?

Thanks

I don't really understand what you propose. Care to post a patch?



I meant to avoid passing the irq_affinity structure to find_vqs() and
instead pass an array of booleans telling us whether or not each vq
requires automatic spreading of callbacks. But it seems less flexible.




Also does it have to block this patchset or can it be done on top?



We can leave that for the future.

So

Acked-by: Jason Wang 

Thanks





---
  drivers/virtio/virtio_vdpa.c | 68 
  1 file changed, 68 insertions(+)

diff --git a/drivers/virtio/virtio_vdpa.c b/drivers/virtio/virtio_vdpa.c
index f72696b4c1c2..f3826f42b704 100644
--- a/drivers/virtio/virtio_vdpa.c
+++ b/drivers/virtio/virtio_vdpa.c
@@ -13,6 +13,7 @@
  #include 
  #include 
  #include 
+#include 
  #include 
  #include 
  #include 
@@ -272,6 +273,66 @@ static void virtio_vdpa_del_vqs(struct virtio_device *vdev)
 virtio_vdpa_del_vq(vq);
  }

+static void default_calc_sets(struct irq_affinity *affd, unsigned int affvecs)
+{
+   affd->nr_sets = 1;
+   affd->set_size[0] = affvecs;
+}
+
+static struct cpumask *
+create_affinity_masks(unsigned int nvecs, struct irq_affinity *affd)
+{
+   unsigned int affvecs = 0, curvec, usedvecs, i;
+   struct cpumask *masks = NULL;
+
+   if (nvecs > affd->pre_vectors + affd->post_vectors)
+   affvecs = nvecs - affd->pre_vectors - affd->post_vectors;
+
+   if (!affd->calc_sets)
+   affd->calc_sets = default_calc_sets;
+
+   affd->calc_sets(affd, affvecs);
+
+   if (!affvecs)
+   return NULL;
+
+   masks = kcalloc(nvecs, sizeof(*masks), GFP_KERNEL);
+   if (!masks)
+   return NULL;
+
+   /* Fill out vectors at the beginning that don't need affinity */
+   for (curvec = 0; curvec < affd->pre_vectors; curvec++)
+   cpumask_setall(&masks[curvec]);
+
+   for (i = 0, usedvecs = 0; i < affd->nr_sets; i++) {
+   unsigned int this_vecs = affd->set_size[i];
+   int j;
+   struct cpumask *result = group_cpus_evenly(this_vecs);
+
+   if (!result) {
+   kfree(masks);
+   return NULL;
+   }
+
+   for (j = 0; j < this_vecs; j++)
+   cpumask_copy(&masks[curvec + j], &result[j]);
+   kfree(result);
+
+   curvec += this_vecs;
+   usedvecs += this_vecs;
+   }
+
+   /* Fill out vectors at the end that don't need affinity */
+   if (usedvecs >= affvecs)
+   curvec = affd->pre_vectors + affvecs;
+   else
+   curvec = affd->pre_vectors + usedvecs;
+   for (; curvec < nvecs; curvec++)
+   cpumask_setall(&masks[curvec]);
+
+   return masks;
+}
+
  static int virtio_vdpa_find_vqs(struct virtio_device *vdev, unsigned int nvqs,
 struct virtqueue *vqs[],
 vq_callback_t *callbacks[],
@@ -282,9 +343,15 @@ static int virtio_vdpa_find_vqs(struct virtio_device 
*vdev, unsigned int nvqs,
 struct virtio_vdpa_device *vd_dev = to_virtio_vdpa_device(vdev);
 struct vdpa_device *vdpa = vd_get_vdpa(vdev);
 const struct vdpa_config_ops *ops = vdpa->config;
+   struct irq_affinity default_affd = { 0 };
+   struct cpumask *masks;
 struct vdpa_callback cb;
 int i, err, queue_idx = 0;

+   masks = create_

Re: [PATCH v4 03/11] virtio-vdpa: Support interrupt affinity spreading mechanism

2023-03-27 Thread Jason Wang
On Tue, Mar 28, 2023 at 12:05 PM Yongji Xie  wrote:
>
> On Tue, Mar 28, 2023 at 11:44 AM Jason Wang  wrote:
> >
> > On Tue, Mar 28, 2023 at 11:33 AM Yongji Xie  wrote:
> > >
> > > On Tue, Mar 28, 2023 at 11:14 AM Jason Wang  wrote:
> > > >
> > > > On Tue, Mar 28, 2023 at 11:03 AM Yongji Xie  
> > > > wrote:
> > > > >
> > > > > On Fri, Mar 24, 2023 at 2:28 PM Jason Wang  
> > > > > wrote:
> > > > > >
> > > > > > On Thu, Mar 23, 2023 at 1:31 PM Xie Yongji 
> > > > > >  wrote:
> > > > > > >
> > > > > > > To support interrupt affinity spreading mechanism,
> > > > > > > this makes use of group_cpus_evenly() to create
> > > > > > > an irq callback affinity mask for each virtqueue
> > > > > > > of vdpa device. Then we will unify set_vq_affinity
> > > > > > > callback to pass the affinity to the vdpa device driver.
> > > > > > >
> > > > > > > Signed-off-by: Xie Yongji 
> > > > > >
> > > > > > Thinking hard of all the logics, I think I've found something 
> > > > > > interesting.
> > > > > >
> > > > > > Commit ad71473d9c437 ("virtio_blk: use virtio IRQ affinity") tries 
> > > > > > to
> > > > > > pass irq_affinity to transport specific find_vqs().  This seems a
> > > > > > layer violation since driver has no knowledge of
> > > > > >
> > > > > > 1) whether or not the callback is based on an IRQ
> > > > > > 2) whether or not the device is a PCI or not (the details are hided 
> > > > > > by
> > > > > > the transport driver)
> > > > > > 3) how many vectors could be used by a device
> > > > > >
> > > > > > This means the driver can't actually pass a real affinity masks so 
> > > > > > the
> > > > > > commit passes a zero irq affinity structure as a hint in fact, so 
> > > > > > the
> > > > > > PCI layer can build a default affinity based that groups cpus evenly
> > > > > > based on the number of MSI-X vectors (the core logic is the
> > > > > > group_cpus_evenly). I think we should fix this by replacing the
> > > > > > irq_affinity structure with
> > > > > >
> > > > > > 1) a boolean like auto_cb_spreading
> > > > > >
> > > > > > or
> > > > > >
> > > > > > 2) queue to cpu mapping
> > > > > >
> > > > >
> > > > > But only the driver knows which queues are used in the control path
> > > > > which don't need the automatic irq affinity assignment.
> > > >
> > > > Is this knowledge awarded by the transport driver now?
> > > >
> > >
> > > This knowledge is awarded by the device driver rather than the transport 
> > > driver.
> > >
> > > E.g. virtio-scsi uses:
> > >
> > > struct irq_affinity desc = { .pre_vectors = 2 }; // vq0 is control
> > > queue, vq1 is event queue
> >
> > Ok, but it only works as a hint, it's not a real affinity. As replied,
> > we can pass an array of boolean in this case then transport driver
> > knows it doesn't need to use automatic affinity for the first two
> > queues.
> >
>
> But we don't know whether we would use other fields in structure
> irq_affinity in the future. So a full set should be better?

Good point. So the issue is calc_sets(), and we probably need that
if there's a virtio driver that needs more than one set of vectors
to be spread. Technically, we could have a virtio-level abstraction
for this, but I agree it's probably not worth bothering with now.

>
> > >
> > > > E.g virtio-blk uses:
> > > >
> > > > struct irq_affinity desc = { 0, };
> > > >
> > > > Atleast we can tell the transport driver which vq requires automatic
> > > > irq affinity.
> > > >
> > >
> > > I think that is what the current implementation does.
> > >
> > > > > So I think the
> > > > > irq_affinity structure can only be created by device drivers and
> > > > > passed to the virtio-pci/virtio-vdpa driver.
> > > >
> > > > This could be not easy since the driver doesn't even know how many
> > > > interrupts will be used by the transport driver, so it can't built the
> > > > actual affinity structure.
> > > >
> > >
> > > The actual affinity mask is built by the transport driver,
> >
> > For PCI yes, it talks directly to the IRQ subsystems.
> >
> > > device
> > > driver only passes a hint on which queues don't need the automatic irq
> > > affinity assignment.
> >
> > But not for virtio-vDPA since the IRQ needs to be dealt with by the
> > parent driver. For our case, it's the VDUSE where it doesn't need IRQ
> > at all, a queue to cpu mapping is sufficient.
> >
>
> The device driver doesn't know whether it is binded to virtio-pci or
> virtio-vdpa. So it should pass a full set needed by the automatic irq
> affinity assignment instead of a subset. Then virtio-vdpa can choose
> to pass a queue to cpu mapping to VDUSE, which is what we do now (use
> set_vq_affinity()).

Yes, so there are basically two ways:

1) automatic IRQ management: pass affd to find_vqs(), and the affinity is
determined by the transport (e.g. vDPA).
2) affinity under the control of the driver: it needs to use
set_vq_affinity() but must deal with CPU hotplug itself (see the sketch
below).

Thanks

>
> Thanks,
> Yongji
>
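For illustration, a compile-untested sketch of the two approaches summarized
above, reusing the examples already mentioned in this thread; vdev, nvqs, vqs,
callbacks and names are placeholders:

	/* 1) Automatic spreading: pass an irq_affinity hint to find_vqs() and
	 *    let the transport build the real masks (PCI MSI-X, or a vDPA
	 *    parent via set_vq_affinity()).
	 */
	struct irq_affinity desc = { .pre_vectors = 2 };	/* vq0/vq1: control/event */
	err = virtio_find_vqs(vdev, nvqs, vqs, callbacks, names, &desc);

	/* 2) Driver-controlled affinity: set it per virtqueue afterwards, at
	 *    the cost of handling CPU hotplug in the driver.
	 */
	for (i = 0; i < nvqs; i++)
		virtqueue_set_affinity(vqs[i], cpumask_of(i % num_online_cpus()));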


Re: 9p regression (Was: [PATCH v2] virtio_ring: don't update event idx on get_buf)

2023-03-27 Thread Luis Chamberlain
On Tue, Mar 28, 2023 at 11:13:32AM +0900, Dominique Martinet wrote:
> Hi Michael, Albert,
> 
> Albert Huang wrote on Sat, Mar 25, 2023 at 06:56:33PM +0800:
> > in virtio_net, if we disable napi_tx, when we trigger a tx interrupt,
> > vq->event_triggered will be set to true. It will no longer be set to
> > false unless we explicitly call virtqueue_enable_cb_delayed or
> > virtqueue_enable_cb_prepare.
> 
> This patch (committed as 35395770f803 ("virtio_ring: don't update event
> idx on get_buf") in next-20230327) apparently breaks 9p, as reported by
> Luis in https://lkml.kernel.org/r/zci+7wg5oclsl...@bombadil.infradead.org

Spot on, reverting that commit fixes it.

  Luis


Re: [PATCH v3 virtio 6/8] pds_vdpa: add support for vdpa and vdpamgmt interfaces

2023-03-27 Thread Jason Wang
On Sat, Mar 25, 2023 at 8:27 AM Shannon Nelson  wrote:
>
> On 3/22/23 10:18 PM, Jason Wang wrote:
> > On Thu, Mar 23, 2023 at 3:11 AM Shannon Nelson  
> > wrote:
> >>
> >> This is the vDPA device support, where we advertise that we can
> >> support the virtio queues and deal with the configuration work
> >> through the pds_core's adminq.
> >>
> >> Signed-off-by: Shannon Nelson 
> >> ---
> >>   drivers/vdpa/pds/aux_drv.c  |  15 +
> >>   drivers/vdpa/pds/aux_drv.h  |   1 +
> >>   drivers/vdpa/pds/debugfs.c  | 260 +
> >>   drivers/vdpa/pds/debugfs.h  |  10 +
> >>   drivers/vdpa/pds/vdpa_dev.c | 560 +++-
> >>   5 files changed, 845 insertions(+), 1 deletion(-)
> >>
> >> diff --git a/drivers/vdpa/pds/aux_drv.c b/drivers/vdpa/pds/aux_drv.c
> >> index 8f3ae3326885..e54f0371c60e 100644
> >> --- a/drivers/vdpa/pds/aux_drv.c
> >> +++ b/drivers/vdpa/pds/aux_drv.c
> >
> > [...]
> >
> >> +
> >> +static struct vdpa_notification_area
> >> +pds_vdpa_get_vq_notification(struct vdpa_device *vdpa_dev, u16 qid)
> >> +{
> >> +   struct pds_vdpa_device *pdsv = vdpa_to_pdsv(vdpa_dev);
> >> +   struct virtio_pci_modern_device *vd_mdev;
> >> +   struct vdpa_notification_area area;
> >> +
> >> +   area.addr = pdsv->vqs[qid].notify_pa;
> >> +
> >> +   vd_mdev = &pdsv->vdpa_aux->vd_mdev;
> >> +   if (!vd_mdev->notify_offset_multiplier)
> >> +   area.size = PDS_PAGE_SIZE;
> >
> > This hasn't been defined so far? Others look good.
>
> Sorry, I don't understand your question.
> sln

I mean I don't see the definition of PDS_PAGE_SIZE so far.

Thanks

>
>
> >
> > Thanks
> >
>


Re: [PATCH v4 03/11] virtio-vdpa: Support interrupt affinity spreading mechanism

2023-03-27 Thread Jason Wang
On Tue, Mar 28, 2023 at 11:33 AM Yongji Xie  wrote:
>
> On Tue, Mar 28, 2023 at 11:14 AM Jason Wang  wrote:
> >
> > On Tue, Mar 28, 2023 at 11:03 AM Yongji Xie  wrote:
> > >
> > > On Fri, Mar 24, 2023 at 2:28 PM Jason Wang  wrote:
> > > >
> > > > On Thu, Mar 23, 2023 at 1:31 PM Xie Yongji  
> > > > wrote:
> > > > >
> > > > > To support interrupt affinity spreading mechanism,
> > > > > this makes use of group_cpus_evenly() to create
> > > > > an irq callback affinity mask for each virtqueue
> > > > > of vdpa device. Then we will unify set_vq_affinity
> > > > > callback to pass the affinity to the vdpa device driver.
> > > > >
> > > > > Signed-off-by: Xie Yongji 
> > > >
> > > > Thinking hard of all the logics, I think I've found something 
> > > > interesting.
> > > >
> > > > Commit ad71473d9c437 ("virtio_blk: use virtio IRQ affinity") tries to
> > > > pass irq_affinity to transport specific find_vqs().  This seems a
> > > > layer violation since driver has no knowledge of
> > > >
> > > > 1) whether or not the callback is based on an IRQ
> > > > 2) whether or not the device is a PCI or not (the details are hided by
> > > > the transport driver)
> > > > 3) how many vectors could be used by a device
> > > >
> > > > This means the driver can't actually pass a real affinity masks so the
> > > > commit passes a zero irq affinity structure as a hint in fact, so the
> > > > PCI layer can build a default affinity based that groups cpus evenly
> > > > based on the number of MSI-X vectors (the core logic is the
> > > > group_cpus_evenly). I think we should fix this by replacing the
> > > > irq_affinity structure with
> > > >
> > > > 1) a boolean like auto_cb_spreading
> > > >
> > > > or
> > > >
> > > > 2) queue to cpu mapping
> > > >
> > >
> > > But only the driver knows which queues are used in the control path
> > > which don't need the automatic irq affinity assignment.
> >
> > Is this knowledge awarded by the transport driver now?
> >
>
> This knowledge is awarded by the device driver rather than the transport 
> driver.
>
> E.g. virtio-scsi uses:
>
> struct irq_affinity desc = { .pre_vectors = 2 }; // vq0 is control
> queue, vq1 is event queue

OK, but it only works as a hint; it's not a real affinity. As replied,
we can pass an array of booleans in this case, then the transport driver
knows it doesn't need to use automatic affinity for the first two
queues.

>
> > E.g virtio-blk uses:
> >
> > struct irq_affinity desc = { 0, };
> >
> > Atleast we can tell the transport driver which vq requires automatic
> > irq affinity.
> >
>
> I think that is what the current implementation does.
>
> > > So I think the
> > > irq_affinity structure can only be created by device drivers and
> > > passed to the virtio-pci/virtio-vdpa driver.
> >
> > This could be not easy since the driver doesn't even know how many
> > interrupts will be used by the transport driver, so it can't built the
> > actual affinity structure.
> >
>
> The actual affinity mask is built by the transport driver,

For PCI yes, it talks directly to the IRQ subsystems.

> device
> driver only passes a hint on which queues don't need the automatic irq
> affinity assignment.

But not for virtio-vDPA, since the IRQ needs to be dealt with by the
parent driver. In our case it's VDUSE, which doesn't need an IRQ at
all; a queue-to-CPU mapping is sufficient.

Thanks

>
> Thanks,
> Yongji
>


Re: [External] Re: 9p regression (Was: [PATCH v2] virtio_ring: don't update event idx on get_buf)

2023-03-27 Thread Jason Wang
On Tue, Mar 28, 2023 at 11:09 AM 黄杰  wrote:
>
> On Tue, Mar 28, 2023 at 10:59 AM Jason Wang  wrote:
> >
> > On Tue, Mar 28, 2023 at 10:13 AM Dominique Martinet
> >  wrote:
> > >
> > > Hi Michael, Albert,
> > >
> > > Albert Huang wrote on Sat, Mar 25, 2023 at 06:56:33PM +0800:
> > > > in virtio_net, if we disable napi_tx, when we trigger a tx interrupt,
> > > > vq->event_triggered will be set to true. It will no longer be set to
> > > > false unless we explicitly call virtqueue_enable_cb_delayed or
> > > > virtqueue_enable_cb_prepare.
> > >
> > > This patch (committed as 35395770f803 ("virtio_ring: don't update event
> > > idx on get_buf") in next-20230327) apparently breaks 9p, as reported by
> > > Luis in https://lkml.kernel.org/r/zci+7wg5oclsl...@bombadil.infradead.org
> > >
> > > I've just hit had a look at recent patches[1] and reverted this to test
> > > and I can mount again, so I'm pretty sure this is the culprit, but I
> > > didn't look at the content at all yet so cannot advise further.
> > > It might very well be that we need some extra handling for 9p
> > > specifically that can be added separately if required.
> > >
> > > [1] git log 0ec57cfa721fbd36b4c4c0d9ccc5d78a78f7fa35..HEAD drivers/virtio/
> > >
> > >
> > > This can be reproduced with a simple mount, run qemu with some -virtfs
> > > argument and `mount -t 9p -o debug=65535 tag mountpoint` will hang after
> > > these messages:
> > > 9pnet: -- p9_virtio_request (83): 9p debug: virtio request
> > > 9pnet: -- p9_virtio_request (83): virtio request kicked
> > >
> > > So I suspect we're just not getting a callback.
> >
> > I think so. The patch assumes the driver will call
> > virtqueue_disable/enable_cb(), which is not the case for the 9p driver.
> >
> > So after the first interrupt, event_triggered will be set to true forever.
> >
> > Thanks
> >
>
> Hi: Wang
>
> Yes, this patch assumes that all virtio-related drivers will call
> virtqueue_disable/enable_cb().
> Thank you for raising this issue.
>
> It seems that napi_tx is only related to virtio_net. I'm wondering whether
> we need to refactor napi_tx instead of implementing it inside virtio_ring.

We can hear from others.

I think it's better not to work around virtio_ring issues in a specific
driver. It might just add more hacks. We should correctly set
VRING_AVAIL_F_NO_INTERRUPT.

Do you think the following might work (not even a compile test)?

diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c
index 41144b5246a8..12f4efb6dc54 100644
--- a/drivers/virtio/virtio_ring.c
+++ b/drivers/virtio/virtio_ring.c
@@ -852,16 +852,16 @@ static void virtqueue_disable_cb_split(struct
virtqueue *_vq)
 {
struct vring_virtqueue *vq = to_vvq(_vq);

-   if (!(vq->split.avail_flags_shadow & VRING_AVAIL_F_NO_INTERRUPT)) {
-   vq->split.avail_flags_shadow |= VRING_AVAIL_F_NO_INTERRUPT;
-   if (vq->event)
-   /* TODO: this is a hack. Figure out a cleaner
value to write. */
-   vring_used_event(&vq->split.vring) = 0x0;
-   else
-   vq->split.vring.avail->flags =
-   cpu_to_virtio16(_vq->vdev,
-   vq->split.avail_flags_shadow);
-   }
+   if (!(vq->split.avail_flags_shadow & VRING_AVAIL_F_NO_INTERRUPT))
+   vq->split.avail_flags_shadow |= VRING_AVAIL_F_NO_INTERRUPT;
+
+   if (vq->event && !vq->event_triggered)
+   /* TODO: this is a hack. Figure out a cleaner value to write. */
+   vring_used_event(&vq->split.vring) = 0x0;
+   else
+   vq->split.vring.avail->flags =
+   cpu_to_virtio16(_vq->vdev,
+   vq->split.avail_flags_shadow);
 }

 static unsigned int virtqueue_enable_cb_prepare_split(struct virtqueue *_vq)
@@ -1697,8 +1697,10 @@ static void virtqueue_disable_cb_packed(struct
virtqueue *_vq)
 {
struct vring_virtqueue *vq = to_vvq(_vq);

-   if (vq->packed.event_flags_shadow != VRING_PACKED_EVENT_FLAG_DISABLE) {
+   if (!(vq->packed.event_flags_shadow & VRING_PACKED_EVENT_FLAG_DISABLE))
vq->packed.event_flags_shadow = VRING_PACKED_EVENT_FLAG_DISABLE;
+
+   if (vq->event_triggered)
vq->packed.vring.driver->flags =
cpu_to_le16(vq->packed.event_flags_shadow);
}
@@ -2330,12 +2332,6 @@ void virtqueue_disable_cb(struct vir

Re: [PATCH v4 03/11] virtio-vdpa: Support interrupt affinity spreading mechanism

2023-03-27 Thread Jason Wang
On Tue, Mar 28, 2023 at 11:03 AM Yongji Xie  wrote:
>
> On Fri, Mar 24, 2023 at 2:28 PM Jason Wang  wrote:
> >
> > On Thu, Mar 23, 2023 at 1:31 PM Xie Yongji  wrote:
> > >
> > > To support interrupt affinity spreading mechanism,
> > > this makes use of group_cpus_evenly() to create
> > > an irq callback affinity mask for each virtqueue
> > > of vdpa device. Then we will unify set_vq_affinity
> > > callback to pass the affinity to the vdpa device driver.
> > >
> > > Signed-off-by: Xie Yongji 
> >
> > Thinking hard about all the logic here, I think I've found something interesting.
> >
> > Commit ad71473d9c437 ("virtio_blk: use virtio IRQ affinity") tries to
> > pass irq_affinity to transport-specific find_vqs().  This seems like a
> > layer violation since the driver has no knowledge of
> >
> > 1) whether or not the callback is based on an IRQ
> > 2) whether or not the device is a PCI device (the details are hidden by
> > the transport driver)
> > 3) how many vectors could be used by a device
> >
> > This means the driver can't actually pass a real affinity mask, so the
> > commit in fact passes a zero irq affinity structure as a hint, so that
> > the PCI layer can build a default affinity that groups CPUs evenly
> > based on the number of MSI-X vectors (the core logic is
> > group_cpus_evenly()). I think we should fix this by replacing the
> > irq_affinity structure with
> >
> > 1) a boolean like auto_cb_spreading
> >
> > or
> >
> > 2) queue to cpu mapping
> >
>
> But only the driver knows which queues are used in the control path
> which don't need the automatic irq affinity assignment.

Is this knowledge available to the transport driver now?

E.g. virtio-blk uses:

struct irq_affinity desc = { 0, };

At least we can tell the transport driver which vq requires automatic
irq affinity.
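
Just to sketch what that could look like (hypothetical, none of this
exists today): find_vqs() could take a per-vq hint instead of the
struct irq_affinity, e.g.

struct vq_affinity_hint {
	bool auto_cb_spreading;		/* option 1) above */
	const struct cpumask *cpus;	/* option 2) above, NULL if unused */
};

so a control vq would simply leave both unset, and each transport would
map the hint to MSI-X affinity (PCI) or to a plain queue-to-cpu mapping
(vDPA/VDUSE).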

> So I think the
> irq_affinity structure can only be created by device drivers and
> passed to the virtio-pci/virtio-vdpa driver.

This may not be easy since the driver doesn't even know how many
interrupts will be used by the transport driver, so it can't build the
actual affinity structure.

>
> > So each transport can do its own logic based on that. Then virtio-vDPA
> > can pass that policy to VDUSE where we only need a group_cpus_evenly()
> > and avoid duplicating irq_create_affinity_masks()?
> >
>
> I don't get why we would have duplicated irq_create_affinity_masks().

I meant the create_affinity_masks() in patch 3 seems a duplication of
irq_create_affinity_masks().

Thanks

>
> Thanks,
> Yongji
>


Re: 9p regression (Was: [PATCH v2] virtio_ring: don't update event idx on get_buf)

2023-03-27 Thread Jason Wang
On Tue, Mar 28, 2023 at 10:13 AM Dominique Martinet
 wrote:
>
> Hi Michael, Albert,
>
> Albert Huang wrote on Sat, Mar 25, 2023 at 06:56:33PM +0800:
> > in virtio_net, if we disable the napi_tx, when we trigger a tx interrupt,
> > the vq->event_triggered will be set to true. It will no longer be set to
> > false. Unless we explicitly call virtqueue_enable_cb_delayed or
> > virtqueue_enable_cb_prepare.
>
> This patch (committed as 35395770f803 ("virtio_ring: don't update event
> idx on get_buf") in next-20230327 apparently breaks 9p, as reported by
> Luis in https://lkml.kernel.org/r/zci+7wg5oclsl...@bombadil.infradead.org
>
> I've just had a look at recent patches[1] and reverted this to test
> and I can mount again, so I'm pretty sure this is the culprit, but I
> didn't look at the content at all yet so cannot advise further.
> It might very well be that we need some extra handling for 9p
> specifically that can be added separately if required.
>
> [1] git log 0ec57cfa721fbd36b4c4c0d9ccc5d78a78f7fa35..HEAD drivers/virtio/
>
>
> This can be reproduced with a simple mount, run qemu with some -virtfs
> argument and `mount -t 9p -o debug=65535 tag mountpoint` will hang after
> these messages:
> 9pnet: -- p9_virtio_request (83): 9p debug: virtio request
> 9pnet: -- p9_virtio_request (83): virtio request kicked
>
> So I suspect we're just not getting a callback.

I think so. The patch assumes the driver will call
virtqueue_disable/enable_cb(), which is not the case for the 9p driver.

So after the first interrupt, event_triggered will be set to true forever.
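
For reference, the pattern the patch assumes is roughly the sketch below
(a generic example of the virtqueue_disable_cb()/virtqueue_enable_cb()
dance, not 9p code; 9p just kicks and waits for the reply, so it never
goes through the enable path where event_triggered would be taken into
account again):

static void example_drain_vq(struct virtqueue *vq)
{
	unsigned int len;
	void *buf;

	virtqueue_disable_cb(vq);	/* suppress further callbacks */
	do {
		while ((buf = virtqueue_get_buf(vq, &len)))
			; /* complete the request behind buf here */
		/* enable_cb() returns false if new buffers raced in */
	} while (!virtqueue_enable_cb(vq));
}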

Thanks

>
>
> I'll have a closer look after work, but any advice meanwhile will be
> appreciated!
> (I'm sure Luis would also like a temporary drop from -next until
> this is figured out, but I'll leave this up to you)
>
>
> >
> > If we disable the napi_tx, it will only be called when the tx ring
> > buffer is relatively small.
> >
> > Because event_triggered is true. Therefore, VRING_AVAIL_F_NO_INTERRUPT or
> > VRING_PACKED_EVENT_FLAG_DISABLE will not be set. So we update
> > vring_used_event(&vq->split.vring) or vq->packed.vring.driver->off_wrap
> > every time we call virtqueue_get_buf_ctx. This will bring more 
> > interruptions.
> >
> > To summarize:
> > 1) event_triggered was set to true in vring_interrupt()
> > 2) after this nothing will happen for virtqueue_disable_cb() so
> >VRING_AVAIL_F_NO_INTERRUPT is not set in avail_flags_shadow
> > 3) virtqueue_get_buf_ctx_split() will still think the cb is enabled
> >then it tries to publish new event
> >
> > To fix, if event_triggered is set to true, do not update
> > vring_used_event(&vq->split.vring) or vq->packed.vring.driver->off_wrap
> >
> > Tested with iperf:
> > iperf3 tcp stream:
> > vm1 -> vm2
> > vm2 just receives tcp data stream from vm1, and sends the ack to vm1,
> > there are many tx interrupts in vm2.
> > but without event_triggered there are just a few tx interrupts.
> >
> > Fixes: 8d622d21d248 ("virtio: fix up virtio_disable_cb")
> > Signed-off-by: Albert Huang 
> > Message-Id: <20230321085953.24949-1-huangjie.alb...@bytedance.com>
> > Signed-off-by: Michael S. Tsirkin 
> > ---
> >  drivers/virtio/virtio_ring.c | 6 --
> >  1 file changed, 4 insertions(+), 2 deletions(-)
> >
> > diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c
> > index cbeeea1b0439..1c36fa477966 100644
> > --- a/drivers/virtio/virtio_ring.c
> > +++ b/drivers/virtio/virtio_ring.c
> > @@ -914,7 +914,8 @@ static void *virtqueue_get_buf_ctx_split(struct 
> > virtqueue *_vq,
> >   /* If we expect an interrupt for the next entry, tell host
> >* by writing event index and flush out the write before
> >* the read in the next get_buf call. */
> > - if (!(vq->split.avail_flags_shadow & VRING_AVAIL_F_NO_INTERRUPT))
> > + if (unlikely(!(vq->split.avail_flags_shadow & 
> > VRING_AVAIL_F_NO_INTERRUPT) &&
> > +  !vq->event_triggered))
> >   virtio_store_mb(vq->weak_barriers,
> >   &vring_used_event(&vq->split.vring),
> >   cpu_to_virtio16(_vq->vdev, 
> > vq->last_used_idx));
> > @@ -1744,7 +1745,8 @@ static void *virtqueue_get_buf_ctx_packed(struct 
> > virtqueue *_vq,
> >* by writing event index and flush out the write before
> >* the read in the next get_buf call.
> >*/
> > - if (vq->packed.event_flags_shadow == VRING_PACKED_EVENT_FLAG_DESC)
> > + if (unlikely(vq->packed.event_flags_shadow == 
> > VRING_PACKED_EVENT_FLAG_DESC &&
> > +  !vq->event_triggered))
> >   virtio_store_mb(vq->weak_barriers,
> >   &vq->packed.vring.driver->off_wrap,
> >   cpu_to_le16(vq->last_used_idx));
>


[PATCH v6 11/11] vhost: allow userspace to create workers

2023-03-27 Thread Mike Christie
For vhost-scsi with 3 vqs and a workload that tries to use those vqs
like:

fio --filename=/dev/sdb  --direct=1 --rw=randrw --bs=4k \
--ioengine=libaio --iodepth=128  --numjobs=3

the single vhost worker thread will become a bottleneck and we are stuck
at around 500K IOPs no matter how many jobs, virtqueues, and CPUs are
used.

To better utilize virtqueues and available CPUs, this patch allows
userspace to create workers and bind them to vqs. You can have N workers
per dev and also share N workers with M vqs.

With the patches and doing a worker per vq, we can scale to at least
16 vCPUs/vqs (that's my system limit) with the same fio command
above with numjobs=16:

fio --filename=/dev/sdb  --direct=1 --rw=randrw --bs=4k \
--ioengine=libaio --iodepth=64  --numjobs=16

which gives around 2326K IOPs.

Note that for testing I dropped depth to 64 above because the vhost/virt
layer supports only 1024 total commands per device. And the only tuning I
did was set LIO's emulate_pr to 0 to avoid LIO's PR lock in the main IO
path which becomes an issue at around 12 jobs/virtqueues.

Signed-off-by: Mike Christie 
---
 drivers/vhost/vhost.c| 177 ---
 drivers/vhost/vhost.h|   4 +-
 include/uapi/linux/vhost.h   |  22 
 include/uapi/linux/vhost_types.h |  15 +++
 4 files changed, 204 insertions(+), 14 deletions(-)

diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
index 1fa5e9a49092..e40699e83c6d 100644
--- a/drivers/vhost/vhost.c
+++ b/drivers/vhost/vhost.c
@@ -271,7 +271,11 @@ EXPORT_SYMBOL_GPL(vhost_vq_work_queue);
 
 void vhost_dev_flush(struct vhost_dev *dev)
 {
-   vhost_work_flush_on(dev->worker);
+   struct vhost_worker *worker;
+   unsigned long i;
+
+   xa_for_each(&dev->worker_xa, i, worker)
+   vhost_work_flush_on(worker);
 }
 EXPORT_SYMBOL_GPL(vhost_dev_flush);
 
@@ -489,7 +493,6 @@ void vhost_dev_init(struct vhost_dev *dev,
dev->umem = NULL;
dev->iotlb = NULL;
dev->mm = NULL;
-   dev->worker = NULL;
dev->iov_limit = iov_limit;
dev->weight = weight;
dev->byte_weight = byte_weight;
@@ -499,7 +502,7 @@ void vhost_dev_init(struct vhost_dev *dev,
INIT_LIST_HEAD(&dev->read_list);
INIT_LIST_HEAD(&dev->pending_list);
spin_lock_init(&dev->iotlb_lock);
-
+   xa_init_flags(&dev->worker_xa, XA_FLAGS_ALLOC);
 
for (i = 0; i < dev->nvqs; ++i) {
vq = dev->vqs[i];
@@ -562,32 +565,67 @@ static void vhost_detach_mm(struct vhost_dev *dev)
dev->mm = NULL;
 }
 
-static void vhost_worker_free(struct vhost_dev *dev)
+static void vhost_worker_put(struct vhost_dev *dev, struct vhost_worker 
*worker)
 {
-   struct vhost_worker *worker = dev->worker;
-
if (!worker)
return;
 
-   dev->worker = NULL;
+   if (!refcount_dec_and_test(&worker->refcount))
+   return;
+
WARN_ON(!llist_empty(&worker->work_list));
vhost_task_stop(worker->vtsk);
kfree(worker);
 }
 
+static void vhost_vq_detach_worker(struct vhost_virtqueue *vq)
+{
+   if (vq->worker)
+   vhost_worker_put(vq->dev, vq->worker);
+   vq->worker = NULL;
+}
+
+static void vhost_workers_free(struct vhost_dev *dev)
+{
+   struct vhost_worker *worker;
+   unsigned long i;
+
+   if (!dev->use_worker)
+   return;
+
+   for (i = 0; i < dev->nvqs; i++)
+   vhost_vq_detach_worker(dev->vqs[i]);
+   /*
+* Drop the refcount taken during allocation, and handle the default
+* worker and the cases where userspace might have crashed or was lazy
+* and did a VHOST_NEW_WORKER but not a VHOST_FREE_WORKER.
+*/
+   xa_for_each(&dev->worker_xa, i, worker) {
+   xa_erase(&dev->worker_xa, worker->id);
+   vhost_worker_put(dev, worker);
+   }
+   xa_destroy(&dev->worker_xa);
+}
+
 static struct vhost_worker *vhost_worker_create(struct vhost_dev *dev)
 {
struct vhost_worker *worker;
struct vhost_task *vtsk;
char name[TASK_COMM_LEN];
+   int ret;
+   u32 id;
 
worker = kzalloc(sizeof(*worker), GFP_KERNEL_ACCOUNT);
if (!worker)
return NULL;
 
-   dev->worker = worker;
worker->kcov_handle = kcov_common_handle();
init_llist_head(&worker->work_list);
+   /*
+* We increase the refcount for the initial creation and then
+* later each time it's attached to a virtqueue.
+*/
+   refcount_set(&worker->refcount, 1);
snprintf(name, sizeof(name), "vhost-%d", current->pid);
 
vtsk = vhost_task_create(vhost_worker, worker, name);
@@ -596,14 +634,104 @@ static struct vhost_worker *vhost_worker_create(struct 
vhost_dev *dev)
 
worker->vtsk = vtsk;
vhost_task_start(vtsk);
+
+   ret = xa_alloc(&dev->worker_xa, &id, worker, xa_limit_32b, GFP_KERNEL);
+   if (ret

[PATCH v6 10/11] vhost-scsi: flush IO vqs then send TMF rsp

2023-03-27 Thread Mike Christie
With one worker we will always send the scsi cmd responses then send the
TMF rsp, because LIO will always complete the scsi cmds first then call
into us to send the TMF response.

With multiple workers, the IO vq workers could be running while the
TMF/ctl vq worker is, so this has us do a flush before completing the TMF
to make sure cmds are completed when its work is later queued and run.

Signed-off-by: Mike Christie 
---
 drivers/vhost/scsi.c  | 22 +++---
 drivers/vhost/vhost.c |  6 ++
 drivers/vhost/vhost.h |  1 +
 3 files changed, 26 insertions(+), 3 deletions(-)

diff --git a/drivers/vhost/scsi.c b/drivers/vhost/scsi.c
index 3e86b5fbeca6..48dba4fe2dac 100644
--- a/drivers/vhost/scsi.c
+++ b/drivers/vhost/scsi.c
@@ -1158,12 +1158,28 @@ static void vhost_scsi_tmf_resp_work(struct vhost_work 
*work)
 {
struct vhost_scsi_tmf *tmf = container_of(work, struct vhost_scsi_tmf,
  vwork);
-   int resp_code;
+   struct vhost_virtqueue *ctl_vq, *vq;
+   int resp_code, i;
+
+   if (tmf->scsi_resp == TMR_FUNCTION_COMPLETE) {
+   /*
+* Flush IO vqs that don't share a worker with the ctl to make
+* sure they have sent their responses before us.
+*/
+   ctl_vq = &tmf->vhost->vqs[VHOST_SCSI_VQ_CTL].vq;
+   for (i = VHOST_SCSI_VQ_IO; i < tmf->vhost->dev.nvqs; i++) {
+   vq = &tmf->vhost->vqs[i].vq;
+
+   if (vhost_vq_is_setup(vq) &&
+   vq->worker != ctl_vq->worker) {
+   vhost_vq_flush(vq);
+   }
+   }
 
-   if (tmf->scsi_resp == TMR_FUNCTION_COMPLETE)
resp_code = VIRTIO_SCSI_S_FUNCTION_SUCCEEDED;
-   else
+   } else {
resp_code = VIRTIO_SCSI_S_FUNCTION_REJECTED;
+   }
 
vhost_scsi_send_tmf_resp(tmf->vhost, &tmf->svq->vq, tmf->in_iovs,
 tmf->vq_desc, &tmf->resp_iov, resp_code);
diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
index f812daf25648..1fa5e9a49092 100644
--- a/drivers/vhost/vhost.c
+++ b/drivers/vhost/vhost.c
@@ -275,6 +275,12 @@ void vhost_dev_flush(struct vhost_dev *dev)
 }
 EXPORT_SYMBOL_GPL(vhost_dev_flush);
 
+void vhost_vq_flush(struct vhost_virtqueue *vq)
+{
+   vhost_work_flush_on(vq->worker);
+}
+EXPORT_SYMBOL_GPL(vhost_vq_flush);
+
 /* A lockless hint for busy polling code to exit the loop */
 bool vhost_vq_has_work(struct vhost_virtqueue *vq)
 {
diff --git a/drivers/vhost/vhost.h b/drivers/vhost/vhost.h
index ef55fae2517c..395707c680e5 100644
--- a/drivers/vhost/vhost.h
+++ b/drivers/vhost/vhost.h
@@ -53,6 +53,7 @@ int vhost_poll_start(struct vhost_poll *poll, struct file 
*file);
 void vhost_poll_stop(struct vhost_poll *poll);
 void vhost_poll_queue(struct vhost_poll *poll);
 void vhost_dev_flush(struct vhost_dev *dev);
+void vhost_vq_flush(struct vhost_virtqueue *vq);
 
 struct vhost_log {
u64 addr;
-- 
2.25.1



[PATCH v6 07/11] vhost-scsi: make SCSI cmd completion per vq

2023-03-27 Thread Mike Christie
This patch separates the scsi cmd completion code paths so we can complete
cmds based on their vq instead of having all cmds complete on the same
worker/CPU. This will be useful with the next patches that allow us to
create multiple worker threads and bind them to different vqs, so we can
have completions running on different threads/CPUs.

Signed-off-by: Mike Christie 
Reviewed-by: Stefan Hajnoczi 
---
 drivers/vhost/scsi.c | 56 
 1 file changed, 26 insertions(+), 30 deletions(-)

diff --git a/drivers/vhost/scsi.c b/drivers/vhost/scsi.c
index 3b0b556c57ef..ecb5cd7450b8 100644
--- a/drivers/vhost/scsi.c
+++ b/drivers/vhost/scsi.c
@@ -167,6 +167,7 @@ MODULE_PARM_DESC(max_io_vqs, "Set the max number of IO 
virtqueues a vhost scsi d
 
 struct vhost_scsi_virtqueue {
struct vhost_virtqueue vq;
+   struct vhost_scsi *vs;
/*
 * Reference counting for inflight reqs, used for flush operation. At
 * each time, one reference tracks new commands submitted, while we
@@ -181,6 +182,9 @@ struct vhost_scsi_virtqueue {
struct vhost_scsi_cmd *scsi_cmds;
struct sbitmap scsi_tags;
int max_cmds;
+
+   struct vhost_work completion_work;
+   struct llist_head completion_list;
 };
 
 struct vhost_scsi {
@@ -190,12 +194,8 @@ struct vhost_scsi {
 
struct vhost_dev dev;
struct vhost_scsi_virtqueue *vqs;
-   unsigned long *compl_bitmap;
struct vhost_scsi_inflight **old_inflight;
 
-   struct vhost_work vs_completion_work; /* cmd completion work item */
-   struct llist_head vs_completion_list; /* cmd completion queue */
-
struct vhost_work vs_event_work; /* evt injection work item */
struct llist_head vs_event_list; /* evt injection queue */
 
@@ -368,10 +368,11 @@ static void vhost_scsi_release_cmd(struct se_cmd *se_cmd)
} else {
struct vhost_scsi_cmd *cmd = container_of(se_cmd,
struct vhost_scsi_cmd, tvc_se_cmd);
-   struct vhost_scsi *vs = cmd->tvc_vhost;
+   struct vhost_scsi_virtqueue *svq =  container_of(cmd->tvc_vq,
+   struct vhost_scsi_virtqueue, vq);
 
-   llist_add(&cmd->tvc_completion_list, &vs->vs_completion_list);
-   vhost_work_queue(&vs->dev, &vs->vs_completion_work);
+   llist_add(&cmd->tvc_completion_list, &svq->completion_list);
+   vhost_vq_work_queue(&svq->vq, &svq->completion_work);
}
 }
 
@@ -534,17 +535,17 @@ static void vhost_scsi_evt_work(struct vhost_work *work)
  */
 static void vhost_scsi_complete_cmd_work(struct vhost_work *work)
 {
-   struct vhost_scsi *vs = container_of(work, struct vhost_scsi,
-   vs_completion_work);
+   struct vhost_scsi_virtqueue *svq = container_of(work,
+   struct vhost_scsi_virtqueue, completion_work);
struct virtio_scsi_cmd_resp v_rsp;
struct vhost_scsi_cmd *cmd, *t;
struct llist_node *llnode;
struct se_cmd *se_cmd;
struct iov_iter iov_iter;
-   int ret, vq;
+   bool signal = false;
+   int ret;
 
-   bitmap_zero(vs->compl_bitmap, vs->dev.nvqs);
-   llnode = llist_del_all(&vs->vs_completion_list);
+   llnode = llist_del_all(&svq->completion_list);
llist_for_each_entry_safe(cmd, t, llnode, tvc_completion_list) {
se_cmd = &cmd->tvc_se_cmd;
 
@@ -564,21 +565,17 @@ static void vhost_scsi_complete_cmd_work(struct 
vhost_work *work)
  cmd->tvc_in_iovs, sizeof(v_rsp));
ret = copy_to_iter(&v_rsp, sizeof(v_rsp), &iov_iter);
if (likely(ret == sizeof(v_rsp))) {
-   struct vhost_scsi_virtqueue *q;
+   signal = true;
+
vhost_add_used(cmd->tvc_vq, cmd->tvc_vq_desc, 0);
-   q = container_of(cmd->tvc_vq, struct 
vhost_scsi_virtqueue, vq);
-   vq = q - vs->vqs;
-   __set_bit(vq, vs->compl_bitmap);
} else
pr_err("Faulted on virtio_scsi_cmd_resp\n");
 
vhost_scsi_release_cmd_res(se_cmd);
}
 
-   vq = -1;
-   while ((vq = find_next_bit(vs->compl_bitmap, vs->dev.nvqs, vq + 1))
-   < vs->dev.nvqs)
-   vhost_signal(&vs->dev, &vs->vqs[vq].vq);
+   if (signal)
+   vhost_signal(&svq->vs->dev, &svq->vq);
 }
 
 static struct vhost_scsi_cmd *
@@ -1795,6 +1792,7 @@ static int vhost_scsi_set_features(struct vhost_scsi *vs, 
u64 features)
 
 static int vhost_scsi_open(struct inode *inode, struct file *f)
 {
+   struct vhost_scsi_virtqueue *svq;
struct vhost_scsi *vs;
struct vhost_virtqueue **vqs;
int r = -ENOMEM, i, nvqs = vhost_scsi_max_io_vqs;
@@ -1813,10 +1811,6 @@ static int vhost_scsi_open(s

[PATCH v6 06/11] vhost-sock: convert to vhost_vq_work_queue

2023-03-27 Thread Mike Christie
Convert from vhost_work_queue to vhost_vq_work_queue.

Signed-off-by: Mike Christie 
---
 drivers/vhost/vsock.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/vhost/vsock.c b/drivers/vhost/vsock.c
index c8e6087769a1..1dcbc8669f95 100644
--- a/drivers/vhost/vsock.c
+++ b/drivers/vhost/vsock.c
@@ -285,7 +285,7 @@ vhost_transport_send_pkt(struct sk_buff *skb)
atomic_inc(&vsock->queued_replies);
 
virtio_vsock_skb_queue_tail(&vsock->send_pkt_queue, skb);
-   vhost_work_queue(&vsock->dev, &vsock->send_pkt_work);
+   vhost_vq_work_queue(&vsock->vqs[VSOCK_VQ_RX], &vsock->send_pkt_work);
 
rcu_read_unlock();
return len;
@@ -582,7 +582,7 @@ static int vhost_vsock_start(struct vhost_vsock *vsock)
/* Some packets may have been queued before the device was started,
 * let's kick the send worker to send them.
 */
-   vhost_work_queue(&vsock->dev, &vsock->send_pkt_work);
+   vhost_vq_work_queue(&vsock->vqs[VSOCK_VQ_RX], &vsock->send_pkt_work);
 
mutex_unlock(&vsock->dev.mutex);
return 0;
-- 
2.25.1



[PATCH v6 04/11] vhost: take worker or vq instead of dev for flushing

2023-03-27 Thread Mike Christie
This patch has the core work flush function take a worker. When we
support multiple workers we can then flush each worker during device
removal, stoppage, etc.

Signed-off-by: Mike Christie 
---
 drivers/vhost/vhost.c | 24 +++-
 1 file changed, 15 insertions(+), 9 deletions(-)

diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
index cc2628ba9a77..6160aa1cc922 100644
--- a/drivers/vhost/vhost.c
+++ b/drivers/vhost/vhost.c
@@ -247,6 +247,20 @@ static void vhost_work_queue_on(struct vhost_worker 
*worker,
}
 }
 
+static void vhost_work_flush_on(struct vhost_worker *worker)
+{
+   struct vhost_flush_struct flush;
+
+   if (!worker)
+   return;
+
+   init_completion(&flush.wait_event);
+   vhost_work_init(&flush.work, vhost_flush_work);
+
+   vhost_work_queue_on(worker, &flush.work);
+   wait_for_completion(&flush.wait_event);
+}
+
 void vhost_work_queue(struct vhost_dev *dev, struct vhost_work *work)
 {
vhost_work_queue_on(dev->worker, work);
@@ -261,15 +275,7 @@ EXPORT_SYMBOL_GPL(vhost_vq_work_queue);
 
 void vhost_dev_flush(struct vhost_dev *dev)
 {
-   struct vhost_flush_struct flush;
-
-   if (dev->worker) {
-   init_completion(&flush.wait_event);
-   vhost_work_init(&flush.work, vhost_flush_work);
-
-   vhost_work_queue(dev, &flush.work);
-   wait_for_completion(&flush.wait_event);
-   }
+   vhost_work_flush_on(dev->worker);
 }
 EXPORT_SYMBOL_GPL(vhost_dev_flush);
 
-- 
2.25.1



[PATCH v6 09/11] vhost: remove vhost_work_queue

2023-03-27 Thread Mike Christie
vhost_work_queue is no longer used. Each driver is using the poll or vq
based queueing, so remove vhost_work_queue.

Signed-off-by: Mike Christie 
---
 drivers/vhost/vhost.c | 6 --
 drivers/vhost/vhost.h | 1 -
 2 files changed, 7 deletions(-)

diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
index 6968f8fc17e8..f812daf25648 100644
--- a/drivers/vhost/vhost.c
+++ b/drivers/vhost/vhost.c
@@ -263,12 +263,6 @@ static void vhost_work_flush_on(struct vhost_worker 
*worker)
wait_for_completion(&flush.wait_event);
 }
 
-void vhost_work_queue(struct vhost_dev *dev, struct vhost_work *work)
-{
-   vhost_work_queue_on(dev->worker, work);
-}
-EXPORT_SYMBOL_GPL(vhost_work_queue);
-
 void vhost_vq_work_queue(struct vhost_virtqueue *vq, struct vhost_work *work)
 {
vhost_work_queue_on(vq->worker, work);
diff --git a/drivers/vhost/vhost.h b/drivers/vhost/vhost.h
index d9b8abbe3a26..ef55fae2517c 100644
--- a/drivers/vhost/vhost.h
+++ b/drivers/vhost/vhost.h
@@ -45,7 +45,6 @@ struct vhost_poll {
 };
 
 void vhost_work_init(struct vhost_work *work, vhost_work_fn_t fn);
-void vhost_work_queue(struct vhost_dev *dev, struct vhost_work *work);
 
 void vhost_poll_init(struct vhost_poll *poll, vhost_work_fn_t fn,
 __poll_t mask, struct vhost_dev *dev,
-- 
2.25.1



[PATCH v6 08/11] vhost-scsi: convert to vhost_vq_work_queue

2023-03-27 Thread Mike Christie
Convert from vhost_work_queue to vhost_vq_work_queue.

Signed-off-by: Mike Christie 
---
 drivers/vhost/scsi.c | 18 +-
 1 file changed, 9 insertions(+), 9 deletions(-)

diff --git a/drivers/vhost/scsi.c b/drivers/vhost/scsi.c
index ecb5cd7450b8..3e86b5fbeca6 100644
--- a/drivers/vhost/scsi.c
+++ b/drivers/vhost/scsi.c
@@ -363,8 +363,9 @@ static void vhost_scsi_release_cmd(struct se_cmd *se_cmd)
if (se_cmd->se_cmd_flags & SCF_SCSI_TMR_CDB) {
struct vhost_scsi_tmf *tmf = container_of(se_cmd,
struct vhost_scsi_tmf, se_cmd);
+   struct vhost_virtqueue *vq = &tmf->svq->vq;
 
-   vhost_work_queue(&tmf->vhost->dev, &tmf->vwork);
+   vhost_vq_work_queue(vq, &tmf->vwork);
} else {
struct vhost_scsi_cmd *cmd = container_of(se_cmd,
struct vhost_scsi_cmd, tvc_se_cmd);
@@ -1357,11 +1358,9 @@ static void vhost_scsi_ctl_handle_kick(struct vhost_work 
*work)
 }
 
 static void
-vhost_scsi_send_evt(struct vhost_scsi *vs,
-  struct vhost_scsi_tpg *tpg,
-  struct se_lun *lun,
-  u32 event,
-  u32 reason)
+vhost_scsi_send_evt(struct vhost_scsi *vs, struct vhost_virtqueue *vq,
+   struct vhost_scsi_tpg *tpg, struct se_lun *lun,
+   u32 event, u32 reason)
 {
struct vhost_scsi_evt *evt;
 
@@ -1383,7 +1382,7 @@ vhost_scsi_send_evt(struct vhost_scsi *vs,
}
 
llist_add(&evt->list, &vs->vs_event_list);
-   vhost_work_queue(&vs->dev, &vs->vs_event_work);
+   vhost_vq_work_queue(vq, &vs->vs_event_work);
 }
 
 static void vhost_scsi_evt_handle_kick(struct vhost_work *work)
@@ -1397,7 +1396,8 @@ static void vhost_scsi_evt_handle_kick(struct vhost_work 
*work)
goto out;
 
if (vs->vs_events_missed)
-   vhost_scsi_send_evt(vs, NULL, NULL, VIRTIO_SCSI_T_NO_EVENT, 0);
+   vhost_scsi_send_evt(vs, vq, NULL, NULL, VIRTIO_SCSI_T_NO_EVENT,
+   0);
 out:
mutex_unlock(&vq->mutex);
 }
@@ -2016,7 +2016,7 @@ vhost_scsi_do_plug(struct vhost_scsi_tpg *tpg,
goto unlock;
 
if (vhost_has_feature(vq, VIRTIO_SCSI_F_HOTPLUG))
-   vhost_scsi_send_evt(vs, tpg, lun,
+   vhost_scsi_send_evt(vs, vq, tpg, lun,
   VIRTIO_SCSI_T_TRANSPORT_RESET, reason);
 unlock:
mutex_unlock(&vq->mutex);
-- 
2.25.1



[PATCH v6 00/11] vhost: multiple worker support

2023-03-27 Thread Mike Christie
The following patches were built over linux-next which contains various
vhost patches in mst's tree and the vhost_task patchset in Christian
Brauner's tree:

git://git.kernel.org/pub/scm/linux/kernel/git/brauner/linux.git

kernel.user_worker branch:

https://git.kernel.org/pub/scm/linux/kernel/git/brauner/linux.git/log/?h=kernel.user_worker

The latter patchset handles the review comment for the patches in thread
to make sure that worker threads we create are accounted for in the parent
process's NPROC limit. The patches are scheduled to be sent to Linus for
6.4.

The patches in this patchset allow us to support multiple vhost workers
per device. The design is a modified version of Stefan's original idea
where userspace has the kernel create a worker and we pass back the pid.
In this version instead of passing the pid between user/kernel space we
use a worker_id which is just an integer managed by the vhost driver and
we allow userspace to create and free workers and then attach them to
virtqueues at setup time.
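
To make that concrete, the intended userspace flow is roughly the sketch
below. VHOST_NEW_WORKER/VHOST_FREE_WORKER are the commands referenced in
patch 11; the attach command name and the struct layouts here are just
shorthand for illustration, see the uapi headers in that patch for the
real definitions:

	struct vhost_worker_state w = {};
	struct vhost_vring_worker vw = {};

	ioctl(vhost_fd, VHOST_SET_OWNER, NULL);	/* default worker is created */
	ioctl(vhost_fd, VHOST_NEW_WORKER, &w);	/* kernel fills in w.worker_id */

	vw.index = vq_index;
	vw.worker_id = w.worker_id;
	ioctl(vhost_fd, VHOST_ATTACH_VRING_WORKER, &vw); /* bind vq to worker */

	/* ... run I/O; workers no longer needed can be released with ... */
	ioctl(vhost_fd, VHOST_FREE_WORKER, &w);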

All review comments from the past reviews should be handled. If I didn't
reply to a review comment, I agreed with the comment and should have
handled it in this posting. Let me know if I missed one.

Results:


fio jobs        1      2      4      8      12     16
------------------------------------------------------
1 worker        160k   488k   -      -      -      -
worker per vq   160k   310k   620k   1300k  1836k  2326k

Notes:
0. This used a simple fio command:

fio --filename=/dev/sdb  --direct=1 --rw=randrw --bs=4k \
--ioengine=libaio --iodepth=128  --numjobs=$JOBS_ABOVE

and I used a VM with 16 vCPUs and 16 virtqueues.

1. The patches were tested with LIO's emulate_pr=0 which drops the
LIO PR lock use. This was a bottleneck at around 12 vqs/jobs.

2. Because we have a hard limit of 1024 cmds, if the num jobs * iodepth
was greater than 1024, I would decrease iodepth. So 12 jobs used 85 cmds,
and 16 used 64.

3. The perf issue above at 2 jobs is because when we only have 1 worker
we execute more cmds per vhost_work due to all vqs funneling to one worker.
This results in fewer context switches and better batching without having to
tweak any settings. I'm working on patches to add back batching during lio
completion and do polling on the submission side.

We will still want the threading patches, because if we batch at the fio
level plus use the vhost threading patches, we can see a big boost like
below. So hopefully doing it in the kernel will allow apps to just work
without having to be smart like fio.

fio using io_uring and batching with the iodepth_batch* settings:

fio jobs        1      2      4      8      12     16
------------------------------------------------------
1 worker        494k   520k   -      -      -      -
worker per vq   496k   878k   1542k  2436k  2304k  2590k
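
For reference, the kind of invocation that exercises those settings looks
like this (illustrative values; the exact batch sizes used for the table
above aren't recorded here):

fio --filename=/dev/sdb  --direct=1 --rw=randrw --bs=4k \
--ioengine=io_uring --iodepth=128  --numjobs=$JOBS_ABOVE \
--iodepth_batch_submit=16 --iodepth_batch_complete_min=16 \
--iodepth_batch_complete_max=32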


V6:
- Rebase against vhost_task patchset.
- Used xa instead of idr.
V5:
- Rebase against user_worker patchset.
- Rebase against flush patchset.
- Redo vhost-scsi tmf flush handling so it doesn't access vq->worker.
V4:
- fix vhost-sock VSOCK_VQ_RX use.
- name functions called directly by ioctl cmd's to match the ioctl cmd.
- break up VHOST_SET_VRING_WORKER into a new, free and attach cmd.
- document worker lifetime, and cgroup, namespace, mm, rlimit
inheritance, make it clear we currently only support sharing within the
device.
- add support to attach workers while IO is running.
- instead of passing a pid_t of the kernel thread, pass an int allocated
by the vhost layer with an idr.

V3:
- fully convert vhost code to use vq based APIs instead of leaving it
half per dev and half per vq.
- rebase against kernel worker API.
- Drop delayed worker creation. We always create the default worker at
VHOST_SET_OWNER time. Userspace can create and bind workers after that.

V2:
- change loop that we take a refcount to the worker in
- replaced pid == -1 with define.
- fixed tabbing/spacing coding style issue
- use hash instead of list to lookup workers.
- I dropped the patch that added an ioctl cmd to get a vq's worker's
pid. I saw we might do a generic netlink interface instead.




[PATCH v6 01/11] vhost: add vhost_worker pointer to vhost_virtqueue

2023-03-27 Thread Mike Christie
This patchset allows userspace to map vqs to different workers. This
patch adds a worker pointer to the vq so we can store that info.

Signed-off-by: Mike Christie 
---
 drivers/vhost/vhost.c | 24 +---
 drivers/vhost/vhost.h |  1 +
 2 files changed, 14 insertions(+), 11 deletions(-)

diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
index 4368ee9b999c..e041e116afee 100644
--- a/drivers/vhost/vhost.c
+++ b/drivers/vhost/vhost.c
@@ -486,6 +486,7 @@ void vhost_dev_init(struct vhost_dev *dev,
vq->log = NULL;
vq->indirect = NULL;
vq->heads = NULL;
+   vq->worker = NULL;
vq->dev = dev;
mutex_init(&vq->mutex);
vhost_vq_reset(dev, vq);
@@ -554,16 +555,15 @@ static void vhost_worker_free(struct vhost_dev *dev)
kfree(worker);
 }
 
-static int vhost_worker_create(struct vhost_dev *dev)
+static struct vhost_worker *vhost_worker_create(struct vhost_dev *dev)
 {
struct vhost_worker *worker;
struct vhost_task *vtsk;
char name[TASK_COMM_LEN];
-   int ret;
 
worker = kzalloc(sizeof(*worker), GFP_KERNEL_ACCOUNT);
if (!worker)
-   return -ENOMEM;
+   return NULL;
 
dev->worker = worker;
worker->kcov_handle = kcov_common_handle();
@@ -571,25 +571,24 @@ static int vhost_worker_create(struct vhost_dev *dev)
snprintf(name, sizeof(name), "vhost-%d", current->pid);
 
vtsk = vhost_task_create(vhost_worker, worker, name);
-   if (!vtsk) {
-   ret = -ENOMEM;
+   if (!vtsk)
goto free_worker;
-   }
 
worker->vtsk = vtsk;
vhost_task_start(vtsk);
-   return 0;
+   return worker;
 
 free_worker:
kfree(worker);
dev->worker = NULL;
-   return ret;
+   return NULL;
 }
 
 /* Caller should have device mutex */
 long vhost_dev_set_owner(struct vhost_dev *dev)
 {
-   int err;
+   struct vhost_worker *worker;
+   int err, i;
 
/* Is there an owner already? */
if (vhost_dev_has_owner(dev)) {
@@ -600,9 +599,12 @@ long vhost_dev_set_owner(struct vhost_dev *dev)
vhost_attach_mm(dev);
 
if (dev->use_worker) {
-   err = vhost_worker_create(dev);
-   if (err)
+   worker = vhost_worker_create(dev);
+   if (!worker)
goto err_worker;
+
+   for (i = 0; i < dev->nvqs; i++)
+   dev->vqs[i]->worker = worker;
}
 
err = vhost_dev_alloc_iovecs(dev);
diff --git a/drivers/vhost/vhost.h b/drivers/vhost/vhost.h
index 0308638cdeee..e72b665ba3a5 100644
--- a/drivers/vhost/vhost.h
+++ b/drivers/vhost/vhost.h
@@ -74,6 +74,7 @@ struct vhost_vring_call {
 /* The virtqueue structure describes a queue attached to a device. */
 struct vhost_virtqueue {
struct vhost_dev *dev;
+   struct vhost_worker *worker;
 
/* The actual ring of buffers. */
struct mutex mutex;
-- 
2.25.1



[PATCH v6 05/11] vhost: convert poll work to be vq based

2023-03-27 Thread Mike Christie
This has the drivers pass in their poll to vq mapping and then converts
the core poll code to use the vq based helpers. In the next patches we
will allow vqs to be handled by different workers, so to allow drivers
to execute operations like queue, stop, flush, etc on specific polls/vqs
we need to know the mappings.

Signed-off-by: Mike Christie 
---
 drivers/vhost/net.c   | 6 --
 drivers/vhost/vhost.c | 8 +---
 drivers/vhost/vhost.h | 4 +++-
 3 files changed, 12 insertions(+), 6 deletions(-)

diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
index 8ed63651b9eb..4a9b757071a2 100644
--- a/drivers/vhost/net.c
+++ b/drivers/vhost/net.c
@@ -1342,8 +1342,10 @@ static int vhost_net_open(struct inode *inode, struct 
file *f)
   VHOST_NET_PKT_WEIGHT, VHOST_NET_WEIGHT, true,
   NULL);
 
-   vhost_poll_init(n->poll + VHOST_NET_VQ_TX, handle_tx_net, EPOLLOUT, 
dev);
-   vhost_poll_init(n->poll + VHOST_NET_VQ_RX, handle_rx_net, EPOLLIN, dev);
+   vhost_poll_init(n->poll + VHOST_NET_VQ_TX, handle_tx_net, EPOLLOUT, dev,
+   vqs[VHOST_NET_VQ_TX]);
+   vhost_poll_init(n->poll + VHOST_NET_VQ_RX, handle_rx_net, EPOLLIN, dev,
+   vqs[VHOST_NET_VQ_RX]);
 
f->private_data = n;
n->page_frag.page = NULL;
diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
index 6160aa1cc922..6968f8fc17e8 100644
--- a/drivers/vhost/vhost.c
+++ b/drivers/vhost/vhost.c
@@ -187,13 +187,15 @@ EXPORT_SYMBOL_GPL(vhost_work_init);
 
 /* Init poll structure */
 void vhost_poll_init(struct vhost_poll *poll, vhost_work_fn_t fn,
-__poll_t mask, struct vhost_dev *dev)
+__poll_t mask, struct vhost_dev *dev,
+struct vhost_virtqueue *vq)
 {
init_waitqueue_func_entry(&poll->wait, vhost_poll_wakeup);
init_poll_funcptr(&poll->table, vhost_poll_func);
poll->mask = mask;
poll->dev = dev;
poll->wqh = NULL;
+   poll->vq = vq;
 
vhost_work_init(&poll->work, fn);
 }
@@ -288,7 +290,7 @@ EXPORT_SYMBOL_GPL(vhost_vq_has_work);
 
 void vhost_poll_queue(struct vhost_poll *poll)
 {
-   vhost_work_queue(poll->dev, &poll->work);
+   vhost_vq_work_queue(poll->vq, &poll->work);
 }
 EXPORT_SYMBOL_GPL(vhost_poll_queue);
 
@@ -510,7 +512,7 @@ void vhost_dev_init(struct vhost_dev *dev,
vhost_vq_reset(dev, vq);
if (vq->handle_kick)
vhost_poll_init(&vq->poll, vq->handle_kick,
-   EPOLLIN, dev);
+   EPOLLIN, dev, vq);
}
 }
 EXPORT_SYMBOL_GPL(vhost_dev_init);
diff --git a/drivers/vhost/vhost.h b/drivers/vhost/vhost.h
index b64ee4ef387d..d9b8abbe3a26 100644
--- a/drivers/vhost/vhost.h
+++ b/drivers/vhost/vhost.h
@@ -41,13 +41,15 @@ struct vhost_poll {
struct vhost_work   work;
__poll_tmask;
struct vhost_dev*dev;
+   struct vhost_virtqueue  *vq;
 };
 
 void vhost_work_init(struct vhost_work *work, vhost_work_fn_t fn);
 void vhost_work_queue(struct vhost_dev *dev, struct vhost_work *work);
 
 void vhost_poll_init(struct vhost_poll *poll, vhost_work_fn_t fn,
-__poll_t mask, struct vhost_dev *dev);
+__poll_t mask, struct vhost_dev *dev,
+struct vhost_virtqueue *vq);
 int vhost_poll_start(struct vhost_poll *poll, struct file *file);
 void vhost_poll_stop(struct vhost_poll *poll);
 void vhost_poll_queue(struct vhost_poll *poll);
-- 
2.25.1



[PATCH v6 02/11] vhost, vhost-net: add helper to check if vq has work

2023-03-27 Thread Mike Christie
In the next patches each vq might have different workers so one could
have work but others do not. For net, we only want to check specific vqs,
so this adds a helper to check if a vq has work pending and converts
vhost-net to use it.

Signed-off-by: Mike Christie 
---
 drivers/vhost/net.c   | 2 +-
 drivers/vhost/vhost.c | 6 +++---
 drivers/vhost/vhost.h | 2 +-
 3 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
index 07181cd8d52e..8ed63651b9eb 100644
--- a/drivers/vhost/net.c
+++ b/drivers/vhost/net.c
@@ -546,7 +546,7 @@ static void vhost_net_busy_poll(struct vhost_net *net,
endtime = busy_clock() + busyloop_timeout;
 
while (vhost_can_busy_poll(endtime)) {
-   if (vhost_has_work(&net->dev)) {
+   if (vhost_vq_has_work(vq)) {
*busyloop_intr = true;
break;
}
diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
index e041e116afee..6567aed69ebb 100644
--- a/drivers/vhost/vhost.c
+++ b/drivers/vhost/vhost.c
@@ -262,11 +262,11 @@ void vhost_work_queue(struct vhost_dev *dev, struct 
vhost_work *work)
 EXPORT_SYMBOL_GPL(vhost_work_queue);
 
 /* A lockless hint for busy polling code to exit the loop */
-bool vhost_has_work(struct vhost_dev *dev)
+bool vhost_vq_has_work(struct vhost_virtqueue *vq)
 {
-   return dev->worker && !llist_empty(&dev->worker->work_list);
+   return vq->worker && !llist_empty(&vq->worker->work_list);
 }
-EXPORT_SYMBOL_GPL(vhost_has_work);
+EXPORT_SYMBOL_GPL(vhost_vq_has_work);
 
 void vhost_poll_queue(struct vhost_poll *poll)
 {
diff --git a/drivers/vhost/vhost.h b/drivers/vhost/vhost.h
index e72b665ba3a5..0dde119fb0ee 100644
--- a/drivers/vhost/vhost.h
+++ b/drivers/vhost/vhost.h
@@ -45,7 +45,6 @@ struct vhost_poll {
 
 void vhost_work_init(struct vhost_work *work, vhost_work_fn_t fn);
 void vhost_work_queue(struct vhost_dev *dev, struct vhost_work *work);
-bool vhost_has_work(struct vhost_dev *dev);
 
 void vhost_poll_init(struct vhost_poll *poll, vhost_work_fn_t fn,
 __poll_t mask, struct vhost_dev *dev);
@@ -195,6 +194,7 @@ int vhost_get_vq_desc(struct vhost_virtqueue *,
  struct vhost_log *log, unsigned int *log_num);
 void vhost_discard_vq_desc(struct vhost_virtqueue *, int n);
 
+bool vhost_vq_has_work(struct vhost_virtqueue *vq);
 bool vhost_vq_is_setup(struct vhost_virtqueue *vq);
 int vhost_vq_init_access(struct vhost_virtqueue *);
 int vhost_add_used(struct vhost_virtqueue *, unsigned int head, int len);
-- 
2.25.1



[PATCH v6 03/11] vhost: take worker or vq instead of dev for queueing

2023-03-27 Thread Mike Christie
This patch has the core work queueing function take a worker for when we
support multiple workers. It also adds a helper that takes a vq during
queueing so modules can control which vq/worker to queue work on.

This temporarily leaves vhost_work_queue. It will be removed when the drivers
are converted in the next patches.

Signed-off-by: Mike Christie 
---
 drivers/vhost/vhost.c | 44 +++
 drivers/vhost/vhost.h |  1 +
 2 files changed, 29 insertions(+), 16 deletions(-)

diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
index 6567aed69ebb..cc2628ba9a77 100644
--- a/drivers/vhost/vhost.c
+++ b/drivers/vhost/vhost.c
@@ -231,6 +231,34 @@ void vhost_poll_stop(struct vhost_poll *poll)
 }
 EXPORT_SYMBOL_GPL(vhost_poll_stop);
 
+static void vhost_work_queue_on(struct vhost_worker *worker,
+   struct vhost_work *work)
+{
+   if (!worker)
+   return;
+
+   if (!test_and_set_bit(VHOST_WORK_QUEUED, &work->flags)) {
+   /* We can only add the work to the list after we're
+* sure it was not in the list.
+* test_and_set_bit() implies a memory barrier.
+*/
+   llist_add(&work->node, &worker->work_list);
+   wake_up_process(worker->vtsk->task);
+   }
+}
+
+void vhost_work_queue(struct vhost_dev *dev, struct vhost_work *work)
+{
+   vhost_work_queue_on(dev->worker, work);
+}
+EXPORT_SYMBOL_GPL(vhost_work_queue);
+
+void vhost_vq_work_queue(struct vhost_virtqueue *vq, struct vhost_work *work)
+{
+   vhost_work_queue_on(vq->worker, work);
+}
+EXPORT_SYMBOL_GPL(vhost_vq_work_queue);
+
 void vhost_dev_flush(struct vhost_dev *dev)
 {
struct vhost_flush_struct flush;
@@ -245,22 +273,6 @@ void vhost_dev_flush(struct vhost_dev *dev)
 }
 EXPORT_SYMBOL_GPL(vhost_dev_flush);
 
-void vhost_work_queue(struct vhost_dev *dev, struct vhost_work *work)
-{
-   if (!dev->worker)
-   return;
-
-   if (!test_and_set_bit(VHOST_WORK_QUEUED, &work->flags)) {
-   /* We can only add the work to the list after we're
-* sure it was not in the list.
-* test_and_set_bit() implies a memory barrier.
-*/
-   llist_add(&work->node, &dev->worker->work_list);
-   wake_up_process(dev->worker->vtsk->task);
-   }
-}
-EXPORT_SYMBOL_GPL(vhost_work_queue);
-
 /* A lockless hint for busy polling code to exit the loop */
 bool vhost_vq_has_work(struct vhost_virtqueue *vq)
 {
diff --git a/drivers/vhost/vhost.h b/drivers/vhost/vhost.h
index 0dde119fb0ee..b64ee4ef387d 100644
--- a/drivers/vhost/vhost.h
+++ b/drivers/vhost/vhost.h
@@ -194,6 +194,7 @@ int vhost_get_vq_desc(struct vhost_virtqueue *,
  struct vhost_log *log, unsigned int *log_num);
 void vhost_discard_vq_desc(struct vhost_virtqueue *, int n);
 
+void vhost_vq_work_queue(struct vhost_virtqueue *vq, struct vhost_work *work);
 bool vhost_vq_has_work(struct vhost_virtqueue *vq);
 bool vhost_vq_is_setup(struct vhost_virtqueue *vq);
 int vhost_vq_init_access(struct vhost_virtqueue *);
-- 
2.25.1



9p regression (Was: [PATCH v2] virtio_ring: don't update event idx on get_buf)

2023-03-27 Thread Dominique Martinet
Hi Michael, Albert,

Albert Huang wrote on Sat, Mar 25, 2023 at 06:56:33PM +0800:
> in virtio_net, if we disable the napi_tx, when we trigger a tx interrupt,
> the vq->event_triggered will be set to true. It will no longer be set to
> false. Unless we explicitly call virtqueue_enable_cb_delayed or
> virtqueue_enable_cb_prepare.

This patch (committed as 35395770f803 ("virtio_ring: don't update event
idx on get_buf") in next-20230327 apparently breaks 9p, as reported by
Luis in https://lkml.kernel.org/r/zci+7wg5oclsl...@bombadil.infradead.org

I've just had a look at recent patches[1] and reverted this to test
and I can mount again, so I'm pretty sure this is the culprit, but I
didn't look at the content at all yet so cannot advise further.
It might very well be that we need some extra handling for 9p
specifically that can be added separately if required.

[1] git log 0ec57cfa721fbd36b4c4c0d9ccc5d78a78f7fa35..HEAD drivers/virtio/


This can be reproduced with a simple mount, run qemu with some -virtfs
argument and `mount -t 9p -o debug=65535 tag mountpoint` will hang after
these messages:
9pnet: -- p9_virtio_request (83): 9p debug: virtio request
9pnet: -- p9_virtio_request (83): virtio request kicked

So I suspect we're just not getting a callback.


I'll have a closer look after work, but any advice meanwhile will be
appreciated!
(I'm sure Luis would also like a temporary drop from -next until
this is figured out, but I'll leave this up to you)


> 
> If we disable the napi_tx, it will only be called when the tx ring
> buffer is relatively small.
> 
> Because event_triggered is true. Therefore, VRING_AVAIL_F_NO_INTERRUPT or
> VRING_PACKED_EVENT_FLAG_DISABLE will not be set. So we update
> vring_used_event(&vq->split.vring) or vq->packed.vring.driver->off_wrap
> every time we call virtqueue_get_buf_ctx. This will bring more interruptions.
> 
> To summarize:
> 1) event_triggered was set to true in vring_interrupt()
> 2) after this nothing will happen for virtqueue_disable_cb() so
>VRING_AVAIL_F_NO_INTERRUPT is not set in avail_flags_shadow
> 3) virtqueue_get_buf_ctx_split() will still think the cb is enabled
>then it tries to publish new event
> 
> To fix, if event_triggered is set to true, do not update
> vring_used_event(&vq->split.vring) or vq->packed.vring.driver->off_wrap
> 
> Tested with iperf:
> iperf3 tcp stream:
> vm1 -> vm2
> vm2 just receives tcp data stream from vm1, and sends the ack to vm1,
> there are many tx interrupts in vm2.
> but without event_triggered there are just a few tx interrupts.
> 
> Fixes: 8d622d21d248 ("virtio: fix up virtio_disable_cb")
> Signed-off-by: Albert Huang 
> Message-Id: <20230321085953.24949-1-huangjie.alb...@bytedance.com>
> Signed-off-by: Michael S. Tsirkin 
> ---
>  drivers/virtio/virtio_ring.c | 6 --
>  1 file changed, 4 insertions(+), 2 deletions(-)
> 
> diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c
> index cbeeea1b0439..1c36fa477966 100644
> --- a/drivers/virtio/virtio_ring.c
> +++ b/drivers/virtio/virtio_ring.c
> @@ -914,7 +914,8 @@ static void *virtqueue_get_buf_ctx_split(struct virtqueue 
> *_vq,
>   /* If we expect an interrupt for the next entry, tell host
>* by writing event index and flush out the write before
>* the read in the next get_buf call. */
> - if (!(vq->split.avail_flags_shadow & VRING_AVAIL_F_NO_INTERRUPT))
> + if (unlikely(!(vq->split.avail_flags_shadow & 
> VRING_AVAIL_F_NO_INTERRUPT) &&
> +  !vq->event_triggered))
>   virtio_store_mb(vq->weak_barriers,
>   &vring_used_event(&vq->split.vring),
>   cpu_to_virtio16(_vq->vdev, vq->last_used_idx));
> @@ -1744,7 +1745,8 @@ static void *virtqueue_get_buf_ctx_packed(struct 
> virtqueue *_vq,
>* by writing event index and flush out the write before
>* the read in the next get_buf call.
>*/
> - if (vq->packed.event_flags_shadow == VRING_PACKED_EVENT_FLAG_DESC)
> + if (unlikely(vq->packed.event_flags_shadow == 
> VRING_PACKED_EVENT_FLAG_DESC &&
> +  !vq->event_triggered))
>   virtio_store_mb(vq->weak_barriers,
>   &vq->packed.vring.driver->off_wrap,
>   cpu_to_le16(vq->last_used_idx));


Re: [PATCH] vdpa/mlx5: Add and remove debugfs in setup/teardown driver

2023-03-27 Thread Michael S. Tsirkin
On Sun, Mar 26, 2023 at 04:18:19PM +0300, Eli Cohen wrote:
> The right place to add the debufs create is in

s/debufs/debugfs/

> setup_driver() and remove it in teardown_driver().
> 
> Current code adds the debugfs when creating the device but resetting a
> device will remove the debugfs subtree and subsequent set_driver will
> not be able to create the files since the debugfs pointer is NULL.
> 
> Fixes: 294221004322 ("vdpa/mlx5: Add debugfs subtree")
> Signed-off-by: Eli Cohen 
> ---
>  drivers/vdpa/mlx5/net/mlx5_vnet.c | 4 +++-
>  1 file changed, 3 insertions(+), 1 deletion(-)
> 
> diff --git a/drivers/vdpa/mlx5/net/mlx5_vnet.c 
> b/drivers/vdpa/mlx5/net/mlx5_vnet.c
> index 2805d58378fb..d012732e3835 100644
> --- a/drivers/vdpa/mlx5/net/mlx5_vnet.c
> +++ b/drivers/vdpa/mlx5/net/mlx5_vnet.c
> @@ -2547,6 +2547,7 @@ static int setup_driver(struct mlx5_vdpa_dev *mvdev)
>   err = 0;
>   goto out;
>   }
> + mlx5_vdpa_add_debugfs(ndev);
>   err = setup_virtqueues(mvdev);
>   if (err) {
>   mlx5_vdpa_warn(mvdev, "setup_virtqueues\n");
> @@ -2593,6 +2594,8 @@ static void teardown_driver(struct mlx5_vdpa_net *ndev)
>   if (!ndev->setup)
>   return;
>  
> + mlx5_vdpa_remove_debugfs(ndev->debugfs);
> + ndev->debugfs = NULL;
>   teardown_steering(ndev);
>   destroy_tir(ndev);
>   destroy_rqt(ndev);
> @@ -3313,7 +3316,6 @@ static int mlx5_vdpa_dev_add(struct vdpa_mgmt_dev 
> *v_mdev, const char *name,
>   if (err)
>   goto err_reg;
>  
> - mlx5_vdpa_add_debugfs(ndev);
>   mgtdev->ndev = ndev;
>   return 0;
>  
> -- 
> 2.38.1



Re: [PATCH] vdpa/mlx5: Veirfy wq has a valid pointer in mlx5_vdpa_suspend

2023-03-27 Thread Michael S. Tsirkin
typos all over.

subject:
s/Veirfy/verify/
s/has/is/

On Sun, Mar 26, 2023 at 04:17:56PM +0300, Eli Cohen wrote:
> mlx5_vdpa_suspend() flushes the workqueue as part of its logic. However,
> if the devices has been deleted while a VM was runnig, the workqueue

s/the devices/the device/
s/runnig/running/

> would be destroyed first and wq will be null. After the VM is destroyed,
> suspend could be called and would access a null pointer.

s/could be/can be/
s/would/will/

> 
> Fix it by vrifyig wq is not NULL.

s/vrifyig/verifying/

> 
> Fixes: cae15c2ed8e6 ("vdpa/mlx5: Implement susupend virtqueue callback")
> Signed-off-by: Eli Cohen 


> ---
>  drivers/vdpa/mlx5/net/mlx5_vnet.c | 3 ++-
>  1 file changed, 2 insertions(+), 1 deletion(-)
> 
> diff --git a/drivers/vdpa/mlx5/net/mlx5_vnet.c 
> b/drivers/vdpa/mlx5/net/mlx5_vnet.c
> index 8b52961d1d25..2805d58378fb 100644
> --- a/drivers/vdpa/mlx5/net/mlx5_vnet.c
> +++ b/drivers/vdpa/mlx5/net/mlx5_vnet.c
> @@ -2938,7 +2938,8 @@ static int mlx5_vdpa_suspend(struct vdpa_device *vdev)
>   down_write(&ndev->reslock);
>   ndev->nb_registered = false;
>   mlx5_notifier_unregister(mvdev->mdev, &ndev->nb);
> - flush_workqueue(ndev->mvdev.wq);
> + if (ndev->mvdev.wq)
> + flush_workqueue(ndev->mvdev.wq);
>   for (i = 0; i < ndev->cur_num_vqs; i++) {
>   mvq = &ndev->vqs[i];
>   suspend_vq(ndev, mvq);
> -- 
> 2.38.1



Re: [GIT PULL] vdpa: bugfix

2023-03-27 Thread Michael S. Tsirkin
And the issue was that the author self-nacked the single fix here.
So we'll merge another fix, later.

On Mon, Mar 27, 2023 at 09:30:13AM -0400, Michael S. Tsirkin wrote:
> Looks like I sent a bad pull request. Sorry!
> Please disregard.
> 
> On Mon, Mar 27, 2023 at 09:19:50AM -0400, Michael S. Tsirkin wrote:
> > The following changes since commit e8d018dd0257f744ca50a729e3d042cf2ec9da65:
> > 
> >   Linux 6.3-rc3 (2023-03-19 13:27:55 -0700)
> > 
> > are available in the Git repository at:
> > 
> >   https://git.kernel.org/pub/scm/linux/kernel/git/mst/vhost.git 
> > tags/for_linus
> > 
> > for you to fetch changes up to 8fc9ce051f22581f60325fd87a0fd0f37a7b70c3:
> > 
> >   vdpa/mlx5: Remove debugfs file after device unregister (2023-03-21 
> > 16:39:02 -0400)
> > 
> > 
> > vdpa: bugfix
> > 
> > An error handling fix in mlx5.
> > 
> > Signed-off-by: Michael S. Tsirkin 
> 
> 
> 
> 
> > 
> > Eli Cohen (1):
> >   vdpa/mlx5: Remove debugfs file after device unregister
> > 
> >  drivers/vdpa/mlx5/net/mlx5_vnet.c | 4 ++--
> >  1 file changed, 2 insertions(+), 2 deletions(-)



Re: [GIT PULL] vdpa: bugfix

2023-03-27 Thread Michael S. Tsirkin
Looks like I sent a bad pull request. Sorry!
Please disregard.

On Mon, Mar 27, 2023 at 09:19:50AM -0400, Michael S. Tsirkin wrote:
> The following changes since commit e8d018dd0257f744ca50a729e3d042cf2ec9da65:
> 
>   Linux 6.3-rc3 (2023-03-19 13:27:55 -0700)
> 
> are available in the Git repository at:
> 
>   https://git.kernel.org/pub/scm/linux/kernel/git/mst/vhost.git tags/for_linus
> 
> for you to fetch changes up to 8fc9ce051f22581f60325fd87a0fd0f37a7b70c3:
> 
>   vdpa/mlx5: Remove debugfs file after device unregister (2023-03-21 16:39:02 
> -0400)
> 
> 
> vdpa: bugfix
> 
> An error handling fix in mlx5.
> 
> Signed-off-by: Michael S. Tsirkin 




> 
> Eli Cohen (1):
>   vdpa/mlx5: Remove debugfs file after device unregister
> 
>  drivers/vdpa/mlx5/net/mlx5_vnet.c | 4 ++--
>  1 file changed, 2 insertions(+), 2 deletions(-)



Re: [PATCH] vdpa/mlx5: Avoid losing link state updates

2023-03-27 Thread Michael S. Tsirkin
On Mon, Mar 27, 2023 at 03:54:34PM +0300, Eli Cohen wrote:
> 
> On 27/03/2023 15:41, Michael S. Tsirkin wrote:
> > On Mon, Mar 20, 2023 at 10:01:05AM +0200, Eli Cohen wrote:
> > > Current code ignores link state updates if VIRTIO_NET_F_STATUS was not
> > > negotiated. However, link state updates could be received before feature
> > > negotiation was completed, therefore causing link state events to be
> > > lost, possibly leaving the link state down.
> > > 
> > > Add code to detect if VIRTIO_NET_F_STATUS was set and update the link
> > > state. We add a spinlock to serialize updates to config.status to
> > > maintain its integrity.
> > > 
> > > Fixes: 033779a708f0 ("vdpa/mlx5: make MTU/STATUS presence conditional on 
> > > feature bits")
> > > Signed-off-by: Eli Cohen 
> > > ---
> > >   drivers/vdpa/mlx5/net/mlx5_vnet.c | 91 ---
> > >   drivers/vdpa/mlx5/net/mlx5_vnet.h |  2 +
> > >   2 files changed, 60 insertions(+), 33 deletions(-)
> > > 
> > > diff --git a/drivers/vdpa/mlx5/net/mlx5_vnet.c 
> > > b/drivers/vdpa/mlx5/net/mlx5_vnet.c
> > > index 520646ae7fa0..20d415e25aeb 100644
> > > --- a/drivers/vdpa/mlx5/net/mlx5_vnet.c
> > > +++ b/drivers/vdpa/mlx5/net/mlx5_vnet.c
> > > @@ -2298,10 +2298,62 @@ static void update_cvq_info(struct mlx5_vdpa_dev 
> > > *mvdev)
> > >   }
> > >   }
> > > +static bool f_status_was_set(u64 old, u64 new)
> > the name is the exact reverse of what it does, is it not?
> > it is true if status was not set and is being set.
> 
> I am tracking feature negotiation to check if VIRTIO_NET_F_STATUS *just
> became set* and unconditionally update link state.
> If you still think the name choice is not good, I can change it.

this is where you use the function currently.
yes the "new" value is currently the value that
was already saved before the call to the function.
but the function itself does not care,
and its name says nothing about what it does by itself.
it gets old and new and tells you whether VIRTIO_NET_F_STATUS
flipped from 0 in old to 1 in new.

I am not sure why you need a helper, you can write
it in a single statement as:

(~old & new & BIT_ULL(VIRTIO_NET_F_STATUS))

or here practically:

if (~cur_features & ndev->mvdev.actual_features & 
BIT_ULL(VIRTIO_NET_F_STATUS))

(maybe use a shorter name for cur_features to fit in 80 columns).
so I would just open-code, but if you do write a helper then it
should make sense by itself and be reusable.


> 
> > 
> > > +{
> > > + if (!(old & BIT_ULL(VIRTIO_NET_F_STATUS)) &&
> > > + (new & BIT_ULL(VIRTIO_NET_F_STATUS)))
> > > + return true;
> > > +
> > > + return false;
> > > +}
> > > +
> > > +static u8 query_vport_state(struct mlx5_core_dev *mdev, u8 opmod, u16 
> > > vport)
> > > +{
> > > + u32 out[MLX5_ST_SZ_DW(query_vport_state_out)] = {};
> > > + u32 in[MLX5_ST_SZ_DW(query_vport_state_in)] = {};
> > > + int err;
> > > +
> > > + MLX5_SET(query_vport_state_in, in, opcode, 
> > > MLX5_CMD_OP_QUERY_VPORT_STATE);
> > > + MLX5_SET(query_vport_state_in, in, op_mod, opmod);
> > > + MLX5_SET(query_vport_state_in, in, vport_number, vport);
> > > + if (vport)
> > > + MLX5_SET(query_vport_state_in, in, other_vport, 1);
> > > +
> > > + err = mlx5_cmd_exec_inout(mdev, query_vport_state, in, out);
> > > + if (err)
> > > + return 0;
> > > +
> > > + return MLX5_GET(query_vport_state_out, out, state);
> > > +}
> > > +
> > > +static bool get_link_state(struct mlx5_vdpa_dev *mvdev)
> > > +{
> > > + if (query_vport_state(mvdev->mdev, MLX5_VPORT_STATE_OP_MOD_VNIC_VPORT, 
> > > 0) ==
> > > + VPORT_STATE_UP)
> > > + return true;
> > > +
> > > + return false;
> > > +}
> > > +
> > > +static void update_link_state(struct mlx5_vdpa_dev *mvdev)
> > > +{
> > > + struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev);
> > > + bool up;
> > > +
> > > + up = get_link_state(mvdev);
> > > + spin_lock(&ndev->status_lock);
> > > + if (up)
> > > + ndev->config.status |= cpu_to_mlx5vdpa16(mvdev, 
> > > VIRTIO_NET_S_LINK_UP);
> > > + else
> > > + ndev->config.status &= cpu_to_mlx5vdpa16(mvdev, 
> > > ~VIRTIO_NET_S_LINK_UP);
> > > + spin_unlock(&ndev->status_lock);
> > > +}
> > > +
> > >   static int mlx5_vdpa_set_driver_features(struct vdpa_device *vdev, u64 
> > > features)
> > >   {
> > >   struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev);
> > >   struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev);
> > > + u64 cur_features;
> > >   int err;
> > >   print_features(mvdev, features, true);
> > > @@ -2310,7 +2362,11 @@ static int mlx5_vdpa_set_driver_features(struct 
> > > vdpa_device *vdev, u64 features)
> > >   if (err)
> > >   return err;
> > > + cur_features = ndev->mvdev.actual_features;
> > >   ndev->mvdev.actual_features = features & 
> > > ndev->mvdev.mlx_features;
> > > + if (f_status_was_set(cur_features, ndev->mvdev.actual_features))
> > > + update_link_state(mvdev);
> > > +
> > >   if (ndev->mvdev.actual_fe

[GIT PULL] vdpa: bugfix

2023-03-27 Thread Michael S. Tsirkin
The following changes since commit e8d018dd0257f744ca50a729e3d042cf2ec9da65:

  Linux 6.3-rc3 (2023-03-19 13:27:55 -0700)

are available in the Git repository at:

  https://git.kernel.org/pub/scm/linux/kernel/git/mst/vhost.git tags/for_linus

for you to fetch changes up to 8fc9ce051f22581f60325fd87a0fd0f37a7b70c3:

  vdpa/mlx5: Remove debugfs file after device unregister (2023-03-21 16:39:02 
-0400)


vdpa: bugfix

An error handling fix in mlx5.

Signed-off-by: Michael S. Tsirkin 


Eli Cohen (1):
  vdpa/mlx5: Remove debugfs file after device unregister

 drivers/vdpa/mlx5/net/mlx5_vnet.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)



Re: [PATCH v2] virtio_ring: don't update event idx on get_buf

2023-03-27 Thread Michael S. Tsirkin
Is the below the same as what I posted, or different? How?

On Sat, Mar 25, 2023 at 06:56:33PM +0800, Albert Huang wrote:
> in virtio_net, if we disable napi_tx, then when we trigger a tx interrupt,
> vq->event_triggered will be set to true. It will no longer be set back to
> false unless we explicitly call virtqueue_enable_cb_delayed or
> virtqueue_enable_cb_prepare.
> 
> If we disable the napi_tx, it will only be called when the tx ring
> buffer is relatively small.
> 
> Because event_triggered is true, VRING_AVAIL_F_NO_INTERRUPT or
> VRING_PACKED_EVENT_FLAG_DISABLE will not be set. So we update
> vring_used_event(&vq->split.vring) or vq->packed.vring.driver->off_wrap
> every time we call virtqueue_get_buf_ctx. This brings more interrupts.
> 
> To summarize:
> 1) event_triggered was set to true in vring_interrupt()
> 2) after this, nothing will happen in virtqueue_disable_cb(), so
>    VRING_AVAIL_F_NO_INTERRUPT is not set in avail_flags_shadow
> 3) virtqueue_get_buf_ctx_split() will still think the cb is enabled,
>    so it tries to publish a new event
> 
> To fix this, if event_triggered is set to true, do not update
> vring_used_event(&vq->split.vring) or vq->packed.vring.driver->off_wrap.
> 
> Tested with iperf:
> iperf3 tcp stream:
> vm1 -> vm2
> vm2 just receives the tcp data stream from vm1 and sends acks back to vm1;
> there are many tx interrupts in vm2,
> but without event_triggered there are just a few tx interrupts.
> 
> Fixes: 8d622d21d248 ("virtio: fix up virtio_disable_cb")
> Signed-off-by: Albert Huang 
> Message-Id: <20230321085953.24949-1-huangjie.alb...@bytedance.com>

what is this exactly?

> Signed-off-by: Michael S. Tsirkin 
> ---
>  drivers/virtio/virtio_ring.c | 6 --
>  1 file changed, 4 insertions(+), 2 deletions(-)
> 
> diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c
> index cbeeea1b0439..1c36fa477966 100644
> --- a/drivers/virtio/virtio_ring.c
> +++ b/drivers/virtio/virtio_ring.c
> @@ -914,7 +914,8 @@ static void *virtqueue_get_buf_ctx_split(struct virtqueue 
> *_vq,
>   /* If we expect an interrupt for the next entry, tell host
>* by writing event index and flush out the write before
>* the read in the next get_buf call. */
> - if (!(vq->split.avail_flags_shadow & VRING_AVAIL_F_NO_INTERRUPT))
> + if (unlikely(!(vq->split.avail_flags_shadow & 
> VRING_AVAIL_F_NO_INTERRUPT) &&
> +  !vq->event_triggered))
>   virtio_store_mb(vq->weak_barriers,
>   &vring_used_event(&vq->split.vring),
>   cpu_to_virtio16(_vq->vdev, vq->last_used_idx));
> @@ -1744,7 +1745,8 @@ static void *virtqueue_get_buf_ctx_packed(struct 
> virtqueue *_vq,
>* by writing event index and flush out the write before
>* the read in the next get_buf call.
>*/
> - if (vq->packed.event_flags_shadow == VRING_PACKED_EVENT_FLAG_DESC)
> + if (unlikely(vq->packed.event_flags_shadow == 
> VRING_PACKED_EVENT_FLAG_DESC &&
> +  !vq->event_triggered))
>   virtio_store_mb(vq->weak_barriers,
>   &vq->packed.vring.driver->off_wrap,
>   cpu_to_le16(vq->last_used_idx));
> -- 
> 2.37.0 (Apple Git-136)



Re: [PATCH] vdpa/mlx5: Avoid losing link state updates

2023-03-27 Thread Michael S. Tsirkin
On Mon, Mar 20, 2023 at 10:01:05AM +0200, Eli Cohen wrote:
> Current code ignores link state updates if VIRTIO_NET_F_STATUS was not
> negotiated. However, link state updates could be received before feature
> negotiation was completed, therefore causing link state events to be
> lost, possibly leaving the link state down.
> 
> Add code to detect if VIRTIO_NET_F_STATUS was set and update the link
> state. We add a spinlock to serialize updates to config.status to
> maintain its integrity.
> 
> Fixes: 033779a708f0 ("vdpa/mlx5: make MTU/STATUS presence conditional on 
> feature bits")
> Signed-off-by: Eli Cohen 
> ---
>  drivers/vdpa/mlx5/net/mlx5_vnet.c | 91 ---
>  drivers/vdpa/mlx5/net/mlx5_vnet.h |  2 +
>  2 files changed, 60 insertions(+), 33 deletions(-)
> 
> diff --git a/drivers/vdpa/mlx5/net/mlx5_vnet.c 
> b/drivers/vdpa/mlx5/net/mlx5_vnet.c
> index 520646ae7fa0..20d415e25aeb 100644
> --- a/drivers/vdpa/mlx5/net/mlx5_vnet.c
> +++ b/drivers/vdpa/mlx5/net/mlx5_vnet.c
> @@ -2298,10 +2298,62 @@ static void update_cvq_info(struct mlx5_vdpa_dev 
> *mvdev)
>   }
>  }
>  
> +static bool f_status_was_set(u64 old, u64 new)

the name is the exact reverse of what it does, is it not?
it is true if the status bit was not set and is now being set.

> +{
> + if (!(old & BIT_ULL(VIRTIO_NET_F_STATUS)) &&
> + (new & BIT_ULL(VIRTIO_NET_F_STATUS)))
> + return true;
> +
> + return false;
> +}
> +
> +static u8 query_vport_state(struct mlx5_core_dev *mdev, u8 opmod, u16 vport)
> +{
> + u32 out[MLX5_ST_SZ_DW(query_vport_state_out)] = {};
> + u32 in[MLX5_ST_SZ_DW(query_vport_state_in)] = {};
> + int err;
> +
> + MLX5_SET(query_vport_state_in, in, opcode, 
> MLX5_CMD_OP_QUERY_VPORT_STATE);
> + MLX5_SET(query_vport_state_in, in, op_mod, opmod);
> + MLX5_SET(query_vport_state_in, in, vport_number, vport);
> + if (vport)
> + MLX5_SET(query_vport_state_in, in, other_vport, 1);
> +
> + err = mlx5_cmd_exec_inout(mdev, query_vport_state, in, out);
> + if (err)
> + return 0;
> +
> + return MLX5_GET(query_vport_state_out, out, state);
> +}
> +
> +static bool get_link_state(struct mlx5_vdpa_dev *mvdev)
> +{
> + if (query_vport_state(mvdev->mdev, MLX5_VPORT_STATE_OP_MOD_VNIC_VPORT, 
> 0) ==
> + VPORT_STATE_UP)
> + return true;
> +
> + return false;
> +}
> +
> +static void update_link_state(struct mlx5_vdpa_dev *mvdev)
> +{
> + struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev);
> + bool up;
> +
> + up = get_link_state(mvdev);
> + spin_lock(&ndev->status_lock);
> + if (up)
> + ndev->config.status |= cpu_to_mlx5vdpa16(mvdev, 
> VIRTIO_NET_S_LINK_UP);
> + else
> + ndev->config.status &= cpu_to_mlx5vdpa16(mvdev, 
> ~VIRTIO_NET_S_LINK_UP);
> + spin_unlock(&ndev->status_lock);
> +}
> +
>  static int mlx5_vdpa_set_driver_features(struct vdpa_device *vdev, u64 
> features)
>  {
>   struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev);
>   struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev);
> + u64 cur_features;
>   int err;
>  
>   print_features(mvdev, features, true);
> @@ -2310,7 +2362,11 @@ static int mlx5_vdpa_set_driver_features(struct 
> vdpa_device *vdev, u64 features)
>   if (err)
>   return err;
>  
> + cur_features = ndev->mvdev.actual_features;
>   ndev->mvdev.actual_features = features & ndev->mvdev.mlx_features;
> + if (f_status_was_set(cur_features, ndev->mvdev.actual_features))
> + update_link_state(mvdev);
> +
>   if (ndev->mvdev.actual_features & BIT_ULL(VIRTIO_NET_F_MQ))
>   ndev->rqt_size = mlx5vdpa16_to_cpu(mvdev, 
> ndev->config.max_virtqueue_pairs);
>   else
> @@ -2995,34 +3051,6 @@ struct mlx5_vdpa_mgmtdev {
>   struct mlx5_vdpa_net *ndev;
>  };
>  
> -static u8 query_vport_state(struct mlx5_core_dev *mdev, u8 opmod, u16 vport)
> -{
> - u32 out[MLX5_ST_SZ_DW(query_vport_state_out)] = {};
> - u32 in[MLX5_ST_SZ_DW(query_vport_state_in)] = {};
> - int err;
> -
> - MLX5_SET(query_vport_state_in, in, opcode, 
> MLX5_CMD_OP_QUERY_VPORT_STATE);
> - MLX5_SET(query_vport_state_in, in, op_mod, opmod);
> - MLX5_SET(query_vport_state_in, in, vport_number, vport);
> - if (vport)
> - MLX5_SET(query_vport_state_in, in, other_vport, 1);
> -
> - err = mlx5_cmd_exec_inout(mdev, query_vport_state, in, out);
> - if (err)
> - return 0;
> -
> - return MLX5_GET(query_vport_state_out, out, state);
> -}
> -
> -static bool get_link_state(struct mlx5_vdpa_dev *mvdev)
> -{
> - if (query_vport_state(mvdev->mdev, MLX5_VPORT_STATE_OP_MOD_VNIC_VPORT, 
> 0) ==
> - VPORT_STATE_UP)
> - return true;
> -
> - return false;
> -}
> -
>  static void update_carrier(struct work_struct *work)
>  {
>   struct mlx5_vdpa_wq_ent *wqent;
> @@ -3032,11 +3060,7 @@ static void update_carrier(struct work_struct *work

Re: [PATCH] vdpa/mlx5: Avoid losing link state updates

2023-03-27 Thread Michael S. Tsirkin
On Mon, Mar 20, 2023 at 10:01:05AM +0200, Eli Cohen wrote:
> Current code ignores link state updates if VIRTIO_NET_F_STATUS was not
> negotiated. However, link state updates could be received before feature
> negotiation was completed, therefore causing link state events to be
> lost, possibly leaving the link state down.
> 
> Add code to detect if VIRTIO_NET_F_STATUS was set and update the link
> state. We add a spinlock to serialize updates to config.status to
> maintain its integrity.
> 
> Fixes: 033779a708f0 ("vdpa/mlx5: make MTU/STATUS presence conditional on 
> feature bits")
> Signed-off-by: Eli Cohen 

Hmm, I don't get it.

Link state changes are config updates, are they not?
The probe sequence is:

err = virtio_features_ok(dev);
if (err)
goto err;

err = drv->probe(dev);
if (err)
goto err;

/* If probe didn't do it, mark device DRIVER_OK ourselves. */
if (!(dev->config->get_status(dev) & VIRTIO_CONFIG_S_DRIVER_OK))
virtio_device_ready(dev);

if (drv->scan)
drv->scan(dev);

virtio_config_enable(dev);

Looks like config changes that trigger before negotiation are queued,
not lost.
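
(For reference, the queuing lives in drivers/virtio/virtio.c; what follows is
a rough, paraphrased sketch of that logic, not a verbatim quote:

	/* Config-change events are only recorded while config change
	 * handling is disabled, and replayed once it is enabled. */
	static void __virtio_config_changed(struct virtio_device *dev)
	{
		struct virtio_driver *drv = drv_to_virtio(dev->dev.driver);

		if (!dev->config_enabled)
			dev->config_change_pending = true;
		else if (drv && drv->config_changed)
			drv->config_changed(dev);
	}

	void virtio_config_enable(struct virtio_device *dev)
	{
		spin_lock_irq(&dev->config_lock);
		dev->config_enabled = true;
		if (dev->config_change_pending)
			__virtio_config_changed(dev);
		dev->config_change_pending = false;
		spin_unlock_irq(&dev->config_lock);
	}

so an update that arrives before the probe sequence above finishes is
delivered when virtio_config_enable() runs.)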

Was there an actual bug you observed or is this theoretical?

Thanks!




> ---
>  drivers/vdpa/mlx5/net/mlx5_vnet.c | 91 ---
>  drivers/vdpa/mlx5/net/mlx5_vnet.h |  2 +
>  2 files changed, 60 insertions(+), 33 deletions(-)
> 
> diff --git a/drivers/vdpa/mlx5/net/mlx5_vnet.c 
> b/drivers/vdpa/mlx5/net/mlx5_vnet.c
> index 520646ae7fa0..20d415e25aeb 100644
> --- a/drivers/vdpa/mlx5/net/mlx5_vnet.c
> +++ b/drivers/vdpa/mlx5/net/mlx5_vnet.c
> @@ -2298,10 +2298,62 @@ static void update_cvq_info(struct mlx5_vdpa_dev 
> *mvdev)
>   }
>  }
>  
> +static bool f_status_was_set(u64 old, u64 new)
> +{
> + if (!(old & BIT_ULL(VIRTIO_NET_F_STATUS)) &&
> + (new & BIT_ULL(VIRTIO_NET_F_STATUS)))
> + return true;
> +
> + return false;
> +}
> +
> +static u8 query_vport_state(struct mlx5_core_dev *mdev, u8 opmod, u16 vport)
> +{
> + u32 out[MLX5_ST_SZ_DW(query_vport_state_out)] = {};
> + u32 in[MLX5_ST_SZ_DW(query_vport_state_in)] = {};
> + int err;
> +
> + MLX5_SET(query_vport_state_in, in, opcode, 
> MLX5_CMD_OP_QUERY_VPORT_STATE);
> + MLX5_SET(query_vport_state_in, in, op_mod, opmod);
> + MLX5_SET(query_vport_state_in, in, vport_number, vport);
> + if (vport)
> + MLX5_SET(query_vport_state_in, in, other_vport, 1);
> +
> + err = mlx5_cmd_exec_inout(mdev, query_vport_state, in, out);
> + if (err)
> + return 0;
> +
> + return MLX5_GET(query_vport_state_out, out, state);
> +}
> +
> +static bool get_link_state(struct mlx5_vdpa_dev *mvdev)
> +{
> + if (query_vport_state(mvdev->mdev, MLX5_VPORT_STATE_OP_MOD_VNIC_VPORT, 
> 0) ==
> + VPORT_STATE_UP)
> + return true;
> +
> + return false;
> +}
> +
> +static void update_link_state(struct mlx5_vdpa_dev *mvdev)
> +{
> + struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev);
> + bool up;
> +
> + up = get_link_state(mvdev);
> + spin_lock(&ndev->status_lock);
> + if (up)
> + ndev->config.status |= cpu_to_mlx5vdpa16(mvdev, 
> VIRTIO_NET_S_LINK_UP);
> + else
> + ndev->config.status &= cpu_to_mlx5vdpa16(mvdev, 
> ~VIRTIO_NET_S_LINK_UP);
> + spin_unlock(&ndev->status_lock);
> +}
> +
>  static int mlx5_vdpa_set_driver_features(struct vdpa_device *vdev, u64 
> features)
>  {
>   struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev);
>   struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev);
> + u64 cur_features;
>   int err;
>  
>   print_features(mvdev, features, true);
> @@ -2310,7 +2362,11 @@ static int mlx5_vdpa_set_driver_features(struct 
> vdpa_device *vdev, u64 features)
>   if (err)
>   return err;
>  
> + cur_features = ndev->mvdev.actual_features;
>   ndev->mvdev.actual_features = features & ndev->mvdev.mlx_features;
> + if (f_status_was_set(cur_features, ndev->mvdev.actual_features))
> + update_link_state(mvdev);
> +
>   if (ndev->mvdev.actual_features & BIT_ULL(VIRTIO_NET_F_MQ))
>   ndev->rqt_size = mlx5vdpa16_to_cpu(mvdev, 
> ndev->config.max_virtqueue_pairs);
>   else
> @@ -2995,34 +3051,6 @@ struct mlx5_vdpa_mgmtdev {
>   struct mlx5_vdpa_net *ndev;
>  };
>  
> -static u8 query_vport_state(struct mlx5_core_dev *mdev, u8 opmod, u16 vport)
> -{
> - u32 out[MLX5_ST_SZ_DW(query_vport_state_out)] = {};
> - u32 in[MLX5_ST_SZ_DW(query_vport_state_in)] = {};
> - int err;
> -
> - MLX5_SET(query_vport_state_in, in, opcode, 
> MLX5_CMD_OP_QUERY_VPORT_STATE);
> - MLX5_SET(query_vport_state_in, in, op_mod, opmod);
> - MLX5_SET(query_vport_state_in, in, vport_number, vport);
> - if (vport)
> - MLX5_SET(query_vport_state_in, in, other_vport, 1);
> -
> - err = mlx5_cmd_exec

Re: [PATCH 0/2] vdpa/snet: support [s/g]et_vq_state and suspend

2023-03-27 Thread Michael S. Tsirkin
On Mon, Mar 27, 2023 at 06:49:32AM +, Alvaro Karsz wrote:
> ping

Tagged. In the future, please CC everyone on the cover letter too.



Re: [syzbot] [net?] [virt?] [io-uring?] [kvm?] BUG: soft lockup in vsock_connect

2023-03-27 Thread Stefano Garzarella
On Sat, Mar 25, 2023 at 1:44 AM Bobby Eshleman  wrote:
>
> On Fri, Mar 24, 2023 at 09:38:38AM +0100, Stefano Garzarella wrote:
> > Hi Bobby,
> > FYI we have also this one, but it seems related to
> > syzbot+befff0a9536049e79...@syzkaller.appspotmail.com
> >
> > Thanks,
> > Stefano
> >
>
> Got it, I'll look into it.

I think it is related to
syzbot+befff0a9536049e79...@syzkaller.appspotmail.com, so I tested the
same patch and syzbot seems happy.
I marked this as a duplicate, but feel free to undup it if that is not the case.

Thanks,
Stefano
