Re: [PATCH vhost v10 03/10] virtio_ring: split: support add premapped buf

2023-06-27 Thread Jason Wang
On Wed, Jun 28, 2023 at 2:02 PM Xuan Zhuo  wrote:
>
> On Wed, 28 Jun 2023 12:07:10 +0800, Jason Wang  wrote:
> > On Tue, Jun 27, 2023 at 5:05 PM Xuan Zhuo  
> > wrote:
> > >
> > > On Tue, 27 Jun 2023 16:03:26 +0800, Jason Wang  
> > > wrote:
> > > > On Fri, Jun 2, 2023 at 5:22 PM Xuan Zhuo  
> > > > wrote:
> > > > >
> > > > > If the vq is in premapped mode, use sg_dma_address() directly.
> > > > >
> > > > > Signed-off-by: Xuan Zhuo 
> > > > > ---
> > > > >  drivers/virtio/virtio_ring.c | 46 ++--
> > > > >  1 file changed, 28 insertions(+), 18 deletions(-)
> > > > >
> > > > > diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c
> > > > > index 2afdfb9e3e30..18212c3e056b 100644
> > > > > --- a/drivers/virtio/virtio_ring.c
> > > > > +++ b/drivers/virtio/virtio_ring.c
> > > > > @@ -598,8 +598,12 @@ static inline int virtqueue_add_split(struct virtqueue *_vq,
> > > > > for (sg = sgs[n]; sg; sg = sg_next(sg)) {
> > > > > dma_addr_t addr;
> > > > >
> > > > > -   if (vring_map_one_sg(vq, sg, DMA_TO_DEVICE, &addr))
> > > > > -   goto unmap_release;
> > > > > +   if (vq->premapped) {
> > > > > +   addr = sg_dma_address(sg);
> > > > > +   } else {
> > > > > +   if (vring_map_one_sg(vq, sg, DMA_TO_DEVICE, &addr))
> > > > > +   goto unmap_release;
> > > > > +   }
> > > >
> > > > Btw, I wonder whether or not it would be simple to implement the
> > > > vq->premapped check inside vring_map_one_sg(), assuming the
> > > > !use_dma_api check is done there as well.
> > >
> > >
> > > YES,
> > >
> > > That would be simpler for the caller.
> > >
> > > But we will have things like:
> > >
> > > int func(bool do_map)
> > > {
> > > if (!do_map)
> > > return 0;
> > > }
> > >
> > > I like this way, but you didn't like it in the last version.
> >
> > I see :)
> >
> > So I think it depends on the error handling path; we should choose the
> > way that lets us deal with errors most easily.
> >
> > For example, it seems the current approach is better since it doesn't
> > need to change the unmap_release.
>
> NO,
>
> The unmap_release path is the same either way.
>
> Thanks.

Ok, so either is fine for me.

Thanks

>
>
> >
> > Thanks
> >
> > >
> > > >
> > > > >
> > > > > prev = i;
> > > > > /* Note that we trust indirect descriptor
> > > > > @@ -614,8 +618,12 @@ static inline int virtqueue_add_split(struct virtqueue *_vq,
> > > > > for (sg = sgs[n]; sg; sg = sg_next(sg)) {
> > > > > dma_addr_t addr;
> > > > >
> > > > > -   if (vring_map_one_sg(vq, sg, DMA_FROM_DEVICE, &addr))
> > > > > -   goto unmap_release;
> > > > > +   if (vq->premapped) {
> > > > > +   addr = sg_dma_address(sg);
> > > > > +   } else {
> > > > > +   if (vring_map_one_sg(vq, sg, DMA_FROM_DEVICE, &addr))
> > > > > +   goto unmap_release;
> > > > > +   }
> > > > >
> > > > > prev = i;
> > > > > /* Note that we trust indirect descriptor
> > > > > @@ -689,21 +697,23 @@ static inline int virtqueue_add_split(struct virtqueue *_vq,
> > > > > return 0;
> > > > >
> > > > >  unmap_release:
> > > > > -   err_idx = i;
> > > > > +   if (!vq->premapped) {
> > > >
> > > > Can vq->premapped be true here? The label is named "unmap_release",
> > > > which implies a "map" beforehand, which seems not to be the case for
> > > > premapping.
> > >
> > > I see.
> > >
> > > Rethinking this, there is a better way.
> > > I will fix it in the next version.
> > >
> > >
> > > Thanks.
> > >
> > >
> > > >
> > > > Thanks
> > > >
> > > >
> > > > > +   err_idx = i;
> > > > >
> > > > > -   if (indirect)
> > > > > -   i = 0;
> > > > > -   else
> > > > > -   i = head;
> > > > > -
> > > > > -   for (n = 0; n < total_sg; n++) {
> > > > > -   if (i == err_idx)
> > > > > -   break;
> > > > > -   if (indirect) {
> > > > > -   vring_unmap_one_split_indirect(vq, &desc[i]);
> > > > > -   i = virtio16_to_cpu(_vq->vdev, desc[i].next);
> > > > > -   } else
> > > > > -   i = vring_unmap_one_split(vq, i);
> > > > > +   if (indirect)
> > > > > +   i = 0;
> > > > > +   else
> > > > > +   i = head;
> > > > > +
> > > > > +   for (n = 0; n < total_sg; n++) {
> > > > > +   if (i == err_idx)
> > > > > +   break;
> > > > > +   if (indirect) {
> > > > > +   vring_unmap_one_split_indirect(vq, &desc[i]);
> > > > > +   i = virtio16_to_cpu(_vq->vdev, desc[i].next);
> > > > > +   } else
> > > > > +   i = vring_unmap_one_split(vq, i);
> > > > > +   }
> > > > > }

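For illustration, the refactor Jason suggests -- folding the premapped check
into the mapping helper -- could look roughly like the sketch below. This is
a sketch only, not the actual patch; it assumes the helper keeps the name
vring_map_one_sg() and that vring_dma_dev() returns the mapping device, as in
the quoted code:

    static int vring_map_one_sg(const struct vring_virtqueue *vq,
                                struct scatterlist *sg,
                                enum dma_data_direction direction,
                                dma_addr_t *addr)
    {
            if (vq->premapped) {
                    /* The driver already did the DMA mapping. */
                    *addr = sg_dma_address(sg);
                    return 0;
            }

            if (!vq->use_dma_api) {
                    /* No DMA API: the device can use physical addresses. */
                    *addr = (dma_addr_t)sg_phys(sg);
                    return 0;
            }

            *addr = dma_map_page(vring_dma_dev(vq), sg_page(sg),
                                 sg->offset, sg->length, direction);
            if (dma_mapping_error(vring_dma_dev(vq), *addr))
                    return -ENOMEM;

            return 0;
    }

With that, both call sites in virtqueue_add_split() keep a single
"if (vring_map_one_sg(...)) goto unmap_release;" line, and unmap_release only
has to skip the unmapping when vq->premapped is set.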

Re: [PATCH v1 2/2] vduse: enable Virtio-net device type

2023-06-27 Thread Jason Wang
On Tue, Jun 27, 2023 at 7:37 PM Maxime Coquelin
 wrote:
>
> This patch adds the Virtio-net device type to the supported
> device types. Initialization fails if the device does not
> support the VIRTIO_F_VERSION_1 feature, in order to guarantee
> that the configuration space is read-only.
>
> Signed-off-by: Maxime Coquelin 

Acked-by: Jason Wang 

Thanks

> ---
>  drivers/vdpa/vdpa_user/vduse_dev.c | 6 ++
>  1 file changed, 6 insertions(+)
>
> diff --git a/drivers/vdpa/vdpa_user/vduse_dev.c b/drivers/vdpa/vdpa_user/vduse_dev.c
> index c1c2f4c711ae..89088fa27026 100644
> --- a/drivers/vdpa/vdpa_user/vduse_dev.c
> +++ b/drivers/vdpa/vdpa_user/vduse_dev.c
> @@ -142,6 +142,7 @@ static struct workqueue_struct *vduse_irq_bound_wq;
>
>  static u32 allowed_device_id[] = {
> VIRTIO_ID_BLOCK,
> +   VIRTIO_ID_NET,
>  };
>
>  static inline struct vduse_dev *vdpa_to_vduse(struct vdpa_device *vdpa)
> @@ -1668,6 +1669,10 @@ static bool features_is_valid(struct vduse_dev_config *config)
> (config->features & (1ULL << VIRTIO_BLK_F_CONFIG_WCE)))
> return false;
>
> +   if ((config->device_id == VIRTIO_ID_NET) &&
> +   !(config->features & (1ULL << VIRTIO_F_VERSION_1)))
> +   return false;
> +
> return true;
>  }
>
> @@ -2023,6 +2028,7 @@ static const struct vdpa_mgmtdev_ops vdpa_dev_mgmtdev_ops = {
>
>  static struct virtio_device_id id_table[] = {
> { VIRTIO_ID_BLOCK, VIRTIO_DEV_ANY_ID },
> +   { VIRTIO_ID_NET, VIRTIO_DEV_ANY_ID },
> { 0 },
>  };
>
> --
> 2.41.0
>


Re: [PATCH v1 1/2] vduse: validate block features only with block devices

2023-06-27 Thread Jason Wang
On Tue, Jun 27, 2023 at 7:37 PM Maxime Coquelin
 wrote:
>
> This patch is preliminary work to enable network device
> type support in VDUSE.
>
> As VIRTIO_BLK_F_CONFIG_WCE shares the same value as
> VIRTIO_NET_F_HOST_TSO4, we need to restrict its check
> to the Virtio-blk device type.
>
> Signed-off-by: Maxime Coquelin 

Acked-by: Jason Wang 

Thanks

> ---
>  drivers/vdpa/vdpa_user/vduse_dev.c | 9 +
>  1 file changed, 5 insertions(+), 4 deletions(-)
>
> diff --git a/drivers/vdpa/vdpa_user/vduse_dev.c b/drivers/vdpa/vdpa_user/vduse_dev.c
> index 5f5c21674fdc..c1c2f4c711ae 100644
> --- a/drivers/vdpa/vdpa_user/vduse_dev.c
> +++ b/drivers/vdpa/vdpa_user/vduse_dev.c
> @@ -1658,13 +1658,14 @@ static bool device_is_allowed(u32 device_id)
> return false;
>  }
>
> -static bool features_is_valid(u64 features)
> +static bool features_is_valid(struct vduse_dev_config *config)
>  {
> -   if (!(features & (1ULL << VIRTIO_F_ACCESS_PLATFORM)))
> +   if (!(config->features & (1ULL << VIRTIO_F_ACCESS_PLATFORM)))
> return false;
>
> /* Now we only support read-only configuration space */
> -   if (features & (1ULL << VIRTIO_BLK_F_CONFIG_WCE))
> +   if ((config->device_id == VIRTIO_ID_BLOCK) &&
> +   (config->features & (1ULL << VIRTIO_BLK_F_CONFIG_WCE)))
> return false;
>
> return true;
> @@ -1691,7 +1692,7 @@ static bool vduse_validate_config(struct vduse_dev_config *config)
> if (!device_is_allowed(config->device_id))
> return false;
>
> -   if (!features_is_valid(config->features))
> +   if (!features_is_valid(config))
> return false;
>
> return true;
> --
> 2.41.0
>

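The overlap that motivates this patch is easy to see in the virtio feature
bit assignments: the block and net features happen to share bit position 11.
A minimal illustration (the numeric values below are the bit positions
defined by the virtio uapi headers):

    /* Both device-specific features are feature bit 11. */
    #define VIRTIO_BLK_F_CONFIG_WCE 11 /* blk: writeback mode in config space */
    #define VIRTIO_NET_F_HOST_TSO4  11 /* net: host can handle TSOv4 */

    u64 features = 1ULL << VIRTIO_NET_F_HOST_TSO4;

    /* An ungated check would misread a net device's HOST_TSO4 as the
     * block CONFIG_WCE feature and wrongly reject the device: */
    bool rejected = features & (1ULL << VIRTIO_BLK_F_CONFIG_WCE); /* true */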

Re: [PATCH v1 3/5] mm/memory_hotplug: make offline_and_remove_memory() timeout instead of failing on fatal signals

2023-06-27 Thread kernel test robot
Hi David,

kernel test robot noticed the following build warnings:

[auto build test WARNING on 6995e2de6891c724bfeb2db33d7b87775f913ad1]

url:
https://github.com/intel-lab-lkp/linux/commits/David-Hildenbrand/mm-memory_hotplug-check-for-fatal-signals-only-in-offline_pages/20230627-192444
base:   6995e2de6891c724bfeb2db33d7b87775f913ad1
patch link:
https://lore.kernel.org/r/20230627112220.229240-4-david%40redhat.com
patch subject: [PATCH v1 3/5] mm/memory_hotplug: make offline_and_remove_memory() timeout instead of failing on fatal signals
config: x86_64-randconfig-x006-20230627 (https://download.01.org/0day-ci/archive/20230628/202306280935.dktwlhfd-...@intel.com/config)
compiler: clang version 15.0.7 (https://github.com/llvm/llvm-project.git 8dfdcc7b7bf66834a761bd8de445840ef68e4d1a)
reproduce: (https://download.01.org/0day-ci/archive/20230628/202306280935.dktwlhfd-...@intel.com/reproduce)

If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add the following tags:
| Reported-by: kernel test robot 
| Closes: https://lore.kernel.org/oe-kbuild-all/202306280935.dktwlhfd-...@intel.com/

All warnings (new ones prefixed by >>):

>> mm/memory_hotplug.c:163:13: warning: unused variable 'mhp_offlining_timer_active' [-Wunused-variable]
   static bool mhp_offlining_timer_active;
   ^
   mm/memory_hotplug.c:166:13: warning: unused function 'mhp_offline_timer_fn' [-Wunused-function]
   static void mhp_offline_timer_fn(struct timer_list *unused)
   ^
   2 warnings generated.


vim +/mhp_offlining_timer_active +163 mm/memory_hotplug.c

   154  
   155  /*
   156   * Protected by the device hotplug lock: offline_and_remove_memory()
   157   * will activate a timer such that offlining cannot be stuck forever.
   158   *
   159   * With an active timer, fatal signals will be ignored, because they can be
   160   * counter-productive when dying user space triggers device unplug/driver
   161   * unloading that ends up offlining+removing device memory.
   162   */
 > 163  static bool mhp_offlining_timer_active;
   164  static atomic_t mhp_offlining_timer_expired;
   165  

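One conventional way to silence such config-dependent "unused" warnings, if
the eventual fix keeps the variables (an assumption here, not something
stated in this report), is the __maybe_unused annotation:

    /* Sketch only: tell the compiler these may be unused in some configs. */
    static bool mhp_offlining_timer_active __maybe_unused;
    static atomic_t mhp_offlining_timer_expired __maybe_unused;

The other usual fix is moving the definitions under the config guard that
actually uses them (e.g. CONFIG_MEMORY_HOTREMOVE).
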
-- 
0-DAY CI Kernel Test Service
https://github.com/intel/lkp-tests/wiki


Re: [PATCH vhost v10 02/10] virtio_ring: introduce virtqueue_set_premapped()

2023-06-27 Thread Xuan Zhuo
On Tue, 27 Jun 2023 10:56:54 -0400, "Michael S. Tsirkin"  
wrote:
> On Tue, Jun 27, 2023 at 04:50:01PM +0800, Xuan Zhuo wrote:
> > On Tue, 27 Jun 2023 16:03:23 +0800, Jason Wang  wrote:
> > > On Fri, Jun 2, 2023 at 5:22 PM Xuan Zhuo  
> > > wrote:
> > > >
> > > > This helper allows the driver to change the dma mode to premapped mode.
> > > > In premapped mode, the virtio core does not do dma mapping
> > > > internally.
> > > >
> > > > This only works when use_dma_api is true. If use_dma_api is false,
> > > > the dma operations do not go through the DMA APIs, which is not the
> > > > standard way in the linux kernel.
> > > >
> > > > Signed-off-by: Xuan Zhuo 
> > > > ---
> > > >  drivers/virtio/virtio_ring.c | 40 
> > > >  include/linux/virtio.h   |  2 ++
> > > >  2 files changed, 42 insertions(+)
> > > >
> > > > diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c
> > > > index 72ed07a604d4..2afdfb9e3e30 100644
> > > > --- a/drivers/virtio/virtio_ring.c
> > > > +++ b/drivers/virtio/virtio_ring.c
> > > > @@ -172,6 +172,9 @@ struct vring_virtqueue {
> > > > /* Host publishes avail event idx */
> > > > bool event;
> > > >
> > > > +   /* Do DMA mapping by driver */
> > > > +   bool premapped;
> > > > +
> > > > /* Head of free buffer list. */
> > > > unsigned int free_head;
> > > > /* Number we've added since last sync. */
> > > > @@ -2059,6 +2062,7 @@ static struct virtqueue *vring_create_virtqueue_packed(
> > > > vq->packed_ring = true;
> > > > vq->dma_dev = dma_dev;
> > > > vq->use_dma_api = vring_use_dma_api(vdev);
> > > > +   vq->premapped = false;
> > > >
> > > > vq->indirect = virtio_has_feature(vdev, VIRTIO_RING_F_INDIRECT_DESC) &&
> > > > !context;
> > > > @@ -2548,6 +2552,7 @@ static struct virtqueue *__vring_new_virtqueue(unsigned int index,
> > > >  #endif
> > > > vq->dma_dev = dma_dev;
> > > > vq->use_dma_api = vring_use_dma_api(vdev);
> > > > +   vq->premapped = false;
> > > >
> > > > vq->indirect = virtio_has_feature(vdev, VIRTIO_RING_F_INDIRECT_DESC) &&
> > > > !context;
> > > > @@ -2691,6 +2696,41 @@ int virtqueue_resize(struct virtqueue *_vq, u32 num,
> > > >  }
> > > >  EXPORT_SYMBOL_GPL(virtqueue_resize);
> > > >
> > > > +/**
> > > > + * virtqueue_set_premapped - set the vring premapped mode
> > > > + * @_vq: the struct virtqueue we're talking about.
> > > > + *
> > > > + * Enable the premapped mode of the vq.
> > > > + *
> > > > + * The vring in premapped mode does not do dma internally, so the driver must
> > > > + * do the dma mapping in advance. The driver must pass the dma address through
> > > > + * the dma_address field of the scatterlist. When the driver gets a used buffer
> > > > + * from the vring, it has to unmap the dma address. So the driver must call
> > > > + * virtqueue_get_buf_premapped()/virtqueue_detach_unused_buf_premapped().
> > > > + *
> > > > + * This must be called before adding any buf to vring.
> > >
> > > And any old buffer should be detached?
> >
> > I mean before adding any buf, so there are no old buffers.
> >
>
> Oh. So put this in the same sentence:
>
>   This function must be called immediately after creating the vq,
>   or after vq reset, and before adding any buffers to it.


OK, thanks.

>
>
> > >
> > > > + * So this should be called immediately after init vq or vq reset.
>
> Do you really need to call this again after each reset?

YES


Thanks.


>
>
> > > Any way to detect and warn in this case? (not a must if it's too
> > > expensive to do the check)
> >
> >
> > I can try to check whether the queue is empty.
> >
> >
> > >
> > > > + *
> > > > + * Caller must ensure we don't call this with other virtqueue operations
> > > > + * at the same time (except where noted).
> > > > + *
> > > > + * Returns zero or a negative error.
> > > > + * 0: success.
> > > > + * -EINVAL: vring does not use the dma api, so we can not enable premapped mode.
> > > > + */
> > > > +int virtqueue_set_premapped(struct virtqueue *_vq)
> > > > +{
> > > > +   struct vring_virtqueue *vq = to_vvq(_vq);
> > > > +
> > > > +   if (!vq->use_dma_api)
> > > > +   return -EINVAL;
> > > > +
> > > > +   vq->premapped = true;
> > >
> > > I guess there should be a way to disable it. Would it be useful for
> > > the case when AF_XDP sockets were destroyed?
> >
> > Yes.
> >
> > When we reset the queue, the vq->premapped will be set to 0.
> >
> > This is called after find_vqs or vq reset.
> >
> > Thanks.
> >
> >
> >
> > >
> > > Thanks
> > >
> > >
> > > > +
> > > > +   return 0;
> > > > +}
> > > > +EXPORT_SYMBOL_GPL(virtqueue_set_premapped);
> > > > +
> > > >  /* Only available for split ring */
> > > >  struct virtqueue *vring_new_virtqueue(unsigned int index,
> > > >   unsigned int num,

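The emptiness check Xuan Zhuo offers to add above could take roughly the
following shape. This is a sketch under assumed names (num_free and the
per-format ring sizes exist in the quoted code; whether the final patch
checks emptiness this way is left open in the thread):

    int virtqueue_set_premapped(struct virtqueue *_vq)
    {
            struct vring_virtqueue *vq = to_vvq(_vq);
            u32 num = vq->packed_ring ? vq->packed.vring.num :
                                        vq->split.vring.num;

            if (!vq->use_dma_api)
                    return -EINVAL;

            /* Every descriptor must still be free, i.e. no buffers have
             * been added since the vq was created or reset. */
            if (vq->vq.num_free != num)
                    return -EINVAL;

            vq->premapped = true;

            return 0;
    }
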
Re: [PATCH v1 3/5] mm/memory_hotplug: make offline_and_remove_memory() timeout instead of failing on fatal signals

2023-06-27 Thread John Hubbard via Virtualization

On 6/27/23 08:14, Michal Hocko wrote:

On Tue 27-06-23 16:57:53, David Hildenbrand wrote:

...

IIUC (John can correct me if I am wrong):

1) The process holds the device node open
2) The process gets killed or quits
3) As the process gets torn down, it closes the device node
4) Closing the device node results in the driver removing the device and
 calling offline_and_remove_memory()

So it's not a "tear down process" that triggers that offlining_removal
somehow explicitly, it's just a side-product of it letting go of the device
node as the process gets torn down.


Isn't that just fragile? The operation might fail for other reasons. Why
cannot there be a hold on the resource to control the tear down
explicitly?


I'll let John comment on that. But from what I understood, in most setups
where ZONE_MOVABLE gets used for hotplugged memory
offline_and_remove_memory() succeeds and allows for reusing the device later
without a reboot.

For the cases where it doesn't work, a reboot is required.
 
That is exactly correct. That's what we ran into.


And there are workarounds (for example: kthreads don't have any signals
pending...), but I did want to follow through here and make -mm aware of the
problem. And see if there is a better way.

...

It seems that offline_and_remove_memory is using the wrong operation then,
if it wants an opportunistic offlining with some sort of policy. Timeout
might be just one policy to use, but a failure mode or a retry count might
be a better fit for some users. So rather than (ab)using offline_pages,
would it make more sense to extract the basic offlining steps and allow
drivers like virtio-mem to reuse them and define their own policy?


...like this, perhaps. Sounds promising!


thanks,
--
John Hubbard
NVIDIA
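
Michal's suggestion above -- extracting the basic offlining steps and letting
drivers define their own policy -- might look something like the hypothetical
interface below. The names are invented here purely for illustration; nothing
like this exists in the posted patches:

    /* Hypothetical driver-supplied offlining policy. */
    struct offline_policy {
            unsigned int max_retries;   /* give up after this many passes */
            unsigned int timeout_ms;    /* 0 = no time limit */
            bool ignore_fatal_signals;  /* e.g. for driver-internal teardown */
    };

    int offline_pages_policed(unsigned long start_pfn, unsigned long nr_pages,
                              struct zone *zone, struct memory_group *group,
                              const struct offline_policy *policy);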



Re: [PATCH v6 0/3] Add sync object UAPI support to VirtIO-GPU driver

2023-06-27 Thread Rob Clark
On Fri, May 12, 2023 at 2:23 PM Gurchetan Singh
 wrote:
>
>
>
> On Thu, May 11, 2023 at 7:33 PM Dmitry Osipenko 
>  wrote:
>>
>> On 5/12/23 03:17, Gurchetan Singh wrote:
>> ...
>> > Can we get one of the Mesa MRs reviewed first?  There's currently no
>> > virtio-intel MR AFAICT, and the amdgpu one is marked as "Draft:".
>> >
>> > Even for the amdgpu, Pierre suggests the feature "will be marked as
>> > experimental both in Mesa and virglrenderer" and we can revise as needed.
>> > The DRM requirements seem to warn against adding an UAPI too hastily...
>> >
>> > You can get the deqp-vk 1.2 tests to pass with the current UAPI, if you
>> > just change your mesa <--> virglrenderer protocol a little.  Perhaps that
>> > way is even better, since you plumb the in sync-obj into host-side command
>> > submission.
>> >
>> > Without inter-context sharing of the fence, this MR really only adds guest
>> > kernel syntactic sugar.
>> >
>> > Note I'm not against syntactic sugar, but I just want to point out that you
>> > can likely merge the native context work without any UAPI changes, in case
>> > it's not clear.
>> >
>> > If this was for the core drm syncobj implementation, and not just
>> >> driver ioctl parsing and wiring up the core helpers, I would agree
>> >> with you.
>> >>
>> >
>> > There are several possible and viable paths to get the features in question
>> > (VK1.2 syncobjs, and inter-context fence sharing).  There are paths
>> > entirely without the syncobj, paths that only use the syncobj for the
>> > inter-context fence sharing case and create host syncobjs for VK1.2, paths
>> > that also use guest syncobjs in every proxied command submission.
>> >
>> > It's really hard to tell which one is better.  Here's my suggestion:
>> >
>> > 1) Get the native contexts reviewed/merged in Mesa/virglrenderer using the
>> > current UAPI.  Options for VK1.2 include: pushing down the syncobjs to the
>> > host, and simulating the syncobj (as already done).  It's fine to mark
>> > these contexts as "experimental" like msm-experimental.  That will allow
>> > you to experiment with the protocols, come up with tests, and hopefully
>> > determine an answer to the host versus guest syncobj question.
>> >
>> > 2) Once you've completed (1), try to add UAPI changes for features that are
>> > missing or things that are suboptimal with the knowledge gained from doing
>> > (2).
>> >
>> > WDYT?
>>
>> Having syncobj support available by DRM driver is a mandatory
>> requirement for native contexts because userspace (Mesa) relies on sync
>> objects support presence. In particular, Intel Mesa driver checks
>> whether DRM driver supports sync objects to decide which features are
>> available, ANV depends on the syncobj support.
>>
>>
>> I'm not familiar with a history of Venus and its limitations. Perhaps
>> the reason it's using host-side syncobjs is to have 1:1 Vulkan API
>> mapping between guest and host. Not sure if Venus could use guest
>> syncobjs instead or there are problems with that.
>
>
> Why not submit a Venus MR?  It's already in-tree, and you can see how your 
> API works in scenarios with a host side timeline semaphore (aka syncobj).  I 
> think they are also interested in fencing/sync improvements.
>
>>
>>
>> When syncobj was initially added to kernel, it was done from the needs
>> of supporting Vulkan wait API. For Venus the actual Vulkan driver is on
>> host side, while for native contexts it's on guest side. Native contexts
>> don't need syncobj on host side, it will be unnecessary overhead for
>> every nctx to have it on host. Hence, if there is no good reason for
>> host-side syncobjs, then why do that?
>
>
> Depends on your threading model.  You can have the following scenarios:
>
> 1) N guest contexts : 1 host thread
> 2) N guest contexts : N host threads for each context
> 3) 1:1 thread
>
> I think the native context is single-threaded (1), IIRC?  If the goal is to 
> push command submission to the host (for inter-context sharing), I think 
> you'll at-least want (2).  For a 1:1 model (a la gfxstream), one host thread 
> can put another thread's out_sync_objs as its in_sync_objs (in the same 
> virtgpu context).  I think that's kind of the goal of timeline semaphores, 
> with the example given by Khronos as with a compute thread + a graphics 
> thread.
>
> I'm not saying one threading model is better than any other, perhaps the 
> native context using the host driver in the guest is so good, it doesn't 
> matter.  I'm just saying these are the types of discussions we can have if we 
> tried to get one the Mesa MRs merged first ;-)
>
>>
>> Native contexts pass deqp synchronization tests, they use sync objects
>> universally for both GL and VK. Games work, piglit/deqp passing. What
>> else you're wanting to test? Turnip?
>
>
> Turnip would also fulfill the requirements, since most of the native context 
> stuff is already wired for freedreno.
>
>>
>>
>> The AMDGPU code has been looked at and it looks good. It's a draft for

Re: [PATCH v1 3/5] mm/memory_hotplug: make offline_and_remove_memory() timeout instead of failing on fatal signals

2023-06-27 Thread David Hildenbrand

On 27.06.23 16:17, Michal Hocko wrote:

On Tue 27-06-23 15:14:11, David Hildenbrand wrote:

On 27.06.23 14:40, Michal Hocko wrote:

On Tue 27-06-23 13:22:18, David Hildenbrand wrote:

John Hubbard writes [1]:

  Some device drivers add memory to the system via memory hotplug.
  When the driver is unloaded, that memory is hot-unplugged.

  However, memory hot unplug can fail. And these days, it fails a
  little too easily, with respect to the above case. Specifically, if
  a signal is pending on the process, hot unplug fails.

  [...]

  So in this case, other things (unmovable pages, un-splittable huge
  pages) can also cause the above problem. However, those are
  demonstrably less common than simply having a pending signal. I've
  got bug reports from users who can trivially reproduce this by
  killing their process with a "kill -9", for example.


This looks like a bug of the said driver no? If the tear down process is
killed it could very well happen right before offlining so you end up in
the very same state. Or what am I missing?


IIUC (John can correct me if I am wrong):

1) The process holds the device node open
2) The process gets killed or quits
3) As the process gets torn down, it closes the device node
4) Closing the device node results in the driver removing the device and
calling offline_and_remove_memory()

So it's not a "tear down process" that triggers that offlining_removal
somehow explicitly, it's just a side-product of it letting go of the device
node as the process gets torn down.


Isn't that just fragile? The operation might fail for other reasons. Why
cannot there be a hold on the resource to control the tear down
explicitly?


I'll let John comment on that. But from what I understood, in most 
setups where ZONE_MOVABLE gets used for hotplugged memory 
offline_and_remove_memory() succeeds and allows for reusing the device 
later without a reboot.


For the cases where it doesn't work, a reboot is required.




Especially with ZONE_MOVABLE, offlining is supposed to work in most
cases when offlining actually hotplugged (not boot) memory, and only fail
in rare corner cases (e.g., some driver holds a reference to a page in
ZONE_MOVABLE, turning it unmovable).

In these corner cases we really don't want to be stuck forever in
offline_and_remove_memory(). But in the general cases, we really want to
do our best to make memory offlining succeed -- in a reasonable
timeframe.

Reliably failing in the described case when there is a fatal signal pending
is sub-optimal. The pending signal check is mostly only relevant when user
space explicitly triggers offlining of memory using sysfs device attributes
("state" or "online" attribute), but not when coming via
offline_and_remove_memory().

So let's use a timer instead and ignore fatal signals, because they are
not really expressive for offline_and_remove_memory() users. Let's default
to 30 seconds if no timeout was specified, and limit the timeout to 120
seconds.


I really hate having timeouts back. They just proven to be hard to get
right and it is essentially a policy implemented in the kernel. They
simply do not belong to the kernel space IMHO.


As much as I agree with you in terms of offlining triggered from user space
(e.g., write "state" or "online" attribute) where user-space is actually in
charge  and can do something reasonable (timeout, retry, whatever), in these
the offline_and_remove_memory() case it's the driver that wants a
best-effort memory offlining+removal.

If it times out, virtio-mem will simply try another block or retry later.
Right now, it could get stuck forever in offline_and_remove_memory(), which
is obviously "not great". Fortunately, for virtio-mem it's configurable and
we use the alloc_contig_range()-method for now as default.


It seems that offline_and_remove_memory is using the wrong operation then,
if it wants an opportunistic offlining with some sort of policy. Timeout
might be just one policy to use, but a failure mode or a retry count might
be a better fit for some users. So rather than (ab)using offline_pages,
would it make more sense to extract the basic offlining steps and allow
drivers like virtio-mem to reuse them and define their own policy?


virtio-mem, in default operation, does that: use alloc_contig_range() to 
logically unplug ("fake offline") that memory and then just trigger 
offline_and_remove_memory() to make it "officially offline".


In that mode, offline_and_remove_memory() cannot really timeout and is 
almost always going to succeed (except memory notifiers and some hugetlb 
dissolving).


Right now we also allow the admin to configure ordinary offlining 
directly (without prior fake offlining) when bigger memory blocks are 
used: offline_pages() is more reliable than alloc_contig_range(), for 
example, because it disables the PCP and the LRU cache, and retries more 
often (well, unfortunately then also for

Re: [PATCH v1 1/5] mm/memory_hotplug: check for fatal signals only in offline_pages()

2023-06-27 Thread David Hildenbrand

On 27.06.23 14:34, Michal Hocko wrote:

On Tue 27-06-23 13:22:16, David Hildenbrand wrote:

Let's check for fatal signals only. That looks cleaner and still keeps
the documented use case for manual user-space triggered memory offlining
working. From Documentation/admin-guide/mm/memory-hotplug.rst:

% timeout $TIMEOUT offline_block | failure_handling

In fact, we even document there: "the offlining context can be terminated
by sending a fatal signal".


We should be fixing documentation instead. This could break users who do
have a SIGALRM signal hander installed.


You mean because timeout will send a SIGALRM, which is not considered 
fatal in case a signal handler is installed?


At least the "traditional" tools I am aware of don't set a timeout at 
all (crossing fingers that they never end up stuck):

* chmem
* QEMU guest agent
* powerpc-utils

libdaxctl also doesn't seem to implement an easy-to-spot timeout for 
memory offlining, but it also doesn't configure SIGALRM.



Of course, that doesn't mean that there isn't somewhere a program that 
does that; I merely assume that it would be pretty unlikely to find such 
a program.


But no strong opinion: we can also keep it like that, update the doc and 
add a comment why this one here is different than most other signal 
backoff checks.



Thanks!

--
Cheers,

David / dhildenb



Re: [PATCH v1 3/5] mm/memory_hotplug: make offline_and_remove_memory() timeout instead of failing on fatal signals

2023-06-27 Thread David Hildenbrand

On 27.06.23 14:40, Michal Hocko wrote:

On Tue 27-06-23 13:22:18, David Hildenbrand wrote:

John Hubbard writes [1]:

 Some device drivers add memory to the system via memory hotplug.
 When the driver is unloaded, that memory is hot-unplugged.

 However, memory hot unplug can fail. And these days, it fails a
 little too easily, with respect to the above case. Specifically, if
 a signal is pending on the process, hot unplug fails.

 [...]

 So in this case, other things (unmovable pages, un-splittable huge
 pages) can also cause the above problem. However, those are
 demonstrably less common than simply having a pending signal. I've
 got bug reports from users who can trivially reproduce this by
 killing their process with a "kill -9", for example.


This looks like a bug of the said driver no? If the tear down process is
killed it could very well happen right before offlining so you end up in
the very same state. Or what am I missing?


IIUC (John can correct me if I am wrong):

1) The process holds the device node open
2) The process gets killed or quits
3) As the process gets torn down, it closes the device node
4) Closing the device node results in the driver removing the device and
   calling offline_and_remove_memory()

So it's not a "tear down process" that triggers that offlining_removal 
somehow explicitly, it's just a side-product of it letting go of the 
device node as the process gets torn down.


  

Especially with ZONE_MOVABLE, offlining is supposed to work in most
cases when offlining actually hotplugged (not boot) memory, and only fail
in rare corner cases (e.g., some driver holds a reference to a page in
ZONE_MOVABLE, turning it unmovable).

In these corner cases we really don't want to be stuck forever in
offline_and_remove_memory(). But in the general cases, we really want to
do our best to make memory offlining succeed -- in a reasonable
timeframe.

Reliably failing in the described case when there is a fatal signal pending
is sub-optimal. The pending signal check is mostly only relevant when user
space explicitly triggers offlining of memory using sysfs device attributes
("state" or "online" attribute), but not when coming via
offline_and_remove_memory().

So let's use a timer instead and ignore fatal signals, because they are
not really expressive for offline_and_remove_memory() users. Let's default
to 30 seconds if no timeout was specified, and limit the timeout to 120
seconds.


I really hate having timeouts back. They just proven to be hard to get
right and it is essentially a policy implemented in the kernel. They
simply do not belong to the kernel space IMHO.


As much as I agree with you in terms of offlining triggered from user 
space (e.g., write "state" or "online" attribute) where user-space is 
actually in charge  and can do something reasonable (timeout, retry, 
whatever), in these the offline_and_remove_memory() case it's the driver 
that wants a best-effort memory offlining+removal.


If it times out, virtio-mem will simply try another block or retry 
later. Right now, it could get stuck forever in 
offline_and_remove_memory(), which is obviously "not great". 
Fortunately, for virtio-mem it's configurable and we use the 
alloc_contig_range()-method for now as default.


If it would time out for John's driver, we most certainly don't want to 
be stuck in offline_and_remove_memory(), blocking device/driver 
unloading (and even a reboot IIRC) possibly forever.



I much rather have offline_and_remove_memory() indicate "timeout" to a 
in-kernel user a bit earlier than getting stuck in there forever. The 
timeout parameter allows for giving the in-kernel users a bit of 
flexibility, which I showcases for virtio-mem that unplugs smaller 
blocks and rather wants to fail fast and retry later.



Sure, we could make the timeout configurable to optimize for some corner 
cases, but that's not really what offline_and_remove_memory() users want 
and I doubt anybody would fine-tune that: they want a best-effort 
attempt. And that's IMHO not really a policy, it's an implementation 
detail of these drivers.


For example, the driver from John could simply never call
offline_and_remove_memory() and always require a reboot when wanting to
reuse a device. But that's definitely not what users want.


virtio-mem could simply never call offline_and_remove_memory() and 
indicate "I don't support unplug of these online memory blocks". But 
that's *definitely* not what users want.



I'm very open for alternatives regarding offline_and_remove_memory(), so 
far this was the only reasonable thing I could come up with that 
actually achieves what we want for these users: not get stuck in there 
forever but rather fail earlier than later.


And most importantly, not somehow try to involve user space that isn't 
even in charge of the offlining operation.


--
Cheers,

David / dhildenb


Re: [PATCH v6 3/3] drm/virtio: Support sync objects

2023-06-27 Thread Geert Uytterhoeven
Hi Dmitry,

On Mon, Jun 26, 2023 at 6:11 PM Dmitry Osipenko
 wrote:
> On 6/25/23 18:36, Geert Uytterhoeven wrote:
> > On Sun, Jun 25, 2023 at 2:41 PM Dmitry Osipenko
> >  wrote:
> >> On 6/25/23 11:47, Geert Uytterhoeven wrote:
> >>> On Sun, Apr 16, 2023 at 1:55 PM Dmitry Osipenko
> >>>  wrote:
>  Add sync object DRM UAPI support to VirtIO-GPU driver. Sync objects
>  support is needed by native context VirtIO-GPU Mesa drivers, it also will
>  be used by Venus and Virgl contexts.
> 
>  Reviewed-by: Emil Velikov 
>  Signed-off-by: Dmitry Osipenko 
> >>>
> >>> Thanks for your patch!
> >>>
>  --- a/drivers/gpu/drm/virtio/virtgpu_submit.c
>  +++ b/drivers/gpu/drm/virtio/virtgpu_submit.c
> >>>
>  +static int
>  +virtio_gpu_parse_deps(struct virtio_gpu_submit *submit)
>  +{
>  +   struct drm_virtgpu_execbuffer *exbuf = submit->exbuf;
>  +   struct drm_virtgpu_execbuffer_syncobj syncobj_desc;
>  +   size_t syncobj_stride = exbuf->syncobj_stride;
>  +   u32 num_in_syncobjs = exbuf->num_in_syncobjs;
>  +   struct drm_syncobj **syncobjs;
>  +   int ret = 0, i;
>  +
>  +   if (!num_in_syncobjs)
>  +   return 0;
>  +
>  +   /*
>  +* kvalloc at first tries to allocate memory using kmalloc and
>  +* falls back to vmalloc only on failure. It also uses GFP_NOWARN
> >>>
> >>> GFP_NOWARN does not exist.
> >>
> >> https://elixir.bootlin.com/linux/v6.4-rc7/source/include/linux/gfp_types.h#L38
> >
> > That line defines "__GFP_NOWARN", not "GFP_NOWARN".
> > C is case- and underscore-sensitive. as is "git grep -w" ;-)
>
> The removal of underscores was done intentionally for improving
> readability of the comment

Please don't do that, as IMHO it actually hampers readability:
  1. For some xxx, both GFP_xxx and __GFP_xxx are defined,
 so it does matter which one you are referring to,
  2. After dropping the underscores, "git grep -w" can no longer find
 the definition, nor its users.
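
For instance, the spelling that both the compiler and "git grep -w" will find
in kernel sources (the call shape here is illustrative, based on the quoted
comment's description of the allocation, not the exact line from the patch):

    syncobjs = kvcalloc(num_in_syncobjs, sizeof(*syncobjs),
                        GFP_KERNEL | __GFP_NOWARN);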

Thanks!

Gr{oetje,eeting}s,

Geert

-- 
Geert Uytterhoeven -- There's lots of Linux beyond ia32 -- ge...@linux-m68k.org

In personal conversations with technical people, I call myself a hacker. But
when I'm talking to journalists I just say "programmer" or something like that.
-- Linus Torvalds

[PATCH v1 0/2] vduse: add support for networking devices

2023-06-27 Thread Maxime Coquelin
This small series enables the virtio-net device type in VDUSE.
With it, basic operations have been tested, both with
virtio-vdpa and vhost-vdpa, using the DPDK Vhost library series
adding VDUSE support with the split rings layout (merged in
DPDK v23.07-rc1).

Control queue support (and so multiqueue) has also been
tested, but requires a kernel series from Jason Wang
relaxing control queue polling [1] to function reliably.

[1]: https://lore.kernel.org/lkml/cacgkmetgrxn3ppwsdo4oosnssljfembez0wvjgrr3whu+qa...@mail.gmail.com/T/

RFC -> v1 changes:
==
- Fail device init if it does not support VERSION_1 (Jason)

Maxime Coquelin (2):
  vduse: validate block features only with block devices
  vduse: enable Virtio-net device type

 drivers/vdpa/vdpa_user/vduse_dev.c | 15 +++
 1 file changed, 11 insertions(+), 4 deletions(-)

-- 
2.41.0



[PATCH v1 3/5] mm/memory_hotplug: make offline_and_remove_memory() timeout instead of failing on fatal signals

2023-06-27 Thread David Hildenbrand
John Hubbard writes [1]:

Some device drivers add memory to the system via memory hotplug.
When the driver is unloaded, that memory is hot-unplugged.

However, memory hot unplug can fail. And these days, it fails a
little too easily, with respect to the above case. Specifically, if
a signal is pending on the process, hot unplug fails.

[...]

So in this case, other things (unmovable pages, un-splittable huge
pages) can also cause the above problem. However, those are
demonstrably less common than simply having a pending signal. I've
got bug reports from users who can trivially reproduce this by
killing their process with a "kill -9", for example.

Especially with ZONE_MOVABLE, offlining is supposed to work in most
cases when offlining actually hotplugged (not boot) memory, and only fail
in rare corner cases (e.g., some driver holds a reference to a page in
ZONE_MOVABLE, turning it unmovable).

In these corner cases we really don't want to be stuck forever in
offline_and_remove_memory(). But in the general cases, we really want to
do our best to make memory offlining succeed -- in a reasonable
timeframe.

Reliably failing in the described case when there is a fatal signal pending
is sub-optimal. The pending signal check is mostly only relevant when user
space explicitly triggers offlining of memory using sysfs device attributes
("state" or "online" attribute), but not when coming via
offline_and_remove_memory().

So let's use a timer instead and ignore fatal signals, because they are
not really expressive for offline_and_remove_memory() users. Let's default
to 30 seconds if no timeout was specified, and limit the timeout to 120
seconds.

This change is also valuable for virtio-mem in BBM (Big Block Mode) with
"bbm_safe_unplug=off", to avoid endless loops when stuck forever in
offline_and_remove_memory().

While at it, drop the "extern" from offline_and_remove_memory() to make
it fit into a single line.

[1] https://lkml.kernel.org/r/20230620011719.155379-1-jhubb...@nvidia.com

Signed-off-by: David Hildenbrand 
---
 drivers/virtio/virtio_mem.c|  2 +-
 include/linux/memory_hotplug.h |  2 +-
 mm/memory_hotplug.c| 50 --
 3 files changed, 50 insertions(+), 4 deletions(-)

diff --git a/drivers/virtio/virtio_mem.c b/drivers/virtio/virtio_mem.c
index cb8bc6f6aa90..f8792223f1db 100644
--- a/drivers/virtio/virtio_mem.c
+++ b/drivers/virtio/virtio_mem.c
@@ -738,7 +738,7 @@ static int virtio_mem_offline_and_remove_memory(struct virtio_mem *vm,
"offlining and removing memory: 0x%llx - 0x%llx\n", addr,
addr + size - 1);
 
-   rc = offline_and_remove_memory(addr, size);
+   rc = offline_and_remove_memory(addr, size, 0);
if (!rc) {
atomic64_sub(size, &vm->offline_size);
/*
diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h
index 9fcbf5706595..d5f9e8b5a4a4 100644
--- a/include/linux/memory_hotplug.h
+++ b/include/linux/memory_hotplug.h
@@ -307,7 +307,7 @@ extern int offline_pages(unsigned long start_pfn, unsigned long nr_pages,
 struct zone *zone, struct memory_group *group);
 extern int remove_memory(u64 start, u64 size);
 extern void __remove_memory(u64 start, u64 size);
-extern int offline_and_remove_memory(u64 start, u64 size);
+int offline_and_remove_memory(u64 start, u64 size, unsigned int timeout_ms);
 
 #else
 static inline void try_offline_node(int nid) {}
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 0d2151df4ee1..ca635121644a 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -152,6 +152,22 @@ void put_online_mems(void)
 
 bool movable_node_enabled = false;
 
+/*
+ * Protected by the device hotplug lock: offline_and_remove_memory()
+ * will activate a timer such that offlining cannot be stuck forever.
+ *
+ * With an active timer, fatal signals will be ignored, because they can be
+ * counter-productive when dying user space triggers device unplug/driver
+ * unloading that ends up offlining+removing device memory.
+ */
+static bool mhp_offlining_timer_active;
+static atomic_t mhp_offlining_timer_expired;
+
+static void mhp_offline_timer_fn(struct timer_list *unused)
+{
+   atomic_set(&mhp_offlining_timer_expired, 1);
+}
+
 #ifndef CONFIG_MEMORY_HOTPLUG_DEFAULT_ONLINE
 int mhp_default_online_type = MMOP_OFFLINE;
 #else
@@ -1879,7 +1895,18 @@ int __ref offline_pages(unsigned long start_pfn, 
unsigned long nr_pages,
do {
pfn = start_pfn;
do {
-   if (fatal_signal_pending(current)) {
+   /*
+* If a timer is active, we're coming via
+* offline_and_remove_memory() and want to ignore even
+* fatal signals.
+*/
+   if (mhp_offlining_timer_active) {

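A hedged sketch of how the mechanism described above could fit together
(reconstructed from the visible comment and helpers; mhp_offlining_timer is
an assumed static struct timer_list, and the exact upstream code may
differ):

	/* In offline_pages(): honor the timer instead of fatal signals. */
	if (mhp_offlining_timer_active) {
		if (atomic_read(&mhp_offlining_timer_expired)) {
			ret = -EBUSY;
			reason = "timeout";
			goto failed_removal_isolated;
		}
	} else if (fatal_signal_pending(current)) {
		ret = -EINTR;
		reason = "signal backoff";
		goto failed_removal_isolated;
	}

	/* In offline_and_remove_memory(): arm the timer, applying the 30s
	 * default and the 120s cap from the description above. */
	timeout_ms = timeout_ms ? min_t(unsigned int, timeout_ms,
					120 * MSEC_PER_SEC)
				: 30 * MSEC_PER_SEC;
	atomic_set(&mhp_offlining_timer_expired, 0);
	timer_setup(&mhp_offlining_timer, mhp_offline_timer_fn, 0);
	mod_timer(&mhp_offlining_timer, jiffies + msecs_to_jiffies(timeout_ms));
	mhp_offlining_timer_active = true;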
[PATCH v1 4/5] virtio-mem: set the timeout for offline_and_remove_memory() to 10 seconds

2023-06-27 Thread David Hildenbrand
Currently we use the default (30 seconds), let's reduce it to 10
seconds. In BBM, we barely deal with blocks larger than 1/2 GiB, and
after 10 seconds it's most probably best to give up on that memory block
and try another one (or retry this one later).

In the common fake-offline case where we effectively fake-offline memory
using alloc_contig_range() first (SBM or BBM with bbm_safe_unplug=on),
we expect offline_and_remove_memory() to be blazingly fast and never take
anywhere close to 10 seconds -- so this should only affect BBM with
bbm_safe_unplug=off.

While at it, update the parameter description to clarify the relationship
to unmovable pages.

Signed-off-by: David Hildenbrand 
---
 drivers/virtio/virtio_mem.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/virtio/virtio_mem.c b/drivers/virtio/virtio_mem.c
index f8792223f1db..7468b4a907e3 100644
--- a/drivers/virtio/virtio_mem.c
+++ b/drivers/virtio/virtio_mem.c
@@ -41,7 +41,7 @@ MODULE_PARM_DESC(bbm_block_size,
 static bool bbm_safe_unplug = true;
 module_param(bbm_safe_unplug, bool, 0444);
 MODULE_PARM_DESC(bbm_safe_unplug,
-"Use a safe unplug mechanism in BBM, avoiding long/endless loops");
+"Use a safe/fast unplug mechanism in BBM, failing faster on 
unmovable pages");
 
 /*
  * virtio-mem currently supports the following modes of operation:
@@ -738,7 +738,7 @@ static int virtio_mem_offline_and_remove_memory(struct 
virtio_mem *vm,
"offlining and removing memory: 0x%llx - 0x%llx\n", addr,
addr + size - 1);
 
-   rc = offline_and_remove_memory(addr, size, 0);
+   rc = offline_and_remove_memory(addr, size, 10 * MSEC_PER_SEC);
if (!rc) {
atomic64_sub(size, &vm->offline_size);
/*
-- 
2.40.1



[PATCH v1 5/5] virtio-mem: check if the config changed before (fake) offlining memory

2023-06-27 Thread David Hildenbrand
If we repeatedly fail to (fake) offline memory, we won't be sending
any unplug requests to the device. However, we only check if the config
changed when sending such (un)plug requests.

So we could end up trying for a long time to offline memory, even though
the config changed already and we're not supposed to unplug memory
anymore.

Let's optimize for that case, identified while testing the
offline_and_remove_memory() timeout and simulating it repeatedly running
into the timeout.

Signed-off-by: David Hildenbrand 
---
 drivers/virtio/virtio_mem.c | 8 
 1 file changed, 8 insertions(+)

diff --git a/drivers/virtio/virtio_mem.c b/drivers/virtio/virtio_mem.c
index 7468b4a907e3..247fb3e0ce61 100644
--- a/drivers/virtio/virtio_mem.c
+++ b/drivers/virtio/virtio_mem.c
@@ -1922,6 +1922,10 @@ static int virtio_mem_sbm_unplug_sb_online(struct 
virtio_mem *vm,
unsigned long start_pfn;
int rc;
 
+   /* Stop fake offlining attempts if the config changed. */
+   if (atomic_read(&vm->config_changed))
+   return -EAGAIN;
+
start_pfn = PFN_DOWN(virtio_mem_mb_id_to_phys(mb_id) +
 sb_id * vm->sbm.sb_size);
 
@@ -2233,6 +2237,10 @@ static int virtio_mem_bbm_unplug_request(struct 
virtio_mem *vm, uint64_t diff)
virtio_mem_bbm_for_each_bb_rev(vm, bb_id, 
VIRTIO_MEM_BBM_BB_ADDED) {
cond_resched();
 
+   /* Stop (fake) offlining attempts if the config 
changed. */
+   if (atomic_read(&vm->config_changed))
+   return -EAGAIN;
+
/*
 * As we're holding no locks, these checks are racy,
 * but we don't care.
-- 
2.40.1



[PATCH v1 2/5] virtio-mem: convert most offline_and_remove_memory() errors to -EBUSY

2023-06-27 Thread David Hildenbrand
Let's prepare for offline_and_remove_memory() to return other error
codes that effectively translate to -EBUSY, such as -ETIMEDOUT.

Signed-off-by: David Hildenbrand 
---
 drivers/virtio/virtio_mem.c | 10 +-
 1 file changed, 9 insertions(+), 1 deletion(-)

diff --git a/drivers/virtio/virtio_mem.c b/drivers/virtio/virtio_mem.c
index 835f6cc2fb66..cb8bc6f6aa90 100644
--- a/drivers/virtio/virtio_mem.c
+++ b/drivers/virtio/virtio_mem.c
@@ -750,7 +750,15 @@ static int virtio_mem_offline_and_remove_memory(struct 
virtio_mem *vm,
dev_dbg(&vm->vdev->dev,
"offlining and removing memory failed: %d\n", rc);
}
-   return rc;
+
+   switch (rc) {
+   case 0:
+   case -ENOMEM:
+   case -EINVAL:
+   return rc;
+   default:
+   return -EBUSY;
+   }
 }
 
 /*
-- 
2.40.1



[PATCH v1 0/5] mm/memory_hotplug: make offline_and_remove_memory() timeout instead of failing on fatal signals

2023-06-27 Thread David Hildenbrand
As raised by John Hubbard [1], offline_and_remove_memory() failing on
fatal signals can be sub-optimal for out-of-tree drivers: dying user space
might be the last one holding a device node open.

As that device node gets closed, the driver might unplug the device
and trigger offline_and_remove_memory() to unplug previously
hotplugged device memory. This, however, will fail reliably when fatal
signals are pending on the dying process, turning the device unusable until
the machine gets rebooted.

That can be optimized easily by ignoring fatal signals. In fact, checking
for fatal signals in the case of offline_and_remove_memory() doesn't
make too much sense; the check makes sense when offlining is triggered
directly via sysfs. However, we actually do want a way to not end up
stuck in offline_and_remove_memory() forever.

What offline_and_remove_memory() users actually want is to fail after some
given timeout and not care about fatal signals.

So let's implement that, optimizing virtio-mem along the way.

Cc: Andrew Morton 
Cc: "Michael S. Tsirkin" 
Cc: John Hubbard 
Cc: Oscar Salvador 
Cc: Michal Hocko 
Cc: Jason Wang 
Cc: Xuan Zhuo 

[1] https://lkml.kernel.org/r/20230620011719.155379-1-jhubb...@nvidia.com

David Hildenbrand (5):
  mm/memory_hotplug: check for fatal signals only in offline_pages()
  virtio-mem: convert most offline_and_remove_memory() errors to -EBUSY
  mm/memory_hotplug: make offline_and_remove_memory() timeout instead of
failing on fatal signals
  virtio-mem: set the timeout for offline_and_remove_memory() to 10
seconds
  virtio-mem: check if the config changed before (fake) offlining memory

 drivers/virtio/virtio_mem.c| 22 +--
 include/linux/memory_hotplug.h |  2 +-
 mm/memory_hotplug.c| 50 --
 3 files changed, 68 insertions(+), 6 deletions(-)


base-commit: 6995e2de6891c724bfeb2db33d7b87775f913ad1
-- 
2.40.1



[PATCH v1 1/5] mm/memory_hotplug: check for fatal signals only in offline_pages()

2023-06-27 Thread David Hildenbrand
Let's check for fatal signals only. That looks cleaner and still keeps
the documented use case for manual user-space triggered memory offlining
working. From Documentation/admin-guide/mm/memory-hotplug.rst:

% timeout $TIMEOUT offline_block | failure_handling

In fact, we even document there: "the offlining context can be terminated
by sending a fatal signal".

Signed-off-by: David Hildenbrand 
---
 mm/memory_hotplug.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 8e0fa209d533..0d2151df4ee1 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -1879,7 +1879,7 @@ int __ref offline_pages(unsigned long start_pfn, unsigned 
long nr_pages,
do {
pfn = start_pfn;
do {
-   if (signal_pending(current)) {
+   if (fatal_signal_pending(current)) {
ret = -EINTR;
reason = "signal backoff";
goto failed_removal_isolated;
-- 
2.40.1
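For reference, a concrete instance of the documented pattern above (the
memory block number is illustrative): timeout(1) sends SIGTERM, which --
now that only fatal signals are checked -- still terminates the offlining
context when the process has not installed a handler for it:

	% timeout 30 sh -c 'echo offline > /sys/devices/system/memory/memory42/state'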



Re: [PATCH vhost v10 10/10] virtio_net: support dma premapped

2023-06-27 Thread Xuan Zhuo
On Tue, 27 Jun 2023 16:03:35 +0800, Jason Wang  wrote:
> On Fri, Jun 2, 2023 at 5:22 PM Xuan Zhuo  wrote:
> >
> > Introduce the module param "experiment_premapped" to enable the function
> > that the virtio-net does dma mapping.
> >
> > If that is true, the vq of virtio-net is under the premapped mode.
> > It just handles the sg with dma_address. And the driver must get the dma
> > address of the buffer to unmap after getting the buffer from the virtio core.
> >
> > That will be useful when AF_XDP is enabled; AF_XDP tx and the kernel packet
> > xmit will share the tx queue, so the skb xmit must support the premapped
> > mode.
> >
> > Signed-off-by: Xuan Zhuo 
> > ---
> >  drivers/net/virtio_net.c | 163 +--
> >  1 file changed, 141 insertions(+), 22 deletions(-)
> >
> > diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
> > index 2396c28c0122..5898212fcb3c 100644
> > --- a/drivers/net/virtio_net.c
> > +++ b/drivers/net/virtio_net.c
> > @@ -26,10 +26,11 @@
> >  static int napi_weight = NAPI_POLL_WEIGHT;
> >  module_param(napi_weight, int, 0444);
> >
> > -static bool csum = true, gso = true, napi_tx = true;
> > +static bool csum = true, gso = true, napi_tx = true, experiment_premapped;
> >  module_param(csum, bool, 0444);
> >  module_param(gso, bool, 0444);
> >  module_param(napi_tx, bool, 0644);
> > +module_param(experiment_premapped, bool, 0644);
>
> Having a module parameter is sub-optimal. I think we can demonstrate
> real benefit:
>
> In the case of a merge rx buffer, if the mapping is done by the
> virtio-core, it needs to be done per buffer (< PAGE_SIZE).
>
> But if it is done by the virtio-net, we have a chance to map the
> buffer per page, which can save a lot of mapping and unmapping. A lot
> of other optimizations could be done on top as well.


Good point.

Thanks


>
> If we manage to prove this, we don't need any experimental module
> parameters at all.
>
> Thanks
>
>
> >
> >  /* FIXME: MTU in config. */
> >  #define GOOD_PACKET_LEN (ETH_HLEN + VLAN_HLEN + ETH_DATA_LEN)
> > @@ -142,6 +143,9 @@ struct send_queue {
> >
> > /* Record whether sq is in reset state. */
> > bool reset;
> > +
> > +   /* The vq is premapped mode. */
> > +   bool premapped;
> >  };
> >
> >  /* Internal representation of a receive virtqueue */
> > @@ -174,6 +178,9 @@ struct receive_queue {
> > char name[16];
> >
> > struct xdp_rxq_info xdp_rxq;
> > +
> > +   /* The vq is premapped mode. */
> > +   bool premapped;
> >  };
> >
> >  /* This structure can contain rss message with maximum settings for 
> > indirection table and keysize
> > @@ -546,6 +553,105 @@ static struct sk_buff *page_to_skb(struct 
> > virtnet_info *vi,
> > return skb;
> >  }
> >
> > +static int virtnet_generic_unmap(struct virtqueue *vq, struct 
> > virtqueue_detach_cursor *cursor)
> > +{
> > +   enum dma_data_direction dir;
> > +   dma_addr_t addr;
> > +   u32 len;
> > +   int err;
> > +
> > +   do {
> > +   err = virtqueue_detach(vq, cursor, &addr, &len, &dir);
> > +   if (!err || err == -EAGAIN)
> > +   dma_unmap_page_attrs(virtqueue_dma_dev(vq), addr, 
> > len, dir, 0);
> > +
> > +   } while (err == -EAGAIN);
> > +
> > +   return err;
> > +}
> > +
> > +static void *virtnet_detach_unused_buf(struct virtqueue *vq, bool 
> > premapped)
> > +{
> > +   struct virtqueue_detach_cursor cursor;
> > +   void *buf;
> > +
> > +   if (!premapped)
> > +   return virtqueue_detach_unused_buf(vq);
> > +
> > +   buf = virtqueue_detach_unused_buf_premapped(vq, &cursor);
> > +   if (buf)
> > +   virtnet_generic_unmap(vq, &cursor);
> > +
> > +   return buf;
> > +}
> > +
> > +static void *virtnet_get_buf_ctx(struct virtqueue *vq, bool premapped, u32 
> > *len, void **ctx)
> > +{
> > +   struct virtqueue_detach_cursor cursor;
> > +   void *buf;
> > +
> > +   if (!premapped)
> > +   return virtqueue_get_buf_ctx(vq, len, ctx);
> > +
> > +   buf = virtqueue_get_buf_premapped(vq, len, ctx, &cursor);
> > +   if (buf)
> > +   virtnet_generic_unmap(vq, &cursor);
> > +
> > +   return buf;
> > +}
> > +
> > +#define virtnet_rq_get_buf(rq, plen, pctx) \
> > +({ \
> > +   typeof(rq) _rq = (rq); \
> > +   virtnet_get_buf_ctx(_rq->vq, _rq->premapped, plen, pctx); \
> > +})
> > +
> > +#define virtnet_sq_get_buf(sq, plen, pctx) \
> > +({ \
> > +   typeof(sq) _sq = (sq); \
> > +   virtnet_get_buf_ctx(_sq->vq, _sq->premapped, plen, pctx); \
> > +})
> > +
> > +static int virtnet_add_sg(struct virtqueue *vq, bool premapped,
> > + struct scatterlist *sg, unsigned int num, bool 
> > out,
> > + void *data, void *ctx, gfp_t gfp)
> > +{
> > +   enum dma_data_direction dir;
> > +   struct device *dev;
> > +   int err, ret;
> > +
> > +   if (!premapped)
> > + 

Re: [PATCH vhost v10 05/10] virtio_ring: split-detach: support return dma info to driver

2023-06-27 Thread Xuan Zhuo
On Tue, 27 Jun 2023 16:03:31 +0800, Jason Wang  wrote:
> On Fri, Jun 2, 2023 at 5:22 PM Xuan Zhuo  wrote:
> >
> > Under the premapped mode, the driver needs to unmap the DMA address
> > after receiving the buffer. The virtio core records the DMA address,
> > so the driver needs a way to get the dma info from the virtio core.
>
> A second thought, can we simply offload the tracking to the driver
> itself? This looks like the way many other modern NIC drivers work.
>
> In premapped mode, the DMA address is in fact told by the driver
> itself, so it should have sufficient knowledge. And in some cases, the
> driver wants to optimize/merge/delay the unmapping, so the DMA
> addresses returned by the virtio core are not even of interest in those
> cases.

Will fix.

Thanks.


>
> Thanks
>
>
>
> >
> > A straightforward approach is to pass an array to the virtio core when
> > calling virtqueue_get_buf(). However, it is not feasible when there are
> > multiple DMA addresses in the descriptor chain, and the array size is
> > unknown.
> >
> > To solve this problem, a helper is introduced. After calling
> > virtqueue_get_buf(), the driver can call the helper to
> > retrieve the dma info. If the helper function returns -EAGAIN, it means
> > that there are more DMA addresses to be processed, and the driver should
> > call the helper function again. To keep track of the current position in
> > the chain, a cursor must be passed to the helper function, which is
> > initialized by virtqueue_get_buf().
> >
> > Some processes are done inside this helper, so this helper MUST be
> > called under the premapped mode.
> >
> > Signed-off-by: Xuan Zhuo 
> > ---
> >  drivers/virtio/virtio_ring.c | 118 ---
> >  include/linux/virtio.h   |  11 
> >  2 files changed, 119 insertions(+), 10 deletions(-)
> >
> > diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c
> > index dc109fbc05a5..cdc4349f6066 100644
> > --- a/drivers/virtio/virtio_ring.c
> > +++ b/drivers/virtio/virtio_ring.c
> > @@ -754,8 +754,95 @@ static bool virtqueue_kick_prepare_split(struct 
> > virtqueue *_vq)
> > return needs_kick;
> >  }
> >
> > -static void detach_buf_split(struct vring_virtqueue *vq, unsigned int head,
> > -void **ctx)
> > +static void detach_cursor_init_split(struct vring_virtqueue *vq,
> > +struct virtqueue_detach_cursor 
> > *cursor, u16 head)
> > +{
> > +   struct vring_desc_extra *extra;
> > +
> > +   extra = &vq->split.desc_extra[head];
> > +
> > +   /* Clear data ptr. */
> > +   vq->split.desc_state[head].data = NULL;
> > +
> > +   cursor->head = head;
> > +   cursor->done = 0;
> > +
> > +   if (extra->flags & VRING_DESC_F_INDIRECT) {
> > +   cursor->num = extra->len / sizeof(struct vring_desc);
> > +   cursor->indirect = true;
> > +   cursor->pos = 0;
> > +
> > +   vring_unmap_one_split(vq, head);
> > +
> > +   extra->next = vq->free_head;
> > +
> > +   vq->free_head = head;
> > +
> > +   /* Plus final descriptor */
> > +   vq->vq.num_free++;
> > +
> > +   } else {
> > +   cursor->indirect = false;
> > +   cursor->pos = head;
> > +   }
> > +}
> > +
> > +static int virtqueue_detach_split(struct virtqueue *_vq, struct 
> > virtqueue_detach_cursor *cursor,
> > + dma_addr_t *addr, u32 *len, enum 
> > dma_data_direction *dir)
> > +{
> > +   struct vring_virtqueue *vq = to_vvq(_vq);
> > +   __virtio16 nextflag = cpu_to_virtio16(vq->vq.vdev, 
> > VRING_DESC_F_NEXT);
> > +   int rc = -EAGAIN;
> > +
> > +   if (unlikely(cursor->done))
> > +   return -EINVAL;
> > +
> > +   if (!cursor->indirect) {
> > +   struct vring_desc_extra *extra;
> > +   unsigned int i;
> > +
> > +   i = cursor->pos;
> > +
> > +   extra = &vq->split.desc_extra[i];
> > +
> > +   if (vq->split.vring.desc[i].flags & nextflag) {
> > +   cursor->pos = extra->next;
> > +   } else {
> > +   extra->next = vq->free_head;
> > +   vq->free_head = cursor->head;
> > +   cursor->done = true;
> > +   rc = 0;
> > +   }
> > +
> > +   *addr = extra->addr;
> > +   *len = extra->len;
> > +   *dir = (extra->flags & VRING_DESC_F_WRITE) ? 
> > DMA_FROM_DEVICE : DMA_TO_DEVICE;
> > +
> > +   vq->vq.num_free++;
> > +
> > +   } else {
> > +   struct vring_desc *indir_desc, *desc;
> > +   u16 flags;
> > +
> > +   indir_desc = vq->split.desc_state[cursor->head].indir_desc;
> > +   desc = &indir_desc[cursor->pos];
> > +
> > +   flags = virtio16_to_cpu(vq->vq.vdev, desc->flags);
> > +   

Re: [PATCH vhost v10 04/10] virtio_ring: packed: support add premapped buf

2023-06-27 Thread Xuan Zhuo
On Tue, 27 Jun 2023 16:03:29 +0800, Jason Wang  wrote:
> On Fri, Jun 2, 2023 at 5:22 PM Xuan Zhuo  wrote:
> >
> > If the vq is in premapped mode, use sg_dma_address() directly.
> >
> > Signed-off-by: Xuan Zhuo 
> > ---
> >  drivers/virtio/virtio_ring.c | 36 ++--
> >  1 file changed, 26 insertions(+), 10 deletions(-)
> >
> > diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c
> > index 18212c3e056b..dc109fbc05a5 100644
> > --- a/drivers/virtio/virtio_ring.c
> > +++ b/drivers/virtio/virtio_ring.c
> > @@ -1299,9 +1299,13 @@ static int virtqueue_add_indirect_packed(struct 
> > vring_virtqueue *vq,
> >
> > for (n = 0; n < out_sgs + in_sgs; n++) {
> > for (sg = sgs[n]; sg; sg = sg_next(sg)) {
> > -   if (vring_map_one_sg(vq, sg, n < out_sgs ?
> > -DMA_TO_DEVICE : 
> > DMA_FROM_DEVICE, &addr))
> > -   goto unmap_release;
> > +   if (vq->premapped) {
> > +   addr = sg_dma_address(sg);
> > +   } else {
> > +   if (vring_map_one_sg(vq, sg, n < out_sgs ?
> > +DMA_TO_DEVICE : 
> > DMA_FROM_DEVICE, &addr))
> > +   goto unmap_release;
> > +   }
> >
> > desc[i].flags = cpu_to_le16(n < out_sgs ?
> > 0 : VRING_DESC_F_WRITE);
> > @@ -1369,10 +1373,12 @@ static int virtqueue_add_indirect_packed(struct 
> > vring_virtqueue *vq,
> > return 0;
> >
> >  unmap_release:
> > -   err_idx = i;
> > +   if (!vq->premapped) {
> > +   err_idx = i;
> >
> > -   for (i = 0; i < err_idx; i++)
> > -   vring_unmap_desc_packed(vq, &desc[i]);
> > +   for (i = 0; i < err_idx; i++)
> > +   vring_unmap_desc_packed(vq, &desc[i]);
> > +   }
> >
> > kfree(desc);
> >
> > @@ -1447,9 +1453,13 @@ static inline int virtqueue_add_packed(struct 
> > virtqueue *_vq,
> > for (sg = sgs[n]; sg; sg = sg_next(sg)) {
> > dma_addr_t addr;
> >
> > -   if (vring_map_one_sg(vq, sg, n < out_sgs ?
> > -DMA_TO_DEVICE : 
> > DMA_FROM_DEVICE, &addr))
> > -   goto unmap_release;
> > +   if (vq->premapped) {
> > +   addr = sg_dma_address(sg);
> > +   } else {
> > +   if (vring_map_one_sg(vq, sg, n < out_sgs ?
> > +DMA_TO_DEVICE : 
> > DMA_FROM_DEVICE, &addr))
> > +   goto unmap_release;
> > +   }
> >
> > flags = cpu_to_le16(vq->packed.avail_used_flags |
> > (++c == total_sg ? 0 : 
> > VRING_DESC_F_NEXT) |
> > @@ -1512,11 +1522,17 @@ static inline int virtqueue_add_packed(struct 
> > virtqueue *_vq,
> > return 0;
> >
> >  unmap_release:
> > +   vq->packed.avail_used_flags = avail_used_flags;
> > +
> > +   if (vq->premapped) {
>
> Similar to the split path, I think we can't hit vq->premapped here.


YES, similar to the above reply, we can have a better way.

But we can hit vq->premapped when DMA mapping of the indirect desc
array fails.

Thanks.
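For readers following along: the case meant here is the mapping of the
indirect descriptor table itself, which virtqueue_add_indirect_packed()
still performs even in premapped mode. A sketch matching the shape of that
path (details may differ in v10):

	addr = vring_map_single(vq, desc,
				total_sg * sizeof(struct vring_packed_desc),
				DMA_TO_DEVICE);
	if (vring_mapping_error(vq, addr))
		goto unmap_release;	/* reachable with vq->premapped set */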




>
> Thanks
>
>
> > +   END_USE(vq);
> > +   return -EIO;
> > +   }
> > +
> > err_idx = i;
> > i = head;
> > curr = vq->free_head;
> >
> > -   vq->packed.avail_used_flags = avail_used_flags;
> >
> > for (n = 0; n < total_sg; n++) {
> > if (i == err_idx)
> > --
> > 2.32.0.3.g01195cf9f
> >
>

Re: [PATCH vhost v10 03/10] virtio_ring: split: support add premapped buf

2023-06-27 Thread Xuan Zhuo
On Tue, 27 Jun 2023 16:03:26 +0800, Jason Wang  wrote:
> On Fri, Jun 2, 2023 at 5:22 PM Xuan Zhuo  wrote:
> >
> > If the vq is in premapped mode, use sg_dma_address() directly.
> >
> > Signed-off-by: Xuan Zhuo 
> > ---
> >  drivers/virtio/virtio_ring.c | 46 ++--
> >  1 file changed, 28 insertions(+), 18 deletions(-)
> >
> > diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c
> > index 2afdfb9e3e30..18212c3e056b 100644
> > --- a/drivers/virtio/virtio_ring.c
> > +++ b/drivers/virtio/virtio_ring.c
> > @@ -598,8 +598,12 @@ static inline int virtqueue_add_split(struct virtqueue 
> > *_vq,
> > for (sg = sgs[n]; sg; sg = sg_next(sg)) {
> > dma_addr_t addr;
> >
> > -   if (vring_map_one_sg(vq, sg, DMA_TO_DEVICE, &addr))
> > -   goto unmap_release;
> > +   if (vq->premapped) {
> > +   addr = sg_dma_address(sg);
> > +   } else {
> > +   if (vring_map_one_sg(vq, sg, DMA_TO_DEVICE, 
> > &addr))
> > +   goto unmap_release;
> > +   }
>
> Btw, I wonder whether or not it would be simpler to implement the
> vq->premapped check inside vring_map_one_sg(), assuming the
> !use_dma_api check is done there as well.


YES,

That will be simpler for the caller.

But we will have things like:

int func(bool premapped)
{
	if (!premapped)
		return 0;
	/* ... */
}

I like this way, but you didn't like it in the last version.

>
> >
> > prev = i;
> > /* Note that we trust indirect descriptor
> > @@ -614,8 +618,12 @@ static inline int virtqueue_add_split(struct virtqueue 
> > *_vq,
> > for (sg = sgs[n]; sg; sg = sg_next(sg)) {
> > dma_addr_t addr;
> >
> > -   if (vring_map_one_sg(vq, sg, DMA_FROM_DEVICE, 
> > &addr))
> > -   goto unmap_release;
> > +   if (vq->premapped) {
> > +   addr = sg_dma_address(sg);
> > +   } else {
> > +   if (vring_map_one_sg(vq, sg, 
> > DMA_FROM_DEVICE, &addr))
> > +   goto unmap_release;
> > +   }
> >
> > prev = i;
> > /* Note that we trust indirect descriptor
> > @@ -689,21 +697,23 @@ static inline int virtqueue_add_split(struct 
> > virtqueue *_vq,
> > return 0;
> >
> >  unmap_release:
> > -   err_idx = i;
> > +   if (!vq->premapped) {
>
> Can vq->premapped be true here? The label is named "unmap_release",
> which implies a "map" beforehand, which seems not to be the case for
> premapping.

I see.

Rethinking about this, there is a better way.
I will fix it in the next version.


Thanks.


>
> Thanks
>
>
> > +   err_idx = i;
> >
> > -   if (indirect)
> > -   i = 0;
> > -   else
> > -   i = head;
> > -
> > -   for (n = 0; n < total_sg; n++) {
> > -   if (i == err_idx)
> > -   break;
> > -   if (indirect) {
> > -   vring_unmap_one_split_indirect(vq, &desc[i]);
> > -   i = virtio16_to_cpu(_vq->vdev, desc[i].next);
> > -   } else
> > -   i = vring_unmap_one_split(vq, i);
> > +   if (indirect)
> > +   i = 0;
> > +   else
> > +   i = head;
> > +
> > +   for (n = 0; n < total_sg; n++) {
> > +   if (i == err_idx)
> > +   break;
> > +   if (indirect) {
> > +   vring_unmap_one_split_indirect(vq, 
> > &desc[i]);
> > +   i = virtio16_to_cpu(_vq->vdev, 
> > desc[i].next);
> > +   } else
> > +   i = vring_unmap_one_split(vq, i);
> > +   }
> > }
> >
> > if (indirect)
> > --
> > 2.32.0.3.g01195cf9f
> >
>

Re: [PATCH vhost v10 02/10] virtio_ring: introduce virtqueue_set_premapped()

2023-06-27 Thread Xuan Zhuo
On Tue, 27 Jun 2023 16:03:23 +0800, Jason Wang  wrote:
> On Fri, Jun 2, 2023 at 5:22 PM Xuan Zhuo  wrote:
> >
> > This helper allows the driver to change the dma mode to premapped mode.
> > Under the premapped mode, the virtio core does not do dma mapping
> > internally.
> >
> > This just works when the use_dma_api is true. If the use_dma_api is false,
> > the dma operations do not go through the DMA APIs, which is not the
> > standard way of the linux kernel.
> >
> > Signed-off-by: Xuan Zhuo 
> > ---
> >  drivers/virtio/virtio_ring.c | 40 
> >  include/linux/virtio.h   |  2 ++
> >  2 files changed, 42 insertions(+)
> >
> > diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c
> > index 72ed07a604d4..2afdfb9e3e30 100644
> > --- a/drivers/virtio/virtio_ring.c
> > +++ b/drivers/virtio/virtio_ring.c
> > @@ -172,6 +172,9 @@ struct vring_virtqueue {
> > /* Host publishes avail event idx */
> > bool event;
> >
> > +   /* Do DMA mapping by driver */
> > +   bool premapped;
> > +
> > /* Head of free buffer list. */
> > unsigned int free_head;
> > /* Number we've added since last sync. */
> > @@ -2059,6 +2062,7 @@ static struct virtqueue 
> > *vring_create_virtqueue_packed(
> > vq->packed_ring = true;
> > vq->dma_dev = dma_dev;
> > vq->use_dma_api = vring_use_dma_api(vdev);
> > +   vq->premapped = false;
> >
> > vq->indirect = virtio_has_feature(vdev, 
> > VIRTIO_RING_F_INDIRECT_DESC) &&
> > !context;
> > @@ -2548,6 +2552,7 @@ static struct virtqueue 
> > *__vring_new_virtqueue(unsigned int index,
> >  #endif
> > vq->dma_dev = dma_dev;
> > vq->use_dma_api = vring_use_dma_api(vdev);
> > +   vq->premapped = false;
> >
> > vq->indirect = virtio_has_feature(vdev, 
> > VIRTIO_RING_F_INDIRECT_DESC) &&
> > !context;
> > @@ -2691,6 +2696,41 @@ int virtqueue_resize(struct virtqueue *_vq, u32 num,
> >  }
> >  EXPORT_SYMBOL_GPL(virtqueue_resize);
> >
> > +/**
> > + * virtqueue_set_premapped - set the vring premapped mode
> > + * @_vq: the struct virtqueue we're talking about.
> > + *
> > + * Enable the premapped mode of the vq.
> > + *
> > + * The vring in premapped mode does not do dma internally, so the driver 
> > must
> > + * do dma mapping in advance. The driver must pass the dma_address through
> > + * dma_address of scatterlist. When the driver got a used buffer from
> > + * the vring, it has to unmap the dma address. So the driver must call
> > + * virtqueue_get_buf_premapped()/virtqueue_detach_unused_buf_premapped().
> > + *
> > + * This must be called before adding any buf to vring.
>
> And any old buffer should be detached?

I mean before adding any buf. So there are no old buffers.


>
> > + * So this should be called immediately after init vq or vq reset.
>
> Any way to detect and warn in this case? (not a must if it's too
> expensive to do the check)


I can try to check whether the queue is empty.


>
> > + *
> > + * Caller must ensure we don't call this with other virtqueue operations
> > + * at the same time (except where noted).
> > + *
> > + * Returns zero or a negative error.
> > + * 0: success.
> > + * -EINVAL: vring does not use the dma api, so we can not enable premapped 
> > mode.
> > + */
> > +int virtqueue_set_premapped(struct virtqueue *_vq)
> > +{
> > +   struct vring_virtqueue *vq = to_vvq(_vq);
> > +
> > +   if (!vq->use_dma_api)
> > +   return -EINVAL;
> > +
> > +   vq->premapped = true;
>
> I guess there should be a way to disable it. Would it be useful for
> the case when AF_XDP sockets were destroyed?

Yes.

When we reset the queue, the vq->premapped will be set to 0.

This is called after find_vqs or a vq reset.

Thanks.
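A hedged usage sketch of the intended call site (error handling is
illustrative, not from the patch):

	/* Right after find_vqs() or a vq reset, before any buffer is added. */
	if (virtqueue_set_premapped(vq)) {
		/* !use_dma_api: fall back to letting the virtio core map. */
		premapped = false;
	}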



>
> Thanks
>
>
> > +
> > +   return 0;
> > +}
> > +EXPORT_SYMBOL_GPL(virtqueue_set_premapped);
> > +
> >  /* Only available for split ring */
> >  struct virtqueue *vring_new_virtqueue(unsigned int index,
> >   unsigned int num,
> > diff --git a/include/linux/virtio.h b/include/linux/virtio.h
> > index b93238db94e3..1fc0e1023bd4 100644
> > --- a/include/linux/virtio.h
> > +++ b/include/linux/virtio.h
> > @@ -78,6 +78,8 @@ bool virtqueue_enable_cb(struct virtqueue *vq);
> >
> >  unsigned virtqueue_enable_cb_prepare(struct virtqueue *vq);
> >
> > +int virtqueue_set_premapped(struct virtqueue *_vq);
> > +
> >  bool virtqueue_poll(struct virtqueue *vq, unsigned);
> >
> >  bool virtqueue_enable_cb_delayed(struct virtqueue *vq);
> > --
> > 2.32.0.3.g01195cf9f
> >
>

Re: [PATCH v3 1/8] drm: Disable the cursor plane on atomic contexts with virtualized drivers

2023-06-27 Thread Pekka Paalanen
On Mon, 26 Jun 2023 23:58:32 -0400
Zack Rusin  wrote:

> From: Zack Rusin 
> 
> Cursor planes on virtualized drivers have special meaning and require
> that the clients handle them in specific ways, e.g. the cursor plane
> should react to the mouse movement the way a mouse cursor would be
> expected to and the client is required to set hotspot properties on it
> in order for the mouse events to be routed correctly.
> 
> This breaks the contract as specified by the "universal planes". Fix it
> by disabling the cursor planes on virtualized drivers while adding
> a foundation on top of which it's possible to special case mouse cursor
> planes for clients that want it.
> 
> Disabling the cursor planes makes some kms compositors which were broken,
> e.g. Weston, fall back to a software cursor, which works fine or at least
> better than currently, while having no effect on others, e.g. gnome-shell
> or kwin, which put virtualized drivers on a deny-list when running in an
> atomic context to make them fall back to legacy kms and avoid this issue.
> 
> Signed-off-by: Zack Rusin 
> Fixes: 681e7ec73044 ("drm: Allow userspace to ask for universal plane list 
> (v2)")

Sounds good to me.

Acked-by: Pekka Paalanen 


Thanks,
pq
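For context, the kernel-side gate here is
file_priv->supports_virtualized_cursor_plane; a hedged sketch of the
matching userspace opt-in, assuming a client cap gets exposed for it (the
cap name below is an assumption, not defined by this patch):

	/* Atomic client advertising it can drive a virtualized cursor plane. */
	drmSetClientCap(fd, DRM_CLIENT_CAP_CURSOR_PLANE_HOTSPOT, 1);
	drmSetClientCap(fd, DRM_CLIENT_CAP_ATOMIC, 1);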



> Cc:  # v5.4+
> Cc: Maarten Lankhorst 
> Cc: Maxime Ripard 
> Cc: Thomas Zimmermann 
> Cc: David Airlie 
> Cc: Daniel Vetter 
> Cc: Dave Airlie 
> Cc: Gerd Hoffmann 
> Cc: Hans de Goede 
> Cc: Gurchetan Singh 
> Cc: Chia-I Wu 
> Cc: dri-de...@lists.freedesktop.org
> Cc: virtualization@lists.linux-foundation.org
> Cc: spice-de...@lists.freedesktop.org
> ---
>  drivers/gpu/drm/drm_plane.c  | 13 +
>  drivers/gpu/drm/qxl/qxl_drv.c|  2 +-
>  drivers/gpu/drm/vboxvideo/vbox_drv.c |  2 +-
>  drivers/gpu/drm/virtio/virtgpu_drv.c |  2 +-
>  drivers/gpu/drm/vmwgfx/vmwgfx_drv.c  |  2 +-
>  include/drm/drm_drv.h|  9 +
>  include/drm/drm_file.h   | 12 
>  7 files changed, 38 insertions(+), 4 deletions(-)
> 
> diff --git a/drivers/gpu/drm/drm_plane.c b/drivers/gpu/drm/drm_plane.c
> index 24e7998d1731..a4a39f4834e2 100644
> --- a/drivers/gpu/drm/drm_plane.c
> +++ b/drivers/gpu/drm/drm_plane.c
> @@ -678,6 +678,19 @@ int drm_mode_getplane_res(struct drm_device *dev, void 
> *data,
>   !file_priv->universal_planes)
>   continue;
>  
> + /*
> +  * If we're running on a virtualized driver then,
> +  * unless userspace advertizes support for the
> +  * virtualized cursor plane, disable cursor planes
> +  * because they'll be broken due to missing cursor
> +  * hotspot info.
> +  */
> + if (plane->type == DRM_PLANE_TYPE_CURSOR &&
> + drm_core_check_feature(dev, DRIVER_CURSOR_HOTSPOT)  &&
> + file_priv->atomic &&
> + !file_priv->supports_virtualized_cursor_plane)
> + continue;
> +
>   if (drm_lease_held(file_priv, plane->base.id)) {
>   if (count < plane_resp->count_planes &&
>   put_user(plane->base.id, plane_ptr + count))
> diff --git a/drivers/gpu/drm/qxl/qxl_drv.c b/drivers/gpu/drm/qxl/qxl_drv.c
> index b30ede1cf62d..91930e84a9cd 100644
> --- a/drivers/gpu/drm/qxl/qxl_drv.c
> +++ b/drivers/gpu/drm/qxl/qxl_drv.c
> @@ -283,7 +283,7 @@ static const struct drm_ioctl_desc qxl_ioctls[] = {
>  };
>  
>  static struct drm_driver qxl_driver = {
> - .driver_features = DRIVER_GEM | DRIVER_MODESET | DRIVER_ATOMIC,
> + .driver_features = DRIVER_GEM | DRIVER_MODESET | DRIVER_ATOMIC | 
> DRIVER_CURSOR_HOTSPOT,
>  
>   .dumb_create = qxl_mode_dumb_create,
>   .dumb_map_offset = drm_gem_ttm_dumb_map_offset,
> diff --git a/drivers/gpu/drm/vboxvideo/vbox_drv.c 
> b/drivers/gpu/drm/vboxvideo/vbox_drv.c
> index 4fee15c97c34..8ecd0863fad7 100644
> --- a/drivers/gpu/drm/vboxvideo/vbox_drv.c
> +++ b/drivers/gpu/drm/vboxvideo/vbox_drv.c
> @@ -172,7 +172,7 @@ DEFINE_DRM_GEM_FOPS(vbox_fops);
>  
>  static const struct drm_driver driver = {
>   .driver_features =
> - DRIVER_MODESET | DRIVER_GEM | DRIVER_ATOMIC,
> + DRIVER_MODESET | DRIVER_GEM | DRIVER_ATOMIC | DRIVER_CURSOR_HOTSPOT,
>  
>   .fops = &vbox_fops,
>   .name = DRIVER_NAME,
> diff --git a/drivers/gpu/drm/virtio/virtgpu_drv.c 
> b/drivers/gpu/drm/virtio/virtgpu_drv.c
> index a7ec5a3770da..8f4bb8a4e952 100644
> --- a/drivers/gpu/drm/virtio/virtgpu_drv.c
> +++ b/drivers/gpu/drm/virtio/virtgpu_drv.c
> @@ -176,7 +176,7 @@ static const struct drm_driver driver = {
>* If KMS is disabled DRIVER_MODESET and DRIVER_ATOMIC are masked
>* out via drm_device::driver_features:
>*/
> - .driver_features = DRIVER_MODESET | DRIVER_GEM | DRIVER_RENDER | 
> DRIVER_ATOMIC,
> + .driver_features = DRIVER_MODESET | DRIVER_GEM | DRIVER_RENDER | 
> DRIVER_ATOMIC | DRIVER_CURSOR_HOTSPOT,
>   .open = virtio_gpu

Re: [PATCH vhost v10 10/10] virtio_net: support dma premapped

2023-06-27 Thread Jason Wang
On Fri, Jun 2, 2023 at 5:22 PM Xuan Zhuo  wrote:
>
> Introduce the module param "experiment_premapped" to enable the function
> that the virtio-net does dma mapping.
>
> If that is true, the vq of virtio-net is under the premapped mode.
> It just handles the sg with dma_address. And the driver must get the dma
> address of the buffer to unmap after getting the buffer from the virtio core.
>
> That will be useful when AF_XDP is enabled; AF_XDP tx and the kernel packet
> xmit will share the tx queue, so the skb xmit must support the premapped
> mode.
>
> Signed-off-by: Xuan Zhuo 
> ---
>  drivers/net/virtio_net.c | 163 +--
>  1 file changed, 141 insertions(+), 22 deletions(-)
>
> diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
> index 2396c28c0122..5898212fcb3c 100644
> --- a/drivers/net/virtio_net.c
> +++ b/drivers/net/virtio_net.c
> @@ -26,10 +26,11 @@
>  static int napi_weight = NAPI_POLL_WEIGHT;
>  module_param(napi_weight, int, 0444);
>
> -static bool csum = true, gso = true, napi_tx = true;
> +static bool csum = true, gso = true, napi_tx = true, experiment_premapped;
>  module_param(csum, bool, 0444);
>  module_param(gso, bool, 0444);
>  module_param(napi_tx, bool, 0644);
> +module_param(experiment_premapped, bool, 0644);

Having a module parameter is sub-optimal. I think we can demonstrate
real benefit:

In the case of a merge rx buffer, if the mapping is done by the
virtio-core, it needs to be done per buffer (< PAGE_SIZE).

But if it is done by the virtio-net, we have a chance to map the
buffer per page, which can save a lot of mapping and unmapping. A lot
of other optimizations could be done on top as well.

If we manage to prove this, we don't need any experimental module
parameters at all.

Thanks
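A hedged sketch of that per-page idea (names and structure are
illustrative, not from this series): map the page once, then point each
merge-rx buffer at an offset within it:

	struct scatterlist sg[1];
	dma_addr_t pa;
	unsigned int off;

	pa = dma_map_page(virtqueue_dma_dev(vq), page, 0, PAGE_SIZE,
			  DMA_FROM_DEVICE);
	if (dma_mapping_error(virtqueue_dma_dev(vq), pa))
		return -ENOMEM;

	/* One mapping now backs PAGE_SIZE / len rx buffers. */
	for (off = 0; off + len <= PAGE_SIZE; off += len) {
		sg_init_table(sg, 1);
		sg_dma_address(sg) = pa + off;
		sg_dma_len(sg) = len;
		/* add sg to the premapped rq->vq here */
	}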


>
>  /* FIXME: MTU in config. */
>  #define GOOD_PACKET_LEN (ETH_HLEN + VLAN_HLEN + ETH_DATA_LEN)
> @@ -142,6 +143,9 @@ struct send_queue {
>
> /* Record whether sq is in reset state. */
> bool reset;
> +
> +   /* The vq is premapped mode. */
> +   bool premapped;
>  };
>
>  /* Internal representation of a receive virtqueue */
> @@ -174,6 +178,9 @@ struct receive_queue {
> char name[16];
>
> struct xdp_rxq_info xdp_rxq;
> +
> +   /* The vq is premapped mode. */
> +   bool premapped;
>  };
>
>  /* This structure can contain rss message with maximum settings for 
> indirection table and keysize
> @@ -546,6 +553,105 @@ static struct sk_buff *page_to_skb(struct virtnet_info 
> *vi,
> return skb;
>  }
>
> +static int virtnet_generic_unmap(struct virtqueue *vq, struct 
> virtqueue_detach_cursor *cursor)
> +{
> +   enum dma_data_direction dir;
> +   dma_addr_t addr;
> +   u32 len;
> +   int err;
> +
> +   do {
> +   err = virtqueue_detach(vq, cursor, &addr, &len, &dir);
> +   if (!err || err == -EAGAIN)
> +   dma_unmap_page_attrs(virtqueue_dma_dev(vq), addr, 
> len, dir, 0);
> +
> +   } while (err == -EAGAIN);
> +
> +   return err;
> +}
> +
> +static void *virtnet_detach_unused_buf(struct virtqueue *vq, bool premapped)
> +{
> +   struct virtqueue_detach_cursor cursor;
> +   void *buf;
> +
> +   if (!premapped)
> +   return virtqueue_detach_unused_buf(vq);
> +
> +   buf = virtqueue_detach_unused_buf_premapped(vq, &cursor);
> +   if (buf)
> +   virtnet_generic_unmap(vq, &cursor);
> +
> +   return buf;
> +}
> +
> +static void *virtnet_get_buf_ctx(struct virtqueue *vq, bool premapped, u32 
> *len, void **ctx)
> +{
> +   struct virtqueue_detach_cursor cursor;
> +   void *buf;
> +
> +   if (!premapped)
> +   return virtqueue_get_buf_ctx(vq, len, ctx);
> +
> +   buf = virtqueue_get_buf_premapped(vq, len, ctx, &cursor);
> +   if (buf)
> +   virtnet_generic_unmap(vq, &cursor);
> +
> +   return buf;
> +}
> +
> +#define virtnet_rq_get_buf(rq, plen, pctx) \
> +({ \
> +   typeof(rq) _rq = (rq); \
> +   virtnet_get_buf_ctx(_rq->vq, _rq->premapped, plen, pctx); \
> +})
> +
> +#define virtnet_sq_get_buf(sq, plen, pctx) \
> +({ \
> +   typeof(sq) _sq = (sq); \
> +   virtnet_get_buf_ctx(_sq->vq, _sq->premapped, plen, pctx); \
> +})
> +
> +static int virtnet_add_sg(struct virtqueue *vq, bool premapped,
> + struct scatterlist *sg, unsigned int num, bool out,
> + void *data, void *ctx, gfp_t gfp)
> +{
> +   enum dma_data_direction dir;
> +   struct device *dev;
> +   int err, ret;
> +
> +   if (!premapped)
> +   return virtqueue_add_sg(vq, sg, num, out, data, ctx, gfp);
> +
> +   dir = out ? DMA_TO_DEVICE : DMA_FROM_DEVICE;
> +   dev = virtqueue_dma_dev(vq);
> +
> +   ret = dma_map_sg_attrs(dev, sg, num, dir, 0);
> +   if (ret != num)
> +   goto err;
> +
> +   err = virtqueue_add_sg(vq, sg, num, out, data, ctx, gfp);
> +   if (err < 0)
> +   

Re: [PATCH vhost v10 05/10] virtio_ring: split-detach: support return dma info to driver

2023-06-27 Thread Jason Wang
On Fri, Jun 2, 2023 at 5:22 PM Xuan Zhuo  wrote:
>
> Under the premapped mode, the driver needs to unmap the DMA address
> after receiving the buffer. The virtio core records the DMA address,
> so the driver needs a way to get the dma info from the virtio core.

A second thought, can we simply offload the tracking to the driver
itself? This looks like the way many other modern NIC drivers work.

In premapped mode, the DMA address is in fact told by the driver
itself, so it should have sufficient knowledge. And in some cases, the
driver wants to optimize/merge/delay the unmapping, so the DMA
addresses returned by the virtio core are not even of interest in those
cases.

Thanks
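A hedged sketch of such driver-side tracking (struct and field names are
illustrative): the driver keeps the mapping in the token it already passes
as "data", so no cursor helper is needed on the get path:

	struct vn_buf {
		void *base;
		dma_addr_t addr;
		u32 len;
		enum dma_data_direction dir;
	};

	/* add: map and remember in the driver's own record */
	vb->addr = dma_map_single(dev, vb->base, vb->len, vb->dir);
	sg_dma_address(sg) = vb->addr;
	sg_dma_len(sg) = vb->len;
	virtqueue_add_inbuf(vq, sg, 1, vb, GFP_ATOMIC);

	/* get: unmap from that record, no virtio-core lookup */
	vb = virtqueue_get_buf(vq, &len);
	if (vb)
		dma_unmap_single(dev, vb->addr, vb->len, vb->dir);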



>
> A straightforward approach is to pass an array to the virtio core when
> calling virtqueue_get_buf(). However, it is not feasible when there are
> multiple DMA addresses in the descriptor chain, and the array size is
> unknown.
>
> To solve this problem, a helper is introduced. After calling
> virtqueue_get_buf(), the driver can call the helper to
> retrieve the dma info. If the helper function returns -EAGAIN, it means
> that there are more DMA addresses to be processed, and the driver should
> call the helper function again. To keep track of the current position in
> the chain, a cursor must be passed to the helper function, which is
> initialized by virtqueue_get_buf().
>
> Some processes are done inside this helper, so this helper MUST be
> called under the premapped mode.
>
> Signed-off-by: Xuan Zhuo 
> ---
>  drivers/virtio/virtio_ring.c | 118 ---
>  include/linux/virtio.h   |  11 
>  2 files changed, 119 insertions(+), 10 deletions(-)
>
> diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c
> index dc109fbc05a5..cdc4349f6066 100644
> --- a/drivers/virtio/virtio_ring.c
> +++ b/drivers/virtio/virtio_ring.c
> @@ -754,8 +754,95 @@ static bool virtqueue_kick_prepare_split(struct 
> virtqueue *_vq)
> return needs_kick;
>  }
>
> -static void detach_buf_split(struct vring_virtqueue *vq, unsigned int head,
> -void **ctx)
> +static void detach_cursor_init_split(struct vring_virtqueue *vq,
> +struct virtqueue_detach_cursor *cursor, 
> u16 head)
> +{
> +   struct vring_desc_extra *extra;
> +
> +   extra = &vq->split.desc_extra[head];
> +
> +   /* Clear data ptr. */
> +   vq->split.desc_state[head].data = NULL;
> +
> +   cursor->head = head;
> +   cursor->done = 0;
> +
> +   if (extra->flags & VRING_DESC_F_INDIRECT) {
> +   cursor->num = extra->len / sizeof(struct vring_desc);
> +   cursor->indirect = true;
> +   cursor->pos = 0;
> +
> +   vring_unmap_one_split(vq, head);
> +
> +   extra->next = vq->free_head;
> +
> +   vq->free_head = head;
> +
> +   /* Plus final descriptor */
> +   vq->vq.num_free++;
> +
> +   } else {
> +   cursor->indirect = false;
> +   cursor->pos = head;
> +   }
> +}
> +
> +static int virtqueue_detach_split(struct virtqueue *_vq, struct 
> virtqueue_detach_cursor *cursor,
> + dma_addr_t *addr, u32 *len, enum 
> dma_data_direction *dir)
> +{
> +   struct vring_virtqueue *vq = to_vvq(_vq);
> +   __virtio16 nextflag = cpu_to_virtio16(vq->vq.vdev, VRING_DESC_F_NEXT);
> +   int rc = -EAGAIN;
> +
> +   if (unlikely(cursor->done))
> +   return -EINVAL;
> +
> +   if (!cursor->indirect) {
> +   struct vring_desc_extra *extra;
> +   unsigned int i;
> +
> +   i = cursor->pos;
> +
> +   extra = &vq->split.desc_extra[i];
> +
> +   if (vq->split.vring.desc[i].flags & nextflag) {
> +   cursor->pos = extra->next;
> +   } else {
> +   extra->next = vq->free_head;
> +   vq->free_head = cursor->head;
> +   cursor->done = true;
> +   rc = 0;
> +   }
> +
> +   *addr = extra->addr;
> +   *len = extra->len;
> +   *dir = (extra->flags & VRING_DESC_F_WRITE) ? DMA_FROM_DEVICE 
> : DMA_TO_DEVICE;
> +
> +   vq->vq.num_free++;
> +
> +   } else {
> +   struct vring_desc *indir_desc, *desc;
> +   u16 flags;
> +
> +   indir_desc = vq->split.desc_state[cursor->head].indir_desc;
> +   desc = &indir_desc[cursor->pos];
> +
> +   flags = virtio16_to_cpu(vq->vq.vdev, desc->flags);
> +   *addr = virtio64_to_cpu(vq->vq.vdev, desc->addr);
> +   *len = virtio32_to_cpu(vq->vq.vdev, desc->len);
> +   *dir = (flags & VRING_DESC_F_WRITE) ? DMA_FROM_DEVICE : 
> DMA_TO_DEVICE;
> +
> +   if (++cursor->pos == cursor->num) {
> +   kfree(indir_desc);
> +  

Re: [RFC PATCH v4 07/17] vsock: read from socket's error queue

2023-06-27 Thread Stefano Garzarella

On Tue, Jun 27, 2023 at 07:49:00AM +0300, Arseniy Krasnov wrote:



On 26.06.2023 19:08, Stefano Garzarella wrote:

On Sat, Jun 03, 2023 at 11:49:29PM +0300, Arseniy Krasnov wrote:

This adds handling of MSG_ERRQUEUE input flag in receive call. This flag
is used to read socket's error queue instead of data queue. Possible
scenario of error queue usage is receiving completions for transmission
with MSG_ZEROCOPY flag.

Signed-off-by: Arseniy Krasnov 
---
include/linux/socket.h   | 1 +
net/vmw_vsock/af_vsock.c | 5 +
2 files changed, 6 insertions(+)

diff --git a/include/linux/socket.h b/include/linux/socket.h
index bd1cc3238851..d79efd026880 100644
--- a/include/linux/socket.h
+++ b/include/linux/socket.h
@@ -382,6 +382,7 @@ struct ucred {
#define SOL_MPTCP    284
#define SOL_MCTP    285
#define SOL_SMC    286
+#define SOL_VSOCK    287


Maybe this change should go in another patch where we describe that
we need to support setsockopt()


Ok, you mean the patch which handles the SO_ZEROCOPY option in af_vsock.c
as Bobby suggested? No problem, but in this case there will be no user for
this define there - this option (SO_ZEROCOPY) uses the SOL_SOCKET level,
not SOL_VSOCK.


Got it, so it is fine to leave it here.

Just mention that we are defining SOL_VSOCK in the commit description.

Thanks,
Stefano
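For context, completions for MSG_ZEROCOPY transmissions are then drained
from the error queue the same way as for AF_INET sockets (hedged userspace
sketch following Documentation/networking/msg_zerocopy.rst; needs
<sys/socket.h> and <linux/errqueue.h>):

	char control[100];
	struct msghdr msg = { 0 };

	msg.msg_control = control;
	msg.msg_controllen = sizeof(control);

	if (recvmsg(fd, &msg, MSG_ERRQUEUE) != -1) {
		struct cmsghdr *cm = CMSG_FIRSTHDR(&msg);
		struct sock_extended_err *serr = (void *)CMSG_DATA(cm);

		/* serr->ee_info .. serr->ee_data: completed zerocopy range */
	}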



Re: [PATCH vhost v10 04/10] virtio_ring: packed: support add premapped buf

2023-06-27 Thread Jason Wang
On Fri, Jun 2, 2023 at 5:22 PM Xuan Zhuo  wrote:
>
> If the vq is in premapped mode, use sg_dma_address() directly.
>
> Signed-off-by: Xuan Zhuo 
> ---
>  drivers/virtio/virtio_ring.c | 36 ++--
>  1 file changed, 26 insertions(+), 10 deletions(-)
>
> diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c
> index 18212c3e056b..dc109fbc05a5 100644
> --- a/drivers/virtio/virtio_ring.c
> +++ b/drivers/virtio/virtio_ring.c
> @@ -1299,9 +1299,13 @@ static int virtqueue_add_indirect_packed(struct 
> vring_virtqueue *vq,
>
> for (n = 0; n < out_sgs + in_sgs; n++) {
> for (sg = sgs[n]; sg; sg = sg_next(sg)) {
> -   if (vring_map_one_sg(vq, sg, n < out_sgs ?
> -DMA_TO_DEVICE : DMA_FROM_DEVICE, 
> &addr))
> -   goto unmap_release;
> +   if (vq->premapped) {
> +   addr = sg_dma_address(sg);
> +   } else {
> +   if (vring_map_one_sg(vq, sg, n < out_sgs ?
> +DMA_TO_DEVICE : 
> DMA_FROM_DEVICE, &addr))
> +   goto unmap_release;
> +   }
>
> desc[i].flags = cpu_to_le16(n < out_sgs ?
> 0 : VRING_DESC_F_WRITE);
> @@ -1369,10 +1373,12 @@ static int virtqueue_add_indirect_packed(struct 
> vring_virtqueue *vq,
> return 0;
>
>  unmap_release:
> -   err_idx = i;
> +   if (!vq->premapped) {
> +   err_idx = i;
>
> -   for (i = 0; i < err_idx; i++)
> -   vring_unmap_desc_packed(vq, &desc[i]);
> +   for (i = 0; i < err_idx; i++)
> +   vring_unmap_desc_packed(vq, &desc[i]);
> +   }
>
> kfree(desc);
>
> @@ -1447,9 +1453,13 @@ static inline int virtqueue_add_packed(struct 
> virtqueue *_vq,
> for (sg = sgs[n]; sg; sg = sg_next(sg)) {
> dma_addr_t addr;
>
> -   if (vring_map_one_sg(vq, sg, n < out_sgs ?
> -DMA_TO_DEVICE : DMA_FROM_DEVICE, 
> &addr))
> -   goto unmap_release;
> +   if (vq->premapped) {
> +   addr = sg_dma_address(sg);
> +   } else {
> +   if (vring_map_one_sg(vq, sg, n < out_sgs ?
> +DMA_TO_DEVICE : 
> DMA_FROM_DEVICE, &addr))
> +   goto unmap_release;
> +   }
>
> flags = cpu_to_le16(vq->packed.avail_used_flags |
> (++c == total_sg ? 0 : VRING_DESC_F_NEXT) 
> |
> @@ -1512,11 +1522,17 @@ static inline int virtqueue_add_packed(struct 
> virtqueue *_vq,
> return 0;
>
>  unmap_release:
> +   vq->packed.avail_used_flags = avail_used_flags;
> +
> +   if (vq->premapped) {

Similar to the split path, I think we can't hit vq->premapped here.

Thanks


> +   END_USE(vq);
> +   return -EIO;
> +   }
> +
> err_idx = i;
> i = head;
> curr = vq->free_head;
>
> -   vq->packed.avail_used_flags = avail_used_flags;
>
> for (n = 0; n < total_sg; n++) {
> if (i == err_idx)
> --
> 2.32.0.3.g01195cf9f
>


Re: [PATCH vhost v10 03/10] virtio_ring: split: support add premapped buf

2023-06-27 Thread Jason Wang
On Fri, Jun 2, 2023 at 5:22 PM Xuan Zhuo  wrote:
>
> If the vq is in premapped mode, use sg_dma_address() directly.
>
> Signed-off-by: Xuan Zhuo 
> ---
>  drivers/virtio/virtio_ring.c | 46 ++--
>  1 file changed, 28 insertions(+), 18 deletions(-)
>
> diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c
> index 2afdfb9e3e30..18212c3e056b 100644
> --- a/drivers/virtio/virtio_ring.c
> +++ b/drivers/virtio/virtio_ring.c
> @@ -598,8 +598,12 @@ static inline int virtqueue_add_split(struct virtqueue 
> *_vq,
> for (sg = sgs[n]; sg; sg = sg_next(sg)) {
> dma_addr_t addr;
>
> -   if (vring_map_one_sg(vq, sg, DMA_TO_DEVICE, &addr))
> -   goto unmap_release;
> +   if (vq->premapped) {
> +   addr = sg_dma_address(sg);
> +   } else {
> +   if (vring_map_one_sg(vq, sg, DMA_TO_DEVICE, 
> &addr))
> +   goto unmap_release;
> +   }

Btw, I wonder whether or not it would be simpler to implement the
vq->premapped check inside vring_map_one_sg(), assuming the
!use_dma_api check is done there as well.

>
> prev = i;
> /* Note that we trust indirect descriptor
> @@ -614,8 +618,12 @@ static inline int virtqueue_add_split(struct virtqueue 
> *_vq,
> for (sg = sgs[n]; sg; sg = sg_next(sg)) {
> dma_addr_t addr;
>
> -   if (vring_map_one_sg(vq, sg, DMA_FROM_DEVICE, &addr))
> -   goto unmap_release;
> +   if (vq->premapped) {
> +   addr = sg_dma_address(sg);
> +   } else {
> +   if (vring_map_one_sg(vq, sg, DMA_FROM_DEVICE, 
> &addr))
> +   goto unmap_release;
> +   }
>
> prev = i;
> /* Note that we trust indirect descriptor
> @@ -689,21 +697,23 @@ static inline int virtqueue_add_split(struct virtqueue 
> *_vq,
> return 0;
>
>  unmap_release:
> -   err_idx = i;
> +   if (!vq->premapped) {

Can vq->premapped be true here? The label is named "unmap_release",
which implies a "map" beforehand, which seems not to be the case for
premapping.

Thanks


> +   err_idx = i;
>
> -   if (indirect)
> -   i = 0;
> -   else
> -   i = head;
> -
> -   for (n = 0; n < total_sg; n++) {
> -   if (i == err_idx)
> -   break;
> -   if (indirect) {
> -   vring_unmap_one_split_indirect(vq, &desc[i]);
> -   i = virtio16_to_cpu(_vq->vdev, desc[i].next);
> -   } else
> -   i = vring_unmap_one_split(vq, i);
> +   if (indirect)
> +   i = 0;
> +   else
> +   i = head;
> +
> +   for (n = 0; n < total_sg; n++) {
> +   if (i == err_idx)
> +   break;
> +   if (indirect) {
> +   vring_unmap_one_split_indirect(vq, &desc[i]);
> +   i = virtio16_to_cpu(_vq->vdev, desc[i].next);
> +   } else
> +   i = vring_unmap_one_split(vq, i);
> +   }
> }
>
> if (indirect)
> --
> 2.32.0.3.g01195cf9f
>


Re: [PATCH vhost v10 02/10] virtio_ring: introduce virtqueue_set_premapped()

2023-06-27 Thread Jason Wang
On Fri, Jun 2, 2023 at 5:22 PM Xuan Zhuo  wrote:
>
> This helper allows the driver to change the dma mode to premapped mode.
> Under the premapped mode, the virtio core does not do dma mapping
> internally.
>
> This just works when the use_dma_api is true. If the use_dma_api is false,
> the dma operations do not go through the DMA APIs, which is not the
> standard way of the linux kernel.
>
> Signed-off-by: Xuan Zhuo 
> ---
>  drivers/virtio/virtio_ring.c | 40 
>  include/linux/virtio.h   |  2 ++
>  2 files changed, 42 insertions(+)
>
> diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c
> index 72ed07a604d4..2afdfb9e3e30 100644
> --- a/drivers/virtio/virtio_ring.c
> +++ b/drivers/virtio/virtio_ring.c
> @@ -172,6 +172,9 @@ struct vring_virtqueue {
> /* Host publishes avail event idx */
> bool event;
>
> +   /* Do DMA mapping by driver */
> +   bool premapped;
> +
> /* Head of free buffer list. */
> unsigned int free_head;
> /* Number we've added since last sync. */
> @@ -2059,6 +2062,7 @@ static struct virtqueue *vring_create_virtqueue_packed(
> vq->packed_ring = true;
> vq->dma_dev = dma_dev;
> vq->use_dma_api = vring_use_dma_api(vdev);
> +   vq->premapped = false;
>
> vq->indirect = virtio_has_feature(vdev, VIRTIO_RING_F_INDIRECT_DESC) 
> &&
> !context;
> @@ -2548,6 +2552,7 @@ static struct virtqueue *__vring_new_virtqueue(unsigned 
> int index,
>  #endif
> vq->dma_dev = dma_dev;
> vq->use_dma_api = vring_use_dma_api(vdev);
> +   vq->premapped = false;
>
> vq->indirect = virtio_has_feature(vdev, VIRTIO_RING_F_INDIRECT_DESC) 
> &&
> !context;
> @@ -2691,6 +2696,41 @@ int virtqueue_resize(struct virtqueue *_vq, u32 num,
>  }
>  EXPORT_SYMBOL_GPL(virtqueue_resize);
>
> +/**
> + * virtqueue_set_premapped - set the vring premapped mode
> + * @_vq: the struct virtqueue we're talking about.
> + *
> + * Enable the premapped mode of the vq.
> + *
> + * The vring in premapped mode does not do dma internally, so the driver must
> + * do dma mapping in advance. The driver must pass the dma_address through
> + * dma_address of scatterlist. When the driver got a used buffer from
> + * the vring, it has to unmap the dma address. So the driver must call
> + * virtqueue_get_buf_premapped()/virtqueue_detach_unused_buf_premapped().
> + *
> + * This must be called before adding any buf to vring.

And any old buffer should be detached?

> + * So this should be called immediately after init vq or vq reset.

Any way to detect and warn in this case? (not a must if it's too
expensive to do the check)

> + *
> + * Caller must ensure we don't call this with other virtqueue operations
> + * at the same time (except where noted).
> + *
> + * Returns zero or a negative error.
> + * 0: success.
> + * -EINVAL: vring does not use the dma api, so we can not enable premapped 
> mode.
> + */
> +int virtqueue_set_premapped(struct virtqueue *_vq)
> +{
> +   struct vring_virtqueue *vq = to_vvq(_vq);
> +
> +   if (!vq->use_dma_api)
> +   return -EINVAL;
> +
> +   vq->premapped = true;

I guess there should be a way to disable it. Would it be useful for
the case when AF_XDP sockets were destroyed?

Thanks


> +
> +   return 0;
> +}
> +EXPORT_SYMBOL_GPL(virtqueue_set_premapped);
> +
>  /* Only available for split ring */
>  struct virtqueue *vring_new_virtqueue(unsigned int index,
>   unsigned int num,
> diff --git a/include/linux/virtio.h b/include/linux/virtio.h
> index b93238db94e3..1fc0e1023bd4 100644
> --- a/include/linux/virtio.h
> +++ b/include/linux/virtio.h
> @@ -78,6 +78,8 @@ bool virtqueue_enable_cb(struct virtqueue *vq);
>
>  unsigned virtqueue_enable_cb_prepare(struct virtqueue *vq);
>
> +int virtqueue_set_premapped(struct virtqueue *_vq);
> +
>  bool virtqueue_poll(struct virtqueue *vq, unsigned);
>
>  bool virtqueue_enable_cb_delayed(struct virtqueue *vq);
> --
> 2.32.0.3.g01195cf9f
>


Re: [PATCH vhost v10 01/10] virtio_ring: put mapping error check in vring_map_one_sg

2023-06-27 Thread Jason Wang
On Fri, Jun 2, 2023 at 5:22 PM Xuan Zhuo  wrote:
>
> This patch puts the dma addr error check in vring_map_one_sg().
>
> The benefits of doing this:
>
> 1. reduce one judgment of vq->use_dma_api.
> 2. make vring_map_one_sg simpler, without calling
>    vring_mapping_error to check the return value; this simplifies
>    subsequent code
>
> Signed-off-by: Xuan Zhuo 

Acked-by: Jason Wang 

Thanks


> ---
>  drivers/virtio/virtio_ring.c | 37 +---
>  1 file changed, 22 insertions(+), 15 deletions(-)
>
> diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c
> index c5310eaf8b46..72ed07a604d4 100644
> --- a/drivers/virtio/virtio_ring.c
> +++ b/drivers/virtio/virtio_ring.c
> @@ -355,9 +355,8 @@ static struct device *vring_dma_dev(const struct vring_virtqueue *vq)
>  }
>
>  /* Map one sg entry. */
> -static dma_addr_t vring_map_one_sg(const struct vring_virtqueue *vq,
> -  struct scatterlist *sg,
> -  enum dma_data_direction direction)
> +static int vring_map_one_sg(const struct vring_virtqueue *vq, struct scatterlist *sg,
> +   enum dma_data_direction direction, dma_addr_t *addr)
>  {
> if (!vq->use_dma_api) {
> /*
> @@ -366,7 +365,8 @@ static dma_addr_t vring_map_one_sg(const struct vring_virtqueue *vq,
>  * depending on the direction.
>  */
> kmsan_handle_dma(sg_page(sg), sg->offset, sg->length, direction);
> -   return (dma_addr_t)sg_phys(sg);
> +   *addr = (dma_addr_t)sg_phys(sg);
> +   return 0;
> }
>
> /*
> @@ -374,9 +374,14 @@ static dma_addr_t vring_map_one_sg(const struct vring_virtqueue *vq,
>  * the way it expects (we don't guarantee that the scatterlist
>  * will exist for the lifetime of the mapping).
>  */
> -   return dma_map_page(vring_dma_dev(vq),
> +   *addr = dma_map_page(vring_dma_dev(vq),
> sg_page(sg), sg->offset, sg->length,
> direction);
> +
> +   if (dma_mapping_error(vring_dma_dev(vq), *addr))
> +   return -ENOMEM;
> +
> +   return 0;
>  }
>
>  static dma_addr_t vring_map_single(const struct vring_virtqueue *vq,
> @@ -588,8 +593,9 @@ static inline int virtqueue_add_split(struct virtqueue *_vq,
>
> for (n = 0; n < out_sgs; n++) {
> for (sg = sgs[n]; sg; sg = sg_next(sg)) {
> -   dma_addr_t addr = vring_map_one_sg(vq, sg, DMA_TO_DEVICE);
> -   if (vring_mapping_error(vq, addr))
> +   dma_addr_t addr;
> +
> +   if (vring_map_one_sg(vq, sg, DMA_TO_DEVICE, &addr))
> goto unmap_release;
>
> prev = i;
> @@ -603,8 +609,9 @@ static inline int virtqueue_add_split(struct virtqueue *_vq,
> }
> for (; n < (out_sgs + in_sgs); n++) {
> for (sg = sgs[n]; sg; sg = sg_next(sg)) {
> -   dma_addr_t addr = vring_map_one_sg(vq, sg, DMA_FROM_DEVICE);
> -   if (vring_mapping_error(vq, addr))
> +   dma_addr_t addr;
> +
> +   if (vring_map_one_sg(vq, sg, DMA_FROM_DEVICE, &addr))
> goto unmap_release;
>
> prev = i;
> @@ -1279,9 +1286,8 @@ static int virtqueue_add_indirect_packed(struct vring_virtqueue *vq,
>
> for (n = 0; n < out_sgs + in_sgs; n++) {
> for (sg = sgs[n]; sg; sg = sg_next(sg)) {
> -   addr = vring_map_one_sg(vq, sg, n < out_sgs ?
> -   DMA_TO_DEVICE : DMA_FROM_DEVICE);
> -   if (vring_mapping_error(vq, addr))
> +   if (vring_map_one_sg(vq, sg, n < out_sgs ?
> +DMA_TO_DEVICE : DMA_FROM_DEVICE, &addr))
> goto unmap_release;
>
> desc[i].flags = cpu_to_le16(n < out_sgs ?
> @@ -1426,9 +1432,10 @@ static inline int virtqueue_add_packed(struct virtqueue *_vq,
> c = 0;
> for (n = 0; n < out_sgs + in_sgs; n++) {
> for (sg = sgs[n]; sg; sg = sg_next(sg)) {
> -   dma_addr_t addr = vring_map_one_sg(vq, sg, n < out_sgs ?
> -   DMA_TO_DEVICE : DMA_FROM_DEVICE);
> -   if (vring_mapping_error(vq, addr))
> +   dma_addr_t addr;
> +
> +   if (vring_map_one_sg(vq, sg, n < out_sgs ?
> +DMA_TO_DEVICE : DMA_FROM_DEVICE, &addr))
> goto unmap_release;
>
> flags = cpu_to_le16(vq->packed.avail_used_flags |
> --
> 2.32.0.3.g01195cf9f
>
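For readers skimming the archive, the net effect at every call site is
(restating the diff above, nothing new):

    /* Before: two steps, and vring_mapping_error() checked vq->use_dma_api again. */
    dma_addr_t addr = vring_map_one_sg(vq, sg, DMA_TO_DEVICE);
    if (vring_mapping_error(vq, addr))
            goto unmap_release;

    /* After: mapping and error check collapse into one call. */
    dma_addr_t addr;

    if (vring_map_one_sg(vq, sg, DMA_TO_DEVICE, &addr))
            goto unmap_release;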


Re: [RFC PATCH v4 00/17] vsock: MSG_ZEROCOPY flag support

2023-06-27 Thread Stefano Garzarella

On Tue, Jun 27, 2023 at 07:55:58AM +0300, Arseniy Krasnov wrote:



On 26.06.2023 19:15, Stefano Garzarella wrote:

On Sat, Jun 03, 2023 at 11:49:22PM +0300, Arseniy Krasnov wrote:


[...]



  LET'S SPLIT PATCHSET TO MAKE REVIEW EASIER

In v3 Stefano Garzarella asked to split this patchset into several parts,
because it looks too big to review. I think in this version (v4) we can do
it in the following way:

[0001 - 0005] - this is preparation for virtio/vhost part.
[0006 - 0009] - this is preparation for AF_VSOCK part.
[0010 - 0013] - these patches allow triggering the logic from the previous
   two parts.
[0014 - rest] - updates for docs, tests, utils. This part doesn't touch
   kernel code and is not critical.


Yeah, I like this split, but I'd include 14 in the (10, 13) group.

I have reviewed most of them and I think we are well on our way :-)
I've already seen that Bobby suggested changes for v5, so I'll review
that version better.

Great work so far!


Hello Stefano!


Hi Arseniy :-)



Thanks for the review! I left some questions, but most of the comments are
clear to me. So I guess the idea of the split is that I still keep all the
patches in a single big patchset, but preserve the structure described above,
and we will do the review step by step according to that split?

Or should I split this patchset into 3 separate sets? I guess that would be
more complex to review...


If the next is still RFC, a single series is fine.

Thanks,
Stefano



Re: [RFC PATCH v4 06/17] vsock: check error queue to set EPOLLERR

2023-06-27 Thread Stefano Garzarella

On Tue, Jun 27, 2023 at 07:44:25AM +0300, Arseniy Krasnov wrote:



On 26.06.2023 19:04, Stefano Garzarella wrote:

On Sat, Jun 03, 2023 at 11:49:28PM +0300, Arseniy Krasnov wrote:

If the socket's error queue is not empty, EPOLLERR must be set. Otherwise,
a reader of the error queue won't detect data in it using the EPOLLERR bit.

Signed-off-by: Arseniy Krasnov 
---
net/vmw_vsock/af_vsock.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)


This patch looks like it can go even without this series.

Is it a fix? Should we add a fixes tag?


Yes, it is a fix, and I can exclude it from this set to reduce the number
of patches, but there is no reproducer for this without MSG_ZEROCOPY
support - at the moment this feature is the only user of the error queue
for AF_VSOCK.


Okay, so it's fine to keep it here, but please mention in the comment 
that without MSG_ZEROCOPY it can't be reproduced.


That way we know that we don't have to backport it into the stable branches.


Thanks,
Stefano



Thanks, Arseniy



Thanks,
Stefano



diff --git a/net/vmw_vsock/af_vsock.c b/net/vmw_vsock/af_vsock.c
index efb8a0937a13..45fd20c4ed50 100644
--- a/net/vmw_vsock/af_vsock.c
+++ b/net/vmw_vsock/af_vsock.c
@@ -1030,7 +1030,7 @@ static __poll_t vsock_poll(struct file *file, struct socket *sock,
poll_wait(file, sk_sleep(sk), wait);
mask = 0;

-    if (sk->sk_err)
+    if (sk->sk_err || !skb_queue_empty_lockless(&sk->sk_error_queue))
    /* Signify that there has been an error on this socket. */
    mask |= EPOLLERR;

-- 
2.25.1
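For reference, the userspace pattern this fix serves looks roughly like the
sketch below; the vsock_fd setup is assumed, and the completion flow follows
the MSG_ZEROCOPY discussion elsewhere in this digest:

    #include <poll.h>
    #include <sys/socket.h>
    #include <linux/errqueue.h>

    /* POLLERR is reported even when no events are requested. */
    struct pollfd pfd = { .fd = vsock_fd, .events = 0 };

    if (poll(&pfd, 1, -1) > 0 && (pfd.revents & POLLERR)) {
            char cbuf[CMSG_SPACE(sizeof(struct sock_extended_err))];
            struct msghdr msg = {
                    .msg_control = cbuf,
                    .msg_controllen = sizeof(cbuf),
            };

            /* Zerocopy completion notifications sit on the error queue. */
            recvmsg(vsock_fd, &msg, MSG_ERRQUEUE);
    }

Without the skb_queue_empty_lockless() check added here, poll() can block
even though completions are already queued, because sk_err alone stays zero.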









Re: [RFC PATCH v4 05/17] vsock/virtio: MSG_ZEROCOPY flag support

2023-06-27 Thread Stefano Garzarella

On Tue, Jun 27, 2023 at 07:41:51AM +0300, Arseniy Krasnov wrote:



On 26.06.2023 19:03, Stefano Garzarella wrote:

On Sat, Jun 03, 2023 at 11:49:27PM +0300, Arseniy Krasnov wrote:

This adds handling of the MSG_ZEROCOPY flag on the transmission path: if
this flag is set and zerocopy transmission is possible, then a non-linear
skb will be created and filled with the pages of the user's buffer. The
pages of the user's buffer are pinned in memory by 'get_user_pages()'.

Signed-off-by: Arseniy Krasnov 
---
net/vmw_vsock/virtio_transport_common.c | 270 ++--
1 file changed, 208 insertions(+), 62 deletions(-)

diff --git a/net/vmw_vsock/virtio_transport_common.c b/net/vmw_vsock/virtio_transport_common.c
index 0de562c1dc4b..f1ec38c72db7 100644
--- a/net/vmw_vsock/virtio_transport_common.c
+++ b/net/vmw_vsock/virtio_transport_common.c
@@ -37,27 +37,100 @@ virtio_transport_get_ops(struct vsock_sock *vsk)
return container_of(t, struct virtio_transport, transport);
}

-/* Returns a new packet on success, otherwise returns NULL.
- *
- * If NULL is returned, errp is set to a negative errno.
- */
-static struct sk_buff *
-virtio_transport_alloc_skb(struct virtio_vsock_pkt_info *info,
-   size_t len,
-   u32 src_cid,
-   u32 src_port,
-   u32 dst_cid,
-   u32 dst_port)
-{
-    const size_t skb_len = VIRTIO_VSOCK_SKB_HEADROOM + len;
-    struct virtio_vsock_hdr *hdr;
-    struct sk_buff *skb;
+static bool virtio_transport_can_zcopy(struct virtio_vsock_pkt_info *info,
+   size_t max_to_send)
+{
+    struct iov_iter *iov_iter;
+
+    if (!info->msg)
+    return false;
+
+    iov_iter = &info->msg->msg_iter;
+
+    /* Data is simple buffer. */
+    if (iter_is_ubuf(iov_iter))
+    return true;
+
+    if (!iter_is_iovec(iov_iter))
+    return false;
+
+    if (iov_iter->iov_offset)
+    return false;
+
+    /* We can't send whole iov. */
+    if (iov_iter->count > max_to_send)
+    return false;
+
+    return true;
+}
+
+static int virtio_transport_init_zcopy_skb(struct vsock_sock *vsk,
+   struct sk_buff *skb,
+   struct msghdr *msg,
+   bool zerocopy)
+{
+    struct ubuf_info *uarg;
+
+    if (msg->msg_ubuf) {
+    uarg = msg->msg_ubuf;
+    net_zcopy_get(uarg);
+    } else {
+    struct iov_iter *iter = &msg->msg_iter;
+    struct ubuf_info_msgzc *uarg_zc;
+    int len;
+
+    /* Only ITER_IOVEC or ITER_UBUF are allowed and
+ * checked before.
+ */
+    if (iter_is_iovec(iter))
+    len = iov_length(iter->__iov, iter->nr_segs);
+    else
+    len = iter->count;
+
+    uarg = msg_zerocopy_realloc(sk_vsock(vsk),
+    len,
+    NULL);
+
+    if (!uarg)
+    return -1;
+
+    uarg_zc = uarg_to_msgzc(uarg);
+    uarg_zc->zerocopy = zerocopy ? 1 : 0;
+    }
+
+    skb_zcopy_init(skb, uarg);
+
+    return 0;
+}
+
+static int virtio_transport_fill_linear_skb(struct sk_buff *skb,
+    struct vsock_sock *vsk,


`vsk` seems unused


+    struct virtio_vsock_pkt_info *info,
+    size_t len)
+{
void *payload;
int err;

-    skb = virtio_vsock_alloc_skb(skb_len, GFP_KERNEL);
-    if (!skb)
-    return NULL;
+    payload = skb_put(skb, len);
+    err = memcpy_from_msg(payload, info->msg, len);
+    if (err)
+    return -1;
+
+    if (msg_data_left(info->msg))
+    return 0;
+
+    return 0;
+}
+
+static void virtio_transport_init_hdr(struct sk_buff *skb,
+  struct virtio_vsock_pkt_info *info,
+  u32 src_cid,
+  u32 src_port,
+  u32 dst_cid,
+  u32 dst_port,
+  size_t len)
+{
+    struct virtio_vsock_hdr *hdr;

hdr = virtio_vsock_hdr(skb);
hdr->type    = cpu_to_le16(info->type);
@@ -68,42 +141,6 @@ virtio_transport_alloc_skb(struct virtio_vsock_pkt_info *info,
hdr->dst_port    = cpu_to_le32(dst_port);
hdr->flags    = cpu_to_le32(info->flags);
hdr->len    = cpu_to_le32(len);
-
-    if (info->msg && len > 0) {
-    payload = skb_put(skb, len);
-    err = memcpy_from_msg(payload, info->msg, len);
-    if (err)
-    goto out;
-
-    if (msg_data_left(info->msg) == 0 &&
-    info->type == VIRTIO_VSOCK_TYPE_SEQPACKET) {
-    hdr->flags |= cpu_to_le32(VIRTIO_VSOCK_SEQ_EOM);
-
-    if (info->msg->msg_flags & MSG_EOR)
-    hdr->flags |= cpu_to_le32(VIRTIO_VSOCK_SEQ_EOR);
-    }
-    }
-
-    if (info->reply)
-    virtio_vsock_skb_set_reply(skb);
-
-    trace_virtio_transport_alloc_pkt(src_cid, src_port,
- dst_cid, dst_port,
- len,
- info->type,
- info->op,
- info->flags);
-
-    
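To make the transmission path concrete, here is a send-side sketch under the
assumption that the series wires SO_ZEROCOPY up for AF_VSOCK (fd, buf and len
are illustrative):

    #include <sys/socket.h>

    int one = 1;

    /* Opt in once per socket... */
    setsockopt(fd, SOL_SOCKET, SO_ZEROCOPY, &one, sizeof(one));

    /* ...then request zerocopy per send: the kernel pins the user pages
     * with get_user_pages() and builds a non-linear skb, as above. */
    send(fd, buf, len, MSG_ZEROCOPY);

    /* buf must not be reused until the completion arrives on the
     * error queue (see the EPOLLERR thread in this digest). */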

Re: [RFC PATCH v4 03/17] vsock/virtio: support to send non-linear skb

2023-06-27 Thread Stefano Garzarella

On Tue, Jun 27, 2023 at 07:39:41AM +0300, Arseniy Krasnov wrote:



On 26.06.2023 18:36, Stefano Garzarella wrote:

On Sat, Jun 03, 2023 at 11:49:25PM +0300, Arseniy Krasnov wrote:

For a non-linear skb, use its pages from the fragment array as buffers in
the virtio tx queue. These pages are already pinned by 'get_user_pages()'
during the creation of such an skb.

Signed-off-by: Arseniy Krasnov 
---
net/vmw_vsock/virtio_transport.c | 37 ++--
1 file changed, 31 insertions(+), 6 deletions(-)

diff --git a/net/vmw_vsock/virtio_transport.c b/net/vmw_vsock/virtio_transport.c
index e95df847176b..6053d8341091 100644
--- a/net/vmw_vsock/virtio_transport.c
+++ b/net/vmw_vsock/virtio_transport.c
@@ -100,7 +100,9 @@ virtio_transport_send_pkt_work(struct work_struct *work)
vq = vsock->vqs[VSOCK_VQ_TX];

for (;;) {
-    struct scatterlist hdr, buf, *sgs[2];
+    /* +1 is for packet header. */
+    struct scatterlist *sgs[MAX_SKB_FRAGS + 1];
+    struct scatterlist bufs[MAX_SKB_FRAGS + 1];
    int ret, in_sg = 0, out_sg = 0;
    struct sk_buff *skb;
    bool reply;
@@ -111,12 +113,35 @@ virtio_transport_send_pkt_work(struct work_struct *work)

    virtio_transport_deliver_tap_pkt(skb);
    reply = virtio_vsock_skb_reply(skb);
+    sg_init_one(&bufs[0], virtio_vsock_hdr(skb), sizeof(*virtio_vsock_hdr(skb)));
+    sgs[out_sg++] = &bufs[0];


Can we use out_sg also to index bufs (here and in the rest of the code)?

E.g.

    sg_init_one(&bufs[out_sg], ...)
    sgs[out_sg] = &bufs[out_sg];
    ++out_sg;

    ...
    if (skb->len > 0) {
    sg_init_one(&bufs[out_sg], skb->data, skb->len);
    sgs[out_sg] = &bufs[out_sg];
    ++out_sg;
    }

    etc...


+


For readability, I would move the smaller branch above:

    if (!skb_is_nonlinear(skb)) {
    // small block
    ...
    } else {
    // big block
    ...
    }


+    if (skb_is_nonlinear(skb)) {
+    struct skb_shared_info *si;
+    int i;
+
+    si = skb_shinfo(skb);
+
+    for (i = 0; i < si->nr_frags; i++) {
+    skb_frag_t *skb_frag = &si->frags[i];
+    void *va = page_to_virt(skb_frag->bv_page);
+
+    /* We will use 'page_to_virt()' for userspace page here,
+ * because virtio layer will call 'virt_to_phys()' later
+ * to fill buffer descriptor. We don't touch memory at
+ * "virtual" address of this page.
+ */
+    sg_init_one(&bufs[i + 1],
+    va + skb_frag->bv_offset,
+    skb_frag->bv_len);
+    sgs[out_sg++] = &bufs[i + 1];
+    }
+    } else {
+    if (skb->len > 0) {


Should we do the same check (skb->len > 0) for nonlinear skb as well?
Or do the nonlinear ones necessarily have len > 0?


Yes, a non-linear skb always has 'data_len' > 0, i.e. such skbs always have
some data in them.


Okay, makes sense ;-)

Thanks,
Stefano



Thanks, Arseniy




+    sg_init_one(&bufs[1], skb->data, skb->len);
+    sgs[out_sg++] = &bufs[1];
+    }


   ^
Blank line that we can remove.

Stefano


-    sg_init_one(&hdr, virtio_vsock_hdr(skb), sizeof(*virtio_vsock_hdr(skb)));
-    sgs[out_sg++] = &hdr;
-    if (skb->len > 0) {
-    sg_init_one(&buf, skb->data, skb->len);
-    sgs[out_sg++] = &buf;
    }

    ret = virtqueue_add_sgs(vq, sgs, out_sg, in_sg, skb, GFP_KERNEL);
-- 
2.25.1
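Putting both review suggestions together (index bufs[] by out_sg, smaller
branch first), the loop body would look roughly like this; a sketch of the
feedback above, not the final patch:

    sg_init_one(&bufs[out_sg], virtio_vsock_hdr(skb),
                sizeof(*virtio_vsock_hdr(skb)));
    sgs[out_sg] = &bufs[out_sg];
    ++out_sg;

    if (!skb_is_nonlinear(skb)) {
            if (skb->len > 0) {
                    sg_init_one(&bufs[out_sg], skb->data, skb->len);
                    sgs[out_sg] = &bufs[out_sg];
                    ++out_sg;
            }
    } else {
            struct skb_shared_info *si = skb_shinfo(skb);
            int i;

            for (i = 0; i < si->nr_frags; i++) {
                    skb_frag_t *f = &si->frags[i];
                    void *va = page_to_virt(f->bv_page);

                    /* See the page_to_virt() comment in the patch. */
                    sg_init_one(&bufs[out_sg], va + f->bv_offset, f->bv_len);
                    sgs[out_sg] = &bufs[out_sg];
                    ++out_sg;
            }
    }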









Re: [RFC PATCH v1 2/4] virtio/vsock: support MSG_PEEK for SOCK_SEQPACKET

2023-06-27 Thread Stefano Garzarella

On Tue, Jun 27, 2023 at 07:34:29AM +0300, Arseniy Krasnov wrote:



On 26.06.2023 19:28, Stefano Garzarella wrote:

On Sun, Jun 18, 2023 at 09:24:49AM +0300, Arseniy Krasnov wrote:

This adds support for the MSG_PEEK flag for the SOCK_SEQPACKET socket type.
The difference from SOCK_STREAM is that this callback returns either the
length of the message or an error.

Signed-off-by: Arseniy Krasnov 
---
net/vmw_vsock/virtio_transport_common.c | 63 +++--
1 file changed, 60 insertions(+), 3 deletions(-)

diff --git a/net/vmw_vsock/virtio_transport_common.c b/net/vmw_vsock/virtio_transport_common.c
index 2ee40574c339..352d042b130b 100644
--- a/net/vmw_vsock/virtio_transport_common.c
+++ b/net/vmw_vsock/virtio_transport_common.c
@@ -460,6 +460,63 @@ virtio_transport_stream_do_dequeue(struct vsock_sock *vsk,
return err;
}

+static ssize_t
+virtio_transport_seqpacket_do_peek(struct vsock_sock *vsk,
+   struct msghdr *msg)
+{
+    struct virtio_vsock_sock *vvs = vsk->trans;
+    struct sk_buff *skb;
+    size_t total, len;
+
+    spin_lock_bh(&vvs->rx_lock);
+
+    if (!vvs->msg_count) {
+    spin_unlock_bh(&vvs->rx_lock);
+    return 0;
+    }
+
+    total = 0;
+    len = msg_data_left(msg);
+
+    skb_queue_walk(&vvs->rx_queue, skb) {
+    struct virtio_vsock_hdr *hdr;
+
+    if (total < len) {
+    size_t bytes;
+    int err;
+
+    bytes = len - total;
+    if (bytes > skb->len)
+    bytes = skb->len;
+
+    spin_unlock_bh(&vvs->rx_lock);
+
+    /* sk_lock is held by caller so no one else can dequeue.
+ * Unlock rx_lock since memcpy_to_msg() may sleep.
+ */
+    err = memcpy_to_msg(msg, skb->data, bytes);
+    if (err)
+    return err;
+
+    spin_lock_bh(&vvs->rx_lock);
+    }
+
+    total += skb->len;
+    hdr = virtio_vsock_hdr(skb);
+
+    if (le32_to_cpu(hdr->flags) & VIRTIO_VSOCK_SEQ_EOM) {
+    if (le32_to_cpu(hdr->flags) & VIRTIO_VSOCK_SEQ_EOR)
+    msg->msg_flags |= MSG_EOR;
+
+    break;
+    }
+    }
+
+    spin_unlock_bh(&vvs->rx_lock);
+
+    return total;


Should we return the minimum between total and len?


I guess not, because the seqpacket dequeue callback always returns the length
of the message; then, in af_vsock.c, we return either the number of bytes read
or the length of the message, depending on the MSG_TRUNC flag.


Right! We should always return the total length of the packet.

Thanks,
Stefano



Thanks, Arseniy



Thanks,
Stefano


+}
+
static int virtio_transport_seqpacket_do_dequeue(struct vsock_sock *vsk,
 struct msghdr *msg,
 int flags)
@@ -554,9 +611,9 @@ virtio_transport_seqpacket_dequeue(struct vsock_sock *vsk,
   int flags)
{
if (flags & MSG_PEEK)
-    return -EOPNOTSUPP;
-
-    return virtio_transport_seqpacket_do_dequeue(vsk, msg, flags);
+    return virtio_transport_seqpacket_do_peek(vsk, msg);
+    else
+    return virtio_transport_seqpacket_do_dequeue(vsk, msg, flags);
}
EXPORT_SYMBOL_GPL(virtio_transport_seqpacket_dequeue);

-- 
2.25.1






