Re: [PATCH V2 2/5] vDPA/ifcvf: get_driver_features from virtio registers

2023-05-23 Thread Jason Wang
On Mon, May 8, 2023 at 6:05 PM Zhu Lingshan  wrote:
>
> This commit implements a new function ifcvf_get_driver_features(),
> which reads driver_features from the virtio registers.
>
> To be less ambiguous, ifcvf_set_features() is renamed to
> ifcvf_set_driver_features(), and ifcvf_get_features()
> is renamed to ifcvf_get_dev_features() which returns
> the provisioned vDPA device features.
>
> Signed-off-by: Zhu Lingshan 

Acked-by: Jason Wang 

Thanks

> ---
>  drivers/vdpa/ifcvf/ifcvf_base.c | 38 +
>  drivers/vdpa/ifcvf/ifcvf_base.h |  5 +++--
>  drivers/vdpa/ifcvf/ifcvf_main.c |  9 +---
>  3 files changed, 29 insertions(+), 23 deletions(-)
>
> diff --git a/drivers/vdpa/ifcvf/ifcvf_base.c b/drivers/vdpa/ifcvf/ifcvf_base.c
> index 6c5650f73007..546e923bcd16 100644
> --- a/drivers/vdpa/ifcvf/ifcvf_base.c
> +++ b/drivers/vdpa/ifcvf/ifcvf_base.c
> @@ -204,11 +204,29 @@ u64 ifcvf_get_hw_features(struct ifcvf_hw *hw)
> return features;
>  }
>
> -u64 ifcvf_get_features(struct ifcvf_hw *hw)
> +/* return provisioned vDPA dev features */
> +u64 ifcvf_get_dev_features(struct ifcvf_hw *hw)
>  {
> return hw->dev_features;
>  }
>
> +u64 ifcvf_get_driver_features(struct ifcvf_hw *hw)
> +{
> +   struct virtio_pci_common_cfg __iomem *cfg = hw->common_cfg;
> +   u32 features_lo, features_hi;
> +   u64 features;
> +
> +   vp_iowrite32(0, &cfg->device_feature_select);
> +   features_lo = vp_ioread32(&cfg->guest_feature);
> +
> +   vp_iowrite32(1, &cfg->device_feature_select);
> +   features_hi = vp_ioread32(&cfg->guest_feature);
> +
> +   features = ((u64)features_hi << 32) | features_lo;
> +
> +   return features;
> +}
> +
>  int ifcvf_verify_min_features(struct ifcvf_hw *hw, u64 features)
>  {
> if (!(features & BIT_ULL(VIRTIO_F_ACCESS_PLATFORM)) && features) {
> @@ -275,7 +293,7 @@ void ifcvf_write_dev_config(struct ifcvf_hw *hw, u64 
> offset,
> vp_iowrite8(*p++, hw->dev_cfg + offset + i);
>  }
>
> -static void ifcvf_set_features(struct ifcvf_hw *hw, u64 features)
> +void ifcvf_set_driver_features(struct ifcvf_hw *hw, u64 features)
>  {
> struct virtio_pci_common_cfg __iomem *cfg = hw->common_cfg;
>
> @@ -286,19 +304,6 @@ static void ifcvf_set_features(struct ifcvf_hw *hw, u64 
> features)
> vp_iowrite32(features >> 32, &cfg->guest_feature);
>  }
>
> -static int ifcvf_config_features(struct ifcvf_hw *hw)
> -{
> -   ifcvf_set_features(hw, hw->req_features);
> -   ifcvf_add_status(hw, VIRTIO_CONFIG_S_FEATURES_OK);
> -
> -   if (!(ifcvf_get_status(hw) & VIRTIO_CONFIG_S_FEATURES_OK)) {
> -   IFCVF_ERR(hw->pdev, "Failed to set FEATURES_OK status\n");
> -   return -EIO;
> -   }
> -
> -   return 0;
> -}
> -
>  u16 ifcvf_get_vq_state(struct ifcvf_hw *hw, u16 qid)
>  {
> struct ifcvf_lm_cfg __iomem *ifcvf_lm;
> @@ -387,9 +392,6 @@ int ifcvf_start_hw(struct ifcvf_hw *hw)
> ifcvf_add_status(hw, VIRTIO_CONFIG_S_ACKNOWLEDGE);
> ifcvf_add_status(hw, VIRTIO_CONFIG_S_DRIVER);
>
> -   if (ifcvf_config_features(hw) < 0)
> -   return -EINVAL;
> -
> ifcvf_add_status(hw, VIRTIO_CONFIG_S_DRIVER_OK);
>
> return 0;
> diff --git a/drivers/vdpa/ifcvf/ifcvf_base.h b/drivers/vdpa/ifcvf/ifcvf_base.h
> index d545a9411143..cb19196c3ece 100644
> --- a/drivers/vdpa/ifcvf/ifcvf_base.h
> +++ b/drivers/vdpa/ifcvf/ifcvf_base.h
> @@ -69,7 +69,6 @@ struct ifcvf_hw {
> phys_addr_t notify_base_pa;
> u32 notify_off_multiplier;
> u32 dev_type;
> -   u64 req_features;
> u64 hw_features;
> /* provisioned device features */
> u64 dev_features;
> @@ -122,7 +121,7 @@ u8 ifcvf_get_status(struct ifcvf_hw *hw);
>  void ifcvf_set_status(struct ifcvf_hw *hw, u8 status);
>  void io_write64_twopart(u64 val, u32 *lo, u32 *hi);
>  void ifcvf_reset(struct ifcvf_hw *hw);
> -u64 ifcvf_get_features(struct ifcvf_hw *hw);
> +u64 ifcvf_get_dev_features(struct ifcvf_hw *hw);
>  u64 ifcvf_get_hw_features(struct ifcvf_hw *hw);
>  int ifcvf_verify_min_features(struct ifcvf_hw *hw, u64 features);
>  u16 ifcvf_get_vq_state(struct ifcvf_hw *hw, u16 qid);
> @@ -137,4 +136,6 @@ int ifcvf_set_vq_address(struct ifcvf_hw *hw, u16 qid, 
> u64 desc_area,
>  u64 driver_area, u64 device_area);
>  bool ifcvf_get_vq_ready(struct ifcvf_hw *hw, u16 qid);
>  void ifcvf_set_vq_ready(struct ifcvf_hw *hw, u16 qid, bool ready);
> +void ifcvf_set_driver_features(struct ifcvf_hw *hw, u64 features);
> +u64 ifcvf_get_driver_features(struct ifcvf_hw *hw);
>  #endif /* _IFCVF_H_ */
> diff --git a/drivers/vdpa/ifcvf/ifcvf_main.c b/drivers/vdpa/ifcvf/ifcvf_main.c
> index 1357c67014ab..4588484bd53d 100644
> --- a/drivers/vdpa/ifcvf/ifcvf_main.c
> +++ b/drivers/vdpa/ifcvf/ifcvf_main.c
> @@ -410,7 +410,7 @@ static u64 ifcvf_vdpa_get_device_features(struct 
> vdpa_device *vdpa_dev)
> u64 features;
>
> if (type ==

Re: [PATCH V2 1/5] vDPA/ifcvf: virt queue ops take immediate actions

2023-05-23 Thread Jason Wang
On Mon, May 8, 2023 at 6:05 PM Zhu Lingshan  wrote:
>
> In this commit, the virtqueue operations set_vq_num(),
> set_vq_address(), set_vq_ready() and get_vq_ready() access
> PCI registers directly so that they take effect immediately.
>
> Signed-off-by: Zhu Lingshan 

Acked-by: Jason Wang 

Thanks

> ---
>  drivers/vdpa/ifcvf/ifcvf_base.c | 58 -
>  drivers/vdpa/ifcvf/ifcvf_base.h | 10 +++---
>  drivers/vdpa/ifcvf/ifcvf_main.c | 16 +++--
>  3 files changed, 45 insertions(+), 39 deletions(-)
>
> diff --git a/drivers/vdpa/ifcvf/ifcvf_base.c b/drivers/vdpa/ifcvf/ifcvf_base.c
> index 5563b3a773c7..6c5650f73007 100644
> --- a/drivers/vdpa/ifcvf/ifcvf_base.c
> +++ b/drivers/vdpa/ifcvf/ifcvf_base.c
> @@ -329,31 +329,49 @@ int ifcvf_set_vq_state(struct ifcvf_hw *hw, u16 qid, 
> u16 num)
> return 0;
>  }
>
> -static int ifcvf_hw_enable(struct ifcvf_hw *hw)
> +void ifcvf_set_vq_num(struct ifcvf_hw *hw, u16 qid, u32 num)
>  {
> -   struct virtio_pci_common_cfg __iomem *cfg;
> -   u32 i;
> +   struct virtio_pci_common_cfg __iomem *cfg = hw->common_cfg;
>
> -   cfg = hw->common_cfg;
> -   for (i = 0; i < hw->nr_vring; i++) {
> -   if (!hw->vring[i].ready)
> -   break;
> +   vp_iowrite16(qid, &cfg->queue_select);
> +   vp_iowrite16(num, &cfg->queue_size);
> +}
>
> -   vp_iowrite16(i, &cfg->queue_select);
> -   vp_iowrite64_twopart(hw->vring[i].desc, &cfg->queue_desc_lo,
> -&cfg->queue_desc_hi);
> -   vp_iowrite64_twopart(hw->vring[i].avail, &cfg->queue_avail_lo,
> - &cfg->queue_avail_hi);
> -   vp_iowrite64_twopart(hw->vring[i].used, &cfg->queue_used_lo,
> -&cfg->queue_used_hi);
> -   vp_iowrite16(hw->vring[i].size, &cfg->queue_size);
> -   ifcvf_set_vq_state(hw, i, hw->vring[i].last_avail_idx);
> -   vp_iowrite16(1, &cfg->queue_enable);
> -   }
> +int ifcvf_set_vq_address(struct ifcvf_hw *hw, u16 qid, u64 desc_area,
> +u64 driver_area, u64 device_area)
> +{
> +   struct virtio_pci_common_cfg __iomem *cfg = hw->common_cfg;
> +
> +   vp_iowrite16(qid, &cfg->queue_select);
> +   vp_iowrite64_twopart(desc_area, &cfg->queue_desc_lo,
> +&cfg->queue_desc_hi);
> +   vp_iowrite64_twopart(driver_area, &cfg->queue_avail_lo,
> +&cfg->queue_avail_hi);
> +   vp_iowrite64_twopart(device_area, &cfg->queue_used_lo,
> +&cfg->queue_used_hi);
>
> return 0;
>  }
>
> +bool ifcvf_get_vq_ready(struct ifcvf_hw *hw, u16 qid)
> +{
> +   struct virtio_pci_common_cfg __iomem *cfg = hw->common_cfg;
> +   u16 queue_enable;
> +
> +   vp_iowrite16(qid, &cfg->queue_select);
> +   queue_enable = vp_ioread16(&cfg->queue_enable);
> +
> +   return (bool)queue_enable;
> +}
> +
> +void ifcvf_set_vq_ready(struct ifcvf_hw *hw, u16 qid, bool ready)
> +{
> +   struct virtio_pci_common_cfg __iomem *cfg = hw->common_cfg;
> +
> +   vp_iowrite16(qid, &cfg->queue_select);
> +   vp_iowrite16(ready, &cfg->queue_enable);
> +}
> +
>  static void ifcvf_hw_disable(struct ifcvf_hw *hw)
>  {
> u32 i;
> @@ -366,16 +384,12 @@ static void ifcvf_hw_disable(struct ifcvf_hw *hw)
>
>  int ifcvf_start_hw(struct ifcvf_hw *hw)
>  {
> -   ifcvf_reset(hw);
> ifcvf_add_status(hw, VIRTIO_CONFIG_S_ACKNOWLEDGE);
> ifcvf_add_status(hw, VIRTIO_CONFIG_S_DRIVER);
>
> if (ifcvf_config_features(hw) < 0)
> return -EINVAL;
>
> -   if (ifcvf_hw_enable(hw) < 0)
> -   return -EINVAL;
> -
> ifcvf_add_status(hw, VIRTIO_CONFIG_S_DRIVER_OK);
>
> return 0;
> diff --git a/drivers/vdpa/ifcvf/ifcvf_base.h b/drivers/vdpa/ifcvf/ifcvf_base.h
> index c20d1c40214e..d545a9411143 100644
> --- a/drivers/vdpa/ifcvf/ifcvf_base.h
> +++ b/drivers/vdpa/ifcvf/ifcvf_base.h
> @@ -47,12 +47,7 @@
>  #define MSIX_VECTOR_DEV_SHARED 3
>
>  struct vring_info {
> -   u64 desc;
> -   u64 avail;
> -   u64 used;
> -   u16 size;
> u16 last_avail_idx;
> -   bool ready;
> void __iomem *notify_addr;
> phys_addr_t notify_pa;
> u32 irq;
> @@ -137,4 +132,9 @@ int ifcvf_probed_virtio_net(struct ifcvf_hw *hw);
>  u32 ifcvf_get_config_size(struct ifcvf_hw *hw);
>  u16 ifcvf_set_vq_vector(struct ifcvf_hw *hw, u16 qid, int vector);
>  u16 ifcvf_set_config_vector(struct ifcvf_hw *hw, int vector);
> +void ifcvf_set_vq_num(struct ifcvf_hw *hw, u16 qid, u32 num);
> +int ifcvf_set_vq_address(struct ifcvf_hw *hw, u16 qid, u64 desc_area,
> +u64 driver_area, u64 device_area);
> +bool ifcvf_get_vq_ready(struct ifcvf_hw *hw, u16 qid);
> +void ifcvf_set_vq_ready(struct ifcvf_hw *hw, u16 qid, bool ready);
> #endif /* _IFCVF_H_ */

Re: [PATCH vhost v9 04/12] virtio_ring: virtqueue_add() support premapped

2023-05-23 Thread Xuan Zhuo
On Tue, 23 May 2023 03:19:42 -0400, "Michael S. Tsirkin"  
wrote:
> On Mon, May 22, 2023 at 11:03:26PM -0700, Christoph Hellwig wrote:
> > On Wed, May 17, 2023 at 10:22:41AM +0800, Xuan Zhuo wrote:
> > > virtqueue_add() adds parameter premapped.
> >
> > Well, I can see that.  But why?
>
> Assuming it's intentional, it should say something along the lines of
> "The parameter is unused for now, and all callers just pass false.
>  It will be used by a follow-up patch".

I agree.

Will fix.

>
> It's not a bad way to split patches: this way the actual logic in
> the next patch stands out, as opposed to being masked by the code
> reshuffling that the extra parameter requires.

I think so.

Thanks.


>
> --
> MST
>


Re: [PATCH 3/3] fork, vhost: Use CLONE_THREAD to fix freezer/ps regression

2023-05-23 Thread Mike Christie
Hey Oleg,

For all these questions below let me get back to you by tomorrow.
I need to confirm if something would be considered a regression by
the core vhost developers.

On 5/23/23 7:15 AM, Oleg Nesterov wrote:
> On 05/22, Oleg Nesterov wrote:
>>
>> Right now I think that "int dead" should die,
> 
> No, probably we shouldn't call get_signal() if we have already dequeued 
> SIGKILL.
> 
>> but let me think tomorrow.
> 
> May be something like this... I don't like it but I can't suggest anything 
> better
> right now.
> 
>   bool killed = false;
> 
>   for (;;) {
>   ...
>   
>   node = llist_del_all(&worker->work_list);
>   if (!node) {
>   schedule();
>   /*
>* When we get a SIGKILL our release function will
>* be called. That will stop new IOs from being queued
>* and check for outstanding cmd responses. It will then
>* call vhost_task_stop to tell us to return and exit.
>*/
>   if (signal_pending(current)) {
>   struct ksignal ksig;
> 
>   if (!killed)
>   killed = get_signal(&ksig);
> 
>   clear_thread_flag(TIF_SIGPENDING);
>   }
> 
>   continue;
>   }
> 
> ---
> But let me ask a couple of questions. Let's forget this patch, let's look at 
> the
> current code:
> 
>   node = llist_del_all(&worker->work_list);
>   if (!node)
>   schedule();
> 
>   node = llist_reverse_order(node);
>   ... process works ...
> 
> To me this looks a bit confusing. Shouldn't we do
> 
>   if (!node) {
>   schedule();
>   continue;
>   }
> 
> just to make the code a bit more clear? If node == NULL then
> llist_reverse_order() and llist_for_each_entry_safe() will do nothing.
> But this is minor.
> 
> 
> 
>   /* make sure flag is seen after deletion */
>   smp_wmb();
>   llist_for_each_entry_safe(work, work_next, node, node) {
>   clear_bit(VHOST_WORK_QUEUED, &work->flags);
> 
> I am not sure about smp_wmb + clear_bit. Once we clear VHOST_WORK_QUEUED,
> vhost_work_queue() can add this work again and change work->node->next.
> 
> That is why we use _safe, but we need to ensure that llist_for_each_safe()
> completes LOAD(work->node->next) before VHOST_WORK_QUEUED is cleared.
> 
> So it seems that smp_wmb() can't help and should be removed, instead we need
> 
>   llist_for_each_entry_safe(...) {
>   smp_mb__before_atomic();
>   clear_bit(VHOST_WORK_QUEUED, &work->flags);
> 
> Also, if the work->fn pointer is not stable, we should read it before
> smp_mb__before_atomic() as well.
> 
> No?
> 
> 
>   __set_current_state(TASK_RUNNING);
> 
> Why do we set TASK_RUNNING inside the loop? Does this mean that work->fn()
> can return with current->state != RUNNING ?
> 
> 
>   work->fn(work);
> 
> Now the main question. Whatever we do, SIGKILL/SIGSTOP/etc can come right
> before we call work->fn(). Is it "safe" to run this callback with
> signal_pending() or fatal_signal_pending() ?
> 
> 
> Finally. I never looked into drivers/vhost/ before so I don't understand
> this code at all, but let me ask anyway... Can we change vhost_dev_flush()
> to run the pending callbacks rather than wait for vhost_worker() ?
> I guess we can't, ->mm won't be correct, but can you confirm?
> 
> Oleg.
> 



Re: [PATCH] virtio-fs: Improved request latencies when Virtio queue is full

2023-05-23 Thread Stefan Hajnoczi
On Mon, May 22, 2023 at 03:19:15PM +0200, Peter-Jan Gootzen wrote:
> When the Virtio queue is full, a work item is scheduled to execute
> in 1ms to retry adding the request to the queue.
> This is a large amount of time on the scale on which a
> virtio-fs device can operate. When using a DPU this is around
> 40us baseline without going to a remote server (4k, QD=1).
> This patch queues requests when the Virtio queue is full,
> and when a completed request is taken off, immediately fills
> it back up with queued requests.
> 
> This reduces the 99.9th percentile latencies in our tests by
> 60x and slightly increases the overall throughput, when using a
> queue depth 2x the size of the Virtio queue size, with a
> DPU-powered virtio-fs device.
> 
> Signed-off-by: Peter-Jan Gootzen 
> ---
>  fs/fuse/virtio_fs.c | 10 --
>  1 file changed, 4 insertions(+), 6 deletions(-)
> 
> diff --git a/fs/fuse/virtio_fs.c b/fs/fuse/virtio_fs.c
> index 4d8d4f16c727..8af9d3dc61d3 100644
> --- a/fs/fuse/virtio_fs.c
> +++ b/fs/fuse/virtio_fs.c
> @@ -347,6 +347,8 @@ static void virtio_fs_hiprio_done_work(struct work_struct 
> *work)
>   }
>   } while (!virtqueue_enable_cb(vq) && likely(!virtqueue_is_broken(vq)));
>   spin_unlock(&fsvq->lock);
> +
> + schedule_delayed_work(&fsvq->dispatch_work, 0);

This avoids scheduling work when there is nothing queued and uses
schedule_work() since there is no timeout value:

  if (!list_empty(&fsvq->queued_reqs)) {
  schedule_work(&fsvq->dispatch_work);
  }
  spin_unlock(&fsvq->lock);



Re: [PATCH 3/3] fork, vhost: Use CLONE_THREAD to fix freezer/ps regression

2023-05-23 Thread Eric W. Biederman
Oleg Nesterov  writes:

> On 05/22, Oleg Nesterov wrote:
>>
>> Right now I think that "int dead" should die,
>
> No, probably we shouldn't call get_signal() if we have already
> dequeued SIGKILL.

Very much agreed.  It is one thing to add a patch to move do_exit
out of get_signal.  It is another to keep calling get_signal after
that.  Nothing tests that case, and so we get some weird behaviors.


>> but let me think tomorrow.
>
> May be something like this... I don't like it but I can't suggest anything 
> better
> right now.
>
>   bool killed = false;
>
>   for (;;) {
>   ...
>   
>   node = llist_del_all(&worker->work_list);
>   if (!node) {
>   schedule();
>   /*
>* When we get a SIGKILL our release function will
>* be called. That will stop new IOs from being queued
>* and check for outstanding cmd responses. It will then
>* call vhost_task_stop to tell us to return and exit.
>*/
>   if (signal_pending(current)) {
>   struct ksignal ksig;
>
>   if (!killed)
>   killed = get_signal(&ksig);
>
>   clear_thread_flag(TIF_SIGPENDING);
>   }
>
>   continue;
>   }

I want to point out that we need to consider not just SIGKILL, but
SIGABRT that causes a coredump, as well as the process performing
an ordinary exit(2), all of which will cause get_signal to return
SIGKILL in this context.

>
> ---
> But let me ask a couple of questions.

I share most of these questions.

> Let's forget this patch, let's look at the
> current code:
>
>   node = llist_del_all(&worker->work_list);
>   if (!node)
>   schedule();
>
>   node = llist_reverse_order(node);
>   ... process works ...
>
> To me this looks a bit confusing. Shouldn't we do
>
>   if (!node) {
>   schedule();
>   continue;
>   }
>
> just to make the code a bit more clear? If node == NULL then
> llist_reverse_order() and llist_for_each_entry_safe() will do nothing.
> But this is minor.
>
>
>
>   /* make sure flag is seen after deletion */
>   smp_wmb();
>   llist_for_each_entry_safe(work, work_next, node, node) {
>   clear_bit(VHOST_WORK_QUEUED, &work->flags);
>
> I am not sure about smp_wmb + clear_bit. Once we clear VHOST_WORK_QUEUED,
> vhost_work_queue() can add this work again and change work->node->next.
>
> That is why we use _safe, but we need to ensure that llist_for_each_safe()
> completes LOAD(work->node->next) before VHOST_WORK_QUEUED is cleared.
>
> So it seems that smp_wmb() can't help and should be removed, instead we need
>
>   llist_for_each_entry_safe(...) {
>   smp_mb__before_atomic();
>   clear_bit(VHOST_WORK_QUEUED, &work->flags);
>
> Also, if the work->fn pointer is not stable, we should read it before
> smp_mb__before_atomic() as well.
>
> No?
>
>
>   __set_current_state(TASK_RUNNING);
>
> Why do we set TASK_RUNNING inside the loop? Does this mean that work->fn()
> can return with current->state != RUNNING ?
>
>
>   work->fn(work);
>
> Now the main question. Whatever we do, SIGKILL/SIGSTOP/etc can come right
> before we call work->fn(). Is it "safe" to run this callback with
> signal_pending() or fatal_signal_pending() ?
>
>
> Finally. I never looked into drivers/vhost/ before so I don't understand
> this code at all, but let me ask anyway... Can we change vhost_dev_flush()
> to run the pending callbacks rather than wait for vhost_worker() ?
> I guess we can't, ->mm won't be correct, but can you confirm?

In a conversation long ago I remember hearing that vhost does not
support file descriptor passing.  Which means all of the file
descriptors should be in the same process.

Looking at the vhost code what I am seeing happening is that the
vhost_worker persists until vhost_dev_cleanup is called from
one of the vhost_???_release() functions.  The release functions
are only called after the last flush function completes.  See __fput
if you want to trace the details.


On one hand this all seems reasonable.  On the other hand I am not
seeing the code that prevents file descriptor passing.


It is probably not the worst thing in the world, but what this means is
that if you pass a copy of the vhost file descriptor to another process,
the vhost_worker will persist, and thus the process will persist until
that copy of the file descriptor is closed.

Eric


Re: [PATCH 3/3] fork, vhost: Use CLONE_THREAD to fix freezer/ps regression

2023-05-23 Thread Mike Christie
On 5/22/23 2:40 PM, Michael S. Tsirkin wrote:
>>  return copy_process(NULL, 0, node, &args);
>> diff --git a/kernel/signal.c b/kernel/signal.c
>> index 8050fe23c732..a0f00a078cbb 100644
>> --- a/kernel/signal.c
>> +++ b/kernel/signal.c
>> @@ -2891,6 +2891,7 @@ bool get_signal(struct ksignal *ksig)
>>  
>>  return ksig->sig > 0;
>>  }
>> +EXPORT_SYMBOL_GPL(get_signal);
> 
> If you are exporting this, could you add documentation please?
> 

Ok.




Re: [PATCH 3/3] fork, vhost: Use CLONE_THREAD to fix freezer/ps regression

2023-05-23 Thread Eric W. Biederman
"Michael S. Tsirkin"  writes:

> On Sun, May 21, 2023 at 09:51:24PM -0500, Mike Christie wrote:
>> When switching from kthreads to vhost_tasks two bugs were added:
>> 1. The vhost worker tasks now show up as processes, so scripts doing ps
>> or ps a would now incorrectly detect the vhost task as another process.
>> 2. kthreads disabled freezing by setting PF_NOFREEZE, but vhost tasks
>> neither disabled it nor added support for it.
>> 
>> To fix both bugs, this switches the vhost task to be a thread in the
>> process that does the VHOST_SET_OWNER ioctl, and has vhost_worker call
>> get_signal to support SIGKILL/SIGSTOP and freeze signals. Note that
>> SIGKILL/STOP support is required because CLONE_THREAD requires
>> CLONE_SIGHAND, which requires those 2 signals to be supported.
>> 
>> This is a modified version of a patch originally written by Linus, which
>> handles his review comment to himself to rename ignore_signals to
>> block_signals to better represent what it now does. It also includes a
>> change to vhost_worker() to support SIGSTOP/KILL and freeze, and it
>> drops the wait use per Oleg's review comment that it's no longer needed
>> when using CLONE_THREAD.
>> 
>> Fixes: 6e890c5d5021 ("vhost: use vhost_tasks for worker threads")
>> Signed-off-by: Mike Christie 
>> ---
>>  drivers/vhost/vhost.c  | 17 -
>>  include/linux/sched/task.h |  2 +-
>>  kernel/fork.c  | 12 +++-
>>  kernel/signal.c|  1 +
>>  kernel/vhost_task.c| 16 
>>  5 files changed, 25 insertions(+), 23 deletions(-)
>> 
>> diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
>> index a92af08e7864..bf83e9340e40 100644
>> --- a/drivers/vhost/vhost.c
>> +++ b/drivers/vhost/vhost.c
>> @@ -338,6 +338,7 @@ static int vhost_worker(void *data)
>>  struct vhost_worker *worker = data;
>>  struct vhost_work *work, *work_next;
>>  struct llist_node *node;
>> +bool dead = false;
>>  
>>  for (;;) {
>>  /* mb paired w/ kthread_stop */
>> @@ -349,8 +350,22 @@ static int vhost_worker(void *data)
>>  }
>>  
>>  node = llist_del_all(&worker->work_list);
>> -if (!node)
>> +if (!node) {
>>  schedule();
>> +/*
>> + * When we get a SIGKILL our release function will
>> + * be called. That will stop new IOs from being queued
>> + * and check for outstanding cmd responses. It will then
>> + * call vhost_task_stop to tell us to return and exit.
>> + */
>> +if (!dead && signal_pending(current)) {
>> +struct ksignal ksig;
>> +
>> +dead = get_signal(&ksig);
>> +if (dead)
>> +clear_thread_flag(TIF_SIGPENDING);
>
>
> Does get_signal actually return true only on SIGKILL then?

get_signal returns the signal that was dequeued, or 0 if no signal was
dequeued.

For these threads, which block all but SIGSTOP and SIGKILL, get_signal
should properly never return any signal, as SIGSTOP and SIGKILL are
handled internally to get_signal.

However, get_signal was modified to have a special case for io_uring
and now the vhost code, so that extra cleanup can be performed before
do_exit is called on the thread.  That special case causes get_signal
to return with the value of SIGKILL when the process exits.

The process can exit with do_group_exit aka exit(2) or any fatal signal,
not just SIGKILL, and get_signal on these threads will return.

Eric


Re: [PATCH 1/3] signal: Don't always put SIGKILL in shared_pending

2023-05-23 Thread Eric W. Biederman
Mike Christie  writes:

> When get_pending detects the task has been marked to be killed we try to
   ^^^ get_signal
> clean up the SIGKILL by doing a sigdelset and recalc_sigpending, but we
> still leave it in shared_pending. If the signal is being short-circuit
> delivered there is no need to put it in shared_pending, so this adds a check
> in complete_signal.
>
> This patch was modified from Eric Biederman 
> original patch.
>
> Signed-off-by: Mike Christie 
> ---
>  kernel/signal.c | 8 
>  1 file changed, 8 insertions(+)
>
> diff --git a/kernel/signal.c b/kernel/signal.c
> index 8f6330f0e9ca..3dc99b9aec7f 100644
> --- a/kernel/signal.c
> +++ b/kernel/signal.c
> @@ -1052,6 +1052,14 @@ static void complete_signal(int sig, struct 
> task_struct *p, enum pid_type type)
>   signal->flags = SIGNAL_GROUP_EXIT;
>   signal->group_exit_code = sig;
>   signal->group_stop_count = 0;
> +
> + /*
> +  * The signal is being short circuit delivered so
> +  * don't set pending.
> +  */
> + if (type != PIDTYPE_PID)
> + sigdelset(&signal->shared_pending.signal, sig);
> +
>   t = p;
>   do {
>   task_clear_jobctl_pending(t, 
> JOBCTL_PENDING_MASK);

Oleg Nesterov  writes:
>
> Eric, sorry. I fail to understand this patch.
>
> How can it help? And whom?

You were looking at why recalc_sigpending was resulting in
TIF_SIGPENDING set.

The big bug was that get_signal was getting called by the thread after
the thread had realized it was part of a group exit.

The minor bug is that SIGKILL was stuck in shared_pending and causing
recalc_sigpending to set TIF_SIGPENDING after get_signal removed the
per thread flag that asks the thread to exit.



The fact is that fatal signals (that pass all of the checks) are
delivered right there in complete_signal so it does not make sense from
a data structure consistency standpoint to leave the fatal signal (like
SIGKILL) in shared_pending.

Outside of this case it will only affect coredumps and other analyzers
that run at process exit.



One thing I am looking at is that the vhost code shares a common problem
with the code that coredumps to pipes.  There is code that tests
signal_pending() and does something with it after signal processing has
completed.

Fixing the data structure to be consistent seems like one way to handle
that situation.

Eric


Re: [PATCH] vhost: use kzalloc() instead of kmalloc() followed by memset()

2023-05-23 Thread Stefano Garzarella

On Mon, May 22, 2023 at 02:20:19PM +0530, Prathu Baronia wrote:

Use kzalloc() to allocate new zeroed out msg node instead of
memsetting a node allocated with kmalloc().

Signed-off-by: Prathu Baronia 
---
drivers/vhost/vhost.c | 5 ++---
1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
index a92af08e7864..579ecb4ee4d2 100644
--- a/drivers/vhost/vhost.c
+++ b/drivers/vhost/vhost.c
@@ -2575,12 +2575,11 @@ EXPORT_SYMBOL_GPL(vhost_disable_notify);
/* Create a new message. */
struct vhost_msg_node *vhost_new_msg(struct vhost_virtqueue *vq, int type)
{
-   struct vhost_msg_node *node = kmalloc(sizeof *node, GFP_KERNEL);
+   /* Make sure all padding within the structure is initialized. */
+   struct vhost_msg_node *node = kzalloc(sizeof(*node), GFP_KERNEL);
if (!node)
return NULL;

-   /* Make sure all padding within the structure is initialized. */
-   memset(&node->msg, 0, sizeof node->msg);


the patch LGTM:

Reviewed-by: Stefano Garzarella 


node->vq = vq;
node->msg.type = type;
return node;

base-commit: 4d6d4c7f541d7027beed4fb86eb2c451bd8d6fff
--
2.34.1






Re: [PATCH 3/3] fork, vhost: Use CLONE_THREAD to fix freezer/ps regression

2023-05-23 Thread Oleg Nesterov
On 05/22, Oleg Nesterov wrote:
>
> Right now I think that "int dead" should die,

No, probably we shouldn't call get_signal() if we have already dequeued SIGKILL.

> but let me think tomorrow.

May be something like this... I don't like it but I can't suggest anything 
better
right now.

bool killed = false;

for (;;) {
...

node = llist_del_all(&worker->work_list);
if (!node) {
schedule();
/*
 * When we get a SIGKILL our release function will
 * be called. That will stop new IOs from being queued
 * and check for outstanding cmd responses. It will then
 * call vhost_task_stop to tell us to return and exit.
 */
if (signal_pending(current)) {
struct ksignal ksig;

if (!killed)
killed = get_signal(&ksig);

clear_thread_flag(TIF_SIGPENDING);
}

continue;
}

---
But let me ask a couple of questions. Let's forget this patch, let's look at the
current code:

node = llist_del_all(&worker->work_list);
if (!node)
schedule();

node = llist_reverse_order(node);
... process works ...

To me this looks a bit confusing. Shouldn't we do

if (!node) {
schedule();
continue;
}

just to make the code a bit more clear? If node == NULL then
llist_reverse_order() and llist_for_each_entry_safe() will do nothing.
But this is minor.



/* make sure flag is seen after deletion */
smp_wmb();
llist_for_each_entry_safe(work, work_next, node, node) {
clear_bit(VHOST_WORK_QUEUED, &work->flags);

I am not sure about smp_wmb + clear_bit. Once we clear VHOST_WORK_QUEUED,
vhost_work_queue() can add this work again and change work->node->next.

That is why we use _safe, but we need to ensure that llist_for_each_safe()
completes LOAD(work->node->next) before VHOST_WORK_QUEUED is cleared.

So it seems that smp_wmb() can't help and should be removed, instead we need

llist_for_each_entry_safe(...) {
smp_mb__before_atomic();
clear_bit(VHOST_WORK_QUEUED, &work->flags);

Also, if the work->fn pointer is not stable, we should read it before
smp_mb__before_atomic() as well.
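
IOW, an untested sketch of what I mean (illustrative only, reading ->fn
into a local before the barrier):

	llist_for_each_entry_safe(work, work_next, node, node) {
		/* read everything we need from *work ... */
		vhost_work_fn_t fn = work->fn;

		/* ... before clearing the flag allows it to be requeued/reused */
		smp_mb__before_atomic();
		clear_bit(VHOST_WORK_QUEUED, &work->flags);

		__set_current_state(TASK_RUNNING);
		fn(work);
	}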

No?


__set_current_state(TASK_RUNNING);

Why do we set TASK_RUNNING inside the loop? Does this mean that work->fn()
can return with current->state != RUNNING ?


work->fn(work);

Now the main question. Whatever we do, SIGKILL/SIGSTOP/etc can come right
before we call work->fn(). Is it "safe" to run this callback with
signal_pending() or fatal_signal_pending() ?


Finally. I never looked into drivers/vhost/ before so I don't understand
this code at all, but let me ask anyway... Can we change vhost_dev_flush()
to run the pending callbacks rather than wait for vhost_worker() ?
I guess we can't, ->mm won't be correct, but can you confirm?

Oleg.



Re: vhost stable branch

2023-05-23 Thread Michael S. Tsirkin
On Tue, May 23, 2023 at 08:38:47AM +, Dragos Tatulea wrote:
> On Tue, 2023-05-23 at 04:18 -0400, Michael S. Tsirkin wrote:
> > On Tue, May 23, 2023 at 07:16:58AM +, Dragos Tatulea wrote:
> > > Hi,
> > > 
> > > I am looking for the stable branch for vdpa fixes in the vhost tree [1].
> > > There
> > > are 3 branches that seem identical: linux-next, test and vhost. linux-next
> > > seems
> > > to be -next branch. Which one would be the stable branch?
> > > 
> > > [1] https://git.kernel.org/pub/scm/linux/kernel/git/mst/vhost.git
> > > 
> > > Thanks,
> > > Dragos
> > 
> > I don't publish one until I am ready to send patches to Linus.
> > Which should be today or tomorrow.
> > 
> Understood. Is it the same with patches for -next? Are they published only 
> once
> the patches are sent to Linus?
> 
> Thanks,
> Dragos
> 

A bit different. I start worrying about next when I'm done with stable.
Also my next branch rebases frequently, I also drop and squash patches,
rewrite commit log etc.

-- 
MST



Re: vhost stable branch

2023-05-23 Thread Dragos Tatulea via Virtualization
On Tue, 2023-05-23 at 04:18 -0400, Michael S. Tsirkin wrote:
> On Tue, May 23, 2023 at 07:16:58AM +, Dragos Tatulea wrote:
> > Hi,
> > 
> > I am looking for the stable branch for vdpa fixes in the vhost tree [1].
> > There
> > are 3 branches that seem identical: linux-next, test and vhost. linux-next
> > seems
> > to be -next branch. Which one would be the stable branch?
> > 
> > [1] https://git.kernel.org/pub/scm/linux/kernel/git/mst/vhost.git
> > 
> > Thanks,
> > Dragos
> 
> I don't publish one until I am ready to send patches to Linus.
> Which should be today or tomorrow.
> 
Understood. Is it the same with patches for -next? Are they published only once
the patches are sent to Linus?

Thanks,
Dragos



Re: vhost stable branch

2023-05-23 Thread Michael S. Tsirkin
On Tue, May 23, 2023 at 07:16:58AM +, Dragos Tatulea wrote:
> Hi,
> 
> I am looking for the stable branch for vdpa fixes in the vhost tree [1]. There
> are 3 branches that seem identical: linux-next, test and vhost. linux-next 
> seems
> to be -next branch. Which one would be the stable branch?
> 
> [1] https://git.kernel.org/pub/scm/linux/kernel/git/mst/vhost.git
> 
> Thanks,
> Dragos

Here's the changelog if you want to know what I'm intending to send:

 tools/virtio: ringtest: Add .gitignore
 vhost_net: revert upend_idx only on retriable error
 virtio_net: Fix error unwinding of XDP initialization
 virtio_net: Close queue pairs using helper function
 virtio_net: suppress cpu stall when free_unused_bufs
 hwrng: virtio - Fix race on data_avail and actual data
 virtio-vdpa: Fix unchecked call to NULL set_vq_affinity
 tools/virtio: fix build break for aarch64
 virtio-crypto: fix NULL pointer dereference in virtio_crypto_free_request
 virtio_net: Fix error unwinding of XDP initialization
 vdpa/mlx5: Fix hang when cvq commands are triggered during device unregister

If I missed any fixes let me know!
-- 
MST



Re: vhost stable branch

2023-05-23 Thread Michael S. Tsirkin
On Tue, May 23, 2023 at 07:16:58AM +, Dragos Tatulea wrote:
> Hi,
> 
> I am looking for the stable branch for vdpa fixes in the vhost tree [1]. There
> are 3 branches that seem identical: linux-next, test and vhost. linux-next 
> seems
> to be -next branch. Which one would be the stable branch?
> 
> [1] https://git.kernel.org/pub/scm/linux/kernel/git/mst/vhost.git
> 
> Thanks,
> Dragos

I don't publish one until I am ready to send patches to Linus.
Which should be today or tomorrow.

-- 
MST



Re: [PATCH vhost v9 04/12] virtio_ring: virtqueue_add() support premapped

2023-05-23 Thread Michael S. Tsirkin
On Mon, May 22, 2023 at 11:03:26PM -0700, Christoph Hellwig wrote:
> On Wed, May 17, 2023 at 10:22:41AM +0800, Xuan Zhuo wrote:
> > virtqueue_add() adds parameter premapped.
> 
> Well, I can see that.  But why?

Assuming it's intentional, it should say something along the lines of
"The parameter is unused for now, and all callers just pass false.
 It will be used by a follow-up patch".

It's not a bad way to split patches: this way the actual logic in
the next patch stands out, as opposed to being masked by the code
reshuffling that the extra parameter requires.
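
For example (purely illustrative, not the actual virtio_ring code):

	/* patch 1: add the parameter, ignored for now */
	static int vq_add(struct vq *vq, void *buf, bool premapped)
	{
		/* only the follow-up patch consumes premapped */
		return vq_add_buf(vq, buf);
	}

	/* every caller is updated mechanically */
	err = vq_add(vq, buf, false);

and the follow-up patch that actually uses premapped is then a small,
readable diff.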

-- 
MST



vhost stable branch

2023-05-23 Thread Dragos Tatulea via Virtualization
Hi,

I am looking for the stable branch for vdpa fixes in the vhost tree [1]. There
are 3 branches that seem identical: linux-next, test and vhost. linux-next seems
to be -next branch. Which one would be the stable branch?

[1] https://git.kernel.org/pub/scm/linux/kernel/git/mst/vhost.git

Thanks,
Dragos