On Thu, Jun 01, 2023 at 01:32:32PM -0500, Mike Christie wrote:
> When switching from kthreads to vhost_tasks two bugs were added:
> 1. The vhost worker tasks now show up as processes, so scripts doing
> ps or ps a would incorrectly detect the vhost task as another
> process.  2. kthreads disabled freeze by setting PF_NOFREEZE, but
> vhost tasks didn't disable freezing or add support for it.
> 
> To fix both bugs, this switches the vhost task to be thread in the
> process that does the VHOST_SET_OWNER ioctl, and has vhost_worker call
> get_signal to support SIGKILL/SIGSTOP and freeze signals. Note that
> SIGKILL/STOP support is required because CLONE_THREAD requires
> CLONE_SIGHAND which requires those 2 signals to be supported.
> 
> This is a modified version of the patch written by Mike Christie
> <michael.chris...@oracle.com> which was a modified version of patch
> originally written by Linus.
> 
> Much of what depended upon PF_IO_WORKER now depends on PF_USER_WORKER.
> Including ignoring signals, setting up the register state, and having
> get_signal return instead of calling do_group_exit.
> 
> Tidied up the vhost_task abstraction so that the definition of
> vhost_task only needs to be visible inside of vhost_task.c.  Making
> it easier to review the code and tell what needs to be done where.
> As part of this the main loop has been moved from vhost_worker into
> vhost_task_fn.  vhost_worker now returns true if work was done.
> 
> The main loop has been updated to call get_signal which handles
> SIGSTOP, freezing, and collects the message that tells the thread to
> exit as part of process exit.  This collection clears
> __fatal_signal_pending.  This collection is not guaranteed to
> clear signal_pending() so clear that explicitly so the schedule()
> sleeps.
> 
> For now the vhost thread continues to exist and run work until the
> last file descriptor is closed and the release function is called as
> part of freeing struct file.  To avoid hangs in the coredump
> rendezvous and when killing threads in a multi-threaded exec, the
> coredump code and de_thread have been modified to ignore vhost threads.
> 
> Removing the special case for exec appears to require teaching
> vhost_dev_flush how to directly complete transactions in case
> the vhost thread is no longer running.
> 
> Removing the special case for coredump rendezvous requires either the
> above fix needed for exec or moving the coredump rendezvous into
> get_signal.
> 
> Fixes: 6e890c5d5021 ("vhost: use vhost_tasks for worker threads")
> Signed-off-by: Eric W. Biederman <ebied...@xmission.com>
> Co-developed-by: Mike Christie <michael.chris...@oracle.com>
> Signed-off-by: Mike Christie <michael.chris...@oracle.com>


Acked-by: Michael S. Tsirkin <m...@redhat.com>


> ---
>  arch/x86/include/asm/fpu/sched.h |  2 +-
>  arch/x86/kernel/fpu/context.h    |  2 +-
>  arch/x86/kernel/fpu/core.c       |  2 +-
>  drivers/vhost/vhost.c            | 22 ++------
>  fs/coredump.c                    |  4 +-
>  include/linux/sched/task.h       |  1 -
>  include/linux/sched/vhost_task.h | 15 ++----
>  kernel/exit.c                    |  5 +-
>  kernel/fork.c                    | 13 ++---
>  kernel/signal.c                  |  8 +--
>  kernel/vhost_task.c              | 92 +++++++++++++++++++++-----------
>  11 files changed, 89 insertions(+), 77 deletions(-)
> 
> diff --git a/arch/x86/include/asm/fpu/sched.h 
> b/arch/x86/include/asm/fpu/sched.h
> index c2d6cd78ed0c..78fcde7b1f07 100644
> --- a/arch/x86/include/asm/fpu/sched.h
> +++ b/arch/x86/include/asm/fpu/sched.h
> @@ -39,7 +39,7 @@ extern void fpu_flush_thread(void);
>  static inline void switch_fpu_prepare(struct fpu *old_fpu, int cpu)
>  {
>       if (cpu_feature_enabled(X86_FEATURE_FPU) &&
> -         !(current->flags & (PF_KTHREAD | PF_IO_WORKER))) {
> +         !(current->flags & (PF_KTHREAD | PF_USER_WORKER))) {
>               save_fpregs_to_fpstate(old_fpu);
>               /*
>                * The save operation preserved register state, so the
> diff --git a/arch/x86/kernel/fpu/context.h b/arch/x86/kernel/fpu/context.h
> index 9fcfa5c4dad7..af5cbdd9bd29 100644
> --- a/arch/x86/kernel/fpu/context.h
> +++ b/arch/x86/kernel/fpu/context.h
> @@ -57,7 +57,7 @@ static inline void fpregs_restore_userregs(void)
>       struct fpu *fpu = &current->thread.fpu;
>       int cpu = smp_processor_id();
>  
> -     if (WARN_ON_ONCE(current->flags & (PF_KTHREAD | PF_IO_WORKER)))
> +     if (WARN_ON_ONCE(current->flags & (PF_KTHREAD | PF_USER_WORKER)))
>               return;
>  
>       if (!fpregs_state_valid(fpu, cpu)) {
> diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c
> index caf33486dc5e..1015af1ae562 100644
> --- a/arch/x86/kernel/fpu/core.c
> +++ b/arch/x86/kernel/fpu/core.c
> @@ -426,7 +426,7 @@ void kernel_fpu_begin_mask(unsigned int kfpu_mask)
>  
>       this_cpu_write(in_kernel_fpu, true);
>  
> -     if (!(current->flags & (PF_KTHREAD | PF_IO_WORKER)) &&
> +     if (!(current->flags & (PF_KTHREAD | PF_USER_WORKER)) &&
>           !test_thread_flag(TIF_NEED_FPU_LOAD)) {
>               set_thread_flag(TIF_NEED_FPU_LOAD);
>               save_fpregs_to_fpstate(&current->thread.fpu);
> diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
> index a92af08e7864..074273020849 100644
> --- a/drivers/vhost/vhost.c
> +++ b/drivers/vhost/vhost.c
> @@ -256,7 +256,7 @@ void vhost_work_queue(struct vhost_dev *dev, struct 
> vhost_work *work)
>                * test_and_set_bit() implies a memory barrier.
>                */
>               llist_add(&work->node, &dev->worker->work_list);
> -             wake_up_process(dev->worker->vtsk->task);
> +             vhost_task_wake(dev->worker->vtsk);
>       }
>  }
>  EXPORT_SYMBOL_GPL(vhost_work_queue);
> @@ -333,31 +333,19 @@ static void vhost_vq_reset(struct vhost_dev *dev,
>       __vhost_vq_meta_reset(vq);
>  }
>  
> -static int vhost_worker(void *data)
> +static bool vhost_worker(void *data)
>  {
>       struct vhost_worker *worker = data;
>       struct vhost_work *work, *work_next;
>       struct llist_node *node;
>  
> -     for (;;) {
> -             /* mb paired w/ kthread_stop */
> -             set_current_state(TASK_INTERRUPTIBLE);
> -
> -             if (vhost_task_should_stop(worker->vtsk)) {
> -                     __set_current_state(TASK_RUNNING);
> -                     break;
> -             }
> -
> -             node = llist_del_all(&worker->work_list);
> -             if (!node)
> -                     schedule();
> -
> +     node = llist_del_all(&worker->work_list);
> +     if (node) {
>               node = llist_reverse_order(node);
>               /* make sure flag is seen after deletion */
>               smp_wmb();
>               llist_for_each_entry_safe(work, work_next, node, node) {
>                       clear_bit(VHOST_WORK_QUEUED, &work->flags);
> -                     __set_current_state(TASK_RUNNING);
>                       kcov_remote_start_common(worker->kcov_handle);
>                       work->fn(work);
>                       kcov_remote_stop();
> @@ -365,7 +353,7 @@ static int vhost_worker(void *data)
>               }
>       }
>  
> -     return 0;
> +     return !!node;
>  }
>  
>  static void vhost_vq_free_iovecs(struct vhost_virtqueue *vq)
> diff --git a/fs/coredump.c b/fs/coredump.c
> index ece7badf701b..88740c51b942 100644
> --- a/fs/coredump.c
> +++ b/fs/coredump.c
> @@ -371,7 +371,9 @@ static int zap_process(struct task_struct *start, int 
> exit_code)
>               if (t != current && !(t->flags & PF_POSTCOREDUMP)) {
>                       sigaddset(&t->pending.signal, SIGKILL);
>                       signal_wake_up(t, 1);
> -                     nr++;
> +                     /* The vhost_worker does not particpate in coredumps */
> +                     if ((t->flags & (PF_USER_WORKER | PF_IO_WORKER)) != 
> PF_USER_WORKER)
> +                             nr++;
>               }
>       }
>  
> diff --git a/include/linux/sched/task.h b/include/linux/sched/task.h
> index 537cbf9a2ade..e0f5ac90a228 100644
> --- a/include/linux/sched/task.h
> +++ b/include/linux/sched/task.h
> @@ -29,7 +29,6 @@ struct kernel_clone_args {
>       u32 io_thread:1;
>       u32 user_worker:1;
>       u32 no_files:1;
> -     u32 ignore_signals:1;
>       unsigned long stack;
>       unsigned long stack_size;
>       unsigned long tls;
> diff --git a/include/linux/sched/vhost_task.h 
> b/include/linux/sched/vhost_task.h
> index 6123c10b99cf..837a23624a66 100644
> --- a/include/linux/sched/vhost_task.h
> +++ b/include/linux/sched/vhost_task.h
> @@ -2,22 +2,13 @@
>  #ifndef _LINUX_VHOST_TASK_H
>  #define _LINUX_VHOST_TASK_H
>  
> -#include <linux/completion.h>
>  
> -struct task_struct;
> +struct vhost_task;
>  
> -struct vhost_task {
> -     int (*fn)(void *data);
> -     void *data;
> -     struct completion exited;
> -     unsigned long flags;
> -     struct task_struct *task;
> -};
> -
> -struct vhost_task *vhost_task_create(int (*fn)(void *), void *arg,
> +struct vhost_task *vhost_task_create(bool (*fn)(void *), void *arg,
>                                    const char *name);
>  void vhost_task_start(struct vhost_task *vtsk);
>  void vhost_task_stop(struct vhost_task *vtsk);
> -bool vhost_task_should_stop(struct vhost_task *vtsk);
> +void vhost_task_wake(struct vhost_task *vtsk);
>  
>  #endif
> diff --git a/kernel/exit.c b/kernel/exit.c
> index 34b90e2e7cf7..edb50b4c9972 100644
> --- a/kernel/exit.c
> +++ b/kernel/exit.c
> @@ -411,7 +411,10 @@ static void coredump_task_exit(struct task_struct *tsk)
>       tsk->flags |= PF_POSTCOREDUMP;
>       core_state = tsk->signal->core_state;
>       spin_unlock_irq(&tsk->sighand->siglock);
> -     if (core_state) {
> +
> +     /* The vhost_worker does not particpate in coredumps */
> +     if (core_state &&
> +         ((tsk->flags & (PF_IO_WORKER | PF_USER_WORKER)) != PF_USER_WORKER)) 
> {
>               struct core_thread self;
>  
>               self.task = current;
> diff --git a/kernel/fork.c b/kernel/fork.c
> index ed4e01daccaa..81cba91f30bb 100644
> --- a/kernel/fork.c
> +++ b/kernel/fork.c
> @@ -2336,16 +2336,16 @@ __latent_entropy struct task_struct *copy_process(
>       p->flags &= ~PF_KTHREAD;
>       if (args->kthread)
>               p->flags |= PF_KTHREAD;
> -     if (args->user_worker)
> -             p->flags |= PF_USER_WORKER;
> -     if (args->io_thread) {
> +     if (args->user_worker) {
>               /*
> -              * Mark us an IO worker, and block any signal that isn't
> +              * Mark us a user worker, and block any signal that isn't
>                * fatal or STOP
>                */
> -             p->flags |= PF_IO_WORKER;
> +             p->flags |= PF_USER_WORKER;
>               siginitsetinv(&p->blocked, sigmask(SIGKILL)|sigmask(SIGSTOP));
>       }
> +     if (args->io_thread)
> +             p->flags |= PF_IO_WORKER;
>  
>       if (args->name)
>               strscpy_pad(p->comm, args->name, sizeof(p->comm));
> @@ -2517,9 +2517,6 @@ __latent_entropy struct task_struct *copy_process(
>       if (retval)
>               goto bad_fork_cleanup_io;
>  
> -     if (args->ignore_signals)
> -             ignore_signals(p);
> -
>       stackleak_task_init(p);
>  
>       if (pid != &init_struct_pid) {
> diff --git a/kernel/signal.c b/kernel/signal.c
> index 8f6330f0e9ca..2547fa73bde5 100644
> --- a/kernel/signal.c
> +++ b/kernel/signal.c
> @@ -1368,7 +1368,9 @@ int zap_other_threads(struct task_struct *p)
>  
>       while_each_thread(p, t) {
>               task_clear_jobctl_pending(t, JOBCTL_PENDING_MASK);
> -             count++;
> +             /* Don't require de_thread to wait for the vhost_worker */
> +             if ((t->flags & (PF_IO_WORKER | PF_USER_WORKER)) != 
> PF_USER_WORKER)
> +                     count++;
>  
>               /* Don't bother with already dead threads */
>               if (t->exit_state)
> @@ -2861,11 +2863,11 @@ bool get_signal(struct ksignal *ksig)
>               }
>  
>               /*
> -              * PF_IO_WORKER threads will catch and exit on fatal signals
> +              * PF_USER_WORKER threads will catch and exit on fatal signals
>                * themselves. They have cleanup that must be performed, so
>                * we cannot call do_exit() on their behalf.
>                */
> -             if (current->flags & PF_IO_WORKER)
> +             if (current->flags & PF_USER_WORKER)
>                       goto out;
>  
>               /*
> diff --git a/kernel/vhost_task.c b/kernel/vhost_task.c
> index b7cbd66f889e..f80d5c51ae67 100644
> --- a/kernel/vhost_task.c
> +++ b/kernel/vhost_task.c
> @@ -12,58 +12,88 @@ enum vhost_task_flags {
>       VHOST_TASK_FLAGS_STOP,
>  };
>  
> +struct vhost_task {
> +     bool (*fn)(void *data);
> +     void *data;
> +     struct completion exited;
> +     unsigned long flags;
> +     struct task_struct *task;
> +};
> +
>  static int vhost_task_fn(void *data)
>  {
>       struct vhost_task *vtsk = data;
> -     int ret;
> +     bool dead = false;
> +
> +     for (;;) {
> +             bool did_work;
> +
> +             /* mb paired w/ vhost_task_stop */
> +             if (test_bit(VHOST_TASK_FLAGS_STOP, &vtsk->flags))
> +                     break;
> +
> +             if (!dead && signal_pending(current)) {
> +                     struct ksignal ksig;
> +                     /*
> +                      * Calling get_signal will block in SIGSTOP,
> +                      * or clear fatal_signal_pending, but remember
> +                      * what was set.
> +                      *
> +                      * This thread won't actually exit until all
> +                      * of the file descriptors are closed, and
> +                      * the release function is called.
> +                      */
> +                     dead = get_signal(&ksig);
> +                     if (dead)
> +                             clear_thread_flag(TIF_SIGPENDING);
> +             }
> +
> +             did_work = vtsk->fn(vtsk->data);
> +             if (!did_work) {
> +                     set_current_state(TASK_INTERRUPTIBLE);
> +                     schedule();
> +             }
> +     }
>  
> -     ret = vtsk->fn(vtsk->data);
>       complete(&vtsk->exited);
> -     do_exit(ret);
> +     do_exit(0);
> +}
> +
> +/**
> + * vhost_task_wake - wakeup the vhost_task
> + * @vtsk: vhost_task to wake
> + *
> + * wake up the vhost_task worker thread
> + */
> +void vhost_task_wake(struct vhost_task *vtsk)
> +{
> +     wake_up_process(vtsk->task);
>  }
> +EXPORT_SYMBOL_GPL(vhost_task_wake);
>  
>  /**
>   * vhost_task_stop - stop a vhost_task
>   * @vtsk: vhost_task to stop
>   *
> - * Callers must call vhost_task_should_stop and return from their worker
> - * function when it returns true;
> + * vhost_task_fn ensures the worker thread exits after
> + * VHOST_TASK_FLAGS_SOP becomes true.
>   */
>  void vhost_task_stop(struct vhost_task *vtsk)
>  {
> -     pid_t pid = vtsk->task->pid;
> -
>       set_bit(VHOST_TASK_FLAGS_STOP, &vtsk->flags);
> -     wake_up_process(vtsk->task);
> +     vhost_task_wake(vtsk);
>       /*
>        * Make sure vhost_task_fn is no longer accessing the vhost_task before
> -      * freeing it below. If userspace crashed or exited without closing,
> -      * then the vhost_task->task could already be marked dead so
> -      * kernel_wait will return early.
> +      * freeing it below.
>        */
>       wait_for_completion(&vtsk->exited);
> -     /*
> -      * If we are just closing/removing a device and the parent process is
> -      * not exiting then reap the task.
> -      */
> -     kernel_wait4(pid, NULL, __WCLONE, NULL);
>       kfree(vtsk);
>  }
>  EXPORT_SYMBOL_GPL(vhost_task_stop);
>  
>  /**
> - * vhost_task_should_stop - should the vhost task return from the work 
> function
> - * @vtsk: vhost_task to stop
> - */
> -bool vhost_task_should_stop(struct vhost_task *vtsk)
> -{
> -     return test_bit(VHOST_TASK_FLAGS_STOP, &vtsk->flags);
> -}
> -EXPORT_SYMBOL_GPL(vhost_task_should_stop);
> -
> -/**
> - * vhost_task_create - create a copy of a process to be used by the kernel
> - * @fn: thread stack
> + * vhost_task_create - create a copy of a task to be used by the kernel
> + * @fn: vhost worker function
>   * @arg: data to be passed to fn
>   * @name: the thread's name
>   *
> @@ -71,17 +101,17 @@ EXPORT_SYMBOL_GPL(vhost_task_should_stop);
>   * failure. The returned task is inactive, and the caller must fire it up
>   * through vhost_task_start().
>   */
> -struct vhost_task *vhost_task_create(int (*fn)(void *), void *arg,
> +struct vhost_task *vhost_task_create(bool (*fn)(void *), void *arg,
>                                    const char *name)
>  {
>       struct kernel_clone_args args = {
> -             .flags          = CLONE_FS | CLONE_UNTRACED | CLONE_VM,
> +             .flags          = CLONE_FS | CLONE_UNTRACED | CLONE_VM |
> +                               CLONE_THREAD | CLONE_SIGHAND,
>               .exit_signal    = 0,
>               .fn             = vhost_task_fn,
>               .name           = name,
>               .user_worker    = 1,
>               .no_files       = 1,
> -             .ignore_signals = 1,
>       };
>       struct vhost_task *vtsk;
>       struct task_struct *tsk;
> -- 
> 2.25.1

_______________________________________________
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization

Reply via email to