Each vhost device gets a thread that is used to perform IO and management operations. Instead of a thread that is accessing a device, the thread is part of the device, so when it calls kernel_copy_process we can't dup or clone the parent's (Qemu thread that does the VHOST_SET_OWNER ioctl) files/FDS because it would do an extra increment on ourself.
Later, when we do: Qemu process exits: do_exit -> exit_files -> put_files_struct -> close_files we would leak the device's resources because of that extra refcount on the fd or file_struct. This patch adds a no_files option so these worker threads can prevent taking an extra refcount on themselves. Signed-off-by: Mike Christie <michael.chris...@oracle.com> --- include/linux/sched/task.h | 3 ++- kernel/fork.c | 14 +++++++++++--- 2 files changed, 13 insertions(+), 4 deletions(-) diff --git a/include/linux/sched/task.h b/include/linux/sched/task.h index c55f1eb69d41..d0b0872f56cc 100644 --- a/include/linux/sched/task.h +++ b/include/linux/sched/task.h @@ -32,6 +32,7 @@ struct kernel_clone_args { size_t set_tid_size; int cgroup; int io_thread; + int no_files; struct cgroup *cgrp; struct css_set *cset; }; @@ -86,7 +87,7 @@ extern pid_t kernel_clone(struct kernel_clone_args *kargs); struct task_struct *create_io_thread(int (*fn)(void *), void *arg, int node); struct task_struct *kernel_copy_process(int (*fn)(void *), void *arg, int node, unsigned long clone_flags, - int io_thread); + int io_thread, int no_files); struct task_struct *fork_idle(int); struct mm_struct *copy_init_mm(void); extern pid_t kernel_thread(int (*fn)(void *), void *arg, unsigned long flags); diff --git a/kernel/fork.c b/kernel/fork.c index cec7b6011beb..a0468e30b27e 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1532,7 +1532,8 @@ static int copy_fs(unsigned long clone_flags, struct task_struct *tsk) return 0; } -static int copy_files(unsigned long clone_flags, struct task_struct *tsk) +static int copy_files(unsigned long clone_flags, struct task_struct *tsk, + int no_files) { struct files_struct *oldf, *newf; int error = 0; @@ -1544,6 +1545,11 @@ static int copy_files(unsigned long clone_flags, struct task_struct *tsk) if (!oldf) goto out; + if (no_files) { + tsk->files = NULL; + goto out; + } + if (clone_flags & CLONE_FILES) { atomic_inc(&oldf->count); goto out; @@ -2179,7 +2185,7 @@ static __latent_entropy struct task_struct *copy_process( retval = copy_semundo(clone_flags, p); if (retval) goto bad_fork_cleanup_security; - retval = copy_files(clone_flags, p); + retval = copy_files(clone_flags, p, args->no_files); if (retval) goto bad_fork_cleanup_semundo; retval = copy_fs(clone_flags, p); @@ -2539,6 +2545,7 @@ struct task_struct *create_io_thread(int (*fn)(void *), void *arg, int node) * @node: numa node to allocate task from * @clone_flags: CLONE flags * @io_thread: 1 if this will be a PF_IO_WORKER else 0. + * @no_files: Do not duplicate or copy the parent's open files. * * This returns a created task, or an error pointer. The returned task is * inactive, and the caller must fire it up through wake_up_new_task(p). If @@ -2546,7 +2553,7 @@ struct task_struct *create_io_thread(int (*fn)(void *), void *arg, int node) */ struct task_struct *kernel_copy_process(int (*fn)(void *), void *arg, int node, unsigned long clone_flags, - int io_thread) + int io_thread, int no_files) { struct kernel_clone_args args = { .flags = ((lower_32_bits(clone_flags) | CLONE_VM | @@ -2555,6 +2562,7 @@ struct task_struct *kernel_copy_process(int (*fn)(void *), void *arg, int node, .stack = (unsigned long)fn, .stack_size = (unsigned long)arg, .io_thread = io_thread, + .no_files = no_files, }; return copy_process(NULL, 0, node, &args); -- 2.25.1 _______________________________________________ Virtualization mailing list Virtualization@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/virtualization