Re: [RFC PATCH] pipe: make pipe_release() deferrable.
On 2020/08/23 1:30, Linus Torvalds wrote: > On Fri, Aug 21, 2020 at 9:35 PM Tetsuo Handa > wrote: >> >> Therefore, this patch tries to convert __pipe_lock() in pipe_release() to >> killable, by deferring to a workqueue context when __pipe_lock_killable() >> failed. > > I don't think this is an improvement. > > If somebody can delay the pipe unlock arbitrarily, you've now turned a > user-visible blocking operation into blocking a workqueue instead. So > it's still there, and now it possibly is much worse and blocks > system_wq instead. We can use a dedicated WQ_MEM_RECLAIM workqueue (like mm_percpu_wq) if blocking system_wq is a concern. Moving a user-visible blocking operation into blocking a workqueue helps avoiding AB-BA deadlocks in some situations. This hung task report says that a task can't close file descriptor of userfaultfd unless pipe_release() completes while pipe_write() (which is blocking pipe_release()) can abort if file descriptor of userfaultfd is closed. A different report [1] says that a task can't close file descriptor of /dev/raw-gadget unless wdm_flush() completes while wdm_flush() can abort if file descriptor of /dev/raw-gadget is closed. handle_userfault() is a method for delaying for arbitrarily period (and this report says the period is forever due to AB-BA deadlock condition). But if each copy_page_to_iter()/copy_page_from_iter() takes 10 seconds for whatever reason, it is sufficient for triggering khungtaskd warning (demonstrated by patch below). We can't use too short pagefault timeout for "do not break e.g. live migration" but we need to use short pagefault timeout for "do not trigger khungtaskd warning" and we can't use long khungtaskd timeout for "detect really hanging tasks". I think these are incompatible as long as uninterruptible wait is used. [1] https://lkml.kernel.org/r/1597188375-4787-1-git-send-email-penguin-ker...@i-love.sakura.ne.jp diff --git a/fs/pipe.c b/fs/pipe.c index 60dbee457143..2510c6a330b8 100644 --- a/fs/pipe.c +++ b/fs/pipe.c @@ -82,9 +82,13 @@ void pipe_unlock(struct pipe_inode_info *pipe) } EXPORT_SYMBOL(pipe_unlock); -static inline void __pipe_lock(struct pipe_inode_info *pipe) +static inline void __pipe_lock(struct pipe_inode_info *pipe, const char *func) { + if (!strcmp(current->comm, "a.out")) + printk("%s started __pipe_lock()\n", func); mutex_lock_nested(&pipe->mutex, I_MUTEX_PARENT); + if (!strcmp(current->comm, "a.out")) + printk("%s completed __pipe_lock()\n", func); } static inline void __pipe_unlock(struct pipe_inode_info *pipe) @@ -244,7 +248,7 @@ pipe_read(struct kiocb *iocb, struct iov_iter *to) return 0; ret = 0; - __pipe_lock(pipe); + __pipe_lock(pipe, __func__); /* * We only wake up writers if the pipe was full when we started @@ -306,6 +310,12 @@ pipe_read(struct kiocb *iocb, struct iov_iter *to) break; } + if (!strcmp(current->comm, "a.out")) { + int i; + + for (i = 0; i < 10; i++) + schedule_timeout_uninterruptible(HZ); + } written = copy_page_to_iter(buf->page, buf->offset, chars, to); if (unlikely(written < chars)) { if (!ret) @@ -381,7 +391,7 @@ pipe_read(struct kiocb *iocb, struct iov_iter *to) if (wait_event_interruptible_exclusive(pipe->rd_wait, pipe_readable(pipe)) < 0) return -ERESTARTSYS; - __pipe_lock(pipe); + __pipe_lock(pipe, __func__); was_full = pipe_full(pipe->head, pipe->tail, pipe->max_usage); wake_next_reader = true; } @@ -432,7 +442,7 @@ pipe_write(struct kiocb *iocb, struct iov_iter *from) if (unlikely(total_len == 0)) return 0; - __pipe_lock(pipe); + __pipe_lock(pipe, __func__); if (!pipe->readers) { send_sig(SIGPIPE, current, 0); @@ -472,6 +482,12 @@ pipe_write(struct kiocb *iocb, struct iov_iter *from) if (ret) goto out; + if (!strcmp(current->comm, "a.out")) { + int i; + + for (i = 0; i < 10; i++) + schedule_timeout_uninterruptible(HZ); + } ret = copy_page_from_iter(buf->page, offset, chars, from); if (unlikely(ret < chars)) { ret = -EFAULT; @@ -536,6 +552,12 @@ pipe_write(struct kiocb *iocb, struct iov_iter *from) buf->flags = PIPE_BUF_FLAG_CAN_MERGE; pipe->tmp_page
Re: [RFC PATCH] pipe: make pipe_release() deferrable.
On Fri, Aug 21, 2020 at 9:35 PM Tetsuo Handa wrote: > > Therefore, this patch tries to convert __pipe_lock() in pipe_release() to > killable, by deferring to a workqueue context when __pipe_lock_killable() > failed. I don't think this is an improvement. If somebody can delay the pipe unlock arbitrarily, you've now turned a user-visible blocking operation into blocking a workqueue instead. So it's still there, and now it possibly is much worse and blocks system_wq instead. Linus
[RFC PATCH] pipe: make pipe_release() deferrable.
syzbot is reporting hung task at pipe_write() [1], for __pipe_lock() from pipe_write() by task-A can be blocked forever waiting for handle_userfault() from copy_page_from_iter() from pipe_write() by task-B to complete and call __pipe_unlock(). Since the problem is that we can't enforce task-B to immediately complete handle_userfault() (this is effectively returning to userspace with locks held), we won't be able to avoid this hung task report unless we convert all pipe locks to killable (because khungtaskd does not warn stalling killable waits). Linus Torvalds commented that we could introduce timeout for handle_userfault(), and Andrea Arcangeli responded that too short timeout can cause problems (e.g. breaking qemu's live migration) [2], and we can't guarantee that khungtaskd's timeout is longer than timeout for multiple handle_userfault() events. Since Andrea commented that we will be able to avoid this hung task report if we convert pipe locks to killable, I tried a feasibility test [3]. While it is not difficult to make some of pipe locks killable, there are subtle or complicated locations (e.g. pipe_wait() users). syzbot already reported that even pipe_release() is subjected to this hung task report [4]. While the cause of [4] is that splice() from pipe to file hit an infinite busy loop bug after holding pipe lock, it is a sign that we have to care about __pipe_lock() in pipe_release() even if pipe_read() or pipe_write() is stalling due to page fault handling. Therefore, this patch tries to convert __pipe_lock() in pipe_release() to killable, by deferring to a workqueue context when __pipe_lock_killable() failed. (a) Do you think that we can make all pipe locks killable? (b) Do you think that we can introduce timeout for handling page faults? (c) Do you think that we can avoid page faults with pipe locks held? [1] https://syzkaller.appspot.com/bug?id=ab3d277fa3b068651edb7171a1aa4f78e5eacf78 [2] https://lkml.kernel.org/r/CAHk-=wj15SDiHjP2wPiC=ru-rrujout4aoulj6n_9pvvsxa...@mail.gmail.com [3] https://lkml.kernel.org/r/dc9b2681-3b84-eb74-8c88-3815beaff...@i-love.sakura.ne.jp [4] https://syzkaller.appspot.com/bug?id=2ccac875e85dc852911a0b5b788ada82dc0a081e Signed-off-by: Tetsuo Handa --- fs/pipe.c | 55 --- 1 file changed, 48 insertions(+), 7 deletions(-) diff --git a/fs/pipe.c b/fs/pipe.c index 60dbee457143..a64c7fc1794f 100644 --- a/fs/pipe.c +++ b/fs/pipe.c @@ -87,6 +87,11 @@ static inline void __pipe_lock(struct pipe_inode_info *pipe) mutex_lock_nested(&pipe->mutex, I_MUTEX_PARENT); } +static inline int __pipe_lock_killable(struct pipe_inode_info *pipe) +{ + return mutex_lock_killable_nested(&pipe->mutex, I_MUTEX_PARENT); +} + static inline void __pipe_unlock(struct pipe_inode_info *pipe) { mutex_unlock(&pipe->mutex); @@ -714,15 +719,12 @@ static void put_pipe_info(struct inode *inode, struct pipe_inode_info *pipe) free_pipe_info(pipe); } -static int -pipe_release(struct inode *inode, struct file *file) +/* Caller holds pipe->mutex. */ +static void do_pipe_release(struct inode *inode, struct pipe_inode_info *pipe, fmode_t f_mode) { - struct pipe_inode_info *pipe = file->private_data; - - __pipe_lock(pipe); - if (file->f_mode & FMODE_READ) + if (f_mode & FMODE_READ) pipe->readers--; - if (file->f_mode & FMODE_WRITE) + if (f_mode & FMODE_WRITE) pipe->writers--; /* Was that the last reader or writer, but not the other side? */ @@ -732,9 +734,48 @@ pipe_release(struct inode *inode, struct file *file) kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN); kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT); } +} + +struct pipe_release_data { + struct work_struct work; + struct inode *inode; + struct pipe_inode_info *pipe; + fmode_t f_mode; +}; + +static void deferred_pipe_release(struct work_struct *work) +{ + struct pipe_release_data *w = container_of(work, struct pipe_release_data, work); + struct inode *inode = w->inode; + struct pipe_inode_info *pipe = w->pipe; + + __pipe_lock(pipe); + do_pipe_release(inode, pipe, w->f_mode); __pipe_unlock(pipe); put_pipe_info(inode, pipe); + iput(inode); /* pipe_release() called ihold(inode). */ + kfree(w); +} + +static int pipe_release(struct inode *inode, struct file *file) +{ + struct pipe_inode_info *pipe = file->private_data; + struct pipe_release_data *w; + + if (likely(__pipe_lock_killable(pipe) == 0)) { + do_pipe_release(inode, pipe, file->f_mode); + __pipe_unlock(pipe); + put_pipe_info(inode, pipe); + return 0; + } + w = kmalloc(sizeof(*w), GFP_KERNEL | __GFP_NOFAIL); + ihold(inode); /* deferred_pipe_release() will call iput(inode). */ + w->inode = inode