On Tue, 2018-04-17 at 14:53 +0300, Kirill Tkhai wrote:
> Hi, Jeff,
> 
> On 17.04.2018 14:42, Jeff Layton wrote:
> > On Thu, 2018-04-05 at 14:58 +0300, Kirill Tkhai wrote:
> > > I observed the following deadlock between them:
> > > 
> > > [task 1]                          [task 2]                         [task 3]
> > > kill_fasync()                     mm_update_next_owner()           copy_process()
> > >  spin_lock_irqsave(&fa->fa_lock)   read_lock(&tasklist_lock)        write_lock_irq(&tasklist_lock)
> > >   send_sigio()                    <IRQ>                             ...
> > >    read_lock(&fown->lock)         kill_fasync()                     ...
> > >     read_lock(&tasklist_lock)      spin_lock_irqsave(&fa->fa_lock)  ...
> > > 
> > > Task 1 can't take tasklist_lock for reading, since task 3 has
> > > already expressed its wish to take the lock exclusively.
> > > Task 2 holds tasklist_lock for reading, but it can't take the spin lock.
> > > 
> > > Also, another deadlock is possible (which I haven't observed):
> > > 
> > > [task 1]                            [task 2]
> > > f_getown()                          kill_fasync()
> > >  read_lock(&f_own->lock)             spin_lock_irqsave(&fa->fa_lock,)
> > >  <IRQ>                               send_sigio()                     write_lock_irq(&f_own->lock)
> > >   kill_fasync()                       read_lock(&fown->lock)
> > >    spin_lock_irqsave(&fa->fa_lock,)
> > > 
> > > Actually, we do not need an exclusive fa->fa_lock in kill_fasync_rcu(),
> > > as it guarantees fa->fa_file->f_owner integrity only. It may seem
> > > that it used to give a task a small possibility to receive two sequential
> > > signals if there are two parallel kill_fasync() callers and the task
> > > handles the first signal quickly, but the behaviour won't become
> > > different, since there is an exclusive sighand lock in do_send_sig_info().
> > > 
> > > The patch converts fa_lock into an rwlock_t, and this fixes the two
> > > deadlocks above, as an rwlock is allowed to be taken from an interrupt
> > > handler by qrwlock design.
> > > 
> > > Signed-off-by: Kirill Tkhai <[email protected]>
> > > 
> > > I used the following program for testing:
> > > 
> > > #include <unistd.h>
> > > #include <stdlib.h>
> > > #include <signal.h>
> > > #include <fcntl.h>
> > > #include <errno.h>
> > > #include <stdio.h>
> > > 
> > > #ifndef F_SETSIG
> > > #define F_SETSIG 10
> > > #endif
> > > 
> > > void handler(int sig)
> > > {
> > > }
> > > 
> > > main()
> > > {
> > >   unsigned int flags;
> > >   int fd;
> > > 
> > >   system("echo 8 > /proc/sys/kernel/random/read_wakeup_threshold");
> > >   system("while :; do ls -R / > /dev/random 2>&1 ; echo 3 > /proc/sys/vm/drop_caches; done &");
> > > 
> > >   if (signal(SIGINT, handler) < 0) {
> > >           perror("Signal");
> > >           exit(1);
> > >   }
> > > 
> > >   fd = open("/dev/random", O_RDWR);
> > >   if (fd < 0) {
> > >           perror("Can't open");
> > >           exit(1);
> > >   }
> > > 
> > >   flags = FASYNC | fcntl(fd, F_GETFL);
> > >   if (fcntl(fd, F_SETFL, flags) < 0) {
> > >           perror("Setfl");
> > >           exit(1);
> > >   }
> > >   if (fcntl(fd, F_SETOWN, getpid())) {
> > >           perror("Setown");
> > >           exit(1);
> > >   }
> > >   if (fcntl(fd, F_SETSIG, SIGINT)) {
> > >           perror("Setsig");
> > >           exit(1);
> > >   }
> > > 
> > >   while (1)
> > >           sleep(100);
> > > }
> > > ---
> > >  fs/fcntl.c         |   15 +++++++--------
> > >  include/linux/fs.h |    2 +-
> > >  2 files changed, 8 insertions(+), 9 deletions(-)
> > > 
> > > diff --git a/fs/fcntl.c b/fs/fcntl.c
> > > index 1e97f1fda90c..780161a11f9d 100644
> > > --- a/fs/fcntl.c
> > > +++ b/fs/fcntl.c
> > > @@ -865,9 +865,9 @@ int fasync_remove_entry(struct file *filp, struct fasync_struct **fapp)
> > >           if (fa->fa_file != filp)
> > >                   continue;
> > >  
> > > -         spin_lock_irq(&fa->fa_lock);
> > > +         write_lock_irq(&fa->fa_lock);
> > >           fa->fa_file = NULL;
> > > -         spin_unlock_irq(&fa->fa_lock);
> > > +         write_unlock_irq(&fa->fa_lock);
> > >  
> > >           *fp = fa->fa_next;
> > >           call_rcu(&fa->fa_rcu, fasync_free_rcu);
> > > @@ -912,13 +912,13 @@ struct fasync_struct *fasync_insert_entry(int fd, struct file *filp, struct fasy
> > >           if (fa->fa_file != filp)
> > >                   continue;
> > >  
> > > -         spin_lock_irq(&fa->fa_lock);
> > > +         write_lock_irq(&fa->fa_lock);
> > >           fa->fa_fd = fd;
> > > -         spin_unlock_irq(&fa->fa_lock);
> > > +         write_unlock_irq(&fa->fa_lock);
> > >           goto out;
> > >   }
> > >  
> > > - spin_lock_init(&new->fa_lock);
> > > + rwlock_init(&new->fa_lock);
> > >   new->magic = FASYNC_MAGIC;
> > >   new->fa_file = filp;
> > >   new->fa_fd = fd;
> > > @@ -981,14 +981,13 @@ static void kill_fasync_rcu(struct fasync_struct *fa, int sig, int band)
> > >  {
> > >   while (fa) {
> > >           struct fown_struct *fown;
> > > -         unsigned long flags;
> > >  
> > >           if (fa->magic != FASYNC_MAGIC) {
> > >                   printk(KERN_ERR "kill_fasync: bad magic number in "
> > >                          "fasync_struct!\n");
> > >                   return;
> > >           }
> > > -         spin_lock_irqsave(&fa->fa_lock, flags);
> > > +         read_lock(&fa->fa_lock);
> > 
> > Does this need to be read_lock_irq? Why is it ok to allow interrupts
> > here?
> 
> A read-locked rwlock can be taken for reading again from IRQ context even
> if there is a writer pending, while a spin lock can't:
> 
> void queued_read_lock_slowpath(struct qrwlock *lock)
> {
>         /*
>          * Readers come here when they cannot get the lock without waiting
>          */
>         if (unlikely(in_interrupt())) {
>                 /*
>                  * Readers in interrupt context will get the lock immediately
>                  * if the writer is just waiting (not holding the lock yet),
>                  * so spin with ACQUIRE semantics until the lock is available
>                  * without waiting in the queue.
>                  */
>                 atomic_cond_read_acquire(&lock->cnts, !(VAL & _QW_LOCKED));
>                 return;
>         }
> 
> So, when we replace the spinlock with read_lock(), we don't need to disable
> IRQs anymore.
> All we need is to make write_lock always disable IRQs.

Got it, thanks.

read_lock_irq is still used in several (rather obscure) places. Does
this mean that we should do a global s/read_lock_irq/read_lock/ and
remove it? Or is it still useful to disable irqs for some read_lock
acquisitions?

> 
> > >           if (fa->fa_file) {
> > >                   fown = &fa->fa_file->f_owner;
> > >                   /* Don't send SIGURG to processes which have not set a
> > > @@ -997,7 +996,7 @@ static void kill_fasync_rcu(struct fasync_struct *fa, int sig, int band)
> > >                   if (!(sig == SIGURG && fown->signum == 0))
> > >                           send_sigio(fown, fa->fa_fd, band);
> > >           }
> > > -         spin_unlock_irqrestore(&fa->fa_lock, flags);
> > > +         read_unlock(&fa->fa_lock);
> > >           fa = rcu_dereference(fa->fa_next);
> > >   }
> > >  }
> > > diff --git a/include/linux/fs.h b/include/linux/fs.h
> > > index c6baf767619e..297e2dcd9dd2 100644
> > > --- a/include/linux/fs.h
> > > +++ b/include/linux/fs.h
> > > @@ -1250,7 +1250,7 @@ static inline int locks_lock_file_wait(struct file *filp, struct file_lock *fl)
> > >  }
> > >  
> > >  struct fasync_struct {
> > > - spinlock_t              fa_lock;
> > > + rwlock_t                fa_lock;
> > >   int                     magic;
> > >   int                     fa_fd;
> > >   struct fasync_struct    *fa_next; /* singly linked list */
> > > 
> > 
> > I've no objection to the patch in principle, but I'm not as familiar
> > with the fasync code as others here.
> 
> I took the reviewers list from MAINTAINERS and ./scripts/get_maintainer.pl;
> I don't have any idea who else should be CCed.


No worries. The patch seems sane enough to me. You can add:

Acked-by: Jeff Layton <[email protected]>

Reply via email to