On Fri, 27 Jun 2014 14:57:36 +0200
Mike Galbraith <umgwanakikb...@gmail.com> wrote:

> On Thu, 2014-06-26 at 17:07 -0700, Austin Schuh wrote:
> 
> > I'm not sure where to go from there.  Any changes to the workpool to
> > try to fix that will be hard, or could affect latency significantly.
> 
> Oh what the hell, I'm out of frozen shark, may as well stock up.  Nobody
> else has posted spit yet.  I _know_ Steven has some shark, I saw tglx
> toss him a chunk.
> 
> It's the same root cause as -rt letting tasks schedule off with plugged
> IO, leading to deadlock scenarios.  Extending the way I dealt with that
> to deal with workqueue forward progress requirements.. works.. though it
> could perhaps use a face lift, tummy tuck.. and minor body-ectomy.

Yeah, I'd say ;-)

> 
> Subject: rtmutex: move pre/post schedule() progress guarantees to before we schedule
> 
> Queued IO can lead to IO deadlock should a task require wakeup from a task
> which is blocked on that queued IO, which is why we usually pull the plug
> while scheduling off.  In RT, pulling the plug could lead us to block on
> a contended sleeping lock while n the process of blocking.  Bad idea, so

"in the process"

> we don't, but that leaves us with various flavors of IO stall like below.
> 
> ext3: dbench1 queues a buffer, blocks on journal mutex, its plug is not
> pulled.  The mutex owner, dbench2, is waiting for kjournald, which is
> waiting for the buffer queued by dbench1.  Game over.
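
So the cycle, if I'm reading this right, is:

	dbench1   -> journal mutex (held by dbench2)
	dbench2   -> kjournald
	kjournald -> buffer sitting on dbench1's still-plugged list

and dbench1 is asleep, so nothing ever flushes that plug.
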
> 
> The exact same situation can occur with workqueues.  We must notify the
> workqueue management that a worker is blocking, because if we don't, we
> can lock the box up completely.  It's too late to do that once we have
> arrived in schedule(), as we can't take a sleeping lock.
> 
> Therefore, move progress guarantee work up to before we reach the point of
> no return, and tell the scheduler that we're handling it early.
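
So the resulting flow, as I read the patch below, is:

	rt_mutex_slowlock() / rt_spin_lock_slowlock()
	  rt_mutex_presched()       /* sets ->rtmutex_presched,
	                               wq_worker_sleeping(), flushes plug */
	  ...
	  schedule()
	    sched_presched_work()   /* sees ->rtmutex_presched, bails */
	    __schedule()
	    sched_postsched_work()  /* ditto */
	  ...
	  rt_mutex_postsched()      /* wq_worker_running(), clears flag */
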
> 
> Signed-off-by: Mike Galbraith <umgwanakikb...@gmail.com>
> ---
>  include/linux/sched.h    |    2 +
>  kernel/locking/rtmutex.c |   59 +++++++++++++++++++++++++++++++++++++++++++----
>  kernel/sched/core.c      |   19 +++++++++++----
>  3 files changed, 72 insertions(+), 8 deletions(-)
> 
> --- a/include/linux/sched.h
> +++ b/include/linux/sched.h
> @@ -1256,6 +1256,8 @@ struct task_struct {
>       /* Revert to default priority/policy when forking */
>       unsigned sched_reset_on_fork:1;
>       unsigned sched_contributes_to_load:1;
> +     unsigned sched_presched:1;
> +     unsigned rtmutex_presched:1;
>  
>       pid_t pid;
>       pid_t tgid;
> --- a/kernel/locking/rtmutex.c
> +++ b/kernel/locking/rtmutex.c
> @@ -23,6 +23,7 @@
>  #include <linux/sched/deadline.h>
>  #include <linux/timer.h>
>  #include <linux/ww_mutex.h>
> +#include <linux/blkdev.h>
>  
>  #include "rtmutex_common.h"
>  
> @@ -782,6 +783,41 @@ void rt_mutex_adjust_pi(struct task_stru
>  }
>  
>  #ifdef CONFIG_PREEMPT_RT_FULL
> +#include "../workqueue_internal.h"
> +
> +static inline void rt_mutex_presched(struct task_struct *tsk)
> +{
> +     /* Recursive handling of presched work is a very bad idea. */

The above comment can be simplified to just:

        /* Recursion protection */

> +     if (tsk->rtmutex_presched || tsk->sched_presched)
> +             return;
> +
> +     /* Tell the scheduler we handle pre/post schedule() work. */
> +     tsk->rtmutex_presched = 1;
> +
> +     /*
> +      * If a worker went to sleep, notify and ask workqueue whether
> +      * it wants to wake up a task to maintain concurrency.
> +      */
> +     if (tsk->flags & PF_WQ_WORKER)
> +             wq_worker_sleeping(tsk);
> +
> +     /*
> +      * If we are going to sleep and we have plugged IO queued,
> +      * make sure to submit it to avoid deadlocks.
> +      */
> +     if (blk_needs_flush_plug(tsk))
> +             blk_schedule_flush_plug(tsk);
> +}
> +
> +static inline void rt_mutex_postsched(struct task_struct *tsk)
> +{
> +     if (!tsk->rtmutex_presched)
> +             return;
> +     if (tsk->flags & PF_WQ_WORKER)
> +             wq_worker_running(tsk);
> +     tsk->rtmutex_presched = 0;
> +}
> +
>  /*
>   * preemptible spin_lock functions:
>   */
> @@ -857,13 +893,23 @@ static void  noinline __sched rt_spin_lo
>       struct rt_mutex_waiter waiter, *top_waiter;
>       int ret;
>  
> +     /*
> +      * We can't do presched work if we're already holding a lock
> +      * else we can deadlock.  eg, if we're holding slab_lock,
> +      * ksoftirqd can block while processing BLOCK_SOFTIRQ after
> +      * having acquired q->queue_lock.  If _we_ then block on
> +      * that q->queue_lock while flushing our plug, deadlock.
> +      */
> +     if (__migrate_disabled(self) < 2)
> +             rt_mutex_presched(self);
> +
>       rt_mutex_init_waiter(&waiter, true);
>  
>       raw_spin_lock(&lock->wait_lock);
>  
>       if (__try_to_take_rt_mutex(lock, self, NULL, STEAL_LATERAL)) {
>               raw_spin_unlock(&lock->wait_lock);
> -             return;
> +             goto out;
>       }
>  
>       BUG_ON(rt_mutex_owner(lock) == self);
> @@ -928,6 +974,8 @@ static void  noinline __sched rt_spin_lo
>       raw_spin_unlock(&lock->wait_lock);
>  
>       debug_rt_mutex_free_waiter(&waiter);
> +out:
> +     rt_mutex_postsched(self);
>  }
>  
>  /*
> @@ -1274,18 +1322,20 @@ rt_mutex_slowlock(struct rt_mutex *lock,
>                 int detect_deadlock, struct ww_acquire_ctx *ww_ctx)
>  {
>       struct rt_mutex_waiter waiter;
> +     struct task_struct *self = current;
>       int ret = 0;
>  
> +     rt_mutex_presched(self);
>       rt_mutex_init_waiter(&waiter, false);
>  
>       raw_spin_lock(&lock->wait_lock);
>  
>       /* Try to acquire the lock again: */
> -     if (try_to_take_rt_mutex(lock, current, NULL)) {
> +     if (try_to_take_rt_mutex(lock, self, NULL)) {
>               if (ww_ctx)
>                       ww_mutex_account_lock(lock, ww_ctx);
>               raw_spin_unlock(&lock->wait_lock);
> -             return 0;
> +             goto out;
>       }
>  
>       set_current_state(state);
> @@ -1322,7 +1372,8 @@ rt_mutex_slowlock(struct rt_mutex *lock,
>               hrtimer_cancel(&timeout->timer);
>  
>       debug_rt_mutex_free_waiter(&waiter);
> -
> +out:
> +     rt_mutex_postsched(self);
>       return ret;
>  }
>  
> --- a/kernel/sched/core.c
> +++ b/kernel/sched/core.c
> @@ -2915,11 +2915,18 @@ static void __sched __schedule(void)
>               goto need_resched;
>  }
>  
> -static inline void sched_submit_work(struct task_struct *tsk)
> +static inline void sched_presched_work(struct task_struct *tsk)
>  {
> +     /* It's being handled by rtmutex code */
> +     if (tsk->rtmutex_presched)
> +             return;
> +
>       if (!tsk->state || tsk_is_pi_blocked(tsk))
>               return;
>  
> +     /* Tell rtmutex presched code that we're handling it. */
> +     tsk->sched_presched = 1;
> +
>       /*
>        * If a worker went to sleep, notify and ask workqueue whether
>        * it wants to wake up a task to maintain concurrency.
> @@ -2935,19 +2942,23 @@ static inline void sched_submit_work(str
>               blk_schedule_flush_plug(tsk);
>  }
>  
> -static inline void sched_update_worker(struct task_struct *tsk)
> +static inline void sched_postsched_work(struct task_struct *tsk)
>  {
> +     /* It's being handled by rtmutex code */
> +     if (tsk->rtmutex_presched)
> +             return;
>       if (tsk->flags & PF_WQ_WORKER)
>               wq_worker_running(tsk);
> +     tsk->sched_presched = 0;
>  }
>  
>  asmlinkage void __sched schedule(void)
>  {
>       struct task_struct *tsk = current;
>  
> -     sched_submit_work(tsk);
> +     sched_presched_work(tsk);
>       __schedule();
> -     sched_update_worker(tsk);
> +     sched_postsched_work(tsk);
>  }

This seems like a lot of hacks. I'm wondering if it would work if we
just have rt_spin_lock_slowlock not call schedule(), but call
__schedule() directly. I mean, it would keep with the mainline paradigm,
as spinlocks don't sleep there, and one going to sleep in the -rt
kernel is similar to it being preempted by a very long NMI.

Does a spin_lock going to sleep really need to do all the presched and
postsched work?
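
Something like this completely untested sketch (it would mean making
__schedule() callable from rtmutex.c, and the loop details are from
memory, so take it with a grain of salt):

	static void noinline __sched rt_spin_lock_slowlock(struct rt_mutex *lock)
	{
		...
		for (;;) {
			if (__try_to_take_rt_mutex(lock, self, &waiter,
						   STEAL_LATERAL))
				break;
			...
			raw_spin_unlock(&lock->wait_lock);

			/*
			 * Sleep without any presched/postsched work: a
			 * task blocked on a "spinlock" is conceptually
			 * still running, as in mainline, so don't flush
			 * plugs or poke the workqueue code at all.
			 */
			__schedule();

			raw_spin_lock(&lock->wait_lock);
			...
		}
		...
	}
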


-- Steve



>  EXPORT_SYMBOL(schedule);
>  
> 
