On Thu, 26 Jun 2014, Austin Schuh wrote:
> If I'm reading the rt patch correctly, wq_worker_sleeping was moved
> out of __schedule to sched_submit_work.  It looks like that changes
> the conditions under which wq_worker_sleeping is called.  It used to
> be called whenever a task was going to sleep (I think).  It looks like
> it is now only called if the task is going to sleep and isn't blocked
> on a PI mutex (I think).
> 
> If I try the following experiment
> 
>  static inline void sched_submit_work(struct task_struct *tsk)
>  {
> +   if (tsk->state && tsk->flags & PF_WQ_WORKER) {
> +     wq_worker_sleeping(tsk);
> +     return;
> +   }
> 
> and then remove the call later in the function, I am able to pass my test.
> 
> Unfortunately, I then get a recursive pool spinlock BUG_ON after a
> while (as I would expect), and it all blows up.

Well, that's why the check for !pi_blocked_on is there.
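
For reference, a sketch of the ordering in question, reconstructed from the
removed lines in the first hunk below (names as in kernel/sched/core.c; not
meant to be applied):

	static inline void sched_submit_work(struct task_struct *tsk)
	{
		/*
		 * A worker blocked on a PI mutex (on RT that includes
		 * pool->lock, which is a sleeping lock) bails out here,
		 * so wq_worker_sleeping() is never called and the pool
		 * is never asked to wake up another worker.
		 */
		if (!tsk->state || tsk_is_pi_blocked(tsk))
			return;

		/*
		 * Calling this unconditionally instead, as in the
		 * experiment above, recurses: wq_worker_sleeping()
		 * currently takes pool->lock, which the worker may
		 * already be blocked on.
		 */
		if (tsk->flags & PF_WQ_WORKER)
			wq_worker_sleeping(tsk);
		...
	}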
 
> I'm not sure where to go from there.  Any changes to the worker pool to
> try to fix that will be hard, or could affect latency significantly.

Completely untested patch below.
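
Roughly: do the wq_worker_sleeping() notification before the
tsk_is_pi_blocked() bail out, which requires that wq_worker_sleeping()
no longer takes pool->lock. Access to the idle_list in the wakeup path
is then protected by disabling preemption on RT (rt_lock_idle_list())
and by pool->lock otherwise (sched_lock_idle_list()).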

Thanks,

        tglx

--------------------->
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 8749d20..621329a 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -2651,9 +2651,8 @@ need_resched:
 
 static inline void sched_submit_work(struct task_struct *tsk)
 {
-       if (!tsk->state || tsk_is_pi_blocked(tsk))
+       if (!tsk->state)
                return;
-
        /*
         * If a worker went to sleep, notify and ask workqueue whether
         * it wants to wake up a task to maintain concurrency.
@@ -2661,6 +2660,10 @@ static inline void sched_submit_work(struct task_struct *tsk)
        if (tsk->flags & PF_WQ_WORKER)
                wq_worker_sleeping(tsk);
 
+
+       if (tsk_is_pi_blocked(tsk))
+               return;
+
        /*
         * If we are going to sleep and we have plugged IO queued,
         * make sure to submit it to avoid deadlocks.
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index be0ef50..0ca9d97 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -126,6 +126,8 @@ enum {
  *    cpu or grabbing pool->lock is enough for read access.  If
  *    POOL_DISASSOCIATED is set, it's identical to L.
  *
+ *    On RT we need the extra protection via rt_lock_idle_list()
+ *
  * MG: pool->manager_mutex and pool->lock protected.  Writes require both
  *     locks.  Reads can happen under either lock.
  *
@@ -409,6 +411,31 @@ static void copy_workqueue_attrs(struct workqueue_attrs *to,
                if (({ assert_rcu_or_wq_mutex(wq); false; })) { }       \
                else
 
+#ifdef CONFIG_PREEMPT_RT_BASE
+static inline void rt_lock_idle_list(struct worker_pool *pool)
+{
+       preempt_disable();
+}
+static inline void rt_unlock_idle_list(struct worker_pool *pool)
+{
+       preempt_enable();
+}
+static inline void sched_lock_idle_list(struct worker_pool *pool) { }
+static inline void sched_unlock_idle_list(struct worker_pool *pool) { }
+#else
+static inline void rt_lock_idle_list(struct worker_pool *pool) { }
+static inline void rt_unlock_idle_list(struct worker_pool *pool) { }
+static inline void sched_lock_idle_list(struct worker_pool *pool)
+{
+       spin_lock_irq(&pool->lock);
+}
+static inline void sched_unlock_idle_list(struct worker_pool *pool)
+{
+       spin_unlock_irq(&pool->lock);
+}
+#endif
+
+
 #ifdef CONFIG_DEBUG_OBJECTS_WORK
 
 static struct debug_obj_descr work_debug_descr;
@@ -801,10 +828,16 @@ static struct worker *first_worker(struct worker_pool *pool)
  */
 static void wake_up_worker(struct worker_pool *pool)
 {
-       struct worker *worker = first_worker(pool);
+       struct worker *worker;
+
+       rt_lock_idle_list(pool);
+
+       worker = first_worker(pool);
 
        if (likely(worker))
                wake_up_process(worker->task);
+
+       rt_unlock_idle_list(pool);
 }
 
 /**
@@ -832,7 +865,7 @@ void wq_worker_running(struct task_struct *task)
  */
 void wq_worker_sleeping(struct task_struct *task)
 {
-       struct worker *next, *worker = kthread_data(task);
+       struct worker *worker = kthread_data(task);
        struct worker_pool *pool;
 
        /*
@@ -849,25 +882,18 @@ void wq_worker_sleeping(struct task_struct *task)
                return;
 
        worker->sleeping = 1;
-       spin_lock_irq(&pool->lock);
+
        /*
         * The counterpart of the following dec_and_test, implied mb,
         * worklist not empty test sequence is in insert_work().
         * Please read comment there.
-        *
-        * NOT_RUNNING is clear.  This means that we're bound to and
-        * running on the local cpu w/ rq lock held and preemption
-        * disabled, which in turn means that none else could be
-        * manipulating idle_list, so dereferencing idle_list without pool
-        * lock is safe.
         */
        if (atomic_dec_and_test(&pool->nr_running) &&
            !list_empty(&pool->worklist)) {
-               next = first_worker(pool);
-               if (next)
-                       wake_up_process(next->task);
+               sched_lock_idle_list(pool);
+               wake_up_worker(pool);
+               sched_unlock_idle_list(pool);
        }
-       spin_unlock_irq(&pool->lock);
 }
 
 /**
@@ -1571,7 +1597,9 @@ static void worker_enter_idle(struct worker *worker)
        worker->last_active = jiffies;
 
        /* idle_list is LIFO */
+       rt_lock_idle_list(pool);
        list_add(&worker->entry, &pool->idle_list);
+       rt_unlock_idle_list(pool);
 
        if (too_many_workers(pool) && !timer_pending(&pool->idle_timer))
                mod_timer(&pool->idle_timer, jiffies + IDLE_WORKER_TIMEOUT);
@@ -1604,7 +1632,9 @@ static void worker_leave_idle(struct worker *worker)
                return;
        worker_clr_flags(worker, WORKER_IDLE);
        pool->nr_idle--;
+       rt_lock_idle_list(pool);
        list_del_init(&worker->entry);
+       rt_unlock_idle_list(pool);
 }
 
 /**
@@ -1849,7 +1879,9 @@ static void destroy_worker(struct worker *worker)
         */
        get_task_struct(worker->task);
 
+       rt_lock_idle_list(pool);
        list_del_init(&worker->entry);
+       rt_unlock_idle_list(pool);
        worker->flags |= WORKER_DIE;
 
        idr_remove(&pool->worker_idr, worker->id);