A crash happened while I was playing with deadline PI rtmutex.

    BUG: unable to handle kernel NULL pointer dereference at 0000000000000018
    IP: [<ffffffff810eeb8f>] rt_mutex_get_top_task+0x1f/0x30
    PGD 232a75067 PUD 230947067 PMD 0
    Oops: 0000 [#1] SMP
    CPU: 1 PID: 10994 Comm: a.out Not tainted

    Call Trace:
    [<ffffffff810cf8aa>] ? enqueue_task_dl+0x2a/0x320
    [<ffffffff810b658c>] enqueue_task+0x2c/0x80
    [<ffffffff810ba763>] activate_task+0x23/0x30
    [<ffffffff810d0ab5>] pull_dl_task+0x1d5/0x260
    [<ffffffff810d0be6>] pre_schedule_dl+0x16/0x20
    [<ffffffff8164e783>] __schedule+0xd3/0x900
    [<ffffffff8164efd9>] schedule+0x29/0x70
    [<ffffffff8165035b>] __rt_mutex_slowlock+0x4b/0xc0
    [<ffffffff81650501>] rt_mutex_slowlock+0xd1/0x190
    [<ffffffff810eeb33>] rt_mutex_timed_lock+0x53/0x60
    [<ffffffff810ecbfc>] futex_lock_pi.isra.18+0x28c/0x390
    [<ffffffff810cfa15>] ? enqueue_task_dl+0x195/0x320
    [<ffffffff810d0bac>] ? prio_changed_dl+0x6c/0x90
    [<ffffffff810ed8b0>] do_futex+0x190/0x5b0
    [<ffffffff810edd50>] SyS_futex+0x80/0x180
    [<ffffffff8165a089>] system_call_fastpath+0x16/0x1b
    RIP  [<ffffffff810eeb8f>] rt_mutex_get_top_task+0x1f/0x30

This is because rt_mutex_enqueue_pi() and rt_mutex_dequeue_pi()
are only protected by pi_lock when operating pi waiters, while
rt_mutex_get_top_task()(now replaced by rt_mutex_get_top_waiter()
by previous patches, same issue), will access them with rq lock
held but not holding pi_lock.

In order to tackle it, we introduce new "pi_waiters_leftmost_copy"
in task_struct, and add a new function named rt_mutex_update_copy()
to do the copy, it can be called by rt_mutex_setprio() which held
owner's pi_lock, rq lock, and rt_mutex lock. But one exception is
that rt_mutex_setprio() can be called without rtmutex locked after
mark_wakeup_next_waiter() by rt_mutex_adjust_prio(), so we need a
new function in mark_wakeup_next_waiter() to lock rq first then
call rt_mutex_update_copy().

We avoid adding new logic by calling __rt_mutex_adjust_prio() directly
in mark_wakeup_next_waiter()and remove the old rt_mutex_adjust_prio()
outside the lock. Since we moved the deboost point, in order to avoid
current process to be preempted(due to deboost earlier) before finishing
wake_up_q(), we also move preempt_disable() before unlocking rtmutex.

Originally-From: Peter Zijlstra <[email protected]>
Signed-off-by: Xunlei Pang <[email protected]>
---
 include/linux/init_task.h      |  3 ++-
 include/linux/sched.h          |  3 +++
 include/linux/sched/deadline.h |  2 ++
 kernel/fork.c                  |  1 +
 kernel/locking/rtmutex.c       | 61 +++++++++++++++++-------------------------
 kernel/sched/core.c            |  2 ++
 6 files changed, 35 insertions(+), 37 deletions(-)

diff --git a/include/linux/init_task.h b/include/linux/init_task.h
index f2cb8d4..7be5a83 100644
--- a/include/linux/init_task.h
+++ b/include/linux/init_task.h
@@ -162,7 +162,8 @@ extern struct task_group root_task_group;
 #ifdef CONFIG_RT_MUTEXES
 # define INIT_RT_MUTEXES(tsk)                                          \
        .pi_waiters = RB_ROOT,                                          \
-       .pi_waiters_leftmost = NULL,
+       .pi_waiters_leftmost = NULL,                                    \
+       .pi_waiters_leftmost_copy = NULL,
 #else
 # define INIT_RT_MUTEXES(tsk)
 #endif
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 45e848c..332a6b5 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1621,7 +1621,10 @@ struct task_struct {
 #ifdef CONFIG_RT_MUTEXES
        /* PI waiters blocked on a rt_mutex held by this task */
        struct rb_root pi_waiters;
+       /* Updated under pi_lock and rtmutex lock */
        struct rb_node *pi_waiters_leftmost;
+       /* Updated under pi_lock, rq_lock, and rtmutex lock */
+       struct rb_node *pi_waiters_leftmost_copy;
        /* Deadlock detection and priority inheritance handling */
        struct rt_mutex_waiter *pi_blocked_on;
 #endif
diff --git a/include/linux/sched/deadline.h b/include/linux/sched/deadline.h
index 9089a2a..e8304d4 100644
--- a/include/linux/sched/deadline.h
+++ b/include/linux/sched/deadline.h
@@ -26,4 +26,6 @@ static inline bool dl_time_before(u64 a, u64 b)
        return (s64)(a - b) < 0;
 }
 
+extern void rt_mutex_update_copy(struct task_struct *p);
+
 #endif /* _SCHED_DEADLINE_H */
diff --git a/kernel/fork.c b/kernel/fork.c
index a453546..4b847e4 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1225,6 +1225,7 @@ static void rt_mutex_init_task(struct task_struct *p)
 #ifdef CONFIG_RT_MUTEXES
        p->pi_waiters = RB_ROOT;
        p->pi_waiters_leftmost = NULL;
+       p->pi_waiters_leftmost_copy = NULL;
        p->pi_blocked_on = NULL;
 #endif
 }
diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c
index 42bc59b..00c6560 100644
--- a/kernel/locking/rtmutex.c
+++ b/kernel/locking/rtmutex.c
@@ -273,10 +273,11 @@ int rt_mutex_getprio(struct task_struct *task)
 
 struct task_struct *rt_mutex_get_top_task(struct task_struct *task)
 {
-       if (likely(!task_has_pi_waiters(task)))
+       if (!task->pi_waiters_leftmost_copy)
                return NULL;
 
-       return task_top_pi_waiter(task)->task;
+       return rb_entry(task->pi_waiters_leftmost_copy,
+                               struct rt_mutex_waiter, pi_tree_entry)->task;
 }
 
 /*
@@ -285,12 +286,20 @@ struct task_struct *rt_mutex_get_top_task(struct 
task_struct *task)
  */
 int rt_mutex_get_effective_prio(struct task_struct *task, int newprio)
 {
-       if (!task_has_pi_waiters(task))
+       struct task_struct *top_task = rt_mutex_get_top_task(task);
+
+       if (!top_task)
                return newprio;
 
-       if (task_top_pi_waiter(task)->task->prio <= newprio)
-               return task_top_pi_waiter(task)->task->prio;
-       return newprio;
+       return min(top_task->prio, newprio);
+}
+
+/*
+ * Must hold rq lock, p's pi_lock, and rt_mutex lock.
+ */
+void rt_mutex_update_copy(struct task_struct *p)
+{
+       p->pi_waiters_leftmost_copy = p->pi_waiters_leftmost;
 }
 
 /*
@@ -307,24 +316,6 @@ static void __rt_mutex_adjust_prio(struct task_struct 
*task)
 }
 
 /*
- * Adjust task priority (undo boosting). Called from the exit path of
- * rt_mutex_slowunlock() and rt_mutex_slowlock().
- *
- * (Note: We do this outside of the protection of lock->wait_lock to
- * allow the lock to be taken while or before we readjust the priority
- * of task. We do not use the spin_xx_mutex() variants here as we are
- * outside of the debug path.)
- */
-void rt_mutex_adjust_prio(struct task_struct *task)
-{
-       unsigned long flags;
-
-       raw_spin_lock_irqsave(&task->pi_lock, flags);
-       __rt_mutex_adjust_prio(task);
-       raw_spin_unlock_irqrestore(&task->pi_lock, flags);
-}
-
-/*
  * Deadlock detection is conditional:
  *
  * If CONFIG_DEBUG_RT_MUTEXES=n, deadlock detection is only conducted
@@ -987,6 +978,7 @@ static void mark_wakeup_next_waiter(struct wake_q_head 
*wake_q,
         * lock->wait_lock.
         */
        rt_mutex_dequeue_pi(current, waiter);
+       __rt_mutex_adjust_prio(current);
 
        /*
         * As we are waking up the top waiter, and the waiter stays
@@ -1325,6 +1317,15 @@ static bool __sched rt_mutex_slowunlock(struct rt_mutex 
*lock,
         */
        mark_wakeup_next_waiter(wake_q, lock);
 
+       /*
+        * We should deboost before waking the high-prio task such that
+        * we don't run two tasks with the 'same' priority. This however
+        * can lead to prio-inversion if we would get preempted after
+        * the deboost but before waking our high-prio task, hence the
+        * preempt_disable before unlock.
+        */
+       preempt_disable();
+
        raw_spin_unlock_irqrestore(&lock->wait_lock, flags);
 
        /* check PI boosting */
@@ -1400,18 +1401,6 @@ rt_mutex_fastunlock(struct rt_mutex *lock,
  */
 void rt_mutex_postunlock(struct wake_q_head *wake_q, bool deboost)
 {
-       /*
-        * We should deboost before waking the high-prio task such that
-        * we don't run two tasks with the 'same' priority. This however
-        * can lead to prio-inversion if we would get preempted after
-        * the deboost but before waking our high-prio task, hence the
-        * preempt_disable.
-        */
-       if (deboost) {
-               preempt_disable();
-               rt_mutex_adjust_prio(current);
-       }
-
        wake_up_q(wake_q);
 
        if (deboost)
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 1159423..4a2c79d 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -3480,6 +3480,8 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
                goto out_unlock;
        }
 
+       rt_mutex_update_copy(p);
+
        trace_sched_pi_setprio(p, prio);
        oldprio = p->prio;
 
-- 
1.8.3.1

Reply via email to