From: Thomas Gleixner <[email protected]>

The boosted priority is reverted after the rtmutex unlock, but before the
futex_hash_bucket (hb) lock has been released. The woken waiter immediately
blocks on the hb lock, which on RT is a sleeping spinlock, and thereby boosts
the previous owner again. The result is that we boost the task, deboost the
task, boost it again for the hb lock and deboost it once more. A sched trace
of this scenario looks like the following:

| med_prio-93  sched_wakeup: comm=high_prio pid=92 prio=9 success=1 target_cpu=000
| med_prio-93  sched_switch: prev_comm=med_prio prev_pid=93 prev_prio=29 prev_state=R ==> next_comm=high_prio next_pid=92 next_prio=9
|high_prio-92  sched_pi_setprio: comm=low_prio pid=91 oldprio=120 newprio=9
|high_prio-92  sched_switch: prev_comm=high_prio prev_pid=92 prev_prio=9 prev_state=S ==> next_comm=low_prio next_pid=91 next_prio=9
| low_prio-91  sched_wakeup: comm=high_prio pid=92 prio=9 success=1 target_cpu=000
| low_prio-91  sched_pi_setprio: comm=low_prio pid=91 oldprio=9 newprio=120
| low_prio-91  sched_switch: prev_comm=low_prio prev_pid=91 prev_prio=120 prev_state=R+ ==> next_comm=high_prio next_pid=92 next_prio=9
|high_prio-92  sched_pi_setprio: comm=low_prio pid=91 oldprio=120 newprio=9
|high_prio-92  sched_switch: prev_comm=high_prio prev_pid=92 prev_prio=9 prev_state=D ==> next_comm=low_prio next_pid=91 next_prio=9
| low_prio-91  sched_wakeup: comm=high_prio pid=92 prio=9 success=1 target_cpu=000
| low_prio-91  sched_pi_setprio: comm=low_prio pid=91 oldprio=9 newprio=120
| low_prio-91  sched_switch: prev_comm=low_prio prev_pid=91 prev_prio=120 prev_state=R+ ==> next_comm=high_prio next_pid=92 next_prio=9
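
The cause is the ordering in the pre-patch unlock path: the deboost happens
inside rt_mutex_unlock(), i.e. while the hb lock is still held. Condensed (a
sketch pieced together from the lines removed below, not the literal code):

	/* futex_unlock_pi(), before the patch: */
	spin_lock(&hb->lock);
	wake_futex_pi(uaddr, uval, match);
	  -> rt_mutex_unlock(&pi_state->pi_mutex);
	       /* wakes the top waiter, then deboosts current */
	spin_unlock(&hb->lock);
	  /* the woken waiter needs the hb lock - a sleeping spinlock
	   * on RT - so it boosts current again until this unlock */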

We see four sched_pi_setprio() invocations, but ideally two would be enough.
The patch avoids the double wakeup by deferring the deboost until the hb lock
has been released. The same test case with the patch applied:

| med_prio-21  sched_wakeup: comm=high_prio pid=20 prio=9 success=1 target_cpu=000
| med_prio-21  sched_switch: prev_comm=med_prio prev_pid=21 prev_prio=29 prev_state=R ==> next_comm=high_prio next_pid=20 next_prio=9
|high_prio-20  sched_pi_setprio: comm=low_prio pid=19 oldprio=120 newprio=9
|high_prio-20  sched_switch: prev_comm=high_prio prev_pid=20 prev_prio=9 prev_state=S ==> next_comm=low_prio next_pid=19 next_prio=9
| low_prio-19  sched_wakeup: comm=high_prio pid=20 prio=9 success=1 target_cpu=000
| low_prio-19  sched_pi_setprio: comm=low_prio pid=19 oldprio=9 newprio=120
| low_prio-19  sched_switch: prev_comm=low_prio prev_pid=19 prev_prio=120 prev_state=R+ ==> next_comm=high_prio next_pid=20 next_prio=9

Now there are only two sched_pi_setprio() invocations, as one would expect
and as seen without -RT.
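
With the patch the tail of wake_futex_pi() performs the unlock, drops the hb
lock and only then deboosts, so the woken waiter never finds the hb lock held
by a boosted owner. In straight-line form (this just restates the
wake_futex_pi() hunk below):

	deboost = rt_mutex_futex_unlock(&pi_state->pi_mutex);
				/* wake the top waiter, no deboost yet */
	spin_unlock(&hb->lock);	/* waiter can now take the hb lock */
	if (deboost)
		rt_mutex_adjust_prio(current);	/* single deboost */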

Signed-off-by: Thomas Gleixner <[email protected]>
Signed-off-by: Sebastian Andrzej Siewior <[email protected]>
---
 kernel/futex.c                  | 32 +++++++++++++++++++++++++++++---
 kernel/locking/rtmutex.c        | 39 ++++++++++++++++++++++++++++-----------
 kernel/locking/rtmutex_common.h |  4 ++++
 3 files changed, 61 insertions(+), 14 deletions(-)
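
Not part of the patch, for reference only: the traced scenario can be
approximated from user space with a PTHREAD_PRIO_INHERIT mutex, which glibc
implements via FUTEX_LOCK_PI/FUTEX_UNLOCK_PI. This is a hypothetical sketch,
not the actual test program behind the traces above. The priorities mirror
the trace (kernel prio 9/29 <-> SCHED_FIFO 90/70, prio 120 <-> SCHED_OTHER).
Run as root, pinned to one CPU (e.g. taskset -c 0), with the sched trace
events enabled:

	#define _GNU_SOURCE
	#include <pthread.h>
	#include <sched.h>
	#include <unistd.h>

	static pthread_mutex_t m;

	static void set_fifo(int prio)
	{
		struct sched_param sp = { .sched_priority = prio };

		/* Needs root; error handling omitted for brevity. */
		pthread_setschedparam(pthread_self(), SCHED_FIFO, &sp);
	}

	static void *low_fn(void *arg)	/* stays SCHED_OTHER: prio 120 */
	{
		(void)arg;
		pthread_mutex_lock(&m);
		usleep(100 * 1000);	/* hold; high_fn blocks and boosts us */
		pthread_mutex_unlock(&m); /* FUTEX_UNLOCK_PI: the path patched here */
		return NULL;
	}

	static void *med_fn(void *arg)	/* SCHED_FIFO 70: prio 29 */
	{
		(void)arg;
		set_fifo(70);
		usleep(5 * 1000);	/* let low_fn grab the mutex first */
		for (volatile long i = 0; i < 100000000L; i++)
			;		/* busy: would preempt an unboosted low_fn */
		return NULL;
	}

	static void *high_fn(void *arg)	/* SCHED_FIFO 90: prio 9 */
	{
		(void)arg;
		set_fifo(90);
		usleep(10 * 1000);
		pthread_mutex_lock(&m);	/* FUTEX_LOCK_PI: boosts low_fn */
		pthread_mutex_unlock(&m);
		return NULL;
	}

	int main(void)
	{
		pthread_mutexattr_t ma;
		pthread_t low, med, high;

		pthread_mutexattr_init(&ma);
		pthread_mutexattr_setprotocol(&ma, PTHREAD_PRIO_INHERIT);
		pthread_mutex_init(&m, &ma);

		pthread_create(&low, NULL, low_fn, NULL);
		pthread_create(&med, NULL, med_fn, NULL);
		pthread_create(&high, NULL, high_fn, NULL);

		pthread_join(low, NULL);
		pthread_join(med, NULL);
		pthread_join(high, NULL);
		return 0;
	}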

diff --git a/kernel/futex.c b/kernel/futex.c
index 2579e407ff67..b38abe3573a8 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -1122,11 +1122,13 @@ static void wake_futex(struct futex_q *q)
        put_task_struct(p);
 }
 
-static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this)
+static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this,
+                        struct futex_hash_bucket *hb)
 {
        struct task_struct *new_owner;
        struct futex_pi_state *pi_state = this->pi_state;
        u32 uninitialized_var(curval), newval;
+       bool deboost;
        int ret = 0;
 
        if (!pi_state)
@@ -1178,7 +1180,17 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this)
        raw_spin_unlock_irq(&new_owner->pi_lock);
 
        raw_spin_unlock(&pi_state->pi_mutex.wait_lock);
-       rt_mutex_unlock(&pi_state->pi_mutex);
+
+       deboost = rt_mutex_futex_unlock(&pi_state->pi_mutex);
+
+       /*
+        * We deboost after dropping hb->lock. That prevents a double
+        * wakeup on RT.
+        */
+       spin_unlock(&hb->lock);
+
+       if (deboost)
+               rt_mutex_adjust_prio(current);
 
        return 0;
 }
@@ -2412,13 +2424,26 @@ static int futex_unlock_pi(u32 __user *uaddr, unsigned int flags)
         */
        match = futex_top_waiter(hb, &key);
        if (match) {
-               ret = wake_futex_pi(uaddr, uval, match);
+               ret = wake_futex_pi(uaddr, uval, match, hb);
+
+               /*
+                * In case of success wake_futex_pi dropped the hash
+                * bucket lock.
+                */
+               if (!ret)
+                       goto out_putkey;
+
                /*
                 * The atomic access to the futex value generated a
                 * pagefault, so retry the user-access and the wakeup:
                 */
                if (ret == -EFAULT)
                        goto pi_faulted;
+
+               /*
+                * wake_futex_pi has detected invalid state. Tell user
+                * space.
+                */
                goto out_unlock;
        }
 
@@ -2439,6 +2464,7 @@ static int futex_unlock_pi(u32 __user *uaddr, unsigned int flags)
 
 out_unlock:
        spin_unlock(&hb->lock);
+out_putkey:
        put_futex_key(&key);
        return ret;
 
diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c
index b73279367087..9d858106ba0c 100644
--- a/kernel/locking/rtmutex.c
+++ b/kernel/locking/rtmutex.c
@@ -298,7 +298,7 @@ static void __rt_mutex_adjust_prio(struct task_struct *task)
  * of task. We do not use the spin_xx_mutex() variants here as we are
  * outside of the debug path.)
  */
-static void rt_mutex_adjust_prio(struct task_struct *task)
+void rt_mutex_adjust_prio(struct task_struct *task)
 {
        unsigned long flags;
 
@@ -955,8 +955,8 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock,
 /*
  * Wake up the next waiter on the lock.
  *
- * Remove the top waiter from the current tasks pi waiter list and
- * wake it up.
+ * Remove the top waiter from the current task's pi waiter list, take a
+ * reference and return a pointer to it.
  *
  * Called with lock->wait_lock held.
  */
@@ -1253,7 +1253,7 @@ static inline int rt_mutex_slowtrylock(struct rt_mutex *lock)
 /*
  * Slow path to release a rt-mutex:
  */
-static void __sched
+static bool __sched
 rt_mutex_slowunlock(struct rt_mutex *lock)
 {
        raw_spin_lock(&lock->wait_lock);
@@ -1296,7 +1296,7 @@ rt_mutex_slowunlock(struct rt_mutex *lock)
        while (!rt_mutex_has_waiters(lock)) {
                /* Drops lock->wait_lock ! */
                if (unlock_rt_mutex_safe(lock) == true)
-                       return;
+                       return false;
                /* Relock the rtmutex and try again */
                raw_spin_lock(&lock->wait_lock);
        }
@@ -1309,8 +1309,7 @@ rt_mutex_slowunlock(struct rt_mutex *lock)
 
        raw_spin_unlock(&lock->wait_lock);
 
-       /* Undo pi boosting if necessary: */
-       rt_mutex_adjust_prio(current);
+       return true;
 }
 
 /*
@@ -1361,12 +1360,14 @@ rt_mutex_fasttrylock(struct rt_mutex *lock,
 
 static inline void
 rt_mutex_fastunlock(struct rt_mutex *lock,
-                   void (*slowfn)(struct rt_mutex *lock))
+                   bool (*slowfn)(struct rt_mutex *lock))
 {
-       if (likely(rt_mutex_cmpxchg(lock, current, NULL)))
+       if (likely(rt_mutex_cmpxchg(lock, current, NULL))) {
                rt_mutex_deadlock_account_unlock(current);
-       else
-               slowfn(lock);
+       } else if (slowfn(lock)) {
+               /* Undo pi boosting if necessary: */
+               rt_mutex_adjust_prio(current);
+       }
 }
 
 /**
@@ -1461,6 +1462,22 @@ void __sched rt_mutex_unlock(struct rt_mutex *lock)
 EXPORT_SYMBOL_GPL(rt_mutex_unlock);
 
 /**
+ * rt_mutex_futex_unlock - Futex variant of rt_mutex_unlock
+ * @lock: the rt_mutex to be unlocked
+ *
+ * Returns: true/false indicating whether priority adjustment is
+ * required or not.
+ */
+bool __sched rt_mutex_futex_unlock(struct rt_mutex *lock)
+{
+       if (likely(rt_mutex_cmpxchg(lock, current, NULL))) {
+               rt_mutex_deadlock_account_unlock(current);
+               return false;
+       }
+       return rt_mutex_slowunlock(lock);
+}
+
+/**
  * rt_mutex_destroy - mark a mutex unusable
  * @lock: the mutex to be destroyed
  *
diff --git a/kernel/locking/rtmutex_common.h b/kernel/locking/rtmutex_common.h
index 855212501407..1bcf43099b16 100644
--- a/kernel/locking/rtmutex_common.h
+++ b/kernel/locking/rtmutex_common.h
@@ -132,6 +132,10 @@ extern int rt_mutex_finish_proxy_lock(struct rt_mutex *lock,
                                      struct rt_mutex_waiter *waiter);
 extern int rt_mutex_timed_futex_lock(struct rt_mutex *l, struct hrtimer_sleeper *to);
 
+extern bool rt_mutex_futex_unlock(struct rt_mutex *lock);
+
+extern void rt_mutex_adjust_prio(struct task_struct *task);
+
 #ifdef CONFIG_DEBUG_RT_MUTEXES
 # include "rtmutex-debug.h"
 #else
-- 
2.1.4
