[PATCH 4.4 11/75] futex: Change locking rules

gregkh Mon, 15 Mar 2021 06:54:07 -0700

From: Greg Kroah-Hartman <[email protected]>

From: Peter Zijlstra <[email protected]>


commit 734009e96d1983ad739e5b656e03430b3660c913 upstream.

This patch comes directly from an origin patch (commit
dc3f2ff11740159080f2e8e359ae0ab57c8e74b6) in v4.9.

Currently futex-pi relies on hb->lock to serialize everything. But hb->lock
creates another set of problems, especially priority inversions on RT where
hb->lock becomes a rt_mutex itself.

The rt_mutex::wait_lock is the most obvious protection for keeping the
futex user space value and the kernel internal pi_state in sync.

Rework and document the locking so rt_mutex::wait_lock is held across all
operations which modify the user space value and the pi state.

This allows rt_mutex_unlock() (including deboost) to be invoked without
holding hb->lock as a next step.

Nothing yet relies on the new locking rules.

Signed-off-by: Peter Zijlstra (Intel) <[email protected]>
Cc: [email protected]
Cc: [email protected]
Cc: [email protected]
Cc: [email protected]
Cc: [email protected]
Cc: [email protected]
Cc: [email protected]
Cc: [email protected]
Link: http://lkml.kernel.org/r/[email protected]
Signed-off-by: Thomas Gleixner <[email protected]>
[Lee: Back-ported in support of a previous futex back-port attempt]
Signed-off-by: Lee Jones <[email protected]>
Signed-off-by: Greg Kroah-Hartman <[email protected]>
Signed-off-by: Zheng Yejian <[email protected]>
Signed-off-by: Greg Kroah-Hartman <[email protected]>
---
 kernel/futex.c |  138 ++++++++++++++++++++++++++++++++++++++++++++++-----------
 1 file changed, 112 insertions(+), 26 deletions(-)

--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -1016,6 +1016,39 @@ static void exit_pi_state_list(struct ta
  * [10] There is no transient state which leaves owner and user space
  *     TID out of sync. Except one error case where the kernel is denied
  *     write access to the user address, see fixup_pi_state_owner().
+ *
+ *
+ * Serialization and lifetime rules:
+ *
+ * hb->lock:
+ *
+ *     hb -> futex_q, relation
+ *     futex_q -> pi_state, relation
+ *
+ *     (cannot be raw because hb can contain arbitrary amount
+ *      of futex_q's)
+ *
+ * pi_mutex->wait_lock:
+ *
+ *     {uval, pi_state}
+ *
+ *     (and pi_mutex 'obviously')
+ *
+ * p->pi_lock:
+ *
+ *     p->pi_state_list -> pi_state->list, relation
+ *
+ * pi_state->refcount:
+ *
+ *     pi_state lifetime
+ *
+ *
+ * Lock order:
+ *
+ *   hb->lock
+ *     pi_mutex->wait_lock
+ *       p->pi_lock
+ *
  */
 
 /*
@@ -1023,10 +1056,12 @@ static void exit_pi_state_list(struct ta
  * the pi_state against the user space value. If correct, attach to
  * it.
  */
-static int attach_to_pi_state(u32 uval, struct futex_pi_state *pi_state,
+static int attach_to_pi_state(u32 __user *uaddr, u32 uval,
+                             struct futex_pi_state *pi_state,
                              struct futex_pi_state **ps)
 {
        pid_t pid = uval & FUTEX_TID_MASK;
+       int ret, uval2;
 
        /*
         * Userspace might have messed up non-PI and PI futexes [3]
@@ -1034,9 +1069,34 @@ static int attach_to_pi_state(u32 uval,
        if (unlikely(!pi_state))
                return -EINVAL;
 
+       /*
+        * We get here with hb->lock held, and having found a
+        * futex_top_waiter(). This means that futex_lock_pi() of said futex_q
+        * has dropped the hb->lock in between queue_me() and unqueue_me_pi(),
+        * which in turn means that futex_lock_pi() still has a reference on
+        * our pi_state.
+        */
        WARN_ON(!atomic_read(&pi_state->refcount));
 
        /*
+        * Now that we have a pi_state, we can acquire wait_lock
+        * and do the state validation.
+        */
+       raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);
+
+       /*
+        * Since {uval, pi_state} is serialized by wait_lock, and our current
+        * uval was read without holding it, it can have changed. Verify it
+        * still is what we expect it to be, otherwise retry the entire
+        * operation.
+        */
+       if (get_futex_value_locked(&uval2, uaddr))
+               goto out_efault;
+
+       if (uval != uval2)
+               goto out_eagain;
+
+       /*
         * Handle the owner died case:
         */
        if (uval & FUTEX_OWNER_DIED) {
@@ -1051,11 +1111,11 @@ static int attach_to_pi_state(u32 uval,
                         * is not 0. Inconsistent state. [5]
                         */
                        if (pid)
-                               return -EINVAL;
+                               goto out_einval;
                        /*
                         * Take a ref on the state and return success. [4]
                         */
-                       goto out_state;
+                       goto out_attach;
                }
 
                /*
@@ -1067,14 +1127,14 @@ static int attach_to_pi_state(u32 uval,
                 * Take a ref on the state and return success. [6]
                 */
                if (!pid)
-                       goto out_state;
+                       goto out_attach;
        } else {
                /*
                 * If the owner died bit is not set, then the pi_state
                 * must have an owner. [7]
                 */
                if (!pi_state->owner)
-                       return -EINVAL;
+                       goto out_einval;
        }
 
        /*
@@ -1083,11 +1143,29 @@ static int attach_to_pi_state(u32 uval,
         * user space TID. [9/10]
         */
        if (pid != task_pid_vnr(pi_state->owner))
-               return -EINVAL;
-out_state:
+               goto out_einval;
+
+out_attach:
        atomic_inc(&pi_state->refcount);
+       raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
        *ps = pi_state;
        return 0;
+
+out_einval:
+       ret = -EINVAL;
+       goto out_error;
+
+out_eagain:
+       ret = -EAGAIN;
+       goto out_error;
+
+out_efault:
+       ret = -EFAULT;
+       goto out_error;
+
+out_error:
+       raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
+       return ret;
 }
 
 /**
@@ -1180,6 +1258,9 @@ static int attach_to_pi_owner(u32 uval,
 
        /*
         * No existing pi state. First waiter. [2]
+        *
+        * This creates pi_state, we have hb->lock held, this means nothing can
+        * observe this state, wait_lock is irrelevant.
         */
        pi_state = alloc_pi_state();
 
@@ -1204,7 +1285,8 @@ static int attach_to_pi_owner(u32 uval,
        return 0;
 }
 
-static int lookup_pi_state(u32 uval, struct futex_hash_bucket *hb,
+static int lookup_pi_state(u32 __user *uaddr, u32 uval,
+                          struct futex_hash_bucket *hb,
                           union futex_key *key, struct futex_pi_state **ps,
                           struct task_struct **exiting)
 {
@@ -1215,7 +1297,7 @@ static int lookup_pi_state(u32 uval, str
         * attach to the pi_state when the validation succeeds.
         */
        if (match)
-               return attach_to_pi_state(uval, match->pi_state, ps);
+               return attach_to_pi_state(uaddr, uval, match->pi_state, ps);
 
        /*
         * We are the first waiter - try to look up the owner based on
@@ -1234,7 +1316,7 @@ static int lock_pi_update_atomic(u32 __u
        if (unlikely(cmpxchg_futex_value_locked(&curval, uaddr, uval, newval)))
                return -EFAULT;
 
-       /*If user space value changed, let the caller retry */
+       /* If user space value changed, let the caller retry */
        return curval != uval ? -EAGAIN : 0;
 }
 
@@ -1298,7 +1380,7 @@ static int futex_lock_pi_atomic(u32 __us
         */
        match = futex_top_waiter(hb, key);
        if (match)
-               return attach_to_pi_state(uval, match->pi_state, ps);
+               return attach_to_pi_state(uaddr, uval, match->pi_state, ps);
 
        /*
         * No waiter and user TID is 0. We are here because the
@@ -1438,6 +1520,7 @@ static int wake_futex_pi(u32 __user *uad
 
        if (cmpxchg_futex_value_locked(&curval, uaddr, uval, newval)) {
                ret = -EFAULT;
+
        } else if (curval != uval) {
                /*
                 * If a unconditional UNLOCK_PI operation (user space did not
@@ -1971,7 +2054,7 @@ retry_private:
                         * rereading and handing potential crap to
                         * lookup_pi_state.
                         */
-                       ret = lookup_pi_state(ret, hb2, &key2,
+                       ret = lookup_pi_state(uaddr2, ret, hb2, &key2,
                                              &pi_state, &exiting);
                }
 
@@ -2249,7 +2332,6 @@ static int __fixup_pi_state_owner(u32 __
        int err = 0;
 
        oldowner = pi_state->owner;
-
        /*
         * We are here because either:
         *
@@ -2268,11 +2350,10 @@ static int __fixup_pi_state_owner(u32 __
         * because we can fault here. Imagine swapped out pages or a fork
         * that marked all the anonymous memory readonly for cow.
         *
-        * Modifying pi_state _before_ the user space value would
-        * leave the pi_state in an inconsistent state when we fault
-        * here, because we need to drop the hash bucket lock to
-        * handle the fault. This might be observed in the PID check
-        * in lookup_pi_state.
+        * Modifying pi_state _before_ the user space value would leave the
+        * pi_state in an inconsistent state when we fault here, because we
+        * need to drop the locks to handle the fault. This might be observed
+        * in the PID check in lookup_pi_state.
         */
 retry:
        if (!argowner) {
@@ -2333,21 +2414,26 @@ retry:
        return argowner == current;
 
        /*
-        * To handle the page fault we need to drop the hash bucket
-        * lock here. That gives the other task (either the highest priority
-        * waiter itself or the task which stole the rtmutex) the
-        * chance to try the fixup of the pi_state. So once we are
-        * back from handling the fault we need to check the pi_state
-        * after reacquiring the hash bucket lock and before trying to
-        * do another fixup. When the fixup has been done already we
-        * simply return.
+        * To handle the page fault we need to drop the locks here. That gives
+        * the other task (either the highest priority waiter itself or the
+        * task which stole the rtmutex) the chance to try the fixup of the
+        * pi_state. So once we are back from handling the fault we need to
+        * check the pi_state after reacquiring the locks and before trying to
+        * do another fixup. When the fixup has been done already we simply
+        * return.
+        *
+        * Note: we hold both hb->lock and pi_mutex->wait_lock. We can safely
+        * drop hb->lock since the caller owns the hb -> futex_q relation.
+        * Dropping the pi_mutex->wait_lock requires the state revalidate.
         */
 handle_fault:
+       raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
        spin_unlock(q->lock_ptr);
 
        err = fault_in_user_writeable(uaddr);
 
        spin_lock(q->lock_ptr);
+       raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);
 
        /*
         * Check if someone else fixed it for us:

[PATCH 4.4 11/75] futex: Change locking rules

Reply via email to