Module Name: src
Committed By: thorpej
Date: Thu Aug 5 23:14:21 UTC 2021
Modified Files:
src/sys/kern [thorpej-futex2]: sys_futex.c
src/sys/sys [thorpej-futex2]: lwp.h
Log Message:
Bring over just the futex sleepq infrastructure changes from thorpej-futex
to a new branch based on current HEAD. This contains only the fixes for
the priority problems, and is intended to finish debugging those changes
(without the new extensions).
To generate a diff of this commit:
cvs rdiff -u -r1.12 -r1.12.4.1 src/sys/kern/sys_futex.c
cvs rdiff -u -r1.212 -r1.212.14.1 src/sys/sys/lwp.h
Please note that diffs are not public domain; they are subject to the
copyright notices on the relevant files.
Modified files:
Index: src/sys/kern/sys_futex.c
diff -u src/sys/kern/sys_futex.c:1.12 src/sys/kern/sys_futex.c:1.12.4.1
--- src/sys/kern/sys_futex.c:1.12 Wed Jul 21 06:35:45 2021
+++ src/sys/kern/sys_futex.c Thu Aug 5 23:14:21 2021
@@ -1,4 +1,4 @@
-/* $NetBSD: sys_futex.c,v 1.12 2021/07/21 06:35:45 skrll Exp $ */
+/* $NetBSD: sys_futex.c,v 1.12.4.1 2021/08/05 23:14:21 thorpej Exp $ */
/*-
* Copyright (c) 2018, 2019, 2020 The NetBSD Foundation, Inc.
@@ -30,7 +30,7 @@
*/
#include <sys/cdefs.h>
-__KERNEL_RCSID(0, "$NetBSD: sys_futex.c,v 1.12 2021/07/21 06:35:45 skrll Exp $");
+__KERNEL_RCSID(0, "$NetBSD: sys_futex.c,v 1.12.4.1 2021/08/05 23:14:21 thorpej Exp $");
/*
* Futexes
@@ -121,7 +121,9 @@ __KERNEL_RCSID(0, "$NetBSD: sys_futex.c,
#include <sys/futex.h>
#include <sys/mutex.h>
#include <sys/rbtree.h>
+#include <sys/sleepq.h>
#include <sys/queue.h>
+#include <sys/sdt.h>
#include <sys/syscall.h>
#include <sys/syscallargs.h>
@@ -130,13 +132,56 @@ __KERNEL_RCSID(0, "$NetBSD: sys_futex.c,
#include <uvm/uvm_extern.h>
/*
+ * DTrace probes.
+ */
+SDT_PROVIDER_DEFINE(futex);
+
+/* entry: uaddr, val, bitset, timeout, clkflags, fflags */
+/* exit: error */
+SDT_PROBE_DEFINE6(futex, func, wait, entry, "int *", "int", "int",
+ "struct timespec *", "int", "int");
+SDT_PROBE_DEFINE1(futex, func, wait, exit, "int");
+
+/* entry: uaddr, nwake, bitset, fflags */
+/* exit: error, nwoken */
+SDT_PROBE_DEFINE4(futex, func, wake, entry, "int *", "int", "int", "int");
+SDT_PROBE_DEFINE2(futex, func, wake, exit, "int", "int");
+
+/* entry: uaddr, nwake, uaddr2, nrequeue, fflags */
+/* exit: error, nwoken */
+SDT_PROBE_DEFINE5(futex, func, requeue, entry, "int *", "int", "int *", "int",
+ "int");
+SDT_PROBE_DEFINE2(futex, func, requeue, exit, "int", "int");
+
+/* entry: uaddr, nwake, uaddr2, nrequeue, val3, fflags */
+/* exit: error, nwoken */
+SDT_PROBE_DEFINE6(futex, func, cmp_requeue, entry, "int *", "int", "int *",
+ "int", "int", "int");
+SDT_PROBE_DEFINE2(futex, func, cmp_requeue, exit, "int", "int");
+
+/* entry: uaddr, nwake, uaddr2, nwake2, wakeop, fflags */
+/* exit: error, nwoken */
+SDT_PROBE_DEFINE6(futex, func, wake_op, entry, "int *", "int", "int *", "int",
+ "int", "int");
+SDT_PROBE_DEFINE2(futex, func, wake_op, exit, "int", "int");
+
+SDT_PROBE_DEFINE0(futex, wait, finish, normally);
+SDT_PROBE_DEFINE0(futex, wait, finish, wakerace);
+SDT_PROBE_DEFINE0(futex, wait, finish, aborted);
+
+/* entry: timo */
+/* exit: error */
+SDT_PROBE_DEFINE1(futex, wait, sleepq_block, entry, "int");
+SDT_PROBE_DEFINE1(futex, wait, sleepq_block, exit, "int");
+
+/*
* Lock order:
*
* futex_tab.lock
- * futex::fx_qlock ordered by kva of struct futex
- * -> futex_wait::fw_lock only one at a time
- * futex_wait::fw_lock only one at a time
- * -> futex::fx_abortlock only one at a time
+ * futex::fx_op_lock ordered by kva of struct futex
+ * -> futex::fx_sq_lock ordered by kva of sleepq lock
+ *
+ * N.B. multiple futexes can share a single sleepq lock.
*/
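As a quick illustration of the nesting this order implies (a sketch only; futex_wait() and futex_wake() below are the real consumers of these locks):

	mutex_enter(&f->fx_op_lock);		/* serializes wakes against waits */
	mutex_spin_enter(f->fx_sq_lock);	/* protects f->fx_sqs[] and f->fx_nwaiters[] */
	/* ... enqueue on, or remove waiters from, f's sleep queues ... */
	mutex_spin_exit(f->fx_sq_lock);
	mutex_exit(&f->fx_op_lock);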
/*
@@ -166,37 +211,52 @@ union futex_key {
* that can be present on the system at any given time, and the
* assumption is that limit will be good enough on a 32-bit platform.
* See futex_wake() for why overflow needs to be avoided.
+ *
+ * XXX Since futex addresses must be 4-byte aligned, we could
+ * XXX squirrel away fx_shared and fx_on_tree bits in the "va"
+ * XXX field of the key. Worth it?
*/
struct futex {
union futex_key fx_key;
+ struct rb_node fx_node;
unsigned long fx_refcnt;
bool fx_shared;
bool fx_on_tree;
- struct rb_node fx_node;
+ uint8_t fx_class;
- kmutex_t fx_qlock;
- TAILQ_HEAD(, futex_wait) fx_queue;
-
- kmutex_t fx_abortlock;
- LIST_HEAD(, futex_wait) fx_abortlist;
- kcondvar_t fx_abortcv;
+ kmutex_t fx_op_lock; /* adaptive */
+ kmutex_t * fx_sq_lock; /* &sleepq_locks[...] */
+ sleepq_t fx_sqs[2]; /* 0=reader, 1=writer */
+ unsigned int fx_nwaiters[2];
};
/*
- * struct futex_wait
- *
- * State for a thread to wait on a futex. Threads wait on fw_cv
- * for fw_bitset to be set to zero. The thread may transition to
- * a different futex queue at any time under the futex's lock.
- */
-struct futex_wait {
- kmutex_t fw_lock;
- kcondvar_t fw_cv;
- struct futex *fw_futex;
- TAILQ_ENTRY(futex_wait) fw_entry; /* queue lock */
- LIST_ENTRY(futex_wait) fw_abort; /* queue abortlock */
- int fw_bitset;
- bool fw_aborting; /* fw_lock */
+ * futex classes: Some futex operations can only be carried out on
+ * futexes that are known to be abiding by a certain protocol. These
+ * classes are assigned to a futex when created due to a wait event,
+ * and when subsequent wake or requeue operations are issued, the
+ * class is checked at futex lookup time. If the class does not match,
+ * EINVAL is the result.
+ */
+#define FUTEX_CLASS_ANY 0 /* match any class in lookup */
+#define FUTEX_CLASS_NORMAL 1 /* normal futex */
+#define FUTEX_CLASS_RWLOCK 2 /* rwlock futex */
+#define FUTEX_CLASS_PI 3 /* for FUTEX_*_PI ops */
+
+/* sleepq definitions */
+#define FUTEX_READERQ 0
+#define FUTEX_WRITERQ 1
+
+static const char futex_wmesg[] = "futex";
+
+static void futex_unsleep(lwp_t *, bool);
+
+static syncobj_t futex_syncobj = {
+ .sobj_flag = SOBJ_SLEEPQ_SORTED,
+ .sobj_unsleep = futex_unsleep,
+ .sobj_changepri = sleepq_changepri,
+ .sobj_lendpri = sleepq_lendpri,
+ .sobj_owner = syncobj_noowner,
};
/*
@@ -280,7 +340,95 @@ static const rb_tree_ops_t futex_shared_
.rbto_node_offset = offsetof(struct futex, fx_node),
};
-static void futex_wait_dequeue(struct futex_wait *, struct futex *);
+/*
+ * futex_sq_lock2(f, f2)
+ *
+ * Acquire the sleepq locks for f and f2, which may be null, or
+ * which may be the same. If they are distinct, an arbitrary total
+ * order is chosen on the locks.
+ *
+ * Callers should only ever acquire multiple sleepq locks
+ * simultaneously using futex_sq_lock2.
+ */
+static void
+futex_sq_lock2(struct futex * const f, struct futex * const f2)
+{
+
+ /*
+ * If both are null, do nothing; if one is null and the other
+ * is not, lock the other and be done with it.
+ */
+ if (f == NULL && f2 == NULL) {
+ return;
+ } else if (f == NULL) {
+ mutex_spin_enter(f2->fx_sq_lock);
+ return;
+ } else if (f2 == NULL) {
+ mutex_spin_enter(f->fx_sq_lock);
+ return;
+ }
+
+ kmutex_t * const m = f->fx_sq_lock;
+ kmutex_t * const m2 = f2->fx_sq_lock;
+
+ /* If both locks are the same, acquire only one. */
+ if (m == m2) {
+ mutex_spin_enter(m);
+ return;
+ }
+
+ /* Otherwise, use the ordering on the kva of the lock pointer. */
+ if ((uintptr_t)m < (uintptr_t)m2) {
+ mutex_spin_enter(m);
+ mutex_spin_enter(m2);
+ } else {
+ mutex_spin_enter(m2);
+ mutex_spin_enter(m);
+ }
+}
+
+/*
+ * futex_sq_unlock2(f, f2)
+ *
+ * Release the sleep queue locks for f and f2, which may be null, or
+ * which may have the same underlying lock.
+ */
+static void
+futex_sq_unlock2(struct futex * const f, struct futex * const f2)
+{
+
+ /*
+ * If both are null, do nothing; if one is null and the other
+ * is not, unlock the other and be done with it.
+ */
+ if (f == NULL && f2 == NULL) {
+ return;
+ } else if (f == NULL) {
+ mutex_spin_exit(f2->fx_sq_lock);
+ return;
+ } else if (f2 == NULL) {
+ mutex_spin_exit(f->fx_sq_lock);
+ return;
+ }
+
+ kmutex_t * const m = f->fx_sq_lock;
+ kmutex_t * const m2 = f2->fx_sq_lock;
+
+ /* If both locks are the same, release only one. */
+ if (m == m2) {
+ mutex_spin_exit(m);
+ return;
+ }
+
+ /* Otherwise, use the ordering on the kva of the lock pointer. */
+ if ((uintptr_t)m < (uintptr_t)m2) {
+ mutex_spin_exit(m2);
+ mutex_spin_exit(m);
+ } else {
+ mutex_spin_exit(m);
+ mutex_spin_exit(m2);
+ }
+}
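The intended pairing, roughly as the wake/requeue path further down uses it (sketch only; the op locks of both futexes are assumed to already be held by the caller):

	futex_sq_lock2(f, f2);
	/* ... wake waiters on f and/or transfer some of them to f2 ... */
	futex_sq_unlock2(f, f2);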
/*
* futex_load(uaddr, kaddr)
@@ -313,6 +461,11 @@ futex_test(int *uaddr, int expected)
return val == expected;
}
+static pool_cache_t futex_cache __read_mostly;
+
+static int futex_ctor(void *, void *, int);
+static void futex_dtor(void *, void *);
+
/*
* futex_sys_init()
*
@@ -325,6 +478,10 @@ futex_sys_init(void)
mutex_init(&futex_tab.lock, MUTEX_DEFAULT, IPL_NONE);
rb_tree_init(&futex_tab.va, &futex_rb_ops);
rb_tree_init(&futex_tab.oa, &futex_shared_rb_ops);
+
+ futex_cache = pool_cache_init(sizeof(struct futex),
+ coherency_unit, 0, 0, "futex", NULL, IPL_NONE, futex_ctor,
+ futex_dtor, NULL);
}
/*
@@ -342,69 +499,38 @@ futex_sys_fini(void)
}
/*
- * futex_queue_init(f)
+ * futex_ctor()
*
- * Initialize the futex queue. Caller must call futex_queue_fini
- * when done.
- *
- * Never sleeps.
+ * Futex object constructor.
*/
-static void
-futex_queue_init(struct futex *f)
+static int
+futex_ctor(void *arg __unused, void *obj, int flags __unused)
{
+ extern sleepqlock_t sleepq_locks[SLEEPTAB_HASH_SIZE];
+ struct futex * const f = obj;
- mutex_init(&f->fx_qlock, MUTEX_DEFAULT, IPL_NONE);
- mutex_init(&f->fx_abortlock, MUTEX_DEFAULT, IPL_NONE);
- cv_init(&f->fx_abortcv, "fqabort");
- LIST_INIT(&f->fx_abortlist);
- TAILQ_INIT(&f->fx_queue);
-}
-
-/*
- * futex_queue_drain(f)
- *
- * Wait for any aborting waiters in f; then empty the queue of
- * any stragglers and wake them. Caller must guarantee no new
- * references to f.
- *
- * May sleep.
- */
-static void
-futex_queue_drain(struct futex *f)
-{
- struct futex_wait *fw, *fw_next;
+ mutex_init(&f->fx_op_lock, MUTEX_DEFAULT, IPL_NONE);
+ f->fx_sq_lock = &sleepq_locks[SLEEPTAB_HASH(f)].lock;
- mutex_enter(&f->fx_abortlock);
- while (!LIST_EMPTY(&f->fx_abortlist))
- cv_wait(&f->fx_abortcv, &f->fx_abortlock);
- mutex_exit(&f->fx_abortlock);
+ sleepq_init(&f->fx_sqs[FUTEX_READERQ]);
+ sleepq_init(&f->fx_sqs[FUTEX_WRITERQ]);
+ f->fx_nwaiters[FUTEX_READERQ] = f->fx_nwaiters[FUTEX_WRITERQ] = 0;
- mutex_enter(&f->fx_qlock);
- TAILQ_FOREACH_SAFE(fw, &f->fx_queue, fw_entry, fw_next) {
- mutex_enter(&fw->fw_lock);
- futex_wait_dequeue(fw, f);
- cv_broadcast(&fw->fw_cv);
- mutex_exit(&fw->fw_lock);
- }
- mutex_exit(&f->fx_qlock);
+ return 0;
}
/*
- * futex_queue_fini(fq)
+ * futex_dtor()
*
- * Finalize the futex queue initialized by futex_queue_init. Queue
- * must be empty. Caller must not use f again until a subsequent
- * futex_queue_init.
+ * Futex object destructor.
*/
static void
-futex_queue_fini(struct futex *f)
+futex_dtor(void *arg __unused, void *obj)
{
+ struct futex * const f = obj;
- KASSERT(TAILQ_EMPTY(&f->fx_queue));
- KASSERT(LIST_EMPTY(&f->fx_abortlist));
- mutex_destroy(&f->fx_qlock);
- mutex_destroy(&f->fx_abortlock);
- cv_destroy(&f->fx_abortcv);
+ mutex_destroy(&f->fx_op_lock);
+ f->fx_sq_lock = NULL;
}
/*
@@ -452,11 +578,11 @@ futex_key_fini(union futex_key *fk, bool
* Never sleeps for memory, but may sleep to acquire a lock.
*/
static struct futex *
-futex_create(union futex_key *fk, bool shared)
+futex_create(union futex_key *fk, bool shared, uint8_t class)
{
struct futex *f;
- f = kmem_alloc(sizeof(*f), KM_NOSLEEP);
+ f = pool_cache_get(futex_cache, PR_NOWAIT);
if (f == NULL) {
futex_key_fini(fk, shared);
return NULL;
@@ -465,7 +591,7 @@ futex_create(union futex_key *fk, bool s
f->fx_refcnt = 1;
f->fx_shared = shared;
f->fx_on_tree = false;
- futex_queue_init(f);
+ f->fx_class = class;
return f;
}
@@ -486,18 +612,18 @@ futex_destroy(struct futex *f)
KASSERT(atomic_load_relaxed(&f->fx_refcnt) == 0);
KASSERT(!f->fx_on_tree);
-
- /* Drain and destroy the private queue. */
- futex_queue_drain(f);
- futex_queue_fini(f);
+ KASSERT(LIST_EMPTY(&f->fx_sqs[FUTEX_READERQ]));
+ KASSERT(LIST_EMPTY(&f->fx_sqs[FUTEX_WRITERQ]));
+ KASSERT(f->fx_nwaiters[FUTEX_READERQ] == 0);
+ KASSERT(f->fx_nwaiters[FUTEX_WRITERQ] == 0);
futex_key_fini(&f->fx_key, f->fx_shared);
- kmem_free(f, sizeof(*f));
+ pool_cache_put(futex_cache, f);
}
/*
- * futex_hold(f)
+ * futex_hold_count(f, n)
*
* Attempt to acquire a reference to f. Return 0 on success,
* ENFILE on too many references.
@@ -505,21 +631,23 @@ futex_destroy(struct futex *f)
* Never sleeps.
*/
static int
-futex_hold(struct futex *f)
+futex_hold_count(struct futex *f, unsigned long const count)
{
unsigned long refcnt;
do {
refcnt = atomic_load_relaxed(&f->fx_refcnt);
- if (refcnt == ULONG_MAX)
+ if (ULONG_MAX - refcnt < count)
return ENFILE;
- } while (atomic_cas_ulong(&f->fx_refcnt, refcnt, refcnt + 1) != refcnt);
+ } while (atomic_cas_ulong(&f->fx_refcnt, refcnt,
+ refcnt + count) != refcnt);
return 0;
}
+#define futex_hold(f) futex_hold_count(f, 1)
/*
- * futex_rele(f)
+ * futex_rele_count(f, n)
*
* Release a reference to f acquired with futex_create or
* futex_hold.
@@ -527,7 +655,7 @@ futex_hold(struct futex *f)
* May sleep to free f.
*/
static void
-futex_rele(struct futex *f)
+futex_rele_count(struct futex *f, unsigned long const count)
{
unsigned long refcnt;
@@ -535,14 +663,17 @@ futex_rele(struct futex *f)
do {
refcnt = atomic_load_relaxed(&f->fx_refcnt);
- if (refcnt == 1)
+ KASSERT(refcnt >= count);
+ if (refcnt - count == 0)
goto trylast;
- } while (atomic_cas_ulong(&f->fx_refcnt, refcnt, refcnt - 1) != refcnt);
+ } while (atomic_cas_ulong(&f->fx_refcnt, refcnt,
+ refcnt - count) != refcnt);
return;
trylast:
+ KASSERT(count <= LONG_MAX);
mutex_enter(&futex_tab.lock);
- if (atomic_dec_ulong_nv(&f->fx_refcnt) == 0) {
+ if (atomic_add_long_nv(&f->fx_refcnt, -(long)count) == 0) {
if (f->fx_on_tree) {
if (__predict_false(f->fx_shared))
rb_tree_remove_node(&futex_tab.oa, f);
@@ -558,9 +689,10 @@ trylast:
if (f != NULL)
futex_destroy(f);
}
+#define futex_rele(f) futex_rele_count(f, 1)
/*
- * futex_rele_not_last(f)
+ * futex_rele_count_not_last(f, n)
*
* Release a reference to f acquired with futex_create or
* futex_hold.
@@ -569,18 +701,19 @@ trylast:
* reference to f.
*/
static void
-futex_rele_not_last(struct futex *f)
+futex_rele_count_not_last(struct futex *f, unsigned long const count)
{
unsigned long refcnt;
do {
refcnt = atomic_load_relaxed(&f->fx_refcnt);
- KASSERT(refcnt > 1);
- } while (atomic_cas_ulong(&f->fx_refcnt, refcnt, refcnt - 1) != refcnt);
+ KASSERT(refcnt >= count);
+ } while (atomic_cas_ulong(&f->fx_refcnt, refcnt,
+ refcnt - count) != refcnt);
}
/*
- * futex_lookup_by_key(key, shared, &f)
+ * futex_lookup_by_key(key, shared, class, &f)
*
* Try to find an existing futex va reference in the specified key
* On success, return 0, set f to found futex or to NULL if not found,
@@ -592,7 +725,8 @@ futex_rele_not_last(struct futex *f)
* futex_lookup_create().
*/
static int
-futex_lookup_by_key(union futex_key *fk, bool shared, struct futex **fp)
+futex_lookup_by_key(union futex_key *fk, bool shared, uint8_t class,
+ struct futex **fp)
{
struct futex *f;
int error = 0;
@@ -604,7 +738,11 @@ futex_lookup_by_key(union futex_key *fk,
f = rb_tree_find_node(&futex_tab.va, fk);
}
if (f) {
- error = futex_hold(f);
+ if (__predict_true(f->fx_class == class ||
+ class == FUTEX_CLASS_ANY))
+ error = futex_hold(f);
+ else
+ error = EINVAL;
if (error)
f = NULL;
}
@@ -655,7 +793,7 @@ out: mutex_exit(&futex_tab.lock);
}
/*
- * futex_lookup(uaddr, shared, &f)
+ * futex_lookup(uaddr, shared, class, &f)
*
* Find a futex at the userland pointer uaddr in the current
* process's VM space. On success, return the futex in f and
@@ -664,7 +802,7 @@ out: mutex_exit(&futex_tab.lock);
* Caller must call futex_rele when done.
*/
static int
-futex_lookup(int *uaddr, bool shared, struct futex **fp)
+futex_lookup(int *uaddr, bool shared, uint8_t class, struct futex **fp)
{
union futex_key fk;
struct vmspace *vm = curproc->p_vmspace;
@@ -675,7 +813,7 @@ futex_lookup(int *uaddr, bool shared, st
* Reject unaligned user pointers so we don't cross page
* boundaries and so atomics will work.
*/
- if ((va & 3) != 0)
+ if (__predict_false((va & 3) != 0))
return EINVAL;
/* Look it up. */
@@ -683,12 +821,13 @@ futex_lookup(int *uaddr, bool shared, st
if (error)
return error;
- error = futex_lookup_by_key(&fk, shared, fp);
+ error = futex_lookup_by_key(&fk, shared, class, fp);
futex_key_fini(&fk, shared);
if (error)
return error;
KASSERT(*fp == NULL || (*fp)->fx_shared == shared);
+ KASSERT(*fp == NULL || (*fp)->fx_class == class);
KASSERT(*fp == NULL || atomic_load_relaxed(&(*fp)->fx_refcnt) != 0);
/*
@@ -702,7 +841,7 @@ futex_lookup(int *uaddr, bool shared, st
}
/*
- * futex_lookup_create(uaddr, shared, &f)
+ * futex_lookup_create(uaddr, shared, class, &f)
*
* Find or create a futex at the userland pointer uaddr in the
* current process's VM space. On success, return the futex in f
@@ -711,7 +850,7 @@ futex_lookup(int *uaddr, bool shared, st
* Caller must call futex_rele when done.
*/
static int
-futex_lookup_create(int *uaddr, bool shared, struct futex **fp)
+futex_lookup_create(int *uaddr, bool shared, uint8_t class, struct futex **fp)
{
union futex_key fk;
struct vmspace *vm = curproc->p_vmspace;
@@ -723,7 +862,10 @@ futex_lookup_create(int *uaddr, bool sha
* Reject unaligned user pointers so we don't cross page
* boundaries and so atomics will work.
*/
- if ((va & 3) != 0)
+ if (__predict_false((va & 3) != 0))
+ return EINVAL;
+
+ if (__predict_false(class == FUTEX_CLASS_ANY))
return EINVAL;
error = futex_key_init(&fk, vm, va, shared);
@@ -734,7 +876,7 @@ futex_lookup_create(int *uaddr, bool sha
* Optimistically assume there already is one, and try to find
* it.
*/
- error = futex_lookup_by_key(&fk, shared, fp);
+ error = futex_lookup_by_key(&fk, shared, class, fp);
if (error || *fp != NULL) {
/*
* We either found one, or there was an error.
@@ -748,7 +890,7 @@ futex_lookup_create(int *uaddr, bool sha
 * Create a futex record. This transfers ownership of the key
* in all cases.
*/
- f = futex_create(&fk, shared);
+ f = futex_create(&fk, shared, class);
if (f == NULL) {
error = ENOMEM;
goto out;
@@ -775,222 +917,220 @@ out: if (f != NULL)
}
/*
- * futex_wait_init(fw, bitset)
- *
- * Initialize a record for a thread to wait on a futex matching
- * the specified bit set. Should be passed to futex_wait_enqueue
- * before futex_wait, and should be passed to futex_wait_fini when
- * done.
- */
-static void
-futex_wait_init(struct futex_wait *fw, int bitset)
-{
-
- KASSERT(bitset);
-
- mutex_init(&fw->fw_lock, MUTEX_DEFAULT, IPL_NONE);
- cv_init(&fw->fw_cv, "futex");
- fw->fw_futex = NULL;
- fw->fw_bitset = bitset;
- fw->fw_aborting = false;
-}
-
-/*
- * futex_wait_fini(fw)
- *
- * Finalize a record for a futex waiter. Must not be on any
- * futex's queue.
- */
-static void
-futex_wait_fini(struct futex_wait *fw)
-{
-
- KASSERT(fw->fw_futex == NULL);
-
- cv_destroy(&fw->fw_cv);
- mutex_destroy(&fw->fw_lock);
-}
-
-/*
- * futex_wait_enqueue(fw, f)
+ * futex_unsleep:
*
- * Put fw on the futex queue. Must be done before futex_wait.
- * Caller must hold fw's lock and f's lock, and fw must not be on
- * any existing futex's waiter list.
+ * Remove an LWP from the futex and sleep queue. This is called when
+ * the LWP has not been awoken normally but instead interrupted: for
+ * example, when a signal is received. Must be called with the LWP
+ * locked. Will unlock if "unlock" is true.
*/
static void
-futex_wait_enqueue(struct futex_wait *fw, struct futex *f)
+futex_unsleep(lwp_t *l, bool unlock)
{
+ struct futex *f __diagused;
- KASSERT(mutex_owned(&f->fx_qlock));
- KASSERT(mutex_owned(&fw->fw_lock));
- KASSERT(fw->fw_futex == NULL);
- KASSERT(!fw->fw_aborting);
-
- fw->fw_futex = f;
- TAILQ_INSERT_TAIL(&f->fx_queue, fw, fw_entry);
-}
+ f = (struct futex *)(uintptr_t)l->l_wchan;
-/*
- * futex_wait_dequeue(fw, f)
- *
- * Remove fw from the futex queue. Precludes subsequent
- * futex_wait until a futex_wait_enqueue. Caller must hold fw's
- * lock and f's lock, and fw must be on f.
- */
-static void
-futex_wait_dequeue(struct futex_wait *fw, struct futex *f)
-{
+ KASSERT(mutex_owned(f->fx_sq_lock));
- KASSERT(mutex_owned(&f->fx_qlock));
- KASSERT(mutex_owned(&fw->fw_lock));
- KASSERT(fw->fw_futex == f);
+ /* WRITERQ is more likely */
+ if (__predict_true(l->l_sleepq == &f->fx_sqs[FUTEX_WRITERQ])) {
+ KASSERT(f->fx_nwaiters[FUTEX_WRITERQ] > 0);
+ f->fx_nwaiters[FUTEX_WRITERQ]--;
+ } else {
+ KASSERT(l->l_sleepq == &f->fx_sqs[FUTEX_READERQ]);
+ KASSERT(f->fx_nwaiters[FUTEX_READERQ] > 0);
+ f->fx_nwaiters[FUTEX_READERQ]--;
+ }
- TAILQ_REMOVE(&f->fx_queue, fw, fw_entry);
- fw->fw_futex = NULL;
+ sleepq_unsleep(l, unlock);
}
/*
- * futex_wait_abort(fw)
+ * futex_wait(f, timeout, clkid, bitset)
*
- * Caller is no longer waiting for fw. Remove it from any queue
- * if it was on one. Caller must hold fw->fw_lock.
+ * Wait until deadline on the clock clkid, or forever if deadline
+ * is NULL, for a futex wakeup. Return 0 on explicit wakeup,
+ * ETIMEDOUT on timeout, EINTR on signal.
*/
-static void
-futex_wait_abort(struct futex_wait *fw)
+static int
+futex_wait(struct futex *f, int q, const struct timespec *deadline,
+ clockid_t clkid, int bitset)
{
- struct futex *f;
-
- KASSERT(mutex_owned(&fw->fw_lock));
/*
- * Grab the futex queue. It can't go away as long as we hold
- * fw_lock. However, we can't take the queue lock because
- * that's a lock order reversal.
+ * Some things to note about how this function works:
+ *
+ * ==> When we enter futex_wait(), the futex's op lock is
+ * held, preventing us from missing wakes. Once we are on
+ * the futex's sleepq, it is safe to release the op lock.
+ *
+ * ==> We have a single early escape to avoid calling
+ * sleepq_block(): our deadline already having passed.
+ * If this is a no-timeout wait, or if the deadline has
+ * not already passed, we are guaranteed to call sleepq_block().
+ *
+ * ==> sleepq_block() contains all of the necessary logic
+ * for handling signals; we do not need to check for them
+ * ourselves.
+ *
+ * ==> Once we have blocked, other futex operations will
+ * be able to wake us or requeue us. Until we have blocked,
+ * those other futex operations will not be able to acquire
+ * the sleepq locks in order to perform those operations,
+ * even if we have dropped the op lock. Thus, we will not
+ * miss wakes. This fundamental invariant is relied upon by
+ * every other synchronization object in the kernel.
+ *
+ * ==> sleepq_block() returns for one of three reasons:
+ *
+ * -- We were awakened.
+ * -- We were signaled.
+ * -- We timed out.
+ *
+ * ==> Once sleepq_block() returns, we will not call
+ * sleepq_block() again.
+ *
+ * ==> If sleepq_block() returns due to being signaled,
+ * we must check to see if we were also awakened. This
+ * is to handle the following sequence:
+ *
+ * -- Futex wake takes us off sleepq, places us on runq.
+ * -- We are signaled while sitting on the runq.
+ * -- We run, and sleepq_block() notices the signal and
+ * returns EINTR/ERESTART.
+ *
+ * In this situation, we want to indicate a successful wake
+ * to the caller, because that wake has been reported to the
+ * thread that issued it.
+ *
+ * Note that it is NOT possible for a wake to be issued after
+ * a signal; the LWP will already have been taken off the sleepq
+ * by the signal, so the wake will never happen. Note that for
+ * this reason, we must *never* return ERESTART, because it could
+ * result in us waiting again with no one to awaken us.
*/
- f = fw->fw_futex;
- /* Put us on the abort list so that fq won't go away. */
- mutex_enter(&f->fx_abortlock);
- LIST_INSERT_HEAD(&f->fx_abortlist, fw, fw_abort);
- mutex_exit(&f->fx_abortlock);
+ struct lwp * const l = curlwp;
+ int timo;
+ int error;
/*
- * Mark fw as aborting so it won't lose wakeups and won't be
- * transferred to any other queue.
+ * Compute our timeout before taking any locks.
*/
- fw->fw_aborting = true;
+ if (deadline) {
+ struct timespec ts;
- /* f is now stable, so we can release fw_lock. */
- mutex_exit(&fw->fw_lock);
+ /* Check our watch. */
+ error = clock_gettime1(clkid, &ts);
+ if (error) {
+ mutex_exit(&f->fx_op_lock);
+ return error;
+ }
- /* Now we can remove fw under the queue lock. */
- mutex_enter(&f->fx_qlock);
- mutex_enter(&fw->fw_lock);
- futex_wait_dequeue(fw, f);
- mutex_exit(&fw->fw_lock);
- mutex_exit(&f->fx_qlock);
+ /*
+ * If we're past the deadline, ETIMEDOUT.
+ */
+ if (timespeccmp(deadline, &ts, <=)) {
+ mutex_exit(&f->fx_op_lock);
+ return ETIMEDOUT;
+ } else {
+ /* Count how much time is left. */
+ timespecsub(deadline, &ts, &ts);
+ timo = tstohz(&ts);
+ if (timo == 0) {
+ /*
+ * tstohz() already rounds up, and
+ * a return value of 0 ticks means
+ * "expired now or in the past".
+ */
+ mutex_exit(&f->fx_op_lock);
+ return ETIMEDOUT;
+ }
+ }
+ } else {
+ timo = 0;
+ }
/*
- * Finally, remove us from the abort list and notify anyone
- * waiting for the abort to complete if we were the last to go.
+ * Acquire in sleepq -> lwp order. While we're at it, ensure
+ * that there's room for us to block.
*/
- mutex_enter(&f->fx_abortlock);
- LIST_REMOVE(fw, fw_abort);
- if (LIST_EMPTY(&f->fx_abortlist))
- cv_broadcast(&f->fx_abortcv);
- mutex_exit(&f->fx_abortlock);
+ mutex_spin_enter(f->fx_sq_lock);
+ if (__predict_false(q == FUTEX_READERQ &&
+ f->fx_nwaiters[q] == FUTEX_TID_MASK)) {
+ mutex_spin_exit(f->fx_sq_lock);
+ mutex_exit(&f->fx_op_lock);
+ return ENFILE;
+ }
+ lwp_lock(l);
/*
- * Release our reference to the futex now that we are not
- * waiting for it.
+ * No need for locks here; until we're on a futex's sleepq, these
+ * values are private to the LWP, and the locks needed to get onto
+ * the sleepq provide the memory barriers needed to ensure visibility.
*/
- futex_rele(f);
+ l->l_futex = f;
+ l->l_futex_wakesel = bitset;
/*
- * Reacquire the fw lock as caller expects. Verify that we're
- * aborting and no longer associated with a futex.
+ * This is an inlined bit of sleepq_enter();
+ * we already hold the lwp lock.
*/
- mutex_enter(&fw->fw_lock);
- KASSERT(fw->fw_aborting);
- KASSERT(fw->fw_futex == NULL);
-}
-
-/*
- * futex_wait(fw, deadline, clkid)
- *
- * fw must be a waiter on a futex's queue. Wait until deadline on
- * the clock clkid, or forever if deadline is NULL, for a futex
- * wakeup. Return 0 on explicit wakeup or destruction of futex,
- * ETIMEDOUT on timeout, EINTR/ERESTART on signal. Either way, fw
- * will no longer be on a futex queue on return.
- */
-static int
-futex_wait(struct futex_wait *fw, const struct timespec *deadline,
- clockid_t clkid)
-{
- int error = 0;
-
- /* Test and wait under the wait lock. */
- mutex_enter(&fw->fw_lock);
-
- for (;;) {
- /* If we're done yet, stop and report success. */
- if (fw->fw_bitset == 0 || fw->fw_futex == NULL) {
- error = 0;
- break;
- }
-
- /* If anything went wrong in the last iteration, stop. */
- if (error)
- break;
+ lwp_unlock_to(l, f->fx_sq_lock);
+ KERNEL_UNLOCK_ALL(NULL, &l->l_biglocks);
+ KASSERT(lwp_locked(l, f->fx_sq_lock));
- /* Not done yet. Wait. */
- if (deadline) {
- struct timespec ts;
-
- /* Check our watch. */
- error = clock_gettime1(clkid, &ts);
- if (error)
- break;
+ sleepq_enqueue(&f->fx_sqs[q], f, futex_wmesg, &futex_syncobj, true);
+ f->fx_nwaiters[q]++;
- /* If we're past the deadline, ETIMEDOUT. */
- if (timespeccmp(deadline, &ts, <=)) {
- error = ETIMEDOUT;
- break;
- }
+ /* We can now safely release the op lock. */
+ mutex_exit(&f->fx_op_lock);
- /* Count how much time is left. */
- timespecsub(deadline, &ts, &ts);
+ SDT_PROBE1(futex, wait, sleepq_block, entry, timo);
+ error = sleepq_block(timo, true);
+ SDT_PROBE1(futex, wait, sleepq_block, exit, error);
- /* Wait for that much time, allowing signals. */
- error = cv_timedwait_sig(&fw->fw_cv, &fw->fw_lock,
- tstohz(&ts));
- } else {
- /* Wait indefinitely, allowing signals. */
- error = cv_wait_sig(&fw->fw_cv, &fw->fw_lock);
- }
- }
+ /*
+ * f is no longer valid; we may have been moved to another
+ * futex sleepq while we slept.
+ */
/*
- * If we were woken up, the waker will have removed fw from the
- * queue. But if anything went wrong, we must remove fw from
- * the queue ourselves. While here, convert EWOULDBLOCK to
- * ETIMEDOUT.
+ * If something went wrong, we may still have a futex reference.
+ * Check for that here and drop it if so. We can do this without
+ * taking any locks because l->l_futex is private to the LWP when
+ * not on any futex's sleepq, and we are not on any sleepq because
+ * we are running.
+ *
+ * Convert EWOULDBLOCK to ETIMEDOUT in case sleepq_block() returned
+ * EWOULDBLOCK, and ensure the only other error we return is EINTR.
*/
if (error) {
- futex_wait_abort(fw);
+ f = l->l_futex;
+ if (f != NULL) {
+ SDT_PROBE0(futex, wait, finish, aborted);
+ l->l_futex = NULL;
+ futex_rele(f);
+ } else {
+ /* Raced with wakeup; report success. */
+ SDT_PROBE0(futex, wait, finish, wakerace);
+ error = 0;
+ }
if (error == EWOULDBLOCK)
error = ETIMEDOUT;
+ else if (error != ETIMEDOUT)
+ error = EINTR;
+ } else {
+ KASSERT(l->l_futex == NULL);
+ SDT_PROBE0(futex, wait, finish, normally);
}
- mutex_exit(&fw->fw_lock);
-
return error;
}
/*
- * futex_wake(f, nwake, f2, nrequeue, bitset)
+ * futex_wake(f, q, nwake, f2, q2, nrequeue, bitset)
*
* Wake up to nwake waiters on f matching bitset; then, if f2 is
* provided, move up to nrequeue remaining waiters on f matching
@@ -998,118 +1138,123 @@ futex_wait(struct futex_wait *fw, const
* Caller must hold the locks of f and f2, if provided.
*/
static unsigned
-futex_wake(struct futex *f, unsigned nwake, struct futex *f2,
- unsigned nrequeue, int bitset)
+futex_wake(struct futex *f, int q, unsigned int const nwake,
+ struct futex *f2, int q2, unsigned int const nrequeue,
+ int bitset)
{
- struct futex_wait *fw, *fw_next;
+ struct lwp *l, *l_next;
unsigned nwoken = 0;
- int hold_error __diagused;
+ unsigned nrequeued = 0;
- KASSERT(mutex_owned(&f->fx_qlock));
- KASSERT(f2 == NULL || mutex_owned(&f2->fx_qlock));
+ KASSERT(mutex_owned(&f->fx_op_lock));
+ KASSERT(f2 == NULL || mutex_owned(&f2->fx_op_lock));
+
+ futex_sq_lock2(f, f2);
/* Wake up to nwake waiters, and count the number woken. */
- TAILQ_FOREACH_SAFE(fw, &f->fx_queue, fw_entry, fw_next) {
- if ((fw->fw_bitset & bitset) == 0)
- continue;
- if (nwake > 0) {
- mutex_enter(&fw->fw_lock);
- if (__predict_false(fw->fw_aborting)) {
- mutex_exit(&fw->fw_lock);
- continue;
- }
- futex_wait_dequeue(fw, f);
- fw->fw_bitset = 0;
- cv_broadcast(&fw->fw_cv);
- mutex_exit(&fw->fw_lock);
- nwake--;
- nwoken++;
- /*
- * Drop the futex reference on behalf of the
- * waiter. We assert this is not the last
- * reference on the futex (our caller should
- * also have one).
- */
- futex_rele_not_last(f);
- } else {
+ LIST_FOREACH_SAFE(l, &f->fx_sqs[q], l_sleepchain, l_next) {
+ if (nwoken == nwake)
break;
- }
+
+ KASSERT(l->l_syncobj == &futex_syncobj);
+ KASSERT(l->l_wchan == (wchan_t)f);
+ KASSERT(l->l_futex == f);
+ KASSERT(l->l_sleepq == &f->fx_sqs[q]);
+ KASSERT(l->l_mutex == f->fx_sq_lock);
+
+ if ((l->l_futex_wakesel & bitset) == 0)
+ continue;
+
+ l->l_futex_wakesel = 0;
+ l->l_futex = NULL;
+ sleepq_remove(&f->fx_sqs[q], l);
+ f->fx_nwaiters[q]--;
+ nwoken++;
+ /* hold counts adjusted in bulk below */
}
- if (f2) {
+ if (nrequeue) {
+ KASSERT(f2 != NULL);
/* Move up to nrequeue waiters from f's queue to f2's queue. */
- TAILQ_FOREACH_SAFE(fw, &f->fx_queue, fw_entry, fw_next) {
- if ((fw->fw_bitset & bitset) == 0)
- continue;
- if (nrequeue > 0) {
- mutex_enter(&fw->fw_lock);
- if (__predict_false(fw->fw_aborting)) {
- mutex_exit(&fw->fw_lock);
- continue;
- }
- futex_wait_dequeue(fw, f);
- futex_wait_enqueue(fw, f2);
- mutex_exit(&fw->fw_lock);
- nrequeue--;
- /*
- * Transfer the reference from f to f2.
- * As above, we assert that we are not
- * dropping the last reference to f here.
- *
- * XXX futex_hold() could theoretically
- * XXX fail here.
- */
- futex_rele_not_last(f);
- hold_error = futex_hold(f2);
- KASSERT(hold_error == 0);
- } else {
+ LIST_FOREACH_SAFE(l, &f->fx_sqs[q], l_sleepchain, l_next) {
+ if (nrequeued == nrequeue)
break;
- }
+
+ KASSERT(l->l_syncobj == &futex_syncobj);
+ KASSERT(l->l_wchan == (wchan_t)f);
+ KASSERT(l->l_futex == f);
+ KASSERT(l->l_sleepq == &f->fx_sqs[q]);
+ KASSERT(l->l_mutex == f->fx_sq_lock);
+
+ if ((l->l_futex_wakesel & bitset) == 0)
+ continue;
+
+ l->l_futex = f2;
+ sleepq_transfer(l, &f->fx_sqs[q],
+ &f2->fx_sqs[q2], f2, futex_wmesg,
+ &futex_syncobj, f2->fx_sq_lock, true);
+ f->fx_nwaiters[q]--;
+ f2->fx_nwaiters[q2]++;
+ nrequeued++;
+ /* hold counts adjusted in bulk below */
+ }
+ if (nrequeued) {
+ /* XXX futex_hold() could theoretically fail here. */
+ int hold_error __diagused =
+ futex_hold_count(f2, nrequeued);
+ KASSERT(hold_error == 0);
}
- } else {
- KASSERT(nrequeue == 0);
}
- /* Return the number of waiters woken. */
- return nwoken;
+ /*
+ * Adjust the futex reference counts for the wakes and
+ * requeues.
+ */
+ KASSERT(nwoken + nrequeued <= LONG_MAX);
+ futex_rele_count_not_last(f, nwoken + nrequeued);
+
+ futex_sq_unlock2(f, f2);
+
+ /* Return the number of waiters woken and requeued. */
+ return nwoken + nrequeued;
}
/*
- * futex_queue_lock(f)
+ * futex_op_lock(f)
*
- * Acquire the queue lock of f. Pair with futex_queue_unlock. Do
+ * Acquire the op lock of f. Pair with futex_op_unlock. Do
* not use if caller needs to acquire two locks; use
- * futex_queue_lock2 instead.
+ * futex_op_lock2 instead.
*/
static void
-futex_queue_lock(struct futex *f)
+futex_op_lock(struct futex *f)
{
- mutex_enter(&f->fx_qlock);
+ mutex_enter(&f->fx_op_lock);
}
/*
- * futex_queue_unlock(f)
+ * futex_op_unlock(f)
*
* Release the queue lock of f.
*/
static void
-futex_queue_unlock(struct futex *f)
+futex_op_unlock(struct futex *f)
{
- mutex_exit(&f->fx_qlock);
+ mutex_exit(&f->fx_op_lock);
}
/*
- * futex_queue_lock2(f, f2)
+ * futex_op_lock2(f, f2)
*
- * Acquire the queue locks of both f and f2, which may be null, or
- * which may have the same underlying queue. If they are
- * distinct, an arbitrary total order is chosen on the locks.
+ * Acquire the op locks of both f and f2, which may be null, or
+ * which may be the same. If they are distinct, an arbitrary total
+ * order is chosen on the locks.
*
- * Callers should only ever acquire multiple queue locks
- * simultaneously using futex_queue_lock2.
+ * Callers should only ever acquire multiple op locks
+ * simultaneously using futex_op_lock2.
*/
static void
-futex_queue_lock2(struct futex *f, struct futex *f2)
+futex_op_lock2(struct futex *f, struct futex *f2)
{
/*
@@ -1119,37 +1264,37 @@ futex_queue_lock2(struct futex *f, struc
if (f == NULL && f2 == NULL) {
return;
} else if (f == NULL) {
- mutex_enter(&f2->fx_qlock);
+ mutex_enter(&f2->fx_op_lock);
return;
} else if (f2 == NULL) {
- mutex_enter(&f->fx_qlock);
+ mutex_enter(&f->fx_op_lock);
return;
}
/* If both futexes are the same, acquire only one. */
if (f == f2) {
- mutex_enter(&f->fx_qlock);
+ mutex_enter(&f->fx_op_lock);
return;
}
/* Otherwise, use the ordering on the kva of the futex pointer. */
if ((uintptr_t)f < (uintptr_t)f2) {
- mutex_enter(&f->fx_qlock);
- mutex_enter(&f2->fx_qlock);
+ mutex_enter(&f->fx_op_lock);
+ mutex_enter(&f2->fx_op_lock);
} else {
- mutex_enter(&f2->fx_qlock);
- mutex_enter(&f->fx_qlock);
+ mutex_enter(&f2->fx_op_lock);
+ mutex_enter(&f->fx_op_lock);
}
}
/*
- * futex_queue_unlock2(f, f2)
+ * futex_op_unlock2(f, f2)
*
* Release the queue locks of both f and f2, which may be null, or
* which may have the same underlying queue.
*/
static void
-futex_queue_unlock2(struct futex *f, struct futex *f2)
+futex_op_unlock2(struct futex *f, struct futex *f2)
{
/*
@@ -1159,33 +1304,33 @@ futex_queue_unlock2(struct futex *f, str
if (f == NULL && f2 == NULL) {
return;
} else if (f == NULL) {
- mutex_exit(&f2->fx_qlock);
+ mutex_exit(&f2->fx_op_lock);
return;
} else if (f2 == NULL) {
- mutex_exit(&f->fx_qlock);
+ mutex_exit(&f->fx_op_lock);
return;
}
/* If both futexes are the same, release only one. */
if (f == f2) {
- mutex_exit(&f->fx_qlock);
+ mutex_exit(&f->fx_op_lock);
return;
}
/* Otherwise, use the ordering on the kva of the futex pointer. */
if ((uintptr_t)f < (uintptr_t)f2) {
- mutex_exit(&f2->fx_qlock);
- mutex_exit(&f->fx_qlock);
+ mutex_exit(&f2->fx_op_lock);
+ mutex_exit(&f->fx_op_lock);
} else {
- mutex_exit(&f->fx_qlock);
- mutex_exit(&f2->fx_qlock);
+ mutex_exit(&f->fx_op_lock);
+ mutex_exit(&f2->fx_op_lock);
}
}
/*
* futex_func_wait(uaddr, val, val3, timeout, clkid, clkflags, retval)
*
- * Implement futex(FUTEX_WAIT).
+ * Implement futex(FUTEX_WAIT) and futex(FUTEX_WAIT_BITSET).
*/
static int
futex_func_wait(bool shared, int *uaddr, int val, int val3,
@@ -1193,7 +1338,6 @@ futex_func_wait(bool shared, int *uaddr,
register_t *retval)
{
struct futex *f;
- struct futex_wait wait, *fw = &wait;
struct timespec ts;
const struct timespec *deadline;
int error;
@@ -1210,62 +1354,71 @@ futex_func_wait(bool shared, int *uaddr,
return EAGAIN;
/* Determine a deadline on the specified clock. */
- if (timeout == NULL || (clkflags & TIMER_ABSTIME) == TIMER_ABSTIME) {
+ if (timeout == NULL) {
+ deadline = timeout;
+ } else if ((clkflags & TIMER_ABSTIME) == TIMER_ABSTIME) {
+ /* Sanity-check the deadline. */
+ if (timeout->tv_sec < 0 ||
+ timeout->tv_nsec < 0 ||
+ timeout->tv_nsec >= 1000000000L) {
+ return EINVAL;
+ }
deadline = timeout;
} else {
+ struct timespec interval = *timeout;
+
+ error = itimespecfix(&interval);
+ if (error)
+ return error;
error = clock_gettime1(clkid, &ts);
if (error)
return error;
- timespecadd(&ts, timeout, &ts);
+ timespecadd(&ts, &interval, &ts);
deadline = &ts;
}
/* Get the futex, creating it if necessary. */
- error = futex_lookup_create(uaddr, shared, &f);
+ error = futex_lookup_create(uaddr, shared, FUTEX_CLASS_NORMAL, &f);
if (error)
return error;
KASSERT(f);
- /* Get ready to wait. */
- futex_wait_init(fw, val3);
-
/*
- * Under the queue lock, check the value again: if it has
+ * Under the op lock, check the value again: if it has
* already changed, EAGAIN; otherwise enqueue the waiter.
* Since FUTEX_WAKE will use the same lock and be done after
* modifying the value, the order in which we check and enqueue
* is immaterial.
*/
- futex_queue_lock(f);
+ futex_op_lock(f);
if (!futex_test(uaddr, val)) {
- futex_queue_unlock(f);
+ futex_op_unlock(f);
error = EAGAIN;
goto out;
}
- mutex_enter(&fw->fw_lock);
- futex_wait_enqueue(fw, f);
- mutex_exit(&fw->fw_lock);
- futex_queue_unlock(f);
+
+ /*
+ * Now wait. futex_wait() will drop our op lock once we
+ * are entered into the sleep queue, thus ensuring atomicity
+ * of wakes with respect to waits.
+ */
+ error = futex_wait(f, FUTEX_WRITERQ, deadline, clkid, val3);
/*
* We cannot drop our reference to the futex here, because
* we might be enqueued on a different one when we are awakened.
- * The references will be managed on our behalf in the requeue
- * and wake cases.
+ * The references will be managed on our behalf in the requeue,
+ * wake, and error cases.
*/
f = NULL;
- /* Wait. */
- error = futex_wait(fw, deadline, clkid);
- if (error)
- goto out;
-
- /* Return 0 on success, error on failure. */
- *retval = 0;
+ if (__predict_true(error == 0)) {
+ /* Return 0 on success, error on failure. */
+ *retval = 0;
+ }
out: if (f != NULL)
futex_rele(f);
- futex_wait_fini(fw);
return error;
}
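For reference, the userland side of the protocol implemented above (test the word, then sleep; the kernel re-checks the word under the op lock before enqueueing) is the usual futex-based mutex. A minimal sketch, assuming hypothetical user_futex_wait()/user_futex_wake() wrappers around the futex system call's FUTEX_WAIT/FUTEX_WAKE operations:

#include <stdatomic.h>

/* Hypothetical wrappers around futex(2) FUTEX_WAIT / FUTEX_WAKE. */
void user_futex_wait(_Atomic int *uaddr, int val);
void user_futex_wake(_Atomic int *uaddr, int nwake);

#define UNLOCKED	0
#define LOCKED		1	/* locked, no waiters */
#define LOCKED_WAITERS	2	/* locked, maybe waiters in the kernel */

static _Atomic int lockword = UNLOCKED;

static void
lock(void)
{
	int c = UNLOCKED;

	/* Uncontended fast path: 0 -> 1. */
	if (atomic_compare_exchange_strong(&lockword, &c, LOCKED))
		return;

	/* Contended: mark the word "locked, with waiters" and sleep. */
	if (c != LOCKED_WAITERS)
		c = atomic_exchange(&lockword, LOCKED_WAITERS);
	while (c != UNLOCKED) {
		/*
		 * FUTEX_WAIT sleeps only if the word still equals
		 * LOCKED_WAITERS; futex_func_wait() above re-checks it
		 * under the op lock and returns EAGAIN otherwise, so no
		 * wakeup can be lost between this test and the sleep.
		 */
		user_futex_wait(&lockword, LOCKED_WAITERS);
		c = atomic_exchange(&lockword, LOCKED_WAITERS);
	}
}

static void
unlock(void)
{
	if (atomic_exchange(&lockword, UNLOCKED) == LOCKED_WAITERS)
		user_futex_wake(&lockword, 1);	/* wake one waiter */
}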
@@ -1288,7 +1441,7 @@ futex_func_wake(bool shared, int *uaddr,
}
/* Look up the futex, if any. */
- error = futex_lookup(uaddr, shared, &f);
+ error = futex_lookup(uaddr, shared, FUTEX_CLASS_NORMAL, &f);
if (error)
goto out;
@@ -1297,12 +1450,13 @@ futex_func_wake(bool shared, int *uaddr,
goto out;
/*
- * Under f's queue lock, wake the waiters and remember the
+ * Under f's op lock, wake the waiters and remember the
* number woken.
*/
- futex_queue_lock(f);
- nwoken = futex_wake(f, val, NULL, 0, val3);
- futex_queue_unlock(f);
+ futex_op_lock(f);
+ nwoken = futex_wake(f, FUTEX_WRITERQ, val,
+ NULL, FUTEX_WRITERQ, 0, val3);
+ futex_op_unlock(f);
/* Release the futex. */
futex_rele(f);
@@ -1335,7 +1489,7 @@ futex_func_requeue(bool shared, int op,
}
/* Look up the source futex, if any. */
- error = futex_lookup(uaddr, shared, &f);
+ error = futex_lookup(uaddr, shared, FUTEX_CLASS_NORMAL, &f);
if (error)
goto out;
@@ -1347,22 +1501,24 @@ futex_func_requeue(bool shared, int op,
* We may need to create the destination futex because it's
* entirely possible it does not currently have any waiters.
*/
- error = futex_lookup_create(uaddr2, shared, &f2);
+ error = futex_lookup_create(uaddr2, shared, FUTEX_CLASS_NORMAL, &f2);
if (error)
goto out;
/*
- * Under the futexes' queue locks, check the value; if
+ * Under the futexes' op locks, check the value; if
* unchanged from val3, wake the waiters.
*/
- futex_queue_lock2(f, f2);
+ futex_op_lock2(f, f2);
if (op == FUTEX_CMP_REQUEUE && !futex_test(uaddr, val3)) {
error = EAGAIN;
} else {
error = 0;
- nwoken = futex_wake(f, val, f2, val2, FUTEX_BITSET_MATCH_ANY);
+ nwoken = futex_wake(f, FUTEX_WRITERQ, val,
+ f2, FUTEX_WRITERQ, val2,
+ FUTEX_BITSET_MATCH_ANY);
}
- futex_queue_unlock2(f, f2);
+ futex_op_unlock2(f, f2);
out:
/* Return the number of waiters woken. */
@@ -1526,23 +1682,23 @@ futex_func_wake_op(bool shared, int *uad
goto out;
/* Look up the first futex, if any. */
- error = futex_lookup(uaddr, shared, &f);
+ error = futex_lookup(uaddr, shared, FUTEX_CLASS_NORMAL, &f);
if (error)
goto out;
/* Look up the second futex, if any. */
- error = futex_lookup(uaddr2, shared, &f2);
+ error = futex_lookup(uaddr2, shared, FUTEX_CLASS_NORMAL, &f2);
if (error)
goto out;
/*
- * Under the queue locks:
+ * Under the op locks:
*
* 1. Read/modify/write: *uaddr2 op= oparg.
* 2. Unconditionally wake uaddr.
- * 3. Conditionally wake uaddr2, if it previously matched val2.
+ * 3. Conditionally wake uaddr2, if it previously matched val3.
*/
- futex_queue_lock2(f, f2);
+ futex_op_lock2(f, f2);
do {
error = futex_load(uaddr2, &oldval);
if (error)
@@ -1552,15 +1708,18 @@ futex_func_wake_op(bool shared, int *uad
if (error)
goto out_unlock;
} while (actual != oldval);
- nwoken = (f ? futex_wake(f, val, NULL, 0, FUTEX_BITSET_MATCH_ANY) : 0);
+ nwoken = (f ? futex_wake(f, FUTEX_WRITERQ, val,
+ NULL, FUTEX_WRITERQ, 0,
+ FUTEX_BITSET_MATCH_ANY) : 0);
if (f2 && futex_compute_cmp(oldval, val3))
- nwoken += futex_wake(f2, val2, NULL, 0,
- FUTEX_BITSET_MATCH_ANY);
+ nwoken += futex_wake(f2, FUTEX_WRITERQ, val2,
+ NULL, FUTEX_WRITERQ, 0,
+ FUTEX_BITSET_MATCH_ANY);
/* Success! */
error = 0;
out_unlock:
- futex_queue_unlock2(f, f2);
+ futex_op_unlock2(f, f2);
out:
/* Return the number of waiters woken. */
@@ -1587,38 +1746,76 @@ do_futex(int *uaddr, int op, int val, co
const bool shared = (op & FUTEX_PRIVATE_FLAG) ? false : true;
const clockid_t clkid = (op & FUTEX_CLOCK_REALTIME) ? CLOCK_REALTIME
: CLOCK_MONOTONIC;
+ int error;
op &= FUTEX_CMD_MASK;
switch (op) {
case FUTEX_WAIT:
- return futex_func_wait(shared, uaddr, val,
+ SDT_PROBE6(futex, func, wait, entry,
+ uaddr, val, FUTEX_BITSET_MATCH_ANY, timeout,
+ TIMER_RELTIME, op & ~FUTEX_CMD_MASK);
+ error = futex_func_wait(shared, uaddr, val,
FUTEX_BITSET_MATCH_ANY, timeout, clkid, TIMER_RELTIME,
retval);
+ SDT_PROBE1(futex, func, wait, exit, error);
+ break;
+
+ case FUTEX_WAIT_BITSET:
+ SDT_PROBE6(futex, func, wait, entry,
+ uaddr, val, val3, timeout,
+ TIMER_ABSTIME, op & ~FUTEX_CMD_MASK);
+ error = futex_func_wait(shared, uaddr, val, val3, timeout,
+ clkid, TIMER_ABSTIME, retval);
+ SDT_PROBE1(futex, func, wait, exit, error);
+ break;
case FUTEX_WAKE:
- val3 = FUTEX_BITSET_MATCH_ANY;
- /* FALLTHROUGH */
+ SDT_PROBE4(futex, func, wake, entry,
+ uaddr, val, FUTEX_BITSET_MATCH_ANY, op & ~FUTEX_CMD_MASK);
+ error = futex_func_wake(shared, uaddr, val,
+ FUTEX_BITSET_MATCH_ANY, retval);
+ SDT_PROBE2(futex, func, wake, exit, error, *retval);
+ break;
+
case FUTEX_WAKE_BITSET:
- return futex_func_wake(shared, uaddr, val, val3, retval);
+ SDT_PROBE4(futex, func, wake, entry,
+ uaddr, val, val3, op & ~FUTEX_CMD_MASK);
+ error = futex_func_wake(shared, uaddr, val, val3, retval);
+ SDT_PROBE2(futex, func, wake, exit, error, *retval);
+ break;
case FUTEX_REQUEUE:
+ SDT_PROBE5(futex, func, requeue, entry,
+ uaddr, val, uaddr2, val2, op & ~FUTEX_CMD_MASK);
+ error = futex_func_requeue(shared, op, uaddr, val, uaddr2,
+ val2, 0, retval);
+ SDT_PROBE2(futex, func, requeue, exit, error, *retval);
+ break;
+
case FUTEX_CMP_REQUEUE:
- return futex_func_requeue(shared, op, uaddr, val, uaddr2,
+ SDT_PROBE6(futex, func, cmp_requeue, entry,
+ uaddr, val, uaddr2, val2, val3, op & ~FUTEX_CMD_MASK);
+ error = futex_func_requeue(shared, op, uaddr, val, uaddr2,
val2, val3, retval);
-
- case FUTEX_WAIT_BITSET:
- return futex_func_wait(shared, uaddr, val, val3, timeout,
- clkid, TIMER_ABSTIME, retval);
+ SDT_PROBE2(futex, func, cmp_requeue, exit, error, *retval);
+ break;
case FUTEX_WAKE_OP:
- return futex_func_wake_op(shared, uaddr, val, uaddr2, val2,
+ SDT_PROBE6(futex, func, wake_op, entry,
+ uaddr, val, uaddr2, val2, val3, op & ~FUTEX_CMD_MASK);
+ error = futex_func_wake_op(shared, uaddr, val, uaddr2, val2,
val3, retval);
+ SDT_PROBE2(futex, func, wake_op, exit, error, *retval);
+ break;
case FUTEX_FD:
default:
- return ENOSYS;
+ error = ENOSYS;
+ break;
}
+
+ return error;
}
/*
@@ -1730,6 +1927,7 @@ release_futex(uintptr_t const uptr, lwpi
struct futex *f;
int oldval, newval, actual;
int error;
+ bool shared;
/* If it's misaligned, tough. */
if (__predict_false(uptr & 3))
@@ -1760,8 +1958,11 @@ release_futex(uintptr_t const uptr, lwpi
*/
if (__predict_false(is_pending && (oldval & ~FUTEX_WAITERS) == 0)) {
register_t retval;
- (void) futex_func_wake(/*shared*/true, uaddr, 1,
+ error = futex_func_wake(/*shared*/true, uaddr, 1,
FUTEX_BITSET_MATCH_ANY, &retval);
+ if (error != 0 || retval == 0)
+ (void) futex_func_wake(/*shared*/false, uaddr, 1,
+ FUTEX_BITSET_MATCH_ANY, &retval);
return;
}
@@ -1805,22 +2006,23 @@ release_futex(uintptr_t const uptr, lwpi
}
/*
- * Look for a shared futex since we have no positive indication
- * it is private. If we can't, tough.
- */
- error = futex_lookup(uaddr, /*shared*/true, &f);
- if (error)
- return;
-
- /*
- * If there's no kernel state for this futex, there's nothing to
- * release.
+ * Look for a futex. Try shared first, then private. If we
+ * can't find one, tough.
+ *
+ * Note: the assumption here is that anyone placing a futex on
+ * the robust list is adhering to the rules, regardless of the
+ * futex class.
*/
- if (f == NULL)
- return;
+ for (f = NULL, shared = true; f == NULL; shared = false) {
+ error = futex_lookup(uaddr, shared, FUTEX_CLASS_ANY, &f);
+ if (error)
+ return;
+ if (f == NULL && shared == false)
+ return;
+ }
- /* Work under the futex queue lock. */
- futex_queue_lock(f);
+ /* Work under the futex op lock. */
+ futex_op_lock(f);
/*
* Fetch the word: if the tid doesn't match ours, skip;
@@ -1844,11 +2046,14 @@ release_futex(uintptr_t const uptr, lwpi
*
* XXX eventual PI handling?
*/
- if (oldval & FUTEX_WAITERS)
- (void)futex_wake(f, 1, NULL, 0, FUTEX_BITSET_MATCH_ANY);
+ if (oldval & FUTEX_WAITERS) {
+ (void)futex_wake(f, FUTEX_WRITERQ, 1,
+ NULL, FUTEX_WRITERQ, 0,
+ FUTEX_BITSET_MATCH_ANY);
+ }
/* Unlock the queue and release the futex. */
-out: futex_queue_unlock(f);
+out: futex_op_unlock(f);
futex_rele(f);
}
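For context on the futex_wake() call above: release_futex() assumes the usual robust-futex word convention, with the owning LWP's TID in the low bits and FUTEX_WAITERS set once a thread has blocked in the kernel. A rough sketch of that convention (constants from <sys/futex.h>; the helper is illustrative and not part of this change):

#include <sys/types.h>
#include <sys/futex.h>

/*
 * Illustrative only: the lock-word convention release_futex() relies on.
 * The owner's TID occupies FUTEX_TID_MASK; FUTEX_WAITERS is set once a
 * thread has blocked in the kernel, which is what makes the wake above
 * necessary when the owner exits without unlocking.
 */
static bool
robust_word_needs_wake(uint32_t word, uint32_t dead_tid)
{

	if ((word & FUTEX_TID_MASK) != dead_tid)
		return false;		/* not owned by the exiting LWP */
	return (word & FUTEX_WAITERS) != 0;
}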
Index: src/sys/sys/lwp.h
diff -u src/sys/sys/lwp.h:1.212 src/sys/sys/lwp.h:1.212.14.1
--- src/sys/sys/lwp.h:1.212 Fri Oct 23 00:25:45 2020
+++ src/sys/sys/lwp.h Thu Aug 5 23:14:21 2021
@@ -1,4 +1,4 @@
-/* $NetBSD: lwp.h,v 1.212 2020/10/23 00:25:45 thorpej Exp $ */
+/* $NetBSD: lwp.h,v 1.212.14.1 2021/08/05 23:14:21 thorpej Exp $ */
/*
* Copyright (c) 2001, 2006, 2007, 2008, 2009, 2010, 2019, 2020
@@ -72,6 +72,7 @@ static __inline struct cpu_info *lwp_get
* S: l_selcluster->sc_lock
* (: unlocked, stable
* !: unlocked, may only be reliably accessed by the LWP itself
+ * x: special; see comments for field.
*
* Fields are clustered together by usage (to increase the likelihood
* of cache hits) and by size (to reduce dead space in the structure).
@@ -79,6 +80,7 @@ static __inline struct cpu_info *lwp_get
#include <sys/pcu.h>
+struct futex;
struct lockdebug;
struct sysent;
@@ -139,9 +141,19 @@ struct lwp {
u_int l_slptime; /* l: time since last blocked */
bool l_vforkwaiting; /* a: vfork() waiting */
- /* User-space synchronization. */
+ /*
+ * User-space synchronization.
+ *
+ * Special locking considerations:
+ *
+ * l_futex and l_futex_wakesel are accessed unlocked and private
+ * to the LWP *unless* the LWP is present on a futex sleepq, in
+ * which case they are protected by the lwp_lock (which will in
+ * actuality be the futex sleepq lock).
+ */
uintptr_t l_robust_head; /* !: list of robust futexes */
- uint32_t l___rsvd1; /* reserved for future use */
+ struct futex *l_futex; /* x: futex we're waiting on */
+ uint32_t l_futex_wakesel;/* x: futex wake selector */
#if PCU_UNIT_COUNT > 0
struct cpu_info * volatile l_pcu_cpu[PCU_UNIT_COUNT];