Module Name:    src
Committed By:   thorpej
Date:           Thu Aug  5 23:14:21 UTC 2021

Modified Files:
        src/sys/kern [thorpej-futex2]: sys_futex.c
        src/sys/sys [thorpej-futex2]: lwp.h

Log Message:
Bring over just the futex sleepq infrastructure changes from thorpej-futex
to a new branch based on current HEAD.  This contains only the fixes for
the priority problems, and is intended to finish debugging those changes
(without the new extensions).


To generate a diff of this commit:
cvs rdiff -u -r1.12 -r1.12.4.1 src/sys/kern/sys_futex.c
cvs rdiff -u -r1.212 -r1.212.14.1 src/sys/sys/lwp.h

Please note that diffs are not public domain; they are subject to the
copyright notices on the relevant files.

Modified files:

Index: src/sys/kern/sys_futex.c
diff -u src/sys/kern/sys_futex.c:1.12 src/sys/kern/sys_futex.c:1.12.4.1
--- src/sys/kern/sys_futex.c:1.12	Wed Jul 21 06:35:45 2021
+++ src/sys/kern/sys_futex.c	Thu Aug  5 23:14:21 2021
@@ -1,4 +1,4 @@
-/*	$NetBSD: sys_futex.c,v 1.12 2021/07/21 06:35:45 skrll Exp $	*/
+/*	$NetBSD: sys_futex.c,v 1.12.4.1 2021/08/05 23:14:21 thorpej Exp $	*/
 
 /*-
  * Copyright (c) 2018, 2019, 2020 The NetBSD Foundation, Inc.
@@ -30,7 +30,7 @@
  */
 
 #include <sys/cdefs.h>
-__KERNEL_RCSID(0, "$NetBSD: sys_futex.c,v 1.12 2021/07/21 06:35:45 skrll Exp $");
+__KERNEL_RCSID(0, "$NetBSD: sys_futex.c,v 1.12.4.1 2021/08/05 23:14:21 thorpej Exp $");
 
 /*
  * Futexes
@@ -121,7 +121,9 @@ __KERNEL_RCSID(0, "$NetBSD: sys_futex.c,
 #include <sys/futex.h>
 #include <sys/mutex.h>
 #include <sys/rbtree.h>
+#include <sys/sleepq.h>
 #include <sys/queue.h>
+#include <sys/sdt.h>
 
 #include <sys/syscall.h>
 #include <sys/syscallargs.h>
@@ -130,13 +132,56 @@ __KERNEL_RCSID(0, "$NetBSD: sys_futex.c,
 #include <uvm/uvm_extern.h>
 
 /*
+ * DTrace probes.
+ */
+SDT_PROVIDER_DEFINE(futex);
+
+/* entry: uaddr, val, bitset, timeout, clkflags, fflags */
+/* exit: error */
+SDT_PROBE_DEFINE6(futex, func, wait, entry, "int *", "int", "int",
+		  "struct timespec *", "int", "int");
+SDT_PROBE_DEFINE1(futex, func, wait, exit, "int");
+
+/* entry: uaddr, nwake, bitset, fflags */
+/* exit: error, nwoken */
+SDT_PROBE_DEFINE4(futex, func, wake, entry, "int *", "int", "int", "int");
+SDT_PROBE_DEFINE2(futex, func, wake, exit, "int", "int");
+
+/* entry: uaddr, nwake, uaddr2, nrequeue, fflags */
+/* exit: error, nwoken */
+SDT_PROBE_DEFINE5(futex, func, requeue, entry, "int *", "int", "int *", "int",
+		  "int");
+SDT_PROBE_DEFINE2(futex, func, requeue, exit, "int", "int");
+
+/* entry: uaddr, nwake, uaddr2, nrequeue, val3, fflags */
+/* exit: error, nwoken */
+SDT_PROBE_DEFINE6(futex, func, cmp_requeue, entry, "int *", "int", "int *",
+		  "int", "int", "int");
+SDT_PROBE_DEFINE2(futex, func, cmp_requeue, exit, "int", "int");
+
+/* entry: uaddr, nwake, uaddr2, nwake2, wakeop, fflags */
+/* exit: error, nwoken */
+SDT_PROBE_DEFINE6(futex, func, wake_op, entry, "int *", "int", "int *", "int",
+		  "int", "int");
+SDT_PROBE_DEFINE2(futex, func, wake_op, exit, "int", "int");
+
+SDT_PROBE_DEFINE0(futex, wait, finish, normally);
+SDT_PROBE_DEFINE0(futex, wait, finish, wakerace);
+SDT_PROBE_DEFINE0(futex, wait, finish, aborted);
+
+/* entry: timo */
+/* exit: error */
+SDT_PROBE_DEFINE1(futex, wait, sleepq_block, entry, "int");
+SDT_PROBE_DEFINE1(futex, wait, sleepq_block, exit, "int");
+
+/*
  * Lock order:
  *
  *	futex_tab.lock
- *	futex::fx_qlock			ordered by kva of struct futex
- *	 -> futex_wait::fw_lock		only one at a time
- *	futex_wait::fw_lock		only one at a time
- *	 -> futex::fx_abortlock		only one at a time
+ *	futex::fx_op_lock		ordered by kva of struct futex
+ *	 -> futex::fx_sq_lock		ordered by kva of sleepq lock
+ *
+ * N.B. multiple futexes can share a single sleepq lock.
  */
 
 /*
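
A minimal sketch of the ordering described in the comment above, using only
names from this diff (given a struct futex *f; error handling elided;
fx_op_lock is an adaptive mutex, fx_sq_lock a spin mutex that may be shared
by several futexes hashing to the same sleepq lock):

	mutex_enter(&f->fx_op_lock);		/* serialize operations on f */
	mutex_spin_enter(f->fx_sq_lock);	/* then the (shared) sleepq lock */
	/* ... examine or modify f->fx_sqs[] and f->fx_nwaiters[] ... */
	mutex_spin_exit(f->fx_sq_lock);
	mutex_exit(&f->fx_op_lock);
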
@@ -166,37 +211,52 @@ union futex_key {
  *	that can be present on the system at any given time, and the
  *	assumption is that limit will be good enough on a 32-bit platform.
  *	See futex_wake() for why overflow needs to be avoided.
+ *
+ *	XXX Since futex addresses must be 4-byte aligned, we could
+ *	XXX squirrel away fx_shared and fx_on_tree bits in the "va"
+ *	XXX field of the key.  Worth it?
  */
 struct futex {
 	union futex_key		fx_key;
+	struct rb_node		fx_node;
 	unsigned long		fx_refcnt;
 	bool			fx_shared;
 	bool			fx_on_tree;
-	struct rb_node		fx_node;
+	uint8_t			fx_class;
 
-	kmutex_t			fx_qlock;
-	TAILQ_HEAD(, futex_wait)	fx_queue;
-
-	kmutex_t			fx_abortlock;
-	LIST_HEAD(, futex_wait)		fx_abortlist;
-	kcondvar_t			fx_abortcv;
+	kmutex_t		fx_op_lock;	/* adaptive */
+	kmutex_t *		fx_sq_lock;	/* &sleepq_locks[...] */
+	sleepq_t		fx_sqs[2];	/* 0=reader, 1=writer */
+	unsigned int		fx_nwaiters[2];
 };
 
 /*
- * struct futex_wait
- *
- *	State for a thread to wait on a futex.  Threads wait on fw_cv
- *	for fw_bitset to be set to zero.  The thread may transition to
- *	a different futex queue at any time under the futex's lock.
- */
-struct futex_wait {
-	kmutex_t		fw_lock;
-	kcondvar_t		fw_cv;
-	struct futex		*fw_futex;
-	TAILQ_ENTRY(futex_wait)	fw_entry;	/* queue lock */
-	LIST_ENTRY(futex_wait)	fw_abort;	/* queue abortlock */
-	int			fw_bitset;
-	bool			fw_aborting;	/* fw_lock */
+ * futex classes: Some futex operations can only be carried out on
+ * futexes that are known to be abiding by a certain protocol.  These
+ * classes are assigned to a futex when it is created by a wait
+ * operation; when subsequent wake or requeue operations are issued,
+ * the class is checked at futex lookup time, and a mismatch results
+ * in EINVAL.
+ */
+#define	FUTEX_CLASS_ANY		0		/* match any class in lookup */
+#define	FUTEX_CLASS_NORMAL	1		/* normal futex */
+#define	FUTEX_CLASS_RWLOCK	2		/* rwlock futex */
+#define	FUTEX_CLASS_PI		3		/* for FUTEX_*_PI ops */
+
+/* sleepq definitions */
+#define	FUTEX_READERQ		0
+#define	FUTEX_WRITERQ		1
+
+static const char futex_wmesg[] = "futex";
+
+static void	futex_unsleep(lwp_t *, bool);
+
+static syncobj_t futex_syncobj = {
+	.sobj_flag	= SOBJ_SLEEPQ_SORTED,
+	.sobj_unsleep	= futex_unsleep,
+	.sobj_changepri	= sleepq_changepri,
+	.sobj_lendpri	= sleepq_lendpri,
+	.sobj_owner	= syncobj_noowner,
 };
 
 /*
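
To illustrate the class check, a hypothetical call sequence (the uaddr and
shared arguments are assumed; the EINVAL behavior comes from
futex_lookup_by_key() later in this diff):

	struct futex *f;
	int error;

	/* A FUTEX_WAIT creates the futex with FUTEX_CLASS_NORMAL... */
	error = futex_lookup_create(uaddr, shared, FUTEX_CLASS_NORMAL, &f);

	/* ...so a wake/requeue lookup demanding another class fails... */
	error = futex_lookup(uaddr, shared, FUTEX_CLASS_RWLOCK, &f);
	/* error == EINVAL */

	/* ...while FUTEX_CLASS_ANY matches whatever class exists. */
	error = futex_lookup(uaddr, shared, FUTEX_CLASS_ANY, &f);
	/* error == 0, f references the existing futex */
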
@@ -280,7 +340,95 @@ static const rb_tree_ops_t futex_shared_
 	.rbto_node_offset = offsetof(struct futex, fx_node),
 };
 
-static void	futex_wait_dequeue(struct futex_wait *, struct futex *);
+/*
+ * futex_sq_lock2(f, f2)
+ *
+ *	Acquire the sleepq locks for f and f2, which may be null, or
+ *	which may be the same.  If they are distinct, an arbitrary total
+ *	order is chosen on the locks.
+ *
+ *	Callers should only ever acquire multiple sleepq locks
+ *	simultaneously using futex_sq_lock2.
+ */
+static void
+futex_sq_lock2(struct futex * const f, struct futex * const f2)
+{
+
+	/*
+	 * If both are null, do nothing; if one is null and the other
+	 * is not, lock the other and be done with it.
+	 */
+	if (f == NULL && f2 == NULL) {
+		return;
+	} else if (f == NULL) {
+		mutex_spin_enter(f2->fx_sq_lock);
+		return;
+	} else if (f2 == NULL) {
+		mutex_spin_enter(f->fx_sq_lock);
+		return;
+	}
+
+	kmutex_t * const m = f->fx_sq_lock;
+	kmutex_t * const m2 = f2->fx_sq_lock;
+
+	/* If both locks are the same, acquire only one. */
+	if (m == m2) {
+		mutex_spin_enter(m);
+		return;
+	}
+
+	/* Otherwise, use the ordering on the kva of the lock pointer.  */
+	if ((uintptr_t)m < (uintptr_t)m2) {
+		mutex_spin_enter(m);
+		mutex_spin_enter(m2);
+	} else {
+		mutex_spin_enter(m2);
+		mutex_spin_enter(m);
+	}
+}
+
+/*
+ * futex_sq_unlock2(f, f2)
+ *
+ *	Release the sleep queue locks for f and f2, which may be null, or
+ *	which may have the same underlying lock.
+ */
+static void
+futex_sq_unlock2(struct futex * const f, struct futex * const f2)
+{
+
+	/*
+	 * If both are null, do nothing; if one is null and the other
+	 * is not, unlock the other and be done with it.
+	 */
+	if (f == NULL && f2 == NULL) {
+		return;
+	} else if (f == NULL) {
+		mutex_spin_exit(f2->fx_sq_lock);
+		return;
+	} else if (f2 == NULL) {
+		mutex_spin_exit(f->fx_sq_lock);
+		return;
+	}
+
+	kmutex_t * const m = f->fx_sq_lock;
+	kmutex_t * const m2 = f2->fx_sq_lock;
+
+	/* If both locks are the same, release only one. */
+	if (m == m2) {
+		mutex_spin_exit(m);
+		return;
+	}
+
+	/* Otherwise, use the ordering on the kva of the lock pointer.  */
+	if ((uintptr_t)m < (uintptr_t)m2) {
+		mutex_spin_exit(m2);
+		mutex_spin_exit(m);
+	} else {
+		mutex_spin_exit(m);
+		mutex_spin_exit(m2);
+	}
+}
 
 /*
  * futex_load(uaddr, kaddr)
@@ -313,6 +461,11 @@ futex_test(int *uaddr, int expected)
 	return val == expected;
 }
 
+static pool_cache_t	futex_cache		__read_mostly;
+
+static int	futex_ctor(void *, void *, int);
+static void	futex_dtor(void *, void *);
+
 /*
  * futex_sys_init()
  *
@@ -325,6 +478,10 @@ futex_sys_init(void)
 	mutex_init(&futex_tab.lock, MUTEX_DEFAULT, IPL_NONE);
 	rb_tree_init(&futex_tab.va, &futex_rb_ops);
 	rb_tree_init(&futex_tab.oa, &futex_shared_rb_ops);
+
+	futex_cache = pool_cache_init(sizeof(struct futex),
+	    coherency_unit, 0, 0, "futex", NULL, IPL_NONE, futex_ctor,
+	    futex_dtor, NULL);
 }
 
 /*
@@ -342,69 +499,38 @@ futex_sys_fini(void)
 }
 
 /*
- * futex_queue_init(f)
+ * futex_ctor()
  *
- *	Initialize the futex queue.  Caller must call futex_queue_fini
- *	when done.
- *
- *	Never sleeps.
+ *	Futex object constructor.
  */
-static void
-futex_queue_init(struct futex *f)
+static int
+futex_ctor(void *arg __unused, void *obj, int flags __unused)
 {
+	extern sleepqlock_t sleepq_locks[SLEEPTAB_HASH_SIZE];
+	struct futex * const f = obj;
 
-	mutex_init(&f->fx_qlock, MUTEX_DEFAULT, IPL_NONE);
-	mutex_init(&f->fx_abortlock, MUTEX_DEFAULT, IPL_NONE);
-	cv_init(&f->fx_abortcv, "fqabort");
-	LIST_INIT(&f->fx_abortlist);
-	TAILQ_INIT(&f->fx_queue);
-}
-
-/*
- * futex_queue_drain(f)
- *
- *	Wait for any aborting waiters in f; then empty the queue of
- *	any stragglers and wake them.  Caller must guarantee no new
- *	references to f.
- *
- *	May sleep.
- */
-static void
-futex_queue_drain(struct futex *f)
-{
-	struct futex_wait *fw, *fw_next;
+	mutex_init(&f->fx_op_lock, MUTEX_DEFAULT, IPL_NONE);
+	f->fx_sq_lock = &sleepq_locks[SLEEPTAB_HASH(f)].lock;
 
-	mutex_enter(&f->fx_abortlock);
-	while (!LIST_EMPTY(&f->fx_abortlist))
-		cv_wait(&f->fx_abortcv, &f->fx_abortlock);
-	mutex_exit(&f->fx_abortlock);
+	sleepq_init(&f->fx_sqs[FUTEX_READERQ]);
+	sleepq_init(&f->fx_sqs[FUTEX_WRITERQ]);
+	f->fx_nwaiters[FUTEX_READERQ] = f->fx_nwaiters[FUTEX_WRITERQ] = 0;
 
-	mutex_enter(&f->fx_qlock);
-	TAILQ_FOREACH_SAFE(fw, &f->fx_queue, fw_entry, fw_next) {
-		mutex_enter(&fw->fw_lock);
-		futex_wait_dequeue(fw, f);
-		cv_broadcast(&fw->fw_cv);
-		mutex_exit(&fw->fw_lock);
-	}
-	mutex_exit(&f->fx_qlock);
+	return 0;
 }
 
 /*
- * futex_queue_fini(fq)
+ * futex_dtor()
  *
- *	Finalize the futex queue initialized by futex_queue_init.  Queue
- *	must be empty.  Caller must not use f again until a subsequent
- *	futex_queue_init.
+ *	Futex object destructor.
  */
 static void
-futex_queue_fini(struct futex *f)
+futex_dtor(void *arg __unused, void *obj)
 {
+	struct futex * const f = obj;
 
-	KASSERT(TAILQ_EMPTY(&f->fx_queue));
-	KASSERT(LIST_EMPTY(&f->fx_abortlist));
-	mutex_destroy(&f->fx_qlock);
-	mutex_destroy(&f->fx_abortlock);
-	cv_destroy(&f->fx_abortcv);
+	mutex_destroy(&f->fx_op_lock);
+	f->fx_sq_lock = NULL;
 }
 
 /*
@@ -452,11 +578,11 @@ futex_key_fini(union futex_key *fk, bool
  *	Never sleeps for memory, but may sleep to acquire a lock.
  */
 static struct futex *
-futex_create(union futex_key *fk, bool shared)
+futex_create(union futex_key *fk, bool shared, uint8_t class)
 {
 	struct futex *f;
 
-	f = kmem_alloc(sizeof(*f), KM_NOSLEEP);
+	f = pool_cache_get(futex_cache, PR_NOWAIT);
 	if (f == NULL) {
 		futex_key_fini(fk, shared);
 		return NULL;
@@ -465,7 +591,7 @@ futex_create(union futex_key *fk, bool s
 	f->fx_refcnt = 1;
 	f->fx_shared = shared;
 	f->fx_on_tree = false;
-	futex_queue_init(f);
+	f->fx_class = class;
 
 	return f;
 }
@@ -486,18 +612,18 @@ futex_destroy(struct futex *f)
 
 	KASSERT(atomic_load_relaxed(&f->fx_refcnt) == 0);
 	KASSERT(!f->fx_on_tree);
-
-	/* Drain and destroy the private queue.  */
-	futex_queue_drain(f);
-	futex_queue_fini(f);
+	KASSERT(LIST_EMPTY(&f->fx_sqs[FUTEX_READERQ]));
+	KASSERT(LIST_EMPTY(&f->fx_sqs[FUTEX_WRITERQ]));
+	KASSERT(f->fx_nwaiters[FUTEX_READERQ] == 0);
+	KASSERT(f->fx_nwaiters[FUTEX_WRITERQ] == 0);
 
 	futex_key_fini(&f->fx_key, f->fx_shared);
 
-	kmem_free(f, sizeof(*f));
+	pool_cache_put(futex_cache, f);
 }
 
 /*
- * futex_hold(f)
+ * futex_hold_count(f, n)
  *
  *	Attempt to acquire a reference to f.  Return 0 on success,
  *	ENFILE on too many references.
@@ -505,21 +631,23 @@ futex_destroy(struct futex *f)
  *	Never sleeps.
  */
 static int
-futex_hold(struct futex *f)
+futex_hold_count(struct futex *f, unsigned long const count)
 {
 	unsigned long refcnt;
 
 	do {
 		refcnt = atomic_load_relaxed(&f->fx_refcnt);
-		if (refcnt == ULONG_MAX)
+		if (ULONG_MAX - refcnt < count)
 			return ENFILE;
-	} while (atomic_cas_ulong(&f->fx_refcnt, refcnt, refcnt + 1) != refcnt);
+	} while (atomic_cas_ulong(&f->fx_refcnt, refcnt,
+				  refcnt + count) != refcnt);
 
 	return 0;
 }
+#define	futex_hold(f)	futex_hold_count(f, 1)
 
 /*
- * futex_rele(f)
+ * futex_rele_count(f, n)
  *
  *	Release a reference to f acquired with futex_create or
  *	futex_hold.
@@ -527,7 +655,7 @@ futex_hold(struct futex *f)
  *	May sleep to free f.
  */
 static void
-futex_rele(struct futex *f)
+futex_rele_count(struct futex *f, unsigned long const count)
 {
 	unsigned long refcnt;
 
@@ -535,14 +663,17 @@ futex_rele(struct futex *f)
 
 	do {
 		refcnt = atomic_load_relaxed(&f->fx_refcnt);
-		if (refcnt == 1)
+		KASSERT(refcnt >= count);
+		if (refcnt - count == 0)
 			goto trylast;
-	} while (atomic_cas_ulong(&f->fx_refcnt, refcnt, refcnt - 1) != refcnt);
+	} while (atomic_cas_ulong(&f->fx_refcnt, refcnt,
+				  refcnt - count) != refcnt);
 	return;
 
 trylast:
+	KASSERT(count <= LONG_MAX);
 	mutex_enter(&futex_tab.lock);
-	if (atomic_dec_ulong_nv(&f->fx_refcnt) == 0) {
+	if (atomic_add_long_nv(&f->fx_refcnt, -(long)count) == 0) {
 		if (f->fx_on_tree) {
 			if (__predict_false(f->fx_shared))
 				rb_tree_remove_node(&futex_tab.oa, f);
@@ -558,9 +689,10 @@ trylast:
 	if (f != NULL)
 		futex_destroy(f);
 }
+#define	futex_rele(f)	futex_rele_count(f, 1)
 
 /*
- * futex_rele_not_last(f)
+ * futex_rele_count_not_last(f, n)
  *
  *	Release a reference to f acquired with futex_create or
  *	futex_hold.
@@ -569,18 +701,19 @@ trylast:
  *	reference to f.
  */
 static void
-futex_rele_not_last(struct futex *f)
+futex_rele_count_not_last(struct futex *f, unsigned long const count)
 {
 	unsigned long refcnt;
 
 	do {
 		refcnt = atomic_load_relaxed(&f->fx_refcnt);
-		KASSERT(refcnt > 1);
-	} while (atomic_cas_ulong(&f->fx_refcnt, refcnt, refcnt - 1) != refcnt);
+		KASSERT(refcnt >= count);
+	} while (atomic_cas_ulong(&f->fx_refcnt, refcnt,
+				  refcnt - count) != refcnt);
 }
 
 /*
- * futex_lookup_by_key(key, shared, &f)
+ * futex_lookup_by_key(key, shared, class, &f)
  *
 *	Try to find an existing futex va reference in the specified key.
  *	On success, return 0, set f to found futex or to NULL if not found,
@@ -592,7 +725,8 @@ futex_rele_not_last(struct futex *f)
  *	futex_lookup_create().
  */
 static int
-futex_lookup_by_key(union futex_key *fk, bool shared, struct futex **fp)
+futex_lookup_by_key(union futex_key *fk, bool shared, uint8_t class,
+    struct futex **fp)
 {
 	struct futex *f;
 	int error = 0;
@@ -604,7 +738,11 @@ futex_lookup_by_key(union futex_key *fk,
 		f = rb_tree_find_node(&futex_tab.va, fk);
 	}
 	if (f) {
-		error = futex_hold(f);
+		if (__predict_true(f->fx_class == class ||
+				   class == FUTEX_CLASS_ANY))
+			error = futex_hold(f);
+		else
+			error = EINVAL;
 		if (error)
 			f = NULL;
 	}
@@ -655,7 +793,7 @@ out:	mutex_exit(&futex_tab.lock);
 }
 
 /*
- * futex_lookup(uaddr, shared, &f)
+ * futex_lookup(uaddr, shared, class, &f)
  *
  *	Find a futex at the userland pointer uaddr in the current
  *	process's VM space.  On success, return the futex in f and
@@ -664,7 +802,7 @@ out:	mutex_exit(&futex_tab.lock);
  *	Caller must call futex_rele when done.
  */
 static int
-futex_lookup(int *uaddr, bool shared, struct futex **fp)
+futex_lookup(int *uaddr, bool shared, uint8_t class, struct futex **fp)
 {
 	union futex_key fk;
 	struct vmspace *vm = curproc->p_vmspace;
@@ -675,7 +813,7 @@ futex_lookup(int *uaddr, bool shared, st
 	 * Reject unaligned user pointers so we don't cross page
 	 * boundaries and so atomics will work.
 	 */
-	if ((va & 3) != 0)
+	if (__predict_false((va & 3) != 0))
 		return EINVAL;
 
 	/* Look it up. */
@@ -683,12 +821,13 @@ futex_lookup(int *uaddr, bool shared, st
 	if (error)
 		return error;
 
-	error = futex_lookup_by_key(&fk, shared, fp);
+	error = futex_lookup_by_key(&fk, shared, class, fp);
 	futex_key_fini(&fk, shared);
 	if (error)
 		return error;
 
 	KASSERT(*fp == NULL || (*fp)->fx_shared == shared);
+	KASSERT(*fp == NULL || (*fp)->fx_class == class);
 	KASSERT(*fp == NULL || atomic_load_relaxed(&(*fp)->fx_refcnt) != 0);
 
 	/*
@@ -702,7 +841,7 @@ futex_lookup(int *uaddr, bool shared, st
 }
 
 /*
- * futex_lookup_create(uaddr, shared, &f)
+ * futex_lookup_create(uaddr, shared, class, &f)
  *
  *	Find or create a futex at the userland pointer uaddr in the
  *	current process's VM space.  On success, return the futex in f
@@ -711,7 +850,7 @@ futex_lookup(int *uaddr, bool shared, st
  *	Caller must call futex_rele when done.
  */
 static int
-futex_lookup_create(int *uaddr, bool shared, struct futex **fp)
+futex_lookup_create(int *uaddr, bool shared, uint8_t class, struct futex **fp)
 {
 	union futex_key fk;
 	struct vmspace *vm = curproc->p_vmspace;
@@ -723,7 +862,10 @@ futex_lookup_create(int *uaddr, bool sha
 	 * Reject unaligned user pointers so we don't cross page
 	 * boundaries and so atomics will work.
 	 */
-	if ((va & 3) != 0)
+	if (__predict_false((va & 3) != 0))
+		return EINVAL;
+
+	if (__predict_false(class == FUTEX_CLASS_ANY))
 		return EINVAL;
 
 	error = futex_key_init(&fk, vm, va, shared);
@@ -734,7 +876,7 @@ futex_lookup_create(int *uaddr, bool sha
 	 * Optimistically assume there already is one, and try to find
 	 * it.
 	 */
-	error = futex_lookup_by_key(&fk, shared, fp);
+	error = futex_lookup_by_key(&fk, shared, class, fp);
 	if (error || *fp != NULL) {
 		/*
 		 * We either found one, or there was an error.
@@ -748,7 +890,7 @@ futex_lookup_create(int *uaddr, bool sha
 	 * Create a futex record.  This transfers ownership of the key
 	 * in all cases.
 	 */
-	f = futex_create(&fk, shared);
+	f = futex_create(&fk, shared, class);
 	if (f == NULL) {
 		error = ENOMEM;
 		goto out;
@@ -775,222 +917,220 @@ out:	if (f != NULL)
 }
 
 /*
- * futex_wait_init(fw, bitset)
- *
- *	Initialize a record for a thread to wait on a futex matching
- *	the specified bit set.  Should be passed to futex_wait_enqueue
- *	before futex_wait, and should be passed to futex_wait_fini when
- *	done.
- */
-static void
-futex_wait_init(struct futex_wait *fw, int bitset)
-{
-
-	KASSERT(bitset);
-
-	mutex_init(&fw->fw_lock, MUTEX_DEFAULT, IPL_NONE);
-	cv_init(&fw->fw_cv, "futex");
-	fw->fw_futex = NULL;
-	fw->fw_bitset = bitset;
-	fw->fw_aborting = false;
-}
-
-/*
- * futex_wait_fini(fw)
- *
- *	Finalize a record for a futex waiter.  Must not be on any
- *	futex's queue.
- */
-static void
-futex_wait_fini(struct futex_wait *fw)
-{
-
-	KASSERT(fw->fw_futex == NULL);
-
-	cv_destroy(&fw->fw_cv);
-	mutex_destroy(&fw->fw_lock);
-}
-
-/*
- * futex_wait_enqueue(fw, f)
+ * futex_unsleep:
  *
- *	Put fw on the futex queue.  Must be done before futex_wait.
- *	Caller must hold fw's lock and f's lock, and fw must not be on
- *	any existing futex's waiter list.
+ *	Remove an LWP from the futex and sleep queue.  This is called when
+ *	the LWP has not been awoken normally but instead interrupted: for
+ *	example, when a signal is received.  Must be called with the LWP
+ *	locked.  Will unlock if "unlock" is true.
  */
 static void
-futex_wait_enqueue(struct futex_wait *fw, struct futex *f)
+futex_unsleep(lwp_t *l, bool unlock)
 {
+	struct futex *f __diagused;
 
-	KASSERT(mutex_owned(&f->fx_qlock));
-	KASSERT(mutex_owned(&fw->fw_lock));
-	KASSERT(fw->fw_futex == NULL);
-	KASSERT(!fw->fw_aborting);
-
-	fw->fw_futex = f;
-	TAILQ_INSERT_TAIL(&f->fx_queue, fw, fw_entry);
-}
+	f = (struct futex *)(uintptr_t)l->l_wchan;
 
-/*
- * futex_wait_dequeue(fw, f)
- *
- *	Remove fw from the futex queue.  Precludes subsequent
- *	futex_wait until a futex_wait_enqueue.  Caller must hold fw's
- *	lock and f's lock, and fw must be on f.
- */
-static void
-futex_wait_dequeue(struct futex_wait *fw, struct futex *f)
-{
+	KASSERT(mutex_owned(f->fx_sq_lock));
 
-	KASSERT(mutex_owned(&f->fx_qlock));
-	KASSERT(mutex_owned(&fw->fw_lock));
-	KASSERT(fw->fw_futex == f);
+	/* WRITERQ is more likely */
+	if (__predict_true(l->l_sleepq == &f->fx_sqs[FUTEX_WRITERQ])) {
+		KASSERT(f->fx_nwaiters[FUTEX_WRITERQ] > 0);
+		f->fx_nwaiters[FUTEX_WRITERQ]--;
+	} else {
+		KASSERT(l->l_sleepq == &f->fx_sqs[FUTEX_READERQ]);
+		KASSERT(f->fx_nwaiters[FUTEX_READERQ] > 0);
+		f->fx_nwaiters[FUTEX_READERQ]--;
+	}
 
-	TAILQ_REMOVE(&f->fx_queue, fw, fw_entry);
-	fw->fw_futex = NULL;
+	sleepq_unsleep(l, unlock);
 }
 
 /*
- * futex_wait_abort(fw)
+ * futex_wait(f, q, deadline, clkid, bitset)
  *
- *	Caller is no longer waiting for fw.  Remove it from any queue
- *	if it was on one.  Caller must hold fw->fw_lock.
+ *	Wait on sleep queue q of the futex until the deadline on the
+ *	clock clkid, or forever if deadline is NULL, for a futex wakeup.
+ *	Return 0 on explicit wakeup, ETIMEDOUT on timeout, EINTR on signal.
  */
-static void
-futex_wait_abort(struct futex_wait *fw)
+static int
+futex_wait(struct futex *f, int q, const struct timespec *deadline,
+    clockid_t clkid, int bitset)
 {
-	struct futex *f;
-
-	KASSERT(mutex_owned(&fw->fw_lock));
 
 	/*
-	 * Grab the futex queue.  It can't go away as long as we hold
-	 * fw_lock.  However, we can't take the queue lock because
-	 * that's a lock order reversal.
+	 * Some things to note about how this function works:
+	 *
+	 * ==> When we enter futex_wait(), the futex's op lock is
+	 * held, preventing us from missing wakes.  Once we are on
+	 * the futex's sleepq, it is safe to release the op lock.
+	 *
+	 * ==> We have a single early escape to avoid calling
+	 * sleepq_block(): our deadline already having passed.
+	 * If this is a no-timeout wait, or if the deadline has
+	 * not already passed, we are guaranteed to call sleepq_block(). 
+	 *
+	 * ==> sleepq_block() contains all of the necessary logic
+	 * for handling signals; we do not need to check for them
+	 * ourselves.
+	 *
+	 * ==> Once we have blocked, other futex operations will
+	 * be able to wake us or requeue us.  Until we have blocked,
+	 * those other futex operations will not be able to acquire
+	 * the sleepq locks in order to perform those operations,
+	 * even if we have dropped the op lock.  Thus, we will not
+	 * miss wakes.  This fundamental invariant is relied upon by
+	 * every other synchronization object in the kernel.
+	 *
+	 * ==> sleepq_block() returns for one of three reasons:
+	 *
+	 *	-- We were awakened.
+	 *	-- We were signaled.
+	 *	-- We timed out.
+	 *
+	 * ==> Once sleepq_block() returns, we will not call
+	 * sleepq_block() again.
+	 *
+	 * ==> If sleepq_block() returns due to being signaled,
+	 * we must check to see if we were also awakened.  This
+	 * is to handle the following sequence:
+	 *
+	 *	-- Futex wake takes us off sleepq, places us on runq.
+	 *	-- We are signaled while sitting on the runq.
+	 *	-- We run, and sleepq_block() notices the signal and
+	 *	   returns EINTR/ERESTART.
+	 *
+	 * In this situation, we want to indicate a successful wake
+	 * to the caller, because that wake has been reported to the
+	 * thread that issued it.
+	 *
+	 * Note that it is NOT possible for a wake to be issued after
+	 * a signal; the LWP will already have been taken off the sleepq
+	 * by the signal, so the wake will never happen.  For this
+	 * reason, we must *never* return ERESTART, because it could
+	 * result in us waiting again with no one to awaken us.
 	 */
-	f = fw->fw_futex;
 
-	/* Put us on the abort list so that fq won't go away.  */
-	mutex_enter(&f->fx_abortlock);
-	LIST_INSERT_HEAD(&f->fx_abortlist, fw, fw_abort);
-	mutex_exit(&f->fx_abortlock);
+	struct lwp * const l = curlwp;
+	int timo;
+	int error;
 
 	/*
-	 * Mark fw as aborting so it won't lose wakeups and won't be
-	 * transferred to any other queue.
+	 * Compute our timeout before taking any locks.
 	 */
-	fw->fw_aborting = true;
+	if (deadline) {
+		struct timespec ts;
 
-	/* f is now stable, so we can release fw_lock.  */
-	mutex_exit(&fw->fw_lock);
+		/* Check our watch.  */
+		error = clock_gettime1(clkid, &ts);
+		if (error) {
+			mutex_exit(&f->fx_op_lock);
+			return error;
+		}
 
-	/* Now we can remove fw under the queue lock.  */
-	mutex_enter(&f->fx_qlock);
-	mutex_enter(&fw->fw_lock);
-	futex_wait_dequeue(fw, f);
-	mutex_exit(&fw->fw_lock);
-	mutex_exit(&f->fx_qlock);
+		/*
+		 * If we're past the deadline, ETIMEDOUT.
+		 */
+		if (timespeccmp(deadline, &ts, <=)) {
+			mutex_exit(&f->fx_op_lock);
+			return ETIMEDOUT;
+		} else {
+			/* Count how much time is left.  */
+			timespecsub(deadline, &ts, &ts);
+			timo = tstohz(&ts);
+			if (timo == 0) {
+				/*
+				 * tstohz() already rounds up, and
+				 * a return value of 0 ticks means
+				 * "expired now or in the past".
+				 */
+				mutex_exit(&f->fx_op_lock);
+				return ETIMEDOUT;
+			}
+		}
+	} else {
+		timo = 0;
+	}
 
 	/*
-	 * Finally, remove us from the abort list and notify anyone
-	 * waiting for the abort to complete if we were the last to go.
+	 * Acquire in sleepq -> lwp order.  While we're at it, ensure
+	 * that there's room for us to block.
 	 */
-	mutex_enter(&f->fx_abortlock);
-	LIST_REMOVE(fw, fw_abort);
-	if (LIST_EMPTY(&f->fx_abortlist))
-		cv_broadcast(&f->fx_abortcv);
-	mutex_exit(&f->fx_abortlock);
+	mutex_spin_enter(f->fx_sq_lock);
+	if (__predict_false(q == FUTEX_READERQ &&
+			    f->fx_nwaiters[q] == FUTEX_TID_MASK)) {
+		mutex_spin_exit(f->fx_sq_lock);
+		mutex_exit(&f->fx_op_lock);
+		return ENFILE;
+	}
+	lwp_lock(l);
 
 	/*
-	 * Release our reference to the futex now that we are not
-	 * waiting for it.
+	 * No need for locks here; until we're on a futex's sleepq, these
+	 * values are private to the LWP, and the locks needed to get onto
+	 * the sleepq provide the memory barriers needed to ensure visibility.
 	 */
-	futex_rele(f);
+	l->l_futex = f;
+	l->l_futex_wakesel = bitset;
 
 	/*
-	 * Reacquire the fw lock as caller expects.  Verify that we're
-	 * aborting and no longer associated with a futex.
+	 * This is an inlined bit of sleepq_enter();
+	 * we already hold the lwp lock.
 	 */
-	mutex_enter(&fw->fw_lock);
-	KASSERT(fw->fw_aborting);
-	KASSERT(fw->fw_futex == NULL);
-}
-
-/*
- * futex_wait(fw, deadline, clkid)
- *
- *	fw must be a waiter on a futex's queue.  Wait until deadline on
- *	the clock clkid, or forever if deadline is NULL, for a futex
- *	wakeup.  Return 0 on explicit wakeup or destruction of futex,
- *	ETIMEDOUT on timeout, EINTR/ERESTART on signal.  Either way, fw
- *	will no longer be on a futex queue on return.
- */
-static int
-futex_wait(struct futex_wait *fw, const struct timespec *deadline,
-    clockid_t clkid)
-{
-	int error = 0;
-
-	/* Test and wait under the wait lock.  */
-	mutex_enter(&fw->fw_lock);
-
-	for (;;) {
-		/* If we're done yet, stop and report success.  */
-		if (fw->fw_bitset == 0 || fw->fw_futex == NULL) {
-			error = 0;
-			break;
-		}
-
-		/* If anything went wrong in the last iteration, stop.  */
-		if (error)
-			break;
+	lwp_unlock_to(l, f->fx_sq_lock);
+	KERNEL_UNLOCK_ALL(NULL, &l->l_biglocks);
+	KASSERT(lwp_locked(l, f->fx_sq_lock));
 
-		/* Not done yet.  Wait.  */
-		if (deadline) {
-			struct timespec ts;
-
-			/* Check our watch.  */
-			error = clock_gettime1(clkid, &ts);
-			if (error)
-				break;
+	sleepq_enqueue(&f->fx_sqs[q], f, futex_wmesg, &futex_syncobj, true);
+	f->fx_nwaiters[q]++;
 
-			/* If we're past the deadline, ETIMEDOUT.  */
-			if (timespeccmp(deadline, &ts, <=)) {
-				error = ETIMEDOUT;
-				break;
-			}
+	/* We can now safely release the op lock. */
+	mutex_exit(&f->fx_op_lock);
 
-			/* Count how much time is left.  */
-			timespecsub(deadline, &ts, &ts);
+	SDT_PROBE1(futex, wait, sleepq_block, entry, timo);
+	error = sleepq_block(timo, true);
+	SDT_PROBE1(futex, wait, sleepq_block, exit, error);
 
-			/* Wait for that much time, allowing signals.  */
-			error = cv_timedwait_sig(&fw->fw_cv, &fw->fw_lock,
-			    tstohz(&ts));
-		} else {
-			/* Wait indefinitely, allowing signals. */
-			error = cv_wait_sig(&fw->fw_cv, &fw->fw_lock);
-		}
-	}
+	/*
+	 * f is no longer valid; we may have been moved to another
+	 * futex sleepq while we slept.
+	 */
 
 	/*
-	 * If we were woken up, the waker will have removed fw from the
-	 * queue.  But if anything went wrong, we must remove fw from
-	 * the queue ourselves.  While here, convert EWOULDBLOCK to
-	 * ETIMEDOUT.
+	 * If something went wrong, we may still have a futex reference.
+	 * Check for that here and drop it if so.  We can do this without
+	 * taking any locks because l->l_futex is private to the LWP when
+	 * not on any futex's sleepq, and we are not on any sleepq because
+	 * we are running.
+	 *
+	 * Convert EWOULDBLOCK to ETIMEDOUT in case sleepq_block() returned
+	 * EWOULDBLOCK, and ensure the only other error we return is EINTR.
 	 */
 	if (error) {
-		futex_wait_abort(fw);
+		f = l->l_futex;
+		if (f != NULL) {
+			SDT_PROBE0(futex, wait, finish, aborted);
+			l->l_futex = NULL;
+			futex_rele(f);
+		} else {
+			/* Raced with wakeup; report success. */
+			SDT_PROBE0(futex, wait, finish, wakerace);
+			error = 0;
+		}
 		if (error == EWOULDBLOCK)
 			error = ETIMEDOUT;
+		else if (error != ETIMEDOUT)
+			error = EINTR;
+	} else {
+		KASSERT(l->l_futex == NULL);
+		SDT_PROBE0(futex, wait, finish, normally);
 	}
 
-	mutex_exit(&fw->fw_lock);
-
 	return error;
 }
 
 /*
- * futex_wake(f, nwake, f2, nrequeue, bitset)
+ * futex_wake(f, q, nwake, f2, q2, nrequeue, bitset)
  *
  *	Wake up to nwake waiters on f matching bitset; then, if f2 is
  *	provided, move up to nrequeue remaining waiters on f matching
@@ -998,118 +1138,123 @@ futex_wait(struct futex_wait *fw, const 
  *	Caller must hold the locks of f and f2, if provided.
  */
 static unsigned
-futex_wake(struct futex *f, unsigned nwake, struct futex *f2,
-    unsigned nrequeue, int bitset)
+futex_wake(struct futex *f, int q, unsigned int const nwake,
+    struct futex *f2, int q2, unsigned int const nrequeue,
+    int bitset)
 {
-	struct futex_wait *fw, *fw_next;
+	struct lwp *l, *l_next;
 	unsigned nwoken = 0;
-	int hold_error __diagused;
+	unsigned nrequeued = 0;
 
-	KASSERT(mutex_owned(&f->fx_qlock));
-	KASSERT(f2 == NULL || mutex_owned(&f2->fx_qlock));
+	KASSERT(mutex_owned(&f->fx_op_lock));
+	KASSERT(f2 == NULL || mutex_owned(&f2->fx_op_lock));
+
+	futex_sq_lock2(f, f2);
 
 	/* Wake up to nwake waiters, and count the number woken.  */
-	TAILQ_FOREACH_SAFE(fw, &f->fx_queue, fw_entry, fw_next) {
-		if ((fw->fw_bitset & bitset) == 0)
-			continue;
-		if (nwake > 0) {
-			mutex_enter(&fw->fw_lock);
-			if (__predict_false(fw->fw_aborting)) {
-				mutex_exit(&fw->fw_lock);
-				continue;
-			}
-			futex_wait_dequeue(fw, f);
-			fw->fw_bitset = 0;
-			cv_broadcast(&fw->fw_cv);
-			mutex_exit(&fw->fw_lock);
-			nwake--;
-			nwoken++;
-			/*
-			 * Drop the futex reference on behalf of the
-			 * waiter.  We assert this is not the last
-			 * reference on the futex (our caller should
-			 * also have one).
-			 */
-			futex_rele_not_last(f);
-		} else {
+	LIST_FOREACH_SAFE(l, &f->fx_sqs[q], l_sleepchain, l_next) {
+		if (nwoken == nwake)
 			break;
-		}
+
+		KASSERT(l->l_syncobj == &futex_syncobj);
+		KASSERT(l->l_wchan == (wchan_t)f);
+		KASSERT(l->l_futex == f);
+		KASSERT(l->l_sleepq == &f->fx_sqs[q]);
+		KASSERT(l->l_mutex == f->fx_sq_lock);
+
+		if ((l->l_futex_wakesel & bitset) == 0)
+			continue;
+
+		l->l_futex_wakesel = 0;
+		l->l_futex = NULL;
+		sleepq_remove(&f->fx_sqs[q], l);
+		f->fx_nwaiters[q]--;
+		nwoken++;
+		/* hold counts adjusted in bulk below */
 	}
 
-	if (f2) {
+	if (nrequeue) {
+		KASSERT(f2 != NULL);
 		/* Move up to nrequeue waiters from f's queue to f2's queue. */
-		TAILQ_FOREACH_SAFE(fw, &f->fx_queue, fw_entry, fw_next) {
-			if ((fw->fw_bitset & bitset) == 0)
-				continue;
-			if (nrequeue > 0) {
-				mutex_enter(&fw->fw_lock);
-				if (__predict_false(fw->fw_aborting)) {
-					mutex_exit(&fw->fw_lock);
-					continue;
-				}
-				futex_wait_dequeue(fw, f);
-				futex_wait_enqueue(fw, f2);
-				mutex_exit(&fw->fw_lock);
-				nrequeue--;
-				/*
-				 * Transfer the reference from f to f2.
-				 * As above, we assert that we are not
-				 * dropping the last reference to f here.
-				 *
-				 * XXX futex_hold() could theoretically
-				 * XXX fail here.
-				 */
-				futex_rele_not_last(f);
-				hold_error = futex_hold(f2);
-				KASSERT(hold_error == 0);
-			} else {
+		LIST_FOREACH_SAFE(l, &f->fx_sqs[q], l_sleepchain, l_next) {
+			if (nrequeued == nrequeue)
 				break;
-			}
+
+			KASSERT(l->l_syncobj == &futex_syncobj);
+			KASSERT(l->l_wchan == (wchan_t)f);
+			KASSERT(l->l_futex == f);
+			KASSERT(l->l_sleepq == &f->fx_sqs[q]);
+			KASSERT(l->l_mutex == f->fx_sq_lock);
+
+			if ((l->l_futex_wakesel & bitset) == 0)
+				continue;
+
+			l->l_futex = f2;
+			sleepq_transfer(l, &f->fx_sqs[q],
+			    &f2->fx_sqs[q2], f2, futex_wmesg,
+			    &futex_syncobj, f2->fx_sq_lock, true);
+			f->fx_nwaiters[q]--;
+			f2->fx_nwaiters[q2]++;
+			nrequeued++;
+			/* hold counts adjusted in bulk below */
+		}
+		if (nrequeued) {
+			/* XXX futex_hold() could theoretically fail here. */
+			int hold_error __diagused =
+			    futex_hold_count(f2, nrequeued);
+			KASSERT(hold_error == 0);
 		}
-	} else {
-		KASSERT(nrequeue == 0);
 	}
 
-	/* Return the number of waiters woken.  */
-	return nwoken;
+	/*
+	 * Adjust the futex reference counts for the wakes and
+	 * requeues.
+	 */
+	KASSERT(nwoken + nrequeued <= LONG_MAX);
+	futex_rele_count_not_last(f, nwoken + nrequeued);
+
+	futex_sq_unlock2(f, f2);
+
+	/* Return the number of waiters woken and requeued.  */
+	return nwoken + nrequeued;
 }
 
 /*
- * futex_queue_lock(f)
+ * futex_op_lock(f)
  *
- *	Acquire the queue lock of f.  Pair with futex_queue_unlock.  Do
+ *	Acquire the op lock of f.  Pair with futex_op_unlock.  Do
  *	not use if caller needs to acquire two locks; use
- *	futex_queue_lock2 instead.
+ *	futex_op_lock2 instead.
  */
 static void
-futex_queue_lock(struct futex *f)
+futex_op_lock(struct futex *f)
 {
-	mutex_enter(&f->fx_qlock);
+	mutex_enter(&f->fx_op_lock);
 }
 
 /*
- * futex_queue_unlock(f)
+ * futex_op_unlock(f)
  *
 *	Release the op lock of f.
  */
 static void
-futex_queue_unlock(struct futex *f)
+futex_op_unlock(struct futex *f)
 {
-	mutex_exit(&f->fx_qlock);
+	mutex_exit(&f->fx_op_lock);
 }
 
 /*
- * futex_queue_lock2(f, f2)
+ * futex_op_lock2(f, f2)
  *
- *	Acquire the queue locks of both f and f2, which may be null, or
- *	which may have the same underlying queue.  If they are
- *	distinct, an arbitrary total order is chosen on the locks.
+ *	Acquire the op locks of both f and f2, which may be null, or
+ *	which may be the same.  If they are distinct, an arbitrary total
+ *	order is chosen on the locks.
  *
- *	Callers should only ever acquire multiple queue locks
- *	simultaneously using futex_queue_lock2.
+ *	Callers should only ever acquire multiple op locks
+ *	simultaneously using futex_op_lock2.
  */
 static void
-futex_queue_lock2(struct futex *f, struct futex *f2)
+futex_op_lock2(struct futex *f, struct futex *f2)
 {
 
 	/*
@@ -1119,37 +1264,37 @@ futex_queue_lock2(struct futex *f, struc
 	if (f == NULL && f2 == NULL) {
 		return;
 	} else if (f == NULL) {
-		mutex_enter(&f2->fx_qlock);
+		mutex_enter(&f2->fx_op_lock);
 		return;
 	} else if (f2 == NULL) {
-		mutex_enter(&f->fx_qlock);
+		mutex_enter(&f->fx_op_lock);
 		return;
 	}
 
 	/* If both futexes are the same, acquire only one. */
 	if (f == f2) {
-		mutex_enter(&f->fx_qlock);
+		mutex_enter(&f->fx_op_lock);
 		return;
 	}
 
 	/* Otherwise, use the ordering on the kva of the futex pointer.  */
 	if ((uintptr_t)f < (uintptr_t)f2) {
-		mutex_enter(&f->fx_qlock);
-		mutex_enter(&f2->fx_qlock);
+		mutex_enter(&f->fx_op_lock);
+		mutex_enter(&f2->fx_op_lock);
 	} else {
-		mutex_enter(&f2->fx_qlock);
-		mutex_enter(&f->fx_qlock);
+		mutex_enter(&f2->fx_op_lock);
+		mutex_enter(&f->fx_op_lock);
 	}
 }
 
 /*
- * futex_queue_unlock2(f, f2)
+ * futex_op_unlock2(f, f2)
  *
 *	Release the op locks of both f and f2, which may be null, or
 *	which may be the same.
  */
 static void
-futex_queue_unlock2(struct futex *f, struct futex *f2)
+futex_op_unlock2(struct futex *f, struct futex *f2)
 {
 
 	/*
@@ -1159,33 +1304,33 @@ futex_queue_unlock2(struct futex *f, str
 	if (f == NULL && f2 == NULL) {
 		return;
 	} else if (f == NULL) {
-		mutex_exit(&f2->fx_qlock);
+		mutex_exit(&f2->fx_op_lock);
 		return;
 	} else if (f2 == NULL) {
-		mutex_exit(&f->fx_qlock);
+		mutex_exit(&f->fx_op_lock);
 		return;
 	}
 
 	/* If both futexes are the same, release only one. */
 	if (f == f2) {
-		mutex_exit(&f->fx_qlock);
+		mutex_exit(&f->fx_op_lock);
 		return;
 	}
 
 	/* Otherwise, use the ordering on the kva of the futex pointer.  */
 	if ((uintptr_t)f < (uintptr_t)f2) {
-		mutex_exit(&f2->fx_qlock);
-		mutex_exit(&f->fx_qlock);
+		mutex_exit(&f2->fx_op_lock);
+		mutex_exit(&f->fx_op_lock);
 	} else {
-		mutex_exit(&f->fx_qlock);
-		mutex_exit(&f2->fx_qlock);
+		mutex_exit(&f->fx_op_lock);
+		mutex_exit(&f2->fx_op_lock);
 	}
 }
 
 /*
  * futex_func_wait(uaddr, val, val3, timeout, clkid, clkflags, retval)
  *
- *	Implement futex(FUTEX_WAIT).
+ *	Implement futex(FUTEX_WAIT) and futex(FUTEX_WAIT_BITSET).
  */
 static int
 futex_func_wait(bool shared, int *uaddr, int val, int val3,
@@ -1193,7 +1338,6 @@ futex_func_wait(bool shared, int *uaddr,
     register_t *retval)
 {
 	struct futex *f;
-	struct futex_wait wait, *fw = &wait;
 	struct timespec ts;
 	const struct timespec *deadline;
 	int error;
@@ -1210,62 +1354,71 @@ futex_func_wait(bool shared, int *uaddr,
 		return EAGAIN;
 
 	/* Determine a deadline on the specified clock.  */
-	if (timeout == NULL || (clkflags & TIMER_ABSTIME) == TIMER_ABSTIME) {
+	if (timeout == NULL) {
+		deadline = timeout;
+	} else if ((clkflags & TIMER_ABSTIME) == TIMER_ABSTIME) {
+		/* Sanity-check the deadline. */
+		if (timeout->tv_sec < 0 ||
+		    timeout->tv_nsec < 0 ||
+		    timeout->tv_nsec >= 1000000000L) {
+			return EINVAL;
+		}
 		deadline = timeout;
 	} else {
+		struct timespec interval = *timeout;
+
+		error = itimespecfix(&interval);
+		if (error)
+			return error;
 		error = clock_gettime1(clkid, &ts);
 		if (error)
 			return error;
-		timespecadd(&ts, timeout, &ts);
+		timespecadd(&ts, &interval, &ts);
 		deadline = &ts;
 	}
 
 	/* Get the futex, creating it if necessary.  */
-	error = futex_lookup_create(uaddr, shared, &f);
+	error = futex_lookup_create(uaddr, shared, FUTEX_CLASS_NORMAL, &f);
 	if (error)
 		return error;
 	KASSERT(f);
 
-	/* Get ready to wait.  */
-	futex_wait_init(fw, val3);
-
 	/*
-	 * Under the queue lock, check the value again: if it has
+	 * Under the op lock, check the value again: if it has
 	 * already changed, EAGAIN; otherwise enqueue the waiter.
 	 * Since FUTEX_WAKE will use the same lock and be done after
 	 * modifying the value, the order in which we check and enqueue
 	 * is immaterial.
 	 */
-	futex_queue_lock(f);
+	futex_op_lock(f);
 	if (!futex_test(uaddr, val)) {
-		futex_queue_unlock(f);
+		futex_op_unlock(f);
 		error = EAGAIN;
 		goto out;
 	}
-	mutex_enter(&fw->fw_lock);
-	futex_wait_enqueue(fw, f);
-	mutex_exit(&fw->fw_lock);
-	futex_queue_unlock(f);
+
+	/*
+	 * Now wait.  futex_wait() will drop our op lock once we
+	 * are entered into the sleep queue, thus ensuring atomicity
+	 * of wakes with respect to waits.
+	 */
+	error = futex_wait(f, FUTEX_WRITERQ, deadline, clkid, val3);
 
 	/*
 	 * We cannot drop our reference to the futex here, because
 	 * we might be enqueued on a different one when we are awakened.
-	 * The references will be managed on our behalf in the requeue
-	 * and wake cases.
+	 * The references will be managed on our behalf in the requeue,
+	 * wake, and error cases.
 	 */
 	f = NULL;
 
-	/* Wait. */
-	error = futex_wait(fw, deadline, clkid);
-	if (error)
-		goto out;
-
-	/* Return 0 on success, error on failure. */
-	*retval = 0;
+	if (__predict_true(error == 0)) {
+		/* Return 0 on success, error on failure. */
+		*retval = 0;
+	}
 
 out:	if (f != NULL)
 		futex_rele(f);
-	futex_wait_fini(fw);
 	return error;
 }
 
@@ -1288,7 +1441,7 @@ futex_func_wake(bool shared, int *uaddr,
 	}
 
 	/* Look up the futex, if any.  */
-	error = futex_lookup(uaddr, shared, &f);
+	error = futex_lookup(uaddr, shared, FUTEX_CLASS_NORMAL, &f);
 	if (error)
 		goto out;
 
@@ -1297,12 +1450,13 @@ futex_func_wake(bool shared, int *uaddr,
 		goto out;
 
 	/*
-	 * Under f's queue lock, wake the waiters and remember the
+	 * Under f's op lock, wake the waiters and remember the
 	 * number woken.
 	 */
-	futex_queue_lock(f);
-	nwoken = futex_wake(f, val, NULL, 0, val3);
-	futex_queue_unlock(f);
+	futex_op_lock(f);
+	nwoken = futex_wake(f, FUTEX_WRITERQ, val,
+			    NULL, FUTEX_WRITERQ, 0, val3);
+	futex_op_unlock(f);
 
 	/* Release the futex.  */
 	futex_rele(f);
@@ -1335,7 +1489,7 @@ futex_func_requeue(bool shared, int op, 
 	}
 
 	/* Look up the source futex, if any. */
-	error = futex_lookup(uaddr, shared, &f);
+	error = futex_lookup(uaddr, shared, FUTEX_CLASS_NORMAL, &f);
 	if (error)
 		goto out;
 
@@ -1347,22 +1501,24 @@ futex_func_requeue(bool shared, int op, 
 	 * We may need to create the destination futex because it's
 	 * entirely possible it does not currently have any waiters.
 	 */
-	error = futex_lookup_create(uaddr2, shared, &f2);
+	error = futex_lookup_create(uaddr2, shared, FUTEX_CLASS_NORMAL, &f2);
 	if (error)
 		goto out;
 
 	/*
-	 * Under the futexes' queue locks, check the value; if
+	 * Under the futexes' op locks, check the value; if
 	 * unchanged from val3, wake the waiters.
 	 */
-	futex_queue_lock2(f, f2);
+	futex_op_lock2(f, f2);
 	if (op == FUTEX_CMP_REQUEUE && !futex_test(uaddr, val3)) {
 		error = EAGAIN;
 	} else {
 		error = 0;
-		nwoken = futex_wake(f, val, f2, val2, FUTEX_BITSET_MATCH_ANY);
+		nwoken = futex_wake(f, FUTEX_WRITERQ, val,
+				    f2, FUTEX_WRITERQ, val2,
+				    FUTEX_BITSET_MATCH_ANY);
 	}
-	futex_queue_unlock2(f, f2);
+	futex_op_unlock2(f, f2);
 
 out:
 	/* Return the number of waiters woken or requeued.  */
@@ -1526,23 +1682,23 @@ futex_func_wake_op(bool shared, int *uad
 		goto out;
 
 	/* Look up the first futex, if any.  */
-	error = futex_lookup(uaddr, shared, &f);
+	error = futex_lookup(uaddr, shared, FUTEX_CLASS_NORMAL, &f);
 	if (error)
 		goto out;
 
 	/* Look up the second futex, if any.  */
-	error = futex_lookup(uaddr2, shared, &f2);
+	error = futex_lookup(uaddr2, shared, FUTEX_CLASS_NORMAL, &f2);
 	if (error)
 		goto out;
 
 	/*
-	 * Under the queue locks:
+	 * Under the op locks:
 	 *
 	 * 1. Read/modify/write: *uaddr2 op= oparg.
 	 * 2. Unconditionally wake uaddr.
-	 * 3. Conditionally wake uaddr2, if it previously matched val2.
+	 * 3. Conditionally wake uaddr2, if it previously matched val3.
 	 */
-	futex_queue_lock2(f, f2);
+	futex_op_lock2(f, f2);
 	do {
 		error = futex_load(uaddr2, &oldval);
 		if (error)
@@ -1552,15 +1708,18 @@ futex_func_wake_op(bool shared, int *uad
 		if (error)
 			goto out_unlock;
 	} while (actual != oldval);
-	nwoken = (f ? futex_wake(f, val, NULL, 0, FUTEX_BITSET_MATCH_ANY) : 0);
+	nwoken = (f ? futex_wake(f, FUTEX_WRITERQ, val,
+				 NULL, FUTEX_WRITERQ, 0,
+				 FUTEX_BITSET_MATCH_ANY) : 0);
 	if (f2 && futex_compute_cmp(oldval, val3))
-		nwoken += futex_wake(f2, val2, NULL, 0,
-		    FUTEX_BITSET_MATCH_ANY);
+		nwoken += futex_wake(f2, FUTEX_WRITERQ, val2,
+				     NULL, FUTEX_WRITERQ, 0,
+				     FUTEX_BITSET_MATCH_ANY);
 
 	/* Success! */
 	error = 0;
 out_unlock:
-	futex_queue_unlock2(f, f2);
+	futex_op_unlock2(f, f2);
 
 out:
 	/* Return the number of waiters woken. */
@@ -1587,38 +1746,76 @@ do_futex(int *uaddr, int op, int val, co
 	const bool shared = (op & FUTEX_PRIVATE_FLAG) ? false : true;
 	const clockid_t clkid = (op & FUTEX_CLOCK_REALTIME) ? CLOCK_REALTIME
 							    : CLOCK_MONOTONIC;
+	int error;
 
 	op &= FUTEX_CMD_MASK;
 
 	switch (op) {
 	case FUTEX_WAIT:
-		return futex_func_wait(shared, uaddr, val,
+		SDT_PROBE6(futex, func, wait, entry,
+		    uaddr, val, FUTEX_BITSET_MATCH_ANY, timeout,
+		    TIMER_RELTIME, op & ~FUTEX_CMD_MASK);
+		error = futex_func_wait(shared, uaddr, val,
 		    FUTEX_BITSET_MATCH_ANY, timeout, clkid, TIMER_RELTIME,
 		    retval);
+		SDT_PROBE1(futex, func, wait, exit, error);
+		break;
+
+	case FUTEX_WAIT_BITSET:
+		SDT_PROBE6(futex, func, wait, entry,
+		    uaddr, val, val3, timeout,
+		    TIMER_ABSTIME, op & ~FUTEX_CMD_MASK);
+		error = futex_func_wait(shared, uaddr, val, val3, timeout,
+		    clkid, TIMER_ABSTIME, retval);
+		SDT_PROBE1(futex, func, wait, exit, error);
+		break;
 
 	case FUTEX_WAKE:
-		val3 = FUTEX_BITSET_MATCH_ANY;
-		/* FALLTHROUGH */
+		SDT_PROBE4(futex, func, wake, entry,
+		    uaddr, val, FUTEX_BITSET_MATCH_ANY, op & ~FUTEX_CMD_MASK);
+		error = futex_func_wake(shared, uaddr, val,
+		    FUTEX_BITSET_MATCH_ANY, retval);
+		SDT_PROBE2(futex, func, wake, exit, error, *retval);
+		break;
+
 	case FUTEX_WAKE_BITSET:
-		return futex_func_wake(shared, uaddr, val, val3, retval);
+		SDT_PROBE4(futex, func, wake, entry,
+		    uaddr, val, val3, op & ~FUTEX_CMD_MASK);
+		error = futex_func_wake(shared, uaddr, val, val3, retval);
+		SDT_PROBE2(futex, func, wake, exit, error, *retval);
+		break;
 
 	case FUTEX_REQUEUE:
+		SDT_PROBE5(futex, func, requeue, entry,
+		    uaddr, val, uaddr2, val2, op & ~FUTEX_CMD_MASK);
+		error = futex_func_requeue(shared, op, uaddr, val, uaddr2,
+		    val2, 0, retval);
+		SDT_PROBE2(futex, func, requeue, exit, error, *retval);
+		break;
+
 	case FUTEX_CMP_REQUEUE:
-		return futex_func_requeue(shared, op, uaddr, val, uaddr2,
+		SDT_PROBE6(futex, func, cmp_requeue, entry,
+		    uaddr, val, uaddr2, val2, val3, op & ~FUTEX_CMD_MASK);
+		error = futex_func_requeue(shared, op, uaddr, val, uaddr2,
 		    val2, val3, retval);
-
-	case FUTEX_WAIT_BITSET:
-		return futex_func_wait(shared, uaddr, val, val3, timeout,
-		    clkid, TIMER_ABSTIME, retval);
+		SDT_PROBE2(futex, func, cmp_requeue, exit, error, *retval);
+		break;
 
 	case FUTEX_WAKE_OP:
-		return futex_func_wake_op(shared, uaddr, val, uaddr2, val2,
+		SDT_PROBE6(futex, func, wake_op, entry,
+		    uaddr, val, uaddr2, val2, val3, op & ~FUTEX_CMD_MASK);
+		error = futex_func_wake_op(shared, uaddr, val, uaddr2, val2,
 		    val3, retval);
+		SDT_PROBE2(futex, func, wake_op, exit, error, *retval);
+		break;
 
 	case FUTEX_FD:
 	default:
-		return ENOSYS;
+		error = ENOSYS;
+		break;
 	}
+
+	return error;
 }
 
 /*
@@ -1730,6 +1927,7 @@ release_futex(uintptr_t const uptr, lwpi
 	struct futex *f;
 	int oldval, newval, actual;
 	int error;
+	bool shared;
 
 	/* If it's misaligned, tough.  */
 	if (__predict_false(uptr & 3))
@@ -1760,8 +1958,11 @@ release_futex(uintptr_t const uptr, lwpi
 	 */
 	if (__predict_false(is_pending && (oldval & ~FUTEX_WAITERS) == 0)) {
 		register_t retval;
-		(void) futex_func_wake(/*shared*/true, uaddr, 1,
+		error = futex_func_wake(/*shared*/true, uaddr, 1,
 		    FUTEX_BITSET_MATCH_ANY, &retval);
+		if (error != 0 || retval == 0)
+			(void) futex_func_wake(/*shared*/false, uaddr, 1,
+			    FUTEX_BITSET_MATCH_ANY, &retval);
 		return;
 	}
 
@@ -1805,22 +2006,23 @@ release_futex(uintptr_t const uptr, lwpi
 	}
 
 	/*
-	 * Look for a shared futex since we have no positive indication
-	 * it is private.  If we can't, tough.
-	 */
-	error = futex_lookup(uaddr, /*shared*/true, &f);
-	if (error)
-		return;
-
-	/*
-	 * If there's no kernel state for this futex, there's nothing to
-	 * release.
+	 * Look for a futex.  Try shared first, then private.  If we
+	 * can't find one, tough.
+	 *
+	 * Note: the assumption here is that anyone placing a futex on
+	 * the robust list is adhering to the rules, regardless of the
+	 * futex class.
 	 */
-	if (f == NULL)
-		return;
+	for (f = NULL, shared = true; f == NULL; shared = false) {
+		error = futex_lookup(uaddr, shared, FUTEX_CLASS_ANY, &f);
+		if (error)
+			return;
+		if (f == NULL && shared == false)
+			return;
+	}
 
-	/* Work under the futex queue lock.  */
-	futex_queue_lock(f);
+	/* Work under the futex op lock.  */
+	futex_op_lock(f);
 
 	/*
 	 * Fetch the word: if the tid doesn't match ours, skip;
@@ -1844,11 +2046,14 @@ release_futex(uintptr_t const uptr, lwpi
 	 *
 	 * XXX eventual PI handling?
 	 */
-	if (oldval & FUTEX_WAITERS)
-		(void)futex_wake(f, 1, NULL, 0, FUTEX_BITSET_MATCH_ANY);
+	if (oldval & FUTEX_WAITERS) {
+		(void)futex_wake(f, FUTEX_WRITERQ, 1,
+				 NULL, FUTEX_WRITERQ, 0,
+				 FUTEX_BITSET_MATCH_ANY);
+	}
 
 	/* Drop the op lock and release the futex.  */
-out:	futex_queue_unlock(f);
+out:	futex_op_unlock(f);
 	futex_rele(f);
 }
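
For orientation, a condensed sketch of the FUTEX_WAIT path as it looks after
this change, assembled from the hunks above (a hypothetical helper; error
paths and reference counting are simplified, and this is not a verbatim
excerpt from the file):

	static int
	futex_wait_sketch(int *uaddr, int val, int bitset)
	{
		struct futex *f;
		int error;

		/* Find or create the kernel futex for this address. */
		error = futex_lookup_create(uaddr, /*shared*/false,
		    FUTEX_CLASS_NORMAL, &f);
		if (error)
			return error;

		/* Re-test the value under the op lock so wakes can't be lost. */
		futex_op_lock(f);
		if (!futex_test(uaddr, val)) {
			futex_op_unlock(f);
			futex_rele(f);
			return EAGAIN;
		}

		/*
		 * futex_wait() enqueues us on f->fx_sqs[FUTEX_WRITERQ] under
		 * fx_sq_lock, drops the op lock, and calls sleepq_block();
		 * the futex reference is managed on our behalf in the wake,
		 * requeue, and error cases.
		 */
		return futex_wait(f, FUTEX_WRITERQ, /*deadline*/NULL,
		    CLOCK_MONOTONIC, bitset);
	}
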
 

Index: src/sys/sys/lwp.h
diff -u src/sys/sys/lwp.h:1.212 src/sys/sys/lwp.h:1.212.14.1
--- src/sys/sys/lwp.h:1.212	Fri Oct 23 00:25:45 2020
+++ src/sys/sys/lwp.h	Thu Aug  5 23:14:21 2021
@@ -1,4 +1,4 @@
-/*	$NetBSD: lwp.h,v 1.212 2020/10/23 00:25:45 thorpej Exp $	*/
+/*	$NetBSD: lwp.h,v 1.212.14.1 2021/08/05 23:14:21 thorpej Exp $	*/
 
 /*
  * Copyright (c) 2001, 2006, 2007, 2008, 2009, 2010, 2019, 2020
@@ -72,6 +72,7 @@ static __inline struct cpu_info *lwp_get
  * S:	l_selcluster->sc_lock
  * (:	unlocked, stable
  * !:	unlocked, may only be reliably accessed by the LWP itself
+ * x:	special; see comments for field.
  *
  * Fields are clustered together by usage (to increase the likelihood
  * of cache hits) and by size (to reduce dead space in the structure).
@@ -79,6 +80,7 @@ static __inline struct cpu_info *lwp_get
 
 #include <sys/pcu.h>
 
+struct futex;
 struct lockdebug;
 struct sysent;
 
@@ -139,9 +141,19 @@ struct lwp {
 	u_int		l_slptime;	/* l: time since last blocked */
 	bool		l_vforkwaiting;	/* a: vfork() waiting */
 
-	/* User-space synchronization. */
+	/*
+	 * User-space synchronization.
+	 *
+	 * Special locking considerations:
+	 *
+	 * l_futex and l_futex_wakesel are accessed unlocked and private
+	 * to the LWP *unless* the LWP is present on a futex sleepq, in
+	 * which case they are protected by the lwp_lock (which will in
+	 * actuality be the futex sleepq lock).
+	 */
 	uintptr_t	l_robust_head;	/* !: list of robust futexes */
-	uint32_t	l___rsvd1;	/* reserved for future use */
+	struct futex	*l_futex;	/* x: futex we're waiting on */
+	uint32_t	l_futex_wakesel;/* x: futex wake selector */
 
 #if PCU_UNIT_COUNT > 0
 	struct cpu_info	* volatile l_pcu_cpu[PCU_UNIT_COUNT];
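
A minimal sketch of the access rule the new comment describes, assembled
from the futex_wait() hunk above (not a complete excerpt; l is curlwp):

	/* Not on any sleepq yet: the fields are private to curlwp. */
	l->l_futex = f;
	l->l_futex_wakesel = bitset;

	/* Hand the LWP lock over to the futex's sleepq lock... */
	lwp_unlock_to(l, f->fx_sq_lock);
	sleepq_enqueue(&f->fx_sqs[q], f, futex_wmesg, &futex_syncobj, true);

	/* ...after which wakers may clear l_futex under f->fx_sq_lock. */
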
