Module Name:    src
Committed By:   ad
Date:           Sun Jan 19 21:08:30 UTC 2020

Modified Files:
        src/sys/arch/amd64/amd64 [ad-namecache]: genassym.cf lock_stubs.S
        src/sys/arch/i386/i386 [ad-namecache]: genassym.cf lock_stubs.S
        src/sys/kern [ad-namecache]: kern_rwlock.c kern_synch.c
        src/sys/sys [ad-namecache]: lwp.h

Log Message:
Adaptive rwlocks proposed on tech-kern and working well on this branch
with vnode locks.
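
For readers skimming the diff, the gist of "adaptive" here: waiters spin
while every lock holder is still on a CPU, and fall back to the existing
turnstile sleep (with priority inheritance) once a holder blocks.  A rough
sketch of that decision only -- can_acquire(), try_acquire() and
turnstile_block_on() are placeholders standing in for the real logic in
rw_vector_enter() below, not actual functions:

	static void
	rw_enter_sketch(krwlock_t *rw, krw_t op)
	{
		u_int count = SPINLOCK_BACKOFF_MIN;

		for (;;) {
			uintptr_t owner = rw->rw_owner;

			/* Free enough for this op?  Then just take it. */
			if (can_acquire(owner, op) &&
			    try_acquire(rw, owner, op))
				return;

			/*
			 * RW_SPIN set means the current holds are tracked
			 * and their owners are on CPU, so busy-wait and
			 * retry (never from soft interrupt context).
			 */
			if ((owner & RW_SPIN) != 0 && !cpu_softintr_p()) {
				SPINLOCK_BACKOFF(count);
				continue;
			}

			/*
			 * A holder has gone off CPU, or spinning is unsafe
			 * here: block on the turnstile as before.
			 */
			turnstile_block_on(rw, op);
		}
	}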


To generate a diff of this commit:
cvs rdiff -u -r1.80.2.1 -r1.80.2.2 src/sys/arch/amd64/amd64/genassym.cf
cvs rdiff -u -r1.35 -r1.35.2.1 src/sys/arch/amd64/amd64/lock_stubs.S
cvs rdiff -u -r1.117.2.1 -r1.117.2.2 src/sys/arch/i386/i386/genassym.cf
cvs rdiff -u -r1.32 -r1.32.2.1 src/sys/arch/i386/i386/lock_stubs.S
cvs rdiff -u -r1.59.2.2 -r1.59.2.3 src/sys/kern/kern_rwlock.c
cvs rdiff -u -r1.334.2.1 -r1.334.2.2 src/sys/kern/kern_synch.c
cvs rdiff -u -r1.192.2.1 -r1.192.2.2 src/sys/sys/lwp.h

Please note that diffs are not public domain; they are subject to the
copyright notices on the relevant files.

Modified files:

Index: src/sys/arch/amd64/amd64/genassym.cf
diff -u src/sys/arch/amd64/amd64/genassym.cf:1.80.2.1 src/sys/arch/amd64/amd64/genassym.cf:1.80.2.2
--- src/sys/arch/amd64/amd64/genassym.cf:1.80.2.1	Fri Jan 17 21:47:22 2020
+++ src/sys/arch/amd64/amd64/genassym.cf	Sun Jan 19 21:08:29 2020
@@ -1,4 +1,4 @@
-#	$NetBSD: genassym.cf,v 1.80.2.1 2020/01/17 21:47:22 ad Exp $
+#	$NetBSD: genassym.cf,v 1.80.2.2 2020/01/19 21:08:29 ad Exp $
 
 #
 # Copyright (c) 1998, 2006, 2007, 2008 The NetBSD Foundation, Inc.
@@ -78,7 +78,6 @@ include <sys/resourcevar.h>
 include <sys/device.h>
 include <sys/mbuf.h>
 include <sys/mutex.h>
-include <sys/rwlock.h>
 include <sys/cpu_data.h>
 include <sys/evcnt.h>
 include <sys/cpu.h>
@@ -346,15 +345,6 @@ define	MTX_IPL			offsetof(struct kmutex,
 define	MTX_LOCK		offsetof(struct kmutex, u.s.mtxs_lock)
 define	MTX_OWNER		offsetof(struct kmutex, u.mtxa_owner)
 
-define	RW_OWNER		offsetof(struct krwlock, rw_owner)
-define	RW_WRITE_LOCKED		RW_WRITE_LOCKED
-define	RW_WRITE_WANTED		RW_WRITE_WANTED
-define	RW_READ_INCR		RW_READ_INCR
-define	RW_HAS_WAITERS		RW_HAS_WAITERS
-define	RW_THREAD		RW_THREAD
-define	RW_READER		RW_READER
-define	RW_WRITER		RW_WRITER
-
 define	EV_COUNT		offsetof(struct evcnt, ev_count)
 
 define	OPTERON_MSR_PASSCODE	OPTERON_MSR_PASSCODE

Index: src/sys/arch/amd64/amd64/lock_stubs.S
diff -u src/sys/arch/amd64/amd64/lock_stubs.S:1.35 src/sys/arch/amd64/amd64/lock_stubs.S:1.35.2.1
--- src/sys/arch/amd64/amd64/lock_stubs.S:1.35	Sun Dec  8 20:00:56 2019
+++ src/sys/arch/amd64/amd64/lock_stubs.S	Sun Jan 19 21:08:29 2020
@@ -1,4 +1,4 @@
-/*	$NetBSD: lock_stubs.S,v 1.35 2019/12/08 20:00:56 ad Exp $	*/
+/*	$NetBSD: lock_stubs.S,v 1.35.2.1 2020/01/19 21:08:29 ad Exp $	*/
 
 /*
  * Copyright (c) 2006, 2007, 2008, 2009 The NetBSD Foundation, Inc.
@@ -185,126 +185,6 @@ ENTRY(mutex_spin_exit)
 
 END(mutex_spin_exit)
 
-/*
- * void	rw_enter(krwlock_t *rwl, krw_t op);
- *
- * Acquire one hold on a RW lock.
- */
-ENTRY(rw_enter)
-	cmpl	$RW_READER, %esi
-	jne	2f
-
-	/*
-	 * Reader: this is the most common case.
-	 */
-	movq	(%rdi), %rax
-0:
-	testb	$(RW_WRITE_LOCKED|RW_WRITE_WANTED), %al
-	jnz	3f
-	leaq	RW_READ_INCR(%rax), %rdx
-	LOCK
-	cmpxchgq %rdx, (%rdi)
-	jnz	1f
-	RET
-1:
-	jmp	0b
-
-	/*
-	 * Writer: if the compare-and-set fails, don't bother retrying.
-	 */
-2:	movq	CPUVAR(CURLWP), %rcx
-	xorq	%rax, %rax
-	orq	$RW_WRITE_LOCKED, %rcx
-	LOCK
-	cmpxchgq %rcx, (%rdi)
-	jnz	3f
-	RET
-3:
-	jmp	_C_LABEL(rw_vector_enter)
-END(rw_enter)
-
-/*
- * void	rw_exit(krwlock_t *rwl);
- *
- * Release one hold on a RW lock.
- */
-ENTRY(rw_exit)
-	movq	(%rdi), %rax
-	testb	$RW_WRITE_LOCKED, %al
-	jnz	2f
-
-	/*
-	 * Reader
-	 */
-0:	testb	$RW_HAS_WAITERS, %al
-	jnz	3f
-	cmpq	$RW_READ_INCR, %rax
-	jb	3f
-	leaq	-RW_READ_INCR(%rax), %rdx
-	LOCK
-	cmpxchgq %rdx, (%rdi)
-	jnz	1f
-	ret
-1:
-	jmp	0b
-
-	/*
-	 * Writer
-	 */
-2:	leaq	-RW_WRITE_LOCKED(%rax), %rdx
-	subq	CPUVAR(CURLWP), %rdx
-	jnz	3f
-	LOCK
-	cmpxchgq %rdx, (%rdi)
-	jnz	3f
-	ret
-
-3:	jmp	_C_LABEL(rw_vector_exit)
-END(rw_exit)
-
-/*
- * int	rw_tryenter(krwlock_t *rwl, krw_t op);
- *
- * Try to acquire one hold on a RW lock.
- */
-ENTRY(rw_tryenter)
-	cmpl	$RW_READER, %esi
-	jne	2f
-
-	/*
-	 * Reader: this is the most common case.
-	 */
-	movq	(%rdi), %rax
-0:
-	testb	$(RW_WRITE_LOCKED|RW_WRITE_WANTED), %al
-	jnz	4f
-	leaq	RW_READ_INCR(%rax), %rdx
-	LOCK
-	cmpxchgq %rdx, (%rdi)
-	jnz	1f
-	movl	%edx, %eax			/* nonzero */
-	RET
-1:
-	jmp	0b
-
-	/*
-	 * Writer: if the compare-and-set fails, don't bother retrying.
-	 */
-2:	movq	CPUVAR(CURLWP), %rcx
-	xorq	%rax, %rax
-	orq	$RW_WRITE_LOCKED, %rcx
-	LOCK
-	cmpxchgq %rcx, (%rdi)
-	movl	$0, %eax
-	setz	%al
-3:
-	RET
-	ret
-4:
-	xorl	%eax, %eax
-	jmp	3b
-END(rw_tryenter)
-
 #endif	/* LOCKDEBUG */
 
 /*

Index: src/sys/arch/i386/i386/genassym.cf
diff -u src/sys/arch/i386/i386/genassym.cf:1.117.2.1 src/sys/arch/i386/i386/genassym.cf:1.117.2.2
--- src/sys/arch/i386/i386/genassym.cf:1.117.2.1	Fri Jan 17 21:47:25 2020
+++ src/sys/arch/i386/i386/genassym.cf	Sun Jan 19 21:08:29 2020
@@ -1,4 +1,4 @@
-#	$NetBSD: genassym.cf,v 1.117.2.1 2020/01/17 21:47:25 ad Exp $
+#	$NetBSD: genassym.cf,v 1.117.2.2 2020/01/19 21:08:29 ad Exp $
 
 #
 # Copyright (c) 1998, 2006, 2007, 2008 The NetBSD Foundation, Inc.
@@ -79,7 +79,6 @@ include <sys/resourcevar.h>
 include <sys/device.h>
 include <sys/mbuf.h>
 include <sys/mutex.h>
-include <sys/rwlock.h>
 include <sys/cpu.h>
 
 include <netinet/in.h>
@@ -354,15 +353,6 @@ define	MTX_IPL			offsetof(struct kmutex,
 define	MTX_LOCK		offsetof(struct kmutex, mtx_lock)
 define	MTX_OWNER		offsetof(struct kmutex, u.mtxa_owner)
 
-define	RW_OWNER		offsetof(struct krwlock, rw_owner)
-define	RW_WRITE_LOCKED		RW_WRITE_LOCKED
-define	RW_WRITE_WANTED		RW_WRITE_WANTED
-define	RW_READ_INCR		RW_READ_INCR
-define	RW_HAS_WAITERS		RW_HAS_WAITERS
-define	RW_THREAD		RW_THREAD
-define	RW_READER		RW_READER
-define	RW_WRITER		RW_WRITER
-
 define	EV_COUNT		offsetof(struct evcnt, ev_count)
 
 define	OPTERON_MSR_PASSCODE	OPTERON_MSR_PASSCODE

Index: src/sys/arch/i386/i386/lock_stubs.S
diff -u src/sys/arch/i386/i386/lock_stubs.S:1.32 src/sys/arch/i386/i386/lock_stubs.S:1.32.2.1
--- src/sys/arch/i386/i386/lock_stubs.S:1.32	Sun Dec  8 20:00:56 2019
+++ src/sys/arch/i386/i386/lock_stubs.S	Sun Jan 19 21:08:29 2020
@@ -1,4 +1,4 @@
-/*	$NetBSD: lock_stubs.S,v 1.32 2019/12/08 20:00:56 ad Exp $	*/
+/*	$NetBSD: lock_stubs.S,v 1.32.2.1 2020/01/19 21:08:29 ad Exp $	*/
 
 /*-
  * Copyright (c) 2006, 2007, 2008, 2009 The NetBSD Foundation, Inc.
@@ -35,7 +35,7 @@
  */
 
 #include <machine/asm.h>
-__KERNEL_RCSID(0, "$NetBSD: lock_stubs.S,v 1.32 2019/12/08 20:00:56 ad Exp $");
+__KERNEL_RCSID(0, "$NetBSD: lock_stubs.S,v 1.32.2.1 2020/01/19 21:08:29 ad Exp $");
 
 #include "opt_lockdebug.h"
 
@@ -97,132 +97,6 @@ ENTRY(mutex_exit)
 	jmp	_C_LABEL(mutex_vector_exit)
 END(mutex_exit)
 
-/*
- * void rw_enter(krwlock_t *rwl, krw_t op);
- *
- * Acquire one hold on a RW lock.
- */
-ENTRY(rw_enter)
-	movl	4(%esp), %edx
-	cmpl	$RW_READER, 8(%esp)
-	jne	2f
-
-	/*
-	 * Reader
-	 */
-	movl	(%edx), %eax
-0:
-	testb	$(RW_WRITE_LOCKED|RW_WRITE_WANTED), %al
-	jnz	3f
-	leal	RW_READ_INCR(%eax), %ecx
-	LOCK(2)
-	cmpxchgl %ecx, (%edx)
-	jnz	1f
-	RET(2)
-1:
-	jmp	0b
-
-	/*
-	 * Writer
-	 */
-2:	xorl	%eax, %eax
-	movl	%fs:CPU_INFO_CURLWP(%eax), %ecx
-	orl	$RW_WRITE_LOCKED, %ecx
-	LOCK(3)
-	cmpxchgl %ecx, (%edx)
-	jnz	3f
-	RET(3)
-3:
-	jmp	_C_LABEL(rw_vector_enter)
-END(rw_enter)
-
-/*
- * void rw_exit(krwlock_t *rwl);
- *
- * Release one hold on a RW lock.
- */
-ENTRY(rw_exit)
-	movl	4(%esp), %edx
-	movl	(%edx), %eax
-	testb	$RW_WRITE_LOCKED, %al
-	jnz	2f
-
-	/*
-	 * Reader
-	 */
-0:	testb	$RW_HAS_WAITERS, %al
-	jnz	3f
-	cmpl	$RW_READ_INCR, %eax
-	jb	3f
-	leal	-RW_READ_INCR(%eax), %ecx
-	LOCK(4)
-	cmpxchgl %ecx, (%edx)
-	jnz	1f
-	ret
-1:
-	jmp	0b
-
-	/*
-	 * Writer
-	 */
-2:	leal	-RW_WRITE_LOCKED(%eax), %ecx
-	subl	CPUVAR(CURLWP), %ecx
-	jnz	3f
-	LOCK(5)
-	cmpxchgl %ecx, (%edx)
-	jnz	3f
-	ret
-
-	/*
-	 * Slow path.
-	 */
-3:	jmp	_C_LABEL(rw_vector_exit)
-END(rw_exit)
-
-/*
- * int rw_tryenter(krwlock_t *rwl, krw_t op);
- *
- * Try to acquire one hold on a RW lock.
- */
-ENTRY(rw_tryenter)
-	movl	4(%esp), %edx
-	cmpl	$RW_READER, 8(%esp)
-	jne	2f
-
-	/*
-	 * Reader
-	 */
-	movl	(%edx), %eax
-0:
-	testb	$(RW_WRITE_LOCKED|RW_WRITE_WANTED), %al
-	jnz	4f
-	leal	RW_READ_INCR(%eax), %ecx
-	LOCK(12)
-	cmpxchgl %ecx, (%edx)
-	jnz	1f
-	movl	%edx, %eax			/* nonzero */
-	RET(4)
-1:
-	jmp	0b
-
-	/*
-	 * Writer
-	 */
-2:
-	xorl	%eax, %eax
-	movl	%fs:CPU_INFO_CURLWP(%eax), %ecx
-	orl	$RW_WRITE_LOCKED, %ecx
-	LOCK(13)
-	cmpxchgl %ecx, (%edx)
-	movl	$0, %eax
-	setz	%al
-3:
-	RET(5)
-4:
-	xorl	%eax, %eax
-	jmp	3b
-END(rw_tryenter)
-
 #ifndef XENPV
 
 /*

Index: src/sys/kern/kern_rwlock.c
diff -u src/sys/kern/kern_rwlock.c:1.59.2.2 src/sys/kern/kern_rwlock.c:1.59.2.3
--- src/sys/kern/kern_rwlock.c:1.59.2.2	Sun Jan 19 19:44:34 2020
+++ src/sys/kern/kern_rwlock.c	Sun Jan 19 21:08:29 2020
@@ -1,4 +1,4 @@
-/*	$NetBSD: kern_rwlock.c,v 1.59.2.2 2020/01/19 19:44:34 ad Exp $	*/
+/*	$NetBSD: kern_rwlock.c,v 1.59.2.3 2020/01/19 21:08:29 ad Exp $	*/
 
 /*-
  * Copyright (c) 2002, 2006, 2007, 2008, 2009, 2019, 2020
@@ -36,10 +36,23 @@
  *
  *	Solaris Internals: Core Kernel Architecture, Jim Mauro and
  *	    Richard McDougall.
+ *
+ * The NetBSD implementation is different from that described in the book,
+ * in that the locks are adaptive.  Lock waiters spin-wait while the lock
+ * holders are on CPU (if the holds can be tracked: up to N per thread).
+ *
+ * While spin waiting, threads compete for the lock without the assistance
+ * of turnstiles.  If a lock holder sleeps for any reason, the lock waiters
+ * will also sleep in response and at that point turnstiles, priority
+ * inheritance and strong efforts at ensuring fairness come into play.
+ *
+ * The adaptive behaviour is controlled by the RW_SPIN flag bit, which is
+ * cleared by a lock owner that is going off the CPU, and set again by the
+ * lock owner that releases the last hold on the lock.
  */
 
 #include <sys/cdefs.h>
-__KERNEL_RCSID(0, "$NetBSD: kern_rwlock.c,v 1.59.2.2 2020/01/19 19:44:34 ad Exp $");
+__KERNEL_RCSID(0, "$NetBSD: kern_rwlock.c,v 1.59.2.3 2020/01/19 21:08:29 ad Exp $");
 
 #include "opt_lockdebug.h"
 
@@ -59,8 +72,6 @@ __KERNEL_RCSID(0, "$NetBSD: kern_rwlock.
 
 #include <dev/lockstat.h>
 
-#include <machine/rwlock.h>
-
 /*
  * LOCKDEBUG
  */
@@ -104,19 +115,6 @@ do { \
 #define	RW_MEMBAR_PRODUCER()		membar_producer()
 #endif
 
-/*
- * For platforms that do not provide stubs, or for the LOCKDEBUG case.
- */
-#ifdef LOCKDEBUG
-#undef	__HAVE_RW_STUBS
-#endif
-
-#ifndef __HAVE_RW_STUBS
-__strong_alias(rw_enter,rw_vector_enter);
-__strong_alias(rw_exit,rw_vector_exit);
-__strong_alias(rw_tryenter,rw_vector_tryenter);
-#endif
-
 static void	rw_abort(const char *, size_t, krwlock_t *, const char *);
 static void	rw_dump(const volatile void *, lockop_printer_t);
 static lwp_t	*rw_owner(wchan_t);
@@ -149,6 +147,22 @@ rw_cas(krwlock_t *rw, uintptr_t o, uintp
 }
 
 /*
+ * rw_and:
+ *
+ *	Do an atomic AND on the lock word.
+ */
+static inline void
+rw_and(krwlock_t *rw, uintptr_t m)
+{
+
+#ifdef _LP64
+	atomic_and_64(&rw->rw_owner, m);
+#else
+	atomic_and_32(&rw->rw_owner, m);
+#endif
+}
+
+/*
  * rw_swap:
  *
  *	Do an atomic swap of the lock word.  This is used only when it's
@@ -167,6 +181,75 @@ rw_swap(krwlock_t *rw, uintptr_t o, uint
 }
 
 /*
+ * rw_hold_remember:
+ *
+ *	Helper - when acquiring a lock, record the new hold.
+ */
+static inline uintptr_t
+rw_hold_remember(krwlock_t *rw, lwp_t *l)
+{
+	int i;
+
+	KASSERT(kpreempt_disabled());
+
+	for (i = 0; i < __arraycount(l->l_rwlocks); i++) {
+		if (__predict_true(l->l_rwlocks[i] == NULL)) {
+			l->l_rwlocks[i] = rw;
+			/*
+			 * Clear the write wanted flag on every acquire to
+			 * give readers a chance once again.
+			 */
+			return ~RW_WRITE_WANTED;
+		}
+	}
+
+	/*
+	 * Nowhere to track the hold so we lose: temporarily disable
+	 * spinning on the lock.
+	 */
+	return ~(RW_WRITE_WANTED | RW_SPIN);
+}
+
+/*
+ * rw_hold_forget:
+ *
+ *	Helper - when releasing a lock, stop tracking the hold.
+ */
+static inline void
+rw_hold_forget(krwlock_t *rw, lwp_t *l)
+{
+	int i;
+
+	KASSERT(kpreempt_disabled());
+
+	for (i = 0; i < __arraycount(l->l_rwlocks); i++) {
+		if (__predict_true(l->l_rwlocks[i] == rw)) {
+			l->l_rwlocks[i] = NULL;
+			return;
+		}
+	}
+}
+
+/*
+ * rw_switch:
+ *
+ *	Called by mi_switch() to indicate that an LWP is going off the CPU.
+ */
+void
+rw_switch(void)
+{
+	lwp_t *l = curlwp;
+	int i;
+
+	for (i = 0; i < __arraycount(l->l_rwlocks); i++) {
+		if (l->l_rwlocks[i] != NULL) {
+			rw_and(l->l_rwlocks[i], ~RW_SPIN);
+			/* Leave in place for exit to clear. */
+		}
+	}
+}
+
+/*
  * rw_dump:
  *
  *	Dump the contents of a rwlock structure.
@@ -207,9 +290,9 @@ _rw_init(krwlock_t *rw, uintptr_t return
 {
 
 	if (LOCKDEBUG_ALLOC(rw, &rwlock_lockops, return_address))
-		rw->rw_owner = 0;
+		rw->rw_owner = RW_SPIN;
 	else
-		rw->rw_owner = RW_NODEBUG;
+		rw->rw_owner = RW_SPIN | RW_NODEBUG;
 }
 
 void
@@ -228,53 +311,18 @@ void
 rw_destroy(krwlock_t *rw)
 {
 
-	RW_ASSERT(rw, (rw->rw_owner & ~RW_NODEBUG) == 0);
+	RW_ASSERT(rw, (rw->rw_owner & ~(RW_NODEBUG | RW_SPIN)) == 0);
 	LOCKDEBUG_FREE((rw->rw_owner & RW_NODEBUG) == 0, rw);
 }
 
 /*
- * rw_oncpu:
- *
- *	Return true if an rwlock owner is running on a CPU in the system.
- *	If the target is waiting on the kernel big lock, then we must
- *	release it.  This is necessary to avoid deadlock.
- */
-static bool
-rw_oncpu(uintptr_t owner)
-{
-#ifdef MULTIPROCESSOR
-	struct cpu_info *ci;
-	lwp_t *l;
-
-	KASSERT(kpreempt_disabled());
-
-	if ((owner & (RW_WRITE_LOCKED|RW_HAS_WAITERS)) != RW_WRITE_LOCKED) {
-		return false;
-	}
-
-	/*
-	 * See lwp_dtor() why dereference of the LWP pointer is safe.
-	 * We must have kernel preemption disabled for that.
-	 */
-	l = (lwp_t *)(owner & RW_THREAD);
-	ci = l->l_cpu;
-
-	if (ci && ci->ci_curlwp == l) {
-		/* Target is running; do we need to block? */
-		return (ci->ci_biglock_wanted != l);
-	}
-#endif
-	/* Not running.  It may be safe to block now. */
-	return false;
-}
-
-/*
  * rw_vector_enter:
  *
- *	Acquire a rwlock.
+ *	The slow path for acquiring a rwlock, which considers all conditions.
+ *	Marked __noinline to prevent the compiler pulling it into rw_enter().
  */
-void
-rw_vector_enter(krwlock_t *rw, const krw_t op)
+static void __noinline
+rw_vector_enter(krwlock_t *rw, const krw_t op, uintptr_t mask, uintptr_t ra)
 {
 	uintptr_t owner, incr, need_wait, set_wait, curthread, next;
 	turnstile_t *ts;
@@ -291,6 +339,7 @@ rw_vector_enter(krwlock_t *rw, const krw
 
 	RW_ASSERT(rw, !cpu_intr_p());
 	RW_ASSERT(rw, curthread != 0);
+	RW_ASSERT(rw, kpreempt_disabled());
 	RW_WANTLOCK(rw, op);
 
 	if (panicstr == NULL) {
@@ -322,15 +371,13 @@ rw_vector_enter(krwlock_t *rw, const krw
 
 	LOCKSTAT_ENTER(lsflag);
 
-	KPREEMPT_DISABLE(curlwp);
 	for (owner = rw->rw_owner;;) {
 		/*
 		 * Read the lock owner field.  If the need-to-wait
 		 * indicator is clear, then try to acquire the lock.
 		 */
 		if ((owner & need_wait) == 0) {
-			next = rw_cas(rw, owner, (owner + incr) &
-			    ~RW_WRITE_WANTED);
+			next = rw_cas(rw, owner, (owner + incr) & mask);
 			if (__predict_true(next == owner)) {
 				/* Got it! */
 				RW_MEMBAR_ENTER();
@@ -348,11 +395,36 @@ rw_vector_enter(krwlock_t *rw, const krw
 			rw_abort(__func__, __LINE__, rw,
 			    "locking against myself");
 		}
+
 		/*
-		 * If the lock owner is running on another CPU, and
-		 * there are no existing waiters, then spin.
+		 * If the lock owner is running on another CPU, and there
+		 * are no existing waiters, then spin.  Notes:
+		 *
+		 * 1) If an LWP on this CPU (possibly curlwp, or an LWP that
+		 * curlwp has interrupted) holds kernel_lock, we can't spin
+		 * without a deadlock.  The CPU that holds the rwlock may be
+		 * blocked trying to acquire kernel_lock, or there may be an
+		 * unseen chain of dependent locks.  To defeat the potential
+		 * deadlock, this LWP needs to sleep (and thereby directly
+		 * drop the kernel_lock, or permit the interrupted LWP that
+		 * holds kernel_lock to complete its work).
+		 *
+		 * 2) If trying to acquire a write lock, and the lock is
+		 * currently read held, after a brief wait set the write
+		 * wanted bit to block out new readers and try to avoid
+		 * starvation.  When the hold is acquired, we'll clear the
+		 * WRITE_WANTED flag to give readers a chance again.  With
+		 * luck this should nudge things in the direction of
+		 * interleaving readers and writers when there is high
+		 * contention.
+		 *
+		 * 3) The spin wait can't be done in soft interrupt context,
+		 * because a lock holder could be pinned down underneath the
+		 * soft interrupt LWP (i.e. curlwp) on the same CPU.  For
+		 * the lock holder to make progress and release the lock,
+		 * the soft interrupt needs to sleep.
 		 */
-		if (rw_oncpu(owner)) {
+		if ((owner & RW_SPIN) != 0 && !cpu_softintr_p()) {
 			LOCKSTAT_START_TIMER(lsflag, spintime);
 			u_int count = SPINLOCK_BACKOFF_MIN;
 			do {
@@ -360,7 +432,19 @@ rw_vector_enter(krwlock_t *rw, const krw
 				SPINLOCK_BACKOFF(count);
 				KPREEMPT_DISABLE(curlwp);
 				owner = rw->rw_owner;
-			} while (rw_oncpu(owner));
+				if ((owner & need_wait) == 0)
+					break;
+				if (count != SPINLOCK_BACKOFF_MAX)
+					continue;
+				if (curcpu()->ci_biglock_count != 0)
+					break;
+				if (op == RW_WRITER &&
+				    (owner & RW_WRITE_LOCKED) == 0 &&
+				    (owner & RW_WRITE_WANTED) == 0) {
+					(void)rw_cas(rw, owner,
+					    owner | RW_WRITE_WANTED);
+				}
+			} while ((owner & RW_SPIN) != 0);
 			LOCKSTAT_STOP_TIMER(lsflag, spintime);
 			LOCKSTAT_COUNT(spincnt, 1);
 			if ((owner & need_wait) == 0)
@@ -374,17 +458,18 @@ rw_vector_enter(krwlock_t *rw, const krw
 		ts = turnstile_lookup(rw);
 
 		/*
-		 * Mark the rwlock as having waiters.  If the set fails,
-		 * then we may not need to sleep and should spin again.
-		 * Reload rw_owner because turnstile_lookup() may have
-		 * spun on the turnstile chain lock.
+		 * Mark the rwlock as having waiters, and disable spinning. 
+		 * If the set fails, then we may not need to sleep and
+		 * should spin again.  Reload rw_owner now that we own
+		 * the turnstile chain lock.
 		 */
 		owner = rw->rw_owner;
-		if ((owner & need_wait) == 0 || rw_oncpu(owner)) {
+		if ((owner & need_wait) == 0 ||
+		    ((owner & RW_SPIN) != 0 && !cpu_softintr_p())) {
 			turnstile_exit(rw);
 			continue;
 		}
-		next = rw_cas(rw, owner, owner | set_wait);
+		next = rw_cas(rw, owner, (owner | set_wait) & ~RW_SPIN);
 		if (__predict_false(next != owner)) {
 			turnstile_exit(rw);
 			owner = next;
@@ -409,11 +494,9 @@ rw_vector_enter(krwlock_t *rw, const krw
 
 	LOCKSTAT_EVENT_RA(lsflag, rw, LB_RWLOCK |
 	    (op == RW_WRITER ? LB_SLEEP1 : LB_SLEEP2), slpcnt, slptime,
-	    (l->l_rwcallsite != 0 ? l->l_rwcallsite :
-	      (uintptr_t)__builtin_return_address(0)));
+	    (l->l_rwcallsite != 0 ? l->l_rwcallsite : ra));
 	LOCKSTAT_EVENT_RA(lsflag, rw, LB_RWLOCK | LB_SPIN, spincnt, spintime,
-	    (l->l_rwcallsite != 0 ? l->l_rwcallsite :
-	      (uintptr_t)__builtin_return_address(0)));
+	    (l->l_rwcallsite != 0 ? l->l_rwcallsite : ra));
 	LOCKSTAT_EXIT(lsflag);
 
 	RW_ASSERT(rw, (op != RW_READER && RW_OWNER(rw) == curthread) ||
@@ -422,11 +505,70 @@ rw_vector_enter(krwlock_t *rw, const krw
 }
 
 /*
- * rw_vector_exit:
+ * rw_enter:
  *
- *	Release a rwlock.
+ *	The fast path for acquiring a lock that considers only the
+ *	uncontended case.  Falls back to rw_vector_enter().
  */
 void
+rw_enter(krwlock_t *rw, const krw_t op)
+{
+	uintptr_t owner, incr, need_wait, curthread, next, mask;
+	lwp_t *l;
+
+	l = curlwp;
+	curthread = (uintptr_t)l;
+
+	RW_ASSERT(rw, !cpu_intr_p());
+	RW_ASSERT(rw, curthread != 0);
+	RW_WANTLOCK(rw, op);
+
+	KPREEMPT_DISABLE(l);
+	mask = rw_hold_remember(rw, l);
+
+	/*
+	 * We play a slight trick here.  If we're a reader, we want to
+	 * increment the read count.  If we're a writer, we want to
+	 * set the owner field and the WRITE_LOCKED bit.
+	 *
+	 * In the latter case, we expect those bits to be zero,
+	 * therefore we can use an add operation to set them, which
+	 * means an add operation for both cases.
+	 */
+	if (__predict_true(op == RW_READER)) {
+		incr = RW_READ_INCR;
+		need_wait = RW_WRITE_LOCKED | RW_WRITE_WANTED;
+	} else {
+		RW_ASSERT(rw, op == RW_WRITER);
+		incr = curthread | RW_WRITE_LOCKED;
+		need_wait = RW_WRITE_LOCKED | RW_THREAD;
+	}
+
+	/*
+	 * Read the lock owner field.  If the need-to-wait
+	 * indicator is clear, then try to acquire the lock.
+	 */
+	owner = rw->rw_owner;
+	if ((owner & need_wait) == 0) {
+		next = rw_cas(rw, owner, (owner + incr) & mask);
+		if (__predict_true(next == owner)) {
+			/* Got it! */
+			KPREEMPT_ENABLE(l);
+			RW_MEMBAR_ENTER();
+			return;
+		}
+	}
+
+	rw_vector_enter(rw, op, mask, (uintptr_t)__builtin_return_address(0));
+}
+
+/*
+ * rw_vector_exit:
+ *
+ *	The slow path for releasing a rwlock, which considers all conditions.
+ *	Marked __noinline to prevent the compiler pulling it into rw_exit().
+ */
+static void __noinline
 rw_vector_exit(krwlock_t *rw)
 {
 	uintptr_t curthread, owner, decr, newown, next;
@@ -437,6 +579,7 @@ rw_vector_exit(krwlock_t *rw)
 	l = curlwp;
 	curthread = (uintptr_t)l;
 	RW_ASSERT(rw, curthread != 0);
+	RW_ASSERT(rw, kpreempt_disabled());
 
 	/*
 	 * Again, we use a trick.  Since we used an add operation to
@@ -465,9 +608,15 @@ rw_vector_exit(krwlock_t *rw)
 		newown = (owner - decr);
 		if ((newown & (RW_THREAD | RW_HAS_WAITERS)) == RW_HAS_WAITERS)
 			break;
+		/* Want spinning enabled if lock is becoming free. */
+		if ((newown & RW_THREAD) == 0)
+			newown |= RW_SPIN;
 		next = rw_cas(rw, owner, newown);
-		if (__predict_true(next == owner))
+		if (__predict_true(next == owner)) {
+			rw_hold_forget(rw, l);
+			kpreempt_enable();
 			return;
+		}
 		owner = next;
 	}
 
@@ -507,12 +656,14 @@ rw_vector_exit(krwlock_t *rw)
 			if (wcnt > 1)
 				newown |= RW_WRITE_WANTED;
 			rw_swap(rw, owner, newown);
+			rw_hold_forget(rw, l);
 			turnstile_wakeup(ts, TS_WRITER_Q, 1, l);
 		} else {
 			/* Wake all writers and let them fight it out. */
 			newown = owner & RW_NODEBUG;
 			newown |= RW_WRITE_WANTED;
 			rw_swap(rw, owner, newown);
+			rw_hold_forget(rw, l);
 			turnstile_wakeup(ts, TS_WRITER_Q, wcnt, NULL);
 		}
 	} else {
@@ -530,19 +681,73 @@ rw_vector_exit(krwlock_t *rw)
 			
 		/* Wake up all sleeping readers. */
 		rw_swap(rw, owner, newown);
+		rw_hold_forget(rw, l);
 		turnstile_wakeup(ts, TS_READER_Q, rcnt, NULL);
 	}
+	kpreempt_enable();
+}
+
+/*
+ * rw_exit:
+ *
+ *	The fast path for releasing a lock that considers only the
+ *	uncontended case.  Falls back to rw_vector_exit().
+ */
+void
+rw_exit(krwlock_t *rw)
+{
+	uintptr_t curthread, owner, decr, newown, next;
+	lwp_t *l;
+
+	l = curlwp;
+	curthread = (uintptr_t)l;
+	RW_ASSERT(rw, curthread != 0);
+
+	/*
+	 * Again, we use a trick.  Since we used an add operation to
+	 * set the required lock bits, we can use a subtract to clear
+	 * them, which makes the read-release and write-release path
+	 * the same.
+	 */
+	owner = rw->rw_owner;
+	if (__predict_false((owner & RW_WRITE_LOCKED) != 0)) {
+		RW_UNLOCKED(rw, RW_WRITER);
+		RW_ASSERT(rw, RW_OWNER(rw) == curthread);
+		decr = curthread | RW_WRITE_LOCKED;
+	} else {
+		RW_UNLOCKED(rw, RW_READER);
+		RW_ASSERT(rw, RW_COUNT(rw) != 0);
+		decr = RW_READ_INCR;
+	}
+
+	/* Now try to release it. */
+	RW_MEMBAR_EXIT();
+	KPREEMPT_DISABLE(l);
+	newown = (owner - decr);
+	if (__predict_true((newown & (RW_THREAD | RW_HAS_WAITERS)) !=
+	    RW_HAS_WAITERS)) {
+		/* Want spinning (re-)enabled if lock is becoming free. */
+		if ((newown & RW_THREAD) == 0)
+			newown |= RW_SPIN;
+		next = rw_cas(rw, owner, newown);
+		if (__predict_true(next == owner)) {
+			rw_hold_forget(rw, l);
+			KPREEMPT_ENABLE(l);
+			return;
+		}
+	}
+	rw_vector_exit(rw);
 }
 
 /*
- * rw_vector_tryenter:
+ * rw_tryenter:
  *
  *	Try to acquire a rwlock.
  */
 int
-rw_vector_tryenter(krwlock_t *rw, const krw_t op)
+rw_tryenter(krwlock_t *rw, const krw_t op)
 {
-	uintptr_t curthread, owner, incr, need_wait, next;
+	uintptr_t curthread, owner, incr, need_wait, next, mask;
 	lwp_t *l;
 
 	l = curlwp;
@@ -550,6 +755,9 @@ rw_vector_tryenter(krwlock_t *rw, const 
 
 	RW_ASSERT(rw, curthread != 0);
 
+	KPREEMPT_DISABLE(l);
+	mask = rw_hold_remember(rw, l);
+
 	if (op == RW_READER) {
 		incr = RW_READ_INCR;
 		need_wait = RW_WRITE_LOCKED | RW_WRITE_WANTED;
@@ -560,9 +768,12 @@ rw_vector_tryenter(krwlock_t *rw, const 
 	}
 
 	for (owner = rw->rw_owner;; owner = next) {
-		if (__predict_false((owner & need_wait) != 0))
+		if (__predict_false((owner & need_wait) != 0)) {
+			rw_hold_forget(rw, l);
+			KPREEMPT_ENABLE(l);
 			return 0;
-		next = rw_cas(rw, owner, owner + incr);
+		}
+		next = rw_cas(rw, owner, (owner + incr) & mask);
 		if (__predict_true(next == owner)) {
 			/* Got it! */
 			break;
@@ -574,6 +785,7 @@ rw_vector_tryenter(krwlock_t *rw, const 
 	RW_ASSERT(rw, (op != RW_READER && RW_OWNER(rw) == curthread) ||
 	    (op == RW_READER && RW_COUNT(rw) != 0));
 
+	KPREEMPT_ENABLE(l);
 	RW_MEMBAR_ENTER();
 	return 1;
 }
@@ -611,7 +823,7 @@ rw_downgrade(krwlock_t *rw)
 		 * waiters.
 		 */
 		if ((owner & RW_HAS_WAITERS) == 0) {
-			newown = (owner & RW_NODEBUG);
+			newown = (owner & RW_NODEBUG) | RW_SPIN;
 			next = rw_cas(rw, owner, newown + RW_READ_INCR);
 			if (__predict_true(next == owner)) {
 				RW_LOCKED(rw, RW_READER);
@@ -638,7 +850,8 @@ rw_downgrade(krwlock_t *rw)
 			/*
 			 * If there are no readers, just preserve the
 			 * waiters bits, swap us down to one read hold and
-			 * return.
+			 * return.  Don't set the spin bit as nobody's
+			 * running yet.
 			 */
 			RW_ASSERT(rw, wcnt != 0);
 			RW_ASSERT(rw, (rw->rw_owner & RW_WRITE_WANTED) != 0);
@@ -656,7 +869,8 @@ rw_downgrade(krwlock_t *rw)
 			 * Give the lock to all blocked readers.  We may
 			 * retain one read hold if downgrading.  If there is
 			 * a writer waiting, new readers will be blocked
-			 * out.
+			 * out.  Don't set the spin bit as nobody's running
+			 * yet.
 			 */
 			newown = owner & RW_NODEBUG;
 			newown += (rcnt << RW_READ_COUNT_SHIFT) + RW_READ_INCR;
@@ -787,3 +1001,18 @@ rw_owner(wchan_t obj)
 
 	return (void *)(owner & RW_THREAD);
 }
+
+/*
+ * rw_owner_running:
+ *
+ *	Return true if a RW lock is unheld, or held and the owner is running
+ *	on a CPU.  For the pagedaemon only - do not document or use in other
+ *	code.
+ */
+bool
+rw_owner_running(const krwlock_t *rw)
+{
+	uintptr_t owner = rw->rw_owner;
+
+	return (owner & RW_THREAD) == 0 || (owner & RW_SPIN) != 0;
+}
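
A side note on the "add operation" trick mentioned in the rw_enter() and
rw_exit() comments above: because the owner and WRITE_LOCKED bits are known
to be zero when a writer acquires the lock, a single add sets them and a
subtract of the same value clears them, so the read and write fast paths
share the same shape.  A small userland illustration -- the bit values below
are made up for the demo, not the real <sys/rwlock.h> definitions:

	#include <assert.h>
	#include <stdint.h>
	#include <stdio.h>

	/* Illustrative lock word layout only. */
	#define RW_WRITE_LOCKED	0x01UL
	#define RW_HAS_WAITERS	0x02UL
	#define RW_WRITE_WANTED	0x04UL
	#define RW_SPIN		0x08UL
	#define RW_READ_INCR	0x10UL	/* read count sits above the flags */

	int
	main(void)
	{
		uintptr_t owner = RW_SPIN;	/* freshly initialized lock */
		uintptr_t curthread = 0x1000;	/* pretend LWP address */

		/* Two readers: each acquire adds RW_READ_INCR... */
		owner += RW_READ_INCR;
		owner += RW_READ_INCR;
		assert((owner & ~RW_SPIN) == 2 * RW_READ_INCR);
		/* ...and each release subtracts it again. */
		owner -= RW_READ_INCR;
		owner -= RW_READ_INCR;

		/* Writer: one add sets the owner and WRITE_LOCKED together,
		 * and the release subtracts exactly the same quantity. */
		owner += curthread | RW_WRITE_LOCKED;
		assert((owner & RW_WRITE_LOCKED) != 0);
		owner -= curthread | RW_WRITE_LOCKED;
		assert((owner & ~RW_SPIN) == 0);

		printf("add/subtract trick holds\n");
		return 0;
	}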

Index: src/sys/kern/kern_synch.c
diff -u src/sys/kern/kern_synch.c:1.334.2.1 src/sys/kern/kern_synch.c:1.334.2.2
--- src/sys/kern/kern_synch.c:1.334.2.1	Fri Jan 17 21:47:35 2020
+++ src/sys/kern/kern_synch.c	Sun Jan 19 21:08:29 2020
@@ -1,4 +1,4 @@
-/*	$NetBSD: kern_synch.c,v 1.334.2.1 2020/01/17 21:47:35 ad Exp $	*/
+/*	$NetBSD: kern_synch.c,v 1.334.2.2 2020/01/19 21:08:29 ad Exp $	*/
 
 /*-
  * Copyright (c) 1999, 2000, 2004, 2006, 2007, 2008, 2009, 2019
@@ -69,7 +69,7 @@
  */
 
 #include <sys/cdefs.h>
-__KERNEL_RCSID(0, "$NetBSD: kern_synch.c,v 1.334.2.1 2020/01/17 21:47:35 ad Exp $");
+__KERNEL_RCSID(0, "$NetBSD: kern_synch.c,v 1.334.2.2 2020/01/19 21:08:29 ad Exp $");
 
 #include "opt_kstack.h"
 #include "opt_dtrace.h"
@@ -83,6 +83,7 @@ __KERNEL_RCSID(0, "$NetBSD: kern_synch.c
 #include <sys/cpu.h>
 #include <sys/pserialize.h>
 #include <sys/resourcevar.h>
+#include <sys/rwlock.h>
 #include <sys/sched.h>
 #include <sys/syscall_stats.h>
 #include <sys/sleepq.h>
@@ -657,6 +658,9 @@ mi_switch(lwp_t *l)
 		/* We're down to only one lock, so do debug checks. */
 		LOCKDEBUG_BARRIER(l->l_mutex, 1);
 
+		/* Disable spinning on any R/W locks that we hold. */
+		rw_switch();
+
 		/* Count the context switch. */
 		CPU_COUNT(CPU_COUNT_NSWTCH, 1);
 		l->l_ncsw++;

Index: src/sys/sys/lwp.h
diff -u src/sys/sys/lwp.h:1.192.2.1 src/sys/sys/lwp.h:1.192.2.2
--- src/sys/sys/lwp.h:1.192.2.1	Fri Jan 17 21:47:37 2020
+++ src/sys/sys/lwp.h	Sun Jan 19 21:08:30 2020
@@ -1,4 +1,4 @@
-/*	$NetBSD: lwp.h,v 1.192.2.1 2020/01/17 21:47:37 ad Exp $	*/
+/*	$NetBSD: lwp.h,v 1.192.2.2 2020/01/19 21:08:30 ad Exp $	*/
 
 /*
  * Copyright (c) 2001, 2006, 2007, 2008, 2009, 2010, 2019
@@ -186,6 +186,7 @@ struct lwp {
 	u_short		l_exlocks;	/* !: lockdebug: excl. locks held */
 	u_short		l_psrefs;	/* !: count of psref held */
 	u_short		l_blcnt;	/* !: count of kernel_lock held */
+	struct krwlock	*l_rwlocks[4];	/* !: tracks first N held rwlocks */
 	int		l_nopreempt;	/* !: don't preempt me! */
 	u_int		l_dopreempt;	/* s: kernel preemption pending */
 	int		l_pflag;	/* !: LWP private flags */
