Diff below removes the recursive attribute of the SCHED_LOCK() by turning it into an IPL_NONE mutex. I'm not intending to commit it yet as it raises multiple questions; see below.
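To make the consequence of that change concrete, here is the core of the
sys/sched.h hunk from the diff below: the scheduler lock becomes a plain,
non-recursive mutex, and the interrupt level is still raised explicitly by
the macro because the mutex itself is initialized at IPL_NONE.

        extern struct mutex sched_lock;

        #define SCHED_LOCK_INIT()       mtx_init(&sched_lock, IPL_NONE)

        #define SCHED_LOCK(s)                                           \
        do {                                                            \
                s = splsched();                                         \
                mtx_enter(&sched_lock);                                 \
        } while (/* CONSTCOND */ 0)

        #define SCHED_UNLOCK(s)                                         \
        do {                                                            \
                mtx_leave(&sched_lock);                                 \
                splx(s);                                                \
        } while (/* CONSTCOND */ 0)

A path that takes the SCHED_LOCK() twice on the same CPU now deadlocks (or
panics with DIAGNOSTIC) instead of silently recursing, which is what the
rw_exit(), wakeup_n() and single_thread_check() changes below are about.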
This work was started by art@ more than a decade ago and I'm willing to
finish it, as I believe it's the easiest way to reduce the scope of this
lock.  Having a global mutex is the first step toward per-runqueue and
per-sleepqueue mutexes.  This is also a way to avoid the lock ordering
problems exposed by the recent races in single_thread_set().

About the diff:

The diff below includes an (ugly) refactoring of rw_exit() to avoid a
recursion on the SCHED_LOCK().  In this case the lock is used to protect
the global sleepqueue and is grabbed in sleep_setup().

The same pattern can be observed in single_thread_check().  However, in
this case the lock is used to protect different fields, so there is no
"recursive access" to the same data structure.  (A small usage sketch of
the new wakeup_n() interface is appended after the diff.)

assertwaitok() has been moved down in mi_switch(), which isn't ideal.

It becomes obvious that the per-CPU and per-thread accounting fields
updated in mi_switch() won't need a separate mutex as was proposed last
year; splitting this global mutex will be enough.

It's unclear to me if/how WITNESS should be modified to handle this lock
change.

This has been tested on sparc64 and amd64.  I'm not convinced it exposed
all the recursions, so if you want to give it a go and can break it, that
is more than welcome.

Comments?  Questions?

Index: kern/kern_fork.c
===================================================================
RCS file: /cvs/src/sys/kern/kern_fork.c,v
retrieving revision 1.226
diff -u -p -r1.226 kern_fork.c
--- kern/kern_fork.c	25 Oct 2020 01:55:18 -0000	1.226
+++ kern/kern_fork.c	2 Nov 2020 10:50:24 -0000
@@ -665,7 +665,7 @@ void
 proc_trampoline_mp(void)
 {
 	SCHED_ASSERT_LOCKED();
-	__mp_unlock(&sched_lock);
+	mtx_leave(&sched_lock);
 	spl0();
 	SCHED_ASSERT_UNLOCKED();
 	KERNEL_ASSERT_UNLOCKED();
Index: kern/kern_lock.c
===================================================================
RCS file: /cvs/src/sys/kern/kern_lock.c,v
retrieving revision 1.71
diff -u -p -r1.71 kern_lock.c
--- kern/kern_lock.c	5 Mar 2020 09:28:31 -0000	1.71
+++ kern/kern_lock.c	2 Nov 2020 10:50:24 -0000
@@ -97,9 +97,6 @@ ___mp_lock_init(struct __mp_lock *mpl, c
 	if (mpl == &kernel_lock)
 		mpl->mpl_lock_obj.lo_flags = LO_WITNESS | LO_INITIALIZED |
 		    LO_SLEEPABLE | (LO_CLASS_KERNEL_LOCK << LO_CLASSSHIFT);
-	else if (mpl == &sched_lock)
-		mpl->mpl_lock_obj.lo_flags = LO_WITNESS | LO_INITIALIZED |
-		    LO_RECURSABLE | (LO_CLASS_SCHED_LOCK << LO_CLASSSHIFT);
 	WITNESS_INIT(&mpl->mpl_lock_obj, type);
 #endif
 }
Index: kern/kern_rwlock.c
===================================================================
RCS file: /cvs/src/sys/kern/kern_rwlock.c,v
retrieving revision 1.45
diff -u -p -r1.45 kern_rwlock.c
--- kern/kern_rwlock.c	2 Mar 2020 17:07:49 -0000	1.45
+++ kern/kern_rwlock.c	2 Nov 2020 23:13:01 -0000
@@ -128,36 +128,6 @@ rw_enter_write(struct rwlock *rwl)
 	}
 }
 
-void
-rw_exit_read(struct rwlock *rwl)
-{
-	unsigned long owner;
-
-	rw_assert_rdlock(rwl);
-	WITNESS_UNLOCK(&rwl->rwl_lock_obj, 0);
-
-	membar_exit_before_atomic();
-	owner = rwl->rwl_owner;
-	if (__predict_false((owner & RWLOCK_WAIT) ||
-	    rw_cas(&rwl->rwl_owner, owner, owner - RWLOCK_READ_INCR)))
-		rw_do_exit(rwl, 0);
-}
-
-void
-rw_exit_write(struct rwlock *rwl)
-{
-	unsigned long owner;
-
-	rw_assert_wrlock(rwl);
-	WITNESS_UNLOCK(&rwl->rwl_lock_obj, LOP_EXCLUSIVE);
-
-	membar_exit_before_atomic();
-	owner = rwl->rwl_owner;
-	if (__predict_false((owner & RWLOCK_WAIT) ||
-	    rw_cas(&rwl->rwl_owner, owner, 0)))
-		rw_do_exit(rwl, RWLOCK_WRLOCK);
-}
-
 #ifdef DIAGNOSTIC
 /*
  * Put the diagnostic functions here to keep the main code free
@@ -314,9 +284,10 @@ retry:
 }
 
 void
-rw_exit(struct rwlock *rwl)
+_rw_exit(struct rwlock *rwl, int locked)
 {
 	unsigned long wrlock;
+	unsigned long owner, set;
 
 	/* Avoid deadlocks after panic or in DDB */
 	if (panicstr || db_active)
@@ -330,15 +301,6 @@ rw_exit(struct rwlock *rwl)
 	WITNESS_UNLOCK(&rwl->rwl_lock_obj, wrlock ? LOP_EXCLUSIVE : 0);
 
 	membar_exit_before_atomic();
-	rw_do_exit(rwl, wrlock);
-}
-
-/* membar_exit_before_atomic() has to precede call of this function. */
-void
-rw_do_exit(struct rwlock *rwl, unsigned long wrlock)
-{
-	unsigned long owner, set;
-
 	do {
 		owner = rwl->rwl_owner;
 		if (wrlock)
@@ -349,7 +311,13 @@ rw_do_exit(struct rwlock *rwl, unsigned 
 	} while (__predict_false(rw_cas(&rwl->rwl_owner, owner, set)));
 
 	if (owner & RWLOCK_WAIT)
-		wakeup(rwl);
+		wakeup_n(rwl, -1, locked);
+}
+
+void
+rw_exit(struct rwlock *rwl)
+{
+	_rw_exit(rwl, 0);
 }
 
 int
Index: kern/kern_sched.c
===================================================================
RCS file: /cvs/src/sys/kern/kern_sched.c,v
retrieving revision 1.67
diff -u -p -r1.67 kern_sched.c
--- kern/kern_sched.c	11 Jun 2020 00:00:01 -0000	1.67
+++ kern/kern_sched.c	2 Nov 2020 10:50:24 -0000
@@ -339,7 +339,7 @@ again:
 	 * This is kind of like a stupid idle loop.
 	 */
 #ifdef MULTIPROCESSOR
-	__mp_unlock(&sched_lock);
+	mtx_leave(&sched_lock);
 #endif
 	spl0();
 	delay(10);
Index: kern/kern_sig.c
===================================================================
RCS file: /cvs/src/sys/kern/kern_sig.c,v
retrieving revision 1.263
diff -u -p -r1.263 kern_sig.c
--- kern/kern_sig.c	16 Sep 2020 13:50:42 -0000	1.263
+++ kern/kern_sig.c	3 Nov 2020 10:53:38 -0000
@@ -1956,7 +1956,7 @@ single_thread_check(struct proc *p, int 
 		}
 
 		if (atomic_dec_int_nv(&pr->ps_singlecount) == 0)
-			wakeup(&pr->ps_singlecount);
+			wakeup_n(&pr->ps_singlecount, 1, 1);
 		if (pr->ps_flags & PS_SINGLEEXIT) {
 			SCHED_UNLOCK(s);
 			KERNEL_LOCK();
Index: kern/kern_synch.c
===================================================================
RCS file: /cvs/src/sys/kern/kern_synch.c,v
retrieving revision 1.171
diff -u -p -r1.171 kern_synch.c
--- kern/kern_synch.c	23 Oct 2020 20:28:09 -0000	1.171
+++ kern/kern_synch.c	2 Nov 2020 23:12:10 -0000
@@ -306,7 +306,7 @@ rwsleep(const volatile void *ident, stru
 	sleep_setup(&sls, ident, priority, wmesg);
 	sleep_setup_timeout(&sls, timo);
 
-	rw_exit(rwl);
+	_rw_exit(rwl, 1);
 	/* signal may stop the process, release rwlock before that */
 	sleep_setup_signal(&sls);
 
@@ -525,11 +525,12 @@ sleep_finish_signal(struct sleep_state *
 }
 
 int
-wakeup_proc(struct proc *p, const volatile void *chan)
+_wakeup_proc_locked(struct proc *p, const volatile void *chan)
 {
-	int s, awakened = 0;
+	int awakened = 0;
+
+	SCHED_ASSERT_LOCKED();
 
-	SCHED_LOCK(s);
 	if (p->p_wchan != NULL &&
 	    ((chan == NULL) || (p->p_wchan == chan))) {
 		awakened = 1;
@@ -538,6 +539,17 @@ wakeup_proc(struct proc *p, const volati
 		else
 			unsleep(p);
 	}
+
+	return awakened;
+}
+
+int
+wakeup_proc(struct proc *p, const volatile void *chan)
+{
+	int s, awakened;
+
+	SCHED_LOCK(s);
+	awakened = _wakeup_proc_locked(p, chan);
 	SCHED_UNLOCK(s);
 
 	return awakened;
@@ -556,7 +568,7 @@ endtsleep(void *arg)
 	int s;
 
 	SCHED_LOCK(s);
-	if (wakeup_proc(p, NULL))
+	if (_wakeup_proc_locked(p, NULL))
 		atomic_setbits_int(&p->p_flag, P_TIMEOUT);
 	SCHED_UNLOCK(s);
 }
@@ -580,14 +592,18 @@ unsleep(struct proc *p)
  * Make a number of processes sleeping on the specified identifier runnable.
  */
 void
-wakeup_n(const volatile void *ident, int n)
+wakeup_n(const volatile void *ident, int n, int locked)
 {
 	struct slpque *qp;
 	struct proc *p;
 	struct proc *pnext;
 	int s;
 
-	SCHED_LOCK(s);
+	if (!locked)
+		SCHED_LOCK(s);
+	else
+		SCHED_ASSERT_LOCKED();
+
 	qp = &slpque[LOOKUP(ident)];
 	for (p = TAILQ_FIRST(qp); p != NULL && n != 0; p = pnext) {
 		pnext = TAILQ_NEXT(p, p_runq);
@@ -604,10 +620,12 @@ wakeup_n(const volatile void *ident, int
 		if (p->p_stat != SSLEEP && p->p_stat != SSTOP)
 			panic("wakeup: p_stat is %d", (int)p->p_stat);
 #endif
-		if (wakeup_proc(p, ident))
+		if (_wakeup_proc_locked(p, ident))
 			--n;
 	}
-	SCHED_UNLOCK(s);
+
+	if (!locked)
+		SCHED_UNLOCK(s);
 }
 
 /*
@@ -616,7 +634,7 @@ wakeup_n(const volatile void *ident, int
 void
 wakeup(const volatile void *chan)
 {
-	wakeup_n(chan, -1);
+	wakeup_n(chan, -1, 0);
 }
 
 int
Index: kern/sched_bsd.c
===================================================================
RCS file: /cvs/src/sys/kern/sched_bsd.c,v
retrieving revision 1.64
diff -u -p -r1.64 sched_bsd.c
--- kern/sched_bsd.c	15 Oct 2020 07:49:55 -0000	1.64
+++ kern/sched_bsd.c	2 Nov 2020 10:50:24 -0000
@@ -59,7 +59,7 @@ int	lbolt;			/* once a second sleep addr
 int	rrticks_init;		/* # of hardclock ticks per roundrobin() */
 
 #ifdef MULTIPROCESSOR
-struct __mp_lock sched_lock;
+struct mutex sched_lock;
 #endif
 
 void			schedcpu(void *);
@@ -337,10 +337,8 @@ mi_switch(void)
 	struct timespec ts;
 #ifdef MULTIPROCESSOR
 	int hold_count;
-	int sched_count;
 #endif
 
-	assertwaitok();
 	KASSERT(p->p_stat != SONPROC);
 
 	SCHED_ASSERT_LOCKED();
@@ -349,7 +347,6 @@ mi_switch(void)
 	/*
 	 * Release the kernel_lock, as we are about to yield the CPU.
 	 */
-	sched_count = __mp_release_all_but_one(&sched_lock);
 	if (_kernel_lock_held())
 		hold_count = __mp_release_all(&kernel_lock);
 	else
@@ -407,9 +404,10 @@ mi_switch(void)
 	 * just release it here.
 	 */
 #ifdef MULTIPROCESSOR
-	__mp_unlock(&sched_lock);
+	mtx_leave(&sched_lock);
 #endif
 
+	assertwaitok();
 	SCHED_ASSERT_UNLOCKED();
 
 	smr_idle();
@@ -431,7 +429,7 @@ mi_switch(void)
 	 */
 	if (hold_count)
 		__mp_acquire_count(&kernel_lock, hold_count);
-	__mp_acquire_count(&sched_lock, sched_count + 1);
+	mtx_enter(&sched_lock);
 #endif
 }
 
Index: sys/rwlock.h
===================================================================
RCS file: /cvs/src/sys/sys/rwlock.h,v
retrieving revision 1.26
diff -u -p -r1.26 rwlock.h
--- sys/rwlock.h	16 Jul 2019 01:40:49 -0000	1.26
+++ sys/rwlock.h	2 Nov 2020 23:06:39 -0000
@@ -40,15 +40,6 @@
  * atomically test for the lock being 0 (it's not possible to have
  * owner/read count unset and waiter bits set) and if 0 set the owner to
  * the proc and RWLOCK_WRLOCK. While not zero, loop into rw_enter_wait.
- *
- * void rw_exit_read(struct rwlock *);
- * atomically decrement lock by RWLOCK_READ_INCR and unset RWLOCK_WAIT and
- * RWLOCK_WRWANT remembering the old value of lock and if RWLOCK_WAIT was set,
- * call rw_exit_waiters with the old contents of the lock.
- *
- * void rw_exit_write(struct rwlock *);
- * atomically swap the contents of the lock with 0 and if RWLOCK_WAIT was
- * set, call rw_exit_waiters with the old contents of the lock.
  */
 
 #ifndef _SYS_RWLOCK_H
@@ -149,8 +140,8 @@ void	_rw_init_flags(struct rwlock *, con
 
 void	rw_enter_read(struct rwlock *);
 void	rw_enter_write(struct rwlock *);
-void	rw_exit_read(struct rwlock *);
-void	rw_exit_write(struct rwlock *);
+#define rw_exit_read(rwl)	rw_exit(rwl)
+#define rw_exit_write(rwl)	rw_exit(rwl)
 
 #ifdef DIAGNOSTIC
 void	rw_assert_wrlock(struct rwlock *);
@@ -167,6 +158,7 @@ void	rw_assert_unlocked(struct rwlock *)
 int	rw_enter(struct rwlock *, int);
 void	rw_exit(struct rwlock *);
 int	rw_status(struct rwlock *);
+void	_rw_exit(struct rwlock *, int);
 
 void	_rrw_init_flags(struct rrwlock *, const char *, int,
 	    const struct lock_type *);
Index: sys/sched.h
===================================================================
RCS file: /cvs/src/sys/sys/sched.h,v
retrieving revision 1.56
diff -u -p -r1.56 sched.h
--- sys/sched.h	21 Oct 2019 10:24:01 -0000	1.56
+++ sys/sched.h	2 Nov 2020 10:50:24 -0000
@@ -194,37 +194,32 @@ void remrunqueue(struct proc *);
 } while (0)
 
 #if defined(MULTIPROCESSOR)
-#include <sys/lock.h>
+#include <sys/mutex.h>
 
-/*
- * XXX Instead of using struct lock for the kernel lock and thus requiring us
- * XXX to implement simplelocks, causing all sorts of fine-grained locks all
- * XXX over our tree to be activated, the sched_lock is a different kind of
- * XXX lock to avoid introducing locking protocol bugs.
- */
-extern struct __mp_lock sched_lock;
+extern struct mutex sched_lock;
 
 #define	SCHED_ASSERT_LOCKED()						\
 do {									\
 	splassert(IPL_SCHED);						\
-	KASSERT(__mp_lock_held(&sched_lock, curcpu()));			\
+	MUTEX_ASSERT_LOCKED(&sched_lock);				\
 } while (0)
+
 #define	SCHED_ASSERT_UNLOCKED()						\
 do {									\
-	KASSERT(__mp_lock_held(&sched_lock, curcpu()) == 0);		\
+	MUTEX_ASSERT_UNLOCKED(&sched_lock);				\
 } while (0)
 
-#define	SCHED_LOCK_INIT()	__mp_lock_init(&sched_lock)
+#define	SCHED_LOCK_INIT()	mtx_init(&sched_lock, IPL_NONE)
 
 #define	SCHED_LOCK(s)							\
 do {									\
	s = splsched();							\
-	__mp_lock(&sched_lock);						\
+	mtx_enter(&sched_lock);						\
 } while (/* CONSTCOND */ 0)
 
 #define	SCHED_UNLOCK(s)							\
 do {									\
-	__mp_unlock(&sched_lock);					\
+	mtx_leave(&sched_lock);						\
 	splx(s);							\
 } while (/* CONSTCOND */ 0)
 
Index: sys/systm.h
===================================================================
RCS file: /cvs/src/sys/sys/systm.h,v
retrieving revision 1.148
diff -u -p -r1.148 systm.h
--- sys/systm.h	26 Aug 2020 03:29:07 -0000	1.148
+++ sys/systm.h	2 Nov 2020 23:10:43 -0000
@@ -274,9 +274,9 @@ void	cond_signal(struct cond *);
 
 struct mutex;
 struct rwlock;
-void    wakeup_n(const volatile void *, int);
+void    wakeup_n(const volatile void *, int, int);
 void    wakeup(const volatile void *);
-#define wakeup_one(c) wakeup_n((c), 1)
+#define wakeup_one(c) wakeup_n((c), 1, 0)
 int	tsleep(const volatile void *, int, const char *, int);
 int	tsleep_nsec(const volatile void *, int, const char *, uint64_t);
 int	msleep(const volatile void *, struct mutex *, int, const char*, int);
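
For illustration only (these functions are not part of the diff), here is
how the extra "locked" argument of wakeup_n() is meant to be used, mirroring
the single_thread_check() and _rw_exit() hunks above:

	/*
	 * Hypothetical caller that already holds the scheduler lock:
	 * pass locked=1 so wakeup_n() only asserts SCHED_ASSERT_LOCKED()
	 * instead of re-entering the now non-recursive mutex.
	 */
	void
	example_wakeup_locked(const volatile void *ident)
	{
		int s;

		SCHED_LOCK(s);			/* splsched() + mtx_enter() */
		/* ... modify state protected by the scheduler lock ... */
		wakeup_n(ident, 1, 1);		/* wake one sleeper, lock held */
		SCHED_UNLOCK(s);		/* mtx_leave() + splx() */
	}

	/* From unlocked context the behaviour is unchanged: */
	void
	example_wakeup_unlocked(const volatile void *ident)
	{
		wakeup_n(ident, -1, 0);		/* wakeup_n takes the lock itself */
	}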