The diff below removes the recursive attribute of the SCHED_LOCK() by
turning it into an IPL_NONE mutex.  I'm not intending to commit it
yet as it raises multiple questions, see below.
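
For reference, the sys/sched.h hunk below still raises the spl by hand
before grabbing the mutex, which is why the mutex itself can be
initialized at IPL_NONE.  Trimmed excerpt of the new macro:

	#define SCHED_LOCK(s)					\
	do {							\
		s = splsched();					\
		mtx_enter(&sched_lock);				\
	} while (/* CONSTCOND */ 0)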

This work was started by art@ more than a decade ago and I'm willing
to finish it as I believe it's the easiest way to reduce the scope of
this lock.  Having a global mutex is the first step towards having
per-runqueue and per-sleepqueue mutexes.

This is also a way to avoid lock ordering problems exposed by the recent
races in single_thread_set().

About the diff:

 The diff below includes an (ugly) refactoring of rw_exit() to avoid a
 recursion on the SCHED_LOCK().  In this case the lock is used to protect
 the global sleepqueue and is grabbed in sleep_setup().
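
 A rough sketch of the recursion being avoided (call chain only):

	rwsleep()
	  sleep_setup()	  /* grabs SCHED_LOCK() for the sleepqueue */
	  rw_exit(rwl)
	    wakeup(rwl)	  /* would grab SCHED_LOCK() a second time */

 With the diff, rwsleep() calls _rw_exit(rwl, 1) instead and the "locked"
 argument is passed down to wakeup_n(), so the scheduler lock isn't taken
 again.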

 The same pattern can be observed in single_thread_check().  However, in
 this case the lock is used to protect different fields, so there's no
 "recursive access" to the same data structure.

 assertwaitok() has been moved down in mi_switch(), which isn't ideal.

 It becomes obvious that the per-CPU and per-thread accounting fields
 updated in mi_switch() won't need a separate mutex as proposed last
 year and that splitting this global mutex will be enough.

It's unclear to me if/how WITNESS should be modified to handle this lock
change.

This has been tested on sparc64 and amd64.  I'm not convinced this has
exposed all the recursions, so if you want to give it a go and manage
to break it, reports are more than welcome.

Comments?  Questions?

Index: kern/kern_fork.c
===================================================================
RCS file: /cvs/src/sys/kern/kern_fork.c,v
retrieving revision 1.226
diff -u -p -r1.226 kern_fork.c
--- kern/kern_fork.c    25 Oct 2020 01:55:18 -0000      1.226
+++ kern/kern_fork.c    2 Nov 2020 10:50:24 -0000
@@ -665,7 +665,7 @@ void
 proc_trampoline_mp(void)
 {
        SCHED_ASSERT_LOCKED();
-       __mp_unlock(&sched_lock);
+       mtx_leave(&sched_lock);
        spl0();
        SCHED_ASSERT_UNLOCKED();
        KERNEL_ASSERT_UNLOCKED();
Index: kern/kern_lock.c
===================================================================
RCS file: /cvs/src/sys/kern/kern_lock.c,v
retrieving revision 1.71
diff -u -p -r1.71 kern_lock.c
--- kern/kern_lock.c    5 Mar 2020 09:28:31 -0000       1.71
+++ kern/kern_lock.c    2 Nov 2020 10:50:24 -0000
@@ -97,9 +97,6 @@ ___mp_lock_init(struct __mp_lock *mpl, c
        if (mpl == &kernel_lock)
                mpl->mpl_lock_obj.lo_flags = LO_WITNESS | LO_INITIALIZED |
                    LO_SLEEPABLE | (LO_CLASS_KERNEL_LOCK << LO_CLASSSHIFT);
-       else if (mpl == &sched_lock)
-               mpl->mpl_lock_obj.lo_flags = LO_WITNESS | LO_INITIALIZED |
-                   LO_RECURSABLE | (LO_CLASS_SCHED_LOCK << LO_CLASSSHIFT);
        WITNESS_INIT(&mpl->mpl_lock_obj, type);
 #endif
 }
Index: kern/kern_rwlock.c
===================================================================
RCS file: /cvs/src/sys/kern/kern_rwlock.c,v
retrieving revision 1.45
diff -u -p -r1.45 kern_rwlock.c
--- kern/kern_rwlock.c  2 Mar 2020 17:07:49 -0000       1.45
+++ kern/kern_rwlock.c  2 Nov 2020 23:13:01 -0000
@@ -128,36 +128,6 @@ rw_enter_write(struct rwlock *rwl)
        }
 }
 
-void
-rw_exit_read(struct rwlock *rwl)
-{
-       unsigned long owner;
-
-       rw_assert_rdlock(rwl);
-       WITNESS_UNLOCK(&rwl->rwl_lock_obj, 0);
-
-       membar_exit_before_atomic();
-       owner = rwl->rwl_owner;
-       if (__predict_false((owner & RWLOCK_WAIT) ||
-           rw_cas(&rwl->rwl_owner, owner, owner - RWLOCK_READ_INCR)))
-               rw_do_exit(rwl, 0);
-}
-
-void
-rw_exit_write(struct rwlock *rwl)
-{
-       unsigned long owner;
-
-       rw_assert_wrlock(rwl);
-       WITNESS_UNLOCK(&rwl->rwl_lock_obj, LOP_EXCLUSIVE);
-
-       membar_exit_before_atomic();
-       owner = rwl->rwl_owner;
-       if (__predict_false((owner & RWLOCK_WAIT) ||
-           rw_cas(&rwl->rwl_owner, owner, 0)))
-               rw_do_exit(rwl, RWLOCK_WRLOCK);
-}
-
 #ifdef DIAGNOSTIC
 /*
  * Put the diagnostic functions here to keep the main code free
@@ -314,9 +284,10 @@ retry:
 }
 
 void
-rw_exit(struct rwlock *rwl)
+_rw_exit(struct rwlock *rwl, int locked)
 {
        unsigned long wrlock;
+       unsigned long owner, set;
 
        /* Avoid deadlocks after panic or in DDB */
        if (panicstr || db_active)
@@ -330,15 +301,6 @@ rw_exit(struct rwlock *rwl)
        WITNESS_UNLOCK(&rwl->rwl_lock_obj, wrlock ? LOP_EXCLUSIVE : 0);
 
        membar_exit_before_atomic();
-       rw_do_exit(rwl, wrlock);
-}
-
-/* membar_exit_before_atomic() has to precede call of this function. */
-void
-rw_do_exit(struct rwlock *rwl, unsigned long wrlock)
-{
-       unsigned long owner, set;
-
        do {
                owner = rwl->rwl_owner;
                if (wrlock)
@@ -349,7 +311,13 @@ rw_do_exit(struct rwlock *rwl, unsigned 
        } while (__predict_false(rw_cas(&rwl->rwl_owner, owner, set)));
 
        if (owner & RWLOCK_WAIT)
-               wakeup(rwl);
+               wakeup_n(rwl, -1, locked);
+}
+
+void
+rw_exit(struct rwlock *rwl)
+{
+       _rw_exit(rwl, 0);
 }
 
 int
Index: kern/kern_sched.c
===================================================================
RCS file: /cvs/src/sys/kern/kern_sched.c,v
retrieving revision 1.67
diff -u -p -r1.67 kern_sched.c
--- kern/kern_sched.c   11 Jun 2020 00:00:01 -0000      1.67
+++ kern/kern_sched.c   2 Nov 2020 10:50:24 -0000
@@ -339,7 +339,7 @@ again:
                         * This is kind of like a stupid idle loop.
                         */
 #ifdef MULTIPROCESSOR
-                       __mp_unlock(&sched_lock);
+                       mtx_leave(&sched_lock);
 #endif
                        spl0();
                        delay(10);
Index: kern/kern_sig.c
===================================================================
RCS file: /cvs/src/sys/kern/kern_sig.c,v
retrieving revision 1.263
diff -u -p -r1.263 kern_sig.c
--- kern/kern_sig.c     16 Sep 2020 13:50:42 -0000      1.263
+++ kern/kern_sig.c     3 Nov 2020 10:53:38 -0000
@@ -1956,7 +1956,7 @@ single_thread_check(struct proc *p, int 
                        }
 
                        if (atomic_dec_int_nv(&pr->ps_singlecount) == 0)
-                               wakeup(&pr->ps_singlecount);
+                               wakeup_n(&pr->ps_singlecount, 1, 1);
                        if (pr->ps_flags & PS_SINGLEEXIT) {
                                SCHED_UNLOCK(s);
                                KERNEL_LOCK();
Index: kern/kern_synch.c
===================================================================
RCS file: /cvs/src/sys/kern/kern_synch.c,v
retrieving revision 1.171
diff -u -p -r1.171 kern_synch.c
--- kern/kern_synch.c   23 Oct 2020 20:28:09 -0000      1.171
+++ kern/kern_synch.c   2 Nov 2020 23:12:10 -0000
@@ -306,7 +306,7 @@ rwsleep(const volatile void *ident, stru
        sleep_setup(&sls, ident, priority, wmesg);
        sleep_setup_timeout(&sls, timo);
 
-       rw_exit(rwl);
+       _rw_exit(rwl, 1);
        /* signal may stop the process, release rwlock before that */
        sleep_setup_signal(&sls);
 
@@ -525,11 +525,12 @@ sleep_finish_signal(struct sleep_state *
 }
 
 int
-wakeup_proc(struct proc *p, const volatile void *chan)
+_wakeup_proc_locked(struct proc *p, const volatile void *chan)
 {
-       int s, awakened = 0;
+       int awakened = 0;
+
+       SCHED_ASSERT_LOCKED();
 
-       SCHED_LOCK(s);
        if (p->p_wchan != NULL &&
           ((chan == NULL) || (p->p_wchan == chan))) {
                awakened = 1;
@@ -538,6 +539,17 @@ wakeup_proc(struct proc *p, const volati
                else
                        unsleep(p);
        }
+
+       return awakened;
+}
+
+int
+wakeup_proc(struct proc *p, const volatile void *chan)
+{
+       int s, awakened;
+
+       SCHED_LOCK(s);
+       awakened = _wakeup_proc_locked(p, chan);
        SCHED_UNLOCK(s);
 
        return awakened;
@@ -556,7 +568,7 @@ endtsleep(void *arg)
        int s;
 
        SCHED_LOCK(s);
-       if (wakeup_proc(p, NULL))
+       if (_wakeup_proc_locked(p, NULL))
                atomic_setbits_int(&p->p_flag, P_TIMEOUT);
        SCHED_UNLOCK(s);
 }
@@ -580,14 +592,18 @@ unsleep(struct proc *p)
  * Make a number of processes sleeping on the specified identifier runnable.
  */
 void
-wakeup_n(const volatile void *ident, int n)
+wakeup_n(const volatile void *ident, int n, int locked)
 {
        struct slpque *qp;
        struct proc *p;
        struct proc *pnext;
        int s;
 
-       SCHED_LOCK(s);
+       if (!locked)
+               SCHED_LOCK(s);
+       else
+               SCHED_ASSERT_LOCKED();
+
        qp = &slpque[LOOKUP(ident)];
        for (p = TAILQ_FIRST(qp); p != NULL && n != 0; p = pnext) {
                pnext = TAILQ_NEXT(p, p_runq);
@@ -604,10 +620,12 @@ wakeup_n(const volatile void *ident, int
                if (p->p_stat != SSLEEP && p->p_stat != SSTOP)
                        panic("wakeup: p_stat is %d", (int)p->p_stat);
 #endif
-               if (wakeup_proc(p, ident))
+               if (_wakeup_proc_locked(p, ident))
                        --n;
        }
-       SCHED_UNLOCK(s);
+
+       if (!locked)
+               SCHED_UNLOCK(s);
 }
 
 /*
@@ -616,7 +634,7 @@ wakeup_n(const volatile void *ident, int
 void
 wakeup(const volatile void *chan)
 {
-       wakeup_n(chan, -1);
+       wakeup_n(chan, -1, 0);
 }
 
 int
Index: kern/sched_bsd.c
===================================================================
RCS file: /cvs/src/sys/kern/sched_bsd.c,v
retrieving revision 1.64
diff -u -p -r1.64 sched_bsd.c
--- kern/sched_bsd.c    15 Oct 2020 07:49:55 -0000      1.64
+++ kern/sched_bsd.c    2 Nov 2020 10:50:24 -0000
@@ -59,7 +59,7 @@ int   lbolt;                  /* once a second sleep addr
 int    rrticks_init;           /* # of hardclock ticks per roundrobin() */
 
 #ifdef MULTIPROCESSOR
-struct __mp_lock sched_lock;
+struct mutex sched_lock;
 #endif
 
 void                   schedcpu(void *);
@@ -337,10 +337,8 @@ mi_switch(void)
        struct timespec ts;
 #ifdef MULTIPROCESSOR
        int hold_count;
-       int sched_count;
 #endif
 
-       assertwaitok();
        KASSERT(p->p_stat != SONPROC);
 
        SCHED_ASSERT_LOCKED();
@@ -349,7 +347,6 @@ mi_switch(void)
        /*
         * Release the kernel_lock, as we are about to yield the CPU.
         */
-       sched_count = __mp_release_all_but_one(&sched_lock);
        if (_kernel_lock_held())
                hold_count = __mp_release_all(&kernel_lock);
        else
@@ -407,9 +404,10 @@ mi_switch(void)
         * just release it here.
         */
 #ifdef MULTIPROCESSOR
-       __mp_unlock(&sched_lock);
+       mtx_leave(&sched_lock);
 #endif
 
+       assertwaitok();
        SCHED_ASSERT_UNLOCKED();
 
        smr_idle();
@@ -431,7 +429,7 @@ mi_switch(void)
         */
        if (hold_count)
                __mp_acquire_count(&kernel_lock, hold_count);
-       __mp_acquire_count(&sched_lock, sched_count + 1);
+       mtx_enter(&sched_lock);
 #endif
 }
 
Index: sys/rwlock.h
===================================================================
RCS file: /cvs/src/sys/sys/rwlock.h,v
retrieving revision 1.26
diff -u -p -r1.26 rwlock.h
--- sys/rwlock.h        16 Jul 2019 01:40:49 -0000      1.26
+++ sys/rwlock.h        2 Nov 2020 23:06:39 -0000
@@ -40,15 +40,6 @@
  *  atomically test for the lock being 0 (it's not possible to have
  *  owner/read count unset and waiter bits set) and if 0 set the owner to
  *  the proc and RWLOCK_WRLOCK. While not zero, loop into rw_enter_wait.
- *
- * void rw_exit_read(struct rwlock *);
- *  atomically decrement lock by RWLOCK_READ_INCR and unset RWLOCK_WAIT and
- *  RWLOCK_WRWANT remembering the old value of lock and if RWLOCK_WAIT was set,
- *  call rw_exit_waiters with the old contents of the lock.
- *
- * void rw_exit_write(struct rwlock *);
- *  atomically swap the contents of the lock with 0 and if RWLOCK_WAIT was
- *  set, call rw_exit_waiters with the old contents of the lock.
  */
 
 #ifndef _SYS_RWLOCK_H
@@ -149,8 +140,8 @@ void        _rw_init_flags(struct rwlock *, con
 
 void   rw_enter_read(struct rwlock *);
 void   rw_enter_write(struct rwlock *);
-void   rw_exit_read(struct rwlock *);
-void   rw_exit_write(struct rwlock *);
+#define rw_exit_read(rwl)      rw_exit(rwl)
+#define rw_exit_write(rwl)     rw_exit(rwl)
 
 #ifdef DIAGNOSTIC
 void   rw_assert_wrlock(struct rwlock *);
@@ -167,6 +158,7 @@ void        rw_assert_unlocked(struct rwlock *)
 int    rw_enter(struct rwlock *, int);
 void   rw_exit(struct rwlock *);
 int    rw_status(struct rwlock *);
+void   _rw_exit(struct rwlock *, int);
 
 void   _rrw_init_flags(struct rrwlock *, const char *, int,
            const struct lock_type *);
Index: sys/sched.h
===================================================================
RCS file: /cvs/src/sys/sys/sched.h,v
retrieving revision 1.56
diff -u -p -r1.56 sched.h
--- sys/sched.h 21 Oct 2019 10:24:01 -0000      1.56
+++ sys/sched.h 2 Nov 2020 10:50:24 -0000
@@ -194,37 +194,32 @@ void remrunqueue(struct proc *);
 } while (0)
 
 #if defined(MULTIPROCESSOR)
-#include <sys/lock.h>
+#include <sys/mutex.h>
 
-/*
- * XXX Instead of using struct lock for the kernel lock and thus requiring us
- * XXX to implement simplelocks, causing all sorts of fine-grained locks all
- * XXX over our tree to be activated, the sched_lock is a different kind of
- * XXX lock to avoid introducing locking protocol bugs.
- */
-extern struct __mp_lock sched_lock;
+extern struct mutex sched_lock;
 
 #define        SCHED_ASSERT_LOCKED()                                           \
 do {                                                                   \
        splassert(IPL_SCHED);                                           \
-       KASSERT(__mp_lock_held(&sched_lock, curcpu()));                 \
+       MUTEX_ASSERT_LOCKED(&sched_lock);                               \
 } while (0)
+
 #define        SCHED_ASSERT_UNLOCKED()                                         \
 do {                                                                   \
-       KASSERT(__mp_lock_held(&sched_lock, curcpu()) == 0);            \
+       MUTEX_ASSERT_UNLOCKED(&sched_lock);                             \
 } while (0)
 
-#define        SCHED_LOCK_INIT()       __mp_lock_init(&sched_lock)
+#define        SCHED_LOCK_INIT()       mtx_init(&sched_lock, IPL_NONE)
 
 #define        SCHED_LOCK(s)                                                   \
 do {                                                                   \
        s = splsched();                                                 \
-       __mp_lock(&sched_lock);                                         \
+       mtx_enter(&sched_lock);                                         \
 } while (/* CONSTCOND */ 0)
 
 #define        SCHED_UNLOCK(s)                                                 \
 do {                                                                   \
-       __mp_unlock(&sched_lock);                                       \
+       mtx_leave(&sched_lock);                                         \
        splx(s);                                                        \
 } while (/* CONSTCOND */ 0)
 
Index: sys/systm.h
===================================================================
RCS file: /cvs/src/sys/sys/systm.h,v
retrieving revision 1.148
diff -u -p -r1.148 systm.h
--- sys/systm.h 26 Aug 2020 03:29:07 -0000      1.148
+++ sys/systm.h 2 Nov 2020 23:10:43 -0000
@@ -274,9 +274,9 @@ void        cond_signal(struct cond *);
 
 struct mutex;
 struct rwlock;
-void    wakeup_n(const volatile void *, int);
+void    wakeup_n(const volatile void *, int, int);
 void    wakeup(const volatile void *);
-#define wakeup_one(c) wakeup_n((c), 1)
+#define wakeup_one(c) wakeup_n((c), 1, 0)
 int    tsleep(const volatile void *, int, const char *, int);
 int    tsleep_nsec(const volatile void *, int, const char *, uint64_t);
 int    msleep(const volatile void *, struct mutex *, int,  const char*, int);
