On Wed, Nov 04, 2020 at 09:13:22AM -0300, Martin Pieuchot wrote:
> The diff below removes the recursive attribute of the SCHED_LOCK() by
> turning it into an IPL_NONE mutex. I'm not intending to commit it
> yet, as it raises multiple questions, see below.
>
> This work was started by art@ more than a decade ago and I'm
> willing to finish it, as I believe it's the easiest way to reduce
> the scope of this lock. Having a global mutex is the first step
> toward having a per-runqueue and per-sleepqueue mutex.
>
> This is also a way to avoid lock ordering problems exposed by the recent
> races in single_thread_set().
>
> About the diff:
>
> The diff below includes an (ugly) refactoring of rw_exit() to avoid a
> recursion on the SCHED_LOCK(). In this case the lock is used to protect
> the global sleepqueue and is grabbed in sleep_setup().
>
> The same pattern can be observed in single_thread_check(). However in
> this case the lock is used to protect different fields so there's no
> "recursive access" to the same data structure.
>
> assertwaitok() has been moved down in mi_switch(), which isn't ideal.
>
> It becomes obvious that the per-CPU and per-thread accounting fields
> updated in mi_switch() won't need a separate mutex as proposed last
> year and that splitting this global mutex will be enough.
>
> It's unclear to me if/how WITNESS should be modified to handle this lock
> change.
>
> This has been tested on sparc64 and amd64. I'm not convinced it exposed
> all the recursions. So if you want to give it a go and can break it, it
> is more than welcome.
>
> Comments? Questions?
I think that's a good direction. It then allows us to add a few
smaller mutexes and move some users over to those.
In the meantime some things have changed a little, so I had to fix one
reject, which was easy because only the context around the diff had
changed slightly.
I then found one schedlock recursion: single_thread_check(), which we
call through sleep_signal_check(), takes the sched lock, and we make
that call twice, once with the sched lock held and once without. My
'quick' fix is to introduce a locked variant, sleep_signal_check_locked(),
which calls single_thread_check_locked(). There might be a nicer way to
do that, which I haven't found yet, but this diff seems to work fine on
my arm64 machine with amdgpu.
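To make the pattern easier to see outside the diff, here is a minimal,
generic user-space sketch of the same locked/unlocked split (the names
check(), check_locked(), sched_mtx and singlecount are made up for the
example, this is not the kernel code): the real work moves into a
"_locked" variant that expects the lock to be held, and the original
name becomes a thin wrapper that takes and drops the lock.

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t sched_mtx = PTHREAD_MUTEX_INITIALIZER;
static int singlecount;

/* Callers must hold sched_mtx, like single_thread_check_locked(). */
static int
check_locked(void)
{
	/* ... inspect state protected by sched_mtx ... */
	return singlecount;
}

/* Unlocked wrapper, like sleep_signal_check() taking SCHED_LOCK(). */
static int
check(void)
{
	int r;

	pthread_mutex_lock(&sched_mtx);
	r = check_locked();
	pthread_mutex_unlock(&sched_mtx);
	return r;
}

int
main(void)
{
	/* A path that already holds the lock calls the _locked variant... */
	pthread_mutex_lock(&sched_mtx);
	printf("locked path: %d\n", check_locked());
	pthread_mutex_unlock(&sched_mtx);

	/* ...everything else keeps calling the plain wrapper. */
	printf("unlocked path: %d\n", check());
	return 0;
}

With a non-recursive mutex the path that already holds the lock must
never re-enter the wrapper, which is exactly what the recursion in
sleep_signal_check() was doing.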
Build times increase a little, which might be because mutexes are not
'fair' the way mplocks are. If we are looking for 'instant gratification'
we might have to do two steps at once (mplock -> mtx plus moving the
contention onto multiple mtxs). In any case, I believe this is a good
step to take, and I'll have a look at reducing the contention.
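For illustration, the fairness difference comes down to the acquisition
loop: a ticket-style lock serves waiters strictly in arrival order (the
'fair' behaviour), while a plain compare-and-swap spinlock, which is how
a mutex spins, lets whichever CPU wins the race go next, so a waiter can
starve while another CPU re-takes the lock over and over. A generic
user-space sketch of the two loops (not the kernel implementation):

#include <stdatomic.h>

/* Ticket lock: waiters run strictly in the order they arrived. */
struct ticket_lock {
	atomic_uint next;	/* ticket handed to the next arrival */
	atomic_uint serving;	/* ticket currently allowed in */
};

static void
ticket_lock(struct ticket_lock *l)
{
	unsigned int me = atomic_fetch_add(&l->next, 1);

	while (atomic_load(&l->serving) != me)
		;	/* spin until it is our turn */
}

static void
ticket_unlock(struct ticket_lock *l)
{
	atomic_fetch_add(&l->serving, 1);
}

/* CAS spinlock: whichever CPU wins the race after a release goes next. */
static atomic_flag spin = ATOMIC_FLAG_INIT;

static void
spin_lock(void)
{
	while (atomic_flag_test_and_set(&spin))
		;	/* no ordering among waiters, starvation is possible */
}

static void
spin_unlock(void)
{
	atomic_flag_clear(&spin);
}

With the ticket lock a CPU that just released the lock cannot grab it
again ahead of older waiters; losing that property in the mplock -> mtx
conversion could account for the slightly longer build times.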
Patrick
diff --git a/sys/kern/kern_fork.c b/sys/kern/kern_fork.c
index 1e51f71301a..4e4e1fc2a27 100644
--- a/sys/kern/kern_fork.c
+++ b/sys/kern/kern_fork.c
@@ -671,7 +671,7 @@ void
proc_trampoline_mp(void)
{
SCHED_ASSERT_LOCKED();
- __mp_unlock(&sched_lock);
+ mtx_leave(&sched_lock);
spl0();
SCHED_ASSERT_UNLOCKED();
KERNEL_ASSERT_UNLOCKED();
diff --git a/sys/kern/kern_lock.c b/sys/kern/kern_lock.c
index 5cc55bb256a..2206ac53321 100644
--- a/sys/kern/kern_lock.c
+++ b/sys/kern/kern_lock.c
@@ -97,9 +97,6 @@ ___mp_lock_init(struct __mp_lock *mpl, const struct lock_type *type)
if (mpl == &kernel_lock)
mpl->mpl_lock_obj.lo_flags = LO_WITNESS | LO_INITIALIZED |
LO_SLEEPABLE | (LO_CLASS_KERNEL_LOCK << LO_CLASSSHIFT);
- else if (mpl == &sched_lock)
- mpl->mpl_lock_obj.lo_flags = LO_WITNESS | LO_INITIALIZED |
- LO_RECURSABLE | (LO_CLASS_SCHED_LOCK << LO_CLASSSHIFT);
WITNESS_INIT(&mpl->mpl_lock_obj, type);
#endif
}
diff --git a/sys/kern/kern_rwlock.c b/sys/kern/kern_rwlock.c
index d79b59748e8..2feeff16943 100644
--- a/sys/kern/kern_rwlock.c
+++ b/sys/kern/kern_rwlock.c
@@ -129,36 +129,6 @@ rw_enter_write(struct rwlock *rwl)
}
}
-void
-rw_exit_read(struct rwlock *rwl)
-{
- unsigned long owner;
-
- rw_assert_rdlock(rwl);
- WITNESS_UNLOCK(&rwl->rwl_lock_obj, 0);
-
- membar_exit_before_atomic();
- owner = rwl->rwl_owner;
- if (__predict_false((owner & RWLOCK_WAIT) ||
- rw_cas(&rwl->rwl_owner, owner, owner - RWLOCK_READ_INCR)))
- rw_do_exit(rwl, 0);
-}
-
-void
-rw_exit_write(struct rwlock *rwl)
-{
- unsigned long owner;
-
- rw_assert_wrlock(rwl);
- WITNESS_UNLOCK(&rwl->rwl_lock_obj, LOP_EXCLUSIVE);
-
- membar_exit_before_atomic();
- owner = rwl->rwl_owner;
- if (__predict_false((owner & RWLOCK_WAIT) ||
- rw_cas(&rwl->rwl_owner, owner, 0)))
- rw_do_exit(rwl, RWLOCK_WRLOCK);
-}
-
#ifdef DIAGNOSTIC
/*
* Put the diagnostic functions here to keep the main code free
@@ -313,9 +283,10 @@ retry:
}
void
-rw_exit(struct rwlock *rwl)
+_rw_exit(struct rwlock *rwl, int locked)
{
unsigned long wrlock;
+ unsigned long owner, set;
/* Avoid deadlocks after panic or in DDB */
if (panicstr || db_active)
@@ -329,15 +300,6 @@ rw_exit(struct rwlock *rwl)
WITNESS_UNLOCK(&rwl->rwl_lock_obj, wrlock ? LOP_EXCLUSIVE : 0);
membar_exit_before_atomic();
- rw_do_exit(rwl, wrlock);
-}
-
-/* membar_exit_before_atomic() has to precede call of this function. */
-void
-rw_do_exit(struct rwlock *rwl, unsigned long wrlock)
-{
- unsigned long owner, set;
-
do {
owner = rwl->rwl_owner;
if (wrlock)
@@ -348,7 +310,13 @@ rw_do_exit(struct rwlock *rwl, unsigned long wrlock)
} while (__predict_false(rw_cas(&rwl->rwl_owner, owner, set)));
if (owner & RWLOCK_WAIT)
- wakeup(rwl);
+ wakeup_n(rwl, -1, locked);
+}
+
+void
+rw_exit(struct rwlock *rwl)
+{
+ _rw_exit(rwl, 0);
}
int
diff --git a/sys/kern/kern_sched.c b/sys/kern/kern_sched.c
index 5510108a7d5..7589b698d79 100644
--- a/sys/kern/kern_sched.c
+++ b/sys/kern/kern_sched.c
@@ -339,7 +339,7 @@ again:
* This is kind of like a stupid idle loop.
*/
#ifdef MULTIPROCESSOR
- __mp_unlock(&sched_lock);
+ mtx_leave(&sched_lock);
#endif
spl0();
delay(10);
diff --git a/sys/kern/kern_sig.c b/sys/kern/kern_sig.c
index f23ffe9c1d6..d867862b18a 100644
--- a/sys/kern/kern_sig.c
+++ b/sys/kern/kern_sig.c
@@ -1969,7 +1969,7 @@ single_thread_check_locked(struct proc *p, int deep, int s)
continue;
if (atomic_dec_int_nv(&pr->ps_singlecount) == 0)
- wakeup(&pr->ps_singlecount);
+ wakeup_n(&pr->ps_singlecount, 1, 1);
if (pr->ps_flags & PS_SINGLEEXIT) {
SCHED_UNLOCK(s);
diff --git a/sys/kern/kern_synch.c b/sys/kern/kern_synch.c
index b476a6b4253..73438e8ecdd 100644
--- a/sys/kern/kern_synch.c
+++ b/sys/kern/kern_synch.c
@@ -66,6 +66,7 @@
#endif
int sleep_signal_check(void);
+int sleep_signal_check_locked(int s);
int thrsleep(struct proc *, struct sys___thrsleep_args *);
int thrsleep_unlock(void *);
@@ -308,7 +309,7 @@ rwsleep(const volatile void *ident, struct rwlock *rwl, int priority,
sleep_setup(&sls, ident, priority, wmesg, timo);
- rw_exit(rwl);
+ _rw_exit(rwl, 1);
/* signal may stop the process, release rwlock before that */
error = sleep_finish(&sls, 1);
@@ -410,7 +411,7 @@ sleep_finish(struct sleep_state *sls, int do_sleep)
* that case we need to unwind immediately.
*/
atomic_setbits_int(&p->p_flag, P_SINTR);
- if ((error = sleep_signal_check()) != 0) {
+ if ((error = sleep_signal_check_locked(sls->sls_s)) != 0) {
p->p_stat = SONPROC;
sls->sls_catch = 0;
do_sleep = 0;
@@ -473,11 +474,23 @@ sleep_finish(struct sleep_state *sls, int do_sleep)
*/
int
sleep_signal_check(void)
+{
+ int err, s;
+
+ SCHED_LOCK(s);
+ err = sleep_signal_check_locked(s);
+ SCHED_UNLOCK(s);
+
+ return err;
+}
+
+int
+sleep_signal_check_locked(int s)
{
struct proc *p = curproc;
int err, sig;
- if ((err = single_thread_check(p, 1)) != 0)
+ if ((err = single_thread_check_locked(p, 1, s)) != 0)
return err;
if ((sig = cursig(p)) != 0) {
if (p->p_p->ps_sigacts->ps_sigintr & sigmask(sig))
@@ -489,11 +502,12 @@ sleep_signal_check(void)
}
int
-wakeup_proc(struct proc *p, const volatile void *chan)
+_wakeup_proc_locked(struct proc *p, const volatile void *chan)
{
- int s, awakened = 0;
+ int awakened = 0;
+
+ SCHED_ASSERT_LOCKED();
- SCHED_LOCK(s);
if (p->p_wchan != NULL &&
((chan == NULL) || (p->p_wchan == chan))) {
awakened = 1;
@@ -502,6 +516,17 @@ wakeup_proc(struct proc *p, const volatile void *chan)
else
unsleep(p);
}
+
+ return awakened;
+}
+
+int
+wakeup_proc(struct proc *p, const volatile void *chan)
+{
+ int s, awakened;
+
+ SCHED_LOCK(s);
+ awakened = _wakeup_proc_locked(p, chan);
SCHED_UNLOCK(s);
return awakened;
@@ -521,7 +546,7 @@ endtsleep(void *arg)
int s;
SCHED_LOCK(s);
- if (wakeup_proc(p, NULL))
+ if (_wakeup_proc_locked(p, NULL))
atomic_setbits_int(&p->p_flag, P_TIMEOUT);
SCHED_UNLOCK(s);
}
@@ -545,14 +570,18 @@ unsleep(struct proc *p)
* Make a number of processes sleeping on the specified identifier runnable.
*/
void
-wakeup_n(const volatile void *ident, int n)
+wakeup_n(const volatile void *ident, int n, int locked)
{
struct slpque *qp;
struct proc *p;
struct proc *pnext;
int s;
- SCHED_LOCK(s);
+ if (!locked)
+ SCHED_LOCK(s);
+ else
+ SCHED_ASSERT_LOCKED();
+
qp = &slpque[LOOKUP(ident)];
for (p = TAILQ_FIRST(qp); p != NULL && n != 0; p = pnext) {
pnext = TAILQ_NEXT(p, p_runq);
@@ -569,10 +598,12 @@ wakeup_n(const volatile void *ident, int n)
if (p->p_stat != SSLEEP && p->p_stat != SSTOP)
panic("wakeup: p_stat is %d", (int)p->p_stat);
#endif
- if (wakeup_proc(p, ident))
+ if (_wakeup_proc_locked(p, ident))
--n;
}
- SCHED_UNLOCK(s);
+
+ if (!locked)
+ SCHED_UNLOCK(s);
}
/*
@@ -581,7 +612,7 @@ wakeup_n(const volatile void *ident, int n)
void
wakeup(const volatile void *chan)
{
- wakeup_n(chan, -1);
+ wakeup_n(chan, -1, 0);
}
int
diff --git a/sys/kern/sched_bsd.c b/sys/kern/sched_bsd.c
index 3b0a0b2cfc7..eea875e47db 100644
--- a/sys/kern/sched_bsd.c
+++ b/sys/kern/sched_bsd.c
@@ -59,7 +59,7 @@ int lbolt; /* once a second sleep address */
int rrticks_init; /* # of hardclock ticks per roundrobin() */
#ifdef MULTIPROCESSOR
-struct __mp_lock sched_lock;
+struct mutex sched_lock;
#endif
void schedcpu(void *);
@@ -337,10 +337,8 @@ mi_switch(void)
struct timespec ts;
#ifdef MULTIPROCESSOR
int hold_count;
- int sched_count;
#endif
- assertwaitok();
KASSERT(p->p_stat != SONPROC);
SCHED_ASSERT_LOCKED();
@@ -349,7 +347,6 @@ mi_switch(void)
/*
* Release the kernel_lock, as we are about to yield the CPU.
*/
- sched_count = __mp_release_all_but_one(&sched_lock);
if (_kernel_lock_held())
hold_count = __mp_release_all(&kernel_lock);
else
@@ -407,9 +404,10 @@ mi_switch(void)
* just release it here.
*/
#ifdef MULTIPROCESSOR
- __mp_unlock(&sched_lock);
+ mtx_leave(&sched_lock);
#endif
+ assertwaitok();
SCHED_ASSERT_UNLOCKED();
smr_idle();
@@ -431,7 +429,7 @@ mi_switch(void)
*/
if (hold_count)
__mp_acquire_count(&kernel_lock, hold_count);
- __mp_acquire_count(&sched_lock, sched_count + 1);
+ mtx_enter(&sched_lock);
#endif
}
diff --git a/sys/sys/proc.h b/sys/sys/proc.h
index 4dbc097f242..978fd5632cd 100644
--- a/sys/sys/proc.h
+++ b/sys/sys/proc.h
@@ -605,6 +605,7 @@ int single_thread_set(struct proc *, enum single_thread_mode, int);
int single_thread_wait(struct process *, int);
void single_thread_clear(struct proc *, int);
int single_thread_check(struct proc *, int);
+int single_thread_check_locked(struct proc *, int, int);
void child_return(void *);
diff --git a/sys/sys/rwlock.h b/sys/sys/rwlock.h
index f5a2504f4e8..fbed8783805 100644
--- a/sys/sys/rwlock.h
+++ b/sys/sys/rwlock.h
@@ -40,15 +40,6 @@
* atomically test for the lock being 0 (it's not possible to have
* owner/read count unset and waiter bits set) and if 0 set the owner to
* the proc and RWLOCK_WRLOCK. While not zero, loop into rw_enter_wait.
- *
- * void rw_exit_read(struct rwlock *);
- * atomically decrement lock by RWLOCK_READ_INCR and unset RWLOCK_WAIT and
- * RWLOCK_WRWANT remembering the old value of lock and if RWLOCK_WAIT was set,
- * call rw_exit_waiters with the old contents of the lock.
- *
- * void rw_exit_write(struct rwlock *);
- * atomically swap the contents of the lock with 0 and if RWLOCK_WAIT was
- * set, call rw_exit_waiters with the old contents of the lock.
*/
#ifndef _SYS_RWLOCK_H
@@ -149,8 +140,8 @@ void _rw_init_flags(struct rwlock *, const char *, int,
void rw_enter_read(struct rwlock *);
void rw_enter_write(struct rwlock *);
-void rw_exit_read(struct rwlock *);
-void rw_exit_write(struct rwlock *);
+#define rw_exit_read(rwl) rw_exit(rwl)
+#define rw_exit_write(rwl) rw_exit(rwl)
#ifdef DIAGNOSTIC
void rw_assert_wrlock(struct rwlock *);
@@ -167,6 +158,7 @@ void rw_assert_unlocked(struct rwlock *);
int rw_enter(struct rwlock *, int);
void rw_exit(struct rwlock *);
int rw_status(struct rwlock *);
+void _rw_exit(struct rwlock *, int);
static inline int
rw_read_held(struct rwlock *rwl)
diff --git a/sys/sys/sched.h b/sys/sys/sched.h
index e5d461dcf81..b85bba9297f 100644
--- a/sys/sys/sched.h
+++ b/sys/sys/sched.h
@@ -195,37 +195,32 @@ void remrunqueue(struct proc *);
} while (0)
#if defined(MULTIPROCESSOR)
-#include <sys/lock.h>
+#include <sys/mutex.h>
-/*
- * XXX Instead of using struct lock for the kernel lock and thus requiring us
- * XXX to implement simplelocks, causing all sorts of fine-grained locks all
- * XXX over our tree to be activated, the sched_lock is a different kind of
- * XXX lock to avoid introducing locking protocol bugs.
- */
-extern struct __mp_lock sched_lock;
+extern struct mutex sched_lock;
#define SCHED_ASSERT_LOCKED() \
do { \
splassert(IPL_SCHED); \
- KASSERT(__mp_lock_held(&sched_lock, curcpu())); \
+ MUTEX_ASSERT_LOCKED(&sched_lock); \
} while (0)
+
#define SCHED_ASSERT_UNLOCKED() \
do { \
- KASSERT(__mp_lock_held(&sched_lock, curcpu()) == 0); \
+ MUTEX_ASSERT_UNLOCKED(&sched_lock); \
} while (0)
-#define SCHED_LOCK_INIT() __mp_lock_init(&sched_lock)
+#define SCHED_LOCK_INIT() mtx_init(&sched_lock, IPL_NONE)
#define SCHED_LOCK(s) \
do { \
s = splsched(); \
- __mp_lock(&sched_lock); \
+ mtx_enter(&sched_lock); \
} while (/* CONSTCOND */ 0)
#define SCHED_UNLOCK(s) \
do { \
- __mp_unlock(&sched_lock); \
+ mtx_leave(&sched_lock); \
splx(s); \
} while (/* CONSTCOND */ 0)
diff --git a/sys/sys/systm.h b/sys/sys/systm.h
index a0ef9a3dd7f..f4a816ff855 100644
--- a/sys/sys/systm.h
+++ b/sys/sys/systm.h
@@ -265,9 +265,9 @@ void cond_signal(struct cond *);
struct mutex;
struct rwlock;
-void wakeup_n(const volatile void *, int);
+void wakeup_n(const volatile void *, int, int);
void wakeup(const volatile void *);
-#define wakeup_one(c) wakeup_n((c), 1)
+#define wakeup_one(c) wakeup_n((c), 1, 0)
int tsleep(const volatile void *, int, const char *, int);
int tsleep_nsec(const volatile void *, int, const char *, uint64_t);
int msleep(const volatile void *, struct mutex *, int, const char*, int);