Re: timeout(9): add TIMEOUT_MPSAFE flag

2021-05-13 Thread Visa Hankala
On Thu, May 13, 2021 at 11:08:25AM -0500, Scott Cheloha wrote:
> > On May 13, 2021, at 10:57 AM, Visa Hankala  wrote:
> > 
> > On Thu, May 13, 2021 at 12:04:57AM -0500, Scott Cheloha wrote:
> >> The funky locking dance in softclock() and softclock_thread() is
> >> needed to keep from violating the locking hierarchy.  The
> >> timeout_mutex is above the kernel lock, so we need to leave the
> >> timeout_mutex, then drop the kernel lock, and then reenter the
> >> timeout_mutex to start running TIMEOUT_MPSAFE timeouts.
> > 
> > Are you sure that dropping the kernel lock in softclock(), in
> > non-process context, is a good idea? That makes assumptions about
> > how the layers above work.
> 
> I figured it was the most obvious way forward.
> 
> When you say "layers above," do you mean softintr_dispatch()?
> Or something else?

I mean softintr_dispatch() and anything else in the dispatch path.

> > At the minimum, I think the soft interrupt code has to be changed so
> > that it is possible to register MP-safe handlers.
> 
> Some platforms (e.g. arm64) have softintr_establish_flags(), but this
> is not universally available.
> 
> Do I need to unlock all the softintr internals before proceeding
with this?  Or do I just need to bring softintr_establish_flags()
> to every platform without it?

I think softintr_establish() could register an MP-safe handler if
IPL_MPSAFE is ORed into the ipl parameter.
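
For example, a caller could then request an unlocked handler along these
lines (sc, sc_sih and mydrv_softintr are made-up driver names; only the
IPL_MPSAFE convention is the point):

	/*
	 * Hypothetical: register a soft interrupt handler that runs
	 * without the kernel lock by ORing IPL_MPSAFE into the level.
	 * The handler is then responsible for all of its own locking.
	 */
	sc->sc_sih = softintr_establish(IPL_SOFTNET | IPL_MPSAFE,
	    mydrv_softintr, sc);
	if (sc->sc_sih == NULL)
		return (ENOMEM);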

Dropping the kernel lock has various implications. At least the code
has to ensure that at most one CPU can run a given handler at a time.
In addition, softintr_disestablish() should probably block until the
handler has stopped (at the moment, this is not guaranteed on the
platforms that have softintr_establish_flags()).

Below is a hasty sketch for amd64 that I am not suggesting to commit.
It is probably wrong in more ways than one.

Index: arch/amd64/amd64/softintr.c
===
RCS file: src/sys/arch/amd64/amd64/softintr.c,v
retrieving revision 1.10
diff -u -p -r1.10 softintr.c
--- arch/amd64/amd64/softintr.c 11 Sep 2020 09:27:09 -  1.10
+++ arch/amd64/amd64/softintr.c 13 May 2021 17:28:51 -
@@ -37,6 +37,7 @@
 #include 
 #include 
 #include 
+#include <sys/smr.h>
 
 #include 
 
@@ -85,7 +86,6 @@ softintr_dispatch(int which)
floor = ci->ci_handled_intr_level;
ci->ci_handled_intr_level = ci->ci_ilevel;
 
-   KERNEL_LOCK();
for (;;) {
mtx_enter(&si->softintr_lock);
sih = TAILQ_FIRST(&si->softintr_q);
@@ -95,14 +95,20 @@ softintr_dispatch(int which)
}
TAILQ_REMOVE(&si->softintr_q, sih, sih_q);
sih->sih_pending = 0;
-
-   uvmexp.softs++;
-
mtx_leave(&si->softintr_lock);
 
-   (*sih->sih_fn)(sih->sih_arg);
+   atomic_inc_int(&uvmexp.softs);
+
+   if (sih->sih_flags & IPL_MPSAFE) {
+   mtx_enter(&sih->sih_mtx);
+   (*sih->sih_fn)(sih->sih_arg);
+   mtx_leave(&sih->sih_mtx);
+   } else {
+   KERNEL_LOCK();
+   (*sih->sih_fn)(sih->sih_arg);
+   KERNEL_UNLOCK();
+   }
}
-   KERNEL_UNLOCK();
 
ci->ci_handled_intr_level = floor;
 }
@@ -117,7 +123,10 @@ softintr_establish(int ipl, void (*func)
 {
struct x86_soft_intr *si;
struct x86_soft_intrhand *sih;
-   int which;
+   int flags, which;
+
+   flags = ipl & IPL_MPSAFE;
+   ipl &= ~IPL_MPSAFE;
 
switch (ipl) {
case IPL_SOFTCLOCK:
@@ -141,9 +150,11 @@ softintr_establish(int ipl, void (*func)
 
sih = malloc(sizeof(*sih), M_DEVBUF, M_NOWAIT);
if (__predict_true(sih != NULL)) {
+   mtx_init(&sih->sih_mtx, IPL_NONE);
sih->sih_intrhead = si;
sih->sih_fn = func;
sih->sih_arg = arg;
+   sih->sih_flags = flags;
sih->sih_pending = 0;
}
return (sih);
@@ -167,5 +178,8 @@ softintr_disestablish(void *arg)
}
mtx_leave(&si->softintr_lock);
 
+   /* Wait for any running handlers to finish. */
+   smr_barrier();
+
free(sih, M_DEVBUF, sizeof(*sih));
 }
Index: arch/amd64/include/intr.h
===
RCS file: src/sys/arch/amd64/include/intr.h,v
retrieving revision 1.32
diff -u -p -r1.32 intr.h
--- arch/amd64/include/intr.h   17 Jun 2020 06:14:52 -  1.32
+++ arch/amd64/include/intr.h   13 May 2021 17:28:51 -
@@ -232,13 +232,16 @@ extern void (*ipifunc[X86_NIPI])(struct 
 
 #ifndef _LOCORE
 #include <sys/queue.h>
+#include <sys/mutex.h>
 
 struct x86_soft_intrhand {
TAILQ_ENTRY(x86_soft_intrhand)
sih_q;
+   struct mutex sih_mtx;
struct x86_soft_intr *sih_intrhead;
void(*sih_fn)(void *);
void*sih_arg;
+   int sih_flags;
int sih_pending;
 };
 



Re: timeout(9): add TIMEOUT_MPSAFE flag

2021-05-13 Thread Scott Cheloha
> On May 13, 2021, at 10:57 AM, Visa Hankala  wrote:
> 
> On Thu, May 13, 2021 at 12:04:57AM -0500, Scott Cheloha wrote:
>> The funky locking dance in softclock() and softclock_thread() is
>> needed to keep from violating the locking hierarchy.  The
>> timeout_mutex is above the kernel lock, so we need to leave the
>> timeout_mutex, then drop the kernel lock, and then reenter the
>> timeout_mutex to start running TIMEOUT_MPSAFE timeouts.
> 
> Are you sure that dropping the kernel lock in softclock(), in
> non-process context, is a good idea? That makes assumptions about
> how the layers above work.

I figured it was the most obvious way forward.

When you say "layers above," do you mean softintr_dispatch()?
Or something else?

> At the minimum, I think the soft interrupt code has to be changed so
> that it is possible to register MP-safe handlers.

Some platforms (e.g. arm64) have softintr_establish_flags(), but this
is not universally available.

Do I need to unlock all the softintr internals before proceeding
with this?  Or do I just need to bring softintr_establish_flags()
to every platform without it?



Re: timeout(9): add TIMEOUT_MPSAFE flag

2021-05-13 Thread Visa Hankala
On Thu, May 13, 2021 at 12:04:57AM -0500, Scott Cheloha wrote:
> The funky locking dance in softclock() and softclock_thread() is
> needed to keep from violating the locking hierarchy.  The
> timeout_mutex is above the kernel lock, so we need to leave the
> timeout_mutex, then drop the kernel lock, and then reenter the
> timeout_mutex to start running TIMEOUT_MPSAFE timeouts.

Are you sure that dropping the kernel lock in softclock(), in
non-process context, is a good idea? That makes assumptions about
how the layers above work.

At the minimum, I think the soft interrupt code has to be changed so
that it is possible to register MP-safe handlers.



timeout(9): add TIMEOUT_MPSAFE flag

2021-05-12 Thread Scott Cheloha
Hi,

With the removal of the kernel lock from timeout_barrier(9),
softclock() and the timeout thread do not need the kernel lock.

However, many timeouts implicitly rely on the kernel lock.

So to unlock softclock() and the timeout thread I propose adding a new
flag, TIMEOUT_MPSAFE.  The flag signifies that a given timeout doesn't
need the kernel lock at all.  We'll run all such timeouts without the
kernel lock.
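
For example, a driver whose timeout handler takes only its own locks could
opt in along these lines (a sketch: sc, sc_tmo and mydrv_tick are made-up
names, and I'm assuming the existing timeout_set_flags() setter would simply
accept the new flag):

	/* Mark the timeout as safe to run without the kernel lock. */
	timeout_set_flags(&sc->sc_tmo, mydrv_tick, sc, TIMEOUT_MPSAFE);

	/* Scheduling is unchanged; only the dispatch context differs. */
	timeout_add_msec(&sc->sc_tmo, 1000);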

Taking/releasing the kernel lock during timeout_run() on a per-timeout
basis is way too slow.  Instead, during softclock() we can aggregate
TIMEOUT_MPSAFE timeouts onto secondary queues and process them all at
once after dropping the kernel lock.  This will minimize locking
overhead.

Normal timeouts are put onto timeout_todo_mpsafe and are run during
softclock().  Process timeouts are put onto timeout_proc_mpsafe and
run from the timeout thread.

The funky locking dance in softclock() and softclock_thread() is
needed to keep from violating the locking hierarchy.  The
timeout_mutex is above the kernel lock, so we need to leave the
timeout_mutex, then drop the kernel lock, and then reenter the
timeout_mutex to start running TIMEOUT_MPSAFE timeouts.
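
Condensed, the dance in softclock() looks roughly like this (abridged from
the patch below, statistics and bookkeeping omitted):

	/* Leave timeout_mutex before the kernel lock is dropped... */
	mtx_leave(&timeout_mutex);
	KERNEL_UNLOCK();
	/* ...then re-enter it to drain the MP-safe queue. */
	mtx_enter(&timeout_mutex);
	while (!CIRCQ_EMPTY(&timeout_todo_mpsafe)) {
		to = timeout_from_circq(CIRCQ_FIRST(&timeout_todo_mpsafe));
		CIRCQ_REMOVE(&to->to_list);
		timeout_run(to);	/* runs without the kernel lock */
	}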

Thoughts?

Index: kern/kern_timeout.c
===
RCS file: /cvs/src/sys/kern/kern_timeout.c,v
retrieving revision 1.84
diff -u -p -r1.84 kern_timeout.c
--- kern/kern_timeout.c 11 May 2021 13:29:25 -  1.84
+++ kern/kern_timeout.c 13 May 2021 05:01:53 -
struct circq timeout_wheel[BUCKETS];  /* [T] Tick-based timeouts */
 struct circq timeout_wheel_kc[BUCKETS];/* [T] Clock-based timeouts */
 struct circq timeout_new;  /* [T] New, unscheduled timeouts */
 struct circq timeout_todo; /* [T] Due or needs rescheduling */
+struct circq timeout_todo_mpsafe;  /* [T] Due + no kernel lock */
 struct circq timeout_proc; /* [T] Due + needs process context */
+struct circq timeout_proc_mpsafe;  /* [T] Due, proc ctx, no kernel lock */
 
time_t timeout_level_width[WHEELCOUNT];/* [I] Wheel level width (seconds) */
 struct timespec tick_ts;   /* [I] Length of a tick (1/hz secs) */
@@ -228,7 +230,9 @@ timeout_startup(void)
 
CIRCQ_INIT(&timeout_new);
CIRCQ_INIT(&timeout_todo);
+   CIRCQ_INIT(&timeout_todo_mpsafe);
CIRCQ_INIT(&timeout_proc);
+   CIRCQ_INIT(&timeout_proc_mpsafe);
for (b = 0; b < nitems(timeout_wheel); b++)
CIRCQ_INIT(&timeout_wheel[b]);
for (b = 0; b < nitems(timeout_wheel_kc); b++)
@@ -697,7 +701,13 @@ softclock_process_kclock_timeout(struct 
if (!new && timespeccmp(&to->to_abstime, &kc->kc_late, <=))
tostat.tos_late++;
if (ISSET(to->to_flags, TIMEOUT_PROC)) {
-   CIRCQ_INSERT_TAIL(&timeout_proc, &to->to_list);
+   if (ISSET(to->to_flags, TIMEOUT_MPSAFE))
+   CIRCQ_INSERT_TAIL(&timeout_proc_mpsafe, &to->to_list);
+   else
+   CIRCQ_INSERT_TAIL(&timeout_proc, &to->to_list);
+   return;
+   } else if (ISSET(to->to_flags, TIMEOUT_MPSAFE)) {
+   CIRCQ_INSERT_TAIL(&timeout_todo_mpsafe, &to->to_list);
return;
}
timeout_run(to);
@@ -719,7 +729,13 @@ softclock_process_tick_timeout(struct ti
if (!new && delta < 0)
tostat.tos_late++;
if (ISSET(to->to_flags, TIMEOUT_PROC)) {
-   CIRCQ_INSERT_TAIL(&timeout_proc, &to->to_list);
+   if (ISSET(to->to_flags, TIMEOUT_MPSAFE))
+   CIRCQ_INSERT_TAIL(&timeout_proc_mpsafe, &to->to_list);
+   else
+   CIRCQ_INSERT_TAIL(&timeout_proc, &to->to_list);
+   return;
+   } else if (ISSET(to->to_flags, TIMEOUT_MPSAFE)) {
+   CIRCQ_INSERT_TAIL(&timeout_todo_mpsafe, &to->to_list);
return;
}
timeout_run(to);
@@ -735,11 +751,14 @@ softclock_process_tick_timeout(struct ti
 void
 softclock(void *arg)
 {
+   struct circq *cq;
struct timeout *first_new, *to;
-   int needsproc, new;
+   int needsproc, new, unlocked;
 
first_new = NULL;
+   needsproc = 1;
new = 0;
+   unlocked = 0;
 
mtx_enter(&timeout_mutex);
if (!CIRCQ_EMPTY(&timeout_new))
@@ -755,10 +774,27 @@ softclock(void *arg)
else
softclock_process_tick_timeout(to, new);
}
+   if (!CIRCQ_EMPTY(&timeout_todo_mpsafe)) {
+   mtx_leave(&timeout_mutex);
+   KERNEL_UNLOCK();
+   unlocked = 1;
+   mtx_enter(&timeout_mutex);
+   while (!CIRCQ_EMPTY(&timeout_todo_mpsafe)) {
+   cq = CIRCQ_FIRST(&timeout_todo_mpsafe);
+   to = timeout_from_circq(cq);
+   CIRCQ_REMOVE(&to->to_list);
+   timeout_run(to);
+   tostat.tos_run_softclock++;
+   }
+   }
tostat.tos_softclocks++;
-   needsproc = !CIRCQ_EMPTY(&timeout_proc);
+   if (CIRCQ_EMPTY(&timeout_proc) && CIRCQ_EMPTY(&timeout_proc_mpsafe))
+