On Sun, Nov 29, 2020 at 02:01:39AM +1000, Nicholas Piggin wrote:
> +              * - A delayed freeing and RCU-like quiescing sequence based on
> +              *   mm switching to avoid IPIs completely.

That one's interesting too. So basically you want to count switch_mm()
invocations on each CPU. Then, periodically snapshot the counter on each
CPU, and when they've all changed, increment a global counter.

Then, you snapshot the global counter and wait for it to increment
(twice I think, the first increment might already be in progress).

The only question here is what should drive this machinery... the tick,
probably.
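In toy form the bookkeeping is just this (illustrative C with a flat array
standing in for per-cpu data, no memory ordering, no hotplug, made-up names;
the actual patch is below):

/* Toy illustration only. */
static unsigned long nr_mm_switches[NR_CPUS];	/* ++ in switch_mm()                */
static unsigned long snapshot[NR_CPUS];		/* per-CPU value at last generation */
static unsigned long generation;		/* bumped when all CPUs switched    */

static void poll_mm_switches(void)		/* run periodically, e.g. from the tick */
{
	int cpu;

	for (cpu = 0; cpu < NR_CPUS; cpu++) {
		/* this CPU hasn't done switch_mm() since the last generation */
		if (nr_mm_switches[cpu] == snapshot[cpu])
			return;
	}

	/* every CPU switched at least once; take new snapshots, new generation */
	for (cpu = 0; cpu < NR_CPUS; cpu++)
		snapshot[cpu] = nr_mm_switches[cpu];
	generation++;
}

A waiter snapshots generation and sleeps until it has advanced by two. The
patch below does the same thing, but distributed: each CPU updates its own
slot from its own tick and an atomic countdown replaces the global scan.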

This shouldn't be too hard to do I think.

Something a little like so perhaps?


diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 41404afb7f4c..27b64a60a468 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -4525,6 +4525,7 @@ context_switch(struct rq *rq, struct task_struct *prev,
                 * finish_task_switch()'s mmdrop().
                 */
                switch_mm_irqs_off(prev->active_mm, next->mm, next);
+               rq->nr_mm_switches++;
 
                if (!prev->mm) {                        // from kernel
                        /* will mmdrop() in finish_task_switch(). */
@@ -4739,6 +4740,80 @@ unsigned long long task_sched_runtime(struct task_struct *p)
        return ns;
 }
 
+static DEFINE_PER_CPU(unsigned long[2], mm_switches);
+
+static struct {
+       unsigned long generation;
+       atomic_t complete;
+       struct wait_queue_head wait;
+} mm_foo = {
+       .generation = 0,
+       .complete = ATOMIC_INIT(-1), // XXX bootstrap, hotplug
+       .wait = __WAIT_QUEUE_HEAD_INITIALIZER(mm_foo.wait),
+};
+
+static bool mm_gen_tick(int cpu, struct rq *rq)
+{
+       unsigned long prev, curr, switches = rq->nr_mm_switches;
+       int idx = READ_ONCE(mm_foo.generation) & 1;
+
+       /* DATA-DEP on mm_foo.generation */
+
+       prev = __this_cpu_read(mm_switches[idx ^ 1]);
+       curr = __this_cpu_read(mm_switches[idx]);
+
+       /* we haven't switched since the last generation */
+       if (prev == switches)
+               return false;
+
+       __this_cpu_write(mm_switches[idx], switches);
+
+       /*
+        * If @curr is less than @prev, this is the first update of
+        * this generation; per the above, switches has also increased since,
+        * so mark our CPU complete.
+        */
+       if ((long)(curr - prev) < 0 && atomic_dec_and_test(&mm_foo.complete)) {
+               /*
+                * All CPUs are complete, IOW they all switched at least once
+                * since the last generation. Reset the completion counter and
+                * increment the generation.
+                */
+               atomic_set(&mm_foo.complete, num_online_cpus());
+               /*
+                * Matches the address dependency above:
+                *
+                *   idx = gen & 1      complete = nr_cpus
+                *   <DATA-DEP>         <WMB>
+                *   curr = sw[idx]     generation++;
+                *   prev = sw[idx^1]
+                *   if (curr < prev)
+                *     complete--
+                *
+                * If we don't observe the new generation, we'll not decrement.
+                * If we do see the new generation, we must also see the new
+                * completion count.
+                */
+               smp_wmb();
+               mm_foo.generation++;
+               return true;
+       }
+
+       return false;
+}
+
+static void mm_gen_wake(void)
+{
+       wake_up_all(&mm_foo.wait);
+}
+
+static void mm_gen_wait(void)
+{
+       unsigned int gen = READ_ONCE(mm_foo.generation);
+       wait_event(mm_foo.wait, READ_ONCE(mm_foo.generation) - gen > 1);
+}
+
 /*
  * This function gets called by the timer code, with HZ frequency.
  * We call it with interrupts disabled.
@@ -4750,6 +4825,7 @@ void scheduler_tick(void)
        struct task_struct *curr = rq->curr;
        struct rq_flags rf;
        unsigned long thermal_pressure;
+       bool wake_mm_gen;
 
        arch_scale_freq_tick();
        sched_clock_tick();
@@ -4763,8 +4839,13 @@ void scheduler_tick(void)
        calc_global_load_tick(rq);
        psi_task_tick(rq);
 
+       wake_mm_gen = mm_gen_tick(cpu, rq);
+
        rq_unlock(rq, &rf);
 
+       if (wake_mm_gen)
+               mm_gen_wake();
+
        perf_event_task_tick();
 
 #ifdef CONFIG_SMP
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index bf9d8da7d35e..62fb685db8d0 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -927,6 +927,7 @@ struct rq {
        unsigned int            ttwu_pending;
 #endif
        u64                     nr_switches;
+       u64                     nr_mm_switches;
 
 #ifdef CONFIG_UCLAMP_TASK
        /* Utilization clamp values based on CPU's RUNNABLE tasks */

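For completeness, the consumer side of the quiescing sequence Nicholas
described would then be little more than this (purely illustrative; the
function name and call site are made up):

/* Illustrative only: teardown side for a lazy-tlb mm, instead of IPIs. */
static void mm_quiesce_lazy_users(struct mm_struct *mm)
{
	/*
	 * Per the reasoning above: once the generation has advanced twice,
	 * every CPU has gone through switch_mm() in the meantime, so no CPU
	 * can still have @mm as its lazy active_mm. After this the final
	 * reference can be dropped and the mm freed without sending IPIs.
	 */
	mm_gen_wait();
}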