On Tue, Sep 03, 2019 at 04:41:47PM -0400, Mathieu Desnoyers wrote:
> As discussed on IRC, one alternative for the multi-threaded case would
> be to grab the task list lock and iterate over all existing tasks to
> set the bit, so we don't have to touch an extra cache line from the
> scheduler.
> 
> In order to keep the common single-threaded library constructor case
> fast, we simply set the bit in the current task struct, and rely on
> clone() propagating the flag to child threads (which it already
> does).

Something like the completely untested thing below.

And yes, that do_each_thread/while_each_thread thing is unfortunate and
yuck too, but supposedly that's a slow path not many people are expected
to hit anyway, right?

---
 include/linux/sched.h     |  4 ++++
 kernel/sched/membarrier.c | 20 +++++++++++++++++---
 2 files changed, 21 insertions(+), 3 deletions(-)

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 33b310a826d7..dbafafb8ef40 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1136,6 +1136,10 @@ struct task_struct {
        unsigned long                   numa_pages_migrated;
 #endif /* CONFIG_NUMA_BALANCING */
 
+#ifdef CONFIG_MEMBARRIER
+       atomic_t                        membarrier_state;
+#endif
+
 #ifdef CONFIG_RSEQ
        struct rseq __user *rseq;
        u32 rseq_sig;
diff --git a/kernel/sched/membarrier.c b/kernel/sched/membarrier.c
index aa8d75804108..961f6affbf38 100644
--- a/kernel/sched/membarrier.c
+++ b/kernel/sched/membarrier.c
@@ -72,8 +72,8 @@ static int membarrier_global_expedited(void)
 
                rcu_read_lock();
                p = task_rcu_dereference(&cpu_rq(cpu)->curr);
-               if (p && p->mm && (atomic_read(&p->mm->membarrier_state) &
-                                  MEMBARRIER_STATE_GLOBAL_EXPEDITED)) {
+               if (p && (atomic_read(&p->membarrier_state) &
+                         MEMBARRIER_STATE_GLOBAL_EXPEDITED)) {
                        if (!fallback)
                                __cpumask_set_cpu(cpu, tmpmask);
                        else
@@ -185,7 +185,9 @@ static int membarrier_register_global_expedited(void)
        if (atomic_read(&mm->membarrier_state) &
            MEMBARRIER_STATE_GLOBAL_EXPEDITED_READY)
                return 0;
+
        atomic_or(MEMBARRIER_STATE_GLOBAL_EXPEDITED, &mm->membarrier_state);
+       atomic_or(MEMBARRIER_STATE_GLOBAL_EXPEDITED, &p->membarrier_state);
        if (atomic_read(&mm->mm_users) == 1 && get_nr_threads(p) == 1) {
                /*
                 * For single mm user, single threaded process, we can
@@ -196,6 +198,17 @@ static int membarrier_register_global_expedited(void)
                 */
                smp_mb();
        } else {
+               struct task_struct *g, *t;
+
+               read_lock(&tasklist_lock);
+               do_each_thread(g, t) {
+                       if (t->mm == mm) {
+                               atomic_or(MEMBARRIER_STATE_GLOBAL_EXPEDITED,
+                                         &t->membarrier_state);
+                       }
+               } while_each_thread(g, t);
+               read_unlock(&tasklist_lock);
+
                /*
                 * For multi-mm user threads, we need to ensure all
                 * future scheduler executions will observe the new
@@ -229,9 +242,10 @@ static int membarrier_register_private_expedited(int flags)
        if (atomic_read(&mm->membarrier_state) & state)
                return 0;
        atomic_or(MEMBARRIER_STATE_PRIVATE_EXPEDITED, &mm->membarrier_state);
-       if (flags & MEMBARRIER_FLAG_SYNC_CORE)
+       if (flags & MEMBARRIER_FLAG_SYNC_CORE) {
                atomic_or(MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE,
                          &mm->membarrier_state);
+       }
        if (!(atomic_read(&mm->mm_users) == 1 && get_nr_threads(p) == 1)) {
                /*
                 * Ensure all future scheduler executions will observe the

Reply via email to