The membarrier_state field is located within the mm_struct, which
is not guaranteed to exist when accessed from the membarrier system
call's runqueue-lock-free iteration over the runqueues.
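
For illustration, a condensed sketch (not part of this patch) of the
racy reader-side pattern, as in membarrier_global_expedited():

	rcu_read_lock();
	p = task_rcu_dereference(&cpu_rq(cpu)->curr);
	/*
	 * The RCU read-side critical section protects the task_struct,
	 * but not the mm_struct it points to, which may be freed
	 * concurrently: the p->mm dereference below can therefore be a
	 * use-after-free.
	 */
	if (p && p->mm && (atomic_read(&p->mm->membarrier_state) &
			   MEMBARRIER_STATE_GLOBAL_EXPEDITED))
		__cpumask_set_cpu(cpu, tmpmask);
	rcu_read_unlock();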

Copy the membarrier_state from the mm_struct into the next task_struct
in the scheduler's prepare_task_switch(). Upon membarrier registration,
iterate over each runqueue and copy the membarrier_state from the
mm_struct into every currently running task_struct which has the same
mm as the current task.
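
With this change, the reader side only dereferences the RCU-protected
task_struct, e.g. (condensed sketch, see the diff below for the actual
change):

	rcu_read_lock();
	p = task_rcu_dereference(&cpu_rq(cpu)->curr);
	if (p && (atomic_read(&p->membarrier_state) &
		  MEMBARRIER_STATE_GLOBAL_EXPEDITED))
		__cpumask_set_cpu(cpu, tmpmask);
	rcu_read_unlock();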

Suggested-by: Linus Torvalds <[email protected]>
Signed-off-by: Mathieu Desnoyers <[email protected]>
Cc: "Paul E. McKenney" <[email protected]>
Cc: Peter Zijlstra <[email protected]>
Cc: Oleg Nesterov <[email protected]>
Cc: "Eric W. Biederman" <[email protected]>
Cc: Linus Torvalds <[email protected]>
Cc: Russell King - ARM Linux admin <[email protected]>
Cc: Chris Metcalf <[email protected]>
Cc: Christoph Lameter <[email protected]>
Cc: Kirill Tkhai <[email protected]>
Cc: Mike Galbraith <[email protected]>
Cc: Thomas Gleixner <[email protected]>
Cc: Ingo Molnar <[email protected]>
---
 include/linux/sched.h     |  4 ++
 include/linux/sched/mm.h  | 13 +++++++
 kernel/sched/core.c       |  1 +
 kernel/sched/membarrier.c | 78 ++++++++++++++++++++++++++++-----------
 4 files changed, 74 insertions(+), 22 deletions(-)

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 9f51932bd543..e24d52a4c37a 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1130,6 +1130,10 @@ struct task_struct {
        unsigned long                   numa_pages_migrated;
 #endif /* CONFIG_NUMA_BALANCING */
 
+#ifdef CONFIG_MEMBARRIER
+       atomic_t membarrier_state;
+#endif
+
 #ifdef CONFIG_RSEQ
        struct rseq __user *rseq;
        u32 rseq_sig;
diff --git a/include/linux/sched/mm.h b/include/linux/sched/mm.h
index 4a7944078cc3..3577cd7b3dbb 100644
--- a/include/linux/sched/mm.h
+++ b/include/linux/sched/mm.h
@@ -371,7 +371,17 @@ static inline void membarrier_mm_sync_core_before_usermode(struct mm_struct *mm)
 static inline void membarrier_execve(struct task_struct *t)
 {
        atomic_set(&t->mm->membarrier_state, 0);
+       atomic_set(&t->membarrier_state, 0);
 }
+
+static inline void membarrier_prepare_task_switch(struct task_struct *t)
+{
+       if (!t->mm)
+               return;
+       atomic_set(&t->membarrier_state,
+                  atomic_read(&t->mm->membarrier_state));
+}
+
 #else
 #ifdef CONFIG_ARCH_HAS_MEMBARRIER_CALLBACKS
 static inline void membarrier_arch_switch_mm(struct mm_struct *prev,
@@ -386,6 +396,9 @@ static inline void membarrier_execve(struct task_struct *t)
 static inline void membarrier_mm_sync_core_before_usermode(struct mm_struct *mm)
 {
 }
+static inline void membarrier_prepare_task_switch(struct task_struct *t)
+{
+}
 #endif
 
 #endif /* _LINUX_SCHED_MM_H */
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 010d578118d6..8d4f1f20db15 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -3038,6 +3038,7 @@ prepare_task_switch(struct rq *rq, struct task_struct *prev,
        perf_event_task_sched_out(prev, next);
        rseq_preempt(prev);
        fire_sched_out_preempt_notifiers(prev, next);
+       membarrier_prepare_task_switch(next);
        prepare_task(next);
        prepare_arch_switch(next);
 }
diff --git a/kernel/sched/membarrier.c b/kernel/sched/membarrier.c
index aa8d75804108..d564ca1b5d69 100644
--- a/kernel/sched/membarrier.c
+++ b/kernel/sched/membarrier.c
@@ -72,8 +72,8 @@ static int membarrier_global_expedited(void)
 
                rcu_read_lock();
                p = task_rcu_dereference(&cpu_rq(cpu)->curr);
-               if (p && p->mm && (atomic_read(&p->mm->membarrier_state) &
-                                  MEMBARRIER_STATE_GLOBAL_EXPEDITED)) {
+               if (p && (atomic_read(&p->membarrier_state) &
+                         MEMBARRIER_STATE_GLOBAL_EXPEDITED)) {
                        if (!fallback)
                                __cpumask_set_cpu(cpu, tmpmask);
                        else
@@ -177,6 +177,46 @@ static int membarrier_private_expedited(int flags)
        return 0;
 }
 
+static void sync_other_runqueues_membarrier_state(struct mm_struct *mm)
+{
+       int cpu;
+
+       if (num_online_cpus() == 1)
+               return;
+
+       /*
+        * For an mm with multiple users, we need to ensure all future
+        * scheduler executions will observe @mm's new membarrier state.
+        */
+       synchronize_rcu();
+
+       /*
+        * For each cpu runqueue (except the current cpu), if the task's mm
+        * matches @mm, ensure that all bits set in @mm's membarrier state
+        * are also set in the runqueue's current task's membarrier state.
+        *
+        * Use an atomic_or() to set the task membarrier state, thus ensuring
+        * this operation is always additive. This is important in case many
+        * different membarrier registration commands are invoked concurrently,
+        * given that they do not hold the mmap_sem.
+        */
+       cpus_read_lock();
+       for_each_online_cpu(cpu) {
+               struct task_struct *p;
+
+               /* Skip current CPU. */
+               if (cpu == raw_smp_processor_id())
+                       continue;
+               rcu_read_lock();
+               p = task_rcu_dereference(&cpu_rq(cpu)->curr);
+               if (p && p->mm == mm)
+                       atomic_or(atomic_read(&mm->membarrier_state),
+                                 &p->membarrier_state);
+               rcu_read_unlock();
+       }
+       cpus_read_unlock();
+}
+
 static int membarrier_register_global_expedited(void)
 {
        struct task_struct *p = current;
@@ -186,6 +226,8 @@ static int membarrier_register_global_expedited(void)
            MEMBARRIER_STATE_GLOBAL_EXPEDITED_READY)
                return 0;
        atomic_or(MEMBARRIER_STATE_GLOBAL_EXPEDITED, &mm->membarrier_state);
+       atomic_or(MEMBARRIER_STATE_GLOBAL_EXPEDITED, &p->membarrier_state);
+
        if (atomic_read(&mm->mm_users) == 1 && get_nr_threads(p) == 1) {
                /*
                 * For single mm user, single threaded process, we can
@@ -196,12 +238,7 @@ static int membarrier_register_global_expedited(void)
                 */
                smp_mb();
        } else {
-               /*
-                * For multi-mm user threads, we need to ensure all
-                * future scheduler executions will observe the new
-                * thread flag state for this mm.
-                */
-               synchronize_rcu();
+               sync_other_runqueues_membarrier_state(mm);
        }
        atomic_or(MEMBARRIER_STATE_GLOBAL_EXPEDITED_READY,
                  &mm->membarrier_state);
@@ -213,12 +250,14 @@ static int membarrier_register_private_expedited(int flags)
 {
        struct task_struct *p = current;
        struct mm_struct *mm = p->mm;
-       int state = MEMBARRIER_STATE_PRIVATE_EXPEDITED_READY;
+       int ready_state = MEMBARRIER_STATE_PRIVATE_EXPEDITED_READY,
+           set_state = MEMBARRIER_STATE_PRIVATE_EXPEDITED;
 
        if (flags & MEMBARRIER_FLAG_SYNC_CORE) {
                if (!IS_ENABLED(CONFIG_ARCH_HAS_MEMBARRIER_SYNC_CORE))
                        return -EINVAL;
-               state = MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE_READY;
+               ready_state =
+                       MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE_READY;
        }
 
        /*
@@ -226,20 +265,15 @@ static int membarrier_register_private_expedited(int flags)
         * groups, which use the same mm. (CLONE_VM but not
         * CLONE_THREAD).
         */
-       if (atomic_read(&mm->membarrier_state) & state)
+       if ((atomic_read(&mm->membarrier_state) & ready_state))
                return 0;
-       atomic_or(MEMBARRIER_STATE_PRIVATE_EXPEDITED, &mm->membarrier_state);
        if (flags & MEMBARRIER_FLAG_SYNC_CORE)
-               atomic_or(MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE,
-                         &mm->membarrier_state);
-       if (!(atomic_read(&mm->mm_users) == 1 && get_nr_threads(p) == 1)) {
-               /*
-                * Ensure all future scheduler executions will observe the
-                * new thread flag state for this process.
-                */
-               synchronize_rcu();
-       }
-       atomic_or(state, &mm->membarrier_state);
+               set_state |= MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE;
+       atomic_or(set_state, &mm->membarrier_state);
+       atomic_or(set_state, &p->membarrier_state);
+       if (!(atomic_read(&mm->mm_users) == 1 && get_nr_threads(p) == 1))
+               sync_other_runqueues_membarrier_state(mm);
+       atomic_or(ready_state, &mm->membarrier_state);
 
        return 0;
 }
-- 
2.17.1
