membarrier: Use per-CPU mutexes for targeted commands

Aniket Gattani Thu, 09 Apr 2026 14:26:05 -0700

Currently, the membarrier system call uses a single global mutex
(`membarrier_ipi_mutex`) to serialize expedited commands. This causes
significant contention on large systems when multiple threads invoke
membarrier concurrently, even if they target different CPUs.


This contention becomes critical when combined with CFS bandwidth
throttling/unthrottling, during which interrupts can be disabled for
relatively long periods on target CPUs. If membarrier is waiting for a
response from such a CPU, it holds the global mutex, blocking all other
membarrier calls on the system. This cascade effect can lead to hard
lockups when thousands of threads stall waiting for the mutex.

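Introduce per-CPU mutexes so that a
`MEMBARRIER_CMD_PRIVATE_EXPEDITED_RSEQ` call targeting a specific CPU
serializes only against other calls targeting the same CPU. Broadcast
commands and commands without a specific CPU target continue to use the
global mutex. Because the mutex is now selected by CPU number, an
out-of-range or impossible `cpu_id` is handled with an early `return 0`
before any lock is taken.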

This prevents the cascade lockup scenario for CPU-targeted commands. As
measured by the stress test introduced in the subsequent patch, on an AMD
Turin machine with 384 CPUs (2 NUMA nodes with SMT=2), this optimization
yields 200x higher throughput.

Signed-off-by: Aniket Gattani <[email protected]>
---
 kernel/sched/membarrier.c | 48 +++++++++++++++++++++++++++------------
 1 file changed, 34 insertions(+), 14 deletions(-)

diff --git a/kernel/sched/membarrier.c b/kernel/sched/membarrier.c
index 623445603725..dc916e6541d2 100644
--- a/kernel/sched/membarrier.c
+++ b/kernel/sched/membarrier.c
@@ -165,7 +165,26 @@
        | MEMBARRIER_CMD_GET_REGISTRATIONS)
 
 static DEFINE_MUTEX(membarrier_ipi_mutex);
-#define SERIALIZE_IPI() guard(mutex)(&membarrier_ipi_mutex)
+static DEFINE_PER_CPU(struct mutex, membarrier_cpu_mutexes);
+
+static inline struct mutex *membarrier_get_mutex(int cpu)
+{
+       if (cpu >= 0)
+               return &per_cpu(membarrier_cpu_mutexes, cpu);
+       return &membarrier_ipi_mutex;
+}
+
+#define SERIALIZE_IPI(cpu_id) guard(mutex)(membarrier_get_mutex(cpu_id))
+
+static int __init membarrier_init(void)
+{
+       int i;
+
+       for_each_possible_cpu(i)
+               mutex_init(&per_cpu(membarrier_cpu_mutexes, i));
+       return 0;
+}
+core_initcall(membarrier_init);
 
 static void ipi_mb(void *info)
 {
@@ -264,7 +283,7 @@ static int membarrier_global_expedited(void)
        if (!zalloc_cpumask_var(&tmpmask, GFP_KERNEL))
                return -ENOMEM;
 
-       SERIALIZE_IPI();
+       SERIALIZE_IPI(-1);
        cpus_read_lock();
        rcu_read_lock();
        for_each_online_cpu(cpu) {
@@ -358,14 +377,19 @@ static int membarrier_private_expedited(int flags, int cpu_id)
        if (cpu_id < 0 && !zalloc_cpumask_var(&tmpmask, GFP_KERNEL))
                return -ENOMEM;
 
-       SERIALIZE_IPI();
+       if (cpu_id >= 0 && (cpu_id >= nr_cpu_ids || !cpu_possible(cpu_id)))
+               return 0;
+
+       SERIALIZE_IPI(cpu_id);
+
        cpus_read_lock();
 
        if (cpu_id >= 0) {
                struct task_struct *p;
 
-               if (cpu_id >= nr_cpu_ids || !cpu_online(cpu_id))
+               if (!cpu_online(cpu_id))
                        goto out;
+
                rcu_read_lock();
                p = rcu_dereference(cpu_rq(cpu_id)->curr);
                if (!p || p->mm != mm) {
@@ -373,6 +397,11 @@ static int membarrier_private_expedited(int flags, int cpu_id)
                        goto out;
                }
                rcu_read_unlock();
+               /*
+                * smp_call_function_single() will call ipi_func() if cpu_id
+                * is the calling CPU.
+                */
+               smp_call_function_single(cpu_id, ipi_func, NULL, 1);
        } else {
                int cpu;
 
@@ -385,15 +414,6 @@ static int membarrier_private_expedited(int flags, int cpu_id)
                                __cpumask_set_cpu(cpu, tmpmask);
                }
                rcu_read_unlock();
-       }
-
-       if (cpu_id >= 0) {
-               /*
-                * smp_call_function_single() will call ipi_func() if cpu_id
-                * is the calling CPU.
-                */
-               smp_call_function_single(cpu_id, ipi_func, NULL, 1);
-       } else {
                /*
                 * For regular membarrier, we can save a few cycles by
                 * skipping the current cpu -- we're about to do smp_mb()
@@ -472,7 +492,7 @@ static int sync_runqueues_membarrier_state(struct mm_struct *mm)
         * between threads which are users of @mm has its membarrier state
         * updated.
         */
-       SERIALIZE_IPI();
+       SERIALIZE_IPI(-1);
        cpus_read_lock();
        rcu_read_lock();
        for_each_online_cpu(cpu) {
-- 
2.54.0.rc0.605.g598a273b03-goog

[PATCH 1/2] sched/membarrier: Use per-CPU mutexes for targeted commands

Reply via email to