On Sun, Mar 15, 2015 at 03:24:19PM -0400, Mathieu Desnoyers wrote:

TL;DR
> --- a/arch/x86/include/asm/mmu_context.h
> +++ b/arch/x86/include/asm/mmu_context.h
> @@ -45,6 +45,16 @@ static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next,
>  #endif
>  		cpumask_set_cpu(cpu, mm_cpumask(next));
> 
> +		/*
> +		 * smp_mb() between mm_cpumask set and following memory
> +		 * accesses to user-space addresses is required by
> +		 * sys_membarrier(). A smp_mb() is also needed between
> +		 * prior memory accesses and mm_cpumask clear. This
> +		 * ensures that all user-space address memory accesses
> +		 * performed by the current thread are in program order
> +		 * when the mm_cpumask is set. Implied by load_cr3.
> +		 */
> +
>  		/* Re-load page tables */
>  		load_cr3(next->pgd);
>  		trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL);
> @@ -82,6 +92,13 @@ static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next,
>  			 * We were in lazy tlb mode and leave_mm disabled
>  			 * tlb flush IPI delivery. We must reload CR3
>  			 * to make sure to use no freed page tables.
> +			 *
> +			 * smp_mb() between mm_cpumask set and memory accesses
> +			 * to user-space addresses is required by
> +			 * sys_membarrier(). This ensures that all user-space
> +			 * address memory accesses performed by the current
> +			 * thread are in program order when the mm_cpumask is
> +			 * set. Implied by load_cr3.
>  			 */
>  			load_cr3(next->pgd);
>  			trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL);

In both cases the cpumask_set_cpu() will also imply a MB.

> +enum {
> +	/*
> +	 * Private flag set: only synchronize across a single process. If this
> +	 * flag is not set, it means "shared": synchronize across multiple
> +	 * processes. The shared mode is useful for shared memory mappings
> +	 * across processes.
> +	 */
> +	MEMBARRIER_PRIVATE_FLAG = (1 << 0),
> +
> +	/*
> +	 * Expedited flag set: adds some overhead, fast execution (few
> +	 * microseconds). If this flag is not set, it means "delayed": low
> +	 * overhead, but slow execution (few milliseconds).
> +	 */
> +	MEMBARRIER_EXPEDITED_FLAG = (1 << 1),

I suppose this is an unprivileged syscall; so what do we do about:

	for (;;)
		sys_membar(EXPEDITED);

Which would spray the entire system with IPIs at breakneck speed.

> +static void membarrier_ipi(void *unused)
> +{
> +	/* Order memory accesses with respects to sys_membarrier caller. */
> +	smp_mb();
> +}
> +
> +/*
> + * Handle out-of-memory by sending per-cpu IPIs instead.
> + */
> +static void membarrier_fallback(void)
> +{
> +	struct mm_struct *mm;
> +	int cpu;
> +
> +	for_each_cpu(cpu, mm_cpumask(current->mm)) {
> +		raw_spin_lock_irq(&cpu_rq(cpu)->lock);
> +		mm = cpu_curr(cpu)->mm;
> +		raw_spin_unlock_irq(&cpu_rq(cpu)->lock);
> +		if (current->mm == mm)
> +			smp_call_function_single(cpu, membarrier_ipi, NULL, 1);
> +	}
> +}

> +static void membarrier_expedited(void)
> +{
> +	struct mm_struct *mm;
> +	cpumask_var_t tmpmask;
> +	int cpu;
> +
> +	/*
> +	 * Memory barrier on the caller thread between previous memory accesses
> +	 * to user-space addresses and sending memory-barrier IPIs. Orders all
> +	 * user-space address memory accesses prior to sys_membarrier() before
> +	 * mm_cpumask read and membarrier_ipi executions. This barrier is paired
> +	 * with memory barriers in:
> +	 * - membarrier_ipi() (for each running threads of the current process)
> +	 * - switch_mm() (ordering scheduler mm_cpumask update wrt memory
> +	 *   accesses to user-space addresses)
> +	 * - Each CPU ->mm update performed with rq lock held by the scheduler.
> +	 *   A memory barrier is issued each time ->mm is changed while the rq
> +	 *   lock is held.
> +	 */
> +	smp_mb();
> +	if (!alloc_cpumask_var(&tmpmask, GFP_NOWAIT)) {
> +		membarrier_fallback();
> +		goto out;
> +	}
> +	cpumask_copy(tmpmask, mm_cpumask(current->mm));
> +	preempt_disable();
> +	cpumask_clear_cpu(smp_processor_id(), tmpmask);
> +	for_each_cpu(cpu, tmpmask) {
> +		raw_spin_lock_irq(&cpu_rq(cpu)->lock);
> +		mm = cpu_curr(cpu)->mm;
> +		raw_spin_unlock_irq(&cpu_rq(cpu)->lock);
> +		if (current->mm != mm)
> +			cpumask_clear_cpu(cpu, tmpmask);
> +	}
> +	smp_call_function_many(tmpmask, membarrier_ipi, NULL, 1);
> +	preempt_enable();
> +	free_cpumask_var(tmpmask);
> +out:
> +	/*
> +	 * Memory barrier on the caller thread between sending & waiting for
> +	 * memory-barrier IPIs and following memory accesses to user-space
> +	 * addresses. Orders mm_cpumask read and membarrier_ipi executions
> +	 * before all user-space address memory accesses following
> +	 * sys_membarrier(). This barrier is paired with memory barriers in:
> +	 * - membarrier_ipi() (for each running threads of the current process)
> +	 * - switch_mm() (ordering scheduler mm_cpumask update wrt memory
> +	 *   accesses to user-space addresses)
> +	 * - Each CPU ->mm update performed with rq lock held by the scheduler.
> +	 *   A memory barrier is issued each time ->mm is changed while the rq
> +	 *   lock is held.
> +	 */
> +	smp_mb();
> +}

Did you just write:

bool membar_cpu_is_mm(int cpu, void *info)
{
	struct mm_struct *mm = info;
	struct rq *rq = cpu_rq(cpu);
	bool ret;

	raw_spin_lock_irq(&rq->lock);
	ret = rq->curr->mm == mm;
	raw_spin_unlock_irq(&rq->lock);

	return ret;
}

	on_each_cpu_cond(membar_cpu_is_mm, membar_ipi, mm, 1, GFP_NOWAIT);

On which; I absolutely hate that rq->lock thing in there. What is
'wrong' with doing a lockless compare there? Other than not actually
being able to deref rq->curr of course, but we need to fix that anyhow.
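
[For reference, a lockless check along the lines being asked about might
look like the sketch below. The function name is made up, it is not from
the posted patch, and it is only safe once dereferencing a remote
rq->curr without the lock is made safe (the "fix that anyhow" part),
e.g. by guaranteeing task_struct lifetime.]

/*
 * Hypothetical lockless variant of the cond_func above -- sketch only.
 * The read of the remote CPU's ->mm is deliberately racy: a stale value
 * means at worst a spurious IPI, or a CPU that is concurrently switching
 * away from mm, which the barrier pairing in switch_mm() has to cover
 * anyway. NOT safe as-is: rq->curr may be freed under us until remote
 * rq->curr dereferences are made safe.
 */
bool membar_cpu_is_mm_lockless(int cpu, void *info)
{
	struct mm_struct *mm = info;
	struct task_struct *p = ACCESS_ONCE(cpu_rq(cpu)->curr);

	return p && ACCESS_ONCE(p->mm) == mm;
}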