On Sun, Mar 15, 2015 at 03:24:19PM -0400, Mathieu Desnoyers wrote:

TL;DR

> --- a/arch/x86/include/asm/mmu_context.h
> +++ b/arch/x86/include/asm/mmu_context.h
> @@ -45,6 +45,16 @@ static inline void switch_mm(struct mm_struct *prev, 
> struct mm_struct *next,
>  #endif
>               cpumask_set_cpu(cpu, mm_cpumask(next));
>  
> +             /*
> +              * smp_mb() between mm_cpumask set and following memory
> +              * accesses to user-space addresses is required by
> +              * sys_membarrier(). A smp_mb() is also needed between
> +              * prior memory accesses and mm_cpumask clear. This
> +              * ensures that all user-space address memory accesses
> +              * performed by the current thread are in program order
> +              * when the mm_cpumask is set. Implied by load_cr3.
> +              */
> +
>               /* Re-load page tables */
>               load_cr3(next->pgd);
>               trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL);
> @@ -82,6 +92,13 @@ static inline void switch_mm(struct mm_struct *prev, 
> struct mm_struct *next,
>                        * We were in lazy tlb mode and leave_mm disabled
>                        * tlb flush IPI delivery. We must reload CR3
>                        * to make sure to use no freed page tables.
> +                      *
> +                      * smp_mb() between mm_cpumask set and memory accesses
> +                      * to user-space addresses is required by
> +                      * sys_membarrier(). This ensures that all user-space
> +                      * address memory accesses performed by the current
> +                      * thread are in program order when the mm_cpumask is
> +                      * set. Implied by load_cr3.
>                        */
>                       load_cr3(next->pgd);
>                       trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH, 
> TLB_FLUSH_ALL);


In both cases the cpumask_set_cpu() will also imply a MB.

> +enum {
> +     /*
> +      * Private flag set: only synchronize across a single process. If this
> +      * flag is not set, it means "shared": synchronize across multiple
> +      * processes.  The shared mode is useful for shared memory mappings
> +      * across processes.
> +      */
> +     MEMBARRIER_PRIVATE_FLAG = (1 << 0),
> +
> +     /*
> +      * Expedited flag set: adds some overhead, fast execution (few
> +      * microseconds).  If this flag is not set, it means "delayed": low
> +      * overhead, but slow execution (few milliseconds).
> +      */
> +     MEMBARRIER_EXPEDITED_FLAG = (1 << 1),


I suppose this is an unprivileged syscall; so what do we do about:

        for (;;)
                sys_membar(EXPEDITED);

Which would spray the entire system with IPIs at break neck speed.

> +static void membarrier_ipi(void *unused)
> +{
> +     /* Order memory accesses with respects to sys_membarrier caller. */
> +     smp_mb();
> +}
> +
> +/*
> + * Handle out-of-memory by sending per-cpu IPIs instead.
> + */
> +static void membarrier_fallback(void)
> +{
> +     struct mm_struct *mm;
> +     int cpu;
> +
> +     for_each_cpu(cpu, mm_cpumask(current->mm)) {
> +             raw_spin_lock_irq(&cpu_rq(cpu)->lock);
> +             mm = cpu_curr(cpu)->mm;
> +             raw_spin_unlock_irq(&cpu_rq(cpu)->lock);
> +             if (current->mm == mm)
> +                     smp_call_function_single(cpu, membarrier_ipi, NULL, 1);
> +     }
> +}

> +static void membarrier_expedited(void)
> +{
> +     struct mm_struct *mm;
> +     cpumask_var_t tmpmask;
> +     int cpu;
> +
> +     /*
> +      * Memory barrier on the caller thread between previous memory accesses
> +      * to user-space addresses and sending memory-barrier IPIs. Orders all
> +      * user-space address memory accesses prior to sys_membarrier() before
> +      * mm_cpumask read and membarrier_ipi executions. This barrier is paired
> +      * with memory barriers in:
> +      * - membarrier_ipi() (for each running threads of the current process)
> +      * - switch_mm() (ordering scheduler mm_cpumask update wrt memory
> +      *                accesses to user-space addresses)
> +      * - Each CPU ->mm update performed with rq lock held by the scheduler.
> +      *   A memory barrier is issued each time ->mm is changed while the rq
> +      *   lock is held.
> +      */
> +     smp_mb();
> +     if (!alloc_cpumask_var(&tmpmask, GFP_NOWAIT)) {
> +             membarrier_fallback();
> +             goto out;
> +     }
> +     cpumask_copy(tmpmask, mm_cpumask(current->mm));
> +     preempt_disable();
> +     cpumask_clear_cpu(smp_processor_id(), tmpmask);
> +     for_each_cpu(cpu, tmpmask) {
> +             raw_spin_lock_irq(&cpu_rq(cpu)->lock);
> +             mm = cpu_curr(cpu)->mm;
> +             raw_spin_unlock_irq(&cpu_rq(cpu)->lock);
> +             if (current->mm != mm)
> +                     cpumask_clear_cpu(cpu, tmpmask);
> +     }
> +     smp_call_function_many(tmpmask, membarrier_ipi, NULL, 1);
> +     preempt_enable();
> +     free_cpumask_var(tmpmask);
> +out:
> +     /*
> +      * Memory barrier on the caller thread between sending & waiting for
> +      * memory-barrier IPIs and following memory accesses to user-space
> +      * addresses. Orders mm_cpumask read and membarrier_ipi executions
> +      * before all user-space address memory accesses following
> +      * sys_membarrier(). This barrier is paired with memory barriers in:
> +      * - membarrier_ipi() (for each running threads of the current process)
> +      * - switch_mm() (ordering scheduler mm_cpumask update wrt memory
> +      *                accesses to user-space addresses)
> +      * - Each CPU ->mm update performed with rq lock held by the scheduler.
> +      *   A memory barrier is issued each time ->mm is changed while the rq
> +      *   lock is held.
> +      */
> +     smp_mb();
> +}

Did you just write:

bool membar_cpu_is_mm(int cpu, void *info)
{
        struct mm_struct *mm = info;
        struct rq *rq = cpu_rq(cpu);
        bool ret;

        raw_spin_lock_irq(&rq->lock);
        ret = rq->curr->mm == mm;
        raw_spin_unlock_irq(&rq->lock);

        return ret;
}

        on_each_cpu_cond(membar_cpu_is_mm, membar_ipi, mm, 1, GFP_NOWAIT);



On which; I absolutely hate that rq->lock thing in there. What is
'wrong' with doing a lockless compare there? Other than not actually
being able to deref rq->curr of course, but we need to fix that anyhow.
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Reply via email to