Now using cmpxchg and closing a race, thanks to Peter Anvin. ---8<---
Subject: x86,mm,sched: make lazy TLB mode even lazier Lazy TLB mode can result in an idle CPU being woken up for a TLB flush, when all it really needed to do was flush %cr3 before the next context switch. This is mostly fine on bare metal, though sub-optimal from a power saving point of view, and deeper C states could make TLB flushes take a little longer than desired. On virtual machines, the pain can be much worse, especially if a currently non-running VCPU is woken up for a TLB invalidation IPI, on a CPU that is busy running another task. It could take a while before that IPI is handled, leading to performance issues. This patch deals with the issue by introducing a third tlb state, TLBSTATE_FLUSH, which causes %cr3 to be flushed at the next context switch. If there was a race transitioning a CPU from TLBSTATE_LAZY to TLBSTATE_FLUSH, an invalidation IPI will be sent. Nothing is done for a CPU that is already in TLBSTATE_FLUSH mode. This patch is totally untested, because I am at a conference right now, and Benjamin has the test case :) Benjamin, does this help your issue? 
Signed-off-by: Rik van Riel <r...@redhat.com> Reported-by: Benjamin Serebrin <sereb...@google.com> --- arch/x86/include/asm/tlbflush.h | 1 + arch/x86/mm/tlb.c | 41 ++++++++++++++++++++++++++++++++++++++--- 2 files changed, 39 insertions(+), 3 deletions(-) diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h index 4e5be94e079a..5ae8e4b174f8 100644 --- a/arch/x86/include/asm/tlbflush.h +++ b/arch/x86/include/asm/tlbflush.h @@ -310,6 +310,7 @@ void native_flush_tlb_others(const struct cpumask *cpumask, #define TLBSTATE_OK 1 #define TLBSTATE_LAZY 2 +#define TLBSTATE_FLUSH 3 static inline void reset_lazy_tlbstate(void) { diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c index 5643fd0b1a7d..8dcc0947681c 100644 --- a/arch/x86/mm/tlb.c +++ b/arch/x86/mm/tlb.c @@ -140,10 +140,12 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next, } #ifdef CONFIG_SMP else { + int oldstate = this_cpu_read(cpu_tlbstate.state); this_cpu_write(cpu_tlbstate.state, TLBSTATE_OK); BUG_ON(this_cpu_read(cpu_tlbstate.active_mm) != next); - if (!cpumask_test_cpu(cpu, mm_cpumask(next))) { + if (oldstate == TLBSTATE_FLUSH || + !cpumask_test_cpu(cpu, mm_cpumask(next))) { /* * On established mms, the mm_cpumask is only changed * from irq context, from ptep_clear_flush() while in @@ -242,11 +244,33 @@ static void flush_tlb_func(void *info) } +/* + * This function moves a CPU from TLBSTATE_LAZY to TLBSTATE_FLUSH, which + * will force it to flush %cr3 at the next context switch, effectively + * doing a delayed TLB flush for a CPU in lazy TLB mode. + * Do nothing if the TLB state is already set to TLBSTATE_FLUSH. + */ +static bool set_lazy_tlbstate_flush(int cpu) { + int *tlbstate = &per_cpu(cpu_tlbstate.state, cpu); + bool skipflush = true; + if (*tlbstate == TLBSTATE_LAZY) { + /* + * If cmpxchg fails, the CPU may have context switched from + * TLBSTATE_LAZY to TLBSTATE_OK. Send a TLB flush IPI. 
+ */ + if (cmpxchg(tlbstate, TLBSTATE_LAZY, TLBSTATE_FLUSH) != + TLBSTATE_LAZY) + skipflush = false; + } + return skipflush; +} + void native_flush_tlb_others(const struct cpumask *cpumask, struct mm_struct *mm, unsigned long start, unsigned long end) { struct flush_tlb_info info; + unsigned int cpu; if (end == 0) end = start + PAGE_SIZE; @@ -262,8 +286,6 @@ void native_flush_tlb_others(const struct cpumask *cpumask, (end - start) >> PAGE_SHIFT); if (is_uv_system()) { - unsigned int cpu; - cpu = smp_processor_id(); cpumask = uv_flush_tlb_others(cpumask, mm, start, end, cpu); if (cpumask) @@ -271,6 +293,19 @@ void native_flush_tlb_others(const struct cpumask *cpumask, &info, 1); return; } + + /* + * Instead of sending IPIs to CPUs in lazy TLB mode, move that + * CPUs TLB state to TLBSTATE_FLUSH, causing the TLB to be flushed + * at the next context switch. + */ + for_each_cpu(cpu, cpumask) { + if (per_cpu(cpu_tlbstate.state, cpu) != TLBSTATE_OK) { + if (set_lazy_tlbstate_flush(cpu)) + cpumask_clear_cpu(cpu, (struct cpumask *)cpumask); + } + } + smp_call_function_many(cpumask, flush_tlb_func, &info, 1); }