On Sat, 27 Aug 2016 16:02:25 -0700 Linus Torvalds <torva...@linux-foundation.org> wrote:
> Yeah, with those small fixes from Ingo, I definitely don't think this
> looks hacky at all. This all seems to be exactly what we should always
> have done.

OK, so I was too tired yesterday to do kernel hacking, and missed yet
another bit (xen_flush_tlb_others). Sigh. Otherwise, the patch is
identical. Looking forward to Ben's test results.

---8<---
Subject: x86,mm,sched: make lazy TLB mode even lazier

Lazy TLB mode can result in an idle CPU being woken up for a TLB flush,
when all it really needed to do was flush %CR3 before the next context
switch.

This is mostly fine on bare metal, though sub-optimal from a power saving
point of view, and deeper C-states could make TLB flushes take a little
longer than desired.

On virtual machines, the pain can be much worse, especially if a currently
non-running VCPU is woken up for a TLB invalidation IPI, on a CPU that is
busy running another task. It could take a while before that IPI is
handled, leading to performance issues.

This patch deals with the issue by introducing a third TLB state,
TLBSTATE_FLUSH, which causes %CR3 to be flushed at the next context
switch. A CPU that transitions from TLBSTATE_LAZY to TLBSTATE_OK during
the attempted transition to TLBSTATE_FLUSH will get a TLB flush IPI, just
like a CPU that was in TLBSTATE_OK to begin with.

Nothing is done for a CPU that is already in TLBSTATE_FLUSH mode.
Signed-off-by: Rik van Riel <r...@redhat.com> Reported-by: Benjamin Serebrin <sereb...@google.com> --- arch/x86/include/asm/paravirt_types.h | 2 +- arch/x86/include/asm/tlbflush.h | 3 +- arch/x86/include/asm/uv/uv.h | 6 ++-- arch/x86/mm/tlb.c | 64 ++++++++++++++++++++++++++++++++--- arch/x86/platform/uv/tlb_uv.c | 2 +- arch/x86/xen/mmu.c | 2 +- 6 files changed, 68 insertions(+), 11 deletions(-) diff --git a/arch/x86/include/asm/paravirt_types.h b/arch/x86/include/asm/paravirt_types.h index 7fa9e7740ba3..b7e695c90c43 100644 --- a/arch/x86/include/asm/paravirt_types.h +++ b/arch/x86/include/asm/paravirt_types.h @@ -225,7 +225,7 @@ struct pv_mmu_ops { void (*flush_tlb_user)(void); void (*flush_tlb_kernel)(void); void (*flush_tlb_single)(unsigned long addr); - void (*flush_tlb_others)(const struct cpumask *cpus, + void (*flush_tlb_others)(struct cpumask *cpus, struct mm_struct *mm, unsigned long start, unsigned long end); diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h index 4e5be94e079a..c3dbacbc49be 100644 --- a/arch/x86/include/asm/tlbflush.h +++ b/arch/x86/include/asm/tlbflush.h @@ -304,12 +304,13 @@ extern void flush_tlb_kernel_range(unsigned long start, unsigned long end); #define flush_tlb() flush_tlb_current_task() -void native_flush_tlb_others(const struct cpumask *cpumask, +void native_flush_tlb_others(struct cpumask *cpumask, struct mm_struct *mm, unsigned long start, unsigned long end); #define TLBSTATE_OK 1 #define TLBSTATE_LAZY 2 +#define TLBSTATE_FLUSH 3 static inline void reset_lazy_tlbstate(void) { diff --git a/arch/x86/include/asm/uv/uv.h b/arch/x86/include/asm/uv/uv.h index 062921ef34e9..7e83cc633ba1 100644 --- a/arch/x86/include/asm/uv/uv.h +++ b/arch/x86/include/asm/uv/uv.h @@ -13,7 +13,7 @@ extern int is_uv_system(void); extern void uv_cpu_init(void); extern void uv_nmi_init(void); extern void uv_system_init(void); -extern const struct cpumask *uv_flush_tlb_others(const struct cpumask *cpumask, +extern struct cpumask 
*uv_flush_tlb_others(struct cpumask *cpumask, struct mm_struct *mm, unsigned long start, unsigned long end, @@ -25,8 +25,8 @@ static inline enum uv_system_type get_uv_system_type(void) { return UV_NONE; } static inline int is_uv_system(void) { return 0; } static inline void uv_cpu_init(void) { } static inline void uv_system_init(void) { } -static inline const struct cpumask * -uv_flush_tlb_others(const struct cpumask *cpumask, struct mm_struct *mm, +static inline struct cpumask * +uv_flush_tlb_others(struct cpumask *cpumask, struct mm_struct *mm, unsigned long start, unsigned long end, unsigned int cpu) { return cpumask; } diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c index 5643fd0b1a7d..634248b38db9 100644 --- a/arch/x86/mm/tlb.c +++ b/arch/x86/mm/tlb.c @@ -140,10 +140,24 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next, } #ifdef CONFIG_SMP else { + int *tlbstate = this_cpu_ptr(&cpu_tlbstate.state); + int oldstate = *tlbstate; + + if (unlikely(oldstate == TLBSTATE_LAZY)) { + /* + * The TLB flush code (lazy_tlb_can_skip_flush) can + * move the TLB state to TLBSTATE_FLUSH concurrently + * with a context switch. Using cmpxchg here will catch + * that transition, causing a TLB flush below. + */ + oldstate = cmpxchg(tlbstate, oldstate, TLBSTATE_OK); + } this_cpu_write(cpu_tlbstate.state, TLBSTATE_OK); + BUG_ON(this_cpu_read(cpu_tlbstate.active_mm) != next); - if (!cpumask_test_cpu(cpu, mm_cpumask(next))) { + if (oldstate == TLBSTATE_FLUSH || + !cpumask_test_cpu(cpu, mm_cpumask(next))) { /* * On established mms, the mm_cpumask is only changed * from irq context, from ptep_clear_flush() while in @@ -242,11 +256,44 @@ static void flush_tlb_func(void *info) } -void native_flush_tlb_others(const struct cpumask *cpumask, +/* + * Determine whether a CPU's TLB needs to be flushed now, or whether the + * flush can be delayed until the next context switch, by changing the + * tlbstate from TLBSTATE_LAZY to TLBSTATE_FLUSH. 
+ */ +static bool lazy_tlb_can_skip_flush(int cpu) +{ + int *tlbstate = &per_cpu(cpu_tlbstate.state, cpu); + int old; + + switch (*tlbstate) { + case TLBSTATE_FLUSH: + /* The TLB will be flushed on the next context switch. */ + return true; + case TLBSTATE_LAZY: + /* + * The CPU is in TLBSTATE_LAZY, which could context switch back + * to TLBSTATE_OK, re-using the old TLB state without a flush. + * If that happened, send a TLB flush IPI. + * + * Otherwise, the state is now TLBSTATE_FLUSH, and TLB will + * be flushed at the next context switch. Skip the IPI. + */ + old = cmpxchg(tlbstate, TLBSTATE_LAZY, TLBSTATE_FLUSH); + return old != TLBSTATE_OK; + case TLBSTATE_OK: + default: + /* A task on the CPU is actively using the mm. Flush the TLB. */ + return false; + } +} + +void native_flush_tlb_others(struct cpumask *cpumask, struct mm_struct *mm, unsigned long start, unsigned long end) { struct flush_tlb_info info; + unsigned int cpu; if (end == 0) end = start + PAGE_SIZE; @@ -262,8 +309,6 @@ void native_flush_tlb_others(const struct cpumask *cpumask, (end - start) >> PAGE_SHIFT); if (is_uv_system()) { - unsigned int cpu; - cpu = smp_processor_id(); cpumask = uv_flush_tlb_others(cpumask, mm, start, end, cpu); if (cpumask) @@ -271,6 +316,17 @@ void native_flush_tlb_others(const struct cpumask *cpumask, &info, 1); return; } + + /* + * Instead of sending IPIs to CPUs in lazy TLB mode, move that + * CPU's TLB state to TLBSTATE_FLUSH, causing the TLB to be flushed + * at the next context switch. 
+ */ + for_each_cpu(cpu, cpumask) { + if (lazy_tlb_can_skip_flush(cpu)) + cpumask_clear_cpu(cpu, cpumask); + } + smp_call_function_many(cpumask, flush_tlb_func, &info, 1); } diff --git a/arch/x86/platform/uv/tlb_uv.c b/arch/x86/platform/uv/tlb_uv.c index fdb4d42b4ce5..7a2221a81e77 100644 --- a/arch/x86/platform/uv/tlb_uv.c +++ b/arch/x86/platform/uv/tlb_uv.c @@ -1090,7 +1090,7 @@ static int set_distrib_bits(struct cpumask *flush_mask, struct bau_control *bcp, * Returns pointer to cpumask if some remote flushing remains to be * done. The returned pointer is valid till preemption is re-enabled. */ -const struct cpumask *uv_flush_tlb_others(const struct cpumask *cpumask, +struct cpumask *uv_flush_tlb_others(struct cpumask *cpumask, struct mm_struct *mm, unsigned long start, unsigned long end, diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index 67433714b791..0e3e5969527f 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -1370,7 +1370,7 @@ static void xen_flush_tlb_single(unsigned long addr) preempt_enable(); } -static void xen_flush_tlb_others(const struct cpumask *cpus, +static void xen_flush_tlb_others(struct cpumask *cpus, struct mm_struct *mm, unsigned long start, unsigned long end) {