On Sat, 27 Aug 2016 16:02:25 -0700
Linus Torvalds <torva...@linux-foundation.org> wrote:

> Yeah, with those small fixes from Ingo, I definitely don't think this
> looks hacky at all. This all seems to be exactly what we should always
> have done.

OK, so I was too tired yesterday to do kernel hacking, and
missed yet another bit (xen_flush_tlb_others). Sigh.

Otherwise, the patch is identical.

Looking forward to Ben's test results.

---8<---

Subject: x86,mm,sched: make lazy TLB mode even lazier

Lazy TLB mode can result in an idle CPU being woken up for a TLB
flush, when all it really needed to do was reload %CR3 (flushing
its TLB) before the next context switch.

This is mostly fine on bare metal, though sub-optimal from a power
saving point of view, and deeper C-states could make TLB flushes
take a little longer than desired.

On virtual machines, the pain can be much worse, especially if a
currently non-running VCPU is woken up for a TLB invalidation
IPI, on a CPU that is busy running another task. It could take
a while before that IPI is handled, leading to performance issues.

This patch deals with the issue by introducing a third TLB state,
TLBSTATE_FLUSH, which causes the TLB to be flushed (by reloading
%CR3) at the next context switch.

A CPU that transitions from TLBSTATE_LAZY to TLBSTATE_OK during
the attempted transition to TLBSTATE_FLUSH will get a TLB flush
IPI, just like a CPU that was in TLBSTATE_OK to begin with.

Nothing is done for a CPU that is already in TLBSTATE_FLUSH mode.

Signed-off-by: Rik van Riel <r...@redhat.com>
Reported-by: Benjamin Serebrin <sereb...@google.com>
---
 arch/x86/include/asm/paravirt_types.h |  2 +-
 arch/x86/include/asm/tlbflush.h       |  3 +-
 arch/x86/include/asm/uv/uv.h          |  6 ++--
 arch/x86/mm/tlb.c                     | 64 ++++++++++++++++++++++++++++++++---
 arch/x86/platform/uv/tlb_uv.c         |  2 +-
 arch/x86/xen/mmu.c                    |  2 +-
 6 files changed, 68 insertions(+), 11 deletions(-)

diff --git a/arch/x86/include/asm/paravirt_types.h b/arch/x86/include/asm/paravirt_types.h
index 7fa9e7740ba3..b7e695c90c43 100644
--- a/arch/x86/include/asm/paravirt_types.h
+++ b/arch/x86/include/asm/paravirt_types.h
@@ -225,7 +225,7 @@ struct pv_mmu_ops {
        void (*flush_tlb_user)(void);
        void (*flush_tlb_kernel)(void);
        void (*flush_tlb_single)(unsigned long addr);
-       void (*flush_tlb_others)(const struct cpumask *cpus,
+       void (*flush_tlb_others)(struct cpumask *cpus,
                                 struct mm_struct *mm,
                                 unsigned long start,
                                 unsigned long end);
diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h
index 4e5be94e079a..c3dbacbc49be 100644
--- a/arch/x86/include/asm/tlbflush.h
+++ b/arch/x86/include/asm/tlbflush.h
@@ -304,12 +304,13 @@ extern void flush_tlb_kernel_range(unsigned long start, unsigned long end);
 
 #define flush_tlb()    flush_tlb_current_task()
 
-void native_flush_tlb_others(const struct cpumask *cpumask,
+void native_flush_tlb_others(struct cpumask *cpumask,
                                struct mm_struct *mm,
                                unsigned long start, unsigned long end);
 
 #define TLBSTATE_OK    1
 #define TLBSTATE_LAZY  2
+#define TLBSTATE_FLUSH 3
 
 static inline void reset_lazy_tlbstate(void)
 {
diff --git a/arch/x86/include/asm/uv/uv.h b/arch/x86/include/asm/uv/uv.h
index 062921ef34e9..7e83cc633ba1 100644
--- a/arch/x86/include/asm/uv/uv.h
+++ b/arch/x86/include/asm/uv/uv.h
@@ -13,7 +13,7 @@ extern int is_uv_system(void);
 extern void uv_cpu_init(void);
 extern void uv_nmi_init(void);
 extern void uv_system_init(void);
-extern const struct cpumask *uv_flush_tlb_others(const struct cpumask *cpumask,
+extern struct cpumask *uv_flush_tlb_others(struct cpumask *cpumask,
                                                 struct mm_struct *mm,
                                                 unsigned long start,
                                                 unsigned long end,
@@ -25,8 +25,8 @@ static inline enum uv_system_type get_uv_system_type(void) { return UV_NONE; }
 static inline int is_uv_system(void)   { return 0; }
 static inline void uv_cpu_init(void)   { }
 static inline void uv_system_init(void)        { }
-static inline const struct cpumask *
-uv_flush_tlb_others(const struct cpumask *cpumask, struct mm_struct *mm,
+static inline struct cpumask *
+uv_flush_tlb_others(struct cpumask *cpumask, struct mm_struct *mm,
                    unsigned long start, unsigned long end, unsigned int cpu)
 { return cpumask; }
 
diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
index 5643fd0b1a7d..634248b38db9 100644
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -140,10 +140,24 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
        }
 #ifdef CONFIG_SMP
          else {
+               int *tlbstate = this_cpu_ptr(&cpu_tlbstate.state);
+               int oldstate = *tlbstate;
+
+               if (unlikely(oldstate == TLBSTATE_LAZY)) {
+                       /*
+                        * The TLB flush code (lazy_tlb_can_skip_flush) can
+                        * move the TLB state to TLBSTATE_FLUSH concurrently
+                        * with a context switch. Using cmpxchg here will catch
+                        * that transition, causing a TLB flush below.
+                        */
+                       oldstate = cmpxchg(tlbstate, oldstate, TLBSTATE_OK);
+               }
                this_cpu_write(cpu_tlbstate.state, TLBSTATE_OK);
+
                BUG_ON(this_cpu_read(cpu_tlbstate.active_mm) != next);
 
-               if (!cpumask_test_cpu(cpu, mm_cpumask(next))) {
+               if (oldstate == TLBSTATE_FLUSH ||
+                               !cpumask_test_cpu(cpu, mm_cpumask(next))) {
                        /*
                         * On established mms, the mm_cpumask is only changed
                         * from irq context, from ptep_clear_flush() while in
@@ -242,11 +256,44 @@ static void flush_tlb_func(void *info)
 
 }
 
-void native_flush_tlb_others(const struct cpumask *cpumask,
+/*
+ * Determine whether a CPU's TLB needs to be flushed now, or whether the
+ * flush can be delayed until the next context switch, by changing the
+ * tlbstate from TLBSTATE_LAZY to TLBSTATE_FLUSH.
+ */
+static bool lazy_tlb_can_skip_flush(int cpu)
+{
+       int *tlbstate = &per_cpu(cpu_tlbstate.state, cpu);
+       int old;
+
+       switch (*tlbstate) {
+       case TLBSTATE_FLUSH:
+               /* The TLB will be flushed on the next context switch. */
+               return true;
+       case TLBSTATE_LAZY:
+               /*
+                * The CPU is in TLBSTATE_LAZY, which could context switch back
+                * to TLBSTATE_OK, re-using the old TLB state without a flush.
+                * If that happened, send a TLB flush IPI.
+                *
+                * Otherwise, the state is now TLBSTATE_FLUSH, and TLB will
+                * be flushed at the next context switch. Skip the IPI.
+                */
+               old = cmpxchg(tlbstate, TLBSTATE_LAZY, TLBSTATE_FLUSH);
+               return old != TLBSTATE_OK;
+       case TLBSTATE_OK:
+       default:
+               /* A task on the CPU is actively using the mm. Flush the TLB. */
+               return false;
+       }
+}
+
+void native_flush_tlb_others(struct cpumask *cpumask,
                                 struct mm_struct *mm, unsigned long start,
                                 unsigned long end)
 {
        struct flush_tlb_info info;
+       unsigned int cpu;
 
        if (end == 0)
                end = start + PAGE_SIZE;
@@ -262,8 +309,6 @@ void native_flush_tlb_others(const struct cpumask *cpumask,
                                (end - start) >> PAGE_SHIFT);
 
        if (is_uv_system()) {
-               unsigned int cpu;
-
                cpu = smp_processor_id();
                cpumask = uv_flush_tlb_others(cpumask, mm, start, end, cpu);
                if (cpumask)
@@ -271,6 +316,17 @@ void native_flush_tlb_others(const struct cpumask *cpumask,
                                                                &info, 1);
                return;
        }
+
+       /*
+        * Instead of sending IPIs to CPUs in lazy TLB mode, move that
+        * CPU's TLB state to TLBSTATE_FLUSH, causing the TLB to be flushed
+        * at the next context switch.
+        */
+       for_each_cpu(cpu, cpumask) {
+               if (lazy_tlb_can_skip_flush(cpu))
+                       cpumask_clear_cpu(cpu, cpumask);
+       }
+
        smp_call_function_many(cpumask, flush_tlb_func, &info, 1);
 }
 
diff --git a/arch/x86/platform/uv/tlb_uv.c b/arch/x86/platform/uv/tlb_uv.c
index fdb4d42b4ce5..7a2221a81e77 100644
--- a/arch/x86/platform/uv/tlb_uv.c
+++ b/arch/x86/platform/uv/tlb_uv.c
@@ -1090,7 +1090,7 @@ static int set_distrib_bits(struct cpumask *flush_mask, struct bau_control *bcp,
  * Returns pointer to cpumask if some remote flushing remains to be
  * done.  The returned pointer is valid till preemption is re-enabled.
  */
-const struct cpumask *uv_flush_tlb_others(const struct cpumask *cpumask,
+struct cpumask *uv_flush_tlb_others(struct cpumask *cpumask,
                                                struct mm_struct *mm,
                                                unsigned long start,
                                                unsigned long end,
diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c
index 67433714b791..0e3e5969527f 100644
--- a/arch/x86/xen/mmu.c
+++ b/arch/x86/xen/mmu.c
@@ -1370,7 +1370,7 @@ static void xen_flush_tlb_single(unsigned long addr)
        preempt_enable();
 }
 
-static void xen_flush_tlb_others(const struct cpumask *cpus,
+static void xen_flush_tlb_others(struct cpumask *cpus,
                                 struct mm_struct *mm, unsigned long start,
                                 unsigned long end)
 {

Reply via email to