On Wed, Jun 20, 2018 at 12:57 PM Rik van Riel <[email protected]> wrote:
>
> Lazy TLB mode can result in an idle CPU being woken up by a TLB flush,
> when all it really needs to do is reload %CR3 at the next context switch,
> assuming no page table pages got freed.
>
> This patch deals with that issue by introducing a third TLB state,
> TLBSTATE_FLUSH, which causes %CR3 to be reloaded at the next context
> switch.
>
> Atomic compare and exchange is used to close races between the TLB
> shootdown code and the context switch code. Keying off just the
> tlb_gen is unlikely to be enough, since that would not give
> lazy_tlb_can_skip_flush() information on when it is facing a race
> and has to send the IPI to a CPU in the middle of a LAZY -> OK switch.
>
> Unlike the 2016 version of this patch, CPUs in TLBSTATE_LAZY are not
> removed from the mm_cpumask(mm), since that would prevent the TLB
> flush IPIs at page table free time from being sent to all the CPUs
> that need them.
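If I am reading the new state machine right, the handshake being added
boils down to the following. This is my own userspace model, with the
GCC __atomic builtins standing in for the kernel's cmpxchg(); it is a
distilled sketch of the idea, not the patch code:

/*
 * Userspace model of the TLBSTATE_OK / LAZY / FLUSH handshake (sketch
 * only, not the patch code): the flush side tries LAZY -> FLUSH, the
 * context switch side tries LAZY -> OK, and whoever loses the cmpxchg
 * defers to the other side.
 */
#include <stdbool.h>
#include <stdio.h>

enum { TLBSTATE_OK, TLBSTATE_LAZY, TLBSTATE_FLUSH };

static int tlbstate = TLBSTATE_LAZY;    /* per-CPU in the real thing */

/* Flush side: may we skip the IPI to this CPU? */
static bool lazy_tlb_can_skip_flush(void)
{
        int old = TLBSTATE_LAZY;

        /* LAZY -> FLUSH; fails if the CPU already went back to OK. */
        __atomic_compare_exchange_n(&tlbstate, &old, TLBSTATE_FLUSH, false,
                                    __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST);
        return old != TLBSTATE_OK;
}

/* Context switch side: must we flush before reusing the old TLB state? */
static bool needs_flush_at_switch(void)
{
        int old = TLBSTATE_LAZY;

        /* LAZY -> OK; fails if a flusher already moved us to FLUSH. */
        __atomic_compare_exchange_n(&tlbstate, &old, TLBSTATE_OK, false,
                                    __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST);
        if (old != TLBSTATE_FLUSH)
                return false;

        /* A flush was deferred to us: do it now and drop back to OK. */
        __atomic_store_n(&tlbstate, TLBSTATE_OK, __ATOMIC_SEQ_CST);
        return true;
}

int main(void)
{
        printf("flusher may skip IPI: %d\n", lazy_tlb_can_skip_flush());
        printf("switch must flush:    %d\n", needs_flush_at_switch());
        return 0;
}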
Eek, this is so complicated. In the 2016 version of the patches, you
needed all this. But I rewrote the whole subsystem to make it easier
now :) I think that you can get rid of all of this and instead just
revert the relevant parts of:

b956575bed91ecfb136a8300742ecbbf451471ab

All the bookkeeping is already in place -- no need for new state.
(Rough sketch of what I mean below the quoted patch.)

>
> Signed-off-by: Rik van Riel <[email protected]>
> Tested-by: Song Liu <[email protected]>
> ---
>  arch/x86/include/asm/tlbflush.h |   5 ++
>  arch/x86/include/asm/uv/uv.h    |   6 +--
>  arch/x86/mm/tlb.c               | 108 ++++++++++++++++++++++++++++++++++++----
>  arch/x86/platform/uv/tlb_uv.c   |   2 +-
>  4 files changed, 107 insertions(+), 14 deletions(-)
>
> diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h
> index 88a4d6b87ff7..0a8bdd6084de 100644
> --- a/arch/x86/include/asm/tlbflush.h
> +++ b/arch/x86/include/asm/tlbflush.h
> @@ -171,6 +171,7 @@ struct tlb_context {
>
>  #define TLBSTATE_OK     0
>  #define TLBSTATE_LAZY   1
> +#define TLBSTATE_FLUSH  2
>
>  struct tlb_state {
>         /*
> @@ -199,6 +200,10 @@ struct tlb_state {
>          * We're heuristically guessing that the CR3 load we
>          * skipped more than makes up for the overhead added by
>          * lazy mode.
> +        *
> +        * - Lazily using a real mm, which has seen a TLB invalidation on
> +        *   other CPUs. TLB needs to be flushed at context switch time,
> +        *   state == TLBSTATE_FLUSH.
>          */
>         int state;
>
> diff --git a/arch/x86/include/asm/uv/uv.h b/arch/x86/include/asm/uv/uv.h
> index a80c0673798f..d801afb5fe90 100644
> --- a/arch/x86/include/asm/uv/uv.h
> +++ b/arch/x86/include/asm/uv/uv.h
> @@ -17,7 +17,7 @@ extern int is_uv_hubless(void);
>  extern void uv_cpu_init(void);
>  extern void uv_nmi_init(void);
>  extern void uv_system_init(void);
> -extern const struct cpumask *uv_flush_tlb_others(const struct cpumask *cpumask,
> +extern struct cpumask *uv_flush_tlb_others(struct cpumask *cpumask,
>                                                  const struct flush_tlb_info *info);
>
> @@ -27,8 +27,8 @@ static inline int is_uv_system(void) { return 0; }
>  static inline int is_uv_hubless(void) { return 0; }
>  static inline void uv_cpu_init(void) { }
>  static inline void uv_system_init(void) { }
> -static inline const struct cpumask *
> -uv_flush_tlb_others(const struct cpumask *cpumask,
> +static inline struct cpumask *
> +uv_flush_tlb_others(struct cpumask *cpumask,
>                     const struct flush_tlb_info *info)
>  { return cpumask; }
>
> diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
> index e063e623e52c..4b27d8469848 100644
> --- a/arch/x86/mm/tlb.c
> +++ b/arch/x86/mm/tlb.c
> @@ -7,6 +7,7 @@
>  #include <linux/export.h>
>  #include <linux/cpu.h>
>  #include <linux/debugfs.h>
> +#include <linux/gfp.h>
>
>  #include <asm/tlbflush.h>
>  #include <asm/mmu_context.h>
> @@ -227,8 +228,6 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
>                 __flush_tlb_all();
>         }
>  #endif
> -       this_cpu_write(cpu_tlbstate.state, TLBSTATE_OK);
> -
>         /*
>          * The membarrier system call requires a full memory barrier and
>          * core serialization before returning to user-space, after
> @@ -236,8 +235,11 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
>          * memory barrier and core serializing instruction.
>          */
>         if (real_prev == next) {
> +               int *tlbstate = this_cpu_ptr(&cpu_tlbstate.state);
> +               int oldstate = *tlbstate;
> +
>                 VM_WARN_ON(this_cpu_read(cpu_tlbstate.ctxs[prev_asid].ctx_id) !=
> -                          next->context.ctx_id);
> +                          next->context.ctx_id);
>
>                 /*
>                  * We don't currently support having a real mm loaded without
> @@ -250,6 +252,26 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
>                               !cpumask_test_cpu(cpu, mm_cpumask(next))))
>                         cpumask_set_cpu(cpu, mm_cpumask(next));
>
> +               /*
> +                * In lazy TLB mode. The atomic exchange makes sure there was
> +                * no TLB shootdown all the way to this context switch.
> +                */
> +               if (oldstate == TLBSTATE_LAZY)
> +                       oldstate = cmpxchg(tlbstate, oldstate, TLBSTATE_OK);
> +
> +               /*
> +                * There was a TLB invalidation while this CPU was idle.
> +                * The code in choose_new_asid will figure out what kind
> +                * of flush is needed.
> +                */
> +               if (oldstate == TLBSTATE_FLUSH)
> +                       goto reload_cr3;
> +
> +               /*
> +                * Nothing to do. Either we are switching between two
> +                * threads in the same process, or we were in TLBSTATE_LAZY
> +                * without any TLB invalidations happening.
> +                */
>                 return;
>         } else {
>                 u16 new_asid;
> @@ -294,6 +316,8 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
>          * Start remote flushes and then read tlb_gen.
>          */
>         cpumask_set_cpu(cpu, mm_cpumask(next));
> +reload_cr3:
> +       this_cpu_write(cpu_tlbstate.state, TLBSTATE_OK);
>         next_tlb_gen = atomic64_read(&next->context.tlb_gen);
>
>         choose_new_asid(next, next_tlb_gen, &new_asid, &need_flush);
> @@ -454,6 +478,9 @@ static void flush_tlb_func_common(const struct flush_tlb_info *f,
>          * paging-structure cache to avoid speculatively reading
>          * garbage into our TLB. Since switching to init_mm is barely
>          * slower than a minimal flush, just switch to init_mm.
> +        *
> +        * This should be rare, with native_flush_tlb_others skipping
> +        * IPIs to lazy TLB mode CPUs.
>          */
>         switch_mm_irqs_off(NULL, &init_mm, NULL);
>         return;
> @@ -557,9 +584,57 @@ static void flush_tlb_func_remote(void *info)
>         flush_tlb_func_common(f, false, TLB_REMOTE_SHOOTDOWN);
>  }
>
> +/*
> + * Determine whether a CPU's TLB needs to be flushed now, or whether the
> + * flush can be delayed until the next context switch, by changing the
> + * tlbstate from TLBSTATE_LAZY to TLBSTATE_FLUSH.
> + */
> +static bool lazy_tlb_can_skip_flush(int cpu)
> +{
> +       int *tlbstate = &per_cpu(cpu_tlbstate.state, cpu);
> +       int old;
> +
> +       switch (*tlbstate) {
> +       case TLBSTATE_FLUSH:
> +               /* The TLB will be flushed on the next context switch. */
> +               return true;
> +       case TLBSTATE_LAZY:
> +               /*
> +                * The CPU is in TLBSTATE_LAZY, which could context switch back
> +                * to TLBSTATE_OK, re-using the old TLB state without a flush.
> +                * if that happened, send a TLB flush IPI.
> +                *
> +                * Otherwise, the state is now TLBSTATE_FLUSH, and TLB will
> +                * be flushed at the next context switch. Skip the IPI.
> +                */
> +               old = cmpxchg(tlbstate, TLBSTATE_LAZY, TLBSTATE_FLUSH);
> +               return old != TLBSTATE_OK;
> +       case TLBSTATE_OK:
> +       default:
> +               /* A task on the CPU is actively using the mm. Flush the TLB. */
> +               return false;
> +       }
> +}
> +
>  void native_flush_tlb_others(const struct cpumask *cpumask,
>                              const struct flush_tlb_info *info)
>  {
> +       cpumask_t *mask = (struct cpumask *)cpumask;
> +       cpumask_var_t varmask;
> +       bool can_lazy_flush = false;
> +       unsigned int cpu;
> +
> +       /*
> +        * A temporary cpumask allows the kernel to skip sending IPIs
> +        * to CPUs in lazy TLB state, without also removing them from
> +        * mm_cpumask(mm).
> +        */
> +       if (alloc_cpumask_var(&varmask, GFP_ATOMIC)) {
> +               cpumask_copy(varmask, cpumask);
> +               mask = varmask;
> +               can_lazy_flush = true;
> +       }
> +
>         count_vm_tlb_event(NR_TLB_REMOTE_FLUSH);
>         if (info->end == TLB_FLUSH_ALL)
>                 trace_tlb_flush(TLB_REMOTE_SEND_IPI, TLB_FLUSH_ALL);
> @@ -583,17 +658,30 @@ void native_flush_tlb_others(const struct cpumask *cpumask,
>                  * that UV should be updated so that smp_call_function_many(),
>                  * etc, are optimal on UV.
>                  */
> -               unsigned int cpu;
> -
>                 cpu = smp_processor_id();
> -               cpumask = uv_flush_tlb_others(cpumask, info);
> -               if (cpumask)
> -                       smp_call_function_many(cpumask, flush_tlb_func_remote,
> +               mask = uv_flush_tlb_others(mask, info);
> +               if (mask)
> +                       smp_call_function_many(mask, flush_tlb_func_remote,
>                                                (void *)info, 1);
> -               return;
> +               goto out;
> +       }
> +
> +       /*
> +        * Instead of sending IPIs to CPUs in lazy TLB mode, move that
> +        * CPU's TLB state to TLBSTATE_FLUSH, causing the TLB to be flushed
> +        * at the next context switch, or at page table free time.
> +        */
> +       if (can_lazy_flush) {
> +               for_each_cpu(cpu, mask) {
> +                       if (lazy_tlb_can_skip_flush(cpu))
> +                               cpumask_clear_cpu(cpu, mask);
> +               }
>         }
> -       smp_call_function_many(cpumask, flush_tlb_func_remote,
> +
> +       smp_call_function_many(mask, flush_tlb_func_remote,
>                                (void *)info, 1);
> +out:
> +       free_cpumask_var(varmask);
>  }
>
>  /*
> diff --git a/arch/x86/platform/uv/tlb_uv.c b/arch/x86/platform/uv/tlb_uv.c
> index b36caae0fb2f..42730acec8b7 100644
> --- a/arch/x86/platform/uv/tlb_uv.c
> +++ b/arch/x86/platform/uv/tlb_uv.c
> @@ -1102,7 +1102,7 @@ static int set_distrib_bits(struct cpumask *flush_mask, struct bau_control *bcp,
>   * Returns pointer to cpumask if some remote flushing remains to be
>   * done. The returned pointer is valid till preemption is re-enabled.
>   */
> -const struct cpumask *uv_flush_tlb_others(const struct cpumask *cpumask,
> +struct cpumask *uv_flush_tlb_others(struct cpumask *cpumask,
>                                           const struct flush_tlb_info *info)
>  {
>         unsigned int cpu = smp_processor_id();
> --
> 2.14.4
>
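To be concrete about "all the bookkeeping is already in place": the mm
already carries a generation counter (context.tlb_gen) that TLB flushes
bump, and each CPU tracks the generation it has flushed up to, so a lazy
CPU can simply be skipped by the IPI path and catch up by comparing
generations when it switches back to the mm. Here is a standalone
userspace model of that idea -- my own sketch, with names that only
loosely mirror the kernel's tlb_gen tracking, not actual kernel code:

/*
 * Model of generation-based deferred flushing (sketch only): the flush
 * path skips lazy CPUs entirely, and a lazy CPU compares generations
 * when it switches back to the mm, flushing only if it fell behind.
 */
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

struct mm {
        _Atomic long tlb_gen;   /* bumped for every flush of this mm */
};

struct cpu_state {
        long seen_tlb_gen;      /* generation this CPU has flushed up to */
        bool lazy;              /* lazy TLB: kernel thread on an old user mm */
};

/* Flush side: bump the generation, "IPI" only the non-lazy CPUs. */
static void flush_others(struct mm *mm, struct cpu_state *cpus, int ncpus)
{
        long gen = atomic_fetch_add(&mm->tlb_gen, 1) + 1;

        for (int i = 0; i < ncpus; i++)
                if (!cpus[i].lazy)
                        cpus[i].seen_tlb_gen = gen;     /* remote flush ran */
}

/* Context switch side: flush only if this CPU missed a generation. */
static bool switch_back_to_mm(struct mm *mm, struct cpu_state *cpu)
{
        long next_gen = atomic_load(&mm->tlb_gen);
        bool need_flush = cpu->seen_tlb_gen < next_gen;

        cpu->lazy = false;
        cpu->seen_tlb_gen = next_gen;   /* the flush (or no-op) catches us up */
        return need_flush;
}

int main(void)
{
        struct mm mm = { .tlb_gen = 1 };
        struct cpu_state cpus[2] = { { 1, false }, { 1, true } };

        flush_others(&mm, cpus, 2);     /* CPU 1 is lazy and gets no IPI */
        printf("lazy CPU flushes at switch: %d\n",
               switch_back_to_mm(&mm, &cpus[1]));
        return 0;
}

As far as I can tell this is the same comparison that choose_new_asid()
already does to set need_flush in the patch above, which is why I don't
think a third tlbstate is needed.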

