On 12/11/2013 03:14 PM, Borislav Petkov wrote: > On Wed, Dec 11, 2013 at 03:08:35PM -0800, H. Peter Anvin wrote: >> So I would like to propose that we switch to using a percpu variable >> which is a single cache line of nothing at all. It would only ever >> be touched by MONITOR and for explicit wakeup. Hopefully that will >> resolve this problem without the need for the CLFLUSH. > > Yep, makes a lot of sense to me to have an exclusive (overloaded meaning > here :-)) cacheline only for that. And, if it works, we'll save us the > penalty from the CLFLUSH too, cool. >
Here is a POC patch... anyone willing to test it out? Two obvious things to watch out for: 1. I couldn't actually spot any obvious cases of a deliberate monitor trigger. 2. Should we do cpu_relax() for all users, not just powerclamp? -hpa
>From 20a54d54ea06f050650ab8923b7d9ee1d21b5317 Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" <h...@linux.intel.com> Date: Wed, 11 Dec 2013 16:31:04 -0800 Subject: [PATCH] x86, mwait: Use a dedicated percpu area for mwait doorbell We have used the cache line that includes thread_info.flags as the mwait doorbell. However, this is liable to be dirty in the cache, which may trigger hardware errata, plus it might cause false wakeups. Instead, use a dedicated wakeup doorbell area. Signed-off-by: H. Peter Anvin <h...@linux.intel.com> --- arch/x86/include/asm/cpufeature.h | 1 - arch/x86/include/asm/mwait.h | 46 ++++++++++++++++++++++++++++++++++++++ arch/x86/include/asm/processor.h | 23 ------------------- arch/x86/kernel/acpi/cstate.c | 6 +---- arch/x86/kernel/cpu/common.c | 17 ++++++++++++++ arch/x86/kernel/cpu/intel.c | 3 --- arch/x86/kernel/setup_percpu.c | 3 +++ arch/x86/kernel/smpboot.c | 19 +--------------- drivers/acpi/acpi_pad.c | 3 +-- drivers/idle/intel_idle.c | 4 +--- drivers/thermal/intel_powerclamp.c | 2 +- 11 files changed, 71 insertions(+), 56 deletions(-) diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h index e099f95..cdc77f3 100644 --- a/arch/x86/include/asm/cpufeature.h +++ b/arch/x86/include/asm/cpufeature.h @@ -96,7 +96,6 @@ #define X86_FEATURE_XTOPOLOGY (3*32+22) /* cpu topology enum extensions */ #define X86_FEATURE_TSC_RELIABLE (3*32+23) /* TSC is known to be reliable */ #define X86_FEATURE_NONSTOP_TSC (3*32+24) /* TSC does not stop in C states */ -#define X86_FEATURE_CLFLUSH_MONITOR (3*32+25) /* "" clflush reqd with monitor */ #define X86_FEATURE_EXTD_APICID (3*32+26) /* has extended APICID (8 bits) */ #define X86_FEATURE_AMD_DCM (3*32+27) /* multi-node processor */ #define X86_FEATURE_APERFMPERF (3*32+28) /* APERFMPERF */ diff --git a/arch/x86/include/asm/mwait.h b/arch/x86/include/asm/mwait.h index 2f366d0..4c82863 100644 --- a/arch/x86/include/asm/mwait.h +++ b/arch/x86/include/asm/mwait.h @@ -13,4 +13,50 @@ #define MWAIT_ECX_INTERRUPT_BREAK 0x1 +#ifndef __ASSEMBLY__ + +#include <linux/compiler.h> +#include <linux/kernel.h> +#include <linux/percpu.h> + +static inline void __monitor(const void *eax, unsigned long ecx, + unsigned long edx) +{ + /* "monitor %eax, %ecx, %edx;" */ + asm volatile(".byte 0x0f, 0x01, 0xc8;" + :: "a" (eax), "c" (ecx), "d"(edx)); +} + +static inline void __mwait(unsigned long eax, unsigned long ecx) +{ + /* "mwait %eax, %ecx;" */ + asm volatile(".byte 0x0f, 0x01, 0xc9;" + :: "a" (eax), "c" (ecx)); +} + +static inline void __sti_mwait(unsigned long eax, unsigned long ecx) +{ + trace_hardirqs_on(); + /* "mwait %eax, %ecx;" */ + asm volatile("sti; .byte 0x0f, 0x01, 0xc9;" + :: "a" (eax), "c" (ecx)); +} + +extern char __percpu *mwait_doorbell; + +void __init setup_mwait_doorbell(void); + +static inline void x86_monitor_doorbell(unsigned long ecx, unsigned long edx) +{ + __monitor(__this_cpu_ptr(mwait_doorbell), ecx, edx); + mb(); +} + +static inline void x86_mwait_doorbell_bing_bong(int cpu) +{ + ACCESS_ONCE(*per_cpu_ptr(mwait_doorbell, cpu)) = 0; +} + +#endif /* !__ASSEMBLY__ */ + #endif /* _ASM_X86_MWAIT_H */ diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index b7845a1..a51a79e 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -723,29 +723,6 @@ static inline void sync_core(void) #endif } -static inline void __monitor(const void *eax, unsigned long ecx, - unsigned long edx) -{ - /* "monitor %eax, %ecx, %edx;" */ - asm volatile(".byte 0x0f, 0x01, 0xc8;" - :: "a" (eax), "c" (ecx), "d"(edx)); -} - -static inline void __mwait(unsigned long eax, unsigned long ecx) -{ - /* "mwait %eax, %ecx;" */ - asm volatile(".byte 0x0f, 0x01, 0xc9;" - :: "a" (eax), "c" (ecx)); -} - -static inline void __sti_mwait(unsigned long eax, unsigned long ecx) -{ - trace_hardirqs_on(); - /* "mwait %eax, %ecx;" */ - asm volatile("sti; .byte 0x0f, 0x01, 0xc9;" - :: "a" (eax), "c" (ecx)); -} - extern void select_idle_routine(const struct cpuinfo_x86 *c); extern void init_amd_e400_c1e_mask(void); diff --git a/arch/x86/kernel/acpi/cstate.c b/arch/x86/kernel/acpi/cstate.c index d2b7f27..aec26c5 100644 --- a/arch/x86/kernel/acpi/cstate.c +++ b/arch/x86/kernel/acpi/cstate.c @@ -163,11 +163,7 @@ EXPORT_SYMBOL_GPL(acpi_processor_ffh_cstate_probe); void mwait_idle_with_hints(unsigned long ax, unsigned long cx) { if (!need_resched()) { - if (this_cpu_has(X86_FEATURE_CLFLUSH_MONITOR)) - clflush((void *)¤t_thread_info()->flags); - - __monitor((void *)¤t_thread_info()->flags, 0, 0); - smp_mb(); + x86_monitor_doorbell(0, 0); if (!need_resched()) __mwait(ax, cx); } diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 6abc172..b6b19ab 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -24,6 +24,7 @@ #include <linux/cpumask.h> #include <asm/pgtable.h> #include <linux/atomic.h> +#include <asm/mwait.h> #include <asm/proto.h> #include <asm/setup.h> #include <asm/apic.h> @@ -63,6 +64,22 @@ void __init setup_cpu_local_masks(void) alloc_bootmem_cpumask_var(&cpu_sibling_setup_mask); } +/* allocate percpu area for mwait doorbell */ +char __percpu *mwait_doorbell; + +void __init setup_mwait_doorbell(void) +{ + if (boot_cpu_has(X86_FEATURE_MWAIT)) { + mwait_doorbell = __alloc_percpu(boot_cpu_data.clflush_size, + boot_cpu_data.clflush_size); + + if (!mwait_doorbell) { + /* This should never happen... */ + panic("Unable to allocate mwait doorbell!\n"); + } + } +} + static void default_init(struct cpuinfo_x86 *c) { #ifdef CONFIG_X86_64 diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c index dc1ec0d..47b4e7b 100644 --- a/arch/x86/kernel/cpu/intel.c +++ b/arch/x86/kernel/cpu/intel.c @@ -387,9 +387,6 @@ static void init_intel(struct cpuinfo_x86 *c) set_cpu_cap(c, X86_FEATURE_PEBS); } - if (c->x86 == 6 && c->x86_model == 29 && cpu_has_clflush) - set_cpu_cap(c, X86_FEATURE_CLFLUSH_MONITOR); - #ifdef CONFIG_X86_64 if (c->x86 == 15) c->x86_cache_alignment = c->x86_clflush_size * 2; diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c index 5cdff03..917e1af 100644 --- a/arch/x86/kernel/setup_percpu.c +++ b/arch/x86/kernel/setup_percpu.c @@ -284,4 +284,7 @@ void __init setup_per_cpu_areas(void) /* Setup cpu initialized, callin, callout masks */ setup_cpu_local_masks(); + + /* Setup MWAIT doorbell */ + setup_mwait_doorbell(); } diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 85dc05a..558e097 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -1368,7 +1368,6 @@ static inline void mwait_play_dead(void) unsigned int eax, ebx, ecx, edx; unsigned int highest_cstate = 0; unsigned int highest_subcstate = 0; - void *mwait_ptr; int i; if (!this_cpu_has(X86_FEATURE_MWAIT)) @@ -1400,26 +1399,10 @@ static inline void mwait_play_dead(void) (highest_subcstate - 1); } - /* - * This should be a memory location in a cache line which is - * unlikely to be touched by other processors. The actual - * content is immaterial as it is not actually modified in any way. - */ - mwait_ptr = ¤t_thread_info()->flags; - wbinvd(); while (1) { - /* - * The CLFLUSH is a workaround for erratum AAI65 for - * the Xeon 7400 series. It's not clear it is actually - * needed, but it should be harmless in either case. - * The WBINVD is insufficient due to the spurious-wakeup - * case where we return around the loop. - */ - clflush(mwait_ptr); - __monitor(mwait_ptr, 0, 0); - mb(); + x86_monitor_doorbell(0, 0); __mwait(eax, 0); /* * If NMI wants to wake up CPU0, start CPU0. diff --git a/drivers/acpi/acpi_pad.c b/drivers/acpi/acpi_pad.c index fc6008f..38aaa8b 100644 --- a/drivers/acpi/acpi_pad.c +++ b/drivers/acpi/acpi_pad.c @@ -193,8 +193,7 @@ static int power_saving_thread(void *data) CLOCK_EVT_NOTIFY_BROADCAST_ENTER, &cpu); stop_critical_timings(); - __monitor((void *)¤t_thread_info()->flags, 0, 0); - smp_mb(); + x86_monitor_doorbell(0, 0); if (!need_resched()) __mwait(power_saving_mwait_eax, 1); diff --git a/drivers/idle/intel_idle.c b/drivers/idle/intel_idle.c index 92d1206..6fef6d9 100644 --- a/drivers/idle/intel_idle.c +++ b/drivers/idle/intel_idle.c @@ -376,9 +376,7 @@ static int intel_idle(struct cpuidle_device *dev, clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_ENTER, &cpu); if (!current_set_polling_and_test()) { - - __monitor((void *)¤t_thread_info()->flags, 0, 0); - smp_mb(); + x86_monitor_doorbell(0, 0); if (!need_resched()) __mwait(eax, ecx); } diff --git a/drivers/thermal/intel_powerclamp.c b/drivers/thermal/intel_powerclamp.c index 8f181b3..15cf013 100644 --- a/drivers/thermal/intel_powerclamp.c +++ b/drivers/thermal/intel_powerclamp.c @@ -438,7 +438,7 @@ static int clamp_thread(void *arg) */ local_touch_nmi(); stop_critical_timings(); - __monitor((void *)¤t_thread_info()->flags, 0, 0); + x86_monitor_doorbell(0, 0); cpu_relax(); /* allow HT sibling to run */ __mwait(eax, ecx); start_critical_timings(); -- 1.8.3.1