Re: [PATCH 2/4] x86: use common aperfmperf_khz_on_cpu() to calculate KHz using APERF/MPERF
On Tue, Jun 20, 2017 at 6:15 AM, Thomas Gleixner wrote: > On Fri, 16 Jun 2017, Len Brown wrote: >> +#include >> +#include >> +#include >> +#include >> + >> +struct aperfmperf_sample { >> + unsigned int khz; >> + unsigned long jiffies; >> + u64 aperf; >> + u64 mperf; > > Nit. Please write these in tabular fashion: > unsignedint khz; > unsigned long jiffies; > u64 aperf; > u64 mperf; sure, no problem -- I agreed that looks better. But it seems there is some inconsistency about this style nit -- even in this directory. If there is consensus going forward, would it make sense for coding-style.rst and checkpatch.pl to squawk about this, so you don't have to? >> +}; >> + >> +static DEFINE_PER_CPU(struct aperfmperf_sample, samples); >> + >> +/* >> + * aperfmperf_snapshot_khz() >> + * On the current CPU, snapshot APERF, MPERF, and jiffies >> + * unless we already did it within 10ms >> + * calculate kHz, save snapshot >> + */ >> +static void aperfmperf_snapshot_khz(void *dummy) >> +{ >> + u64 aperf, aperf_delta; >> + u64 mperf, mperf_delta; >> + struct aperfmperf_sample *s = &get_cpu_var(samples); > > This is invoked via a smp function call, so you want > > this_cpu_ptr(samples) > > here. done. thanks! >> + /* Don't bother re-computing within 10 ms */ >> + if (time_before(jiffies, s->jiffies + HZ/100)) >> + goto out; > > That way you can spare the gotos and simply return happy to remove the gotos, thanks! >> index a5ce0bbe..cfc6220 100644 >> --- a/include/linux/cpufreq.h >> +++ b/include/linux/cpufreq.h >> @@ -883,6 +883,19 @@ static inline bool policy_has_boost_freq(struct >> cpufreq_policy *policy) >> } >> #endif >> >> +#ifdef CONFIG_X86 >> +extern unsigned int aperfmperf_khz_on_cpu(unsigned int cpu); >> +static inline unsigned int arch_freq_get_on_cpu(int cpu) > ... having cpu as int and unsigned int is not really consistent. True. the SMP code uses int cpu, while cpufreq_policy.cpu is an unsigned int. I'll use "int cpu". > Please don't add arch specific crap in general headers. > > The simple way to avoid that is to have a weak function and have an arch > override for it. If that does not work because the cpufreq stuff must be > built as a module, then you still can avoid CONFIG_X86 and have something > like CONFIG_ARCH_HAS_FOO. Thanks -- this is not modular code, and so yes, __weak works -- hadn't had occasion to use it before... Attached is the incremental diff responding to your comments, in case that is convenient. I'll re-send this series with this patch updated in a sec. thanks, Len Brown, Intel Open Source Technology Center diff --git a/arch/x86/kernel/cpu/aperfmperf.c b/arch/x86/kernel/cpu/aperfmperf.c index 5ccf63a..d869c86 100644 --- a/arch/x86/kernel/cpu/aperfmperf.c +++ b/arch/x86/kernel/cpu/aperfmperf.c @@ -14,10 +14,10 @@ #include struct aperfmperf_sample { - unsigned int khz; - unsigned long jiffies; - u64 aperf; - u64 mperf; + unsigned intkhz; + unsigned long jiffies; + u64 aperf; + u64 mperf; }; static DEFINE_PER_CPU(struct aperfmperf_sample, samples); @@ -32,11 +32,11 @@ static void aperfmperf_snapshot_khz(void *dummy) { u64 aperf, aperf_delta; u64 mperf, mperf_delta; - struct aperfmperf_sample *s = &get_cpu_var(samples); + struct aperfmperf_sample *s = this_cpu_ptr(&samples); /* Don't bother re-computing within 10 ms */ if (time_before(jiffies, s->jiffies + HZ/100)) - goto out; + return; rdmsrl(MSR_IA32_APERF, aperf); rdmsrl(MSR_IA32_MPERF, mperf); @@ -49,7 +49,7 @@ static void aperfmperf_snapshot_khz(void *dummy) * increments faster than we can read it. */ if (mperf_delta == 0) - goto out; + return; /* * if (cpu_khz * aperf_delta) fits into ULLONG_MAX, then @@ -63,12 +63,9 @@ static void aperfmperf_snapshot_khz(void *dummy) s->jiffies = jiffies; s->aperf = aperf; s->mperf = mperf; - -out: - put_cpu_var(samples); } -unsigned int aperfmperf_khz_on_cpu(unsigned int cpu) +unsigned int arch_freq_get_on_cpu(int cpu) { if (!cpu_khz) return 0; diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c index a667fac..6e7424d 100644 --- a/drivers/cpufreq/cpufreq.c +++ b/drivers/cpufreq/cpufreq.c @@ -632,6 +632,11 @@ show_one(cpuinfo_transition_latency, cpuinfo.transition_latency); show_one(scaling_min_freq, min); show_one(scaling_max_freq, max); +__weak unsigned int arch_freq_get_on_cpu(int cpu) +{ + return 0; +} + static ssize_t show_scaling_cur_freq(struct cpufreq_policy *policy, char *buf) { ssize_t ret; diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h index cfc6220..905117b 100644 --- a/include/linux/cpufreq.h +++ b/include/li
Re: [PATCH 2/4] x86: use common aperfmperf_khz_on_cpu() to calculate KHz using APERF/MPERF
On Fri, 16 Jun 2017, Len Brown wrote: > +#include > +#include > +#include > +#include > + > +struct aperfmperf_sample { > + unsigned int khz; > + unsigned long jiffies; > + u64 aperf; > + u64 mperf; Nit. Please write these in tabular fashion: unsignedint khz; unsigned long jiffies; u64 aperf; u64 mperf; > +}; > + > +static DEFINE_PER_CPU(struct aperfmperf_sample, samples); > + > +/* > + * aperfmperf_snapshot_khz() > + * On the current CPU, snapshot APERF, MPERF, and jiffies > + * unless we already did it within 10ms > + * calculate kHz, save snapshot > + */ > +static void aperfmperf_snapshot_khz(void *dummy) > +{ > + u64 aperf, aperf_delta; > + u64 mperf, mperf_delta; > + struct aperfmperf_sample *s = &get_cpu_var(samples); This is invoked via a smp function call, so you want this_cpu_ptr(samples) here. > + /* Don't bother re-computing within 10 ms */ > + if (time_before(jiffies, s->jiffies + HZ/100)) > + goto out; That way you can spare the gotos and simply return > index a5ce0bbe..cfc6220 100644 > --- a/include/linux/cpufreq.h > +++ b/include/linux/cpufreq.h > @@ -883,6 +883,19 @@ static inline bool policy_has_boost_freq(struct > cpufreq_policy *policy) > } > #endif > > +#ifdef CONFIG_X86 > +extern unsigned int aperfmperf_khz_on_cpu(unsigned int cpu); > +static inline unsigned int arch_freq_get_on_cpu(int cpu) Please don't add arch specific crap in general headers. Also having cpu as int and unsigned int is not really consistent. The simple way to avoid that is to have a weak function and have an arch override for it. If that does not work because the cpufreq stuff must be built as a module, then you still can avoid CONFIG_X86 and have something like CONFIG_ARCH_HAS_FOO. Thanks, tglx
[PATCH 2/4] x86: use common aperfmperf_khz_on_cpu() to calculate KHz using APERF/MPERF
From: Len Brown The goal of this change is to give users a uniform and meaningful result when they read /sys/...cpufreq/scaling_cur_freq on modern x86 hardware, as compared to what they get today. Modern x86 processors include the hardware needed to accurately calculate frequency over an interval -- APERF, MPERF, and the TSC. Here we provide an x86 routine to make this calculation on supported hardware, and use it in preference to any driver driver-specific cpufreq_driver.get() routine. MHz is computed like so: MHz = base_MHz * delta_APERF / delta_MPERF MHz is the average frequency of the busy processor over a measurement interval. The interval is defined to be the time between successive invocations of aperfmperf_khz_on_cpu(), which are expected to to happen on-demand when users read sysfs attribute cpufreq/scaling_cur_freq. As with previous methods of calculating MHz, idle time is excluded. base_MHz above is from TSC calibration global "cpu_khz". This x86 native method to calculate MHz returns a meaningful result no matter if P-states are controlled by hardware or firmware and/or if the Linux cpufreq sub-system is or is-not installed. When this routine is invoked more frequently, the measurement interval becomes shorter. However, the code limits re-computation to 10ms intervals so that average frequency remains meaningful. Discerning users are encouraged to take advantage of the turbostat(8) utility, which can gracefully handle concurrent measurement intervals of arbitrary length. Signed-off-by: Len Brown --- arch/x86/kernel/cpu/Makefile | 1 + arch/x86/kernel/cpu/aperfmperf.c | 82 drivers/cpufreq/cpufreq.c| 7 +++- include/linux/cpufreq.h | 13 +++ 4 files changed, 102 insertions(+), 1 deletion(-) create mode 100644 arch/x86/kernel/cpu/aperfmperf.c diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile index 521..cdf8249 100644 --- a/arch/x86/kernel/cpu/Makefile +++ b/arch/x86/kernel/cpu/Makefile @@ -21,6 +21,7 @@ obj-y += common.o obj-y += rdrand.o obj-y += match.o obj-y += bugs.o +obj-$(CONFIG_CPU_FREQ) += aperfmperf.o obj-$(CONFIG_PROC_FS) += proc.o obj-$(CONFIG_X86_FEATURE_NAMES) += capflags.o powerflags.o diff --git a/arch/x86/kernel/cpu/aperfmperf.c b/arch/x86/kernel/cpu/aperfmperf.c new file mode 100644 index 000..5ccf63a --- /dev/null +++ b/arch/x86/kernel/cpu/aperfmperf.c @@ -0,0 +1,82 @@ +/* + * x86 APERF/MPERF KHz calculation for + * /sys/.../cpufreq/scaling_cur_freq + * + * Copyright (C) 2017 Intel Corp. + * Author: Len Brown + * + * This file is licensed under GPLv2. + */ + +#include +#include +#include +#include + +struct aperfmperf_sample { + unsigned int khz; + unsigned long jiffies; + u64 aperf; + u64 mperf; +}; + +static DEFINE_PER_CPU(struct aperfmperf_sample, samples); + +/* + * aperfmperf_snapshot_khz() + * On the current CPU, snapshot APERF, MPERF, and jiffies + * unless we already did it within 10ms + * calculate kHz, save snapshot + */ +static void aperfmperf_snapshot_khz(void *dummy) +{ + u64 aperf, aperf_delta; + u64 mperf, mperf_delta; + struct aperfmperf_sample *s = &get_cpu_var(samples); + + /* Don't bother re-computing within 10 ms */ + if (time_before(jiffies, s->jiffies + HZ/100)) + goto out; + + rdmsrl(MSR_IA32_APERF, aperf); + rdmsrl(MSR_IA32_MPERF, mperf); + + aperf_delta = aperf - s->aperf; + mperf_delta = mperf - s->mperf; + + /* +* There is no architectural guarantee that MPERF +* increments faster than we can read it. +*/ + if (mperf_delta == 0) + goto out; + + /* +* if (cpu_khz * aperf_delta) fits into ULLONG_MAX, then +* khz = (cpu_khz * aperf_delta) / mperf_delta +*/ + if (div64_u64(ULLONG_MAX, cpu_khz) > aperf_delta) + s->khz = div64_u64((cpu_khz * aperf_delta), mperf_delta); + else/* khz = aperf_delta / (mperf_delta / cpu_khz) */ + s->khz = div64_u64(aperf_delta, + div64_u64(mperf_delta, cpu_khz)); + s->jiffies = jiffies; + s->aperf = aperf; + s->mperf = mperf; + +out: + put_cpu_var(samples); +} + +unsigned int aperfmperf_khz_on_cpu(unsigned int cpu) +{ + if (!cpu_khz) + return 0; + + if (!static_cpu_has(X86_FEATURE_APERFMPERF)) + return 0; + + smp_call_function_single(cpu, aperfmperf_snapshot_khz, NULL, 1); + + return per_cpu(samples.khz, cpu); +} diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c index 26b643d..a667fac 100644 --- a/drivers/cpufreq/cpufreq.c +++ b/drivers/cpufreq/cpufreq.c @@ -635,8 +635,13 @@ show_one(scaling_max_freq, max); static ssize_t show_scaling_cur_freq(struct cpufreq_policy *policy, char *buf) { ssize_t