The intel_pstate driver is ported following its kernel code logic (commit: 93f0822d).
In the kernel, a user can adjust the limits via sysfs (limits.min_sysfs_pct/max_sysfs_pct). In Xen, the policy->min_perf_pct/max_perf_pct acts as the transit station. A user interacts with it via xenpm. Signed-off-by: Wei Wang <wei.w.w...@intel.com> --- xen/arch/x86/acpi/cpufreq/Makefile | 1 + xen/arch/x86/acpi/cpufreq/intel_pstate.c | 852 +++++++++++++++++++++++++++++++ xen/include/acpi/cpufreq/cpufreq.h | 6 + xen/include/asm-x86/acpi.h | 2 + xen/include/asm-x86/cpufeature.h | 1 + xen/include/asm-x86/msr-index.h | 3 + 6 files changed, 865 insertions(+) create mode 100644 xen/arch/x86/acpi/cpufreq/intel_pstate.c diff --git a/xen/arch/x86/acpi/cpufreq/Makefile b/xen/arch/x86/acpi/cpufreq/Makefile index f75da9b..99fa9f4 100644 --- a/xen/arch/x86/acpi/cpufreq/Makefile +++ b/xen/arch/x86/acpi/cpufreq/Makefile @@ -1,2 +1,3 @@ obj-y += cpufreq.o +obj-y += intel_pstate.o obj-y += powernow.o diff --git a/xen/arch/x86/acpi/cpufreq/intel_pstate.c b/xen/arch/x86/acpi/cpufreq/intel_pstate.c new file mode 100644 index 0000000..052a0d0 --- /dev/null +++ b/xen/arch/x86/acpi/cpufreq/intel_pstate.c @@ -0,0 +1,852 @@ +#include <xen/kernel.h> +#include <xen/types.h> +#include <xen/init.h> +#include <xen/bitmap.h> +#include <xen/cpumask.h> +#include <xen/timer.h> +#include <asm/msr.h> +#include <asm/msr-index.h> +#include <asm/processor.h> +#include <asm/div64.h> +#include <acpi/cpufreq/cpufreq.h> + +#define BYT_RATIOS 0x66a +#define BYT_VIDS 0x66b +#define BYT_TURBO_RATIOS 0x66c +#define BYT_TURBO_VIDS 0x66d + +#define FRAC_BITS 8 +#define int_tofp(X) ((int64_t)(X) << FRAC_BITS) +#define fp_toint(X) ((X) >> FRAC_BITS) + +static inline int32_t mul_fp(int32_t x, int32_t y) +{ + return ((int64_t)x * (int64_t)y) >> FRAC_BITS; +} + +static inline int32_t div_fp(int32_t x, int32_t y) +{ + return div_s64((int64_t)x << FRAC_BITS, y); +} + +static inline int ceiling_fp(int32_t x) +{ + int mask, ret; + + ret = fp_toint(x); + mask = (1 << FRAC_BITS) - 1; + if (x & mask) + ret += 1; + return ret; +} + +struct sample { + int32_t core_pct_busy; + u64 aperf; + u64 mperf; + int freq; + s_time_t time; +}; + +struct pstate_data { + int current_pstate; + int min_pstate; + int max_pstate; + int scaling; + int turbo_pstate; +}; + +struct vid_data { + int min; + int max; + int turbo; + int32_t ratio; +}; + +struct _pid { + int setpoint; + int32_t integral; + int32_t p_gain; + int32_t i_gain; + int32_t d_gain; + int deadband; + int32_t last_err; +}; + +struct cpudata { + int cpu; + + struct timer timer; + + struct pstate_data pstate; + struct vid_data vid; + struct _pid pid; + + s_time_t last_sample_time; + u64 prev_aperf; + u64 prev_mperf; + struct sample sample; +}; + +static struct cpudata **all_cpu_data; + +struct pstate_adjust_policy { + int sample_rate_ms; + int deadband; + int setpoint; + int p_gain_pct; + int d_gain_pct; + int i_gain_pct; +}; + +struct pstate_funcs { + int (*get_max)(void); + int (*get_min)(void); + int (*get_turbo)(void); + int (*get_scaling)(void); + void (*set)(struct cpudata*, int pstate); + void (*get_vid)(struct cpudata *); +}; + +struct cpu_defaults { + struct pstate_adjust_policy pid_policy; + struct pstate_funcs funcs; +}; + +static struct pstate_adjust_policy pid_params; +static struct pstate_funcs pstate_funcs; + +struct perf_limits { + int no_turbo; + int turbo_disabled; + int max_perf_pct; + int min_perf_pct; + int32_t max_perf; + int32_t min_perf; + int max_policy_pct; + int min_policy_pct; +}; + +static struct perf_limits limits = { + .no_turbo = 0, + .turbo_disabled = 0, + .max_perf_pct = 100, + .max_perf = int_tofp(1), + .min_perf_pct = 0, + .min_perf = 0, + .max_policy_pct = 100, + .min_policy_pct = 0, +}; + +static inline void pid_reset(struct _pid *pid, int setpoint, int busy, + int deadband, int integral) +{ + pid->setpoint = setpoint; + pid->deadband = deadband; + pid->integral = int_tofp(integral); + pid->last_err = int_tofp(setpoint) - int_tofp(busy); +} + +static inline void pid_p_gain_set(struct _pid *pid, int percent) +{ + pid->p_gain = div_fp(int_tofp(percent), int_tofp(100)); +} + +static inline void pid_i_gain_set(struct _pid *pid, int percent) +{ + pid->i_gain = div_fp(int_tofp(percent), int_tofp(100)); +} + +static inline void pid_d_gain_set(struct _pid *pid, int percent) +{ + pid->d_gain = div_fp(int_tofp(percent), int_tofp(100)); +} + +static signed int pid_calc(struct _pid *pid, int32_t busy) +{ + signed int result; + int32_t pterm, dterm, fp_error; + int32_t integral_limit; + + fp_error = int_tofp(pid->setpoint) - busy; + + if (abs(fp_error) <= int_tofp(pid->deadband)) + return 0; + + pterm = mul_fp(pid->p_gain, fp_error); + + pid->integral += fp_error; + + /* + * We limit the integral here so that it will never + * get higher than 30. This prevents it from becoming + * too large an input over long periods of time and allows + * it to get factored out sooner. + * The value of 30 was chosen through experimentation. + */ + integral_limit = int_tofp(30); + if (pid->integral > integral_limit) + pid->integral = integral_limit; + if (pid->integral < -integral_limit) + pid->integral = -integral_limit; + + dterm = mul_fp(pid->d_gain, fp_error - pid->last_err); + pid->last_err = fp_error; + + result = pterm + mul_fp(pid->integral, pid->i_gain) + dterm; + result = result + (1 << (FRAC_BITS-1)); + return (signed int)fp_toint(result); +} + +static inline void intel_pstate_busy_pid_reset(struct cpudata *cpu) +{ + pid_p_gain_set(&cpu->pid, pid_params.p_gain_pct); + pid_d_gain_set(&cpu->pid, pid_params.d_gain_pct); + pid_i_gain_set(&cpu->pid, pid_params.i_gain_pct); + + pid_reset(&cpu->pid, pid_params.setpoint, 100, pid_params.deadband, 0); +} + +static inline void intel_pstate_reset_all_pid(void) +{ + unsigned int cpu; + + for_each_online_cpu(cpu) { + if (all_cpu_data[cpu]) + intel_pstate_busy_pid_reset(all_cpu_data[cpu]); + } +} + +static inline void update_turbo_state(void) +{ + u64 misc_en; + struct cpudata *cpu; + + cpu = all_cpu_data[0]; + rdmsrl(MSR_IA32_MISC_ENABLE, misc_en); + limits.turbo_disabled = + (misc_en & MSR_IA32_MISC_ENABLE_TURBO_DISABLE || + cpu->pstate.max_pstate == cpu->pstate.turbo_pstate); +} + +static int byt_get_min_pstate(void) +{ + u64 value; + + rdmsrl(BYT_RATIOS, value); + return (value >> 8) & 0x7F; +} + +static int byt_get_max_pstate(void) +{ + u64 value; + + rdmsrl(BYT_RATIOS, value); + return (value >> 16) & 0x7F; +} + +static int byt_get_turbo_pstate(void) +{ + u64 value; + + rdmsrl(BYT_TURBO_RATIOS, value); + return value & 0x7F; +} + +static void byt_set_pstate(struct cpudata *cpudata, int pstate) +{ + u64 val; + int32_t vid_fp; + u32 vid; + + val = pstate << 8; + if (limits.no_turbo && !limits.turbo_disabled) + val |= (u64)1 << 32; + + vid_fp = cpudata->vid.min + mul_fp( + int_tofp(pstate - cpudata->pstate.min_pstate), + cpudata->vid.ratio); + + vid_fp = clamp_t(int32_t, vid_fp, cpudata->vid.min, cpudata->vid.max); + vid = ceiling_fp(vid_fp); + + if (pstate > cpudata->pstate.max_pstate) + vid = cpudata->vid.turbo; + + val |= vid; + + wrmsrl(MSR_IA32_PERF_CTL, val); +} + +#define BYT_BCLK_FREQS 5 +static int byt_freq_table[BYT_BCLK_FREQS] = { 833, 1000, 1333, 1167, 800}; + +static int byt_get_scaling(void) +{ + u64 value; + int i; + + rdmsrl(MSR_FSB_FREQ, value); + i = value & 0x3; + + BUG_ON(i > BYT_BCLK_FREQS); + + return byt_freq_table[i] * 100; +} + +static void byt_get_vid(struct cpudata *cpudata) +{ + u64 value; + + rdmsrl(BYT_VIDS, value); + cpudata->vid.min = int_tofp((value >> 8) & 0x7f); + cpudata->vid.max = int_tofp((value >> 16) & 0x7f); + cpudata->vid.ratio = div_fp( + cpudata->vid.max - cpudata->vid.min, + int_tofp(cpudata->pstate.max_pstate - + cpudata->pstate.min_pstate)); + + rdmsrl(BYT_TURBO_VIDS, value); + cpudata->vid.turbo = value & 0x7f; +} + +static int core_get_min_pstate(void) +{ + u64 value; + + rdmsrl(MSR_INTEL_PLATFORM_INFO, value); + return (value >> 40) & 0xFF; +} + +static int core_get_max_pstate(void) +{ + u64 value; + + rdmsrl(MSR_INTEL_PLATFORM_INFO, value); + return (value >> 8) & 0xFF; +} + +static int core_get_turbo_pstate(void) +{ + u64 value; + int nont, ret; + + rdmsrl(MSR_NHM_TURBO_RATIO_LIMIT, value); + nont = core_get_max_pstate(); + ret = (value) & 255; + if (ret <= nont) + ret = nont; + return ret; +} + +static inline int core_get_scaling(void) +{ + return 100000; +} + +static void core_set_pstate(struct cpudata *cpudata, int pstate) +{ + u64 val; + + val = pstate << 8; + if (limits.no_turbo && !limits.turbo_disabled) + val |= (u64)1 << 32; + + wrmsrl(MSR_IA32_PERF_CTL, val); +} + +static struct cpu_defaults core_params = { + .pid_policy = { + .sample_rate_ms = 10, + .deadband = 0, + .setpoint = 97, + .p_gain_pct = 20, + .d_gain_pct = 0, + .i_gain_pct = 0, + }, + .funcs = { + .get_max = core_get_max_pstate, + .get_min = core_get_min_pstate, + .get_turbo = core_get_turbo_pstate, + .get_scaling = core_get_scaling, + .set = core_set_pstate, + }, +}; + +static struct cpu_defaults byt_params = { + .pid_policy = { + .sample_rate_ms = 10, + .deadband = 0, + .setpoint = 97, + .p_gain_pct = 14, + .d_gain_pct = 0, + .i_gain_pct = 4, + }, + .funcs = { + .get_max = byt_get_max_pstate, + .get_min = byt_get_min_pstate, + .get_turbo = byt_get_turbo_pstate, + .set = byt_set_pstate, + .get_scaling = byt_get_scaling, + .get_vid = byt_get_vid, + }, +}; + +static void intel_pstate_get_min_max(struct cpudata *cpu, int *min, int *max) +{ + int max_perf = cpu->pstate.turbo_pstate; + int max_perf_adj; + int min_perf; + + if (limits.no_turbo || limits.turbo_disabled) + max_perf = cpu->pstate.max_pstate; + + /* performance can be limited by user through xenpm */ + max_perf_adj = fp_toint(mul_fp(int_tofp(max_perf), limits.max_perf)); + *max = clamp_t(int, max_perf_adj, + cpu->pstate.min_pstate, cpu->pstate.turbo_pstate); + + min_perf = fp_toint(mul_fp(int_tofp(max_perf), limits.min_perf)); + *min = clamp_t(int, min_perf, cpu->pstate.min_pstate, max_perf); +} + +static void intel_pstate_set_pstate(struct cpudata *cpu, int pstate) +{ + int max_perf, min_perf; + struct cpufreq_policy *policy; + + policy = per_cpu(cpufreq_cpu_policy, cpu->cpu); + + update_turbo_state(); + + if (limits.turbo_disabled) + policy->turbo = CPUFREQ_TURBO_UNSUPPORTED; + else if (limits.no_turbo) + policy->turbo = CPUFREQ_TURBO_DISABLED; + else + policy->turbo = CPUFREQ_TURBO_ENABLED; + + intel_pstate_get_min_max(cpu, &min_perf, &max_perf); + + pstate = clamp_t(int, pstate, min_perf, max_perf); + + if (pstate == cpu->pstate.current_pstate) + return; + + cpu->pstate.current_pstate = pstate; + policy->cur = pstate * 100000; + + pstate_funcs.set(cpu, pstate); +} + +static void intel_pstate_get_cpu_pstates(struct cpudata *cpu) +{ + cpu->pstate.min_pstate = pstate_funcs.get_min(); + cpu->pstate.max_pstate = pstate_funcs.get_max(); + cpu->pstate.turbo_pstate = pstate_funcs.get_turbo(); + cpu->pstate.scaling = pstate_funcs.get_scaling(); + + if (pstate_funcs.get_vid) + pstate_funcs.get_vid(cpu); + intel_pstate_set_pstate(cpu, cpu->pstate.min_pstate); +} + +static inline void intel_pstate_calc_busy(struct cpudata *cpu) +{ + struct sample *sample = &cpu->sample; + int64_t core_pct; + + core_pct = int_tofp(sample->aperf) * int_tofp(100); + core_pct = div64_u64(core_pct, int_tofp(sample->mperf)); + + sample->freq = fp_toint( + mul_fp(int_tofp( + cpu->pstate.max_pstate * cpu->pstate.scaling / 100), + core_pct)); + + sample->core_pct_busy = (int32_t)core_pct; +} + +static inline void intel_pstate_sample(struct cpudata *cpu) +{ + u64 aperf, mperf; + unsigned long flags; + + local_irq_save(flags); + rdmsrl(MSR_IA32_APERF, aperf); + rdmsrl(MSR_IA32_MPERF, mperf); + local_irq_restore(flags); + + cpu->last_sample_time = cpu->sample.time; + cpu->sample.time = get_s_time(); + cpu->sample.aperf = aperf; + cpu->sample.mperf = mperf; + cpu->sample.aperf -= cpu->prev_aperf; + cpu->sample.mperf -= cpu->prev_mperf; + + intel_pstate_calc_busy(cpu); + + cpu->prev_aperf = aperf; + cpu->prev_mperf = mperf; +} + +static inline void intel_pstate_set_sample_time(struct cpudata *cpu) +{ + set_timer(&cpu->timer, NOW() + MILLISECS(pid_params.sample_rate_ms)); +} + +static inline int32_t intel_pstate_get_scaled_busy(struct cpudata *cpu) +{ + int32_t core_busy, max_pstate, current_pstate, sample_ratio; + u32 duration_us; + u32 sample_time_us; + + /* + * core_busy is the ratio of actual performance to max + * max_pstate is the max non turbo pstate available + * current_pstate was the pstate that was requested during + * the last sample period. + * + * We normalize core_busy, which was our actual percent + * performance to what we requested during the last sample + * period. The result will be a percentage of busy at a + * specified pstate. + */ + core_busy = cpu->sample.core_pct_busy; + max_pstate = int_tofp(cpu->pstate.max_pstate); + current_pstate = int_tofp(cpu->pstate.current_pstate); + core_busy = mul_fp(core_busy, div_fp(max_pstate, current_pstate)); + + /* + * Since we have a deferred timer, it will not fire unless + * we are in C0. So, determine if the actual elapsed time + * is significantly greater (3x) than our sample interval. If it + * is, then we were idle for a long enough period of time + * to adjust our busyness. + */ + sample_time_us = pid_params.sample_rate_ms * 1000ULL; + duration_us = (u32)((s_time_t)(cpu->sample.time - cpu->last_sample_time) + / 1000); + if (duration_us > sample_time_us * 3) { + sample_ratio = div_fp(int_tofp(sample_time_us), + int_tofp(duration_us)); + core_busy = mul_fp(core_busy, sample_ratio); + } + + return core_busy; +} + +static inline void intel_pstate_adjust_busy_pstate(struct cpudata *cpu) +{ + int32_t busy_scaled; + struct _pid *pid; + signed int ctl; + + pid = &cpu->pid; + busy_scaled = intel_pstate_get_scaled_busy(cpu); + + ctl = pid_calc(pid, busy_scaled); + + /* Negative values of ctl increase the pstate and vice versa */ + intel_pstate_set_pstate(cpu, cpu->pstate.current_pstate - ctl); +} + +static void intel_pstate_timer_func(void *__data) +{ + struct cpudata *cpu = (struct cpudata *) __data; + + intel_pstate_sample(cpu); + + intel_pstate_adjust_busy_pstate(cpu); + + intel_pstate_set_sample_time(cpu); +} + +#define ICPU(model, policy) \ + { X86_VENDOR_INTEL, 6, model, X86_FEATURE_APERFMPERF,\ + (unsigned long)&policy } + +static const struct x86_cpu_id intel_pstate_cpu_ids[] = { + ICPU(0x2a, core_params), + ICPU(0x2d, core_params), + ICPU(0x37, byt_params), + ICPU(0x3a, core_params), + ICPU(0x3c, core_params), + ICPU(0x3d, core_params), + ICPU(0x3e, core_params), + ICPU(0x3f, core_params), + ICPU(0x45, core_params), + ICPU(0x46, core_params), + ICPU(0x47, core_params), + ICPU(0x4c, byt_params), + ICPU(0x4e, core_params), + ICPU(0x4f, core_params), + ICPU(0x56, core_params), + {} +}; + +static int intel_pstate_init_cpu(unsigned int cpunum) +{ + struct cpudata *cpu; + s_time_t expires; + + if (!all_cpu_data[cpunum]) + all_cpu_data[cpunum] = xzalloc(struct cpudata); + if (!all_cpu_data[cpunum]) + return -ENOMEM; + + cpu = all_cpu_data[cpunum]; + + cpu->cpu = cpunum; + intel_pstate_get_cpu_pstates(cpu); + + init_timer(&cpu->timer, intel_pstate_timer_func, cpu, cpunum); + expires = NOW() + MILLISECS(10); + + intel_pstate_busy_pid_reset(cpu); + intel_pstate_sample(cpu); + + set_timer(&cpu->timer, expires); + + return 0; +} + +static int intel_pstate_set_policy(struct cpufreq_policy *policy) +{ + if (!policy->cpuinfo.max_freq) + return -ENODEV; + + if (policy->policy == CPUFREQ_POLICY_PERFORMANCE) { + limits.no_turbo = 0; + limits.max_perf_pct = 100; + limits.max_perf = int_tofp(1); + limits.min_perf_pct = 100; + limits.min_perf = int_tofp(1); + policy->max_perf_pct = 100; + policy->min_perf_pct = 100; + return 0; + } else if (policy->policy == CPUFREQ_POLICY_POWERSAVE) { + limits.min_perf_pct = limits.min_policy_pct; + limits.min_perf = div_fp(int_tofp(limits.min_perf_pct), int_tofp(100)); + limits.max_perf_pct = limits.min_perf_pct; + limits.max_perf = limits.min_perf; + policy->min_perf_pct = limits.min_policy_pct; + policy->max_perf_pct = policy->min_perf_pct; + return 0; + } else if (policy->policy == CPUFREQ_POLICY_USERSPACE) { + limits.max_perf_pct = max(limits.min_policy_pct, policy->max_perf_pct); + limits.max_perf_pct = min(limits.max_policy_pct, limits.max_perf_pct); + limits.max_perf = div_fp(int_tofp(limits.max_perf_pct), int_tofp(100)); + limits.min_perf_pct = limits.max_perf_pct; + limits.min_perf = limits.max_perf; + policy->max_perf_pct = limits.max_perf_pct; + policy->min_perf_pct = limits.max_perf_pct; + return 0; + } else { + limits.min_perf_pct = max(limits.min_policy_pct, policy->min_perf_pct); + limits.min_perf = div_fp(int_tofp(limits.min_perf_pct), int_tofp(100)); + limits.max_perf_pct = min(limits.max_policy_pct, policy->max_perf_pct); + limits.max_perf = div_fp(int_tofp(limits.max_perf_pct), int_tofp(100)); + policy->min_perf_pct = limits.min_perf_pct; + policy->max_perf_pct = limits.max_perf_pct; + return 0; + } +} + +static int intel_pstate_verify_policy(struct cpufreq_policy *policy) +{ + cpufreq_verify_within_limits(policy, policy->cpuinfo.min_freq, + policy->cpuinfo.max_freq); + + if ( policy->policy != CPUFREQ_POLICY_POWERSAVE && + policy->policy != CPUFREQ_POLICY_PERFORMANCE && + policy->policy != CPUFREQ_POLICY_USERSPACE && + policy->policy != CPUFREQ_POLICY_ONDEMAND ) + return -EINVAL; + + return 0; +} + +static int intel_pstate_cpu_exit(struct cpufreq_policy *policy) +{ + int cpu_num = policy->cpu; + struct cpudata *cpu = all_cpu_data[cpu_num]; + + kill_timer(&all_cpu_data[cpu_num]->timer); + + intel_pstate_set_pstate(cpu, cpu->pstate.min_pstate); + + return 0; +} + +static int intel_pstate_turbo_update(int cpuid, struct cpufreq_policy *policy) +{ + struct cpudata *cpu = all_cpu_data[0]; + + update_turbo_state(); + if (limits.turbo_disabled) { + printk("Turbo disabled by BIOS or not supported on processor\n"); + return -EINVAL; + } + limits.no_turbo = policy->turbo == CPUFREQ_TURBO_ENABLED ? 0 : 1; + + if (limits.no_turbo) + policy->cpuinfo.max_freq = + cpu->pstate.max_pstate * cpu->pstate.scaling; + else + policy->cpuinfo.max_freq = + cpu->pstate.turbo_pstate * cpu->pstate.scaling; + + policy->max = clamp_t(unsigned int, policy->max, + policy->cpuinfo.min_freq, policy->cpuinfo.max_freq); + + return 0; +} + +static int get_turbo_pct(void) +{ + struct cpudata *cpu; + int total, no_turbo, turbo_pct; + uint32_t turbo_fp; + + cpu = all_cpu_data[0]; + + total = cpu->pstate.turbo_pstate - cpu->pstate.min_pstate + 1; + no_turbo = cpu->pstate.max_pstate - cpu->pstate.min_pstate + 1; + turbo_fp = div_fp(int_tofp(no_turbo), int_tofp(total)); + turbo_pct = 100 - fp_toint(mul_fp(turbo_fp, int_tofp(100))); + return turbo_pct; +} + +static int intel_pstate_cpu_init(struct cpufreq_policy *policy) +{ + struct cpudata *cpu; + int rc; + + rc = intel_pstate_init_cpu(policy->cpu); + if (rc) + return rc; + + cpu = all_cpu_data[policy->cpu]; + if (limits.min_perf_pct == 100 && limits.max_perf_pct == 100) + policy->policy = CPUFREQ_POLICY_PERFORMANCE; + else + policy->policy = CPUFREQ_POLICY_ONDEMAND; + + policy->min = cpu->pstate.min_pstate * cpu->pstate.scaling; + policy->max = cpu->pstate.turbo_pstate * cpu->pstate.scaling; + policy->min_perf_pct = 0; + policy->max_perf_pct = 100; + policy->turbo_pct = get_turbo_pct(); + + /* cpuinfo and default policy values */ + policy->cpuinfo.min_freq = cpu->pstate.min_pstate * cpu->pstate.scaling; + policy->cpuinfo.max_freq = + cpu->pstate.turbo_pstate * cpu->pstate.scaling; + policy->cpuinfo.transition_latency = CPUFREQ_ETERNAL; + cpumask_set_cpu(policy->cpu, policy->cpus); + + /* the first cpu do the init work for limits.min/max_policy_pct */ + if (cpu->cpu == 0) { + limits.min_policy_pct = (policy->min * 100) / policy->cpuinfo.max_freq; + limits.min_policy_pct = clamp_t(int, limits.min_policy_pct, 0, 100); + limits.max_policy_pct = (policy->max * 100) / policy->cpuinfo.max_freq; + limits.max_policy_pct = clamp_t(int, limits.max_policy_pct, 0, 100); + } + + return 0; +} + +static struct cpufreq_driver intel_pstate_driver = { + .verify = intel_pstate_verify_policy, + .setpolicy = intel_pstate_set_policy, + .init = intel_pstate_cpu_init, + .exit = intel_pstate_cpu_exit, + .update = intel_pstate_turbo_update, + .name = "intel_pstate", +}; + +static int intel_pstate_msrs_not_valid(void) +{ + /* Check that all the msr's we are using are valid. */ + u64 aperf, mperf, tmp; + + rdmsrl(MSR_IA32_APERF, aperf); + rdmsrl(MSR_IA32_MPERF, mperf); + + if (!pstate_funcs.get_max() || + !pstate_funcs.get_min() || + !pstate_funcs.get_turbo()) + return -ENODEV; + + rdmsrl(MSR_IA32_APERF, tmp); + if (!(tmp - aperf)) + return -ENODEV; + + rdmsrl(MSR_IA32_MPERF, tmp); + if (!(tmp - mperf)) + return -ENODEV; + + return 0; +} + +static void copy_pid_params(struct pstate_adjust_policy *policy) +{ + pid_params.sample_rate_ms = policy->sample_rate_ms; + pid_params.p_gain_pct = policy->p_gain_pct; + pid_params.i_gain_pct = policy->i_gain_pct; + pid_params.d_gain_pct = policy->d_gain_pct; + pid_params.deadband = policy->deadband; + pid_params.setpoint = policy->setpoint; +} + +static void copy_cpu_funcs(struct pstate_funcs *funcs) +{ + pstate_funcs.get_max = funcs->get_max; + pstate_funcs.get_min = funcs->get_min; + pstate_funcs.get_turbo = funcs->get_turbo; + pstate_funcs.get_scaling = funcs->get_scaling; + pstate_funcs.set = funcs->set; + pstate_funcs.get_vid = funcs->get_vid; +} + +int __init intel_pstate_init(void) +{ + int cpu, rc = 0; + const struct x86_cpu_id *id; + struct cpu_defaults *cpu_info; + + if (cpuid_ecx(6) & 0x1) + set_bit(X86_FEATURE_APERFMPERF, &boot_cpu_data.x86_capability); + + id = x86_match_cpu(intel_pstate_cpu_ids); + if (!id) + return -ENODEV; + + cpu_info = (struct cpu_defaults *)id->driver_data; + + copy_pid_params(&cpu_info->pid_policy); + copy_cpu_funcs(&cpu_info->funcs); + + if (intel_pstate_msrs_not_valid()) + return -ENODEV; + + all_cpu_data = xzalloc_array(struct cpudata *, num_online_cpus()); + if (!all_cpu_data) + return -ENOMEM; + + rc = cpufreq_register_driver(&intel_pstate_driver); + if (rc) + goto out; + + return rc; +out: + for_each_online_cpu(cpu) { + if (all_cpu_data[cpu]) { + kill_timer(&all_cpu_data[cpu]->timer); + xfree(all_cpu_data[cpu]); + } + } + xfree(all_cpu_data); + return -ENODEV; +} diff --git a/xen/include/acpi/cpufreq/cpufreq.h b/xen/include/acpi/cpufreq/cpufreq.h index 85a055f..abda363 100644 --- a/xen/include/acpi/cpufreq/cpufreq.h +++ b/xen/include/acpi/cpufreq/cpufreq.h @@ -34,6 +34,12 @@ struct acpi_cpufreq_data { extern struct acpi_cpufreq_data *cpufreq_drv_data[NR_CPUS]; +/* + * Maximum transition latency is in nanoseconds - if it's unknown, + * CPUFREQ_ETERNAL shall be used. + */ +#define CPUFREQ_ETERNAL (-1) + struct cpufreq_cpuinfo { unsigned int max_freq; unsigned int second_max_freq; /* P1 if Turbo Mode is on */ diff --git a/xen/include/asm-x86/acpi.h b/xen/include/asm-x86/acpi.h index 5e85b38..505d7e7 100644 --- a/xen/include/asm-x86/acpi.h +++ b/xen/include/asm-x86/acpi.h @@ -32,6 +32,8 @@ #define COMPILER_DEPENDENT_INT64 long long #define COMPILER_DEPENDENT_UINT64 unsigned long long +extern int intel_pstate_init(void); + /* * Calling conventions: * diff --git a/xen/include/asm-x86/cpufeature.h b/xen/include/asm-x86/cpufeature.h index 7963a3a..efc9711 100644 --- a/xen/include/asm-x86/cpufeature.h +++ b/xen/include/asm-x86/cpufeature.h @@ -69,6 +69,7 @@ #define X86_FEATURE_XTOPOLOGY (3*32+13) /* cpu topology enum extensions */ #define X86_FEATURE_CPUID_FAULTING (3*32+14) /* cpuid faulting */ #define X86_FEATURE_CLFLUSH_MONITOR (3*32+15) /* clflush reqd with monitor */ +#define X86_FEATURE_APERFMPERF (3*32+28) /* APERFMPERF */ /* Intel-defined CPU features, CPUID level 0x00000001 (ecx), word 4 */ #define X86_FEATURE_XMM3 (4*32+ 0) /* Streaming SIMD Extensions-3 */ diff --git a/xen/include/asm-x86/msr-index.h b/xen/include/asm-x86/msr-index.h index 83f2f70..57945d9 100644 --- a/xen/include/asm-x86/msr-index.h +++ b/xen/include/asm-x86/msr-index.h @@ -52,6 +52,8 @@ #define MSR_IA32_MCG_STATUS 0x0000017a #define MSR_IA32_MCG_CTL 0x0000017b +#define MSR_NHM_TURBO_RATIO_LIMIT 0x000001ad + #define MSR_IA32_PEBS_ENABLE 0x000003f1 #define MSR_IA32_DS_AREA 0x00000600 #define MSR_IA32_PERF_CAPABILITIES 0x00000345 @@ -319,6 +321,7 @@ #define MSR_IA32_MISC_ENABLE_MONITOR_ENABLE (1<<18) #define MSR_IA32_MISC_ENABLE_LIMIT_CPUID (1<<22) #define MSR_IA32_MISC_ENABLE_XTPR_DISABLE (1<<23) +#define MSR_IA32_MISC_ENABLE_TURBO_DISABLE (1ULL<<38) #define MSR_IA32_TSC_DEADLINE 0x000006E0 #define MSR_IA32_ENERGY_PERF_BIAS 0x000001b0 -- 1.9.1 _______________________________________________ Xen-devel mailing list Xen-devel@lists.xen.org http://lists.xen.org/xen-devel