[PATCH 28/28] Add a sched_clock paravirt_op
The tsc-based get_scheduled_cycles interface is not a good match for Xen's runstate accounting, which reports everything in nanoseconds. This patch replaces this interface with a sched_clock interface, which matches both Xen and VMI's requirements. In order to do this, we: 1. replace get_scheduled_cycles with sched_clock 2. hoist cycles_2_ns into a common header 3. update vmi accordingly One thing to note: because sched_clock is implemented as a weak function in kernel/sched.c, we must define a real function in order to override this weak binding. This means the usual paravirt_ops technique of using an inline function won't work in this case. Signed-off-by: Jeremy Fitzhardinge <[EMAIL PROTECTED]> Cc: Zachary Amsden <[EMAIL PROTECTED]> Cc: Dan Hecht <[EMAIL PROTECTED]> Cc: john stultz <[EMAIL PROTECTED]> --- arch/i386/kernel/paravirt.c|2 - arch/i386/kernel/sched-clock.c | 43 ++--- arch/i386/kernel/vmi.c |2 - arch/i386/kernel/vmiclock.c|6 ++--- include/asm-i386/paravirt.h|7 -- include/asm-i386/timer.h | 46 +++- include/asm-i386/vmi_time.h|2 - 7 files changed, 73 insertions(+), 35 deletions(-) === --- a/arch/i386/kernel/paravirt.c +++ b/arch/i386/kernel/paravirt.c @@ -268,7 +268,7 @@ struct paravirt_ops paravirt_ops = { .write_msr = native_write_msr_safe, .read_tsc = native_read_tsc, .read_pmc = native_read_pmc, - .get_scheduled_cycles = native_read_tsc, + .sched_clock = native_sched_clock, .get_cpu_khz = native_calculate_cpu_khz, .load_tr_desc = native_load_tr_desc, .set_ldt = native_set_ldt, === --- a/arch/i386/kernel/sched-clock.c +++ b/arch/i386/kernel/sched-clock.c @@ -35,28 +35,7 @@ * [EMAIL PROTECTED] "math is hard, lets go shopping!" 
*/ -#define CYC2NS_SCALE_FACTOR 10 /* 2^10, carefully chosen */ - -struct sc_data { - unsigned int cyc2ns_scale; - unsigned long long sync_tsc; - unsigned long long ns_base; - unsigned long long last_val; - unsigned long long sync_jiffies; -}; - -static DEFINE_PER_CPU(struct sc_data, sc_data); - -static inline unsigned long long cycles_2_ns(struct sc_data *sc, unsigned long long cyc) -{ - unsigned long long ns; - - cyc -= sc->sync_tsc; - ns = (cyc * sc->cyc2ns_scale) >> CYC2NS_SCALE_FACTOR; - ns += sc->ns_base; - - return ns; -} +DEFINE_PER_CPU(struct sc_data, sc_data); /* * Scheduler clock - returns current time in nanosec units. @@ -66,7 +45,7 @@ static inline unsigned long long cycles_ * [1] no attempt to stop CPU instruction reordering, which can hit * in a 100 instruction window or so. */ -unsigned long long sched_clock(void) +unsigned long long native_sched_clock(void) { unsigned long long r; struct sc_data *sc = &get_cpu_var(sc_data); @@ -81,8 +60,8 @@ unsigned long long sched_clock(void) sc->last_val = r; local_irq_restore(flags); } else { - get_scheduled_cycles(r); - r = cycles_2_ns(sc, r); + rdtscll(r); + r = cycles_2_ns(r); sc->last_val = r; } @@ -90,6 +69,18 @@ unsigned long long sched_clock(void) return r; } + +/* We need to define a real function for sched_clock, to override the + weak default version */ +#ifdef CONFIG_PARAVIRT +unsigned long long sched_clock(void) +{ + return paravirt_sched_clock(); +} +#else +unsigned long long sched_clock(void) + __attribute__((alias("native_sched_clock"))); +#endif /* Resync with new CPU frequency */ static void resync_sc_freq(struct sc_data *sc, unsigned int newfreq) @@ -103,7 +94,7 @@ static void resync_sc_freq(struct sc_dat because sched_clock callers should be able to tolerate small errors. 
*/ sc->ns_base = ktime_to_ns(ktime_get()); - get_scheduled_cycles(sc->sync_tsc); + rdtscll(sc->sync_tsc); sc->cyc2ns_scale = (1000000 << CYC2NS_SCALE_FACTOR) / newfreq; } === --- a/arch/i386/kernel/vmi.c +++ b/arch/i386/kernel/vmi.c @@ -887,7 +887,7 @@ static inline int __init activate_vmi(vo paravirt_ops.setup_boot_clock = vmi_time_bsp_init; paravirt_ops.setup_secondary_clock = vmi_time_ap_init; #endif - paravirt_ops.get_scheduled_cycles = vmi_get_sched_cycles; + paravirt_ops.sched_clock = vmi_sched_clock; paravirt_ops.get_cpu_khz = vmi_cpu_khz; /* We have true wallclock functions; disable CMOS clock sync */ === --- a/arch/i386/kernel/vmiclock.c +++
[PATCH 28/28] Add a sched_clock paravirt_op
The tsc-based get_scheduled_cycles interface is not a good match for Xen's runstate accounting, which reports everything in nanoseconds. This patch replaces this interface with a sched_clock interface, which matches both Xen and VMI's requirements. In order to do this, we: 1. replace get_scheduled_cycles with sched_clock 2. hoist cycles_2_ns into a common header 3. update vmi accordingly One thing to note: because sched_clock is implemented as a weak function in kernel/sched.c, we must define a real function in order to override this weak binding. This means the usual paravirt_ops technique of using an inline function won't work in this case. Signed-off-by: Jeremy Fitzhardinge <[EMAIL PROTECTED]> Cc: Zachary Amsden <[EMAIL PROTECTED]> Cc: Dan Hecht <[EMAIL PROTECTED]> Cc: john stultz <[EMAIL PROTECTED]> --- arch/i386/kernel/paravirt.c|2 - arch/i386/kernel/sched-clock.c | 43 ++--- arch/i386/kernel/vmi.c |2 - arch/i386/kernel/vmiclock.c|6 ++--- include/asm-i386/paravirt.h|7 -- include/asm-i386/timer.h | 46 +++- include/asm-i386/vmi_time.h|2 - 7 files changed, 73 insertions(+), 35 deletions(-) === --- a/arch/i386/kernel/paravirt.c +++ b/arch/i386/kernel/paravirt.c @@ -268,7 +268,7 @@ struct paravirt_ops paravirt_ops = { .write_msr = native_write_msr_safe, .read_tsc = native_read_tsc, .read_pmc = native_read_pmc, - .get_scheduled_cycles = native_read_tsc, + .sched_clock = native_sched_clock, .get_cpu_khz = native_calculate_cpu_khz, .load_tr_desc = native_load_tr_desc, .set_ldt = native_set_ldt, === --- a/arch/i386/kernel/sched-clock.c +++ b/arch/i386/kernel/sched-clock.c @@ -35,28 +35,7 @@ * [EMAIL PROTECTED] "math is hard, lets go shopping!" 
*/ -#define CYC2NS_SCALE_FACTOR 10 /* 2^10, carefully chosen */ - -struct sc_data { - unsigned int cyc2ns_scale; - unsigned long long sync_tsc; - unsigned long long ns_base; - unsigned long long last_val; - unsigned long long sync_jiffies; -}; - -static DEFINE_PER_CPU(struct sc_data, sc_data); - -static inline unsigned long long cycles_2_ns(struct sc_data *sc, unsigned long long cyc) -{ - unsigned long long ns; - - cyc -= sc->sync_tsc; - ns = (cyc * sc->cyc2ns_scale) >> CYC2NS_SCALE_FACTOR; - ns += sc->ns_base; - - return ns; -} +DEFINE_PER_CPU(struct sc_data, sc_data); /* * Scheduler clock - returns current time in nanosec units. @@ -66,7 +45,7 @@ static inline unsigned long long cycles_ * [1] no attempt to stop CPU instruction reordering, which can hit * in a 100 instruction window or so. */ -unsigned long long sched_clock(void) +unsigned long long native_sched_clock(void) { unsigned long long r; struct sc_data *sc = &get_cpu_var(sc_data); @@ -81,8 +60,8 @@ unsigned long long sched_clock(void) sc->last_val = r; local_irq_restore(flags); } else { - get_scheduled_cycles(r); - r = cycles_2_ns(sc, r); + rdtscll(r); + r = cycles_2_ns(r); sc->last_val = r; } @@ -90,6 +69,18 @@ unsigned long long sched_clock(void) return r; } + +/* We need to define a real function for sched_clock, to override the + weak default version */ +#ifdef CONFIG_PARAVIRT +unsigned long long sched_clock(void) +{ + return paravirt_sched_clock(); +} +#else +unsigned long long sched_clock(void) + __attribute__((alias("native_sched_clock"))); +#endif /* Resync with new CPU frequency */ static void resync_sc_freq(struct sc_data *sc, unsigned int newfreq) @@ -103,7 +94,7 @@ static void resync_sc_freq(struct sc_dat because sched_clock callers should be able to tolerate small errors. 
*/ sc->ns_base = ktime_to_ns(ktime_get()); - get_scheduled_cycles(sc->sync_tsc); + rdtscll(sc->sync_tsc); sc->cyc2ns_scale = (1000000 << CYC2NS_SCALE_FACTOR) / newfreq; } === --- a/arch/i386/kernel/vmi.c +++ b/arch/i386/kernel/vmi.c @@ -887,7 +887,7 @@ static inline int __init activate_vmi(vo paravirt_ops.setup_boot_clock = vmi_time_bsp_init; paravirt_ops.setup_secondary_clock = vmi_time_ap_init; #endif - paravirt_ops.get_scheduled_cycles = vmi_get_sched_cycles; + paravirt_ops.sched_clock = vmi_sched_clock; paravirt_ops.get_cpu_khz = vmi_cpu_khz; /* We have true wallclock functions; disable CMOS clock sync */ === --- a/arch/i386/kernel/vmiclock.c +++ b/arch/i386/kernel/vmiclock.c @@ -65,9