This is the guest part of the kvm clock implementation. It does not do tsc-only timing, as the tsc can have deltas between cpus, and it did not seem worthwhile to me to keep adjusting for them.
We do use it, however, for fine-grained adjustment. Other than that, time comes from the host. Signed-off-by: Glauber de Oliveira Costa <[EMAIL PROTECTED]> --- arch/x86/Kconfig | 10 +++ arch/x86/kernel/Makefile_32 | 1 + arch/x86/kernel/kvmclock.c | 161 +++++++++++++++++++++++++++++++++++++++++++ arch/x86/kernel/setup_32.c | 5 ++ 4 files changed, 177 insertions(+), 0 deletions(-) create mode 100644 arch/x86/kernel/kvmclock.c diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index ab2df55..968315e 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -350,6 +350,16 @@ config VMI at the moment), by linking the kernel to a GPL-ed ROM module provided by the hypervisor. +config KVM_CLOCK + bool "KVM paravirtualized clock" + select PARAVIRT + help + Turning on this option will allow you to run a paravirtualized clock + when running over the KVM hypervisor. Instead of relying on a PIT + (or probably other) emulation by the underlying device model, the host + provides the guest with timing infrastructure, as time of day, and + timer expiration. + source "arch/x86/lguest/Kconfig" endif diff --git a/arch/x86/kernel/Makefile_32 b/arch/x86/kernel/Makefile_32 index a7bc93c..f6332b6 100644 --- a/arch/x86/kernel/Makefile_32 +++ b/arch/x86/kernel/Makefile_32 @@ -44,6 +44,7 @@ obj-$(CONFIG_K8_NB) += k8.o obj-$(CONFIG_MGEODE_LX) += geode_32.o mfgpt_32.o obj-$(CONFIG_VMI) += vmi_32.o vmiclock_32.o +obj-$(CONFIG_KVM_CLOCK) += kvmclock.o obj-$(CONFIG_PARAVIRT) += paravirt_32.o obj-y += pcspeaker.o diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c new file mode 100644 index 0000000..317bee0 --- /dev/null +++ b/arch/x86/kernel/kvmclock.c @@ -0,0 +1,161 @@ +/* KVM paravirtual clock driver. A clocksource implementation + Copyright (C) 2008 Glauber de Oliveira Costa, Red Hat Inc. 
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 2 of the License, or
+   (at your option) any later version.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, write to the Free Software
+   Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+*/
+
+#include <linux/clocksource.h>
+#include <linux/kvm_para.h>
+#include <asm/arch_hooks.h>
+#include <asm/msr.h>
+
+
+/*
+ * Fixed-point shift: kvm_get_delta() multiplies a tsc delta by the
+ * host-provided tsc_to_system_mul and shifts the product right by this
+ * many bits. The kvm_clock clocksource uses the same value for its
+ * mult/shift pair.
+ */
+#define KVM_SCALE 22
+
+/* Non-zero unless "no-kvmclock" was given; checked by kvmclock_init(). */
+static int kvmclock = 1;
+
+/* early_param handler for "no-kvmclock": clears the kvmclock flag. */
+static int __init parse_no_kvmclock(char *arg)
+{
+	kvmclock = 0;
+	return 0;
+}
+early_param("no-kvmclock", parse_no_kvmclock);
+
+/*
+ * Page-aligned area shared with the host; kvm_register_clock() hands its
+ * physical address to the hypervisor.
+ */
+struct xen_shared_info shared_info __attribute__((__aligned__(PAGE_SIZE)));
+
+/* The hypervisor will put information about time periodically here */
+DEFINE_PER_CPU(struct xen_vcpu_time_info *, hv_clock);
+
+/*
+ * Read one field of the given cpu's time info. The expansion is fully
+ * parenthesized so it can be used safely inside larger expressions.
+ */
+#define get_clock(cpu, field) (per_cpu(hv_clock, (cpu))->field)
+
+/*
+ * Scale the tsc cycles elapsed since @last_tsc with the current cpu's
+ * tsc_to_system_mul (KVM_SCALE fractional bits). Both callers add the
+ * result to a nanosecond value.
+ *
+ * NOTE(review): this uses smp_processor_id(), so the caller presumably
+ * must not migrate between sampling @last_tsc and calling this function.
+ */
+static inline u64 kvm_get_delta(u64 last_tsc)
+{
+	int cpu = smp_processor_id();
+	u64 delta = native_read_tsc() - last_tsc;
+
+	return (delta * get_clock(cpu, tsc_to_system_mul)) >> KVM_SCALE;
+}
+
+/*
+ * The wallclock is the time of day when we booted. Since then, some time may
+ * have elapsed since the hypervisor wrote the data. So we try to account for
+ * that.
Even if the tsc is not accurate, it gives us a more accurate timing
+ * than not adjusting at all
+ *
+ * The wc_* fields are re-read until wc_version is even and unchanged
+ * across the read. Preemption is disabled while the per-cpu tsc_timestamp
+ * and the tsc itself are sampled, so that both come from the same cpu
+ * (kvm_clock_read() does the same).
+ *
+ * Returns the wallclock in seconds, plus one (see the comment at the
+ * return statement).
+ */
+unsigned long kvm_get_wallclock(void)
+{
+	u32 wc_sec, wc_nsec;
+	u64 delta, last_tsc;
+	struct timespec ts;
+	int version, nsec, cpu;
+
+	/* hv_clock is per-cpu and the tsc differs between cpus: stay put. */
+	preempt_disable();
+	cpu = smp_processor_id();
+
+	do {
+		version = shared_info.wc_version;
+		rmb();
+		wc_sec = shared_info.wc_sec;
+		wc_nsec = shared_info.wc_nsec;
+		last_tsc = get_clock(cpu, tsc_timestamp);
+		rmb();
+	} while ((shared_info.wc_version != version) || (version & 1));
+
+	delta = kvm_get_delta(last_tsc);
+	preempt_enable();
+
+	/* do_div() leaves the whole seconds in delta, returns the remainder */
+	delta += wc_nsec;
+	nsec = do_div(delta, NSEC_PER_SEC);
+	set_normalized_timespec(&ts, wc_sec + delta, nsec);
+	/*
+	 * Of all mechanisms of time adjustment I've tested, this one
+	 * was the champion!
+	 */
+	return ts.tv_sec + 1;
+}
+
+/*
+ * Setting the wallclock is not implemented: @now is ignored and 0 is
+ * always returned.
+ *
+ * NOTE(review): this reports success although nothing is written;
+ * confirm that callers are fine with that.
+ */
+int kvm_set_wallclock(unsigned long now)
+{
+	return 0;
+}
+
+/*
+ * This is our read_clock function. The host puts a tsc timestamp each time
+ * it updates a new time, and then we can use it to derive a slightly more
+ * precise notion of elapsed time, converted to nanoseconds.
+ */
+static cycle_t kvm_clock_read(void)
+{
+	u64 tsc_stamp, ns;
+	u32 seq;
+	int this_cpu;
+
+	preempt_disable();
+	this_cpu = smp_processor_id();
+
+	/* Take a snapshot; go around again if the version moved or is odd. */
+	for (;;) {
+		seq = get_clock(this_cpu, version);
+		rmb();
+		tsc_stamp = get_clock(this_cpu, tsc_timestamp);
+		ns = get_clock(this_cpu, system_time);
+		rmb();
+		if (get_clock(this_cpu, version) == seq && !(seq & 1))
+			break;
+	}
+
+	/* Add the time elapsed on this cpu since the host's timestamp. */
+	ns += kvm_get_delta(tsc_stamp);
+	preempt_enable();
+
+	return ns;
+}
+
+/* Clocksource backed by kvm_clock_read(); mult/shift cancel each other. */
+static struct clocksource kvm_clock = {
+	.name	= "kvm-clock",
+	.rating	= 400,
+	.read	= kvm_clock_read,
+	.mask	= CLOCKSOURCE_MASK(64),
+	.shift	= KVM_SCALE,
+	.mult	= 1 << KVM_SCALE,
+	.flags	= CLOCK_SOURCE_IS_CONTINUOUS,
+};
+
+/*
+ * Point the current cpu's hv_clock at its slot in shared_info, then pass
+ * the physical address of shared_info to the host by writing it to
+ * MSR_KVM_PARAVIRT_CLOCK. Returns the result of the MSR write; the
+ * callers treat non-zero as failure.
+ */
+static int kvm_register_clock(void)
+{
+	const int self = smp_processor_id();
+	u64 info_pa = __pa(&shared_info);
+
+	__get_cpu_var(hv_clock) = &shared_info.vcpu_info[self].time;
+
+	return native_write_msr_safe(MSR_KVM_PARAVIRT_CLOCK, info_pa);
+}
+
+/* Register the clock on a secondary cpu, then do the native APIC setup. */
+static void kvm_setup_secondary_clock(void)
+{
+	int err;
+
+	/*
+	 * The boot cpu has already initialized this clocksource, so a
+	 * failure here is unexpected; just warn about it.
+	 */
+	err = kvm_register_clock();
+	WARN_ON(err);
+
+	/* our part is done, hand over to the native implementation */
+	setup_secondary_APIC_clock();
+}
+
+/*
+ * Install the kvm clock: hook the wallclock, sched_clock and secondary
+ * clock setup paravirt ops and register the clocksource. Does nothing
+ * unless we run on KVM, "no-kvmclock" was not given, the host offers
+ * KVM_FEATURE_CLOCKSOURCE and registering the clock succeeds.
+ */
+void __init kvmclock_init(void)
+{
+	if (!kvm_para_available())
+		return;
+
+	if (!kvmclock || !kvm_para_has_feature(KVM_FEATURE_CLOCKSOURCE))
+		return;
+
+	if (kvm_register_clock())
+		return;
+
+	pv_time_ops.get_wallclock = kvm_get_wallclock;
+	pv_time_ops.set_wallclock = kvm_set_wallclock;
+	pv_time_ops.sched_clock = kvm_clock_read;
+	pv_apic_ops.setup_secondary_clock = kvm_setup_secondary_clock;
+	clocksource_register(&kvm_clock);
+}
diff --git a/arch/x86/kernel/setup_32.c b/arch/x86/kernel/setup_32.c
index 9c24b45..89c0eb2 100644
--- a/arch/x86/kernel/setup_32.c
+++ b/arch/x86/kernel/setup_32.c
@@ -44,6 +44,7 @@
 #include <linux/crash_dump.h>
 #include <linux/dmi.h>
 #include <linux/pfn.h>
+#include <linux/kvm_para.h>
 
 #include <video/edid.h>
 
@@ -614,6 +615,10 @@ void __init setup_arch(char **cmdline_p)
 
 	max_low_pfn = setup_memory();
 
+#ifdef CONFIG_KVM_CLOCK
+	kvmclock_init();
+#endif
+
 #ifdef CONFIG_VMI
 	/*
 	 * Must be after max_low_pfn is determined, and before kernel
-- 
1.5.0.6

-------------------------------------------------------------------------
This SF.net email is sponsored by: Microsoft
Defy all challenges. Microsoft(R) Visual Studio 2008.
http://clk.atdmt.com/MRT/go/vse0120000070mrt/direct/01/
_______________________________________________
kvm-devel mailing list
kvm-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/kvm-devel