Hi,

Attached is a first draft of a paravirt timer implementation for KVM. It
is inspired by Anthony's last patch on the subject, but not that much
based on it.

I'm not using hypercalls to get the current time, but rather,
registering an address that will get timer updates once in a while.

Also, it includes a clockevent oneshot implementation (which is the
very point of this patch), which will allow us interesting things like
dynticks.

It's not yet working on SMP, and I'm currently not sure why (ok, ok, if
you actually read the patch, the reason will become obvious: it only
delivers interrupts for vector 0x20 — but I'm further along with it; this
patch is just a snapshot).

My next TODOs with it are:
* Get SMP working
* Try something for stolen time, following Jeremy's last suggestion on Anthony's patch
* Measure the time a hypercall takes, and subtract this time when
calculating the expiry time for the timer event.
* Testing and fixing bugs: I'm sure they exist!

Meanwhile, all your suggestions are welcome.

-- 
Glauber de Oliveira Costa.
"Free as in Freedom"
http://glommer.net

"The less confident you are, the more serious you have to act."
diff --git a/arch/i386/Kconfig b/arch/i386/Kconfig
index 97b64d7..622e4d2 100644
--- a/arch/i386/Kconfig
+++ b/arch/i386/Kconfig
@@ -236,6 +236,15 @@ config VMI
 	  (it could be used by other hypervisors in theory too, but is not
 	  at the moment), by linking the kernel to a GPL-ed ROM module
 	  provided by the hypervisor.
+config KVM_CLOCK
+	bool "KVM paravirtualized clock"
+	depends on PARAVIRT && GENERIC_CLOCKEVENTS
+	help
+	  Turning on this option will allow you to run a paravirtualized clock
+	  when running over the KVM hypervisor. Instead of relying on PIT
+	  (or other) emulation by the underlying device model, the host
+	  provides the guest with timing infrastructure, such as time of day
+	  and timer expiration.
 
 config ACPI_SRAT
 	bool
diff --git a/arch/i386/kernel/Makefile b/arch/i386/kernel/Makefile
index 9d33b00..90c5dc4 100644
--- a/arch/i386/kernel/Makefile
+++ b/arch/i386/kernel/Makefile
@@ -42,6 +42,7 @@ obj-$(CONFIG_K8_NB)		+= k8.o
 obj-$(CONFIG_MGEODE_LX)		+= geode.o
 
 obj-$(CONFIG_VMI)		+= vmi.o vmiclock.o
+obj-$(CONFIG_KVM_CLOCK)		+= kvmclock.o
 obj-$(CONFIG_PARAVIRT)		+= paravirt.o
 obj-y				+= pcspeaker.o
 
diff --git a/arch/i386/kernel/kvmclock.c b/arch/i386/kernel/kvmclock.c
new file mode 100644
index 0000000..8c4df5d
--- /dev/null
+++ b/arch/i386/kernel/kvmclock.c
@@ -0,0 +1,222 @@
+#include <linux/clocksource.h>
+#include <linux/clockchips.h>
+#include <linux/interrupt.h>
+#include <linux/kvm_para.h>
+#include <asm/arch_hooks.h>
+#include <asm/i8253.h>
+
+#include <mach_ipi.h>
+#include <irq_vectors.h>
+
+#define KVM_SCALE 22
+
+static int no_kvmclock = 0;
+extern struct clock_event_device *global_clock_event;
+
+static int parse_no_kvmclock(char *arg)
+{
+	no_kvmclock = 1;
+	return 0;
+}
+early_param("no-kvmclock", parse_no_kvmclock);
+
+/* The hypervisor will put information about time periodically here */
+struct kvm_hv_clock hv_clock;
+
+/*
+ * The wallclock is the time of day when we booted. Some time may have
+ * elapsed since the hypervisor wrote the data, so we adjust for that
+ * using the tsc delta. Even an inaccurate tsc gives us a more accurate
+ * reading than not adjusting at all.
+ */
+unsigned long kvm_get_wallclock(void)
+{
+	unsigned long wallclock;
+	unsigned long long now;
+	wallclock  = hv_clock.wc.tv_sec;
+
+	rdtscll(now);
+
+	now -= hv_clock.last_tsc;
+	now = (now * hv_clock.tsc_mult) >> KVM_SCALE;
+	now += hv_clock.wc.tv_nsec;
+	do_div(now, NSEC_PER_SEC);
+	return wallclock + now;	/* was discarding the computed elapsed seconds */
+}
+
+int kvm_set_wallclock(unsigned long now)
+{
+	return 0;
+}
+
+/*
+ * This is our read_clock function. The host puts a tsc timestamp in the
+ * shared page each time it updates the time, and we use it to derive a
+ * slightly more precise notion of elapsed time, in nanoseconds.
+ *
+ * If the platform provides a stable tsc, we just use it raw, and there
+ * is no need for the host to update anything.
+ */
+static cycle_t kvm_clock_read(void) {
+
+	u64 delta, last_tsc;
+	struct timespec now;
+
+	if (hv_clock.stable_tsc) {
+		rdtscll(last_tsc);
+		return last_tsc;
+	}
+
+	do {
+		last_tsc = hv_clock.last_tsc;
+		rmb();
+		now = hv_clock.now;	/* copy by value inside the retry loop */
+		rmb();
+	} while (hv_clock.last_tsc != last_tsc);
+
+	delta = native_read_tsc() - last_tsc;
+	delta = (delta * hv_clock.tsc_mult) >> KVM_SCALE;
+
+	return (cycle_t)now.tv_sec * NSEC_PER_SEC + now.tv_nsec + delta;
+}
+
+static void kvm_timer_set_mode(enum clock_event_mode mode,
+				struct clock_event_device *evt)
+{
+	WARN_ON(!irqs_disabled());
+
+	switch (mode) {
+	case CLOCK_EVT_MODE_ONESHOT:
+		/* this is what we want */
+		break;
+	case CLOCK_EVT_MODE_RESUME:
+		break;
+	case CLOCK_EVT_MODE_PERIODIC:
+		WARN_ON(1);
+		break;
+	case CLOCK_EVT_MODE_UNUSED:
+	case CLOCK_EVT_MODE_SHUTDOWN:
+		kvm_hypercall0(KVM_HCALL_STOP_ONESHOT);
+		break;
+	default:
+		break;
+	}
+}
+
+/*
+ * Programming the next event is just a matter of asking the host
+ * to generate us an interrupt when the time expires. We pass the
+ * delta on, and the hypervisor does all the remaining tricks. For
+ * more precise timing, we could subtract the time spent in the hypercall.
+ */
+static int kvm_timer_next_event(unsigned long delta,
+				struct clock_event_device *evt)
+{
+	WARN_ON(evt->mode != CLOCK_EVT_MODE_ONESHOT);
+	kvm_hypercall1(KVM_HCALL_SET_ALARM, delta);
+	return 0;
+}
+
+/* This is our clockevents structure. We only support one shot operation */
+static struct clock_event_device kvm_clockevent = {
+	.name		= "kvm-timer",
+	.features	= CLOCK_EVT_FEAT_ONESHOT,
+	.shift		= 0,
+	.mult		= 1,
+	.max_delta_ns	= 0xffffffff,
+	.min_delta_ns	= 1000000,
+	.set_mode	= kvm_timer_set_mode,
+	.set_next_event = kvm_timer_next_event,
+	.rating         = 1000,
+	.irq		= 0,
+	.cpumask	= CPU_MASK_NONE,
+};
+
+unsigned long long kvm_sched_clock(void)
+{
+	return kvm_clock_read();
+}
+
+static struct clocksource kvm_clock = {
+	.name = "kvm-clock",
+	.read = kvm_clock_read,
+	.rating = 400,
+	.mask = CLOCKSOURCE_MASK(64),
+	.mult = 1 << KVM_SCALE,
+	.shift = KVM_SCALE,
+	.flags = CLOCK_SOURCE_IS_CONTINUOUS,
+};
+
+static DEFINE_PER_CPU(struct clock_event_device, kvm_clock_evt);
+
+static irqreturn_t kvm_timer_interrupt(int irq, void *dev_id)
+{
+	struct clock_event_device *evt = &__get_cpu_var(kvm_clock_evt);
+	BUG_ON(!evt);
+	evt->event_handler(evt);
+	return IRQ_HANDLED;
+}
+
+static struct irqaction irq0  = {
+        .handler = kvm_timer_interrupt,
+        .flags = IRQF_DISABLED | IRQF_NOBALANCING | IRQF_IRQPOLL,
+        .mask = CPU_MASK_CPU0,
+        .name = "kvm-timer"
+};
+
+static void kvm_clockevents_init(int cpu)
+{
+	struct clock_event_device *evt;
+	evt = &per_cpu(kvm_clock_evt, cpu);
+	memcpy(evt, &kvm_clockevent, sizeof(kvm_clockevent));
+	evt->cpumask = cpumask_of_cpu(cpu);
+	clockevents_register_device(evt);
+}
+
+static void kvm_time_init(void)
+{
+	if (hv_clock.stable_tsc)
+		kvm_clock.mult = hv_clock.tsc_mult;
+
+	clocksource_register(&kvm_clock);
+
+	kvm_clockevents_init(smp_processor_id());
+
+	setup_irq(0, &irq0);
+}
+
+/*
+ * QEMU connects a pit timer, which keeps sending us interrupts. We disable
+ * it when the time comes
+ */
+void kvm_disable_pit(void)
+{
+	outb_p(0x30, PIT_MODE);
+}
+
+void __init kvmclock_init(void)
+{
+
+	unsigned long shared_page = (unsigned long)&hv_clock;
+	/*
+	 * If we can't use the paravirt clock, just go with
+	 * the usual timekeeping
+	 */
+	if (!kvm_para_available() || no_kvmclock)
+		return;
+
+	if (kvm_hypercall1(KVM_HCALL_SET_SHARED_PAGE, shared_page))
+		return;
+
+	paravirt_ops.get_wallclock = kvm_get_wallclock;
+	paravirt_ops.set_wallclock = kvm_set_wallclock;
+	paravirt_ops.sched_clock = kvm_sched_clock;
+	paravirt_ops.time_init = kvm_time_init;
+	/*
+	 * If we let the normal APIC initialization code run, it will
+	 * override our event handler, relying on the APIC to deliver
+	 * the interrupts in the LOCAL_TIMER_VECTOR. The easy solution is
+	 * to keep the PIT running until then.
+	 */
+	paravirt_ops.setup_boot_clock = kvm_disable_pit;
+}
diff --git a/arch/i386/kernel/setup.c b/arch/i386/kernel/setup.c
index d474cd6..fd758f9 100644
--- a/arch/i386/kernel/setup.c
+++ b/arch/i386/kernel/setup.c
@@ -46,6 +46,7 @@
 #include <linux/crash_dump.h>
 #include <linux/dmi.h>
 #include <linux/pfn.h>
+#include <linux/kvm_para.h>
 
 #include <video/edid.h>
 
@@ -579,6 +580,9 @@ void __init setup_arch(char **cmdline_p)
 	vmi_init();
 #endif
 
+#ifdef CONFIG_KVM_CLOCK
+	kvmclock_init();
+#endif
 	/*
 	 * NOTE: before this point _nobody_ is allowed to allocate
 	 * any memory using the bootmem allocator.  Although the
diff --git a/drivers/kvm/irq.c b/drivers/kvm/irq.c
index 0f663fe..7baf798 100644
--- a/drivers/kvm/irq.c
+++ b/drivers/kvm/irq.c
@@ -32,6 +32,8 @@ int kvm_cpu_has_interrupt(struct kvm_vcpu *v)
 {
 	struct kvm_pic *s;
 
+	if (v->timer_vector != -1)
+		return 1;
 	if (kvm_apic_has_interrupt(v) == -1) {	/* LAPIC */
 		if (kvm_apic_accept_pic_intr(v)) {
 			s = pic_irqchip(v->kvm);	/* PIC */
@@ -43,6 +45,12 @@ int kvm_cpu_has_interrupt(struct kvm_vcpu *v)
 }
 EXPORT_SYMBOL_GPL(kvm_cpu_has_interrupt);
 
+static int kvm_get_pvclock_interrupt(struct kvm_vcpu *v)
+{
+	int ret = v->timer_vector;
+	v->timer_vector = -1;
+	return	ret;
+}
 /*
  * Read pending interrupt vector and intack.
  */
@@ -51,7 +59,9 @@ int kvm_cpu_get_interrupt(struct kvm_vcpu *v)
 	struct kvm_pic *s;
 	int vector;
 
-	vector = kvm_get_apic_interrupt(v);	/* APIC */
+	vector = kvm_get_pvclock_interrupt(v);
+	if (vector == -1)
+		vector = kvm_get_apic_interrupt(v);	/* APIC */
 	if (vector == -1) {
 		if (kvm_apic_accept_pic_intr(v)) {
 			s = pic_irqchip(v->kvm);
diff --git a/drivers/kvm/kvm.h b/drivers/kvm/kvm.h
index 8e58f3b..3766921 100644
--- a/drivers/kvm/kvm.h
+++ b/drivers/kvm/kvm.h
@@ -391,6 +391,8 @@ struct kvm_vcpu {
 	/* emulate context */
 
 	struct x86_emulate_ctxt emulate_ctxt;
+	struct hrtimer oneshooter;
+	int timer_vector;
 };
 
 struct kvm_mem_alias {
@@ -414,6 +416,9 @@ struct kvm {
 	struct kvm_mem_alias aliases[KVM_ALIAS_SLOTS];
 	int nmemslots;
 	struct kvm_memory_slot memslots[KVM_MEMORY_SLOTS];
+
+	unsigned long clock_addr;
+	int time_needs_update;
 	/*
 	 * Hash table of struct kvm_mmu_page.
 	 */
diff --git a/drivers/kvm/kvm_main.c b/drivers/kvm/kvm_main.c
index 2c3986f..57e2e39 100644
--- a/drivers/kvm/kvm_main.c
+++ b/drivers/kvm/kvm_main.c
@@ -20,6 +20,7 @@
 #include "segment_descriptor.h"
 #include "irq.h"
 
+#include <linux/clocksource.h>
 #include <linux/kvm.h>
 #include <linux/module.h>
 #include <linux/errno.h>
@@ -40,6 +41,7 @@
 #include <linux/anon_inodes.h>
 #include <linux/profile.h>
 #include <linux/kvm_para.h>
+#include <linux/ktime.h>
 
 #include <asm/processor.h>
 #include <asm/msr.h>
@@ -1406,6 +1408,7 @@ int kvm_emulate_halt(struct kvm_vcpu *vcpu)
 }
 EXPORT_SYMBOL_GPL(kvm_emulate_halt);
 
+static struct kvm_hv_clock hv_clock;
 int kvm_emulate_hypercall(struct kvm_vcpu *vcpu)
 {
 	unsigned long nr, a0, a1, a2, a3, ret;
@@ -1426,7 +1429,32 @@ int kvm_emulate_hypercall(struct kvm_vcpu *vcpu)
 		a3 &= 0xFFFFFFFF;
 	}
 
+	ret = 0;
 	switch (nr) {
+	case  KVM_HCALL_SET_SHARED_PAGE:
+		if (!irqchip_in_kernel(vcpu->kvm)) {
+			ret = -1;
+			break;
+		}
+
+		vcpu->kvm->clock_addr = a0;
+		getnstimeofday(&hv_clock.wc);
+		hv_clock.stable_tsc = check_tsc_unstable();
+		hv_clock.tsc_mult = clocksource_khz2mult(tsc_khz, 22);
+		rdtscll(hv_clock.last_tsc);
+		emulator_write_emulated(vcpu->kvm->clock_addr, &hv_clock,
+						sizeof(hv_clock), vcpu);
+
+		break;
+	case KVM_HCALL_SET_ALARM: {
+		ktime_t kt;
+		kt = ktime_add_ns(ktime_get_real(), a0);
+		hrtimer_start(&vcpu->oneshooter, kt, HRTIMER_MODE_ABS);
+		break;
+	}
+	case KVM_HCALL_STOP_ONESHOT:
+		hrtimer_cancel(&vcpu->oneshooter);
+		break;
 	default:
 		ret = -KVM_ENOSYS;
 		break;
@@ -1961,6 +1989,20 @@ static void post_kvm_run_save(struct kvm_vcpu *vcpu,
 					 vcpu->irq_summary == 0);
 }
 
+void kvm_update_guest_time(struct kvm_vcpu *vcpu)
+{
+	struct kvm *kvm = vcpu->kvm;
+
+	if ((kvm->clock_addr) && !(hv_clock.stable_tsc) &&
+				  (kvm->time_needs_update)) {
+		ktime_get_ts(&hv_clock.now);
+		rdtscll(hv_clock.last_tsc);
+		emulator_write_emulated(vcpu->kvm->clock_addr,
+					&hv_clock, sizeof(hv_clock), vcpu);
+		kvm->time_needs_update = 0;
+	}
+}
+
 static int __vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 {
 	int r;
@@ -2009,6 +2051,7 @@ again:
 		if (test_and_clear_bit(KVM_TLB_FLUSH, &vcpu->requests))
 			kvm_x86_ops->tlb_flush(vcpu);
 
+	kvm_update_guest_time(vcpu);
 	kvm_x86_ops->run(vcpu, kvm_run);
 
 	vcpu->guest_mode = 0;
@@ -2565,6 +2608,15 @@ static int create_vcpu_fd(struct kvm_vcpu *vcpu)
 	return fd;
 }
 
+
+static enum hrtimer_restart kvm_clockdev_fn(struct hrtimer *timer)
+{
+	struct kvm_vcpu *vcpu = container_of(timer, struct kvm_vcpu, oneshooter);
+	vcpu->timer_vector = 0x20;
+
+	return HRTIMER_NORESTART;
+}
+
 /*
  * Creates some virtual cpus.  Good luck creating more than one.
  */
@@ -2580,6 +2632,7 @@ static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, int n)
 	if (IS_ERR(vcpu))
 		return PTR_ERR(vcpu);
 
+	vcpu->timer_vector = -1;
 	preempt_notifier_init(&vcpu->preempt_notifier, &kvm_preempt_ops);
 
 	/* We do fxsave: this must be aligned. */
@@ -2600,6 +2653,9 @@ static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, int n)
 	kvm->vcpus[n] = vcpu;
 	mutex_unlock(&kvm->lock);
 
+	hrtimer_init(&vcpu->oneshooter, CLOCK_REALTIME, HRTIMER_MODE_ABS);
+	vcpu->oneshooter.function = kvm_clockdev_fn;
+
 	/* Now it's all set up, let userspace reach it */
 	r = create_vcpu_fd(vcpu);
 	if (r < 0)
diff --git a/include/linux/kvm_para.h b/include/linux/kvm_para.h
index cc5dfb4..c0eac3c 100644
--- a/include/linux/kvm_para.h
+++ b/include/linux/kvm_para.h
@@ -16,12 +16,28 @@
 
 #ifdef __KERNEL__
 #include <asm/processor.h>
+extern void kvmclock_init(void);
 
 /* This instruction is vmcall.  On non-VT architectures, it will generate a
  * trap that we will then rewrite to the appropriate instruction.
  */
 #define KVM_HYPERCALL ".byte 0x0f,0x01,0xc1"
 
+#define KVM_HCALL_SET_TIMER_MODE 0
+#define KVM_HCALL_SET_SHARED_PAGE 1
+#define KVM_HCALL_SET_ALARM	2
+#define KVM_HCALL_STOP_ONESHOT 3
+
+struct kvm_hv_clock {
+	int stable_tsc; /* use raw tsc for clock_read */
+	u32 tsc_mult; /* fixed width: layout is shared with the host */
+	struct timespec now; /* FIXME: timespec embeds longs — not fixed-width either */
+	u64 last_tsc;
+	/* That's the wall clock, not the water closet */
+	struct timespec wc;
+};
+
+
 /* For KVM hypercalls, a three-byte sequence of either the vmrun or the vmmrun
  * instruction.  The hypervisor may replace it with something else but only the
  * instructions are guaranteed to be supported.
-------------------------------------------------------------------------
This SF.net email is sponsored by: Splunk Inc.
Still grepping through log files to find problems?  Stop.
Now Search log events and configuration files using AJAX and a browser.
Download your FREE copy of Splunk now >> http://get.splunk.com/
_______________________________________________
kvm-devel mailing list
kvm-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/kvm-devel

Reply via email to