arch

Taylor R Campbell Fri, 29 Jun 2018 14:53:25 -0700

Module Name:    src
Committed By:   riastradh
Date:           Fri Jun 29 21:53:12 UTC 2018


Modified Files:
        src/sys/arch/x86/include: cpu.h
        src/sys/arch/xen/xen: clock.c

Log Message:
Rewrite Xen timecounter and hardclock timer.

With this change, the Xen timecounter should now be globally
monotonic, as every timecounter is supposed to be.  Should also fix a
litany of races in the timecounter logic.

Proposed last year; see mailing list for further details:
https://mail-index.netbsd.org/port-xen/2017/10/31/msg009112.html

ok cherry


To generate a diff of this commit:
cvs rdiff -u -r1.92 -r1.93 src/sys/arch/x86/include/cpu.h
cvs rdiff -u -r1.67 -r1.68 src/sys/arch/xen/xen/clock.c

Please note that diffs are not public domain; they are subject to the
copyright notices on the relevant files.

Modified files:

Index: src/sys/arch/x86/include/cpu.h
diff -u src/sys/arch/x86/include/cpu.h:1.92 src/sys/arch/x86/include/cpu.h:1.93
--- src/sys/arch/x86/include/cpu.h:1.92	Thu Jun 14 14:36:46 2018
+++ src/sys/arch/x86/include/cpu.h	Fri Jun 29 21:53:12 2018
@@ -1,4 +1,4 @@
-/*	$NetBSD: cpu.h,v 1.92 2018/06/14 14:36:46 maxv Exp $	*/
+/*	$NetBSD: cpu.h,v 1.93 2018/06/29 21:53:12 riastradh Exp $	*/
 
 /*
  * Copyright (c) 1990 The Regents of the University of California.
@@ -236,6 +236,32 @@ struct cpu_info {
 	struct cpu_tss	*ci_tss;	/* Per-cpu TSSes; shared among LWPs */
 	int ci_tss_sel;			/* TSS selector of this cpu */
 
+#ifdef XEN
+	/* Xen raw system time at which we last ran hardclock.  */
+	uint64_t	ci_xen_hardclock_systime_ns;
+
+	/*
+	 * Last TSC-adjusted local Xen system time we observed.  Used
+	 * to detect whether the Xen clock has gone backwards.
+	 */
+	uint64_t	ci_xen_last_systime_ns;
+
+	/*
+	 * Distance in nanoseconds from the local view of system time
+	 * to the global view of system time, if the local time is
+	 * behind the global time.
+	 */
+	uint64_t	ci_xen_systime_ns_skew;
+
+	/* Event counters for various pathologies that might happen.  */
+	struct evcnt	ci_xen_cpu_tsc_backwards_evcnt;
+	struct evcnt	ci_xen_tsc_delta_negative_evcnt;
+	struct evcnt	ci_xen_raw_systime_wraparound_evcnt;
+	struct evcnt	ci_xen_raw_systime_backwards_evcnt;
+	struct evcnt	ci_xen_systime_backwards_hardclock_evcnt;
+	struct evcnt	ci_xen_missed_hardclock_evcnt;
+#endif
+
 	/*
 	 * The following two are actually region_descriptors,
 	 * but that would pollute the namespace.

Index: src/sys/arch/xen/xen/clock.c
diff -u src/sys/arch/xen/xen/clock.c:1.67 src/sys/arch/xen/xen/clock.c:1.68
--- src/sys/arch/xen/xen/clock.c:1.67	Sun Jun 24 13:35:33 2018
+++ src/sys/arch/xen/xen/clock.c	Fri Jun 29 21:53:12 2018
@@ -1,10 +1,12 @@
-/*	$NetBSD: clock.c,v 1.67 2018/06/24 13:35:33 jdolecek Exp $	*/
+/*	$NetBSD: clock.c,v 1.68 2018/06/29 21:53:12 riastradh Exp $	*/
 
-/*
- *
- * Copyright (c) 2004 Christian Limpach.
+/*-
+ * Copyright (c) 2017, 2018 The NetBSD Foundation, Inc.
  * All rights reserved.
  *
+ * This code is derived from software contributed to The NetBSD Foundation
+ * by Taylor R. Campbell.
+ *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
@@ -14,580 +16,1013 @@
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
- * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
- * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
- * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
- * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
- * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
- * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
- * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
- * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+ * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
+ * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
  */
 
 #include "opt_xen.h"
 
+#ifndef XEN_CLOCK_DEBUG
+#define	XEN_CLOCK_DEBUG	0
+#endif
+
 #include <sys/cdefs.h>
-__KERNEL_RCSID(0, "$NetBSD: clock.c,v 1.67 2018/06/24 13:35:33 jdolecek Exp $");
+__KERNEL_RCSID(0, "$NetBSD: clock.c,v 1.68 2018/06/29 21:53:12 riastradh Exp $");
 
 #include <sys/param.h>
+#include <sys/types.h>
+#include <sys/atomic.h>
+#include <sys/callout.h>
+#include <sys/cpu.h>
+#include <sys/device.h>
+#include <sys/evcnt.h>
+#include <sys/intr.h>
+#include <sys/kernel.h>
+#include <sys/lwp.h>
+#include <sys/percpu.h>
+#include <sys/proc.h>
+#include <sys/sysctl.h>
 #include <sys/systm.h>
 #include <sys/time.h>
 #include <sys/timetc.h>
-#include <sys/timevar.h>
-#include <sys/kernel.h>
-#include <sys/device.h>
-#include <sys/sysctl.h>
 
-#include <xen/xen.h>
-#include <xen/hypervisor.h>
+#include <dev/clock_subr.h>
+
+#include <machine/cpu.h>
+#include <machine/cpu_counter.h>
+#include <machine/lock.h>
+
 #include <xen/evtchn.h>
+#include <xen/hypervisor.h>
 #include <xen/xen-public/vcpu.h>
-#include <machine/cpu_counter.h>
+#include <xen/xen.h>
 
-#include <dev/clock_subr.h>
 #include <x86/rtc.h>
 
-static int xen_timer_handler(void *, struct intrframe *);
-static int (*xen_timer_handler_stub)(void *) = (void *) xen_timer_handler;
-static struct intrhand *ih;
+#define NS_PER_TICK ((uint64_t)1000000000ULL/hz)
+
+static uint64_t	xen_vcputime_systime_ns(void);
+static uint64_t	xen_vcputime_raw_systime_ns(void);
+static void	xen_wallclock_time(struct timespec *);
+static uint64_t	xen_global_systime_ns(void);
+static unsigned	xen_get_timecount(struct timecounter *);
+static int	xen_rtc_get(struct todr_chip_handle *, struct timeval *);
+static int	xen_rtc_set(struct todr_chip_handle *, struct timeval *);
+static int	xen_timer_handler(void *, struct clockframe *);
 
-/* A timecounter: Xen system_time extrapolated with a TSC. */
-u_int xen_get_timecount(struct timecounter*);
+/*
+ * xen timecounter:
+ *
+ *	Xen vCPU system time, plus an adjustment with rdtsc.
+ */
 static struct timecounter xen_timecounter = {
 	.tc_get_timecount = xen_get_timecount,
 	.tc_poll_pps = NULL,
 	.tc_counter_mask = ~0U,
-	.tc_frequency = 1000000000ULL,
+	.tc_frequency = 1000000000ULL,	/* 1 GHz, i.e. units of nanoseconds */
 	.tc_name = "xen_system_time",
-	.tc_quality = 10000 /*
-			     * This needs to take precedence over any hardware
-			     * timecounters (e.g., ACPI in Xen3 dom0), because
-			     * they can't correct for Xen scheduling latency.
-			     */
-};
-
-/* These are periodically updated in shared_info, and then copied here. */
-struct shadow {
-	uint64_t tsc_stamp;
-	uint64_t system_time;
-	unsigned long time_version; /* XXXSMP */
-	uint32_t freq_mul;
-	int8_t freq_shift;
-	struct timespec ts;
+	.tc_quality = 10000,
 };
 
-/* Protects volatile variables ci_shadow & xen_clock_bias */
-static kmutex_t tmutex;
-
-/* Per CPU shadow time values */
-static volatile struct shadow ci_shadow[MAXCPUS];
+/*
+ * xen_global_systime_ns_stamp
+ *
+ *	The latest Xen vCPU system time that has been observed on any
+ *	CPU, for a global monotonic view of the Xen system time clock.
+ */
+static volatile uint64_t xen_global_systime_ns_stamp __cacheline_aligned;
 
-/* The time when the last hardclock(9) call should have taken place,
- * per cpu.
+/*
+ * xen time of day register:
+ *
+ *	Xen wall clock time, plus a Xen vCPU system time adjustment.
  */
-static volatile uint64_t vcpu_system_time[MAXCPUS];
+static struct todr_chip_handle xen_todr_chip = {
+	.todr_gettime = xen_rtc_get,
+	.todr_settime = xen_rtc_set,
+};
 
 /*
- * The clock (as returned by xen_get_timecount) may need to be held
- * back to maintain the illusion that hardclock(9) was called when it
- * was supposed to be, not when Xen got around to scheduling us.
+ * xen timer interrupt handles -- per-CPU struct intrhand *
  */
-static volatile uint64_t xen_clock_bias[MAXCPUS];
+static struct percpu *xen_timer_ih_percpu __read_mostly;
 
 #ifdef DOM0OPS
-/* If we're dom0, send our time to Xen every minute or so. */
-int xen_timepush_ticks = 0;
-static callout_t xen_timepush_co;
+/*
+ * xen timepush state:
+ *
+ *	Callout to periodically, after a sysctl-configurable number of
+ *	NetBSD ticks, set the Xen hypervisor's wall clock time.
+ */
+static struct {
+	struct callout	ch;
+	int		ticks;
+} xen_timepush;
+
+static void	xen_timepush_init(void);
+static void	xen_timepush_intr(void *);
+static int	sysctl_xen_timepush(SYSCTLFN_ARGS);
 #endif
 
-#define NS_PER_TICK (1000000000ULL/hz)
-
 /*
- * Reads a consistent set of time-base values from Xen, into a shadow data
- * area.  Must be called at splhigh (per timecounter requirements).
+ * startrtclock()
+ *
+ *	Initialize the real-time clock from x86 machdep autoconf.
  */
-static void
-get_time_values_from_xen(struct cpu_info *ci)
+void
+startrtclock(void)
 {
 
-	volatile struct shadow *shadow = &ci_shadow[ci->ci_cpuid];
-
-	volatile struct vcpu_time_info *t = &ci->ci_vcpu->time;
-	uint32_t tversion;
-
-	KASSERT(mutex_owned(&tmutex));
+	todr_attach(&xen_todr_chip);
+}
 
-	do {
-		shadow->time_version = t->version;
-		xen_rmb();
-		shadow->tsc_stamp = t->tsc_timestamp;
-		shadow->system_time = t->system_time;
-		shadow->freq_mul = t->tsc_to_system_mul;
-		shadow->freq_shift = t->tsc_shift;
-		xen_rmb();
-	} while ((t->version & 1) || (shadow->time_version != t->version));
-	do {
-		tversion = HYPERVISOR_shared_info->wc_version;
-		xen_rmb();
-		shadow->ts.tv_sec = HYPERVISOR_shared_info->wc_sec;
-		shadow->ts.tv_nsec = HYPERVISOR_shared_info->wc_nsec;
-		xen_rmb();
-	} while ((HYPERVISOR_shared_info->wc_version & 1) ||
-	    (tversion != HYPERVISOR_shared_info->wc_version));
+/*
+ * setstatclockrate(rate)
+ *
+ *	Set the statclock to run at rate, in units of ticks per second.
+ *
+ *	Currently Xen does not have a separate statclock, so this is a
+ *	noop; instead the statclock runs in hardclock.
+ */
+void
+setstatclockrate(int rate)
+{
 }
 
 /*
- * Are the values we have up to date?
+ * idle_block()
+ *
+ *	Called from the idle loop when we have nothing to do but wait
+ *	for an interrupt.
  */
-static inline int
-time_values_up_to_date(struct cpu_info *ci)
+void
+idle_block(void)
 {
-	int rv;
 
-	volatile struct shadow *shadow = &ci_shadow[ci->ci_cpuid];
+	KASSERT(curcpu()->ci_ipending == 0);
+	HYPERVISOR_block();
+}
 
-	KASSERT(ci != NULL);
-	KASSERT(mutex_owned(&tmutex));
+/*
+ * xen_rdtsc()
+ *
+ *	Read the local pCPU's tsc.
+ */
+static inline uint64_t
+xen_rdtsc(void)
+{
+	uint32_t lo, hi;
 
-	xen_rmb();
-	rv = shadow->time_version == ci->ci_vcpu->time.version;
-	xen_rmb();
+	asm volatile("rdtsc" : "=a"(lo), "=d"(hi));
 
-	return rv;
+	return ((uint64_t)hi << 32) | lo;
 }
 
 /*
- * Xen 3 helpfully provides the CPU clock speed in the form of a multiplier
- * and shift that can be used to convert a cycle count into nanoseconds
- * without using an actual (slow) divide insn.
+ * struct xen_vcputime_ticket
+ *
+ *	State for a vCPU read section, during which a caller may read
+ *	from fields of a struct vcpu_time_info and call xen_rdtsc.
+ *	Caller must enter with xen_vcputime_enter, exit with
+ *	xen_vcputime_exit, and be prepared to retry if
+ *	xen_vcputime_exit fails.
+ */
+struct xen_vcputime_ticket {
+	uint64_t	version;
+};
+
+/*
+ * xen_vcputime_enter(tp)
+ *
+ *	Enter a vCPU time read section and store a ticket in *tp, which
+ *	the caller must use with xen_vcputime_exit.  Return a pointer
+ *	to the current CPU's vcpu_time_info structure.  Caller must
+ *	already be bound to the CPU.
  */
-static inline uint64_t
-scale_delta(uint64_t delta, uint32_t mul_frac, int8_t shift)
+static inline volatile struct vcpu_time_info *
+xen_vcputime_enter(struct xen_vcputime_ticket *tp)
 {
-	if (shift < 0)
-		delta >>= -shift;
-	else
-		delta <<= shift;
+	volatile struct vcpu_time_info *vt = &curcpu()->ci_vcpu->time;
+
+	while (__predict_false(1 & (tp->version = vt->version)))
+		SPINLOCK_BACKOFF_HOOK;
 
 	/*
-	 * Here, we multiply a 64-bit and a 32-bit value, and take the top
-	 * 64 bits of that 96-bit product.  This is broken up into two
-	 * 32*32=>64-bit multiplies and a 64-bit add.  The casts are needed
-	 * to hint to GCC that both multiplicands really are 32-bit; the
-	 * generated code is still fairly bad, but not insanely so.
+	 * Must read the version before reading the tsc on the local
+	 * pCPU.  We are racing only with interruption by the
+	 * hypervisor, so no need for a stronger memory barrier.
 	 */
-	return ((uint64_t)(uint32_t)(delta >> 32) * mul_frac)
-	    + ((((uint64_t)(uint32_t)(delta & 0xFFFFFFFF)) * mul_frac) >> 32);
+	__insn_barrier();
+
+	return vt;
 }
 
-/* 
- * Use cycle counter to determine ns elapsed since last Xen time update.
- * Must be called at splhigh (per timecounter requirements).
+/*
+ * xen_vcputime_exit(vt, tp)
+ *
+ *	Exit a vCPU time read section with the ticket in *tp from
+ *	xen_vcputime_enter.  Return true on success, false if caller
+ *	must retry.
  */
-static uint64_t
-get_tsc_offset_ns(struct cpu_info *ci)
+static inline bool
+xen_vcputime_exit(volatile struct vcpu_time_info *vt,
+    struct xen_vcputime_ticket *tp)
 {
-	uint64_t tsc_delta, offset;
-	volatile struct shadow *shadow = &ci_shadow[ci->ci_cpuid];
 
-	KASSERT(mutex_owned(&tmutex));
-	tsc_delta = cpu_counter() - shadow->tsc_stamp;
-	offset = scale_delta(tsc_delta, shadow->freq_mul,
-	    shadow->freq_shift);
+	KASSERT(vt == &curcpu()->ci_vcpu->time);
+
+	/*
+	 * Must read the tsc before re-reading the version on the local
+	 * pCPU.  We are racing only with interruption by the
+	 * hypervisor, so no need for a stronger memory barrier.
+	 */
+	__insn_barrier();
 
-	return offset;
+	return tp->version == vt->version;
 }
 
 /*
- * Returns the current system_time on given vcpu, taking care that the
- * timestamp used is valid for the TSC measurement in question.  Xen2
- * doesn't ensure that this won't step backwards, so we enforce
- * monotonicity on our own in that case.  Must be called at splhigh.
+ * xen_tsc_to_ns_delta(delta_tsc, mul_frac, shift)
+ *
+ *	Convert a difference in tsc units to a difference in
+ *	nanoseconds given a multiplier and shift for the unit
+ *	conversion.
  */
-static uint64_t
-get_vcpu_time(struct cpu_info *ci)
+static inline uint64_t
+xen_tsc_to_ns_delta(uint64_t delta_tsc, uint32_t tsc_to_system_mul,
+    int8_t tsc_shift)
 {
-	uint64_t offset, stime;
-	volatile struct shadow *shadow = &ci_shadow[ci->ci_cpuid];
-	
-		
-	KASSERT(mutex_owned(&tmutex));
-	do {
-		get_time_values_from_xen(ci);
-		offset = get_tsc_offset_ns(ci);
-		stime = shadow->system_time + offset;
-		/* if the timestamp went stale before we used it, refresh */
+	uint32_t delta_tsc_hi, delta_tsc_lo;
 
-	} while (!time_values_up_to_date(ci));
+	if (tsc_shift < 0)
+		delta_tsc >>= -tsc_shift;
+	else
+		delta_tsc <<= tsc_shift;
 
-	return stime;
+	delta_tsc_hi = delta_tsc >> 32;
+	delta_tsc_lo = delta_tsc & 0xffffffffUL;
+
+	/* d*m/2^32 = (2^32 d_h + d_l)*m/2^32 = d_h*m + (d_l*m)/2^32 */
+	return ((uint64_t)delta_tsc_hi * tsc_to_system_mul) +
+	    (((uint64_t)delta_tsc_lo * tsc_to_system_mul) >> 32);
 }
 
-static void
-xen_wall_time(struct timespec *wt)
+/*
+ * xen_vcputime_systime_ns()
+ *
+ *	Return a snapshot of the Xen system time plus an adjustment
+ *	from the tsc, in units of nanoseconds.  Caller must be bound to
+ *	the current CPU.
+ */
+static uint64_t
+xen_vcputime_systime_ns(void)
 {
-	uint64_t nsec;
-
+	volatile struct vcpu_time_info *vt;
 	struct cpu_info *ci = curcpu();
-	volatile struct shadow *shadow = &ci_shadow[ci->ci_cpuid];
+	struct xen_vcputime_ticket ticket;
+	uint64_t raw_systime_ns, tsc_timestamp, tsc, delta_tsc, delta_ns;
+	uint32_t tsc_to_system_mul;
+	int8_t tsc_shift;
+	uint64_t systime_ns;
+
+	/* We'd better be bound to the CPU in _some_ way.  */
+	KASSERT(cpu_intr_p() || cpu_softintr_p() || kpreempt_disabled() ||
+	    (curlwp->l_flag & LP_BOUND));
 
-	mutex_enter(&tmutex);
+	/*
+	 * Repeatedly try to read the system time, corresponding tsc
+	 * timestamp, and tsc frequency until we get a consistent view.
+	 */
 	do {
+		vt = xen_vcputime_enter(&ticket);
+
+		/* Grab Xen's snapshot of raw system time and tsc.  */
+		raw_systime_ns = vt->system_time;
+		tsc_timestamp = vt->tsc_timestamp;
+
+		/* Get Xen's current idea of how fast the tsc is counting.  */
+		tsc_to_system_mul = vt->tsc_to_system_mul;
+		tsc_shift = vt->tsc_shift;
+
+		/* Read the CPU's tsc.  */
+		tsc = xen_rdtsc();
+	} while (!xen_vcputime_exit(vt, &ticket));
+
+	/*
+	 * Out of paranoia, check whether the tsc has gone backwards
+	 * since Xen's timestamp.
+	 *
+	 * This shouldn't happen because the Xen hypervisor is supposed
+	 * to have read the tsc _before_ writing to the vcpu_time_info
+	 * page, _before_ we read the tsc.
+	 *
+	 * Further, if we switched pCPUs after reading the tsc
+	 * timestamp but before reading the CPU's tsc, the hypervisor
+	 * had better notify us by updating the version too and forcing
+	 * us to retry the vCPU time read.
+	 */
+	if (__predict_false(tsc < tsc_timestamp)) {
 		/*
-		 * Under Xen3, shadow->ts is the wall time less system time
-		 * get_vcpu_time() will update shadow
+		 * Notify the console that the CPU's tsc appeared to
+		 * run behind Xen's idea of it, and pretend it hadn't.
 		 */
-		nsec = get_vcpu_time(ci);
-		*wt = shadow->ts;
-		nsec += wt->tv_nsec;
-	} while (!time_values_up_to_date(ci));
-	mutex_exit(&tmutex);
-
-	wt->tv_sec += nsec / 1000000000L;
-	wt->tv_nsec = nsec % 1000000000L;
-}
+#if XEN_CLOCK_DEBUG		/* XXX dtrace hook */
+		printf("xen cpu tsc %"PRIu64
+		    " ran backwards from timestamp %"PRIu64
+		    " by %"PRIu64"\n",
+		    tsc, tsc_timestamp, tsc_timestamp - tsc);
+#endif
+		ci->ci_xen_cpu_tsc_backwards_evcnt.ev_count++;
+		delta_ns = delta_tsc = 0;
+	} else {
+		/* Find how far the CPU's tsc has advanced.  */
+		delta_tsc = tsc - tsc_timestamp;
 
-static int
-xen_rtc_get(todr_chip_handle_t todr, struct timeval *tvp)
-{
-	struct timespec wt;
+		/* Convert the tsc delta to a nanosecond delta.  */
+		delta_ns = xen_tsc_to_ns_delta(delta_tsc, tsc_to_system_mul,
+		    tsc_shift);
+	}
 
-	xen_wall_time(&wt);
-	tvp->tv_sec = wt.tv_sec;
-	tvp->tv_usec = wt.tv_nsec / 1000;
+	/*
+	 * Notify the console if the delta computation yielded a
+	 * negative, and pretend it hadn't.
+	 *
+	 * This doesn't make sense but I include it out of paranoia.
+	 */
+	if (__predict_false((int64_t)delta_ns < 0)) {
+#if XEN_CLOCK_DEBUG		/* XXX dtrace hook */
+		printf("xen tsc delta in ns went negative: %"PRId64"\n",
+		    delta_ns);
+#endif
+		ci->ci_xen_tsc_delta_negative_evcnt.ev_count++;
+		delta_ns = 0;
+	}
 
-	return 0;
-}
+	/*
+	 * Compute the TSC-adjusted system time.
+	 */
+	systime_ns = raw_systime_ns + delta_ns;
 
-static int
-xen_rtc_set(todr_chip_handle_t todr, struct timeval *tvp)
-{
-#ifdef DOM0OPS
-#if __XEN_INTERFACE_VERSION__ < 0x00030204
-	dom0_op_t op;
-#else
-	xen_platform_op_t op;
-#endif
-	if (xendomain_is_privileged()) {
- 		/* needs to set the RTC chip too */
- 		struct clock_ymdhms dt;
- 		clock_secs_to_ymdhms(tvp->tv_sec, &dt);
- 		rtc_set_ymdhms(NULL, &dt);
- 
-#if __XEN_INTERFACE_VERSION__ < 0x00030204
-		op.cmd = DOM0_SETTIME;
-#else
-		op.cmd = XENPF_settime;
-#endif
-		/* XXX is rtc_offset handled correctly everywhere? */
-		op.u.settime.secs	 = tvp->tv_sec;
-		op.u.settime.nsecs	 = tvp->tv_usec * 1000;
-		mutex_enter(&tmutex);
-		op.u.settime.system_time = get_vcpu_time(curcpu());
-		mutex_exit(&tmutex);
-#if __XEN_INTERFACE_VERSION__ < 0x00030204
-		return HYPERVISOR_dom0_op(&op);
-#else
-		return HYPERVISOR_platform_op(&op);
+	/*
+	 * Notify the console if the addition wrapped around.
+	 *
+	 * This shouldn't happen because system time should be relative
+	 * to a reasonable reference point, not centuries in the past.
+	 * (2^64 ns is approximately half a millennium.)
+	 */
+	if (__predict_false(systime_ns < raw_systime_ns)) {
+#if XEN_CLOCK_DEBUG		/* XXX dtrace hook */
+		printf("xen raw systime + tsc delta wrapped around:"
+		    " %"PRIu64" + %"PRIu64" = %"PRIu64"\n",
+		    raw_systime_ns, delta_ns, systime_ns);
 #endif
+		ci->ci_xen_raw_systime_wraparound_evcnt.ev_count++;
 	}
+
+	/*
+	 * Notify the console if the TSC-adjusted Xen system time
+	 * appears to have gone backwards, and pretend we had gone
+	 * forward.  This seems to happen pretty regularly under load.
+	 */
+	if (__predict_false(ci->ci_xen_last_systime_ns > systime_ns)) {
+#if XEN_CLOCK_DEBUG		/* XXX dtrace hook */
+		printf("xen raw systime + tsc delta went backwards:"
+		    " %"PRIu64" > %"PRIu64"\n",
+		    ci->ci_xen_last_systime_ns, systime_ns);
+		printf(" raw_systime_ns=%"PRIu64"\n tsc_timestamp=%"PRIu64"\n"
+		    " tsc=%"PRIu64"\n tsc_to_system_mul=%"PRIu32"\n"
+		    " tsc_shift=%"PRId8"\n delta_tsc=%"PRIu64"\n"
+		    " delta_ns=%"PRIu64"\n",
+		    raw_systime_ns, tsc_timestamp, tsc, tsc_to_system_mul,
+		    tsc_shift, delta_tsc, delta_ns);
 #endif
+		ci->ci_xen_raw_systime_backwards_evcnt.ev_count++;
+		systime_ns = ci->ci_xen_last_systime_ns + 1;
+	}
 
-	return 0;
+	/* Remember the TSC-adjusted Xen system time.  */
+	ci->ci_xen_last_systime_ns = systime_ns;
+
+	/* We had better not have migrated CPUs.  */
+	KASSERT(ci == curcpu());
+
+	/* And we're done: return the TSC-adjusted systime in nanoseconds.  */
+	return systime_ns;
 }
 
-void
-startrtclock(void)
+/*
+ * xen_vcputime_raw_systime_ns()
+ *
+ *	Return a snapshot of the current Xen system time to the
+ *	resolution of the Xen hypervisor tick, in units of nanoseconds.
+ */
+static uint64_t
+xen_vcputime_raw_systime_ns(void)
 {
-	static struct todr_chip_handle	tch;
-	tch.todr_gettime = xen_rtc_get;
-	tch.todr_settime = xen_rtc_set;
-	tch.todr_setwen = NULL;
+	volatile struct vcpu_time_info *vt;
+	struct xen_vcputime_ticket ticket;
+	uint64_t raw_systime_ns;
 
-	todr_attach(&tch);
+	do {
+		vt = xen_vcputime_enter(&ticket);
+		raw_systime_ns = vt->system_time;
+	} while (!xen_vcputime_exit(vt, &ticket));
+
+	return raw_systime_ns;
 }
 
 /*
- * Wait approximately `n' microseconds.
+ * struct xen_wallclock_ticket
+ *
+ *	State for a wall clock read section, during which a caller may
+ *	read from the wall clock fields of HYPERVISOR_shared_info.
+ *	Caller must enter with xen_wallclock_enter, exit with
+ *	xen_wallclock_exit, and be prepared to retry if
+ *	xen_wallclock_exit fails.
  */
-void
-xen_delay(unsigned int n)
+struct xen_wallclock_ticket {
+	uint32_t version;
+};
+
+/*
+ * xen_wallclock_enter(tp)
+ *
+ *	Enter a wall clock read section and store a ticket in *tp,
+ *	which the caller must use with xen_wallclock_exit.
+ */
+static inline void
+xen_wallclock_enter(struct xen_wallclock_ticket *tp)
 {
-	struct cpu_info *ci = curcpu();
-	volatile struct shadow *shadow = &ci_shadow[ci->ci_cpuid];
 
-	if (n < 500000) {
-		/*
-		 * shadow->system_time is updated every hz tick, it's not
-		 * precise enough for short delays. Use the CPU counter
-		 * instead. We assume it's working at this point.
-		 */
-		uint64_t cc, cc2, when;
+	while (__predict_false(1 & (tp->version =
+		    HYPERVISOR_shared_info->wc_version)))
+		SPINLOCK_BACKOFF_HOOK;
 
-		cc = cpu_counter();
-		when = cc + (uint64_t)n * cpu_frequency(ci) / 1000000LL;
-		if (when < cc) {
-			/* wait for counter to wrap */
-			cc2 = cpu_counter();
-			while (cc2 > cc)
-				cc2 = cpu_counter();
-		}
-		cc2 = cpu_counter();
-		while (cc2 < when)
-			cc2 = cpu_counter();
-		
-		return;
-	} else {
-		uint64_t when;
+	/*
+	 * Must read the version from memory before reading the
+	 * timestamp from memory, as written potentially by another
+	 * pCPU.
+	 */
+	membar_consumer();
+}
 
-		/* for large delays, shadow->system_time is OK */
-		mutex_enter(&tmutex);
-		get_time_values_from_xen(ci);
-		when = shadow->system_time + n * 1000;
-		while (shadow->system_time < when) {
-			mutex_exit(&tmutex);
-			HYPERVISOR_yield();
-			mutex_enter(&tmutex);
-			get_time_values_from_xen(ci);
-		}
-		mutex_exit(&tmutex);
-	}
+/*
+ * xen_wallclock_exit(tp)
+ *
+ *	Exit a wall clock read section with the ticket in *tp from
+ *	xen_wallclock_enter.  Return true on success, false if caller
+ *	must retry.
+ */
+static inline bool
+xen_wallclock_exit(struct xen_wallclock_ticket *tp)
+{
+
+	/*
+	 * Must read the timestamp from memory before re-reading the
+	 * version from memory, as written potentially by another pCPU.
+	 */
+	membar_consumer();
+
+	return tp->version == HYPERVISOR_shared_info->wc_version;
 }
 
-#ifdef DOM0OPS
-/* ARGSUSED */
+/*
+ * xen_wallclock_time(tsp)
+ *
+ *	Return a snapshot of the current low-resolution wall clock
+ *	time, as reported by the hypervisor, in tsp.
+ */
 static void
-xen_timepush(void *arg)
+xen_wallclock_time(struct timespec *tsp)
 {
-	callout_t *co = arg;
+	struct xen_wallclock_ticket ticket;
+	uint64_t systime_ns;
 
-	resettodr();
-	if (xen_timepush_ticks > 0)
-		callout_schedule(co, xen_timepush_ticks);
+	/* Read the last wall clock sample from the hypervisor. */
+	do {
+		xen_wallclock_enter(&ticket);
+		tsp->tv_sec = HYPERVISOR_shared_info->wc_sec;
+		tsp->tv_nsec = HYPERVISOR_shared_info->wc_nsec;
+	} while (!xen_wallclock_exit(&ticket));
+
+	/* Get the global system time.  */
+	systime_ns = xen_global_systime_ns();
+
+	/* Add the system time to the wall clock time.  */
+	systime_ns += tsp->tv_nsec;
+	tsp->tv_sec += systime_ns / 1000000000ull;
+	tsp->tv_nsec = systime_ns % 1000000000ull;
 }
 
-/* ARGSUSED */
-static int
-sysctl_xen_timepush(SYSCTLFN_ARGS)
+/*
+ * xen_global_systime_ns()
+ *
+ *	Return a global monotonic view of the system time in
+ *	nanoseconds, computed by the per-CPU Xen raw system time plus
+ *	an rdtsc adjustment, and advance the view of the system time
+ *	for all other CPUs.
+ */
+static uint64_t
+xen_global_systime_ns(void)
 {
-	int error, new_ticks;
-	struct sysctlnode node;
+	struct cpu_info *ci;
+	uint64_t local, global, result;
+	int bound;
 
-	new_ticks = xen_timepush_ticks;
-	node = *rnode;
-	node.sysctl_data = &new_ticks;
-	error = sysctl_lookup(SYSCTLFN_CALL(&node));
-	if (error || newp == NULL)
-		return error;
-
-	if (new_ticks < 0)
-		return EINVAL;
-	if (new_ticks != xen_timepush_ticks) {
-		xen_timepush_ticks = new_ticks;
-		if (new_ticks > 0)
-			callout_schedule(&xen_timepush_co, new_ticks);
-		else
-			callout_stop(&xen_timepush_co);
-	}
+	/*
+	 * Find the local timecount on this CPU, and make sure it does
+	 * not precede the latest global timecount witnessed so far by
+	 * any CPU.  If it does, add to the local CPU's skew from the
+	 * fastest CPU.
+	 *
+	 * XXX Can we avoid retrying if the CAS fails?
+	 */
+	bound = curlwp_bind();
+	ci = curcpu();
+	do {
+		local = xen_vcputime_systime_ns();
+		local += ci->ci_xen_systime_ns_skew;
+		global = xen_global_systime_ns_stamp;
+		if (__predict_false(local < global + 1)) {
+			result = global + 1;
+			ci->ci_xen_systime_ns_skew += global + 1 - local;
+		} else {
+			result = local;
+		}
+	} while (atomic_cas_64(&xen_global_systime_ns_stamp, global, result)
+	    != global);
+	KASSERT(ci == curcpu());
+	curlwp_bindx(bound);
 
-	return 0;
+	return result;
 }
-#endif
 
-/* ARGSUSED */
-u_int
+/*
+ * xen_get_timecount(tc)
+ *
+ *	Return the low 32 bits of a global monotonic view of the Xen
+ *	system time.
+ */
+static unsigned
 xen_get_timecount(struct timecounter *tc)
 {
-	uint64_t ns;
 
-	struct cpu_info *ci = curcpu();
-
-	mutex_enter(&tmutex);
-	ns = get_vcpu_time(ci) - xen_clock_bias[ci->ci_cpuid];
-	mutex_exit(&tmutex);
+	KASSERT(tc == &xen_timecounter);
 
-	return (u_int)ns;
+	return (unsigned)xen_global_systime_ns();
 }
 
-/* 
- * Needs to be called per-cpu, from the local cpu, since VIRQ_TIMER is
- * bound per-cpu
+/*
+ * xen_rtc_get(todr, tv)
+ *
+ *	Get the current real-time clock from the Xen wall clock time
+ *	and vCPU system time adjustment.
  */
+static int
+xen_rtc_get(struct todr_chip_handle *todr, struct timeval *tvp)
+{
+	struct timespec ts;
 
-static struct evcnt hardclock_called[MAXCPUS];
+	xen_wallclock_time(&ts);
+	TIMESPEC_TO_TIMEVAL(tvp, &ts);
 
-void
-xen_initclocks(void)
-{
-	int err __diagused;
-	static bool tcdone = false;
+	return 0;
+}
 
-	struct cpu_info *ci = curcpu();
-	volatile struct shadow *shadow = &ci_shadow[ci->ci_cpuid];
+/*
+ * xen_rtc_set(todr, tv)
+ *
+ *	Set the Xen wall clock time, if we can.
+ */
+static int
+xen_rtc_set(struct todr_chip_handle *todr, struct timeval *tvp)
+{
+#ifdef DOM0OPS
+	struct clock_ymdhms dt;
+	xen_platform_op_t op;
+	uint64_t systime_ns;
 
-	xen_clock_bias[ci->ci_cpuid] = 0;
+	if (xendomain_is_privileged()) {
+		/* Convert to ymdhms and set the x86 ISA RTC.  */
+		clock_secs_to_ymdhms(tvp->tv_sec, &dt);
+		rtc_set_ymdhms(NULL, &dt);
 
-	evcnt_attach_dynamic(&hardclock_called[ci->ci_cpuid],
-			     EVCNT_TYPE_INTR,
-			     NULL,
-			     device_xname(ci->ci_dev),
-			     "hardclock");
+		/* Get the global system time so we can preserve it.  */
+		systime_ns = xen_global_systime_ns();
 
-#ifdef DOM0OPS
-	if (!tcdone) { /* Do this only once */
-		callout_init(&xen_timepush_co, 0);
+		/* Set the hypervisor wall clock time.  */
+		op.cmd = XENPF_settime;
+		op.u.settime.secs = tvp->tv_sec;
+		op.u.settime.nsecs = tvp->tv_usec * 1000;
+		op.u.settime.system_time = systime_ns;
+		return HYPERVISOR_platform_op(&op);
 	}
 #endif
 
-	if (!tcdone) { /* Do this only once */
-		mutex_init(&tmutex, MUTEX_DEFAULT, IPL_CLOCK);
-	}
-	mutex_enter(&tmutex);
-	get_time_values_from_xen(ci);
-	vcpu_system_time[ci->ci_cpuid] = shadow->system_time;
-	mutex_exit(&tmutex);
-	if (!tcdone) { /* Do this only once */
-		tc_init(&xen_timecounter);
-	}
+	/* XXX Should this fail if not on privileged dom0?  */
+	return 0;
+}
 
-	/* The splhigh requirements start here. */
-	xen_resumeclocks(ci);
+/*
+ * xen_delay(n)
+ *
+ *	Wait approximately n microseconds.
+ */
+void
+xen_delay(unsigned n)
+{
+	int bound;
 
-	/*
-	 * The periodic timer looks buggy, we stop receiving events
-	 * after a while. Use the one-shot timer every NS_PER_TICK
-	 * and rearm it from the event handler.
-	 */
-	if (XEN_MAJOR(xen_version) > 3 || XEN_MINOR(xen_version) > 0) {
-		/* exists only on Xen 3.1 and later */
-		err = HYPERVISOR_vcpu_op(VCPUOP_stop_periodic_timer,
-					 ci->ci_cpuid,
-				 NULL);
-		KASSERT(err == 0);
-	}
+	/* Bind to the CPU so we don't compare tsc on different CPUs.  */
+	bound = curlwp_bind();
 
-	err = HYPERVISOR_set_timer_op(
-	    vcpu_system_time[ci->ci_cpuid] + NS_PER_TICK);
-	KASSERT(err == 0);
+	/* Short wait (<500ms) or long wait?  */
+	if (n < 500000) {
+		/*
+		 * Xen system time is not precise enough for short
+		 * delays, so use the tsc instead.
+		 *
+		 * We work with the current tsc frequency, and figure
+		 * that if it changes while we're delaying, we've
+		 * probably delayed long enough -- up to 500ms.
+		 *
+		 * We do not use cpu_frequency(ci), which uses a
+		 * quantity detected at boot time, and which may have
+		 * changed by now if Xen has migrated this vCPU to
+		 * another pCPU.
+		 *
+		 * XXX How long does it take to migrate pCPUs?
+		 */
+		volatile struct vcpu_time_info *vt;
+		struct xen_vcputime_ticket ticket;
+		uint64_t tsc_start, last_tsc, tsc;
+		uint32_t tsc_to_system_mul;
+		int8_t tsc_shift;
+
+		/* Get the starting tsc and tsc frequency.  */
+		do {
+			vt = xen_vcputime_enter(&ticket);
+			tsc_start = last_tsc = xen_rdtsc();
+			tsc_to_system_mul = vt->tsc_to_system_mul;
+			tsc_shift = vt->tsc_shift;
+		} while (!xen_vcputime_exit(vt, &ticket));
 
-#ifdef DOM0OPS
-	const struct sysctlnode *node = NULL;
+		/*
+		 * Wait until as many tsc ticks as there are in n
+		 * microseconds have elapsed, or the tsc has gone
+		 * backwards meaning we've probably migrated pCPUs.
+		 */
+		for (;;) {
+			tsc = xen_rdtsc();
+			if (__predict_false(tsc < last_tsc))
+				break;
+			if (xen_tsc_to_ns_delta(tsc - tsc_start,
+				tsc_to_system_mul, tsc_shift)/1000 >= n)
+				break;
+			last_tsc = tsc;
+		}
+	} else {
+		/*
+		 * Use the Xen system time for >=500us delays.  From my
+		 * testing, it seems to sometimes run backward by about
+		 * 110us, which is not so bad.
+		 */
+		uint64_t n_ns = 1000*(uint64_t)n;
+		uint64_t start_ns;
 
-	if (!tcdone) { /* Do this only once */
+		/* Get the start time.  */
+		start_ns = xen_vcputime_raw_systime_ns();
 
-		xen_timepush_ticks = 53 * hz + 3; /* avoid exact # of min/sec */
-		if (xendomain_is_privileged()) {
-			sysctl_createv(NULL, 0, NULL, &node, 0,
-			    CTLTYPE_NODE, "xen",
-			    SYSCTL_DESCR("Xen top level node"),
-			    NULL, 0, NULL, 0,
-			    CTL_MACHDEP, CTL_CREATE, CTL_EOL);
-			if (node != NULL) {
-				sysctl_createv(NULL, 0, &node, NULL,
-				    CTLFLAG_READWRITE, CTLTYPE_INT,
-				    "timepush_ticks",
-				    SYSCTL_DESCR("How often to update the "
-				    "hypervisor's time-of-day; 0 to disable"),
-				    sysctl_xen_timepush, 0,
-				    &xen_timepush_ticks, 0, 
-				    CTL_CREATE, CTL_EOL);
-			}
-			callout_reset(&xen_timepush_co, xen_timepush_ticks,
-			     &xen_timepush, &xen_timepush_co);
-		}
+		/* Wait until the system time has passed the end.  */
+		do {
+			HYPERVISOR_yield();
+		} while (xen_vcputime_raw_systime_ns() - start_ns < n_ns);
 	}
-#endif
-	tcdone = true;
+
+	/* Unbind from the CPU if we weren't already bound.  */
+	curlwp_bindx(bound);
 }
 
+/*
+ * xen_suspendclocks(ci)
+ *
+ *	Stop handling the Xen timer event on the CPU of ci.  Caller
+ *	must be running on and bound to ci's CPU.
+ *
+ *	Actually, caller must have kpreemption disabled, because that's
+ *	easier to assert at the moment.
+ */
 void
 xen_suspendclocks(struct cpu_info *ci)
 {
+	struct intrhand **ihp, *ih;
 	int evtch;
 
+	KASSERT(ci == curcpu());
+	KASSERT(kpreempt_disabled());
+
 	evtch = unbind_virq_from_evtch(VIRQ_TIMER);
 	KASSERT(evtch != -1);
 
 	hypervisor_mask_event(evtch);
+	ihp = percpu_getref(xen_timer_ih_percpu);
+	ih = *ihp;
+	KASSERT(ih != NULL);
 	intr_disestablish(ih);
+	*ihp = NULL;
+	percpu_putref(xen_timer_ih_percpu);
 
 	aprint_verbose("Xen clock: removed event channel %d\n", evtch);
+
+	/* We'd better not have switched CPUs.  */
+	KASSERT(ci == curcpu());
 }
 
+/*
+ * xen_resumeclocks(ci)
+ *
+ *	Start handling the Xen timer event on the CPU of ci.  Caller
+ *	must be running on and bound to ci's CPU.
+ *
+ *	Actually, caller must have kpreemption disabled, because that's
+ *	easier to assert at the moment.
+ */
 void
 xen_resumeclocks(struct cpu_info *ci)
 {
 	char intr_xname[INTRDEVNAMEBUF];
+	struct intrhand **ihp, *ih;
 	int evtch;
-       
+
+	KASSERT(ci == curcpu());
+	KASSERT(kpreempt_disabled());
+
 	evtch = bind_virq_to_evtch(VIRQ_TIMER);
 	KASSERT(evtch != -1);
 
 	snprintf(intr_xname, sizeof(intr_xname), "%s clock",
 	    device_xname(ci->ci_dev));
 
-	ih = intr_establish_xname(0, &xen_pic, evtch, IST_LEVEL, IPL_CLOCK,
-	    xen_timer_handler_stub, ci, true, intr_xname);
-
-	KASSERT(ih != NULL);
+	ihp = percpu_getref(xen_timer_ih_percpu);
+	KASSERT(*ihp == NULL);
+	/* XXX sketchy function pointer cast -- fix the API, please */
+	ih = intr_establish_xname(0, &xen_pic, evtch, IST_LEVEL,
+	    IPL_CLOCK, (int (*)(void *))xen_timer_handler, ci, true,
+	    intr_xname);
+	if (ih == NULL)
+		panic("failed to establish timer interrupt handler");
+	*ihp = ih;
+	percpu_putref(xen_timer_ih_percpu);
 
 	hypervisor_enable_event(evtch);
 
-	aprint_verbose("Xen %s: using event channel %d\n",
-	    intr_xname, evtch);
+	aprint_verbose("Xen %s: using event channel %d\n", intr_xname, evtch);
+
+	/* We'd better not have switched CPUs.  */
+	KASSERT(ci == curcpu());
 }
 
-/* ARGSUSED */
+/*
+ * xen_timer_handler(cookie, frame)
+ *
+ *	Periodic Xen timer event handler for NetBSD hardclock.  Calls
+ *	to this may get delayed, so we run hardclock as many times as
+ *	we need to in order to cover the Xen system time that elapsed.
+ *	After that, re-arm the timer to run again at the next tick.
+ *	The cookie is the pointer to struct cpu_info.
+ */
 static int
-xen_timer_handler(void *arg, struct intrframe *regs)
+xen_timer_handler(void *cookie, struct clockframe *frame)
 {
-	int64_t delta;
 	struct cpu_info *ci = curcpu();
+	uint64_t last, now, delta, next;
+	int error;
+
+	KASSERT(cpu_intr_p());
+	KASSERT(cookie == ci);
 
-	int err;
 again:
-	mutex_enter(&tmutex);
-	delta = (int64_t)(get_vcpu_time(ci) - vcpu_system_time[ci->ci_cpuid]);
-	mutex_exit(&tmutex);
-
-	/* Several ticks may have passed without our being run; catch up. */
-	while (delta >= (int64_t)NS_PER_TICK) {
-		mutex_enter(&tmutex);
-		vcpu_system_time[ci->ci_cpuid] += NS_PER_TICK;
-		xen_clock_bias[ci->ci_cpuid] = (delta -= NS_PER_TICK);
-		mutex_exit(&tmutex);
-		hardclock((struct clockframe *)regs);
-		hardclock_called[ci->ci_cpuid].ev_count++;
+	/*
+	 * Find how many nanoseconds of Xen system time has elapsed
+	 * since the last hardclock tick.
+	 */
+	last = ci->ci_xen_hardclock_systime_ns;
+	now = xen_vcputime_systime_ns();
+	if (now < last) {
+#if XEN_CLOCK_DEBUG		/* XXX dtrace hook */
+		printf("xen systime ran backwards in hardclock %"PRIu64"ns\n",
+		    last - now);
+#endif
+		ci->ci_xen_systime_backwards_hardclock_evcnt.ev_count++;
+		now = last;
 	}
+	delta = now - last;
 
 	/*
-	 * rearm the timer. If it fails it's probably because the date
-	 * is in the past, update our local time and try again.
-	 */
-	err = HYPERVISOR_set_timer_op(
-	    vcpu_system_time[ci->ci_cpuid] + NS_PER_TICK);
-	if (err)
-		goto again;
-	
-	if (xen_clock_bias[ci->ci_cpuid]) {
-		mutex_enter(&tmutex);
-		xen_clock_bias[ci->ci_cpuid] = 0;
-		mutex_exit(&tmutex);
+	 * Play hardclock catchup: run the hardclock timer as many
+	 * times as appears necessary based on how much time has
+	 * passed.
+	 */
+	while (delta >= NS_PER_TICK) {
+		ci->ci_xen_hardclock_systime_ns += NS_PER_TICK;
+		delta -= NS_PER_TICK;
+		hardclock(frame);
+		if (__predict_false(delta >= NS_PER_TICK))
+			ci->ci_xen_missed_hardclock_evcnt.ev_count++;
 	}
 
+	/*
+	 * Re-arm the timer.  If it fails, it's probably because the
+	 * time is in the past, so update our idea of what the Xen
+	 * system time is and try again.
+	 */
+	next = ci->ci_xen_hardclock_systime_ns + NS_PER_TICK;
+	error = HYPERVISOR_set_timer_op(next);
+	if (error)
+		goto again;
+
+	/* Success!  */
 	return 0;
 }
 
+/*
+ * xen_initclocks()
+ *
+ *	Initialize the Xen clocks on the current CPU.
+ */
 void
-setstatclockrate(int arg)
+xen_initclocks(void)
 {
+	struct cpu_info *ci = curcpu();
+	int error;
+
+	/* If this is the primary CPU, do global initialization first.  */
+	if (ci == &cpu_info_primary) {
+		/* Allocate the per-CPU interrupt handle array.  */
+		xen_timer_ih_percpu = percpu_alloc(sizeof(struct intrhand *));
+		KASSERT(xen_timer_ih_percpu != NULL);
+
+		/* Initialize the systemwide Xen timecounter.  */
+		tc_init(&xen_timecounter);
+
+#ifdef DOM0OPS
+		/*
+		 * If this is a privileged dom0, start pushing the wall
+		 * clock time back to the Xen hypervisor.
+		 */
+		if (xendomain_is_privileged())
+			xen_timepush_init();
+#endif
+	}
+
+	/* Pretend the last hardclock happened right now.  */
+	ci->ci_xen_hardclock_systime_ns = xen_vcputime_systime_ns();
+
+	/* Attach the event counters.  */
+	evcnt_attach_dynamic(&ci->ci_xen_cpu_tsc_backwards_evcnt,
+	    EVCNT_TYPE_INTR, NULL, device_xname(ci->ci_dev),
+	    "cpu tsc ran backwards");
+	evcnt_attach_dynamic(&ci->ci_xen_tsc_delta_negative_evcnt,
+	    EVCNT_TYPE_INTR, NULL, device_xname(ci->ci_dev),
+	    "tsc delta went negative");
+	evcnt_attach_dynamic(&ci->ci_xen_raw_systime_wraparound_evcnt,
+	    EVCNT_TYPE_INTR, NULL, device_xname(ci->ci_dev),
+	    "raw systime wrapped around");
+	evcnt_attach_dynamic(&ci->ci_xen_raw_systime_backwards_evcnt,
+	    EVCNT_TYPE_INTR, NULL, device_xname(ci->ci_dev),
+	    "raw systime went backwards");
+	evcnt_attach_dynamic(&ci->ci_xen_systime_backwards_hardclock_evcnt,
+	    EVCNT_TYPE_INTR, NULL, device_xname(ci->ci_dev),
+	    "systime went backwards in hardclock");
+	evcnt_attach_dynamic(&ci->ci_xen_missed_hardclock_evcnt,
+	    EVCNT_TYPE_INTR, NULL, device_xname(ci->ci_dev),
+	    "missed hardclock");
+
+	/* Disarm the periodic timer on Xen>=3.1 which is allegedly buggy.  */
+	if (XEN_MAJOR(xen_version) > 3 || XEN_MINOR(xen_version) > 0) {
+		error = HYPERVISOR_vcpu_op(VCPUOP_stop_periodic_timer,
+		    ci->ci_cpuid, NULL);
+		KASSERT(error == 0);
+	}
+
+	/* Arm the timer.  */
+	error = HYPERVISOR_set_timer_op(ci->ci_xen_hardclock_systime_ns +
+	    NS_PER_TICK);
+	KASSERT(error == 0);
+
+	/* Fire up the clocks.  */
+	xen_resumeclocks(ci);
 }
 
-void
-idle_block(void)
+#ifdef DOM0OPS
+
+/*
+ * xen_timepush_init()
+ *
+ *	Initialize callout to periodically set Xen hypervisor's wall
+ *	clock time.
+ */
+static void
+xen_timepush_init(void)
 {
-	KASSERT(curcpu()->ci_ipending == 0);
-	HYPERVISOR_block();
+	struct sysctllog *log = NULL;
+	const struct sysctlnode *node = NULL;
+	int error;
+
+	/* Start periodically updating the hypervisor's wall clock time.  */
+	callout_init(&xen_timepush.ch, 0);
+	callout_setfunc(&xen_timepush.ch, xen_timepush_intr, NULL);
+
+	/* Pick a default frequency for timepush.  */
+	xen_timepush.ticks = 53*hz + 3; /* avoid exact # of min/sec */
+
+	/* Create machdep.xen node.  */
+	/* XXX Creation of the `machdep.xen' node should be elsewhere.  */
+	error = sysctl_createv(&log, 0, NULL, &node, 0,
+	    CTLTYPE_NODE, "xen",
+	    SYSCTL_DESCR("Xen top level node"),
+	    NULL, 0, NULL, 0,
+	    CTL_MACHDEP, CTL_CREATE, CTL_EOL);
+	if (error)
+		goto fail;
+	KASSERT(node != NULL);
+
+	/* Create int timepush_ticks knob as a child of machdep.xen.  */
+	error = sysctl_createv(&log, 0, &node, NULL, CTLFLAG_READWRITE,
+	    CTLTYPE_INT, "timepush_ticks",
+	    SYSCTL_DESCR("How often to update the hypervisor's time-of-day;"
+		" 0 to disable"),
+	    sysctl_xen_timepush, 0, &xen_timepush.ticks, 0,
+	    CTL_CREATE, CTL_EOL);
+	if (error)
+		goto fail;
+
+	/* Start the timepush callout.  */
+	callout_schedule(&xen_timepush.ch, xen_timepush.ticks);
+
+	/* Success!  */
+	return;
+
+fail:	sysctl_teardown(&log);
 }
+
+/*
+ * xen_timepush_intr(cookie)
+ *
+ *	Callout interrupt handler to push NetBSD's idea of the wall
+ *	clock time, usually synchronized with NTP, back to the Xen
+ *	hypervisor.
+ */
+static void
+xen_timepush_intr(void *cookie)
+{
+
+	resettodr();
+	if (xen_timepush.ticks)
+		callout_schedule(&xen_timepush.ch, xen_timepush.ticks);
+}
+
+/*
+ * sysctl_xen_timepush(...)
+ *
+ *	Sysctl handler to set machdep.xen.timepush_ticks.
+ */
+static int
+sysctl_xen_timepush(SYSCTLFN_ARGS)
+{
+	struct sysctlnode node;
+	int ticks;
+	int error;
+
+	ticks = xen_timepush.ticks;
+	node = *rnode;
+	node.sysctl_data = &ticks;
+	error = sysctl_lookup(SYSCTLFN_CALL(&node));
+	if (error || newp == NULL)
+		return error;
+
+	if (ticks < 0)
+		return EINVAL;
+
+	if (ticks != xen_timepush.ticks) {
+		xen_timepush.ticks = ticks;
+
+		if (ticks == 0)
+			callout_stop(&xen_timepush.ch);
+		else
+			callout_schedule(&xen_timepush.ch, ticks);
+	}
+
+	return 0;
+}
+
+#endif	/* DOM0OPS */

CVS commit: src/sys/arch

Reply via email to