Module Name:    src
Committed By:   cherry
Date:           Fri Nov 18 06:01:50 UTC 2011

Modified Files:
        src/sys/arch/xen/xen: clock.c

Log Message:
[merging from cherry-xenmp]
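 - Make the Xen clock code MP-aware: the shadow time values, the
   per-tick system time and the clock bias are now kept per CPU and
   are protected by a mutex.
 - Bring in the fixes that bouyer@ made in:
   cvs rdiff -u -r1.54.6.4 -r1.54.6.5 src/sys/arch/xen/xen/clock.c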

Thanks to riz@ for testing on dom0.


To generate a diff of this commit:
cvs rdiff -u -r1.56 -r1.57 src/sys/arch/xen/xen/clock.c

Please note that diffs are not public domain; they are subject to the
copyright notices on the relevant files.

Modified files:

Index: src/sys/arch/xen/xen/clock.c
diff -u src/sys/arch/xen/xen/clock.c:1.56 src/sys/arch/xen/xen/clock.c:1.57
--- src/sys/arch/xen/xen/clock.c:1.56	Tue Sep 20 00:12:24 2011
+++ src/sys/arch/xen/xen/clock.c	Fri Nov 18 06:01:50 2011
@@ -1,4 +1,4 @@
-/*	$NetBSD: clock.c,v 1.56 2011/09/20 00:12:24 jym Exp $	*/
+/*	$NetBSD: clock.c,v 1.57 2011/11/18 06:01:50 cherry Exp $	*/
 
 /*
  *
@@ -29,7 +29,7 @@
 #include "opt_xen.h"
 
 #include <sys/cdefs.h>
-__KERNEL_RCSID(0, "$NetBSD: clock.c,v 1.56 2011/09/20 00:12:24 jym Exp $");
+__KERNEL_RCSID(0, "$NetBSD: clock.c,v 1.57 2011/11/18 06:01:50 cherry Exp $");
 
 #include <sys/param.h>
 #include <sys/systm.h>
@@ -43,6 +43,7 @@ __KERNEL_RCSID(0, "$NetBSD: clock.c,v 1.
 #include <xen/xen.h>
 #include <xen/hypervisor.h>
 #include <xen/evtchn.h>
+#include <xen/xen3-public/vcpu.h>
 #include <machine/cpu_counter.h>
 
 #include <dev/clock_subr.h>
@@ -66,22 +67,32 @@ static struct timecounter xen_timecounte
 };
 
 /* These are periodically updated in shared_info, and then copied here. */
-static volatile uint64_t shadow_tsc_stamp;
-static volatile uint64_t shadow_system_time;
-static volatile unsigned long shadow_time_version; /* XXXSMP */
-static volatile uint32_t shadow_freq_mul;
-static volatile int8_t shadow_freq_shift;
-static volatile struct timespec shadow_ts;
+struct shadow {
+	uint64_t tsc_stamp;
+	uint64_t system_time;
+	unsigned long time_version; /* XXXSMP */
+	uint32_t freq_mul;
+	int8_t freq_shift;
+	struct timespec ts;
+};
+
+/* Protects volatile variables ci_shadow & xen_clock_bias */
+static kmutex_t tmutex;
 
-/* The time when the last hardclock(9) call should have taken place. */
-static volatile uint64_t processed_system_time;
+/* Per CPU shadow time values */
+static volatile struct shadow ci_shadow[MAXCPUS];
+
+/* The time when the last hardclock(9) call should have taken place,
+ * per cpu.
+ */
+static volatile uint64_t vcpu_system_time[MAXCPUS];
 
 /*
  * The clock (as returned by xen_get_timecount) may need to be held
  * back to maintain the illusion that hardclock(9) was called when it
  * was supposed to be, not when Xen got around to scheduling us.
  */
-static volatile uint64_t xen_clock_bias = 0;
+static volatile uint64_t xen_clock_bias[MAXCPUS];
 
 #ifdef DOM0OPS
 /* If we're dom0, send our time to Xen every minute or so. */
@@ -96,25 +107,30 @@ static callout_t xen_timepush_co;
  * area.  Must be called at splhigh (per timecounter requirements).
  */
 static void
-get_time_values_from_xen(void)
+get_time_values_from_xen(struct cpu_info *ci)
 {
-	volatile struct vcpu_time_info *t = &curcpu()->ci_vcpu->time;
+
+	volatile struct shadow *shadow = &ci_shadow[ci->ci_cpuid];
+
+	volatile struct vcpu_time_info *t = &ci->ci_vcpu->time;
 	uint32_t tversion;
 
+	KASSERT(mutex_owned(&tmutex));
+
 	do {
-		shadow_time_version = t->version;
+		shadow->time_version = t->version;
 		xen_rmb();
-		shadow_tsc_stamp = t->tsc_timestamp;
-		shadow_system_time = t->system_time;
-		shadow_freq_mul = t->tsc_to_system_mul;
-		shadow_freq_shift = t->tsc_shift;
+		shadow->tsc_stamp = t->tsc_timestamp;
+		shadow->system_time = t->system_time;
+		shadow->freq_mul = t->tsc_to_system_mul;
+		shadow->freq_shift = t->tsc_shift;
 		xen_rmb();
-	} while ((t->version & 1) || (shadow_time_version != t->version));
+	} while ((t->version & 1) || (shadow->time_version != t->version));
 	do {
 		tversion = HYPERVISOR_shared_info->wc_version;
 		xen_rmb();
-		shadow_ts.tv_sec = HYPERVISOR_shared_info->wc_sec;
-		shadow_ts.tv_nsec = HYPERVISOR_shared_info->wc_nsec;
+		shadow->ts.tv_sec = HYPERVISOR_shared_info->wc_sec;
+		shadow->ts.tv_nsec = HYPERVISOR_shared_info->wc_nsec;
 		xen_rmb();
 	} while ((HYPERVISOR_shared_info->wc_version & 1) ||
 	    (tversion != HYPERVISOR_shared_info->wc_version));
@@ -124,12 +140,17 @@ get_time_values_from_xen(void)
  * Are the values we have up to date?
  */
 static inline int
-time_values_up_to_date(void)
+time_values_up_to_date(struct cpu_info *ci)
 {
 	int rv;
 
+	volatile struct shadow *shadow = &ci_shadow[ci->ci_cpuid];
+
+	KASSERT(ci != NULL);
+	KASSERT(mutex_owned(&tmutex));
+
 	xen_rmb();
-	rv = shadow_time_version == curcpu()->ci_vcpu->time.version;
+	rv = shadow->time_version == ci->ci_vcpu->time.version;
 	xen_rmb();
 
 	return rv;
@@ -164,52 +185,40 @@ scale_delta(uint64_t delta, uint32_t mul
  * Must be called at splhigh (per timecounter requirements).
  */
 static uint64_t
-get_tsc_offset_ns(void)
+get_tsc_offset_ns(struct cpu_info *ci)
 {
 	uint64_t tsc_delta, offset;
+	volatile struct shadow *shadow = &ci_shadow[ci->ci_cpuid];
 
-	tsc_delta = cpu_counter() - shadow_tsc_stamp;
-	offset = scale_delta(tsc_delta, shadow_freq_mul,
-	    shadow_freq_shift);
-#ifdef XEN_CLOCK_DEBUG
-	if (tsc_delta > 100000000000ULL || offset > 10000000000ULL)
-		printf("get_tsc_offset_ns: tsc_delta=%llu offset=%llu"
-		    " pst=%llu sst=%llu\n", tsc_delta, offset,
-		    processed_system_time, shadow_system_time);
-#endif
+	KASSERT(mutex_owned(&tmutex));
+	tsc_delta = cpu_counter() - shadow->tsc_stamp;
+	offset = scale_delta(tsc_delta, shadow->freq_mul,
+	    shadow->freq_shift);
 
 	return offset;
 }
 
 /*
- * Returns the current system_time, taking care that the timestamp
- * used is valid for the TSC measurement in question.  Xen2 doesn't
- * ensure that this won't step backwards, so we enforce monotonicity
- * on our own in that case.  Must be called at splhigh.
+ * Returns the current system_time on given vcpu, taking care that the
+ * timestamp used is valid for the TSC measurement in question.  Xen2
+ * doesn't ensure that this won't step backwards, so we enforce
+ * monotonicity on our own in that case.  Must be called at splhigh.
  */
 static uint64_t
-get_system_time(void)
+get_vcpu_time(struct cpu_info *ci)
 {
 	uint64_t offset, stime;
+	volatile struct shadow *shadow = &ci_shadow[ci->ci_cpuid];
 	
-	for (;;) {
-		offset = get_tsc_offset_ns();
-		stime = shadow_system_time + offset;
 		
+	KASSERT(mutex_owned(&tmutex));
+	do {
+		get_time_values_from_xen(ci);
+		offset = get_tsc_offset_ns(ci);
+		stime = shadow->system_time + offset;
 		/* if the timestamp went stale before we used it, refresh */
-		if (time_values_up_to_date()) {
-			/*
-			 * Work around an intermittent Xen2 bug where, for
-			 * a period of 1<<32 ns, currently running domains
-			 * don't get their timer events as usual (and also
-			 * aren't preempted in favor of other runnable
-			 * domains).  Setting the timer into the past in
-			 * this way causes it to fire immediately.
-			 */
-			break;
-		}
-		get_time_values_from_xen();
-	}
+
+	} while (!time_values_up_to_date(ci));
 
 	return stime;
 }
@@ -218,16 +227,22 @@ static void
 xen_wall_time(struct timespec *wt)
 {
 	uint64_t nsec;
-	int s;
 
-	s = splhigh();
-	get_time_values_from_xen();
-	*wt = shadow_ts;
-	nsec = wt->tv_nsec;
-
-	/* Under Xen3, this is the wall time less system time */
-	nsec += get_system_time();
-	splx(s);
+	struct cpu_info *ci = curcpu();
+	volatile struct shadow *shadow = &ci_shadow[ci->ci_cpuid];
+
+	mutex_enter(&tmutex);
+	do {
+		/*
+		 * Under Xen3, shadow->ts is the wall time less system time
+		 * get_vcpu_time() will update shadow
+		 */
+		nsec = get_vcpu_time(curcpu());
+		*wt = shadow->ts;
+		nsec += wt->tv_nsec;
+	} while (!time_values_up_to_date(ci));
+	mutex_exit(&tmutex);
+
 	wt->tv_sec += nsec / 1000000000L;
 	wt->tv_nsec = nsec % 1000000000L;
 }
@@ -253,8 +268,6 @@ xen_rtc_set(todr_chip_handle_t todr, str
 #else
 	xen_platform_op_t op;
 #endif
-	int s;
-
 	if (xendomain_is_privileged()) {
  		/* needs to set the RTC chip too */
  		struct clock_ymdhms dt;
@@ -269,9 +282,9 @@ xen_rtc_set(todr_chip_handle_t todr, str
 		/* XXX is rtc_offset handled correctly everywhere? */
 		op.u.settime.secs	 = tvp->tv_sec;
 		op.u.settime.nsecs	 = tvp->tv_usec * 1000;
-		s = splhigh();
-		op.u.settime.system_time = get_system_time();
-		splx(s);
+		mutex_enter(&tmutex);
+		op.u.settime.system_time = get_vcpu_time(curcpu());
+		mutex_exit(&tmutex);
 #if __XEN_INTERFACE_VERSION__ < 0x00030204
 		return HYPERVISOR_dom0_op(&op);
 #else
@@ -300,14 +313,16 @@ startrtclock(void)
 void
 xen_delay(unsigned int n)
 {
+	struct cpu_info *ci = curcpu();
+	volatile struct shadow *shadow = &ci_shadow[ci->ci_cpuid];
+
 	if (n < 500000) {
 		/*
-		 * shadow_system_time is updated every hz tick, it's not
+		 * shadow->system_time is updated every hz tick, it's not
 		 * precise enough for short delays. Use the CPU counter
 		 * instead. We assume it's working at this point.
 		 */
 		uint64_t cc, cc2, when;
-		struct cpu_info *ci = curcpu();
 
 		cc = cpu_counter();
 		when = cc + (uint64_t)n * cpu_frequency(ci) / 1000000LL;
@@ -324,18 +339,18 @@ xen_delay(unsigned int n)
 		return;
 	} else {
 		uint64_t when;
-		int s;
-		/* for large delays, shadow_system_time is OK */
-		
-		s = splhigh();
-		get_time_values_from_xen();
-		when = shadow_system_time + n * 1000;
-		while (shadow_system_time < when) {
-			splx(s);
-			s = splhigh();
-			get_time_values_from_xen();
+
+		/* for large delays, shadow->system_time is OK */
+		mutex_enter(&tmutex);
+		get_time_values_from_xen(ci);
+		when = shadow->system_time + n * 1000;
+		while (shadow->system_time < when) {
+			mutex_exit(&tmutex);
+			HYPERVISOR_yield();
+			mutex_enter(&tmutex);
+			get_time_values_from_xen(ci);
 		}
-		splx(s);
+		mutex_exit(&tmutex);
 	}
 }
 
@@ -380,56 +395,99 @@ sysctl_xen_timepush(SYSCTLFN_ARGS)
 #endif
 
 /* ARGSUSED */
+/* SMP note: Timecounter uses vcpu0's clock */
 u_int
 xen_get_timecount(struct timecounter *tc)
 {
 	uint64_t ns;
-	int s;
-	
-	s = splhigh();
-	ns = get_system_time() - xen_clock_bias;
-	splx(s);
+
+	struct cpu_info *ci = curcpu();
+
+	mutex_enter(&tmutex);
+	ns = get_vcpu_time(ci) - xen_clock_bias[ci->ci_cpuid];
+	mutex_exit(&tmutex);
 
 	return (u_int)ns;
 }
 
+/* 
+ * Needs to be called per-cpu, from the local cpu, since VIRQ_TIMER is
+ * bound per-cpu
+ */
+
+static struct evcnt hardclock_called[MAXCPUS];
+
 void
 xen_initclocks(void)
 {
+	int err, evtch;
+	static bool tcdone = false;
+
+	struct cpu_info *ci = curcpu();
+	volatile struct shadow *shadow = &ci_shadow[ci->ci_cpuid];
+
+	xen_clock_bias[ci->ci_cpuid] = 0;
+
+	evcnt_attach_dynamic(&hardclock_called[ci->ci_cpuid],
+			     EVCNT_TYPE_INTR,
+			     NULL,
+			     device_xname(ci->ci_dev),
+			     "hardclock");
 
 #ifdef DOM0OPS
-	callout_init(&xen_timepush_co, 0);
+	if (!tcdone) { /* Do this only once */
+		callout_init(&xen_timepush_co, 0);
+	}
 #endif
+	evtch = bind_virq_to_evtch(VIRQ_TIMER);
+	aprint_verbose("Xen clock: using event channel %d\n", evtch);
 
-	get_time_values_from_xen();
-	processed_system_time = shadow_system_time;
-	tc_init(&xen_timecounter);
-
+	if (!tcdone) { /* Do this only once */
+		mutex_init(&tmutex, MUTEX_DEFAULT, IPL_CLOCK);
+	}
+	mutex_enter(&tmutex);
+	get_time_values_from_xen(ci);
+	vcpu_system_time[ci->ci_cpuid] = shadow->system_time;
+	mutex_exit(&tmutex);
+	if (!tcdone) { /* Do this only once */
+		tc_init(&xen_timecounter);
+	}
 	/* The splhigh requirements start here. */
-	xen_resumeclocks();
 
-#ifdef DOM0OPS
-	xen_timepush_ticks = 53 * hz + 3; /* avoid exact # of min/sec */
-	if (xendomain_is_privileged()) {
-		const struct sysctlnode *node = NULL;
+	/*
+	 * The periodic timer looks buggy, we stop receiving events
+	 * after a while. Use the one-shot timer every NS_PER_TICK
+	 * and rearm it from the event handler.
+	 */
+	err = HYPERVISOR_vcpu_op(VCPUOP_stop_periodic_timer,
+				 ci->ci_cpuid,
+				 NULL);
+
+	KASSERT(err == 0);
+	err = HYPERVISOR_set_timer_op(
+	    vcpu_system_time[ci->ci_cpuid] + NS_PER_TICK);
+	KASSERT(err == 0);
 
-		sysctl_createv(NULL, 0, NULL, &node, 0,
-		    CTLTYPE_NODE, "xen",
-		    SYSCTL_DESCR("Xen top level node"),
-		    NULL, 0, NULL, 0,
-		    CTL_MACHDEP, CTL_CREATE, CTL_EOL);
-
-		sysctl_createv(NULL, 0, &node, NULL, CTLFLAG_READWRITE,
-		    CTLTYPE_INT, "timepush_ticks",
-		    SYSCTL_DESCR("How often to update the hypervisor's "
-		    " time-of-day; 0 to disable"),
-		    sysctl_xen_timepush, 0, &xen_timepush_ticks, 0, 
-		    CTL_CREATE, CTL_EOL);
+	event_set_handler(evtch, (int (*)(void *))xen_timer_handler,
+	    ci, IPL_CLOCK, "clock");
+	hypervisor_enable_event(evtch);
+
+#ifdef DOM0OPS
+	if (!tcdone) { /* Do this only once */
 
-		callout_reset(&xen_timepush_co, xen_timepush_ticks,
-		    &xen_timepush, &xen_timepush_co);
+		xen_timepush_ticks = 53 * hz + 3; /* avoid exact # of min/sec */
+		if (xendomain_is_privileged()) {
+			sysctl_createv(NULL, 0, NULL, NULL, CTLFLAG_READWRITE,
+			    CTLTYPE_INT, "xen_timepush_ticks", SYSCTL_DESCR("How often"
+			     " to update the hypervisor's time-of-day; 0 to disable"),
+			     sysctl_xen_timepush, 0, &xen_timepush_ticks, 0, 
+			     CTL_MACHDEP, CTL_CREATE, CTL_EOL);
+			callout_reset(&xen_timepush_co, xen_timepush_ticks,
+			     &xen_timepush, &xen_timepush_co);
+		}
 	}
 #endif
+	tcdone = true;
 }
 
 void
@@ -466,39 +524,38 @@ static int
 xen_timer_handler(void *arg, struct intrframe *regs)
 {
 	int64_t delta;
-	int s, ticks_done;
+	struct cpu_info *ci = curcpu();
+	KASSERT(arg == ci);
+	int err;
+again:
+	mutex_enter(&tmutex);
+	delta = (int64_t)(get_vcpu_time(ci) - vcpu_system_time[ci->ci_cpuid]);
+	mutex_exit(&tmutex);
 
-	s = splhigh();
-#if 0
-	get_time_values_from_xen();
-#endif
-	delta = (int64_t)(get_system_time() - processed_system_time);
-	splx(s);
-
-	ticks_done = 0;
 	/* Several ticks may have passed without our being run; catch up. */
 	while (delta >= (int64_t)NS_PER_TICK) {
-		++ticks_done;
-		s = splhigh();
-		processed_system_time += NS_PER_TICK;
-		xen_clock_bias = (delta -= NS_PER_TICK);
-		splx(s);
+		mutex_enter(&tmutex);
+		vcpu_system_time[ci->ci_cpuid] += NS_PER_TICK;
+		xen_clock_bias[ci->ci_cpuid] = (delta -= NS_PER_TICK);
+		mutex_exit(&tmutex);
 		hardclock((struct clockframe *)regs);
-	}
-	
-	if (xen_clock_bias) {
-		s = splhigh();
- 		xen_clock_bias = 0;
-		splx(s);
+		hardclock_called[ci->ci_cpuid].ev_count++;
 	}
 
 	/*
-	 * Re-arm the timer here, if needed; Xen's auto-ticking while runnable
-	 * is useful only for HZ==100, and even then may be out of phase with
-	 * the processed_system_time steps.
+	 * rearm the timer. If it fails it's probably because the date
+	 * is in the past, update our local time and try again.
 	 */
-	if (ticks_done != 0)
-		HYPERVISOR_set_timer_op(processed_system_time + NS_PER_TICK);
+	err = HYPERVISOR_set_timer_op(
+	    vcpu_system_time[ci->ci_cpuid] + NS_PER_TICK);
+	if (err)
+		goto again;
+	
+	if (xen_clock_bias[ci->ci_cpuid]) {
+		mutex_enter(&tmutex);
+		xen_clock_bias[ci->ci_cpuid] = 0;
+		mutex_exit(&tmutex);
+	}
 
 	return 0;
 }
@@ -511,17 +568,6 @@ setstatclockrate(int arg)
 void
 idle_block(void)
 {
-	int r;
-
-	/*
-	 * We set the timer to when we expect the next timer
-	 * interrupt.  We could set the timer to later if we could
-	 * easily find out when we will have more work (callouts) to
-	 * process from hardclock.
-	 */
-	r = HYPERVISOR_set_timer_op(processed_system_time + NS_PER_TICK);
-	if (r == 0)
-		HYPERVISOR_block();
-	else
-		__sti();
+	KASSERT(curcpu()->ci_ipending == 0);
+	HYPERVISOR_block();
 }

Reply via email to