Module Name: src
Committed By: cherry
Date: Fri Nov 18 06:01:50 UTC 2011
Modified Files:
src/sys/arch/xen/xen: clock.c
Log Message:
[merging from cherry-xenmp]
- Make clock MP aware.
- Bring in the fixes that bouyer@ made in:
cvs rdiff -u -r1.54.6.4 -r1.54.6.5 src/sys/arch/xen/xen/clock.c
Thanks to riz@ for testing on dom0.
To generate a diff of this commit:
cvs rdiff -u -r1.56 -r1.57 src/sys/arch/xen/xen/clock.c
Please note that diffs are not public domain; they are subject to the
copyright notices on the relevant files.
Modified files:
Index: src/sys/arch/xen/xen/clock.c
diff -u src/sys/arch/xen/xen/clock.c:1.56 src/sys/arch/xen/xen/clock.c:1.57
--- src/sys/arch/xen/xen/clock.c:1.56 Tue Sep 20 00:12:24 2011
+++ src/sys/arch/xen/xen/clock.c Fri Nov 18 06:01:50 2011
@@ -1,4 +1,4 @@
-/* $NetBSD: clock.c,v 1.56 2011/09/20 00:12:24 jym Exp $ */
+/* $NetBSD: clock.c,v 1.57 2011/11/18 06:01:50 cherry Exp $ */
/*
*
@@ -29,7 +29,7 @@
#include "opt_xen.h"
#include <sys/cdefs.h>
-__KERNEL_RCSID(0, "$NetBSD: clock.c,v 1.56 2011/09/20 00:12:24 jym Exp $");
+__KERNEL_RCSID(0, "$NetBSD: clock.c,v 1.57 2011/11/18 06:01:50 cherry Exp $");
#include <sys/param.h>
#include <sys/systm.h>
@@ -43,6 +43,7 @@ __KERNEL_RCSID(0, "$NetBSD: clock.c,v 1.
#include <xen/xen.h>
#include <xen/hypervisor.h>
#include <xen/evtchn.h>
+#include <xen/xen3-public/vcpu.h>
#include <machine/cpu_counter.h>
#include <dev/clock_subr.h>
@@ -66,22 +67,32 @@ static struct timecounter xen_timecounte
};
/* These are periodically updated in shared_info, and then copied here. */
-static volatile uint64_t shadow_tsc_stamp;
-static volatile uint64_t shadow_system_time;
-static volatile unsigned long shadow_time_version; /* XXXSMP */
-static volatile uint32_t shadow_freq_mul;
-static volatile int8_t shadow_freq_shift;
-static volatile struct timespec shadow_ts;
+struct shadow {
+ uint64_t tsc_stamp;
+ uint64_t system_time;
+ unsigned long time_version; /* XXXSMP */
+ uint32_t freq_mul;
+ int8_t freq_shift;
+ struct timespec ts;
+};
+
+/* Protects volatile variables ci_shadow & xen_clock_bias */
+static kmutex_t tmutex;
-/* The time when the last hardclock(9) call should have taken place. */
-static volatile uint64_t processed_system_time;
+/* Per CPU shadow time values */
+static volatile struct shadow ci_shadow[MAXCPUS];
+
+/* The time when the last hardclock(9) call should have taken place,
+ * per cpu.
+ */
+static volatile uint64_t vcpu_system_time[MAXCPUS];
/*
* The clock (as returned by xen_get_timecount) may need to be held
* back to maintain the illusion that hardclock(9) was called when it
* was supposed to be, not when Xen got around to scheduling us.
*/
-static volatile uint64_t xen_clock_bias = 0;
+static volatile uint64_t xen_clock_bias[MAXCPUS];
#ifdef DOM0OPS
/* If we're dom0, send our time to Xen every minute or so. */
@@ -96,25 +107,30 @@ static callout_t xen_timepush_co;
* area. Must be called at splhigh (per timecounter requirements).
*/
static void
-get_time_values_from_xen(void)
+get_time_values_from_xen(struct cpu_info *ci)
{
- volatile struct vcpu_time_info *t = &curcpu()->ci_vcpu->time;
+
+ volatile struct shadow *shadow = &ci_shadow[ci->ci_cpuid];
+
+ volatile struct vcpu_time_info *t = &ci->ci_vcpu->time;
uint32_t tversion;
+ KASSERT(mutex_owned(&tmutex));
+
do {
- shadow_time_version = t->version;
+ shadow->time_version = t->version;
xen_rmb();
- shadow_tsc_stamp = t->tsc_timestamp;
- shadow_system_time = t->system_time;
- shadow_freq_mul = t->tsc_to_system_mul;
- shadow_freq_shift = t->tsc_shift;
+ shadow->tsc_stamp = t->tsc_timestamp;
+ shadow->system_time = t->system_time;
+ shadow->freq_mul = t->tsc_to_system_mul;
+ shadow->freq_shift = t->tsc_shift;
xen_rmb();
- } while ((t->version & 1) || (shadow_time_version != t->version));
+ } while ((t->version & 1) || (shadow->time_version != t->version));
do {
tversion = HYPERVISOR_shared_info->wc_version;
xen_rmb();
- shadow_ts.tv_sec = HYPERVISOR_shared_info->wc_sec;
- shadow_ts.tv_nsec = HYPERVISOR_shared_info->wc_nsec;
+ shadow->ts.tv_sec = HYPERVISOR_shared_info->wc_sec;
+ shadow->ts.tv_nsec = HYPERVISOR_shared_info->wc_nsec;
xen_rmb();
} while ((HYPERVISOR_shared_info->wc_version & 1) ||
(tversion != HYPERVISOR_shared_info->wc_version));
@@ -124,12 +140,17 @@ get_time_values_from_xen(void)
* Are the values we have up to date?
*/
static inline int
-time_values_up_to_date(void)
+time_values_up_to_date(struct cpu_info *ci)
{
int rv;
+ volatile struct shadow *shadow = &ci_shadow[ci->ci_cpuid];
+
+ KASSERT(ci != NULL);
+ KASSERT(mutex_owned(&tmutex));
+
xen_rmb();
- rv = shadow_time_version == curcpu()->ci_vcpu->time.version;
+ rv = shadow->time_version == ci->ci_vcpu->time.version;
xen_rmb();
return rv;
@@ -164,52 +185,40 @@ scale_delta(uint64_t delta, uint32_t mul
* Must be called at splhigh (per timecounter requirements).
*/
static uint64_t
-get_tsc_offset_ns(void)
+get_tsc_offset_ns(struct cpu_info *ci)
{
uint64_t tsc_delta, offset;
+ volatile struct shadow *shadow = &ci_shadow[ci->ci_cpuid];
- tsc_delta = cpu_counter() - shadow_tsc_stamp;
- offset = scale_delta(tsc_delta, shadow_freq_mul,
- shadow_freq_shift);
-#ifdef XEN_CLOCK_DEBUG
- if (tsc_delta > 100000000000ULL || offset > 10000000000ULL)
- printf("get_tsc_offset_ns: tsc_delta=%llu offset=%llu"
- " pst=%llu sst=%llu\n", tsc_delta, offset,
- processed_system_time, shadow_system_time);
-#endif
+ KASSERT(mutex_owned(&tmutex));
+ tsc_delta = cpu_counter() - shadow->tsc_stamp;
+ offset = scale_delta(tsc_delta, shadow->freq_mul,
+ shadow->freq_shift);
return offset;
}
/*
- * Returns the current system_time, taking care that the timestamp
- * used is valid for the TSC measurement in question. Xen2 doesn't
- * ensure that this won't step backwards, so we enforce monotonicity
- * on our own in that case. Must be called at splhigh.
+ * Returns the current system_time on given vcpu, taking care that the
+ * timestamp used is valid for the TSC measurement in question. Xen2
+ * doesn't ensure that this won't step backwards, so we enforce
+ * monotonicity on our own in that case. Must be called at splhigh.
*/
static uint64_t
-get_system_time(void)
+get_vcpu_time(struct cpu_info *ci)
{
uint64_t offset, stime;
+ volatile struct shadow *shadow = &ci_shadow[ci->ci_cpuid];
- for (;;) {
- offset = get_tsc_offset_ns();
- stime = shadow_system_time + offset;
+ KASSERT(mutex_owned(&tmutex));
+ do {
+ get_time_values_from_xen(ci);
+ offset = get_tsc_offset_ns(ci);
+ stime = shadow->system_time + offset;
/* if the timestamp went stale before we used it, refresh */
- if (time_values_up_to_date()) {
- /*
- * Work around an intermittent Xen2 bug where, for
- * a period of 1<<32 ns, currently running domains
- * don't get their timer events as usual (and also
- * aren't preempted in favor of other runnable
- * domains). Setting the timer into the past in
- * this way causes it to fire immediately.
- */
- break;
- }
- get_time_values_from_xen();
- }
+
+ } while (!time_values_up_to_date(ci));
return stime;
}
@@ -218,16 +227,22 @@ static void
xen_wall_time(struct timespec *wt)
{
uint64_t nsec;
- int s;
- s = splhigh();
- get_time_values_from_xen();
- *wt = shadow_ts;
- nsec = wt->tv_nsec;
-
- /* Under Xen3, this is the wall time less system time */
- nsec += get_system_time();
- splx(s);
+ struct cpu_info *ci = curcpu();
+ volatile struct shadow *shadow = &ci_shadow[ci->ci_cpuid];
+
+ mutex_enter(&tmutex);
+ do {
+ /*
+ * Under Xen3, shadow->ts is the wall time less system time
+ * get_vcpu_time() will update shadow
+ */
+ nsec = get_vcpu_time(curcpu());
+ *wt = shadow->ts;
+ nsec += wt->tv_nsec;
+ } while (!time_values_up_to_date(ci));
+ mutex_exit(&tmutex);
+
wt->tv_sec += nsec / 1000000000L;
wt->tv_nsec = nsec % 1000000000L;
}
@@ -253,8 +268,6 @@ xen_rtc_set(todr_chip_handle_t todr, str
#else
xen_platform_op_t op;
#endif
- int s;
-
if (xendomain_is_privileged()) {
/* needs to set the RTC chip too */
struct clock_ymdhms dt;
@@ -269,9 +282,9 @@ xen_rtc_set(todr_chip_handle_t todr, str
/* XXX is rtc_offset handled correctly everywhere? */
op.u.settime.secs = tvp->tv_sec;
op.u.settime.nsecs = tvp->tv_usec * 1000;
- s = splhigh();
- op.u.settime.system_time = get_system_time();
- splx(s);
+ mutex_enter(&tmutex);
+ op.u.settime.system_time = get_vcpu_time(curcpu());
+ mutex_exit(&tmutex);
#if __XEN_INTERFACE_VERSION__ < 0x00030204
return HYPERVISOR_dom0_op(&op);
#else
@@ -300,14 +313,16 @@ startrtclock(void)
void
xen_delay(unsigned int n)
{
+ struct cpu_info *ci = curcpu();
+ volatile struct shadow *shadow = &ci_shadow[ci->ci_cpuid];
+
if (n < 500000) {
/*
- * shadow_system_time is updated every hz tick, it's not
+ * shadow->system_time is updated every hz tick, it's not
* precise enough for short delays. Use the CPU counter
* instead. We assume it's working at this point.
*/
uint64_t cc, cc2, when;
- struct cpu_info *ci = curcpu();
cc = cpu_counter();
when = cc + (uint64_t)n * cpu_frequency(ci) / 1000000LL;
@@ -324,18 +339,18 @@ xen_delay(unsigned int n)
return;
} else {
uint64_t when;
- int s;
- /* for large delays, shadow_system_time is OK */
-
- s = splhigh();
- get_time_values_from_xen();
- when = shadow_system_time + n * 1000;
- while (shadow_system_time < when) {
- splx(s);
- s = splhigh();
- get_time_values_from_xen();
+
+ /* for large delays, shadow->system_time is OK */
+ mutex_enter(&tmutex);
+ get_time_values_from_xen(ci);
+ when = shadow->system_time + n * 1000;
+ while (shadow->system_time < when) {
+ mutex_exit(&tmutex);
+ HYPERVISOR_yield();
+ mutex_enter(&tmutex);
+ get_time_values_from_xen(ci);
}
- splx(s);
+ mutex_exit(&tmutex);
}
}
@@ -380,56 +395,99 @@ sysctl_xen_timepush(SYSCTLFN_ARGS)
#endif
/* ARGSUSED */
+/* SMP note: Timecounter uses vcpu0's clock */
u_int
xen_get_timecount(struct timecounter *tc)
{
uint64_t ns;
- int s;
-
- s = splhigh();
- ns = get_system_time() - xen_clock_bias;
- splx(s);
+
+ struct cpu_info *ci = curcpu();
+
+ mutex_enter(&tmutex);
+ ns = get_vcpu_time(ci) - xen_clock_bias[ci->ci_cpuid];
+ mutex_exit(&tmutex);
return (u_int)ns;
}
+/*
+ * Needs to be called per-cpu, from the local cpu, since VIRQ_TIMER is
+ * bound per-cpu
+ */
+
+static struct evcnt hardclock_called[MAXCPUS];
+
void
xen_initclocks(void)
{
+ int err, evtch;
+ static bool tcdone = false;
+
+ struct cpu_info *ci = curcpu();
+ volatile struct shadow *shadow = &ci_shadow[ci->ci_cpuid];
+
+ xen_clock_bias[ci->ci_cpuid] = 0;
+
+ evcnt_attach_dynamic(&hardclock_called[ci->ci_cpuid],
+ EVCNT_TYPE_INTR,
+ NULL,
+ device_xname(ci->ci_dev),
+ "hardclock");
#ifdef DOM0OPS
- callout_init(&xen_timepush_co, 0);
+ if (!tcdone) { /* Do this only once */
+ callout_init(&xen_timepush_co, 0);
+ }
#endif
+ evtch = bind_virq_to_evtch(VIRQ_TIMER);
+ aprint_verbose("Xen clock: using event channel %d\n", evtch);
- get_time_values_from_xen();
- processed_system_time = shadow_system_time;
- tc_init(&xen_timecounter);
-
+ if (!tcdone) { /* Do this only once */
+ mutex_init(&tmutex, MUTEX_DEFAULT, IPL_CLOCK);
+ }
+ mutex_enter(&tmutex);
+ get_time_values_from_xen(ci);
+ vcpu_system_time[ci->ci_cpuid] = shadow->system_time;
+ mutex_exit(&tmutex);
+ if (!tcdone) { /* Do this only once */
+ tc_init(&xen_timecounter);
+ }
/* The splhigh requirements start here. */
- xen_resumeclocks();
-#ifdef DOM0OPS
- xen_timepush_ticks = 53 * hz + 3; /* avoid exact # of min/sec */
- if (xendomain_is_privileged()) {
- const struct sysctlnode *node = NULL;
+ /*
+ * The periodic timer looks buggy, we stop receiving events
+ * after a while. Use the one-shot timer every NS_PER_TICK
+ * and rearm it from the event handler.
+ */
+ err = HYPERVISOR_vcpu_op(VCPUOP_stop_periodic_timer,
+ ci->ci_cpuid,
+ NULL);
+
+ KASSERT(err == 0);
+ err = HYPERVISOR_set_timer_op(
+ vcpu_system_time[ci->ci_cpuid] + NS_PER_TICK);
+ KASSERT(err == 0);
- sysctl_createv(NULL, 0, NULL, &node, 0,
- CTLTYPE_NODE, "xen",
- SYSCTL_DESCR("Xen top level node"),
- NULL, 0, NULL, 0,
- CTL_MACHDEP, CTL_CREATE, CTL_EOL);
-
- sysctl_createv(NULL, 0, &node, NULL, CTLFLAG_READWRITE,
- CTLTYPE_INT, "timepush_ticks",
- SYSCTL_DESCR("How often to update the hypervisor's "
- " time-of-day; 0 to disable"),
- sysctl_xen_timepush, 0, &xen_timepush_ticks, 0,
- CTL_CREATE, CTL_EOL);
+ event_set_handler(evtch, (int (*)(void *))xen_timer_handler,
+ ci, IPL_CLOCK, "clock");
+ hypervisor_enable_event(evtch);
+
+#ifdef DOM0OPS
+ if (!tcdone) { /* Do this only once */
- callout_reset(&xen_timepush_co, xen_timepush_ticks,
- &xen_timepush, &xen_timepush_co);
+ xen_timepush_ticks = 53 * hz + 3; /* avoid exact # of min/sec */
+ if (xendomain_is_privileged()) {
+ sysctl_createv(NULL, 0, NULL, NULL, CTLFLAG_READWRITE,
+ CTLTYPE_INT, "xen_timepush_ticks", SYSCTL_DESCR("How often"
+ " to update the hypervisor's time-of-day; 0 to disable"),
+ sysctl_xen_timepush, 0, &xen_timepush_ticks, 0,
+ CTL_MACHDEP, CTL_CREATE, CTL_EOL);
+ callout_reset(&xen_timepush_co, xen_timepush_ticks,
+ &xen_timepush, &xen_timepush_co);
+ }
}
#endif
+ tcdone = true;
}
void
@@ -466,39 +524,38 @@ static int
xen_timer_handler(void *arg, struct intrframe *regs)
{
int64_t delta;
- int s, ticks_done;
+ struct cpu_info *ci = curcpu();
+ KASSERT(arg == ci);
+ int err;
+again:
+ mutex_enter(&tmutex);
+ delta = (int64_t)(get_vcpu_time(ci) - vcpu_system_time[ci->ci_cpuid]);
+ mutex_exit(&tmutex);
- s = splhigh();
-#if 0
- get_time_values_from_xen();
-#endif
- delta = (int64_t)(get_system_time() - processed_system_time);
- splx(s);
-
- ticks_done = 0;
/* Several ticks may have passed without our being run; catch up. */
while (delta >= (int64_t)NS_PER_TICK) {
- ++ticks_done;
- s = splhigh();
- processed_system_time += NS_PER_TICK;
- xen_clock_bias = (delta -= NS_PER_TICK);
- splx(s);
+ mutex_enter(&tmutex);
+ vcpu_system_time[ci->ci_cpuid] += NS_PER_TICK;
+ xen_clock_bias[ci->ci_cpuid] = (delta -= NS_PER_TICK);
+ mutex_exit(&tmutex);
hardclock((struct clockframe *)regs);
- }
-
- if (xen_clock_bias) {
- s = splhigh();
- xen_clock_bias = 0;
- splx(s);
+ hardclock_called[ci->ci_cpuid].ev_count++;
}
/*
- * Re-arm the timer here, if needed; Xen's auto-ticking while runnable
- * is useful only for HZ==100, and even then may be out of phase with
- * the processed_system_time steps.
+ * rearm the timer. If it fails it's probably because the date
+ * is in the past, update our local time and try again.
*/
- if (ticks_done != 0)
- HYPERVISOR_set_timer_op(processed_system_time + NS_PER_TICK);
+ err = HYPERVISOR_set_timer_op(
+ vcpu_system_time[ci->ci_cpuid] + NS_PER_TICK);
+ if (err)
+ goto again;
+
+ if (xen_clock_bias[ci->ci_cpuid]) {
+ mutex_enter(&tmutex);
+ xen_clock_bias[ci->ci_cpuid] = 0;
+ mutex_exit(&tmutex);
+ }
return 0;
}
@@ -511,17 +568,6 @@ setstatclockrate(int arg)
void
idle_block(void)
{
- int r;
-
- /*
- * We set the timer to when we expect the next timer
- * interrupt. We could set the timer to later if we could
- * easily find out when we will have more work (callouts) to
- * process from hardclock.
- */
- r = HYPERVISOR_set_timer_op(processed_system_time + NS_PER_TICK);
- if (r == 0)
- HYPERVISOR_block();
- else
- __sti();
+ KASSERT(curcpu()->ci_ipending == 0);
+ HYPERVISOR_block();
}