On Thu, Feb 08, 2018 at 04:32:29PM +1300, Jonathan Matthew wrote: > This diff (most of which has been around for a while) changes delay_func > when running in KVM to use pvclock (effectively the TSC) to determine when to > stop spinning. Since this is done in a KVM-specific driver, it won't have > any effect anywhere other than in KVM guests. > > Using pvclock rather than the lapic avoids the hangs caused by KVM's use of > the VMX preemption timer to emulate a lapic. To summarise that problem: > occasionally KVM fails to restart the lapic timer until it gets an exit from > the guest, and if we're busy polling the lapic, we'll never generate such an > exit, and the lapic counter will never reach the value we're waiting for, at > which point we're stuck. > > It also adds a timecounter based on pvclock, with its priority set below > acpihpet, so it won't be used for now. Later on I'd like to make this > the preferred timecounter as it's much faster to read than the emulated > acpihpet. > > A couple of testers have confirmed that this does *not* fix the time slowdowns > seen on KVM guests. I believe fixing that will require more invasive changes. > > Since this fixes a concrete problem, rather than just making me feel better > about doing lots of clock reads in some other work I'm doing, I'd like to get > this in now. > > ok? >
After discussing this with jmatthew@ offline, I understand what he's trying to do here, and I withdraw the earlier objection. Some comments: 1. does this need special treatment in RAMDISK? 2. can you add a man page for kvmclock(4) and describe what it's for (and summarize why it's desirable on kvm)? 3. do you want to include this in i386? is it even an issue there? other than those things, ok mlarkin > > Index: arch/amd64/conf/GENERIC > =================================================================== > RCS file: /cvs/src/sys/arch/amd64/conf/GENERIC,v > retrieving revision 1.450 > diff -u -p -u -p -r1.450 GENERIC > --- arch/amd64/conf/GENERIC 24 Dec 2017 19:50:56 -0000 1.450 > +++ arch/amd64/conf/GENERIC 8 Feb 2018 02:32:49 -0000 > @@ -81,6 +81,8 @@ hyperv0 at pvbus? # Hyper-V guest > hvn* at hyperv? # Hyper-V NetVSC > hvs* at hyperv? # Hyper-V StorVSC > > +kvmclock0 at pvbus? # KVM clock > + > option PCIVERBOSE > option USBVERBOSE > > Index: arch/amd64/conf/RAMDISK_CD > =================================================================== > RCS file: /cvs/src/sys/arch/amd64/conf/RAMDISK_CD,v > retrieving revision 1.169 > diff -u -p -u -p -r1.169 RAMDISK_CD > --- arch/amd64/conf/RAMDISK_CD 16 Nov 2017 18:12:27 -0000 1.169 > +++ arch/amd64/conf/RAMDISK_CD 8 Feb 2018 02:32:49 -0000 > @@ -64,6 +64,8 @@ hyperv0 at pvbus? # Hyper-V guest > hvn* at hyperv? # Hyper-V NetVSC > hvs* at hyperv? # Hyper-V StorVSC > > +kvmclock0 at pvbus? # KVM clock > + > pchb* at pci? # PCI-Host bridges > aapic* at pci? # AMD 8131 IO apic > ppb* at pci? # PCI-PCI bridges > Index: arch/i386/conf/GENERIC > =================================================================== > RCS file: /cvs/src/sys/arch/i386/conf/GENERIC,v > retrieving revision 1.829 > diff -u -p -u -p -r1.829 GENERIC > --- arch/i386/conf/GENERIC 28 Aug 2017 19:32:53 -0000 1.829 > +++ arch/i386/conf/GENERIC 8 Feb 2018 02:32:49 -0000 > @@ -40,6 +40,7 @@ amdmsr0 at mainbus? # MSR access for AM > > pvbus0 at mainbus0 # Paravirtual device bus > vmt0 at pvbus? # VMware Tools > +kvmclock0 at pvbus? # KVM clock > > acpitimer* at acpi? > acpihpet* at acpi? > Index: dev/pv/files.pv > =================================================================== > RCS file: /cvs/src/sys/dev/pv/files.pv,v > retrieving revision 1.13 > diff -u -p -u -p -r1.13 files.pv > --- dev/pv/files.pv 14 Jun 2017 10:25:40 -0000 1.13 > +++ dev/pv/files.pv 8 Feb 2018 02:32:49 -0000 > @@ -75,3 +75,8 @@ file dev/pv/vioscsi.c vioscsi > device vmmci > attach vmmci at virtio > file dev/pv/vmmci.c vmmci > + > +device kvmclock > +attach kvmclock at pvbus > +file dev/pv/kvmclock.c kvmclock > +file dev/pv/pvclock.c kvmclock > Index: dev/pv/kvmclock.c > =================================================================== > RCS file: dev/pv/kvmclock.c > diff -N dev/pv/kvmclock.c > --- /dev/null 1 Jan 1970 00:00:00 -0000 > +++ dev/pv/kvmclock.c 8 Feb 2018 02:32:49 -0000 > @@ -0,0 +1,151 @@ > +/* $OpenBSD$ */ > + > +/* > + * Copyright (c) 2017 Jonathan Matthew > + * > + * Permission to use, copy, modify, and distribute this software for any > + * purpose with or without fee is hereby granted, provided that the above > + * copyright notice and this permission notice appear in all copies. > + * > + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES > + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF > + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR > + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES > + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN > + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF > + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. > + */ > + > +#include <sys/param.h> > +#include <sys/systm.h> > +#include <sys/device.h> > +#include <sys/timetc.h> > +#include <sys/malloc.h> > + > +#include <machine/cpu.h> > +#include <machine/cpufunc.h> > + > +#include <uvm/uvm_extern.h> > + > +#include <dev/pv/pvreg.h> > +#include <dev/pv/pvvar.h> > +#include <dev/pv/pvclockreg.h> > + > +struct kvmclock_softc { > + struct device sc_dev; > + struct pvclock_vcpu_time_info *sc_pvclock; > + paddr_t sc_pvclock_pa; > +}; > + > +int kvmclock_match(struct device *, void *, void *); > +void kvmclock_attach(struct device *, struct device *, void *); > + > +u_int kvmclock_get_timecount(struct timecounter *tc); > + > +struct timecounter kvmclock_timecounter = { > + kvmclock_get_timecount, NULL, ~0u, 0, NULL, 999, NULL > +}; > + > +struct cfdriver kvmclock_cd = { > + NULL, "kvmclock", DV_DULL > +}; > + > +const struct cfattach kvmclock_ca = { > + sizeof(struct kvmclock_softc), kvmclock_match, kvmclock_attach, NULL /* > activate? */ > +}; > + > +int > +kvmclock_match(struct device *parent, void *match, void *aux) > +{ > + struct pv_attach_args *pva = aux; > + struct pvbus_hv *hv = &pva->pva_hv[PVBUS_KVM]; > + > + if (hv->hv_base == 0) > + return (0); > + > + if ((hv->hv_features & KVM_FEATURE_CLOCKSOURCE2) == 0) > + return (0); > + > + return (1); > +} > + > +u_int > +kvmclock_get_timecount(struct timecounter *tc) > +{ > + struct kvmclock_softc *sc; > + sc = kvmclock_timecounter.tc_priv; > + if (sc->sc_pvclock == NULL) > + return (0); > + > + return (pvclock_read_tsc(sc->sc_pvclock + cpu_number())); > +} > + > +void > +kvmclock_init_cpu(struct pvbus_hv *hv) > +{ > + struct kvmclock_softc *sc; > + > + sc = kvmclock_timecounter.tc_priv; > + if (sc == NULL) > + return; > + > + wrmsr(KVM_MSR_SYS_TIME, (sc->sc_pvclock_pa + (cpu_number() * > + sizeof(*sc->sc_pvclock))) | 1); > +} > + > +void > +kvmclock_delay(int usec) > +{ > + struct kvmclock_softc *sc; > + struct pvclock_vcpu_time_info *pvclock; > + uint64_t tscticks; > + uint64_t target; > + > + sc = kvmclock_timecounter.tc_priv; > + pvclock = sc->sc_pvclock + cpu_number(); > + > + tscticks = (usec * pvclock_get_freq(pvclock)) / 1000000; > + target = pvclock_read_tsc(pvclock) + tscticks; > + while (pvclock_read_tsc(pvclock) < target) > + CPU_BUSY_CYCLE(); > +} > + > +void > +kvmclock_attach(struct device *parent, struct device *self, void *aux) > +{ > + struct pv_attach_args *pva = (struct pv_attach_args *)aux; > + struct pvbus_hv *hv = &pva->pva_hv[PVBUS_KVM]; > + struct kvmclock_softc *sc = (struct kvmclock_softc *)self; > + > + printf(": "); > + sc->sc_pvclock = malloc(PAGE_SIZE, M_DEVBUF, M_NOWAIT); > + if (sc->sc_pvclock == NULL) { > + printf("could not allocate memory\n"); > + return; > + } > + > + if (pmap_extract(pmap_kernel(), (vaddr_t)sc->sc_pvclock, > + &sc->sc_pvclock_pa) == FALSE) { > + printf("could not map memory\n"); > + free(sc->sc_pvclock, M_DEVBUF, PAGE_SIZE); > + sc->sc_pvclock = NULL; > + } > + > + wrmsr(KVM_MSR_SYS_TIME, sc->sc_pvclock_pa | 1); > + > + if (sc->sc_pvclock[0].flags & PVCLOCK_FLAG_TSC_STABLE_BIT) { > + kvmclock_timecounter.tc_frequency = > + pvclock_get_freq(sc->sc_pvclock); > + kvmclock_timecounter.tc_name = sc->sc_dev.dv_xname; > + kvmclock_timecounter.tc_priv = sc; > + tc_init(&kvmclock_timecounter); > + printf("%lld Hz", kvmclock_timecounter.tc_frequency); > + > + delay_func = kvmclock_delay; > + } else { > + printf("TSC is not stable"); > + } > + > + hv->hv_init_cpu = kvmclock_init_cpu; > + printf("\n"); > +} > Index: dev/pv/pvclock.c > =================================================================== > RCS file: dev/pv/pvclock.c > diff -N dev/pv/pvclock.c > --- /dev/null 1 Jan 1970 00:00:00 -0000 > +++ dev/pv/pvclock.c 8 Feb 2018 02:32:49 -0000 > @@ -0,0 +1,122 @@ > +/* $OpenBSD$ */ > +/* > + * Copyright (c) 2017 Jonathan Matthew <jmatt...@openbsd.org> > + * > + * Permission to use, copy, modify, and distribute this software for any > + * purpose with or without fee is hereby granted, provided that the above > + * copyright notice and this permission notice appear in all copies. > + * > + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES > + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF > + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR > + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES > + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN > + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF > + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. > + */ > + > +#include <sys/param.h> > +#include <sys/types.h> > +#include <sys/atomic.h> > +#include <sys/systm.h> > + > +#include <machine/cpufunc.h> > + > +#include <dev/pv/pvreg.h> > +#include <dev/pv/pvclockreg.h> > + > +#define NSEC_PER_SEC 1000000000LL > + > +uint64_t > +pvclock_ns_to_tsc(uint64_t ns, uint32_t mul, int shift) > +{ > + uint64_t hns; > + > + hns = ns & 0xffffffff00000000ull; > + ns <<= 32; > + hns = hns / mul; > + ns = (ns / mul) + (hns << 32); > + if (shift >= 0) > + ns >>= shift; > + else > + ns <<= -shift; > + > + return (ns); > +} > + > +uint64_t > +pvclock_tsc_to_ns(uint64_t tsc, uint32_t mul, int shift) > +{ > + uint64_t otsc, htsc, nsec; > + > + otsc = tsc; > + if (shift >= 0) > + tsc <<= shift; > + else > + tsc >>= -shift; > + > + htsc = tsc >> 32; > + tsc &= 0xffffffffll; > + nsec = ((tsc * mul) >> 32) + (htsc * mul); > + return (nsec); > +} > + > +void > +pvclock_read(struct pvclock_vcpu_time_info *time_info, uint64_t *systime, > + uint64_t *tsc, uint32_t *mul, int *shift) > +{ > + uint32_t v; > + int r = 0; > + > + do { > + v = time_info->version; > + virtio_membar_sync(); > + *tsc = time_info->tsc_timestamp; > + *systime = time_info->system_time; > + *mul = time_info->tsc_to_system_mul; > + *shift = time_info->tsc_shift; > + virtio_membar_sync(); > + r++; > + } while ((v != time_info->version) && (r < 100)); > + > + if (r == 100) { > + printf("pvclock spun out\n"); > + *mul = 0; > + } > +} > + > +uint64_t > +pvclock_read_tsc(struct pvclock_vcpu_time_info *time_info) > +{ > + uint32_t mul; > + uint64_t systime, tsc, delta; > + int shift; > + > + pvclock_read(time_info, &systime, &tsc, &mul, &shift); > + delta = rdtsc() - tsc; > + return (pvclock_ns_to_tsc(systime, mul, shift) + delta); > +} > + > +void > +pvclock_read_nano(struct pvclock_vcpu_time_info *time_info, struct timespec > *ts) > +{ > + uint32_t mul; > + uint64_t systime, tsc; > + int shift; > + > + pvclock_read(time_info, &systime, &tsc, &mul, &shift); > + if (mul == 0) > + systime = 0; > + else > + systime += pvclock_tsc_to_ns(rdtsc() - tsc, mul, shift); > + > + ts->tv_sec = systime / NSEC_PER_SEC; > + ts->tv_nsec = systime % NSEC_PER_SEC; > +} > + > +uint64_t > +pvclock_get_freq(struct pvclock_vcpu_time_info *time_info) > +{ > + return (pvclock_ns_to_tsc(NSEC_PER_SEC, time_info->tsc_to_system_mul, > + time_info->tsc_shift)); > +} > Index: dev/pv/pvclockreg.h > =================================================================== > RCS file: dev/pv/pvclockreg.h > diff -N dev/pv/pvclockreg.h > --- /dev/null 1 Jan 1970 00:00:00 -0000 > +++ dev/pv/pvclockreg.h 8 Feb 2018 02:32:49 -0000 > @@ -0,0 +1,24 @@ > +#ifndef _PV_PVCLOCKREG_H_ > +#define _PV_PVCLOCKREG_H_ > + > +#define PVCLOCK_FLAG_TSC_STABLE_BIT (1 << 0) > +#define PVCLOCK_FLAG_GUEST_STOPPED (1 << 1) > + > +struct pvclock_vcpu_time_info { > + volatile uint32_t version; > + volatile uint32_t pad1; > + volatile uint64_t tsc_timestamp; > + volatile uint64_t system_time; > + volatile uint32_t tsc_to_system_mul; > + volatile int8_t tsc_shift; > + volatile uint8_t flags; > + volatile uint8_t pad2[2]; > +} __packed; > + > +struct pvclock_wall_clock { > + volatile uint32_t version; > + volatile uint32_t sec; > + volatile uint32_t nsec; > +} __packed; > + > +#endif /* _PV_PVCLOCKREG_H_ */ > Index: dev/pv/pvreg.h > =================================================================== > RCS file: /cvs/src/sys/dev/pv/pvreg.h,v > retrieving revision 1.4 > diff -u -p -u -p -r1.4 pvreg.h > --- dev/pv/pvreg.h 12 Dec 2015 12:33:49 -0000 1.4 > +++ dev/pv/pvreg.h 8 Feb 2018 02:32:49 -0000 > @@ -40,6 +40,9 @@ > #define KVM_FEATURE_PV_UNHALT 7 > #define KVM_FEATURE_CLOCKSOURCE_STABLE_BIT 24 > > +#define KVM_MSR_WALL_CLOCK 0x4b564d00 > +#define KVM_MSR_SYS_TIME 0x4b564d01 > + > #define KVM_MSR_EOI_EN 0x4b564d04 > #define KVM_PV_EOI_BIT 0 > > Index: dev/pv/pvvar.h > =================================================================== > RCS file: /cvs/src/sys/dev/pv/pvvar.h,v > retrieving revision 1.10 > diff -u -p -u -p -r1.10 pvvar.h > --- dev/pv/pvvar.h 22 Jun 2017 06:21:12 -0000 1.10 > +++ dev/pv/pvvar.h 8 Feb 2018 02:32:49 -0000 > @@ -82,5 +82,12 @@ void pvbus_init_cpu(void); > void pvbus_reboot(struct device *); > void pvbus_shutdown(struct device *); > > +/* pvclock.c */ > +struct pvclock_vcpu_time_info; > + > +uint64_t pvclock_read_tsc(struct pvclock_vcpu_time_info *); > +void pvclock_read_nano(struct pvclock_vcpu_time_info *, struct timespec > *); > +uint64_t pvclock_get_freq(struct pvclock_vcpu_time_info *); > + > #endif /* _KERNEL */ > #endif /* _DEV_PV_PVBUS_H_ */ >