On Thu, Feb 08, 2018 at 04:32:29PM +1300, Jonathan Matthew wrote:
> This diff (most of which has been around for a while) changes delay_func
> when running in KVM to use pvclock (effectively the TSC) to determine when to
> stop spinning.  Since this is done in a KVM-specific driver, it won't have
> any effect anywhere other than in KVM guests.
> 
> Using pvclock rather than the lapic avoids the hangs caused by KVM's use of
> the VMX preemption timer to emulate a lapic.  To summarise that problem:
> occasionally KVM fails to restart the lapic timer until it gets an exit from
> the guest, and if we're busy polling the lapic, we'll never generate such an
> exit, and the lapic counter will never reach the value we're waiting for, at

... should we instead just force an exit by doing an intentional VMCALL if
we detect this?

Seems like it might be a less arduous approach than creating a whole device.
If we're going to do that, we might as well call it kvmbrokenclock(4)

-ml

> which point we're stuck.
> 
> It also adds a timecounter based on pvclock, with its priority set below
> acpihpet, so it won't be used for now.  Later on I'd like to make this
> the preferred timecounter as it's much faster to read than the emulated
> acpihpet.
> 
> A couple of testers have confirmed that this does *not* fix the time slowdowns
> seen on KVM guests.  I believe fixing that will require more invasive changes.
> 
> Since this fixes a concrete problem, rather than just making me feel better
> about doing lots of clock reads in some other work I'm doing, I'd like to get
> this in now.
> 
> ok?
> 
> 
> Index: arch/amd64/conf/GENERIC
> ===================================================================
> RCS file: /cvs/src/sys/arch/amd64/conf/GENERIC,v
> retrieving revision 1.450
> diff -u -p -u -p -r1.450 GENERIC
> --- arch/amd64/conf/GENERIC   24 Dec 2017 19:50:56 -0000      1.450
> +++ arch/amd64/conf/GENERIC   8 Feb 2018 02:32:49 -0000
> @@ -81,6 +81,8 @@ hyperv0     at pvbus?               # Hyper-V guest
>  hvn* at hyperv?              # Hyper-V NetVSC
>  hvs* at hyperv?              # Hyper-V StorVSC
>  
> +kvmclock0 at pvbus?          # KVM clock
> +
>  option               PCIVERBOSE
>  option               USBVERBOSE
>  
> Index: arch/amd64/conf/RAMDISK_CD
> ===================================================================
> RCS file: /cvs/src/sys/arch/amd64/conf/RAMDISK_CD,v
> retrieving revision 1.169
> diff -u -p -u -p -r1.169 RAMDISK_CD
> --- arch/amd64/conf/RAMDISK_CD        16 Nov 2017 18:12:27 -0000      1.169
> +++ arch/amd64/conf/RAMDISK_CD        8 Feb 2018 02:32:49 -0000
> @@ -64,6 +64,8 @@ hyperv0             at pvbus?               # Hyper-V guest
>  hvn*         at hyperv?              # Hyper-V NetVSC
>  hvs*         at hyperv?              # Hyper-V StorVSC
>  
> +kvmclock0    at pvbus?               # KVM clock
> +
>  pchb*                at pci?                 # PCI-Host bridges
>  aapic*               at pci?                 # AMD 8131 IO apic
>  ppb*         at pci?                 # PCI-PCI bridges
> Index: arch/i386/conf/GENERIC
> ===================================================================
> RCS file: /cvs/src/sys/arch/i386/conf/GENERIC,v
> retrieving revision 1.829
> diff -u -p -u -p -r1.829 GENERIC
> --- arch/i386/conf/GENERIC    28 Aug 2017 19:32:53 -0000      1.829
> +++ arch/i386/conf/GENERIC    8 Feb 2018 02:32:49 -0000
> @@ -40,6 +40,7 @@ amdmsr0     at mainbus?             # MSR access for AM
>  
>  pvbus0       at mainbus0             # Paravirtual device bus
>  vmt0 at pvbus?               # VMware Tools
> +kvmclock0 at pvbus?          # KVM clock
>  
>  acpitimer*   at acpi?
>  acpihpet*    at acpi?
> Index: dev/pv/files.pv
> ===================================================================
> RCS file: /cvs/src/sys/dev/pv/files.pv,v
> retrieving revision 1.13
> diff -u -p -u -p -r1.13 files.pv
> --- dev/pv/files.pv   14 Jun 2017 10:25:40 -0000      1.13
> +++ dev/pv/files.pv   8 Feb 2018 02:32:49 -0000
> @@ -75,3 +75,8 @@ file        dev/pv/vioscsi.c                vioscsi
>  device       vmmci
>  attach       vmmci at virtio
>  file dev/pv/vmmci.c                  vmmci
> +
> +device       kvmclock
> +attach       kvmclock at pvbus
> +file dev/pv/kvmclock.c               kvmclock
> +file dev/pv/pvclock.c                kvmclock
> Index: dev/pv/kvmclock.c
> ===================================================================
> RCS file: dev/pv/kvmclock.c
> diff -N dev/pv/kvmclock.c
> --- /dev/null 1 Jan 1970 00:00:00 -0000
> +++ dev/pv/kvmclock.c 8 Feb 2018 02:32:49 -0000
> @@ -0,0 +1,151 @@
> +/*   $OpenBSD$       */
> +
> +/*
> + * Copyright (c) 2017 Jonathan Matthew
> + *
> + * Permission to use, copy, modify, and distribute this software for any
> + * purpose with or without fee is hereby granted, provided that the above
> + * copyright notice and this permission notice appear in all copies.
> + *
> + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
> + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
> + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
> + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
> + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
> + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
> + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
> + */
> +
> +#include <sys/param.h>
> +#include <sys/systm.h>
> +#include <sys/device.h>
> +#include <sys/timetc.h>
> +#include <sys/malloc.h>
> +
> +#include <machine/cpu.h>
> +#include <machine/cpufunc.h>
> +
> +#include <uvm/uvm_extern.h>
> +
> +#include <dev/pv/pvreg.h>
> +#include <dev/pv/pvvar.h>
> +#include <dev/pv/pvclockreg.h>
> +
> +struct kvmclock_softc {
> +     struct device                   sc_dev;
> +     struct pvclock_vcpu_time_info   *sc_pvclock;
> +     paddr_t                         sc_pvclock_pa;
> +};
> +
> +int  kvmclock_match(struct device *, void *, void *);
> +void kvmclock_attach(struct device *, struct device *, void *);
> +
> +u_int        kvmclock_get_timecount(struct timecounter *tc);
> +
> +struct timecounter kvmclock_timecounter = {
> +     kvmclock_get_timecount, NULL, ~0u, 0, NULL, 999, NULL
> +};
> +
> +struct cfdriver kvmclock_cd = {
> +     NULL, "kvmclock", DV_DULL
> +};
> +
> +const struct cfattach kvmclock_ca = {
> +     sizeof(struct kvmclock_softc), kvmclock_match, kvmclock_attach, NULL /* 
> activate? */
> +};
> +
> +int
> +kvmclock_match(struct device *parent, void *match, void *aux)
> +{
> +     struct pv_attach_args *pva = aux;
> +     struct pvbus_hv *hv = &pva->pva_hv[PVBUS_KVM];
> +
> +     if (hv->hv_base == 0)
> +             return (0);
> +
> +     if ((hv->hv_features & KVM_FEATURE_CLOCKSOURCE2) == 0)
> +             return (0);
> +
> +     return (1);
> +}
> +
> +u_int
> +kvmclock_get_timecount(struct timecounter *tc)
> +{
> +     struct kvmclock_softc *sc;
> +     sc = kvmclock_timecounter.tc_priv;
> +     if (sc->sc_pvclock == NULL)
> +             return (0);
> +
> +     return (pvclock_read_tsc(sc->sc_pvclock + cpu_number()));
> +}
> +
> +void
> +kvmclock_init_cpu(struct pvbus_hv *hv)
> +{
> +     struct kvmclock_softc *sc;
> +
> +     sc = kvmclock_timecounter.tc_priv;
> +     if (sc == NULL)
> +             return;
> +
> +     wrmsr(KVM_MSR_SYS_TIME, (sc->sc_pvclock_pa + (cpu_number() *
> +         sizeof(*sc->sc_pvclock))) | 1);
> +}
> +
> +void
> +kvmclock_delay(int usec)
> +{
> +     struct kvmclock_softc *sc;
> +     struct pvclock_vcpu_time_info *pvclock;
> +     uint64_t tscticks;
> +     uint64_t target;
> +
> +     sc = kvmclock_timecounter.tc_priv;
> +     pvclock = sc->sc_pvclock + cpu_number();
> +
> +     tscticks = (usec * pvclock_get_freq(pvclock)) / 1000000;
> +     target = pvclock_read_tsc(pvclock) + tscticks;
> +     while (pvclock_read_tsc(pvclock) < target)
> +             CPU_BUSY_CYCLE();
> +}
> +
> +void
> +kvmclock_attach(struct device *parent, struct device *self, void *aux)
> +{
> +     struct pv_attach_args *pva = (struct pv_attach_args *)aux;
> +     struct pvbus_hv *hv = &pva->pva_hv[PVBUS_KVM];
> +     struct kvmclock_softc *sc = (struct kvmclock_softc *)self;
> +
> +     printf(": ");
> +     sc->sc_pvclock = malloc(PAGE_SIZE, M_DEVBUF, M_NOWAIT);
> +     if (sc->sc_pvclock == NULL) {
> +             printf("could not allocate memory\n");
> +             return;
> +     }
> +
> +     if (pmap_extract(pmap_kernel(), (vaddr_t)sc->sc_pvclock,
> +         &sc->sc_pvclock_pa) == FALSE) {
> +             printf("could not map memory\n");
> +             free(sc->sc_pvclock, M_DEVBUF, PAGE_SIZE);
> +             sc->sc_pvclock = NULL;
> +     }
> +
> +     wrmsr(KVM_MSR_SYS_TIME, sc->sc_pvclock_pa | 1);
> +
> +     if (sc->sc_pvclock[0].flags & PVCLOCK_FLAG_TSC_STABLE_BIT) {
> +             kvmclock_timecounter.tc_frequency =
> +                 pvclock_get_freq(sc->sc_pvclock);
> +             kvmclock_timecounter.tc_name = sc->sc_dev.dv_xname;
> +             kvmclock_timecounter.tc_priv = sc;
> +             tc_init(&kvmclock_timecounter);
> +             printf("%lld Hz", kvmclock_timecounter.tc_frequency);
> +
> +             delay_func = kvmclock_delay;
> +     } else {
> +             printf("TSC is not stable");
> +     }
> +
> +     hv->hv_init_cpu = kvmclock_init_cpu;
> +     printf("\n");
> +}
> Index: dev/pv/pvclock.c
> ===================================================================
> RCS file: dev/pv/pvclock.c
> diff -N dev/pv/pvclock.c
> --- /dev/null 1 Jan 1970 00:00:00 -0000
> +++ dev/pv/pvclock.c  8 Feb 2018 02:32:49 -0000
> @@ -0,0 +1,122 @@
> +/*   $OpenBSD$ */
> +/*
> + * Copyright (c) 2017 Jonathan Matthew <jmatt...@openbsd.org>
> + *
> + * Permission to use, copy, modify, and distribute this software for any
> + * purpose with or without fee is hereby granted, provided that the above
> + * copyright notice and this permission notice appear in all copies.
> + *
> + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
> + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
> + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
> + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
> + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
> + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
> + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
> + */
> +
> +#include <sys/param.h>
> +#include <sys/types.h>
> +#include <sys/atomic.h>
> +#include <sys/systm.h>
> +
> +#include <machine/cpufunc.h>
> +
> +#include <dev/pv/pvreg.h>
> +#include <dev/pv/pvclockreg.h>
> +
> +#define NSEC_PER_SEC 1000000000LL
> +
> +uint64_t
> +pvclock_ns_to_tsc(uint64_t ns, uint32_t mul, int shift)
> +{
> +     uint64_t hns;
> +
> +     hns = ns & 0xffffffff00000000ull;
> +     ns <<= 32;
> +     hns = hns / mul;
> +     ns = (ns / mul) + (hns << 32);
> +     if (shift >= 0)
> +             ns >>= shift;
> +     else
> +             ns <<= -shift;
> +
> +     return (ns);
> +}
> +
> +uint64_t
> +pvclock_tsc_to_ns(uint64_t tsc, uint32_t mul, int shift)
> +{
> +     uint64_t otsc, htsc, nsec;
> +
> +     otsc = tsc;
> +     if (shift >= 0)
> +             tsc <<= shift;
> +     else
> +             tsc >>= -shift;
> +
> +     htsc = tsc >> 32;
> +     tsc &= 0xffffffffll;
> +     nsec = ((tsc * mul) >> 32) + (htsc * mul);
> +     return (nsec);
> +}
> +
> +void
> +pvclock_read(struct pvclock_vcpu_time_info *time_info, uint64_t *systime,
> +    uint64_t *tsc, uint32_t *mul, int *shift)
> +{
> +     uint32_t v;
> +     int r = 0;
> +
> +     do {
> +             v = time_info->version;
> +             virtio_membar_sync();
> +             *tsc = time_info->tsc_timestamp;
> +             *systime = time_info->system_time;
> +             *mul = time_info->tsc_to_system_mul;
> +             *shift = time_info->tsc_shift;
> +             virtio_membar_sync();
> +             r++;
> +     } while ((v != time_info->version) && (r < 100));
> +
> +     if (r == 100) {
> +             printf("pvclock spun out\n");
> +             *mul = 0;
> +     }
> +}
> +
> +uint64_t
> +pvclock_read_tsc(struct pvclock_vcpu_time_info *time_info)
> +{
> +     uint32_t mul;
> +     uint64_t systime, tsc, delta;
> +     int shift;
> +
> +     pvclock_read(time_info, &systime, &tsc, &mul, &shift);
> +     delta = rdtsc() - tsc;
> +     return (pvclock_ns_to_tsc(systime, mul, shift) + delta);
> +}
> +
> +void
> +pvclock_read_nano(struct pvclock_vcpu_time_info *time_info, struct timespec 
> *ts)
> +{
> +     uint32_t mul;
> +     uint64_t systime, tsc;
> +     int shift;
> +
> +     pvclock_read(time_info, &systime, &tsc, &mul, &shift);
> +     if (mul == 0)
> +             systime = 0;
> +     else
> +             systime += pvclock_tsc_to_ns(rdtsc() - tsc, mul, shift);
> +
> +     ts->tv_sec = systime / NSEC_PER_SEC;
> +     ts->tv_nsec = systime % NSEC_PER_SEC;
> +}
> +
> +uint64_t
> +pvclock_get_freq(struct pvclock_vcpu_time_info *time_info)
> +{
> +     return (pvclock_ns_to_tsc(NSEC_PER_SEC, time_info->tsc_to_system_mul,
> +         time_info->tsc_shift));
> +}
> Index: dev/pv/pvclockreg.h
> ===================================================================
> RCS file: dev/pv/pvclockreg.h
> diff -N dev/pv/pvclockreg.h
> --- /dev/null 1 Jan 1970 00:00:00 -0000
> +++ dev/pv/pvclockreg.h       8 Feb 2018 02:32:49 -0000
> @@ -0,0 +1,24 @@
> +#ifndef _PV_PVCLOCKREG_H_
> +#define _PV_PVCLOCKREG_H_
> +
> +#define PVCLOCK_FLAG_TSC_STABLE_BIT             (1 << 0)
> +#define PVCLOCK_FLAG_GUEST_STOPPED              (1 << 1)
> +
> +struct pvclock_vcpu_time_info {
> +     volatile uint32_t       version;
> +     volatile uint32_t       pad1;
> +     volatile uint64_t       tsc_timestamp;
> +     volatile uint64_t       system_time;
> +     volatile uint32_t       tsc_to_system_mul;
> +     volatile int8_t         tsc_shift;
> +     volatile uint8_t        flags;
> +     volatile uint8_t        pad2[2];
> +} __packed;
> +
> +struct pvclock_wall_clock {
> +     volatile uint32_t       version;
> +     volatile uint32_t       sec;
> +     volatile uint32_t       nsec;
> +} __packed;
> +
> +#endif  /* _PV_PVCLOCKREG_H_ */
> Index: dev/pv/pvreg.h
> ===================================================================
> RCS file: /cvs/src/sys/dev/pv/pvreg.h,v
> retrieving revision 1.4
> diff -u -p -u -p -r1.4 pvreg.h
> --- dev/pv/pvreg.h    12 Dec 2015 12:33:49 -0000      1.4
> +++ dev/pv/pvreg.h    8 Feb 2018 02:32:49 -0000
> @@ -40,6 +40,9 @@
>  #define      KVM_FEATURE_PV_UNHALT                   7
>  #define      KVM_FEATURE_CLOCKSOURCE_STABLE_BIT      24
>  
> +#define KVM_MSR_WALL_CLOCK                   0x4b564d00
> +#define KVM_MSR_SYS_TIME                     0x4b564d01
> +
>  #define      KVM_MSR_EOI_EN                          0x4b564d04
>  #define KVM_PV_EOI_BIT                               0
>  
> Index: dev/pv/pvvar.h
> ===================================================================
> RCS file: /cvs/src/sys/dev/pv/pvvar.h,v
> retrieving revision 1.10
> diff -u -p -u -p -r1.10 pvvar.h
> --- dev/pv/pvvar.h    22 Jun 2017 06:21:12 -0000      1.10
> +++ dev/pv/pvvar.h    8 Feb 2018 02:32:49 -0000
> @@ -82,5 +82,12 @@ void        pvbus_init_cpu(void);
>  void  pvbus_reboot(struct device *);
>  void  pvbus_shutdown(struct device *);
>  
> +/* pvclock.c */
> +struct pvclock_vcpu_time_info;
> +
> +uint64_t pvclock_read_tsc(struct pvclock_vcpu_time_info *);
> +void     pvclock_read_nano(struct pvclock_vcpu_time_info *, struct timespec 
> *);
> +uint64_t pvclock_get_freq(struct pvclock_vcpu_time_info *);
> +
>  #endif /* _KERNEL */
>  #endif /* _DEV_PV_PVBUS_H_ */
> 

Reply via email to