From: Paolo Bonzini <pbonz...@redhat.com>

The version field in struct pvclock_vcpu_time_info basically implements
a seqcount.  Wrap it with the usual read_begin and read_retry functions,
and use these APIs instead of peppering the code with smp_rmb()s.
While at it, change it to the more pedantically correct virt_rmb().
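[Illustration only, not part of the patch: the seqcount-style read
protocol these helpers implement, approximated as a self-contained
userspace C program.  The GCC __atomic_thread_fence() calls stand in
for the kernel's smp_rmb()/virt_rmb(), the struct only loosely mirrors
struct pvclock_vcpu_time_info, and the 1 tick = 1 ns conversion is a
simplification of pvclock_scale_delta().]

    #include <stdint.h>
    #include <stdio.h>

    struct time_info {
            volatile uint32_t version;      /* odd while an update is in flight */
            volatile uint64_t tsc_timestamp;
            volatile uint64_t system_time;
    };

    static uint32_t read_begin(const struct time_info *src)
    {
            uint32_t version = src->version & ~1u;  /* mask the "in progress" bit */
            __atomic_thread_fence(__ATOMIC_ACQUIRE);  /* version before data */
            return version;
    }

    static int read_retry(const struct time_info *src, uint32_t version)
    {
            __atomic_thread_fence(__ATOMIC_ACQUIRE);  /* data before re-read */
            return version != src->version;  /* odd or changed => retry */
    }

    int main(void)
    {
            struct time_info ti = {
                    .version = 2, .tsc_timestamp = 1000, .system_time = 5000,
            };
            uint64_t tsc = 1500;    /* a TSC sample, taken inside the loop */
            uint64_t ns;
            uint32_t version;

            do {
                    version = read_begin(&ti);
                    /* tsc_timestamp and system_time are both read between
                     * begin and retry, so a concurrent update restarts the
                     * loop instead of producing a wrapped-around delta. */
                    ns = ti.system_time + (tsc - ti.tsc_timestamp);
            } while (read_retry(&ti, version));

            printf("%llu ns\n", (unsigned long long)ns);  /* 5500 ns */
            return 0;
    }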
Signed-off-by: Paolo Bonzini <pbonz...@redhat.com>
(cherry picked from commit 3aed64f6d341cdb62bb2d6232589fb13448ce063)

Fix #PSBM-60589

The patch was cherry-picked to resolve a race condition in the delta
calculation when using kvm_clock and the TSC.  The delta calculation
reads the TSC and hv_clock.tsc_timestamp and takes their difference.
The race happened because the TSC read was not protected by the version
check, while the tsc_timestamp read was.  When the hypervisor updated
tsc_timestamp right after the TSC was read, the unsigned subtraction
wrapped around and produced a huge delta (see the illustration after
the patch).  This was observed as the system getting stuck in some
tasks at random moments.

Some code was adapted for the Virtuozzo kernel:
1. virt_rmb() does not exist in this kernel; it is replaced with
   smp_rmb(), which does the same thing here.
2. vread_pvclock() is changed to use the modified pvclock reading
   scheme.

Signed-off-by: Denis Plotnikov <dplotni...@virtuozzo.com>
Reviewed-by: Roman Kagan <rka...@virtuozzo.com>
---
 arch/x86/include/asm/pvclock.h | 44 +++++++++++++-------------
 arch/x86/kernel/pvclock.c     | 14 ++++-----
 arch/x86/kvm/x86.c            |  3 +-
 arch/x86/vdso/vclock_gettime.c | 71 ++++++++++++++++++++----------------------
 4 files changed, 63 insertions(+), 69 deletions(-)

diff --git a/arch/x86/include/asm/pvclock.h b/arch/x86/include/asm/pvclock.h
index 67613db..e63813d 100644
--- a/arch/x86/include/asm/pvclock.h
+++ b/arch/x86/include/asm/pvclock.h
@@ -16,6 +16,24 @@ void pvclock_resume(void);
 
 void pvclock_touch_watchdogs(void);
 
+static __always_inline
+unsigned pvclock_read_begin(const struct pvclock_vcpu_time_info *src)
+{
+	unsigned version = src->version & ~1;
+	/* Make sure that the version is read before the data. */
+	smp_rmb();
+	return version;
+}
+
+static __always_inline
+bool pvclock_read_retry(const struct pvclock_vcpu_time_info *src,
+			unsigned version)
+{
+	/* Make sure that the version is re-read after the data. */
+	smp_rmb();
+	return unlikely(version != src->version);
+}
+
 /*
  * Scale a 64-bit delta by scaling and multiplying by a 32-bit fraction,
  * yielding a 64-bit result.
@@ -60,30 +78,12 @@ static inline u64 pvclock_scale_delta(u64 delta, u32 mul_frac, int shift)
 }
 
 static __always_inline
-u64 pvclock_get_nsec_offset(const struct pvclock_vcpu_time_info *src, u64 tsc)
+cycle_t __pvclock_read_cycles(const struct pvclock_vcpu_time_info *src, u64 tsc)
 {
 	u64 delta = tsc - src->tsc_timestamp;
-	return pvclock_scale_delta(delta, src->tsc_to_system_mul,
-				   src->tsc_shift);
-}
-
-static __always_inline
-unsigned __pvclock_read_cycles(const struct pvclock_vcpu_time_info *src,
-			       cycle_t *cycles, u8 *flags, u64 tsc)
-{
-	unsigned version;
-	cycle_t ret, offset;
-	u8 ret_flags;
-
-	version = src->version;
-
-	offset = pvclock_get_nsec_offset(src, tsc);
-	ret = src->system_time + offset;
-	ret_flags = src->flags;
-
-	*cycles = ret;
-	*flags = ret_flags;
-	return version;
+	cycle_t offset = pvclock_scale_delta(delta, src->tsc_to_system_mul,
+					     src->tsc_shift);
+	return src->system_time + offset;
 }
 
 struct pvclock_vsyscall_time_info {
diff --git a/arch/x86/kernel/pvclock.c b/arch/x86/kernel/pvclock.c
index 07391d4..0b347ed 100644
--- a/arch/x86/kernel/pvclock.c
+++ b/arch/x86/kernel/pvclock.c
@@ -61,13 +61,12 @@ void pvclock_resume(void)
 u8 pvclock_read_flags(struct pvclock_vcpu_time_info *src)
 {
 	unsigned version;
-	cycle_t ret;
 	u8 flags;
 
 	do {
-		version = __pvclock_read_cycles(src, &ret, &flags,
-						native_read_tsc());
-	} while ((src->version & 1) || version != src->version);
+		version = pvclock_read_begin(src);
+		flags = src->flags;
+	} while (pvclock_read_retry(src, version));
 
 	return flags & valid_flags;
 }
@@ -80,9 +79,10 @@ cycle_t pvclock_clocksource_read(struct pvclock_vcpu_time_info *src)
 	u8 flags;
 
 	do {
-		version = __pvclock_read_cycles(src, &ret, &flags,
-						native_read_tsc());
-	} while ((src->version & 1) || version != src->version);
+		version = pvclock_read_begin(src);
+		ret = __pvclock_read_cycles(src, rdtsc_ordered());
+		flags = src->flags;
+	} while (pvclock_read_retry(src, version));
 
 	if (unlikely((flags & PVCLOCK_GUEST_STOPPED) != 0)) {
 		src->flags &= ~PVCLOCK_GUEST_STOPPED;
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 8ca07d0..407ceb2 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -1711,11 +1711,10 @@ static u64 __get_kvmclock_ns(struct kvm *kvm)
 	struct kvm_vcpu *vcpu = kvm_get_vcpu(kvm, 0);
 	struct kvm_arch *ka = &kvm->arch;
 	u64 ns;
-	u8 flags;
 
 	if (vcpu->arch.hv_clock.flags & PVCLOCK_TSC_STABLE_BIT) {
 		u64 tsc = kvm_read_l1_tsc(vcpu, native_read_tsc());
-		__pvclock_read_cycles(&vcpu->arch.hv_clock, &ns, &flags, tsc);
+		ns = __pvclock_read_cycles(&vcpu->arch.hv_clock, tsc);
 	} else {
 		ns = ktime_to_ns(ktime_get_boottime()) + ka->kvmclock_offset;
 	}
diff --git a/arch/x86/vdso/vclock_gettime.c b/arch/x86/vdso/vclock_gettime.c
index b6d3917..f079e1f 100644
--- a/arch/x86/vdso/vclock_gettime.c
+++ b/arch/x86/vdso/vclock_gettime.c
@@ -68,52 +68,47 @@ static notrace const struct pvclock_vsyscall_time_info *get_pvti(int cpu)
 	return &pvti_base[offset];
 }
 
-static notrace cycle_t vread_pvclock(int *mode)
+static notrace u64 vread_pvclock(int *mode)
 {
-	const struct pvclock_vsyscall_time_info *pvti;
-	cycle_t ret;
+	const struct pvclock_vcpu_time_info *pvti = &get_pvti(0)->pvti;
+	u64 ret;
 	u64 last;
 	u32 version;
-	u8 flags;
-	unsigned cpu, cpu1;
-
 
 	/*
-	 * Note: hypervisor must guarantee that:
-	 * 1. cpu ID number maps 1:1 to per-CPU pvclock time info.
-	 * 2. that per-CPU pvclock time info is updated if the
-	 *    underlying CPU changes.
-	 * 3. that version is increased whenever underlying CPU
-	 *    changes.
+	 * Note: The kernel and hypervisor must guarantee that cpu ID
+	 * number maps 1:1 to per-CPU pvclock time info.
+	 *
+	 * Because the hypervisor is entirely unaware of guest userspace
+	 * preemption, it cannot guarantee that per-CPU pvclock time
+	 * info is updated if the underlying CPU changes or that that
+	 * version is increased whenever underlying CPU changes.
+	 *
+	 * On KVM, we are guaranteed that pvti updates for any vCPU are
+	 * atomic as seen by *all* vCPUs.  This is an even stronger
+	 * guarantee than we get with a normal seqlock.
+	 *
+	 * On Xen, we don't appear to have that guarantee, but Xen still
+	 * supplies a valid seqlock using the version field.
 	 *
+	 * We only do pvclock vdso timing at all if
+	 * PVCLOCK_TSC_STABLE_BIT is set, and we interpret that bit to
+	 * mean that all vCPUs have matching pvti and that the TSC is
+	 * synced, so we can just look at vCPU 0's pvti.
 	 */
+
 	do {
-		cpu = __getcpu() & VGETCPU_CPU_MASK;
-		/* TODO: We can put vcpu id into higher bits of pvti.version.
-		 * This will save a couple of cycles by getting rid of
-		 * __getcpu() calls (Gleb).
-		 */
-
-		pvti = get_pvti(cpu);
-
-		version = __pvclock_read_cycles(&pvti->pvti, &ret, &flags,
-						rdtsc());
-
-		/*
-		 * Test we're still on the cpu as well as the version.
-		 * We could have been migrated just after the first
-		 * vgetcpu but before fetching the version, so we
-		 * wouldn't notice a version change.
-		 */
-		cpu1 = __getcpu() & VGETCPU_CPU_MASK;
-	} while (unlikely(cpu != cpu1 ||
-			  (pvti->pvti.version & 1) ||
-			  pvti->pvti.version != version));
-
-	if (unlikely(!(flags & PVCLOCK_TSC_STABLE_BIT)))
-		*mode = VCLOCK_NONE;
-
-	/* refer to tsc.c read_tsc() comment for rationale */
+		version = pvclock_read_begin(pvti);
+
+		if (unlikely(!(pvti->flags & PVCLOCK_TSC_STABLE_BIT))) {
+			*mode = VCLOCK_NONE;
+			return 0;
+		}
+
+		ret = __pvclock_read_cycles(pvti, rdtsc_ordered());
+	} while (pvclock_read_retry(pvti, version));
+
+	/* refer to vread_tsc() comment for rationale */
 	last = VVAR(vsyscall_gtod_data).clock.cycle_last;
 
 	if (likely(ret >= last))
-- 
2.10.1.352.g0cf3611
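
[Illustration only: the failure mode described in the backport note,
reproduced in isolation as a self-contained C program with hypothetical
TSC values.  With unsigned 64-bit arithmetic, a TSC sampled just before
the hypervisor bumps tsc_timestamp yields a wrapped-around delta.]

    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
            /* Hypothetical values: the guest samples the TSC, then the
             * hypervisor bumps tsc_timestamp before the guest reads it. */
            uint64_t tsc = 1000000;                /* sampled first */
            uint64_t tsc_timestamp = 1000100;      /* updated afterwards */
            uint64_t delta = tsc - tsc_timestamp;  /* u64 wraps, never negative */

            /* Prints 18446744073709551516: interpreted as an enormous
             * forward jump in time, hence the random stalls. */
            printf("delta = %llu\n", (unsigned long long)delta);
            return 0;
    }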