On Fri, Aug 23, 2019 at 5:03 PM Graf (AWS), Alexander <g...@amazon.com> wrote: > > > > > Am 23.08.2019 um 13:05 schrieb Anup Patel <a...@brainfault.org>: > > > >> On Fri, Aug 23, 2019 at 1:23 PM Alexander Graf <g...@amazon.com> wrote: > >> > >>> On 22.08.19 10:46, Anup Patel wrote: > >>> From: Atish Patra <atish.pa...@wdc.com> > >>> > >>> The RISC-V hypervisor specification doesn't have any virtual timer > >>> feature. > >>> > >>> Due to this, the guest VCPU timer will be programmed via SBI calls. > >>> The host will use a separate hrtimer event for each guest VCPU to > >>> provide timer functionality. We inject a virtual timer interrupt to > >>> the guest VCPU whenever the guest VCPU hrtimer event expires. > >>> > >>> The following features are not supported yet and will be added in > >>> future: > >>> 1. A time offset to adjust guest time from host time > >>> 2. A saved next event in guest vcpu for vm migration > >> > >> Implementing these 2 bits right now should be trivial. Why wait? > > > > We were waiting for HTIMEDELTA CSR to be merged so we > > deferred this items. > > > >> > >>> > >>> Signed-off-by: Atish Patra <atish.pa...@wdc.com> > >>> Signed-off-by: Anup Patel <anup.pa...@wdc.com> > >>> Acked-by: Paolo Bonzini <pbonz...@redhat.com> > >>> Reviewed-by: Paolo Bonzini <pbonz...@redhat.com> > >>> --- > >>> arch/riscv/include/asm/kvm_host.h | 4 + > >>> arch/riscv/include/asm/kvm_vcpu_timer.h | 32 +++++++ > >>> arch/riscv/kvm/Makefile | 2 +- > >>> arch/riscv/kvm/vcpu.c | 6 ++ > >>> arch/riscv/kvm/vcpu_timer.c | 106 ++++++++++++++++++++++++ > >>> drivers/clocksource/timer-riscv.c | 8 ++ > >>> include/clocksource/timer-riscv.h | 16 ++++ > >>> 7 files changed, 173 insertions(+), 1 deletion(-) > >>> create mode 100644 arch/riscv/include/asm/kvm_vcpu_timer.h > >>> create mode 100644 arch/riscv/kvm/vcpu_timer.c > >>> create mode 100644 include/clocksource/timer-riscv.h > >>> > >>> diff --git a/arch/riscv/include/asm/kvm_host.h > >>> b/arch/riscv/include/asm/kvm_host.h > >>> index ab33e59a3d88..d2a2e45eefc0 100644 > >>> --- a/arch/riscv/include/asm/kvm_host.h > >>> +++ b/arch/riscv/include/asm/kvm_host.h > >>> @@ -12,6 +12,7 @@ > >>> #include <linux/types.h> > >>> #include <linux/kvm.h> > >>> #include <linux/kvm_types.h> > >>> +#include <asm/kvm_vcpu_timer.h> > >>> > >>> #ifdef CONFIG_64BIT > >>> #define KVM_MAX_VCPUS (1U << 16) > >>> @@ -167,6 +168,9 @@ struct kvm_vcpu_arch { > >>> unsigned long irqs_pending; > >>> unsigned long irqs_pending_mask; > >>> > >>> + /* VCPU Timer */ > >>> + struct kvm_vcpu_timer timer; > >>> + > >>> /* MMIO instruction details */ > >>> struct kvm_mmio_decode mmio_decode; > >>> > >>> diff --git a/arch/riscv/include/asm/kvm_vcpu_timer.h > >>> b/arch/riscv/include/asm/kvm_vcpu_timer.h > >>> new file mode 100644 > >>> index 000000000000..df67ea86988e > >>> --- /dev/null > >>> +++ b/arch/riscv/include/asm/kvm_vcpu_timer.h > >>> @@ -0,0 +1,32 @@ > >>> +/* SPDX-License-Identifier: GPL-2.0-only */ > >>> +/* > >>> + * Copyright (C) 2019 Western Digital Corporation or its affiliates. > >>> + * > >>> + * Authors: > >>> + * Atish Patra <atish.pa...@wdc.com> > >>> + */ > >>> + > >>> +#ifndef __KVM_VCPU_RISCV_TIMER_H > >>> +#define __KVM_VCPU_RISCV_TIMER_H > >>> + > >>> +#include <linux/hrtimer.h> > >>> + > >>> +#define VCPU_TIMER_PROGRAM_THRESHOLD_NS 1000 > >>> + > >>> +struct kvm_vcpu_timer { > >>> + bool init_done; > >>> + /* Check if the timer is programmed */ > >>> + bool is_set; > >>> + struct hrtimer hrt; > >>> + /* Mult & Shift values to get nanosec from cycles */ > >>> + u32 mult; > >>> + u32 shift; > >>> +}; > >>> + > >>> +int kvm_riscv_vcpu_timer_init(struct kvm_vcpu *vcpu); > >>> +int kvm_riscv_vcpu_timer_deinit(struct kvm_vcpu *vcpu); > >>> +int kvm_riscv_vcpu_timer_reset(struct kvm_vcpu *vcpu); > >>> +int kvm_riscv_vcpu_timer_next_event(struct kvm_vcpu *vcpu, > >>> + unsigned long ncycles); > >> > >> This function never gets called? > > > > It's called from SBI emulation. > > > >> > >>> + > >>> +#endif > >>> diff --git a/arch/riscv/kvm/Makefile b/arch/riscv/kvm/Makefile > >>> index c0f57f26c13d..3e0c7558320d 100644 > >>> --- a/arch/riscv/kvm/Makefile > >>> +++ b/arch/riscv/kvm/Makefile > >>> @@ -9,6 +9,6 @@ ccflags-y := -Ivirt/kvm -Iarch/riscv/kvm > >>> kvm-objs := $(common-objs-y) > >>> > >>> kvm-objs += main.o vm.o vmid.o tlb.o mmu.o > >>> -kvm-objs += vcpu.o vcpu_exit.o vcpu_switch.o > >>> +kvm-objs += vcpu.o vcpu_exit.o vcpu_switch.o vcpu_timer.o > >>> > >>> obj-$(CONFIG_KVM) += kvm.o > >>> diff --git a/arch/riscv/kvm/vcpu.c b/arch/riscv/kvm/vcpu.c > >>> index 6124077d154f..018fca436776 100644 > >>> --- a/arch/riscv/kvm/vcpu.c > >>> +++ b/arch/riscv/kvm/vcpu.c > >>> @@ -54,6 +54,8 @@ static void kvm_riscv_reset_vcpu(struct kvm_vcpu *vcpu) > >>> > >>> memcpy(cntx, reset_cntx, sizeof(*cntx)); > >>> > >>> + kvm_riscv_vcpu_timer_reset(vcpu); > >>> + > >>> WRITE_ONCE(vcpu->arch.irqs_pending, 0); > >>> WRITE_ONCE(vcpu->arch.irqs_pending_mask, 0); > >>> } > >>> @@ -108,6 +110,9 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu) > >>> cntx->hstatus |= HSTATUS_SP2P; > >>> cntx->hstatus |= HSTATUS_SPV; > >>> > >>> + /* Setup VCPU timer */ > >>> + kvm_riscv_vcpu_timer_init(vcpu); > >>> + > >>> /* Reset VCPU */ > >>> kvm_riscv_reset_vcpu(vcpu); > >>> > >>> @@ -116,6 +121,7 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu) > >>> > >>> void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu) > >>> { > >>> + kvm_riscv_vcpu_timer_deinit(vcpu); > >>> kvm_riscv_stage2_flush_cache(vcpu); > >>> kmem_cache_free(kvm_vcpu_cache, vcpu); > >>> } > >>> diff --git a/arch/riscv/kvm/vcpu_timer.c b/arch/riscv/kvm/vcpu_timer.c > >>> new file mode 100644 > >>> index 000000000000..a45ca06e1aa6 > >>> --- /dev/null > >>> +++ b/arch/riscv/kvm/vcpu_timer.c > >>> @@ -0,0 +1,106 @@ > >>> +// SPDX-License-Identifier: GPL-2.0 > >>> +/* > >>> + * Copyright (C) 2019 Western Digital Corporation or its affiliates. > >>> + * > >>> + * Authors: > >>> + * Atish Patra <atish.pa...@wdc.com> > >>> + */ > >>> + > >>> +#include <linux/errno.h> > >>> +#include <linux/err.h> > >>> +#include <linux/kvm_host.h> > >>> +#include <clocksource/timer-riscv.h> > >>> +#include <asm/csr.h> > >>> +#include <asm/kvm_vcpu_timer.h> > >>> + > >>> +static enum hrtimer_restart kvm_riscv_vcpu_hrtimer_expired(struct > >>> hrtimer *h) > >>> +{ > >>> + struct kvm_vcpu_timer *t = container_of(h, struct kvm_vcpu_timer, > >>> hrt); > >>> + struct kvm_vcpu *vcpu = container_of(t, struct kvm_vcpu, > >>> arch.timer); > >>> + > >>> + t->is_set = false; > >>> + kvm_riscv_vcpu_set_interrupt(vcpu, IRQ_S_TIMER); > >>> + > >>> + return HRTIMER_NORESTART; > >>> +} > >>> + > >>> +static u64 kvm_riscv_delta_cycles2ns(u64 cycles, struct kvm_vcpu_timer > >>> *t) > >>> +{ > >>> + unsigned long flags; > >>> + u64 cycles_now, cycles_delta, delta_ns; > >>> + > >>> + local_irq_save(flags); > >>> + cycles_now = get_cycles64(); > >>> + if (cycles_now < cycles) > >>> + cycles_delta = cycles - cycles_now; > >>> + else > >>> + cycles_delta = 0; > >>> + delta_ns = (cycles_delta * t->mult) >> t->shift; > >>> + local_irq_restore(flags); > >>> + > >>> + return delta_ns; > >>> +} > >>> + > >>> +static int kvm_riscv_vcpu_timer_cancel(struct kvm_vcpu_timer *t) > >>> +{ > >>> + if (!t->init_done || !t->is_set) > >>> + return -EINVAL; > >>> + > >>> + hrtimer_cancel(&t->hrt); > >>> + t->is_set = false; > >>> + > >>> + return 0; > >>> +} > >>> + > >>> +int kvm_riscv_vcpu_timer_next_event(struct kvm_vcpu *vcpu, > >>> + unsigned long ncycles) > >>> +{ > >>> + struct kvm_vcpu_timer *t = &vcpu->arch.timer; > >>> + u64 delta_ns = kvm_riscv_delta_cycles2ns(ncycles, t); > >> > >> ... in fact, I feel like I'm missing something obvious here. How does > >> the guest trigger the timer event? What is the argument it uses for that > >> and how does that play with the tbfreq in the earlier patch? > > > > We have SBI call inferface between Hypervisor and Guest. One of the > > SBI call allows Guest to program time event. The next event is specified > > as absolute cycles. The Guest can read time using TIME CSR which > > returns system timer value (@ tbfreq freqency). > > > > Guest Linux will know the tbfreq from DTB passed by QEMU/KVMTOOL > > and it has to be same as Host tbfreq. > > > > The TBFREQ config register visible to user-space is a read-only CONFIG > > register which tells user-space tools (QEMU/KVMTOOL) about Host tbfreq. > > And it's read-only because you can not trap on TB reads?
There is no TB registers. The tbfreq can only be know through DT/ACPI kind-of HW description for both Host and Guest. The KVM user-space tool needs to know TBFREQ so that it can set correct value in generated DT for Guest Linux. Regards, Anup > > Alex > > > > > Regards, > > Anup > > > >> > >> > >> Alex > >>