Re: [PATCH v3 14/15] KVM: arm64: Handle protected guests at 32 bits

2021-07-19 Thread Oliver Upton
On Mon, Jul 19, 2021 at 9:04 AM Fuad Tabba  wrote:
>
> Protected KVM does not support protected AArch32 guests. However,
> it is possible for the guest to force run AArch32, potentially
> causing problems. Add an extra check so that if the hypervisor
> catches the guest doing that, it can prevent the guest from
> running again by resetting vcpu->arch.target and returning
> ARM_EXCEPTION_IL.
>
> Adapted from commit 22f553842b14 ("KVM: arm64: Handle Asymmetric
> AArch32 systems")
>
> Signed-off-by: Fuad Tabba 

Would it make sense to document how we handle misbehaved guests, in
case a particular VMM wants to clean up the mess afterwards?

--
Thanks,
Oliver

> ---
>  arch/arm64/kvm/hyp/include/hyp/switch.h | 24 
>  1 file changed, 24 insertions(+)
>
> diff --git a/arch/arm64/kvm/hyp/include/hyp/switch.h 
> b/arch/arm64/kvm/hyp/include/hyp/switch.h
> index 8431f1514280..f09343e15a80 100644
> --- a/arch/arm64/kvm/hyp/include/hyp/switch.h
> +++ b/arch/arm64/kvm/hyp/include/hyp/switch.h
> @@ -23,6 +23,7 @@
>  #include 
>  #include 
>  #include 
> +#include 
>  #include 
>  #include 
>  #include 
> @@ -477,6 +478,29 @@ static inline bool fixup_guest_exit(struct kvm_vcpu 
> *vcpu, u64 *exit_code)
> write_sysreg_el2(read_sysreg_el2(SYS_ELR) - 4, 
> SYS_ELR);
> }
>
> +   /*
> +* Protected VMs might not be allowed to run in AArch32. The check 
> below
> +* is based on the one in kvm_arch_vcpu_ioctl_run().
> +* The ARMv8 architecture doesn't give the hypervisor a mechanism to
> +* prevent a guest from dropping to AArch32 EL0 if implemented by the
> +* CPU. If the hypervisor spots a guest in such a state ensure it is
> +* handled, and don't trust the host to spot or fix it.
> +*/
> +   if (unlikely(is_nvhe_hyp_code() &&
> +kvm_vm_is_protected(kern_hyp_va(vcpu->kvm)) &&
> +FIELD_GET(FEATURE(ID_AA64PFR0_EL0),
> +  PVM_ID_AA64PFR0_ALLOW) <
> +ID_AA64PFR0_ELx_32BIT_64BIT &&
> +vcpu_mode_is_32bit(vcpu))) {
> +   /*
> +* As we have caught the guest red-handed, decide that it 
> isn't
> +* fit for purpose anymore by making the vcpu invalid.
> +*/
> +   vcpu->arch.target = -1;
> +   *exit_code = ARM_EXCEPTION_IL;
> +   goto exit;
> +   }
> +
> /*
>  * We're using the raw exception code in order to only process
>  * the trap if no SError is pending. We will come back to the
> --
> 2.32.0.402.g57bb445576-goog
>


Re: [RFC PATCH 0/5] KVM: arm64: Pass PSCI to userspace

2021-07-19 Thread Oliver Upton
On Mon, Jul 19, 2021 at 11:02 AM Jean-Philippe Brucker
 wrote:
> We forward the whole PSCI function range, so it's either KVM or userspace.
> If KVM manages PSCI and the guest calls an unimplemented function, that
> returns directly to the guest without going to userspace.
>
> The concern is valid for any other range, though. If userspace enables the
> HVC cap it receives function calls that at some point KVM might need to
> handle itself. So we need some negotiation between user and KVM about the
> specific HVC ranges that userspace can and will handle.

Are we going to use KVM_CAPs for every interesting HVC range that
userspace may want to trap? I wonder if a more generic interface for
hypercall filtering would have merit to handle the aforementioned
cases, and whatever else a VMM will want to intercept down the line.

For example, x86 has the concept of 'MSR filtering', wherein userspace
can specify a set of registers that it wants to intercept. Doing
something similar for HVCs would avoid the need for a kernel change
each time a VMM wishes to intercept a new hypercall.
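
For the sake of discussion, a rough sketch of what such a range-based
filter could look like; everything here (struct name, fields, semantics)
is invented for illustration and is not an existing KVM interface:

	#include <linux/types.h>

	/*
	 * Hypothetical sketch only -- not an existing KVM UAPI. Illustrates a
	 * range-based HVC filter in the spirit of x86 MSR filtering: userspace
	 * registers ranges of hypercall function IDs it wants to handle itself.
	 */
	struct kvm_hvc_filter_range {
		__u64 first_func_id;	/* first hypercall function ID in the range */
		__u64 last_func_id;	/* last function ID in the range (inclusive) */
		__u32 action;		/* hypothetical: exit to userspace or keep in KVM */
		__u32 pad;
	};

A VM-wide ioctl taking an array of such ranges would let userspace and
KVM negotiate ownership of hypercall ranges without a new capability for
each use case.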

--
Thanks,
Oliver


[PATCH v3 11/12] selftests: KVM: Test physical counter offsetting

2021-07-19 Thread Oliver Upton
Test that userspace adjustment of the guest physical counter-timer
results in the correct view of the counter within the guest.

Signed-off-by: Oliver Upton 
---
 .../selftests/kvm/include/aarch64/processor.h | 12 
 .../kvm/system_counter_offset_test.c  | 29 ---
 2 files changed, 37 insertions(+), 4 deletions(-)

diff --git a/tools/testing/selftests/kvm/include/aarch64/processor.h 
b/tools/testing/selftests/kvm/include/aarch64/processor.h
index 3168cdbae6ee..7f53d90e9512 100644
--- a/tools/testing/selftests/kvm/include/aarch64/processor.h
+++ b/tools/testing/selftests/kvm/include/aarch64/processor.h
@@ -141,4 +141,16 @@ static inline uint64_t read_cntvct_ordered(void)
return r;
 }
 
+static inline uint64_t read_cntpct_ordered(void)
+{
+   uint64_t r;
+
+   __asm__ __volatile__("isb\n\t"
+"mrs %0, cntpct_el0\n\t"
+"isb\n\t"
+: "=r"(r));
+
+   return r;
+}
+
 #endif /* SELFTEST_KVM_PROCESSOR_H */
diff --git a/tools/testing/selftests/kvm/system_counter_offset_test.c 
b/tools/testing/selftests/kvm/system_counter_offset_test.c
index 88ad997f5b69..3eed9dcb7693 100644
--- a/tools/testing/selftests/kvm/system_counter_offset_test.c
+++ b/tools/testing/selftests/kvm/system_counter_offset_test.c
@@ -57,6 +57,7 @@ static uint64_t host_read_guest_system_counter(struct 
test_case *test)
 
 enum arch_counter {
VIRTUAL,
+   PHYSICAL,
 };
 
 struct test_case {
@@ -68,23 +69,41 @@ static struct test_case test_cases[] = {
{ .counter = VIRTUAL, .offset = 0 },
{ .counter = VIRTUAL, .offset = 180 * NSEC_PER_SEC },
{ .counter = VIRTUAL, .offset = -180 * NSEC_PER_SEC },
+   { .counter = PHYSICAL, .offset = 0 },
+   { .counter = PHYSICAL, .offset = 180 * NSEC_PER_SEC },
+   { .counter = PHYSICAL, .offset = -180 * NSEC_PER_SEC },
 };
 
 static void check_preconditions(struct kvm_vm *vm)
 {
if (!_vcpu_has_device_attr(vm, VCPU_ID, KVM_ARM_VCPU_TIMER_CTRL,
-  KVM_ARM_VCPU_TIMER_OFFSET_VTIMER))
+  KVM_ARM_VCPU_TIMER_OFFSET_VTIMER) &&
+   !_vcpu_has_device_attr(vm, VCPU_ID, KVM_ARM_VCPU_TIMER_CTRL,
+  KVM_ARM_VCPU_TIMER_OFFSET_PTIMER))
return;
 
-   print_skip("KVM_ARM_VCPU_TIMER_OFFSET_VTIMER not supported; skipping 
test");
+   print_skip("KVM_ARM_VCPU_TIMER_OFFSET_{VTIMER,PTIMER} not supported; 
skipping test");
exit(KSFT_SKIP);
 }
 
 static void setup_system_counter(struct kvm_vm *vm, struct test_case *test)
 {
+   u64 attr = 0;
+
+   switch (test->counter) {
+   case VIRTUAL:
+   attr = KVM_ARM_VCPU_TIMER_OFFSET_VTIMER;
+   break;
+   case PHYSICAL:
+   attr = KVM_ARM_VCPU_TIMER_OFFSET_PTIMER;
+   break;
+   default:
+   TEST_ASSERT(false, "unrecognized counter index %u",
+   test->counter);
+   }
+
vcpu_access_device_attr(vm, VCPU_ID, KVM_ARM_VCPU_TIMER_CTRL,
> -   KVM_ARM_VCPU_TIMER_OFFSET_VTIMER, &test->offset,
> -   true);
> +   attr, &test->offset, true);
 }
 
 static uint64_t guest_read_system_counter(struct test_case *test)
@@ -92,6 +111,8 @@ static uint64_t guest_read_system_counter(struct test_case 
*test)
switch (test->counter) {
case VIRTUAL:
return read_cntvct_ordered();
+   case PHYSICAL:
+   return read_cntpct_ordered();
default:
GUEST_ASSERT(0);
}
-- 
2.32.0.402.g57bb445576-goog



[PATCH v3 10/12] KVM: arm64: Provide userspace access to the physical counter offset

2021-07-19 Thread Oliver Upton
Presently, KVM provides no facilities for correctly migrating a guest
that depends on the physical counter-timer. While most guests (barring
NV, of course) should not depend on the physical counter-timer, an
operator may still wish to provide a consistent view of the physical
counter-timer across migrations.

Provide userspace with a new vCPU attribute to modify the guest physical
counter-timer offset. Since the base architecture doesn't provide a
physical counter-timer offset register, emulate the correct behavior by
trapping accesses to the physical counter-timer whenever the offset
value is non-zero.

Uphold the same behavior as CNTVOFF_EL2 and broadcast the physical
offset to all vCPUs whenever written. This guarantees that the
counter-timer we provide the guest remains architectural, wherein all
views of the counter-timer are consistent across vCPUs. Reconfigure
timer traps for VHE on every guest entry, as different VMs will now have
different traps enabled. Enable physical counter traps for nVHE whenever
the offset is nonzero (we already trap physical timer registers in
nVHE).

FEAT_ECV provides a guest physical counter-timer offset register
(CNTPOFF_EL2), but ECV-enabled hardware is nonexistent at the time of
writing so support for it was elided for the sake of the author :)
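
For illustration, a minimal userspace sketch of programming the new
attribute (not part of the patch; assumes uapi headers with this series
applied, an already-open vCPU file descriptor, and no error handling
beyond the return value):

	#include <stdint.h>
	#include <sys/ioctl.h>
	#include <linux/kvm.h>

	/*
	 * Sketch: write a physical counter-timer offset through the new vCPU
	 * attribute. Per the patch, the last written value is broadcast to all
	 * vCPUs in the VM, so writing it on a single vCPU is sufficient.
	 */
	static int set_ptimer_offset(int vcpu_fd, uint64_t offset)
	{
		struct kvm_device_attr attr = {
			.group = KVM_ARM_VCPU_TIMER_CTRL,
			.attr  = KVM_ARM_VCPU_TIMER_OFFSET_PTIMER,
			.addr  = (uint64_t)&offset,
		};

		return ioctl(vcpu_fd, KVM_SET_DEVICE_ATTR, &attr);
	}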

Signed-off-by: Oliver Upton 
---
 Documentation/virt/kvm/devices/vcpu.rst   | 22 ++
 arch/arm64/include/asm/kvm_host.h |  1 +
 arch/arm64/include/asm/kvm_hyp.h  |  2 -
 arch/arm64/include/asm/sysreg.h   |  1 +
 arch/arm64/include/uapi/asm/kvm.h |  1 +
 arch/arm64/kvm/arch_timer.c   | 50 ---
 arch/arm64/kvm/arm.c  |  4 +-
 arch/arm64/kvm/hyp/include/hyp/switch.h   | 23 +++
 arch/arm64/kvm/hyp/include/hyp/timer-sr.h | 26 
 arch/arm64/kvm/hyp/nvhe/switch.c  |  2 -
 arch/arm64/kvm/hyp/nvhe/timer-sr.c| 21 +-
 arch/arm64/kvm/hyp/vhe/timer-sr.c | 27 
 include/kvm/arm_arch_timer.h  |  2 -
 13 files changed, 158 insertions(+), 24 deletions(-)
 create mode 100644 arch/arm64/kvm/hyp/include/hyp/timer-sr.h

diff --git a/Documentation/virt/kvm/devices/vcpu.rst 
b/Documentation/virt/kvm/devices/vcpu.rst
index 7b57cba3416a..a8547ce09b47 100644
--- a/Documentation/virt/kvm/devices/vcpu.rst
+++ b/Documentation/virt/kvm/devices/vcpu.rst
@@ -161,6 +161,28 @@ the following equation:
 KVM does not allow the use of varying offset values for different vCPUs;
 the last written offset value will be broadcasted to all vCPUs in a VM.
 
+2.3. ATTRIBUTE: KVM_ARM_VCPU_TIMER_OFFSET_PTIMER
+------------------------------------------------
+
+:Parameters: Pointer to a 64-bit unsigned counter-timer offset.
+
+Returns:
+
+ === ==
+ -EFAULT Error reading/writing the provided
+ parameter address
+ -ENXIO  Attribute not supported
+ === ==
+
+Specifies the guest's physical counter-timer offset from the host's
+virtual counter. The guest's physical counter is then derived by
+the following equation:
+
+  guest_cntpct = host_cntvct - KVM_ARM_VCPU_TIMER_OFFSET_PTIMER
+
+KVM does not allow the use of varying offset values for different vCPUs;
+the last written offset value will be broadcasted to all vCPUs in a VM.
+
 3. GROUP: KVM_ARM_VCPU_PVTIME_CTRL
 ==
 
diff --git a/arch/arm64/include/asm/kvm_host.h 
b/arch/arm64/include/asm/kvm_host.h
index 41911585ae0c..de92fa678924 100644
--- a/arch/arm64/include/asm/kvm_host.h
+++ b/arch/arm64/include/asm/kvm_host.h
@@ -204,6 +204,7 @@ enum vcpu_sysreg {
SP_EL1,
SPSR_EL1,
 
+   CNTPOFF_EL2,
CNTVOFF_EL2,
CNTV_CVAL_EL0,
CNTV_CTL_EL0,
diff --git a/arch/arm64/include/asm/kvm_hyp.h b/arch/arm64/include/asm/kvm_hyp.h
index 9d60b3006efc..01eb3864e50f 100644
--- a/arch/arm64/include/asm/kvm_hyp.h
+++ b/arch/arm64/include/asm/kvm_hyp.h
@@ -65,10 +65,8 @@ void __vgic_v3_save_aprs(struct vgic_v3_cpu_if *cpu_if);
 void __vgic_v3_restore_aprs(struct vgic_v3_cpu_if *cpu_if);
 int __vgic_v3_perform_cpuif_access(struct kvm_vcpu *vcpu);
 
-#ifdef __KVM_NVHE_HYPERVISOR__
 void __timer_enable_traps(struct kvm_vcpu *vcpu);
 void __timer_disable_traps(struct kvm_vcpu *vcpu);
-#endif
 
 #ifdef __KVM_NVHE_HYPERVISOR__
 void __sysreg_save_state_nvhe(struct kvm_cpu_context *ctxt);
diff --git a/arch/arm64/include/asm/sysreg.h b/arch/arm64/include/asm/sysreg.h
index 347ccac2341e..243e36c088e7 100644
--- a/arch/arm64/include/asm/sysreg.h
+++ b/arch/arm64/include/asm/sysreg.h
@@ -505,6 +505,7 @@
 #define SYS_AMEVCNTR0_MEM_STALLSYS_AMEVCNTR0_EL0(3)
 
 #define SYS_CNTFRQ_EL0 sys_reg(3, 3, 14, 0, 0)
+#define SYS_CNTPCT_EL0 sys_reg(3, 3, 14, 0, 1)
 
 #define SYS_CNTP_TVAL_EL0  sys_reg(3, 3, 14, 2, 0)
 #define SYS_CNTP_CTL_EL0

[PATCH v3 07/12] selftests: KVM: Introduce system counter offset test

2021-07-19 Thread Oliver Upton
Introduce a KVM selftest to verify that userspace manipulation of the
TSC (via the new vCPU attribute) results in the correct behavior within
the guest.

Signed-off-by: Oliver Upton 
---
 tools/testing/selftests/kvm/.gitignore|   1 +
 tools/testing/selftests/kvm/Makefile  |   1 +
 .../kvm/system_counter_offset_test.c  | 133 ++
 3 files changed, 135 insertions(+)
 create mode 100644 tools/testing/selftests/kvm/system_counter_offset_test.c

diff --git a/tools/testing/selftests/kvm/.gitignore 
b/tools/testing/selftests/kvm/.gitignore
index d0877d01e771..2752813d5090 100644
--- a/tools/testing/selftests/kvm/.gitignore
+++ b/tools/testing/selftests/kvm/.gitignore
@@ -50,3 +50,4 @@
 /set_memory_region_test
 /steal_time
 /kvm_binary_stats_test
+/system_counter_offset_test
diff --git a/tools/testing/selftests/kvm/Makefile 
b/tools/testing/selftests/kvm/Makefile
index f7e24f334c6e..7bf2e5fb1d5a 100644
--- a/tools/testing/selftests/kvm/Makefile
+++ b/tools/testing/selftests/kvm/Makefile
@@ -83,6 +83,7 @@ TEST_GEN_PROGS_x86_64 += memslot_perf_test
 TEST_GEN_PROGS_x86_64 += set_memory_region_test
 TEST_GEN_PROGS_x86_64 += steal_time
 TEST_GEN_PROGS_x86_64 += kvm_binary_stats_test
+TEST_GEN_PROGS_x86_64 += system_counter_offset_test
 
 TEST_GEN_PROGS_aarch64 += aarch64/debug-exceptions
 TEST_GEN_PROGS_aarch64 += aarch64/get-reg-list
diff --git a/tools/testing/selftests/kvm/system_counter_offset_test.c 
b/tools/testing/selftests/kvm/system_counter_offset_test.c
new file mode 100644
index ..7e9015770759
--- /dev/null
+++ b/tools/testing/selftests/kvm/system_counter_offset_test.c
@@ -0,0 +1,133 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (C) 2021, Google LLC.
+ *
+ * Tests for adjusting the system counter from userspace
+ */
+#include 
+#include 
+#include 
+#include 
+#include 
+
+#include "test_util.h"
+#include "kvm_util.h"
+#include "processor.h"
+
+#define VCPU_ID 0
+
+#ifdef __x86_64__
+
+struct test_case {
+   uint64_t tsc_offset;
+};
+
+static struct test_case test_cases[] = {
+   { 0 },
+   { 180 * NSEC_PER_SEC },
+   { -180 * NSEC_PER_SEC },
+};
+
+static void check_preconditions(struct kvm_vm *vm)
+{
+   if (!_vcpu_has_device_attr(vm, VCPU_ID, KVM_VCPU_TSC_CTRL, 
KVM_VCPU_TSC_OFFSET))
+   return;
+
+   print_skip("KVM_VCPU_TSC_OFFSET not supported; skipping test");
+   exit(KSFT_SKIP);
+}
+
+static void setup_system_counter(struct kvm_vm *vm, struct test_case *test)
+{
+   vcpu_access_device_attr(vm, VCPU_ID, KVM_VCPU_TSC_CTRL,
+   KVM_VCPU_TSC_OFFSET, &test->tsc_offset, true);
+}
+
+static uint64_t guest_read_system_counter(struct test_case *test)
+{
+   return rdtsc();
+}
+
+static uint64_t host_read_guest_system_counter(struct test_case *test)
+{
+   return rdtsc() + test->tsc_offset;
+}
+
+#else /* __x86_64__ */
+
+#error test not implemented for this architecture!
+
+#endif
+
+#define GUEST_SYNC_CLOCK(__stage, __val)   \
+   GUEST_SYNC_ARGS(__stage, __val, 0, 0, 0)
+
+static void guest_main(void)
+{
+   int i;
+
+   for (i = 0; i < ARRAY_SIZE(test_cases); i++) {
+   struct test_case *test = &test_cases[i];
+
+   GUEST_SYNC_CLOCK(i, guest_read_system_counter(test));
+   }
+
+   GUEST_DONE();
+}
+
+static void handle_sync(struct ucall *uc, uint64_t start, uint64_t end)
+{
+   uint64_t obs = uc->args[2];
+
+   TEST_ASSERT(start <= obs && obs <= end,
+   "unexpected system counter value: %"PRIu64" expected range: 
[%"PRIu64", %"PRIu64"]",
+   obs, start, end);
+
+   pr_info("system counter value: %"PRIu64" expected range [%"PRIu64", 
%"PRIu64"]\n",
+   obs, start, end);
+}
+
+static void handle_abort(struct ucall *uc)
+{
+   TEST_FAIL("%s at %s:%ld", (const char *)uc->args[0],
+ __FILE__, uc->args[1]);
+}
+
+static void enter_guest(struct kvm_vm *vm)
+{
+   uint64_t start, end;
+   struct ucall uc;
+   int i;
+
+   for (i = 0; i < ARRAY_SIZE(test_cases); i++) {
+   struct test_case *test = &test_cases[i];
+
+   setup_system_counter(vm, test);
+   start = host_read_guest_system_counter(test);
+   vcpu_run(vm, VCPU_ID);
+   end = host_read_guest_system_counter(test);
+
+   switch (get_ucall(vm, VCPU_ID, &uc)) {
+   case UCALL_SYNC:
+   handle_sync(&uc, start, end);
+   break;
+   case UCALL_ABORT:
+   handle_abort(&uc);
+   return;
+   case UCALL_DONE:
+   return;
+   }
+   }
+}
+
+int main(void)
+{
+   struct kvm_vm *vm;
+
+   vm = vm_create_default(VCPU_ID, 0, guest_main);
+   check_preconditions(vm);
+   ucall_init(vm, NULL);
+
+   enter_guest(vm);
+   kvm_vm_free(vm);
+}
-- 

[PATCH v3 08/12] KVM: arm64: Allow userspace to configure a vCPU's virtual offset

2021-07-19 Thread Oliver Upton
Add a new vCPU attribute that allows userspace to directly manipulate
the virtual counter-timer offset. Exposing such an interface allows for
the precise migration of guest virtual counter-timers, as it is an
idempotent interface.

Uphold the existing behavior of writes to CNTVOFF_EL2 for this new
interface, wherein a write to a single vCPU is broadcasted to all vCPUs
within a VM.
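
As a small illustration of the documented relation
(guest_cntvct = host_cntvct - offset), the offset a VMM would write to
restore a previously saved guest view is simply the difference below; a
sketch only, ignoring any blackout time the VMM may wish to credit to
the guest:

	#include <stdint.h>

	/*
	 * Sketch: compute the value to write via
	 * KVM_ARM_VCPU_TIMER_OFFSET_VTIMER so that the guest resumes from
	 * guest_cntvct_at_save, given the host's current CNTVCT.
	 */
	static inline uint64_t vtimer_offset(uint64_t host_cntvct_now,
					     uint64_t guest_cntvct_at_save)
	{
		return host_cntvct_now - guest_cntvct_at_save;
	}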

Signed-off-by: Oliver Upton 
---
 Documentation/virt/kvm/devices/vcpu.rst | 22 
 arch/arm64/include/uapi/asm/kvm.h   |  1 +
 arch/arm64/kvm/arch_timer.c | 68 -
 3 files changed, 89 insertions(+), 2 deletions(-)

diff --git a/Documentation/virt/kvm/devices/vcpu.rst 
b/Documentation/virt/kvm/devices/vcpu.rst
index b46d5f742e69..7b57cba3416a 100644
--- a/Documentation/virt/kvm/devices/vcpu.rst
+++ b/Documentation/virt/kvm/devices/vcpu.rst
@@ -139,6 +139,28 @@ configured values on other VCPUs.  Userspace should 
configure the interrupt
 numbers on at least one VCPU after creating all VCPUs and before running any
 VCPUs.
 
+2.2. ATTRIBUTE: KVM_ARM_VCPU_TIMER_OFFSET_VTIMER
+------------------------------------------------
+
+:Parameters: Pointer to a 64-bit unsigned counter-timer offset.
+
+Returns:
+
+=== ==
+-EFAULT Error reading/writing the provided
+parameter address
+-ENXIO  Attribute not supported
+=== ==
+
+Specifies the guest's virtual counter-timer offset from the host's
+virtual counter. The guest's virtual counter is then derived by
+the following equation:
+
+  guest_cntvct = host_cntvct - KVM_ARM_VCPU_TIMER_OFFSET_VTIMER
+
+KVM does not allow the use of varying offset values for different vCPUs;
+the last written offset value will be broadcasted to all vCPUs in a VM.
+
 3. GROUP: KVM_ARM_VCPU_PVTIME_CTRL
 ==
 
diff --git a/arch/arm64/include/uapi/asm/kvm.h 
b/arch/arm64/include/uapi/asm/kvm.h
index b3edde68bc3e..008d0518d2b1 100644
--- a/arch/arm64/include/uapi/asm/kvm.h
+++ b/arch/arm64/include/uapi/asm/kvm.h
@@ -365,6 +365,7 @@ struct kvm_arm_copy_mte_tags {
 #define KVM_ARM_VCPU_TIMER_CTRL1
 #define   KVM_ARM_VCPU_TIMER_IRQ_VTIMER0
 #define   KVM_ARM_VCPU_TIMER_IRQ_PTIMER1
+#define   KVM_ARM_VCPU_TIMER_OFFSET_VTIMER 2
 #define KVM_ARM_VCPU_PVTIME_CTRL   2
 #define   KVM_ARM_VCPU_PVTIME_IPA  0
 
diff --git a/arch/arm64/kvm/arch_timer.c b/arch/arm64/kvm/arch_timer.c
index 3df67c127489..d2b1b13af658 100644
--- a/arch/arm64/kvm/arch_timer.c
+++ b/arch/arm64/kvm/arch_timer.c
@@ -1305,7 +1305,7 @@ static void set_timer_irqs(struct kvm *kvm, int 
vtimer_irq, int ptimer_irq)
}
 }
 
-int kvm_arm_timer_set_attr(struct kvm_vcpu *vcpu, struct kvm_device_attr *attr)
+int kvm_arm_timer_set_attr_irq(struct kvm_vcpu *vcpu, struct kvm_device_attr 
*attr)
 {
int __user *uaddr = (int __user *)(long)attr->addr;
struct arch_timer_context *vtimer = vcpu_vtimer(vcpu);
@@ -1338,7 +1338,39 @@ int kvm_arm_timer_set_attr(struct kvm_vcpu *vcpu, struct 
kvm_device_attr *attr)
return 0;
 }
 
-int kvm_arm_timer_get_attr(struct kvm_vcpu *vcpu, struct kvm_device_attr *attr)
+int kvm_arm_timer_set_attr_offset(struct kvm_vcpu *vcpu, struct 
kvm_device_attr *attr)
+{
+   u64 __user *uaddr = (u64 __user *)(long)attr->addr;
+   u64 offset;
+
+   if (get_user(offset, uaddr))
+   return -EFAULT;
+
+   switch (attr->attr) {
+   case KVM_ARM_VCPU_TIMER_OFFSET_VTIMER:
+   update_vtimer_cntvoff(vcpu, offset);
+   break;
+   default:
+   return -ENXIO;
+   }
+
+   return 0;
+}
+
+int kvm_arm_timer_set_attr(struct kvm_vcpu *vcpu, struct kvm_device_attr *attr)
+{
+   switch (attr->attr) {
+   case KVM_ARM_VCPU_TIMER_IRQ_VTIMER:
+   case KVM_ARM_VCPU_TIMER_IRQ_PTIMER:
+   return kvm_arm_timer_set_attr_irq(vcpu, attr);
+   case KVM_ARM_VCPU_TIMER_OFFSET_VTIMER:
+   return kvm_arm_timer_set_attr_offset(vcpu, attr);
+   }
+
+   return -ENXIO;
+}
+
+int kvm_arm_timer_get_attr_irq(struct kvm_vcpu *vcpu, struct kvm_device_attr 
*attr)
 {
int __user *uaddr = (int __user *)(long)attr->addr;
struct arch_timer_context *timer;
@@ -1359,11 +1391,43 @@ int kvm_arm_timer_get_attr(struct kvm_vcpu *vcpu, 
struct kvm_device_attr *attr)
return put_user(irq, uaddr);
 }
 
+int kvm_arm_timer_get_attr_offset(struct kvm_vcpu *vcpu, struct 
kvm_device_attr *attr)
+{
+   u64 __user *uaddr = (u64 __user *)(long)attr->addr;
+   struct arch_timer_context *timer;
+   u64 offset;
+
+   switch (attr->attr) {
+   case KVM_ARM_VCPU_TIMER_OFFSET_VTIMER:
+   timer = vcpu_vtimer(vcpu);
+   break;
+   default:
+   return -ENXIO;
+   }
+
+   offset = timer_get_offset(timer);
+   return 

[PATCH v3 09/12] selftests: KVM: Add support for aarch64 to system_counter_offset_test

2021-07-19 Thread Oliver Upton
KVM/arm64 now allows userspace to adjust the guest virtual counter-timer
via a vCPU device attribute. Test that changes to the virtual
counter-timer offset result in the correct view being presented to the
guest.

Signed-off-by: Oliver Upton 
---
 tools/testing/selftests/kvm/Makefile  |  1 +
 .../selftests/kvm/include/aarch64/processor.h | 12 +
 .../kvm/system_counter_offset_test.c  | 54 ++-
 3 files changed, 66 insertions(+), 1 deletion(-)

diff --git a/tools/testing/selftests/kvm/Makefile 
b/tools/testing/selftests/kvm/Makefile
index 7bf2e5fb1d5a..d89908108c97 100644
--- a/tools/testing/selftests/kvm/Makefile
+++ b/tools/testing/selftests/kvm/Makefile
@@ -96,6 +96,7 @@ TEST_GEN_PROGS_aarch64 += kvm_page_table_test
 TEST_GEN_PROGS_aarch64 += set_memory_region_test
 TEST_GEN_PROGS_aarch64 += steal_time
 TEST_GEN_PROGS_aarch64 += kvm_binary_stats_test
+TEST_GEN_PROGS_aarch64 += system_counter_offset_test
 
 TEST_GEN_PROGS_s390x = s390x/memop
 TEST_GEN_PROGS_s390x += s390x/resets
diff --git a/tools/testing/selftests/kvm/include/aarch64/processor.h 
b/tools/testing/selftests/kvm/include/aarch64/processor.h
index 27dc5c2e56b9..3168cdbae6ee 100644
--- a/tools/testing/selftests/kvm/include/aarch64/processor.h
+++ b/tools/testing/selftests/kvm/include/aarch64/processor.h
@@ -129,4 +129,16 @@ void vm_install_sync_handler(struct kvm_vm *vm,
 
 #define isb()  asm volatile("isb" : : : "memory")
 
+static inline uint64_t read_cntvct_ordered(void)
+{
+   uint64_t r;
+
+   __asm__ __volatile__("isb\n\t"
+"mrs %0, cntvct_el0\n\t"
+"isb\n\t"
+: "=r"(r));
+
+   return r;
+}
+
 #endif /* SELFTEST_KVM_PROCESSOR_H */
diff --git a/tools/testing/selftests/kvm/system_counter_offset_test.c 
b/tools/testing/selftests/kvm/system_counter_offset_test.c
index 7e9015770759..88ad997f5b69 100644
--- a/tools/testing/selftests/kvm/system_counter_offset_test.c
+++ b/tools/testing/selftests/kvm/system_counter_offset_test.c
@@ -53,7 +53,59 @@ static uint64_t host_read_guest_system_counter(struct 
test_case *test)
return rdtsc() + test->tsc_offset;
 }
 
-#else /* __x86_64__ */
+#elif __aarch64__ /* __x86_64__ */
+
+enum arch_counter {
+   VIRTUAL,
+};
+
+struct test_case {
+   enum arch_counter counter;
+   uint64_t offset;
+};
+
+static struct test_case test_cases[] = {
+   { .counter = VIRTUAL, .offset = 0 },
+   { .counter = VIRTUAL, .offset = 180 * NSEC_PER_SEC },
+   { .counter = VIRTUAL, .offset = -180 * NSEC_PER_SEC },
+};
+
+static void check_preconditions(struct kvm_vm *vm)
+{
+   if (!_vcpu_has_device_attr(vm, VCPU_ID, KVM_ARM_VCPU_TIMER_CTRL,
+  KVM_ARM_VCPU_TIMER_OFFSET_VTIMER))
+   return;
+
+   print_skip("KVM_ARM_VCPU_TIMER_OFFSET_VTIMER not supported; skipping 
test");
+   exit(KSFT_SKIP);
+}
+
+static void setup_system_counter(struct kvm_vm *vm, struct test_case *test)
+{
+   vcpu_access_device_attr(vm, VCPU_ID, KVM_ARM_VCPU_TIMER_CTRL,
+   KVM_ARM_VCPU_TIMER_OFFSET_VTIMER, &test->offset,
+   true);
+}
+
+static uint64_t guest_read_system_counter(struct test_case *test)
+{
+   switch (test->counter) {
+   case VIRTUAL:
+   return read_cntvct_ordered();
+   default:
+   GUEST_ASSERT(0);
+   }
+
+   /* unreachable */
+   return 0;
+}
+
+static uint64_t host_read_guest_system_counter(struct test_case *test)
+{
+   return read_cntvct_ordered() - test->offset;
+}
+
+#else /* __aarch64__ */
 
 #error test not implemented for this architecture!
 
-- 
2.32.0.402.g57bb445576-goog



[PATCH v3 12/12] selftests: KVM: Add counter emulation benchmark

2021-07-19 Thread Oliver Upton
Add a test case for counter emulation on arm64. A side effect of how KVM
handles physical counter offsetting on non-ECV systems is that the
virtual counter will always hit hardware and the physical could be
emulated. Force emulation by writing a nonzero offset to the physical
counter and compare the elapsed cycles to a direct read of the hardware
register.

Reviewed-by: Ricardo Koller 
Signed-off-by: Oliver Upton 
---
 tools/testing/selftests/kvm/.gitignore|   1 +
 tools/testing/selftests/kvm/Makefile  |   1 +
 .../kvm/aarch64/counter_emulation_benchmark.c | 215 ++
 3 files changed, 217 insertions(+)
 create mode 100644 
tools/testing/selftests/kvm/aarch64/counter_emulation_benchmark.c

diff --git a/tools/testing/selftests/kvm/.gitignore 
b/tools/testing/selftests/kvm/.gitignore
index 2752813d5090..1d811c6a769b 100644
--- a/tools/testing/selftests/kvm/.gitignore
+++ b/tools/testing/selftests/kvm/.gitignore
@@ -1,5 +1,6 @@
 # SPDX-License-Identifier: GPL-2.0-only
 /aarch64/debug-exceptions
+/aarch64/counter_emulation_benchmark
 /aarch64/get-reg-list
 /aarch64/vgic_init
 /s390x/memop
diff --git a/tools/testing/selftests/kvm/Makefile 
b/tools/testing/selftests/kvm/Makefile
index d89908108c97..e560a3e74bc2 100644
--- a/tools/testing/selftests/kvm/Makefile
+++ b/tools/testing/selftests/kvm/Makefile
@@ -86,6 +86,7 @@ TEST_GEN_PROGS_x86_64 += kvm_binary_stats_test
 TEST_GEN_PROGS_x86_64 += system_counter_offset_test
 
 TEST_GEN_PROGS_aarch64 += aarch64/debug-exceptions
+TEST_GEN_PROGS_aarch64 += aarch64/counter_emulation_benchmark
 TEST_GEN_PROGS_aarch64 += aarch64/get-reg-list
 TEST_GEN_PROGS_aarch64 += aarch64/vgic_init
 TEST_GEN_PROGS_aarch64 += demand_paging_test
diff --git a/tools/testing/selftests/kvm/aarch64/counter_emulation_benchmark.c 
b/tools/testing/selftests/kvm/aarch64/counter_emulation_benchmark.c
new file mode 100644
index ..73aeb6cdebfe
--- /dev/null
+++ b/tools/testing/selftests/kvm/aarch64/counter_emulation_benchmark.c
@@ -0,0 +1,215 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * counter_emulation_benchmark.c -- test to measure the effects of counter
+ * emulation on guest reads of the physical counter.
+ *
+ * Copyright (c) 2021, Google LLC.
+ */
+
+#define _GNU_SOURCE
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+#include "kvm_util.h"
+#include "processor.h"
+#include "test_util.h"
+
+#define VCPU_ID 0
+
+static struct counter_values {
+   uint64_t cntvct_start;
+   uint64_t cntpct;
+   uint64_t cntvct_end;
+} counter_values;
+
+static uint64_t nr_iterations = 1000;
+
+static void do_test(void)
+{
+   /*
+* Open-coded approach instead of using helper methods to keep a tight
+* interval around the physical counter read.
+*/
+   asm volatile("isb\n\t"
+"mrs %[cntvct_start], cntvct_el0\n\t"
+"isb\n\t"
+"mrs %[cntpct], cntpct_el0\n\t"
+"isb\n\t"
+"mrs %[cntvct_end], cntvct_el0\n\t"
+"isb\n\t"
+: [cntvct_start] "=r"(counter_values.cntvct_start),
+[cntpct] "=r"(counter_values.cntpct),
+[cntvct_end] "=r"(counter_values.cntvct_end));
+}
+
+static void guest_main(void)
+{
+   int i;
+
+   for (i = 0; i < nr_iterations; i++) {
+   do_test();
+   GUEST_SYNC(i);
+   }
+
+   for (i = 0; i < nr_iterations; i++) {
+   do_test();
+   GUEST_SYNC(i);
+   }
+
+   GUEST_DONE();
+}
+
+static bool enter_guest(struct kvm_vm *vm)
+{
+   struct ucall uc;
+
+   vcpu_ioctl(vm, VCPU_ID, KVM_RUN, NULL);
+
+   switch (get_ucall(vm, VCPU_ID, &uc)) {
+   case UCALL_DONE:
+   return true;
+   case UCALL_SYNC:
+   break;
+   case UCALL_ABORT:
+   TEST_ASSERT(false, "%s at %s:%ld", (const char *)uc.args[0],
+   __FILE__, uc.args[1]);
+   break;
+   default:
+   TEST_ASSERT(false, "unexpected exit: %s",
+   exit_reason_str(vcpu_state(vm, 
VCPU_ID)->exit_reason));
+   break;
+   }
+
+   /* more work to do in the guest */
+   return false;
+}
+
+static double counter_frequency(void)
+{
+   uint32_t freq;
+
+   asm volatile("mrs %0, cntfrq_el0"
+: "=r" (freq));
+
+   return freq / 100.0;
+}
+
+static void log_csv(FILE *csv, bool trapped)
+{
+   double freq = counter_frequency();
+
+   fprintf(csv, "%s,%.02f,%lu,%lu,%lu\n",
+   trapped ? "true" : "false", freq,
+   counter_values.cntvct_start,
+   counter_values.cntpct,
+   counter_values.cntvct_end);
+}
+
+static double run_loop(struct kvm_vm *vm, FILE *csv, bool trapped)
+{
+   double avg = 0;
+   int i;
+
+   for (i = 0; i < nr_iterations; i++) {
+   

[PATCH v3 02/12] KVM: x86: Refactor tsc synchronization code

2021-07-19 Thread Oliver Upton
Refactor kvm_synchronize_tsc to make a new function that allows callers
to specify TSC parameters (offset, value, nanoseconds, etc.) explicitly
for the sake of participating in TSC synchronization.

This changes the locking semantics around TSC writes. Writes to the TSC
will now take the pvclock gtod lock while holding the tsc write lock,
whereas before these locks were disjoint.

Reviewed-by: David Matlack 
Signed-off-by: Oliver Upton 
---
 Documentation/virt/kvm/locking.rst |  11 +++
 arch/x86/kvm/x86.c | 106 +
 2 files changed, 74 insertions(+), 43 deletions(-)

diff --git a/Documentation/virt/kvm/locking.rst 
b/Documentation/virt/kvm/locking.rst
index 35eca377543d..ac62e1c76694 100644
--- a/Documentation/virt/kvm/locking.rst
+++ b/Documentation/virt/kvm/locking.rst
@@ -30,6 +30,9 @@ On x86:
   holding kvm->arch.mmu_lock (typically with ``read_lock``, otherwise
   there's no need to take kvm->arch.tdp_mmu_pages_lock at all).
 
+- kvm->arch.tsc_write_lock is taken outside
+  kvm->arch.pvclock_gtod_sync_lock
+
 Everything else is a leaf: no other lock is taken inside the critical
 sections.
 
@@ -216,6 +219,14 @@ time it will be set using the Dirty tracking mechanism 
described above.
 :Comment:  'raw' because hardware enabling/disabling must be atomic /wrt
migration.
 
+:Name: kvm_arch::pvclock_gtod_sync_lock
+:Type: raw_spinlock_t
+:Arch: x86
+:Protects: kvm_arch::{cur_tsc_generation,cur_tsc_nsec,cur_tsc_write,
+   cur_tsc_offset,nr_vcpus_matched_tsc}
+:Comment:  'raw' because updating the kvm master clock must not be
+   preempted.
+
 :Name: kvm_arch::tsc_write_lock
 :Type: raw_spinlock
 :Arch: x86
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index bff78168d4a2..580ba0e86687 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -2441,13 +2441,73 @@ static inline bool kvm_check_tsc_unstable(void)
return check_tsc_unstable();
 }
 
+/*
+ * Infers attempts to synchronize the guest's tsc from host writes. Sets the
+ * offset for the vcpu and tracks the TSC matching generation that the vcpu
+ * participates in.
+ *
+ * Must hold kvm->arch.tsc_write_lock to call this function.
+ */
+static void __kvm_synchronize_tsc(struct kvm_vcpu *vcpu, u64 offset, u64 tsc,
+ u64 ns, bool matched)
+{
+   struct kvm *kvm = vcpu->kvm;
+   bool already_matched;
+   unsigned long flags;
+
+   lockdep_assert_held(&kvm->arch.tsc_write_lock);
+
+   already_matched =
+  (vcpu->arch.this_tsc_generation == kvm->arch.cur_tsc_generation);
+
+   /*
+* We track the most recent recorded KHZ, write and time to
+* allow the matching interval to be extended at each write.
+*/
+   kvm->arch.last_tsc_nsec = ns;
+   kvm->arch.last_tsc_write = tsc;
+   kvm->arch.last_tsc_khz = vcpu->arch.virtual_tsc_khz;
+
+   vcpu->arch.last_guest_tsc = tsc;
+
+   /* Keep track of which generation this VCPU has synchronized to */
+   vcpu->arch.this_tsc_generation = kvm->arch.cur_tsc_generation;
+   vcpu->arch.this_tsc_nsec = kvm->arch.cur_tsc_nsec;
+   vcpu->arch.this_tsc_write = kvm->arch.cur_tsc_write;
+
+   kvm_vcpu_write_tsc_offset(vcpu, offset);
+
+   spin_lock_irqsave(&kvm->arch.pvclock_gtod_sync_lock, flags);
+   if (!matched) {
+   /*
+* We split periods of matched TSC writes into generations.
+* For each generation, we track the original measured
+* nanosecond time, offset, and write, so if TSCs are in
+* sync, we can match exact offset, and if not, we can match
+* exact software computation in compute_guest_tsc()
+*
+* These values are tracked in kvm->arch.cur_xxx variables.
+*/
+   kvm->arch.nr_vcpus_matched_tsc = 0;
+   kvm->arch.cur_tsc_generation++;
+   kvm->arch.cur_tsc_nsec = ns;
+   kvm->arch.cur_tsc_write = tsc;
+   kvm->arch.cur_tsc_offset = offset;
+   matched = false;
+   } else if (!already_matched) {
+   kvm->arch.nr_vcpus_matched_tsc++;
+   }
+
+   kvm_track_tsc_matching(vcpu);
+   spin_unlock_irqrestore(&kvm->arch.pvclock_gtod_sync_lock, flags);
+}
+
 static void kvm_synchronize_tsc(struct kvm_vcpu *vcpu, u64 data)
 {
struct kvm *kvm = vcpu->kvm;
u64 offset, ns, elapsed;
unsigned long flags;
-   bool matched;
-   bool already_matched;
+   bool matched = false;
bool synchronizing = false;
 
raw_spin_lock_irqsave(&kvm->arch.tsc_write_lock, flags);
@@ -2493,51 +2553,11 @@ static void kvm_synchronize_tsc(struct kvm_vcpu *vcpu, 
u64 data)
offset = kvm_compute_l1_tsc_offset(vcpu, data);
}
matched = true;
-  

[PATCH v3 05/12] selftests: KVM: Add test for KVM_{GET,SET}_CLOCK

2021-07-19 Thread Oliver Upton
Add a selftest for the new KVM clock UAPI that was introduced. Ensure
that the KVM clock is consistent between userspace and the guest, and
that the difference in realtime will only ever cause the KVM clock to
advance forward.

Signed-off-by: Oliver Upton 
---
 tools/testing/selftests/kvm/.gitignore|   1 +
 tools/testing/selftests/kvm/Makefile  |   1 +
 .../testing/selftests/kvm/include/kvm_util.h  |   2 +
 .../selftests/kvm/x86_64/kvm_clock_test.c | 210 ++
 4 files changed, 214 insertions(+)
 create mode 100644 tools/testing/selftests/kvm/x86_64/kvm_clock_test.c

diff --git a/tools/testing/selftests/kvm/.gitignore 
b/tools/testing/selftests/kvm/.gitignore
index 06a351b4f93b..d0877d01e771 100644
--- a/tools/testing/selftests/kvm/.gitignore
+++ b/tools/testing/selftests/kvm/.gitignore
@@ -11,6 +11,7 @@
 /x86_64/emulator_error_test
 /x86_64/get_cpuid_test
 /x86_64/get_msr_index_features
+/x86_64/kvm_clock_test
 /x86_64/kvm_pv_test
 /x86_64/hyperv_clock
 /x86_64/hyperv_cpuid
diff --git a/tools/testing/selftests/kvm/Makefile 
b/tools/testing/selftests/kvm/Makefile
index b853be2ae3c6..f7e24f334c6e 100644
--- a/tools/testing/selftests/kvm/Makefile
+++ b/tools/testing/selftests/kvm/Makefile
@@ -46,6 +46,7 @@ TEST_GEN_PROGS_x86_64 += x86_64/get_cpuid_test
 TEST_GEN_PROGS_x86_64 += x86_64/hyperv_clock
 TEST_GEN_PROGS_x86_64 += x86_64/hyperv_cpuid
 TEST_GEN_PROGS_x86_64 += x86_64/hyperv_features
+TEST_GEN_PROGS_x86_64 += x86_64/kvm_clock_test
 TEST_GEN_PROGS_x86_64 += x86_64/kvm_pv_test
 TEST_GEN_PROGS_x86_64 += x86_64/mmio_warning_test
 TEST_GEN_PROGS_x86_64 += x86_64/mmu_role_test
diff --git a/tools/testing/selftests/kvm/include/kvm_util.h 
b/tools/testing/selftests/kvm/include/kvm_util.h
index 010b59b13917..a8ac5d52e17b 100644
--- a/tools/testing/selftests/kvm/include/kvm_util.h
+++ b/tools/testing/selftests/kvm/include/kvm_util.h
@@ -19,6 +19,8 @@
 #define KVM_DEV_PATH "/dev/kvm"
 #define KVM_MAX_VCPUS 512
 
+#define NSEC_PER_SEC 1000000000L
+
 /*
  * Callers of kvm_util only have an incomplete/opaque description of the
  * structure kvm_util is using to maintain the state of a VM.
diff --git a/tools/testing/selftests/kvm/x86_64/kvm_clock_test.c 
b/tools/testing/selftests/kvm/x86_64/kvm_clock_test.c
new file mode 100644
index ..12aef5725dc4
--- /dev/null
+++ b/tools/testing/selftests/kvm/x86_64/kvm_clock_test.c
@@ -0,0 +1,210 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (C) 2021, Google LLC.
+ *
+ * Tests for adjusting the KVM clock from userspace
+ */
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+#include "test_util.h"
+#include "kvm_util.h"
+#include "processor.h"
+
+#define VCPU_ID 0
+
+struct test_case {
+   uint64_t kvmclock_base;
+   int64_t realtime_offset;
+};
+
+static struct test_case test_cases[] = {
+   { .kvmclock_base = 0 },
+   { .kvmclock_base = 180 * NSEC_PER_SEC },
+   { .kvmclock_base = 0, .realtime_offset = -180 * NSEC_PER_SEC },
+   { .kvmclock_base = 0, .realtime_offset = 180 * NSEC_PER_SEC },
+};
+
+#define GUEST_SYNC_CLOCK(__stage, __val)   \
+   GUEST_SYNC_ARGS(__stage, __val, 0, 0, 0)
+
+static void guest_main(vm_paddr_t pvti_pa, struct pvclock_vcpu_time_info *pvti)
+{
+   int i;
+
+   wrmsr(MSR_KVM_SYSTEM_TIME_NEW, pvti_pa | KVM_MSR_ENABLED);
+   for (i = 0; i < ARRAY_SIZE(test_cases); i++) {
+   GUEST_SYNC_CLOCK(i, __pvclock_read_cycles(pvti, rdtsc()));
+   }
+
+   GUEST_DONE();
+}
+
+#define EXPECTED_FLAGS (KVM_CLOCK_REALTIME | KVM_CLOCK_HOST_TSC)
+
+static inline void assert_flags(struct kvm_clock_data *data)
+{
+   TEST_ASSERT((data->flags & EXPECTED_FLAGS) == EXPECTED_FLAGS,
+   "unexpected clock data flags: %x (want set: %x)",
+   data->flags, EXPECTED_FLAGS);
+}
+
+static void handle_sync(struct ucall *uc, struct kvm_clock_data *start,
+   struct kvm_clock_data *end)
+{
+   uint64_t obs, exp_lo, exp_hi;
+
+   obs = uc->args[2];
+   exp_lo = start->clock;
+   exp_hi = end->clock;
+
+   assert_flags(start);
+   assert_flags(end);
+
+   TEST_ASSERT(exp_lo <= obs && obs <= exp_hi,
+   "unexpected kvm-clock value: %"PRIu64" expected range: 
[%"PRIu64", %"PRIu64"]",
+   obs, exp_lo, exp_hi);
+
+   pr_info("kvm-clock value: %"PRIu64" expected range [%"PRIu64", 
%"PRIu64"]\n",
+   obs, exp_lo, exp_hi);
+}
+
+static void handle_abort(struct ucall *uc)
+{
+   TEST_FAIL("%s at %s:%ld", (const char *)uc->args[0],
+ __FILE__, uc->args[1]);
+}
+
+static void setup_clock(struct kvm_vm *vm, struct test_case *test_case)
+{
+   struct kvm_clock_data data;
+
+   memset(&data, 0, sizeof(data));
+
+   data.clock = test_case->kvmclock_base;
+   if (test_case->realtime_offset) {
+   struct timespec ts;
+   int r;
+
+   

[PATCH v3 06/12] selftests: KVM: Add helpers for vCPU device attributes

2021-07-19 Thread Oliver Upton
vCPU file descriptors are abstracted away from test code in KVM
selftests, meaning that tests cannot directly access a vCPU's device
attributes. Add helpers that tests can use to get at vCPU device
attributes.
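
A short sketch of how a test might use the new accessors (mirroring the
system counter test later in this series); VCPU_ID, the arm64 timer
attribute constants, and the usual selftest headers are assumed:

	#include <stdlib.h>
	#include "kvm_util.h"
	#include "test_util.h"

	#define VCPU_ID 0

	/* Sketch: skip unless the attribute exists, then write a 64-bit value. */
	static void write_vtimer_offset(struct kvm_vm *vm, uint64_t offset)
	{
		if (_vcpu_has_device_attr(vm, VCPU_ID, KVM_ARM_VCPU_TIMER_CTRL,
					  KVM_ARM_VCPU_TIMER_OFFSET_VTIMER)) {
			print_skip("KVM_ARM_VCPU_TIMER_OFFSET_VTIMER not supported");
			exit(KSFT_SKIP);
		}

		vcpu_access_device_attr(vm, VCPU_ID, KVM_ARM_VCPU_TIMER_CTRL,
					KVM_ARM_VCPU_TIMER_OFFSET_VTIMER,
					&offset, true);
	}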

Signed-off-by: Oliver Upton 
---
 .../testing/selftests/kvm/include/kvm_util.h  |  9 +
 tools/testing/selftests/kvm/lib/kvm_util.c| 38 +++
 2 files changed, 47 insertions(+)

diff --git a/tools/testing/selftests/kvm/include/kvm_util.h 
b/tools/testing/selftests/kvm/include/kvm_util.h
index a8ac5d52e17b..1b3ef5757819 100644
--- a/tools/testing/selftests/kvm/include/kvm_util.h
+++ b/tools/testing/selftests/kvm/include/kvm_util.h
@@ -240,6 +240,15 @@ int _kvm_device_access(int dev_fd, uint32_t group, 
uint64_t attr,
 int kvm_device_access(int dev_fd, uint32_t group, uint64_t attr,
  void *val, bool write);
 
+int _vcpu_has_device_attr(struct kvm_vm *vm, uint32_t vcpuid, uint32_t group,
+ uint64_t attr);
+int vcpu_has_device_attr(struct kvm_vm *vm, uint32_t vcpuid, uint32_t group,
+uint64_t attr);
+int _vcpu_access_device_attr(struct kvm_vm *vm, uint32_t vcpuid, uint32_t 
group,
+ uint64_t attr, void *val, bool write);
+int vcpu_access_device_attr(struct kvm_vm *vm, uint32_t vcpuid, uint32_t group,
+uint64_t attr, void *val, bool write);
+
 const char *exit_reason_str(unsigned int exit_reason);
 
 void virt_pgd_alloc(struct kvm_vm *vm);
diff --git a/tools/testing/selftests/kvm/lib/kvm_util.c 
b/tools/testing/selftests/kvm/lib/kvm_util.c
index 10a8ed691c66..b595e7dc3fc5 100644
--- a/tools/testing/selftests/kvm/lib/kvm_util.c
+++ b/tools/testing/selftests/kvm/lib/kvm_util.c
@@ -2040,6 +2040,44 @@ int kvm_device_access(int dev_fd, uint32_t group, 
uint64_t attr,
return ret;
 }
 
+int _vcpu_has_device_attr(struct kvm_vm *vm, uint32_t vcpuid, uint32_t group,
+ uint64_t attr)
+{
+   struct vcpu *vcpu = vcpu_find(vm, vcpuid);
+
+   TEST_ASSERT(vcpu, "nonexistent vcpu id: %d", vcpuid);
+
+   return _kvm_device_check_attr(vcpu->fd, group, attr);
+}
+
+int vcpu_has_device_attr(struct kvm_vm *vm, uint32_t vcpuid, uint32_t group,
+uint64_t attr)
+{
+   int ret = _vcpu_has_device_attr(vm, vcpuid, group, attr);
+
+   TEST_ASSERT(!ret, "KVM_HAS_DEVICE_ATTR IOCTL failed, rc: %i errno: %i", 
ret, errno);
+   return ret;
+}
+
+int _vcpu_access_device_attr(struct kvm_vm *vm, uint32_t vcpuid, uint32_t 
group,
+uint64_t attr, void *val, bool write)
+{
+   struct vcpu *vcpu = vcpu_find(vm, vcpuid);
+
+   TEST_ASSERT(vcpu, "nonexistent vcpu id: %d", vcpuid);
+
+   return _kvm_device_access(vcpu->fd, group, attr, val, write);
+}
+
+int vcpu_access_device_attr(struct kvm_vm *vm, uint32_t vcpuid, uint32_t group,
+   uint64_t attr, void *val, bool write)
+{
+   int ret = _vcpu_access_device_attr(vm, vcpuid, group, attr, val, write);
+
+   TEST_ASSERT(!ret, "KVM_SET|GET_DEVICE_ATTR IOCTL failed, rc: %i errno: 
%i", ret, errno);
+   return ret;
+}
+
 /*
  * VM Dump
  *
-- 
2.32.0.402.g57bb445576-goog



[PATCH v3 04/12] tools: arch: x86: pull in pvclock headers

2021-07-19 Thread Oliver Upton
Copy over approximately clean versions of the pvclock headers into
tools. Reconcile headers/symbols missing in tools that are unneeded.
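
A brief usage sketch of how the pulled-in helpers compose, along the
lines of what the kvm_clock selftest later in this series does; rdtsc()
is assumed to be provided by the selftest's processor.h:

	#include <stdint.h>
	#include <asm/pvclock.h>

	/*
	 * Sketch: read guest time from a pvclock_vcpu_time_info structure,
	 * retrying if the hypervisor updated it mid-read (version protocol).
	 */
	static uint64_t read_pvclock_ns(const struct pvclock_vcpu_time_info *pvti)
	{
		uint64_t cycles;
		unsigned int version;

		do {
			version = pvclock_read_begin(pvti);
			cycles = __pvclock_read_cycles(pvti, rdtsc());
		} while (pvclock_read_retry(pvti, version));

		return cycles;
	}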

Signed-off-by: Oliver Upton 
---
 tools/arch/x86/include/asm/pvclock-abi.h |  48 +++
 tools/arch/x86/include/asm/pvclock.h | 103 +++
 2 files changed, 151 insertions(+)
 create mode 100644 tools/arch/x86/include/asm/pvclock-abi.h
 create mode 100644 tools/arch/x86/include/asm/pvclock.h

diff --git a/tools/arch/x86/include/asm/pvclock-abi.h 
b/tools/arch/x86/include/asm/pvclock-abi.h
new file mode 100644
index ..1436226efe3e
--- /dev/null
+++ b/tools/arch/x86/include/asm/pvclock-abi.h
@@ -0,0 +1,48 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_X86_PVCLOCK_ABI_H
+#define _ASM_X86_PVCLOCK_ABI_H
+#ifndef __ASSEMBLY__
+
+/*
+ * These structs MUST NOT be changed.
+ * They are the ABI between hypervisor and guest OS.
+ * Both Xen and KVM are using this.
+ *
+ * pvclock_vcpu_time_info holds the system time and the tsc timestamp
+ * of the last update. So the guest can use the tsc delta to get a
+ * more precise system time.  There is one per virtual cpu.
+ *
+ * pvclock_wall_clock references the point in time when the system
+ * time was zero (usually boot time), thus the guest calculates the
+ * current wall clock by adding the system time.
+ *
+ * Protocol for the "version" fields is: hypervisor raises it (making
+ * it uneven) before it starts updating the fields and raises it again
+ * (making it even) when it is done.  Thus the guest can make sure the
+ * time values it got are consistent by checking the version before
+ * and after reading them.
+ */
+
+struct pvclock_vcpu_time_info {
+   u32   version;
+   u32   pad0;
+   u64   tsc_timestamp;
+   u64   system_time;
+   u32   tsc_to_system_mul;
+   s8tsc_shift;
+   u8flags;
+   u8pad[2];
+} __attribute__((__packed__)); /* 32 bytes */
+
+struct pvclock_wall_clock {
+   u32   version;
+   u32   sec;
+   u32   nsec;
+} __attribute__((__packed__));
+
+#define PVCLOCK_TSC_STABLE_BIT (1 << 0)
+#define PVCLOCK_GUEST_STOPPED  (1 << 1)
+/* PVCLOCK_COUNTS_FROM_ZERO broke ABI and can't be used anymore. */
+#define PVCLOCK_COUNTS_FROM_ZERO (1 << 2)
+#endif /* __ASSEMBLY__ */
+#endif /* _ASM_X86_PVCLOCK_ABI_H */
diff --git a/tools/arch/x86/include/asm/pvclock.h 
b/tools/arch/x86/include/asm/pvclock.h
new file mode 100644
index ..2628f9a6330b
--- /dev/null
+++ b/tools/arch/x86/include/asm/pvclock.h
@@ -0,0 +1,103 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_X86_PVCLOCK_H
+#define _ASM_X86_PVCLOCK_H
+
+#include 
+#include 
+
+/* some helper functions for xen and kvm pv clock sources */
+u64 pvclock_clocksource_read(struct pvclock_vcpu_time_info *src);
+u8 pvclock_read_flags(struct pvclock_vcpu_time_info *src);
+void pvclock_set_flags(u8 flags);
+unsigned long pvclock_tsc_khz(struct pvclock_vcpu_time_info *src);
+void pvclock_resume(void);
+
+void pvclock_touch_watchdogs(void);
+
+static __always_inline
+unsigned pvclock_read_begin(const struct pvclock_vcpu_time_info *src)
+{
+   unsigned version = src->version & ~1;
+   /* Make sure that the version is read before the data. */
+   rmb();
+   return version;
+}
+
+static __always_inline
+bool pvclock_read_retry(const struct pvclock_vcpu_time_info *src,
+   unsigned version)
+{
+   /* Make sure that the version is re-read after the data. */
+   rmb();
+   return version != src->version;
+}
+
+/*
+ * Scale a 64-bit delta by scaling and multiplying by a 32-bit fraction,
+ * yielding a 64-bit result.
+ */
+static inline u64 pvclock_scale_delta(u64 delta, u32 mul_frac, int shift)
+{
+   u64 product;
+#ifdef __i386__
+   u32 tmp1, tmp2;
+#else
+   unsigned long tmp;
+#endif
+
+   if (shift < 0)
+   delta >>= -shift;
+   else
+   delta <<= shift;
+
+#ifdef __i386__
+   __asm__ (
+   "mul  %5   ; "
+   "mov  %4,%%eax ; "
+   "mov  %%edx,%4 ; "
+   "mul  %5   ; "
+   "xor  %5,%5; "
+   "add  %4,%%eax ; "
+   "adc  %5,%%edx ; "
+   : "=A" (product), "=r" (tmp1), "=r" (tmp2)
+   : "a" ((u32)delta), "1" ((u32)(delta >> 32)), "2" (mul_frac) );
+#elif defined(__x86_64__)
+   __asm__ (
+   "mulq %[mul_frac] ; shrd $32, %[hi], %[lo]"
+   : [lo]"=a"(product),
+ [hi]"=d"(tmp)
+   : "0"(delta),
+ [mul_frac]"rm"((u64)mul_frac));
+#else
+#error implement me!
+#endif
+
+   return product;
+}
+
+static __always_inline
+u64 __pvclock_read_cycles(const struct pvclock_vcpu_time_info *src, u64 tsc)
+{
+   u64 delta = tsc - src->tsc_timestamp;
+   u64 offset = pvclock_scale_delta(delta, src->tsc_to_system_mul,
+src->tsc_shift);
+   return 

[PATCH v3 03/12] KVM: x86: Expose TSC offset controls to userspace

2021-07-19 Thread Oliver Upton
To date, VMM-directed TSC synchronization and migration has been a bit
messy. KVM has some baked-in heuristics around TSC writes to infer if
the VMM is attempting to synchronize. This is problematic, as it depends
on host userspace writing to the guest's TSC within 1 second of the last
write.

A much cleaner approach to configuring the guest's views of the TSC is to
simply migrate the TSC offset for every vCPU. Offsets are idempotent,
and thus not subject to change depending on when the VMM actually
reads/writes values from/to KVM. The VMM can then read the TSC once with
KVM_GET_CLOCK to capture a (realtime, host_tsc) pair at the instant when
the guest is paused.
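
As a concrete sketch of the destination-side arithmetic spelled out in
the documentation added by this patch
(new_off_n = t_0 + off_n + (k_1 - k_0) * freq - t_1), with the unit
conversion made explicit (kvmclock deltas in nanoseconds, frequency from
KVM_GET_TSC_KHZ in kHz); illustrative only:

	#include <stdint.h>

	/*
	 * Sketch: recompute a vCPU's TSC offset on the destination, accounting
	 * for elapsed kvmclock time and the difference between the source and
	 * destination host TSCs.
	 */
	static uint64_t new_tsc_offset(uint64_t t0, uint64_t off_n, uint64_t k0,
				       uint64_t k1, uint64_t t1, uint64_t tsc_khz)
	{
		uint64_t elapsed_ticks = (k1 - k0) * tsc_khz / 1000000ULL;

		return t0 + off_n + elapsed_ticks - t1;
	}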

Cc: David Matlack 
Signed-off-by: Oliver Upton 
---
 Documentation/virt/kvm/devices/vcpu.rst |  57 
 arch/x86/include/asm/kvm_host.h |   1 +
 arch/x86/include/uapi/asm/kvm.h |   4 +
 arch/x86/kvm/x86.c  | 167 
 4 files changed, 229 insertions(+)

diff --git a/Documentation/virt/kvm/devices/vcpu.rst 
b/Documentation/virt/kvm/devices/vcpu.rst
index 2acec3b9ef65..b46d5f742e69 100644
--- a/Documentation/virt/kvm/devices/vcpu.rst
+++ b/Documentation/virt/kvm/devices/vcpu.rst
@@ -161,3 +161,60 @@ Specifies the base address of the stolen time structure 
for this VCPU. The
 base address must be 64 byte aligned and exist within a valid guest memory
 region. See Documentation/virt/kvm/arm/pvtime.rst for more information
 including the layout of the stolen time structure.
+
+4. GROUP: KVM_VCPU_TSC_CTRL
+===
+
+:Architectures: x86
+
+4.1 ATTRIBUTE: KVM_VCPU_TSC_OFFSET
+
+:Parameters: 64-bit unsigned TSC offset
+
+Returns:
+
+=== ==
+-EFAULT Error reading/writing the provided
+parameter address.
+-ENXIO  Attribute not supported
+=== ==
+
+Specifies the guest's TSC offset relative to the host's TSC. The guest's
+TSC is then derived by the following equation:
+
+  guest_tsc = host_tsc + KVM_VCPU_TSC_OFFSET
+
+This attribute is useful for the precise migration of a guest's TSC. The
+following describes a possible algorithm to use for the migration of a
+guest's TSC:
+
+From the source VMM process:
+
+1. Invoke the KVM_GET_CLOCK ioctl to record the host TSC (t_0),
+   kvmclock nanoseconds (k_0), and realtime nanoseconds (r_0).
+
+2. Read the KVM_VCPU_TSC_OFFSET attribute for every vCPU to record the
+   guest TSC offset (off_n).
+
+3. Invoke the KVM_GET_TSC_KHZ ioctl to record the frequency of the
+   guest's TSC (freq).
+
+From the destination VMM process:
+
+4. Invoke the KVM_SET_CLOCK ioctl, providing the kvmclock nanoseconds
+   (k_0) and realtime nanoseconds (r_0) in their respective fields.
+   Ensure that the KVM_CLOCK_REALTIME flag is set in the provided
+   structure. KVM will advance the VM's kvmclock to account for elapsed
+   time since recording the clock values.
+
+5. Invoke the KVM_GET_CLOCK ioctl to record the host TSC (t_1) and
+   kvmclock nanoseconds (k_1).
+
+6. Adjust the guest TSC offsets for every vCPU to account for (1) time
+   elapsed since recording state and (2) difference in TSCs between the
+   source and destination machine:
+
+   new_off_n = t_0 + off_n + (k_1 - k_0) * freq - t_1
+
+7. Write the KVM_VCPU_TSC_OFFSET attribute for every vCPU with the
+   respective value derived in the previous step.
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 3fb2b9270d01..5f2e909b83eb 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -1070,6 +1070,7 @@ struct kvm_arch {
u64 last_tsc_nsec;
u64 last_tsc_write;
u32 last_tsc_khz;
+   u64 last_tsc_offset;
u64 cur_tsc_nsec;
u64 cur_tsc_write;
u64 cur_tsc_offset;
diff --git a/arch/x86/include/uapi/asm/kvm.h b/arch/x86/include/uapi/asm/kvm.h
index a6c327f8ad9e..0b22e1e84e78 100644
--- a/arch/x86/include/uapi/asm/kvm.h
+++ b/arch/x86/include/uapi/asm/kvm.h
@@ -503,4 +503,8 @@ struct kvm_pmu_event_filter {
 #define KVM_PMU_EVENT_ALLOW 0
 #define KVM_PMU_EVENT_DENY 1
 
+/* for KVM_{GET,SET,HAS}_DEVICE_ATTR */
+#define KVM_VCPU_TSC_CTRL 0 /* control group for the timestamp counter (TSC) */
+#define   KVM_VCPU_TSC_OFFSET 0 /* attribute for the TSC offset */
+
 #endif /* _ASM_X86_KVM_H */
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 580ba0e86687..9e7867410091 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -2411,6 +2411,11 @@ static void kvm_vcpu_write_tsc_offset(struct kvm_vcpu 
*vcpu, u64 l1_offset)
static_call(kvm_x86_write_tsc_offset)(vcpu, vcpu->arch.tsc_offset);
 }
 
+static u64 kvm_vcpu_read_tsc_offset(struct kvm_vcpu *vcpu)
+{
+   return vcpu->arch.l1_tsc_offset;
+}
+
 static void kvm_vcpu_write_tsc_multiplier(struct kvm_vcpu *vcpu, u64 
l1_multiplier)
 {
vcpu->arch.l1_tsc_scaling_ratio = l1_multiplier;
@@ -2467,6 

[PATCH v3 01/12] KVM: x86: Report host tsc and realtime values in KVM_GET_CLOCK

2021-07-19 Thread Oliver Upton
Handling the migration of TSCs correctly is difficult, in part because
Linux does not provide userspace with the ability to retrieve a (TSC,
realtime) clock pair for a single instant in time. In lieu of a more
convenient facility, KVM can report similar information in the kvm_clock
structure.

Provide userspace with a host TSC & realtime pair iff the realtime clock
is based on the TSC. If userspace provides KVM_SET_CLOCK with a valid
realtime value, advance the KVM clock by the amount of elapsed time. Do
not step the KVM clock backwards, though, as it is a monotonic
oscillator.
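
For illustration, a minimal sketch of how userspace might consume the
extended KVM_GET_CLOCK (assumes a kernel and uapi headers with this
patch applied, and an already-open VM file descriptor):

	#include <sys/ioctl.h>
	#include <linux/kvm.h>

	/*
	 * Sketch: fetch a (kvmclock, realtime, host TSC) triplet. The realtime
	 * and host_tsc fields are only valid when the corresponding flags are
	 * set, i.e. when the host's realtime clocksource is TSC-based.
	 */
	static int get_clock_triplet(int vm_fd, struct kvm_clock_data *data)
	{
		if (ioctl(vm_fd, KVM_GET_CLOCK, data))
			return -1;

		if (!(data->flags & KVM_CLOCK_REALTIME) ||
		    !(data->flags & KVM_CLOCK_HOST_TSC))
			return -1;	/* cannot build a triplet on this host */

		return 0;
	}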

Signed-off-by: Oliver Upton 
---
 Documentation/virt/kvm/api.rst  |  42 +++--
 arch/x86/include/asm/kvm_host.h |   3 +
 arch/x86/kvm/x86.c  | 149 
 include/uapi/linux/kvm.h|   7 +-
 4 files changed, 137 insertions(+), 64 deletions(-)

diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst
index b9ddce5638f5..70ccf68cd574 100644
--- a/Documentation/virt/kvm/api.rst
+++ b/Documentation/virt/kvm/api.rst
@@ -993,20 +993,34 @@ such as migration.
 When KVM_CAP_ADJUST_CLOCK is passed to KVM_CHECK_EXTENSION, it returns the
 set of bits that KVM can return in struct kvm_clock_data's flag member.
 
-The only flag defined now is KVM_CLOCK_TSC_STABLE.  If set, the returned
-value is the exact kvmclock value seen by all VCPUs at the instant
-when KVM_GET_CLOCK was called.  If clear, the returned value is simply
-CLOCK_MONOTONIC plus a constant offset; the offset can be modified
-with KVM_SET_CLOCK.  KVM will try to make all VCPUs follow this clock,
-but the exact value read by each VCPU could differ, because the host
-TSC is not stable.
+FLAGS:
+
+KVM_CLOCK_TSC_STABLE.  If set, the returned value is the exact kvmclock
+value seen by all VCPUs at the instant when KVM_GET_CLOCK was called.
+If clear, the returned value is simply CLOCK_MONOTONIC plus a constant
+offset; the offset can be modified with KVM_SET_CLOCK.  KVM will try
+to make all VCPUs follow this clock, but the exact value read by each
+VCPU could differ, because the host TSC is not stable.
+
+KVM_CLOCK_REALTIME.  If set, the `realtime` field in the kvm_clock_data
+structure is populated with the value of the host's real time
+clocksource at the instant when KVM_GET_CLOCK was called. If clear,
+the `realtime` field does not contain a value.
+
+KVM_CLOCK_HOST_TSC.  If set, the `host_tsc` field in the kvm_clock_data
+structure is populated with the value of the host's timestamp counter (TSC)
+at the instant when KVM_GET_CLOCK was called. If clear, the `host_tsc` field
+does not contain a value.
 
 ::
 
   struct kvm_clock_data {
__u64 clock;  /* kvmclock current value */
__u32 flags;
-   __u32 pad[9];
+   __u32 pad0;
+   __u64 realtime;
+   __u64 host_tsc;
+   __u32 pad[4];
   };
 
 
@@ -1023,12 +1037,22 @@ Sets the current timestamp of kvmclock to the value 
specified in its parameter.
 In conjunction with KVM_GET_CLOCK, it is used to ensure monotonicity on 
scenarios
 such as migration.
 
+FLAGS:
+
+KVM_CLOCK_REALTIME.  If set, KVM will compare the value of the `realtime` field
+with the value of the host's real time clocksource at the instant when
+KVM_SET_CLOCK was called. The difference in elapsed time is added to the final
+kvmclock value that will be provided to guests.
+
 ::
 
   struct kvm_clock_data {
__u64 clock;  /* kvmclock current value */
__u32 flags;
-   __u32 pad[9];
+   __u32 pad0;
+   __u64 realtime;
+   __u64 host_tsc;
+   __u32 pad[4];
   };
 
 
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 974cbfb1eefe..3fb2b9270d01 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -1936,4 +1936,7 @@ int kvm_cpu_dirty_log_size(void);
 
 int alloc_all_memslots_rmaps(struct kvm *kvm);
 
+#define KVM_CLOCK_VALID_FLAGS  \
+   (KVM_CLOCK_TSC_STABLE | KVM_CLOCK_REALTIME | KVM_CLOCK_HOST_TSC)
+
 #endif /* _ASM_X86_KVM_HOST_H */
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index c471b1375f36..bff78168d4a2 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -2780,17 +2780,24 @@ static void kvm_gen_update_masterclock(struct kvm *kvm)
 #endif
 }
 
-u64 get_kvmclock_ns(struct kvm *kvm)
+/*
+ * Returns true if realtime and TSC values were written back to the caller.
+ * Returns false if a clock triplet cannot be obtained, such as if the host's
+ * realtime clock is not based on the TSC.
+ */
+static bool get_kvmclock_and_realtime(struct kvm *kvm, u64 *kvmclock_ns,
+ u64 *realtime_ns, u64 *tsc)
 {
struct kvm_arch *ka = &kvm->arch;
struct pvclock_vcpu_time_info hv_clock;
unsigned long flags;
-   u64 ret;
+   bool ret = false;
 
spin_lock_irqsave(&ka->pvclock_gtod_sync_lock, flags);
if (!ka->use_master_clock) {

[PATCH v3 00/12] KVM: Add idempotent controls for migrating system counter state

2021-07-19 Thread Oliver Upton
KVM's current means of saving/restoring system counters is plagued with
temporal issues. At least on ARM64 and x86, we migrate the guest's
system counter by-value through the respective guest system register
values (cntvct_el0, ia32_tsc). Restoring system counters by-value is
brittle as the state is not idempotent: the host system counter is still
oscillating between the attempted save and restore. Furthermore, VMMs
may wish to transparently live migrate guest VMs, meaning that they
include the elapsed time due to live migration blackout in the guest
system counter view. The VMM thread could be preempted for any number of
reasons (scheduler, L0 hypervisor under nested) between the time that
it calculates the desired guest counter value and when KVM actually sets
this counter state.

Despite the value-based interface that we present to userspace, KVM
actually has idempotent guest controls by way of system counter offsets.
We can avoid all of the issues associated with a value-based interface
by abstracting these offset controls in new ioctls. This series
introduces new vCPU device attributes to provide userspace access to the
vCPU's system counter offset.

Patch 1 adopts Paolo's suggestion, augmenting the KVM_{GET,SET}_CLOCK
ioctls to provide userspace with a (host_tsc, realtime) instant. This is
essential for a VMM to perform precise migration of the guest's system
counters.
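
For the kvmclock part, the expected usage from the VMM side looks roughly
like the below (a sketch only, based on the documentation added in patch 1;
error handling and includes are omitted, and src_vm_fd/dst_vm_fd are
placeholder VM file descriptors):

  struct kvm_clock_data data = { 0 };

  /* Source: snapshot kvmclock together with the realtime instant. */
  ioctl(src_vm_fd, KVM_GET_CLOCK, &data);
  /* data.flags reports whether the realtime/host_tsc fields are valid. */
  /* ... transfer data.clock and data.realtime to the destination ... */

  /* Destination: let KVM add the wall-clock time elapsed during blackout. */
  data.flags = KVM_CLOCK_REALTIME;
  ioctl(dst_vm_fd, KVM_SET_CLOCK, &data);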

Patches 2-3 add support for x86 by shoehorning the new controls into the
pre-existing synchronization heuristics.

Patches 4-5 implement a test for the new additions to
KVM_{GET,SET}_CLOCK.

Patches 6-7 implement a test for the tsc offset attribute introduced in
patch 3.

Patch 8 adds a device attribute for the arm64 virtual counter-timer
offset.

Patch 9 extends the test from patch 7 to cover the arm64 virtual
counter-timer offset.

Patch 10 adds a device attribute for the arm64 physical counter-timer
offset. Currently, this is implemented as a synthetic register, forcing
the guest to trap to the host and emulating the offset in the fast exit
path. Later down the line we will have hardware with FEAT_ECV, which
allows the hypervisor to perform physical counter-timer offsetting in
hardware (CNTPOFF_EL2).

Patch 11 extends the test from patch 7 to cover the arm64 physical
counter-timer offset.

Patch 12 introduces a benchmark to measure the overhead of emulation in
patch 10.

Physical counter benchmark
--

The following data was collected by running 1 iterations of the
benchmark test from Patch 6 on an Ampere Mt. Jade reference server, a 2S
machine with 2 80-core Ampere Altra SoCs. Measurements were collected
for both VHE and nVHE operation using the `kvm-arm.mode=` command-line
parameter.

nVHE
----

+++-+
|   Metric   | Native | Trapped |
+++-+
| Average| 54ns   | 148ns   |
| Standard Deviation | 124ns  | 122ns   |
| 95th Percentile| 258ns  | 348ns   |
+++-+

VHE
---

+++-+
|   Metric   | Native | Trapped |
+++-+
| Average| 53ns   | 152ns   |
| Standard Deviation | 92ns   | 94ns|
| 95th Percentile| 204ns  | 307ns   |
+++-+

This series applies cleanly to the following commit:

1889228d80fe ("KVM: selftests: smm_test: Test SMM enter from L2")

v1 -> v2:
  - Reimplemented as vCPU device attributes instead of a distinct ioctl.
  - Added the (realtime, host_tsc) instant support to KVM_{GET,SET}_CLOCK
  - Changed the arm64 implementation to broadcast counter
offset values to all vCPUs in a guest. This upholds the
architectural expectations of a consistent counter-timer across CPUs.
  - Fixed a bug with traps in VHE mode. We now configure traps on every
transition into a guest to handle differing VMs (trapped, emulated).

v2 -> v3:
  - Added documentation for additions to KVM_{GET,SET}_CLOCK
  - Added documentation for all new vCPU attributes
  - Added documentation for suggested algorithm to migrate a guest's
TSC(s)
  - Bug fixes throughout series
  - Rename KVM_CLOCK_REAL_TIME -> KVM_CLOCK_REALTIME

v1: https://lore.kernel.org/kvm/20210608214742.1897483-1-oup...@google.com/
v2: https://lore.kernel.org/r/20210716212629.2232756-1-oup...@google.com

Oliver Upton (12):
  KVM: x86: Report host tsc and realtime values in KVM_GET_CLOCK
  KVM: x86: Refactor tsc synchronization code
  KVM: x86: Expose TSC offset controls to userspace
  tools: arch: x86: pull in pvclock headers
  selftests: KVM: Add test for KVM_{GET,SET}_CLOCK
  selftests: KVM: Add helpers for vCPU device attributes
  selftests: KVM: Introduce system counter offset test
  KVM: arm64: Allow userspace to configure a vCPU's virtual offset
  selftests: KVM: Add support for aarch64 to system_counter_offset_test
  KVM: arm64: Provide userspace access to the physical counter 

Re: [PATCH 02/16] KVM: arm64: Don't issue CMOs when the physical address is invalid

2021-07-19 Thread Quentin Perret
On Thursday 15 Jul 2021 at 17:31:45 (+0100), Marc Zyngier wrote:
> Make sure we don't issue CMOs when mapping something that
> is not a memory address in the S2 page tables.
> 
> Signed-off-by: Marc Zyngier 
> ---
>  arch/arm64/kvm/hyp/pgtable.c | 16 ++--
>  1 file changed, 10 insertions(+), 6 deletions(-)
> 
> diff --git a/arch/arm64/kvm/hyp/pgtable.c b/arch/arm64/kvm/hyp/pgtable.c
> index 05321f4165e3..a5874ebd0354 100644
> --- a/arch/arm64/kvm/hyp/pgtable.c
> +++ b/arch/arm64/kvm/hyp/pgtable.c
> @@ -619,12 +619,16 @@ static int stage2_map_walker_try_leaf(u64 addr, u64 
> end, u32 level,
>   }
>  
>   /* Perform CMOs before installation of the guest stage-2 PTE */
> - if (mm_ops->dcache_clean_inval_poc && stage2_pte_cacheable(pgt, new))
> - mm_ops->dcache_clean_inval_poc(kvm_pte_follow(new, mm_ops),
> - granule);
> -
> - if (mm_ops->icache_inval_pou && stage2_pte_executable(new))
> - mm_ops->icache_inval_pou(kvm_pte_follow(new, mm_ops), granule);
> + if (kvm_phys_is_valid(phys)) {
> + if (mm_ops->dcache_clean_inval_poc &&
> + stage2_pte_cacheable(pgt, new))
> + mm_ops->dcache_clean_inval_poc(kvm_pte_follow(new,
> +   mm_ops),
> +granule);
> + if (mm_ops->icache_inval_pou && stage2_pte_executable(new))
> + mm_ops->icache_inval_pou(kvm_pte_follow(new, mm_ops),
> +  granule);
> + }

Hrmpf so this makes me realize we have a problem here, not really caused
by your patch though.

Specifically, calling kvm_pgtable_stage2_set_owner() can lead to
overriding valid mappings with invalid mappings, which is effectively an
unmap operation. In this case we should issue CMOs when unmapping a
cacheable page to ensure it is clean to the PoC, like the
kvm_pgtable_stage2_unmap() does.
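
Something like the below in the map walker would do, I think -- that is,
clean the old mapping before it is torn down (just a sketch, assuming
'old', 'granule', 'pgt' and 'mm_ops' are the usual locals of
stage2_map_walker_try_leaf()):

	/* Clean the cacheable page that is about to be unmapped. */
	if (kvm_pte_valid(old) && stage2_pte_cacheable(pgt, old) &&
	    mm_ops->dcache_clean_inval_poc)
		mm_ops->dcache_clean_inval_poc(kvm_pte_follow(old, mm_ops),
					       granule);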

Note that your patch is already an improvement over the current state of
things, because calling stage2_pte_cacheable(pgt, new),
kvm_pte_follow(new, mm_ops) and friends is bogus when 'new' is invalid
...

Thanks,
Quentin
___
kvmarm mailing list
kvmarm@lists.cs.columbia.edu
https://lists.cs.columbia.edu/mailman/listinfo/kvmarm


[PATCH v3 11/15] KVM: arm64: Add trap handlers for protected VMs

2021-07-19 Thread Fuad Tabba
Add trap handlers for protected VMs. These are mainly for Sys64
and debug traps.

No functional change intended, as these are not yet hooked in to
the guest exit handlers introduced earlier. So even when trapping
is triggered, the exit handlers would let the host handle it, as
before.

Signed-off-by: Fuad Tabba 
---
 arch/arm64/include/asm/kvm_fixed_config.h | 178 +
 arch/arm64/include/asm/kvm_host.h |   2 +
 arch/arm64/include/asm/kvm_hyp.h  |   3 +
 arch/arm64/kvm/Makefile   |   2 +-
 arch/arm64/kvm/arm.c  |  11 +
 arch/arm64/kvm/hyp/nvhe/Makefile  |   2 +-
 arch/arm64/kvm/hyp/nvhe/sys_regs.c| 443 ++
 arch/arm64/kvm/pkvm.c | 183 +
 8 files changed, 822 insertions(+), 2 deletions(-)
 create mode 100644 arch/arm64/include/asm/kvm_fixed_config.h
 create mode 100644 arch/arm64/kvm/hyp/nvhe/sys_regs.c
 create mode 100644 arch/arm64/kvm/pkvm.c

diff --git a/arch/arm64/include/asm/kvm_fixed_config.h 
b/arch/arm64/include/asm/kvm_fixed_config.h
new file mode 100644
index ..b39a5de2c4b9
--- /dev/null
+++ b/arch/arm64/include/asm/kvm_fixed_config.h
@@ -0,0 +1,178 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright (C) 2021 Google LLC
+ * Author: Fuad Tabba 
+ */
+
+#ifndef __ARM64_KVM_FIXED_CONFIG_H__
+#define __ARM64_KVM_FIXED_CONFIG_H__
+
+#include 
+
+/*
+ * This file contains definitions for features to be allowed or restricted for
+ * guest virtual machines as a baseline, depending on what mode KVM is running
+ * in and on the type of guest that is running.
+ *
+ * The features are represented as the highest allowed value for a feature in
+ * the feature id registers. If the field is set to all ones (i.e., 0b1111),
+ * then it's only restricted by what the system allows. If the feature is set 
to
+ * another value, then that value would be the maximum value allowed and
+ * supported in pKVM, even if the system supports a higher value.
+ *
+ * Some features are forced to a certain value, in which case a SET bitmap is
+ * used to force these values.
+ */
+
+
+/*
+ * Allowed features for protected guests (Protected KVM)
+ *
+ * The approach taken here is to allow features that are:
+ * - needed by common Linux distributions (e.g., floating point)
+ * - are trivial, e.g., supporting the feature doesn't introduce or require the
+ * tracking of additional state
+ * - not trappable
+ */
+
+/*
+ * - Floating-point and Advanced SIMD:
+ * Don't require much support other than maintaining the context, which KVM
+ * already has.
+ * - AArch64 guests only (no support for AArch32 guests):
+ * Simplify support in case of asymmetric AArch32 systems.
+ * - RAS (v1)
+ * v1 doesn't require much additional support, but later versions do.
+ * - Data Independent Timing
+ * Trivial
+ * Remaining features are not supported either because they require too much
+ * support from KVM, or risk leaking guest data.
+ */
+#define PVM_ID_AA64PFR0_ALLOW (\
+   FEATURE(ID_AA64PFR0_FP) | \
+   FIELD_PREP(FEATURE(ID_AA64PFR0_EL0), ID_AA64PFR0_ELx_64BIT_ONLY) | \
+   FIELD_PREP(FEATURE(ID_AA64PFR0_EL1), ID_AA64PFR0_ELx_64BIT_ONLY) | \
+   FIELD_PREP(FEATURE(ID_AA64PFR0_EL2), ID_AA64PFR0_ELx_64BIT_ONLY) | \
+   FIELD_PREP(FEATURE(ID_AA64PFR0_EL3), ID_AA64PFR0_ELx_64BIT_ONLY) | \
+   FIELD_PREP(FEATURE(ID_AA64PFR0_RAS), ID_AA64PFR0_RAS_V1) | \
+   FEATURE(ID_AA64PFR0_ASIMD) | \
+   FEATURE(ID_AA64PFR0_DIT) \
+   )
+
+/*
+ * - Branch Target Identification
+ * - Speculative Store Bypassing
+ * These features are trivial to support
+ */
+#define PVM_ID_AA64PFR1_ALLOW (\
+   FEATURE(ID_AA64PFR1_BT) | \
+   FEATURE(ID_AA64PFR1_SSBS) \
+   )
+
+/*
+ * No support for Scalable Vectors:
+ * Requires additional support from KVM
+ */
+#define PVM_ID_AA64ZFR0_ALLOW (0ULL)
+
+/*
+ * No support for debug, including breakpoints, and watchpoints:
+ * Reduce complexity and avoid exposing/leaking guest data
+ *
+ * NOTE: The Arm architecture mandates support for at least the Armv8 debug
+ * architecture, which would include at least 2 hardware breakpoints and
+ * watchpoints. Providing that support to protected guests adds considerable
+ * state and complexity, and risks leaking guest data. Therefore, the reserved
+ * value of 0 is used for debug-related fields.
+ */
+#define PVM_ID_AA64DFR0_ALLOW (0ULL)
+
+/*
+ * These features are chosen because they are supported by KVM and to limit the
+ * configuration state space and make it more deterministic.
+ * - 40-bit IPA
+ * - 16-bit ASID
+ * - Mixed-endian
+ * - Distinction between Secure and Non-secure Memory
+ * - Mixed-endian at EL0 only
+ * - Non-context synchronizing exception entry and exit
+ */
+#define PVM_ID_AA64MMFR0_ALLOW (\
+   FIELD_PREP(FEATURE(ID_AA64MMFR0_PARANGE), ID_AA64MMFR0_PARANGE_40) | \
+   FIELD_PREP(FEATURE(ID_AA64MMFR0_ASID), ID_AA64MMFR0_ASID_16) | \
+   

[PATCH v3 14/15] KVM: arm64: Handle protected guests at 32 bits

2021-07-19 Thread Fuad Tabba
Protected KVM does not support protected AArch32 guests. However,
it is possible for the guest to force run AArch32, potentially
causing problems. Add an extra check so that if the hypervisor
catches the guest doing that, it can prevent the guest from
running again by resetting vcpu->arch.target and returning
ARM_EXCEPTION_IL.

Adapted from commit 22f553842b14 ("KVM: arm64: Handle Asymmetric
AArch32 systems")

Signed-off-by: Fuad Tabba 
---
 arch/arm64/kvm/hyp/include/hyp/switch.h | 24 
 1 file changed, 24 insertions(+)

diff --git a/arch/arm64/kvm/hyp/include/hyp/switch.h 
b/arch/arm64/kvm/hyp/include/hyp/switch.h
index 8431f1514280..f09343e15a80 100644
--- a/arch/arm64/kvm/hyp/include/hyp/switch.h
+++ b/arch/arm64/kvm/hyp/include/hyp/switch.h
@@ -23,6 +23,7 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 #include 
 #include 
@@ -477,6 +478,29 @@ static inline bool fixup_guest_exit(struct kvm_vcpu *vcpu, 
u64 *exit_code)
write_sysreg_el2(read_sysreg_el2(SYS_ELR) - 4, SYS_ELR);
}
 
+   /*
+* Protected VMs might not be allowed to run in AArch32. The check below
+* is based on the one in kvm_arch_vcpu_ioctl_run().
+* The ARMv8 architecture doesn't give the hypervisor a mechanism to
+* prevent a guest from dropping to AArch32 EL0 if implemented by the
+* CPU. If the hypervisor spots a guest in such a state ensure it is
+* handled, and don't trust the host to spot or fix it.
+*/
+   if (unlikely(is_nvhe_hyp_code() &&
+kvm_vm_is_protected(kern_hyp_va(vcpu->kvm)) &&
+FIELD_GET(FEATURE(ID_AA64PFR0_EL0),
+  PVM_ID_AA64PFR0_ALLOW) <
+ID_AA64PFR0_ELx_32BIT_64BIT &&
+vcpu_mode_is_32bit(vcpu))) {
+   /*
+* As we have caught the guest red-handed, decide that it isn't
+* fit for purpose anymore by making the vcpu invalid.
+*/
+   vcpu->arch.target = -1;
+   *exit_code = ARM_EXCEPTION_IL;
+   goto exit;
+   }
+
/*
 * We're using the raw exception code in order to only process
 * the trap if no SError is pending. We will come back to the
-- 
2.32.0.402.g57bb445576-goog

___
kvmarm mailing list
kvmarm@lists.cs.columbia.edu
https://lists.cs.columbia.edu/mailman/listinfo/kvmarm


[PATCH v3 13/15] KVM: arm64: Trap access to pVM restricted features

2021-07-19 Thread Fuad Tabba
Trap accesses to restricted features for VMs running in protected
mode.

Access to feature registers are emulated, and only supported
features are exposed to protected VMs.

Accesses to restricted registers as well as restricted
instructions are trapped, and an undefined exception is injected
into the protected guests, i.e., with EC = 0x0 (unknown reason).
This EC is the one used, according to the Arm Architecture
Reference Manual, for unallocated or undefined system registers
or instructions.

Only affects the functionality of protected VMs. Otherwise,
should not affect non-protected VMs when KVM is running in
protected mode.

Signed-off-by: Fuad Tabba 
---
 arch/arm64/kvm/hyp/include/hyp/switch.h |  3 ++
 arch/arm64/kvm/hyp/nvhe/switch.c| 52 ++---
 2 files changed, 41 insertions(+), 14 deletions(-)

diff --git a/arch/arm64/kvm/hyp/include/hyp/switch.h 
b/arch/arm64/kvm/hyp/include/hyp/switch.h
index 5a2b89b96c67..8431f1514280 100644
--- a/arch/arm64/kvm/hyp/include/hyp/switch.h
+++ b/arch/arm64/kvm/hyp/include/hyp/switch.h
@@ -33,6 +33,9 @@
 extern struct exception_table_entry __start___kvm_ex_table;
 extern struct exception_table_entry __stop___kvm_ex_table;
 
+int kvm_handle_pvm_sys64(struct kvm_vcpu *vcpu);
+int kvm_handle_pvm_restricted(struct kvm_vcpu *vcpu);
+
 /* Check whether the FP regs were dirtied while in the host-side run loop: */
 static inline bool update_fp_enabled(struct kvm_vcpu *vcpu)
 {
diff --git a/arch/arm64/kvm/hyp/nvhe/switch.c b/arch/arm64/kvm/hyp/nvhe/switch.c
index 36da423006bd..99a90094 100644
--- a/arch/arm64/kvm/hyp/nvhe/switch.c
+++ b/arch/arm64/kvm/hyp/nvhe/switch.c
@@ -158,30 +158,54 @@ static void __pmu_switch_to_host(struct kvm_cpu_context 
*host_ctxt)
write_sysreg(pmu->events_host, pmcntenset_el0);
 }
 
+/**
+ * Handle system register accesses for protected VMs.
+ *
+ * Return 1 if handled, or 0 if not.
+ */
+static int handle_pvm_sys64(struct kvm_vcpu *vcpu)
+{
+   return kvm_vm_is_protected(kern_hyp_va(vcpu->kvm)) ?
+kvm_handle_pvm_sys64(vcpu) :
+0;
+}
+
+/**
+ * Handle restricted feature accesses for protected VMs.
+ *
+ * Return 1 if handled, or 0 if not.
+ */
+static int handle_pvm_restricted(struct kvm_vcpu *vcpu)
+{
+   return kvm_vm_is_protected(kern_hyp_va(vcpu->kvm)) ?
+kvm_handle_pvm_restricted(vcpu) :
+0;
+}
+
 typedef int (*exit_handle_fn)(struct kvm_vcpu *);
 
 static exit_handle_fn hyp_exit_handlers[] = {
-   [0 ... ESR_ELx_EC_MAX]  = NULL,
+   [0 ... ESR_ELx_EC_MAX]  = handle_pvm_restricted,
[ESR_ELx_EC_WFx]= NULL,
-   [ESR_ELx_EC_CP15_32]= NULL,
-   [ESR_ELx_EC_CP15_64]= NULL,
-   [ESR_ELx_EC_CP14_MR]= NULL,
-   [ESR_ELx_EC_CP14_LS]= NULL,
-   [ESR_ELx_EC_CP14_64]= NULL,
+   [ESR_ELx_EC_CP15_32]= handle_pvm_restricted,
+   [ESR_ELx_EC_CP15_64]= handle_pvm_restricted,
+   [ESR_ELx_EC_CP14_MR]= handle_pvm_restricted,
+   [ESR_ELx_EC_CP14_LS]= handle_pvm_restricted,
+   [ESR_ELx_EC_CP14_64]= handle_pvm_restricted,
[ESR_ELx_EC_HVC32]  = NULL,
[ESR_ELx_EC_SMC32]  = NULL,
[ESR_ELx_EC_HVC64]  = NULL,
[ESR_ELx_EC_SMC64]  = NULL,
-   [ESR_ELx_EC_SYS64]  = NULL,
-   [ESR_ELx_EC_SVE]= NULL,
+   [ESR_ELx_EC_SYS64]  = handle_pvm_sys64,
+   [ESR_ELx_EC_SVE]= handle_pvm_restricted,
[ESR_ELx_EC_IABT_LOW]   = NULL,
[ESR_ELx_EC_DABT_LOW]   = NULL,
-   [ESR_ELx_EC_SOFTSTP_LOW]= NULL,
-   [ESR_ELx_EC_WATCHPT_LOW]= NULL,
-   [ESR_ELx_EC_BREAKPT_LOW]= NULL,
-   [ESR_ELx_EC_BKPT32] = NULL,
-   [ESR_ELx_EC_BRK64]  = NULL,
-   [ESR_ELx_EC_FP_ASIMD]   = NULL,
+   [ESR_ELx_EC_SOFTSTP_LOW]= handle_pvm_restricted,
+   [ESR_ELx_EC_WATCHPT_LOW]= handle_pvm_restricted,
+   [ESR_ELx_EC_BREAKPT_LOW]= handle_pvm_restricted,
+   [ESR_ELx_EC_BKPT32] = handle_pvm_restricted,
+   [ESR_ELx_EC_BRK64]  = handle_pvm_restricted,
+   [ESR_ELx_EC_FP_ASIMD]   = handle_pvm_restricted,
[ESR_ELx_EC_PAC]= NULL,
 };
 
-- 
2.32.0.402.g57bb445576-goog

___
kvmarm mailing list
kvmarm@lists.cs.columbia.edu
https://lists.cs.columbia.edu/mailman/listinfo/kvmarm


[PATCH v3 08/15] KVM: arm64: Add feature register flag definitions

2021-07-19 Thread Fuad Tabba
Add feature register flag definitions to clarify which features
might be supported.

Consolidate the various ID_AA64PFR0_ELx flags for all ELs.

No functional change intended.

Signed-off-by: Fuad Tabba 
---
 arch/arm64/include/asm/cpufeature.h |  4 ++--
 arch/arm64/include/asm/sysreg.h | 12 
 arch/arm64/kernel/cpufeature.c  |  8 
 3 files changed, 14 insertions(+), 10 deletions(-)

diff --git a/arch/arm64/include/asm/cpufeature.h 
b/arch/arm64/include/asm/cpufeature.h
index 9bb9d11750d7..b7d9bb17908d 100644
--- a/arch/arm64/include/asm/cpufeature.h
+++ b/arch/arm64/include/asm/cpufeature.h
@@ -602,14 +602,14 @@ static inline bool id_aa64pfr0_32bit_el1(u64 pfr0)
 {
u32 val = cpuid_feature_extract_unsigned_field(pfr0, 
ID_AA64PFR0_EL1_SHIFT);
 
-   return val == ID_AA64PFR0_EL1_32BIT_64BIT;
+   return val == ID_AA64PFR0_ELx_32BIT_64BIT;
 }
 
 static inline bool id_aa64pfr0_32bit_el0(u64 pfr0)
 {
u32 val = cpuid_feature_extract_unsigned_field(pfr0, 
ID_AA64PFR0_EL0_SHIFT);
 
-   return val == ID_AA64PFR0_EL0_32BIT_64BIT;
+   return val == ID_AA64PFR0_ELx_32BIT_64BIT;
 }
 
 static inline bool id_aa64pfr0_sve(u64 pfr0)
diff --git a/arch/arm64/include/asm/sysreg.h b/arch/arm64/include/asm/sysreg.h
index 326f49e7bd42..0b773037251c 100644
--- a/arch/arm64/include/asm/sysreg.h
+++ b/arch/arm64/include/asm/sysreg.h
@@ -784,14 +784,13 @@
 #define ID_AA64PFR0_AMU0x1
 #define ID_AA64PFR0_SVE0x1
 #define ID_AA64PFR0_RAS_V1 0x1
+#define ID_AA64PFR0_RAS_ANY0xf
 #define ID_AA64PFR0_FP_NI  0xf
 #define ID_AA64PFR0_FP_SUPPORTED   0x0
 #define ID_AA64PFR0_ASIMD_NI   0xf
 #define ID_AA64PFR0_ASIMD_SUPPORTED0x0
-#define ID_AA64PFR0_EL1_64BIT_ONLY 0x1
-#define ID_AA64PFR0_EL1_32BIT_64BIT0x2
-#define ID_AA64PFR0_EL0_64BIT_ONLY 0x1
-#define ID_AA64PFR0_EL0_32BIT_64BIT0x2
+#define ID_AA64PFR0_ELx_64BIT_ONLY 0x1
+#define ID_AA64PFR0_ELx_32BIT_64BIT0x2
 
 /* id_aa64pfr1 */
 #define ID_AA64PFR1_MPAMFRAC_SHIFT 16
@@ -847,12 +846,16 @@
 #define ID_AA64MMFR0_ASID_SHIFT4
 #define ID_AA64MMFR0_PARANGE_SHIFT 0
 
+#define ID_AA64MMFR0_ASID_80x0
+#define ID_AA64MMFR0_ASID_16   0x2
+
 #define ID_AA64MMFR0_TGRAN4_NI 0xf
 #define ID_AA64MMFR0_TGRAN4_SUPPORTED  0x0
 #define ID_AA64MMFR0_TGRAN64_NI0xf
 #define ID_AA64MMFR0_TGRAN64_SUPPORTED 0x0
 #define ID_AA64MMFR0_TGRAN16_NI0x0
 #define ID_AA64MMFR0_TGRAN16_SUPPORTED 0x1
+#define ID_AA64MMFR0_PARANGE_400x2
 #define ID_AA64MMFR0_PARANGE_480x5
 #define ID_AA64MMFR0_PARANGE_520x6
 
@@ -900,6 +903,7 @@
 #define ID_AA64MMFR2_CNP_SHIFT 0
 
 /* id_aa64dfr0 */
+#define ID_AA64DFR0_MTPMU_SHIFT48
 #define ID_AA64DFR0_TRBE_SHIFT 44
 #define ID_AA64DFR0_TRACE_FILT_SHIFT   40
 #define ID_AA64DFR0_DOUBLELOCK_SHIFT   36
diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c
index 0ead8bfedf20..5b59fe5e26e4 100644
--- a/arch/arm64/kernel/cpufeature.c
+++ b/arch/arm64/kernel/cpufeature.c
@@ -239,8 +239,8 @@ static const struct arm64_ftr_bits ftr_id_aa64pfr0[] = {
S_ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, 
ID_AA64PFR0_FP_SHIFT, 4, ID_AA64PFR0_FP_NI),
ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_LOWER_SAFE, 
ID_AA64PFR0_EL3_SHIFT, 4, 0),
ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_LOWER_SAFE, 
ID_AA64PFR0_EL2_SHIFT, 4, 0),
-   ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_LOWER_SAFE, 
ID_AA64PFR0_EL1_SHIFT, 4, ID_AA64PFR0_EL1_64BIT_ONLY),
-   ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_LOWER_SAFE, 
ID_AA64PFR0_EL0_SHIFT, 4, ID_AA64PFR0_EL0_64BIT_ONLY),
+   ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_LOWER_SAFE, 
ID_AA64PFR0_EL1_SHIFT, 4, ID_AA64PFR0_ELx_64BIT_ONLY),
+   ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_LOWER_SAFE, 
ID_AA64PFR0_EL0_SHIFT, 4, ID_AA64PFR0_ELx_64BIT_ONLY),
ARM64_FTR_END,
 };
 
@@ -1956,7 +1956,7 @@ static const struct arm64_cpu_capabilities 
arm64_features[] = {
.sys_reg = SYS_ID_AA64PFR0_EL1,
.sign = FTR_UNSIGNED,
.field_pos = ID_AA64PFR0_EL0_SHIFT,
-   .min_field_value = ID_AA64PFR0_EL0_32BIT_64BIT,
+   .min_field_value = ID_AA64PFR0_ELx_32BIT_64BIT,
},
 #ifdef CONFIG_KVM
{
@@ -1967,7 +1967,7 @@ static const struct arm64_cpu_capabilities 
arm64_features[] = {
.sys_reg = SYS_ID_AA64PFR0_EL1,
.sign = FTR_UNSIGNED,
.field_pos = ID_AA64PFR0_EL1_SHIFT,
-   .min_field_value = ID_AA64PFR0_EL1_32BIT_64BIT,
+   .min_field_value = ID_AA64PFR0_ELx_32BIT_64BIT,
},
{
.desc = "Protected KVM",
-- 
2.32.0.402.g57bb445576-goog

___

[PATCH v3 03/15] KVM: arm64: MDCR_EL2 is a 64-bit register

2021-07-19 Thread Fuad Tabba
Fix the places in KVM that treat MDCR_EL2 as a 32-bit register.
More recent features (e.g., FEAT_SPEv1p2) use bits above 31.

No functional change intended.

Acked-by: Will Deacon 
Signed-off-by: Fuad Tabba 
---
 arch/arm64/include/asm/kvm_arm.h   | 20 ++--
 arch/arm64/include/asm/kvm_asm.h   |  2 +-
 arch/arm64/include/asm/kvm_host.h  |  2 +-
 arch/arm64/kvm/debug.c |  2 +-
 arch/arm64/kvm/hyp/nvhe/debug-sr.c |  2 +-
 arch/arm64/kvm/hyp/vhe/debug-sr.c  |  2 +-
 6 files changed, 15 insertions(+), 15 deletions(-)

diff --git a/arch/arm64/include/asm/kvm_arm.h b/arch/arm64/include/asm/kvm_arm.h
index d436831dd706..6a523ec83415 100644
--- a/arch/arm64/include/asm/kvm_arm.h
+++ b/arch/arm64/include/asm/kvm_arm.h
@@ -281,18 +281,18 @@
 /* Hyp Debug Configuration Register bits */
 #define MDCR_EL2_E2TB_MASK (UL(0x3))
 #define MDCR_EL2_E2TB_SHIFT(UL(24))
-#define MDCR_EL2_TTRF  (1 << 19)
-#define MDCR_EL2_TPMS  (1 << 14)
+#define MDCR_EL2_TTRF  (UL(1) << 19)
+#define MDCR_EL2_TPMS  (UL(1) << 14)
 #define MDCR_EL2_E2PB_MASK (UL(0x3))
 #define MDCR_EL2_E2PB_SHIFT(UL(12))
-#define MDCR_EL2_TDRA  (1 << 11)
-#define MDCR_EL2_TDOSA (1 << 10)
-#define MDCR_EL2_TDA   (1 << 9)
-#define MDCR_EL2_TDE   (1 << 8)
-#define MDCR_EL2_HPME  (1 << 7)
-#define MDCR_EL2_TPM   (1 << 6)
-#define MDCR_EL2_TPMCR (1 << 5)
-#define MDCR_EL2_HPMN_MASK (0x1F)
+#define MDCR_EL2_TDRA  (UL(1) << 11)
+#define MDCR_EL2_TDOSA (UL(1) << 10)
+#define MDCR_EL2_TDA   (UL(1) << 9)
+#define MDCR_EL2_TDE   (UL(1) << 8)
+#define MDCR_EL2_HPME  (UL(1) << 7)
+#define MDCR_EL2_TPM   (UL(1) << 6)
+#define MDCR_EL2_TPMCR (UL(1) << 5)
+#define MDCR_EL2_HPMN_MASK (UL(0x1F))
 
 /* For compatibility with fault code shared with 32-bit */
 #define FSC_FAULT  ESR_ELx_FSC_FAULT
diff --git a/arch/arm64/include/asm/kvm_asm.h b/arch/arm64/include/asm/kvm_asm.h
index 9f0bf2109be7..63ead9060ab5 100644
--- a/arch/arm64/include/asm/kvm_asm.h
+++ b/arch/arm64/include/asm/kvm_asm.h
@@ -210,7 +210,7 @@ extern u64 __vgic_v3_read_vmcr(void);
 extern void __vgic_v3_write_vmcr(u32 vmcr);
 extern void __vgic_v3_init_lrs(void);
 
-extern u32 __kvm_get_mdcr_el2(void);
+extern u64 __kvm_get_mdcr_el2(void);
 
 #define __KVM_EXTABLE(from, to)
\
"   .pushsection__kvm_ex_table, \"a\"\n"\
diff --git a/arch/arm64/include/asm/kvm_host.h 
b/arch/arm64/include/asm/kvm_host.h
index 347781f99b6a..4d2d974c1522 100644
--- a/arch/arm64/include/asm/kvm_host.h
+++ b/arch/arm64/include/asm/kvm_host.h
@@ -289,7 +289,7 @@ struct kvm_vcpu_arch {
 
/* HYP configuration */
u64 hcr_el2;
-   u32 mdcr_el2;
+   u64 mdcr_el2;
 
/* Exception Information */
struct kvm_vcpu_fault_info fault;
diff --git a/arch/arm64/kvm/debug.c b/arch/arm64/kvm/debug.c
index d5e79d7ee6e9..db9361338b2a 100644
--- a/arch/arm64/kvm/debug.c
+++ b/arch/arm64/kvm/debug.c
@@ -21,7 +21,7 @@
DBG_MDSCR_KDE | \
DBG_MDSCR_MDE)
 
-static DEFINE_PER_CPU(u32, mdcr_el2);
+static DEFINE_PER_CPU(u64, mdcr_el2);
 
 /**
  * save/restore_guest_debug_regs
diff --git a/arch/arm64/kvm/hyp/nvhe/debug-sr.c 
b/arch/arm64/kvm/hyp/nvhe/debug-sr.c
index 7d3f25868cae..df361d839902 100644
--- a/arch/arm64/kvm/hyp/nvhe/debug-sr.c
+++ b/arch/arm64/kvm/hyp/nvhe/debug-sr.c
@@ -109,7 +109,7 @@ void __debug_switch_to_host(struct kvm_vcpu *vcpu)
__debug_switch_to_host_common(vcpu);
 }
 
-u32 __kvm_get_mdcr_el2(void)
+u64 __kvm_get_mdcr_el2(void)
 {
return read_sysreg(mdcr_el2);
 }
diff --git a/arch/arm64/kvm/hyp/vhe/debug-sr.c 
b/arch/arm64/kvm/hyp/vhe/debug-sr.c
index f1e2e5a00933..289689b2682d 100644
--- a/arch/arm64/kvm/hyp/vhe/debug-sr.c
+++ b/arch/arm64/kvm/hyp/vhe/debug-sr.c
@@ -20,7 +20,7 @@ void __debug_switch_to_host(struct kvm_vcpu *vcpu)
__debug_switch_to_host_common(vcpu);
 }
 
-u32 __kvm_get_mdcr_el2(void)
+u64 __kvm_get_mdcr_el2(void)
 {
return read_sysreg(mdcr_el2);
 }
-- 
2.32.0.402.g57bb445576-goog

___
kvmarm mailing list
kvmarm@lists.cs.columbia.edu
https://lists.cs.columbia.edu/mailman/listinfo/kvmarm


[PATCH v3 15/15] KVM: arm64: Restrict protected VM capabilities

2021-07-19 Thread Fuad Tabba
Restrict protected VM capabilities based on the
fixed configuration for protected VMs.

No functional change intended in current KVM-supported modes
(nVHE, VHE).

Signed-off-by: Fuad Tabba 
---
 arch/arm64/include/asm/kvm_fixed_config.h | 10 
 arch/arm64/kvm/arm.c  | 63 ++-
 arch/arm64/kvm/pkvm.c | 30 +++
 3 files changed, 102 insertions(+), 1 deletion(-)

diff --git a/arch/arm64/include/asm/kvm_fixed_config.h 
b/arch/arm64/include/asm/kvm_fixed_config.h
index b39a5de2c4b9..14310b035bf7 100644
--- a/arch/arm64/include/asm/kvm_fixed_config.h
+++ b/arch/arm64/include/asm/kvm_fixed_config.h
@@ -175,4 +175,14 @@
  */
 #define PVM_ID_AA64ISAR1_ALLOW (~0ULL)
 
+/*
+ * Returns the maximum number of breakpoints supported for protected VMs.
+ */
+int kvm_arm_pkvm_get_max_brps(void);
+
+/*
+ * Returns the maximum number of watchpoints supported for protected VMs.
+ */
+int kvm_arm_pkvm_get_max_wrps(void);
+
 #endif /* __ARM64_KVM_FIXED_CONFIG_H__ */
diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c
index 3f28549aff0d..bc41e3b71fab 100644
--- a/arch/arm64/kvm/arm.c
+++ b/arch/arm64/kvm/arm.c
@@ -34,6 +34,7 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 #include 
 #include 
@@ -188,9 +189,10 @@ void kvm_arch_destroy_vm(struct kvm *kvm)
	atomic_set(&kvm->online_vcpus, 0);
 }
 
-int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
+static int kvm_check_extension(struct kvm *kvm, long ext)
 {
int r;
+
switch (ext) {
case KVM_CAP_IRQCHIP:
r = vgic_present;
@@ -281,6 +283,65 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
return r;
 }
 
+static int pkvm_check_extension(struct kvm *kvm, long ext, int kvm_cap)
+{
+   int r;
+
+   switch (ext) {
+   case KVM_CAP_ARM_PSCI:
+   case KVM_CAP_ARM_PSCI_0_2:
+   case KVM_CAP_NR_VCPUS:
+   case KVM_CAP_MAX_VCPUS:
+   case KVM_CAP_MAX_VCPU_ID:
+   r = kvm_cap;
+   break;
+   case KVM_CAP_ARM_EL1_32BIT:
+   r = kvm_cap &&
+   (FIELD_GET(FEATURE(ID_AA64PFR0_EL1), PVM_ID_AA64PFR0_ALLOW) 
>=
+ID_AA64PFR0_ELx_32BIT_64BIT);
+   break;
+   case KVM_CAP_GUEST_DEBUG_HW_BPS:
+   r = min(kvm_cap, kvm_arm_pkvm_get_max_brps());
+   break;
+   case KVM_CAP_GUEST_DEBUG_HW_WPS:
+   r = min(kvm_cap, kvm_arm_pkvm_get_max_wrps());
+   break;
+   case KVM_CAP_ARM_PMU_V3:
+   r = kvm_cap &&
+   FIELD_GET(FEATURE(ID_AA64DFR0_PMUVER), 
PVM_ID_AA64DFR0_ALLOW);
+   break;
+   case KVM_CAP_ARM_SVE:
+   r = kvm_cap &&
+   FIELD_GET(FEATURE(ID_AA64PFR0_SVE), PVM_ID_AA64PFR0_ALLOW);
+   break;
+   case KVM_CAP_ARM_PTRAUTH_ADDRESS:
+   r = kvm_cap &&
+   FIELD_GET(FEATURE(ID_AA64ISAR1_API), 
PVM_ID_AA64ISAR1_ALLOW) &&
+   FIELD_GET(FEATURE(ID_AA64ISAR1_APA), 
PVM_ID_AA64ISAR1_ALLOW);
+   break;
+   case KVM_CAP_ARM_PTRAUTH_GENERIC:
+   r = kvm_cap &&
+   FIELD_GET(FEATURE(ID_AA64ISAR1_GPI), 
PVM_ID_AA64ISAR1_ALLOW) &&
+   FIELD_GET(FEATURE(ID_AA64ISAR1_GPA), 
PVM_ID_AA64ISAR1_ALLOW);
+   break;
+   default:
+   r = 0;
+   break;
+   }
+
+   return r;
+}
+
+int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
+{
+   int r = kvm_check_extension(kvm, ext);
+
+   if (unlikely(kvm && kvm_vm_is_protected(kvm)))
+   r = pkvm_check_extension(kvm, ext, r);
+
+   return r;
+}
+
 long kvm_arch_dev_ioctl(struct file *filp,
unsigned int ioctl, unsigned long arg)
 {
diff --git a/arch/arm64/kvm/pkvm.c b/arch/arm64/kvm/pkvm.c
index b8430b3d97af..d41553594d08 100644
--- a/arch/arm64/kvm/pkvm.c
+++ b/arch/arm64/kvm/pkvm.c
@@ -181,3 +181,33 @@ void kvm_init_protected_traps(struct kvm_vcpu *vcpu)
pvm_init_traps_aa64mmfr0(vcpu);
pvm_init_traps_aa64mmfr1(vcpu);
 }
+
+int kvm_arm_pkvm_get_max_brps(void)
+{
+   int num = FIELD_GET(FEATURE(ID_AA64DFR0_BRPS), PVM_ID_AA64DFR0_ALLOW);
+
+   /*
+* If breakpoints are supported, the maximum number is 1 + the field.
+* Otherwise, return 0, which is not compliant with the architecture,
+* but is reserved and is used here to indicate no debug support.
+*/
+   if (num)
+   return 1 + num;
+   else
+   return 0;
+}
+
+int kvm_arm_pkvm_get_max_wrps(void)
+{
+   int num = FIELD_GET(FEATURE(ID_AA64DFR0_WRPS), PVM_ID_AA64DFR0_ALLOW);
+
+   /*
+* If watchpoints are supported, the maximum number is 1 + the field.
+* Otherwise, return 0, which is not compliant with the architecture,
+* but is reserved and is used here to indicate no debug support.
+*/
+ 

[PATCH v3 12/15] KVM: arm64: Move sanitized copies of CPU features

2021-07-19 Thread Fuad Tabba
Move the sanitized copies of the CPU feature registers to the
recently created sys_regs.c. This consolidates all copies in a
more relevant file.

No functional change intended.

Signed-off-by: Fuad Tabba 
---
 arch/arm64/kvm/hyp/nvhe/mem_protect.c | 6 --
 arch/arm64/kvm/hyp/nvhe/sys_regs.c| 2 ++
 2 files changed, 2 insertions(+), 6 deletions(-)

diff --git a/arch/arm64/kvm/hyp/nvhe/mem_protect.c 
b/arch/arm64/kvm/hyp/nvhe/mem_protect.c
index d938ce95d3bd..925c7db7fa34 100644
--- a/arch/arm64/kvm/hyp/nvhe/mem_protect.c
+++ b/arch/arm64/kvm/hyp/nvhe/mem_protect.c
@@ -25,12 +25,6 @@ struct host_kvm host_kvm;
 
 static struct hyp_pool host_s2_pool;
 
-/*
- * Copies of the host's CPU features registers holding sanitized values.
- */
-u64 id_aa64mmfr0_el1_sys_val;
-u64 id_aa64mmfr1_el1_sys_val;
-
 static const u8 pkvm_hyp_id = 1;
 
 static void *host_s2_zalloc_pages_exact(size_t size)
diff --git a/arch/arm64/kvm/hyp/nvhe/sys_regs.c 
b/arch/arm64/kvm/hyp/nvhe/sys_regs.c
index 6c7230aa70e9..e928567430c1 100644
--- a/arch/arm64/kvm/hyp/nvhe/sys_regs.c
+++ b/arch/arm64/kvm/hyp/nvhe/sys_regs.c
@@ -20,6 +20,8 @@
  */
 u64 id_aa64pfr0_el1_sys_val;
 u64 id_aa64pfr1_el1_sys_val;
+u64 id_aa64mmfr0_el1_sys_val;
+u64 id_aa64mmfr1_el1_sys_val;
 u64 id_aa64mmfr2_el1_sys_val;
 
 /*
-- 
2.32.0.402.g57bb445576-goog

___
kvmarm mailing list
kvmarm@lists.cs.columbia.edu
https://lists.cs.columbia.edu/mailman/listinfo/kvmarm


[PATCH v3 07/15] KVM: arm64: Track value of cptr_el2 in struct kvm_vcpu_arch

2021-07-19 Thread Fuad Tabba
Track the baseline guest value for cptr_el2 in struct
kvm_vcpu_arch, similar to the other registers that control traps.
Use this value when setting cptr_el2 for the guest.

Currently this value is unchanged (CPTR_EL2_DEFAULT), but future
patches will set trapping bits based on features supported for
the guest.
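
For example, a later patch in the series can then do something like the
following when a feature is not exposed to a protected guest (a sketch
using the allow masks introduced later in the series, not the exact code):

	/* Trap SVE if it is not exposed to the protected guest. */
	if (!FIELD_GET(FEATURE(ID_AA64PFR0_SVE), PVM_ID_AA64PFR0_ALLOW))
		vcpu->arch.cptr_el2 |= CPTR_EL2_TZ;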

No functional change intended.

Signed-off-by: Fuad Tabba 
---
 arch/arm64/include/asm/kvm_host.h | 1 +
 arch/arm64/kvm/arm.c  | 1 +
 arch/arm64/kvm/hyp/nvhe/switch.c  | 2 +-
 3 files changed, 3 insertions(+), 1 deletion(-)

diff --git a/arch/arm64/include/asm/kvm_host.h 
b/arch/arm64/include/asm/kvm_host.h
index 76462c6a91ee..ac67d5699c68 100644
--- a/arch/arm64/include/asm/kvm_host.h
+++ b/arch/arm64/include/asm/kvm_host.h
@@ -290,6 +290,7 @@ struct kvm_vcpu_arch {
/* Values of trap registers for the guest. */
u64 hcr_el2;
u64 mdcr_el2;
+   u64 cptr_el2;
 
/* Values of trap registers for the host before guest entry. */
u64 mdcr_el2_host;
diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c
index e9a2b8f27792..14b12f2c08c0 100644
--- a/arch/arm64/kvm/arm.c
+++ b/arch/arm64/kvm/arm.c
@@ -1104,6 +1104,7 @@ static int kvm_arch_vcpu_ioctl_vcpu_init(struct kvm_vcpu 
*vcpu,
}
 
vcpu_reset_hcr(vcpu);
+   vcpu->arch.cptr_el2 = CPTR_EL2_DEFAULT;
 
/*
 * Handle the "start in power-off" case.
diff --git a/arch/arm64/kvm/hyp/nvhe/switch.c b/arch/arm64/kvm/hyp/nvhe/switch.c
index 1778593a08a9..86f3d6482935 100644
--- a/arch/arm64/kvm/hyp/nvhe/switch.c
+++ b/arch/arm64/kvm/hyp/nvhe/switch.c
@@ -41,7 +41,7 @@ static void __activate_traps(struct kvm_vcpu *vcpu)
___activate_traps(vcpu);
__activate_traps_common(vcpu);
 
-   val = CPTR_EL2_DEFAULT;
+   val = vcpu->arch.cptr_el2;
val |= CPTR_EL2_TTA | CPTR_EL2_TAM;
if (!update_fp_enabled(vcpu)) {
val |= CPTR_EL2_TFP | CPTR_EL2_TZ;
-- 
2.32.0.402.g57bb445576-goog

___
kvmarm mailing list
kvmarm@lists.cs.columbia.edu
https://lists.cs.columbia.edu/mailman/listinfo/kvmarm


[PATCH v3 04/15] KVM: arm64: Fix names of config register fields

2021-07-19 Thread Fuad Tabba
Change the names of hcr_el2 register fields to match the Arm
Architecture Reference Manual. Easier for cross-referencing and
for grepping.

Also, change the name of CPTR_EL2_RES1 to CPTR_NVHE_EL2_RES1,
because res1 bits are different for VHE.

No functional change intended.

Acked-by: Will Deacon 
Signed-off-by: Fuad Tabba 
---
 arch/arm64/include/asm/kvm_arm.h | 12 ++--
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/arch/arm64/include/asm/kvm_arm.h b/arch/arm64/include/asm/kvm_arm.h
index 6a523ec83415..a928b2dc0b0f 100644
--- a/arch/arm64/include/asm/kvm_arm.h
+++ b/arch/arm64/include/asm/kvm_arm.h
@@ -32,9 +32,9 @@
 #define HCR_TVM(UL(1) << 26)
 #define HCR_TTLB   (UL(1) << 25)
 #define HCR_TPU(UL(1) << 24)
-#define HCR_TPC(UL(1) << 23)
+#define HCR_TPC(UL(1) << 23) /* HCR_TPCP if FEAT_DPB */
 #define HCR_TSW(UL(1) << 22)
-#define HCR_TAC(UL(1) << 21)
+#define HCR_TACR   (UL(1) << 21)
 #define HCR_TIDCP  (UL(1) << 20)
 #define HCR_TSC(UL(1) << 19)
 #define HCR_TID3   (UL(1) << 18)
@@ -61,7 +61,7 @@
  * The bits we set in HCR:
  * TLOR:   Trap LORegion register accesses
  * RW: 64bit by default, can be overridden for 32bit VMs
- * TAC:Trap ACTLR
+ * TACR:   Trap ACTLR
  * TSC:Trap SMC
  * TSW:Trap cache operations by set/way
  * TWE:Trap WFE
@@ -76,7 +76,7 @@
  * PTW:Take a stage2 fault if a stage1 walk steps in device 
memory
  */
 #define HCR_GUEST_FLAGS (HCR_TSC | HCR_TSW | HCR_TWE | HCR_TWI | HCR_VM | \
-HCR_BSU_IS | HCR_FB | HCR_TAC | \
+HCR_BSU_IS | HCR_FB | HCR_TACR | \
 HCR_AMO | HCR_SWIO | HCR_TIDCP | HCR_RW | HCR_TLOR | \
 HCR_FMO | HCR_IMO | HCR_PTW )
 #define HCR_VIRT_EXCP_MASK (HCR_VSE | HCR_VI | HCR_VF)
@@ -275,8 +275,8 @@
 #define CPTR_EL2_TTA   (1 << 20)
 #define CPTR_EL2_TFP   (1 << CPTR_EL2_TFP_SHIFT)
 #define CPTR_EL2_TZ(1 << 8)
-#define CPTR_EL2_RES1  0x32ff /* known RES1 bits in CPTR_EL2 */
-#define CPTR_EL2_DEFAULT   CPTR_EL2_RES1
+#define CPTR_NVHE_EL2_RES1 0x32ff /* known RES1 bits in CPTR_EL2 
(nVHE) */
+#define CPTR_EL2_DEFAULT   CPTR_NVHE_EL2_RES1
 
 /* Hyp Debug Configuration Register bits */
 #define MDCR_EL2_E2TB_MASK (UL(0x3))
-- 
2.32.0.402.g57bb445576-goog

___
kvmarm mailing list
kvmarm@lists.cs.columbia.edu
https://lists.cs.columbia.edu/mailman/listinfo/kvmarm


[PATCH v3 02/15] KVM: arm64: Remove trailing whitespace in comment

2021-07-19 Thread Fuad Tabba
Remove trailing whitespace from comment in trap_dbgauthstatus_el1().

No functional change intended.

Signed-off-by: Fuad Tabba 
---
 arch/arm64/kvm/sys_regs.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c
index f6f126eb6ac1..80a6e41cadad 100644
--- a/arch/arm64/kvm/sys_regs.c
+++ b/arch/arm64/kvm/sys_regs.c
@@ -318,14 +318,14 @@ static bool trap_dbgauthstatus_el1(struct kvm_vcpu *vcpu,
 /*
  * We want to avoid world-switching all the DBG registers all the
  * time:
- * 
+ *
  * - If we've touched any debug register, it is likely that we're
  *   going to touch more of them. It then makes sense to disable the
  *   traps and start doing the save/restore dance
  * - If debug is active (DBG_MDSCR_KDE or DBG_MDSCR_MDE set), it is
  *   then mandatory to save/restore the registers, as the guest
  *   depends on them.
- * 
+ *
  * For this, we use a DIRTY bit, indicating the guest has modified the
  * debug registers, used as follow:
  *
-- 
2.32.0.402.g57bb445576-goog

___
kvmarm mailing list
kvmarm@lists.cs.columbia.edu
https://lists.cs.columbia.edu/mailman/listinfo/kvmarm


[PATCH v3 01/15] KVM: arm64: placeholder to check if VM is protected

2021-07-19 Thread Fuad Tabba
Add a function to check whether a VM is protected (under pKVM).
Since the creation of protected VMs isn't enabled yet, this is a
placeholder that always returns false. The intention is for this
to become a check for protected VMs in the future (see Will's RFC
[*]).

No functional change intended.

Signed-off-by: Fuad Tabba 

[*] https://lore.kernel.org/kvmarm/20210603183347.1695-1-w...@kernel.org/
---
 arch/arm64/include/asm/kvm_host.h | 5 +
 1 file changed, 5 insertions(+)

diff --git a/arch/arm64/include/asm/kvm_host.h 
b/arch/arm64/include/asm/kvm_host.h
index 41911585ae0c..347781f99b6a 100644
--- a/arch/arm64/include/asm/kvm_host.h
+++ b/arch/arm64/include/asm/kvm_host.h
@@ -771,6 +771,11 @@ void kvm_arch_free_vm(struct kvm *kvm);
 
 int kvm_arm_setup_stage2(struct kvm *kvm, unsigned long type);
 
+static inline bool kvm_vm_is_protected(struct kvm *kvm)
+{
+   return false;
+}
+
 int kvm_arm_vcpu_finalize(struct kvm_vcpu *vcpu, int feature);
 bool kvm_arm_vcpu_is_finalized(struct kvm_vcpu *vcpu);
 
-- 
2.32.0.402.g57bb445576-goog

___
kvmarm mailing list
kvmarm@lists.cs.columbia.edu
https://lists.cs.columbia.edu/mailman/listinfo/kvmarm


[PATCH v3 00/15] KVM: arm64: Fixed features for protected VMs

2021-07-19 Thread Fuad Tabba
Hi,

Changes since v2 [1]:
- Both trapping and setting of feature id registers are toggled by an allowed
  features bitmap of the feature id registers (Will)
- Documentation explaining the rationale behind allowed/blocked features (Drew)
- Restrict protected VM features by checking and restricting VM capabilities
- Misc small fixes and tidying up (mostly Will)
- Remove dependency on Will's protected VM user ABI series [2]
- Rebase on 5.14-rc2
- Carried Will's acks

Changes since v1 [3]:
- Restrict protected VM features based on an allowed features rather than
  rejected ones (Drew)
- Add more background describing protected KVM to the cover letter (Alex)

This patch series adds support for restricting CPU features for protected VMs
in KVM (pKVM) [4].

Various VM feature configurations are allowed in KVM/arm64, each requiring
specific handling logic to deal with traps, context-switching and potentially
emulation. Achieving feature parity in pKVM therefore requires either elevating
this logic to EL2 (and substantially increasing the TCB) or continuing to trust
the host handlers at EL1. Since neither of these options is especially
appealing, pKVM instead limits the CPU features exposed to a guest to a fixed
configuration based on the underlying hardware and which can mostly be provided
straightforwardly by EL2.

This series approaches that by restricting CPU features exposed to protected
guests. Features advertised through feature registers are limited, which pKVM
enforces by trapping register accesses and instructions associated with these
features.
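
As I understand it, for each feature register field the net effect is to
clamp the value the guest sees at the maximum permitted by the
corresponding PVM_ID_*_ALLOW mask. A simplified sketch of that idea (not
the exact code in patch 11), using the RAS field of ID_AA64PFR0_EL1 as an
example:

	u64 hw  = FIELD_GET(FEATURE(ID_AA64PFR0_RAS), id_aa64pfr0_el1_sys_val);
	u64 cap = FIELD_GET(FEATURE(ID_AA64PFR0_RAS), PVM_ID_AA64PFR0_ALLOW);
	/* Value of the field as presented to the protected guest. */
	u64 val = FIELD_PREP(FEATURE(ID_AA64PFR0_RAS), hw < cap ? hw : cap);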

This series is based on 5.14-rc2. You can find the applied series here [5].

Cheers,
/fuad

[1] https://lore.kernel.org/kvmarm/20210615133950.693489-1-ta...@google.com/

[2] https://lore.kernel.org/kvmarm/20210603183347.1695-1-w...@kernel.org/

[3] https://lore.kernel.org/kvmarm/20210608141141.997398-1-ta...@google.com/

[4] Once complete, protected KVM adds the ability to create protected VMs.
These protected VMs are protected from the host Linux kernel (and from other
VMs), where the host does not have access to guest memory, even if compromised.
Normal (nVHE) guests can still be created and run in parallel with protected
VMs. Their functionality should not be affected.

For protected VMs, the host should not even have access to a protected guest's
state or anything that would enable it to manipulate it (e.g., vcpu register
context and el2 system registers); only hyp would have that access. If the host
could access that state, then it might be able to get around the protection
provided.  Therefore, anything that is sensitive and that would require such
access needs to happen at hyp, hence the code in nvhe running only at hyp.

For more details about pKVM, please refer to Will's talk at KVM Forum 2020:
https://mirrors.edge.kernel.org/pub/linux/kernel/people/will/slides/kvmforum-2020-edited.pdf
https://www.youtube.com/watch?v=edqJSzsDRxk

[5] 
https://android-kvm.googlesource.com/linux/+/refs/heads/tabba/el2_fixed_feature_v3

Fuad Tabba (15):
  KVM: arm64: placeholder to check if VM is protected
  KVM: arm64: Remove trailing whitespace in comment
  KVM: arm64: MDCR_EL2 is a 64-bit register
  KVM: arm64: Fix names of config register fields
  KVM: arm64: Refactor sys_regs.h,c for nVHE reuse
  KVM: arm64: Restore mdcr_el2 from vcpu
  KVM: arm64: Track value of cptr_el2 in struct kvm_vcpu_arch
  KVM: arm64: Add feature register flag definitions
  KVM: arm64: Add config register bit definitions
  KVM: arm64: Guest exit handlers for nVHE hyp
  KVM: arm64: Add trap handlers for protected VMs
  KVM: arm64: Move sanitized copies of CPU features
  KVM: arm64: Trap access to pVM restricted features
  KVM: arm64: Handle protected guests at 32 bits
  KVM: arm64: Restrict protected VM capabilities

 arch/arm64/include/asm/cpufeature.h   |   4 +-
 arch/arm64/include/asm/kvm_arm.h  |  54 ++-
 arch/arm64/include/asm/kvm_asm.h  |   2 +-
 arch/arm64/include/asm/kvm_fixed_config.h | 188 +
 arch/arm64/include/asm/kvm_host.h |  15 +-
 arch/arm64/include/asm/kvm_hyp.h  |   5 +-
 arch/arm64/include/asm/sysreg.h   |  15 +-
 arch/arm64/kernel/cpufeature.c|   8 +-
 arch/arm64/kvm/Makefile   |   2 +-
 arch/arm64/kvm/arm.c  |  75 +++-
 arch/arm64/kvm/debug.c|   2 +-
 arch/arm64/kvm/hyp/include/hyp/switch.h   |  76 +++-
 arch/arm64/kvm/hyp/nvhe/Makefile  |   2 +-
 arch/arm64/kvm/hyp/nvhe/debug-sr.c|   2 +-
 arch/arm64/kvm/hyp/nvhe/mem_protect.c |   6 -
 arch/arm64/kvm/hyp/nvhe/switch.c  |  72 +++-
 arch/arm64/kvm/hyp/nvhe/sys_regs.c| 445 ++
 arch/arm64/kvm/hyp/vhe/debug-sr.c |   2 +-
 arch/arm64/kvm/hyp/vhe/switch.c   |  12 +-
 arch/arm64/kvm/hyp/vhe/sysreg-sr.c|   2 +-
 arch/arm64/kvm/pkvm.c | 213 +++
 arch/arm64/kvm/sys_regs.c |  34 +-
 

[PATCH v3 10/15] KVM: arm64: Guest exit handlers for nVHE hyp

2021-07-19 Thread Fuad Tabba
Add an array of pointers to handlers for various trap reasons in
nVHE code.

The current code selects how to fix up a guest on exit based on a
series of if/else statements. Future patches will also require
different handling for guest exits. Create an array of handlers
to consolidate them.

No functional change intended as the array isn't populated yet.

Acked-by: Will Deacon 
Signed-off-by: Fuad Tabba 
---
 arch/arm64/kvm/hyp/include/hyp/switch.h | 43 +
 arch/arm64/kvm/hyp/nvhe/switch.c| 35 
 2 files changed, 78 insertions(+)

diff --git a/arch/arm64/kvm/hyp/include/hyp/switch.h 
b/arch/arm64/kvm/hyp/include/hyp/switch.h
index a0e78a6027be..5a2b89b96c67 100644
--- a/arch/arm64/kvm/hyp/include/hyp/switch.h
+++ b/arch/arm64/kvm/hyp/include/hyp/switch.h
@@ -409,6 +409,46 @@ static inline bool __hyp_handle_ptrauth(struct kvm_vcpu 
*vcpu)
return true;
 }
 
+typedef int (*exit_handle_fn)(struct kvm_vcpu *);
+
+exit_handle_fn kvm_get_nvhe_exit_handler(struct kvm_vcpu *vcpu);
+
+static exit_handle_fn kvm_get_hyp_exit_handler(struct kvm_vcpu *vcpu)
+{
+   return is_nvhe_hyp_code() ? kvm_get_nvhe_exit_handler(vcpu) : NULL;
+}
+
+/*
+ * Allow the hypervisor to handle the exit with an exit handler if it has one.
+ *
+ * Returns true if the hypervisor handled the exit, and control should go back
+ * to the guest, or false if it hasn't.
+ */
+static bool kvm_hyp_handle_exit(struct kvm_vcpu *vcpu)
+{
+   bool is_handled = false;
+   exit_handle_fn exit_handler = kvm_get_hyp_exit_handler(vcpu);
+
+   if (exit_handler) {
+   /*
+* There's limited vcpu context here since it's not synced yet.
+* Ensure that relevant vcpu context that might be used by the
+* exit_handler is in sync before it's called and if handled.
+*/
+   *vcpu_pc(vcpu) = read_sysreg_el2(SYS_ELR);
+   *vcpu_cpsr(vcpu) = read_sysreg_el2(SYS_SPSR);
+
+   is_handled = exit_handler(vcpu);
+
+   if (is_handled) {
+   write_sysreg_el2(*vcpu_pc(vcpu), SYS_ELR);
+   write_sysreg_el2(*vcpu_cpsr(vcpu), SYS_SPSR);
+   }
+   }
+
+   return is_handled;
+}
+
 /*
  * Return true when we were able to fixup the guest exit and should return to
  * the guest, false when we should restore the host state and return to the
@@ -496,6 +536,9 @@ static inline bool fixup_guest_exit(struct kvm_vcpu *vcpu, 
u64 *exit_code)
goto guest;
}
 
+   /* Check if there's an exit handler and allow it to handle the exit. */
+   if (kvm_hyp_handle_exit(vcpu))
+   goto guest;
 exit:
/* Return to the host kernel and handle the exit */
return false;
diff --git a/arch/arm64/kvm/hyp/nvhe/switch.c b/arch/arm64/kvm/hyp/nvhe/switch.c
index 86f3d6482935..36da423006bd 100644
--- a/arch/arm64/kvm/hyp/nvhe/switch.c
+++ b/arch/arm64/kvm/hyp/nvhe/switch.c
@@ -158,6 +158,41 @@ static void __pmu_switch_to_host(struct kvm_cpu_context 
*host_ctxt)
write_sysreg(pmu->events_host, pmcntenset_el0);
 }
 
+typedef int (*exit_handle_fn)(struct kvm_vcpu *);
+
+static exit_handle_fn hyp_exit_handlers[] = {
+   [0 ... ESR_ELx_EC_MAX]  = NULL,
+   [ESR_ELx_EC_WFx]= NULL,
+   [ESR_ELx_EC_CP15_32]= NULL,
+   [ESR_ELx_EC_CP15_64]= NULL,
+   [ESR_ELx_EC_CP14_MR]= NULL,
+   [ESR_ELx_EC_CP14_LS]= NULL,
+   [ESR_ELx_EC_CP14_64]= NULL,
+   [ESR_ELx_EC_HVC32]  = NULL,
+   [ESR_ELx_EC_SMC32]  = NULL,
+   [ESR_ELx_EC_HVC64]  = NULL,
+   [ESR_ELx_EC_SMC64]  = NULL,
+   [ESR_ELx_EC_SYS64]  = NULL,
+   [ESR_ELx_EC_SVE]= NULL,
+   [ESR_ELx_EC_IABT_LOW]   = NULL,
+   [ESR_ELx_EC_DABT_LOW]   = NULL,
+   [ESR_ELx_EC_SOFTSTP_LOW]= NULL,
+   [ESR_ELx_EC_WATCHPT_LOW]= NULL,
+   [ESR_ELx_EC_BREAKPT_LOW]= NULL,
+   [ESR_ELx_EC_BKPT32] = NULL,
+   [ESR_ELx_EC_BRK64]  = NULL,
+   [ESR_ELx_EC_FP_ASIMD]   = NULL,
+   [ESR_ELx_EC_PAC]= NULL,
+};
+
+exit_handle_fn kvm_get_nvhe_exit_handler(struct kvm_vcpu *vcpu)
+{
+   u32 esr = kvm_vcpu_get_esr(vcpu);
+   u8 esr_ec = ESR_ELx_EC(esr);
+
+   return hyp_exit_handlers[esr_ec];
+}
+
 /* Switch to the guest for legacy non-VHE systems */
 int __kvm_vcpu_run(struct kvm_vcpu *vcpu)
 {
-- 
2.32.0.402.g57bb445576-goog

___
kvmarm mailing list
kvmarm@lists.cs.columbia.edu
https://lists.cs.columbia.edu/mailman/listinfo/kvmarm


[PATCH v3 06/15] KVM: arm64: Restore mdcr_el2 from vcpu

2021-07-19 Thread Fuad Tabba
On deactivating traps, restore the value of mdcr_el2 from the
newly created and preserved host value in the vcpu context, rather
than directly reading the hardware register.

Up until and including this patch the two values are the same,
i.e., the hardware register and the vcpu one. A future patch will
be changing the value of mdcr_el2 on activating traps, and this
ensures that its value will be restored.

No functional change intended.

Signed-off-by: Fuad Tabba 
---
 arch/arm64/include/asm/kvm_host.h   |  5 -
 arch/arm64/include/asm/kvm_hyp.h|  2 +-
 arch/arm64/kvm/hyp/include/hyp/switch.h |  6 +-
 arch/arm64/kvm/hyp/nvhe/switch.c| 11 ++-
 arch/arm64/kvm/hyp/vhe/switch.c | 12 ++--
 arch/arm64/kvm/hyp/vhe/sysreg-sr.c  |  2 +-
 6 files changed, 15 insertions(+), 23 deletions(-)

diff --git a/arch/arm64/include/asm/kvm_host.h 
b/arch/arm64/include/asm/kvm_host.h
index 4d2d974c1522..76462c6a91ee 100644
--- a/arch/arm64/include/asm/kvm_host.h
+++ b/arch/arm64/include/asm/kvm_host.h
@@ -287,10 +287,13 @@ struct kvm_vcpu_arch {
/* Stage 2 paging state used by the hardware on next switch */
struct kvm_s2_mmu *hw_mmu;
 
-   /* HYP configuration */
+   /* Values of trap registers for the guest. */
u64 hcr_el2;
u64 mdcr_el2;
 
+   /* Values of trap registers for the host before guest entry. */
+   u64 mdcr_el2_host;
+
/* Exception Information */
struct kvm_vcpu_fault_info fault;
 
diff --git a/arch/arm64/include/asm/kvm_hyp.h b/arch/arm64/include/asm/kvm_hyp.h
index 9d60b3006efc..657d0c94cf82 100644
--- a/arch/arm64/include/asm/kvm_hyp.h
+++ b/arch/arm64/include/asm/kvm_hyp.h
@@ -95,7 +95,7 @@ void __sve_restore_state(void *sve_pffr, u32 *fpsr);
 
 #ifndef __KVM_NVHE_HYPERVISOR__
 void activate_traps_vhe_load(struct kvm_vcpu *vcpu);
-void deactivate_traps_vhe_put(void);
+void deactivate_traps_vhe_put(struct kvm_vcpu *vcpu);
 #endif
 
 u64 __guest_enter(struct kvm_vcpu *vcpu);
diff --git a/arch/arm64/kvm/hyp/include/hyp/switch.h 
b/arch/arm64/kvm/hyp/include/hyp/switch.h
index e4a2f295a394..a0e78a6027be 100644
--- a/arch/arm64/kvm/hyp/include/hyp/switch.h
+++ b/arch/arm64/kvm/hyp/include/hyp/switch.h
@@ -92,11 +92,15 @@ static inline void __activate_traps_common(struct kvm_vcpu 
*vcpu)
write_sysreg(0, pmselr_el0);
write_sysreg(ARMV8_PMU_USERENR_MASK, pmuserenr_el0);
}
+
+   vcpu->arch.mdcr_el2_host = read_sysreg(mdcr_el2);
write_sysreg(vcpu->arch.mdcr_el2, mdcr_el2);
 }
 
-static inline void __deactivate_traps_common(void)
+static inline void __deactivate_traps_common(struct kvm_vcpu *vcpu)
 {
+   write_sysreg(vcpu->arch.mdcr_el2_host, mdcr_el2);
+
write_sysreg(0, hstr_el2);
if (kvm_arm_support_pmu_v3())
write_sysreg(0, pmuserenr_el0);
diff --git a/arch/arm64/kvm/hyp/nvhe/switch.c b/arch/arm64/kvm/hyp/nvhe/switch.c
index f7af9688c1f7..1778593a08a9 100644
--- a/arch/arm64/kvm/hyp/nvhe/switch.c
+++ b/arch/arm64/kvm/hyp/nvhe/switch.c
@@ -69,12 +69,10 @@ static void __activate_traps(struct kvm_vcpu *vcpu)
 static void __deactivate_traps(struct kvm_vcpu *vcpu)
 {
extern char __kvm_hyp_host_vector[];
-   u64 mdcr_el2, cptr;
+   u64 cptr;
 
___deactivate_traps(vcpu);
 
-   mdcr_el2 = read_sysreg(mdcr_el2);
-
if (cpus_have_final_cap(ARM64_WORKAROUND_SPECULATIVE_AT)) {
u64 val;
 
@@ -92,13 +90,8 @@ static void __deactivate_traps(struct kvm_vcpu *vcpu)
isb();
}
 
-   __deactivate_traps_common();
-
-   mdcr_el2 &= MDCR_EL2_HPMN_MASK;
-   mdcr_el2 |= MDCR_EL2_E2PB_MASK << MDCR_EL2_E2PB_SHIFT;
-   mdcr_el2 |= MDCR_EL2_E2TB_MASK << MDCR_EL2_E2TB_SHIFT;
+   __deactivate_traps_common(vcpu);
 
-   write_sysreg(mdcr_el2, mdcr_el2);
	write_sysreg(this_cpu_ptr(&kvm_init_params)->hcr_el2, hcr_el2);
 
cptr = CPTR_EL2_DEFAULT;
diff --git a/arch/arm64/kvm/hyp/vhe/switch.c b/arch/arm64/kvm/hyp/vhe/switch.c
index b3229924d243..0d0c9550fb08 100644
--- a/arch/arm64/kvm/hyp/vhe/switch.c
+++ b/arch/arm64/kvm/hyp/vhe/switch.c
@@ -91,17 +91,9 @@ void activate_traps_vhe_load(struct kvm_vcpu *vcpu)
__activate_traps_common(vcpu);
 }
 
-void deactivate_traps_vhe_put(void)
+void deactivate_traps_vhe_put(struct kvm_vcpu *vcpu)
 {
-   u64 mdcr_el2 = read_sysreg(mdcr_el2);
-
-   mdcr_el2 &= MDCR_EL2_HPMN_MASK |
-   MDCR_EL2_E2PB_MASK << MDCR_EL2_E2PB_SHIFT |
-   MDCR_EL2_TPMS;
-
-   write_sysreg(mdcr_el2, mdcr_el2);
-
-   __deactivate_traps_common();
+   __deactivate_traps_common(vcpu);
 }
 
 /* Switch to the guest for VHE systems running in EL2 */
diff --git a/arch/arm64/kvm/hyp/vhe/sysreg-sr.c 
b/arch/arm64/kvm/hyp/vhe/sysreg-sr.c
index 2a0b8c88d74f..007a12dd4351 100644
--- a/arch/arm64/kvm/hyp/vhe/sysreg-sr.c
+++ b/arch/arm64/kvm/hyp/vhe/sysreg-sr.c
@@ -101,7 +101,7 @@ void 

[PATCH v3 05/15] KVM: arm64: Refactor sys_regs.h,c for nVHE reuse

2021-07-19 Thread Fuad Tabba
Refactor sys_regs.h and sys_regs.c to make it easier to reuse
common code. It will be used in nVHE in a later patch.

Note that the refactored code uses __inline_bsearch for find_reg
instead of bsearch to avoid copying the bsearch code for nVHE.

No functional change intended.

Signed-off-by: Fuad Tabba 
---
 arch/arm64/include/asm/sysreg.h |  3 +++
 arch/arm64/kvm/sys_regs.c   | 30 +-
 arch/arm64/kvm/sys_regs.h   | 31 +++
 3 files changed, 35 insertions(+), 29 deletions(-)

diff --git a/arch/arm64/include/asm/sysreg.h b/arch/arm64/include/asm/sysreg.h
index 7b9c3acba684..326f49e7bd42 100644
--- a/arch/arm64/include/asm/sysreg.h
+++ b/arch/arm64/include/asm/sysreg.h
@@ -1153,6 +1153,9 @@
 #define ICH_VTR_A3V_SHIFT  21
 #define ICH_VTR_A3V_MASK   (1 << ICH_VTR_A3V_SHIFT)
 
+/* Extract the feature specified from the feature id register. */
+#define FEATURE(x) (GENMASK_ULL(x##_SHIFT + 3, x##_SHIFT))
+
 #ifdef __ASSEMBLY__
 
.irp
num,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30
diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c
index 80a6e41cadad..1a939c464858 100644
--- a/arch/arm64/kvm/sys_regs.c
+++ b/arch/arm64/kvm/sys_regs.c
@@ -44,10 +44,6 @@
  * 64bit interface.
  */
 
-#define reg_to_encoding(x) \
-   sys_reg((u32)(x)->Op0, (u32)(x)->Op1,   \
-   (u32)(x)->CRn, (u32)(x)->CRm, (u32)(x)->Op2)
-
 static bool read_from_write_only(struct kvm_vcpu *vcpu,
 struct sys_reg_params *params,
 const struct sys_reg_desc *r)
@@ -1026,8 +1022,6 @@ static bool access_arch_timer(struct kvm_vcpu *vcpu,
return true;
 }
 
-#define FEATURE(x) (GENMASK_ULL(x##_SHIFT + 3, x##_SHIFT))
-
 /* Read a sanitised cpufeature ID register by sys_reg_desc */
 static u64 read_id_reg(const struct kvm_vcpu *vcpu,
struct sys_reg_desc const *r, bool raz)
@@ -2106,23 +2100,6 @@ static int check_sysreg_table(const struct sys_reg_desc 
*table, unsigned int n,
return 0;
 }
 
-static int match_sys_reg(const void *key, const void *elt)
-{
-   const unsigned long pval = (unsigned long)key;
-   const struct sys_reg_desc *r = elt;
-
-   return pval - reg_to_encoding(r);
-}
-
-static const struct sys_reg_desc *find_reg(const struct sys_reg_params *params,
-const struct sys_reg_desc table[],
-unsigned int num)
-{
-   unsigned long pval = reg_to_encoding(params);
-
-   return bsearch((void *)pval, table, num, sizeof(table[0]), 
match_sys_reg);
-}
-
 int kvm_handle_cp14_load_store(struct kvm_vcpu *vcpu)
 {
kvm_inject_undefined(vcpu);
@@ -2365,13 +2342,8 @@ int kvm_handle_sys_reg(struct kvm_vcpu *vcpu)
 
trace_kvm_handle_sys_reg(esr);
 
-   params.Op0 = (esr >> 20) & 3;
-   params.Op1 = (esr >> 14) & 0x7;
-   params.CRn = (esr >> 10) & 0xf;
-   params.CRm = (esr >> 1) & 0xf;
-   params.Op2 = (esr >> 17) & 0x7;
+   params = esr_sys64_to_params(esr);
params.regval = vcpu_get_reg(vcpu, Rt);
-   params.is_write = !(esr & 1);
 
	ret = emulate_sys_reg(vcpu, &params);
 
diff --git a/arch/arm64/kvm/sys_regs.h b/arch/arm64/kvm/sys_regs.h
index 9d0621417c2a..cc0cc95a0280 100644
--- a/arch/arm64/kvm/sys_regs.h
+++ b/arch/arm64/kvm/sys_regs.h
@@ -11,6 +11,12 @@
 #ifndef __ARM64_KVM_SYS_REGS_LOCAL_H__
 #define __ARM64_KVM_SYS_REGS_LOCAL_H__
 
+#include 
+
+#define reg_to_encoding(x) \
+   sys_reg((u32)(x)->Op0, (u32)(x)->Op1,   \
+   (u32)(x)->CRn, (u32)(x)->CRm, (u32)(x)->Op2)
+
 struct sys_reg_params {
u8  Op0;
u8  Op1;
@@ -21,6 +27,14 @@ struct sys_reg_params {
boolis_write;
 };
 
+#define esr_sys64_to_params(esr)   
\
+   ((struct sys_reg_params){ .Op0 = ((esr) >> 20) & 3,\
+ .Op1 = ((esr) >> 14) & 0x7,  \
+ .CRn = ((esr) >> 10) & 0xf,  \
+ .CRm = ((esr) >> 1) & 0xf,   \
+ .Op2 = ((esr) >> 17) & 0x7,  \
+ .is_write = !((esr) & 1) })
+
 struct sys_reg_desc {
/* Sysreg string for debug */
const char *name;
@@ -152,6 +166,23 @@ static inline int cmp_sys_reg(const struct sys_reg_desc 
*i1,
return i1->Op2 - i2->Op2;
 }
 
+static inline int match_sys_reg(const void *key, const void *elt)
+{
+   const unsigned long pval = (unsigned long)key;
+   const struct sys_reg_desc *r = elt;
+
+   return pval - reg_to_encoding(r);
+}
+
+static inline const struct 

[PATCH v3 09/15] KVM: arm64: Add config register bit definitions

2021-07-19 Thread Fuad Tabba
Add hardware configuration register bit definitions for HCR_EL2
and MDCR_EL2. Future patches toggle these hyp configuration
register bits to trap on certain accesses.

No functional change intended.

Signed-off-by: Fuad Tabba 
---
 arch/arm64/include/asm/kvm_arm.h | 22 ++
 1 file changed, 22 insertions(+)

diff --git a/arch/arm64/include/asm/kvm_arm.h b/arch/arm64/include/asm/kvm_arm.h
index a928b2dc0b0f..327120c0089f 100644
--- a/arch/arm64/include/asm/kvm_arm.h
+++ b/arch/arm64/include/asm/kvm_arm.h
@@ -12,8 +12,13 @@
 #include 
 
 /* Hyp Configuration Register (HCR) bits */
+
+#define HCR_TID5   (UL(1) << 58)
+#define HCR_DCT(UL(1) << 57)
 #define HCR_ATA_SHIFT  56
 #define HCR_ATA(UL(1) << HCR_ATA_SHIFT)
+#define HCR_AMVOFFEN   (UL(1) << 51)
+#define HCR_FIEN   (UL(1) << 47)
 #define HCR_FWB(UL(1) << 46)
 #define HCR_API(UL(1) << 41)
 #define HCR_APK(UL(1) << 40)
@@ -56,6 +61,7 @@
 #define HCR_PTW(UL(1) << 2)
 #define HCR_SWIO   (UL(1) << 1)
 #define HCR_VM (UL(1) << 0)
+#define HCR_RES0   ((UL(1) << 48) | (UL(1) << 39))
 
 /*
  * The bits we set in HCR:
@@ -277,11 +283,21 @@
 #define CPTR_EL2_TZ(1 << 8)
 #define CPTR_NVHE_EL2_RES1 0x32ff /* known RES1 bits in CPTR_EL2 
(nVHE) */
 #define CPTR_EL2_DEFAULT   CPTR_NVHE_EL2_RES1
+#define CPTR_NVHE_EL2_RES0 (GENMASK(63, 32) |  \
+GENMASK(29, 21) |  \
+GENMASK(19, 14) |  \
+BIT(11))
 
 /* Hyp Debug Configuration Register bits */
 #define MDCR_EL2_E2TB_MASK (UL(0x3))
 #define MDCR_EL2_E2TB_SHIFT(UL(24))
+#define MDCR_EL2_HPMFZS(UL(1) << 36)
+#define MDCR_EL2_HPMFZO(UL(1) << 29)
+#define MDCR_EL2_MTPME (UL(1) << 28)
+#define MDCR_EL2_TDCC  (UL(1) << 27)
+#define MDCR_EL2_HCCD  (UL(1) << 23)
 #define MDCR_EL2_TTRF  (UL(1) << 19)
+#define MDCR_EL2_HPMD  (UL(1) << 17)
 #define MDCR_EL2_TPMS  (UL(1) << 14)
 #define MDCR_EL2_E2PB_MASK (UL(0x3))
 #define MDCR_EL2_E2PB_SHIFT(UL(12))
@@ -293,6 +309,12 @@
 #define MDCR_EL2_TPM   (UL(1) << 6)
 #define MDCR_EL2_TPMCR (UL(1) << 5)
 #define MDCR_EL2_HPMN_MASK (UL(0x1F))
+#define MDCR_EL2_RES0  (GENMASK(63, 37) |  \
+GENMASK(35, 30) |  \
+GENMASK(25, 24) |  \
+GENMASK(22, 20) |  \
+BIT(18) |  \
+GENMASK(16, 15))
 
 /* For compatibility with fault code shared with 32-bit */
 #define FSC_FAULT  ESR_ELx_FSC_FAULT
-- 
2.32.0.402.g57bb445576-goog

___
kvmarm mailing list
kvmarm@lists.cs.columbia.edu
https://lists.cs.columbia.edu/mailman/listinfo/kvmarm


Re: [PATCH 09/14] KVM: arm64: Mark host bss and rodata section as shared

2021-07-19 Thread Quentin Perret
On Monday 19 Jul 2021 at 16:01:40 (+0100), Marc Zyngier wrote:
> On Mon, 19 Jul 2021 11:47:30 +0100,
> Quentin Perret  wrote:
> > +static int finalize_mappings(void)
> > +{
> > +   enum kvm_pgtable_prot prot = KVM_PGTABLE_PROT_RWX;
> > +   int ret;
> > +
> > +   /*
> > +* The host's .bss and .rodata sections are now conceptually owned by
> > +* the hypervisor, so mark them as 'borrowed' in the host stage-2. We
> > +* can safely use host_stage2_idmap_locked() at this point since the
> > +* host stage-2 has not been enabled yet.
> > +*/
> > +   prot |= KVM_PGTABLE_STATE_SHARED | KVM_PGTABLE_STATE_BORROWED;
> > +   ret = host_stage2_idmap_locked(__hyp_pa(__start_rodata),
> > +  __hyp_pa(__end_rodata), prot);
> 
> Do we really want to map the rodata section as RWX?

I know, feels odd, but for now I think so. The host is obviously
welcome to restrict things in its stage-1, but for stage-2, this is
just 'memory' so far, and the host is allowed to patch it if it wants to.

Eventually, yes, I think we should make it RO in the host stage-2, but
maybe that's for another series?

> > +   if (ret)
> > +   return ret;
> > +
> > +   return host_stage2_idmap_locked(__hyp_pa(__hyp_bss_end),
> > +   __hyp_pa(__bss_stop), prot);
> 
> If the 'locked' state implies SHARED+BORROWED, maybe consider moving
> the ORRing of the prot into host_stage2_idmap_locked()?

Ah no, sorry for the confusion, but 'locked' means that we already hold
the pgtable lock. That is not actually true here, but this is a special
case as only the current CPU can be messing with it at this point in
time so taking the lock would just be wasted cycles.

> > +}
> > +
> >  void __noreturn __pkvm_init_finalise(void)
> >  {
> > 	struct kvm_host_data *host_data = this_cpu_ptr(&kvm_host_data);
> > @@ -167,6 +199,10 @@ void __noreturn __pkvm_init_finalise(void)
> > if (ret)
> > goto out;
> >  
> > +   ret = finalize_mappings();
> > +   if (ret)
> > +   goto out;
> > +
> > pkvm_pgtable_mm_ops = (struct kvm_pgtable_mm_ops) {
> > .zalloc_page = hyp_zalloc_hyp_page,
> > .phys_to_virt = hyp_phys_to_virt,

Thanks,
Quentin
___
kvmarm mailing list
kvmarm@lists.cs.columbia.edu
https://lists.cs.columbia.edu/mailman/listinfo/kvmarm


Re: [PATCH 08/14] KVM: arm64: Add support for tagging shared pages in page-table

2021-07-19 Thread Quentin Perret
On Monday 19 Jul 2021 at 15:43:34 (+0100), Marc Zyngier wrote:
> On Mon, 19 Jul 2021 11:47:29 +0100,
> Quentin Perret  wrote:
> > 
> > The hypervisor will soon be in charge of tracking ownership of all
> > memory pages in the system. The current page-tracking infrastructure at
> > EL2 only allows binary states: a page is either owned or not by an
> > entity. But a number of use-cases will require more complex states for
> > pages that are shared between two entities (host, hypervisor, or guests).
> > 
> > In preparation for supporting these use-cases, introduce in the KVM
> > page-table library some infrastructure allowing to tag shared pages
> > using ignored bits (a.k.a. software bits) in PTEs.
> > 
> > Signed-off-by: Quentin Perret 
> > ---
> >  arch/arm64/include/asm/kvm_pgtable.h |  5 +
> >  arch/arm64/kvm/hyp/pgtable.c | 25 +
> >  2 files changed, 30 insertions(+)
> > 
> > diff --git a/arch/arm64/include/asm/kvm_pgtable.h 
> > b/arch/arm64/include/asm/kvm_pgtable.h
> > index dd72653314c7..f6d3d5c8910d 100644
> > --- a/arch/arm64/include/asm/kvm_pgtable.h
> > +++ b/arch/arm64/include/asm/kvm_pgtable.h
> > @@ -81,6 +81,8 @@ enum kvm_pgtable_stage2_flags {
> >   * @KVM_PGTABLE_PROT_W:Write permission.
> >   * @KVM_PGTABLE_PROT_R:Read permission.
> >   * @KVM_PGTABLE_PROT_DEVICE:   Device attributes.
> > + * @KVM_PGTABLE_STATE_SHARED:  Page shared with another entity.
> > + * @KVM_PGTABLE_STATE_BORROWED:Page borrowed from another entity.
> >   */
> >  enum kvm_pgtable_prot {
> > KVM_PGTABLE_PROT_X  = BIT(0),
> > @@ -88,6 +90,9 @@ enum kvm_pgtable_prot {
> > KVM_PGTABLE_PROT_R  = BIT(2),
> >  
> > KVM_PGTABLE_PROT_DEVICE = BIT(3),
> > +
> > +   KVM_PGTABLE_STATE_SHARED= BIT(4),
> > +   KVM_PGTABLE_STATE_BORROWED  = BIT(5),
> 
> I'd rather have some indirection here, as we have other potential
> users for the SW bits outside of pKVM (see the NV series, which uses
> some of these SW bits as the backend for TTL-based TLB invalidation).
> 
> Can we instead only describe the SW bit states in this enum, and let
> the users map the semantic they require onto that state? See [1] for
> what I carry in the NV branch.

Works for me -- I just wanted to make sure we don't have users in
different places that use the same bits without knowing, but no strong
opinions, so happy to change.

> >  };
> >  
> >  #define KVM_PGTABLE_PROT_RW(KVM_PGTABLE_PROT_R | 
> > KVM_PGTABLE_PROT_W)
> > diff --git a/arch/arm64/kvm/hyp/pgtable.c b/arch/arm64/kvm/hyp/pgtable.c
> > index 5bdbe7a31551..51598b79dafc 100644
> > --- a/arch/arm64/kvm/hyp/pgtable.c
> > +++ b/arch/arm64/kvm/hyp/pgtable.c
> > @@ -211,6 +211,29 @@ static kvm_pte_t kvm_init_invalid_leaf_owner(u8 
> > owner_id)
> > return FIELD_PREP(KVM_INVALID_PTE_OWNER_MASK, owner_id);
> >  }
> >  
> > +static kvm_pte_t pte_ignored_bit_prot(enum kvm_pgtable_prot prot)
> 
> Can we call these sw rather than ignored?

Sure.

> > +{
> > +   kvm_pte_t ignored_bits = 0;
> > +
> > +   /*
> > +* Ignored bits 0 and 1 are reserved to track the memory ownership
> > +* state of each page:
> > +*   00: The page is owned solely by the page-table owner.
> > +*   01: The page is owned by the page-table owner, but is shared
> > +*   with another entity.
> > +*   10: The page is shared with, but not owned by the page-table owner.
> > +*   11: Reserved for future use (lending).
> > +*/
> > +   if (prot & KVM_PGTABLE_STATE_SHARED) {
> > +   if (prot & KVM_PGTABLE_STATE_BORROWED)
> > +   ignored_bits |= BIT(1);
> > +   else
> > +   ignored_bits |= BIT(0);
> > +   }
> > +
> > +   return FIELD_PREP(KVM_PTE_LEAF_ATTR_IGNORED, ignored_bits);
> > +}
> > +
> >  static int kvm_pgtable_visitor_cb(struct kvm_pgtable_walk_data *data, u64 
> > addr,
> >   u32 level, kvm_pte_t *ptep,
> >   enum kvm_pgtable_walk_flags flag)
> > @@ -357,6 +380,7 @@ static int hyp_set_prot_attr(enum kvm_pgtable_prot 
> > prot, kvm_pte_t *ptep)
> > attr |= FIELD_PREP(KVM_PTE_LEAF_ATTR_LO_S1_AP, ap);
> > attr |= FIELD_PREP(KVM_PTE_LEAF_ATTR_LO_S1_SH, sh);
> > attr |= KVM_PTE_LEAF_ATTR_LO_S1_AF;
> > +   attr |= pte_ignored_bit_prot(prot);
> > *ptep = attr;
> >  
> > return 0;
> > @@ -558,6 +582,7 @@ static int stage2_set_prot_attr(struct kvm_pgtable 
> > *pgt, enum kvm_pgtable_prot p
> >  
> > attr |= FIELD_PREP(KVM_PTE_LEAF_ATTR_LO_S2_SH, sh);
> > attr |= KVM_PTE_LEAF_ATTR_LO_S2_AF;
> > +   attr |= pte_ignored_bit_prot(prot);
> > *ptep = attr;
> >  
> > return 0;
> 
> How about kvm_pgtable_stage2_relax_perms()?

It should leave SW bits untouched, and it really felt like a path where
we want to change permissions and nothing else. What did you have in
mind?

Cheers,
Quentin

Re: [PATCH 07/14] KVM: arm64: Enable forcing page-level stage-2 mappings

2021-07-19 Thread Quentin Perret
On Monday 19 Jul 2021 at 15:24:32 (+0100), Marc Zyngier wrote:
> On Mon, 19 Jul 2021 11:47:28 +0100,
> Quentin Perret  wrote:
> > 
> > Much of the stage-2 manipulation logic relies on being able to destroy
> > block mappings if e.g. installing a smaller mapping in the range. The
> > rationale for this behaviour is that stage-2 mappings can always be
> > re-created lazily. However, this gets more complicated when the stage-2
> > page-table is used to store metadata about the underlying pages. In such
> > a case, destroying a block mapping may lead to losing part of the
> > state, and confuse the user of those metadata (such as the hypervisor in
> > nVHE protected mode).
> > 
> > To fix this, introduce a callback function in the pgtable struct which
> > is called during all map operations to determine whether the mappings
> > can us blocks, or should be forced to page-granularity level. This is
> 
> nit: use?

Ack.

> > used by the hypervisor when creating the host stage-2 to force
> > page-level mappings when using non-default protection attributes.
> > 
> > Signed-off-by: Quentin Perret 
> > ---
> >  arch/arm64/include/asm/kvm_pgtable.h  | 63 +--
> >  arch/arm64/kvm/hyp/nvhe/mem_protect.c | 16 +--
> >  arch/arm64/kvm/hyp/pgtable.c  | 20 +++--
> >  3 files changed, 69 insertions(+), 30 deletions(-)
> > 
> > diff --git a/arch/arm64/include/asm/kvm_pgtable.h 
> > b/arch/arm64/include/asm/kvm_pgtable.h
> > index af62203d2f7a..dd72653314c7 100644
> > --- a/arch/arm64/include/asm/kvm_pgtable.h
> > +++ b/arch/arm64/include/asm/kvm_pgtable.h
> > @@ -75,25 +75,6 @@ enum kvm_pgtable_stage2_flags {
> > KVM_PGTABLE_S2_IDMAP= BIT(1),
> >  };
> >  
> > -/**
> > - * struct kvm_pgtable - KVM page-table.
> > - * @ia_bits:   Maximum input address size, in bits.
> > - * @start_level:   Level at which the page-table walk starts.
> > - * @pgd:   Pointer to the first top-level entry of the page-table.
> > - * @mm_ops:Memory management callbacks.
> > - * @mmu:   Stage-2 KVM MMU struct. Unused for stage-1 page-tables.
> > - */
> > -struct kvm_pgtable {
> > -   u32 ia_bits;
> > -   u32 start_level;
> > -   kvm_pte_t   *pgd;
> > -   struct kvm_pgtable_mm_ops   *mm_ops;
> > -
> > -   /* Stage-2 only */
> > -   struct kvm_s2_mmu   *mmu;
> > -   enum kvm_pgtable_stage2_flags   flags;
> > -};
> > -
> >  /**
> >   * enum kvm_pgtable_prot - Page-table permissions and attributes.
> >   * @KVM_PGTABLE_PROT_X:Execute permission.
> > @@ -109,11 +90,41 @@ enum kvm_pgtable_prot {
> > KVM_PGTABLE_PROT_DEVICE = BIT(3),
> >  };
> >  
> > -#define PAGE_HYP   (KVM_PGTABLE_PROT_R | KVM_PGTABLE_PROT_W)
> > +#define KVM_PGTABLE_PROT_RW(KVM_PGTABLE_PROT_R | 
> > KVM_PGTABLE_PROT_W)
> > +#define KVM_PGTABLE_PROT_RWX   (KVM_PGTABLE_PROT_RW | 
> > KVM_PGTABLE_PROT_X)
> > +
> > +#define PAGE_HYP   KVM_PGTABLE_PROT_RW
> >  #define PAGE_HYP_EXEC  (KVM_PGTABLE_PROT_R | 
> > KVM_PGTABLE_PROT_X)
> >  #define PAGE_HYP_RO(KVM_PGTABLE_PROT_R)
> >  #define PAGE_HYP_DEVICE(PAGE_HYP | KVM_PGTABLE_PROT_DEVICE)
> >  
> > +typedef bool (*kvm_pgtable_want_pte_cb_t)(u64 addr, u64 end,
> > + enum kvm_pgtable_prot prot);
> > +
> > +/**
> > + * struct kvm_pgtable - KVM page-table.
> > + * @ia_bits:   Maximum input address size, in bits.
> > + * @start_level:   Level at which the page-table walk starts.
> > + * @pgd:   Pointer to the first top-level entry of the page-table.
> > + * @mm_ops:Memory management callbacks.
> > + * @mmu:   Stage-2 KVM MMU struct. Unused for stage-1 page-tables.
> > + * @flags: Stage-2 page-table flags.
> > + * @want_pte_cb:   Callback function used during map operations to decide
> > + * whether block mappings can be used to map the given IPA
> > + * range.
> > + */
> > +struct kvm_pgtable {
> > +   u32 ia_bits;
> > +   u32 start_level;
> > +   kvm_pte_t   *pgd;
> > +   struct kvm_pgtable_mm_ops   *mm_ops;
> > +
> > +   /* Stage-2 only */
> > +   struct kvm_s2_mmu   *mmu;
> > +   enum kvm_pgtable_stage2_flags   flags;
> > +   kvm_pgtable_want_pte_cb_t   want_pte_cb;
> > +};
> 
> nit: does this whole definition really need to move around?

The alternative is to move (or forward declare) enum kvm_pgtable_prot
higher up in the file, but I have no strong opinion, so whatever you
prefer will work for me.

> > +
> >  /**
> >   * struct kvm_mem_range - Range of Intermediate Physical Addresses
> >   * @start: Start of the range.
> > @@ -216,21 

Re: [RFC PATCH 0/5] KVM: arm64: Pass PSCI to userspace

2021-07-19 Thread Jean-Philippe Brucker
Hi Alex,

I'm not planning to resend this work at the moment, because it looks like
vcpu hot-add will go a different way so I don't have a user. But I'll
probably address the feedback so far and park it on some branch, in case
anyone else needs it.

On Mon, Jul 19, 2021 at 04:29:18PM +0100, Alexandru Elisei wrote:
> 1. Why forwarding PSCI calls to userspace depend on enabling forwarding for 
> other
> HVC calls? As I understand from the patches, those handle distinct function 
> IDs.

The HVC cap from patch 4 enables returning from the VCPU_RUN ioctl with
KVM_EXIT_HYPERCALL, for any HVC not handled by KVM. This one should
definitely be improved, either by letting userspace choose the ranges of
HVC it wants, or at least by reporting ranges reserved by KVM to
userspace.

The PSCI cap from patch 5 disables the in-kernel PSCI implementation. As a
result those HVCs are forwarded to userspace.

It was suggested that other users will want to handle HVC calls (SDEI for
example [1]), hence splitting into two capabilities rather than just the
PSCI cap. In v5.14 x86 added KVM_CAP_EXIT_HYPERCALL [2], which lets
userspace receive specific hypercalls. We could reuse that and have PSCI
be one bit of that capability's parameter.

[1] 
https://lore.kernel.org/linux-arm-kernel/20170808164616.25949-12-james.mo...@arm.com/
[2] 
https://lore.kernel.org/kvm/90778988e1ee01926ff9cac447aacb745f954c8c.1623174621.git.ashish.ka...@amd.com/
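
For illustration, if PSCI ended up as one bit of such a capability, the
VMM side might opt in with something like this (just a sketch:
KVM_EXIT_HYPERCALL_PSCI is a made-up flag, not existing UAPI, and vm_fd
is the VM file descriptor):

	struct kvm_enable_cap cap = {
		.cap = KVM_CAP_EXIT_HYPERCALL,
		/* Hypothetical bit asking KVM to forward PSCI calls */
		.args = { KVM_EXIT_HYPERCALL_PSCI },
	};

	if (ioctl(vm_fd, KVM_ENABLE_CAP, &cap))
		err(1, "KVM_ENABLE_CAP");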

> 2. HVC call forwarding to userspace also forwards PSCI functions which are 
> defined
> in ARM DEN 0022D, but not (yet) implemented by KVM. What happens if KVM's PSCI
> implementation gets support for one of those functions? How does userspace 
> know
> that now it also needs to enable PSCI call forwarding to be able to handle 
> that
> function?

We forward the whole PSCI function range, so it's either KVM or userspace.
If KVM manages PSCI and the guest calls an unimplemented function, that
returns directly to the guest without going to userspace.

The concern is valid for any other range, though. If userspace enables the
HVC cap it receives function calls that at some point KVM might need to
handle itself. So we need some negotiation between user and KVM about the
specific HVC ranges that userspace can and will handle.

> It looks to me like the boundary between the functions that are forwarded 
> when HVC
> call forwarding is enabled and the functions that are forwarded when PSCI call
> forwarding is enabled is based on what Linux v5.13 handles. Have you 
> considered
> choosing this boundary based on something less arbitrary, like the function 
> types
> specified in ARM DEN 0028C, table 2-1?

For PSCI I've used the range 0-0x1f as the boundary, which is reserved for
PSCI by SMCCC (table 6-4 in that document).
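
In code that boundary is just the function number within the standard
service owner space, something like (a sketch using the generic
arm-smccc.h helpers, not code from this series):

#include <linux/arm-smccc.h>

/* True for the PSCI range: standard service calls 0x00-0x1f. */
static bool func_id_is_psci(u32 func_id)
{
	return ARM_SMCCC_OWNER_NUM(func_id) == ARM_SMCCC_OWNER_STANDARD &&
	       ARM_SMCCC_FUNC_NUM(func_id) <= 0x1f;
}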

> 
> In my opinion, setting the MP state to HALTED looks like a sensible approach 
> to
> implementing PSCI_SUSPEND. I'll take a closer look at the patches after I get 
> a
> better understanding about what is going on.
> 
> On 6/8/21 4:48 PM, Jean-Philippe Brucker wrote:
> > Allow userspace to request handling PSCI calls from guests. Our goal is
> > to enable a vCPU hot-add solution for Arm where the VMM presents
> > possible resources to the guest at boot, and controls which vCPUs can be
> > brought up by allowing or denying PSCI CPU_ON calls. Passing HVC and
> > PSCI to userspace has been discussed on the list in the context of vCPU
> > hot-add [1,2] but it can also be useful for implementing other SMCCC and
> > vendor hypercalls [3,4,5].
> >
> > Patches 1-3 allow userspace to request WFI to be executed in KVM. That
> 
> I don't understand this. KVM, in kvm_vcpu_block(), does not execute an WFI.
> PSCI_SUSPEND is documented as being indistinguishable from an WFI from the 
> guest's
> point of view, but its implementation is not architecturally defined.

Yes that was an oversimplification on my part

Thanks,
Jean
___
kvmarm mailing list
kvmarm@lists.cs.columbia.edu
https://lists.cs.columbia.edu/mailman/listinfo/kvmarm


Re: [PATCH 05/14] KVM: arm64: Don't overwrite ignored bits with owner id

2021-07-19 Thread Quentin Perret
On Monday 19 Jul 2021 at 13:55:29 (+0100), Marc Zyngier wrote:
> On Mon, 19 Jul 2021 11:47:26 +0100,
> Quentin Perret  wrote:
> > 
> > The nVHE protected mode uses invalid mappings in the host stage-2
> > page-table to track the owner of each page in the system. In order to
> > allow the usage of ignored bits (a.k.a. software bits) in these
> > mappings, move the owner encoding away from the top bits.
> 
> But that's exactly what the current situation is allowing: the use of
> the SW bits. I am guessing that what you really mean is that you want
> to *preserve* the SW bits from an existing mapping and use other bits
> when the mapping is invalid?

Yes, this is really just forward looking, but I think it might be useful
to allow annotating invalid mappings with both an owner id _and_
additional flags for e.g. shared pages and such. And using bits [58-55]
to store those flags just like we do for valid mappings should make
things easier, but no big deal.

I see how this is going to conflict with kvm_pgtable_stage2_annotate()
from your series though, so maybe I should just drop this patch and
leave the encoding 'issue' to the caller -- the rest of the series
doesn't depend on this anyway, this was just small cleanup.

Thanks,
Quentin
___
kvmarm mailing list
kvmarm@lists.cs.columbia.edu
https://lists.cs.columbia.edu/mailman/listinfo/kvmarm


Re: [PATCH 03/14] KVM: arm64: Continue stage-2 map when re-creating mappings

2021-07-19 Thread Quentin Perret
On Monday 19 Jul 2021 at 13:14:48 (+0100), Marc Zyngier wrote:
> On Mon, 19 Jul 2021 11:47:24 +0100,
> Quentin Perret  wrote:
> > 
> > The stage-2 map walkers currently return -EAGAIN when re-creating
> > identical mappings or only changing access permissions. This allows to
> > optimize mapping pages for concurrent (v)CPUs faulting on the same
> > page.
> > 
> > While this works as expected when touching one page-table leaf at a
> > time, this can lead to difficult situations when mapping larger ranges.
> > Indeed, a large map operation can fail in the middle if an existing
> > mapping is found in the range, even if it has compatible attributes,
> > hence leaving only half of the range mapped.
> 
> I'm curious of when this can happen. We normally map a single leaf at
> a time, and we don't have a way to map multiple leaves at once: we
> either use the VMA base size or try to upgrade it to a THP, but the
> result is always a single leaf entry. What changed?

Nothing _yet_ :-)

The 'share' hypercall introduced near the end of the series allows to
share multiple physically contiguous pages in one go -- this is mostly
to allow sharing data-structures that are larger than a page.

So if one of the pages happens to be already mapped by the time the
hypercall is issued, mapping the range with the right SW bits becomes
difficult as kvm_pgtable_stage2_map() will fail halfway through, which
is tricky to handle.

This patch shouldn't change anything for existing users that only map
things that are nicely aligned at block/page granularity, but should
make the life of new users easier, so that seemed like a win.
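
To picture the kind of caller this helps (a sketch, not the actual
hypercall code; pgt, prot and cache are placeholders), the whole share
boils down to a single map call over the range, with -EAGAIN treated as
benign now that it no longer means "stopped halfway":

	ret = kvm_pgtable_stage2_map(pgt, start, end - start, start,
				     prot, &cache);
	if (ret && ret != -EAGAIN)
		return ret;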

> > To avoid having to deal with such failures in the caller, don't
> > interrupt the map operation when hitting existing PTEs, but make sure to
> > still return -EAGAIN so that user_mem_abort() can mark the page dirty
> > when needed.
> 
> I don't follow you here: if you return -EAGAIN for a writable mapping,
> we don't account for the page to be dirty on the assumption that
> nothing has been mapped. But if there is a way to map more than a
> single entry and to get -EAGAIN at the same time, then we're bound to
> lose data on page eviction.
> 
> Can you shed some light on this?

Sure. For guests, hitting the -EAGAIN case means we've lost the race
with another vCPU that faulted the same page. In this case the other
vCPU either mapped the page RO, which means that our vCPU will then get
a permission fault next time we run it which will lead to the page being
marked dirty, or the other vCPU mapped the page RW in which case it
already marked the page dirty for us and we can safely re-enter the
guest without doing anything else.

So what I meant by "still return -EAGAIN so that user_mem_abort() can
mark the page dirty when needed" is "make sure to mark the page dirty
only when necessary: if winning the race and marking the page RW, or
in the permission fault path". That is, by keeping the -EAGAIN I want to
make sure we don't mark the page dirty twice. (This might be fine, but this
would be new behaviour, and it was not clear it would scale well to
many vCPUs faulting the same page).
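
Concretely, the caller-side rule I'm describing is roughly this (a
sketch, not the exact user_mem_abort() code):

	ret = kvm_pgtable_stage2_map(pgt, fault_ipa, PAGE_SIZE,
				     pfn << PAGE_SHIFT, prot, memcache);

	/*
	 * Only the vCPU that actually installed (or upgraded) the
	 * writable mapping marks the page dirty; a racer that got
	 * -EAGAIN either takes a permission fault later (page mapped
	 * RO by the winner) or the page is already dirty (mapped RW).
	 */
	if (writable && !ret) {
		kvm_set_pfn_dirty(pfn);
		mark_page_dirty_in_slot(kvm, memslot, gfn);
	}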

I see how this wording can be highly confusing though, I'll re-word it
for the next version.

Cheers,
Quentin
___
kvmarm mailing list
kvmarm@lists.cs.columbia.edu
https://lists.cs.columbia.edu/mailman/listinfo/kvmarm


Re: [PATCH v2 4/4] KVM: arm64: Remove PMSWINC_EL0 shadow register

2021-07-19 Thread Marc Zyngier

Hi Alex,

On 2021-07-19 17:35, Alexandru Elisei wrote:

Hi Marc,

On 7/19/21 1:39 PM, Marc Zyngier wrote:

We keep an entry for the PMSWINC_EL0 register in the vcpu structure,
while *never* writing anything there outside of reset.

Given that the register is defined as write-only, that we always
trap when this register is accessed, there is little point in saving
anything anyway.

Get rid of the entry, and save a mighty 8 bytes per vcpu structure.

We still need to keep it exposed to userspace in order to preserve
backward compatibility with previously saved VMs. Since userspace
cannot expect any effect of writing to PMSWINC_EL0, treat the
register as RAZ/WI for the purpose of userspace access.

Signed-off-by: Marc Zyngier 
---
 arch/arm64/include/asm/kvm_host.h |  1 -
 arch/arm64/kvm/sys_regs.c | 21 -
 2 files changed, 20 insertions(+), 2 deletions(-)

diff --git a/arch/arm64/include/asm/kvm_host.h 
b/arch/arm64/include/asm/kvm_host.h

index 41911585ae0c..afc169630884 100644
--- a/arch/arm64/include/asm/kvm_host.h
+++ b/arch/arm64/include/asm/kvm_host.h
@@ -185,7 +185,6 @@ enum vcpu_sysreg {
PMCNTENSET_EL0, /* Count Enable Set Register */
PMINTENSET_EL1, /* Interrupt Enable Set Register */
PMOVSSET_EL0,   /* Overflow Flag Status Set Register */
-   PMSWINC_EL0,/* Software Increment Register */
PMUSERENR_EL0,  /* User Enable Register */

/* Pointer Authentication Registers in a strict increasing order. */
diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c
index f22139658e48..a1f5101f49a3 100644
--- a/arch/arm64/kvm/sys_regs.c
+++ b/arch/arm64/kvm/sys_regs.c
@@ -1286,6 +1286,20 @@ static int set_raz_id_reg(struct kvm_vcpu 
*vcpu, const struct sys_reg_desc *rd,

return __set_id_reg(vcpu, rd, uaddr, true);
 }

+static int set_wi_reg(struct kvm_vcpu *vcpu, const struct 
sys_reg_desc *rd,

+ const struct kvm_one_reg *reg, void __user *uaddr)
+{
+   int err;
+   u64 val;
+
+   /* Perform the access even if we are going to ignore the value */
+   err = reg_from_user(&val, uaddr, sys_reg_to_index(rd));


I don't understand why the read still happens if the value is ignored.
Just so KVM
preserves the previous behaviour and tells userspace there was an 
error?


If userspace has given us a duff pointer, it needs to know about it.


+   if (err)
+   return err;
+
+   return 0;
+}
+
 static bool access_ctr(struct kvm_vcpu *vcpu, struct sys_reg_params 
*p,

   const struct sys_reg_desc *r)
 {
@@ -1629,8 +1643,13 @@ static const struct sys_reg_desc 
sys_reg_descs[] = {

  .access = access_pmcnten, .reg = PMCNTENSET_EL0 },
{ PMU_SYS_REG(SYS_PMOVSCLR_EL0),
  .access = access_pmovs, .reg = PMOVSSET_EL0 },
+   /*
+* PM_SWINC_EL0 is exposed to userspace as RAZ/WI, as it was
+* previously (and pointlessly) advertised in the past...
+*/
{ PMU_SYS_REG(SYS_PMSWINC_EL0),
- .access = access_pmswinc, .reg = PMSWINC_EL0 },
+ .get_user = get_raz_id_reg, .set_user = set_wi_reg,


In my opinion, the call chain to return 0 looks pretty confusing to me, 
as the

functions seemed made for ID register accesses, and the leaf function,
read_id_reg(), tries to match this register with a list of ID
registers. Since we
have already added a new function just for PMSWINC_EL0, I was
wondering if adding
another function, something like get_raz_reg(), would make more sense.


In that case, I'd rather just kill get_raz_id_reg() and replace it with
this get_raz_reg(). If we treat something as RAZ, who cares whether it is
an idreg or not?
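
Something like this, presumably (just a sketch, reusing the existing
reg_to_user() helper):

static int get_raz_reg(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd,
		       const struct kvm_one_reg *reg, void __user *uaddr)
{
	const u64 val = 0;

	/* RAZ: always read back as zero, whatever the register is. */
	return reg_to_user(uaddr, &val, sys_reg_to_index(rd));
}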

M.
--
Jazz is not dead. It just smells funny...
___
kvmarm mailing list
kvmarm@lists.cs.columbia.edu
https://lists.cs.columbia.edu/mailman/listinfo/kvmarm


Re: [PATCH v2 4/4] KVM: arm64: Remove PMSWINC_EL0 shadow register

2021-07-19 Thread Alexandru Elisei
Hi Marc,

On 7/19/21 1:39 PM, Marc Zyngier wrote:
> We keep an entry for the PMSWINC_EL0 register in the vcpu structure,
> while *never* writing anything there outside of reset.
>
> Given that the register is defined as write-only, that we always
> trap when this register is accessed, there is little point in saving
> anything anyway.
>
> Get rid of the entry, and save a mighty 8 bytes per vcpu structure.
>
> We still need to keep it exposed to userspace in order to preserve
> backward compatibility with previously saved VMs. Since userspace
> cannot expect any effect of writing to PMSWINC_EL0, treat the
> register as RAZ/WI for the purpose of userspace access.
>
> Signed-off-by: Marc Zyngier 
> ---
>  arch/arm64/include/asm/kvm_host.h |  1 -
>  arch/arm64/kvm/sys_regs.c | 21 -
>  2 files changed, 20 insertions(+), 2 deletions(-)
>
> diff --git a/arch/arm64/include/asm/kvm_host.h 
> b/arch/arm64/include/asm/kvm_host.h
> index 41911585ae0c..afc169630884 100644
> --- a/arch/arm64/include/asm/kvm_host.h
> +++ b/arch/arm64/include/asm/kvm_host.h
> @@ -185,7 +185,6 @@ enum vcpu_sysreg {
>   PMCNTENSET_EL0, /* Count Enable Set Register */
>   PMINTENSET_EL1, /* Interrupt Enable Set Register */
>   PMOVSSET_EL0,   /* Overflow Flag Status Set Register */
> - PMSWINC_EL0,/* Software Increment Register */
>   PMUSERENR_EL0,  /* User Enable Register */
>  
>   /* Pointer Authentication Registers in a strict increasing order. */
> diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c
> index f22139658e48..a1f5101f49a3 100644
> --- a/arch/arm64/kvm/sys_regs.c
> +++ b/arch/arm64/kvm/sys_regs.c
> @@ -1286,6 +1286,20 @@ static int set_raz_id_reg(struct kvm_vcpu *vcpu, const 
> struct sys_reg_desc *rd,
>   return __set_id_reg(vcpu, rd, uaddr, true);
>  }
>  
> +static int set_wi_reg(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd,
> +   const struct kvm_one_reg *reg, void __user *uaddr)
> +{
> + int err;
> + u64 val;
> +
> + /* Perform the access even if we are going to ignore the value */
> + err = reg_from_user(&val, uaddr, sys_reg_to_index(rd));

I don't understand why the read still happens if the value is ignored. Just so 
KVM
preserves the previous behaviour and tells userspace there was an error?

> + if (err)
> + return err;
> +
> + return 0;
> +}
> +
>  static bool access_ctr(struct kvm_vcpu *vcpu, struct sys_reg_params *p,
>  const struct sys_reg_desc *r)
>  {
> @@ -1629,8 +1643,13 @@ static const struct sys_reg_desc sys_reg_descs[] = {
> .access = access_pmcnten, .reg = PMCNTENSET_EL0 },
>   { PMU_SYS_REG(SYS_PMOVSCLR_EL0),
> .access = access_pmovs, .reg = PMOVSSET_EL0 },
> + /*
> +  * PM_SWINC_EL0 is exposed to userspace as RAZ/WI, as it was
> +  * previously (and pointlessly) advertised in the past...
> +  */
>   { PMU_SYS_REG(SYS_PMSWINC_EL0),
> -   .access = access_pmswinc, .reg = PMSWINC_EL0 },
> +   .get_user = get_raz_id_reg, .set_user = set_wi_reg,

In my opinion, the call chain to return 0 looks pretty confusing to me, as the
functions seemed made for ID register accesses, and the leaf function,
read_id_reg(), tries to match this register with a list of ID registers. Since 
we
have already added a new function just for PMSWINC_EL0, I was wondering if 
adding
another function, something like get_raz_reg(), would make more sense.

Thanks,

Alex

___
kvmarm mailing list
kvmarm@lists.cs.columbia.edu
https://lists.cs.columbia.edu/mailman/listinfo/kvmarm


Re: [PATCH v2 1/4] KVM: arm64: Narrow PMU sysreg reset values to architectural requirements

2021-07-19 Thread Alexandru Elisei
Hi Marc,

On 7/19/21 4:56 PM, Marc Zyngier wrote:
> On 2021-07-19 16:55, Alexandru Elisei wrote:
>> Hi Marc,
>>
>> On 7/19/21 1:38 PM, Marc Zyngier wrote:
>>> A number of the PMU sysregs expose reset values that are not
>>> compliant with the architecture (set bits in the RES0 ranges,
>>> for example).
>>>
>>> This in turn has the effect that we need to pointlessly mask
>>> some register fields when using them.
>>>
>>> Let's start by making sure we don't have illegal values in the
>>> shadow registers at reset time. This affects all the registers
>>> that dedicate one bit per counter, the counters themselves,
>>> PMEVTYPERn_EL0 and PMSELR_EL0.
>>>
>>> Reported-by: Alexandre Chartre 
>>> Reviewed-by: Alexandre Chartre 
>>> Acked-by: Russell King (Oracle) 
>>> Signed-off-by: Marc Zyngier 
>>> ---
>>>  arch/arm64/kvm/sys_regs.c | 43 ---
>>>  1 file changed, 40 insertions(+), 3 deletions(-)
>>>
>>> diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c
>>> index f6f126eb6ac1..96bdfa0e68b2 100644
>>> --- a/arch/arm64/kvm/sys_regs.c
>>> +++ b/arch/arm64/kvm/sys_regs.c
>>> @@ -603,6 +603,41 @@ static unsigned int pmu_visibility(const struct 
>>> kvm_vcpu
>>> *vcpu,
>>>  return REG_HIDDEN;
>>>  }
>>>
>>> +static void reset_pmu_reg(struct kvm_vcpu *vcpu, const struct sys_reg_desc 
>>> *r)
>>> +{
>>> +    u64 n, mask = BIT(ARMV8_PMU_CYCLE_IDX);
>>> +
>>> +    /* No PMU available, any PMU reg may UNDEF... */
>>> +    if (!kvm_arm_support_pmu_v3())
>>> +    return;
>>> +
>>> +    n = read_sysreg(pmcr_el0) >> ARMV8_PMU_PMCR_N_SHIFT;
>>> +    n &= ARMV8_PMU_PMCR_N_MASK;
>>> +    if (n)
>>> +    mask |= GENMASK(n - 1, 0);
>>
>> Hm... seems to be missing the cycle counter.
>
> Check the declaration for 'mask'... :-)

Yeah, sorry for that, I still had in my mind the original function body.

Everything looks alright to me, no changes from the previous version 
(PMSWINC_EL1
is handled in the last patch) where I had checked that the reset values match 
the
architecture:

Reviewed-by: Alexandru Elisei 

Thanks,

Alex

___
kvmarm mailing list
kvmarm@lists.cs.columbia.edu
https://lists.cs.columbia.edu/mailman/listinfo/kvmarm


Re: [PATCH v2 1/4] KVM: arm64: Narrow PMU sysreg reset values to architectural requirements

2021-07-19 Thread Marc Zyngier

On 2021-07-19 16:55, Alexandru Elisei wrote:

Hi Marc,

On 7/19/21 1:38 PM, Marc Zyngier wrote:

A number of the PMU sysregs expose reset values that are not
compliant with the architecture (set bits in the RES0 ranges,
for example).

This in turn has the effect that we need to pointlessly mask
some register fields when using them.

Let's start by making sure we don't have illegal values in the
shadow registers at reset time. This affects all the registers
that dedicate one bit per counter, the counters themselves,
PMEVTYPERn_EL0 and PMSELR_EL0.

Reported-by: Alexandre Chartre 
Reviewed-by: Alexandre Chartre 
Acked-by: Russell King (Oracle) 
Signed-off-by: Marc Zyngier 
---
 arch/arm64/kvm/sys_regs.c | 43 
---

 1 file changed, 40 insertions(+), 3 deletions(-)

diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c
index f6f126eb6ac1..96bdfa0e68b2 100644
--- a/arch/arm64/kvm/sys_regs.c
+++ b/arch/arm64/kvm/sys_regs.c
@@ -603,6 +603,41 @@ static unsigned int pmu_visibility(const struct 
kvm_vcpu *vcpu,

return REG_HIDDEN;
 }

+static void reset_pmu_reg(struct kvm_vcpu *vcpu, const struct 
sys_reg_desc *r)

+{
+   u64 n, mask = BIT(ARMV8_PMU_CYCLE_IDX);
+
+   /* No PMU available, any PMU reg may UNDEF... */
+   if (!kvm_arm_support_pmu_v3())
+   return;
+
+   n = read_sysreg(pmcr_el0) >> ARMV8_PMU_PMCR_N_SHIFT;
+   n &= ARMV8_PMU_PMCR_N_MASK;
+   if (n)
+   mask |= GENMASK(n - 1, 0);


Hm... seems to be missing the cycle counter.


Check the declaration for 'mask'... :-)
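
To spell it out with numbers (say PMCR_EL0.N reads back as 4):

	u64 mask = BIT(ARMV8_PMU_CYCLE_IDX);	/* BIT(31): the cycle counter */

	mask |= GENMASK(4 - 1, 0);		/* the four event counters */
	/* mask == 0x8000000f */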

M.
--
Jazz is not dead. It just smells funny...
___
kvmarm mailing list
kvmarm@lists.cs.columbia.edu
https://lists.cs.columbia.edu/mailman/listinfo/kvmarm


Re: [PATCH v2 1/4] KVM: arm64: Narrow PMU sysreg reset values to architectural requirements

2021-07-19 Thread Alexandru Elisei
Hi Marc,

On 7/19/21 1:38 PM, Marc Zyngier wrote:
> A number of the PMU sysregs expose reset values that are not
> compliant with the architecture (set bits in the RES0 ranges,
> for example).
>
> This in turn has the effect that we need to pointlessly mask
> some register fields when using them.
>
> Let's start by making sure we don't have illegal values in the
> shadow registers at reset time. This affects all the registers
> that dedicate one bit per counter, the counters themselves,
> PMEVTYPERn_EL0 and PMSELR_EL0.
>
> Reported-by: Alexandre Chartre 
> Reviewed-by: Alexandre Chartre 
> Acked-by: Russell King (Oracle) 
> Signed-off-by: Marc Zyngier 
> ---
>  arch/arm64/kvm/sys_regs.c | 43 ---
>  1 file changed, 40 insertions(+), 3 deletions(-)
>
> diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c
> index f6f126eb6ac1..96bdfa0e68b2 100644
> --- a/arch/arm64/kvm/sys_regs.c
> +++ b/arch/arm64/kvm/sys_regs.c
> @@ -603,6 +603,41 @@ static unsigned int pmu_visibility(const struct kvm_vcpu 
> *vcpu,
>   return REG_HIDDEN;
>  }
>  
> +static void reset_pmu_reg(struct kvm_vcpu *vcpu, const struct sys_reg_desc 
> *r)
> +{
> + u64 n, mask = BIT(ARMV8_PMU_CYCLE_IDX);
> +
> + /* No PMU available, any PMU reg may UNDEF... */
> + if (!kvm_arm_support_pmu_v3())
> + return;
> +
> + n = read_sysreg(pmcr_el0) >> ARMV8_PMU_PMCR_N_SHIFT;
> + n &= ARMV8_PMU_PMCR_N_MASK;
> + if (n)
> + mask |= GENMASK(n - 1, 0);

Hm... seems to be missing the cycle counter.

Thanks,

Alex

> +
> + reset_unknown(vcpu, r);
> + __vcpu_sys_reg(vcpu, r->reg) &= mask;
> +}
> +
> +static void reset_pmevcntr(struct kvm_vcpu *vcpu, const struct sys_reg_desc 
> *r)
> +{
> + reset_unknown(vcpu, r);
> + __vcpu_sys_reg(vcpu, r->reg) &= GENMASK(31, 0);
> +}
> +
> +static void reset_pmevtyper(struct kvm_vcpu *vcpu, const struct sys_reg_desc 
> *r)
> +{
> + reset_unknown(vcpu, r);
> + __vcpu_sys_reg(vcpu, r->reg) &= ARMV8_PMU_EVTYPE_MASK;
> +}
> +
> +static void reset_pmselr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r)
> +{
> + reset_unknown(vcpu, r);
> + __vcpu_sys_reg(vcpu, r->reg) &= ARMV8_PMU_COUNTER_MASK;
> +}
> +
>  static void reset_pmcr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r)
>  {
>   u64 pmcr, val;
> @@ -944,16 +979,18 @@ static bool access_pmuserenr(struct kvm_vcpu *vcpu, 
> struct sys_reg_params *p,
> trap_wcr, reset_wcr, 0, 0,  get_wcr, set_wcr }
>  
>  #define PMU_SYS_REG(r)   \
> - SYS_DESC(r), .reset = reset_unknown, .visibility = pmu_visibility
> + SYS_DESC(r), .reset = reset_pmu_reg, .visibility = pmu_visibility
>  
>  /* Macro to expand the PMEVCNTRn_EL0 register */
>  #define PMU_PMEVCNTR_EL0(n)  \
>   { PMU_SYS_REG(SYS_PMEVCNTRn_EL0(n)),\
> +   .reset = reset_pmevcntr,  \
> .access = access_pmu_evcntr, .reg = (PMEVCNTR0_EL0 + n), }
>  
>  /* Macro to expand the PMEVTYPERn_EL0 register */
>  #define PMU_PMEVTYPER_EL0(n) \
>   { PMU_SYS_REG(SYS_PMEVTYPERn_EL0(n)),   \
> +   .reset = reset_pmevtyper, \
> .access = access_pmu_evtyper, .reg = (PMEVTYPER0_EL0 + n), }
>  
>  static bool undef_access(struct kvm_vcpu *vcpu, struct sys_reg_params *p,
> @@ -1595,13 +1632,13 @@ static const struct sys_reg_desc sys_reg_descs[] = {
>   { PMU_SYS_REG(SYS_PMSWINC_EL0),
> .access = access_pmswinc, .reg = PMSWINC_EL0 },
>   { PMU_SYS_REG(SYS_PMSELR_EL0),
> -   .access = access_pmselr, .reg = PMSELR_EL0 },
> +   .access = access_pmselr, .reset = reset_pmselr, .reg = PMSELR_EL0 },
>   { PMU_SYS_REG(SYS_PMCEID0_EL0),
> .access = access_pmceid, .reset = NULL },
>   { PMU_SYS_REG(SYS_PMCEID1_EL0),
> .access = access_pmceid, .reset = NULL },
>   { PMU_SYS_REG(SYS_PMCCNTR_EL0),
> -   .access = access_pmu_evcntr, .reg = PMCCNTR_EL0 },
> +   .access = access_pmu_evcntr, .reset = reset_unknown, .reg = 
> PMCCNTR_EL0 },
>   { PMU_SYS_REG(SYS_PMXEVTYPER_EL0),
> .access = access_pmu_evtyper, .reset = NULL },
>   { PMU_SYS_REG(SYS_PMXEVCNTR_EL0),
___
kvmarm mailing list
kvmarm@lists.cs.columbia.edu
https://lists.cs.columbia.edu/mailman/listinfo/kvmarm


Re: [RFC PATCH 0/5] KVM: arm64: Pass PSCI to userspace

2021-07-19 Thread Alexandru Elisei
Hi Jean-Philippe,

I'm not really familiar with this part of KVM, and I'm still trying to get my 
head
around how this works, so please bare with me if I ask silly questions.

This is how I understand this will work:

1. VMM opts in to forward HVC calls not handled by KVM.

2. VMM opts in to forward PSCI calls, other than
PSCI_1_0_FN_PSCI_FEATURES(ARM_SMCCC_VERSION_FUNC_ID).

3. Guest emulates PSCI calls (and all the other HVC calls).

    3.a For CPU_SUSPEND coming from VCPU A, userspace does a
KVM_SET_MP_STATE(KVM_MP_STATE_HALTED) ioctl on the VCPU fd which sets the 
request
KVM_REQ_SUSPEND.

    3.b The next time the VCPU is run, KVM blocks the VCPU as a result of the
request. kvm_vcpu_block() does a schedule() in a loop until it decides that the
CPU must unblock.

    3.c The VCPU will run as normal after kvm_vcpu_block() returns.

Please correct me if I got something wrong.
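
If I got that right, the VMM side of step 3.a would be something like
this (a sketch, assuming the exit layout proposed in the series):

	case KVM_EXIT_HYPERCALL:
		if (run->hypercall.nr == PSCI_0_2_FN64_CPU_SUSPEND) {
			struct kvm_mp_state mp = {
				.mp_state = KVM_MP_STATE_HALTED,
			};

			if (ioctl(vcpu_fd, KVM_SET_MP_STATE, &mp))
				err(1, "KVM_SET_MP_STATE");
		}
		break;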

I have a few general questions. It doesn't mean there's something wrong with 
your
approach, I'm just trying to understand it better.

1. Why forwarding PSCI calls to userspace depend on enabling forwarding for 
other
HVC calls? As I understand from the patches, those handle distinct function IDs.

2. HVC call forwarding to userspace also forwards PSCI functions which are 
defined
in ARM DEN 0022D, but not (yet) implemented by KVM. What happens if KVM's PSCI
implementation gets support for one of those functions? How does userspace know
that now it also needs to enable PSCI call forwarding to be able to handle that
function?

It looks to me like the boundary between the functions that are forwarded when 
HVC
call forwarding is enabled and the functions that are forwarded when PSCI call
forwarding is enabled is based on what Linux v5.13 handles. Have you considered
choosing this boundary based on something less arbitrary, like the function 
types
specified in ARM DEN 0028C, table 2-1?

In my opinion, setting the MP state to HALTED looks like a sensible approach to
implementing PSCI_SUSPEND. I'll take a closer look at the patches after I get a
better understanding about what is going on.

On 6/8/21 4:48 PM, Jean-Philippe Brucker wrote:
> Allow userspace to request handling PSCI calls from guests. Our goal is
> to enable a vCPU hot-add solution for Arm where the VMM presents
> possible resources to the guest at boot, and controls which vCPUs can be
> brought up by allowing or denying PSCI CPU_ON calls. Passing HVC and
> PSCI to userspace has been discussed on the list in the context of vCPU
> hot-add [1,2] but it can also be useful for implementing other SMCCC and
> vendor hypercalls [3,4,5].
>
> Patches 1-3 allow userspace to request WFI to be executed in KVM. That

I don't understand this. KVM, in kvm_vcpu_block(), does not execute an WFI.
PSCI_SUSPEND is documented as being indistinguishable from an WFI from the 
guest's
point of view, but its implementation is not architecturally defined.

Thanks,

Alex

> way the VMM can easily implement the PSCI CPU_SUSPEND function, which is
> mandatory from PSCI v0.2 onwards (even if it doesn't have a more useful
> implementation than WFI, natively available to the guest).
>
> Patch 4 lets userspace request any HVC that isn't handled by KVM, and
> patch 5 lets userspace request PSCI calls, disabling in-kernel PSCI
> handling.
>
> I'm focusing on the PSCI bits, but a complete prototype of vCPU hot-add
> for arm64 on Linux and QEMU, most of it from Salil and James, is
> available at [6].
>
> [1] 
> https://lore.kernel.org/kvmarm/82879258-46a7-a6e9-ee54-fc3692c1c...@arm.com/
> [2] 
> https://lore.kernel.org/linux-arm-kernel/20200625133757.22332-1-salil.me...@huawei.com/
> (Followed by KVM forum and Linaro Open discussions)
> [3] 
> https://lore.kernel.org/linux-arm-kernel/f56cf420-affc-35f0-2355-801a924b8...@arm.com/
> [4] https://lore.kernel.org/kvm/bf7e83f1-c58e-8d65-edd0-d08f27b8b...@arm.com/
> [5] 
> https://lore.kernel.org/kvm/1569338454-26202-2-git-send-email-guoh...@huawei.com/
> [6] https://jpbrucker.net/git/linux/log/?h=cpuhp/devel
> https://jpbrucker.net/git/qemu/log/?h=cpuhp/devel
>
> Jean-Philippe Brucker (5):
>   KVM: arm64: Replace power_off with mp_state in struct kvm_vcpu_arch
>   KVM: arm64: Move WFI execution to check_vcpu_requests()
>   KVM: arm64: Allow userspace to request WFI
>   KVM: arm64: Pass hypercalls to userspace
>   KVM: arm64: Pass PSCI calls to userspace
>
>  Documentation/virt/kvm/api.rst  | 46 +++
>  Documentation/virt/kvm/arm/psci.rst |  1 +
>  arch/arm64/include/asm/kvm_host.h   | 10 +++-
>  include/kvm/arm_hypercalls.h|  1 +
>  include/kvm/arm_psci.h  |  4 ++
>  include/uapi/linux/kvm.h|  3 ++
>  arch/arm64/kvm/arm.c| 71 +
>  arch/arm64/kvm/handle_exit.c|  3 +-
>  arch/arm64/kvm/hypercalls.c | 28 +++-
>  arch/arm64/kvm/psci.c   | 69 ++--
>  10 files changed, 170 insertions(+), 66 deletions(-)

Re: [PATCH 09/14] KVM: arm64: Mark host bss and rodata section as shared

2021-07-19 Thread Marc Zyngier
On Mon, 19 Jul 2021 11:47:30 +0100,
Quentin Perret  wrote:
> 
> As the hypervisor maps the host's .bss and .rodata sections in its
> stage-1, make sure to tag them as shared in hyp and host page-tables.
> 
> But since the hypervisor relies on the presence of these mappings, we
> cannot let the host be in complete control of the memory regions -- it
> must not unshare or donate them to another entity for example. To
> prevent this, let's transfer the ownership of those ranges to the
> hypervisor itself, and share the page back with the host.
> 
> Signed-off-by: Quentin Perret 
> ---
>  arch/arm64/kvm/hyp/include/nvhe/mem_protect.h |  1 +
>  arch/arm64/kvm/hyp/nvhe/mem_protect.c |  7 ++-
>  arch/arm64/kvm/hyp/nvhe/setup.c   | 52 ---
>  3 files changed, 51 insertions(+), 9 deletions(-)
> 
> diff --git a/arch/arm64/kvm/hyp/include/nvhe/mem_protect.h 
> b/arch/arm64/kvm/hyp/include/nvhe/mem_protect.h
> index 9c227d87c36d..b39047463075 100644
> --- a/arch/arm64/kvm/hyp/include/nvhe/mem_protect.h
> +++ b/arch/arm64/kvm/hyp/include/nvhe/mem_protect.h
> @@ -23,6 +23,7 @@ extern struct host_kvm host_kvm;
>  int __pkvm_prot_finalize(void);
>  int __pkvm_mark_hyp(phys_addr_t start, phys_addr_t end);
>  
> +int host_stage2_idmap_locked(u64 start, u64 end, enum kvm_pgtable_prot prot);
>  int kvm_host_prepare_stage2(void *pgt_pool_base);
>  void handle_host_mem_abort(struct kvm_cpu_context *host_ctxt);
>  
> diff --git a/arch/arm64/kvm/hyp/nvhe/mem_protect.c 
> b/arch/arm64/kvm/hyp/nvhe/mem_protect.c
> index cdace80d3e28..6f28edf58407 100644
> --- a/arch/arm64/kvm/hyp/nvhe/mem_protect.c
> +++ b/arch/arm64/kvm/hyp/nvhe/mem_protect.c
> @@ -235,6 +235,11 @@ static bool host_stage2_want_pte_cb(u64 addr, u64 end, 
> enum kvm_pgtable_prot pro
>   return prot != KVM_PGTABLE_PROT_RW;
>  }
>  
> +int host_stage2_idmap_locked(u64 start, u64 end, enum kvm_pgtable_prot prot)
> +{
> + return host_stage2_try(__host_stage2_idmap, start, end, prot);
> +}
> +
>  static int host_stage2_idmap(u64 addr)
>  {
>   enum kvm_pgtable_prot prot = KVM_PGTABLE_PROT_RW;
> @@ -250,7 +255,7 @@ static int host_stage2_idmap(u64 addr)
>   if (ret)
>   goto unlock;
>  
> - ret = host_stage2_try(__host_stage2_idmap, range.start, range.end, 
> prot);
> + ret = host_stage2_idmap_locked(range.start, range.end, prot);
>  unlock:
>   hyp_spin_unlock(&host_kvm.lock);
>  
> diff --git a/arch/arm64/kvm/hyp/nvhe/setup.c b/arch/arm64/kvm/hyp/nvhe/setup.c
> index 0b574d106519..74dce83a6fad 100644
> --- a/arch/arm64/kvm/hyp/nvhe/setup.c
> +++ b/arch/arm64/kvm/hyp/nvhe/setup.c
> @@ -83,10 +83,6 @@ static int recreate_hyp_mappings(phys_addr_t phys, 
> unsigned long size,
>   if (ret)
>   return ret;
>  
> - ret = pkvm_create_mappings(__start_rodata, __end_rodata, PAGE_HYP_RO);
> - if (ret)
> - return ret;
> -
>   ret = pkvm_create_mappings(__hyp_rodata_start, __hyp_rodata_end, 
> PAGE_HYP_RO);
>   if (ret)
>   return ret;
> @@ -95,10 +91,6 @@ static int recreate_hyp_mappings(phys_addr_t phys, 
> unsigned long size,
>   if (ret)
>   return ret;
>  
> - ret = pkvm_create_mappings(__hyp_bss_end, __bss_stop, PAGE_HYP_RO);
> - if (ret)
> - return ret;
> -
>   ret = pkvm_create_mappings(virt, virt + size, PAGE_HYP);
>   if (ret)
>   return ret;
> @@ -117,6 +109,25 @@ static int recreate_hyp_mappings(phys_addr_t phys, 
> unsigned long size,
>   return ret;
>   }
>  
> + /*
> +  * Map the host's .bss and .rodata sections RO in the hypervisor, but
> +  * transfer the ownership from the host to the hypervisor itself to
> +  * make sure it can't be donated or shared with another entity.
> +  *
> +  * The ownership transition requires matching changes in the host
> +  * stage-2. This will be done later (see finalize_mappings()) once the
> +  * hyp_vmemmap is addressable.
> +  */
> + ret = pkvm_create_mappings(__start_rodata, __end_rodata,
> +PAGE_HYP_RO | KVM_PGTABLE_STATE_SHARED);
> + if (ret)
> + return ret;
> +
> + ret = pkvm_create_mappings(__hyp_bss_end, __bss_stop,
> +PAGE_HYP_RO | KVM_PGTABLE_STATE_SHARED);
> + if (ret)
> + return ret;
> +
>   return 0;
>  }
>  
> @@ -148,6 +159,27 @@ static void hpool_put_page(void *addr)
>   hyp_put_page(&hpool, addr);
>  }
>  
> +static int finalize_mappings(void)
> +{
> + enum kvm_pgtable_prot prot = KVM_PGTABLE_PROT_RWX;
> + int ret;
> +
> + /*
> +  * The host's .bss and .rodata sections are now conceptually owned by
> +  * the hypervisor, so mark them as 'borrowed' in the host stage-2. We
> +  * can safely use host_stage2_idmap_locked() at this point since the
> +  * host stage-2 has not been enabled yet.
> +  */
> + prot |= KVM_PGTABLE_STATE_SHARED | 

[PATCH 11/14] KVM: arm64: Expose kvm_pte_valid() helper

2021-07-19 Thread Quentin Perret
The KVM pgtable API exposes the kvm_pgtable_walk() function to allow
the definition of walkers outside of pgtable.c. However, it is not easy
to implement any of those walkers without some of the low-level helpers,
such as kvm_pte_valid(). Make it static inline, and move it to the
header file to allow its re-use in other places.

Signed-off-by: Quentin Perret 
---
 arch/arm64/include/asm/kvm_pgtable.h | 7 +++
 arch/arm64/kvm/hyp/pgtable.c | 6 --
 2 files changed, 7 insertions(+), 6 deletions(-)

diff --git a/arch/arm64/include/asm/kvm_pgtable.h 
b/arch/arm64/include/asm/kvm_pgtable.h
index 1aa49d6aabb7..8240c881ae1e 100644
--- a/arch/arm64/include/asm/kvm_pgtable.h
+++ b/arch/arm64/include/asm/kvm_pgtable.h
@@ -25,6 +25,13 @@ static inline u64 kvm_get_parange(u64 mmfr0)
 
 typedef u64 kvm_pte_t;
 
+#define KVM_PTE_VALID  BIT(0)
+
+static inline bool kvm_pte_valid(kvm_pte_t pte)
+{
+   return pte & KVM_PTE_VALID;
+}
+
 /**
  * struct kvm_pgtable_mm_ops - Memory management callbacks.
  * @zalloc_page:   Allocate a single zeroed memory page.
diff --git a/arch/arm64/kvm/hyp/pgtable.c b/arch/arm64/kvm/hyp/pgtable.c
index c7120797404a..e0ae57dca827 100644
--- a/arch/arm64/kvm/hyp/pgtable.c
+++ b/arch/arm64/kvm/hyp/pgtable.c
@@ -11,7 +11,6 @@
 #include 
 #include 
 
-#define KVM_PTE_VALID  BIT(0)
 
 #define KVM_PTE_TYPE   BIT(1)
 #define KVM_PTE_TYPE_BLOCK 0
@@ -135,11 +134,6 @@ static u32 kvm_pgd_pages(u32 ia_bits, u32 start_level)
	return __kvm_pgd_page_idx(&pgt, -1ULL) + 1;
 }
 
-static bool kvm_pte_valid(kvm_pte_t pte)
-{
-   return pte & KVM_PTE_VALID;
-}
-
 static bool kvm_pte_table(kvm_pte_t pte, u32 level)
 {
if (level == KVM_PGTABLE_MAX_LEVELS - 1)
-- 
2.32.0.402.g57bb445576-goog

___
kvmarm mailing list
kvmarm@lists.cs.columbia.edu
https://lists.cs.columbia.edu/mailman/listinfo/kvmarm


[PATCH 12/14] KVM: arm64: Refactor pkvm_pgtable locking

2021-07-19 Thread Quentin Perret
Refactor the hypervisor stage-1 locking in nVHE protected mode to expose
a new pkvm_create_mappings_locked() function. This will be used in later
patches to allow walking and changing the hypervisor stage-1 without
releasing the lock.

Signed-off-by: Quentin Perret 
---
 arch/arm64/kvm/hyp/include/nvhe/mm.h |  1 +
 arch/arm64/kvm/hyp/nvhe/mm.c | 16 ++--
 2 files changed, 15 insertions(+), 2 deletions(-)

diff --git a/arch/arm64/kvm/hyp/include/nvhe/mm.h 
b/arch/arm64/kvm/hyp/include/nvhe/mm.h
index 8ec3a5a7744b..c76d7136ed9b 100644
--- a/arch/arm64/kvm/hyp/include/nvhe/mm.h
+++ b/arch/arm64/kvm/hyp/include/nvhe/mm.h
@@ -23,6 +23,7 @@ int hyp_map_vectors(void);
 int hyp_back_vmemmap(phys_addr_t phys, unsigned long size, phys_addr_t back);
 int pkvm_cpu_set_vector(enum arm64_hyp_spectre_vector slot);
 int pkvm_create_mappings(void *from, void *to, enum kvm_pgtable_prot prot);
+int pkvm_create_mappings_locked(void *from, void *to, enum kvm_pgtable_prot 
prot);
 int __pkvm_create_mappings(unsigned long start, unsigned long size,
   unsigned long phys, enum kvm_pgtable_prot prot);
 unsigned long __pkvm_create_private_mapping(phys_addr_t phys, size_t size,
diff --git a/arch/arm64/kvm/hyp/nvhe/mm.c b/arch/arm64/kvm/hyp/nvhe/mm.c
index a8efdf0f9003..dde22e2a322a 100644
--- a/arch/arm64/kvm/hyp/nvhe/mm.c
+++ b/arch/arm64/kvm/hyp/nvhe/mm.c
@@ -67,7 +67,7 @@ unsigned long __pkvm_create_private_mapping(phys_addr_t phys, 
size_t size,
return addr;
 }
 
-int pkvm_create_mappings(void *from, void *to, enum kvm_pgtable_prot prot)
+int pkvm_create_mappings_locked(void *from, void *to, enum kvm_pgtable_prot 
prot)
 {
unsigned long start = (unsigned long)from;
unsigned long end = (unsigned long)to;
@@ -81,7 +81,8 @@ int pkvm_create_mappings(void *from, void *to, enum 
kvm_pgtable_prot prot)
int err;
 
phys = hyp_virt_to_phys((void *)virt_addr);
-   err = __pkvm_create_mappings(virt_addr, PAGE_SIZE, phys, prot);
+   err = kvm_pgtable_hyp_map(&pkvm_pgtable, virt_addr, PAGE_SIZE,
+ phys, prot);
if (err)
return err;
}
@@ -89,6 +90,17 @@ int pkvm_create_mappings(void *from, void *to, enum 
kvm_pgtable_prot prot)
return 0;
 }
 
+int pkvm_create_mappings(void *from, void *to, enum kvm_pgtable_prot prot)
+{
+   int ret;
+
+   hyp_spin_lock(&pkvm_pgd_lock);
+   ret = pkvm_create_mappings_locked(from, to, prot);
+   hyp_spin_unlock(&pkvm_pgd_lock);
+
+   return ret;
+}
+
 int hyp_back_vmemmap(phys_addr_t phys, unsigned long size, phys_addr_t back)
 {
unsigned long start, end;
-- 
2.32.0.402.g57bb445576-goog

___
kvmarm mailing list
kvmarm@lists.cs.columbia.edu
https://lists.cs.columbia.edu/mailman/listinfo/kvmarm


[PATCH 13/14] KVM: arm64: Restrict hyp stage-1 manipulation in protected mode

2021-07-19 Thread Quentin Perret
The host kernel is currently able to change EL2 stage-1 mappings without
restrictions thanks to the __pkvm_create_mappings() hypercall. But in a
world where the host is no longer part of the TCB, this clearly poses a
problem.

To fix this, introduce a new hypercall to allow the host to share a
range of physical memory with the hypervisor, and remove the
__pkvm_create_mappings() variant. The new hypercall implements
ownership and permission checks before allowing the sharing operation,
and it annotates the shared pages in the hypervisor stage-1 and host
stage-2 page-tables.

Signed-off-by: Quentin Perret 
---
 arch/arm64/include/asm/kvm_asm.h  |   2 +-
 arch/arm64/kvm/hyp/include/nvhe/mem_protect.h |   1 +
 arch/arm64/kvm/hyp/include/nvhe/mm.h  |   2 -
 arch/arm64/kvm/hyp/nvhe/hyp-main.c|  12 +-
 arch/arm64/kvm/hyp/nvhe/mem_protect.c | 105 ++
 arch/arm64/kvm/hyp/nvhe/mm.c  |   4 +-
 arch/arm64/kvm/mmu.c  |  14 ++-
 7 files changed, 124 insertions(+), 16 deletions(-)

diff --git a/arch/arm64/include/asm/kvm_asm.h b/arch/arm64/include/asm/kvm_asm.h
index 9f0bf2109be7..78db818ae2c9 100644
--- a/arch/arm64/include/asm/kvm_asm.h
+++ b/arch/arm64/include/asm/kvm_asm.h
@@ -59,7 +59,7 @@
 #define __KVM_HOST_SMCCC_FUNC___vgic_v3_save_aprs  13
 #define __KVM_HOST_SMCCC_FUNC___vgic_v3_restore_aprs   14
 #define __KVM_HOST_SMCCC_FUNC___pkvm_init  15
-#define __KVM_HOST_SMCCC_FUNC___pkvm_create_mappings   16
+#define __KVM_HOST_SMCCC_FUNC___pkvm_host_share_hyp16
 #define __KVM_HOST_SMCCC_FUNC___pkvm_create_private_mapping17
 #define __KVM_HOST_SMCCC_FUNC___pkvm_cpu_set_vector18
 #define __KVM_HOST_SMCCC_FUNC___pkvm_prot_finalize 19
diff --git a/arch/arm64/kvm/hyp/include/nvhe/mem_protect.h 
b/arch/arm64/kvm/hyp/include/nvhe/mem_protect.h
index b39047463075..f37e4d3b831b 100644
--- a/arch/arm64/kvm/hyp/include/nvhe/mem_protect.h
+++ b/arch/arm64/kvm/hyp/include/nvhe/mem_protect.h
@@ -22,6 +22,7 @@ extern struct host_kvm host_kvm;
 
 int __pkvm_prot_finalize(void);
 int __pkvm_mark_hyp(phys_addr_t start, phys_addr_t end);
+int __pkvm_host_share_hyp(phys_addr_t start, phys_addr_t end);
 
 int host_stage2_idmap_locked(u64 start, u64 end, enum kvm_pgtable_prot prot);
 int kvm_host_prepare_stage2(void *pgt_pool_base);
diff --git a/arch/arm64/kvm/hyp/include/nvhe/mm.h 
b/arch/arm64/kvm/hyp/include/nvhe/mm.h
index c76d7136ed9b..c9a8f535212e 100644
--- a/arch/arm64/kvm/hyp/include/nvhe/mm.h
+++ b/arch/arm64/kvm/hyp/include/nvhe/mm.h
@@ -24,8 +24,6 @@ int hyp_back_vmemmap(phys_addr_t phys, unsigned long size, 
phys_addr_t back);
 int pkvm_cpu_set_vector(enum arm64_hyp_spectre_vector slot);
 int pkvm_create_mappings(void *from, void *to, enum kvm_pgtable_prot prot);
 int pkvm_create_mappings_locked(void *from, void *to, enum kvm_pgtable_prot 
prot);
-int __pkvm_create_mappings(unsigned long start, unsigned long size,
-  unsigned long phys, enum kvm_pgtable_prot prot);
 unsigned long __pkvm_create_private_mapping(phys_addr_t phys, size_t size,
enum kvm_pgtable_prot prot);
 
diff --git a/arch/arm64/kvm/hyp/nvhe/hyp-main.c 
b/arch/arm64/kvm/hyp/nvhe/hyp-main.c
index 1632f001f4ed..f05ecbd382d0 100644
--- a/arch/arm64/kvm/hyp/nvhe/hyp-main.c
+++ b/arch/arm64/kvm/hyp/nvhe/hyp-main.c
@@ -140,14 +140,12 @@ static void handle___pkvm_cpu_set_vector(struct 
kvm_cpu_context *host_ctxt)
cpu_reg(host_ctxt, 1) = pkvm_cpu_set_vector(slot);
 }
 
-static void handle___pkvm_create_mappings(struct kvm_cpu_context *host_ctxt)
+static void handle___pkvm_host_share_hyp(struct kvm_cpu_context *host_ctxt)
 {
-   DECLARE_REG(unsigned long, start, host_ctxt, 1);
-   DECLARE_REG(unsigned long, size, host_ctxt, 2);
-   DECLARE_REG(unsigned long, phys, host_ctxt, 3);
-   DECLARE_REG(enum kvm_pgtable_prot, prot, host_ctxt, 4);
+   DECLARE_REG(phys_addr_t, start, host_ctxt, 1);
+   DECLARE_REG(phys_addr_t, end, host_ctxt, 2);
 
-   cpu_reg(host_ctxt, 1) = __pkvm_create_mappings(start, size, phys, prot);
+   cpu_reg(host_ctxt, 1) = __pkvm_host_share_hyp(start, end);
 }
 
 static void handle___pkvm_create_private_mapping(struct kvm_cpu_context 
*host_ctxt)
@@ -193,7 +191,7 @@ static const hcall_t host_hcall[] = {
HANDLE_FUNC(__vgic_v3_restore_aprs),
HANDLE_FUNC(__pkvm_init),
HANDLE_FUNC(__pkvm_cpu_set_vector),
-   HANDLE_FUNC(__pkvm_create_mappings),
+   HANDLE_FUNC(__pkvm_host_share_hyp),
HANDLE_FUNC(__pkvm_create_private_mapping),
HANDLE_FUNC(__pkvm_prot_finalize),
HANDLE_FUNC(__pkvm_mark_hyp),
diff --git a/arch/arm64/kvm/hyp/nvhe/mem_protect.c 
b/arch/arm64/kvm/hyp/nvhe/mem_protect.c
index 6f28edf58407..20b3cb3fdc67 100644
--- a/arch/arm64/kvm/hyp/nvhe/mem_protect.c
+++ b/arch/arm64/kvm/hyp/nvhe/mem_protect.c
@@ -262,6 

[PATCH 09/14] KVM: arm64: Mark host bss and rodata section as shared

2021-07-19 Thread Quentin Perret
As the hypervisor maps the host's .bss and .rodata sections in its
stage-1, make sure to tag them as shared in hyp and host page-tables.

But since the hypervisor relies on the presence of these mappings, we
cannot let the host in complete control of the memory regions -- it
must not unshare or donate them to another entity for example. To
prevent this, let's transfer the ownership of those ranges to the
hypervisor itself, and share the page back with the host.

Signed-off-by: Quentin Perret 
---
 arch/arm64/kvm/hyp/include/nvhe/mem_protect.h |  1 +
 arch/arm64/kvm/hyp/nvhe/mem_protect.c |  7 ++-
 arch/arm64/kvm/hyp/nvhe/setup.c   | 52 ---
 3 files changed, 51 insertions(+), 9 deletions(-)

diff --git a/arch/arm64/kvm/hyp/include/nvhe/mem_protect.h 
b/arch/arm64/kvm/hyp/include/nvhe/mem_protect.h
index 9c227d87c36d..b39047463075 100644
--- a/arch/arm64/kvm/hyp/include/nvhe/mem_protect.h
+++ b/arch/arm64/kvm/hyp/include/nvhe/mem_protect.h
@@ -23,6 +23,7 @@ extern struct host_kvm host_kvm;
 int __pkvm_prot_finalize(void);
 int __pkvm_mark_hyp(phys_addr_t start, phys_addr_t end);
 
+int host_stage2_idmap_locked(u64 start, u64 end, enum kvm_pgtable_prot prot);
 int kvm_host_prepare_stage2(void *pgt_pool_base);
 void handle_host_mem_abort(struct kvm_cpu_context *host_ctxt);
 
diff --git a/arch/arm64/kvm/hyp/nvhe/mem_protect.c 
b/arch/arm64/kvm/hyp/nvhe/mem_protect.c
index cdace80d3e28..6f28edf58407 100644
--- a/arch/arm64/kvm/hyp/nvhe/mem_protect.c
+++ b/arch/arm64/kvm/hyp/nvhe/mem_protect.c
@@ -235,6 +235,11 @@ static bool host_stage2_want_pte_cb(u64 addr, u64 end, 
enum kvm_pgtable_prot pro
return prot != KVM_PGTABLE_PROT_RW;
 }
 
+int host_stage2_idmap_locked(u64 start, u64 end, enum kvm_pgtable_prot prot)
+{
+   return host_stage2_try(__host_stage2_idmap, start, end, prot);
+}
+
 static int host_stage2_idmap(u64 addr)
 {
enum kvm_pgtable_prot prot = KVM_PGTABLE_PROT_RW;
@@ -250,7 +255,7 @@ static int host_stage2_idmap(u64 addr)
if (ret)
goto unlock;
 
-   ret = host_stage2_try(__host_stage2_idmap, range.start, range.end, 
prot);
+   ret = host_stage2_idmap_locked(range.start, range.end, prot);
 unlock:
 	hyp_spin_unlock(&host_kvm.lock);
 
diff --git a/arch/arm64/kvm/hyp/nvhe/setup.c b/arch/arm64/kvm/hyp/nvhe/setup.c
index 0b574d106519..74dce83a6fad 100644
--- a/arch/arm64/kvm/hyp/nvhe/setup.c
+++ b/arch/arm64/kvm/hyp/nvhe/setup.c
@@ -83,10 +83,6 @@ static int recreate_hyp_mappings(phys_addr_t phys, unsigned 
long size,
if (ret)
return ret;
 
-   ret = pkvm_create_mappings(__start_rodata, __end_rodata, PAGE_HYP_RO);
-   if (ret)
-   return ret;
-
ret = pkvm_create_mappings(__hyp_rodata_start, __hyp_rodata_end, 
PAGE_HYP_RO);
if (ret)
return ret;
@@ -95,10 +91,6 @@ static int recreate_hyp_mappings(phys_addr_t phys, unsigned 
long size,
if (ret)
return ret;
 
-   ret = pkvm_create_mappings(__hyp_bss_end, __bss_stop, PAGE_HYP_RO);
-   if (ret)
-   return ret;
-
ret = pkvm_create_mappings(virt, virt + size, PAGE_HYP);
if (ret)
return ret;
@@ -117,6 +109,25 @@ static int recreate_hyp_mappings(phys_addr_t phys, 
unsigned long size,
return ret;
}
 
+   /*
+* Map the host's .bss and .rodata sections RO in the hypervisor, but
+* transfer the ownership from the host to the hypervisor itself to
+* make sure it can't be donated or shared with another entity.
+*
+* The ownership transition requires matching changes in the host
+* stage-2. This will be done later (see finalize_mappings()) once the
+* hyp_vmemmap is addressable.
+*/
+   ret = pkvm_create_mappings(__start_rodata, __end_rodata,
+  PAGE_HYP_RO | KVM_PGTABLE_STATE_SHARED);
+   if (ret)
+   return ret;
+
+   ret = pkvm_create_mappings(__hyp_bss_end, __bss_stop,
+  PAGE_HYP_RO | KVM_PGTABLE_STATE_SHARED);
+   if (ret)
+   return ret;
+
return 0;
 }
 
@@ -148,6 +159,27 @@ static void hpool_put_page(void *addr)
 	hyp_put_page(&hpool, addr);
 }
 
+static int finalize_mappings(void)
+{
+   enum kvm_pgtable_prot prot = KVM_PGTABLE_PROT_RWX;
+   int ret;
+
+   /*
+* The host's .bss and .rodata sections are now conceptually owned by
+* the hypervisor, so mark them as 'borrowed' in the host stage-2. We
+* can safely use host_stage2_idmap_locked() at this point since the
+* host stage-2 has not been enabled yet.
+*/
+   prot |= KVM_PGTABLE_STATE_SHARED | KVM_PGTABLE_STATE_BORROWED;
+   ret = host_stage2_idmap_locked(__hyp_pa(__start_rodata),
+  __hyp_pa(__end_rodata), prot);
+   if (ret)
+   return ret;
+
+  

[PATCH 07/14] KVM: arm64: Enable forcing page-level stage-2 mappings

2021-07-19 Thread Quentin Perret
Much of the stage-2 manipulation logic relies on being able to destroy
block mappings if e.g. installing a smaller mapping in the range. The
rationale for this behaviour is that stage-2 mappings can always be
re-created lazily. However, this gets more complicated when the stage-2
page-table is used to store metadata about the underlying pages. In such
a case, destroying a block mapping may lead to losing part of the
state, and confuse the user of those metadata (such as the hypervisor in
nVHE protected mode).

To fix this, introduce a callback function in the pgtable struct which
is called during all map operations to determine whether the mappings
can us blocks, or should be forced to page-granularity level. This is
used by the hypervisor when creating the host stage-2 to force
page-level mappings when using non-default protection attributes.
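
For illustration only, a minimal callback along those lines could look like
the sketch below. The function name and the choice of KVM_PGTABLE_PROT_RWX as
the "default" attributes are assumptions made for this example; the
host_stage2_want_pte_cb() introduced by this patch remains the authoritative
version:

	static bool example_want_pte_cb(u64 addr, u64 end,
					enum kvm_pgtable_prot prot)
	{
		/*
		 * Non-default attributes carry state that must not be lost
		 * to a block split, so ask for page-level mappings.
		 */
		return prot != KVM_PGTABLE_PROT_RWX;
	}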

Signed-off-by: Quentin Perret 
---
 arch/arm64/include/asm/kvm_pgtable.h  | 63 +--
 arch/arm64/kvm/hyp/nvhe/mem_protect.c | 16 +--
 arch/arm64/kvm/hyp/pgtable.c  | 20 +++--
 3 files changed, 69 insertions(+), 30 deletions(-)

diff --git a/arch/arm64/include/asm/kvm_pgtable.h 
b/arch/arm64/include/asm/kvm_pgtable.h
index af62203d2f7a..dd72653314c7 100644
--- a/arch/arm64/include/asm/kvm_pgtable.h
+++ b/arch/arm64/include/asm/kvm_pgtable.h
@@ -75,25 +75,6 @@ enum kvm_pgtable_stage2_flags {
KVM_PGTABLE_S2_IDMAP= BIT(1),
 };
 
-/**
- * struct kvm_pgtable - KVM page-table.
- * @ia_bits:   Maximum input address size, in bits.
- * @start_level:   Level at which the page-table walk starts.
- * @pgd:   Pointer to the first top-level entry of the page-table.
- * @mm_ops:Memory management callbacks.
- * @mmu:   Stage-2 KVM MMU struct. Unused for stage-1 page-tables.
- */
-struct kvm_pgtable {
-   u32 ia_bits;
-   u32 start_level;
-   kvm_pte_t   *pgd;
-   struct kvm_pgtable_mm_ops   *mm_ops;
-
-   /* Stage-2 only */
-   struct kvm_s2_mmu   *mmu;
-   enum kvm_pgtable_stage2_flags   flags;
-};
-
 /**
  * enum kvm_pgtable_prot - Page-table permissions and attributes.
  * @KVM_PGTABLE_PROT_X:Execute permission.
@@ -109,11 +90,41 @@ enum kvm_pgtable_prot {
KVM_PGTABLE_PROT_DEVICE = BIT(3),
 };
 
-#define PAGE_HYP   (KVM_PGTABLE_PROT_R | KVM_PGTABLE_PROT_W)
+#define KVM_PGTABLE_PROT_RW(KVM_PGTABLE_PROT_R | KVM_PGTABLE_PROT_W)
+#define KVM_PGTABLE_PROT_RWX   (KVM_PGTABLE_PROT_RW | KVM_PGTABLE_PROT_X)
+
+#define PAGE_HYP   KVM_PGTABLE_PROT_RW
 #define PAGE_HYP_EXEC  (KVM_PGTABLE_PROT_R | KVM_PGTABLE_PROT_X)
 #define PAGE_HYP_RO(KVM_PGTABLE_PROT_R)
 #define PAGE_HYP_DEVICE(PAGE_HYP | KVM_PGTABLE_PROT_DEVICE)
 
+typedef bool (*kvm_pgtable_want_pte_cb_t)(u64 addr, u64 end,
+ enum kvm_pgtable_prot prot);
+
+/**
+ * struct kvm_pgtable - KVM page-table.
+ * @ia_bits:   Maximum input address size, in bits.
+ * @start_level:   Level at which the page-table walk starts.
+ * @pgd:   Pointer to the first top-level entry of the page-table.
+ * @mm_ops:Memory management callbacks.
+ * @mmu:   Stage-2 KVM MMU struct. Unused for stage-1 page-tables.
+ * @flags: Stage-2 page-table flags.
+ * @want_pte_cb:   Callback function used during map operations to decide
+ * whether block mappings can be used to map the given IPA
+ * range.
+ */
+struct kvm_pgtable {
+   u32 ia_bits;
+   u32 start_level;
+   kvm_pte_t   *pgd;
+   struct kvm_pgtable_mm_ops   *mm_ops;
+
+   /* Stage-2 only */
+   struct kvm_s2_mmu   *mmu;
+   enum kvm_pgtable_stage2_flags   flags;
+   kvm_pgtable_want_pte_cb_t   want_pte_cb;
+};
+
 /**
  * struct kvm_mem_range - Range of Intermediate Physical Addresses
  * @start: Start of the range.
@@ -216,21 +227,25 @@ int kvm_pgtable_hyp_map(struct kvm_pgtable *pgt, u64 
addr, u64 size, u64 phys,
 u64 kvm_get_vtcr(u64 mmfr0, u64 mmfr1, u32 phys_shift);
 
 /**
- * kvm_pgtable_stage2_init_flags() - Initialise a guest stage-2 page-table.
+ * kvm_pgtable_stage2_init_full() - Initialise a guest stage-2 page-table.
  * @pgt:   Uninitialised page-table structure to initialise.
  * @arch:  Arch-specific KVM structure representing the guest virtual
  * machine.
  * @mm_ops:Memory management callbacks.
  * @flags: Stage-2 configuration flags.
+ * @want_pte_cb: Callback function used during map operations to decide
+ * whether block mappings can be used to 

[PATCH 10/14] KVM: arm64: Enable retrieving protections attributes of PTEs

2021-07-19 Thread Quentin Perret
Introduce helper functions in the KVM stage-2 and stage-1 page-table
manipulation library allowing to retrieve the enum kvm_pgtable_prot of a
PTE. This will be useful to implement custom walkers outside of
pgtable.c.
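
For illustration only, a custom walker could then use the new helper roughly
as follows (the walker name and the counting logic are made up for this
sketch, and kvm_pte_valid() only becomes usable outside pgtable.c with a
later patch in this series):

	static int count_shared_walker(u64 addr, u64 end, u32 level,
				       kvm_pte_t *ptep,
				       enum kvm_pgtable_walk_flags flag,
				       void * const arg)
	{
		u64 *nr_shared = arg;

		if (!kvm_pte_valid(*ptep))
			return 0;

		if (kvm_pgtable_stage2_pte_prot(*ptep) & KVM_PGTABLE_STATE_SHARED)
			(*nr_shared)++;

		return 0;
	}

	/* walked with .flags = KVM_PGTABLE_WALK_LEAF via kvm_pgtable_walk() */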

Signed-off-by: Quentin Perret 
---
 arch/arm64/include/asm/kvm_pgtable.h | 20 
 arch/arm64/kvm/hyp/pgtable.c | 49 
 2 files changed, 69 insertions(+)

diff --git a/arch/arm64/include/asm/kvm_pgtable.h 
b/arch/arm64/include/asm/kvm_pgtable.h
index f6d3d5c8910d..1aa49d6aabb7 100644
--- a/arch/arm64/include/asm/kvm_pgtable.h
+++ b/arch/arm64/include/asm/kvm_pgtable.h
@@ -467,4 +467,24 @@ int kvm_pgtable_walk(struct kvm_pgtable *pgt, u64 addr, 
u64 size,
  */
 int kvm_pgtable_stage2_find_range(struct kvm_pgtable *pgt, u64 addr,
  int owner, struct kvm_mem_range *range);
+
+/**
+ * kvm_pgtable_stage2_pte_prot() - Retrieve the protection attributes of a
+ *stage-2 Page-Table Entry.
+ * @pte:   Page-table entry
+ *
+ * Return: protection attributes of the page-table entry in the enum
+ *kvm_pgtable_prot format.
+ */
+enum kvm_pgtable_prot kvm_pgtable_stage2_pte_prot(kvm_pte_t pte);
+
+/**
+ * kvm_pgtable_hyp_pte_prot() - Retrieve the protection attributes of a stage-1
+ * Page-Table Entry.
+ * @pte:   Page-table entry
+ *
+ * Return: protection attributes of the page-table entry in the enum
+ *kvm_pgtable_prot format.
+ */
+enum kvm_pgtable_prot kvm_pgtable_hyp_pte_prot(kvm_pte_t pte);
 #endif /* __ARM64_KVM_PGTABLE_H__ */
diff --git a/arch/arm64/kvm/hyp/pgtable.c b/arch/arm64/kvm/hyp/pgtable.c
index 51598b79dafc..c7120797404a 100644
--- a/arch/arm64/kvm/hyp/pgtable.c
+++ b/arch/arm64/kvm/hyp/pgtable.c
@@ -234,6 +234,20 @@ static kvm_pte_t pte_ignored_bit_prot(enum 
kvm_pgtable_prot prot)
return FIELD_PREP(KVM_PTE_LEAF_ATTR_IGNORED, ignored_bits);
 }
 
+static enum kvm_pgtable_prot pte_read_ignored_bits_prot(kvm_pte_t pte)
+{
+   enum kvm_pgtable_prot prot = 0;
+   kvm_pte_t ignored_bits = 0;
+
+   ignored_bits = FIELD_GET(KVM_PTE_LEAF_ATTR_IGNORED, pte);
+   if (ignored_bits & BIT(1))
+   prot |= KVM_PGTABLE_STATE_BORROWED | KVM_PGTABLE_STATE_SHARED;
+   if (ignored_bits & BIT(0))
+   prot |= KVM_PGTABLE_STATE_SHARED;
+
+   return prot;
+}
+
 static int kvm_pgtable_visitor_cb(struct kvm_pgtable_walk_data *data, u64 addr,
  u32 level, kvm_pte_t *ptep,
  enum kvm_pgtable_walk_flags flag)
@@ -386,6 +400,25 @@ static int hyp_set_prot_attr(enum kvm_pgtable_prot prot, 
kvm_pte_t *ptep)
return 0;
 }
 
+enum kvm_pgtable_prot kvm_pgtable_hyp_pte_prot(kvm_pte_t pte)
+{
+   enum kvm_pgtable_prot prot = 0;
+   u32 ap;
+
+   if (!(pte & KVM_PTE_LEAF_ATTR_HI_S1_XN))
+   prot |= KVM_PGTABLE_PROT_X;
+
+   ap = FIELD_GET(KVM_PTE_LEAF_ATTR_LO_S1_AP, pte);
+   if (ap == KVM_PTE_LEAF_ATTR_LO_S1_AP_RO)
+   prot |= KVM_PGTABLE_PROT_R;
+   else if (ap == KVM_PTE_LEAF_ATTR_LO_S1_AP_RW)
+   prot |= KVM_PGTABLE_PROT_R | KVM_PGTABLE_PROT_W;
+
+   prot |= pte_read_ignored_bits_prot(pte);
+
+   return prot;
+}
+
 static bool hyp_pte_needs_update(kvm_pte_t old, kvm_pte_t new)
 {
if (old == new)
@@ -588,6 +621,22 @@ static int stage2_set_prot_attr(struct kvm_pgtable *pgt, 
enum kvm_pgtable_prot p
return 0;
 }
 
+enum kvm_pgtable_prot kvm_pgtable_stage2_pte_prot(kvm_pte_t pte)
+{
+   enum kvm_pgtable_prot prot = 0;
+
+   if (pte & KVM_PTE_LEAF_ATTR_LO_S2_S2AP_R)
+   prot |= KVM_PGTABLE_PROT_R;
+   if (pte & KVM_PTE_LEAF_ATTR_LO_S2_S2AP_W)
+   prot |= KVM_PGTABLE_PROT_W;
+   if (!(pte & KVM_PTE_LEAF_ATTR_HI_S2_XN))
+   prot |= KVM_PGTABLE_PROT_X;
+
+   prot |= pte_read_ignored_bits_prot(pte);
+
+   return prot;
+}
+
 static bool stage2_pte_needs_update(kvm_pte_t old, kvm_pte_t new)
 {
if (!kvm_pte_valid(old) || !kvm_pte_valid(new))
-- 
2.32.0.402.g57bb445576-goog

___
kvmarm mailing list
kvmarm@lists.cs.columbia.edu
https://lists.cs.columbia.edu/mailman/listinfo/kvmarm


[PATCH 14/14] KVM: arm64: Prevent late calls to __pkvm_create_private_mapping()

2021-07-19 Thread Quentin Perret
__pkvm_create_private_mapping() allows the host kernel to create
arbitrary mappings in the hypervisor's "private" range. However, this is
only needed early on, and there should be no good reason for the host
to need this past the point where the pkvm static key is set. Make sure to
stub the hypercall past this point to ensure it can't be used by a
malicious host.

Signed-off-by: Quentin Perret 
---
 arch/arm64/kvm/hyp/nvhe/hyp-main.c | 5 -
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/arch/arm64/kvm/hyp/nvhe/hyp-main.c 
b/arch/arm64/kvm/hyp/nvhe/hyp-main.c
index f05ecbd382d0..e1d12f8122a7 100644
--- a/arch/arm64/kvm/hyp/nvhe/hyp-main.c
+++ b/arch/arm64/kvm/hyp/nvhe/hyp-main.c
@@ -154,7 +154,10 @@ static void handle___pkvm_create_private_mapping(struct 
kvm_cpu_context *host_ct
DECLARE_REG(size_t, size, host_ctxt, 2);
DECLARE_REG(enum kvm_pgtable_prot, prot, host_ctxt, 3);
 
-   cpu_reg(host_ctxt, 1) = __pkvm_create_private_mapping(phys, size, prot);
+	if (static_branch_unlikely(&kvm_protected_mode_initialized))
+   cpu_reg(host_ctxt, 1) = -EPERM;
+   else
+   cpu_reg(host_ctxt, 1) = __pkvm_create_private_mapping(phys, 
size, prot);
 }
 
 static void handle___pkvm_prot_finalize(struct kvm_cpu_context *host_ctxt)
-- 
2.32.0.402.g57bb445576-goog

___
kvmarm mailing list
kvmarm@lists.cs.columbia.edu
https://lists.cs.columbia.edu/mailman/listinfo/kvmarm


[PATCH 02/14] KVM: arm64: Optimize kvm_pgtable_stage2_find_range()

2021-07-19 Thread Quentin Perret
The kvm_pgtable_stage2_find_range() function is used in the host memory
abort path to try and look for the largest block mapping that can be
used to map the faulting address. In order to do so, the function
currently walks the stage-2 page-table and looks for existing
incompatible mappings within the range of the largest possible block.
If incompatible mappings are found, it tries the same procedure again,
but using a smaller block range, and repeats until a matching range is
found (potentially up to page granularity). While this approach has
benefits (mostly in the fact that it proactively coalesces host stage-2
mappings), it can be slow if the ranges are fragmented, and it isn't
optimized to deal with CPUs faulting on the same IPA as all of them will
do all the work every time.

To avoid these issues, rework kvm_pgtable_stage2_find_range() by walking
the page-table only once to find the closest leaf to the input address,
and return its corresponding range if it is invalid and not owned by
another entity. If a valid leaf is found, return -EAGAIN similar to what
is done in the kvm_pgtable_stage2_map() path to optimize concurrent
faults.
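
For illustration only, a caller of the reworked function might handle the new
return convention roughly as sketched below (hypothetical code, based on the
host_stage2_idmap() hunk in this patch; treating -EAGAIN as a benign race is
one possible policy, not a requirement):

	ret = kvm_pgtable_stage2_find_range(&host_kvm.pgt, addr, 0, &range);
	if (ret == -EAGAIN)
		return 0;	/* a valid leaf already covers @addr */
	if (ret)
		return ret;
	/* @range now describes an invalid range, owned by id 0, around @addr */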

Signed-off-by: Quentin Perret 
---
 arch/arm64/include/asm/kvm_pgtable.h  | 12 ++---
 arch/arm64/kvm/hyp/nvhe/mem_protect.c |  2 +-
 arch/arm64/kvm/hyp/pgtable.c  | 74 +++
 3 files changed, 34 insertions(+), 54 deletions(-)

diff --git a/arch/arm64/include/asm/kvm_pgtable.h 
b/arch/arm64/include/asm/kvm_pgtable.h
index f004c0115d89..d6649352c8b3 100644
--- a/arch/arm64/include/asm/kvm_pgtable.h
+++ b/arch/arm64/include/asm/kvm_pgtable.h
@@ -434,21 +434,17 @@ int kvm_pgtable_walk(struct kvm_pgtable *pgt, u64 addr, 
u64 size,
 
 /**
  * kvm_pgtable_stage2_find_range() - Find a range of Intermediate Physical
- *  Addresses with compatible permission
- *  attributes.
+ *  Addresses with a given ownership.
  * @pgt:   Page-table structure initialised by kvm_pgtable_stage2_init*().
  * @addr:  Address that must be covered by the range.
- * @prot:  Protection attributes that the range must be compatible with.
+ * @owner: Expected owner of the pages of the range.
  * @range: Range structure used to limit the search space at call time and
  * that will hold the result.
  *
- * The offset of @addr within a page is ignored. An IPA is compatible with 
@prot
- * iff its corresponding stage-2 page-table entry has default ownership and, if
- * valid, is mapped with protection attributes identical to @prot.
+ * The offset of @addr within a page is ignored.
  *
  * Return: 0 on success, negative error code on failure.
  */
 int kvm_pgtable_stage2_find_range(struct kvm_pgtable *pgt, u64 addr,
- enum kvm_pgtable_prot prot,
- struct kvm_mem_range *range);
+ int owner, struct kvm_mem_range *range);
 #endif /* __ARM64_KVM_PGTABLE_H__ */
diff --git a/arch/arm64/kvm/hyp/nvhe/mem_protect.c 
b/arch/arm64/kvm/hyp/nvhe/mem_protect.c
index 56f2117c877b..58edc62be6f7 100644
--- a/arch/arm64/kvm/hyp/nvhe/mem_protect.c
+++ b/arch/arm64/kvm/hyp/nvhe/mem_protect.c
@@ -236,7 +236,7 @@ static int host_stage2_idmap(u64 addr)
prot |= KVM_PGTABLE_PROT_X;
 
 	hyp_spin_lock(&host_kvm.lock);
-	ret = kvm_pgtable_stage2_find_range(&host_kvm.pgt, addr, prot, &range);
+	ret = kvm_pgtable_stage2_find_range(&host_kvm.pgt, addr, 0, &range);
if (ret)
goto unlock;
 
diff --git a/arch/arm64/kvm/hyp/pgtable.c b/arch/arm64/kvm/hyp/pgtable.c
index 05321f4165e3..978f341d02ca 100644
--- a/arch/arm64/kvm/hyp/pgtable.c
+++ b/arch/arm64/kvm/hyp/pgtable.c
@@ -1103,76 +1103,60 @@ void kvm_pgtable_stage2_destroy(struct kvm_pgtable *pgt)
pgt->pgd = NULL;
 }
 
-#define KVM_PTE_LEAF_S2_COMPAT_MASK(KVM_PTE_LEAF_ATTR_S2_PERMS | \
-KVM_PTE_LEAF_ATTR_LO_S2_MEMATTR | \
-KVM_PTE_LEAF_ATTR_S2_IGNORED)
+struct stage2_check_permission_data {
+   u32 level;
+   u8 owner;
+};
 
 static int stage2_check_permission_walker(u64 addr, u64 end, u32 level,
  kvm_pte_t *ptep,
  enum kvm_pgtable_walk_flags flag,
  void * const arg)
 {
-   kvm_pte_t old_attr, pte = *ptep, *new_attr = arg;
+   struct stage2_check_permission_data *data = arg;
+   kvm_pte_t pte = *ptep;
 
-   /*
-* Compatible mappings are either invalid and owned by the page-table
-* owner (whose id is 0), or valid with matching permission attributes.
-*/
-   if (kvm_pte_valid(pte)) {
-   old_attr = pte & KVM_PTE_LEAF_S2_COMPAT_MASK;
-   if (old_attr != *new_attr)
-   return -EEXIST;
-   } else if (pte) {

[PATCH 03/14] KVM: arm64: Continue stage-2 map when re-creating mappings

2021-07-19 Thread Quentin Perret
The stage-2 map walkers currently return -EAGAIN when re-creating
identical mappings or only changing access permissions. This allows to
optimize mapping pages for concurrent (v)CPUs faulting on the same
page.

While this works as expected when touching one page-table leaf at a
time, this can lead to difficult situations when mapping larger ranges.
Indeed, a large map operation can fail in the middle if an existing
mapping is found in the range, even if it has compatible attributes,
hence leaving only half of the range mapped.

To avoid having to deal with such failures in the caller, don't
interrupt the map operation when hitting existing PTEs, but make sure to
still return -EAGAIN so that user_mem_abort() can mark the page dirty
when needed.

Cc: Yanan Wang 
Signed-off-by: Quentin Perret 
---
 arch/arm64/include/asm/kvm_pgtable.h |  2 +-
 arch/arm64/kvm/hyp/pgtable.c | 21 +
 2 files changed, 18 insertions(+), 5 deletions(-)

diff --git a/arch/arm64/include/asm/kvm_pgtable.h 
b/arch/arm64/include/asm/kvm_pgtable.h
index d6649352c8b3..af62203d2f7a 100644
--- a/arch/arm64/include/asm/kvm_pgtable.h
+++ b/arch/arm64/include/asm/kvm_pgtable.h
@@ -258,7 +258,7 @@ void kvm_pgtable_stage2_destroy(struct kvm_pgtable *pgt);
  * If device attributes are not explicitly requested in @prot, then the
  * mapping will be normal, cacheable.
  *
- * Note that the update of a valid leaf PTE in this function will be aborted,
+ * Note that the update of a valid leaf PTE in this function will be skipped,
  * if it's trying to recreate the exact same mapping or only change the access
  * permissions. Instead, the vCPU will exit one more time from guest if still
  * needed and then go through the path of relaxing permissions.
diff --git a/arch/arm64/kvm/hyp/pgtable.c b/arch/arm64/kvm/hyp/pgtable.c
index 978f341d02ca..bb73c5331b7c 100644
--- a/arch/arm64/kvm/hyp/pgtable.c
+++ b/arch/arm64/kvm/hyp/pgtable.c
@@ -475,6 +475,8 @@ struct stage2_map_data {
void*memcache;
 
struct kvm_pgtable_mm_ops   *mm_ops;
+
+   int ret;
 };
 
 u64 kvm_get_vtcr(u64 mmfr0, u64 mmfr1, u32 phys_shift)
@@ -612,8 +614,10 @@ static int stage2_map_walker_try_leaf(u64 addr, u64 end, 
u32 level,
 * the vCPU will exit one more time from guest if still needed
 * and then go through the path of relaxing permissions.
 */
-   if (!stage2_pte_needs_update(old, new))
-   return -EAGAIN;
+   if (!stage2_pte_needs_update(old, new)) {
+   data->ret = -EAGAIN;
+   goto out;
+   }
 
stage2_put_pte(ptep, data->mmu, addr, level, mm_ops);
}
@@ -629,6 +633,7 @@ static int stage2_map_walker_try_leaf(u64 addr, u64 end, 
u32 level,
smp_store_release(ptep, new);
if (stage2_pte_is_counted(new))
mm_ops->get_page(ptep);
+out:
if (kvm_phys_is_valid(phys))
data->phys += granule;
return 0;
@@ -771,6 +776,7 @@ int kvm_pgtable_stage2_map(struct kvm_pgtable *pgt, u64 
addr, u64 size,
.mmu= pgt->mmu,
.memcache   = mc,
.mm_ops = pgt->mm_ops,
+   .ret= 0,
};
struct kvm_pgtable_walker walker = {
.cb = stage2_map_walker,
@@ -789,7 +795,10 @@ int kvm_pgtable_stage2_map(struct kvm_pgtable *pgt, u64 
addr, u64 size,
 
 	ret = kvm_pgtable_walk(pgt, addr, size, &walker);
dsb(ishst);
-   return ret;
+   if (ret)
+   return ret;
+
+   return map_data.ret;
 }
 
 int kvm_pgtable_stage2_set_owner(struct kvm_pgtable *pgt, u64 addr, u64 size,
@@ -802,6 +811,7 @@ int kvm_pgtable_stage2_set_owner(struct kvm_pgtable *pgt, 
u64 addr, u64 size,
.memcache   = mc,
.mm_ops = pgt->mm_ops,
.owner_id   = owner_id,
+   .ret= 0,
};
struct kvm_pgtable_walker walker = {
.cb = stage2_map_walker,
@@ -815,7 +825,10 @@ int kvm_pgtable_stage2_set_owner(struct kvm_pgtable *pgt, 
u64 addr, u64 size,
return -EINVAL;
 
 	ret = kvm_pgtable_walk(pgt, addr, size, &walker);
-   return ret;
+   if (ret)
+   return ret;
+
+   return map_data.ret;
 }
 
 static int stage2_unmap_walker(u64 addr, u64 end, u32 level, kvm_pte_t *ptep,
-- 
2.32.0.402.g57bb445576-goog

___
kvmarm mailing list
kvmarm@lists.cs.columbia.edu
https://lists.cs.columbia.edu/mailman/listinfo/kvmarm


[PATCH 01/14] KVM: arm64: Provide the host_stage2_try() helper macro

2021-07-19 Thread Quentin Perret
We currently unmap all MMIO mappings from the host stage-2 to recycle
the pages whenever we run out. In order to make this pattern easy to
re-use from other places, factor the logic out into a dedicated macro.
While at it, apply the macro for the kvm_pgtable_stage2_set_owner()
calls. They're currently only called early on and are guaranteed to
succeed, but making them robust to the -ENOMEM case doesn't hurt and
will avoid painful debugging sessions later on.
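
For illustration, the resulting usage pattern (as in the hunks below, with the
call still made under the host_kvm lock) looks like:

	hyp_spin_lock(&host_kvm.lock);
	ret = host_stage2_try(__host_stage2_idmap, range.start, range.end, prot);
	hyp_spin_unlock(&host_kvm.lock);

The recycle-and-retry logic stays hidden behind the macro, so callers only
ever see a single return code.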

Signed-off-by: Quentin Perret 
---
 arch/arm64/kvm/hyp/nvhe/mem_protect.c | 38 ++-
 1 file changed, 20 insertions(+), 18 deletions(-)

diff --git a/arch/arm64/kvm/hyp/nvhe/mem_protect.c 
b/arch/arm64/kvm/hyp/nvhe/mem_protect.c
index d938ce95d3bd..56f2117c877b 100644
--- a/arch/arm64/kvm/hyp/nvhe/mem_protect.c
+++ b/arch/arm64/kvm/hyp/nvhe/mem_protect.c
@@ -208,6 +208,23 @@ static inline int __host_stage2_idmap(u64 start, u64 end,
 				  prot, &host_s2_pool);
 }
 
+/*
+ * The pool has been provided with enough pages to cover all of memory with
+ * page granularity, but it is difficult to know how much of the MMIO range
+ * we will need to cover upfront, so we may need to 'recycle' the pages if we
+ * run out.
+ */
+#define host_stage2_try(fn, ...)   \
+   ({  \
+   int __ret = fn(__VA_ARGS__);\
+   if (__ret == -ENOMEM) { \
+   __ret = host_stage2_unmap_dev_all();\
+   if (!__ret) \
+   __ret = fn(__VA_ARGS__);\
+   }   \
+   __ret;  \
+})
+
 static int host_stage2_idmap(u64 addr)
 {
enum kvm_pgtable_prot prot = KVM_PGTABLE_PROT_R | KVM_PGTABLE_PROT_W;
@@ -223,22 +240,7 @@ static int host_stage2_idmap(u64 addr)
if (ret)
goto unlock;
 
-   ret = __host_stage2_idmap(range.start, range.end, prot);
-   if (ret != -ENOMEM)
-   goto unlock;
-
-   /*
-* The pool has been provided with enough pages to cover all of memory
-* with page granularity, but it is difficult to know how much of the
-* MMIO range we will need to cover upfront, so we may need to 'recycle'
-* the pages if we run out.
-*/
-   ret = host_stage2_unmap_dev_all();
-   if (ret)
-   goto unlock;
-
-   ret = __host_stage2_idmap(range.start, range.end, prot);
-
+   ret = host_stage2_try(__host_stage2_idmap, range.start, range.end, 
prot);
 unlock:
 	hyp_spin_unlock(&host_kvm.lock);
 
@@ -257,8 +259,8 @@ int __pkvm_mark_hyp(phys_addr_t start, phys_addr_t end)
return -EINVAL;
 
 	hyp_spin_lock(&host_kvm.lock);
-	ret = kvm_pgtable_stage2_set_owner(&host_kvm.pgt, start, end - start,
-					   &host_s2_pool, pkvm_hyp_id);
+	ret = host_stage2_try(kvm_pgtable_stage2_set_owner, &host_kvm.pgt,
+			      start, end - start, &host_s2_pool, pkvm_hyp_id);
 	hyp_spin_unlock(&host_kvm.lock);
 
return ret != -EAGAIN ? ret : 0;
-- 
2.32.0.402.g57bb445576-goog

___
kvmarm mailing list
kvmarm@lists.cs.columbia.edu
https://lists.cs.columbia.edu/mailman/listinfo/kvmarm


[PATCH 00/14] Track shared pages at EL2 in protected mode

2021-07-19 Thread Quentin Perret
Hi all,

This series aims to improve how the nVHE hypervisor tracks ownership of memory
pages when running in protected mode ("kvm-arm.mode=protected" on the kernel
command line).

The main issue with the existing ownership tracking code is that it is
completely binary: a page is either owned by an entity (e.g. the host) or not.
However, we'll need something smarter to track shared pages, as is needed for
virtio, or even just host/hypervisor communications.

This series introduces a few changes to the kvm page-table library to allow
annotating shared pages in ignored bits (a.k.a. software bits) of leaf entries,
and makes use of that infrastructure to track all pages that are shared between
the host and the hypervisor. We will obviously want to apply the same treatment
to guest stage-2 page-tables, but that is not really possible to do until EL2
manages them directly, so I'll keep that for another series.
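
For readers skimming the series, the per-page sharing state ends up being
expressed as two new kvm_pgtable_prot flags (patch 08), which the page-table
code encodes into the first two software bits ([56:55]) of each leaf entry:

	KVM_PGTABLE_STATE_SHARED	= BIT(4),	/* shared with another entity */
	KVM_PGTABLE_STATE_BORROWED	= BIT(5),	/* borrowed from another entity */

with the following encoding in the PTE:

	00: owned solely by the page-table owner
	01: owned by the page-table owner, but shared with another entity
	10: shared with, but not owned by, the page-table owner
	11: reserved for future use (lending)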

The series is split as follows:

 - patches 01-04 are essentially cleanups and optimizations of existing code
   paths that might be relevant on their own, but also prepare the ground for
   the rest of the series;

 - patches 05-08 introduce support in the page-table library for annotating
   shared pages with software bits;

 - patches 09-14 make use of the previously introduced infrastructure to
   annotate all pages shared between the host and the hypervisor;

The series is based on the latest kvmarm/fixes branch, and has been tested
on AML-S905X-CC (Le Potato) and using various Qemu configurations.

Thanks!
Quentin

Quentin Perret (14):
  KVM: arm64: Provide the host_stage2_try() helper macro
  KVM: arm64: Optimize kvm_pgtable_stage2_find_range()
  KVM: arm64: Continue stage-2 map when re-creating mappings
  KVM: arm64: Rename KVM_PTE_LEAF_ATTR_S2_IGNORED
  KVM: arm64: Don't overwrite ignored bits with owner id
  KVM: arm64: Tolerate re-creating hyp mappings to set ignored bits
  KVM: arm64: Enable forcing page-level stage-2 mappings
  KVM: arm64: Add support for tagging shared pages in page-table
  KVM: arm64: Mark host bss and rodata section as shared
  KVM: arm64: Enable retrieving protections attributes of PTEs
  KVM: arm64: Expose kvm_pte_valid() helper
  KVM: arm64: Refactor pkvm_pgtable locking
  KVM: arm64: Restrict hyp stage-1 manipulation in protected mode
  KVM: arm64: Prevent late calls to __pkvm_create_private_mapping()

 arch/arm64/include/asm/kvm_asm.h  |   2 +-
 arch/arm64/include/asm/kvm_pgtable.h  | 109 ++---
 arch/arm64/kvm/hyp/include/nvhe/mem_protect.h |   2 +
 arch/arm64/kvm/hyp/include/nvhe/mm.h  |   3 +-
 arch/arm64/kvm/hyp/nvhe/hyp-main.c|  17 +-
 arch/arm64/kvm/hyp/nvhe/mem_protect.c | 156 +++--
 arch/arm64/kvm/hyp/nvhe/mm.c  |  20 +-
 arch/arm64/kvm/hyp/nvhe/setup.c   |  52 -
 arch/arm64/kvm/hyp/pgtable.c  | 219 +-
 arch/arm64/kvm/mmu.c  |  14 +-
 10 files changed, 454 insertions(+), 140 deletions(-)

-- 
2.32.0.402.g57bb445576-goog

___
kvmarm mailing list
kvmarm@lists.cs.columbia.edu
https://lists.cs.columbia.edu/mailman/listinfo/kvmarm


[PATCH 08/14] KVM: arm64: Add support for tagging shared pages in page-table

2021-07-19 Thread Quentin Perret
The hypervisor will soon be in charge of tracking ownership of all
memory pages in the system. The current page-tracking infrastructure at
EL2 only allows binary states: a page is either owned or not by an
entity. But a number of use-cases will require more complex states for
pages that are shared between two entities (host, hypervisor, or guests).

In preparation for supporting these use-cases, introduce in the KVM
page-table library some infrastructure allowing to tag shared pages
using ignored bits (a.k.a. software bits) in PTEs.
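
For illustration, a caller simply ORs the new flags into the prot value it
passes to the existing map functions; for instance (anticipating how a later
patch in this series uses it):

	/* hyp stage-1: map the host's rodata RO and tag it as shared */
	ret = pkvm_create_mappings(__start_rodata, __end_rodata,
				   PAGE_HYP_RO | KVM_PGTABLE_STATE_SHARED);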

Signed-off-by: Quentin Perret 
---
 arch/arm64/include/asm/kvm_pgtable.h |  5 +
 arch/arm64/kvm/hyp/pgtable.c | 25 +
 2 files changed, 30 insertions(+)

diff --git a/arch/arm64/include/asm/kvm_pgtable.h 
b/arch/arm64/include/asm/kvm_pgtable.h
index dd72653314c7..f6d3d5c8910d 100644
--- a/arch/arm64/include/asm/kvm_pgtable.h
+++ b/arch/arm64/include/asm/kvm_pgtable.h
@@ -81,6 +81,8 @@ enum kvm_pgtable_stage2_flags {
  * @KVM_PGTABLE_PROT_W:Write permission.
  * @KVM_PGTABLE_PROT_R:Read permission.
  * @KVM_PGTABLE_PROT_DEVICE:   Device attributes.
+ * @KVM_PGTABLE_STATE_SHARED:  Page shared with another entity.
+ * @KVM_PGTABLE_STATE_BORROWED:Page borrowed from another entity.
  */
 enum kvm_pgtable_prot {
KVM_PGTABLE_PROT_X  = BIT(0),
@@ -88,6 +90,9 @@ enum kvm_pgtable_prot {
KVM_PGTABLE_PROT_R  = BIT(2),
 
KVM_PGTABLE_PROT_DEVICE = BIT(3),
+
+   KVM_PGTABLE_STATE_SHARED= BIT(4),
+   KVM_PGTABLE_STATE_BORROWED  = BIT(5),
 };
 
 #define KVM_PGTABLE_PROT_RW(KVM_PGTABLE_PROT_R | KVM_PGTABLE_PROT_W)
diff --git a/arch/arm64/kvm/hyp/pgtable.c b/arch/arm64/kvm/hyp/pgtable.c
index 5bdbe7a31551..51598b79dafc 100644
--- a/arch/arm64/kvm/hyp/pgtable.c
+++ b/arch/arm64/kvm/hyp/pgtable.c
@@ -211,6 +211,29 @@ static kvm_pte_t kvm_init_invalid_leaf_owner(u8 owner_id)
return FIELD_PREP(KVM_INVALID_PTE_OWNER_MASK, owner_id);
 }
 
+static kvm_pte_t pte_ignored_bit_prot(enum kvm_pgtable_prot prot)
+{
+   kvm_pte_t ignored_bits = 0;
+
+   /*
+* Ignored bits 0 and 1 are reserved to track the memory ownership
+* state of each page:
+*   00: The page is owned solely by the page-table owner.
+*   01: The page is owned by the page-table owner, but is shared
+*   with another entity.
+*   10: The page is shared with, but not owned by the page-table owner.
+*   11: Reserved for future use (lending).
+*/
+   if (prot & KVM_PGTABLE_STATE_SHARED) {
+   if (prot & KVM_PGTABLE_STATE_BORROWED)
+   ignored_bits |= BIT(1);
+   else
+   ignored_bits |= BIT(0);
+   }
+
+   return FIELD_PREP(KVM_PTE_LEAF_ATTR_IGNORED, ignored_bits);
+}
+
 static int kvm_pgtable_visitor_cb(struct kvm_pgtable_walk_data *data, u64 addr,
  u32 level, kvm_pte_t *ptep,
  enum kvm_pgtable_walk_flags flag)
@@ -357,6 +380,7 @@ static int hyp_set_prot_attr(enum kvm_pgtable_prot prot, 
kvm_pte_t *ptep)
attr |= FIELD_PREP(KVM_PTE_LEAF_ATTR_LO_S1_AP, ap);
attr |= FIELD_PREP(KVM_PTE_LEAF_ATTR_LO_S1_SH, sh);
attr |= KVM_PTE_LEAF_ATTR_LO_S1_AF;
+   attr |= pte_ignored_bit_prot(prot);
*ptep = attr;
 
return 0;
@@ -558,6 +582,7 @@ static int stage2_set_prot_attr(struct kvm_pgtable *pgt, 
enum kvm_pgtable_prot p
 
attr |= FIELD_PREP(KVM_PTE_LEAF_ATTR_LO_S2_SH, sh);
attr |= KVM_PTE_LEAF_ATTR_LO_S2_AF;
+   attr |= pte_ignored_bit_prot(prot);
*ptep = attr;
 
return 0;
-- 
2.32.0.402.g57bb445576-goog

___
kvmarm mailing list
kvmarm@lists.cs.columbia.edu
https://lists.cs.columbia.edu/mailman/listinfo/kvmarm


[PATCH 06/14] KVM: arm64: Tolerate re-creating hyp mappings to set ignored bits

2021-07-19 Thread Quentin Perret
The current hypervisor stage-1 mapping code doesn't allow changing an
existing valid mapping. Relax this condition by allowing changes that
only target ignored bits, as that will soon be needed to annotate shared
pages.

Signed-off-by: Quentin Perret 
---
 arch/arm64/kvm/hyp/pgtable.c | 18 --
 1 file changed, 16 insertions(+), 2 deletions(-)

diff --git a/arch/arm64/kvm/hyp/pgtable.c b/arch/arm64/kvm/hyp/pgtable.c
index a0ac8c2bc174..34cf67997a82 100644
--- a/arch/arm64/kvm/hyp/pgtable.c
+++ b/arch/arm64/kvm/hyp/pgtable.c
@@ -362,6 +362,17 @@ static int hyp_set_prot_attr(enum kvm_pgtable_prot prot, 
kvm_pte_t *ptep)
return 0;
 }
 
+static bool hyp_pte_needs_update(kvm_pte_t old, kvm_pte_t new)
+{
+   if (old == new)
+   return false;
+
+   if (!kvm_pte_valid(old))
+   return true;
+
+   return !WARN_ON((old ^ new) & ~KVM_PTE_LEAF_ATTR_IGNORED);
+}
+
 static bool hyp_map_walker_try_leaf(u64 addr, u64 end, u32 level,
kvm_pte_t *ptep, struct hyp_map_data *data)
 {
@@ -371,9 +382,12 @@ static bool hyp_map_walker_try_leaf(u64 addr, u64 end, u32 
level,
if (!kvm_block_mapping_supported(addr, end, phys, level))
return false;
 
-   /* Tolerate KVM recreating the exact same mapping */
+   /*
+* Tolerate KVM recreating the exact same mapping, or changing ignored
+* bits.
+*/
new = kvm_init_valid_leaf_pte(phys, data->attr, level);
-   if (old != new && !WARN_ON(kvm_pte_valid(old)))
+   if (hyp_pte_needs_update(old, new))
smp_store_release(ptep, new);
 
data->phys += granule;
-- 
2.32.0.402.g57bb445576-goog

___
kvmarm mailing list
kvmarm@lists.cs.columbia.edu
https://lists.cs.columbia.edu/mailman/listinfo/kvmarm


[PATCH 04/14] KVM: arm64: Rename KVM_PTE_LEAF_ATTR_S2_IGNORED

2021-07-19 Thread Quentin Perret
The ignored bits for both stage-1 and stage-2 page and block
descriptors are in [58:55], so rename KVM_PTE_LEAF_ATTR_S2_IGNORED to
make it applicable to both.

Signed-off-by: Quentin Perret 
---
 arch/arm64/kvm/hyp/pgtable.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/arch/arm64/kvm/hyp/pgtable.c b/arch/arm64/kvm/hyp/pgtable.c
index bb73c5331b7c..a60653cbd8e5 100644
--- a/arch/arm64/kvm/hyp/pgtable.c
+++ b/arch/arm64/kvm/hyp/pgtable.c
@@ -40,6 +40,8 @@
 
 #define KVM_PTE_LEAF_ATTR_HI   GENMASK(63, 51)
 
+#define KVM_PTE_LEAF_ATTR_IGNORED  GENMASK(58, 55)
+
 #define KVM_PTE_LEAF_ATTR_HI_S1_XN BIT(54)
 
 #define KVM_PTE_LEAF_ATTR_HI_S2_XN BIT(54)
@@ -48,8 +50,6 @@
 KVM_PTE_LEAF_ATTR_LO_S2_S2AP_W | \
 KVM_PTE_LEAF_ATTR_HI_S2_XN)
 
-#define KVM_PTE_LEAF_ATTR_S2_IGNORED   GENMASK(58, 55)
-
 #define KVM_INVALID_PTE_OWNER_MASK GENMASK(63, 56)
 #define KVM_MAX_OWNER_ID   1
 
-- 
2.32.0.402.g57bb445576-goog

___
kvmarm mailing list
kvmarm@lists.cs.columbia.edu
https://lists.cs.columbia.edu/mailman/listinfo/kvmarm


[PATCH 05/14] KVM: arm64: Don't overwrite ignored bits with owner id

2021-07-19 Thread Quentin Perret
The nVHE protected mode uses invalid mappings in the host stage-2
page-table to track the owner of each page in the system. In order to
allow the usage of ignored bits (a.k.a. software bits) in these
mappings, move the owner encoding away from the top bits.

Signed-off-by: Quentin Perret 
---
 arch/arm64/kvm/hyp/pgtable.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/arm64/kvm/hyp/pgtable.c b/arch/arm64/kvm/hyp/pgtable.c
index a60653cbd8e5..a0ac8c2bc174 100644
--- a/arch/arm64/kvm/hyp/pgtable.c
+++ b/arch/arm64/kvm/hyp/pgtable.c
@@ -50,7 +50,7 @@
 KVM_PTE_LEAF_ATTR_LO_S2_S2AP_W | \
 KVM_PTE_LEAF_ATTR_HI_S2_XN)
 
-#define KVM_INVALID_PTE_OWNER_MASK GENMASK(63, 56)
+#define KVM_INVALID_PTE_OWNER_MASK GENMASK(9, 2)
 #define KVM_MAX_OWNER_ID   1
 
 struct kvm_pgtable_walk_data {
-- 
2.32.0.402.g57bb445576-goog

___
kvmarm mailing list
kvmarm@lists.cs.columbia.edu
https://lists.cs.columbia.edu/mailman/listinfo/kvmarm


Re: [PATCH 08/14] KVM: arm64: Add support for tagging shared pages in page-table

2021-07-19 Thread Marc Zyngier
On Mon, 19 Jul 2021 11:47:29 +0100,
Quentin Perret  wrote:
> 
> The hypervisor will soon be in charge of tracking ownership of all
> memory pages in the system. The current page-tracking infrastructure at
> EL2 only allows binary states: a page is either owned or not by an
> entity. But a number of use-cases will require more complex states for
> pages that are shared between two entities (host, hypervisor, or guests).
> 
> In preparation for supporting these use-cases, introduce in the KVM
> page-table library some infrastructure allowing to tag shared pages
> using ignored bits (a.k.a. software bits) in PTEs.
> 
> Signed-off-by: Quentin Perret 
> ---
>  arch/arm64/include/asm/kvm_pgtable.h |  5 +
>  arch/arm64/kvm/hyp/pgtable.c | 25 +
>  2 files changed, 30 insertions(+)
> 
> diff --git a/arch/arm64/include/asm/kvm_pgtable.h 
> b/arch/arm64/include/asm/kvm_pgtable.h
> index dd72653314c7..f6d3d5c8910d 100644
> --- a/arch/arm64/include/asm/kvm_pgtable.h
> +++ b/arch/arm64/include/asm/kvm_pgtable.h
> @@ -81,6 +81,8 @@ enum kvm_pgtable_stage2_flags {
>   * @KVM_PGTABLE_PROT_W:  Write permission.
>   * @KVM_PGTABLE_PROT_R:  Read permission.
>   * @KVM_PGTABLE_PROT_DEVICE: Device attributes.
> + * @KVM_PGTABLE_STATE_SHARED:Page shared with another entity.
> + * @KVM_PGTABLE_STATE_BORROWED:  Page borrowed from another entity.
>   */
>  enum kvm_pgtable_prot {
>   KVM_PGTABLE_PROT_X  = BIT(0),
> @@ -88,6 +90,9 @@ enum kvm_pgtable_prot {
>   KVM_PGTABLE_PROT_R  = BIT(2),
>  
>   KVM_PGTABLE_PROT_DEVICE = BIT(3),
> +
> + KVM_PGTABLE_STATE_SHARED= BIT(4),
> + KVM_PGTABLE_STATE_BORROWED  = BIT(5),

I'd rather have some indirection here, as we have other potential
users for the SW bits outside of pKVM (see the NV series, which uses
some of these SW bits as the backend for TTL-based TLB invalidation).

Can we instead only describe the SW bit states in this enum, and let
the users map the semantic they require onto that state? See [1] for
what I carry in the NV branch.

>  };
>  
>  #define KVM_PGTABLE_PROT_RW  (KVM_PGTABLE_PROT_R | KVM_PGTABLE_PROT_W)
> diff --git a/arch/arm64/kvm/hyp/pgtable.c b/arch/arm64/kvm/hyp/pgtable.c
> index 5bdbe7a31551..51598b79dafc 100644
> --- a/arch/arm64/kvm/hyp/pgtable.c
> +++ b/arch/arm64/kvm/hyp/pgtable.c
> @@ -211,6 +211,29 @@ static kvm_pte_t kvm_init_invalid_leaf_owner(u8 owner_id)
>   return FIELD_PREP(KVM_INVALID_PTE_OWNER_MASK, owner_id);
>  }
>  
> +static kvm_pte_t pte_ignored_bit_prot(enum kvm_pgtable_prot prot)

Can we call these sw rather than ignored?

> +{
> + kvm_pte_t ignored_bits = 0;
> +
> + /*
> +  * Ignored bits 0 and 1 are reserved to track the memory ownership
> +  * state of each page:
> +  *   00: The page is owned solely by the page-table owner.
> +  *   01: The page is owned by the page-table owner, but is shared
> +  *   with another entity.
> +  *   10: The page is shared with, but not owned by the page-table owner.
> +  *   11: Reserved for future use (lending).
> +  */
> + if (prot & KVM_PGTABLE_STATE_SHARED) {
> + if (prot & KVM_PGTABLE_STATE_BORROWED)
> + ignored_bits |= BIT(1);
> + else
> + ignored_bits |= BIT(0);
> + }
> +
> + return FIELD_PREP(KVM_PTE_LEAF_ATTR_IGNORED, ignored_bits);
> +}
> +
>  static int kvm_pgtable_visitor_cb(struct kvm_pgtable_walk_data *data, u64 
> addr,
> u32 level, kvm_pte_t *ptep,
> enum kvm_pgtable_walk_flags flag)
> @@ -357,6 +380,7 @@ static int hyp_set_prot_attr(enum kvm_pgtable_prot prot, 
> kvm_pte_t *ptep)
>   attr |= FIELD_PREP(KVM_PTE_LEAF_ATTR_LO_S1_AP, ap);
>   attr |= FIELD_PREP(KVM_PTE_LEAF_ATTR_LO_S1_SH, sh);
>   attr |= KVM_PTE_LEAF_ATTR_LO_S1_AF;
> + attr |= pte_ignored_bit_prot(prot);
>   *ptep = attr;
>  
>   return 0;
> @@ -558,6 +582,7 @@ static int stage2_set_prot_attr(struct kvm_pgtable *pgt, 
> enum kvm_pgtable_prot p
>  
>   attr |= FIELD_PREP(KVM_PTE_LEAF_ATTR_LO_S2_SH, sh);
>   attr |= KVM_PTE_LEAF_ATTR_LO_S2_AF;
> + attr |= pte_ignored_bit_prot(prot);
>   *ptep = attr;
>  
>   return 0;

How about kvm_pgtable_stage2_relax_perms()?

Thanks,

M.

[1] 
https://git.kernel.org/pub/scm/linux/kernel/git/maz/arm-platforms.git/commit/?h=kvm-arm64/nv-5.13=5dea6d82de76cfcda59818ec2532fc34c615db39

-- 
Without deviation from the norm, progress is not possible.
___
kvmarm mailing list
kvmarm@lists.cs.columbia.edu
https://lists.cs.columbia.edu/mailman/listinfo/kvmarm


Re: [PATCH 07/14] KVM: arm64: Enable forcing page-level stage-2 mappings

2021-07-19 Thread Marc Zyngier
On Mon, 19 Jul 2021 11:47:28 +0100,
Quentin Perret  wrote:
> 
> Much of the stage-2 manipulation logic relies on being able to destroy
> block mappings if e.g. installing a smaller mapping in the range. The
> rationale for this behaviour is that stage-2 mappings can always be
> re-created lazily. However, this gets more complicated when the stage-2
> page-table is used to store metadata about the underlying pages. In such
> a case, destroying a block mapping may lead to losing part of the
> state, and confuse the user of those metadata (such as the hypervisor in
> nVHE protected mode).
> 
> To fix this, introduce a callback function in the pgtable struct which
> is called during all map operations to determine whether the mappings
> can us blocks, or should be forced to page-granularity level. This is

nit: use?

> used by the hypervisor when creating the host stage-2 to force
> page-level mappings when using non-default protection attributes.
> 
> Signed-off-by: Quentin Perret 
> ---
>  arch/arm64/include/asm/kvm_pgtable.h  | 63 +--
>  arch/arm64/kvm/hyp/nvhe/mem_protect.c | 16 +--
>  arch/arm64/kvm/hyp/pgtable.c  | 20 +++--
>  3 files changed, 69 insertions(+), 30 deletions(-)
> 
> diff --git a/arch/arm64/include/asm/kvm_pgtable.h 
> b/arch/arm64/include/asm/kvm_pgtable.h
> index af62203d2f7a..dd72653314c7 100644
> --- a/arch/arm64/include/asm/kvm_pgtable.h
> +++ b/arch/arm64/include/asm/kvm_pgtable.h
> @@ -75,25 +75,6 @@ enum kvm_pgtable_stage2_flags {
>   KVM_PGTABLE_S2_IDMAP= BIT(1),
>  };
>  
> -/**
> - * struct kvm_pgtable - KVM page-table.
> - * @ia_bits: Maximum input address size, in bits.
> - * @start_level: Level at which the page-table walk starts.
> - * @pgd: Pointer to the first top-level entry of the page-table.
> - * @mm_ops:  Memory management callbacks.
> - * @mmu: Stage-2 KVM MMU struct. Unused for stage-1 page-tables.
> - */
> -struct kvm_pgtable {
> - u32 ia_bits;
> - u32 start_level;
> - kvm_pte_t   *pgd;
> - struct kvm_pgtable_mm_ops   *mm_ops;
> -
> - /* Stage-2 only */
> - struct kvm_s2_mmu   *mmu;
> - enum kvm_pgtable_stage2_flags   flags;
> -};
> -
>  /**
>   * enum kvm_pgtable_prot - Page-table permissions and attributes.
>   * @KVM_PGTABLE_PROT_X:  Execute permission.
> @@ -109,11 +90,41 @@ enum kvm_pgtable_prot {
>   KVM_PGTABLE_PROT_DEVICE = BIT(3),
>  };
>  
> -#define PAGE_HYP (KVM_PGTABLE_PROT_R | KVM_PGTABLE_PROT_W)
> +#define KVM_PGTABLE_PROT_RW  (KVM_PGTABLE_PROT_R | KVM_PGTABLE_PROT_W)
> +#define KVM_PGTABLE_PROT_RWX (KVM_PGTABLE_PROT_RW | KVM_PGTABLE_PROT_X)
> +
> +#define PAGE_HYP KVM_PGTABLE_PROT_RW
>  #define PAGE_HYP_EXEC(KVM_PGTABLE_PROT_R | 
> KVM_PGTABLE_PROT_X)
>  #define PAGE_HYP_RO  (KVM_PGTABLE_PROT_R)
>  #define PAGE_HYP_DEVICE  (PAGE_HYP | KVM_PGTABLE_PROT_DEVICE)
>  
> +typedef bool (*kvm_pgtable_want_pte_cb_t)(u64 addr, u64 end,
> +   enum kvm_pgtable_prot prot);
> +
> +/**
> + * struct kvm_pgtable - KVM page-table.
> + * @ia_bits: Maximum input address size, in bits.
> + * @start_level: Level at which the page-table walk starts.
> + * @pgd: Pointer to the first top-level entry of the page-table.
> + * @mm_ops:  Memory management callbacks.
> + * @mmu: Stage-2 KVM MMU struct. Unused for stage-1 page-tables.
> + * @flags:   Stage-2 page-table flags.
> + * @want_pte_cb: Callback function used during map operations to decide
> + *   whether block mappings can be used to map the given IPA
> + *   range.
> + */
> +struct kvm_pgtable {
> + u32 ia_bits;
> + u32 start_level;
> + kvm_pte_t   *pgd;
> + struct kvm_pgtable_mm_ops   *mm_ops;
> +
> + /* Stage-2 only */
> + struct kvm_s2_mmu   *mmu;
> + enum kvm_pgtable_stage2_flags   flags;
> + kvm_pgtable_want_pte_cb_t   want_pte_cb;
> +};

nit: does this whole definition really need to move around?

> +
>  /**
>   * struct kvm_mem_range - Range of Intermediate Physical Addresses
>   * @start:   Start of the range.
> @@ -216,21 +227,25 @@ int kvm_pgtable_hyp_map(struct kvm_pgtable *pgt, u64 
> addr, u64 size, u64 phys,
>  u64 kvm_get_vtcr(u64 mmfr0, u64 mmfr1, u32 phys_shift);
>  
>  /**
> - * kvm_pgtable_stage2_init_flags() - Initialise a guest stage-2 page-table.
> + * kvm_pgtable_stage2_init_full() - Initialise a guest stage-2 page-table.
>   * @pgt: Uninitialised page-table structure to initialise.
>   * @arch:Arch-specific KVM structure 

Re: [PATCH 05/14] KVM: arm64: Don't overwrite ignored bits with owner id

2021-07-19 Thread Marc Zyngier
On Mon, 19 Jul 2021 11:47:26 +0100,
Quentin Perret  wrote:
> 
> The nVHE protected mode uses invalid mappings in the host stage-2
> page-table to track the owner of each page in the system. In order to
> allow the usage of ignored bits (a.k.a. software bits) in these
> mappings, move the owner encoding away from the top bits.

But that's exactly what the current situation is allowing: the use of
the SW bits. I am guessing that what you really mean is that you want
to *preserve* the SW bits from an existing mapping and use other bits
when the mapping is invalid?

Thanks,

M.

> 
> Signed-off-by: Quentin Perret 
> ---
>  arch/arm64/kvm/hyp/pgtable.c | 2 +-
>  1 file changed, 1 insertion(+), 1 deletion(-)
> 
> diff --git a/arch/arm64/kvm/hyp/pgtable.c b/arch/arm64/kvm/hyp/pgtable.c
> index a60653cbd8e5..a0ac8c2bc174 100644
> --- a/arch/arm64/kvm/hyp/pgtable.c
> +++ b/arch/arm64/kvm/hyp/pgtable.c
> @@ -50,7 +50,7 @@
>KVM_PTE_LEAF_ATTR_LO_S2_S2AP_W | \
>KVM_PTE_LEAF_ATTR_HI_S2_XN)
>  
> -#define KVM_INVALID_PTE_OWNER_MASK   GENMASK(63, 56)
> +#define KVM_INVALID_PTE_OWNER_MASK   GENMASK(9, 2)
>  #define KVM_MAX_OWNER_ID 1
>  
>  struct kvm_pgtable_walk_data {
> -- 
> 2.32.0.402.g57bb445576-goog
> 
> 

-- 
Without deviation from the norm, progress is not possible.
___
kvmarm mailing list
kvmarm@lists.cs.columbia.edu
https://lists.cs.columbia.edu/mailman/listinfo/kvmarm


[PATCH v2 3/4] KVM: arm64: Disabling disabled PMU counters wastes a lot of time

2021-07-19 Thread Marc Zyngier
From: Alexandre Chartre 

In a KVM guest on arm64, performance counter interrupts have an
unnecessary overhead which slows down execution when using the "perf
record" command and limits the "perf record" sampling period.

The problem is that when a guest VM disables counters by clearing the
PMCR_EL0.E bit (bit 0), KVM will disable all counters defined in
PMCR_EL0 even if they are not enabled in PMCNTENSET_EL0.

KVM disables a counter by calling into the perf framework, in particular
by calling perf_event_create_kernel_counter() which is a time consuming
operation. So, for example, with a Neoverse N1 CPU core which has 6 event
counters and one cycle counter, KVM will always disable all 7 counters
even if only one is enabled.

This typically happens when using the "perf record" command in a guest
VM: perf will disable all event counters with PMCNTENSET_EL0 and only
uses the cycle counter. And when using the "perf record" -F option with
a high profiling frequency, the overhead of KVM disabling all counters
instead of one on every counter interrupt becomes very noticeable.

The problem is fixed by having KVM disable only counters which are
enabled in PMCNTENSET_EL0. If a counter is not enabled in PMCNTENSET_EL0
then KVM will not enable it when setting PMCR_EL0.E and it will remain
disabled as long as it is not enabled in PMCNTENSET_EL0. So there is
effectively no need to disable a counter when clearing PMCR_EL0.E if it
is not enabled in PMCNTENSET_EL0.

Acked-by: Russell King (Oracle) 
Reviewed-by: Alexandru Elisei 
Signed-off-by: Alexandre Chartre 
[maz: moved 'mask' close to the actual user, simplifying the patch]
Signed-off-by: Marc Zyngier 
Link: 
https://lore.kernel.org/r/20210712170345.660272-1-alexandre.char...@oracle.com
---
 arch/arm64/kvm/pmu-emul.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/arch/arm64/kvm/pmu-emul.c b/arch/arm64/kvm/pmu-emul.c
index fae4e95b586c..dc65b58dc68f 100644
--- a/arch/arm64/kvm/pmu-emul.c
+++ b/arch/arm64/kvm/pmu-emul.c
@@ -563,20 +563,21 @@ void kvm_pmu_software_increment(struct kvm_vcpu *vcpu, 
u64 val)
  */
 void kvm_pmu_handle_pmcr(struct kvm_vcpu *vcpu, u64 val)
 {
-   unsigned long mask = kvm_pmu_valid_counter_mask(vcpu);
int i;
 
if (val & ARMV8_PMU_PMCR_E) {
kvm_pmu_enable_counter_mask(vcpu,
   __vcpu_sys_reg(vcpu, PMCNTENSET_EL0));
} else {
-   kvm_pmu_disable_counter_mask(vcpu, mask);
+   kvm_pmu_disable_counter_mask(vcpu,
+  __vcpu_sys_reg(vcpu, PMCNTENSET_EL0));
}
 
if (val & ARMV8_PMU_PMCR_C)
kvm_pmu_set_counter_value(vcpu, ARMV8_PMU_CYCLE_IDX, 0);
 
if (val & ARMV8_PMU_PMCR_P) {
+   unsigned long mask = kvm_pmu_valid_counter_mask(vcpu);
mask &= ~BIT(ARMV8_PMU_CYCLE_IDX);
 		for_each_set_bit(i, &mask, 32)
kvm_pmu_set_counter_value(vcpu, i, 0);
-- 
2.30.2

___
kvmarm mailing list
kvmarm@lists.cs.columbia.edu
https://lists.cs.columbia.edu/mailman/listinfo/kvmarm


[PATCH v2 2/4] KVM: arm64: Drop unnecessary masking of PMU registers

2021-07-19 Thread Marc Zyngier
We always sanitise our PMU sysreg on the write side, so there
is no need to do it on the read side as well.

Drop the unnecessary masking.

Acked-by: Russell King (Oracle) 
Reviewed-by: Alexandre Chartre 
Reviewed-by: Alexandru Elisei 
Signed-off-by: Marc Zyngier 
---
 arch/arm64/kvm/pmu-emul.c | 3 +--
 arch/arm64/kvm/sys_regs.c | 6 +++---
 2 files changed, 4 insertions(+), 5 deletions(-)

diff --git a/arch/arm64/kvm/pmu-emul.c b/arch/arm64/kvm/pmu-emul.c
index f33825c995cb..fae4e95b586c 100644
--- a/arch/arm64/kvm/pmu-emul.c
+++ b/arch/arm64/kvm/pmu-emul.c
@@ -373,7 +373,6 @@ static u64 kvm_pmu_overflow_status(struct kvm_vcpu *vcpu)
reg = __vcpu_sys_reg(vcpu, PMOVSSET_EL0);
reg &= __vcpu_sys_reg(vcpu, PMCNTENSET_EL0);
reg &= __vcpu_sys_reg(vcpu, PMINTENSET_EL1);
-   reg &= kvm_pmu_valid_counter_mask(vcpu);
}
 
return reg;
@@ -569,7 +568,7 @@ void kvm_pmu_handle_pmcr(struct kvm_vcpu *vcpu, u64 val)
 
if (val & ARMV8_PMU_PMCR_E) {
kvm_pmu_enable_counter_mask(vcpu,
-  __vcpu_sys_reg(vcpu, PMCNTENSET_EL0) & mask);
+  __vcpu_sys_reg(vcpu, PMCNTENSET_EL0));
} else {
kvm_pmu_disable_counter_mask(vcpu, mask);
}
diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c
index 96bdfa0e68b2..f22139658e48 100644
--- a/arch/arm64/kvm/sys_regs.c
+++ b/arch/arm64/kvm/sys_regs.c
@@ -880,7 +880,7 @@ static bool access_pmcnten(struct kvm_vcpu *vcpu, struct 
sys_reg_params *p,
kvm_pmu_disable_counter_mask(vcpu, val);
}
} else {
-   p->regval = __vcpu_sys_reg(vcpu, PMCNTENSET_EL0) & mask;
+   p->regval = __vcpu_sys_reg(vcpu, PMCNTENSET_EL0);
}
 
return true;
@@ -904,7 +904,7 @@ static bool access_pminten(struct kvm_vcpu *vcpu, struct 
sys_reg_params *p,
/* accessing PMINTENCLR_EL1 */
__vcpu_sys_reg(vcpu, PMINTENSET_EL1) &= ~val;
} else {
-   p->regval = __vcpu_sys_reg(vcpu, PMINTENSET_EL1) & mask;
+   p->regval = __vcpu_sys_reg(vcpu, PMINTENSET_EL1);
}
 
return true;
@@ -926,7 +926,7 @@ static bool access_pmovs(struct kvm_vcpu *vcpu, struct 
sys_reg_params *p,
/* accessing PMOVSCLR_EL0 */
__vcpu_sys_reg(vcpu, PMOVSSET_EL0) &= ~(p->regval & 
mask);
} else {
-   p->regval = __vcpu_sys_reg(vcpu, PMOVSSET_EL0) & mask;
+   p->regval = __vcpu_sys_reg(vcpu, PMOVSSET_EL0);
}
 
return true;
-- 
2.30.2

___
kvmarm mailing list
kvmarm@lists.cs.columbia.edu
https://lists.cs.columbia.edu/mailman/listinfo/kvmarm


[PATCH v2 4/4] KVM: arm64: Remove PMSWINC_EL0 shadow register

2021-07-19 Thread Marc Zyngier
We keep an entry for the PMSWINC_EL0 register in the vcpu structure,
while *never* writing anything there outside of reset.

Given that the register is defined as write-only, and that we always
trap when this register is accessed, there is little point in saving
anything anyway.

Get rid of the entry, and save a mighty 8 bytes per vcpu structure.

We still need to keep it exposed to userspace in order to preserve
backward compatibility with previously saved VMs. Since userspace
cannot expect any effect of writing to PMSWINC_EL0, treat the
register as RAZ/WI for the purpose of userspace access.

Signed-off-by: Marc Zyngier 
---
 arch/arm64/include/asm/kvm_host.h |  1 -
 arch/arm64/kvm/sys_regs.c | 21 -
 2 files changed, 20 insertions(+), 2 deletions(-)

diff --git a/arch/arm64/include/asm/kvm_host.h 
b/arch/arm64/include/asm/kvm_host.h
index 41911585ae0c..afc169630884 100644
--- a/arch/arm64/include/asm/kvm_host.h
+++ b/arch/arm64/include/asm/kvm_host.h
@@ -185,7 +185,6 @@ enum vcpu_sysreg {
PMCNTENSET_EL0, /* Count Enable Set Register */
PMINTENSET_EL1, /* Interrupt Enable Set Register */
PMOVSSET_EL0,   /* Overflow Flag Status Set Register */
-   PMSWINC_EL0,/* Software Increment Register */
PMUSERENR_EL0,  /* User Enable Register */
 
/* Pointer Authentication Registers in a strict increasing order. */
diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c
index f22139658e48..a1f5101f49a3 100644
--- a/arch/arm64/kvm/sys_regs.c
+++ b/arch/arm64/kvm/sys_regs.c
@@ -1286,6 +1286,20 @@ static int set_raz_id_reg(struct kvm_vcpu *vcpu, const 
struct sys_reg_desc *rd,
return __set_id_reg(vcpu, rd, uaddr, true);
 }
 
+static int set_wi_reg(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd,
+ const struct kvm_one_reg *reg, void __user *uaddr)
+{
+   int err;
+   u64 val;
+
+   /* Perform the access even if we are going to ignore the value */
+	err = reg_from_user(&val, uaddr, sys_reg_to_index(rd));
+   if (err)
+   return err;
+
+   return 0;
+}
+
 static bool access_ctr(struct kvm_vcpu *vcpu, struct sys_reg_params *p,
   const struct sys_reg_desc *r)
 {
@@ -1629,8 +1643,13 @@ static const struct sys_reg_desc sys_reg_descs[] = {
  .access = access_pmcnten, .reg = PMCNTENSET_EL0 },
{ PMU_SYS_REG(SYS_PMOVSCLR_EL0),
  .access = access_pmovs, .reg = PMOVSSET_EL0 },
+   /*
+* PM_SWINC_EL0 is exposed to userspace as RAZ/WI, as it was
+* previously (and pointlessly) advertised in the past...
+*/
{ PMU_SYS_REG(SYS_PMSWINC_EL0),
- .access = access_pmswinc, .reg = PMSWINC_EL0 },
+ .get_user = get_raz_id_reg, .set_user = set_wi_reg,
+ .access = access_pmswinc, .reset = NULL },
{ PMU_SYS_REG(SYS_PMSELR_EL0),
  .access = access_pmselr, .reset = reset_pmselr, .reg = PMSELR_EL0 },
{ PMU_SYS_REG(SYS_PMCEID0_EL0),
-- 
2.30.2

___
kvmarm mailing list
kvmarm@lists.cs.columbia.edu
https://lists.cs.columbia.edu/mailman/listinfo/kvmarm


[PATCH v2 1/4] KVM: arm64: Narrow PMU sysreg reset values to architectural requirements

2021-07-19 Thread Marc Zyngier
A number of the PMU sysregs expose reset values that are not
compliant with the architecture (set bits in the RES0 ranges,
for example).

This in turn has the effect that we need to pointlessly mask
some register fields when using them.

Let's start by making sure we don't have illegal values in the
shadow registers at reset time. This affects all the registers
that dedicate one bit per counter, the counters themselves,
PMEVTYPERn_EL0 and PMSELR_EL0.

Reported-by: Alexandre Chartre 
Reviewed-by: Alexandre Chartre 
Acked-by: Russell King (Oracle) 
Signed-off-by: Marc Zyngier 
---
 arch/arm64/kvm/sys_regs.c | 43 ---
 1 file changed, 40 insertions(+), 3 deletions(-)

diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c
index f6f126eb6ac1..96bdfa0e68b2 100644
--- a/arch/arm64/kvm/sys_regs.c
+++ b/arch/arm64/kvm/sys_regs.c
@@ -603,6 +603,41 @@ static unsigned int pmu_visibility(const struct kvm_vcpu 
*vcpu,
return REG_HIDDEN;
 }
 
+static void reset_pmu_reg(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r)
+{
+   u64 n, mask = BIT(ARMV8_PMU_CYCLE_IDX);
+
+   /* No PMU available, any PMU reg may UNDEF... */
+   if (!kvm_arm_support_pmu_v3())
+   return;
+
+   n = read_sysreg(pmcr_el0) >> ARMV8_PMU_PMCR_N_SHIFT;
+   n &= ARMV8_PMU_PMCR_N_MASK;
+   if (n)
+   mask |= GENMASK(n - 1, 0);
+
+   reset_unknown(vcpu, r);
+   __vcpu_sys_reg(vcpu, r->reg) &= mask;
+}
+
+static void reset_pmevcntr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r)
+{
+   reset_unknown(vcpu, r);
+   __vcpu_sys_reg(vcpu, r->reg) &= GENMASK(31, 0);
+}
+
+static void reset_pmevtyper(struct kvm_vcpu *vcpu, const struct sys_reg_desc 
*r)
+{
+   reset_unknown(vcpu, r);
+   __vcpu_sys_reg(vcpu, r->reg) &= ARMV8_PMU_EVTYPE_MASK;
+}
+
+static void reset_pmselr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r)
+{
+   reset_unknown(vcpu, r);
+   __vcpu_sys_reg(vcpu, r->reg) &= ARMV8_PMU_COUNTER_MASK;
+}
+
 static void reset_pmcr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r)
 {
u64 pmcr, val;
@@ -944,16 +979,18 @@ static bool access_pmuserenr(struct kvm_vcpu *vcpu, struct sys_reg_params *p,
  trap_wcr, reset_wcr, 0, 0,  get_wcr, set_wcr }
 
 #define PMU_SYS_REG(r) \
-   SYS_DESC(r), .reset = reset_unknown, .visibility = pmu_visibility
+   SYS_DESC(r), .reset = reset_pmu_reg, .visibility = pmu_visibility
 
 /* Macro to expand the PMEVCNTRn_EL0 register */
 #define PMU_PMEVCNTR_EL0(n)\
{ PMU_SYS_REG(SYS_PMEVCNTRn_EL0(n)),\
+ .reset = reset_pmevcntr,  \
  .access = access_pmu_evcntr, .reg = (PMEVCNTR0_EL0 + n), }
 
 /* Macro to expand the PMEVTYPERn_EL0 register */
 #define PMU_PMEVTYPER_EL0(n)   \
{ PMU_SYS_REG(SYS_PMEVTYPERn_EL0(n)),   \
+ .reset = reset_pmevtyper, \
  .access = access_pmu_evtyper, .reg = (PMEVTYPER0_EL0 + n), }
 
 static bool undef_access(struct kvm_vcpu *vcpu, struct sys_reg_params *p,
@@ -1595,13 +1632,13 @@ static const struct sys_reg_desc sys_reg_descs[] = {
{ PMU_SYS_REG(SYS_PMSWINC_EL0),
  .access = access_pmswinc, .reg = PMSWINC_EL0 },
{ PMU_SYS_REG(SYS_PMSELR_EL0),
- .access = access_pmselr, .reg = PMSELR_EL0 },
+ .access = access_pmselr, .reset = reset_pmselr, .reg = PMSELR_EL0 },
{ PMU_SYS_REG(SYS_PMCEID0_EL0),
  .access = access_pmceid, .reset = NULL },
{ PMU_SYS_REG(SYS_PMCEID1_EL0),
  .access = access_pmceid, .reset = NULL },
{ PMU_SYS_REG(SYS_PMCCNTR_EL0),
- .access = access_pmu_evcntr, .reg = PMCCNTR_EL0 },
+ .access = access_pmu_evcntr, .reset = reset_unknown, .reg = PMCCNTR_EL0 },
{ PMU_SYS_REG(SYS_PMXEVTYPER_EL0),
  .access = access_pmu_evtyper, .reset = NULL },
{ PMU_SYS_REG(SYS_PMXEVCNTR_EL0),
-- 
2.30.2

___
kvmarm mailing list
kvmarm@lists.cs.columbia.edu
https://lists.cs.columbia.edu/mailman/listinfo/kvmarm


[PATCH v2 0/4] kvm-arm64: Fix PMU reset values (and more)

2021-07-19 Thread Marc Zyngier
This is the second version of the series initially posted at [1].

* From v1:
  - Simplified masking in patch #1
  - Added a patch dropping PMSWINC_EL0 as a shadow register, though it
is still advertised to userspace for the purpose of backward
compatibility of VM save/restore
  - Collected ABs/RBs, with thanks

[1] https://lore.kernel.org/r/20210713135900.1473057-1-...@kernel.org

Alexandre Chartre (1):
  KVM: arm64: Disabling disabled PMU counters wastes a lot of time

Marc Zyngier (3):
  KVM: arm64: Narrow PMU sysreg reset values to architectural
requirements
  KVM: arm64: Drop unnecessary masking of PMU registers
  KVM: arm64: Remove PMSWINC_EL0 shadow register

 arch/arm64/include/asm/kvm_host.h |  1 -
 arch/arm64/kvm/pmu-emul.c |  8 ++--
 arch/arm64/kvm/sys_regs.c | 70 +++
 3 files changed, 67 insertions(+), 12 deletions(-)

-- 
2.30.2

___
kvmarm mailing list
kvmarm@lists.cs.columbia.edu
https://lists.cs.columbia.edu/mailman/listinfo/kvmarm


Re: [PATCH 03/14] KVM: arm64: Continue stage-2 map when re-creating mappings

2021-07-19 Thread Marc Zyngier
On Mon, 19 Jul 2021 11:47:24 +0100,
Quentin Perret  wrote:
> 
> The stage-2 map walkers currently return -EAGAIN when re-creating
> identical mappings or only changing access permissions. This allows
> optimizing the mapping of pages for concurrent (v)CPUs faulting on the
> same page.
> 
> While this works as expected when touching one page-table leaf at a
> time, this can lead to difficult situations when mapping larger ranges.
> Indeed, a large map operation can fail in the middle if an existing
> mapping is found in the range, even if it has compatible attributes,
> hence leaving only half of the range mapped.
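
(A toy, self-contained illustration of the failure mode described above,
not the actual walker code: a range-map loop that bails out on the first
pre-existing entry leaves the tail of the range unmapped.)

#include <stdio.h>

#define NPAGES	8
#define EAGAIN	11

/* Page 4 is already mapped with compatible attributes, mimicking another
 * vCPU having faulted it in first. */
static int mapped[NPAGES] = { [4] = 1 };

static int map_one(int idx)
{
	if (mapped[idx])
		return -EAGAIN;		/* existing entry found */
	mapped[idx] = 1;
	return 0;
}

int main(void)
{
	for (int i = 0; i < NPAGES; i++) {
		int ret = map_one(i);

		if (ret) {
			/* Pages 5..7 are never reached. */
			printf("map aborted at page %d (ret = %d)\n", i, ret);
			return 1;
		}
	}
	return 0;
}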

I'm curious about when this can happen. We normally map a single leaf at
a time, and we don't have a way to map multiple leaves at once: we
either use the VMA base size or try to upgrade it to a THP, but the
result is always a single leaf entry. What changed?

> To avoid having to deal with such failures in the caller, don't
> interrupt the map operation when hitting existing PTEs, but make sure to
> still return -EAGAIN so that user_mem_abort() can mark the page dirty
> when needed.

I don't follow you here: if you return -EAGAIN for a writable mapping,
we don't account for the page as dirty, on the assumption that
nothing has been mapped. But if there is a way to map more than a
single entry and to get -EAGAIN at the same time, then we're bound to
lose data on page eviction.

Can you shed some light on this?

Thanks,

M.

-- 
Without deviation from the norm, progress is not possible.
___
kvmarm mailing list
kvmarm@lists.cs.columbia.edu
https://lists.cs.columbia.edu/mailman/listinfo/kvmarm


Re: [PATCH 1/5] KVM: arm64: Walk userspace page tables to compute the THP mapping size

2021-07-19 Thread Marc Zyngier
On Mon, 19 Jul 2021 07:31:30 +0100,
Paolo Bonzini  wrote:
> 
> On 17/07/21 11:55, Marc Zyngier wrote:
> > We currently rely on the kvm_is_transparent_hugepage() helper to
> > discover whether a given page has the potential to be mapped as
> > a block mapping.
> > 
> > However, this API doesn't really give us everything we want:
> > - we don't get the size: this is not crucial today as we only
> >support PMD-sized THPs, but we'd like to have larger sizes
> >in the future
> > - we're the only user left of the API, and there is a will
> >to remove it altogether
> > 
> > To address the above, implement a simple walker using the existing
> > page table infrastructure, and plumb it into transparent_hugepage_adjust().
> > No new page sizes are supported in the process.
> > 
> > Signed-off-by: Marc Zyngier 
> 
> If it's okay for you to reuse the KVM page walker that's fine of
> course, but the arch/x86/mm functions lookup_address_in_{mm,pgd} are
> mostly machine-independent and it may make sense to move them to mm/.
>
> That would also allow reusing the x86 function host_pfn_mapping_level.

That could work to some extent, but the way the x86 code equates level
to mapping size is a bit at odds with the multiple page sizes that
arm64 deals with.
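
To make that concrete, here is a standalone sketch (not kernel code) of
how the size covered by a leaf depends on both the level and the granule;
the formula mirrors ARM64_HW_PGTABLE_LEVEL_SHIFT() as I read it.

#include <stdio.h>

/* Each level resolves (page_shift - 3) bits of the address. */
static unsigned long long leaf_size(unsigned int page_shift, unsigned int level)
{
	unsigned int shift = (page_shift - 3) * (4 - level) + 3;

	return 1ULL << shift;
}

int main(void)
{
	/* A level-2 leaf is 2MiB with 4k pages, 32MiB with 16k pages and
	 * 512MiB with 64k pages, so a level alone doesn't name a size. */
	printf("4k/L2:  %llu bytes\n", leaf_size(12, 2));
	printf("16k/L2: %llu bytes\n", leaf_size(14, 2));
	printf("64k/L2: %llu bytes\n", leaf_size(16, 2));
	return 0;
}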

We're also trying to move away from the whole P*D abstraction, because
this isn't something we want to deal with in the self-contained EL2
object.

Thanks,

M.

-- 
Without deviation from the norm, progress is not possible.
___
kvmarm mailing list
kvmarm@lists.cs.columbia.edu
https://lists.cs.columbia.edu/mailman/listinfo/kvmarm


Re: [PATCH 5/5] KVM: Get rid of kvm_get_pfn()

2021-07-19 Thread Paolo Bonzini

On 17/07/21 11:55, Marc Zyngier wrote:

Nobody is using kvm_get_pfn() anymore. Get rid of it.

Signed-off-by: Marc Zyngier 
---
  include/linux/kvm_host.h | 1 -
  virt/kvm/kvm_main.c  | 9 +
  2 files changed, 1 insertion(+), 9 deletions(-)

diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index ae7735b490b4..9818d271c2a1 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -824,7 +824,6 @@ void kvm_release_pfn_clean(kvm_pfn_t pfn);
  void kvm_release_pfn_dirty(kvm_pfn_t pfn);
  void kvm_set_pfn_dirty(kvm_pfn_t pfn);
  void kvm_set_pfn_accessed(kvm_pfn_t pfn);
-void kvm_get_pfn(kvm_pfn_t pfn);
  
  void kvm_release_pfn(kvm_pfn_t pfn, bool dirty, struct gfn_to_pfn_cache *cache);

  int kvm_read_guest_page(struct kvm *kvm, gfn_t gfn, void *data, int offset,
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 2e410a8a6a67..0284418c4400 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -2215,7 +2215,7 @@ static int hva_to_pfn_remapped(struct vm_area_struct *vma,
 * Get a reference here because callers of *hva_to_pfn* and
 * *gfn_to_pfn* ultimately call kvm_release_pfn_clean on the
 * returned pfn.  This is only needed if the VMA has VM_MIXEDMAP
-* set, but the kvm_get_pfn/kvm_release_pfn_clean pair will
+* set, but the kvm_try_get_pfn/kvm_release_pfn_clean pair will
 * simply do nothing for reserved pfns.
 *
 * Whoever called remap_pfn_range is also going to call e.g.
@@ -2612,13 +2612,6 @@ void kvm_set_pfn_accessed(kvm_pfn_t pfn)
  }
  EXPORT_SYMBOL_GPL(kvm_set_pfn_accessed);
  
-void kvm_get_pfn(kvm_pfn_t pfn)
-{
-   if (!kvm_is_reserved_pfn(pfn))
-   get_page(pfn_to_page(pfn));
-}
-EXPORT_SYMBOL_GPL(kvm_get_pfn);
-
  static int next_segment(unsigned long len, int offset)
  {
if (len > PAGE_SIZE - offset)



Acked-by: Paolo Bonzini 

___
kvmarm mailing list
kvmarm@lists.cs.columbia.edu
https://lists.cs.columbia.edu/mailman/listinfo/kvmarm


Re: [PATCH 3/5] KVM: Remove kvm_is_transparent_hugepage() and PageTransCompoundMap()

2021-07-19 Thread Paolo Bonzini

On 17/07/21 11:55, Marc Zyngier wrote:

Now that arm64 has stopped using kvm_is_transparent_hugepage(),
we can remove it, as well as PageTransCompoundMap() which was
only used by the former.

Signed-off-by: Marc Zyngier 
---
  include/linux/page-flags.h | 37 -
  virt/kvm/kvm_main.c| 10 --
  2 files changed, 47 deletions(-)

diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h
index 5922031ffab6..1ace27c4a8e0 100644
--- a/include/linux/page-flags.h
+++ b/include/linux/page-flags.h
@@ -632,43 +632,6 @@ static inline int PageTransCompound(struct page *page)
return PageCompound(page);
  }
  
-/*
- * PageTransCompoundMap is the same as PageTransCompound, but it also
- * guarantees the primary MMU has the entire compound page mapped
- * through pmd_trans_huge, which in turn guarantees the secondary MMUs
- * can also map the entire compound page. This allows the secondary
- * MMUs to call get_user_pages() only once for each compound page and
- * to immediately map the entire compound page with a single secondary
- * MMU fault. If there will be a pmd split later, the secondary MMUs
- * will get an update through the MMU notifier invalidation through
- * split_huge_pmd().
- *
- * Unlike PageTransCompound, this is safe to be called only while
- * split_huge_pmd() cannot run from under us, like if protected by the
- * MMU notifier, otherwise it may result in page->_mapcount check false
- * positives.
- *
- * We have to treat page cache THP differently since every subpage of it
- * would get _mapcount inc'ed once it is PMD mapped.  But, it may be PTE
- * mapped in the current process so comparing subpage's _mapcount to
- * compound_mapcount to filter out PTE mapped case.
- */
-static inline int PageTransCompoundMap(struct page *page)
-{
-   struct page *head;
-
-   if (!PageTransCompound(page))
-   return 0;
-
-   if (PageAnon(page))
-   return atomic_read(&page->_mapcount) < 0;
-
-   head = compound_head(page);
-   /* File THP is PMD mapped and not PTE mapped */
-   return atomic_read(&head->_mapcount) ==
-  atomic_read(compound_mapcount_ptr(head));
-}
-
  /*
   * PageTransTail returns true for both transparent huge pages
   * and hugetlbfs pages, so it should only be called when it's known
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 7d95126cda9e..2e410a8a6a67 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -189,16 +189,6 @@ bool kvm_is_reserved_pfn(kvm_pfn_t pfn)
return true;
  }
  
-bool kvm_is_transparent_hugepage(kvm_pfn_t pfn)
-{
-   struct page *page = pfn_to_page(pfn);
-
-   if (!PageTransCompoundMap(page))
-   return false;
-
-   return is_transparent_hugepage(compound_head(page));
-}
-
  /*
   * Switches to specified vcpu, until a matching vcpu_put()
   */



Acked-by: Paolo Bonzini 

___
kvmarm mailing list
kvmarm@lists.cs.columbia.edu
https://lists.cs.columbia.edu/mailman/listinfo/kvmarm


Re: [PATCH 1/5] KVM: arm64: Walk userspace page tables to compute the THP mapping size

2021-07-19 Thread Paolo Bonzini

On 17/07/21 11:55, Marc Zyngier wrote:

We currently rely on the kvm_is_transparent_hugepage() helper to
discover whether a given page has the potential to be mapped as
a block mapping.

However, this API doesn't really give us everything we want:
- we don't get the size: this is not crucial today as we only
   support PMD-sized THPs, but we'd like to have larger sizes
   in the future
- we're the only user left of the API, and there is a will
   to remove it altogether

To address the above, implement a simple walker using the existing
page table infrastructure, and plumb it into transparent_hugepage_adjust().
No new page sizes are supported in the process.

Signed-off-by: Marc Zyngier 


If it's okay for you to reuse the KVM page walker that's fine of course, 
but the arch/x86/mm functions lookup_address_in_{mm,pgd} are mostly 
machine-independent and it may make sense to move them to mm/.


That would also allow reusing the x86 function host_pfn_mapping_level.

Paolo


---
  arch/arm64/kvm/mmu.c | 46 
  1 file changed, 42 insertions(+), 4 deletions(-)

diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c
index 3155c9e778f0..db6314b93e99 100644
--- a/arch/arm64/kvm/mmu.c
+++ b/arch/arm64/kvm/mmu.c
@@ -433,6 +433,44 @@ int create_hyp_exec_mappings(phys_addr_t phys_addr, size_t size,
return 0;
  }
  
+static struct kvm_pgtable_mm_ops kvm_user_mm_ops = {
+   /* We shouldn't need any other callback to walk the PT */
+   .phys_to_virt   = kvm_host_va,
+};
+
+struct user_walk_data {
+   u32 level;
+};
+
+static int user_walker(u64 addr, u64 end, u32 level, kvm_pte_t *ptep,
+  enum kvm_pgtable_walk_flags flag, void * const arg)
+{
+   struct user_walk_data *data = arg;
+
+   data->level = level;
+   return 0;
+}
+
+static int get_user_mapping_size(struct kvm *kvm, u64 addr)
+{
+   struct user_walk_data data;
+   struct kvm_pgtable pgt = {
+   .pgd= (kvm_pte_t *)kvm->mm->pgd,
+   .ia_bits= VA_BITS,
+   .start_level= 4 - CONFIG_PGTABLE_LEVELS,
+   .mm_ops = &kvm_user_mm_ops,
+   };
+   struct kvm_pgtable_walker walker = {
+   .cb = user_walker,
+   .flags  = KVM_PGTABLE_WALK_LEAF,
+   .arg= &data,
+   };
+
+   kvm_pgtable_walk(&pgt, ALIGN_DOWN(addr, PAGE_SIZE), PAGE_SIZE, &walker);
+
+   return BIT(ARM64_HW_PGTABLE_LEVEL_SHIFT(data.level));
+}
+
  static struct kvm_pgtable_mm_ops kvm_s2_mm_ops = {
.zalloc_page= stage2_memcache_zalloc_page,
.zalloc_pages_exact = kvm_host_zalloc_pages_exact,
@@ -780,7 +818,7 @@ static bool fault_supports_stage2_huge_mapping(struct kvm_memory_slot *memslot,
   * Returns the size of the mapping.
   */
  static unsigned long
-transparent_hugepage_adjust(struct kvm_memory_slot *memslot,
+transparent_hugepage_adjust(struct kvm *kvm, struct kvm_memory_slot *memslot,
unsigned long hva, kvm_pfn_t *pfnp,
phys_addr_t *ipap)
  {
@@ -791,8 +829,8 @@ transparent_hugepage_adjust(struct kvm_memory_slot *memslot,
 * sure that the HVA and IPA are sufficiently aligned and that the
 * block map is contained within the memslot.
 */
-   if (kvm_is_transparent_hugepage(pfn) &&
-   fault_supports_stage2_huge_mapping(memslot, hva, PMD_SIZE)) {
+   if (fault_supports_stage2_huge_mapping(memslot, hva, PMD_SIZE) &&
+   get_user_mapping_size(kvm, hva) >= PMD_SIZE) {
/*
 * The address we faulted on is backed by a transparent huge
 * page.  However, because we map the compound huge page and
@@ -1051,7 +1089,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
 * backed by a THP and thus use block mapping if possible.
 */
if (vma_pagesize == PAGE_SIZE && !(force_pte || device))
-   vma_pagesize = transparent_hugepage_adjust(memslot, hva,
+   vma_pagesize = transparent_hugepage_adjust(kvm, memslot, hva,
   &pfn, &fault_ipa);
  
  	if (fault_status != FSC_PERM && !device && kvm_has_mte(kvm)) {




___
kvmarm mailing list
kvmarm@lists.cs.columbia.edu
https://lists.cs.columbia.edu/mailman/listinfo/kvmarm