[PATCH v4 13/13] selftests: KVM: Add counter emulation benchmark

2021-07-28 Thread Oliver Upton
Add a test case for counter emulation on arm64. A side effect of how KVM
handles physical counter offsetting on non-ECV systems is that the
virtual counter always hits hardware, while the physical counter may be
emulated. Force emulation by writing a nonzero offset to the physical
counter and compare the elapsed cycles to a direct read of the hardware
register.
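
For context, forcing emulation from the host amounts to writing the physical
counter-timer offset attribute added earlier in this series. A minimal sketch
in raw-ioctl form rather than the selftest helpers (note that
KVM_ARM_VCPU_TIMER_OFFSET_PTIMER is introduced by this series and is not in
released kernel headers):

  #include <stdint.h>
  #include <stdio.h>
  #include <sys/ioctl.h>
  #include <linux/kvm.h>

  /* Force (offset != 0) or disable (offset == 0) physical counter emulation. */
  static void set_ptimer_offset(int vcpu_fd, uint64_t offset)
  {
          struct kvm_device_attr attr = {
                  .group = KVM_ARM_VCPU_TIMER_CTRL,
                  .attr  = KVM_ARM_VCPU_TIMER_OFFSET_PTIMER,
                  .addr  = (uint64_t)&offset,
          };

          if (ioctl(vcpu_fd, KVM_SET_DEVICE_ATTR, &attr))
                  perror("KVM_SET_DEVICE_ATTR");
  }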

Reviewed-by: Ricardo Koller 
Signed-off-by: Oliver Upton 
---
 tools/testing/selftests/kvm/.gitignore|   1 +
 tools/testing/selftests/kvm/Makefile  |   1 +
 .../kvm/aarch64/counter_emulation_benchmark.c | 215 ++
 3 files changed, 217 insertions(+)
 create mode 100644 
tools/testing/selftests/kvm/aarch64/counter_emulation_benchmark.c

diff --git a/tools/testing/selftests/kvm/.gitignore 
b/tools/testing/selftests/kvm/.gitignore
index 3d2585f0bffc..a23198ea6e7a 100644
--- a/tools/testing/selftests/kvm/.gitignore
+++ b/tools/testing/selftests/kvm/.gitignore
@@ -1,5 +1,6 @@
 # SPDX-License-Identifier: GPL-2.0-only
 /aarch64/debug-exceptions
+/aarch64/counter_emulation_benchmark
 /aarch64/get-reg-list
 /aarch64/vgic_init
 /s390x/memop
diff --git a/tools/testing/selftests/kvm/Makefile 
b/tools/testing/selftests/kvm/Makefile
index fab42e7c23ee..db8706eb6104 100644
--- a/tools/testing/selftests/kvm/Makefile
+++ b/tools/testing/selftests/kvm/Makefile
@@ -88,6 +88,7 @@ TEST_GEN_PROGS_x86_64 += kvm_binary_stats_test
 TEST_GEN_PROGS_x86_64 += system_counter_offset_test
 
 TEST_GEN_PROGS_aarch64 += aarch64/debug-exceptions
+TEST_GEN_PROGS_aarch64 += aarch64/counter_emulation_benchmark
 TEST_GEN_PROGS_aarch64 += aarch64/get-reg-list
 TEST_GEN_PROGS_aarch64 += aarch64/vgic_init
 TEST_GEN_PROGS_aarch64 += demand_paging_test
diff --git a/tools/testing/selftests/kvm/aarch64/counter_emulation_benchmark.c 
b/tools/testing/selftests/kvm/aarch64/counter_emulation_benchmark.c
new file mode 100644
index ..73aeb6cdebfe
--- /dev/null
+++ b/tools/testing/selftests/kvm/aarch64/counter_emulation_benchmark.c
@@ -0,0 +1,215 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * counter_emulation_benchmark.c -- test to measure the effects of counter
+ * emulation on guest reads of the physical counter.
+ *
+ * Copyright (c) 2021, Google LLC.
+ */
+
+#define _GNU_SOURCE
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+#include "kvm_util.h"
+#include "processor.h"
+#include "test_util.h"
+
+#define VCPU_ID 0
+
+static struct counter_values {
+   uint64_t cntvct_start;
+   uint64_t cntpct;
+   uint64_t cntvct_end;
+} counter_values;
+
+static uint64_t nr_iterations = 1000;
+
+static void do_test(void)
+{
+   /*
+* Open-coded approach instead of using helper methods to keep a tight
+* interval around the physical counter read.
+*/
+   asm volatile("isb\n\t"
+"mrs %[cntvct_start], cntvct_el0\n\t"
+"isb\n\t"
+"mrs %[cntpct], cntpct_el0\n\t"
+"isb\n\t"
+"mrs %[cntvct_end], cntvct_el0\n\t"
+"isb\n\t"
+: [cntvct_start] "=r"(counter_values.cntvct_start),
+[cntpct] "=r"(counter_values.cntpct),
+[cntvct_end] "=r"(counter_values.cntvct_end));
+}
+
+static void guest_main(void)
+{
+   int i;
+
+   for (i = 0; i < nr_iterations; i++) {
+   do_test();
+   GUEST_SYNC(i);
+   }
+
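+   /* The host toggles physical counter trapping between these two passes. */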
+   for (i = 0; i < nr_iterations; i++) {
+   do_test();
+   GUEST_SYNC(i);
+   }
+
+   GUEST_DONE();
+}
+
+static bool enter_guest(struct kvm_vm *vm)
+{
+   struct ucall uc;
+
+   vcpu_ioctl(vm, VCPU_ID, KVM_RUN, NULL);
+
+   switch (get_ucall(vm, VCPU_ID, &uc)) {
+   case UCALL_DONE:
+   return true;
+   case UCALL_SYNC:
+   break;
+   case UCALL_ABORT:
+   TEST_ASSERT(false, "%s at %s:%ld", (const char *)uc.args[0],
+   __FILE__, uc.args[1]);
+   break;
+   default:
+   TEST_ASSERT(false, "unexpected exit: %s",
+   exit_reason_str(vcpu_state(vm, 
VCPU_ID)->exit_reason));
+   break;
+   }
+
+   /* more work to do in the guest */
+   return false;
+}
+
+static double counter_frequency(void)
+{
+   uint32_t freq;
+
+   asm volatile("mrs %0, cntfrq_el0"
+: "=r" (freq));
+
+   return freq / 1000000.0; /* CNTFRQ_EL0 is in Hz; report MHz */
+}
+
+static void log_csv(FILE *csv, bool trapped)
+{
+   double freq = counter_frequency();
+
+   fprintf(csv, "%s,%.02f,%lu,%lu,%lu\n",
+   trapped ? "true" : "false", freq,
+   counter_values.cntvct_start,
+   counter_values.cntpct,
+   counter_values.cntvct_end);
+}
+
+static double run_loop(struct kvm_vm *vm, FILE *csv, bool trapped)
+{
+   double avg = 0;
+   int i;
+
+   for (i = 0; i < nr_iterations; i++) {
+   

[PATCH v4 10/13] selftests: KVM: Add support for aarch64 to system_counter_offset_test

2021-07-28 Thread Oliver Upton
KVM/arm64 now allows userspace to adjust the guest virtual counter-timer
via a vCPU device attribute. Test that changes to the virtual
counter-timer offset result in the correct view being presented to the
guest.
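
Beyond the test itself, the documented relation guest_cntvct = host_cntvct -
offset means a VMM can, for example, make a fresh guest's virtual counter
appear to start near zero by writing the current host counter value as the
offset. A rough sketch built on the selftest helpers used in this series
(kvm_util.h and the read_cntvct_ordered() helper added below):

  static void zero_guest_vtimer(struct kvm_vm *vm)
  {
          uint64_t offset = read_cntvct_ordered();        /* "now" on the host */

          vcpu_access_device_attr(vm, VCPU_ID, KVM_ARM_VCPU_TIMER_CTRL,
                                  KVM_ARM_VCPU_TIMER_OFFSET_VTIMER,
                                  &offset, true);
  }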

Reviewed-by: Andrew Jones 
Signed-off-by: Oliver Upton 
---
 tools/testing/selftests/kvm/Makefile  |  1 +
 .../selftests/kvm/include/aarch64/processor.h | 12 +
 .../kvm/system_counter_offset_test.c  | 54 ++-
 3 files changed, 66 insertions(+), 1 deletion(-)

diff --git a/tools/testing/selftests/kvm/Makefile 
b/tools/testing/selftests/kvm/Makefile
index 9f7060c02668..fab42e7c23ee 100644
--- a/tools/testing/selftests/kvm/Makefile
+++ b/tools/testing/selftests/kvm/Makefile
@@ -98,6 +98,7 @@ TEST_GEN_PROGS_aarch64 += kvm_page_table_test
 TEST_GEN_PROGS_aarch64 += set_memory_region_test
 TEST_GEN_PROGS_aarch64 += steal_time
 TEST_GEN_PROGS_aarch64 += kvm_binary_stats_test
+TEST_GEN_PROGS_aarch64 += system_counter_offset_test
 
 TEST_GEN_PROGS_s390x = s390x/memop
 TEST_GEN_PROGS_s390x += s390x/resets
diff --git a/tools/testing/selftests/kvm/include/aarch64/processor.h 
b/tools/testing/selftests/kvm/include/aarch64/processor.h
index 27dc5c2e56b9..3168cdbae6ee 100644
--- a/tools/testing/selftests/kvm/include/aarch64/processor.h
+++ b/tools/testing/selftests/kvm/include/aarch64/processor.h
@@ -129,4 +129,16 @@ void vm_install_sync_handler(struct kvm_vm *vm,
 
 #define isb()  asm volatile("isb" : : : "memory")
 
+static inline uint64_t read_cntvct_ordered(void)
+{
+   uint64_t r;
+
+   __asm__ __volatile__("isb\n\t"
+"mrs %0, cntvct_el0\n\t"
+"isb\n\t"
+: "=r"(r));
+
+   return r;
+}
+
 #endif /* SELFTEST_KVM_PROCESSOR_H */
diff --git a/tools/testing/selftests/kvm/system_counter_offset_test.c 
b/tools/testing/selftests/kvm/system_counter_offset_test.c
index b337bbbfa41f..25806cdd31ef 100644
--- a/tools/testing/selftests/kvm/system_counter_offset_test.c
+++ b/tools/testing/selftests/kvm/system_counter_offset_test.c
@@ -53,7 +53,59 @@ static uint64_t host_read_guest_system_counter(struct 
test_case *test)
return rdtsc() + test->tsc_offset;
 }
 
-#else /* __x86_64__ */
+#elif __aarch64__ /* __x86_64__ */
+
+enum arch_counter {
+   VIRTUAL,
+};
+
+struct test_case {
+   enum arch_counter counter;
+   uint64_t offset;
+};
+
+static struct test_case test_cases[] = {
+   { .counter = VIRTUAL, .offset = 0 },
+   { .counter = VIRTUAL, .offset = 180 * NSEC_PER_SEC },
+   { .counter = VIRTUAL, .offset = -180 * NSEC_PER_SEC },
+};
+
+static void check_preconditions(struct kvm_vm *vm)
+{
+   if (!_vcpu_has_device_attr(vm, VCPU_ID, KVM_ARM_VCPU_TIMER_CTRL,
+  KVM_ARM_VCPU_TIMER_OFFSET_VTIMER))
+   return;
+
+   print_skip("KVM_ARM_VCPU_TIMER_OFFSET_VTIMER not supported; skipping 
test");
+   exit(KSFT_SKIP);
+}
+
+static void setup_system_counter(struct kvm_vm *vm, struct test_case *test)
+{
+   vcpu_access_device_attr(vm, VCPU_ID, KVM_ARM_VCPU_TIMER_CTRL,
+   KVM_ARM_VCPU_TIMER_OFFSET_VTIMER, &test->offset,
+   true);
+}
+
+static uint64_t guest_read_system_counter(struct test_case *test)
+{
+   switch (test->counter) {
+   case VIRTUAL:
+   return read_cntvct_ordered();
+   default:
+   GUEST_ASSERT(0);
+   }
+
+   /* unreachable */
+   return 0;
+}
+
+static uint64_t host_read_guest_system_counter(struct test_case *test)
+{
+   return read_cntvct_ordered() - test->offset;
+}
+
+#else /* __aarch64__ */
 
 #error test not implemented for this architecture!
 
-- 
2.32.0.432.gabb21c7263-goog



[PATCH v4 08/13] selftests: KVM: Introduce system counter offset test

2021-07-28 Thread Oliver Upton
Introduce a KVM selftest to verify that userspace manipulation of the
TSC (via the new vCPU attribute) results in the correct behavior within
the guest.

Reviewed-by: Andrew Jones 
Signed-off-by: Oliver Upton 
---
 tools/testing/selftests/kvm/.gitignore|   1 +
 tools/testing/selftests/kvm/Makefile  |   1 +
 .../kvm/system_counter_offset_test.c  | 132 ++
 3 files changed, 134 insertions(+)
 create mode 100644 tools/testing/selftests/kvm/system_counter_offset_test.c

diff --git a/tools/testing/selftests/kvm/.gitignore 
b/tools/testing/selftests/kvm/.gitignore
index 958a809c8de4..3d2585f0bffc 100644
--- a/tools/testing/selftests/kvm/.gitignore
+++ b/tools/testing/selftests/kvm/.gitignore
@@ -52,3 +52,4 @@
 /set_memory_region_test
 /steal_time
 /kvm_binary_stats_test
+/system_counter_offset_test
diff --git a/tools/testing/selftests/kvm/Makefile 
b/tools/testing/selftests/kvm/Makefile
index 0f94b18b33ce..9f7060c02668 100644
--- a/tools/testing/selftests/kvm/Makefile
+++ b/tools/testing/selftests/kvm/Makefile
@@ -85,6 +85,7 @@ TEST_GEN_PROGS_x86_64 += memslot_perf_test
 TEST_GEN_PROGS_x86_64 += set_memory_region_test
 TEST_GEN_PROGS_x86_64 += steal_time
 TEST_GEN_PROGS_x86_64 += kvm_binary_stats_test
+TEST_GEN_PROGS_x86_64 += system_counter_offset_test
 
 TEST_GEN_PROGS_aarch64 += aarch64/debug-exceptions
 TEST_GEN_PROGS_aarch64 += aarch64/get-reg-list
diff --git a/tools/testing/selftests/kvm/system_counter_offset_test.c 
b/tools/testing/selftests/kvm/system_counter_offset_test.c
new file mode 100644
index ..b337bbbfa41f
--- /dev/null
+++ b/tools/testing/selftests/kvm/system_counter_offset_test.c
@@ -0,0 +1,132 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (C) 2021, Google LLC.
+ *
+ * Tests for adjusting the system counter from userspace
+ */
+#include 
+#include 
+#include 
+#include 
+#include 
+
+#include "test_util.h"
+#include "kvm_util.h"
+#include "processor.h"
+
+#define VCPU_ID 0
+
+#ifdef __x86_64__
+
+struct test_case {
+   uint64_t tsc_offset;
+};
+
+static struct test_case test_cases[] = {
+   { 0 },
+   { 180 * NSEC_PER_SEC },
+   { -180 * NSEC_PER_SEC },
+};
+
+static void check_preconditions(struct kvm_vm *vm)
+{
+   if (!_vcpu_has_device_attr(vm, VCPU_ID, KVM_VCPU_TSC_CTRL, 
KVM_VCPU_TSC_OFFSET))
+   return;
+
+   print_skip("KVM_VCPU_TSC_OFFSET not supported; skipping test");
+   exit(KSFT_SKIP);
+}
+
+static void setup_system_counter(struct kvm_vm *vm, struct test_case *test)
+{
+   vcpu_access_device_attr(vm, VCPU_ID, KVM_VCPU_TSC_CTRL,
+   KVM_VCPU_TSC_OFFSET, &test->tsc_offset, true);
+}
+
+static uint64_t guest_read_system_counter(struct test_case *test)
+{
+   return rdtsc();
+}
+
+static uint64_t host_read_guest_system_counter(struct test_case *test)
+{
+   return rdtsc() + test->tsc_offset;
+}
+
+#else /* __x86_64__ */
+
+#error test not implemented for this architecture!
+
+#endif
+
+#define GUEST_SYNC_CLOCK(__stage, __val)   \
+   GUEST_SYNC_ARGS(__stage, __val, 0, 0, 0)
+
+static void guest_main(void)
+{
+   int i;
+
+   for (i = 0; i < ARRAY_SIZE(test_cases); i++) {
+   struct test_case *test = &test_cases[i];
+
+   GUEST_SYNC_CLOCK(i, guest_read_system_counter(test));
+   }
+}
+
+static void handle_sync(struct ucall *uc, uint64_t start, uint64_t end)
+{
+   uint64_t obs = uc->args[2];
+
+   TEST_ASSERT(start <= obs && obs <= end,
+   "unexpected system counter value: %"PRIu64" expected range: 
[%"PRIu64", %"PRIu64"]",
+   obs, start, end);
+
+   pr_info("system counter value: %"PRIu64" expected range [%"PRIu64", 
%"PRIu64"]\n",
+   obs, start, end);
+}
+
+static void handle_abort(struct ucall *uc)
+{
+   TEST_FAIL("%s at %s:%ld", (const char *)uc->args[0],
+ __FILE__, uc->args[1]);
+}
+
+static void enter_guest(struct kvm_vm *vm)
+{
+   uint64_t start, end;
+   struct ucall uc;
+   int i;
+
+   for (i = 0; i < ARRAY_SIZE(test_cases); i++) {
+   struct test_case *test = &test_cases[i];
+
+   setup_system_counter(vm, test);
+   start = host_read_guest_system_counter(test);
+   vcpu_run(vm, VCPU_ID);
+   end = host_read_guest_system_counter(test);
+
+   switch (get_ucall(vm, VCPU_ID, &uc)) {
+   case UCALL_SYNC:
+   handle_sync(&uc, start, end);
+   break;
+   case UCALL_ABORT:
+   handle_abort(&uc);
+   return;
+   default:
+   TEST_ASSERT(0, "unhandled ucall %ld\n",
+   get_ucall(vm, VCPU_ID, &uc));
+   }
+   }
+}
+
+int main(void)
+{
+   struct kvm_vm *vm;
+
+   vm = vm_create_default(VCPU_ID, 0, guest_main);
+   c

[PATCH v4 07/13] selftests: KVM: Add helpers for vCPU device attributes

2021-07-28 Thread Oliver Upton
vCPU file descriptors are abstracted away from test code in KVM
selftests, meaning that tests cannot directly access a vCPU's device
attributes. Add helpers that tests can use to get at vCPU device
attributes.
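
Typical usage, mirroring the callers added later in this series (the
KVM_VCPU_TSC_CTRL/KVM_VCPU_TSC_OFFSET names come from patch 03, and the
snippet assumes the usual selftest harness):

  uint64_t offset = 180 * NSEC_PER_SEC;

  /* Skip gracefully when the attribute is not supported... */
  if (_vcpu_has_device_attr(vm, VCPU_ID, KVM_VCPU_TSC_CTRL, KVM_VCPU_TSC_OFFSET)) {
          print_skip("KVM_VCPU_TSC_OFFSET not supported; skipping test");
          exit(KSFT_SKIP);
  }

  /* ...otherwise write the attribute, then read it back. */
  vcpu_access_device_attr(vm, VCPU_ID, KVM_VCPU_TSC_CTRL, KVM_VCPU_TSC_OFFSET,
                          &offset, true);
  vcpu_access_device_attr(vm, VCPU_ID, KVM_VCPU_TSC_CTRL, KVM_VCPU_TSC_OFFSET,
                          &offset, false);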

Reviewed-by: Andrew Jones 
Signed-off-by: Oliver Upton 
---
 .../testing/selftests/kvm/include/kvm_util.h  |  9 +
 tools/testing/selftests/kvm/lib/kvm_util.c| 38 +++
 2 files changed, 47 insertions(+)

diff --git a/tools/testing/selftests/kvm/include/kvm_util.h 
b/tools/testing/selftests/kvm/include/kvm_util.h
index a8ac5d52e17b..1b3ef5757819 100644
--- a/tools/testing/selftests/kvm/include/kvm_util.h
+++ b/tools/testing/selftests/kvm/include/kvm_util.h
@@ -240,6 +240,15 @@ int _kvm_device_access(int dev_fd, uint32_t group, 
uint64_t attr,
 int kvm_device_access(int dev_fd, uint32_t group, uint64_t attr,
  void *val, bool write);
 
+int _vcpu_has_device_attr(struct kvm_vm *vm, uint32_t vcpuid, uint32_t group,
+ uint64_t attr);
+int vcpu_has_device_attr(struct kvm_vm *vm, uint32_t vcpuid, uint32_t group,
+uint64_t attr);
+int _vcpu_access_device_attr(struct kvm_vm *vm, uint32_t vcpuid, uint32_t 
group,
+ uint64_t attr, void *val, bool write);
+int vcpu_access_device_attr(struct kvm_vm *vm, uint32_t vcpuid, uint32_t group,
+uint64_t attr, void *val, bool write);
+
 const char *exit_reason_str(unsigned int exit_reason);
 
 void virt_pgd_alloc(struct kvm_vm *vm);
diff --git a/tools/testing/selftests/kvm/lib/kvm_util.c 
b/tools/testing/selftests/kvm/lib/kvm_util.c
index 0ffc2d39c80d..0fe66ca6139a 100644
--- a/tools/testing/selftests/kvm/lib/kvm_util.c
+++ b/tools/testing/selftests/kvm/lib/kvm_util.c
@@ -2040,6 +2040,44 @@ int kvm_device_access(int dev_fd, uint32_t group, 
uint64_t attr,
return ret;
 }
 
+int _vcpu_has_device_attr(struct kvm_vm *vm, uint32_t vcpuid, uint32_t group,
+ uint64_t attr)
+{
+   struct vcpu *vcpu = vcpu_find(vm, vcpuid);
+
+   TEST_ASSERT(vcpu, "nonexistent vcpu id: %d", vcpuid);
+
+   return _kvm_device_check_attr(vcpu->fd, group, attr);
+}
+
+int vcpu_has_device_attr(struct kvm_vm *vm, uint32_t vcpuid, uint32_t group,
+uint64_t attr)
+{
+   int ret = _vcpu_has_device_attr(vm, vcpuid, group, attr);
+
+   TEST_ASSERT(!ret, "KVM_HAS_DEVICE_ATTR IOCTL failed, rc: %i errno: %i", 
ret, errno);
+   return ret;
+}
+
+int _vcpu_access_device_attr(struct kvm_vm *vm, uint32_t vcpuid, uint32_t 
group,
+uint64_t attr, void *val, bool write)
+{
+   struct vcpu *vcpu = vcpu_find(vm, vcpuid);
+
+   TEST_ASSERT(vcpu, "nonexistent vcpu id: %d", vcpuid);
+
+   return _kvm_device_access(vcpu->fd, group, attr, val, write);
+}
+
+int vcpu_access_device_attr(struct kvm_vm *vm, uint32_t vcpuid, uint32_t group,
+   uint64_t attr, void *val, bool write)
+{
+   int ret = _vcpu_access_device_attr(vm, vcpuid, group, attr, val, write);
+
+   TEST_ASSERT(!ret, "KVM_SET|GET_DEVICE_ATTR IOCTL failed, rc: %i errno: 
%i", ret, errno);
+   return ret;
+}
+
 /*
  * VM Dump
  *
-- 
2.32.0.432.gabb21c7263-goog



[PATCH v4 05/13] selftests: KVM: Add test for KVM_{GET,SET}_CLOCK

2021-07-28 Thread Oliver Upton
Add a selftest for the newly introduced KVM clock UAPI. Ensure
that the KVM clock is consistent between userspace and the guest, and
that the difference in realtime will only ever cause the KVM clock to
advance forward.
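
For orientation, the userspace flow being exercised boils down to capturing
the clock on the source and handing it to the destination. A sketch (error
handling elided; KVM_CLOCK_REALTIME and the realtime field are the additions
from patch 01):

  #include <string.h>
  #include <sys/ioctl.h>
  #include <linux/kvm.h>

  static void migrate_kvmclock(int src_vm_fd, int dst_vm_fd)
  {
          struct kvm_clock_data data;

          memset(&data, 0, sizeof(data));
          ioctl(src_vm_fd, KVM_GET_CLOCK, &data);

          /* Keep only the realtime flag; KVM_SET_CLOCK then advances the
           * kvmclock by however much realtime elapsed since the capture. */
          data.flags &= KVM_CLOCK_REALTIME;
          ioctl(dst_vm_fd, KVM_SET_CLOCK, &data);
  }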

Cc: Andrew Jones 
Signed-off-by: Oliver Upton 
---
 tools/testing/selftests/kvm/.gitignore|   1 +
 tools/testing/selftests/kvm/Makefile  |   1 +
 .../testing/selftests/kvm/include/kvm_util.h  |   2 +
 .../selftests/kvm/x86_64/kvm_clock_test.c | 204 ++
 4 files changed, 208 insertions(+)
 create mode 100644 tools/testing/selftests/kvm/x86_64/kvm_clock_test.c

diff --git a/tools/testing/selftests/kvm/.gitignore 
b/tools/testing/selftests/kvm/.gitignore
index 36896d251977..958a809c8de4 100644
--- a/tools/testing/selftests/kvm/.gitignore
+++ b/tools/testing/selftests/kvm/.gitignore
@@ -11,6 +11,7 @@
 /x86_64/emulator_error_test
 /x86_64/get_cpuid_test
 /x86_64/get_msr_index_features
+/x86_64/kvm_clock_test
 /x86_64/kvm_pv_test
 /x86_64/hyperv_clock
 /x86_64/hyperv_cpuid
diff --git a/tools/testing/selftests/kvm/Makefile 
b/tools/testing/selftests/kvm/Makefile
index c103873531e0..0f94b18b33ce 100644
--- a/tools/testing/selftests/kvm/Makefile
+++ b/tools/testing/selftests/kvm/Makefile
@@ -46,6 +46,7 @@ TEST_GEN_PROGS_x86_64 += x86_64/get_cpuid_test
 TEST_GEN_PROGS_x86_64 += x86_64/hyperv_clock
 TEST_GEN_PROGS_x86_64 += x86_64/hyperv_cpuid
 TEST_GEN_PROGS_x86_64 += x86_64/hyperv_features
+TEST_GEN_PROGS_x86_64 += x86_64/kvm_clock_test
 TEST_GEN_PROGS_x86_64 += x86_64/kvm_pv_test
 TEST_GEN_PROGS_x86_64 += x86_64/mmio_warning_test
 TEST_GEN_PROGS_x86_64 += x86_64/mmu_role_test
diff --git a/tools/testing/selftests/kvm/include/kvm_util.h 
b/tools/testing/selftests/kvm/include/kvm_util.h
index 010b59b13917..a8ac5d52e17b 100644
--- a/tools/testing/selftests/kvm/include/kvm_util.h
+++ b/tools/testing/selftests/kvm/include/kvm_util.h
@@ -19,6 +19,8 @@
 #define KVM_DEV_PATH "/dev/kvm"
 #define KVM_MAX_VCPUS 512
 
+#define NSEC_PER_SEC 1000000000L
+
 /*
  * Callers of kvm_util only have an incomplete/opaque description of the
  * structure kvm_util is using to maintain the state of a VM.
diff --git a/tools/testing/selftests/kvm/x86_64/kvm_clock_test.c 
b/tools/testing/selftests/kvm/x86_64/kvm_clock_test.c
new file mode 100644
index ..e0dcc27ae9f1
--- /dev/null
+++ b/tools/testing/selftests/kvm/x86_64/kvm_clock_test.c
@@ -0,0 +1,204 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (C) 2021, Google LLC.
+ *
+ * Tests for adjusting the KVM clock from userspace
+ */
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+#include "test_util.h"
+#include "kvm_util.h"
+#include "processor.h"
+
+#define VCPU_ID 0
+
+struct test_case {
+   uint64_t kvmclock_base;
+   int64_t realtime_offset;
+};
+
+static struct test_case test_cases[] = {
+   { .kvmclock_base = 0 },
+   { .kvmclock_base = 180 * NSEC_PER_SEC },
+   { .kvmclock_base = 0, .realtime_offset = -180 * NSEC_PER_SEC },
+   { .kvmclock_base = 0, .realtime_offset = 180 * NSEC_PER_SEC },
+};
+
+#define GUEST_SYNC_CLOCK(__stage, __val)   \
+   GUEST_SYNC_ARGS(__stage, __val, 0, 0, 0)
+
+static void guest_main(vm_paddr_t pvti_pa, struct pvclock_vcpu_time_info *pvti)
+{
+   int i;
+
+   wrmsr(MSR_KVM_SYSTEM_TIME_NEW, pvti_pa | KVM_MSR_ENABLED);
+   for (i = 0; i < ARRAY_SIZE(test_cases); i++)
+   GUEST_SYNC_CLOCK(i, __pvclock_read_cycles(pvti, rdtsc()));
+}
+
+#define EXPECTED_FLAGS (KVM_CLOCK_REALTIME | KVM_CLOCK_HOST_TSC)
+
+static inline void assert_flags(struct kvm_clock_data *data)
+{
+   TEST_ASSERT((data->flags & EXPECTED_FLAGS) == EXPECTED_FLAGS,
+   "unexpected clock data flags: %x (want set: %x)",
+   data->flags, EXPECTED_FLAGS);
+}
+
+static void handle_sync(struct ucall *uc, struct kvm_clock_data *start,
+   struct kvm_clock_data *end)
+{
+   uint64_t obs, exp_lo, exp_hi;
+
+   obs = uc->args[2];
+   exp_lo = start->clock;
+   exp_hi = end->clock;
+
+   assert_flags(start);
+   assert_flags(end);
+
+   TEST_ASSERT(exp_lo <= obs && obs <= exp_hi,
+   "unexpected kvm-clock value: %"PRIu64" expected range: 
[%"PRIu64", %"PRIu64"]",
+   obs, exp_lo, exp_hi);
+
+   pr_info("kvm-clock value: %"PRIu64" expected range [%"PRIu64", 
%"PRIu64"]\n",
+   obs, exp_lo, exp_hi);
+}
+
+static void handle_abort(struct ucall *uc)
+{
+   TEST_FAIL("%s at %s:%ld", (const char *)uc->args[0],
+ __FILE__, uc->args[1]);
+}
+
+static void setup_clock(struct kvm_vm *vm, struct test_case *test_case)
+{
+   struct kvm_clock_data data;
+
+   memset(&data, 0, sizeof(data));
+
+   data.clock = test_case->kvmclock_base;
+   if (test_case->realtime_offset) {
+   struct timespec ts;
+   int r;
+
+   data.flags |= K

[PATCH v4 09/13] KVM: arm64: Allow userspace to configure a vCPU's virtual offset

2021-07-28 Thread Oliver Upton
Add a new vCPU attribute that allows userspace to directly manipulate
the virtual counter-timer offset. Exposing such an interface allows for
the precise migration of guest virtual counter-timers, as it is an
idempotent interface.

Uphold the existing behavior of writes to CNTVOFF_EL2 for this new
interface, wherein a write to a single vCPU is broadcasted to all vCPUs
within a VM.
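
One note on units: the offset is expressed in counter ticks, not nanoseconds
(the selftests simply pass large tick values). A VMM working in wall-clock
time would convert via CNTFRQ_EL0; an illustrative sketch (the intermediate
multiplication can overflow for very large deltas):

  static inline uint64_t ns_to_cnt_ticks(uint64_t ns)
  {
          uint64_t freq;

          /* CNTFRQ_EL0 reports the counter frequency in Hz */
          asm volatile("mrs %0, cntfrq_el0" : "=r"(freq));
          return ns * freq / 1000000000UL;
  }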

Reviewed-by: Andrew Jones 
Signed-off-by: Oliver Upton 
---
 Documentation/virt/kvm/devices/vcpu.rst | 22 
 arch/arm64/include/uapi/asm/kvm.h   |  1 +
 arch/arm64/kvm/arch_timer.c | 68 -
 3 files changed, 89 insertions(+), 2 deletions(-)

diff --git a/Documentation/virt/kvm/devices/vcpu.rst 
b/Documentation/virt/kvm/devices/vcpu.rst
index b46d5f742e69..7b57cba3416a 100644
--- a/Documentation/virt/kvm/devices/vcpu.rst
+++ b/Documentation/virt/kvm/devices/vcpu.rst
@@ -139,6 +139,28 @@ configured values on other VCPUs.  Userspace should 
configure the interrupt
 numbers on at least one VCPU after creating all VCPUs and before running any
 VCPUs.
 
+2.2. ATTRIBUTE: KVM_ARM_VCPU_TIMER_OFFSET_VTIMER
+
+
+:Parameters: Pointer to a 64-bit unsigned counter-timer offset.
+
+Returns:
+
+=== ==
+-EFAULT Error reading/writing the provided
+parameter address
+-ENXIO  Attribute not supported
+=== ==
+
+Specifies the guest's virtual counter-timer offset from the host's
+virtual counter. The guest's virtual counter is then derived by
+the following equation:
+
+  guest_cntvct = host_cntvct - KVM_ARM_VCPU_TIMER_OFFSET_VTIMER
+
+KVM does not allow the use of varying offset values for different vCPUs;
+the last written offset value will be broadcasted to all vCPUs in a VM.
+
 3. GROUP: KVM_ARM_VCPU_PVTIME_CTRL
 ==
 
diff --git a/arch/arm64/include/uapi/asm/kvm.h 
b/arch/arm64/include/uapi/asm/kvm.h
index b3edde68bc3e..008d0518d2b1 100644
--- a/arch/arm64/include/uapi/asm/kvm.h
+++ b/arch/arm64/include/uapi/asm/kvm.h
@@ -365,6 +365,7 @@ struct kvm_arm_copy_mte_tags {
 #define KVM_ARM_VCPU_TIMER_CTRL1
 #define   KVM_ARM_VCPU_TIMER_IRQ_VTIMER0
 #define   KVM_ARM_VCPU_TIMER_IRQ_PTIMER1
+#define   KVM_ARM_VCPU_TIMER_OFFSET_VTIMER 2
 #define KVM_ARM_VCPU_PVTIME_CTRL   2
 #define   KVM_ARM_VCPU_PVTIME_IPA  0
 
diff --git a/arch/arm64/kvm/arch_timer.c b/arch/arm64/kvm/arch_timer.c
index 3df67c127489..d2b1b13af658 100644
--- a/arch/arm64/kvm/arch_timer.c
+++ b/arch/arm64/kvm/arch_timer.c
@@ -1305,7 +1305,7 @@ static void set_timer_irqs(struct kvm *kvm, int 
vtimer_irq, int ptimer_irq)
}
 }
 
-int kvm_arm_timer_set_attr(struct kvm_vcpu *vcpu, struct kvm_device_attr *attr)
+int kvm_arm_timer_set_attr_irq(struct kvm_vcpu *vcpu, struct kvm_device_attr 
*attr)
 {
int __user *uaddr = (int __user *)(long)attr->addr;
struct arch_timer_context *vtimer = vcpu_vtimer(vcpu);
@@ -1338,7 +1338,39 @@ int kvm_arm_timer_set_attr(struct kvm_vcpu *vcpu, struct 
kvm_device_attr *attr)
return 0;
 }
 
-int kvm_arm_timer_get_attr(struct kvm_vcpu *vcpu, struct kvm_device_attr *attr)
+int kvm_arm_timer_set_attr_offset(struct kvm_vcpu *vcpu, struct 
kvm_device_attr *attr)
+{
+   u64 __user *uaddr = (u64 __user *)(long)attr->addr;
+   u64 offset;
+
+   if (get_user(offset, uaddr))
+   return -EFAULT;
+
+   switch (attr->attr) {
+   case KVM_ARM_VCPU_TIMER_OFFSET_VTIMER:
+   update_vtimer_cntvoff(vcpu, offset);
+   break;
+   default:
+   return -ENXIO;
+   }
+
+   return 0;
+}
+
+int kvm_arm_timer_set_attr(struct kvm_vcpu *vcpu, struct kvm_device_attr *attr)
+{
+   switch (attr->attr) {
+   case KVM_ARM_VCPU_TIMER_IRQ_VTIMER:
+   case KVM_ARM_VCPU_TIMER_IRQ_PTIMER:
+   return kvm_arm_timer_set_attr_irq(vcpu, attr);
+   case KVM_ARM_VCPU_TIMER_OFFSET_VTIMER:
+   return kvm_arm_timer_set_attr_offset(vcpu, attr);
+   }
+
+   return -ENXIO;
+}
+
+int kvm_arm_timer_get_attr_irq(struct kvm_vcpu *vcpu, struct kvm_device_attr 
*attr)
 {
int __user *uaddr = (int __user *)(long)attr->addr;
struct arch_timer_context *timer;
@@ -1359,11 +1391,43 @@ int kvm_arm_timer_get_attr(struct kvm_vcpu *vcpu, 
struct kvm_device_attr *attr)
return put_user(irq, uaddr);
 }
 
+int kvm_arm_timer_get_attr_offset(struct kvm_vcpu *vcpu, struct 
kvm_device_attr *attr)
+{
+   u64 __user *uaddr = (u64 __user *)(long)attr->addr;
+   struct arch_timer_context *timer;
+   u64 offset;
+
+   switch (attr->attr) {
+   case KVM_ARM_VCPU_TIMER_OFFSET_VTIMER:
+   timer = vcpu_vtimer(vcpu);
+   break;
+   default:
+   return -ENXIO;
+   }
+
+   offset = timer_get_offset

[PATCH v4 11/13] KVM: arm64: Provide userspace access to the physical counter offset

2021-07-28 Thread Oliver Upton
Presently, KVM provides no facilities for correctly migrating a guest
that depends on the physical counter-timer. While most guests (barring
NV, of course) should not depend on the physical counter-timer, an
operator may still wish to provide a consistent view of the physical
counter-timer across migrations.

Provide userspace with a new vCPU attribute to modify the guest physical
counter-timer offset. Since the base architecture doesn't provide a
physical counter-timer offset register, emulate the correct behavior by
trapping accesses to the physical counter-timer whenever the offset
value is non-zero.

Uphold the same behavior as CNTVOFF_EL2 and broadcast the physical
offset to all vCPUs whenever written. This guarantees that the
counter-timer we provide the guest remains architectural, wherein all
views of the counter-timer are consistent across vCPUs. Reconfigure
timer traps for VHE on every guest entry, as different VMs will now have
different traps enabled. Enable physical counter traps for nVHE whenever
the offset is nonzero (we already trap physical timer registers in
nVHE).

FEAT_ECV provides a guest physical counter-timer offset register
(CNTPOFF_EL2), but ECV-enabled hardware is nonexistent at the time of
writing so support for it was elided for the sake of the author :)
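
Roughly speaking, the trap handler only needs to apply the documented
relation (guest_cntpct = host_cntvct - offset) to a trapped CNTPCT_EL0 read.
An illustrative sketch, not the exact hyp code in this patch (function and
storage names are placeholders):

  static bool emulate_cntpct_read(struct kvm_vcpu *vcpu, int rt)
  {
          /* Hardware counter adjusted by the guest's physical offset */
          u64 val = read_sysreg(cntvct_el0) - __vcpu_sys_reg(vcpu, CNTPOFF_EL2);

          vcpu_set_reg(vcpu, rt, val);
          return true;
  }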

Cc: Andrew Jones 
Signed-off-by: Oliver Upton 
---
 Documentation/virt/kvm/devices/vcpu.rst   | 22 +++
 arch/arm64/include/asm/kvm_host.h |  1 +
 arch/arm64/include/asm/kvm_hyp.h  |  2 -
 arch/arm64/include/asm/sysreg.h   |  1 +
 arch/arm64/include/uapi/asm/kvm.h |  1 +
 arch/arm64/kvm/arch_timer.c   | 72 ++-
 arch/arm64/kvm/arm.c  |  4 +-
 arch/arm64/kvm/hyp/include/hyp/switch.h   | 23 
 arch/arm64/kvm/hyp/include/hyp/timer-sr.h | 26 
 arch/arm64/kvm/hyp/nvhe/switch.c  |  2 -
 arch/arm64/kvm/hyp/nvhe/timer-sr.c| 21 +++
 arch/arm64/kvm/hyp/vhe/timer-sr.c | 27 +
 include/kvm/arm_arch_timer.h  |  2 -
 13 files changed, 158 insertions(+), 46 deletions(-)
 create mode 100644 arch/arm64/kvm/hyp/include/hyp/timer-sr.h

diff --git a/Documentation/virt/kvm/devices/vcpu.rst 
b/Documentation/virt/kvm/devices/vcpu.rst
index 7b57cba3416a..a8547ce09b47 100644
--- a/Documentation/virt/kvm/devices/vcpu.rst
+++ b/Documentation/virt/kvm/devices/vcpu.rst
@@ -161,6 +161,28 @@ the following equation:
 KVM does not allow the use of varying offset values for different vCPUs;
 the last written offset value will be broadcasted to all vCPUs in a VM.
 
+2.3. ATTRIBUTE: KVM_ARM_VCPU_TIMER_OFFSET_PTIMER
+
+
+:Parameters: Pointer to a 64-bit unsigned counter-timer offset.
+
+Returns:
+
+ === ==
+ -EFAULT Error reading/writing the provided
+ parameter address
+ -ENXIO  Attribute not supported
+ === ==
+
+Specifies the guest's physical counter-timer offset from the host's
+virtual counter. The guest's physical counter is then derived by
+the following equation:
+
+  guest_cntpct = host_cntvct - KVM_ARM_VCPU_TIMER_OFFSET_PTIMER
+
+KVM does not allow the use of varying offset values for different vCPUs;
+the last written offset value will be broadcasted to all vCPUs in a VM.
+
 3. GROUP: KVM_ARM_VCPU_PVTIME_CTRL
 ==
 
diff --git a/arch/arm64/include/asm/kvm_host.h 
b/arch/arm64/include/asm/kvm_host.h
index 41911585ae0c..de92fa678924 100644
--- a/arch/arm64/include/asm/kvm_host.h
+++ b/arch/arm64/include/asm/kvm_host.h
@@ -204,6 +204,7 @@ enum vcpu_sysreg {
SP_EL1,
SPSR_EL1,
 
+   CNTPOFF_EL2,
CNTVOFF_EL2,
CNTV_CVAL_EL0,
CNTV_CTL_EL0,
diff --git a/arch/arm64/include/asm/kvm_hyp.h b/arch/arm64/include/asm/kvm_hyp.h
index 9d60b3006efc..01eb3864e50f 100644
--- a/arch/arm64/include/asm/kvm_hyp.h
+++ b/arch/arm64/include/asm/kvm_hyp.h
@@ -65,10 +65,8 @@ void __vgic_v3_save_aprs(struct vgic_v3_cpu_if *cpu_if);
 void __vgic_v3_restore_aprs(struct vgic_v3_cpu_if *cpu_if);
 int __vgic_v3_perform_cpuif_access(struct kvm_vcpu *vcpu);
 
-#ifdef __KVM_NVHE_HYPERVISOR__
 void __timer_enable_traps(struct kvm_vcpu *vcpu);
 void __timer_disable_traps(struct kvm_vcpu *vcpu);
-#endif
 
 #ifdef __KVM_NVHE_HYPERVISOR__
 void __sysreg_save_state_nvhe(struct kvm_cpu_context *ctxt);
diff --git a/arch/arm64/include/asm/sysreg.h b/arch/arm64/include/asm/sysreg.h
index 7b9c3acba684..d3890a44b7f7 100644
--- a/arch/arm64/include/asm/sysreg.h
+++ b/arch/arm64/include/asm/sysreg.h
@@ -505,6 +505,7 @@
 #define SYS_AMEVCNTR0_MEM_STALLSYS_AMEVCNTR0_EL0(3)
 
 #define SYS_CNTFRQ_EL0 sys_reg(3, 3, 14, 0, 0)
+#define SYS_CNTPCT_EL0 sys_reg(3, 3, 14, 0, 1)
 
 #define SYS_CNTP_TVAL_EL0  sys_reg(3, 3, 14, 2, 0)
 #define SYS_CNTP_CTL_EL0  

[PATCH v4 04/13] tools: arch: x86: pull in pvclock headers

2021-07-28 Thread Oliver Upton
Copy over approximately clean versions of the pvclock headers into
tools. Resolve references to headers/symbols that are missing under
tools/ and are not needed.
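
For readers unfamiliar with the ABI, the version-field protocol described in
pvclock-abi.h is consumed like this (a sketch using the helpers declared in
pvclock.h; rdtsc() is assumed to come from the selftest/arch headers):

  static u64 read_pvclock(struct pvclock_vcpu_time_info *pvti)
  {
          unsigned version;
          u64 cycles;

          /* Retry until a consistent snapshot is observed (stable, even version) */
          do {
                  version = pvclock_read_begin(pvti);
                  cycles = __pvclock_read_cycles(pvti, rdtsc());
          } while (pvclock_read_retry(pvti, version));

          return cycles;
  }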

Signed-off-by: Oliver Upton 
---
 tools/arch/x86/include/asm/pvclock-abi.h |  48 +++
 tools/arch/x86/include/asm/pvclock.h | 103 +++
 2 files changed, 151 insertions(+)
 create mode 100644 tools/arch/x86/include/asm/pvclock-abi.h
 create mode 100644 tools/arch/x86/include/asm/pvclock.h

diff --git a/tools/arch/x86/include/asm/pvclock-abi.h 
b/tools/arch/x86/include/asm/pvclock-abi.h
new file mode 100644
index ..1436226efe3e
--- /dev/null
+++ b/tools/arch/x86/include/asm/pvclock-abi.h
@@ -0,0 +1,48 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_X86_PVCLOCK_ABI_H
+#define _ASM_X86_PVCLOCK_ABI_H
+#ifndef __ASSEMBLY__
+
+/*
+ * These structs MUST NOT be changed.
+ * They are the ABI between hypervisor and guest OS.
+ * Both Xen and KVM are using this.
+ *
+ * pvclock_vcpu_time_info holds the system time and the tsc timestamp
+ * of the last update. So the guest can use the tsc delta to get a
+ * more precise system time.  There is one per virtual cpu.
+ *
+ * pvclock_wall_clock references the point in time when the system
+ * time was zero (usually boot time), thus the guest calculates the
+ * current wall clock by adding the system time.
+ *
+ * Protocol for the "version" fields is: hypervisor raises it (making
+ * it uneven) before it starts updating the fields and raises it again
+ * (making it even) when it is done.  Thus the guest can make sure the
+ * time values it got are consistent by checking the version before
+ * and after reading them.
+ */
+
+struct pvclock_vcpu_time_info {
+   u32   version;
+   u32   pad0;
+   u64   tsc_timestamp;
+   u64   system_time;
+   u32   tsc_to_system_mul;
+   s8tsc_shift;
+   u8flags;
+   u8pad[2];
+} __attribute__((__packed__)); /* 32 bytes */
+
+struct pvclock_wall_clock {
+   u32   version;
+   u32   sec;
+   u32   nsec;
+} __attribute__((__packed__));
+
+#define PVCLOCK_TSC_STABLE_BIT (1 << 0)
+#define PVCLOCK_GUEST_STOPPED  (1 << 1)
+/* PVCLOCK_COUNTS_FROM_ZERO broke ABI and can't be used anymore. */
+#define PVCLOCK_COUNTS_FROM_ZERO (1 << 2)
+#endif /* __ASSEMBLY__ */
+#endif /* _ASM_X86_PVCLOCK_ABI_H */
diff --git a/tools/arch/x86/include/asm/pvclock.h 
b/tools/arch/x86/include/asm/pvclock.h
new file mode 100644
index ..2628f9a6330b
--- /dev/null
+++ b/tools/arch/x86/include/asm/pvclock.h
@@ -0,0 +1,103 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_X86_PVCLOCK_H
+#define _ASM_X86_PVCLOCK_H
+
+#include 
+#include 
+
+/* some helper functions for xen and kvm pv clock sources */
+u64 pvclock_clocksource_read(struct pvclock_vcpu_time_info *src);
+u8 pvclock_read_flags(struct pvclock_vcpu_time_info *src);
+void pvclock_set_flags(u8 flags);
+unsigned long pvclock_tsc_khz(struct pvclock_vcpu_time_info *src);
+void pvclock_resume(void);
+
+void pvclock_touch_watchdogs(void);
+
+static __always_inline
+unsigned pvclock_read_begin(const struct pvclock_vcpu_time_info *src)
+{
+   unsigned version = src->version & ~1;
+   /* Make sure that the version is read before the data. */
+   rmb();
+   return version;
+}
+
+static __always_inline
+bool pvclock_read_retry(const struct pvclock_vcpu_time_info *src,
+   unsigned version)
+{
+   /* Make sure that the version is re-read after the data. */
+   rmb();
+   return version != src->version;
+}
+
+/*
+ * Scale a 64-bit delta by scaling and multiplying by a 32-bit fraction,
+ * yielding a 64-bit result.
+ */
+static inline u64 pvclock_scale_delta(u64 delta, u32 mul_frac, int shift)
+{
+   u64 product;
+#ifdef __i386__
+   u32 tmp1, tmp2;
+#else
+   unsigned long tmp;
+#endif
+
+   if (shift < 0)
+   delta >>= -shift;
+   else
+   delta <<= shift;
+
+#ifdef __i386__
+   __asm__ (
+   "mul  %5   ; "
+   "mov  %4,%%eax ; "
+   "mov  %%edx,%4 ; "
+   "mul  %5   ; "
+   "xor  %5,%5; "
+   "add  %4,%%eax ; "
+   "adc  %5,%%edx ; "
+   : "=A" (product), "=r" (tmp1), "=r" (tmp2)
+   : "a" ((u32)delta), "1" ((u32)(delta >> 32)), "2" (mul_frac) );
+#elif defined(__x86_64__)
+   __asm__ (
+   "mulq %[mul_frac] ; shrd $32, %[hi], %[lo]"
+   : [lo]"=a"(product),
+ [hi]"=d"(tmp)
+   : "0"(delta),
+ [mul_frac]"rm"((u64)mul_frac));
+#else
+#error implement me!
+#endif
+
+   return product;
+}
+
+static __always_inline
+u64 __pvclock_read_cycles(const struct pvclock_vcpu_time_info *src, u64 tsc)
+{
+   u64 delta = tsc - src->tsc_timestamp;
+   u64 offset = pvclock_scale_delta(delta, src->tsc_to_system_mul,
+src->tsc_shift);
+   return src-

[PATCH v4 06/13] selftests: KVM: Fix kvm device helper ioctl assertions

2021-07-28 Thread Oliver Upton
The KVM_CREATE_DEVICE and KVM_{GET,SET}_DEVICE_ATTR ioctls are defined
to return a value of zero on success. As such, tighten the assertions in
the helper functions to only pass if the return code is zero.

Suggested-by: Andrew Jones 
Signed-off-by: Oliver Upton 
---
 tools/testing/selftests/kvm/lib/kvm_util.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/tools/testing/selftests/kvm/lib/kvm_util.c 
b/tools/testing/selftests/kvm/lib/kvm_util.c
index 10a8ed691c66..0ffc2d39c80d 100644
--- a/tools/testing/selftests/kvm/lib/kvm_util.c
+++ b/tools/testing/selftests/kvm/lib/kvm_util.c
@@ -1984,7 +1984,7 @@ int kvm_device_check_attr(int dev_fd, uint32_t group, 
uint64_t attr)
 {
int ret = _kvm_device_check_attr(dev_fd, group, attr);
 
-   TEST_ASSERT(ret >= 0, "KVM_HAS_DEVICE_ATTR failed, rc: %i errno: %i", 
ret, errno);
+   TEST_ASSERT(!ret, "KVM_HAS_DEVICE_ATTR failed, rc: %i errno: %i", ret, 
errno);
return ret;
 }
 
@@ -2008,7 +2008,7 @@ int kvm_create_device(struct kvm_vm *vm, uint64_t type, 
bool test)
ret = _kvm_create_device(vm, type, test, &fd);
 
if (!test) {
-   TEST_ASSERT(ret >= 0,
+   TEST_ASSERT(!ret,
"KVM_CREATE_DEVICE IOCTL failed, rc: %i errno: %i", 
ret, errno);
return fd;
}
@@ -2036,7 +2036,7 @@ int kvm_device_access(int dev_fd, uint32_t group, 
uint64_t attr,
 {
int ret = _kvm_device_access(dev_fd, group, attr, val, write);
 
-   TEST_ASSERT(ret >= 0, "KVM_SET|GET_DEVICE_ATTR IOCTL failed, rc: %i 
errno: %i", ret, errno);
+   TEST_ASSERT(!ret, "KVM_SET|GET_DEVICE_ATTR IOCTL failed, rc: %i errno: 
%i", ret, errno);
return ret;
 }
 
-- 
2.32.0.432.gabb21c7263-goog



[PATCH v4 00/13] KVM: Add idempotent controls for migrating system counter state

2021-07-28 Thread Oliver Upton
KVM's current means of saving/restoring system counters is plagued with
temporal issues. At least on ARM64 and x86, we migrate the guest's
system counter by-value through the respective guest system register
values (cntvct_el0, ia32_tsc). Restoring system counters by-value is
brittle as the state is not idempotent: the host system counter is still
oscillating between the attempted save and restore. Furthermore, VMMs
may wish to transparently live migrate guest VMs, meaning that they
include the elapsed time due to live migration blackout in the guest
system counter view. The VMM thread could be preempted for any number of
reasons (scheduler, L0 hypervisor under nested) between the time that
it calculates the desired guest counter value and when KVM actually sets
this counter state.

Despite the value-based interface that we present to userspace, KVM
actually has idempotent guest controls by way of system counter offsets.
We can avoid all of the issues associated with a value-based interface
by abstracting these offset controls in new ioctls. This series
introduces new vCPU device attributes to provide userspace access to the
vCPU's system counter offset.

Patch 1 adopts Paolo's suggestion, augmenting the KVM_{GET,SET}_CLOCK
ioctls to provide userspace with a (host_tsc, realtime) instant. This is
essential for a VMM to perform precise migration of the guest's system
counters.

Patches 2-3 add support for x86 by shoehorning the new controls into the
pre-existing synchronization heuristics.

Patches 4-5 implement a test for the new additions to
KVM_{GET,SET}_CLOCK.

Patch 6 fixes some assertions in the kvm device attribute helpers.

Patches 7-8 implement a test for the TSC offset attribute introduced in
patch 3.

Patch 9 adds a device attribute for the arm64 virtual counter-timer
offset.

Patch 10 extends the test from patch 8 to cover the arm64 virtual
counter-timer offset.

Patch 11 adds a device attribute for the arm64 physical counter-timer
offset. Currently, this is implemented as a synthetic register, forcing
the guest to trap to the host and emulating the offset in the fast exit
path. Later down the line we will have hardware with FEAT_ECV, which
allows the hypervisor to perform physical counter-timer offsetting in
hardware (CNTPOFF_EL2).

Patch 12 extends the test from patch 8 to cover the arm64 physical
counter-timer offset.

Patch 13 introduces a benchmark to measure the overhead of emulation in
patch 11.

This series was tested on both an Ampere Mt. Jade and Haswell systems.

Physical counter benchmark
--

The following data was collected by running 1 iterations of the
benchmark test from Patch 13 on an Ampere Mt. Jade reference server, a 2S
machine with 2 80-core Ampere Altra SoCs. Measurements were collected
for both VHE and nVHE operation using the `kvm-arm.mode=` command-line
parameter.

nVHE
----

+--------------------+--------+---------+
|       Metric       | Native | Trapped |
+--------------------+--------+---------+
| Average            | 54ns   | 148ns   |
| Standard Deviation | 124ns  | 122ns   |
| 95th Percentile    | 258ns  | 348ns   |
+--------------------+--------+---------+

VHE
---

+--------------------+--------+---------+
|       Metric       | Native | Trapped |
+--------------------+--------+---------+
| Average            | 53ns   | 152ns   |
| Standard Deviation | 92ns   | 94ns    |
| 95th Percentile    | 204ns  | 307ns   |
+--------------------+--------+---------+

This series applies cleanly to kvm/queue at the following commit:

8ad5e63649ff ("KVM: Don't take mmu_lock for range invalidation unless 
necessary")

v1 -> v2:
  - Reimplemented as vCPU device attributes instead of a distinct ioctl.
  - Added the (realtime, host_tsc) instant support to KVM_{GET,SET}_CLOCK
  - Changed the arm64 implementation to broadcast counter
offset values to all vCPUs in a guest. This upholds the
architectural expectations of a consistent counter-timer across CPUs.
  - Fixed a bug with traps in VHE mode. We now configure traps on every
transition into a guest to handle differing VMs (trapped, emulated).

v2 -> v3:
  - Added documentation for additions to KVM_{GET,SET}_CLOCK
  - Added documentation for all new vCPU attributes
  - Added documentation for suggested algorithm to migrate a guest's
TSC(s)
  - Bug fixes throughout series
  - Rename KVM_CLOCK_REAL_TIME -> KVM_CLOCK_REALTIME

v3 -> v4:
  - Added patch to address incorrect device helper assertions (Drew)
  - Carried Drew's r-b tags where appropriate
  - x86 selftest cleanup
  - Removed stale kvm_timer_init_vhe() function
  - Removed unnecessary GUEST_DONE() from selftests

v1: https://lore.kernel.org/kvm/20210608214742.1897483-1-oup...@google.com/
v2: https://lore.kernel.org/r/20210716212629.2232756-1-oup...@google.com
v3: https://lore.kernel.org/r/20210719184949.1385910-1-oup...@google.com

Oliver Upton (13):
  KVM: x86: Report host tsc and realtime values in KVM_GET_CLOCK
  KVM: x86: Refac

[PATCH v4 01/13] KVM: x86: Report host tsc and realtime values in KVM_GET_CLOCK

2021-07-28 Thread Oliver Upton
Handling the migration of TSCs correctly is difficult, in part because
Linux does not provide userspace with the ability to retrieve a (TSC,
realtime) clock pair for a single instant in time. In lieu of a more
convenient facility, KVM can report similar information in the kvm_clock
structure.

Provide userspace with a host TSC & realtime pair iff the realtime clock
is based on the TSC. If userspace provides KVM_SET_CLOCK with a valid
realtime value, advance the KVM clock by the amount of elapsed time. Do
not step the KVM clock backwards, though, as it is a monotonic
oscillator.
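
The practical win for userspace is that a single ioctl now yields a
self-consistent triplet. A sketch of how a VMM might consume it (vm_fd is the
VM file descriptor; record_checkpoint() is a placeholder, not part of this
series):

  struct kvm_clock_data data = { 0 };

  if (!ioctl(vm_fd, KVM_GET_CLOCK, &data) &&
      (data.flags & KVM_CLOCK_REALTIME) && (data.flags & KVM_CLOCK_HOST_TSC)) {
          /* kvmclock, realtime and host TSC all sampled at the same instant */
          record_checkpoint(data.clock, data.realtime, data.host_tsc);
  }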

Signed-off-by: Oliver Upton 
---
 Documentation/virt/kvm/api.rst  |  42 +++--
 arch/x86/include/asm/kvm_host.h |   3 +
 arch/x86/kvm/x86.c  | 149 
 include/uapi/linux/kvm.h|   7 +-
 4 files changed, 137 insertions(+), 64 deletions(-)

diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst
index dae68e68ca23..8d4a3471ad9e 100644
--- a/Documentation/virt/kvm/api.rst
+++ b/Documentation/virt/kvm/api.rst
@@ -993,20 +993,34 @@ such as migration.
 When KVM_CAP_ADJUST_CLOCK is passed to KVM_CHECK_EXTENSION, it returns the
 set of bits that KVM can return in struct kvm_clock_data's flag member.
 
-The only flag defined now is KVM_CLOCK_TSC_STABLE.  If set, the returned
-value is the exact kvmclock value seen by all VCPUs at the instant
-when KVM_GET_CLOCK was called.  If clear, the returned value is simply
-CLOCK_MONOTONIC plus a constant offset; the offset can be modified
-with KVM_SET_CLOCK.  KVM will try to make all VCPUs follow this clock,
-but the exact value read by each VCPU could differ, because the host
-TSC is not stable.
+FLAGS:
+
+KVM_CLOCK_TSC_STABLE.  If set, the returned value is the exact kvmclock
+value seen by all VCPUs at the instant when KVM_GET_CLOCK was called.
+If clear, the returned value is simply CLOCK_MONOTONIC plus a constant
+offset; the offset can be modified with KVM_SET_CLOCK.  KVM will try
+to make all VCPUs follow this clock, but the exact value read by each
+VCPU could differ, because the host TSC is not stable.
+
+KVM_CLOCK_REALTIME.  If set, the `realtime` field in the kvm_clock_data
+structure is populated with the value of the host's real time
+clocksource at the instant when KVM_GET_CLOCK was called. If clear,
+the `realtime` field does not contain a value.
+
+KVM_CLOCK_HOST_TSC.  If set, the `host_tsc` field in the kvm_clock_data
+structure is populated with the value of the host's timestamp counter (TSC)
+at the instant when KVM_GET_CLOCK was called. If clear, the `host_tsc` field
+does not contain a value.
 
 ::
 
   struct kvm_clock_data {
__u64 clock;  /* kvmclock current value */
__u32 flags;
-   __u32 pad[9];
+   __u32 pad0;
+   __u64 realtime;
+   __u64 host_tsc;
+   __u32 pad[4];
   };
 
 
@@ -1023,12 +1037,22 @@ Sets the current timestamp of kvmclock to the value 
specified in its parameter.
 In conjunction with KVM_GET_CLOCK, it is used to ensure monotonicity on 
scenarios
 such as migration.
 
+FLAGS:
+
+KVM_CLOCK_REALTIME.  If set, KVM will compare the value of the `realtime` field
+with the value of the host's real time clocksource at the instant when
+KVM_SET_CLOCK was called. The difference in elapsed time is added to the final
+kvmclock value that will be provided to guests.
+
 ::
 
   struct kvm_clock_data {
__u64 clock;  /* kvmclock current value */
__u32 flags;
-   __u32 pad[9];
+   __u32 pad0;
+   __u64 realtime;
+   __u64 host_tsc;
+   __u32 pad[4];
   };
 
 
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 99f37781a6fc..65a20c64f959 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -1932,4 +1932,7 @@ int kvm_cpu_dirty_log_size(void);
 
 int alloc_all_memslots_rmaps(struct kvm *kvm);
 
+#define KVM_CLOCK_VALID_FLAGS  \
+   (KVM_CLOCK_TSC_STABLE | KVM_CLOCK_REALTIME | KVM_CLOCK_HOST_TSC)
+
 #endif /* _ASM_X86_KVM_HOST_H */
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 916c976e99ab..e052c7afaac4 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -2782,17 +2782,24 @@ static void kvm_gen_update_masterclock(struct kvm *kvm)
 #endif
 }
 
-u64 get_kvmclock_ns(struct kvm *kvm)
+/*
+ * Returns true if realtime and TSC values were written back to the caller.
+ * Returns false if a clock triplet cannot be obtained, such as if the host's
+ * realtime clock is not based on the TSC.
+ */
+static bool get_kvmclock_and_realtime(struct kvm *kvm, u64 *kvmclock_ns,
+ u64 *realtime_ns, u64 *tsc)
 {
struct kvm_arch *ka = &kvm->arch;
struct pvclock_vcpu_time_info hv_clock;
unsigned long flags;
-   u64 ret;
+   bool ret = false;
 
spin_lock_irqsave(&ka->pvclock_gtod_sync_lock, flags);
if (!ka->use_master_clock) {
spin_un

[PATCH v4 12/13] selftests: KVM: Test physical counter offsetting

2021-07-28 Thread Oliver Upton
Test that userspace adjustment of the guest physical counter-timer
results in the correct view of the counter within the guest.

Reviewed-by: Andrew Jones 
Signed-off-by: Oliver Upton 
---
 .../selftests/kvm/include/aarch64/processor.h | 12 
 .../kvm/system_counter_offset_test.c  | 29 ---
 2 files changed, 37 insertions(+), 4 deletions(-)

diff --git a/tools/testing/selftests/kvm/include/aarch64/processor.h 
b/tools/testing/selftests/kvm/include/aarch64/processor.h
index 3168cdbae6ee..7f53d90e9512 100644
--- a/tools/testing/selftests/kvm/include/aarch64/processor.h
+++ b/tools/testing/selftests/kvm/include/aarch64/processor.h
@@ -141,4 +141,16 @@ static inline uint64_t read_cntvct_ordered(void)
return r;
 }
 
+static inline uint64_t read_cntpct_ordered(void)
+{
+   uint64_t r;
+
+   __asm__ __volatile__("isb\n\t"
+"mrs %0, cntpct_el0\n\t"
+"isb\n\t"
+: "=r"(r));
+
+   return r;
+}
+
 #endif /* SELFTEST_KVM_PROCESSOR_H */
diff --git a/tools/testing/selftests/kvm/system_counter_offset_test.c 
b/tools/testing/selftests/kvm/system_counter_offset_test.c
index 25806cdd31ef..ef215fb43657 100644
--- a/tools/testing/selftests/kvm/system_counter_offset_test.c
+++ b/tools/testing/selftests/kvm/system_counter_offset_test.c
@@ -57,6 +57,7 @@ static uint64_t host_read_guest_system_counter(struct 
test_case *test)
 
 enum arch_counter {
VIRTUAL,
+   PHYSICAL,
 };
 
 struct test_case {
@@ -68,23 +69,41 @@ static struct test_case test_cases[] = {
{ .counter = VIRTUAL, .offset = 0 },
{ .counter = VIRTUAL, .offset = 180 * NSEC_PER_SEC },
{ .counter = VIRTUAL, .offset = -180 * NSEC_PER_SEC },
+   { .counter = PHYSICAL, .offset = 0 },
+   { .counter = PHYSICAL, .offset = 180 * NSEC_PER_SEC },
+   { .counter = PHYSICAL, .offset = -180 * NSEC_PER_SEC },
 };
 
 static void check_preconditions(struct kvm_vm *vm)
 {
if (!_vcpu_has_device_attr(vm, VCPU_ID, KVM_ARM_VCPU_TIMER_CTRL,
-  KVM_ARM_VCPU_TIMER_OFFSET_VTIMER))
+  KVM_ARM_VCPU_TIMER_OFFSET_VTIMER) &&
+   !_vcpu_has_device_attr(vm, VCPU_ID, KVM_ARM_VCPU_TIMER_CTRL,
+  KVM_ARM_VCPU_TIMER_OFFSET_PTIMER))
return;
 
-   print_skip("KVM_ARM_VCPU_TIMER_OFFSET_VTIMER not supported; skipping 
test");
+   print_skip("KVM_ARM_VCPU_TIMER_OFFSET_{VTIMER,PTIMER} not supported; 
skipping test");
exit(KSFT_SKIP);
 }
 
 static void setup_system_counter(struct kvm_vm *vm, struct test_case *test)
 {
+   u64 attr = 0;
+
+   switch (test->counter) {
+   case VIRTUAL:
+   attr = KVM_ARM_VCPU_TIMER_OFFSET_VTIMER;
+   break;
+   case PHYSICAL:
+   attr = KVM_ARM_VCPU_TIMER_OFFSET_PTIMER;
+   break;
+   default:
+   TEST_ASSERT(false, "unrecognized counter index %u",
+   test->counter);
+   }
+
vcpu_access_device_attr(vm, VCPU_ID, KVM_ARM_VCPU_TIMER_CTRL,
-   KVM_ARM_VCPU_TIMER_OFFSET_VTIMER, &test->offset,
-   true);
+   attr, &test->offset, true);
 }
 
 static uint64_t guest_read_system_counter(struct test_case *test)
@@ -92,6 +111,8 @@ static uint64_t guest_read_system_counter(struct test_case 
*test)
switch (test->counter) {
case VIRTUAL:
return read_cntvct_ordered();
+   case PHYSICAL:
+   return read_cntpct_ordered();
default:
GUEST_ASSERT(0);
}
-- 
2.32.0.432.gabb21c7263-goog



[PATCH v4 02/13] KVM: x86: Refactor tsc synchronization code

2021-07-28 Thread Oliver Upton
Refactor kvm_synchronize_tsc to make a new function that allows callers
to specify TSC parameters (offset, value, nanoseconds, etc.) explicitly
for the sake of participating in TSC synchronization.

This changes the locking semantics around TSC writes. Writes to the TSC
will now take the pvclock gtod lock while holding the tsc write lock,
whereas before these locks were disjoint.

Reviewed-by: David Matlack 
Signed-off-by: Oliver Upton 
---
 Documentation/virt/kvm/locking.rst |  11 +++
 arch/x86/kvm/x86.c | 106 +
 2 files changed, 74 insertions(+), 43 deletions(-)

diff --git a/Documentation/virt/kvm/locking.rst 
b/Documentation/virt/kvm/locking.rst
index 8138201efb09..0bf346adac2a 100644
--- a/Documentation/virt/kvm/locking.rst
+++ b/Documentation/virt/kvm/locking.rst
@@ -36,6 +36,9 @@ On x86:
   holding kvm->arch.mmu_lock (typically with ``read_lock``, otherwise
   there's no need to take kvm->arch.tdp_mmu_pages_lock at all).
 
+- kvm->arch.tsc_write_lock is taken outside
+  kvm->arch.pvclock_gtod_sync_lock
+
 Everything else is a leaf: no other lock is taken inside the critical
 sections.
 
@@ -222,6 +225,14 @@ time it will be set using the Dirty tracking mechanism 
described above.
 :Comment:  'raw' because hardware enabling/disabling must be atomic /wrt
migration.
 
+:Name: kvm_arch::pvclock_gtod_sync_lock
+:Type: raw_spinlock_t
+:Arch: x86
+:Protects: kvm_arch::{cur_tsc_generation,cur_tsc_nsec,cur_tsc_write,
+   cur_tsc_offset,nr_vcpus_matched_tsc}
+:Comment:  'raw' because updating the kvm master clock must not be
+   preempted.
+
 :Name: kvm_arch::tsc_write_lock
 :Type: raw_spinlock
 :Arch: x86
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index e052c7afaac4..27435a07fb46 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -2443,13 +2443,73 @@ static inline bool kvm_check_tsc_unstable(void)
return check_tsc_unstable();
 }
 
+/*
+ * Infers attempts to synchronize the guest's tsc from host writes. Sets the
+ * offset for the vcpu and tracks the TSC matching generation that the vcpu
+ * participates in.
+ *
+ * Must hold kvm->arch.tsc_write_lock to call this function.
+ */
+static void __kvm_synchronize_tsc(struct kvm_vcpu *vcpu, u64 offset, u64 tsc,
+ u64 ns, bool matched)
+{
+   struct kvm *kvm = vcpu->kvm;
+   bool already_matched;
+   unsigned long flags;
+
+   lockdep_assert_held(&kvm->arch.tsc_write_lock);
+
+   already_matched =
+  (vcpu->arch.this_tsc_generation == kvm->arch.cur_tsc_generation);
+
+   /*
+* We track the most recent recorded KHZ, write and time to
+* allow the matching interval to be extended at each write.
+*/
+   kvm->arch.last_tsc_nsec = ns;
+   kvm->arch.last_tsc_write = tsc;
+   kvm->arch.last_tsc_khz = vcpu->arch.virtual_tsc_khz;
+
+   vcpu->arch.last_guest_tsc = tsc;
+
+   /* Keep track of which generation this VCPU has synchronized to */
+   vcpu->arch.this_tsc_generation = kvm->arch.cur_tsc_generation;
+   vcpu->arch.this_tsc_nsec = kvm->arch.cur_tsc_nsec;
+   vcpu->arch.this_tsc_write = kvm->arch.cur_tsc_write;
+
+   kvm_vcpu_write_tsc_offset(vcpu, offset);
+
+   spin_lock_irqsave(&kvm->arch.pvclock_gtod_sync_lock, flags);
+   if (!matched) {
+   /*
+* We split periods of matched TSC writes into generations.
+* For each generation, we track the original measured
+* nanosecond time, offset, and write, so if TSCs are in
+* sync, we can match exact offset, and if not, we can match
+* exact software computation in compute_guest_tsc()
+*
+* These values are tracked in kvm->arch.cur_xxx variables.
+*/
+   kvm->arch.nr_vcpus_matched_tsc = 0;
+   kvm->arch.cur_tsc_generation++;
+   kvm->arch.cur_tsc_nsec = ns;
+   kvm->arch.cur_tsc_write = tsc;
+   kvm->arch.cur_tsc_offset = offset;
+   matched = false;
+   } else if (!already_matched) {
+   kvm->arch.nr_vcpus_matched_tsc++;
+   }
+
+   kvm_track_tsc_matching(vcpu);
+   spin_unlock_irqrestore(&kvm->arch.pvclock_gtod_sync_lock, flags);
+}
+
 static void kvm_synchronize_tsc(struct kvm_vcpu *vcpu, u64 data)
 {
struct kvm *kvm = vcpu->kvm;
u64 offset, ns, elapsed;
unsigned long flags;
-   bool matched;
-   bool already_matched;
+   bool matched = false;
bool synchronizing = false;
 
raw_spin_lock_irqsave(&kvm->arch.tsc_write_lock, flags);
@@ -2495,51 +2555,11 @@ static void kvm_synchronize_tsc(struct kvm_vcpu *vcpu, 
u64 data)
offset = kvm_compute_l1_tsc_offset(vcpu, data);
}
matched

[PATCH v4 03/13] KVM: x86: Expose TSC offset controls to userspace

2021-07-28 Thread Oliver Upton
To date, VMM-directed TSC synchronization and migration has been a bit
messy. KVM has some baked-in heuristics around TSC writes to infer if
the VMM is attempting to synchronize. This is problematic, as it depends
on host userspace writing to the guest's TSC within 1 second of the last
write.

A much cleaner approach to configuring the guest's views of the TSC is to
simply migrate the TSC offset for every vCPU. Offsets are idempotent,
and thus not subject to change depending on when the VMM actually
reads/writes values from/to KVM. The VMM can then read the TSC once with
KVM_GET_CLOCK to capture a (realtime, host_tsc) pair at the instant when
the guest is paused.
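
The offset arithmetic from step 6 of the documented algorithm, written out as
a helper for clarity; freq here is what KVM_GET_TSC_KHZ returns (kHz), so the
nanosecond delta is scaled by freq / 1e6. A sketch only; 128-bit intermediates
may be needed for long blackout windows:

  static uint64_t new_tsc_offset(uint64_t off_n, uint64_t t_0, uint64_t t_1,
                                 uint64_t k_0, uint64_t k_1, uint64_t freq_khz)
  {
          /* kvmclock delta (ns) converted into guest TSC ticks */
          uint64_t elapsed_ticks = (k_1 - k_0) * freq_khz / 1000000;

          /* new_off_n = t_0 + off_n + (k_1 - k_0) * freq - t_1 */
          return t_0 + off_n + elapsed_ticks - t_1;
  }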

Cc: David Matlack 
Signed-off-by: Oliver Upton 
---
 Documentation/virt/kvm/devices/vcpu.rst |  57 
 arch/x86/include/asm/kvm_host.h |   1 +
 arch/x86/include/uapi/asm/kvm.h |   4 +
 arch/x86/kvm/x86.c  | 167 
 4 files changed, 229 insertions(+)

diff --git a/Documentation/virt/kvm/devices/vcpu.rst 
b/Documentation/virt/kvm/devices/vcpu.rst
index 2acec3b9ef65..b46d5f742e69 100644
--- a/Documentation/virt/kvm/devices/vcpu.rst
+++ b/Documentation/virt/kvm/devices/vcpu.rst
@@ -161,3 +161,60 @@ Specifies the base address of the stolen time structure 
for this VCPU. The
 base address must be 64 byte aligned and exist within a valid guest memory
 region. See Documentation/virt/kvm/arm/pvtime.rst for more information
 including the layout of the stolen time structure.
+
+4. GROUP: KVM_VCPU_TSC_CTRL
+===
+
+:Architectures: x86
+
+4.1 ATTRIBUTE: KVM_VCPU_TSC_OFFSET
+
+:Parameters: 64-bit unsigned TSC offset
+
+Returns:
+
+=== ==
+-EFAULT Error reading/writing the provided
+parameter address.
+-ENXIO  Attribute not supported
+=== ==
+
+Specifies the guest's TSC offset relative to the host's TSC. The guest's
+TSC is then derived by the following equation:
+
+  guest_tsc = host_tsc + KVM_VCPU_TSC_OFFSET
+
+This attribute is useful for the precise migration of a guest's TSC. The
+following describes a possible algorithm to use for the migration of a
+guest's TSC:
+
+From the source VMM process:
+
+1. Invoke the KVM_GET_CLOCK ioctl to record the host TSC (t_0),
+   kvmclock nanoseconds (k_0), and realtime nanoseconds (r_0).
+
+2. Read the KVM_VCPU_TSC_OFFSET attribute for every vCPU to record the
+   guest TSC offset (off_n).
+
+3. Invoke the KVM_GET_TSC_KHZ ioctl to record the frequency of the
+   guest's TSC (freq).
+
+From the destination VMM process:
+
+4. Invoke the KVM_SET_CLOCK ioctl, providing the kvmclock nanoseconds
+   (k_0) and realtime nanoseconds (r_0) in their respective fields.
+   Ensure that the KVM_CLOCK_REALTIME flag is set in the provided
+   structure. KVM will advance the VM's kvmclock to account for elapsed
+   time since recording the clock values.
+
+5. Invoke the KVM_GET_CLOCK ioctl to record the host TSC (t_1) and
+   kvmclock nanoseconds (k_1).
+
+6. Adjust the guest TSC offsets for every vCPU to account for (1) time
+   elapsed since recording state and (2) difference in TSCs between the
+   source and destination machine:
+
+   new_off_n = t_0 + off_n + (k_1 - k_0) * freq - t_1
+
+7. Write the KVM_VCPU_TSC_OFFSET attribute for every vCPU with the
+   respective value derived in the previous step.
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 65a20c64f959..855698923dd0 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -1070,6 +1070,7 @@ struct kvm_arch {
u64 last_tsc_nsec;
u64 last_tsc_write;
u32 last_tsc_khz;
+   u64 last_tsc_offset;
u64 cur_tsc_nsec;
u64 cur_tsc_write;
u64 cur_tsc_offset;
diff --git a/arch/x86/include/uapi/asm/kvm.h b/arch/x86/include/uapi/asm/kvm.h
index a6c327f8ad9e..0b22e1e84e78 100644
--- a/arch/x86/include/uapi/asm/kvm.h
+++ b/arch/x86/include/uapi/asm/kvm.h
@@ -503,4 +503,8 @@ struct kvm_pmu_event_filter {
 #define KVM_PMU_EVENT_ALLOW 0
 #define KVM_PMU_EVENT_DENY 1
 
+/* for KVM_{GET,SET,HAS}_DEVICE_ATTR */
+#define KVM_VCPU_TSC_CTRL 0 /* control group for the timestamp counter (TSC) */
+#define   KVM_VCPU_TSC_OFFSET 0 /* attribute for the TSC offset */
+
 #endif /* _ASM_X86_KVM_H */
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 27435a07fb46..17d87a8d0c75 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -2413,6 +2413,11 @@ static void kvm_vcpu_write_tsc_offset(struct kvm_vcpu 
*vcpu, u64 l1_offset)
static_call(kvm_x86_write_tsc_offset)(vcpu, vcpu->arch.tsc_offset);
 }
 
+static u64 kvm_vcpu_read_tsc_offset(struct kvm_vcpu *vcpu)
+{
+   return vcpu->arch.l1_tsc_offset;
+}
+
 static void kvm_vcpu_write_tsc_multiplier(struct kvm_vcpu *vcpu, u64 
l1_multiplier)
 {
vcpu->arch.l1_tsc_scaling_ratio = l1_multiplier;
@@ -2469,6 +2

Re: [PATCH] KVM: ARM: count remote TLB flushes

2021-07-28 Thread Oliver Upton
On Tue, Jul 27, 2021 at 3:33 AM Paolo Bonzini  wrote:
>
> KVM/ARM has an architecture-specific implementation of
> kvm_flush_remote_tlbs; however, unlike the generic one,
> it does not count the flushes in kvm->stat.remote_tlb_flush,
> so that it inexorably remained stuck to zero.
>
> Signed-off-by: Paolo Bonzini 

Reviewed-by: Oliver Upton 

> ---
>  arch/arm64/kvm/mmu.c | 1 +
>  1 file changed, 1 insertion(+)
>
> diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c
> index c10207fed2f3..6cf16b43bfcc 100644
> --- a/arch/arm64/kvm/mmu.c
> +++ b/arch/arm64/kvm/mmu.c
> @@ -81,6 +81,7 @@ static bool memslot_is_logging(struct kvm_memory_slot 
> *memslot)
>  void kvm_flush_remote_tlbs(struct kvm *kvm)
>  {
> kvm_call_hyp(__kvm_tlb_flush_vmid, &kvm->arch.mmu);
> +   ++kvm->stat.generic.remote_tlb_flush;
>  }
>
>  static bool kvm_is_device_pfn(unsigned long pfn)
> --
> 2.31.1
>
> ___
> kvmarm mailing list
> kvmarm@lists.cs.columbia.edu
> https://lists.cs.columbia.edu/mailman/listinfo/kvmarm
___
kvmarm mailing list
kvmarm@lists.cs.columbia.edu
https://lists.cs.columbia.edu/mailman/listinfo/kvmarm


[PATCH 1/2] KVM: arm64: Fix off-by-one in range_is_memory

2021-07-28 Thread David Brazdil
Hyp checks whether an address range only covers RAM by checking the
start/endpoints against a list of memblock_region structs. However,
the endpoint here is exclusive but internally is treated as inclusive.
Fix the off-by-one error that caused valid address ranges to be
rejected.

Cc: Quentin Perret 
Fixes: 90134ac9cabb6 ("KVM: arm64: Protect the .hyp sections from the host")
Signed-off-by: David Brazdil 
---
 arch/arm64/kvm/hyp/nvhe/mem_protect.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/arm64/kvm/hyp/nvhe/mem_protect.c 
b/arch/arm64/kvm/hyp/nvhe/mem_protect.c
index d938ce95d3bd..a6ce991b1467 100644
--- a/arch/arm64/kvm/hyp/nvhe/mem_protect.c
+++ b/arch/arm64/kvm/hyp/nvhe/mem_protect.c
@@ -193,7 +193,7 @@ static bool range_is_memory(u64 start, u64 end)
 {
struct kvm_mem_range r1, r2;
 
-   if (!find_mem_range(start, &r1) || !find_mem_range(end, &r2))
+   if (!find_mem_range(start, &r1) || !find_mem_range(end - 1, &r2))
return false;
if (r1.start != r2.start)
return false;
-- 
2.32.0.432.gabb21c7263-goog

___
kvmarm mailing list
kvmarm@lists.cs.columbia.edu
https://lists.cs.columbia.edu/mailman/listinfo/kvmarm


[PATCH 2/2] KVM: arm64: Minor optimization of range_is_memory

2021-07-28 Thread David Brazdil
Currently range_is_memory finds the corresponding struct memblock_region
for both the lower and upper bounds of the given address range with two
rounds of binary search, and then checks that the two memblocks are the
same. Simplify this by only doing binary search on the lower bound and
then checking that the upper bound is in the same memblock.

Signed-off-by: David Brazdil 
---
 arch/arm64/kvm/hyp/nvhe/mem_protect.c | 11 ---
 1 file changed, 8 insertions(+), 3 deletions(-)

diff --git a/arch/arm64/kvm/hyp/nvhe/mem_protect.c 
b/arch/arm64/kvm/hyp/nvhe/mem_protect.c
index a6ce991b1467..37d73af69634 100644
--- a/arch/arm64/kvm/hyp/nvhe/mem_protect.c
+++ b/arch/arm64/kvm/hyp/nvhe/mem_protect.c
@@ -189,13 +189,18 @@ static bool find_mem_range(phys_addr_t addr, struct 
kvm_mem_range *range)
return false;
 }
 
+static bool is_in_mem_range(phys_addr_t addr, struct kvm_mem_range *range)
+{
+   return range->start <= addr && addr < range->end;
+}
+
 static bool range_is_memory(u64 start, u64 end)
 {
-   struct kvm_mem_range r1, r2;
+   struct kvm_mem_range r;
 
-   if (!find_mem_range(start, &r1) || !find_mem_range(end - 1, &r2))
+   if (!find_mem_range(start, &r))
return false;
-   if (r1.start != r2.start)
+   if (!is_in_mem_range(end - 1, &r))
return false;
 
return true;
-- 
2.32.0.432.gabb21c7263-goog

___
kvmarm mailing list
kvmarm@lists.cs.columbia.edu
https://lists.cs.columbia.edu/mailman/listinfo/kvmarm


[PATCH 0/2] Fix off-by-one in range_is_memory

2021-07-28 Thread David Brazdil
Hi, here is an off-by-one bug fix and a very minor improvement for
the range_is_memory function in hyp.

David Brazdil (2):
  KVM: arm64: Fix off-by-one in range_is_memory
  KVM: arm64: Minor optimization of range_is_memory

 arch/arm64/kvm/hyp/nvhe/mem_protect.c | 11 ---
 1 file changed, 8 insertions(+), 3 deletions(-)

-- 
2.32.0.432.gabb21c7263-goog

___
kvmarm mailing list
kvmarm@lists.cs.columbia.edu
https://lists.cs.columbia.edu/mailman/listinfo/kvmarm


Re: [PATCH v2 12/16] KVM: arm64: Mark host bss and rodata section as shared

2021-07-28 Thread Quentin Perret
On Monday 26 Jul 2021 at 10:29:01 (+0100), Quentin Perret wrote:
> +static int finalize_mappings(void)
> +{
> + enum kvm_pgtable_prot prot;
> + int ret;
> +
> + /*
> +  * The host's .bss and .rodata sections are now conceptually owned by
> +  * the hypervisor, so mark them as 'borrowed' in the host stage-2. We
> +  * can safely use host_stage2_idmap_locked() at this point since the
> +  * host stage-2 has not been enabled yet.
> +  */
> + prot = pkvm_mkstate(KVM_PGTABLE_PROT_RWX, PKVM_PAGE_SHARED_BORROWED);
> + ret = host_stage2_idmap_locked(__hyp_pa(__start_rodata),
> +__hyp_pa(__end_rodata), prot);
> + if (ret)
> + return ret;
> +
> + return host_stage2_idmap_locked(__hyp_pa(__hyp_bss_end),
> + __hyp_pa(__bss_stop), prot);
> +}
> +
>  void __noreturn __pkvm_init_finalise(void)
>  {
>   struct kvm_host_data *host_data = this_cpu_ptr(&kvm_host_data);
> @@ -167,6 +199,10 @@ void __noreturn __pkvm_init_finalise(void)
>   if (ret)
>   goto out;
>  
> + ret = finalize_mappings();
> + if (ret)
> + goto out;

While working on v3 of this series it occurred to me that we can
actually do vastly better than this. Specifically, the annotation of
shared pages currently happens in two places (recreate_hyp_mappings()
and finalize_mappings()) with nothing to guarantee they are in sync. At
the same time, the annotation of pages owned by the hypervisor is left
to the host itself using the __pkvm_mark_hyp hypercall. But clearly, by
the time we arrive at finalize_mappings() above, all the information I
need is already stored in the hyp pgtable. That is, it should be fairly
easy to walk the hyp stage-1, and for each valid mapping create a
matching annotation in the host stage-2 to mark the page shared or owned
by the hypervisor.
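
A rough sketch of what that walk could look like (illustrative only and
untested; it assumes pkvm_pgtable is the hyp stage-1 page-table, that
kvm_pte_to_phys()/kvm_granule_size() are visible from here, and it only
handles the 'shared' case):

    static int host_stage2_mirror_walker(u64 addr, u64 end, u32 level,
                                         kvm_pte_t *ptep,
                                         enum kvm_pgtable_walk_flags flag,
                                         void * const arg)
    {
            enum kvm_pgtable_prot prot;
            phys_addr_t phys;

            /* Only valid leaf mappings need a host stage-2 annotation */
            if (!kvm_pte_valid(*ptep))
                    return 0;

            phys = kvm_pte_to_phys(*ptep);
            prot = pkvm_mkstate(KVM_PGTABLE_PROT_RWX, PKVM_PAGE_SHARED_BORROWED);

            return host_stage2_idmap_locked(phys,
                                            phys + kvm_granule_size(level),
                                            prot);
    }

    static int finalize_mappings(void)
    {
            struct kvm_pgtable_walker walker = {
                    .cb    = host_stage2_mirror_walker,
                    .flags = KVM_PGTABLE_WALK_LEAF,
            };

            return kvm_pgtable_walk(&pkvm_pgtable, 0,
                                    BIT(pkvm_pgtable.ia_bits), &walker);
    }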

I'll have a go at implementing this in v3, which would guarantee
consistency across page-tables once the hypervisor is initialized, and
also allow us to get rid of __pkvm_mark_hyp entirely. But if anybody thinks
this is a bad idea in the meantime, please shout!

Thanks,
Quentin
___
kvmarm mailing list
kvmarm@lists.cs.columbia.edu
https://lists.cs.columbia.edu/mailman/listinfo/kvmarm


[PATCH] KVM: arm64: fix comment typo

2021-07-28 Thread Jason Wang
Remove the repeated word 'the' from two comments.

Signed-off-by: Jason Wang 
---
 arch/arm64/kvm/vgic/vgic-mmio-v2.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/arch/arm64/kvm/vgic/vgic-mmio-v2.c 
b/arch/arm64/kvm/vgic/vgic-mmio-v2.c
index a016f07adc28..5f9014ae595b 100644
--- a/arch/arm64/kvm/vgic/vgic-mmio-v2.c
+++ b/arch/arm64/kvm/vgic/vgic-mmio-v2.c
@@ -282,7 +282,7 @@ static unsigned long vgic_mmio_read_vcpuif(struct kvm_vcpu 
*vcpu,
case GIC_CPU_PRIMASK:
/*
 * Our KVM_DEV_TYPE_ARM_VGIC_V2 device ABI exports the
-* the PMR field as GICH_VMCR.VMPriMask rather than
+* PMR field as GICH_VMCR.VMPriMask rather than
 * GICC_PMR.Priority, so we expose the upper five bits of
 * priority mask to userspace using the lower bits in the
 * unsigned long.
@@ -329,7 +329,7 @@ static void vgic_mmio_write_vcpuif(struct kvm_vcpu *vcpu,
case GIC_CPU_PRIMASK:
/*
 * Our KVM_DEV_TYPE_ARM_VGIC_V2 device ABI exports the
-* the PMR field as GICH_VMCR.VMPriMask rather than
+* PMR field as GICH_VMCR.VMPriMask rather than
 * GICC_PMR.Priority, so we expose the upper five bits of
 * priority mask to userspace using the lower bits in the
 * unsigned long.
-- 
2.32.0

___
kvmarm mailing list
kvmarm@lists.cs.columbia.edu
https://lists.cs.columbia.edu/mailman/listinfo/kvmarm


Re: [PATCH 01/16] KVM: arm64: Generalise VM features into a set of flags

2021-07-28 Thread Steven Price
On 28/07/2021 10:41, Marc Zyngier wrote:
> On Tue, 27 Jul 2021 19:10:27 +0100,
> Will Deacon  wrote:
>>
>> On Thu, Jul 15, 2021 at 05:31:44PM +0100, Marc Zyngier wrote:
>>> We currently deal with a set of booleans for VM features,
>>> while they could be better represented as set of flags
>>> contained in an unsigned long, similarly to what we are
>>> doing on the CPU side.
>>>
>>> Signed-off-by: Marc Zyngier 
>>> ---
>>>  arch/arm64/include/asm/kvm_host.h | 12 +++-
>>>  arch/arm64/kvm/arm.c  |  5 +++--
>>>  arch/arm64/kvm/mmio.c |  3 ++-
>>>  3 files changed, 12 insertions(+), 8 deletions(-)
>>>
>>> diff --git a/arch/arm64/include/asm/kvm_host.h 
>>> b/arch/arm64/include/asm/kvm_host.h
>>> index 41911585ae0c..4add6c27251f 100644
>>> --- a/arch/arm64/include/asm/kvm_host.h
>>> +++ b/arch/arm64/include/asm/kvm_host.h
>>> @@ -122,7 +122,10 @@ struct kvm_arch {
>>>  * should) opt in to this feature if KVM_CAP_ARM_NISV_TO_USER is
>>>  * supported.
>>>  */
>>> -   bool return_nisv_io_abort_to_user;
>>> +#define KVM_ARCH_FLAG_RETURN_NISV_IO_ABORT_TO_USER 0
>>> +   /* Memory Tagging Extension enabled for the guest */
>>> +#define KVM_ARCH_FLAG_MTE_ENABLED  1
>>> +   unsigned long flags;
>>
>> One downside of packing all these together is that updating 'flags' now
>> requires an atomic rmw sequence (i.e. set_bit()). Then again, that's
>> probably for the best anyway given that kvm_vm_ioctl_enable_cap() looks
>> like it doesn't hold any locks.
> 
> That, and these operations are supposed to be extremely rare anyway.
> 
>>
>>> /*
>>>  * VM-wide PMU filter, implemented as a bitmap and big enough for
>>> @@ -133,9 +136,6 @@ struct kvm_arch {
>>>  
>>> u8 pfr0_csv2;
>>> u8 pfr0_csv3;
>>> -
>>> -   /* Memory Tagging Extension enabled for the guest */
>>> -   bool mte_enabled;
>>>  };
>>>  
>>>  struct kvm_vcpu_fault_info {
>>> @@ -777,7 +777,9 @@ bool kvm_arm_vcpu_is_finalized(struct kvm_vcpu *vcpu);
>>>  #define kvm_arm_vcpu_sve_finalized(vcpu) \
>>> ((vcpu)->arch.flags & KVM_ARM64_VCPU_SVE_FINALIZED)
>>>  
>>> -#define kvm_has_mte(kvm) (system_supports_mte() && (kvm)->arch.mte_enabled)
>>> +#define kvm_has_mte(kvm)   \
>>> +   (system_supports_mte() &&   \
>>> +test_bit(KVM_ARCH_FLAG_MTE_ENABLED, &(kvm)->arch.flags))
>>
>> Not an issue with this patch, but I just noticed that the
>> system_supports_mte() check is redundant here as we only allow the flag to
>> be set if that's already the case.
> 
> It allows us to save a memory access if system_supports_mte() is false
> (it is eventually implemented as a static key). On the other hand,
> there is so much inlining due to it being a non-final cap that we
> probably lose on that too...

My original logic was that system_supports_mte() checks
IS_ENABLED(CONFIG_ARM64_MTE) - so this enables the code guarded with
kvm_has_mte() to be compiled out if CONFIG_ARM64_MTE is disabled.

Indeed it turns out that we currently rely on this (with CONFIG_ARM64_MTE
disabled):

aarch64-linux-gnu-ld: arch/arm64/kvm/mmu.o: in function `sanitise_mte_tags':
/home/stepri01/work/linux/arch/arm64/kvm/mmu.c:887: undefined reference to 
`mte_clear_page_tags'
aarch64-linux-gnu-ld: arch/arm64/kvm/guest.o: in function 
`kvm_vm_ioctl_mte_copy_tags':
/home/stepri01/work/linux/arch/arm64/kvm/guest.c:1066: undefined reference to 
`mte_copy_tags_to_user'
aarch64-linux-gnu-ld: /home/stepri01/work/linux/arch/arm64/kvm/guest.c:1074: 
undefined reference to `mte_copy_tags_from_user'

Obviously we could pull just the IS_ENABLED() into kvm_has_mte() instead.
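
Something like this should keep the dead-code elimination while dropping
the static-key dereference (just a sketch of that alternative, untested):

    #define kvm_has_mte(kvm)                                        \
            (IS_ENABLED(CONFIG_ARM64_MTE) &&                        \
             test_bit(KVM_ARCH_FLAG_MTE_ENABLED, &(kvm)->arch.flags))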

Steve
___
kvmarm mailing list
kvmarm@lists.cs.columbia.edu
https://lists.cs.columbia.edu/mailman/listinfo/kvmarm


Re: [PATCH v2 1/6] KVM: arm64: Introduce helper to retrieve a PTE and its level

2021-07-28 Thread Marc Zyngier
Hi Alex,

On Tue, 27 Jul 2021 16:25:34 +0100,
Alexandru Elisei  wrote:
> 
> Hi Marc,
> 
> On 7/26/21 4:35 PM, Marc Zyngier wrote:
> > It is becoming a common need to fetch the PTE for a given address
> > together with its level. Add such a helper.
> >
> > Signed-off-by: Marc Zyngier 
> > ---
> >  arch/arm64/include/asm/kvm_pgtable.h | 19 ++
> >  arch/arm64/kvm/hyp/pgtable.c | 39 
> >  2 files changed, 58 insertions(+)
> >
> > diff --git a/arch/arm64/include/asm/kvm_pgtable.h 
> > b/arch/arm64/include/asm/kvm_pgtable.h
> > index f004c0115d89..082b9d65f40b 100644
> > --- a/arch/arm64/include/asm/kvm_pgtable.h
> > +++ b/arch/arm64/include/asm/kvm_pgtable.h
> > @@ -432,6 +432,25 @@ int kvm_pgtable_stage2_flush(struct kvm_pgtable *pgt, 
> > u64 addr, u64 size);
> >  int kvm_pgtable_walk(struct kvm_pgtable *pgt, u64 addr, u64 size,
> >  struct kvm_pgtable_walker *walker);
> >  
> > +/**
> > + * kvm_pgtable_get_leaf() - Walk a page-table and retrieve the leaf entry
> > + * with its level.
> > + * @pgt:   Page-table structure initialised by kvm_pgtable_*_init().
> 
> Yet in the next patch you use a struct kvm_pgtable_pgt not
> initialized by any of the kvm_pgtable_*_init() functions. It doesn't
> hurt correctness, but it might confuse potential users of this
> function.

Fair enough. I'll add something like "[...] or any similar initialisation".

> 
> > + * @addr:  Input address for the start of the walk.
> > + * @ptep:  Pointer to storage for the retrieved PTE.
> > + * @level: Pointer to storage for the level of the retrieved PTE.
> > + *
> > + * The offset of @addr within a page is ignored.
> > + *
> > + * The walker will walk the page-table entries corresponding to the input
> > + * address specified, retrieving the leaf corresponding to this address.
> > + * Invalid entries are treated as leaf entries.
> > + *
> > + * Return: 0 on success, negative error code on failure.
> > + */
> > +int kvm_pgtable_get_leaf(struct kvm_pgtable *pgt, u64 addr,
> > +kvm_pte_t *ptep, u32 *level);
> > +
> >  /**
> >   * kvm_pgtable_stage2_find_range() - Find a range of Intermediate Physical
> >   *  Addresses with compatible permission
> > diff --git a/arch/arm64/kvm/hyp/pgtable.c b/arch/arm64/kvm/hyp/pgtable.c
> > index 05321f4165e3..78f36bd5df6c 100644
> > --- a/arch/arm64/kvm/hyp/pgtable.c
> > +++ b/arch/arm64/kvm/hyp/pgtable.c
> > @@ -326,6 +326,45 @@ int kvm_pgtable_walk(struct kvm_pgtable *pgt, u64 
> > addr, u64 size,
> > return _kvm_pgtable_walk(&walk_data);
> >  }
> >  
> > +struct leaf_walk_data {
> > +   kvm_pte_t   pte;
> > +   u32 level;
> > +};
> > +
> > +static int leaf_walker(u64 addr, u64 end, u32 level, kvm_pte_t *ptep,
> > +  enum kvm_pgtable_walk_flags flag, void * const arg)
> > +{
> > +   struct leaf_walk_data *data = arg;
> > +
> > +   data->pte   = *ptep;
> > +   data->level = level;
> > +
> > +   return 0;
> > +}
> > +
> > +int kvm_pgtable_get_leaf(struct kvm_pgtable *pgt, u64 addr,
> > +kvm_pte_t *ptep, u32 *level)
> > +{
> > +   struct leaf_walk_data data;
> > +   struct kvm_pgtable_walker walker = {
> > +   .cb = leaf_walker,
> > +   .flags  = KVM_PGTABLE_WALK_LEAF,
> > +   .arg= &data,
> > +   };
> > +   int ret;
> > +
> > +   ret = kvm_pgtable_walk(pgt, ALIGN_DOWN(addr, PAGE_SIZE),
> > +  PAGE_SIZE, &walker);
> 
> kvm_pgtable_walk() already aligns addr down to PAGE_SIZE, I don't
> think that's needed here. But not harmful either.

It is more that if you don't align it down, the size becomes awkward
to express. Masking is both cheap and readable.
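
For reference, a typical call site of the new helper could look roughly
like this (hypothetical caller; 'pgt' and 'ipa' are whatever the caller
already has at hand):

    kvm_pte_t pte;
    u32 level;

    if (kvm_pgtable_get_leaf(pgt, ipa, &pte, &level))
            return -EINVAL;

    if (!kvm_pte_valid(pte))
            return -ENOENT;     /* nothing mapped at this address */

    /* 'level' now tells us the block/page size backing 'ipa' */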

> 
> Otherwise, the patch looks good to me:
> 
> Reviewed-by: Alexandru Elisei 

Thanks!

M.

-- 
Without deviation from the norm, progress is not possible.
___
kvmarm mailing list
kvmarm@lists.cs.columbia.edu
https://lists.cs.columbia.edu/mailman/listinfo/kvmarm


Re: [PATCH 12/16] mm/ioremap: Add arch-specific callbacks on ioremap/iounmap calls

2021-07-28 Thread Marc Zyngier
On Tue, 27 Jul 2021 19:12:04 +0100,
Will Deacon  wrote:
> 
> On Thu, Jul 15, 2021 at 05:31:55PM +0100, Marc Zyngier wrote:
> > Add a pair of hooks (ioremap_page_range_hook/iounmap_page_range_hook)
> > that can be implemented by an architecture.
> > 
> > Signed-off-by: Marc Zyngier 
> > ---
> >  include/linux/io.h |  3 +++
> >  mm/ioremap.c   | 13 -
> >  mm/vmalloc.c   |  8 
> >  3 files changed, 23 insertions(+), 1 deletion(-)
> > 
> > diff --git a/include/linux/io.h b/include/linux/io.h
> > index 9595151d800d..0ffc265f114c 100644
> > --- a/include/linux/io.h
> > +++ b/include/linux/io.h
> > @@ -21,6 +21,9 @@ void __ioread32_copy(void *to, const void __iomem *from, 
> > size_t count);
> >  void __iowrite64_copy(void __iomem *to, const void *from, size_t count);
> >  
> >  #ifdef CONFIG_MMU
> > +void ioremap_page_range_hook(unsigned long addr, unsigned long end,
> > +phys_addr_t phys_addr, pgprot_t prot);
> > +void iounmap_page_range_hook(phys_addr_t phys_addr, size_t size);
> >  int ioremap_page_range(unsigned long addr, unsigned long end,
> >phys_addr_t phys_addr, pgprot_t prot);
> >  #else
> 
> Can we avoid these hooks by instead not registering the regions proactively
> in the guest and moving that logic to a fault handler which runs off the
> back of the injected data abort? From there, we could check if the faulting
> IPA is a memory address and register it as MMIO if not.
> 
> Dunno, you've spent more time than me thinking about this, but just
> wondering if you'd had a crack at doing it that way, as it _seems_ simpler
> to my naive brain.

I thought about it, but couldn't work out whether it was always
possible for the guest to handle these faults (first access in an
interrupt context, for example?).

Also, this changes the semantics of the protection this is supposed to
offer: any access out of the RAM space will generate an abort, and the
fault handler will grant MMIO forwarding for this page. Stray accesses
that would normally be properly handled as fatal would now succeed and
be forwarded to userspace, even if there were no emulated devices
there.

For this to work, we'd need to work out whether there is any existing
device mapping that actually points to this page. And whether it
actually is supposed to be forwarded to userspace. Do we have a rmap
for device mappings?

M.

-- 
Without deviation from the norm, progress is not possible.
___
kvmarm mailing list
kvmarm@lists.cs.columbia.edu
https://lists.cs.columbia.edu/mailman/listinfo/kvmarm


Re: [PATCH 07/16] KVM: arm64: Wire MMIO guard hypercalls

2021-07-28 Thread Marc Zyngier
On Tue, 27 Jul 2021 19:11:46 +0100,
Will Deacon  wrote:
> 
> On Thu, Jul 15, 2021 at 05:31:50PM +0100, Marc Zyngier wrote:
> > Plumb in the hypercall interface to allow a guest to discover,
> > enroll, map and unmap MMIO regions.
> > 
> > Signed-off-by: Marc Zyngier 
> > ---
> >  arch/arm64/kvm/hypercalls.c | 20 
> >  include/linux/arm-smccc.h   | 28 
> >  2 files changed, 48 insertions(+)
> > 
> > diff --git a/arch/arm64/kvm/hypercalls.c b/arch/arm64/kvm/hypercalls.c
> > index 30da78f72b3b..a3deeb907fdd 100644
> > --- a/arch/arm64/kvm/hypercalls.c
> > +++ b/arch/arm64/kvm/hypercalls.c
> > @@ -5,6 +5,7 @@
> >  #include 
> >  
> >  #include 
> > +#include 
> >  
> >  #include 
> >  #include 
> > @@ -129,10 +130,29 @@ int kvm_hvc_call_handler(struct kvm_vcpu *vcpu)
> > case ARM_SMCCC_VENDOR_HYP_KVM_FEATURES_FUNC_ID:
> > val[0] = BIT(ARM_SMCCC_KVM_FUNC_FEATURES);
> > val[0] |= BIT(ARM_SMCCC_KVM_FUNC_PTP);
> > +   val[0] |= BIT(ARM_SMCCC_KVM_FUNC_MMIO_GUARD_INFO);
> > +   val[0] |= BIT(ARM_SMCCC_KVM_FUNC_MMIO_GUARD_ENROLL);
> > +   val[0] |= BIT(ARM_SMCCC_KVM_FUNC_MMIO_GUARD_MAP);
> > +   val[0] |= BIT(ARM_SMCCC_KVM_FUNC_MMIO_GUARD_UNMAP);
> > break;
> > case ARM_SMCCC_VENDOR_HYP_KVM_PTP_FUNC_ID:
> > kvm_ptp_get_time(vcpu, val);
> > break;
> > +   case ARM_SMCCC_VENDOR_HYP_KVM_MMIO_GUARD_INFO_FUNC_ID:
> > +   val[0] = PAGE_SIZE;
> > +   break;
> 
> I get the nagging feeling that querying the stage-2 page-size outside of
> MMIO guard is going to be useful once we start looking at memory sharing,
> so perhaps rename this to something more generic?

At this stage, why not follow the architecture and simply expose it as
ID_AA64MMFR0_EL1.TGran{4,64,16}_2? That's exactly what it is for, and
we already check for this in KVM itself.

> 
> > +   case ARM_SMCCC_VENDOR_HYP_KVM_MMIO_GUARD_ENROLL_FUNC_ID:
> > +   set_bit(KVM_ARCH_FLAG_MMIO_GUARD, &vcpu->kvm->arch.flags);
> > +   val[0] = SMCCC_RET_SUCCESS;
> > +   break;
> > +   case ARM_SMCCC_VENDOR_HYP_KVM_MMIO_GUARD_MAP_FUNC_ID:
> > +   if (kvm_install_ioguard_page(vcpu, vcpu_get_reg(vcpu, 1)))
> > +   val[0] = SMCCC_RET_SUCCESS;
> > +   break;
> > +   case ARM_SMCCC_VENDOR_HYP_KVM_MMIO_GUARD_UNMAP_FUNC_ID:
> > +   if (kvm_remove_ioguard_page(vcpu, vcpu_get_reg(vcpu, 1)))
> > +   val[0] = SMCCC_RET_SUCCESS;
> > +   break;
> 
> I think there's a slight discrepancy between MAP and UNMAP here in that
> calling UNMAP on something that hasn't been mapped will fail, whereas
> calling MAP on something that's already been mapped will succeed. I think
> that might mean you can't reason about the final state of the page if two
> vCPUs race to call these functions in some cases (and both succeed).

I'm not sure that's the expected behaviour for ioremap(), for example
(you can ioremap two portions of the same page successfully).

I guess MAP could return something indicating that the page is already
mapped, but I wouldn't want to return a hard failure in this case.

M.

-- 
Without deviation from the norm, progress is not possible.
___
kvmarm mailing list
kvmarm@lists.cs.columbia.edu
https://lists.cs.columbia.edu/mailman/listinfo/kvmarm


Re: [PATCH 06/16] KVM: arm64: Force a full unmap on vcpu reinit

2021-07-28 Thread Marc Zyngier
On Tue, 27 Jul 2021 19:11:33 +0100,
Will Deacon  wrote:
> 
> On Thu, Jul 15, 2021 at 05:31:49PM +0100, Marc Zyngier wrote:
> > As we now keep information in the S2PT, we must be careful not
> > to keep it across a VM reboot, which could otherwise lead to
> > interesting problems.
> > 
> > Make sure that the S2 is completely discarded on reset of
> > a vcpu, and remove the flag that enforces the MMIO check.
> > 
> > Signed-off-by: Marc Zyngier 
> > ---
> >  arch/arm64/kvm/arm.c | 8 +++-
> >  1 file changed, 7 insertions(+), 1 deletion(-)
> > 
> > diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c
> > index 97ab1512c44f..b0d2225190d2 100644
> > --- a/arch/arm64/kvm/arm.c
> > +++ b/arch/arm64/kvm/arm.c
> > @@ -1096,12 +1096,18 @@ static int kvm_arch_vcpu_ioctl_vcpu_init(struct 
> > kvm_vcpu *vcpu,
> >  * ensuring that the data side is always coherent. We still
> >  * need to invalidate the I-cache though, as FWB does *not*
> >  * imply CTR_EL0.DIC.
> > +*
> > +* If the MMIO guard was enabled, we pay the price of a full
> > +* unmap to get back to a sane state (and clear the flag).
> >  */
> > if (vcpu->arch.has_run_once) {
> > -   if (!cpus_have_final_cap(ARM64_HAS_STAGE2_FWB))
> > +   if (!cpus_have_final_cap(ARM64_HAS_STAGE2_FWB) ||
> > +   test_bit(KVM_ARCH_FLAG_MMIO_GUARD, &vcpu->kvm->arch.flags))
> > stage2_unmap_vm(vcpu->kvm);
> > else
> > icache_inval_all_pou();
> > +
> > +   clear_bit(KVM_ARCH_FLAG_MMIO_GUARD, &vcpu->kvm->arch.flags);
> 
> What prevents this racing with another vCPU trying to set the bit?

Not much. We could take the kvm lock on both ends to serialize it, but
that's pretty ugly. And should we care? What are the semantics of
resetting a vcpu while another is still running?

If we want to support this sort of behaviour, then our tracking is
totally bogus, because it is VM-wide. And you don't even have to play
with that bit from another vcpu: all the information is lost at the
point where we unmap the S2 PTs.

Maybe an alternative is to move this to the halt/reboot PSCI handlers,
making it clearer what we expect?

M.

-- 
Without deviation from the norm, progress is not possible.
___
kvmarm mailing list
kvmarm@lists.cs.columbia.edu
https://lists.cs.columbia.edu/mailman/listinfo/kvmarm


Re: [PATCH 05/16] KVM: arm64: Plumb MMIO checking into the fault handling

2021-07-28 Thread Marc Zyngier
On Tue, 27 Jul 2021 19:11:21 +0100,
Will Deacon  wrote:
> 
> On Thu, Jul 15, 2021 at 05:31:48PM +0100, Marc Zyngier wrote:
> > Plumb the MMIO checking code into the MMIO fault handling code.
> > Nothing allows a region to be registered yet, so there should be
> > no funtional change either.
> 
> Typo: functional
> 
> > Signed-off-by: Marc Zyngier 
> > ---
> >  arch/arm64/kvm/mmio.c | 10 ++
> >  1 file changed, 10 insertions(+)
> > 
> > diff --git a/arch/arm64/kvm/mmio.c b/arch/arm64/kvm/mmio.c
> > index 3dd38a151d2a..fd5747279d27 100644
> > --- a/arch/arm64/kvm/mmio.c
> > +++ b/arch/arm64/kvm/mmio.c
> > @@ -6,6 +6,7 @@
> >  
> >  #include 
> >  #include 
> > +#include 
> >  #include 
> >  
> >  #include "trace.h"
> > @@ -130,6 +131,10 @@ int io_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t 
> > fault_ipa)
> > int len;
> > u8 data_buf[8];
> >  
> > +   /* Check failed? Return to the guest for debriefing... */
> > +   if (!kvm_check_ioguard_page(vcpu, fault_ipa))
> > +   return 1;
> > +
> > /*
> >  * No valid syndrome? Ask userspace for help if it has
> >  * volunteered to do so, and bail out otherwise.
> > @@ -156,6 +161,11 @@ int io_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t 
> > fault_ipa)
> > len = kvm_vcpu_dabt_get_as(vcpu);
> > rt = kvm_vcpu_dabt_get_rd(vcpu);
> >  
> > +   /* If we cross a page boundary, check that too... */
> > +   if (((fault_ipa + len - 1) & PAGE_MASK) != (fault_ipa & PAGE_MASK) &&
> > +   !kvm_check_ioguard_page(vcpu, fault_ipa + len - 1))
> > +   return 1;
> > +
> 
> I find this a little odd as the checks straddle the invalid syndrome check,
> meaning that the relative priorities of KVM_ARCH_FLAG_MMIO_GUARD and
> KVM_ARCH_FLAG_RETURN_NISV_IO_ABORT_TO_USER are unclear.

Good point. And the combination of both flags on its own is odd. Maybe
KVM_ARCH_FLAG_RETURN_NISV_IO_ABORT_TO_USER should be ignored or deemed
incompatible with the MMIO guard feature.

The lack of syndrome information means that we cannot really test for
the boundaries of the access (len is invalid), so I'd be tempted to
inject an abort in this case.
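
i.e. something along these lines in io_mem_abort() (sketch, untested):

    /* MMIO guard enabled but no usable syndrome: give the guest an abort */
    if (!kvm_vcpu_dabt_isvalid(vcpu) &&
        test_bit(KVM_ARCH_FLAG_MMIO_GUARD, &vcpu->kvm->arch.flags)) {
            kvm_inject_dabt(vcpu, kvm_vcpu_get_hfar(vcpu));
            return 1;
    }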

Thoughts?

M.

-- 
Without deviation from the norm, progress is not possible.
___
kvmarm mailing list
kvmarm@lists.cs.columbia.edu
https://lists.cs.columbia.edu/mailman/listinfo/kvmarm


Re: [PATCH 04/16] KVM: arm64: Add MMIO checking infrastructure

2021-07-28 Thread Marc Zyngier
On Tue, 27 Jul 2021 19:11:08 +0100,
Will Deacon  wrote:
> 
> On Thu, Jul 15, 2021 at 05:31:47PM +0100, Marc Zyngier wrote:
> > Introduce the infrastructure required to identify an IPA region
> > that is expected to be used as an MMIO window.
> > 
> > This include mapping, unmapping and checking the regions. Nothing
> > calls into it yet, so no expected functional change.
> > 
> > Signed-off-by: Marc Zyngier 
> > ---
> >  arch/arm64/include/asm/kvm_host.h |   2 +
> >  arch/arm64/include/asm/kvm_mmu.h  |   5 ++
> >  arch/arm64/kvm/mmu.c  | 115 ++
> >  3 files changed, 122 insertions(+)
> > 
> > diff --git a/arch/arm64/include/asm/kvm_host.h 
> > b/arch/arm64/include/asm/kvm_host.h
> > index 4add6c27251f..914c1b7bb3ad 100644
> > --- a/arch/arm64/include/asm/kvm_host.h
> > +++ b/arch/arm64/include/asm/kvm_host.h
> > @@ -125,6 +125,8 @@ struct kvm_arch {
> >  #define KVM_ARCH_FLAG_RETURN_NISV_IO_ABORT_TO_USER 0
> > /* Memory Tagging Extension enabled for the guest */
> >  #define KVM_ARCH_FLAG_MTE_ENABLED  1
> > +   /* Guest has bought into the MMIO guard extension */
> > +#define KVM_ARCH_FLAG_MMIO_GUARD   2
> > unsigned long flags;
> >  
> > /*
> > diff --git a/arch/arm64/include/asm/kvm_mmu.h 
> > b/arch/arm64/include/asm/kvm_mmu.h
> > index b52c5c4b9a3d..f6b8fc1671b3 100644
> > --- a/arch/arm64/include/asm/kvm_mmu.h
> > +++ b/arch/arm64/include/asm/kvm_mmu.h
> > @@ -170,6 +170,11 @@ phys_addr_t kvm_mmu_get_httbr(void);
> >  phys_addr_t kvm_get_idmap_vector(void);
> >  int kvm_mmu_init(u32 *hyp_va_bits);
> >  
> > +/* MMIO guard */
> > +bool kvm_install_ioguard_page(struct kvm_vcpu *vcpu, gpa_t ipa);
> > +bool kvm_remove_ioguard_page(struct kvm_vcpu *vcpu, gpa_t ipa);
> > +bool kvm_check_ioguard_page(struct kvm_vcpu *vcpu, gpa_t ipa);
> > +
> >  static inline void *__kvm_vector_slot2addr(void *base,
> >enum arm64_hyp_spectre_vector slot)
> >  {
> > diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c
> > index 3155c9e778f0..638827c8842b 100644
> > --- a/arch/arm64/kvm/mmu.c
> > +++ b/arch/arm64/kvm/mmu.c
> > @@ -1120,6 +1120,121 @@ static void handle_access_fault(struct kvm_vcpu 
> > *vcpu, phys_addr_t fault_ipa)
> > kvm_set_pfn_accessed(pte_pfn(pte));
> >  }
> >  
> > +#define MMIO_NOTE  ('M' << 24 | 'M' << 16 | 'I' << 8 | '0')
> 
> Although this made me smile, maybe we should carve up the bit space a bit
> more carefully ;) Also, you know somebody clever will "fix" that typo to
> 'O'!

They'll get to keep the pieces when the whole thing breaks!

More seriously, happy to have a more elaborate allocation scheme. For
the purpose of this series, it really doesn't matter.

> Quentin, as the other user of this stuff at the moment, how do you see the
> annotation space being allocated? Feels like we should have some 'type'
> bits which decide how to parse the rest of the entry.
> 
> > +
> > +bool kvm_install_ioguard_page(struct kvm_vcpu *vcpu, gpa_t ipa)
> > +{
> > +   struct kvm_mmu_memory_cache *memcache;
> > +   struct kvm_memory_slot *memslot;
> > +   int ret, idx;
> > +
> > +   if (!test_bit(KVM_ARCH_FLAG_MMIO_GUARD, &vcpu->kvm->arch.flags))
> > +   return false;
> > +
> > +   /* Must be page-aligned */
> > +   if (ipa & ~PAGE_MASK)
> > +   return false;
> > +
> > +   /*
> > +* The page cannot be in a memslot. At some point, this will
> > +* have to deal with device mappings though.
> > +*/
> > +   idx = srcu_read_lock(&vcpu->kvm->srcu);
> > +   memslot = gfn_to_memslot(vcpu->kvm, ipa >> PAGE_SHIFT);
> > +   srcu_read_unlock(&vcpu->kvm->srcu, idx);
> 
> What does this memslot check achieve? A new memslot could be added after
> you've checked, no?

If you start allowing S2 annotations to coexist with potential memory
mappings, you're in for trouble. The faulting logic will happily
overwrite the annotation, and that's probably not what you want.

As for new (or moving) memslots, I guess they should be checked
against existing annotations.

> 
> > +/* Assumes mmu_lock taken */
> 
> You can use a lockdep assertion for that!

Sure.
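
i.e. drop the comment and open the helper with something like (sketch):

    lockdep_assert_held(&vcpu->kvm->mmu_lock);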

Thanks,

M.

-- 
Without deviation from the norm, progress is not possible.
___
kvmarm mailing list
kvmarm@lists.cs.columbia.edu
https://lists.cs.columbia.edu/mailman/listinfo/kvmarm


Re: [PATCH 02/16] KVM: arm64: Don't issue CMOs when the physical address is invalid

2021-07-28 Thread Marc Zyngier
On Tue, 27 Jul 2021 19:10:45 +0100,
Will Deacon  wrote:
> 
> On Thu, Jul 15, 2021 at 05:31:45PM +0100, Marc Zyngier wrote:
> > Make sure we don't issue CMOs when mapping something that
> > is not a memory address in the S2 page tables.
> > 
> > Signed-off-by: Marc Zyngier 
> > ---
> >  arch/arm64/kvm/hyp/pgtable.c | 16 ++--
> >  1 file changed, 10 insertions(+), 6 deletions(-)
> > 
> > diff --git a/arch/arm64/kvm/hyp/pgtable.c b/arch/arm64/kvm/hyp/pgtable.c
> > index 05321f4165e3..a5874ebd0354 100644
> > --- a/arch/arm64/kvm/hyp/pgtable.c
> > +++ b/arch/arm64/kvm/hyp/pgtable.c
> > @@ -619,12 +619,16 @@ static int stage2_map_walker_try_leaf(u64 addr, u64 
> > end, u32 level,
> > }
> >  
> > /* Perform CMOs before installation of the guest stage-2 PTE */
> > -   if (mm_ops->dcache_clean_inval_poc && stage2_pte_cacheable(pgt, new))
> > -   mm_ops->dcache_clean_inval_poc(kvm_pte_follow(new, mm_ops),
> > -   granule);
> > -
> > -   if (mm_ops->icache_inval_pou && stage2_pte_executable(new))
> > -   mm_ops->icache_inval_pou(kvm_pte_follow(new, mm_ops), granule);
> > +   if (kvm_phys_is_valid(phys)) {
> > +   if (mm_ops->dcache_clean_inval_poc &&
> > +   stage2_pte_cacheable(pgt, new))
> > +   mm_ops->dcache_clean_inval_poc(kvm_pte_follow(new,
> > + mm_ops),
> > +  granule);
> > +   if (mm_ops->icache_inval_pou && stage2_pte_executable(new))
> > +   mm_ops->icache_inval_pou(kvm_pte_follow(new, mm_ops),
> > +granule);
> > +   }
> 
> Given that this check corresponds to checking the validity of 'new', I
> wonder whether we'd be better off pushing the validity checks down into
> stage2_pte_{cacheable,executable}()?
> 
> I.e. have stage2_pte_cacheable() return false if !kvm_pte_valid()

That would work just as well. I'll update the patch.
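
i.e. roughly this (sketch based on the current helpers, untested):

    static bool stage2_pte_cacheable(struct kvm_pgtable *pgt, kvm_pte_t pte)
    {
            u64 memattr = pte & KVM_PTE_LEAF_ATTR_LO_S2_MEMATTR;

            if (!kvm_pte_valid(pte))
                    return false;

            return memattr == KVM_S2_MEMATTR(pgt, NORMAL);
    }

    static bool stage2_pte_executable(kvm_pte_t pte)
    {
            return kvm_pte_valid(pte) && !(pte & KVM_PTE_LEAF_ATTR_HI_S2_XN);
    }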

Thanks,

M.

-- 
Without deviation from the norm, progress is not possible.
___
kvmarm mailing list
kvmarm@lists.cs.columbia.edu
https://lists.cs.columbia.edu/mailman/listinfo/kvmarm


Re: [PATCH 01/16] KVM: arm64: Generalise VM features into a set of flags

2021-07-28 Thread Marc Zyngier
On Tue, 27 Jul 2021 19:10:27 +0100,
Will Deacon  wrote:
> 
> On Thu, Jul 15, 2021 at 05:31:44PM +0100, Marc Zyngier wrote:
> > We currently deal with a set of booleans for VM features,
> > while they could be better represented as set of flags
> > contained in an unsigned long, similarly to what we are
> > doing on the CPU side.
> > 
> > Signed-off-by: Marc Zyngier 
> > ---
> >  arch/arm64/include/asm/kvm_host.h | 12 +++-
> >  arch/arm64/kvm/arm.c  |  5 +++--
> >  arch/arm64/kvm/mmio.c |  3 ++-
> >  3 files changed, 12 insertions(+), 8 deletions(-)
> > 
> > diff --git a/arch/arm64/include/asm/kvm_host.h 
> > b/arch/arm64/include/asm/kvm_host.h
> > index 41911585ae0c..4add6c27251f 100644
> > --- a/arch/arm64/include/asm/kvm_host.h
> > +++ b/arch/arm64/include/asm/kvm_host.h
> > @@ -122,7 +122,10 @@ struct kvm_arch {
> >  * should) opt in to this feature if KVM_CAP_ARM_NISV_TO_USER is
> >  * supported.
> >  */
> > -   bool return_nisv_io_abort_to_user;
> > +#define KVM_ARCH_FLAG_RETURN_NISV_IO_ABORT_TO_USER 0
> > +   /* Memory Tagging Extension enabled for the guest */
> > +#define KVM_ARCH_FLAG_MTE_ENABLED  1
> > +   unsigned long flags;
> 
> One downside of packing all these together is that updating 'flags' now
> requires an atomic rmw sequence (i.e. set_bit()). Then again, that's
> probably for the best anyway given that kvm_vm_ioctl_enable_cap() looks
> like it doesn't hold any locks.

That, and these operations are supposed to be extremely rare anyway.

> 
> > /*
> >  * VM-wide PMU filter, implemented as a bitmap and big enough for
> > @@ -133,9 +136,6 @@ struct kvm_arch {
> >  
> > u8 pfr0_csv2;
> > u8 pfr0_csv3;
> > -
> > -   /* Memory Tagging Extension enabled for the guest */
> > -   bool mte_enabled;
> >  };
> >  
> >  struct kvm_vcpu_fault_info {
> > @@ -777,7 +777,9 @@ bool kvm_arm_vcpu_is_finalized(struct kvm_vcpu *vcpu);
> >  #define kvm_arm_vcpu_sve_finalized(vcpu) \
> > ((vcpu)->arch.flags & KVM_ARM64_VCPU_SVE_FINALIZED)
> >  
> > -#define kvm_has_mte(kvm) (system_supports_mte() && (kvm)->arch.mte_enabled)
> > +#define kvm_has_mte(kvm)   \
> > +   (system_supports_mte() &&   \
> > +test_bit(KVM_ARCH_FLAG_MTE_ENABLED, &(kvm)->arch.flags))
> 
> Not an issue with this patch, but I just noticed that the
> system_supports_mte() check is redundant here as we only allow the flag to
> be set if that's already the case.

It allows us to save a memory access if system_supports_mte() is false
(it is eventually implemented as a static key). On the other hand,
there is so much inlining due to it being a non-final cap that we
probably lose on that too...

M.

-- 
Without deviation from the norm, progress is not possible.
___
kvmarm mailing list
kvmarm@lists.cs.columbia.edu
https://lists.cs.columbia.edu/mailman/listinfo/kvmarm