[RFC PATCH 5/6] kvm: arm64: Fix constant-pool users in hyp

2020-11-19 Thread David Brazdil
Hyp code used absolute addressing via a constant pool to obtain the
kernel VA of three symbols - panic, __hyp_panic_string and
__kvm_handle_stub_hvc. This worked because the kernel would relocate
the addresses in the constant pool to kernel VAs at boot and hyp would
simply load them from there.

Now that relocations are fixed up to point to hyp VAs, this does not
work any longer. Rework the helpers to convert hyp VA to kernel VA / PA
as needed.
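
For illustration only, the conversions performed by the reworked macros
below are roughly equivalent to the following C sketch;
hyp_physvirt_offset and kimage_voffset are the boot-time constants the
macros load:

    /* hyp VA -> PA, as done by the new hyp_pa macro */
    pa = hyp_va + hyp_physvirt_offset;

    /* PA -> kernel image VA, as done by the new hyp_kimg macro */
    kimg_va = pa + kimage_voffset;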

Signed-off-by: David Brazdil 
---
 arch/arm64/include/asm/kvm_mmu.h | 29 +++--
 arch/arm64/kvm/hyp/nvhe/host.S   | 29 +++--
 2 files changed, 34 insertions(+), 24 deletions(-)

diff --git a/arch/arm64/include/asm/kvm_mmu.h b/arch/arm64/include/asm/kvm_mmu.h
index 8cb8974ec9cc..0676ff2105bb 100644
--- a/arch/arm64/include/asm/kvm_mmu.h
+++ b/arch/arm64/include/asm/kvm_mmu.h
@@ -72,9 +72,14 @@ alternative_cb kvm_update_va_mask
 alternative_cb_end
 .endm
 
+.macro hyp_pa reg, tmp
+   ldr_l   \tmp, hyp_physvirt_offset
+   add \reg, \reg, \tmp
+.endm
+
 /*
- * Convert a kernel image address to a PA
- * reg: kernel address to be converted in place
+ * Convert a hypervisor VA to a kernel image address
+ * reg: hypervisor address to be converted in place
  * tmp: temporary register
  *
  * The actual code generation takes place in kvm_get_kimage_voffset, and
@@ -82,18 +87,22 @@ alternative_cb_end
  * perform the register allocation (kvm_get_kimage_voffset uses the
  * specific registers encoded in the instructions).
  */
-.macro kimg_pa reg, tmp
+.macro hyp_kimg reg, tmp
+   /* Convert hyp VA -> PA. */
+   hyp_pa  \reg, \tmp
+
+   /* Load kimage_voffset. */
 alternative_cb kvm_get_kimage_voffset
-   movz\tmp, #0
-   movk\tmp, #0, lsl #16
-   movk\tmp, #0, lsl #32
-   movk\tmp, #0, lsl #48
+   movz\tmp, #0
+   movk\tmp, #0, lsl #16
+   movk\tmp, #0, lsl #32
+   movk\tmp, #0, lsl #48
 alternative_cb_end
 
-   /* reg = __pa(reg) */
-   sub \reg, \reg, \tmp
+   /* Convert PA -> kimg VA. */
+   add \reg, \reg, \tmp
 .endm
-
+
 #else
 
 #include 
diff --git a/arch/arm64/kvm/hyp/nvhe/host.S b/arch/arm64/kvm/hyp/nvhe/host.S
index 596dd5ae8e77..bcb80d525d8c 100644
--- a/arch/arm64/kvm/hyp/nvhe/host.S
+++ b/arch/arm64/kvm/hyp/nvhe/host.S
@@ -74,27 +74,28 @@ SYM_FUNC_END(__host_enter)
  * void __noreturn __hyp_do_panic(bool restore_host, u64 spsr, u64 elr, u64 
par);
  */
 SYM_FUNC_START(__hyp_do_panic)
-   /* Load the format arguments into x1-7 */
-   mov x6, x3
-   get_vcpu_ptr x7, x3
-
-   mrs x3, esr_el2
-   mrs x4, far_el2
-   mrs x5, hpfar_el2
-
/* Prepare and exit to the host's panic function. */
mov lr, #(PSR_F_BIT | PSR_I_BIT | PSR_A_BIT | PSR_D_BIT |\
  PSR_MODE_EL1h)
msr spsr_el2, lr
ldr lr, =panic
+   hyp_kimg lr, x6
msr elr_el2, lr
 
-   /*
-* Set the panic format string and enter the host, conditionally
-* restoring the host context.
-*/
+   /* Set the panic format string. Use the, now free, LR as scratch. */
+   ldr lr, =__hyp_panic_string
+   hyp_kimg lr, x6
+
+   /* Load the format arguments into x1-7. */
+   mov x6, x3
+   get_vcpu_ptr x7, x3
+   mrs x3, esr_el2
+   mrs x4, far_el2
+   mrs x5, hpfar_el2
+
+   /* Enter the host, conditionally restoring the host context. */
cmp x0, xzr
-   ldr x0, =__hyp_panic_string
+   mov x0, lr
b.eq__host_enter_without_restoring
b   __host_enter_for_panic
 SYM_FUNC_END(__hyp_do_panic)
@@ -124,7 +125,7 @@ SYM_FUNC_END(__hyp_do_panic)
 * Preserve x0-x4, which may contain stub parameters.
 */
ldr x5, =__kvm_handle_stub_hvc
-   kimg_pa x5, x6
+   hyp_pa  x5, x6
br  x5
 .L__vect_end\@:
 .if ((.L__vect_end\@ - .L__vect_start\@) > 0x80)
-- 
2.29.2.299.gdc1121823c-goog



[RFC PATCH 0/6] kvm: arm64: Fix up hyp relocations

2020-11-19 Thread David Brazdil
Hi,

KVM nVHE hyp code runs under a different VA mapping than the kernel,
which meant that .hyp.text code had to use PC-relative addressing
because relocations would produce a kernel VA. Programmers had to be
extremely careful with C semantics not to break this fragile setup. See
the hyp_symbol_addr comments for details.

Now that we're moving all nVHE hyp code/data into ELF sections separate
from the rest of the kernel, it becomes possible to revisit the
relocations during early boot, filter those used by nVHE hyp and
convert the (already relocated) kernel VAs to hyp VAs.

Sending this as an RFC, mainly to get feedback, but also because it's
only lightly tested. It still feels hacky but is much more robust than
the existing approach. The one place where I can see somebody breaking
this is the list of ELF sections owned by hyp. That list is currently
evolving but should stabilize over time.

The patches are based on kvmarm/queue (with Marc's "Host EL2 entry
improvements") and my "Opt-in always-on nVHE hypervisor" v2 series.

-David

David Brazdil (6):
  kvm: arm64: Set up .hyp.rodata ELF section
  kvm: arm64: Fix up RELA relocations in hyp code/data
  kvm: arm64: Fix up RELR relocation in hyp code/data
  kvm: arm64: Remove patching of fn pointers in hyp
  kvm: arm64: Fix constant-pool users in hyp
  kvm: arm64: Remove hyp_symbol_addr

 arch/arm64/include/asm/kvm_asm.h |  20 
 arch/arm64/include/asm/kvm_mmu.h |  48 -
 arch/arm64/include/asm/sections.h|   2 +-
 arch/arm64/kernel/image-vars.h   |   1 -
 arch/arm64/kernel/smp.c  |   4 +-
 arch/arm64/kernel/vmlinux.lds.S  |   7 +-
 arch/arm64/kvm/arm.c |   7 +-
 arch/arm64/kvm/hyp/include/hyp/switch.h  |   4 +-
 arch/arm64/kvm/hyp/nvhe/host.S   |  29 +++---
 arch/arm64/kvm/hyp/nvhe/hyp-main.c   |  11 +-
 arch/arm64/kvm/hyp/nvhe/hyp-smp.c|   4 +-
 arch/arm64/kvm/hyp/nvhe/hyp.lds.S|   1 +
 arch/arm64/kvm/hyp/nvhe/psci-relay.c |   4 +-
 arch/arm64/kvm/hyp/vgic-v2-cpuif-proxy.c |   2 +-
 arch/arm64/kvm/va_layout.c   | 123 +--
 15 files changed, 175 insertions(+), 92 deletions(-)

-- 
2.29.2.299.gdc1121823c-goog



[RFC PATCH 2/6] kvm: arm64: Fix up RELA relocations in hyp code/data

2020-11-19 Thread David Brazdil
KVM nVHE code runs under a different VA mapping than the kernel, so it
has so far relied only on PC-relative addressing to avoid accidentally
using a relocated kernel VA from a constant pool (see hyp_symbol_addr).

To reduce the possibility of programmer error, fix up the relocated
addresses instead. Let the kernel relocate them to kernel VAs first,
then iterate over them again, filter those that point to hyp code/data
and convert the kernel VAs to hyp VAs.

This is done after kvm_compute_layout and before apply_alternatives.
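
For reference, the records being walked have the standard Elf64_Rela
layout (from elf.h); only r_offset is consumed here, since the relocated
value itself is read from, and written back to, that location:

    typedef struct {
        Elf64_Addr   r_offset;  /* where the relocation is applied */
        Elf64_Xword  r_info;    /* relocation type and symbol index */
        Elf64_Sxword r_addend;  /* constant addend */
    } Elf64_Rela;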

Signed-off-by: David Brazdil 
---
 arch/arm64/include/asm/kvm_mmu.h |  1 +
 arch/arm64/kernel/smp.c  |  4 +-
 arch/arm64/kvm/va_layout.c   | 76 
 3 files changed, 80 insertions(+), 1 deletion(-)

diff --git a/arch/arm64/include/asm/kvm_mmu.h b/arch/arm64/include/asm/kvm_mmu.h
index 5168a0c516ae..e5226f7e4732 100644
--- a/arch/arm64/include/asm/kvm_mmu.h
+++ b/arch/arm64/include/asm/kvm_mmu.h
@@ -105,6 +105,7 @@ alternative_cb_end
 void kvm_update_va_mask(struct alt_instr *alt,
__le32 *origptr, __le32 *updptr, int nr_inst);
 void kvm_compute_layout(void);
+void kvm_fixup_hyp_relocations(void);
 
 static __always_inline unsigned long __kern_hyp_va(unsigned long v)
 {
diff --git a/arch/arm64/kernel/smp.c b/arch/arm64/kernel/smp.c
index 18e9727d3f64..30241afc2c93 100644
--- a/arch/arm64/kernel/smp.c
+++ b/arch/arm64/kernel/smp.c
@@ -434,8 +434,10 @@ static void __init hyp_mode_check(void)
   "CPU: CPUs started in inconsistent modes");
else
pr_info("CPU: All CPU(s) started at EL1\n");
-   if (IS_ENABLED(CONFIG_KVM))
+   if (IS_ENABLED(CONFIG_KVM)) {
kvm_compute_layout();
+   kvm_fixup_hyp_relocations();
+   }
 }
 
 void __init smp_cpus_done(unsigned int max_cpus)
diff --git a/arch/arm64/kvm/va_layout.c b/arch/arm64/kvm/va_layout.c
index d8cc51bd60bf..b80fab974896 100644
--- a/arch/arm64/kvm/va_layout.c
+++ b/arch/arm64/kvm/va_layout.c
@@ -10,6 +10,7 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 #include 
 
@@ -82,6 +83,81 @@ __init void kvm_compute_layout(void)
init_hyp_physvirt_offset();
 }
 
+#define __load_elf_u64(s)  \
+   ({  \
+   extern u64 s;   \
+   u64 val;\
+   \
+   asm ("ldr %0, =%1" : "=r"(val) : "S"(&s)); \
+   val;\
+   })
+
+static bool __is_within_bounds(u64 addr, char *start, char *end)
+{
+   return start <= (char*)addr && (char*)addr < end;
+}
+
+static bool __is_in_hyp_section(u64 addr)
+{
+   return __is_within_bounds(addr, __hyp_text_start, __hyp_text_end) ||
+  __is_within_bounds(addr, __hyp_rodata_start, __hyp_rodata_end) ||
+  __is_within_bounds(addr,
+ CHOOSE_NVHE_SYM(__per_cpu_start),
+ CHOOSE_NVHE_SYM(__per_cpu_end));
+}
+
+static void __fixup_hyp_rel(u64 addr)
+{
+   u64 *ptr, kern_va, hyp_va;
+
+   /* Adjust the relocation address taken from ELF for KASLR. */
+   addr += kaslr_offset();
+
+   /* Skip addresses not in any of the hyp sections. */
+   if (!__is_in_hyp_section(addr))
+   return;
+
+   /* Get the LM alias of the relocation address. */
+   ptr = (u64*)kvm_ksym_ref((void*)addr);
+
+   /*
+* Read the value at the relocation address. It has already been
+* relocated to the actual kernel kimg VA.
+*/
+   kern_va = (u64)kvm_ksym_ref((void*)*ptr);
+
+   /* Convert to hyp VA. */
+   hyp_va = __early_kern_hyp_va(kern_va);
+
+   /* Store hyp VA at the relocation address. */
+   *ptr = hyp_va;
+}
+
+static void __fixup_hyp_rela(void)
+{
+   Elf64_Rela *rel;
+   size_t i, n;
+
+   rel = (Elf64_Rela*)(kimage_vaddr + __load_elf_u64(__rela_offset));
+   n = __load_elf_u64(__rela_size) / sizeof(*rel);
+
+   for (i = 0; i < n; ++i)
+   __fixup_hyp_rel(rel[i].r_offset);
+}
+
+/*
+ * The kernel relocated pointers to kernel VA. Iterate over relocations in
+ * the hypervisor ELF sections and convert them to hyp VA. This avoids the
+ * need to only use PC-relative addressing in hyp.
+ */
+__init void kvm_fixup_hyp_relocations(void)
+{
+   if (!IS_ENABLED(CONFIG_RELOCATABLE) || has_vhe())
+   return;
+
+   __fixup_hyp_rela();
+}
+
 static u32 compute_instruction(int n, u32 rd, u32 rn)
 {
u32 insn = AARCH64_BREAK_FAULT;
-- 
2.29.2.299.gdc1121823c-goog



[RFC PATCH 6/6] kvm: arm64: Remove hyp_symbol_addr

2020-11-19 Thread David Brazdil
The helper was used to force PC-relative addressing in hyp code because
absolute addressing via constant-pools used to generate kernel VAs. This
was cumbersome and required programmers to remember to use the helper
whenever they wanted to take a pointer.

Now that hyp relocations are fixed up, there is no need for the helper
any longer. Remove it.
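
For example (taken from the switch.h hunk below), code that previously
read:

    entry = hyp_symbol_addr(__start___kvm_ex_table);

now simply becomes:

    entry = &__start___kvm_ex_table;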

Signed-off-by: David Brazdil 
---
 arch/arm64/include/asm/kvm_asm.h | 20 
 arch/arm64/kvm/hyp/include/hyp/switch.h  |  4 ++--
 arch/arm64/kvm/hyp/nvhe/hyp-smp.c|  4 ++--
 arch/arm64/kvm/hyp/nvhe/psci-relay.c |  4 ++--
 arch/arm64/kvm/hyp/vgic-v2-cpuif-proxy.c |  2 +-
 5 files changed, 7 insertions(+), 27 deletions(-)

diff --git a/arch/arm64/include/asm/kvm_asm.h b/arch/arm64/include/asm/kvm_asm.h
index 1a86581e581e..1961d23c0c40 100644
--- a/arch/arm64/include/asm/kvm_asm.h
+++ b/arch/arm64/include/asm/kvm_asm.h
@@ -203,26 +203,6 @@ extern void __vgic_v3_init_lrs(void);
 
 extern u32 __kvm_get_mdcr_el2(void);
 
-/*
- * Obtain the PC-relative address of a kernel symbol
- * s: symbol
- *
- * The goal of this macro is to return a symbol's address based on a
- * PC-relative computation, as opposed to a loading the VA from a
- * constant pool or something similar. This works well for HYP, as an
- * absolute VA is guaranteed to be wrong. Only use this if trying to
- * obtain the address of a symbol (i.e. not something you obtained by
- * following a pointer).
- */
-#define hyp_symbol_addr(s) \
-   ({  \
-   typeof(s) *addr;\
-   asm("adrp   %0, %1\n"   \
-   "add%0, %0, :lo12:%1\n" \
-   : "=r" (addr) : "S" (&s));  \
-   addr;   \
-   })
-
 #define __KVM_EXTABLE(from, to)
\
"   .pushsection__kvm_ex_table, \"a\"\n"\
"   .align  3\n"\
diff --git a/arch/arm64/kvm/hyp/include/hyp/switch.h 
b/arch/arm64/kvm/hyp/include/hyp/switch.h
index 84473574c2e7..54f4860cd87c 100644
--- a/arch/arm64/kvm/hyp/include/hyp/switch.h
+++ b/arch/arm64/kvm/hyp/include/hyp/switch.h
@@ -505,8 +505,8 @@ static inline void __kvm_unexpected_el2_exception(void)
struct exception_table_entry *entry, *end;
unsigned long elr_el2 = read_sysreg(elr_el2);
 
-   entry = hyp_symbol_addr(__start___kvm_ex_table);
-   end = hyp_symbol_addr(__stop___kvm_ex_table);
+   entry = &__start___kvm_ex_table;
+   end = &__stop___kvm_ex_table;
 
while (entry < end) {
addr = (unsigned long)&entry->insn + entry->insn;
diff --git a/arch/arm64/kvm/hyp/nvhe/hyp-smp.c 
b/arch/arm64/kvm/hyp/nvhe/hyp-smp.c
index ceb427aabb91..6870d9f3d4b7 100644
--- a/arch/arm64/kvm/hyp/nvhe/hyp-smp.c
+++ b/arch/arm64/kvm/hyp/nvhe/hyp-smp.c
@@ -33,8 +33,8 @@ unsigned long __hyp_per_cpu_offset(unsigned int cpu)
if (cpu >= ARRAY_SIZE(kvm_arm_hyp_percpu_base))
hyp_panic();
 
-   cpu_base_array = (unsigned long*)hyp_symbol_addr(kvm_arm_hyp_percpu_base);
+   cpu_base_array = (unsigned long*)(&kvm_arm_hyp_percpu_base[0]);
this_cpu_base = kern_hyp_va(cpu_base_array[cpu]);
-   elf_base = (unsigned long)hyp_symbol_addr(__per_cpu_start);
+   elf_base = (unsigned long)&__per_cpu_start;
return this_cpu_base - elf_base;
 }
diff --git a/arch/arm64/kvm/hyp/nvhe/psci-relay.c 
b/arch/arm64/kvm/hyp/nvhe/psci-relay.c
index 313ef42f0eab..f64380a49a72 100644
--- a/arch/arm64/kvm/hyp/nvhe/psci-relay.c
+++ b/arch/arm64/kvm/hyp/nvhe/psci-relay.c
@@ -147,7 +147,7 @@ static int psci_cpu_suspend(u64 func_id, struct 
kvm_cpu_context *host_ctxt)
 * point if it is a deep sleep state.
 */
ret = psci_call(func_id, power_state,
-   __hyp_pa(hyp_symbol_addr(__kvm_hyp_cpu_entry)),
+   __hyp_pa(__kvm_hyp_cpu_entry),
__hyp_pa(cpu_params));
 
release_reset_state(cpu_state);
@@ -182,7 +182,7 @@ static int psci_cpu_on(u64 func_id, struct kvm_cpu_context 
*host_ctxt)
return PSCI_RET_ALREADY_ON;
 
ret = psci_call(func_id, mpidr,
-   __hyp_pa(hyp_symbol_addr(__kvm_hyp_cpu_entry)),
+   __hyp_pa(__kvm_hyp_cpu_entry),
__hyp_pa(cpu_params));
 
/*
diff --git a/arch/arm64/kvm/hyp/vgic-v2-cpuif-proxy.c 
b/arch/arm64/kvm/hyp/vgic-v2-cpuif-proxy.c
index 8f0585640241..87a54375bd6e 100644
--- a/arch/arm64/kvm/hyp/vgic-v2-cpuif-proxy.c
+++ b/arch/arm64/kvm/hyp/vgic-v2-cpuif-proxy.c
@@ -64,7 +64,7 @@ int __vgic_v2_perform_cpuif_access(struct kvm_vcpu *vcpu)
}
 
rd = 

[RFC PATCH 3/6] kvm: arm64: Fix up RELR relocation in hyp code/data

2020-11-19 Thread David Brazdil
The arm64 kernel also supports packing relocation data in the RELR
format. Implement a parser for RELR data and fix up the relocations
using the same infrastructure as the RELA relocs.
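
For illustration, here is a self-contained sketch of the same decoding
loop as userspace C, run over a made-up three-entry stream; it only
prints the offsets it would fix up instead of patching them:

    #include <inttypes.h>
    #include <stdio.h>

    int main(void)
    {
        /* address word, bitmap word (bits 1 and 3 set), address word */
        const uint64_t relr[] = { 0x10000, 0xb, 0x20000 };
        const uint64_t *rel = relr, *end = relr + 3;

        while (rel < end) {
            unsigned int n, i;
            uint64_t addr = *rel++;   /* LSB clear: a plain address */

            printf("fixup at 0x%" PRIx64 "\n", addr);

            /* bitmap words follow for as long as the LSB is set */
            for (n = 0; rel < end && (*rel & 1); n++) {
                uint64_t bitmap = *rel++;

                for (i = 1; i < 64; i++)
                    if (bitmap & (1ULL << i))
                        printf("fixup at 0x%" PRIx64 "\n",
                               addr + 8 * (63 * n + i));
            }
        }
        return 0;
    }

This prints fixups at 0x10000, 0x10008, 0x10018 and 0x20000: bit i of
the n-th bitmap word (bit 0 being the marker) covers the word located
8 * (63 * n + i) bytes past the last explicit address.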

Signed-off-by: David Brazdil 
---
 arch/arm64/kvm/va_layout.c | 41 ++
 1 file changed, 41 insertions(+)

diff --git a/arch/arm64/kvm/va_layout.c b/arch/arm64/kvm/va_layout.c
index b80fab974896..7f45a98eacfd 100644
--- a/arch/arm64/kvm/va_layout.c
+++ b/arch/arm64/kvm/va_layout.c
@@ -145,6 +145,43 @@ static void __fixup_hyp_rela(void)
__fixup_hyp_rel(rel[i].r_offset);
 }
 
+#ifdef CONFIG_RELR
+static void __fixup_hyp_relr(void)
+{
+   u64 *rel, *end;
+
+   rel = (u64*)(kimage_vaddr + __load_elf_u64(__relr_offset));
+   end = rel + (__load_elf_u64(__relr_size) / sizeof(*rel));
+
+   while (rel < end) {
+   unsigned n;
+   u64 addr = *(rel++);
+
+   /* Address must not have the LSB set. */
+   BUG_ON(addr & BIT(0));
+
+   /* Fix up the first address of the chain. */
+   __fixup_hyp_rel(addr);
+
+   /*
+* Loop over bitmaps, i.e. as long as words' LSB is 1.
+* Each bit (ordered from LSB to MSB) represents one word from
+* the last full address (exclusive). If the corresponding bit
+* is 1, there is a relative relocation on that word.
+*/
+   for (n = 0; rel < end && (*rel & BIT(0)); n++) {
+   unsigned i;
+   u64 bitmap = *(rel++);
+
+   for (i = 1; i < 64; ++i) {
+   if ((bitmap & BIT(i)))
+   __fixup_hyp_rel(addr + 8 * (63 * n + 
i));
+   }
+   }
+   }
+}
+#endif
+
 /*
  * The kernel relocated pointers to kernel VA. Iterate over relocations in
  * the hypervisor ELF sections and convert them to hyp VA. This avoids the
@@ -156,6 +193,10 @@ __init void kvm_fixup_hyp_relocations(void)
return;
 
__fixup_hyp_rela();
+
+#ifdef CONFIG_RELR
+   __fixup_hyp_relr();
+#endif
 }
 
 static u32 compute_instruction(int n, u32 rd, u32 rn)
-- 
2.29.2.299.gdc1121823c-goog



[RFC PATCH 4/6] kvm: arm64: Remove patching of fn pointers in hyp

2020-11-19 Thread David Brazdil
Taking a function pointer will now generate an R_AARCH64_RELATIVE
relocation that is fixed up at early boot. Remove the alternative-based
mechanism for converting the address from a kernel VA.
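
The effect on the hyp-main dispatch path (see the hunk below) is that
the extra indirection and the runtime VA conversion disappear; error
checks are elided in this sketch:

    /* before: table of pointers to function pointers, patched at runtime */
    kfn = host_hcall[id];
    hfn = kimg_fn_hyp_va(kfn);
    hfn(host_ctxt);

    /* after: the relocation already produced a hyp VA */
    hfn = host_hcall[id];
    hfn(host_ctxt);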

Signed-off-by: David Brazdil 
---
 arch/arm64/include/asm/kvm_mmu.h   | 18 --
 arch/arm64/kernel/image-vars.h |  1 -
 arch/arm64/kvm/hyp/nvhe/hyp-main.c | 11 ---
 arch/arm64/kvm/va_layout.c |  6 --
 4 files changed, 4 insertions(+), 32 deletions(-)

diff --git a/arch/arm64/include/asm/kvm_mmu.h b/arch/arm64/include/asm/kvm_mmu.h
index e5226f7e4732..8cb8974ec9cc 100644
--- a/arch/arm64/include/asm/kvm_mmu.h
+++ b/arch/arm64/include/asm/kvm_mmu.h
@@ -121,24 +121,6 @@ static __always_inline unsigned long 
__kern_hyp_va(unsigned long v)
 
 #define kern_hyp_va(v) ((typeof(v))(__kern_hyp_va((unsigned long)(v
 
-static __always_inline unsigned long __kimg_hyp_va(unsigned long v)
-{
-   unsigned long offset;
-
-   asm volatile(ALTERNATIVE_CB("movz %0, #0\n"
-   "movk %0, #0, lsl #16\n"
-   "movk %0, #0, lsl #32\n"
-   "movk %0, #0, lsl #48\n",
-   kvm_update_kimg_phys_offset)
-: "=r" (offset));
-
-   return __kern_hyp_va((v - offset) | PAGE_OFFSET);
-}
-
-#define kimg_fn_hyp_va(v)  ((typeof(*v))(__kimg_hyp_va((unsigned 
long)(v
-
-#define kimg_fn_ptr(x) (typeof(x) **)(x)
-
 /*
  * We currently support using a VM-specified IPA size. For backward
  * compatibility, the default IPA size is fixed to 40bits.
diff --git a/arch/arm64/kernel/image-vars.h b/arch/arm64/kernel/image-vars.h
index 8539f34d7538..6379721236cf 100644
--- a/arch/arm64/kernel/image-vars.h
+++ b/arch/arm64/kernel/image-vars.h
@@ -64,7 +64,6 @@ __efistub__ctype  = _ctype;
 /* Alternative callbacks for init-time patching of nVHE hyp code. */
 KVM_NVHE_ALIAS(kvm_patch_vector_branch);
 KVM_NVHE_ALIAS(kvm_update_va_mask);
-KVM_NVHE_ALIAS(kvm_update_kimg_phys_offset);
 KVM_NVHE_ALIAS(kvm_get_kimage_voffset);
 
 /* Global kernel state accessed by nVHE hyp code. */
diff --git a/arch/arm64/kvm/hyp/nvhe/hyp-main.c 
b/arch/arm64/kvm/hyp/nvhe/hyp-main.c
index b3db5f4eea27..7998eff5f0a2 100644
--- a/arch/arm64/kvm/hyp/nvhe/hyp-main.c
+++ b/arch/arm64/kvm/hyp/nvhe/hyp-main.c
@@ -110,9 +110,9 @@ static void handle___vgic_v3_restore_aprs(struct 
kvm_cpu_context *host_ctxt)
 
 typedef void (*hcall_t)(struct kvm_cpu_context *);
 
-#define HANDLE_FUNC(x) [__KVM_HOST_SMCCC_FUNC_##x] = kimg_fn_ptr(handle_##x)
+#define HANDLE_FUNC(x) [__KVM_HOST_SMCCC_FUNC_##x] = (hcall_t)handle_##x
 
-static const hcall_t *host_hcall[] = {
+static const hcall_t host_hcall[] = {
HANDLE_FUNC(__kvm_vcpu_run),
HANDLE_FUNC(__kvm_flush_vm_context),
HANDLE_FUNC(__kvm_tlb_flush_vmid_ipa),
@@ -132,7 +132,6 @@ static const hcall_t *host_hcall[] = {
 static void handle_host_hcall(struct kvm_cpu_context *host_ctxt)
 {
DECLARE_REG(unsigned long, id, host_ctxt, 0);
-   const hcall_t *kfn;
hcall_t hfn;
 
id -= KVM_HOST_SMCCC_ID(0);
@@ -140,13 +139,11 @@ static void handle_host_hcall(struct kvm_cpu_context 
*host_ctxt)
if (unlikely(id >= ARRAY_SIZE(host_hcall)))
goto inval;
 
-   kfn = host_hcall[id];
-   if (unlikely(!kfn))
+   hfn = host_hcall[id];
+   if (unlikely(!hfn))
goto inval;
 
cpu_reg(host_ctxt, 0) = SMCCC_RET_SUCCESS;
-
-   hfn = kimg_fn_hyp_va(kfn);
hfn(host_ctxt);
 
return;
diff --git a/arch/arm64/kvm/va_layout.c b/arch/arm64/kvm/va_layout.c
index 7f45a98eacfd..0494315f71f2 100644
--- a/arch/arm64/kvm/va_layout.c
+++ b/arch/arm64/kvm/va_layout.c
@@ -373,12 +373,6 @@ static void generate_mov_q(u64 val, __le32 *origptr, 
__le32 *updptr, int nr_inst
*updptr++ = cpu_to_le32(insn);
 }
 
-void kvm_update_kimg_phys_offset(struct alt_instr *alt,
-__le32 *origptr, __le32 *updptr, int nr_inst)
-{
-   generate_mov_q(kimage_voffset + PHYS_OFFSET, origptr, updptr, nr_inst);
-}
-
 void kvm_get_kimage_voffset(struct alt_instr *alt,
__le32 *origptr, __le32 *updptr, int nr_inst)
 {
-- 
2.29.2.299.gdc1121823c-goog



[RFC PATCH 1/6] kvm: arm64: Set up .hyp.rodata ELF section

2020-11-19 Thread David Brazdil
We will need to recognize pointers in .rodata specific to hyp, so
establish a .hyp.rodata ELF section. Merge it with the existing
.hyp.data..ro_after_init as they are treated the same at runtime.

Signed-off-by: David Brazdil 
---
 arch/arm64/include/asm/sections.h | 2 +-
 arch/arm64/kernel/vmlinux.lds.S   | 7 ---
 arch/arm64/kvm/arm.c  | 7 +++
 arch/arm64/kvm/hyp/nvhe/hyp.lds.S | 1 +
 4 files changed, 9 insertions(+), 8 deletions(-)

diff --git a/arch/arm64/include/asm/sections.h 
b/arch/arm64/include/asm/sections.h
index 8ff579361731..a6f3557d1ab2 100644
--- a/arch/arm64/include/asm/sections.h
+++ b/arch/arm64/include/asm/sections.h
@@ -11,7 +11,7 @@ extern char __alt_instructions[], __alt_instructions_end[];
 extern char __hibernate_exit_text_start[], __hibernate_exit_text_end[];
 extern char __hyp_idmap_text_start[], __hyp_idmap_text_end[];
 extern char __hyp_text_start[], __hyp_text_end[];
-extern char __hyp_data_ro_after_init_start[], __hyp_data_ro_after_init_end[];
+extern char __hyp_rodata_start[], __hyp_rodata_end[];
 extern char __idmap_text_start[], __idmap_text_end[];
 extern char __initdata_begin[], __initdata_end[];
 extern char __inittext_begin[], __inittext_end[];
diff --git a/arch/arm64/kernel/vmlinux.lds.S b/arch/arm64/kernel/vmlinux.lds.S
index 4382b5d0645d..6f2fd9734d63 100644
--- a/arch/arm64/kernel/vmlinux.lds.S
+++ b/arch/arm64/kernel/vmlinux.lds.S
@@ -31,10 +31,11 @@ jiffies = jiffies_64;
__stop___kvm_ex_table = .;
 
 #define HYPERVISOR_DATA_SECTIONS   \
-   HYP_SECTION_NAME(.data..ro_after_init) : {  \
-   __hyp_data_ro_after_init_start = .; \
+   HYP_SECTION_NAME(.rodata) : {   \
+   __hyp_rodata_start = .; \
*(HYP_SECTION_NAME(.data..ro_after_init))   \
-   __hyp_data_ro_after_init_end = .;   \
+   *(HYP_SECTION_NAME(.rodata))\
+   __hyp_rodata_end = .;   \
}
 
 #define HYPERVISOR_PERCPU_SECTION  \
diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c
index d6d5211653b7..119c97e8900a 100644
--- a/arch/arm64/kvm/arm.c
+++ b/arch/arm64/kvm/arm.c
@@ -1688,11 +1688,10 @@ static int init_hyp_mode(void)
goto out_err;
}
 
-   err = create_hyp_mappings(kvm_ksym_ref(__hyp_data_ro_after_init_start),
- kvm_ksym_ref(__hyp_data_ro_after_init_end),
- PAGE_HYP_RO);
+   err = create_hyp_mappings(kvm_ksym_ref(__hyp_rodata_start),
+ kvm_ksym_ref(__hyp_rodata_end), PAGE_HYP_RO);
if (err) {
-   kvm_err("Cannot map .hyp.data..ro_after_init section\n");
+   kvm_err("Cannot map .hyp.rodata section\n");
goto out_err;
}
 
diff --git a/arch/arm64/kvm/hyp/nvhe/hyp.lds.S 
b/arch/arm64/kvm/hyp/nvhe/hyp.lds.S
index 5d76ff2ba63e..b0789183d49d 100644
--- a/arch/arm64/kvm/hyp/nvhe/hyp.lds.S
+++ b/arch/arm64/kvm/hyp/nvhe/hyp.lds.S
@@ -17,4 +17,5 @@ SECTIONS {
PERCPU_INPUT(L1_CACHE_BYTES)
}
HYP_SECTION(.data..ro_after_init)
+   HYP_SECTION(.rodata)
 }
-- 
2.29.2.299.gdc1121823c-goog



Re: [PATCH v5 0/2] MTE support for KVM guest

2020-11-19 Thread Marc Zyngier

On 2020-11-19 18:42, Andrew Jones wrote:

On Thu, Nov 19, 2020 at 03:45:40PM +, Peter Maydell wrote:
On Thu, 19 Nov 2020 at 15:39, Steven Price  
wrote:

> This series adds support for Arm's Memory Tagging Extension (MTE) to
> KVM, allowing KVM guests to make use of it. This builds on the existing
> user space support already in v5.10-rc1, see [1] for an overview.

> The change to require the VMM to map all guest memory PROT_MTE is
> significant as it means that the VMM has to deal with the MTE tags even
> if it doesn't care about them (e.g. for virtual devices or if the VMM
> doesn't support migration). Also unfortunately because the VMM can
> change the memory layout at any time the check for PROT_MTE/VM_MTE has
> to be done very late (at the point of faulting pages into stage 2).

I'm a bit dubious about requring the VMM to map the guest memory
PROT_MTE unless somebody's done at least a sketch of the design
for how this would work on the QEMU side. Currently QEMU just
assumes the guest memory is guest memory and it can access it
without special precautions...



There are two statements being made here:

1) Requiring the use of PROT_MTE when mapping guest memory may not fit
   QEMU well.

2) New KVM features should be accompanied with supporting QEMU code in
   order to prove that the APIs make sense.

I strongly agree with (2). While kvmtool supports some quick testing, 
it
doesn't support migration. We must test all new features with a 
migration

supporting VMM.

I'm not sure about (1). I don't feel like it should be a major problem,
but (2).

I'd be happy to help with the QEMU prototype, but preferably when 
there's

hardware available. Has all the current MTE testing just been done on
simulators? And, if so, are there regression tests regularly running on
the simulators too? And can they test migration? If hardware doesn't
show up quickly and simulators aren't used for regression tests, then
all this code will start rotting from day one.


While I agree with the sentiment, the reality is pretty bleak.

I'm pretty sure nobody will ever run a migration on emulation. I also 
doubt
there is much overlap between MTE users and migration users, 
unfortunately.


No HW is available today, and when it becomes available, it will be in
the form of a closed system on which QEMU doesn't run, either because
we are locked out of EL2 (as usual), or because migration is not part of
the use case (like KVM on Android, for example).

So we can wait another two (five?) years until general purpose HW 
becomes

available, or we start merging what we can test today. I'm inclined to
do the latter.

And I think it is absolutely fine for QEMU to say "no MTE support with 
KVM"

(we can remove all userspace visibility, except for the capability).

M.
--
Jazz is not dead. It just smells funny...


Re: [PATCH v5 0/2] MTE support for KVM guest

2020-11-19 Thread Andrew Jones
On Thu, Nov 19, 2020 at 03:45:40PM +, Peter Maydell wrote:
> On Thu, 19 Nov 2020 at 15:39, Steven Price  wrote:
> > This series adds support for Arm's Memory Tagging Extension (MTE) to
> > KVM, allowing KVM guests to make use of it. This builds on the existing
> > user space support already in v5.10-rc1, see [1] for an overview.
> 
> > The change to require the VMM to map all guest memory PROT_MTE is
> > significant as it means that the VMM has to deal with the MTE tags even
> > if it doesn't care about them (e.g. for virtual devices or if the VMM
> > doesn't support migration). Also unfortunately because the VMM can
> > change the memory layout at any time the check for PROT_MTE/VM_MTE has
> > to be done very late (at the point of faulting pages into stage 2).
> 
> I'm a bit dubious about requring the VMM to map the guest memory
> PROT_MTE unless somebody's done at least a sketch of the design
> for how this would work on the QEMU side. Currently QEMU just
> assumes the guest memory is guest memory and it can access it
> without special precautions...
>

There are two statements being made here:

1) Requiring the use of PROT_MTE when mapping guest memory may not fit
   QEMU well.

2) New KVM features should be accompanied with supporting QEMU code in
   order to prove that the APIs make sense.

I strongly agree with (2). While kvmtool supports some quick testing, it
doesn't support migration. We must test all new features with a migration
supporting VMM.

I'm not sure about (1). I don't feel like it should be a major problem,
but (2).

I'd be happy to help with the QEMU prototype, but preferably when there's
hardware available. Has all the current MTE testing just been done on
simulators? And, if so, are there regression tests regularly running on
the simulators too? And can they test migration? If hardware doesn't
show up quickly and simulators aren't used for regression tests, then
all this code will start rotting from day one.

Thanks,
drew



Re: [PATCH v13 01/15] iommu: Introduce attach/detach_pasid_table API

2020-11-19 Thread Auger Eric
Hi Jacob,
On 11/18/20 5:19 PM, Jacob Pan wrote:
> Hi Eric,
> 
> On Wed, 18 Nov 2020 12:21:37 +0100, Eric Auger 
> wrote:
> 
>> In virtualization use case, when a guest is assigned
>> a PCI host device, protected by a virtual IOMMU on the guest,
>> the physical IOMMU must be programmed to be consistent with
>> the guest mappings. If the physical IOMMU supports two
>> translation stages it makes sense to program guest mappings
>> onto the first stage/level (ARM/Intel terminology) while the host
>> owns the stage/level 2.
>>
>> In that case, it is mandated to trap on guest configuration
>> settings and pass those to the physical iommu driver.
>>
>> This patch adds a new API to the iommu subsystem that allows
>> to set/unset the pasid table information.
>>
>> A generic iommu_pasid_table_config struct is introduced in
>> a new iommu.h uapi header. This is going to be used by the VFIO
>> user API.
>>
>> Signed-off-by: Jean-Philippe Brucker 
>> Signed-off-by: Liu, Yi L 
>> Signed-off-by: Ashok Raj 
>> Signed-off-by: Jacob Pan 
>> Signed-off-by: Eric Auger 
>>
>> ---
>>
>> v12 -> v13:
>> - Fix config check
>>
>> v11 -> v12:
>> - add argsz, name the union
>> ---
>>  drivers/iommu/iommu.c  | 68 ++
>>  include/linux/iommu.h  | 21 
>>  include/uapi/linux/iommu.h | 54 ++
>>  3 files changed, 143 insertions(+)
>>
>> diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c
>> index b53446bb8c6b..978fe34378fb 100644
>> --- a/drivers/iommu/iommu.c
>> +++ b/drivers/iommu/iommu.c
>> @@ -2171,6 +2171,74 @@ int iommu_uapi_sva_unbind_gpasid(struct
>> iommu_domain *domain, struct device *dev }
>>  EXPORT_SYMBOL_GPL(iommu_uapi_sva_unbind_gpasid);
>>  
>> +int iommu_attach_pasid_table(struct iommu_domain *domain,
>> + struct iommu_pasid_table_config *cfg)
>> +{
>> +if (unlikely(!domain->ops->attach_pasid_table))
>> +return -ENODEV;
>> +
>> +return domain->ops->attach_pasid_table(domain, cfg);
>> +}
>> +
>> +int iommu_uapi_attach_pasid_table(struct iommu_domain *domain,
>> +  void __user *uinfo)
>> +{
>> +struct iommu_pasid_table_config pasid_table_data = { 0 };
>> +u32 minsz;
>> +
>> +if (unlikely(!domain->ops->attach_pasid_table))
>> +return -ENODEV;
>> +
>> +/*
>> + * No new spaces can be added before the variable sized union,
>> the
>> + * minimum size is the offset to the union.
>> + */
>> +minsz = offsetof(struct iommu_pasid_table_config, vendor_data);
>> +
>> +/* Copy minsz from user to get flags and argsz */
>> +if (copy_from_user(&pasid_table_data, uinfo, minsz))
>> +return -EFAULT;
>> +
>> +/* Fields before the variable size union are mandatory */
>> +if (pasid_table_data.argsz < minsz)
>> +return -EINVAL;
>> +
>> +/* PASID and address granu require additional info beyond minsz
>> */
>> +if (pasid_table_data.version != PASID_TABLE_CFG_VERSION_1)
>> +return -EINVAL;
>> +if (pasid_table_data.format == IOMMU_PASID_FORMAT_SMMUV3 &&
>> +pasid_table_data.argsz <
>> +offsetofend(struct iommu_pasid_table_config,
>> vendor_data.smmuv3))
>> +return -EINVAL;
>> +
>> +/*
>> + * User might be using a newer UAPI header which has a larger
>> data
>> + * size, we shall support the existing flags within the current
>> + * size. Copy the remaining user data _after_ minsz but not more
>> + * than the current kernel supported size.
>> + */
>> +if (copy_from_user((void *)&pasid_table_data + minsz, uinfo + minsz,
>> +   min_t(u32, pasid_table_data.argsz, sizeof(pasid_table_data)) - minsz))
>> +return -EFAULT;
>> +
>> +/* Now the argsz is validated, check the content */
>> +if (pasid_table_data.config < IOMMU_PASID_CONFIG_TRANSLATE ||
>> +pasid_table_data.config > IOMMU_PASID_CONFIG_ABORT)
>> +return -EINVAL;
>> +
>> +return domain->ops->attach_pasid_table(domain, &pasid_table_data);
>> +}
>> +EXPORT_SYMBOL_GPL(iommu_uapi_attach_pasid_table);
>> +
>> +void iommu_detach_pasid_table(struct iommu_domain *domain)
>> +{
>> +if (unlikely(!domain->ops->detach_pasid_table))
>> +return;
>> +
>> +domain->ops->detach_pasid_table(domain);
>> +}
>> +EXPORT_SYMBOL_GPL(iommu_detach_pasid_table);
>> +
>>  static void __iommu_detach_device(struct iommu_domain *domain,
>>struct device *dev)
>>  {
>> diff --git a/include/linux/iommu.h b/include/linux/iommu.h
>> index b95a6f8db6ff..464fcbecf841 100644
>> --- a/include/linux/iommu.h
>> +++ b/include/linux/iommu.h
>> @@ -223,6 +223,8 @@ struct iommu_iotlb_gather {
>>   * @cache_invalidate: invalidate translation caches
>>   * @sva_bind_gpasid: bind guest pasid and mm
>>   * @sva_unbind_gpasid: unbind guest pasid and mm
>> + * @attach_pasid_table: attach a pasid table
>> + * 

Re: [RFC PATCH v3 10/16] KVM: arm64: Add a new VM device control group for SPE

2020-11-19 Thread James Morse
Hi Alex,

On 27/10/2020 17:26, Alexandru Elisei wrote:
> Stage 2 faults triggered by the profiling buffer attempting to write to
> memory are reported by the SPE hardware by asserting a buffer management
> event interrupt. Interrupts are by their nature asynchronous, which means
> that the guest might have changed its stage 1 translation tables since the
> attempted write. SPE reports the guest virtual address that caused the data
> abort, but not the IPA, which means that KVM would have to walk the guest's
> stage 1 tables to find the IPA; using the AT instruction to walk the
> guest's tables in hardware is not an option because it doesn't report the
> IPA in the case of a stage 2 fault on a stage 1 table walk.

Great detailed description, I think a summary helps identify 'both' problems:
| To work reliably, both the profiling buffer and the page tables to reach it 
must not
| fault at stage2.

> Fix both problems by pre-mapping the guest's memory at stage 2 with write
> permissions to avoid any faults. Userspace calls mlock() on the VMAs that
> back the guest's memory, pinning the pages in memory, then tells KVM to map
> the memory at stage 2 by using the VM control group KVM_ARM_VM_SPE_CTRL
> with the attribute KVM_ARM_VM_SPE_FINALIZE.

The reason to have this feature is SPE, but is there anything SPE specific in 
the feature?

I can imagine this being useful on its own if I wanted to reduce guest-exits for
quasi-real-time reasons, and had memory to burn!

(as an independent feature, it might be useful on other architectures too...)


Would it make sense to add this as a flag to KVM_SET_USER_MEMORY_REGION? That 
is the point
that the userspace_addr is provided to KVM, this would allow us to fail the 
call if a
KVM_MEM_LOCKED memslot can't be created because the underlying VMA aren't 
VM_LOCKED.

(it also makes it easy to catch incompatible changes of flags in the future)

/me wanders off musing if this can then be combined with VM_PFNMAP in
kvm_arch_prepare_memory_region()


> KVM will map all writable VMAs which have the VM_LOCKED flag set.

> Hugetlb VMAs are practically pinned in
> memory after they are faulted in and mlock() doesn't set the VM_LOCKED
> flag, and just faults the pages in;

Ugh. It would be nice to avoid special casing this. KVM shouldn't have to care 
about the
difference between a hugetlbfs PMD and a THP PMD.

From mlock_fixup(), it looks like this is because these VMAs can't be split.
Is it possible to change this if mlock() is called for the whole range? 
(user-space must
know its hugetlbfs!)

Alternatively, it would good if mm can tell us when a page is locked (and/or 
special
cased). That way dax does the right thing too, without having extra special 
casing in KVM.
This would also catch VM_PFNMAP if mm knows its effectively the same as 
VM_LOCKED...


> KVM will treat hugetlb VMAs like they
> have the VM_LOCKED flag and will also map them, faulting them in if
> necessary, when handling the ioctl.

Surely user-space should call mlock() to do the faulting in? (and do that 
before handing
the memory over to KVM)

Getting KVM to do it will create a loop via the mmu_notifier if this touches a 
COW page,
which in turn bumps the sequence counter causing us to bomb out with -EAGAIN.
(it looks like wp_page_copy() is the only case that calls set_pte_at_notify())


> VM live migration relies on a bitmap of dirty pages. This bitmap is created
> by write-protecting a memslot and updating it as KVM handles stage 2 write
> faults. Because KVM cannot handle stage 2 faults reported by the profiling
> buffer, it will not pre-map a logging memslot. This effectively means that
> profiling is not available when the VM is configured for live migration.

Yeah ... that sucks. Have any of the Qemu folk said what they'd like to see 
here?

I can imagine making the logging-enable call fail if any CPU has SPE profiling 
enabled, as
the logging will change the results of SPE... We'd then need an exit to 
user-space to say
that the vcpu tried to enable SPE while logging was active. Qemu can then 
decide whether
to block that vcpu until migration completes, or abort migration.
But: I've no idea how Qemu manages migration, so it may not be able to do 
irregular things
like this.

As a short cut, can we let the arch code fail calls that make problematic 
changes. (e.g.
setting KVM_MEM_LOG_DIRTY_PAGES or KVM_MEM_READONLY). It looks like you 
currently document
these as silently breaking something else... (an invitation to debug a subtle 
interaction
in the future!)

~

How does this interact with KSM?
I can see its try_to_merge_one_page() calling write_protect_page() before 
testing the
vm_flags for VM_LOCKED ... so it doesn't look like mlock() stop KSM from doing 
its work -
which in turn will cause stage2 faults.

It looks like this is all hinged on VM_MERGEABLE, which can be cleared with an 
madvise()
call using MADV_UNMERGEABLE ... but from the man page at least this is to undo 
a previous
hint.

I 

Re: [RFC PATCH v3 08/16] KVM: arm64: Add a new VCPU device control group for SPE

2020-11-19 Thread James Morse
Hi Alex,

On 27/10/2020 17:26, Alexandru Elisei wrote:
> From: Sudeep Holla 
> 
> To configure the virtual SPE buffer management interrupt number, we use a
> VCPU kvm_device ioctl, encapsulating the KVM_ARM_VCPU_SPE_IRQ attribute
> within the KVM_ARM_VCPU_SPE_CTRL group.
> 
> After configuring the SPE, userspace is required to call the VCPU ioctl
> with the attribute KVM_ARM_VCPU_SPE_INIT to initialize SPE on the VCPU.

> diff --git a/Documentation/virt/kvm/devices/vcpu.rst 
> b/Documentation/virt/kvm/devices/vcpu.rst
> index 2acec3b9ef65..6135b9827fbe 100644
> --- a/Documentation/virt/kvm/devices/vcpu.rst
> +++ b/Documentation/virt/kvm/devices/vcpu.rst
> @@ -161,3 +161,43 @@ Specifies the base address of the stolen time structure 
> for this VCPU. The
>  base address must be 64 byte aligned and exist within a valid guest memory
>  region. See Documentation/virt/kvm/arm/pvtime.rst for more information
>  including the layout of the stolen time structure.
> +
> +4. GROUP: KVM_ARM_VCPU_SPE_CTRL
> +===
> +
> +:Architectures: ARM64
> +
> +4.1 ATTRIBUTE: KVM_ARM_VCPU_SPE_IRQ
> +---
> +
> +:Parameters: in kvm_device_attr.addr the address for the SPE buffer 
> management
> + interrupt is a pointer to an int
> +
> +Returns:
> +
> +  ===  
> +  -EBUSY   The SPE buffer management interrupt is already set
> +  -EINVAL  Invalid SPE overflow interrupt number
> +  -EFAULT  Could not read the buffer management interrupt number
> +  -ENXIO   SPE not supported or not properly configured
> +  ===  
> +
> +A value describing the SPE (Statistical Profiling Extension) overflow 
> interrupt
> +number for this vcpu. This interrupt should be a PPI and the interrupt type 
> and
> +number must be same for each vcpu.
> +
> +4.2 ATTRIBUTE: KVM_ARM_VCPU_SPE_INIT
> +
> +
> +:Parameters: no additional parameter in kvm_device_attr.addr
> +
> +Returns:
> +
> +  ===  ==
> +  -EBUSY   SPE already initialized
> +  -ENODEV  GIC not initialized
> +  -ENXIO   SPE not supported or not properly configured
> +  ===  ==

> +Request the initialization of the SPE. Must be done after initializing the
> +in-kernel irqchip and after setting the interrupt number for the VCPU.

Fantastic!


> diff --git a/arch/arm64/kvm/reset.c b/arch/arm64/kvm/reset.c
> index f32490229a4c..4dc205fa4be1 100644
> --- a/arch/arm64/kvm/reset.c
> +++ b/arch/arm64/kvm/reset.c
> @@ -87,6 +87,9 @@ int kvm_arch_vm_ioctl_check_extension(struct kvm *kvm, long 
> ext)
>   case KVM_CAP_ARM_PTRAUTH_GENERIC:
>   r = system_has_full_ptr_auth();
>   break;
> + case KVM_CAP_ARM_SPE:
> + r = kvm_arm_supports_spe();
> + break;
>   default:
>   r = 0;
>   }
> @@ -223,6 +226,19 @@ static int kvm_vcpu_enable_ptrauth(struct kvm_vcpu *vcpu)
>   return 0;
>  }
>  
> +static int kvm_vcpu_enable_spe(struct kvm_vcpu *vcpu)
> +{
> + if (!kvm_arm_supports_spe())
> + return -EINVAL;
> +
> + /* SPE is disabled if the PE is in AArch32 state */
> + if (test_bit(KVM_ARM_VCPU_EL1_32BIT, vcpu->arch.features))
> + return -EINVAL;
> +
> + vcpu->arch.flags |= KVM_ARM64_GUEST_HAS_SPE;
> + return 0;
> +}

VCPU-reset promotes the VMM feature into flags. How does this interact with
kvm_arm_spe_init()?

It doesn't look like this resets any state, couldn't it be done once by 
kvm_arm_spe_init()?


>  /**
>   * kvm_reset_vcpu - sets core registers and sys_regs to reset value
>   * @vcpu: The VCPU pointer
> @@ -274,6 +290,13 @@ int kvm_reset_vcpu(struct kvm_vcpu *vcpu)
>   }
>   }
>  
> + if (test_bit(KVM_ARM_VCPU_SPE, vcpu->arch.features)) {
> + if (kvm_vcpu_enable_spe(vcpu)) {
> + ret = -EINVAL;
> + goto out;
> + }
> + }
> +
>   switch (vcpu->arch.target) {
>   default:
>   if (test_bit(KVM_ARM_VCPU_EL1_32BIT, vcpu->arch.features)) {

> diff --git a/arch/arm64/kvm/spe.c b/arch/arm64/kvm/spe.c
> new file mode 100644
> index ..f91a52cd7cd3
> --- /dev/null
> +++ b/arch/arm64/kvm/spe.c
> @@ -0,0 +1,129 @@

> +static bool kvm_arm_spe_irq_is_valid(struct kvm *kvm, int irq)
> +{
> + int i;
> + struct kvm_vcpu *vcpu;
> +
> + /* The SPE overflow interrupt can be a PPI only */
> + if (!irq_is_ppi(irq))
> + return false;
> +
> + kvm_for_each_vcpu(i, vcpu, kvm) {
> + if (!kvm_arm_spe_irq_initialized(vcpu))
> + continue;
> +
> + if (vcpu->arch.spe_cpu.irq_num != irq)
> + return false;
> + }

Looks like you didn't 

Re: [RFC PATCH v3 06/16] KVM: arm64: Introduce SPE primitives

2020-11-19 Thread James Morse
Hi Alex,

On 27/10/2020 17:26, Alexandru Elisei wrote:
> KVM SPE emulation depends on the configuration option KVM_ARM_SPE and on on
> having hardware SPE support on all CPUs.

> The host driver must be
> compiled-in because we need the SPE interrupt to be enabled; it will be
> used to kick us out of the guest when the profiling buffer management
> interrupt is asserted by the GIC (for example, when the buffer is full).

Great: SPE IRQ very important...


> Add a VCPU flag to inform KVM that the guest has SPE enabled.
> 
> It's worth noting that even though the KVM_ARM_SPE config option is gated
> by the SPE host driver being compiled-in, we don't actually check that the
> driver was loaded successfully when we advertise SPE support for guests.

Eh?

> That's because we can live with the SPE interrupt being disabled. There is
> a delay between when the SPE hardware asserts the interrupt and when the
> GIC samples the interrupt line and asserts it to the CPU. If the SPE
> interrupt is disabled at the GIC level, this delay will be larger,

How does this work? Surely the IRQ needs to be enabled before it can become 
pending at the
CPU to kick us out of the guest...


> at most a host timer tick.

(Because the timer brings us out of the guest anyway?)


Thanks,

James


Re: [RFC PATCH v3 04/16] arm64: Introduce CPU SPE feature

2020-11-19 Thread James Morse
Hi Alex,

On 27/10/2020 17:26, Alexandru Elisei wrote:
> Detect Statistical Profiling Extension (SPE) support using the cpufeatures
> framework. The presence of SPE is reported via the ARM64_SPE capability.
> 
> The feature will be necessary for emulating SPE in KVM, because KVM needs
> that all CPUs have SPE hardware to avoid scheduling a VCPU on a CPU without
> support. For this reason, the feature type ARM64_CPUCAP_SYSTEM_FEATURE has
> been selected to disallow hotplugging a CPU which doesn't support SPE.

Can you mention the existing driver in the commit message? Surprisingly it 
doesn't use
cpufeature at all. It looks like arm_spe_pmu_dev_init() goes out of its way to 
support
mismatched systems. (otherwise the significance of the new behaviour isn't 
clear!)

I read it as: the host is fine with mismatched systems, and the existing 
drivers supports
this. But KVM is not. After this patch you can't make the system mismatched 
'late'.


Thanks,

James


Re: [RFC PATCH v3 01/16] KVM: arm64: Initialize VCPU mdcr_el2 before loading it

2020-11-19 Thread James Morse
Hi Alex,

On 27/10/2020 17:26, Alexandru Elisei wrote:
> When a VCPU is created, the kvm_vcpu struct is initialized to zero in
> kvm_vm_ioctl_create_vcpu(). On VHE systems, the first time
> vcpu.arch.mdcr_el2 is loaded on hardware is in vcpu_load(), before it is
> set to a sensible value in kvm_arm_setup_debug() later in the run loop. The
> result is that KVM executes for a short time with MDCR_EL2 set to zero.
> 
> This is mostly harmless as we don't need to trap debug and SPE register
> accesses from EL1 (we're still running in the host at EL2), but we do set
> MDCR_EL2.HPMN to 0 which is constrained unpredictable according to ARM DDI
> 0487F.b, page D13-3620; the required behavior from the hardware in this
> case is to reserve an unkown number of registers for EL2 and EL3 exclusive
> use.
> 
> Initialize mdcr_el2 in kvm_vcpu_vcpu_first_run_init(), so we can avoid the
> constrained unpredictable behavior and to ensure that the MDCR_EL2 register
> has the same value after each vcpu_load(), including the first time the
> VCPU is run.


> diff --git a/arch/arm64/kvm/debug.c b/arch/arm64/kvm/debug.c
> index 7a7e425616b5..22ee448aee2b 100644
> --- a/arch/arm64/kvm/debug.c
> +++ b/arch/arm64/kvm/debug.c
> @@ -68,6 +68,59 @@ void kvm_arm_init_debug(void)

> +static void kvm_arm_setup_mdcr_el2(struct kvm_vcpu *vcpu, u32 host_mdcr)
> +{
> + bool trap_debug = !(vcpu->arch.flags & KVM_ARM64_DEBUG_DIRTY);
> +
> + /*
> +  * This also clears MDCR_EL2_E2PB_MASK to disable guest access
> +  * to the profiling buffer.
> +  */
> + vcpu->arch.mdcr_el2 = host_mdcr & MDCR_EL2_HPMN_MASK;
> + vcpu->arch.mdcr_el2 |= (MDCR_EL2_TPM |
> + MDCR_EL2_TPMS |
> + MDCR_EL2_TPMCR |
> + MDCR_EL2_TDRA |
> + MDCR_EL2_TDOSA);

> + if (vcpu->guest_debug) {
> + /* Route all software debug exceptions to EL2 */
> + vcpu->arch.mdcr_el2 |= MDCR_EL2_TDE;
> + if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW)
> + trap_debug = true;
> + }

This had me confused for a while... could you hint that this is when the guest
is being 'external' debugged by the VMM? (it's clearer before this change)


Thanks,

James


> + /* Trap debug register access */
> + if (trap_debug)
> + vcpu->arch.mdcr_el2 |= MDCR_EL2_TDA;
> +
> + trace_kvm_arm_set_dreg32("MDCR_EL2", vcpu->arch.mdcr_el2);
> +}


Re: [PATCH v5 0/2] MTE support for KVM guest

2020-11-19 Thread Peter Maydell
On Thu, 19 Nov 2020 at 15:57, Steven Price  wrote:
> On 19/11/2020 15:45, Peter Maydell wrote:
> > I'm a bit dubious about requring the VMM to map the guest memory
> > PROT_MTE unless somebody's done at least a sketch of the design
> > for how this would work on the QEMU side. Currently QEMU just
> > assumes the guest memory is guest memory and it can access it
> > without special precautions...
>
> I agree this needs some investigation - I'm hoping Haibo will be able to
> provide some feedback here as he has been looking at the QEMU support.
> However the VMM is likely going to require some significant changes to
> ensure that migration doesn't break, so either way there's work to be done.
>
> Fundamentally most memory will need a mapping with PROT_MTE just so the
> VMM can get at the tags for migration purposes, so QEMU is going to have
> to learn how to treat guest memory specially if it wants to be able to
> enable MTE for both itself and the guest.

If the only reason the VMM needs tag access is for migration it
feels like there must be a nicer way to do it than by
requiring it to map the whole of the guest address space twice
(once for normal use and once to get the tags)...

Anyway, maybe "must map PROT_MTE" is workable, but it seems
a bit premature to fix the kernel ABI as working that way
until we are at least reasonably sure that it is the right
design.

thanks
-- PMM


Re: [PATCH v4 2/2] arm64: kvm: Introduce MTE VCPU feature

2020-11-19 Thread Catalin Marinas
On Thu, Nov 19, 2020 at 12:45:52PM +, Steven Price wrote:
> On 18/11/2020 17:05, Andrew Jones wrote:
> > On Wed, Nov 18, 2020 at 04:50:01PM +, Catalin Marinas wrote:
> > > On Wed, Nov 18, 2020 at 04:01:20PM +, Steven Price wrote:
> > > > On 17/11/2020 16:07, Catalin Marinas wrote:
> > > > > On Mon, Oct 26, 2020 at 03:57:27PM +, Steven Price wrote:
> > > > > > diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c
> > > > > > index 19aacc7d64de..38fe25310ca1 100644
> > > > > > --- a/arch/arm64/kvm/mmu.c
> > > > > > +++ b/arch/arm64/kvm/mmu.c
> > > > > > @@ -862,6 +862,26 @@ static int user_mem_abort(struct kvm_vcpu 
> > > > > > *vcpu, phys_addr_t fault_ipa,
> > > > > > if (vma_pagesize == PAGE_SIZE && !force_pte)
> > > > > > vma_pagesize = transparent_hugepage_adjust(memslot, hva,
> > > > > >&pfn, &fault_ipa);
> > > > > > +
> > > > > > +   /*
> > > > > > +* The otherwise redundant test for system_supports_mte() 
> > > > > > allows the
> > > > > > +* code to be compiled out when CONFIG_ARM64_MTE is not present.
> > > > > > +*/
> > > > > > +   if (system_supports_mte() && kvm->arch.mte_enabled && 
> > > > > > pfn_valid(pfn)) {
> > > > > > +   /*
> > > > > > +* VM will be able to see the page's tags, so we must 
> > > > > > ensure
> > > > > > +* they have been initialised.
> > > > > > +*/
> > > > > > +   struct page *page = pfn_to_page(pfn);
> > > > > > +   long i, nr_pages = compound_nr(page);
> > > > > > +
> > > > > > +   /* if PG_mte_tagged is set, tags have already been 
> > > > > > initialised */
> > > > > > +   for (i = 0; i < nr_pages; i++, page++) {
> > > > > > +   if (!test_and_set_bit(PG_mte_tagged, &page->flags))
> > > > > > +   mte_clear_page_tags(page_address(page));
> > > > > > +   }
> > > > > > +   }
> > > > > 
> > > > > If this page was swapped out and mapped back in, where does the
> > > > > restoring from swap happen?
> > > > 
> > > > Restoring from swap happens above this in the call to gfn_to_pfn_prot()
> > > 
> > > Looking at the call chain, gfn_to_pfn_prot() ends up with
> > > get_user_pages() using the current->mm (the VMM) and that does a
> > > set_pte_at(), presumably restoring the tags. Does this mean that all
> > > memory mapped by the VMM in user space should have PROT_MTE set?
> > > Otherwise we don't take the mte_sync_tags() path in set_pte_at() and no
> > > tags restored from swap (we do save them since when they were mapped,
> > > PG_mte_tagged was set).
> > > 
> > > So I think the code above should be similar to mte_sync_tags(), even
> > > calling a common function, but I'm not sure where to get the swap pte
> > > from.
> 
> You're right - the code is broken as it stands. I've just been able to
> reproduce the loss of tags due to swap.
> 
> The problem is that we also don't have a suitable pte to do the restore from
> swap from. So either set_pte_at() would have to unconditionally check for
> MTE tags for all previous swap entries as you suggest below. I had a quick
> go at testing this and hit issues with the idle task getting killed during
> boot - I fear there are some fun issues regarding initialisation order here.

My attempt here but not fully tested (just booted, no swap support):

diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h
index b35833259f08..27d7fd336a16 100644
--- a/arch/arm64/include/asm/pgtable.h
+++ b/arch/arm64/include/asm/pgtable.h
@@ -304,7 +304,7 @@ static inline void set_pte_at(struct mm_struct *mm, 
unsigned long addr,
__sync_icache_dcache(pte);
 
if (system_supports_mte() &&
-   pte_present(pte) && pte_tagged(pte) && !pte_special(pte))
+   pte_present(pte) && pte_valid_user(pte) && !pte_special(pte))
mte_sync_tags(ptep, pte);
 
__check_racy_pte_update(mm, ptep, pte);
diff --git a/arch/arm64/kernel/mte.c b/arch/arm64/kernel/mte.c
index 52a0638ed967..bbd6c56d33d9 100644
--- a/arch/arm64/kernel/mte.c
+++ b/arch/arm64/kernel/mte.c
@@ -20,18 +20,24 @@
 #include 
 #include 
 
-static void mte_sync_page_tags(struct page *page, pte_t *ptep, bool check_swap)
+static void mte_sync_page_tags(struct page *page, pte_t *ptep, pte_t pte,
+  bool check_swap)
 {
pte_t old_pte = READ_ONCE(*ptep);
 
if (check_swap && is_swap_pte(old_pte)) {
swp_entry_t entry = pte_to_swp_entry(old_pte);
 
-   if (!non_swap_entry(entry) && mte_restore_tags(entry, page))
+   if (!non_swap_entry(entry) && mte_restore_tags(entry, page)) {
+   set_bit(PG_mte_tagged, &page->flags);
return;
+   }
}
 
-   mte_clear_page_tags(page_address(page));
+   if (pte_tagged(pte)) {
+   

Re: [PATCH v5 0/2] MTE support for KVM guest

2020-11-19 Thread Steven Price

On 19/11/2020 15:45, Peter Maydell wrote:

On Thu, 19 Nov 2020 at 15:39, Steven Price  wrote:

This series adds support for Arm's Memory Tagging Extension (MTE) to
KVM, allowing KVM guests to make use of it. This builds on the existing
user space support already in v5.10-rc1, see [1] for an overview.



The change to require the VMM to map all guest memory PROT_MTE is
significant as it means that the VMM has to deal with the MTE tags even
if it doesn't care about them (e.g. for virtual devices or if the VMM
doesn't support migration). Also unfortunately because the VMM can
change the memory layout at any time the check for PROT_MTE/VM_MTE has
to be done very late (at the point of faulting pages into stage 2).


I'm a bit dubious about requring the VMM to map the guest memory
PROT_MTE unless somebody's done at least a sketch of the design
for how this would work on the QEMU side. Currently QEMU just
assumes the guest memory is guest memory and it can access it
without special precautions...


I agree this needs some investigation - I'm hoping Haibo will be able to 
provide some feedback here as he has been looking at the QEMU support. 
However the VMM is likely going to require some significant changes to 
ensure that migration doesn't break, so either way there's work to be done.


Fundamentally most memory will need a mapping with PROT_MTE just so the 
VMM can get at the tags for migration purposes, so QEMU is going to have 
to learn how to treat guest memory specially if it wants to be able to 
enable MTE for both itself and the guest.


I'll also hunt down what's happening with my attempts to fix the 
set_pte_at() handling for swap and I'll post that as an alternative if 
it turns out to be a reasonable approach. But I don't think that solve 
the QEMU issue above.


The other alternative would be to implement a new kernel interface to 
fetch tags from the guest and not require the VMM to maintain a PROT_MTE 
mapping. But we need some real feedback from someone familiar with QEMU 
to know what that interface should look like. So I'm holding off on that 
until there's a 'real' PoC implementation.


Thanks,

Steve


Re: [PATCH v5 0/2] MTE support for KVM guest

2020-11-19 Thread Peter Maydell
On Thu, 19 Nov 2020 at 15:39, Steven Price  wrote:
> This series adds support for Arm's Memory Tagging Extension (MTE) to
> KVM, allowing KVM guests to make use of it. This builds on the existing
> user space support already in v5.10-rc1, see [1] for an overview.

> The change to require the VMM to map all guest memory PROT_MTE is
> significant as it means that the VMM has to deal with the MTE tags even
> if it doesn't care about them (e.g. for virtual devices or if the VMM
> doesn't support migration). Also unfortunately because the VMM can
> change the memory layout at any time the check for PROT_MTE/VM_MTE has
> to be done very late (at the point of faulting pages into stage 2).

I'm a bit dubious about requiring the VMM to map the guest memory
PROT_MTE unless somebody's done at least a sketch of the design
for how this would work on the QEMU side. Currently QEMU just
assumes the guest memory is guest memory and it can access it
without special precautions...

thanks
-- PMM


[PATCH v5 2/2] arm64: kvm: Introduce MTE VCPU feature

2020-11-19 Thread Steven Price
Add a new VM feature 'KVM_ARM_CAP_MTE' which enables memory tagging
for a VM. This exposes the feature to the guest and requires the VMM to
have set PROT_MTE on all mappings exposed to the guest. This ensures
that the guest cannot see stale tags, and that the tags are correctly
saved/restored across swap.
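
As an illustration of the expected flow (a sketch, not part of this
patch), the VMM enables the capability on the VM file descriptor before
creating any VCPUs via the generic KVM_ENABLE_CAP ioctl; the helper
name below is invented.

/* Sketch: enable MTE for a VM before any VCPUs exist. KVM_CAP_ARM_MTE
 * is the capability added by this patch; KVM_ENABLE_CAP is the
 * standard KVM mechanism for per-VM capabilities. */
#include <sys/ioctl.h>
#include <linux/kvm.h>

static int enable_vm_mte(int vm_fd)
{
	struct kvm_enable_cap cap = { .cap = KVM_CAP_ARM_MTE };

	/* Fails with EINVAL if MTE is unsupported or VCPUs already exist. */
	return ioctl(vm_fd, KVM_ENABLE_CAP, &cap);
}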

Signed-off-by: Steven Price 
---
 arch/arm64/include/asm/kvm_emulate.h | 3 +++
 arch/arm64/include/asm/kvm_host.h| 4 
 arch/arm64/kvm/arm.c | 9 +
 arch/arm64/kvm/mmu.c | 6 ++
 arch/arm64/kvm/sys_regs.c| 6 +-
 include/uapi/linux/kvm.h | 1 +
 6 files changed, 28 insertions(+), 1 deletion(-)

diff --git a/arch/arm64/include/asm/kvm_emulate.h 
b/arch/arm64/include/asm/kvm_emulate.h
index 5ef2669ccd6c..7791ef044b7f 100644
--- a/arch/arm64/include/asm/kvm_emulate.h
+++ b/arch/arm64/include/asm/kvm_emulate.h
@@ -79,6 +79,9 @@ static inline void vcpu_reset_hcr(struct kvm_vcpu *vcpu)
if (cpus_have_const_cap(ARM64_MISMATCHED_CACHE_TYPE) ||
vcpu_el1_is_32bit(vcpu))
vcpu->arch.hcr_el2 |= HCR_TID2;
+
+   if (kvm_has_mte(vcpu->kvm))
+   vcpu->arch.hcr_el2 |= HCR_ATA;
 }
 
 static inline unsigned long *vcpu_hcr(struct kvm_vcpu *vcpu)
diff --git a/arch/arm64/include/asm/kvm_host.h 
b/arch/arm64/include/asm/kvm_host.h
index d3e136343468..aeff10bc5b31 100644
--- a/arch/arm64/include/asm/kvm_host.h
+++ b/arch/arm64/include/asm/kvm_host.h
@@ -120,6 +120,8 @@ struct kvm_arch {
unsigned int pmuver;
 
u8 pfr0_csv2;
+   /* Memory Tagging Extension enabled for the guest */
+   bool mte_enabled;
 };
 
 struct kvm_vcpu_fault_info {
@@ -658,4 +660,6 @@ bool kvm_arm_vcpu_is_finalized(struct kvm_vcpu *vcpu);
 #define kvm_arm_vcpu_sve_finalized(vcpu) \
((vcpu)->arch.flags & KVM_ARM64_VCPU_SVE_FINALIZED)
 
+#define kvm_has_mte(kvm) (system_supports_mte() && (kvm)->arch.mte_enabled)
+
 #endif /* __ARM64_KVM_HOST_H__ */
diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c
index c0ffb019ca8b..da4aeba1855c 100644
--- a/arch/arm64/kvm/arm.c
+++ b/arch/arm64/kvm/arm.c
@@ -89,6 +89,12 @@ int kvm_vm_ioctl_enable_cap(struct kvm *kvm,
r = 0;
kvm->arch.return_nisv_io_abort_to_user = true;
break;
+   case KVM_CAP_ARM_MTE:
+   if (!system_supports_mte() || kvm->created_vcpus)
+   return -EINVAL;
+   r = 0;
+   kvm->arch.mte_enabled = true;
+   break;
default:
r = -EINVAL;
break;
@@ -226,6 +232,9 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
 */
r = 1;
break;
+   case KVM_CAP_ARM_MTE:
+   r = system_supports_mte();
+   break;
case KVM_CAP_STEAL_TIME:
r = kvm_arm_pvtime_supported();
break;
diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c
index 1a01da9fdc99..f804d2109b8c 100644
--- a/arch/arm64/kvm/mmu.c
+++ b/arch/arm64/kvm/mmu.c
@@ -815,6 +815,12 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, 
phys_addr_t fault_ipa,
if (vma_pagesize == PMD_SIZE || vma_pagesize == PUD_SIZE)
fault_ipa &= ~(vma_pagesize - 1);
 
+   /* VMA regions must be mapped with PROT_MTE when VM has MTE enabled */
+   if (kvm_has_mte(kvm) && !(vma->vm_flags & VM_MTE)) {
+   pr_err_ratelimited("Page not mapped VM_MTE in MTE guest\n");
+   return -EFAULT;
+   }
+
gfn = fault_ipa >> PAGE_SHIFT;
mmap_read_unlock(current->mm);
 
diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c
index 4792d5249f07..469b0ef3eb07 100644
--- a/arch/arm64/kvm/sys_regs.c
+++ b/arch/arm64/kvm/sys_regs.c
@@ -1123,7 +1123,8 @@ static u64 read_id_reg(const struct kvm_vcpu *vcpu,
val &= ~(0xfUL << ID_AA64PFR0_CSV2_SHIFT);
val |= ((u64)vcpu->kvm->arch.pfr0_csv2 << 
ID_AA64PFR0_CSV2_SHIFT);
} else if (id == SYS_ID_AA64PFR1_EL1) {
-   val &= ~(0xfUL << ID_AA64PFR1_MTE_SHIFT);
+   if (!kvm_has_mte(vcpu->kvm))
+   val &= ~(0xfUL << ID_AA64PFR1_MTE_SHIFT);
} else if (id == SYS_ID_AA64ISAR1_EL1 && !vcpu_has_ptrauth(vcpu)) {
val &= ~((0xfUL << ID_AA64ISAR1_APA_SHIFT) |
 (0xfUL << ID_AA64ISAR1_API_SHIFT) |
@@ -1369,6 +1370,9 @@ static bool access_ccsidr(struct kvm_vcpu *vcpu, struct 
sys_reg_params *p,
 static unsigned int mte_visibility(const struct kvm_vcpu *vcpu,
   const struct sys_reg_desc *rd)
 {
+   if (kvm_has_mte(vcpu->kvm))
+   return 0;
+
return REG_HIDDEN;
 }
 
diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
index ca41220b40b8..3e6fb5b580a9 100644
--- a/include/uapi/linux/kvm.h
+++ b/include/uapi/linux/kvm.h
@@ -1053,6 +1053,7 @@ struct 

[PATCH v5 1/2] arm64: kvm: Save/restore MTE registers

2020-11-19 Thread Steven Price
Define the new system registers that MTE introduces and context switch
them. The MTE feature is still hidden from the ID register as it isn't
supported in a VM yet.

Signed-off-by: Steven Price 
---
 arch/arm64/include/asm/kvm_host.h  |  4 
 arch/arm64/include/asm/sysreg.h|  3 ++-
 arch/arm64/kvm/hyp/include/hyp/sysreg-sr.h | 14 ++
 arch/arm64/kvm/sys_regs.c  | 14 ++
 4 files changed, 30 insertions(+), 5 deletions(-)

diff --git a/arch/arm64/include/asm/kvm_host.h 
b/arch/arm64/include/asm/kvm_host.h
index 0cd9f0f75c13..d3e136343468 100644
--- a/arch/arm64/include/asm/kvm_host.h
+++ b/arch/arm64/include/asm/kvm_host.h
@@ -136,6 +136,8 @@ enum vcpu_sysreg {
SCTLR_EL1,  /* System Control Register */
ACTLR_EL1,  /* Auxiliary Control Register */
CPACR_EL1,  /* Coprocessor Access Control */
+   RGSR_EL1,   /* Random Allocation Tag Seed Register */
+   GCR_EL1,/* Tag Control Register */
ZCR_EL1,/* SVE Control */
TTBR0_EL1,  /* Translation Table Base Register 0 */
TTBR1_EL1,  /* Translation Table Base Register 1 */
@@ -152,6 +154,8 @@ enum vcpu_sysreg {
TPIDR_EL1,  /* Thread ID, Privileged */
AMAIR_EL1,  /* Aux Memory Attribute Indirection Register */
CNTKCTL_EL1,/* Timer Control Register (EL1) */
+   TFSRE0_EL1, /* Tag Fault Status Register (EL0) */
+   TFSR_EL1,   /* Tag Fault Status Register (EL1) */
PAR_EL1,/* Physical Address Register */
MDSCR_EL1,  /* Monitor Debug System Control Register */
MDCCINT_EL1,/* Monitor Debug Comms Channel Interrupt Enable Reg */
diff --git a/arch/arm64/include/asm/sysreg.h b/arch/arm64/include/asm/sysreg.h
index e2ef4c2edf06..b6668ffa04d9 100644
--- a/arch/arm64/include/asm/sysreg.h
+++ b/arch/arm64/include/asm/sysreg.h
@@ -569,7 +569,8 @@
 #define SCTLR_ELx_M(BIT(0))
 
 #define SCTLR_ELx_FLAGS(SCTLR_ELx_M  | SCTLR_ELx_A | SCTLR_ELx_C | \
-SCTLR_ELx_SA | SCTLR_ELx_I | SCTLR_ELx_IESB)
+SCTLR_ELx_SA | SCTLR_ELx_I | SCTLR_ELx_IESB | \
+SCTLR_ELx_ITFSB)
 
 /* SCTLR_EL2 specific flags. */
 #define SCTLR_EL2_RES1 ((BIT(4))  | (BIT(5))  | (BIT(11)) | (BIT(16)) | \
diff --git a/arch/arm64/kvm/hyp/include/hyp/sysreg-sr.h 
b/arch/arm64/kvm/hyp/include/hyp/sysreg-sr.h
index cce43bfe158f..45255ba60152 100644
--- a/arch/arm64/kvm/hyp/include/hyp/sysreg-sr.h
+++ b/arch/arm64/kvm/hyp/include/hyp/sysreg-sr.h
@@ -18,6 +18,11 @@
 static inline void __sysreg_save_common_state(struct kvm_cpu_context *ctxt)
 {
ctxt_sys_reg(ctxt, MDSCR_EL1)   = read_sysreg(mdscr_el1);
+   if (system_supports_mte()) {
+   ctxt_sys_reg(ctxt, RGSR_EL1)= read_sysreg_s(SYS_RGSR_EL1);
+   ctxt_sys_reg(ctxt, GCR_EL1) = read_sysreg_s(SYS_GCR_EL1);
+   ctxt_sys_reg(ctxt, TFSRE0_EL1)  = read_sysreg_s(SYS_TFSRE0_EL1);
+   }
 }
 
 static inline void __sysreg_save_user_state(struct kvm_cpu_context *ctxt)
@@ -45,6 +50,8 @@ static inline void __sysreg_save_el1_state(struct 
kvm_cpu_context *ctxt)
ctxt_sys_reg(ctxt, CNTKCTL_EL1) = read_sysreg_el1(SYS_CNTKCTL);
ctxt_sys_reg(ctxt, PAR_EL1) = read_sysreg_par();
ctxt_sys_reg(ctxt, TPIDR_EL1)   = read_sysreg(tpidr_el1);
+   if (system_supports_mte())
+   ctxt_sys_reg(ctxt, TFSR_EL1) = read_sysreg_el1(SYS_TFSR);
 
ctxt_sys_reg(ctxt, SP_EL1)  = read_sysreg(sp_el1);
ctxt_sys_reg(ctxt, ELR_EL1) = read_sysreg_el1(SYS_ELR);
@@ -63,6 +70,11 @@ static inline void __sysreg_save_el2_return_state(struct 
kvm_cpu_context *ctxt)
 static inline void __sysreg_restore_common_state(struct kvm_cpu_context *ctxt)
 {
write_sysreg(ctxt_sys_reg(ctxt, MDSCR_EL1),  mdscr_el1);
+   if (system_supports_mte()) {
+   write_sysreg_s(ctxt_sys_reg(ctxt, RGSR_EL1), SYS_RGSR_EL1);
+   write_sysreg_s(ctxt_sys_reg(ctxt, GCR_EL1), SYS_GCR_EL1);
+   write_sysreg_s(ctxt_sys_reg(ctxt, TFSRE0_EL1), SYS_TFSRE0_EL1);
+   }
 }
 
 static inline void __sysreg_restore_user_state(struct kvm_cpu_context *ctxt)
@@ -106,6 +118,8 @@ static inline void __sysreg_restore_el1_state(struct 
kvm_cpu_context *ctxt)
write_sysreg_el1(ctxt_sys_reg(ctxt, CNTKCTL_EL1), SYS_CNTKCTL);
write_sysreg(ctxt_sys_reg(ctxt, PAR_EL1),   par_el1);
write_sysreg(ctxt_sys_reg(ctxt, TPIDR_EL1), tpidr_el1);
+   if (system_supports_mte())
+   write_sysreg_el1(ctxt_sys_reg(ctxt, TFSR_EL1), SYS_TFSR);
 
if (!has_vhe() &&
cpus_have_final_cap(ARM64_WORKAROUND_SPECULATIVE_AT) &&
diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c
index c1fac9836af1..4792d5249f07 100644
--- a/arch/arm64/kvm/sys_regs.c
+++ b/arch/arm64/kvm/sys_regs.c
@@ -1366,6 +1366,12 @@ static bool 

[PATCH v5 0/2] MTE support for KVM guest

2020-11-19 Thread Steven Price
This series adds support for Arm's Memory Tagging Extension (MTE) to
KVM, allowing KVM guests to make use of it. This builds on the existing
user space support already in v5.10-rc1, see [1] for an overview.

[1] https://lwn.net/Articles/834289/

Changes since v4[2]:

 * Rebased on v5.10-rc4.

 * Require the VMM to map all guest memory PROT_MTE if MTE is enabled
   for the guest.

 * Add a kvm_has_mte() accessor.

[2] http://lkml.kernel.org/r/20201026155727.36685-1-steven.price%40arm.com

The change to require the VMM to map all guest memory PROT_MTE is
significant as it means that the VMM has to deal with the MTE tags even
if it doesn't care about them (e.g. for virtual devices or if the VMM
doesn't support migration). Also unfortunately because the VMM can
change the memory layout at any time the check for PROT_MTE/VM_MTE has
to be done very late (at the point of faulting pages into stage 2).

The alternative would be to modify the set_pte_at() handler to always
check if there is MTE data relating to a swap page even if the PTE
doesn't have the MTE bit set. I haven't initially done this because of
ordering issues during early boot, but could investigate further if the
above VMM requirement is too strict.

Steven Price (2):
  arm64: kvm: Save/restore MTE registers
  arm64: kvm: Introduce MTE VCPU feature

 arch/arm64/include/asm/kvm_emulate.h   |  3 +++
 arch/arm64/include/asm/kvm_host.h  |  8 
 arch/arm64/include/asm/sysreg.h|  3 ++-
 arch/arm64/kvm/arm.c   |  9 +
 arch/arm64/kvm/hyp/include/hyp/sysreg-sr.h | 14 ++
 arch/arm64/kvm/mmu.c   |  6 ++
 arch/arm64/kvm/sys_regs.c  | 20 +++-
 include/uapi/linux/kvm.h   |  1 +
 8 files changed, 58 insertions(+), 6 deletions(-)

-- 
2.20.1



Re: [PATCH v2 3/5] KVM: arm64: Patch kimage_voffset instead of loading the EL1 value

2020-11-19 Thread David Brazdil
Hey Marc,

Just noticed in kvmarm/queue that the whitespacing in this patch is off.

> +.macro kimg_pa reg, tmp
> +alternative_cb kvm_get_kimage_voffset
> +   movz\tmp, #0
> +   movk\tmp, #0, lsl #16
> +   movk\tmp, #0, lsl #32
> +   movk\tmp, #0, lsl #48
> +alternative_cb_end
> +
> +   /* reg = __pa(reg) */
> +   sub \reg, \reg, \tmp
> +.endm
This uses spaces instead of tabs.

> +  
>  #else
This added empty line actually has a tab in it.

-David


Re: [PATCH v3 4/5] arm64: Add support for SMCCC TRNG entropy source

2020-11-19 Thread Ard Biesheuvel
On Fri, 13 Nov 2020 at 19:24, Andre Przywara  wrote:
>
> The ARM architected TRNG firmware interface, described in ARM spec
> DEN0098, defines an ARM SMCCC based interface to a true random number
> generator, provided by firmware.
> This can be discovered via the SMCCC >=v1.1 interface, and provides
> up to 192 bits of entropy per call.
>
> Hook this SMC call into arm64's arch_get_random_*() implementation,
> coming to the rescue when the CPU does not implement the ARM v8.5 RNG
> system registers.
>
> For the detection, we piggy back on the PSCI/SMCCC discovery (which gives
> us the conduit to use (hvc/smc)), then try to call the
> ARM_SMCCC_TRNG_VERSION function, which returns -1 if this interface is
> not implemented.
>
> Signed-off-by: Andre Przywara 
> ---
>  arch/arm64/include/asm/archrandom.h | 69 -
>  1 file changed, 58 insertions(+), 11 deletions(-)
>
> diff --git a/arch/arm64/include/asm/archrandom.h 
> b/arch/arm64/include/asm/archrandom.h
> index abe07c21da8e..fe34bfd30caa 100644
> --- a/arch/arm64/include/asm/archrandom.h
> +++ b/arch/arm64/include/asm/archrandom.h
> @@ -4,13 +4,24 @@
>
>  #ifdef CONFIG_ARCH_RANDOM
>
> +#include 
>  #include 
>  #include 
>  #include 
>
> +#define ARM_SMCCC_TRNG_MIN_VERSION 0x1UL
> +
> +extern bool smccc_trng_available;
> +
>  static inline bool __init smccc_probe_trng(void)
>  {
> -   return false;
> +   struct arm_smccc_res res;
> +
> +   arm_smccc_1_1_invoke(ARM_SMCCC_TRNG_VERSION, &res);
> +   if ((s32)res.a0 < 0)
> +   return false;
> +
> +   return res.a0 >= ARM_SMCCC_TRNG_MIN_VERSION;
>  }
>
>  static inline bool __arm64_rndr(unsigned long *v)
> @@ -43,26 +54,52 @@ static inline bool __must_check 
> arch_get_random_int(unsigned int *v)
>
>  static inline bool __must_check arch_get_random_seed_long(unsigned long *v)
>  {
> +   struct arm_smccc_res res;
> +
> /*
>  * Only support the generic interface after we have detected
>  * the system wide capability, avoiding complexity with the
>  * cpufeature code and with potential scheduling between CPUs
>  * with and without the feature.
>  */
> -   if (!cpus_have_const_cap(ARM64_HAS_RNG))
> -   return false;
> +   if (cpus_have_const_cap(ARM64_HAS_RNG))
> +   return __arm64_rndr(v);
>
> -   return __arm64_rndr(v);
> -}
> +   if (smccc_trng_available) {
> +   arm_smccc_1_1_invoke(ARM_SMCCC_TRNG_RND64, 64, &res);
> +   if ((int)res.a0 < 0)
> +   return false;
>
> +   *v = res.a3;
> +   return true;
> +   }
> +
> +   return false;
> +}
>

I think we should be more rigorous here in how we map the concepts of
random seeds and random numbers onto the various sources.

First of all, assuming my patch dropping the call to
arch_get_random_seed_long() from add_interrupt_randomness() gets
accepted, we should switch to RNDRRS here, and implement the non-seed
variants using RNDR.

However, this is still semantically inaccurate: RNDRRS does not return
a random *seed*, it returns a number drawn from a freshly seeded
pseudo-random sequence. This means that the TRNG interface, if
implemented, is a better choice, and so we should try it first. Note
that on platforms that don't implement both, only one of these will be
available in the first place. But on platforms that *do* implement
both, the firmware interface may actually be less wasteful in terms of
resources: the TRNG interface returns every bit drawn from the
underlying entropy source, whereas RNDRRS uses ~500 bits of entropy to
reseed a DRBG that gets used only once to draw a single 64-bit number.
And the cost of the SMCCC call in terms of CPU time is charged to the
caller, which is appropriate here.

Then, if the SMCCC invocation fails, I don't think we should ever return
false without at least trying RNDRRS when it is available.

Something like this perhaps?

if (smccc_trng_available) {
	arm_smccc_1_1_invoke(ARM_SMCCC_TRNG_RND64, 64, &res);
	if ((int)res.a0 >= 0) {
		*v = res.a3;
		return true;
	}
}

if (cpus_have_const_cap(ARM64_HAS_RNG))
	return __arm64_rndrrs(v);

return false;

(and something similar 2x below)


>  static inline bool __must_check arch_get_random_seed_int(unsigned int *v)
>  {
> +   struct arm_smccc_res res;
> unsigned long val;
> -   bool ok = arch_get_random_seed_long(&val);
>
> -   *v = val;
> -   return ok;
> +   if (cpus_have_const_cap(ARM64_HAS_RNG)) {
> +   if (arch_get_random_seed_long(&val)) {
> +   *v = val;
> +   return true;
> +   }
> +   return false;
> +   }
> +
> +   if (smccc_trng_available) {
> +   arm_smccc_1_1_invoke(ARM_SMCCC_TRNG_RND64, 32, &res);
> +   if ((int)res.a0 < 0)
> +   return false;
> +
> +   *v = res.a3 & GENMASK(31, 0);
> +   return 

Re: [PATCH v4 1/2] arm64: kvm: Save/restore MTE registers

2020-11-19 Thread Steven Price

On 18/11/2020 17:02, Catalin Marinas wrote:

On Wed, Nov 18, 2020 at 04:01:18PM +, Steven Price wrote:

On 17/11/2020 19:20, Marc Zyngier wrote:

On 2020-10-26 15:57, Steven Price wrote:

diff --git a/arch/arm64/include/asm/sysreg.h
b/arch/arm64/include/asm/sysreg.h
index d52c1b3ce589..7727df0bc09d 100644
--- a/arch/arm64/include/asm/sysreg.h
+++ b/arch/arm64/include/asm/sysreg.h
@@ -565,7 +565,8 @@
 #define SCTLR_ELx_M	(BIT(0))

 #define SCTLR_ELx_FLAGS	(SCTLR_ELx_M  | SCTLR_ELx_A | SCTLR_ELx_C | \
-			 SCTLR_ELx_SA | SCTLR_ELx_I | SCTLR_ELx_IESB)
+			 SCTLR_ELx_SA | SCTLR_ELx_I | SCTLR_ELx_IESB | \
+			 SCTLR_ELx_ITFSB)

 /* SCTLR_EL2 specific flags. */
 #define SCTLR_EL2_RES1	((BIT(4))  | (BIT(5))  | (BIT(11)) | (BIT(16)) | \
diff --git a/arch/arm64/kvm/hyp/include/hyp/sysreg-sr.h
b/arch/arm64/kvm/hyp/include/hyp/sysreg-sr.h
index 7a986030145f..a124ffa49ba3 100644
--- a/arch/arm64/kvm/hyp/include/hyp/sysreg-sr.h
+++ b/arch/arm64/kvm/hyp/include/hyp/sysreg-sr.h
@@ -18,6 +18,11 @@
 static inline void __sysreg_save_common_state(struct kvm_cpu_context *ctxt)
 {
	ctxt_sys_reg(ctxt, MDSCR_EL1)	= read_sysreg(mdscr_el1);
+	if (system_supports_mte()) {
+		ctxt_sys_reg(ctxt, RGSR_EL1)	= read_sysreg_s(SYS_RGSR_EL1);
+		ctxt_sys_reg(ctxt, GCR_EL1)	= read_sysreg_s(SYS_GCR_EL1);
+		ctxt_sys_reg(ctxt, TFSRE0_EL1)	= read_sysreg_s(SYS_TFSRE0_EL1);


As far as I can tell, HCR_EL2.ATA is still clear when running a guest.
So why, do we save/restore this state yet?

Also, I wonder whether we should keep these in the C code. If one day
we enable MTE in the kernel, we will have to move them to the assembly
part, much like we do for PAuth. And I fear that "one day" is pretty
soon:

https://lore.kernel.org/linux-arm-kernel/cover.1605046192.git.andreyk...@google.com/


Good point. Although for MTE we do have the option of setting TCO in PSTATE
so this could remain in C if we're not bothered about the 'gap' in KASAN
coverage. I haven't yet got my head around how (or indeed if) that series
handles guests.


I think we should be fine with the currently proposed in-kernel MTE
support. However, setting GCR_EL1 can get in the way if stack tagging is
ever enabled (it breaks single image). The compiler uses GCR_EL1 to
generate different colours for variables on the stack and changing it in
the middle of a function may cause confusion. You'd have to set
PSTATE.TCO for the whole function, either from the caller or, if the
compiler gets smarter, some function attribute.



If the compiler might start playing with TCO then this could also be an 
issue for VMMs which will (at least with the current design) need to use 
TCO to safely access guest memory. Especially if we enforce PROT_MTE 
mappings for the VMM.
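
For what it's worth, the kind of TCO toggling being talked about here
could look roughly like the sketch below. It is not from this series,
the helper name is invented, and it assumes an assembler that accepts
the armv8.5-a+memtag extension.

/* Sketch only: temporarily suppress tag-check faults (PSTATE.TCO)
 * around a VMM access to guest memory whose tags it does not control. */
static inline void set_pstate_tco(int enable)
{
	if (enable)
		asm volatile(".arch armv8.5-a+memtag\n\tmsr tco, #1" ::: "memory");
	else
		asm volatile(".arch armv8.5-a+memtag\n\tmsr tco, #0" ::: "memory");
}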


Steve


Re: [PATCH v4 2/2] arm64: kvm: Introduce MTE VCPU feature

2020-11-19 Thread Steven Price

On 18/11/2020 17:05, Andrew Jones wrote:

On Wed, Nov 18, 2020 at 04:50:01PM +, Catalin Marinas wrote:

On Wed, Nov 18, 2020 at 04:01:20PM +, Steven Price wrote:

On 17/11/2020 16:07, Catalin Marinas wrote:

On Mon, Oct 26, 2020 at 03:57:27PM +, Steven Price wrote:

diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c
index 19aacc7d64de..38fe25310ca1 100644
--- a/arch/arm64/kvm/mmu.c
+++ b/arch/arm64/kvm/mmu.c
@@ -862,6 +862,26 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, 
phys_addr_t fault_ipa,
if (vma_pagesize == PAGE_SIZE && !force_pte)
vma_pagesize = transparent_hugepage_adjust(memslot, hva,
   &pfn, &fault_ipa);
+
+   /*
+* The otherwise redundant test for system_supports_mte() allows the
+* code to be compiled out when CONFIG_ARM64_MTE is not present.
+*/
+   if (system_supports_mte() && kvm->arch.mte_enabled && pfn_valid(pfn)) {
+   /*
+* VM will be able to see the page's tags, so we must ensure
+* they have been initialised.
+*/
+   struct page *page = pfn_to_page(pfn);
+   long i, nr_pages = compound_nr(page);
+
+   /* if PG_mte_tagged is set, tags have already been initialised 
*/
+   for (i = 0; i < nr_pages; i++, page++) {
+   if (!test_and_set_bit(PG_mte_tagged, &page->flags))
+   mte_clear_page_tags(page_address(page));
+   }
+   }


If this page was swapped out and mapped back in, where does the
restoring from swap happen?


Restoring from swap happens above this in the call to gfn_to_pfn_prot()


Looking at the call chain, gfn_to_pfn_prot() ends up with
get_user_pages() using the current->mm (the VMM) and that does a
set_pte_at(), presumably restoring the tags. Does this mean that all
memory mapped by the VMM in user space should have PROT_MTE set?
Otherwise we don't take the mte_sync_tags() path in set_pte_at() and no
tags restored from swap (we do save them since when they were mapped,
PG_mte_tagged was set).

So I think the code above should be similar to mte_sync_tags(), even
calling a common function, but I'm not sure where to get the swap pte
from.


You're right - the code is broken as it stands. I've just been able to 
reproduce the loss of tags due to swap.


The problem is that we also don't have a suitable pte to do the restore 
from swap from. So either set_pte_at() would have to unconditionally 
check for MTE tags for all previous swap entries as you suggest below. I 
had a quick go at testing this and hit issues with the idle task getting 
killed during boot - I fear there are some fun issues regarding 
initialisation order here.


Or we enforce PROT_MTE...


An alternative is to only enable HCR_EL2.ATA and MTE in guest if the vmm
mapped the memory with PROT_MTE.


This is a very reasonable alternative. The VMM must be aware of whether
the guest may use MTE anyway. Asking it to map the memory with PROT_MTE
when it wants to offer the guest that option is a reasonable requirement.
If the memory is not mapped as such, then the host kernel shouldn't assume
MTE may be used by the guest, and it should even enforce that it is not
(by not enabling the feature).


The main issue with this is that the VMM can change the mappings while 
the guest is running, so the only place we can reliably check this is 
during user_mem_abort(). So we can't just downgrade HCR_EL2.ATA. This 
makes the error reporting not so great as the memory access is simply 
faulted. However I do have this working and it's actually (slightly) 
less code.


Another drawback is that the VMM needs to be more careful with the tags 
- e.g. for virtualised devices the VMM can't simply have a non-PROT_MTE 
mapping and ignore what the guest is doing with tags.


Steve