[PATCH qemu v4 1/2] target/arm: Handle IC IVAU to improve compatibility with JITs

2023-06-26 Thread ~jhogberg
From: John Högberg 

Unlike architectures with precise self-modifying code semantics
(e.g. x86) ARM processors do not maintain coherency for instruction
execution and memory, requiring an instruction synchronization
barrier on every core that will execute the new code, and on many
models also the explicit use of cache management instructions.

While this is required to make JITs work on actual hardware, QEMU
has gotten away with not handling this since it does not emulate
caches, and unconditionally invalidates code whenever the softmmu
or the user-mode page protection logic detects that code has been
modified.

Unfortunately the latter does not work in the face of dual-mapped
code (a common W^X workaround), where one page is executable and
the other is writable: user-mode has no way to connect one with the
other as that is only known to the kernel and the emulated
application.

This commit works around the issue by telling software that
instruction cache invalidation is required by clearing the
CPR_EL0.DIC flag (regardless of whether the emulated processor
needs it), and then invalidating code in IC IVAU instructions.

Resolves: https://gitlab.com/qemu-project/qemu/-/issues/1034

Co-authored-by: Richard Henderson 
Signed-off-by: John Högberg 
---
 target/arm/cpu.c| 13 +
 target/arm/helper.c | 47 ++---
 2 files changed, 57 insertions(+), 3 deletions(-)

diff --git a/target/arm/cpu.c b/target/arm/cpu.c
index 4d5bb57f07..b82fb46157 100644
--- a/target/arm/cpu.c
+++ b/target/arm/cpu.c
@@ -1674,6 +1674,19 @@ static void arm_cpu_realizefn(DeviceState *dev, Error 
**errp)
 return;
 }
 
+/*
+ * User mode relies on IC IVAU instructions to catch modification of
+ * dual-mapped code.
+ *
+ * Clear CTR_EL0.DIC to ensure that software that honors these flags uses
+ * IC IVAU even if the emulated processor does not normally require it.
+ */
+#ifdef CONFIG_USER_ONLY
+if (arm_feature(env, ARM_FEATURE_AARCH64)) {
+cpu->ctr = FIELD_DP64(cpu->ctr, CTR_EL0, DIC, 0);
+}
+#endif
+
 if (!cpu->has_vfp) {
 uint64_t t;
 uint32_t u;
diff --git a/target/arm/helper.c b/target/arm/helper.c
index d4bee43bd0..235e3cd0b6 100644
--- a/target/arm/helper.c
+++ b/target/arm/helper.c
@@ -5228,6 +5228,36 @@ static void mdcr_el2_write(CPUARMState *env, const 
ARMCPRegInfo *ri,
 }
 }
 
+#ifdef CONFIG_USER_ONLY
+/*
+ * `IC IVAU` is handled to improve compatibility with JITs that dual-map their
+ * code to get around W^X restrictions, where one region is writable and the
+ * other is executable.
+ *
+ * Since the executable region is never written to we cannot detect code
+ * changes when running in user mode, and rely on the emulated JIT telling us
+ * that the code has changed by executing this instruction.
+ */
+static void ic_ivau_write(CPUARMState *env, const ARMCPRegInfo *ri,
+  uint64_t value)
+{
+uint64_t icache_line_mask, start_address, end_address;
+const ARMCPU *cpu;
+
+cpu = env_archcpu(env);
+
+icache_line_mask = (4 << extract32(cpu->ctr, 0, 4)) - 1;
+start_address = value & ~icache_line_mask;
+end_address = value | icache_line_mask;
+
+mmap_lock();
+
+tb_invalidate_phys_range(start_address, end_address);
+
+mmap_unlock();
+}
+#endif
+
 static const ARMCPRegInfo v8_cp_reginfo[] = {
 /*
  * Minimal set of EL0-visible registers. This will need to be expanded
@@ -5267,7 +5297,10 @@ static const ARMCPRegInfo v8_cp_reginfo[] = {
 { .name = "CURRENTEL", .state = ARM_CP_STATE_AA64,
   .opc0 = 3, .opc1 = 0, .opc2 = 2, .crn = 4, .crm = 2,
   .access = PL1_R, .type = ARM_CP_CURRENTEL },
-/* Cache ops: all NOPs since we don't emulate caches */
+/*
+ * Instruction cache ops. All of these except `IC IVAU` NOP because we
+ * don't emulate caches.
+ */
 { .name = "IC_IALLUIS", .state = ARM_CP_STATE_AA64,
   .opc0 = 1, .opc1 = 0, .crn = 7, .crm = 1, .opc2 = 0,
   .access = PL1_W, .type = ARM_CP_NOP,
@@ -5280,9 +5313,17 @@ static const ARMCPRegInfo v8_cp_reginfo[] = {
   .accessfn = access_tocu },
 { .name = "IC_IVAU", .state = ARM_CP_STATE_AA64,
   .opc0 = 1, .opc1 = 3, .crn = 7, .crm = 5, .opc2 = 1,
-  .access = PL0_W, .type = ARM_CP_NOP,
+  .access = PL0_W,
   .fgt = FGT_ICIVAU,
-  .accessfn = access_tocu },
+  .accessfn = access_tocu,
+#ifdef CONFIG_USER_ONLY
+  .type = ARM_CP_NO_RAW,
+  .writefn = ic_ivau_write
+#else
+  .type = ARM_CP_NOP
+#endif
+},
+/* Cache ops: all NOPs since we don't emulate caches */
 { .name = "DC_IVAC", .state = ARM_CP_STATE_AA64,
   .opc0 = 1, .opc1 = 0, .crn = 7, .crm = 6, .opc2 = 1,
   .access = PL1_W, .accessfn = aa64_cacheop_poc_access,
-- 
2.38.5




[PATCH qemu v4 0/2] target/arm: Improve user-mode compatibility with JITs

2023-06-26 Thread ~jhogberg
Changes since v3:

1) Reworded the first commit comment to note that the need to clear
   cache is implementation-dependent.
2) CTR_EL0.DIC is now cleared in user mode to indicate that IC IVAU
   must be used.
3) The test case now only uses DC CVAU / IC IVAU when this is
   required, as indicated by CTR_EL0.{DIC,IDC}. There have been no
   changes outside of the function `mark_code_modified`



When running in user-mode QEMU currently fails to emulate JITs that
use dual-mapped code to get around W^X restrictions, where one mapping
is writable and one is executable. As it has no way of knowing that a
write to the writable region is reflected in the executable one, it
fails to invalidate previously translated code which leads to a crash
at best.

(Note that system mode is unaffected as the softmmu is fully aware of
what is going on.)

This patch series catches changes to dual-mapped code by honoring the
cache management instructions required to make things work on actual
hardware.

See https://gitlab.com/qemu-project/qemu/-/issues/1034 for more
background information

John Högberg (2):
  target/arm: Handle IC IVAU to improve compatibility with JITs
  tests/tcg/aarch64: Add testcases for IC IVAU and dual-mapped code

 target/arm/cpu.c  |  13 ++
 target/arm/helper.c   |  47 +++-
 tests/tcg/aarch64/Makefile.target |   3 +-
 tests/tcg/aarch64/icivau.c| 189 ++
 4 files changed, 248 insertions(+), 4 deletions(-)
 create mode 100644 tests/tcg/aarch64/icivau.c

-- 
2.38.5



[PATCH qemu v4 2/2] tests/tcg/aarch64: Add testcases for IC IVAU and dual-mapped code

2023-06-26 Thread ~jhogberg
From: John Högberg 

https://gitlab.com/qemu-project/qemu/-/issues/1034

Signed-off-by: John Högberg 
---
 tests/tcg/aarch64/Makefile.target |   3 +-
 tests/tcg/aarch64/icivau.c| 189 ++
 2 files changed, 191 insertions(+), 1 deletion(-)
 create mode 100644 tests/tcg/aarch64/icivau.c

diff --git a/tests/tcg/aarch64/Makefile.target 
b/tests/tcg/aarch64/Makefile.target
index 3430fd3cd8..de6566d0d4 100644
--- a/tests/tcg/aarch64/Makefile.target
+++ b/tests/tcg/aarch64/Makefile.target
@@ -9,9 +9,10 @@ AARCH64_SRC=$(SRC_PATH)/tests/tcg/aarch64
 VPATH  += $(AARCH64_SRC)
 
 # Base architecture tests
-AARCH64_TESTS=fcvt pcalign-a64
+AARCH64_TESTS=fcvt pcalign-a64 icivau
 
 fcvt: LDFLAGS+=-lm
+icivau: LDFLAGS+=-lrt
 
 run-fcvt: fcvt
$(call run-test,$<,$(QEMU) $<, "$< on $(TARGET_NAME)")
diff --git a/tests/tcg/aarch64/icivau.c b/tests/tcg/aarch64/icivau.c
new file mode 100644
index 00..e3e8569912
--- /dev/null
+++ b/tests/tcg/aarch64/icivau.c
@@ -0,0 +1,189 @@
+/*
+ * Tests the IC IVAU-driven workaround for catching changes made to dual-mapped
+ * code that would otherwise go unnoticed in user mode.
+ *
+ * Copyright (c) 2023 Ericsson AB
+ * SPDX-License-Identifier: GPL-2.0-or-later
+ */
+
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+#define MAX_CODE_SIZE 128
+
+typedef int (SelfModTest)(uint32_t, uint32_t*);
+typedef int (BasicTest)(int);
+
+static void mark_code_modified(const uint32_t *exec_data, size_t length)
+{
+int dc_required, ic_required;
+unsigned long ctr_el0;
+
+/*
+ * Clear the data/instruction cache, as indicated by the CTR_ELO.{DIC,IDC}
+ * flags.
+ *
+ * For completeness we might be tempted to assert that we should fail when
+ * the whole code update sequence is omitted, but that would make the test
+ * flaky as it can succeed by coincidence on actual hardware.
+ */
+asm ("mrs %0, ctr_el0\n" : "=r"(ctr_el0));
+
+/* CTR_EL0.IDC */
+dc_required = !((ctr_el0 >> 28) & 1);
+
+/* CTR_EL0.DIC */
+ic_required = !((ctr_el0 >> 29) & 1);
+
+if (dc_required) {
+size_t dcache_stride, i;
+
+/*
+ * Step according to the minimum cache size, as the cache maintenance
+ * instructions operate on the cache line of the given address.
+ *
+ * We assume that exec_data is properly aligned.
+ */
+dcache_stride = (4 << ((ctr_el0 >> 16) & 0xF));
+
+for (i = 0; i < length; i += dcache_stride) {
+const char *dc_addr = &((const char *)exec_data)[i];
+asm volatile ("dc cvau, %x[dc_addr]\n"
+  : /* no outputs */
+  : [dc_addr] "r"(dc_addr)
+  : "memory");
+}
+
+asm volatile ("dmb ish\n");
+}
+
+if (ic_required) {
+size_t icache_stride, i;
+
+icache_stride = (4 << (ctr_el0 & 0xF));
+
+for (i = 0; i < length; i += icache_stride) {
+const char *ic_addr = &((const char *)exec_data)[i];
+asm volatile ("ic ivau, %x[ic_addr]\n"
+  : /* no outputs */
+  : [ic_addr] "r"(ic_addr)
+  : "memory");
+}
+
+asm volatile ("dmb ish\n");
+}
+
+asm volatile ("isb sy\n");
+}
+
+static int basic_test(uint32_t *rw_data, const uint32_t *exec_data)
+{
+/*
+ * As user mode only misbehaved for dual-mapped code when previously
+ * translated code had been changed, we'll start off with this basic test
+ * function to ensure that there's already some translated code at
+ * exec_data before the next test. This should cause the next test to fail
+ * if `mark_code_modified` fails to invalidate the code.
+ *
+ * Note that the payload is in binary form instead of inline assembler
+ * because we cannot use __attribute__((naked)) on this platform and the
+ * workarounds are at least as ugly as this is.
+ */
+static const uint32_t basic_payload[] = {
+0xD65F03C0 /* 0x00: RET */
+};
+
+BasicTest *copied_ptr = (BasicTest *)exec_data;
+
+memcpy(rw_data, basic_payload, sizeof(basic_payload));
+mark_code_modified(exec_data, sizeof(basic_payload));
+
+return copied_ptr(1234) == 1234;
+}
+
+static int self_modification_test(uint32_t *rw_data, const uint32_t *exec_data)
+{
+/*
+ * This test is self-modifying in an attempt to cover an edge case where
+ * the IC IVAU instruction invalidates itself.
+ *
+ * Note that the IC IVAU instruction is 16 bytes into the function, in what
+ * will be the same cache line as the modifed instruction on machines with
+ * a cache line size >= 16 bytes.
+ */
+static const uint32_t self_mod_payload[] = {
+/* Overwrite the placeholder instruction with the new one. */
+0xB9001C20, /* 0x00: STR w0, [x1, 0x1C] */
+
+/* Get the 

[PATCH qemu v3 1/2] target/arm: Handle IC IVAU to improve compatibility with JITs

2023-06-19 Thread ~jhogberg
From: John Högberg 

Unlike architectures with precise self-modifying code semantics
(e.g. x86) ARM processors do not maintain coherency for instruction
execution and memory, and require the explicit use of cache
management instructions as well as an instruction barrier to make
code updates visible (the latter on every core that is going to
execute said code).

While this is required to make JITs work on actual hardware, QEMU
has gotten away with not handling this since it does not emulate
caches, and unconditionally invalidates code whenever the softmmu
or the user-mode page protection logic detects that code has been
modified.

Unfortunately the latter does not work in the face of dual-mapped
code (a common W^X workaround), where one page is executable and
the other is writable: user-mode has no way to connect one with the
other as that is only known to the kernel and the emulated
application.

This commit works around the issue by invalidating code in
IC IVAU instructions.

Resolves: https://gitlab.com/qemu-project/qemu/-/issues/1034

Co-authored-by: Richard Henderson 
Signed-off-by: John Högberg 
Reviewed-by: Richard Henderson 
---
 target/arm/helper.c | 47 ++---
 1 file changed, 44 insertions(+), 3 deletions(-)

diff --git a/target/arm/helper.c b/target/arm/helper.c
index d4bee43bd0..235e3cd0b6 100644
--- a/target/arm/helper.c
+++ b/target/arm/helper.c
@@ -5228,6 +5228,36 @@ static void mdcr_el2_write(CPUARMState *env, const 
ARMCPRegInfo *ri,
 }
 }
 
+#ifdef CONFIG_USER_ONLY
+/*
+ * `IC IVAU` is handled to improve compatibility with JITs that dual-map their
+ * code to get around W^X restrictions, where one region is writable and the
+ * other is executable.
+ *
+ * Since the executable region is never written to we cannot detect code
+ * changes when running in user mode, and rely on the emulated JIT telling us
+ * that the code has changed by executing this instruction.
+ */
+static void ic_ivau_write(CPUARMState *env, const ARMCPRegInfo *ri,
+  uint64_t value)
+{
+uint64_t icache_line_mask, start_address, end_address;
+const ARMCPU *cpu;
+
+cpu = env_archcpu(env);
+
+icache_line_mask = (4 << extract32(cpu->ctr, 0, 4)) - 1;
+start_address = value & ~icache_line_mask;
+end_address = value | icache_line_mask;
+
+mmap_lock();
+
+tb_invalidate_phys_range(start_address, end_address);
+
+mmap_unlock();
+}
+#endif
+
 static const ARMCPRegInfo v8_cp_reginfo[] = {
 /*
  * Minimal set of EL0-visible registers. This will need to be expanded
@@ -5267,7 +5297,10 @@ static const ARMCPRegInfo v8_cp_reginfo[] = {
 { .name = "CURRENTEL", .state = ARM_CP_STATE_AA64,
   .opc0 = 3, .opc1 = 0, .opc2 = 2, .crn = 4, .crm = 2,
   .access = PL1_R, .type = ARM_CP_CURRENTEL },
-/* Cache ops: all NOPs since we don't emulate caches */
+/*
+ * Instruction cache ops. All of these except `IC IVAU` NOP because we
+ * don't emulate caches.
+ */
 { .name = "IC_IALLUIS", .state = ARM_CP_STATE_AA64,
   .opc0 = 1, .opc1 = 0, .crn = 7, .crm = 1, .opc2 = 0,
   .access = PL1_W, .type = ARM_CP_NOP,
@@ -5280,9 +5313,17 @@ static const ARMCPRegInfo v8_cp_reginfo[] = {
   .accessfn = access_tocu },
 { .name = "IC_IVAU", .state = ARM_CP_STATE_AA64,
   .opc0 = 1, .opc1 = 3, .crn = 7, .crm = 5, .opc2 = 1,
-  .access = PL0_W, .type = ARM_CP_NOP,
+  .access = PL0_W,
   .fgt = FGT_ICIVAU,
-  .accessfn = access_tocu },
+  .accessfn = access_tocu,
+#ifdef CONFIG_USER_ONLY
+  .type = ARM_CP_NO_RAW,
+  .writefn = ic_ivau_write
+#else
+  .type = ARM_CP_NOP
+#endif
+},
+/* Cache ops: all NOPs since we don't emulate caches */
 { .name = "DC_IVAC", .state = ARM_CP_STATE_AA64,
   .opc0 = 1, .opc1 = 0, .crn = 7, .crm = 6, .opc2 = 1,
   .access = PL1_W, .accessfn = aa64_cacheop_poc_access,
-- 
2.38.5




[PATCH qemu v3 2/2] tests/tcg/aarch64: Add testcases for IC IVAU and dual-mapped code

2023-06-19 Thread ~jhogberg
From: John Högberg 

https://gitlab.com/qemu-project/qemu/-/issues/1034

Signed-off-by: John Högberg 
---
 tests/tcg/aarch64/Makefile.target |   3 +-
 tests/tcg/aarch64/icivau.c| 169 ++
 2 files changed, 171 insertions(+), 1 deletion(-)
 create mode 100644 tests/tcg/aarch64/icivau.c

diff --git a/tests/tcg/aarch64/Makefile.target 
b/tests/tcg/aarch64/Makefile.target
index 3430fd3cd8..de6566d0d4 100644
--- a/tests/tcg/aarch64/Makefile.target
+++ b/tests/tcg/aarch64/Makefile.target
@@ -9,9 +9,10 @@ AARCH64_SRC=$(SRC_PATH)/tests/tcg/aarch64
 VPATH  += $(AARCH64_SRC)
 
 # Base architecture tests
-AARCH64_TESTS=fcvt pcalign-a64
+AARCH64_TESTS=fcvt pcalign-a64 icivau
 
 fcvt: LDFLAGS+=-lm
+icivau: LDFLAGS+=-lrt
 
 run-fcvt: fcvt
$(call run-test,$<,$(QEMU) $<, "$< on $(TARGET_NAME)")
diff --git a/tests/tcg/aarch64/icivau.c b/tests/tcg/aarch64/icivau.c
new file mode 100644
index 00..a01f45f172
--- /dev/null
+++ b/tests/tcg/aarch64/icivau.c
@@ -0,0 +1,169 @@
+/*
+ * Tests the IC IVAU-driven workaround for catching changes made to dual-mapped
+ * code that would otherwise go unnoticed in user mode.
+ *
+ * Copyright (c) 2023 Ericsson AB
+ * SPDX-License-Identifier: GPL-2.0-or-later
+ */
+
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+#define MAX_CODE_SIZE 128
+
+typedef int (SelfModTest)(uint32_t, uint32_t*);
+typedef int (BasicTest)(int);
+
+static void mark_code_modified(const uint32_t *exec_data, size_t length)
+{
+size_t dcache_stride, icache_stride, i;
+unsigned long ctr_el0;
+
+/*
+ * Step according to minimum cache sizes, as the cache maintenance
+ * instructions operate on the cache line of the given address.
+ *
+ * We assume that exec_data is properly aligned.
+ */
+asm ("mrs %0, ctr_el0\n" : "=r"(ctr_el0));
+dcache_stride = (4 << ((ctr_el0 >> 16) & 0xF));
+icache_stride = (4 << (ctr_el0 & 0xF));
+
+/*
+ * For completeness we might be tempted to assert that we should fail when
+ * the whole code update sequence is omitted, but that would make the test
+ * flaky as it can succeed by coincidence on actual hardware.
+ */
+for (i = 0; i < length; i += dcache_stride) {
+const char *dc_addr = &((const char *)exec_data)[i];
+asm volatile ("dc cvau, %x[dc_addr]\n"
+  : /* no outputs */
+  : [dc_addr] "r"(dc_addr)
+  : "memory");
+}
+
+asm volatile ("dmb ish\n");
+
+for (i = 0; i < length; i += icache_stride) {
+const char *ic_addr = &((const char *)exec_data)[i];
+asm volatile ("ic ivau, %x[ic_addr]\n"
+  : /* no outputs */
+  : [ic_addr] "r"(ic_addr)
+  : "memory");
+}
+
+asm volatile ("dmb ish\n"
+  "isb sy\n");
+}
+
+static int basic_test(uint32_t *rw_data, const uint32_t *exec_data)
+{
+/*
+ * As user mode only misbehaved for dual-mapped code when previously
+ * translated code had been changed, we'll start off with this basic test
+ * function to ensure that there's already some translated code at
+ * exec_data before the next test. This should cause the next test to fail
+ * if `mark_code_modified` fails to invalidate the code.
+ *
+ * Note that the payload is in binary form instead of inline assembler
+ * because we cannot use __attribute__((naked)) on this platform and the
+ * workarounds are at least as ugly as this is.
+ */
+static const uint32_t basic_payload[] = {
+0xD65F03C0 /* 0x00: RET */
+};
+
+BasicTest *copied_ptr = (BasicTest *)exec_data;
+
+memcpy(rw_data, basic_payload, sizeof(basic_payload));
+mark_code_modified(exec_data, sizeof(basic_payload));
+
+return copied_ptr(1234) == 1234;
+}
+
+static int self_modification_test(uint32_t *rw_data, const uint32_t *exec_data)
+{
+/*
+ * This test is self-modifying in an attempt to cover an edge case where
+ * the IC IVAU instruction invalidates itself.
+ *
+ * Note that the IC IVAU instruction is 16 bytes into the function, in what
+ * will be the same cache line as the modifed instruction on machines with
+ * a cache line size >= 16 bytes.
+ */
+static const uint32_t self_mod_payload[] = {
+/* Overwrite the placeholder instruction with the new one. */
+0xB9001C20, /* 0x00: STR w0, [x1, 0x1C] */
+
+/* Get the executable address of the modified instruction. */
+0x10A8, /* 0x04: ADR x8, <0x1C> */
+
+/* Mark the modified instruction as updated. */
+0xD50B7B28, /* 0x08: DC CVAU x8 */
+0xD5033BBF, /* 0x0C: DMB ISH */
+0xD50B7528, /* 0x10: IC IVAU x8 */
+0xD5033BBF, /* 0x14: DMB ISH */
+0xD5033FDF, /* 0x18: ISB */
+
+/* Placeholder instruction, overwritten above. */
+0x5280, /* 0x1C: MOV w0, 0 */
+
+   

[PATCH qemu v3 0/2] target/arm: Improve user-mode compatibility with JITs

2023-06-19 Thread ~jhogberg
The test cases have been changed in v3 to fix some issues pointed out in
code review. The main change is that the tests no longer naively copy C
code around, opting instead to have hard-coded binary payloads. Given
the small amount of code I found that the workarounds for position-
independence and figuring out the actual code length were at least as
ugly, but that's only my preference, please tell me if you'd prefer
something different.



When running in user-mode QEMU currently fails to emulate JITs that
use dual-mapped code to get around W^X restrictions, where one mapping
is writable and one is executable. As it has no way of knowing that a
write to the writable region is reflected in the executable one, it
fails to invalidate previously translated code which leads to a crash
at best.

(Note that system mode is unaffected as the softmmu is fully aware of
what is going on.)

This patch series catches changes to dual-mapped code by honoring the
cache management instructions required to make things work on actual
hardware.

See https://gitlab.com/qemu-project/qemu/-/issues/1034 for more
background information

John Högberg (2):
  target/arm: Handle IC IVAU to improve compatibility with JITs
  tests/tcg/aarch64: Add testcases for IC IVAU and dual-mapped code

 target/arm/helper.c   |  47 -
 tests/tcg/aarch64/Makefile.target |   3 +-
 tests/tcg/aarch64/icivau.c| 169 ++
 3 files changed, 215 insertions(+), 4 deletions(-)
 create mode 100644 tests/tcg/aarch64/icivau.c

-- 
2.38.5



[PATCH qemu v2 1/2] target/arm: Handle IC IVAU to improve compatibility with JITs

2023-06-12 Thread ~jhogberg
From: John Högberg 

Unlike architectures with precise self-modifying code semantics
(e.g. x86) ARM processors do not maintain coherency for instruction
execution and memory, and require the explicit use of cache
management instructions as well as an instruction barrier to make
code updates visible (the latter on every core that is going to
execute said code).

While this is required to make JITs work on actual hardware, QEMU
has gotten away with not handling this since it does not emulate
caches, and unconditionally invalidates code whenever the softmmu
or the user-mode page protection logic detects that code has been
modified.

Unfortunately the latter does not work in the face of dual-mapped
code (a common W^X workaround), where one page is executable and
the other is writable: user-mode has no way to connect one with the
other as that is only known to the kernel and the emulated
application.

This commit works around the issue by invalidating code in
IC IVAU instructions.

Resolves: https://gitlab.com/qemu-project/qemu/-/issues/1034

Co-authored-by: Richard Henderson 
Signed-off-by: John Högberg 
---
 target/arm/helper.c | 47 ++---
 1 file changed, 44 insertions(+), 3 deletions(-)

diff --git a/target/arm/helper.c b/target/arm/helper.c
index d4bee43bd0..235e3cd0b6 100644
--- a/target/arm/helper.c
+++ b/target/arm/helper.c
@@ -5228,6 +5228,36 @@ static void mdcr_el2_write(CPUARMState *env, const 
ARMCPRegInfo *ri,
 }
 }
 
+#ifdef CONFIG_USER_ONLY
+/*
+ * `IC IVAU` is handled to improve compatibility with JITs that dual-map their
+ * code to get around W^X restrictions, where one region is writable and the
+ * other is executable.
+ *
+ * Since the executable region is never written to we cannot detect code
+ * changes when running in user mode, and rely on the emulated JIT telling us
+ * that the code has changed by executing this instruction.
+ */
+static void ic_ivau_write(CPUARMState *env, const ARMCPRegInfo *ri,
+  uint64_t value)
+{
+uint64_t icache_line_mask, start_address, end_address;
+const ARMCPU *cpu;
+
+cpu = env_archcpu(env);
+
+icache_line_mask = (4 << extract32(cpu->ctr, 0, 4)) - 1;
+start_address = value & ~icache_line_mask;
+end_address = value | icache_line_mask;
+
+mmap_lock();
+
+tb_invalidate_phys_range(start_address, end_address);
+
+mmap_unlock();
+}
+#endif
+
 static const ARMCPRegInfo v8_cp_reginfo[] = {
 /*
  * Minimal set of EL0-visible registers. This will need to be expanded
@@ -5267,7 +5297,10 @@ static const ARMCPRegInfo v8_cp_reginfo[] = {
 { .name = "CURRENTEL", .state = ARM_CP_STATE_AA64,
   .opc0 = 3, .opc1 = 0, .opc2 = 2, .crn = 4, .crm = 2,
   .access = PL1_R, .type = ARM_CP_CURRENTEL },
-/* Cache ops: all NOPs since we don't emulate caches */
+/*
+ * Instruction cache ops. All of these except `IC IVAU` NOP because we
+ * don't emulate caches.
+ */
 { .name = "IC_IALLUIS", .state = ARM_CP_STATE_AA64,
   .opc0 = 1, .opc1 = 0, .crn = 7, .crm = 1, .opc2 = 0,
   .access = PL1_W, .type = ARM_CP_NOP,
@@ -5280,9 +5313,17 @@ static const ARMCPRegInfo v8_cp_reginfo[] = {
   .accessfn = access_tocu },
 { .name = "IC_IVAU", .state = ARM_CP_STATE_AA64,
   .opc0 = 1, .opc1 = 3, .crn = 7, .crm = 5, .opc2 = 1,
-  .access = PL0_W, .type = ARM_CP_NOP,
+  .access = PL0_W,
   .fgt = FGT_ICIVAU,
-  .accessfn = access_tocu },
+  .accessfn = access_tocu,
+#ifdef CONFIG_USER_ONLY
+  .type = ARM_CP_NO_RAW,
+  .writefn = ic_ivau_write
+#else
+  .type = ARM_CP_NOP
+#endif
+},
+/* Cache ops: all NOPs since we don't emulate caches */
 { .name = "DC_IVAC", .state = ARM_CP_STATE_AA64,
   .opc0 = 1, .opc1 = 0, .crn = 7, .crm = 6, .opc2 = 1,
   .access = PL1_W, .accessfn = aa64_cacheop_poc_access,
-- 
2.38.5




[PATCH qemu v2 0/2] target/arm: Improve user-mode compatibility with JITs

2023-06-12 Thread ~jhogberg
The previous version of this got mangled, so I'm re-sending it through
sourcehut as mentioned in the documentation in the hopes that it's
foolproof. Sorry about the extra traffic :-(



When running in user-mode QEMU currently fails to emulate JITs that
use dual-mapped code to get around W^X restrictions, where one mapping
is writable and one is executable. As it has no way of knowing that a
write to the writable region is reflected in the executable one, it
fails to invalidate previously translated code which leads to a crash
at best.

(Note that system mode is unaffected as the softmmu is fully aware of
what is going on.)

This patch series catches changes to dual-mapped code by honoring the
cache management instructions required to make things work on actual
hardware.

See https://gitlab.com/qemu-project/qemu/-/issues/1034 for more
background information

John Högberg (2):
  target/arm: Handle IC IVAU to improve compatibility with JITs
  tests/tcg/aarch64: Add testcases for IC IVAU and dual-mapped code

 target/arm/helper.c   |  47 ++-
 tests/tcg/aarch64/Makefile.target |   3 +-
 tests/tcg/aarch64/icivau.c| 204 ++
 3 files changed, 250 insertions(+), 4 deletions(-)
 create mode 100644 tests/tcg/aarch64/icivau.c

-- 
2.38.5



[PATCH qemu v2 2/2] tests/tcg/aarch64: Add testcases for IC IVAU and dual-mapped code

2023-06-12 Thread ~jhogberg
From: John Högberg 

https://gitlab.com/qemu-project/qemu/-/issues/1034

Signed-off-by: John Högberg 
---
 tests/tcg/aarch64/Makefile.target |   3 +-
 tests/tcg/aarch64/icivau.c| 204 ++
 2 files changed, 206 insertions(+), 1 deletion(-)
 create mode 100644 tests/tcg/aarch64/icivau.c

diff --git a/tests/tcg/aarch64/Makefile.target 
b/tests/tcg/aarch64/Makefile.target
index 3430fd3cd8..de6566d0d4 100644
--- a/tests/tcg/aarch64/Makefile.target
+++ b/tests/tcg/aarch64/Makefile.target
@@ -9,9 +9,10 @@ AARCH64_SRC=$(SRC_PATH)/tests/tcg/aarch64
 VPATH  += $(AARCH64_SRC)
 
 # Base architecture tests
-AARCH64_TESTS=fcvt pcalign-a64
+AARCH64_TESTS=fcvt pcalign-a64 icivau
 
 fcvt: LDFLAGS+=-lm
+icivau: LDFLAGS+=-lrt
 
 run-fcvt: fcvt
$(call run-test,$<,$(QEMU) $<, "$< on $(TARGET_NAME)")
diff --git a/tests/tcg/aarch64/icivau.c b/tests/tcg/aarch64/icivau.c
new file mode 100644
index 00..ff80d3d868
--- /dev/null
+++ b/tests/tcg/aarch64/icivau.c
@@ -0,0 +1,204 @@
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+#define PAYLOAD_SIZE (256)
+
+typedef int (*SelfModTestPtr)(char *, const char*, int);
+typedef int (*CompareTestPtr)(int, int);
+
+void flush_icache(const char *exec_data, size_t length)
+{
+size_t dcache_stride, icache_stride, i;
+unsigned long ctr_el0;
+
+/*
+ * Step according to minimum cache sizes, as the cache maintenance
+ * instructions operate on the cache line of the given address.
+ *
+ * We assume that exec_data is properly aligned.
+ */
+__asm__("mrs %0, ctr_el0\n" : "=r"(ctr_el0));
+dcache_stride = (4 << ((ctr_el0 >> 16) & 0xF));
+icache_stride = (4 << (ctr_el0 & 0xF));
+
+for (i = 0; i < length; i += dcache_stride) {
+const char *dc_addr = _data[i];
+__asm__ ("dc cvau, %x[dc_addr]\n"
+ : /* no outputs */
+ : [dc_addr] "r"(dc_addr)
+ : "memory");
+}
+
+__asm__ ("dmb ish\n");
+
+for (i = 0; i < length; i += icache_stride) {
+const char *ic_addr = _data[i];
+__asm__ ("ic ivau, %x[ic_addr]\n"
+ : /* no outputs */
+ : [ic_addr] "r"(ic_addr)
+ : "memory");
+}
+
+__asm__ ("dmb ish\n"
+ "isb sy\n");
+}
+
+/*
+ * The unmodified assembly of this function returns 0, it self-modifies to
+ * return the value indicated by new_move.
+ */
+int self_modification_payload(char *rw_data, const char *exec_data,
+  int new_move)
+{
+register int result __asm__ ("w0") = new_move;
+
+__asm__ (/* Get the writable address of __modify_me. */
+ "sub %x[rw_data], %x[rw_data], %x[exec_data]\n"
+ "adr %x[exec_data], __modify_me\n"
+ "add %x[rw_data], %x[rw_data], %x[exec_data]\n"
+ /* Overwrite the `MOV W0, #0` with the new move. */
+ "str %w[result], [%x[rw_data]]\n"
+ /*
+  * Mark the code as modified.
+  *
+  * Note that we align to the nearest 64 bytes in an attempt to put
+  * the flush sequence in the same cache line as the modified move.
+  */
+ ".align 6\n"
+ "dc cvau, %x[exec_data]\n"
+ ".align 2\n"
+ "dmb ish\n"
+ "ic ivau, %x[exec_data]\n"
+ "dmb ish\n"
+ "isb sy\n"
+ "__modify_me: mov w0, #0x0\n"
+ : [result] "+r"(result),
+   [rw_data] "+r"(rw_data),
+   [exec_data] "+r"(exec_data)
+ : /* No untouched inputs */
+ : "memory");
+
+return result;
+}
+
+int self_modification_test(char *rw_data, const char *exec_data)
+{
+SelfModTestPtr copied_ptr = (SelfModTestPtr)exec_data;
+int i;
+
+/*
+ * Bluntly assumes that the payload is position-independent and not larger
+ * than PAYLOAD_SIZE.
+ */
+memcpy(rw_data, self_modification_payload, PAYLOAD_SIZE);
+
+/*
+ * Notify all PEs that the code at exec_data has been altered.
+ *
+ * For completeness we could assert that we should fail when this is
+ * omitted, which works in user mode and on actual hardware as the
+ * modification won't "take," but doesn't work in system mode as the
+ * softmmu handles everything for us.
+ */
+flush_icache(exec_data, PAYLOAD_SIZE);
+
+for (i = 1; i < 10; i++) {
+const int mov_w0_template = 0x5280;
+
+/* MOV W0, i */
+if (copied_ptr(rw_data, exec_data, mov_w0_template | (i << 5)) != i) {
+return 0;
+}
+}
+
+return 1;
+}
+
+int compare_copied(char *rw_data, const char *exec_data,
+   int (*reference_ptr)(int, int))
+{
+CompareTestPtr copied_ptr = (CompareTestPtr)exec_data;
+int a, b;
+
+memcpy(rw_data, reference_ptr, PAYLOAD_SIZE);
+flush_icache(exec_data, PAYLOAD_SIZE);
+
+for (a =