> On 29 Apr 2025, at 14:03, Richard Sandiford <[email protected]> wrote:
>
> External email: Use caution opening links or attachments
>
> Jennifer Schmitz <[email protected]> writes:
>> diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
>> index f7bccf532f8..1c06b8528e9 100644
>> --- a/gcc/config/aarch64/aarch64.cc
>> +++ b/gcc/config/aarch64/aarch64.cc
>> @@ -6416,13 +6416,30 @@ aarch64_stack_protect_canary_mem (machine_mode mode, rtx decl_rtl,
>>  void
>>  aarch64_emit_sve_pred_move (rtx dest, rtx pred, rtx src)
>>  {
>> -  expand_operand ops[3];
>>    machine_mode mode = GET_MODE (dest);
>> -  create_output_operand (&ops[0], dest, mode);
>> -  create_input_operand (&ops[1], pred, GET_MODE(pred));
>> -  create_input_operand (&ops[2], src, mode);
>> -  temporary_volatile_ok v (true);
>> -  expand_insn (code_for_aarch64_pred_mov (mode), 3, ops);
>> +  if ((MEM_P (dest) || MEM_P (src))
>> +      && known_eq (BYTES_PER_SVE_VECTOR, 16)
>
> I suppose it's personal preference, but I think this would be more
> obvious as the suggested:
>
>   known_eq (GET_MODE_SIZE (mode), 16)
>
> so that everything is defined/tested in terms of the mode that we
> want to move.

Done. Also adjusted the cover letter accordingly.

>
>> +      && aarch64_classify_vector_mode (mode) == VEC_SVE_DATA
>> +      && !BYTES_BIG_ENDIAN)
>> +    {
>> +      if (MEM_P (src))
>> +        {
>> +          rtx tmp = force_reg (V16QImode, adjust_address (src, V16QImode, 0));
>> +          emit_move_insn (dest, lowpart_subreg (mode, tmp, V16QImode));
>> +        }
>> +      else
>> +        emit_move_insn (adjust_address (dest, V16QImode, 0),
>> +                        force_lowpart_subreg (V16QImode, src, mode));
>> +    }
>> +  else
>> +    {
>> +      expand_operand ops[3];
>> +      create_output_operand (&ops[0], dest, mode);
>> +      create_input_operand (&ops[1], pred, GET_MODE(pred));
>> +      create_input_operand (&ops[2], src, mode);
>> +      temporary_volatile_ok v (true);
>> +      expand_insn (code_for_aarch64_pred_mov (mode), 3, ops);
>> +    }
>>  }
>>
>>  /* Expand a pre-RA SVE data move from SRC to DEST in which at least one
>> [...]
>> diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pcs/struct_3_128.c b/gcc/testsuite/gcc.target/aarch64/sve/pcs/struct_3_128.c
>> index d99ce1202a9..370bd9e3bfe 100644
>> --- a/gcc/testsuite/gcc.target/aarch64/sve/pcs/struct_3_128.c
>> +++ b/gcc/testsuite/gcc.target/aarch64/sve/pcs/struct_3_128.c
>> @@ -473,17 +473,11 @@ SEL2 (struct, pst_uniform4)
>>  ** sub sp, sp, #144
>>  ** add (x[0-9]+), sp, #?31
>>  ** and x7, \1, #?(?:-32|4294967264)
>> -** ptrue (p[0-7])\.b, vl16
>> -** st1w z0\.s, \2, \[x7\]
>> -** add (x[0-9]+), x7, #?32
>> -** (
>> -** str z1, \[\3\]
>> -** str z2, \[\3, #1, mul vl\]
>> -** |
>> -** stp q1, q2, \[\3\]
>> -** )
>> -** str z3, \[\3, #2, mul vl\]
>> -** st1w z4\.s, \2, \[x7, #6, mul vl\]
>> +** mov x0, x7
>> +** str q0, \[x0\], 32
>> +** stp q1, q2, \[x0\]
>> +** str z3, \[x0, #2, mul vl\]
>> +** str q4, \[x7, 96\]
>>  ** add sp, sp, #?144
>>  ** ret
>>  */
>
> Sorry for not noticing last time, but:
>
> There's no need for the temporary register to be x0, so it should be
> handled using captures rather than hard-coded to x0, as with the
> original ADD.
>
> Also, the patch doesn't change the handling of unpredicated SVE stores,
> and so doesn't change whether we use:
>
>   str z1, \[\3\]
>   str z2, \[\3, #1, mul vl\]
>
> or:
>
>   stp q1, q2, \[\3\]
>
> So I think we should keep the (...|...) part of the test as-is, with
> just the captures updated.
>
> There would be a benefit in "defending" the use of STP if we paired q0+q1
> and q2+q3, but until then, I think we should continue to accept both.
>
> Same for the other tests in the file.

I made the change to the test file (I hope I caught all cases).
Thanks,
Jennifer
If -msve-vector-bits=128, SVE loads and stores (LD1 and ST1) with a
ptrue predicate can be replaced by Neon instructions (LDR and STR),
thus avoiding the predicate altogether. This also enables the
formation of LDP/STP pairs.
For example, the test cases

svfloat64_t
ptrue_load (float64_t *x)
{
  svbool_t pg = svptrue_b64 ();
  return svld1_f64 (pg, x);
}

void
ptrue_store (float64_t *x, svfloat64_t data)
{
  svbool_t pg = svptrue_b64 ();
  svst1_f64 (pg, x, data);
}
were previously compiled to
(with -O2 -march=armv8.2-a+sve -msve-vector-bits=128):

ptrue_load:
        ptrue   p3.b, vl16
        ld1d    z0.d, p3/z, [x0]
        ret

ptrue_store:
        ptrue   p3.b, vl16
        st1d    z0.d, p3, [x0]
        ret
Now they are compiled to:

ptrue_load:
        ldr     q0, [x0]
        ret

ptrue_store:
        str     q0, [x0]
        ret
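
The following sketch illustrates the LDP/STP benefit (my own example,
not taken from the patch's testsuite; register allocation and pair
formation are expected rather than guaranteed):

#include <arm_sve.h>

void
store_pair (float64_t *x, svfloat64_t a, svfloat64_t b)
{
  svbool_t pg = svptrue_b64 ();
  /* With -msve-vector-bits=128, svcntd () is 2, so the second store
     targets x + 16 bytes, directly adjacent to the first.  */
  svst1_f64 (pg, x, a);
  svst1_f64 (pg, x + svcntd (), b);
}

Previously each store needed a predicated ST1D; with the patch both
become plain q-register stores at adjacent offsets, which the
load/store pair pass can fuse into a single stp q0, q1, [x0] (as in
the struct_3_128.c adjustments below).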
The implementation includes the condition

  if (known_eq (GET_MODE_SIZE (mode), 16)
      && aarch64_classify_vector_mode (mode) == VEC_SVE_DATA)

which checks for 128-bit VLS and excludes partial modes with a mode
size of less than 128 bits (e.g. VNx2QI).
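
As a hedged illustration of the guard (my own example, not from the
patch's testsuite): a full 128-bit data mode such as VNx4SI satisfies
both checks and takes the new path, whereas partial modes only arise
internally (there is no ACLE type for them) and keep the predicated
expansion:

#include <arm_sve.h>

/* svint32_t has mode VNx4SI, which occupies exactly 16 bytes at
   -msve-vector-bits=128, so known_eq (GET_MODE_SIZE (mode), 16)
   holds and the load is emitted as ldr q0, [x0].  A partial mode
   such as VNx2QI holds only 2 byte-sized elements per 128 bits,
   so the size check fails and the predicated LD1/ST1 form is
   kept.  */
svint32_t
full_mode_load (int32_t *x)
{
  return svld1_s32 (svptrue_b32 (), x);
}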
The patch was bootstrapped and tested on aarch64-linux-gnu, with no
regressions.
OK for mainline?
Signed-off-by: Jennifer Schmitz <[email protected]>
gcc/
	* config/aarch64/aarch64.cc (aarch64_emit_sve_pred_move):
	Fold LD1/ST1 with ptrue to LDR/STR for 128-bit VLS.

gcc/testsuite/
	* gcc.target/aarch64/sve/ldst_ptrue_128_to_neon.c: New test.
	* gcc.target/aarch64/sve/cond_arith_6.c: Adjust expected outcome.
	* gcc.target/aarch64/sve/pcs/return_4_128.c: Likewise.
	* gcc.target/aarch64/sve/pcs/return_5_128.c: Likewise.
	* gcc.target/aarch64/sve/pcs/struct_3_128.c: Likewise.
---
gcc/config/aarch64/aarch64.cc | 29 ++++++++--
.../gcc.target/aarch64/sve/cond_arith_6.c | 3 +-
.../aarch64/sve/ldst_ptrue_128_to_neon.c | 48 ++++++++++++++++
.../gcc.target/aarch64/sve/pcs/return_4_128.c | 39 +++++--------
.../gcc.target/aarch64/sve/pcs/return_5_128.c | 39 +++++--------
.../gcc.target/aarch64/sve/pcs/struct_3_128.c | 56 +++++++------------
6 files changed, 118 insertions(+), 96 deletions(-)
create mode 100644 gcc/testsuite/gcc.target/aarch64/sve/ldst_ptrue_128_to_neon.c
diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
index f7bccf532f8..fff8d9da49d 100644
--- a/gcc/config/aarch64/aarch64.cc
+++ b/gcc/config/aarch64/aarch64.cc
@@ -6416,13 +6416,30 @@ aarch64_stack_protect_canary_mem (machine_mode mode, rtx decl_rtl,
 void
 aarch64_emit_sve_pred_move (rtx dest, rtx pred, rtx src)
 {
-  expand_operand ops[3];
   machine_mode mode = GET_MODE (dest);
-  create_output_operand (&ops[0], dest, mode);
-  create_input_operand (&ops[1], pred, GET_MODE(pred));
-  create_input_operand (&ops[2], src, mode);
-  temporary_volatile_ok v (true);
-  expand_insn (code_for_aarch64_pred_mov (mode), 3, ops);
+  if ((MEM_P (dest) || MEM_P (src))
+      && known_eq (GET_MODE_SIZE (mode), 16)
+      && aarch64_classify_vector_mode (mode) == VEC_SVE_DATA
+      && !BYTES_BIG_ENDIAN)
+    {
+      if (MEM_P (src))
+        {
+          rtx tmp = force_reg (V16QImode, adjust_address (src, V16QImode, 0));
+          emit_move_insn (dest, lowpart_subreg (mode, tmp, V16QImode));
+        }
+      else
+        emit_move_insn (adjust_address (dest, V16QImode, 0),
+                        force_lowpart_subreg (V16QImode, src, mode));
+    }
+  else
+    {
+      expand_operand ops[3];
+      create_output_operand (&ops[0], dest, mode);
+      create_input_operand (&ops[1], pred, GET_MODE(pred));
+      create_input_operand (&ops[2], src, mode);
+      temporary_volatile_ok v (true);
+      expand_insn (code_for_aarch64_pred_mov (mode), 3, ops);
+    }
 }

 /* Expand a pre-RA SVE data move from SRC to DEST in which at least one
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/cond_arith_6.c b/gcc/testsuite/gcc.target/aarch64/sve/cond_arith_6.c
index 4085ab12444..d5a12f1df07 100644
--- a/gcc/testsuite/gcc.target/aarch64/sve/cond_arith_6.c
+++ b/gcc/testsuite/gcc.target/aarch64/sve/cond_arith_6.c
@@ -8,7 +8,8 @@ f (float *x)
x[i] -= 1.0f;
}
-/* { dg-final { scan-assembler {\tld1w\tz} } } */
+/* { dg-final { scan-assembler {\tld1w\tz} { target aarch64_big_endian } } } */
+/* { dg-final { scan-assembler {\tldr\tq} { target aarch64_little_endian } } } */
/* { dg-final { scan-assembler {\tfcmgt\tp} } } */
/* { dg-final { scan-assembler {\tfsub\tz} } } */
/* { dg-final { scan-assembler {\tst1w\tz} } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/ldst_ptrue_128_to_neon.c b/gcc/testsuite/gcc.target/aarch64/sve/ldst_ptrue_128_to_neon.c
new file mode 100644
index 00000000000..43d36e86ad9
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/ldst_ptrue_128_to_neon.c
@@ -0,0 +1,48 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -msve-vector-bits=128" } */
+/* { dg-require-effective-target aarch64_little_endian } */
+
+#include <arm_sve.h>
+
+#define TEST(TYPE, TY, B) \
+  sv##TYPE \
+  ld1_##TY##B (TYPE *x) \
+  { \
+    svbool_t pg = svptrue_b##B (); \
+    return svld1_##TY##B (pg, x); \
+  } \
+ \
+  void \
+  st1_##TY##B (TYPE *x, sv##TYPE data) \
+  { \
+    svbool_t pg = svptrue_b##B (); \
+    svst1_##TY##B (pg, x, data); \
+  } \
+ \
+  sv##TYPE \
+  ld1_vol_##TY##B (volatile sv##TYPE *ptr) \
+  { \
+    return *ptr; \
+  } \
+ \
+  void \
+  st1_vol_##TY##B (volatile sv##TYPE *ptr, sv##TYPE x) \
+  { \
+    *ptr = x; \
+  }
+
+TEST (bfloat16_t, bf, 16)
+TEST (float16_t, f, 16)
+TEST (float32_t, f, 32)
+TEST (float64_t, f, 64)
+TEST (int8_t, s, 8)
+TEST (int16_t, s, 16)
+TEST (int32_t, s, 32)
+TEST (int64_t, s, 64)
+TEST (uint8_t, u, 8)
+TEST (uint16_t, u, 16)
+TEST (uint32_t, u, 32)
+TEST (uint64_t, u, 64)
+
+/* { dg-final { scan-assembler-times {\tldr\tq0, \[x0\]} 24 } } */
+/* { dg-final { scan-assembler-times {\tstr\tq0, \[x0\]} 24 } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pcs/return_4_128.c b/gcc/testsuite/gcc.target/aarch64/sve/pcs/return_4_128.c
index 87d528c84cd..ac5f981490a 100644
--- a/gcc/testsuite/gcc.target/aarch64/sve/pcs/return_4_128.c
+++ b/gcc/testsuite/gcc.target/aarch64/sve/pcs/return_4_128.c
@@ -11,104 +11,91 @@
/*
** callee_s8:
-** ptrue (p[0-7])\.b, vl16
-** ld1b z0\.b, \1/z, \[x0\]
+** ldr q0, \[x0\]
** ret
*/
CALLEE (s8, __SVInt8_t)
/*
** callee_u8:
-** ptrue (p[0-7])\.b, vl16
-** ld1b z0\.b, \1/z, \[x0\]
+** ldr q0, \[x0\]
** ret
*/
CALLEE (u8, __SVUint8_t)
/*
** callee_mf8:
-** ptrue (p[0-7])\.b, vl16
-** ld1b z0\.b, \1/z, \[x0\]
+** ldr q0, \[x0\]
** ret
*/
CALLEE (mf8, __SVMfloat8_t)
/*
** callee_s16:
-** ptrue (p[0-7])\.b, vl16
-** ld1h z0\.h, \1/z, \[x0\]
+** ldr q0, \[x0\]
** ret
*/
CALLEE (s16, __SVInt16_t)
/*
** callee_u16:
-** ptrue (p[0-7])\.b, vl16
-** ld1h z0\.h, \1/z, \[x0\]
+** ldr q0, \[x0\]
** ret
*/
CALLEE (u16, __SVUint16_t)
/*
** callee_f16:
-** ptrue (p[0-7])\.b, vl16
-** ld1h z0\.h, \1/z, \[x0\]
+** ldr q0, \[x0\]
** ret
*/
CALLEE (f16, __SVFloat16_t)
/*
** callee_bf16:
-** ptrue (p[0-7])\.b, vl16
-** ld1h z0\.h, \1/z, \[x0\]
+** ldr q0, \[x0\]
** ret
*/
CALLEE (bf16, __SVBfloat16_t)
/*
** callee_s32:
-** ptrue (p[0-7])\.b, vl16
-** ld1w z0\.s, \1/z, \[x0\]
+** ldr q0, \[x0\]
** ret
*/
CALLEE (s32, __SVInt32_t)
/*
** callee_u32:
-** ptrue (p[0-7])\.b, vl16
-** ld1w z0\.s, \1/z, \[x0\]
+** ldr q0, \[x0\]
** ret
*/
CALLEE (u32, __SVUint32_t)
/*
** callee_f32:
-** ptrue (p[0-7])\.b, vl16
-** ld1w z0\.s, \1/z, \[x0\]
+** ldr q0, \[x0\]
** ret
*/
CALLEE (f32, __SVFloat32_t)
/*
** callee_s64:
-** ptrue (p[0-7])\.b, vl16
-** ld1d z0\.d, \1/z, \[x0\]
+** ldr q0, \[x0\]
** ret
*/
CALLEE (s64, __SVInt64_t)
/*
** callee_u64:
-** ptrue (p[0-7])\.b, vl16
-** ld1d z0\.d, \1/z, \[x0\]
+** ldr q0, \[x0\]
** ret
*/
CALLEE (u64, __SVUint64_t)
/*
** callee_f64:
-** ptrue (p[0-7])\.b, vl16
-** ld1d z0\.d, \1/z, \[x0\]
+** ldr q0, \[x0\]
** ret
*/
CALLEE (f64, __SVFloat64_t)
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pcs/return_5_128.c b/gcc/testsuite/gcc.target/aarch64/sve/pcs/return_5_128.c
index 347a16c1367..2fab6feb41c 100644
--- a/gcc/testsuite/gcc.target/aarch64/sve/pcs/return_5_128.c
+++ b/gcc/testsuite/gcc.target/aarch64/sve/pcs/return_5_128.c
@@ -13,104 +13,91 @@
/*
** callee_s8:
-** ptrue (p[0-7])\.b, vl16
-** ld1b z0\.b, \1/z, \[x0\]
+** ldr q0, \[x0\]
** ret
*/
CALLEE (s8, svint8_t)
/*
** callee_u8:
-** ptrue (p[0-7])\.b, vl16
-** ld1b z0\.b, \1/z, \[x0\]
+** ldr q0, \[x0\]
** ret
*/
CALLEE (u8, svuint8_t)
/*
** callee_mf8:
-** ptrue (p[0-7])\.b, vl16
-** ld1b z0\.b, \1/z, \[x0\]
+** ldr q0, \[x0\]
** ret
*/
CALLEE (mf8, svmfloat8_t)
/*
** callee_s16:
-** ptrue (p[0-7])\.b, vl16
-** ld1h z0\.h, \1/z, \[x0\]
+** ldr q0, \[x0\]
** ret
*/
CALLEE (s16, svint16_t)
/*
** callee_u16:
-** ptrue (p[0-7])\.b, vl16
-** ld1h z0\.h, \1/z, \[x0\]
+** ldr q0, \[x0\]
** ret
*/
CALLEE (u16, svuint16_t)
/*
** callee_f16:
-** ptrue (p[0-7])\.b, vl16
-** ld1h z0\.h, \1/z, \[x0\]
+** ldr q0, \[x0\]
** ret
*/
CALLEE (f16, svfloat16_t)
/*
** callee_bf16:
-** ptrue (p[0-7])\.b, vl16
-** ld1h z0\.h, \1/z, \[x0\]
+** ldr q0, \[x0\]
** ret
*/
CALLEE (bf16, svbfloat16_t)
/*
** callee_s32:
-** ptrue (p[0-7])\.b, vl16
-** ld1w z0\.s, \1/z, \[x0\]
+** ldr q0, \[x0\]
** ret
*/
CALLEE (s32, svint32_t)
/*
** callee_u32:
-** ptrue (p[0-7])\.b, vl16
-** ld1w z0\.s, \1/z, \[x0\]
+** ldr q0, \[x0\]
** ret
*/
CALLEE (u32, svuint32_t)
/*
** callee_f32:
-** ptrue (p[0-7])\.b, vl16
-** ld1w z0\.s, \1/z, \[x0\]
+** ldr q0, \[x0\]
** ret
*/
CALLEE (f32, svfloat32_t)
/*
** callee_s64:
-** ptrue (p[0-7])\.b, vl16
-** ld1d z0\.d, \1/z, \[x0\]
+** ldr q0, \[x0\]
** ret
*/
CALLEE (s64, svint64_t)
/*
** callee_u64:
-** ptrue (p[0-7])\.b, vl16
-** ld1d z0\.d, \1/z, \[x0\]
+** ldr q0, \[x0\]
** ret
*/
CALLEE (u64, svuint64_t)
/*
** callee_f64:
-** ptrue (p[0-7])\.b, vl16
-** ld1d z0\.d, \1/z, \[x0\]
+** ldr q0, \[x0\]
** ret
*/
CALLEE (f64, svfloat64_t)
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pcs/struct_3_128.c b/gcc/testsuite/gcc.target/aarch64/sve/pcs/struct_3_128.c
index d99ce1202a9..29bdaf3ae59 100644
--- a/gcc/testsuite/gcc.target/aarch64/sve/pcs/struct_3_128.c
+++ b/gcc/testsuite/gcc.target/aarch64/sve/pcs/struct_3_128.c
@@ -473,17 +473,16 @@ SEL2 (struct, pst_uniform4)
** sub sp, sp, #144
** add (x[0-9]+), sp, #?31
** and x7, \1, #?(?:-32|4294967264)
-** ptrue (p[0-7])\.b, vl16
-** st1w z0\.s, \2, \[x7\]
-** add (x[0-9]+), x7, #?32
+** mov (x[0-9]+), x7
+** str q0, \[\2\], 32
** (
-** str z1, \[\3\]
-** str z2, \[\3, #1, mul vl\]
+** str z1, \[\2\]
+** str z2, \[\2, #1, mul vl\]
** |
-** stp q1, q2, \[\3\]
+** stp q1, q2, \[\2\]
** )
-** str z3, \[\3, #2, mul vl\]
-** st1w z4\.s, \2, \[x7, #6, mul vl\]
+** str z3, \[\2, #2, mul vl\]
+** str q4, \[x7, 96\]
** add sp, sp, #?144
** ret
*/
@@ -516,20 +515,12 @@ SEL2 (struct, pst_mixed1)
** test_pst_mixed1:
** sub sp, sp, #176
** str p0, \[sp\]
-** ptrue p0\.b, vl16
-** st1h z0\.h, p0, \[sp, #1, mul vl\]
-** st1h z1\.h, p0, \[sp, #2, mul vl\]
-** st1w z2\.s, p0, \[sp, #3, mul vl\]
-** st1d z3\.d, p0, \[sp, #4, mul vl\]
+** stp q0, q1, \[sp, 16\]
+** stp q2, q3, \[sp, 48\]
** str p1, \[sp, #40, mul vl\]
** str p2, \[sp, #41, mul vl\]
-** st1b z4\.b, p0, \[sp, #6, mul vl\]
-** st1h z5\.h, p0, \[sp, #7, mul vl\]
-** ...
-** st1w z6\.s, p0, [^\n]*
-** ...
-** st1d z7\.d, p0, [^\n]*
-** ...
+** stp q4, q5, \[sp, 96\]
+** stp q6, q7, \[sp, 128\]
** str p3, \[sp, #80, mul vl\]
** mov (x7, sp|w7, wsp)
** add sp, sp, #?176
@@ -557,15 +548,11 @@ SEL2 (struct, pst_mixed2)
** test_pst_mixed2:
** sub sp, sp, #128
** str p0, \[sp\]
-** ptrue (p[03])\.b, vl16
-** add (x[0-9]+), sp, #?2
-** st1b z0\.b, \1, \[\2\]
+** str q0, \[sp, 2\]
** str p1, \[sp, #9, mul vl\]
-** add (x[0-9]+), sp, #?20
-** st1b z1\.b, \1, \[\3\]
+** str q1, \[sp, 20\]
** str p2, \[sp, #18, mul vl\]
-** add (x[0-9]+), sp, #?38
-** st1b z2\.b, \1, \[\4\]
+** str q2, \[sp, 38\]
** (
** str z3, \[sp, #4, mul vl\]
** str z4, \[sp, #5, mul vl\]
@@ -595,8 +582,7 @@ SEL2 (struct, pst_big1)
/*
** test_pst_big1_a: { target lp64 }
-** ptrue (p[0-7])\.b, vl16
-** ld1b z0\.b, \1/z, \[x0\]
+** ldr q0, \[x0\]
** ret
*/
/*
@@ -760,8 +746,7 @@ test_pst_big3_d (struct pst_big3 x)
/*
** test_pst_big3_e: { target lp64 }
-** ptrue (p[0-7])\.b, vl16
-** ld1b z0\.b, \1/z, \[x0, #1, mul vl\]
+** ldr q0, \[x0, 16\]
** ret
*/
/*
@@ -780,8 +765,7 @@ test_pst_big3_e (struct pst_big3 x)
/*
** test_pst_big3_f: { target lp64 }
-** ptrue (p[0-7])\.b, vl16
-** ld1b z0\.b, \1/z, \[x0, #5, mul vl\]
+** ldr q0, \[x0, 80\]
** ret
*/
/*
@@ -1035,8 +1019,7 @@ SEL2 (struct, nonpst6)
/*
** test_nonpst6: { target lp64 }
-** ptrue (p[0-3])\.b, vl16
-** ld1d z0\.d, \1/z, \[x0\]
+** ldr q0, \[x0\]
** ret
*/
/*
@@ -1063,8 +1046,7 @@ SEL2 (struct, nonpst7)
/*
** test_nonpst7: { target lp64 }
-** ptrue (p[0-3])\.b, vl16
-** ld1d z0\.d, \1/z, \[x0\]
+** ldr q0, \[x0\]
** ret
*/
/*
--
2.34.1
>
> Thanks,
> Richard