https://gcc.gnu.org/g:52d5a8870d2108c660c34c8c1b7ea255809817d5
commit r17-899-g52d5a8870d2108c660c34c8c1b7ea255809817d5 Author: Artemiy Volkov <[email protected]> Date: Mon Dec 22 12:46:21 2025 +0000 aarch64/sve: combine AdvSIMD and SVE vec_duplicates Currently, to duplicate a 64-bit or narrower value into an SVE register, we choose to go via an intermediate 128-bit AdvSIMD register, viz.: svfloat32_t foo(float x) { return svdupq_n_f32(x, x, x, x); } which will produce the following code: dup v0.4s, v0.s[0] dup z0.q, z0.q[0] ret when compiled with -O2 -march=armv9-a+sve. This can be simplified into a single dup instruction going to an SVE register directly from a scalar (or a smaller vector) value: mov z0.s, s0 ret To facilitate this, this patch adds a pattern that combine can use to merge two vec_duplicate instructions (scalar -> AdvSIMD and AdvSIMD -> SVE) into a single one (scalar -> SVE). To demonstrate the effect of this patch, the vec-init-23.c test from AdvSIMD was reused as a new SVE test (vec_init_5.c). gcc/ChangeLog: * config/aarch64/aarch64-sve.md (*aarch64_vec_duplicate_subvector<vconsv><vconq><mode>): New pattern. * config/aarch64/iterators.md (VCONSV): New mode attribute. (vconsv): Likewise. gcc/testsuite/ChangeLog: * gcc.target/aarch64/sve/vec_init_5.c: New test. Diff: --- gcc/config/aarch64/aarch64-sve.md | 14 + gcc/config/aarch64/iterators.md | 24 ++ gcc/testsuite/gcc.target/aarch64/sve/vec_init_5.c | 382 ++++++++++++++++++++++ 3 files changed, 420 insertions(+) diff --git a/gcc/config/aarch64/aarch64-sve.md b/gcc/config/aarch64/aarch64-sve.md index e7d98c3754f1..ba4ff7267914 100644 --- a/gcc/config/aarch64/aarch64-sve.md +++ b/gcc/config/aarch64/aarch64-sve.md @@ -2890,6 +2890,20 @@ [(set_attr "sve_type" "sve_int_general")] ) +;; Initialize an SVE vector by duplicating a 128-bit AdvSIMD register that +;; itself contains a duplicated scalar or subvector value. 
+(define_insn "*aarch64_vec_duplicate_subvector<vconsv><vconq><mode>" + [(set (match_operand:<VCONSV> 0 "register_operand") + (vec_duplicate:<VCONSV> + (vec_duplicate:<VCONQ> + (match_operand:VQDUP 1 "register_operand"))))] + "TARGET_SVE" + {@ [ cons: =0 , 1 ] + [ w , r ] mov\t%0.<single_type>, %<single_wx>1 + [ w , w ] mov\t%0.<single_type>, %<single_type>1 + } +) + ;; This is used for vec_duplicate<mode>s from memory, but can also ;; be used by combine to optimize selects of a vec_duplicate<mode> ;; with zero. diff --git a/gcc/config/aarch64/iterators.md b/gcc/config/aarch64/iterators.md index 462f2d996f07..41410095ba39 100644 --- a/gcc/config/aarch64/iterators.md +++ b/gcc/config/aarch64/iterators.md @@ -1995,6 +1995,30 @@ (HI "v8hi") (QI "v16qi") (SF "v4sf") (DF "v2df")]) +;; SVE container modes for duplication into a full SVE register. +(define_mode_attr VCONSV [(V8QI "VNx16QI") (V4QI "VNx16QI") + (V2QI "VNx16QI") (QI "VNx16QI") + (V4HI "VNx8HI") (V2HI "VNx8HI") + (HI "VNx8HI") (V2SI "VNx4SI") + (SI "VNx4SI") (DI "VNx2DI") + (V4BF "VNx8BF") (V2BF "VNx8BF") + (BF "VNx8BF") (V4HF "VNx8HF") + (V2HF "VNx8HF") (HF "VNx8HF") + (V2SF "VNx4SF") (SF "VNx4SF") + (DF "VNx2DF")]) + +;; Same as above, but in lowercase. +(define_mode_attr vconsv [(V8QI "vnx16qi") (V4QI "vnx16qi") + (V2QI "vnx16qi") (QI "vnx16qi") + (V4HI "vnx8hi") (V2HI "vnx8hi") + (HI "vnx8hi") (V2SI "vnx4si") + (SI "vnx4si") (DI "vnx2di") + (V4BF "vnx8bf") (V2BF "vnx8bf") + (BF "vnx8bf") (V4HF "vnx8hf") + (V2HF "vnx8hf") (HF "vnx8hf") + (V2SF "vnx4sf") (SF "vnx4sf") + (DF "vnx2df")]) + ;; Half modes of all vector modes. 
(define_mode_attr VHALF [(V8QI "V4QI") (V16QI "V8QI") (V4HI "V2HI") (V8HI "V4HI") diff --git a/gcc/testsuite/gcc.target/aarch64/sve/vec_init_5.c b/gcc/testsuite/gcc.target/aarch64/sve/vec_init_5.c new file mode 100644 index 000000000000..99e04aac2650 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/vec_init_5.c @@ -0,0 +1,382 @@ +/* { dg-do compile } */ +/* { dg-options "-O2" } */ +/* { dg-final { check-function-bodies "**" "" "" } } */ + +#include <arm_sve.h> + +#define TESTCASE(TYPE, ETYPE, T, SZ, NUM, MULT, ...)\ + sv##TYPE##SZ##_t test_##TYPE##SZ##_##NUM (ETYPE x0, ETYPE x1, ETYPE x2, ETYPE x3,\ + ETYPE x4, ETYPE x5, ETYPE x6, ETYPE x7)\ + {\ + return svdupq_n_##T##SZ (__VA_ARGS__);\ + } + +#include "../vec-init-23.c" + +/* +** test_int8_1: +** mov z0\.b, w0 +** ret +*/ + +/* +** test_int8_2: +** bfi w0, w1, 8, 8 +** mov z0\.h, w0 +** ret +*/ + +/* +** test_int8_3: +** bfi w0, w1, 8, 8 +** bfi w0, w2, 16, 8 +** bfi w0, w3, 24, 8 +** mov z0\.s, w0 +** ret +*/ + +/* +** test_int8_4: +** bfi w0, w2, 8, 8 +** bfi w1, w3, 8, 8 +** bfi w0, w4, 16, 8 +** bfi w1, w5, 16, 8 +** bfi w0, w6, 24, 8 +** bfi w1, w7, 24, 8 +** dup v31\.2s, w0 +** dup v30\.2s, w1 +** zip1 v31\.16b, v31\.16b, v30\.16b +** dup z0\.q, z31\.q\[0\] +** ret +*/ + +/* +** test_int8_5: +** uxtb w0, w0 +** mov z0\.h, w0 +** ret +*/ + +/* +** test_int8_6: +** mov w1, 0 +** bfi w1, w0, 8, 8 +** mov z0\.h, w1 +** ret +*/ + +/* +** test_int8_7: +** mov w2, 16777472 +** bfi w2, w0, 0, 8 +** bfi w2, w1, 8, 8 +** mov z0\.s, w2 +** ret +*/ + +/* +** test_int8_8: +** mov w2, 16777472 +** bfi w2, w0, 16, 8 +** bfi w2, w1, 24, 8 +** mov z0\.s, w2 +** ret +*/ + +/* +** test_int8_9: +** mov w2, 16777216 +** bfi w2, w0, 0, 8 +** bfi w2, w1, 16, 8 +** mov z0.s, w2 +** ret +*/ + +/* +** test_int8_10: +** bfi w0, w1, 8, 8 +** bfi w0, w2, 16, 8 +** bfi w0, w3, 24, 8 +** dup v31\.2s, w0 +** adrp x0, .LANCHOR[0-9]+ +** ldr d30, \[x0, #:lo12:.LANCHOR[0-9]+\] +** zip1 v31\.16b, v31\.16b, v30\.16b +** dup z0\.q, 
z31\.q\[0\] +** ret +*/ + +/* +** test_int8_11: +** bfi w0, w1, 8, 8 +** adrp x4, .LANCHOR[0-9]+ +** bfi w0, w2, 16, 8 +** ldr d31, \[x4, #:lo12:\.LANCHOR[0-9]+\] +** bfi w0, w3, 24, 8 +** dup v30\.2s, w0 +** zip1 v31\.16b, v31\.16b, v30\.16b +** dup z0\.q, z31\.q\[0\] +** ret +*/ + +/* +** test_int8_12: +** mov w4, 255 +** movk w4, 0x201, lsl 16 +** bfi w4, w0, 0, 8 +** mov w0, 256 +** movk w0, 0x302, lsl 16 +** bfi w0, w1, 0, 8 +** bfi w4, w2, 16, 8 +** bfi w0, w3, 16, 8 +** dup v31\.2s, w4 +** dup v30\.2s, w0 +** zip1 v31\.16b, v31\.16b, v30\.16b +** dup z0\.q, z31\.q\[0\] +** ret +*/ + +/* +** test_int8_13: +** mov w4, 256 +** movk w4, 0x302, lsl 16 +** bfi w4, w0, 8, 8 +** mov w0, 513 +** movk w0, 0x403, lsl 16 +** bfi w0, w1, 8, 8 +** bfi w4, w2, 24, 8 +** bfi w0, w3, 24, 8 +** dup v31\.2s, w4 +** dup v30\.2s, w0 +** zip1 v31\.16b, v31\.16b, v30\.16b +** dup z0\.q, z31\.q\[0\] +** ret +*/ + +/* +** test_float16_1: +** fcvt h0, s0 +** mov z0\.h, h0 +** ret +*/ + +/* +** test_float16_2: +** fcvt h1, s1 +** fcvt h0, s0 +** uzp1 v0\.4h, v0\.4h, v1\.4h +** mov z0\.s, s0 +** ret +*/ + +/* +** test_float16_3: +** uzp1 v2\.2s, v0\.2s, v2\.2s +** uzp1 v3\.2s, v1\.2s, v3\.2s +** zip1 v3\.4s, v2\.4s, v3\.4s +** fcvtn v3\.4h, v3\.4s +** mov z0\.d, d3 +** ret +*/ + +/* +** test_float16_4: +** fcvt h0, s0 +** fmov h0, h0 +** mov z0\.s, s0 +** ret +*/ + +/* +** test_float16_5: +** movi v31\.4h, #0 +** fcvt h0, s0 +** uzp1 v0\.4h, v31\.4h, v0\.4h +** mov z0\.s, s0 +** ret +*/ + +/* +** test_float16_6: +** fcvt h2, s0 +** fcvt h1, s1 +** fmov h31, 1.0e\+0 +** fmov h2, h2 +** uzp1 v1\.4h, v1\.4h, v31\.4h +** dup v0\.2s, v2\.s\[0\] +** dup v1\.2s, v1\.s\[0\] +** zip1 v0\.8h, v0\.8h, v1\.8h +** dup z0\.q, z0\.q\[0\] +** ret +*/ + +/* +** test_float16_7: +** fcvt h3, s0 +** fcvt h2, s1 +** movi v0\.4h, #0 +** fmov h1, 1.0e\+0 +** uzp1 v1\.4h, v1\.4h, v2\.4h +** uzp1 v0\.4h, v0\.4h, v3\.4h +** dup v1\.2s, v1\.s\[0\] +** dup v0\.2s, v0\.s\[0\] +** zip1 v0\.8h, v0\.8h, v1\.8h +** 
dup z0\.q, z0\.q\[0\] +** ret +*/ + +/* +** test_float16_8: +** fcvt h1, s1 +** fcvt h0, s0 +** movi v31\.2s, 0x3c, lsl 24 +** uzp1 v0\.4h, v0\.4h, v1.4h +** dup v0\.2s, v0\.s\[0\] +** zip1 v0\.8h, v31\.8h, v0\.8h +** dup z0\.q, z0\.q\[0\] +** ret +*/ + +/* +** test_int16_1: +** mov z0\.h, w0 +** ret +*/ + +/* +** test_int16_2: +** bfi w0, w1, 16, 16 +** mov z0\.s, w0 +** ret +*/ + +/* +** test_int16_3: +** bfi w0, w2, 16, 16 +** bfi w1, w3, 16, 16 +** dup v31\.2s, w0 +** dup v30\.2s, w1 +** zip1 v31\.8h, v31\.8h, v30\.8h +** dup z0\.q, z31\.q\[0\] +** ret +*/ + +/* +** test_int16_4: +** uxth w0, w0 +** mov z0\.s, w0 +** ret +*/ + +/* +** test_int16_5: +** mov w1, 0 +** bfi w1, w0, 16, 16 +** mov z0\.s, w1 +** ret +*/ + +/* +** test_int16_6: +** uxth w0, w0 +** dup v31\.2s, w0 +** mov w0, 1 +** bfi w1, w0, 16, 16 +** dup v30\.2s, w1 +** zip1 v31\.8h, v31\.8h, v30\.8h +** dup z0\.q, z31\.q\[0\] +** ret +*/ + +/* +** test_int16_7: +** mov w2, 0 +** bfi w2, w0, 16, 16 +** mov w0, 65537 +** bfi w0, w1, 16, 16 +** dup v31\.2s, w2 +** dup v30\.2s, w0 +** zip1 v31\.8h, v31\.8h, v30\.8h +** dup z0\.q, z31\.q\[0\] +** ret +*/ + +/* +** test_int16_8: +** bfi w0, w1, 16, 16 +** movi v31\.2s, 0x1, lsl 16 +** dup v30\.2s, w0 +** zip1 v31\.8h, v31\.8h, v30\.8h +** dup z0\.q, z31\.q\[0\] +** ret +*/ + +/* +** test_float32_1: +** mov z0\.s, s0 +** ret +*/ + +/* +** test_float32_2: +** uzp1 v0\.2s, v0\.2s, v1\.2s +** mov z0\.d, d0 +** ret +*/ + +/* +** test_float32_3: +** fmov s0, s0 +** mov z0\.d, d0 +** ret +*/ + +/* +** test_float32_4: +** movi v31\.2s, #0 +** uzp1 v0\.2s, v31\.2s, v0\.2s +** mov z0\.d, d0 +** ret +*/ + +/* +** test_int32_1: +** mov z0\.s, w0 +** ret +*/ + +/* +** test_int32_2: +** fmov s0, w0 +** ins v0\.s\[1\], w1 +** mov z0\.d, d0 +** ret +*/ + +/* +** test_int32_3: +** fmov s0, w0 +** mov z0\.d, d0 +** ret +*/ + +/* +** test_int32_4: +** dup v30\.2s, w0 +** movi v31\.2s, 0 +** zip1 v31\.4s, v31\.4s, v30\.4s +** dup z0\.q, z31\.q\[0\] +** ret +*/ + +/* +** 
test_int64_1: +** mov z0\.d, x0 +** ret +*/ + +/* +** test_float64_1: +** mov z0\.d, d0 +** ret +*/ +
