https://gcc.gnu.org/g:44a31df54837adf2f7815e7966dfe8ac32eb8f3b
commit r17-896-g44a31df54837adf2f7815e7966dfe8ac32eb8f3b Author: Artemiy Volkov <[email protected]> Date: Mon May 18 10:21:18 2026 +0000 aarch64: introduce partial AdvSIMD vector modes In addition to V2HF that already exists, this patch adds 4 more partial (16- and 32-bit) AdvSIMD vector modes: V4QI, V2QI, V2HI, and V2BF. For now, these are intended only for duplication into full-sized (32-, 64-, and 128-bit) registers. As a minimal closure required to bootstrap the compiler, this also implements the "mov" expand and the "aarch64_simd_mov" insn_and_split for the new modes (gathered under the VSUB64 iterator). This patch also adds the new aarch64_advsimd_sub_dword_mode_p () helper to facilitate detecting the new modes; that is then used (a) to disable vec_perm_const vectorization for those modes, (b) in the "mov" expander for those modes, and (c) to define the new "Da" constraint. Some existing testcases were adjusted where needed. (The _Float16 testcase in sve/slp_1.c temporarily expects GPRs to be used for V2HF, which is corrected to FPRs by the succeeding patch; and the half-float complex tests now recognize some of the patterns, but check that V2BF still can't be used for vectorization.) gcc/ChangeLog: * config/aarch64/aarch64-modes.def (VECTOR_MODE): Remove V2HF. (VECTOR_MODES): Define V2QI, V4QI, V2HI, V2HF, V2BF. * config/aarch64/aarch64-protos.h (aarch64_advsimd_sub_dword_mode_p): Declare new predicate. * config/aarch64/aarch64-simd.md (*aarch64_simd_mov<mode>): New define_insn_and_split pattern. (mov<mode>): Add sub-64-bit vector modes to the VALL_F16 expander. Forego const vector expansion for those modes. * config/aarch64/aarch64.cc (aarch64_classify_vector_mode): Handle 16- and 32-bit vector modes. (aarch64_advsimd_sub_dword_mode_p): Define new predicate. (aarch64_vectorize_vec_perm_const): Refuse for partial vector modes. * config/aarch64/constraints.md (Da): New constraint. * config/aarch64/iterators.md (VSUB64): New iterator. 
(VALL_F16_SUB64): Likewise. (size): Define attribute for sub-64-bit vector modes. (VSC): New mode attribute. (vstype): Likewise. gcc/testsuite/ChangeLog: * gcc.dg/vect/complex/bb-slp-complex-add-half-float.c: Adjust testcase. * gcc.dg/vect/complex/bb-slp-complex-mla-half-float.c: Likewise. * gcc.dg/vect/complex/bb-slp-complex-mul-half-float.c: Likewise. * gcc.target/aarch64/sve/slp_1.c: Likewise. Diff: --- gcc/config/aarch64/aarch64-modes.def | 4 +- gcc/config/aarch64/aarch64-protos.h | 1 + gcc/config/aarch64/aarch64-simd.md | 64 +++++++++++++++++++++- gcc/config/aarch64/aarch64.cc | 18 ++++++ gcc/config/aarch64/constraints.md | 5 ++ gcc/config/aarch64/iterators.md | 19 ++++++- .../vect/complex/bb-slp-complex-add-half-float.c | 3 + .../vect/complex/bb-slp-complex-mla-half-float.c | 4 +- .../vect/complex/bb-slp-complex-mul-half-float.c | 7 ++- gcc/testsuite/gcc.target/aarch64/sve/slp_1.c | 11 ++-- 10 files changed, 123 insertions(+), 13 deletions(-) diff --git a/gcc/config/aarch64/aarch64-modes.def b/gcc/config/aarch64/aarch64-modes.def index d9bff61adec1..d5a54689f7aa 100644 --- a/gcc/config/aarch64/aarch64-modes.def +++ b/gcc/config/aarch64/aarch64-modes.def @@ -79,8 +79,10 @@ VECTOR_MODES (FLOAT, 8); /* V2SF. */ VECTOR_MODES (FLOAT, 16); /* V4SF V2DF. */ VECTOR_MODE (INT, DI, 1); /* V1DI. */ VECTOR_MODE (FLOAT, DF, 1); /* V1DF. */ -VECTOR_MODE (FLOAT, HF, 2); /* V2HF. */ +VECTOR_MODES (INT, 2); /* V2QI. */ +VECTOR_MODES (INT, 4); /* V4QI V2HI. */ +VECTOR_MODES (FLOAT, 4); /* V2BF V2HF. */ /* Integer vector modes used to represent intermediate widened values in some instructions. Not intended to be moved to and from registers or memory. 
*/ diff --git a/gcc/config/aarch64/aarch64-protos.h b/gcc/config/aarch64/aarch64-protos.h index 24da650da76f..513b556398fa 100644 --- a/gcc/config/aarch64/aarch64-protos.h +++ b/gcc/config/aarch64/aarch64-protos.h @@ -872,6 +872,7 @@ bool aarch64_and_bitmask_imm (unsigned HOST_WIDE_INT val_in, machine_mode mode); int aarch64_branch_cost (bool, bool); enum aarch64_symbol_type aarch64_classify_symbolic_expression (rtx); bool aarch64_advsimd_struct_mode_p (machine_mode mode); +bool aarch64_advsimd_sub_dword_mode_p (machine_mode mode); opt_machine_mode aarch64_v64_mode (scalar_mode); opt_machine_mode aarch64_v128_mode (scalar_mode); opt_machine_mode aarch64_full_sve_mode (scalar_mode); diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md index 7496da3a70c1..2b7f6b467c62 100644 --- a/gcc/config/aarch64/aarch64-simd.md +++ b/gcc/config/aarch64/aarch64-simd.md @@ -49,8 +49,8 @@ (define_subst_attr "vczbe" "add_vec_concat_subst_be" "" "_vec_concatz_be") (define_expand "mov<mode>" - [(set (match_operand:VALL_F16 0 "nonimmediate_operand") - (match_operand:VALL_F16 1 "general_operand"))] + [(set (match_operand:VALL_F16_SUB64 0 "nonimmediate_operand") + (match_operand:VALL_F16_SUB64 1 "general_operand"))] "TARGET_FLOAT" " /* Force the operand into a register if it is not an @@ -77,7 +77,8 @@ aarch64_expand_vector_init (operands[0], operands[1]); DONE; } - else if (!aarch64_simd_imm_zero (operands[1], <MODE>mode) + else if (!aarch64_advsimd_sub_dword_mode_p (<MODE>mode) + && !aarch64_simd_imm_zero (operands[1], <MODE>mode) && !aarch64_simd_special_constant_p (operands[1], <MODE>mode) && !aarch64_simd_valid_mov_imm (operands[1]) && !aarch64_const_vec_fmov_p (operands[1])) @@ -244,6 +245,63 @@ } ) +(define_insn_and_split "*aarch64_simd_mov<mode>" + [(set (match_operand:VSUB64 0 "nonimmediate_operand") + (match_operand:VSUB64 1 "general_operand"))] + "TARGET_FLOAT + && (register_operand (operands[0], <MODE>mode) + || aarch64_simd_reg_or_zero 
(operands[1], <MODE>mode) + || CONST_VECTOR_P (operands[1]))" + {@ [cons: =0, 1; attrs: type, arch] + [r , Dz ; mov_imm , * ] mov\t%w0, 0 + [r , rZ ; mov_reg , * ] mov\t%w0, %w1 + [r , Da ; mov_imm , * ] # + [r , w ; mov_reg , simd ] # + [r , m ; load_4 , * ] ldr<size>\t%w0, %1 + [w , w ; neon_logic , simd ] mov\t%0.8b, %1.8b + [w , m ; neon_load1_1reg , simd ] ldr\t%<vstype>0, %1 + [w , Dz ; neon_move , simd ] movi\t%0.2d, #0 + [m , rZ ; store_4 , * ] str<size>\t%w1, %0 + [m , w ; neon_store1_1reg , simd ] str\t%<vstype>1, %0 + } + "&& reload_completed + && REG_P (operands[0])" + [(const_int 0)] + { + if (CONST_VECTOR_P (operands[1])) + { + int elt_bitsize + = GET_MODE_BITSIZE (GET_MODE_INNER (GET_MODE (operands[1]))); + int n_elts = CONST_VECTOR_NUNITS (operands[1]).to_constant (); + int val = 0; + bool int_vector_p = CONST_INT_P (CONST_VECTOR_ELT (operands[1], 0)); + unsigned HOST_WIDE_INT eltval; + rtx elt; + for (int i = 0; i < n_elts; i++) + { + elt = CONST_VECTOR_ELT (operands[1], BYTES_BIG_ENDIAN + ? i + : n_elts - 1 - i); + if (int_vector_p) + eltval = INTVAL (elt); + else + { + bool res = aarch64_reinterpret_float_as_int (elt, &eltval); + gcc_assert (res); + } + + val = (val << elt_bitsize) + (eltval & ((1 << elt_bitsize) - 1)); + } + emit_move_insn (gen_rtx_REG (SImode, REGNO (operands[0])), + GEN_INT (val)); + } + else if (REG_P (operands[1])) + aarch64_simd_emit_reg_reg_move (operands, <VSC>mode, 1); + DONE; + } + [(set_attr "type" "mov_reg")] +) + ;; When storing lane zero we can use the normal STR and its more permissive ;; addressing modes. diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc index 40bbb92ed740..4ed24c869652 100644 --- a/gcc/config/aarch64/aarch64.cc +++ b/gcc/config/aarch64/aarch64.cc @@ -1778,6 +1778,13 @@ aarch64_classify_vector_mode (machine_mode mode, bool any_target_p = false) case E_V4x2DFmode: return (TARGET_FLOAT || any_target_p) ? VEC_ADVSIMD | VEC_STRUCT : 0; + /* 16-bit Advanced SIMD vectors. 
*/ + case E_V2QImode: + /* 32-bit Advanced SIMD vectors. */ + case E_V2HFmode: + case E_V2BFmode: + case E_V2HImode: + case E_V4QImode: /* 64-bit Advanced SIMD vectors. */ case E_V8QImode: case E_V4HImode: @@ -1856,6 +1863,14 @@ aarch64_advsimd_full_struct_mode_p (machine_mode mode) return (aarch64_classify_vector_mode (mode) == (VEC_ADVSIMD | VEC_STRUCT)); } +/* Return true if MODE is a partial (sub-64-bit) Advanced SIMD mode. */ +bool +aarch64_advsimd_sub_dword_mode_p (machine_mode mode) +{ + return (aarch64_classify_vector_mode (mode) == VEC_ADVSIMD) + && known_lt (GET_MODE_BITSIZE (mode), 64); +} + /* Return true if MODE is any of the data vector modes, including structure modes. */ static bool @@ -28415,6 +28430,9 @@ aarch64_vectorize_vec_perm_const (machine_mode vmode, machine_mode op_mode, { struct expand_vec_perm_d d; + if (aarch64_advsimd_sub_dword_mode_p (op_mode)) + return false; + /* Check whether the mask can be applied to a single vector. */ if (sel.ninputs () == 1 || (op0 && rtx_equal_p (op0, op1))) diff --git a/gcc/config/aarch64/constraints.md b/gcc/config/aarch64/constraints.md index 8760220835b7..829b2c949d07 100644 --- a/gcc/config/aarch64/constraints.md +++ b/gcc/config/aarch64/constraints.md @@ -531,6 +531,11 @@ (and (match_code "const_int") (match_test "aarch64_simd_scalar_immediate_valid_for_move (op, QImode)"))) +(define_constraint "Da" + "@internal + A constraint that matches all sub-64-bit AdvSIMD vectors." + (and (match_code "const_vector") + (match_test "aarch64_advsimd_sub_dword_mode_p (GET_MODE (op))"))) (define_constraint "Dt" "@internal diff --git a/gcc/config/aarch64/iterators.md b/gcc/config/aarch64/iterators.md index 39b1e84edcc2..dfca3327f1fa 100644 --- a/gcc/config/aarch64/iterators.md +++ b/gcc/config/aarch64/iterators.md @@ -227,10 +227,17 @@ ;; All Advanced SIMD integer modes (define_mode_iterator VALLI [VDQ_BHSI V2DI]) +;; All sub-64-bit vector modes. 
+(define_mode_iterator VSUB64 [V2QI V4QI V2HI V2HF V2BF]) + ;; All Advanced SIMD modes suitable for moving, loading, and storing. (define_mode_iterator VALL_F16 [V8QI V16QI V4HI V8HI V2SI V4SI V2DI V4HF V8HF V4BF V8BF V2SF V4SF V2DF]) +;; All Advanced SIMD modes suitable for moving, loading, and storing, +;; plus all sub-64-bit vector modes. +(define_mode_iterator VALL_F16_SUB64 [VALL_F16 VSUB64]) + ;; The VALL_F16 modes except the 128-bit 2-element ones. (define_mode_iterator VALL_F16_NO_V2Q [V8QI V16QI V4HI V8HI V2SI V4SI V4HF V8HF V2SF V4SF]) @@ -1466,7 +1473,9 @@ (define_mode_attr s [(HF "h") (SF "s") (DF "d") (SI "s") (DI "d")]) ;; Give the length suffix letter for a sign- or zero-extension. -(define_mode_attr size [(QI "b") (HI "h") (SI "w")]) +(define_mode_attr size [(QI "b") (HI "h") (SI "w") (HF "") (BF "") (SF "") + (V2QI "h") (V4QI "") (V2HI "") + (V2HF "") (V2BF "")]) ;; Give the number of bits in the mode (define_mode_attr sizen [(QI "8") (HI "16") (SI "32") (DI "64")]) @@ -1883,6 +1892,10 @@ (VNx4SI "v2si") (VNx4SF "v2sf") (VNx2DI "di") (VNx2DF "df")]) +;; Sub-64-bit vector mode to equivalent scalar mode. +(define_mode_attr VSC [(V4QI "SI") (V2QI "HI") + (V2HI "SI") (V2HF "SF") (V2BF "SF")]) + (define_mode_attr vnx [(V4SI "vnx4si") (V2DI "vnx2di")]) ;; 64-bit container modes the inner or scalar source mode. @@ -2169,6 +2182,10 @@ (V2SI "q") (V2SF "q") (DI "q") (DF "q")]) +;; Scalar size of a sub-64-bit vector mode. +(define_mode_attr vstype [(V4QI "s") (V2QI "h") + (V2HI "s") (V2BF "s") (V2HF "s")]) + ;; Define corresponding core/FP element mode for each vector mode. 
(define_mode_attr vw [(V8QI "w") (V16QI "w") (V4HI "w") (V8HI "w") diff --git a/gcc/testsuite/gcc.dg/vect/complex/bb-slp-complex-add-half-float.c b/gcc/testsuite/gcc.dg/vect/complex/bb-slp-complex-add-half-float.c index 3f1cce569558..2cd2d9112cc1 100644 --- a/gcc/testsuite/gcc.dg/vect/complex/bb-slp-complex-add-half-float.c +++ b/gcc/testsuite/gcc.dg/vect/complex/bb-slp-complex-add-half-float.c @@ -12,3 +12,6 @@ /* { dg-final { scan-tree-dump "add new stmt: \[^\n\r]*COMPLEX_ADD_ROT270" "slp1" { xfail *-*-* } } } */ /* { dg-final { scan-tree-dump "add new stmt: \[^\n\r]*COMPLEX_ADD_ROT90" "slp1" { xfail *-*-* } } } */ + +/* { dg-final { scan-tree-dump "Found COMPLEX_ADD_ROT90" "slp1" { xfail arm*-*-* } } } */ +/* { dg-final { scan-tree-dump "Found COMPLEX_ADD_ROT270" "slp1" { xfail arm*-*-* } } } */ diff --git a/gcc/testsuite/gcc.dg/vect/complex/bb-slp-complex-mla-half-float.c b/gcc/testsuite/gcc.dg/vect/complex/bb-slp-complex-mla-half-float.c index 33e500f3f4cd..e7a349b49c69 100644 --- a/gcc/testsuite/gcc.dg/vect/complex/bb-slp-complex-mla-half-float.c +++ b/gcc/testsuite/gcc.dg/vect/complex/bb-slp-complex-mla-half-float.c @@ -8,5 +8,7 @@ #define N 16 #include "complex-mla-template.c" +/* { dg-final { scan-tree-dump-times "add new stmt:\[^\n\r]*COMPLEX_FMA" 1 "slp1" { xfail *-*-* } } } */ + /* { dg-final { scan-tree-dump "Found COMPLEX_FMA_CONJ" "slp1" { xfail *-*-* } } } */ -/* { dg-final { scan-tree-dump "Found COMPLEX_FMA" "slp1" { xfail *-*-* } } } */ +/* { dg-final { scan-tree-dump "Found COMPLEX_FMA" "slp1" { xfail arm*-*-* } } } */ diff --git a/gcc/testsuite/gcc.dg/vect/complex/bb-slp-complex-mul-half-float.c b/gcc/testsuite/gcc.dg/vect/complex/bb-slp-complex-mul-half-float.c index 259dd6b2e067..06d08da41ad6 100644 --- a/gcc/testsuite/gcc.dg/vect/complex/bb-slp-complex-mul-half-float.c +++ b/gcc/testsuite/gcc.dg/vect/complex/bb-slp-complex-mul-half-float.c @@ -8,5 +8,8 @@ #define N 16 #include "complex-mul-template.c" -/* { dg-final { scan-tree-dump "Found 
COMPLEX_MUL_CONJ" "slp1" { xfail *-*-* } } } */ -/* { dg-final { scan-tree-dump "Found COMPLEX_MUL" "slp1" { xfail *-*-* } } } */ +/* { dg-final { scan-tree-dump-times "add new stmt:\[^\n\r]*COMPLEX_MUL_CONJ" 1 "slp1" { xfail *-*-* } } } */ +/* { dg-final { scan-tree-dump-times "add new stmt:\[^\n\r]*COMPLEX_MUL" 1 "slp1" { xfail *-*-* } } } */ + +/* { dg-final { scan-tree-dump "Found COMPLEX_MUL_CONJ" "slp1" { xfail arm*-*-* } } } */ +/* { dg-final { scan-tree-dump "Found COMPLEX_MUL" "slp1" { xfail arm*-*-* } } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/slp_1.c b/gcc/testsuite/gcc.target/aarch64/sve/slp_1.c index 07d71a63414b..739e63a96a1c 100644 --- a/gcc/testsuite/gcc.target/aarch64/sve/slp_1.c +++ b/gcc/testsuite/gcc.target/aarch64/sve/slp_1.c @@ -30,12 +30,14 @@ vec_slp_##TYPE (TYPE *restrict a, TYPE b, TYPE c, int n) \ TEST_ALL (VEC_PERM) /* We should use one DUP for each of the 8-, 16- and 32-bit types, - although we currently use LD1RW for _Float16. We should use two + (for now, insert both elements with umov + ins for _Float16). We should use two DUPs for each of the three 64-bit types. 
*/ /* { dg-final { scan-assembler-times {\tmov\tz[0-9]+\.h, [hw]} 2 } } */ -/* { dg-final { scan-assembler-times {\tmov\tz[0-9]+\.s, [sw]} 2 } } */ -/* { dg-final { scan-assembler-times {\tld1rw\tz[0-9]+\.s, } 1 } } */ +/* { dg-final { scan-assembler-times {\tmov\tz[0-9]+\.s, [sw]} 3 } } */ /* { dg-final { scan-assembler-times {\tmov\tz[0-9]+\.d, [dx]} 9 } } */ +/* { dg-final { scan-assembler-times {\tumov\tw[0-9]+, v[0-9]+\.h} 2 } } */ +/* { dg-final { scan-assembler-times {\tins\tv[0-9]+\.h\[0\], w[0-9]+} 3 } } */ +/* { dg-final { scan-assembler-times {\tins\tv[0-9]+\.h\[1\], w[0-9]+} 3 } } */ /* { dg-final { scan-assembler-times {\tzip1\tz[0-9]+\.d, z[0-9]+\.d, z[0-9]+\.d\n} 3 } } */ /* { dg-final { scan-assembler-not {\tzip2\t} } } */ @@ -53,7 +55,6 @@ TEST_ALL (VEC_PERM) /* { dg-final { scan-assembler-times {\twhilelo\tp[0-7]\.s} 6 } } */ /* { dg-final { scan-assembler-times {\twhilelo\tp[0-7]\.d} 6 } } */ /* { dg-final { scan-assembler-not {\tldr} } } */ -/* { dg-final { scan-assembler-times {\tstr} 2 } } */ -/* { dg-final { scan-assembler-times {\tstr\th[0-9]+} 2 } } */ +/* { dg-final { scan-assembler-not {\tstr} } } */ /* { dg-final { scan-assembler-not {\tuqdec} } } */
