https://gcc.gnu.org/g:4ddae2a94a032d77ef5564117e9906247a29a05f
commit r17-897-g4ddae2a94a032d77ef5564117e9906247a29a05f Author: Artemiy Volkov <[email protected]> Date: Thu Feb 26 09:01:30 2026 +0000 aarch64: initialize vectors from starting subsequence Now that we have 2- and 4-element vector modes for all the sub-word scalar modes, we can emit more efficient code when the elements of a vector constructor can be generated from a common starting subsequence whose length is a power of two. To do this, first detect the shortest possible starting subsequence by repeatedly folding the initial constructor element array in half, as long as the left and the right halves are equal. Then, once the subsequence has been emitted, duplicate it by generating a vec_duplicate with the correct source mode. On the MD side, this requires implementing the vec_duplicate optab to duplicate an arbitrary sub-128-bit value into a full 64- or 128-bit AdvSIMD register, as well as the vec_set insn for the VSUB64 modes (needed as a fallback for the divide-and-conquer approach). The latter uses a properly scaled and shifted "bfi" for integer values, and a properly indexed "ins" for FP elements. This change allows us to get rid of long chains of inserts and compile things like: int16x8_t f (int16_t x, int16_t y, int16_t z, int16_t w) { return (int16x8_t) {x, y, z, w, x, y, z, w}; } into: bfi w0, w2, 16, 16 bfi w1, w3, 16, 16 dup v31.2s, w0 dup v0.2s, w1 zip1 v0.8h, v31.8h, v0.8h ret rather than: dup v31.4h, w0 dup v0.4h, w1 ins v31.h[1], w2 ins v0.h[1], w3 ins v31.h[3], w2 ins v0.h[3], w3 zip1 v0.8h, v31.8h, v0.8h ret This patch also includes an extensive new test, which includes the above case, as well as adjustments to existing codegen tests as necessary. gcc/ChangeLog: * config/aarch64/aarch64-simd.md (*aarch64_simd_dup_subvector<vconq><mode>): New insn pattern. (*aarch64_simd_dup_subvector<vcond><mode>): Likewise. (@aarch64_simd_vec_set<mode>): Likewise. (vec_set<mode>): Handle 16- and 32-bit vector modes in the expander. 
* config/aarch64/aarch64.cc (aarch64_expand_vector_init_fallback): Add logic to initialize vector from starting subsequence. Make static. (scalar_move_insn_p): Consider sub-64-bit vector moves scalar. * config/aarch64/iterators.md (VDDUP): New iterator. (VQDUP): Likewise. (elem_bits): Define attribute for sub-64-bit vector modes. (Vetype): Likewise. (VEL): Likewise. (single_wx): Define attribute for sub-64-bit vector and scalar modes. (single_type): Likewise. (VCOND): Likewise. (VCONQ): Likewise. (Vqduptype): New mode attribute. (Vdduptype): Likewise. (vcond): Likewise. (vconq): Likewise. (vstype): Define attribute for 64-bit vector and sub-128-bit scalar modes. gcc/testsuite/ChangeLog: * gcc.target/aarch64/ldp_stp_16.c: Adjust testcase. * gcc.target/aarch64/sve/slp_1.c: Likewise. * gcc.target/aarch64/vec-init-18.c: Likewise. * gcc.target/aarch64/vec-init-23.c: New test. Diff: --- gcc/config/aarch64/aarch64-simd.md | 50 ++- gcc/config/aarch64/aarch64.cc | 42 ++- gcc/config/aarch64/iterators.md | 107 +++++- gcc/testsuite/gcc.target/aarch64/ldp_stp_16.c | 5 +- gcc/testsuite/gcc.target/aarch64/sve/slp_1.c | 7 +- gcc/testsuite/gcc.target/aarch64/vec-init-18.c | 8 +- gcc/testsuite/gcc.target/aarch64/vec-init-23.c | 435 +++++++++++++++++++++++++ 7 files changed, 632 insertions(+), 22 deletions(-) diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md index 2b7f6b467c62..b13a680119ea 100644 --- a/gcc/config/aarch64/aarch64-simd.md +++ b/gcc/config/aarch64/aarch64-simd.md @@ -137,6 +137,28 @@ } ) +(define_insn "*aarch64_simd_dup_subvector<vconq><mode>" + [(set (match_operand:<VCONQ> 0 "register_operand") + (vec_duplicate:<VCONQ> + (match_operand:VQDUP 1 "register_operand")))] + "TARGET_SIMD" + {@ [ cons: =0 , 1 ; attrs: type ] + [ w , w ; neon_dup_q ] dup\t%0.<Vqduptype>, %1.<vstype>[0] + [ w , r ; neon_from_gp_q ] dup\t%0.<Vqduptype>, %<single_wx>1 + } +) + +(define_insn "*aarch64_simd_dup_subvector<vcond><mode>" + [(set 
(match_operand:<VCOND> 0 "register_operand") + (vec_duplicate:<VCOND> + (match_operand:VDDUP 1 "register_operand")))] + "TARGET_SIMD" + {@ [ cons: =0 , 1 ; attrs: type ] + [ w , w ; neon_dup ] dup\t%0.<Vdduptype>, %1.<vstype>[0] + [ w , r ; neon_from_gp ] dup\t%0.<Vdduptype>, %<single_wx>1 + } +) + (define_insn "@aarch64_dup_lane<mode>" [(set (match_operand:VALL_F16 0 "register_operand" "=w") (vec_duplicate:VALL_F16 @@ -1291,6 +1313,32 @@ [(set_attr "type" "neon_ins<q>, neon_from_gp<q>, neon_load1_one_lane<q>")] ) +(define_insn "@aarch64_simd_vec_set<mode>" + [(set (match_operand:VSUB64 0 "register_operand" "=r,w,w") + (vec_merge:VSUB64 + (vec_duplicate:VSUB64 + (match_operand:<VEL> 1 "aarch64_simd_nonimmediate_operand" "r,w,Utv")) + (match_operand:VSUB64 3 "register_operand" "0,0,0") + (match_operand:SI 2 "immediate_operand" "i,i,i")))] + "TARGET_SIMD && exact_log2 (INTVAL (operands[2])) >= 0" + { + int elt = exact_log2 (INTVAL (operands[2])); + switch (which_alternative) + { + case 0: + operands[2] = GEN_INT (elt * <elem_bits>); + return "bfi\t%w0, %w1, %2, <elem_bits>"; + case 1: + return "ins\t%0.<Vetype>[%p2], %1.<Vetype>[0]"; + case 2: + return "ld1\t{%0.<Vetype>}[%p2], %1"; + default: + gcc_unreachable (); + } + } + [(set_attr "type" "bfm, neon_ins, neon_load1_one_lane")] +) + ;; Inserting from the zero register into a vector lane is treated as an ;; expensive GP->FP move on all CPUs. Avoid it when optimizing for speed. 
(define_insn "aarch64_simd_vec_set_zero<mode>" @@ -1720,7 +1768,7 @@ ) (define_expand "vec_set<mode>" - [(match_operand:VALL_F16 0 "register_operand") + [(match_operand:VALL_F16_SUB64 0 "register_operand") (match_operand:<VEL> 1 "aarch64_simd_nonimmediate_operand") (match_operand:SI 2 "immediate_operand")] "TARGET_SIMD" diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc index 4ed24c869652..889b774c00fb 100644 --- a/gcc/config/aarch64/aarch64.cc +++ b/gcc/config/aarch64/aarch64.cc @@ -25658,7 +25658,7 @@ aarch64_choose_vector_init_constant (machine_mode mode, rtx vals) The caller has already tried a divide-and-conquer approach, so do not consider that case here. */ -void +static void aarch64_expand_vector_init_fallback (rtx target, rtx vals) { machine_mode mode = GET_MODE (target); @@ -25716,6 +25716,43 @@ aarch64_expand_vector_init_fallback (rtx target, rtx vals) return; } + /* Check if the vector can be represented as a duplicate of a + subvector starting at index 0. 
*/ + if (pow2p_hwi (n_elts)) + { + bool halves_equal = true; + int n_seq = n_elts; + while (n_seq > 2) + { + for (int i = 0; i < n_seq / 2; i++) + if (!rtx_equal_p (XVECEXP (vals, 0, i), + XVECEXP (vals, 0, i + n_seq / 2))) + { + halves_equal = false; + break; + } + + if (!halves_equal) + break; + + n_seq /= 2; + } + + if (n_seq != n_elts) + { + machine_mode subv_mode = mode_for_vector (inner_mode, + n_seq).require (); + rtx new_target = gen_reg_rtx (subv_mode); + rtvec new_vals = rtvec_alloc (n_seq); + for (int i = 0; i < n_seq; i++) + RTVEC_ELT (new_vals, i) = XVECEXP (vals, 0, i); + aarch64_expand_vector_init (new_target, + gen_rtx_PARALLEL (subv_mode, new_vals)); + aarch64_emit_move (target, gen_vec_duplicate (mode, new_target)); + return; + } + } + enum insn_code icode = optab_handler (vec_set_optab, mode); gcc_assert (icode != CODE_FOR_nothing); @@ -25875,7 +25912,8 @@ scalar_move_insn_p (rtx set) rtx src = SET_SRC (set); rtx dest = SET_DEST (set); return (is_a<scalar_mode> (GET_MODE (dest)) - && aarch64_mov_operand (src, GET_MODE (dest))); + && aarch64_mov_operand (src, GET_MODE (dest))) + || aarch64_advsimd_sub_dword_mode_p (GET_MODE (dest)); } /* Similar to seq_cost, but ignore cost for scalar moves. */ diff --git a/gcc/config/aarch64/iterators.md b/gcc/config/aarch64/iterators.md index dfca3327f1fa..f3e7b9d58f37 100644 --- a/gcc/config/aarch64/iterators.md +++ b/gcc/config/aarch64/iterators.md @@ -139,6 +139,14 @@ ;; VQMOV without 2-element modes. (define_mode_iterator VQMOV_NO2E [V16QI V8HI V4SI V8HF V8BF V4SF]) +;; Modes that can be duplicated into a 64-bit register. +(define_mode_iterator VDDUP [V4QI V2QI QI V2HI HI SI + V2BF BF V2HF HF SF]) + +;; Modes that can be duplicated into a 128-bit register. +(define_mode_iterator VQDUP [V8QI V4QI V2QI QI V4HI V2HI HI V2SI SI DI + V4BF V2BF BF V4HF V2HF HF V2SF SF DF]) + ;; Double integer vector modes. 
(define_mode_iterator VD_I [V8QI V4HI V2SI DI]) @@ -1488,7 +1496,9 @@ ;; The number of bits in a vector element, or controlled by a predicate ;; element. -(define_mode_attr elem_bits [(VNx16BI "8") (VNx8BI "16") +(define_mode_attr elem_bits [(V2QI "8") (V4QI "8") (V2HF "16") (V2HI "16") + (V2BF "16") + (VNx16BI "8") (VNx8BI "16") (VNx4BI "32") (VNx2BI "64") (VNx16QI "8") (VNx32QI "8") (VNx64QI "8") (VNx8HI "16") (VNx16HI "16") (VNx32HI "16") @@ -1593,11 +1603,12 @@ ;; Mode-to-individual element type mapping. (define_mode_attr Vetype [(V8QI "b") (V16QI "b") - (V4HI "h") (V8HI "h") + (V2QI "b") (V4QI "b") + (V4HI "h") (V8HI "h") (V2HI "h") (V2SI "s") (V4SI "s") (V2DI "d") (V1DI "d") - (V4HF "h") (V8HF "h") - (V2SF "s") (V4SF "s") + (V4HF "h") (V8HF "h") (V2HF "h") + (V2SF "s") (V4SF "s") (V2BF "h") (V2DF "d") (V1DF "d") (V2x8QI "b") (V2x4HI "h") (V2x2SI "s") (V2x1DI "d") @@ -1772,8 +1783,10 @@ (V4x2DF "v2df") (V4x8BF "v8bf")]) ;; Define element mode for each vector mode. -(define_mode_attr VEL [(V8QI "QI") (V16QI "QI") +(define_mode_attr VEL [(V8QI "QI") (V16QI "QI") + (V2QI "QI") (V4QI "QI") (V4HI "HI") (V8HI "HI") + (V2HI "HI") (V2HF "HF") (V2SI "SI") (V4SI "SI") (DI "DI") (V1DI "DI") (V2DI "DI") @@ -1784,6 +1797,7 @@ (SI "SI") (HI "HI") (QI "QI") (V4BF "BF") (V8BF "BF") + (V2BF "BF") (V2x8QI "QI") (V2x4HI "HI") (V2x2SI "SI") (V2x1DI "DI") (V2x4HF "HF") (V2x2SF "SF") @@ -1900,25 +1914,66 @@ ;; 64-bit container modes the inner or scalar source mode. (define_mode_attr VCOND [(HI "V4HI") (SI "V2SI") + (V2HI "V4HI") (V4HI "V4HI") (V8HI "V4HI") (V2SI "V2SI") (V4SI "V2SI") + (QI "V8QI") (V2QI "V8QI") + (V4QI "V8QI") (DI "DI") (V2DI "DI") + (HF "V4HF") (V2HF "V4HF") (V4HF "V4HF") (V8HF "V4HF") + (BF "V4BF") (V2BF "V4BF") + (SF "V2SF") (V2SF "V2SF") (V4SF "V2SF") (V2DF "DF")]) +;; Same as above, but in lowercase. 
+(define_mode_attr vcond [(HI "v4hi") (SI "v2si") + (V2HI "v4hi") + (V4HI "v4hi") (V8HI "v4hi") + (V2SI "v2si") (V4SI "v2si") + (QI "v8qi") (V2QI "v8qi") + (V4QI "v8qi") + (DI "di") (V2DI "di") + (HF "v4hf") (V2HF "v4hf") + (V4HF "v4hf") (V8HF "v4hf") + (BF "v4bf") (V2BF "v4bf") + (SF "v2sf") + (V2SF "v2sf") (V4SF "v2sf") + (V2DF "df")]) + ;; 128-bit container modes the inner or scalar source mode. (define_mode_attr VCONQ [(V8QI "V16QI") (V16QI "V16QI") + (V4QI "V16QI") (V2QI "V16QI") (V4HI "V8HI") (V8HI "V8HI") + (V2HI "V8HI") (V2SI "V4SI") (V4SI "V4SI") (DI "V2DI") (V2DI "V2DI") (V4HF "V8HF") (V8HF "V8HF") + (V2HF "V8HF") (HF "V8HF") (V4BF "V8BF") (V8BF "V8BF") + (V2BF "V8BF") (BF "V8BF") (V2SF "V4SF") (V4SF "V4SF") (V2DF "V2DF") (SI "V4SI") (HI "V8HI") (QI "V16QI") (SF "V4SF") (DF "V2DF")]) +;; Same as above, but in lowercase. +(define_mode_attr vconq [(V8QI "v16qi") (V16QI "v16qi") + (V4QI "v16qi") (V2QI "v16qi") + (V4HI "v8hi") (V8HI "v8hi") + (V2HI "v8hi") + (V2SI "v4si") (V4SI "v4si") + (DI "v2di") (V2DI "v2di") + (V4HF "v8hf") (V8HF "v8hf") + (V2HF "v8hf") (HF "v8hf") + (V4BF "v8bf") (V8BF "v8bf") + (V2BF "v8bf") (BF "v8bf") + (V2SF "v4sf") (V4SF "v4sf") + (V2DF "v2df") (SI "v4si") + (HI "v8hi") (QI "v16qi") + (SF "v4sf") (DF "v2df")]) + ;; Half modes of all vector modes. (define_mode_attr VHALF [(V8QI "V4QI") (V16QI "V8QI") (V4HI "V2HI") (V8HI "V4HI") @@ -2037,6 +2092,26 @@ (define_mode_attr V2ntype [(V8HI "16b") (V4SI "8h") (V2DI "4s")]) +;; Register suffix used when duplicating a value of a certain mode +;; into a full 128-bit AdvSIMD register. +(define_mode_attr Vqduptype [(QI "16b") (V2QI "8h") (V4QI "4s") (V8QI "2d") + (HI "8h") (V2HI "4s") (V4HI "2d") + (HF "8h") (V2HF "4s") (V4HF "2d") + (BF "8h") (V2BF "4s") (V4BF "2d") + (SI "4s") (V2SI "2d") + (SF "4s") (V2SF "2d") + (DI "2d") (DF "2d")]) + +;; Register suffix used when duplicating a value of a certain mode +;; into a partial 64-bit AdvSIMD register. 
+(define_mode_attr Vdduptype [(QI "8b") (V2QI "4h") (V4QI "2s") (V8QI "") + (HI "4h") (V2HI "2s") (V4HI "") + (HF "4h") (V2HF "2s") (V4HF "") + (BF "4h") (V2BF "2s") (V4BF "") + (SI "2s") (V2SI "") + (SF "2s") (V2SF "") + (DI "") (DF "")]) + ;; The result of FCVTN on two vectors of the given mode. The result has ;; twice as many QI elements as the input. (define_mode_attr VPACKB [(V4HF "V8QI") (V8HF "V16QI") (V4SF "V8QI")]) @@ -2161,8 +2236,13 @@ ;; Whether a mode fits in W or X registers (i.e. "w" for 32-bit modes ;; and "x" for 64-bit modes). (define_mode_attr single_wx [(SI "w") (SF "w") + (V2QI "w") (V4QI "w") (V8QI "x") (V4HI "x") (V4HF "x") (V4BF "x") + (V2HI "w") (V2HF "w") + (HF "w") (QI "w") + (V2BF "w") (BF "w") + (HI "w") (V2SI "x") (V2SF "x") (DI "x") (DF "x")]) @@ -2172,7 +2252,12 @@ (V8QI "d") (V4HI "d") (V4HF "d") (V4BF "d") (V2SI "d") (V2SF "d") - (DI "d") (DF "d")]) + (DI "d") (DF "d") + (QI "b") (BF "h") + (V2HF "s") (HI "h") + (V4QI "s") (V2QI "h") + (V2HI "s") (V2BF "s") + (HF "h")]) ;; Whether a double-width mode fits in D or Q registers (i.e. "d" for ;; 32-bit modes and "q" for 64-bit modes). @@ -2182,9 +2267,13 @@ (V2SI "q") (V2SF "q") (DI "q") (DF "q")]) -;; Scalar size of a sub-64-bit vector mode. -(define_mode_attr vstype [(V4QI "s") (V2QI "h") - (V2HI "s") (V2BF "s") (V2HF "s")]) +;; Scalar size of a sub-128-bit vector or scalar mode. +(define_mode_attr vstype [(V8QI "d") (V4QI "s") (V2QI "h") (QI "b") + (V4HI "d") (V2HI "s") (HI "h") + (V2SI "d") (SI "s") (DI "d") + (V4BF "d") (V2BF "s") (BF "h") + (V4HF "d") (V2HF "s") (HF "h") + (V2SF "d") (SF "s") (DF "d")]) ;; Define corresponding core/FP element mode for each vector mode. 
(define_mode_attr vw [(V8QI "w") (V16QI "w") diff --git a/gcc/testsuite/gcc.target/aarch64/ldp_stp_16.c b/gcc/testsuite/gcc.target/aarch64/ldp_stp_16.c index 95835aa2eb41..a6b4d50f34fa 100644 --- a/gcc/testsuite/gcc.target/aarch64/ldp_stp_16.c +++ b/gcc/testsuite/gcc.target/aarch64/ldp_stp_16.c @@ -96,9 +96,8 @@ CONS2_FN (4, float); /* ** cons2_8_float: -** dup v[0-9]+\.2s, v[0-9]+\.s\[0\] -** dup v[0-9]+\.2s, v[0-9]+\.s\[0\] -** zip1 v([0-9]+)\.4s, v[0-9]+\.4s, v[0-9]+\.4s +** uzp1 v1\.2s, v0\.2s, v1\.2s +** dup v([0-9]+)\.2d, v1\.d\[0\] ** stp q\1, q\1, \[x0\] ** stp q\1, q\1, \[x0, #?32\] ** ret diff --git a/gcc/testsuite/gcc.target/aarch64/sve/slp_1.c b/gcc/testsuite/gcc.target/aarch64/sve/slp_1.c index 739e63a96a1c..ddf4c23869f7 100644 --- a/gcc/testsuite/gcc.target/aarch64/sve/slp_1.c +++ b/gcc/testsuite/gcc.target/aarch64/sve/slp_1.c @@ -30,14 +30,13 @@ vec_slp_##TYPE (TYPE *restrict a, TYPE b, TYPE c, int n) \ TEST_ALL (VEC_PERM) /* We should use one DUP for each of the 8-, 16- and 32-bit types, - (for now, insert both elements with umov + ins for _Float16). We should use two + (for now, insert both elements with ins for _Float16). We should use two DUPs for each of the three 64-bit types. 
*/ /* { dg-final { scan-assembler-times {\tmov\tz[0-9]+\.h, [hw]} 2 } } */ /* { dg-final { scan-assembler-times {\tmov\tz[0-9]+\.s, [sw]} 3 } } */ /* { dg-final { scan-assembler-times {\tmov\tz[0-9]+\.d, [dx]} 9 } } */ -/* { dg-final { scan-assembler-times {\tumov\tw[0-9]+, v[0-9]+\.h} 2 } } */ -/* { dg-final { scan-assembler-times {\tins\tv[0-9]+\.h\[0\], w[0-9]+} 3 } } */ -/* { dg-final { scan-assembler-times {\tins\tv[0-9]+\.h\[1\], w[0-9]+} 3 } } */ +/* { dg-final { scan-assembler-times {\tins\tv[0-9]+\.h\[0\], v[0-9]+\.h\[0\]} 3 } } */ +/* { dg-final { scan-assembler-times {\tins\tv[0-9]+\.h\[1\], v[0-9]+\.h\[0\]} 3 } } */ /* { dg-final { scan-assembler-times {\tzip1\tz[0-9]+\.d, z[0-9]+\.d, z[0-9]+\.d\n} 3 } } */ /* { dg-final { scan-assembler-not {\tzip2\t} } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/vec-init-18.c b/gcc/testsuite/gcc.target/aarch64/vec-init-18.c index ecb59fe510b6..99e84096708d 100644 --- a/gcc/testsuite/gcc.target/aarch64/vec-init-18.c +++ b/gcc/testsuite/gcc.target/aarch64/vec-init-18.c @@ -15,6 +15,8 @@ int16x8_t foo2(int16_t x) return v; } -/* { dg-final { scan-assembler-times {\tdup\tv[0-9]+\.4h, w[0-9]+} 3 } } */ -/* { dg-final { scan-assembler {\tmovi\tv[0-9]+\.4h, 0x1} } } */ -/* { dg-final { scan-assembler-times {\tzip1\tv[0-9]+\.8h, v[0-9]+\.8h, v[0-9]+\.8h} 2 } } */ +/* { dg-final { scan-assembler-times {\tdup\tv[0-9]+\.4s, v[0-9]+\.s\[0\]} 1 } } */ +/* { dg-final { scan-assembler-times {\tdup\tv[0-9]+\.4s, w[0-9]+} 1 } } */ +/* { dg-final { scan-assembler-times {\tmov\tw[0-9]+, 65537} 1 } } */ +/* { dg-final { scan-assembler-times {\tbfi\tw[0-9]+, w[0-9]+, 0, 16} 1 } } */ +/* { dg-final { scan-assembler-times {\tbfi\tw[0-9]+, w[0-9]+, 16, 16} 1 } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/vec-init-23.c b/gcc/testsuite/gcc.target/aarch64/vec-init-23.c new file mode 100644 index 000000000000..940fe34c3251 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/vec-init-23.c @@ -0,0 +1,435 @@ +/* { dg-do compile } */ 
+/* { dg-options "-O2 -march=armv8.2-a+fp16" } */ +/* { dg-final { check-function-bodies "**" "" "" } } */ + +#include <arm_neon.h> + +/* Check vector initialization with a repeating sequence of elements. */ + +#ifndef TESTCASE +#define TESTCASE(TYPE, ETYPE, T, SZ, NUM, MULT, ...)\ + TYPE##SZ##MULT##_t test_##TYPE##SZ##_##NUM (ETYPE x0, ETYPE x1, ETYPE x2, ETYPE x3,\ + ETYPE x4, ETYPE x5, ETYPE x6, ETYPE x7)\ + {\ + return (TYPE##SZ##MULT##_t) {__VA_ARGS__};\ + } +#endif + +#define TEST_8(TYPE, ETYPE, T)\ + TESTCASE (TYPE, ETYPE, T, 8, 1, x16, x0, x0, x0, x0, x0, x0, x0, x0,\ + x0, x0, x0, x0, x0, x0, x0, x0)\ + TESTCASE (TYPE, ETYPE, T, 8, 2, x16, x0, x1, x0, x1, x0, x1, x0, x1,\ + x0, x1, x0, x1, x0, x1, x0, x1)\ + TESTCASE (TYPE, ETYPE, T, 8, 3, x16, x0, x1, x2, x3, x0, x1, x2, x3,\ + x0, x1, x2, x3, x0, x1, x2, x3)\ + TESTCASE (TYPE, ETYPE, T, 8, 4, x16, x0, x1, x2, x3, x4, x5, x6, x7,\ + x0, x1, x2, x3, x4, x5, x6, x7)\ + TESTCASE (TYPE, ETYPE, T, 8, 5, x16, x0, 0, x0, 0, x0, 0, x0, 0,\ + x0, 0, x0, 0, x0, 0, x0, 0)\ + TESTCASE (TYPE, ETYPE, T, 8, 6, x16, 0, x0, 0, x0, 0, x0, 0, x0,\ + 0, x0, 0, x0, 0, x0, 0, x0)\ + TESTCASE (TYPE, ETYPE, T, 8, 7, x16, x0, x1, 0, 1, x0, x1, 0, 1,\ + x0, x1, 0, 1, x0, x1, 0, 1)\ + TESTCASE (TYPE, ETYPE, T, 8, 8, x16, 0, 1, x0, x1, 0, 1, x0, x1,\ + 0, 1, x0, x1, 0, 1, x0, x1)\ + TESTCASE (TYPE, ETYPE, T, 8, 9, x16, x0, 0, x1, 1, x0, 0, x1, 1,\ + x0, 0, x1, 1, x0, 0, x1, 1)\ + TESTCASE (TYPE, ETYPE, T, 8, 10, x16, x0, 0, x1, 1, x2, 2, x3, 3,\ + x0, 0, x1, 1, x2, 2, x3, 3)\ + TESTCASE (TYPE, ETYPE, T, 8, 11, x16, 0, x0, 1, x1, 2, x2, 3, x3,\ + 0, x0, 1, x1, 2, x2, 3, x3)\ + TESTCASE (TYPE, ETYPE, T, 8, 12, x16, x0, x1, 0, 1, x2, x3, 2, 3,\ + x0, x1, 0, 1, x2, x3, 2, 3)\ + TESTCASE (TYPE, ETYPE, T, 8, 13, x16, 0, 1, x0, x1, 2, 3, x2, x3,\ + 0, 1, x0, x1, 2, 3, x2, x3) + +#define TEST_16(TYPE, ETYPE, T)\ + TESTCASE (TYPE, ETYPE, T, 16, 1, x8, x0, x0, x0, x0, x0, x0, x0, x0)\ + TESTCASE (TYPE, ETYPE, T, 16, 2, x8, x0, x1, x0, x1, x0, 
x1, x0, x1)\ + TESTCASE (TYPE, ETYPE, T, 16, 3, x8, x0, x1, x2, x3, x0, x1, x2, x3)\ + TESTCASE (TYPE, ETYPE, T, 16, 4, x8, x0, 0, x0, 0, x0, 0, x0, 0)\ + TESTCASE (TYPE, ETYPE, T, 16, 5, x8, 0, x0, 0, x0, 0, x0, 0, x0)\ + TESTCASE (TYPE, ETYPE, T, 16, 6, x8, x0, x1, 0, 1, x0, x1, 0, 1)\ + TESTCASE (TYPE, ETYPE, T, 16, 7, x8, 0, 1, x0, x1, 0, 1, x0, x1)\ + TESTCASE (TYPE, ETYPE, T, 16, 8, x8, 0, x0, 1, x1, 0, x0, 1, x1)\ + +#define TEST_32(TYPE, ETYPE, T)\ + TESTCASE (TYPE, ETYPE, T, 32, 1, x4, x0, x0, x0, x0)\ + TESTCASE (TYPE, ETYPE, T, 32, 2, x4, x0, x1, x0, x1)\ + TESTCASE (TYPE, ETYPE, T, 32, 3, x4, x0, 0, x0, 0)\ + TESTCASE (TYPE, ETYPE, T, 32, 4, x4, 0, x0, 0, x0) + +#define TEST_64(TYPE, ETYPE, T)\ + TESTCASE (TYPE, ETYPE, T, 64, 1, x2, x0, x0) + +TEST_8(int, int8_t, s) + +TEST_16(float, float, f) +TEST_16(int, int16_t, s) + +TEST_32(float, float, f) +TEST_32(int, int32_t, s) + +TEST_64(float, double, f) +TEST_64(int, int64_t, s) + +/* +** test_int8_1: +** dup v0\.16b, w0 +** ret +*/ + +/* +** test_int8_2: +** bfi w0, w1, 8, 8 +** dup v0\.8h, w0 +** ret +*/ + +/* +** test_int8_3: +** bfi w0, w1, 8, 8 +** bfi w0, w2, 16, 8 +** bfi w0, w3, 24, 8 +** dup v0\.4s, w0 +** ret +*/ + +/* +** test_int8_4: +** bfi w0, w2, 8, 8 +** bfi w1, w3, 8, 8 +** bfi w0, w4, 16, 8 +** bfi w1, w5, 16, 8 +** bfi w0, w6, 24, 8 +** bfi w1, w7, 24, 8 +** dup v31\.2s, w0 +** dup v0\.2s, w1 +** zip1 v0\.16b, v31\.16b, v0\.16b +** ret +*/ + +/* +** test_int8_5: +** mov w1, 0 +** bfi w1, w0, 0, 8 +** dup v0\.8h, w1 +** ret +*/ + +/* +** test_int8_6: +** mov w1, 0 +** bfi w1, w0, 8, 8 +** dup v0\.8h, w1 +** ret +*/ + +/* +** test_int8_7: +** mov w2, 16777472 +** bfi w2, w0, 0, 8 +** bfi w2, w1, 8, 8 +** dup v0\.4s, w2 +** ret +*/ + +/* +** test_int8_8: +** mov w2, 16777472 +** bfi w2, w0, 16, 8 +** bfi w2, w1, 24, 8 +** dup v0\.4s, w2 +** ret +*/ + +/* +** test_int8_9: +** mov w2, 16777216 +** bfi w2, w0, 0, 8 +** bfi w2, w1, 16, 8 +** dup v0\.4s, w2 +** ret +*/ + +/* +** test_int8_10: 
+** bfi w0, w1, 8, 8 +** bfi w0, w2, 16, 8 +** bfi w0, w3, 24, 8 +** dup v31\.2s, w0 +** adrp x0, .LANCHOR[0-9]+ +** ldr d0, \[x0, #:lo12:.LANCHOR[0-9]+\] +** zip1 v0\.16b, v31\.16b, v0\.16b +** ret +*/ + +/* +** test_int8_11: +** bfi w0, w1, 8, 8 +** adrp x4, .LANCHOR[0-9]+ +** bfi w0, w2, 16, 8 +** ldr d0, \[x4, #:lo12:\.LANCHOR[0-9]+\] +** bfi w0, w3, 24, 8 +** dup v31\.2s, w0 +** zip1 v0\.16b, v0\.16b, v31\.16b +** ret +*/ + +/* +** test_int8_12: +** mov w4, 33685504 +** bfi w4, w0, 0, 8 +** mov w0, 257 +** movk w0, 0x303, lsl 16 +** bfi w0, w1, 0, 8 +** bfi w4, w2, 16, 8 +** bfi w0, w3, 16, 8 +** dup v31\.2s, w4 +** dup v0\.2s, w0 +** zip1 v0\.16b, v31\.16b, v0\.16b +** ret +*/ + +/* +** test_int8_13: +** mov w4, 33685504 +** bfi w4, w0, 8, 8 +** mov w0, 257 +** movk w0, 0x303, lsl 16 +** bfi w0, w1, 8, 8 +** bfi w4, w2, 24, 8 +** bfi w0, w3, 24, 8 +** dup v31\.2s, w4 +** dup v0\.2s, w0 +** zip1 v0\.16b, v31\.16b, v0\.16b +** ret +*/ + +/* +** test_float16_1: +** fcvt h0, s0 +** dup v0\.8h, v0\.h\[0\] +** ret +*/ + +/* +** test_float16_2: +** fcvt h1, s1 +** fcvt h0, s0 +** ins v0\.h\[1\], v1\.h\[0\] +** dup v0\.4s, v0\.s\[0\] +** ret +*/ + +/* +** test_float16_3: +** uzp1 v2\.2s, v0\.2s, v2\.2s +** uzp1 v3\.2s, v1\.2s, v3\.2s +** zip1 v3\.4s, v2\.4s, v3\.4s +** fcvtn v0\.4h, v3\.4s +** uzp1 v0\.2d, v0\.2d, v0\.2d +** ret +*/ + +/* +** test_float16_4: +** fcvt h0, s0 +** movi v31\.2d, #0 +** ins v31\.h\[0\], v0\.h\[0\] +** dup v0\.4s, v31\.s\[0\] +** ret +*/ + +/* +** test_float16_5: +** fcvt h0, s0 +** movi v31\.2d, #0 +** ins v31\.h\[1\], v0\.h\[0\] +** dup v0\.4s, v31\.s\[0\] +** ret +*/ + +/* +** test_float16_6: +** fcvt h1, s1 +** fcvt h0, s0 +** movi v31\.2d, #0 +** mov w0, 1006648320 +** umov w1, v1\.h\[0\] +** ins v31\.h\[0\], v0\.h\[0\] +** bfi w0, w1, 0, 16 +** dup v31\.2s, v31\.s\[0\] +** dup v0\.2s, w0 +** zip1 v0\.8h, v31\.8h, v0\.8h +** ret +*/ + +/* +** test_float16_7: +** fcvt h1, s1 +** fcvt h0, s0 +** movi v31\.2d, #0 +** mov w0, 1006648320 
+** umov w1, v1\.h\[0\] +** ins v31\.h\[1\], v0\.h\[0\] +** bfi w0, w1, 16, 16 +** dup v31\.2s, v31\.s\[0\] +** dup v0\.2s, w0 +** zip1 v0\.8h, v31\.8h, v0\.8h +** ret +*/ + +/* +** test_float16_8: +** fcvt h1, s1 +** fcvt h0, s0 +** movi v31\.2s, 0x3c, lsl 24 +** ins v0\.h\[1\], v1\.h\[0\] +** dup v0\.2s, v0\.s\[0\] +** zip1 v0\.8h, v31\.8h, v0\.8h +** ret +*/ + +/* +** test_int16_1: +** dup v0\.8h, w0 +** ret +*/ + +/* +** test_int16_2: +** bfi w0, w1, 16, 16 +** dup v0\.4s, w0 +** ret +*/ + +/* +** test_int16_3: +** bfi w0, w2, 16, 16 +** bfi w1, w3, 16, 16 +** dup v31\.2s, w0 +** dup v0\.2s, w1 +** zip1 v0\.8h, v31\.8h, v0\.8h +** ret +*/ + +/* +** test_int16_4: +** mov w1, 0 +** bfi w1, w0, 0, 16 +** dup v0\.4s, w1 +** ret +*/ + +/* +** test_int16_5: +** mov w1, 0 +** bfi w1, w0, 16, 16 +** dup v0\.4s, w1 +** ret +*/ + +/* +** test_int16_6: +** mov w2, 0 +** bfi w2, w0, 0, 16 +** mov w0, 65537 +** bfi w0, w1, 0, 16 +** dup v31\.2s, w2 +** dup v0\.2s, w0 +** zip1 v0\.8h, v31\.8h, v0\.8h +** ret +*/ + +/* +** test_int16_7: +** mov w2, 0 +** bfi w2, w0, 16, 16 +** mov w0, 65537 +** bfi w0, w1, 16, 16 +** dup v31\.2s, w2 +** dup v0\.2s, w0 +** zip1 v0\.8h, v31\.8h, v0\.8h +** ret +*/ + +/* +** test_int16_8: +** bfi w0, w1, 16, 16 +** movi v0\.2s, 0x1, lsl 16 +** dup v31\.2s, w0 +** zip1 v0\.8h, v0\.8h, v31\.8h +** ret +*/ + +/* +** test_float32_1: +** dup v0\.4s, v0\.s\[0\] +** ret +*/ + +/* +** test_float32_2: +** uzp1 v0\.2s, v0\.2s, v1\.2s +** dup v0\.2d, v0\.d\[0\] +** ret +*/ + +/* +** test_float32_3: +** movi v31\.2s, 0 +** dup v0\.2s, v0\.s\[0\] +** zip1 v0\.4s, v0\.4s, v31\.4s +** ret +*/ + +/* +** test_float32_4: +** movi v31\.2s, 0 +** dup v0\.2s, v0\.s\[0\] +** zip1 v0\.4s, v31\.4s, v0\.4s +** ret +*/ + +/* +** test_int32_1: +** dup v0\.4s, w0 +** ret +*/ + +/* +** test_int32_2: +** fmov s0, w0 +** ins v0\.s\[1\], w1 +** dup v0\.2d, v0\.d\[0\] +** ret +*/ + +/* +** test_int32_3: +** dup v31\.2s, w0 +** movi v0\.2s, 0 +** zip1 v0\.4s, v31\.4s, v0\.4s +** 
ret +*/ + +/* +** test_int32_4: +** dup v31\.2s, w0 +** movi v0\.2s, 0 +** zip1 v0\.4s, v0\.4s, v31\.4s +** ret +*/ + +/* +** test_float64_1: +** dup v0\.2d, v0\.d\[0\] +** ret +*/ + +/* +** test_int64_1: +** dup v0\.2d, x0 +** ret +*/
