https://gcc.gnu.org/g:920eeb67a3537b024521f21f983be0e249faa5ea
commit r17-898-g920eeb67a3537b024521f21f983be0e249faa5ea Author: Artemiy Volkov <[email protected]> Date: Thu Feb 26 08:45:08 2026 +0000 aarch64: implement vec_concat support for sub-64-bit types This patch improves handling of 2-element vec_concats in aarch64_expand_vector_init_fallback (): where previously the aarch64_vec_concat insn was emitted only for pairs of vectors, we now allow scalar operands as well. Furthermore, if the two operands are the same, we can now emit a vec_duplicate instead of a vec_concat, leading to better code generation. This is backed by the new combine{z,_internal}{,_be} insn patterns, which were each split between integral 16- and 32-bit modes (only involving GPRs and memory), and the rest (requiring the "w" alternatives as well). The effect of the changes is illustrated by the changes to vec-init-23.c, introduced in the previous patch (and a handful of other vector-init related tests). gcc/ChangeLog: * config/aarch64/aarch64-simd.md (*aarch64_combine_internal<mode>): New insn pattern. (*aarch64_combine_internal_be<mode>): Likewise. (*aarch64_combinez<mode>): Likewise. (*aarch64_combinez_be<mode>): Likewise. (@aarch64_vec_concat<mode>): Support smaller vector and scalar modes. * config/aarch64/aarch64.cc (aarch64_expand_vector_init_fallback): Handle the case of two scalar elements. * config/aarch64/iterators.md (SSUB64): New mode iterator. (VSSUB64): Likewise. (VSSUB32_I): Likewise. (VSSUB64_F): Likewise. (VS32_I_SUB64_F): Likewise. (single_wx): Define attribute for sub-64-bit vector and scalar modes. (bitsize): Likewise. (VDBL): Likewise. (single_dwx): New mode attribute. gcc/testsuite/ChangeLog: * gcc.target/aarch64/sve/gather_load_10.c: Adjust testcase. * gcc.target/aarch64/sve/slp_1.c: Likewise. * gcc.target/aarch64/vec-init-18.c: Likewise. * gcc.target/aarch64/vec-init-23.c: Likewise. 
Diff: --- gcc/config/aarch64/aarch64-simd.md | 115 ++++++++++++++++++++- gcc/config/aarch64/aarch64.cc | 22 ++-- gcc/config/aarch64/iterators.md | 39 ++++++- .../gcc.target/aarch64/sve/gather_load_10.c | 3 +- gcc/testsuite/gcc.target/aarch64/sve/slp_1.c | 6 +- gcc/testsuite/gcc.target/aarch64/vec-init-18.c | 8 +- gcc/testsuite/gcc.target/aarch64/vec-init-23.c | 85 +++++++-------- 7 files changed, 211 insertions(+), 67 deletions(-) diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md index b13a680119ea..ec14474fe520 100644 --- a/gcc/config/aarch64/aarch64-simd.md +++ b/gcc/config/aarch64/aarch64-simd.md @@ -4856,6 +4856,34 @@ } ) +(define_insn "*aarch64_combine_internal<mode>" + [(set (match_operand:<VDBL> 0 "register_operand") + (vec_concat:<VDBL> + (match_operand:VS32_I_SUB64_F 1 "register_operand") + (match_operand:VS32_I_SUB64_F 2 "aarch64_simd_nonimmediate_operand")))] + "TARGET_FLOAT + && !BYTES_BIG_ENDIAN" + {@ [ cons: =0 , 1 , 2 ; attrs: type , arch ] + [ w , w , w ; neon_permute , simd ] uzp1\t%0.<Vdduptype>, %1.<Vdduptype>, %2.<Vdduptype> + [ w , 0 , w ; neon_move , simd ] mov\t%0.<single_type>[1], %2.<single_type>[0] + [ w , 0 , Utv ; neon_load1_one_lane , simd ] ld1\t{%0.<single_type>}[1], %2 + [ w , 0 , r ; neon_from_gp , simd ] ins\t%0.<single_type>[1], %<single_wx>2 + [ ?r , 0 , r ; bfm , * ] bfi\t%<single_dwx>0, %<single_dwx>2, <bitsize>, <bitsize> + } +) + +(define_insn "*aarch64_combine_internal<mode>" + [(set (match_operand:<VDBL> 0 "register_operand") + (vec_concat:<VDBL> + (match_operand:VSSUB32_I 1 "register_operand") + (match_operand:VSSUB32_I 2 "aarch64_simd_nonimmediate_operand")))] + "TARGET_FLOAT + && !BYTES_BIG_ENDIAN" + {@ [ cons: =0 , 1 , 2 ; attrs: type , arch ] + [ r , 0 , r ; bfm , * ] bfi\t%<single_dwx>0, %<single_dwx>2, <bitsize>, <bitsize> + } +) + (define_insn "*aarch64_combine_internal_be<mode>" [(set (match_operand:<VDBL> 0 "aarch64_reg_or_mem_pair_operand") (vec_concat:<VDBL> @@ -4875,6 +4903,35 
@@ } ) +(define_insn "*aarch64_combine_internal_be<mode>" + [(set (match_operand:<VDBL> 0 "register_operand") + (vec_concat:<VDBL> + (match_operand:VS32_I_SUB64_F 2 "aarch64_simd_nonimmediate_operand") + (match_operand:VS32_I_SUB64_F 1 "register_operand")))] + "TARGET_FLOAT + && BYTES_BIG_ENDIAN" + {@ [ cons: =0 , 1 , 2 ; attrs: type , arch ] + [ w , w , w ; neon_permute , simd ] uzp1\t%0.<Vdduptype>, %1.<Vdduptype>, %2.<Vdduptype> + [ w , 0 , w ; neon_move , simd ] mov\t%0.<single_type>[1], %2.<single_type>[0] + [ w , 0 , Utv ; neon_load1_one_lane , simd ] ld1\t{%0.<single_type>}[1], %2 + [ w , 0 , r ; neon_from_gp , simd ] ins\t%0.<single_type>[1], %<single_wx>2 + [ ?r , 0 , r ; bfm , * ] bfi\t%<single_dwx>0, %<single_dwx>2, <bitsize>, <bitsize> + } +) + +(define_insn "*aarch64_combine_internal_be<mode>" + [(set (match_operand:<VDBL> 0 "register_operand") + (vec_concat:<VDBL> + (match_operand:VSSUB32_I 2 "aarch64_simd_nonimmediate_operand") + (match_operand:VSSUB32_I 1 "register_operand")))] + "TARGET_FLOAT + && BYTES_BIG_ENDIAN" + {@ [ cons: =0 , 1 , 2 ; attrs: type , arch ] + [ r , 0 , r ; bfm , * ] bfi\t%<single_dwx>0, %<single_dwx>2, <bitsize>, <bitsize> + } +) + + ;; In this insn, operand 1 should be low, and operand 2 the high part of the ;; dest vector. 
@@ -4891,6 +4948,33 @@ } ) +(define_insn "*aarch64_combinez<mode>" + [(set (match_operand:<VDBL> 0 "register_operand") + (vec_concat:<VDBL> + (match_operand:VSSUB32_I 1 "nonimmediate_operand") + (match_operand:VSSUB32_I 2 "aarch64_simd_or_scalar_imm_zero")))] + "TARGET_FLOAT && !BYTES_BIG_ENDIAN" + {@ [ cons: =0 , 1 ; attrs: type ] + [ r , r ; mov_reg ] uxt<size>\t%w0, %w1 + [ r , m ; load_4 ] ldr<size>\t%<single_wx>0, %1 + } +) + +(define_insn "*aarch64_combinez<mode>" + [(set (match_operand:<VDBL> 0 "register_operand") + (vec_concat:<VDBL> + (match_operand:VS32_I_SUB64_F 1 "nonimmediate_operand") + (match_operand:VS32_I_SUB64_F 2 "aarch64_simd_or_scalar_imm_zero")))] + "TARGET_FLOAT && !BYTES_BIG_ENDIAN" + {@ [ cons: =0 , 1 ; attrs: type ] + [ w , w ; neon_move ] fmov\t%<single_type>0, %<single_type>1 + [ w , r ; neon_from_gp ] fmov\t%<single_type>0, %<single_wx>1 + [ w , m ; neon_load1_1reg ] ldr\t%<single_type>0, %1 + [ r , r ; mov_reg ] uxtw\t%x0, %w1 + [ r , m ; load_4 ] ldr<size>\t%<single_wx>0, %1 + } +) + (define_insn "*aarch64_combinez_be<mode>" [(set (match_operand:<VDBL> 0 "register_operand") (vec_concat:<VDBL> @@ -4904,14 +4988,41 @@ } ) +(define_insn "*aarch64_combinez_be<mode>" + [(set (match_operand:<VDBL> 0 "register_operand") + (vec_concat:<VDBL> + (match_operand:VSSUB32_I 2 "aarch64_simd_or_scalar_imm_zero") + (match_operand:VSSUB32_I 1 "nonimmediate_operand")))] + "TARGET_FLOAT && BYTES_BIG_ENDIAN" + {@ [ cons: =0 , 1 ; attrs: type ] + [ r , r ; mov_reg ] uxt<size>\t%w0, %w1 + [ r , m ; load_4 ] ldr<size>\t%<single_wx>0, %1 + } +) + +(define_insn "*aarch64_combinez_be<mode>" + [(set (match_operand:<VDBL> 0 "register_operand") + (vec_concat:<VDBL> + (match_operand:VS32_I_SUB64_F 2 "aarch64_simd_or_scalar_imm_zero") + (match_operand:VS32_I_SUB64_F 1 "nonimmediate_operand")))] + "TARGET_FLOAT && BYTES_BIG_ENDIAN" + {@ [ cons: =0 , 1 ; attrs: type ] + [ w , w ; neon_move ] fmov\t%<single_type>0, %<single_type>1 + [ w , r ; neon_from_gp ] 
fmov\t%<single_type>0, %<single_wx>1 + [ w , m ; neon_load1_1reg ] ldr\t%<single_type>0, %1 + [ r , r ; mov_reg ] uxtw\t%x0, %w1 + [ r , m ; load_4 ] ldr<size>\t%<single_wx>0, %1 + } +) + ;; Form a vector whose first half (in array order) comes from operand 1 ;; and whose second half (in array order) comes from operand 2. ;; This operand order follows the RTL vec_concat operation. (define_expand "@aarch64_vec_concat<mode>" [(set (match_operand:<VDBL> 0 "register_operand") (vec_concat:<VDBL> - (match_operand:VDCSIF 1 "general_operand") - (match_operand:VDCSIF 2 "general_operand")))] + (match_operand:VQDUP 1 "general_operand") + (match_operand:VQDUP 2 "general_operand")))] "TARGET_FLOAT" { int lo = BYTES_BIG_ENDIAN ? 2 : 1; diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc index 889b774c00fb..8465303649f6 100644 --- a/gcc/config/aarch64/aarch64.cc +++ b/gcc/config/aarch64/aarch64.cc @@ -25669,21 +25669,29 @@ aarch64_expand_vector_init_fallback (rtx target, rtx vals) int n_var = 0; /* The first element of vals. */ rtx v0 = XVECEXP (vals, 0, 0); + machine_mode v0mode = GET_MODE (v0); bool all_same = true; - /* This is a special vec_init<M><N> where N is not an element mode but a + /* This is a special vec_init<M><N> where N is either an element mode or a vector mode with half the elements of M. We expect to find two entries of mode N in VALS and we must put their concatentation into TARGET. 
*/ - if (XVECLEN (vals, 0) == 2 && VECTOR_MODE_P (GET_MODE (XVECEXP (vals, 0, 0)))) + if (n_elts == 2 && (VECTOR_MODE_P (v0mode) + || SCALAR_INT_MODE_P (v0mode) + || SCALAR_FLOAT_MODE_P (v0mode))) { - machine_mode narrow_mode = GET_MODE (XVECEXP (vals, 0, 0)); + rtx v1 = XVECEXP (vals, 0, 1); + machine_mode narrow_mode = GET_MODE (v0); gcc_assert (GET_MODE_INNER (narrow_mode) == inner_mode && known_eq (GET_MODE_SIZE (mode), 2 * GET_MODE_SIZE (narrow_mode))); - emit_insn (gen_aarch64_vec_concat (narrow_mode, target, - XVECEXP (vals, 0, 0), - XVECEXP (vals, 0, 1))); - return; + if (rtx_equal_p (v0, v1)) + aarch64_emit_move (target, + gen_vec_duplicate (mode, + force_reg (narrow_mode, v0))); + else + emit_insn (gen_aarch64_vec_concat (narrow_mode, target, + v0, v1)); + return; } /* Count the number of variable elements to initialise. */ diff --git a/gcc/config/aarch64/iterators.md b/gcc/config/aarch64/iterators.md index f3e7b9d58f37..462f2d996f07 100644 --- a/gcc/config/aarch64/iterators.md +++ b/gcc/config/aarch64/iterators.md @@ -238,6 +238,21 @@ ;; All sub-64-bit vector modes. (define_mode_iterator VSUB64 [V2QI V4QI V2HI V2HF V2BF]) +;; All sub-64-bit scalar modes. +(define_mode_iterator SSUB64 [QI HI HF BF SI SF]) + +;; All sub-64-bit modes. +(define_mode_iterator VSSUB64 [VSUB64 SSUB64]) + +;; All sub-32-bit integer modes. +(define_mode_iterator VSSUB32_I [V2QI QI HI]) + +;; All sub-64-bit floating-point modes. +(define_mode_iterator VSSUB64_F [V2HF V2BF HF BF]) + +;; All 32-bit integer and sub-64-bit floating point modes. +(define_mode_iterator VS32_I_SUB64_F [V4QI V2HI VSSUB64_F]) + ;; All Advanced SIMD modes suitable for moving, loading, and storing. 
(define_mode_iterator VALL_F16 [V8QI V16QI V4HI V8HI V2SI V4SI V2DI V4HF V8HF V4BF V8BF V2SF V4SF V2DF]) @@ -1475,7 +1490,13 @@ (define_mode_attr bitsize [(V8QI "64") (V16QI "128") (V4HI "64") (V8HI "128") (V2SI "64") (V4SI "128") - (V1DI "64") (V2DI "128")]) + (V1DI "64") (V2DI "128") + (QI "8") (V2QI "16") + (V4QI "32") (HI "16") + (HF "16") (BF "16") + (SI "32") (SF "32") + (V2HI "32") (V2HF "32") + (V2BF "32")]) ;; Map a floating point or integer mode to the appropriate register name prefix (define_mode_attr s [(HF "h") (SF "s") (DF "d") (SI "s") (DI "d")]) @@ -2015,10 +2036,16 @@ (define_mode_attr V1half [(V2DI "v1di") (V2DF "v1df")]) ;; Double modes of vector modes. -(define_mode_attr VDBL [(V8QI "V16QI") (V4HI "V8HI") +(define_mode_attr VDBL [(V8QI "V16QI") (V4QI "V8QI") + (V2QI "V4QI") (V4HI "V8HI") (V4HF "V8HF") (V4BF "V8BF") + (V2BF "V4BF") (V2SI "V4SI") (V2SF "V4SF") + (V2HI "V4HI") (V2HF "V4HF") + (BF "V2BF") (SI "V2SI") (SF "V2SF") + (QI "V2QI") + (HI "V2HI") (HF "V2HF") (DI "V2DI") (DF "V2DF")]) ;; Load/store pair mode. @@ -2246,6 +2273,14 @@ (V2SI "x") (V2SF "x") (DI "x") (DF "x")]) +(define_mode_attr single_dwx [(SI "x") (SF "x") + (V2QI "w") (V4QI "x") + (V2HI "x") (V2HF "x") + (HF "w") (QI "w") + (V2BF "x") (BF "w") + (HI "w")]) + + ;; Whether a mode fits in S or D registers (i.e. "s" for 32-bit modes ;; and "d" for 64-bit modes). 
(define_mode_attr single_type [(SI "s") (SF "s") diff --git a/gcc/testsuite/gcc.target/aarch64/sve/gather_load_10.c b/gcc/testsuite/gcc.target/aarch64/sve/gather_load_10.c index 2a07c0be866c..75283d355ae1 100644 --- a/gcc/testsuite/gcc.target/aarch64/sve/gather_load_10.c +++ b/gcc/testsuite/gcc.target/aarch64/sve/gather_load_10.c @@ -11,7 +11,8 @@ foo (uint64_t *restrict x, uint64_t *restrict y, uint64_t *restrict index) x[i] += y[index[i]]; } -/* { dg-final { scan-assembler-times {\tldr\td[0-9]+, \[x[0-9]+, x[0-9]+, lsl #?3\]} 2 } } */ +/* { dg-final { scan-assembler-times {\tldr\td[0-9]+, \[x[0-9]+, x[0-9]+, lsl #?3\]} 1 } } */ +/* { dg-final { scan-assembler-times {\tld1\t{v[0-9]+\.d}\[1\], \[x[0-9]+\]} 1 } } */ /* { dg-final { scan-assembler-not {\tshl\tv[0-9]+\.2d,} } } */ /* { dg-final { scan-assembler-not {\tumov\t} } } */ /* { dg-final { scan-assembler {\tadd\tv[0-9]+\.2d,} } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/slp_1.c b/gcc/testsuite/gcc.target/aarch64/sve/slp_1.c index ddf4c23869f7..1fbb08c7566e 100644 --- a/gcc/testsuite/gcc.target/aarch64/sve/slp_1.c +++ b/gcc/testsuite/gcc.target/aarch64/sve/slp_1.c @@ -30,13 +30,13 @@ vec_slp_##TYPE (TYPE *restrict a, TYPE b, TYPE c, int n) \ TEST_ALL (VEC_PERM) /* We should use one DUP for each of the 8-, 16- and 32-bit types, - (for now, insert both elements with ins for _Float16). We should use two + (and we now use fmov + ins for _Float16). We should use two DUPs for each of the three 64-bit types. 
*/ /* { dg-final { scan-assembler-times {\tmov\tz[0-9]+\.h, [hw]} 2 } } */ /* { dg-final { scan-assembler-times {\tmov\tz[0-9]+\.s, [sw]} 3 } } */ /* { dg-final { scan-assembler-times {\tmov\tz[0-9]+\.d, [dx]} 9 } } */ -/* { dg-final { scan-assembler-times {\tins\tv[0-9]+\.h\[0\], v[0-9]+\.h\[0\]} 3 } } */ -/* { dg-final { scan-assembler-times {\tins\tv[0-9]+\.h\[1\], v[0-9]+\.h\[0\]} 3 } } */ +/* { dg-final { scan-assembler-times {\tfmov\th[0-9]+, h} 1 } } */ +/* { dg-final { scan-assembler-times {\tins\tv[0-9]+\.h\[1\], v[0-9]+\.h\[0\]} 1 } } */ /* { dg-final { scan-assembler-times {\tzip1\tz[0-9]+\.d, z[0-9]+\.d, z[0-9]+\.d\n} 3 } } */ /* { dg-final { scan-assembler-not {\tzip2\t} } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/vec-init-18.c b/gcc/testsuite/gcc.target/aarch64/vec-init-18.c index 99e84096708d..394537c80d8f 100644 --- a/gcc/testsuite/gcc.target/aarch64/vec-init-18.c +++ b/gcc/testsuite/gcc.target/aarch64/vec-init-18.c @@ -15,8 +15,6 @@ int16x8_t foo2(int16_t x) return v; } -/* { dg-final { scan-assembler-times {\tdup\tv[0-9]+\.4s, v[0-9]+\.s\[0\]} 1 } } */ -/* { dg-final { scan-assembler-times {\tdup\tv[0-9]+\.4s, w[0-9]+} 1 } } */ -/* { dg-final { scan-assembler-times {\tmov\tw[0-9]+, 65537} 1 } } */ -/* { dg-final { scan-assembler-times {\tbfi\tw[0-9]+, w[0-9]+, 0, 16} 1 } } */ -/* { dg-final { scan-assembler-times {\tbfi\tw[0-9]+, w[0-9]+, 16, 16} 1 } } */ +/* { dg-final { scan-assembler-times {\tmov\tw1, 1} 1 } } */ +/* { dg-final { scan-assembler-times {\tdup\tv0+\.4s, w0} 2 } } */ +/* { dg-final { scan-assembler-times {\tbfi\tw0, w1, 16, 16} 2 } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/vec-init-23.c b/gcc/testsuite/gcc.target/aarch64/vec-init-23.c index 940fe34c3251..8c154f3680df 100644 --- a/gcc/testsuite/gcc.target/aarch64/vec-init-23.c +++ b/gcc/testsuite/gcc.target/aarch64/vec-init-23.c @@ -111,9 +111,8 @@ TEST_64(int, int64_t, s) /* ** test_int8_5: -** mov w1, 0 -** bfi w1, w0, 0, 8 -** dup v0\.8h, w1 +** uxtb w0, w0 +** 
dup v0\.8h, w0 ** ret */ @@ -217,7 +216,7 @@ TEST_64(int, int64_t, s) ** test_float16_2: ** fcvt h1, s1 ** fcvt h0, s0 -** ins v0\.h\[1\], v1\.h\[0\] +** uzp1 v0\.4h, v0\.4h, v1\.4h ** dup v0\.4s, v0\.s\[0\] ** ret */ @@ -227,55 +226,51 @@ TEST_64(int, int64_t, s) ** uzp1 v2\.2s, v0\.2s, v2\.2s ** uzp1 v3\.2s, v1\.2s, v3\.2s ** zip1 v3\.4s, v2\.4s, v3\.4s -** fcvtn v0\.4h, v3\.4s -** uzp1 v0\.2d, v0\.2d, v0\.2d +** fcvtn v3\.4h, v3\.4s +** dup v0\.2d, v3\.d\[0\] ** ret */ /* ** test_float16_4: ** fcvt h0, s0 -** movi v31\.2d, #0 -** ins v31\.h\[0\], v0\.h\[0\] -** dup v0\.4s, v31\.s\[0\] +** fmov h0, h0 +** dup v0\.4s, v0\.s\[0\] ** ret */ /* ** test_float16_5: +** movi v31\.4h, #0 ** fcvt h0, s0 -** movi v31\.2d, #0 -** ins v31\.h\[1\], v0\.h\[0\] -** dup v0\.4s, v31\.s\[0\] +** uzp1 v0\.4h, v31\.4h, v0\.4h +** dup v0\.4s, v0\.s\[0\] ** ret */ /* ** test_float16_6: -** fcvt h1, s1 ** fcvt h0, s0 -** movi v31\.2d, #0 -** mov w0, 1006648320 -** umov w1, v1\.h\[0\] -** ins v31\.h\[0\], v0\.h\[0\] -** bfi w0, w1, 0, 16 -** dup v31\.2s, v31\.s\[0\] -** dup v0\.2s, w0 -** zip1 v0\.8h, v31\.8h, v0\.8h +** fcvt h1, s1 +** fmov h31, 1.0e\+0 +** fmov h0, h0 +** uzp1 v1\.4h, v1\.4h, v31\.4h +** dup v0\.2s, v0\.s\[0\] +** dup v1\.2s, v1\.s\[0\] +** zip1 v0\.8h, v0\.8h, v1\.8h ** ret */ /* ** test_float16_7: -** fcvt h1, s1 ** fcvt h0, s0 -** movi v31\.2d, #0 -** mov w0, 1006648320 -** umov w1, v1\.h\[0\] -** ins v31\.h\[1\], v0\.h\[0\] -** bfi w0, w1, 16, 16 +** movi v31\.4h, #0 +** fcvt h1, s1 +** uzp1 v31\.4h, v31\.4h, v0\.4h +** fmov h0, 1.0e\+0 +** uzp1 v0\.4h, v0\.4h, v1\.4h ** dup v31\.2s, v31\.s\[0\] -** dup v0\.2s, w0 +** dup v0\.2s, v0\.s\[0\] ** zip1 v0\.8h, v31\.8h, v0\.8h ** ret */ @@ -285,7 +280,7 @@ TEST_64(int, int64_t, s) ** fcvt h1, s1 ** fcvt h0, s0 ** movi v31\.2s, 0x3c, lsl 24 -** ins v0\.h\[1\], v1\.h\[0\] +** uzp1 v0\.4h, v0\.4h, v1\.4h ** dup v0\.2s, v0\.s\[0\] ** zip1 v0\.8h, v31\.8h, v0\.8h ** ret @@ -316,9 +311,8 @@ TEST_64(int, int64_t, s) /* ** 
test_int16_4: -** mov w1, 0 -** bfi w1, w0, 0, 16 -** dup v0\.4s, w1 +** uxth w0, w0 +** dup v0\.4s, w0 ** ret */ @@ -332,12 +326,11 @@ TEST_64(int, int64_t, s) /* ** test_int16_6: -** mov w2, 0 -** bfi w2, w0, 0, 16 -** mov w0, 65537 -** bfi w0, w1, 0, 16 -** dup v31\.2s, w2 -** dup v0\.2s, w0 +** uxth w0, w0 +** dup v31\.2s, w0 +** mov w0, 1 +** bfi w1, w0, 16, 16 +** dup v0\.2s, w1 ** zip1 v0\.8h, v31\.8h, v0\.8h ** ret */ @@ -378,17 +371,16 @@ TEST_64(int, int64_t, s) /* ** test_float32_3: -** movi v31\.2s, 0 -** dup v0\.2s, v0\.s\[0\] -** zip1 v0\.4s, v0\.4s, v31\.4s +** fmov s0, s0 +** dup v0\.2d, v0\.d\[0\] ** ret */ /* ** test_float32_4: -** movi v31\.2s, 0 -** dup v0\.2s, v0\.s\[0\] -** zip1 v0\.4s, v31\.4s, v0\.4s +** movi v31\.2s, #0 +** uzp1 v0\.2s, v31\.2s, v0\.2s +** dup v0\.2d, v0\.d\[0\] ** ret */ @@ -408,9 +400,8 @@ TEST_64(int, int64_t, s) /* ** test_int32_3: -** dup v31\.2s, w0 -** movi v0\.2s, 0 -** zip1 v0\.4s, v31\.4s, v0\.4s +** fmov s0, w0 +** dup v0\.2d, v0\.d\[0\] ** ret */
