[gcc r17-898] aarch64: implement vec_concat support for sub-64-bit types

Artemiy Volkov via Gcc-cvs Thu, 28 May 2026 04:27:45 -0700

https://gcc.gnu.org/g:920eeb67a3537b024521f21f983be0e249faa5ea


commit r17-898-g920eeb67a3537b024521f21f983be0e249faa5ea
Author: Artemiy Volkov <[email protected]>
Date:   Thu Feb 26 08:45:08 2026 +0000

    aarch64: implement vec_concat support for sub-64-bit types
    
    This patch improves handling of 2-element vec_concats in
    aarch64_expand_vector_init_fallback (); where previously the
    aarch64_vec_concat insn was emitted only for pairs of vectors, we now
    allow scalar operands as well.  Furthermore, if the two operands are the
    same, we can now emit a vec_duplicate instead of a vec_concat, leading to
    better code generation.
    
    This is backed by the new combine{z,_internal}{,_be} insn patterns, each
    of which is split into two variants: one for the integral 16- and 32-bit
    modes (involving only GPRs and memory), and one for the remaining modes
    (which require the "w" alternatives as well).
    
    The effect of the changes is illustrated by the changes to vec-init-23.c,
    introduced in the previous patch (and a handful of other vector-init
    related tests).
    
    gcc/ChangeLog:
    
            * config/aarch64/aarch64-simd.md (*aarch64_combine_internal<mode>):
            New insn pattern.
            (*aarch64_combine_internal_be<mode>): Likewise.
            (*aarch64_combinez<mode>): Likewise.
            (*aarch64_combinez_be<mode>): Likewise.
            (@aarch64_vec_concat<mode>): Support smaller vector and scalar modes.
            * config/aarch64/aarch64.cc (aarch64_expand_vector_init_fallback):
            Handle the case of two scalar elements.
            * config/aarch64/iterators.md (SSUB64): New mode iterator.
            (VSSUB64): Likewise.
            (VSSUB32_I): Likewise.
            (VSSUB64_F): Likewise.
            (VS32_I_SUB64_F): Likewise.
            (single_wx): Define attribute for sub-64-bit vector and scalar modes.
            (bitsize): Likewise.
            (VDBL): Likewise.
            (single_dwx): New mode attribute.
    
    gcc/testsuite/ChangeLog:
    
            * gcc.target/aarch64/sve/gather_load_10.c: Adjust testcase.
            * gcc.target/aarch64/sve/slp_1.c: Likewise.
            * gcc.target/aarch64/vec-init-18.c: Likewise.
            * gcc.target/aarch64/vec-init-23.c: Likewise.

Diff:
---
 gcc/config/aarch64/aarch64-simd.md                 | 115 ++++++++++++++++++++-
 gcc/config/aarch64/aarch64.cc                      |  22 ++--
 gcc/config/aarch64/iterators.md                    |  39 ++++++-
 .../gcc.target/aarch64/sve/gather_load_10.c        |   3 +-
 gcc/testsuite/gcc.target/aarch64/sve/slp_1.c       |   6 +-
 gcc/testsuite/gcc.target/aarch64/vec-init-18.c     |   8 +-
 gcc/testsuite/gcc.target/aarch64/vec-init-23.c     |  85 +++++++--------
 7 files changed, 211 insertions(+), 67 deletions(-)

diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
index b13a680119ea..ec14474fe520 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -4856,6 +4856,34 @@
   }
 )
 
+(define_insn "*aarch64_combine_internal<mode>"
+  [(set (match_operand:<VDBL> 0 "register_operand")
+       (vec_concat:<VDBL>
+         (match_operand:VS32_I_SUB64_F 1 "register_operand")
+         (match_operand:VS32_I_SUB64_F 2 "aarch64_simd_nonimmediate_operand")))]
+  "TARGET_FLOAT
+   && !BYTES_BIG_ENDIAN"
+  {@ [ cons: =0 , 1  , 2   ; attrs: type               , arch  ]
+     [ w        , w  , w   ; neon_permute              , simd  ] uzp1\t%0.<Vdduptype>, %1.<Vdduptype>, %2.<Vdduptype>
+     [ w        , 0  , w   ; neon_move                 , simd  ] mov\t%0.<single_type>[1], %2.<single_type>[0]
+     [ w        , 0  , Utv ; neon_load1_one_lane       , simd  ] ld1\t{%0.<single_type>}[1], %2
+     [ w        , 0  , r   ; neon_from_gp              , simd  ] ins\t%0.<single_type>[1], %<single_wx>2
+     [ ?r       , 0  , r   ; bfm                       , *     ] bfi\t%<single_dwx>0, %<single_dwx>2, <bitsize>, <bitsize>
+  }
+)
+
+(define_insn "*aarch64_combine_internal<mode>"
+  [(set (match_operand:<VDBL> 0 "register_operand")
+       (vec_concat:<VDBL>
+         (match_operand:VSSUB32_I 1 "register_operand")
+         (match_operand:VSSUB32_I 2 "aarch64_simd_nonimmediate_operand")))]
+  "TARGET_FLOAT
+   && !BYTES_BIG_ENDIAN"
+  {@ [ cons: =0 , 1  , 2  ; attrs: type               , arch  ]
+     [ r        , 0  , r  ; bfm                       , *     ] bfi\t%<single_dwx>0, %<single_dwx>2, <bitsize>, <bitsize>
+  }
+)
+
 (define_insn "*aarch64_combine_internal_be<mode>"
   [(set (match_operand:<VDBL> 0 "aarch64_reg_or_mem_pair_operand")
        (vec_concat:<VDBL>
@@ -4875,6 +4903,35 @@
   }
 )
 
+(define_insn "*aarch64_combine_internal_be<mode>"
+  [(set (match_operand:<VDBL> 0 "register_operand")
+       (vec_concat:<VDBL>
+         (match_operand:VS32_I_SUB64_F 2 "aarch64_simd_nonimmediate_operand")
+         (match_operand:VS32_I_SUB64_F 1 "register_operand")))]
+  "TARGET_FLOAT
+   && BYTES_BIG_ENDIAN"
+  {@ [ cons: =0 , 1  , 2   ; attrs: type               , arch  ]
+     [ w        , w  , w   ; neon_permute              , simd  ] uzp1\t%0.<Vdduptype>, %1.<Vdduptype>, %2.<Vdduptype>
+     [ w        , 0  , w   ; neon_move                 , simd  ] mov\t%0.<single_type>[1], %2.<single_type>[0]
+     [ w        , 0  , Utv ; neon_load1_one_lane       , simd  ] ld1\t{%0.<single_type>}[1], %2
+     [ w        , 0  , r   ; neon_from_gp              , simd  ] ins\t%0.<single_type>[1], %<single_wx>2
+     [ ?r       , 0  , r   ; bfm                       , *     ] bfi\t%<single_dwx>0, %<single_dwx>2, <bitsize>, <bitsize>
+  }
+)
+
+(define_insn "*aarch64_combine_internal_be<mode>"
+  [(set (match_operand:<VDBL> 0 "register_operand")
+       (vec_concat:<VDBL>
+         (match_operand:VSSUB32_I 2 "aarch64_simd_nonimmediate_operand")
+         (match_operand:VSSUB32_I 1 "register_operand")))]
+  "TARGET_FLOAT
+   && BYTES_BIG_ENDIAN"
+  {@ [ cons: =0 , 1  , 2  ; attrs: type               , arch  ]
+     [ r        , 0  , r  ; bfm                       , *     ] bfi\t%<single_dwx>0, %<single_dwx>2, <bitsize>, <bitsize>
+  }
+)
+
+
 ;; In this insn, operand 1 should be low, and operand 2 the high part of the
 ;; dest vector.
 
@@ -4891,6 +4948,33 @@
   }
 )
 
+(define_insn "*aarch64_combinez<mode>"
+  [(set (match_operand:<VDBL> 0 "register_operand")
+       (vec_concat:<VDBL>
+          (match_operand:VSSUB32_I 1 "nonimmediate_operand")
+         (match_operand:VSSUB32_I 2 "aarch64_simd_or_scalar_imm_zero")))]
+  "TARGET_FLOAT && !BYTES_BIG_ENDIAN"
+  {@ [ cons: =0 , 1  ; attrs: type      ]
+     [ r        , r  ; mov_reg          ] uxt<size>\t%w0, %w1
+     [ r        , m  ; load_4           ] ldr<size>\t%<single_wx>0, %1
+  }
+)
+
+(define_insn "*aarch64_combinez<mode>"
+  [(set (match_operand:<VDBL> 0 "register_operand")
+       (vec_concat:<VDBL>
+          (match_operand:VS32_I_SUB64_F 1 "nonimmediate_operand")
+         (match_operand:VS32_I_SUB64_F 2 "aarch64_simd_or_scalar_imm_zero")))]
+  "TARGET_FLOAT && !BYTES_BIG_ENDIAN"
+  {@ [ cons: =0 , 1  ; attrs: type      ]
+     [ w        , w  ; neon_move        ] fmov\t%<single_type>0, %<single_type>1
+     [ w        , r  ; neon_from_gp     ] fmov\t%<single_type>0, %<single_wx>1
+     [ w        , m  ; neon_load1_1reg  ] ldr\t%<single_type>0, %1
+     [ r        , r  ; mov_reg          ] uxtw\t%x0, %w1
+     [ r        , m  ; load_4           ] ldr<size>\t%<single_wx>0, %1
+  }
+)
+
 (define_insn "*aarch64_combinez_be<mode>"
   [(set (match_operand:<VDBL> 0 "register_operand")
         (vec_concat:<VDBL>
@@ -4904,14 +4988,41 @@
   }
 )
 
+(define_insn "*aarch64_combinez_be<mode>"
+  [(set (match_operand:<VDBL> 0 "register_operand")
+       (vec_concat:<VDBL>
+         (match_operand:VSSUB32_I 2 "aarch64_simd_or_scalar_imm_zero")
+          (match_operand:VSSUB32_I 1 "nonimmediate_operand")))]
+  "TARGET_FLOAT && BYTES_BIG_ENDIAN"
+  {@ [ cons: =0 , 1  ; attrs: type      ]
+     [ r        , r  ; mov_reg          ] uxt<size>\t%w0, %w1
+     [ r        , m  ; load_4           ] ldr<size>\t%<single_wx>0, %1
+  }
+)
+
+(define_insn "*aarch64_combinez_be<mode>"
+  [(set (match_operand:<VDBL> 0 "register_operand")
+       (vec_concat:<VDBL>
+         (match_operand:VS32_I_SUB64_F 2 "aarch64_simd_or_scalar_imm_zero")
+          (match_operand:VS32_I_SUB64_F 1 "nonimmediate_operand")))]
+  "TARGET_FLOAT && BYTES_BIG_ENDIAN"
+  {@ [ cons: =0 , 1  ; attrs: type      ]
+     [ w        , w  ; neon_move        ] fmov\t%<single_type>0, %<single_type>1
+     [ w        , r  ; neon_from_gp     ] fmov\t%<single_type>0, %<single_wx>1
+     [ w        , m  ; neon_load1_1reg  ] ldr\t%<single_type>0, %1
+     [ r        , r  ; mov_reg          ] uxtw\t%x0, %w1
+     [ r        , m  ; load_4           ] ldr<size>\t%<single_wx>0, %1
+  }
+)
+
 ;; Form a vector whose first half (in array order) comes from operand 1
 ;; and whose second half (in array order) comes from operand 2.
 ;; This operand order follows the RTL vec_concat operation.
 (define_expand "@aarch64_vec_concat<mode>"
   [(set (match_operand:<VDBL> 0 "register_operand")
        (vec_concat:<VDBL>
-         (match_operand:VDCSIF 1 "general_operand")
-         (match_operand:VDCSIF 2 "general_operand")))]
+         (match_operand:VQDUP 1 "general_operand")
+         (match_operand:VQDUP 2 "general_operand")))]
   "TARGET_FLOAT"
 {
   int lo = BYTES_BIG_ENDIAN ? 2 : 1;
diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
index 889b774c00fb..8465303649f6 100644
--- a/gcc/config/aarch64/aarch64.cc
+++ b/gcc/config/aarch64/aarch64.cc
@@ -25669,21 +25669,29 @@ aarch64_expand_vector_init_fallback (rtx target, rtx vals)
   int n_var = 0;
   /* The first element of vals.  */
   rtx v0 = XVECEXP (vals, 0, 0);
+  machine_mode v0mode = GET_MODE (v0);
   bool all_same = true;
 
-  /* This is a special vec_init<M><N> where N is not an element mode but a
+  /* This is a special vec_init<M><N> where N is either an element mode or a
      vector mode with half the elements of M.  We expect to find two entries
      of mode N in VALS and we must put their concatentation into TARGET.  */
-  if (XVECLEN (vals, 0) == 2 && VECTOR_MODE_P (GET_MODE (XVECEXP (vals, 0, 0))))
+  if (n_elts == 2 && (VECTOR_MODE_P (v0mode)
+                                || SCALAR_INT_MODE_P (v0mode)
+                                || SCALAR_FLOAT_MODE_P (v0mode)))
     {
-      machine_mode narrow_mode = GET_MODE (XVECEXP (vals, 0, 0));
+      rtx v1 = XVECEXP (vals, 0, 1);
+      machine_mode narrow_mode = GET_MODE (v0);
       gcc_assert (GET_MODE_INNER (narrow_mode) == inner_mode
                  && known_eq (GET_MODE_SIZE (mode),
                               2 * GET_MODE_SIZE (narrow_mode)));
-      emit_insn (gen_aarch64_vec_concat (narrow_mode, target,
-                                        XVECEXP (vals, 0, 0),
-                                        XVECEXP (vals, 0, 1)));
-     return;
+      if (rtx_equal_p (v0, v1))
+       aarch64_emit_move (target,
+                         gen_vec_duplicate (mode,
+                                            force_reg (narrow_mode, v0)));
+      else
+       emit_insn (gen_aarch64_vec_concat (narrow_mode, target,
+                                         v0, v1));
+      return;
    }
 
   /* Count the number of variable elements to initialise.  */
diff --git a/gcc/config/aarch64/iterators.md b/gcc/config/aarch64/iterators.md
index f3e7b9d58f37..462f2d996f07 100644
--- a/gcc/config/aarch64/iterators.md
+++ b/gcc/config/aarch64/iterators.md
@@ -238,6 +238,21 @@
 ;; All sub-64-bit vector modes.
 (define_mode_iterator VSUB64 [V2QI V4QI V2HI V2HF V2BF])
 
+;; All sub-64-bit scalar modes.
+(define_mode_iterator SSUB64 [QI HI HF BF SI SF])
+
+;; All sub-64-bit modes.
+(define_mode_iterator VSSUB64 [VSUB64 SSUB64])
+
+;; All sub-32-bit integer modes.
+(define_mode_iterator VSSUB32_I [V2QI QI HI])
+
+;; All sub-64-bit floating-point modes.
+(define_mode_iterator VSSUB64_F [V2HF V2BF HF BF])
+
+;; All 32-bit integer and sub-64-bit floating point modes.
+(define_mode_iterator VS32_I_SUB64_F [V4QI V2HI VSSUB64_F])
+
 ;; All Advanced SIMD modes suitable for moving, loading, and storing.
 (define_mode_iterator VALL_F16 [V8QI V16QI V4HI V8HI V2SI V4SI V2DI
                                V4HF V8HF V4BF V8BF V2SF V4SF V2DF])
@@ -1475,7 +1490,13 @@
 (define_mode_attr bitsize [(V8QI "64") (V16QI "128")
                           (V4HI "64") (V8HI "128")
                           (V2SI "64") (V4SI "128")
-                          (V1DI "64") (V2DI "128")])
+                          (V1DI "64") (V2DI "128")
+                          (QI "8") (V2QI "16")
+                          (V4QI "32") (HI "16")
+                          (HF "16") (BF "16")
+                          (SI "32") (SF "32")
+                          (V2HI "32") (V2HF "32")
+                          (V2BF "32")])
 
 ;; Map a floating point or integer mode to the appropriate register name prefix
 (define_mode_attr s [(HF "h") (SF "s") (DF "d") (SI "s") (DI "d")])
@@ -2015,10 +2036,16 @@
 (define_mode_attr V1half [(V2DI "v1di")  (V2DF  "v1df")])
 
 ;; Double modes of vector modes.
-(define_mode_attr VDBL [(V8QI "V16QI") (V4HI "V8HI")
+(define_mode_attr VDBL [(V8QI "V16QI") (V4QI "V8QI")
+                       (V2QI "V4QI")  (V4HI "V8HI")
                        (V4HF "V8HF")  (V4BF "V8BF")
+                       (V2BF "V4BF")
                        (V2SI "V4SI")  (V2SF "V4SF")
+                       (V2HI "V4HI")  (V2HF "V4HF")
+                       (BF   "V2BF")
                        (SI   "V2SI")  (SF   "V2SF")
+                       (QI   "V2QI")
+                       (HI   "V2HI")  (HF   "V2HF")
                        (DI   "V2DI")  (DF   "V2DF")])
 
 ;; Load/store pair mode.
@@ -2246,6 +2273,14 @@
                             (V2SI "x") (V2SF "x")
                             (DI   "x") (DF   "x")])
 
+(define_mode_attr single_dwx [(SI  "x") (SF   "x")
+                            (V2QI "w") (V4QI "x")
+                            (V2HI "x") (V2HF "x")
+                            (HF   "w") (QI   "w")
+                            (V2BF "x") (BF   "w")
+                            (HI   "w")])
+
+
 ;; Whether a mode fits in S or D registers (i.e. "s" for 32-bit modes
 ;; and "d" for 64-bit modes).
 (define_mode_attr single_type [(SI   "s") (SF   "s")
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/gather_load_10.c b/gcc/testsuite/gcc.target/aarch64/sve/gather_load_10.c
index 2a07c0be866c..75283d355ae1 100644
--- a/gcc/testsuite/gcc.target/aarch64/sve/gather_load_10.c
+++ b/gcc/testsuite/gcc.target/aarch64/sve/gather_load_10.c
@@ -11,7 +11,8 @@ foo (uint64_t *restrict x, uint64_t *restrict y, uint64_t *restrict index)
     x[i] += y[index[i]];
 }
 
-/* { dg-final { scan-assembler-times {\tldr\td[0-9]+, \[x[0-9]+, x[0-9]+, lsl #?3\]} 2 } } */
+/* { dg-final { scan-assembler-times {\tldr\td[0-9]+, \[x[0-9]+, x[0-9]+, lsl #?3\]} 1 } } */
+/* { dg-final { scan-assembler-times {\tld1\t{v[0-9]+\.d}\[1\], \[x[0-9]+\]} 1 } } */
 /* { dg-final { scan-assembler-not {\tshl\tv[0-9]+\.2d,} } } */
 /* { dg-final { scan-assembler-not {\tumov\t} } } */
 /* { dg-final { scan-assembler {\tadd\tv[0-9]+\.2d,} } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/slp_1.c b/gcc/testsuite/gcc.target/aarch64/sve/slp_1.c
index ddf4c23869f7..1fbb08c7566e 100644
--- a/gcc/testsuite/gcc.target/aarch64/sve/slp_1.c
+++ b/gcc/testsuite/gcc.target/aarch64/sve/slp_1.c
@@ -30,13 +30,13 @@ vec_slp_##TYPE (TYPE *restrict a, TYPE b, TYPE c, int n)    \
 TEST_ALL (VEC_PERM)
 
 /* We should use one DUP for each of the 8-, 16- and 32-bit types,
-   (for now, insert both elements with ins for _Float16).  We should use two
+   (and we now use fmov + ins for _Float16).  We should use two
    DUPs for each of the three 64-bit types.  */
 /* { dg-final { scan-assembler-times {\tmov\tz[0-9]+\.h, [hw]} 2 } } */
 /* { dg-final { scan-assembler-times {\tmov\tz[0-9]+\.s, [sw]} 3 } } */
 /* { dg-final { scan-assembler-times {\tmov\tz[0-9]+\.d, [dx]} 9 } } */
-/* { dg-final { scan-assembler-times {\tins\tv[0-9]+\.h\[0\], v[0-9]+\.h\[0\]} 3 } } */
-/* { dg-final { scan-assembler-times {\tins\tv[0-9]+\.h\[1\], v[0-9]+\.h\[0\]} 3 } } */
+/* { dg-final { scan-assembler-times {\tfmov\th[0-9]+, h} 1 } } */
+/* { dg-final { scan-assembler-times {\tins\tv[0-9]+\.h\[1\], v[0-9]+\.h\[0\]} 1 } } */
 /* { dg-final { scan-assembler-times {\tzip1\tz[0-9]+\.d, z[0-9]+\.d, z[0-9]+\.d\n} 3 } } */
 /* { dg-final { scan-assembler-not {\tzip2\t} } } */
 
diff --git a/gcc/testsuite/gcc.target/aarch64/vec-init-18.c b/gcc/testsuite/gcc.target/aarch64/vec-init-18.c
index 99e84096708d..394537c80d8f 100644
--- a/gcc/testsuite/gcc.target/aarch64/vec-init-18.c
+++ b/gcc/testsuite/gcc.target/aarch64/vec-init-18.c
@@ -15,8 +15,6 @@ int16x8_t foo2(int16_t x)
   return v;
 }
 
-/* { dg-final { scan-assembler-times {\tdup\tv[0-9]+\.4s, v[0-9]+\.s\[0\]} 1 } } */
-/* { dg-final { scan-assembler-times {\tdup\tv[0-9]+\.4s, w[0-9]+} 1 } } */
-/* { dg-final { scan-assembler-times {\tmov\tw[0-9]+, 65537} 1 } } */
-/* { dg-final { scan-assembler-times {\tbfi\tw[0-9]+, w[0-9]+, 0, 16} 1 } } */
-/* { dg-final { scan-assembler-times {\tbfi\tw[0-9]+, w[0-9]+, 16, 16} 1 } } */
+/* { dg-final { scan-assembler-times {\tmov\tw1, 1} 1 } } */
+/* { dg-final { scan-assembler-times {\tdup\tv0+\.4s, w0} 2 } } */
+/* { dg-final { scan-assembler-times {\tbfi\tw0, w1, 16, 16} 2 } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/vec-init-23.c b/gcc/testsuite/gcc.target/aarch64/vec-init-23.c
index 940fe34c3251..8c154f3680df 100644
--- a/gcc/testsuite/gcc.target/aarch64/vec-init-23.c
+++ b/gcc/testsuite/gcc.target/aarch64/vec-init-23.c
@@ -111,9 +111,8 @@ TEST_64(int, int64_t, s)
 
 /*
 ** test_int8_5:
-**     mov     w1, 0
-**     bfi     w1, w0, 0, 8
-**     dup     v0\.8h, w1
+**     uxtb    w0, w0
+**     dup     v0\.8h, w0
 **     ret
 */
 
@@ -217,7 +216,7 @@ TEST_64(int, int64_t, s)
 ** test_float16_2:
 **     fcvt    h1, s1
 **     fcvt    h0, s0
-**     ins     v0\.h\[1\], v1\.h\[0\]
+**     uzp1    v0\.4h, v0\.4h, v1\.4h
 **     dup     v0\.4s, v0\.s\[0\]
 **     ret
 */
@@ -227,55 +226,51 @@ TEST_64(int, int64_t, s)
 **     uzp1    v2\.2s, v0\.2s, v2\.2s
 **     uzp1    v3\.2s, v1\.2s, v3\.2s
 **     zip1    v3\.4s, v2\.4s, v3\.4s
-**     fcvtn   v0\.4h, v3\.4s
-**     uzp1    v0\.2d, v0\.2d, v0\.2d
+**     fcvtn   v3\.4h, v3\.4s
+**     dup     v0\.2d, v3\.d\[0\]
 **     ret
 */
 
 /*
 ** test_float16_4:
 **     fcvt    h0, s0
-**     movi    v31\.2d, #0
-**     ins     v31\.h\[0\], v0\.h\[0\]
-**     dup     v0\.4s, v31\.s\[0\]
+**     fmov    h0, h0
+**     dup     v0\.4s, v0\.s\[0\]
 **     ret
 */
 
 /*
 ** test_float16_5:
+**     movi    v31\.4h, #0
 **     fcvt    h0, s0
-**     movi    v31\.2d, #0
-**     ins     v31\.h\[1\], v0\.h\[0\]
-**     dup     v0\.4s, v31\.s\[0\]
+**     uzp1    v0\.4h, v31\.4h, v0\.4h
+**     dup     v0\.4s, v0\.s\[0\]
 **     ret
 */
 
 /*
 ** test_float16_6:
-**     fcvt    h1, s1
 **     fcvt    h0, s0
-**     movi    v31\.2d, #0
-**     mov     w0, 1006648320
-**     umov    w1, v1\.h\[0\]
-**     ins     v31\.h\[0\], v0\.h\[0\]
-**     bfi     w0, w1, 0, 16
-**     dup     v31\.2s, v31\.s\[0\]
-**     dup     v0\.2s, w0
-**     zip1    v0\.8h, v31\.8h, v0\.8h
+**     fcvt    h1, s1
+**     fmov    h31, 1.0e\+0
+**     fmov    h0, h0
+**     uzp1    v1\.4h, v1\.4h, v31\.4h
+**     dup     v0\.2s, v0\.s\[0\]
+**     dup     v1\.2s, v1\.s\[0\]
+**     zip1    v0\.8h, v0\.8h, v1\.8h
 **     ret
 */
 
 /*
 ** test_float16_7:
-**     fcvt    h1, s1
 **     fcvt    h0, s0
-**     movi    v31\.2d, #0
-**     mov     w0, 1006648320
-**     umov    w1, v1\.h\[0\]
-**     ins     v31\.h\[1\], v0\.h\[0\]
-**     bfi     w0, w1, 16, 16
+**     movi    v31\.4h, #0
+**     fcvt    h1, s1
+**     uzp1    v31\.4h, v31\.4h, v0\.4h
+**     fmov    h0, 1.0e\+0
+**     uzp1    v0\.4h, v0\.4h, v1\.4h
 **     dup     v31\.2s, v31\.s\[0\]
-**     dup     v0\.2s, w0
+**     dup     v0\.2s, v0\.s\[0\]
 **     zip1    v0\.8h, v31\.8h, v0\.8h
 **     ret
 */
@@ -285,7 +280,7 @@ TEST_64(int, int64_t, s)
 **     fcvt    h1, s1
 **     fcvt    h0, s0
 **     movi    v31\.2s, 0x3c, lsl 24
-**     ins     v0\.h\[1\], v1\.h\[0\]
+**     uzp1    v0\.4h, v0\.4h, v1\.4h
 **     dup     v0\.2s, v0\.s\[0\]
 **     zip1    v0\.8h, v31\.8h, v0\.8h
 **     ret
@@ -316,9 +311,8 @@ TEST_64(int, int64_t, s)
 
 /*
 ** test_int16_4:
-**     mov     w1, 0
-**     bfi     w1, w0, 0, 16
-**     dup     v0\.4s, w1
+**     uxth    w0, w0
+**     dup     v0\.4s, w0
 **     ret
 */
 
@@ -332,12 +326,11 @@ TEST_64(int, int64_t, s)
 
 /*
 ** test_int16_6:
-**     mov     w2, 0
-**     bfi     w2, w0, 0, 16
-**     mov     w0, 65537
-**     bfi     w0, w1, 0, 16
-**     dup     v31\.2s, w2
-**     dup     v0\.2s, w0
+**     uxth    w0, w0
+**     dup     v31\.2s, w0
+**     mov     w0, 1
+**     bfi     w1, w0, 16, 16
+**     dup     v0\.2s, w1
 **     zip1    v0\.8h, v31\.8h, v0\.8h
 **     ret
 */
@@ -378,17 +371,16 @@ TEST_64(int, int64_t, s)
 
 /*
 ** test_float32_3:
-**     movi    v31\.2s, 0
-**     dup     v0\.2s, v0\.s\[0\]
-**     zip1    v0\.4s, v0\.4s, v31\.4s
+**     fmov    s0, s0
+**     dup     v0\.2d, v0\.d\[0\]
 **     ret
 */
 
 /*
 ** test_float32_4:
-**     movi    v31\.2s, 0
-**     dup     v0\.2s, v0\.s\[0\]
-**     zip1    v0\.4s, v31\.4s, v0\.4s
+**     movi    v31\.2s, #0
+**     uzp1    v0\.2s, v31\.2s, v0\.2s
+**     dup     v0\.2d, v0\.d\[0\]
 **     ret
 */
 
@@ -408,9 +400,8 @@ TEST_64(int, int64_t, s)
 
 /*
 ** test_int32_3:
-**     dup     v31\.2s, w0
-**     movi    v0\.2s, 0
-**     zip1    v0\.4s, v31\.4s, v0\.4s
+**     fmov    s0, w0
+**     dup     v0\.2d, v0\.d\[0\]
 **     ret
 */

[gcc r17-898] aarch64: implement vec_concat support for sub-64-bit types

Reply via email to