https://gcc.gnu.org/g:4ddae2a94a032d77ef5564117e9906247a29a05f

commit r17-897-g4ddae2a94a032d77ef5564117e9906247a29a05f
Author: Artemiy Volkov <[email protected]>
Date:   Thu Feb 26 09:01:30 2026 +0000

    aarch64: initialize vectors from starting subsequence
    
    Now that we have 2- and 4-element vector modes for all the sub-word scalar
    modes, we can emit more efficient code when a vector constructor consists
    of repetitions of a starting subsequence whose length is a power of two.
    To do this, first detect the shortest such starting subsequence by
    repeatedly folding the initial constructor element array in half, as long
    as the left and the right halves are equal.  Then, after emitting the
    subsequence, duplicate it by generating a vec_duplicate with the correct
    source mode.
    
    On the MD side, this requires implementing the vec_duplicate optab to
    duplicate an arbitrary sub-128-bit value into a full 64- or a 128-bit
    AdvSIMD register, as well as the vec_set insn for the VSUB64 modes (needed
    as fallback for the divide-and-conquer approach).  The latter uses a
    properly scaled and shifted "bfi" for integer values, and a properly
    indexed "ins" for FP elements.
    
    This change allows us to get rid of long chains of inserts and compile
    things like:
    
    int16x8_t f (int16_t x, int16_t y, int16_t z, int16_t w)
    {
            return (int16x8_t) {x, y, z, w, x, y, z, w};
    }
    
    into:
            bfi     w0, w2, 16, 16
            bfi     w1, w3, 16, 16
            dup     v31.2s, w0
            dup     v0.2s, w1
            zip1    v0.8h, v31.8h, v0.8h
            ret
    
    rather than:
    
            dup     v31.4h, w0
            dup     v0.4h, w1
            ins     v31.h[1], w2
            ins     v0.h[1], w3
            ins     v31.h[3], w2
            ins     v0.h[3], w3
            zip1    v0.8h, v31.8h, v0.8h
            ret
    
    This patch also includes an extensive new test, which includes the above
    case, as well as adjustments to existing codegen tests as necessary.
    
    gcc/ChangeLog:
    
            * config/aarch64/aarch64-simd.md (*aarch64_simd_dup_subvector<vconq><mode>):
            New insn pattern.
            (*aarch64_simd_dup_subvector<vcond><mode>): Likewise.
            (@aarch64_simd_vec_set<mode>): Likewise.
            (vec_set<mode>): Handle 16- and 32-bit vector modes in the expander.
            * config/aarch64/aarch64.cc (aarch64_expand_vector_init_fallback): Add
            logic to initialize vector from starting subsequence.  Make static.
            (scalar_move_insn_p): Consider sub-64-bit vector moves scalar.
            * config/aarch64/iterators.md (VDDUP): New iterator.
            (VQDUP): Likewise.
            (elem_bits): Define attribute for sub-64-bit vector modes.
            (Vetype): Likewise.
            (VEL): Likewise.
            (single_wx): Define attribute for sub-64-bit vector and scalar modes.
            (single_type): Likewise.
            (VCOND): Likewise.
            (VCONQ): Likewise.
            (Vqduptype): New mode attribute.
            (Vdduptype): Likewise.
            (vcond): Likewise.
            (vconq): Likewise.
            (vstype): Define attribute for 64-bit vector and sub-128-bit scalar
            modes.
    
    gcc/testsuite/ChangeLog:
    
            * gcc.target/aarch64/ldp_stp_16.c: Adjust testcase.
            * gcc.target/aarch64/sve/slp_1.c: Likewise.
            * gcc.target/aarch64/vec-init-18.c: Likewise.
            * gcc.target/aarch64/vec-init-23.c: New test.

Diff:
---
 gcc/config/aarch64/aarch64-simd.md             |  50 ++-
 gcc/config/aarch64/aarch64.cc                  |  42 ++-
 gcc/config/aarch64/iterators.md                | 107 +++++-
 gcc/testsuite/gcc.target/aarch64/ldp_stp_16.c  |   5 +-
 gcc/testsuite/gcc.target/aarch64/sve/slp_1.c   |   7 +-
 gcc/testsuite/gcc.target/aarch64/vec-init-18.c |   8 +-
 gcc/testsuite/gcc.target/aarch64/vec-init-23.c | 435 +++++++++++++++++++++++++
 7 files changed, 632 insertions(+), 22 deletions(-)

diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
index 2b7f6b467c62..b13a680119ea 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -137,6 +137,28 @@
   }
 )
 
+(define_insn "*aarch64_simd_dup_subvector<vconq><mode>"
+  [(set (match_operand:<VCONQ> 0 "register_operand")
+       (vec_duplicate:<VCONQ>
+         (match_operand:VQDUP 1 "register_operand")))]
+  "TARGET_SIMD"
+  {@ [ cons: =0 , 1 ; attrs: type    ]
+     [ w        , w ; neon_dup_q     ] dup\t%0.<Vqduptype>, %1.<vstype>[0]
+     [ w        , r ; neon_from_gp_q ] dup\t%0.<Vqduptype>, %<single_wx>1
+  }
+)
+
+(define_insn "*aarch64_simd_dup_subvector<vcond><mode>"
+  [(set (match_operand:<VCOND> 0 "register_operand")
+       (vec_duplicate:<VCOND>
+         (match_operand:VDDUP 1 "register_operand")))]
+  "TARGET_SIMD"
+  {@ [ cons: =0 , 1 ; attrs: type  ]
+     [ w        , w ; neon_dup     ] dup\t%0.<Vdduptype>, %1.<vstype>[0]
+     [ w        , r ; neon_from_gp ] dup\t%0.<Vdduptype>, %<single_wx>1
+  }
+)
+
 (define_insn "@aarch64_dup_lane<mode>"
   [(set (match_operand:VALL_F16 0 "register_operand" "=w")
        (vec_duplicate:VALL_F16
@@ -1291,6 +1313,32 @@
   [(set_attr "type" "neon_ins<q>, neon_from_gp<q>, neon_load1_one_lane<q>")]
 )
 
+(define_insn "@aarch64_simd_vec_set<mode>"
+  [(set (match_operand:VSUB64 0 "register_operand" "=r,w,w")
+       (vec_merge:VSUB64
+           (vec_duplicate:VSUB64
+               (match_operand:<VEL> 1 "aarch64_simd_nonimmediate_operand" "r,w,Utv"))
+           (match_operand:VSUB64 3 "register_operand" "0,0,0")
+           (match_operand:SI 2 "immediate_operand" "i,i,i")))]
+  "TARGET_SIMD && exact_log2 (INTVAL (operands[2])) >= 0"
+  {
+    int elt = exact_log2 (INTVAL (operands[2])); /* Lane number.  */
+    switch (which_alternative)
+      {
+      case 0: /* Bit-field insert at bit offset lane * element size.  */
+       operands[2] = GEN_INT (elt * <elem_bits>);
+       return "bfi\t%w0, %w1, %2, <elem_bits>";
+      case 1: /* Insert from lane 0 of the source vector register.  */
+       return "ins\t%0.<Vetype>[%p2], %1.<Vetype>[0]";
+      case 2: /* Load a single lane from memory.  */
+       return "ld1\t{%0.<Vetype>}[%p2], %1";
+      default:
+       gcc_unreachable ();
+      }
+  }
+  [(set_attr "type" "bfm, neon_ins, neon_load1_one_lane")]
+)
+
 ;; Inserting from the zero register into a vector lane is treated as an
 ;; expensive GP->FP move on all CPUs.  Avoid it when optimizing for speed.
 (define_insn "aarch64_simd_vec_set_zero<mode>"
@@ -1720,7 +1768,7 @@
 )
 
 (define_expand "vec_set<mode>"
-  [(match_operand:VALL_F16 0 "register_operand")
+  [(match_operand:VALL_F16_SUB64 0 "register_operand")
    (match_operand:<VEL> 1 "aarch64_simd_nonimmediate_operand")
    (match_operand:SI 2 "immediate_operand")]
   "TARGET_SIMD"
diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
index 4ed24c869652..889b774c00fb 100644
--- a/gcc/config/aarch64/aarch64.cc
+++ b/gcc/config/aarch64/aarch64.cc
@@ -25658,7 +25658,7 @@ aarch64_choose_vector_init_constant (machine_mode mode, rtx vals)
    The caller has already tried a divide-and-conquer approach, so do
    not consider that case here.  */
 
-void
+static void
 aarch64_expand_vector_init_fallback (rtx target, rtx vals)
 {
   machine_mode mode = GET_MODE (target);
@@ -25716,6 +25716,43 @@ aarch64_expand_vector_init_fallback (rtx target, rtx vals)
       return;
     }
 
+  /* Check if the vector can be represented as a duplicate of a
+     subvector starting at index 0.  */
+  if (pow2p_hwi (n_elts))
+    {
+       bool halves_equal = true;
+       int n_seq = n_elts;
+       while (n_seq > 2)
+         {
+           for (int i = 0; i < n_seq / 2; i++)
+             if (!rtx_equal_p (XVECEXP (vals, 0, i),
+                               XVECEXP (vals, 0, i + n_seq / 2)))
+               {
+                 halves_equal = false;
+                 break;
+               }
+
+           if (!halves_equal)
+             break;
+
+           n_seq /= 2;
+         }
+
+       if (n_seq != n_elts)
+         {
+           machine_mode subv_mode = mode_for_vector (inner_mode,
+                                                     n_seq).require ();
+           rtx new_target = gen_reg_rtx (subv_mode);
+           rtvec new_vals = rtvec_alloc (n_seq);
+           for (int i = 0; i < n_seq; i++)
+             RTVEC_ELT (new_vals, i) = XVECEXP (vals, 0, i);
+           aarch64_expand_vector_init (new_target,
+                                       gen_rtx_PARALLEL (subv_mode, new_vals));
+           aarch64_emit_move (target, gen_vec_duplicate (mode, new_target));
+           return;
+         }
+    }
+
   enum insn_code icode = optab_handler (vec_set_optab, mode);
   gcc_assert (icode != CODE_FOR_nothing);
 
@@ -25875,7 +25912,8 @@ scalar_move_insn_p (rtx set)
   rtx src = SET_SRC (set);
   rtx dest = SET_DEST (set);
   return (is_a<scalar_mode> (GET_MODE (dest))
-         && aarch64_mov_operand (src, GET_MODE (dest)));
+         && aarch64_mov_operand (src, GET_MODE (dest)))
+        || aarch64_advsimd_sub_dword_mode_p (GET_MODE (dest));
 }
 
 /* Similar to seq_cost, but ignore cost for scalar moves.  */
diff --git a/gcc/config/aarch64/iterators.md b/gcc/config/aarch64/iterators.md
index dfca3327f1fa..f3e7b9d58f37 100644
--- a/gcc/config/aarch64/iterators.md
+++ b/gcc/config/aarch64/iterators.md
@@ -139,6 +139,14 @@
 ;; VQMOV without 2-element modes.
 (define_mode_iterator VQMOV_NO2E [V16QI V8HI V4SI V8HF V8BF V4SF])
 
+;; Modes that can be duplicated into a 64-bit register.
+(define_mode_iterator VDDUP [V4QI V2QI QI V2HI HI SI
+                               V2BF BF V2HF HF SF])
+
+;; Modes that can be duplicated into a 128-bit register.
+(define_mode_iterator VQDUP [V8QI V4QI V2QI QI V4HI V2HI HI V2SI SI DI
+                               V4BF V2BF BF V4HF V2HF HF V2SF SF DF])
+
 ;; Double integer vector modes.
 (define_mode_iterator VD_I [V8QI V4HI V2SI DI])
 
@@ -1488,7 +1496,9 @@
 
 ;; The number of bits in a vector element, or controlled by a predicate
 ;; element.
-(define_mode_attr elem_bits [(VNx16BI "8") (VNx8BI "16")
+(define_mode_attr elem_bits [(V2QI "8") (V4QI "8") (V2HF "16") (V2HI "16")
+                            (V2BF "16")
+                            (VNx16BI "8") (VNx8BI "16")
                             (VNx4BI "32") (VNx2BI "64")
                             (VNx16QI "8") (VNx32QI "8") (VNx64QI "8")
                             (VNx8HI "16") (VNx16HI "16") (VNx32HI "16")
@@ -1593,11 +1603,12 @@
 
 ;; Mode-to-individual element type mapping.
 (define_mode_attr Vetype [(V8QI "b") (V16QI "b")
-                         (V4HI "h") (V8HI  "h")
+                         (V2QI "b") (V4QI "b")
+                         (V4HI "h") (V8HI  "h") (V2HI "h")
                          (V2SI "s") (V4SI  "s")
                          (V2DI "d") (V1DI  "d")
-                         (V4HF "h") (V8HF  "h")
-                         (V2SF "s") (V4SF  "s")
+                         (V4HF "h") (V8HF  "h") (V2HF "h")
+                         (V2SF "s") (V4SF  "s") (V2BF "h")
                          (V2DF "d") (V1DF  "d")
                          (V2x8QI "b") (V2x4HI "h")
                          (V2x2SI "s") (V2x1DI "d")
@@ -1772,8 +1783,10 @@
                               (V4x2DF "v2df") (V4x8BF "v8bf")])
 
 ;; Define element mode for each vector mode.
-(define_mode_attr VEL [(V8QI  "QI") (V16QI "QI")
+(define_mode_attr VEL [(V8QI "QI") (V16QI "QI")
+                      (V2QI "QI") (V4QI  "QI")
                       (V4HI "HI") (V8HI  "HI")
+                      (V2HI "HI") (V2HF  "HF")
                       (V2SI "SI") (V4SI  "SI")
                       (DI   "DI") (V1DI  "DI")
                       (V2DI "DI")
@@ -1784,6 +1797,7 @@
                       (SI   "SI") (HI    "HI")
                       (QI   "QI")
                       (V4BF "BF") (V8BF "BF")
+                      (V2BF "BF")
                       (V2x8QI "QI") (V2x4HI "HI")
                       (V2x2SI "SI") (V2x1DI "DI")
                       (V2x4HF "HF") (V2x2SF "SF")
@@ -1900,25 +1914,66 @@
 
 ;; 64-bit container modes the inner or scalar source mode.
 (define_mode_attr VCOND [(HI "V4HI") (SI "V2SI")
+                        (V2HI "V4HI")
                         (V4HI "V4HI") (V8HI "V4HI")
                         (V2SI "V2SI") (V4SI "V2SI")
+                        (QI "V8QI") (V2QI "V8QI")
+                        (V4QI "V8QI")
                         (DI   "DI") (V2DI "DI")
+                        (HF "V4HF") (V2HF "V4HF")
                         (V4HF "V4HF") (V8HF "V4HF")
+                        (BF "V4BF") (V2BF "V4BF")
+                        (SF "V2SF")
                         (V2SF "V2SF") (V4SF "V2SF")
                         (V2DF "DF")])
 
+;; Same as above, but in lowercase.
+(define_mode_attr vcond [(HI "v4hi") (SI "v2si")
+                        (V2HI "v4hi")
+                        (V4HI "v4hi") (V8HI "v4hi")
+                        (V2SI "v2si") (V4SI "v2si")
+                        (QI "v8qi") (V2QI "v8qi")
+                        (V4QI "v8qi")
+                        (DI   "di") (V2DI "di")
+                        (HF "v4hf") (V2HF "v4hf")
+                        (V4HF "v4hf") (V8HF "v4hf")
+                        (BF "v4bf") (V2BF "v4bf")
+                        (SF "v2sf")
+                        (V2SF "v2sf") (V4SF "v2sf")
+                        (V2DF "df")])
+
 ;; 128-bit container modes the inner or scalar source mode.
 (define_mode_attr VCONQ [(V8QI "V16QI") (V16QI "V16QI")
+                        (V4QI "V16QI") (V2QI "V16QI")
                         (V4HI "V8HI") (V8HI "V8HI")
+                        (V2HI "V8HI")
                         (V2SI "V4SI") (V4SI "V4SI")
                         (DI   "V2DI") (V2DI "V2DI")
                         (V4HF "V8HF") (V8HF "V8HF")
+                        (V2HF "V8HF") (HF "V8HF")
                         (V4BF "V8BF") (V8BF "V8BF")
+                        (V2BF "V8BF") (BF "V8BF")
                         (V2SF "V4SF") (V4SF "V4SF")
                         (V2DF "V2DF") (SI   "V4SI")
                         (HI   "V8HI") (QI   "V16QI")
                         (SF   "V4SF") (DF   "V2DF")])
 
+;; Same as above, but in lowercase.
+(define_mode_attr vconq [(V8QI "v16qi") (V16QI "v16qi")
+                        (V4QI "v16qi") (V2QI "v16qi")
+                        (V4HI "v8hi") (V8HI "v8hi")
+                        (V2HI "v8hi")
+                        (V2SI "v4si") (V4SI "v4si")
+                        (DI   "v2di") (V2DI "v2di")
+                        (V4HF "v8hf") (V8HF "v8hf")
+                        (V2HF "v8hf") (HF "v8hf")
+                        (V4BF "v8bf") (V8BF "v8bf")
+                        (V2BF "v8bf") (BF "v8bf")
+                        (V2SF "v4sf") (V4SF "v4sf")
+                        (V2DF "v2df") (SI   "v4si")
+                        (HI   "v8hi") (QI   "v16qi")
+                        (SF   "v4sf") (DF   "v2df")])
+
 ;; Half modes of all vector modes.
 (define_mode_attr VHALF [(V8QI "V4QI")  (V16QI "V8QI")
                         (V4HI "V2HI")  (V8HI  "V4HI")
@@ -2037,6 +2092,26 @@
 (define_mode_attr V2ntype [(V8HI "16b") (V4SI "8h")
                           (V2DI "4s")])
 
+;; Register suffix used when duplicating a value of a certain mode
+;; into a full 128-bit AdvSIMD register.
+(define_mode_attr Vqduptype [(QI "16b") (V2QI "8h") (V4QI "4s") (V8QI "2d")
+                            (HI "8h") (V2HI "4s") (V4HI "2d")
+                            (HF "8h") (V2HF "4s") (V4HF "2d")
+                            (BF "8h") (V2BF "4s") (V4BF "2d")
+                            (SI "4s") (V2SI "2d")
+                            (SF "4s") (V2SF "2d")
+                            (DI "2d") (DF "2d")])
+
+;; Register suffix used when duplicating a value of a certain mode
+;; into a partial 64-bit AdvSIMD register.
+(define_mode_attr Vdduptype [(QI "8b") (V2QI "4h") (V4QI "2s") (V8QI "")
+                            (HI "4h") (V2HI "2s") (V4HI "")
+                            (HF "4h") (V2HF "2s") (V4HF "")
+                            (BF "4h") (V2BF "2s") (V4BF "")
+                            (SI "2s") (V2SI "")
+                            (SF "2s") (V2SF "")
+                            (DI "") (DF "")])
+
 ;; The result of FCVTN on two vectors of the given mode.  The result has
 ;; twice as many QI elements as the input.
 (define_mode_attr VPACKB [(V4HF "V8QI") (V8HF "V16QI") (V4SF "V8QI")])
@@ -2161,8 +2236,13 @@
 ;; Whether a mode fits in W or X registers (i.e. "w" for 32-bit modes
 ;; and "x" for 64-bit modes).
 (define_mode_attr single_wx [(SI   "w") (SF   "w")
+                            (V2QI "w") (V4QI "w")
                             (V8QI "x") (V4HI "x")
                             (V4HF "x") (V4BF "x")
+                            (V2HI "w") (V2HF "w")
+                            (HF   "w") (QI   "w")
+                            (V2BF "w") (BF   "w")
+                            (HI   "w")
                             (V2SI "x") (V2SF "x")
                             (DI   "x") (DF   "x")])
 
@@ -2172,7 +2252,12 @@
                               (V8QI "d") (V4HI "d")
                               (V4HF "d") (V4BF "d")
                               (V2SI "d") (V2SF "d")
-                              (DI   "d") (DF   "d")])
+                              (DI   "d") (DF   "d")
+                              (QI   "b") (BF   "h")
+                              (V2HF "s") (HI   "h")
+                              (V4QI "s") (V2QI "h")
+                              (V2HI "s") (V2BF "s")
+                              (HF   "h")])
 
 ;; Whether a double-width mode fits in D or Q registers (i.e. "d" for
 ;; 32-bit modes and "q" for 64-bit modes).
@@ -2182,9 +2267,13 @@
                                (V2SI "q") (V2SF "q")
                                (DI   "q") (DF   "q")])
 
-;; Scalar size of a sub-64-bit vector mode.
-(define_mode_attr vstype [(V4QI "s") (V2QI "h")
-                         (V2HI "s") (V2BF "s") (V2HF "s")])
+;; Scalar size of a sub-128-bit vector or scalar mode.
+(define_mode_attr vstype [(V8QI "d") (V4QI "s") (V2QI "h") (QI "b")
+                         (V4HI "d") (V2HI "s") (HI "h")
+                         (V2SI "d") (SI "s") (DI "d")
+                         (V4BF "d") (V2BF "s") (BF "h")
+                         (V4HF "d") (V2HF "s") (HF "h")
+                         (V2SF "d") (SF "s") (DF "d")])
 
 ;; Define corresponding core/FP element mode for each vector mode.
 (define_mode_attr vw [(V8QI "w") (V16QI "w")
diff --git a/gcc/testsuite/gcc.target/aarch64/ldp_stp_16.c b/gcc/testsuite/gcc.target/aarch64/ldp_stp_16.c
index 95835aa2eb41..a6b4d50f34fa 100644
--- a/gcc/testsuite/gcc.target/aarch64/ldp_stp_16.c
+++ b/gcc/testsuite/gcc.target/aarch64/ldp_stp_16.c
@@ -96,9 +96,8 @@ CONS2_FN (4, float);
 
 /*
 ** cons2_8_float:
-**     dup     v[0-9]+\.2s, v[0-9]+\.s\[0\]
-**     dup     v[0-9]+\.2s, v[0-9]+\.s\[0\]
-**     zip1    v([0-9]+)\.4s, v[0-9]+\.4s, v[0-9]+\.4s
+**     uzp1    v1\.2s, v0\.2s, v1\.2s
+**     dup     v([0-9]+)\.2d, v1\.d\[0\]
 **     stp     q\1, q\1, \[x0\]
 **     stp     q\1, q\1, \[x0, #?32\]
 **     ret
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/slp_1.c b/gcc/testsuite/gcc.target/aarch64/sve/slp_1.c
index 739e63a96a1c..ddf4c23869f7 100644
--- a/gcc/testsuite/gcc.target/aarch64/sve/slp_1.c
+++ b/gcc/testsuite/gcc.target/aarch64/sve/slp_1.c
@@ -30,14 +30,13 @@ vec_slp_##TYPE (TYPE *restrict a, TYPE b, TYPE c, int n)    \
 TEST_ALL (VEC_PERM)
 
 /* We should use one DUP for each of the 8-, 16- and 32-bit types,
-   (for now, insert both elements with umov + ins for _Float16).  We should use two
+   (for now, insert both elements with ins for _Float16).  We should use two
    DUPs for each of the three 64-bit types.  */
 /* { dg-final { scan-assembler-times {\tmov\tz[0-9]+\.h, [hw]} 2 } } */
 /* { dg-final { scan-assembler-times {\tmov\tz[0-9]+\.s, [sw]} 3 } } */
 /* { dg-final { scan-assembler-times {\tmov\tz[0-9]+\.d, [dx]} 9 } } */
-/* { dg-final { scan-assembler-times {\tumov\tw[0-9]+, v[0-9]+\.h} 2 } } */
-/* { dg-final { scan-assembler-times {\tins\tv[0-9]+\.h\[0\], w[0-9]+} 3 } } */
-/* { dg-final { scan-assembler-times {\tins\tv[0-9]+\.h\[1\], w[0-9]+} 3 } } */
+/* { dg-final { scan-assembler-times {\tins\tv[0-9]+\.h\[0\], v[0-9]+\.h\[0\]} 3 } } */
+/* { dg-final { scan-assembler-times {\tins\tv[0-9]+\.h\[1\], v[0-9]+\.h\[0\]} 3 } } */
 /* { dg-final { scan-assembler-times {\tzip1\tz[0-9]+\.d, z[0-9]+\.d, z[0-9]+\.d\n} 3 } } */
 /* { dg-final { scan-assembler-not {\tzip2\t} } } */
 
diff --git a/gcc/testsuite/gcc.target/aarch64/vec-init-18.c b/gcc/testsuite/gcc.target/aarch64/vec-init-18.c
index ecb59fe510b6..99e84096708d 100644
--- a/gcc/testsuite/gcc.target/aarch64/vec-init-18.c
+++ b/gcc/testsuite/gcc.target/aarch64/vec-init-18.c
@@ -15,6 +15,8 @@ int16x8_t foo2(int16_t x)
   return v;
 }
 
-/* { dg-final { scan-assembler-times {\tdup\tv[0-9]+\.4h, w[0-9]+} 3 } } */
-/* { dg-final { scan-assembler {\tmovi\tv[0-9]+\.4h, 0x1} } } */
-/* { dg-final { scan-assembler-times {\tzip1\tv[0-9]+\.8h, v[0-9]+\.8h, v[0-9]+\.8h} 2 } } */
+/* { dg-final { scan-assembler-times {\tdup\tv[0-9]+\.4s, v[0-9]+\.s\[0\]} 1 } } */
+/* { dg-final { scan-assembler-times {\tdup\tv[0-9]+\.4s, w[0-9]+} 1 } } */
+/* { dg-final { scan-assembler-times {\tmov\tw[0-9]+, 65537} 1 } } */
+/* { dg-final { scan-assembler-times {\tbfi\tw[0-9]+, w[0-9]+, 0, 16} 1 } } */
+/* { dg-final { scan-assembler-times {\tbfi\tw[0-9]+, w[0-9]+, 16, 16} 1 } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/vec-init-23.c b/gcc/testsuite/gcc.target/aarch64/vec-init-23.c
new file mode 100644
index 000000000000..940fe34c3251
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/vec-init-23.c
@@ -0,0 +1,435 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -march=armv8.2-a+fp16" } */
+/* { dg-final { check-function-bodies "**" "" "" } } */
+
+#include <arm_neon.h>
+
+/* Check vector initialization with a repeating sequence of elements.  */
+
+#ifndef TESTCASE /* Default definition; the T argument is not used by it.  */
+#define TESTCASE(TYPE, ETYPE, T, SZ, NUM, MULT, ...)\
+  TYPE##SZ##MULT##_t test_##TYPE##SZ##_##NUM (ETYPE x0, ETYPE x1, ETYPE x2, ETYPE x3,\
+                                          ETYPE x4, ETYPE x5, ETYPE x6, ETYPE x7)\
+  {\
+    return (TYPE##SZ##MULT##_t) {__VA_ARGS__};\
+  }
+#endif /* !TESTCASE */
+
+#define TEST_8(TYPE, ETYPE, T)\
+    TESTCASE (TYPE, ETYPE, T, 8, 1, x16, x0, x0, x0, x0, x0, x0, x0, x0,\
+                              x0, x0, x0, x0, x0, x0, x0, x0)\
+    TESTCASE (TYPE, ETYPE, T, 8, 2, x16, x0, x1, x0, x1, x0, x1, x0, x1,\
+                              x0, x1, x0, x1, x0, x1, x0, x1)\
+    TESTCASE (TYPE, ETYPE, T, 8, 3, x16, x0, x1, x2, x3, x0, x1, x2, x3,\
+                              x0, x1, x2, x3, x0, x1, x2, x3)\
+    TESTCASE (TYPE, ETYPE, T, 8, 4, x16, x0, x1, x2, x3, x4, x5, x6, x7,\
+                              x0, x1, x2, x3, x4, x5, x6, x7)\
+    TESTCASE (TYPE, ETYPE, T, 8, 5, x16, x0, 0, x0, 0, x0, 0, x0, 0,\
+                              x0, 0, x0, 0, x0, 0, x0, 0)\
+    TESTCASE (TYPE, ETYPE, T, 8, 6, x16, 0, x0, 0, x0, 0, x0, 0, x0,\
+                              0, x0, 0, x0, 0, x0, 0, x0)\
+    TESTCASE (TYPE, ETYPE, T, 8, 7, x16, x0, x1, 0, 1, x0, x1, 0, 1,\
+                              x0, x1, 0, 1, x0, x1, 0, 1)\
+    TESTCASE (TYPE, ETYPE, T, 8, 8, x16, 0, 1, x0, x1, 0, 1, x0, x1,\
+                          0, 1, x0, x1, 0, 1, x0, x1)\
+    TESTCASE (TYPE, ETYPE, T, 8, 9, x16, x0, 0, x1, 1, x0, 0, x1, 1,\
+                              x0, 0, x1, 1, x0, 0, x1, 1)\
+    TESTCASE (TYPE, ETYPE, T, 8, 10, x16, x0, 0, x1, 1, x2, 2, x3, 3,\
+                              x0, 0, x1, 1, x2, 2, x3, 3)\
+    TESTCASE (TYPE, ETYPE, T, 8, 11, x16, 0, x0, 1, x1, 2, x2, 3, x3,\
+                              0, x0, 1, x1, 2, x2, 3, x3)\
+    TESTCASE (TYPE, ETYPE, T, 8, 12, x16, x0, x1, 0, 1, x2, x3, 2, 3,\
+                              x0, x1, 0, 1, x2, x3, 2, 3)\
+    TESTCASE (TYPE, ETYPE, T, 8, 13, x16, 0, 1, x0, x1, 2, 3, x2, x3,\
+                              0, 1, x0, x1, 2, 3, x2, x3)
+
+#define TEST_16(TYPE, ETYPE, T)\
+    TESTCASE (TYPE, ETYPE, T, 16, 1, x8, x0, x0, x0, x0, x0, x0, x0, x0)\
+    TESTCASE (TYPE, ETYPE, T, 16, 2, x8, x0, x1, x0, x1, x0, x1, x0, x1)\
+    TESTCASE (TYPE, ETYPE, T, 16, 3, x8, x0, x1, x2, x3, x0, x1, x2, x3)\
+    TESTCASE (TYPE, ETYPE, T, 16, 4, x8, x0, 0, x0, 0, x0, 0, x0, 0)\
+    TESTCASE (TYPE, ETYPE, T, 16, 5, x8, 0, x0, 0, x0, 0, x0, 0, x0)\
+    TESTCASE (TYPE, ETYPE, T, 16, 6, x8, x0, x1, 0, 1, x0, x1, 0, 1)\
+    TESTCASE (TYPE, ETYPE, T, 16, 7, x8, 0, 1, x0, x1, 0, 1, x0, x1)\
+    TESTCASE (TYPE, ETYPE, T, 16, 8, x8, 0, x0, 1, x1, 0, x0, 1, x1)
+
+#define TEST_32(TYPE, ETYPE, T)\
+    TESTCASE (TYPE, ETYPE, T, 32, 1, x4, x0, x0, x0, x0)\
+    TESTCASE (TYPE, ETYPE, T, 32, 2, x4, x0, x1, x0, x1)\
+    TESTCASE (TYPE, ETYPE, T, 32, 3, x4, x0, 0, x0, 0)\
+    TESTCASE (TYPE, ETYPE, T, 32, 4, x4, 0, x0, 0, x0)
+
+#define TEST_64(TYPE, ETYPE, T)\
+    TESTCASE (TYPE, ETYPE, T, 64, 1, x2, x0, x0)
+
+TEST_8(int, int8_t, s)
+
+TEST_16(float, float, f)
+TEST_16(int, int16_t, s)
+
+TEST_32(float, float, f)
+TEST_32(int, int32_t, s)
+
+TEST_64(float, double, f)
+TEST_64(int, int64_t, s)
+
+/*
+** test_int8_1:
+**     dup     v0\.16b, w0
+**     ret
+*/
+
+/*
+** test_int8_2:
+**     bfi     w0, w1, 8, 8
+**     dup     v0\.8h, w0
+**     ret
+*/
+
+/*
+** test_int8_3:
+**     bfi     w0, w1, 8, 8
+**     bfi     w0, w2, 16, 8
+**     bfi     w0, w3, 24, 8
+**     dup     v0\.4s, w0
+**     ret
+*/
+
+/*
+** test_int8_4:
+**     bfi     w0, w2, 8, 8
+**     bfi     w1, w3, 8, 8
+**     bfi     w0, w4, 16, 8
+**     bfi     w1, w5, 16, 8
+**     bfi     w0, w6, 24, 8
+**     bfi     w1, w7, 24, 8
+**     dup     v31\.2s, w0
+**     dup     v0\.2s, w1
+**     zip1    v0\.16b, v31\.16b, v0\.16b
+**     ret
+*/
+
+/*
+** test_int8_5:
+**     mov     w1, 0
+**     bfi     w1, w0, 0, 8
+**     dup     v0\.8h, w1
+**     ret
+*/
+
+/*
+** test_int8_6:
+**     mov     w1, 0
+**     bfi     w1, w0, 8, 8
+**     dup     v0\.8h, w1
+**     ret
+*/
+
+/*
+** test_int8_7:
+**     mov     w2, 16777472
+**     bfi     w2, w0, 0, 8
+**     bfi     w2, w1, 8, 8
+**     dup     v0\.4s, w2
+**     ret
+*/
+
+/*
+** test_int8_8:
+**     mov     w2, 16777472
+**     bfi     w2, w0, 16, 8
+**     bfi     w2, w1, 24, 8
+**     dup     v0\.4s, w2
+**     ret
+*/
+
+/*
+** test_int8_9:
+**     mov     w2, 16777216
+**     bfi     w2, w0, 0, 8
+**     bfi     w2, w1, 16, 8
+**     dup     v0\.4s, w2
+**     ret
+*/
+
+/*
+** test_int8_10:
+**     bfi     w0, w1, 8, 8
+**     bfi     w0, w2, 16, 8
+**     bfi     w0, w3, 24, 8
+**     dup     v31\.2s, w0
+**     adrp    x0, .LANCHOR[0-9]+
+**     ldr     d0, \[x0, #:lo12:.LANCHOR[0-9]+\]
+**     zip1    v0\.16b, v31\.16b, v0\.16b
+**     ret
+*/
+
+/*
+** test_int8_11:
+**     bfi     w0, w1, 8, 8
+**     adrp    x4, .LANCHOR[0-9]+
+**     bfi     w0, w2, 16, 8
+**     ldr     d0, \[x4, #:lo12:\.LANCHOR[0-9]+\]
+**     bfi     w0, w3, 24, 8
+**     dup     v31\.2s, w0
+**     zip1    v0\.16b, v0\.16b, v31\.16b
+**     ret
+*/
+
+/*
+** test_int8_12:
+**     mov     w4, 33685504
+**     bfi     w4, w0, 0, 8
+**     mov     w0, 257
+**     movk    w0, 0x303, lsl 16
+**     bfi     w0, w1, 0, 8
+**     bfi     w4, w2, 16, 8
+**     bfi     w0, w3, 16, 8
+**     dup     v31\.2s, w4
+**     dup     v0\.2s, w0
+**     zip1    v0\.16b, v31\.16b, v0\.16b
+**     ret
+*/
+
+/*
+** test_int8_13:
+**     mov     w4, 33685504
+**     bfi     w4, w0, 8, 8
+**     mov     w0, 257
+**     movk    w0, 0x303, lsl 16
+**     bfi     w0, w1, 8, 8
+**     bfi     w4, w2, 24, 8
+**     bfi     w0, w3, 24, 8
+**     dup     v31\.2s, w4
+**     dup     v0\.2s, w0
+**     zip1    v0\.16b, v31\.16b, v0\.16b
+**     ret
+*/
+
+/*
+** test_float16_1:
+**     fcvt    h0, s0
+**     dup     v0\.8h, v0\.h\[0\]
+**     ret
+*/
+
+/*
+** test_float16_2:
+**     fcvt    h1, s1
+**     fcvt    h0, s0
+**     ins     v0\.h\[1\], v1\.h\[0\]
+**     dup     v0\.4s, v0\.s\[0\]
+**     ret
+*/
+
+/*
+** test_float16_3:
+**     uzp1    v2\.2s, v0\.2s, v2\.2s
+**     uzp1    v3\.2s, v1\.2s, v3\.2s
+**     zip1    v3\.4s, v2\.4s, v3\.4s
+**     fcvtn   v0\.4h, v3\.4s
+**     uzp1    v0\.2d, v0\.2d, v0\.2d
+**     ret
+*/
+
+/*
+** test_float16_4:
+**     fcvt    h0, s0
+**     movi    v31\.2d, #0
+**     ins     v31\.h\[0\], v0\.h\[0\]
+**     dup     v0\.4s, v31\.s\[0\]
+**     ret
+*/
+
+/*
+** test_float16_5:
+**     fcvt    h0, s0
+**     movi    v31\.2d, #0
+**     ins     v31\.h\[1\], v0\.h\[0\]
+**     dup     v0\.4s, v31\.s\[0\]
+**     ret
+*/
+
+/*
+** test_float16_6:
+**     fcvt    h1, s1
+**     fcvt    h0, s0
+**     movi    v31\.2d, #0
+**     mov     w0, 1006648320
+**     umov    w1, v1\.h\[0\]
+**     ins     v31\.h\[0\], v0\.h\[0\]
+**     bfi     w0, w1, 0, 16
+**     dup     v31\.2s, v31\.s\[0\]
+**     dup     v0\.2s, w0
+**     zip1    v0\.8h, v31\.8h, v0\.8h
+**     ret
+*/
+
+/*
+** test_float16_7:
+**     fcvt    h1, s1
+**     fcvt    h0, s0
+**     movi    v31\.2d, #0
+**     mov     w0, 1006648320
+**     umov    w1, v1\.h\[0\]
+**     ins     v31\.h\[1\], v0\.h\[0\]
+**     bfi     w0, w1, 16, 16
+**     dup     v31\.2s, v31\.s\[0\]
+**     dup     v0\.2s, w0
+**     zip1    v0\.8h, v31\.8h, v0\.8h
+**     ret
+*/
+
+/*
+** test_float16_8:
+**     fcvt    h1, s1
+**     fcvt    h0, s0
+**     movi    v31\.2s, 0x3c, lsl 24
+**     ins     v0\.h\[1\], v1\.h\[0\]
+**     dup     v0\.2s, v0\.s\[0\]
+**     zip1    v0\.8h, v31\.8h, v0\.8h
+**     ret
+*/
+
+/*
+** test_int16_1:
+**     dup     v0\.8h, w0
+**     ret
+*/
+
+/*
+** test_int16_2:
+**     bfi     w0, w1, 16, 16
+**     dup     v0\.4s, w0
+**     ret
+*/
+
+/*
+** test_int16_3:
+**     bfi     w0, w2, 16, 16
+**     bfi     w1, w3, 16, 16
+**     dup     v31\.2s, w0
+**     dup     v0\.2s, w1
+**     zip1    v0\.8h, v31\.8h, v0\.8h
+**     ret
+*/
+
+/*
+** test_int16_4:
+**     mov     w1, 0
+**     bfi     w1, w0, 0, 16
+**     dup     v0\.4s, w1
+**     ret
+*/
+
+/*
+** test_int16_5:
+**     mov     w1, 0
+**     bfi     w1, w0, 16, 16
+**     dup     v0\.4s, w1
+**     ret
+*/
+
+/*
+** test_int16_6:
+**     mov     w2, 0
+**     bfi     w2, w0, 0, 16
+**     mov     w0, 65537
+**     bfi     w0, w1, 0, 16
+**     dup     v31\.2s, w2
+**     dup     v0\.2s, w0
+**     zip1    v0\.8h, v31\.8h, v0\.8h
+**     ret
+*/
+
+/*
+** test_int16_7:
+**     mov     w2, 0
+**     bfi     w2, w0, 16, 16
+**     mov     w0, 65537
+**     bfi     w0, w1, 16, 16
+**     dup     v31\.2s, w2
+**     dup     v0\.2s, w0
+**     zip1    v0\.8h, v31\.8h, v0\.8h
+**     ret
+*/
+
+/*
+** test_int16_8:
+**     bfi     w0, w1, 16, 16
+**     movi    v0\.2s, 0x1, lsl 16
+**     dup     v31\.2s, w0
+**     zip1    v0\.8h, v0\.8h, v31\.8h
+**     ret
+*/
+
+/*
+** test_float32_1:
+**     dup     v0\.4s, v0\.s\[0\]
+**     ret
+*/
+
+/*
+** test_float32_2:
+**     uzp1    v0\.2s, v0\.2s, v1\.2s
+**     dup     v0\.2d, v0\.d\[0\]
+**     ret
+*/
+
+/*
+** test_float32_3:
+**     movi    v31\.2s, 0
+**     dup     v0\.2s, v0\.s\[0\]
+**     zip1    v0\.4s, v0\.4s, v31\.4s
+**     ret
+*/
+
+/*
+** test_float32_4:
+**     movi    v31\.2s, 0
+**     dup     v0\.2s, v0\.s\[0\]
+**     zip1    v0\.4s, v31\.4s, v0\.4s
+**     ret
+*/
+
+/*
+** test_int32_1:
+**     dup     v0\.4s, w0
+**     ret
+*/
+
+/*
+** test_int32_2:
+**     fmov    s0, w0
+**     ins     v0\.s\[1\], w1
+**     dup     v0\.2d, v0\.d\[0\]
+**     ret
+*/
+
+/*
+** test_int32_3:
+**     dup     v31\.2s, w0
+**     movi    v0\.2s, 0
+**     zip1    v0\.4s, v31\.4s, v0\.4s
+**     ret
+*/
+
+/*
+** test_int32_4:
+**     dup     v31\.2s, w0
+**     movi    v0\.2s, 0
+**     zip1    v0\.4s, v0\.4s, v31\.4s
+**     ret
+*/
+
+/*
+** test_float64_1:
+**     dup     v0\.2d, v0\.d\[0\]
+**     ret
+*/
+
+/*
+** test_int64_1:
+**     dup     v0\.2d, x0
+**     ret
+*/

Reply via email to