[gcc r17-896] aarch64: introduce partial AdvSIMD vector modes

Artemiy Volkov via Gcc-cvs Thu, 28 May 2026 04:28:58 -0700

https://gcc.gnu.org/g:44a31df54837adf2f7815e7966dfe8ac32eb8f3b


commit r17-896-g44a31df54837adf2f7815e7966dfe8ac32eb8f3b
Author: Artemiy Volkov <[email protected]>
Date:   Mon May 18 10:21:18 2026 +0000

    aarch64: introduce partial AdvSIMD vector modes
    
    In addition to V2HF that already exists, this patch adds 4 more partial
    (16- and 32-bit) AdvSIMD vector modes: V4QI, V2QI, V2HI, and V2BF.  For
    now, these are intended only for duplication into full-sized (32-, 64-,
    and 128-bit) registers.  As a minimal closure required to bootstrap the
    compiler, this also implements the "mov" expand and the "aarch64_simd_mov"
    insn_and_split for the new modes (gathered under the VSUB64 iterator).
    
    This patch also adds the new aarch64_advsimd_sub_dword_mode_p () helper to
    facilitate detecting the new modes; that is then used (a) to disable
    vec_perm_const vectorization for those modes, (b) in the "mov" expander
    for those modes, and (c) to define the new "Da" constraint.
    
    Some existing testcases were adjusted where needed.  (The _Float16
    testcase in sve/slp_1.c temporarily expects GPRs to be used for V2HF,
    which is corrected to FPRs by the succeeding patch; and the half-float
    complex tests now recognize some of the patterns, but check that V2BF
    still can't be used for vectorization.)
    
    gcc/ChangeLog:
    
            * config/aarch64/aarch64-modes.def (VECTOR_MODE): Remove V2HF.
            (VECTOR_MODES): Define V2QI, V4QI, V2HI, V2HF, V2BF.
            * config/aarch64/aarch64-protos.h
            (aarch64_advsimd_sub_dword_mode_p): Declare new predicate.
            * config/aarch64/aarch64-simd.md (*aarch64_simd_mov<mode>): New
            define_insn_and_split pattern.
            (mov<mode>): Add sub-64-bit vector modes to the VALL_F16 expander.
            Forego const vector expansion for those modes.
            * config/aarch64/aarch64.cc (aarch64_classify_vector_mode):
            Handle 16- and 32-bit vector modes.
            (aarch64_advsimd_sub_dword_mode_p): Define new predicate.
            (aarch64_vectorize_vec_perm_const): Refuse for partial vector modes.
            * config/aarch64/constraints.md (Da): New constraint.
            * config/aarch64/iterators.md (VSUB64): New iterator.
            (VALL_F16_SUB64): Likewise.
            (size): Define attribute for sub-64-bit vector modes.
            (VSC): New mode attribute.
            (vstype): Likewise.
    
    gcc/testsuite/ChangeLog:
    
            * gcc.dg/vect/complex/bb-slp-complex-add-half-float.c: Adjust testcase.
            * gcc.dg/vect/complex/bb-slp-complex-mla-half-float.c: Likewise.
            * gcc.dg/vect/complex/bb-slp-complex-mul-half-float.c: Likewise.
            * gcc.target/aarch64/sve/slp_1.c: Likewise.

Diff:
---
 gcc/config/aarch64/aarch64-modes.def               |  4 +-
 gcc/config/aarch64/aarch64-protos.h                |  1 +
 gcc/config/aarch64/aarch64-simd.md                 | 64 +++++++++++++++++++++-
 gcc/config/aarch64/aarch64.cc                      | 18 ++++++
 gcc/config/aarch64/constraints.md                  |  5 ++
 gcc/config/aarch64/iterators.md                    | 19 ++++++-
 .../vect/complex/bb-slp-complex-add-half-float.c   |  3 +
 .../vect/complex/bb-slp-complex-mla-half-float.c   |  4 +-
 .../vect/complex/bb-slp-complex-mul-half-float.c   |  7 ++-
 gcc/testsuite/gcc.target/aarch64/sve/slp_1.c       | 11 ++--
 10 files changed, 123 insertions(+), 13 deletions(-)

diff --git a/gcc/config/aarch64/aarch64-modes.def b/gcc/config/aarch64/aarch64-modes.def
index d9bff61adec1..d5a54689f7aa 100644
--- a/gcc/config/aarch64/aarch64-modes.def
+++ b/gcc/config/aarch64/aarch64-modes.def
@@ -79,8 +79,10 @@ VECTOR_MODES (FLOAT, 8);      /*                 V2SF.  */
 VECTOR_MODES (FLOAT, 16);     /*            V4SF V2DF.  */
 VECTOR_MODE (INT, DI, 1);     /*                 V1DI.  */
 VECTOR_MODE (FLOAT, DF, 1);   /*                 V1DF.  */
-VECTOR_MODE (FLOAT, HF, 2);   /*                 V2HF.  */
 
+VECTOR_MODES (INT, 2);        /*                 V2QI.  */
+VECTOR_MODES (INT, 4);        /*            V4QI V2HI.  */
+VECTOR_MODES (FLOAT, 4);      /*            V2BF V2HF.  */
 
 /* Integer vector modes used to represent intermediate widened values in some
    instructions.  Not intended to be moved to and from registers or memory.  */
diff --git a/gcc/config/aarch64/aarch64-protos.h b/gcc/config/aarch64/aarch64-protos.h
index 24da650da76f..513b556398fa 100644
--- a/gcc/config/aarch64/aarch64-protos.h
+++ b/gcc/config/aarch64/aarch64-protos.h
@@ -872,6 +872,7 @@ bool aarch64_and_bitmask_imm (unsigned HOST_WIDE_INT val_in, machine_mode mode);
 int aarch64_branch_cost (bool, bool);
 enum aarch64_symbol_type aarch64_classify_symbolic_expression (rtx);
 bool aarch64_advsimd_struct_mode_p (machine_mode mode);
+bool aarch64_advsimd_sub_dword_mode_p (machine_mode mode);
 opt_machine_mode aarch64_v64_mode (scalar_mode);
 opt_machine_mode aarch64_v128_mode (scalar_mode);
 opt_machine_mode aarch64_full_sve_mode (scalar_mode);
diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
index 7496da3a70c1..2b7f6b467c62 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -49,8 +49,8 @@
 (define_subst_attr "vczbe" "add_vec_concat_subst_be" "" "_vec_concatz_be")
 
 (define_expand "mov<mode>"
-  [(set (match_operand:VALL_F16 0 "nonimmediate_operand")
-       (match_operand:VALL_F16 1 "general_operand"))]
+  [(set (match_operand:VALL_F16_SUB64 0 "nonimmediate_operand")
+       (match_operand:VALL_F16_SUB64 1 "general_operand"))]
   "TARGET_FLOAT"
   "
   /* Force the operand into a register if it is not an
@@ -77,7 +77,8 @@
          aarch64_expand_vector_init (operands[0], operands[1]);
          DONE;
        }
-      else if (!aarch64_simd_imm_zero (operands[1], <MODE>mode)
+      else if (!aarch64_advsimd_sub_dword_mode_p (<MODE>mode)
+              && !aarch64_simd_imm_zero (operands[1], <MODE>mode)
               && !aarch64_simd_special_constant_p (operands[1], <MODE>mode)
               && !aarch64_simd_valid_mov_imm (operands[1])
               && !aarch64_const_vec_fmov_p (operands[1]))
@@ -244,6 +245,63 @@
   }
 )
 
+(define_insn_and_split "*aarch64_simd_mov<mode>"
+  [(set (match_operand:VSUB64 0 "nonimmediate_operand")
+       (match_operand:VSUB64 1 "general_operand"))]
+  "TARGET_FLOAT
+   && (register_operand (operands[0], <MODE>mode)
+       || aarch64_simd_reg_or_zero (operands[1], <MODE>mode)
+       || CONST_VECTOR_P (operands[1]))"
+   {@ [cons: =0, 1; attrs: type, arch]
+     [r , Dz ; mov_imm          , *    ] mov\t%w0, 0
+     [r , rZ ; mov_reg          , *    ] mov\t%w0, %w1
+     [r , Da ; mov_imm          , *    ] #
+     [r , w  ; mov_reg          , simd ] #
+     [r , m  ; load_4           , *    ] ldr<size>\t%w0, %1
+     [w , w  ; neon_logic       , simd ] mov\t%0.8b, %1.8b
+     [w , m  ; neon_load1_1reg  , simd ] ldr\t%<vstype>0, %1
+     [w , Dz ; neon_move        , simd ] movi\t%0.2d, #0
+     [m , rZ ; store_4          , *    ] str<size>\t%w1, %0
+     [m , w  ; neon_store1_1reg , simd ] str\t%<vstype>1, %0
+  }
+  "&& reload_completed
+   && REG_P (operands[0])"
+  [(const_int 0)]
+  {
+    if (CONST_VECTOR_P (operands[1]))
+      {
+       int elt_bitsize
+        = GET_MODE_BITSIZE (GET_MODE_INNER (GET_MODE (operands[1])));
+       int n_elts = CONST_VECTOR_NUNITS (operands[1]).to_constant ();
+       int val = 0;
+       bool int_vector_p = CONST_INT_P (CONST_VECTOR_ELT (operands[1], 0));
+       unsigned HOST_WIDE_INT eltval;
+       rtx elt;
+       for (int i = 0; i < n_elts; i++)
+        {
+           elt = CONST_VECTOR_ELT (operands[1], BYTES_BIG_ENDIAN
+                                                ? i
+                                                : n_elts - 1 - i);
+           if (int_vector_p)
+            eltval = INTVAL (elt);
+           else
+            {
+               bool res = aarch64_reinterpret_float_as_int (elt, &eltval);
+               gcc_assert (res);
+            }
+
+           val = (val << elt_bitsize) + (eltval & ((1 << elt_bitsize) - 1));
+        }
+       emit_move_insn (gen_rtx_REG (SImode, REGNO (operands[0])),
+                      GEN_INT (val));
+      }
+    else if (REG_P (operands[1]))
+      aarch64_simd_emit_reg_reg_move (operands, <VSC>mode, 1);
+    DONE;
+  }
+  [(set_attr "type" "mov_reg")]
+)
+
 ;; When storing lane zero we can use the normal STR and its more permissive
 ;; addressing modes.
 
diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
index 40bbb92ed740..4ed24c869652 100644
--- a/gcc/config/aarch64/aarch64.cc
+++ b/gcc/config/aarch64/aarch64.cc
@@ -1778,6 +1778,13 @@ aarch64_classify_vector_mode (machine_mode mode, bool any_target_p = false)
     case E_V4x2DFmode:
       return (TARGET_FLOAT || any_target_p) ? VEC_ADVSIMD | VEC_STRUCT : 0;
 
+    /* 16-bit Advanced SIMD vectors.  */
+    case E_V2QImode:
+    /* 32-bit Advanced SIMD vectors.  */
+    case E_V2HFmode:
+    case E_V2BFmode:
+    case E_V2HImode:
+    case E_V4QImode:
     /* 64-bit Advanced SIMD vectors.  */
     case E_V8QImode:
     case E_V4HImode:
@@ -1856,6 +1863,14 @@ aarch64_advsimd_full_struct_mode_p (machine_mode mode)
   return (aarch64_classify_vector_mode (mode) == (VEC_ADVSIMD | VEC_STRUCT));
 }
 
+/* Return true if MODE is a partial (sub-64-bit) Advanced SIMD mode.  */
+bool
+aarch64_advsimd_sub_dword_mode_p (machine_mode mode)
+{
+  return (aarch64_classify_vector_mode (mode) == VEC_ADVSIMD)
+        && known_lt (GET_MODE_BITSIZE (mode), 64);
+}
+
 /* Return true if MODE is any of the data vector modes, including
    structure modes.  */
 static bool
@@ -28415,6 +28430,9 @@ aarch64_vectorize_vec_perm_const (machine_mode vmode, machine_mode op_mode,
 {
   struct expand_vec_perm_d d;
 
+  if (aarch64_advsimd_sub_dword_mode_p (op_mode))
+    return false;
+
   /* Check whether the mask can be applied to a single vector.  */
   if (sel.ninputs () == 1
       || (op0 && rtx_equal_p (op0, op1)))
diff --git a/gcc/config/aarch64/constraints.md b/gcc/config/aarch64/constraints.md
index 8760220835b7..829b2c949d07 100644
--- a/gcc/config/aarch64/constraints.md
+++ b/gcc/config/aarch64/constraints.md
@@ -531,6 +531,11 @@
  (and (match_code "const_int")
       (match_test "aarch64_simd_scalar_immediate_valid_for_move (op,
                                                 QImode)")))
+(define_constraint "Da"
+  "@internal
+  A constraint that matches any constant vector of a sub-64-bit AdvSIMD mode."
+  (and (match_code "const_vector")
+       (match_test "aarch64_advsimd_sub_dword_mode_p (GET_MODE (op))")))
 
 (define_constraint "Dt"
   "@internal
diff --git a/gcc/config/aarch64/iterators.md b/gcc/config/aarch64/iterators.md
index 39b1e84edcc2..dfca3327f1fa 100644
--- a/gcc/config/aarch64/iterators.md
+++ b/gcc/config/aarch64/iterators.md
@@ -227,10 +227,17 @@
 ;; All Advanced SIMD integer modes
 (define_mode_iterator VALLI [VDQ_BHSI V2DI])
 
+;; All sub-64-bit vector modes.
+(define_mode_iterator VSUB64 [V2QI V4QI V2HI V2HF V2BF])
+
 ;; All Advanced SIMD modes suitable for moving, loading, and storing.
 (define_mode_iterator VALL_F16 [V8QI V16QI V4HI V8HI V2SI V4SI V2DI
                                V4HF V8HF V4BF V8BF V2SF V4SF V2DF])
 
+;; All Advanced SIMD modes suitable for moving, loading, and storing,
+;; plus all sub-64-bit vector modes.
+(define_mode_iterator VALL_F16_SUB64 [VALL_F16 VSUB64])
+
 ;; The VALL_F16 modes except the 128-bit 2-element ones.
 (define_mode_iterator VALL_F16_NO_V2Q [V8QI V16QI V4HI V8HI V2SI V4SI
                                V4HF V8HF V2SF V4SF])
@@ -1466,7 +1473,9 @@
 (define_mode_attr s [(HF "h") (SF "s") (DF "d") (SI "s") (DI "d")])
 
 ;; Give the length suffix letter for a sign- or zero-extension.
-(define_mode_attr size [(QI "b") (HI "h") (SI "w")])
+(define_mode_attr size [(QI "b") (HI "h") (SI "w") (HF "") (BF "") (SF "")
+                       (V2QI "h") (V4QI "") (V2HI "")
+                       (V2HF "") (V2BF "")])
 
 ;; Give the number of bits in the mode
 (define_mode_attr sizen [(QI "8") (HI "16") (SI "32") (DI "64")])
@@ -1883,6 +1892,10 @@
                        (VNx4SI  "v2si") (VNx4SF "v2sf")
                        (VNx2DI  "di") (VNx2DF "df")])
 
+;; Sub-64-bit vector mode to equivalent scalar mode.
+(define_mode_attr VSC [(V4QI "SI") (V2QI "HI")
+                      (V2HI "SI") (V2HF "SF") (V2BF "SF")])
+
 (define_mode_attr vnx [(V4SI "vnx4si") (V2DI "vnx2di")])
 
 ;; 64-bit container modes the inner or scalar source mode.
@@ -2169,6 +2182,10 @@
                                (V2SI "q") (V2SF "q")
                                (DI   "q") (DF   "q")])
 
+;; Scalar size of a sub-64-bit vector mode.
+(define_mode_attr vstype [(V4QI "s") (V2QI "h")
+                         (V2HI "s") (V2BF "s") (V2HF "s")])
+
 ;; Define corresponding core/FP element mode for each vector mode.
 (define_mode_attr vw [(V8QI "w") (V16QI "w")
                      (V4HI "w") (V8HI "w")
diff --git a/gcc/testsuite/gcc.dg/vect/complex/bb-slp-complex-add-half-float.c b/gcc/testsuite/gcc.dg/vect/complex/bb-slp-complex-add-half-float.c
index 3f1cce569558..2cd2d9112cc1 100644
--- a/gcc/testsuite/gcc.dg/vect/complex/bb-slp-complex-add-half-float.c
+++ b/gcc/testsuite/gcc.dg/vect/complex/bb-slp-complex-add-half-float.c
@@ -12,3 +12,6 @@
 
 /* { dg-final { scan-tree-dump "add new stmt: \[^\n\r]*COMPLEX_ADD_ROT270" "slp1" { xfail *-*-* } } } */
 /* { dg-final { scan-tree-dump "add new stmt: \[^\n\r]*COMPLEX_ADD_ROT90" "slp1" { xfail *-*-* } } } */
+
+/* { dg-final { scan-tree-dump "Found COMPLEX_ADD_ROT90" "slp1" { xfail arm*-*-* } } } */
+/* { dg-final { scan-tree-dump "Found COMPLEX_ADD_ROT270" "slp1" { xfail arm*-*-* } } } */
diff --git a/gcc/testsuite/gcc.dg/vect/complex/bb-slp-complex-mla-half-float.c b/gcc/testsuite/gcc.dg/vect/complex/bb-slp-complex-mla-half-float.c
index 33e500f3f4cd..e7a349b49c69 100644
--- a/gcc/testsuite/gcc.dg/vect/complex/bb-slp-complex-mla-half-float.c
+++ b/gcc/testsuite/gcc.dg/vect/complex/bb-slp-complex-mla-half-float.c
@@ -8,5 +8,7 @@
 #define N 16
 #include "complex-mla-template.c"
 
+/* { dg-final { scan-tree-dump-times "add new stmt:\[^\n\r]*COMPLEX_FMA" 1 "slp1" { xfail *-*-* } } } */
+
 /* { dg-final { scan-tree-dump "Found COMPLEX_FMA_CONJ" "slp1" { xfail *-*-* } } } */
-/* { dg-final { scan-tree-dump "Found COMPLEX_FMA" "slp1"  { xfail *-*-* } } } */
+/* { dg-final { scan-tree-dump "Found COMPLEX_FMA" "slp1" { xfail arm*-*-* } } } */
diff --git a/gcc/testsuite/gcc.dg/vect/complex/bb-slp-complex-mul-half-float.c b/gcc/testsuite/gcc.dg/vect/complex/bb-slp-complex-mul-half-float.c
index 259dd6b2e067..06d08da41ad6 100644
--- a/gcc/testsuite/gcc.dg/vect/complex/bb-slp-complex-mul-half-float.c
+++ b/gcc/testsuite/gcc.dg/vect/complex/bb-slp-complex-mul-half-float.c
@@ -8,5 +8,8 @@
 #define N 16
 #include "complex-mul-template.c"
 
-/* { dg-final { scan-tree-dump "Found COMPLEX_MUL_CONJ" "slp1"  { xfail *-*-* } } } */
-/* { dg-final { scan-tree-dump "Found COMPLEX_MUL" "slp1"  { xfail *-*-* } } } */
+/* { dg-final { scan-tree-dump-times "add new stmt:\[^\n\r]*COMPLEX_MUL_CONJ" 1 "slp1" { xfail *-*-* } } } */
+/* { dg-final { scan-tree-dump-times "add new stmt:\[^\n\r]*COMPLEX_MUL" 1 "slp1" { xfail *-*-* } } } */
+
+/* { dg-final { scan-tree-dump "Found COMPLEX_MUL_CONJ" "slp1" { xfail arm*-*-* } } } */
+/* { dg-final { scan-tree-dump "Found COMPLEX_MUL" "slp1" { xfail arm*-*-* } } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/slp_1.c b/gcc/testsuite/gcc.target/aarch64/sve/slp_1.c
index 07d71a63414b..739e63a96a1c 100644
--- a/gcc/testsuite/gcc.target/aarch64/sve/slp_1.c
+++ b/gcc/testsuite/gcc.target/aarch64/sve/slp_1.c
@@ -30,12 +30,14 @@ vec_slp_##TYPE (TYPE *restrict a, TYPE b, TYPE c, int n)    \
 TEST_ALL (VEC_PERM)
 
 /* We should use one DUP for each of the 8-, 16- and 32-bit types,
-   although we currently use LD1RW for _Float16.  We should use two
+   (for now, insert both elements with umov + ins for _Float16).  We should use two
    DUPs for each of the three 64-bit types.  */
 /* { dg-final { scan-assembler-times {\tmov\tz[0-9]+\.h, [hw]} 2 } } */
-/* { dg-final { scan-assembler-times {\tmov\tz[0-9]+\.s, [sw]} 2 } } */
-/* { dg-final { scan-assembler-times {\tld1rw\tz[0-9]+\.s, } 1 } } */
+/* { dg-final { scan-assembler-times {\tmov\tz[0-9]+\.s, [sw]} 3 } } */
 /* { dg-final { scan-assembler-times {\tmov\tz[0-9]+\.d, [dx]} 9 } } */
+/* { dg-final { scan-assembler-times {\tumov\tw[0-9]+, v[0-9]+\.h} 2 } } */
+/* { dg-final { scan-assembler-times {\tins\tv[0-9]+\.h\[0\], w[0-9]+} 3 } } */
+/* { dg-final { scan-assembler-times {\tins\tv[0-9]+\.h\[1\], w[0-9]+} 3 } } */
 /* { dg-final { scan-assembler-times {\tzip1\tz[0-9]+\.d, z[0-9]+\.d, z[0-9]+\.d\n} 3 } } */
 /* { dg-final { scan-assembler-not {\tzip2\t} } } */
 
@@ -53,7 +55,6 @@ TEST_ALL (VEC_PERM)
 /* { dg-final { scan-assembler-times {\twhilelo\tp[0-7]\.s} 6 } } */
 /* { dg-final { scan-assembler-times {\twhilelo\tp[0-7]\.d} 6 } } */
 /* { dg-final { scan-assembler-not {\tldr} } } */
-/* { dg-final { scan-assembler-times {\tstr} 2 } } */
-/* { dg-final { scan-assembler-times {\tstr\th[0-9]+} 2 } } */
+/* { dg-final { scan-assembler-not {\tstr} } } */
 
 /* { dg-final { scan-assembler-not {\tuqdec} } } */

[gcc r17-896] aarch64: introduce partial AdvSIMD vector modes

Reply via email to