Hi All,
The following testcase
#pragma GCC target ("+sve")
extern char __attribute__ ((simd, const)) fn3 (int, short);
void test_fn3 (float *a, float *b, double *c, int n)
{
for (int i = 0; i < n; ++i)
a[i] = fn3 (b[i], c[i]);
}
at -Ofast ICEs because my previous patch only added support for combining 2
partial SVE vectors into a bigger vector.  However, there can also be 4- and
8-piece subvectors.
This patch fixes this by implementing the missing expansions.
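To illustrate (a standalone sketch, not part of the patch): the expansion now
packs the subvector pieces pairwise, halving the worklist each round, so an
8-piece constructor is built in log2(8) = 3 levels of UZP1 rather than a
linear chain.  The same strategy, mirrored on plain strings:

#include <iostream>
#include <string>
#include <vector>

int main ()
{
  /* Stand-ins for eight VNx2QI subvector pieces.  */
  std::vector<std::string> worklist
    = { "a", "b", "c", "d", "e", "f", "g", "h" };
  int nelts = (int) worklist.size ();
  while (nelts >= 2)
    {
      /* Each pack corresponds to one gen_aarch64_pack_partial (UZP1).  */
      for (int i = 0; i < nelts; i += 2)
        worklist[i / 2] = "(" + worklist[i] + "," + worklist[i + 1] + ")";
      nelts /= 2;
    }
  /* Prints (((a,b),(c,d)),((e,f),(g,h))).  */
  std::cout << worklist[0] << '\n';
}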
Bootstrapped and regtested on aarch64-none-linux-gnu with no issues.
Ok for master?
Thanks,
Tamar
gcc/ChangeLog:
PR target/96342
PR middle-end/118272
* config/aarch64/aarch64-sve.md (vec_init<mode><Vquad>,
vec_initvnx16qivnx2qi): New.
* config/aarch64/aarch64.cc (aarch64_sve_expand_vector_init_subvector):
Rewrite to support any power-of-two number of subvector pieces.
* config/aarch64/iterators.md (SVE_NO2E): Update to use SVE_NO4E.
(SVE_NO4E, Vquad): New.
gcc/testsuite/ChangeLog:
PR target/96342
PR middle-end/118272
* gcc.target/aarch64/vect-simd-clone-3.c: New test.
---
diff --git a/gcc/config/aarch64/aarch64-sve.md b/gcc/config/aarch64/aarch64-sve.md
index 6659bb4fcab34699f22ff883825de1cd67108203..35f55bfacfc3238a8a7aa69015f36ba32981af59 100644
--- a/gcc/config/aarch64/aarch64-sve.md
+++ b/gcc/config/aarch64/aarch64-sve.md
@@ -2839,6 +2839,7 @@ (define_expand "vec_init<mode><Vel>"
}
)
+;; Vector constructor combining two half vectors { a, b }
(define_expand "vec_init<mode><Vhalf>"
[(match_operand:SVE_NO2E 0 "register_operand")
(match_operand 1 "")]
@@ -2849,6 +2850,28 @@ (define_expand "vec_init<mode><Vhalf>"
}
)
+;; Vector constructor combining four quarter vectors { a, b, c, d }
+(define_expand "vec_init<mode><Vquad>"
+ [(match_operand:SVE_NO4E 0 "register_operand")
+ (match_operand 1 "")]
+ "TARGET_SVE"
+ {
+ aarch64_sve_expand_vector_init_subvector (operands[0], operands[1]);
+ DONE;
+ }
+)
+
+;; Vector constructor combining eight vectors { a, b, c, d, ... }
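+;; (Built from eight VNx2QI pieces; VNx16QI is the only mode needing an
+;; 8-piece constructor, so a single named pattern suffices rather than a
+;; mode iterator.)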
+(define_expand "vec_initvnx16qivnx2qi"
+ [(match_operand:VNx16QI 0 "register_operand")
+ (match_operand 1 "")]
+ "TARGET_SVE"
+ {
+ aarch64_sve_expand_vector_init_subvector (operands[0], operands[1]);
+ DONE;
+ }
+)
+
;; Shift an SVE vector left and insert a scalar into element 0.
(define_insn "vec_shl_insert_<mode>"
[(set (match_operand:SVE_FULL 0 "register_operand")
diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
index cb9b155826d12b622ae0df1736e4b042d01cf56a..e062cc00d1a548290377382c98ea8f3bb9310513 100644
--- a/gcc/config/aarch64/aarch64.cc
+++ b/gcc/config/aarch64/aarch64.cc
@@ -24898,18 +24898,51 @@ aarch64_sve_expand_vector_init_subvector (rtx target, rtx vals)
machine_mode mode = GET_MODE (target);
int nelts = XVECLEN (vals, 0);
- gcc_assert (nelts == 2);
+ gcc_assert (nelts % 2 == 0);
- rtx arg0 = XVECEXP (vals, 0, 0);
- rtx arg1 = XVECEXP (vals, 0, 1);
-
- /* If we have two elements and are concatting vector. */
- machine_mode elem_mode = GET_MODE (arg0);
+  /* We must be concatenating vectors.  */
+ machine_mode elem_mode = GET_MODE (XVECEXP (vals, 0, 0));
gcc_assert (VECTOR_MODE_P (elem_mode));
- arg0 = force_reg (elem_mode, arg0);
- arg1 = force_reg (elem_mode, arg1);
- emit_insn (gen_aarch64_pack_partial (mode, target, arg0, arg1));
+  auto_vec<rtx> worklist;
+  machine_mode wider_mode = elem_mode;
+
+  /* Force the values to registers and collect them in a worklist so that
+     the pairwise widening below can index them directly.  */
+  for (int i = 0; i < nelts; i++)
+    worklist.safe_push (force_reg (elem_mode, XVECEXP (vals, 0, i)));
+
+  /* Keep widening pairwise to have maximum throughput: a balanced
+     reduction tree needs only log2 (nelts) dependent packs, rather than
+     the nelts - 1 of a linear chain.  */
+  while (nelts >= 2)
+    {
+      wider_mode
+	= related_vector_mode (wider_mode, GET_MODE_INNER (wider_mode),
+			       GET_MODE_NUNITS (wider_mode) * 2).require ();
+
+      for (int i = 0; i < nelts; i += 2)
+	{
+	  rtx arg0 = worklist[i];
+	  rtx arg1 = worklist[i + 1];
+	  gcc_assert (GET_MODE (arg0) == GET_MODE (arg1));
+
+	  rtx tmp = gen_reg_rtx (wider_mode);
+	  emit_insn (gen_aarch64_pack_partial (wider_mode, tmp, arg0, arg1));
+	  worklist[i / 2] = tmp;
+	}
+
+      nelts /= 2;
+    }
+
+  gcc_assert (wider_mode == mode);
+  emit_move_insn (target, worklist[0]);
+
return;
}
diff --git a/gcc/config/aarch64/iterators.md b/gcc/config/aarch64/iterators.md
index 07b97547cb292e6d4dc1040173a5d5525fb268d5..60bad5c72bc667be19f8224af87ec5451b4b1a9a 100644
--- a/gcc/config/aarch64/iterators.md
+++ b/gcc/config/aarch64/iterators.md
@@ -140,9 +140,12 @@ (define_mode_iterator VQ_I [V16QI V8HI V4SI V2DI])
;; VQ without 2 element modes.
(define_mode_iterator VQ_NO2E [V16QI V8HI V4SI V8HF V4SF V8BF])
+;; SVE modes without 2 and 4 element modes.
+(define_mode_iterator SVE_NO4E [VNx16QI VNx8QI VNx8HI VNx8HF VNx8BF])
+
;; SVE modes without 2 element modes.
-(define_mode_iterator SVE_NO2E [VNx16QI VNx8QI VNx4QI VNx8HI VNx4HI VNx8HF
- VNx4HF VNx8BF VNx4BF VNx4SI VNx4SF])
+(define_mode_iterator SVE_NO2E [SVE_NO4E VNx4QI VNx4HI VNx4HF VNx4BF VNx4SI
+ VNx4SF])
;; 2 element quad vector modes.
(define_mode_iterator VQ_2E [V2DI V2DF])
@@ -1764,6 +1767,11 @@ (define_mode_attr Vhalf [(V8QI "v4qi") (V16QI "v8qi")
(VNx8BF "vnx4bf") (VNx4BF "vnx2bf")
(VNx4SI "vnx2si") (VNx4SF "vnx2sf")])
+;; Quarter modes of all vector modes, in lower-case.
+(define_mode_attr Vquad [(VNx16QI "vnx4qi") (VNx8QI "vnx2qi")
+ (VNx8HI "vnx2hi") (VNx8HF "vnx2hf")
+ (VNx8BF "vnx2bf")])
+
;; Single-element half modes of quad vector modes.
(define_mode_attr V1HALF [(V2DI "V1DI") (V2DF "V1DF")])
diff --git a/gcc/testsuite/gcc.target/aarch64/vect-simd-clone-3.c b/gcc/testsuite/gcc.target/aarch64/vect-simd-clone-3.c
new file mode 100644
index 0000000000000000000000000000000000000000..fb50da1f714ce929da0e35b7d9fead2d93476cb9
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/vect-simd-clone-3.c
@@ -0,0 +1,27 @@
+/* { dg-do compile } */
+/* { dg-options "-std=c99" } */
+/* { dg-additional-options "-O3 -march=armv8-a" } */
+
+#pragma GCC target ("+sve")
+extern char __attribute__ ((simd, const)) fn3 (int, short);
+void test_fn3 (float *a, float *b, double *c, int n)
+{
+ for (int i = 0; i < n; ++i)
+ a[i] = fn3 (b[i], c[i]);
+}
+
+/* { dg-final { scan-assembler {\s+_ZGVsMxvv_fn3\n} } } */
+
+extern char __attribute__ ((simd, const)) fn4 (int, char);
+void test_fn4 (float *a, float *b, double *c, int n)
+{
+ for (int i = 0; i < n; ++i)
+ a[i] = fn4 (b[i], c[i]);
+}
+
+/* { dg-final { scan-assembler {\s+_ZGVsMxvv_fn4\n} } } */
+
+/* { dg-final { scan-assembler-times {\s+uzp1\tz[0-9]+\.b, z[0-9]+\.b, z[0-9]+\.b\n} 6 } } */
+/* { dg-final { scan-assembler-times {\s+uzp1\tz[0-9]+\.h, z[0-9]+\.h, z[0-9]+\.h\n} 16 } } */
+/* { dg-final { scan-assembler-times {\s+uzp1\tz[0-9]+\.s, z[0-9]+\.s, z[0-9]+\.s\n} 24 } } */
+