Hi All,
The vectorizer now tries to maintain the target VF that the user requested by
increasing the unroll factor when the user used #pragma GCC unroll and we've
vectorized the loop.
This change makes the AArch64 backend honor this initial value being set by
the vectorizer.
Consider the loop
void f1 (int *restrict a, int n)
{
#pragma GCC unroll 4
for (int i = 0; i < n; i++)
a[i] *= 2;
}
The target can then choose to create multiple epilogues to deal with the "rest".
The example above now generates:
.L4:
ldr q31, [x2]
add v31.4s, v31.4s, v31.4s
str q31, [x2], 16
cmp x2, x3
bne .L4
as V4SI maintains the requested VF, but e.g. #pragma GCC unroll 8 generates:
.L4:
ldp q30, q31, [x2]
add v30.4s, v30.4s, v30.4s
add v31.4s, v31.4s, v31.4s
stp q30, q31, [x2], 32
cmp x3, x2
bne .L4
Note that as a follow-up I plan on looking into asking the vectorizer to
generate multiple epilogues when we unroll like this, as we can
re-request the same mode but without the unroll as the first epilogue.
At the moment I have only added a TODO, since e.g. for early break we don't
support vector epilogues yet, and multiple epilogues need some thought and
internal discussion.
Bootstrapped and regtested on aarch64-none-linux-gnu,
arm-none-linux-gnueabihf and x86_64-pc-linux-gnu
(-m32 and -m64) with no issues.
Ok for master?
Thanks,
Tamar
gcc/ChangeLog:
* config/aarch64/aarch64.cc
(aarch64_vector_costs::determine_suggested_unroll_factor): Use
m_suggested_unroll_factor instead of 1.
(aarch64_vector_costs::finish_cost): Add todo for epilogues.
gcc/testsuite/ChangeLog:
* gcc.target/aarch64/unroll-vect.c: New test.
---
diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
index 9e3f2885bccb62550c5fcfdf93d72fbc2e63233e..cf6f56a08d67044c8dc34578902eb4cb416641bd 100644
--- a/gcc/config/aarch64/aarch64.cc
+++ b/gcc/config/aarch64/aarch64.cc
@@ -18075,7 +18075,7 @@ aarch64_vector_costs::determine_suggested_unroll_factor ()
if (!sve && !TARGET_SVE2 && m_has_avg)
return 1;
- unsigned int max_unroll_factor = 1;
+ unsigned int max_unroll_factor = m_suggested_unroll_factor;
for (auto vec_ops : m_ops)
{
aarch64_simd_vec_issue_info const *vec_issue
@@ -18293,6 +18293,8 @@ aarch64_vector_costs::finish_cost (const vector_costs *uncast_scalar_costs)
m_costs[vect_body]);
m_suggested_unroll_factor = determine_suggested_unroll_factor ();
+ /* TODO: Add support for multiple epilogues and costing for early break. */
+
/* For gather and scatters there's an additional overhead for the first
iteration. For low count loops they're not beneficial so model the
overhead as loop prologue costs. */
diff --git a/gcc/testsuite/gcc.target/aarch64/unroll-vect.c b/gcc/testsuite/gcc.target/aarch64/unroll-vect.c
new file mode 100644
index 0000000000000000000000000000000000000000..3cb774ba95787ebee488fbe7306299ef28e6bb35
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/unroll-vect.c
@@ -0,0 +1,20 @@
+/* { dg-do compile } */
+/* { dg-additional-options "-O3 -march=armv8-a --param aarch64-autovec-preference=asimd-only -std=gnu99" } */
+/* { dg-final { check-function-bodies "**" "" "" } } */
+
+/*
+** f1:
+** ...
+** add v[0-9]+.4s, v[0-9]+.4s, v[0-9]+.4s
+** add v[0-9]+.4s, v[0-9]+.4s, v[0-9]+.4s
+** add v[0-9]+.4s, v[0-9]+.4s, v[0-9]+.4s
+** add v[0-9]+.4s, v[0-9]+.4s, v[0-9]+.4s
+** ...
+*/
+void f1 (int *restrict a, int n)
+{
+#pragma GCC unroll 16
+ for (int i = 0; i < n; i++)
+ a[i] *= 2;
+}
+
--
diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
index 9e3f2885bccb62550c5fcfdf93d72fbc2e63233e..cf6f56a08d67044c8dc34578902eb4cb416641bd 100644
--- a/gcc/config/aarch64/aarch64.cc
+++ b/gcc/config/aarch64/aarch64.cc
@@ -18075,7 +18075,7 @@ aarch64_vector_costs::determine_suggested_unroll_factor ()
if (!sve && !TARGET_SVE2 && m_has_avg)
return 1;
- unsigned int max_unroll_factor = 1;
+ unsigned int max_unroll_factor = m_suggested_unroll_factor;
for (auto vec_ops : m_ops)
{
aarch64_simd_vec_issue_info const *vec_issue
@@ -18293,6 +18293,8 @@ aarch64_vector_costs::finish_cost (const vector_costs *uncast_scalar_costs)
m_costs[vect_body]);
m_suggested_unroll_factor = determine_suggested_unroll_factor ();
+ /* TODO: Add support for multiple epilogues and costing for early break. */
+
/* For gather and scatters there's an additional overhead for the first
iteration. For low count loops they're not beneficial so model the
overhead as loop prologue costs. */
diff --git a/gcc/testsuite/gcc.target/aarch64/unroll-vect.c b/gcc/testsuite/gcc.target/aarch64/unroll-vect.c
new file mode 100644
index 0000000000000000000000000000000000000000..3cb774ba95787ebee488fbe7306299ef28e6bb35
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/unroll-vect.c
@@ -0,0 +1,20 @@
+/* { dg-do compile } */
+/* { dg-additional-options "-O3 -march=armv8-a --param aarch64-autovec-preference=asimd-only -std=gnu99" } */
+/* { dg-final { check-function-bodies "**" "" "" } } */
+
+/*
+** f1:
+** ...
+** add v[0-9]+.4s, v[0-9]+.4s, v[0-9]+.4s
+** add v[0-9]+.4s, v[0-9]+.4s, v[0-9]+.4s
+** add v[0-9]+.4s, v[0-9]+.4s, v[0-9]+.4s
+** add v[0-9]+.4s, v[0-9]+.4s, v[0-9]+.4s
+** ...
+*/
+void f1 (int *restrict a, int n)
+{
+#pragma GCC unroll 16
+ for (int i = 0; i < n; i++)
+ a[i] *= 2;
+}
+