[6/6][AArch64] Enable vect-compare-loop-costs by default for SVE

Richard Sandiford Tue, 05 Nov 2019 06:33:11 -0800

This patch enables vect-compare-loop-costs by default for SVE, both so
that we can compare SVE against Advanced SIMD and so that (with future
patches) we can compare multiple SVE vectorisation approaches against
each other.


I'll apply if the prerequisites are approved.


2019-11-05  Richard Sandiford  <richard.sandif...@arm.com>

gcc/
        * config/aarch64/aarch64.c (aarch64_override_options_internal):
        Set the default value of PARAM_VECT_COMPARE_LOOP_COSTS to 1
        when SVE is enabled.

gcc/testsuite/
        * gcc.target/aarch64/sve/reduc_3.c: Split multi-vector cases out
        into...
        * gcc.target/aarch64/sve/reduc_3_costly.c: ...this new test,
        passing -fno-vect-cost-model for them.
        * gcc.target/aarch64/sve/slp_6.c: Add -fno-vect-cost-model.
        * gcc.target/aarch64/sve/slp_7.c,
        * gcc.target/aarch64/sve/slp_7_run.c: Split multi-vector cases out
        into...
        * gcc.target/aarch64/sve/slp_7_costly.c,
        * gcc.target/aarch64/sve/slp_7_costly_run.c: ...these new tests,
        passing -fno-vect-cost-model for them.

Index: gcc/config/aarch64/aarch64.c
===================================================================
--- gcc/config/aarch64/aarch64.c        2019-11-05 11:04:15.559298615 +0000
+++ gcc/config/aarch64/aarch64.c        2019-11-05 14:21:15.416663625 +0000
@@ -13308,6 +13308,14 @@ aarch64_override_options_internal (struc
   initialize_aarch64_code_model (opts);
   initialize_aarch64_tls_size (opts);
 
+  /* Enable vect-compare-loop-costs by default for SVE, both so that we
+     can compare SVE against Advanced SIMD and so that we can compare
+     multiple SVE vectorisation approaches against each other.  */
+  if (TARGET_SVE)
+    maybe_set_param_value (PARAM_VECT_COMPARE_LOOP_COSTS, 1,
+                          opts->x_param_values,
+                          global_options_set.x_param_values);
+
   int queue_depth = 0;
   switch (aarch64_tune_params.autoprefetcher_model)
     {
Index: gcc/testsuite/gcc.target/aarch64/sve/reduc_3.c
===================================================================
--- gcc/testsuite/gcc.target/aarch64/sve/reduc_3.c      2019-03-08 18:14:29.784994721 +0000
+++ gcc/testsuite/gcc.target/aarch64/sve/reduc_3.c      2019-11-05 14:21:15.416663625 +0000
@@ -17,7 +17,6 @@ void reduc_ptr_##DSTTYPE##_##SRCTYPE (DS
 
 REDUC_PTR (int8_t, int8_t)
 REDUC_PTR (int16_t, int16_t)
-
 REDUC_PTR (int32_t, int32_t)
 REDUC_PTR (int64_t, int64_t)
 
@@ -25,17 +24,6 @@ REDUC_PTR (_Float16, _Float16)
 REDUC_PTR (float, float)
 REDUC_PTR (double, double)
 
-/* Widening reductions.  */
-REDUC_PTR (int32_t, int8_t)
-REDUC_PTR (int32_t, int16_t)
-
-REDUC_PTR (int64_t, int8_t)
-REDUC_PTR (int64_t, int16_t)
-REDUC_PTR (int64_t, int32_t)
-
-REDUC_PTR (float, _Float16)
-REDUC_PTR (double, float)
-
 /* Float<>Int conversions */
 REDUC_PTR (_Float16, int16_t)
 REDUC_PTR (float, int32_t)
@@ -45,8 +33,14 @@ REDUC_PTR (int16_t, _Float16)
 REDUC_PTR (int32_t, float)
 REDUC_PTR (int64_t, double)
 
-/* { dg-final { scan-assembler-times {\tuaddv\td[0-9]+, p[0-7], z[0-9]+\.s\n} 3 } } */
-/* { dg-final { scan-assembler-times {\tuaddv\td[0-9]+, p[0-7], z[0-9]+\.d\n} 4 } } */
+/* { dg-final { scan-assembler-times {\tuaddv\td[0-9]+, p[0-7], z[0-9]+\.b\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tuaddv\td[0-9]+, p[0-7], z[0-9]+\.h\n} 2 { xfail *-*-* } } } */
+/* { dg-final { scan-assembler-times {\tuaddv\td[0-9]+, p[0-7], z[0-9]+\.s\n} 2 { xfail *-*-* } } } */
+/* { dg-final { scan-assembler-times {\tuaddv\td[0-9]+, p[0-7], z[0-9]+\.d\n} 2 { xfail *-*-* } } } */
+/* We don't yet vectorize the int<-float cases.  */
+/* { dg-final { scan-assembler-times {\tuaddv\td[0-9]+, p[0-7], z[0-9]+\.h\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tuaddv\td[0-9]+, p[0-7], z[0-9]+\.s\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tuaddv\td[0-9]+, p[0-7], z[0-9]+\.d\n} 1 } } */
 /* { dg-final { scan-assembler-times {\tfaddv\th[0-9]+, p[0-7], z[0-9]+\.h\n} 2 } } */
-/* { dg-final { scan-assembler-times {\tfaddv\ts[0-9]+, p[0-7], z[0-9]+\.s\n} 3 } } */
-/* { dg-final { scan-assembler-times {\tfaddv\td[0-9]+, p[0-7], z[0-9]+\.d\n} 3 } } */
+/* { dg-final { scan-assembler-times {\tfaddv\ts[0-9]+, p[0-7], z[0-9]+\.s\n} 2 } } */
+/* { dg-final { scan-assembler-times {\tfaddv\td[0-9]+, p[0-7], z[0-9]+\.d\n} 2 } } */
Index: gcc/testsuite/gcc.target/aarch64/sve/reduc_3_costly.c
===================================================================
--- /dev/null   2019-09-17 11:41:18.176664108 +0100
+++ gcc/testsuite/gcc.target/aarch64/sve/reduc_3_costly.c       2019-11-05 14:21:15.416663625 +0000
@@ -0,0 +1,32 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -ftree-vectorize -ffast-math -fno-vect-cost-model" } */
+
+#include <stdint.h>
+
+#define NUM_ELEMS(TYPE) (32 / sizeof (TYPE))
+
+#define REDUC_PTR(DSTTYPE, SRCTYPE)                            \
+void reduc_ptr_##DSTTYPE##_##SRCTYPE (DSTTYPE *restrict sum,   \
+                                     SRCTYPE *restrict array,  \
+                                     int count)                \
+{                                                              \
+  *sum = 0;                                                    \
+  for (int i = 0; i < count; ++i)                              \
+    *sum += array[i];                                          \
+}
+
+/* Widening reductions.  */
+REDUC_PTR (int32_t, int8_t)
+REDUC_PTR (int32_t, int16_t)
+
+REDUC_PTR (int64_t, int8_t)
+REDUC_PTR (int64_t, int16_t)
+REDUC_PTR (int64_t, int32_t)
+
+REDUC_PTR (float, _Float16)
+REDUC_PTR (double, float)
+
+/* { dg-final { scan-assembler-times {\tuaddv\td[0-9]+, p[0-7], z[0-9]+\.s\n} 2 } } */
+/* { dg-final { scan-assembler-times {\tuaddv\td[0-9]+, p[0-7], z[0-9]+\.d\n} 3 } } */
+/* { dg-final { scan-assembler-times {\tfaddv\ts[0-9]+, p[0-7], z[0-9]+\.s\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tfaddv\td[0-9]+, p[0-7], z[0-9]+\.d\n} 1 } } */
Index: gcc/testsuite/gcc.target/aarch64/sve/slp_6.c
===================================================================
--- gcc/testsuite/gcc.target/aarch64/sve/slp_6.c        2019-03-08 18:14:29.780994734 +0000
+++ gcc/testsuite/gcc.target/aarch64/sve/slp_6.c        2019-11-05 14:21:15.416663625 +0000
@@ -1,5 +1,5 @@
 /* { dg-do compile } */
-/* { dg-options "-O2 -ftree-vectorize -msve-vector-bits=scalable -ffast-math" } */
+/* { dg-options "-O2 -ftree-vectorize -msve-vector-bits=scalable -ffast-math -fno-vect-cost-model" } */
 
 #include <stdint.h>
 
Index: gcc/testsuite/gcc.target/aarch64/sve/slp_7.c
===================================================================
--- gcc/testsuite/gcc.target/aarch64/sve/slp_7.c        2019-10-25 10:13:15.544226032 +0100
+++ gcc/testsuite/gcc.target/aarch64/sve/slp_7.c        2019-11-05 14:21:15.416663625 +0000
@@ -31,37 +31,27 @@ #define TEST_ALL(T)                         \
   T (uint16_t)                                 \
   T (int32_t)                                  \
   T (uint32_t)                                 \
-  T (int64_t)                                  \
-  T (uint64_t)                                 \
   T (_Float16)                                 \
-  T (float)                                    \
-  T (double)
+  T (float)
 
 TEST_ALL (VEC_PERM)
 
-/* We can't use SLP for the 64-bit loops, since the number of reduction
-   results might be greater than the number of elements in the vector.
-   Otherwise we have two loads per loop, one for the initial vector
-   and one for the loop body.  */
+/* We have two loads per loop, one for the initial vector and one for
+   the loop body.  */
 /* { dg-final { scan-assembler-times {\tld1b\t} 2 } } */
 /* { dg-final { scan-assembler-times {\tld1h\t} 3 } } */
 /* { dg-final { scan-assembler-times {\tld1w\t} 3 } } */
-/* { dg-final { scan-assembler-times {\tld4d\t} 3 } } */
 /* { dg-final { scan-assembler-not {\tld4b\t} } } */
 /* { dg-final { scan-assembler-not {\tld4h\t} } } */
 /* { dg-final { scan-assembler-not {\tld4w\t} } } */
-/* { dg-final { scan-assembler-not {\tld1d\t} } } */
 /* { dg-final { scan-assembler-times {\tuaddv\td[0-9]+, p[0-7], z[0-9]+\.b} 8 } } */
 /* { dg-final { scan-assembler-times {\tuaddv\td[0-9]+, p[0-7], z[0-9]+\.h} 8 } } */
 /* { dg-final { scan-assembler-times {\tuaddv\td[0-9]+, p[0-7], z[0-9]+\.s} 8 } } */
-/* { dg-final { scan-assembler-times {\tuaddv\td[0-9]+, p[0-7], z[0-9]+\.d} 8 } } */
 /* { dg-final { scan-assembler-times {\tfaddv\th[0-9]+, p[0-7], z[0-9]+\.h} 4 } } */
 /* { dg-final { scan-assembler-times {\tfaddv\ts[0-9]+, p[0-7], z[0-9]+\.s} 4 } } */
-/* { dg-final { scan-assembler-times {\tfaddv\td[0-9]+, p[0-7], z[0-9]+\.d} 4 } } */
 
 /* { dg-final { scan-assembler-times {\twhilelo\tp[0-7]\.b} 4 } } */
 /* { dg-final { scan-assembler-times {\twhilelo\tp[0-7]\.h} 6 } } */
 /* { dg-final { scan-assembler-times {\twhilelo\tp[0-7]\.s} 6 } } */
-/* { dg-final { scan-assembler-times {\twhilelo\tp[0-7]\.d} 6 } } */
 
 /* { dg-final { scan-assembler-not {\tuqdec} } } */
Index: gcc/testsuite/gcc.target/aarch64/sve/slp_7_run.c
===================================================================
--- gcc/testsuite/gcc.target/aarch64/sve/slp_7_run.c    2019-03-08 18:14:29.780994734 +0000
+++ gcc/testsuite/gcc.target/aarch64/sve/slp_7_run.c    2019-11-05 14:21:15.416663625 +0000
@@ -1,7 +1,11 @@
 /* { dg-do run { target aarch64_sve_hw } } */
 /* { dg-options "-O2 -ftree-vectorize -ffast-math" } */
 
-#include "slp_7.c"
+#ifndef FILENAME
+#define FILENAME "slp_7.c"
+#endif
+
+#include FILENAME
 
 #define N (54 * 4)
 
Index: gcc/testsuite/gcc.target/aarch64/sve/slp_7_costly.c
===================================================================
--- /dev/null   2019-09-17 11:41:18.176664108 +0100
+++ gcc/testsuite/gcc.target/aarch64/sve/slp_7_costly.c 2019-11-05 14:21:15.416663625 +0000
@@ -0,0 +1,43 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -ftree-vectorize -msve-vector-bits=scalable -ffast-math -fno-vect-cost-model" } */
+
+#include <stdint.h>
+
+#define VEC_PERM(TYPE)                                         \
+void __attribute__ ((noinline, noclone))                       \
+vec_slp_##TYPE (TYPE *restrict a, TYPE *restrict b, int n)     \
+{                                                              \
+  TYPE x0 = b[0];                                              \
+  TYPE x1 = b[1];                                              \
+  TYPE x2 = b[2];                                              \
+  TYPE x3 = b[3];                                              \
+  for (int i = 0; i < n; ++i)                                  \
+    {                                                          \
+      x0 += a[i * 4];                                          \
+      x1 += a[i * 4 + 1];                                      \
+      x2 += a[i * 4 + 2];                                      \
+      x3 += a[i * 4 + 3];                                      \
+    }                                                          \
+  b[0] = x0;                                                   \
+  b[1] = x1;                                                   \
+  b[2] = x2;                                                   \
+  b[3] = x3;                                                   \
+}
+
+#define TEST_ALL(T)                            \
+  T (int64_t)                                  \
+  T (uint64_t)                                 \
+  T (double)
+
+TEST_ALL (VEC_PERM)
+
+/* We can't use SLP for the 64-bit loops, since the number of reduction
+   results might be greater than the number of elements in the vector.  */
+/* { dg-final { scan-assembler-times {\tld4d\t} 3 } } */
+/* { dg-final { scan-assembler-not {\tld1d\t} } } */
+/* { dg-final { scan-assembler-times {\tuaddv\td[0-9]+, p[0-7], z[0-9]+\.d} 8 } } */
+/* { dg-final { scan-assembler-times {\tfaddv\td[0-9]+, p[0-7], z[0-9]+\.d} 4 } } */
+
+/* { dg-final { scan-assembler-times {\twhilelo\tp[0-7]\.d} 6 } } */
+
+/* { dg-final { scan-assembler-not {\tuqdec} } } */
Index: gcc/testsuite/gcc.target/aarch64/sve/slp_7_costly_run.c
===================================================================
--- /dev/null   2019-09-17 11:41:18.176664108 +0100
+++ gcc/testsuite/gcc.target/aarch64/sve/slp_7_costly_run.c     2019-11-05 14:21:15.416663625 +0000
@@ -0,0 +1,5 @@
+/* { dg-do run { target aarch64_sve_hw } } */
+/* { dg-options "-O2 -ftree-vectorize -ffast-math -fno-vect-cost-model" } */
+
+#define FILENAME "slp_7_costly.c"
+#include "slp_7_run.c"

[6/6][AArch64] Enable vect-compare-loop-costs by default for SVE

Reply via email to