This patch enables the vect-compare-loop-costs parameter by default when SVE is enabled. This is both so that we can compare SVE against Advanced SIMD and so that, with future patches, we can compare multiple SVE vectorisation approaches against each other.
I'll apply if the prerequisites are approved. 2019-11-05 Richard Sandiford <richard.sandif...@arm.com> gcc/ * config/aarch64/aarch64.c (aarch64_override_options_internal): Set the default value of PARAM_VECT_COMPARE_LOOP_COSTS to 1 when SVE is enabled. gcc/testsuite/ * gcc.target/aarch64/sve/reduc_3.c: Split multi-vector cases out into... * gcc.target/aarch64/sve/reduc_3_costly.c: ...this new test, passing -fno-vect-cost-model for them. * gcc.target/aarch64/sve/slp_6.c: Add -fno-vect-cost-model. * gcc.target/aarch64/sve/slp_7.c, * gcc.target/aarch64/sve/slp_7_run.c: Split multi-vector cases out into... * gcc.target/aarch64/sve/slp_7_costly.c, * gcc.target/aarch64/sve/slp_7_costly_run.c: ...these new tests, passing -fno-vect-cost-model for them. Index: gcc/config/aarch64/aarch64.c =================================================================== --- gcc/config/aarch64/aarch64.c 2019-11-05 11:04:15.559298615 +0000 +++ gcc/config/aarch64/aarch64.c 2019-11-05 14:21:15.416663625 +0000 @@ -13308,6 +13308,14 @@ aarch64_override_options_internal (struc initialize_aarch64_code_model (opts); initialize_aarch64_tls_size (opts); + /* Enable vect-compare-loop-costs by default for SVE, both so that we + can compare SVE against Advanced SIMD and so that we can compare + multiple SVE vectorisation approaches against each other. 
*/ + if (TARGET_SVE) + maybe_set_param_value (PARAM_VECT_COMPARE_LOOP_COSTS, 1, + opts->x_param_values, + global_options_set.x_param_values); + int queue_depth = 0; switch (aarch64_tune_params.autoprefetcher_model) { Index: gcc/testsuite/gcc.target/aarch64/sve/reduc_3.c =================================================================== --- gcc/testsuite/gcc.target/aarch64/sve/reduc_3.c 2019-03-08 18:14:29.784994721 +0000 +++ gcc/testsuite/gcc.target/aarch64/sve/reduc_3.c 2019-11-05 14:21:15.416663625 +0000 @@ -17,7 +17,6 @@ void reduc_ptr_##DSTTYPE##_##SRCTYPE (DS REDUC_PTR (int8_t, int8_t) REDUC_PTR (int16_t, int16_t) - REDUC_PTR (int32_t, int32_t) REDUC_PTR (int64_t, int64_t) @@ -25,17 +24,6 @@ REDUC_PTR (_Float16, _Float16) REDUC_PTR (float, float) REDUC_PTR (double, double) -/* Widening reductions. */ -REDUC_PTR (int32_t, int8_t) -REDUC_PTR (int32_t, int16_t) - -REDUC_PTR (int64_t, int8_t) -REDUC_PTR (int64_t, int16_t) -REDUC_PTR (int64_t, int32_t) - -REDUC_PTR (float, _Float16) -REDUC_PTR (double, float) - /* Float<>Int conversions */ REDUC_PTR (_Float16, int16_t) REDUC_PTR (float, int32_t) @@ -45,8 +33,14 @@ REDUC_PTR (int16_t, _Float16) REDUC_PTR (int32_t, float) REDUC_PTR (int64_t, double) -/* { dg-final { scan-assembler-times {\tuaddv\td[0-9]+, p[0-7], z[0-9]+\.s\n} 3 } } */ -/* { dg-final { scan-assembler-times {\tuaddv\td[0-9]+, p[0-7], z[0-9]+\.d\n} 4 } } */ +/* { dg-final { scan-assembler-times {\tuaddv\td[0-9]+, p[0-7], z[0-9]+\.b\n} 1 } } */ +/* { dg-final { scan-assembler-times {\tuaddv\td[0-9]+, p[0-7], z[0-9]+\.h\n} 2 { xfail *-*-* } } } */ +/* { dg-final { scan-assembler-times {\tuaddv\td[0-9]+, p[0-7], z[0-9]+\.s\n} 2 { xfail *-*-* } } } */ +/* { dg-final { scan-assembler-times {\tuaddv\td[0-9]+, p[0-7], z[0-9]+\.d\n} 2 { xfail *-*-* } } } */ +/* We don't yet vectorize the int<-float cases. 
*/ +/* { dg-final { scan-assembler-times {\tuaddv\td[0-9]+, p[0-7], z[0-9]+\.h\n} 1 } } */ +/* { dg-final { scan-assembler-times {\tuaddv\td[0-9]+, p[0-7], z[0-9]+\.s\n} 1 } } */ +/* { dg-final { scan-assembler-times {\tuaddv\td[0-9]+, p[0-7], z[0-9]+\.d\n} 1 } } */ /* { dg-final { scan-assembler-times {\tfaddv\th[0-9]+, p[0-7], z[0-9]+\.h\n} 2 } } */ -/* { dg-final { scan-assembler-times {\tfaddv\ts[0-9]+, p[0-7], z[0-9]+\.s\n} 3 } } */ -/* { dg-final { scan-assembler-times {\tfaddv\td[0-9]+, p[0-7], z[0-9]+\.d\n} 3 } } */ +/* { dg-final { scan-assembler-times {\tfaddv\ts[0-9]+, p[0-7], z[0-9]+\.s\n} 2 } } */ +/* { dg-final { scan-assembler-times {\tfaddv\td[0-9]+, p[0-7], z[0-9]+\.d\n} 2 } } */ Index: gcc/testsuite/gcc.target/aarch64/sve/reduc_3_costly.c =================================================================== --- /dev/null 2019-09-17 11:41:18.176664108 +0100 +++ gcc/testsuite/gcc.target/aarch64/sve/reduc_3_costly.c 2019-11-05 14:21:15.416663625 +0000 @@ -0,0 +1,32 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -ftree-vectorize -ffast-math -fno-vect-cost-model" } */ + +#include <stdint.h> + +#define NUM_ELEMS(TYPE) (32 / sizeof (TYPE)) + +#define REDUC_PTR(DSTTYPE, SRCTYPE) \ +void reduc_ptr_##DSTTYPE##_##SRCTYPE (DSTTYPE *restrict sum, \ + SRCTYPE *restrict array, \ + int count) \ +{ \ + *sum = 0; \ + for (int i = 0; i < count; ++i) \ + *sum += array[i]; \ +} + +/* Widening reductions. 
*/ +REDUC_PTR (int32_t, int8_t) +REDUC_PTR (int32_t, int16_t) + +REDUC_PTR (int64_t, int8_t) +REDUC_PTR (int64_t, int16_t) +REDUC_PTR (int64_t, int32_t) + +REDUC_PTR (float, _Float16) +REDUC_PTR (double, float) + +/* { dg-final { scan-assembler-times {\tuaddv\td[0-9]+, p[0-7], z[0-9]+\.s\n} 2 } } */ +/* { dg-final { scan-assembler-times {\tuaddv\td[0-9]+, p[0-7], z[0-9]+\.d\n} 3 } } */ +/* { dg-final { scan-assembler-times {\tfaddv\ts[0-9]+, p[0-7], z[0-9]+\.s\n} 1 } } */ +/* { dg-final { scan-assembler-times {\tfaddv\td[0-9]+, p[0-7], z[0-9]+\.d\n} 1 } } */ Index: gcc/testsuite/gcc.target/aarch64/sve/slp_6.c =================================================================== --- gcc/testsuite/gcc.target/aarch64/sve/slp_6.c 2019-03-08 18:14:29.780994734 +0000 +++ gcc/testsuite/gcc.target/aarch64/sve/slp_6.c 2019-11-05 14:21:15.416663625 +0000 @@ -1,5 +1,5 @@ /* { dg-do compile } */ -/* { dg-options "-O2 -ftree-vectorize -msve-vector-bits=scalable -ffast-math" } */ +/* { dg-options "-O2 -ftree-vectorize -msve-vector-bits=scalable -ffast-math -fno-vect-cost-model" } */ #include <stdint.h> Index: gcc/testsuite/gcc.target/aarch64/sve/slp_7.c =================================================================== --- gcc/testsuite/gcc.target/aarch64/sve/slp_7.c 2019-10-25 10:13:15.544226032 +0100 +++ gcc/testsuite/gcc.target/aarch64/sve/slp_7.c 2019-11-05 14:21:15.416663625 +0000 @@ -31,37 +31,27 @@ #define TEST_ALL(T) \ T (uint16_t) \ T (int32_t) \ T (uint32_t) \ - T (int64_t) \ - T (uint64_t) \ T (_Float16) \ - T (float) \ - T (double) + T (float) TEST_ALL (VEC_PERM) -/* We can't use SLP for the 64-bit loops, since the number of reduction - results might be greater than the number of elements in the vector. - Otherwise we have two loads per loop, one for the initial vector - and one for the loop body. */ +/* We have two loads per loop, one for the initial vector and one for + the loop body. 
*/ /* { dg-final { scan-assembler-times {\tld1b\t} 2 } } */ /* { dg-final { scan-assembler-times {\tld1h\t} 3 } } */ /* { dg-final { scan-assembler-times {\tld1w\t} 3 } } */ -/* { dg-final { scan-assembler-times {\tld4d\t} 3 } } */ /* { dg-final { scan-assembler-not {\tld4b\t} } } */ /* { dg-final { scan-assembler-not {\tld4h\t} } } */ /* { dg-final { scan-assembler-not {\tld4w\t} } } */ -/* { dg-final { scan-assembler-not {\tld1d\t} } } */ /* { dg-final { scan-assembler-times {\tuaddv\td[0-9]+, p[0-7], z[0-9]+\.b} 8 } } */ /* { dg-final { scan-assembler-times {\tuaddv\td[0-9]+, p[0-7], z[0-9]+\.h} 8 } } */ /* { dg-final { scan-assembler-times {\tuaddv\td[0-9]+, p[0-7], z[0-9]+\.s} 8 } } */ -/* { dg-final { scan-assembler-times {\tuaddv\td[0-9]+, p[0-7], z[0-9]+\.d} 8 } } */ /* { dg-final { scan-assembler-times {\tfaddv\th[0-9]+, p[0-7], z[0-9]+\.h} 4 } } */ /* { dg-final { scan-assembler-times {\tfaddv\ts[0-9]+, p[0-7], z[0-9]+\.s} 4 } } */ -/* { dg-final { scan-assembler-times {\tfaddv\td[0-9]+, p[0-7], z[0-9]+\.d} 4 } } */ /* { dg-final { scan-assembler-times {\twhilelo\tp[0-7]\.b} 4 } } */ /* { dg-final { scan-assembler-times {\twhilelo\tp[0-7]\.h} 6 } } */ /* { dg-final { scan-assembler-times {\twhilelo\tp[0-7]\.s} 6 } } */ -/* { dg-final { scan-assembler-times {\twhilelo\tp[0-7]\.d} 6 } } */ /* { dg-final { scan-assembler-not {\tuqdec} } } */ Index: gcc/testsuite/gcc.target/aarch64/sve/slp_7_run.c =================================================================== --- gcc/testsuite/gcc.target/aarch64/sve/slp_7_run.c 2019-03-08 18:14:29.780994734 +0000 +++ gcc/testsuite/gcc.target/aarch64/sve/slp_7_run.c 2019-11-05 14:21:15.416663625 +0000 @@ -1,7 +1,11 @@ /* { dg-do run { target aarch64_sve_hw } } */ /* { dg-options "-O2 -ftree-vectorize -ffast-math" } */ -#include "slp_7.c" +#ifndef FILENAME +#define FILENAME "slp_7.c" +#endif + +#include FILENAME #define N (54 * 4) Index: gcc/testsuite/gcc.target/aarch64/sve/slp_7_costly.c 
=================================================================== --- /dev/null 2019-09-17 11:41:18.176664108 +0100 +++ gcc/testsuite/gcc.target/aarch64/sve/slp_7_costly.c 2019-11-05 14:21:15.416663625 +0000 @@ -0,0 +1,43 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -ftree-vectorize -msve-vector-bits=scalable -ffast-math -fno-vect-cost-model" } */ + +#include <stdint.h> + +#define VEC_PERM(TYPE) \ +void __attribute__ ((noinline, noclone)) \ +vec_slp_##TYPE (TYPE *restrict a, TYPE *restrict b, int n) \ +{ \ + TYPE x0 = b[0]; \ + TYPE x1 = b[1]; \ + TYPE x2 = b[2]; \ + TYPE x3 = b[3]; \ + for (int i = 0; i < n; ++i) \ + { \ + x0 += a[i * 4]; \ + x1 += a[i * 4 + 1]; \ + x2 += a[i * 4 + 2]; \ + x3 += a[i * 4 + 3]; \ + } \ + b[0] = x0; \ + b[1] = x1; \ + b[2] = x2; \ + b[3] = x3; \ +} + +#define TEST_ALL(T) \ + T (int64_t) \ + T (uint64_t) \ + T (double) + +TEST_ALL (VEC_PERM) + +/* We can't use SLP for the 64-bit loops, since the number of reduction + results might be greater than the number of elements in the vector. */ +/* { dg-final { scan-assembler-times {\tld4d\t} 3 } } */ +/* { dg-final { scan-assembler-not {\tld1d\t} } } */ +/* { dg-final { scan-assembler-times {\tuaddv\td[0-9]+, p[0-7], z[0-9]+\.d} 8 } } */ +/* { dg-final { scan-assembler-times {\tfaddv\td[0-9]+, p[0-7], z[0-9]+\.d} 4 } } */ + +/* { dg-final { scan-assembler-times {\twhilelo\tp[0-7]\.d} 6 } } */ + +/* { dg-final { scan-assembler-not {\tuqdec} } } */ Index: gcc/testsuite/gcc.target/aarch64/sve/slp_7_costly_run.c =================================================================== --- /dev/null 2019-09-17 11:41:18.176664108 +0100 +++ gcc/testsuite/gcc.target/aarch64/sve/slp_7_costly_run.c 2019-11-05 14:21:15.416663625 +0000 @@ -0,0 +1,5 @@ +/* { dg-do run { target aarch64_sve_hw } } */ +/* { dg-options "-O2 -ftree-vectorize -ffast-math -fno-vect-cost-model" } */ + +#define FILENAME "slp_7_costly.c" +#include "slp_7_run.c"