On Thu, May 24, 2018 at 10:07 AM Richard Sandiford <richard.sandif...@linaro.org> wrote:
> The folds in r260348 kicked in before vectorisation, which hurts
> for two reasons:
> (1) the current suboptimal handling of nothrow meant that we could
> drop the flag early and so prevent if-conversion
> (2) some architectures provide more scalar forms than vector forms
> (true for Advanced SIMD)
> (1) is a bug in itself that needs to be fixed eventually, but delaying
> the folds is still needed for (2).
> Tested on aarch64-linux-gnu (with and without SVE), aarch64_be-elf
> and x86_64-linux-gnu. OK to install?

OK.

Richard.

> (Patch is mostly just reindent.)
> Richard
> 2018-05-24 Richard Sandiford <richard.sandif...@linaro.org>
> gcc/
> * match.pd: Delay FMA folds until after vectorization.
> gcc/testsuite/
> * gcc.dg/vect/vect-fma-1.c: New test.
> Index: gcc/match.pd
> ===================================================================
> --- gcc/match.pd 2018-05-18 09:26:37.735714314 +0100
> +++ gcc/match.pd 2018-05-24 09:05:10.432158893 +0100
> @@ -4703,59 +4703,60 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT)
> wi::to_wide (@ipos) + isize))
> (BIT_FIELD_REF @0 @rsize @rpos)))))
> -(for fmas (FMA)
> +(if (canonicalize_math_after_vectorization_p ())
> + (for fmas (FMA)
> + (simplify
> + (fmas:c (negate @0) @1 @2)
> + (IFN_FNMA @0 @1 @2))
> + (simplify
> + (fmas @0 @1 (negate @2))
> + (IFN_FMS @0 @1 @2))
> + (simplify
> + (fmas:c (negate @0) @1 (negate @2))
> + (IFN_FNMS @0 @1 @2))
> + (simplify
> + (negate (fmas@3 @0 @1 @2))
> + (if (single_use (@3))
> + (IFN_FNMS @0 @1 @2))))
> +
> + (simplify
> + (IFN_FMS:c (negate @0) @1 @2)
> + (IFN_FNMS @0 @1 @2))
> (simplify
> - (fmas:c (negate @0) @1 @2)
> + (IFN_FMS @0 @1 (negate @2))
> + (IFN_FMA @0 @1 @2))
> + (simplify
> + (IFN_FMS:c (negate @0) @1 (negate @2))
> (IFN_FNMA @0 @1 @2))
> (simplify
> - (fmas @0 @1 (negate @2))
> - (IFN_FMS @0 @1 @2))
> + (negate (IFN_FMS@3 @0 @1 @2))
> + (if (single_use (@3))
> + (IFN_FNMA @0 @1 @2)))
> +
> + (simplify
> + (IFN_FNMA:c (negate @0) @1 @2)
> + (IFN_FMA @0 @1 @2))
> (simplify
> - (fmas:c (negate @0) @1 (negate @2))
> + (IFN_FNMA @0 @1 (negate @2))
> (IFN_FNMS @0 @1 @2))
> (simplify
> - (negate (fmas@3 @0 @1 @2))
> + (IFN_FNMA:c (negate @0) @1 (negate @2))
> + (IFN_FMS @0 @1 @2))
> + (simplify
> + (negate (IFN_FNMA@3 @0 @1 @2))
> (if (single_use (@3))
> - (IFN_FNMS @0 @1 @2))))
> + (IFN_FMS @0 @1 @2)))
> -(simplify
> - (IFN_FMS:c (negate @0) @1 @2)
> - (IFN_FNMS @0 @1 @2))
> -(simplify
> - (IFN_FMS @0 @1 (negate @2))
> - (IFN_FMA @0 @1 @2))
> -(simplify
> - (IFN_FMS:c (negate @0) @1 (negate @2))
> - (IFN_FNMA @0 @1 @2))
> -(simplify
> - (negate (IFN_FMS@3 @0 @1 @2))
> + (simplify
> + (IFN_FNMS:c (negate @0) @1 @2)
> + (IFN_FMS @0 @1 @2))
> + (simplify
> + (IFN_FNMS @0 @1 (negate @2))
> + (IFN_FNMA @0 @1 @2))
> + (simplify
> + (IFN_FNMS:c (negate @0) @1 (negate @2))
> + (IFN_FMA @0 @1 @2))
> + (simplify
> + (negate (IFN_FNMS@3 @0 @1 @2))
> (if (single_use (@3))
> - (IFN_FNMA @0 @1 @2)))
> -
> -(simplify
> - (IFN_FNMA:c (negate @0) @1 @2)
> - (IFN_FMA @0 @1 @2))
> -(simplify
> - (IFN_FNMA @0 @1 (negate @2))
> - (IFN_FNMS @0 @1 @2))
> -(simplify
> - (IFN_FNMA:c (negate @0) @1 (negate @2))
> - (IFN_FMS @0 @1 @2))
> -(simplify
> - (negate (IFN_FNMA@3 @0 @1 @2))
> - (if (single_use (@3))
> - (IFN_FMS @0 @1 @2)))
> -
> -(simplify
> - (IFN_FNMS:c (negate @0) @1 @2)
> - (IFN_FMS @0 @1 @2))
> -(simplify
> - (IFN_FNMS @0 @1 (negate @2))
> - (IFN_FNMA @0 @1 @2))
> -(simplify
> - (IFN_FNMS:c (negate @0) @1 (negate @2))
> - (IFN_FMA @0 @1 @2))
> -(simplify
> - (negate (IFN_FNMS@3 @0 @1 @2))
> - (if (single_use (@3))
> - (IFN_FMA @0 @1 @2)))
> + (IFN_FMA @0 @1 @2))))
> Index: gcc/testsuite/gcc.dg/vect/vect-fma-1.c
> ===================================================================
> --- /dev/null 2018-04-20 16:19:46.369131350 +0100
> +++ gcc/testsuite/gcc.dg/vect/vect-fma-1.c 2018-05-24 09:05:10.432158893 +0100
> @@ -0,0 +1,58 @@
> +/* { dg-require-effective-target scalar_all_fma } */
> +
> +#include "tree-vect.h"
> +
> +#define N (VECTOR_BITS * 11 / 64 + 3)
> +
> +#define DEF(INV) \
> + void __attribute__ ((noipa)) \
> + f_##INV (double *restrict a, double *restrict b, \
> + double *restrict c, double *restrict d) \
> + { \
> + for (int i = 0; i < N; ++i) \
> + { \
> + double mb = (INV & 1 ? -b[i] : b[i]); \
> + double mc = c[i]; \
> + double md = (INV & 2 ? -d[i] : d[i]); \
> + double fma = __builtin_fma (mb, mc, md); \
> + a[i] = (INV & 4 ? -fma : fma); \
> + } \
> + }
> +
> +#define TEST(INV) \
> + { \
> + f_##INV (a, b, c, d); \
> + for (int i = 0; i < N; ++i) \
> + { \
> + double mb = (INV & 1 ? -b[i] : b[i]); \
> + double mc = c[i]; \
> + double md = (INV & 2 ? -d[i] : d[i]); \
> + double fma = __builtin_fma (mb, mc, md); \
> + double expected = (INV & 4 ? -fma : fma); \
> + if (a[i] != expected) \
> + __builtin_abort (); \
> + asm volatile ("" ::: "memory"); \
> + } \
> + }
> +
> +#define FOR_EACH_INV(T) \
> + T (0) T (1) T (2) T (3) T (4) T (5) T (6) T (7)
> +
> +FOR_EACH_INV (DEF)
> +
> +int
> +main (void)
> +{
> + double a[N], b[N], c[N], d[N];
> + for (int i = 0; i < N; ++i)
> + {
> + b[i] = i % 17;
> + c[i] = i % 9 + 11;
> + d[i] = i % 13 + 14;
> + asm volatile ("" ::: "memory");
> + }
> + FOR_EACH_INV (TEST)
> + return 0;
> +}
> +
> +/* { dg-final { scan-tree-dump-times "LOOP VECTORIZED" 8 "vect" { target vect_double } } } */