Reassoc carefully ranks operands to form reduction chains for vectorization, so we are careful not to apply any width-related changes in the early pass. Unfortunately we are not careful enough. The following gates the FMA-related re-ordering and also the >= 3 ops tail "optimization", which is the culprit here.
This does not fix the reported inefficient vectorization when using signed integer reductions yet. Bootstrapped and tested on x86_64-unknown-linux-gnu, pushed. PR tree-optimization/120687 * tree-ssa-reassoc.cc (reassociate_bb): Do not disturb the sorted operand order in the early pass. * tree-vect-slp.cc (vect_analyze_slp): Dump when a detected reduction chain fails SLP discovery. * gcc.dg/vect/pr120687-1.c: New testcase. * gcc.dg/vect/pr120687-2.c: Likewise. --- gcc/testsuite/gcc.dg/vect/pr120687-1.c | 16 ++++++++++++++++ gcc/testsuite/gcc.dg/vect/pr120687-2.c | 17 +++++++++++++++++ gcc/tree-ssa-reassoc.cc | 10 ++++++---- gcc/tree-vect-slp.cc | 3 +++ 4 files changed, 42 insertions(+), 4 deletions(-) create mode 100644 gcc/testsuite/gcc.dg/vect/pr120687-1.c create mode 100644 gcc/testsuite/gcc.dg/vect/pr120687-2.c diff --git a/gcc/testsuite/gcc.dg/vect/pr120687-1.c b/gcc/testsuite/gcc.dg/vect/pr120687-1.c new file mode 100644 index 00000000000..ce9cf6301ce --- /dev/null +++ b/gcc/testsuite/gcc.dg/vect/pr120687-1.c @@ -0,0 +1,16 @@ +/* { dg-do compile } */ +/* { dg-require-effective-target vect_int } */ + +unsigned +frd (unsigned *p, unsigned *lastone) +{ + unsigned sum = 0; + for (; p <= lastone; p += 16) + sum += p[0] + p[1] + p[2] + p[3] + p[4] + p[5] + p[6] + p[7] + + p[8] + p[9] + p[10] + p[11] + p[12] + p[13] + p[14] + p[15]; + return sum; +} + +/* { dg-final { scan-tree-dump "reduction: detected reduction chain" "vect" } } */ +/* { dg-final { scan-tree-dump-not "SLP discovery of reduction chain failed" "vect" } } */ +/* { dg-final { scan-tree-dump "optimized: loop vectorized" "vect" } } */ diff --git a/gcc/testsuite/gcc.dg/vect/pr120687-2.c b/gcc/testsuite/gcc.dg/vect/pr120687-2.c new file mode 100644 index 00000000000..dfc6dc726e9 --- /dev/null +++ b/gcc/testsuite/gcc.dg/vect/pr120687-2.c @@ -0,0 +1,17 @@ +/* { dg-do compile } */ +/* { dg-require-effective-target vect_float } */ +/* { dg-additional-options "-ffast-math" } */ + +float +frd (float *p, float 
*lastone) +{ + float sum = 0; + for (; p <= lastone; p += 16) + sum += p[0] + p[1] + p[2] + p[3] + p[4] + p[5] + p[6] + p[7] + + p[8] + p[9] + p[10] + p[11] + p[12] + p[13] + p[14] + p[15]; + return sum; +} + +/* { dg-final { scan-tree-dump "reduction: detected reduction chain" "vect" } } */ +/* { dg-final { scan-tree-dump-not "SLP discovery of reduction chain failed" "vect" } } */ +/* { dg-final { scan-tree-dump "optimized: loop vectorized" "vect" } } */ diff --git a/gcc/tree-ssa-reassoc.cc b/gcc/tree-ssa-reassoc.cc index 3c38f3d7a19..c140f76766e 100644 --- a/gcc/tree-ssa-reassoc.cc +++ b/gcc/tree-ssa-reassoc.cc @@ -7167,9 +7167,10 @@ reassociate_bb (basic_block bb) /* If the target support FMA, rank_ops_for_fma will detect if the chain has fmas and rearrange the ops if so. */ - if (direct_internal_fn_supported_p (IFN_FMA, - TREE_TYPE (lhs), - opt_type) + if (!reassoc_insert_powi_p + && direct_internal_fn_supported_p (IFN_FMA, + TREE_TYPE (lhs), + opt_type) && (rhs_code == PLUS_EXPR || rhs_code == MINUS_EXPR)) { mult_num = rank_ops_for_fma (&ops); @@ -7200,7 +7201,8 @@ reassociate_bb (basic_block bb) to make sure the ones that get the double binary op are chosen wisely. */ int len = ops.length (); - if (len >= 3 + if (!reassoc_insert_powi_p + && len >= 3 && (!has_fma /* width > 1 means ranking ops results in better parallelism. Check current value to avoid diff --git a/gcc/tree-vect-slp.cc b/gcc/tree-vect-slp.cc index 1e93fae39d9..55d02fcea39 100644 --- a/gcc/tree-vect-slp.cc +++ b/gcc/tree-vect-slp.cc @@ -4958,6 +4958,9 @@ vect_analyze_slp (vec_info *vinfo, unsigned max_tree_size, max_tree_size, &limit, force_single_lane)) { + if (dump_enabled_p ()) + dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, + "SLP discovery of reduction chain failed\n"); /* Dissolve reduction chain group. */ stmt_vec_info vinfo = first_element; stmt_vec_info last = NULL; -- 2.43.0