Reassoc carefully ranks operands to form reduction chains for
vectorization, so we take care not to apply any width-related
changes in the early pass.  Unfortunately we are not careful
enough.  This patch gates the FMA-related re-ordering and also
the >= 3 ops tail "optimization", which is the culprit here, so
that neither disturbs the sorted operand order in the early pass.

This does not yet fix the reported inefficient vectorization of
signed integer reductions.

Bootstrapped and tested on x86_64-unknown-linux-gnu, pushed.

        PR tree-optimization/120687
        * tree-ssa-reassoc.cc (reassociate_bb): Do not disturb
        the sorted operand order in the early pass.
        * tree-vect-slp.cc (vect_analyze_slp): Dump when a detected
        reduction chain fails SLP discovery.

        * gcc.dg/vect/pr120687-1.c: New testcase.
        * gcc.dg/vect/pr120687-2.c: Likewise.
---
 gcc/testsuite/gcc.dg/vect/pr120687-1.c | 16 ++++++++++++++++
 gcc/testsuite/gcc.dg/vect/pr120687-2.c | 17 +++++++++++++++++
 gcc/tree-ssa-reassoc.cc                | 10 ++++++----
 gcc/tree-vect-slp.cc                   |  3 +++
 4 files changed, 42 insertions(+), 4 deletions(-)
 create mode 100644 gcc/testsuite/gcc.dg/vect/pr120687-1.c
 create mode 100644 gcc/testsuite/gcc.dg/vect/pr120687-2.c

diff --git a/gcc/testsuite/gcc.dg/vect/pr120687-1.c b/gcc/testsuite/gcc.dg/vect/pr120687-1.c
new file mode 100644
index 00000000000..ce9cf6301ce
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/pr120687-1.c
@@ -0,0 +1,16 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target vect_int } */
+
+unsigned
+frd (unsigned *p, unsigned *lastone)
+{
+  unsigned sum = 0;
+  for (; p <= lastone; p += 16)
+    sum += p[0] + p[1] + p[2] + p[3] + p[4] + p[5] + p[6] + p[7]
+           + p[8] + p[9] + p[10] + p[11] + p[12] + p[13] + p[14] + p[15];
+  return sum;
+}
+
+/* { dg-final { scan-tree-dump "reduction: detected reduction chain" "vect" } } */
+/* { dg-final { scan-tree-dump-not "SLP discovery of reduction chain failed" "vect" } } */
+/* { dg-final { scan-tree-dump "optimized: loop vectorized" "vect" } } */
diff --git a/gcc/testsuite/gcc.dg/vect/pr120687-2.c b/gcc/testsuite/gcc.dg/vect/pr120687-2.c
new file mode 100644
index 00000000000..dfc6dc726e9
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/pr120687-2.c
@@ -0,0 +1,17 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target vect_float } */
+/* { dg-additional-options "-ffast-math" } */
+
+float
+frd (float *p, float *lastone)
+{
+  float sum = 0;
+  for (; p <= lastone; p += 16)
+    sum += p[0] + p[1] + p[2] + p[3] + p[4] + p[5] + p[6] + p[7]
+           + p[8] + p[9] + p[10] + p[11] + p[12] + p[13] + p[14] + p[15];
+  return sum;
+}
+
+/* { dg-final { scan-tree-dump "reduction: detected reduction chain" "vect" } } */
+/* { dg-final { scan-tree-dump-not "SLP discovery of reduction chain failed" "vect" } } */
+/* { dg-final { scan-tree-dump "optimized: loop vectorized" "vect" } } */
diff --git a/gcc/tree-ssa-reassoc.cc b/gcc/tree-ssa-reassoc.cc
index 3c38f3d7a19..c140f76766e 100644
--- a/gcc/tree-ssa-reassoc.cc
+++ b/gcc/tree-ssa-reassoc.cc
@@ -7167,9 +7167,10 @@ reassociate_bb (basic_block bb)
 
                  /* If the target support FMA, rank_ops_for_fma will detect if
                     the chain has fmas and rearrange the ops if so.  */
-                 if (direct_internal_fn_supported_p (IFN_FMA,
-                                                     TREE_TYPE (lhs),
-                                                     opt_type)
+                 if (!reassoc_insert_powi_p
+                     && direct_internal_fn_supported_p (IFN_FMA,
+                                                        TREE_TYPE (lhs),
+                                                        opt_type)
                      && (rhs_code == PLUS_EXPR || rhs_code == MINUS_EXPR))
                    {
                      mult_num = rank_ops_for_fma (&ops);
@@ -7200,7 +7201,8 @@ reassociate_bb (basic_block bb)
                         to make sure the ones that get the double
                         binary op are chosen wisely.  */
                      int len = ops.length ();
-                     if (len >= 3
+                     if (!reassoc_insert_powi_p
+                         && len >= 3
                          && (!has_fma
                              /* width > 1 means ranking ops results in better
                                 parallelism.  Check current value to avoid
diff --git a/gcc/tree-vect-slp.cc b/gcc/tree-vect-slp.cc
index 1e93fae39d9..55d02fcea39 100644
--- a/gcc/tree-vect-slp.cc
+++ b/gcc/tree-vect-slp.cc
@@ -4958,6 +4958,9 @@ vect_analyze_slp (vec_info *vinfo, unsigned max_tree_size,
                                                 max_tree_size, &limit,
                                                 force_single_lane))
          {
+           if (dump_enabled_p ())
+             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+                              "SLP discovery of reduction chain failed\n");
            /* Dissolve reduction chain group.  */
            stmt_vec_info vinfo = first_element;
            stmt_vec_info last = NULL;
-- 
2.43.0

Reply via email to