Hi!

>> Can you repeat the compile-time measurement there?  I also wonder
>> whether we should worry about compile-time at -O[12] when SLP is not run.
>> Thus, probably rename the cleanup pass to pre_slp_scalar_cleanup and
>> gate it on && flag_slp_vectorize
> 
> Good idea, will evaluate it.
> 

Sorry for the late update.  

I evaluated compilation time on the SPEC2017 INT benchmarks
for several option sets:
  A1: -Ofast -funroll-loops
  A2: -O1
  A3: -O1 -funroll-loops
  A4: -O2
  A5: -O2 -funroll-loops

and for several guard conditions:
  C1: no loops left after cunroll (the previous version)
  C2: any outermost loop fully unrolled
  C3: C1 + C2
  C4: C1 + C2 + gated on SLP
  C5: C2 + gated on SLP (the current version)

Compilation time increases, in percent:
        A1       A2       A3       A4       A5
C1:  0.74%    0.07%   -0.25%    0.00%    0.10%
C2:  0.21%    0.00%   -0.19%    0.00%    0.71%
C3:  0.21%    0.00%   -0.06%    0.30%    0.00%
C4:  0.21%   -0.07%   -0.38%    0.20%   -0.19%
C5:  0.08%    0.00%   -0.38%   -0.10%   -0.05%

C2 is a better guard than C1 (compare C2/A1 against C1/A1).
Gating on SLP also helps, as C2/A5 vs. C5/A5 shows.
Note that the data can be noisy, especially where the
differences are very small.
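
For reference, here is a minimal sketch of how the hand-off
works, condensed from the patch below (all names match the
actual code):

  /* In tree_unroll_loops_completely: request the cleanup once
     any outermost loop has been fully unrolled.  */
  if (outermost_unrolled && unroll_outer)
    retval |= TODO_force_next_scalar_cleanup;

  /* In execute_one_pass: don't act on the flag directly, defer
     it into pending_TODOs instead.  */
  if (todo_after & TODO_force_next_scalar_cleanup)
    {
      todo_after &= ~TODO_force_next_scalar_cleanup;
      pending_TODOs |= TODO_force_next_scalar_cleanup;
    }

  /* In pass_pre_slp_scalar_cleanup::gate: consume the pending
     flag, but only when SLP vectorization is enabled.  */
  return flag_tree_slp_vectorize
         && (pending_TODOs & TODO_force_next_scalar_cleanup);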

Bootstrapped/regtested on powerpc64le-linux-gnu P8.

Is it ok for trunk?

BR,
Kewen
-----
gcc/ChangeLog:

        PR tree-optimization/96789
        * passes.c (class pass_pre_slp_scalar_cleanup): New class.
        (make_pass_pre_slp_scalar_cleanup): New function.
        (pass_data_pre_slp_scalar_cleanup): New pass data.
        (execute_one_pass): Add support for
        TODO_force_next_scalar_cleanup.
        (pending_TODOs): Init.
        * passes.def (pass_pre_slp_scalar_cleanup): New pass, add
        pass_fre and pass_dse as its children.
        * timevar.def (TV_SCALAR_CLEANUP): New timevar.
        * tree-pass.h (TODO_force_next_scalar_cleanup): New TODO flag.
        (make_pass_pre_slp_scalar_cleanup): New declaration.
        (pending_TODOs): Likewise.
        * tree-ssa-loop-ivcanon.c (tree_unroll_loops_completely_1):
        Extend to set father_bbs for outermost loops.
        (tree_unroll_loops_completely): Once any outermost loop gets
        unrolled, set outermost_unrolled and flag the return value
        with TODO_force_next_scalar_cleanup.

gcc/testsuite/ChangeLog:

        PR tree-optimization/96789
        * gcc.dg/tree-ssa/ssa-dse-28.c: Adjust.
        * gcc.dg/tree-ssa/ssa-dse-29.c: Likewise.
        * gcc.dg/vect/bb-slp-41.c: Likewise.
        * gcc.dg/tree-ssa/pr96789.c: New test.
diff --git a/gcc/passes.c b/gcc/passes.c
index 6ff31ec37d7..eb938d72a42 100644
--- a/gcc/passes.c
+++ b/gcc/passes.c
@@ -71,6 +71,8 @@ using namespace gcc;
    The variable current_pass is also used for statistics and plugins.  */
 opt_pass *current_pass;
 
+unsigned int pending_TODOs = 0;
+
 /* Most passes are single-instance (within their context) and thus don't
    need to implement cloning, but passes that support multiple instances
    *must* provide their own implementation of the clone method.
@@ -731,7 +733,54 @@ make_pass_late_compilation (gcc::context *ctxt)
   return new pass_late_compilation (ctxt);
 }
 
+/* Pre-SLP scalar cleanup; it holds cleanup passes like FRE and DSE.  */
+
+namespace {
+
+const pass_data pass_data_pre_slp_scalar_cleanup =
+{
+  GIMPLE_PASS, /* type */
+  "*pre_slp_scalar_cleanup", /* name */
+  OPTGROUP_LOOP, /* optinfo_flags */
+  TV_SCALAR_CLEANUP, /* tv_id */
+  ( PROP_cfg | PROP_ssa ), /* properties_required */
+  0, /* properties_provided */
+  0, /* properties_destroyed */
+  0, /* todo_flags_start */
+  0, /* todo_flags_finish */
+};
+
+class pass_pre_slp_scalar_cleanup : public gimple_opt_pass
+{
+public:
+  pass_pre_slp_scalar_cleanup (gcc::context *ctxt)
+    : gimple_opt_pass (pass_data_pre_slp_scalar_cleanup, ctxt)
+  {
+  }
+
+  virtual bool
+  gate (function *)
+  {
+    return flag_tree_slp_vectorize
+          && (pending_TODOs & TODO_force_next_scalar_cleanup);
+  }
+
+  virtual unsigned int
+  execute (function *)
+  {
+    pending_TODOs &= ~TODO_force_next_scalar_cleanup;
+    return 0;
+  }
 
+}; // class pass_pre_slp_scalar_cleanup
+
+} // anon namespace
+
+gimple_opt_pass *
+make_pass_pre_slp_scalar_cleanup (gcc::context *ctxt)
+{
+  return new pass_pre_slp_scalar_cleanup (ctxt);
+}
 
 /* Set the static pass number of pass PASS to ID and record that
    in the mapping from static pass number to pass.  */
@@ -2538,6 +2587,12 @@ execute_one_pass (opt_pass *pass)
       return true;
     }
 
+  if (todo_after & TODO_force_next_scalar_cleanup)
+    {
+      todo_after &= ~TODO_force_next_scalar_cleanup;
+      pending_TODOs |= TODO_force_next_scalar_cleanup;
+    }
+
   do_per_function (clear_last_verified, NULL);
 
   do_per_function (update_properties_after_pass, pass);
diff --git a/gcc/passes.def b/gcc/passes.def
index c0098d755bf..c74add75068 100644
--- a/gcc/passes.def
+++ b/gcc/passes.def
@@ -288,11 +288,16 @@ along with GCC; see the file COPYING3.  If not see
          /* pass_vectorize must immediately follow pass_if_conversion.
             Please do not add any other passes in between.  */
          NEXT_PASS (pass_vectorize);
-          PUSH_INSERT_PASSES_WITHIN (pass_vectorize)
+         PUSH_INSERT_PASSES_WITHIN (pass_vectorize)
              NEXT_PASS (pass_dce);
-          POP_INSERT_PASSES ()
-          NEXT_PASS (pass_predcom);
+         POP_INSERT_PASSES ()
+         NEXT_PASS (pass_predcom);
          NEXT_PASS (pass_complete_unroll);
+         NEXT_PASS (pass_pre_slp_scalar_cleanup);
+          PUSH_INSERT_PASSES_WITHIN (pass_pre_slp_scalar_cleanup)
+             NEXT_PASS (pass_fre, false /* may_iterate */);
+             NEXT_PASS (pass_dse);
+          POP_INSERT_PASSES ()
          NEXT_PASS (pass_slp_vectorize);
          NEXT_PASS (pass_loop_prefetch);
          /* Run IVOPTs after the last pass that uses data-reference analysis
diff --git a/gcc/testsuite/gcc.dg/tree-ssa/pr96789.c b/gcc/testsuite/gcc.dg/tree-ssa/pr96789.c
new file mode 100644
index 00000000000..d6139a014d8
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/tree-ssa/pr96789.c
@@ -0,0 +1,58 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -funroll-loops -ftree-vectorize -fdump-tree-dse-details" } */
+
+/* Test that the scalar cleanup pass takes effect; mainly check
+   that its child pass DSE can remove dead stores to the array
+   tmp.  */
+
+#include "stdint.h"
+
+static inline void
+foo (int16_t *diff, int i_size, uint8_t *val1, int i_val1, uint8_t *val2,
+     int i_val2)
+{
+  for (int y = 0; y < i_size; y++)
+    {
+      for (int x = 0; x < i_size; x++)
+       diff[x + y * i_size] = val1[x] - val2[x];
+      val1 += i_val1;
+      val2 += i_val2;
+    }
+}
+
+void
+bar (int16_t res[16], uint8_t *val1, uint8_t *val2)
+{
+  int16_t d[16];
+  int16_t tmp[16];
+
+  foo (d, 4, val1, 16, val2, 32);
+
+  for (int i = 0; i < 4; i++)
+    {
+      int s03 = d[i * 4 + 0] + d[i * 4 + 3];
+      int s12 = d[i * 4 + 1] + d[i * 4 + 2];
+      int d03 = d[i * 4 + 0] - d[i * 4 + 3];
+      int d12 = d[i * 4 + 1] - d[i * 4 + 2];
+
+      tmp[0 * 4 + i] = s03 + s12;
+      tmp[1 * 4 + i] = 2 * d03 + d12;
+      tmp[2 * 4 + i] = s03 - s12;
+      tmp[3 * 4 + i] = d03 - 2 * d12;
+    }
+
+  for (int i = 0; i < 4; i++)
+    {
+      int s03 = tmp[i * 4 + 0] + tmp[i * 4 + 3];
+      int s12 = tmp[i * 4 + 1] + tmp[i * 4 + 2];
+      int d03 = tmp[i * 4 + 0] - tmp[i * 4 + 3];
+      int d12 = tmp[i * 4 + 1] - tmp[i * 4 + 2];
+
+      res[i * 4 + 0] = s03 + s12;
+      res[i * 4 + 1] = 2 * d03 + d12;
+      res[i * 4 + 2] = s03 - s12;
+      res[i * 4 + 3] = d03 - 2 * d12;
+    }
+}
+
+/* { dg-final { scan-tree-dump {Deleted dead store:.*tmp} "dse3" } } */
diff --git a/gcc/testsuite/gcc.dg/tree-ssa/ssa-dse-28.c b/gcc/testsuite/gcc.dg/tree-ssa/ssa-dse-28.c
index d3a1bbc5ce5..b81cabe604a 100644
--- a/gcc/testsuite/gcc.dg/tree-ssa/ssa-dse-28.c
+++ b/gcc/testsuite/gcc.dg/tree-ssa/ssa-dse-28.c
@@ -17,5 +17,5 @@ int foo (int *p, int b)
 
 /* { dg-final { scan-tree-dump-not "Deleted dead store" "dse1"} } */
 /* { dg-final { scan-tree-dump-not "Deleted dead store" "dse2"} } */
-/* { dg-final { scan-tree-dump-not "Deleted dead store" "dse3"} } */
+/* { dg-final { scan-tree-dump-not "Deleted dead store" "dse4"} } */
 
diff --git a/gcc/testsuite/gcc.dg/tree-ssa/ssa-dse-29.c b/gcc/testsuite/gcc.dg/tree-ssa/ssa-dse-29.c
index 31529e7caed..f4ef89c590c 100644
--- a/gcc/testsuite/gcc.dg/tree-ssa/ssa-dse-29.c
+++ b/gcc/testsuite/gcc.dg/tree-ssa/ssa-dse-29.c
@@ -22,5 +22,5 @@ foo(int cond, struct z *s)
 
 /* { dg-final { scan-tree-dump-times "Deleted dead store" 3 "dse1"} } */
 /* { dg-final { scan-tree-dump-not "Deleted dead store" "dse2"} } */
-/* { dg-final { scan-tree-dump-not "Deleted dead store" "dse3"} } */
+/* { dg-final { scan-tree-dump-not "Deleted dead store" "dse4"} } */
 
diff --git a/gcc/testsuite/gcc.dg/vect/bb-slp-41.c b/gcc/testsuite/gcc.dg/vect/bb-slp-41.c
index 7de5ed1f5be..72245115f30 100644
--- a/gcc/testsuite/gcc.dg/vect/bb-slp-41.c
+++ b/gcc/testsuite/gcc.dg/vect/bb-slp-41.c
@@ -10,7 +10,10 @@ foo (int *a, int *b)
     a[i] = b[0] + b[1] + b[i+1] + b[i+2];
 }
 
-void bar (int *a, int *b)
+/* Disable pre-slp FRE to avoid unexpected SLP on the epilogue
+   of the 1st loop.  */
+void __attribute__((optimize("-fno-tree-fre")))
+bar (int *a, int *b)
 {
   int i;
   for (i = 0; i < (ARR_SIZE - 2); ++i)
diff --git a/gcc/timevar.def b/gcc/timevar.def
index 7dd1e2685a4..7549a331bc2 100644
--- a/gcc/timevar.def
+++ b/gcc/timevar.def
@@ -193,6 +193,7 @@ DEFTIMEVAR (TV_TREE_LOOP_UNSWITCH    , "tree loop unswitching")
 DEFTIMEVAR (TV_LOOP_SPLIT            , "loop splitting")
 DEFTIMEVAR (TV_LOOP_JAM              , "unroll and jam")
 DEFTIMEVAR (TV_COMPLETE_UNROLL       , "complete unrolling")
+DEFTIMEVAR (TV_SCALAR_CLEANUP        , "scalar cleanup")
 DEFTIMEVAR (TV_TREE_PARALLELIZE_LOOPS, "tree parallelize loops")
 DEFTIMEVAR (TV_TREE_VECTORIZATION    , "tree vectorization")
 DEFTIMEVAR (TV_TREE_SLP_VECTORIZATION, "tree slp vectorization")
diff --git a/gcc/tree-pass.h b/gcc/tree-pass.h
index f01e811917d..a6b202d98f5 100644
--- a/gcc/tree-pass.h
+++ b/gcc/tree-pass.h
@@ -301,6 +301,11 @@ protected:
 /* Release function body and stop pass manager.  */
 #define TODO_discard_function          (1 << 23)
 
+/* Used as a pending action; the following scalar cleanup pass
+   is expected to clear it and do the cleanup work when it sees
+   this flag set.  */
+#define TODO_force_next_scalar_cleanup  (1 << 24)
+
 /* Internally used in execute_function_todo().  */
 #define TODO_update_ssa_any            \
     (TODO_update_ssa                   \
@@ -380,6 +385,7 @@ extern gimple_opt_pass *make_pass_simduid_cleanup (gcc::context *ctxt);
 extern gimple_opt_pass *make_pass_slp_vectorize (gcc::context *ctxt);
 extern gimple_opt_pass *make_pass_complete_unroll (gcc::context *ctxt);
 extern gimple_opt_pass *make_pass_complete_unrolli (gcc::context *ctxt);
+extern gimple_opt_pass *make_pass_pre_slp_scalar_cleanup (gcc::context *ctxt);
 extern gimple_opt_pass *make_pass_parallelize_loops (gcc::context *ctxt);
 extern gimple_opt_pass *make_pass_loop_prefetch (gcc::context *ctxt);
 extern gimple_opt_pass *make_pass_iv_optimize (gcc::context *ctxt);
@@ -627,6 +633,12 @@ extern gimple_opt_pass *make_pass_convert_switch (gcc::context *ctxt);
 extern gimple_opt_pass *make_pass_lower_vaarg (gcc::context *ctxt);
 extern gimple_opt_pass *make_pass_gimple_isel (gcc::context *ctxt);
 
+/* Unlike the normal TODO_flags, which are handled right at the
+   beginning or the end of one pass execution, the pending
+   TODO_flags are passed down in the pipeline until one of their
+   consumers can take them over.  */
+extern unsigned int pending_TODOs;
+
 /* Current optimization pass.  */
 extern opt_pass *current_pass;
 
diff --git a/gcc/tree-ssa-loop-ivcanon.c b/gcc/tree-ssa-loop-ivcanon.c
index 298ab215530..905bd3add59 100644
--- a/gcc/tree-ssa-loop-ivcanon.c
+++ b/gcc/tree-ssa-loop-ivcanon.c
@@ -1404,13 +1404,14 @@ tree_unroll_loops_completely_1 (bool may_increase_size, bool unroll_outer,
         computations; otherwise, the size might blow up before the
         iteration is complete and the IR eventually cleaned up.  */
       if (loop_outer (loop_father))
-       {
-         /* Once we process our father we will have processed
-            the fathers of our children as well, so avoid doing
-            redundant work and clear fathers we've gathered sofar.  */
-         bitmap_clear (father_bbs);
-         bitmap_set_bit (father_bbs, loop_father->header->index);
-       }
+       /* Once we process our father we will have processed
+          the fathers of our children as well, so avoid doing
+          redundant work and clear fathers we've gathered so far.
+          But don't clear it for an outermost loop, since its
+          propagation doesn't necessarily run every time.  */
+       bitmap_clear (father_bbs);
+
+      bitmap_set_bit (father_bbs, loop_father->header->index);
 
       return true;
     }
@@ -1429,6 +1430,8 @@ tree_unroll_loops_completely (bool may_increase_size, bool unroll_outer)
   bool changed;
   int iteration = 0;
   bool irred_invalidated = false;
+  int retval = 0;
+  bool outermost_unrolled = false;
 
   estimate_numbers_of_iterations (cfun);
 
@@ -1472,6 +1475,8 @@ tree_unroll_loops_completely (bool may_increase_size, bool unroll_outer)
              if (loop_outer (unrolled_loop_bb->loop_father))
                bitmap_set_bit (fathers,
                                unrolled_loop_bb->loop_father->num);
+             else
+               outermost_unrolled = true;
            }
          bitmap_clear (father_bbs);
          /* Propagate the constants within the new basic blocks.  */
@@ -1509,11 +1514,15 @@ tree_unroll_loops_completely (bool may_increase_size, bool unroll_outer)
 
   BITMAP_FREE (father_bbs);
 
+  /* Trigger scalar cleanup once any outermost loop gets unrolled.  */
+  if (outermost_unrolled && unroll_outer)
+    retval |= TODO_force_next_scalar_cleanup;
+
   if (irred_invalidated
       && loops_state_satisfies_p (LOOPS_HAVE_MARKED_IRREDUCIBLE_REGIONS))
     mark_irreducible_loops ();
 
-  return 0;
+  return retval;
 }
 
 /* Canonical induction variable creation pass.  */
