I' like to split this patch:

https://gcc.gnu.org/pipermail/gcc-patches/2021-August/576488.html

to two patches:

0001-Fix-loop-split-incorrect-count-and-probability.patch
0002-Don-t-move-cold-code-out-of-loop-by-checking-bb-coun.patch

since they are solving two different things, please help to review
the attached series.  They show obvious performance improvement on
both P8 and P9 for CPU2017, and I am not sure how it will affect other
platforms like X86 and AArch64, it will be grateful if someone could
try it.  Thanks.


Xionghu
From 4e1ef5b1f423484a6789750e7cc0cf2e94517f20 Mon Sep 17 00:00:00 2001
From: Xionghu Luo <luo...@linux.ibm.com>
Date: Tue, 3 Aug 2021 03:44:14 -0500
Subject: [PATCH 1/2] Fix loop split incorrect count and probability

loop split condition is moved between loop1 and loop2, the split bb's
count and probability should also be duplicated instead of (100% vs INV),
secondly, the original loop1 and loop2 count need be propotional from the
original loop.

Regression tested pass, OK for master?

diff base/loop-cond-split-1.c.151t.lsplit  
patched/loop-cond-split-1.c.151t.lsplit:
...
   int prephitmp_16;
   int prephitmp_25;

   <bb 2> [local count: 118111600]:
   if (n_7(D) > 0)
     goto <bb 4>; [89.00%]
   else
     goto <bb 3>; [11.00%]

   <bb 3> [local count: 118111600]:
   return;

   <bb 4> [local count: 105119324]:
   pretmp_3 = ga;

-  <bb 5> [local count: 955630225]:
+  <bb 5> [local count: 315357973]:
   # i_13 = PHI <i_10(20), 0(4)>
   # prephitmp_12 = PHI <prephitmp_5(20), pretmp_3(4)>
   if (prephitmp_12 != 0)
     goto <bb 6>; [33.00%]
   else
     goto <bb 7>; [67.00%]

-  <bb 6> [local count: 315357972]:
+  <bb 6> [local count: 104068130]:
   _2 = do_something ();
   ga = _2;

-  <bb 7> [local count: 955630225]:
+  <bb 7> [local count: 315357973]:
   # prephitmp_5 = PHI <prephitmp_12(5), _2(6)>
   i_10 = inc (i_13);
   if (n_7(D) > i_10)
     goto <bb 21>; [89.00%]
   else
     goto <bb 11>; [11.00%]

   <bb 11> [local count: 105119324]:
   goto <bb 3>; [100.00%]

-  <bb 21> [local count: 850510901]:
+  <bb 21> [local count: 280668596]:
   if (prephitmp_12 != 0)
-    goto <bb 20>; [100.00%]
+    goto <bb 20>; [33.00%]
   else
-    goto <bb 19>; [INV]
+    goto <bb 19>; [67.00%]

-  <bb 20> [local count: 850510901]:
+  <bb 20> [local count: 280668596]:
   goto <bb 5>; [100.00%]

-  <bb 19> [count: 0]:
+  <bb 19> [local count: 70429947]:
   # i_23 = PHI <i_10(21)>
   # prephitmp_25 = PHI <prephitmp_5(21)>

-  <bb 12> [local count: 955630225]:
+  <bb 12> [local count: 640272252]:
   # i_15 = PHI <i_23(19), i_22(16)>
   # prephitmp_16 = PHI <prephitmp_25(19), prephitmp_16(16)>
   i_22 = inc (i_15);
   if (n_7(D) > i_22)
     goto <bb 16>; [89.00%]
   else
     goto <bb 11>; [11.00%]

-  <bb 16> [local count: 850510901]:
+  <bb 16> [local count: 569842305]:
   goto <bb 12>; [100.00%]

 }

gcc/ChangeLog:

        * tree-ssa-loop-split.c (split_loop): Fix incorrect probability.
        (do_split_loop_on_cond): Likewise.
---
 gcc/tree-ssa-loop-split.c | 25 ++++++++++++++++---------
 1 file changed, 16 insertions(+), 9 deletions(-)

diff --git a/gcc/tree-ssa-loop-split.c b/gcc/tree-ssa-loop-split.c
index 3f6ad046623..d30782888f3 100644
--- a/gcc/tree-ssa-loop-split.c
+++ b/gcc/tree-ssa-loop-split.c
@@ -575,7 +575,11 @@ split_loop (class loop *loop1)
                                            stmts2);
        tree cond = build2 (guard_code, boolean_type_node, guard_init, border);
        if (!initial_true)
-         cond = fold_build1 (TRUTH_NOT_EXPR, boolean_type_node, cond); 
+         cond = fold_build1 (TRUTH_NOT_EXPR, boolean_type_node, cond);
+
+       edge true_edge = EDGE_SUCC (bbs[i], 0)->flags & EDGE_TRUE_VALUE
+                          ? EDGE_SUCC (bbs[i], 0)
+                          : EDGE_SUCC (bbs[i], 1);
 
        /* Now version the loop, placing loop2 after loop1 connecting
           them, and fix up SSA form for that.  */
@@ -583,10 +587,10 @@ split_loop (class loop *loop1)
        basic_block cond_bb;
 
        class loop *loop2 = loop_version (loop1, cond, &cond_bb,
-                                          profile_probability::always (),
-                                          profile_probability::always (),
-                                          profile_probability::always (),
-                                          profile_probability::always (),
+                                          true_edge->probability,
+                                          true_edge->probability.invert (),
+                                          true_edge->probability,
+                                          true_edge->probability.invert (),
                                           true);
        gcc_assert (loop2);
 
@@ -1486,10 +1490,10 @@ do_split_loop_on_cond (struct loop *loop1, edge 
invar_branch)
   initialize_original_copy_tables ();
 
   struct loop *loop2 = loop_version (loop1, boolean_true_node, NULL,
-                                    profile_probability::always (),
-                                    profile_probability::never (),
-                                    profile_probability::always (),
-                                    profile_probability::always (),
+                                    invar_branch->probability.invert (),
+                                    invar_branch->probability,
+                                    invar_branch->probability.invert (),
+                                    invar_branch->probability,
                                     true);
   if (!loop2)
     {
@@ -1530,6 +1534,9 @@ do_split_loop_on_cond (struct loop *loop1, edge 
invar_branch)
   to_loop1->flags |= true_invar ? EDGE_FALSE_VALUE : EDGE_TRUE_VALUE;
   to_loop2->flags |= true_invar ? EDGE_TRUE_VALUE : EDGE_FALSE_VALUE;
 
+  to_loop1->probability = invar_branch->probability.invert ();
+  to_loop2->probability = invar_branch->probability;
+
   /* Due to introduction of a control flow edge from loop1 latch to loop2
      pre-header, we should update PHIs in loop2 to reflect this connection
      between loop1 and loop2.  */
-- 
2.27.0.90.geebb51ba8c

From b015708a4a3a3cdadb184012174f813c11bcdec2 Mon Sep 17 00:00:00 2001
From: Xiong Hu Luo <luo...@linux.ibm.com>
Date: Mon, 5 Jul 2021 03:57:11 -0500
Subject: [PATCH 2/2] Don't move cold code out of loop by checking bb count

There was a patch trying to avoid move cold block out of loop:

https://gcc.gnu.org/pipermail/gcc/2014-November/215551.html

Richard suggested to "never hoist anything from a bb with lower execution
frequency to a bb with higher one in LIM invariantness_dom_walker
before_dom_children".

This patch does this profile count check in both gimple LIM
move_computations_worker and RTL loop-invariant.c find_invariants_bb,
if the loop bb is colder than loop preheader, don't hoist it out of
loop.

SPEC2017 performance evaluation shows 1% performance improvement for
intrate GEOMEAN and no obvious regression for others.  Especially,
500.perlbench_r +7.52% (Perf shows function S_regtry of perlbench is
largely improved.), and 548.exchange2_r+1.98%, 526.blender_r +1.00%
on P8LE.

Regression and bootstrap tested pass on P8LE, any comments?  Thanks.

gcc/ChangeLog:

        * loop-invariant.c (find_invariants_bb): Check profile count
        before motion.
        (find_invariants_body): Add argument.
        * tree-ssa-loop-im.c (move_computations_worker): Check profile
        count before motion.
        (execute_sm): Likewise.
        (execute_sm_exit): Check pointer validness.

gcc/testsuite/ChangeLog:

        * gcc.dg/tree-ssa/recip-3.c: Adjust.
---
 gcc/loop-invariant.c                    |  10 +-
 gcc/testsuite/gcc.dg/tree-ssa/recip-3.c |   2 +-
 gcc/tree-ssa-loop-im.c                  | 154 +++++++++++++++++++++++-
 3 files changed, 157 insertions(+), 9 deletions(-)

diff --git a/gcc/loop-invariant.c b/gcc/loop-invariant.c
index fca0c2b24be..5c3be7bf0eb 100644
--- a/gcc/loop-invariant.c
+++ b/gcc/loop-invariant.c
@@ -1183,9 +1183,14 @@ find_invariants_insn (rtx_insn *insn, bool 
always_reached, bool always_executed)
    call.  */
 
 static void
-find_invariants_bb (basic_block bb, bool always_reached, bool always_executed)
+find_invariants_bb (class loop *loop, basic_block bb, bool always_reached,
+                   bool always_executed)
 {
   rtx_insn *insn;
+  basic_block preheader = loop_preheader_edge (loop)->src;
+
+  if (preheader->count > bb->count)
+    return;
 
   FOR_BB_INSNS (bb, insn)
     {
@@ -1214,8 +1219,7 @@ find_invariants_body (class loop *loop, basic_block *body,
   unsigned i;
 
   for (i = 0; i < loop->num_nodes; i++)
-    find_invariants_bb (body[i],
-                       bitmap_bit_p (always_reached, i),
+    find_invariants_bb (loop, body[i], bitmap_bit_p (always_reached, i),
                        bitmap_bit_p (always_executed, i));
 }
 
diff --git a/gcc/testsuite/gcc.dg/tree-ssa/recip-3.c 
b/gcc/testsuite/gcc.dg/tree-ssa/recip-3.c
index 638bf38db8c..641c91e719e 100644
--- a/gcc/testsuite/gcc.dg/tree-ssa/recip-3.c
+++ b/gcc/testsuite/gcc.dg/tree-ssa/recip-3.c
@@ -23,4 +23,4 @@ float h ()
        F[0] += E / d;
 }
 
-/* { dg-final { scan-tree-dump-times " / " 1 "recip" } } */
+/* { dg-final { scan-tree-dump-times " / " 5 "recip" } } */
diff --git a/gcc/tree-ssa-loop-im.c b/gcc/tree-ssa-loop-im.c
index d9f75d5025e..80d61ddf531 100644
--- a/gcc/tree-ssa-loop-im.c
+++ b/gcc/tree-ssa-loop-im.c
@@ -1154,6 +1154,58 @@ move_computations_worker (basic_block bb)
          continue;
        }
 
+      edge e = loop_preheader_edge (level);
+      if (e->src->count > bb->count)
+       {
+         if (dump_file && (dump_flags & TDF_DETAILS))
+           {
+             fprintf (dump_file, "PHI node NOT moved to %d from %d:\n",
+                      e->src->index, bb->index);
+             print_gimple_stmt (dump_file, stmt, 0);
+             fprintf (dump_file, "(cost %u) out of loop %d.\n\n", cost,
+                      level->num);
+           }
+         gsi_next (&bsi);
+         continue;
+       }
+      else
+       {
+         unsigned i;
+         bool skip_phi_move = false;
+         for (i = 0; i < gimple_phi_num_args (stmt); i++)
+           {
+             tree def = PHI_ARG_DEF (stmt, i);
+
+             if (TREE_CODE (def) != SSA_NAME)
+               continue;
+
+             gimple *def_stmt = SSA_NAME_DEF_STMT (def);
+
+             if (!gimple_bb (def_stmt))
+               continue;
+
+             if (!dominated_by_p (CDI_DOMINATORS, e->src,
+                                  gimple_bb (def_stmt)))
+               {
+                 if (dump_file && (dump_flags & TDF_DETAILS))
+                   {
+                     fprintf (dump_file, "PHI node NOT moved to %d from %d\n",
+                              e->src->index, bb->index);
+                     print_gimple_stmt (dump_file, stmt, 0);
+                     fprintf (dump_file, "(cost %u) out of loop %d.\n\n", cost,
+                              level->num);
+                   }
+                 skip_phi_move = true;
+                 break;
+               }
+           }
+         if (skip_phi_move)
+           {
+             gsi_next (&bsi);
+             continue;
+           }
+       }
+
       if (dump_file && (dump_flags & TDF_DETAILS))
        {
          fprintf (dump_file, "Moving PHI node\n");
@@ -1191,14 +1243,13 @@ move_computations_worker (basic_block bb)
          tree lhs = gimple_assign_lhs (new_stmt);
          SSA_NAME_RANGE_INFO (lhs) = NULL;
        }
-      gsi_insert_on_edge (loop_preheader_edge (level), new_stmt);
+      gsi_insert_on_edge (e, new_stmt);
       remove_phi_node (&bsi, false);
     }
 
   for (gimple_stmt_iterator bsi = gsi_start_bb (bb); !gsi_end_p (bsi); )
     {
       edge e;
-
       gimple *stmt = gsi_stmt (bsi);
 
       lim_data = get_lim_data (stmt);
@@ -1221,7 +1272,83 @@ move_computations_worker (basic_block bb)
       /* We do not really want to move conditionals out of the loop; we just
         placed it here to force its operands to be moved if necessary.  */
       if (gimple_code (stmt) == GIMPLE_COND)
-       continue;
+       {
+         gsi_next (&bsi);
+         continue;
+       }
+
+      e = loop_preheader_edge (level);
+      if (e->src->count > bb->count)
+       {
+         if (dump_file && (dump_flags & TDF_DETAILS))
+           {
+             fprintf (dump_file, "stmt: Statement NOT moved to %d from %d \n",
+                      e->src->index, bb->index);
+             print_gimple_stmt (dump_file, stmt, 0);
+             fprintf (dump_file, "(cost %u) out of loop %d.\n\n", cost,
+                      level->num);
+           }
+         gsi_next (&bsi);
+         continue;
+       }
+      else
+       {
+         if (is_gimple_assign (stmt))
+           {
+             tree rhs1 = gimple_assign_rhs1 (stmt);
+             tree rhs2 = gimple_assign_rhs2 (stmt);
+             if (TREE_CODE (rhs1) == MEM_REF)
+               {
+                 rhs2 = TREE_OPERAND (rhs1, 1);
+                 rhs1 = TREE_OPERAND (rhs1, 0);
+               }
+             gimple *stmt1 = NULL, *stmt2 = NULL;
+             basic_block def_bb;
+             if (rhs1 && TREE_CODE (rhs1) == SSA_NAME)
+               {
+                 stmt1 = SSA_NAME_DEF_STMT (rhs1);
+                 def_bb = gimple_bb (stmt1);
+                 if (stmt1
+                     && def_bb
+                     && (def_bb == bb
+                         || !dominated_by_p (CDI_DOMINATORS, e->src, def_bb)))
+                   {
+                     if (dump_file && (dump_flags & TDF_DETAILS))
+                       {
+                         fprintf (dump_file,
+                                  "stmt1: Statement NOT moved to %d from %d\n",
+                                  e->src->index, bb->index);
+                         print_gimple_stmt (dump_file, stmt, 0);
+                         fprintf (dump_file, "(cost %u) out of loop %d.\n\n",
+                                  cost, level->num);
+                       }
+                     gsi_next (&bsi);
+                     continue;
+                   }
+               }
+             if (rhs2 && TREE_CODE (rhs2) == SSA_NAME)
+               {
+                 stmt2 = SSA_NAME_DEF_STMT (rhs2);
+                 def_bb = gimple_bb (stmt2);
+                 if (stmt2 && def_bb
+                     && (def_bb == bb
+                         || !dominated_by_p (CDI_DOMINATORS, e->src, def_bb)))
+                   {
+                     if (dump_file && (dump_flags & TDF_DETAILS))
+                       {
+                         fprintf (dump_file,
+                                  "stmt2: Statement NOT moved to %d from %d\n",
+                                  e->src->index, bb->index);
+                         print_gimple_stmt (dump_file, stmt, 0);
+                         fprintf (dump_file, "(cost %u) out of loop %d.\n\n",
+                                  cost, level->num);
+                       }
+                     gsi_next (&bsi);
+                     continue;
+                   }
+               }
+           }
+       }
 
       if (dump_file && (dump_flags & TDF_DETAILS))
        {
@@ -1231,7 +1358,6 @@ move_computations_worker (basic_block bb)
                   cost, level->num);
        }
 
-      e = loop_preheader_edge (level);
       gcc_assert (!gimple_vdef (stmt));
       if (gimple_vuse (stmt))
        {
@@ -2133,6 +2259,19 @@ execute_sm (class loop *loop, im_mem_ref *ref,
   bool multi_threaded_model_p = false;
   gimple_stmt_iterator gsi;
   sm_aux *aux = new sm_aux;
+  basic_block bb = gimple_bb (first_mem_ref_loc (loop, ref)->stmt);
+
+  edge e = loop_preheader_edge (loop);
+  if (e->src->count > bb->count)
+    {
+      if (dump_file && (dump_flags & TDF_DETAILS))
+       {
+         fprintf (dump_file, "Don't execute store motion of ");
+         print_generic_expr (dump_file, ref->mem.ref);
+         fprintf (dump_file, " from loop %d\n", loop->num);
+       }
+      return;
+    }
 
   if (dump_file && (dump_flags & TDF_DETAILS))
     {
@@ -2241,7 +2380,12 @@ execute_sm_exit (class loop *loop, edge ex, 
vec<seq_entry> &seq,
        }
       else
        {
-         sm_aux *aux = *aux_map.get (ref);
+         sm_aux **paux = aux_map.get (ref);
+         sm_aux *aux;
+         if (paux)
+           aux = *paux;
+         else
+           continue;
          if (!aux->store_flag || kind == sm_ord)
            {
              gassign *store;
-- 
2.27.0.90.geebb51ba8c

Reply via email to