I'd like to split this patch: https://gcc.gnu.org/pipermail/gcc-patches/2021-August/576488.html
into two patches: 0001-Fix-loop-split-incorrect-count-and-probability.patch 0002-Don-t-move-cold-code-out-of-loop-by-checking-bb-coun.patch since they are solving two different things, please help to review the attached series. They show obvious performance improvement on both P8 and P9 for CPU2017, and I am not sure how it will affect other platforms like X86 and AArch64; I would be grateful if someone could try it. Thanks. Xionghu
From 4e1ef5b1f423484a6789750e7cc0cf2e94517f20 Mon Sep 17 00:00:00 2001 From: Xionghu Luo <luo...@linux.ibm.com> Date: Tue, 3 Aug 2021 03:44:14 -0500 Subject: [PATCH 1/2] Fix loop split incorrect count and probability loop split condition is moved between loop1 and loop2, the split bb's count and probability should also be duplicated instead of (100% vs INV), secondly, the original loop1 and loop2 count need be propotional from the original loop. Regression tested pass, OK for master? diff base/loop-cond-split-1.c.151t.lsplit patched/loop-cond-split-1.c.151t.lsplit: ... int prephitmp_16; int prephitmp_25; <bb 2> [local count: 118111600]: if (n_7(D) > 0) goto <bb 4>; [89.00%] else goto <bb 3>; [11.00%] <bb 3> [local count: 118111600]: return; <bb 4> [local count: 105119324]: pretmp_3 = ga; - <bb 5> [local count: 955630225]: + <bb 5> [local count: 315357973]: # i_13 = PHI <i_10(20), 0(4)> # prephitmp_12 = PHI <prephitmp_5(20), pretmp_3(4)> if (prephitmp_12 != 0) goto <bb 6>; [33.00%] else goto <bb 7>; [67.00%] - <bb 6> [local count: 315357972]: + <bb 6> [local count: 104068130]: _2 = do_something (); ga = _2; - <bb 7> [local count: 955630225]: + <bb 7> [local count: 315357973]: # prephitmp_5 = PHI <prephitmp_12(5), _2(6)> i_10 = inc (i_13); if (n_7(D) > i_10) goto <bb 21>; [89.00%] else goto <bb 11>; [11.00%] <bb 11> [local count: 105119324]: goto <bb 3>; [100.00%] - <bb 21> [local count: 850510901]: + <bb 21> [local count: 280668596]: if (prephitmp_12 != 0) - goto <bb 20>; [100.00%] + goto <bb 20>; [33.00%] else - goto <bb 19>; [INV] + goto <bb 19>; [67.00%] - <bb 20> [local count: 850510901]: + <bb 20> [local count: 280668596]: goto <bb 5>; [100.00%] - <bb 19> [count: 0]: + <bb 19> [local count: 70429947]: # i_23 = PHI <i_10(21)> # prephitmp_25 = PHI <prephitmp_5(21)> - <bb 12> [local count: 955630225]: + <bb 12> [local count: 640272252]: # i_15 = PHI <i_23(19), i_22(16)> # prephitmp_16 = PHI <prephitmp_25(19), prephitmp_16(16)> i_22 = inc (i_15); if (n_7(D) > i_22) 
goto <bb 16>; [89.00%] else goto <bb 11>; [11.00%] - <bb 16> [local count: 850510901]: + <bb 16> [local count: 569842305]: goto <bb 12>; [100.00%] } gcc/ChangeLog: * tree-ssa-loop-split.c (split_loop): Fix incorrect probability. (do_split_loop_on_cond): Likewise. --- gcc/tree-ssa-loop-split.c | 25 ++++++++++++++++--------- 1 file changed, 16 insertions(+), 9 deletions(-) diff --git a/gcc/tree-ssa-loop-split.c b/gcc/tree-ssa-loop-split.c index 3f6ad046623..d30782888f3 100644 --- a/gcc/tree-ssa-loop-split.c +++ b/gcc/tree-ssa-loop-split.c @@ -575,7 +575,11 @@ split_loop (class loop *loop1) stmts2); tree cond = build2 (guard_code, boolean_type_node, guard_init, border); if (!initial_true) - cond = fold_build1 (TRUTH_NOT_EXPR, boolean_type_node, cond); + cond = fold_build1 (TRUTH_NOT_EXPR, boolean_type_node, cond); + + edge true_edge = EDGE_SUCC (bbs[i], 0)->flags & EDGE_TRUE_VALUE + ? EDGE_SUCC (bbs[i], 0) + : EDGE_SUCC (bbs[i], 1); /* Now version the loop, placing loop2 after loop1 connecting them, and fix up SSA form for that. 
*/ @@ -583,10 +587,10 @@ split_loop (class loop *loop1) basic_block cond_bb; class loop *loop2 = loop_version (loop1, cond, &cond_bb, - profile_probability::always (), - profile_probability::always (), - profile_probability::always (), - profile_probability::always (), + true_edge->probability, + true_edge->probability.invert (), + true_edge->probability, + true_edge->probability.invert (), true); gcc_assert (loop2); @@ -1486,10 +1490,10 @@ do_split_loop_on_cond (struct loop *loop1, edge invar_branch) initialize_original_copy_tables (); struct loop *loop2 = loop_version (loop1, boolean_true_node, NULL, - profile_probability::always (), - profile_probability::never (), - profile_probability::always (), - profile_probability::always (), + invar_branch->probability.invert (), + invar_branch->probability, + invar_branch->probability.invert (), + invar_branch->probability, true); if (!loop2) { @@ -1530,6 +1534,9 @@ do_split_loop_on_cond (struct loop *loop1, edge invar_branch) to_loop1->flags |= true_invar ? EDGE_FALSE_VALUE : EDGE_TRUE_VALUE; to_loop2->flags |= true_invar ? EDGE_TRUE_VALUE : EDGE_FALSE_VALUE; + to_loop1->probability = invar_branch->probability.invert (); + to_loop2->probability = invar_branch->probability; + /* Due to introduction of a control flow edge from loop1 latch to loop2 pre-header, we should update PHIs in loop2 to reflect this connection between loop1 and loop2. */ -- 2.27.0.90.geebb51ba8c
From b015708a4a3a3cdadb184012174f813c11bcdec2 Mon Sep 17 00:00:00 2001 From: Xiong Hu Luo <luo...@linux.ibm.com> Date: Mon, 5 Jul 2021 03:57:11 -0500 Subject: [PATCH 2/2] Don't move cold code out of loop by checking bb count There was a patch trying to avoid move cold block out of loop: https://gcc.gnu.org/pipermail/gcc/2014-November/215551.html Richard suggested to "never hoist anything from a bb with lower execution frequency to a bb with higher one in LIM invariantness_dom_walker before_dom_children". This patch does this profile count check in both gimple LIM move_computations_worker and RTL loop-invariant.c find_invariants_bb, if the loop bb is colder than loop preheader, don't hoist it out of loop. SPEC2017 performance evaluation shows 1% performance improvement for intrate GEOMEAN and no obvious regression for others. Especially, 500.perlbench_r +7.52% (Perf shows function S_regtry of perlbench is largely improved.), and 548.exchange2_r+1.98%, 526.blender_r +1.00% on P8LE. Regression and bootstrap tested pass on P8LE, any comments? Thanks. gcc/ChangeLog: * loop-invariant.c (find_invariants_bb): Check profile count before motion. (find_invariants_body): Add argument. * tree-ssa-loop-im.c (move_computations_worker): Check profile count before motion. (execute_sm): Likewise. (execute_sm_exit): Check pointer validness. gcc/testsuite/ChangeLog: * gcc.dg/tree-ssa/recip-3.c: Adjust. --- gcc/loop-invariant.c | 10 +- gcc/testsuite/gcc.dg/tree-ssa/recip-3.c | 2 +- gcc/tree-ssa-loop-im.c | 154 +++++++++++++++++++++++- 3 files changed, 157 insertions(+), 9 deletions(-) diff --git a/gcc/loop-invariant.c b/gcc/loop-invariant.c index fca0c2b24be..5c3be7bf0eb 100644 --- a/gcc/loop-invariant.c +++ b/gcc/loop-invariant.c @@ -1183,9 +1183,14 @@ find_invariants_insn (rtx_insn *insn, bool always_reached, bool always_executed) call. 
*/ static void -find_invariants_bb (basic_block bb, bool always_reached, bool always_executed) +find_invariants_bb (class loop *loop, basic_block bb, bool always_reached, + bool always_executed) { rtx_insn *insn; + basic_block preheader = loop_preheader_edge (loop)->src; + + if (preheader->count > bb->count) + return; FOR_BB_INSNS (bb, insn) { @@ -1214,8 +1219,7 @@ find_invariants_body (class loop *loop, basic_block *body, unsigned i; for (i = 0; i < loop->num_nodes; i++) - find_invariants_bb (body[i], - bitmap_bit_p (always_reached, i), + find_invariants_bb (loop, body[i], bitmap_bit_p (always_reached, i), bitmap_bit_p (always_executed, i)); } diff --git a/gcc/testsuite/gcc.dg/tree-ssa/recip-3.c b/gcc/testsuite/gcc.dg/tree-ssa/recip-3.c index 638bf38db8c..641c91e719e 100644 --- a/gcc/testsuite/gcc.dg/tree-ssa/recip-3.c +++ b/gcc/testsuite/gcc.dg/tree-ssa/recip-3.c @@ -23,4 +23,4 @@ float h () F[0] += E / d; } -/* { dg-final { scan-tree-dump-times " / " 1 "recip" } } */ +/* { dg-final { scan-tree-dump-times " / " 5 "recip" } } */ diff --git a/gcc/tree-ssa-loop-im.c b/gcc/tree-ssa-loop-im.c index d9f75d5025e..80d61ddf531 100644 --- a/gcc/tree-ssa-loop-im.c +++ b/gcc/tree-ssa-loop-im.c @@ -1154,6 +1154,58 @@ move_computations_worker (basic_block bb) continue; } + edge e = loop_preheader_edge (level); + if (e->src->count > bb->count) + { + if (dump_file && (dump_flags & TDF_DETAILS)) + { + fprintf (dump_file, "PHI node NOT moved to %d from %d:\n", + e->src->index, bb->index); + print_gimple_stmt (dump_file, stmt, 0); + fprintf (dump_file, "(cost %u) out of loop %d.\n\n", cost, + level->num); + } + gsi_next (&bsi); + continue; + } + else + { + unsigned i; + bool skip_phi_move = false; + for (i = 0; i < gimple_phi_num_args (stmt); i++) + { + tree def = PHI_ARG_DEF (stmt, i); + + if (TREE_CODE (def) != SSA_NAME) + continue; + + gimple *def_stmt = SSA_NAME_DEF_STMT (def); + + if (!gimple_bb (def_stmt)) + continue; + + if (!dominated_by_p (CDI_DOMINATORS, e->src, + 
gimple_bb (def_stmt))) + { + if (dump_file && (dump_flags & TDF_DETAILS)) + { + fprintf (dump_file, "PHI node NOT moved to %d from %d\n", + e->src->index, bb->index); + print_gimple_stmt (dump_file, stmt, 0); + fprintf (dump_file, "(cost %u) out of loop %d.\n\n", cost, + level->num); + } + skip_phi_move = true; + break; + } + } + if (skip_phi_move) + { + gsi_next (&bsi); + continue; + } + } + if (dump_file && (dump_flags & TDF_DETAILS)) { fprintf (dump_file, "Moving PHI node\n"); @@ -1191,14 +1243,13 @@ move_computations_worker (basic_block bb) tree lhs = gimple_assign_lhs (new_stmt); SSA_NAME_RANGE_INFO (lhs) = NULL; } - gsi_insert_on_edge (loop_preheader_edge (level), new_stmt); + gsi_insert_on_edge (e, new_stmt); remove_phi_node (&bsi, false); } for (gimple_stmt_iterator bsi = gsi_start_bb (bb); !gsi_end_p (bsi); ) { edge e; - gimple *stmt = gsi_stmt (bsi); lim_data = get_lim_data (stmt); @@ -1221,7 +1272,83 @@ move_computations_worker (basic_block bb) /* We do not really want to move conditionals out of the loop; we just placed it here to force its operands to be moved if necessary. 
*/ if (gimple_code (stmt) == GIMPLE_COND) - continue; + { + gsi_next (&bsi); + continue; + } + + e = loop_preheader_edge (level); + if (e->src->count > bb->count) + { + if (dump_file && (dump_flags & TDF_DETAILS)) + { + fprintf (dump_file, "stmt: Statement NOT moved to %d from %d \n", + e->src->index, bb->index); + print_gimple_stmt (dump_file, stmt, 0); + fprintf (dump_file, "(cost %u) out of loop %d.\n\n", cost, + level->num); + } + gsi_next (&bsi); + continue; + } + else + { + if (is_gimple_assign (stmt)) + { + tree rhs1 = gimple_assign_rhs1 (stmt); + tree rhs2 = gimple_assign_rhs2 (stmt); + if (TREE_CODE (rhs1) == MEM_REF) + { + rhs2 = TREE_OPERAND (rhs1, 1); + rhs1 = TREE_OPERAND (rhs1, 0); + } + gimple *stmt1 = NULL, *stmt2 = NULL; + basic_block def_bb; + if (rhs1 && TREE_CODE (rhs1) == SSA_NAME) + { + stmt1 = SSA_NAME_DEF_STMT (rhs1); + def_bb = gimple_bb (stmt1); + if (stmt1 + && def_bb + && (def_bb == bb + || !dominated_by_p (CDI_DOMINATORS, e->src, def_bb))) + { + if (dump_file && (dump_flags & TDF_DETAILS)) + { + fprintf (dump_file, + "stmt1: Statement NOT moved to %d from %d\n", + e->src->index, bb->index); + print_gimple_stmt (dump_file, stmt, 0); + fprintf (dump_file, "(cost %u) out of loop %d.\n\n", + cost, level->num); + } + gsi_next (&bsi); + continue; + } + } + if (rhs2 && TREE_CODE (rhs2) == SSA_NAME) + { + stmt2 = SSA_NAME_DEF_STMT (rhs2); + def_bb = gimple_bb (stmt2); + if (stmt2 && def_bb + && (def_bb == bb + || !dominated_by_p (CDI_DOMINATORS, e->src, def_bb))) + { + if (dump_file && (dump_flags & TDF_DETAILS)) + { + fprintf (dump_file, + "stmt2: Statement NOT moved to %d from %d\n", + e->src->index, bb->index); + print_gimple_stmt (dump_file, stmt, 0); + fprintf (dump_file, "(cost %u) out of loop %d.\n\n", + cost, level->num); + } + gsi_next (&bsi); + continue; + } + } + } + } if (dump_file && (dump_flags & TDF_DETAILS)) { @@ -1231,7 +1358,6 @@ move_computations_worker (basic_block bb) cost, level->num); } - e = loop_preheader_edge (level); 
gcc_assert (!gimple_vdef (stmt)); if (gimple_vuse (stmt)) { @@ -2133,6 +2259,19 @@ execute_sm (class loop *loop, im_mem_ref *ref, bool multi_threaded_model_p = false; gimple_stmt_iterator gsi; sm_aux *aux = new sm_aux; + basic_block bb = gimple_bb (first_mem_ref_loc (loop, ref)->stmt); + + edge e = loop_preheader_edge (loop); + if (e->src->count > bb->count) + { + if (dump_file && (dump_flags & TDF_DETAILS)) + { + fprintf (dump_file, "Don't execute store motion of "); + print_generic_expr (dump_file, ref->mem.ref); + fprintf (dump_file, " from loop %d\n", loop->num); + } + return; + } if (dump_file && (dump_flags & TDF_DETAILS)) { @@ -2241,7 +2380,12 @@ execute_sm_exit (class loop *loop, edge ex, vec<seq_entry> &seq, } else { - sm_aux *aux = *aux_map.get (ref); + sm_aux **paux = aux_map.get (ref); + sm_aux *aux; + if (paux) + aux = *paux; + else + continue; if (!aux->store_flag || kind == sm_ord) { gassign *store; -- 2.27.0.90.geebb51ba8c