On Thu, 29 Apr 2021, Jiufu Guo wrote: > When there is the possibility that overflow may happen on the loop index, > a few optimizations would not happen. For example code: > > foo (int *a, int *b, unsigned k, unsigned n) > { > while (++k != n) > a[k] = b[k] + 1; > } > > For this code, if "k > n", overflow may happen. If "k < n" at the beginning, > it could be optimized (e.g. vectorization). > > We can split the loop into two loops: > > while (++k > n) > a[k] = b[k] + 1; > while (++k < n) > a[k] = b[k] + 1; > > then for the second loop, it could be optimized. > > This patch is splitting this kind of small loop to achieve better performance. > > Bootstrap and regtest pass on ppc64le. Is this ok for trunk?
Do you have any statistics on how often this splits a loop during bootstrap (use --with-build-config=bootstrap-O3)? Or alternatively on SPEC? Actual comments on the patch inline. > Thanks! > > Jiufu Guo. > > gcc/ChangeLog: > > 2021-04-29 Jiufu Guo <guoji...@linux.ibm.com> > > * params.opt (max-insns-ne-cond-split): New. > * tree-ssa-loop-split.c (connect_loop_phis): Add new param. > (get_ne_cond_branch): New function. > (split_ne_loop): New function. > (split_loop_on_ne_cond): New function. > (tree_ssa_split_loops): Use split_loop_on_ne_cond. > > gcc/testsuite/ChangeLog: > 2021-04-29 Jiufu Guo <guoji...@linux.ibm.com> > > * gcc.dg/loop-split1.c: New test. > > --- > gcc/params.opt | 4 + > gcc/testsuite/gcc.dg/loop-split1.c | 28 ++++ > gcc/tree-ssa-loop-split.c | 219 ++++++++++++++++++++++++++++- > 3 files changed, 247 insertions(+), 4 deletions(-) > create mode 100644 gcc/testsuite/gcc.dg/loop-split1.c > > diff --git a/gcc/params.opt b/gcc/params.opt > index 2e4cbdd7a71..900b59b5136 100644 > --- a/gcc/params.opt > +++ b/gcc/params.opt > @@ -766,6 +766,10 @@ Min. ratio of insns to prefetches to enable prefetching > for a loop with an unkno > Common Joined UInteger Var(param_min_loop_cond_split_prob) Init(30) > IntegerRange(0, 100) Param Optimization > The minimum threshold for probability of semi-invariant condition statement > to trigger loop split. > > +-param=max-insns-ne-cond-split= > +Common Joined UInteger Var(param_max_insn_ne_cond_split) Init(64) Param > Optimization > +The maximum threshold for insnstructions number of a loop with ne condition > to split. > + > -param=min-nondebug-insn-uid= > Common Joined UInteger Var(param_min_nondebug_insn_uid) Param > The minimum UID to be used for a nondebug insn. 
> diff --git a/gcc/testsuite/gcc.dg/loop-split1.c > b/gcc/testsuite/gcc.dg/loop-split1.c > new file mode 100644 > index 00000000000..4c466aa9f54 > --- /dev/null > +++ b/gcc/testsuite/gcc.dg/loop-split1.c > @@ -0,0 +1,28 @@ > +/* { dg-do compile } */ > +/* { dg-options "-O2 -fsplit-loops -fdump-tree-lsplit-details" } */ > + > +void > +foo (int *a, int *b, unsigned l, unsigned n) > +{ > + while (++l != n) > + a[l] = b[l] + 1; > +} > + > +void > +foo1 (int *a, int *b, unsigned l, unsigned n) > +{ > + while (l++ != n) > + a[l] = b[l] + 1; > +} > + > +unsigned > +foo2 (char *a, char *b, unsigned l, unsigned n) > +{ > + while (++l != n) > + if (a[l] != b[l]) > + break; > + > + return l; > +} > + > +/* { dg-final { scan-tree-dump-times "Loop split" 3 "lsplit" } } */ > diff --git a/gcc/tree-ssa-loop-split.c b/gcc/tree-ssa-loop-split.c > index b80b6a75e62..a6d28078e5e 100644 > --- a/gcc/tree-ssa-loop-split.c > +++ b/gcc/tree-ssa-loop-split.c > @@ -41,6 +41,7 @@ along with GCC; see the file COPYING3. If not see > #include "cfghooks.h" > #include "gimple-fold.h" > #include "gimplify-me.h" > +#include "tree-ssa-loop-ivopts.h" > > /* This file implements two kinds of loop splitting. > > @@ -233,7 +234,8 @@ easy_exit_values (class loop *loop) > this. The loops need to fulfill easy_exit_values(). 
*/ > > static void > -connect_loop_phis (class loop *loop1, class loop *loop2, edge new_e) > +connect_loop_phis (class loop *loop1, class loop *loop2, edge new_e, > + bool use_prev = false) > { > basic_block rest = loop_preheader_edge (loop2)->src; > gcc_assert (new_e->dest == rest); > @@ -248,13 +250,14 @@ connect_loop_phis (class loop *loop1, class loop > *loop2, edge new_e) > !gsi_end_p (psi_first); > gsi_next (&psi_first), gsi_next (&psi_second)) > { > - tree init, next, new_init; > + tree init, next, new_init, prev; > use_operand_p op; > gphi *phi_first = psi_first.phi (); > gphi *phi_second = psi_second.phi (); > > init = PHI_ARG_DEF_FROM_EDGE (phi_first, firste); > next = PHI_ARG_DEF_FROM_EDGE (phi_first, firstn); > + prev = PHI_RESULT (phi_first); > op = PHI_ARG_DEF_PTR_FROM_EDGE (phi_second, seconde); > gcc_assert (operand_equal_for_phi_arg_p (init, USE_FROM_PTR (op))); > > @@ -279,7 +282,7 @@ connect_loop_phis (class loop *loop1, class loop *loop2, > edge new_e) > > gphi * newphi = create_phi_node (new_init, rest); > add_phi_arg (newphi, init, skip_first, UNKNOWN_LOCATION); > - add_phi_arg (newphi, next, new_e, UNKNOWN_LOCATION); > + add_phi_arg (newphi, use_prev ? prev : next, new_e, UNKNOWN_LOCATION); > SET_USE (op, new_init); > } > } > @@ -1599,6 +1602,213 @@ split_loop_on_cond (struct loop *loop) > return do_split; > } > > +/* Check if the LOOP exit branch likes "if (idx != bound)". > + if INV is not NULL and the branch is "if (bound != idx)", set *INV to > true. If > + return the branch edge which exit loop. */ Return > + > +static edge > +get_ne_cond_branch (struct loop *loop, bool *inv) > +{ > + int i; > + edge e; > + > + auto_vec<edge> edges = get_loop_exit_edges (loop); > + FOR_EACH_VEC_ELT (edges, i, e) > + { > + basic_block bb = e->src; > + > + /* Check gcond. 
*/ > + gimple *last = last_stmt (bb); > + if (!last || gimple_code (last) != GIMPLE_COND) > + continue; > + gcond *cond = as_a<gcond *> (last); > + enum tree_code code = gimple_cond_code (cond); > + if (code != NE_EXPR) > + continue; I'm not sure we canonicalize the case with code == EQ_EXPR, at least for void bar(); void foo(unsigned n) { unsigned i = 0; do { if (i == n) return; bar(); ++i; } while (1); } we don't. Since you return the exit edge can this case be handled transparently? > + > + /* Make sure idx and bound. */ > + tree idx = gimple_cond_lhs (cond); > + tree bnd = gimple_cond_rhs (cond); > + if (expr_invariant_in_loop_p (loop, idx)) > + { > + std::swap (idx, bnd); > + if (inv) > + *inv = true; > + } > + else if (!expr_invariant_in_loop_p (loop, bnd)) > + continue; We canonicalize i < UINT_MAX to i != UINT_MAX so you want to detect that and not split the loop if 'bnd' is the maximum or minimum value of the type I think. > + /* Extract conversion. */ > + if (TREE_CODE (idx) == SSA_NAME) > + { > + gimple *stmt = SSA_NAME_DEF_STMT (idx); > + if (is_gimple_assign (stmt) > + && CONVERT_EXPR_CODE_P (gimple_assign_rhs_code (stmt)) > + && flow_bb_inside_loop_p (loop, gimple_bb (stmt))) > + idx = gimple_assign_rhs1 (stmt); > + } This skips arbitrary extensions and truncations - is that intended? > + /* Make sure idx is iv. */ > + class loop *useloop = loop_containing_stmt (cond); > + affine_iv iv; > + if (!simple_iv (loop, useloop, idx, &iv, false)) > + continue; > + > + /* No need to split loop, if base is know value. > + Or check range info. */ > + if (TREE_CODE (iv.base) == INTEGER_CST) > + continue; I think it would be better to check iv.no_overflow? Also looking it might be possible to use simple_iv_with_niters with IV_NITERS not NULL for most of this analysis? > + /* There is type conversion on idx(or rhs of idx's def). > + And there is converting shorter to longer type. 
*/ > + tree type = TREE_TYPE (idx); > + if (!INTEGRAL_TYPE_P (type) || TREE_CODE (idx) != SSA_NAME > + || !TYPE_UNSIGNED (type) > + || TYPE_PRECISION (type) == TYPE_PRECISION (sizetype)) > + continue; That check can be done before the (expensive) simple_iv check. I wonder what the 'sizetype' precision check is about? The function level comment should probably clarify what kind of conversions we handle (and why). > + /* Check loop is simple to split. */ > + gcc_assert (bb != loop->latch); > + > + if (single_pred_p (loop->latch) > + && single_pred_edge (loop->latch)->src == bb > + && empty_block_p (loop->latch)) > + return e; > + > + /* Splitting is cheap for idx increase header. */ > + if (bb == loop->header) > + { > + if (get_virtual_phi (loop->header)) > + continue; > + > + /* In loop header: i++ or ++i. */ > + gimple_stmt_iterator gsi = gsi_start_bb (bb); > + if (gsi_end_p (gsi)) > + return e; > + > + gimple *s1 = gsi_stmt (gsi); > + if (!(is_gimple_assign (s1) > + && (idx == gimple_assign_lhs (s1) > + || idx == gimple_assign_rhs1 (s1)))) > + continue; > + > + gsi_next (&gsi); > + if (!gsi_end_p (gsi) && gsi_stmt (gsi) == cond) > + return e; > + } I wonder if these "cheapness" heuristics should simply fold into the cost of the extra duplication of the header/tail in the overall stmt limit? > + } > + > + return NULL; > +} > + > +/* Split the LOOP with NE_EXPR into two loops with GT_EXPR and LT_EXPR. 
*/ > + > +static bool > +split_ne_loop (struct loop *loop, edge cond_e) > +{ > + initialize_original_copy_tables (); > + > + struct loop *loop2 = loop_version (loop, boolean_true_node, NULL, > + profile_probability::always (), > + profile_probability::never (), > + profile_probability::always (), > + profile_probability::always (), true); > + > + gcc_assert (loop2); > + update_ssa (TODO_update_ssa); > + > + free_original_copy_tables (); > + > + /* Change if (i != n) to LOOP1:if (i > n) and LOOP2:if (i < n) */ > + bool inv = false; > + edge dup_cond = get_ne_cond_branch (loop2, &inv); I don't think you should rely in pattern-matching to detect the same condition in the versioned loop - instead you can use the copy tables, do 2nd_loop_exit_block = get_bb_copy (cond_e->src); to get to the block with the COND_EXPR (before free_original_copy_tables obviously). > + enum tree_code up_code = inv ? LT_EXPR : GT_EXPR; > + enum tree_code down_code = inv ? GT_EXPR : LT_EXPR; > + > + gcond *gc = as_a<gcond *> (last_stmt (cond_e->src)); > + gimple_cond_set_code (gc, up_code); > + > + gcond *dup_gc = as_a<gcond *> (last_stmt (dup_cond->src)); > + gimple_cond_set_code (dup_gc, down_code); > + > + /* Link the exit cond edge to new loop. 
*/ > + gcond *break_cond = as_a<gcond *> (gimple_copy (gc)); > + edge pred_e = single_pred_edge (loop->latch); > + gcc_assert (pred_e); > + bool simple_loop = pred_e->src == cond_e->src && empty_block_p > (loop->latch); > + if (simple_loop) > + gimple_cond_set_code (break_cond, down_code); > + else > + gimple_cond_make_true (break_cond); > + > + basic_block break_bb = split_edge (cond_e); > + gimple_stmt_iterator gsi = gsi_last_bb (break_bb); > + gsi_insert_after (&gsi, break_cond, GSI_NEW_STMT); > + > + edge to_exit = single_succ_edge (break_bb); > + edge to_new_loop = make_edge (break_bb, loop_preheader_edge (loop2)->src, > 0); > + to_new_loop->flags |= EDGE_TRUE_VALUE; > + to_exit->flags |= EDGE_FALSE_VALUE; > + to_exit->flags &= ~EDGE_FALLTHRU; > + to_exit->probability = cond_e->probability; > + to_new_loop->probability = to_exit->probability.invert (); > + > + update_ssa (TODO_update_ssa); > + > + connect_loop_phis (loop, loop2, to_new_loop, !simple_loop); > + > + rewrite_into_loop_closed_ssa_1 (NULL, 0, SSA_OP_USE, loop); > + if (dump_file && (dump_flags & TDF_DETAILS)) > + fprintf (dump_file, ";; Loop split.\n"); Maybe ";; Loop split on != condition.\n"? > + > + return true; > +} > + > +/* Checks if LOOP contains a suitable NE_EXPR conditional block to split. > +L_H: > + if (i!=N) > + S; > + i++; > + goto L_H; > + > +The "i!=N" is like "i>N || i<N", then it could be transformed to: > + > +L_H: > + if (i>N) > + S; > + i++; > + goto L_H; > +L1_H: > + if (i<N) > + S; > + i++; > + goto L1_H; > + > +The loop with "i<N" is in favor of both GIMPLE and RTL passes. 
*/ > > +static bool > +split_loop_on_ne_cond (class loop *loop) > +{ > + if (!can_duplicate_loop_p (loop)) > + return false; > + > + int num = 0; > + basic_block *bbs = get_loop_body (loop); To avoid repeated DFS walks of the loop body do the can_duplicate_loop_p check here as if (!can_copy_bbs_p (bbs, loop->num_nodes)) { free (bbs); return false; } (see split_loop) > + for (unsigned i = 0; i < loop->num_nodes; i++) > + num += estimate_num_insns_seq (bb_seq (bbs[i]), &eni_size_weights); > + free (bbs); So with this and the suggestion above it is maybe possible to re-use compute_added_num_insns? That already seems to handle splitting at arbitrary branches (but maybe not loops with multiple exits?). The code using this computation uses param_max_peeled_insns - is that limit not sufficient for your case (to avoid another param?)? Its default is 100, a bit higher than yours (64). I'd really like to see some numbers on how much this triggers since the splitting itself is O (number-of-split-loops * function-size) complexity and thus worse than quadratic as we do update_ssa for each split loop. You can experience this by simply concatenating one of your testcase loops N times in a function ... > + if (num > param_max_insn_ne_cond_split) > + return false; > + > + edge branch_edge = get_ne_cond_branch (loop, NULL); > + if (branch_edge && split_ne_loop (loop, branch_edge)) > + return true; > + > + return false; > +} > + > /* Main entry point. Perform loop splitting on all suitable loops. */ > > static unsigned int > @@ -1628,7 +1838,8 @@ tree_ssa_split_loops (void) > if (optimize_loop_for_size_p (loop)) > continue; > > - if (split_loop (loop) || split_loop_on_cond (loop)) > + if (split_loop (loop) || split_loop_on_cond (loop) > + || split_loop_on_ne_cond (loop)) > { > /* Mark our containing loop as having had some split inner loops. */ > loop_outer (loop)->aux = loop; >