On Tue, 30 Oct 2012, Jan Hubicka wrote: > Hi, > for past week or two I was playing with ways to throttle down the complette > unrolling heuristics. I made complette unroller to use the > tree-ssa-loop-niter > upper bound and unroll even in non-trivial cases and this has turned out to > increase number of complettely unrolled loops by great amount, so one can > see it as considerable code size growth at -O3 SPEC build. > > http://gcc.opensuse.org/SPEC/CFP/sb-vangelis-head-64/Total-size_big.png > it is the largest jump on right hand side in both peak and base runs. > There are also performance imrovements, most impotantly 11% on applu. > > The intuition is that complette unrolling is most profitable when IV tests > are eliminated and single basic block is created. When condtionals stay > in the code it is not that good idea and also functions containing calls > are less interesting for unrolling since the calls are slow and optimization > oppurtunities are not so great. > > This patch reduces unrolling on loops having many branches or calls on the > hot patch. I found that for applu speedup the number of branches needs to be > pretty high - about 32. > > The patch saves about half of the growth introduced (but on different > benchmarks) > and I think I can move all peeling to trees and reduce peeling limits a bit, > too. > > Does this sound sane? Any ideas?
Yes, this sounds ok (beware of unrelated PARAM_MAX_ONCE_PEELED_INSNS remove in the patch below). Richard. > Honza > > Index: tree-ssa-loop-ivcanon.c > =================================================================== > --- tree-ssa-loop-ivcanon.c (revision 192892) > +++ tree-ssa-loop-ivcanon.c (working copy) > @@ -140,6 +140,20 @@ struct loop_size > instructions after exit are not executed. */ > int last_iteration; > int last_iteration_eliminated_by_peeling; > + > + /* If some IV computation will become constant. */ > + bool constant_iv; > + > + /* Number of call stmts that are not a builtin and are pure or const > + present on the hot path. */ > + int num_pure_calls_on_hot_path; > + /* Number of call stmts that are not a builtin and are not pure nor const > + present on the hot path. */ > + int num_non_pure_calls_on_hot_path; > + /* Number of statements other than calls in the loop. */ > + int non_call_stmts_on_hot_path; > + /* Number of branches seen on the hot path. */ > + int num_branches_on_hot_path; > }; > > /* Return true if OP in STMT will be constant after peeling LOOP. */ > @@ -188,7 +202,11 @@ constant_after_peeling (tree op, gimple > return true; > } > > -/* Computes an estimated number of insns in LOOP, weighted by WEIGHTS. > +/* Computes an estimated number of insns in LOOP. > + EXIT (if non-NULL) is an exite edge that will be eliminated in all but > last > + iteration of the loop. > + EDGE_TO_CANCEL (if non-NULL) is an non-exit edge eliminated in the last > iteration > + of loop. > Return results in SIZE, estimate benefits for complete unrolling exiting > by EXIT. */ > > static void > @@ -198,11 +216,17 @@ tree_estimate_loop_size (struct loop *lo > gimple_stmt_iterator gsi; > unsigned int i; > bool after_exit; > + VEC (basic_block, heap) *path = get_loop_hot_path (loop); > > size->overall = 0; > size->eliminated_by_peeling = 0; > size->last_iteration = 0; > size->last_iteration_eliminated_by_peeling = 0; > + size->num_pure_calls_on_hot_path = 0; > + size->num_non_pure_calls_on_hot_path = 0; > + size->non_call_stmts_on_hot_path = 0; > + size->num_branches_on_hot_path = 0; > + size->constant_iv = 0; > > if (dump_file && (dump_flags & TDF_DETAILS)) > fprintf (dump_file, "Estimating sizes for loop %i\n", loop->num); > @@ -221,6 +245,8 @@ tree_estimate_loop_size (struct loop *lo > gimple stmt = gsi_stmt (gsi); > int num = estimate_num_insns (stmt, &eni_size_weights); > bool likely_eliminated = false; > + bool likely_eliminated_last = false; > + bool likely_eliminated_peeled = false; > > if (dump_file && (dump_flags & TDF_DETAILS)) > { > @@ -231,11 +257,21 @@ tree_estimate_loop_size (struct loop *lo > /* Look for reasons why we might optimize this stmt away. */ > > /* Exit conditional. */ > - if (exit && body[i] == exit->src && stmt == last_stmt (exit->src)) > + if (exit && body[i] == exit->src > + && stmt == last_stmt (exit->src)) > { > if (dump_file && (dump_flags & TDF_DETAILS)) > - fprintf (dump_file, " Exit condition will be eliminated.\n"); > - likely_eliminated = true; > + fprintf (dump_file, " Exit condition will be eliminated " > + "in peeled copies.\n"); > + likely_eliminated_peeled = true; > + } > + else if (edge_to_cancel && body[i] == edge_to_cancel->src > + && stmt == last_stmt (edge_to_cancel->src)) > + { > + if (dump_file && (dump_flags & TDF_DETAILS)) > + fprintf (dump_file, " Exit condition will be eliminated " > + "in last copy.\n"); > + likely_eliminated_last = true; > } > /* Sets of IV variables */ > else if (gimple_code (stmt) == GIMPLE_ASSIGN > @@ -249,19 +285,22 @@ tree_estimate_loop_size (struct loop *lo > /* Assignments of IV variables. */ > else if (gimple_code (stmt) == GIMPLE_ASSIGN > && TREE_CODE (gimple_assign_lhs (stmt)) == SSA_NAME > - && constant_after_peeling (gimple_assign_rhs1 (stmt), > stmt,loop) > + && constant_after_peeling (gimple_assign_rhs1 (stmt), stmt, > loop) > && (gimple_assign_rhs_class (stmt) != GIMPLE_BINARY_RHS > || constant_after_peeling (gimple_assign_rhs2 (stmt), > stmt, loop))) > { > + size->constant_iv = true; > if (dump_file && (dump_flags & TDF_DETAILS)) > fprintf (dump_file, " Constant expression will be folded > away.\n"); > likely_eliminated = true; > } > /* Conditionals. */ > - else if (gimple_code (stmt) == GIMPLE_COND > - && constant_after_peeling (gimple_cond_lhs (stmt), stmt, > loop) > - && constant_after_peeling (gimple_cond_rhs (stmt), stmt, > loop)) > + else if ((gimple_code (stmt) == GIMPLE_COND > + && constant_after_peeling (gimple_cond_lhs (stmt), stmt, > loop) > + && constant_after_peeling (gimple_cond_rhs (stmt), stmt, > loop)) > + || (gimple_code (stmt) == GIMPLE_SWITCH > + && constant_after_peeling (gimple_switch_index (stmt), > stmt, loop))) > { > if (dump_file && (dump_flags & TDF_DETAILS)) > fprintf (dump_file, " Constant conditional.\n"); > @@ -269,16 +308,48 @@ tree_estimate_loop_size (struct loop *lo > } > > size->overall += num; > - if (likely_eliminated) > + if (likely_eliminated || likely_eliminated_peeled) > size->eliminated_by_peeling += num; > if (!after_exit) > { > size->last_iteration += num; > - if (likely_eliminated) > + if (likely_eliminated || likely_eliminated_last) > size->last_iteration_eliminated_by_peeling += num; > } > } > } > + while (VEC_length (basic_block, path)) > + { > + basic_block bb = VEC_pop (basic_block, path); > + for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi); gsi_next (&gsi)) > + { > + gimple stmt = gsi_stmt (gsi); > + if (gimple_code (stmt) == GIMPLE_CALL) > + { > + int flags = gimple_call_flags (stmt); > + tree decl = gimple_call_fndecl (stmt); > + > + if (decl && DECL_IS_BUILTIN (decl)) > + ; > + else if (flags & (ECF_PURE | ECF_CONST)) > + size->num_pure_calls_on_hot_path++; > + else > + size->num_non_pure_calls_on_hot_path++; > + size->num_branches_on_hot_path ++; > + } > + else if (gimple_code (stmt) != GIMPLE_CALL > + && gimple_code (stmt) != GIMPLE_DEBUG) > + size->non_call_stmts_on_hot_path++; > + if (((gimple_code (stmt) == GIMPLE_COND > + && (!constant_after_peeling (gimple_cond_lhs (stmt), stmt, loop) > + || constant_after_peeling (gimple_cond_rhs (stmt), stmt, > loop))) > + || (gimple_code (stmt) == GIMPLE_SWITCH > + && !constant_after_peeling (gimple_switch_index (stmt), > stmt, loop))) > + && (!exit || bb != exit->src)) > + size->num_branches_on_hot_path++; > + } > + } > + VEC_free (basic_block, heap, path); > if (dump_file && (dump_flags & TDF_DETAILS)) > fprintf (dump_file, "size: %i-%i, last_iteration: %i-%i\n", > size->overall, > size->eliminated_by_peeling, size->last_iteration, > @@ -486,34 +557,85 @@ try_unroll_loop_completely (struct loop > (int) unr_insns); > } > > + /* If the code is going to shrink, we don't need to be extra cautious > + on guessing if the unrolling is going to be profitable. */ > + if (unr_insns > + /* If there is IV variable that will become constant, we save > + one instruction in the loop prologue we do not account > + otherwise. */ > + <= ninsns + (size.constant_iv != false)) > + ; > /* We unroll only inner loops, because we do not consider it profitable > otheriwse. We still can cancel loopback edge of not rolling loop; > this is always a good idea. */ > - if (loop->inner && unr_insns > ninsns) > + else if (ul == UL_NO_GROWTH) > + { > + if (dump_file && (dump_flags & TDF_DETAILS)) > + fprintf (dump_file, "Not unrolling loop %d: size would grow.\n", > + loop->num); > + return false; > + } > + /* Outer loops tend to be less interesting candidates for complette > + unrolling unless we can do a lot of propagation into the inner loop > + body. For now we disable outer loop unrolling when the code would > + grow. */ > + else if (loop->inner) > { > if (dump_file && (dump_flags & TDF_DETAILS)) > - fprintf (dump_file, "Not unrolling loop %d:" > + fprintf (dump_file, "Not unrolling loop %d: " > "it is not innermost and code would grow.\n", > loop->num); > return false; > } > - > - if (unr_insns > ninsns > - && (unr_insns > - > (unsigned) PARAM_VALUE (PARAM_MAX_COMPLETELY_PEELED_INSNS))) > + /* If there is call on a hot path through the loop, then > + there is most probably not much to optimize. */ > + else if (size.num_non_pure_calls_on_hot_path) > { > if (dump_file && (dump_flags & TDF_DETAILS)) > - fprintf (dump_file, "Not unrolling loop %d " > - "(--param max-completely-peeled-insns limit reached).\n", > + fprintf (dump_file, "Not unrolling loop %d: " > + "contains call and code would grow.\n", > loop->num); > return false; > } > - > - if (ul == UL_NO_GROWTH > - && unr_insns > ninsns) > + /* If there is pure/const call in the function, then we > + can still optimize the unrolled loop body if it contains > + some other interesting code than the calls and code > + storing or cumulating the return value. */ > + else if (size.num_pure_calls_on_hot_path > + /* One IV increment, one test, one ivtmp store > + and one usefull stmt. That is about minimal loop > + doing pure call. */ > + && (size.non_call_stmts_on_hot_path > + <= 3 + size.num_pure_calls_on_hot_path)) > { > if (dump_file && (dump_flags & TDF_DETAILS)) > - fprintf (dump_file, "Not unrolling loop %d: size would grow.\n", > + fprintf (dump_file, "Not unrolling loop %d: " > + "contains just pure calls and code would grow.\n", > + loop->num); > + return false; > + } > + /* Complette unrolling is major win when control flow is removed and > + one big basic block is created. If the loop contains control flow > + the optimization may still be a win because of eliminating the loop > + overhead but it also may blow the branch predictor tables. > + Limit number of branches on the hot path through the peeled > + sequence. */ > + else if (size.num_branches_on_hot_path * (int)n_unroll > + > PARAM_VALUE (PARAM_MAX_PEEL_BRANCHES)) > + { > + if (dump_file && (dump_flags & TDF_DETAILS)) > + fprintf (dump_file, "Not unrolling loop %d: " > + " number of branches on hot path in the unrolled sequence" > + " reach --param max-peel-branches limit.\n", > + loop->num); > + return false; > + } > + else if (unr_insns > + > (unsigned) PARAM_VALUE (PARAM_MAX_COMPLETELY_PEELED_INSNS)) > + { > + if (dump_file && (dump_flags & TDF_DETAILS)) > + fprintf (dump_file, "Not unrolling loop %d: " > + "(--param max-completely-peeled-insns limit reached).\n", > loop->num); > return false; > } > @@ -531,6 +653,8 @@ try_unroll_loop_completely (struct loop > { > free_original_copy_tables (); > free (wont_exit); > + if (dump_file && (dump_flags & TDF_DETAILS)) > + fprintf (dump_file, "Failed to duplicate the loop\n"); > return false; > } > > @@ -826,7 +950,7 @@ tree_unroll_loops_completely (bool may_i > { > struct loop *loop_father = loop_outer (loop); > > - if (may_increase_size && optimize_loop_for_speed_p (loop) > + if (may_increase_size && optimize_loop_nest_for_speed_p (loop) > /* Unroll outermost loops only if asked to do so or they do > not cause code growth. */ > && (unroll_outer || loop_outer (loop_father))) > Index: testsuite/gcc.dg/tree-ssa/loop-1.c > =================================================================== > --- testsuite/gcc.dg/tree-ssa/loop-1.c (revision 192892) > +++ testsuite/gcc.dg/tree-ssa/loop-1.c (working copy) > @@ -17,13 +17,16 @@ > to the load from the GOT this also contains the name of the funtion so for > each call the function name would appear twice. */ > /* { dg-options "-O1 -ftree-loop-ivcanon -funroll-loops > -fdump-tree-ivcanon-details -fdump-tree-cunroll-details -fdump-tree-optimized > -mno-relax-pic-calls" { target mips*-*-* } } */ > - > -void xxx(void) > +__attribute__ ((pure)) > +int foo (int x); > +int xxx(void) > { > int x = 45; > + int sum; > > while (x >>= 1) > - foo (); > + sum += foo (x) * 2; > + return sum; > } > > /* We should be able to find out that the loop iterates four times and > unroll it completely. */ > Index: testsuite/gcc.dg/tree-ssa/cunroll-7.c > =================================================================== > --- testsuite/gcc.dg/tree-ssa/cunroll-7.c (revision 0) > +++ testsuite/gcc.dg/tree-ssa/cunroll-7.c (working copy) > @@ -0,0 +1,13 @@ > +/* { dg-do compile } */ > +/* { dg-options "-O3 -fdump-tree-cunroll-details" } */ > +int t(int a) __attribute__ ((pure)); > +test(void) > +{ > + int i; > + int s=0; > + for (i=0;i<8;i++) > + s+= t(i); > + return s; > +} > +/* { dg-final { scan-tree-dump "Not unrolling.*contains just pure calls and > code would grow." "cunroll"} } */ > +/* { dg-final { cleanup-tree-dump "cunroll" } } */ > Index: testsuite/gcc.dg/tree-ssa/cunroll-6.c > =================================================================== > --- testsuite/gcc.dg/tree-ssa/cunroll-6.c (revision 0) > +++ testsuite/gcc.dg/tree-ssa/cunroll-6.c (working copy) > @@ -0,0 +1,10 @@ > +/* { dg-do compile } */ > +/* { dg-options "-O3 -fdump-tree-cunroll-details" } */ > +test(int c) > +{ > + int i; > + for (i=0;i<8;i++) > + t(); > +} > +/* { dg-final { scan-tree-dump "Not unrolling loop.*contains call and code > would grow" "cunroll"} } */ > +/* { dg-final { cleanup-tree-dump "cunroll" } } */ > Index: testsuite/gcc.dg/tree-ssa/cunroll-8.c > =================================================================== > --- testsuite/gcc.dg/tree-ssa/cunroll-8.c (revision 0) > +++ testsuite/gcc.dg/tree-ssa/cunroll-8.c (working copy) > @@ -0,0 +1,14 @@ > +/* { dg-do compile } */ > +/* { dg-options "-O3 -fdump-tree-cunroll-details" } */ > +int a[8][5]; > +int > +test(int c) > +{ > + int i; > + for (i=0;i<c;i++) > + if (a[i][1]==1 || a[i][1]==2 || a[i][3]==3 || a[i][4]==4 ||a[i][5]==5) > + break; > + return i; > +} > +/* { dg-final { scan-tree-dump "Not unrolling.*number of branches on hot > path" "cunroll"} } */ > +/* { dg-final { cleanup-tree-dump "cunroll" } } */ > Index: testsuite/gcc.dg/tree-ssa/loop-23.c > =================================================================== > --- testsuite/gcc.dg/tree-ssa/loop-23.c (revision 192892) > +++ testsuite/gcc.dg/tree-ssa/loop-23.c (working copy) > @@ -1,24 +1,27 @@ > /* { dg-do compile } */ > /* { dg-options "-O2 -funroll-loops -fdump-tree-cunroll-details" } */ > > -void bla(int); > +__attribute__ ((pure)) > +int bla(int); > > -void foo(void) > +int foo(void) > { > int i; > + int sum; > > /* This loop used to appear to be too large for unrolling. */ > for (i = 0; i < 4; i++) > { > - bla (i); > - bla (2*i); > - bla (3*i); > - bla (4*i); > - bla (5*i); > - bla (6*i); > - bla (7*i); > - bla (8*i); > + sum += bla (i); > + sum += bla (2*i); > + sum += bla (3*i); > + sum += bla (4*i); > + sum += bla (5*i); > + sum += bla (6*i); > + sum += bla (7*i); > + sum += bla (8*i); > } > + return sum; > } > > /* { dg-final { scan-tree-dump-times "Unrolled loop 1 completely" 1 > "cunroll" } } */ > Index: testsuite/gcc.dg/tree-ssa/cunroll-1.c > =================================================================== > --- testsuite/gcc.dg/tree-ssa/cunroll-1.c (revision 192892) > +++ testsuite/gcc.dg/tree-ssa/cunroll-1.c (working copy) > @@ -1,5 +1,5 @@ > /* { dg-do compile } */ > -/* { dg-options "-O3 -fdump-tree-cunroll-details" } */ > +/* { dg-options "-O3 -fdump-tree-cunrolli-details" } */ > int a[2]; > test(int c) > { > @@ -10,4 +10,4 @@ test(int c) > /* Array bounds says the loop will not roll much. */ > /* { dg-final { scan-tree-dump "Unrolled loop 1 completely .duplicated 1 > times.." "cunroll"} } */ > /* { dg-final { scan-tree-dump "Last iteration exit edge was proved true." > "cunroll"} } */ > -/* { dg-final { cleanup-tree-dump "cunroll" } } */ > +/* { dg-final { cleanup-tree-dump "cunrolli" } } */ > Index: testsuite/gcc.dg/unroll_6.c > =================================================================== > --- testsuite/gcc.dg/unroll_6.c (revision 0) > +++ testsuite/gcc.dg/unroll_6.c (working copy) > @@ -0,0 +1,18 @@ > +/* { dg-do compile } */ > +/* { dg-options "-O2 -fdump-rtl-loop2_unroll -fno-peel-loops > -fdisable-tree-cunroll -fdisable-tree-cunrolli -funroll-loops" } */ > + > +int a[2]; > +test(int c) > +{ > + int i; > + for (i=0;i<c;i++) > + { > + a[i]=5; > + if (test2()) > + return; > + } > +} > + > +/* { dg-final { scan-rtl-dump-times "Decided to peel loop completely" 1 > "loop2_unroll" } } */ > +/* { dg-final { scan-rtl-dump-times "Peeled loop completely, 2 times" 1 > "loop2_unroll" } } */ > +/* { dg-final { cleanup-rtl-dump "loop2_unroll" } } */ > Index: testsuite/gcc.dg/tree-prof/unroll-1.c > =================================================================== > --- testsuite/gcc.dg/tree-prof/unroll-1.c (revision 192892) > +++ testsuite/gcc.dg/tree-prof/unroll-1.c (working copy) > @@ -21,4 +21,3 @@ main() > } > /* { dg-final-use { scan-rtl-dump "Considering unrolling loop with constant > number of iterations" "loop2_unroll" } } */ > /* { dg-final-use { cleanup-rtl-dump "Not unrolling loop, doesn't roll" } } > */ > -/* { dg-options "-O3 -fdump-rtl-loop2_unroll -funroll-loops -fno-peel-loops" > } */ > Index: params.def > =================================================================== > --- params.def (revision 192892) > +++ params.def (working copy) > @@ -301,11 +306,11 @@ DEFPARAM(PARAM_MAX_COMPLETELY_PEEL_TIMES > "max-completely-peel-times", > "The maximum number of peelings of a single loop that is peeled > completely", > 16, 0, 0) > -/* The maximum number of insns of a peeled loop that rolls only once. */ > -DEFPARAM(PARAM_MAX_ONCE_PEELED_INSNS, > - "max-once-peeled-insns", > - "The maximum number of insns of a peeled loop that rolls only once", > - 400, 0, 0) > +/* The maximum number of peelings of a single loop that is peeled > completely. */ > +DEFPARAM(PARAM_MAX_PEEL_BRANCHES, > + "max-completely-peel-times", > + "The maximum number of branches on the path through the peeled > sequence", > + 32, 0, 0) > /* The maximum depth of a loop nest we completely peel. */ > DEFPARAM(PARAM_MAX_UNROLL_ITERATIONS, > "max-completely-peel-loop-nest-depth", > > -- Richard Biener <rguent...@suse.de> SUSE / SUSE Labs SUSE LINUX Products GmbH - Nuernberg - AG Nuernberg - HRB 16746 GF: Jeff Hawn, Jennifer Guild, Felix Imend