Hi This patch is trying to fix PR88398 for AArch64. As discussed in the PR, loop unrolling is probably what we can do here. As an easy fix, the existing unroll_stupid is unrolling the given example better than the unroll_runtime_iterations since the the loop contains a break inside it.
So all I have done here is: 1) Add a target hook so that this is AArch64 specific. 2) We are not unrolling the loops that decide_unroll_runtime_iterations would reject. 3) Out of the ones that decide_unroll_runtime_iterations would accept, check if the loop has more than 1 exit (this is done in the new target hook) and if it does, try to unroll using unroll_stupid. Regression tested on AArch64 and added the test from the PR. This gives an overall code size reduction of 2.35% and performance gain of 0.498% on Spec2017 Intrate. Is this ok for trunk? Thanks Sudi gcc/ChangeLog: 2019-xx-xx Sudakshina Das <sudi....@arm.com> PR88398 * cfgloop.h: Include target.h. (lpt_dec): Move to... * target.h (lpt_dec): ... Here. * target.def: Define TARGET_LOOP_DECISION_ADJUST. * loop-unroll.c (decide_unroll_runtime_iterations): Use new target hook. (decide_unroll_stupid): Likewise. * config/aarch64/aarch64.c (aarch64_loop_decision_adjust): New function. (TARGET_LOOP_DECISION_ADJUST): Define for AArch64. * doc/tm.texi: Regenerated. * doc/tm.texi.in: Document TARGET_LOOP_DECISION_ADJUST. gcc/testsuite/ChangeLog: 2019-xx-xx Sudakshina Das <sudi....@arm.com> PR88398 * gcc.target/aarch64/pr88398.c: New test.
diff --git a/gcc/cfgloop.h b/gcc/cfgloop.h index 0b0154ffd7bf031a005de993b101d9db6dd98c43..985c74e3b60728fc8c9d34b69634488cae3451cb 100644 --- a/gcc/cfgloop.h +++ b/gcc/cfgloop.h @@ -21,15 +21,7 @@ along with GCC; see the file COPYING3. If not see #define GCC_CFGLOOP_H #include "cfgloopmanip.h" - -/* Structure to hold decision about unrolling/peeling. */ -enum lpt_dec -{ - LPT_NONE, - LPT_UNROLL_CONSTANT, - LPT_UNROLL_RUNTIME, - LPT_UNROLL_STUPID -}; +#include "target.h" struct GTY (()) lpt_decision { enum lpt_dec decision; diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c index 599d07a729e7438080f8b5240ee95037a49fb983..f31ac41d66257c01ead8d5f5b9b22379ecb5d276 100644 --- a/gcc/config/aarch64/aarch64.c +++ b/gcc/config/aarch64/aarch64.c @@ -21093,6 +21093,39 @@ aarch64_sched_can_speculate_insn (rtx_insn *insn) } } +/* Implement TARGET_LOOP_DECISION_ADJUST. CONSIDER is the loop decision + currently being checked for loop LOOP. This returns a decision which could + either be LPT_UNROLL_STUPID or the current value in LOOP. */ +static enum lpt_dec +aarch64_loop_decision_adjust (enum lpt_dec consider, class loop *loop) +{ + switch (consider) + { + case LPT_UNROLL_CONSTANT: + return loop->lpt_decision.decision; + + case LPT_UNROLL_RUNTIME: + /* Fall through. */ + case LPT_UNROLL_STUPID: + { + vec<edge> edges = get_loop_exit_edges (loop); + if (edges.length () > 1) + { + if (dump_file) + fprintf (dump_file, ";; Need change in loop decision\n"); + consider = LPT_UNROLL_STUPID; + return consider; + } + return loop->lpt_decision.decision; + } + + case LPT_NONE: + /* Fall through. */ + default: + gcc_unreachable (); + } +} + /* Implement TARGET_COMPUTE_PRESSURE_CLASSES. */ static int @@ -21839,6 +21872,9 @@ aarch64_libgcc_floating_mode_supported_p #undef TARGET_CAN_USE_DOLOOP_P #define TARGET_CAN_USE_DOLOOP_P can_use_doloop_if_innermost +#undef TARGET_LOOP_DECISION_ADJUST +#define TARGET_LOOP_DECISION_ADJUST aarch64_loop_decision_adjust + #undef TARGET_SCHED_ADJUST_PRIORITY #define TARGET_SCHED_ADJUST_PRIORITY aarch64_sched_adjust_priority diff --git a/gcc/doc/tm.texi b/gcc/doc/tm.texi index cd9aed9874f4e6b2b0e2f8956ed6155975e643a8..61bd00e84c8a2a8865e95ba579c3b94790ab1331 100644 --- a/gcc/doc/tm.texi +++ b/gcc/doc/tm.texi @@ -11857,6 +11857,15 @@ is required only when the target has special constraints like maximum number of memory accesses. @end deftypefn +@deftypefn {Target Hook} {enum lpt_dec} TARGET_LOOP_DECISION_ADJUST (enum lpt_dec @var{consider}, class loop *@var{loop}) +This target hook returns either a new value for the loop unrolling +decision or the existing value in @var{loop}. The parameter @var{consider} +is the loop decision currently being tested. The parameter @var{loop} is a +pointer to the loop, which is going to be checked for unrolling. This target +hook is required only when the target wants to override the unrolling +decisions. +@end deftypefn + @defmac POWI_MAX_MULTS If defined, this macro is interpreted as a signed integer C expression that specifies the maximum number of floating point multiplications diff --git a/gcc/doc/tm.texi.in b/gcc/doc/tm.texi.in index 2739e9ceec5ad7253ff9135da8dbe3bf6010e8d7..7a7f917fb45a6cc22f373ff16f8b78aa3e35f210 100644 --- a/gcc/doc/tm.texi.in +++ b/gcc/doc/tm.texi.in @@ -8026,6 +8026,8 @@ build_type_attribute_variant (@var{mdecl}, @hook TARGET_LOOP_UNROLL_ADJUST +@hook TARGET_LOOP_DECISION_ADJUST + @defmac POWI_MAX_MULTS If defined, this macro is interpreted as a signed integer C expression that specifies the maximum number of floating point multiplications diff --git a/gcc/loop-unroll.c b/gcc/loop-unroll.c index 63fccd23fae38f8918a7d94411aaa43c72830dd3..d8f09d445764f74b8142ff3996cd1db229464202 100644 --- a/gcc/loop-unroll.c +++ b/gcc/loop-unroll.c @@ -672,6 +672,7 @@ decide_unroll_runtime_iterations (class loop *loop, int flags) unsigned nunroll, nunroll_by_av, i; class niter_desc *desc; widest_int iterations; + enum lpt_dec decision = LPT_UNROLL_RUNTIME; /* If we were not asked to unroll this loop, just return back silently. */ if (!(flags & UAP_UNROLL) && !loop->unroll) @@ -735,6 +736,16 @@ decide_unroll_runtime_iterations (class loop *loop, int flags) return; } + if (targetm.loop_decision_adjust) + decision = targetm.loop_decision_adjust (decision, loop); + + if (decision != loop->lpt_decision.decision) + { + if (dump_file) + fprintf (dump_file, ";; Not unrolling loop, there maybe a better decision\n"); + return; + } + /* Success; now force nunroll to be power of 2, as code-gen requires it, we are unable to cope with overflows in computation of number of iterations. */ @@ -1157,9 +1168,16 @@ decide_unroll_stupid (class loop *loop, int flags) unsigned nunroll, nunroll_by_av, i; class niter_desc *desc; widest_int iterations; + enum lpt_dec decision = LPT_UNROLL_STUPID; - /* If we were not asked to unroll this loop, just return back silently. */ - if (!(flags & UAP_UNROLL_ALL) && !loop->unroll) + if (targetm.loop_decision_adjust) + decision = targetm.loop_decision_adjust (decision, loop); + + /* If we were not asked to unroll this loop and the target also does not + tell us otherwise, just return back silently. */ + if (!(flags & UAP_UNROLL_ALL) + && !loop->unroll + && decision == loop->lpt_decision.decision) return; if (dump_enabled_p ()) @@ -1189,26 +1207,41 @@ decide_unroll_stupid (class loop *loop, int flags) return; } - /* Check for simple loops. */ desc = get_simple_loop_desc (loop); - /* Check simpleness. */ - if (desc->simple_p && !desc->assumptions) + /* Skip/add a few checks if the target changed the decision. */ + if (decision == loop->lpt_decision.decision) { - if (dump_file) - fprintf (dump_file, ";; Loop is simple\n"); - return; - } + /* Check simpleness. */ + if (desc->simple_p && !desc->assumptions) + { + if (dump_file) + fprintf (dump_file, ";; Loop is simple\n"); + return; + } - /* Do not unroll loops with branches inside -- it increases number - of mispredicts. - TODO: this heuristic needs tunning; call inside the loop body - is also relatively good reason to not unroll. */ - if (num_loop_branches (loop) > 1) + /* Do not unroll loops with branches inside -- it increases number + of mispredicts. + TODO: this heuristic needs tunning; call inside the loop body + is also relatively good reason to not unroll. */ + if (num_loop_branches (loop) > 1) + { + if (dump_file) + fprintf (dump_file, ";; Not unrolling, contains branches\n"); + return; + } + } + else { - if (dump_file) - fprintf (dump_file, ";; Not unrolling, contains branches\n"); - return; + /* For target effected loops, adding the simpleness check from + decide_unroll_runtime_iterations since we do not want to add more + unrolling than before. */ + if (!desc->simple_p && desc->assumptions) + { + if (dump_file) + fprintf (dump_file, ";; Not unrolling, loop is not simple\n"); + return; + } } /* Check whether the loop rolls. */ diff --git a/gcc/target.def b/gcc/target.def index 8e83c2c7a7136511c07a5bc9e18876c91a38b955..d869495f9c8a9b9187a802298e28aeb334a94cfb 100644 --- a/gcc/target.def +++ b/gcc/target.def @@ -2680,6 +2680,18 @@ number of memory accesses.", unsigned, (unsigned nunroll, class loop *loop), NULL) +/* Return a new value for loop unroll decision. */ +DEFHOOK +(loop_decision_adjust, + "This target hook returns either a new value for the loop unrolling\n\ +decision or the existing value in @var{loop}. The parameter @var{consider}\n\ +is the loop decision currently being tested. The parameter @var{loop} is a\n\ +pointer to the loop, which is going to be checked for unrolling. This target\n\ +hook is required only when the target wants to override the unrolling\n\ +decisions.", + enum lpt_dec, (enum lpt_dec consider, class loop *loop), + NULL) + /* True if X is a legitimate MODE-mode immediate operand. */ DEFHOOK (legitimate_constant_p, diff --git a/gcc/target.h b/gcc/target.h index 843c3d7887f8b49edfe9e45abf9de760176896e4..f67084a449a145f3c8016df8b9230ae835eef363 100644 --- a/gcc/target.h +++ b/gcc/target.h @@ -140,6 +140,16 @@ struct ddg; /* This is defined in cfgloop.h . */ class loop; +/* Structure to hold decision about unrolling/peeling. This is used in + targetm.loop_unroll_adjust and in class loop. */ +enum lpt_dec +{ + LPT_NONE, + LPT_UNROLL_CONSTANT, + LPT_UNROLL_RUNTIME, + LPT_UNROLL_STUPID +}; + /* This is defined in ifcvt.h. */ struct noce_if_info; diff --git a/gcc/testsuite/gcc.target/aarch64/pr88398.c b/gcc/testsuite/gcc.target/aarch64/pr88398.c new file mode 100644 index 0000000000000000000000000000000000000000..ad1929a4e81df92972418ab5c7ed2a2e6db4f157 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/pr88398.c @@ -0,0 +1,23 @@ +/* { dg-do compile } */ +/* { dg-options "-Ofast -funroll-loops -fdump-rtl-loop2_unroll-all" } */ + +typedef unsigned char __uint8_t; +typedef unsigned int __uint32_t; +typedef unsigned long long __uint64_t; + +int +foo (const __uint64_t len_limit, const __uint8_t *cur, + __uint32_t delta, int len) { + + const __uint8_t *pb = cur - delta; + + while (++len != len_limit) { + if (pb[len] != cur[len]) + break; + } + + return len; +} + +/* { dg-final { scan-rtl-dump "considering unrolling loop stupidly" "loop2_unroll" } } */ +/* { dg-final { scan-rtl-dump "optimized: loop unrolled 7 times" "loop2_unroll" } } */