* Claudiu Zissulescu <claudiu.zissule...@synopsys.com> [2017-07-24 10:42:58 +0200]:
> From: claziss <claz...@synopsys.com> > > 2017-05-22 Claudiu Zissulescu <claz...@synopsys.com> > > * config/arc/arc-c.c (__ARC_LPC_WIDTH__): Add builtin define. > * config/arc/arc.c (ARC_MAX_LOOP_LENGTH): Define. > (arc_conditional_register_usage): Remove ARC600 lp_count > exception. > (arc_file_start): Emit Tag_ARC_CPU_variation. > (arc_can_use_doloop_p): New conditions to use ZOLs. > (hwloop_fail): New function. > (hwloop_optimize): Likewise. > (hwloop_pattern_reg): Likewise. > (arc_doloop_hooks): New struct, to be used with reorg_loops. > (arc_reorg_loops): New function, calls reorg_loops. > (arc_reorg): Call arc_reorg_loops. Remove old ZOL handling. > (arc600_corereg_hazard): Remove ZOL checking, case handled by > hwloop_optimize. > (arc_loop_hazard): Remove function, functionality moved into > hwloop_optimize. > (arc_hazard): Remove arc_loop_hazard call. > (arc_adjust_insn_length): Remove ZOL handling, functionality moved > into hwloop_optimize. > (arc_label_align): Remove ZOL handling. > * config/arc/arc.h (LOOP_ALIGN): Changed to 0. > * config/arc/arc.md (doloop_begin): Remove pattern. > (doloop_begin_i): Likewise. > (doloop_end_i): Likewise. > (doloop_fallback): Likewise. > (doloop_fallback_m): Likewise. > (doloop_end): Reimplement expand. > (arc_lp): New pattern for LP instruction. > (loop_end): New pattern. > (loop_fail): Likewise. > (decrement_and_branch_until_zero): Likewise. > * config/arc/arc.opt (mlpc-width): New option. > * doc/invoke.texi (mlpc-width): Document option. > > testsuite/ > 2017-05-22 Claudiu Zissulescu <claz...@synopsys.com> > > * gcc.target/arc/loop-1.c: Update test. I'm happy with this if the doc is updated inline with Sandra's suggestions. 
Thanks, Andrew > --- > gcc/config/arc/arc-c.c | 2 + > gcc/config/arc/arc.c | 726 > ++++++++++++++++++---------------- > gcc/config/arc/arc.h | 10 +- > gcc/config/arc/arc.md | 419 ++++++-------------- > gcc/config/arc/arc.opt | 25 ++ > gcc/config/arc/predicates.md | 2 + > gcc/doc/invoke.texi | 14 +- > gcc/testsuite/gcc.target/arc/loop-1.c | 49 +-- > 8 files changed, 561 insertions(+), 686 deletions(-) > mode change 100644 => 100755 gcc/testsuite/gcc.target/arc/loop-1.c > > diff --git a/gcc/config/arc/arc-c.c b/gcc/config/arc/arc-c.c > index de877a1..44ff338 100644 > --- a/gcc/config/arc/arc-c.c > +++ b/gcc/config/arc/arc-c.c > @@ -62,6 +62,8 @@ arc_cpu_cpp_builtins (cpp_reader * pfile) > builtin_define_with_int_value ("__ARC_TLS_REGNO__", > arc_tp_regno); > > + builtin_define_with_int_value ("__ARC_LPC_WIDTH__", arc_lpcwidth); > + > builtin_define (TARGET_BIG_ENDIAN > ? "__BIG_ENDIAN__" : "__LITTLE_ENDIAN__"); > if (TARGET_BIG_ENDIAN) > diff --git a/gcc/config/arc/arc.c b/gcc/config/arc/arc.c > index c94b187..0f9b553 100644 > --- a/gcc/config/arc/arc.c > +++ b/gcc/config/arc/arc.c > @@ -64,11 +64,15 @@ along with GCC; see the file COPYING3. If not see > #include "rtl-iter.h" > #include "alias.h" > #include "opts.h" > +#include "hw-doloop.h" > > /* Which cpu we're compiling for (ARC600, ARC601, ARC700). */ > static char arc_cpu_name[10] = ""; > static const char *arc_cpu_string = arc_cpu_name; > > +/* Maximum size of a loop. */ > +#define ARC_MAX_LOOP_LENGTH 4095 > + > /* ??? Loads can handle any constant, stores can only handle small ones. */ > /* OTOH, LIMMs cost extra, so their usefulness is limited. */ > #define RTX_OK_FOR_OFFSET_P(MODE, X) \ > @@ -1708,18 +1712,7 @@ arc_conditional_register_usage (void) > i <= ARC_LAST_SIMD_DMA_CONFIG_REG; i++) > reg_alloc_order [i] = i; > } > - /* For ARC600, lp_count may not be read in an instruction > - following immediately after another one setting it to a new value. 
> - There was some discussion on how to enforce scheduling constraints for > - processors with missing interlocks on the gcc mailing list: > - http://gcc.gnu.org/ml/gcc/2008-05/msg00021.html . > - However, we can't actually use this approach, because for ARC the > - delay slot scheduling pass is active, which runs after > - machine_dependent_reorg. */ > - if (TARGET_ARC600) > - CLEAR_HARD_REG_BIT (reg_class_contents[SIBCALL_REGS], LP_COUNT); > - else if (!TARGET_LP_WR_INTERLOCK) > - fixed_regs[LP_COUNT] = 1; > + > for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++) > if (!call_used_regs[regno]) > CLEAR_HARD_REG_BIT (reg_class_contents[SIBCALL_REGS], regno); > @@ -6998,28 +6991,33 @@ arc_pass_by_reference (cumulative_args_t ca_v > ATTRIBUTE_UNUSED, > /* Implement TARGET_CAN_USE_DOLOOP_P. */ > > static bool > -arc_can_use_doloop_p (const widest_int &iterations, const widest_int &, > +arc_can_use_doloop_p (const widest_int &, > + const widest_int &iterations_max, > unsigned int loop_depth, bool entered_at_top) > { > - if (loop_depth > 1) > + /* Considering limitations in the hardware, only use doloop > + for innermost loops which must be entered from the top. */ > + if (loop_depth > 1 || !entered_at_top) > return false; > - /* Setting up the loop with two sr instructions costs 6 cycles. */ > - if (TARGET_ARC700 > - && !entered_at_top > - && wi::gtu_p (iterations, 0) > - && wi::leu_p (iterations, flag_pic ? 6 : 3)) > + > + /* Check for lp_count width boundary. */ > + if (arc_lpcwidth != 32 > + && (wi::gtu_p (iterations_max, ((1 << arc_lpcwidth) - 1)) > + || wi::eq_p (iterations_max, 0))) > return false; > return true; > } > > -/* NULL if INSN insn is valid within a low-overhead loop. > - Otherwise return why doloop cannot be applied. */ > +/* NULL if INSN insn is valid within a low-overhead loop. Otherwise > + return why doloop cannot be applied. 
*/ > > static const char * > arc_invalid_within_doloop (const rtx_insn *insn) > { > if (CALL_P (insn)) > return "Function call in the loop."; > + > + /* FIXME! add here all the ZOL exceptions. */ > return NULL; > } > > @@ -7118,6 +7116,368 @@ workaround_arc_anomaly (void) > } > } > > +/* A callback for the hw-doloop pass. Called when a loop we have discovered > + turns out not to be optimizable; we have to split the loop_end pattern > into > + a subtract and a test. */ > + > +static void > +hwloop_fail (hwloop_info loop) > +{ > + rtx test; > + rtx insn = loop->loop_end; > + > + if (TARGET_V2 > + && (loop->length && (loop->length <= ARC_MAX_LOOP_LENGTH)) > + && REG_P (loop->iter_reg)) > + { > + /* TARGET_V2 has dbnz instructions. */ > + test = gen_dbnz (loop->iter_reg, loop->start_label); > + insn = emit_jump_insn_before (test, loop->loop_end); > + } > + else if (REG_P (loop->iter_reg) && (REGNO (loop->iter_reg) == LP_COUNT)) > + { > + /* We have the lp_count as loop iterator, try to use it. */ > + emit_insn_before (gen_loop_fail (), loop->loop_end); > + test = gen_rtx_NE (VOIDmode, gen_rtx_REG (CC_ZNmode, CC_REG), > + const0_rtx); > + test = gen_rtx_IF_THEN_ELSE (VOIDmode, test, > + gen_rtx_LABEL_REF (Pmode, loop->start_label), > + pc_rtx); > + insn = emit_jump_insn_before (gen_rtx_SET (pc_rtx, test), > + loop->loop_end); > + } > + else > + { > + emit_insn_before (gen_addsi3 (loop->iter_reg, > + loop->iter_reg, > + constm1_rtx), > + loop->loop_end); > + test = gen_rtx_NE (VOIDmode, loop->iter_reg, const0_rtx); > + insn = emit_jump_insn_before (gen_cbranchsi4 (test, > + loop->iter_reg, > + const0_rtx, > + loop->start_label), > + loop->loop_end); > + } > + JUMP_LABEL (insn) = loop->start_label; > + LABEL_NUSES (loop->start_label)++; > + delete_insn (loop->loop_end); > +} > + > +/* Optimize LOOP. 
*/ > + > +static bool > +hwloop_optimize (hwloop_info loop) > +{ > + int i; > + edge entry_edge; > + basic_block entry_bb, bb; > + rtx iter_reg, end_label; > + rtx_insn *insn, *seq, *entry_after, *last_insn; > + unsigned int length; > + bool need_fix = false; > + rtx lp_reg = gen_rtx_REG (SImode, LP_COUNT); > + > + if (loop->depth > 1) > + { > + if (dump_file) > + fprintf (dump_file, ";; loop %d is not innermost\n", > + loop->loop_no); > + return false; > + } > + > + if (!loop->incoming_dest) > + { > + if (dump_file) > + fprintf (dump_file, ";; loop %d has more than one entry\n", > + loop->loop_no); > + return false; > + } > + > + if (loop->incoming_dest != loop->head) > + { > + if (dump_file) > + fprintf (dump_file, ";; loop %d is not entered from head\n", > + loop->loop_no); > + return false; > + } > + > + if (loop->has_call || loop->has_asm) > + { > + if (dump_file) > + fprintf (dump_file, ";; loop %d has invalid insn\n", > + loop->loop_no); > + return false; > + } > + > + /* Scan all the blocks to make sure they don't use iter_reg. */ > + if (loop->iter_reg_used || loop->iter_reg_used_outside) > + { > + if (dump_file) > + fprintf (dump_file, ";; loop %d uses iterator\n", > + loop->loop_no); > + return false; > + } > + > + /* Check if start_label appears before doloop_end. */ > + length = 0; > + for (insn = loop->start_label; > + insn && insn != loop->loop_end; > + insn = NEXT_INSN (insn)) > + length += NONDEBUG_INSN_P (insn) ? 
get_attr_length (insn) : 0; > + > + if (!insn) > + { > + if (dump_file) > + fprintf (dump_file, ";; loop %d start_label not before loop_end\n", > + loop->loop_no); > + return false; > + } > + > + loop->length = length; > + if (loop->length > ARC_MAX_LOOP_LENGTH) > + { > + if (dump_file) > + fprintf (dump_file, ";; loop %d too long\n", loop->loop_no); > + return false; > + } > + else if (!loop->length) > + { > + if (dump_file) > + fprintf (dump_file, ";; loop %d is empty\n", loop->loop_no); > + return false; > + } > + > + /* Check if we use a register or not. */ > + if (!REG_P (loop->iter_reg)) > + { > + if (dump_file) > + fprintf (dump_file, ";; loop %d iterator is MEM\n", > + loop->loop_no); > + return false; > + } > + > + /* Check if loop register is lpcount. */ > + if (REG_P (loop->iter_reg) && (REGNO (loop->iter_reg)) != LP_COUNT) > + { > + if (dump_file) > + fprintf (dump_file, ";; loop %d doesn't use lp_count as loop" > + " iterator\n", > + loop->loop_no); > + /* This loop doesn't use the lp_count, check though if we can > + fix it. */ > + if (TEST_HARD_REG_BIT (loop->regs_set_in_loop, LP_COUNT) > + /* In very unique cases we may have LP_COUNT alive. */ > + || (loop->incoming_src > + && REGNO_REG_SET_P (df_get_live_out (loop->incoming_src), > + LP_COUNT))) > + return false; > + else > + need_fix = true; > + } > + > + /* Check for control like instruction as the last instruction of a > + ZOL. 
*/ > + bb = loop->tail; > + last_insn = PREV_INSN (loop->loop_end); > + > + while (1) > + { > + for (; last_insn != BB_HEAD (bb); > + last_insn = PREV_INSN (last_insn)) > + if (NONDEBUG_INSN_P (last_insn)) > + break; > + > + if (last_insn != BB_HEAD (bb)) > + break; > + > + if (single_pred_p (bb) > + && single_pred_edge (bb)->flags & EDGE_FALLTHRU > + && single_pred (bb) != ENTRY_BLOCK_PTR_FOR_FN (cfun)) > + { > + bb = single_pred (bb); > + last_insn = BB_END (bb); > + continue; > + } > + else > + { > + last_insn = NULL; > + break; > + } > + } > + > + if (!last_insn) > + { > + if (dump_file) > + fprintf (dump_file, ";; loop %d has no last instruction\n", > + loop->loop_no); > + return false; > + } > + > + if ((TARGET_ARC600_FAMILY || TARGET_HS) > + && INSN_P (last_insn) > + && (JUMP_P (last_insn) || CALL_P (last_insn) > + || GET_CODE (PATTERN (last_insn)) == SEQUENCE > + /* At this stage we can have (insn (clobber (mem:BLK > + (reg)))) instructions, ignore them. */ > + || (GET_CODE (PATTERN (last_insn)) != CLOBBER > + && (get_attr_type (last_insn) == TYPE_BRCC > + || get_attr_type (last_insn) == TYPE_BRCC_NO_DELAY_SLOT)))) > + { > + if (loop->length + 2 > ARC_MAX_LOOP_LENGTH) > + { > + if (dump_file) > + fprintf (dump_file, ";; loop %d too long\n", loop->loop_no); > + return false; > + } > + if (dump_file) > + fprintf (dump_file, ";; loop %d has a control like last insn;" > + " add a nop\n", > + loop->loop_no); > + > + last_insn = emit_insn_after (gen_nopv (), last_insn); > + } > + > + if (LABEL_P (last_insn)) > + { > + if (dump_file) > + fprintf (dump_file, ";; loop %d has a label as last insn;" > + " add a nop\n", > + loop->loop_no); > + last_insn = emit_insn_after (gen_nopv (), last_insn); > + } > + loop->last_insn = last_insn; > + > + /* Get the loop iteration register. 
*/ > + iter_reg = loop->iter_reg; > + > + gcc_assert (REG_P (iter_reg)); > + > + entry_edge = NULL; > + > + FOR_EACH_VEC_SAFE_ELT (loop->incoming, i, entry_edge) > + if (entry_edge->flags & EDGE_FALLTHRU) > + break; > + > + if (entry_edge == NULL) > + { > + if (dump_file) > + fprintf (dump_file, ";; loop %d has no fallthru edge jumping" > + " into the loop\n", > + loop->loop_no); > + return false; > + } > + /* The loop is good. */ > + end_label = gen_label_rtx (); > + loop->end_label = end_label; > + > + /* Place the zero_cost_loop_start instruction before the loop. */ > + entry_bb = entry_edge->src; > + > + start_sequence (); > + > + if (need_fix) > + { > + /* The loop uses an R-register, but the lp_count is free, thus > + use lp_count. */ > + emit_insn (gen_movsi (lp_reg, iter_reg)); > + SET_HARD_REG_BIT (loop->regs_set_in_loop, LP_COUNT); > + iter_reg = lp_reg; > + if (dump_file) > + { > + fprintf (dump_file, ";; fix loop %d to use lp_count\n", > + loop->loop_no); > + } > + } > + > + insn = emit_insn (gen_arc_lp (iter_reg, > + loop->start_label, > + loop->end_label)); > + > + seq = get_insns (); > + end_sequence (); > + > + entry_after = BB_END (entry_bb); > + if (!single_succ_p (entry_bb) || vec_safe_length (loop->incoming) > 1 > + || !entry_after) > + { > + basic_block new_bb; > + edge e; > + edge_iterator ei; > + > + emit_insn_before (seq, BB_HEAD (loop->head)); > + seq = emit_label_before (gen_label_rtx (), seq); > + new_bb = create_basic_block (seq, insn, entry_bb); > + FOR_EACH_EDGE (e, ei, loop->incoming) > + { > + if (!(e->flags & EDGE_FALLTHRU)) > + redirect_edge_and_branch_force (e, new_bb); > + else > + redirect_edge_succ (e, new_bb); > + } > + > + make_edge (new_bb, loop->head, 0); > + } > + else > + { > +#if 0 > + while (DEBUG_INSN_P (entry_after) > + || (NOTE_P (entry_after) > + && NOTE_KIND (entry_after) != NOTE_INSN_BASIC_BLOCK > + /* Make sure we don't split a call and its corresponding > + CALL_ARG_LOCATION note. 
> + && NOTE_KIND (entry_after) != NOTE_INSN_CALL_ARG_LOCATION)) > + entry_after = NEXT_INSN (entry_after); > +#endif > + entry_after = next_nonnote_insn_bb (entry_after); > + > + gcc_assert (entry_after); > + emit_insn_before (seq, entry_after); > + } > + > + delete_insn (loop->loop_end); > + /* Insert the loop end label before the last instruction of the > + loop. */ > + emit_label_after (end_label, loop->last_insn); > + > + return true; > +} > + > +/* A callback for the hw-doloop pass. This function examines INSN; if > + it is a loop_end pattern we recognize, return the reg rtx for the > + loop counter. Otherwise, return NULL_RTX. */ > + > +static rtx > +hwloop_pattern_reg (rtx_insn *insn) > +{ > + rtx reg; > + > + if (!JUMP_P (insn) || recog_memoized (insn) != CODE_FOR_loop_end) > + return NULL_RTX; > + > + reg = SET_DEST (XVECEXP (PATTERN (insn), 0, 1)); > + if (!REG_P (reg)) > + return NULL_RTX; > + return reg; > +} > + > +static struct hw_doloop_hooks arc_doloop_hooks = > +{ > + hwloop_pattern_reg, > + hwloop_optimize, > + hwloop_fail > +}; > + > +/* Run from machine_dependent_reorg, this pass looks for doloop_end insns > + and tries to rewrite the RTL of these loops so that proper ARC > + hardware loops are generated. */ > + > +static void > +arc_reorg_loops (void) > +{ > + reorg_loops (true, &arc_doloop_hooks); > +} > + > static int arc_reorg_in_progress = 0; > > /* ARC's machince specific reorg function. */ > @@ -7131,204 +7491,17 @@ arc_reorg (void) > { > rtx_insn *insn; > rtx pat; > rtx pc_target; > long offset; > int changed; > > - workaround_arc_anomaly (); > - > cfun->machine->arc_reorg_started = 1; > arc_reorg_in_progress = 1; > > - /* Link up loop ends with their loop start. 
*/ > - { > - for (insn = get_insns (); insn; insn = NEXT_INSN (insn)) > - if (GET_CODE (insn) == JUMP_INSN > - && recog_memoized (insn) == CODE_FOR_doloop_end_i) > - { > - rtx_insn *top_label > - = as_a <rtx_insn *> (XEXP (XEXP (SET_SRC (XVECEXP (PATTERN (insn), > 0, 0)), 1), 0)); > - rtx num = GEN_INT (CODE_LABEL_NUMBER (top_label)); > - rtx_insn *lp, *prev = prev_nonnote_insn (top_label); > - rtx_insn *lp_simple = NULL; > - rtx_insn *next = NULL; > - rtx op0 = XEXP (XVECEXP (PATTERN (insn), 0, 1), 0); > - int seen_label = 0; > - > - for (lp = prev; > - (lp && NONJUMP_INSN_P (lp) > - && recog_memoized (lp) != CODE_FOR_doloop_begin_i); > - lp = prev_nonnote_insn (lp)) > - ; > - if (!lp || !NONJUMP_INSN_P (lp) > - || dead_or_set_regno_p (lp, LP_COUNT)) > - { > - HOST_WIDE_INT loop_end_id > - = INTVAL (XEXP (XVECEXP (PATTERN (insn), 0, 4), 0)); > + compute_bb_for_insn (); > > - for (prev = next = insn, lp = NULL ; prev || next;) > - { > - if (prev) > - { > - if (NONJUMP_INSN_P (prev) > - && recog_memoized (prev) == CODE_FOR_doloop_begin_i > - && (INTVAL (XEXP (XVECEXP (PATTERN (prev), 0, 5), 0)) > - == loop_end_id)) > - { > - lp = prev; > - break; > - } > - else if (LABEL_P (prev)) > - seen_label = 1; > - prev = prev_nonnote_insn (prev); > - } > - if (next) > - { > - if (NONJUMP_INSN_P (next) > - && recog_memoized (next) == CODE_FOR_doloop_begin_i > - && (INTVAL (XEXP (XVECEXP (PATTERN (next), 0, 5), 0)) > - == loop_end_id)) > - { > - lp = next; > - break; > - } > - next = next_nonnote_insn (next); > - } > - } > - prev = NULL; > - } > - else > - lp_simple = lp; > - if (lp && !dead_or_set_regno_p (lp, LP_COUNT)) > - { > - rtx begin_cnt = XEXP (XVECEXP (PATTERN (lp), 0 ,3), 0); > - if (INTVAL (XEXP (XVECEXP (PATTERN (lp), 0, 4), 0))) > - /* The loop end insn has been duplicated. That can happen > - when there is a conditional block at the very end of > - the loop. 
*/ > - goto failure; > - /* If Register allocation failed to allocate to the right > - register, There is no point into teaching reload to > - fix this up with reloads, as that would cost more > - than using an ordinary core register with the > - doloop_fallback pattern. */ > - if ((true_regnum (op0) != LP_COUNT || !REG_P (begin_cnt)) > - /* Likewise, if the loop setup is evidently inside the loop, > - we loose. */ > - || (!lp_simple && lp != next && !seen_label)) > - { > - remove_insn (lp); > - goto failure; > - } > - /* It is common that the optimizers copy the loop count from > - another register, and doloop_begin_i is stuck with the > - source of the move. Making doloop_begin_i only accept "l" > - is nonsentical, as this then makes reload evict the pseudo > - used for the loop end. The underlying cause is that the > - optimizers don't understand that the register allocation for > - doloop_begin_i should be treated as part of the loop. > - Try to work around this problem by verifying the previous > - move exists. 
*/ > - if (true_regnum (begin_cnt) != LP_COUNT) > - { > - rtx_insn *mov; > - rtx set, note; > + df_analyze (); > > - for (mov = prev_nonnote_insn (lp); mov; > - mov = prev_nonnote_insn (mov)) > - { > - if (!NONJUMP_INSN_P (mov)) > - mov = 0; > - else if ((set = single_set (mov)) > - && rtx_equal_p (SET_SRC (set), begin_cnt) > - && rtx_equal_p (SET_DEST (set), op0)) > - break; > - } > - if (mov) > - { > - XEXP (XVECEXP (PATTERN (lp), 0 ,3), 0) = op0; > - note = find_regno_note (lp, REG_DEAD, REGNO (begin_cnt)); > - if (note) > - remove_note (lp, note); > - } > - else > - { > - remove_insn (lp); > - goto failure; > - } > - } > - XEXP (XVECEXP (PATTERN (insn), 0, 4), 0) = num; > - XEXP (XVECEXP (PATTERN (lp), 0, 4), 0) = num; > - if (next == lp) > - XEXP (XVECEXP (PATTERN (lp), 0, 6), 0) = const2_rtx; > - else if (!lp_simple) > - XEXP (XVECEXP (PATTERN (lp), 0, 6), 0) = const1_rtx; > - else if (prev != lp) > - { > - remove_insn (lp); > - add_insn_after (lp, prev, NULL); > - } > - if (!lp_simple) > - { > - XEXP (XVECEXP (PATTERN (lp), 0, 7), 0) > - = gen_rtx_LABEL_REF (Pmode, top_label); > - add_reg_note (lp, REG_LABEL_OPERAND, top_label); > - LABEL_NUSES (top_label)++; > - } > - /* We can avoid tedious loop start / end setting for empty loops > - be merely setting the loop count to its final value. */ > - if (next_active_insn (top_label) == insn) > - { > - rtx lc_set > - = gen_rtx_SET (XEXP (XVECEXP (PATTERN (lp), 0, 3), 0), > - const0_rtx); > - > - rtx_insn *lc_set_insn = emit_insn_before (lc_set, insn); > - delete_insn (lp); > - delete_insn (insn); > - insn = lc_set_insn; > - } > - /* If the loop is non-empty with zero length, we can't make it > - a zero-overhead loop. That can happen for empty asms. */ > - else > - { > - rtx_insn *scan; > + /* Doloop optimization. 
*/ > + arc_reorg_loops (); > > - for (scan = top_label; > - (scan && scan != insn > - && (!NONJUMP_INSN_P (scan) || !get_attr_length (scan))); > - scan = NEXT_INSN (scan)); > - if (scan == insn) > - { > - remove_insn (lp); > - goto failure; > - } > - } > - } > - else > - { > - /* Sometimes the loop optimizer makes a complete hash of the > - loop. If it were only that the loop is not entered at the > - top, we could fix this up by setting LP_START with SR . > - However, if we can't find the loop begin were it should be, > - chances are that it does not even dominate the loop, but is > - inside the loop instead. Using SR there would kill > - performance. > - We use the doloop_fallback pattern here, which executes > - in two cycles on the ARC700 when predicted correctly. */ > - failure: > - if (!REG_P (op0)) > - { > - rtx op3 = XEXP (XVECEXP (PATTERN (insn), 0, 5), 0); > - > - emit_insn_before (gen_move_insn (op3, op0), insn); > - PATTERN (insn) > - = gen_doloop_fallback_m (op3, JUMP_LABEL (insn), op0); > - } > - else > - XVEC (PATTERN (insn), 0) > - = gen_rtvec (2, XVECEXP (PATTERN (insn), 0, 0), > - XVECEXP (PATTERN (insn), 0, 1)); > - INSN_CODE (insn) = -1; > - } > - } > - } > + workaround_arc_anomaly (); > > /* FIXME: should anticipate ccfsm action, generate special patterns for > to-be-deleted branches that have no delay slot and have at least the > @@ -7866,11 +8039,11 @@ arc_register_move_cost (machine_mode, > return 6; > } > > - /* The ARC700 stalls for 3 cycles when *reading* from lp_count. */ > - if (TARGET_ARC700 > - && (from_class == LPCOUNT_REG || from_class == ALL_CORE_REGS > - || from_class == WRITABLE_CORE_REGS)) > - return 8; > + /* Using lp_count as scratch reg is a VERY bad idea. */ > + if (from_class == LPCOUNT_REG) > + return 1000; > + if (to_class == LPCOUNT_REG) > + return 6; > > /* Force an attempt to 'mov Dy,Dx' to spill. 
*/ > if ((TARGET_ARC700 || TARGET_EM) && TARGET_DPFP > @@ -8312,14 +8485,6 @@ arc600_corereg_hazard (rtx_insn *pred, rtx_insn *succ) > { > if (!TARGET_ARC600) > return 0; > - /* If SUCC is a doloop_end_i with a preceding label, we must output a nop > - in front of SUCC anyway, so there will be separation between PRED and > - SUCC. */ > - if (recog_memoized (succ) == CODE_FOR_doloop_end_i > - && LABEL_P (prev_nonnote_insn (succ))) > - return 0; > - if (recog_memoized (succ) == CODE_FOR_doloop_begin_i) > - return 0; > if (GET_CODE (PATTERN (pred)) == SEQUENCE) > pred = as_a <rtx_sequence *> (PATTERN (pred))->insn (1); > if (GET_CODE (PATTERN (succ)) == SEQUENCE) > @@ -8393,76 +8558,6 @@ arc_asm_insn_p (rtx x) > return 0; > } > > -/* We might have a CALL to a non-returning function before a loop end. > - ??? Although the manual says that's OK (the target is outside the > - loop, and the loop counter unused there), the assembler barfs on > - this for ARC600, so we must insert a nop before such a call too. > - For ARC700, and ARCv2 is not allowed to have the last ZOL > - instruction a jump to a location where lp_count is modified. */ > - > -static bool > -arc_loop_hazard (rtx_insn *pred, rtx_insn *succ) > -{ > - rtx_insn *jump = NULL; > - rtx label_rtx = NULL_RTX; > - rtx_insn *label = NULL; > - basic_block succ_bb; > - > - if (recog_memoized (succ) != CODE_FOR_doloop_end_i) > - return false; > - > - /* Phase 1: ARC600 and ARCv2HS doesn't allow any control instruction > - (i.e., jump/call) as the last instruction of a ZOL. */ > - if (TARGET_ARC600 || TARGET_HS) > - if (JUMP_P (pred) || CALL_P (pred) > - || arc_asm_insn_p (PATTERN (pred)) > - || GET_CODE (PATTERN (pred)) == SEQUENCE) > - return true; > - > - /* Phase 2: Any architecture, it is not allowed to have the last ZOL > - instruction a jump to a location where lp_count is modified. */ > - > - /* Phase 2a: Dig for the jump instruction. 
*/ > - if (JUMP_P (pred)) > - jump = pred; > - else if (GET_CODE (PATTERN (pred)) == SEQUENCE > - && JUMP_P (XVECEXP (PATTERN (pred), 0, 0))) > - jump = as_a <rtx_insn *> (XVECEXP (PATTERN (pred), 0, 0)); > - else > - return false; > - > - /* Phase 2b: Make sure is not a millicode jump. */ > - if ((GET_CODE (PATTERN (jump)) == PARALLEL) > - && (XVECEXP (PATTERN (jump), 0, 0) == ret_rtx)) > - return false; > - > - label_rtx = JUMP_LABEL (jump); > - if (!label_rtx) > - return false; > - > - /* Phase 2c: Make sure is not a return. */ > - if (ANY_RETURN_P (label_rtx)) > - return false; > - > - /* Pahse 2d: Go to the target of the jump and check for aliveness of > - LP_COUNT register. */ > - label = safe_as_a <rtx_insn *> (label_rtx); > - succ_bb = BLOCK_FOR_INSN (label); > - if (!succ_bb) > - { > - gcc_assert (NEXT_INSN (label)); > - if (NOTE_INSN_BASIC_BLOCK_P (NEXT_INSN (label))) > - succ_bb = NOTE_BASIC_BLOCK (NEXT_INSN (label)); > - else > - succ_bb = BLOCK_FOR_INSN (NEXT_INSN (label)); > - } > - > - if (succ_bb && REGNO_REG_SET_P (df_get_live_out (succ_bb), LP_COUNT)) > - return true; > - > - return false; > -} > - > /* For ARC600: > A write to a core reg greater or equal to 32 must not be immediately > followed by a use. Anticipate the length requirement to insert a nop > @@ -8474,9 +8569,6 @@ arc_hazard (rtx_insn *pred, rtx_insn *succ) > if (!pred || !INSN_P (pred) || !succ || !INSN_P (succ)) > return 0; > > - if (arc_loop_hazard (pred, succ)) > - return 4; > - > if (TARGET_ARC600) > return arc600_corereg_hazard (pred, succ); > > @@ -8494,24 +8586,6 @@ arc_adjust_insn_length (rtx_insn *insn, int len, bool) > if (GET_CODE (PATTERN (insn)) == SEQUENCE) > return len; > > - /* It is impossible to jump to the very end of a Zero-Overhead Loop, as > - the ZOL mechanism only triggers when advancing to the end address, > - so if there's a label at the end of a ZOL, we need to insert a nop. > - The ARC600 ZOL also has extra restrictions on jumps at the end of a > - loop. 
*/ > - if (recog_memoized (insn) == CODE_FOR_doloop_end_i) > - { > - rtx_insn *prev = prev_nonnote_insn (insn); > - > - return ((LABEL_P (prev) > - || (TARGET_ARC600 > - && (JUMP_P (prev) > - || CALL_P (prev) /* Could be a noreturn call. */ > - || (NONJUMP_INSN_P (prev) > - && GET_CODE (PATTERN (prev)) == SEQUENCE)))) > - ? len + 4 : len); > - } > - > /* Check for return with but one preceding insn since function > start / call. */ > if (TARGET_PAD_RETURN > @@ -9871,27 +9945,9 @@ arc_scheduling_not_expected (void) > return cfun->machine->arc_reorg_started; > } > > -/* Oddly enough, sometimes we get a zero overhead loop that branch > - shortening doesn't think is a loop - observed with compile/pr24883.c > - -O3 -fomit-frame-pointer -funroll-loops. Make sure to include the > - alignment visible for branch shortening (we actually align the loop > - insn before it, but that is equivalent since the loop insn is 4 byte > - long.) */ > - > int > arc_label_align (rtx_insn *label) > { > - int loop_align = LOOP_ALIGN (LABEL); > - > - if (loop_align > align_labels_log) > - { > - rtx_insn *prev = prev_nonnote_insn (label); > - > - if (prev && NONJUMP_INSN_P (prev) > - && GET_CODE (PATTERN (prev)) == PARALLEL > - && recog_memoized (prev) == CODE_FOR_doloop_begin_i) > - return loop_align; > - } > /* Code has a minimum p2 alignment of 1, which we must restore after an > ADDR_DIFF_VEC. 
*/ > if (align_labels_log < 1) > diff --git a/gcc/config/arc/arc.h b/gcc/config/arc/arc.h > index d4e97cd..4c54b7e 100644 > --- a/gcc/config/arc/arc.h > +++ b/gcc/config/arc/arc.h > @@ -581,15 +581,15 @@ enum reg_class > {0x0000f00f, 0x00000000, 0x00000000, 0x00000000, 0x00000000}, /* > 'q', r0-r3, r12-r15 */ \ > {0x1000f00f, 0x00000000, 0x00000000, 0x00000000, 0x00000000}, /* > 'e', r0-r3, r12-r15, sp */ \ > {0x1c001fff, 0x00000000, 0x00000000, 0x00000000, 0x00000000}, /* "Rsc", > r0-r12 */ \ > - {0x9fffffff, 0xc0000000, 0x00000000, 0x00000000, 0x00000000}, /* 'r', > r0-r28, blink, ap and pcl */ \ > + {0x9fffffff, 0x80000000, 0x00000000, 0x00000000, 0x00000000}, /* 'r', > r0-r28, blink, ap and pcl */ \ > {0xffffffff, 0x00000000, 0x00000000, 0x00000000, 0x00000000}, /* 'W', > r0-r31 */ \ > /* Include ap / pcl in WRITABLE_CORE_REGS for sake of symmetry. As these \ > registers are fixed, it does not affect the literal meaning of the \ > constraints, but it makes it a superset of GENERAL_REGS, thus \ > enabling some operations that would otherwise not be possible. */ \ > - {0xffffffff, 0xd0000000, 0x00000000, 0x00000000, 0x00000000}, /* 'w', > r0-r31, r60 */ \ > - {0xffffffff, 0xdfffffff, 0x00000000, 0x00000000, 0x00000000}, /* 'c', > r0-r60, ap, pcl */ \ > - {0xffffffff, 0xdfffffff, 0x00000000, 0x00000000, 0x00000000}, /* > 'Rac', r0-r60, ap, pcl */ \ > + {0xffffffff, 0x00000000, 0x00000000, 0x00000000, 0x00000000}, /* 'w', > r0-r31, r60 */ \ > + {0xffffffff, 0x9fffffff, 0x00000000, 0x00000000, 0x00000000}, /* 'c', > r0-r60, ap, pcl */ \ > + {0xffffffff, 0x9fffffff, 0x00000000, 0x00000000, 0x00000000}, /* > 'Rac', r0-r60, ap, pcl */ \ > {0x0000000f, 0x00000000, 0x00000000, 0x00000000, 0x00000000}, /* > 'Rcd', r0-r3 */ \ > {0x00000003, 0x00000000, 0x00000000, 0x00000000, 0x00000000}, /* > 'Rsd', r0-r1 */ \ > {0x9fffffff, 0x00000000, 0x00000000, 0x00000000, 0x00000000}, /* 'h', > r0-28, r30 */ \ > @@ -1351,7 +1351,7 @@ do { > \ > of a loop. 
*/ > /* On the ARC, align loops to 4 byte boundaries unless doing all-out size > optimization. */ > -#define LOOP_ALIGN JUMP_ALIGN > +#define LOOP_ALIGN(X) 0 > > #define LABEL_ALIGN(LABEL) (arc_label_align (LABEL)) > > diff --git a/gcc/config/arc/arc.md b/gcc/config/arc/arc.md > index 04a1447..27afe40 100644 > --- a/gcc/config/arc/arc.md > +++ b/gcc/config/arc/arc.md > @@ -554,6 +554,11 @@ > (eq_attr "annul_ret_delay_insn" "yes") > (eq_attr "cond_ret_delay_insn" "yes")]) > > +(define_delay (eq_attr "type" "loop_end") > + [(eq_attr "in_delay_slot" "true") > + (eq_attr "in_delay_slot" "true") > + (nil)]) > + > ;; For ARC600, unexposing the delay sloy incurs a penalty also in the > ;; non-taken case, so the only meaningful way to have an annull-true > ;; filled delay slot is to conditionalize the delay slot insn. > @@ -618,8 +623,8 @@ > ; The iscompact attribute allows the epilogue expander to know for which > ; insns it should lengthen the return insn. > (define_insn "*movqi_insn" > - [(set (match_operand:QI 0 "move_dest_operand" "=Rcq,Rcq#q, w,Rcq#q, > h, w,w,???w,h, w,Rcq, S,!*x, r,r, Ucm,m,???m, m,Usc") > - (match_operand:QI 1 "move_src_operand" " cL, cP,Rcq#q, > P,hCm1,cL,I,?Rac,i,?i, T,Rcq,Usd,Ucm,m,?Rac,c,?Rac,Cm3,i"))] > + [(set (match_operand:QI 0 "move_dest_operand" "=Rcq,Rcq#q, w,Rcq#q, > h,w*l,w*l,???w,h,w*l,Rcq, S,!*x, r,r, Ucm,m,???m, m,Usc") > + (match_operand:QI 1 "move_src_operand" " cL, cP,Rcq#q, P,hCm1, > cL, I,?Rac,i, ?i, T,Rcq,Usd,Ucm,m,?Rac,c,?Rac,Cm3,i"))] > "register_operand (operands[0], QImode) > || register_operand (operands[1], QImode)" > "@ > @@ -655,8 +660,8 @@ > "if (prepare_move_operands (operands, HImode)) DONE;") > > (define_insn "*movhi_insn" > - [(set (match_operand:HI 0 "move_dest_operand" "=Rcq,Rcq#q, w,Rcq#q, > h, w,w,???w,Rcq#q,h, w,Rcq, S, r,r, Ucm,m,???m, m,VUsc") > - (match_operand:HI 1 "move_src_operand" " cL, cP,Rcq#q, > P,hCm1,cL,I,?Rac, i,i,?i, T,Rcq,Ucm,m,?Rac,c,?Rac,Cm3,i"))] > + [(set (match_operand:HI 0 
"move_dest_operand" "=Rcq,Rcq#q, w,Rcq#q, > h,w*l,w*l,???w,Rcq#q,h,w*l,Rcq, S, r,r, Ucm,m,???m, m,VUsc") > + (match_operand:HI 1 "move_src_operand" " cL, cP,Rcq#q, P,hCm1, > cL, I,?Rac, i,i, ?i, T,Rcq,Ucm,m,?Rac,c,?Rac,Cm3,i"))] > "register_operand (operands[0], HImode) > || register_operand (operands[1], HImode) > || (CONSTANT_P (operands[1]) > @@ -706,9 +711,9 @@ > ; the iscompact attribute allows the epilogue expander to know for which > ; insns it should lengthen the return insn. > ; N.B. operand 1 of alternative 7 expands into pcl,symbol@gotpc . > -(define_insn "*movsi_insn" ; 0 1 2 3 4 > 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 > 22 23 24 25 26 27 28 29 30 31 > - [(set (match_operand:SI 0 "move_dest_operand" "=Rcq,Rcq#q, w,Rcq#q, > h, w,w, w, w, w, w,???w, ?w, w,Rcq#q, h, w,Rcq, S, Us<,RcqRck,!*x, > r,!*Rsd,!*Rcd,r,Ucm, Usd,m,???m, m,VUsc") > - (match_operand:SI 1 "move_src_operand" " cL, cP,Rcq#q, > P,hCm1,cL,I,Crr,Clo,Chi,Cbi,?Rac,Cpc,Clb, ?Cal,Cal,?Cal,Uts,Rcq,RcqRck, > Us>,Usd,Ucm, Usd, Ucd,m, w,!*Rzd,c,?Rac,Cm3, C32"))] > +(define_insn "*movsi_insn" ; 0 1 2 3 4 > 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 > 21 22 23 24 25 26 27 28 29 30 31 > + [(set (match_operand:SI 0 "move_dest_operand" "=Rcq,Rcq#q, w,Rcq#q, > h,w*l,w*l, w, w, w, w, ???w, ?w, w,Rcq#q, h, w*l,Rcq, S, > Us<,RcqRck,!*x, r,!*Rsd,!*Rcd,r,Ucm, Usd,m,???m, m,VUsc") > + (match_operand:SI 1 "move_src_operand" " cL, cP,Rcq#q, P,hCm1, > cL, I,Crr,Clo,Chi,Cbi,?Rac*l,Cpc,Clb, ?Cal,Cal,?Cal,Uts,Rcq,RcqRck, > Us>,Usd,Ucm, Usd, Ucd,m, w,!*Rzd,c,?Rac,Cm3, C32"))] > "register_operand (operands[0], SImode) > || register_operand (operands[1], SImode) > || (CONSTANT_P (operands[1]) > @@ -5073,317 +5078,123 @@ > xtr, const0_rtx); > }) > > +;; ------------------------------------------------------------------- > +;; Hardware loop > +;; ------------------------------------------------------------------- > + > ; operand 0 is the loop count pseudo register > -; operand 1 is the loop end pattern > 
-(define_expand "doloop_begin" > - [(use (match_operand 0 "register_operand" "")) > - (use (match_operand 1 "" ""))] > +; operand 1 is the label to jump to at the top of the loop > +(define_expand "doloop_end" > + [(parallel [(set (pc) > + (if_then_else > + (ne (match_operand 0 "" "") > + (const_int 1)) > + (label_ref (match_operand 1 "" "")) > + (pc))) > + (set (match_dup 0) (plus (match_dup 0) (const_int -1))) > + (unspec [(const_int 0)] UNSPEC_ARC_LP) > + (clobber (match_dup 2))])] > "" > { > - /* Using the INSN_UID of the loop end pattern to identify it causes > - trouble with -fcompare-debug, so allocate a debug-independent > - id instead. We use negative numbers so that we can use the same > - slot in doloop_end_i where we later store a CODE_LABEL_NUMBER, and > - still be able to tell what kind of number this is. */ > - static HOST_WIDE_INT loop_end_id = 0; > - > - rtx id = GEN_INT (--loop_end_id); > - XEXP (XVECEXP (PATTERN (operands[1]), 0, 4), 0) = id; > - emit_insn (gen_doloop_begin_i (operands[0], const0_rtx, id, > - const0_rtx, const0_rtx)); > - DONE; > + if (GET_MODE (operands[0]) != SImode) > + FAIL; > + operands[2] = gen_rtx_SCRATCH (SImode); > }) > > -; ??? can't describe the insn properly as then the optimizers try to > -; hoist the SETs. > -;(define_insn "doloop_begin_i" > -; [(set (reg:SI LP_START) (pc)) > -; (set (reg:SI LP_END) (unspec:SI [(pc)] UNSPEC_ARC_LP)) > -; (use (match_operand 0 "const_int_operand" "n"))] > -; "" > -; "lp .L__GCC__LP%0" > -;) > - > -; The operands of doloop_end_i are also read / written by arc_reorg with > -; XVECEXP (PATTERN (lp, 0, N), so if you want to change the pattern, you > -; might have to adjust arc_reorg. > -; operands 0 / 2 are supplied by the expander, 1, 3 and 4 are filled in > -; by arc_reorg. arc_reorg might also alter operand 0. 
> -; > -; N in XVECEXP PATTERN (lp, 0 N) > -; V rtl purpose > -; 0 unspec UNSPEC_ARC_LP identify pattern > -; 1 clobber LP_START show LP_START is set > -; 2 clobber LP_END show LP_END is set > -; 3 use operand0 loop count pseudo register > -; 4 use operand1 before arc_reorg: -id > -; after : CODE_LABEL_NUMBER of loop top > label > -; 5 use operand2 INSN_UID of loop end insn > -; 6 use operand3 loop setup not at start (1 above, 2 > below) > -; 7 use operand4 LABEL_REF of top label, if not > -; immediately following > -; If operand1 is still zero after arc_reorg, this is an orphaned loop > -; instruction that was not at the start of the loop. > -; There is no point is reloading this insn - then lp_count would still not > -; be available for the loop end. > -(define_insn "doloop_begin_i" > - [(unspec:SI [(pc)] UNSPEC_ARC_LP) > - (clobber (reg:SI LP_START)) > - (clobber (reg:SI LP_END)) > - (use (match_operand:SI 0 "register_operand" "l,l,????*X")) > - (use (match_operand 1 "const_int_operand" "n,n,C_0")) > - (use (match_operand 2 "const_int_operand" "n,n,X")) > - (use (match_operand 3 "const_int_operand" "C_0,n,X")) > - (use (match_operand 4 "const_int_operand" "C_0,X,X"))] > +(define_insn "arc_lp" > + [(unspec:SI [(match_operand:SI 0 "register_operand" "l")] > + UNSPEC_ARC_LP) > + (use (label_ref (match_operand 1 "" ""))) > + (use (label_ref (match_operand 2 "" "")))] > "" > -{ > - rtx_insn *scan; > - int len, size = 0; > - int n_insns = 0; > - rtx loop_start = operands[4]; > - > - if (CONST_INT_P (loop_start)) > - loop_start = NULL_RTX; > - /* Size implications of the alignment will be taken care of by the > - alignment inserted at the loop start. */ > - if (LOOP_ALIGN (0) && INTVAL (operands[1])) > - { > - asm_fprintf (asm_out_file, "\t.p2align %d\\n", LOOP_ALIGN (0)); > - arc_clear_unalign (); > - } > - if (!INTVAL (operands[1])) > - return "; LITTLE LOST LOOP"; > - if (loop_start && flag_pic) > - { > - /* ??? 
Can do better for when a scratch register > - is known. But that would require extra testing. */ > - return "push_s r0\;add r0,pcl,%4@pcl\;sr r0,[2]; LP_START\;add > r0,pcl,.L__GCC__LP%1@pcl\;sr r0,[3]; LP_END\;pop_s r0"; > - } > - /* Check if the loop end is in range to be set by the lp instruction. */ > - size = INTVAL (operands[3]) < 2 ? 0 : 2048; > - for (scan = insn; scan && size < 2048; scan = NEXT_INSN (scan)) > - { > - if (!INSN_P (scan)) > - continue; > - if (recog_memoized (scan) == CODE_FOR_doloop_end_i > - && (XEXP (XVECEXP (PATTERN (scan), 0, 4), 0) > - == XEXP (XVECEXP (PATTERN (insn), 0, 4), 0))) > - break; > - len = get_attr_length (scan); > - size += len; > - } > - /* Try to verify that there are at least three instruction fetches > - between the loop setup and the first encounter of the loop end. */ > - for (scan = NEXT_INSN (insn); scan && n_insns < 3; scan = NEXT_INSN (scan)) > - { > - if (!INSN_P (scan)) > - continue; > - if (rtx_sequence *seq = dyn_cast <rtx_sequence *> (PATTERN (scan))) > - scan = seq->insn (0); > - if (JUMP_P (scan)) > - { > - if (recog_memoized (scan) != CODE_FOR_doloop_end_i) > - { > - n_insns += 2; > - if (simplejump_p (scan)) > - { > - scan = as_a <rtx_insn *> (XEXP (SET_SRC (PATTERN (scan)), 0)); > - continue; > - } > - > - rtx lab = JUMP_LABEL (scan); > - if (!lab) > - break; > - > - rtx_insn *next_scan > - = next_active_insn (NEXT_INSN (PREV_INSN (scan))); > - if (next_scan > - && recog_memoized (next_scan) != CODE_FOR_doloop_begin_i) > - break; > - > - /* JUMP_LABEL might be simple_return instead if an insn. 
*/ > - if (!INSN_P (lab)) > - { > - n_insns++; > - break; > - } > - > - rtx_insn *next_lab = next_active_insn (as_a<rtx_insn *> (lab)); > - if (next_lab > - && recog_memoized (next_lab) != CODE_FOR_doloop_begin_i) > - break; > - > - n_insns++; > - } > - break; > - } > - len = get_attr_length (scan); > - /* Size estimation of asms assumes that each line which is nonempty > - codes an insn, and that each has a long immediate. For minimum insn > - count, assume merely that a nonempty asm has at least one insn. */ > - if (GET_CODE (PATTERN (scan)) == ASM_INPUT > - || asm_noperands (PATTERN (scan)) >= 0) > - n_insns += (len != 0); > - else > - n_insns += (len > 4 ? 2 : (len ? 1 : 0)); > - } > - if (LOOP_ALIGN (0)) > - { > - asm_fprintf (asm_out_file, "\t.p2align %d\\n", LOOP_ALIGN (0)); > - arc_clear_unalign (); > - } > - gcc_assert (n_insns || GET_CODE (next_nonnote_insn (insn)) == CODE_LABEL); > - if (size >= 2048 || (TARGET_ARC600 && n_insns == 1) || loop_start) > - { > - if (flag_pic) > - { > - /* ??? Can do better for when a scratch register > - is known. But that would require extra testing. */ > - arc_clear_unalign (); > - return ".p2align 2\;push_s r0\;add r0,pcl,24\;sr r0,[2]; > LP_START\;add r0,pcl,.L__GCC__LP%1@pcl\;sr r0,[3]; LP_END\;pop_s r0"; > - } > - output_asm_insn ((size < 2048 > - ? "lp .L__GCC__LP%1" : "sr .L__GCC__LP%1,[3]; LP_END"), > - operands); > - output_asm_insn (loop_start > - ? "sr %4,[2]; LP_START" : "sr 0f,[2]; LP_START", > - operands); > - if (TARGET_ARC600 && n_insns < 1) > - output_asm_insn ("nop", operands); > - return (TARGET_ARC600 && n_insns < 3) ? "nop_s\;nop_s\;0:" : "0:"; > - } > - else if (TARGET_ARC600 && n_insns < 3) > - { > - /* At least four instructions are needed between the setting of > LP_COUNT > - and the loop end - but the lp instruction qualifies as one. 
*/ > - rtx_insn *prev = prev_nonnote_insn (insn); > - > - if (!INSN_P (prev) || dead_or_set_regno_p (prev, LP_COUNT)) > - output_asm_insn ("nop", operands); > - } > - return "lp .L__GCC__LP%1"; > -} > + "lp\\t@%l2\\t; %0:@%l1->@%l2" > [(set_attr "type" "loop_setup") > - (set_attr_alternative "length" > -; FIXME: length is usually 4, but we need branch shortening > -; to get this right. > -; [(if_then_else (match_test "TARGET_ARC600") (const_int 16) (const_int > 4)) > - [(if_then_else (match_test "flag_pic") (const_int 24) (const_int 16)) > - (if_then_else (match_test "flag_pic") (const_int 28) (const_int 16)) > - (const_int 0)])] > - ;; ??? we should really branch shorten this insn, but then we'd > - ;; need a proper label first. N.B. the end label can not only go out > - ;; of range when it is far away, but also when it precedes the loop - > - ;; which, unfortunately, it sometimes does, when the loop "optimizer" > - ;; messes things up. > -) > - > -; operand 0 is the loop count pseudo register > -; operand 1 is the label to jump to at the top of the loop > -; Use this for the ARC600 and ARC700. > -; ??? ARC600 might want to check if the loop has few iteration and only a > -; single insn - loop setup is expensive then. > -(define_expand "doloop_end" > - [(use (match_operand 0 "register_operand" "")) > - (use (label_ref (match_operand 1 "" "")))] > - "!TARGET_ARC601" > -{ > - /* We could do smaller bivs with biv widening, and wider bivs by having > - a high-word counter in an outer loop - but punt on this for now. */ > - if (GET_MODE (operands[0]) != SImode) > - FAIL; > - emit_jump_insn (gen_doloop_end_i (operands[0], operands[1], const0_rtx)); > - DONE; > -}) > + (set_attr "length" "4")]) > > -(define_insn_and_split "doloop_end_i" > +;; if by any chance the lp_count is not used, then use an 'r' > +;; register, instead of going to memory. 
> +(define_insn "loop_end" > [(set (pc) > - (if_then_else (ne (match_operand:SI 0 "shouldbe_register_operand" > "+l,*c,*m") > - (const_int 1)) > + (if_then_else (ne (match_operand:SI 2 "nonimmediate_operand" "0,0") > + (const_int 1)) > (label_ref (match_operand 1 "" "")) > (pc))) > - (set (match_dup 0) (plus:SI (match_dup 0) (const_int -1))) > - (use (reg:SI LP_START)) > - (use (reg:SI LP_END)) > - (use (match_operand 2 "const_int_operand" "n,???Cn0,???X")) > - (clobber (match_scratch:SI 3 "=X,X,&????r"))] > + (set (match_operand:SI 0 "nonimmediate_operand" "=l!r,m") > + (plus (match_dup 2) (const_int -1))) > + (unspec [(const_int 0)] UNSPEC_ARC_LP) > + (clobber (match_scratch:SI 3 "=X,&r"))] > "" > - "* > -{ > - rtx_insn *prev = prev_nonnote_insn (insn); > - > - /* If there is an immediately preceding label, we must output a nop, > - lest a branch to that label will fall out of the loop. > - ??? We could try to avoid this by claiming to have a delay slot if there > - is a preceding label, and outputting the delay slot insn instead, if > - present. > - Or we could have some optimization that changes the source edge to > update > - the loop count and jump to the loop start instead. */ > - /* For ARC600, we must also prevent jumps inside the loop and jumps where > - the loop counter value is live at the target from being directly at the > - loop end. Being sure that the loop counter is dead at the target is > - too much hair - we can't rely on data flow information at this point - > - so insert a nop for all branches. > - The ARC600 also can't read the loop counter in the last insn of a loop. 
> */ > - if (LABEL_P (prev)) > - output_asm_insn (\"nop%?\", operands); > - return \"\\n.L__GCC__LP%2: ; loop end, start is %1\"; > -}" > - "&& memory_operand (operands[0], SImode)" > - [(pc)] > -{ > - emit_move_insn (operands[3], operands[0]); > - emit_jump_insn (gen_doloop_fallback_m (operands[3], operands[1], > operands[0])); > - DONE; > -} > - [(set_attr "type" "loop_end") > - (set (attr "length") > - (if_then_else (match_test "LABEL_P (prev_nonnote_insn (insn))") > - (const_int 4) (const_int 0)))] > -) > + "\\t;%0 %1 %2" > + [(set_attr "length" "0") > + (set_attr "predicable" "no") > + (set_attr "type" "loop_end")]) > > -; This pattern is generated by arc_reorg when there is no recognizable > -; loop start. > -(define_insn "*doloop_fallback" > - [(set (pc) (if_then_else (ne (match_operand:SI 0 "register_operand" > "+r,!w") > - (const_int 1)) > - (label_ref (match_operand 1 "" "")) > - (pc))) > - (set (match_dup 0) (plus:SI (match_dup 0) (const_int -1)))] > - ; avoid fooling the loop optimizer into assuming this is a special insn. > - "reload_completed" > - "*return get_attr_length (insn) == 8 > - ? \"brne.d %0,1,%1\;sub %0,%0,1\" > - : \"breq %0,1,0f\;b.d %1\;sub %0,%0,1\\n0:\";" > - [(set (attr "length") > - (if_then_else (and (ge (minus (match_dup 1) (pc)) (const_int -256)) > - (le (minus (match_dup 1) (pc)) (const_int 244))) > - (const_int 8) (const_int 12))) > - (set_attr "type" "brcc_no_delay_slot") > - (set_attr "cond" "nocond")] > -) > +;; split pattern for the very slim chance when the loop register is > +;; memory. 
> +(define_split > + [(set (pc) > + (if_then_else (ne (match_operand:SI 0 "memory_operand") > + (const_int 1)) > + (label_ref (match_operand 1 "")) > + (pc))) > + (set (match_dup 0) (plus (match_dup 0) (const_int -1))) > + (unspec [(const_int 0)] UNSPEC_ARC_LP) > + (clobber (match_scratch:SI 2))] > + "memory_operand (operands[0], SImode)" > + [(set (match_dup 2) (match_dup 0)) > + (set (match_dup 2) (plus:SI (match_dup 2) (const_int -1))) > + (set (match_dup 0) (match_dup 2)) > + (set (reg:CC CC_REG) (compare:CC (match_dup 2) (const_int 0))) > + (set (pc) > + (if_then_else (ne (reg:CC CC_REG) > + (const_int 0)) > + (label_ref (match_dup 1)) > + (pc)))] > + "") > > -; reload can't make output reloads for jump insns, so we have to do this by > hand. > -(define_insn "doloop_fallback_m" > - [(set (pc) (if_then_else (ne (match_operand:SI 0 "register_operand" "+&r") > - (const_int 1)) > - (label_ref (match_operand 1 "" "")) > - (pc))) > - (set (match_dup 0) (plus:SI (match_dup 0) (const_int -1))) > - (set (match_operand:SI 2 "memory_operand" "=m") > - (plus:SI (match_dup 0) (const_int -1)))] > - ; avoid fooling the loop optimizer into assuming this is a special insn. > - "reload_completed" > - "*return get_attr_length (insn) == 12 > - ? 
\"sub %0,%0,1\;brne.d %0,0,%1\;st%U2%V2 %0,%2\" > - : \"sub %0,%0,1\;breq %0,0,0f\;b.d %1\\n0:\tst%U2%V2 %0,%2\";" > - [(set (attr "length") > - (if_then_else (and (ge (minus (match_dup 1) (pc)) (const_int -252)) > - (le (minus (match_dup 1) (pc)) (const_int 244))) > - (const_int 12) (const_int 16))) > - (set_attr "type" "brcc_no_delay_slot") > - (set_attr "cond" "nocond")] > -) > +(define_insn "loop_fail" > + [(set (reg:SI LP_COUNT) > + (plus:SI (reg:SI LP_COUNT) (const_int -1))) > + (set (reg:CC_ZN CC_REG) > + (compare:CC_ZN (plus:SI (reg:SI LP_COUNT) (const_int -1)) > + (const_int 0)))] > + "" > + "sub.f%?\\tlp_count,lp_count,1" > + [(set_attr "iscompact" "false") > + (set_attr "type" "compare") > + (set_attr "cond" "set_zn") > + (set_attr "length" "4") > + (set_attr "predicable" "yes")]) > + > +(define_insn_and_split "dbnz" > + [(set (pc) > + (if_then_else > + (ne (plus:SI (match_operand:SI 0 "nonimmediate_operand" "+r!l,m") > + (const_int -1)) > + (const_int 0)) > + (label_ref (match_operand 1 "" "")) > + (pc))) > + (set (match_dup 0) > + (plus:SI (match_dup 0) > + (const_int -1))) > + (clobber (match_scratch:SI 2 "=X,r"))] > + "TARGET_V2" > + "@ > + dbnz%#\\t%0,%l1 > + #" > + "TARGET_V2 && reload_completed && memory_operand (operands[0], SImode)" > + [(set (match_dup 2) (match_dup 0)) > + (set (match_dup 2) (plus:SI (match_dup 2) (const_int -1))) > + (set (reg:CC CC_REG) (compare:CC (match_dup 2) (const_int 0))) > + (set (match_dup 0) (match_dup 2)) > + (set (pc) (if_then_else (ge (reg:CC CC_REG) > + (const_int 0)) > + (label_ref (match_dup 1)) > + (pc)))] > + "" > + [(set_attr "iscompact" "false") > + (set_attr "type" "loop_end") > + (set_attr "length" "4,20")]) > > (define_expand "movmemsi" > [(match_operand:BLK 0 "" "") > diff --git a/gcc/config/arc/arc.opt b/gcc/config/arc/arc.opt > index ad2df26..d1ebd44 100644 > --- a/gcc/config/arc/arc.opt > +++ b/gcc/config/arc/arc.opt > @@ -494,3 +494,28 @@ Specifies the registers that the processor saves on an > 
interrupt entry and exit. > mrgf-banked-regs= > Target RejectNegative Joined Var(arc_deferred_options) Defer > Specifies the number of registers replicated in second register bank on > entry to fast interrupt. > + > +mlpc-width= > +Target RejectNegative Joined Enum(arc_lpc) Var(arc_lpcwidth) Init(32) > +Sets LP_COUNT register width. Possible values are 8, 16, 20, 24, 28, and 32. > + > +Enum > +Name(arc_lpc) Type(int) > + > +EnumValue > +Enum(arc_lpc) String(8) Value(8) > + > +EnumValue > +Enum(arc_lpc) String(16) Value(16) > + > +EnumValue > +Enum(arc_lpc) String(20) Value(20) > + > +EnumValue > +Enum(arc_lpc) String(24) Value(24) > + > +EnumValue > +Enum(arc_lpc) String(28) Value(28) > + > +EnumValue > +Enum(arc_lpc) String(32) Value(32) > diff --git a/gcc/config/arc/predicates.md b/gcc/config/arc/predicates.md > index 1f66438..2610f84 100644 > --- a/gcc/config/arc/predicates.md > +++ b/gcc/config/arc/predicates.md > @@ -362,6 +362,8 @@ > else if (TARGET_MUL64_SET > && (REGNO (op) == 57 || REGNO(op) == 58 || REGNO(op) == 59 )) > return 0; > + else if (REGNO (op) == LP_COUNT) > + return 1; > else > return dest_reg_operand (op, mode); > case SUBREG : > diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi > index f480051..3d1f7f3 100644 > --- a/gcc/doc/invoke.texi > +++ b/gcc/doc/invoke.texi > @@ -614,7 +614,7 @@ Objective-C and Objective-C++ Dialects}. > -mcrc -mdsp-packa -mdvbf -mlock -mmac-d16 -mmac-24 -mrtsc -mswape @gol > -mtelephony -mxy -misize -mannotate-align -marclinux -marclinux_prof > @gol > -mlong-calls -mmedium-calls -msdata -mirq-ctrl-saved @gol > --mrgf-banked-regs @gol > +-mrgf-banked-regs -mlpc-width @gol > -mvolatile-cache -mtp-regno=@var{regno} @gol > -malign-call -mauto-modify-reg -mbbit-peephole -mno-brcc @gol > -mcase-vector-pcrel -mcompact-casesi -mno-cond-exec -mearly-cbranchsi @gol > @@ -14678,6 +14678,18 @@ registers to avoid memory transactions during > interrupt entry and exit > sequences. 
Use this option when you are using fast interrupts in an > ARC V2 family processor. Permitted values are 4, 8, 16, and 32. > > +@item -mlpc-width=@var{lpcw} > +@opindex mlpc-width > +Specify the width of the LP_COUNT register. Valid values for > +@var{lpcw} are 8, 16, 20, 24, 28 and 32. The default width is fixed > +to 32. If the width is less than 32, the compiler does not attempt to > +transform loops in your program to use the zero-delay loop mechanism > +unless it is known that the @samp{LP_COUNT} register can hold the > +required loop-counter value. Depending on the size specified, the > +compiler and run-time library might continue to use the loop mechanism > +for various needs. This option defines macro @code{__ARC_LPC_WIDTH__} > +with the value of size. > + > @end table > > The following options are passed through to the assembler, and also > diff --git a/gcc/testsuite/gcc.target/arc/loop-1.c > b/gcc/testsuite/gcc.target/arc/loop-1.c > old mode 100644 > new mode 100755 > index 1afe8eb..773f583 > --- a/gcc/testsuite/gcc.target/arc/loop-1.c > +++ b/gcc/testsuite/gcc.target/arc/loop-1.c > @@ -1,45 +1,12 @@ > /* { dg-do compile } */ > -/* { dg-options "-O2" } */ > +/* { dg-options "-O2 -w" } */ > > -/* This case would fail to make use of the zero-overhead loop > - instruction at one time due to a bug. */ > +/* Check how we handle empty body loops. 
*/ > > -extern char a[]; > - > -struct some_t > -{ > - struct > - { > - int aaa; > - short bbb; > - char ccc; > - char ddd; > - } ppp[8]; > - > - int www[1]; > -}; > - > -int b; > - > -void > -some_function () > -{ > - struct some_t *tmp = (struct some_t *) a; > - > - while ((*tmp).ppp[b].ccc) > - while(0); > - > - for (; b; b++) > - { > - if (tmp->ppp[b].ccc) > - { > - int c = tmp->ppp[b].bbb; > - int d = tmp->ppp[b].aaa; > - int e = d - tmp->www[c]; > - if (e) > - tmp->ppp[b].ddd = 1; > - } > - } > +a; > +fn1() { > + int i; > + for (; i < 8; i++) { > + double A[a]; > + } > } > - > -/* { dg-final { scan-assembler "\[^\n\]+lp \\.L__GCC__" } } */ > -- > 1.9.1 >