* Claudiu Zissulescu <claudiu.zissule...@synopsys.com> [2017-07-24 10:42:58 +0200]:

> From: claziss <claz...@synopsys.com>
> 
> 2017-05-22  Claudiu Zissulescu <claz...@synopsys.com>
> 
>       * config/arc/arc-c.c (__ARC_LPC_WIDTH__): Add builtin define.
>       * config/arc/arc.c (ARC_MAX_LOOP_LENGTH): Define.
>       (arc_conditional_register_usage): Remove ARC600 lp_count
>       exception.
>       (arc_file_start): Emit Tag_ARC_CPU_variation.
>       (arc_can_use_doloop_p): New conditions to use ZOLs.
>       (hwloop_fail): New function.
>       (hwloop_optimize): Likewise.
>       (hwloop_pattern_reg): Likewise.
>       (arc_doloop_hooks): New struct, to be used with reorg_loops.
>       (arc_reorg_loops): New function, calls reorg_loops.
>       (arc_reorg): Call arc_reorg_loops.  Remove old ZOL handling.
>       (arc600_corereg_hazard): Remove ZOL checking, case handled by
>       hwloop_optimize.
>       (arc_loop_hazard): Remove function, functionality moved into
>       hwloop_optimize.
>       (arc_hazard): Remove arc_loop_hazard call.
>       (arc_adjust_insn_length): Remove ZOL handling, functionality moved
>       into hwloop_optimize.
>       (arc_label_align): Remove ZOL handling.
>       * config/arc/arc.h (LOOP_ALIGN): Changed to 0.
>       * config/arc/arc.md (doloop_begin): Remove pattern.
>       (doloop_begin_i): Likewise.
>       (doloop_end_i): Likewise.
>       (doloop_fallback): Likewise.
>       (doloop_fallback_m): Likewise.
>       (doloop_end): Reimplement expand.
>       (arc_lp): New pattern for LP instruction.
>       (loop_end): New pattern.
>       (loop_fail): Likewise.
>       (decrement_and_branch_until_zero): Likewise.
>       * config/arc/arc.opt (mlpc-width): New option.
>       * doc/invoke.texi (mlpc-width): Document option.
> 
> testsuite/
> 2017-05-22  Claudiu Zissulescu <claz...@synopsys.com>
> 
>           * gcc.target/arc/loop-1.c: Update test.

I'm happy with this if the doc is updated in line with Sandra's
suggestions.

Thanks,
Andrew



> ---
>  gcc/config/arc/arc-c.c                |   2 +
>  gcc/config/arc/arc.c                  | 726 ++++++++++++++++++----------------
>  gcc/config/arc/arc.h                  |  10 +-
>  gcc/config/arc/arc.md                 | 419 ++++++--------------
>  gcc/config/arc/arc.opt                |  25 ++
>  gcc/config/arc/predicates.md          |   2 +
>  gcc/doc/invoke.texi                   |  14 +-
>  gcc/testsuite/gcc.target/arc/loop-1.c |  49 +--
>  8 files changed, 561 insertions(+), 686 deletions(-)
>  mode change 100644 => 100755 gcc/testsuite/gcc.target/arc/loop-1.c
> 
> diff --git a/gcc/config/arc/arc-c.c b/gcc/config/arc/arc-c.c
> index de877a1..44ff338 100644
> --- a/gcc/config/arc/arc-c.c
> +++ b/gcc/config/arc/arc-c.c
> @@ -62,6 +62,8 @@ arc_cpu_cpp_builtins (cpp_reader * pfile)
>    builtin_define_with_int_value ("__ARC_TLS_REGNO__",
>                                arc_tp_regno);
>  
> +  builtin_define_with_int_value ("__ARC_LPC_WIDTH__", arc_lpcwidth);
> +
>    builtin_define (TARGET_BIG_ENDIAN
>                 ? "__BIG_ENDIAN__" : "__LITTLE_ENDIAN__");
>    if (TARGET_BIG_ENDIAN)
> diff --git a/gcc/config/arc/arc.c b/gcc/config/arc/arc.c
> index c94b187..0f9b553 100644
> --- a/gcc/config/arc/arc.c
> +++ b/gcc/config/arc/arc.c
> @@ -64,11 +64,15 @@ along with GCC; see the file COPYING3.  If not see
>  #include "rtl-iter.h"
>  #include "alias.h"
>  #include "opts.h"
> +#include "hw-doloop.h"
>  
>  /* Which cpu we're compiling for (ARC600, ARC601, ARC700).  */
>  static char arc_cpu_name[10] = "";
>  static const char *arc_cpu_string = arc_cpu_name;
>  
> +/* Maximum size of a loop.  */
> +#define ARC_MAX_LOOP_LENGTH 4095
> +
>  /* ??? Loads can handle any constant, stores can only handle small ones.  */
>  /* OTOH, LIMMs cost extra, so their usefulness is limited.  */
>  #define RTX_OK_FOR_OFFSET_P(MODE, X) \
> @@ -1708,18 +1712,7 @@ arc_conditional_register_usage (void)
>          i <= ARC_LAST_SIMD_DMA_CONFIG_REG; i++)
>       reg_alloc_order [i] = i;
>      }
> -  /* For ARC600, lp_count may not be read in an instruction
> -     following immediately after another one setting it to a new value.
> -     There was some discussion on how to enforce scheduling constraints for
> -     processors with missing interlocks on the gcc mailing list:
> -     http://gcc.gnu.org/ml/gcc/2008-05/msg00021.html .
> -     However, we can't actually use this approach, because for ARC the
> -     delay slot scheduling pass is active, which runs after
> -     machine_dependent_reorg.  */
> -  if (TARGET_ARC600)
> -    CLEAR_HARD_REG_BIT (reg_class_contents[SIBCALL_REGS], LP_COUNT);
> -  else if (!TARGET_LP_WR_INTERLOCK)
> -    fixed_regs[LP_COUNT] = 1;
> +
>    for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
>      if (!call_used_regs[regno])
>        CLEAR_HARD_REG_BIT (reg_class_contents[SIBCALL_REGS], regno);
> @@ -6998,28 +6991,33 @@ arc_pass_by_reference (cumulative_args_t ca_v ATTRIBUTE_UNUSED,
>  /* Implement TARGET_CAN_USE_DOLOOP_P.  */
>  
>  static bool
> -arc_can_use_doloop_p (const widest_int &iterations, const widest_int &,
> +arc_can_use_doloop_p (const widest_int &,
> +                   const widest_int &iterations_max,
>                     unsigned int loop_depth, bool entered_at_top)
>  {
> -  if (loop_depth > 1)
> +  /* Considering limitations in the hardware, only use doloop
> +     for innermost loops which must be entered from the top.  */
> +  if (loop_depth > 1 || !entered_at_top)
>      return false;
> -  /* Setting up the loop with two sr instructions costs 6 cycles.  */
> -  if (TARGET_ARC700
> -      && !entered_at_top
> -      && wi::gtu_p (iterations, 0)
> -      && wi::leu_p (iterations, flag_pic ? 6 : 3))
> +
> +  /* Check for lp_count width boundary.  */
> +  if (arc_lpcwidth != 32
> +      && (wi::gtu_p (iterations_max, ((1 << arc_lpcwidth) - 1))
> +       || wi::eq_p (iterations_max, 0)))
>      return false;
>    return true;
>  }
>  
> -/* NULL if INSN insn is valid within a low-overhead loop.
> -   Otherwise return why doloop cannot be applied.  */
> +/* NULL if INSN insn is valid within a low-overhead loop.  Otherwise
> +   return why doloop cannot be applied.  */
>  
>  static const char *
>  arc_invalid_within_doloop (const rtx_insn *insn)
>  {
>    if (CALL_P (insn))
>      return "Function call in the loop.";
> +
> +  /* FIXME! add here all the ZOL exceptions.  */
>    return NULL;
>  }
>  
> @@ -7118,6 +7116,368 @@ workaround_arc_anomaly (void)
>      }
>  }
>  
> +/* A callback for the hw-doloop pass.  Called when a loop we have discovered
> +   turns out not to be optimizable; we have to split the loop_end pattern into
> +   a subtract and a test.  */
> +
> +static void
> +hwloop_fail (hwloop_info loop)
> +{
> +  rtx test;
> +  rtx insn = loop->loop_end;
> +
> +  if (TARGET_V2
> +      && (loop->length && (loop->length <= ARC_MAX_LOOP_LENGTH))
> +      && REG_P (loop->iter_reg))
> +    {
> +      /* TARGET_V2 has dbnz instructions.  */
> +      test = gen_dbnz (loop->iter_reg, loop->start_label);
> +      insn = emit_jump_insn_before (test, loop->loop_end);
> +    }
> +  else if (REG_P (loop->iter_reg) && (REGNO (loop->iter_reg) == LP_COUNT))
> +    {
> +      /* We have the lp_count as loop iterator, try to use it.  */
> +      emit_insn_before (gen_loop_fail (), loop->loop_end);
> +      test = gen_rtx_NE (VOIDmode, gen_rtx_REG (CC_ZNmode, CC_REG),
> +                      const0_rtx);
> +      test = gen_rtx_IF_THEN_ELSE (VOIDmode, test,
> +                                gen_rtx_LABEL_REF (Pmode, loop->start_label),
> +                                pc_rtx);
> +      insn = emit_jump_insn_before (gen_rtx_SET (pc_rtx, test),
> +                                  loop->loop_end);
> +    }
> +  else
> +    {
> +      emit_insn_before (gen_addsi3 (loop->iter_reg,
> +                                 loop->iter_reg,
> +                                 constm1_rtx),
> +                     loop->loop_end);
> +      test = gen_rtx_NE (VOIDmode, loop->iter_reg, const0_rtx);
> +      insn = emit_jump_insn_before (gen_cbranchsi4 (test,
> +                                                 loop->iter_reg,
> +                                                 const0_rtx,
> +                                                 loop->start_label),
> +                                 loop->loop_end);
> +    }
> +  JUMP_LABEL (insn) = loop->start_label;
> +  LABEL_NUSES (loop->start_label)++;
> +  delete_insn (loop->loop_end);
> +}
> +
> +/* Optimize LOOP.  */
> +
> +static bool
> +hwloop_optimize (hwloop_info loop)
> +{
> +  int i;
> +  edge entry_edge;
> +  basic_block entry_bb, bb;
> +  rtx iter_reg, end_label;
> +  rtx_insn *insn, *seq, *entry_after, *last_insn;
> +  unsigned int length;
> +  bool need_fix = false;
> +  rtx lp_reg = gen_rtx_REG (SImode, LP_COUNT);
> +
> +  if (loop->depth > 1)
> +    {
> +      if (dump_file)
> +        fprintf (dump_file, ";; loop %d is not innermost\n",
> +                 loop->loop_no);
> +      return false;
> +    }
> +
> +  if (!loop->incoming_dest)
> +    {
> +      if (dump_file)
> +        fprintf (dump_file, ";; loop %d has more than one entry\n",
> +                 loop->loop_no);
> +      return false;
> +    }
> +
> +  if (loop->incoming_dest != loop->head)
> +    {
> +      if (dump_file)
> +        fprintf (dump_file, ";; loop %d is not entered from head\n",
> +                 loop->loop_no);
> +      return false;
> +    }
> +
> +  if (loop->has_call || loop->has_asm)
> +    {
> +      if (dump_file)
> +        fprintf (dump_file, ";; loop %d has invalid insn\n",
> +                 loop->loop_no);
> +      return false;
> +    }
> +
> +  /* Scan all the blocks to make sure they don't use iter_reg.  */
> +  if (loop->iter_reg_used || loop->iter_reg_used_outside)
> +    {
> +      if (dump_file)
> +        fprintf (dump_file, ";; loop %d uses iterator\n",
> +                 loop->loop_no);
> +      return false;
> +    }
> +
> +  /* Check if start_label appears before doloop_end.  */
> +  length = 0;
> +  for (insn = loop->start_label;
> +       insn && insn != loop->loop_end;
> +       insn = NEXT_INSN (insn))
> +    length += NONDEBUG_INSN_P (insn) ? get_attr_length (insn) : 0;
> +
> +  if (!insn)
> +    {
> +      if (dump_file)
> +        fprintf (dump_file, ";; loop %d start_label not before loop_end\n",
> +                 loop->loop_no);
> +      return false;
> +    }
> +
> +  loop->length = length;
> +  if (loop->length > ARC_MAX_LOOP_LENGTH)
> +    {
> +      if (dump_file)
> +     fprintf (dump_file, ";; loop %d too long\n", loop->loop_no);
> +      return false;
> +    }
> +  else if (!loop->length)
> +    {
> +      if (dump_file)
> +     fprintf (dump_file, ";; loop %d is empty\n", loop->loop_no);
> +      return false;
> +    }
> +
> +  /* Check if we use a register or not.  */
> +  if (!REG_P (loop->iter_reg))
> +    {
> +      if (dump_file)
> +        fprintf (dump_file, ";; loop %d iterator is MEM\n",
> +                 loop->loop_no);
> +      return false;
> +    }
> +
> +  /* Check if loop register is lpcount.  */
> +  if (REG_P (loop->iter_reg) && (REGNO (loop->iter_reg)) != LP_COUNT)
> +    {
> +      if (dump_file)
> +        fprintf (dump_file, ";; loop %d doesn't use lp_count as loop"
> +              " iterator\n",
> +                 loop->loop_no);
> +      /* This loop doesn't use the lp_count, check though if we can
> +      fix it.  */
> +      if (TEST_HARD_REG_BIT (loop->regs_set_in_loop, LP_COUNT)
> +       /* In very unique cases we may have LP_COUNT alive.  */
> +       || (loop->incoming_src
> +           && REGNO_REG_SET_P (df_get_live_out (loop->incoming_src),
> +                               LP_COUNT)))
> +     return false;
> +      else
> +     need_fix = true;
> +    }
> +
> +  /* Check for control like instruction as the last instruction of a
> +     ZOL.  */
> +  bb = loop->tail;
> +  last_insn = PREV_INSN (loop->loop_end);
> +
> +  while (1)
> +    {
> +      for (; last_insn != BB_HEAD (bb);
> +        last_insn = PREV_INSN (last_insn))
> +     if (NONDEBUG_INSN_P (last_insn))
> +       break;
> +
> +      if (last_insn != BB_HEAD (bb))
> +     break;
> +
> +      if (single_pred_p (bb)
> +       && single_pred_edge (bb)->flags & EDGE_FALLTHRU
> +       && single_pred (bb) != ENTRY_BLOCK_PTR_FOR_FN (cfun))
> +     {
> +       bb = single_pred (bb);
> +       last_insn = BB_END (bb);
> +       continue;
> +     }
> +      else
> +     {
> +       last_insn = NULL;
> +       break;
> +     }
> +    }
> +
> +  if (!last_insn)
> +    {
> +      if (dump_file)
> +     fprintf (dump_file, ";; loop %d has no last instruction\n",
> +              loop->loop_no);
> +      return false;
> +    }
> +
> +  if ((TARGET_ARC600_FAMILY || TARGET_HS)
> +      && INSN_P (last_insn)
> +      && (JUMP_P (last_insn) || CALL_P (last_insn)
> +       || GET_CODE (PATTERN (last_insn)) == SEQUENCE
> +       /* At this stage we can have (insn (clobber (mem:BLK
> +          (reg)))) instructions, ignore them.  */
> +       || (GET_CODE (PATTERN (last_insn)) != CLOBBER
> +           && (get_attr_type (last_insn) == TYPE_BRCC
> +               || get_attr_type (last_insn) == TYPE_BRCC_NO_DELAY_SLOT))))
> +    {
> +      if (loop->length + 2 > ARC_MAX_LOOP_LENGTH)
> +     {
> +       if (dump_file)
> +         fprintf (dump_file, ";; loop %d too long\n", loop->loop_no);
> +       return false;
> +     }
> +      if (dump_file)
> +     fprintf (dump_file, ";; loop %d has a control like last insn;"
> +              "add a nop\n",
> +              loop->loop_no);
> +
> +      last_insn = emit_insn_after (gen_nopv (), last_insn);
> +    }
> +
> +  if (LABEL_P (last_insn))
> +    {
> +      if (dump_file)
> +     fprintf (dump_file, ";; loop %d has a label as last insn;"
> +              "add a nop\n",
> +              loop->loop_no);
> +      last_insn = emit_insn_after (gen_nopv (), last_insn);
> +    }
> +  loop->last_insn = last_insn;
> +
> +  /* Get the loop iteration register.  */
> +  iter_reg = loop->iter_reg;
> +
> +  gcc_assert (REG_P (iter_reg));
> +
> +  entry_edge = NULL;
> +
> +  FOR_EACH_VEC_SAFE_ELT (loop->incoming, i, entry_edge)
> +    if (entry_edge->flags & EDGE_FALLTHRU)
> +      break;
> +
> +  if (entry_edge == NULL)
> +    {
> +      if (dump_file)
> +     fprintf (dump_file, ";; loop %d has no fallthru edge jumping"
> +              "into the loop\n",
> +              loop->loop_no);
> +      return false;
> +    }
> +  /* The loop is good.  */
> +  end_label = gen_label_rtx ();
> +  loop->end_label = end_label;
> +
> +  /* Place the zero_cost_loop_start instruction before the loop.  */
> +  entry_bb = entry_edge->src;
> +
> +  start_sequence ();
> +
> +  if (need_fix)
> +    {
> +      /* The loop uses a R-register, but the lp_count is free, thus
> +      use lp_count.  */
> +      emit_insn (gen_movsi (lp_reg, iter_reg));
> +      SET_HARD_REG_BIT (loop->regs_set_in_loop, LP_COUNT);
> +      iter_reg = lp_reg;
> +      if (dump_file)
> +     {
> +       fprintf (dump_file, ";; fix loop %d to use lp_count\n",
> +                loop->loop_no);
> +     }
> +    }
> +
> +  insn = emit_insn (gen_arc_lp (iter_reg,
> +                             loop->start_label,
> +                             loop->end_label));
> +
> +  seq = get_insns ();
> +  end_sequence ();
> +
> +  entry_after = BB_END (entry_bb);
> +  if (!single_succ_p (entry_bb) || vec_safe_length (loop->incoming) > 1
> +      || !entry_after)
> +    {
> +      basic_block new_bb;
> +      edge e;
> +      edge_iterator ei;
> +
> +      emit_insn_before (seq, BB_HEAD (loop->head));
> +      seq = emit_label_before (gen_label_rtx (), seq);
> +      new_bb = create_basic_block (seq, insn, entry_bb);
> +      FOR_EACH_EDGE (e, ei, loop->incoming)
> +        {
> +          if (!(e->flags & EDGE_FALLTHRU))
> +            redirect_edge_and_branch_force (e, new_bb);
> +          else
> +            redirect_edge_succ (e, new_bb);
> +        }
> +
> +      make_edge (new_bb, loop->head, 0);
> +    }
> +  else
> +    {
> +#if 0
> +      while (DEBUG_INSN_P (entry_after)
> +             || (NOTE_P (entry_after)
> +                 && NOTE_KIND (entry_after) != NOTE_INSN_BASIC_BLOCK
> +              /* Make sure we don't split a call and its corresponding
> +                 CALL_ARG_LOCATION note.  */
> +                 && NOTE_KIND (entry_after) != NOTE_INSN_CALL_ARG_LOCATION))
> +        entry_after = NEXT_INSN (entry_after);
> +#endif
> +      entry_after = next_nonnote_insn_bb (entry_after);
> +
> +      gcc_assert (entry_after);
> +      emit_insn_before (seq, entry_after);
> +    }
> +
> +  delete_insn (loop->loop_end);
> +  /* Insert the loop end label before the last instruction of the
> +     loop.  */
> +  emit_label_after (end_label, loop->last_insn);
> +
> +  return true;
> +}
> +
> +/* A callback for the hw-doloop pass.  This function examines INSN; if
> +   it is a loop_end pattern we recognize, return the reg rtx for the
> +   loop counter.  Otherwise, return NULL_RTX.  */
> +
> +static rtx
> +hwloop_pattern_reg (rtx_insn *insn)
> +{
> +  rtx reg;
> +
> +  if (!JUMP_P (insn) || recog_memoized (insn) != CODE_FOR_loop_end)
> +    return NULL_RTX;
> +
> +  reg = SET_DEST (XVECEXP (PATTERN (insn), 0, 1));
> +  if (!REG_P (reg))
> +    return NULL_RTX;
> +  return reg;
> +}
> +
> +static struct hw_doloop_hooks arc_doloop_hooks =
> +{
> +  hwloop_pattern_reg,
> +  hwloop_optimize,
> +  hwloop_fail
> +};
> +
> +/* Run from machine_dependent_reorg, this pass looks for doloop_end insns
> +   and tries to rewrite the RTL of these loops so that proper Blackfin
> +   hardware loops are generated.  */
> +
> +static void
> +arc_reorg_loops (void)
> +{
> +  reorg_loops (true, &arc_doloop_hooks);
> +}
> +
>  static int arc_reorg_in_progress = 0;
>  
>  /* ARC's machince specific reorg function.  */
> @@ -7131,204 +7491,17 @@ arc_reorg (void)
>    long offset;
>    int changed;
>  
> -  workaround_arc_anomaly ();
> -
>    cfun->machine->arc_reorg_started = 1;
>    arc_reorg_in_progress = 1;
>  
> -  /* Link up loop ends with their loop start.  */
> -  {
> -    for (insn = get_insns (); insn; insn = NEXT_INSN (insn))
> -      if (GET_CODE (insn) == JUMP_INSN
> -       && recog_memoized (insn) == CODE_FOR_doloop_end_i)
> -     {
> -       rtx_insn *top_label
> -         = as_a <rtx_insn *> (XEXP (XEXP (SET_SRC (XVECEXP (PATTERN (insn), 0, 0)), 1), 0));
> -       rtx num = GEN_INT (CODE_LABEL_NUMBER (top_label));
> -       rtx_insn *lp, *prev = prev_nonnote_insn (top_label);
> -       rtx_insn *lp_simple = NULL;
> -       rtx_insn *next = NULL;
> -       rtx op0 = XEXP (XVECEXP (PATTERN (insn), 0, 1), 0);
> -       int seen_label = 0;
> -
> -       for (lp = prev;
> -            (lp && NONJUMP_INSN_P (lp)
> -             && recog_memoized (lp) != CODE_FOR_doloop_begin_i);
> -            lp = prev_nonnote_insn (lp))
> -         ;
> -       if (!lp || !NONJUMP_INSN_P (lp)
> -           || dead_or_set_regno_p (lp, LP_COUNT))
> -         {
> -           HOST_WIDE_INT loop_end_id
> -             = INTVAL (XEXP (XVECEXP (PATTERN (insn), 0, 4), 0));
> +  compute_bb_for_insn ();
>  
> -           for (prev = next = insn, lp = NULL ; prev || next;)
> -             {
> -               if (prev)
> -                 {
> -                   if (NONJUMP_INSN_P (prev)
> -                       && recog_memoized (prev) == CODE_FOR_doloop_begin_i
> -                       && (INTVAL (XEXP (XVECEXP (PATTERN (prev), 0, 5), 0))
> -                           == loop_end_id))
> -                     {
> -                       lp = prev;
> -                       break;
> -                     }
> -                   else if (LABEL_P (prev))
> -                     seen_label = 1;
> -                   prev = prev_nonnote_insn (prev);
> -                 }
> -               if (next)
> -                 {
> -                   if (NONJUMP_INSN_P (next)
> -                       && recog_memoized (next) == CODE_FOR_doloop_begin_i
> -                       && (INTVAL (XEXP (XVECEXP (PATTERN (next), 0, 5), 0))
> -                           == loop_end_id))
> -                     {
> -                       lp = next;
> -                       break;
> -                     }
> -                   next = next_nonnote_insn (next);
> -                 }
> -             }
> -           prev = NULL;
> -         }
> -       else
> -         lp_simple = lp;
> -       if (lp && !dead_or_set_regno_p (lp, LP_COUNT))
> -         {
> -           rtx begin_cnt = XEXP (XVECEXP (PATTERN (lp), 0 ,3), 0);
> -           if (INTVAL (XEXP (XVECEXP (PATTERN (lp), 0, 4), 0)))
> -             /* The loop end insn has been duplicated.  That can happen
> -                when there is a conditional block at the very end of
> -                the loop.  */
> -             goto failure;
> -           /* If Register allocation failed to allocate to the right
> -              register, There is no point into teaching reload to
> -              fix this up with reloads, as that would cost more
> -              than using an ordinary core register with the
> -              doloop_fallback pattern.  */
> -           if ((true_regnum (op0) != LP_COUNT || !REG_P (begin_cnt))
> -           /* Likewise, if the loop setup is evidently inside the loop,
> -              we loose.  */
> -               || (!lp_simple && lp != next && !seen_label))
> -             {
> -               remove_insn (lp);
> -               goto failure;
> -             }
> -           /* It is common that the optimizers copy the loop count from
> -              another register, and doloop_begin_i is stuck with the
> -              source of the move.  Making doloop_begin_i only accept "l"
> -              is nonsentical, as this then makes reload evict the pseudo
> -              used for the loop end.  The underlying cause is that the
> -              optimizers don't understand that the register allocation for
> -              doloop_begin_i should be treated as part of the loop.
> -              Try to work around this problem by verifying the previous
> -              move exists.  */
> -           if (true_regnum (begin_cnt) != LP_COUNT)
> -             {
> -               rtx_insn *mov;
> -               rtx set, note;
> +  df_analyze ();
>  
> -               for (mov = prev_nonnote_insn (lp); mov;
> -                    mov = prev_nonnote_insn (mov))
> -                 {
> -                   if (!NONJUMP_INSN_P (mov))
> -                     mov = 0;
> -                   else if ((set = single_set (mov))
> -                       && rtx_equal_p (SET_SRC (set), begin_cnt)
> -                       && rtx_equal_p (SET_DEST (set), op0))
> -                     break;
> -                 }
> -               if (mov)
> -                 {
> -                   XEXP (XVECEXP (PATTERN (lp), 0 ,3), 0) = op0;
> -                   note = find_regno_note (lp, REG_DEAD, REGNO (begin_cnt));
> -                   if (note)
> -                     remove_note (lp, note);
> -                 }
> -               else
> -                 {
> -                   remove_insn (lp);
> -                   goto failure;
> -                 }
> -             }
> -           XEXP (XVECEXP (PATTERN (insn), 0, 4), 0) = num;
> -           XEXP (XVECEXP (PATTERN (lp), 0, 4), 0) = num;
> -           if (next == lp)
> -             XEXP (XVECEXP (PATTERN (lp), 0, 6), 0) = const2_rtx;
> -           else if (!lp_simple)
> -             XEXP (XVECEXP (PATTERN (lp), 0, 6), 0) = const1_rtx;
> -           else if (prev != lp)
> -             {
> -               remove_insn (lp);
> -               add_insn_after (lp, prev, NULL);
> -             }
> -           if (!lp_simple)
> -             {
> -               XEXP (XVECEXP (PATTERN (lp), 0, 7), 0)
> -                 = gen_rtx_LABEL_REF (Pmode, top_label);
> -               add_reg_note (lp, REG_LABEL_OPERAND, top_label);
> -               LABEL_NUSES (top_label)++;
> -             }
> -           /* We can avoid tedious loop start / end setting for empty loops
> -              be merely setting the loop count to its final value.  */
> -           if (next_active_insn (top_label) == insn)
> -             {
> -               rtx lc_set
> -                 = gen_rtx_SET (XEXP (XVECEXP (PATTERN (lp), 0, 3), 0),
> -                                const0_rtx);
> -
> -               rtx_insn *lc_set_insn = emit_insn_before (lc_set, insn);
> -               delete_insn (lp);
> -               delete_insn (insn);
> -               insn = lc_set_insn;
> -             }
> -           /* If the loop is non-empty with zero length, we can't make it
> -              a zero-overhead loop.  That can happen for empty asms.  */
> -           else
> -             {
> -               rtx_insn *scan;
> +  /* Doloop optimization.  */
> +  arc_reorg_loops ();
>  
> -               for (scan = top_label;
> -                    (scan && scan != insn
> -                     && (!NONJUMP_INSN_P (scan) || !get_attr_length (scan)));
> -                    scan = NEXT_INSN (scan));
> -               if (scan == insn)
> -                 {
> -                   remove_insn (lp);
> -                   goto failure;
> -                 }
> -             }
> -         }
> -       else
> -         {
> -           /* Sometimes the loop optimizer makes a complete hash of the
> -              loop.  If it were only that the loop is not entered at the
> -              top, we could fix this up by setting LP_START with SR .
> -              However, if we can't find the loop begin were it should be,
> -              chances are that it does not even dominate the loop, but is
> -              inside the loop instead.  Using SR there would kill
> -              performance.
> -              We use the doloop_fallback pattern here, which executes
> -              in two cycles on the ARC700 when predicted correctly.  */
> -         failure:
> -           if (!REG_P (op0))
> -             {
> -               rtx op3 = XEXP (XVECEXP (PATTERN (insn), 0, 5), 0);
> -
> -               emit_insn_before (gen_move_insn (op3, op0), insn);
> -               PATTERN (insn)
> -                 = gen_doloop_fallback_m (op3, JUMP_LABEL (insn), op0);
> -             }
> -           else
> -             XVEC (PATTERN (insn), 0)
> -               = gen_rtvec (2, XVECEXP (PATTERN (insn), 0, 0),
> -                            XVECEXP (PATTERN (insn), 0, 1));
> -           INSN_CODE (insn) = -1;
> -         }
> -     }
> -    }
> +  workaround_arc_anomaly ();
>  
>  /* FIXME: should anticipate ccfsm action, generate special patterns for
>     to-be-deleted branches that have no delay slot and have at least the
> @@ -7866,11 +8039,11 @@ arc_register_move_cost (machine_mode,
>       return 6;
>      }
>  
> -  /* The ARC700 stalls for 3 cycles when *reading* from lp_count.  */
> -  if (TARGET_ARC700
> -      && (from_class == LPCOUNT_REG || from_class == ALL_CORE_REGS
> -       || from_class == WRITABLE_CORE_REGS))
> -    return 8;
> +  /* Using lp_count as scratch reg is a VERY bad idea.  */
> +  if (from_class == LPCOUNT_REG)
> +    return 1000;
> +  if (to_class == LPCOUNT_REG)
> +    return 6;
>  
>    /* Force an attempt to 'mov Dy,Dx' to spill.  */
>    if ((TARGET_ARC700 || TARGET_EM) && TARGET_DPFP
> @@ -8312,14 +8485,6 @@ arc600_corereg_hazard (rtx_insn *pred, rtx_insn *succ)
>  {
>    if (!TARGET_ARC600)
>      return 0;
> -  /* If SUCC is a doloop_end_i with a preceding label, we must output a nop
> -     in front of SUCC anyway, so there will be separation between PRED and
> -     SUCC.  */
> -  if (recog_memoized (succ) == CODE_FOR_doloop_end_i
> -      && LABEL_P (prev_nonnote_insn (succ)))
> -    return 0;
> -  if (recog_memoized (succ) == CODE_FOR_doloop_begin_i)
> -    return 0;
>    if (GET_CODE (PATTERN (pred)) == SEQUENCE)
>      pred = as_a <rtx_sequence *> (PATTERN (pred))->insn (1);
>    if (GET_CODE (PATTERN (succ)) == SEQUENCE)
> @@ -8393,76 +8558,6 @@ arc_asm_insn_p (rtx x)
>    return 0;
>  }
>  
> -/* We might have a CALL to a non-returning function before a loop end.
> -   ??? Although the manual says that's OK (the target is outside the
> -   loop, and the loop counter unused there), the assembler barfs on
> -   this for ARC600, so we must insert a nop before such a call too.
> -   For ARC700, and ARCv2 is not allowed to have the last ZOL
> -   instruction a jump to a location where lp_count is modified.  */
> -
> -static bool
> -arc_loop_hazard (rtx_insn *pred, rtx_insn *succ)
> -{
> -  rtx_insn *jump  = NULL;
> -  rtx label_rtx = NULL_RTX;
> -  rtx_insn *label = NULL;
> -  basic_block succ_bb;
> -
> -  if (recog_memoized (succ) != CODE_FOR_doloop_end_i)
> -    return false;
> -
> -  /* Phase 1: ARC600 and ARCv2HS doesn't allow any control instruction
> -     (i.e., jump/call) as the last instruction of a ZOL.  */
> -  if (TARGET_ARC600 || TARGET_HS)
> -    if (JUMP_P (pred) || CALL_P (pred)
> -     || arc_asm_insn_p (PATTERN (pred))
> -     || GET_CODE (PATTERN (pred)) == SEQUENCE)
> -      return true;
> -
> -  /* Phase 2: Any architecture, it is not allowed to have the last ZOL
> -     instruction a jump to a location where lp_count is modified.  */
> -
> -  /* Phase 2a: Dig for the jump instruction.  */
> -  if (JUMP_P (pred))
> -    jump = pred;
> -  else if (GET_CODE (PATTERN (pred)) == SEQUENCE
> -        && JUMP_P (XVECEXP (PATTERN (pred), 0, 0)))
> -    jump = as_a <rtx_insn *> (XVECEXP (PATTERN (pred), 0, 0));
> -  else
> -    return false;
> -
> -  /* Phase 2b: Make sure is not a millicode jump.  */
> -  if ((GET_CODE (PATTERN (jump)) == PARALLEL)
> -      && (XVECEXP (PATTERN (jump), 0, 0) == ret_rtx))
> -    return false;
> -
> -  label_rtx = JUMP_LABEL (jump);
> -  if (!label_rtx)
> -    return false;
> -
> -  /* Phase 2c: Make sure is not a return.  */
> -  if (ANY_RETURN_P (label_rtx))
> -    return false;
> -
> -  /* Pahse 2d: Go to the target of the jump and check for aliveness of
> -     LP_COUNT register.  */
> -  label = safe_as_a <rtx_insn *> (label_rtx);
> -  succ_bb = BLOCK_FOR_INSN (label);
> -  if (!succ_bb)
> -    {
> -      gcc_assert (NEXT_INSN (label));
> -      if (NOTE_INSN_BASIC_BLOCK_P (NEXT_INSN (label)))
> -     succ_bb = NOTE_BASIC_BLOCK (NEXT_INSN (label));
> -      else
> -     succ_bb = BLOCK_FOR_INSN (NEXT_INSN (label));
> -    }
> -
> -  if (succ_bb && REGNO_REG_SET_P (df_get_live_out (succ_bb), LP_COUNT))
> -    return true;
> -
> -  return false;
> -}
> -
>  /* For ARC600:
>     A write to a core reg greater or equal to 32 must not be immediately
>     followed by a use.  Anticipate the length requirement to insert a nop
> @@ -8474,9 +8569,6 @@ arc_hazard (rtx_insn *pred, rtx_insn *succ)
>    if (!pred || !INSN_P (pred) || !succ || !INSN_P (succ))
>      return 0;
>  
> -  if (arc_loop_hazard (pred, succ))
> -    return 4;
> -
>    if (TARGET_ARC600)
>      return arc600_corereg_hazard (pred, succ);
>  
> @@ -8494,24 +8586,6 @@ arc_adjust_insn_length (rtx_insn *insn, int len, bool)
>    if (GET_CODE (PATTERN (insn)) == SEQUENCE)
>      return len;
>  
> -  /* It is impossible to jump to the very end of a Zero-Overhead Loop, as
> -     the ZOL mechanism only triggers when advancing to the end address,
> -     so if there's a label at the end of a ZOL, we need to insert a nop.
> -     The ARC600 ZOL also has extra restrictions on jumps at the end of a
> -     loop.  */
> -  if (recog_memoized (insn) == CODE_FOR_doloop_end_i)
> -    {
> -      rtx_insn *prev = prev_nonnote_insn (insn);
> -
> -      return ((LABEL_P (prev)
> -            || (TARGET_ARC600
> -                && (JUMP_P (prev)
> -                    || CALL_P (prev) /* Could be a noreturn call.  */
> -                    || (NONJUMP_INSN_P (prev)
> -                        && GET_CODE (PATTERN (prev)) == SEQUENCE))))
> -           ? len + 4 : len);
> -    }
> -
>    /* Check for return with but one preceding insn since function
>       start / call.  */
>    if (TARGET_PAD_RETURN
> @@ -9871,27 +9945,9 @@ arc_scheduling_not_expected (void)
>    return cfun->machine->arc_reorg_started;
>  }
>  
> -/* Oddly enough, sometimes we get a zero overhead loop that branch
> -   shortening doesn't think is a loop - observed with compile/pr24883.c
> -   -O3 -fomit-frame-pointer -funroll-loops.  Make sure to include the
> -   alignment visible for branch shortening  (we actually align the loop
> -   insn before it, but that is equivalent since the loop insn is 4 byte
> -   long.)  */
> -
>  int
>  arc_label_align (rtx_insn *label)
>  {
> -  int loop_align = LOOP_ALIGN (LABEL);
> -
> -  if (loop_align > align_labels_log)
> -    {
> -      rtx_insn *prev = prev_nonnote_insn (label);
> -
> -      if (prev && NONJUMP_INSN_P (prev)
> -       && GET_CODE (PATTERN (prev)) == PARALLEL
> -       && recog_memoized (prev) == CODE_FOR_doloop_begin_i)
> -     return loop_align;
> -    }
>    /* Code has a minimum p2 alignment of 1, which we must restore after an
>       ADDR_DIFF_VEC.  */
>    if (align_labels_log < 1)
> diff --git a/gcc/config/arc/arc.h b/gcc/config/arc/arc.h
> index d4e97cd..4c54b7e 100644
> --- a/gcc/config/arc/arc.h
> +++ b/gcc/config/arc/arc.h
> @@ -581,15 +581,15 @@ enum reg_class
>   {0x0000f00f, 0x00000000, 0x00000000, 0x00000000, 0x00000000},           /* 'q', r0-r3, r12-r15 */          \
>   {0x1000f00f, 0x00000000, 0x00000000, 0x00000000, 0x00000000},           /* 'e', r0-r3, r12-r15, sp */      \
>   {0x1c001fff, 0x00000000, 0x00000000, 0x00000000, 0x00000000},    /* "Rsc", r0-r12 */ \
> -  {0x9fffffff, 0xc0000000, 0x00000000, 0x00000000, 0x00000000},      /* 'r', r0-r28, blink, ap and pcl */    \
> +  {0x9fffffff, 0x80000000, 0x00000000, 0x00000000, 0x00000000},      /* 'r', r0-r28, blink, ap and pcl */    \
>   {0xffffffff, 0x00000000, 0x00000000, 0x00000000, 0x00000000},      /* 'W',  r0-r31 */ \
>   /* Include ap / pcl in WRITABLE_CORE_REGS for sake of symmetry.  As these \
>      registers are fixed, it does not affect the literal meaning of the \
>      constraints, but it makes it a superset of GENERAL_REGS, thus \
>      enabling some operations that would otherwise not be possible.  */ \
> -  {0xffffffff, 0xd0000000, 0x00000000, 0x00000000, 0x00000000},      /* 'w', r0-r31, r60 */ \
> -  {0xffffffff, 0xdfffffff, 0x00000000, 0x00000000, 0x00000000},      /* 'c', r0-r60, ap, pcl */ \
> -  {0xffffffff, 0xdfffffff, 0x00000000, 0x00000000, 0x00000000},      /* 'Rac', r0-r60, ap, pcl */ \
> +  {0xffffffff, 0x00000000, 0x00000000, 0x00000000, 0x00000000},      /* 'w', r0-r31, r60 */ \
> +  {0xffffffff, 0x9fffffff, 0x00000000, 0x00000000, 0x00000000},      /* 'c', r0-r60, ap, pcl */ \
> +  {0xffffffff, 0x9fffffff, 0x00000000, 0x00000000, 0x00000000},      /* 'Rac', r0-r60, ap, pcl */ \
>   {0x0000000f, 0x00000000, 0x00000000, 0x00000000, 0x00000000},      /* 'Rcd', r0-r3 */ \
>   {0x00000003, 0x00000000, 0x00000000, 0x00000000, 0x00000000},      /* 'Rsd', r0-r1 */ \
>   {0x9fffffff, 0x00000000, 0x00000000, 0x00000000, 0x00000000},      /* 'h',  r0-28, r30 */ \
> @@ -1351,7 +1351,7 @@ do {                                                    \
>     of a loop.  */
>  /* On the ARC, align loops to 4 byte boundaries unless doing all-out size
>     optimization.  */
> -#define LOOP_ALIGN JUMP_ALIGN
> +#define LOOP_ALIGN(X) 0
>  
>  #define LABEL_ALIGN(LABEL) (arc_label_align (LABEL))
>  
> diff --git a/gcc/config/arc/arc.md b/gcc/config/arc/arc.md
> index 04a1447..27afe40 100644
> --- a/gcc/config/arc/arc.md
> +++ b/gcc/config/arc/arc.md
> @@ -554,6 +554,11 @@
>     (eq_attr "annul_ret_delay_insn" "yes")
>     (eq_attr "cond_ret_delay_insn" "yes")])
>  
> +(define_delay (eq_attr "type" "loop_end")
> +  [(eq_attr "in_delay_slot" "true")
> +   (eq_attr "in_delay_slot" "true")
> +   (nil)])
> +
>  ;; For ARC600, unexposing the delay sloy incurs a penalty also in the
>  ;; non-taken case, so the only meaningful way to have an annull-true
>  ;; filled delay slot is to conditionalize the delay slot insn.
> @@ -618,8 +623,8 @@
>  ; The iscompact attribute allows the epilogue expander to know for which
>  ; insns it should lengthen the return insn.
>  (define_insn "*movqi_insn"
> -  [(set (match_operand:QI 0 "move_dest_operand" "=Rcq,Rcq#q,    w,Rcq#q,   h, w,w,???w,h, w,Rcq,  S,!*x,  r,r, Ucm,m,???m,  m,Usc")
> -	(match_operand:QI 1 "move_src_operand"  "  cL,   cP,Rcq#q,    P,hCm1,cL,I,?Rac,i,?i,  T,Rcq,Usd,Ucm,m,?Rac,c,?Rac,Cm3,i"))]
> +  [(set (match_operand:QI 0 "move_dest_operand" "=Rcq,Rcq#q,    w,Rcq#q,   h,w*l,w*l,???w,h,w*l,Rcq,  S,!*x,  r,r, Ucm,m,???m,  m,Usc")
> +	(match_operand:QI 1 "move_src_operand"  "  cL,   cP,Rcq#q,    P,hCm1, cL,  I,?Rac,i, ?i,  T,Rcq,Usd,Ucm,m,?Rac,c,?Rac,Cm3,i"))]
>    "register_operand (operands[0], QImode)
>     || register_operand (operands[1], QImode)"
>    "@
> @@ -655,8 +660,8 @@
>    "if (prepare_move_operands (operands, HImode)) DONE;")
>  
>  (define_insn "*movhi_insn"
> -  [(set (match_operand:HI 0 "move_dest_operand" "=Rcq,Rcq#q,    w,Rcq#q,   h, w,w,???w,Rcq#q,h, w,Rcq,  S,  r,r, Ucm,m,???m,  m,VUsc")
> -	(match_operand:HI 1 "move_src_operand" "   cL,   cP,Rcq#q,    P,hCm1,cL,I,?Rac,    i,i,?i,  T,Rcq,Ucm,m,?Rac,c,?Rac,Cm3,i"))]
> +  [(set (match_operand:HI 0 "move_dest_operand" "=Rcq,Rcq#q,    w,Rcq#q,   h,w*l,w*l,???w,Rcq#q,h,w*l,Rcq,  S,  r,r, Ucm,m,???m,  m,VUsc")
> +	(match_operand:HI 1 "move_src_operand" "   cL,   cP,Rcq#q,    P,hCm1, cL,  I,?Rac,    i,i, ?i,  T,Rcq,Ucm,m,?Rac,c,?Rac,Cm3,i"))]
>    "register_operand (operands[0], HImode)
>     || register_operand (operands[1], HImode)
>     || (CONSTANT_P (operands[1])
> @@ -706,9 +711,9 @@
>  ; the iscompact attribute allows the epilogue expander to know for which
>  ; insns it should lengthen the return insn.
>  ; N.B. operand 1 of alternative 7 expands into pcl,symbol@gotpc .
> -(define_insn "*movsi_insn"                      ;   0     1     2     3    4  5 6   7   8   9   10  11  12  13    14  15   16  17  18     19     20  21  22    23    24 25 26    27 28  29   30   31
> -  [(set (match_operand:SI 0 "move_dest_operand" "=Rcq,Rcq#q,    w,Rcq#q,   h, w,w,  w,  w,  w,  w,???w, ?w,  w,Rcq#q,  h,   w,Rcq,  S,   Us<,RcqRck,!*x,  r,!*Rsd,!*Rcd,r,Ucm,  Usd,m,???m,  m,VUsc")
> -	(match_operand:SI 1 "move_src_operand"  "  cL,   cP,Rcq#q,    P,hCm1,cL,I,Crr,Clo,Chi,Cbi,?Rac,Cpc,Clb, ?Cal,Cal,?Cal,Uts,Rcq,RcqRck,   Us>,Usd,Ucm,  Usd,  Ucd,m,  w,!*Rzd,c,?Rac,Cm3, C32"))]
> +(define_insn "*movsi_insn"                      ;   0     1     2     3    4  5    6   7   8   9   10    11  12  13    14  15   16  17  18     19     20  21  22    23    24 25 26    27 28  29  30   31
> +  [(set (match_operand:SI 0 "move_dest_operand" "=Rcq,Rcq#q,    w,Rcq#q,   h,w*l,w*l,  w,  w,  w,  w,  ???w, ?w,  w,Rcq#q,  h, w*l,Rcq,  S,   Us<,RcqRck,!*x,  r,!*Rsd,!*Rcd,r,Ucm,  Usd,m,???m,  m,VUsc")
> +	(match_operand:SI 1 "move_src_operand"  "  cL,   cP,Rcq#q,    P,hCm1, cL,  I,Crr,Clo,Chi,Cbi,?Rac*l,Cpc,Clb, ?Cal,Cal,?Cal,Uts,Rcq,RcqRck,   Us>,Usd,Ucm,  Usd,  Ucd,m,  w,!*Rzd,c,?Rac,Cm3, C32"))]
>    "register_operand (operands[0], SImode)
>     || register_operand (operands[1], SImode)
>     || (CONSTANT_P (operands[1])
> @@ -5073,317 +5078,123 @@
>                               xtr, const0_rtx);
>  })
>  
> +;; -------------------------------------------------------------------
> +;; Hardware loop
> +;; -------------------------------------------------------------------
> +
>  ; operand 0 is the loop count pseudo register
> -; operand 1 is the loop end pattern
> -(define_expand "doloop_begin"
> -  [(use (match_operand 0 "register_operand" ""))
> -   (use (match_operand 1 "" ""))]
> +; operand 1 is the label to jump to at the top of the loop
> +(define_expand "doloop_end"
> +  [(parallel [(set (pc)
> +                (if_then_else
> +                 (ne (match_operand 0 "" "")
> +                     (const_int 1))
> +                 (label_ref (match_operand 1 "" ""))
> +                 (pc)))
> +           (set (match_dup 0) (plus (match_dup 0) (const_int -1)))
> +           (unspec [(const_int 0)] UNSPEC_ARC_LP)
> +           (clobber (match_dup 2))])]
>    ""
>  {
> -  /* Using the INSN_UID of the loop end pattern to identify it causes
> -     trouble with -fcompare-debug, so allocate a debug-independent
> -     id instead.  We use negative numbers so that we can use the same
> -     slot in doloop_end_i where we later store a CODE_LABEL_NUMBER, and
> -     still be able to tell what kind of number this is.  */
> -  static HOST_WIDE_INT loop_end_id = 0;
> -
> -  rtx id = GEN_INT (--loop_end_id);
> -  XEXP (XVECEXP (PATTERN (operands[1]), 0, 4), 0) = id;
> -  emit_insn (gen_doloop_begin_i (operands[0], const0_rtx, id,
> -                              const0_rtx, const0_rtx));
> -  DONE;
> + if (GET_MODE (operands[0]) != SImode)
> +   FAIL;
> + operands[2] = gen_rtx_SCRATCH (SImode);
>  })
>  
> -; ??? can't describe the insn properly as then the optimizers try to
> -; hoist the SETs.
> -;(define_insn "doloop_begin_i"
> -;  [(set (reg:SI LP_START) (pc))
> -;   (set (reg:SI LP_END) (unspec:SI [(pc)] UNSPEC_ARC_LP))
> -;   (use (match_operand 0 "const_int_operand" "n"))]
> -;  ""
> -;  "lp .L__GCC__LP%0"
> -;)
> -
> -; The operands of doloop_end_i are also read / written by arc_reorg with
> -; XVECEXP (PATTERN (lp, 0, N), so if you want to change the pattern, you
> -; might have to adjust arc_reorg.
> -; operands 0 / 2 are supplied by the expander, 1, 3 and 4 are filled in
> -; by arc_reorg.  arc_reorg might also alter operand 0.
> -;
> -; N in XVECEXP PATTERN (lp, 0 N)
> -;  V              rtl                 purpose
> -;  0           unspec UNSPEC_ARC_LP identify pattern
> -;  1           clobber LP_START     show LP_START is set
> -;  2           clobber LP_END       show LP_END is set
> -;  3           use operand0         loop count pseudo register
> -;  4           use operand1         before arc_reorg: -id
> -;                                   after : CODE_LABEL_NUMBER of loop top 
> label
> -;  5           use operand2         INSN_UID of loop end insn
> -;  6           use operand3         loop setup not at start (1 above, 2 
> below)
> -;  7           use operand4         LABEL_REF of top label, if not
> -;                                   immediately following
> -; If operand1 is still zero after arc_reorg, this is an orphaned loop
> -; instruction that was not at the start of the loop.
> -; There is no point is reloading this insn - then lp_count would still not
> -; be available for the loop end.
> -(define_insn "doloop_begin_i"
> -  [(unspec:SI [(pc)] UNSPEC_ARC_LP)
> -   (clobber (reg:SI LP_START))
> -   (clobber (reg:SI LP_END))
> -   (use (match_operand:SI 0 "register_operand" "l,l,????*X"))
> -   (use (match_operand 1 "const_int_operand" "n,n,C_0"))
> -   (use (match_operand 2 "const_int_operand" "n,n,X"))
> -   (use (match_operand 3 "const_int_operand" "C_0,n,X"))
> -   (use (match_operand 4 "const_int_operand" "C_0,X,X"))]
> +(define_insn "arc_lp"
> +  [(unspec:SI [(match_operand:SI 0 "register_operand" "l")]
> +           UNSPEC_ARC_LP)
> +   (use (label_ref (match_operand 1 "" "")))
> +   (use (label_ref (match_operand 2 "" "")))]
>    ""
> -{
> -  rtx_insn *scan;
> -  int len, size = 0;
> -  int n_insns = 0;
> -  rtx loop_start = operands[4];
> -
> -  if (CONST_INT_P (loop_start))
> -    loop_start = NULL_RTX;
> -  /* Size implications of the alignment will be taken care of by the
> -     alignment inserted at the loop start.  */
> -  if (LOOP_ALIGN (0) && INTVAL (operands[1]))
> -    {
> -      asm_fprintf (asm_out_file, "\t.p2align %d\\n", LOOP_ALIGN (0));
> -      arc_clear_unalign ();
> -    }
> -  if (!INTVAL (operands[1]))
> -    return "; LITTLE LOST LOOP";
> -  if (loop_start && flag_pic)
> -    {
> -      /* ??? Can do better for when a scratch register
> -      is known.  But that would require extra testing.  */
> -      return "push_s r0\;add r0,pcl,%4@pcl\;sr r0,[2]; LP_START\;add r0,pcl,.L__GCC__LP%1@pcl\;sr r0,[3]; LP_END\;pop_s r0";
> -    }
> -  /* Check if the loop end is in range to be set by the lp instruction.  */
> -  size = INTVAL (operands[3]) < 2 ? 0 : 2048;
> -  for (scan = insn; scan && size < 2048; scan = NEXT_INSN (scan))
> -    {
> -      if (!INSN_P (scan))
> -     continue;
> -      if (recog_memoized (scan) == CODE_FOR_doloop_end_i
> -       && (XEXP (XVECEXP (PATTERN (scan), 0, 4), 0)
> -           == XEXP (XVECEXP (PATTERN (insn), 0, 4), 0)))
> -     break;
> -      len = get_attr_length (scan);
> -      size += len;
> -    }
> -  /* Try to verify that there are at least three instruction fetches
> -     between the loop setup and the first encounter of the loop end.  */
> -  for (scan = NEXT_INSN (insn); scan && n_insns < 3; scan = NEXT_INSN (scan))
> -    {
> -      if (!INSN_P (scan))
> -     continue;
> -      if (rtx_sequence *seq = dyn_cast <rtx_sequence *> (PATTERN (scan)))
> -     scan = seq->insn (0);
> -      if (JUMP_P (scan))
> -     {
> -       if (recog_memoized (scan) != CODE_FOR_doloop_end_i)
> -         {
> -           n_insns += 2;
> -           if (simplejump_p (scan))
> -             {
> -               scan = as_a <rtx_insn *> (XEXP (SET_SRC (PATTERN (scan)), 0));
> -               continue;
> -             }
> -
> -           rtx lab = JUMP_LABEL (scan);
> -           if (!lab)
> -             break;
> -
> -           rtx_insn *next_scan
> -             = next_active_insn (NEXT_INSN (PREV_INSN (scan)));
> -           if (next_scan
> -               && recog_memoized (next_scan) != CODE_FOR_doloop_begin_i)
> -             break;
> -
> -           /* JUMP_LABEL might be simple_return instead if an insn.  */
> -           if (!INSN_P (lab))
> -             {
> -               n_insns++;
> -               break;
> -             }
> -
> -           rtx_insn *next_lab = next_active_insn (as_a<rtx_insn *> (lab));
> -           if (next_lab
> -               && recog_memoized (next_lab) != CODE_FOR_doloop_begin_i)
> -             break;
> -
> -             n_insns++;
> -         }
> -       break;
> -     }
> -      len = get_attr_length (scan);
> -      /* Size estimation of asms assumes that each line which is nonempty
> -      codes an insn, and that each has a long immediate.  For minimum insn
> -      count, assume merely that a nonempty asm has at least one insn.  */
> -      if (GET_CODE (PATTERN (scan)) == ASM_INPUT
> -       || asm_noperands (PATTERN (scan)) >= 0)
> -     n_insns += (len != 0);
> -      else
> -     n_insns += (len > 4 ? 2 : (len ? 1 : 0));
> -    }
> -  if (LOOP_ALIGN (0))
> -    {
> -      asm_fprintf (asm_out_file, "\t.p2align %d\\n", LOOP_ALIGN (0));
> -      arc_clear_unalign ();
> -    }
> -  gcc_assert (n_insns || GET_CODE (next_nonnote_insn (insn)) == CODE_LABEL);
> -  if (size >= 2048 || (TARGET_ARC600 && n_insns == 1) || loop_start)
> -    {
> -      if (flag_pic)
> -     {
> -       /* ??? Can do better for when a scratch register
> -          is known.  But that would require extra testing.  */
> -       arc_clear_unalign ();
> -	return ".p2align 2\;push_s r0\;add r0,pcl,24\;sr r0,[2]; LP_START\;add r0,pcl,.L__GCC__LP%1@pcl\;sr r0,[3]; LP_END\;pop_s r0";
> -     }
> -      output_asm_insn ((size < 2048
> -                     ? "lp .L__GCC__LP%1" : "sr .L__GCC__LP%1,[3]; LP_END"),
> -                    operands);
> -      output_asm_insn (loop_start
> -                    ? "sr %4,[2]; LP_START" : "sr 0f,[2]; LP_START",
> -                    operands);
> -      if (TARGET_ARC600 && n_insns < 1)
> -     output_asm_insn ("nop", operands);
> -      return (TARGET_ARC600 && n_insns < 3) ? "nop_s\;nop_s\;0:" : "0:";
> -    }
> -  else if (TARGET_ARC600 && n_insns < 3)
> -    {
> -      /* At least four instructions are needed between the setting of LP_COUNT
> -      and the loop end - but the lp instruction qualifies as one.  */
> -      rtx_insn *prev = prev_nonnote_insn (insn);
> -
> -      if (!INSN_P (prev) || dead_or_set_regno_p (prev, LP_COUNT))
> -     output_asm_insn ("nop", operands);
> -    }
> -  return "lp .L__GCC__LP%1";
> -}
> +  "lp\\t@%l2\\t; %0:@%l1->@%l2"
>    [(set_attr "type" "loop_setup")
> -   (set_attr_alternative "length"
> -;     FIXME: length is usually 4, but we need branch shortening
> -;     to get this right.
> -;     [(if_then_else (match_test "TARGET_ARC600") (const_int 16) (const_int 4))
> -     [(if_then_else (match_test "flag_pic") (const_int 24) (const_int 16))
> -      (if_then_else (match_test "flag_pic") (const_int 28) (const_int 16))
> -      (const_int 0)])]
> -  ;; ??? we should really branch shorten this insn, but then we'd
> -  ;; need a proper label first.  N.B. the end label can not only go out
> -  ;; of range when it is far away, but also when it precedes the loop -
> -  ;; which, unfortunately, it sometimes does, when the loop "optimizer"
> -  ;; messes things up.
> -)
> -
> -; operand 0 is the loop count pseudo register
> -; operand 1 is the label to jump to at the top of the loop
> -; Use this for the ARC600 and ARC700.
> -; ??? ARC600 might want to check if the loop has few iteration and only a
> -; single insn - loop setup is expensive then.
> -(define_expand "doloop_end"
> -  [(use (match_operand 0 "register_operand" ""))
> -   (use (label_ref (match_operand 1 "" "")))]
> -  "!TARGET_ARC601"
> -{
> -  /* We could do smaller bivs with biv widening, and wider bivs by having
> -     a high-word counter in an outer loop - but punt on this for now.  */
> -  if (GET_MODE (operands[0]) != SImode)
> -    FAIL;
> -  emit_jump_insn (gen_doloop_end_i (operands[0], operands[1], const0_rtx));
> -  DONE;
> -})
> +   (set_attr "length" "4")])
>  
> -(define_insn_and_split "doloop_end_i"
> +;; if by any chance the lp_count is not used, then use an 'r'
> +;; register, instead of going to memory.
> +(define_insn "loop_end"
>    [(set (pc)
> -	(if_then_else (ne (match_operand:SI 0 "shouldbe_register_operand" "+l,*c,*m")
> -                        (const_int 1))
> +     (if_then_else (ne (match_operand:SI 2 "nonimmediate_operand" "0,0")
> +                       (const_int 1))
>                     (label_ref (match_operand 1 "" ""))
>                     (pc)))
> -   (set (match_dup 0) (plus:SI (match_dup 0) (const_int -1)))
> -   (use (reg:SI LP_START))
> -   (use (reg:SI LP_END))
> -   (use (match_operand 2 "const_int_operand" "n,???Cn0,???X"))
> -   (clobber (match_scratch:SI 3 "=X,X,&????r"))]
> +   (set (match_operand:SI 0 "nonimmediate_operand" "=l!r,m")
> +     (plus (match_dup 2) (const_int -1)))
> +   (unspec [(const_int 0)] UNSPEC_ARC_LP)
> +   (clobber (match_scratch:SI 3 "=X,&r"))]
>    ""
> -  "*
> -{
> -  rtx_insn *prev = prev_nonnote_insn (insn);
> -
> -  /* If there is an immediately preceding label, we must output a nop,
> -     lest a branch to that label will fall out of the loop.
> -     ??? We could try to avoid this by claiming to have a delay slot if there
> -     is a preceding label, and outputting the delay slot insn instead, if
> -     present.
> -     Or we could have some optimization that changes the source edge to update
> -     the loop count and jump to the loop start instead.  */
> -  /* For ARC600, we must also prevent jumps inside the loop and jumps where
> -     the loop counter value is live at the target from being directly at the
> -     loop end.  Being sure that the loop counter is dead at the target is
> -     too much hair - we can't rely on data flow information at this point -
> -     so insert a nop for all branches.
> -     The ARC600 also can't read the loop counter in the last insn of a loop.  */
> -  if (LABEL_P (prev))
> -    output_asm_insn (\"nop%?\", operands);
> -  return \"\\n.L__GCC__LP%2: ; loop end, start is %1\";
> -}"
> -  "&& memory_operand (operands[0], SImode)"
> -  [(pc)]
> -{
> -  emit_move_insn (operands[3], operands[0]);
> -  emit_jump_insn (gen_doloop_fallback_m (operands[3], operands[1], operands[0]));
> -  DONE;
> -}
> -  [(set_attr "type" "loop_end")
> -   (set (attr "length")
> -     (if_then_else (match_test "LABEL_P (prev_nonnote_insn (insn))")
> -                   (const_int 4) (const_int 0)))]
> -)
> +  "\\t;%0 %1 %2"
> +  [(set_attr "length" "0")
> +   (set_attr "predicable" "no")
> +   (set_attr "type" "loop_end")])
>  
> -; This pattern is generated by arc_reorg when there is no recognizable
> -; loop start.
> -(define_insn "*doloop_fallback"
> -  [(set (pc) (if_then_else (ne (match_operand:SI 0 "register_operand" "+r,!w")
> -                             (const_int 1))
> -                        (label_ref (match_operand 1 "" ""))
> -                        (pc)))
> -   (set (match_dup 0) (plus:SI (match_dup 0) (const_int -1)))]
> -   ; avoid fooling the loop optimizer into assuming this is a special insn.
> -  "reload_completed"
> -  "*return get_attr_length (insn) == 8
> -   ? \"brne.d %0,1,%1\;sub %0,%0,1\"
> -   : \"breq %0,1,0f\;b.d %1\;sub %0,%0,1\\n0:\";"
> -  [(set (attr "length")
> -     (if_then_else (and (ge (minus (match_dup 1) (pc)) (const_int -256))
> -                        (le (minus (match_dup 1) (pc)) (const_int 244)))
> -                   (const_int 8) (const_int 12)))
> -   (set_attr "type" "brcc_no_delay_slot")
> -   (set_attr "cond" "nocond")]
> -)
> +;; split pattern for the very slim chance when the loop register is
> +;; memory.
> +(define_split
> +  [(set (pc)
> +     (if_then_else (ne (match_operand:SI 0 "memory_operand")
> +                       (const_int 1))
> +                   (label_ref (match_operand 1 ""))
> +                   (pc)))
> +   (set (match_dup 0) (plus (match_dup 0) (const_int -1)))
> +   (unspec [(const_int 0)] UNSPEC_ARC_LP)
> +   (clobber (match_scratch:SI 2))]
> +  "memory_operand (operands[0], SImode)"
> +  [(set (match_dup 2) (match_dup 0))
> +   (set (match_dup 2) (plus:SI (match_dup 2) (const_int -1)))
> +   (set (match_dup 0) (match_dup 2))
> +   (set (reg:CC CC_REG) (compare:CC (match_dup 2) (const_int 0)))
> +   (set (pc)
> +     (if_then_else (ne (reg:CC CC_REG)
> +                       (const_int 0))
> +                   (label_ref (match_dup 1))
> +                   (pc)))]
> +  "")
>  
> -; reload can't make output reloads for jump insns, so we have to do this by hand.
> -(define_insn "doloop_fallback_m"
> -  [(set (pc) (if_then_else (ne (match_operand:SI 0 "register_operand" "+&r")
> -                             (const_int 1))
> -                        (label_ref (match_operand 1 "" ""))
> -                        (pc)))
> -   (set (match_dup 0) (plus:SI (match_dup 0) (const_int -1)))
> -   (set (match_operand:SI 2 "memory_operand" "=m")
> -     (plus:SI (match_dup 0) (const_int -1)))]
> -   ; avoid fooling the loop optimizer into assuming this is a special insn.
> -  "reload_completed"
> -  "*return get_attr_length (insn) == 12
> -   ? \"sub %0,%0,1\;brne.d %0,0,%1\;st%U2%V2 %0,%2\"
> -   : \"sub %0,%0,1\;breq %0,0,0f\;b.d %1\\n0:\tst%U2%V2 %0,%2\";"
> -  [(set (attr "length")
> -     (if_then_else (and (ge (minus (match_dup 1) (pc)) (const_int -252))
> -                        (le (minus (match_dup 1) (pc)) (const_int 244)))
> -                   (const_int 12) (const_int 16)))
> -   (set_attr "type" "brcc_no_delay_slot")
> -   (set_attr "cond" "nocond")]
> -)
> +(define_insn "loop_fail"
> +  [(set (reg:SI LP_COUNT)
> +     (plus:SI (reg:SI LP_COUNT) (const_int -1)))
> +   (set (reg:CC_ZN CC_REG)
> +     (compare:CC_ZN (plus:SI (reg:SI LP_COUNT) (const_int -1))
> +                    (const_int 0)))]
> +  ""
> +  "sub.f%?\\tlp_count,lp_count,1"
> +  [(set_attr "iscompact" "false")
> +   (set_attr "type" "compare")
> +   (set_attr "cond" "set_zn")
> +   (set_attr "length" "4")
> +   (set_attr "predicable" "yes")])
> +
> +(define_insn_and_split "dbnz"
> +  [(set (pc)
> +     (if_then_else
> +      (ne (plus:SI (match_operand:SI 0 "nonimmediate_operand" "+r!l,m")
> +                   (const_int -1))
> +          (const_int 0))
> +      (label_ref (match_operand 1 "" ""))
> +      (pc)))
> +   (set (match_dup 0)
> +     (plus:SI (match_dup 0)
> +              (const_int -1)))
> +   (clobber (match_scratch:SI 2 "=X,r"))]
> +  "TARGET_V2"
> +  "@
> +   dbnz%#\\t%0,%l1
> +   #"
> +  "TARGET_V2 && reload_completed && memory_operand (operands[0], SImode)"
> +  [(set (match_dup 2) (match_dup 0))
> +   (set (match_dup 2) (plus:SI (match_dup 2) (const_int -1)))
> +   (set (reg:CC CC_REG) (compare:CC (match_dup 2) (const_int 0)))
> +   (set (match_dup 0) (match_dup 2))
> +   (set (pc) (if_then_else (ge (reg:CC CC_REG)
> +                            (const_int 0))
> +                        (label_ref (match_dup 1))
> +                        (pc)))]
> +  ""
> +  [(set_attr "iscompact" "false")
> +   (set_attr "type" "loop_end")
> +   (set_attr "length" "4,20")])
>  
>  (define_expand "movmemsi"
>    [(match_operand:BLK 0 "" "")
> diff --git a/gcc/config/arc/arc.opt b/gcc/config/arc/arc.opt
> index ad2df26..d1ebd44 100644
> --- a/gcc/config/arc/arc.opt
> +++ b/gcc/config/arc/arc.opt
> @@ -494,3 +494,28 @@ Specifies the registers that the processor saves on an interrupt entry and exit.
>  mrgf-banked-regs=
>  Target RejectNegative Joined Var(arc_deferred_options) Defer
>  Specifies the number of registers replicated in second register bank on entry to fast interrupt.
> +
> +mlpc-width=
> +Target RejectNegative Joined Enum(arc_lpc) Var(arc_lpcwidth) Init(32)
> +Sets LP_COUNT register width.  Possible values are 8, 16, 20, 24, 28, and 32.
> +
> +Enum
> +Name(arc_lpc) Type(int)
> +
> +EnumValue
> +Enum(arc_lpc) String(8) Value(8)
> +
> +EnumValue
> +Enum(arc_lpc) String(16) Value(16)
> +
> +EnumValue
> +Enum(arc_lpc) String(20) Value(20)
> +
> +EnumValue
> +Enum(arc_lpc) String(24) Value(24)
> +
> +EnumValue
> +Enum(arc_lpc) String(28) Value(28)
> +
> +EnumValue
> +Enum(arc_lpc) String(32) Value(32)
> diff --git a/gcc/config/arc/predicates.md b/gcc/config/arc/predicates.md
> index 1f66438..2610f84 100644
> --- a/gcc/config/arc/predicates.md
> +++ b/gcc/config/arc/predicates.md
> @@ -362,6 +362,8 @@
>        else if (TARGET_MUL64_SET
>              && (REGNO (op) == 57 || REGNO(op) == 58 || REGNO(op) == 59 ))
>       return 0;
> +      else if (REGNO (op) == LP_COUNT)
> +        return 1;
>        else
>       return dest_reg_operand (op, mode);
>      case SUBREG :
> diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi
> index f480051..3d1f7f3 100644
> --- a/gcc/doc/invoke.texi
> +++ b/gcc/doc/invoke.texi
> @@ -614,7 +614,7 @@ Objective-C and Objective-C++ Dialects}.
>  -mcrc  -mdsp-packa  -mdvbf  -mlock  -mmac-d16  -mmac-24  -mrtsc  -mswape @gol
>  -mtelephony  -mxy  -misize  -mannotate-align  -marclinux  -marclinux_prof @gol
>  -mlong-calls  -mmedium-calls  -msdata -mirq-ctrl-saved @gol
> --mrgf-banked-regs @gol
> +-mrgf-banked-regs -mlpc-width @gol
>  -mvolatile-cache  -mtp-regno=@var{regno} @gol
>  -malign-call  -mauto-modify-reg  -mbbit-peephole  -mno-brcc @gol
>  -mcase-vector-pcrel  -mcompact-casesi  -mno-cond-exec  -mearly-cbranchsi @gol
> @@ -14678,6 +14678,18 @@ registers to avoid memory transactions during interrupt entry and exit
>  sequences.  Use this option when you are using fast interrupts in an
>  ARC V2 family processor.  Permitted values are 4, 8, 16, and 32.
>  
> +@item -mlpc-width=@var{lpcw}
> +@opindex mlpc-width
> +Specify the width of the LP_COUNT register.  Valid values for
> +@var{lpcw} are 8, 16, 20, 24, 28 and 32.  The default width is fixed
> +to 32.  If the width is less than 32, the compiler does not attempt to
> +transform loops in your program to use the zero-delay loop mechanism
> +unless it is known that the @samp{LP_COUNT} register can hold the
> +required loop-counter value.  Depending on the size specified, the
> +compiler and run-time library might continue to use the loop mechanism
> +for various needs.  This option defines macro @code{__ARC_LPC_WIDTH__}
> +with the value of size.
> +
>  @end table
>  
>  The following options are passed through to the assembler, and also
> diff --git a/gcc/testsuite/gcc.target/arc/loop-1.c b/gcc/testsuite/gcc.target/arc/loop-1.c
> old mode 100644
> new mode 100755
> index 1afe8eb..773f583
> --- a/gcc/testsuite/gcc.target/arc/loop-1.c
> +++ b/gcc/testsuite/gcc.target/arc/loop-1.c
> @@ -1,45 +1,12 @@
>  /* { dg-do compile } */
> -/* { dg-options "-O2" } */
> +/* { dg-options "-O2 -w" } */
>  
> -/* This case would fail to make use of the zero-overhead loop
> -   instruction at one time due to a bug.  */
> +/* Check how we handle empty body loops.  */
>  
> -extern char a[];
> -
> -struct some_t
> -{
> -  struct
> -  {
> -    int aaa;
> -    short bbb;
> -    char ccc;
> -    char ddd;
> -  } ppp[8];
> -
> -  int www[1];
> -};
> -
> -int b;
> -
> -void
> -some_function ()
> -{
> -  struct some_t *tmp = (struct some_t *) a;
> -
> -  while ((*tmp).ppp[b].ccc)
> -    while(0);
> -
> -  for (; b; b++)
> -    {
> -      if (tmp->ppp[b].ccc)
> -        {
> -          int c = tmp->ppp[b].bbb;
> -          int d = tmp->ppp[b].aaa;
> -          int e = d - tmp->www[c];
> -          if (e)
> -            tmp->ppp[b].ddd = 1;
> -        }
> -    }
> +a;
> +fn1() {
> +  int i;
> +  for (; i < 8; i++) {
> +    double A[a];
> +  }
>  }
> -
> -/* { dg-final { scan-assembler "\[^\n\]+lp \\.L__GCC__" } } */
> -- 
> 1.9.1
> 

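One extra note for readers not following the whole thread: below is a minimal
sketch (not part of the patch) of how the new __ARC_LPC_WIDTH__ define from
arc-c.c and the -mlpc-width option documented above might be consumed from
user code.  The type and function names are made up for illustration, and it
only assumes the macro expands to the configured LP_COUNT width (default 32).

/* Illustrative sketch only.  Pick an iteration type that fits the
   configured LP_COUNT width, so small counted loops stay eligible for
   the ZOL patterns added by this patch.
   Example build: gcc -O2 -mlpc-width=16 example.c  */
#include <stdint.h>

#if defined (__ARC_LPC_WIDTH__) && __ARC_LPC_WIDTH__ >= 32
typedef uint32_t iter_t;   /* lp_count can hold a full 32-bit count.  */
#else
typedef uint16_t iter_t;   /* Stay within a reduced-width lp_count.  */
#endif

void
scale_by_two (int *a, iter_t n)
{
  for (iter_t i = 0; i < n; i++)
    a[i] *= 2;
}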