On Wed, Apr 15, 2020 at 3:56 AM Jiufu Guo via Gcc-patches
<gcc-patches@gcc.gnu.org> wrote:
>
> Hi,
>
> As you may know, we have loop unroll pass in RTL which was introduced a few
> years ago, and works for a long time.  Currently, this unroller is using the
> pseudos in the original body, and then the pseudos are written multiple times.
>
> It would be a good idea to create new pseudos for those duplicated 
> instructions
> during loop unrolling.  This would relax reg dependence, and then provide more
> freedom/opportunity for other optimizations, like scheduling, RA...

I think there's a separate pass to do something like this, conveniently
placed after unrolling.  -fweb, IIRC enabled by default for -funroll-loops
unless explicitly disabled.  Related is regrename which is also enabled then.

So where does your patch make a difference?  Is the webizers dataflow analysis
maybe confused by backedges?

> As below example:
>
> Original loop:
> ```
>    26: L26:
>    17: NOTE_INSN_BASIC_BLOCK 4
>    18: r130:SF=[r125:DI+r122:DI]
>    19: r131:SF=[r126:DI+r122:DI]
>    20: r129:SF=r130:SF*r131:SF
>    21: r132:DF=float_extend(r129:SF)
>    22: r121:DF=r121:DF+r132:DF
>    23: r133:SI=r123:DI#0-0x1
>    24: r123:DI=zero_extend(r133:SI)
>    25: r122:DI=r122:DI+0x4
>    27: r134:CC=cmp(r123:DI,0)
>    28: pc={(r134:CC!=0)?L49:pc}
>       ; pc falls through to BB 5
>    49: L49:
>    48: NOTE_INSN_BASIC_BLOCK 8
>       ; pc falls through to BB 4
> ```
> It was unrolled to:
> ```
>    26: L26:
>    17: NOTE_INSN_BASIC_BLOCK 4
>    18: r130:SF=[r125:DI+r122:DI]
>    19: r131:SF=[r126:DI+r122:DI]
>    20: r129:SF=r130:SF*r131:SF
>    21: r132:DF=float_extend(r129:SF)
>    93: r145:DF=r121:DF+r132:DF
>    23: r133:SI=r123:DI#0-0x1
>    95: r148:DI=zero_extend(r133:SI)
>    91: r140:DI=r122:DI+0x4
>    25: r122:DI=r140:DI
>    97: r134:CC=cmp(r148:DI,0)
>
>    87: NOTE_INSN_BASIC_BLOCK 16
>    77: r130:SF=[r125:DI+r122:DI]
>    78: r131:SF=[r126:DI+r122:DI]
>    79: r129:SF=r130:SF*r131:SF
>    80: r132:DF=float_extend(r129:SF)
>    81: r121:DF=r121:DF+r132:DF
>    82: r133:SI=r123:DI#0-0x1
>    83: r123:DI=zero_extend(r133:SI)
>    84: r122:DI=r140:DI+0x4
>    85: r134:CC=cmp(r123:DI,0)
>    86: pc={(r134:CC!=0)?L90:pc}
>
> ```
> We can see, the original loop is using r130,r131,r129,r132,r121...
> For the unrolled sequence BB 16, they are still using them.
>
> We could use new pseudos for duplicate body, like below:
> ```
>    26: L26:
>    17: NOTE_INSN_BASIC_BLOCK 4
>    18: r130:SF=[r125:DI+r122:DI]
>    19: r131:SF=[r126:DI+r122:DI]
>    20: r129:SF=r130:SF*r131:SF
>    21: r132:DF=float_extend(r129:SF)
>    93: r145:DF=r121:DF+r132:DF
>    23: r133:SI=r123:DI#0-0x1
>    95: r148:DI=zero_extend(r133:SI)
>    91: r140:DI=r122:DI+0x4
>    25: r122:DI=r140:DI
>    97: r134:CC=cmp(r148:DI,0)
>
>    87: NOTE_INSN_BASIC_BLOCK 16
>    77: r141:SF=[r125:DI+r122:DI]
>    78: r142:SF=[r126:DI+r122:DI]
>    79: r143:SF=r141:SF*r142:SF
>    80: r144:DF=float_extend(r143:SF)
>    81: r146:DF=r145:DF+r144:DF
>    82: r147:SI=r148:DI#0-0x1
>    83: r149:DI=zero_extend(r147:SI)
>    84: r122:DI=r140:DI+0x4
>    85: r150:CC=cmp(r149:DI,0)
>    98: r130:SF=r141:SF
>    99: r131:SF=r142:SF
>   100: r129:SF=r143:SF
>   101: r132:DF=r144:DF
>   102: r121:DF=r146:DF
>   103: r133:SI=r147:SI
>   104: r123:DI=r149:DI
>   105: r134:CC=r150:CC
>    86: pc={(r150:CC!=0)?L90:pc}
> ```
> In BB 16, new pseudos: r141,r142,r143... are used.
>
> For this, I drafted a prototype like the patch below.
> This patch only supports simple loops: one exit edge with one major basic 
> block.
>
> With this patch, we can see, the generated asm code uses fewer instructions 
> on PowerPC.
>
> Welcome for comments, thanks!
>
> BR.
> Jiufu Guo.
>
> ---
>  gcc/common.opt    |   4 +
>  gcc/loop-unroll.c | 309 +++++++++++++++++++++++++++++++++++++++++++++-
>  2 files changed, 310 insertions(+), 3 deletions(-)
>
> diff --git a/gcc/common.opt b/gcc/common.opt
> index fa9da505fc2..e298ce09c82 100644
> --- a/gcc/common.opt
> +++ b/gcc/common.opt
> @@ -2526,6 +2526,10 @@ fvariable-expansion-in-unroller
>  Common Report Var(flag_variable_expansion_in_unroller) Optimization
>  Apply variable expansion when loops are unrolled.
>
> +fassin-new-operand-in-unroller
> +Common Report Var(flag_assign_new_operand_in_unroller) Init(1) Optimization
> +Creating new pseudo for duplicated instruction when loops are unrolled.
> +
>  fstack-check=
>  Common Report RejectNegative Joined Optimization
>  -fstack-check=[no|generic|specific]    Insert stack checking code into the 
> program.
> diff --git a/gcc/loop-unroll.c b/gcc/loop-unroll.c
> index 693c7768868..54a48bd9fd7 100644
> --- a/gcc/loop-unroll.c
> +++ b/gcc/loop-unroll.c
> @@ -94,6 +94,17 @@ struct var_to_expand
>                                        var_expansions[REUSE_EXPANSION - 1].  
> */
>  };
>
> +struct def_to_split
> +{
> +  rtx_insn *insn;          /* The insn in that the assigment occurs.  */
> +  rtx reg;                  /* The def/set pseudo.  */
> +  vec<rtx> defs;            /* The copies of the defs which is split to.  */
> +  struct def_to_split *next; /* Next entry in walking order.  */
> +  int count;                /* Number of DEFS.  */
> +  int use_before_def;  /* The pseudo is used before re-defined.  */
> +  rtx_insn *last_insn;
> +};
> +
>  /* Hashtable helper for iv_to_split.  */
>
>  struct iv_split_hasher : free_ptr_hash <iv_to_split>
> @@ -143,6 +154,30 @@ var_expand_hasher::equal (const var_to_expand *i1, const 
> var_to_expand *i2)
>    return i1->insn == i2->insn;
>  }
>
> +/* Hashtable helper for iv_to_split.  */
> +
> +struct def_split_hasher : free_ptr_hash<def_to_split>
> +{
> +  static inline hashval_t hash (const def_to_split *);
> +  static inline bool equal (const def_to_split *, const def_to_split *);
> +};
> +
> +/* Return a hash for VES.  */
> +
> +inline hashval_t
> +def_split_hasher::hash (const def_to_split *i)
> +{
> +  return (hashval_t) INSN_UID (i->insn);
> +}
> +
> +/* Return true if I1 and I2 refer to the same instruction.  */
> +
> +inline bool
> +def_split_hasher::equal (const def_to_split *i1, const def_to_split *i2)
> +{
> +  return i1->insn == i2->insn;
> +}
> +
>  /* Information about optimization applied in
>     the unrolled loop.  */
>
> @@ -156,10 +191,17 @@ struct opt_info
>                                         insns with accumulators to expand.  */
>    struct var_to_expand *var_to_expand_head; /* The first var to expand.  */
>    struct var_to_expand **var_to_expand_tail; /* Pointer to the tail of the 
> list.  */
> +
> +  hash_table<def_split_hasher> *insns_def_split; /* A hashtable of insns to
> +                                                   split def.  */
> +  struct def_to_split *def_to_split_head; /* The def to split.  */
> +  struct def_to_split **def_to_split_tail; /* Pointer to the tail of the 
> list.  */
> +
>    unsigned first_new_block;        /* The first basic block that was
>                                        duplicated.  */
>    basic_block loop_exit;           /* The loop exit basic block.  */
>    basic_block loop_preheader;      /* The loop preheader basic block.  */
> +
>  };
>
>  static void decide_unroll_stupid (class loop *, int);
> @@ -183,6 +225,7 @@ static void combine_var_copies_in_loop_exit (struct 
> var_to_expand *,
>                                              basic_block);
>  static rtx get_expansion (struct var_to_expand *);
>
> +static struct def_to_split *analyze_insn_to_split_def (class loop *, 
> rtx_insn *);
>  /* Emit a message summarizing the unroll that will be
>     performed for LOOP, along with the loop's location LOCUS, if
>     appropriate given the dump or -fopt-info settings.  */
> @@ -898,7 +941,8 @@ unroll_loop_runtime_iterations (class loop *loop)
>    struct opt_info *opt_info = NULL;
>    bool ok;
>
> -  if (flag_split_ivs_in_unroller
> +  if (flag_assign_new_operand_in_unroller
> +      || flag_split_ivs_in_unroller
>        || flag_variable_expansion_in_unroller)
>      opt_info = analyze_insns_in_loop (loop);
>
> @@ -1564,6 +1608,71 @@ analyze_iv_to_split_insn (rtx_insn *insn)
>    return ivts;
>  }
>
> +static struct def_to_split *
> +analyze_insn_to_split_def (class loop *loop, rtx_insn *insn)
> +{
> +  rtx set, dest, src;
> +  rtx_insn *ins;
> +  struct def_to_split *dts;
> +
> +  /* TODO: to be relaxed */
> +  if (loop->num_nodes != 2)
> +    return NULL;
> +
> +  FOR_BB_INSNS (loop->latch, ins)
> +    if (INSN_P (ins))
> +      return NULL;
> +
> +  set = single_set (insn);
> +  if (!set)
> +    return NULL;
> +
> +  dest = SET_DEST (set);
> +  src = SET_SRC (set);
> +
> +  if (!REG_P (dest)) //    && !(GET_CODE (dest) == SUBREG && REG_P 
> (SUBREG_REG
> +                    //    (dest))))
> +    return NULL;
> +
> +  if (REGNO (dest) < FIRST_PSEUDO_REGISTER)
> +    return NULL;
> +
> +  int use_before_set = 0;
> +  basic_block bb = BLOCK_FOR_INSN (insn);
> +  rtx set1;
> +  for (ins = BB_HEAD (bb); ins != insn; ins = NEXT_INSN (ins))
> +    if (rtx_referenced_p (dest, ins))
> +      {
> +       set1 = single_set (ins);
> +       if (set1 && rtx_equal_p (SET_DEST (set1), dest))
> +         return NULL;
> +       use_before_set = 1;
> +      }
> +
> +  if (!use_before_set && rtx_referenced_p (dest, src))
> +    use_before_set = 1;
> +
> +  if (dump_file)
> +    {
> +      fprintf (dump_file,
> +              "\n;; Split assignment %s: ",
> +              use_before_set ? "use leads def" : "");
> +      print_rtl (dump_file, dest);
> +      fprintf (dump_file, "\n");
> +    }
> +
> +  /* Record the accumulator to expand.  */
> +  dts = XNEW (struct def_to_split);
> +  dts->insn = insn;
> +  dts->reg = copy_rtx (dest);
> +  dts->defs.create (1);
> +  dts->next = NULL;
> +  dts->count = 0;
> +  dts->use_before_def = use_before_set;
> +  dts->last_insn = NULL;
> +  return dts;
> +}
> +
>  /* Determines which of insns in LOOP can be optimized.
>     Return a OPT_INFO struct with the relevant hash tables filled
>     with all insns to be optimized.  The FIRST_NEW_BLOCK field
> @@ -1578,6 +1687,7 @@ analyze_insns_in_loop (class loop *loop)
>    rtx_insn *insn;
>    struct iv_to_split *ivts = NULL;
>    struct var_to_expand *ves = NULL;
> +  struct def_to_split *dts= NULL;
>    iv_to_split **slot1;
>    var_to_expand **slot2;
>    vec<edge> edges = get_loop_exit_edges (loop);
> @@ -1618,6 +1728,15 @@ analyze_insns_in_loop (class loop *loop)
>        opt_info->var_to_expand_tail = &opt_info->var_to_expand_head;
>      }
>
> +  if (flag_assign_new_operand_in_unroller && can_apply
> +      && LPT_UNROLL_RUNTIME == loop->lpt_decision.decision)
> +    {
> +      opt_info->insns_def_split
> +       = new hash_table<def_split_hasher> (5 * loop->num_nodes);
> +      opt_info->def_to_split_head = NULL;
> +      opt_info->def_to_split_tail = &opt_info->def_to_split_head;
> +    }
> +
>    for (i = 0; i < loop->num_nodes; i++)
>      {
>        bb = body[i];
> @@ -1653,6 +1772,20 @@ analyze_insns_in_loop (class loop *loop)
>             *opt_info->var_to_expand_tail = ves;
>             opt_info->var_to_expand_tail = &ves->next;
>            }
> +
> +       dts = NULL;
> +       if (opt_info->insns_def_split && !ves && !ivts)
> +         dts = analyze_insn_to_split_def (loop, insn);
> +
> +       if (dts)
> +         {
> +           struct def_to_split **slot
> +             = opt_info->insns_def_split->find_slot (dts, INSERT);
> +           gcc_assert (*slot == NULL);
> +           *slot = dts;
> +           *opt_info->def_to_split_tail = dts;
> +           opt_info->def_to_split_tail = &dts->next;
> +         }
>        }
>      }
>
> @@ -1840,6 +1973,145 @@ expand_var_during_unrolling (struct var_to_expand 
> *ve, rtx_insn *insn)
>        }
>  }
>
> +/* Replace INSN with a new duplicate insn.  */
> +static rtx_insn *
> +replace_with_new_insn (rtx_insn *insn)
> +{
> +  rtx_insn *seq;
> +  start_sequence ();
> +  seq = duplicate_insn_chain (insn, insn);
> +  gcc_assert (seq == get_insns ());
> +  end_sequence ();
> +  emit_insn_before (seq, insn);
> +  delete_insn (insn);
> +  return seq;
> +}
> +
> +/* For original body, those insn(s) are not able to be update directly. Then,
> + * duplicate new one for update.  */
> +
> +static void
> +prepare_to_update_orignal_body (struct def_to_split *head)
> +{
> +  struct def_to_split *dts;
> +  rtx_insn *insn;
> +  vec<rtx_insn *> new_insn_list;
> +
> +  /* For all those defines which is used before define, replace with new one 
> for
> +   * update.  */
> +  new_insn_list.create (1);
> +  for (dts = head; dts; dts = dts->next)
> +    if (dts->use_before_def)
> +      {
> +       dts->insn = replace_with_new_insn (dts->insn);
> +       new_insn_list.safe_push (dts->insn);
> +      }
> +
> +  /* For all those insn which is using those defines, replace with new one 
> for
> +   * update.  */
> +  for (dts = head; dts; dts = dts->next)
> +    if (dts->use_before_def)
> +      for (insn = NEXT_INSN (dts->insn);
> +          insn != NEXT_INSN (BB_END (BLOCK_FOR_INSN (dts->insn)));
> +          insn = NEXT_INSN (insn))
> +       if (rtx_referenced_p (dts->reg, insn) && !new_insn_list.contains 
> (insn))
> +         {
> +           insn = replace_with_new_insn (insn);
> +           new_insn_list.safe_push (insn);
> +         }
> +
> +  new_insn_list.release ();
> +}
> +
> +/* Replace reg which occur between START and END.  */
> +static void
> +replace_regs (rtx_insn *start, rtx_insn *end, rtx old_reg, rtx new_reg)
> +{
> +  rtx_insn *insn;
> +  for (insn = start; insn != NEXT_INSN (end); insn = NEXT_INSN (insn))
> +    if (rtx_referenced_p (old_reg, insn))
> +      validate_replace_rtx_group (old_reg, new_reg, insn);
> +
> +  apply_change_group ();
> +}
> +
> +/* Update the the pseudo in orignal blocks, and assign back them at the last
> + * block.  */
> +static void
> +do_additional_update (struct def_to_split *head)
> +{
> +  rtx_insn *insn;
> +  rtx new_reg;
> +  basic_block bb;
> +  struct def_to_split *dts;
> +
> +  for (dts = head; dts; dts = dts->next)
> +    {
> +      if (dts->use_before_def)
> +       {
> +         gcc_assert (dts->last_insn);
> +         gcc_assert (dts->count > 1);
> +
> +         /* Update the insns in original body.  */
> +         bb = BLOCK_FOR_INSN (dts->insn);
> +         new_reg = dts->defs[0];
> +         SET_DEST (single_set (dts->insn)) = new_reg;
> +
> +         replace_regs (NEXT_INSN (dts->insn), BB_END (bb), dts->reg, 
> new_reg);
> +       }
> +
> +      /* In last BB, assign back to orignal reg.  */
> +      start_sequence ();
> +      emit_move_insn (dts->reg, SET_DEST (single_set (dts->last_insn)));
> +      insn = get_insns ();
> +      end_sequence ();
> +      emit_insn_after (insn, dts->last_insn);
> +    }
> +}
> +
> +/* Replace the DTS->REG with new split reg for INSN's block. If the DTS->REG 
> is
> + * referenced before set, Those usage which leading INSN is replaced with
> + * previous define/set.  */
> +static void
> +split_def_during_unrolling (struct def_to_split *dts, rtx_insn *insn)
> +{
> +  rtx new_reg, new_reg0, set;
> +  basic_block bb = BLOCK_FOR_INSN (insn);
> +
> +  if (dts->use_before_def)
> +    {
> +      if (dts->count == 0)
> +       {
> +         dts->defs.safe_push (gen_reg_rtx (GET_MODE (dts->reg)));
> +         dts->count++;
> +       }
> +
> +      /* Replace the usage of the define reg with new reg0 for those insns 
> which
> +       * are leading the define, include the usage of the define.  */
> +      new_reg0 = dts->defs[dts->count - 1];
> +      replace_regs (BB_HEAD (bb), insn, dts->reg, new_reg0);
> +
> +      /* Replace the define reg with new reg, for the define and those usages
> +       * which following the define.  */
> +      set = single_set (insn);
> +      gcc_assert (set);
> +      new_reg = gen_reg_rtx (GET_MODE (dts->reg));
> +      SET_DEST (single_set (insn)) = new_reg;
> +
> +      replace_regs (NEXT_INSN (insn), BB_END (bb), dts->reg, new_reg);
> +    }
> +  else
> +    {
> +      new_reg = gen_reg_rtx (GET_MODE (dts->reg));
> +      replace_regs (insn, BB_END (bb), dts->reg, new_reg);
> +    }
> +
> +  /* Record the insn in last block.  */
> +  dts->last_insn = insn;
> +  dts->defs.safe_push (new_reg);
> +  dts->count++;
> +}
> +
>  /* Initialize the variable expansions in loop preheader.  PLACE is the
>     loop-preheader basic block where the initialization of the
>     expansions should take place.  The expansions are initialized with
> @@ -2011,6 +2283,7 @@ apply_opt_in_copies (struct opt_info *opt_info,
>    rtx_insn *insn, *orig_insn, *next;
>    struct iv_to_split ivts_templ, *ivts;
>    struct var_to_expand ve_templ, *ves;
> +  struct def_to_split dts_templ, *dts;
>
>    /* Sanity check -- we need to put initialization in the original loop
>       body.  */
> @@ -2051,8 +2324,9 @@ apply_opt_in_copies (struct opt_info *opt_info,
>
>            ivts_templ.insn = orig_insn;
>            ve_templ.insn = orig_insn;
> +         dts_templ.insn = orig_insn;
>
> -          /* Apply splitting iv optimization.  */
> +         /* Apply splitting iv optimization.  */
>            if (opt_info->insns_to_split)
>              {
>               maybe_strip_eq_note_for_split_iv (opt_info, insn);
> @@ -2081,6 +2355,19 @@ apply_opt_in_copies (struct opt_info *opt_info,
>                    expand_var_during_unrolling (ves, insn);
>                  }
>              }
> +
> +         if (unrolling && opt_info->insns_def_split)
> +           {
> +             dts = (struct def_to_split *) opt_info->insns_def_split->find (
> +               &dts_templ);
> +             if (dts)
> +               {
> +                 gcc_assert (GET_CODE (PATTERN (insn))
> +                             == GET_CODE (PATTERN (orig_insn)));
> +                 split_def_during_unrolling (dts, insn);
> +               }
> +           }
> +
>            orig_insn = NEXT_INSN (orig_insn);
>          }
>      }
> @@ -2135,9 +2422,14 @@ apply_opt_in_copies (struct opt_info *opt_info,
>                    continue;
>                  }
>              }
> -
>          }
>      }
> +
> +  if (unrolling && opt_info->insns_def_split)
> +    {
> +      prepare_to_update_orignal_body (opt_info->def_to_split_head);
> +      do_additional_update (opt_info->def_to_split_head);
> +    }
>  }
>
>  /* Release OPT_INFO.  */
> @@ -2156,5 +2448,16 @@ free_opt_info (struct opt_info *opt_info)
>        delete opt_info->insns_with_var_to_expand;
>        opt_info->insns_with_var_to_expand = NULL;
>      }
> +
> +  if (opt_info->insns_def_split)
> +    {
> +      struct def_to_split *dts;
> +
> +      for (dts = opt_info->def_to_split_head; dts; dts = dts->next)
> +       dts->defs.release ();
> +      delete opt_info->insns_def_split;
> +      opt_info->insns_def_split = NULL;
> +    }
> +
>    free (opt_info);
>  }
> --
> 2.17.1
>

Reply via email to