Hi, As you may know, we have a loop unroll pass in RTL which was introduced a few years ago and has worked for a long time. Currently, this unroller reuses the pseudos from the original loop body, so those pseudos are written multiple times.
It would be a good idea to create new pseudos for those duplicated instructions during loop unrolling. This would relax register dependences and then provide more freedom/opportunity for other optimizations, like scheduling, RA... See the example below. Original loop: ``` 26: L26: 17: NOTE_INSN_BASIC_BLOCK 4 18: r130:SF=[r125:DI+r122:DI] 19: r131:SF=[r126:DI+r122:DI] 20: r129:SF=r130:SF*r131:SF 21: r132:DF=float_extend(r129:SF) 22: r121:DF=r121:DF+r132:DF 23: r133:SI=r123:DI#0-0x1 24: r123:DI=zero_extend(r133:SI) 25: r122:DI=r122:DI+0x4 27: r134:CC=cmp(r123:DI,0) 28: pc={(r134:CC!=0)?L49:pc} ; pc falls through to BB 5 49: L49: 48: NOTE_INSN_BASIC_BLOCK 8 ; pc falls through to BB 4 ``` It was unrolled to: ``` 26: L26: 17: NOTE_INSN_BASIC_BLOCK 4 18: r130:SF=[r125:DI+r122:DI] 19: r131:SF=[r126:DI+r122:DI] 20: r129:SF=r130:SF*r131:SF 21: r132:DF=float_extend(r129:SF) 93: r145:DF=r121:DF+r132:DF 23: r133:SI=r123:DI#0-0x1 95: r148:DI=zero_extend(r133:SI) 91: r140:DI=r122:DI+0x4 25: r122:DI=r140:DI 97: r134:CC=cmp(r148:DI,0) 87: NOTE_INSN_BASIC_BLOCK 16 77: r130:SF=[r125:DI+r122:DI] 78: r131:SF=[r126:DI+r122:DI] 79: r129:SF=r130:SF*r131:SF 80: r132:DF=float_extend(r129:SF) 81: r121:DF=r121:DF+r132:DF 82: r133:SI=r123:DI#0-0x1 83: r123:DI=zero_extend(r133:SI) 84: r122:DI=r140:DI+0x4 85: r134:CC=cmp(r123:DI,0) 86: pc={(r134:CC!=0)?L90:pc} ``` We can see that the original loop uses r130, r131, r129, r132, r121... and the unrolled sequence in BB 16 is still using them. 
We could use new pseudos for the duplicated body, like below: ``` 26: L26: 17: NOTE_INSN_BASIC_BLOCK 4 18: r130:SF=[r125:DI+r122:DI] 19: r131:SF=[r126:DI+r122:DI] 20: r129:SF=r130:SF*r131:SF 21: r132:DF=float_extend(r129:SF) 93: r145:DF=r121:DF+r132:DF 23: r133:SI=r123:DI#0-0x1 95: r148:DI=zero_extend(r133:SI) 91: r140:DI=r122:DI+0x4 25: r122:DI=r140:DI 97: r134:CC=cmp(r148:DI,0) 87: NOTE_INSN_BASIC_BLOCK 16 77: r141:SF=[r125:DI+r122:DI] 78: r142:SF=[r126:DI+r122:DI] 79: r143:SF=r141:SF*r142:SF 80: r144:DF=float_extend(r143:SF) 81: r146:DF=r145:DF+r144:DF 82: r147:SI=r148:DI#0-0x1 83: r149:DI=zero_extend(r147:SI) 84: r122:DI=r140:DI+0x4 85: r150:CC=cmp(r149:DI,0) 98: r130:SF=r141:SF 99: r131:SF=r142:SF 100: r129:SF=r143:SF 101: r132:DF=r144:DF 102: r121:DF=r146:DF 103: r133:SI=r147:SI 104: r123:DI=r149:DI 105: r134:CC=r150:CC 86: pc={(r150:CC!=0)?L90:pc} ``` In BB 16, new pseudos r141, r142, r143... are used. For this, I drafted a prototype like the patch below. This patch only supports simple loops: one exit edge with one major basic block. With this patch, we can see that the generated asm code uses fewer instructions on PowerPC. Comments are welcome, thanks! BR. Jiufu Guo. --- gcc/common.opt | 4 + gcc/loop-unroll.c | 309 +++++++++++++++++++++++++++++++++++++++++++++- 2 files changed, 310 insertions(+), 3 deletions(-) diff --git a/gcc/common.opt b/gcc/common.opt index fa9da505fc2..e298ce09c82 100644 --- a/gcc/common.opt +++ b/gcc/common.opt @@ -2526,6 +2526,10 @@ fvariable-expansion-in-unroller Common Report Var(flag_variable_expansion_in_unroller) Optimization Apply variable expansion when loops are unrolled. +fassign-new-operand-in-unroller +Common Report Var(flag_assign_new_operand_in_unroller) Init(1) Optimization +Create new pseudos for duplicated instructions when loops are unrolled. + fstack-check= Common Report RejectNegative Joined Optimization -fstack-check=[no|generic|specific] Insert stack checking code into the program. 
diff --git a/gcc/loop-unroll.c b/gcc/loop-unroll.c index 693c7768868..54a48bd9fd7 100644 --- a/gcc/loop-unroll.c +++ b/gcc/loop-unroll.c @@ -94,6 +94,17 @@ struct var_to_expand var_expansions[REUSE_EXPANSION - 1]. */ }; +struct def_to_split +{ + rtx_insn *insn; /* The insn in that the assigment occurs. */ + rtx reg; /* The def/set pseudo. */ + vec<rtx> defs; /* The copies of the defs which is split to. */ + struct def_to_split *next; /* Next entry in walking order. */ + int count; /* Number of DEFS. */ + int use_before_def; /* The pseudo is used before re-defined. */ + rtx_insn *last_insn; +}; + /* Hashtable helper for iv_to_split. */ struct iv_split_hasher : free_ptr_hash <iv_to_split> @@ -143,6 +154,30 @@ var_expand_hasher::equal (const var_to_expand *i1, const var_to_expand *i2) return i1->insn == i2->insn; } +/* Hashtable helper for iv_to_split. */ + +struct def_split_hasher : free_ptr_hash<def_to_split> +{ + static inline hashval_t hash (const def_to_split *); + static inline bool equal (const def_to_split *, const def_to_split *); +}; + +/* Return a hash for VES. */ + +inline hashval_t +def_split_hasher::hash (const def_to_split *i) +{ + return (hashval_t) INSN_UID (i->insn); +} + +/* Return true if I1 and I2 refer to the same instruction. */ + +inline bool +def_split_hasher::equal (const def_to_split *i1, const def_to_split *i2) +{ + return i1->insn == i2->insn; +} + /* Information about optimization applied in the unrolled loop. */ @@ -156,10 +191,17 @@ struct opt_info insns with accumulators to expand. */ struct var_to_expand *var_to_expand_head; /* The first var to expand. */ struct var_to_expand **var_to_expand_tail; /* Pointer to the tail of the list. */ + + hash_table<def_split_hasher> *insns_def_split; /* A hashtable of insns to + split def. */ + struct def_to_split *def_to_split_head; /* The def to split. */ + struct def_to_split **def_to_split_tail; /* Pointer to the tail of the list. 
*/ + unsigned first_new_block; /* The first basic block that was duplicated. */ basic_block loop_exit; /* The loop exit basic block. */ basic_block loop_preheader; /* The loop preheader basic block. */ + }; static void decide_unroll_stupid (class loop *, int); @@ -183,6 +225,7 @@ static void combine_var_copies_in_loop_exit (struct var_to_expand *, basic_block); static rtx get_expansion (struct var_to_expand *); +static struct def_to_split *analyze_insn_to_split_def (class loop *, rtx_insn *); /* Emit a message summarizing the unroll that will be performed for LOOP, along with the loop's location LOCUS, if appropriate given the dump or -fopt-info settings. */ @@ -898,7 +941,8 @@ unroll_loop_runtime_iterations (class loop *loop) struct opt_info *opt_info = NULL; bool ok; - if (flag_split_ivs_in_unroller + if (flag_assign_new_operand_in_unroller + || flag_split_ivs_in_unroller || flag_variable_expansion_in_unroller) opt_info = analyze_insns_in_loop (loop); @@ -1564,6 +1608,71 @@ analyze_iv_to_split_insn (rtx_insn *insn) return ivts; } +static struct def_to_split * +analyze_insn_to_split_def (class loop *loop, rtx_insn *insn) +{ + rtx set, dest, src; + rtx_insn *ins; + struct def_to_split *dts; + + /* TODO: to be relaxed */ + if (loop->num_nodes != 2) + return NULL; + + FOR_BB_INSNS (loop->latch, ins) + if (INSN_P (ins)) + return NULL; + + set = single_set (insn); + if (!set) + return NULL; + + dest = SET_DEST (set); + src = SET_SRC (set); + + if (!REG_P (dest)) // && !(GET_CODE (dest) == SUBREG && REG_P (SUBREG_REG + // (dest)))) + return NULL; + + if (REGNO (dest) < FIRST_PSEUDO_REGISTER) + return NULL; + + int use_before_set = 0; + basic_block bb = BLOCK_FOR_INSN (insn); + rtx set1; + for (ins = BB_HEAD (bb); ins != insn; ins = NEXT_INSN (ins)) + if (rtx_referenced_p (dest, ins)) + { + set1 = single_set (ins); + if (set1 && rtx_equal_p (SET_DEST (set1), dest)) + return NULL; + use_before_set = 1; + } + + if (!use_before_set && rtx_referenced_p (dest, src)) + 
use_before_set = 1; + + if (dump_file) + { + fprintf (dump_file, + "\n;; Split assignment %s: ", + use_before_set ? "use leads def" : ""); + print_rtl (dump_file, dest); + fprintf (dump_file, "\n"); + } + + /* Record the accumulator to expand. */ + dts = XNEW (struct def_to_split); + dts->insn = insn; + dts->reg = copy_rtx (dest); + dts->defs.create (1); + dts->next = NULL; + dts->count = 0; + dts->use_before_def = use_before_set; + dts->last_insn = NULL; + return dts; +} + /* Determines which of insns in LOOP can be optimized. Return a OPT_INFO struct with the relevant hash tables filled with all insns to be optimized. The FIRST_NEW_BLOCK field @@ -1578,6 +1687,7 @@ analyze_insns_in_loop (class loop *loop) rtx_insn *insn; struct iv_to_split *ivts = NULL; struct var_to_expand *ves = NULL; + struct def_to_split *dts= NULL; iv_to_split **slot1; var_to_expand **slot2; vec<edge> edges = get_loop_exit_edges (loop); @@ -1618,6 +1728,15 @@ analyze_insns_in_loop (class loop *loop) opt_info->var_to_expand_tail = &opt_info->var_to_expand_head; } + if (flag_assign_new_operand_in_unroller && can_apply + && LPT_UNROLL_RUNTIME == loop->lpt_decision.decision) + { + opt_info->insns_def_split + = new hash_table<def_split_hasher> (5 * loop->num_nodes); + opt_info->def_to_split_head = NULL; + opt_info->def_to_split_tail = &opt_info->def_to_split_head; + } + for (i = 0; i < loop->num_nodes; i++) { bb = body[i]; @@ -1653,6 +1772,20 @@ analyze_insns_in_loop (class loop *loop) *opt_info->var_to_expand_tail = ves; opt_info->var_to_expand_tail = &ves->next; } + + dts = NULL; + if (opt_info->insns_def_split && !ves && !ivts) + dts = analyze_insn_to_split_def (loop, insn); + + if (dts) + { + struct def_to_split **slot + = opt_info->insns_def_split->find_slot (dts, INSERT); + gcc_assert (*slot == NULL); + *slot = dts; + *opt_info->def_to_split_tail = dts; + opt_info->def_to_split_tail = &dts->next; + } } } @@ -1840,6 +1973,145 @@ expand_var_during_unrolling (struct var_to_expand *ve, rtx_insn 
*insn) } } +/* Replace INSN with a new duplicate insn. */ +static rtx_insn * +replace_with_new_insn (rtx_insn *insn) +{ + rtx_insn *seq; + start_sequence (); + seq = duplicate_insn_chain (insn, insn); + gcc_assert (seq == get_insns ()); + end_sequence (); + emit_insn_before (seq, insn); + delete_insn (insn); + return seq; +} + +/* For the original body, those insns are not able to be updated directly. Then, + * duplicate new ones for updating. */ + +static void +prepare_to_update_orignal_body (struct def_to_split *head) +{ + struct def_to_split *dts; + rtx_insn *insn; + vec<rtx_insn *> new_insn_list; + + /* For all those defs which are used before being defined, replace with new + * ones for update. */ + new_insn_list.create (1); + for (dts = head; dts; dts = dts->next) + if (dts->use_before_def) + { + dts->insn = replace_with_new_insn (dts->insn); + new_insn_list.safe_push (dts->insn); + } + + /* For all those insns which are using those defs, replace with new ones for + * update. */ + for (dts = head; dts; dts = dts->next) + if (dts->use_before_def) + for (insn = NEXT_INSN (dts->insn); + insn != NEXT_INSN (BB_END (BLOCK_FOR_INSN (dts->insn))); + insn = NEXT_INSN (insn)) + if (rtx_referenced_p (dts->reg, insn) && !new_insn_list.contains (insn)) + { + insn = replace_with_new_insn (insn); + new_insn_list.safe_push (insn); + } + + new_insn_list.release (); +} + +/* Replace occurrences of OLD_REG with NEW_REG between START and END. */ +static void +replace_regs (rtx_insn *start, rtx_insn *end, rtx old_reg, rtx new_reg) +{ + rtx_insn *insn; + for (insn = start; insn != NEXT_INSN (end); insn = NEXT_INSN (insn)) + if (rtx_referenced_p (old_reg, insn)) + validate_replace_rtx_group (old_reg, new_reg, insn); + + apply_change_group (); +} + +/* Update the pseudos in the original blocks, and assign them back in the last + * block. 
*/ +static void +do_additional_update (struct def_to_split *head) +{ + rtx_insn *insn; + rtx new_reg; + basic_block bb; + struct def_to_split *dts; + + for (dts = head; dts; dts = dts->next) + { + if (dts->use_before_def) + { + gcc_assert (dts->last_insn); + gcc_assert (dts->count > 1); + + /* Update the insns in original body. */ + bb = BLOCK_FOR_INSN (dts->insn); + new_reg = dts->defs[0]; + SET_DEST (single_set (dts->insn)) = new_reg; + + replace_regs (NEXT_INSN (dts->insn), BB_END (bb), dts->reg, new_reg); + } + + /* In last BB, assign back to orignal reg. */ + start_sequence (); + emit_move_insn (dts->reg, SET_DEST (single_set (dts->last_insn))); + insn = get_insns (); + end_sequence (); + emit_insn_after (insn, dts->last_insn); + } +} + +/* Replace the DTS->REG with new split reg for INSN's block. If the DTS->REG is + * referenced before set, Those usage which leading INSN is replaced with + * previous define/set. */ +static void +split_def_during_unrolling (struct def_to_split *dts, rtx_insn *insn) +{ + rtx new_reg, new_reg0, set; + basic_block bb = BLOCK_FOR_INSN (insn); + + if (dts->use_before_def) + { + if (dts->count == 0) + { + dts->defs.safe_push (gen_reg_rtx (GET_MODE (dts->reg))); + dts->count++; + } + + /* Replace the usage of the define reg with new reg0 for those insns which + * are leading the define, include the usage of the define. */ + new_reg0 = dts->defs[dts->count - 1]; + replace_regs (BB_HEAD (bb), insn, dts->reg, new_reg0); + + /* Replace the define reg with new reg, for the define and those usages + * which following the define. */ + set = single_set (insn); + gcc_assert (set); + new_reg = gen_reg_rtx (GET_MODE (dts->reg)); + SET_DEST (single_set (insn)) = new_reg; + + replace_regs (NEXT_INSN (insn), BB_END (bb), dts->reg, new_reg); + } + else + { + new_reg = gen_reg_rtx (GET_MODE (dts->reg)); + replace_regs (insn, BB_END (bb), dts->reg, new_reg); + } + + /* Record the insn in last block. 
*/ + dts->last_insn = insn; + dts->defs.safe_push (new_reg); + dts->count++; +} + /* Initialize the variable expansions in loop preheader. PLACE is the loop-preheader basic block where the initialization of the expansions should take place. The expansions are initialized with @@ -2011,6 +2283,7 @@ apply_opt_in_copies (struct opt_info *opt_info, rtx_insn *insn, *orig_insn, *next; struct iv_to_split ivts_templ, *ivts; struct var_to_expand ve_templ, *ves; + struct def_to_split dts_templ, *dts; /* Sanity check -- we need to put initialization in the original loop body. */ @@ -2051,8 +2324,9 @@ apply_opt_in_copies (struct opt_info *opt_info, ivts_templ.insn = orig_insn; ve_templ.insn = orig_insn; + dts_templ.insn = orig_insn; - /* Apply splitting iv optimization. */ + /* Apply splitting iv optimization. */ if (opt_info->insns_to_split) { maybe_strip_eq_note_for_split_iv (opt_info, insn); @@ -2081,6 +2355,19 @@ apply_opt_in_copies (struct opt_info *opt_info, expand_var_during_unrolling (ves, insn); } } + + if (unrolling && opt_info->insns_def_split) + { + dts = (struct def_to_split *) opt_info->insns_def_split->find ( + &dts_templ); + if (dts) + { + gcc_assert (GET_CODE (PATTERN (insn)) + == GET_CODE (PATTERN (orig_insn))); + split_def_during_unrolling (dts, insn); + } + } + orig_insn = NEXT_INSN (orig_insn); } } @@ -2135,9 +2422,14 @@ apply_opt_in_copies (struct opt_info *opt_info, continue; } } - } } + + if (unrolling && opt_info->insns_def_split) + { + prepare_to_update_orignal_body (opt_info->def_to_split_head); + do_additional_update (opt_info->def_to_split_head); + } } /* Release OPT_INFO. 
*/ @@ -2156,5 +2448,16 @@ free_opt_info (struct opt_info *opt_info) delete opt_info->insns_with_var_to_expand; opt_info->insns_with_var_to_expand = NULL; } + + if (opt_info->insns_def_split) + { + struct def_to_split *dts; + + for (dts = opt_info->def_to_split_head; dts; dts = dts->next) + dts->defs.release (); + delete opt_info->insns_def_split; + opt_info->insns_def_split = NULL; + } + free (opt_info); } -- 2.17.1