On Wed, Apr 15, 2020 at 3:56 AM Jiufu Guo via Gcc-patches <gcc-patches@gcc.gnu.org> wrote: > > Hi, > > As you may know, we have loop unroll pass in RTL which was introduced a few > years ago, and works for a long time. Currently, this unroller is using the > pseudos in the original body, and then the pseudos are written multiple times. > > It would be a good idea to create new pseudos for those duplicated > instructions > during loop unrolling. This would relax reg dependence, and then provide more > freedom/opportunity for other optimizations, like scheduling, RA...
I think there's a separate pass to do something like this, conveniently placed after unrolling. -fweb, IIRC enabled by default for -funroll-loops unless explicitly disabled. Related is regrename which is also enabled then. So where does your patch make a difference? Is the webizers dataflow analysis maybe confused by backedges? > As below example: > > Original loop: > ``` > 26: L26: > 17: NOTE_INSN_BASIC_BLOCK 4 > 18: r130:SF=[r125:DI+r122:DI] > 19: r131:SF=[r126:DI+r122:DI] > 20: r129:SF=r130:SF*r131:SF > 21: r132:DF=float_extend(r129:SF) > 22: r121:DF=r121:DF+r132:DF > 23: r133:SI=r123:DI#0-0x1 > 24: r123:DI=zero_extend(r133:SI) > 25: r122:DI=r122:DI+0x4 > 27: r134:CC=cmp(r123:DI,0) > 28: pc={(r134:CC!=0)?L49:pc} > ; pc falls through to BB 5 > 49: L49: > 48: NOTE_INSN_BASIC_BLOCK 8 > ; pc falls through to BB 4 > ``` > It was unrolled to: > ``` > 26: L26: > 17: NOTE_INSN_BASIC_BLOCK 4 > 18: r130:SF=[r125:DI+r122:DI] > 19: r131:SF=[r126:DI+r122:DI] > 20: r129:SF=r130:SF*r131:SF > 21: r132:DF=float_extend(r129:SF) > 93: r145:DF=r121:DF+r132:DF > 23: r133:SI=r123:DI#0-0x1 > 95: r148:DI=zero_extend(r133:SI) > 91: r140:DI=r122:DI+0x4 > 25: r122:DI=r140:DI > 97: r134:CC=cmp(r148:DI,0) > > 87: NOTE_INSN_BASIC_BLOCK 16 > 77: r130:SF=[r125:DI+r122:DI] > 78: r131:SF=[r126:DI+r122:DI] > 79: r129:SF=r130:SF*r131:SF > 80: r132:DF=float_extend(r129:SF) > 81: r121:DF=r121:DF+r132:DF > 82: r133:SI=r123:DI#0-0x1 > 83: r123:DI=zero_extend(r133:SI) > 84: r122:DI=r140:DI+0x4 > 85: r134:CC=cmp(r123:DI,0) > 86: pc={(r134:CC!=0)?L90:pc} > > ``` > We can see, the original loop is using r130,r131,r129,r132,r121... > For the unrolled sequence BB 16, they are still using them. > > We could use new pseudos for duplicate body, like below: > ``` > 26: L26: > 17: NOTE_INSN_BASIC_BLOCK 4 > 18: r130:SF=[r125:DI+r122:DI] > 19: r131:SF=[r126:DI+r122:DI] > 20: r129:SF=r130:SF*r131:SF > 21: r132:DF=float_extend(r129:SF) > 93: r145:DF=r121:DF+r132:DF > 23: r133:SI=r123:DI#0-0x1 > 95: r148:DI=zero_extend(r133:SI) > 91: r140:DI=r122:DI+0x4 > 25: r122:DI=r140:DI > 97: r134:CC=cmp(r148:DI,0) > > 87: NOTE_INSN_BASIC_BLOCK 16 > 77: r141:SF=[r125:DI+r122:DI] > 78: r142:SF=[r126:DI+r122:DI] > 79: r143:SF=r141:SF*r142:SF > 80: r144:DF=float_extend(r143:SF) > 81: r146:DF=r145:DF+r144:DF > 82: r147:SI=r148:DI#0-0x1 > 83: r149:DI=zero_extend(r147:SI) > 84: r122:DI=r140:DI+0x4 > 85: r150:CC=cmp(r149:DI,0) > 98: r130:SF=r141:SF > 99: r131:SF=r142:SF > 100: r129:SF=r143:SF > 101: r132:DF=r144:DF > 102: r121:DF=r146:DF > 103: r133:SI=r147:SI > 104: r123:DI=r149:DI > 105: r134:CC=r150:CC > 86: pc={(r150:CC!=0)?L90:pc} > ``` > In BB 16, new pseudos: r141,r142,r143... are used. > > For this, I drafted a prototype like the patch below. > This patch only supports simple loops: one exit edge with one major basic > block. > > With this patch, we can see, the generated asm code uses fewer instructions > on PowerPC. > > Welcome for comments, thanks! > > BR. > Jiufu Guo. > > --- > gcc/common.opt | 4 + > gcc/loop-unroll.c | 309 +++++++++++++++++++++++++++++++++++++++++++++- > 2 files changed, 310 insertions(+), 3 deletions(-) > > diff --git a/gcc/common.opt b/gcc/common.opt > index fa9da505fc2..e298ce09c82 100644 > --- a/gcc/common.opt > +++ b/gcc/common.opt > @@ -2526,6 +2526,10 @@ fvariable-expansion-in-unroller > Common Report Var(flag_variable_expansion_in_unroller) Optimization > Apply variable expansion when loops are unrolled. > > +fassin-new-operand-in-unroller > +Common Report Var(flag_assign_new_operand_in_unroller) Init(1) Optimization > +Creating new pseudo for duplicated instruction when loops are unrolled. > + > fstack-check= > Common Report RejectNegative Joined Optimization > -fstack-check=[no|generic|specific] Insert stack checking code into the > program. > diff --git a/gcc/loop-unroll.c b/gcc/loop-unroll.c > index 693c7768868..54a48bd9fd7 100644 > --- a/gcc/loop-unroll.c > +++ b/gcc/loop-unroll.c > @@ -94,6 +94,17 @@ struct var_to_expand > var_expansions[REUSE_EXPANSION - 1]. > */ > }; > > +struct def_to_split > +{ > + rtx_insn *insn; /* The insn in that the assigment occurs. */ > + rtx reg; /* The def/set pseudo. */ > + vec<rtx> defs; /* The copies of the defs which is split to. */ > + struct def_to_split *next; /* Next entry in walking order. */ > + int count; /* Number of DEFS. */ > + int use_before_def; /* The pseudo is used before re-defined. */ > + rtx_insn *last_insn; > +}; > + > /* Hashtable helper for iv_to_split. */ > > struct iv_split_hasher : free_ptr_hash <iv_to_split> > @@ -143,6 +154,30 @@ var_expand_hasher::equal (const var_to_expand *i1, const > var_to_expand *i2) > return i1->insn == i2->insn; > } > > +/* Hashtable helper for iv_to_split. */ > + > +struct def_split_hasher : free_ptr_hash<def_to_split> > +{ > + static inline hashval_t hash (const def_to_split *); > + static inline bool equal (const def_to_split *, const def_to_split *); > +}; > + > +/* Return a hash for VES. */ > + > +inline hashval_t > +def_split_hasher::hash (const def_to_split *i) > +{ > + return (hashval_t) INSN_UID (i->insn); > +} > + > +/* Return true if I1 and I2 refer to the same instruction. */ > + > +inline bool > +def_split_hasher::equal (const def_to_split *i1, const def_to_split *i2) > +{ > + return i1->insn == i2->insn; > +} > + > /* Information about optimization applied in > the unrolled loop. */ > > @@ -156,10 +191,17 @@ struct opt_info > insns with accumulators to expand. */ > struct var_to_expand *var_to_expand_head; /* The first var to expand. */ > struct var_to_expand **var_to_expand_tail; /* Pointer to the tail of the > list. */ > + > + hash_table<def_split_hasher> *insns_def_split; /* A hashtable of insns to > + split def. */ > + struct def_to_split *def_to_split_head; /* The def to split. */ > + struct def_to_split **def_to_split_tail; /* Pointer to the tail of the > list. */ > + > unsigned first_new_block; /* The first basic block that was > duplicated. */ > basic_block loop_exit; /* The loop exit basic block. */ > basic_block loop_preheader; /* The loop preheader basic block. */ > + > }; > > static void decide_unroll_stupid (class loop *, int); > @@ -183,6 +225,7 @@ static void combine_var_copies_in_loop_exit (struct > var_to_expand *, > basic_block); > static rtx get_expansion (struct var_to_expand *); > > +static struct def_to_split *analyze_insn_to_split_def (class loop *, > rtx_insn *); > /* Emit a message summarizing the unroll that will be > performed for LOOP, along with the loop's location LOCUS, if > appropriate given the dump or -fopt-info settings. */ > @@ -898,7 +941,8 @@ unroll_loop_runtime_iterations (class loop *loop) > struct opt_info *opt_info = NULL; > bool ok; > > - if (flag_split_ivs_in_unroller > + if (flag_assign_new_operand_in_unroller > + || flag_split_ivs_in_unroller > || flag_variable_expansion_in_unroller) > opt_info = analyze_insns_in_loop (loop); > > @@ -1564,6 +1608,71 @@ analyze_iv_to_split_insn (rtx_insn *insn) > return ivts; > } > > +static struct def_to_split * > +analyze_insn_to_split_def (class loop *loop, rtx_insn *insn) > +{ > + rtx set, dest, src; > + rtx_insn *ins; > + struct def_to_split *dts; > + > + /* TODO: to be relaxed */ > + if (loop->num_nodes != 2) > + return NULL; > + > + FOR_BB_INSNS (loop->latch, ins) > + if (INSN_P (ins)) > + return NULL; > + > + set = single_set (insn); > + if (!set) > + return NULL; > + > + dest = SET_DEST (set); > + src = SET_SRC (set); > + > + if (!REG_P (dest)) // && !(GET_CODE (dest) == SUBREG && REG_P > (SUBREG_REG > + // (dest)))) > + return NULL; > + > + if (REGNO (dest) < FIRST_PSEUDO_REGISTER) > + return NULL; > + > + int use_before_set = 0; > + basic_block bb = BLOCK_FOR_INSN (insn); > + rtx set1; > + for (ins = BB_HEAD (bb); ins != insn; ins = NEXT_INSN (ins)) > + if (rtx_referenced_p (dest, ins)) > + { > + set1 = single_set (ins); > + if (set1 && rtx_equal_p (SET_DEST (set1), dest)) > + return NULL; > + use_before_set = 1; > + } > + > + if (!use_before_set && rtx_referenced_p (dest, src)) > + use_before_set = 1; > + > + if (dump_file) > + { > + fprintf (dump_file, > + "\n;; Split assignment %s: ", > + use_before_set ? "use leads def" : ""); > + print_rtl (dump_file, dest); > + fprintf (dump_file, "\n"); > + } > + > + /* Record the accumulator to expand. */ > + dts = XNEW (struct def_to_split); > + dts->insn = insn; > + dts->reg = copy_rtx (dest); > + dts->defs.create (1); > + dts->next = NULL; > + dts->count = 0; > + dts->use_before_def = use_before_set; > + dts->last_insn = NULL; > + return dts; > +} > + > /* Determines which of insns in LOOP can be optimized. > Return a OPT_INFO struct with the relevant hash tables filled > with all insns to be optimized. The FIRST_NEW_BLOCK field > @@ -1578,6 +1687,7 @@ analyze_insns_in_loop (class loop *loop) > rtx_insn *insn; > struct iv_to_split *ivts = NULL; > struct var_to_expand *ves = NULL; > + struct def_to_split *dts= NULL; > iv_to_split **slot1; > var_to_expand **slot2; > vec<edge> edges = get_loop_exit_edges (loop); > @@ -1618,6 +1728,15 @@ analyze_insns_in_loop (class loop *loop) > opt_info->var_to_expand_tail = &opt_info->var_to_expand_head; > } > > + if (flag_assign_new_operand_in_unroller && can_apply > + && LPT_UNROLL_RUNTIME == loop->lpt_decision.decision) > + { > + opt_info->insns_def_split > + = new hash_table<def_split_hasher> (5 * loop->num_nodes); > + opt_info->def_to_split_head = NULL; > + opt_info->def_to_split_tail = &opt_info->def_to_split_head; > + } > + > for (i = 0; i < loop->num_nodes; i++) > { > bb = body[i]; > @@ -1653,6 +1772,20 @@ analyze_insns_in_loop (class loop *loop) > *opt_info->var_to_expand_tail = ves; > opt_info->var_to_expand_tail = &ves->next; > } > + > + dts = NULL; > + if (opt_info->insns_def_split && !ves && !ivts) > + dts = analyze_insn_to_split_def (loop, insn); > + > + if (dts) > + { > + struct def_to_split **slot > + = opt_info->insns_def_split->find_slot (dts, INSERT); > + gcc_assert (*slot == NULL); > + *slot = dts; > + *opt_info->def_to_split_tail = dts; > + opt_info->def_to_split_tail = &dts->next; > + } > } > } > > @@ -1840,6 +1973,145 @@ expand_var_during_unrolling (struct var_to_expand > *ve, rtx_insn *insn) > } > } > > +/* Replace INSN with a new duplicate insn. */ > +static rtx_insn * > +replace_with_new_insn (rtx_insn *insn) > +{ > + rtx_insn *seq; > + start_sequence (); > + seq = duplicate_insn_chain (insn, insn); > + gcc_assert (seq == get_insns ()); > + end_sequence (); > + emit_insn_before (seq, insn); > + delete_insn (insn); > + return seq; > +} > + > +/* For original body, those insn(s) are not able to be update directly. Then, > + * duplicate new one for update. */ > + > +static void > +prepare_to_update_orignal_body (struct def_to_split *head) > +{ > + struct def_to_split *dts; > + rtx_insn *insn; > + vec<rtx_insn *> new_insn_list; > + > + /* For all those defines which is used before define, replace with new one > for > + * update. */ > + new_insn_list.create (1); > + for (dts = head; dts; dts = dts->next) > + if (dts->use_before_def) > + { > + dts->insn = replace_with_new_insn (dts->insn); > + new_insn_list.safe_push (dts->insn); > + } > + > + /* For all those insn which is using those defines, replace with new one > for > + * update. */ > + for (dts = head; dts; dts = dts->next) > + if (dts->use_before_def) > + for (insn = NEXT_INSN (dts->insn); > + insn != NEXT_INSN (BB_END (BLOCK_FOR_INSN (dts->insn))); > + insn = NEXT_INSN (insn)) > + if (rtx_referenced_p (dts->reg, insn) && !new_insn_list.contains > (insn)) > + { > + insn = replace_with_new_insn (insn); > + new_insn_list.safe_push (insn); > + } > + > + new_insn_list.release (); > +} > + > +/* Replace reg which occur between START and END. */ > +static void > +replace_regs (rtx_insn *start, rtx_insn *end, rtx old_reg, rtx new_reg) > +{ > + rtx_insn *insn; > + for (insn = start; insn != NEXT_INSN (end); insn = NEXT_INSN (insn)) > + if (rtx_referenced_p (old_reg, insn)) > + validate_replace_rtx_group (old_reg, new_reg, insn); > + > + apply_change_group (); > +} > + > +/* Update the the pseudo in orignal blocks, and assign back them at the last > + * block. */ > +static void > +do_additional_update (struct def_to_split *head) > +{ > + rtx_insn *insn; > + rtx new_reg; > + basic_block bb; > + struct def_to_split *dts; > + > + for (dts = head; dts; dts = dts->next) > + { > + if (dts->use_before_def) > + { > + gcc_assert (dts->last_insn); > + gcc_assert (dts->count > 1); > + > + /* Update the insns in original body. */ > + bb = BLOCK_FOR_INSN (dts->insn); > + new_reg = dts->defs[0]; > + SET_DEST (single_set (dts->insn)) = new_reg; > + > + replace_regs (NEXT_INSN (dts->insn), BB_END (bb), dts->reg, > new_reg); > + } > + > + /* In last BB, assign back to orignal reg. */ > + start_sequence (); > + emit_move_insn (dts->reg, SET_DEST (single_set (dts->last_insn))); > + insn = get_insns (); > + end_sequence (); > + emit_insn_after (insn, dts->last_insn); > + } > +} > + > +/* Replace the DTS->REG with new split reg for INSN's block. If the DTS->REG > is > + * referenced before set, Those usage which leading INSN is replaced with > + * previous define/set. */ > +static void > +split_def_during_unrolling (struct def_to_split *dts, rtx_insn *insn) > +{ > + rtx new_reg, new_reg0, set; > + basic_block bb = BLOCK_FOR_INSN (insn); > + > + if (dts->use_before_def) > + { > + if (dts->count == 0) > + { > + dts->defs.safe_push (gen_reg_rtx (GET_MODE (dts->reg))); > + dts->count++; > + } > + > + /* Replace the usage of the define reg with new reg0 for those insns > which > + * are leading the define, include the usage of the define. */ > + new_reg0 = dts->defs[dts->count - 1]; > + replace_regs (BB_HEAD (bb), insn, dts->reg, new_reg0); > + > + /* Replace the define reg with new reg, for the define and those usages > + * which following the define. */ > + set = single_set (insn); > + gcc_assert (set); > + new_reg = gen_reg_rtx (GET_MODE (dts->reg)); > + SET_DEST (single_set (insn)) = new_reg; > + > + replace_regs (NEXT_INSN (insn), BB_END (bb), dts->reg, new_reg); > + } > + else > + { > + new_reg = gen_reg_rtx (GET_MODE (dts->reg)); > + replace_regs (insn, BB_END (bb), dts->reg, new_reg); > + } > + > + /* Record the insn in last block. */ > + dts->last_insn = insn; > + dts->defs.safe_push (new_reg); > + dts->count++; > +} > + > /* Initialize the variable expansions in loop preheader. PLACE is the > loop-preheader basic block where the initialization of the > expansions should take place. The expansions are initialized with > @@ -2011,6 +2283,7 @@ apply_opt_in_copies (struct opt_info *opt_info, > rtx_insn *insn, *orig_insn, *next; > struct iv_to_split ivts_templ, *ivts; > struct var_to_expand ve_templ, *ves; > + struct def_to_split dts_templ, *dts; > > /* Sanity check -- we need to put initialization in the original loop > body. */ > @@ -2051,8 +2324,9 @@ apply_opt_in_copies (struct opt_info *opt_info, > > ivts_templ.insn = orig_insn; > ve_templ.insn = orig_insn; > + dts_templ.insn = orig_insn; > > - /* Apply splitting iv optimization. */ > + /* Apply splitting iv optimization. */ > if (opt_info->insns_to_split) > { > maybe_strip_eq_note_for_split_iv (opt_info, insn); > @@ -2081,6 +2355,19 @@ apply_opt_in_copies (struct opt_info *opt_info, > expand_var_during_unrolling (ves, insn); > } > } > + > + if (unrolling && opt_info->insns_def_split) > + { > + dts = (struct def_to_split *) opt_info->insns_def_split->find ( > + &dts_templ); > + if (dts) > + { > + gcc_assert (GET_CODE (PATTERN (insn)) > + == GET_CODE (PATTERN (orig_insn))); > + split_def_during_unrolling (dts, insn); > + } > + } > + > orig_insn = NEXT_INSN (orig_insn); > } > } > @@ -2135,9 +2422,14 @@ apply_opt_in_copies (struct opt_info *opt_info, > continue; > } > } > - > } > } > + > + if (unrolling && opt_info->insns_def_split) > + { > + prepare_to_update_orignal_body (opt_info->def_to_split_head); > + do_additional_update (opt_info->def_to_split_head); > + } > } > > /* Release OPT_INFO. */ > @@ -2156,5 +2448,16 @@ free_opt_info (struct opt_info *opt_info) > delete opt_info->insns_with_var_to_expand; > opt_info->insns_with_var_to_expand = NULL; > } > + > + if (opt_info->insns_def_split) > + { > + struct def_to_split *dts; > + > + for (dts = opt_info->def_to_split_head; dts; dts = dts->next) > + dts->defs.release (); > + delete opt_info->insns_def_split; > + opt_info->insns_def_split = NULL; > + } > + > free (opt_info); > } > -- > 2.17.1 >