Hi, As you may know, we have a loop unroll pass in RTL which was introduced a few years ago and has worked for a long time. Currently, this unroller reuses the pseudos from the original loop body, so those pseudos are written multiple times.
It would be a good idea to create new pseudos for those duplicated instructions during loop unrolling. This would relax register dependences and then provide more freedom/opportunity for other optimizations, like scheduling, RA... See the example below. Original loop: ``` 26: L26: 17: NOTE_INSN_BASIC_BLOCK 4 18: r130:SF=[r125:DI+r122:DI] 19: r131:SF=[r126:DI+r122:DI] 20: r129:SF=r130:SF*r131:SF 21: r132:DF=float_extend(r129:SF) 22: r121:DF=r121:DF+r132:DF 23: r133:SI=r123:DI#0-0x1 24: r123:DI=zero_extend(r133:SI) 25: r122:DI=r122:DI+0x4 27: r134:CC=cmp(r123:DI,0) 28: pc={(r134:CC!=0)?L49:pc} ; pc falls through to BB 5 49: L49: 48: NOTE_INSN_BASIC_BLOCK 8 ; pc falls through to BB 4 ``` It was unrolled to: ``` 26: L26: 17: NOTE_INSN_BASIC_BLOCK 4 18: r130:SF=[r125:DI+r122:DI] 19: r131:SF=[r126:DI+r122:DI] 20: r129:SF=r130:SF*r131:SF 21: r132:DF=float_extend(r129:SF) 93: r145:DF=r121:DF+r132:DF 23: r133:SI=r123:DI#0-0x1 95: r148:DI=zero_extend(r133:SI) 91: r140:DI=r122:DI+0x4 25: r122:DI=r140:DI 97: r134:CC=cmp(r148:DI,0) 87: NOTE_INSN_BASIC_BLOCK 16 77: r130:SF=[r125:DI+r122:DI] 78: r131:SF=[r126:DI+r122:DI] 79: r129:SF=r130:SF*r131:SF 80: r132:DF=float_extend(r129:SF) 81: r121:DF=r121:DF+r132:DF 82: r133:SI=r123:DI#0-0x1 83: r123:DI=zero_extend(r133:SI) 84: r122:DI=r140:DI+0x4 85: r134:CC=cmp(r123:DI,0) 86: pc={(r134:CC!=0)?L90:pc} ``` We can see that the original loop uses r130, r131, r129, r132, r121... and the unrolled sequence in BB 16 is still using them. 
We could use new pseudos for the duplicated body, like below: ``` 26: L26: 17: NOTE_INSN_BASIC_BLOCK 4 18: r130:SF=[r125:DI+r122:DI] 19: r131:SF=[r126:DI+r122:DI] 20: r129:SF=r130:SF*r131:SF 21: r132:DF=float_extend(r129:SF) 93: r145:DF=r121:DF+r132:DF 23: r133:SI=r123:DI#0-0x1 95: r148:DI=zero_extend(r133:SI) 91: r140:DI=r122:DI+0x4 25: r122:DI=r140:DI 97: r134:CC=cmp(r148:DI,0) 87: NOTE_INSN_BASIC_BLOCK 16 77: r141:SF=[r125:DI+r122:DI] 78: r142:SF=[r126:DI+r122:DI] 79: r143:SF=r141:SF*r142:SF 80: r144:DF=float_extend(r143:SF) 81: r146:DF=r145:DF+r144:DF 82: r147:SI=r148:DI#0-0x1 83: r149:DI=zero_extend(r147:SI) 84: r122:DI=r140:DI+0x4 85: r150:CC=cmp(r149:DI,0) 98: r130:SF=r141:SF 99: r131:SF=r142:SF 100: r129:SF=r143:SF 101: r132:DF=r144:DF 102: r121:DF=r146:DF 103: r133:SI=r147:SI 104: r123:DI=r149:DI 105: r134:CC=r150:CC 86: pc={(r150:CC!=0)?L90:pc} ``` In BB 16, new pseudos r141, r142, r143... are used. For this, I drafted a prototype like the patch below. This patch only supports simple loops: one exit edge with one major basic block. With this patch, we can see that the generated asm code uses fewer instructions on PowerPC. Comments are welcome, thanks! BR. Jiufu Guo. --- gcc/common.opt | 4 + gcc/loop-unroll.c | 309 +++++++++++++++++++++++++++++++++++++++++++++- 2 files changed, 310 insertions(+), 3 deletions(-) diff --git a/gcc/common.opt b/gcc/common.opt index fa9da505fc2..e298ce09c82 100644 --- a/gcc/common.opt +++ b/gcc/common.opt @@ -2526,6 +2526,10 @@ fvariable-expansion-in-unroller Common Report Var(flag_variable_expansion_in_unroller) Optimization Apply variable expansion when loops are unrolled. +fassign-new-operand-in-unroller +Common Report Var(flag_assign_new_operand_in_unroller) Init(1) Optimization +Create new pseudos for duplicated instructions when loops are unrolled. + fstack-check= Common Report RejectNegative Joined Optimization -fstack-check=[no|generic|specific] Insert stack checking code into the program. 
diff --git a/gcc/loop-unroll.c b/gcc/loop-unroll.c index 693c7768868..54a48bd9fd7 100644 --- a/gcc/loop-unroll.c +++ b/gcc/loop-unroll.c @@ -94,6 +94,17 @@ struct var_to_expand var_expansions[REUSE_EXPANSION - 1]. */ }; +struct def_to_split +{ + rtx_insn *insn; /* The insn in that the assigment occurs. */ + rtx reg; /* The def/set pseudo. */ + vec<rtx> defs; /* The copies of the defs which is split to. */ + struct def_to_split *next; /* Next entry in walking order. */ + int count; /* Number of DEFS. */ + int use_before_def; /* The pseudo is used before re-defined. */ + rtx_insn *last_insn; +}; + /* Hashtable helper for iv_to_split. */ struct iv_split_hasher : free_ptr_hash <iv_to_split> @@ -143,6 +154,30 @@ var_expand_hasher::equal (const var_to_expand *i1, const var_to_expand *i2) return i1->insn == i2->insn; } +/* Hashtable helper for iv_to_split. */ + +struct def_split_hasher : free_ptr_hash<def_to_split> +{ + static inline hashval_t hash (const def_to_split *); + static inline bool equal (const def_to_split *, const def_to_split *); +}; + +/* Return a hash for VES. */ + +inline hashval_t +def_split_hasher::hash (const def_to_split *i) +{ + return (hashval_t) INSN_UID (i->insn); +} + +/* Return true if I1 and I2 refer to the same instruction. */ + +inline bool +def_split_hasher::equal (const def_to_split *i1, const def_to_split *i2) +{ + return i1->insn == i2->insn; +} + /* Information about optimization applied in the unrolled loop. */ @@ -156,10 +191,17 @@ struct opt_info insns with accumulators to expand. */ struct var_to_expand *var_to_expand_head; /* The first var to expand. */ struct var_to_expand **var_to_expand_tail; /* Pointer to the tail of the list. */ + + hash_table<def_split_hasher> *insns_def_split; /* A hashtable of insns to + split def. */ + struct def_to_split *def_to_split_head; /* The def to split. */ + struct def_to_split **def_to_split_tail; /* Pointer to the tail of the list. 
*/ + unsigned first_new_block; /* The first basic block that was duplicated. */ basic_block loop_exit; /* The loop exit basic block. */ basic_block loop_preheader; /* The loop preheader basic block. */ + }; static void decide_unroll_stupid (class loop *, int); @@ -183,6 +225,7 @@ static void combine_var_copies_in_loop_exit (struct var_to_expand *, basic_block); static rtx get_expansion (struct var_to_expand *); +static struct def_to_split *analyze_insn_to_split_def (class loop *, rtx_insn *); /* Emit a message summarizing the unroll that will be performed for LOOP, along with the loop's location LOCUS, if appropriate given the dump or -fopt-info settings. */ @@ -898,7 +941,8 @@ unroll_loop_runtime_iterations (class loop *loop) struct opt_info *opt_info = NULL; bool ok; - if (flag_split_ivs_in_unroller + if (flag_assign_new_operand_in_unroller + || flag_split_ivs_in_unroller || flag_variable_expansion_in_unroller) opt_info = analyze_insns_in_loop (loop); @@ -1564,6 +1608,71 @@ analyze_iv_to_split_insn (rtx_insn *insn) return ivts; } +static struct def_to_split * +analyze_insn_to_split_def (class loop *loop, rtx_insn *insn) +{ + rtx set, dest, src; + rtx_insn *ins; + struct def_to_split *dts; + + /* TODO: to be relaxed */ + if (loop->num_nodes != 2) + return NULL; + + FOR_BB_INSNS (loop->latch, ins) + if (INSN_P (ins)) + return NULL; + + set = single_set (insn); + if (!set) + return NULL; + + dest = SET_DEST (set); + src = SET_SRC (set); + + if (!REG_P (dest)) // && !(GET_CODE (dest) == SUBREG && REG_P (SUBREG_REG + // (dest)))) + return NULL; + + if (REGNO (dest) < FIRST_PSEUDO_REGISTER) + return NULL; + + int use_before_set = 0; + basic_block bb = BLOCK_FOR_INSN (insn); + rtx set1; + for (ins = BB_HEAD (bb); ins != insn; ins = NEXT_INSN (ins)) + if (rtx_referenced_p (dest, ins)) + { + set1 = single_set (ins); + if (set1 && rtx_equal_p (SET_DEST (set1), dest)) + return NULL; + use_before_set = 1; + } + + if (!use_before_set && rtx_referenced_p (dest, src)) + 
use_before_set = 1; + + if (dump_file) + { + fprintf (dump_file, + "\n;; Split assignment %s: ", + use_before_set ? "use leads def" : ""); + print_rtl (dump_file, dest); + fprintf (dump_file, "\n"); + } + + /* Record the accumulator to expand. */ + dts = XNEW (struct def_to_split); + dts->insn = insn; + dts->reg = copy_rtx (dest); + dts->defs.create (1); + dts->next = NULL; + dts->count = 0; + dts->use_before_def = use_before_set; + dts->last_insn = NULL; + return dts; +} + /* Determines which of insns in LOOP can be optimized. Return a OPT_INFO struct with the relevant hash tables filled with all insns to be optimized. The FIRST_NEW_BLOCK field @@ -1578,6 +1687,7 @@ analyze_insns_in_loop (class loop *loop) rtx_insn *insn; struct iv_to_split *ivts = NULL; struct var_to_expand *ves = NULL; + struct def_to_split *dts= NULL; iv_to_split **slot1; var_to_expand **slot2; vec<edge> edges = get_loop_exit_edges (loop); @@ -1618,6 +1728,15 @@ analyze_insns_in_loop (class loop *loop) opt_info->var_to_expand_tail = &opt_info->var_to_expand_head; } + if (flag_assign_new_operand_in_unroller && can_apply + && LPT_UNROLL_RUNTIME == loop->lpt_decision.decision) + { + opt_info->insns_def_split + = new hash_table<def_split_hasher> (5 * loop->num_nodes); + opt_info->def_to_split_head = NULL; + opt_info->def_to_split_tail = &opt_info->def_to_split_head; + } + for (i = 0; i < loop->num_nodes; i++) { bb = body[i]; @@ -1653,6 +1772,20 @@ analyze_insns_in_loop (class loop *loop) *opt_info->var_to_expand_tail = ves; opt_info->var_to_expand_tail = &ves->next; } + + dts = NULL; + if (opt_info->insns_def_split && !ves && !ivts) + dts = analyze_insn_to_split_def (loop, insn); + + if (dts) + { + struct def_to_split **slot + = opt_info->insns_def_split->find_slot (dts, INSERT); + gcc_assert (*slot == NULL); + *slot = dts; + *opt_info->def_to_split_tail = dts; + opt_info->def_to_split_tail = &dts->next; + } } } @@ -1840,6 +1973,145 @@ expand_var_during_unrolling (struct var_to_expand *ve, rtx_insn 
*insn) } } +/* Replace INSN with a new duplicate insn. */ +static rtx_insn * +replace_with_new_insn (rtx_insn *insn) +{ + rtx_insn *seq; + start_sequence (); + seq = duplicate_insn_chain (insn, insn); + gcc_assert (seq == get_insns ()); + end_sequence (); + emit_insn_before (seq, insn); + delete_insn (insn); + return seq; +} + +/* For the original body, those insns are not able to be updated directly. Then, + * duplicate new ones for updating. */ + +static void +prepare_to_update_orignal_body (struct def_to_split *head) +{ + struct def_to_split *dts; + rtx_insn *insn; + vec<rtx_insn *> new_insn_list; + + /* For all those defs which are used before being defined, replace with new + * ones for update. */ + new_insn_list.create (1); + for (dts = head; dts; dts = dts->next) + if (dts->use_before_def) + { + dts->insn = replace_with_new_insn (dts->insn); + new_insn_list.safe_push (dts->insn); + } + + /* For all those insns which are using those defs, replace with new ones for + * update. */ + for (dts = head; dts; dts = dts->next) + if (dts->use_before_def) + for (insn = NEXT_INSN (dts->insn); + insn != NEXT_INSN (BB_END (BLOCK_FOR_INSN (dts->insn))); + insn = NEXT_INSN (insn)) + if (rtx_referenced_p (dts->reg, insn) && !new_insn_list.contains (insn)) + { + insn = replace_with_new_insn (insn); + new_insn_list.safe_push (insn); + } + + new_insn_list.release (); +} + +/* Replace occurrences of OLD_REG with NEW_REG between START and END. */ +static void +replace_regs (rtx_insn *start, rtx_insn *end, rtx old_reg, rtx new_reg) +{ + rtx_insn *insn; + for (insn = start; insn != NEXT_INSN (end); insn = NEXT_INSN (insn)) + if (rtx_referenced_p (old_reg, insn)) + validate_replace_rtx_group (old_reg, new_reg, insn); + + apply_change_group (); +} + +/* Update the pseudos in the original blocks, and assign them back in the last + * block. 
*/ +static void +do_additional_update (struct def_to_split *head) +{ + rtx_insn *insn; + rtx new_reg; + basic_block bb; + struct def_to_split *dts; + + for (dts = head; dts; dts = dts->next) + { + if (dts->use_before_def) + { + gcc_assert (dts->last_insn); + gcc_assert (dts->count > 1); + + /* Update the insns in original body. */ + bb = BLOCK_FOR_INSN (dts->insn); + new_reg = dts->defs[0]; + SET_DEST (single_set (dts->insn)) = new_reg; + + replace_regs (NEXT_INSN (dts->insn), BB_END (bb), dts->reg, new_reg); + } + + /* In last BB, assign back to orignal reg. */ + start_sequence (); + emit_move_insn (dts->reg, SET_DEST (single_set (dts->last_insn))); + insn = get_insns (); + end_sequence (); + emit_insn_after (insn, dts->last_insn); + } +} + +/* Replace the DTS->REG with new split reg for INSN's block. If the DTS->REG is + * referenced before set, Those usage which leading INSN is replaced with + * previous define/set. */ +static void +split_def_during_unrolling (struct def_to_split *dts, rtx_insn *insn) +{ + rtx new_reg, new_reg0, set; + basic_block bb = BLOCK_FOR_INSN (insn); + + if (dts->use_before_def) + { + if (dts->count == 0) + { + dts->defs.safe_push (gen_reg_rtx (GET_MODE (dts->reg))); + dts->count++; + } + + /* Replace the usage of the define reg with new reg0 for those insns which + * are leading the define, include the usage of the define. */ + new_reg0 = dts->defs[dts->count - 1]; + replace_regs (BB_HEAD (bb), insn, dts->reg, new_reg0); + + /* Replace the define reg with new reg, for the define and those usages + * which following the define. */ + set = single_set (insn); + gcc_assert (set); + new_reg = gen_reg_rtx (GET_MODE (dts->reg)); + SET_DEST (single_set (insn)) = new_reg; + + replace_regs (NEXT_INSN (insn), BB_END (bb), dts->reg, new_reg); + } + else + { + new_reg = gen_reg_rtx (GET_MODE (dts->reg)); + replace_regs (insn, BB_END (bb), dts->reg, new_reg); + } + + /* Record the insn in last block. 
*/ + dts->last_insn = insn; + dts->defs.safe_push (new_reg); + dts->count++; +} + /* Initialize the variable expansions in loop preheader. PLACE is the loop-preheader basic block where the initialization of the expansions should take place. The expansions are initialized with @@ -2011,6 +2283,7 @@ apply_opt_in_copies (struct opt_info *opt_info, rtx_insn *insn, *orig_insn, *next; struct iv_to_split ivts_templ, *ivts; struct var_to_expand ve_templ, *ves; + struct def_to_split dts_templ, *dts; /* Sanity check -- we need to put initialization in the original loop body. */ @@ -2051,8 +2324,9 @@ apply_opt_in_copies (struct opt_info *opt_info, ivts_templ.insn = orig_insn; ve_templ.insn = orig_insn; + dts_templ.insn = orig_insn; - /* Apply splitting iv optimization. */ + /* Apply splitting iv optimization. */ if (opt_info->insns_to_split) { maybe_strip_eq_note_for_split_iv (opt_info, insn); @@ -2081,6 +2355,19 @@ apply_opt_in_copies (struct opt_info *opt_info, expand_var_during_unrolling (ves, insn); } } + + if (unrolling && opt_info->insns_def_split) + { + dts = (struct def_to_split *) opt_info->insns_def_split->find ( + &dts_templ); + if (dts) + { + gcc_assert (GET_CODE (PATTERN (insn)) + == GET_CODE (PATTERN (orig_insn))); + split_def_during_unrolling (dts, insn); + } + } + orig_insn = NEXT_INSN (orig_insn); } } @@ -2135,9 +2422,14 @@ apply_opt_in_copies (struct opt_info *opt_info, continue; } } - } } + + if (unrolling && opt_info->insns_def_split) + { + prepare_to_update_orignal_body (opt_info->def_to_split_head); + do_additional_update (opt_info->def_to_split_head); + } } /* Release OPT_INFO. 
*/ @@ -2156,5 +2448,16 @@ free_opt_info (struct opt_info *opt_info) delete opt_info->insns_with_var_to_expand; opt_info->insns_with_var_to_expand = NULL; } + + if (opt_info->insns_def_split) + { + struct def_to_split *dts; + + for (dts = opt_info->def_to_split_head; dts; dts = dts->next) + dts->defs.release (); + delete opt_info->insns_def_split; + opt_info->insns_def_split = NULL; + } + free (opt_info); } -- 2.17.1