> > +rewrite_expr_tree_parallel (gassign *stmt, int width, bool has_fma, > > + const vec<operand_entry *> > > +&ops) > > { > > enum tree_code opcode = gimple_assign_rhs_code (stmt); > > int op_num = ops.length (); > > @@ -5483,10 +5494,11 @@ rewrite_expr_tree_parallel (gassign *stmt, int > width, > > int stmt_num = op_num - 1; > > gimple **stmts = XALLOCAVEC (gimple *, stmt_num); > > int op_index = op_num - 1; > > - int stmt_index = 0; > > - int ready_stmts_end = 0; > > - int i = 0; > > - gimple *stmt1 = NULL, *stmt2 = NULL; > > + int width_count = width; > > + int i = 0, j = 0; > > + tree tmp_op[2], op1; > > + operand_entry *oe; > > + gimple *stmt1 = NULL; > > tree last_rhs1 = gimple_assign_rhs1 (stmt); > > > > /* We start expression rewriting from the top statements. > > @@ -5496,91 +5508,84 @@ rewrite_expr_tree_parallel (gassign *stmt, int > width, > > for (i = stmt_num - 2; i >= 0; i--) > > stmts[i] = SSA_NAME_DEF_STMT (gimple_assign_rhs1 (stmts[i+1])); > > > > - for (i = 0; i < stmt_num; i++) > > + /* Build parallel dependency chain according to width. */ for (i > > + = 0; i < width; i++) > > { > > - tree op1, op2; > > - > > - /* Determine whether we should use results of > > - already handled statements or not. */ > > - if (ready_stmts_end == 0 > > - && (i - stmt_index >= width || op_index < 1)) > > - ready_stmts_end = i; > > - > > - /* Now we choose operands for the next statement. Non zero > > - value in ready_stmts_end means here that we should use > > - the result of already generated statements as new operand. */ > > - if (ready_stmts_end > 0) > > - { > > - op1 = gimple_assign_lhs (stmts[stmt_index++]); > > - if (ready_stmts_end > stmt_index) > > - op2 = gimple_assign_lhs (stmts[stmt_index++]); > > - else if (op_index >= 0) > > - { > > - operand_entry *oe = ops[op_index--]; > > - stmt2 = oe->stmt_to_insert; > > - op2 = oe->op; > > - } > > - else > > - { > > - gcc_assert (stmt_index < i); > > - op2 = gimple_assign_lhs (stmts[stmt_index++]); > > - } > > + /* */ > > empty comment?
Added it, thanks. > > > + if (op_index > 1 && !has_fma) > > + swap_ops_for_binary_stmt (ops, op_index - 2); > > > > - if (stmt_index >= ready_stmts_end) > > - ready_stmts_end = 0; > > - } > > - else > > + for (j = 0; j < 2; j++) > > { > > - if (op_index > 1) > > - swap_ops_for_binary_stmt (ops, op_index - 2); > > - operand_entry *oe2 = ops[op_index--]; > > - operand_entry *oe1 = ops[op_index--]; > > - op2 = oe2->op; > > - stmt2 = oe2->stmt_to_insert; > > - op1 = oe1->op; > > - stmt1 = oe1->stmt_to_insert; > > + gcc_assert (op_index >= 0); > > + oe = ops[op_index--]; > > + tmp_op[j] = oe->op; > > + /* If the stmt that defines operand has to be inserted, insert it > > + before the use. */ > > + stmt1 = oe->stmt_to_insert; > > + if (stmt1) > > + insert_stmt_before_use (stmts[i], stmt1); > > + stmt1 = NULL; > > } > > - > > - /* If we emit the last statement then we should put > > - operands into the last statement. It will also > > - break the loop. */ > > - if (op_index < 0 && stmt_index == i) > > - i = stmt_num - 1; > > + stmts[i] = build_and_add_sum (TREE_TYPE (last_rhs1), tmp_op[1], > tmp_op[0], opcode); > > + gimple_set_visited (stmts[i], true); > > > > if (dump_file && (dump_flags & TDF_DETAILS)) > > { > > - fprintf (dump_file, "Transforming "); > > + fprintf (dump_file, " into "); > > print_gimple_stmt (dump_file, stmts[i], 0); > > } > > + } > > > > - /* If the stmt that defines operand has to be inserted, insert it > > - before the use. */ > > - if (stmt1) > > - insert_stmt_before_use (stmts[i], stmt1); > > - if (stmt2) > > - insert_stmt_before_use (stmts[i], stmt2); > > - stmt1 = stmt2 = NULL; > > - > > - /* We keep original statement only for the last one. All > > - others are recreated. */ > > - if (i == stmt_num - 1) > > + for (i = width; i < stmt_num; i++) > > + { > > + /* We keep original statement only for the last one. All others are > > + recreated. */ > > + if ( op_index < 0) > > { > > - gimple_assign_set_rhs1 (stmts[i], op1); > > - gimple_assign_set_rhs2 (stmts[i], op2); > > - update_stmt (stmts[i]); > > + if (width_count == 2) > > + { > > + > > + /* We keep original statement only for the last one. All > > + others are recreated. */ > > + gimple_assign_set_rhs1 (stmts[i], gimple_assign_lhs > > (stmts[i-1])); > > + gimple_assign_set_rhs2 (stmts[i], gimple_assign_lhs > > (stmts[i-2])); > > + update_stmt (stmts[i]); > > + } > > + else > > + { > > + > > + stmts[i] = > > + build_and_add_sum (TREE_TYPE (last_rhs1), > > + gimple_assign_lhs (stmts[i-width_count]), > > + gimple_assign_lhs > > (stmts[i-width_count+1]), > > + opcode); > > + gimple_set_visited (stmts[i], true); > > + width_count--; > > + } > > } > > else > > { > > - stmts[i] = build_and_add_sum (TREE_TYPE (last_rhs1), op1, op2, > opcode); > > + /* Attach the rest of the ops to the parallel dependency chain. > > */ > > + oe = ops[op_index--]; > > + op1 = oe->op; > > + stmt1 = oe->stmt_to_insert; > > + if (stmt1) > > + insert_stmt_before_use (stmts[i], stmt1); > > + stmt1 = NULL; > > + stmts[i] = build_and_add_sum (TREE_TYPE (last_rhs1), > > + gimple_assign_lhs (stmts[i-width]), > > + op1, > > + opcode); > > gimple_set_visited (stmts[i], true); > > } > > + > > if (dump_file && (dump_flags & TDF_DETAILS)) > > { > > fprintf (dump_file, " into "); > > print_gimple_stmt (dump_file, stmts[i], 0); > > } > > } > > - > > I've looked three times but didn't find a use of 'has_fma'? It's located after your first comment, If the chain has FAM, we do not swap two operands if (op_index > 1 && !has_fma) swap_ops_for_binary_stmt (ops, op_index - 2); > > remove_visited_stmt_chain (last_rhs1); } > > > > @@ -6649,6 +6654,76 @@ transform_stmt_to_multiply > (gimple_stmt_iterator *gsi, gimple *stmt, > > } > > } > > > > +/* Rearrange ops to generate more FMA when the chain may has more > than 2 fmas. > > may have > Done. > > + Put no-mult ops and mult ops alternately at the end of the queue, which > is > > + conducive to generating more fma and reducing the loss of FMA when > breaking > > + the chain. > > + E.g. > > + a * b + c * d + e generates: > > + > > + _4 = c_9(D) * d_10(D); > > + _12 = .FMA (a_7(D), b_8(D), _4); > > + _11 = e_6(D) + _12; > > + > > + Rtearrange ops to -> e + a * b + c * d generates: > > Rearrange > Done. > > + > > + _4 = .FMA (c_7(D), d_8(D), _3); > > + _11 = .FMA (a_5(D), b_6(D), _4); > > + */ > > +static bool > > +rank_ops_for_fma (vec<operand_entry *> *ops) { > > + operand_entry *oe; > > + unsigned int i; > > + unsigned int ops_length = ops->length (); > > + auto_vec<operand_entry *> ops_mult; > > + auto_vec<operand_entry *> ops_others; > > + > > + FOR_EACH_VEC_ELT (*ops, i, oe) > > + { > > + if (TREE_CODE (oe->op) == SSA_NAME) > > + { > > + gimple *def_stmt = SSA_NAME_DEF_STMT (oe->op); > > + if (is_gimple_assign (def_stmt) > > + && gimple_assign_rhs_code (def_stmt) == MULT_EXPR) > > + ops_mult.safe_push (oe); > > + else > > + ops_others.safe_push (oe); > > + } > > + else > > + ops_others.safe_push (oe); > > + } > > + /* When ops_mult.length == 2, like the following case, > > + > > + a * b + c * d + e. > > + > > + we need to rearrange the ops. > > + > > + Putting ops that not def from mult in front can generate more > > + fmas. */ if (ops_mult.length () >= 2) > > + { > > + /* If all ops are defined with mult, we don't need to rearrange them. > */ > > + if (ops_mult.length () != ops_length) > > use && with the previous condition. > Done. > > + { > > + /* Put no-mult ops and mult ops alternately at the end of the > > + queue, which is conducive to generating more fma and reducing > the > > + loss of FMA when breaking the chain. */ > > + ops->truncate (0); > > + ops->splice (ops_mult); > > + int j, opindex = ops->length (); > > + int others_length = ops_others.length(); > > + for (j = 0; j < others_length; j++) > > + { > > + oe = ops_others.pop (); > > + ops->safe_insert (opindex, oe); > > that's quadratic as it needs to move ops. As said previously we know that > 'ops' has enough room and you can use the quick_ (or non-safe_) variants of > the APIs on it. > Done, thanks. Regards, Lili. > Otherwise looks good to me. > > Thanks, > Richard. >