On Wed, Jul 13, 2011 at 3:13 PM, Andreas Krebbel <kreb...@linux.vnet.ibm.com> wrote: > Hi, > > the widening_mul pass might increase the number of multiplications in > the code by transforming > > a = b * c > d = a + 2 > e = a + 3 > > into: > > d = b * c + 2 > e = b * c + 3 > > under the assumption that an FMA instruction is not more expensive > than a simple add. This certainly isn't always true. While e.g. on > s390 an fma is indeed not slower than an add execution-wise it has > disadvantages regarding instruction grouping. It doesn't group with > any other instruction what has a major impact on the instruction > dispatch bandwidth. > > The following patch tries to figure out the costs for adds, mults and > fmas by building an RTX and asking the backends cost function in order > to estimate whether it is whorthwhile doing the transformation. > > With that patch the 436.cactus hotloop contains 28 less > multiplications than before increasing performance slightly (~2%). > > Bootstrapped and regtested on x86_64 and s390x.
Ick ;) Maybe this is finally the time to introduce target hook(s) to get us back costs for trees? For this case we'd need two actually, or just one - dependent on what finegrained information we pass. Choices: tree_code_cost (enum tree_code) tree_code_cost (enum tree_code, enum machine_mode mode) unary_cost (enum tree_code, tree actual_arg0) // args will be mostly SSA names or constants, but at least they are typed - works for mixed-typed operations binary_cost (...) ... unary_cost (enum tree_code, enum tree_code arg0_kind) // constant vs. non-constant arg, but lacks type/mode Richard. > Bye, > > -Andreas- > > 2011-07-13 Andreas Krebbel <andreas.kreb...@de.ibm.com> > > * tree-ssa-math-opts.c (compute_costs): New function. > (convert_mult_to_fma): Take costs into account when propagating > multiplications into several additions. > * config/s390/s390.c (z196_costs): Adjust costs for madbr and > maebr. > > > Index: gcc/tree-ssa-math-opts.c > =================================================================== > *** gcc/tree-ssa-math-opts.c.orig > --- gcc/tree-ssa-math-opts.c > *************** convert_plusminus_to_widen (gimple_stmt_ > *** 2185,2190 **** > --- 2185,2252 ---- > return true; > } > > + /* Computing the costs for calculating RTX with CODE in MODE. */ > + > + static unsigned > + compute_costs (enum machine_mode mode, enum rtx_code code, bool speed) > + { > + rtx seq; > + rtx set; > + unsigned cost = 0; > + > + start_sequence (); > + > + switch (GET_RTX_LENGTH (code)) > + { > + case 2: > + force_operand (gen_rtx_fmt_ee (code, mode, > + gen_raw_REG (mode, LAST_VIRTUAL_REGISTER + 1), > + gen_raw_REG (mode, LAST_VIRTUAL_REGISTER + 2)), > + NULL_RTX); > + break; > + case 3: > + /* FMA expressions are not handled by force_operand. */ > + expand_ternary_op (mode, fma_optab, > + gen_raw_REG (mode, LAST_VIRTUAL_REGISTER + 1), > + gen_raw_REG (mode, LAST_VIRTUAL_REGISTER + 2), > + gen_raw_REG (mode, LAST_VIRTUAL_REGISTER + 3), > + NULL_RTX, false); > + break; > + default: > + gcc_unreachable (); > + } > + > + seq = get_insns (); > + end_sequence (); > + > + if (dump_file && (dump_flags & TDF_DETAILS)) > + { > + fprintf (dump_file, "Calculating costs of %s in %s mode. Sequence > is:\n", > + GET_RTX_NAME (code), GET_MODE_NAME (mode)); > + print_rtl (dump_file, seq); > + } > + > + for (; seq; seq = NEXT_INSN (seq)) > + { > + set = single_set (seq); > + if (set) > + cost += rtx_cost (set, SET, speed); > + else > + cost++; > + } > + > + /* If the backend returns a cost of zero it is most certainly lying. > + Set this to one in order to notice that we already calculated it > + once. */ > + cost = cost ? cost : 1; > + > + if (dump_file && (dump_flags & TDF_DETAILS)) > + fprintf (dump_file, "%s in %s costs %d\n\n", > + GET_RTX_NAME (code), GET_MODE_NAME (mode), cost); > + > + return cost; > + } > + > /* Combine the multiplication at MUL_STMT with operands MULOP1 and MULOP2 > with uses in additions and subtractions to form fused multiply-add > operations. Returns true if successful and MUL_STMT should be removed. > */ > *************** convert_mult_to_fma (gimple mul_stmt, tr > *** 2197,2202 **** > --- 2259,2270 ---- > gimple use_stmt, neguse_stmt, fma_stmt; > use_operand_p use_p; > imm_use_iterator imm_iter; > + enum machine_mode mode; > + int uses = 0; > + bool speed = optimize_bb_for_speed_p (gimple_bb (mul_stmt)); > + static unsigned mul_cost[NUM_MACHINE_MODES]; > + static unsigned add_cost[NUM_MACHINE_MODES]; > + static unsigned fma_cost[NUM_MACHINE_MODES]; > > if (FLOAT_TYPE_P (type) > && flag_fp_contract_mode == FP_CONTRACT_OFF) > *************** convert_mult_to_fma (gimple mul_stmt, tr > *** 2213,2222 **** > if (optab_handler (fma_optab, TYPE_MODE (type)) == CODE_FOR_nothing) > return false; > > /* Make sure that the multiplication statement becomes dead after > ! the transformation, thus that all uses are transformed to FMAs. > ! This means we assume that an FMA operation has the same cost > ! as an addition. */ > FOR_EACH_IMM_USE_FAST (use_p, imm_iter, mul_result) > { > enum tree_code use_code; > --- 2281,2297 ---- > if (optab_handler (fma_optab, TYPE_MODE (type)) == CODE_FOR_nothing) > return false; > > + mode = TYPE_MODE (type); > + > + if (!fma_cost[mode]) > + { > + fma_cost[mode] = compute_costs (mode, FMA, speed); > + add_cost[mode] = compute_costs (mode, PLUS, speed); > + mul_cost[mode] = compute_costs (mode, MULT, speed); > + } > + > /* Make sure that the multiplication statement becomes dead after > ! the transformation, thus that all uses are transformed to FMAs. */ > FOR_EACH_IMM_USE_FAST (use_p, imm_iter, mul_result) > { > enum tree_code use_code; > *************** convert_mult_to_fma (gimple mul_stmt, tr > *** 2292,2297 **** > --- 2367,2373 ---- > if (gimple_assign_rhs1 (use_stmt) == gimple_assign_rhs2 (use_stmt)) > return false; > > + uses++; > /* While it is possible to validate whether or not the exact form > that we've recognized is available in the backend, the assumption > is that the transformation is never a loss. For instance, suppose > *************** convert_mult_to_fma (gimple mul_stmt, tr > *** 2302,2307 **** > --- 2378,2389 ---- > independant and could be run in parallel. */ > } > > + /* Calculate the costs of moving the multiplication into all the > + minus/plus expressions. */ > + > + if (uses * fma_cost[mode] > uses * add_cost[mode] + mul_cost[mode]) > + return false; > + > FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, mul_result) > { > gimple_stmt_iterator gsi = gsi_for_stmt (use_stmt); > Index: gcc/config/s390/s390.c > =================================================================== > *** gcc/config/s390/s390.c.orig > --- gcc/config/s390/s390.c > *************** struct processor_costs z196_cost = > *** 242,249 **** > COSTS_N_INSNS (100), /* SQXBR B+100 */ > COSTS_N_INSNS (42), /* SQDBR B+42 */ > COSTS_N_INSNS (28), /* SQEBR B+28 */ > ! COSTS_N_INSNS (1), /* MADBR B */ > ! COSTS_N_INSNS (1), /* MAEBR B */ > COSTS_N_INSNS (101), /* DXBR B+101 */ > COSTS_N_INSNS (29), /* DDBR */ > COSTS_N_INSNS (22), /* DEBR */ > --- 242,250 ---- > COSTS_N_INSNS (100), /* SQXBR B+100 */ > COSTS_N_INSNS (42), /* SQDBR B+42 */ > COSTS_N_INSNS (28), /* SQEBR B+28 */ > ! /* Cheaper than a mul+add but more expensive then a single mul/add. */ > ! COSTS_N_INSNS (1) + COSTS_N_INSNS (1) / 2, /* MADBR B */ > ! COSTS_N_INSNS (1) + COSTS_N_INSNS (1) / 2, /* MAEBR B */ > COSTS_N_INSNS (101), /* DXBR B+101 */ > COSTS_N_INSNS (29), /* DDBR */ > COSTS_N_INSNS (22), /* DEBR */ >