On Wed, Jul 13, 2011 at 09:58:08AM -0700, Richard Henderson wrote:
> Why the force_operand? You've got register inputs. Either the target
> is going to support the operation or it isn't.
I agree that it doesn't seem to be necessary. I only used force_operand because ivopts (add_cost) does it, without seeing a clear reason for it myself, so I've removed it now.

> Saving cost data dependent on speed, which is non-constant.
> You probably need to make this a two dimensional array.

Fixed. Here is an updated version.

Bye,

-Andreas-

2011-07-14 Andreas Krebbel <andreas.kreb...@de.ibm.com>

* tree-ssa-math-opts.c (compute_costs): New function.
(convert_mult_to_fma): Take costs into account when propagating multiplications into several additions.
* config/s390/s390.c (z196_cost): Adjust costs for madbr and maebr.

Index: gcc/tree-ssa-math-opts.c =================================================================== *** gcc/tree-ssa-math-opts.c.orig --- gcc/tree-ssa-math-opts.c *************** convert_plusminus_to_widen (gimple_stmt_ *** 2185,2190 **** --- 2185,2236 ---- return true; } + /* Computing the costs for calculating RTX with CODE in MODE. */ + + static unsigned + compute_costs (enum machine_mode mode, enum rtx_code code, bool speed) + { + rtx insn; + unsigned cost; + + switch (GET_RTX_LENGTH (code)) + { + case 2: + insn = gen_rtx_fmt_ee (code, mode, + gen_raw_REG (mode, LAST_VIRTUAL_REGISTER + 1), + gen_raw_REG (mode, LAST_VIRTUAL_REGISTER + 2)); + break; + case 3: + insn = gen_rtx_fmt_eee (code, mode, + gen_raw_REG (mode, LAST_VIRTUAL_REGISTER + 1), + gen_raw_REG (mode, LAST_VIRTUAL_REGISTER + 2), + gen_raw_REG (mode, LAST_VIRTUAL_REGISTER + 3)); + break; + default: + gcc_unreachable (); + } + + if (dump_file && (dump_flags & TDF_DETAILS)) + { + fprintf (dump_file, "Calculating costs of %s in %s mode. RTX is:\n", + GET_RTX_NAME (code), GET_MODE_NAME (mode)); + print_rtl (dump_file, insn); + } + + cost = rtx_cost (insn, SET, speed); + + /* If the backend returns a cost of zero it is most certainly lying. + Set this to one in order to notice that we already calculated it + once. */ + cost = cost ?
cost : 1; + + if (dump_file && (dump_flags & TDF_DETAILS)) + fprintf (dump_file, "\n%s in %s costs %d\n\n", + GET_RTX_NAME (code), GET_MODE_NAME (mode), cost); + + return cost; + } + /* Combine the multiplication at MUL_STMT with operands MULOP1 and MULOP2 with uses in additions and subtractions to form fused multiply-add operations. Returns true if successful and MUL_STMT should be removed. */ *************** convert_mult_to_fma (gimple mul_stmt, tr *** 2197,2202 **** --- 2243,2254 ---- gimple use_stmt, neguse_stmt, fma_stmt; use_operand_p use_p; imm_use_iterator imm_iter; + enum machine_mode mode; + int uses = 0; + bool speed = optimize_bb_for_speed_p (gimple_bb (mul_stmt)); + static unsigned mul_cost[2][NUM_MACHINE_MODES]; + static unsigned add_cost[2][NUM_MACHINE_MODES]; + static unsigned fma_cost[2][NUM_MACHINE_MODES]; if (FLOAT_TYPE_P (type) && flag_fp_contract_mode == FP_CONTRACT_OFF) *************** convert_mult_to_fma (gimple mul_stmt, tr *** 2213,2222 **** if (optab_handler (fma_optab, TYPE_MODE (type)) == CODE_FOR_nothing) return false; /* Make sure that the multiplication statement becomes dead after ! the transformation, thus that all uses are transformed to FMAs. ! This means we assume that an FMA operation has the same cost ! as an addition. */ FOR_EACH_IMM_USE_FAST (use_p, imm_iter, mul_result) { enum tree_code use_code; --- 2265,2281 ---- if (optab_handler (fma_optab, TYPE_MODE (type)) == CODE_FOR_nothing) return false; + mode = TYPE_MODE (type); + + if (!fma_cost[speed][mode]) + { + fma_cost[speed][mode] = compute_costs (mode, FMA, speed); + add_cost[speed][mode] = compute_costs (mode, PLUS, speed); + mul_cost[speed][mode] = compute_costs (mode, MULT, speed); + } + /* Make sure that the multiplication statement becomes dead after ! the transformation, thus that all uses are transformed to FMAs. 
*/ FOR_EACH_IMM_USE_FAST (use_p, imm_iter, mul_result) { enum tree_code use_code; *************** convert_mult_to_fma (gimple mul_stmt, tr *** 2292,2297 **** --- 2351,2357 ---- if (gimple_assign_rhs1 (use_stmt) == gimple_assign_rhs2 (use_stmt)) return false; + uses++; /* While it is possible to validate whether or not the exact form that we've recognized is available in the backend, the assumption is that the transformation is never a loss. For instance, suppose *************** convert_mult_to_fma (gimple mul_stmt, tr *** 2302,2307 **** --- 2362,2374 ---- independant and could be run in parallel. */ } + /* Calculate the costs of moving the multiplication into all the + minus/plus expressions. */ + + if (uses * fma_cost[speed][mode] > + uses * add_cost[speed][mode] + mul_cost[speed][mode]) + return false; + FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, mul_result) { gimple_stmt_iterator gsi = gsi_for_stmt (use_stmt); Index: gcc/config/s390/s390.c =================================================================== *** gcc/config/s390/s390.c.orig --- gcc/config/s390/s390.c *************** struct processor_costs z196_cost = *** 242,249 **** COSTS_N_INSNS (100), /* SQXBR B+100 */ COSTS_N_INSNS (42), /* SQDBR B+42 */ COSTS_N_INSNS (28), /* SQEBR B+28 */ ! COSTS_N_INSNS (1), /* MADBR B */ ! COSTS_N_INSNS (1), /* MAEBR B */ COSTS_N_INSNS (101), /* DXBR B+101 */ COSTS_N_INSNS (29), /* DDBR */ COSTS_N_INSNS (22), /* DEBR */ --- 242,250 ---- COSTS_N_INSNS (100), /* SQXBR B+100 */ COSTS_N_INSNS (42), /* SQDBR B+42 */ COSTS_N_INSNS (28), /* SQEBR B+28 */ ! /* Cheaper than a mul+add but more expensive than a single mul/add. */ ! COSTS_N_INSNS (1) + COSTS_N_INSNS (1) / 2, /* MADBR B */ ! COSTS_N_INSNS (1) + COSTS_N_INSNS (1) / 2, /* MAEBR B */ COSTS_N_INSNS (101), /* DXBR B+101 */ COSTS_N_INSNS (29), /* DDBR */ COSTS_N_INSNS (22), /* DEBR */