Re: [PATCH] widening_mul: Do cost check when propagating mult into plus/minus expressions

Richard Guenther Wed, 13 Jul 2011 07:35:18 -0700

On Wed, Jul 13, 2011 at 3:13 PM, Andreas Krebbel
<kreb...@linux.vnet.ibm.com> wrote:
> Hi,
>
> the widening_mul pass might increase the number of multiplications in
> the code by transforming
>
> a = b * c
> d = a + 2
> e = a + 3
>
> into:
>
> d = b * c + 2
> e = b * c + 3
>
> under the assumption that an FMA instruction is not more expensive
> than a simple add.  This certainly isn't always true.  While e.g. on
> s390 an fma is indeed not slower than an add execution-wise it has
> disadvantages regarding instruction grouping.  It doesn't group with
> any other instruction what has a major impact on the instruction
> dispatch bandwidth.
>
> The following patch tries to figure out the costs for adds, mults and
> fmas by building an RTX and asking the backends cost function in order
> to estimate whether it is whorthwhile doing the transformation.
>
> With that patch the 436.cactus hotloop contains 28 less
> multiplications than before increasing performance slightly (~2%).
>
> Bootstrapped and regtested on x86_64 and s390x.


Ick ;)

Maybe this is finally the time to introduce target hook(s) to
get us back costs for trees?  For this case we'd need two
actually, or just one - dependent on what finegrained information
we pass.  Choices:

  tree_code_cost (enum tree_code)
  tree_code_cost (enum tree_code, enum machine_mode mode)
  unary_cost (enum tree_code, tree actual_arg0) // args will be mostly
SSA names or constants, but at least they are typed - works for
mixed-typed operations
  binary_cost (...)
  ...
  unary_cost (enum tree_code, enum tree_code arg0_kind) // constant
vs. non-constant arg, but lacks type/mode

Richard.

> Bye,
>
> -Andreas-
>
> 2011-07-13  Andreas Krebbel  <andreas.kreb...@de.ibm.com>
>
>        * tree-ssa-math-opts.c (compute_costs): New function.
>        (convert_mult_to_fma): Take costs into account when propagating
>        multiplications into several additions.
>        * config/s390/s390.c (z196_costs): Adjust costs for madbr and
>        maebr.
>
>
> Index: gcc/tree-ssa-math-opts.c
> ===================================================================
> *** gcc/tree-ssa-math-opts.c.orig
> --- gcc/tree-ssa-math-opts.c
> *************** convert_plusminus_to_widen (gimple_stmt_
> *** 2185,2190 ****
> --- 2185,2252 ----
>    return true;
>  }
>
> + /* Computing the costs for calculating RTX with CODE in MODE.  */
> +
> + static unsigned
> + compute_costs (enum machine_mode mode, enum rtx_code code, bool speed)
> + {
> +   rtx seq;
> +   rtx set;
> +   unsigned cost = 0;
> +
> +   start_sequence ();
> +
> +   switch (GET_RTX_LENGTH (code))
> +     {
> +     case 2:
> +       force_operand (gen_rtx_fmt_ee (code, mode,
> +                      gen_raw_REG (mode, LAST_VIRTUAL_REGISTER + 1),
> +                      gen_raw_REG (mode, LAST_VIRTUAL_REGISTER + 2)),
> +                    NULL_RTX);
> +       break;
> +     case 3:
> +       /* FMA expressions are not handled by force_operand.  */
> +       expand_ternary_op (mode, fma_optab,
> +                        gen_raw_REG (mode, LAST_VIRTUAL_REGISTER + 1),
> +                        gen_raw_REG (mode, LAST_VIRTUAL_REGISTER + 2),
> +                        gen_raw_REG (mode, LAST_VIRTUAL_REGISTER + 3),
> +                        NULL_RTX, false);
> +       break;
> +     default:
> +       gcc_unreachable ();
> +     }
> +
> +   seq = get_insns ();
> +   end_sequence ();
> +
> +   if (dump_file && (dump_flags & TDF_DETAILS))
> +     {
> +       fprintf (dump_file, "Calculating costs of %s in %s mode.  Sequence 
> is:\n",
> +              GET_RTX_NAME (code), GET_MODE_NAME (mode));
> +       print_rtl (dump_file, seq);
> +     }
> +
> +   for (; seq; seq = NEXT_INSN (seq))
> +     {
> +       set = single_set (seq);
> +       if (set)
> +       cost += rtx_cost (set, SET, speed);
> +       else
> +       cost++;
> +     }
> +
> +   /* If the backend returns a cost of zero it is most certainly lying.
> +      Set this to one in order to notice that we already calculated it
> +      once.  */
> +   cost = cost ? cost : 1;
> +
> +   if (dump_file && (dump_flags & TDF_DETAILS))
> +     fprintf (dump_file, "%s in %s costs %d\n\n",
> +              GET_RTX_NAME (code), GET_MODE_NAME (mode), cost);
> +
> +   return cost;
> + }
> +
>  /* Combine the multiplication at MUL_STMT with operands MULOP1 and MULOP2
>     with uses in additions and subtractions to form fused multiply-add
>     operations.  Returns true if successful and MUL_STMT should be removed.  
> */
> *************** convert_mult_to_fma (gimple mul_stmt, tr
> *** 2197,2202 ****
> --- 2259,2270 ----
>    gimple use_stmt, neguse_stmt, fma_stmt;
>    use_operand_p use_p;
>    imm_use_iterator imm_iter;
> +   enum machine_mode mode;
> +   int uses = 0;
> +   bool speed = optimize_bb_for_speed_p (gimple_bb (mul_stmt));
> +   static unsigned mul_cost[NUM_MACHINE_MODES];
> +   static unsigned add_cost[NUM_MACHINE_MODES];
> +   static unsigned fma_cost[NUM_MACHINE_MODES];
>
>    if (FLOAT_TYPE_P (type)
>        && flag_fp_contract_mode == FP_CONTRACT_OFF)
> *************** convert_mult_to_fma (gimple mul_stmt, tr
> *** 2213,2222 ****
>    if (optab_handler (fma_optab, TYPE_MODE (type)) == CODE_FOR_nothing)
>      return false;
>
>    /* Make sure that the multiplication statement becomes dead after
> !      the transformation, thus that all uses are transformed to FMAs.
> !      This means we assume that an FMA operation has the same cost
> !      as an addition.  */
>    FOR_EACH_IMM_USE_FAST (use_p, imm_iter, mul_result)
>      {
>        enum tree_code use_code;
> --- 2281,2297 ----
>    if (optab_handler (fma_optab, TYPE_MODE (type)) == CODE_FOR_nothing)
>      return false;
>
> +   mode = TYPE_MODE (type);
> +
> +   if (!fma_cost[mode])
> +     {
> +       fma_cost[mode] = compute_costs (mode, FMA, speed);
> +       add_cost[mode] = compute_costs (mode, PLUS, speed);
> +       mul_cost[mode] = compute_costs (mode, MULT, speed);
> +     }
> +
>    /* Make sure that the multiplication statement becomes dead after
> !      the transformation, thus that all uses are transformed to FMAs.  */
>    FOR_EACH_IMM_USE_FAST (use_p, imm_iter, mul_result)
>      {
>        enum tree_code use_code;
> *************** convert_mult_to_fma (gimple mul_stmt, tr
> *** 2292,2297 ****
> --- 2367,2373 ----
>        if (gimple_assign_rhs1 (use_stmt) == gimple_assign_rhs2 (use_stmt))
>        return false;
>
> +       uses++;
>        /* While it is possible to validate whether or not the exact form
>         that we've recognized is available in the backend, the assumption
>         is that the transformation is never a loss.  For instance, suppose
> *************** convert_mult_to_fma (gimple mul_stmt, tr
> *** 2302,2307 ****
> --- 2378,2389 ----
>         independant and could be run in parallel.  */
>      }
>
> +   /* Calculate the costs of moving the multiplication into all the
> +      minus/plus expressions.  */
> +
> +   if (uses * fma_cost[mode] > uses * add_cost[mode] + mul_cost[mode])
> +     return false;
> +
>    FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, mul_result)
>      {
>        gimple_stmt_iterator gsi = gsi_for_stmt (use_stmt);
> Index: gcc/config/s390/s390.c
> ===================================================================
> *** gcc/config/s390/s390.c.orig
> --- gcc/config/s390/s390.c
> *************** struct processor_costs z196_cost =
> *** 242,249 ****
>    COSTS_N_INSNS (100),   /* SQXBR B+100 */
>    COSTS_N_INSNS (42),    /* SQDBR B+42 */
>    COSTS_N_INSNS (28),    /* SQEBR B+28 */
> !   COSTS_N_INSNS (1),     /* MADBR B */
> !   COSTS_N_INSNS (1),     /* MAEBR B */
>    COSTS_N_INSNS (101),   /* DXBR B+101 */
>    COSTS_N_INSNS (29),    /* DDBR */
>    COSTS_N_INSNS (22),    /* DEBR */
> --- 242,250 ----
>    COSTS_N_INSNS (100),   /* SQXBR B+100 */
>    COSTS_N_INSNS (42),    /* SQDBR B+42 */
>    COSTS_N_INSNS (28),    /* SQEBR B+28 */
> !   /* Cheaper than a mul+add but more expensive then a single mul/add.  */
> !   COSTS_N_INSNS (1) + COSTS_N_INSNS (1) / 2, /* MADBR B */
> !   COSTS_N_INSNS (1) + COSTS_N_INSNS (1) / 2, /* MAEBR B */
>    COSTS_N_INSNS (101),   /* DXBR B+101 */
>    COSTS_N_INSNS (29),    /* DDBR */
>    COSTS_N_INSNS (22),    /* DEBR */
>

Re: [PATCH] widening_mul: Do cost check when propagating mult into plus/minus expressions

Reply via email to