On Tue, 14 Nov 2017, Jakub Jelinek wrote: > Hi! > > For targets that don't have {,u}mulv<mode>4 insn we try 3 different > expansions of the basic signed * signed -> signed or unsigned * unsigned -> > unsigned overflow computation. The first one is done if > if (GET_MODE_2XWIDER_MODE (mode).exists (&wmode) > && targetm.scalar_mode_supported_p (wmode)) > and we emit a WIDEN_MULT_EXPR followed by extraction of the hipart > from it (for testing overflow if both unsigned and signed) and > lowpart (result of the multiplication and for signed overflow testing > where we use MSB of it). This case is meant for use by smaller modes, > e.g. subword, where it is generally pretty efficient. Unfortunately > on some targets, e.g. mips 64-bit, where there is no widening mult > optab it can be expanded as a libcall on the full wmode operands, > which is slow and causes problems e.g. to some freestanding environments > like the Linux kernel that don't bother to link in libgcc.a or replacement > thereof. Then there is another case, usually pretty large, with usually > two but sometimes one multiplication, and various conditionals, shifts, etc. > meant primarily for the widest supported mode. And the last fallback > is just doing multiplication and never computing overflow, hopefully it > is never used at least on sane targets. > > This patch attempts to check if we'd emit WIDEN_MULT_EXPR as a libcall > and in that case tries to use the other possibilities, and only falls > back to the WIDEN_MULT_EXPR with a libcall if we'd otherwise use the > last fallback without overflow computation. > > In addition to it, it adds support for targets that have supported > MULT_HIGHPART_EXPR for that mode, by doing pretty much what the > WIDEN_MULT_EXPR case does, but instead of doing one multiplication > to compute both lowpart and highpart and then shifts to split those > we use one multiplication to compute the lowpart and one MULT_HIGHPART_EXPR > to compute the highpart. 
In theory this method doesn't have to be always > faster than the one with hmode, because the MULT_HIGHPART_EXPR case does > 2 multiplications plus one comparison, while the hmode code does sometimes > just one, but it is significantly shorter, fewer conditionals/branches > so I think it should be generally a win (if it turns out not to be the > case on some target, we could restrict it to -Os only or whatever). > > And lastly, the MULT_HIGHPART_EXPR case can actually be the optimal code > if we are only checking for the overflow and don't actually need the > multiplication value, it is unsigned multiply and we don't need any > res using code afterwards; in that case the low part multiply can be DCEd > and only the highpart multiply + comparison will remain. So, the patch > adds check for single IMAGPART_EXPR use and other conditions and uses > the MULT_HIGHPART_EXPR code in preference of the WIDEN_MULT_EXPR in that > case. > > Bootstrapped/regtested on x86_64-linux and i686-linux, tested on the > testcase with cross to mips, ok for trunk?
Ok, but can you add the testcase for the kernel issue? Thanks, Richard. > 2017-11-14 Jakub Jelinek <ja...@redhat.com> > > PR target/82981 > * internal-fn.c: Include gimple-ssa.h, tree-phinodes.h and > ssa-iterators.h. > (can_widen_mult_without_libcall): New function. > (expand_mul_overflow): If only checking unsigned mul overflow, > not result, and can do efficiently MULT_HIGHPART_EXPR, emit that. > Don't use WIDEN_MULT_EXPR if it would involve a libcall, unless > no other way works. Add MULT_HIGHPART_EXPR + MULT_EXPR support. > (expand_DIVMOD): Formatting fix. > * expmed.h (expand_mult): Add NO_LIBCALL argument. > * expmed.c (expand_mult): Likewise. Use OPTAB_WIDEN rather > than OPTAB_LIB_WIDEN if NO_LIBCALL is true, and allow it to fail. > > --- gcc/internal-fn.c.jj 2017-10-23 10:13:08.000000000 +0200 > +++ gcc/internal-fn.c 2017-11-14 16:48:25.414403348 +0100 > @@ -46,6 +46,9 @@ along with GCC; see the file COPYING3. > #include "recog.h" > #include "builtins.h" > #include "optabs-tree.h" > +#include "gimple-ssa.h" > +#include "tree-phinodes.h" > +#include "ssa-iterators.h" > > /* The names of each internal function, indexed by function number. */ > const char *const internal_fn_name_array[] = { > @@ -1172,6 +1175,35 @@ expand_neg_overflow (location_t loc, tre > } > } > > +/* Return true if UNS WIDEN_MULT_EXPR with result mode WMODE and operand > + mode MODE can be expanded without using a libcall. 
*/ > + > +static bool > +can_widen_mult_without_libcall (scalar_int_mode wmode, scalar_int_mode mode, > + rtx op0, rtx op1, bool uns) > +{ > + if (find_widening_optab_handler (umul_widen_optab, wmode, mode) > + != CODE_FOR_nothing) > + return true; > + > + if (find_widening_optab_handler (smul_widen_optab, wmode, mode) > + != CODE_FOR_nothing) > + return true; > + > + rtx_insn *last = get_last_insn (); > + if (CONSTANT_P (op0)) > + op0 = convert_modes (wmode, mode, op0, uns); > + else > + op0 = gen_raw_REG (wmode, LAST_VIRTUAL_REGISTER + 1); > + if (CONSTANT_P (op1)) > + op1 = convert_modes (wmode, mode, op1, uns); > + else > + op1 = gen_raw_REG (wmode, LAST_VIRTUAL_REGISTER + 2); > + rtx ret = expand_mult (wmode, op0, op1, NULL_RTX, uns, true); > + delete_insns_since (last); > + return ret != NULL_RTX; > +} > + > /* Add mul overflow checking to the statement STMT. */ > > static void > @@ -1465,9 +1497,29 @@ expand_mul_overflow (location_t loc, tre > ops.op1 = make_tree (type, op1); > ops.op2 = NULL_TREE; > ops.location = loc; > + > + /* Optimize unsigned overflow check where we don't use the > + multiplication result, just whether overflow happened. > + If we can do MULT_HIGHPART_EXPR, that followed by > + comparison of the result against zero is cheapest. > + We'll still compute res, but it should be DCEd later. 
*/ > + use_operand_p use; > + gimple *use_stmt; > + if (!is_ubsan > + && lhs > + && uns > + && !(uns0_p && uns1_p && !unsr_p) > + && can_mult_highpart_p (mode, uns) == 1 > + && single_imm_use (lhs, &use, &use_stmt) > + && is_gimple_assign (use_stmt) > + && gimple_assign_rhs_code (use_stmt) == IMAGPART_EXPR) > + goto highpart; > + > if (GET_MODE_2XWIDER_MODE (mode).exists (&wmode) > - && targetm.scalar_mode_supported_p (wmode)) > + && targetm.scalar_mode_supported_p (wmode) > + && can_widen_mult_without_libcall (wmode, mode, op0, op1, uns)) > { > + twoxwider: > ops.code = WIDEN_MULT_EXPR; > ops.type > = build_nonstandard_integer_type (GET_MODE_PRECISION (wmode), uns); > @@ -1495,6 +1547,35 @@ expand_mul_overflow (location_t loc, tre > profile_probability::very_likely ()); > } > } > + else if (can_mult_highpart_p (mode, uns) == 1) > + { > + highpart: > + ops.code = MULT_HIGHPART_EXPR; > + ops.type = type; > + > + rtx hipart = expand_expr_real_2 (&ops, NULL_RTX, mode, > + EXPAND_NORMAL); > + ops.code = MULT_EXPR; > + res = expand_expr_real_2 (&ops, NULL_RTX, mode, EXPAND_NORMAL); > + if (uns) > + /* For the unsigned multiplication, there was overflow if > + HIPART is non-zero. */ > + do_compare_rtx_and_jump (hipart, const0_rtx, EQ, true, mode, > + NULL_RTX, NULL, done_label, > + profile_probability::very_likely ()); > + else > + { > + rtx signbit = expand_shift (RSHIFT_EXPR, mode, res, prec - 1, > + NULL_RTX, 0); > + /* RES is low half of the double width result, HIPART > + the high half. There was overflow if > + HIPART is different from RES < 0 ? -1 : 0. 
*/ > + do_compare_rtx_and_jump (signbit, hipart, EQ, true, mode, > + NULL_RTX, NULL, done_label, > + profile_probability::very_likely ()); > + } > + > + } > else if (int_mode_for_size (prec / 2, 1).exists (&hmode) > && 2 * GET_MODE_PRECISION (hmode) == prec) > { > @@ -1800,6 +1881,11 @@ expand_mul_overflow (location_t loc, tre > tem = expand_expr_real_2 (&ops, NULL_RTX, mode, EXPAND_NORMAL); > emit_move_insn (res, tem); > } > + else if (GET_MODE_2XWIDER_MODE (mode).exists (&wmode) > + && targetm.scalar_mode_supported_p (wmode)) > + /* Even emitting a libcall is better than not detecting overflow > + at all. */ > + goto twoxwider; > else > { > gcc_assert (!is_ubsan); > @@ -2588,7 +2674,7 @@ expand_DIVMOD (internal_fn, gcall *call_ > expand_expr (build2 (COMPLEX_EXPR, TREE_TYPE (lhs), > make_tree (TREE_TYPE (arg0), quotient), > make_tree (TREE_TYPE (arg1), remainder)), > - target, VOIDmode, EXPAND_NORMAL); > + target, VOIDmode, EXPAND_NORMAL); > } > > /* Expand a call to FN using the operands in STMT. 
FN has a single > --- gcc/expmed.h.jj 2017-09-01 09:26:37.000000000 +0200 > +++ gcc/expmed.h 2017-11-14 12:32:11.344054003 +0100 > @@ -727,7 +727,7 @@ extern rtx extract_bit_field (rtx, unsig > unsigned HOST_WIDE_INT, int, rtx, > machine_mode, machine_mode, bool, rtx *); > extern rtx extract_low_bits (machine_mode, machine_mode, rtx); > -extern rtx expand_mult (machine_mode, rtx, rtx, rtx, int); > +extern rtx expand_mult (machine_mode, rtx, rtx, rtx, int, bool = false); > extern rtx expand_mult_highpart_adjust (scalar_int_mode, rtx, rtx, rtx, > rtx, int); > > --- gcc/expmed.c.jj 2017-11-09 15:38:48.000000000 +0100 > +++ gcc/expmed.c 2017-11-14 12:50:24.948946475 +0100 > @@ -3284,7 +3284,7 @@ expand_mult_const (machine_mode mode, rt > > rtx > expand_mult (machine_mode mode, rtx op0, rtx op1, rtx target, > - int unsignedp) > + int unsignedp, bool no_libcall) > { > enum mult_variant variant; > struct algorithm algorithm; > @@ -3420,14 +3420,16 @@ expand_mult (machine_mode mode, rtx op0, > { > op0 = force_reg (GET_MODE (op0), op0); > return expand_binop (mode, add_optab, op0, op0, > - target, unsignedp, OPTAB_LIB_WIDEN); > + target, unsignedp, > + no_libcall ? OPTAB_WIDEN : OPTAB_LIB_WIDEN); > } > > /* This used to use umul_optab if unsigned, but for non-widening multiply > there is no difference between signed and unsigned. */ > op0 = expand_binop (mode, do_trapv ? smulv_optab : smul_optab, > - op0, op1, target, unsignedp, OPTAB_LIB_WIDEN); > - gcc_assert (op0); > + op0, op1, target, unsignedp, > + no_libcall ? OPTAB_WIDEN : OPTAB_LIB_WIDEN); > + gcc_assert (op0 || no_libcall); > return op0; > } > > > Jakub > > -- Richard Biener <rguent...@suse.de> SUSE LINUX GmbH, GF: Felix Imendoerffer, Jane Smithard, Graham Norton, HRB 21284 (AG Nuernberg)