On Tue, 14 Nov 2017, Jakub Jelinek wrote: > Hi! > > For targets that don't have {,u}mulv<mode>4 insn we try 3 different > expansions of the basic signed * signed -> signed or unsigned * unsigned -> > unsigned overflow computation. The first one is done if > if (GET_MODE_2XWIDER_MODE (mode).exists (&wmode) > && targetm.scalar_mode_supported_p (wmode)) > and we emit a WIDEN_MULT_EXPR followed by extraction of the hipart > from it (for testing overflow if both unsigned and signed) and > lowpart (result of the multiplication and for signed overflow testing > where we use MSB of it). This case is meant for use by smaller modes, > e.g. subword, where it is generally pretty efficient. Unfortunately > on some targets, e.g. mips 64-bit, where there is no widening mult > optab it can be expanded as a libcall on the full wmode operands, > which is slow and causes problems e.g. to some freestanding environments > like the Linux kernel that don't bother to link in libgcc.a or replacement > thereof. Then there is another case, usually pretty large, with usually > two but sometimes one multiplication, and various conditionals, shifts, etc. > meant primarily for the widest supported mode. And the last fallback > is just doing multiplication and never computing overflow, hopefully it > is never used at least on sane targets. > > This patch attempts to check if we'd emit WIDEN_MULT_EXPR as a libcall > and in that case tries to use the other possibilities, and only falls > back to the WIDEN_MULT_EXPR with a libcall if we'd otherwise use the > last fallback without overflow computation. > > In addition to it, it adds support for targets that have supported > MULT_HIGHPART_EXPR for that mode, by doing pretty much what the > WIDEN_MULT_EXPR case does, but instead of doing one multiplication > to compute both lowpart and highpart and then shifts to split those > we use one multiplication to compute the lowpart and one MULT_HIGHPART_EXPR > to compute the highpart. 
In theory this method doesn't have to be always > faster than the one with hmode, because the MULT_HIGHPART_EXPR case does > 2 multiplications plus one comparison, while the hmode code does sometimes > just one, but it is significantly shorter, fewer conditionals/branches > so I think it should be generally a win (if it turns out not to be the > case on some target, we could restrict it to -Os only or whatever). > > And lastly, the MULT_HIGHPART_EXPR case can actually be the optimal code > if we are only checking for the overflow and don't actually need the > multiplication value, it is unsigned multiply and we don't need any > res using code afterwards; in that case the low part multiply can be DCEd > and only the highpart multiply + comparison will remain. So, the patch > adds check for single IMAGPART_EXPR use and other conditions and uses > the MULT_HIGHPART_EXPR code in preference of the WIDEN_MULT_EXPR in that > case. > > Bootstrapped/regtested on x86_64-linux and i686-linux, tested on the > testcase with cross to mips, ok for trunk?
Ok, but can you add the testcase for the kernel issue? Thanks, Richard. > 2017-11-14 Jakub Jelinek <ja...@redhat.com> > > PR target/82981 > * internal-fn.c: Include gimple-ssa.h, tree-phinodes.h and > ssa-iterators.h. > (can_widen_mult_without_libcall): New function. > (expand_mul_overflow): If only checking unsigned mul overflow, > not result, and can do efficiently MULT_HIGHPART_EXPR, emit that. > Don't use WIDEN_MULT_EXPR if it would involve a libcall, unless > no other way works. Add MULT_HIGHPART_EXPR + MULT_EXPR support. > (expand_DIVMOD): Formatting fix. > * expmed.h (expand_mult): Add NO_LIBCALL argument. > * expmed.c (expand_mult): Likewise. Use OPTAB_WIDEN rather > than OPTAB_LIB_WIDEN if NO_LIBCALL is true, and allow it to fail. > > --- gcc/internal-fn.c.jj 2017-10-23 10:13:08.000000000 +0200 > +++ gcc/internal-fn.c 2017-11-14 16:48:25.414403348 +0100 > @@ -46,6 +46,9 @@ along with GCC; see the file COPYING3. > #include "recog.h" > #include "builtins.h" > #include "optabs-tree.h" > +#include "gimple-ssa.h" > +#include "tree-phinodes.h" > +#include "ssa-iterators.h" > > /* The names of each internal function, indexed by function number. */ > const char *const internal_fn_name_array[] = { > @@ -1172,6 +1175,35 @@ expand_neg_overflow (location_t loc, tre > } > } > > +/* Return true if UNS WIDEN_MULT_EXPR with result mode WMODE and operand > + mode MODE can be expanded without using a libcall. 
*/ > + > +static bool > +can_widen_mult_without_libcall (scalar_int_mode wmode, scalar_int_mode mode, > + rtx op0, rtx op1, bool uns) > +{ > + if (find_widening_optab_handler (umul_widen_optab, wmode, mode) > + != CODE_FOR_nothing) > + return true; > + > + if (find_widening_optab_handler (smul_widen_optab, wmode, mode) > + != CODE_FOR_nothing) > + return true; > + > + rtx_insn *last = get_last_insn (); > + if (CONSTANT_P (op0)) > + op0 = convert_modes (wmode, mode, op0, uns); > + else > + op0 = gen_raw_REG (wmode, LAST_VIRTUAL_REGISTER + 1); > + if (CONSTANT_P (op1)) > + op1 = convert_modes (wmode, mode, op1, uns); > + else > + op1 = gen_raw_REG (wmode, LAST_VIRTUAL_REGISTER + 2); > + rtx ret = expand_mult (wmode, op0, op1, NULL_RTX, uns, true); > + delete_insns_since (last); > + return ret != NULL_RTX; > +} > + > /* Add mul overflow checking to the statement STMT. */ > > static void > @@ -1465,9 +1497,29 @@ expand_mul_overflow (location_t loc, tre > ops.op1 = make_tree (type, op1); > ops.op2 = NULL_TREE; > ops.location = loc; > + > + /* Optimize unsigned overflow check where we don't use the > + multiplication result, just whether overflow happened. > + If we can do MULT_HIGHPART_EXPR, that followed by > + comparison of the result against zero is cheapest. > + We'll still compute res, but it should be DCEd later. 
*/ > + use_operand_p use; > + gimple *use_stmt; > + if (!is_ubsan > + && lhs > + && uns > + && !(uns0_p && uns1_p && !unsr_p) > + && can_mult_highpart_p (mode, uns) == 1 > + && single_imm_use (lhs, &use, &use_stmt) > + && is_gimple_assign (use_stmt) > + && gimple_assign_rhs_code (use_stmt) == IMAGPART_EXPR) > + goto highpart; > + > if (GET_MODE_2XWIDER_MODE (mode).exists (&wmode) > - && targetm.scalar_mode_supported_p (wmode)) > + && targetm.scalar_mode_supported_p (wmode) > + && can_widen_mult_without_libcall (wmode, mode, op0, op1, uns)) > { > + twoxwider: > ops.code = WIDEN_MULT_EXPR; > ops.type > = build_nonstandard_integer_type (GET_MODE_PRECISION (wmode), uns); > @@ -1495,6 +1547,35 @@ expand_mul_overflow (location_t loc, tre > profile_probability::very_likely ()); > } > } > + else if (can_mult_highpart_p (mode, uns) == 1) > + { > + highpart: > + ops.code = MULT_HIGHPART_EXPR; > + ops.type = type; > + > + rtx hipart = expand_expr_real_2 (&ops, NULL_RTX, mode, > + EXPAND_NORMAL); > + ops.code = MULT_EXPR; > + res = expand_expr_real_2 (&ops, NULL_RTX, mode, EXPAND_NORMAL); > + if (uns) > + /* For the unsigned multiplication, there was overflow if > + HIPART is non-zero. */ > + do_compare_rtx_and_jump (hipart, const0_rtx, EQ, true, mode, > + NULL_RTX, NULL, done_label, > + profile_probability::very_likely ()); > + else > + { > + rtx signbit = expand_shift (RSHIFT_EXPR, mode, res, prec - 1, > + NULL_RTX, 0); > + /* RES is low half of the double width result, HIPART > + the high half. There was overflow if > + HIPART is different from RES < 0 ? -1 : 0. 
*/ > + do_compare_rtx_and_jump (signbit, hipart, EQ, true, mode, > + NULL_RTX, NULL, done_label, > + profile_probability::very_likely ()); > + } > + > + } > else if (int_mode_for_size (prec / 2, 1).exists (&hmode) > && 2 * GET_MODE_PRECISION (hmode) == prec) > { > @@ -1800,6 +1881,11 @@ expand_mul_overflow (location_t loc, tre > tem = expand_expr_real_2 (&ops, NULL_RTX, mode, EXPAND_NORMAL); > emit_move_insn (res, tem); > } > + else if (GET_MODE_2XWIDER_MODE (mode).exists (&wmode) > + && targetm.scalar_mode_supported_p (wmode)) > + /* Even emitting a libcall is better than not detecting overflow > + at all. */ > + goto twoxwider; > else > { > gcc_assert (!is_ubsan); > @@ -2588,7 +2674,7 @@ expand_DIVMOD (internal_fn, gcall *call_ > expand_expr (build2 (COMPLEX_EXPR, TREE_TYPE (lhs), > make_tree (TREE_TYPE (arg0), quotient), > make_tree (TREE_TYPE (arg1), remainder)), > - target, VOIDmode, EXPAND_NORMAL); > + target, VOIDmode, EXPAND_NORMAL); > } > > /* Expand a call to FN using the operands in STMT. 
FN has a single > --- gcc/expmed.h.jj 2017-09-01 09:26:37.000000000 +0200 > +++ gcc/expmed.h 2017-11-14 12:32:11.344054003 +0100 > @@ -727,7 +727,7 @@ extern rtx extract_bit_field (rtx, unsig > unsigned HOST_WIDE_INT, int, rtx, > machine_mode, machine_mode, bool, rtx *); > extern rtx extract_low_bits (machine_mode, machine_mode, rtx); > -extern rtx expand_mult (machine_mode, rtx, rtx, rtx, int); > +extern rtx expand_mult (machine_mode, rtx, rtx, rtx, int, bool = false); > extern rtx expand_mult_highpart_adjust (scalar_int_mode, rtx, rtx, rtx, > rtx, int); > > --- gcc/expmed.c.jj 2017-11-09 15:38:48.000000000 +0100 > +++ gcc/expmed.c 2017-11-14 12:50:24.948946475 +0100 > @@ -3284,7 +3284,7 @@ expand_mult_const (machine_mode mode, rt > > rtx > expand_mult (machine_mode mode, rtx op0, rtx op1, rtx target, > - int unsignedp) > + int unsignedp, bool no_libcall) > { > enum mult_variant variant; > struct algorithm algorithm; > @@ -3420,14 +3420,16 @@ expand_mult (machine_mode mode, rtx op0, > { > op0 = force_reg (GET_MODE (op0), op0); > return expand_binop (mode, add_optab, op0, op0, > - target, unsignedp, OPTAB_LIB_WIDEN); > + target, unsignedp, > + no_libcall ? OPTAB_WIDEN : OPTAB_LIB_WIDEN); > } > > /* This used to use umul_optab if unsigned, but for non-widening multiply > there is no difference between signed and unsigned. */ > op0 = expand_binop (mode, do_trapv ? smulv_optab : smul_optab, > - op0, op1, target, unsignedp, OPTAB_LIB_WIDEN); > - gcc_assert (op0); > + op0, op1, target, unsignedp, > + no_libcall ? OPTAB_WIDEN : OPTAB_LIB_WIDEN); > + gcc_assert (op0 || no_libcall); > return op0; > } > > > Jakub > > -- Richard Biener <rguent...@suse.de> SUSE LINUX GmbH, GF: Felix Imendoerffer, Jane Smithard, Graham Norton, HRB 21284 (AG Nuernberg)