Hi!

Even if target has an umulv4_optab pattern like i?86 mul[lq]; jo,
if one argument is constant power of two, using two shifts, or
for multiplication by 2 just shl with js on previous value is faster
(though, for -Os mul[lq]; jo wins).

Bootstrapped/regtested on x86_64-linux and i686-linux, ok for trunk?

2017-11-29  Jakub Jelinek  <ja...@redhat.com>

        PR target/83210
        * internal-fn.c (expand_mul_overflow): Optimize unsigned
        multiplication by power of 2 constant into two shifts + comparison.

        * gcc.target/i386/pr83210.c: New test.

--- gcc/internal-fn.c.jj        2017-11-22 21:37:46.000000000 +0100
+++ gcc/internal-fn.c   2017-11-29 17:44:17.699009169 +0100
@@ -1462,6 +1462,39 @@ expand_mul_overflow (location_t loc, tre
   type = build_nonstandard_integer_type (GET_MODE_PRECISION (mode), uns);
   sign = uns ? UNSIGNED : SIGNED;
   icode = optab_handler (uns ? umulv4_optab : mulv4_optab, mode);
+  if (uns
+      && (integer_pow2p (arg0) || integer_pow2p (arg1))
+      && (optimize_insn_for_speed_p () || icode == CODE_FOR_nothing))
+    {
+      /* Optimize unsigned multiplication by power of 2 constant
+        using 2 shifts, one for result, one to extract the shifted
+        out bits to see if they are all zero.
+        Don't do this if optimizing for size and we have umulv4_optab,
+        in that case assume multiplication will be shorter.  */
+      rtx opn0 = op0;
+      rtx opn1 = op1;
+      tree argn0 = arg0;
+      tree argn1 = arg1;
+      if (integer_pow2p (arg0))
+       {
+         std::swap (opn0, opn1);
+         std::swap (argn0, argn1);
+       }
+      int cnt = tree_log2 (argn1);
+      if (cnt >= 0 && cnt < GET_MODE_PRECISION (mode))
+       {
+         rtx upper = const0_rtx;
+         res = expand_shift (LSHIFT_EXPR, mode, opn0, cnt, NULL_RTX, uns);
+         if (cnt != 0)
+           upper = expand_shift (RSHIFT_EXPR, mode, opn0,
+                                 GET_MODE_PRECISION (mode) - cnt,
+                                 NULL_RTX, uns);
+         do_compare_rtx_and_jump (upper, const0_rtx, EQ, true, mode,
+                                  NULL_RTX, NULL, done_label,
+                                  profile_probability::very_likely ());
+         goto do_error_label;
+       }
+    }
   if (icode != CODE_FOR_nothing)
     {
       struct expand_operand ops[4];
--- gcc/testsuite/gcc.target/i386/pr83210.c.jj  2017-11-29 17:58:00.102881065 
+0100
+++ gcc/testsuite/gcc.target/i386/pr83210.c     2017-11-29 17:57:36.000000000 
+0100
@@ -0,0 +1,53 @@
+/* PR target/83210 */
+/* { dg-do compile } */
+/* { dg-options "-O2" } */
+/* { dg-final { scan-assembler-not {\mmul[lq]\M} } } */
+
+void bar (void);
+
+unsigned
+f1 (unsigned int x)
+{
+  unsigned res;
+  if (__builtin_mul_overflow (x, 2, &res))
+    bar ();
+  return res;
+}
+
+unsigned long
+f2 (unsigned long x)
+{
+  unsigned long res;
+  if (__builtin_mul_overflow (16, x, &res))
+    bar ();
+  return res;
+}
+
+unsigned long long
+f3 (unsigned long long x)
+{
+  unsigned long long res;
+  if (__builtin_mul_overflow (x, (1ULL << (__SIZEOF_LONG_LONG__ * __CHAR_BIT__ 
- 1)), &res))
+    bar ();
+  return res;
+}
+
+#ifdef __SIZEOF_INT128__
+unsigned __int128
+f4 (unsigned __int128 x)
+{
+  unsigned __int128 res;
+  if (__builtin_mul_overflow (x, (((unsigned __int128) 1) << 
(__SIZEOF_INT128__ * __CHAR_BIT__ / 2)), &res))
+    bar ();
+  return res;
+}
+
+unsigned __int128
+f5 (unsigned __int128 x)
+{
+  unsigned __int128 res;
+  if (__builtin_mul_overflow (x, (((unsigned __int128) 1) << 
(__SIZEOF_INT128__ * __CHAR_BIT__ / 2 + 3)), &res))
+    bar ();
+  return res;
+}
+#endif

        Jakub

Reply via email to