This patch implements the constant folding optimization(s) described in PR middle-end/98865, which should help address the serious performance regression of Botan AES-128/XTS mentioned in PR tree-optimization/98856. This combines aspects of both Jakub Jelinek's patch in comment #2 and Andrew Pinski's patch in comment #4, so both are listed as co-authors.
Alas truth_valued_p is not quite what we want (and tweaking its
definition has undesirable side-effects), so instead this patch
introduces a new zero_one_valued_p predicate, based on
tree_nonzero_bits, that extends truth_valued_p (which is for Boolean
types with single-bit precision).  This is then used to simplify X*Y
into X&Y when both X and Y are zero_one_valued_p, and to simplify X*Y
into (-X)&Y when X is zero_one_valued_p, in both cases replacing an
integer multiplication with a cheaper bit-wise AND.

This patch has been tested on x86_64-pc-linux-gnu with make bootstrap
and make -k check, both with and without --target_board=unix{-m32},
with no new failures, except for a tweak required to tree-ssa/vrp116.c.
The recently proposed cmove patch ensures the i386 backend continues to
generate identical code for vrp116.c as before.  Ok, either for
mainline or when stage 1 reopens?

2022-04-20  Roger Sayle  <ro...@nextmovesoftware.com>
            Andrew Pinski  <apin...@marvell.com>
            Jakub Jelinek  <ja...@redhat.com>

gcc/ChangeLog
	PR middle-end/98865
	* match.pd (match zero_one_valued_p): New predicate.
	(mult @0 @1): Use zero_one_valued_p for transforming into
	(and @0 @1).
	(mult zero_one_valued_p@0 @1): Convert integer multiplication
	into a negation and a bit-wise AND, if it can't be cheaply
	implemented by a single left shift.

gcc/testsuite/ChangeLog
	PR middle-end/98865
	* gcc.dg/pr98865.c: New test case.
	* gcc.dg/vrp116.c: Tweak test to confirm the integer
	multiplication has been eliminated, not for the actual
	replacement implementation.

Thanks,
Roger
--
diff --git a/gcc/match.pd b/gcc/match.pd
index 6d691d3..16a1203 100644
--- a/gcc/match.pd
+++ b/gcc/match.pd
@@ -285,14 +285,6 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT)
        || !COMPLEX_FLOAT_TYPE_P (type)))
   (negate @0)))
 
-/* Transform { 0 or 1 } * { 0 or 1 } into { 0 or 1 } & { 0 or 1 } */
-(simplify
- (mult SSA_NAME@1 SSA_NAME@2)
- (if (INTEGRAL_TYPE_P (type)
-      && get_nonzero_bits (@1) == 1
-      && get_nonzero_bits (@2) == 1)
-  (bit_and @1 @2)))
-
 /* Transform x * { 0 or 1, 0 or 1, ... } into x & { 0 or -1, 0 or -1,
    ...}, unless the target has native support for the former but not
    the latter.  */
 (simplify
@@ -1789,6 +1781,28 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT)
  (bit_not (bit_not @0))
  @0)
 
+(match zero_one_valued_p
+ @0
+ (if (INTEGRAL_TYPE_P (type) && tree_nonzero_bits (@0) == 1)))
+(match zero_one_valued_p
+ truth_valued_p@0)
+
+/* Transform { 0 or 1 } * { 0 or 1 } into { 0 or 1 } & { 0 or 1 } */
+(simplify
+ (mult zero_one_valued_p@0 zero_one_valued_p@1)
+ (if (INTEGRAL_TYPE_P (type))
+  (bit_and @0 @1)))
+
+/* Transform x * { 0 or 1 } into x & { 0 or -1 }, i.e. an integer
+   multiplication into negate/bitwise and.  Don't do this if the
+   multiplication is cheap, may be implemented by a single shift.  */
+(simplify
+ (mult:c zero_one_valued_p@0 @1)
+ (if (INTEGRAL_TYPE_P (type)
+      && (TREE_CODE (@1) != INTEGER_CST
+          || wi::popcount (wi::to_wide (@1)) > 1))
+  (bit_and (negate @0) @1)))
+
 /* Convert ~ (-A) to A - 1.  */
 (simplify
  (bit_not (convert? (negate @0)))
diff --git a/gcc/testsuite/gcc.dg/pr98865.c b/gcc/testsuite/gcc.dg/pr98865.c
new file mode 100644
index 0000000..e7599d3
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/pr98865.c
@@ -0,0 +1,60 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -fdump-tree-optimized" } */
+
+#if __SIZEOF_INT__ == 4
+unsigned int foo(unsigned int a, unsigned int b)
+{
+  return (a >> 31) * b;
+}
+
+int bar(int a, int b)
+{
+  return -(a >> 31) * b;
+}
+
+int baz(int a, int b)
+{
+  int c = a >> 31;
+  int d = -c;
+  return d * b;
+}
+#endif
+
+#if __SIZEOF_LONG_LONG__ == 8
+unsigned long long fool (unsigned long long a, unsigned long long b)
+{
+  return (a >> 63) * b;
+}
+
+long long barl (long long a, long long b)
+{
+  return -(a >> 63) * b;
+}
+
+long long bazl (long long a, long long b)
+{
+  long long c = a >> 63;
+  long long d = -c;
+  return d * b;
+}
+#endif
+
+unsigned int pin (int a, unsigned int b)
+{
+  unsigned int t = a & 1;
+  return t * b;
+}
+
+unsigned long pinl (long a, unsigned long b)
+{
+  unsigned long t = a & 1;
+  return t * b;
+}
+
+unsigned long long pinll (long long a, unsigned long long b)
+{
+  unsigned long long t = a & 1;
+  return t * b;
+}
+
+/* { dg-final { scan-tree-dump-not " \\* " "optimized" } } */
diff --git a/gcc/testsuite/gcc.dg/tree-ssa/vrp116.c b/gcc/testsuite/gcc.dg/tree-ssa/vrp116.c
index 9e68a77..469b232 100644
--- a/gcc/testsuite/gcc.dg/tree-ssa/vrp116.c
+++ b/gcc/testsuite/gcc.dg/tree-ssa/vrp116.c
@@ -9,4 +9,5 @@ f (int m1, int m2, int c)
   return e ? m1 : m2;
 }
 
-/* { dg-final { scan-tree-dump-times "\\? c_\[0-9\]\\(D\\) : 0" 1 "optimized" } } */
+/* The MULT_EXPR should become BIT_AND_EXPR or COND_EXPR.  */
+/* { dg-final { scan-tree-dump-not " \\* " "optimized" } } */