This patch implements the constant folding optimization(s) described in
PR middle-end/98865, which should help address the serious performance
regression of Botan AES-128/XTS mentioned in PR tree-optimization/98856.
This combines aspects of both Jakub Jelinek's patch in comment #2 and
Andrew Pinski's patch in comment #4, so both are listed as co-authors.
Alas truth_valued_p is not quite what we want (and tweaking its
definition has undesirable side-effects), so instead this patch
introduces a new zero_one_valued predicate based on tree_nonzero_bits
that extends truth_valued_p (which is for Boolean types with single
bit precision). This is then used to simplify X*Y into X&Y when
both X and Y are zero_one_valued_p, and simplify X*Y into (-X)&Y when
X is zero_one_valued_p, in both cases replacing an integer multiplication
with a cheaper bit-wise AND.
This patch has been tested on x86_64-pc-linux-gnu with make bootstrap
and make -k check, both with and without --target_board=unix{-m32}, with
no new failures, except for a tweak required to tree-ssa/vrp116.c.
The recently proposed cmove patch ensures the i386 backend continues
to generate identical code for vrp116.c as before.
Ok, either for mainline or when stage 1 reopens?
2022-04-20 Roger Sayle <[email protected]>
Andrew Pinski <[email protected]>
Jakub Jelinek <[email protected]>
gcc/ChangeLog
PR middle-end/98865
* match.pd (match zero_one_valued_p): New predicate.
(mult @0 @1): Use zero_one_valued_p for transforming into (and @0
@1).
(mult zero_one_valued_p@0 @1): Convert integer multiplication into
a negation and a bit-wise AND, if it can't be cheaply implemented by
a single left shift.
gcc/testsuite/ChangeLog
PR middle-end/98865
* gcc.dg/pr98865.c: New test case.
* gcc.dg/tree-ssa/vrp116.c: Tweak test to confirm the integer
multiplication has been eliminated, rather than testing for the
actual replacement implementation.
Thanks,
Roger
--
diff --git a/gcc/match.pd b/gcc/match.pd
index 6d691d3..16a1203 100644
--- a/gcc/match.pd
+++ b/gcc/match.pd
@@ -285,14 +285,6 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT)
|| !COMPLEX_FLOAT_TYPE_P (type)))
(negate @0)))
-/* Transform { 0 or 1 } * { 0 or 1 } into { 0 or 1 } & { 0 or 1 } */
-(simplify
- (mult SSA_NAME@1 SSA_NAME@2)
- (if (INTEGRAL_TYPE_P (type)
- && get_nonzero_bits (@1) == 1
- && get_nonzero_bits (@2) == 1)
- (bit_and @1 @2)))
-
/* Transform x * { 0 or 1, 0 or 1, ... } into x & { 0 or -1, 0 or -1, ...},
unless the target has native support for the former but not the latter. */
(simplify
@@ -1789,6 +1781,28 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT)
(bit_not (bit_not @0))
@0)
+(match zero_one_valued_p
+ @0
+ (if (INTEGRAL_TYPE_P (type) && tree_nonzero_bits (@0) == 1)))
+(match zero_one_valued_p
+ truth_valued_p@0)
+
+/* Transform { 0 or 1 } * { 0 or 1 } into { 0 or 1 } & { 0 or 1 } */
+(simplify
+ (mult zero_one_valued_p@0 zero_one_valued_p@1)
+ (if (INTEGRAL_TYPE_P (type))
+ (bit_and @0 @1)))
+
+/* Transform x * { 0 or 1 } into x & { 0 or -1 }, i.e. an integer
+ multiplication into negate/bitwise and. Don't do this if the
+ multiplication is cheap, may be implemented by a single shift. */
+(simplify
+ (mult:c zero_one_valued_p@0 @1)
+ (if (INTEGRAL_TYPE_P (type)
+ && (TREE_CODE (@1) != INTEGER_CST
+ || wi::popcount (wi::to_wide (@1)) > 1))
+ (bit_and (negate @0) @1)))
+
/* Convert ~ (-A) to A - 1. */
(simplify
(bit_not (convert? (negate @0)))
diff --git a/gcc/testsuite/gcc.dg/pr98865.c b/gcc/testsuite/gcc.dg/pr98865.c
new file mode 100644
index 0000000..e7599d3
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/pr98865.c
@@ -0,0 +1,60 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -fdump-tree-optimized" } */
+
+#if __SIZEOF_INT__ == 4
+unsigned int foo(unsigned int a, unsigned int b)
+{
+ return (a >> 31) * b;
+}
+
+int bar(int a, int b)
+{
+ return -(a >> 31) * b;
+}
+
+int baz(int a, int b)
+{
+ int c = a >> 31;
+ int d = -c;
+ return d * b;
+}
+#endif
+
+#if __SIZEOF_LONG_LONG__ == 8
+unsigned long long fool (unsigned long long a, unsigned long long b)
+{
+ return (a >> 63) * b;
+}
+
+long long barl (long long a, long long b)
+{
+ return -(a >> 63) * b;
+}
+
+long long bazl (long long a, long long b)
+{
+ long long c = a >> 63;
+ long long d = -c;
+ return d * b;
+}
+#endif
+
+unsigned int pin (int a, unsigned int b)
+{
+ unsigned int t = a & 1;
+ return t * b;
+}
+
+unsigned long pinl (long a, unsigned long b)
+{
+ unsigned long t = a & 1;
+ return t * b;
+}
+
+unsigned long long pinll (long long a, unsigned long long b)
+{
+ unsigned long long t = a & 1;
+ return t * b;
+}
+
+/* { dg-final { scan-tree-dump-not " \\* " "optimized" } } */
diff --git a/gcc/testsuite/gcc.dg/tree-ssa/vrp116.c
b/gcc/testsuite/gcc.dg/tree-ssa/vrp116.c
index 9e68a77..469b232 100644
--- a/gcc/testsuite/gcc.dg/tree-ssa/vrp116.c
+++ b/gcc/testsuite/gcc.dg/tree-ssa/vrp116.c
@@ -9,4 +9,5 @@ f (int m1, int m2, int c)
return e ? m1 : m2;
}
-/* { dg-final { scan-tree-dump-times "\\? c_\[0-9\]\\(D\\) : 0" 1 "optimized"
} } */
+/* The MULT_EXPR should become BIT_AND_EXPR or COND_EXPR. */
+/* { dg-final { scan-tree-dump-not " \\* " "optimized" } } */