Add two patterns to eliminate mispredicts in the following bit ops
scenarios:

- checking if a single bit is not set, and in this case set it: always
  set the bit;
- checking if a bitmask is set (even partially), and in this case clear
  it: always clear the bitmask.

Bootstrapped and tested with x86_64-pc-linux-gnu.

        PR tree-optimization/64567

gcc/ChangeLog:

        * match.pd (`cond (bit_and A IMM) (bit_or A IMM) A`): New
        pattern.
        (`cond (bit_and A IMM) (bit_and A ~IMM) A`): New pattern.

gcc/testsuite/ChangeLog:

        * gcc.dg/tree-ssa/pr64567-2.c: New test.
        * gcc.dg/tree-ssa/pr64567.c: New test.
---

Changes from v2:
- add a single bit check for the bit_and+bit_or optimization;
- simplify mask check for bit_and+bit_and optimization;
- turn pr64567.c into a run test that checks that these optimizations
  are not applied to volatile variables;
- add a new run test to assert that the new optimizations aren't
  giving a wrong result.
- v2 link: https://gcc.gnu.org/pipermail/gcc-patches/2026-February/708558.html


 gcc/match.pd                              | 20 +++++++++
 gcc/testsuite/gcc.dg/tree-ssa/pr64567-2.c | 52 +++++++++++++++++++++++
 gcc/testsuite/gcc.dg/tree-ssa/pr64567.c   | 40 +++++++++++++++++
 3 files changed, 112 insertions(+)
 create mode 100644 gcc/testsuite/gcc.dg/tree-ssa/pr64567-2.c
 create mode 100644 gcc/testsuite/gcc.dg/tree-ssa/pr64567.c

diff --git a/gcc/match.pd b/gcc/match.pd
index 8910591a04b..bca21fb7a9f 100644
--- a/gcc/match.pd
+++ b/gcc/match.pd
@@ -6316,6 +6316,26 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT)
               && !expand_vec_cond_expr_p (TREE_TYPE (@1), TREE_TYPE (@0)))))
    (vec_cond @0 (op!:type @3 @1) (op!:type @3 @2)))))
 
+/* If we have a "if a bit is not set, set it" case,
+   just set the bit all the time (PR 64567).  Note that
+   this does not work if we're checking for more than one
+   bit, e.g. ((a & 5) == 0 ? a | 5 : a) will fail for a = 1 (we
+   would return 5 instead of 1).  */
+(simplify
+ (cond (eq (bit_and @0 INTEGER_CST@1) integer_zerop)
+       (bit_ior@2 @0 INTEGER_CST@1) @0)
+  (if (wi::popcount (wi::to_wide (@1)) == 1)
+   @2))
+
+/* A clear-bit version of the above: "if a bitmask is
+   set, clear it".  In this case always clear the bitmask
+   (see PR 64567).  */
+(simplify
+ (cond (ne (bit_and @0 INTEGER_CST@1) integer_zerop)
+       (bit_and@3 @0 INTEGER_CST@2) @0)
+  (if (wi::to_wide (@2) == ~wi::to_wide (@1))
+   @3))
+
 #if GIMPLE
 (match (nop_atomic_bit_test_and_p @0 @1 @4)
  (bit_and (convert?@4 (ATOMIC_FETCH_OR_XOR_N @2 INTEGER_CST@0 @3))
diff --git a/gcc/testsuite/gcc.dg/tree-ssa/pr64567-2.c 
b/gcc/testsuite/gcc.dg/tree-ssa/pr64567-2.c
new file mode 100644
index 00000000000..5b523d2cfe7
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/tree-ssa/pr64567-2.c
@@ -0,0 +1,52 @@
+/* { dg-do run } */
+/* { dg-options "-O2" } */
+
+void abort(void);
+
+/* Macro adapted from builtin-object-size-common.h  */
+#define FAIL() \
+  do { \
+    __builtin_printf ("Failure at line: %d\n", __LINE__);                    \
+    abort();                                                                 \
+  } while (0)
+
+
+__attribute__((noinline))
+unsigned mask_zero_set (unsigned val, unsigned mask)
+{
+  if ((val & mask) == 0)
+    val |= mask;
+
+  return val;
+}
+
+__attribute__((noinline))
+unsigned mask_notzero_clear (unsigned val, unsigned mask)
+{
+  if ((val & mask) != 0)
+    val &= ~mask;
+
+  return val;
+}
+
+int main (void) {
+  if (mask_zero_set (0, 1) != 1)
+    FAIL ();
+
+  if (mask_zero_set (4, 3) != 7)
+    FAIL ();
+
+  if (mask_zero_set (1, 5) != 1)
+    FAIL ();
+
+  if (mask_notzero_clear (7, 1) != 6)
+    FAIL ();
+
+  if (mask_notzero_clear (7, 3) != 4)
+    FAIL ();
+
+  if (mask_notzero_clear (8, 6) != 8)
+    FAIL ();
+
+  return 0;
+}
diff --git a/gcc/testsuite/gcc.dg/tree-ssa/pr64567.c 
b/gcc/testsuite/gcc.dg/tree-ssa/pr64567.c
new file mode 100644
index 00000000000..51af71a3938
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/tree-ssa/pr64567.c
@@ -0,0 +1,40 @@
+/* { dg-do run } */
+/* { dg-options "-O2 -fdump-tree-optimized" } */
+
+#define F1 0x01
+#define F2 0x02
+
+#define DECLS(n,VOL)                   \
+__attribute__((noinline,noclone))      \
+unsigned foo##n(unsigned A)            \
+{                                      \
+  VOL unsigned flags = A;              \
+  if (flags & (F1 | F2))               \
+    flags &= ~(F1 | F2);               \
+  return flags;                        \
+}                                      \
+__attribute__((noinline,noclone))      \
+unsigned bar##n(unsigned A)            \
+{                                      \
+  VOL unsigned flags = A;              \
+  if (!(flags & F1))                   \
+    flags |= F1;                       \
+  return flags;                        \
+}                                      \
+
+DECLS(0,)
+DECLS(1,volatile)
+
+int main ()
+{
+  for(int A = 0; A < 4; A++)
+    {
+      if (foo0 (A) != foo1 (A)) __builtin_abort();
+      if (bar0 (A) != bar1 (A)) __builtin_abort();
+    }
+}
+
+/* foo1 and bar1 each contribute 2 gotos since they are not
+   optimized; 'main' contributes 6 more (2 gotos for the loop
+   and 2 gotos for each of the two abort checks).  */
+/* { dg-final { scan-tree-dump-times " goto " 10 optimized } } */
-- 
2.43.0

Reply via email to