This patch adds a new rule for distributing lowpart subregs through
ANDs, IORs, and XORs with a constant, in cases where one of the terms
then disappears.  For example:

  (lowpart-subreg:QI (and:HI x 0x100))

simplifies to zero and

  (lowpart-subreg:QI (and:HI x 0xff))

simplifies to (lowpart-subreg:QI x).
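
In selftest terms, the second example could be exercised roughly as
follows (a minimal sketch along the lines of the scalar tests added
below; the QImode/HImode pair and the pseudo register are arbitrary
choices, not part of the patch):

  rtx reg = gen_rtx_REG (QImode, LAST_VIRTUAL_REGISTER + 1);
  rtx ext = lowpart_subreg (HImode, reg, QImode);
  rtx masked = simplify_gen_binary (AND, HImode, ext,
                                    gen_int_mode (0xff, HImode));
  /* With the new rule, truncating the AND result gives back REG,
     since the AND only clears bits that the truncation drops.  */
  ASSERT_RTX_EQ (lowpart_subreg (QImode, masked, HImode), reg);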

This kind of case would often be handled at some point by nonzero-bit
tracking.  However, the specific case I want the optimisation for is
SVE predicates, where nonzero-bit tracking isn't currently an option.
Specifically:
the predicate modes VNx8BI, VNx4BI and VNx2BI have the same size as
VNx16BI, but treat only every second, fourth, or eighth bit as
significant.  Thus if we have:

  (subreg:VNx8BI (and:VNx16BI x C))

where C is the repeating constant { 1, 0, 1, 0, ... }, then the
AND only clears bits that are made insignificant by the subreg,
and so the result is equal to (subreg:VNx8BI x).  Later patches
rely on this.
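
In selftest terms, for VNx8BI this amounts to something like the
following (a sketch of the first assertion in the aarch64 tests added
below; the pseudo register number is arbitrary):

  /* C = { 1, 0, 1, 0, ... } as a VNx16BI constant.  */
  rtx_vector_builder builder (VNx16BImode, 2, 1);
  builder.quick_push (const1_rtx);
  builder.quick_push (const0_rtx);
  rtx c = builder.build ();

  rtx reg = gen_rtx_REG (VNx8BImode, LAST_VIRTUAL_REGISTER + 1);
  rtx wide = lowpart_subreg (VNx16BImode, reg, VNx8BImode);
  rtx masked = simplify_gen_binary (AND, VNx16BImode, wide, c);
  /* The AND only clears bits that are insignificant in VNx8BI,
     so the truncation folds back to REG.  */
  ASSERT_EQ (lowpart_subreg (VNx8BImode, masked, VNx16BImode), reg);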

Tested on aarch64-linux-gnu.  OK to install?

Richard


gcc/
        * simplify-rtx.cc (simplify_context::simplify_subreg): Distribute
        lowpart subregs through AND/IOR/XOR, if doing so eliminates one
        of the terms.
        (test_scalar_int_ext_ops): Add some tests of the above for integers.
        * config/aarch64/aarch64.cc (aarch64_test_sve_folding): Likewise
        add tests for predicate modes.
---
 gcc/config/aarch64/aarch64.cc | 34 ++++++++++++++++
 gcc/simplify-rtx.cc           | 75 ++++++++++++++++++++++++++++++++++-
 2 files changed, 108 insertions(+), 1 deletion(-)

diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
index 4d9d83dfa88..c426f31439f 100644
--- a/gcc/config/aarch64/aarch64.cc
+++ b/gcc/config/aarch64/aarch64.cc
@@ -31963,9 +31963,43 @@ aarch64_test_sysreg_encoding_clashes (void)
 static void
 aarch64_test_sve_folding ()
 {
+  aarch64_target_switcher switcher (AARCH64_FL_SVE);
+
   tree res = fold_unary (BIT_NOT_EXPR, ssizetype,
                         ssize_int (poly_int64 (1, 1)));
   ASSERT_TRUE (operand_equal_p (res, ssize_int (poly_int64 (-2, -1))));
+
+  auto build_v16bi = [](bool a, bool b)
+    {
+      rtx_vector_builder builder (VNx16BImode, 2, 1);
+      builder.quick_push (a ? const1_rtx : const0_rtx);
+      builder.quick_push (b ? const1_rtx : const0_rtx);
+      return builder.build ();
+    };
+  rtx v16bi_10 = build_v16bi (1, 0);
+  rtx v16bi_01 = build_v16bi (0, 1);
+
+  for (auto mode : { VNx8BImode, VNx4BImode, VNx2BImode })
+    {
+      rtx reg = gen_rtx_REG (mode, LAST_VIRTUAL_REGISTER + 1);
+      rtx subreg = lowpart_subreg (VNx16BImode, reg, mode);
+      rtx and1 = simplify_gen_binary (AND, VNx16BImode, subreg, v16bi_10);
+      ASSERT_EQ (lowpart_subreg (mode, and1, VNx16BImode), reg);
+      rtx and0 = simplify_gen_binary (AND, VNx16BImode, subreg, v16bi_01);
+      ASSERT_EQ (lowpart_subreg (mode, and0, VNx16BImode), CONST0_RTX (mode));
+
+      rtx ior1 = simplify_gen_binary (IOR, VNx16BImode, subreg, v16bi_10);
+      ASSERT_EQ (lowpart_subreg (mode, ior1, VNx16BImode), CONSTM1_RTX (mode));
+      rtx ior0 = simplify_gen_binary (IOR, VNx16BImode, subreg, v16bi_01);
+      ASSERT_EQ (lowpart_subreg (mode, ior0, VNx16BImode), reg);
+
+      rtx xor1 = simplify_gen_binary (XOR, VNx16BImode, subreg, v16bi_10);
+      ASSERT_RTX_EQ (lowpart_subreg (mode, xor1, VNx16BImode),
+                    lowpart_subreg (mode, gen_rtx_NOT (VNx16BImode, subreg),
+                                    VNx16BImode));
+      rtx xor0 = simplify_gen_binary (XOR, VNx16BImode, subreg, v16bi_01);
+      ASSERT_EQ (lowpart_subreg (mode, xor0, VNx16BImode), reg);
+    }
 }
 
 /* Run all target-specific selftests.  */
diff --git a/gcc/simplify-rtx.cc b/gcc/simplify-rtx.cc
index cbe61b49bf6..125048da181 100644
--- a/gcc/simplify-rtx.cc
+++ b/gcc/simplify-rtx.cc
@@ -8394,9 +8394,45 @@ simplify_context::simplify_subreg (machine_mode outermode, rtx op,
       && VECTOR_MODE_P (innermode)
       && known_eq (GET_MODE_NUNITS (outermode), GET_MODE_NUNITS (innermode))
       && known_eq (GET_MODE_UNIT_SIZE (outermode),
-                   GET_MODE_UNIT_SIZE (innermode)))
+                  GET_MODE_UNIT_SIZE (innermode)))
     return simplify_gen_relational (GET_CODE (op), outermode, innermode,
                                    XEXP (op, 0), XEXP (op, 1));
+
+  /* Distribute lowpart subregs through logic ops in cases where one term
+     disappears.
+
+     (subreg:M1 (and:M2 X C1)) -> (subreg:M1 X)
+     (subreg:M1 (ior:M2 X C1)) -> (subreg:M1 C1)
+     (subreg:M1 (xor:M2 X C1)) -> (subreg:M1 (not:M2 X))
+
+     if M2 is no smaller than M1 and (subreg:M1 C1) is all-ones.
+
+     (subreg:M1 (and:M2 X C2)) -> (subreg:M1 C2)
+     (subreg:M1 (ior/xor:M2 X C2)) -> (subreg:M1 X)
+
+     if M2 is no smaller than M1 and (subreg:M1 C2) is zero.  */
+  if (known_ge (innersize, outersize)
+      && GET_MODE_CLASS (outermode) == GET_MODE_CLASS (innermode)
+      && (GET_CODE (op) == AND || GET_CODE (op) == IOR || GET_CODE (op) == XOR)
+      && CONSTANT_P (XEXP (op, 1)))
+    {
+      rtx op1_subreg = simplify_subreg (outermode, XEXP (op, 1), innermode, 0);
+      if (op1_subreg == CONSTM1_RTX (outermode))
+       {
+         if (GET_CODE (op) == IOR)
+           return op1_subreg;
+         rtx op0 = XEXP (op, 0);
+         if (GET_CODE (op) == XOR)
+           op0 = simplify_gen_unary (NOT, innermode, op0, innermode);
+         return simplify_gen_subreg (outermode, op0, innermode, 0);
+       }
+
+      if (op1_subreg == CONST0_RTX (outermode))
+       return (GET_CODE (op) == AND
+               ? op1_subreg
+               : simplify_gen_subreg (outermode, XEXP (op, 0), innermode, 0));
+    }
+
   return NULL_RTX;
 }
 
@@ -8668,6 +8704,43 @@ test_scalar_int_ext_ops (machine_mode bmode, machine_mode smode)
                                     lowpart_subreg (bmode, sreg, smode),
                                     bmode),
                 sreg);
+
+  /* Test extensions, followed by logic ops, followed by truncations.  */
+  rtx bsubreg = lowpart_subreg (bmode, sreg, smode);
+  rtx smask = gen_int_mode (GET_MODE_MASK (smode), bmode);
+  rtx inv_smask = gen_int_mode (~GET_MODE_MASK (smode), bmode);
+  ASSERT_RTX_EQ (lowpart_subreg (smode,
+                                simplify_gen_binary (AND, bmode,
+                                                     bsubreg, smask),
+                                bmode),
+                sreg);
+  ASSERT_RTX_EQ (lowpart_subreg (smode,
+                                simplify_gen_binary (AND, bmode,
+                                                     bsubreg, inv_smask),
+                                bmode),
+                const0_rtx);
+  ASSERT_RTX_EQ (lowpart_subreg (smode,
+                                simplify_gen_binary (IOR, bmode,
+                                                     bsubreg, smask),
+                                bmode),
+                constm1_rtx);
+  ASSERT_RTX_EQ (lowpart_subreg (smode,
+                                simplify_gen_binary (IOR, bmode,
+                                                     bsubreg, inv_smask),
+                                bmode),
+                sreg);
+  ASSERT_RTX_EQ (lowpart_subreg (smode,
+                                simplify_gen_binary (XOR, bmode,
+                                                     bsubreg, smask),
+                                bmode),
+                lowpart_subreg (smode,
+                                gen_rtx_NOT (bmode, bsubreg),
+                                bmode));
+  ASSERT_RTX_EQ (lowpart_subreg (smode,
+                                simplify_gen_binary (XOR, bmode,
+                                                     bsubreg, inv_smask),
+                                bmode),
+                sreg);
 }
 
 /* Verify more simplifications of integer extension/truncation.
-- 
2.43.0
