This patch adds a new rule for distributing lowpart subregs through ANDs, IORs, and XORs with a constant, in cases where one of the terms then disappears. For example:
  (lowpart-subreg:QI (and:HI x 0x100))

simplifies to zero and:

  (lowpart-subreg:QI (and:HI x 0xff))

simplifies to (lowpart-subreg:QI x).

This would often be handled at some point using nonzero bits.  However,
the specific case I want the optimisation for is SVE predicates, where
nonzero bit tracking isn't currently an option.

Specifically: the predicate modes VNx8BI, VNx4BI and VNx2BI have the
same size as VNx16BI, but treat only every second, fourth, or eighth
bit as significant.  Thus if we have:

  (subreg:VNx8BI (and:VNx16BI x C))

where C is the repeating constant { 1, 0, 1, 0, ... }, then the AND only
clears bits that are made insignificant by the subreg, and so the result
is equal to (subreg:VNx8BI x).  Later patches rely on this.

Tested on aarch64-linux-gnu.  OK to install?

Richard


gcc/
	* simplify-rtx.cc (simplify_context::simplify_subreg): Distribute
	lowpart subregs through AND/IOR/XOR, if doing so eliminates one
	of the terms.
	(test_scalar_int_ext_ops): Add some tests of the above for integers.
	* config/aarch64/aarch64.cc (aarch64_test_sve_folding): Likewise
	add tests for predicate modes.
---
 gcc/config/aarch64/aarch64.cc | 34 ++++++++++++++++
 gcc/simplify-rtx.cc           | 75 ++++++++++++++++++++++++++++++++++-
 2 files changed, 108 insertions(+), 1 deletion(-)
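(Not part of the patch: the following is a minimal standalone sketch of
the scalar examples in the covering note, modelling the QImode lowpart
of an HImode value as truncation to uint8_t.  It is purely illustrative.)

/* Check the scalar identities that the new rule is meant to implement,
   using truncation to uint8_t as a stand-in for taking the QImode
   lowpart of an HImode value.  */
#include <cassert>
#include <cstdint>

int
main ()
{
  for (unsigned int x = 0; x < 0x10000; ++x)
    {
      /* (lowpart-subreg:QI (and:HI x 0x100)) -> 0: the AND keeps only
         a bit that the lowpart discards.  */
      assert ((uint8_t) (x & 0x100) == 0);
      /* (lowpart-subreg:QI (and:HI x 0xff)) -> (lowpart-subreg:QI x):
         the AND only clears bits that the lowpart discards anyway.  */
      assert ((uint8_t) (x & 0xff) == (uint8_t) x);
      /* The analogous IOR and XOR cases.  */
      assert ((uint8_t) (x | 0xff00) == (uint8_t) x);
      assert ((uint8_t) (x | 0x00ff) == 0xff);
      assert ((uint8_t) (x ^ 0xff00) == (uint8_t) x);
      assert ((uint8_t) (x ^ 0x00ff) == (uint8_t) ~x);
    }
  return 0;
}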
diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
index 4d9d83dfa88..c426f31439f 100644
--- a/gcc/config/aarch64/aarch64.cc
+++ b/gcc/config/aarch64/aarch64.cc
@@ -31963,9 +31963,43 @@ aarch64_test_sysreg_encoding_clashes (void)
 static void
 aarch64_test_sve_folding ()
 {
+  aarch64_target_switcher switcher (AARCH64_FL_SVE);
+
   tree res = fold_unary (BIT_NOT_EXPR, ssizetype,
                          ssize_int (poly_int64 (1, 1)));
   ASSERT_TRUE (operand_equal_p (res, ssize_int (poly_int64 (-2, -1))));
+
+  auto build_v16bi = [](bool a, bool b)
+    {
+      rtx_vector_builder builder (VNx16BImode, 2, 1);
+      builder.quick_push (a ? const1_rtx : const0_rtx);
+      builder.quick_push (b ? const1_rtx : const0_rtx);
+      return builder.build ();
+    };
+  rtx v16bi_10 = build_v16bi (1, 0);
+  rtx v16bi_01 = build_v16bi (0, 1);
+
+  for (auto mode : { VNx8BImode, VNx4BImode, VNx2BImode })
+    {
+      rtx reg = gen_rtx_REG (mode, LAST_VIRTUAL_REGISTER + 1);
+      rtx subreg = lowpart_subreg (VNx16BImode, reg, mode);
+      rtx and1 = simplify_gen_binary (AND, VNx16BImode, subreg, v16bi_10);
+      ASSERT_EQ (lowpart_subreg (mode, and1, VNx16BImode), reg);
+      rtx and0 = simplify_gen_binary (AND, VNx16BImode, subreg, v16bi_01);
+      ASSERT_EQ (lowpart_subreg (mode, and0, VNx16BImode), CONST0_RTX (mode));
+
+      rtx ior1 = simplify_gen_binary (IOR, VNx16BImode, subreg, v16bi_10);
+      ASSERT_EQ (lowpart_subreg (mode, ior1, VNx16BImode), CONSTM1_RTX (mode));
+      rtx ior0 = simplify_gen_binary (IOR, VNx16BImode, subreg, v16bi_01);
+      ASSERT_EQ (lowpart_subreg (mode, ior0, VNx16BImode), reg);
+
+      rtx xor1 = simplify_gen_binary (XOR, VNx16BImode, subreg, v16bi_10);
+      ASSERT_RTX_EQ (lowpart_subreg (mode, xor1, VNx16BImode),
+                     lowpart_subreg (mode, gen_rtx_NOT (VNx16BImode, subreg),
+                                     VNx16BImode));
+      rtx xor0 = simplify_gen_binary (XOR, VNx16BImode, subreg, v16bi_01);
+      ASSERT_EQ (lowpart_subreg (mode, xor0, VNx16BImode), reg);
+    }
 }
 
 /* Run all target-specific selftests.  */
diff --git a/gcc/simplify-rtx.cc b/gcc/simplify-rtx.cc
index cbe61b49bf6..125048da181 100644
--- a/gcc/simplify-rtx.cc
+++ b/gcc/simplify-rtx.cc
@@ -8394,9 +8394,45 @@ simplify_context::simplify_subreg (machine_mode outermode, rtx op,
       && VECTOR_MODE_P (innermode)
       && known_eq (GET_MODE_NUNITS (outermode),
                    GET_MODE_NUNITS (innermode))
       && known_eq (GET_MODE_UNIT_SIZE (outermode),
-                   GET_MODE_UNIT_SIZE (innermode)))
+                   GET_MODE_UNIT_SIZE (innermode)))
     return simplify_gen_relational (GET_CODE (op), outermode, innermode,
                                     XEXP (op, 0), XEXP (op, 1));
+
+  /* Distribute lowpart subregs through logic ops in cases where one term
+     disappears.
+
+     (subreg:M1 (and:M2 X C1)) -> (subreg:M1 X)
+     (subreg:M1 (ior:M2 X C1)) -> (subreg:M1 C1)
+     (subreg:M1 (xor:M2 X C1)) -> (subreg:M1 (not:M2 X))
+
+     if M2 is no smaller than M1 and (subreg:M1 C1) is all-ones.
+
+     (subreg:M1 (and:M2 X C2)) -> (subreg:M1 C2)
+     (subreg:M1 (ior/xor:M2 X C2)) -> (subreg:M1 X)
+
+     if M2 is no smaller than M1 and (subreg:M1 C2) is zero.  */
+  if (known_ge (innersize, outersize)
+      && GET_MODE_CLASS (outermode) == GET_MODE_CLASS (innermode)
+      && (GET_CODE (op) == AND || GET_CODE (op) == IOR || GET_CODE (op) == XOR)
+      && CONSTANT_P (XEXP (op, 1)))
+    {
+      rtx op1_subreg = simplify_subreg (outermode, XEXP (op, 1), innermode, 0);
+      if (op1_subreg == CONSTM1_RTX (outermode))
+        {
+          if (GET_CODE (op) == IOR)
+            return op1_subreg;
+          rtx op0 = XEXP (op, 0);
+          if (GET_CODE (op) == XOR)
+            op0 = simplify_gen_unary (NOT, innermode, op0, innermode);
+          return simplify_gen_subreg (outermode, op0, innermode, 0);
+        }
+
+      if (op1_subreg == CONST0_RTX (outermode))
+        return (GET_CODE (op) == AND
+                ? op1_subreg
+                : simplify_gen_subreg (outermode, XEXP (op, 0), innermode, 0));
+    }
+
   return NULL_RTX;
 }
@@ -8668,6 +8704,43 @@ test_scalar_int_ext_ops (machine_mode bmode, machine_mode smode)
                                      lowpart_subreg (bmode, sreg, smode),
                                      bmode),
                  sreg);
+
+  /* Test extensions, followed by logic ops, followed by truncations.  */
+  rtx bsubreg = lowpart_subreg (bmode, sreg, smode);
+  rtx smask = gen_int_mode (GET_MODE_MASK (smode), bmode);
+  rtx inv_smask = gen_int_mode (~GET_MODE_MASK (smode), bmode);
+  ASSERT_RTX_EQ (lowpart_subreg (smode,
+                                 simplify_gen_binary (AND, bmode,
+                                                      bsubreg, smask),
+                                 bmode),
+                 sreg);
+  ASSERT_RTX_EQ (lowpart_subreg (smode,
+                                 simplify_gen_binary (AND, bmode,
+                                                      bsubreg, inv_smask),
+                                 bmode),
+                 const0_rtx);
+  ASSERT_RTX_EQ (lowpart_subreg (smode,
+                                 simplify_gen_binary (IOR, bmode,
+                                                      bsubreg, smask),
+                                 bmode),
+                 constm1_rtx);
+  ASSERT_RTX_EQ (lowpart_subreg (smode,
+                                 simplify_gen_binary (IOR, bmode,
+                                                      bsubreg, inv_smask),
+                                 bmode),
+                 sreg);
+  ASSERT_RTX_EQ (lowpart_subreg (smode,
+                                 simplify_gen_binary (XOR, bmode,
+                                                      bsubreg, smask),
+                                 bmode),
+                 lowpart_subreg (smode,
+                                 gen_rtx_NOT (bmode, bsubreg),
+                                 bmode));
+  ASSERT_RTX_EQ (lowpart_subreg (smode,
+                                 simplify_gen_binary (XOR, bmode,
+                                                      bsubreg, inv_smask),
+                                 bmode),
+                 sreg);
 }
 
 /* Verify more simplifications of integer extension/truncation.
-- 
2.43.0
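(Aside, also not part of the patch: the predicate-mode reasoning in the
covering note can be sanity-checked with a plain-integer model.  The
sketch below assumes 16 lanes, stores lane i of a VNx16BI predicate in
bit i of a uint16_t, and models the VNx8BI view as keeping only every
second bit; the helper name as_vnx8bi is made up for the illustration.)

#include <cassert>
#include <cstdint>

/* Keep only the bits that are significant when the 16 predicate bits
   are read as VNx8BI, i.e. every second bit.  */
static uint16_t
as_vnx8bi (uint16_t p)
{
  return p & 0x5555;
}

int
main ()
{
  const uint16_t c10 = 0x5555;  /* C = { 1, 0, 1, 0, ... } */
  const uint16_t c01 = 0xaaaa;  /* the complement { 0, 1, 0, 1, ... } */
  for (unsigned int x = 0; x < 0x10000; ++x)
    {
      /* The AND with C only clears bits that the VNx8BI view ignores,
         so (subreg:VNx8BI (and:VNx16BI x C)) acts like (subreg:VNx8BI x).  */
      assert (as_vnx8bi (x & c10) == as_vnx8bi (x));
      /* ANDing with the complement clears every significant bit, so the
         VNx8BI view is zero (the CONST0_RTX case in the new tests).  */
      assert (as_vnx8bi (x & c01) == 0);
    }
  return 0;
}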