Kyrylo Tkachov <ktkac...@nvidia.com> writes:
> From 930789b3c366777c49d4eb2f4dc84b0374601504 Mon Sep 17 00:00:00 2001
> From: Kyrylo Tkachov <ktkac...@nvidia.com>
> Date: Fri, 11 Jul 2025 02:50:32 -0700
> Subject: [PATCH 1/2] aarch64: Use SVE2 NBSL for vector NOR and NAND for
>  Advanced SIMD modes
>
> We already have patterns that use the NBSL instruction to implement vector
> NOR and NAND operations for SVE types and modes.  It is straightforward to
> have similar patterns for the fixed-width Advanced SIMD modes as well, though
> it requires combine patterns without the predicate operand and an explicit 'Z'
> output modifier.  This patch does so.
>
> So now, for example, we generate for:
>
> uint64x2_t nand_q(uint64x2_t a, uint64x2_t b) { return NAND(a, b); }
> uint64x2_t nor_q(uint64x2_t a, uint64x2_t b) { return NOR(a, b); }
>
> nand_q:
> 	nbsl	z0.d, z0.d, z1.d, z1.d
> 	ret
>
> nor_q:
> 	nbsl	z0.d, z0.d, z1.d, z0.d
> 	ret
>
> instead of the previous:
>
> nand_q:
> 	and	v0.16b, v0.16b, v1.16b
> 	not	v0.16b, v0.16b
> 	ret
>
> nor_q:
> 	orr	v0.16b, v0.16b, v1.16b
> 	not	v0.16b, v0.16b
> 	ret
>
> The tied operand requirements for NBSL mean that we may need to generate a
> MOVPRFX when the operands fall that way, but a 2-insn MOVPRFX form is no
> worse than the current 2-insn codegen, and the MOVPRFX can be fused by many
> cores.
>
> Bootstrapped and tested on aarch64-none-linux-gnu.
>
> Signed-off-by: Kyrylo Tkachov <ktkac...@nvidia.com>
>
> gcc/
>
> 	* config/aarch64/aarch64-sve2.md (*aarch64_sve2_unpred_nor<mode>):
> 	New define_insn.
> 	(*aarch64_sve2_nand_unpred<mode>): Likewise.
>
> gcc/testsuite/
>
> 	* gcc.target/aarch64/sve2/nbsl_nor_nand_neon.c: New test.
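For reference, NBSL computes the bitwise inverse of a select: with the operand
order used above, the result is ~((op1 & op3) | (op2 & ~op3)).  The following
standalone C sketch (not part of the patch; the nbsl helper name is only for
illustration) checks that the two operand choices in the new patterns really
give NAND and NOR on a single 64-bit lane:

#include <assert.h>
#include <stdint.h>

/* Model NBSL Zdn, Zdn, Zm, Zk on one 64-bit element: invert the bitwise
   select that takes Zdn where Zk is set and Zm elsewhere.  */
static uint64_t nbsl (uint64_t zdn, uint64_t zm, uint64_t zk)
{
  return ~((zdn & zk) | (zm & ~zk));
}

int main (void)
{
  uint64_t a = 0x0123456789abcdefULL, b = 0xfedcba9876543210ULL;

  /* nbsl z0.d, z0.d, z1.d, z1.d  ==  NAND (a, b).  */
  assert (nbsl (a, b, b) == ~(a & b));

  /* nbsl z0.d, z0.d, z1.d, z0.d  ==  NOR (a, b).  */
  assert (nbsl (a, b, a) == ~(a | b));

  return 0;
}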
OK, thanks.

Richard

> ---
>  gcc/config/aarch64/aarch64-sve2.md            | 29 ++++++++
>  .../aarch64/sve2/nbsl_nor_nand_neon.c         | 68 +++++++++++++++++++
>  2 files changed, 97 insertions(+)
>  create mode 100644 gcc/testsuite/gcc.target/aarch64/sve2/nbsl_nor_nand_neon.c
>
> diff --git a/gcc/config/aarch64/aarch64-sve2.md b/gcc/config/aarch64/aarch64-sve2.md
> index 233a9b51c25..6d6dc94cd81 100644
> --- a/gcc/config/aarch64/aarch64-sve2.md
> +++ b/gcc/config/aarch64/aarch64-sve2.md
> @@ -1645,6 +1645,20 @@
>    }
>  )
>
> +(define_insn "*aarch64_sve2_unpred_nor<mode>"
> +  [(set (match_operand:VDQ_I 0 "register_operand")
> +	(and:VDQ_I
> +	  (not:VDQ_I
> +	    (match_operand:VDQ_I 1 "register_operand"))
> +	  (not:VDQ_I
> +	    (match_operand:VDQ_I 2 "register_operand"))))]
> +  "TARGET_SVE2"
> +  {@ [ cons: =0 , %1 , 2 ; attrs: movprfx ]
> +     [ w        , 0  , w ; *              ] nbsl\t%Z0.d, %Z0.d, %Z2.d, %Z0.d
> +     [ ?&w      , w  , w ; yes            ] movprfx\t%Z0, %Z1\;nbsl\t%Z0.d, %Z0.d, %Z2.d, %Z1.d
> +  }
> +)
> +
>  ;; Use NBSL for vector NAND.
>  (define_insn_and_rewrite "*aarch64_sve2_nand<mode>"
>    [(set (match_operand:SVE_FULL_I 0 "register_operand")
> @@ -1667,6 +1681,21 @@
>    }
>  )
>
> +;; Same as above but unpredicated and including Advanced SIMD modes.
> +(define_insn "*aarch64_sve2_nand_unpred<mode>"
> +  [(set (match_operand:VDQ_I 0 "register_operand")
> +	(ior:VDQ_I
> +	  (not:VDQ_I
> +	    (match_operand:VDQ_I 1 "register_operand"))
> +	  (not:VDQ_I
> +	    (match_operand:VDQ_I 2 "register_operand"))))]
> +  "TARGET_SVE2"
> +  {@ [ cons: =0 , %1 , 2 ; attrs: movprfx ]
> +     [ w        , 0  , w ; *              ] nbsl\t%Z0.d, %Z0.d, %Z2.d, %Z2.d
> +     [ ?&w      , w  , w ; yes            ] movprfx\t%Z0, %Z1\;nbsl\t%Z0.d, %Z0.d, %Z2.d, %Z2.d
> +  }
> +)
> +
>  ;; Unpredicated bitwise select.
>  ;; (op3 ? bsl_mov : bsl_dup) == (((bsl_mov ^ bsl_dup) & op3) ^ bsl_dup)
>  (define_expand "@aarch64_sve2_bsl<mode>"
> diff --git a/gcc/testsuite/gcc.target/aarch64/sve2/nbsl_nor_nand_neon.c b/gcc/testsuite/gcc.target/aarch64/sve2/nbsl_nor_nand_neon.c
> new file mode 100644
> index 00000000000..09bfc194f88
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/sve2/nbsl_nor_nand_neon.c
> @@ -0,0 +1,68 @@
> +/* { dg-options "-O2" } */
> +/* { dg-final { check-function-bodies "**" "" "" } } */
> +
> +#include <arm_neon.h>
> +
> +#define NAND(x, y) (~((x) & (y)))
> +#define NOR(x, y) (~((x) | (y)))
> +
> +/*
> +** nand_d:
> +**	nbsl	z0.d, z0.d, z1.d, z1.d
> +**	ret
> +*/
> +uint32x2_t nand_d(uint32x2_t a, uint32x2_t b) { return NAND(a, b); }
> +
> +/*
> +** nand_d_mp:
> +**	movprfx	z0, z1
> +**	nbsl	z0.d, z0.d, z2.d, z2.d
> +**	ret
> +*/
> +uint32x2_t nand_d_mp(uint32x2_t c, uint32x2_t a, uint32x2_t b) { return NAND(a, b); }
> +
> +/*
> +** nor_d:
> +**	nbsl	z0.d, z0.d, z1.d, z0.d
> +**	ret
> +*/
> +uint32x2_t nor_d(uint32x2_t a, uint32x2_t b) { return NOR(a, b); }
> +
> +/*
> +** nor_d_mp:
> +**	movprfx	z0, z1
> +**	nbsl	z0.d, z0.d, z2.d, z1.d
> +**	ret
> +*/
> +uint32x2_t nor_d_mp(uint32x2_t c, uint32x2_t a, uint32x2_t b) { return NOR(a, b); }
> +
> +/*
> +** nand_q:
> +**	nbsl	z0.d, z0.d, z1.d, z1.d
> +**	ret
> +*/
> +uint64x2_t nand_q(uint64x2_t a, uint64x2_t b) { return NAND(a, b); }
> +
> +/*
> +** nand_q_mp:
> +**	movprfx	z0, z1
> +**	nbsl	z0.d, z0.d, z2.d, z2.d
> +**	ret
> +*/
> +uint32x4_t nand_q_mp(uint32x4_t c, uint32x4_t a, uint32x4_t b) { return NAND(a, b); }
> +
> +/*
> +** nor_q:
> +**	nbsl	z0.d, z0.d, z1.d, z0.d
> +**	ret
> +*/
> +uint64x2_t nor_q(uint64x2_t a, uint64x2_t b) { return NOR(a, b); }
> +
> +/*
> +** nor_q_mp:
> +**	movprfx	z0, z1
> +**	nbsl	z0.d, z0.d, z2.d, z1.d
> +**	ret
> +*/
> +uint32x4_t nor_q_mp(uint32x4_t c, uint32x4_t a, uint32x4_t b) { return NOR(a, b); }
> +
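To see the new codegen outside the testsuite, a minimal standalone reproducer
is below (just a sketch: the file name and the -march=armv9-a choice, which
enables SVE2, are examples rather than anything the patch requires).

/* nbsl_demo.c: compile with something like
     aarch64-none-linux-gnu-gcc -O2 -march=armv9-a -S nbsl_demo.c
   and each function below should assemble to a single NBSL plus the return,
   as in the commit message.  GCC accepts the C bitwise operators on the
   arm_neon.h vector types, which is what the new test relies on as well.  */
#include <arm_neon.h>

uint64x2_t nand_q (uint64x2_t a, uint64x2_t b) { return ~(a & b); }
uint64x2_t nor_q (uint64x2_t a, uint64x2_t b) { return ~(a | b); }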