Kyrylo Tkachov <ktkac...@nvidia.com> writes:
> From 930789b3c366777c49d4eb2f4dc84b0374601504 Mon Sep 17 00:00:00 2001
> From: Kyrylo Tkachov <ktkac...@nvidia.com>
> Date: Fri, 11 Jul 2025 02:50:32 -0700
> Subject: [PATCH 1/2] aarch64: Use SVE2 NBSL for vector NOR and NAND for
>  Advanced SIMD modes
>
> We already have patterns to use the NBSL instruction to implement vector
> NOR and NAND operations for SVE types and modes.  It is straightforward to
> have similar patterns for the fixed-width Advanced SIMD modes as well, though
> it requires combine patterns without the predicate operand and an explicit 'Z'
> output modifier.  This patch does so.
>
> So now for example we generate for:
>
> uint64x2_t nand_q(uint64x2_t a, uint64x2_t b) { return NAND(a, b); }
> uint64x2_t nor_q(uint64x2_t a, uint64x2_t b) { return NOR(a, b); }
>
> nand_q:
>         nbsl    z0.d, z0.d, z1.d, z1.d
>         ret
>
> nor_q:
>         nbsl    z0.d, z0.d, z1.d, z0.d
>         ret
>
> instead of the previous:
> nand_q:
>         and     v0.16b, v0.16b, v1.16b
>         not     v0.16b, v0.16b
>         ret
>
> nor_q:
>         orr     v0.16b, v0.16b, v1.16b
>         not     v0.16b, v0.16b
>         ret
>
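> (As a reminder, NBSL Zdn, Zdn, Zm, Zk computes ~((Zdn & Zk) | (Zm & ~Zk)),
> the complement of the bitwise select used by the existing BSL patterns, so
> the operand choices above work out as:
>   nbsl z0, z0, z1, z1:  ~((a & b) | (b & ~b)) == ~(a & b)   i.e. NAND
>   nbsl z0, z0, z1, z0:  ~((a & a) | (b & ~a)) == ~(a | b)   i.e. NOR.)
>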
> The tied operand requirements for NBSL mean that a MOVPRFX may be needed
> when the register allocation doesn't line up, but a 2-insn MOVPRFX form is
> no worse than the current 2-insn codegen, and the MOVPRFX can be fused by
> many cores.
>
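> For example, when the result register does not overlap the inputs (taken
> from the new test below) the expected 2-insn MOVPRFX form is:
>
> nand_q_mp:
>         movprfx z0, z1
>         nbsl    z0.d, z0.d, z2.d, z2.d
>         ret
>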
> Bootstrapped and tested on aarch64-none-linux-gnu.
>
> Signed-off-by: Kyrylo Tkachov <ktkac...@nvidia.com>
>
> gcc/
>
>       * config/aarch64/aarch64-sve2.md (*aarch64_sve2_unpred_nor<mode>):
>       New define_insn.
>       (*aarch64_sve2_nand_unpred<mode>): Likewise.
>
> gcc/testsuite/
>
>       * gcc.target/aarch64/sve2/nbsl_nor_nand_neon.c: New test.

OK, thanks.

Richard

> ---
>  gcc/config/aarch64/aarch64-sve2.md            | 29 ++++++++
>  .../aarch64/sve2/nbsl_nor_nand_neon.c         | 68 +++++++++++++++++++
>  2 files changed, 97 insertions(+)
>  create mode 100644 gcc/testsuite/gcc.target/aarch64/sve2/nbsl_nor_nand_neon.c
>
> diff --git a/gcc/config/aarch64/aarch64-sve2.md b/gcc/config/aarch64/aarch64-sve2.md
> index 233a9b51c25..6d6dc94cd81 100644
> --- a/gcc/config/aarch64/aarch64-sve2.md
> +++ b/gcc/config/aarch64/aarch64-sve2.md
> @@ -1645,6 +1645,20 @@
>    }
>  )
>  
> +(define_insn "*aarch64_sve2_unpred_nor<mode>"
> +  [(set (match_operand:VDQ_I 0 "register_operand")
> +     (and:VDQ_I
> +       (not:VDQ_I
> +         (match_operand:VDQ_I 1 "register_operand"))
> +       (not:VDQ_I
> +         (match_operand:VDQ_I 2 "register_operand"))))]
> +  "TARGET_SVE2"
> +  {@ [ cons: =0 , %1 , 2 ; attrs: movprfx ]
> +     [ w        , 0  , w ; *              ] nbsl\t%Z0.d, %Z0.d, %Z2.d, %Z0.d
> +     [ ?&w      , w  , w ; yes            ] movprfx\t%Z0, %Z1\;nbsl\t%Z0.d, %Z0.d, %Z2.d, %Z1.d
> +  }
> +)
> +
>  ;; Use NBSL for vector NAND.
>  (define_insn_and_rewrite "*aarch64_sve2_nand<mode>"
>    [(set (match_operand:SVE_FULL_I 0 "register_operand")
> @@ -1667,6 +1681,21 @@
>    }
>  )
>  
> +;; Same as above but unpredicated and including Advanced SIMD modes.
> +(define_insn "*aarch64_sve2_nand_unpred<mode>"
> +  [(set (match_operand:VDQ_I 0 "register_operand")
> +     (ior:VDQ_I
> +       (not:VDQ_I
> +         (match_operand:VDQ_I 1 "register_operand"))
> +       (not:VDQ_I
> +         (match_operand:VDQ_I 2 "register_operand"))))]
> +  "TARGET_SVE2"
> +  {@ [ cons: =0 , %1 , 2 ; attrs: movprfx ]
> +     [ w        , 0  , w ; *              ] nbsl\t%Z0.d, %Z0.d, %Z2.d, %Z2.d
> +     [ ?&w      , w  , w ; yes            ] movprfx\t%Z0, %Z1\;nbsl\t%Z0.d, %Z0.d, %Z2.d, %Z2.d
> +  }
> +)
> +
>  ;; Unpredicated bitwise select.
>  ;; (op3 ? bsl_mov : bsl_dup) == (((bsl_mov ^ bsl_dup) & op3) ^ bsl_dup)
>  (define_expand "@aarch64_sve2_bsl<mode>"
> diff --git a/gcc/testsuite/gcc.target/aarch64/sve2/nbsl_nor_nand_neon.c b/gcc/testsuite/gcc.target/aarch64/sve2/nbsl_nor_nand_neon.c
> new file mode 100644
> index 00000000000..09bfc194f88
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/sve2/nbsl_nor_nand_neon.c
> @@ -0,0 +1,68 @@
> +/* { dg-options "-O2" } */
> +/* { dg-final { check-function-bodies "**" "" "" } } */
> +
> +#include <arm_neon.h>
> +
> +#define NAND(x, y)  (~((x) & (y)))
> +#define NOR(x, y)   (~((x) | (y)))
> +
> +/*
> +** nand_d:
> +**   nbsl    z0.d, z0.d, z1.d, z1.d
> +**   ret
> +*/
> +uint32x2_t nand_d(uint32x2_t a, uint32x2_t b) { return NAND(a, b); }
> +
> +/*
> +** nand_d_mp:
> +**   movprfx z0, z1
> +**   nbsl    z0.d, z0.d, z2.d, z2.d
> +**   ret
> +*/
> +uint32x2_t nand_d_mp(uint32x2_t c, uint32x2_t a, uint32x2_t b) { return NAND(a, b); }
> +
> +/*
> +** nor_d:
> +**   nbsl    z0.d, z0.d, z1.d, z0.d
> +**   ret
> +*/
> +uint32x2_t nor_d(uint32x2_t a, uint32x2_t b) { return NOR(a, b); }
> +
> +/*
> +** nor_d_mp:
> +**   movprfx z0, z1
> +**   nbsl    z0.d, z0.d, z2.d, z1.d
> +**   ret
> +*/
> +uint32x2_t nor_d_mp(uint32x2_t c, uint32x2_t a, uint32x2_t b) { return NOR(a, b); }
> +
> +/*
> +** nand_q:
> +**   nbsl    z0.d, z0.d, z1.d, z1.d
> +**   ret
> +*/
> +uint64x2_t nand_q(uint64x2_t a, uint64x2_t b) { return NAND(a, b); }
> +
> +/*
> +** nand_q_mp:
> +**   movprfx z0, z1
> +**   nbsl    z0.d, z0.d, z2.d, z2.d
> +**   ret
> +*/
> +uint32x4_t nand_q_mp(uint32x4_t c, uint32x4_t a, uint32x4_t b) { return NAND(a, b); }
> +
> +/*
> +** nor_q:
> +**   nbsl    z0.d, z0.d, z1.d, z0.d
> +**   ret
> +*/
> +uint64x2_t nor_q(uint64x2_t a, uint64x2_t b) { return NOR(a, b); }
> +
> +/*
> +** nor_q_mp:
> +**   movprfx z0, z1
> +**   nbsl    z0.d, z0.d, z2.d, z1.d
> +**   ret
> +*/
> +uint32x4_t nor_q_mp(uint32x4_t c, uint32x4_t a, uint32x4_t b) { return NOR(a, b); }
> +
