https://gcc.gnu.org/g:c02fa90cb32132c42c801c70144ceb76168248a2

commit r16-2283-gc02fa90cb32132c42c801c70144ceb76168248a2
Author: Kyrylo Tkachov <ktkac...@nvidia.com>
Date:   Fri Jul 11 02:50:32 2025 -0700

    aarch64: Use SVE2 NBSL for vector NOR and NAND for Advanced SIMD modes
    
    We already have patterns to use the NBSL instruction to implement vector
    NOR and NAND operations for SVE types and modes.  It is straightforward to
    have similar patterns for the fixed-width Advanced SIMD modes as well,
    though it requires combine patterns without the predicate operand and an
    explicit 'Z' output modifier.  This patch does so.
    
    So now for example we generate for:
    
    uint64x2_t nand_q(uint64x2_t a, uint64x2_t b) { return NAND(a, b); }
    uint64x2_t nor_q(uint64x2_t a, uint64x2_t b) { return NOR(a, b); }
    
    nand_q:
            nbsl    z0.d, z0.d, z1.d, z1.d
            ret
    
    nor_q:
            nbsl    z0.d, z0.d, z1.d, z0.d
            ret
    
    instead of the previous:
    nand_q:
            and     v0.16b, v0.16b, v1.16b
            not     v0.16b, v0.16b
            ret
    
    nor_q:
            orr     v0.16b, v0.16b, v1.16b
            not     v0.16b, v0.16b
            ret
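
    For reference, NBSL computes the negated bitwise select
    ~((dn & k) | (m & ~k)) on its tied first operand, which is why the
    operand choices above reduce to NAND and NOR.  A minimal scalar sketch of
    that identity (the nbsl_model helper and the test values are illustrative
    only, not part of the patch):

    #include <assert.h>
    #include <stdint.h>

    /* Hypothetical scalar model of NBSL Zdn, Zdn, Zm, Zk on one element.  */
    static uint64_t nbsl_model (uint64_t dn, uint64_t m, uint64_t k)
    {
      return ~((dn & k) | (m & ~k));
    }

    int main (void)
    {
      uint64_t a = 0x1234567890abcdefULL, b = 0xfedcba0987654321ULL;
      /* nbsl z0, z0, z1, z1: k == b zeroes the (m & ~k) term, so the
         result is ~(a & b), i.e. NAND.  */
      assert (nbsl_model (a, b, b) == ~(a & b));
      /* nbsl z0, z0, z1, z0: k == a gives ~(a | (b & ~a)) == ~(a | b),
         i.e. NOR.  */
      assert (nbsl_model (a, b, a) == ~(a | b));
      return 0;
    }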
    
    The tied operand requirements for NBSL mean that we may need to emit a
    MOVPRFX when the operands fall that way, but a 2-insn MOVPRFX form is no
    worse than the current 2-insn codegen, and the MOVPRFX can be fused by
    many cores.
    
    Bootstrapped and tested on aarch64-none-linux-gnu.
    
    Signed-off-by: Kyrylo Tkachov <ktkac...@nvidia.com>
    
    gcc/
    
            * config/aarch64/aarch64-sve2.md (*aarch64_sve2_unpred_nor<mode>):
            New define_insn.
            (*aarch64_sve2_nand_unpred<mode>): Likewise.
    
    gcc/testsuite/
    
            * gcc.target/aarch64/sve2/nbsl_nor_nand_neon.c: New test.

Diff:
---
 gcc/config/aarch64/aarch64-sve2.md                 | 29 +++++++++
 .../gcc.target/aarch64/sve2/nbsl_nor_nand_neon.c   | 68 ++++++++++++++++++++++
 2 files changed, 97 insertions(+)

diff --git a/gcc/config/aarch64/aarch64-sve2.md b/gcc/config/aarch64/aarch64-sve2.md
index 660901d4b3f1..7148f54b363f 100644
--- a/gcc/config/aarch64/aarch64-sve2.md
+++ b/gcc/config/aarch64/aarch64-sve2.md
@@ -1645,6 +1645,20 @@
   }
 )
 
+(define_insn "*aarch64_sve2_unpred_nor<mode>"
+  [(set (match_operand:VDQ_I 0 "register_operand")
+       (and:VDQ_I
+         (not:VDQ_I
+           (match_operand:VDQ_I 1 "register_operand"))
+         (not:VDQ_I
+           (match_operand:VDQ_I 2 "register_operand"))))]
+  "TARGET_SVE2"
+  {@ [ cons: =0 , %1 , 2 ; attrs: movprfx ]
+     [ w        , 0  , w ; *              ] nbsl\t%Z0.d, %Z0.d, %Z2.d, %Z0.d
+     [ ?&w      , w  , w ; yes            ] movprfx\t%Z0, %Z1\;nbsl\t%Z0.d, %Z0.d, %Z2.d, %Z1.d
+  }
+)
+
 ;; Use NBSL for vector NAND.
 (define_insn_and_rewrite "*aarch64_sve2_nand<mode>"
   [(set (match_operand:SVE_FULL_I 0 "register_operand")
@@ -1667,6 +1681,21 @@
   }
 )
 
+;; Same as above but unpredicated and including Advanced SIMD modes.
+(define_insn "*aarch64_sve2_nand_unpred<mode>"
+  [(set (match_operand:VDQ_I 0 "register_operand")
+       (ior:VDQ_I
+         (not:VDQ_I
+           (match_operand:VDQ_I 1 "register_operand"))
+         (not:VDQ_I
+           (match_operand:VDQ_I 2 "register_operand"))))]
+  "TARGET_SVE2"
+  {@ [ cons: =0 , %1 , 2 ; attrs: movprfx ]
+     [ w        , 0  , w ; *              ] nbsl\t%Z0.d, %Z0.d, %Z2.d, %Z2.d
+     [ ?&w      , w  , w ; yes            ] movprfx\t%Z0, %Z1\;nbsl\t%Z0.d, %Z0.d, %Z2.d, %Z2.d
+  }
+)
+
 ;; Unpredicated bitwise select.
 ;; (op3 ? bsl_mov : bsl_dup) == (((bsl_mov ^ bsl_dup) & op3) ^ bsl_dup)
 (define_expand "@aarch64_sve2_bsl<mode>"
diff --git a/gcc/testsuite/gcc.target/aarch64/sve2/nbsl_nor_nand_neon.c b/gcc/testsuite/gcc.target/aarch64/sve2/nbsl_nor_nand_neon.c
new file mode 100644
index 000000000000..09bfc194f88a
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve2/nbsl_nor_nand_neon.c
@@ -0,0 +1,68 @@
+/* { dg-options "-O2" } */
+/* { dg-final { check-function-bodies "**" "" "" } } */
+
+#include <arm_neon.h>
+
+#define NAND(x, y)  (~((x) & (y)))
+#define NOR(x, y)   (~((x) | (y)))
+
+/*
+** nand_d:
+**     nbsl    z0.d, z0.d, z1.d, z1.d
+**     ret
+*/
+uint32x2_t nand_d(uint32x2_t a, uint32x2_t b) { return NAND(a, b); }
+
+/*
+** nand_d_mp:
+**     movprfx z0, z1
+**     nbsl    z0.d, z0.d, z2.d, z2.d
+**     ret
+*/
+uint32x2_t nand_d_mp(uint32x2_t c, uint32x2_t a, uint32x2_t b) { return NAND(a, b); }
+
+/*
+** nor_d:
+**     nbsl    z0.d, z0.d, z1.d, z0.d
+**     ret
+*/
+uint32x2_t nor_d(uint32x2_t a, uint32x2_t b) { return NOR(a, b); }
+
+/*
+** nor_d_mp:
+**     movprfx z0, z1
+**     nbsl    z0.d, z0.d, z2.d, z1.d
+**     ret
+*/
+uint32x2_t nor_d_mp(uint32x2_t c, uint32x2_t a, uint32x2_t b) { return NOR(a, b); }
+
+/*
+** nand_q:
+**     nbsl    z0.d, z0.d, z1.d, z1.d
+**     ret
+*/
+uint64x2_t nand_q(uint64x2_t a, uint64x2_t b) { return NAND(a, b); }
+
+/*
+** nand_q_mp:
+**     movprfx z0, z1
+**     nbsl    z0.d, z0.d, z2.d, z2.d
+**     ret
+*/
+uint32x4_t nand_q_mp(uint32x4_t c, uint32x4_t a, uint32x4_t b) { return NAND(a, b); }
+
+/*
+** nor_q:
+**     nbsl    z0.d, z0.d, z1.d, z0.d
+**     ret
+*/
+uint64x2_t nor_q(uint64x2_t a, uint64x2_t b) { return NOR(a, b); }
+
+/*
+** nor_q_mp:
+**     movprfx z0, z1
+**     nbsl    z0.d, z0.d, z2.d, z1.d
+**     ret
+*/
+uint32x4_t nor_q_mp(uint32x4_t c, uint32x4_t a, uint32x4_t b) { return NOR(a, b); }
+
