This patch adds unpacked support for unconditional and conditional CNOT. The type suffix has to be taken from the element size rather than the container size.
Tested on aarch64-linux-gnu and aarch64_be-elf. Pushed to trunk. Richard gcc/ * config/aarch64/aarch64-sve.md (*cnot<mode>): Extend from SVE_FULL_I to SVE_I. (*cond_cnot<mode>_2, *cond_cnot<mode>_any): Likewise. gcc/testsuite/ * gcc.target/aarch64/sve/cnot_2.c: New test. * gcc.target/aarch64/sve/cond_cnot_4.c: Likewise. * gcc.target/aarch64/sve/cond_cnot_4_run.c: Likewise. * gcc.target/aarch64/sve/cond_cnot_5.c: Likewise. * gcc.target/aarch64/sve/cond_cnot_5_run.c: Likewise. * gcc.target/aarch64/sve/cond_cnot_6.c: Likewise. * gcc.target/aarch64/sve/cond_cnot_6_run.c: Likewise. --- gcc/config/aarch64/aarch64-sve.md | 36 +++++++++---------- gcc/testsuite/gcc.target/aarch64/sve/cnot_2.c | 29 +++++++++++++++ .../gcc.target/aarch64/sve/cond_cnot_4.c | 32 +++++++++++++++++ .../gcc.target/aarch64/sve/cond_cnot_4_run.c | 26 ++++++++++++++ .../gcc.target/aarch64/sve/cond_cnot_5.c | 32 +++++++++++++++++ .../gcc.target/aarch64/sve/cond_cnot_5_run.c | 26 ++++++++++++++ .../gcc.target/aarch64/sve/cond_cnot_6.c | 31 ++++++++++++++++ .../gcc.target/aarch64/sve/cond_cnot_6_run.c | 26 ++++++++++++++ 8 files changed, 220 insertions(+), 18 deletions(-) create mode 100644 gcc/testsuite/gcc.target/aarch64/sve/cnot_2.c create mode 100644 gcc/testsuite/gcc.target/aarch64/sve/cond_cnot_4.c create mode 100644 gcc/testsuite/gcc.target/aarch64/sve/cond_cnot_4_run.c create mode 100644 gcc/testsuite/gcc.target/aarch64/sve/cond_cnot_5.c create mode 100644 gcc/testsuite/gcc.target/aarch64/sve/cond_cnot_5_run.c create mode 100644 gcc/testsuite/gcc.target/aarch64/sve/cond_cnot_6.c create mode 100644 gcc/testsuite/gcc.target/aarch64/sve/cond_cnot_6_run.c diff --git a/gcc/config/aarch64/aarch64-sve.md b/gcc/config/aarch64/aarch64-sve.md index b83f9912cb6..2f5a5e3c914 100644 --- a/gcc/config/aarch64/aarch64-sve.md +++ b/gcc/config/aarch64/aarch64-sve.md @@ -3227,16 +3227,16 @@ (define_expand "@aarch64_pred_cnot<mode>" ) (define_insn "*cnot<mode>" - [(set (match_operand:SVE_FULL_I 0 "register_operand" 
"=w, ?&w") - (unspec:SVE_FULL_I + [(set (match_operand:SVE_I 0 "register_operand" "=w, ?&w") + (unspec:SVE_I [(unspec:<VPRED> [(match_operand:<VPRED> 1 "register_operand" "Upl, Upl") (match_operand:SI 5 "aarch64_sve_ptrue_flag") (eq:<VPRED> - (match_operand:SVE_FULL_I 2 "register_operand" "0, w") - (match_operand:SVE_FULL_I 3 "aarch64_simd_imm_zero"))] + (match_operand:SVE_I 2 "register_operand" "0, w") + (match_operand:SVE_I 3 "aarch64_simd_imm_zero"))] UNSPEC_PRED_Z) - (match_operand:SVE_FULL_I 4 "aarch64_simd_imm_one") + (match_operand:SVE_I 4 "aarch64_simd_imm_one") (match_dup 3)] UNSPEC_SEL))] "TARGET_SVE" @@ -3274,19 +3274,19 @@ (define_expand "@cond_cnot<mode>" ;; Predicated logical inverse, merging with the first input. (define_insn_and_rewrite "*cond_cnot<mode>_2" - [(set (match_operand:SVE_FULL_I 0 "register_operand" "=w, ?&w") - (unspec:SVE_FULL_I + [(set (match_operand:SVE_I 0 "register_operand" "=w, ?&w") + (unspec:SVE_I [(match_operand:<VPRED> 1 "register_operand" "Upl, Upl") ;; Logical inverse of operand 2 (as above). - (unspec:SVE_FULL_I + (unspec:SVE_I [(unspec:<VPRED> [(match_operand 5) (const_int SVE_KNOWN_PTRUE) (eq:<VPRED> - (match_operand:SVE_FULL_I 2 "register_operand" "0, w") - (match_operand:SVE_FULL_I 3 "aarch64_simd_imm_zero"))] + (match_operand:SVE_I 2 "register_operand" "0, w") + (match_operand:SVE_I 3 "aarch64_simd_imm_zero"))] UNSPEC_PRED_Z) - (match_operand:SVE_FULL_I 4 "aarch64_simd_imm_one") + (match_operand:SVE_I 4 "aarch64_simd_imm_one") (match_dup 3)] UNSPEC_SEL) (match_dup 2)] @@ -3310,22 +3310,22 @@ (define_insn_and_rewrite "*cond_cnot<mode>_2" ;; as earlyclobber helps to make the instruction more regular to the ;; register allocator. 
(define_insn_and_rewrite "*cond_cnot<mode>_any" - [(set (match_operand:SVE_FULL_I 0 "register_operand" "=&w, ?&w, ?&w") - (unspec:SVE_FULL_I + [(set (match_operand:SVE_I 0 "register_operand" "=&w, ?&w, ?&w") + (unspec:SVE_I [(match_operand:<VPRED> 1 "register_operand" "Upl, Upl, Upl") ;; Logical inverse of operand 2 (as above). - (unspec:SVE_FULL_I + (unspec:SVE_I [(unspec:<VPRED> [(match_operand 5) (const_int SVE_KNOWN_PTRUE) (eq:<VPRED> - (match_operand:SVE_FULL_I 2 "register_operand" "w, w, w") - (match_operand:SVE_FULL_I 3 "aarch64_simd_imm_zero"))] + (match_operand:SVE_I 2 "register_operand" "w, w, w") + (match_operand:SVE_I 3 "aarch64_simd_imm_zero"))] UNSPEC_PRED_Z) - (match_operand:SVE_FULL_I 4 "aarch64_simd_imm_one") + (match_operand:SVE_I 4 "aarch64_simd_imm_one") (match_dup 3)] UNSPEC_SEL) - (match_operand:SVE_FULL_I 6 "aarch64_simd_reg_or_zero" "0, Dz, w")] + (match_operand:SVE_I 6 "aarch64_simd_reg_or_zero" "0, Dz, w")] UNSPEC_SEL))] "TARGET_SVE && !rtx_equal_p (operands[2], operands[6])" "@ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/cnot_2.c b/gcc/testsuite/gcc.target/aarch64/sve/cnot_2.c new file mode 100644 index 00000000000..fe778234424 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/cnot_2.c @@ -0,0 +1,29 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -ftree-vectorize" } */ + +#include <stdint.h> + +#define DEF_LOOP(TYPE1, TYPE2, COUNT) \ + void __attribute__ ((noipa)) \ + test_##TYPE1##_##TYPE2 (TYPE2 *restrict r, \ + TYPE1 *restrict pred, \ + TYPE2 *restrict a) \ + { \ + for (int i = 0; i < COUNT; ++i) \ + if (pred[i]) \ + r[i] = !a[i]; \ + } + +#define TEST_ALL(T) \ + T (int16_t, int8_t, 7) \ + T (int32_t, int8_t, 3) \ + T (int32_t, int16_t, 3) \ + T (int64_t, int8_t, 5) \ + T (int64_t, int16_t, 5) \ + T (int64_t, int32_t, 5) + +TEST_ALL (DEF_LOOP) + +/* { dg-final { scan-assembler-times {\tcnot\tz[0-9]+\.b, p[0-7]/m,} 3 } } */ +/* { dg-final { scan-assembler-times {\tcnot\tz[0-9]+\.h, p[0-7]/m,} 2 } } */ +/* { 
dg-final { scan-assembler-times {\tcnot\tz[0-9]+\.s, p[0-7]/m,} 1 } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/cond_cnot_4.c b/gcc/testsuite/gcc.target/aarch64/sve/cond_cnot_4.c new file mode 100644 index 00000000000..729d3f4f2ac --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/cond_cnot_4.c @@ -0,0 +1,32 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -ftree-vectorize" } */ + +#include <stdint.h> + +#define DEF_LOOP(TYPE1, TYPE2, COUNT) \ + void __attribute__ ((noipa)) \ + test_##TYPE1##_##TYPE2 (TYPE2 *__restrict r, \ + TYPE2 *__restrict a, \ + TYPE1 *__restrict pred) \ + { \ + for (int i = 0; i < COUNT; ++i) \ + r[i] = pred[i] ? !a[i] : a[i]; \ + } + +#define TEST_ALL(T) \ + T (int16_t, int8_t, 7) \ + T (int32_t, int8_t, 3) \ + T (int32_t, int16_t, 3) \ + T (int64_t, int8_t, 5) \ + T (int64_t, int16_t, 5) \ + T (int64_t, int32_t, 5) + +TEST_ALL (DEF_LOOP) + +/* { dg-final { scan-assembler-times {\tcnot\tz[0-9]+\.b, p[0-7]/m,} 3 } } */ +/* { dg-final { scan-assembler-times {\tcnot\tz[0-9]+\.h, p[0-7]/m,} 2 } } */ +/* { dg-final { scan-assembler-times {\tcnot\tz[0-9]+\.s, p[0-7]/m,} 1 } } */ + +/* { dg-final { scan-assembler-not {\tmov\tz} } } */ +/* { dg-final { scan-assembler-not {\tmovprfx\t} } } */ +/* { dg-final { scan-assembler-not {\tsel\t} } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/cond_cnot_4_run.c b/gcc/testsuite/gcc.target/aarch64/sve/cond_cnot_4_run.c new file mode 100644 index 00000000000..de9c0a502e9 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/cond_cnot_4_run.c @@ -0,0 +1,26 @@ +/* { dg-do run { target { aarch64_sve_hw } } } */ +/* { dg-options "-O2 -ftree-vectorize" } */ + +#include "cond_cnot_4.c" + +#define TEST_LOOP(TYPE1, TYPE2, N) \ + { \ + TYPE1 pred[N]; \ + TYPE2 r[N], a[N]; \ + for (int i = 0; i < N; ++i) \ + { \ + a[i] = i & 1 ? 
0 : 3 * (i + 1); \ + pred[i] = (i % 3 < 2); \ + asm volatile ("" ::: "memory"); \ + } \ + test_##TYPE1##_##TYPE2 (r, a, pred); \ + for (int i = 0; i < N; ++i) \ + if (r[i] != (TYPE2) (pred[i] ? !a[i] : a[i])) \ + __builtin_abort (); \ + } + +int main () +{ + TEST_ALL (TEST_LOOP) + return 0; +} diff --git a/gcc/testsuite/gcc.target/aarch64/sve/cond_cnot_5.c b/gcc/testsuite/gcc.target/aarch64/sve/cond_cnot_5.c new file mode 100644 index 00000000000..7318e108591 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/cond_cnot_5.c @@ -0,0 +1,32 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -ftree-vectorize" } */ + +#include <stdint.h> + +#define DEF_LOOP(TYPE1, TYPE2, COUNT) \ + void __attribute__ ((noipa)) \ + test_##TYPE1##_##TYPE2 (TYPE2 *__restrict r, \ + TYPE1 *__restrict a, \ + TYPE2 *__restrict b) \ + { \ + for (int i = 0; i < COUNT; ++i) \ + r[i] = a[i] == 0 ? !b[i] : a[i]; \ + } + +#define TEST_ALL(T) \ + T (int16_t, int8_t, 7) \ + T (int32_t, int8_t, 3) \ + T (int32_t, int16_t, 3) \ + T (int64_t, int8_t, 5) \ + T (int64_t, int16_t, 5) \ + T (int64_t, int32_t, 5) + +TEST_ALL (DEF_LOOP) + +/* { dg-final { scan-assembler-times {\tcnot\tz[0-9]+\.b, p[0-7]/m,} 3 } } */ +/* { dg-final { scan-assembler-times {\tcnot\tz[0-9]+\.h, p[0-7]/m,} 2 } } */ +/* { dg-final { scan-assembler-times {\tcnot\tz[0-9]+\.s, p[0-7]/m,} 1 } } */ + +/* { dg-final { scan-assembler-not {\tmov\tz} } } */ +/* { dg-final { scan-assembler-not {\tmovprfx\t} } } */ +/* { dg-final { scan-assembler-not {\tsel\t} } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/cond_cnot_5_run.c b/gcc/testsuite/gcc.target/aarch64/sve/cond_cnot_5_run.c new file mode 100644 index 00000000000..f8f277c32c2 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/cond_cnot_5_run.c @@ -0,0 +1,26 @@ +/* { dg-do run { target { aarch64_sve_hw } } } */ +/* { dg-options "-O2 -ftree-vectorize" } */ + +#include "cond_cnot_5.c" + +#define TEST_LOOP(TYPE1, TYPE2, N) \ + { \ + TYPE1 a[N]; \ + TYPE2 r[N], b[N]; \ + 
for (int i = 0; i < N; ++i) \ + { \ + a[i] = i % 3 < 2 ? 0 : i * 42; \ + b[i] = i & 1 ? 0 : 3 * (i + 1); \ + asm volatile ("" ::: "memory"); \ + } \ + test_##TYPE1##_##TYPE2 (r, a, b); \ + for (int i = 0; i < N; ++i) \ + if (r[i] != (TYPE2) (a[i] == 0 ? !b[i] : a[i])) \ + __builtin_abort (); \ + } + +int main () +{ + TEST_ALL (TEST_LOOP) + return 0; +} diff --git a/gcc/testsuite/gcc.target/aarch64/sve/cond_cnot_6.c b/gcc/testsuite/gcc.target/aarch64/sve/cond_cnot_6.c new file mode 100644 index 00000000000..d44e357f44a --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/cond_cnot_6.c @@ -0,0 +1,31 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -ftree-vectorize" } */ + +#include <stdint.h> + +#define DEF_LOOP(TYPE1, TYPE2, COUNT) \ + void __attribute__ ((noipa)) \ + test_##TYPE1##_##TYPE2 (TYPE2 *__restrict r, \ + TYPE1 *__restrict a, \ + TYPE2 *__restrict b) \ + { \ + for (int i = 0; i < COUNT; ++i) \ + r[i] = a[i] == 0 ? !b[i] : 127; \ + } + +#define TEST_ALL(T) \ + T (int16_t, int8_t, 7) \ + T (int32_t, int8_t, 3) \ + T (int32_t, int16_t, 3) \ + T (int64_t, int8_t, 5) \ + T (int64_t, int16_t, 5) \ + T (int64_t, int32_t, 5) + +TEST_ALL (DEF_LOOP) + +/* { dg-final { scan-assembler-times {\tcnot\tz[0-9]+\.b, p[0-7]/m,} 3 } } */ +/* { dg-final { scan-assembler-times {\tcnot\tz[0-9]+\.h, p[0-7]/m,} 2 } } */ +/* { dg-final { scan-assembler-times {\tcnot\tz[0-9]+\.s, p[0-7]/m,} 1 } } */ + +/* { dg-final { scan-assembler-not {\tmov\tz[^\n]*z} } } */ +/* { dg-final { scan-assembler-not {\tsel\t} } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/cond_cnot_6_run.c b/gcc/testsuite/gcc.target/aarch64/sve/cond_cnot_6_run.c new file mode 100644 index 00000000000..9e33616dc8f --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/cond_cnot_6_run.c @@ -0,0 +1,26 @@ +/* { dg-do run { target { aarch64_sve_hw } } } */ +/* { dg-options "-O2 -ftree-vectorize" } */ + +#include "cond_cnot_6.c" + +#define TEST_LOOP(TYPE1, TYPE2, N) \ + { \ + TYPE1 a[N]; \ + TYPE2 r[N], 
b[N]; \ + for (int i = 0; i < N; ++i) \ + { \ + a[i] = i % 3 < 2 ? 0 : i * 42; \ + b[i] = i & 1 ? 0 : 3 * (i + 1); \ + asm volatile ("" ::: "memory"); \ + } \ + test_##TYPE1##_##TYPE2 (r, a, b); \ + for (int i = 0; i < N; ++i) \ + if (r[i] != (TYPE2) (a[i] == 0 ? !b[i] : 127)) \ + __builtin_abort (); \ + } + +int main () +{ + TEST_ALL (TEST_LOOP) + return 0; +}