https://gcc.gnu.org/g:9f54332461238a365acb4e1ada5d0327e4c93644
commit r16-4560-g9f54332461238a365acb4e1ada5d0327e4c93644
Author: Tamar Christina <[email protected]>
Date:   Wed Oct 22 10:51:41 2025 +0100

    AArch64: Add support for boolean reductions for Adv. SIMD

    The vectorizer has learned how to do boolean reductions of masks to a
    C bool for the operations OR, XOR and AND.  This implements the new
    optabs for Adv. SIMD.

    Adv. SIMD can already vectorize such loops today, but does so through
    SHIFT-AND-INSERT, performing the reduction step-wise and in order.
    As an example, an OR reduction today does:

            movi    v3.4s, 0
            ext     v5.16b, v30.16b, v3.16b, #8
            orr     v5.16b, v5.16b, v30.16b
            ext     v29.16b, v5.16b, v3.16b, #4
            orr     v29.16b, v29.16b, v5.16b
            ext     v4.16b, v29.16b, v3.16b, #2
            orr     v4.16b, v4.16b, v29.16b
            ext     v3.16b, v4.16b, v3.16b, #1
            orr     v3.16b, v3.16b, v4.16b
            fmov    w1, s3
            and     w1, w1, 1

    When reducing to a boolean, however, we do not need the step-wise
    reduction and can just inspect the bit pattern of the mask.  For OR,
    for example, we now generate:

            umaxp   v3.4s, v3.4s, v3.4s
            fmov    x1, d3
            cmp     x1, 0
            cset    w0, ne

    For the remaining codegen see the test vect-reduc-bool-9.c.

    gcc/ChangeLog:

            * config/aarch64/aarch64-simd.md (reduc_sbool_and_scal_<mode>,
            reduc_sbool_ior_scal_<mode>, reduc_sbool_xor_scal_<mode>): New.
            * config/aarch64/iterators.md (VALLI): New.

    gcc/testsuite/ChangeLog:

            * gcc.target/aarch64/vect-reduc-bool-1.c: New test.
            * gcc.target/aarch64/vect-reduc-bool-2.c: New test.
            * gcc.target/aarch64/vect-reduc-bool-3.c: New test.
            * gcc.target/aarch64/vect-reduc-bool-4.c: New test.
            * gcc.target/aarch64/vect-reduc-bool-5.c: New test.
            * gcc.target/aarch64/vect-reduc-bool-6.c: New test.
            * gcc.target/aarch64/vect-reduc-bool-7.c: New test.
            * gcc.target/aarch64/vect-reduc-bool-8.c: New test.
            * gcc.target/aarch64/vect-reduc-bool-9.c: New test.

Diff:
---
 gcc/config/aarch64/aarch64-simd.md                 | 97 ++++++++++++++++++++++
 gcc/config/aarch64/iterators.md                    |  3 +
 .../gcc.target/aarch64/vect-reduc-bool-1.c         | 51 ++++++++++++
 .../gcc.target/aarch64/vect-reduc-bool-2.c         | 51 ++++++++++++
 .../gcc.target/aarch64/vect-reduc-bool-3.c         | 51 ++++++++++++
 .../gcc.target/aarch64/vect-reduc-bool-4.c         | 51 ++++++++++++
 .../gcc.target/aarch64/vect-reduc-bool-5.c         | 49 +++++++++++
 .../gcc.target/aarch64/vect-reduc-bool-6.c         | 49 +++++++++++
 .../gcc.target/aarch64/vect-reduc-bool-7.c         | 49 +++++++++++
 .../gcc.target/aarch64/vect-reduc-bool-8.c         | 49 +++++++++++
 .../gcc.target/aarch64/vect-reduc-bool-9.c         | 63 ++++++++++++++
 11 files changed, 563 insertions(+)
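For reference, the shape of loop being lowered (adapted from the new
tests below) reduces comparison results into a C bool; the AND and XOR
variants are analogous:

    #include <stdbool.h>

    char p[128];

    /* OR reduction: r is true if any element of p[0..n-1] is non-zero.  */
    bool
    fior (int n)
    {
      bool r = false;
      for (int i = 0; i < n; ++i)
        r |= (p[i] != 0);
      return r;
    }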
diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
index eaa8d57cc413..648a42f7d0f7 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -3469,6 +3469,103 @@
   DONE;
 })
 
+;; AND tree reductions.
+;; Check that after a MIN pairwise reduction every lane is all-ones.
+;;
+;;   uminp  v1.4s, v1.4s, v1.4s
+;;   fmov   x1, d1
+;;   cmn    x1, #1
+;;   cset   w0, eq
+;;
+(define_expand "reduc_sbool_and_scal_<mode>"
+  [(set (match_operand:QI 0 "register_operand")
+	(unspec:QI [(match_operand:VALLI 1 "register_operand")]
+		   UNSPEC_ANDV))]
+  "TARGET_SIMD"
+{
+  rtx tmp = operands[1];
+  /* 128-bit vectors need to be compressed to 64 bits first.  */
+  if (known_eq (128, GET_MODE_BITSIZE (<MODE>mode)))
+    {
+      /* Always reduce using a V4SI.  */
+      rtx reduc = gen_lowpart (V4SImode, tmp);
+      rtx res = gen_reg_rtx (V4SImode);
+      emit_insn (gen_aarch64_uminpv4si (res, reduc, reduc));
+      emit_move_insn (tmp, gen_lowpart (<MODE>mode, res));
+    }
+  rtx val = gen_reg_rtx (DImode);
+  emit_move_insn (val, gen_lowpart (DImode, tmp));
+  rtx cc_reg = aarch64_gen_compare_reg (EQ, val, constm1_rtx);
+  rtx cmp = gen_rtx_fmt_ee (EQ, SImode, cc_reg, const0_rtx);
+  rtx tmp2 = gen_reg_rtx (SImode);
+  emit_insn (gen_aarch64_cstoresi (tmp2, cmp, cc_reg));
+  emit_move_insn (operands[0], gen_lowpart (QImode, tmp2));
+  DONE;
+})
+
+;; IOR tree reductions.
+;; Check that after a MAX pairwise reduction at least one lane is non-zero.
+;;
+;;   umaxp  v1.4s, v1.4s, v1.4s
+;;   fmov   x1, d1
+;;   cmp    x1, 0
+;;   cset   w0, ne
+;;
+(define_expand "reduc_sbool_ior_scal_<mode>"
+  [(set (match_operand:QI 0 "register_operand")
+	(unspec:QI [(match_operand:VALLI 1 "register_operand")]
+		   UNSPEC_IORV))]
+  "TARGET_SIMD"
+{
+  rtx tmp = operands[1];
+  /* 128-bit vectors need to be compressed to 64 bits first.  */
+  if (known_eq (128, GET_MODE_BITSIZE (<MODE>mode)))
+    {
+      /* Always reduce using a V4SI.  */
+      rtx reduc = gen_lowpart (V4SImode, tmp);
+      rtx res = gen_reg_rtx (V4SImode);
+      emit_insn (gen_aarch64_umaxpv4si (res, reduc, reduc));
+      emit_move_insn (tmp, gen_lowpart (<MODE>mode, res));
+    }
+  rtx val = gen_reg_rtx (DImode);
+  emit_move_insn (val, gen_lowpart (DImode, tmp));
+  rtx cc_reg = aarch64_gen_compare_reg (NE, val, const0_rtx);
+  rtx cmp = gen_rtx_fmt_ee (NE, SImode, cc_reg, const0_rtx);
+  rtx tmp2 = gen_reg_rtx (SImode);
+  emit_insn (gen_aarch64_cstoresi (tmp2, cmp, cc_reg));
+  emit_move_insn (operands[0], gen_lowpart (QImode, tmp2));
+  DONE;
+})
+
+;; XOR tree reductions.
+;; The result is 1 iff the number of active lanes is odd.  Mask each lane
+;; with 0x1 and use a normal plus reduction; bit 0 of the sum is the parity.
+;;
+;;   movi   v1.16b, 0x1
+;;   and    v2.16b, v2.16b, v1.16b
+;;   addv   b3, v2.16b
+;;   fmov   w1, s3
+;;   and    w0, w1, 1
+;;
+(define_expand "reduc_sbool_xor_scal_<mode>"
+  [(set (match_operand:QI 0 "register_operand")
+	(unspec:QI [(match_operand:VALLI 1 "register_operand")]
+		   UNSPEC_XORV))]
+  "TARGET_SIMD"
+{
+  rtx tmp = gen_reg_rtx (<MODE>mode);
+  rtx one_reg = force_reg (<MODE>mode, CONST1_RTX (<MODE>mode));
+  emit_move_insn (tmp, gen_rtx_AND (<MODE>mode, operands[1], one_reg));
+  rtx tmp2 = gen_reg_rtx (<VEL>mode);
+  emit_insn (gen_reduc_plus_scal_<mode> (tmp2, tmp));
+  rtx tmp3 = gen_reg_rtx (DImode);
+  emit_move_insn (tmp3, gen_rtx_AND (DImode,
+				     lowpart_subreg (DImode, tmp2, <VEL>mode),
+				     const1_rtx));
+  emit_move_insn (operands[0], gen_lowpart (QImode, tmp3));
+  DONE;
+})
+
 ;; SADDLV and UADDLV can be expressed as an ADDV instruction that first
 ;; sign or zero-extends its elements.
 (define_insn "aarch64_<su>addlv<mode>"
diff --git a/gcc/config/aarch64/iterators.md b/gcc/config/aarch64/iterators.md
index 3757998c0ea9..517b2808b5f7 100644
--- a/gcc/config/aarch64/iterators.md
+++ b/gcc/config/aarch64/iterators.md
@@ -217,6 +217,9 @@
 ;; All Advanced SIMD modes on which we support any arithmetic operations.
 (define_mode_iterator VALL [V8QI V16QI V4HI V8HI V2SI V4SI V2DI V2SF V4SF V2DF])
 
+;; All Advanced SIMD integer modes.
+(define_mode_iterator VALLI [VDQ_BHSI V2DI])
+
 ;; All Advanced SIMD modes suitable for moving, loading, and storing.
 (define_mode_iterator VALL_F16 [V8QI V16QI V4HI V8HI V2SI V4SI V2DI
				 V4HF V8HF V4BF V8BF V2SF V4SF V2DF])
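These lowerings rely on vector comparisons producing lanes that are
either all-zeros or all-ones.  A scalar sketch of the three tricks,
assuming 8-bit lanes viewed as one 64-bit word (illustrative only; the
function names are invented here and this is not the code the expanders
emit):

    #include <stdbool.h>
    #include <stdint.h>

    /* Each lane of 'mask' is 0x00 (false) or 0xff (true), as produced
       by a vector comparison, viewed here as one 64-bit word.  */

    /* AND: every lane is all-ones iff the whole word is all-ones
       (cmn x1, #1; cset w0, eq).  */
    bool
    and_reduce (uint64_t mask)
    {
      return mask == UINT64_MAX;
    }

    /* IOR: some lane is true iff the word is non-zero
       (cmp x1, 0; cset w0, ne).  */
    bool
    ior_reduce (uint64_t mask)
    {
      return mask != 0;
    }

    /* XOR: masking each lane with 1 and summing counts the true lanes;
       bit 0 of the count is the parity (movi/and/addv, then
       and w0, w1, 1).  */
    bool
    xor_reduce (const uint8_t lanes[8])
    {
      unsigned sum = 0;
      for (int i = 0; i < 8; ++i)
        sum += lanes[i] & 1;
      return sum & 1;
    }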
diff --git a/gcc/testsuite/gcc.target/aarch64/vect-reduc-bool-1.c b/gcc/testsuite/gcc.target/aarch64/vect-reduc-bool-1.c
new file mode 100644
index 000000000000..c9b1c85c222e
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/vect-reduc-bool-1.c
@@ -0,0 +1,51 @@
+/* { dg-do run } */
+/* { dg-options "-O3 -march=armv8-a -mautovec-preference=asimd-only -fdump-tree-vect-details" } */
+
+char p[128];
+
+bool __attribute__((noipa))
+fand (int n)
+{
+  bool r = true;
+  for (int i = 0; i < n; ++i)
+    r &= (p[i] != 0);
+  return r;
+}
+
+bool __attribute__((noipa))
+fior (int n)
+{
+  bool r = false;
+  for (int i = 0; i < n; ++i)
+    r |= (p[i] != 0);
+  return r;
+}
+
+int main()
+{
+  __builtin_memset (p, 1, sizeof(p));
+
+  for (int n = 0; n < 77; ++n)
+    if (!fand (n))
+      __builtin_abort ();
+
+  p[0] = 0;
+  for (int n = 1; n < 77; ++n)
+    if (fand (n))
+      __builtin_abort ();
+
+  __builtin_memset (p, 0, sizeof(p));
+
+  for (int n = 0; n < 77; ++n)
+    if (fior (n))
+      __builtin_abort ();
+
+  p[0] = 1;
+  for (int n = 1; n < 77; ++n)
+    if (!fior (n))
+      __builtin_abort ();
+
+  return 0;
+}
+
+/* { dg-final { scan-tree-dump-times "optimized: loop vectorized" 2 "vect" } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/vect-reduc-bool-2.c b/gcc/testsuite/gcc.target/aarch64/vect-reduc-bool-2.c
new file mode 100644
index 000000000000..598d6c71ec84
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/vect-reduc-bool-2.c
@@ -0,0 +1,51 @@
+/* { dg-do run } */
+/* { dg-options "-O3 -march=armv8-a -mautovec-preference=asimd-only -fdump-tree-vect-details" } */
+
+short p[128];
+
+bool __attribute__((noipa))
+fand (int n)
+{
+  bool r = true;
+  for (int i = 0; i < n; ++i)
+    r &= (p[i] != 0);
+  return r;
+}
+
+bool __attribute__((noipa))
+fior (int n)
+{
+  bool r = false;
+  for (int i = 0; i < n; ++i)
+    r |= (p[i] != 0);
+  return r;
+}
+
+int main()
+{
+  __builtin_memset (p, 1, sizeof(p));
+
+  for (int n = 0; n < 77; ++n)
+    if (!fand (n))
+      __builtin_abort ();
+
+  p[0] = 0;
+  for (int n = 1; n < 77; ++n)
+    if (fand (n))
+      __builtin_abort ();
+
+  __builtin_memset (p, 0, sizeof(p));
+
+  for (int n = 0; n < 77; ++n)
+    if (fior (n))
+      __builtin_abort ();
+
+  p[0] = 1;
+  for (int n = 1; n < 77; ++n)
+    if (!fior (n))
+      __builtin_abort ();
+
+  return 0;
+}
+
+/* { dg-final { scan-tree-dump-times "optimized: loop vectorized" 2 "vect" } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/vect-reduc-bool-3.c b/gcc/testsuite/gcc.target/aarch64/vect-reduc-bool-3.c
new file mode 100644
index 000000000000..9517965753a7
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/vect-reduc-bool-3.c
@@ -0,0 +1,51 @@
+/* { dg-do run } */
+/* { dg-options "-O3 -march=armv8-a -mautovec-preference=asimd-only -fdump-tree-vect-details" } */
+
+int p[128];
+
+bool __attribute__((noipa))
+fand (int n)
+{
+  bool r = true;
+  for (int i = 0; i < n; ++i)
+    r &= (p[i] != 0);
+  return r;
+}
+
+bool __attribute__((noipa))
+fior (int n)
+{
+  bool r = false;
+  for (int i = 0; i < n; ++i)
+    r |= (p[i] != 0);
+  return r;
+}
+
+int main()
+{
+  __builtin_memset (p, 1, sizeof(p));
+
+  for (int n = 0; n < 77; ++n)
+    if (!fand (n))
+      __builtin_abort ();
+
+  p[0] = 0;
+  for (int n = 1; n < 77; ++n)
+    if (fand (n))
+      __builtin_abort ();
+
+  __builtin_memset (p, 0, sizeof(p));
+
+  for (int n = 0; n < 77; ++n)
+    if (fior (n))
+      __builtin_abort ();
+
+  p[0] = 1;
+  for (int n = 1; n < 77; ++n)
+    if (!fior (n))
+      __builtin_abort ();
+
+  return 0;
+}
+
+/* { dg-final { scan-tree-dump-times "optimized: loop vectorized" 2 "vect" } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/vect-reduc-bool-4.c b/gcc/testsuite/gcc.target/aarch64/vect-reduc-bool-4.c
new file mode 100644
index 000000000000..3cd577f5ed59
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/vect-reduc-bool-4.c
@@ -0,0 +1,51 @@
+/* { dg-do run } */
+/* { dg-options "-O3 -march=armv8-a -mautovec-preference=asimd-only -fdump-tree-vect-details" } */
+
+long long p[128];
+
+bool __attribute__((noipa))
+fand (int n)
+{
+  bool r = true;
+  for (int i = 0; i < n; ++i)
+    r &= (p[i] != 0);
+  return r;
+}
+
+bool __attribute__((noipa))
+fior (int n)
+{
+  bool r = false;
+  for (int i = 0; i < n; ++i)
+    r |= (p[i] != 0);
+  return r;
+}
+
+int main()
+{
+  __builtin_memset (p, 1, sizeof(p));
+
+  for (int n = 0; n < 77; ++n)
+    if (!fand (n))
+      __builtin_abort ();
+
+  p[0] = 0;
+  for (int n = 1; n < 77; ++n)
+    if (fand (n))
+      __builtin_abort ();
+
+  __builtin_memset (p, 0, sizeof(p));
+
+  for (int n = 0; n < 77; ++n)
+    if (fior (n))
+      __builtin_abort ();
+
+  p[0] = 1;
+  for (int n = 1; n < 77; ++n)
+    if (!fior (n))
+      __builtin_abort ();
+
+  return 0;
+}
+
+/* { dg-final { scan-tree-dump-times "optimized: loop vectorized" 2 "vect" } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/vect-reduc-bool-5.c b/gcc/testsuite/gcc.target/aarch64/vect-reduc-bool-5.c
new file mode 100644
index 000000000000..c6fa63b7657e
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/vect-reduc-bool-5.c
@@ -0,0 +1,49 @@
+/* { dg-do run } */
+/* { dg-options "-O3 -march=armv8-a -mautovec-preference=asimd-only -fdump-tree-vect-details" } */
+
+char p[128];
+
+bool __attribute__((noipa))
+fxort (int n)
+{
+  bool r = true;
+  for (int i = 0; i < n; ++i)
+    r ^= (p[i] != 0);
+  return r;
+}
+
+bool __attribute__((noipa))
+fxorf (int n)
+{
+  bool r = false;
+  for (int i = 0; i < n; ++i)
+    r ^= (p[i] != 0);
+  return r;
+}
+
+int main()
+{
+  __builtin_memset (p, 1, sizeof(p));
+
+  for (int n = 0; n < 77; ++n)
+    if (fxort (n) != !(n & 1))
+      __builtin_abort ();
+
+  for (int n = 0; n < 77; ++n)
+    if (fxorf (n) != (n & 1))
+      __builtin_abort ();
+
+  __builtin_memset (p, 0, sizeof(p));
+
+  for (int n = 0; n < 77; ++n)
+    if (!fxort (n))
+      __builtin_abort ();
+
+  for (int n = 0; n < 77; ++n)
+    if (fxorf (n))
+      __builtin_abort ();
+
+  return 0;
+}
+
+/* { dg-final { scan-tree-dump-times "optimized: loop vectorized" 2 "vect" } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/vect-reduc-bool-6.c b/gcc/testsuite/gcc.target/aarch64/vect-reduc-bool-6.c
new file mode 100644
index 000000000000..6d12e6a7cb4f
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/vect-reduc-bool-6.c
@@ -0,0 +1,49 @@
+/* { dg-do run } */
+/* { dg-options "-O3 -march=armv8-a -mautovec-preference=asimd-only -fdump-tree-vect-details" } */
+
+short p[128];
+
+bool __attribute__((noipa))
+fxort (int n)
+{
+  bool r = true;
+  for (int i = 0; i < n; ++i)
+    r ^= (p[i] != 0);
+  return r;
+}
+
+bool __attribute__((noipa))
+fxorf (int n)
+{
+  bool r = false;
+  for (int i = 0; i < n; ++i)
+    r ^= (p[i] != 0);
+  return r;
+}
+
+int main()
+{
+  __builtin_memset (p, 1, sizeof(p));
+
+  for (int n = 0; n < 77; ++n)
+    if (fxort (n) != !(n & 1))
+      __builtin_abort ();
+
+  for (int n = 0; n < 77; ++n)
+    if (fxorf (n) != (n & 1))
+      __builtin_abort ();
+
+  __builtin_memset (p, 0, sizeof(p));
+
+  for (int n = 0; n < 77; ++n)
+    if (!fxort (n))
+      __builtin_abort ();
+
+  for (int n = 0; n < 77; ++n)
+    if (fxorf (n))
+      __builtin_abort ();
+
+  return 0;
+}
+
+/* { dg-final { scan-tree-dump-times "optimized: loop vectorized" 2 "vect" } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/vect-reduc-bool-7.c b/gcc/testsuite/gcc.target/aarch64/vect-reduc-bool-7.c
new file mode 100644
index 000000000000..58d6a785f9a0
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/vect-reduc-bool-7.c
@@ -0,0 +1,49 @@
+/* { dg-do run } */
+/* { dg-options "-O3 -march=armv8-a -mautovec-preference=asimd-only -fdump-tree-vect-details" } */
+
+int p[128];
+
+bool __attribute__((noipa))
+fxort (int n)
+{
+  bool r = true;
+  for (int i = 0; i < n; ++i)
+    r ^= (p[i] != 0);
+  return r;
+}
+
+bool __attribute__((noipa))
+fxorf (int n)
+{
+  bool r = false;
+  for (int i = 0; i < n; ++i)
+    r ^= (p[i] != 0);
+  return r;
+}
+
+int main()
+{
+  __builtin_memset (p, 1, sizeof(p));
+
+  for (int n = 0; n < 77; ++n)
+    if (fxort (n) != !(n & 1))
+      __builtin_abort ();
+
+  for (int n = 0; n < 77; ++n)
+    if (fxorf (n) != (n & 1))
+      __builtin_abort ();
+
+  __builtin_memset (p, 0, sizeof(p));
+
+  for (int n = 0; n < 77; ++n)
+    if (!fxort (n))
+      __builtin_abort ();
+
+  for (int n = 0; n < 77; ++n)
+    if (fxorf (n))
+      __builtin_abort ();
+
+  return 0;
+}
+
+/* { dg-final { scan-tree-dump-times "optimized: loop vectorized" 2 "vect" } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/vect-reduc-bool-8.c b/gcc/testsuite/gcc.target/aarch64/vect-reduc-bool-8.c
new file mode 100644
index 000000000000..18ad94a4bd7f
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/vect-reduc-bool-8.c
@@ -0,0 +1,49 @@
+/* { dg-do run } */
+/* { dg-options "-O3 -march=armv8-a -mautovec-preference=asimd-only -fdump-tree-vect-details" } */
+
+long long p[128];
+
+bool __attribute__((noipa))
+fxort (int n)
+{
+  bool r = true;
+  for (int i = 0; i < n; ++i)
+    r ^= (p[i] != 0);
+  return r;
+}
+
+bool __attribute__((noipa))
+fxorf (int n)
+{
+  bool r = false;
+  for (int i = 0; i < n; ++i)
+    r ^= (p[i] != 0);
+  return r;
+}
+
+int main()
+{
+  __builtin_memset (p, 1, sizeof(p));
+
+  for (int n = 0; n < 77; ++n)
+    if (fxort (n) != !(n & 1))
+      __builtin_abort ();
+
+  for (int n = 0; n < 77; ++n)
+    if (fxorf (n) != (n & 1))
+      __builtin_abort ();
+
+  __builtin_memset (p, 0, sizeof(p));
+
+  for (int n = 0; n < 77; ++n)
+    if (!fxort (n))
+      __builtin_abort ();
+
+  for (int n = 0; n < 77; ++n)
+    if (fxorf (n))
+      __builtin_abort ();
+
+  return 0;
+}
+
+/* { dg-final { scan-tree-dump-times "optimized: loop vectorized" 2 "vect" { target { vect_int && vect_condition } } } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/vect-reduc-bool-9.c b/gcc/testsuite/gcc.target/aarch64/vect-reduc-bool-9.c
new file mode 100644
index 000000000000..7d9a82f5fc3a
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/vect-reduc-bool-9.c
@@ -0,0 +1,63 @@
+/* { dg-do compile } */
+/* { dg-options "-O3 -march=armv8-a -mautovec-preference=asimd-only -fno-schedule-insns -fno-reorder-blocks -fno-schedule-insns2 -fdump-tree-vect-details" } */
+/* { dg-final { check-function-bodies "**" "" } } */
+
+char p[128];
+
+/*
+** fand:
+**	...
+**	uminp	v[0-9]+.4s, v[0-9]+.4s, v[0-9]+.4s
+**	fmov	x[0-9]+, d[0-9]+
+**	cmn	x[0-9]+, #1
+**	cset	w[0-9]+, eq
+**	...
+*/
+bool __attribute__((noipa))
+fand (int n)
+{
+  bool r = true;
+  for (int i = 0; i < n; ++i)
+    r &= (p[i] != 0);
+  return r;
+}
+
+/*
+** fior:
+**	...
+**	umaxp	v[0-9]+.4s, v[0-9]+.4s, v[0-9]+.4s
+**	fmov	x[0-9]+, d[0-9]+
+**	cmp	x[0-9]+, 0
+**	cset	w[0-9]+, ne
+**	...
+*/
+bool __attribute__((noipa))
+fior (int n)
+{
+  bool r = false;
+  for (int i = 0; i < n; ++i)
+    r |= (p[i] != 0);
+  return r;
+}
+
+/*
+** fxor:
+**	...
+**	movi	v[0-9]+.16b, 0x1
+**	and	v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b
+**	addv	b[0-9]+, v[0-9]+.16b
+**	fmov	w[0-9]+, s[0-9]+
+**	and	w[0-9]+, w[0-9]+, 1
+**	...
+*/
+bool __attribute__((noipa))
+fxor (int n)
+{
+  bool r = false;
+  for (int i = 0; i < n; ++i)
+    r ^= (p[i] != 0);
+  return r;
+}
+
+/* { dg-final { scan-tree-dump-times "optimized: loop vectorized" 3 "vect" } } */
