The BB vectorizer relies on backend support for .REDUC_{PLUS,IOR,XOR,AND} to vectorize reductions.
Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}. Ready to push to trunk. gcc/ChangeLog: PR target/112325 * config/i386/sse.md (reduc_<code>_scal_<mode>): New expander. (REDUC_ANY_LOGIC_MODE): New iterator. (REDUC_PLUS_MODE): Extend to VxHI/SI/DImode. (REDUC_SSE_PLUS_MODE): Ditto. gcc/testsuite/ChangeLog: * gcc.target/i386/pr112325-1.c: New test. * gcc.target/i386/pr112325-2.c: New test. --- gcc/config/i386/sse.md | 48 ++++++++- gcc/testsuite/gcc.target/i386/pr112325-1.c | 116 +++++++++++++++++++++ gcc/testsuite/gcc.target/i386/pr112325-2.c | 38 +++++++ 3 files changed, 199 insertions(+), 3 deletions(-) create mode 100644 gcc/testsuite/gcc.target/i386/pr112325-1.c create mode 100644 gcc/testsuite/gcc.target/i386/pr112325-2.c diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md index d250a6cb802..f94a77d0b6d 100644 --- a/gcc/config/i386/sse.md +++ b/gcc/config/i386/sse.md @@ -3417,7 +3417,9 @@ (define_insn "sse3_h<insn>v4sf3" (define_mode_iterator REDUC_SSE_PLUS_MODE [(V2DF "TARGET_SSE") (V4SF "TARGET_SSE") - (V8HF "TARGET_AVX512FP16 && TARGET_AVX512VL")]) + (V8HF "TARGET_AVX512FP16 && TARGET_AVX512VL") + (V8HI "TARGET_SSE2") (V4SI "TARGET_SSE2") + (V2DI "TARGET_SSE2")]) (define_expand "reduc_plus_scal_<mode>" [(plus:REDUC_SSE_PLUS_MODE @@ -3458,8 +3460,12 @@ (define_mode_iterator REDUC_PLUS_MODE (V8DF "TARGET_AVX512F && TARGET_EVEX512") (V16SF "TARGET_AVX512F && TARGET_EVEX512") (V32HF "TARGET_AVX512FP16 && TARGET_AVX512VL && TARGET_EVEX512") - (V32QI "TARGET_AVX") - (V64QI "TARGET_AVX512F && TARGET_EVEX512")]) + (V32QI "TARGET_AVX") (V16HI "TARGET_AVX") + (V8SI "TARGET_AVX") (V4DI "TARGET_AVX") + (V64QI "TARGET_AVX512F && TARGET_EVEX512") + (V32HI "TARGET_AVX512F && TARGET_EVEX512") + (V16SI "TARGET_AVX512F && TARGET_EVEX512") + (V8DI "TARGET_AVX512F && TARGET_EVEX512")]) (define_expand "reduc_plus_scal_<mode>" [(plus:REDUC_PLUS_MODE @@ -3597,6 +3603,42 @@ (define_insn "reduces<mode><mask_scalar_name><round_saeonly_scalar_name>" (set_attr "prefix"
"evex") (set_attr "mode" "<MODE>")]) +(define_expand "reduc_<code>_scal_<mode>" + [(any_logic:VI_128 + (match_operand:<ssescalarmode> 0 "register_operand") + (match_operand:VI_128 1 "register_operand"))] + "TARGET_SSE2" +{ + rtx tmp = gen_reg_rtx (<MODE>mode); + ix86_expand_reduc (gen_<code><mode>3, tmp, operands[1]); + emit_insn (gen_vec_extract<mode><ssescalarmodelower> (operands[0], + tmp, const0_rtx)); + DONE; +}) + +(define_mode_iterator REDUC_ANY_LOGIC_MODE + [(V32QI "TARGET_AVX") (V16HI "TARGET_AVX") + (V8SI "TARGET_AVX") (V4DI "TARGET_AVX") + (V64QI "TARGET_AVX512F && TARGET_EVEX512") + (V32HI "TARGET_AVX512F && TARGET_EVEX512") + (V16SI "TARGET_AVX512F && TARGET_EVEX512") + (V8DI "TARGET_AVX512F && TARGET_EVEX512")]) + +(define_expand "reduc_<code>_scal_<mode>" + [(any_logic:REDUC_ANY_LOGIC_MODE + (match_operand:<ssescalarmode> 0 "register_operand") + (match_operand:REDUC_ANY_LOGIC_MODE 1 "register_operand"))] + "" +{ + rtx tmp = gen_reg_rtx (<ssehalfvecmode>mode); + emit_insn (gen_vec_extract_hi_<mode> (tmp, operands[1])); + rtx tmp2 = gen_reg_rtx (<ssehalfvecmode>mode); + rtx tmp3 = gen_lowpart (<ssehalfvecmode>mode, operands[1]); + emit_insn (gen_<code><ssehalfvecmodelower>3 (tmp2, tmp, tmp3)); + emit_insn (gen_reduc_<code>_scal_<ssehalfvecmodelower> (operands[0], tmp2)); + DONE; +}) + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; ;; Parallel floating point comparisons diff --git a/gcc/testsuite/gcc.target/i386/pr112325-1.c b/gcc/testsuite/gcc.target/i386/pr112325-1.c new file mode 100644 index 00000000000..56e20c156f1 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pr112325-1.c @@ -0,0 +1,116 @@ +/* { dg-do compile } */ +/* { dg-options "-mavx512vl -mavx512bw -O2 -mtune=generic -mprefer-vector-width=512 -fdump-tree-slp2" } */ +/* { dg-final { scan-tree-dump-times ".REDUC_PLUS" 3 "slp2" } } */ +/* { dg-final { scan-tree-dump-times ".REDUC_IOR" 4 "slp2" } } */ + +int +__attribute__((noipa)) +plus_v4si (int* a) +{ + int sum = 
0; + sum += a[0]; + sum += a[1]; + sum += a[2]; + sum += a[3]; + return sum; +} + +short +__attribute__((noipa)) +plus_v8hi (short* a) +{ + short sum = 0; + sum += a[0]; + sum += a[1]; + sum += a[2]; + sum += a[3]; + sum += a[4]; + sum += a[5]; + sum += a[6]; + sum += a[7]; + return sum; +} + +long long +__attribute__((noipa)) +plus_v8di (long long* a) +{ + long long sum = 0; + sum += a[0]; + sum += a[1]; + sum += a[2]; + sum += a[3]; + sum += a[4]; + sum += a[5]; + sum += a[6]; + sum += a[7]; + return sum; +} + +int +__attribute__((noipa)) +ior_v4si (int* a) +{ + int sum = 0; + sum |= a[0]; + sum |= a[1]; + sum |= a[2]; + sum |= a[3]; + return sum; +} + +short +__attribute__((noipa)) +ior_v8hi (short* a) +{ + short sum = 0; + sum |= a[0]; + sum |= a[1]; + sum |= a[2]; + sum |= a[3]; + sum |= a[4]; + sum |= a[5]; + sum |= a[6]; + sum |= a[7]; + return sum; +} + +long long +__attribute__((noipa)) +ior_v8di (long long* a) +{ + long long sum = 0; + sum |= a[0]; + sum |= a[1]; + sum |= a[2]; + sum |= a[3]; + sum |= a[4]; + sum |= a[5]; + sum |= a[6]; + sum |= a[7]; + return sum; +} + +char +__attribute__((noipa)) +ior_v16qi (char* a) +{ + char sum = 0; + sum |= a[0]; + sum |= a[1]; + sum |= a[2]; + sum |= a[3]; + sum |= a[4]; + sum |= a[5]; + sum |= a[6]; + sum |= a[7]; + sum |= a[8]; + sum |= a[9]; + sum |= a[10]; + sum |= a[11]; + sum |= a[12]; + sum |= a[13]; + sum |= a[14]; + sum |= a[15]; + return sum; +} diff --git a/gcc/testsuite/gcc.target/i386/pr112325-2.c b/gcc/testsuite/gcc.target/i386/pr112325-2.c new file mode 100644 index 00000000000..650006b0bd9 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pr112325-2.c @@ -0,0 +1,38 @@ +/* { dg-do run } */ +/* { dg-options "-O2 -msse2" } */ +/* { dg-require-effective-target sse2 } */ + +#include "sse2-check.h" +#include "pr112325-1.c" + +static void +sse2_test (void) +{ + int d[4] = { 3, 11, 22, 89}; + short w[8] = { 3, 11, 22, 89, 4, 9, 13, 7}; + char b[16] = { 3, 11, 22, 89, 4, 9, 13, 7, 2, 6, 5, 111, 163, 88, 11, 
235}; + long long q[8] = { 3, 11, 22, 89, 4, 9, 13, 7}; + + /* if (plus_v4si (d) != 125) */ + /* __builtin_abort (); */ + + /* if (plus_v8hi (w) != 158) */ + /* __builtin_abort (); */ + + /* if (plus_v8di (q) != 158) */ + /* __builtin_abort (); */ + + /* if (ior_v4si (d) != 95) */ + /* __builtin_abort (); */ + + /* if (ior_v8hi (w) != 95) */ + /* __builtin_abort (); */ + + /* if (ior_v16qi (b) != (char)255) */ + /* __builtin_abort (); */ + + if (ior_v8di (q) != 95) + __builtin_abort (); + + return; +} -- 2.31.1