On Mon, Nov 30, 2020 at 9:46 PM Jakub Jelinek <ja...@redhat.com> wrote: > > On Mon, Nov 30, 2020 at 09:11:10PM +0800, Hongtao Liu wrote: > > +;; PR96906 - optimize vpsubusw compared to 0 into vpcmpleuw or vpcmpnltuw. > > +(define_split > > + [(set (match_operand:<avx512fmaskmode> 0 "register_operand") > > + (unspec:<avx512fmaskmode> > > + [(us_minus:VI12_AVX512VL > > + (match_operand:VI12_AVX512VL 1 "vector_operand") > > + (match_operand:VI12_AVX512VL 2 "vector_operand")) > > + (match_operand:VI12_AVX512VL 3 "const0_operand") > > + (match_operand:SI 4 "const0_operand")] > > + UNSPEC_PCMP))] > > + "TARGET_AVX512BW && ix86_binary_operator_ok (US_MINUS, <MODE>mode, > > operands)" > > Too long line, please wrap it. > Also, INTVAL (operands[4]) == 0 is EQ comparison, can't we handle also > NE (i.e. INTVAL (operands[4]) == 4)? > I.e. replace the "const0_operand" in there with "const_0_to_7_operand" > and check in conditions that (INTVAL (operands[4]) & 3) == 0. > > > + [(const_int 0)] > > + { > > + /* LE: 2, NLT: 5. */ > > + rtx cmp_predicate = GEN_INT (2); > > + if (MEM_P (operands[1])) > > + { > > + std::swap (operands[1], operands[2]); > > + cmp_predicate = GEN_INT (5); > > For INTVAL (operands[4]) == 4 it would then be cmp_predicate NLE: 4 resp. > LT: 3 I think. > > Also, this handles only UNSPEC_PCMP, can't we handle UNSPEC_UNSIGNED_PCMP > too? I mean, for equality comparisons it doesn't really matter if we have > signed or unsigned == or !=. And for unsigned > x == 0U is equivalent to x <= 0U, and x != 0U equivalent to x > 0U. > > Jakub >
Yes, Update patch. +(define_int_iterator UNSPEC_PCMP_ITER + [UNSPEC_PCMP UNSPEC_UNSIGNED_PCMP]) + +(define_int_attr pcmp_signed_mask + [(UNSPEC_PCMP "3") (UNSPEC_UNSIGNED_PCMP "1")]) + +;; PR96906 - optimize vpsubusw compared to 0 into vpcmpleuw or vpcmpnltuw. +;; For signed comparison, handle EQ 0: NEQ 4, +;; for unsigned comparison extra handle LE:2, NLE:6, equivalent to EQ and NEQ. + +(define_split + [(set (match_operand:<avx512fmaskmode> 0 "register_operand") + (unspec:<avx512fmaskmode> + [(us_minus:VI12_AVX512VL + (match_operand:VI12_AVX512VL 1 "vector_operand") + (match_operand:VI12_AVX512VL 2 "vector_operand")) + (match_operand:VI12_AVX512VL 3 "const0_operand") + (match_operand:SI 4 "const_0_to_7_operand")] + UNSPEC_PCMP_ITER))] + "TARGET_AVX512BW + && ix86_binary_operator_ok (US_MINUS, <MODE>mode, operands) + && (INTVAL (operands[4]) & <pcmp_signed_mask>) == 0" + [(const_int 0)] + { + bool neq_p = INTVAL (operands[4]) >> 2; + /* LE: 2, NLT: 5, NLE: 6, LT: 1 */ + rtx cmp_predicate = neq_p ? GEN_INT (6) : GEN_INT (2); + if (MEM_P (operands[1])) + { + std::swap (operands[1], operands[2]); + cmp_predicate = neq_p ? GEN_INT (1) : GEN_INT (5); + } + emit_insn (gen_<avx512>_ucmp<mode>3 (operands[0], operands[1], + operands[2], cmp_predicate)); + DONE; + }) + -- BR, Hongtao
From e3eb61066ee665325cba8e231b991f9a1dda07df Mon Sep 17 00:00:00 2001 From: liuhongt <hongtao....@intel.com> Date: Mon, 30 Nov 2020 13:27:16 +0800 Subject: [PATCH] Optimize vpsubusw compared to 0 into vpcmpleuw or vpcmpnleuw [PR96906] For signed comparisons, it handles cases that are eq or neq to 0. For unsigned comparisons, it additionally handles cases that are le or gt to 0 (equivalent to eq or neq to 0). Transform case eq to leu, case neq to gtu. i.e. for -mavx512bw -mavx512vl transform eq case code from vpsubusw %xmm1, %xmm0, %xmm0 vpxor %xmm1, %xmm1, %xmm1 vpcmpeqw %xmm1, %xmm0, %k0 to vpcmpleuw %xmm1, %xmm0, %k0 i.e. for -mavx512bw -mavx512vl transform neq case code from vpsubusw %xmm1, %xmm0, %xmm0 vpxor %xmm1, %xmm1, %xmm1 vpcmpneqw %xmm1, %xmm0, %k0 to vpcmpnleuw %xmm1, %xmm0, %k0 gcc/ChangeLog PR target/96906 * config/i386/sse.md (<avx512>_ucmp<mode>3<mask_scalar_merge_name>): Add a new define_split after this insn. gcc/testsuite/ChangeLog * gcc.target/i386/avx512bw-pr96906-1.c: New test. * gcc.target/i386/pr96906-1.c: Add -mno-avx512f. --- gcc/config/i386/sse.md | 37 ++++++++++ .../gcc.target/i386/avx512bw-pr96906-1.c | 68 +++++++++++++++++++ gcc/testsuite/gcc.target/i386/pr96906-1.c | 2 +- 3 files changed, 106 insertions(+), 1 deletion(-) create mode 100644 gcc/testsuite/gcc.target/i386/avx512bw-pr96906-1.c diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md index 4aad462f882..7a4dafea1ed 100644 --- a/gcc/config/i386/sse.md +++ b/gcc/config/i386/sse.md @@ -3006,6 +3006,43 @@ (define_insn "<avx512>_ucmp<mode>3<mask_scalar_merge_name>" (set_attr "prefix" "evex") (set_attr "mode" "<sseinsnmode>")]) +(define_int_iterator UNSPEC_PCMP_ITER + [UNSPEC_PCMP UNSPEC_UNSIGNED_PCMP]) + +(define_int_attr pcmp_signed_mask + [(UNSPEC_PCMP "3") (UNSPEC_UNSIGNED_PCMP "1")]) + +;; PR96906 - optimize vpsubusw compared to 0 into vpcmpleuw or vpcmpnltuw. 
+;; For signed comparison, handle EQ 0: NEQ 4, +;; for unsigned comparison extra handle LE:2, NLE:6, equivalent to EQ and NEQ. + +(define_split + [(set (match_operand:<avx512fmaskmode> 0 "register_operand") + (unspec:<avx512fmaskmode> + [(us_minus:VI12_AVX512VL + (match_operand:VI12_AVX512VL 1 "vector_operand") + (match_operand:VI12_AVX512VL 2 "vector_operand")) + (match_operand:VI12_AVX512VL 3 "const0_operand") + (match_operand:SI 4 "const_0_to_7_operand")] + UNSPEC_PCMP_ITER))] + "TARGET_AVX512BW + && ix86_binary_operator_ok (US_MINUS, <MODE>mode, operands) + && (INTVAL (operands[4]) & <pcmp_signed_mask>) == 0" + [(const_int 0)] + { + bool neq_p = INTVAL (operands[4]) >> 2; + /* LE: 2, NLT: 5, NLE: 6, LT: 1 */ + rtx cmp_predicate = neq_p ? GEN_INT (6) : GEN_INT (2); + if (MEM_P (operands[1])) + { + std::swap (operands[1], operands[2]); + cmp_predicate = neq_p ? GEN_INT (1) : GEN_INT (5); + } + emit_insn (gen_<avx512>_ucmp<mode>3 (operands[0], operands[1], + operands[2], cmp_predicate)); + DONE; + }) + (define_insn "avx512f_vmcmp<mode>3<round_saeonly_name>" [(set (match_operand:<avx512fmaskmode> 0 "register_operand" "=k") (and:<avx512fmaskmode> diff --git a/gcc/testsuite/gcc.target/i386/avx512bw-pr96906-1.c b/gcc/testsuite/gcc.target/i386/avx512bw-pr96906-1.c new file mode 100644 index 00000000000..81d7e06b972 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/avx512bw-pr96906-1.c @@ -0,0 +1,68 @@ +/* PR target/96906 */ +/* { dg-do compile } */ +/* { dg-options "-O2 -mavx512bw -mavx512vl -masm=att" } */ +/* { dg-final { scan-assembler-times {(?n)vpcmpub[ \t]*\$2} 9 } } */ +/* { dg-final { scan-assembler-times {(?n)vpcmpub[ \t]*\$6} 9 } } */ +/* { dg-final { scan-assembler-times {(?n)vpcmpuw[ \t]*\$2} 9 } } */ +/* { dg-final { scan-assembler-times {(?n)vpcmpuw[ \t]*\$6} 9 } } */ + + +#include<immintrin.h> + +#define FOO(LENGTH,SUFFIX,TYPE,UTYPE,RTYPE,PRED) \ + __mmask##RTYPE \ + foo_##LENGTH##_##TYPE##_##PRED (__m##LENGTH##i x, __m##LENGTH##i y) \ + { \ + return \ + 
_mm##SUFFIX##_cmp_##TYPE##_mask (_mm##SUFFIX##_subs_##UTYPE (x, y), \ + _mm##SUFFIX##_setzero_si##LENGTH (), \ + PRED); \ + } \ + +FOO (128,, epi16, epu16, 8, 0); +FOO (128,, epi16, epu16, 8, 4); + +FOO (128,, epu16, epu16, 8, 0); +FOO (128,, epu16, epu16, 8, 2); +FOO (128,, epu16, epu16, 8, 4); +FOO (128,, epu16, epu16, 8, 6); + +FOO (256, 256, epi16, epu16, 16, 0); +FOO (256, 256, epi16, epu16, 16, 4); + +FOO (256, 256, epu16, epu16, 16, 0); +FOO (256, 256, epu16, epu16, 16, 2); +FOO (256, 256, epu16, epu16, 16, 4); +FOO (256, 256, epu16, epu16, 16, 6); + +FOO (512, 512, epi16, epu16, 32, 0); +FOO (512, 512, epi16, epu16, 32, 4); + +FOO (512, 512, epu16, epu16, 32, 0); +FOO (512, 512, epu16, epu16, 32, 2); +FOO (512, 512, epu16, epu16, 32, 4); +FOO (512, 512, epu16, epu16, 32, 6); + +FOO (128,, epi8, epu8, 16, 0); +FOO (128,, epi8, epu8, 16, 4); + +FOO (128,, epu8, epu8, 16, 0); +FOO (128,, epu8, epu8, 16, 2); +FOO (128,, epu8, epu8, 16, 4); +FOO (128,, epu8, epu8, 16, 6); + +FOO (256, 256, epi8, epu8, 32, 0); +FOO (256, 256, epi8, epu8, 32, 4); + +FOO (256, 256, epu8, epu8, 32, 0); +FOO (256, 256, epu8, epu8, 32, 2); +FOO (256, 256, epu8, epu8, 32, 4); +FOO (256, 256, epu8, epu8, 32, 6); + +FOO (512, 512, epi8, epu8, 64, 0); +FOO (512, 512, epi8, epu8, 64, 4); + +FOO (512, 512, epu8, epu8, 64, 0); +FOO (512, 512, epu8, epu8, 64, 2); +FOO (512, 512, epu8, epu8, 64, 4); +FOO (512, 512, epu8, epu8, 64, 6); diff --git a/gcc/testsuite/gcc.target/i386/pr96906-1.c b/gcc/testsuite/gcc.target/i386/pr96906-1.c index 9d836eb2bdd..b1b41bf522d 100644 --- a/gcc/testsuite/gcc.target/i386/pr96906-1.c +++ b/gcc/testsuite/gcc.target/i386/pr96906-1.c @@ -1,6 +1,6 @@ /* PR target/96906 */ /* { dg-do compile } */ -/* { dg-options "-O2 -mavx2" } */ +/* { dg-options "-O2 -mavx2 -mno-avx512f" } */ /* { dg-final { scan-assembler-times "\tvpminub\[^\n\r]*xmm" 2 } } */ /* { dg-final { scan-assembler-times "\tvpminuw\[^\n\r]*xmm" 2 } } */ /* { dg-final { scan-assembler-times 
"\tvpminub\[^\n\r]*ymm" 2 } } */ -- 2.18.1