On Mon, Nov 30, 2020 at 9:46 PM Jakub Jelinek <ja...@redhat.com> wrote:
>
> On Mon, Nov 30, 2020 at 09:11:10PM +0800, Hongtao Liu wrote:
> > +;; PR96906 - optimize vpsubusw compared to 0 into vpcmpleuw or vpcmpnltuw.
> > +(define_split
> > +  [(set (match_operand:<avx512fmaskmode> 0 "register_operand")
> > +        (unspec:<avx512fmaskmode>
> > +          [(us_minus:VI12_AVX512VL
> > +             (match_operand:VI12_AVX512VL 1 "vector_operand")
> > +             (match_operand:VI12_AVX512VL 2 "vector_operand"))
> > +           (match_operand:VI12_AVX512VL 3 "const0_operand")
> > +           (match_operand:SI 4 "const0_operand")]
> > +          UNSPEC_PCMP))]
> > +  "TARGET_AVX512BW && ix86_binary_operator_ok (US_MINUS, <MODE>mode, 
> > operands)"
>
> Too long line, please wrap it.
> Also, INTVAL (operands[4]) == 0 is EQ comparison, can't we handle also
> NE (i.e. INTVAL (operands[4]) == 4)?
> I.e. replace the "const0_operand" in there with "const_0_to_7_operand"
> and check in conditions that (INTVAL (operands[4]) & 3) == 0.
>
> > +  [(const_int 0)]
> > +  {
> > +    /* LE: 2, NLT: 5.  */
> > +    rtx cmp_predicate = GEN_INT (2);
> > +    if (MEM_P (operands[1]))
> > +      {
> > +        std::swap (operands[1], operands[2]);
> > +        cmp_predicate = GEN_INT (5);
>
> For INTVAL (operands[4]) == 4 it would then be cmp_predicate NLE: 4 resp.
> LT: 3 I think.
>
> Also, this handles only UNSPEC_PCMP, can't we handle UNSPEC_UNSIGNED_PCMP
> too?  I mean, for equality comparisons it doesn't really matter if we have
> signed or unsigned == or !=.  And for unsigned
> x == 0U is equivalent to x <= 0U, and x != 0U equivalent to x > 0U.
>
>         Jakub
>

Yes, here is the updated patch.

+(define_int_iterator UNSPEC_PCMP_ITER
+  [UNSPEC_PCMP UNSPEC_UNSIGNED_PCMP])
+
+(define_int_attr pcmp_signed_mask
+  [(UNSPEC_PCMP "3") (UNSPEC_UNSIGNED_PCMP "1")])
+
+;; PR96906 - optimize vpsubusw compared to 0 into vpcmpleuw or vpcmpnltuw.
+;; For signed comparison, handle EQ 0: NEQ 4,
+;; for unsigned comparison extra handle LE:2, NLE:6, equivalent to EQ and NEQ.
+
+(define_split
+  [(set (match_operand:<avx512fmaskmode> 0 "register_operand")
+       (unspec:<avx512fmaskmode>
+         [(us_minus:VI12_AVX512VL
+            (match_operand:VI12_AVX512VL 1 "vector_operand")
+            (match_operand:VI12_AVX512VL 2 "vector_operand"))
+          (match_operand:VI12_AVX512VL 3 "const0_operand")
+          (match_operand:SI 4 "const_0_to_7_operand")]
+         UNSPEC_PCMP_ITER))]
+  "TARGET_AVX512BW
+  && ix86_binary_operator_ok (US_MINUS, <MODE>mode, operands)
+  && (INTVAL (operands[4]) & <pcmp_signed_mask>) == 0"
+  [(const_int 0)]
+  {
+    bool neq_p = INTVAL (operands[4]) >> 2;
+    /* LE: 2, NLT: 5, NLE: 6, LT: 1  */
+    rtx cmp_predicate = neq_p ? GEN_INT (6) : GEN_INT (2);
+    if (MEM_P (operands[1]))
+      {
+       std::swap (operands[1], operands[2]);
+       cmp_predicate = neq_p ? GEN_INT (1) : GEN_INT (5);
+      }
+    emit_insn (gen_<avx512>_ucmp<mode>3 (operands[0], operands[1],
+                                       operands[2], cmp_predicate));
+    DONE;
+  })
+


-- 
BR,
Hongtao
From e3eb61066ee665325cba8e231b991f9a1dda07df Mon Sep 17 00:00:00 2001
From: liuhongt <hongtao....@intel.com>
Date: Mon, 30 Nov 2020 13:27:16 +0800
Subject: [PATCH]  Optimize vpsubusw compared to 0 into vpcmpleuw or vpcmpnleuw
 [PR96906]

For signed comparisons, it handles cases that are eq or neq to 0.
For unsigned comparisons, it additionally handles cases that are le or
gt to 0 (equivalent to eq or neq to 0). Transform case eq to leu,
case neq to gtu.

I.e., for -mavx512bw -mavx512vl, transform the eq case code from

	vpsubusw        %xmm1, %xmm0, %xmm0
	vpxor   %xmm1, %xmm1, %xmm1
	vpcmpeqw  %xmm1, %xmm0, %k0
to
	vpcmpleuw       %xmm1, %xmm0, %k0

I.e., for -mavx512bw -mavx512vl, transform the neq case code from

	vpsubusw        %xmm1, %xmm0, %xmm0
	vpxor   %xmm1, %xmm1, %xmm1
	vpcmpneqw  %xmm1, %xmm0, %k0
to
	vpcmpnleuw       %xmm1, %xmm0, %k0

gcc/ChangeLog
	PR target/96906
	* config/i386/sse.md
	(<avx512>_ucmp<mode>3<mask_scalar_merge_name>): Add a new
	define_split after this insn.

gcc/testsuite/ChangeLog

	* gcc.target/i386/avx512bw-pr96906-1.c: New test.
	* gcc.target/i386/pr96906-1.c: Add -mno-avx512f.
---
 gcc/config/i386/sse.md                        | 37 ++++++++++
 .../gcc.target/i386/avx512bw-pr96906-1.c      | 68 +++++++++++++++++++
 gcc/testsuite/gcc.target/i386/pr96906-1.c     |  2 +-
 3 files changed, 106 insertions(+), 1 deletion(-)
 create mode 100644 gcc/testsuite/gcc.target/i386/avx512bw-pr96906-1.c

diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
index 4aad462f882..7a4dafea1ed 100644
--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -3006,6 +3006,43 @@ (define_insn "<avx512>_ucmp<mode>3<mask_scalar_merge_name>"
    (set_attr "prefix" "evex")
    (set_attr "mode" "<sseinsnmode>")])
 
+(define_int_iterator UNSPEC_PCMP_ITER
+  [UNSPEC_PCMP UNSPEC_UNSIGNED_PCMP])
+
+(define_int_attr pcmp_signed_mask
+  [(UNSPEC_PCMP "3") (UNSPEC_UNSIGNED_PCMP "1")])
+
+;; PR96906 - optimize vpsubusw compared to 0 into vpcmpleuw or vpcmpnltuw.
+;; For signed comparison, handle EQ 0: NEQ 4,
+;; for unsigned comparison extra handle LE:2, NLE:6, equivalent to EQ and NEQ.
+
+(define_split
+  [(set (match_operand:<avx512fmaskmode> 0 "register_operand")
+	(unspec:<avx512fmaskmode>
+	  [(us_minus:VI12_AVX512VL
+	     (match_operand:VI12_AVX512VL 1 "vector_operand")
+	     (match_operand:VI12_AVX512VL 2 "vector_operand"))
+	   (match_operand:VI12_AVX512VL 3 "const0_operand")
+	   (match_operand:SI 4 "const_0_to_7_operand")]
+	  UNSPEC_PCMP_ITER))]
+  "TARGET_AVX512BW
+  && ix86_binary_operator_ok (US_MINUS, <MODE>mode, operands)
+  && (INTVAL (operands[4]) & <pcmp_signed_mask>) == 0"
+  [(const_int 0)]
+  {
+    bool neq_p = INTVAL (operands[4]) >> 2;
+    /* LE: 2, NLT: 5, NLE: 6, LT: 1  */
+    rtx cmp_predicate = neq_p ? GEN_INT (6) : GEN_INT (2);
+    if (MEM_P (operands[1]))
+      {
+	std::swap (operands[1], operands[2]);
+	cmp_predicate = neq_p ? GEN_INT (1) : GEN_INT (5);
+      }
+    emit_insn (gen_<avx512>_ucmp<mode>3 (operands[0], operands[1],
+					operands[2], cmp_predicate));
+    DONE;
+  })
+
 (define_insn "avx512f_vmcmp<mode>3<round_saeonly_name>"
   [(set (match_operand:<avx512fmaskmode> 0 "register_operand" "=k")
 	(and:<avx512fmaskmode>
diff --git a/gcc/testsuite/gcc.target/i386/avx512bw-pr96906-1.c b/gcc/testsuite/gcc.target/i386/avx512bw-pr96906-1.c
new file mode 100644
index 00000000000..81d7e06b972
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/avx512bw-pr96906-1.c
@@ -0,0 +1,68 @@
+/* PR target/96906 */
+/* { dg-do compile } */
+/* { dg-options "-O2 -mavx512bw -mavx512vl -masm=att" } */
+/* { dg-final { scan-assembler-times {(?n)vpcmpub[ \t]*\$2} 9 } } */
+/* { dg-final { scan-assembler-times {(?n)vpcmpub[ \t]*\$6} 9 } } */
+/* { dg-final { scan-assembler-times {(?n)vpcmpuw[ \t]*\$2} 9 } } */
+/* { dg-final { scan-assembler-times {(?n)vpcmpuw[ \t]*\$6} 9 } } */
+
+
+#include<immintrin.h>
+
+#define FOO(LENGTH,SUFFIX,TYPE,UTYPE,RTYPE,PRED)			\
+  __mmask##RTYPE							\
+  foo_##LENGTH##_##TYPE##_##PRED (__m##LENGTH##i x, __m##LENGTH##i y)	\
+  {									\
+    return								\
+      _mm##SUFFIX##_cmp_##TYPE##_mask (_mm##SUFFIX##_subs_##UTYPE (x, y), \
+				       _mm##SUFFIX##_setzero_si##LENGTH (), \
+				       PRED);				\
+  }									\
+
+FOO (128,, epi16, epu16, 8, 0);
+FOO (128,, epi16, epu16, 8, 4);
+
+FOO (128,, epu16, epu16, 8, 0);
+FOO (128,, epu16, epu16, 8, 2);
+FOO (128,, epu16, epu16, 8, 4);
+FOO (128,, epu16, epu16, 8, 6);
+
+FOO (256, 256, epi16, epu16, 16, 0);
+FOO (256, 256, epi16, epu16, 16, 4);
+
+FOO (256, 256, epu16, epu16, 16, 0);
+FOO (256, 256, epu16, epu16, 16, 2);
+FOO (256, 256, epu16, epu16, 16, 4);
+FOO (256, 256, epu16, epu16, 16, 6);
+
+FOO (512, 512, epi16, epu16, 32, 0);
+FOO (512, 512, epi16, epu16, 32, 4);
+
+FOO (512, 512, epu16, epu16, 32, 0);
+FOO (512, 512, epu16, epu16, 32, 2);
+FOO (512, 512, epu16, epu16, 32, 4);
+FOO (512, 512, epu16, epu16, 32, 6);
+
+FOO (128,, epi8, epu8, 16, 0);
+FOO (128,, epi8, epu8, 16, 4);
+
+FOO (128,, epu8, epu8, 16, 0);
+FOO (128,, epu8, epu8, 16, 2);
+FOO (128,, epu8, epu8, 16, 4);
+FOO (128,, epu8, epu8, 16, 6);
+
+FOO (256, 256, epi8, epu8, 32, 0);
+FOO (256, 256, epi8, epu8, 32, 4);
+
+FOO (256, 256, epu8, epu8, 32, 0);
+FOO (256, 256, epu8, epu8, 32, 2);
+FOO (256, 256, epu8, epu8, 32, 4);
+FOO (256, 256, epu8, epu8, 32, 6);
+
+FOO (512, 512, epi8, epu8, 64, 0);
+FOO (512, 512, epi8, epu8, 64, 4);
+
+FOO (512, 512, epu8, epu8, 64, 0);
+FOO (512, 512, epu8, epu8, 64, 2);
+FOO (512, 512, epu8, epu8, 64, 4);
+FOO (512, 512, epu8, epu8, 64, 6);
diff --git a/gcc/testsuite/gcc.target/i386/pr96906-1.c b/gcc/testsuite/gcc.target/i386/pr96906-1.c
index 9d836eb2bdd..b1b41bf522d 100644
--- a/gcc/testsuite/gcc.target/i386/pr96906-1.c
+++ b/gcc/testsuite/gcc.target/i386/pr96906-1.c
@@ -1,6 +1,6 @@
 /* PR target/96906 */
 /* { dg-do compile } */
-/* { dg-options "-O2 -mavx2" } */
+/* { dg-options "-O2 -mavx2 -mno-avx512f" } */
 /* { dg-final { scan-assembler-times "\tvpminub\[^\n\r]*xmm" 2 } } */
 /* { dg-final { scan-assembler-times "\tvpminuw\[^\n\r]*xmm" 2 } } */
 /* { dg-final { scan-assembler-times "\tvpminub\[^\n\r]*ymm" 2 } } */
-- 
2.18.1

Reply via email to