Hi:
  Add define_peephole2 patterns to perform optimizations like the ones below:

+/* Optimize for TARGET_AVX512F
+  vpsubusw op1, op2, dst1;
+  vxorps xmm, xmm, dst2; ---->   vpcmpleuw op1, op2, dst3
+  vpcmpeqw dst1, dst2, dst3  */

and

+/* Optimize for target above TARGET_SSE4_1
+  vpsubusw op1, op2, dst1;      vpminuw op1, op2, dst1
+  vpxor xmm, xmm, dst2; ---->   vpcmpeqw op1, dst1, dst3
+  vpcmpeqw dst1, dst2, dst3  */

Bootstrapped and regression tested OK on the i386/x86-64 backend.
Ok for trunk?

gcc/ChangeLog:
        PR target/96906
        * config/i386/sse.md (VI12_128_256): New mode iterator.
        (define_peephole2): Optimize comparison of the result of
        us_minus against 0; it can be optimized to "vpcmplequ" for
        AVX512 or "pminu + cmpeq" for targets at or above TARGET_SSE4_1.

gcc/testsuite/ChangeLog:

        * gcc.target/i386/avx2-pr96906-1.c: New test.
        * gcc.target/i386/avx512f-pr96906-1.c: New test.
        * gcc.target/i386/sse2-pr96906.c: New test.
        * gcc.target/i386/sse4_1-pr96906-1.c: New test.

--
BR,
Hongtao
From dbfbd5350c9d12a0e5ca643cf9666d041d7d4744 Mon Sep 17 00:00:00 2001
From: liuhongt <hongtao.liu@intel.com>
Date: Thu, 3 Sep 2020 16:18:20 +0800
Subject: [PATCH] Optimize __builtin_ia32_psubusw128 compared to 0 to
 __builtin_ia32_pminuw128 compared to operand

gcc/ChangeLog:
	PR target/96906
	* config/i386/sse.md (VI12_128_256): New mode iterator.
	(define_peephole2): Optimize comparison of the result of
	us_minus against 0; it can be optimized to "vpcmplequ" for
	AVX512 or "pminu + cmpeq" for targets at or above TARGET_SSE4_1.

gcc/testsuite/ChangeLog:

	* gcc.target/i386/avx2-pr96906-1.c: New test.
	* gcc.target/i386/avx512f-pr96906-1.c: New test.
	* gcc.target/i386/sse2-pr96906.c: New test.
	* gcc.target/i386/sse4_1-pr96906-1.c: New test.
---
 gcc/config/i386/sse.md                        | 63 +++++++++++++++++++
 .../gcc.target/i386/avx2-pr96906-1.c          | 17 +++++
 .../gcc.target/i386/avx512f-pr96906-1.c       | 40 ++++++++++++
 gcc/testsuite/gcc.target/i386/sse2-pr96906.c  | 11 ++++
 .../gcc.target/i386/sse4_1-pr96906-1.c        | 11 ++++
 5 files changed, 142 insertions(+)
 create mode 100644 gcc/testsuite/gcc.target/i386/avx2-pr96906-1.c
 create mode 100644 gcc/testsuite/gcc.target/i386/avx512f-pr96906-1.c
 create mode 100644 gcc/testsuite/gcc.target/i386/sse2-pr96906.c
 create mode 100644 gcc/testsuite/gcc.target/i386/sse4_1-pr96906-1.c

diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
index 8250325e1a3..60a571494d5 100644
--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -636,6 +636,7 @@ (define_mode_iterator VI124_128 [V16QI V8HI V4SI])
 (define_mode_iterator VI24_128 [V8HI V4SI])
 (define_mode_iterator VI248_128 [V8HI V4SI V2DI])
 (define_mode_iterator VI48_128 [V4SI V2DI])
+(define_mode_iterator VI12_128_256 [V16QI V32QI V8HI V16HI])
 
 ;; Various 256bit and 512 vector integer mode combinations
 (define_mode_iterator VI124_256 [V32QI V16HI V8SI])
@@ -12943,6 +12944,68 @@ (define_insn "sse2_gt<mode>3"
    (set_attr "prefix" "orig,vex")
    (set_attr "mode" "TI")])
 
+/* Optimize for TARGET_AVX512F
+  vpsubusw op1, op2, dst1;
+  vxorps xmm, xmm, dst2; ---->   vpcmpleuw op1, op2, dst3
+  vpcmpeqw dst1, dst2, dst3  */
+(define_peephole2
+  [(set (match_operand:VI12_AVX512VL 0 "register_operand")
+	(us_minus:VI12_AVX512VL
+	  (match_operand:VI12_AVX512VL 1 "register_operand")
+	  (match_operand:VI12_AVX512VL 2 "vector_operand")))
+   (set (match_operand:VI12_AVX512VL 3 "register_operand")
+	(match_operand:VI12_AVX512VL 4 "const0_operand"))
+   (set (match_operand:<avx512fmaskmode> 5 "mask_reg_operand")
+	  (unspec:<avx512fmaskmode>
+	    [(match_operand:VI12_AVX512VL 6 "register_operand")
+	     (match_operand:VI12_AVX512VL 7 "register_operand")
+	     (const_int 0)]
+	    UNSPEC_PCMP))]
+  "((rtx_equal_p (operands[0], operands[6])
+    && rtx_equal_p (operands[3], operands[7]))
+   || (rtx_equal_p (operands[0], operands[7])
+      && rtx_equal_p (operands[3], operands[6])))
+  && peep2_reg_dead_p (3, operands[0])
+  && peep2_reg_dead_p (3, operands[3])"
+  [(set (match_dup 5)
+	(unspec:<avx512fmaskmode>
+	  [(match_dup 1)
+	   (match_dup 2)
+	   (const_int 2)] UNSPEC_UNSIGNED_PCMP))])
+
+/* Optimize for target above TARGET_SSE4_1
+  vpsubusw op1, op2, dst1;      vpminuw op1, op2, dst1
+  vpxor xmm, xmm, dst2; ---->   vpcmpeqw op1, dst1, dst3
+  vpcmpeqw dst1, dst2, dst3  */
+(define_peephole2
+  [(set (match_operand:VI12_128_256 0 "register_operand")
+	(us_minus:VI12_128_256
+	  (match_operand:VI12_128_256 1 "register_operand")
+	  (match_operand:VI12_128_256 2 "vector_operand")))
+   (set (match_operand:VI12_128_256 3 "register_operand")
+	(match_operand:VI12_128_256 4 "const0_operand"))
+   (set (match_operand:VI12_128_256 5 "register_operand")
+	(eq:VI12_128_256
+	  (match_operand:VI12_128_256 6 "register_operand")
+	   (match_operand:VI12_128_256 7 "register_operand")))]
+  "(TARGET_SSE4_1 || <MODE>mode == V16QImode)
+  && ((rtx_equal_p (operands[0], operands[6])
+      && rtx_equal_p (operands[3], operands[7]))
+     || (rtx_equal_p (operands[0], operands[7])
+	&& rtx_equal_p (operands[3], operands[6])))
+  && (peep2_reg_dead_p (3, operands[0])
+     || rtx_equal_p (operands[0],operands[5]))
+  && (peep2_reg_dead_p (3, operands[3])
+     || rtx_equal_p (operands[3],operands[5]))"
+  [(set (match_dup 0)
+	(umin:VI12_128_256
+	  (match_dup 1)
+	  (match_dup 2)))
+   (set (match_dup 5)
+	(eq:VI12_128_256
+	  (match_dup 0)
+	  (match_dup 1)))])
+
 (define_expand "vcond<V_512:mode><VI_AVX512BW:mode>"
   [(set (match_operand:V_512 0 "register_operand")
 	(if_then_else:V_512
diff --git a/gcc/testsuite/gcc.target/i386/avx2-pr96906-1.c b/gcc/testsuite/gcc.target/i386/avx2-pr96906-1.c
new file mode 100644
index 00000000000..85fd6914c29
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/avx2-pr96906-1.c
@@ -0,0 +1,17 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -mavx2 -mno-avx512f" } */
+/* { dg-final { scan-assembler-times "vpminu\[wb\]" 2 } } */
+/* { dg-final { scan-assembler-times "vpcmpeq\[wb\]" 2 } } */
+
+typedef char v32qi __attribute__((vector_size(32)));
+typedef short v16hi __attribute__((vector_size(32)));
+
+v16hi cmple_epu16(v16hi x, v16hi y)
+{
+  return __builtin_ia32_psubusw256(x, y) == 0;
+}
+
+v32qi cmple_epu8(v32qi x, v32qi y)
+{
+  return __builtin_ia32_psubusb256(x, y) == 0;
+}
diff --git a/gcc/testsuite/gcc.target/i386/avx512f-pr96906-1.c b/gcc/testsuite/gcc.target/i386/avx512f-pr96906-1.c
new file mode 100644
index 00000000000..5897d1885a9
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/avx512f-pr96906-1.c
@@ -0,0 +1,40 @@
+/* { dg-do compile } */
+/* { dg-options "-mavx512bw -mavx512vl -O2" } */
+/* { dg-final { scan-assembler-times "vpcmpu\[wb\]\[\t \]*\\\$2" 6 } } */
+
+typedef char v16qi __attribute__((vector_size(16)));
+typedef short v8hi __attribute__((vector_size(16)));
+typedef char v32qi __attribute__((vector_size(32)));
+typedef short v16hi __attribute__((vector_size(32)));
+typedef char v64qi __attribute__((vector_size(64)));
+typedef short v32hi __attribute__((vector_size(64)));
+
+v16qi cmple_epu8_128 (v16qi x, v16qi y)
+{
+  return __builtin_ia32_psubusb128(x, y) == 0;
+}
+
+v32qi cmple_epu8_256 (v32qi x, v32qi y)
+{
+  return __builtin_ia32_psubusb256(x, y) == 0;
+}
+
+v64qi cmple_epu8_512 (v64qi x, v64qi y, v64qi z)
+{
+  return __builtin_ia32_psubusb512_mask (x, y, z, -1) == 0;
+}
+
+v8hi cmple_epu16_128 (v8hi x, v8hi y)
+{
+  return __builtin_ia32_psubusw128(x, y) == 0;
+}
+
+v16hi cmple_epu16_256 (v16hi x, v16hi y)
+{
+  return __builtin_ia32_psubusw256(x, y) == 0;
+}
+
+v32hi cmple_epu16_512 (v32hi x, v32hi y, v32hi z)
+{
+  return __builtin_ia32_psubusw512_mask (x, y, z, -1) == 0;
+}
diff --git a/gcc/testsuite/gcc.target/i386/sse2-pr96906.c b/gcc/testsuite/gcc.target/i386/sse2-pr96906.c
new file mode 100644
index 00000000000..8f7e9ec6556
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/sse2-pr96906.c
@@ -0,0 +1,11 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -msse2 -mno-sse4.1" } */
+/* { dg-final { scan-assembler-times "pminub" 1 } } */
+/* { dg-final { scan-assembler-times "pcmpeqb" 1 } } */
+
+typedef char v16qi __attribute__((vector_size(16)));
+
+v16qi cmple_epu8(v16qi x, v16qi y)
+{
+  return __builtin_ia32_psubusb128(x, y) == 0;
+}
diff --git a/gcc/testsuite/gcc.target/i386/sse4_1-pr96906-1.c b/gcc/testsuite/gcc.target/i386/sse4_1-pr96906-1.c
new file mode 100644
index 00000000000..02cdc1f2d0e
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/sse4_1-pr96906-1.c
@@ -0,0 +1,11 @@
+/* { dg-do compile } */
+/* { dg-options "-msse4.1 -mno-avx -O2" } */
+/* { dg-final { scan-assembler-times "pminuw" 1 } } */
+/* { dg-final { scan-assembler-times "pcmpeqw" 1 } } */
+
+typedef short v8hi __attribute__((vector_size(16)));
+
+v8hi cmple_epu16(v8hi x, v8hi y)
+{
+  return __builtin_ia32_psubusw128(x, y) == 0;
+}
-- 
2.18.1

Reply via email to