Hi: Add define_peephole2 patterns to perform optimizations like the ones below: +/* Optimize for TARGET_AVX512F + vpsubusw op1, op2, dst1; + vxorps xmm, xmm, dst2; ----> vpcmpleuw op1, op2, dst3 + vpcmpeqw dst1, dst2, dst3 */
and +/* Optimize for target above TARGET_SSE4_1 + vpsubusw op1, op2, dst1; vpminuw op1, op2, dst1 + vpxor xmm, xmm, dst2; ----> vpcmpeqw op1, dst1, dst3 + vpcmpeqw dst1, dst2, dst3 */ Bootstrapped and regression-tested OK on the i386/x86-64 backend. Ok for trunk? gcc/ChangeLog: PR target/96906 * config/i386/sse.md (VI12_128_256): New mode iterator. (define_peephole2): Optimize comparison between result of us_minus and 0, it could be optimized to "vpcmplequ" for AVX512 or "pminu + cmpeq" for target above TARGET_SSE4_1. gcc/testsuite/ChangeLog: * gcc.target/i386/avx2-pr96906-1.c: New test. * gcc.target/i386/avx512f-pr96906-1.c: New test. * gcc.target/i386/sse2-pr96906.c: New test. * gcc.target/i386/sse4_1-pr96906-1.c: New test. -- BR, Hongtao
From dbfbd5350c9d12a0e5ca643cf9666d041d7d4744 Mon Sep 17 00:00:00 2001 From: liuhongt <hongtao.liu@intel.com> Date: Thu, 3 Sep 2020 16:18:20 +0800 Subject: [PATCH] Optimize __builtin_ia32_psubusw128 compared to 0 to __builtin_ia32_pminuw128 compared to operand gcc/ChangeLog: PR target/96906 * config/i386/sse.md (VI12_128_256): New mode iterator. (define_peephole2): Optimize comparison between result of us_minus and 0, it could be optimized to "vpcmplequ" for AVX512 or "pminu + cmpeq" for target above TARGET_SSE4_1. gcc/testsuite/ChangeLog: * gcc.target/i386/avx2-pr96906-1.c: New test. * gcc.target/i386/avx512f-pr96906-1.c: New test. * gcc.target/i386/sse2-pr96906.c: New test. * gcc.target/i386/sse4_1-pr96906-1.c: New test. --- gcc/config/i386/sse.md | 63 +++++++++++++++++++ .../gcc.target/i386/avx2-pr96906-1.c | 17 +++++ .../gcc.target/i386/avx512f-pr96906-1.c | 40 ++++++++++++ gcc/testsuite/gcc.target/i386/sse2-pr96906.c | 11 ++++ .../gcc.target/i386/sse4_1-pr96906-1.c | 11 ++++ 5 files changed, 142 insertions(+) create mode 100644 gcc/testsuite/gcc.target/i386/avx2-pr96906-1.c create mode 100644 gcc/testsuite/gcc.target/i386/avx512f-pr96906-1.c create mode 100644 gcc/testsuite/gcc.target/i386/sse2-pr96906.c create mode 100644 gcc/testsuite/gcc.target/i386/sse4_1-pr96906-1.c diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md index 8250325e1a3..60a571494d5 100644 --- a/gcc/config/i386/sse.md +++ b/gcc/config/i386/sse.md @@ -636,6 +636,7 @@ (define_mode_iterator VI124_128 [V16QI V8HI V4SI]) (define_mode_iterator VI24_128 [V8HI V4SI]) (define_mode_iterator VI248_128 [V8HI V4SI V2DI]) (define_mode_iterator VI48_128 [V4SI V2DI]) +(define_mode_iterator VI12_128_256 [V16QI V32QI V8HI V16HI]) ;; Various 256bit and 512 vector integer mode combinations (define_mode_iterator VI124_256 [V32QI V16HI V8SI]) @@ -12943,6 +12944,68 @@ (define_insn "sse2_gt<mode>3" (set_attr "prefix" "orig,vex") (set_attr "mode" "TI")]) +/* Optimize for TARGET_AVX512F + vpsubusw op1, op2, 
dst1; + vxorps xmm, xmm, dst2; ----> vpcmpleuw op1, op2, dst3 + vpcmpeqw dst1, dst2, dst3 */ +(define_peephole2 + [(set (match_operand:VI12_AVX512VL 0 "register_operand") + (us_minus:VI12_AVX512VL + (match_operand:VI12_AVX512VL 1 "register_operand") + (match_operand:VI12_AVX512VL 2 "vector_operand"))) + (set (match_operand:VI12_AVX512VL 3 "register_operand") + (match_operand:VI12_AVX512VL 4 "const0_operand")) + (set (match_operand:<avx512fmaskmode> 5 "mask_reg_operand") + (unspec:<avx512fmaskmode> + [(match_operand:VI12_AVX512VL 6 "register_operand") + (match_operand:VI12_AVX512VL 7 "register_operand") + (const_int 0)] + UNSPEC_PCMP))] + "((rtx_equal_p (operands[0], operands[6]) + && rtx_equal_p (operands[3], operands[7])) + || (rtx_equal_p (operands[0], operands[7]) + && rtx_equal_p (operands[3], operands[6]))) + && peep2_reg_dead_p (3, operands[0]) + && peep2_reg_dead_p (3, operands[3])" + [(set (match_dup 5) + (unspec:<avx512fmaskmode> + [(match_dup 1) + (match_dup 2) + (const_int 2)] UNSPEC_UNSIGNED_PCMP))]) + +/* Optimize for target above TARGET_SSE4_1 + vpsubusw op1, op2, dst1; vpminuw op1, op2, dst1 + vpxor xmm, xmm, dst2; ----> vpcmpeqw op1, dst1, dst3 + vpcmpeqw dst1, dst2, dst3 */ +(define_peephole2 + [(set (match_operand:VI12_128_256 0 "register_operand") + (us_minus:VI12_128_256 + (match_operand:VI12_128_256 1 "register_operand") + (match_operand:VI12_128_256 2 "vector_operand"))) + (set (match_operand:VI12_128_256 3 "register_operand") + (match_operand:VI12_128_256 4 "const0_operand")) + (set (match_operand:VI12_128_256 5 "register_operand") + (eq:VI12_128_256 + (match_operand:VI12_128_256 6 "register_operand") + (match_operand:VI12_128_256 7 "register_operand")))] + "(TARGET_SSE4_1 || <MODE>mode == V16QImode) + && ((rtx_equal_p (operands[0], operands[6]) + && rtx_equal_p (operands[3], operands[7])) + || (rtx_equal_p (operands[0], operands[7]) + && rtx_equal_p (operands[3], operands[6]))) + && (peep2_reg_dead_p (3, operands[0]) + || rtx_equal_p 
(operands[0],operands[5])) + && (peep2_reg_dead_p (3, operands[3]) + || rtx_equal_p (operands[3],operands[5]))" + [(set (match_dup 0) + (umin:VI12_128_256 + (match_dup 1) + (match_dup 2))) + (set (match_dup 5) + (eq:VI12_128_256 + (match_dup 0) + (match_dup 1)))]) + (define_expand "vcond<V_512:mode><VI_AVX512BW:mode>" [(set (match_operand:V_512 0 "register_operand") (if_then_else:V_512 diff --git a/gcc/testsuite/gcc.target/i386/avx2-pr96906-1.c b/gcc/testsuite/gcc.target/i386/avx2-pr96906-1.c new file mode 100644 index 00000000000..85fd6914c29 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/avx2-pr96906-1.c @@ -0,0 +1,17 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -mavx2 -mno-avx512f" } */ +/* { dg-final { scan-assembler-times "vpminu\[wb\]" 2 } } */ +/* { dg-final { scan-assembler-times "vpcmpeq\[wb\]" 2 } } */ + +typedef char v32qi __attribute__((vector_size(32))); +typedef short v16hi __attribute__((vector_size(32))); + +v16hi cmple_epu16(v16hi x, v16hi y) +{ + return __builtin_ia32_psubusw256(x, y) == 0; +} + +v32qi cmple_epu8(v32qi x, v32qi y) +{ + return __builtin_ia32_psubusb256(x, y) == 0; +} diff --git a/gcc/testsuite/gcc.target/i386/avx512f-pr96906-1.c b/gcc/testsuite/gcc.target/i386/avx512f-pr96906-1.c new file mode 100644 index 00000000000..5897d1885a9 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/avx512f-pr96906-1.c @@ -0,0 +1,40 @@ +/* { dg-do compile } */ +/* { dg-options "-mavx512bw -mavx512vl -O2" } */ +/* { dg-final { scan-assembler-times "vpcmpu\[wb\]\[\t \]*\\\$2" 6 } } */ + +typedef char v16qi __attribute__((vector_size(16))); +typedef short v8hi __attribute__((vector_size(16))); +typedef char v32qi __attribute__((vector_size(32))); +typedef short v16hi __attribute__((vector_size(32))); +typedef char v64qi __attribute__((vector_size(64))); +typedef short v32hi __attribute__((vector_size(64))); + +v16qi cmple_epu8_128 (v16qi x, v16qi y) +{ + return __builtin_ia32_psubusb128(x, y) == 0; +} + +v32qi cmple_epu8_256 (v32qi x, v32qi y) +{ 
+ return __builtin_ia32_psubusb256(x, y) == 0; +} + +v64qi cmple_epu8_512 (v64qi x, v64qi y, v64qi z) +{ + return __builtin_ia32_psubusb512_mask (x, y, z, -1) == 0; +} + +v8hi cmple_epu16_128 (v8hi x, v8hi y) +{ + return __builtin_ia32_psubusw128(x, y) == 0; +} + +v16hi cmple_epu16_256 (v16hi x, v16hi y) +{ + return __builtin_ia32_psubusw256(x, y) == 0; +} + +v32hi cmple_epu16_512 (v32hi x, v32hi y, v32hi z) +{ + return __builtin_ia32_psubusw512_mask (x, y, z, -1) == 0; +} diff --git a/gcc/testsuite/gcc.target/i386/sse2-pr96906.c b/gcc/testsuite/gcc.target/i386/sse2-pr96906.c new file mode 100644 index 00000000000..8f7e9ec6556 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/sse2-pr96906.c @@ -0,0 +1,11 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -msse2 -mno-sse4.1" } */ +/* { dg-final { scan-assembler-times "pminub" 1 } } */ +/* { dg-final { scan-assembler-times "pcmpeqb" 1 } } */ + +typedef char v16qi __attribute__((vector_size(16))); + +v16qi cmple_epu8(v16qi x, v16qi y) +{ + return __builtin_ia32_psubusb128(x, y) == 0; +} diff --git a/gcc/testsuite/gcc.target/i386/sse4_1-pr96906-1.c b/gcc/testsuite/gcc.target/i386/sse4_1-pr96906-1.c new file mode 100644 index 00000000000..02cdc1f2d0e --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/sse4_1-pr96906-1.c @@ -0,0 +1,11 @@ +/* { dg-do compile } */ +/* { dg-options "-msse4.1 -mno-avx -O2" } */ +/* { dg-final { scan-assembler-times "pminuw" 1 } } */ +/* { dg-final { scan-assembler-times "pcmpeqw" 1 } } */ + +typedef short v8hi __attribute__((vector_size(16))); + +v8hi cmple_epu16(v8hi x, v8hi y) +{ + return __builtin_ia32_psubusw128(x, y) == 0; +} -- 2.18.1