On Wed, Nov 25, 2020 at 11:34 AM Jakub Jelinek <ja...@redhat.com> wrote: > > Hi! > > The following patch renames VI12_AVX2 iterator to VI12_AVX2_AVX512BW > for consistency with some other iterators, as I need VI12_AVX2 without > AVX512BW for this change. > The real meat is a pre-reload define_insn_and_split which combine > can use to optimize psubusw compared to 0 into pminuw compared to op0 > (and similarly for psubusb compared to 0 into pminub compared to op0). > According to Agner Fog's tables, psubus[bw] and pminu[bw] timings > are the same, but the advantage of pminu[bw] is that the comparison > doesn't need a zero operand, so e.g. for -msse4.1 it causes changes like > - psubusw %xmm1, %xmm0 > - pxor %xmm1, %xmm1 > + pminuw %xmm0, %xmm1 > pcmpeqw %xmm1, %xmm0 > and similarly for avx2: > - vpsubusb %ymm1, %ymm0, %ymm0 > - vpxor %xmm1, %xmm1, %xmm1 > - vpcmpeqb %ymm1, %ymm0, %ymm0 > + vpminub %ymm1, %ymm0, %ymm1 > + vpcmpeqb %ymm0, %ymm1, %ymm0 > > I haven't done the AVX512{BW,VL} define_insn_and_split, they'll need > to match the UNSPEC_PCMP which are used for avx512 comparisons. > > Bootstrapped/regtested on x86_64-linux and i686-linux, ok for trunk?
I *think* this could be done with a combine splitter. The benefit of using otherwise extremely picky combine splitter (it also doesn't report why it can't split a combined insn) is that the split insn can be used in follow-up combine attempts. Uros. > 2020-11-25 Jakub Jelinek <ja...@redhat.com> > > PR target/96906 > * config/i386/sse.md (VI12_AVX2): Remove V64QI/V32HI modes. > (VI12_AVX2_AVX512BW): New mode iterator. > (<sse2_avx2>_<plusminus_insn><mode>3<mask_name>, > *<sse2_avx2>_<plusminus_insn><mode>3<mask_name>, > uavg<mode>3_ceil, <sse2_avx2>_uavg<mode>3<mask_name>, > *<sse2_avx2>_uavg<mode>3<mask_name>): Use VI12_AVX2_AVX512BW > iterator instead of VI12_AVX2. > (*<sse2_avx2>_ussub<mode>3_eq0): New define_insn_and_split. > > * gcc.target/i386/pr96906-1.c: New test. > > --- gcc/config/i386/sse.md.jj 2020-11-24 20:20:06.481079501 +0100 > +++ gcc/config/i386/sse.md 2020-11-25 11:22:25.565877452 +0100 > @@ -466,6 +466,10 @@ (define_mode_iterator SSESCALARMODE > [(V4TI "TARGET_AVX512BW") (V2TI "TARGET_AVX2") TI]) > > (define_mode_iterator VI12_AVX2 > + [(V32QI "TARGET_AVX2") V16QI > + (V16HI "TARGET_AVX2") V8HI]) > + > +(define_mode_iterator VI12_AVX2_AVX512BW > [(V64QI "TARGET_AVX512BW") (V32QI "TARGET_AVX2") V16QI > (V32HI "TARGET_AVX512BW") (V16HI "TARGET_AVX2") V8HI]) > > @@ -11395,18 +11399,18 @@ (define_insn "*<plusminus_insn><mode>3_m > (set_attr "mode" "<sseinsnmode>")]) > > (define_expand "<sse2_avx2>_<plusminus_insn><mode>3<mask_name>" > - [(set (match_operand:VI12_AVX2 0 "register_operand") > - (sat_plusminus:VI12_AVX2 > - (match_operand:VI12_AVX2 1 "vector_operand") > - (match_operand:VI12_AVX2 2 "vector_operand")))] > + [(set (match_operand:VI12_AVX2_AVX512BW 0 "register_operand") > + (sat_plusminus:VI12_AVX2_AVX512BW > + (match_operand:VI12_AVX2_AVX512BW 1 "vector_operand") > + (match_operand:VI12_AVX2_AVX512BW 2 "vector_operand")))] > "TARGET_SSE2 && <mask_mode512bit_condition> && <mask_avx512bw_condition>" > "ix86_fixup_binary_operands_no_copy (<CODE>, <MODE>mode, operands);") > > (define_insn "*<sse2_avx2>_<plusminus_insn><mode>3<mask_name>" > - [(set (match_operand:VI12_AVX2 0 "register_operand" "=x,v") > - (sat_plusminus:VI12_AVX2 > - (match_operand:VI12_AVX2 1 "vector_operand" "<comm>0,v") > - (match_operand:VI12_AVX2 2 "vector_operand" "xBm,vm")))] > + [(set (match_operand:VI12_AVX2_AVX512BW 0 "register_operand" "=x,v") > + (sat_plusminus:VI12_AVX2_AVX512BW > + (match_operand:VI12_AVX2_AVX512BW 1 "vector_operand" "<comm>0,v") > + (match_operand:VI12_AVX2_AVX512BW 2 "vector_operand" "xBm,vm")))] > "TARGET_SSE2 && <mask_mode512bit_condition> && <mask_avx512bw_condition> > && ix86_binary_operator_ok (<CODE>, <MODE>mode, operands)" > "@ > @@ -11418,6 +11422,26 @@ (define_insn "*<sse2_avx2>_<plusminus_in > (set_attr "prefix" "orig,maybe_evex") > (set_attr "mode" "TI")]) > > +;; PR96906 - optimize psubusw compared to 0 into pminuw compared to op0. > +(define_insn_and_split "*<sse2_avx2>_ussub<mode>3_eq0" > + [(set (match_operand:VI12_AVX2 0 "register_operand" "=x,x") > + (eq:VI12_AVX2 > + (us_minus:VI12_AVX2 > + (match_operand:VI12_AVX2 1 "vector_operand" "0,v") > + (match_operand:VI12_AVX2 2 "vector_operand" "xBm,vm")) > + (match_operand:VI12_AVX2 3 "const0_operand" "C,C")))] > + "TARGET_SSE2 > + && (<MODE>mode != V8HImode || TARGET_SSE4_1) > + && ix86_pre_reload_split () > + && ix86_binary_operator_ok (US_MINUS, <MODE>mode, operands)" > + "#" > + "&& 1" > + [(set (match_dup 4) > + (umin:VI12_AVX2 (match_dup 1) (match_dup 2))) > + (set (match_dup 0) > + (eq:VI12_AVX2 (match_dup 4) (match_dup 1)))] > + "operands[4] = gen_reg_rtx (<MODE>mode);") > + > (define_expand "mulv8qi3" > [(set (match_operand:V8QI 0 "register_operand") > (mult:V8QI (match_operand:V8QI 1 "register_operand") > @@ -12022,15 +12046,15 @@ (define_expand "sdot_prodv4si" > }) > > (define_expand "uavg<mode>3_ceil" > - [(set (match_operand:VI12_AVX2 0 "register_operand") > - (truncate:VI12_AVX2 > + [(set (match_operand:VI12_AVX2_AVX512BW 0 "register_operand") > + (truncate:VI12_AVX2_AVX512BW > (lshiftrt:<ssedoublemode> > (plus:<ssedoublemode> > (plus:<ssedoublemode> > (zero_extend:<ssedoublemode> > - (match_operand:VI12_AVX2 1 "vector_operand")) > + (match_operand:VI12_AVX2_AVX512BW 1 "vector_operand")) > (zero_extend:<ssedoublemode> > - (match_operand:VI12_AVX2 2 "vector_operand"))) > + (match_operand:VI12_AVX2_AVX512BW 2 "vector_operand"))) > (match_dup 3)) > (const_int 1))))] > "TARGET_SSE2" > @@ -15744,15 +15768,15 @@ (define_expand "vec_unpacks_hi_<mode>" > ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; > > (define_expand "<sse2_avx2>_uavg<mode>3<mask_name>" > - [(set (match_operand:VI12_AVX2 0 "register_operand") > - (truncate:VI12_AVX2 > + [(set (match_operand:VI12_AVX2_AVX512BW 0 "register_operand") > + (truncate:VI12_AVX2_AVX512BW > (lshiftrt:<ssedoublemode> > (plus:<ssedoublemode> > (plus:<ssedoublemode> > (zero_extend:<ssedoublemode> > - (match_operand:VI12_AVX2 1 "vector_operand")) > + (match_operand:VI12_AVX2_AVX512BW 1 "vector_operand")) > (zero_extend:<ssedoublemode> > - (match_operand:VI12_AVX2 2 "vector_operand"))) > + (match_operand:VI12_AVX2_AVX512BW 2 "vector_operand"))) > (match_dup <mask_expand_op3>)) > (const_int 1))))] > "TARGET_SSE2 && <mask_mode512bit_condition> && <mask_avx512bw_condition>" > @@ -15762,15 +15786,15 @@ (define_expand "<sse2_avx2>_uavg<mode>3< > }) > > (define_insn "*<sse2_avx2>_uavg<mode>3<mask_name>" > - [(set (match_operand:VI12_AVX2 0 "register_operand" "=x,v") > - (truncate:VI12_AVX2 > + [(set (match_operand:VI12_AVX2_AVX512BW 0 "register_operand" "=x,v") > + (truncate:VI12_AVX2_AVX512BW > (lshiftrt:<ssedoublemode> > (plus:<ssedoublemode> > (plus:<ssedoublemode> > (zero_extend:<ssedoublemode> > - (match_operand:VI12_AVX2 1 "vector_operand" "%0,v")) > + (match_operand:VI12_AVX2_AVX512BW 1 "vector_operand" > "%0,v")) > (zero_extend:<ssedoublemode> > - (match_operand:VI12_AVX2 2 "vector_operand" "xBm,vm"))) > + (match_operand:VI12_AVX2_AVX512BW 2 "vector_operand" > "xBm,vm"))) > (match_operand:<ssedoublemode> <mask_expand_op3> > "const1_operand")) > (const_int 1))))] > "TARGET_SSE2 && <mask_mode512bit_condition> && <mask_avx512bw_condition> > --- gcc/testsuite/gcc.target/i386/pr96906-1.c.jj 2020-11-25 > 11:13:16.995100955 +0100 > +++ gcc/testsuite/gcc.target/i386/pr96906-1.c 2020-11-25 11:13:16.995100955 > +0100 > @@ -0,0 +1,62 @@ > +/* PR target/96906 */ > +/* { dg-do compile } */ > +/* { dg-options "-O2 -mavx2" } */ > +/* { dg-final { scan-assembler-times "\tvpminub\[^\n\r]*xmm" 2 } } */ > +/* { dg-final { scan-assembler-times "\tvpminuw\[^\n\r]*xmm" 2 } } */ > +/* { dg-final { scan-assembler-times "\tvpminub\[^\n\r]*ymm" 2 } } */ > +/* { dg-final { scan-assembler-times "\tvpminuw\[^\n\r]*ymm" 2 } } */ > +/* { dg-final { scan-assembler-times "\tvpcmpeqb\[^\n\r]*xmm" 2 } } */ > +/* { dg-final { scan-assembler-times "\tvpcmpeqw\[^\n\r]*xmm" 2 } } */ > +/* { dg-final { scan-assembler-times "\tvpcmpeqb\[^\n\r]*ymm" 2 } } */ > +/* { dg-final { scan-assembler-times "\tvpcmpeqw\[^\n\r]*ymm" 2 } } */ > +/* { dg-final { scan-assembler-not "\tvpsubus\[bw]" } } */ > + > +#include <x86intrin.h> > + > +__m128i > +f1 (__m128i x, __m128i y) > +{ > + return _mm_cmpeq_epi16 (_mm_subs_epu16 (x, y), _mm_setzero_si128 ()); > +} > + > +__m128i > +f2 (__m128i x, __m128i y) > +{ > + return _mm_cmpeq_epi16 (_mm_min_epu16 (x, y), x); > +} > + > +__m128i > +f3 (__m128i x, __m128i y) > +{ > + return _mm_cmpeq_epi8 (_mm_subs_epu8 (x, y), _mm_setzero_si128 ()); > +} > + > +__m128i > +f4 (__m128i x, __m128i y) > +{ > + return _mm_cmpeq_epi8 (_mm_min_epu8 (x, y), x); > +} > + > +__m256i > +f5 (__m256i x, __m256i y) > +{ > + return _mm256_cmpeq_epi16 (_mm256_subs_epu16 (x, y), _mm256_setzero_si256 > ()); > +} > + > +__m256i > +f6 (__m256i x, __m256i y) > +{ > + return _mm256_cmpeq_epi16 (_mm256_min_epu16 (x, y), x); > +} > + > +__m256i > +f7 (__m256i x, __m256i y) > +{ > + return _mm256_cmpeq_epi8 (_mm256_subs_epu8 (x, y), _mm256_setzero_si256 > ()); > +} > + > +__m256i > +f8 (__m256i x, __m256i y) > +{ > + return _mm256_cmpeq_epi8 (_mm256_min_epu8 (x, y), x); > +} > > Jakub >