On Wed, Nov 25, 2020 at 11:34 AM Jakub Jelinek <ja...@redhat.com> wrote:
>
> Hi!
>
> The following patch renames VI12_AVX2 iterator to VI12_AVX2_AVX512BW
> for consistency with some other iterators, as I need VI12_AVX2 without
> AVX512BW for this change.
> The real meat is a pre-reload define_insn_and_split which combine
> can use to optimize psubusw compared to 0 into pminuw compared to op0
> (and similarly for psubusb compared to 0 into pminub compared to op0).
> According to Agner Fog's tables, psubus[bw] and pminu[bw] timings
> are the same, but the advantage of pminu[bw] is that the comparison
> doesn't need a zero operand, so e.g. for -msse4.1 it causes changes like
> -       psubusw %xmm1, %xmm0
> -       pxor    %xmm1, %xmm1
> +       pminuw  %xmm0, %xmm1
>         pcmpeqw %xmm1, %xmm0
> and similarly for avx2:
> -       vpsubusb        %ymm1, %ymm0, %ymm0
> -       vpxor   %xmm1, %xmm1, %xmm1
> -       vpcmpeqb        %ymm1, %ymm0, %ymm0
> +       vpminub %ymm1, %ymm0, %ymm1
> +       vpcmpeqb        %ymm0, %ymm1, %ymm0
>
> I haven't done the AVX512{BW,VL} define_insn_and_split, they'll need
> to match the UNSPEC_PCMP which are used for avx512 comparisons.
>
> Bootstrapped/regtested on x86_64-linux and i686-linux, ok for trunk?

I *think* this could be done with a combine splitter. The benefit of
using otherwise extremely picky combine splitter (it also doesn't
report why it can't split a combined insn) is that the split insn can
be used in follow-up combine attempts.

Uros.

> 2020-11-25  Jakub Jelinek  <ja...@redhat.com>
>
>         PR target/96906
>         * config/i386/sse.md (VI12_AVX2): Remove V64QI/V32HI modes.
>         (VI12_AVX2_AVX512BW): New mode iterator.
>         (<sse2_avx2>_<plusminus_insn><mode>3<mask_name>,
>         *<sse2_avx2>_<plusminus_insn><mode>3<mask_name>,
>         uavg<mode>3_ceil, <sse2_avx2>_uavg<mode>3<mask_name>,
>         *<sse2_avx2>_uavg<mode>3<mask_name>): Use VI12_AVX2_AVX512BW
>         iterator instead of VI12_AVX2.
>         (*<sse2_avx2>_ussub<mode>3_eq0): New define_insn_and_split.
>
>         * gcc.target/i386/pr96906-1.c: New test.
>
> --- gcc/config/i386/sse.md.jj   2020-11-24 20:20:06.481079501 +0100
> +++ gcc/config/i386/sse.md      2020-11-25 11:22:25.565877452 +0100
> @@ -466,6 +466,10 @@ (define_mode_iterator SSESCALARMODE
>    [(V4TI "TARGET_AVX512BW") (V2TI "TARGET_AVX2") TI])
>
>  (define_mode_iterator VI12_AVX2
> +  [(V32QI "TARGET_AVX2") V16QI
> +   (V16HI "TARGET_AVX2") V8HI])
> +
> +(define_mode_iterator VI12_AVX2_AVX512BW
>    [(V64QI "TARGET_AVX512BW") (V32QI "TARGET_AVX2") V16QI
>     (V32HI "TARGET_AVX512BW") (V16HI "TARGET_AVX2") V8HI])
>
> @@ -11395,18 +11399,18 @@ (define_insn "*<plusminus_insn><mode>3_m
>     (set_attr "mode" "<sseinsnmode>")])
>
>  (define_expand "<sse2_avx2>_<plusminus_insn><mode>3<mask_name>"
> -  [(set (match_operand:VI12_AVX2 0 "register_operand")
> -       (sat_plusminus:VI12_AVX2
> -         (match_operand:VI12_AVX2 1 "vector_operand")
> -         (match_operand:VI12_AVX2 2 "vector_operand")))]
> +  [(set (match_operand:VI12_AVX2_AVX512BW 0 "register_operand")
> +       (sat_plusminus:VI12_AVX2_AVX512BW
> +         (match_operand:VI12_AVX2_AVX512BW 1 "vector_operand")
> +         (match_operand:VI12_AVX2_AVX512BW 2 "vector_operand")))]
>    "TARGET_SSE2 && <mask_mode512bit_condition> && <mask_avx512bw_condition>"
>    "ix86_fixup_binary_operands_no_copy (<CODE>, <MODE>mode, operands);")
>
>  (define_insn "*<sse2_avx2>_<plusminus_insn><mode>3<mask_name>"
> -  [(set (match_operand:VI12_AVX2 0 "register_operand" "=x,v")
> -       (sat_plusminus:VI12_AVX2
> -         (match_operand:VI12_AVX2 1 "vector_operand" "<comm>0,v")
> -         (match_operand:VI12_AVX2 2 "vector_operand" "xBm,vm")))]
> +  [(set (match_operand:VI12_AVX2_AVX512BW 0 "register_operand" "=x,v")
> +       (sat_plusminus:VI12_AVX2_AVX512BW
> +         (match_operand:VI12_AVX2_AVX512BW 1 "vector_operand" "<comm>0,v")
> +         (match_operand:VI12_AVX2_AVX512BW 2 "vector_operand" "xBm,vm")))]
>    "TARGET_SSE2 && <mask_mode512bit_condition> && <mask_avx512bw_condition>
>     && ix86_binary_operator_ok (<CODE>, <MODE>mode, operands)"
>    "@
> @@ -11418,6 +11422,26 @@ (define_insn "*<sse2_avx2>_<plusminus_in
>     (set_attr "prefix" "orig,maybe_evex")
>     (set_attr "mode" "TI")])
>
> +;; PR96906 - optimize psubusw compared to 0 into pminuw compared to op0.
> +(define_insn_and_split "*<sse2_avx2>_ussub<mode>3_eq0"
> +  [(set (match_operand:VI12_AVX2 0 "register_operand" "=x,x")
> +       (eq:VI12_AVX2
> +         (us_minus:VI12_AVX2
> +           (match_operand:VI12_AVX2 1 "vector_operand" "0,v")
> +           (match_operand:VI12_AVX2 2 "vector_operand" "xBm,vm"))
> +         (match_operand:VI12_AVX2 3 "const0_operand" "C,C")))]
> +  "TARGET_SSE2
> +   && (<MODE>mode != V8HImode || TARGET_SSE4_1)
> +   && ix86_pre_reload_split ()
> +   && ix86_binary_operator_ok (US_MINUS, <MODE>mode, operands)"
> +  "#"
> +  "&& 1"
> +  [(set (match_dup 4)
> +        (umin:VI12_AVX2 (match_dup 1) (match_dup 2)))
> +   (set (match_dup 0)
> +       (eq:VI12_AVX2 (match_dup 4) (match_dup 1)))]
> +  "operands[4] = gen_reg_rtx (<MODE>mode);")
> +
>  (define_expand "mulv8qi3"
>    [(set (match_operand:V8QI 0 "register_operand")
>         (mult:V8QI (match_operand:V8QI 1 "register_operand")
> @@ -12022,15 +12046,15 @@ (define_expand "sdot_prodv4si"
>  })
>
>  (define_expand "uavg<mode>3_ceil"
> -  [(set (match_operand:VI12_AVX2 0 "register_operand")
> -       (truncate:VI12_AVX2
> +  [(set (match_operand:VI12_AVX2_AVX512BW 0 "register_operand")
> +       (truncate:VI12_AVX2_AVX512BW
>           (lshiftrt:<ssedoublemode>
>             (plus:<ssedoublemode>
>               (plus:<ssedoublemode>
>                 (zero_extend:<ssedoublemode>
> -                 (match_operand:VI12_AVX2 1 "vector_operand"))
> +                 (match_operand:VI12_AVX2_AVX512BW 1 "vector_operand"))
>                 (zero_extend:<ssedoublemode>
> -                 (match_operand:VI12_AVX2 2 "vector_operand")))
> +                 (match_operand:VI12_AVX2_AVX512BW 2 "vector_operand")))
>               (match_dup 3))
>             (const_int 1))))]
>    "TARGET_SSE2"
> @@ -15744,15 +15768,15 @@ (define_expand "vec_unpacks_hi_<mode>"
>  ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
>
>  (define_expand "<sse2_avx2>_uavg<mode>3<mask_name>"
> -  [(set (match_operand:VI12_AVX2 0 "register_operand")
> -       (truncate:VI12_AVX2
> +  [(set (match_operand:VI12_AVX2_AVX512BW 0 "register_operand")
> +       (truncate:VI12_AVX2_AVX512BW
>           (lshiftrt:<ssedoublemode>
>             (plus:<ssedoublemode>
>               (plus:<ssedoublemode>
>                 (zero_extend:<ssedoublemode>
> -                 (match_operand:VI12_AVX2 1 "vector_operand"))
> +                 (match_operand:VI12_AVX2_AVX512BW 1 "vector_operand"))
>                 (zero_extend:<ssedoublemode>
> -                 (match_operand:VI12_AVX2 2 "vector_operand")))
> +                 (match_operand:VI12_AVX2_AVX512BW 2 "vector_operand")))
>               (match_dup <mask_expand_op3>))
>             (const_int 1))))]
>    "TARGET_SSE2 && <mask_mode512bit_condition> && <mask_avx512bw_condition>"
> @@ -15762,15 +15786,15 @@ (define_expand "<sse2_avx2>_uavg<mode>3<
>  })
>
>  (define_insn "*<sse2_avx2>_uavg<mode>3<mask_name>"
> -  [(set (match_operand:VI12_AVX2 0 "register_operand" "=x,v")
> -       (truncate:VI12_AVX2
> +  [(set (match_operand:VI12_AVX2_AVX512BW 0 "register_operand" "=x,v")
> +       (truncate:VI12_AVX2_AVX512BW
>           (lshiftrt:<ssedoublemode>
>             (plus:<ssedoublemode>
>               (plus:<ssedoublemode>
>                 (zero_extend:<ssedoublemode>
> -                 (match_operand:VI12_AVX2 1 "vector_operand" "%0,v"))
> +                 (match_operand:VI12_AVX2_AVX512BW 1 "vector_operand" 
> "%0,v"))
>                 (zero_extend:<ssedoublemode>
> -                 (match_operand:VI12_AVX2 2 "vector_operand" "xBm,vm")))
> +                 (match_operand:VI12_AVX2_AVX512BW 2 "vector_operand" 
> "xBm,vm")))
>               (match_operand:<ssedoublemode> <mask_expand_op3> 
> "const1_operand"))
>             (const_int 1))))]
>    "TARGET_SSE2 && <mask_mode512bit_condition> && <mask_avx512bw_condition>
> --- gcc/testsuite/gcc.target/i386/pr96906-1.c.jj        2020-11-25 
> 11:13:16.995100955 +0100
> +++ gcc/testsuite/gcc.target/i386/pr96906-1.c   2020-11-25 11:13:16.995100955 
> +0100
> @@ -0,0 +1,62 @@
> +/* PR target/96906 */
> +/* { dg-do compile } */
> +/* { dg-options "-O2 -mavx2" } */
> +/* { dg-final { scan-assembler-times "\tvpminub\[^\n\r]*xmm" 2 } } */
> +/* { dg-final { scan-assembler-times "\tvpminuw\[^\n\r]*xmm" 2 } } */
> +/* { dg-final { scan-assembler-times "\tvpminub\[^\n\r]*ymm" 2 } } */
> +/* { dg-final { scan-assembler-times "\tvpminuw\[^\n\r]*ymm" 2 } } */
> +/* { dg-final { scan-assembler-times "\tvpcmpeqb\[^\n\r]*xmm" 2 } } */
> +/* { dg-final { scan-assembler-times "\tvpcmpeqw\[^\n\r]*xmm" 2 } } */
> +/* { dg-final { scan-assembler-times "\tvpcmpeqb\[^\n\r]*ymm" 2 } } */
> +/* { dg-final { scan-assembler-times "\tvpcmpeqw\[^\n\r]*ymm" 2 } } */
> +/* { dg-final { scan-assembler-not "\tvpsubus\[bw]" } } */
> +
> +#include <x86intrin.h>
> +
> +__m128i
> +f1 (__m128i x, __m128i y)
> +{
> +  return _mm_cmpeq_epi16 (_mm_subs_epu16 (x, y), _mm_setzero_si128 ());
> +}
> +
> +__m128i
> +f2 (__m128i x, __m128i y)
> +{
> +  return _mm_cmpeq_epi16 (_mm_min_epu16 (x, y), x);
> +}
> +
> +__m128i
> +f3 (__m128i x, __m128i y)
> +{
> +  return _mm_cmpeq_epi8 (_mm_subs_epu8 (x, y), _mm_setzero_si128 ());
> +}
> +
> +__m128i
> +f4 (__m128i x, __m128i y)
> +{
> +  return _mm_cmpeq_epi8 (_mm_min_epu8 (x, y), x);
> +}
> +
> +__m256i
> +f5 (__m256i x, __m256i y)
> +{
> +  return _mm256_cmpeq_epi16 (_mm256_subs_epu16 (x, y), _mm256_setzero_si256 
> ());
> +}
> +
> +__m256i
> +f6 (__m256i x, __m256i y)
> +{
> +  return _mm256_cmpeq_epi16 (_mm256_min_epu16 (x, y), x);
> +}
> +
> +__m256i
> +f7 (__m256i x, __m256i y)
> +{
> +  return _mm256_cmpeq_epi8 (_mm256_subs_epu8 (x, y), _mm256_setzero_si256 
> ());
> +}
> +
> +__m256i
> +f8 (__m256i x, __m256i y)
> +{
> +  return _mm256_cmpeq_epi8 (_mm256_min_epu8 (x, y), x);
> +}
>
>         Jakub
>

Reply via email to