On Tue, Jul 20, 2021 at 2:33 PM liuhongt <hongtao....@intel.com> wrote:
>
> Hi:
>   As mention in 
> https://gcc.gnu.org/pipermail/gcc-patches/2021-July/575420.html
>
> ----cut start-----
> > note for the lowpart we can just view-convert away the excess bits,
> > fully re-using the mask.  We generate surprisingly "good" code:
> >
> >         kmovb   %k1, %edi
> >         shrb    $4, %dil
> >         kmovb   %edi, %k2
> >
> > besides the lack of using kshiftrb.  I guess we're just lacking
> > a mask register alternative for
> Yes, we can do it similar as kor/kand/kxor.
> ---cut end--------
>
>   Bootstrap and regtested on x86_64-linux-gnu{-m32,}.
>   Ok for trunk?
>
> gcc/ChangeLog:
>
>         * config/i386/constraints.md (Wb): New constraint.
>         (Ww): Ditto.
>         * config/i386/i386.md (*ashlhi3_1): Extend to avx512 mask
>         shift.
>         (*ashlqi3_1): Ditto.
>         (*<insn><mode>3_1): Ditto.
>         (*<insn><mode>3_1): Ditto.
>         * config/i386/sse.md (k<code><mode>): New define_split after
>         it to convert generic shift pattern to mask shift ones.
>
> gcc/testsuite/ChangeLog:
>
>         * gcc.target/i386/mask-shift.c: New test.
> ---
>  gcc/config/i386/constraints.md             | 10 +++
>  gcc/config/i386/i386.md                    | 94 +++++++++++++++-------
>  gcc/config/i386/sse.md                     | 14 ++++
>  gcc/testsuite/gcc.target/i386/mask-shift.c | 83 +++++++++++++++++++
>  4 files changed, 173 insertions(+), 28 deletions(-)
>  create mode 100644 gcc/testsuite/gcc.target/i386/mask-shift.c
>
> diff --git a/gcc/config/i386/constraints.md b/gcc/config/i386/constraints.md
> index 485e3f5b2cf..4aa28a5621c 100644
> --- a/gcc/config/i386/constraints.md
> +++ b/gcc/config/i386/constraints.md
> @@ -222,6 +222,16 @@ (define_constraint "BC"
>             (match_operand 0 "vector_all_ones_operand"))))
>
>  ;; Integer constant constraints.
> +(define_constraint "Wb"
> +  "Integer constant in the range 0 @dots{} 7, for 8-bit shifts."
> +  (and (match_code "const_int")
> +       (match_test "IN_RANGE (ival, 0, 7)")))
> +
> +(define_constraint "Ww"
> +  "Integer constant in the range 0 @dots{} 15, for 16-bit shifts."
> +  (and (match_code "const_int")
> +       (match_test "IN_RANGE (ival, 0, 15)")))
> +
>  (define_constraint "I"
>    "Integer constant in the range 0 @dots{} 31, for 32-bit shifts."
>    (and (match_code "const_int")
> diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md
> index 8b809c49fe0..c5f9bd4d4d8 100644
> --- a/gcc/config/i386/i386.md
> +++ b/gcc/config/i386/i386.md
> @@ -1136,6 +1136,7 @@ (define_mode_attr di [(SI "nF") (DI "Wd")])
>
>  ;; Immediate operand constraint for shifts.
>  (define_mode_attr S [(QI "I") (HI "I") (SI "I") (DI "J") (TI "O")])
> +(define_mode_attr KS [(QI "Wb") (HI "Ww") (SI "I") (DI "J")])
>
>  ;; Print register name in the specified mode.
>  (define_mode_attr k [(QI "b") (HI "w") (SI "k") (DI "q")])
> @@ -11088,9 +11089,9 @@ (define_insn "*bmi2_ashl<mode>3_1"
>     (set_attr "mode" "<MODE>")])
>
>  (define_insn "*ashl<mode>3_1"
> -  [(set (match_operand:SWI48 0 "nonimmediate_operand" "=rm,r,r")
> -       (ashift:SWI48 (match_operand:SWI48 1 "nonimmediate_operand" "0,l,rm")
> -                     (match_operand:QI 2 "nonmemory_operand" "c<S>,M,r")))
> +  [(set (match_operand:SWI48 0 "nonimmediate_operand" "=rm,r,r,?k")
> +       (ashift:SWI48 (match_operand:SWI48 1 "nonimmediate_operand" 
> "0,l,rm,k")
> +                     (match_operand:QI 2 "nonmemory_operand" 
> "c<S>,M,r,<KS>")))
>     (clobber (reg:CC FLAGS_REG))]
>    "ix86_binary_operator_ok (ASHIFT, <MODE>mode, operands)"
>  {
> @@ -11098,6 +11099,7 @@ (define_insn "*ashl<mode>3_1"
>      {
>      case TYPE_LEA:
>      case TYPE_ISHIFTX:
> +    case TYPE_MSKLOG:
>        return "#";
>
>      case TYPE_ALU:
> @@ -11113,7 +11115,11 @@ (define_insn "*ashl<mode>3_1"
>         return "sal{<imodesuffix>}\t{%2, %0|%0, %2}";
>      }
>  }
> -  [(set_attr "isa" "*,*,bmi2")
> +  [(set_attr "isa" "*,*,bmi2,avx512bw")
>     (set (attr "type")
>       (cond [(eq_attr "alternative" "1")
>               (const_string "lea")
> @@ -11123,6 +11129,8 @@ (define_insn "*ashl<mode>3_1"
>                       (match_operand 0 "register_operand"))
>                  (match_operand 2 "const1_operand"))
>               (const_string "alu")
> +           (eq_attr "alternative" "3")
> +             (const_string "msklog")
>            ]
>            (const_string "ishift")))
>     (set (attr "length_immediate")
> @@ -11218,15 +11226,16 @@ (define_split
>    "operands[2] = gen_lowpart (SImode, operands[2]);")
>
>  (define_insn "*ashlhi3_1"
> -  [(set (match_operand:HI 0 "nonimmediate_operand" "=rm,Yp")
> -       (ashift:HI (match_operand:HI 1 "nonimmediate_operand" "0,l")
> -                  (match_operand:QI 2 "nonmemory_operand" "cI,M")))
> +  [(set (match_operand:HI 0 "nonimmediate_operand" "=rm,Yp,?k")
> +       (ashift:HI (match_operand:HI 1 "nonimmediate_operand" "0,l,k")
> +                  (match_operand:QI 2 "nonmemory_operand" "cI,M,Ww")))
>     (clobber (reg:CC FLAGS_REG))]
>    "ix86_binary_operator_ok (ASHIFT, HImode, operands)"
>  {
>    switch (get_attr_type (insn))
>      {
>      case TYPE_LEA:
> +    case TYPE_MSKLOG:
>        return "#";
>
>      case TYPE_ALU:
> @@ -11241,9 +11246,12 @@ (define_insn "*ashlhi3_1"
>         return "sal{w}\t{%2, %0|%0, %2}";
>      }
>  }
> -  [(set (attr "type")
> +  [(set_attr "isa" "*,*,avx512f")
> +   (set (attr "type")
>       (cond [(eq_attr "alternative" "1")
>               (const_string "lea")
> +           (eq_attr "alternative" "2")
> +             (const_string "msklog")
>              (and (and (match_test "TARGET_DOUBLE_WITH_ADD")
>                       (match_operand 0 "register_operand"))
>                  (match_operand 2 "const1_operand"))
> @@ -11259,18 +11270,19 @@ (define_insn "*ashlhi3_1"
>                            (match_test "optimize_function_for_size_p 
> (cfun)")))))
>         (const_string "0")
>         (const_string "*")))
> -   (set_attr "mode" "HI,SI")])
> +   (set_attr "mode" "HI,SI,HI")])
>
>  (define_insn "*ashlqi3_1"
> -  [(set (match_operand:QI 0 "nonimmediate_operand" "=qm,r,Yp")
> -       (ashift:QI (match_operand:QI 1 "nonimmediate_operand" "0,0,l")
> -                  (match_operand:QI 2 "nonmemory_operand" "cI,cI,M")))
> +  [(set (match_operand:QI 0 "nonimmediate_operand" "=qm,r,Yp,?k")
> +       (ashift:QI (match_operand:QI 1 "nonimmediate_operand" "0,0,l,k")
> +                  (match_operand:QI 2 "nonmemory_operand" "cI,cI,M,Wb")))
>     (clobber (reg:CC FLAGS_REG))]
>    "ix86_binary_operator_ok (ASHIFT, QImode, operands)"
>  {
>    switch (get_attr_type (insn))
>      {
>      case TYPE_LEA:
> +    case TYPE_MSKLOG:
>        return "#";
>
>      case TYPE_ALU:
> @@ -11298,9 +11307,12 @@ (define_insn "*ashlqi3_1"
>         }
>      }
>  }
> -  [(set (attr "type")
> +  [(set_attr "isa" "*,*,*,avx512dq")
> +   (set (attr "type")
>       (cond [(eq_attr "alternative" "2")
>               (const_string "lea")
> +           (eq_attr "alternative" "3")
> +             (const_string "msklog")
>              (and (and (match_test "TARGET_DOUBLE_WITH_ADD")
>                       (match_operand 0 "register_operand"))
>                  (match_operand 2 "const1_operand"))
> @@ -11316,7 +11334,7 @@ (define_insn "*ashlqi3_1"
>                            (match_test "optimize_function_for_size_p 
> (cfun)")))))
>         (const_string "0")
>         (const_string "*")))
> -   (set_attr "mode" "QI,SI,SI")
> +   (set_attr "mode" "QI,SI,SI,QI")
>     ;; Potential partial reg stall on alternative 1.
>     (set (attr "preferred_for_speed")
>       (cond [(eq_attr "alternative" "1")
> @@ -11819,16 +11837,17 @@ (define_insn "*bmi2_<insn><mode>3_1"
>     (set_attr "mode" "<MODE>")])
>
>  (define_insn "*<insn><mode>3_1"
> -  [(set (match_operand:SWI48 0 "nonimmediate_operand" "=rm,r")
> +  [(set (match_operand:SWI48 0 "nonimmediate_operand" "=rm,r,?k")
>         (any_shiftrt:SWI48
> -         (match_operand:SWI48 1 "nonimmediate_operand" "0,rm")
> -         (match_operand:QI 2 "nonmemory_operand" "c<S>,r")))
> +         (match_operand:SWI48 1 "nonimmediate_operand" "0,rm,k")
> +         (match_operand:QI 2 "nonmemory_operand" "c<S>,r,<KS>")))
>     (clobber (reg:CC FLAGS_REG))]
>    "ix86_binary_operator_ok (<CODE>, <MODE>mode, operands)"
>  {
>    switch (get_attr_type (insn))
>      {
>      case TYPE_ISHIFTX:
> +    case TYPE_MSKLOG:
>        return "#";
>
>      default:
> @@ -11839,11 +11858,16 @@ (define_insn "*<insn><mode>3_1"
>         return "<shift>{<imodesuffix>}\t{%2, %0|%0, %2}";
>      }
>  }
> -  [(set_attr "isa" "*,bmi2")
> -   (set_attr "type" "ishift,ishiftx")
> +  [(set_attr "isa" "*,bmi2,avx512bw")
> +   (set_attr "type" "ishift,ishiftx,msklog")
> +   (set (attr "enabled")
> +       (if_then_else (eq_attr "alternative" "2")
> +         (symbol_ref "<CODE> == LSHIFTRT && TARGET_AVX512BW")

Please rather split the pattern to ASHIFTRT and LSHIFTRT. The
macroization has no point if we need to use enabled attribute in this
way.

> +         (const_string "*")))
>     (set (attr "length_immediate")
>       (if_then_else
> -       (and (match_operand 2 "const1_operand")
> +       (and (and (match_operand 2 "const1_operand")
> +                (eq_attr "alternative" "0"))
>             (ior (match_test "TARGET_SHIFT1")
>                  (match_test "optimize_function_for_size_p (cfun)")))
>         (const_string "0")
> @@ -11916,27 +11940,41 @@ (define_split
>    "operands[2] = gen_lowpart (SImode, operands[2]);")
>
>  (define_insn "*<insn><mode>3_1"
> -  [(set (match_operand:SWI12 0 "nonimmediate_operand" "=<r>m")
> +  [(set (match_operand:SWI12 0 "nonimmediate_operand" "=<r>m, ?k")
>         (any_shiftrt:SWI12
> -         (match_operand:SWI12 1 "nonimmediate_operand" "0")
> -         (match_operand:QI 2 "nonmemory_operand" "c<S>")))
> +         (match_operand:SWI12 1 "nonimmediate_operand" "0, k")
> +         (match_operand:QI 2 "nonmemory_operand" "c<S>, <KS>")))
>     (clobber (reg:CC FLAGS_REG))]
>    "ix86_binary_operator_ok (<CODE>, <MODE>mode, operands)"
>  {
> -  if (operands[2] == const1_rtx
> -      && (TARGET_SHIFT1 || optimize_function_for_size_p (cfun)))
> -    return "<shift>{<imodesuffix>}\t%0";
> -  else
> -    return "<shift>{<imodesuffix>}\t{%2, %0|%0, %2}";
> +  switch (get_attr_type (insn))
> +    {
> +    case TYPE_ISHIFT:
> +      if (operands[2] == const1_rtx
> +         && (TARGET_SHIFT1 || optimize_function_for_size_p (cfun)))
> +       return "<shift>{<imodesuffix>}\t%0";
> +      else
> +       return "<shift>{<imodesuffix>}\t{%2, %0|%0, %2}";
> +    case TYPE_MSKLOG:
> +      return "#";
> +    default:
> +      gcc_unreachable ();
> +    }
>  }
> -  [(set_attr "type" "ishift")
> +  [(set_attr "type" "ishift,msklog")
>     (set (attr "length_immediate")
>       (if_then_else
> -       (and (match_operand 2 "const1_operand")
> +       (and (and (match_operand 2 "const1_operand")
> +                (eq_attr "alternative" "0"))
>             (ior (match_test "TARGET_SHIFT1")
>                  (match_test "optimize_function_for_size_p (cfun)")))
>         (const_string "0")
>         (const_string "*")))
> +   (set (attr "enabled")
> +       (if_then_else (eq_attr "alternative" "1")
> +         (symbol_ref "<CODE> == LSHIFTRT && TARGET_AVX512F
> +                      && (<MODE>mode != QImode || TARGET_AVX512DQ)")

Also here, please split out LSHIFTRT and perhaps use conditional
constraint to avoid enabled attribute.

Uros.

> +         (const_string "*")))
>     (set_attr "mode" "<MODE>")])
>
>  (define_insn "*<insn><mode>3_1_slp"
> diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
> index ab29999023d..f8759e4d758 100644
> --- a/gcc/config/i386/sse.md
> +++ b/gcc/config/i386/sse.md
> @@ -1755,6 +1755,20 @@ (define_insn "k<code><mode>"
>     (set_attr "prefix" "vex")
>     (set_attr "mode" "<MODE>")])
>
> +(define_split
> +  [(set (match_operand:SWI1248_AVX512BW 0 "mask_reg_operand")
> +       (any_lshift:SWI1248_AVX512BW
> +         (match_operand:SWI1248_AVX512BW 1 "mask_reg_operand")
> +         (match_operand 2 "const_int_operand")))
> +   (clobber (reg:CC FLAGS_REG))]
> +  "TARGET_AVX512F && reload_completed"
> +  [(parallel
> +     [(set (match_dup 0)
> +          (any_lshift:SWI1248_AVX512BW
> +            (match_dup 1)
> +            (match_dup 2)))
> +      (unspec [(const_int 0)] UNSPEC_MASKOP)])])
> +
>  (define_insn "ktest<mode>"
>    [(set (reg:CC FLAGS_REG)
>         (unspec:CC
> diff --git a/gcc/testsuite/gcc.target/i386/mask-shift.c 
> b/gcc/testsuite/gcc.target/i386/mask-shift.c
> new file mode 100644
> index 00000000000..4cb6ef37821
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/mask-shift.c
> @@ -0,0 +1,83 @@
> +/* { dg-do compile } */
> +/* { dg-options "-mavx512bw -mavx512dq -O2" } */
> +
> +#include<immintrin.h>
> +void
> +fooq (__m512i a, __m512i b, void* p)
> +{
> +  __mmask8 m1 = _mm512_cmpeq_epi64_mask (a, b);
> +  m1 >>= 4;
> +  _mm512_mask_storeu_epi64 (p, m1, a);
> +}
> +
> +/* { dg-final { scan-assembler-times {(?n)kshiftrb} "1" } }  */
> +
> +void
> +food (__m512i a, __m512i b, void* p)
> +{
> +  __mmask16 m1 = _mm512_cmpeq_epi32_mask (a, b);
> +  m1 >>= 8;
> +  _mm512_mask_storeu_epi32 (p, m1, a);
> +}
> +
> +/* { dg-final { scan-assembler-times {(?n)kshiftrw} "1" } }  */
> +
> +void
> +foow (__m512i a, __m512i b, void* p)
> +{
> +  __mmask32 m1 = _mm512_cmpeq_epi16_mask (a, b);
> +  m1 >>= 16;
> +  _mm512_mask_storeu_epi16 (p, m1, a);
> +}
> +
> +/* { dg-final { scan-assembler-times {(?n)kshiftrd} "1" } }  */
> +
> +void
> +foob (__m512i a, __m512i b, void* p)
> +{
> +  __mmask64 m1 = _mm512_cmpeq_epi8_mask (a, b);
> +  m1 >>= 32;
> +  _mm512_mask_storeu_epi8 (p, m1, a);
> +}
> +
> +/* { dg-final { scan-assembler-times {(?n)kshiftrq} "1" { target { ! ia32 } 
> } } }  */
> +
> +void
> +fooq1 (__m512i a, __m512i b, void* p)
> +{
> +  __mmask8 m1 = _mm512_cmpeq_epi64_mask (a, b);
> +  m1 <<= 4;
> +  _mm512_mask_storeu_epi64 (p, m1, a);
> +}
> +
> +/* { dg-final { scan-assembler-times {(?n)kshiftlb} "1" } }  */
> +
> +void
> +food1 (__m512i a, __m512i b, void* p)
> +{
> +  __mmask16 m1 = _mm512_cmpeq_epi32_mask (a, b);
> +  m1 <<= 8;
> +  _mm512_mask_storeu_epi32 (p, m1, a);
> +}
> +
> +/* { dg-final { scan-assembler-times {(?n)kshiftlw} "1" } }  */
> +
> +void
> +foow1 (__m512i a, __m512i b, void* p)
> +{
> +  __mmask32 m1 = _mm512_cmpeq_epi16_mask (a, b);
> +  m1 <<= 16;
> +  _mm512_mask_storeu_epi16 (p, m1, a);
> +}
> +
> +/* { dg-final { scan-assembler-times {(?n)kshiftld} "1" } }  */
> +
> +void
> +foob1 (__m512i a, __m512i b, void* p)
> +{
> +  __mmask64 m1 = _mm512_cmpeq_epi8_mask (a, b);
> +  m1 <<= 32;
> +  _mm512_mask_storeu_epi8 (p, m1, a);
> +}
> +
> +/* { dg-final { scan-assembler-times {(?n)kshiftlq} "1" { target { ! ia32 } 
> } } }  */
> --
> 2.18.1
>

Reply via email to