vmovsh.

Hongtao Liu via Gcc-patches Wed, 15 Sep 2021 22:03:15 -0700

I'm going to check in the following 6 patches:

[PATCH 24/62] AVX512FP16: Add vmovw/vmovsh.
[PATCH 25/62] AVX512FP16: Add testcase for vmovsh/vmovw.
[PATCH 26/62] AVX512FP16: Add
vcvtph2dq/vcvtph2qq/vcvtph2w/vcvtph2uw/vcvtph2uqq/vcvtph2udq
[PATCH 27/62] AVX512FP16: Add testcase for
vcvtph2w/vcvtph2uw/vcvtph2dq/vcvtph2udq/vcvtph2qq/vcvtph2uqq.
[PATCH 28/62] AVX512FP16: Add
vcvtuw2ph/vcvtw2ph/vcvtdq2ph/vcvtudq2ph/vcvtqq2ph/vcvtuqq2ph
[PATCH 29/62] AVX512FP16: Add testcase for
vcvtw2ph/vcvtuw2ph/vcvtdq2ph/vcvtudq2ph/vcvtqq2ph/vcvtuqq2ph.


  Bootstrapped and regtested on x86_64-linux-gnu{-m32,}.
  The newly added runtime testcases passed on SPR.

On Thu, Jul 1, 2021 at 2:17 PM liuhongt <hongtao....@intel.com> wrote:
>
> gcc/ChangeLog:
>
>         * config/i386/avx512fp16intrin.h: (_mm_cvtsi16_si128):
>         New intrinsic.
>         (_mm_cvtsi128_si16): Likewise.
>         (_mm_mask_load_sh): Likewise.
>         (_mm_maskz_load_sh): Likewise.
>         (_mm_mask_store_sh): Likewise.
>         (_mm_move_sh): Likewise.
>         (_mm_mask_move_sh): Likewise.
>         (_mm_maskz_move_sh): Likewise.
>         * config/i386/i386-builtin-types.def: Add corresponding builtin types.
>         * config/i386/i386-builtin.def: Add corresponding new builtins.
>         * config/i386/i386-expand.c
>         (ix86_expand_special_args_builtin): Handle new builtin types.
>         (ix86_expand_vector_init_one_nonzero): Adjust for FP16 target.
>         * config/i386/sse.md (VI2F): New mode iterator.
>         (vec_set<mode>_0): Use new mode iterator.
>         (avx512f_mov<ssescalarmodelower>_mask): Adjust for HF vector mode.
>         (avx512f_store<mode>_mask): Ditto.
> ---
>  gcc/config/i386/avx512fp16intrin.h     | 59 ++++++++++++++++++++++++++
>  gcc/config/i386/i386-builtin-types.def |  3 ++
>  gcc/config/i386/i386-builtin.def       |  5 +++
>  gcc/config/i386/i386-expand.c          | 11 +++++
>  gcc/config/i386/sse.md                 | 33 +++++++-------
>  5 files changed, 95 insertions(+), 16 deletions(-)
>
> diff --git a/gcc/config/i386/avx512fp16intrin.h 
> b/gcc/config/i386/avx512fp16intrin.h
> index 2fbfc140c44..cdf6646c8c6 100644
> --- a/gcc/config/i386/avx512fp16intrin.h
> +++ b/gcc/config/i386/avx512fp16intrin.h
> @@ -2453,6 +2453,65 @@ _mm512_maskz_getmant_round_ph (__mmask32 __U, __m512h 
> __A,
>
>  #endif /* __OPTIMIZE__ */
>
> +/* Intrinsics vmovw.  */
> +extern __inline __m128i
> +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> +_mm_cvtsi16_si128 (short __A)
> +{
> +  return _mm_set_epi16 (0, 0, 0, 0, 0, 0, 0, __A);
> +}
> +
> +extern __inline short
> +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> +_mm_cvtsi128_si16 (__m128i __A)
> +{
> +  return __builtin_ia32_vec_ext_v8hi ((__v8hi)__A, 0);
> +}
> +
> +/* Intrinsics vmovsh.  */
> +extern __inline __m128h
> +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> +_mm_mask_load_sh (__m128h __A, __mmask8 __B, _Float16 const* __C)
> +{
> +  return __builtin_ia32_loadsh_mask (__C, __A, __B);
> +}
> +
> +extern __inline __m128h
> +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> +_mm_maskz_load_sh (__mmask8 __A, _Float16 const* __B)
> +{
> +  return __builtin_ia32_loadsh_mask (__B, _mm_setzero_ph (), __A);
> +}
> +
> +extern __inline void
> +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> +_mm_mask_store_sh (_Float16 const* __A, __mmask8 __B, __m128h __C)
> +{
> +  __builtin_ia32_storesh_mask (__A,  __C, __B);
> +}
> +
> +extern __inline __m128h
> +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> +_mm_move_sh (__m128h __A, __m128h  __B)
> +{
> +  __A[0] = __B[0];
> +  return __A;
> +}
> +
> +extern __inline __m128h
> +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> +_mm_mask_move_sh (__m128h __A, __mmask8 __B, __m128h  __C, __m128h __D)
> +{
> +  return __builtin_ia32_vmovsh_mask (__C, __D, __A, __B);
> +}
> +
> +extern __inline __m128h
> +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> +_mm_maskz_move_sh (__mmask8 __A, __m128h  __B, __m128h __C)
> +{
> +  return __builtin_ia32_vmovsh_mask (__B, __C, _mm_setzero_ph (), __A);
> +}
> +
>  #ifdef __DISABLE_AVX512FP16__
>  #undef __DISABLE_AVX512FP16__
>  #pragma GCC pop_options
> diff --git a/gcc/config/i386/i386-builtin-types.def 
> b/gcc/config/i386/i386-builtin-types.def
> index 79e7edf13e5..6cf3e354c78 100644
> --- a/gcc/config/i386/i386-builtin-types.def
> +++ b/gcc/config/i386/i386-builtin-types.def
> @@ -134,6 +134,7 @@ DEF_POINTER_TYPE (PCVOID, VOID, CONST)
>  DEF_POINTER_TYPE (PVOID, VOID)
>  DEF_POINTER_TYPE (PDOUBLE, DOUBLE)
>  DEF_POINTER_TYPE (PFLOAT, FLOAT)
> +DEF_POINTER_TYPE (PCFLOAT16, FLOAT16, CONST)
>  DEF_POINTER_TYPE (PSHORT, SHORT)
>  DEF_POINTER_TYPE (PUSHORT, USHORT)
>  DEF_POINTER_TYPE (PINT, INT)
> @@ -1308,6 +1309,8 @@ DEF_FUNCTION_TYPE (QI, V8HF, INT, UQI)
>  DEF_FUNCTION_TYPE (HI, V16HF, INT, UHI)
>  DEF_FUNCTION_TYPE (SI, V32HF, INT, USI)
>  DEF_FUNCTION_TYPE (V8HF, V8HF, V8HF)
> +DEF_FUNCTION_TYPE (VOID, PCFLOAT16, V8HF, UQI)
> +DEF_FUNCTION_TYPE (V8HF, PCFLOAT16, V8HF, UQI)
>  DEF_FUNCTION_TYPE (V8HF, V8HF, V8HF, UQI)
>  DEF_FUNCTION_TYPE (V8HF, V8HF, V8HF, INT)
>  DEF_FUNCTION_TYPE (V8HF, V8HF, INT, V8HF, UQI)
> diff --git a/gcc/config/i386/i386-builtin.def 
> b/gcc/config/i386/i386-builtin.def
> index ed1a4a38b1c..be617b8f18a 100644
> --- a/gcc/config/i386/i386-builtin.def
> +++ b/gcc/config/i386/i386-builtin.def
> @@ -393,6 +393,10 @@ BDESC (OPTION_MASK_ISA_AVX512BW, 0, 
> CODE_FOR_avx512bw_us_truncatev32hiv32qi2_mas
>  BDESC (OPTION_MASK_ISA_AVX512BW, 0, 
> CODE_FOR_avx512bw_ss_truncatev32hiv32qi2_mask_store, 
> "__builtin_ia32_pmovswb512mem_mask", IX86_BUILTIN_PMOVSWB512_MEM, UNKNOWN, 
> (int) VOID_FTYPE_PV32QI_V32HI_USI)
>  BDESC (OPTION_MASK_ISA_AVX512BW, 0, 
> CODE_FOR_avx512bw_truncatev32hiv32qi2_mask_store, 
> "__builtin_ia32_pmovwb512mem_mask", IX86_BUILTIN_PMOVWB512_MEM, UNKNOWN, 
> (int) VOID_FTYPE_PV32QI_V32HI_USI)
>
> +/* AVX512FP16 */
> +BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512f_loadhf_mask, 
> "__builtin_ia32_loadsh_mask", IX86_BUILTIN_LOADSH_MASK, UNKNOWN, (int) 
> V8HF_FTYPE_PCFLOAT16_V8HF_UQI)
> +BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512f_storehf_mask, 
> "__builtin_ia32_storesh_mask", IX86_BUILTIN_STORESH_MASK, UNKNOWN, (int) 
> VOID_FTYPE_PCFLOAT16_V8HF_UQI)
> +
>  /* RDPKRU and WRPKRU.  */
>  BDESC (OPTION_MASK_ISA_PKU, 0, CODE_FOR_rdpkru,  "__builtin_ia32_rdpkru", 
> IX86_BUILTIN_RDPKRU, UNKNOWN, (int) UNSIGNED_FTYPE_VOID)
>  BDESC (OPTION_MASK_ISA_PKU, 0, CODE_FOR_wrpkru,  "__builtin_ia32_wrpkru", 
> IX86_BUILTIN_WRPKRU, UNKNOWN, (int) VOID_FTYPE_UNSIGNED)
> @@ -2826,6 +2830,7 @@ BDESC (OPTION_MASK_ISA_AVX512VL, 
> OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512vl_
>  BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, 
> CODE_FOR_avx512fp16_getexpv8hf_mask, "__builtin_ia32_getexpph128_mask", 
> IX86_BUILTIN_GETEXPPH128, UNKNOWN, (int) V8HF_FTYPE_V8HF_V8HF_UQI)
>  BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, 
> CODE_FOR_avx512vl_getmantv16hf_mask, "__builtin_ia32_getmantph256_mask", 
> IX86_BUILTIN_GETMANTPH256, UNKNOWN, (int) V16HF_FTYPE_V16HF_INT_V16HF_UHI)
>  BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, 
> CODE_FOR_avx512fp16_getmantv8hf_mask, "__builtin_ia32_getmantph128_mask", 
> IX86_BUILTIN_GETMANTPH128, UNKNOWN, (int) V8HF_FTYPE_V8HF_INT_V8HF_UQI)
> +BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512f_movhf_mask, 
> "__builtin_ia32_vmovsh_mask", IX86_BUILTIN_VMOVSH_MASK, UNKNOWN, (int) 
> V8HF_FTYPE_V8HF_V8HF_V8HF_UQI)
>
>  /* Builtins with rounding support.  */
>  BDESC_END (ARGS, ROUND_ARGS)
> diff --git a/gcc/config/i386/i386-expand.c b/gcc/config/i386/i386-expand.c
> index 266aa411ddb..bfc7fc75b97 100644
> --- a/gcc/config/i386/i386-expand.c
> +++ b/gcc/config/i386/i386-expand.c
> @@ -10907,6 +10907,7 @@ ix86_expand_special_args_builtin (const struct 
> builtin_description *d,
>      case VOID_FTYPE_PFLOAT_V16SF_UHI:
>      case VOID_FTYPE_PFLOAT_V8SF_UQI:
>      case VOID_FTYPE_PFLOAT_V4SF_UQI:
> +    case VOID_FTYPE_PCFLOAT16_V8HF_UQI:
>      case VOID_FTYPE_PV32QI_V32HI_USI:
>      case VOID_FTYPE_PV16QI_V16HI_UHI:
>      case VOID_FTYPE_PUDI_V8HI_UQI:
> @@ -10979,6 +10980,7 @@ ix86_expand_special_args_builtin (const struct 
> builtin_description *d,
>      case V16SF_FTYPE_PCFLOAT_V16SF_UHI:
>      case V8SF_FTYPE_PCFLOAT_V8SF_UQI:
>      case V4SF_FTYPE_PCFLOAT_V4SF_UQI:
> +    case V8HF_FTYPE_PCFLOAT16_V8HF_UQI:
>        nargs = 3;
>        klass = load;
>        memory = 0;
> @@ -13993,6 +13995,8 @@ ix86_expand_vector_init_one_nonzero (bool mmx_ok, 
> machine_mode mode,
>        break;
>      case E_V8HImode:
>        use_vector_set = TARGET_SSE2;
> +      gen_vec_set_0 = TARGET_AVX512FP16 && one_var == 0
> +       ? gen_vec_setv8hi_0 : NULL;
>        break;
>      case E_V8QImode:
>        use_vector_set = TARGET_MMX_WITH_SSE && TARGET_SSE4_1;
> @@ -14004,8 +14008,12 @@ ix86_expand_vector_init_one_nonzero (bool mmx_ok, 
> machine_mode mode,
>        use_vector_set = TARGET_SSE4_1;
>        break;
>      case E_V32QImode:
> +      use_vector_set = TARGET_AVX;
> +      break;
>      case E_V16HImode:
>        use_vector_set = TARGET_AVX;
> +      gen_vec_set_0 = TARGET_AVX512FP16 && one_var == 0
> +       ? gen_vec_setv16hi_0 : NULL;
>        break;
>      case E_V8SImode:
>        use_vector_set = TARGET_AVX;
> @@ -14053,6 +14061,9 @@ ix86_expand_vector_init_one_nonzero (bool mmx_ok, 
> machine_mode mode,
>        use_vector_set = TARGET_AVX512FP16 && one_var == 0;
>        gen_vec_set_0 = gen_vec_setv32hf_0;
>        break;
> +    case E_V32HImode:
> +      use_vector_set = TARGET_AVX512FP16 && one_var == 0;
> +      gen_vec_set_0 = gen_vec_setv32hi_0;
>      default:
>        break;
>      }
> diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
> index c4db778e25d..97f7c698d5d 100644
> --- a/gcc/config/i386/sse.md
> +++ b/gcc/config/i386/sse.md
> @@ -758,6 +758,7 @@ (define_mode_iterator VIHF_AVX512BW
>    (V32HF "TARGET_AVX512FP16")])
>
>  ;; Int-float size matches
> +(define_mode_iterator VI2F [V8HI V16HI V32HI V8HF V16HF V32HF])
>  (define_mode_iterator VI4F_128 [V4SI V4SF])
>  (define_mode_iterator VI8F_128 [V2DI V2DF])
>  (define_mode_iterator VI4F_256 [V8SI V8SF])
> @@ -1317,13 +1318,13 @@ (define_insn_and_split "*<avx512>_load<mode>"
>    [(set (match_dup 0) (match_dup 1))])
>
>  (define_insn "avx512f_mov<ssescalarmodelower>_mask"
> -  [(set (match_operand:VF_128 0 "register_operand" "=v")
> -       (vec_merge:VF_128
> -         (vec_merge:VF_128
> -           (match_operand:VF_128 2 "register_operand" "v")
> -           (match_operand:VF_128 3 "nonimm_or_0_operand" "0C")
> +  [(set (match_operand:VFH_128 0 "register_operand" "=v")
> +       (vec_merge:VFH_128
> +         (vec_merge:VFH_128
> +           (match_operand:VFH_128 2 "register_operand" "v")
> +           (match_operand:VFH_128 3 "nonimm_or_0_operand" "0C")
>             (match_operand:QI 4 "register_operand" "Yk"))
> -         (match_operand:VF_128 1 "register_operand" "v")
> +         (match_operand:VFH_128 1 "register_operand" "v")
>           (const_int 1)))]
>    "TARGET_AVX512F"
>    "vmov<ssescalarmodesuffix>\t{%2, %1, %0%{%4%}%N3|%0%{%4%}%N3, %1, %2}"
> @@ -1336,7 +1337,7 @@ (define_expand "avx512f_load<mode>_mask"
>         (vec_merge:<ssevecmode>
>           (vec_merge:<ssevecmode>
>             (vec_duplicate:<ssevecmode>
> -             (match_operand:MODEF 1 "memory_operand"))
> +             (match_operand:MODEFH 1 "memory_operand"))
>             (match_operand:<ssevecmode> 2 "nonimm_or_0_operand")
>             (match_operand:QI 3 "register_operand"))
>           (match_dup 4)
> @@ -1349,7 +1350,7 @@ (define_insn "*avx512f_load<mode>_mask"
>         (vec_merge:<ssevecmode>
>           (vec_merge:<ssevecmode>
>             (vec_duplicate:<ssevecmode>
> -             (match_operand:MODEF 1 "memory_operand" "m"))
> +             (match_operand:MODEFH 1 "memory_operand" "m"))
>             (match_operand:<ssevecmode> 2 "nonimm_or_0_operand" "0C")
>             (match_operand:QI 3 "register_operand" "Yk"))
>           (match_operand:<ssevecmode> 4 "const0_operand" "C")
> @@ -1362,11 +1363,11 @@ (define_insn "*avx512f_load<mode>_mask"
>     (set_attr "mode" "<MODE>")])
>
>  (define_insn "avx512f_store<mode>_mask"
> -  [(set (match_operand:MODEF 0 "memory_operand" "=m")
> -       (if_then_else:MODEF
> +  [(set (match_operand:MODEFH 0 "memory_operand" "=m")
> +       (if_then_else:MODEFH
>           (and:QI (match_operand:QI 2 "register_operand" "Yk")
>                  (const_int 1))
> -         (vec_select:MODEF
> +         (vec_select:MODEFH
>             (match_operand:<ssevecmode> 1 "register_operand" "v")
>             (parallel [(const_int 0)]))
>           (match_dup 0)))]
> @@ -8513,11 +8514,11 @@ (define_insn "vec_set<mode>_0"
>
>  ;; vmovw clears also the higer bits
>  (define_insn "vec_set<mode>_0"
> -  [(set (match_operand:VF_AVX512FP16 0 "register_operand" "=v")
> -       (vec_merge:VF_AVX512FP16
> -         (vec_duplicate:VF_AVX512FP16
> -           (match_operand:HF 2 "nonimmediate_operand" "rm"))
> -         (match_operand:VF_AVX512FP16 1 "const0_operand" "C")
> +  [(set (match_operand:VI2F 0 "register_operand" "=v")
> +       (vec_merge:VI2F
> +         (vec_duplicate:VI2F
> +           (match_operand:<ssescalarmode> 2 "nonimmediate_operand" "rm"))
> +         (match_operand:VI2F 1 "const0_operand" "C")
>           (const_int 1)))]
>    "TARGET_AVX512FP16"
>    "vmovw\t{%2, %x0|%x0, %2}"
> --
> 2.18.1
>


-- 
BR,
Hongtao

Re: [PATCH 24/62] AVX512FP16: Add vmovw/vmovsh.

Reply via email to