I'm going to check in 6 patches: [PATCH 24/62] AVX512FP16: Add vmovw/vmovsh. [PATCH 25/62] AVX512FP16: Add testcase for vmovsh/vmovw. [PATCH 26/62] AVX512FP16: Add vcvtph2dq/vcvtph2qq/vcvtph2w/vcvtph2uw/vcvtph2uqq/vcvtph2udq. [PATCH 27/62] AVX512FP16: Add testcase for vcvtph2w/vcvtph2uw/vcvtph2dq/vcvtph2udq/vcvtph2qq/vcvtph2uqq. [PATCH 28/62] AVX512FP16: Add vcvtuw2ph/vcvtw2ph/vcvtdq2ph/vcvtudq2ph/vcvtqq2ph/vcvtuqq2ph. [PATCH 29/62] AVX512FP16: Add testcase for vcvtw2ph/vcvtuw2ph/vcvtdq2ph/vcvtudq2ph/vcvtqq2ph/vcvtuqq2ph.
Bootstrapped and regtested on x86_64-linux-gnu{-m32,} Newly added runtime testcase passed on SPR. On Thu, Jul 1, 2021 at 2:17 PM liuhongt <hongtao....@intel.com> wrote: > > gcc/ChangeLog: > > * config/i386/avx512fp16intrin.h: (_mm_cvtsi16_si128): > New intrinsic. > (_mm_cvtsi128_si16): Likewise. > (_mm_mask_load_sh): Likewise. > (_mm_maskz_load_sh): Likewise. > (_mm_mask_store_sh): Likewise. > (_mm_move_sh): Likewise. > (_mm_mask_move_sh): Likewise. > (_mm_maskz_move_sh): Likewise. > * config/i386/i386-builtin-types.def: Add corresponding builtin types. > * config/i386/i386-builtin.def: Add corresponding new builtins. > * config/i386/i386-expand.c > (ix86_expand_special_args_builtin): Handle new builtin types. > (ix86_expand_vector_init_one_nonzero): Adjust for FP16 target. > * config/i386/sse.md (VI2F): New mode iterator. > (vec_set<mode>_0): Use new mode iterator. > (avx512f_mov<ssescalarmodelower>_mask): Adjust for HF vector mode. > (avx512f_store<mode>_mask): Ditto. > --- > gcc/config/i386/avx512fp16intrin.h | 59 ++++++++++++++++++++++++++ > gcc/config/i386/i386-builtin-types.def | 3 ++ > gcc/config/i386/i386-builtin.def | 5 +++ > gcc/config/i386/i386-expand.c | 11 +++++ > gcc/config/i386/sse.md | 33 +++++++------- > 5 files changed, 95 insertions(+), 16 deletions(-) > > diff --git a/gcc/config/i386/avx512fp16intrin.h > b/gcc/config/i386/avx512fp16intrin.h > index 2fbfc140c44..cdf6646c8c6 100644 > --- a/gcc/config/i386/avx512fp16intrin.h > +++ b/gcc/config/i386/avx512fp16intrin.h > @@ -2453,6 +2453,65 @@ _mm512_maskz_getmant_round_ph (__mmask32 __U, __m512h > __A, > > #endif /* __OPTIMIZE__ */ > > +/* Intrinsics vmovw. 
*/ > +extern __inline __m128i > +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) > +_mm_cvtsi16_si128 (short __A) > +{ > + return _mm_set_epi16 (0, 0, 0, 0, 0, 0, 0, __A); > +} > + > +extern __inline short > +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) > +_mm_cvtsi128_si16 (__m128i __A) > +{ > + return __builtin_ia32_vec_ext_v8hi ((__v8hi)__A, 0); > +} > + > +/* Intrinsics vmovsh. */ > +extern __inline __m128h > +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) > +_mm_mask_load_sh (__m128h __A, __mmask8 __B, _Float16 const* __C) > +{ > + return __builtin_ia32_loadsh_mask (__C, __A, __B); > +} > + > +extern __inline __m128h > +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) > +_mm_maskz_load_sh (__mmask8 __A, _Float16 const* __B) > +{ > + return __builtin_ia32_loadsh_mask (__B, _mm_setzero_ph (), __A); > +} > + > +extern __inline void > +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) > +_mm_mask_store_sh (_Float16 const* __A, __mmask8 __B, __m128h __C) > +{ > + __builtin_ia32_storesh_mask (__A, __C, __B); > +} > + > +extern __inline __m128h > +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) > +_mm_move_sh (__m128h __A, __m128h __B) > +{ > + __A[0] = __B[0]; > + return __A; > +} > + > +extern __inline __m128h > +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) > +_mm_mask_move_sh (__m128h __A, __mmask8 __B, __m128h __C, __m128h __D) > +{ > + return __builtin_ia32_vmovsh_mask (__C, __D, __A, __B); > +} > + > +extern __inline __m128h > +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) > +_mm_maskz_move_sh (__mmask8 __A, __m128h __B, __m128h __C) > +{ > + return __builtin_ia32_vmovsh_mask (__B, __C, _mm_setzero_ph (), __A); > +} > + > #ifdef __DISABLE_AVX512FP16__ > #undef __DISABLE_AVX512FP16__ > #pragma GCC pop_options > diff --git a/gcc/config/i386/i386-builtin-types.def > 
b/gcc/config/i386/i386-builtin-types.def > index 79e7edf13e5..6cf3e354c78 100644 > --- a/gcc/config/i386/i386-builtin-types.def > +++ b/gcc/config/i386/i386-builtin-types.def > @@ -134,6 +134,7 @@ DEF_POINTER_TYPE (PCVOID, VOID, CONST) > DEF_POINTER_TYPE (PVOID, VOID) > DEF_POINTER_TYPE (PDOUBLE, DOUBLE) > DEF_POINTER_TYPE (PFLOAT, FLOAT) > +DEF_POINTER_TYPE (PCFLOAT16, FLOAT16, CONST) > DEF_POINTER_TYPE (PSHORT, SHORT) > DEF_POINTER_TYPE (PUSHORT, USHORT) > DEF_POINTER_TYPE (PINT, INT) > @@ -1308,6 +1309,8 @@ DEF_FUNCTION_TYPE (QI, V8HF, INT, UQI) > DEF_FUNCTION_TYPE (HI, V16HF, INT, UHI) > DEF_FUNCTION_TYPE (SI, V32HF, INT, USI) > DEF_FUNCTION_TYPE (V8HF, V8HF, V8HF) > +DEF_FUNCTION_TYPE (VOID, PCFLOAT16, V8HF, UQI) > +DEF_FUNCTION_TYPE (V8HF, PCFLOAT16, V8HF, UQI) > DEF_FUNCTION_TYPE (V8HF, V8HF, V8HF, UQI) > DEF_FUNCTION_TYPE (V8HF, V8HF, V8HF, INT) > DEF_FUNCTION_TYPE (V8HF, V8HF, INT, V8HF, UQI) > diff --git a/gcc/config/i386/i386-builtin.def > b/gcc/config/i386/i386-builtin.def > index ed1a4a38b1c..be617b8f18a 100644 > --- a/gcc/config/i386/i386-builtin.def > +++ b/gcc/config/i386/i386-builtin.def > @@ -393,6 +393,10 @@ BDESC (OPTION_MASK_ISA_AVX512BW, 0, > CODE_FOR_avx512bw_us_truncatev32hiv32qi2_mas > BDESC (OPTION_MASK_ISA_AVX512BW, 0, > CODE_FOR_avx512bw_ss_truncatev32hiv32qi2_mask_store, > "__builtin_ia32_pmovswb512mem_mask", IX86_BUILTIN_PMOVSWB512_MEM, UNKNOWN, > (int) VOID_FTYPE_PV32QI_V32HI_USI) > BDESC (OPTION_MASK_ISA_AVX512BW, 0, > CODE_FOR_avx512bw_truncatev32hiv32qi2_mask_store, > "__builtin_ia32_pmovwb512mem_mask", IX86_BUILTIN_PMOVWB512_MEM, UNKNOWN, > (int) VOID_FTYPE_PV32QI_V32HI_USI) > > +/* AVX512FP16 */ > +BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512f_loadhf_mask, > "__builtin_ia32_loadsh_mask", IX86_BUILTIN_LOADSH_MASK, UNKNOWN, (int) > V8HF_FTYPE_PCFLOAT16_V8HF_UQI) > +BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512f_storehf_mask, > "__builtin_ia32_storesh_mask", IX86_BUILTIN_STORESH_MASK, UNKNOWN, (int) > 
VOID_FTYPE_PCFLOAT16_V8HF_UQI) > + > /* RDPKRU and WRPKRU. */ > BDESC (OPTION_MASK_ISA_PKU, 0, CODE_FOR_rdpkru, "__builtin_ia32_rdpkru", > IX86_BUILTIN_RDPKRU, UNKNOWN, (int) UNSIGNED_FTYPE_VOID) > BDESC (OPTION_MASK_ISA_PKU, 0, CODE_FOR_wrpkru, "__builtin_ia32_wrpkru", > IX86_BUILTIN_WRPKRU, UNKNOWN, (int) VOID_FTYPE_UNSIGNED) > @@ -2826,6 +2830,7 @@ BDESC (OPTION_MASK_ISA_AVX512VL, > OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512vl_ > BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, > CODE_FOR_avx512fp16_getexpv8hf_mask, "__builtin_ia32_getexpph128_mask", > IX86_BUILTIN_GETEXPPH128, UNKNOWN, (int) V8HF_FTYPE_V8HF_V8HF_UQI) > BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, > CODE_FOR_avx512vl_getmantv16hf_mask, "__builtin_ia32_getmantph256_mask", > IX86_BUILTIN_GETMANTPH256, UNKNOWN, (int) V16HF_FTYPE_V16HF_INT_V16HF_UHI) > BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, > CODE_FOR_avx512fp16_getmantv8hf_mask, "__builtin_ia32_getmantph128_mask", > IX86_BUILTIN_GETMANTPH128, UNKNOWN, (int) V8HF_FTYPE_V8HF_INT_V8HF_UQI) > +BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512f_movhf_mask, > "__builtin_ia32_vmovsh_mask", IX86_BUILTIN_VMOVSH_MASK, UNKNOWN, (int) > V8HF_FTYPE_V8HF_V8HF_V8HF_UQI) > > /* Builtins with rounding support. 
*/ > BDESC_END (ARGS, ROUND_ARGS) > diff --git a/gcc/config/i386/i386-expand.c b/gcc/config/i386/i386-expand.c > index 266aa411ddb..bfc7fc75b97 100644 > --- a/gcc/config/i386/i386-expand.c > +++ b/gcc/config/i386/i386-expand.c > @@ -10907,6 +10907,7 @@ ix86_expand_special_args_builtin (const struct > builtin_description *d, > case VOID_FTYPE_PFLOAT_V16SF_UHI: > case VOID_FTYPE_PFLOAT_V8SF_UQI: > case VOID_FTYPE_PFLOAT_V4SF_UQI: > + case VOID_FTYPE_PCFLOAT16_V8HF_UQI: > case VOID_FTYPE_PV32QI_V32HI_USI: > case VOID_FTYPE_PV16QI_V16HI_UHI: > case VOID_FTYPE_PUDI_V8HI_UQI: > @@ -10979,6 +10980,7 @@ ix86_expand_special_args_builtin (const struct > builtin_description *d, > case V16SF_FTYPE_PCFLOAT_V16SF_UHI: > case V8SF_FTYPE_PCFLOAT_V8SF_UQI: > case V4SF_FTYPE_PCFLOAT_V4SF_UQI: > + case V8HF_FTYPE_PCFLOAT16_V8HF_UQI: > nargs = 3; > klass = load; > memory = 0; > @@ -13993,6 +13995,8 @@ ix86_expand_vector_init_one_nonzero (bool mmx_ok, > machine_mode mode, > break; > case E_V8HImode: > use_vector_set = TARGET_SSE2; > + gen_vec_set_0 = TARGET_AVX512FP16 && one_var == 0 > + ? gen_vec_setv8hi_0 : NULL; > break; > case E_V8QImode: > use_vector_set = TARGET_MMX_WITH_SSE && TARGET_SSE4_1; > @@ -14004,8 +14008,12 @@ ix86_expand_vector_init_one_nonzero (bool mmx_ok, > machine_mode mode, > use_vector_set = TARGET_SSE4_1; > break; > case E_V32QImode: > + use_vector_set = TARGET_AVX; > + break; > case E_V16HImode: > use_vector_set = TARGET_AVX; > + gen_vec_set_0 = TARGET_AVX512FP16 && one_var == 0 > + ? 
gen_vec_setv16hi_0 : NULL; > break; > case E_V8SImode: > use_vector_set = TARGET_AVX; > @@ -14053,6 +14061,9 @@ ix86_expand_vector_init_one_nonzero (bool mmx_ok, > machine_mode mode, > use_vector_set = TARGET_AVX512FP16 && one_var == 0; > gen_vec_set_0 = gen_vec_setv32hf_0; > break; > + case E_V32HImode: > + use_vector_set = TARGET_AVX512FP16 && one_var == 0; > + gen_vec_set_0 = gen_vec_setv32hi_0; > default: > break; > } > diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md > index c4db778e25d..97f7c698d5d 100644 > --- a/gcc/config/i386/sse.md > +++ b/gcc/config/i386/sse.md > @@ -758,6 +758,7 @@ (define_mode_iterator VIHF_AVX512BW > (V32HF "TARGET_AVX512FP16")]) > > ;; Int-float size matches > +(define_mode_iterator VI2F [V8HI V16HI V32HI V8HF V16HF V32HF]) > (define_mode_iterator VI4F_128 [V4SI V4SF]) > (define_mode_iterator VI8F_128 [V2DI V2DF]) > (define_mode_iterator VI4F_256 [V8SI V8SF]) > @@ -1317,13 +1318,13 @@ (define_insn_and_split "*<avx512>_load<mode>" > [(set (match_dup 0) (match_dup 1))]) > > (define_insn "avx512f_mov<ssescalarmodelower>_mask" > - [(set (match_operand:VF_128 0 "register_operand" "=v") > - (vec_merge:VF_128 > - (vec_merge:VF_128 > - (match_operand:VF_128 2 "register_operand" "v") > - (match_operand:VF_128 3 "nonimm_or_0_operand" "0C") > + [(set (match_operand:VFH_128 0 "register_operand" "=v") > + (vec_merge:VFH_128 > + (vec_merge:VFH_128 > + (match_operand:VFH_128 2 "register_operand" "v") > + (match_operand:VFH_128 3 "nonimm_or_0_operand" "0C") > (match_operand:QI 4 "register_operand" "Yk")) > - (match_operand:VF_128 1 "register_operand" "v") > + (match_operand:VFH_128 1 "register_operand" "v") > (const_int 1)))] > "TARGET_AVX512F" > "vmov<ssescalarmodesuffix>\t{%2, %1, %0%{%4%}%N3|%0%{%4%}%N3, %1, %2}" > @@ -1336,7 +1337,7 @@ (define_expand "avx512f_load<mode>_mask" > (vec_merge:<ssevecmode> > (vec_merge:<ssevecmode> > (vec_duplicate:<ssevecmode> > - (match_operand:MODEF 1 "memory_operand")) > + (match_operand:MODEFH 1 
"memory_operand")) > (match_operand:<ssevecmode> 2 "nonimm_or_0_operand") > (match_operand:QI 3 "register_operand")) > (match_dup 4) > @@ -1349,7 +1350,7 @@ (define_insn "*avx512f_load<mode>_mask" > (vec_merge:<ssevecmode> > (vec_merge:<ssevecmode> > (vec_duplicate:<ssevecmode> > - (match_operand:MODEF 1 "memory_operand" "m")) > + (match_operand:MODEFH 1 "memory_operand" "m")) > (match_operand:<ssevecmode> 2 "nonimm_or_0_operand" "0C") > (match_operand:QI 3 "register_operand" "Yk")) > (match_operand:<ssevecmode> 4 "const0_operand" "C") > @@ -1362,11 +1363,11 @@ (define_insn "*avx512f_load<mode>_mask" > (set_attr "mode" "<MODE>")]) > > (define_insn "avx512f_store<mode>_mask" > - [(set (match_operand:MODEF 0 "memory_operand" "=m") > - (if_then_else:MODEF > + [(set (match_operand:MODEFH 0 "memory_operand" "=m") > + (if_then_else:MODEFH > (and:QI (match_operand:QI 2 "register_operand" "Yk") > (const_int 1)) > - (vec_select:MODEF > + (vec_select:MODEFH > (match_operand:<ssevecmode> 1 "register_operand" "v") > (parallel [(const_int 0)])) > (match_dup 0)))] > @@ -8513,11 +8514,11 @@ (define_insn "vec_set<mode>_0" > > ;; vmovw clears also the higer bits > (define_insn "vec_set<mode>_0" > - [(set (match_operand:VF_AVX512FP16 0 "register_operand" "=v") > - (vec_merge:VF_AVX512FP16 > - (vec_duplicate:VF_AVX512FP16 > - (match_operand:HF 2 "nonimmediate_operand" "rm")) > - (match_operand:VF_AVX512FP16 1 "const0_operand" "C") > + [(set (match_operand:VI2F 0 "register_operand" "=v") > + (vec_merge:VI2F > + (vec_duplicate:VI2F > + (match_operand:<ssescalarmode> 2 "nonimmediate_operand" "rm")) > + (match_operand:VI2F 1 "const0_operand" "C") > (const_int 1)))] > "TARGET_AVX512FP16" > "vmovw\t{%2, %x0|%x0, %2}" > -- > 2.18.1 > -- BR, Hongtao