On Wed, Jul 21, 2021 at 3:44 PM liuhongt <hongtao....@intel.com> wrote: > > gcc/ChangeLog: > > * config/i386/avx512fp16intrin.h (_mm_set_ph): New intrinsic. > (_mm256_set_ph): Likewise. > (_mm512_set_ph): Likewise. > (_mm_setr_ph): Likewise. > (_mm256_setr_ph): Likewise. > (_mm512_setr_ph): Likewise. > (_mm_set1_ph): Likewise. > (_mm256_set1_ph): Likewise. > (_mm512_set1_ph): Likewise. > (_mm_setzero_ph): Likewise. > (_mm256_setzero_ph): Likewise. > (_mm512_setzero_ph): Likewise. > (_mm_set_sh): Likewise. > (_mm_load_sh): Likewise. > (_mm_store_sh): Likewise. > * config/i386/i386-builtin-types.def (V8HF): New type. > (DEF_FUNCTION_TYPE (V8HF, V8HI)): New builtin function type > * config/i386/i386-expand.c (ix86_expand_vector_init_duplicate): > Support vector HFmodes. > (ix86_expand_vector_init_one_nonzero): Likewise. > (ix86_expand_vector_init_one_var): Likewise. > (ix86_expand_vector_init_interleave): Likewise. > (ix86_expand_vector_init_general): Likewise. > (ix86_expand_vector_set): Likewise. > (ix86_expand_vector_extract): Likewise. > (ix86_expand_vector_init_concat): Likewise. > (ix86_expand_sse_movcc): Handle vector HFmodes. > (ix86_expand_vector_set_var): Ditto. > * config/i386/i386-modes.def: Add HF vector modes in comment. > * config/i386/i386.c (classify_argument): Add HF vector modes. > (ix86_hard_regno_mode_ok): Allow HF vector modes for AVX512FP16. > (ix86_vector_mode_supported_p): Likewise. > (ix86_set_reg_reg_cost): Handle vector HFmode. > (ix86_get_ssemov): Handle vector HFmode. > (function_arg_advance_64): Pass unnamed V16HFmode and V32HFmode > by stack. I got some feedback from H.J. that 16/32/64-byte _Float16 vectors should be passed in SSE registers in 32-bit mode, not on the stack. I will handle that in function_arg_32 in my next version. > * config/i386/i386.h (VALID_AVX512FP16_REG_MODE): New. > (VALID_AVX256_REG_OR_OI_MODE): Rename to .. > (VALID_AVX256_REG_OR_OI_VHF_MODE): .. this, and add V16HF. > (VALID_SSE2_REG_VHF_MODE): New. 
> (VALID_AVX512VL_128_REG_MODE): Add V8HF and TImode. > (SSE_REG_MODE_P): Add vector HFmode. > * config/i386/i386.md (mode): Add HF vector modes. > (MODE_SIZE): Likewise. > (ssemodesuffix): Add ph suffix for HF vector modes. > * config/i386/sse.md (VFH_128): New mode iterator. > (VMOVE): Adjust for HF vector modes. > (V): Likewise. > (V_256_512): Likewise. > (avx512): Likewise. > (avx512fmaskmode): Likewise. > (shuffletype): Likewise. > (sseinsnmode): Likewise. > (ssedoublevecmode): Likewise. > (ssehalfvecmode): Likewise. > (ssehalfvecmodelower): Likewise. > (ssePScmode): Likewise. > (ssescalarmode): Likewise. > (ssescalarmodelower): Likewise. > (sseintprefix): Likewise. > (i128): Likewise. > (bcstscalarsuff): Likewise. > (xtg_mode): Likewise. > (VI12HF_AVX512VL): New mode_iterator. > (VF_AVX512FP16): Likewise. > (VIHF): Likewise. > (VIHF_256): Likewise. > (VIHF_AVX512BW): Likewise. > (V16_256): Likewise. > (V32_512): Likewise. > (sseintmodesuffix): New mode_attr. > (sse): Add scalar and vector HFmodes. > (ssescalarmode): Add vector HFmode mapping. > (ssescalarmodesuffix): Add sh suffix for HFmode. > (*<sse>_vm<insn><mode>3): Use VFH_128. > (*<sse>_vm<multdiv_mnemonic><mode>3): Likewise. > (*ieee_<ieee_maxmin><mode>3): Likewise. > (<avx512>_blendm<mode>): New define_insn. > (vec_setv8hf): New define_expand. > (vec_set<mode>_0): New define_insn for HF vector set. > (*avx512fp16_movsh): Likewise. > (avx512fp16_movsh): Likewise. > (vec_extract_lo_v32hi): Rename to ... > (vec_extract_lo_<mode>): ... this, and adjust to allow HF > vector modes. > (vec_extract_hi_v32hi): Likewise. > (vec_extract_hi_<mode>): Likewise. > (vec_extract_lo_v16hi): Likewise. > (vec_extract_lo_<mode>): Likewise. > (vec_extract_hi_v16hi): Likewise. > (vec_extract_hi_<mode>): Likewise. > (vec_set_hi_v16hi): Likewise. > (vec_set_hi_<mode>): Likewise. > (vec_set_lo_v16hi): Likewise. > (vec_set_lo_<mode>): Likewise. > (*vec_extract<mode>_0): New define_insn_and_split for HF > vector extract. 
> (*vec_extracthf): New define_insn. > (VEC_EXTRACT_MODE): Add HF vector modes. > (PINSR_MODE): Add V8HF. > (sse2p4_1): Likewise. > (pinsr_evex_isa): Likewise. > (<sse2p4_1>_pinsr<ssemodesuffix>): Adjust to support > insert for V8HFmode. > (pbroadcast_evex_isa): Add HF vector modes. > (AVX2_VEC_DUP_MODE): Likewise. > (VEC_INIT_MODE): Likewise. > (VEC_INIT_HALF_MODE): Likewise. > (avx2_pbroadcast<mode>): Adjust to support HF vector mode > broadcast. > (avx2_pbroadcast<mode>_1): Likewise. > (<avx512>_vec_dup<mode>_1): Likewise. > (<avx512>_vec_dup<mode><mask_name>): Likewise. > (<mask_codefor><avx512>_vec_dup_gpr<mode><mask_name>): > Likewise. > --- > gcc/config/i386/avx512fp16intrin.h | 172 +++++++++++ > gcc/config/i386/i386-builtin-types.def | 6 +- > gcc/config/i386/i386-expand.c | 124 +++++++- > gcc/config/i386/i386-modes.def | 12 +- > gcc/config/i386/i386.c | 69 ++--- > gcc/config/i386/i386.h | 15 +- > gcc/config/i386/i386.md | 13 +- > gcc/config/i386/sse.md | 395 +++++++++++++++++++------ > 8 files changed, 652 insertions(+), 154 deletions(-) > > diff --git a/gcc/config/i386/avx512fp16intrin.h > b/gcc/config/i386/avx512fp16intrin.h > index 38d63161ba6..3fc0770986e 100644 > --- a/gcc/config/i386/avx512fp16intrin.h > +++ b/gcc/config/i386/avx512fp16intrin.h > @@ -45,6 +45,178 @@ typedef _Float16 __m128h __attribute__ ((__vector_size__ > (16), __may_alias__)); > typedef _Float16 __m256h __attribute__ ((__vector_size__ (32), > __may_alias__)); > typedef _Float16 __m512h __attribute__ ((__vector_size__ (64), > __may_alias__)); > > +extern __inline __m128h > +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) > +_mm_set_ph (_Float16 __A7, _Float16 __A6, _Float16 __A5, > + _Float16 __A4, _Float16 __A3, _Float16 __A2, > + _Float16 __A1, _Float16 __A0) > +{ > + return __extension__ (__m128h)(__v8hf){ __A0, __A1, __A2, __A3, > + __A4, __A5, __A6, __A7 }; > +} > + > +extern __inline __m256h > +__attribute__ ((__gnu_inline__, __always_inline__, 
__artificial__)) > +_mm256_set_ph (_Float16 __A15, _Float16 __A14, _Float16 __A13, > + _Float16 __A12, _Float16 __A11, _Float16 __A10, > + _Float16 __A9, _Float16 __A8, _Float16 __A7, > + _Float16 __A6, _Float16 __A5, _Float16 __A4, > + _Float16 __A3, _Float16 __A2, _Float16 __A1, > + _Float16 __A0) > +{ > + return __extension__ (__m256h)(__v16hf){ __A0, __A1, __A2, __A3, > + __A4, __A5, __A6, __A7, > + __A8, __A9, __A10, __A11, > + __A12, __A13, __A14, __A15 }; > +} > + > +extern __inline __m512h > +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) > +_mm512_set_ph (_Float16 __A31, _Float16 __A30, _Float16 __A29, > + _Float16 __A28, _Float16 __A27, _Float16 __A26, > + _Float16 __A25, _Float16 __A24, _Float16 __A23, > + _Float16 __A22, _Float16 __A21, _Float16 __A20, > + _Float16 __A19, _Float16 __A18, _Float16 __A17, > + _Float16 __A16, _Float16 __A15, _Float16 __A14, > + _Float16 __A13, _Float16 __A12, _Float16 __A11, > + _Float16 __A10, _Float16 __A9, _Float16 __A8, > + _Float16 __A7, _Float16 __A6, _Float16 __A5, > + _Float16 __A4, _Float16 __A3, _Float16 __A2, > + _Float16 __A1, _Float16 __A0) > +{ > + return __extension__ (__m512h)(__v32hf){ __A0, __A1, __A2, __A3, > + __A4, __A5, __A6, __A7, > + __A8, __A9, __A10, __A11, > + __A12, __A13, __A14, __A15, > + __A16, __A17, __A18, __A19, > + __A20, __A21, __A22, __A23, > + __A24, __A25, __A26, __A27, > + __A28, __A29, __A30, __A31 }; > +} > + > +/* Create vectors of elements in the reversed order from _mm_set_ph, > + _mm256_set_ph and _mm512_set_ph functions. 
*/ > + > +extern __inline __m128h > +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) > +_mm_setr_ph (_Float16 __A0, _Float16 __A1, _Float16 __A2, > + _Float16 __A3, _Float16 __A4, _Float16 __A5, > + _Float16 __A6, _Float16 __A7) > +{ > + return _mm_set_ph (__A7, __A6, __A5, __A4, __A3, __A2, __A1, __A0); > +} > + > +extern __inline __m256h > +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) > +_mm256_setr_ph (_Float16 __A0, _Float16 __A1, _Float16 __A2, > + _Float16 __A3, _Float16 __A4, _Float16 __A5, > + _Float16 __A6, _Float16 __A7, _Float16 __A8, > + _Float16 __A9, _Float16 __A10, _Float16 __A11, > + _Float16 __A12, _Float16 __A13, _Float16 __A14, > + _Float16 __A15) > +{ > + return _mm256_set_ph (__A15, __A14, __A13, __A12, __A11, __A10, __A9, > + __A8, __A7, __A6, __A5, __A4, __A3, __A2, __A1, > + __A0); > +} > + > +extern __inline __m512h > +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) > +_mm512_setr_ph (_Float16 __A0, _Float16 __A1, _Float16 __A2, > + _Float16 __A3, _Float16 __A4, _Float16 __A5, > + _Float16 __A6, _Float16 __A7, _Float16 __A8, > + _Float16 __A9, _Float16 __A10, _Float16 __A11, > + _Float16 __A12, _Float16 __A13, _Float16 __A14, > + _Float16 __A15, _Float16 __A16, _Float16 __A17, > + _Float16 __A18, _Float16 __A19, _Float16 __A20, > + _Float16 __A21, _Float16 __A22, _Float16 __A23, > + _Float16 __A24, _Float16 __A25, _Float16 __A26, > + _Float16 __A27, _Float16 __A28, _Float16 __A29, > + _Float16 __A30, _Float16 __A31) > + > +{ > + return _mm512_set_ph (__A31, __A30, __A29, __A28, __A27, __A26, __A25, > + __A24, __A23, __A22, __A21, __A20, __A19, __A18, > + __A17, __A16, __A15, __A14, __A13, __A12, __A11, > + __A10, __A9, __A8, __A7, __A6, __A5, __A4, __A3, > + __A2, __A1, __A0); > +} > + > +/* Broadcast _Float16 to vector. 
*/ > + > +extern __inline __m128h > +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) > +_mm_set1_ph (_Float16 __A) > +{ > + return _mm_set_ph (__A, __A, __A, __A, __A, __A, __A, __A); > +} > + > +extern __inline __m256h > +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) > +_mm256_set1_ph (_Float16 __A) > +{ > + return _mm256_set_ph (__A, __A, __A, __A, __A, __A, __A, __A, > + __A, __A, __A, __A, __A, __A, __A, __A); > +} > + > +extern __inline __m512h > +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) > +_mm512_set1_ph (_Float16 __A) > +{ > + return _mm512_set_ph (__A, __A, __A, __A, __A, __A, __A, __A, > + __A, __A, __A, __A, __A, __A, __A, __A, > + __A, __A, __A, __A, __A, __A, __A, __A, > + __A, __A, __A, __A, __A, __A, __A, __A); > +} > + > +/* Create a vector with all zeros. */ > + > +extern __inline __m128h > +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) > +_mm_setzero_ph (void) > +{ > + return _mm_set1_ph (0.0f); > +} > + > +extern __inline __m256h > +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) > +_mm256_setzero_ph (void) > +{ > + return _mm256_set1_ph (0.0f); > +} > + > +extern __inline __m512h > +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) > +_mm512_setzero_ph (void) > +{ > + return _mm512_set1_ph (0.0f); > +} > + > +/* Create a vector with element 0 as F and the rest zero. */ > +extern __inline __m128h > +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) > +_mm_set_sh (_Float16 __F) > +{ > + return _mm_set_ph (0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, __F); > +} > + > +/* Create a vector with element 0 as *P and the rest zero. */ > +extern __inline __m128h > +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) > +_mm_load_sh (void const *__P) > +{ > + return _mm_set_ph (0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, > + *(_Float16 const *) __P); > +} > + > +/* Stores the lower _Float16 value. 
*/ > +extern __inline void > +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) > +_mm_store_sh (void *__P, __m128h __A) > +{ > + *(_Float16 *) __P = ((__v8hf)__A)[0]; > +} > + > #ifdef __DISABLE_AVX512FP16__ > #undef __DISABLE_AVX512FP16__ > #pragma GCC pop_options > diff --git a/gcc/config/i386/i386-builtin-types.def > b/gcc/config/i386/i386-builtin-types.def > index 1768b88d748..4df6ee1009d 100644 > --- a/gcc/config/i386/i386-builtin-types.def > +++ b/gcc/config/i386/i386-builtin-types.def > @@ -85,6 +85,7 @@ DEF_VECTOR_TYPE (V8QI, QI) > # SSE vectors > DEF_VECTOR_TYPE (V2DF, DOUBLE) > DEF_VECTOR_TYPE (V4SF, FLOAT) > +DEF_VECTOR_TYPE (V8HF, FLOAT16) > DEF_VECTOR_TYPE (V2DI, DI) > DEF_VECTOR_TYPE (V4SI, SI) > DEF_VECTOR_TYPE (V8HI, HI) > @@ -1297,4 +1298,7 @@ DEF_FUNCTION_TYPE (UINT, UINT, V2DI, V2DI, PVOID) > DEF_FUNCTION_TYPE (UINT, UINT, V2DI, PVOID) > DEF_FUNCTION_TYPE (VOID, V2DI, V2DI, V2DI, UINT) > DEF_FUNCTION_TYPE (UINT8, PV2DI, V2DI, PCVOID) > -DEF_FUNCTION_TYPE (UINT8, PV2DI, PCV2DI, PCVOID) > \ No newline at end of file > +DEF_FUNCTION_TYPE (UINT8, PV2DI, PCV2DI, PCVOID) > + > +# FP16 builtins > +DEF_FUNCTION_TYPE (V8HF, V8HI) > diff --git a/gcc/config/i386/i386-expand.c b/gcc/config/i386/i386-expand.c > index b7d050a1e42..bb965ca0e9b 100644 > --- a/gcc/config/i386/i386-expand.c > +++ b/gcc/config/i386/i386-expand.c > @@ -3952,6 +3952,7 @@ ix86_expand_sse_movcc (rtx dest, rtx cmp, rtx op_true, > rtx op_false) > break; > case E_V16QImode: > case E_V8HImode: > + case E_V8HFmode: > case E_V4SImode: > case E_V2DImode: > if (TARGET_SSE4_1) > @@ -3974,6 +3975,7 @@ ix86_expand_sse_movcc (rtx dest, rtx cmp, rtx op_true, > rtx op_false) > break; > case E_V32QImode: > case E_V16HImode: > + case E_V16HFmode: > case E_V8SImode: > case E_V4DImode: > if (TARGET_AVX2) > @@ -3993,6 +3995,9 @@ ix86_expand_sse_movcc (rtx dest, rtx cmp, rtx op_true, > rtx op_false) > case E_V32HImode: > gen = gen_avx512bw_blendmv32hi; > break; > + case E_V32HFmode: > + 
gen = gen_avx512bw_blendmv32hf; > + break; > case E_V16SImode: > gen = gen_avx512f_blendmv16si; > break; > @@ -14144,6 +14149,11 @@ ix86_expand_vector_init_duplicate (bool mmx_ok, > machine_mode mode, > } > return true; > > + case E_V8HFmode: > + case E_V16HFmode: > + case E_V32HFmode: > + return ix86_vector_duplicate_value (mode, target, val); > + > default: > return false; > } > @@ -14228,6 +14238,18 @@ ix86_expand_vector_init_one_nonzero (bool mmx_ok, > machine_mode mode, > use_vector_set = TARGET_AVX512F && TARGET_64BIT && one_var == 0; > gen_vec_set_0 = gen_vec_setv8di_0; > break; > + case E_V8HFmode: > + use_vector_set = TARGET_AVX512FP16 && one_var == 0; > + gen_vec_set_0 = gen_vec_setv8hf_0; > + break; > + case E_V16HFmode: > + use_vector_set = TARGET_AVX512FP16 && one_var == 0; > + gen_vec_set_0 = gen_vec_setv16hf_0; > + break; > + case E_V32HFmode: > + use_vector_set = TARGET_AVX512FP16 && one_var == 0; > + gen_vec_set_0 = gen_vec_setv32hf_0; > + break; > default: > break; > } > @@ -14377,6 +14399,8 @@ ix86_expand_vector_init_one_var (bool mmx_ok, > machine_mode mode, > if (!TARGET_64BIT) > return false; > /* FALLTHRU */ > + case E_V8HFmode: > + case E_V16HFmode: > case E_V4DFmode: > case E_V8SFmode: > case E_V8SImode: > @@ -14457,6 +14481,9 @@ ix86_expand_vector_init_concat (machine_mode mode, > case 2: > switch (mode) > { > + case E_V32HFmode: > + half_mode = V16HFmode; > + break; > case E_V16SImode: > half_mode = V8SImode; > break; > @@ -14469,6 +14496,9 @@ ix86_expand_vector_init_concat (machine_mode mode, > case E_V8DFmode: > half_mode = V4DFmode; > break; > + case E_V16HFmode: > + half_mode = V8HFmode; > + break; > case E_V8SImode: > half_mode = V4SImode; > break; > @@ -14611,13 +14641,22 @@ ix86_expand_vector_init_interleave (machine_mode > mode, > { > machine_mode first_imode, second_imode, third_imode, inner_mode; > int i, j; > - rtx op0, op1; > + rtx op, op0, op1; > rtx (*gen_load_even) (rtx, rtx, rtx); > rtx (*gen_interleave_first_low) (rtx, 
rtx, rtx); > rtx (*gen_interleave_second_low) (rtx, rtx, rtx); > > switch (mode) > { > + case E_V8HFmode: > + gen_load_even = gen_vec_setv8hf; > + gen_interleave_first_low = gen_vec_interleave_lowv4si; > + gen_interleave_second_low = gen_vec_interleave_lowv2di; > + inner_mode = HFmode; > + first_imode = V4SImode; > + second_imode = V2DImode; > + third_imode = VOIDmode; > + break; > case E_V8HImode: > gen_load_even = gen_vec_setv8hi; > gen_interleave_first_low = gen_vec_interleave_lowv4si; > @@ -14642,9 +14681,19 @@ ix86_expand_vector_init_interleave (machine_mode > mode, > > for (i = 0; i < n; i++) > { > + op = ops [i + i]; > + if (inner_mode == HFmode) > + { > + /* Convert HFmode to HImode. */ > + op1 = gen_reg_rtx (HImode); > + op1 = gen_rtx_SUBREG (HImode, force_reg (HFmode, op), 0); > + op = gen_reg_rtx (HImode); > + emit_move_insn (op, op1); > + } > + > /* Extend the odd elment to SImode using a paradoxical SUBREG. */ > op0 = gen_reg_rtx (SImode); > - emit_move_insn (op0, gen_lowpart (SImode, ops [i + i])); > + emit_move_insn (op0, gen_lowpart (SImode, op)); > > /* Insert the SImode value as low element of V4SImode vector. */ > op1 = gen_reg_rtx (V4SImode); > @@ -14781,6 +14830,10 @@ ix86_expand_vector_init_general (bool mmx_ok, > machine_mode mode, > half_mode = V8HImode; > goto half; > > + case E_V16HFmode: > + half_mode = V8HFmode; > + goto half; > + > half: > n = GET_MODE_NUNITS (mode); > for (i = 0; i < n; i++) > @@ -14804,6 +14857,11 @@ half: > half_mode = V16HImode; > goto quarter; > > + case E_V32HFmode: > + quarter_mode = V8HFmode; > + half_mode = V16HFmode; > + goto quarter; > + > quarter: > n = GET_MODE_NUNITS (mode); > for (i = 0; i < n; i++) > @@ -14840,6 +14898,9 @@ quarter: > move from GPR to SSE register directly. 
*/ > if (!TARGET_INTER_UNIT_MOVES_TO_VEC) > break; > + /* FALLTHRU */ > + > + case E_V8HFmode: > > n = GET_MODE_NUNITS (mode); > for (i = 0; i < n; i++) > @@ -15087,6 +15148,16 @@ ix86_expand_vector_set_var (rtx target, rtx val, rtx > idx) > case E_V16SFmode: > cmp_mode = V16SImode; > break; > + /* TARGET_AVX512FP16 implies TARGET_AVX512BW. */ > + case E_V8HFmode: > + cmp_mode = V8HImode; > + break; > + case E_V16HFmode: > + cmp_mode = V16HImode; > + break; > + case E_V32HFmode: > + cmp_mode = V32HImode; > + break; > default: > gcc_unreachable (); > } > @@ -15123,23 +15194,25 @@ ix86_expand_vector_set (bool mmx_ok, rtx target, > rtx val, int elt) > machine_mode half_mode; > bool use_vec_merge = false; > rtx tmp; > - static rtx (*gen_extract[6][2]) (rtx, rtx) > + static rtx (*gen_extract[7][2]) (rtx, rtx) > = { > { gen_vec_extract_lo_v32qi, gen_vec_extract_hi_v32qi }, > { gen_vec_extract_lo_v16hi, gen_vec_extract_hi_v16hi }, > { gen_vec_extract_lo_v8si, gen_vec_extract_hi_v8si }, > { gen_vec_extract_lo_v4di, gen_vec_extract_hi_v4di }, > { gen_vec_extract_lo_v8sf, gen_vec_extract_hi_v8sf }, > - { gen_vec_extract_lo_v4df, gen_vec_extract_hi_v4df } > + { gen_vec_extract_lo_v4df, gen_vec_extract_hi_v4df }, > + { gen_vec_extract_lo_v16hf, gen_vec_extract_hi_v16hf } > }; > - static rtx (*gen_insert[6][2]) (rtx, rtx, rtx) > + static rtx (*gen_insert[7][2]) (rtx, rtx, rtx) > = { > { gen_vec_set_lo_v32qi, gen_vec_set_hi_v32qi }, > { gen_vec_set_lo_v16hi, gen_vec_set_hi_v16hi }, > { gen_vec_set_lo_v8si, gen_vec_set_hi_v8si }, > { gen_vec_set_lo_v4di, gen_vec_set_hi_v4di }, > { gen_vec_set_lo_v8sf, gen_vec_set_hi_v8sf }, > - { gen_vec_set_lo_v4df, gen_vec_set_hi_v4df } > + { gen_vec_set_lo_v4df, gen_vec_set_hi_v4df }, > + { gen_vec_set_lo_v16hf, gen_vec_set_hi_v16hf }, > }; > int i, j, n; > machine_mode mmode = VOIDmode; > @@ -15306,6 +15379,10 @@ ix86_expand_vector_set (bool mmx_ok, rtx target, rtx > val, int elt) > } > return; > > + case E_V8HFmode: > + use_vec_merge = true; 
> + break; > + > case E_V8HImode: > case E_V2HImode: > use_vec_merge = TARGET_SSE2; > @@ -15329,6 +15406,12 @@ ix86_expand_vector_set (bool mmx_ok, rtx target, rtx > val, int elt) > n = 16; > goto half; > > + case E_V16HFmode: > + half_mode = V8HFmode; > + j = 6; > + n = 8; > + goto half; > + > case E_V16HImode: > half_mode = V8HImode; > j = 1; > @@ -15409,6 +15492,13 @@ half: > } > break; > > + case E_V32HFmode: > + if (TARGET_AVX512BW) > + { > + mmode = SImode; > + gen_blendm = gen_avx512bw_blendmv32hf; > + } > + break; > case E_V32HImode: > if (TARGET_AVX512BW) > { > @@ -15780,6 +15870,28 @@ ix86_expand_vector_extract (bool mmx_ok, rtx target, > rtx vec, int elt) > ix86_expand_vector_extract (false, target, tmp, elt & 3); > return; > > + case E_V32HFmode: > + tmp = gen_reg_rtx (V16HFmode); > + if (elt < 16) > + emit_insn (gen_vec_extract_lo_v32hf (tmp, vec)); > + else > + emit_insn (gen_vec_extract_hi_v32hf (tmp, vec)); > + ix86_expand_vector_extract (false, target, tmp, elt & 15); > + return; > + > + case E_V16HFmode: > + tmp = gen_reg_rtx (V8HFmode); > + if (elt < 8) > + emit_insn (gen_vec_extract_lo_v16hf (tmp, vec)); > + else > + emit_insn (gen_vec_extract_hi_v16hf (tmp, vec)); > + ix86_expand_vector_extract (false, target, tmp, elt & 7); > + return; > + > + case E_V8HFmode: > + use_vec_extr = true; > + break; > + > case E_V8QImode: > use_vec_extr = TARGET_MMX_WITH_SSE && TARGET_SSE4_1; > /* ??? Could extract the appropriate HImode element and shift. 
*/ > diff --git a/gcc/config/i386/i386-modes.def b/gcc/config/i386/i386-modes.def > index 9232f59a925..fcadfcd4c94 100644 > --- a/gcc/config/i386/i386-modes.def > +++ b/gcc/config/i386/i386-modes.def > @@ -84,12 +84,12 @@ VECTOR_MODES (INT, 16); /* V16QI V8HI V4SI V2DI */ > VECTOR_MODES (INT, 32); /* V32QI V16HI V8SI V4DI */ > VECTOR_MODES (INT, 64); /* V64QI V32HI V16SI V8DI */ > VECTOR_MODES (INT, 128); /* V128QI V64HI V32SI V16DI */ > -VECTOR_MODES (FLOAT, 8); /* V2SF */ > -VECTOR_MODES (FLOAT, 16); /* V4SF V2DF */ > -VECTOR_MODES (FLOAT, 32); /* V8SF V4DF V2TF */ > -VECTOR_MODES (FLOAT, 64); /* V16SF V8DF V4TF */ > -VECTOR_MODES (FLOAT, 128); /* V32SF V16DF V8TF */ > -VECTOR_MODES (FLOAT, 256); /* V64SF V32DF V16TF */ > +VECTOR_MODES (FLOAT, 8); /* V4HF V2SF */ > +VECTOR_MODES (FLOAT, 16); /* V8HF V4SF V2DF */ > +VECTOR_MODES (FLOAT, 32); /* V16HF V8SF V4DF V2TF */ > +VECTOR_MODES (FLOAT, 64); /* V32HF V16SF V8DF V4TF */ > +VECTOR_MODES (FLOAT, 128); /* V64HF V32SF V16DF V8TF */ > +VECTOR_MODES (FLOAT, 256); /* V128HF V64SF V32DF V16TF */ > VECTOR_MODE (INT, TI, 1); /* V1TI */ > VECTOR_MODE (INT, DI, 1); /* V1DI */ > VECTOR_MODE (INT, SI, 1); /* V1SI */ > diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c > index e826484a4f4..9fd36ff4c59 100644 > --- a/gcc/config/i386/i386.c > +++ b/gcc/config/i386/i386.c > @@ -2418,6 +2418,7 @@ classify_argument (machine_mode mode, const_tree type, > case E_V8SFmode: > case E_V8SImode: > case E_V32QImode: > + case E_V16HFmode: > case E_V16HImode: > case E_V4DFmode: > case E_V4DImode: > @@ -2428,6 +2429,7 @@ classify_argument (machine_mode mode, const_tree type, > return 4; > case E_V8DFmode: > case E_V16SFmode: > + case E_V32HFmode: > case E_V8DImode: > case E_V16SImode: > case E_V32HImode: > @@ -2445,6 +2447,7 @@ classify_argument (machine_mode mode, const_tree type, > case E_V4SImode: > case E_V16QImode: > case E_V8HImode: > + case E_V8HFmode: > case E_V2DFmode: > case E_V2DImode: > classes[0] = X86_64_SSE_CLASS; > 
@@ -2929,7 +2932,9 @@ function_arg_advance_64 (CUMULATIVE_ARGS *cum, > machine_mode mode, > > /* Unnamed 512 and 256bit vector mode parameters are passed on stack. */ > if (!named && (VALID_AVX512F_REG_MODE (mode) > - || VALID_AVX256_REG_MODE (mode))) > + || VALID_AVX256_REG_MODE (mode) > + || mode == V16HFmode > + || mode == V32HFmode)) > return 0; > > if (!examine_argument (mode, type, 0, &int_nregs, &sse_nregs) > @@ -3176,12 +3181,14 @@ function_arg_64 (const CUMULATIVE_ARGS *cum, > machine_mode mode, > default: > break; > > + case E_V16HFmode: > case E_V8SFmode: > case E_V8SImode: > case E_V32QImode: > case E_V16HImode: > case E_V4DFmode: > case E_V4DImode: > + case E_V32HFmode: > case E_V16SFmode: > case E_V16SImode: > case E_V64QImode: > @@ -4676,12 +4683,14 @@ ix86_gimplify_va_arg (tree valist, tree type, > gimple_seq *pre_p, > nat_mode = type_natural_mode (type, NULL, false); > switch (nat_mode) > { > + case E_V16HFmode: > case E_V8SFmode: > case E_V8SImode: > case E_V32QImode: > case E_V16HImode: > case E_V4DFmode: > case E_V4DImode: > + case E_V32HFmode: > case E_V16SFmode: > case E_V16SImode: > case E_V64QImode: > @@ -5348,7 +5357,12 @@ ix86_get_ssemov (rtx *operands, unsigned size, > switch (type) > { > case opcode_int: > - opcode = misaligned_p ? "vmovdqu32" : "vmovdqa32"; > + if (scalar_mode == E_HFmode) > + opcode = (misaligned_p > + ? (TARGET_AVX512BW ? "vmovdqu16" : "vmovdqu64") > + : "vmovdqa64"); > + else > + opcode = misaligned_p ? "vmovdqu32" : "vmovdqa32"; > break; > case opcode_float: > opcode = misaligned_p ? "vmovups" : "vmovaps"; > @@ -5362,6 +5376,11 @@ ix86_get_ssemov (rtx *operands, unsigned size, > { > switch (scalar_mode) > { > + case E_HFmode: > + opcode = (misaligned_p > + ? (TARGET_AVX512BW ? "vmovdqu16" : "vmovdqu64") > + : "vmovdqa64"); > + break; > case E_SFmode: > opcode = misaligned_p ? 
"%vmovups" : "%vmovaps"; > break; > @@ -19293,7 +19312,6 @@ inline_memory_move_cost (machine_mode mode, enum > reg_class regclass, int in) > int index; > switch (mode) > { > - case E_HFmode: > case E_SFmode: > index = 0; > break; > @@ -19394,31 +19412,12 @@ inline_memory_move_cost (machine_mode mode, enum > reg_class regclass, int in) > } > break; > case 2: > - { > - int cost; > - if (in == 2) > - cost = MAX (ix86_cost->hard_register.int_load[1], > - ix86_cost->hard_register.int_store[1]); > - else > - cost = in ? ix86_cost->hard_register.int_load[1] > - : ix86_cost->hard_register.int_store[1]; > - if (mode == E_HFmode) > - { > - /* Prefer SSE over GPR for HFmode. */ > - int sse_cost; > - int index = sse_store_index (mode); > - if (in == 2) > - sse_cost = MAX (ix86_cost->hard_register.sse_load[index], > - ix86_cost->hard_register.sse_store[index]); > - else > - sse_cost = (in > - ? ix86_cost->hard_register.sse_load [index] > - : ix86_cost->hard_register.sse_store [index]); > - if (sse_cost >= cost) > - cost = sse_cost + 1; > - } > - return cost; > - } > + if (in == 2) > + return MAX (ix86_cost->hard_register.int_load[1], > + ix86_cost->hard_register.int_store[1]); > + else > + return in ? ix86_cost->hard_register.int_load[1] > + : ix86_cost->hard_register.int_store[1]; > default: > if (in == 2) > cost = MAX (ix86_cost->hard_register.int_load[2], > @@ -19596,6 +19595,7 @@ ix86_hard_regno_mode_ok (unsigned int regno, > machine_mode mode) > between gpr and sse registser. */ > if (TARGET_AVX512F > && (mode == XImode > + || mode == V32HFmode > || VALID_AVX512F_REG_MODE (mode) > || VALID_AVX512F_SCALAR_MODE (mode))) > return true; > @@ -19610,9 +19610,7 @@ ix86_hard_regno_mode_ok (unsigned int regno, > machine_mode mode) > /* TODO check for QI/HI scalars. */ > /* AVX512VL allows sse regs16+ for 128/256 bit modes. 
*/ > if (TARGET_AVX512VL > - && (mode == OImode > - || mode == TImode > - || VALID_AVX256_REG_MODE (mode) > + && (VALID_AVX256_REG_OR_OI_VHF_MODE (mode) > || VALID_AVX512VL_128_REG_MODE (mode))) > return true; > > @@ -19622,9 +19620,9 @@ ix86_hard_regno_mode_ok (unsigned int regno, > machine_mode mode) > > /* OImode and AVX modes are available only when AVX is enabled. */ > return ((TARGET_AVX > - && VALID_AVX256_REG_OR_OI_MODE (mode)) > + && VALID_AVX256_REG_OR_OI_VHF_MODE (mode)) > || VALID_SSE_REG_MODE (mode) > - || VALID_SSE2_REG_MODE (mode) > + || VALID_SSE2_REG_VHF_MODE (mode) > || VALID_MMX_REG_MODE (mode) > || VALID_MMX_REG_MODE_3DNOW (mode)); > } > @@ -19837,7 +19835,8 @@ ix86_set_reg_reg_cost (machine_mode mode) > > case MODE_VECTOR_INT: > case MODE_VECTOR_FLOAT: > - if ((TARGET_AVX512F && VALID_AVX512F_REG_MODE (mode)) > + if ((TARGET_AVX512FP16 && VALID_AVX512FP16_REG_MODE (mode)) > + || (TARGET_AVX512F && VALID_AVX512F_REG_MODE (mode)) > || (TARGET_AVX && VALID_AVX256_REG_MODE (mode)) > || (TARGET_SSE2 && VALID_SSE2_REG_MODE (mode)) > || (TARGET_SSE && VALID_SSE_REG_MODE (mode)) > @@ -21703,6 +21702,8 @@ ix86_vector_mode_supported_p (machine_mode mode) > if ((TARGET_MMX || TARGET_MMX_WITH_SSE) > && VALID_MMX_REG_MODE (mode)) > return true; > + if (TARGET_AVX512FP16 && VALID_AVX512FP16_REG_MODE (mode)) > + return true; > if ((TARGET_3DNOW || TARGET_MMX_WITH_SSE) > && VALID_MMX_REG_MODE_3DNOW (mode)) > return true; > diff --git a/gcc/config/i386/i386.h b/gcc/config/i386/i386.h > index dca2ad32ed4..086dbafbcee 100644 > --- a/gcc/config/i386/i386.h > +++ b/gcc/config/i386/i386.h > @@ -995,8 +995,8 @@ extern const char *host_detect_local_cpu (int argc, const > char **argv); > || (MODE) == V4DImode || (MODE) == V2TImode || (MODE) == V8SFmode \ > || (MODE) == V4DFmode) > > -#define VALID_AVX256_REG_OR_OI_MODE(MODE) \ > - (VALID_AVX256_REG_MODE (MODE) || (MODE) == OImode) > +#define VALID_AVX256_REG_OR_OI_VHF_MODE(MODE) \ > + (VALID_AVX256_REG_MODE (MODE) || 
(MODE) == OImode || (MODE) == V16HFmode) > > #define VALID_AVX512F_SCALAR_MODE(MODE) > \ > ((MODE) == DImode || (MODE) == DFmode || (MODE) == SImode \ > @@ -1014,13 +1014,20 @@ extern const char *host_detect_local_cpu (int argc, > const char **argv); > #define VALID_AVX512VL_128_REG_MODE(MODE) \ > ((MODE) == V2DImode || (MODE) == V2DFmode || (MODE) == V16QImode \ > || (MODE) == V4SImode || (MODE) == V4SFmode || (MODE) == V8HImode \ > - || (MODE) == TFmode || (MODE) == V1TImode) > + || (MODE) == TFmode || (MODE) == V1TImode || (MODE) == V8HFmode \ > + || (MODE) == TImode) > + > +#define VALID_AVX512FP16_REG_MODE(MODE) > \ > + ((MODE) == V8HFmode || (MODE) == V16HFmode || (MODE) == V32HFmode) > > #define VALID_SSE2_REG_MODE(MODE) \ > ((MODE) == V16QImode || (MODE) == V8HImode || (MODE) == V2DFmode \ > || (MODE) == V4QImode || (MODE) == V2HImode || (MODE) == V1SImode \ > || (MODE) == V2DImode || (MODE) == DFmode || (MODE) == HFmode) > > +#define VALID_SSE2_REG_VHF_MODE(MODE) \ > + (VALID_SSE2_REG_MODE (MODE) || (MODE) == V8HFmode) > + > #define VALID_SSE_REG_MODE(MODE) \ > ((MODE) == V1TImode || (MODE) == TImode \ > || (MODE) == V4SFmode || (MODE) == V4SImode \ > @@ -1064,7 +1071,7 @@ extern const char *host_detect_local_cpu (int argc, > const char **argv); > || (MODE) == V4DImode || (MODE) == V8SFmode || (MODE) == V4DFmode \ > || (MODE) == V2TImode || (MODE) == V8DImode || (MODE) == V64QImode \ > || (MODE) == V16SImode || (MODE) == V32HImode || (MODE) == V8DFmode \ > - || (MODE) == V16SFmode) > + || (MODE) == V16SFmode || VALID_AVX512FP16_REG_MODE (MODE)) > > #define X87_FLOAT_MODE_P(MODE) \ > (TARGET_80387 && ((MODE) == SFmode || (MODE) == DFmode || (MODE) == > XFmode)) > diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md > index 8f11cbcf28b..20945fabb2c 100644 > --- a/gcc/config/i386/i386.md > +++ b/gcc/config/i386/i386.md > @@ -496,8 +496,8 @@ (define_attr "type" > > ;; Main data type used by the insn > (define_attr "mode" > - 
"unknown,none,QI,HI,SI,DI,TI,OI,XI,HF,SF,DF,XF,TF,V16SF,V8SF,V4DF,V4SF, > - V2DF,V2SF,V1DF,V8DF" > + "unknown,none,QI,HI,SI,DI,TI,OI,XI,HF,SF,DF,XF,TF,V32HF,V16HF,V8HF, > + V16SF,V8SF,V4DF,V4SF,V2DF,V2SF,V1DF,V8DF" > (const_string "unknown")) > > ;; The CPU unit operations uses. > @@ -1098,7 +1098,8 @@ (define_mode_attr MODE_SIZE [(QI "1") (HI "2") (SI "4") > (DI "8") > (V2DI "16") (V4DI "32") (V8DI "64") > (V1TI "16") (V2TI "32") (V4TI "64") > (V2DF "16") (V4DF "32") (V8DF "64") > - (V4SF "16") (V8SF "32") (V16SF "64")]) > + (V4SF "16") (V8SF "32") (V16SF "64") > + (V8HF "16") (V16HF "32") (V32HF "64")]) > > ;; Double word integer modes as mode attribute. > (define_mode_attr DWI [(QI "HI") (HI "SI") (SI "DI") (DI "TI") (TI "OI")]) > @@ -1236,9 +1237,9 @@ (define_mode_attr ssevecmodef [(SF "V4SF") (DF "V2DF") > (TF "TF")]) > ;; SSE instruction suffix for various modes > (define_mode_attr ssemodesuffix > [(HF "sh") (SF "ss") (DF "sd") > - (V16SF "ps") (V8DF "pd") > - (V8SF "ps") (V4DF "pd") > - (V4SF "ps") (V2DF "pd") > + (V32HF "ph") (V16SF "ps") (V8DF "pd") > + (V16HF "ph") (V8SF "ps") (V4DF "pd") > + (V8HF "ph") (V4SF "ps") (V2DF "pd") > (V16QI "b") (V8HI "w") (V4SI "d") (V2DI "q") > (V32QI "b") (V16HI "w") (V8SI "d") (V4DI "q") > (V64QI "b") (V32HI "w") (V16SI "d") (V8DI "q")]) > diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md > index ab29999023d..b004b5eee74 100644 > --- a/gcc/config/i386/sse.md > +++ b/gcc/config/i386/sse.md > @@ -225,6 +225,7 @@ (define_mode_iterator VMOVE > (V16SI "TARGET_AVX512F") (V8SI "TARGET_AVX") V4SI > (V8DI "TARGET_AVX512F") (V4DI "TARGET_AVX") V2DI > (V4TI "TARGET_AVX512F") (V2TI "TARGET_AVX") V1TI > + (V32HF "TARGET_AVX512F") (V16HF "TARGET_AVX") V8HF > (V16SF "TARGET_AVX512F") (V8SF "TARGET_AVX") V4SF > (V8DF "TARGET_AVX512F") (V4DF "TARGET_AVX") V2DF]) > > @@ -240,6 +241,13 @@ (define_mode_iterator VI12_AVX512VL > [V64QI (V16QI "TARGET_AVX512VL") (V32QI "TARGET_AVX512VL") > V32HI (V16HI "TARGET_AVX512VL") (V8HI 
"TARGET_AVX512VL")]) > > +(define_mode_iterator VI12HF_AVX512VL > + [V64QI (V16QI "TARGET_AVX512VL") (V32QI "TARGET_AVX512VL") > + V32HI (V16HI "TARGET_AVX512VL") (V8HI "TARGET_AVX512VL") > + (V32HF "TARGET_AVX512FP16") > + (V16HF "TARGET_AVX512FP16 && TARGET_AVX512VL") > + (V8HF "TARGET_AVX512FP16 && TARGET_AVX512VL")]) > + > ;; Same iterator, but without supposed TARGET_AVX512BW > (define_mode_iterator VI12_AVX512VLBW > [(V64QI "TARGET_AVX512BW") (V16QI "TARGET_AVX512VL") > @@ -255,6 +263,8 @@ (define_mode_iterator V > (V32HI "TARGET_AVX512F") (V16HI "TARGET_AVX") V8HI > (V16SI "TARGET_AVX512F") (V8SI "TARGET_AVX") V4SI > (V8DI "TARGET_AVX512F") (V4DI "TARGET_AVX") V2DI > + (V32HF "TARGET_AVX512FP16") (V16HF "TARGET_AVX512FP16") > + (V8HF "TARGET_AVX512FP16") > (V16SF "TARGET_AVX512F") (V8SF "TARGET_AVX") V4SF > (V8DF "TARGET_AVX512F") (V4DF "TARGET_AVX") (V2DF "TARGET_SSE2")]) > > @@ -277,7 +287,8 @@ (define_mode_iterator V_512 [V64QI V32HI V16SI V8DI V16SF > V8DF]) > (define_mode_iterator V_256_512 > [V32QI V16HI V8SI V4DI V8SF V4DF > (V64QI "TARGET_AVX512F") (V32HI "TARGET_AVX512F") (V16SI "TARGET_AVX512F") > - (V8DI "TARGET_AVX512F") (V16SF "TARGET_AVX512F") (V8DF "TARGET_AVX512F")]) > + (V8DI "TARGET_AVX512F") (V16SF "TARGET_AVX512F") (V8DF "TARGET_AVX512F") > + (V16HF "TARGET_AVX512FP16") (V32HF "TARGET_AVX512FP16")]) > > ;; All vector float modes > (define_mode_iterator VF > @@ -321,6 +332,11 @@ (define_mode_iterator VF2_512_256VL > (define_mode_iterator VF_128 > [V4SF (V2DF "TARGET_SSE2")]) > > +;; All 128bit vector HF/SF/DF modes > +(define_mode_iterator VFH_128 > + [(V8HF "TARGET_AVX512FP16") > + V4SF (V2DF "TARGET_SSE2")]) > + > ;; All 256bit vector float modes > (define_mode_iterator VF_256 > [V8SF V4DF]) > @@ -347,6 +363,9 @@ (define_mode_iterator VF2_AVX512VL > (define_mode_iterator VF1_AVX512VL > [V16SF (V8SF "TARGET_AVX512VL") (V4SF "TARGET_AVX512VL")]) > > +(define_mode_iterator VF_AVX512FP16 > + [V32HF V16HF V8HF]) > + > ;; All vector integer 
modes > (define_mode_iterator VI > [(V16SI "TARGET_AVX512F") (V8DI "TARGET_AVX512F") > @@ -355,6 +374,16 @@ (define_mode_iterator VI > (V8SI "TARGET_AVX") V4SI > (V4DI "TARGET_AVX") V2DI]) > > +;; All vector integer and HF modes > +(define_mode_iterator VIHF > + [(V16SI "TARGET_AVX512F") (V8DI "TARGET_AVX512F") > + (V64QI "TARGET_AVX512BW") (V32QI "TARGET_AVX") V16QI > + (V32HI "TARGET_AVX512BW") (V16HI "TARGET_AVX") V8HI > + (V8SI "TARGET_AVX") V4SI > + (V4DI "TARGET_AVX") V2DI > + (V32HF "TARGET_AVX512FP16") (V16HF "TARGET_AVX512FP16") > + (V8HF "TARGET_AVX512FP16")]) > + > (define_mode_iterator VI_AVX2 > [(V64QI "TARGET_AVX512BW") (V32QI "TARGET_AVX2") V16QI > (V32HI "TARGET_AVX512BW") (V16HI "TARGET_AVX2") V8HI > @@ -557,6 +586,7 @@ (define_mode_attr avx512 > (V8HI "avx512vl") (V16HI "avx512vl") (V32HI "avx512bw") > (V4SI "avx512vl") (V8SI "avx512vl") (V16SI "avx512f") > (V2DI "avx512vl") (V4DI "avx512vl") (V8DI "avx512f") > + (V8HF "avx512fp16") (V16HF "avx512vl") (V32HF "avx512bw") > (V4SF "avx512vl") (V8SF "avx512vl") (V16SF "avx512f") > (V2DF "avx512vl") (V4DF "avx512vl") (V8DF "avx512f")]) > > @@ -617,12 +647,13 @@ (define_mode_attr avx2_avx512 > (V8HI "avx512vl") (V16HI "avx512vl") (V32HI "avx512bw")]) > > (define_mode_attr shuffletype > - [(V16SF "f") (V16SI "i") (V8DF "f") (V8DI "i") > - (V8SF "f") (V8SI "i") (V4DF "f") (V4DI "i") > - (V4SF "f") (V4SI "i") (V2DF "f") (V2DI "i") > - (V32HI "i") (V16HI "i") (V8HI "i") > - (V64QI "i") (V32QI "i") (V16QI "i") > - (V4TI "i") (V2TI "i") (V1TI "i")]) > + [(V32HF "f") (V16HF "f") (V8HF "f") > + (V16SF "f") (V16SI "i") (V8DF "f") (V8DI "i") > + (V8SF "f") (V8SI "i") (V4DF "f") (V4DI "i") > + (V4SF "f") (V4SI "i") (V2DF "f") (V2DI "i") > + (V32HI "i") (V16HI "i") (V8HI "i") > + (V64QI "i") (V32QI "i") (V16QI "i") > + (V4TI "i") (V2TI "i") (V1TI "i")]) > > (define_mode_attr ssequartermode > [(V16SF "V4SF") (V8DF "V2DF") (V16SI "V4SI") (V8DI "V2DI")]) > @@ -659,6 +690,8 @@ (define_mode_iterator VI_256 [V32QI V16HI 
V8SI V4DI]) > > ;; All 128 and 256bit vector integer modes > (define_mode_iterator VI_128_256 [V16QI V8HI V4SI V2DI V32QI V16HI V8SI > V4DI]) > +;; All 256bit vector integer and HF modes > +(define_mode_iterator VIHF_256 [V32QI V16HI V8SI V4DI V16HF]) > > ;; Various 128bit vector integer mode combinations > (define_mode_iterator VI12_128 [V16QI V8HI]) > @@ -680,6 +713,9 @@ (define_mode_iterator VI48_512 [V16SI V8DI]) > (define_mode_iterator VI4_256_8_512 [V8SI V8DI]) > (define_mode_iterator VI_AVX512BW > [V16SI V8DI (V32HI "TARGET_AVX512BW") (V64QI "TARGET_AVX512BW")]) > +(define_mode_iterator VIHF_AVX512BW > + [V16SI V8DI (V32HI "TARGET_AVX512BW") (V64QI "TARGET_AVX512BW") > + (V32HF "TARGET_AVX512FP16")]) > > ;; Int-float size matches > (define_mode_iterator VI4F_128 [V4SI V4SF]) > @@ -720,6 +756,9 @@ (define_mode_iterator VF_AVX512 > (V8SF "TARGET_AVX512VL") (V4DF "TARGET_AVX512VL") > V16SF V8DF]) > > +(define_mode_iterator V16_256 [V16HI V16HF]) > +(define_mode_iterator V32_512 [V32HI V32HF]) > + > (define_mode_attr avx512bcst > [(V4SI "%{1to4%}") (V2DI "%{1to2%}") > (V8SI "%{1to8%}") (V4DI "%{1to4%}") > @@ -730,8 +769,10 @@ (define_mode_attr avx512bcst > > ;; Mapping from float mode to required SSE level > (define_mode_attr sse > - [(SF "sse") (DF "sse2") > + [(SF "sse") (DF "sse2") (HF "avx512fp16") > (V4SF "sse") (V2DF "sse2") > + (V32HF "avx512fp16") (V16HF "avx512fp16") > + (V8HF "avx512fp16") > (V16SF "avx512f") (V8SF "avx") > (V8DF "avx512f") (V4DF "avx")]) > > @@ -767,14 +808,23 @@ (define_mode_attr sseinsnmode > (V16SF "V16SF") (V8DF "V8DF") > (V8SF "V8SF") (V4DF "V4DF") > (V4SF "V4SF") (V2DF "V2DF") > + (V8HF "TI") (V16HF "OI") (V32HF "XI") > (TI "TI")]) > > +;; SSE integer instruction suffix for various modes > +(define_mode_attr sseintmodesuffix > + [(V16QI "b") (V8HI "w") (V4SI "d") (V2DI "q") > + (V32QI "b") (V16HI "w") (V8SI "d") (V4DI "q") > + (V64QI "b") (V32HI "w") (V16SI "d") (V8DI "q") > + (V8HF "w") (V16HF "w") (V32HF "w")]) > + > ;; 
Mapping of vector modes to corresponding mask size > (define_mode_attr avx512fmaskmode > [(V64QI "DI") (V32QI "SI") (V16QI "HI") > (V32HI "SI") (V16HI "HI") (V8HI "QI") (V4HI "QI") > (V16SI "HI") (V8SI "QI") (V4SI "QI") > (V8DI "QI") (V4DI "QI") (V2DI "QI") > + (V32HF "SI") (V16HF "HI") (V8HF "QI") > (V16SF "HI") (V8SF "QI") (V4SF "QI") > (V8DF "QI") (V4DF "QI") (V2DF "QI")]) > > @@ -784,6 +834,7 @@ (define_mode_attr avx512fmaskmodelower > (V32HI "si") (V16HI "hi") (V8HI "qi") (V4HI "qi") > (V16SI "hi") (V8SI "qi") (V4SI "qi") > (V8DI "qi") (V4DI "qi") (V2DI "qi") > + (V32HF "si") (V16HF "hi") (V8HF "qi") > (V16SF "hi") (V8SF "qi") (V4SF "qi") > (V8DF "qi") (V4DF "qi") (V2DF "qi")]) > > @@ -828,7 +879,8 @@ (define_mode_attr ssedoublevecmode > (V16QI "V32QI") (V8HI "V16HI") (V4SI "V8SI") (V2DI "V4DI") > (V16SF "V32SF") (V8DF "V16DF") > (V8SF "V16SF") (V4DF "V8DF") > - (V4SF "V8SF") (V2DF "V4DF")]) > + (V4SF "V8SF") (V2DF "V4DF") > + (V32HF "V64HF") (V16HF "V32HF") (V8HF "V16HF")]) > > ;; Mapping of vector modes to a vector mode of half size > ;; instead of V1DI/V1DF, DI/DF are used for V2DI/V2DF although they are > scalar. 
> @@ -838,7 +890,8 @@ (define_mode_attr ssehalfvecmode > (V16QI "V8QI") (V8HI "V4HI") (V4SI "V2SI") (V2DI "DI") > (V16SF "V8SF") (V8DF "V4DF") > (V8SF "V4SF") (V4DF "V2DF") > - (V4SF "V2SF") (V2DF "DF")]) > + (V4SF "V2SF") (V2DF "DF") > + (V32HF "V16HF") (V16HF "V8HF") (V8HF "V4HF")]) > > (define_mode_attr ssehalfvecmodelower > [(V64QI "v32qi") (V32HI "v16hi") (V16SI "v8si") (V8DI "v4di") (V4TI "v2ti") > @@ -846,9 +899,10 @@ (define_mode_attr ssehalfvecmodelower > (V16QI "v8qi") (V8HI "v4hi") (V4SI "v2si") > (V16SF "v8sf") (V8DF "v4df") > (V8SF "v4sf") (V4DF "v2df") > - (V4SF "v2sf")]) > + (V4SF "v2sf") > + (V32HF "v16hf") (V16HF "v8hf") (V8HF "v4hf")]) > > -;; Mapping of vector modes ti packed single mode of the same size > +;; Mapping of vector modes to packed single mode of the same size > (define_mode_attr ssePSmode > [(V16SI "V16SF") (V8DF "V16SF") > (V16SF "V16SF") (V8DI "V16SF") > @@ -858,7 +912,8 @@ (define_mode_attr ssePSmode > (V4DI "V8SF") (V2DI "V4SF") > (V4TI "V16SF") (V2TI "V8SF") (V1TI "V4SF") > (V8SF "V8SF") (V4SF "V4SF") > - (V4DF "V8SF") (V2DF "V4SF")]) > + (V4DF "V8SF") (V2DF "V4SF") > + (V32HF "V16SF") (V16HF "V8SF") (V8HF "V4SF")]) > > (define_mode_attr ssePSmode2 > [(V8DI "V8SF") (V4DI "V4SF")]) > @@ -869,6 +924,7 @@ (define_mode_attr ssescalarmode > (V32HI "HI") (V16HI "HI") (V8HI "HI") > (V16SI "SI") (V8SI "SI") (V4SI "SI") > (V8DI "DI") (V4DI "DI") (V2DI "DI") > + (V32HF "HF") (V16HF "HF") (V8HF "HF") > (V16SF "SF") (V8SF "SF") (V4SF "SF") > (V8DF "DF") (V4DF "DF") (V2DF "DF") > (V4TI "TI") (V2TI "TI")]) > @@ -879,6 +935,7 @@ (define_mode_attr ssescalarmodelower > (V32HI "hi") (V16HI "hi") (V8HI "hi") > (V16SI "si") (V8SI "si") (V4SI "si") > (V8DI "di") (V4DI "di") (V2DI "di") > + (V32HF "hf") (V16HF "hf") (V8HF "hf") > (V16SF "sf") (V8SF "sf") (V4SF "sf") > (V8DF "df") (V4DF "df") (V2DF "df") > (V4TI "ti") (V2TI "ti")]) > @@ -889,6 +946,7 @@ (define_mode_attr ssexmmmode > (V32HI "V8HI") (V16HI "V8HI") (V8HI "V8HI") > (V16SI "V4SI") (V8SI 
"V4SI") (V4SI "V4SI") > (V8DI "V2DI") (V4DI "V2DI") (V2DI "V2DI") > + (V32HF "V8HF") (V16HF "V8HF") (V8HF "V8HF") > (V16SF "V4SF") (V8SF "V4SF") (V4SF "V4SF") > (V8DF "V2DF") (V4DF "V2DF") (V2DF "V2DF")]) > > @@ -931,10 +989,11 @@ (define_mode_attr ssescalarsize > (V64QI "8") (V32QI "8") (V16QI "8") > (V32HI "16") (V16HI "16") (V8HI "16") > (V16SI "32") (V8SI "32") (V4SI "32") > + (V32HF "16") (V16HF "16") (V8HF "16") > (V16SF "32") (V8SF "32") (V4SF "32") > (V8DF "64") (V4DF "64") (V2DF "64")]) > > -;; SSE prefix for integer vector modes > +;; SSE prefix for integer and HF vector modes > (define_mode_attr sseintprefix > [(V2DI "p") (V2DF "") > (V4DI "p") (V4DF "") > @@ -942,16 +1001,16 @@ (define_mode_attr sseintprefix > (V4SI "p") (V4SF "") > (V8SI "p") (V8SF "") > (V16SI "p") (V16SF "") > - (V16QI "p") (V8HI "p") > - (V32QI "p") (V16HI "p") > - (V64QI "p") (V32HI "p")]) > + (V16QI "p") (V8HI "p") (V8HF "p") > + (V32QI "p") (V16HI "p") (V16HF "p") > + (V64QI "p") (V32HI "p") (V32HF "p")]) > > ;; SSE scalar suffix for vector modes > (define_mode_attr ssescalarmodesuffix > - [(SF "ss") (DF "sd") > - (V16SF "ss") (V8DF "sd") > - (V8SF "ss") (V4DF "sd") > - (V4SF "ss") (V2DF "sd") > + [(HF "sh") (SF "ss") (DF "sd") > + (V32HF "sh") (V16SF "ss") (V8DF "sd") > + (V16HF "sh") (V8SF "ss") (V4DF "sd") > + (V8HF "sh") (V4SF "ss") (V2DF "sd") > (V16SI "d") (V8DI "q") > (V8SI "d") (V4DI "q") > (V4SI "d") (V2DI "q")]) > @@ -979,7 +1038,8 @@ (define_mode_attr castmode > ;; i128 for integer vectors and TARGET_AVX2, f128 otherwise. > ;; i64x4 or f64x4 for 512bit modes. 
> (define_mode_attr i128 > - [(V16SF "f64x4") (V8SF "f128") (V8DF "f64x4") (V4DF "f128") > + [(V16HF "%~128") (V32HF "i64x4") (V16SF "f64x4") (V8SF "f128") > + (V8DF "f64x4") (V4DF "f128") > (V64QI "i64x4") (V32QI "%~128") (V32HI "i64x4") (V16HI "%~128") > (V16SI "i64x4") (V8SI "%~128") (V8DI "i64x4") (V4DI "%~128")]) > > @@ -1003,14 +1063,18 @@ (define_mode_attr bcstscalarsuff > (V32HI "w") (V16HI "w") (V8HI "w") > (V16SI "d") (V8SI "d") (V4SI "d") > (V8DI "q") (V4DI "q") (V2DI "q") > + (V32HF "w") (V16HF "w") (V8HF "w") > (V16SF "ss") (V8SF "ss") (V4SF "ss") > (V8DF "sd") (V4DF "sd") (V2DF "sd")]) > > ;; Tie mode of assembler operand to mode iterator > (define_mode_attr xtg_mode > - [(V16QI "x") (V8HI "x") (V4SI "x") (V2DI "x") (V4SF "x") (V2DF "x") > - (V32QI "t") (V16HI "t") (V8SI "t") (V4DI "t") (V8SF "t") (V4DF "t") > - (V64QI "g") (V32HI "g") (V16SI "g") (V8DI "g") (V16SF "g") (V8DF "g")]) > + [(V16QI "x") (V8HI "x") (V4SI "x") (V2DI "x") > + (V8HF "x") (V4SF "x") (V2DF "x") > + (V32QI "t") (V16HI "t") (V8SI "t") (V4DI "t") > + (V16HF "t") (V8SF "t") (V4DF "t") > + (V64QI "g") (V32HI "g") (V16SI "g") (V8DI "g") > + (V32HF "g") (V16SF "g") (V8DF "g")]) > > ;; Half mask mode for unpacks > (define_mode_attr HALFMASKMODE > @@ -1306,6 +1370,20 @@ (define_insn "<avx512>_blendm<mode>" > (set_attr "prefix" "evex") > (set_attr "mode" "<sseinsnmode>")]) > > +(define_insn "<avx512>_blendm<mode>" > + [(set (match_operand:VF_AVX512FP16 0 "register_operand" "=v,v") > + (vec_merge:VF_AVX512FP16 > + (match_operand:VF_AVX512FP16 2 "nonimmediate_operand" "vm,vm") > + (match_operand:VF_AVX512FP16 1 "nonimm_or_0_operand" "0C,v") > + (match_operand:<avx512fmaskmode> 3 "register_operand" "Yk,Yk")))] > + "TARGET_AVX512BW" > + "@ > + vmovdqu<ssescalarsize>\t{%2, %0%{%3%}%N1|%0%{%3%}%N1, %2} > + vpblendmw\t{%2, %1, %0%{%3%}|%0%{%3%}, %1, %2}" > + [(set_attr "type" "ssemov") > + (set_attr "prefix" "evex") > + (set_attr "mode" "<sseinsnmode>")]) > + > (define_insn 
"<avx512>_store<mode>_mask" > [(set (match_operand:V48_AVX512VL 0 "memory_operand" "=m") > (vec_merge:V48_AVX512VL > @@ -1903,12 +1981,12 @@ (define_insn "*<insn><mode>3<mask_name><round_name>" > ;; Standard scalar operation patterns which preserve the rest of the > ;; vector for combiner. > (define_insn "*<sse>_vm<insn><mode>3" > - [(set (match_operand:VF_128 0 "register_operand" "=x,v") > - (vec_merge:VF_128 > - (vec_duplicate:VF_128 > + [(set (match_operand:VFH_128 0 "register_operand" "=x,v") > + (vec_merge:VFH_128 > + (vec_duplicate:VFH_128 > (plusminus:<ssescalarmode> > (vec_select:<ssescalarmode> > - (match_operand:VF_128 1 "register_operand" "0,v") > + (match_operand:VFH_128 1 "register_operand" "0,v") > (parallel [(const_int 0)])) > (match_operand:<ssescalarmode> 2 "nonimmediate_operand" > "xm,vm"))) > (match_dup 1) > @@ -1919,7 +1997,16 @@ (define_insn "*<sse>_vm<insn><mode>3" > v<plusminus_mnemonic><ssescalarmodesuffix>\t{%2, %1, %0|%0, %1, %2}" > [(set_attr "isa" "noavx,avx") > (set_attr "type" "sseadd") > - (set_attr "prefix" "orig,vex") > + (set (attr "prefix") > + (cond [(eq_attr "alternative" "0") > + (const_string "orig") > + (eq_attr "alternative" "1") > + (if_then_else > + (match_test "<MODE>mode == V8HFmode") > + (const_string "evex") > + (const_string "vex")) > + ] > + (const_string "*"))) > (set_attr "mode" "<ssescalarmode>")]) > > (define_insn "<sse>_vm<insn><mode>3<mask_scalar_name><round_scalar_name>" > @@ -1966,12 +2053,12 @@ (define_insn "*mul<mode>3<mask_name><round_name>" > ;; Standard scalar operation patterns which preserve the rest of the > ;; vector for combiner. 
> (define_insn "*<sse>_vm<multdiv_mnemonic><mode>3" > - [(set (match_operand:VF_128 0 "register_operand" "=x,v") > - (vec_merge:VF_128 > - (vec_duplicate:VF_128 > + [(set (match_operand:VFH_128 0 "register_operand" "=x,v") > + (vec_merge:VFH_128 > + (vec_duplicate:VFH_128 > (multdiv:<ssescalarmode> > (vec_select:<ssescalarmode> > - (match_operand:VF_128 1 "register_operand" "0,v") > + (match_operand:VFH_128 1 "register_operand" "0,v") > (parallel [(const_int 0)])) > (match_operand:<ssescalarmode> 2 "nonimmediate_operand" > "xm,vm"))) > (match_dup 1) > @@ -1982,7 +2069,16 @@ (define_insn "*<sse>_vm<multdiv_mnemonic><mode>3" > v<multdiv_mnemonic><ssescalarmodesuffix>\t{%2, %1, %0|%0, %1, %2}" > [(set_attr "isa" "noavx,avx") > (set_attr "type" "sse<multdiv_mnemonic>") > - (set_attr "prefix" "orig,vex") > + (set (attr "prefix") > + (cond [(eq_attr "alternative" "0") > + (const_string "orig") > + (eq_attr "alternative" "1") > + (if_then_else > + (match_test "<MODE>mode == V8HFmode") > + (const_string "evex") > + (const_string "vex")) > + ] > + (const_string "*"))) > (set_attr "btver2_decode" "direct,double") > (set_attr "mode" "<ssescalarmode>")]) > > @@ -2368,12 +2464,12 @@ (define_insn > "ieee_<ieee_maxmin><mode>3<mask_name><round_saeonly_name>" > ;; Standard scalar operation patterns which preserve the rest of the > ;; vector for combiner. 
> (define_insn "*ieee_<ieee_maxmin><mode>3" > - [(set (match_operand:VF_128 0 "register_operand" "=x,v") > - (vec_merge:VF_128 > - (vec_duplicate:VF_128 > + [(set (match_operand:VFH_128 0 "register_operand" "=x,v") > + (vec_merge:VFH_128 > + (vec_duplicate:VFH_128 > (unspec:<ssescalarmode> > [(vec_select:<ssescalarmode> > - (match_operand:VF_128 1 "register_operand" "0,v") > + (match_operand:VFH_128 1 "register_operand" "0,v") > (parallel [(const_int 0)])) > (match_operand:<ssescalarmode> 2 "nonimmediate_operand" > "xm,vm")] > IEEE_MAXMIN)) > @@ -2386,7 +2482,16 @@ (define_insn "*ieee_<ieee_maxmin><mode>3" > [(set_attr "isa" "noavx,avx") > (set_attr "type" "sseadd") > (set_attr "btver2_sse_attr" "maxmin") > - (set_attr "prefix" "orig,vex") > + (set (attr "prefix") > + (cond [(eq_attr "alternative" "0") > + (const_string "orig") > + (eq_attr "alternative" "1") > + (if_then_else > + (match_test "<MODE>mode == V8HFmode") > + (const_string "evex") > + (const_string "vex")) > + ] > + (const_string "*"))) > (set_attr "mode" "<ssescalarmode>")]) > > (define_insn > "<sse>_vm<code><mode>3<mask_scalar_name><round_saeonly_scalar_name>" > @@ -8364,6 +8469,45 @@ (define_insn "vec_set<mode>_0" > ] > (symbol_ref "true")))]) > > +;; vmovw also clears the higher bits > +(define_insn "vec_set<mode>_0" > + [(set (match_operand:VF_AVX512FP16 0 "register_operand" "=v") > + (vec_merge:VF_AVX512FP16 > + (vec_duplicate:VF_AVX512FP16 > + (match_operand:HF 2 "nonimmediate_operand" "rm")) > + (match_operand:VF_AVX512FP16 1 "const0_operand" "C") > + (const_int 1)))] > + "TARGET_AVX512FP16" > + "vmovw\t{%2, %x0|%x0, %2}" > + [(set_attr "type" "ssemov") > + (set_attr "prefix" "evex") > + (set_attr "mode" "HF")]) > + > +(define_insn "*avx512fp16_movsh" > + [(set (match_operand:V8HF 0 "register_operand" "=v") > + (vec_merge:V8HF > + (vec_duplicate:V8HF > + (match_operand:HF 2 "register_operand" "v")) > + (match_operand:V8HF 1 "register_operand" "v") > + (const_int 1)))] > + "TARGET_AVX512FP16" > + 
"vmovsh\t{%2, %1, %0|%0, %1, %2}" > + [(set_attr "type" "ssemov") > + (set_attr "prefix" "evex") > + (set_attr "mode" "HF")]) > + > +(define_insn "avx512fp16_movsh" > + [(set (match_operand:V8HF 0 "register_operand" "=v") > + (vec_merge:V8HF > + (match_operand:V8HF 2 "register_operand" "v") > + (match_operand:V8HF 1 "register_operand" "v") > + (const_int 1)))] > + "TARGET_AVX512FP16" > + "vmovsh\t{%2, %1, %0|%0, %1, %2}" > + [(set_attr "type" "ssemov") > + (set_attr "prefix" "evex") > + (set_attr "mode" "HF")]) > + > ;; A subset is vec_setv4sf. > (define_insn "*vec_setv4sf_sse4_1" > [(set (match_operand:V4SF 0 "register_operand" "=Yr,*x,v") > @@ -8499,6 +8643,20 @@ (define_expand "vec_set<mode>" > DONE; > }) > > +(define_expand "vec_setv8hf" > + [(match_operand:V8HF 0 "register_operand") > + (match_operand:HF 1 "register_operand") > + (match_operand 2 "vec_setm_sse41_operand")] > + "TARGET_SSE" > +{ > + if (CONST_INT_P (operands[2])) > + ix86_expand_vector_set (false, operands[0], operands[1], > + INTVAL (operands[2])); > + else > + ix86_expand_vector_set_var (operands[0], operands[1], operands[2]); > + DONE; > +}) > + > (define_expand "vec_set<mode>" > [(match_operand:V_256_512 0 "register_operand") > (match_operand:<ssescalarmode> 1 "register_operand") > @@ -9214,10 +9372,10 @@ (define_insn "vec_extract_hi_<mode>" > (set_attr "length_immediate" "1") > (set_attr "mode" "<sseinsnmode>")]) > > -(define_insn_and_split "vec_extract_lo_v32hi" > - [(set (match_operand:V16HI 0 "nonimmediate_operand" "=v,v,m") > - (vec_select:V16HI > - (match_operand:V32HI 1 "nonimmediate_operand" "v,m,v") > +(define_insn_and_split "vec_extract_lo_<mode>" > + [(set (match_operand:<ssehalfvecmode> 0 "nonimmediate_operand" "=v,v,m") > + (vec_select:<ssehalfvecmode> > + (match_operand:V32_512 1 "nonimmediate_operand" "v,m,v") > (parallel [(const_int 0) (const_int 1) > (const_int 2) (const_int 3) > (const_int 4) (const_int 5) > @@ -9244,9 +9402,10 @@ (define_insn_and_split 
"vec_extract_lo_v32hi" > if (!TARGET_AVX512VL > && REG_P (operands[0]) > && EXT_REX_SSE_REG_P (operands[1])) > - operands[0] = lowpart_subreg (V32HImode, operands[0], V16HImode); > + operands[0] = lowpart_subreg (<MODE>mode, operands[0], > + <ssehalfvecmode>mode); > else > - operands[1] = gen_lowpart (V16HImode, operands[1]); > + operands[1] = gen_lowpart (<ssehalfvecmode>mode, operands[1]); > } > [(set_attr "type" "sselog1") > (set_attr "prefix_extra" "1") > @@ -9255,10 +9414,10 @@ (define_insn_and_split "vec_extract_lo_v32hi" > (set_attr "prefix" "evex") > (set_attr "mode" "XI")]) > > -(define_insn "vec_extract_hi_v32hi" > - [(set (match_operand:V16HI 0 "nonimmediate_operand" "=vm") > - (vec_select:V16HI > - (match_operand:V32HI 1 "register_operand" "v") > +(define_insn "vec_extract_hi_<mode>" > + [(set (match_operand:<ssehalfvecmode> 0 "nonimmediate_operand" "=vm") > + (vec_select:<ssehalfvecmode> > + (match_operand:V32_512 1 "register_operand" "v") > (parallel [(const_int 16) (const_int 17) > (const_int 18) (const_int 19) > (const_int 20) (const_int 21) > @@ -9275,10 +9434,10 @@ (define_insn "vec_extract_hi_v32hi" > (set_attr "prefix" "evex") > (set_attr "mode" "XI")]) > > -(define_insn_and_split "vec_extract_lo_v16hi" > - [(set (match_operand:V8HI 0 "nonimmediate_operand" "=v,m") > - (vec_select:V8HI > - (match_operand:V16HI 1 "nonimmediate_operand" "vm,v") > +(define_insn_and_split "vec_extract_lo_<mode>" > + [(set (match_operand:<ssehalfvecmode> 0 "nonimmediate_operand" "=v,m") > + (vec_select:<ssehalfvecmode> > + (match_operand:V16_256 1 "nonimmediate_operand" "vm,v") > (parallel [(const_int 0) (const_int 1) > (const_int 2) (const_int 3) > (const_int 4) (const_int 5) > @@ -9287,12 +9446,12 @@ (define_insn_and_split "vec_extract_lo_v16hi" > "#" > "&& reload_completed" > [(set (match_dup 0) (match_dup 1))] > - "operands[1] = gen_lowpart (V8HImode, operands[1]);") > + "operands[1] = gen_lowpart (<ssehalfvecmode>mode, operands[1]);") > > -(define_insn 
"vec_extract_hi_v16hi" > - [(set (match_operand:V8HI 0 "nonimmediate_operand" "=xm,vm,vm") > - (vec_select:V8HI > - (match_operand:V16HI 1 "register_operand" "x,v,v") > +(define_insn "vec_extract_hi_<mode>" > + [(set (match_operand:<ssehalfvecmode> 0 "nonimmediate_operand" "=xm,vm,vm") > + (vec_select:<ssehalfvecmode> > + (match_operand:V16_256 1 "register_operand" "x,v,v") > (parallel [(const_int 8) (const_int 9) > (const_int 10) (const_int 11) > (const_int 12) (const_int 13) > @@ -9428,12 +9587,41 @@ (define_insn "vec_extract_hi_v32qi" > (set_attr "prefix" "vex,evex,evex") > (set_attr "mode" "OI")]) > > +;; NB: *vec_extract<mode>_0 must be placed before *vec_extracthf. > +;; Otherwise, it will be ignored. > +(define_insn_and_split "*vec_extract<mode>_0" > + [(set (match_operand:HF 0 "nonimmediate_operand" "=v,m,r") > + (vec_select:HF > + (match_operand:VF_AVX512FP16 1 "nonimmediate_operand" "vm,v,m") > + (parallel [(const_int 0)])))] > + "TARGET_AVX512FP16 && !(MEM_P (operands[0]) && MEM_P (operands[1]))" > + "#" > + "&& reload_completed" > + [(set (match_dup 0) (match_dup 1))] > + "operands[1] = gen_lowpart (HFmode, operands[1]);") > + > +(define_insn "*vec_extracthf" > + [(set (match_operand:HF 0 "register_sse4nonimm_operand" "=r,m") > + (vec_select:HF > + (match_operand:V8HF 1 "register_operand" "v,v") > + (parallel > + [(match_operand:SI 2 "const_0_to_7_operand")])))] > + "TARGET_AVX512FP16" > + "@ > + vpextrw\t{%2, %1, %k0|%k0, %1, %2} > + vpextrw\t{%2, %1, %0|%0, %1, %2}" > + [(set_attr "type" "sselog1") > + (set_attr "prefix" "maybe_evex") > + (set_attr "mode" "TI")]) > + > ;; Modes handled by vec_extract patterns. 
> (define_mode_iterator VEC_EXTRACT_MODE > [(V64QI "TARGET_AVX512BW") (V32QI "TARGET_AVX") V16QI > (V32HI "TARGET_AVX512BW") (V16HI "TARGET_AVX") V8HI > (V16SI "TARGET_AVX512F") (V8SI "TARGET_AVX") V4SI > (V8DI "TARGET_AVX512F") (V4DI "TARGET_AVX") V2DI > + (V32HF "TARGET_AVX512FP16") (V16HF "TARGET_AVX512FP16") > + (V8HF "TARGET_AVX512FP16") > (V16SF "TARGET_AVX512F") (V8SF "TARGET_AVX") V4SF > (V8DF "TARGET_AVX512F") (V4DF "TARGET_AVX") V2DF > (V4TI "TARGET_AVX512F") (V2TI "TARGET_AVX")]) > @@ -14666,16 +14854,16 @@ (define_expand "vec_interleave_low<mode>" > > ;; Modes handled by pinsr patterns. > (define_mode_iterator PINSR_MODE > - [(V16QI "TARGET_SSE4_1") V8HI > + [(V16QI "TARGET_SSE4_1") V8HI (V8HF "TARGET_AVX512FP16") > (V4SI "TARGET_SSE4_1") > (V2DI "TARGET_SSE4_1 && TARGET_64BIT")]) > > (define_mode_attr sse2p4_1 > - [(V16QI "sse4_1") (V8HI "sse2") > + [(V16QI "sse4_1") (V8HI "sse2") (V8HF "sse4_1") > (V4SI "sse4_1") (V2DI "sse4_1")]) > > (define_mode_attr pinsr_evex_isa > - [(V16QI "avx512bw") (V8HI "avx512bw") > + [(V16QI "avx512bw") (V8HI "avx512bw") (V8HF "avx512bw") > (V4SI "avx512dq") (V2DI "avx512dq")]) > > ;; sse4_1_pinsrd must come before sse2_loadld since it is preferred. 
> @@ -14703,11 +14891,19 @@ (define_insn "<sse2p4_1>_pinsr<ssemodesuffix>" > case 2: > case 4: > if (GET_MODE_SIZE (<ssescalarmode>mode) < GET_MODE_SIZE (SImode)) > - return "vpinsr<ssemodesuffix>\t{%3, %k2, %1, %0|%0, %1, %k2, %3}"; > + { > + if (<MODE>mode == V8HFmode) > + return "vpinsrw\t{%3, %k2, %1, %0|%0, %1, %k2, %3}"; > + else > + return "vpinsr<ssemodesuffix>\t{%3, %k2, %1, %0|%0, %1, %k2, %3}"; > + } > /* FALLTHRU */ > case 3: > case 5: > - return "vpinsr<ssemodesuffix>\t{%3, %2, %1, %0|%0, %1, %2, %3}"; > + if (<MODE>mode == V8HFmode) > + return "vpinsrw\t{%3, %2, %1, %0|%0, %1, %2, %3}"; > + else > + return "vpinsr<ssemodesuffix>\t{%3, %2, %1, %0|%0, %1, %2, %3}"; > default: > gcc_unreachable (); > } > @@ -21122,16 +21318,17 @@ (define_mode_attr pbroadcast_evex_isa > [(V64QI "avx512bw") (V32QI "avx512bw") (V16QI "avx512bw") > (V32HI "avx512bw") (V16HI "avx512bw") (V8HI "avx512bw") > (V16SI "avx512f") (V8SI "avx512f") (V4SI "avx512f") > - (V8DI "avx512f") (V4DI "avx512f") (V2DI "avx512f")]) > + (V8DI "avx512f") (V4DI "avx512f") (V2DI "avx512f") > + (V32HF "avx512bw") (V16HF "avx512bw") (V8HF "avx512bw")]) > > (define_insn "avx2_pbroadcast<mode>" > - [(set (match_operand:VI 0 "register_operand" "=x,v") > - (vec_duplicate:VI > + [(set (match_operand:VIHF 0 "register_operand" "=x,v") > + (vec_duplicate:VIHF > (vec_select:<ssescalarmode> > (match_operand:<ssexmmmode> 1 "nonimmediate_operand" "xm,vm") > (parallel [(const_int 0)]))))] > "TARGET_AVX2" > - "vpbroadcast<ssemodesuffix>\t{%1, %0|%0, %<iptr>1}" > + "vpbroadcast<sseintmodesuffix>\t{%1, %0|%0, %<iptr>1}" > [(set_attr "isa" "*,<pbroadcast_evex_isa>") > (set_attr "type" "ssemov") > (set_attr "prefix_extra" "1") > @@ -21139,17 +21336,17 @@ (define_insn "avx2_pbroadcast<mode>" > (set_attr "mode" "<sseinsnmode>")]) > > (define_insn "avx2_pbroadcast<mode>_1" > - [(set (match_operand:VI_256 0 "register_operand" "=x,x,v,v") > - (vec_duplicate:VI_256 > + [(set (match_operand:VIHF_256 0 "register_operand" 
"=x,x,v,v") > + (vec_duplicate:VIHF_256 > (vec_select:<ssescalarmode> > - (match_operand:VI_256 1 "nonimmediate_operand" "m,x,m,v") > + (match_operand:VIHF_256 1 "nonimmediate_operand" "m,x,m,v") > (parallel [(const_int 0)]))))] > "TARGET_AVX2" > "@ > - vpbroadcast<ssemodesuffix>\t{%1, %0|%0, %<iptr>1} > - vpbroadcast<ssemodesuffix>\t{%x1, %0|%0, %x1} > - vpbroadcast<ssemodesuffix>\t{%1, %0|%0, %<iptr>1} > - vpbroadcast<ssemodesuffix>\t{%x1, %0|%0, %x1}" > + vpbroadcast<sseintmodesuffix>\t{%1, %0|%0, %<iptr>1} > + vpbroadcast<sseintmodesuffix>\t{%x1, %0|%0, %x1} > + vpbroadcast<sseintmodesuffix>\t{%1, %0|%0, %<iptr>1} > + vpbroadcast<sseintmodesuffix>\t{%x1, %0|%0, %x1}" > [(set_attr "isa" "*,*,<pbroadcast_evex_isa>,<pbroadcast_evex_isa>") > (set_attr "type" "ssemov") > (set_attr "prefix_extra" "1") > @@ -21503,15 +21700,15 @@ (define_insn "avx2_vec_dupv4df" > (set_attr "mode" "V4DF")]) > > (define_insn "<avx512>_vec_dup<mode>_1" > - [(set (match_operand:VI_AVX512BW 0 "register_operand" "=v,v") > - (vec_duplicate:VI_AVX512BW > + [(set (match_operand:VIHF_AVX512BW 0 "register_operand" "=v,v") > + (vec_duplicate:VIHF_AVX512BW > (vec_select:<ssescalarmode> > - (match_operand:VI_AVX512BW 1 "nonimmediate_operand" "v,m") > + (match_operand:VIHF_AVX512BW 1 "nonimmediate_operand" "v,m") > (parallel [(const_int 0)]))))] > "TARGET_AVX512F" > "@ > - vpbroadcast<ssemodesuffix>\t{%x1, %0|%0, %x1} > - vpbroadcast<ssemodesuffix>\t{%x1, %0|%0, %<iptr>1}" > + vpbroadcast<sseintmodesuffix>\t{%x1, %0|%0, %x1} > + vpbroadcast<sseintmodesuffix>\t{%x1, %0|%0, %<iptr>1}" > [(set_attr "type" "ssemov") > (set_attr "prefix" "evex") > (set_attr "mode" "<sseinsnmode>")]) > @@ -21536,8 +21733,8 @@ (define_insn "<avx512>_vec_dup<mode><mask_name>" > (set_attr "mode" "<sseinsnmode>")]) > > (define_insn "<avx512>_vec_dup<mode><mask_name>" > - [(set (match_operand:VI12_AVX512VL 0 "register_operand" "=v") > - (vec_duplicate:VI12_AVX512VL > + [(set (match_operand:VI12HF_AVX512VL 0 "register_operand" 
"=v") > + (vec_duplicate:VI12HF_AVX512VL > (vec_select:<ssescalarmode> > (match_operand:<ssexmmmode> 1 "nonimmediate_operand" "vm") > (parallel [(const_int 0)]))))] > @@ -21572,8 +21769,8 @@ (define_insn > "<mask_codefor>avx512f_broadcast<mode><mask_name>" > (set_attr "mode" "<sseinsnmode>")]) > > (define_insn "<mask_codefor><avx512>_vec_dup_gpr<mode><mask_name>" > - [(set (match_operand:VI12_AVX512VL 0 "register_operand" "=v,v") > - (vec_duplicate:VI12_AVX512VL > + [(set (match_operand:VI12HF_AVX512VL 0 "register_operand" "=v,v") > + (vec_duplicate:VI12HF_AVX512VL > (match_operand:<ssescalarmode> 1 "nonimmediate_operand" "vm,r")))] > "TARGET_AVX512BW" > "@ > @@ -21668,7 +21865,7 @@ (define_mode_attr vecdupssescalarmodesuffix > [(V8SF "ss") (V4DF "sd") (V8SI "ss") (V4DI "sd")]) > ;; Modes handled by AVX2 vec_dup patterns. > (define_mode_iterator AVX2_VEC_DUP_MODE > - [V32QI V16QI V16HI V8HI V8SI V4SI]) > + [V32QI V16QI V16HI V8HI V8SI V4SI V16HF V8HF]) > > (define_insn "*vec_dup<mode>" > [(set (match_operand:AVX2_VEC_DUP_MODE 0 "register_operand" "=x,x,v") > @@ -22224,12 +22421,12 @@ (define_insn "vec_set_hi_<mode><mask_name>" > (set_attr "prefix" "vex") > (set_attr "mode" "<sseinsnmode>")]) > > -(define_insn "vec_set_lo_v16hi" > - [(set (match_operand:V16HI 0 "register_operand" "=x,v") > - (vec_concat:V16HI > - (match_operand:V8HI 2 "nonimmediate_operand" "xm,vm") > - (vec_select:V8HI > - (match_operand:V16HI 1 "register_operand" "x,v") > +(define_insn "vec_set_lo_<mode>" > + [(set (match_operand:V16_256 0 "register_operand" "=x,v") > + (vec_concat:V16_256 > + (match_operand:<ssehalfvecmode> 2 "nonimmediate_operand" "xm,vm") > + (vec_select:<ssehalfvecmode> > + (match_operand:V16_256 1 "register_operand" "x,v") > (parallel [(const_int 8) (const_int 9) > (const_int 10) (const_int 11) > (const_int 12) (const_int 13) > @@ -22244,16 +22441,16 @@ (define_insn "vec_set_lo_v16hi" > (set_attr "prefix" "vex,evex") > (set_attr "mode" "OI")]) > > -(define_insn 
"vec_set_hi_v16hi" > - [(set (match_operand:V16HI 0 "register_operand" "=x,v") > - (vec_concat:V16HI > - (vec_select:V8HI > - (match_operand:V16HI 1 "register_operand" "x,v") > +(define_insn "vec_set_hi_<mode>" > + [(set (match_operand:V16_256 0 "register_operand" "=x,v") > + (vec_concat:V16_256 > + (vec_select:<ssehalfvecmode> > + (match_operand:V16_256 1 "register_operand" "x,v") > (parallel [(const_int 0) (const_int 1) > (const_int 2) (const_int 3) > (const_int 4) (const_int 5) > (const_int 6) (const_int 7)])) > - (match_operand:V8HI 2 "nonimmediate_operand" "xm,vm")))] > + (match_operand:<ssehalfvecmode> 2 "nonimmediate_operand" "xm,vm")))] > "TARGET_AVX" > "@ > vinsert%~128\t{$0x1, %2, %1, %0|%0, %1, %2, 0x1} > @@ -22430,6 +22627,8 @@ (define_mode_iterator VEC_INIT_MODE > (V32HI "TARGET_AVX512F") (V16HI "TARGET_AVX") V8HI > (V16SI "TARGET_AVX512F") (V8SI "TARGET_AVX") V4SI > (V8DI "TARGET_AVX512F") (V4DI "TARGET_AVX") V2DI > + (V32HF "TARGET_AVX512FP16") (V16HF "TARGET_AVX512FP16") > + (V8HF "TARGET_AVX512FP16") > (V16SF "TARGET_AVX512F") (V8SF "TARGET_AVX") V4SF > (V8DF "TARGET_AVX512F") (V4DF "TARGET_AVX") (V2DF "TARGET_SSE2") > (V4TI "TARGET_AVX512F") (V2TI "TARGET_AVX")]) > @@ -22441,6 +22640,8 @@ (define_mode_iterator VEC_INIT_HALF_MODE > (V32HI "TARGET_AVX512F") (V16HI "TARGET_AVX") V8HI > (V16SI "TARGET_AVX512F") (V8SI "TARGET_AVX") V4SI > (V8DI "TARGET_AVX512F") (V4DI "TARGET_AVX") > + (V32HF "TARGET_AVX512FP16") (V16HF "TARGET_AVX512FP16") > + (V8HF "TARGET_AVX512FP16") > (V16SF "TARGET_AVX512F") (V8SF "TARGET_AVX") V4SF > (V8DF "TARGET_AVX512F") (V4DF "TARGET_AVX") > (V4TI "TARGET_AVX512F")]) > -- > 2.18.1 >
-- BR, Hongtao