On Wed, Apr 17, 2019 at 7:14 PM Uros Bizjak <ubiz...@gmail.com> wrote: > > On Wed, Apr 17, 2019 at 1:03 PM Uros Bizjak <ubiz...@gmail.com> wrote: > > > > On Wed, Apr 17, 2019 at 12:29 PM Hongtao Liu <crazy...@gmail.com> wrote: > > > > > > On Fri, Apr 12, 2019 at 11:18 PM H.J. Lu <hjl.to...@gmail.com> wrote: > > > > > > > > On Fri, Apr 12, 2019 at 3:19 AM Uros Bizjak <ubiz...@gmail.com> wrote: > > > > > > > > > > On Fri, Apr 12, 2019 at 11:03 AM Hongtao Liu <crazy...@gmail.com> > > > > > wrote: > > > > > > > > > > > > On Fri, Apr 12, 2019 at 3:30 PM Uros Bizjak <ubiz...@gmail.com> > > > > > > wrote: > > > > > > > > > > > > > > On Fri, Apr 12, 2019 at 9:09 AM Liu, Hongtao > > > > > > > <hongtao....@intel.com> wrote: > > > > > > > > > > > > > > > > Hi : > > > > > > > > This patch is about to enable support for bfloat16 which > > > > > > > > will be in Future Cooper Lake, Please refer to > > > > > > > > https://software.intel.com/en-us/download/intel-architecture-instruction-set-extensions-programming-reference > > > > > > > > for more details about BF16. > > > > > > > > > > > > > > > > There are 3 instructions for AVX512BF16: VCVTNE2PS2BF16, > > > > > > > > VCVTNEPS2BF16 and DPBF16PS instructions, which are Vector > > > > > > > > Neural Network Instructions supporting: > > > > > > > > > > > > > > > > - VCVTNE2PS2BF16: Convert Two Packed Single Data to One > > > > > > > > Packed BF16 Data. > > > > > > > > - VCVTNEPS2BF16: Convert Packed Single Data to Packed > > > > > > > > BF16 Data. > > > > > > > > - VDPBF16PS: Dot Product of BF16 Pairs Accumulated into > > > > > > > > Packed Single Precision. > > > > > > > > > > > > > > > > Since only BF16 intrinsics are supported, we treat it as HI for > > > > > > > > simplicity. > > > > > > > > > > > > > > I think it was a mistake declaring cvtps2ph and cvtph2ps using > > > > > > > HImode > > > > > > > instead of HFmode. 
Is there a compelling reason not to introduce > > > > > > > corresponding bf16_format supporting infrastructure and declare > > > > > > > these > > > > > > > intrinsics using half-binary (HBmode ?) mode instead? > > > > > > > > > > > > > > Uros. > > > > > > > > > > > > Bfloat16 isn't IEEE standard which we want to reserve HFmode for. > > > > > > > > > > True. > > > > > > > > > > > The IEEE 754 standard specifies a binary16 as having the following > > > > > > format: > > > > > > Sign bit: 1 bit > > > > > > Exponent width: 5 bits > > > > > > Significand precision: 11 bits (10 explicitly stored) > > > > > > > > > > > > Bfloat16 has the following format: > > > > > > Sign bit: 1 bit > > > > > > Exponent width: 8 bits > > > > > > Significand precision: 8 bits (7 explicitly stored), as opposed to > > > > > > 24 > > > > > > bits in a classical single-precision floating-point format > > > > > > > > > > This is why I proposed to introduce HBmode (and corresponding > > > > > bfloat16_format) to distinguish between ieee HFmode and BFmode. > > > > > > > > > > > > > Unless there is BF16 language level support, HBmode has no advantage > > > > over HImode. We can add HBmode when we gain BF16 language support. > > > > > > > > -- > > > > H.J. > > > > > > Any other comments, I'll merge this to trunk? > > > > It is not a regression, so please no. > > Ehm, "regression fix" ... > > Uros.
Update patch. -- BR, Hongtao
Index: gcc/common/config/i386/i386-common.c =================================================================== --- gcc/common/config/i386/i386-common.c (revision 270934) +++ gcc/common/config/i386/i386-common.c (working copy) @@ -88,6 +88,7 @@ (OPTION_MASK_ISA_AVX512VPOPCNTDQ | OPTION_MASK_ISA_AVX512F_SET) #define OPTION_MASK_ISA_AVX512BITALG_SET \ (OPTION_MASK_ISA_AVX512BITALG | OPTION_MASK_ISA_AVX512F_SET) +#define OPTION_MASK_ISA_AVX512BF16_SET OPTION_MASK_ISA_AVX512BF16 #define OPTION_MASK_ISA_RTM_SET OPTION_MASK_ISA_RTM #define OPTION_MASK_ISA_PRFCHW_SET OPTION_MASK_ISA_PRFCHW #define OPTION_MASK_ISA_RDSEED_SET OPTION_MASK_ISA_RDSEED @@ -215,6 +216,7 @@ #define OPTION_MASK_ISA_AVX512VNNI_UNSET OPTION_MASK_ISA_AVX512VNNI #define OPTION_MASK_ISA_AVX512VPOPCNTDQ_UNSET OPTION_MASK_ISA_AVX512VPOPCNTDQ #define OPTION_MASK_ISA_AVX512BITALG_UNSET OPTION_MASK_ISA_AVX512BITALG +#define OPTION_MASK_ISA_AVX512BF16_UNSET OPTION_MASK_ISA_AVX512BF16 #define OPTION_MASK_ISA_RTM_UNSET OPTION_MASK_ISA_RTM #define OPTION_MASK_ISA_PRFCHW_UNSET OPTION_MASK_ISA_PRFCHW #define OPTION_MASK_ISA_RDSEED_UNSET OPTION_MASK_ISA_RDSEED @@ -276,10 +278,14 @@ | OPTION_MASK_ISA_SSE_UNSET) #define OPTION_MASK_ISA2_AVX512F_UNSET \ - (OPTION_MASK_ISA_AVX5124FMAPS_UNSET | OPTION_MASK_ISA_AVX5124VNNIW_UNSET) + (OPTION_MASK_ISA_AVX512BF16_UNSET \ + | OPTION_MASK_ISA_AVX5124FMAPS_UNSET \ + | OPTION_MASK_ISA_AVX5124VNNIW_UNSET) #define OPTION_MASK_ISA2_GENERAL_REGS_ONLY_UNSET \ (OPTION_MASK_ISA2_AVX512F_UNSET) +#define OPTION_MASK_ISA2_AVX512BW_UNSET OPTION_MASK_ISA_AVX512BF16_UNSET + /* Set 1 << value as value of -malign-FLAG option. 
*/ static void @@ -738,6 +744,21 @@ } return true; + case OPT_mavx512bf16: + if (value) + { + opts->x_ix86_isa_flags2 |= OPTION_MASK_ISA_AVX512BF16_SET; + opts->x_ix86_isa_flags2_explicit |= OPTION_MASK_ISA_AVX512BF16_SET; + opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX512BW_SET; + opts->x_ix86_isa_flags_explicit |= OPTION_MASK_ISA_AVX512BW_SET; + } + else + { + opts->x_ix86_isa_flags2 &= ~OPTION_MASK_ISA_AVX512BF16_UNSET; + opts->x_ix86_isa_flags2_explicit |= OPTION_MASK_ISA_AVX512BF16_UNSET; + } + return true; + case OPT_msgx: if (value) { @@ -800,6 +821,8 @@ { opts->x_ix86_isa_flags &= ~OPTION_MASK_ISA_AVX512BW_UNSET; opts->x_ix86_isa_flags_explicit |= OPTION_MASK_ISA_AVX512BW_UNSET; + opts->x_ix86_isa_flags2 &= ~OPTION_MASK_ISA2_AVX512BW_UNSET; + opts->x_ix86_isa_flags2_explicit |= OPTION_MASK_ISA2_AVX512BW_UNSET; } return true; Index: gcc/config/i386/avx512bf16intrin.h =================================================================== --- gcc/config/i386/avx512bf16intrin.h (nonexistent) +++ gcc/config/i386/avx512bf16intrin.h (working copy) @@ -0,0 +1,118 @@ +/* Copyright (C) 2019 Free Software Foundation, Inc. + + This file is part of GCC. + + GCC is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 3, or (at your option) + any later version. + + GCC is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + Under Section 7 of GPL version 3, you are granted additional + permissions described in the GCC Runtime Library Exception, version + 3.1, as published by the Free Software Foundation. 
+ + You should have received a copy of the GNU General Public License and + a copy of the GCC Runtime Library Exception along with this program; + see the files COPYING3 and COPYING.RUNTIME respectively. If not, see + <http://www.gnu.org/licenses/>. */ + +#ifndef _IMMINTRIN_H_INCLUDED +#error "Never use <avx512bf16intrin.h> directly; include <immintrin.h> instead." +#endif + +#ifndef _AVX512BF16INTRIN_H_INCLUDED +#define _AVX512BF16INTRIN_H_INCLUDED + +#ifndef __AVX512BF16__ +#pragma GCC push_options +#pragma GCC target("avx512bf16") +#define __DISABLE_AVX512BF16__ +#endif /* __AVX512BF16__ */ + +/* Internal data types for implementing the intrinsics. */ +typedef short __v32bh __attribute__ ((__vector_size__ (64))); + +/* The Intel API is flexible enough that we must allow aliasing with other + vector types, and their scalar components. */ +typedef short __m512bh __attribute__ ((__vector_size__ (64), __may_alias__)); + +/* vcvtne2ps2bf16 */ + +extern __inline __m512bh +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cvtne2ps_pbh (__m512 __A, __m512 __B) +{ + return (__m512bh)__builtin_ia32_cvtne2ps2bf16_v32hi(__A, __B); +} + +extern __inline __m512bh +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cvtne2ps_pbh (__m512bh __A, __mmask32 __B, __m512 __C, __m512 __D) +{ + return (__m512bh)__builtin_ia32_cvtne2ps2bf16_v32hi_mask(__C, __D, __A, __B); +} + +extern __inline __m512bh +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_cvtne2ps_pbh (__mmask32 __A, __m512 __B, __m512 __C) +{ + return (__m512bh)__builtin_ia32_cvtne2ps2bf16_v32hi_maskz(__B, __C, __A); +} + +/* vcvtneps2bf16 */ + +extern __inline __m256bh +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cvtneps_pbh (__m512 __A) +{ + return (__m256bh)__builtin_ia32_cvtneps2bf16_v16sf(__A); +} + +extern __inline __m256bh +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) 
+_mm512_mask_cvtneps_pbh (__m256bh __A, __mmask16 __B, __m512 __C) +{ + return (__m256bh)__builtin_ia32_cvtneps2bf16_v16sf_mask(__C, __A, __B); +} + +extern __inline __m256bh +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_cvtneps_pbh (__mmask16 __A, __m512 __B) +{ + return (__m256bh)__builtin_ia32_cvtneps2bf16_v16sf_maskz(__B, __A); +} + +/* vdpbf16ps */ + +extern __inline __m512 +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_dpbf16_ps (__m512 __A, __m512bh __B, __m512bh __C) +{ + return (__m512)__builtin_ia32_dpbf16ps_v16sf(__A, __B, __C); +} + +extern __inline __m512 +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_dpbf16_ps (__m512 __A, __mmask16 __B, __m512bh __C, __m512bh __D) +{ + return (__m512)__builtin_ia32_dpbf16ps_v16sf_mask(__A, __C, __D, __B); +} + +extern __inline __m512 +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_dpbf16_ps (__mmask16 __A, __m512 __B, __m512bh __C, __m512bh __D) +{ + return (__m512)__builtin_ia32_dpbf16ps_v16sf_maskz(__B, __C, __D, __A); +} + +#ifdef __DISABLE_AVX512BF16__ +#undef __DISABLE_AVX512BF16__ +#pragma GCC pop_options +#endif /* __DISABLE_AVX512BF16__ */ + +#endif /* _AVX512BF16INTRIN_H_INCLUDED */ Index: gcc/config/i386/avx512bf16vlintrin.h =================================================================== --- gcc/config/i386/avx512bf16vlintrin.h (nonexistent) +++ gcc/config/i386/avx512bf16vlintrin.h (working copy) @@ -0,0 +1,183 @@ +/* Copyright (C) 2019 Free Software Foundation, Inc. + + This file is part of GCC. + + GCC is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 3, or (at your option) + any later version. 
+ + GCC is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + Under Section 7 of GPL version 3, you are granted additional + permissions described in the GCC Runtime Library Exception, version + 3.1, as published by the Free Software Foundation. + + You should have received a copy of the GNU General Public License and + a copy of the GCC Runtime Library Exception along with this program; + see the files COPYING3 and COPYING.RUNTIME respectively. If not, see + <http://www.gnu.org/licenses/>. */ + +#ifndef _IMMINTRIN_H_INCLUDED +#error "Never use <avx512bf16vlintrin.h> directly; include <immintrin.h> instead." +#endif + +#ifndef _AVX512BF16VLINTRIN_H_INCLUDED +#define _AVX512BF16VLINTRIN_H_INCLUDED + +#if !defined(__AVX512VL__) || !defined(__AVX512BF16__) +#pragma GCC push_options +#pragma GCC target("avx512bf16,avx512vl") +#define __DISABLE_AVX512BF16VL__ +#endif /* __AVX512BF16__ */ + +/* Internal data types for implementing the intrinsics. */ +typedef short __v16bh __attribute__ ((__vector_size__ (32))); +typedef short __v8bh __attribute__ ((__vector_size__ (16))); + +/* The Intel API is flexible enough that we must allow aliasing with other + vector types, and their scalar components. 
*/ +typedef short __m256bh __attribute__ ((__vector_size__ (32), __may_alias__)); +typedef short __m128bh __attribute__ ((__vector_size__ (16), __may_alias__)); + +/* vcvtne2ps2bf16 */ + +extern __inline __m256bh +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_cvtne2ps_pbh (__m256 __A, __m256 __B) +{ + return (__m256bh)__builtin_ia32_cvtne2ps2bf16_v16hi(__A, __B); +} + +extern __inline __m256bh +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_cvtne2ps_pbh (__m256bh __A, __mmask16 __B, __m256 __C, __m256 __D) +{ + return (__m256bh)__builtin_ia32_cvtne2ps2bf16_v16hi_mask(__C, __D, __A, __B); +} + +extern __inline __m256bh +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_cvtne2ps_pbh (__mmask16 __A, __m256 __B, __m256 __C) +{ + return (__m256bh)__builtin_ia32_cvtne2ps2bf16_v16hi_maskz(__B, __C, __A); +} + +extern __inline __m128bh +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtne2ps_pbh (__m128 __A, __m128 __B) +{ + return (__m128bh)__builtin_ia32_cvtne2ps2bf16_v8hi(__A, __B); +} + +extern __inline __m128bh +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_cvtne2ps_pbh (__m128bh __A, __mmask8 __B, __m128 __C, __m128 __D) +{ + return (__m128bh)__builtin_ia32_cvtne2ps2bf16_v8hi_mask(__C, __D, __A, __B); +} + +extern __inline __m128bh +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_cvtne2ps_pbh (__mmask8 __A, __m128 __B, __m128 __C) +{ + return (__m128bh)__builtin_ia32_cvtne2ps2bf16_v8hi_maskz(__B, __C, __A); +} + +/* vcvtneps2bf16 */ + +extern __inline __m128bh +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_cvtneps_pbh (__m256 __A) +{ + return (__m128bh)__builtin_ia32_cvtneps2bf16_v8sf(__A); +} + +extern __inline __m128bh +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_cvtneps_pbh (__m128bh __A, __mmask8 __B, __m256 __C) +{ + 
return (__m128bh)__builtin_ia32_cvtneps2bf16_v8sf_mask(__C, __A, __B); +} + +extern __inline __m128bh +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_cvtneps_pbh (__mmask8 __A, __m256 __B) +{ + return (__m128bh)__builtin_ia32_cvtneps2bf16_v8sf_maskz(__B, __A); +} + +extern __inline __m128bh +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtneps_pbh (__m128 __A) +{ + return (__m128bh)__builtin_ia32_cvtneps2bf16_v4sf(__A); +} + +extern __inline __m128bh +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_cvtneps_pbh (__m128bh __A, __mmask8 __B, __m128 __C) +{ + return (__m128bh)__builtin_ia32_cvtneps2bf16_v4sf_mask(__C, __A, __B); +} + +extern __inline __m128bh +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_cvtneps_pbh (__mmask8 __A, __m128 __B) +{ + return (__m128bh)__builtin_ia32_cvtneps2bf16_v4sf_maskz(__B, __A); +} + +/* vdpbf16ps */ + +extern __inline __m256 +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_dpbf16_ps (__m256 __A, __m256bh __B, __m256bh __C) +{ + return (__m256)__builtin_ia32_dpbf16ps_v8sf(__A, __B, __C); +} + +extern __inline __m256 +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_dpbf16_ps (__m256 __A, __mmask8 __B, __m256bh __C, __m256bh __D) +{ + return (__m256)__builtin_ia32_dpbf16ps_v8sf_mask(__A, __C, __D, __B); +} + +extern __inline __m256 +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_dpbf16_ps (__mmask8 __A, __m256 __B, __m256bh __C, __m256bh __D) +{ + return (__m256)__builtin_ia32_dpbf16ps_v8sf_maskz(__B, __C, __D, __A); +} + +extern __inline __m128 +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_dpbf16_ps (__m128 __A, __m128bh __B, __m128bh __C) +{ + return (__m128)__builtin_ia32_dpbf16ps_v4sf(__A, __B, __C); +} + +extern __inline __m128 +__attribute__((__gnu_inline__, __always_inline__, 
__artificial__)) +_mm_mask_dpbf16_ps (__m128 __A, __mmask8 __B, __m128bh __C, __m128bh __D) +{ + return (__m128)__builtin_ia32_dpbf16ps_v4sf_mask(__A, __C, __D, __B); +} + +extern __inline __m128 +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_dpbf16_ps (__mmask8 __A, __m128 __B, __m128bh __C, __m128bh __D) +{ + return (__m128)__builtin_ia32_dpbf16ps_v4sf_maskz(__B, __C, __D, __A); +} + +#ifdef __DISABLE_AVX512BF16VL__ +#undef __DISABLE_AVX512BF16VL__ +#pragma GCC pop_options +#endif /* __DISABLE_AVX512BF16VL__ */ + +#endif /* _AVX512BF16VLINTRIN_H_INCLUDED */ Index: gcc/config/i386/cpuid.h =================================================================== --- gcc/config/i386/cpuid.h (revision 270934) +++ gcc/config/i386/cpuid.h (working copy) @@ -21,6 +21,9 @@ * <http://www.gnu.org/licenses/>. */ +/* %eax */ +#define bit_AVX512BF16 (1 << 5) + /* %ecx */ #define bit_SSE3 (1 << 0) #define bit_PCLMUL (1 << 1) Index: gcc/config/i386/driver-i386.c =================================================================== --- gcc/config/i386/driver-i386.c (revision 270934) +++ gcc/config/i386/driver-i386.c (working copy) @@ -426,6 +426,7 @@ unsigned int has_movdiri = 0, has_movdir64b = 0; unsigned int has_waitpkg = 0; unsigned int has_cldemote = 0; + unsigned int has_avx512bf16 = 0; unsigned int has_ptwrite = 0; @@ -533,6 +534,9 @@ has_shstk = ecx & bit_SHSTK; has_pconfig = edx & bit_PCONFIG; has_waitpkg = ecx & bit_WAITPKG; + + __cpuid_count (7, 1, eax, ebx, ecx, edx); + has_avx512bf16 = eax & bit_AVX512BF16; } if (max_level >= 13) @@ -1143,6 +1147,7 @@ const char *waitpkg = has_waitpkg ? " -mwaitpkg" : " -mno-waitpkg"; const char *cldemote = has_cldemote ? " -mcldemote" : " -mno-cldemote"; const char *ptwrite = has_ptwrite ? " -mptwrite" : " -mno-ptwrite"; + const char *avx512bf16 = has_avx512bf16 ? 
" -mavx512bf16" : " -mno-avx512bf16"; options = concat (options, mmx, mmx3dnow, sse, sse2, sse3, ssse3, sse4a, cx16, sahf, movbe, aes, sha, pclmul, @@ -1157,7 +1162,7 @@ clwb, mwaitx, clzero, pku, rdpid, gfni, shstk, avx512vbmi2, avx512vnni, vaes, vpclmulqdq, avx512bitalg, movdiri, movdir64b, waitpkg, cldemote, - ptwrite, + ptwrite, avx512bf16, NULL); } Index: gcc/config/i386/i386-builtin-types.def =================================================================== --- gcc/config/i386/i386-builtin-types.def (revision 270934) +++ gcc/config/i386/i386-builtin-types.def (working copy) @@ -1262,3 +1262,29 @@ DEF_FUNCTION_TYPE (V4DI, V4DI) DEF_FUNCTION_TYPE (V4SI, V4SI, V4SI, UHI) DEF_FUNCTION_TYPE (V8SI, V8SI, V8SI, UHI) + +# BF16 builtins +DEF_FUNCTION_TYPE (V32HI, V16SF, V16SF) +DEF_FUNCTION_TYPE (V32HI, V16SF, V16SF, V32HI, USI) +DEF_FUNCTION_TYPE (V32HI, V16SF, V16SF, USI) +DEF_FUNCTION_TYPE (V16HI, V8SF, V8SF) +DEF_FUNCTION_TYPE (V16HI, V8SF, V8SF, V16HI, UHI) +DEF_FUNCTION_TYPE (V16HI, V8SF, V8SF, UHI) +DEF_FUNCTION_TYPE (V8HI, V4SF, V4SF) +DEF_FUNCTION_TYPE (V8HI, V4SF, V4SF, V8HI, UQI) +DEF_FUNCTION_TYPE (V8HI, V4SF, V4SF, UQI) +DEF_FUNCTION_TYPE (V16HI, V16SF) +DEF_FUNCTION_TYPE (V16HI, V16SF, V16HI, UHI) +DEF_FUNCTION_TYPE (V16HI, V16SF, UHI) +DEF_FUNCTION_TYPE (V8HI, V8SF) +DEF_FUNCTION_TYPE (V8HI, V8SF, V8HI, UQI) +DEF_FUNCTION_TYPE (V8HI, V8SF, UQI) +DEF_FUNCTION_TYPE (V8HI, V4SF) +DEF_FUNCTION_TYPE (V8HI, V4SF, V8HI, UQI) +DEF_FUNCTION_TYPE (V8HI, V4SF, UQI) +DEF_FUNCTION_TYPE (V16SF, V16SF, V32HI, V32HI) +DEF_FUNCTION_TYPE (V16SF, V16SF, V32HI, V32HI, UHI) +DEF_FUNCTION_TYPE (V8SF, V8SF, V16HI, V16HI) +DEF_FUNCTION_TYPE (V8SF, V8SF, V16HI, V16HI, UQI) +DEF_FUNCTION_TYPE (V4SF, V4SF, V8HI, V8HI) +DEF_FUNCTION_TYPE (V4SF, V4SF, V8HI, V8HI, UQI) Index: gcc/config/i386/i386-builtin.def =================================================================== --- gcc/config/i386/i386-builtin.def (revision 270934) +++ gcc/config/i386/i386-builtin.def (working copy) 
@@ -2703,6 +2703,35 @@ BDESC (0, OPTION_MASK_ISA_VAES, CODE_FOR_vaesenclast_v32qi, "__builtin_ia32_vaesenclast_v32qi", IX86_BUILTIN_VAESENCLAST32, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI) BDESC (0, OPTION_MASK_ISA_VAES, CODE_FOR_vaesenclast_v64qi, "__builtin_ia32_vaesenclast_v64qi", IX86_BUILTIN_VAESENCLAST64, UNKNOWN, (int) V64QI_FTYPE_V64QI_V64QI) +/* BF16 */ +BDESC (0, OPTION_MASK_ISA_AVX512BF16, CODE_FOR_avx512f_cvtne2ps2bf16_v32hi, "__builtin_ia32_cvtne2ps2bf16_v32hi", IX86_BUILTIN_CVTNE2PS2HI16_V32HI, UNKNOWN, (int) V32HI_FTYPE_V16SF_V16SF) +BDESC (0, OPTION_MASK_ISA_AVX512BF16, CODE_FOR_avx512f_cvtne2ps2bf16_v32hi_mask, "__builtin_ia32_cvtne2ps2bf16_v32hi_mask", IX86_BUILTIN_CVTNE2PS2HI16_V32HI_MASK, UNKNOWN, (int) V32HI_FTYPE_V16SF_V16SF_V32HI_USI) +BDESC (0, OPTION_MASK_ISA_AVX512BF16, CODE_FOR_avx512f_cvtne2ps2bf16_v32hi_maskz, "__builtin_ia32_cvtne2ps2bf16_v32hi_maskz", IX86_BUILTIN_CVTNE2PS2HI16_V32HI_MASKZ, UNKNOWN, (int) V32HI_FTYPE_V16SF_V16SF_USI) +BDESC (0, OPTION_MASK_ISA_AVX512BF16, CODE_FOR_avx512f_cvtne2ps2bf16_v16hi, "__builtin_ia32_cvtne2ps2bf16_v16hi", IX86_BUILTIN_CVTNE2PS2HI16_V16HI, UNKNOWN, (int) V16HI_FTYPE_V8SF_V8SF) +BDESC (0, OPTION_MASK_ISA_AVX512BF16, CODE_FOR_avx512f_cvtne2ps2bf16_v16hi_mask, "__builtin_ia32_cvtne2ps2bf16_v16hi_mask", IX86_BUILTIN_CVTNE2PS2HI16_V16HI_MASK, UNKNOWN, (int) V16HI_FTYPE_V8SF_V8SF_V16HI_UHI) +BDESC (0, OPTION_MASK_ISA_AVX512BF16, CODE_FOR_avx512f_cvtne2ps2bf16_v16hi_maskz, "__builtin_ia32_cvtne2ps2bf16_v16hi_maskz", IX86_BUILTIN_CVTNE2PS2HI16_V16HI_MASKZ, UNKNOWN, (int) V16HI_FTYPE_V8SF_V8SF_UHI) +BDESC (0, OPTION_MASK_ISA_AVX512BF16, CODE_FOR_avx512f_cvtne2ps2bf16_v8hi, "__builtin_ia32_cvtne2ps2bf16_v8hi", IX86_BUILTIN_CVTNE2PS2HI16_V8HI, UNKNOWN, (int) V8HI_FTYPE_V4SF_V4SF) +BDESC (0, OPTION_MASK_ISA_AVX512BF16, CODE_FOR_avx512f_cvtne2ps2bf16_v8hi_mask, "__builtin_ia32_cvtne2ps2bf16_v8hi_mask", IX86_BUILTIN_CVTNE2PS2HI16_V8HI_MASK, UNKNOWN, (int) V8HI_FTYPE_V4SF_V4SF_V8HI_UQI) +BDESC (0, 
OPTION_MASK_ISA_AVX512BF16, CODE_FOR_avx512f_cvtne2ps2bf16_v8hi_maskz, "__builtin_ia32_cvtne2ps2bf16_v8hi_maskz", IX86_BUILTIN_CVTNE2PS2HI16_V8HI_MASKZ, UNKNOWN, (int) V8HI_FTYPE_V4SF_V4SF_UQI) +BDESC (0, OPTION_MASK_ISA_AVX512BF16, CODE_FOR_avx512f_cvtneps2bf16_v16sf, "__builtin_ia32_cvtneps2bf16_v16sf", IX86_BUILTIN_CVTNEPS2HI16_V16SF, UNKNOWN, (int) V16HI_FTYPE_V16SF) +BDESC (0, OPTION_MASK_ISA_AVX512BF16, CODE_FOR_avx512f_cvtneps2bf16_v16sf_mask, "__builtin_ia32_cvtneps2bf16_v16sf_mask", IX86_BUILTIN_CVTNEPS2HI16_V16SF_MASK, UNKNOWN, (int) V16HI_FTYPE_V16SF_V16HI_UHI) +BDESC (0, OPTION_MASK_ISA_AVX512BF16, CODE_FOR_avx512f_cvtneps2bf16_v16sf_maskz, "__builtin_ia32_cvtneps2bf16_v16sf_maskz", IX86_BUILTIN_CVTNE2PS2HI16_V16SF_MASKZ, UNKNOWN, (int) V16HI_FTYPE_V16SF_UHI) +BDESC (0, OPTION_MASK_ISA_AVX512BF16, CODE_FOR_avx512f_cvtneps2bf16_v8sf, "__builtin_ia32_cvtneps2bf16_v8sf", IX86_BUILTIN_CVTNEPS2HI16_V8SF, UNKNOWN, (int) V8HI_FTYPE_V8SF) +BDESC (0, OPTION_MASK_ISA_AVX512BF16, CODE_FOR_avx512f_cvtneps2bf16_v8sf_mask, "__builtin_ia32_cvtneps2bf16_v8sf_mask", IX86_BUILTIN_CVTNEPS2HI16_V8SF_MASK, UNKNOWN, (int) V8HI_FTYPE_V8SF_V8HI_UQI) +BDESC (0, OPTION_MASK_ISA_AVX512BF16, CODE_FOR_avx512f_cvtneps2bf16_v8sf_maskz, "__builtin_ia32_cvtneps2bf16_v8sf_maskz", IX86_BUILTIN_CVTNE2PS2HI16_V8SF_MASKZ, UNKNOWN, (int) V8HI_FTYPE_V8SF_UQI) +BDESC (0, OPTION_MASK_ISA_AVX512BF16, CODE_FOR_avx512f_cvtneps2bf16_v4sf, "__builtin_ia32_cvtneps2bf16_v4sf", IX86_BUILTIN_CVTNEPS2HI16_V4SF, UNKNOWN, (int) V8HI_FTYPE_V4SF) +BDESC (0, OPTION_MASK_ISA_AVX512BF16, CODE_FOR_avx512f_cvtneps2bf16_v4sf_mask, "__builtin_ia32_cvtneps2bf16_v4sf_mask", IX86_BUILTIN_CVTNEPS2HI16_V4SF_MASK, UNKNOWN, (int) V8HI_FTYPE_V4SF_V8HI_UQI) +BDESC (0, OPTION_MASK_ISA_AVX512BF16, CODE_FOR_avx512f_cvtneps2bf16_v4sf_maskz, "__builtin_ia32_cvtneps2bf16_v4sf_maskz", IX86_BUILTIN_CVTNE2PS2HI16_V4SF_MASKZ, UNKNOWN, (int) V8HI_FTYPE_V4SF_UQI) +BDESC (0, OPTION_MASK_ISA_AVX512BF16, CODE_FOR_avx512f_dpbf16ps_v16sf, 
"__builtin_ia32_dpbf16ps_v16sf", IX86_BUILTIN_DPHI16PS_V16SF, UNKNOWN, (int) V16SF_FTYPE_V16SF_V32HI_V32HI) +BDESC (0, OPTION_MASK_ISA_AVX512BF16, CODE_FOR_avx512f_dpbf16ps_v16sf_mask, "__builtin_ia32_dpbf16ps_v16sf_mask", IX86_BUILTIN_DPHI16PS_V16SF_MASK, UNKNOWN, (int) V16SF_FTYPE_V16SF_V32HI_V32HI_UHI) +BDESC (0, OPTION_MASK_ISA_AVX512BF16, CODE_FOR_avx512f_dpbf16ps_v16sf_maskz, "__builtin_ia32_dpbf16ps_v16sf_maskz", IX86_BUILTIN_DPHI16PS_V16SF_MASKZ, UNKNOWN, (int) V16SF_FTYPE_V16SF_V32HI_V32HI_UHI) +BDESC (0, OPTION_MASK_ISA_AVX512BF16, CODE_FOR_avx512f_dpbf16ps_v8sf, "__builtin_ia32_dpbf16ps_v8sf", IX86_BUILTIN_DPHI16PS_V8SF, UNKNOWN, (int) V8SF_FTYPE_V8SF_V16HI_V16HI) +BDESC (0, OPTION_MASK_ISA_AVX512BF16, CODE_FOR_avx512f_dpbf16ps_v8sf_mask, "__builtin_ia32_dpbf16ps_v8sf_mask", IX86_BUILTIN_DPHI16PS_V8SF_MASK, UNKNOWN, (int) V8SF_FTYPE_V8SF_V16HI_V16HI_UQI) +BDESC (0, OPTION_MASK_ISA_AVX512BF16, CODE_FOR_avx512f_dpbf16ps_v8sf_maskz, "__builtin_ia32_dpbf16ps_v8sf_maskz", IX86_BUILTIN_DPHI16PS_V8SF_MASKZ, UNKNOWN, (int) V8SF_FTYPE_V8SF_V16HI_V16HI_UQI) +BDESC (0, OPTION_MASK_ISA_AVX512BF16, CODE_FOR_avx512f_dpbf16ps_v4sf, "__builtin_ia32_dpbf16ps_v4sf", IX86_BUILTIN_DPHI16PS_V4SF, UNKNOWN, (int) V4SF_FTYPE_V4SF_V8HI_V8HI) +BDESC (0, OPTION_MASK_ISA_AVX512BF16, CODE_FOR_avx512f_dpbf16ps_v4sf_mask, "__builtin_ia32_dpbf16ps_v4sf_mask", IX86_BUILTIN_DPHI16PS_V4SF_MASK, UNKNOWN, (int) V4SF_FTYPE_V4SF_V8HI_V8HI_UQI) +BDESC (0, OPTION_MASK_ISA_AVX512BF16, CODE_FOR_avx512f_dpbf16ps_v4sf_maskz, "__builtin_ia32_dpbf16ps_v4sf_maskz", IX86_BUILTIN_DPHI16PS_V4SF_MASKZ, UNKNOWN, (int) V4SF_FTYPE_V4SF_V8HI_V8HI_UQI) + /* Builtins with rounding support. 
*/ BDESC_END (ARGS, ROUND_ARGS) Index: gcc/config/i386/i386-builtins.c =================================================================== --- gcc/config/i386/i386-builtins.c (revision 270934) +++ gcc/config/i386/i386-builtins.c (working copy) @@ -1920,6 +1920,7 @@ F_VPCLMULQDQ, F_AVX512VNNI, F_AVX512BITALG, + F_AVX512BF16, F_MAX }; @@ -2064,7 +2065,8 @@ {"gfni", F_GFNI, P_ZERO}, {"vpclmulqdq", F_VPCLMULQDQ, P_ZERO}, {"avx512vnni", F_AVX512VNNI, P_ZERO}, - {"avx512bitalg", F_AVX512BITALG, P_ZERO} + {"avx512bitalg", F_AVX512BITALG, P_ZERO}, + {"avx512bf16", F_AVX512BF16, P_ZERO} }; /* This parses the attribute arguments to target in DECL and determines Index: gcc/config/i386/i386-c.c =================================================================== --- gcc/config/i386/i386-c.c (revision 270934) +++ gcc/config/i386/i386-c.c (working copy) @@ -548,6 +548,8 @@ def_or_undef (parse_in, "__CLDEMOTE__"); if (isa_flag2 & OPTION_MASK_ISA_PTWRITE) def_or_undef (parse_in, "__PTWRITE__"); + if (isa_flag2 & OPTION_MASK_ISA_AVX512BF16) + def_or_undef (parse_in, "__AVX512BF16__"); if (TARGET_IAMCU) { def_or_undef (parse_in, "__iamcu"); Index: gcc/config/i386/i386-expand.c =================================================================== --- gcc/config/i386/i386-expand.c (revision 270934) +++ gcc/config/i386/i386-expand.c (working copy) @@ -8968,6 +8968,9 @@ case V8DF_FTYPE_V2DF: case V8DF_FTYPE_V8DF: case V4DI_FTYPE_V4DI: + case V16HI_FTYPE_V16SF: + case V8HI_FTYPE_V8SF: + case V8HI_FTYPE_V4SF: nargs = 1; break; case V4SF_FTYPE_V4SF_VEC_MERGE: @@ -9092,6 +9095,12 @@ case USI_FTYPE_USI_USI: case UDI_FTYPE_UDI_UDI: case V16SI_FTYPE_V8DF_V8DF: + case V32HI_FTYPE_V16SF_V16SF: + case V16HI_FTYPE_V8SF_V8SF: + case V8HI_FTYPE_V4SF_V4SF: + case V16HI_FTYPE_V16SF_UHI: + case V8HI_FTYPE_V8SF_UQI: + case V8HI_FTYPE_V4SF_UQI: nargs = 2; break; case V2DI_FTYPE_V2DI_INT_CONVERT: @@ -9274,6 +9283,15 @@ case V16HI_FTYPE_V16HI_V16HI_V16HI: case V8SI_FTYPE_V8SI_V8SI_V8SI: case 
V8HI_FTYPE_V8HI_V8HI_V8HI: + case V32HI_FTYPE_V16SF_V16SF_USI: + case V16HI_FTYPE_V8SF_V8SF_UHI: + case V8HI_FTYPE_V4SF_V4SF_UQI: + case V16HI_FTYPE_V16SF_V16HI_UHI: + case V8HI_FTYPE_V8SF_V8HI_UQI: + case V8HI_FTYPE_V4SF_V8HI_UQI: + case V16SF_FTYPE_V16SF_V32HI_V32HI: + case V8SF_FTYPE_V8SF_V16HI_V16HI: + case V4SF_FTYPE_V4SF_V8HI_V8HI: nargs = 3; break; case V32QI_FTYPE_V32QI_V32QI_INT: @@ -9413,6 +9431,9 @@ case V16HI_FTYPE_V32QI_V32QI_V16HI_UHI: case V8SI_FTYPE_V16HI_V16HI_V8SI_UQI: case V4SI_FTYPE_V8HI_V8HI_V4SI_UQI: + case V32HI_FTYPE_V16SF_V16SF_V32HI_USI: + case V16HI_FTYPE_V8SF_V8SF_V16HI_UHI: + case V8HI_FTYPE_V4SF_V4SF_V8HI_UQI: nargs = 4; break; case V2DF_FTYPE_V2DF_V2DF_V2DI_INT: @@ -9456,6 +9477,9 @@ break; case UCHAR_FTYPE_UCHAR_UINT_UINT_PUNSIGNED: case UCHAR_FTYPE_UCHAR_ULONGLONG_ULONGLONG_PULONGLONG: + case V16SF_FTYPE_V16SF_V32HI_V32HI_UHI: + case V8SF_FTYPE_V8SF_V16HI_V16HI_UQI: + case V4SF_FTYPE_V4SF_V8HI_V8HI_UQI: nargs = 4; break; case UQI_FTYPE_V8DI_V8DI_INT_UQI: Index: gcc/config/i386/i386-options.c =================================================================== --- gcc/config/i386/i386-options.c (revision 270934) +++ gcc/config/i386/i386-options.c (working copy) @@ -209,7 +209,8 @@ { "-mmovdir64b", OPTION_MASK_ISA_MOVDIR64B }, { "-mwaitpkg", OPTION_MASK_ISA_WAITPKG }, { "-mcldemote", OPTION_MASK_ISA_CLDEMOTE }, - { "-mptwrite", OPTION_MASK_ISA_PTWRITE } + { "-mptwrite", OPTION_MASK_ISA_PTWRITE }, + { "-mavx512bf16", OPTION_MASK_ISA_AVX512BF16 } }; static struct ix86_target_opts isa_opts[] = { @@ -919,6 +920,7 @@ IX86_ATTR_ISA ("waitpkg", OPT_mwaitpkg), IX86_ATTR_ISA ("cldemote", OPT_mcldemote), IX86_ATTR_ISA ("ptwrite", OPT_mptwrite), + IX86_ATTR_ISA ("avx512bf16", OPT_mavx512bf16), /* enum options */ IX86_ATTR_ENUM ("fpmath=", OPT_mfpmath_), @@ -2034,6 +2036,10 @@ && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX512VPOPCNTDQ)) opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX512VPOPCNTDQ; + if (((processor_alias_table[i].flags & 
PTA_AVX512BF16) != 0) + && !(opts->x_ix86_isa_flags2_explicit + & OPTION_MASK_ISA_AVX512BF16)) + opts->x_ix86_isa_flags2 |= OPTION_MASK_ISA_AVX512BF16; if (((processor_alias_table[i].flags & PTA_SGX) != 0) && !(opts->x_ix86_isa_flags2_explicit & OPTION_MASK_ISA_SGX)) opts->x_ix86_isa_flags2 |= OPTION_MASK_ISA_SGX; Index: gcc/config/i386/i386.h =================================================================== --- gcc/config/i386/i386.h (revision 270934) +++ gcc/config/i386/i386.h (working copy) @@ -193,6 +193,8 @@ #define TARGET_CLDEMOTE_P(x) TARGET_ISA_CLDEMOTE_P(x) #define TARGET_PTWRITE TARGET_ISA_PTWRITE #define TARGET_PTWRITE_P(x) TARGET_ISA_PTWRITE_P(x) +#define TARGET_AVX512BF16 TARGET_ISA_AVX512BF16 +#define TARGET_AVX512BF16_P(x) TARGET_ISA_AVX512BF16_P(x) #define TARGET_LP64 TARGET_ABI_64 #define TARGET_LP64_P(x) TARGET_ABI_64_P(x) @@ -2355,6 +2357,7 @@ const wide_int_bitmask PTA_WBNOINVD (0, HOST_WIDE_INT_1U << 8); const wide_int_bitmask PTA_WAITPKG (0, HOST_WIDE_INT_1U << 9); const wide_int_bitmask PTA_PTWRITE (0, HOST_WIDE_INT_1U << 10); +const wide_int_bitmask PTA_AVX512BF16 (0, HOST_WIDE_INT_1U << 11); const wide_int_bitmask PTA_CORE2 = PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3 | PTA_SSSE3 | PTA_CX16 | PTA_FXSR; Index: gcc/config/i386/i386.opt =================================================================== --- gcc/config/i386/i386.opt (revision 270934) +++ gcc/config/i386/i386.opt (working copy) @@ -1101,3 +1101,8 @@ mrecord-return Target Report Var(ix86_flag_record_return) Init(0) Generate a __return_loc section pointing to all return instrumentation code. + +mavx512bf16 +Target Report Mask(ISA_AVX512BF16) Var(ix86_isa_flags2) Save +Support MMX, SSE, SSE2, SSE3, SSSE3, SSE4.1, SSE4.2, AVX, AVX2, AVX512F and +AVX512BF16 built-in functions and code generation. 
Index: gcc/config/i386/immintrin.h =================================================================== --- gcc/config/i386/immintrin.h (revision 270934) +++ gcc/config/i386/immintrin.h (working copy) @@ -130,6 +130,10 @@ #include <cldemoteintrin.h> +#include <avx512bf16vlintrin.h> + +#include <avx512bf16intrin.h> + #include <rdseedintrin.h> #include <prfchwintrin.h> Index: gcc/config/i386/sse.md =================================================================== --- gcc/config/i386/sse.md (revision 270934) +++ gcc/config/i386/sse.md (working copy) @@ -187,6 +187,11 @@ ;; For AVX512BITALG support UNSPEC_VPSHUFBIT + + ;; For AVX512BF16 support + UNSPEC_VCVTNE2PS2BF16 + UNSPEC_VCVTNEPS2BF16 + UNSPEC_VDPBF16PS ]) (define_c_enum "unspecv" [ @@ -726,6 +731,15 @@ (V16SF "hi") (V8SF "qi") (V4SF "qi") (V8DF "qi") (V4DF "qi") (V2DF "qi")]) +;; Mapping of vector modes to corresponding mask half size +(define_mode_attr avx512fmaskhalfmode + [(V64QI "SI") (V32QI "HI") (V16QI "QI") + (V32HI "HI") (V16HI "QI") (V8HI "QI") (V4HI "QI") + (V16SI "QI") (V8SI "QI") (V4SI "QI") + (V8DI "QI") (V4DI "QI") (V2DI "QI") + (V16SF "QI") (V8SF "QI") (V4SF "QI") + (V8DF "QI") (V4DF "QI") (V2DF "QI")]) + ;; Mapping of vector float modes to an integer mode of the same size (define_mode_attr sseintvecmode [(V16SF "V16SI") (V8DF "V8DI") @@ -22184,3 +22198,90 @@ "vpshufbitqmb\t{%2, %1, %0<mask_scalar_merge_operand3>|%0<mask_scalar_merge_operand3>, %1, %2}" [(set_attr "prefix" "evex") (set_attr "mode" "<sseinsnmode>")]) + +(define_mode_iterator BF16 [V32HI (V16HI "TARGET_AVX512VL") (V8HI "TARGET_AVX512VL")]) +;; Converting from BF to SF +(define_mode_attr bf16_cvt_2sf + [(V32HI "V16SF") (V16HI "V8SF") (V8HI "V4SF")]) +;; Converting from SF to BF +(define_mode_attr sf_cvt_bf16 + [(V4SF "V8HI") (V8SF "V8HI") (V16SF "V16HI")]) +;; Mapping from BF to SF +(define_mode_attr sf_bf16 + [(V4SF "V8HI") (V8SF "V16HI") (V16SF "V32HI")]) + +(define_expand "avx512f_cvtne2ps2bf16_<mode>_maskz" + 
[(match_operand:BF16 0 "register_operand") + (match_operand:<bf16_cvt_2sf> 1 "register_operand") + (match_operand:<bf16_cvt_2sf> 2 "register_operand") + (match_operand:<avx512fmaskmode> 3 "register_operand")] + "TARGET_AVX512BF16" +{ + emit_insn (gen_avx512f_cvtne2ps2bf16_<mode>_mask(operands[0], operands[1], + operands[2], CONST0_RTX(<MODE>mode), operands[3])); + DONE; +}) + +(define_insn "avx512f_cvtne2ps2bf16_<mode><mask_name>" + [(set (match_operand:BF16 0 "register_operand" "=v") + (unspec:BF16 + [(match_operand:<bf16_cvt_2sf> 1 "register_operand" "v") + (match_operand:<bf16_cvt_2sf> 2 "register_operand" "v")] + UNSPEC_VCVTNE2PS2BF16))] + "TARGET_AVX512BF16" + "vcvtne2ps2bf16\t{%2, %1, %0<mask_operand3>|%0<mask_operand3>, %1, %2}") + +(define_expand "avx512f_cvtneps2bf16_<mode>_maskz" + [(match_operand:<sf_cvt_bf16> 0 "register_operand") + (match_operand:VF1_AVX512VL 1 "register_operand") + (match_operand:<avx512fmaskmode> 2 "register_operand")] + "TARGET_AVX512BF16" +{ + emit_insn (gen_avx512f_cvtneps2bf16_<mode>_mask(operands[0], operands[1], + CONST0_RTX(<sf_cvt_bf16>mode), operands[2])); + DONE; +}) + +(define_insn "avx512f_cvtneps2bf16_<mode><mask_name>" + [(set (match_operand:<sf_cvt_bf16> 0 "register_operand" "=v") + (unspec:<sf_cvt_bf16> + [(match_operand:VF1_AVX512VL 1 "register_operand" "v")] + UNSPEC_VCVTNEPS2BF16))] + "TARGET_AVX512BF16" + "vcvtneps2bf16\t{%1, %0<mask_operand2>|%0<mask_operand2>, %1}") + +(define_expand "avx512f_dpbf16ps_<mode>_maskz" + [(match_operand:VF1_AVX512VL 0 "register_operand") + (match_operand:VF1_AVX512VL 1 "register_operand") + (match_operand:<sf_bf16> 2 "register_operand") + (match_operand:<sf_bf16> 3 "register_operand") + (match_operand:<avx512fmaskhalfmode> 4 "register_operand")] + "TARGET_AVX512BF16" +{ + emit_insn (gen_avx512f_dpbf16ps_<mode>_maskz_1(operands[0], operands[1], + operands[2], operands[3], CONST0_RTX(<MODE>mode), operands[4])); + DONE; +}) + +(define_insn "avx512f_dpbf16ps_<mode><maskz_half_name>" + 
[(set (match_operand:VF1_AVX512VL 0 "register_operand" "=v") + (unspec:VF1_AVX512VL + [(match_operand:VF1_AVX512VL 1 "register_operand" "0") + (match_operand:<sf_bf16> 2 "register_operand" "v") + (match_operand:<sf_bf16> 3 "register_operand" "v")] + UNSPEC_VDPBF16PS))] + "TARGET_AVX512BF16" + "vdpbf16ps\t{%3, %2, %0<maskz_half_operand4>|%0<maskz_half_operand4>, %2, %3}") + +(define_insn "avx512f_dpbf16ps_<mode>_mask" + [(set (match_operand:VF1_AVX512VL 0 "register_operand" "=v") + (vec_merge:VF1_AVX512VL + (unspec:VF1_AVX512VL + [(match_operand:VF1_AVX512VL 1 "register_operand" "0") + (match_operand:<sf_bf16> 2 "register_operand" "v") + (match_operand:<sf_bf16> 3 "register_operand" "v")] + UNSPEC_VDPBF16PS) + (match_dup 1) + (match_operand:<avx512fmaskhalfmode> 4 "register_operand" "Yk")))] + "TARGET_AVX512BF16" + "vdpbf16ps\t{%3, %2, %0%{%4%}|%0%{%4%}, %2, %3}") Index: gcc/config/i386/subst.md =================================================================== --- gcc/config/i386/subst.md (revision 270934) +++ gcc/config/i386/subst.md (working copy) @@ -313,3 +313,16 @@ (const_int 1)) (match_operand:SI 3 "const48_operand")] UNSPEC_EMBEDDED_ROUNDING))]) + +(define_subst_attr "maskz_half_name" "maskz_half" "" "_maskz_1") +(define_subst_attr "maskz_half_operand4" "maskz_half" "" "%{%5%}%N4") + +(define_subst "maskz_half" + [(set (match_operand:SUBST_V 0) + (match_operand:SUBST_V 1))] + "" + [(set (match_dup 0) + (vec_merge:SUBST_V + (match_dup 1) + (match_operand:SUBST_V 2 "const0_operand" "C") + (match_operand:<avx512fmaskhalfmode> 3 "register_operand" "Yk")))]) Index: gcc/config.gcc =================================================================== --- gcc/config.gcc (revision 270934) +++ gcc/config.gcc (working copy) @@ -407,7 +407,7 @@ avx512vnnivlintrin.h vaesintrin.h vpclmulqdqintrin.h avx512vpopcntdqvlintrin.h avx512bitalgintrin.h pconfigintrin.h wbnoinvdintrin.h movdirintrin.h - waitpkgintrin.h cldemoteintrin.h" + waitpkgintrin.h cldemoteintrin.h 
 avx512bf16vlintrin.h avx512bf16intrin.h" ;; x86_64-*-*) cpu_type=i386 @@ -439,7 +439,7 @@ avx512vnnivlintrin.h vaesintrin.h vpclmulqdqintrin.h avx512vpopcntdqvlintrin.h avx512bitalgintrin.h pconfigintrin.h wbnoinvdintrin.h movdirintrin.h - waitpkgintrin.h cldemoteintrin.h" + waitpkgintrin.h cldemoteintrin.h avx512bf16vlintrin.h avx512bf16intrin.h" ;; ia64-*-*) extra_headers=ia64intrin.h Index: gcc/doc/invoke.texi =================================================================== --- gcc/doc/invoke.texi (revision 270934) +++ gcc/doc/invoke.texi (working copy) @@ -1274,7 +1274,7 @@ -msse4a -m3dnow -m3dnowa -mpopcnt -mabm -mbmi -mtbm -mfma4 -mxop @gol -madx -mlzcnt -mbmi2 -mfxsr -mxsave -mxsaveopt -mrtm -mhle -mlwp @gol -mmwaitx -mclzero -mpku -mthreads -mgfni -mvaes -mwaitpkg @gol --mshstk -mmanual-endbr -mforce-indirect-call -mavx512vbmi2 @gol +-mshstk -mmanual-endbr -mforce-indirect-call -mavx512vbmi2 -mavx512bf16 @gol -mvpclmulqdq -mavx512bitalg -mmovdiri -mmovdir64b -mavx512vpopcntdq @gol -mavx5124fmaps -mavx512vnni -mavx5124vnniw -mprfchw -mrdpid @gol -mrdseed -msgx @gol @@ -28041,6 +28041,9 @@ @itemx -mavx512vbmi2 @opindex mavx512vbmi2 @need 200 +@itemx -mavx512bf16 +@opindex mavx512bf16 +@need 200 @itemx -mgfni @opindex mgfni @need 200 @@ -28083,7 +28086,7 @@ WBNOINVD, FMA4, PREFETCHW, RDPID, PREFETCHWT1, RDSEED, SGX, XOP, LWP, 3DNow!@:, enhanced 3DNow!@:, POPCNT, ABM, ADX, BMI, BMI2, LZCNT, FXSR, XSAVE, XSAVEOPT, XSAVEC, XSAVES, RTM, HLE, TBM, MWAITX, CLZERO, PKU, AVX512VBMI2, -GFNI, VAES, WAITPKG, VPCLMULQDQ, AVX512BITALG, MOVDIRI, MOVDIR64B, +GFNI, VAES, WAITPKG, VPCLMULQDQ, AVX512BITALG, MOVDIRI, MOVDIR64B, AVX512BF16, AVX512VPOPCNTDQ, AVX5124FMAPS, AVX512VNNI, AVX5124VNNIW, or CLDEMOTE extended instruction sets. Each has a corresponding @option{-mno-} option to disable use of these instructions. 
Index: gcc/testsuite/g++.dg/other/i386-2.C =================================================================== --- gcc/testsuite/g++.dg/other/i386-2.C (revision 270934) +++ gcc/testsuite/g++.dg/other/i386-2.C (working copy) @@ -1,5 +1,5 @@ /* { dg-do compile { target i?86-*-* x86_64-*-* } } */ -/* { dg-options "-O -pedantic-errors -march=k8 -msse4a -m3dnow -mavx -mavx2 -mfma4 -mxop -maes -mpclmul -mpopcnt -mabm -mlzcnt -mbmi -mbmi2 -mtbm -mlwp -mfsgsbase -mrdrnd -mf16c -mfma -mrtm -mrdseed -mprfchw -madx -mfxsr -mxsaveopt -mavx512f -mavx512er -mavx512cd -mavx512pf -msha -mprefetchwt1 -mxsavec -mxsaves -mclflushopt -mavx512dq -mavx512bw -mavx512vl -mavx512ifma -mavx512vbmi -mavx5124fmaps -mavx5124vnniw -mavx512vpopcntdq -mclwb -mmwaitx -mclzero -mpku -msgx -mrdpid -mgfni -mavx512bitalg -mpconfig -mwbnoinvd" } */ +/* { dg-options "-O -pedantic-errors -march=k8 -msse4a -m3dnow -mavx -mavx2 -mfma4 -mxop -maes -mpclmul -mpopcnt -mabm -mlzcnt -mbmi -mbmi2 -mtbm -mlwp -mfsgsbase -mrdrnd -mf16c -mfma -mrtm -mrdseed -mprfchw -madx -mfxsr -mxsaveopt -mavx512f -mavx512er -mavx512cd -mavx512pf -msha -mprefetchwt1 -mxsavec -mxsaves -mclflushopt -mavx512dq -mavx512bw -mavx512vl -mavx512ifma -mavx512vbmi -mavx5124fmaps -mavx5124vnniw -mavx512vpopcntdq -mclwb -mmwaitx -mclzero -mpku -msgx -mrdpid -mgfni -mavx512bitalg -mpconfig -mwbnoinvd -mavx512bf16" } */ /* Test that {,x,e,p,t,s,w,a,b,i}mmintrin.h, mm3dnow.h, fma4intrin.h, xopintrin.h, abmintrin.h, bmiintrin.h, tbmintrin.h, lwpintrin.h, Index: gcc/testsuite/g++.dg/other/i386-3.C =================================================================== --- gcc/testsuite/g++.dg/other/i386-3.C (revision 270934) +++ gcc/testsuite/g++.dg/other/i386-3.C (working copy) @@ -1,5 +1,5 @@ /* { dg-do compile { target i?86-*-* x86_64-*-* } } */ -/* { dg-options "-O -fkeep-inline-functions -march=k8 -msse4a -m3dnow -mavx -mavx2 -mfma4 -mxop -maes -mpclmul -mpopcnt -mabm -mlzcnt -mbmi -mbmi2 -mtbm -mlwp -mfsgsbase -mrdrnd -mf16c -mfma -mrtm 
-mrdseed -mprfchw -madx -mfxsr -mxsaveopt -mavx512f -mavx512er -mavx512cd -mavx512pf -msha -mprefetchwt1 -mxsavec -mxsaves -mclflushopt -mavx512dq -mavx512bw -mavx512vl -mavx512ifma -mavx512vbmi -mavx5124fmaps -mavx5124vnniw -mavx512vpopcntdq -mclwb -mmwaitx -mclzero -mpku -msgx -mrdpid -mgfni -mavx512bitalg -mpconfig -mwbnoinvd" } */ +/* { dg-options "-O -fkeep-inline-functions -march=k8 -msse4a -m3dnow -mavx -mavx2 -mfma4 -mxop -maes -mpclmul -mpopcnt -mabm -mlzcnt -mbmi -mbmi2 -mtbm -mlwp -mfsgsbase -mrdrnd -mf16c -mfma -mrtm -mrdseed -mprfchw -madx -mfxsr -mxsaveopt -mavx512f -mavx512er -mavx512cd -mavx512pf -msha -mprefetchwt1 -mxsavec -mxsaves -mclflushopt -mavx512dq -mavx512bw -mavx512vl -mavx512ifma -mavx512vbmi -mavx5124fmaps -mavx5124vnniw -mavx512vpopcntdq -mclwb -mmwaitx -mclzero -mpku -msgx -mrdpid -mgfni -mavx512bitalg -mpconfig -mwbnoinvd -mavx512bf16" } */ /* Test that {,x,e,p,t,s,w,a,b,i}mmintrin.h, mm3dnow.h, fma4intrin.h, xopintrin.h, abmintrin.h, bmiintrin.h, tbmintrin.h, lwpintrin.h, Index: gcc/testsuite/gcc.target/i386/avx-1.c =================================================================== --- gcc/testsuite/gcc.target/i386/avx-1.c (revision 270934) +++ gcc/testsuite/gcc.target/i386/avx-1.c (working copy) @@ -1,5 +1,5 @@ /* { dg-do compile } */ -/* { dg-options "-O2 -Werror-implicit-function-declaration -march=k8 -m3dnow -mavx -mavx2 -maes -mpclmul -mgfni -mavx512bw" } */ +/* { dg-options "-O2 -Werror-implicit-function-declaration -march=k8 -m3dnow -mavx -mavx2 -maes -mpclmul -mgfni -mavx512bw -mavx512vl -mavx512bf16" } */ /* { dg-add-options bind_pic_locally } */ #include <mm_malloc.h> Index: gcc/testsuite/gcc.target/i386/avx-2.c =================================================================== --- gcc/testsuite/gcc.target/i386/avx-2.c (revision 270934) +++ gcc/testsuite/gcc.target/i386/avx-2.c (working copy) @@ -1,5 +1,5 @@ /* { dg-do compile } */ -/* { dg-options "-O0 -Werror-implicit-function-declaration -march=k8 -m3dnow -mavx -mavx2 
-msse4a -maes -mpclmul -mavx512bw" } */ +/* { dg-options "-O0 -Werror-implicit-function-declaration -march=k8 -m3dnow -mavx -mavx2 -msse4a -maes -mpclmul -mavx512bw -mavx512vl -mavx512bf16" } */ /* { dg-add-options bind_pic_locally } */ #include <mm_malloc.h> Index: gcc/testsuite/gcc.target/i386/avx512bf16-vcvtne2ps2bf16-1.c =================================================================== --- gcc/testsuite/gcc.target/i386/avx512bf16-vcvtne2ps2bf16-1.c (nonexistent) +++ gcc/testsuite/gcc.target/i386/avx512bf16-vcvtne2ps2bf16-1.c (working copy) @@ -0,0 +1,19 @@ +/* { dg-do compile } */ +/* { dg-options "-mavx512bf16 -O2" } */ +/* { dg-final { scan-assembler-times "vcvtne2ps2bf16\[ \\t\]+\[^\{\n\]*%zmm\[0-9\]+\[^\n\r]*%zmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */ +/* { dg-final { scan-assembler-times "vcvtne2ps2bf16\[ \\t\]+\[^\{\n\]*%zmm\[0-9\]+\[^\n\r]*%zmm\[0-9\]+\{%k\[0-9\]\}\[^\n\r]*(?:\n|\[ \\t\]+#)" 2 } } */ +/* { dg-final { scan-assembler-times "vcvtne2ps2bf16\[ \\t\]+\[^\{\n\]*%zmm\[0-9\]+\[^\n\r]*%zmm\[0-9\]+\{%k\[0-9\]\}\{z\}\[^\n\r]*(?:\n|\[ \\t\]+#)" 1 } } */ + +#include <immintrin.h> + +volatile __m512bh res; +volatile __m512 x1, x2; +volatile __mmask32 m32; + +void extern +avx512bf16_test (void) +{ + res = _mm512_cvtne2ps_pbh (x1, x2); + res = _mm512_mask_cvtne2ps_pbh (res, m32, x1, x2); + res = _mm512_maskz_cvtne2ps_pbh (m32, x1, x2); +} Index: gcc/testsuite/gcc.target/i386/avx512bf16-vcvtneps2bf16-1.c =================================================================== --- gcc/testsuite/gcc.target/i386/avx512bf16-vcvtneps2bf16-1.c (nonexistent) +++ gcc/testsuite/gcc.target/i386/avx512bf16-vcvtneps2bf16-1.c (working copy) @@ -0,0 +1,19 @@ +/* { dg-do compile } */ +/* { dg-options "-mavx512bf16 -O2" } */ +/* { dg-final { scan-assembler-times "vcvtneps2bf16\[ \\t\]+\[^\{\n\]*%zmm\[0-9\]+\[^\n\r]*%ymm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */ +/* { dg-final { scan-assembler-times "vcvtneps2bf16\[ 
\\t\]+\[^\{\n\]*%zmm\[0-9\]+\[^\n\r]*%ymm\[0-9\]+\{%k\[0-9\]\}\[^\n\r]*(?:\n|\[ \\t\]+#)" 2 } } */ +/* { dg-final { scan-assembler-times "vcvtneps2bf16\[ \\t\]+\[^\{\n\]*%zmm\[0-9\]+\[^\n\r]*%ymm\[0-9\]+\{%k\[0-9\]\}\{z\}\[^\n\r]*(?:\n|\[ \\t\]+#)" 1 } } */ + +#include <immintrin.h> + +volatile __m256bh res; +volatile __m512 x1; +volatile __mmask16 m16; + +void extern +avx512bf16_test (void) +{ + res = _mm512_cvtneps_pbh (x1); + res = _mm512_mask_cvtneps_pbh (res, m16, x1); + res = _mm512_maskz_cvtneps_pbh (m16, x1); +} Index: gcc/testsuite/gcc.target/i386/avx512bf16-vdpbf16ps-1.c =================================================================== --- gcc/testsuite/gcc.target/i386/avx512bf16-vdpbf16ps-1.c (nonexistent) +++ gcc/testsuite/gcc.target/i386/avx512bf16-vdpbf16ps-1.c (working copy) @@ -0,0 +1,19 @@ +/* { dg-do compile } */ +/* { dg-options "-mavx512bf16 -O2" } */ +/* { dg-final { scan-assembler-times "vdpbf16ps\[ \\t\]+\[^\{\n\]*%zmm\[0-9\]+\[^\n\r]*%zmm\[0-9\]+\[^\{\n\]*%zmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */ +/* { dg-final { scan-assembler-times "vdpbf16ps\[ \\t\]+\[^\{\n\]*%zmm\[0-9\]+\[^\{\n\]*%zmm\[0-9\]+\[^\n\r]*%zmm\[0-9\]+\{%k\[0-9\]\}\[^\n\r]*(?:\n|\[ \\t\]+#)" 2 } } */ +/* { dg-final { scan-assembler-times "vdpbf16ps\[ \\t\]+\[^\{\n\]*%zmm\[0-9\]+\[^\n\r]*%zmm\[0-9\]+\[^\{\n\]*%zmm\[0-9\]+\{%k\[0-9\]\}\{z\}\[^\n\r]*(?:\n|\[ \\t\]+#)" 1 } } */ + +#include <immintrin.h> + +volatile __m512 res; +volatile __m512bh x1, x2; +volatile __mmask16 m16; + +void extern +avx512bf16_test (void) +{ + res = _mm512_dpbf16_ps (res, x1, x2); + res = _mm512_mask_dpbf16_ps (res, m16, x1, x2); + res = _mm512_maskz_dpbf16_ps (m16, res, x1, x2); +} Index: gcc/testsuite/gcc.target/i386/avx512bf16-vdpbf16ps-2.c =================================================================== --- gcc/testsuite/gcc.target/i386/avx512bf16-vdpbf16ps-2.c (nonexistent) +++ gcc/testsuite/gcc.target/i386/avx512bf16-vdpbf16ps-2.c (working copy) @@ -0,0 +1,49 @@ +/* { dg-do compile } */ +/* { 
dg-options "-mavx512bf16 -O2" } */ +/* { dg-final { scan-assembler-times "vdpbf16ps\[ \\t\]+\[^\{\n\]*%zmm\[0-9\]+\[^\{\n\]*%zmm\[0-9\]+\[^\n\r]*%zmm\[0-9\]+\{%k\[0-9\]\}\[^\n\r]*(?:\n|\[ \\t\]+#)" 1 } } */ + +#include <immintrin.h> + +typedef union +{ + __m512 x; + float a[16]; +} union512s; + +float res_ref[16]; +union512s res; +__m512bh x1, x2; +__mmask16 m16; + +static void __attribute__((noinline, unused)) +merge_masking_s (float *arr, unsigned long long mask, int size) +{ + int i; + for (i = 0; i < size; i++) + { + arr[i] = (mask & (1LL << i)) ? arr[i] : 117; + } +} + +static int __attribute__((noinline, unused)) +check_union512s (union512s u, const float *v) +{ + int i; + int err = 0; + for (i = 0; i < (sizeof (u.a) / sizeof ((u.a)[0])); i++) + if (u.a[i] != v[i]) + { + err++; + ; + } + return err; +} + +void extern +avx512bf16_test (void) +{ + res.x = _mm512_mask_dpbf16_ps (res.x, m16, x1, x2); + merge_masking_s (res_ref, m16, 16); + if (check_union512s (res, res_ref)) + abort (); +} Index: gcc/testsuite/gcc.target/i386/avx512bf16vl-vcvtne2ps2bf16-1.c =================================================================== --- gcc/testsuite/gcc.target/i386/avx512bf16vl-vcvtne2ps2bf16-1.c (nonexistent) +++ gcc/testsuite/gcc.target/i386/avx512bf16vl-vcvtne2ps2bf16-1.c (working copy) @@ -0,0 +1,29 @@ +/* { dg-do compile } */ +/* { dg-options "-mavx512bf16 -mavx512vl -O2" } */ +/* { dg-final { scan-assembler-times "vcvtne2ps2bf16\[ \\t\]+\[^\{\n\]*%ymm\[0-9\]+\[^\n\r]*%ymm\[0-9\]+\[^\n\r]*%ymm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */ +/* { dg-final { scan-assembler-times "vcvtne2ps2bf16\[ \\t\]+\[^\{\n\]*%ymm\[0-9\]+\[^\n\r]*%ymm\[0-9\]+\[^\n\r]*%ymm\[0-9\]+\{%k\[0-9\]\}(?:\n|\[ \\t\]+#)" 1 } } */ +/* { dg-final { scan-assembler-times "vcvtne2ps2bf16\[ \\t\]+\[^\{\n\]*%ymm\[0-9\]+\[^\n\r]*%ymm\[0-9\]+\[^\n\r]*%ymm\[0-9\]+\{%k\[0-9\]\}\{z\}\[^\n\r]*(?:\n|\[ \\t\]+#)" 1 } } */ +/* { dg-final { scan-assembler-times "vcvtne2ps2bf16\[ 
\\t\]+\[^\{\n\]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */ +/* { dg-final { scan-assembler-times "vcvtne2ps2bf16\[ \\t\]+\[^\{\n\]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\{%k\[0-9\]\}(?:\n|\[ \\t\]+#)" 1 } } */ +/* { dg-final { scan-assembler-times "vcvtne2ps2bf16\[ \\t\]+\[^\{\n\]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\{%k\[0-9\]\}\{z\}\[^\n\r]*(?:\n|\[ \\t\]+#)" 1 } } */ + +#include <immintrin.h> + +volatile __m128bh res1; +volatile __m256bh res2; +volatile __m128 x1, x2; +volatile __m256 x3, x4; +volatile __mmask8 m8; +volatile __mmask16 m16; + +void extern +avx512bf16_test (void) +{ + res2 = _mm256_cvtne2ps_pbh (x3, x4); + res2 = _mm256_mask_cvtne2ps_pbh (res2, m16, x3, x4); + res2 = _mm256_maskz_cvtne2ps_pbh (m16, x3, x4); + + res1 = _mm_cvtne2ps_pbh (x1, x2); + res1 = _mm_mask_cvtne2ps_pbh (res1, m8, x1, x2); + res1 = _mm_maskz_cvtne2ps_pbh (m8, x1, x2); +} Index: gcc/testsuite/gcc.target/i386/avx512bf16vl-vcvtneps2bf16-1.c =================================================================== --- gcc/testsuite/gcc.target/i386/avx512bf16vl-vcvtneps2bf16-1.c (nonexistent) +++ gcc/testsuite/gcc.target/i386/avx512bf16vl-vcvtneps2bf16-1.c (working copy) @@ -0,0 +1,27 @@ +/* { dg-do compile } */ +/* { dg-options "-mavx512bf16 -mavx512vl -O2" } */ +/* { dg-final { scan-assembler-times "vcvtneps2bf16\[ \\t\]+\[^\{\n\]*%ymm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */ +/* { dg-final { scan-assembler-times "vcvtneps2bf16\[ \\t\]+\[^\{\n\]*%ymm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\{%k\[0-9\]\}(?:\n|\[ \\t\]+#)" 1 } } */ +/* { dg-final { scan-assembler-times "vcvtneps2bf16\[ \\t\]+\[^\{\n\]*%ymm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\{%k\[0-9\]\}\{z\}\[^\n\r]*(?:\n|\[ \\t\]+#)" 1 } } */ +/* { dg-final { scan-assembler-times "vcvtneps2bf16\[ \\t\]+\[^\{\n\]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */ +/* { dg-final { scan-assembler-times "vcvtneps2bf16\[ 
\\t\]+\[^\{\n\]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\{%k\[0-9\]\}(?:\n|\[ \\t\]+#)" 1 } } */ +/* { dg-final { scan-assembler-times "vcvtneps2bf16\[ \\t\]+\[^\{\n\]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\{%k\[0-9\]\}\{z\}\[^\n\r]*(?:\n|\[ \\t\]+#)" 1 } } */ + +#include <immintrin.h> + +volatile __m128bh res1, res2; +volatile __m128 x1; +volatile __m256 x2; +volatile __mmask8 m8; + +void extern +avx512bf16_test (void) +{ + res2 = _mm256_cvtneps_pbh (x2); + res2 = _mm256_mask_cvtneps_pbh (res2, m8, x2); + res2 = _mm256_maskz_cvtneps_pbh (m8, x2); + + res1 = _mm_cvtneps_pbh (x1); + res1 = _mm_mask_cvtneps_pbh (res1, m8, x1); + res1 = _mm_maskz_cvtneps_pbh (m8, x1); +} Index: gcc/testsuite/gcc.target/i386/avx512bf16vl-vdpbf16ps-1.c =================================================================== --- gcc/testsuite/gcc.target/i386/avx512bf16vl-vdpbf16ps-1.c (nonexistent) +++ gcc/testsuite/gcc.target/i386/avx512bf16vl-vdpbf16ps-1.c (working copy) @@ -0,0 +1,28 @@ +/* { dg-do compile } */ +/* { dg-options "-mavx512bf16 -mavx512vl -O2" } */ +/* { dg-final { scan-assembler-times "vdpbf16ps\[ \\t\]+\[^\{\n\]*%ymm\[0-9\]+\[^\n\r]*%ymm\[0-9\]+\[^\{\n\]*%ymm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */ +/* { dg-final { scan-assembler-times "vdpbf16ps\[ \\t\]+\[^\{\n\]*%ymm\[0-9\]+\[^\{\n\]*%ymm\[0-9\]+\[^\n\r]*%ymm\[0-9\]+\{%k\[0-9\]\}\[^\n\r]*(?:\n|\[ \\t\]+#)" 2 } } */ +/* { dg-final { scan-assembler-times "vdpbf16ps\[ \\t\]+\[^\{\n\]*%ymm\[0-9\]+\[^\n\r]*%ymm\[0-9\]+\[^\{\n\]*%ymm\[0-9\]+\{%k\[0-9\]\}\{z\}\[^\n\r]*(?:\n|\[ \\t\]+#)" 1 } } */ +/* { dg-final { scan-assembler-times "vdpbf16ps\[ \\t\]+\[^\{\n\]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\[^\{\n\]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */ +/* { dg-final { scan-assembler-times "vdpbf16ps\[ \\t\]+\[^\{\n\]*%xmm\[0-9\]+\[^\{\n\]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\{%k\[0-9\]\}\[^\n\r]*(?:\n|\[ \\t\]+#)" 2 } } */ +/* { dg-final { scan-assembler-times "vdpbf16ps\[ 
\\t\]+\[^\{\n\]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\[^\{\n\]*%xmm\[0-9\]+\{%k\[0-9\]\}\{z\}\[^\n\r]*(?:\n|\[ \\t\]+#)" 1 } } */ + +#include <immintrin.h> + +volatile __m256 res1; +volatile __m256bh x1, x2; +volatile __m128 res2; +volatile __m128bh x3, x4; +volatile __mmask8 m8; + +void extern +avx512bf16_test (void) +{ + res1 = _mm256_dpbf16_ps (res1, x1, x2); + res1 = _mm256_mask_dpbf16_ps (res1, m8, x1, x2); + res1 = _mm256_maskz_dpbf16_ps (m8, res1, x1, x2); + + res2 = _mm_dpbf16_ps (res2, x3, x4); + res2 = _mm_mask_dpbf16_ps (res2, m8, x3, x4); + res2 = _mm_maskz_dpbf16_ps (m8, res2, x3, x4); +} Index: gcc/testsuite/gcc.target/i386/sse-12.c =================================================================== --- gcc/testsuite/gcc.target/i386/sse-12.c (revision 270934) +++ gcc/testsuite/gcc.target/i386/sse-12.c (working copy) @@ -3,7 +3,7 @@ popcntintrin.h gfniintrin.h and mm_malloc.h are usable with -O -std=c89 -pedantic-errors. */ /* { dg-do compile } */ -/* { dg-options "-O -std=c89 -pedantic-errors -march=k8 -msse4a -m3dnow -mavx -mavx2 -mfma4 -mxop -maes -mpclmul -mpopcnt -mabm -mlzcnt -mbmi -mbmi2 -mtbm -mlwp -mfsgsbase -mrdrnd -mf16c -mfma -mrtm -mrdseed -mprfchw -madx -mfxsr -mxsaveopt -mavx512f -mavx512er -mavx512cd -mavx512pf -msha -mprefetchwt1 -mxsavec -mxsaves -mclflushopt -mavx512bw -mavx512dq -mavx512vl -mavx512vbmi -mavx512ifma -mavx5124fmaps -mavx5124vnniw -mavx512vpopcntdq -mclwb -mmwaitx -mclzero -mpku -msgx -mrdpid -mgfni -mavx512bitalg -mpconfig -mwbnoinvd" } */ +/* { dg-options "-O -std=c89 -pedantic-errors -march=k8 -msse4a -m3dnow -mavx -mavx2 -mfma4 -mxop -maes -mpclmul -mpopcnt -mabm -mlzcnt -mbmi -mbmi2 -mtbm -mlwp -mfsgsbase -mrdrnd -mf16c -mfma -mrtm -mrdseed -mprfchw -madx -mfxsr -mxsaveopt -mavx512f -mavx512er -mavx512cd -mavx512pf -msha -mprefetchwt1 -mxsavec -mxsaves -mclflushopt -mavx512bw -mavx512dq -mavx512vl -mavx512vbmi -mavx512ifma -mavx5124fmaps -mavx5124vnniw -mavx512vpopcntdq -mclwb -mmwaitx -mclzero -mpku -msgx -mrdpid 
-mgfni -mavx512bitalg -mpconfig -mwbnoinvd -mavx512bf16" } */ #include <x86intrin.h> Index: gcc/testsuite/gcc.target/i386/sse-13.c =================================================================== --- gcc/testsuite/gcc.target/i386/sse-13.c (revision 270934) +++ gcc/testsuite/gcc.target/i386/sse-13.c (working copy) @@ -1,5 +1,5 @@ /* { dg-do compile } */ -/* { dg-options "-O2 -Werror-implicit-function-declaration -march=k8 -msse4a -m3dnow -mavx -mavx2 -mfma4 -mxop -maes -mpclmul -mpopcnt -mabm -mlzcnt -mbmi -mbmi2 -mtbm -mlwp -mfsgsbase -mrdrnd -mf16c -mfma -mrtm -mrdseed -mprfchw -madx -mfxsr -mxsaveopt -mavx512f -mavx512er -mavx512cd -mavx512pf -msha -mprefetchwt1 -mxsavec -mxsaves -mclflushopt -mavx512vl -mavx512dq -mavx512bw -mavx512vbmi -mavx512ifma -mavx5124fmaps -mavx5124vnniw -mavx512vpopcntdq -mclwb -mmwaitx -mclzero -mpku -msgx -mrdpid -mgfni -mavx512bitalg -mpconfig -mwbnoinvd" } */ +/* { dg-options "-O2 -Werror-implicit-function-declaration -march=k8 -msse4a -m3dnow -mavx -mavx2 -mfma4 -mxop -maes -mpclmul -mpopcnt -mabm -mlzcnt -mbmi -mbmi2 -mtbm -mlwp -mfsgsbase -mrdrnd -mf16c -mfma -mrtm -mrdseed -mprfchw -madx -mfxsr -mxsaveopt -mavx512f -mavx512er -mavx512cd -mavx512pf -msha -mprefetchwt1 -mxsavec -mxsaves -mclflushopt -mavx512vl -mavx512dq -mavx512bw -mavx512vbmi -mavx512ifma -mavx5124fmaps -mavx5124vnniw -mavx512vpopcntdq -mclwb -mmwaitx -mclzero -mpku -msgx -mrdpid -mgfni -mavx512bitalg -mpconfig -mwbnoinvd -mavx512bf16" } */ /* { dg-add-options bind_pic_locally } */ #include <mm_malloc.h> Index: gcc/testsuite/gcc.target/i386/sse-14.c =================================================================== --- gcc/testsuite/gcc.target/i386/sse-14.c (revision 270934) +++ gcc/testsuite/gcc.target/i386/sse-14.c (working copy) @@ -1,5 +1,5 @@ /* { dg-do compile } */ -/* { dg-options "-O0 -Werror-implicit-function-declaration -march=k8 -msse4a -m3dnow -mavx -mavx2 -mfma4 -mxop -maes -mpclmul -mpopcnt -mabm -mlzcnt -mbmi -mbmi2 -mtbm -mlwp -mfsgsbase 
-mrdrnd -mf16c -mfma -mrtm -mrdseed -mprfchw -madx -mfxsr -mxsaveopt -mavx512f -mavx512er -mavx512cd -mavx512pf -msha -mprefetchwt1 -mxsavec -mxsaves -mclflushopt -mavx512dq -mavx512bw -mavx512vl -mavx512ifma -mavx512vbmi -mavx5124fmaps -mavx5124vnniw -mavx512vpopcntdq -mclwb -mmwaitx -mclzero -mpku -msgx -mrdpid -mgfni -mpconfig -mwbnoinvd" } */ +/* { dg-options "-O0 -Werror-implicit-function-declaration -march=k8 -msse4a -m3dnow -mavx -mavx2 -mfma4 -mxop -maes -mpclmul -mpopcnt -mabm -mlzcnt -mbmi -mbmi2 -mtbm -mlwp -mfsgsbase -mrdrnd -mf16c -mfma -mrtm -mrdseed -mprfchw -madx -mfxsr -mxsaveopt -mavx512f -mavx512er -mavx512cd -mavx512pf -msha -mprefetchwt1 -mxsavec -mxsaves -mclflushopt -mavx512dq -mavx512bw -mavx512vl -mavx512ifma -mavx512vbmi -mavx5124fmaps -mavx5124vnniw -mavx512vpopcntdq -mclwb -mmwaitx -mclzero -mpku -msgx -mrdpid -mgfni -mpconfig -mwbnoinvd -mavx512vl -mavx512bf16" } */ /* { dg-add-options bind_pic_locally } */ #include <mm_malloc.h> Index: gcc/testsuite/gcc.target/i386/sse-22.c =================================================================== --- gcc/testsuite/gcc.target/i386/sse-22.c (revision 270934) +++ gcc/testsuite/gcc.target/i386/sse-22.c (working copy) @@ -101,7 +101,7 @@ #ifndef DIFFERENT_PRAGMAS -#pragma GCC target ("sse4a,3dnow,avx,avx2,fma4,xop,aes,pclmul,popcnt,abm,lzcnt,bmi,bmi2,tbm,lwp,fsgsbase,rdrnd,f16c,rtm,rdseed,prfchw,adx,fxsr,xsaveopt,avx512f,avx512er,avx512cd,avx512pf,sha,prefetchwt1,avx512vl,avx512bw,avx512dq,avx512vbmi,avx512ifma,avx5124fmaps,avx5124vnniw,avx512vpopcntdq,gfni,avx512bitalg") +#pragma GCC target ("sse4a,3dnow,avx,avx2,fma4,xop,aes,pclmul,popcnt,abm,lzcnt,bmi,bmi2,tbm,lwp,fsgsbase,rdrnd,f16c,rtm,rdseed,prfchw,adx,fxsr,xsaveopt,avx512f,avx512er,avx512cd,avx512pf,sha,prefetchwt1,avx512vl,avx512bw,avx512dq,avx512vbmi,avx512ifma,avx5124fmaps,avx5124vnniw,avx512vpopcntdq,gfni,avx512bitalg,avx512bf16") #endif /* Following intrinsics require immediate arguments. 
They @@ -218,7 +218,7 @@ /* immintrin.h (AVX/AVX2/RDRND/FSGSBASE/F16C/RTM/AVX512F/SHA) */ #ifdef DIFFERENT_PRAGMAS -#pragma GCC target ("avx,avx2,rdrnd,fsgsbase,f16c,rtm,avx512f,avx512er,avx512cd,avx512pf,sha,avx512vl,avx512bw,avx512dq,avx512ifma,avx512vbmi,avx5124fmaps,avx5124vnniw,avx512vpopcntdq,gfni,avx512bitalg") +#pragma GCC target ("avx,avx2,rdrnd,fsgsbase,f16c,rtm,avx512f,avx512er,avx512cd,avx512pf,sha,avx512vl,avx512bw,avx512dq,avx512ifma,avx512vbmi,avx5124fmaps,avx5124vnniw,avx512vpopcntdq,gfni,avx512bitalg,avx512bf16") #endif #include <immintrin.h> test_1 (_cvtss_sh, unsigned short, float, 1) Index: gcc/testsuite/gcc.target/i386/sse-23.c =================================================================== --- gcc/testsuite/gcc.target/i386/sse-23.c (revision 270934) +++ gcc/testsuite/gcc.target/i386/sse-23.c (working copy) @@ -696,6 +696,6 @@ #define __builtin_ia32_vpclmulqdq_v2di(A, B, C) __builtin_ia32_vpclmulqdq_v2di(A, B, 1) #define __builtin_ia32_vpclmulqdq_v8di(A, B, C) __builtin_ia32_vpclmulqdq_v8di(A, B, 1) -#pragma GCC target ("sse4a,3dnow,avx,avx2,fma4,xop,aes,pclmul,popcnt,abm,lzcnt,bmi,bmi2,tbm,lwp,fsgsbase,rdrnd,f16c,fma,rtm,rdseed,prfchw,adx,fxsr,xsaveopt,avx512f,avx512er,avx512cd,avx512pf,sha,prefetchwt1,xsavec,xsaves,clflushopt,avx512bw,avx512dq,avx512vl,avx512vbmi,avx512ifma,avx5124fmaps,avx5124vnniw,avx512vpopcntdq,clwb,mwaitx,clzero,pku,sgx,rdpid,gfni,avx512vbmi2,vpclmulqdq,avx512bitalg,pconfig,wbnoinvd") +#pragma GCC target ("sse4a,3dnow,avx,avx2,fma4,xop,aes,pclmul,popcnt,abm,lzcnt,bmi,bmi2,tbm,lwp,fsgsbase,rdrnd,f16c,fma,rtm,rdseed,prfchw,adx,fxsr,xsaveopt,avx512f,avx512er,avx512cd,avx512pf,sha,prefetchwt1,xsavec,xsaves,clflushopt,avx512bw,avx512dq,avx512vl,avx512vbmi,avx512ifma,avx5124fmaps,avx5124vnniw,avx512vpopcntdq,clwb,mwaitx,clzero,pku,sgx,rdpid,gfni,avx512vbmi2,vpclmulqdq,avx512bitalg,pconfig,wbnoinvd,avx512bf16") #include <x86intrin.h>