On Wed, Apr 17, 2019 at 7:14 PM Uros Bizjak <ubiz...@gmail.com> wrote:
>
> On Wed, Apr 17, 2019 at 1:03 PM Uros Bizjak <ubiz...@gmail.com> wrote:
> >
> > On Wed, Apr 17, 2019 at 12:29 PM Hongtao Liu <crazy...@gmail.com> wrote:
> > >
> > > On Fri, Apr 12, 2019 at 11:18 PM H.J. Lu <hjl.to...@gmail.com> wrote:
> > > >
> > > > On Fri, Apr 12, 2019 at 3:19 AM Uros Bizjak <ubiz...@gmail.com> wrote:
> > > > >
> > > > > On Fri, Apr 12, 2019 at 11:03 AM Hongtao Liu <crazy...@gmail.com> 
> > > > > wrote:
> > > > > >
> > > > > > On Fri, Apr 12, 2019 at 3:30 PM Uros Bizjak <ubiz...@gmail.com> 
> > > > > > wrote:
> > > > > > >
> > > > > > > On Fri, Apr 12, 2019 at 9:09 AM Liu, Hongtao 
> > > > > > > <hongtao....@intel.com> wrote:
> > > > > > > >
> > > > > > > > Hi :
> > > > > > > >     This patch is about to enable support for bfloat16 which 
> > > > > > > > will be in Future Cooper Lake, Please refer to 
> > > > > > > > https://software.intel.com/en-us/download/intel-architecture-instruction-set-extensions-programming-reference
> > > > > > > > for more details about BF16.
> > > > > > > >
> > > > > > > > There are 3 instructions for AVX512BF16: VCVTNE2PS2BF16, 
> > > > > > > > VCVTNEPS2BF16 and DPBF16PS instructions, which are Vector 
> > > > > > > > Neural Network Instructions supporting:
> > > > > > > >
> > > > > > > > -       VCVTNE2PS2BF16: Convert Two Packed Single Data to One 
> > > > > > > > Packed BF16 Data.
> > > > > > > > -       VCVTNEPS2BF16: Convert Packed Single Data to Packed 
> > > > > > > > BF16 Data.
> > > > > > > > -       VDPBF16PS: Dot Product of BF16 Pairs Accumulated into 
> > > > > > > > Packed Single Precision.
> > > > > > > >
> > > > > > > > Since only BF16 intrinsics are supported, we treat it as HI for 
> > > > > > > > simplicity.
> > > > > > >
> > > > > > > I think it was a mistake declaring cvtps2ph and cvtph2ps using 
> > > > > > > HImode
> > > > > > > instead of HFmode. Is there a compelling reason not to introduce
> > > > > > > corresponding bf16_format supporting infrastructure and declare 
> > > > > > > these
> > > > > > > intrinsics using half-binary (HBmode ?) mode instead?
> > > > > > >
> > > > > > > Uros.
> > > > > >
> > > > > > Bfloat16 isn't IEEE standard which we want to reserve HFmode for.
> > > > >
> > > > > True.
> > > > >
> > > > > > The IEEE 754 standard specifies a binary16 as having the following 
> > > > > > format:
> > > > > > Sign bit: 1 bit
> > > > > > Exponent width: 5 bits
> > > > > > Significand precision: 11 bits (10 explicitly stored)
> > > > > >
> > > > > > Bfloat16 has the following format:
> > > > > > Sign bit: 1 bit
> > > > > > Exponent width: 8 bits
> > > > > > Significand precision: 8 bits (7 explicitly stored), as opposed to 
> > > > > > 24
> > > > > > bits in a classical single-precision floating-point format
> > > > >
> > > > > This is why I proposed to introduce HBmode (and corresponding
> > > > > bfloat16_format) to distingush between ieee HFmode and BFmode.
> > > > >
> > > >
> > > > Unless there is BF16 language level support,  HBmode has no advantage
> > > > over HImode.   We can add HBmode when we gain BF16 language support.
> > > >
> > > > --
> > > > H.J.
> > >
> > > Any other comments, I'll merge this to trunk?
> >
> > It is not a regression, so please no.
>
> Ehm, "regression fix" ...
>
> Uros.

Update patch.

-- 
BR,
Hongtao
Index: gcc/common/config/i386/i386-common.c
===================================================================
--- gcc/common/config/i386/i386-common.c	(revision 270934)
+++ gcc/common/config/i386/i386-common.c	(working copy)
@@ -88,6 +88,7 @@
   (OPTION_MASK_ISA_AVX512VPOPCNTDQ | OPTION_MASK_ISA_AVX512F_SET)
 #define OPTION_MASK_ISA_AVX512BITALG_SET \
   (OPTION_MASK_ISA_AVX512BITALG | OPTION_MASK_ISA_AVX512F_SET)
+#define OPTION_MASK_ISA_AVX512BF16_SET OPTION_MASK_ISA_AVX512BF16
 #define OPTION_MASK_ISA_RTM_SET OPTION_MASK_ISA_RTM
 #define OPTION_MASK_ISA_PRFCHW_SET OPTION_MASK_ISA_PRFCHW
 #define OPTION_MASK_ISA_RDSEED_SET OPTION_MASK_ISA_RDSEED
@@ -215,6 +216,7 @@
 #define OPTION_MASK_ISA_AVX512VNNI_UNSET OPTION_MASK_ISA_AVX512VNNI
 #define OPTION_MASK_ISA_AVX512VPOPCNTDQ_UNSET OPTION_MASK_ISA_AVX512VPOPCNTDQ
 #define OPTION_MASK_ISA_AVX512BITALG_UNSET OPTION_MASK_ISA_AVX512BITALG
+#define OPTION_MASK_ISA_AVX512BF16_UNSET OPTION_MASK_ISA_AVX512BF16
 #define OPTION_MASK_ISA_RTM_UNSET OPTION_MASK_ISA_RTM
 #define OPTION_MASK_ISA_PRFCHW_UNSET OPTION_MASK_ISA_PRFCHW
 #define OPTION_MASK_ISA_RDSEED_UNSET OPTION_MASK_ISA_RDSEED
@@ -276,10 +278,14 @@
    | OPTION_MASK_ISA_SSE_UNSET)
 
 #define OPTION_MASK_ISA2_AVX512F_UNSET \
-  (OPTION_MASK_ISA_AVX5124FMAPS_UNSET | OPTION_MASK_ISA_AVX5124VNNIW_UNSET)
+  (OPTION_MASK_ISA_AVX512BF16_UNSET \
+   | OPTION_MASK_ISA_AVX5124FMAPS_UNSET \
+   | OPTION_MASK_ISA_AVX5124VNNIW_UNSET)
 #define OPTION_MASK_ISA2_GENERAL_REGS_ONLY_UNSET \
   (OPTION_MASK_ISA2_AVX512F_UNSET)
 
+#define OPTION_MASK_ISA2_AVX512BW_UNSET OPTION_MASK_ISA_AVX512BF16_UNSET
+
 /* Set 1 << value as value of -malign-FLAG option.  */
 
 static void
@@ -738,6 +744,21 @@
 	}
       return true;
 
+    case OPT_mavx512bf16:
+      if (value)
+	{
+	  opts->x_ix86_isa_flags2 |= OPTION_MASK_ISA_AVX512BF16_SET;
+	  opts->x_ix86_isa_flags2_explicit |= OPTION_MASK_ISA_AVX512BF16_SET;
+	  opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX512BW_SET;
+	  opts->x_ix86_isa_flags_explicit |= OPTION_MASK_ISA_AVX512BW_SET;
+	}
+      else
+	{
+	  opts->x_ix86_isa_flags2 &= ~OPTION_MASK_ISA_AVX512BF16_UNSET;
+	  opts->x_ix86_isa_flags2_explicit |= OPTION_MASK_ISA_AVX512BF16_UNSET;
+	}
+      return true;
+
     case OPT_msgx:
       if (value)
 	{
@@ -800,6 +821,8 @@
 	{
 	  opts->x_ix86_isa_flags &= ~OPTION_MASK_ISA_AVX512BW_UNSET;
 	  opts->x_ix86_isa_flags_explicit |= OPTION_MASK_ISA_AVX512BW_UNSET;
+	  opts->x_ix86_isa_flags2 &= ~OPTION_MASK_ISA2_AVX512BW_UNSET;
+	  opts->x_ix86_isa_flags2_explicit |= OPTION_MASK_ISA2_AVX512BW_UNSET;
 	}
       return true;
 
Index: gcc/config/i386/avx512bf16intrin.h
===================================================================
--- gcc/config/i386/avx512bf16intrin.h	(nonexistent)
+++ gcc/config/i386/avx512bf16intrin.h	(working copy)
@@ -0,0 +1,118 @@
+/* Copyright (C) 2019 Free Software Foundation, Inc.
+
+   This file is part of GCC.
+
+   GCC is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3, or (at your option)
+   any later version.
+
+   GCC is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   Under Section 7 of GPL version 3, you are granted additional
+   permissions described in the GCC Runtime Library Exception, version
+   3.1, as published by the Free Software Foundation.
+
+   You should have received a copy of the GNU General Public License and
+   a copy of the GCC Runtime Library Exception along with this program;
+   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#ifndef _IMMINTRIN_H_INCLUDED
+#error "Never use <avx512bf16intrin.h> directly; include <immintrin.h> instead."
+#endif
+
+#ifndef _AVX512BF16INTRIN_H_INCLUDED
+#define _AVX512BF16INTRIN_H_INCLUDED
+
+#ifndef __AVX512BF16__
+#pragma GCC push_options
+#pragma GCC target("avx512bf16")
+#define __DISABLE_AVX512BF16__
+#endif /* __AVX512BF16__ */
+
+/* Internal data types for implementing the intrinsics.  */
+typedef short __v32bh __attribute__ ((__vector_size__ (64)));
+
+/* The Intel API is flexible enough that we must allow aliasing with other
+   vector types, and their scalar components.  */
+typedef short __m512bh __attribute__ ((__vector_size__ (64), __may_alias__));
+
+/* vcvtne2ps2bf16 */
+
+extern __inline __m512bh
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cvtne2ps_pbh (__m512 __A, __m512 __B)
+{
+  return (__m512bh)__builtin_ia32_cvtne2ps2bf16_v32hi(__A, __B);
+}
+
+extern __inline __m512bh
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cvtne2ps_pbh (__m512bh __A, __mmask32 __B, __m512 __C, __m512 __D)
+{
+  return (__m512bh)__builtin_ia32_cvtne2ps2bf16_v32hi_mask(__C, __D, __A, __B);
+}
+
+extern __inline __m512bh
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_cvtne2ps_pbh (__mmask32 __A, __m512 __B, __m512 __C)
+{
+  return (__m512bh)__builtin_ia32_cvtne2ps2bf16_v32hi_maskz(__B, __C, __A);
+}
+
+/* vcvtneps2bf16 */
+
+extern __inline __m256bh
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cvtneps_pbh (__m512 __A)
+{
+  return (__m256bh)__builtin_ia32_cvtneps2bf16_v16sf(__A);
+}
+
+extern __inline __m256bh
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cvtneps_pbh (__m256bh __A, __mmask16 __B, __m512 __C)
+{
+  return (__m256bh)__builtin_ia32_cvtneps2bf16_v16sf_mask(__C, __A, __B);
+}
+
+extern __inline __m256bh
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_cvtneps_pbh (__mmask16 __A, __m512 __B)
+{
+  return (__m256bh)__builtin_ia32_cvtneps2bf16_v16sf_maskz(__B, __A);
+}
+
+/* vdpbf16ps */
+
+extern __inline __m512
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_dpbf16_ps (__m512 __A, __m512bh __B, __m512bh __C)
+{
+  return (__m512)__builtin_ia32_dpbf16ps_v16sf(__A, __B, __C);
+}
+
+extern __inline __m512
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_dpbf16_ps (__m512 __A, __mmask16 __B, __m512bh __C, __m512bh __D)
+{
+  return (__m512)__builtin_ia32_dpbf16ps_v16sf_mask(__A, __C, __D, __B);
+}
+
+extern __inline __m512
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_dpbf16_ps (__mmask16 __A, __m512 __B, __m512bh __C, __m512bh __D)
+{
+  return (__m512)__builtin_ia32_dpbf16ps_v16sf_maskz(__B, __C, __D, __A);
+}
+
+#ifdef __DISABLE_AVX512BF16__
+#undef __DISABLE_AVX512BF16__
+#pragma GCC pop_options
+#endif /* __DISABLE_AVX512BF16__ */
+
+#endif /* _AVX512BF16INTRIN_H_INCLUDED */
Index: gcc/config/i386/avx512bf16vlintrin.h
===================================================================
--- gcc/config/i386/avx512bf16vlintrin.h	(nonexistent)
+++ gcc/config/i386/avx512bf16vlintrin.h	(working copy)
@@ -0,0 +1,183 @@
+/* Copyright (C) 2019 Free Software Foundation, Inc.
+
+   This file is part of GCC.
+
+   GCC is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3, or (at your option)
+   any later version.
+
+   GCC is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   Under Section 7 of GPL version 3, you are granted additional
+   permissions described in the GCC Runtime Library Exception, version
+   3.1, as published by the Free Software Foundation.
+
+   You should have received a copy of the GNU General Public License and
+   a copy of the GCC Runtime Library Exception along with this program;
+   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#ifndef _IMMINTRIN_H_INCLUDED
+#error "Never use <avx512bf16vlintrin.h> directly; include <immintrin.h> instead."
+#endif
+
+#ifndef _AVX512BF16VLINTRIN_H_INCLUDED
+#define _AVX512BF16VLINTRIN_H_INCLUDED
+
+#if !defined(__AVX512VL__) || !defined(__AVX512BF16__)
+#pragma GCC push_options
+#pragma GCC target("avx512bf16,avx512vl")
+#define __DISABLE_AVX512BF16VL__
+#endif /* __AVX512BF16__ */
+
+/* Internal data types for implementing the intrinsics.  */
+typedef short __v16bh __attribute__ ((__vector_size__ (32)));
+typedef short __v8bh __attribute__ ((__vector_size__ (16)));
+
+/* The Intel API is flexible enough that we must allow aliasing with other
+   vector types, and their scalar components.  */
+typedef short __m256bh __attribute__ ((__vector_size__ (32), __may_alias__));
+typedef short __m128bh __attribute__ ((__vector_size__ (16), __may_alias__));
+
+/* vcvtne2ps2bf16 */
+
+extern __inline __m256bh
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cvtne2ps_pbh (__m256 __A, __m256 __B)
+{
+  return (__m256bh)__builtin_ia32_cvtne2ps2bf16_v16hi(__A, __B);
+}
+
+extern __inline __m256bh
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cvtne2ps_pbh (__m256bh __A, __mmask16 __B, __m256 __C, __m256 __D)
+{
+  return (__m256bh)__builtin_ia32_cvtne2ps2bf16_v16hi_mask(__C, __D, __A, __B);
+}
+
+extern __inline __m256bh
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_cvtne2ps_pbh (__mmask16 __A, __m256 __B, __m256 __C)
+{
+  return (__m256bh)__builtin_ia32_cvtne2ps2bf16_v16hi_maskz(__B, __C, __A);
+}
+
+extern __inline __m128bh
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtne2ps_pbh (__m128 __A, __m128 __B)
+{
+  return (__m128bh)__builtin_ia32_cvtne2ps2bf16_v8hi(__A, __B);
+}
+
+extern __inline __m128bh
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_cvtne2ps_pbh (__m128bh __A, __mmask8 __B, __m128 __C, __m128 __D)
+{
+  return (__m128bh)__builtin_ia32_cvtne2ps2bf16_v8hi_mask(__C, __D, __A, __B);
+}
+
+extern __inline __m128bh
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_cvtne2ps_pbh (__mmask8 __A, __m128 __B, __m128 __C)
+{
+  return (__m128bh)__builtin_ia32_cvtne2ps2bf16_v8hi_maskz(__B, __C, __A);
+}
+
+/* vcvtneps2bf16 */
+
+extern __inline __m128bh
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cvtneps_pbh (__m256 __A)
+{
+  return (__m128bh)__builtin_ia32_cvtneps2bf16_v8sf(__A);
+}
+
+extern __inline __m128bh
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cvtneps_pbh (__m128bh __A, __mmask8 __B, __m256 __C)
+{
+  return (__m128bh)__builtin_ia32_cvtneps2bf16_v8sf_mask(__C, __A, __B);
+}
+
+extern __inline __m128bh
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_cvtneps_pbh (__mmask8 __A, __m256 __B)
+{
+  return (__m128bh)__builtin_ia32_cvtneps2bf16_v8sf_maskz(__B, __A);
+}
+
+extern __inline __m128bh
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtneps_pbh (__m128 __A)
+{
+  return (__m128bh)__builtin_ia32_cvtneps2bf16_v4sf(__A);
+}
+
+extern __inline __m128bh
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_cvtneps_pbh (__m128bh __A, __mmask8 __B, __m128 __C)
+{
+  return (__m128bh)__builtin_ia32_cvtneps2bf16_v4sf_mask(__C, __A, __B);
+}
+
+extern __inline __m128bh
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_cvtneps_pbh (__mmask8 __A, __m128 __B)
+{
+  return (__m128bh)__builtin_ia32_cvtneps2bf16_v4sf_maskz(__B, __A);
+}
+
+/* vdpbf16ps */
+
+extern __inline __m256
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_dpbf16_ps (__m256 __A, __m256bh __B, __m256bh __C)
+{
+  return (__m256)__builtin_ia32_dpbf16ps_v8sf(__A, __B, __C);
+}
+
+extern __inline __m256
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_dpbf16_ps (__m256 __A, __mmask8 __B, __m256bh __C, __m256bh __D)
+{
+  return (__m256)__builtin_ia32_dpbf16ps_v8sf_mask(__A, __C, __D, __B);
+}
+
+extern __inline __m256
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_dpbf16_ps (__mmask8 __A, __m256 __B, __m256bh __C, __m256bh __D)
+{
+  return (__m256)__builtin_ia32_dpbf16ps_v8sf_maskz(__B, __C, __D, __A);
+}
+
+extern __inline __m128
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_dpbf16_ps (__m128 __A, __m128bh __B, __m128bh __C)
+{
+  return (__m128)__builtin_ia32_dpbf16ps_v4sf(__A, __B, __C);
+}
+
+extern __inline __m128
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_dpbf16_ps (__m128 __A, __mmask8 __B, __m128bh __C, __m128bh __D)
+{
+  return (__m128)__builtin_ia32_dpbf16ps_v4sf_mask(__A, __C, __D, __B);
+}
+
+extern __inline __m128
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_dpbf16_ps (__mmask8 __A, __m128 __B, __m128bh __C, __m128bh __D)
+{
+  return (__m128)__builtin_ia32_dpbf16ps_v4sf_maskz(__B, __C, __D, __A);
+}
+
+#ifdef __DISABLE_AVX512BF16VL__
+#undef __DISABLE_AVX512BF16VL__
+#pragma GCC pop_options
+#endif /* __DISABLE_AVX512BF16VL__ */
+
+#endif /* _AVX512BF16VLINTRIN_H_INCLUDED */
Index: gcc/config/i386/cpuid.h
===================================================================
--- gcc/config/i386/cpuid.h	(revision 270934)
+++ gcc/config/i386/cpuid.h	(working copy)
@@ -21,6 +21,9 @@
  * <http://www.gnu.org/licenses/>.
  */
 
+/* %eax */
+#define bit_AVX512BF16	(1 << 5)
+
 /* %ecx */
 #define bit_SSE3	(1 << 0)
 #define bit_PCLMUL	(1 << 1)
Index: gcc/config/i386/driver-i386.c
===================================================================
--- gcc/config/i386/driver-i386.c	(revision 270934)
+++ gcc/config/i386/driver-i386.c	(working copy)
@@ -426,6 +426,7 @@
   unsigned int has_movdiri = 0, has_movdir64b = 0;
   unsigned int has_waitpkg = 0;
   unsigned int has_cldemote = 0;
+  unsigned int has_avx512bf16 = 0;
 
   unsigned int has_ptwrite = 0;
 
@@ -533,6 +534,9 @@
       has_shstk = ecx & bit_SHSTK;
       has_pconfig = edx & bit_PCONFIG;
       has_waitpkg = ecx & bit_WAITPKG;
+
+      __cpuid_count (7, 1, eax, ebx, ecx, edx);
+      has_avx512bf16 = eax & bit_AVX512BF16;
     }
 
   if (max_level >= 13)
@@ -1143,6 +1147,7 @@
       const char *waitpkg = has_waitpkg ? " -mwaitpkg" : " -mno-waitpkg";
       const char *cldemote = has_cldemote ? " -mcldemote" : " -mno-cldemote";
       const char *ptwrite = has_ptwrite ? " -mptwrite" : " -mno-ptwrite";
+      const char *avx512bf16 = has_avx512bf16 ? " -mavx512bf16" : " -mno-avx512bf16";
 
       options = concat (options, mmx, mmx3dnow, sse, sse2, sse3, ssse3,
 			sse4a, cx16, sahf, movbe, aes, sha, pclmul,
@@ -1157,7 +1162,7 @@
 			clwb, mwaitx, clzero, pku, rdpid, gfni, shstk,
 			avx512vbmi2, avx512vnni, vaes, vpclmulqdq,
 			avx512bitalg, movdiri, movdir64b, waitpkg, cldemote,
-			ptwrite,
+			ptwrite, avx512bf16,
 			NULL);
     }
 
Index: gcc/config/i386/i386-builtin-types.def
===================================================================
--- gcc/config/i386/i386-builtin-types.def	(revision 270934)
+++ gcc/config/i386/i386-builtin-types.def	(working copy)
@@ -1262,3 +1262,29 @@
 DEF_FUNCTION_TYPE (V4DI, V4DI)
 DEF_FUNCTION_TYPE (V4SI, V4SI, V4SI, UHI)
 DEF_FUNCTION_TYPE (V8SI, V8SI, V8SI, UHI)
+
+# BF16 builtins
+DEF_FUNCTION_TYPE (V32HI, V16SF, V16SF)
+DEF_FUNCTION_TYPE (V32HI, V16SF, V16SF, V32HI, USI)
+DEF_FUNCTION_TYPE (V32HI, V16SF, V16SF, USI)
+DEF_FUNCTION_TYPE (V16HI, V8SF, V8SF)
+DEF_FUNCTION_TYPE (V16HI, V8SF, V8SF, V16HI, UHI)
+DEF_FUNCTION_TYPE (V16HI, V8SF, V8SF, UHI)
+DEF_FUNCTION_TYPE (V8HI, V4SF, V4SF)
+DEF_FUNCTION_TYPE (V8HI, V4SF, V4SF, V8HI, UQI)
+DEF_FUNCTION_TYPE (V8HI, V4SF, V4SF, UQI)
+DEF_FUNCTION_TYPE (V16HI, V16SF)
+DEF_FUNCTION_TYPE (V16HI, V16SF, V16HI, UHI)
+DEF_FUNCTION_TYPE (V16HI, V16SF, UHI)
+DEF_FUNCTION_TYPE (V8HI, V8SF)
+DEF_FUNCTION_TYPE (V8HI, V8SF, V8HI, UQI)
+DEF_FUNCTION_TYPE (V8HI, V8SF, UQI)
+DEF_FUNCTION_TYPE (V8HI, V4SF)
+DEF_FUNCTION_TYPE (V8HI, V4SF, V8HI, UQI)
+DEF_FUNCTION_TYPE (V8HI, V4SF, UQI)
+DEF_FUNCTION_TYPE (V16SF, V16SF, V32HI, V32HI)
+DEF_FUNCTION_TYPE (V16SF, V16SF, V32HI, V32HI, UHI)
+DEF_FUNCTION_TYPE (V8SF, V8SF, V16HI, V16HI)
+DEF_FUNCTION_TYPE (V8SF, V8SF, V16HI, V16HI, UQI)
+DEF_FUNCTION_TYPE (V4SF, V4SF, V8HI, V8HI)
+DEF_FUNCTION_TYPE (V4SF, V4SF, V8HI, V8HI, UQI)
Index: gcc/config/i386/i386-builtin.def
===================================================================
--- gcc/config/i386/i386-builtin.def	(revision 270934)
+++ gcc/config/i386/i386-builtin.def	(working copy)
@@ -2703,6 +2703,35 @@
 BDESC (0, OPTION_MASK_ISA_VAES, CODE_FOR_vaesenclast_v32qi, "__builtin_ia32_vaesenclast_v32qi", IX86_BUILTIN_VAESENCLAST32, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI)
 BDESC (0, OPTION_MASK_ISA_VAES, CODE_FOR_vaesenclast_v64qi, "__builtin_ia32_vaesenclast_v64qi", IX86_BUILTIN_VAESENCLAST64, UNKNOWN, (int) V64QI_FTYPE_V64QI_V64QI)
 
+/* BF16 */
+BDESC (0, OPTION_MASK_ISA_AVX512BF16, CODE_FOR_avx512f_cvtne2ps2bf16_v32hi, "__builtin_ia32_cvtne2ps2bf16_v32hi", IX86_BUILTIN_CVTNE2PS2HI16_V32HI, UNKNOWN, (int) V32HI_FTYPE_V16SF_V16SF)
+BDESC (0, OPTION_MASK_ISA_AVX512BF16, CODE_FOR_avx512f_cvtne2ps2bf16_v32hi_mask, "__builtin_ia32_cvtne2ps2bf16_v32hi_mask", IX86_BUILTIN_CVTNE2PS2HI16_V32HI_MASK, UNKNOWN, (int) V32HI_FTYPE_V16SF_V16SF_V32HI_USI)
+BDESC (0, OPTION_MASK_ISA_AVX512BF16, CODE_FOR_avx512f_cvtne2ps2bf16_v32hi_maskz, "__builtin_ia32_cvtne2ps2bf16_v32hi_maskz", IX86_BUILTIN_CVTNE2PS2HI16_V32HI_MASKZ, UNKNOWN, (int) V32HI_FTYPE_V16SF_V16SF_USI)
+BDESC (0, OPTION_MASK_ISA_AVX512BF16, CODE_FOR_avx512f_cvtne2ps2bf16_v16hi, "__builtin_ia32_cvtne2ps2bf16_v16hi", IX86_BUILTIN_CVTNE2PS2HI16_V16HI, UNKNOWN, (int) V16HI_FTYPE_V8SF_V8SF)
+BDESC (0, OPTION_MASK_ISA_AVX512BF16, CODE_FOR_avx512f_cvtne2ps2bf16_v16hi_mask, "__builtin_ia32_cvtne2ps2bf16_v16hi_mask", IX86_BUILTIN_CVTNE2PS2HI16_V16HI_MASK, UNKNOWN, (int) V16HI_FTYPE_V8SF_V8SF_V16HI_UHI)
+BDESC (0, OPTION_MASK_ISA_AVX512BF16, CODE_FOR_avx512f_cvtne2ps2bf16_v16hi_maskz, "__builtin_ia32_cvtne2ps2bf16_v16hi_maskz", IX86_BUILTIN_CVTNE2PS2HI16_V16HI_MASKZ, UNKNOWN, (int) V16HI_FTYPE_V8SF_V8SF_UHI)
+BDESC (0, OPTION_MASK_ISA_AVX512BF16, CODE_FOR_avx512f_cvtne2ps2bf16_v8hi, "__builtin_ia32_cvtne2ps2bf16_v8hi", IX86_BUILTIN_CVTNE2PS2HI16_V8HI, UNKNOWN, (int) V8HI_FTYPE_V4SF_V4SF)
+BDESC (0, OPTION_MASK_ISA_AVX512BF16, CODE_FOR_avx512f_cvtne2ps2bf16_v8hi_mask, "__builtin_ia32_cvtne2ps2bf16_v8hi_mask", IX86_BUILTIN_CVTNE2PS2HI16_V8HI_MASK, UNKNOWN, (int) V8HI_FTYPE_V4SF_V4SF_V8HI_UQI)
+BDESC (0, OPTION_MASK_ISA_AVX512BF16, CODE_FOR_avx512f_cvtne2ps2bf16_v8hi_maskz, "__builtin_ia32_cvtne2ps2bf16_v8hi_maskz", IX86_BUILTIN_CVTNE2PS2HI16_V8HI_MASKZ, UNKNOWN, (int) V8HI_FTYPE_V4SF_V4SF_UQI)
+BDESC (0, OPTION_MASK_ISA_AVX512BF16, CODE_FOR_avx512f_cvtneps2bf16_v16sf, "__builtin_ia32_cvtneps2bf16_v16sf", IX86_BUILTIN_CVTNEPS2HI16_V16SF, UNKNOWN, (int) V16HI_FTYPE_V16SF)
+BDESC (0, OPTION_MASK_ISA_AVX512BF16, CODE_FOR_avx512f_cvtneps2bf16_v16sf_mask, "__builtin_ia32_cvtneps2bf16_v16sf_mask", IX86_BUILTIN_CVTNEPS2HI16_V16SF_MASK, UNKNOWN, (int) V16HI_FTYPE_V16SF_V16HI_UHI)
+BDESC (0, OPTION_MASK_ISA_AVX512BF16, CODE_FOR_avx512f_cvtneps2bf16_v16sf_maskz, "__builtin_ia32_cvtneps2bf16_v16sf_maskz", IX86_BUILTIN_CVTNE2PS2HI16_V16SF_MASKZ, UNKNOWN, (int) V16HI_FTYPE_V16SF_UHI)
+BDESC (0, OPTION_MASK_ISA_AVX512BF16, CODE_FOR_avx512f_cvtneps2bf16_v8sf, "__builtin_ia32_cvtneps2bf16_v8sf", IX86_BUILTIN_CVTNEPS2HI16_V8SF, UNKNOWN, (int) V8HI_FTYPE_V8SF)
+BDESC (0, OPTION_MASK_ISA_AVX512BF16, CODE_FOR_avx512f_cvtneps2bf16_v8sf_mask, "__builtin_ia32_cvtneps2bf16_v8sf_mask", IX86_BUILTIN_CVTNEPS2HI16_V8SF_MASK, UNKNOWN, (int) V8HI_FTYPE_V8SF_V8HI_UQI)
+BDESC (0, OPTION_MASK_ISA_AVX512BF16, CODE_FOR_avx512f_cvtneps2bf16_v8sf_maskz, "__builtin_ia32_cvtneps2bf16_v8sf_maskz", IX86_BUILTIN_CVTNE2PS2HI16_V8SF_MASKZ, UNKNOWN, (int) V8HI_FTYPE_V8SF_UQI)
+BDESC (0, OPTION_MASK_ISA_AVX512BF16, CODE_FOR_avx512f_cvtneps2bf16_v4sf, "__builtin_ia32_cvtneps2bf16_v4sf", IX86_BUILTIN_CVTNEPS2HI16_V4SF, UNKNOWN, (int) V8HI_FTYPE_V4SF)
+BDESC (0, OPTION_MASK_ISA_AVX512BF16, CODE_FOR_avx512f_cvtneps2bf16_v4sf_mask, "__builtin_ia32_cvtneps2bf16_v4sf_mask", IX86_BUILTIN_CVTNEPS2HI16_V4SF_MASK, UNKNOWN, (int) V8HI_FTYPE_V4SF_V8HI_UQI)
+BDESC (0, OPTION_MASK_ISA_AVX512BF16, CODE_FOR_avx512f_cvtneps2bf16_v4sf_maskz, "__builtin_ia32_cvtneps2bf16_v4sf_maskz", IX86_BUILTIN_CVTNE2PS2HI16_V4SF_MASKZ, UNKNOWN, (int) V8HI_FTYPE_V4SF_UQI)
+BDESC (0, OPTION_MASK_ISA_AVX512BF16, CODE_FOR_avx512f_dpbf16ps_v16sf, "__builtin_ia32_dpbf16ps_v16sf", IX86_BUILTIN_DPHI16PS_V16SF, UNKNOWN, (int) V16SF_FTYPE_V16SF_V32HI_V32HI)
+BDESC (0, OPTION_MASK_ISA_AVX512BF16, CODE_FOR_avx512f_dpbf16ps_v16sf_mask, "__builtin_ia32_dpbf16ps_v16sf_mask", IX86_BUILTIN_DPHI16PS_V16SF_MASK, UNKNOWN, (int) V16SF_FTYPE_V16SF_V32HI_V32HI_UHI)
+BDESC (0, OPTION_MASK_ISA_AVX512BF16, CODE_FOR_avx512f_dpbf16ps_v16sf_maskz, "__builtin_ia32_dpbf16ps_v16sf_maskz", IX86_BUILTIN_DPHI16PS_V16SF_MASKZ, UNKNOWN, (int) V16SF_FTYPE_V16SF_V32HI_V32HI_UHI)
+BDESC (0, OPTION_MASK_ISA_AVX512BF16, CODE_FOR_avx512f_dpbf16ps_v8sf, "__builtin_ia32_dpbf16ps_v8sf", IX86_BUILTIN_DPHI16PS_V8SF, UNKNOWN, (int) V8SF_FTYPE_V8SF_V16HI_V16HI)
+BDESC (0, OPTION_MASK_ISA_AVX512BF16, CODE_FOR_avx512f_dpbf16ps_v8sf_mask, "__builtin_ia32_dpbf16ps_v8sf_mask", IX86_BUILTIN_DPHI16PS_V8SF_MASK, UNKNOWN, (int) V8SF_FTYPE_V8SF_V16HI_V16HI_UQI)
+BDESC (0, OPTION_MASK_ISA_AVX512BF16, CODE_FOR_avx512f_dpbf16ps_v8sf_maskz, "__builtin_ia32_dpbf16ps_v8sf_maskz", IX86_BUILTIN_DPHI16PS_V8SF_MASKZ, UNKNOWN, (int) V8SF_FTYPE_V8SF_V16HI_V16HI_UQI)
+BDESC (0, OPTION_MASK_ISA_AVX512BF16, CODE_FOR_avx512f_dpbf16ps_v4sf, "__builtin_ia32_dpbf16ps_v4sf", IX86_BUILTIN_DPHI16PS_V4SF, UNKNOWN, (int) V4SF_FTYPE_V4SF_V8HI_V8HI)
+BDESC (0, OPTION_MASK_ISA_AVX512BF16, CODE_FOR_avx512f_dpbf16ps_v4sf_mask, "__builtin_ia32_dpbf16ps_v4sf_mask", IX86_BUILTIN_DPHI16PS_V4SF_MASK, UNKNOWN, (int) V4SF_FTYPE_V4SF_V8HI_V8HI_UQI)
+BDESC (0, OPTION_MASK_ISA_AVX512BF16, CODE_FOR_avx512f_dpbf16ps_v4sf_maskz, "__builtin_ia32_dpbf16ps_v4sf_maskz", IX86_BUILTIN_DPHI16PS_V4SF_MASKZ, UNKNOWN, (int) V4SF_FTYPE_V4SF_V8HI_V8HI_UQI)
+
 /* Builtins with rounding support.  */
 BDESC_END (ARGS, ROUND_ARGS)
 
Index: gcc/config/i386/i386-builtins.c
===================================================================
--- gcc/config/i386/i386-builtins.c	(revision 270934)
+++ gcc/config/i386/i386-builtins.c	(working copy)
@@ -1920,6 +1920,7 @@
   F_VPCLMULQDQ,
   F_AVX512VNNI,
   F_AVX512BITALG,
+  F_AVX512BF16,
   F_MAX
 };
 
@@ -2064,7 +2065,8 @@
   {"gfni",	F_GFNI,	P_ZERO},
   {"vpclmulqdq", F_VPCLMULQDQ, P_ZERO},
   {"avx512vnni", F_AVX512VNNI, P_ZERO},
-  {"avx512bitalg", F_AVX512BITALG, P_ZERO}
+  {"avx512bitalg", F_AVX512BITALG, P_ZERO},
+  {"avx512bf16", F_AVX512BF16, P_ZERO}
 };
 
 /* This parses the attribute arguments to target in DECL and determines
Index: gcc/config/i386/i386-c.c
===================================================================
--- gcc/config/i386/i386-c.c	(revision 270934)
+++ gcc/config/i386/i386-c.c	(working copy)
@@ -548,6 +548,8 @@
     def_or_undef (parse_in, "__CLDEMOTE__");
   if (isa_flag2 & OPTION_MASK_ISA_PTWRITE)
     def_or_undef (parse_in, "__PTWRITE__");
+  if (isa_flag2 & OPTION_MASK_ISA_AVX512BF16)
+    def_or_undef (parse_in, "__AVX512BF16__");
   if (TARGET_IAMCU)
     {
       def_or_undef (parse_in, "__iamcu");
Index: gcc/config/i386/i386-expand.c
===================================================================
--- gcc/config/i386/i386-expand.c	(revision 270934)
+++ gcc/config/i386/i386-expand.c	(working copy)
@@ -8968,6 +8968,9 @@
     case V8DF_FTYPE_V2DF:
     case V8DF_FTYPE_V8DF:
     case V4DI_FTYPE_V4DI:
+    case V16HI_FTYPE_V16SF:
+    case V8HI_FTYPE_V8SF:
+    case V8HI_FTYPE_V4SF:
       nargs = 1;
       break;
     case V4SF_FTYPE_V4SF_VEC_MERGE:
@@ -9092,6 +9095,12 @@
     case USI_FTYPE_USI_USI:
     case UDI_FTYPE_UDI_UDI:
     case V16SI_FTYPE_V8DF_V8DF:
+    case V32HI_FTYPE_V16SF_V16SF:
+    case V16HI_FTYPE_V8SF_V8SF:
+    case V8HI_FTYPE_V4SF_V4SF:
+    case V16HI_FTYPE_V16SF_UHI:
+    case V8HI_FTYPE_V8SF_UQI:
+    case V8HI_FTYPE_V4SF_UQI:
       nargs = 2;
       break;
     case V2DI_FTYPE_V2DI_INT_CONVERT:
@@ -9274,6 +9283,15 @@
     case V16HI_FTYPE_V16HI_V16HI_V16HI:
     case V8SI_FTYPE_V8SI_V8SI_V8SI:
     case V8HI_FTYPE_V8HI_V8HI_V8HI:
+    case V32HI_FTYPE_V16SF_V16SF_USI:
+    case V16HI_FTYPE_V8SF_V8SF_UHI:
+    case V8HI_FTYPE_V4SF_V4SF_UQI:
+    case V16HI_FTYPE_V16SF_V16HI_UHI:
+    case V8HI_FTYPE_V8SF_V8HI_UQI:
+    case V8HI_FTYPE_V4SF_V8HI_UQI:
+    case V16SF_FTYPE_V16SF_V32HI_V32HI:
+    case V8SF_FTYPE_V8SF_V16HI_V16HI:
+    case V4SF_FTYPE_V4SF_V8HI_V8HI:
       nargs = 3;
       break;
     case V32QI_FTYPE_V32QI_V32QI_INT:
@@ -9413,6 +9431,9 @@
     case V16HI_FTYPE_V32QI_V32QI_V16HI_UHI:
     case V8SI_FTYPE_V16HI_V16HI_V8SI_UQI:
     case V4SI_FTYPE_V8HI_V8HI_V4SI_UQI:
+    case V32HI_FTYPE_V16SF_V16SF_V32HI_USI:
+    case V16HI_FTYPE_V8SF_V8SF_V16HI_UHI:
+    case V8HI_FTYPE_V4SF_V4SF_V8HI_UQI:
       nargs = 4;
       break;
     case V2DF_FTYPE_V2DF_V2DF_V2DI_INT:
@@ -9456,6 +9477,9 @@
       break;
     case UCHAR_FTYPE_UCHAR_UINT_UINT_PUNSIGNED:
     case UCHAR_FTYPE_UCHAR_ULONGLONG_ULONGLONG_PULONGLONG:
+    case V16SF_FTYPE_V16SF_V32HI_V32HI_UHI:
+    case V8SF_FTYPE_V8SF_V16HI_V16HI_UQI:
+    case V4SF_FTYPE_V4SF_V8HI_V8HI_UQI:
       nargs = 4;
       break;
     case UQI_FTYPE_V8DI_V8DI_INT_UQI:
Index: gcc/config/i386/i386-options.c
===================================================================
--- gcc/config/i386/i386-options.c	(revision 270934)
+++ gcc/config/i386/i386-options.c	(working copy)
@@ -209,7 +209,8 @@
     { "-mmovdir64b",	OPTION_MASK_ISA_MOVDIR64B },
     { "-mwaitpkg",	OPTION_MASK_ISA_WAITPKG },
     { "-mcldemote",	OPTION_MASK_ISA_CLDEMOTE },
-    { "-mptwrite",	OPTION_MASK_ISA_PTWRITE }
+    { "-mptwrite",	OPTION_MASK_ISA_PTWRITE },
+    { "-mavx512bf16",	OPTION_MASK_ISA_AVX512BF16 }
   };
   static struct ix86_target_opts isa_opts[] =
   {
@@ -919,6 +920,7 @@
     IX86_ATTR_ISA ("waitpkg", OPT_mwaitpkg),
     IX86_ATTR_ISA ("cldemote", OPT_mcldemote),
     IX86_ATTR_ISA ("ptwrite",   OPT_mptwrite),
+    IX86_ATTR_ISA ("avx512bf16",   OPT_mavx512bf16),
 
     /* enum options */
     IX86_ATTR_ENUM ("fpmath=",	OPT_mfpmath_),
@@ -2034,6 +2036,10 @@
 	    && !(opts->x_ix86_isa_flags_explicit
 		 & OPTION_MASK_ISA_AVX512VPOPCNTDQ))
 	  opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX512VPOPCNTDQ;
+	if (((processor_alias_table[i].flags & PTA_AVX512BF16) != 0)
+	    && !(opts->x_ix86_isa_flags2_explicit
+		 & OPTION_MASK_ISA_AVX512BF16))
+	  opts->x_ix86_isa_flags2 |= OPTION_MASK_ISA_AVX512BF16;
 	if (((processor_alias_table[i].flags & PTA_SGX) != 0)
 	    && !(opts->x_ix86_isa_flags2_explicit & OPTION_MASK_ISA_SGX))
 	  opts->x_ix86_isa_flags2 |= OPTION_MASK_ISA_SGX;
Index: gcc/config/i386/i386.h
===================================================================
--- gcc/config/i386/i386.h	(revision 270934)
+++ gcc/config/i386/i386.h	(working copy)
@@ -193,6 +193,8 @@
 #define TARGET_CLDEMOTE_P(x) TARGET_ISA_CLDEMOTE_P(x)
 #define TARGET_PTWRITE	TARGET_ISA_PTWRITE
 #define TARGET_PTWRITE_P(x)	TARGET_ISA_PTWRITE_P(x)
+#define TARGET_AVX512BF16	TARGET_ISA_AVX512BF16
+#define TARGET_AVX512BF16_P(x)	TARGET_ISA_AVX512BF16_P(x)
 
 #define TARGET_LP64	TARGET_ABI_64
 #define TARGET_LP64_P(x)	TARGET_ABI_64_P(x)
@@ -2355,6 +2357,7 @@
 const wide_int_bitmask PTA_WBNOINVD (0, HOST_WIDE_INT_1U << 8);
 const wide_int_bitmask PTA_WAITPKG (0, HOST_WIDE_INT_1U << 9);
 const wide_int_bitmask PTA_PTWRITE (0, HOST_WIDE_INT_1U << 10);
+const wide_int_bitmask PTA_AVX512BF16 (0, HOST_WIDE_INT_1U << 11);
 
 const wide_int_bitmask PTA_CORE2 = PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2
   | PTA_SSE3 | PTA_SSSE3 | PTA_CX16 | PTA_FXSR;
Index: gcc/config/i386/i386.opt
===================================================================
--- gcc/config/i386/i386.opt	(revision 270934)
+++ gcc/config/i386/i386.opt	(working copy)
@@ -1101,3 +1101,8 @@
 mrecord-return
 Target Report Var(ix86_flag_record_return) Init(0)
 Generate a __return_loc section pointing to all return instrumentation code.
+
+mavx512bf16
+Target Report Mask(ISA_AVX512BF16) Var(ix86_isa_flags2) Save
+Support MMX, SSE, SSE2, SSE3, SSSE3, SSE4.1, SSE4.2, AVX, AVX2, AVX512F and
+AVX512BF16 built-in functions and code generation.
Index: gcc/config/i386/immintrin.h
===================================================================
--- gcc/config/i386/immintrin.h	(revision 270934)
+++ gcc/config/i386/immintrin.h	(working copy)
@@ -130,6 +130,10 @@
 
 #include <cldemoteintrin.h>
 
+#include <avx512bf16vlintrin.h>
+
+#include <avx512bf16intrin.h>
+
 #include <rdseedintrin.h>
 
 #include <prfchwintrin.h>
Index: gcc/config/i386/sse.md
===================================================================
--- gcc/config/i386/sse.md	(revision 270934)
+++ gcc/config/i386/sse.md	(working copy)
@@ -187,6 +187,11 @@
 
   ;; For AVX512BITALG support
   UNSPEC_VPSHUFBIT
+
+  ;; For AVX512BF16 support
+  UNSPEC_VCVTNE2PS2BF16
+  UNSPEC_VCVTNEPS2BF16
+  UNSPEC_VDPBF16PS
 ])
 
 (define_c_enum "unspecv" [
@@ -726,6 +731,15 @@
    (V16SF "hi") (V8SF  "qi") (V4SF  "qi")
    (V8DF  "qi") (V4DF  "qi") (V2DF  "qi")])
 
+;; Mapping of vector modes to corresponding mask half size
+(define_mode_attr avx512fmaskhalfmode
+  [(V64QI "SI") (V32QI "HI") (V16QI "QI")
+   (V32HI "HI") (V16HI "QI") (V8HI  "QI") (V4HI "QI")
+   (V16SI "QI") (V8SI  "QI") (V4SI  "QI")
+   (V8DI  "QI") (V4DI  "QI") (V2DI  "QI")
+   (V16SF "QI") (V8SF  "QI") (V4SF  "QI")
+   (V8DF  "QI") (V4DF  "QI") (V2DF  "QI")])
+
 ;; Mapping of vector float modes to an integer mode of the same size
 (define_mode_attr sseintvecmode
   [(V16SF "V16SI") (V8DF  "V8DI")
@@ -22184,3 +22198,90 @@
   "vpshufbitqmb\t{%2, %1, %0<mask_scalar_merge_operand3>|%0<mask_scalar_merge_operand3>, %1, %2}"
   [(set_attr "prefix" "evex")
    (set_attr "mode" "<sseinsnmode>")])
+
+(define_mode_iterator BF16 [V32HI (V16HI "TARGET_AVX512VL") (V8HI "TARGET_AVX512VL")])
+;; Converting from BF to SF
+(define_mode_attr bf16_cvt_2sf
+  [(V32HI  "V16SF") (V16HI  "V8SF") (V8HI  "V4SF")])
+;; Converting from SF to BF
+(define_mode_attr sf_cvt_bf16
+  [(V4SF  "V8HI") (V8SF  "V8HI") (V16SF  "V16HI")])
+;; Mapping from BF to SF
+(define_mode_attr sf_bf16
+  [(V4SF  "V8HI") (V8SF  "V16HI") (V16SF  "V32HI")])
+
+(define_expand "avx512f_cvtne2ps2bf16_<mode>_maskz"
+  [(match_operand:BF16 0 "register_operand")
+   (match_operand:<bf16_cvt_2sf> 1 "register_operand")
+   (match_operand:<bf16_cvt_2sf> 2 "register_operand")
+   (match_operand:<avx512fmaskmode> 3 "register_operand")]
+  "TARGET_AVX512BF16"
+{
+  emit_insn (gen_avx512f_cvtne2ps2bf16_<mode>_mask(operands[0], operands[1],
+    operands[2], CONST0_RTX(<MODE>mode), operands[3]));
+  DONE;
+})
+
+(define_insn "avx512f_cvtne2ps2bf16_<mode><mask_name>"
+  [(set (match_operand:BF16 0 "register_operand" "=v")
+	(unspec:BF16
+	  [(match_operand:<bf16_cvt_2sf> 1 "register_operand" "v")
+	   (match_operand:<bf16_cvt_2sf> 2 "register_operand" "v")]
+        UNSPEC_VCVTNE2PS2BF16))]
+  "TARGET_AVX512BF16"
+  "vcvtne2ps2bf16\t{%2, %1, %0<mask_operand3>|%0<mask_operand3>, %1, %2}")
+
+(define_expand "avx512f_cvtneps2bf16_<mode>_maskz"
+  [(match_operand:<sf_cvt_bf16> 0 "register_operand")
+   (match_operand:VF1_AVX512VL 1 "register_operand")
+   (match_operand:<avx512fmaskmode> 2 "register_operand")]
+  "TARGET_AVX512BF16"
+{
+  emit_insn (gen_avx512f_cvtneps2bf16_<mode>_mask(operands[0], operands[1],
+    CONST0_RTX(<sf_cvt_bf16>mode), operands[2]));
+  DONE;
+})
+
+(define_insn "avx512f_cvtneps2bf16_<mode><mask_name>"
+  [(set (match_operand:<sf_cvt_bf16> 0 "register_operand" "=v")
+	(unspec:<sf_cvt_bf16>
+	  [(match_operand:VF1_AVX512VL 1 "register_operand" "v")]
+        UNSPEC_VCVTNEPS2BF16))]
+  "TARGET_AVX512BF16"
+  "vcvtneps2bf16\t{%1, %0<mask_operand2>|%0<mask_operand2>, %1}")
+
+(define_expand "avx512f_dpbf16ps_<mode>_maskz"
+  [(match_operand:VF1_AVX512VL 0 "register_operand")
+   (match_operand:VF1_AVX512VL 1 "register_operand")
+   (match_operand:<sf_bf16> 2 "register_operand")
+   (match_operand:<sf_bf16> 3 "register_operand")
+   (match_operand:<avx512fmaskhalfmode> 4 "register_operand")]
+  "TARGET_AVX512BF16"
+{
+  emit_insn (gen_avx512f_dpbf16ps_<mode>_maskz_1(operands[0], operands[1],
+    operands[2], operands[3], CONST0_RTX(<MODE>mode), operands[4]));
+  DONE;
+})
+
+(define_insn "avx512f_dpbf16ps_<mode><maskz_half_name>"
+  [(set (match_operand:VF1_AVX512VL 0 "register_operand" "=v")
+	(unspec:VF1_AVX512VL
+	  [(match_operand:VF1_AVX512VL 1 "register_operand" "0")
+	   (match_operand:<sf_bf16> 2 "register_operand" "v")
+	   (match_operand:<sf_bf16> 3 "register_operand" "v")]
+        UNSPEC_VDPBF16PS))]
+  "TARGET_AVX512BF16"
+  "vdpbf16ps\t{%3, %2, %0<maskz_half_operand4>|%0<maskz_half_operand4>, %2, %3}")
+
+(define_insn "avx512f_dpbf16ps_<mode>_mask"
+  [(set (match_operand:VF1_AVX512VL 0 "register_operand" "=v")
+	(vec_merge:VF1_AVX512VL
+	  (unspec:VF1_AVX512VL
+	    [(match_operand:VF1_AVX512VL 1 "register_operand" "0")
+	     (match_operand:<sf_bf16> 2 "register_operand" "v")
+	     (match_operand:<sf_bf16> 3 "register_operand" "v")]
+             UNSPEC_VDPBF16PS)
+          (match_dup 1)
+          (match_operand:<avx512fmaskhalfmode> 4 "register_operand" "Yk")))]
+  "TARGET_AVX512BF16"
+  "vdpbf16ps\t{%3, %2, %0%{%4%}|%0%{%4%}, %2, %3}")
Index: gcc/config/i386/subst.md
===================================================================
--- gcc/config/i386/subst.md	(revision 270934)
+++ gcc/config/i386/subst.md	(working copy)
@@ -313,3 +313,16 @@
 		(const_int 1))
 	     (match_operand:SI 3 "const48_operand")]
 		UNSPEC_EMBEDDED_ROUNDING))])
+
+(define_subst_attr "maskz_half_name" "maskz_half" "" "_maskz_1")
+(define_subst_attr "maskz_half_operand4" "maskz_half" "" "%{%5%}%N4")
+
+(define_subst "maskz_half"
+  [(set (match_operand:SUBST_V 0)
+        (match_operand:SUBST_V 1))]
+  ""
+  [(set (match_dup 0)
+        (vec_merge:SUBST_V
+	  (match_dup 1)
+	  (match_operand:SUBST_V 2 "const0_operand" "C")
+	  (match_operand:<avx512fmaskhalfmode> 3 "register_operand" "Yk")))])
Index: gcc/config.gcc
===================================================================
--- gcc/config.gcc	(revision 270934)
+++ gcc/config.gcc	(working copy)
@@ -407,7 +407,7 @@
 		       avx512vnnivlintrin.h vaesintrin.h vpclmulqdqintrin.h
 		       avx512vpopcntdqvlintrin.h avx512bitalgintrin.h
 		       pconfigintrin.h wbnoinvdintrin.h movdirintrin.h
-		       waitpkgintrin.h cldemoteintrin.h"
+		       waitpkgintrin.h cldemoteintrin.h avx512bf16vlintrin.h avx512bf16intrin.h"
 	;;
 x86_64-*-*)
 	cpu_type=i386
@@ -439,7 +439,7 @@
 		       avx512vnnivlintrin.h vaesintrin.h vpclmulqdqintrin.h
 		       avx512vpopcntdqvlintrin.h avx512bitalgintrin.h
 		       pconfigintrin.h wbnoinvdintrin.h movdirintrin.h
-		       waitpkgintrin.h cldemoteintrin.h"
+		       waitpkgintrin.h cldemoteintrin.h avx512bf16vlintrin.h avx512bf16intrin.h"
 	;;
 ia64-*-*)
 	extra_headers=ia64intrin.h
Index: gcc/doc/invoke.texi
===================================================================
--- gcc/doc/invoke.texi	(revision 270934)
+++ gcc/doc/invoke.texi	(working copy)
@@ -1274,7 +1274,7 @@
 -msse4a  -m3dnow  -m3dnowa  -mpopcnt  -mabm  -mbmi  -mtbm  -mfma4  -mxop @gol
 -madx  -mlzcnt  -mbmi2  -mfxsr  -mxsave  -mxsaveopt  -mrtm  -mhle  -mlwp @gol
 -mmwaitx  -mclzero  -mpku  -mthreads  -mgfni  -mvaes  -mwaitpkg @gol
--mshstk -mmanual-endbr -mforce-indirect-call  -mavx512vbmi2 @gol
+-mshstk -mmanual-endbr -mforce-indirect-call -mavx512vbmi2 -mavx512bf16 @gol
 -mvpclmulqdq  -mavx512bitalg  -mmovdiri  -mmovdir64b  -mavx512vpopcntdq @gol
 -mavx5124fmaps  -mavx512vnni  -mavx5124vnniw  -mprfchw  -mrdpid @gol
 -mrdseed  -msgx @gol
@@ -28041,6 +28041,9 @@
 @itemx -mavx512vbmi2
 @opindex mavx512vbmi2
 @need 200
+@itemx -mavx512bf16
+@opindex mavx512bf16
+@need 200
 @itemx -mgfni
 @opindex mgfni
 @need 200
@@ -28083,7 +28086,7 @@
 WBNOINVD, FMA4, PREFETCHW, RDPID, PREFETCHWT1, RDSEED, SGX, XOP, LWP,
 3DNow!@:, enhanced 3DNow!@:, POPCNT, ABM, ADX, BMI, BMI2, LZCNT, FXSR, XSAVE,
 XSAVEOPT, XSAVEC, XSAVES, RTM, HLE, TBM, MWAITX, CLZERO, PKU, AVX512VBMI2,
-GFNI, VAES, WAITPKG, VPCLMULQDQ, AVX512BITALG, MOVDIRI, MOVDIR64B,
+GFNI, VAES, WAITPKG, VPCLMULQDQ, AVX512BITALG, MOVDIRI, MOVDIR64B, AVX512BF16
 AVX512VPOPCNTDQ, AVX5124FMAPS, AVX512VNNI, AVX5124VNNIW, or CLDEMOTE
 extended instruction sets.  Each has a corresponding @option{-mno-} option to
 disable use of these instructions.
Index: gcc/testsuite/g++.dg/other/i386-2.C
===================================================================
--- gcc/testsuite/g++.dg/other/i386-2.C	(revision 270934)
+++ gcc/testsuite/g++.dg/other/i386-2.C	(working copy)
@@ -1,5 +1,5 @@
 /* { dg-do compile { target i?86-*-* x86_64-*-* } } */
-/* { dg-options "-O -pedantic-errors -march=k8 -msse4a -m3dnow -mavx -mavx2 -mfma4 -mxop -maes -mpclmul -mpopcnt -mabm -mlzcnt -mbmi -mbmi2 -mtbm -mlwp -mfsgsbase -mrdrnd -mf16c -mfma -mrtm -mrdseed -mprfchw -madx -mfxsr -mxsaveopt -mavx512f -mavx512er -mavx512cd -mavx512pf -msha -mprefetchwt1 -mxsavec -mxsaves -mclflushopt  -mavx512dq -mavx512bw -mavx512vl -mavx512ifma -mavx512vbmi -mavx5124fmaps -mavx5124vnniw -mavx512vpopcntdq -mclwb -mmwaitx -mclzero -mpku -msgx -mrdpid -mgfni -mavx512bitalg -mpconfig -mwbnoinvd" } */
+/* { dg-options "-O -pedantic-errors -march=k8 -msse4a -m3dnow -mavx -mavx2 -mfma4 -mxop -maes -mpclmul -mpopcnt -mabm -mlzcnt -mbmi -mbmi2 -mtbm -mlwp -mfsgsbase -mrdrnd -mf16c -mfma -mrtm -mrdseed -mprfchw -madx -mfxsr -mxsaveopt -mavx512f -mavx512er -mavx512cd -mavx512pf -msha -mprefetchwt1 -mxsavec -mxsaves -mclflushopt  -mavx512dq -mavx512bw -mavx512vl -mavx512ifma -mavx512vbmi -mavx5124fmaps -mavx5124vnniw -mavx512vpopcntdq -mclwb -mmwaitx -mclzero -mpku -msgx -mrdpid -mgfni -mavx512bitalg -mpconfig -mwbnoinvd -mavx512bf16" } */
 
 /* Test that {,x,e,p,t,s,w,a,b,i}mmintrin.h, mm3dnow.h, fma4intrin.h,
    xopintrin.h, abmintrin.h, bmiintrin.h, tbmintrin.h, lwpintrin.h,
Index: gcc/testsuite/g++.dg/other/i386-3.C
===================================================================
--- gcc/testsuite/g++.dg/other/i386-3.C	(revision 270934)
+++ gcc/testsuite/g++.dg/other/i386-3.C	(working copy)
@@ -1,5 +1,5 @@
 /* { dg-do compile { target i?86-*-* x86_64-*-* } } */
-/* { dg-options "-O -fkeep-inline-functions -march=k8 -msse4a -m3dnow -mavx -mavx2 -mfma4 -mxop -maes -mpclmul -mpopcnt -mabm -mlzcnt -mbmi -mbmi2 -mtbm -mlwp -mfsgsbase -mrdrnd -mf16c -mfma -mrtm -mrdseed -mprfchw -madx -mfxsr -mxsaveopt -mavx512f -mavx512er -mavx512cd -mavx512pf -msha -mprefetchwt1 -mxsavec -mxsaves -mclflushopt -mavx512dq -mavx512bw -mavx512vl -mavx512ifma -mavx512vbmi -mavx5124fmaps -mavx5124vnniw -mavx512vpopcntdq -mclwb -mmwaitx -mclzero -mpku -msgx -mrdpid -mgfni -mavx512bitalg -mpconfig -mwbnoinvd" } */
+/* { dg-options "-O -fkeep-inline-functions -march=k8 -msse4a -m3dnow -mavx -mavx2 -mfma4 -mxop -maes -mpclmul -mpopcnt -mabm -mlzcnt -mbmi -mbmi2 -mtbm -mlwp -mfsgsbase -mrdrnd -mf16c -mfma -mrtm -mrdseed -mprfchw -madx -mfxsr -mxsaveopt -mavx512f -mavx512er -mavx512cd -mavx512pf -msha -mprefetchwt1 -mxsavec -mxsaves -mclflushopt -mavx512dq -mavx512bw -mavx512vl -mavx512ifma -mavx512vbmi -mavx5124fmaps -mavx5124vnniw -mavx512vpopcntdq -mclwb -mmwaitx -mclzero -mpku -msgx -mrdpid -mgfni -mavx512bitalg -mpconfig -mwbnoinvd -mavx512bf16" } */
 
 /* Test that {,x,e,p,t,s,w,a,b,i}mmintrin.h, mm3dnow.h, fma4intrin.h,
    xopintrin.h, abmintrin.h, bmiintrin.h, tbmintrin.h, lwpintrin.h,
Index: gcc/testsuite/gcc.target/i386/avx-1.c
===================================================================
--- gcc/testsuite/gcc.target/i386/avx-1.c	(revision 270934)
+++ gcc/testsuite/gcc.target/i386/avx-1.c	(working copy)
@@ -1,5 +1,5 @@
 /* { dg-do compile } */
-/* { dg-options "-O2 -Werror-implicit-function-declaration -march=k8 -m3dnow -mavx -mavx2 -maes -mpclmul -mgfni -mavx512bw" } */
+/* { dg-options "-O2 -Werror-implicit-function-declaration -march=k8 -m3dnow -mavx -mavx2 -maes -mpclmul -mgfni -mavx512bw -mavx512vl -mavx512bf16" } */
 /* { dg-add-options bind_pic_locally } */
 
 #include <mm_malloc.h>
Index: gcc/testsuite/gcc.target/i386/avx-2.c
===================================================================
--- gcc/testsuite/gcc.target/i386/avx-2.c	(revision 270934)
+++ gcc/testsuite/gcc.target/i386/avx-2.c	(working copy)
@@ -1,5 +1,5 @@
 /* { dg-do compile } */
-/* { dg-options "-O0 -Werror-implicit-function-declaration -march=k8 -m3dnow -mavx -mavx2 -msse4a -maes -mpclmul -mavx512bw" } */
+/* { dg-options "-O0 -Werror-implicit-function-declaration -march=k8 -m3dnow -mavx -mavx2 -msse4a -maes -mpclmul -mavx512bw -mavx512vl -mavx512bf16" } */
 /* { dg-add-options bind_pic_locally } */
 
 #include <mm_malloc.h>
Index: gcc/testsuite/gcc.target/i386/avx512bf16-vcvtne2ps2bf16-1.c
===================================================================
--- gcc/testsuite/gcc.target/i386/avx512bf16-vcvtne2ps2bf16-1.c	(nonexistent)
+++ gcc/testsuite/gcc.target/i386/avx512bf16-vcvtne2ps2bf16-1.c	(working copy)
@@ -0,0 +1,19 @@
+/* { dg-do compile } */
+/* { dg-options "-mavx512bf16 -O2" } */
+/* { dg-final { scan-assembler-times "vcvtne2ps2bf16\[ \\t\]+\[^\{\n\]*%zmm\[0-9\]+\[^\n\r]*%zmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */
+/* { dg-final { scan-assembler-times "vcvtne2ps2bf16\[ \\t\]+\[^\{\n\]*%zmm\[0-9\]+\[^\n\r]*%zmm\[0-9\]+\{%k\[0-9\]\}\[^\n\r]*(?:\n|\[ \\t\]+#)" 2 } } */
+/* { dg-final { scan-assembler-times "vcvtne2ps2bf16\[ \\t\]+\[^\{\n\]*%zmm\[0-9\]+\[^\n\r]*%zmm\[0-9\]+\{%k\[0-9\]\}\{z\}\[^\n\r]*(?:\n|\[ \\t\]+#)" 1 } } */
+
+#include <immintrin.h>
+
+volatile __m512bh res;
+volatile __m512 x1, x2;
+volatile __mmask32 m32;
+
+void extern
+avx512bf16_test (void)
+{
+  res = _mm512_cvtne2ps_pbh (x1, x2);
+  res = _mm512_mask_cvtne2ps_pbh (res, m32, x1, x2);
+  res = _mm512_maskz_cvtne2ps_pbh (m32, x1, x2);
+}
Index: gcc/testsuite/gcc.target/i386/avx512bf16-vcvtneps2bf16-1.c
===================================================================
--- gcc/testsuite/gcc.target/i386/avx512bf16-vcvtneps2bf16-1.c	(nonexistent)
+++ gcc/testsuite/gcc.target/i386/avx512bf16-vcvtneps2bf16-1.c	(working copy)
@@ -0,0 +1,19 @@
+/* { dg-do compile } */
+/* { dg-options "-mavx512bf16 -O2" } */
+/* { dg-final { scan-assembler-times "vcvtneps2bf16\[ \\t\]+\[^\{\n\]*%zmm\[0-9\]+\[^\n\r]*%ymm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */
+/* { dg-final { scan-assembler-times "vcvtneps2bf16\[ \\t\]+\[^\{\n\]*%zmm\[0-9\]+\[^\n\r]*%ymm\[0-9\]+\{%k\[0-9\]\}\[^\n\r]*(?:\n|\[ \\t\]+#)" 2 } } */
+/* { dg-final { scan-assembler-times "vcvtneps2bf16\[ \\t\]+\[^\{\n\]*%zmm\[0-9\]+\[^\n\r]*%ymm\[0-9\]+\{%k\[0-9\]\}\{z\}\[^\n\r]*(?:\n|\[ \\t\]+#)" 1 } } */
+
+#include <immintrin.h>
+
+volatile __m256bh res;
+volatile __m512 x1;
+volatile __mmask16 m16;
+
+void extern
+avx512bf16_test (void)
+{
+  res = _mm512_cvtneps_pbh (x1);
+  res = _mm512_mask_cvtneps_pbh (res, m16, x1);
+  res = _mm512_maskz_cvtneps_pbh (m16, x1);
+}
Index: gcc/testsuite/gcc.target/i386/avx512bf16-vdpbf16ps-1.c
===================================================================
--- gcc/testsuite/gcc.target/i386/avx512bf16-vdpbf16ps-1.c	(nonexistent)
+++ gcc/testsuite/gcc.target/i386/avx512bf16-vdpbf16ps-1.c	(working copy)
@@ -0,0 +1,19 @@
+/* { dg-do compile } */
+/* { dg-options "-mavx512bf16 -O2" } */
+/* { dg-final { scan-assembler-times "vdpbf16ps\[ \\t\]+\[^\{\n\]*%zmm\[0-9\]+\[^\n\r]*%zmm\[0-9\]+\[^\{\n\]*%zmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */
+/* { dg-final { scan-assembler-times "vdpbf16ps\[ \\t\]+\[^\{\n\]*%zmm\[0-9\]+\[^\{\n\]*%zmm\[0-9\]+\[^\n\r]*%zmm\[0-9\]+\{%k\[0-9\]\}\[^\n\r]*(?:\n|\[ \\t\]+#)" 2 } } */
+/* { dg-final { scan-assembler-times "vdpbf16ps\[ \\t\]+\[^\{\n\]*%zmm\[0-9\]+\[^\n\r]*%zmm\[0-9\]+\[^\{\n\]*%zmm\[0-9\]+\{%k\[0-9\]\}\{z\}\[^\n\r]*(?:\n|\[ \\t\]+#)" 1 } } */
+
+#include <immintrin.h>
+
+volatile __m512 res;
+volatile __m512bh x1, x2;
+volatile __mmask16 m16;
+
+void extern
+avx512bf16_test (void)
+{
+  res = _mm512_dpbf16_ps (res, x1, x2);
+  res = _mm512_mask_dpbf16_ps (res, m16, x1, x2);
+  res = _mm512_maskz_dpbf16_ps (m16, res, x1, x2);
+}
Index: gcc/testsuite/gcc.target/i386/avx512bf16-vdpbf16ps-2.c
===================================================================
--- gcc/testsuite/gcc.target/i386/avx512bf16-vdpbf16ps-2.c	(nonexistent)
+++ gcc/testsuite/gcc.target/i386/avx512bf16-vdpbf16ps-2.c	(working copy)
@@ -0,0 +1,49 @@
+/* { dg-do compile } */
+/* { dg-options "-mavx512bf16 -O2" } */
+/* { dg-final { scan-assembler-times "vdpbf16ps\[ \\t\]+\[^\{\n\]*%zmm\[0-9\]+\[^\{\n\]*%zmm\[0-9\]+\[^\n\r]*%zmm\[0-9\]+\{%k\[0-9\]\}\[^\n\r]*(?:\n|\[ \\t\]+#)" 1 } } */
+
+#include <immintrin.h>
+
+typedef union
+{
+  __m512 x;
+  float a[16];
+} union512s;
+
+float res_ref[16];
+union512s res;
+__m512bh x1, x2;
+__mmask16 m16;
+
+static void __attribute__((noinline, unused))
+merge_masking_s (float *arr, unsigned long long mask, int size)
+{
+  int i;
+  for (i = 0; i < size; i++)
+  {
+    arr[i] = (mask & (1LL << i)) ? arr[i] : 117;
+  }
+}
+
+static int __attribute__((noinline, unused))
+check_union512s (union512s u, const float *v)
+{
+  int i;
+  int err = 0;
+  for (i = 0; i < (sizeof (u.a) / sizeof ((u.a)[0])); i++)
+    if (u.a[i] != v[i])
+    {
+      err++;
+      ;
+    }
+  return err;
+}
+
+void extern
+avx512bf16_test (void)
+{
+  res.x = _mm512_mask_dpbf16_ps (res.x, m16, x1, x2);
+  merge_masking_s (res_ref, m16, 16);
+  if (check_union512s (res, res_ref))
+    abort ();
+}
Index: gcc/testsuite/gcc.target/i386/avx512bf16vl-vcvtne2ps2bf16-1.c
===================================================================
--- gcc/testsuite/gcc.target/i386/avx512bf16vl-vcvtne2ps2bf16-1.c	(nonexistent)
+++ gcc/testsuite/gcc.target/i386/avx512bf16vl-vcvtne2ps2bf16-1.c	(working copy)
@@ -0,0 +1,29 @@
+/* { dg-do compile } */
+/* { dg-options "-mavx512bf16 -mavx512vl -O2" } */
+/* { dg-final { scan-assembler-times "vcvtne2ps2bf16\[ \\t\]+\[^\{\n\]*%ymm\[0-9\]+\[^\n\r]*%ymm\[0-9\]+\[^\n\r]*%ymm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */
+/* { dg-final { scan-assembler-times "vcvtne2ps2bf16\[ \\t\]+\[^\{\n\]*%ymm\[0-9\]+\[^\n\r]*%ymm\[0-9\]+\[^\n\r]*%ymm\[0-9\]+\{%k\[0-9\]\}(?:\n|\[ \\t\]+#)" 1 } } */
+/* { dg-final { scan-assembler-times "vcvtne2ps2bf16\[ \\t\]+\[^\{\n\]*%ymm\[0-9\]+\[^\n\r]*%ymm\[0-9\]+\[^\n\r]*%ymm\[0-9\]+\{%k\[0-9\]\}\{z\}\[^\n\r]*(?:\n|\[ \\t\]+#)" 1 } } */
+/* { dg-final { scan-assembler-times "vcvtne2ps2bf16\[ \\t\]+\[^\{\n\]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */
+/* { dg-final { scan-assembler-times "vcvtne2ps2bf16\[ \\t\]+\[^\{\n\]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\{%k\[0-9\]\}(?:\n|\[ \\t\]+#)" 1 } } */
+/* { dg-final { scan-assembler-times "vcvtne2ps2bf16\[ \\t\]+\[^\{\n\]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\{%k\[0-9\]\}\{z\}\[^\n\r]*(?:\n|\[ \\t\]+#)" 1 } } */
+
+#include <immintrin.h>
+
+volatile __m128bh res1;
+volatile __m256bh res2;
+volatile __m128 x1, x2;
+volatile __m256 x3, x4;
+volatile __mmask8 m8;
+volatile __mmask16 m16;
+
+void extern
+avx512bf16_test (void)
+{
+  res2 = _mm256_cvtne2ps_pbh (x3, x4);
+  res2 = _mm256_mask_cvtne2ps_pbh (res2, m16, x3, x4);
+  res2 = _mm256_maskz_cvtne2ps_pbh (m16, x3, x4);
+
+  res1 = _mm_cvtne2ps_pbh (x1, x2);
+  res1 = _mm_mask_cvtne2ps_pbh (res1, m8, x1, x2);
+  res1 = _mm_maskz_cvtne2ps_pbh (m8, x1, x2);
+}
Index: gcc/testsuite/gcc.target/i386/avx512bf16vl-vcvtneps2bf16-1.c
===================================================================
--- gcc/testsuite/gcc.target/i386/avx512bf16vl-vcvtneps2bf16-1.c	(nonexistent)
+++ gcc/testsuite/gcc.target/i386/avx512bf16vl-vcvtneps2bf16-1.c	(working copy)
@@ -0,0 +1,27 @@
+/* { dg-do compile } */
+/* { dg-options "-mavx512bf16 -mavx512vl -O2" } */
+/* { dg-final { scan-assembler-times "vcvtneps2bf16\[ \\t\]+\[^\{\n\]*%ymm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */
+/* { dg-final { scan-assembler-times "vcvtneps2bf16\[ \\t\]+\[^\{\n\]*%ymm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\{%k\[0-9\]\}(?:\n|\[ \\t\]+#)" 1 } } */
+/* { dg-final { scan-assembler-times "vcvtneps2bf16\[ \\t\]+\[^\{\n\]*%ymm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\{%k\[0-9\]\}\{z\}\[^\n\r]*(?:\n|\[ \\t\]+#)" 1 } } */
+/* { dg-final { scan-assembler-times "vcvtneps2bf16\[ \\t\]+\[^\{\n\]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */
+/* { dg-final { scan-assembler-times "vcvtneps2bf16\[ \\t\]+\[^\{\n\]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\{%k\[0-9\]\}(?:\n|\[ \\t\]+#)" 1 } } */
+/* { dg-final { scan-assembler-times "vcvtneps2bf16\[ \\t\]+\[^\{\n\]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\{%k\[0-9\]\}\{z\}\[^\n\r]*(?:\n|\[ \\t\]+#)" 1 } } */
+
+#include <immintrin.h>
+
+volatile __m128bh res1, res2;
+volatile __m128 x1;
+volatile __m256 x2;
+volatile __mmask8 m8;
+
+void extern
+avx512bf16_test (void)
+{
+  res2 = _mm256_cvtneps_pbh (x2);
+  res2 = _mm256_mask_cvtneps_pbh (res2, m8, x2);
+  res2 = _mm256_maskz_cvtneps_pbh (m8, x2);
+
+  res1 = _mm_cvtneps_pbh (x1);
+  res1 = _mm_mask_cvtneps_pbh (res1, m8, x1);
+  res1 = _mm_maskz_cvtneps_pbh (m8, x1);
+}
Index: gcc/testsuite/gcc.target/i386/avx512bf16vl-vdpbf16ps-1.c
===================================================================
--- gcc/testsuite/gcc.target/i386/avx512bf16vl-vdpbf16ps-1.c	(nonexistent)
+++ gcc/testsuite/gcc.target/i386/avx512bf16vl-vdpbf16ps-1.c	(working copy)
@@ -0,0 +1,28 @@
+/* { dg-do compile } */
+/* { dg-options "-mavx512bf16 -mavx512vl -O2" } */
+/* { dg-final { scan-assembler-times "vdpbf16ps\[ \\t\]+\[^\{\n\]*%ymm\[0-9\]+\[^\n\r]*%ymm\[0-9\]+\[^\{\n\]*%ymm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */
+/* { dg-final { scan-assembler-times "vdpbf16ps\[ \\t\]+\[^\{\n\]*%ymm\[0-9\]+\[^\{\n\]*%ymm\[0-9\]+\[^\n\r]*%ymm\[0-9\]+\{%k\[0-9\]\}\[^\n\r]*(?:\n|\[ \\t\]+#)" 2 } } */
+/* { dg-final { scan-assembler-times "vdpbf16ps\[ \\t\]+\[^\{\n\]*%ymm\[0-9\]+\[^\n\r]*%ymm\[0-9\]+\[^\{\n\]*%ymm\[0-9\]+\{%k\[0-9\]\}\{z\}\[^\n\r]*(?:\n|\[ \\t\]+#)" 1 } } */
+/* { dg-final { scan-assembler-times "vdpbf16ps\[ \\t\]+\[^\{\n\]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\[^\{\n\]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */
+/* { dg-final { scan-assembler-times "vdpbf16ps\[ \\t\]+\[^\{\n\]*%xmm\[0-9\]+\[^\{\n\]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\{%k\[0-9\]\}\[^\n\r]*(?:\n|\[ \\t\]+#)" 2 } } */
+/* { dg-final { scan-assembler-times "vdpbf16ps\[ \\t\]+\[^\{\n\]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\[^\{\n\]*%xmm\[0-9\]+\{%k\[0-9\]\}\{z\}\[^\n\r]*(?:\n|\[ \\t\]+#)" 1 } } */
+
+#include <immintrin.h>
+
+volatile __m256 res1;
+volatile __m256bh x1, x2;
+volatile __m128 res2;
+volatile __m128bh x3, x4;
+volatile __mmask8 m8;
+
+void extern
+avx512bf16_test (void)
+{
+  res1 = _mm256_dpbf16_ps (res1, x1, x2);
+  res1 = _mm256_mask_dpbf16_ps (res1, m8, x1, x2);
+  res1 = _mm256_maskz_dpbf16_ps (m8, res1, x1, x2);
+
+  res2 = _mm_dpbf16_ps (res2, x3, x4);
+  res2 = _mm_mask_dpbf16_ps (res2, m8, x3, x4);
+  res2 = _mm_maskz_dpbf16_ps (m8, res2, x3, x4);
+}
Index: gcc/testsuite/gcc.target/i386/sse-12.c
===================================================================
--- gcc/testsuite/gcc.target/i386/sse-12.c	(revision 270934)
+++ gcc/testsuite/gcc.target/i386/sse-12.c	(working copy)
@@ -3,7 +3,7 @@
    popcntintrin.h gfniintrin.h and mm_malloc.h are usable
    with -O -std=c89 -pedantic-errors.  */
 /* { dg-do compile } */
-/* { dg-options "-O -std=c89 -pedantic-errors -march=k8 -msse4a -m3dnow -mavx -mavx2 -mfma4 -mxop -maes -mpclmul -mpopcnt -mabm -mlzcnt -mbmi -mbmi2 -mtbm -mlwp -mfsgsbase -mrdrnd -mf16c -mfma -mrtm -mrdseed -mprfchw -madx -mfxsr -mxsaveopt -mavx512f -mavx512er -mavx512cd -mavx512pf -msha -mprefetchwt1 -mxsavec -mxsaves -mclflushopt -mavx512bw -mavx512dq -mavx512vl -mavx512vbmi -mavx512ifma -mavx5124fmaps -mavx5124vnniw -mavx512vpopcntdq -mclwb -mmwaitx -mclzero -mpku -msgx -mrdpid -mgfni -mavx512bitalg -mpconfig -mwbnoinvd" } */
+/* { dg-options "-O -std=c89 -pedantic-errors -march=k8 -msse4a -m3dnow -mavx -mavx2 -mfma4 -mxop -maes -mpclmul -mpopcnt -mabm -mlzcnt -mbmi -mbmi2 -mtbm -mlwp -mfsgsbase -mrdrnd -mf16c -mfma -mrtm -mrdseed -mprfchw -madx -mfxsr -mxsaveopt -mavx512f -mavx512er -mavx512cd -mavx512pf -msha -mprefetchwt1 -mxsavec -mxsaves -mclflushopt -mavx512bw -mavx512dq -mavx512vl -mavx512vbmi -mavx512ifma -mavx5124fmaps -mavx5124vnniw -mavx512vpopcntdq -mclwb -mmwaitx -mclzero -mpku -msgx -mrdpid -mgfni -mavx512bitalg -mpconfig -mwbnoinvd -mavx512bf16" } */
 
 #include <x86intrin.h>
 
Index: gcc/testsuite/gcc.target/i386/sse-13.c
===================================================================
--- gcc/testsuite/gcc.target/i386/sse-13.c	(revision 270934)
+++ gcc/testsuite/gcc.target/i386/sse-13.c	(working copy)
@@ -1,5 +1,5 @@
 /* { dg-do compile } */
-/* { dg-options "-O2 -Werror-implicit-function-declaration -march=k8 -msse4a -m3dnow -mavx -mavx2 -mfma4 -mxop -maes -mpclmul -mpopcnt -mabm -mlzcnt -mbmi -mbmi2 -mtbm -mlwp -mfsgsbase -mrdrnd -mf16c -mfma -mrtm -mrdseed -mprfchw -madx -mfxsr -mxsaveopt -mavx512f -mavx512er -mavx512cd -mavx512pf -msha -mprefetchwt1 -mxsavec -mxsaves -mclflushopt -mavx512vl -mavx512dq -mavx512bw -mavx512vbmi -mavx512ifma -mavx5124fmaps -mavx5124vnniw -mavx512vpopcntdq -mclwb -mmwaitx -mclzero -mpku -msgx -mrdpid -mgfni -mavx512bitalg -mpconfig -mwbnoinvd" } */
+/* { dg-options "-O2 -Werror-implicit-function-declaration -march=k8 -msse4a -m3dnow -mavx -mavx2 -mfma4 -mxop -maes -mpclmul -mpopcnt -mabm -mlzcnt -mbmi -mbmi2 -mtbm -mlwp -mfsgsbase -mrdrnd -mf16c -mfma -mrtm -mrdseed -mprfchw -madx -mfxsr -mxsaveopt -mavx512f -mavx512er -mavx512cd -mavx512pf -msha -mprefetchwt1 -mxsavec -mxsaves -mclflushopt -mavx512vl -mavx512dq -mavx512bw -mavx512vbmi -mavx512ifma -mavx5124fmaps -mavx5124vnniw -mavx512vpopcntdq -mclwb -mmwaitx -mclzero -mpku -msgx -mrdpid -mgfni -mavx512bitalg -mpconfig -mwbnoinvd -mavx512bf16" } */
 /* { dg-add-options bind_pic_locally } */
 
 #include <mm_malloc.h>
Index: gcc/testsuite/gcc.target/i386/sse-14.c
===================================================================
--- gcc/testsuite/gcc.target/i386/sse-14.c	(revision 270934)
+++ gcc/testsuite/gcc.target/i386/sse-14.c	(working copy)
@@ -1,5 +1,5 @@
 /* { dg-do compile } */
-/* { dg-options "-O0 -Werror-implicit-function-declaration -march=k8 -msse4a -m3dnow -mavx -mavx2 -mfma4 -mxop -maes -mpclmul -mpopcnt -mabm -mlzcnt -mbmi -mbmi2 -mtbm -mlwp -mfsgsbase -mrdrnd -mf16c -mfma -mrtm -mrdseed -mprfchw -madx -mfxsr -mxsaveopt -mavx512f -mavx512er -mavx512cd -mavx512pf -msha -mprefetchwt1 -mxsavec -mxsaves -mclflushopt -mavx512dq -mavx512bw -mavx512vl -mavx512ifma -mavx512vbmi -mavx5124fmaps -mavx5124vnniw -mavx512vpopcntdq -mclwb -mmwaitx -mclzero -mpku -msgx -mrdpid -mgfni -mpconfig -mwbnoinvd" } */
+/* { dg-options "-O0 -Werror-implicit-function-declaration -march=k8 -msse4a -m3dnow -mavx -mavx2 -mfma4 -mxop -maes -mpclmul -mpopcnt -mabm -mlzcnt -mbmi -mbmi2 -mtbm -mlwp -mfsgsbase -mrdrnd -mf16c -mfma -mrtm -mrdseed -mprfchw -madx -mfxsr -mxsaveopt -mavx512f -mavx512er -mavx512cd -mavx512pf -msha -mprefetchwt1 -mxsavec -mxsaves -mclflushopt -mavx512dq -mavx512bw -mavx512vl -mavx512ifma -mavx512vbmi -mavx5124fmaps -mavx5124vnniw -mavx512vpopcntdq -mclwb -mmwaitx -mclzero -mpku -msgx -mrdpid -mgfni -mpconfig -mwbnoinvd -mavx512vl -mavx512bf16" } */
 /* { dg-add-options bind_pic_locally } */
 
 #include <mm_malloc.h>
Index: gcc/testsuite/gcc.target/i386/sse-22.c
===================================================================
--- gcc/testsuite/gcc.target/i386/sse-22.c	(revision 270934)
+++ gcc/testsuite/gcc.target/i386/sse-22.c	(working copy)
@@ -101,7 +101,7 @@
 
 
 #ifndef DIFFERENT_PRAGMAS
-#pragma GCC target ("sse4a,3dnow,avx,avx2,fma4,xop,aes,pclmul,popcnt,abm,lzcnt,bmi,bmi2,tbm,lwp,fsgsbase,rdrnd,f16c,rtm,rdseed,prfchw,adx,fxsr,xsaveopt,avx512f,avx512er,avx512cd,avx512pf,sha,prefetchwt1,avx512vl,avx512bw,avx512dq,avx512vbmi,avx512ifma,avx5124fmaps,avx5124vnniw,avx512vpopcntdq,gfni,avx512bitalg")
+#pragma GCC target ("sse4a,3dnow,avx,avx2,fma4,xop,aes,pclmul,popcnt,abm,lzcnt,bmi,bmi2,tbm,lwp,fsgsbase,rdrnd,f16c,rtm,rdseed,prfchw,adx,fxsr,xsaveopt,avx512f,avx512er,avx512cd,avx512pf,sha,prefetchwt1,avx512vl,avx512bw,avx512dq,avx512vbmi,avx512ifma,avx5124fmaps,avx5124vnniw,avx512vpopcntdq,gfni,avx512bitalg,avx512bf16")
 #endif
 
 /* Following intrinsics require immediate arguments.  They
@@ -218,7 +218,7 @@
 
 /* immintrin.h (AVX/AVX2/RDRND/FSGSBASE/F16C/RTM/AVX512F/SHA) */
 #ifdef DIFFERENT_PRAGMAS
-#pragma GCC target ("avx,avx2,rdrnd,fsgsbase,f16c,rtm,avx512f,avx512er,avx512cd,avx512pf,sha,avx512vl,avx512bw,avx512dq,avx512ifma,avx512vbmi,avx5124fmaps,avx5124vnniw,avx512vpopcntdq,gfni,avx512bitalg")
+#pragma GCC target ("avx,avx2,rdrnd,fsgsbase,f16c,rtm,avx512f,avx512er,avx512cd,avx512pf,sha,avx512vl,avx512bw,avx512dq,avx512ifma,avx512vbmi,avx5124fmaps,avx5124vnniw,avx512vpopcntdq,gfni,avx512bitalg,avx512bf16")
 #endif
 #include <immintrin.h>
 test_1 (_cvtss_sh, unsigned short, float, 1)
Index: gcc/testsuite/gcc.target/i386/sse-23.c
===================================================================
--- gcc/testsuite/gcc.target/i386/sse-23.c	(revision 270934)
+++ gcc/testsuite/gcc.target/i386/sse-23.c	(working copy)
@@ -696,6 +696,6 @@
 #define __builtin_ia32_vpclmulqdq_v2di(A, B, C)  __builtin_ia32_vpclmulqdq_v2di(A, B, 1) 
 #define __builtin_ia32_vpclmulqdq_v8di(A, B, C)  __builtin_ia32_vpclmulqdq_v8di(A, B, 1) 
 
-#pragma GCC target ("sse4a,3dnow,avx,avx2,fma4,xop,aes,pclmul,popcnt,abm,lzcnt,bmi,bmi2,tbm,lwp,fsgsbase,rdrnd,f16c,fma,rtm,rdseed,prfchw,adx,fxsr,xsaveopt,avx512f,avx512er,avx512cd,avx512pf,sha,prefetchwt1,xsavec,xsaves,clflushopt,avx512bw,avx512dq,avx512vl,avx512vbmi,avx512ifma,avx5124fmaps,avx5124vnniw,avx512vpopcntdq,clwb,mwaitx,clzero,pku,sgx,rdpid,gfni,avx512vbmi2,vpclmulqdq,avx512bitalg,pconfig,wbnoinvd")
+#pragma GCC target ("sse4a,3dnow,avx,avx2,fma4,xop,aes,pclmul,popcnt,abm,lzcnt,bmi,bmi2,tbm,lwp,fsgsbase,rdrnd,f16c,fma,rtm,rdseed,prfchw,adx,fxsr,xsaveopt,avx512f,avx512er,avx512cd,avx512pf,sha,prefetchwt1,xsavec,xsaves,clflushopt,avx512bw,avx512dq,avx512vl,avx512vbmi,avx512ifma,avx5124fmaps,avx5124vnniw,avx512vpopcntdq,clwb,mwaitx,clzero,pku,sgx,rdpid,gfni,avx512vbmi2,vpclmulqdq,avx512bitalg,pconfig,wbnoinvd,avx512bf16")
 
 #include <x86intrin.h>

Reply via email to