Re: [PATCH] i386: Enable intrinsics that convert float and bf16 data to each other.
On Wed, Dec 22, 2021 at 11:28 AM Kong, Lingling via Gcc-patches wrote: > > Hi, > > > This patch is to enable intrinsics that convert float and bf16 data to each > other. > Ok for master? > Ok. > gcc/ChangeLog: > > * config/i386/avx512bf16intrin.h (_mm_cvtsbh_ss): Add new intrinsic. > (_mm512_cvtpbh_ps): Likewise. > (_mm512_maskz_cvtpbh_ps): Likewise. > (_mm512_mask_cvtpbh_ps): Likewise. > * config/i386/avx512bf16vlintrin.h (_mm_cvtness_sbh): Likewise. > (_mm_cvtpbh_ps): Likewise. > (_mm256_cvtpbh_ps): Likewise. > (_mm_maskz_cvtpbh_ps): Likewise. > (_mm256_maskz_cvtpbh_ps): Likewise. > (_mm_mask_cvtpbh_ps): Likewise. > (_mm256_mask_cvtpbh_ps): Likewise. > > gcc/testsuite/ChangeLog: > > * gcc.target/i386/avx512bf16-cvtsbh2ss-1.c: New test. > * gcc.target/i386/avx512bf16-vcvtpbh2ps-1.c: Ditto. > * gcc.target/i386/avx512bf16vl-cvtness2sbh-1.c: Ditto. > * gcc.target/i386/avx512bf16vl-vcvtpbh2ps-1.c: Ditto. > --- > gcc/config/i386/avx512bf16intrin.h| 36 +++ > gcc/config/i386/avx512bf16vlintrin.h | 63 +++ > .../gcc.target/i386/avx512bf16-cvtsbh2ss-1.c | 15 + > .../gcc.target/i386/avx512bf16-vcvtpbh2ps-1.c | 20 ++ > .../i386/avx512bf16vl-cvtness2sbh-1.c | 14 + > .../i386/avx512bf16vl-vcvtpbh2ps-1.c | 29 + > 6 files changed, 177 insertions(+) > create mode 100644 gcc/testsuite/gcc.target/i386/avx512bf16-cvtsbh2ss-1.c > create mode 100644 gcc/testsuite/gcc.target/i386/avx512bf16-vcvtpbh2ps-1.c > create mode 100644 gcc/testsuite/gcc.target/i386/avx512bf16vl-cvtness2sbh-1.c > create mode 100644 gcc/testsuite/gcc.target/i386/avx512bf16vl-vcvtpbh2ps-1.c > > diff --git a/gcc/config/i386/avx512bf16intrin.h > b/gcc/config/i386/avx512bf16intrin.h > index 9afc6bd7d2b..6b62dc3e398 100644 > --- a/gcc/config/i386/avx512bf16intrin.h > +++ b/gcc/config/i386/avx512bf16intrin.h > @@ -41,6 +41,16 @@ typedef short __v32bh __attribute__ ((__vector_size__ > (64))); > vector types, and their scalar components. 
*/ typedef short __m512bh > __attribute__ ((__vector_size__ (64), __may_alias__)); > > +/* Convert One BF16 Data to One Single Float Data. */ extern __inline > +float __attribute__ ((__gnu_inline__, __always_inline__, > +__artificial__)) _mm_cvtsbh_ss (__bfloat16 __A) { > + union{ float a; unsigned int b;} __tmp; > + __tmp.b = ((unsigned int)(__A)) << 16; > + return __tmp.a; > +} > + > /* vcvtne2ps2bf16 */ > > extern __inline __m512bh > @@ -110,6 +120,32 @@ _mm512_maskz_dpbf16_ps (__mmask16 __A, __m512 __B, > __m512bh __C, __m512bh __D) >return (__m512)__builtin_ia32_dpbf16ps_v16sf_maskz(__B, __C, __D, __A); } > > +extern __inline __m512 > +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) > +_mm512_cvtpbh_ps (__m256bh __A) { > + return (__m512)_mm512_castsi512_ps ((__m512i)_mm512_slli_epi32 ( > +(__m512i)_mm512_cvtepi16_epi32 ((__m256i)__A), 16)); } > + > +extern __inline __m512 > +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) > +_mm512_maskz_cvtpbh_ps (__mmask16 __U, __m256bh __A) { > + return (__m512)_mm512_castsi512_ps ((__m512i) _mm512_slli_epi32 ( > +(__m512i)_mm512_maskz_cvtepi16_epi32 ( > +(__mmask16)__U, (__m256i)__A), 16)); > +} > + > +extern __inline __m512 > +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) > +_mm512_mask_cvtpbh_ps (__m512 __S, __mmask16 __U, __m256bh __A) { > + return (__m512)_mm512_castsi512_ps ((__m512i)(_mm512_mask_slli_epi32 ( > +(__m512i)__S, (__mmask16)__U, > +(__m512i)_mm512_cvtepi16_epi32 ((__m256i)__A), 16))); } > + > #ifdef __DISABLE_AVX512BF16__ > #undef __DISABLE_AVX512BF16__ > #pragma GCC pop_options > diff --git a/gcc/config/i386/avx512bf16vlintrin.h > b/gcc/config/i386/avx512bf16vlintrin.h > index 6dd396d4008..5e6a6503aa6 100644 > --- a/gcc/config/i386/avx512bf16vlintrin.h > +++ b/gcc/config/i386/avx512bf16vlintrin.h > @@ -43,6 +43,7 @@ typedef short __v8bh __attribute__ ((__vector_size__ > (16))); typedef short __m256bh __attribute__ ((__vector_size__ (32), > 
__may_alias__)); typedef short __m128bh __attribute__ ((__vector_size__ > (16), __may_alias__)); > > +typedef unsigned short __bfloat16; > /* vcvtne2ps2bf16 */ > > extern __inline __m256bh > @@ -175,6 +176,68 @@ _mm_maskz_dpbf16_ps (__mmask8 __A, __m128 __B, __m128bh > __C, __m128bh __D) >return (__m128)__builtin_ia32_dpbf16ps_v4sf_maskz(__B, __C, __D, __A); } > > +extern __inline __bfloat16 > +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) > +_mm_cvtness_sbh (float __A) { > + __v4sf __V = {__A, 0, 0, 0}; > + __v8hi __R = __builtin_ia32_cvtneps2bf16_v4sf_mask ((__v4sf)__V, > + (__v8hi)_mm_undefined_si128 (), (__mmask8)-1); > + return __R[0]; > +} > + > +extern __inline __m128 > +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)
[PATCH] i386: Enable intrinsics that convert float and bf16 data to each other.
Hi, This patch enables intrinsics that convert between float and bf16 data. Ok for master? gcc/ChangeLog: * config/i386/avx512bf16intrin.h (_mm_cvtsbh_ss): Add new intrinsic. (_mm512_cvtpbh_ps): Likewise. (_mm512_maskz_cvtpbh_ps): Likewise. (_mm512_mask_cvtpbh_ps): Likewise. * config/i386/avx512bf16vlintrin.h (_mm_cvtness_sbh): Likewise. (_mm_cvtpbh_ps): Likewise. (_mm256_cvtpbh_ps): Likewise. (_mm_maskz_cvtpbh_ps): Likewise. (_mm256_maskz_cvtpbh_ps): Likewise. (_mm_mask_cvtpbh_ps): Likewise. (_mm256_mask_cvtpbh_ps): Likewise. gcc/testsuite/ChangeLog: * gcc.target/i386/avx512bf16-cvtsbh2ss-1.c: New test. * gcc.target/i386/avx512bf16-vcvtpbh2ps-1.c: Ditto. * gcc.target/i386/avx512bf16vl-cvtness2sbh-1.c: Ditto. * gcc.target/i386/avx512bf16vl-vcvtpbh2ps-1.c: Ditto. --- gcc/config/i386/avx512bf16intrin.h| 36 +++ gcc/config/i386/avx512bf16vlintrin.h | 63 +++ .../gcc.target/i386/avx512bf16-cvtsbh2ss-1.c | 15 + .../gcc.target/i386/avx512bf16-vcvtpbh2ps-1.c | 20 ++ .../i386/avx512bf16vl-cvtness2sbh-1.c | 14 + .../i386/avx512bf16vl-vcvtpbh2ps-1.c | 29 + 6 files changed, 177 insertions(+) create mode 100644 gcc/testsuite/gcc.target/i386/avx512bf16-cvtsbh2ss-1.c create mode 100644 gcc/testsuite/gcc.target/i386/avx512bf16-vcvtpbh2ps-1.c create mode 100644 gcc/testsuite/gcc.target/i386/avx512bf16vl-cvtness2sbh-1.c create mode 100644 gcc/testsuite/gcc.target/i386/avx512bf16vl-vcvtpbh2ps-1.c diff --git a/gcc/config/i386/avx512bf16intrin.h b/gcc/config/i386/avx512bf16intrin.h index 9afc6bd7d2b..6b62dc3e398 100644 --- a/gcc/config/i386/avx512bf16intrin.h +++ b/gcc/config/i386/avx512bf16intrin.h @@ -41,6 +41,16 @@ typedef short __v32bh __attribute__ ((__vector_size__ (64))); vector types, and their scalar components. */ typedef short __m512bh __attribute__ ((__vector_size__ (64), __may_alias__)); +/* Convert One BF16 Data to One Single Float Data. 
*/ extern __inline +float __attribute__ ((__gnu_inline__, __always_inline__, +__artificial__)) _mm_cvtsbh_ss (__bfloat16 __A) { + union{ float a; unsigned int b;} __tmp; + __tmp.b = ((unsigned int)(__A)) << 16; + return __tmp.a; +} + /* vcvtne2ps2bf16 */ extern __inline __m512bh @@ -110,6 +120,32 @@ _mm512_maskz_dpbf16_ps (__mmask16 __A, __m512 __B, __m512bh __C, __m512bh __D) return (__m512)__builtin_ia32_dpbf16ps_v16sf_maskz(__B, __C, __D, __A); } +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cvtpbh_ps (__m256bh __A) { + return (__m512)_mm512_castsi512_ps ((__m512i)_mm512_slli_epi32 ( +(__m512i)_mm512_cvtepi16_epi32 ((__m256i)__A), 16)); } + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_cvtpbh_ps (__mmask16 __U, __m256bh __A) { + return (__m512)_mm512_castsi512_ps ((__m512i) _mm512_slli_epi32 ( +(__m512i)_mm512_maskz_cvtepi16_epi32 ( +(__mmask16)__U, (__m256i)__A), 16)); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cvtpbh_ps (__m512 __S, __mmask16 __U, __m256bh __A) { + return (__m512)_mm512_castsi512_ps ((__m512i)(_mm512_mask_slli_epi32 ( +(__m512i)__S, (__mmask16)__U, +(__m512i)_mm512_cvtepi16_epi32 ((__m256i)__A), 16))); } + #ifdef __DISABLE_AVX512BF16__ #undef __DISABLE_AVX512BF16__ #pragma GCC pop_options diff --git a/gcc/config/i386/avx512bf16vlintrin.h b/gcc/config/i386/avx512bf16vlintrin.h index 6dd396d4008..5e6a6503aa6 100644 --- a/gcc/config/i386/avx512bf16vlintrin.h +++ b/gcc/config/i386/avx512bf16vlintrin.h @@ -43,6 +43,7 @@ typedef short __v8bh __attribute__ ((__vector_size__ (16))); typedef short __m256bh __attribute__ ((__vector_size__ (32), __may_alias__)); typedef short __m128bh __attribute__ ((__vector_size__ (16), __may_alias__)); +typedef unsigned short __bfloat16; /* vcvtne2ps2bf16 */ extern __inline __m256bh @@ -175,6 +176,68 @@ _mm_maskz_dpbf16_ps (__mmask8 __A, 
__m128 __B, __m128bh __C, __m128bh __D) return (__m128)__builtin_ia32_dpbf16ps_v4sf_maskz(__B, __C, __D, __A); } +extern __inline __bfloat16 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtness_sbh (float __A) { + __v4sf __V = {__A, 0, 0, 0}; + __v8hi __R = __builtin_ia32_cvtneps2bf16_v4sf_mask ((__v4sf)__V, + (__v8hi)_mm_undefined_si128 (), (__mmask8)-1); + return __R[0]; +} + +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtpbh_ps (__m128bh __A) { + return (__m128)_mm_castsi128_ps ((__m128i)_mm_slli_epi32 ( +(__m128i)_mm_cvtepi16_epi32 ((__m128i)__A), 16)); } + +extern __inline __m256 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_cvtpbh_ps (__m128bh __A) { +