On Tue, Apr 18, 2023 at 3:13 PM Hu, Lin1 via Gcc-patches <gcc-patches@gcc.gnu.org> wrote:
>
> More details: the Intrinsics Guide adds these 128/256-bit intrinsics, as follows:
> https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=reduce_&ig_expand=5814.
>
> So we intend to enable these intrinsics for GCC-14.
>
> -----Original Message-----
> From: Gcc-patches <gcc-patches-bounces+lin1.hu=intel....@gcc.gnu.org> On Behalf Of Hu, Lin1 via Gcc-patches
> Sent: Tuesday, April 18, 2023 3:03 PM
> To: gcc-patches@gcc.gnu.org
> Cc: Liu, Hongtao <hongtao....@intel.com>; ubiz...@gmail.com
> Subject: [PATCH] i386: Add reduce_*_ep[i|u][8|16] series intrinsics
>
> Hi all,
>
> This patch adds the reduce_*_ep[i|u][8|16] series intrinsics; it has been tested on x86_64-pc-linux-gnu.  OK for trunk?

Ok.

> BRs,
> Lin
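For context: each of these intrinsics horizontally reduces all lanes of a vector with the named operation (+, *, &, |, min, max) and returns the scalar result; the _mask_ variants first force lanes cleared in the mask to the operation's identity value. A minimal usage sketch, not part of the patch (the unmasked forms live in avx2intrin.h and need only -mavx2; the masked forms need -mavx512bw -mavx512vl, and a compiler that ships these intrinsics, i.e. GCC 14 per this patch):

#include <immintrin.h>
#include <stdio.h>

int
main (void)
{
  /* _mm_set_epi16 lists elements high-to-low, so lane 0 holds 8.  */
  __m128i v = _mm_set_epi16 (1, 2, 3, 4, 5, 6, 7, 8);

  short sum = _mm_reduce_add_epi16 (v);   /* 1+2+...+8 = 36 */
  short max = _mm_reduce_max_epi16 (v);   /* 8 */

  /* Mask 0x0f keeps lanes 0-3 (values 8, 7, 6, 5); masked-out lanes
     act as the additive identity 0, so the result is 26.  */
  short msum = _mm_mask_reduce_add_epi16 (0x0f, v);

  printf ("%d %d %d\n", sum, max, msum);
  return 0;
}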
>
> gcc/ChangeLog:
>
> 	* config/i386/avx2intrin.h
> 	(_MM_REDUCE_OPERATOR_BASIC_EPI16): New macro.
> 	(_MM_REDUCE_OPERATOR_MAX_MIN_EP16): Ditto.
> 	(_MM256_REDUCE_OPERATOR_BASIC_EPI16): Ditto.
> 	(_MM256_REDUCE_OPERATOR_MAX_MIN_EP16): Ditto.
> 	(_MM_REDUCE_OPERATOR_BASIC_EPI8): Ditto.
> 	(_MM_REDUCE_OPERATOR_MAX_MIN_EP8): Ditto.
> 	(_MM256_REDUCE_OPERATOR_BASIC_EPI8): Ditto.
> 	(_MM256_REDUCE_OPERATOR_MAX_MIN_EP8): Ditto.
> 	(_mm_reduce_add_epi16): New intrinsics.
> 	(_mm_reduce_mul_epi16): Ditto.
> 	(_mm_reduce_and_epi16): Ditto.
> 	(_mm_reduce_or_epi16): Ditto.
> 	(_mm_reduce_max_epi16): Ditto.
> 	(_mm_reduce_max_epu16): Ditto.
> 	(_mm_reduce_min_epi16): Ditto.
> 	(_mm_reduce_min_epu16): Ditto.
> 	(_mm256_reduce_add_epi16): Ditto.
> 	(_mm256_reduce_mul_epi16): Ditto.
> 	(_mm256_reduce_and_epi16): Ditto.
> 	(_mm256_reduce_or_epi16): Ditto.
> 	(_mm256_reduce_max_epi16): Ditto.
> 	(_mm256_reduce_max_epu16): Ditto.
> 	(_mm256_reduce_min_epi16): Ditto.
> 	(_mm256_reduce_min_epu16): Ditto.
> 	(_mm_reduce_add_epi8): Ditto.
> 	(_mm_reduce_mul_epi8): Ditto.
> 	(_mm_reduce_and_epi8): Ditto.
> 	(_mm_reduce_or_epi8): Ditto.
> 	(_mm_reduce_max_epi8): Ditto.
> 	(_mm_reduce_max_epu8): Ditto.
> 	(_mm_reduce_min_epi8): Ditto.
> 	(_mm_reduce_min_epu8): Ditto.
> 	(_mm256_reduce_add_epi8): Ditto.
> 	(_mm256_reduce_mul_epi8): Ditto.
> 	(_mm256_reduce_and_epi8): Ditto.
> 	(_mm256_reduce_or_epi8): Ditto.
> 	(_mm256_reduce_max_epi8): Ditto.
> 	(_mm256_reduce_max_epu8): Ditto.
> 	(_mm256_reduce_min_epi8): Ditto.
> 	(_mm256_reduce_min_epu8): Ditto.
> 	* config/i386/avx512vlbwintrin.h
> 	(_mm_mask_reduce_add_epi16): Ditto.
> 	(_mm_mask_reduce_mul_epi16): Ditto.
> 	(_mm_mask_reduce_and_epi16): Ditto.
> 	(_mm_mask_reduce_or_epi16): Ditto.
> 	(_mm_mask_reduce_max_epi16): Ditto.
> 	(_mm_mask_reduce_max_epu16): Ditto.
> 	(_mm_mask_reduce_min_epi16): Ditto.
> 	(_mm_mask_reduce_min_epu16): Ditto.
> 	(_mm256_mask_reduce_add_epi16): Ditto.
> 	(_mm256_mask_reduce_mul_epi16): Ditto.
> 	(_mm256_mask_reduce_and_epi16): Ditto.
> 	(_mm256_mask_reduce_or_epi16): Ditto.
> 	(_mm256_mask_reduce_max_epi16): Ditto.
> 	(_mm256_mask_reduce_max_epu16): Ditto.
> 	(_mm256_mask_reduce_min_epi16): Ditto.
> 	(_mm256_mask_reduce_min_epu16): Ditto.
> 	(_mm_mask_reduce_add_epi8): Ditto.
> 	(_mm_mask_reduce_mul_epi8): Ditto.
> 	(_mm_mask_reduce_and_epi8): Ditto.
> 	(_mm_mask_reduce_or_epi8): Ditto.
> 	(_mm_mask_reduce_max_epi8): Ditto.
> 	(_mm_mask_reduce_max_epu8): Ditto.
> 	(_mm_mask_reduce_min_epi8): Ditto.
> 	(_mm_mask_reduce_min_epu8): Ditto.
> 	(_mm256_mask_reduce_add_epi8): Ditto.
> 	(_mm256_mask_reduce_mul_epi8): Ditto.
> 	(_mm256_mask_reduce_and_epi8): Ditto.
> 	(_mm256_mask_reduce_or_epi8): Ditto.
> 	(_mm256_mask_reduce_max_epi8): Ditto.
> 	(_mm256_mask_reduce_max_epu8): Ditto.
> 	(_mm256_mask_reduce_min_epi8): Ditto.
> 	(_mm256_mask_reduce_min_epu8): Ditto.
>
> gcc/testsuite/ChangeLog:
>
> 	* gcc.target/i386/avx512vlbw-reduce-op-1.c: New test.
> ---
>  gcc/config/i386/avx2intrin.h                  | 347 ++++++++++++++++++
>  gcc/config/i386/avx512vlbwintrin.h            | 256 +++++++++++++
>  .../gcc.target/i386/avx512vlbw-reduce-op-1.c  | 206 +++++++++++
>  3 files changed, 809 insertions(+)
>  create mode 100644 gcc/testsuite/gcc.target/i386/avx512vlbw-reduce-op-1.c
>
> diff --git a/gcc/config/i386/avx2intrin.h b/gcc/config/i386/avx2intrin.h
> index 1b9c8169a96..9b8c13b7233 100644
> --- a/gcc/config/i386/avx2intrin.h
> +++ b/gcc/config/i386/avx2intrin.h
> @@ -1915,6 +1915,353 @@ _mm256_mask_i64gather_epi32 (__m128i __src, int const *__base,
>  					 (int) (SCALE))
>  #endif  /* __OPTIMIZE__ */
>
> +#define _MM_REDUCE_OPERATOR_BASIC_EPI16(op) \
> +  __v8hi __T1 = (__v8hi)__W; \
> +  __v8hi __T2 = __builtin_shufflevector (__T1, __T1, 4, 5, 6, 7, 4, 5, 6, 7); \
> +  __v8hi __T3 = __T1 op __T2; \
> +  __v8hi __T4 = __builtin_shufflevector (__T3, __T3, 2, 3, 2, 3, 4, 5, 6, 7); \
> +  __v8hi __T5 = __T3 op __T4; \
> +  __v8hi __T6 = __builtin_shufflevector (__T5, __T5, 1, 1, 2, 3, 4, 5, 6, 7); \
> +  __v8hi __T7 = __T5 op __T6; \
> +  return __T7[0]
> +
> +extern __inline short
> +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> +_mm_reduce_add_epi16 (__m128i __W)
> +{
> +  _MM_REDUCE_OPERATOR_BASIC_EPI16 (+);
> +}
> +
> +extern __inline short
> +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> +_mm_reduce_mul_epi16 (__m128i __W)
> +{
> +  _MM_REDUCE_OPERATOR_BASIC_EPI16 (*);
> +}
> +
> +extern __inline short
> +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> +_mm_reduce_and_epi16 (__m128i __W)
> +{
> +  _MM_REDUCE_OPERATOR_BASIC_EPI16 (&);
> +}
> +
> +extern __inline short
> +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> +_mm_reduce_or_epi16 (__m128i __W)
> +{
> +  _MM_REDUCE_OPERATOR_BASIC_EPI16 (|);
> +}
> +
> +#define _MM_REDUCE_OPERATOR_MAX_MIN_EP16(op) \
> +  __m128i __T1 = (__m128i)__builtin_shufflevector ((__v8hi)__V, \
> +		(__v8hi)__V, 4, 5, 6, 7, 4, 5, 6, 7); \
> +  __m128i __T2 = _mm_##op (__V, __T1); \
> +  __m128i __T3 = (__m128i)__builtin_shufflevector ((__v8hi)__T2, \
> +		(__v8hi)__T2, 2, 3, 2, 3, 4, 5, 6, 7); \
> +  __m128i __T4 = _mm_##op (__T2, __T3); \
> +  __m128i __T5 = (__m128i)__builtin_shufflevector ((__v8hi)__T4, \
> +		(__v8hi)__T4, 1, 1, 2, 3, 4, 5, 6, 7); \
> +  __v8hi __T6 = (__v8hi)_mm_##op (__T4, __T5); \
> +  return __T6[0]
> +
> +extern __inline short
> +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> +_mm_reduce_max_epi16 (__m128i __V)
> +{
> +  _MM_REDUCE_OPERATOR_MAX_MIN_EP16 (max_epi16);
> +}
> +
> +extern __inline unsigned short
> +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> +_mm_reduce_max_epu16 (__m128i __V)
> +{
> +  _MM_REDUCE_OPERATOR_MAX_MIN_EP16 (max_epu16);
> +}
> +
> +extern __inline short
> +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> +_mm_reduce_min_epi16 (__m128i __V)
> +{
> +  _MM_REDUCE_OPERATOR_MAX_MIN_EP16 (min_epi16);
> +}
> +
> +extern __inline unsigned short
> +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> +_mm_reduce_min_epu16 (__m128i __V)
> +{
> +  _MM_REDUCE_OPERATOR_MAX_MIN_EP16 (min_epu16);
> +}
> +
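The pattern above is the usual log2 halving reduction: each __builtin_shufflevector step copies the upper half of the still-active elements down over the lower half, the vector op then combines the pairs, and the active width shrinks from 4 to 2 to 1 (indices past the active half are don't-care). A scalar model of what _MM_REDUCE_OPERATOR_BASIC_EPI16 computes, with a hypothetical helper name and '+' standing in for the macro's op argument:

/* Scalar equivalent of the three shuffle/op steps above.  */
static short
reduce_epi16_model (const short v[8])
{
  short t[8];
  for (int i = 0; i < 8; i++)
    t[i] = v[i];
  /* Fold the upper half onto the lower half: width 4 -> 2 -> 1.  */
  for (int width = 4; width >= 1; width /= 2)
    for (int i = 0; i < width; i++)
      t[i] = t[i] + t[i + width];
  return t[0];
}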
> +#define _MM256_REDUCE_OPERATOR_BASIC_EPI16(op) \
> +  __v8hi __T1 = (__v8hi)_mm256_extracti128_si256 (__W, 0); \
> +  __v8hi __T2 = (__v8hi)_mm256_extracti128_si256 (__W, 1); \
> +  __v8hi __T3 = __T1 op __T2; \
> +  __v8hi __T4 = __builtin_shufflevector (__T3, __T3, 4, 5, 6, 7, 4, 5, 6, 7); \
> +  __v8hi __T5 = __T3 op __T4; \
> +  __v8hi __T6 = __builtin_shufflevector (__T5, __T5, 2, 3, 2, 3, 4, 5, 6, 7); \
> +  __v8hi __T7 = __T5 op __T6; \
> +  __v8hi __T8 = __builtin_shufflevector (__T7, __T7, 1, 1, 2, 3, 4, 5, 6, 7); \
> +  __v8hi __T9 = __T7 op __T8; \
> +  return __T9[0]
> +
> +extern __inline short
> +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> +_mm256_reduce_add_epi16 (__m256i __W)
> +{
> +  _MM256_REDUCE_OPERATOR_BASIC_EPI16 (+);
> +}
> +
> +extern __inline short
> +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> +_mm256_reduce_mul_epi16 (__m256i __W)
> +{
> +  _MM256_REDUCE_OPERATOR_BASIC_EPI16 (*);
> +}
> +
> +extern __inline short
> +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> +_mm256_reduce_and_epi16 (__m256i __W)
> +{
> +  _MM256_REDUCE_OPERATOR_BASIC_EPI16 (&);
> +}
> +
> +extern __inline short
> +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> +_mm256_reduce_or_epi16 (__m256i __W)
> +{
> +  _MM256_REDUCE_OPERATOR_BASIC_EPI16 (|);
> +}
> +
> +#define _MM256_REDUCE_OPERATOR_MAX_MIN_EP16(op) \
> +  __m128i __T1 = _mm256_extracti128_si256 (__V, 0); \
> +  __m128i __T2 = _mm256_extracti128_si256 (__V, 1); \
> +  __m128i __T3 = _mm_##op (__T1, __T2); \
> +  __m128i __T4 = (__m128i)__builtin_shufflevector ((__v8hi)__T3, \
> +		(__v8hi)__T3, 4, 5, 6, 7, 4, 5, 6, 7); \
> +  __m128i __T5 = _mm_##op (__T3, __T4); \
> +  __m128i __T6 = (__m128i)__builtin_shufflevector ((__v8hi)__T5, \
> +		(__v8hi)__T5, 2, 3, 2, 3, 4, 5, 6, 7); \
> +  __m128i __T7 = _mm_##op (__T5, __T6); \
> +  __m128i __T8 = (__m128i)__builtin_shufflevector ((__v8hi)__T7, \
> +		(__v8hi)__T7, 1, 1, 2, 3, 4, 5, 6, 7); \
> +  __v8hi __T9 = (__v8hi)_mm_##op (__T7, __T8); \
> +  return __T9[0]
> +
> +extern __inline short
> +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> +_mm256_reduce_max_epi16 (__m256i __V)
> +{
> +  _MM256_REDUCE_OPERATOR_MAX_MIN_EP16 (max_epi16);
> +}
> +
> +extern __inline unsigned short
> +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> +_mm256_reduce_max_epu16 (__m256i __V)
> +{
> +  _MM256_REDUCE_OPERATOR_MAX_MIN_EP16 (max_epu16);
> +}
> +
> +extern __inline short
> +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> +_mm256_reduce_min_epi16 (__m256i __V)
> +{
> +  _MM256_REDUCE_OPERATOR_MAX_MIN_EP16 (min_epi16);
> +}
> +
> +extern __inline unsigned short
> +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> +_mm256_reduce_min_epu16 (__m256i __V)
> +{
> +  _MM256_REDUCE_OPERATOR_MAX_MIN_EP16 (min_epu16);
> +}
> +
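Min and max have no C operator, so the *_MAX_MIN_* helpers paste the operation name onto _mm_ and run the same ladder through the corresponding intrinsic; the 256-bit forms additionally fold the two 128-bit halves first. In outline, _MM256_REDUCE_OPERATOR_MAX_MIN_EP16 (max_epu16) expands as below (the function name is hypothetical, and the 128-bit ladder is summarized by the patch's own _mm_reduce_max_epu16, which computes the same thing):

/* What _mm256_reduce_max_epu16 (__V) boils down to.  */
static unsigned short
reduce_max_epu16_outline (__m256i __V)
{
  __m128i __T1 = _mm256_extracti128_si256 (__V, 0);  /* low 128 bits */
  __m128i __T2 = _mm256_extracti128_si256 (__V, 1);  /* high 128 bits */
  __m128i __T3 = _mm_max_epu16 (__T1, __T2);         /* _mm_##op -> _mm_max_epu16 */
  /* ...then the same three shuffle/max steps as the 128-bit ladder: */
  return _mm_reduce_max_epu16 (__T3);
}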
> +#define _MM_REDUCE_OPERATOR_BASIC_EPI8(op) \
> +  __v16qi __T1 = (__v16qi)__W; \
> +  __v16qi __T2 = __builtin_shufflevector (__T1, __T1, \
> +		8, 9, 10, 11, 12, 13, 14, 15, 8, 9, 10, 11, 12, 13, 14, 15); \
> +  __v16qi __T3 = __T1 op __T2; \
> +  __v16qi __T4 = __builtin_shufflevector (__T3, __T3, \
> +		4, 5, 6, 7, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); \
> +  __v16qi __T5 = __T3 op __T4; \
> +  __v16qi __T6 = __builtin_shufflevector (__T5, __T5, \
> +		2, 3, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); \
> +  __v16qi __T7 = __T5 op __T6; \
> +  __v16qi __T8 = __builtin_shufflevector (__T7, __T7, \
> +		1, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); \
> +  __v16qi __T9 = __T7 op __T8; \
> +  return __T9[0]
> +
> +extern __inline char
> +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> +_mm_reduce_add_epi8 (__m128i __W)
> +{
> +  _MM_REDUCE_OPERATOR_BASIC_EPI8 (+);
> +}
> +
> +extern __inline char
> +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> +_mm_reduce_mul_epi8 (__m128i __W)
> +{
> +  _MM_REDUCE_OPERATOR_BASIC_EPI8 (*);
> +}
> +
> +extern __inline char
> +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> +_mm_reduce_and_epi8 (__m128i __W)
> +{
> +  _MM_REDUCE_OPERATOR_BASIC_EPI8 (&);
> +}
> +
> +extern __inline char
> +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> +_mm_reduce_or_epi8 (__m128i __W)
> +{
> +  _MM_REDUCE_OPERATOR_BASIC_EPI8 (|);
> +}
> +
> +#define _MM_REDUCE_OPERATOR_MAX_MIN_EP8(op) \
> +  __m128i __T1 = (__m128i)__builtin_shufflevector ((__v16qi)__V, (__v16qi)__V, \
> +		8, 9, 10, 11, 12, 13, 14, 15, 8, 9, 10, 11, 12, 13, 14, 15); \
> +  __m128i __T2 = _mm_##op (__V, __T1); \
> +  __m128i __T3 = (__m128i)__builtin_shufflevector ((__v16qi)__T2, \
> +		(__v16qi)__T2, \
> +		4, 5, 6, 7, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); \
> +  __m128i __T4 = _mm_##op (__T2, __T3); \
> +  __m128i __T5 = (__m128i)__builtin_shufflevector ((__v16qi)__T4, \
> +		(__v16qi)__T4, \
> +		2, 3, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); \
> +  __m128i __T6 = _mm_##op (__T4, __T5); \
> +  __m128i __T7 = (__m128i)__builtin_shufflevector ((__v16qi)__T6, \
> +		(__v16qi)__T6, \
> +		1, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); \
> +  __v16qi __T8 = (__v16qi)_mm_##op (__T6, __T7); \
> +  return __T8[0]
> +
> +extern __inline signed char
> +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> +_mm_reduce_max_epi8 (__m128i __V)
> +{
> +  _MM_REDUCE_OPERATOR_MAX_MIN_EP8 (max_epi8);
> +}
> +
> +extern __inline unsigned char
> +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> +_mm_reduce_max_epu8 (__m128i __V)
> +{
> +  _MM_REDUCE_OPERATOR_MAX_MIN_EP8 (max_epu8);
> +}
> +
> +extern __inline signed char
> +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> +_mm_reduce_min_epi8 (__m128i __V)
> +{
> +  _MM_REDUCE_OPERATOR_MAX_MIN_EP8 (min_epi8);
> +}
> +
> +extern __inline unsigned char
> +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> +_mm_reduce_min_epu8 (__m128i __V)
> +{
> +  _MM_REDUCE_OPERATOR_MAX_MIN_EP8 (min_epu8);
> +}
> +
> +#define _MM256_REDUCE_OPERATOR_BASIC_EPI8(op) \
> +  __v16qi __T1 = (__v16qi)_mm256_extracti128_si256 (__W, 0); \
> +  __v16qi __T2 = (__v16qi)_mm256_extracti128_si256 (__W, 1); \
> +  __v16qi __T3 = __T1 op __T2; \
> +  __v16qi __T4 = __builtin_shufflevector (__T3, __T3, \
> +		8, 9, 10, 11, 12, 13, 14, 15, 8, 9, 10, 11, 12, 13, 14, 15); \
> +  __v16qi __T5 = __T3 op __T4; \
> +  __v16qi __T6 = __builtin_shufflevector (__T5, __T5, \
> +		4, 5, 6, 7, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); \
> +  __v16qi __T7 = __T5 op __T6; \
> +  __v16qi __T8 = __builtin_shufflevector (__T7, __T7, \
> +		2, 3, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); \
> +  __v16qi __T9 = __T7 op __T8; \
> +  __v16qi __T10 = __builtin_shufflevector (__T9, __T9, \
> +		1, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); \
> +  __v16qi __T11 = __T9 op __T10; \
> +  return __T11[0]
> +
> +extern __inline char
> +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> +_mm256_reduce_add_epi8 (__m256i __W)
> +{
> +  _MM256_REDUCE_OPERATOR_BASIC_EPI8 (+);
> +}
> +
> +extern __inline char
> +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> +_mm256_reduce_mul_epi8 (__m256i __W)
> +{
> +  _MM256_REDUCE_OPERATOR_BASIC_EPI8 (*);
> +}
> +
> +extern __inline char
> +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> +_mm256_reduce_and_epi8 (__m256i __W)
> +{
> +  _MM256_REDUCE_OPERATOR_BASIC_EPI8 (&);
> +}
> +
> +extern __inline char
> +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> +_mm256_reduce_or_epi8 (__m256i __W)
> +{
> +  _MM256_REDUCE_OPERATOR_BASIC_EPI8 (|);
> +}
> +
> +#define _MM256_REDUCE_OPERATOR_MAX_MIN_EP8(op) \
> +  __m128i __T1 = _mm256_extracti128_si256 (__V, 0); \
> +  __m128i __T2 = _mm256_extracti128_si256 (__V, 1); \
> +  __m128i __T3 = _mm_##op (__T1, __T2); \
> +  __m128i __T4 = (__m128i)__builtin_shufflevector ((__v16qi)__T3, \
> +		(__v16qi)__T3, \
> +		8, 9, 10, 11, 12, 13, 14, 15, 8, 9, 10, 11, 12, 13, 14, 15); \
> +  __m128i __T5 = _mm_##op (__T3, __T4); \
> +  __m128i __T6 = (__m128i)__builtin_shufflevector ((__v16qi)__T5, \
> +		(__v16qi)__T5, \
> +		4, 5, 6, 7, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); \
> +  __m128i __T7 = _mm_##op (__T5, __T6); \
> +  __m128i __T8 = (__m128i)__builtin_shufflevector ((__v16qi)__T7, \
> +		(__v16qi)__T7, \
> +		2, 3, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); \
> +  __m128i __T9 = _mm_##op (__T7, __T8); \
> +  __m128i __T10 = (__m128i)__builtin_shufflevector ((__v16qi)__T9, \
> +		(__v16qi)__T9, \
> +		1, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); \
> +  __v16qi __T11 = (__v16qi)_mm_##op (__T9, __T10); \
> +  return __T11[0]
> +
> +extern __inline signed char
> +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> +_mm256_reduce_max_epi8 (__m256i __V)
> +{
> +  _MM256_REDUCE_OPERATOR_MAX_MIN_EP8 (max_epi8);
> +}
> +
> +extern __inline unsigned char
> +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> +_mm256_reduce_max_epu8 (__m256i __V)
> +{
> +  _MM256_REDUCE_OPERATOR_MAX_MIN_EP8 (max_epu8);
> +}
> +
> +extern __inline signed char
> +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> +_mm256_reduce_min_epi8 (__m256i __V)
> +{
> +  _MM256_REDUCE_OPERATOR_MAX_MIN_EP8 (min_epi8);
> +}
> +
> +extern __inline unsigned char
> +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> +_mm256_reduce_min_epu8 (__m256i __V)
> +{
> +  _MM256_REDUCE_OPERATOR_MAX_MIN_EP8 (min_epu8);
> +}
> +
>  #ifdef __DISABLE_AVX2__
>  #undef __DISABLE_AVX2__
>  #pragma GCC pop_options
> diff --git a/gcc/config/i386/avx512vlbwintrin.h b/gcc/config/i386/avx512vlbwintrin.h
> index 0232783a362..bf38ef6247d 100644
> --- a/gcc/config/i386/avx512vlbwintrin.h
> +++ b/gcc/config/i386/avx512vlbwintrin.h
> @@ -4750,6 +4750,262 @@ _mm256_mask_cmple_epi16_mask (__mmask16 __M, __m256i __X, __m256i __Y)
>  						(__mmask16) __M);
>  }
>
> +extern __inline short
> +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> +_mm_mask_reduce_add_epi16 (__mmask8 __M, __m128i __W)
> +{
> +  __W = _mm_maskz_mov_epi16 (__M, __W);
> +  _MM_REDUCE_OPERATOR_BASIC_EPI16 (+);
> +}
> +
> +extern __inline short
> +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> +_mm_mask_reduce_mul_epi16 (__mmask8 __M, __m128i __W)
> +{
> +  __W = _mm_mask_mov_epi16 (_mm_set1_epi16 (1), __M, __W);
> +  _MM_REDUCE_OPERATOR_BASIC_EPI16 (*);
> +}
> +
> +extern __inline short
> +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> +_mm_mask_reduce_and_epi16 (__mmask8 __M, __m128i __W)
> +{
> +  __W = _mm_mask_mov_epi16 (_mm_set1_epi16 (-1), __M, __W);
> +  _MM_REDUCE_OPERATOR_BASIC_EPI16 (&);
> +}
> +
> +extern __inline short
> +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> +_mm_mask_reduce_or_epi16 (__mmask8 __M, __m128i __W)
> +{
> +  __W = _mm_maskz_mov_epi16 (__M, __W);
> +  _MM_REDUCE_OPERATOR_BASIC_EPI16 (|);
> +}
> +
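The masked forms all follow one recipe: replace the lanes cleared in __M with the operation's identity element (0 for add/or, 1 for mul, all-ones for and, the type's minimum for max and maximum for min) via a mask or maskz move, then fall through to the unmasked ladder, so masked-out lanes cannot affect the result. A scalar model of _mm_mask_reduce_mul_epi16 (hypothetical helper, for illustration only):

/* Inactive lanes become the multiplicative identity 1 before reducing.  */
static short
mask_reduce_mul_epi16_model (unsigned char m, const short v[8])
{
  short r = 1;
  for (int i = 0; i < 8; i++)
    r *= (m & (1 << i)) ? v[i] : 1;
  return r;
}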
> +extern __inline short
> +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> +_mm_mask_reduce_max_epi16 (__mmask8 __M, __m128i __V)
> +{
> +  __V = _mm_mask_mov_epi16 (_mm_set1_epi16 (-32767-1), __M, __V);
> +  _MM_REDUCE_OPERATOR_MAX_MIN_EP16 (max_epi16);
> +}
> +
> +extern __inline unsigned short
> +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> +_mm_mask_reduce_max_epu16 (__mmask8 __M, __m128i __V)
> +{
> +  __V = _mm_maskz_mov_epi16 (__M, __V);
> +  _MM_REDUCE_OPERATOR_MAX_MIN_EP16 (max_epu16);
> +}
> +
> +extern __inline short
> +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> +_mm_mask_reduce_min_epi16 (__mmask8 __M, __m128i __V)
> +{
> +  __V = _mm_mask_mov_epi16 (_mm_set1_epi16 (32767), __M, __V);
> +  _MM_REDUCE_OPERATOR_MAX_MIN_EP16 (min_epi16);
> +}
> +
> +extern __inline unsigned short
> +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> +_mm_mask_reduce_min_epu16 (__mmask8 __M, __m128i __V)
> +{
> +  __V = _mm_mask_mov_epi16 (_mm_set1_epi16 (-1), __M, __V);
> +  _MM_REDUCE_OPERATOR_MAX_MIN_EP16 (min_epu16);
> +}
> +
> +extern __inline short
> +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> +_mm256_mask_reduce_add_epi16 (__mmask16 __M, __m256i __W)
> +{
> +  __W = _mm256_maskz_mov_epi16 (__M, __W);
> +  _MM256_REDUCE_OPERATOR_BASIC_EPI16 (+);
> +}
> +
> +extern __inline short
> +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> +_mm256_mask_reduce_mul_epi16 (__mmask16 __M, __m256i __W)
> +{
> +  __W = _mm256_mask_mov_epi16 (_mm256_set1_epi16 (1), __M, __W);
> +  _MM256_REDUCE_OPERATOR_BASIC_EPI16 (*);
> +}
> +
> +extern __inline short
> +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> +_mm256_mask_reduce_and_epi16 (__mmask16 __M, __m256i __W)
> +{
> +  __W = _mm256_mask_mov_epi16 (_mm256_set1_epi16 (-1), __M, __W);
> +  _MM256_REDUCE_OPERATOR_BASIC_EPI16 (&);
> +}
> +
> +extern __inline short
> +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> +_mm256_mask_reduce_or_epi16 (__mmask16 __M, __m256i __W)
> +{
> +  __W = _mm256_maskz_mov_epi16 (__M, __W);
> +  _MM256_REDUCE_OPERATOR_BASIC_EPI16 (|);
> +}
> +
> +extern __inline short
> +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> +_mm256_mask_reduce_max_epi16 (__mmask16 __M, __m256i __V)
> +{
> +  __V = _mm256_mask_mov_epi16 (_mm256_set1_epi16 (-32767-1), __M, __V);
> +  _MM256_REDUCE_OPERATOR_MAX_MIN_EP16 (max_epi16);
> +}
> +
> +extern __inline unsigned short
> +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> +_mm256_mask_reduce_max_epu16 (__mmask16 __M, __m256i __V)
> +{
> +  __V = _mm256_maskz_mov_epi16 (__M, __V);
> +  _MM256_REDUCE_OPERATOR_MAX_MIN_EP16 (max_epu16);
> +}
> +
> +extern __inline short
> +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> +_mm256_mask_reduce_min_epi16 (__mmask16 __M, __m256i __V)
> +{
> +  __V = _mm256_mask_mov_epi16 (_mm256_set1_epi16 (32767), __M, __V);
> +  _MM256_REDUCE_OPERATOR_MAX_MIN_EP16 (min_epi16);
> +}
> +
> +extern __inline unsigned short
> +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> +_mm256_mask_reduce_min_epu16 (__mmask16 __M, __m256i __V)
> +{
> +  __V = _mm256_mask_mov_epi16 (_mm256_set1_epi16 (-1), __M, __V);
> +  _MM256_REDUCE_OPERATOR_MAX_MIN_EP16 (min_epu16);
> +}
> +
> +extern __inline char
> +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> +_mm_mask_reduce_add_epi8 (__mmask16 __M, __m128i __W)
> +{
> +  __W = _mm_maskz_mov_epi8 (__M, __W);
> +  _MM_REDUCE_OPERATOR_BASIC_EPI8 (+);
> +}
> +
> +extern __inline char
> +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> +_mm_mask_reduce_mul_epi8 (__mmask16 __M, __m128i __W)
> +{
> +  __W = _mm_mask_mov_epi8 (_mm_set1_epi8 (1), __M, __W);
> +  _MM_REDUCE_OPERATOR_BASIC_EPI8 (*);
> +}
> +
> +extern __inline char
> +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> +_mm_mask_reduce_and_epi8 (__mmask16 __M, __m128i __W)
> +{
> +  __W = _mm_mask_mov_epi8 (_mm_set1_epi8 (-1), __M, __W);
> +  _MM_REDUCE_OPERATOR_BASIC_EPI8 (&);
> +}
> +
> +extern __inline char
> +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> +_mm_mask_reduce_or_epi8 (__mmask16 __M, __m128i __W)
> +{
> +  __W = _mm_maskz_mov_epi8 (__M, __W);
> +  _MM_REDUCE_OPERATOR_BASIC_EPI8 (|);
> +}
> +
> +extern __inline signed char
> +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> +_mm_mask_reduce_max_epi8 (__mmask16 __M, __m128i __V)
> +{
> +  __V = _mm_mask_mov_epi8 (_mm_set1_epi8 (-127-1), __M, __V);
> +  _MM_REDUCE_OPERATOR_MAX_MIN_EP8 (max_epi8);
> +}
> +
> +extern __inline unsigned char
> +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> +_mm_mask_reduce_max_epu8 (__mmask16 __M, __m128i __V)
> +{
> +  __V = _mm_maskz_mov_epi8 (__M, __V);
> +  _MM_REDUCE_OPERATOR_MAX_MIN_EP8 (max_epu8);
> +}
> +
> +extern __inline signed char
> +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> +_mm_mask_reduce_min_epi8 (__mmask16 __M, __m128i __V)
> +{
> +  __V = _mm_mask_mov_epi8 (_mm_set1_epi8 (127), __M, __V);
> +  _MM_REDUCE_OPERATOR_MAX_MIN_EP8 (min_epi8);
> +}
> +
> +extern __inline unsigned char
> +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> +_mm_mask_reduce_min_epu8 (__mmask16 __M, __m128i __V)
> +{
> +  __V = _mm_mask_mov_epi8 (_mm_set1_epi8 (-1), __M, __V);
> +  _MM_REDUCE_OPERATOR_MAX_MIN_EP8 (min_epu8);
> +}
> +
> +extern __inline char
> +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> +_mm256_mask_reduce_add_epi8 (__mmask32 __M, __m256i __W)
> +{
> +  __W = _mm256_maskz_mov_epi8 (__M, __W);
> +  _MM256_REDUCE_OPERATOR_BASIC_EPI8 (+);
> +}
> +
> +extern __inline char
> +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> +_mm256_mask_reduce_mul_epi8 (__mmask32 __M, __m256i __W)
> +{
> +  __W = _mm256_mask_mov_epi8 (_mm256_set1_epi8 (1), __M, __W);
> +  _MM256_REDUCE_OPERATOR_BASIC_EPI8 (*);
> +}
> +
> +extern __inline char
> +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> +_mm256_mask_reduce_and_epi8 (__mmask32 __M, __m256i __W)
> +{
> +  __W = _mm256_mask_mov_epi8 (_mm256_set1_epi8 (-1), __M, __W);
> +  _MM256_REDUCE_OPERATOR_BASIC_EPI8 (&);
> +}
> +
> +extern __inline char
> +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> +_mm256_mask_reduce_or_epi8 (__mmask32 __M, __m256i __W)
> +{
> +  __W = _mm256_maskz_mov_epi8 (__M, __W);
> +  _MM256_REDUCE_OPERATOR_BASIC_EPI8 (|);
> +}
> +
> +extern __inline signed char
> +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> +_mm256_mask_reduce_max_epi8 (__mmask32 __M, __m256i __V)
> +{
> +  __V = _mm256_mask_mov_epi8 (_mm256_set1_epi8 (-127-1), __M, __V);
> +  _MM256_REDUCE_OPERATOR_MAX_MIN_EP8 (max_epi8);
> +}
> +
> +extern __inline unsigned char
> +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> +_mm256_mask_reduce_max_epu8 (__mmask32 __M, __m256i __V)
> +{
> +  __V = _mm256_maskz_mov_epi8 (__M, __V);
> +  _MM256_REDUCE_OPERATOR_MAX_MIN_EP8 (max_epu8);
> +}
> +
> +extern __inline signed char
> +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> +_mm256_mask_reduce_min_epi8 (__mmask32 __M, __m256i __V)
> +{
> +  __V = _mm256_mask_mov_epi8 (_mm256_set1_epi8 (127), __M, __V);
> +  _MM256_REDUCE_OPERATOR_MAX_MIN_EP8 (min_epi8);
> +}
> +
> +extern __inline unsigned char
> +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> +_mm256_mask_reduce_min_epu8 (__mmask32 __M, __m256i __V)
> +{
> +  __V = _mm256_mask_mov_epi8 (_mm256_set1_epi8 (-1), __M, __V);
> +  _MM256_REDUCE_OPERATOR_MAX_MIN_EP8 (min_epu8);
> +}
> +
>  #ifdef __DISABLE_AVX512VLBW__
>  #undef __DISABLE_AVX512VLBW__
>  #pragma GCC pop_options
> diff --git a/gcc/testsuite/gcc.target/i386/avx512vlbw-reduce-op-1.c b/gcc/testsuite/gcc.target/i386/avx512vlbw-reduce-op-1.c
> new file mode 100644
> index 00000000000..146ef6bf8da
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/avx512vlbw-reduce-op-1.c
> @@ -0,0 +1,206 @@
> +/* { dg-do run } */
> +/* { dg-options "-O2 -mavx512bw -mavx512vl" } */
> +/* { dg-require-effective-target avx512bw } */
> +/* { dg-require-effective-target avx512vl } */
> +
> +#define AVX512BW
> +#define AVX512VL
> +
> +#include "avx512f-helper.h"
> +
> +#define FUNC_TEST_REDUCE_BASIC(opname) \
> +  FUNC_TEST_REDUCE_OP (, short, epi16, opname, __m128i, __mmask8) \
> +  FUNC_TEST_REDUCE_OP (256, short, epi16, opname, __m256i, __mmask16) \
> +  FUNC_TEST_REDUCE_OP (, char, epi8, opname, __m128i, __mmask16) \
> +  FUNC_TEST_REDUCE_OP (256, char, epi8, opname, __m256i, __mmask32)
> +
> +#define FUNC_TEST_REDUCE_MAX_MIN(opname) \
> +  FUNC_TEST_REDUCE_OP (, short, epi16, opname, __m128i, __mmask8) \
> +  FUNC_TEST_REDUCE_OP (256, short, epi16, opname, __m256i, __mmask16) \
> +  FUNC_TEST_REDUCE_OP (, char, epi8, opname, __m128i, __mmask16) \
> +  FUNC_TEST_REDUCE_OP (256, char, epi8, opname, __m256i, __mmask32) \
> +  FUNC_TEST_REDUCE_OP (, unsigned short, epu16, opname, __m128i, __mmask8) \
> +  FUNC_TEST_REDUCE_OP (256, unsigned short, epu16, \
> +		       opname, __m256i, __mmask16) \
> +  FUNC_TEST_REDUCE_OP (, unsigned char, epu8, opname, __m128i, __mmask16) \
> +  FUNC_TEST_REDUCE_OP (256, unsigned char, epu8, opname, __m256i, __mmask32)
> +
> +#define FUNC_TEST_REDUCE_OP(len, rtype, type, opname, argtype, masktype) \
> +  __attribute__((noinline, noclone)) rtype \
> +  test_##len##_reduce_##opname##_##type (argtype a) \
> +  { \
> +    return _mm##len##_reduce_##opname##_##type (a); \
> +  } \
> +  __attribute__((noinline, noclone)) rtype \
> +  test_##len##_mask_reduce_##opname##_##type (masktype u, argtype a) \
> +  { \
> +    return _mm##len##_mask_reduce_##opname##_##type (u, a); \
> +  }
> +
> +FUNC_TEST_REDUCE_BASIC (add)
> +FUNC_TEST_REDUCE_BASIC (mul)
> +FUNC_TEST_REDUCE_BASIC (and)
> +FUNC_TEST_REDUCE_BASIC (or)
> +FUNC_TEST_REDUCE_MAX_MIN (max)
> +FUNC_TEST_REDUCE_MAX_MIN (min)
> +
> +#define TESTOP(len, opname, op, type, suffix, neutral) \
> +  do { \
> +    type r1 = _mm##len##_reduce_##opname##_##suffix (v.x); \
> +    type r2 = test_##len##_reduce_##opname##_##suffix (v.x); \
> +    type r3 = neutral; \
> +    if (r1 != r2) \
> +      __builtin_abort (); \
> +    for (int i = 0; i < SIZE; i++) \
> +      r3 = r3 op v.a[i]; \
> +    if (r1 != r3) \
> +      __builtin_abort (); \
> +    type r4 = _mm##len##_mask_reduce_##opname##_##suffix (MASK_VALUE, v.x); \
> +    type r5 = test_##len##_mask_reduce_##opname##_##suffix (MASK_VALUE, v.x); \
> +    if (r4 != r5) \
> +      __builtin_abort (); \
> +    r3 = neutral; \
> +    for (int i = 0; i < SIZE; i++) \
> +      if (MASK_VALUE & (1 << i)) \
> +	r3 = r3 op v.a[i]; \
> +    if (r4 != r3) \
> +      __builtin_abort (); \
> +    type r6 = _mm##len##_mask_reduce_##opname##_##suffix (0, v.x); \
> +    type r7 = test_##len##_mask_reduce_##opname##_##suffix (0, v.x); \
> +    if (r6 != r7 || r6 != neutral) \
> +      __builtin_abort (); \
> +  } while (0)
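Note how TESTOP builds its scalar reference value r3: for add/mul/and/or the op argument is a plain operator, while for min/max the callers below pass a conditional-expression fragment, so the accumulation line r3 = r3 op v.a[i]; expands to a select. For example, TESTOP (, min, < v.a[i] ? r3 :, char, epi8, __SCHAR_MAX__) turns that line into:

r3 = r3 < v.a[i] ? r3 : v.a[i];   /* i.e. r3 = min (r3, v.a[i]) */

This keeps one TESTOP definition covering all six operations, at the cost of an op argument that is not a complete token on its own.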
> +
> +#undef AVX512F_LEN
> +#define AVX512F_LEN 128
> +
> +#undef SIZE
> +#define SIZE (AVX512F_LEN / 8)
> +#include "avx512f-mask-type.h"
> +
> +#define TEST_128_EPI8(c1, c2, c3, c4, c5, c6, c7, c8, \
> +		      c9, c10, c11, c12, c13, c14, c15, c16) \
> +  do { \
> +    UNION_TYPE (AVX512F_LEN, i_b) v; \
> +    v.x = _mm_set_epi8 (c1, c2, c3, c4, c5, c6, c7, c8, \
> +			c9, c10, c11, c12, c13, c14, c15, c16); \
> +    TESTOP (, add, +, char, epi8, 0); \
> +    TESTOP (, mul, *, char, epi8, 1); \
> +    TESTOP (, and, &, char, epi8, (char) ~0); \
> +    TESTOP (, or, |, char, epi8, 0); \
> +    TESTOP (, min, < v.a[i] ? r3 :, char, epi8, __SCHAR_MAX__); \
> +    TESTOP (, max, > v.a[i] ? r3 :, char, epi8, -__SCHAR_MAX__ - 1); \
> +    TESTOP (, min, < (unsigned char) v.a[i] ? r3 :, unsigned char, epu8, \
> +	    (unsigned char) ~0U); \
> +    TESTOP (, max, > (unsigned char) v.a[i] ? r3 :, unsigned char, epu8, 0); \
> +  } while (0)
> +
> +static void
> +test_128_epi8 (void)
> +{
> +  TEST_128_EPI8 (1, 2, 3, 4, 5, 6, 6, 5, 4, 3, 2, 1, 7, 6, 5, 4);
> +  TEST_128_EPI8 (-1, 15, -1, 7, -1, 7, -1, -1, 6, 6, -1, -1, -1, -1, 7, 6);
> +}
> +
> +#undef SIZE
> +#define SIZE (AVX512F_LEN / 16)
> +#include "avx512f-mask-type.h"
> +
> +#define TEST_128_EPI16(c1, c2, c3, c4, c5, c6, c7, c8) \
> +  do { \
> +    UNION_TYPE (AVX512F_LEN, i_w) v; \
> +    v.x = _mm_set_epi16 (c1, c2, c3, c4, c5, c6, c7, c8); \
> +    TESTOP (, add, +, short, epi16, 0); \
> +    TESTOP (, mul, *, short, epi16, 1); \
> +    TESTOP (, and, &, short, epi16, (short) ~0); \
> +    TESTOP (, or, |, short, epi16, 0); \
> +    TESTOP (, min, < v.a[i] ? r3 :, short, epi16, __SHRT_MAX__); \
> +    TESTOP (, max, > v.a[i] ? r3 :, short, epi16, -__SHRT_MAX__ - 1); \
> +    TESTOP (, min, < (unsigned short) v.a[i] ? r3 :, unsigned short, epu16, \
> +	    (unsigned short) ~0U); \
> +    TESTOP (, max, > (unsigned short) v.a[i] ? r3 :, unsigned short, epu16, 0); \
> +  } while (0)
> +
> +static void
> +test_128_epi16 (void)
> +{
> +  TEST_128_EPI16 (1, 2, 3, 4, 5, 6, 6, 5);
> +  TEST_128_EPI16 (-1, 15, -1, 7, -1, 7, -1, -1);
> +}
> +
> +void
> +test_128 (void)
> +{
> +  test_128_epi8 ();
> +  test_128_epi16 ();
> +}
> +
> +#undef AVX512F_LEN
> +#define AVX512F_LEN 256
> +
> +#undef SIZE
> +#define SIZE (AVX512F_LEN / 8)
> +#include "avx512f-mask-type.h"
> +
> +#define TEST_256_EPI8(c1, c2, c3, c4, c5, c6, c7, c8, \
> +		      c9, c10, c11, c12, c13, c14, c15, c16, \
> +		      c17, c18, c19, c20, c21, c22, c23, c24, \
> +		      c25, c26, c27, c28, c29, c30, c31, c32) \
> +  do { \
> +    UNION_TYPE (AVX512F_LEN, i_b) v; \
> +    v.x = _mm256_set_epi8 (c1, c2, c3, c4, c5, c6, c7, c8, \
> +			   c9, c10, c11, c12, c13, c14, c15, c16, \
> +			   c17, c18, c19, c20, c21, c22, c23, c24, \
> +			   c25, c26, c27, c28, c29, c30, c31, c32); \
> +    TESTOP (256, add, +, char, epi8, 0); \
> +    TESTOP (256, mul, *, char, epi8, 1); \
> +    TESTOP (256, and, &, char, epi8, (char) ~0); \
> +    TESTOP (256, or, |, char, epi8, 0); \
> +    TESTOP (256, min, < v.a[i] ? r3 :, char, epi8, __SCHAR_MAX__); \
> +    TESTOP (256, max, > v.a[i] ? r3 :, char, epi8, -__SCHAR_MAX__ - 1); \
> +    TESTOP (256, min, < (unsigned char) v.a[i] ? r3 :, \
> +	    unsigned char, epu8, (unsigned char) ~0U); \
> +    TESTOP (256, max, > (unsigned char) v.a[i] ? r3 :, \
> +	    unsigned char, epu8, 0); \
> +  } while (0)
> +
> +static void
> +test_256_epi8 (void)
> +{
> +  TEST_256_EPI8 (1, 2, 3, 4, 5, 6, 7, 8, 9, 11, 12, 13, 12, 11, 10, 9,
> +		 9, 7, 6, 5, 4, 3, 2, 1, 7, 6, 5, 4, 7, 10, 11, 12);
> +  TEST_256_EPI8 (-1, 15, -1, 7, -1, 7, -1, -1, 6, 6, -1, -1, -1, -1, 7, 6,
> +		 -1, 30, -1, 28, -1, 26, -1, 24, -1, 22, -1, -1, -1, -1, 17, 16);
> +}
> +
> +#undef SIZE
> +#define SIZE (AVX512F_LEN / 16)
> +#include "avx512f-mask-type.h"
> +
> +#define TEST_256_EPI16(c1, c2, c3, c4, c5, c6, c7, c8, \
> +		       c9, c10, c11, c12, c13, c14, c15, c16) \
> +  do { \
> +    UNION_TYPE (AVX512F_LEN, i_w) v; \
> +    v.x = _mm256_set_epi16 (c1, c2, c3, c4, c5, c6, c7, c8, \
> +			    c9, c10, c11, c12, c13, c14, c15, c16); \
> +    TESTOP (256, add, +, short, epi16, 0); \
> +    TESTOP (256, mul, *, short, epi16, 1); \
> +    TESTOP (256, and, &, short, epi16, (short) ~0); \
> +    TESTOP (256, or, |, short, epi16, 0); \
> +    TESTOP (256, min, < v.a[i] ? r3 :, short, epi16, __SHRT_MAX__); \
> +    TESTOP (256, max, > v.a[i] ? r3 :, short, epi16, -__SHRT_MAX__ - 1); \
> +    TESTOP (256, min, < (unsigned short) v.a[i] ? r3 :, \
> +	    unsigned short, epu16, (unsigned short) ~0U); \
> +    TESTOP (256, max, > (unsigned short) v.a[i] ? r3 :, \
> +	    unsigned short, epu16, 0); \
> +  } while (0)
> +
> +static void
> +test_256_epi16 (void)
> +{
> +  TEST_256_EPI16 (9, 7, 6, 5, 4, 3, 2, 1, 7, 6, 5, 4, 7, 10, 11, 12);
> +  TEST_256_EPI16 (-1, 15, -1, 7, -1, 7, -1, -1, 6, 6, -1, -1, -1, -1, 7, 6);
> +}
> +
> +void
> +test_256 (void)
> +{
> +  test_256_epi8 ();
> +  test_256_epi16 ();
> +}
> --
> 2.31.1
>
--
BR,
Hongtao