Hi all,

This patch adds support for the reduce_*_ep[i|u][8|16] series of intrinsics. It has been tested on x86_64-pc-linux-gnu. OK for trunk?
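For reference, a small self-contained usage example of the new plain and masked reductions (illustrative only, not part of the patch; the expected values in the comments are hand-computed). The plain forms live in avx2intrin.h, the masked forms in avx512vlbwintrin.h, so compile with e.g. -mavx512bw -mavx512vl:

/* reduce-example.c: sketch of how the new intrinsics are used.  */
#include <immintrin.h>
#include <stdio.h>

int
main (void)
{
  /* Lanes 0..7 hold 1, 2, ..., 8 (_mm_set_epi16 takes the high lane first).  */
  __m128i v = _mm_set_epi16 (8, 7, 6, 5, 4, 3, 2, 1);

  /* Horizontal sum of all eight 16-bit lanes: 1 + 2 + ... + 8 = 36.  */
  short sum = _mm_reduce_add_epi16 (v);

  /* Masked maximum: only lanes selected by the mask participate;
     masked-out lanes take the identity value (SHRT_MIN for signed max).
     Mask 0x0f selects lanes 0..3, i.e. values 1, 2, 3, 4, so the result is 4.  */
  short max = _mm_mask_reduce_max_epi16 (0x0f, v);

  printf ("sum = %d, masked max = %d\n", sum, max);
  return 0;
}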
BRs,
Lin

gcc/ChangeLog:

        * config/i386/avx2intrin.h (_MM_REDUCE_OPERATOR_BASIC_EPI16): New macro.
        (_MM_REDUCE_OPERATOR_MAX_MIN_EP16): Ditto.
        (_MM256_REDUCE_OPERATOR_BASIC_EPI16): Ditto.
        (_MM256_REDUCE_OPERATOR_MAX_MIN_EP16): Ditto.
        (_MM_REDUCE_OPERATOR_BASIC_EPI8): Ditto.
        (_MM_REDUCE_OPERATOR_MAX_MIN_EP8): Ditto.
        (_MM256_REDUCE_OPERATOR_BASIC_EPI8): Ditto.
        (_MM256_REDUCE_OPERATOR_MAX_MIN_EP8): Ditto.
        (_mm_reduce_add_epi16): New intrinsic.
        (_mm_reduce_mul_epi16): Ditto.
        (_mm_reduce_and_epi16): Ditto.
        (_mm_reduce_or_epi16): Ditto.
        (_mm_reduce_max_epi16): Ditto.
        (_mm_reduce_max_epu16): Ditto.
        (_mm_reduce_min_epi16): Ditto.
        (_mm_reduce_min_epu16): Ditto.
        (_mm256_reduce_add_epi16): Ditto.
        (_mm256_reduce_mul_epi16): Ditto.
        (_mm256_reduce_and_epi16): Ditto.
        (_mm256_reduce_or_epi16): Ditto.
        (_mm256_reduce_max_epi16): Ditto.
        (_mm256_reduce_max_epu16): Ditto.
        (_mm256_reduce_min_epi16): Ditto.
        (_mm256_reduce_min_epu16): Ditto.
        (_mm_reduce_add_epi8): Ditto.
        (_mm_reduce_mul_epi8): Ditto.
        (_mm_reduce_and_epi8): Ditto.
        (_mm_reduce_or_epi8): Ditto.
        (_mm_reduce_max_epi8): Ditto.
        (_mm_reduce_max_epu8): Ditto.
        (_mm_reduce_min_epi8): Ditto.
        (_mm_reduce_min_epu8): Ditto.
        (_mm256_reduce_add_epi8): Ditto.
        (_mm256_reduce_mul_epi8): Ditto.
        (_mm256_reduce_and_epi8): Ditto.
        (_mm256_reduce_or_epi8): Ditto.
        (_mm256_reduce_max_epi8): Ditto.
        (_mm256_reduce_max_epu8): Ditto.
        (_mm256_reduce_min_epi8): Ditto.
        (_mm256_reduce_min_epu8): Ditto.
        * config/i386/avx512vlbwintrin.h (_mm_mask_reduce_add_epi16): Ditto.
        (_mm_mask_reduce_mul_epi16): Ditto.
        (_mm_mask_reduce_and_epi16): Ditto.
        (_mm_mask_reduce_or_epi16): Ditto.
        (_mm_mask_reduce_max_epi16): Ditto.
        (_mm_mask_reduce_max_epu16): Ditto.
        (_mm_mask_reduce_min_epi16): Ditto.
        (_mm_mask_reduce_min_epu16): Ditto.
        (_mm256_mask_reduce_add_epi16): Ditto.
        (_mm256_mask_reduce_mul_epi16): Ditto.
        (_mm256_mask_reduce_and_epi16): Ditto.
        (_mm256_mask_reduce_or_epi16): Ditto.
        (_mm256_mask_reduce_max_epi16): Ditto.
        (_mm256_mask_reduce_max_epu16): Ditto.
        (_mm256_mask_reduce_min_epi16): Ditto.
        (_mm256_mask_reduce_min_epu16): Ditto.
        (_mm_mask_reduce_add_epi8): Ditto.
        (_mm_mask_reduce_mul_epi8): Ditto.
        (_mm_mask_reduce_and_epi8): Ditto.
        (_mm_mask_reduce_or_epi8): Ditto.
        (_mm_mask_reduce_max_epi8): Ditto.
        (_mm_mask_reduce_max_epu8): Ditto.
        (_mm_mask_reduce_min_epi8): Ditto.
        (_mm_mask_reduce_min_epu8): Ditto.
        (_mm256_mask_reduce_add_epi8): Ditto.
        (_mm256_mask_reduce_mul_epi8): Ditto.
        (_mm256_mask_reduce_and_epi8): Ditto.
        (_mm256_mask_reduce_or_epi8): Ditto.
        (_mm256_mask_reduce_max_epi8): Ditto.
        (_mm256_mask_reduce_max_epu8): Ditto.
        (_mm256_mask_reduce_min_epi8): Ditto.
        (_mm256_mask_reduce_min_epu8): Ditto.

gcc/testsuite/ChangeLog:

        * gcc.target/i386/avx512vlbw-reduce-op-1.c: New test.
--- gcc/config/i386/avx2intrin.h | 347 ++++++++++++++++++ gcc/config/i386/avx512vlbwintrin.h | 256 +++++++++++++ .../gcc.target/i386/avx512vlbw-reduce-op-1.c | 206 +++++++++++ 3 files changed, 809 insertions(+) create mode 100644 gcc/testsuite/gcc.target/i386/avx512vlbw-reduce-op-1.c diff --git a/gcc/config/i386/avx2intrin.h b/gcc/config/i386/avx2intrin.h index 1b9c8169a96..9b8c13b7233 100644 --- a/gcc/config/i386/avx2intrin.h +++ b/gcc/config/i386/avx2intrin.h @@ -1915,6 +1915,353 @@ _mm256_mask_i64gather_epi32 (__m128i __src, int const *__base, (int) (SCALE)) #endif /* __OPTIMIZE__ */ +#define _MM_REDUCE_OPERATOR_BASIC_EPI16(op) \ + __v8hi __T1 = (__v8hi)__W; \ + __v8hi __T2 = __builtin_shufflevector (__T1, __T1, 4, 5, 6, 7, 4, 5, 6, 7); \ + __v8hi __T3 = __T1 op __T2; \ + __v8hi __T4 = __builtin_shufflevector (__T3, __T3, 2, 3, 2, 3, 4, 5, 6, 7); \ + __v8hi __T5 = __T3 op __T4; \ + __v8hi __T6 = __builtin_shufflevector (__T5, __T5, 1, 1, 2, 3, 4, 5, 6, 7); \ + __v8hi __T7 = __T5 op __T6; \ + return __T7[0] + +extern __inline short +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_reduce_add_epi16 (__m128i __W) +{ + _MM_REDUCE_OPERATOR_BASIC_EPI16 (+); +} + +extern __inline short +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_reduce_mul_epi16 (__m128i __W) +{ + _MM_REDUCE_OPERATOR_BASIC_EPI16 (*); +} + +extern __inline short +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_reduce_and_epi16 (__m128i __W) +{ + _MM_REDUCE_OPERATOR_BASIC_EPI16 (&); +} + +extern __inline short +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_reduce_or_epi16 (__m128i __W) +{ + _MM_REDUCE_OPERATOR_BASIC_EPI16 (|); +} + +#define _MM_REDUCE_OPERATOR_MAX_MIN_EP16(op) \ + __m128i __T1 = (__m128i)__builtin_shufflevector ((__v8hi)__V, \ + (__v8hi)__V, 4, 5, 6, 7, 4, 5, 6, 7); \ + __m128i __T2 = _mm_##op (__V, __T1); \ + __m128i __T3 = (__m128i)__builtin_shufflevector ((__v8hi)__T2, \ + (__v8hi)__T2, 2, 3, 2, 3, 4, 5, 6, 7); \ + __m128i __T4 = _mm_##op (__T2, __T3); \ + __m128i __T5 = (__m128i)__builtin_shufflevector ((__v8hi)__T4, \ + (__v8hi)__T4, 1, 1, 2, 3, 4, 5, 6, 7); \ + __v8hi __T6 = (__v8hi)_mm_##op (__T4, __T5); \ + return __T6[0] + +extern __inline short +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_reduce_max_epi16 (__m128i __V) +{ + _MM_REDUCE_OPERATOR_MAX_MIN_EP16 (max_epi16); +} + +extern __inline unsigned short +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_reduce_max_epu16 (__m128i __V) +{ + _MM_REDUCE_OPERATOR_MAX_MIN_EP16 (max_epu16); +} + +extern __inline short +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_reduce_min_epi16 (__m128i __V) +{ + _MM_REDUCE_OPERATOR_MAX_MIN_EP16 (min_epi16); +} + +extern __inline unsigned short +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_reduce_min_epu16 (__m128i __V) +{ + _MM_REDUCE_OPERATOR_MAX_MIN_EP16 (min_epu16); +} + +#define _MM256_REDUCE_OPERATOR_BASIC_EPI16(op) \ + __v8hi __T1 = (__v8hi)_mm256_extracti128_si256 (__W, 0); \ + __v8hi __T2 = (__v8hi)_mm256_extracti128_si256 (__W, 1); \ + __v8hi __T3 = __T1 op __T2; \ + __v8hi __T4 = __builtin_shufflevector (__T3, __T3, 4, 5, 6, 7, 4, 5, 6, 7); \ + __v8hi __T5 = __T3 op __T4; \ + __v8hi __T6 = __builtin_shufflevector (__T5, __T5, 2, 3, 2, 3, 4, 5, 6, 7); \ + __v8hi __T7 = __T5 op __T6; \ + __v8hi __T8 = __builtin_shufflevector (__T7, __T7, 1, 1, 2, 3, 4, 5, 6, 7); \ + __v8hi __T9 = __T7 op __T8; \ + 
return __T9[0] + +extern __inline short +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_reduce_add_epi16 (__m256i __W) +{ + _MM256_REDUCE_OPERATOR_BASIC_EPI16 (+); +} + +extern __inline short +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_reduce_mul_epi16 (__m256i __W) +{ + _MM256_REDUCE_OPERATOR_BASIC_EPI16 (*); +} + +extern __inline short +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_reduce_and_epi16 (__m256i __W) +{ + _MM256_REDUCE_OPERATOR_BASIC_EPI16 (&); +} + +extern __inline short +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_reduce_or_epi16 (__m256i __W) +{ + _MM256_REDUCE_OPERATOR_BASIC_EPI16 (|); +} + +#define _MM256_REDUCE_OPERATOR_MAX_MIN_EP16(op) \ + __m128i __T1 = _mm256_extracti128_si256 (__V, 0); \ + __m128i __T2 = _mm256_extracti128_si256 (__V, 1); \ + __m128i __T3 = _mm_##op (__T1, __T2); \ + __m128i __T4 = (__m128i)__builtin_shufflevector ((__v8hi)__T3, \ + (__v8hi)__T3, 4, 5, 6, 7, 4, 5, 6, 7); \ + __m128i __T5 = _mm_##op (__T3, __T4); \ + __m128i __T6 = (__m128i)__builtin_shufflevector ((__v8hi)__T5, \ + (__v8hi)__T5, 2, 3, 2, 3, 4, 5, 6, 7); \ + __m128i __T7 = _mm_##op (__T5, __T6); \ + __m128i __T8 = (__m128i)__builtin_shufflevector ((__v8hi)__T7, \ + (__v8hi)__T7, 1, 1, 2, 3, 4, 5, 6, 7); \ + __v8hi __T9 = (__v8hi)_mm_##op (__T7, __T8); \ + return __T9[0] + +extern __inline short +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_reduce_max_epi16 (__m256i __V) +{ + _MM256_REDUCE_OPERATOR_MAX_MIN_EP16 (max_epi16); +} + +extern __inline unsigned short +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_reduce_max_epu16 (__m256i __V) +{ + _MM256_REDUCE_OPERATOR_MAX_MIN_EP16 (max_epu16); +} + +extern __inline short +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_reduce_min_epi16 (__m256i __V) +{ + _MM256_REDUCE_OPERATOR_MAX_MIN_EP16 (min_epi16); +} + +extern __inline unsigned short +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_reduce_min_epu16 (__m256i __V) +{ + _MM256_REDUCE_OPERATOR_MAX_MIN_EP16 (min_epu16); +} + +#define _MM_REDUCE_OPERATOR_BASIC_EPI8(op) \ + __v16qi __T1 = (__v16qi)__W; \ + __v16qi __T2 = __builtin_shufflevector (__T1, __T1, \ + 8, 9, 10, 11, 12, 13, 14, 15, 8, 9, 10, 11, 12, 13, 14, 15); \ + __v16qi __T3 = __T1 op __T2; \ + __v16qi __T4 = __builtin_shufflevector (__T3, __T3, \ + 4, 5, 6, 7, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); \ + __v16qi __T5 = __T3 op __T4; \ + __v16qi __T6 = __builtin_shufflevector (__T5, __T5, \ + 2, 3, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); \ + __v16qi __T7 = __T5 op __T6; \ + __v16qi __T8 = __builtin_shufflevector (__T7, __T7, \ + 1, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); \ + __v16qi __T9 = __T7 op __T8; \ + return __T9[0] + +extern __inline char +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_reduce_add_epi8 (__m128i __W) +{ + _MM_REDUCE_OPERATOR_BASIC_EPI8 (+); +} + +extern __inline char +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_reduce_mul_epi8 (__m128i __W) +{ + _MM_REDUCE_OPERATOR_BASIC_EPI8 (*); +} + +extern __inline char +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_reduce_and_epi8 (__m128i __W) +{ + _MM_REDUCE_OPERATOR_BASIC_EPI8 (&); +} + +extern __inline char +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_reduce_or_epi8 (__m128i __W) +{ + 
_MM_REDUCE_OPERATOR_BASIC_EPI8 (|); +} + +#define _MM_REDUCE_OPERATOR_MAX_MIN_EP8(op) \ + __m128i __T1 = (__m128i)__builtin_shufflevector ((__v16qi)__V, (__v16qi)__V, \ + 8, 9, 10, 11, 12, 13, 14, 15, 8, 9, 10, 11, 12, 13, 14, 15); \ + __m128i __T2 = _mm_##op (__V, __T1); \ + __m128i __T3 = (__m128i)__builtin_shufflevector ((__v16qi)__T2, \ + (__v16qi)__T2, \ + 4, 5, 6, 7, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); \ + __m128i __T4 = _mm_##op (__T2, __T3); \ + __m128i __T5 = (__m128i)__builtin_shufflevector ((__v16qi)__T4, \ + (__v16qi)__T4, \ + 2, 3, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); \ + __m128i __T6 = _mm_##op (__T4, __T5); \ + __m128i __T7 = (__m128i)__builtin_shufflevector ((__v16qi)__T6, \ + (__v16qi)__T6, \ + 1, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); \ + __v16qi __T8 = (__v16qi)_mm_##op (__T6, __T7); \ + return __T8[0] + +extern __inline signed char +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_reduce_max_epi8 (__m128i __V) +{ + _MM_REDUCE_OPERATOR_MAX_MIN_EP8 (max_epi8); +} + +extern __inline unsigned char +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_reduce_max_epu8 (__m128i __V) +{ + _MM_REDUCE_OPERATOR_MAX_MIN_EP8 (max_epu8); +} + +extern __inline signed char +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_reduce_min_epi8 (__m128i __V) +{ + _MM_REDUCE_OPERATOR_MAX_MIN_EP8 (min_epi8); +} + +extern __inline unsigned char +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_reduce_min_epu8 (__m128i __V) +{ + _MM_REDUCE_OPERATOR_MAX_MIN_EP8 (min_epu8); +} + +#define _MM256_REDUCE_OPERATOR_BASIC_EPI8(op) \ + __v16qi __T1 = (__v16qi)_mm256_extracti128_si256 (__W, 0); \ + __v16qi __T2 = (__v16qi)_mm256_extracti128_si256 (__W, 1); \ + __v16qi __T3 = __T1 op __T2; \ + __v16qi __T4 = __builtin_shufflevector (__T3, __T3, \ + 8, 9, 10, 11, 12, 13, 14, 15, 8, 9, 10, 11, 12, 13, 14, 15); \ + __v16qi __T5 = __T3 op __T4; \ + __v16qi __T6 = __builtin_shufflevector (__T5, __T5, \ + 4, 5, 6, 7, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); \ + __v16qi __T7 = __T5 op __T6; \ + __v16qi __T8 = __builtin_shufflevector (__T7, __T7, \ + 2, 3, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); \ + __v16qi __T9 = __T7 op __T8; \ + __v16qi __T10 = __builtin_shufflevector (__T9, __T9, \ + 1, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); \ + __v16qi __T11 = __T9 op __T10; \ + return __T11[0] + +extern __inline char +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_reduce_add_epi8 (__m256i __W) +{ + _MM256_REDUCE_OPERATOR_BASIC_EPI8 (+); +} + +extern __inline char +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_reduce_mul_epi8 (__m256i __W) +{ + _MM256_REDUCE_OPERATOR_BASIC_EPI8 (*); +} + +extern __inline char +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_reduce_and_epi8 (__m256i __W) +{ + _MM256_REDUCE_OPERATOR_BASIC_EPI8 (&); +} + +extern __inline char +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_reduce_or_epi8 (__m256i __W) +{ + _MM256_REDUCE_OPERATOR_BASIC_EPI8 (|); +} + +#define _MM256_REDUCE_OPERATOR_MAX_MIN_EP8(op) \ + __m128i __T1 = _mm256_extracti128_si256 (__V, 0); \ + __m128i __T2 = _mm256_extracti128_si256 (__V, 1); \ + __m128i __T3 = _mm_##op (__T1, __T2); \ + __m128i __T4 = (__m128i)__builtin_shufflevector ((__v16qi)__T3, \ + (__v16qi)__T3, \ + 8, 9, 10, 11, 12, 13, 14, 15, 8, 9, 10, 11, 12, 13, 14, 15); \ + __m128i __T5 = _mm_##op (__T3, __T4); \ + 
__m128i __T6 = (__m128i)__builtin_shufflevector ((__v16qi)__T5, \ + (__v16qi)__T5, \ + 4, 5, 6, 7, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); \ + __m128i __T7 = _mm_##op (__T5, __T6); \ + __m128i __T8 = (__m128i)__builtin_shufflevector ((__v16qi)__T7, \ + (__v16qi)__T5, \ + 2, 3, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); \ + __m128i __T9 = _mm_##op (__T7, __T8); \ + __m128i __T10 = (__m128i)__builtin_shufflevector ((__v16qi)__T9, \ + (__v16qi)__T9, \ + 1, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); \ + __v16qi __T11 = (__v16qi)_mm_##op (__T9, __T10); \ + return __T11[0] + +extern __inline signed char +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_reduce_max_epi8 (__m256i __V) +{ + _MM256_REDUCE_OPERATOR_MAX_MIN_EP8 (max_epi8); +} + +extern __inline unsigned char +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_reduce_max_epu8 (__m256i __V) +{ + _MM256_REDUCE_OPERATOR_MAX_MIN_EP8 (max_epu8); +} + +extern __inline signed char +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_reduce_min_epi8 (__m256i __V) +{ + _MM256_REDUCE_OPERATOR_MAX_MIN_EP8 (min_epi8); +} + +extern __inline unsigned char +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_reduce_min_epu8 (__m256i __V) +{ + _MM256_REDUCE_OPERATOR_MAX_MIN_EP8 (min_epu8); +} + #ifdef __DISABLE_AVX2__ #undef __DISABLE_AVX2__ #pragma GCC pop_options diff --git a/gcc/config/i386/avx512vlbwintrin.h b/gcc/config/i386/avx512vlbwintrin.h index 0232783a362..bf38ef6247d 100644 --- a/gcc/config/i386/avx512vlbwintrin.h +++ b/gcc/config/i386/avx512vlbwintrin.h @@ -4750,6 +4750,262 @@ _mm256_mask_cmple_epi16_mask (__mmask16 __M, __m256i __X, __m256i __Y) (__mmask16) __M); } +extern __inline short +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_reduce_add_epi16 (__mmask8 __M, __m128i __W) +{ + __W = _mm_maskz_mov_epi16 (__M, __W); + _MM_REDUCE_OPERATOR_BASIC_EPI16 (+); +} + +extern __inline short +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_reduce_mul_epi16 (__mmask8 __M, __m128i __W) +{ + __W = _mm_mask_mov_epi16 (_mm_set1_epi16 (1), __M, __W); + _MM_REDUCE_OPERATOR_BASIC_EPI16 (*); +} + +extern __inline short +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_reduce_and_epi16 (__mmask8 __M, __m128i __W) +{ + __W = _mm_mask_mov_epi16 (_mm_set1_epi16 (-1), __M, __W); + _MM_REDUCE_OPERATOR_BASIC_EPI16 (&); +} + +extern __inline short +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_reduce_or_epi16 (__mmask8 __M, __m128i __W) +{ + __W = _mm_maskz_mov_epi16 (__M, __W); + _MM_REDUCE_OPERATOR_BASIC_EPI16 (|); +} + +extern __inline short +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_reduce_max_epi16 (__mmask16 __M, __m128i __V) +{ + __V = _mm_mask_mov_epi16 (_mm_set1_epi16 (-32767-1), __M, __V); + _MM_REDUCE_OPERATOR_MAX_MIN_EP16 (max_epi16); +} + +extern __inline unsigned short +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_reduce_max_epu16 (__mmask16 __M, __m128i __V) +{ + __V = _mm_maskz_mov_epi16 (__M, __V); + _MM_REDUCE_OPERATOR_MAX_MIN_EP16 (max_epu16); +} + +extern __inline short +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_reduce_min_epi16 (__mmask16 __M, __m128i __V) +{ + __V = _mm_mask_mov_epi16 (_mm_set1_epi16 (32767), __M, __V); + _MM_REDUCE_OPERATOR_MAX_MIN_EP16 (min_epi16); +} + +extern __inline unsigned 
short +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_reduce_min_epu16 (__mmask16 __M, __m128i __V) +{ + __V = _mm_mask_mov_epi16 (_mm_set1_epi16 (-1), __M, __V); + _MM_REDUCE_OPERATOR_MAX_MIN_EP16 (min_epu16); +} + +extern __inline short +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_reduce_add_epi16 (__mmask16 __M, __m256i __W) +{ + __W = _mm256_maskz_mov_epi16 (__M, __W); + _MM256_REDUCE_OPERATOR_BASIC_EPI16 (+); +} + +extern __inline short +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_reduce_mul_epi16 (__mmask16 __M, __m256i __W) +{ + __W = _mm256_mask_mov_epi16 (_mm256_set1_epi16 (1), __M, __W); + _MM256_REDUCE_OPERATOR_BASIC_EPI16 (*); +} + +extern __inline short +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_reduce_and_epi16 (__mmask16 __M, __m256i __W) +{ + __W = _mm256_mask_mov_epi16 (_mm256_set1_epi16 (-1), __M, __W); + _MM256_REDUCE_OPERATOR_BASIC_EPI16 (&); +} + +extern __inline short +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_reduce_or_epi16 (__mmask16 __M, __m256i __W) +{ + __W = _mm256_maskz_mov_epi16 (__M, __W); + _MM256_REDUCE_OPERATOR_BASIC_EPI16 (|); +} + +extern __inline short +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_reduce_max_epi16 (__mmask16 __M, __m256i __V) +{ + __V = _mm256_mask_mov_epi16 (_mm256_set1_epi16 (-32767-1), __M, __V); + _MM256_REDUCE_OPERATOR_MAX_MIN_EP16 (max_epi16); +} + +extern __inline unsigned short +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_reduce_max_epu16 (__mmask16 __M, __m256i __V) +{ + __V = _mm256_maskz_mov_epi16 (__M, __V); + _MM256_REDUCE_OPERATOR_MAX_MIN_EP16 (max_epu16); +} + +extern __inline short +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_reduce_min_epi16 (__mmask16 __M, __m256i __V) +{ + __V = _mm256_mask_mov_epi16 (_mm256_set1_epi16 (32767), __M, __V); + _MM256_REDUCE_OPERATOR_MAX_MIN_EP16 (min_epi16); +} + +extern __inline unsigned short +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_reduce_min_epu16 (__mmask16 __M, __m256i __V) +{ + __V = _mm256_mask_mov_epi16 (_mm256_set1_epi16 (-1), __M, __V); + _MM256_REDUCE_OPERATOR_MAX_MIN_EP16 (min_epu16); +} + +extern __inline char +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_reduce_add_epi8 (__mmask16 __M, __m128i __W) +{ + __W = _mm_maskz_mov_epi8 (__M, __W); + _MM_REDUCE_OPERATOR_BASIC_EPI8 (+); +} + +extern __inline char +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_reduce_mul_epi8 (__mmask16 __M, __m128i __W) +{ + __W = _mm_mask_mov_epi8 (_mm_set1_epi8 (1), __M, __W); + _MM_REDUCE_OPERATOR_BASIC_EPI8 (*); +} + +extern __inline char +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_reduce_and_epi8 (__mmask16 __M, __m128i __W) +{ + __W = _mm_mask_mov_epi8 (_mm_set1_epi8 (-1), __M, __W); + _MM_REDUCE_OPERATOR_BASIC_EPI8 (&); +} + +extern __inline char +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_reduce_or_epi8 (__mmask16 __M, __m128i __W) +{ + __W = _mm_maskz_mov_epi8 (__M, __W); + _MM_REDUCE_OPERATOR_BASIC_EPI8 (|); +} + +extern __inline signed char +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_reduce_max_epi8 (__mmask16 __M, __m128i __V) +{ + __V = _mm_mask_mov_epi8 (_mm_set1_epi8 (-127-1), __M, __V); + 
_MM_REDUCE_OPERATOR_MAX_MIN_EP8 (max_epi8); +} + +extern __inline unsigned char +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_reduce_max_epu8 (__mmask16 __M, __m128i __V) +{ + __V = _mm_maskz_mov_epi8 (__M, __V); + _MM_REDUCE_OPERATOR_MAX_MIN_EP8 (max_epu8); +} + +extern __inline signed char +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_reduce_min_epi8 (__mmask16 __M, __m128i __V) +{ + __V = _mm_mask_mov_epi8 (_mm_set1_epi8 (127), __M, __V); + _MM_REDUCE_OPERATOR_MAX_MIN_EP8 (min_epi8); +} + +extern __inline unsigned char +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_reduce_min_epu8 (__mmask16 __M, __m128i __V) +{ + __V = _mm_mask_mov_epi8 (_mm_set1_epi8 (-1), __M, __V); + _MM_REDUCE_OPERATOR_MAX_MIN_EP8 (min_epu8); +} + +extern __inline char +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_reduce_add_epi8 (__mmask32 __M, __m256i __W) +{ + __W = _mm256_maskz_mov_epi8 (__M, __W); + _MM256_REDUCE_OPERATOR_BASIC_EPI8 (+); +} + +extern __inline char +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_reduce_mul_epi8 (__mmask32 __M, __m256i __W) +{ + __W = _mm256_mask_mov_epi8 (_mm256_set1_epi8 (1), __M, __W); + _MM256_REDUCE_OPERATOR_BASIC_EPI8 (*); +} + +extern __inline char +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_reduce_and_epi8 (__mmask32 __M, __m256i __W) +{ + __W = _mm256_mask_mov_epi8 (_mm256_set1_epi8 (-1), __M, __W); + _MM256_REDUCE_OPERATOR_BASIC_EPI8 (&); +} + +extern __inline char +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_reduce_or_epi8 (__mmask32 __M, __m256i __W) +{ + __W = _mm256_maskz_mov_epi8 (__M, __W); + _MM256_REDUCE_OPERATOR_BASIC_EPI8 (|); +} + +extern __inline signed char +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_reduce_max_epi8 (__mmask32 __M, __m256i __V) +{ + __V = _mm256_mask_mov_epi8 (_mm256_set1_epi8 (-127-1), __M, __V); + _MM256_REDUCE_OPERATOR_MAX_MIN_EP8 (max_epi8); +} + +extern __inline unsigned char +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_reduce_max_epu8 (__mmask32 __M, __m256i __V) +{ + __V = _mm256_maskz_mov_epi8 (__M, __V); + _MM256_REDUCE_OPERATOR_MAX_MIN_EP8 (max_epu8); +} + +extern __inline signed char +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_reduce_min_epi8 (__mmask32 __M, __m256i __V) +{ + __V = _mm256_mask_mov_epi8 (_mm256_set1_epi8 (127), __M, __V); + _MM256_REDUCE_OPERATOR_MAX_MIN_EP8 (min_epi8); +} + +extern __inline unsigned char +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_reduce_min_epu8 (__mmask32 __M, __m256i __V) +{ + __V = _mm256_mask_mov_epi8 (_mm256_set1_epi8 (-1), __M, __V); + _MM256_REDUCE_OPERATOR_MAX_MIN_EP8 (min_epu8); +} + #ifdef __DISABLE_AVX512VLBW__ #undef __DISABLE_AVX512VLBW__ #pragma GCC pop_options diff --git a/gcc/testsuite/gcc.target/i386/avx512vlbw-reduce-op-1.c b/gcc/testsuite/gcc.target/i386/avx512vlbw-reduce-op-1.c new file mode 100644 index 00000000000..146ef6bf8da --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/avx512vlbw-reduce-op-1.c @@ -0,0 +1,206 @@ +/* { dg-do run } */ +/* { dg-options "-O2 -mavx512bw -mavx512vl" } */ +/* { dg-require-effective-target avx512bw } */ +/* { dg-require-effective-target avx512vl } */ + +#define AVX512BW +#define AVX512VL + +#include "avx512f-helper.h" + +#define 
FUNC_TEST_REDUCE_BASIC(opname) \ + FUNC_TEST_REDUCE_OP (, short, epi16, opname, __m128i, __mmask8) \ + FUNC_TEST_REDUCE_OP (256, short, epi16, opname, __m256i, __mmask16) \ + FUNC_TEST_REDUCE_OP (, char, epi8, opname, __m128i, __mmask16) \ + FUNC_TEST_REDUCE_OP (256, char, epi8, opname, __m256i, __mmask32) + +#define FUNC_TEST_REDUCE_MAX_MIN(opname) \ + FUNC_TEST_REDUCE_OP (, short, epi16, opname, __m128i, __mmask8) \ + FUNC_TEST_REDUCE_OP (256, short, epi16, opname, __m256i, __mmask16) \ + FUNC_TEST_REDUCE_OP (, char, epi8, opname, __m128i, __mmask16) \ + FUNC_TEST_REDUCE_OP (256, char, epi8, opname, __m256i, __mmask32) \ + FUNC_TEST_REDUCE_OP (, unsigned short, epu16, opname, __m128i, __mmask8) \ + FUNC_TEST_REDUCE_OP (256, unsigned short, epu16, \ + opname, __m256i, __mmask16) \ + FUNC_TEST_REDUCE_OP (, unsigned char, epu8, opname, __m128i, __mmask16) \ + FUNC_TEST_REDUCE_OP (256, unsigned char, epu8, opname, __m256i, __mmask32) + +#define FUNC_TEST_REDUCE_OP(len, rtype, type, opname, argtype, masktype) \ + __attribute__((noinline, noclone)) rtype \ + test_##len##_reduce_##opname##_##type (argtype a) \ + { \ + return _mm##len##_reduce_##opname##_##type (a); \ + } \ + __attribute__((noinline, noclone)) rtype \ + test_##len##_mask_reduce_##opname##_##type (masktype u, argtype a) \ + { \ + return _mm##len##_mask_reduce_##opname##_##type (u, a); \ + } + +FUNC_TEST_REDUCE_BASIC (add) +FUNC_TEST_REDUCE_BASIC (mul) +FUNC_TEST_REDUCE_BASIC (and) +FUNC_TEST_REDUCE_BASIC (or) +FUNC_TEST_REDUCE_MAX_MIN (max) +FUNC_TEST_REDUCE_MAX_MIN (min) + +#define TESTOP(len, opname, op, type, suffix, neutral) \ + do { \ + type r1 = _mm##len##_reduce_##opname##_##suffix (v.x); \ + type r2 = test_##len##_reduce_##opname##_##suffix (v.x); \ + type r3 = neutral; \ + if (r1 != r2) \ + __builtin_abort (); \ + for (int i = 0; i < SIZE; i++) \ + r3 = r3 op v.a[i]; \ + if (r1 != r3) \ + __builtin_abort (); \ + type r4 = _mm##len##_mask_reduce_##opname##_##suffix (MASK_VALUE, v.x); \ + type r5 = test_##len##_mask_reduce_##opname##_##suffix (MASK_VALUE, v.x); \ + if (r4 != r5) \ + __builtin_abort (); \ + r3 = neutral; \ + for (int i = 0; i < SIZE; i++) \ + if (MASK_VALUE & (1 << i)) \ + r3 = r3 op v.a[i]; \ + if (r4 != r3) \ + __builtin_abort (); \ + type r6 = _mm##len##_mask_reduce_##opname##_##suffix (0, v.x); \ + type r7 = test_##len##_mask_reduce_##opname##_##suffix (0, v.x); \ + if (r6 != r7 || r6 != neutral) \ + __builtin_abort (); \ + } while (0) + +#undef AVX512F_LEN +#define AVX512F_LEN 128 + +#undef SIZE +#define SIZE (AVX512F_LEN / 8) +#include "avx512f-mask-type.h" + +#define TEST_128_EPI8(c1, c2, c3, c4, c5, c6, c7, c8, \ + c9, c10, c11, c12, c13, c14, c15, c16) \ + do { \ + UNION_TYPE (AVX512F_LEN, i_b) v; \ + v.x = _mm_set_epi8 (c1, c2, c3, c4, c5, c6, c7, c8, \ + c9, c10, c11, c12, c13, c14, c15, c16); \ + TESTOP (, add, +, char, epi8, 0); \ + TESTOP (, mul, *, char, epi8, 1); \ + TESTOP (, and, &, char, epi8, (char) ~0); \ + TESTOP (, or, |, char, epi8, 0); \ + TESTOP (, min, < v.a[i] ? r3 :, char, epi8, __SCHAR_MAX__); \ + TESTOP (, max, > v.a[i] ? r3 :, char, epi8, -__SCHAR_MAX__ - 1); \ + TESTOP (, min, < (unsigned char) v.a[i] ? r3 :, unsigned char, epu8, (unsigned char) ~0U); \ + TESTOP (, max, > (unsigned char) v.a[i] ? 
r3 :, unsigned char, epu8, 0); \ + } while (0) + +static void +test_128_epi8 (void) +{ + TEST_128_EPI8 (1, 2, 3, 4, 5, 6, 6, 5, 4, 3, 2, 1, 7, 6, 5, 4); + TEST_128_EPI8 (-1, 15, -1, 7, -1, 7, -1, -1, 6, 6, -1, -1, -1, -1, 7, 6); +} + +#undef SIZE +#define SIZE (AVX512F_LEN / 16) +#include "avx512f-mask-type.h" + +#define TEST_128_EPI16(c1, c2, c3, c4, c5, c6, c7, c8) \ + do { \ + UNION_TYPE (AVX512F_LEN, i_w) v; \ + v.x = _mm_set_epi16 (c1, c2, c3, c4, c5, c6, c7, c8); \ + TESTOP (, add, +, short, epi16, 0); \ + TESTOP (, mul, *, short, epi16, 1); \ + TESTOP (, and, &, short, epi16, (short) ~0); \ + TESTOP (, or, |, short, epi16, 0); \ + TESTOP (, min, < v.a[i] ? r3 :, short, epi16, __SHRT_MAX__); \ + TESTOP (, max, > v.a[i] ? r3 :, short, epi16, -__SHRT_MAX__ - 1); \ + TESTOP (, min, < (unsigned short) v.a[i] ? r3 :, unsigned short, epu16,(unsigned short) ~0U); \ + TESTOP (, max, > (unsigned short) v.a[i] ? r3 :, unsigned short, epu16, 0); \ + } while (0) + +static void +test_128_epi16 (void) +{ + TEST_128_EPI16 (1, 2, 3, 4, 5, 6, 6, 5); + TEST_128_EPI16 (-1, 15, -1, 7, -1, 7, -1, -1); +} + +void +test_128 (void) +{ + test_128_epi8 (); + test_128_epi16 (); +} + +#undef AVX512F_LEN +#define AVX512F_LEN 256 + +#undef SIZE +#define SIZE (AVX512F_LEN / 8) +#include "avx512f-mask-type.h" + +#define TEST_256_EPI8(c1, c2, c3, c4, c5, c6, c7, c8, \ + c9, c10, c11, c12, c13, c14, c15, c16, \ + c17, c18, c19, c20, c21, c22, c23, c24, \ + c25, c26, c27, c28, c29, c30, c31, c32) \ + do { \ + UNION_TYPE (AVX512F_LEN, i_b) v; \ + v.x = _mm256_set_epi8 (c1, c2, c3, c4, c5, c6, c7, c8, \ + c9, c10, c11, c12, c13, c14, c15, c16, \ + c17, c18, c19, c20, c21, c22, c23, c24, \ + c25, c26, c27, c28, c29, c30, c31, c32); \ + TESTOP (256, add, +, char, epi8, 0); \ + TESTOP (256, mul, *, char, epi8, 1); \ + TESTOP (256, and, &, char, epi8, (char) ~0); \ + TESTOP (256, or, |, char, epi8, 0); \ + TESTOP (256, min, < v.a[i] ? r3 :, char, epi8, __SCHAR_MAX__); \ + TESTOP (256, max, > v.a[i] ? r3 :, char, epi8, -__SCHAR_MAX__ - 1); \ + TESTOP (256, min, < (unsigned char) v.a[i] ? r3 :, \ + unsigned char, epu8, (unsigned char)~0U); \ + TESTOP (256, max, > (unsigned char) v.a[i] ? r3 :, \ + unsigned char, epu8, 0); \ + } while (0) + +static void +test_256_epi8 (void) +{ + TEST_256_EPI8 (1, 2, 3, 4, 5, 6, 7, 8, 9, 11, 12, 13, 12, 11, 10, 9, 9, 7, 6, 5, 4, 3, 2, 1, 7, 6, 5, 4, 7, 10, 11, 12); + TEST_256_EPI8 (-1, 15, -1, 7, -1, 7, -1, -1, 6, 6, -1, -1, -1, -1, 7, 6, -1, 30, -1, 28, -1, 26, -1, 24, -1, 22, -1, -1, -1, -1, 17, 16); +} + +#undef SIZE +#define SIZE (AVX512F_LEN / 16) +#include "avx512f-mask-type.h" + +#define TEST_256_EPI16(c1, c2, c3, c4, c5, c6, c7, c8, \ + c9, c10, c11, c12, c13, c14, c15, c16) \ + do { \ + UNION_TYPE (AVX512F_LEN, i_w) v; \ + v.x = _mm256_set_epi16 (c1, c2, c3, c4, c5, c6, c7, c8, \ + c9, c10, c11, c12, c13, c14, c15, c16); \ + TESTOP (256, add, +, short, epi16, 0); \ + TESTOP (256, mul, *, short, epi16, 1); \ + TESTOP (256, and, &, short, epi16, (short) ~0); \ + TESTOP (256, or, |, short, epi16, 0); \ + TESTOP (256, min, < v.a[i] ? r3 :, short, epi16, __SHRT_MAX__); \ + TESTOP (256, max, > v.a[i] ? r3 :, short, epi16, -__SHRT_MAX__ - 1);\ + TESTOP (256, min, < (unsigned short) v.a[i] ? r3 :, \ + unsigned short, epu16, (unsigned short) ~0U); \ + TESTOP (256, max, > (unsigned short) v.a[i] ? 
r3 :, \ + unsigned short, epu16, 0); \ + } while (0) + +static void +test_256_epi16 (void) +{ + TEST_256_EPI16 (9, 7, 6, 5, 4, 3, 2, 1, 7, 6, 5, 4, 7, 10, 11, 12); + TEST_256_EPI16 (-1, 15, -1, 7, -1, 7, -1, -1, 6, 6, -1, -1, -1, -1, 7, 6); +} + +void +test_256 (void) +{ + test_256_epi8 (); + test_256_epi16 (); +} -- 2.31.1