[PATCH] Add missing _mm512_prefetch_i{32,64}gather_{pd,ps} (PR target/79481)

2017-02-13 Thread Jakub Jelinek
Hi!

As mentioned in the PR, ICC as well as clang have these non-masked
gather prefetch intrinsics in addition to masked (and for scatter
even GCC has both masked and non-masked), but GCC does not (the
SDM actually doesn't mention those, only those for scatters).

The following patch implements those; I think it is useful to have
them for compatibility with the other compilers as well as for consistency.

Bootstrapped/regtested on x86_64-linux and i686-linux, ok for trunk?

2017-02-13  Jakub Jelinek  

PR target/79481
* config/i386/avx512pfintrin.h (_mm512_prefetch_i32gather_pd,
_mm512_prefetch_i32gather_ps, _mm512_prefetch_i64gather_pd,
_mm512_prefetch_i64gather_ps): New inline functions and macros.

* gcc.target/i386/sse-14.c (test_2vx): Add void return type.
(test_3vx): Change return type from int to void. 
(_mm512_prefetch_i32gather_ps, _mm512_prefetch_i32scatter_ps,
_mm512_prefetch_i64gather_ps, _mm512_prefetch_i64scatter_ps,
_mm512_prefetch_i32gather_pd, _mm512_prefetch_i32scatter_pd,
_mm512_prefetch_i64gather_pd, _mm512_prefetch_i64scatter_pd): New
tests.
* gcc.target/i386/sse-22.c (test_2vx): Add void return type.
(test_3vx): Change return type from int to void.
(_mm512_prefetch_i32gather_ps, _mm512_prefetch_i32scatter_ps,
_mm512_prefetch_i64gather_ps, _mm512_prefetch_i64scatter_ps,
_mm512_prefetch_i32gather_pd, _mm512_prefetch_i32scatter_pd,
_mm512_prefetch_i64gather_pd, _mm512_prefetch_i64scatter_pd): New
tests.
* gcc.target/i386/avx512pf-vgatherpf0dpd-1.c: Add non-masked
intrinsic.  Change scan-assembler-times number from 1 to 2.
* gcc.target/i386/avx512pf-vgatherpf0dps-1.c: Likewise.
* gcc.target/i386/avx512pf-vgatherpf0qpd-1.c: Likewise.
* gcc.target/i386/avx512pf-vgatherpf0qps-1.c: Likewise.
* gcc.target/i386/avx512pf-vgatherpf1dpd-1.c: Likewise.
* gcc.target/i386/avx512pf-vgatherpf1dps-1.c: Likewise.
* gcc.target/i386/avx512pf-vgatherpf1qpd-1.c: Likewise.
* gcc.target/i386/avx512pf-vgatherpf1qps-1.c: Likewise.

--- gcc/config/i386/avx512pfintrin.h.jj 2017-01-17 18:40:59.0 +0100
+++ gcc/config/i386/avx512pfintrin.h 2017-02-13 09:56:21.03124 +0100
@@ -48,6 +48,24 @@ typedef unsigned short __mmask16;
 #ifdef __OPTIMIZE__
 extern __inline void
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_prefetch_i32gather_pd (__m256i __index, void const *__addr,
+ int __scale, int __hint)
+{
+  __builtin_ia32_gatherpfdpd ((__mmask8) 0xFF, (__v8si) __index, __addr,
+ __scale, __hint);
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_prefetch_i32gather_ps (__m512i __index, void const *__addr,
+ int __scale, int __hint)
+{
+  __builtin_ia32_gatherpfdps ((__mmask16) 0xFFFF, (__v16si) __index, __addr,
+ __scale, __hint);
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
 _mm512_mask_prefetch_i32gather_pd (__m256i __index, __mmask8 __mask,
   void const *__addr, int __scale, int __hint)
 {
@@ -66,6 +84,24 @@ _mm512_mask_prefetch_i32gather_ps (__m51
 
 extern __inline void
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_prefetch_i64gather_pd (__m512i __index, void const *__addr,
+ int __scale, int __hint)
+{
+  __builtin_ia32_gatherpfqpd ((__mmask8) 0xFF, (__v8di) __index, __addr,
+ __scale, __hint);
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_prefetch_i64gather_ps (__m512i __index, void const *__addr,
+ int __scale, int __hint)
+{
+  __builtin_ia32_gatherpfqps ((__mmask8) 0xFF, (__v8di) __index, __addr,
+ __scale, __hint);
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
 _mm512_mask_prefetch_i64gather_pd (__m512i __index, __mmask8 __mask,
   void const *__addr, int __scale, int __hint)
 {
@@ -155,6 +191,14 @@ _mm512_mask_prefetch_i64scatter_ps (void
 }
 
 #else
+#define _mm512_prefetch_i32gather_pd(INDEX, ADDR, SCALE, HINT)  \
+  __builtin_ia32_gatherpfdpd ((__mmask8)0xFF, (__v8si)(__m256i)INDEX,   \
+ (void const *)ADDR, (int)SCALE, (int)HINT)
+
+#define _mm512_prefetch_i32gather_ps(INDEX, ADDR, SCALE, HINT)  \
+  __builtin_ia32_gatherpfdps ((__mmask16)0xFFFF, (__v16si)(__m512i)INDEX,\
+ (void const *)ADDR, (int)SCALE, (int)HINT)
+
 #define _mm512_mask_prefetch_i32gather_pd(INDEX, MASK, ADDR, SCALE, HINT)\
   __builtin_ia32_gatherpfdpd ((__mmask8)MASK, (_

Re: [PATCH] Add missing _mm512_prefetch_i{32,64}gather_{pd,ps} (PR target/79481)

2017-02-14 Thread Uros Bizjak
On Mon, Feb 13, 2017 at 8:35 PM, Jakub Jelinek  wrote:
> Hi!
>
> As mentioned in the PR, ICC as well as clang have these non-masked
> gather prefetch intrinsics in addition to masked (and for scatter
> even GCC has both masked and non-masked), but GCC does not (the
> SDM actually doesn't mention those, only those for scatters).
>
> The following patch implements those; I think it is useful to have
> them for compatibility with the other compilers as well as for consistency.
>
> Bootstrapped/regtested on x86_64-linux and i686-linux, ok for trunk?
>
> 2017-02-13  Jakub Jelinek  
>
> PR target/79481
> * config/i386/avx512pfintrin.h (_mm512_prefetch_i32gather_pd,
> _mm512_prefetch_i32gather_ps, _mm512_prefetch_i64gather_pd,
> _mm512_prefetch_i64gather_ps): New inline functions and macros.
>
> * gcc.target/i386/sse-14.c (test_2vx): Add void return type.
> (test_3vx): Change return type from int to void.
> (_mm512_prefetch_i32gather_ps, _mm512_prefetch_i32scatter_ps,
> _mm512_prefetch_i64gather_ps, _mm512_prefetch_i64scatter_ps,
> _mm512_prefetch_i32gather_pd, _mm512_prefetch_i32scatter_pd,
> _mm512_prefetch_i64gather_pd, _mm512_prefetch_i64scatter_pd): New
> tests.
> * gcc.target/i386/sse-22.c (test_2vx): Add void return type.
> (test_3vx): Change return type from int to void.
> (_mm512_prefetch_i32gather_ps, _mm512_prefetch_i32scatter_ps,
> _mm512_prefetch_i64gather_ps, _mm512_prefetch_i64scatter_ps,
> _mm512_prefetch_i32gather_pd, _mm512_prefetch_i32scatter_pd,
> _mm512_prefetch_i64gather_pd, _mm512_prefetch_i64scatter_pd): New
> tests.
> * gcc.target/i386/avx512pf-vgatherpf0dpd-1.c: Add non-masked
> intrinsic.  Change scan-assembler-times number from 1 to 2.
> * gcc.target/i386/avx512pf-vgatherpf0dps-1.c: Likewise.
> * gcc.target/i386/avx512pf-vgatherpf0qpd-1.c: Likewise.
> * gcc.target/i386/avx512pf-vgatherpf0qps-1.c: Likewise.
> * gcc.target/i386/avx512pf-vgatherpf1dpd-1.c: Likewise.
> * gcc.target/i386/avx512pf-vgatherpf1dps-1.c: Likewise.
> * gcc.target/i386/avx512pf-vgatherpf1qpd-1.c: Likewise.
> * gcc.target/i386/avx512pf-vgatherpf1qps-1.c: Likewise.

OK.

Thanks,
Uros.

> --- gcc/config/i386/avx512pfintrin.h.jj 2017-01-17 18:40:59.0 +0100
> +++ gcc/config/i386/avx512pfintrin.h 2017-02-13 09:56:21.03124 +0100
> @@ -48,6 +48,24 @@ typedef unsigned short __mmask16;
>  #ifdef __OPTIMIZE__
>  extern __inline void
>  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> +_mm512_prefetch_i32gather_pd (__m256i __index, void const *__addr,
> + int __scale, int __hint)
> +{
> +  __builtin_ia32_gatherpfdpd ((__mmask8) 0xFF, (__v8si) __index, __addr,
> + __scale, __hint);
> +}
> +
> +extern __inline void
> +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> +_mm512_prefetch_i32gather_ps (__m512i __index, void const *__addr,
> + int __scale, int __hint)
> +{
> +  __builtin_ia32_gatherpfdps ((__mmask16) 0xFFFF, (__v16si) __index, __addr,
> + __scale, __hint);
> +}
> +
> +extern __inline void
> +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
>  _mm512_mask_prefetch_i32gather_pd (__m256i __index, __mmask8 __mask,
>    void const *__addr, int __scale, int __hint)
>  {
> @@ -66,6 +84,24 @@ _mm512_mask_prefetch_i32gather_ps (__m51
>
>  extern __inline void
>  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> +_mm512_prefetch_i64gather_pd (__m512i __index, void const *__addr,
> + int __scale, int __hint)
> +{
> +  __builtin_ia32_gatherpfqpd ((__mmask8) 0xFF, (__v8di) __index, __addr,
> + __scale, __hint);
> +}
> +
> +extern __inline void
> +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> +_mm512_prefetch_i64gather_ps (__m512i __index, void const *__addr,
> + int __scale, int __hint)
> +{
> +  __builtin_ia32_gatherpfqps ((__mmask8) 0xFF, (__v8di) __index, __addr,
> + __scale, __hint);
> +}
> +
> +extern __inline void
> +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
>  _mm512_mask_prefetch_i64gather_pd (__m512i __index, __mmask8 __mask,
>    void const *__addr, int __scale, int __hint)
>  {
> @@ -155,6 +191,14 @@ _mm512_mask_prefetch_i64scatter_ps (void
>  }
>
>  #else
> +#define _mm512_prefetch_i32gather_pd(INDEX, ADDR, SCALE, HINT)  \
> +  __builtin_ia32_gatherpfdpd ((__mmask8)0xFF, (__v8si)(__m256i)INDEX,   \
> + (void const *)ADDR, (int)SCALE, (int)HINT)
> +
> +#define _mm512_prefetch_i32gather_ps(INDEX, ADDR, SCALE, HINT)  \
> +  __b