Hello,
this patch gets rid of a few more builtins (well, I actually kept them,
since Ada users may still need them). I had to tweak the flags for
pr59539-2.c, otherwise the compiler thinks it is more efficient to split
the loads, reading 128 bits at a time. This still breaks one testcase:
avx512f-vmovdqu32-1.c. I don't think it really matters, and I'll just
adapt the scan-assembler-times regex if you agree, but from
https://gcc.gnu.org/ml/gcc-patches/2014-01/msg00757.html it looks like
getting 32 instead of 64 might be on purpose, so I'd like your opinion on
that
movq p(%rip), %rax
vmovdqu64 (%rax), %zmm0
vmovdqa64 %zmm0, x(%rip)
kmovw m(%rip), %k1
vmovdqa64 x(%rip), %zmm0
vmovdqu32 (%rax), %zmm0{%k1}
vmovdqa64 %zmm0, x(%rip)
kmovw m(%rip), %k1
vmovdqu32 (%rax), %zmm0{%k1}{z}
vmovdqa64 %zmm0, x(%rip)
vmovdqa64 x(%rip), %zmm0
- vmovdqu32 %zmm0, (%rax)
+ vmovdqu64 %zmm0, (%rax)
movq p(%rip), %rax
vmovdqa64 x(%rip), %zmm0
kmovw m(%rip), %k1
vmovdqu32 %zmm0, (%rax){%k1}
The changes in the signature of functions don't seem to matter, gcc
apparently ignores the aligned attribute for that purpose. The last change
(_mm_load_ps) is for consistency.
Bootstrap+regtest on x86_64-pc-linux-gnu, with only the above regression.
2016-08-29 Marc Glisse <marc.gli...@inria.fr>
gcc/
* config/i386/avx512fintrin.h (__m512_u, __m512i_u, __m512d_u):
New types.
(_mm512_loadu_pd, _mm512_storeu_pd, _mm512_loadu_ps,
_mm512_storeu_ps, _mm512_loadu_si512, _mm512_storeu_si512):
Replace builtin with vector extension.
* config/i386/avxintrin.h (__m256_u, __m256i_u, __m256d_u):
New types.
(_mm256_loadu_pd, _mm256_storeu_pd, _mm256_loadu_ps,
_mm256_storeu_ps, _mm256_loadu_si256, _mm256_storeu_si256):
Replace builtin with vector extension.
* config/i386/emmintrin.h (__m128i_u, __m128d_u): New types.
(_mm_loadu_pd, _mm_storeu_pd, _mm_loadu_si128, _mm_storeu_si128):
Replace builtin with vector extension.
* config/i386/xmmintrin.h (__m128_u): New type.
(_mm_loadu_ps, _mm_storeu_ps): Replace builtin with vector extension.
(_mm_load_ps, _mm_store_ps): Simplify.
testsuite/
* gcc.target/i386/pr59539-2.c: Adapt options.
--
Marc Glisse
Index: gcc/config/i386/avx512fintrin.h
===================================================================
--- gcc/config/i386/avx512fintrin.h (revision 239797)
+++ gcc/config/i386/avx512fintrin.h (working copy)
@@ -52,6 +52,11 @@
typedef long long __m512i __attribute__ ((__vector_size__ (64), __may_alias__));
typedef double __m512d __attribute__ ((__vector_size__ (64), __may_alias__));
+/* Unaligned version of the same type. */
+typedef float __m512_u __attribute__ ((__vector_size__ (64), __may_alias__, __aligned__ (1)));
+typedef long long __m512i_u __attribute__ ((__vector_size__ (64), __may_alias__, __aligned__ (1)));
+typedef double __m512d_u __attribute__ ((__vector_size__ (64), __may_alias__, __aligned__ (1)));
+
typedef unsigned char __mmask8;
typedef unsigned short __mmask16;
@@ -5674,10 +5679,7 @@
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_loadu_pd (void const *__P)
{
- return (__m512d) __builtin_ia32_loadupd512_mask ((const double *) __P,
- (__v8df)
- _mm512_undefined_pd (),
- (__mmask8) -1);
+ return *(__m512d_u *)__P;
}
extern __inline __m512d
@@ -5703,8 +5705,7 @@
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_storeu_pd (void *__P, __m512d __A)
{
- __builtin_ia32_storeupd512_mask ((double *) __P, (__v8df) __A,
- (__mmask8) -1);
+ *(__m512d_u *)__P = __A;
}
extern __inline void
@@ -5719,10 +5720,7 @@
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_loadu_ps (void const *__P)
{
- return (__m512) __builtin_ia32_loadups512_mask ((const float *) __P,
- (__v16sf)
- _mm512_undefined_ps (),
- (__mmask16) -1);
+ return *(__m512_u *)__P;
}
extern __inline __m512
@@ -5748,8 +5746,7 @@
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_storeu_ps (void *__P, __m512 __A)
{
- __builtin_ia32_storeups512_mask ((float *) __P, (__v16sf) __A,
- (__mmask16) -1);
+ *(__m512_u *)__P = __A;
}
extern __inline void
@@ -5791,10 +5788,7 @@
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_loadu_si512 (void const *__P)
{
- return (__m512i) __builtin_ia32_loaddqusi512_mask ((const int *) __P,
- (__v16si)
- _mm512_setzero_si512 (),
- (__mmask16) -1);
+ return *(__m512i_u *)__P;
}
extern __inline __m512i
@@ -5820,8 +5814,7 @@
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_storeu_si512 (void *__P, __m512i __A)
{
- __builtin_ia32_storedqusi512_mask ((int *) __P, (__v16si) __A,
- (__mmask16) -1);
+ *(__m512i_u *)__P = __A;
}
extern __inline void
Index: gcc/config/i386/avxintrin.h
===================================================================
--- gcc/config/i386/avxintrin.h (revision 239797)
+++ gcc/config/i386/avxintrin.h (working copy)
@@ -58,6 +58,17 @@
typedef double __m256d __attribute__ ((__vector_size__ (32),
__may_alias__));
+/* Unaligned version of the same types. */
+typedef float __m256_u __attribute__ ((__vector_size__ (32),
+ __may_alias__,
+ __aligned__ (1)));
+typedef long long __m256i_u __attribute__ ((__vector_size__ (32),
+ __may_alias__,
+ __aligned__ (1)));
+typedef double __m256d_u __attribute__ ((__vector_size__ (32),
+ __may_alias__,
+ __aligned__ (1)));
+
/* Compare predicates for scalar and packed compare intrinsics. */
/* Equal (ordered, non-signaling) */
@@ -857,25 +868,25 @@
extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_loadu_pd (double const *__P)
{
- return (__m256d) __builtin_ia32_loadupd256 (__P);
+ return *(__m256d_u *)__P;
}
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_storeu_pd (double *__P, __m256d __A)
{
- __builtin_ia32_storeupd256 (__P, (__v4df)__A);
+ *(__m256d_u *)__P = __A;
}
extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_loadu_ps (float const *__P)
{
- return (__m256) __builtin_ia32_loadups256 (__P);
+ return *(__m256_u *)__P;
}
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_storeu_ps (float *__P, __m256 __A)
{
- __builtin_ia32_storeups256 (__P, (__v8sf)__A);
+ *(__m256_u *)__P = __A;
}
extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
@@ -891,15 +902,15 @@
}
extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-_mm256_loadu_si256 (__m256i const *__P)
+_mm256_loadu_si256 (__m256i_u const *__P)
{
- return (__m256i) __builtin_ia32_loaddqu256 ((char const *)__P);
+ return *__P;
}
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-_mm256_storeu_si256 (__m256i *__P, __m256i __A)
+_mm256_storeu_si256 (__m256i_u *__P, __m256i __A)
{
- __builtin_ia32_storedqu256 ((char *)__P, (__v32qi)__A);
+ *__P = __A;
}
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
Index: gcc/config/i386/emmintrin.h
===================================================================
--- gcc/config/i386/emmintrin.h (revision 239797)
+++ gcc/config/i386/emmintrin.h (working copy)
@@ -52,6 +52,10 @@
typedef long long __m128i __attribute__ ((__vector_size__ (16), __may_alias__));
typedef double __m128d __attribute__ ((__vector_size__ (16), __may_alias__));
+/* Unaligned version of the same types. */
+typedef long long __m128i_u __attribute__ ((__vector_size__ (16), __may_alias__, __aligned__ (1)));
+typedef double __m128d_u __attribute__ ((__vector_size__ (16), __may_alias__, __aligned__ (1)));
+
/* Create a selector for use with the SHUFPD instruction. */
#define _MM_SHUFFLE2(fp1,fp0) \
(((fp1) << 1) | (fp0))
@@ -123,7 +127,7 @@
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_loadu_pd (double const *__P)
{
- return __builtin_ia32_loadupd (__P);
+ return *(__m128d_u *)__P;
}
/* Create a vector with all two elements equal to *P. */
@@ -165,7 +169,7 @@
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_storeu_pd (double *__P, __m128d __A)
{
- __builtin_ia32_storeupd (__P, __A);
+ *(__m128d_u *)__P = __A;
}
/* Stores the lower DPFP value. */
@@ -693,9 +697,9 @@
}
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-_mm_loadu_si128 (__m128i const *__P)
+_mm_loadu_si128 (__m128i_u const *__P)
{
- return (__m128i) __builtin_ia32_loaddqu ((char const *)__P);
+ return *__P;
}
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
@@ -711,9 +715,9 @@
}
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-_mm_storeu_si128 (__m128i *__P, __m128i __B)
+_mm_storeu_si128 (__m128i_u *__P, __m128i __B)
{
- __builtin_ia32_storedqu ((char *)__P, (__v16qi)__B);
+ *__P = __B;
}
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
Index: gcc/config/i386/xmmintrin.h
===================================================================
--- gcc/config/i386/xmmintrin.h (revision 239797)
+++ gcc/config/i386/xmmintrin.h (working copy)
@@ -68,6 +68,9 @@
vector types, and their scalar components. */
typedef float __m128 __attribute__ ((__vector_size__ (16), __may_alias__));
+/* Unaligned version of the same type. */
+typedef float __m128_u __attribute__ ((__vector_size__ (16), __may_alias__, __aligned__ (1)));
+
/* Internal data types for implementing the intrinsics. */
typedef float __v4sf __attribute__ ((__vector_size__ (16)));
@@ -921,7 +924,7 @@
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_load_ps (float const *__P)
{
- return (__m128) *(__v4sf *)__P;
+ return *(__m128 *)__P;
}
/* Load four SPFP values from P. The address need not be 16-byte aligned. */
@@ -928,7 +931,7 @@
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_loadu_ps (float const *__P)
{
- return (__m128) __builtin_ia32_loadups (__P);
+ return *(__m128_u *)__P;
}
/* Load four SPFP values in reverse order. The address must be aligned. */
@@ -970,7 +973,7 @@
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_store_ps (float *__P, __m128 __A)
{
- *(__v4sf *)__P = (__v4sf)__A;
+ *(__m128 *)__P = __A;
}
/* Store four SPFP values. The address need not be 16-byte aligned. */
@@ -977,7 +980,7 @@
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_storeu_ps (float *__P, __m128 __A)
{
- __builtin_ia32_storeups (__P, (__v4sf)__A);
+ *(__m128_u *)__P = __A;
}
/* Store the lower SPFP value across four words. */
Index: gcc/testsuite/gcc.target/i386/pr59539-2.c
===================================================================
--- gcc/testsuite/gcc.target/i386/pr59539-2.c (revision 239797)
+++ gcc/testsuite/gcc.target/i386/pr59539-2.c (working copy)
@@ -1,6 +1,6 @@
/* PR target/59539 */
/* { dg-do compile } */
-/* { dg-options "-O2 -mavx2" } */
+/* { dg-options "-O2 -march=haswell" } */
#include <immintrin.h>