[PATCH] D31766: [Clang][X86][SSE] Update MOVNTDQA non-temporal loads to generic implementation

2017-04-14 Thread Simon Pilgrim via Phabricator via cfe-commits
This revision was automatically updated to reflect the committed changes.
Closed by commit rL300326: [X86][SSE] Update MOVNTDQA non-temporal loads to 
generic implementation (clang) (authored by RKSimon).

Changed prior to commit:
  https://reviews.llvm.org/D31766?vs=94379=95303#toc

Repository:
  rL LLVM

https://reviews.llvm.org/D31766

Files:
  cfe/trunk/include/clang/Basic/BuiltinsX86.def
  cfe/trunk/lib/Headers/avx2intrin.h
  cfe/trunk/lib/Headers/avx512fintrin.h
  cfe/trunk/lib/Headers/smmintrin.h
  cfe/trunk/test/CodeGen/avx2-builtins.c
  cfe/trunk/test/CodeGen/avx512f-builtins.c
  cfe/trunk/test/CodeGen/sse41-builtins.c

Index: cfe/trunk/test/CodeGen/avx512f-builtins.c
===
--- cfe/trunk/test/CodeGen/avx512f-builtins.c
+++ cfe/trunk/test/CodeGen/avx512f-builtins.c
@@ -6251,7 +6251,7 @@
 
 __m512i test_mm512_stream_load_si512(void *__P) {
   // CHECK-LABEL: @test_mm512_stream_load_si512
-  // CHECK: @llvm.x86.avx512.movntdqa
+  // CHECK: load <8 x i64>, <8 x i64>* %{{.*}}, align 64, !nontemporal
   return _mm512_stream_load_si512(__P); 
 }
 
Index: cfe/trunk/test/CodeGen/avx2-builtins.c
===
--- cfe/trunk/test/CodeGen/avx2-builtins.c
+++ cfe/trunk/test/CodeGen/avx2-builtins.c
@@ -1117,7 +1117,7 @@
 
 __m256i test_mm256_stream_load_si256(__m256i const *a) {
   // CHECK-LABEL: test_mm256_stream_load_si256
-  // CHECK: call <4 x i64> @llvm.x86.avx2.movntdqa(i8* %{{.*}})
+  // CHECK: load <4 x i64>, <4 x i64>* %{{.*}}, align 32, !nontemporal
   return _mm256_stream_load_si256(a);
 }
 
Index: cfe/trunk/test/CodeGen/sse41-builtins.c
===
--- cfe/trunk/test/CodeGen/sse41-builtins.c
+++ cfe/trunk/test/CodeGen/sse41-builtins.c
@@ -354,7 +354,7 @@
 
 __m128i test_mm_stream_load_si128(__m128i const *a) {
   // CHECK-LABEL: test_mm_stream_load_si128
-  // CHECK: call <2 x i64> @llvm.x86.sse41.movntdqa(i8* %{{.*}})
+  // CHECK: load <2 x i64>, <2 x i64>* %{{.*}}, align 16, !nontemporal
   return _mm_stream_load_si128(a);
 }
 
Index: cfe/trunk/lib/Headers/smmintrin.h
===
--- cfe/trunk/lib/Headers/smmintrin.h
+++ cfe/trunk/lib/Headers/smmintrin.h
@@ -691,7 +691,7 @@
 static __inline__  __m128i __DEFAULT_FN_ATTRS
 _mm_stream_load_si128 (__m128i const *__V)
 {
-  return (__m128i) __builtin_ia32_movntdqa ((const __v2di *) __V);
+  return (__m128i) __builtin_nontemporal_load ((const __v2di *) __V);
 }
 
 /* SSE4 Packed Integer Min/Max Instructions.  */
Index: cfe/trunk/lib/Headers/avx512fintrin.h
===
--- cfe/trunk/lib/Headers/avx512fintrin.h
+++ cfe/trunk/lib/Headers/avx512fintrin.h
@@ -8931,7 +8931,7 @@
 static __inline__ __m512i __DEFAULT_FN_ATTRS
 _mm512_stream_load_si512 (void *__P)
 {
-  return __builtin_ia32_movntdqa512 ((__v8di *)__P);
+  return (__m512i) __builtin_nontemporal_load((const __v8di *)__P);
 }
 
 static __inline__ void __DEFAULT_FN_ATTRS
Index: cfe/trunk/lib/Headers/avx2intrin.h
===
--- cfe/trunk/lib/Headers/avx2intrin.h
+++ cfe/trunk/lib/Headers/avx2intrin.h
@@ -832,7 +832,7 @@
 static __inline__ __m256i __DEFAULT_FN_ATTRS
 _mm256_stream_load_si256(__m256i const *__V)
 {
-  return (__m256i)__builtin_ia32_movntdqa256((const __v4di *)__V);
+  return (__m256i)__builtin_nontemporal_load((const __v4di *)__V);
 }
 
 static __inline__ __m128 __DEFAULT_FN_ATTRS
Index: cfe/trunk/include/clang/Basic/BuiltinsX86.def
===
--- cfe/trunk/include/clang/Basic/BuiltinsX86.def
+++ cfe/trunk/include/clang/Basic/BuiltinsX86.def
@@ -391,7 +391,6 @@
 TARGET_BUILTIN(__builtin_ia32_roundpd, "V2dV2dIi", "", "sse4.1")
 TARGET_BUILTIN(__builtin_ia32_dpps, "V4fV4fV4fIc", "", "sse4.1")
 TARGET_BUILTIN(__builtin_ia32_dppd, "V2dV2dV2dIc", "", "sse4.1")
-TARGET_BUILTIN(__builtin_ia32_movntdqa, "V2LLiV2LLiC*", "", "sse4.1")
 TARGET_BUILTIN(__builtin_ia32_ptestz128, "iV2LLiV2LLi", "", "sse4.1")
 TARGET_BUILTIN(__builtin_ia32_ptestc128, "iV2LLiV2LLi", "", "sse4.1")
 TARGET_BUILTIN(__builtin_ia32_ptestnzc128, "iV2LLiV2LLi", "", "sse4.1")
@@ -576,7 +575,6 @@
 TARGET_BUILTIN(__builtin_ia32_psrld256, "V8iV8iV4i", "", "avx2")
 TARGET_BUILTIN(__builtin_ia32_psrlqi256, "V4LLiV4LLii", "", "avx2")
 TARGET_BUILTIN(__builtin_ia32_psrlq256, "V4LLiV4LLiV2LLi", "", "avx2")
-TARGET_BUILTIN(__builtin_ia32_movntdqa256, "V4LLiV4LLiC*", "", "avx2")
 TARGET_BUILTIN(__builtin_ia32_permvarsi256, "V8iV8iV8i", "", "avx2")
 TARGET_BUILTIN(__builtin_ia32_permvarsf256, "V8fV8fV8i", "", "avx2")
 TARGET_BUILTIN(__builtin_ia32_permti256, "V4LLiV4LLiV4LLiIc", "", "avx2")
@@ -1747,7 +1745,6 @@
 TARGET_BUILTIN(__builtin_ia32_kunpckhi, "UsUsUs","","avx512f")
 TARGET_BUILTIN(__builtin_ia32_kxnorhi, "UsUsUs","","avx512f")
 

[PATCH] D31766: [Clang][X86][SSE] Update MOVNTDQA non-temporal loads to generic implementation

2017-04-10 Thread Craig Topper via Phabricator via cfe-commits
craig.topper accepted this revision.
craig.topper added a comment.
This revision is now accepted and ready to land.

LGTM


Repository:
  rL LLVM

https://reviews.llvm.org/D31766



___
cfe-commits mailing list
cfe-commits@lists.llvm.org
http://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D31766: [Clang][X86][SSE] Update MOVNTDQA non-temporal loads to generic implementation

2017-04-06 Thread Simon Pilgrim via Phabricator via cfe-commits
RKSimon created this revision.

MOVNTDQA non-temporal aligned vector loads can be correctly represented using 
generic builtin loads, allowing us to remove the existing x86 intrinsics.

The LLVM companion patch will be published shortly.


Repository:
  rL LLVM

https://reviews.llvm.org/D31766

Files:
  include/clang/Basic/BuiltinsX86.def
  lib/Headers/avx2intrin.h
  lib/Headers/avx512fintrin.h
  lib/Headers/smmintrin.h
  test/CodeGen/avx2-builtins.c
  test/CodeGen/avx512f-builtins.c
  test/CodeGen/sse41-builtins.c

Index: test/CodeGen/sse41-builtins.c
===
--- test/CodeGen/sse41-builtins.c
+++ test/CodeGen/sse41-builtins.c
@@ -354,7 +354,7 @@
 
 __m128i test_mm_stream_load_si128(__m128i const *a) {
   // CHECK-LABEL: test_mm_stream_load_si128
-  // CHECK: call <2 x i64> @llvm.x86.sse41.movntdqa(i8* %{{.*}})
+  // CHECK: load <2 x i64>, <2 x i64>* %{{.*}}, align 16, !nontemporal
   return _mm_stream_load_si128(a);
 }
 
Index: test/CodeGen/avx512f-builtins.c
===
--- test/CodeGen/avx512f-builtins.c
+++ test/CodeGen/avx512f-builtins.c
@@ -6251,7 +6251,7 @@
 
 __m512i test_mm512_stream_load_si512(void *__P) {
   // CHECK-LABEL: @test_mm512_stream_load_si512
-  // CHECK: @llvm.x86.avx512.movntdqa
+  // CHECK: load <8 x i64>, <8 x i64>* %{{.*}}, align 64, !nontemporal
   return _mm512_stream_load_si512(__P); 
 }
 
Index: test/CodeGen/avx2-builtins.c
===
--- test/CodeGen/avx2-builtins.c
+++ test/CodeGen/avx2-builtins.c
@@ -1117,7 +1117,7 @@
 
 __m256i test_mm256_stream_load_si256(__m256i const *a) {
   // CHECK-LABEL: test_mm256_stream_load_si256
-  // CHECK: call <4 x i64> @llvm.x86.avx2.movntdqa(i8* %{{.*}})
+  // CHECK: load <4 x i64>, <4 x i64>* %{{.*}}, align 32, !nontemporal
   return _mm256_stream_load_si256(a);
 }
 
Index: lib/Headers/smmintrin.h
===
--- lib/Headers/smmintrin.h
+++ lib/Headers/smmintrin.h
@@ -691,7 +691,7 @@
 static __inline__  __m128i __DEFAULT_FN_ATTRS
 _mm_stream_load_si128 (__m128i const *__V)
 {
-  return (__m128i) __builtin_ia32_movntdqa ((const __v2di *) __V);
+  return (__m128i) __builtin_nontemporal_load ((const __v2di *) __V);
 }
 
 /* SSE4 Packed Integer Min/Max Instructions.  */
Index: lib/Headers/avx512fintrin.h
===
--- lib/Headers/avx512fintrin.h
+++ lib/Headers/avx512fintrin.h
@@ -8931,7 +8931,7 @@
 static __inline__ __m512i __DEFAULT_FN_ATTRS
 _mm512_stream_load_si512 (void *__P)
 {
-  return __builtin_ia32_movntdqa512 ((__v8di *)__P);
+  return (__m512i) __builtin_nontemporal_load((const __v8di *)__P);
 }
 
 static __inline__ void __DEFAULT_FN_ATTRS
Index: lib/Headers/avx2intrin.h
===
--- lib/Headers/avx2intrin.h
+++ lib/Headers/avx2intrin.h
@@ -832,7 +832,7 @@
 static __inline__ __m256i __DEFAULT_FN_ATTRS
 _mm256_stream_load_si256(__m256i const *__V)
 {
-  return (__m256i)__builtin_ia32_movntdqa256((const __v4di *)__V);
+  return (__m256i)__builtin_nontemporal_load((const __v4di *)__V);
 }
 
 static __inline__ __m128 __DEFAULT_FN_ATTRS
Index: include/clang/Basic/BuiltinsX86.def
===
--- include/clang/Basic/BuiltinsX86.def
+++ include/clang/Basic/BuiltinsX86.def
@@ -391,7 +391,6 @@
 TARGET_BUILTIN(__builtin_ia32_roundpd, "V2dV2dIi", "", "sse4.1")
 TARGET_BUILTIN(__builtin_ia32_dpps, "V4fV4fV4fIc", "", "sse4.1")
 TARGET_BUILTIN(__builtin_ia32_dppd, "V2dV2dV2dIc", "", "sse4.1")
-TARGET_BUILTIN(__builtin_ia32_movntdqa, "V2LLiV2LLiC*", "", "sse4.1")
 TARGET_BUILTIN(__builtin_ia32_ptestz128, "iV2LLiV2LLi", "", "sse4.1")
 TARGET_BUILTIN(__builtin_ia32_ptestc128, "iV2LLiV2LLi", "", "sse4.1")
 TARGET_BUILTIN(__builtin_ia32_ptestnzc128, "iV2LLiV2LLi", "", "sse4.1")
@@ -576,7 +575,6 @@
 TARGET_BUILTIN(__builtin_ia32_psrld256, "V8iV8iV4i", "", "avx2")
 TARGET_BUILTIN(__builtin_ia32_psrlqi256, "V4LLiV4LLii", "", "avx2")
 TARGET_BUILTIN(__builtin_ia32_psrlq256, "V4LLiV4LLiV2LLi", "", "avx2")
-TARGET_BUILTIN(__builtin_ia32_movntdqa256, "V4LLiV4LLiC*", "", "avx2")
 TARGET_BUILTIN(__builtin_ia32_permvarsi256, "V8iV8iV8i", "", "avx2")
 TARGET_BUILTIN(__builtin_ia32_permvarsf256, "V8fV8fV8i", "", "avx2")
 TARGET_BUILTIN(__builtin_ia32_permti256, "V4LLiV4LLiV4LLiIc", "", "avx2")
@@ -1747,7 +1745,6 @@
 TARGET_BUILTIN(__builtin_ia32_kunpckhi, "UsUsUs","","avx512f")
 TARGET_BUILTIN(__builtin_ia32_kxnorhi, "UsUsUs","","avx512f")
 TARGET_BUILTIN(__builtin_ia32_kxorhi, "UsUsUs","","avx512f")
-TARGET_BUILTIN(__builtin_ia32_movntdqa512, "V8LLiV8LLi*","","avx512f")
 TARGET_BUILTIN(__builtin_ia32_palignr512_mask, "V64cV64cV64cIiV64cULLi","","avx512bw")
 TARGET_BUILTIN(__builtin_ia32_dbpsadbw128_mask, "V8sV16cV16cIiV8sUc","","avx512bw,avx512vl")