Re: [PATCH] D12835: [X86][SSE] Replace 128-bit SSE41 PMOVSX intrinsics with native IR

2015-09-19 Thread Simon Pilgrim via cfe-commits
This revision was automatically updated to reflect the committed changes.
Closed by commit rL248092: [X86][SSE] Replace 128-bit SSE41 PMOVSX intrinsics 
with native IR (authored by RKSimon).

Changed prior to commit:
  http://reviews.llvm.org/D12835?vs=34646=35167#toc

Repository:
  rL LLVM

http://reviews.llvm.org/D12835

Files:
  cfe/trunk/include/clang/Basic/BuiltinsX86.def
  cfe/trunk/lib/Headers/smmintrin.h
  cfe/trunk/test/CodeGen/builtins-x86.c
  cfe/trunk/test/CodeGen/sse41-builtins.c

Index: cfe/trunk/include/clang/Basic/BuiltinsX86.def
===
--- cfe/trunk/include/clang/Basic/BuiltinsX86.def
+++ cfe/trunk/include/clang/Basic/BuiltinsX86.def
@@ -376,12 +376,6 @@
 TARGET_BUILTIN(__builtin_ia32_pminsd128, "V4iV4iV4i", "", "sse4.1")
 TARGET_BUILTIN(__builtin_ia32_pminud128, "V4iV4iV4i", "", "sse4.1")
 TARGET_BUILTIN(__builtin_ia32_pminuw128, "V8sV8sV8s", "", "sse4.1")
-TARGET_BUILTIN(__builtin_ia32_pmovsxbd128, "V4iV16c", "", "sse4.1")
-TARGET_BUILTIN(__builtin_ia32_pmovsxbq128, "V2LLiV16c", "", "sse4.1")
-TARGET_BUILTIN(__builtin_ia32_pmovsxbw128, "V8sV16c", "", "sse4.1")
-TARGET_BUILTIN(__builtin_ia32_pmovsxdq128, "V2LLiV4i", "", "sse4.1")
-TARGET_BUILTIN(__builtin_ia32_pmovsxwd128, "V4iV8s", "", "sse4.1")
-TARGET_BUILTIN(__builtin_ia32_pmovsxwq128, "V2LLiV8s", "", "sse4.1")
 TARGET_BUILTIN(__builtin_ia32_pmovzxbd128, "V4iV16c", "", "sse4.1")
 TARGET_BUILTIN(__builtin_ia32_pmovzxbq128, "V2LLiV16c", "", "sse4.1")
 TARGET_BUILTIN(__builtin_ia32_pmovzxbw128, "V8sV16c", "", "sse4.1")
Index: cfe/trunk/test/CodeGen/builtins-x86.c
===
--- cfe/trunk/test/CodeGen/builtins-x86.c
+++ cfe/trunk/test/CodeGen/builtins-x86.c
@@ -372,12 +372,6 @@
   tmp_V4i = __builtin_ia32_pminsd128(tmp_V4i, tmp_V4i);
   tmp_V4i = __builtin_ia32_pminud128(tmp_V4i, tmp_V4i);
   tmp_V8s = __builtin_ia32_pminuw128(tmp_V8s, tmp_V8s);
-  tmp_V4i = __builtin_ia32_pmovsxbd128(tmp_V16c);
-  tmp_V2LLi = __builtin_ia32_pmovsxbq128(tmp_V16c);
-  tmp_V8s = __builtin_ia32_pmovsxbw128(tmp_V16c);
-  tmp_V2LLi = __builtin_ia32_pmovsxdq128(tmp_V4i);
-  tmp_V4i = __builtin_ia32_pmovsxwd128(tmp_V8s);
-  tmp_V2LLi = __builtin_ia32_pmovsxwq128(tmp_V8s);
   tmp_V4i = __builtin_ia32_pmovzxbd128(tmp_V16c);
   tmp_V2LLi = __builtin_ia32_pmovzxbq128(tmp_V16c);
   tmp_V8s = __builtin_ia32_pmovzxbw128(tmp_V16c);
Index: cfe/trunk/test/CodeGen/sse41-builtins.c
===
--- cfe/trunk/test/CodeGen/sse41-builtins.c
+++ cfe/trunk/test/CodeGen/sse41-builtins.c
@@ -86,42 +86,42 @@
 
 __m128i test_mm_cvtepi8_epi16(__m128i a) {
   // CHECK-LABEL: test_mm_cvtepi8_epi16
-  // CHECK: call <8 x i16> @llvm.x86.sse41.pmovsxbw(<16 x i8> {{.*}})
+  // CHECK: sext <8 x i8> {{.*}} to <8 x i16>
   // CHECK-ASM: pmovsxbw %xmm{{.*}}, %xmm{{.*}}
   return _mm_cvtepi8_epi16(a);
 }
 
 __m128i test_mm_cvtepi8_epi32(__m128i a) {
   // CHECK-LABEL: test_mm_cvtepi8_epi32
-  // CHECK: call <4 x i32> @llvm.x86.sse41.pmovsxbd(<16 x i8> {{.*}})
+  // CHECK: sext <4 x i8> {{.*}} to <4 x i32>
   // CHECK-ASM: pmovsxbd %xmm{{.*}}, %xmm{{.*}}
   return _mm_cvtepi8_epi32(a);
 }
 
 __m128i test_mm_cvtepi8_epi64(__m128i a) {
   // CHECK-LABEL: test_mm_cvtepi8_epi64
-  // CHECK: call <2 x i64> @llvm.x86.sse41.pmovsxbq(<16 x i8> {{.*}})
+  // CHECK: sext <2 x i8> {{.*}} to <2 x i64>
   // CHECK-ASM: pmovsxbq %xmm{{.*}}, %xmm{{.*}}
   return _mm_cvtepi8_epi64(a);
 }
 
 __m128i test_mm_cvtepi16_epi32(__m128i a) {
   // CHECK-LABEL: test_mm_cvtepi16_epi32
-  // CHECK: call <4 x i32> @llvm.x86.sse41.pmovsxwd(<8 x i16> {{.*}})
+  // CHECK: sext <4 x i16> {{.*}} to <4 x i32>
   // CHECK-ASM: pmovsxwd %xmm{{.*}}, %xmm{{.*}}
   return _mm_cvtepi16_epi32(a);
 }
 
 __m128i test_mm_cvtepi16_epi64(__m128i a) {
   // CHECK-LABEL: test_mm_cvtepi16_epi64
-  // CHECK: call <2 x i64> @llvm.x86.sse41.pmovsxwq(<8 x i16> {{.*}})
+  // CHECK: sext <2 x i16> {{.*}} to <2 x i64>
   // CHECK-ASM: pmovsxwq %xmm{{.*}}, %xmm{{.*}}
   return _mm_cvtepi16_epi64(a);
 }
 
 __m128i test_mm_cvtepi32_epi64(__m128i a) {
   // CHECK-LABEL: test_mm_cvtepi32_epi64
-  // CHECK: call <2 x i64> @llvm.x86.sse41.pmovsxdq(<4 x i32> {{.*}})
+  // CHECK: sext <2 x i32> {{.*}} to <2 x i64>
   // CHECK-ASM: pmovsxdq %xmm{{.*}}, %xmm{{.*}}
   return _mm_cvtepi32_epi64(a);
 }
Index: cfe/trunk/lib/Headers/smmintrin.h
===
--- cfe/trunk/lib/Headers/smmintrin.h
+++ cfe/trunk/lib/Headers/smmintrin.h
@@ -286,37 +286,37 @@
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_cvtepi8_epi16(__m128i __V)
 {
-  return (__m128i) __builtin_ia32_pmovsxbw128((__v16qi) __V);
+  return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v16qi)__V, (__v16qi)__V, 0, 1, 2, 3, 4, 5, 6, 7), __v8hi);
 }
 
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_cvtepi8_epi32(__m128i __V)
 {
-  return (__m128i) 

Re: [PATCH] D12835: [X86][SSE] Replace 128-bit SSE41 PMOVSX intrinsics with native IR

2015-09-15 Thread Quentin Colombet via cfe-commits
qcolombet accepted this revision.
qcolombet added a comment.
This revision is now accepted and ready to land.

LGTM.

Thanks,
-Quentin


Repository:
  rL LLVM

http://reviews.llvm.org/D12835



___
cfe-commits mailing list
cfe-commits@lists.llvm.org
http://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D12835: [X86][SSE] Replace 128-bit SSE41 PMOVSX intrinsics with native IR

2015-09-13 Thread Simon Pilgrim via cfe-commits
RKSimon created this revision.
RKSimon added reviewers: ab, qcolombet, craig.topper, spatel.
RKSimon added a subscriber: cfe-commits.
RKSimon set the repository for this revision to rL LLVM.

128-bit vector integer sign extensions correctly lower to the pmovsx 
instructions even for debug builds.

This patch removes the builtins and reimplements the _mm_cvtepi*_epi* 
intrinsics using __builtin_shufflevector (to extract the bottom most subvector) 
and __builtin_convertvector (to actually perform the sign extension).

After this I'll add a patch for the removal/upgrade of the pmovsx intrinsics on 
the llvm side.

Note: AVX2 256-bit vector integer sign extensions currently don't lower 
correctly in debug builds if they need the __builtin_shufflevector stage 
(although I could add the ones that don't need this straightaway).

Repository:
  rL LLVM

http://reviews.llvm.org/D12835

Files:
  include/clang/Basic/BuiltinsX86.def
  lib/Headers/smmintrin.h
  test/CodeGen/builtins-x86.c
  test/CodeGen/sse41-builtins.c

Index: test/CodeGen/sse41-builtins.c
===
--- test/CodeGen/sse41-builtins.c
+++ test/CodeGen/sse41-builtins.c
@@ -86,42 +86,42 @@
 
 __m128i test_mm_cvtepi8_epi16(__m128i a) {
   // CHECK-LABEL: test_mm_cvtepi8_epi16
-  // CHECK: call <8 x i16> @llvm.x86.sse41.pmovsxbw(<16 x i8> {{.*}})
+  // CHECK: sext <8 x i8> {{.*}} to <8 x i16>
   // CHECK-ASM: pmovsxbw %xmm{{.*}}, %xmm{{.*}}
   return _mm_cvtepi8_epi16(a);
 }
 
 __m128i test_mm_cvtepi8_epi32(__m128i a) {
   // CHECK-LABEL: test_mm_cvtepi8_epi32
-  // CHECK: call <4 x i32> @llvm.x86.sse41.pmovsxbd(<16 x i8> {{.*}})
+  // CHECK: sext <4 x i8> {{.*}} to <4 x i32>
   // CHECK-ASM: pmovsxbd %xmm{{.*}}, %xmm{{.*}}
   return _mm_cvtepi8_epi32(a);
 }
 
 __m128i test_mm_cvtepi8_epi64(__m128i a) {
   // CHECK-LABEL: test_mm_cvtepi8_epi64
-  // CHECK: call <2 x i64> @llvm.x86.sse41.pmovsxbq(<16 x i8> {{.*}})
+  // CHECK: sext <2 x i8> {{.*}} to <2 x i64>
   // CHECK-ASM: pmovsxbq %xmm{{.*}}, %xmm{{.*}}
   return _mm_cvtepi8_epi64(a);
 }
 
 __m128i test_mm_cvtepi16_epi32(__m128i a) {
   // CHECK-LABEL: test_mm_cvtepi16_epi32
-  // CHECK: call <4 x i32> @llvm.x86.sse41.pmovsxwd(<8 x i16> {{.*}})
+  // CHECK: sext <4 x i16> {{.*}} to <4 x i32>
   // CHECK-ASM: pmovsxwd %xmm{{.*}}, %xmm{{.*}}
   return _mm_cvtepi16_epi32(a);
 }
 
 __m128i test_mm_cvtepi16_epi64(__m128i a) {
   // CHECK-LABEL: test_mm_cvtepi16_epi64
-  // CHECK: call <2 x i64> @llvm.x86.sse41.pmovsxwq(<8 x i16> {{.*}})
+  // CHECK: sext <2 x i16> {{.*}} to <2 x i64>
   // CHECK-ASM: pmovsxwq %xmm{{.*}}, %xmm{{.*}}
   return _mm_cvtepi16_epi64(a);
 }
 
 __m128i test_mm_cvtepi32_epi64(__m128i a) {
   // CHECK-LABEL: test_mm_cvtepi32_epi64
-  // CHECK: call <2 x i64> @llvm.x86.sse41.pmovsxdq(<4 x i32> {{.*}})
+  // CHECK: sext <2 x i32> {{.*}} to <2 x i64>
   // CHECK-ASM: pmovsxdq %xmm{{.*}}, %xmm{{.*}}
   return _mm_cvtepi32_epi64(a);
 }
Index: test/CodeGen/builtins-x86.c
===
--- test/CodeGen/builtins-x86.c
+++ test/CodeGen/builtins-x86.c
@@ -372,12 +372,6 @@
   tmp_V4i = __builtin_ia32_pminsd128(tmp_V4i, tmp_V4i);
   tmp_V4i = __builtin_ia32_pminud128(tmp_V4i, tmp_V4i);
   tmp_V8s = __builtin_ia32_pminuw128(tmp_V8s, tmp_V8s);
-  tmp_V4i = __builtin_ia32_pmovsxbd128(tmp_V16c);
-  tmp_V2LLi = __builtin_ia32_pmovsxbq128(tmp_V16c);
-  tmp_V8s = __builtin_ia32_pmovsxbw128(tmp_V16c);
-  tmp_V2LLi = __builtin_ia32_pmovsxdq128(tmp_V4i);
-  tmp_V4i = __builtin_ia32_pmovsxwd128(tmp_V8s);
-  tmp_V2LLi = __builtin_ia32_pmovsxwq128(tmp_V8s);
   tmp_V4i = __builtin_ia32_pmovzxbd128(tmp_V16c);
   tmp_V2LLi = __builtin_ia32_pmovzxbq128(tmp_V16c);
   tmp_V8s = __builtin_ia32_pmovzxbw128(tmp_V16c);
Index: lib/Headers/smmintrin.h
===
--- lib/Headers/smmintrin.h
+++ lib/Headers/smmintrin.h
@@ -286,37 +286,37 @@
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_cvtepi8_epi16(__m128i __V)
 {
-  return (__m128i) __builtin_ia32_pmovsxbw128((__v16qi) __V);
+  return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v16qi)__V, (__v16qi)__V, 0, 1, 2, 3, 4, 5, 6, 7), __v8hi);
 }
 
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_cvtepi8_epi32(__m128i __V)
 {
-  return (__m128i) __builtin_ia32_pmovsxbd128((__v16qi) __V);
+  return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v16qi)__V, (__v16qi)__V, 0, 1, 2, 3), __v4si);
 }
 
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_cvtepi8_epi64(__m128i __V)
 {
-  return (__m128i) __builtin_ia32_pmovsxbq128((__v16qi) __V);
+  return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v16qi)__V, (__v16qi)__V, 0, 1), __v2di);
 }
 
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_cvtepi16_epi32(__m128i __V)
 {
-  return (__m128i) __builtin_ia32_pmovsxwd128((__v8hi) __V);
+  return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v8hi)__V,