[PATCH] D37892: [X86] Use native shuffle vector for the perm2f128 intrinsics

2017-09-15 Thread Simon Pilgrim via Phabricator via cfe-commits
RKSimon added a comment.

What about _mm256_permute2x128_si256 as well?


https://reviews.llvm.org/D37892




[PATCH] D37892: [X86] Use native shuffle vector for the perm2f128 intrinsics

2017-09-15 Thread Simon Pilgrim via Phabricator via cfe-commits
RKSimon added a comment.

Also, there currently isn't any testing of the zero vector case.
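
For reference, a minimal illustration of the zero-vector case (not taken from the patch; the function name is made up): an immediate whose low nibble has bit 3 set, e.g. 0x38, selects the zero vector for the low 128-bit lane of the result.

  #include <immintrin.h>

  // Illustrative only (requires AVX2): bit 3 of the low nibble of 0x38 selects
  // the zero vector for lane 0 of the result; the high nibble (0x3) selects the
  // upper 128 bits of b for lane 1, so a does not affect the result here.
  __m256i zero_low_lane(__m256i a, __m256i b) {
    return _mm256_permute2x128_si256(a, b, 0x38);
  }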


https://reviews.llvm.org/D37892




[PATCH] D37892: [X86] Use native shuffle vector for the perm2f128 intrinsics

2017-09-15 Thread Craig Topper via Phabricator via cfe-commits
craig.topper updated this revision to Diff 115427.
craig.topper added a comment.

Convert the AVX2 integer intrinsic as well.


https://reviews.llvm.org/D37892

Files:
  lib/CodeGen/CGBuiltin.cpp
  test/CodeGen/avx-builtins.c
  test/CodeGen/avx2-builtins.c


Index: test/CodeGen/avx2-builtins.c
===
--- test/CodeGen/avx2-builtins.c
+++ test/CodeGen/avx2-builtins.c
@@ -907,8 +907,8 @@
 
 __m256i test_mm256_permute2x128_si256(__m256i a, __m256i b) {
   // CHECK-LABEL: test_mm256_permute2x128_si256
-  // CHECK: call <4 x i64> @llvm.x86.avx2.vperm2i128(<4 x i64> %{{.*}}, <4 x i64> %{{.*}}, i8 49)
-  return _mm256_permute2x128_si256(a, b, 0x31);
+  // CHECK: shufflevector <4 x i64> zeroinitializer, <4 x i64> %{{.*}}, <4 x i32> <i32 0, i32 1, i32 6, i32 7>
+  return _mm256_permute2x128_si256(a, b, 0x38);
 }
 
 __m256i test_mm256_permute4x64_epi64(__m256i a) {
Index: test/CodeGen/avx-builtins.c
===
--- test/CodeGen/avx-builtins.c
+++ test/CodeGen/avx-builtins.c
@@ -678,19 +678,19 @@
 
 __m256d test_mm256_permute2f128_pd(__m256d A, __m256d B) {
   // CHECK-LABEL: test_mm256_permute2f128_pd
-  // CHECK: call <4 x double> @llvm.x86.avx.vperm2f128.pd.256(<4 x double> %{{.*}}, <4 x double> %{{.*}}, i8 49)
+  // CHECK: shufflevector <4 x double> %{{.*}}, <4 x double> %{{.*}}, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
   return _mm256_permute2f128_pd(A, B, 0x31);
 }
 
 __m256 test_mm256_permute2f128_ps(__m256 A, __m256 B) {
   // CHECK-LABEL: test_mm256_permute2f128_ps
-  // CHECK: call <8 x float> @llvm.x86.avx.vperm2f128.ps.256(<8 x float> %{{.*}}, <8 x float> %{{.*}}, i8 19)
+  // CHECK: shufflevector <8 x float> %{{.*}}, <8 x float> %{{.*}}, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15>
   return _mm256_permute2f128_ps(A, B, 0x13);
 }
 
 __m256i test_mm256_permute2f128_si256(__m256i A, __m256i B) {
   // CHECK-LABEL: test_mm256_permute2f128_si256
-  // CHECK: call <8 x i32> @llvm.x86.avx.vperm2f128.si.256(<8 x i32> %{{.*}}, <8 x i32> %{{.*}}, i8 32)
+  // CHECK: shufflevector <8 x i32> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
   return _mm256_permute2f128_si256(A, B, 0x20);
 }
 
Index: lib/CodeGen/CGBuiltin.cpp
===
--- lib/CodeGen/CGBuiltin.cpp
+++ lib/CodeGen/CGBuiltin.cpp
@@ -7923,6 +7923,45 @@
 return EmitX86Select(*this, Ops[4], Align, Ops[3]);
   }
 
+  case X86::BI__builtin_ia32_vperm2f128_pd256:
+  case X86::BI__builtin_ia32_vperm2f128_ps256:
+  case X86::BI__builtin_ia32_vperm2f128_si256:
+  case X86::BI__builtin_ia32_permti256: {
+    unsigned Imm = cast<llvm::ConstantInt>(Ops[2])->getZExtValue();
+    unsigned NumElts = Ops[0]->getType()->getVectorNumElements();
+
+    // This takes a very simple approach since there are two lanes and a
+    // shuffle can have 2 inputs. So we reserve the first input for the first
+    // lane and the second input for the second lane. This may result in
+    // duplicate sources, but this can be dealt with in the backend.
+
+    Value *OutOps[2];
+    uint32_t Indices[8];
+    for (unsigned l = 0; l != 2; ++l) {
+      // Determine the source for this lane.
+      if (Imm & (1 << ((l * 4) + 3)))
+        OutOps[l] = llvm::ConstantAggregateZero::get(Ops[0]->getType());
+      else if (Imm & (1 << ((l * 4) + 1)))
+        OutOps[l] = Ops[1];
+      else
+        OutOps[l] = Ops[0];
+
+      for (unsigned i = 0; i != NumElts/2; ++i) {
+        // Start with ith element of the source for this lane.
+        unsigned Idx = (l * NumElts) + i;
+        // If bit 0 of the immediate half is set, switch to the high half of
+        // the source.
+        if (Imm & (1 << (l * 4)))
+          Idx += NumElts/2;
+        Indices[(l * (NumElts/2)) + i] = Idx;
+      }
+    }
+
+    return Builder.CreateShuffleVector(OutOps[0], OutOps[1],
+                                       makeArrayRef(Indices, NumElts),
+                                       "vperm");
+  }
+
   case X86::BI__builtin_ia32_movnti:
   case X86::BI__builtin_ia32_movnti64:
   case X86::BI__builtin_ia32_movntsd:
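
For readers tracing the immediate decoding in the comment above, here is a small standalone sketch (an editorial illustration, not part of the patch) that mirrors the loop and prints the lane sources and shuffle mask produced for the 0x31 immediate used by test_mm256_permute2f128_pd:

  #include <cstdio>

  int main() {
    const unsigned NumElts = 4;   // <4 x double>
    const unsigned Imm = 0x31;    // _mm256_permute2f128_pd(A, B, 0x31)
    unsigned Indices[8];
    for (unsigned l = 0; l != 2; ++l) {
      // Bit 3 of the lane's nibble selects the zero vector, bit 1 selects the
      // second operand, otherwise the first operand is used.
      const char *Src = (Imm & (1u << (l * 4 + 3))) ? "zero vector"
                      : (Imm & (1u << (l * 4 + 1))) ? "second operand"
                                                    : "first operand";
      std::printf("lane %u reads from the %s\n", l, Src);
      for (unsigned i = 0; i != NumElts / 2; ++i) {
        unsigned Idx = l * NumElts + i;
        if (Imm & (1u << (l * 4)))  // bit 0 of the nibble: take the high half
          Idx += NumElts / 2;
        Indices[l * (NumElts / 2) + i] = Idx;
      }
    }
    for (unsigned i = 0; i != NumElts; ++i)
      std::printf("%u ", Indices[i]); // prints "2 3 6 7", matching the CHECK line
    std::printf("\n");
    return 0;
  }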


[PATCH] D37892: [X86] Use native shuffle vector for the perm2f128 intrinsics

2017-09-15 Thread Simon Pilgrim via Phabricator via cfe-commits
RKSimon accepted this revision.
RKSimon added a comment.
This revision is now accepted and ready to land.

LGTM - please can you update the avx-intrinsics-fast-isel.ll/avx2-intrinsics-fast-isel.ll cases to match the *-builtins.c tests as well (either now or if/when you add the intrinsics to autoupgrade).


https://reviews.llvm.org/D37892




[PATCH] D37892: [X86] Use native shuffle vector for the perm2f128 intrinsics

2017-09-15 Thread Phabricator via Phabricator via cfe-commits
This revision was automatically updated to reflect the committed changes.
Closed by commit rL313418: [X86] Use native shuffle vector for the perm2f128 intrinsics (authored by ctopper).

Changed prior to commit:
  https://reviews.llvm.org/D37892?vs=115427&id=115518#toc

Repository:
  rL LLVM

https://reviews.llvm.org/D37892

Files:
  cfe/trunk/lib/CodeGen/CGBuiltin.cpp
  cfe/trunk/test/CodeGen/avx-builtins.c
  cfe/trunk/test/CodeGen/avx2-builtins.c


Index: cfe/trunk/lib/CodeGen/CGBuiltin.cpp
===
--- cfe/trunk/lib/CodeGen/CGBuiltin.cpp
+++ cfe/trunk/lib/CodeGen/CGBuiltin.cpp
@@ -7923,6 +7923,45 @@
 return EmitX86Select(*this, Ops[4], Align, Ops[3]);
   }
 
+  case X86::BI__builtin_ia32_vperm2f128_pd256:
+  case X86::BI__builtin_ia32_vperm2f128_ps256:
+  case X86::BI__builtin_ia32_vperm2f128_si256:
+  case X86::BI__builtin_ia32_permti256: {
+    unsigned Imm = cast<llvm::ConstantInt>(Ops[2])->getZExtValue();
+    unsigned NumElts = Ops[0]->getType()->getVectorNumElements();
+
+    // This takes a very simple approach since there are two lanes and a
+    // shuffle can have 2 inputs. So we reserve the first input for the first
+    // lane and the second input for the second lane. This may result in
+    // duplicate sources, but this can be dealt with in the backend.
+
+    Value *OutOps[2];
+    uint32_t Indices[8];
+    for (unsigned l = 0; l != 2; ++l) {
+      // Determine the source for this lane.
+      if (Imm & (1 << ((l * 4) + 3)))
+        OutOps[l] = llvm::ConstantAggregateZero::get(Ops[0]->getType());
+      else if (Imm & (1 << ((l * 4) + 1)))
+        OutOps[l] = Ops[1];
+      else
+        OutOps[l] = Ops[0];
+
+      for (unsigned i = 0; i != NumElts/2; ++i) {
+        // Start with ith element of the source for this lane.
+        unsigned Idx = (l * NumElts) + i;
+        // If bit 0 of the immediate half is set, switch to the high half of
+        // the source.
+        if (Imm & (1 << (l * 4)))
+          Idx += NumElts/2;
+        Indices[(l * (NumElts/2)) + i] = Idx;
+      }
+    }
+
+    return Builder.CreateShuffleVector(OutOps[0], OutOps[1],
+                                       makeArrayRef(Indices, NumElts),
+                                       "vperm");
+  }
+
   case X86::BI__builtin_ia32_movnti:
   case X86::BI__builtin_ia32_movnti64:
   case X86::BI__builtin_ia32_movntsd:
Index: cfe/trunk/test/CodeGen/avx-builtins.c
===
--- cfe/trunk/test/CodeGen/avx-builtins.c
+++ cfe/trunk/test/CodeGen/avx-builtins.c
@@ -678,19 +678,19 @@
 
 __m256d test_mm256_permute2f128_pd(__m256d A, __m256d B) {
   // CHECK-LABEL: test_mm256_permute2f128_pd
-  // CHECK: call <4 x double> @llvm.x86.avx.vperm2f128.pd.256(<4 x double> %{{.*}}, <4 x double> %{{.*}}, i8 49)
+  // CHECK: shufflevector <4 x double> %{{.*}}, <4 x double> %{{.*}}, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
   return _mm256_permute2f128_pd(A, B, 0x31);
 }
 
 __m256 test_mm256_permute2f128_ps(__m256 A, __m256 B) {
   // CHECK-LABEL: test_mm256_permute2f128_ps
-  // CHECK: call <8 x float> @llvm.x86.avx.vperm2f128.ps.256(<8 x float> %{{.*}}, <8 x float> %{{.*}}, i8 19)
+  // CHECK: shufflevector <8 x float> %{{.*}}, <8 x float> %{{.*}}, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15>
   return _mm256_permute2f128_ps(A, B, 0x13);
 }
 
 __m256i test_mm256_permute2f128_si256(__m256i A, __m256i B) {
   // CHECK-LABEL: test_mm256_permute2f128_si256
-  // CHECK: call <8 x i32> @llvm.x86.avx.vperm2f128.si.256(<8 x i32> %{{.*}}, <8 x i32> %{{.*}}, i8 32)
+  // CHECK: shufflevector <8 x i32> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
   return _mm256_permute2f128_si256(A, B, 0x20);
 }
 
Index: cfe/trunk/test/CodeGen/avx2-builtins.c
===
--- cfe/trunk/test/CodeGen/avx2-builtins.c
+++ cfe/trunk/test/CodeGen/avx2-builtins.c
@@ -907,8 +907,8 @@
 
 __m256i test_mm256_permute2x128_si256(__m256i a, __m256i b) {
   // CHECK-LABEL: test_mm256_permute2x128_si256
-  // CHECK: call <4 x i64> @llvm.x86.avx2.vperm2i128(<4 x i64> %{{.*}}, <4 x i64> %{{.*}}, i8 49)
-  return _mm256_permute2x128_si256(a, b, 0x31);
+  // CHECK: shufflevector <4 x i64> zeroinitializer, <4 x i64> %{{.*}}, <4 x i32> <i32 0, i32 1, i32 6, i32 7>
+  return _mm256_permute2x128_si256(a, b, 0x38);
 }
 
 __m256i test_mm256_permute4x64_epi64(__m256i a) {


Index: cfe/trunk/lib/CodeGen/CGBuiltin.cpp
===
--- cfe/trunk/lib/CodeGen/CGBuiltin.cpp
+++ cfe/trunk/lib/CodeGen/CGBuiltin.cpp
@@ -7923,6 +7923,45 @@
 return EmitX86Select(*this, Ops[4], Align, Ops[3]);
   }
 
+  case X86::BI__builtin_ia32_vperm2f128_pd256:
+  case X86::BI__builtin_ia32_vperm2f128_ps256:
+  case X86::BI__builtin_ia32_vperm2f128_si256:
+  case X86::BI__builtin_ia32_permti256: {
+    unsigned Imm = cast<llvm::ConstantInt>(Ops[2])->getZExtValue();
+    unsigned NumElts = Ops[0]->getType()->getVectorNumElements();
+
+// This takes a very simple approach since there are two lanes a