[PATCH] D38191: [NVPTX] added match.{any, all}.sync instructions, intrinsics & builtins.

2017-09-25 Thread Artem Belevich via Phabricator via cfe-commits
tra created this revision.
Herald added subscribers: hiraditya, sanjoy, jholewinski.

https://reviews.llvm.org/D38191

Files:
  clang/include/clang/Basic/BuiltinsNVPTX.def
  clang/lib/CodeGen/CGBuiltin.cpp
  clang/lib/Headers/__clang_cuda_intrinsics.h
  clang/test/CodeGen/builtins-nvptx-ptx60.cu
  llvm/include/llvm/IR/IntrinsicsNVVM.td
  llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
  llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h
  llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
  llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
  llvm/test/CodeGen/NVPTX/match.ll

Index: llvm/test/CodeGen/NVPTX/match.ll
===
--- /dev/null
+++ llvm/test/CodeGen/NVPTX/match.ll
@@ -0,0 +1,117 @@
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_70 -mattr=+ptx60 | FileCheck %s
+
+declare i32 @llvm.nvvm.match.any.sync.i32(i32, i32)
+declare i64 @llvm.nvvm.match.any.sync.i64(i32, i64)
+
+; CHECK-LABEL: .func{{.*}}match.any.sync.i32
+define i32 @match.any.sync.i32(i32 %mask, i32 %value) {
+  ; CHECK: ld.param.u32 	[[MASK:%r[0-9]+]], [match.any.sync.i32_param_0];
+  ; CHECK: ld.param.u32 	[[VALUE:%r[0-9]+]], [match.any.sync.i32_param_1];
+
+  ; CHECK:  match.any.sync.b32  [[V0:%r[0-9]+]], [[VALUE]], [[MASK]];
+  %v0 = call i32 @llvm.nvvm.match.any.sync.i32(i32 %mask, i32 %value)
+  ; CHECK:  match.any.sync.b32  [[V1:%r[0-9]+]], [[VALUE]], 1;
+  %v1 = call i32 @llvm.nvvm.match.any.sync.i32(i32 1, i32 %value)
+  ; CHECK:  match.any.sync.b32  [[V2:%r[0-9]+]], 2, [[MASK]];
+  %v2 = call i32 @llvm.nvvm.match.any.sync.i32(i32 %mask, i32 2)
+  ; CHECK:  match.any.sync.b32  [[V3:%r[0-9]+]], 4, 3;
+  %v3 = call i32 @llvm.nvvm.match.any.sync.i32(i32 3, i32 4)
+  %sum1 = add i32 %v0, %v1
+  %sum2 = add i32 %v2, %v3
+  %sum3 = add i32 %sum1, %sum2
+  ret i32 %sum3;
+}
+
+; CHECK-LABEL: .func{{.*}}match.any.sync.i64
+define i64 @match.any.sync.i64(i32 %mask, i64 %value) {
+  ; CHECK: ld.param.u32 	[[MASK:%r[0-9]+]], [match.any.sync.i64_param_0];
+  ; CHECK: ld.param.u64 	[[VALUE:%rd[0-9]+]], [match.any.sync.i64_param_1];
+
+  ; CHECK:  match.any.sync.b64  [[V0:%rd[0-9]+]], [[VALUE]], [[MASK]];
+  %v0 = call i64 @llvm.nvvm.match.any.sync.i64(i32 %mask, i64 %value)
+  ; CHECK:  match.any.sync.b64  [[V1:%rd[0-9]+]], [[VALUE]], 1;
+  %v1 = call i64 @llvm.nvvm.match.any.sync.i64(i32 1, i64 %value)
+  ; CHECK:  match.any.sync.b64  [[V2:%rd[0-9]+]], 2, [[MASK]];
+  %v2 = call i64 @llvm.nvvm.match.any.sync.i64(i32 %mask, i64 2)
+  ; CHECK:  match.any.sync.b64  [[V3:%rd[0-9]+]], 4, 3;
+  %v3 = call i64 @llvm.nvvm.match.any.sync.i64(i32 3, i64 4)
+  %sum1 = add i64 %v0, %v1
+  %sum2 = add i64 %v2, %v3
+  %sum3 = add i64 %sum1, %sum2
+  ret i64 %sum3;
+}
+
+declare {i32, i1} @llvm.nvvm.match.all.sync.i32p(i32, i32)
+declare {i64, i1} @llvm.nvvm.match.all.sync.i64p(i32, i64)
+
+; CHECK-LABEL: .func{{.*}}match.all.sync.i32p(
+define {i32,i1} @match.all.sync.i32p(i32 %mask, i32 %value) {
+  ; CHECK: ld.param.u32 	[[MASK:%r[0-9]+]], [match.all.sync.i32p_param_0];
+  ; CHECK: ld.param.u32 	[[VALUE:%r[0-9]+]], [match.all.sync.i32p_param_1];
+
+  ; CHECK:  match.all.sync.b32 {{%r[0-9]+\|%p[0-9]+}}, [[VALUE]], [[MASK]];
+  %r1 = call {i32, i1} @llvm.nvvm.match.all.sync.i32p(i32 %mask, i32 %value)
+  %v1 = extractvalue {i32, i1} %r1, 0
+  %p1 = extractvalue {i32, i1} %r1, 1
+
+  ; CHECK:  match.all.sync.b32 {{%r[0-9]+\|%p[0-9]+}}, 1, [[MASK]];
+  %r2 = call {i32, i1} @llvm.nvvm.match.all.sync.i32p(i32 %mask, i32 1)
+  %v2 = extractvalue {i32, i1} %r2, 0
+  %p2 = extractvalue {i32, i1} %r2, 1
+
+  ; CHECK:  match.all.sync.b32 {{%r[0-9]+\|%p[0-9]+}}, [[VALUE]], 2;
+  %r3 = call {i32, i1} @llvm.nvvm.match.all.sync.i32p(i32 2, i32 %value)
+  %v3 = extractvalue {i32, i1} %r3, 0
+  %p3 = extractvalue {i32, i1} %r3, 1
+
+  ; CHECK:  match.all.sync.b32 {{%r[0-9]+\|%p[0-9]+}}, 4, 3;
+  %r4 = call {i32, i1} @llvm.nvvm.match.all.sync.i32p(i32 3, i32 4)
+  %v4 = extractvalue {i32, i1} %r4, 0
+  %p4 = extractvalue {i32, i1} %r4, 1
+
+  %vsum1 = add i32 %v1, %v2
+  %vsum2 = add i32 %v3, %v4
+  %vsum3 = add i32 %vsum1, %vsum2
+  %psum1 = add i1 %p1, %p2
+  %psum2 = add i1 %p3, %p4
+  %psum3 = add i1 %psum1, %psum2
+  %ret0 = insertvalue {i32, i1} undef, i32 %vsum3, 0
+  %ret1 = insertvalue {i32, i1} %ret0, i1 %psum3, 1
+  ret {i32, i1} %ret1;
+}
+
+; CHECK-LABEL: .func{{.*}}match.all.sync.i64p(
+define {i64,i1} @match.all.sync.i64p(i32 %mask, i64 %value) {
+  ; CHECK: ld.param.u32 	[[MASK:%r[0-9]+]], [match.all.sync.i64p_param_0];
+  ; CHECK: ld.param.u64 	[[VALUE:%rd[0-9]+]], [match.all.sync.i64p_param_1];
+
+  ; CHECK:  match.all.sync.b64 {{%rd[0-9]+\|%p[0-9]+}}, [[VALUE]], [[MASK]];
+  %r1 = call {i64, i1} @llvm.nvvm.match.all.sync.i64p(i32 %mask, i64 %value)
+  %v1 = extractvalue {i64, i1} %r1, 0
+  %p1 = extractvalue {i64, i1} %r1, 1
+
+  ; CHECK:  match.all.sync.b64 {{%rd[0-9]+\|%p[0-9]+}}, 1, [[MASK]];
+  %r2 = call {i64, i1} @llvm.nvvm.match.all.sync.i64p(i32 %mask, i64 1)
+  %v2 = extractvalue {i64, i1} %r2, 0
+  %p2 = extractvalue {i64,

[PATCH] D38191: [NVPTX] added match.{any, all}.sync instructions, intrinsics & builtins.

2017-09-25 Thread Justin Lebar via Phabricator via cfe-commits
jlebar added inline comments.



Comment at: clang/include/clang/Basic/BuiltinsNVPTX.def:419
+TARGET_BUILTIN(__nvvm_match_any_sync_i64, "WiUiWi", "", "ptx60")
+// These return a pair {value, predicate} which requires custom lowering.
+TARGET_BUILTIN(__nvvm_match_all_sync_i32p, "UiUiUii*", "", "ptx60")

Nit, non-restrictive "which" should get a comma.  :)



Comment at: clang/lib/CodeGen/CGBuiltin.cpp:9603
+Value *Pred = Builder.CreateSExt(Builder.CreateExtractValue(ResultPair, 1),
+ PredOutPtr.getElementType());
+Builder.CreateStore(Pred, PredOutPtr);

Doing sext i1 -> i32 is going to cause us to store 0 or -1 in the pred 
(right?).  The CUDA docs say

> Predicate pred is set to true if all threads in mask have the same value of 
> value; otherwise the predicate is set to false.

I'd guess that "true" probably means 1 (i.e. zext i1 -> i32) rather than -1, 
although I guess we have to check.


https://reviews.llvm.org/D38191



___
cfe-commits mailing list
cfe-commits@lists.llvm.org
http://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D38191: [NVPTX] added match.{any, all}.sync instructions, intrinsics & builtins.

2017-09-25 Thread Artem Belevich via Phabricator via cfe-commits
tra updated this revision to Diff 116578.
tra marked an inline comment as done.
tra added a comment.

Addressed Justin's comments.


https://reviews.llvm.org/D38191

Files:
  clang/include/clang/Basic/BuiltinsNVPTX.def
  clang/lib/CodeGen/CGBuiltin.cpp
  clang/lib/Headers/__clang_cuda_intrinsics.h
  clang/test/CodeGen/builtins-nvptx-ptx60.cu
  llvm/include/llvm/IR/IntrinsicsNVVM.td
  llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
  llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h
  llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
  llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
  llvm/test/CodeGen/NVPTX/match.ll

Index: llvm/test/CodeGen/NVPTX/match.ll
===
--- /dev/null
+++ llvm/test/CodeGen/NVPTX/match.ll
@@ -0,0 +1,117 @@
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_70 -mattr=+ptx60 | FileCheck %s
+
+declare i32 @llvm.nvvm.match.any.sync.i32(i32, i32)
+declare i64 @llvm.nvvm.match.any.sync.i64(i32, i64)
+
+; CHECK-LABEL: .func{{.*}}match.any.sync.i32
+define i32 @match.any.sync.i32(i32 %mask, i32 %value) {
+  ; CHECK: ld.param.u32 	[[MASK:%r[0-9]+]], [match.any.sync.i32_param_0];
+  ; CHECK: ld.param.u32 	[[VALUE:%r[0-9]+]], [match.any.sync.i32_param_1];
+
+  ; CHECK:  match.any.sync.b32  [[V0:%r[0-9]+]], [[VALUE]], [[MASK]];
+  %v0 = call i32 @llvm.nvvm.match.any.sync.i32(i32 %mask, i32 %value)
+  ; CHECK:  match.any.sync.b32  [[V1:%r[0-9]+]], [[VALUE]], 1;
+  %v1 = call i32 @llvm.nvvm.match.any.sync.i32(i32 1, i32 %value)
+  ; CHECK:  match.any.sync.b32  [[V2:%r[0-9]+]], 2, [[MASK]];
+  %v2 = call i32 @llvm.nvvm.match.any.sync.i32(i32 %mask, i32 2)
+  ; CHECK:  match.any.sync.b32  [[V3:%r[0-9]+]], 4, 3;
+  %v3 = call i32 @llvm.nvvm.match.any.sync.i32(i32 3, i32 4)
+  %sum1 = add i32 %v0, %v1
+  %sum2 = add i32 %v2, %v3
+  %sum3 = add i32 %sum1, %sum2
+  ret i32 %sum3;
+}
+
+; CHECK-LABEL: .func{{.*}}match.any.sync.i64
+define i64 @match.any.sync.i64(i32 %mask, i64 %value) {
+  ; CHECK: ld.param.u32 	[[MASK:%r[0-9]+]], [match.any.sync.i64_param_0];
+  ; CHECK: ld.param.u64 	[[VALUE:%rd[0-9]+]], [match.any.sync.i64_param_1];
+
+  ; CHECK:  match.any.sync.b64  [[V0:%rd[0-9]+]], [[VALUE]], [[MASK]];
+  %v0 = call i64 @llvm.nvvm.match.any.sync.i64(i32 %mask, i64 %value)
+  ; CHECK:  match.any.sync.b64  [[V1:%rd[0-9]+]], [[VALUE]], 1;
+  %v1 = call i64 @llvm.nvvm.match.any.sync.i64(i32 1, i64 %value)
+  ; CHECK:  match.any.sync.b64  [[V2:%rd[0-9]+]], 2, [[MASK]];
+  %v2 = call i64 @llvm.nvvm.match.any.sync.i64(i32 %mask, i64 2)
+  ; CHECK:  match.any.sync.b64  [[V3:%rd[0-9]+]], 4, 3;
+  %v3 = call i64 @llvm.nvvm.match.any.sync.i64(i32 3, i64 4)
+  %sum1 = add i64 %v0, %v1
+  %sum2 = add i64 %v2, %v3
+  %sum3 = add i64 %sum1, %sum2
+  ret i64 %sum3;
+}
+
+declare {i32, i1} @llvm.nvvm.match.all.sync.i32p(i32, i32)
+declare {i64, i1} @llvm.nvvm.match.all.sync.i64p(i32, i64)
+
+; CHECK-LABEL: .func{{.*}}match.all.sync.i32p(
+define {i32,i1} @match.all.sync.i32p(i32 %mask, i32 %value) {
+  ; CHECK: ld.param.u32 	[[MASK:%r[0-9]+]], [match.all.sync.i32p_param_0];
+  ; CHECK: ld.param.u32 	[[VALUE:%r[0-9]+]], [match.all.sync.i32p_param_1];
+
+  ; CHECK:  match.all.sync.b32 {{%r[0-9]+\|%p[0-9]+}}, [[VALUE]], [[MASK]];
+  %r1 = call {i32, i1} @llvm.nvvm.match.all.sync.i32p(i32 %mask, i32 %value)
+  %v1 = extractvalue {i32, i1} %r1, 0
+  %p1 = extractvalue {i32, i1} %r1, 1
+
+  ; CHECK:  match.all.sync.b32 {{%r[0-9]+\|%p[0-9]+}}, 1, [[MASK]];
+  %r2 = call {i32, i1} @llvm.nvvm.match.all.sync.i32p(i32 %mask, i32 1)
+  %v2 = extractvalue {i32, i1} %r2, 0
+  %p2 = extractvalue {i32, i1} %r2, 1
+
+  ; CHECK:  match.all.sync.b32 {{%r[0-9]+\|%p[0-9]+}}, [[VALUE]], 2;
+  %r3 = call {i32, i1} @llvm.nvvm.match.all.sync.i32p(i32 2, i32 %value)
+  %v3 = extractvalue {i32, i1} %r3, 0
+  %p3 = extractvalue {i32, i1} %r3, 1
+
+  ; CHECK:  match.all.sync.b32 {{%r[0-9]+\|%p[0-9]+}}, 4, 3;
+  %r4 = call {i32, i1} @llvm.nvvm.match.all.sync.i32p(i32 3, i32 4)
+  %v4 = extractvalue {i32, i1} %r4, 0
+  %p4 = extractvalue {i32, i1} %r4, 1
+
+  %vsum1 = add i32 %v1, %v2
+  %vsum2 = add i32 %v3, %v4
+  %vsum3 = add i32 %vsum1, %vsum2
+  %psum1 = add i1 %p1, %p2
+  %psum2 = add i1 %p3, %p4
+  %psum3 = add i1 %psum1, %psum2
+  %ret0 = insertvalue {i32, i1} undef, i32 %vsum3, 0
+  %ret1 = insertvalue {i32, i1} %ret0, i1 %psum3, 1
+  ret {i32, i1} %ret1;
+}
+
+; CHECK-LABEL: .func{{.*}}match.all.sync.i64p(
+define {i64,i1} @match.all.sync.i64p(i32 %mask, i64 %value) {
+  ; CHECK: ld.param.u32 	[[MASK:%r[0-9]+]], [match.all.sync.i64p_param_0];
+  ; CHECK: ld.param.u64 	[[VALUE:%rd[0-9]+]], [match.all.sync.i64p_param_1];
+
+  ; CHECK:  match.all.sync.b64 {{%rd[0-9]+\|%p[0-9]+}}, [[VALUE]], [[MASK]];
+  %r1 = call {i64, i1} @llvm.nvvm.match.all.sync.i64p(i32 %mask, i64 %value)
+  %v1 = extractvalue {i64, i1} %r1, 0
+  %p1 = extractvalue {i64, i1} %r1, 1
+
+  ; CHECK:  match.all.sync.b64 {{%rd[0-9]+\|%p[0-9]+}}, 1, [[MASK]];
+  %r2 = call {i64, i1} @llvm.nvvm.match.all.sync.i64p(i32 %mask, i64 1)
+  %v2 = extractval

[PATCH] D38191: [NVPTX] added match.{any, all}.sync instructions, intrinsics & builtins.

2017-09-25 Thread Artem Belevich via Phabricator via cfe-commits
tra added inline comments.



Comment at: clang/lib/CodeGen/CGBuiltin.cpp:9603
+Value *Pred = Builder.CreateSExt(Builder.CreateExtractValue(ResultPair, 1),
+ PredOutPtr.getElementType());
+Builder.CreateStore(Pred, PredOutPtr);

jlebar wrote:
> Doing sext i1 -> i32 is going to cause us to store 0 or -1 in the pred 
> (right?).  The CUDA docs say
> 
> > Predicate pred is set to true if all threads in mask have the same value of 
> > value; otherwise the predicate is set to false.
> 
> I'd guess that "true" probably means 1 (i.e. zext i1 -> i32) rather than -1, 
> although I guess we have to check.
Right. It should've been ZExt. In similar places CUDA headers use "selp %r1, 1, 
0, %p".


https://reviews.llvm.org/D38191



___
cfe-commits mailing list
cfe-commits@lists.llvm.org
http://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D38191: [NVPTX] added match.{any, all}.sync instructions, intrinsics & builtins.

2017-09-25 Thread Artem Belevich via Phabricator via cfe-commits
This revision was automatically updated to reflect the committed changes.
Closed by commit rL314135: [NVPTX] added match.{any,all}.sync instructions, 
intrinsics & builtins. (authored by tra).

Changed prior to commit:
  https://reviews.llvm.org/D38191?vs=116578&id=116584#toc

Repository:
  rL LLVM

https://reviews.llvm.org/D38191

Files:
  cfe/trunk/include/clang/Basic/BuiltinsNVPTX.def
  cfe/trunk/lib/CodeGen/CGBuiltin.cpp
  cfe/trunk/lib/Headers/__clang_cuda_intrinsics.h
  cfe/trunk/test/CodeGen/builtins-nvptx-ptx60.cu
  llvm/trunk/include/llvm/IR/IntrinsicsNVVM.td
  llvm/trunk/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
  llvm/trunk/lib/Target/NVPTX/NVPTXISelDAGToDAG.h
  llvm/trunk/lib/Target/NVPTX/NVPTXInstrInfo.td
  llvm/trunk/lib/Target/NVPTX/NVPTXIntrinsics.td
  llvm/trunk/test/CodeGen/NVPTX/match.ll

Index: llvm/trunk/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
===
--- llvm/trunk/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
+++ llvm/trunk/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
@@ -714,6 +714,9 @@
 return false;
   case Intrinsic::nvvm_texsurf_handle_internal:
 SelectTexSurfHandle(N);
+  case Intrinsic::nvvm_match_all_sync_i32p:
+  case Intrinsic::nvvm_match_all_sync_i64p:
+SelectMatchAll(N);
 return true;
   }
 }
@@ -726,6 +729,36 @@
 MVT::i64, GlobalVal));
 }
 
+void NVPTXDAGToDAGISel::SelectMatchAll(SDNode *N) {
+  SDLoc DL(N);
+  enum { IS_I64 = 4, HAS_CONST_VALUE = 2, HAS_CONST_MASK = 1 };
+  unsigned IID = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue();
+  unsigned OpcodeIndex =
+  (IID == Intrinsic::nvvm_match_all_sync_i64p) ? IS_I64 : 0;
+  SDValue MaskOp = N->getOperand(1);
+  SDValue ValueOp = N->getOperand(2);
+  if (ConstantSDNode *ValueConst = dyn_cast<ConstantSDNode>(ValueOp)) {
+OpcodeIndex |= HAS_CONST_VALUE;
+ValueOp = CurDAG->getTargetConstant(ValueConst->getZExtValue(), DL,
+ValueConst->getValueType(0));
+  }
+  if (ConstantSDNode *MaskConst = dyn_cast<ConstantSDNode>(MaskOp)) {
+OpcodeIndex |= HAS_CONST_MASK;
+MaskOp = CurDAG->getTargetConstant(MaskConst->getZExtValue(), DL,
+   MaskConst->getValueType(0));
+  }
+  // Maps {IS_I64, HAS_CONST_VALUE, HAS_CONST_MASK} -> opcode
+  unsigned Opcodes[8] = {
+  NVPTX::MATCH_ALLP_SYNC_32rr, NVPTX::MATCH_ALLP_SYNC_32ri,
+  NVPTX::MATCH_ALLP_SYNC_32ir, NVPTX::MATCH_ALLP_SYNC_32ii,
+  NVPTX::MATCH_ALLP_SYNC_64rr, NVPTX::MATCH_ALLP_SYNC_64ri,
+  NVPTX::MATCH_ALLP_SYNC_64ir, NVPTX::MATCH_ALLP_SYNC_64ii};
+  SDNode *NewNode = CurDAG->getMachineNode(Opcodes[OpcodeIndex], DL,
+   {ValueOp->getValueType(0), MVT::i1},
+   {MaskOp, ValueOp});
+  ReplaceNode(N, NewNode);
+}
+
 void NVPTXDAGToDAGISel::SelectAddrSpaceCast(SDNode *N) {
   SDValue Src = N->getOperand(0);
   AddrSpaceCastSDNode *CastN = cast<AddrSpaceCastSDNode>(N);
Index: llvm/trunk/lib/Target/NVPTX/NVPTXInstrInfo.td
===
--- llvm/trunk/lib/Target/NVPTX/NVPTXInstrInfo.td
+++ llvm/trunk/lib/Target/NVPTX/NVPTXInstrInfo.td
@@ -158,6 +158,7 @@
 def hasPTX60 : Predicate<"Subtarget->getPTXVersion() >= 60">;
 
 def hasSM30 : Predicate<"Subtarget->getSmVersion() >= 30">;
+def hasSM70 : Predicate<"Subtarget->getSmVersion() >= 70">;
 
 def useFP16Math: Predicate<"Subtarget->allowFP16Math()">;
 
Index: llvm/trunk/lib/Target/NVPTX/NVPTXISelDAGToDAG.h
===
--- llvm/trunk/lib/Target/NVPTX/NVPTXISelDAGToDAG.h
+++ llvm/trunk/lib/Target/NVPTX/NVPTXISelDAGToDAG.h
@@ -58,6 +58,7 @@
   bool tryIntrinsicNoChain(SDNode *N);
   bool tryIntrinsicChain(SDNode *N);
   void SelectTexSurfHandle(SDNode *N);
+  void SelectMatchAll(SDNode *N);
   bool tryLoad(SDNode *N);
   bool tryLoadVector(SDNode *N);
   bool tryLDGLDU(SDNode *N);
Index: llvm/trunk/lib/Target/NVPTX/NVPTXIntrinsics.td
===
--- llvm/trunk/lib/Target/NVPTX/NVPTXIntrinsics.td
+++ llvm/trunk/lib/Target/NVPTX/NVPTXIntrinsics.td
@@ -247,6 +247,63 @@
 defm VOTE_SYNC_UNI : VOTE_SYNC;
 defm VOTE_SYNC_BALLOT : VOTE_SYNC;
 
+multiclass MATCH_ANY_SYNC<NVPTXRegClass regclass, string ptxtype, Intrinsic IntOp, Operand ImmOp> {
+  def ii : NVPTXInst<(outs regclass:$dest), (ins i32imm:$mask, ImmOp:$value),
+  "match.any.sync." # ptxtype # " \t$dest, $value, $mask;",
+  [(set regclass:$dest, (IntOp imm:$mask, imm:$value))]>,
+   Requires<[hasPTX60, hasSM70]>;
+  def ir : NVPTXInst<(outs regclass:$dest), (ins Int32Regs:$mask, ImmOp:$value),
+  "match.any.sync." # ptxtype # " \t$dest, $value, $mask;",
+  [(set regclass:$dest, (IntOp Int32Regs:$mask, imm:$value))]>,
+   Requires<[hasPTX60, hasSM70]>;
+  def ri : NVPTXInst<(outs regclass:$dest), (ins i32imm:$mask, regclass:$value),
+  "match.any.sync." # ptxtype # " \t$dest, $value, $mask;",
+   

[PATCH] D38191: [NVPTX] added match.{any, all}.sync instructions, intrinsics & builtins.

2017-09-26 Thread Artem Belevich via Phabricator via cfe-commits
tra reopened this revision.
tra added inline comments.
This revision is now accepted and ready to land.



Comment at: llvm/trunk/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp:716
   case Intrinsic::nvvm_texsurf_handle_internal:
 SelectTexSurfHandle(N);
+  case Intrinsic::nvvm_match_all_sync_i32p:

I've unintentionally killed `return true;` here and that's what broke the 
texture tests. I'm not sure yet why my local tests worked.


https://reviews.llvm.org/D38191



___
cfe-commits mailing list
cfe-commits@lists.llvm.org
http://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D38191: [NVPTX] added match.{any, all}.sync instructions, intrinsics & builtins.

2017-09-26 Thread Artem Belevich via Phabricator via cfe-commits
tra updated this revision to Diff 116674.
tra added a comment.

Added missing return. Tests pass now.


https://reviews.llvm.org/D38191

Files:
  clang/include/clang/Basic/BuiltinsNVPTX.def
  clang/lib/CodeGen/CGBuiltin.cpp
  clang/lib/Headers/__clang_cuda_intrinsics.h
  clang/test/CodeGen/builtins-nvptx-ptx60.cu
  llvm/include/llvm/IR/IntrinsicsNVVM.td
  llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
  llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h
  llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
  llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
  llvm/test/CodeGen/NVPTX/match.ll

Index: llvm/test/CodeGen/NVPTX/match.ll
===
--- /dev/null
+++ llvm/test/CodeGen/NVPTX/match.ll
@@ -0,0 +1,117 @@
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_70 -mattr=+ptx60 | FileCheck %s
+
+declare i32 @llvm.nvvm.match.any.sync.i32(i32, i32)
+declare i64 @llvm.nvvm.match.any.sync.i64(i32, i64)
+
+; CHECK-LABEL: .func{{.*}}match.any.sync.i32
+define i32 @match.any.sync.i32(i32 %mask, i32 %value) {
+  ; CHECK: ld.param.u32 	[[MASK:%r[0-9]+]], [match.any.sync.i32_param_0];
+  ; CHECK: ld.param.u32 	[[VALUE:%r[0-9]+]], [match.any.sync.i32_param_1];
+
+  ; CHECK:  match.any.sync.b32  [[V0:%r[0-9]+]], [[VALUE]], [[MASK]];
+  %v0 = call i32 @llvm.nvvm.match.any.sync.i32(i32 %mask, i32 %value)
+  ; CHECK:  match.any.sync.b32  [[V1:%r[0-9]+]], [[VALUE]], 1;
+  %v1 = call i32 @llvm.nvvm.match.any.sync.i32(i32 1, i32 %value)
+  ; CHECK:  match.any.sync.b32  [[V2:%r[0-9]+]], 2, [[MASK]];
+  %v2 = call i32 @llvm.nvvm.match.any.sync.i32(i32 %mask, i32 2)
+  ; CHECK:  match.any.sync.b32  [[V3:%r[0-9]+]], 4, 3;
+  %v3 = call i32 @llvm.nvvm.match.any.sync.i32(i32 3, i32 4)
+  %sum1 = add i32 %v0, %v1
+  %sum2 = add i32 %v2, %v3
+  %sum3 = add i32 %sum1, %sum2
+  ret i32 %sum3;
+}
+
+; CHECK-LABEL: .func{{.*}}match.any.sync.i64
+define i64 @match.any.sync.i64(i32 %mask, i64 %value) {
+  ; CHECK: ld.param.u32 	[[MASK:%r[0-9]+]], [match.any.sync.i64_param_0];
+  ; CHECK: ld.param.u64 	[[VALUE:%rd[0-9]+]], [match.any.sync.i64_param_1];
+
+  ; CHECK:  match.any.sync.b64  [[V0:%rd[0-9]+]], [[VALUE]], [[MASK]];
+  %v0 = call i64 @llvm.nvvm.match.any.sync.i64(i32 %mask, i64 %value)
+  ; CHECK:  match.any.sync.b64  [[V1:%rd[0-9]+]], [[VALUE]], 1;
+  %v1 = call i64 @llvm.nvvm.match.any.sync.i64(i32 1, i64 %value)
+  ; CHECK:  match.any.sync.b64  [[V2:%rd[0-9]+]], 2, [[MASK]];
+  %v2 = call i64 @llvm.nvvm.match.any.sync.i64(i32 %mask, i64 2)
+  ; CHECK:  match.any.sync.b64  [[V3:%rd[0-9]+]], 4, 3;
+  %v3 = call i64 @llvm.nvvm.match.any.sync.i64(i32 3, i64 4)
+  %sum1 = add i64 %v0, %v1
+  %sum2 = add i64 %v2, %v3
+  %sum3 = add i64 %sum1, %sum2
+  ret i64 %sum3;
+}
+
+declare {i32, i1} @llvm.nvvm.match.all.sync.i32p(i32, i32)
+declare {i64, i1} @llvm.nvvm.match.all.sync.i64p(i32, i64)
+
+; CHECK-LABEL: .func{{.*}}match.all.sync.i32p(
+define {i32,i1} @match.all.sync.i32p(i32 %mask, i32 %value) {
+  ; CHECK: ld.param.u32 	[[MASK:%r[0-9]+]], [match.all.sync.i32p_param_0];
+  ; CHECK: ld.param.u32 	[[VALUE:%r[0-9]+]], [match.all.sync.i32p_param_1];
+
+  ; CHECK:  match.all.sync.b32 {{%r[0-9]+\|%p[0-9]+}}, [[VALUE]], [[MASK]];
+  %r1 = call {i32, i1} @llvm.nvvm.match.all.sync.i32p(i32 %mask, i32 %value)
+  %v1 = extractvalue {i32, i1} %r1, 0
+  %p1 = extractvalue {i32, i1} %r1, 1
+
+  ; CHECK:  match.all.sync.b32 {{%r[0-9]+\|%p[0-9]+}}, 1, [[MASK]];
+  %r2 = call {i32, i1} @llvm.nvvm.match.all.sync.i32p(i32 %mask, i32 1)
+  %v2 = extractvalue {i32, i1} %r2, 0
+  %p2 = extractvalue {i32, i1} %r2, 1
+
+  ; CHECK:  match.all.sync.b32 {{%r[0-9]+\|%p[0-9]+}}, [[VALUE]], 2;
+  %r3 = call {i32, i1} @llvm.nvvm.match.all.sync.i32p(i32 2, i32 %value)
+  %v3 = extractvalue {i32, i1} %r3, 0
+  %p3 = extractvalue {i32, i1} %r3, 1
+
+  ; CHECK:  match.all.sync.b32 {{%r[0-9]+\|%p[0-9]+}}, 4, 3;
+  %r4 = call {i32, i1} @llvm.nvvm.match.all.sync.i32p(i32 3, i32 4)
+  %v4 = extractvalue {i32, i1} %r4, 0
+  %p4 = extractvalue {i32, i1} %r4, 1
+
+  %vsum1 = add i32 %v1, %v2
+  %vsum2 = add i32 %v3, %v4
+  %vsum3 = add i32 %vsum1, %vsum2
+  %psum1 = add i1 %p1, %p2
+  %psum2 = add i1 %p3, %p4
+  %psum3 = add i1 %psum1, %psum2
+  %ret0 = insertvalue {i32, i1} undef, i32 %vsum3, 0
+  %ret1 = insertvalue {i32, i1} %ret0, i1 %psum3, 1
+  ret {i32, i1} %ret1;
+}
+
+; CHECK-LABEL: .func{{.*}}match.all.sync.i64p(
+define {i64,i1} @match.all.sync.i64p(i32 %mask, i64 %value) {
+  ; CHECK: ld.param.u32 	[[MASK:%r[0-9]+]], [match.all.sync.i64p_param_0];
+  ; CHECK: ld.param.u64 	[[VALUE:%rd[0-9]+]], [match.all.sync.i64p_param_1];
+
+  ; CHECK:  match.all.sync.b64 {{%rd[0-9]+\|%p[0-9]+}}, [[VALUE]], [[MASK]];
+  %r1 = call {i64, i1} @llvm.nvvm.match.all.sync.i64p(i32 %mask, i64 %value)
+  %v1 = extractvalue {i64, i1} %r1, 0
+  %p1 = extractvalue {i64, i1} %r1, 1
+
+  ; CHECK:  match.all.sync.b64 {{%rd[0-9]+\|%p[0-9]+}}, 1, [[MASK]];
+  %r2 = call {i64, i1} @llvm.nvvm.match.all.sync.i64p(i32 %mask, i64 1)
+  %v2 = extractvalue {i64, i1} %r2, 0
+  %p2 = 

[PATCH] D38191: [NVPTX] added match.{any, all}.sync instructions, intrinsics & builtins.

2017-09-26 Thread Artem Belevich via Phabricator via cfe-commits
tra closed this revision.
tra added a comment.

Landed with fix in r314223.


https://reviews.llvm.org/D38191



___
cfe-commits mailing list
cfe-commits@lists.llvm.org
http://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits