[PATCH] D150820: [NVPTX, CUDA] added optional src_size argument to __nvvm_cp_async*
This revision was landed with ongoing or failed builds. This revision was automatically updated to reflect the committed changes. Closed by commit rG6963c61f0f6e: [NVPTX/CUDA] added an optional src_size argument to __nvvm_cp_async* (authored by tra). Repository: rG LLVM Github Monorepo CHANGES SINCE LAST ACTION https://reviews.llvm.org/D150820/new/ https://reviews.llvm.org/D150820 Files: clang/include/clang/Basic/BuiltinsNVPTX.def clang/include/clang/Sema/Sema.h clang/lib/CodeGen/CGBuiltin.cpp clang/lib/Sema/SemaChecking.cpp clang/test/CodeGen/builtins-nvptx.c clang/test/SemaCUDA/builtins.cu llvm/include/llvm/IR/IntrinsicsNVVM.td llvm/lib/Target/NVPTX/NVPTXIntrinsics.td llvm/test/CodeGen/NVPTX/async-copy.ll Index: llvm/test/CodeGen/NVPTX/async-copy.ll === --- llvm/test/CodeGen/NVPTX/async-copy.ll +++ llvm/test/CodeGen/NVPTX/async-copy.ll @@ -1,35 +1,35 @@ -; RUN: llc < %s -march=nvptx -mcpu=sm_80 -mattr=+ptx70 | FileCheck -check-prefixes=ALL,CHECK_PTX32 %s -; RUN: llc < %s -march=nvptx64 -mcpu=sm_80 -mattr=+ptx70 | FileCheck -check-prefixes=ALL,CHECK_PTX64 %s +; RUN: llc < %s -march=nvptx -mcpu=sm_80 -mattr=+ptx70 | FileCheck -check-prefixes=CHECK,CHECK_PTX32 %s +; RUN: llc < %s -march=nvptx64 -mcpu=sm_80 -mattr=+ptx70 | FileCheck -check-prefixes=CHECK,CHECK_PTX64 %s ; RUN: %if ptxas-11.0 %{ llc < %s -march=nvptx -mcpu=sm_80 -mattr=+ptx70 | %ptxas-verify -arch=sm_80 %} ; RUN: %if ptxas-11.0 %{ llc < %s -march=nvptx64 -mcpu=sm_80 -mattr=+ptx70 | %ptxas-verify -arch=sm_80 %} declare void @llvm.nvvm.cp.async.wait.group(i32) -; ALL-LABEL: asyncwaitgroup +; CHECK-LABEL: asyncwaitgroup define void @asyncwaitgroup() { - ; ALL: cp.async.wait_group 8; + ; CHECK: cp.async.wait_group 8; tail call void @llvm.nvvm.cp.async.wait.group(i32 8) - ; ALL: cp.async.wait_group 0; + ; CHECK: cp.async.wait_group 0; tail call void @llvm.nvvm.cp.async.wait.group(i32 0) - ; ALL: cp.async.wait_group 16; + ; CHECK: cp.async.wait_group 16; tail call void @llvm.nvvm.cp.async.wait.group(i32 16) 
ret void } declare void @llvm.nvvm.cp.async.wait.all() -; ALL-LABEL: asyncwaitall +; CHECK-LABEL: asyncwaitall define void @asyncwaitall() { -; ALL: cp.async.wait_all +; CHECK: cp.async.wait_all tail call void @llvm.nvvm.cp.async.wait.all() ret void } declare void @llvm.nvvm.cp.async.commit.group() -; ALL-LABEL: asynccommitgroup +; CHECK-LABEL: asynccommitgroup define void @asynccommitgroup() { -; ALL: cp.async.commit_group +; CHECK: cp.async.commit_group tail call void @llvm.nvvm.cp.async.commit.group() ret void } @@ -41,72 +41,87 @@ ; CHECK-LABEL: asyncmbarrier define void @asyncmbarrier(ptr %a) { -; CHECK_PTX32: cp.async.mbarrier.arrive.b64 [%r{{[0-9]+}}]; -; CHECK_PTX64: cp.async.mbarrier.arrive.b64 [%rd{{[0-9]+}}]; +; The distinction between PTX32/PTX64 here is only to capture pointer register type +; in R to be used in subsequent tests. +; CHECK_PTX32: cp.async.mbarrier.arrive.b64 [%[[R:r]]{{[0-9]+}}]; +; CHECK_PTX64: cp.async.mbarrier.arrive.b64 [%[[R:rd]]{{[0-9]+}}]; tail call void @llvm.nvvm.cp.async.mbarrier.arrive(ptr %a) ret void } ; CHECK-LABEL: asyncmbarriershared define void @asyncmbarriershared(ptr addrspace(3) %a) { -; CHECK_PTX32: cp.async.mbarrier.arrive.shared.b64 [%r{{[0-9]+}}]; -; CHECK_PTX64: cp.async.mbarrier.arrive.shared.b64 [%rd{{[0-9]+}}]; +; CHECK: cp.async.mbarrier.arrive.shared.b64 [%[[R]]{{[0-9]+}}]; tail call void @llvm.nvvm.cp.async.mbarrier.arrive.shared(ptr addrspace(3) %a) ret void } ; CHECK-LABEL: asyncmbarriernoinc define void @asyncmbarriernoinc(ptr %a) { -; CHECK_PTX32: cp.async.mbarrier.arrive.noinc.b64 [%r{{[0-9]+}}]; -; CHECK_PTX64: cp.async.mbarrier.arrive.noinc.b64 [%rd{{[0-9]+}}]; +; CHECK_PTX64: cp.async.mbarrier.arrive.noinc.b64 [%[[R]]{{[0-9]+}}]; tail call void @llvm.nvvm.cp.async.mbarrier.arrive.noinc(ptr %a) ret void } ; CHECK-LABEL: asyncmbarriernoincshared define void @asyncmbarriernoincshared(ptr addrspace(3) %a) { -; CHECK_PTX32: cp.async.mbarrier.arrive.noinc.shared.b64 [%r{{[0-9]+}}]; -; CHECK_PTX64: 
cp.async.mbarrier.arrive.noinc.shared.b64 [%rd{{[0-9]+}}]; +; CHECK: cp.async.mbarrier.arrive.noinc.shared.b64 [%[[R]]{{[0-9]+}}]; tail call void @llvm.nvvm.cp.async.mbarrier.arrive.noinc.shared(ptr addrspace(3) %a) ret void } declare void @llvm.nvvm.cp.async.ca.shared.global.4(ptr addrspace(3) %a, ptr addrspace(1) %b) +declare void @llvm.nvvm.cp.async.ca.shared.global.4.s(ptr addrspace(3) %a, ptr addrspace(1) %b, i32 %c) ; CHECK-LABEL: asynccasharedglobal4i8 -define void @asynccasharedglobal4i8(ptr addrspace(3) %a, ptr addrspace(1) %b) { -; CHECK_PTX32: cp.async.ca.shared.global [%r{{[0-9]+}}], [%r{{[0-9]+}}], 4; -; CHECK_PTX64: cp.async.ca.shared.global [%rd{{[0-9]+}}], [%rd{{[0-9]+}}], 4; +define void @asynccasharedglobal4i8(ptr addrspace(3) %a, ptr addrspace(1) %b, i32 %c) { +; CHECK:
[PATCH] D150820: [NVPTX, CUDA] added optional src_size argument to __nvvm_cp_async*
jlebar accepted this revision. jlebar added a comment. This revision is now accepted and ready to land. Re-approval. Repository: rG LLVM Github Monorepo CHANGES SINCE LAST ACTION https://reviews.llvm.org/D150820/new/ https://reviews.llvm.org/D150820 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[PATCH] D150820: [NVPTX, CUDA] added optional src_size argument to __nvvm_cp_async*
tra requested review of this revision. tra added a comment. PTAL (please take a look). Repository: rG LLVM Github Monorepo CHANGES SINCE LAST ACTION https://reviews.llvm.org/D150820/new/ https://reviews.llvm.org/D150820 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[PATCH] D150820: [NVPTX, CUDA] added optional src_size argument to __nvvm_cp_async*
tra updated this revision to Diff 523566. tra added a comment. Instead of changing existing intrinsic, introduce a new set which takes an additional src_size argument. This should keep existing users working. Repository: rG LLVM Github Monorepo CHANGES SINCE LAST ACTION https://reviews.llvm.org/D150820/new/ https://reviews.llvm.org/D150820 Files: clang/include/clang/Basic/BuiltinsNVPTX.def clang/include/clang/Sema/Sema.h clang/lib/CodeGen/CGBuiltin.cpp clang/lib/Sema/SemaChecking.cpp clang/test/CodeGen/builtins-nvptx.c clang/test/SemaCUDA/builtins.cu llvm/include/llvm/IR/IntrinsicsNVVM.td llvm/lib/Target/NVPTX/NVPTXIntrinsics.td llvm/test/CodeGen/NVPTX/async-copy.ll Index: llvm/test/CodeGen/NVPTX/async-copy.ll === --- llvm/test/CodeGen/NVPTX/async-copy.ll +++ llvm/test/CodeGen/NVPTX/async-copy.ll @@ -1,35 +1,35 @@ -; RUN: llc < %s -march=nvptx -mcpu=sm_80 -mattr=+ptx70 | FileCheck -check-prefixes=ALL,CHECK_PTX32 %s -; RUN: llc < %s -march=nvptx64 -mcpu=sm_80 -mattr=+ptx70 | FileCheck -check-prefixes=ALL,CHECK_PTX64 %s +; RUN: llc < %s -march=nvptx -mcpu=sm_80 -mattr=+ptx70 | FileCheck -check-prefixes=CHECK,CHECK_PTX32 %s +; RUN: llc < %s -march=nvptx64 -mcpu=sm_80 -mattr=+ptx70 | FileCheck -check-prefixes=CHECK,CHECK_PTX64 %s ; RUN: %if ptxas-11.0 %{ llc < %s -march=nvptx -mcpu=sm_80 -mattr=+ptx70 | %ptxas-verify -arch=sm_80 %} ; RUN: %if ptxas-11.0 %{ llc < %s -march=nvptx64 -mcpu=sm_80 -mattr=+ptx70 | %ptxas-verify -arch=sm_80 %} declare void @llvm.nvvm.cp.async.wait.group(i32) -; ALL-LABEL: asyncwaitgroup +; CHECK-LABEL: asyncwaitgroup define void @asyncwaitgroup() { - ; ALL: cp.async.wait_group 8; + ; CHECK: cp.async.wait_group 8; tail call void @llvm.nvvm.cp.async.wait.group(i32 8) - ; ALL: cp.async.wait_group 0; + ; CHECK: cp.async.wait_group 0; tail call void @llvm.nvvm.cp.async.wait.group(i32 0) - ; ALL: cp.async.wait_group 16; + ; CHECK: cp.async.wait_group 16; tail call void @llvm.nvvm.cp.async.wait.group(i32 16) ret void } declare void 
@llvm.nvvm.cp.async.wait.all() -; ALL-LABEL: asyncwaitall +; CHECK-LABEL: asyncwaitall define void @asyncwaitall() { -; ALL: cp.async.wait_all +; CHECK: cp.async.wait_all tail call void @llvm.nvvm.cp.async.wait.all() ret void } declare void @llvm.nvvm.cp.async.commit.group() -; ALL-LABEL: asynccommitgroup +; CHECK-LABEL: asynccommitgroup define void @asynccommitgroup() { -; ALL: cp.async.commit_group +; CHECK: cp.async.commit_group tail call void @llvm.nvvm.cp.async.commit.group() ret void } @@ -41,72 +41,87 @@ ; CHECK-LABEL: asyncmbarrier define void @asyncmbarrier(ptr %a) { -; CHECK_PTX32: cp.async.mbarrier.arrive.b64 [%r{{[0-9]+}}]; -; CHECK_PTX64: cp.async.mbarrier.arrive.b64 [%rd{{[0-9]+}}]; +; The distinction between PTX32/PTX64 here is only to capture pointer register type +; in R to be used in subsequent tests. +; CHECK_PTX32: cp.async.mbarrier.arrive.b64 [%[[R:r]]{{[0-9]+}}]; +; CHECK_PTX64: cp.async.mbarrier.arrive.b64 [%[[R:rd]]{{[0-9]+}}]; tail call void @llvm.nvvm.cp.async.mbarrier.arrive(ptr %a) ret void } ; CHECK-LABEL: asyncmbarriershared define void @asyncmbarriershared(ptr addrspace(3) %a) { -; CHECK_PTX32: cp.async.mbarrier.arrive.shared.b64 [%r{{[0-9]+}}]; -; CHECK_PTX64: cp.async.mbarrier.arrive.shared.b64 [%rd{{[0-9]+}}]; +; CHECK: cp.async.mbarrier.arrive.shared.b64 [%[[R]]{{[0-9]+}}]; tail call void @llvm.nvvm.cp.async.mbarrier.arrive.shared(ptr addrspace(3) %a) ret void } ; CHECK-LABEL: asyncmbarriernoinc define void @asyncmbarriernoinc(ptr %a) { -; CHECK_PTX32: cp.async.mbarrier.arrive.noinc.b64 [%r{{[0-9]+}}]; -; CHECK_PTX64: cp.async.mbarrier.arrive.noinc.b64 [%rd{{[0-9]+}}]; +; CHECK_PTX64: cp.async.mbarrier.arrive.noinc.b64 [%[[R]]{{[0-9]+}}]; tail call void @llvm.nvvm.cp.async.mbarrier.arrive.noinc(ptr %a) ret void } ; CHECK-LABEL: asyncmbarriernoincshared define void @asyncmbarriernoincshared(ptr addrspace(3) %a) { -; CHECK_PTX32: cp.async.mbarrier.arrive.noinc.shared.b64 [%r{{[0-9]+}}]; -; CHECK_PTX64: 
cp.async.mbarrier.arrive.noinc.shared.b64 [%rd{{[0-9]+}}]; +; CHECK: cp.async.mbarrier.arrive.noinc.shared.b64 [%[[R]]{{[0-9]+}}]; tail call void @llvm.nvvm.cp.async.mbarrier.arrive.noinc.shared(ptr addrspace(3) %a) ret void } declare void @llvm.nvvm.cp.async.ca.shared.global.4(ptr addrspace(3) %a, ptr addrspace(1) %b) +declare void @llvm.nvvm.cp.async.ca.shared.global.4.s(ptr addrspace(3) %a, ptr addrspace(1) %b, i32 %c) ; CHECK-LABEL: asynccasharedglobal4i8 -define void @asynccasharedglobal4i8(ptr addrspace(3) %a, ptr addrspace(1) %b) { -; CHECK_PTX32: cp.async.ca.shared.global [%r{{[0-9]+}}], [%r{{[0-9]+}}], 4; -; CHECK_PTX64: cp.async.ca.shared.global [%rd{{[0-9]+}}], [%rd{{[0-9]+}}], 4; +define void @asynccasharedglobal4i8(ptr addrspace(3) %a, ptr addrspace(1) %b, i32 %c) { +; CHECK: cp.async.ca.shared.global [%[[R]]{{[0-9]+}}],
[PATCH] D150820: [NVPTX, CUDA] added optional src_size argument to __nvvm_cp_async*
tra added a comment. Looks like the extra intrinsic argument broke MLIR. I'll need to figure out how to deal with that. Repository: rG LLVM Github Monorepo CHANGES SINCE LAST ACTION https://reviews.llvm.org/D150820/new/ https://reviews.llvm.org/D150820 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[PATCH] D150820: [NVPTX, CUDA] added optional src_size argument to __nvvm_cp_async*
This revision was landed with ongoing or failed builds. This revision was automatically updated to reflect the committed changes. Closed by commit rGe7b9c2f00fa0: [NVPTX/CUDA] added an optional src_size argument to __nvvm_cp_async* (authored by tra). Repository: rG LLVM Github Monorepo CHANGES SINCE LAST ACTION https://reviews.llvm.org/D150820/new/ https://reviews.llvm.org/D150820 Files: clang/include/clang/Basic/BuiltinsNVPTX.def clang/include/clang/Sema/Sema.h clang/lib/CodeGen/CGBuiltin.cpp clang/lib/Sema/SemaChecking.cpp clang/test/CodeGen/builtins-nvptx.c clang/test/SemaCUDA/builtins.cu llvm/include/llvm/IR/IntrinsicsNVVM.td llvm/lib/Target/NVPTX/NVPTXIntrinsics.td llvm/test/CodeGen/NVPTX/async-copy.ll Index: llvm/test/CodeGen/NVPTX/async-copy.ll === --- llvm/test/CodeGen/NVPTX/async-copy.ll +++ llvm/test/CodeGen/NVPTX/async-copy.ll @@ -1,35 +1,35 @@ -; RUN: llc < %s -march=nvptx -mcpu=sm_80 -mattr=+ptx70 | FileCheck -check-prefixes=ALL,CHECK_PTX32 %s -; RUN: llc < %s -march=nvptx64 -mcpu=sm_80 -mattr=+ptx70 | FileCheck -check-prefixes=ALL,CHECK_PTX64 %s +; RUN: llc < %s -march=nvptx -mcpu=sm_80 -mattr=+ptx70 | FileCheck -check-prefixes=CHECK,CHECK_PTX32 %s +; RUN: llc < %s -march=nvptx64 -mcpu=sm_80 -mattr=+ptx70 | FileCheck -check-prefixes=CHECK,CHECK_PTX64 %s ; RUN: %if ptxas-11.0 %{ llc < %s -march=nvptx -mcpu=sm_80 -mattr=+ptx70 | %ptxas-verify -arch=sm_80 %} ; RUN: %if ptxas-11.0 %{ llc < %s -march=nvptx64 -mcpu=sm_80 -mattr=+ptx70 | %ptxas-verify -arch=sm_80 %} declare void @llvm.nvvm.cp.async.wait.group(i32) -; ALL-LABEL: asyncwaitgroup +; CHECK-LABEL: asyncwaitgroup define void @asyncwaitgroup() { - ; ALL: cp.async.wait_group 8; + ; CHECK: cp.async.wait_group 8; tail call void @llvm.nvvm.cp.async.wait.group(i32 8) - ; ALL: cp.async.wait_group 0; + ; CHECK: cp.async.wait_group 0; tail call void @llvm.nvvm.cp.async.wait.group(i32 0) - ; ALL: cp.async.wait_group 16; + ; CHECK: cp.async.wait_group 16; tail call void @llvm.nvvm.cp.async.wait.group(i32 16) 
ret void } declare void @llvm.nvvm.cp.async.wait.all() -; ALL-LABEL: asyncwaitall +; CHECK-LABEL: asyncwaitall define void @asyncwaitall() { -; ALL: cp.async.wait_all +; CHECK: cp.async.wait_all tail call void @llvm.nvvm.cp.async.wait.all() ret void } declare void @llvm.nvvm.cp.async.commit.group() -; ALL-LABEL: asynccommitgroup +; CHECK-LABEL: asynccommitgroup define void @asynccommitgroup() { -; ALL: cp.async.commit_group +; CHECK: cp.async.commit_group tail call void @llvm.nvvm.cp.async.commit.group() ret void } @@ -41,72 +41,75 @@ ; CHECK-LABEL: asyncmbarrier define void @asyncmbarrier(ptr %a) { -; CHECK_PTX32: cp.async.mbarrier.arrive.b64 [%r{{[0-9]+}}]; -; CHECK_PTX64: cp.async.mbarrier.arrive.b64 [%rd{{[0-9]+}}]; +; The distinction between PTX32/PTX64 here is only to capture pointer register type +; in R to be used in subsequent tests. +; CHECK_PTX32: cp.async.mbarrier.arrive.b64 [%[[R:r]]{{[0-9]+}}]; +; CHECK_PTX64: cp.async.mbarrier.arrive.b64 [%[[R:rd]]{{[0-9]+}}]; tail call void @llvm.nvvm.cp.async.mbarrier.arrive(ptr %a) ret void } ; CHECK-LABEL: asyncmbarriershared define void @asyncmbarriershared(ptr addrspace(3) %a) { -; CHECK_PTX32: cp.async.mbarrier.arrive.shared.b64 [%r{{[0-9]+}}]; -; CHECK_PTX64: cp.async.mbarrier.arrive.shared.b64 [%rd{{[0-9]+}}]; +; CHECK: cp.async.mbarrier.arrive.shared.b64 [%[[R]]{{[0-9]+}}]; tail call void @llvm.nvvm.cp.async.mbarrier.arrive.shared(ptr addrspace(3) %a) ret void } ; CHECK-LABEL: asyncmbarriernoinc define void @asyncmbarriernoinc(ptr %a) { -; CHECK_PTX32: cp.async.mbarrier.arrive.noinc.b64 [%r{{[0-9]+}}]; -; CHECK_PTX64: cp.async.mbarrier.arrive.noinc.b64 [%rd{{[0-9]+}}]; +; CHECK_PTX64: cp.async.mbarrier.arrive.noinc.b64 [%[[R]]{{[0-9]+}}]; tail call void @llvm.nvvm.cp.async.mbarrier.arrive.noinc(ptr %a) ret void } ; CHECK-LABEL: asyncmbarriernoincshared define void @asyncmbarriernoincshared(ptr addrspace(3) %a) { -; CHECK_PTX32: cp.async.mbarrier.arrive.noinc.shared.b64 [%r{{[0-9]+}}]; -; CHECK_PTX64: 
cp.async.mbarrier.arrive.noinc.shared.b64 [%rd{{[0-9]+}}]; +; CHECK: cp.async.mbarrier.arrive.noinc.shared.b64 [%[[R]]{{[0-9]+}}]; tail call void @llvm.nvvm.cp.async.mbarrier.arrive.noinc.shared(ptr addrspace(3) %a) ret void } -declare void @llvm.nvvm.cp.async.ca.shared.global.4(ptr addrspace(3) %a, ptr addrspace(1) %b) +declare void @llvm.nvvm.cp.async.ca.shared.global.4(ptr addrspace(3) %a, ptr addrspace(1) %b, i32 %c) ; CHECK-LABEL: asynccasharedglobal4i8 -define void @asynccasharedglobal4i8(ptr addrspace(3) %a, ptr addrspace(1) %b) { -; CHECK_PTX32: cp.async.ca.shared.global [%r{{[0-9]+}}], [%r{{[0-9]+}}], 4; -; CHECK_PTX64: cp.async.ca.shared.global [%rd{{[0-9]+}}], [%rd{{[0-9]+}}], 4; - tail call void @llvm.nvvm.cp.async.ca.shared.global.4(ptr addrspace(3) %a, ptr addrspace(1) %b) +define void
[PATCH] D150820: [NVPTX, CUDA] added optional src_size argument to __nvvm_cp_async*
tra updated this revision to Diff 523428. tra added a comment. Cosmetic test cleanup. Repository: rG LLVM Github Monorepo CHANGES SINCE LAST ACTION https://reviews.llvm.org/D150820/new/ https://reviews.llvm.org/D150820 Files: clang/include/clang/Basic/BuiltinsNVPTX.def clang/include/clang/Sema/Sema.h clang/lib/CodeGen/CGBuiltin.cpp clang/lib/Sema/SemaChecking.cpp clang/test/CodeGen/builtins-nvptx.c clang/test/SemaCUDA/builtins.cu llvm/include/llvm/IR/IntrinsicsNVVM.td llvm/lib/Target/NVPTX/NVPTXIntrinsics.td llvm/test/CodeGen/NVPTX/async-copy.ll Index: llvm/test/CodeGen/NVPTX/async-copy.ll === --- llvm/test/CodeGen/NVPTX/async-copy.ll +++ llvm/test/CodeGen/NVPTX/async-copy.ll @@ -1,35 +1,35 @@ -; RUN: llc < %s -march=nvptx -mcpu=sm_80 -mattr=+ptx70 | FileCheck -check-prefixes=ALL,CHECK_PTX32 %s -; RUN: llc < %s -march=nvptx64 -mcpu=sm_80 -mattr=+ptx70 | FileCheck -check-prefixes=ALL,CHECK_PTX64 %s +; RUN: llc < %s -march=nvptx -mcpu=sm_80 -mattr=+ptx70 | FileCheck -check-prefixes=CHECK,CHECK_PTX32 %s +; RUN: llc < %s -march=nvptx64 -mcpu=sm_80 -mattr=+ptx70 | FileCheck -check-prefixes=CHECK,CHECK_PTX64 %s ; RUN: %if ptxas-11.0 %{ llc < %s -march=nvptx -mcpu=sm_80 -mattr=+ptx70 | %ptxas-verify -arch=sm_80 %} ; RUN: %if ptxas-11.0 %{ llc < %s -march=nvptx64 -mcpu=sm_80 -mattr=+ptx70 | %ptxas-verify -arch=sm_80 %} declare void @llvm.nvvm.cp.async.wait.group(i32) -; ALL-LABEL: asyncwaitgroup +; CHECK-LABEL: asyncwaitgroup define void @asyncwaitgroup() { - ; ALL: cp.async.wait_group 8; + ; CHECK: cp.async.wait_group 8; tail call void @llvm.nvvm.cp.async.wait.group(i32 8) - ; ALL: cp.async.wait_group 0; + ; CHECK: cp.async.wait_group 0; tail call void @llvm.nvvm.cp.async.wait.group(i32 0) - ; ALL: cp.async.wait_group 16; + ; CHECK: cp.async.wait_group 16; tail call void @llvm.nvvm.cp.async.wait.group(i32 16) ret void } declare void @llvm.nvvm.cp.async.wait.all() -; ALL-LABEL: asyncwaitall +; CHECK-LABEL: asyncwaitall define void @asyncwaitall() { -; ALL: 
cp.async.wait_all +; CHECK: cp.async.wait_all tail call void @llvm.nvvm.cp.async.wait.all() ret void } declare void @llvm.nvvm.cp.async.commit.group() -; ALL-LABEL: asynccommitgroup +; CHECK-LABEL: asynccommitgroup define void @asynccommitgroup() { -; ALL: cp.async.commit_group +; CHECK: cp.async.commit_group tail call void @llvm.nvvm.cp.async.commit.group() ret void } @@ -41,72 +41,75 @@ ; CHECK-LABEL: asyncmbarrier define void @asyncmbarrier(ptr %a) { -; CHECK_PTX32: cp.async.mbarrier.arrive.b64 [%r{{[0-9]+}}]; -; CHECK_PTX64: cp.async.mbarrier.arrive.b64 [%rd{{[0-9]+}}]; +; The distinction between PTX32/PTX64 here is only to capture pointer register type +; in R to be used in subsequent tests. +; CHECK_PTX32: cp.async.mbarrier.arrive.b64 [%[[R:r]]{{[0-9]+}}]; +; CHECK_PTX64: cp.async.mbarrier.arrive.b64 [%[[R:rd]]{{[0-9]+}}]; tail call void @llvm.nvvm.cp.async.mbarrier.arrive(ptr %a) ret void } ; CHECK-LABEL: asyncmbarriershared define void @asyncmbarriershared(ptr addrspace(3) %a) { -; CHECK_PTX32: cp.async.mbarrier.arrive.shared.b64 [%r{{[0-9]+}}]; -; CHECK_PTX64: cp.async.mbarrier.arrive.shared.b64 [%rd{{[0-9]+}}]; +; CHECK: cp.async.mbarrier.arrive.shared.b64 [%[[R]]{{[0-9]+}}]; tail call void @llvm.nvvm.cp.async.mbarrier.arrive.shared(ptr addrspace(3) %a) ret void } ; CHECK-LABEL: asyncmbarriernoinc define void @asyncmbarriernoinc(ptr %a) { -; CHECK_PTX32: cp.async.mbarrier.arrive.noinc.b64 [%r{{[0-9]+}}]; -; CHECK_PTX64: cp.async.mbarrier.arrive.noinc.b64 [%rd{{[0-9]+}}]; +; CHECK_PTX64: cp.async.mbarrier.arrive.noinc.b64 [%[[R]]{{[0-9]+}}]; tail call void @llvm.nvvm.cp.async.mbarrier.arrive.noinc(ptr %a) ret void } ; CHECK-LABEL: asyncmbarriernoincshared define void @asyncmbarriernoincshared(ptr addrspace(3) %a) { -; CHECK_PTX32: cp.async.mbarrier.arrive.noinc.shared.b64 [%r{{[0-9]+}}]; -; CHECK_PTX64: cp.async.mbarrier.arrive.noinc.shared.b64 [%rd{{[0-9]+}}]; +; CHECK: cp.async.mbarrier.arrive.noinc.shared.b64 [%[[R]]{{[0-9]+}}]; tail call void 
@llvm.nvvm.cp.async.mbarrier.arrive.noinc.shared(ptr addrspace(3) %a) ret void } -declare void @llvm.nvvm.cp.async.ca.shared.global.4(ptr addrspace(3) %a, ptr addrspace(1) %b) +declare void @llvm.nvvm.cp.async.ca.shared.global.4(ptr addrspace(3) %a, ptr addrspace(1) %b, i32 %c) ; CHECK-LABEL: asynccasharedglobal4i8 -define void @asynccasharedglobal4i8(ptr addrspace(3) %a, ptr addrspace(1) %b) { -; CHECK_PTX32: cp.async.ca.shared.global [%r{{[0-9]+}}], [%r{{[0-9]+}}], 4; -; CHECK_PTX64: cp.async.ca.shared.global [%rd{{[0-9]+}}], [%rd{{[0-9]+}}], 4; - tail call void @llvm.nvvm.cp.async.ca.shared.global.4(ptr addrspace(3) %a, ptr addrspace(1) %b) +define void @asynccasharedglobal4i8(ptr addrspace(3) %a, ptr addrspace(1) %b, i32 %c) { +; CHECK: cp.async.ca.shared.global [%[[R]]{{[0-9]+}}], [%[[R]]{{[0-9]+}}], 4,
[PATCH] D150820: [NVPTX, CUDA] added optional src_size argument to __nvvm_cp_async*
tra updated this revision to Diff 523426. tra added a comment. Actually connected the Sema check for the optional argument, and added a test to cover it. Repository: rG LLVM Github Monorepo CHANGES SINCE LAST ACTION https://reviews.llvm.org/D150820/new/ https://reviews.llvm.org/D150820 Files: clang/include/clang/Basic/BuiltinsNVPTX.def clang/include/clang/Sema/Sema.h clang/lib/CodeGen/CGBuiltin.cpp clang/lib/Sema/SemaChecking.cpp clang/test/CodeGen/builtins-nvptx.c clang/test/SemaCUDA/builtins.cu llvm/include/llvm/IR/IntrinsicsNVVM.td llvm/lib/Target/NVPTX/NVPTXIntrinsics.td llvm/test/CodeGen/NVPTX/async-copy.ll Index: llvm/test/CodeGen/NVPTX/async-copy.ll === --- llvm/test/CodeGen/NVPTX/async-copy.ll +++ llvm/test/CodeGen/NVPTX/async-copy.ll @@ -1,35 +1,35 @@ -; RUN: llc < %s -march=nvptx -mcpu=sm_80 -mattr=+ptx70 | FileCheck -check-prefixes=ALL,CHECK_PTX32 %s -; RUN: llc < %s -march=nvptx64 -mcpu=sm_80 -mattr=+ptx70 | FileCheck -check-prefixes=ALL,CHECK_PTX64 %s +; RUN: llc < %s -march=nvptx -mcpu=sm_80 -mattr=+ptx70 | FileCheck -check-prefixes=CHECK,CHECK_PTX32 %s +; RUN: llc < %s -march=nvptx64 -mcpu=sm_80 -mattr=+ptx70 | FileCheck -check-prefixes=CHECK,CHECK_PTX64 %s ; RUN: %if ptxas-11.0 %{ llc < %s -march=nvptx -mcpu=sm_80 -mattr=+ptx70 | %ptxas-verify -arch=sm_80 %} ; RUN: %if ptxas-11.0 %{ llc < %s -march=nvptx64 -mcpu=sm_80 -mattr=+ptx70 | %ptxas-verify -arch=sm_80 %} declare void @llvm.nvvm.cp.async.wait.group(i32) -; ALL-LABEL: asyncwaitgroup +; CHECK-LABEL: asyncwaitgroup define void @asyncwaitgroup() { - ; ALL: cp.async.wait_group 8; + ; CHECK: cp.async.wait_group 8; tail call void @llvm.nvvm.cp.async.wait.group(i32 8) - ; ALL: cp.async.wait_group 0; + ; CHECK: cp.async.wait_group 0; tail call void @llvm.nvvm.cp.async.wait.group(i32 0) - ; ALL: cp.async.wait_group 16; + ; CHECK: cp.async.wait_group 16; tail call void @llvm.nvvm.cp.async.wait.group(i32 16) ret void } declare void @llvm.nvvm.cp.async.wait.all() -; ALL-LABEL: asyncwaitall +; CHECK-LABEL: 
asyncwaitall define void @asyncwaitall() { -; ALL: cp.async.wait_all +; CHECK: cp.async.wait_all tail call void @llvm.nvvm.cp.async.wait.all() ret void } declare void @llvm.nvvm.cp.async.commit.group() -; ALL-LABEL: asynccommitgroup +; CHECK-LABEL: asynccommitgroup define void @asynccommitgroup() { -; ALL: cp.async.commit_group +; CHECK: cp.async.commit_group tail call void @llvm.nvvm.cp.async.commit.group() ret void } @@ -41,72 +41,75 @@ ; CHECK-LABEL: asyncmbarrier define void @asyncmbarrier(ptr %a) { -; CHECK_PTX32: cp.async.mbarrier.arrive.b64 [%r{{[0-9]+}}]; -; CHECK_PTX64: cp.async.mbarrier.arrive.b64 [%rd{{[0-9]+}}]; +; The distinction between PTX32/PTX64 here is only to capture pointer register type +; in R to be used in subsequent tests. +; CHECK_PTX32: cp.async.mbarrier.arrive.b64 [%[[R:r]]{{[0-9]+}}]; +; CHECK_PTX64: cp.async.mbarrier.arrive.b64 [%[[R:rd]]{{[0-9]+}}]; tail call void @llvm.nvvm.cp.async.mbarrier.arrive(ptr %a) ret void } ; CHECK-LABEL: asyncmbarriershared define void @asyncmbarriershared(ptr addrspace(3) %a) { -; CHECK_PTX32: cp.async.mbarrier.arrive.shared.b64 [%r{{[0-9]+}}]; -; CHECK_PTX64: cp.async.mbarrier.arrive.shared.b64 [%rd{{[0-9]+}}]; +; CHECK: cp.async.mbarrier.arrive.shared.b64 [%[[R]]{{[0-9]+}}]; tail call void @llvm.nvvm.cp.async.mbarrier.arrive.shared(ptr addrspace(3) %a) ret void } ; CHECK-LABEL: asyncmbarriernoinc define void @asyncmbarriernoinc(ptr %a) { -; CHECK_PTX32: cp.async.mbarrier.arrive.noinc.b64 [%r{{[0-9]+}}]; -; CHECK_PTX64: cp.async.mbarrier.arrive.noinc.b64 [%rd{{[0-9]+}}]; +; CHECK_PTX64: cp.async.mbarrier.arrive.noinc.b64 [%[[R]]{{[0-9]+}}]; tail call void @llvm.nvvm.cp.async.mbarrier.arrive.noinc(ptr %a) ret void } ; CHECK-LABEL: asyncmbarriernoincshared define void @asyncmbarriernoincshared(ptr addrspace(3) %a) { -; CHECK_PTX32: cp.async.mbarrier.arrive.noinc.shared.b64 [%r{{[0-9]+}}]; -; CHECK_PTX64: cp.async.mbarrier.arrive.noinc.shared.b64 [%rd{{[0-9]+}}]; +; CHECK: 
cp.async.mbarrier.arrive.noinc.shared.b64 [%[[R]]{{[0-9]+}}]; tail call void @llvm.nvvm.cp.async.mbarrier.arrive.noinc.shared(ptr addrspace(3) %a) ret void } -declare void @llvm.nvvm.cp.async.ca.shared.global.4(ptr addrspace(3) %a, ptr addrspace(1) %b) +declare void @llvm.nvvm.cp.async.ca.shared.global.4(ptr addrspace(3) %a, ptr addrspace(1) %b, i32 %c) ; CHECK-LABEL: asynccasharedglobal4i8 -define void @asynccasharedglobal4i8(ptr addrspace(3) %a, ptr addrspace(1) %b) { -; CHECK_PTX32: cp.async.ca.shared.global [%r{{[0-9]+}}], [%r{{[0-9]+}}], 4; -; CHECK_PTX64: cp.async.ca.shared.global [%rd{{[0-9]+}}], [%rd{{[0-9]+}}], 4; - tail call void @llvm.nvvm.cp.async.ca.shared.global.4(ptr addrspace(3) %a, ptr addrspace(1) %b) +define void @asynccasharedglobal4i8(ptr addrspace(3) %a, ptr addrspace(1) %b, i32 %c) { +; CHECK:
[PATCH] D150820: [NVPTX, CUDA] added optional src_size argument to __nvvm_cp_async*
tra created this revision. Herald added subscribers: mattd, gchakrabarti, asavonic, bixia, hiraditya. Herald added a project: All. tra updated this revision to Diff 523216. tra added a comment. tra retitled this revision from "[NVPTX] added src_size argument to __nvvm_cp_async* intrinsics." to "[NVPTX, CUDA] added optional src_size argument to __nvvm_cp_async*". tra edited the summary of this revision. Herald added a subscriber: yaxunl. tra published this revision for review. tra added reviewers: jlebar, nyalloc. Herald added subscribers: llvm-commits, cfe-commits, jdoerfert, jholewinski. Herald added projects: clang, LLVM. Updated clang side. The optional argument is needed for CUDA-11+ headers when we're compiling for sm_80+ GPUs. For the intrinsics, the src_size argument is required now. Old calls w/o the src_size argument can be upgraded by using src_size=transfer size of the intrinsic. Repository: rG LLVM Github Monorepo https://reviews.llvm.org/D150820 Files: clang/include/clang/Basic/BuiltinsNVPTX.def clang/include/clang/Sema/Sema.h clang/lib/CodeGen/CGBuiltin.cpp clang/lib/Sema/SemaChecking.cpp clang/test/CodeGen/builtins-nvptx.c llvm/include/llvm/IR/IntrinsicsNVVM.td llvm/lib/Target/NVPTX/NVPTXIntrinsics.td llvm/test/CodeGen/NVPTX/async-copy.ll Index: llvm/test/CodeGen/NVPTX/async-copy.ll === --- llvm/test/CodeGen/NVPTX/async-copy.ll +++ llvm/test/CodeGen/NVPTX/async-copy.ll @@ -1,35 +1,35 @@ -; RUN: llc < %s -march=nvptx -mcpu=sm_80 -mattr=+ptx70 | FileCheck -check-prefixes=ALL,CHECK_PTX32 %s -; RUN: llc < %s -march=nvptx64 -mcpu=sm_80 -mattr=+ptx70 | FileCheck -check-prefixes=ALL,CHECK_PTX64 %s +; RUN: llc < %s -march=nvptx -mcpu=sm_80 -mattr=+ptx70 | FileCheck -check-prefixes=CHECK,CHECK_PTX32 %s +; RUN: llc < %s -march=nvptx64 -mcpu=sm_80 -mattr=+ptx70 | FileCheck -check-prefixes=CHECK,CHECK_PTX64 %s ; RUN: %if ptxas-11.0 %{ llc < %s -march=nvptx -mcpu=sm_80 -mattr=+ptx70 | %ptxas-verify -arch=sm_80 %} ; RUN: %if ptxas-11.0 %{ llc < %s -march=nvptx64 
-mcpu=sm_80 -mattr=+ptx70 | %ptxas-verify -arch=sm_80 %} declare void @llvm.nvvm.cp.async.wait.group(i32) -; ALL-LABEL: asyncwaitgroup +; CHECK-LABEL: asyncwaitgroup define void @asyncwaitgroup() { - ; ALL: cp.async.wait_group 8; + ; CHECK: cp.async.wait_group 8; tail call void @llvm.nvvm.cp.async.wait.group(i32 8) - ; ALL: cp.async.wait_group 0; + ; CHECK: cp.async.wait_group 0; tail call void @llvm.nvvm.cp.async.wait.group(i32 0) - ; ALL: cp.async.wait_group 16; + ; CHECK: cp.async.wait_group 16; tail call void @llvm.nvvm.cp.async.wait.group(i32 16) ret void } declare void @llvm.nvvm.cp.async.wait.all() -; ALL-LABEL: asyncwaitall +; CHECK-LABEL: asyncwaitall define void @asyncwaitall() { -; ALL: cp.async.wait_all +; CHECK: cp.async.wait_all tail call void @llvm.nvvm.cp.async.wait.all() ret void } declare void @llvm.nvvm.cp.async.commit.group() -; ALL-LABEL: asynccommitgroup +; CHECK-LABEL: asynccommitgroup define void @asynccommitgroup() { -; ALL: cp.async.commit_group +; CHECK: cp.async.commit_group tail call void @llvm.nvvm.cp.async.commit.group() ret void } @@ -41,72 +41,75 @@ ; CHECK-LABEL: asyncmbarrier define void @asyncmbarrier(ptr %a) { -; CHECK_PTX32: cp.async.mbarrier.arrive.b64 [%r{{[0-9]+}}]; -; CHECK_PTX64: cp.async.mbarrier.arrive.b64 [%rd{{[0-9]+}}]; +; The distinction between PTX32/PTX64 here is only to capture pointer register type +; in R to be used in subsequent tests. 
+; CHECK_PTX32: cp.async.mbarrier.arrive.b64 [%[[R:r]]{{[0-9]+}}]; +; CHECK_PTX64: cp.async.mbarrier.arrive.b64 [%[[R:rd]]{{[0-9]+}}]; tail call void @llvm.nvvm.cp.async.mbarrier.arrive(ptr %a) ret void } ; CHECK-LABEL: asyncmbarriershared define void @asyncmbarriershared(ptr addrspace(3) %a) { -; CHECK_PTX32: cp.async.mbarrier.arrive.shared.b64 [%r{{[0-9]+}}]; -; CHECK_PTX64: cp.async.mbarrier.arrive.shared.b64 [%rd{{[0-9]+}}]; +; CHECK: cp.async.mbarrier.arrive.shared.b64 [%[[R]]{{[0-9]+}}]; tail call void @llvm.nvvm.cp.async.mbarrier.arrive.shared(ptr addrspace(3) %a) ret void } ; CHECK-LABEL: asyncmbarriernoinc define void @asyncmbarriernoinc(ptr %a) { -; CHECK_PTX32: cp.async.mbarrier.arrive.noinc.b64 [%r{{[0-9]+}}]; -; CHECK_PTX64: cp.async.mbarrier.arrive.noinc.b64 [%rd{{[0-9]+}}]; +; CHECK_PTX64: cp.async.mbarrier.arrive.noinc.b64 [%[[R]]{{[0-9]+}}]; tail call void @llvm.nvvm.cp.async.mbarrier.arrive.noinc(ptr %a) ret void } ; CHECK-LABEL: asyncmbarriernoincshared define void @asyncmbarriernoincshared(ptr addrspace(3) %a) { -; CHECK_PTX32: cp.async.mbarrier.arrive.noinc.shared.b64 [%r{{[0-9]+}}]; -; CHECK_PTX64: cp.async.mbarrier.arrive.noinc.shared.b64 [%rd{{[0-9]+}}]; +; CHECK: cp.async.mbarrier.arrive.noinc.shared.b64 [%[[R]]{{[0-9]+}}]; tail call void @llvm.nvvm.cp.async.mbarrier.arrive.noinc.shared(ptr addrspace(3) %a) ret void } -declare void