Author: Joseph Huber Date: 2021-10-08T22:21:59-04:00 New Revision: bad44d5f39bc29c1062f0c4ce8b71720734ad3cb
URL: https://github.com/llvm/llvm-project/commit/bad44d5f39bc29c1062f0c4ce8b71720734ad3cb DIFF: https://github.com/llvm/llvm-project/commit/bad44d5f39bc29c1062f0c4ce8b71720734ad3cb.diff LOG: [OpenMP] Add RTL function for getting number of threads in block. This patch adds support for the `__kmpc_get_hardware_num_threads_in_block` function that returns the number of threads. This was missing in the new runtime and was used by the AMDGPU plugin which prevented it from using the new runtime. This patchs also unified the interface for getting the thread numbers in the frontend. Originally authored by jdoerfert. Reviewed By: JonChesterfield Differential Revision: https://reviews.llvm.org/D111475 Added: Modified: clang/lib/CodeGen/CGOpenMPRuntimeAMDGCN.cpp clang/lib/CodeGen/CGOpenMPRuntimeAMDGCN.h clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp clang/lib/CodeGen/CGOpenMPRuntimeGPU.h clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.h clang/test/OpenMP/nvptx_distribute_parallel_generic_mode_codegen.cpp clang/test/OpenMP/nvptx_parallel_codegen.cpp clang/test/OpenMP/nvptx_target_teams_distribute_parallel_for_codegen.cpp clang/test/OpenMP/nvptx_target_teams_distribute_parallel_for_generic_mode_codegen.cpp clang/test/OpenMP/nvptx_target_teams_distribute_parallel_for_simd_codegen.cpp openmp/libomptarget/DeviceRTL/include/Interface.h openmp/libomptarget/DeviceRTL/src/Mapping.cpp openmp/libomptarget/DeviceRTL/src/Utils.cpp Removed: ################################################################################ diff --git a/clang/lib/CodeGen/CGOpenMPRuntimeAMDGCN.cpp b/clang/lib/CodeGen/CGOpenMPRuntimeAMDGCN.cpp index cac5faaa8d0f2..cf6f6bc3654c0 100644 --- a/clang/lib/CodeGen/CGOpenMPRuntimeAMDGCN.cpp +++ b/clang/lib/CodeGen/CGOpenMPRuntimeAMDGCN.cpp @@ -46,16 +46,3 @@ llvm::Value *CGOpenMPRuntimeAMDGCN::getGPUThreadID(CodeGenFunction &CGF) { CGF.CGM.getIntrinsic(llvm::Intrinsic::amdgcn_workitem_id_x); return Bld.CreateCall(F, llvm::None, "nvptx_tid"); } - -llvm::Value *CGOpenMPRuntimeAMDGCN::getGPUNumThreads(CodeGenFunction &CGF) { - CGBuilderTy &Bld = CGF.Builder; - llvm::Module *M = &CGF.CGM.getModule(); - const char *LocSize = "__kmpc_amdgcn_gpu_num_threads"; - llvm::Function *F = M->getFunction(LocSize); - if (!F) { - F = llvm::Function::Create( - llvm::FunctionType::get(CGF.Int32Ty, llvm::None, false), - llvm::GlobalVariable::ExternalLinkage, LocSize, &CGF.CGM.getModule()); - } - return Bld.CreateCall(F, llvm::None, "nvptx_num_threads"); -} diff --git a/clang/lib/CodeGen/CGOpenMPRuntimeAMDGCN.h b/clang/lib/CodeGen/CGOpenMPRuntimeAMDGCN.h index c1421261bfc19..cecbcaf3c3d36 100644 --- a/clang/lib/CodeGen/CGOpenMPRuntimeAMDGCN.h +++ b/clang/lib/CodeGen/CGOpenMPRuntimeAMDGCN.h @@ -32,9 +32,6 @@ class CGOpenMPRuntimeAMDGCN final : public CGOpenMPRuntimeGPU { /// Get the id of the current thread on the GPU. llvm::Value *getGPUThreadID(CodeGenFunction &CGF) override; - - /// Get the maximum number of threads in a block of the GPU. - llvm::Value *getGPUNumThreads(CodeGenFunction &CGF) override; }; } // namespace CodeGen diff --git a/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp b/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp index be47ccbbbc58a..366482085d4fa 100644 --- a/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp +++ b/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp @@ -3947,3 +3947,16 @@ void CGOpenMPRuntimeGPU::clear() { } CGOpenMPRuntime::clear(); } + +llvm::Value *CGOpenMPRuntimeGPU::getGPUNumThreads(CodeGenFunction &CGF) { + CGBuilderTy &Bld = CGF.Builder; + llvm::Module *M = &CGF.CGM.getModule(); + const char *LocSize = "__kmpc_get_hardware_num_threads_in_block"; + llvm::Function *F = M->getFunction(LocSize); + if (!F) { + F = llvm::Function::Create( + llvm::FunctionType::get(CGF.Int32Ty, llvm::None, false), + llvm::GlobalVariable::ExternalLinkage, LocSize, &CGF.CGM.getModule()); + } + return Bld.CreateCall(F, llvm::None, "nvptx_num_threads"); +} diff --git a/clang/lib/CodeGen/CGOpenMPRuntimeGPU.h b/clang/lib/CodeGen/CGOpenMPRuntimeGPU.h index 5d3b711e6d4b5..cc069844660b4 100644 --- a/clang/lib/CodeGen/CGOpenMPRuntimeGPU.h +++ b/clang/lib/CodeGen/CGOpenMPRuntimeGPU.h @@ -182,7 +182,7 @@ class CGOpenMPRuntimeGPU : public CGOpenMPRuntime { virtual llvm::Value *getGPUThreadID(CodeGenFunction &CGF) = 0; /// Get the maximum number of threads in a block of the GPU. - virtual llvm::Value *getGPUNumThreads(CodeGenFunction &CGF) = 0; + llvm::Value *getGPUNumThreads(CodeGenFunction &CGF); /// Emit call to void __kmpc_push_proc_bind(ident_t *loc, kmp_int32 /// global_tid, int proc_bind) to generate code for 'proc_bind' clause. diff --git a/clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp b/clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp index 1688d07b90b6e..ae07ffd3c93fb 100644 --- a/clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp +++ b/clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp @@ -46,11 +46,3 @@ llvm::Value *CGOpenMPRuntimeNVPTX::getGPUThreadID(CodeGenFunction &CGF) { &CGF.CGM.getModule(), llvm::Intrinsic::nvvm_read_ptx_sreg_tid_x); return Bld.CreateCall(F, llvm::None, "nvptx_tid"); } - -llvm::Value *CGOpenMPRuntimeNVPTX::getGPUNumThreads(CodeGenFunction &CGF) { - CGBuilderTy &Bld = CGF.Builder; - llvm::Function *F; - F = llvm::Intrinsic::getDeclaration( - &CGF.CGM.getModule(), llvm::Intrinsic::nvvm_read_ptx_sreg_ntid_x); - return Bld.CreateCall(F, llvm::None, "nvptx_num_threads"); -} diff --git a/clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.h b/clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.h index 5f16029592665..ddb4fc2c59b2c 100644 --- a/clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.h +++ b/clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.h @@ -32,9 +32,6 @@ class CGOpenMPRuntimeNVPTX final : public CGOpenMPRuntimeGPU { /// Get the id of the current thread on the GPU. llvm::Value *getGPUThreadID(CodeGenFunction &CGF) override; - - /// Get the maximum number of threads in a block of the GPU. - llvm::Value *getGPUNumThreads(CodeGenFunction &CGF) override; }; } // CodeGen namespace. diff --git a/clang/test/OpenMP/nvptx_distribute_parallel_generic_mode_codegen.cpp b/clang/test/OpenMP/nvptx_distribute_parallel_generic_mode_codegen.cpp index c24a4fa39cdf2..676d5a4ded4c5 100644 --- a/clang/test/OpenMP/nvptx_distribute_parallel_generic_mode_codegen.cpp +++ b/clang/test/OpenMP/nvptx_distribute_parallel_generic_mode_codegen.cpp @@ -3005,7 +3005,7 @@ int main(int argc, char **argv) { // CHECK4-NEXT: store i32 [[TMP6]], i32* [[CONV1]], align 4 // CHECK4-NEXT: [[TMP7:%.*]] = load i64, i64* [[ARGC_CASTED]], align 8 // CHECK4-NEXT: store i32 [[TMP5]], i32* [[DOTTHREADID_TEMP_]], align 4 -// CHECK4-NEXT: call void @__omp_outlined__(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], [10 x i32]* [[TMP0]], [10 x i32]* [[TMP1]], i32* [[TMP2]], i64 [[TMP7]], [10 x i32]* [[TMP3]]) #[[ATTR5:[0-9]+]] +// CHECK4-NEXT: call void @__omp_outlined__(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], [10 x i32]* [[TMP0]], [10 x i32]* [[TMP1]], i32* [[TMP2]], i64 [[TMP7]], [10 x i32]* [[TMP3]]) #[[ATTR4:[0-9]+]] // CHECK4-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 2, i1 false) // CHECK4-NEXT: ret void // CHECK4: worker.exit: @@ -3068,7 +3068,7 @@ int main(int argc, char **argv) { // CHECK4-NEXT: [[TMP8:%.*]] = bitcast [10 x i32]* [[B4]] to i8* // CHECK4-NEXT: [[TMP9:%.*]] = bitcast [10 x i32]* [[TMP0]] to i8* // CHECK4-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 [[TMP8]], i8* align 4 [[TMP9]], i64 40, i1 false) -// CHECK4-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK4-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @__kmpc_get_hardware_num_threads_in_block() // CHECK4-NEXT: [[TMP10:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 // CHECK4-NEXT: [[TMP11:%.*]] = load i32, i32* [[TMP10]], align 4 // CHECK4-NEXT: call void @__kmpc_distribute_static_init_4(%struct.ident_t* @[[GLOB2:[0-9]+]], i32 [[TMP11]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]]) @@ -3259,23 +3259,23 @@ int main(int argc, char **argv) { // CHECK4-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP18]], 1 // CHECK4-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] // CHECK4-NEXT: store i32 [[ADD]], i32* [[I6]], align 4 -// CHECK4-NEXT: [[CALL:%.*]] = call i32 @_Z3fooPi(i32* [[I6]]) #[[ATTR7:[0-9]+]] -// CHECK4-NEXT: [[CALL9:%.*]] = call i32 @_Z3fooPi(i32* [[TMP1]]) #[[ATTR7]] +// CHECK4-NEXT: [[CALL:%.*]] = call i32 @_Z3fooPi(i32* [[I6]]) #[[ATTR6:[0-9]+]] +// CHECK4-NEXT: [[CALL9:%.*]] = call i32 @_Z3fooPi(i32* [[TMP1]]) #[[ATTR6]] // CHECK4-NEXT: [[ADD10:%.*]] = add nsw i32 [[CALL]], [[CALL9]] // CHECK4-NEXT: [[TMP19:%.*]] = load i32, i32* [[I6]], align 4 // CHECK4-NEXT: [[IDXPROM:%.*]] = sext i32 [[TMP19]] to i64 // CHECK4-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[B4]], i64 0, i64 [[IDXPROM]] -// CHECK4-NEXT: [[CALL11:%.*]] = call i32 @_Z3fooPi(i32* [[ARRAYIDX]]) #[[ATTR7]] +// CHECK4-NEXT: [[CALL11:%.*]] = call i32 @_Z3fooPi(i32* [[ARRAYIDX]]) #[[ATTR6]] // CHECK4-NEXT: [[ADD12:%.*]] = add nsw i32 [[ADD10]], [[CALL11]] // CHECK4-NEXT: [[TMP20:%.*]] = load i32, i32* [[I6]], align 4 // CHECK4-NEXT: [[IDXPROM13:%.*]] = sext i32 [[TMP20]] to i64 // CHECK4-NEXT: [[ARRAYIDX14:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[C5]], i64 0, i64 [[IDXPROM13]] -// CHECK4-NEXT: [[CALL15:%.*]] = call i32 @_Z3fooPi(i32* [[ARRAYIDX14]]) #[[ATTR7]] +// CHECK4-NEXT: [[CALL15:%.*]] = call i32 @_Z3fooPi(i32* [[ARRAYIDX14]]) #[[ATTR6]] // CHECK4-NEXT: [[ADD16:%.*]] = add nsw i32 [[ADD12]], [[CALL15]] // CHECK4-NEXT: [[TMP21:%.*]] = load i32, i32* [[I6]], align 4 // CHECK4-NEXT: [[IDXPROM17:%.*]] = sext i32 [[TMP21]] to i64 // CHECK4-NEXT: [[ARRAYIDX18:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[TMP4]], i64 0, i64 [[IDXPROM17]] -// CHECK4-NEXT: [[CALL19:%.*]] = call i32 @_Z3fooPi(i32* [[ARRAYIDX18]]) #[[ATTR7]] +// CHECK4-NEXT: [[CALL19:%.*]] = call i32 @_Z3fooPi(i32* [[ARRAYIDX18]]) #[[ATTR6]] // CHECK4-NEXT: [[ADD20:%.*]] = add nsw i32 [[ADD16]], [[CALL19]] // CHECK4-NEXT: store i32 [[ADD20]], i32* [[TMP1]], align 4 // CHECK4-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] @@ -3337,7 +3337,7 @@ int main(int argc, char **argv) { // CHECK5-NEXT: store i32 [[TMP6]], i32* [[ARGC_CASTED]], align 4 // CHECK5-NEXT: [[TMP7:%.*]] = load i32, i32* [[ARGC_CASTED]], align 4 // CHECK5-NEXT: store i32 [[TMP5]], i32* [[DOTTHREADID_TEMP_]], align 4 -// CHECK5-NEXT: call void @__omp_outlined__(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], [10 x i32]* [[TMP0]], [10 x i32]* [[TMP1]], i32* [[TMP2]], i32 [[TMP7]], [10 x i32]* [[TMP3]]) #[[ATTR5:[0-9]+]] +// CHECK5-NEXT: call void @__omp_outlined__(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], [10 x i32]* [[TMP0]], [10 x i32]* [[TMP1]], i32* [[TMP2]], i32 [[TMP7]], [10 x i32]* [[TMP3]]) #[[ATTR4:[0-9]+]] // CHECK5-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 2, i1 false) // CHECK5-NEXT: ret void // CHECK5: worker.exit: @@ -3399,7 +3399,7 @@ int main(int argc, char **argv) { // CHECK5-NEXT: [[TMP8:%.*]] = bitcast [10 x i32]* [[B4]] to i8* // CHECK5-NEXT: [[TMP9:%.*]] = bitcast [10 x i32]* [[TMP0]] to i8* // CHECK5-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 4 [[TMP8]], i8* align 4 [[TMP9]], i32 40, i1 false) -// CHECK5-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK5-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @__kmpc_get_hardware_num_threads_in_block() // CHECK5-NEXT: [[TMP10:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 // CHECK5-NEXT: [[TMP11:%.*]] = load i32, i32* [[TMP10]], align 4 // CHECK5-NEXT: call void @__kmpc_distribute_static_init_4(%struct.ident_t* @[[GLOB2:[0-9]+]], i32 [[TMP11]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]]) @@ -3585,20 +3585,20 @@ int main(int argc, char **argv) { // CHECK5-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP18]], 1 // CHECK5-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] // CHECK5-NEXT: store i32 [[ADD]], i32* [[I5]], align 4 -// CHECK5-NEXT: [[CALL:%.*]] = call i32 @_Z3fooPi(i32* [[I5]]) #[[ATTR7:[0-9]+]] -// CHECK5-NEXT: [[CALL7:%.*]] = call i32 @_Z3fooPi(i32* [[TMP1]]) #[[ATTR7]] +// CHECK5-NEXT: [[CALL:%.*]] = call i32 @_Z3fooPi(i32* [[I5]]) #[[ATTR6:[0-9]+]] +// CHECK5-NEXT: [[CALL7:%.*]] = call i32 @_Z3fooPi(i32* [[TMP1]]) #[[ATTR6]] // CHECK5-NEXT: [[ADD8:%.*]] = add nsw i32 [[CALL]], [[CALL7]] // CHECK5-NEXT: [[TMP19:%.*]] = load i32, i32* [[I5]], align 4 // CHECK5-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[B3]], i32 0, i32 [[TMP19]] -// CHECK5-NEXT: [[CALL9:%.*]] = call i32 @_Z3fooPi(i32* [[ARRAYIDX]]) #[[ATTR7]] +// CHECK5-NEXT: [[CALL9:%.*]] = call i32 @_Z3fooPi(i32* [[ARRAYIDX]]) #[[ATTR6]] // CHECK5-NEXT: [[ADD10:%.*]] = add nsw i32 [[ADD8]], [[CALL9]] // CHECK5-NEXT: [[TMP20:%.*]] = load i32, i32* [[I5]], align 4 // CHECK5-NEXT: [[ARRAYIDX11:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[C4]], i32 0, i32 [[TMP20]] -// CHECK5-NEXT: [[CALL12:%.*]] = call i32 @_Z3fooPi(i32* [[ARRAYIDX11]]) #[[ATTR7]] +// CHECK5-NEXT: [[CALL12:%.*]] = call i32 @_Z3fooPi(i32* [[ARRAYIDX11]]) #[[ATTR6]] // CHECK5-NEXT: [[ADD13:%.*]] = add nsw i32 [[ADD10]], [[CALL12]] // CHECK5-NEXT: [[TMP21:%.*]] = load i32, i32* [[I5]], align 4 // CHECK5-NEXT: [[ARRAYIDX14:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[TMP4]], i32 0, i32 [[TMP21]] -// CHECK5-NEXT: [[CALL15:%.*]] = call i32 @_Z3fooPi(i32* [[ARRAYIDX14]]) #[[ATTR7]] +// CHECK5-NEXT: [[CALL15:%.*]] = call i32 @_Z3fooPi(i32* [[ARRAYIDX14]]) #[[ATTR6]] // CHECK5-NEXT: [[ADD16:%.*]] = add nsw i32 [[ADD13]], [[CALL15]] // CHECK5-NEXT: store i32 [[ADD16]], i32* [[TMP1]], align 4 // CHECK5-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] @@ -3660,7 +3660,7 @@ int main(int argc, char **argv) { // CHECK6-NEXT: store i32 [[TMP6]], i32* [[ARGC_CASTED]], align 4 // CHECK6-NEXT: [[TMP7:%.*]] = load i32, i32* [[ARGC_CASTED]], align 4 // CHECK6-NEXT: store i32 [[TMP5]], i32* [[DOTTHREADID_TEMP_]], align 4 -// CHECK6-NEXT: call void @__omp_outlined__(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], [10 x i32]* [[TMP0]], [10 x i32]* [[TMP1]], i32* [[TMP2]], i32 [[TMP7]], [10 x i32]* [[TMP3]]) #[[ATTR5:[0-9]+]] +// CHECK6-NEXT: call void @__omp_outlined__(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], [10 x i32]* [[TMP0]], [10 x i32]* [[TMP1]], i32* [[TMP2]], i32 [[TMP7]], [10 x i32]* [[TMP3]]) #[[ATTR4:[0-9]+]] // CHECK6-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 2, i1 false) // CHECK6-NEXT: ret void // CHECK6: worker.exit: @@ -3722,7 +3722,7 @@ int main(int argc, char **argv) { // CHECK6-NEXT: [[TMP8:%.*]] = bitcast [10 x i32]* [[B4]] to i8* // CHECK6-NEXT: [[TMP9:%.*]] = bitcast [10 x i32]* [[TMP0]] to i8* // CHECK6-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 4 [[TMP8]], i8* align 4 [[TMP9]], i32 40, i1 false) -// CHECK6-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK6-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @__kmpc_get_hardware_num_threads_in_block() // CHECK6-NEXT: [[TMP10:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 // CHECK6-NEXT: [[TMP11:%.*]] = load i32, i32* [[TMP10]], align 4 // CHECK6-NEXT: call void @__kmpc_distribute_static_init_4(%struct.ident_t* @[[GLOB2:[0-9]+]], i32 [[TMP11]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]]) @@ -3908,20 +3908,20 @@ int main(int argc, char **argv) { // CHECK6-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP18]], 1 // CHECK6-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] // CHECK6-NEXT: store i32 [[ADD]], i32* [[I5]], align 4 -// CHECK6-NEXT: [[CALL:%.*]] = call i32 @_Z3fooPi(i32* [[I5]]) #[[ATTR7:[0-9]+]] -// CHECK6-NEXT: [[CALL7:%.*]] = call i32 @_Z3fooPi(i32* [[TMP1]]) #[[ATTR7]] +// CHECK6-NEXT: [[CALL:%.*]] = call i32 @_Z3fooPi(i32* [[I5]]) #[[ATTR6:[0-9]+]] +// CHECK6-NEXT: [[CALL7:%.*]] = call i32 @_Z3fooPi(i32* [[TMP1]]) #[[ATTR6]] // CHECK6-NEXT: [[ADD8:%.*]] = add nsw i32 [[CALL]], [[CALL7]] // CHECK6-NEXT: [[TMP19:%.*]] = load i32, i32* [[I5]], align 4 // CHECK6-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[B3]], i32 0, i32 [[TMP19]] -// CHECK6-NEXT: [[CALL9:%.*]] = call i32 @_Z3fooPi(i32* [[ARRAYIDX]]) #[[ATTR7]] +// CHECK6-NEXT: [[CALL9:%.*]] = call i32 @_Z3fooPi(i32* [[ARRAYIDX]]) #[[ATTR6]] // CHECK6-NEXT: [[ADD10:%.*]] = add nsw i32 [[ADD8]], [[CALL9]] // CHECK6-NEXT: [[TMP20:%.*]] = load i32, i32* [[I5]], align 4 // CHECK6-NEXT: [[ARRAYIDX11:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[C4]], i32 0, i32 [[TMP20]] -// CHECK6-NEXT: [[CALL12:%.*]] = call i32 @_Z3fooPi(i32* [[ARRAYIDX11]]) #[[ATTR7]] +// CHECK6-NEXT: [[CALL12:%.*]] = call i32 @_Z3fooPi(i32* [[ARRAYIDX11]]) #[[ATTR6]] // CHECK6-NEXT: [[ADD13:%.*]] = add nsw i32 [[ADD10]], [[CALL12]] // CHECK6-NEXT: [[TMP21:%.*]] = load i32, i32* [[I5]], align 4 // CHECK6-NEXT: [[ARRAYIDX14:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[TMP4]], i32 0, i32 [[TMP21]] -// CHECK6-NEXT: [[CALL15:%.*]] = call i32 @_Z3fooPi(i32* [[ARRAYIDX14]]) #[[ATTR7]] +// CHECK6-NEXT: [[CALL15:%.*]] = call i32 @_Z3fooPi(i32* [[ARRAYIDX14]]) #[[ATTR6]] // CHECK6-NEXT: [[ADD16:%.*]] = add nsw i32 [[ADD13]], [[CALL15]] // CHECK6-NEXT: store i32 [[ADD16]], i32* [[TMP1]], align 4 // CHECK6-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] diff --git a/clang/test/OpenMP/nvptx_parallel_codegen.cpp b/clang/test/OpenMP/nvptx_parallel_codegen.cpp index 35d6813d25267..96be6915cdbec 100644 --- a/clang/test/OpenMP/nvptx_parallel_codegen.cpp +++ b/clang/test/OpenMP/nvptx_parallel_codegen.cpp @@ -1665,7 +1665,7 @@ int bar(int n){ // CHECK1-NEXT: [[TMP0:%.*]] = load i32*, i32** [[A_ADDR]], align 8 // CHECK1-NEXT: [[TMP1:%.*]] = call i64 @__kmpc_warp_active_thread_mask() // CHECK1-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() -// CHECK1-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK1-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @__kmpc_get_hardware_num_threads_in_block() // CHECK1-NEXT: store i32 0, i32* [[CRITICAL_COUNTER]], align 4 // CHECK1-NEXT: br label [[OMP_CRITICAL_LOOP:%.*]] // CHECK1: omp.critical.loop: @@ -1937,7 +1937,7 @@ int bar(int n){ // CHECK2-NEXT: [[TMP0:%.*]] = load i32*, i32** [[A_ADDR]], align 4 // CHECK2-NEXT: [[TMP1:%.*]] = call i64 @__kmpc_warp_active_thread_mask() // CHECK2-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() -// CHECK2-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK2-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @__kmpc_get_hardware_num_threads_in_block() // CHECK2-NEXT: store i32 0, i32* [[CRITICAL_COUNTER]], align 4 // CHECK2-NEXT: br label [[OMP_CRITICAL_LOOP:%.*]] // CHECK2: omp.critical.loop: diff --git a/clang/test/OpenMP/nvptx_target_teams_distribute_parallel_for_codegen.cpp b/clang/test/OpenMP/nvptx_target_teams_distribute_parallel_for_codegen.cpp index b71cc83fbe784..c415a7493f813 100644 --- a/clang/test/OpenMP/nvptx_target_teams_distribute_parallel_for_codegen.cpp +++ b/clang/test/OpenMP/nvptx_target_teams_distribute_parallel_for_codegen.cpp @@ -18866,7 +18866,7 @@ int bar(int n){ // CHECK1-NEXT: store i32 [[TMP4]], i32* [[DOTOMP_COMB_UB]], align 4 // CHECK1-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 // CHECK1-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK1-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK1-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @__kmpc_get_hardware_num_threads_in_block() // CHECK1-NEXT: [[TMP5:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 // CHECK1-NEXT: [[TMP6:%.*]] = load i32, i32* [[TMP5]], align 4 // CHECK1-NEXT: call void @__kmpc_distribute_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP6]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]]) @@ -19097,7 +19097,7 @@ int bar(int n){ // CHECK1-NEXT: store i32 9, i32* [[DOTOMP_COMB_UB]], align 4 // CHECK1-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 // CHECK1-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK1-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK1-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @__kmpc_get_hardware_num_threads_in_block() // CHECK1-NEXT: [[TMP1:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 // CHECK1-NEXT: [[TMP2:%.*]] = load i32, i32* [[TMP1]], align 4 // CHECK1-NEXT: call void @__kmpc_distribute_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP2]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]]) @@ -19299,7 +19299,7 @@ int bar(int n){ // CHECK1-NEXT: store i32 99, i32* [[DOTOMP_COMB_UB]], align 4 // CHECK1-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 // CHECK1-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK1-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK1-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @__kmpc_get_hardware_num_threads_in_block() // CHECK1-NEXT: [[TMP1:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 // CHECK1-NEXT: [[TMP2:%.*]] = load i32, i32* [[TMP1]], align 4 // CHECK1-NEXT: call void @__kmpc_distribute_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP2]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]]) @@ -19562,7 +19562,7 @@ int bar(int n){ // CHECK1-NEXT: store i64 [[TMP7]], i64* [[DOTOMP_COMB_UB]], align 8 // CHECK1-NEXT: store i64 1, i64* [[DOTOMP_STRIDE]], align 8 // CHECK1-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK1-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK1-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @__kmpc_get_hardware_num_threads_in_block() // CHECK1-NEXT: [[CONV12:%.*]] = zext i32 [[NVPTX_NUM_THREADS]] to i64 // CHECK1-NEXT: [[TMP8:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 // CHECK1-NEXT: [[TMP9:%.*]] = load i32, i32* [[TMP8]], align 4 @@ -19867,7 +19867,7 @@ int bar(int n){ // CHECK1-NEXT: store i32 [[TMP4]], i32* [[DOTOMP_COMB_UB]], align 4 // CHECK1-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 // CHECK1-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK1-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK1-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @__kmpc_get_hardware_num_threads_in_block() // CHECK1-NEXT: [[TMP5:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 // CHECK1-NEXT: [[TMP6:%.*]] = load i32, i32* [[TMP5]], align 4 // CHECK1-NEXT: call void @__kmpc_distribute_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP6]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]]) @@ -20460,7 +20460,7 @@ int bar(int n){ // CHECK2-NEXT: store i32 [[TMP4]], i32* [[DOTOMP_COMB_UB]], align 4 // CHECK2-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 // CHECK2-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK2-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK2-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @__kmpc_get_hardware_num_threads_in_block() // CHECK2-NEXT: [[TMP5:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 // CHECK2-NEXT: [[TMP6:%.*]] = load i32, i32* [[TMP5]], align 4 // CHECK2-NEXT: call void @__kmpc_distribute_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP6]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]]) @@ -20691,7 +20691,7 @@ int bar(int n){ // CHECK2-NEXT: store i32 9, i32* [[DOTOMP_COMB_UB]], align 4 // CHECK2-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 // CHECK2-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK2-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK2-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @__kmpc_get_hardware_num_threads_in_block() // CHECK2-NEXT: [[TMP1:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 // CHECK2-NEXT: [[TMP2:%.*]] = load i32, i32* [[TMP1]], align 4 // CHECK2-NEXT: call void @__kmpc_distribute_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP2]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]]) @@ -20893,7 +20893,7 @@ int bar(int n){ // CHECK2-NEXT: store i32 99, i32* [[DOTOMP_COMB_UB]], align 4 // CHECK2-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 // CHECK2-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK2-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK2-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @__kmpc_get_hardware_num_threads_in_block() // CHECK2-NEXT: [[TMP1:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 // CHECK2-NEXT: [[TMP2:%.*]] = load i32, i32* [[TMP1]], align 4 // CHECK2-NEXT: call void @__kmpc_distribute_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP2]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]]) @@ -21154,7 +21154,7 @@ int bar(int n){ // CHECK2-NEXT: store i32 [[TMP7]], i32* [[DOTOMP_COMB_UB]], align 4 // CHECK2-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 // CHECK2-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK2-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK2-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @__kmpc_get_hardware_num_threads_in_block() // CHECK2-NEXT: [[TMP8:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 // CHECK2-NEXT: [[TMP9:%.*]] = load i32, i32* [[TMP8]], align 4 // CHECK2-NEXT: call void @__kmpc_distribute_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP9]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]]) @@ -21456,7 +21456,7 @@ int bar(int n){ // CHECK2-NEXT: store i32 [[TMP4]], i32* [[DOTOMP_COMB_UB]], align 4 // CHECK2-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 // CHECK2-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK2-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK2-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @__kmpc_get_hardware_num_threads_in_block() // CHECK2-NEXT: [[TMP5:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 // CHECK2-NEXT: [[TMP6:%.*]] = load i32, i32* [[TMP5]], align 4 // CHECK2-NEXT: call void @__kmpc_distribute_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP6]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]]) @@ -22029,7 +22029,7 @@ int bar(int n){ // CHECK3-NEXT: store i32 [[TMP4]], i32* [[DOTOMP_COMB_UB]], align 4 // CHECK3-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 // CHECK3-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK3-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK3-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @__kmpc_get_hardware_num_threads_in_block() // CHECK3-NEXT: [[TMP5:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 // CHECK3-NEXT: [[TMP6:%.*]] = load i32, i32* [[TMP5]], align 4 // CHECK3-NEXT: call void @__kmpc_distribute_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP6]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]]) @@ -22252,7 +22252,7 @@ int bar(int n){ // CHECK3-NEXT: store i32 9, i32* [[DOTOMP_COMB_UB]], align 4 // CHECK3-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 // CHECK3-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK3-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK3-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @__kmpc_get_hardware_num_threads_in_block() // CHECK3-NEXT: [[TMP1:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 // CHECK3-NEXT: [[TMP2:%.*]] = load i32, i32* [[TMP1]], align 4 // CHECK3-NEXT: call void @__kmpc_distribute_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP2]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]]) @@ -22445,7 +22445,7 @@ int bar(int n){ // CHECK3-NEXT: store i32 99, i32* [[DOTOMP_COMB_UB]], align 4 // CHECK3-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 // CHECK3-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK3-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK3-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @__kmpc_get_hardware_num_threads_in_block() // CHECK3-NEXT: [[TMP1:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 // CHECK3-NEXT: [[TMP2:%.*]] = load i32, i32* [[TMP1]], align 4 // CHECK3-NEXT: call void @__kmpc_distribute_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP2]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]]) @@ -22696,7 +22696,7 @@ int bar(int n){ // CHECK3-NEXT: store i64 [[TMP7]], i64* [[DOTOMP_COMB_UB]], align 8 // CHECK3-NEXT: store i64 1, i64* [[DOTOMP_STRIDE]], align 8 // CHECK3-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK3-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK3-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @__kmpc_get_hardware_num_threads_in_block() // CHECK3-NEXT: [[CONV11:%.*]] = zext i32 [[NVPTX_NUM_THREADS]] to i64 // CHECK3-NEXT: [[TMP8:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 // CHECK3-NEXT: [[TMP9:%.*]] = load i32, i32* [[TMP8]], align 4 @@ -22999,7 +22999,7 @@ int bar(int n){ // CHECK3-NEXT: store i32 [[TMP4]], i32* [[DOTOMP_COMB_UB]], align 4 // CHECK3-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 // CHECK3-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK3-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK3-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @__kmpc_get_hardware_num_threads_in_block() // CHECK3-NEXT: [[TMP5:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 // CHECK3-NEXT: [[TMP6:%.*]] = load i32, i32* [[TMP5]], align 4 // CHECK3-NEXT: call void @__kmpc_distribute_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP6]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]]) @@ -23563,7 +23563,7 @@ int bar(int n){ // CHECK4-NEXT: store i32 [[TMP4]], i32* [[DOTOMP_COMB_UB]], align 4 // CHECK4-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 // CHECK4-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK4-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK4-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @__kmpc_get_hardware_num_threads_in_block() // CHECK4-NEXT: [[TMP5:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 // CHECK4-NEXT: [[TMP6:%.*]] = load i32, i32* [[TMP5]], align 4 // CHECK4-NEXT: call void @__kmpc_distribute_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP6]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]]) @@ -23786,7 +23786,7 @@ int bar(int n){ // CHECK4-NEXT: store i32 9, i32* [[DOTOMP_COMB_UB]], align 4 // CHECK4-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 // CHECK4-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK4-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK4-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @__kmpc_get_hardware_num_threads_in_block() // CHECK4-NEXT: [[TMP1:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 // CHECK4-NEXT: [[TMP2:%.*]] = load i32, i32* [[TMP1]], align 4 // CHECK4-NEXT: call void @__kmpc_distribute_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP2]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]]) @@ -23979,7 +23979,7 @@ int bar(int n){ // CHECK4-NEXT: store i32 99, i32* [[DOTOMP_COMB_UB]], align 4 // CHECK4-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 // CHECK4-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK4-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK4-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @__kmpc_get_hardware_num_threads_in_block() // CHECK4-NEXT: [[TMP1:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 // CHECK4-NEXT: [[TMP2:%.*]] = load i32, i32* [[TMP1]], align 4 // CHECK4-NEXT: call void @__kmpc_distribute_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP2]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]]) @@ -24230,7 +24230,7 @@ int bar(int n){ // CHECK4-NEXT: store i64 [[TMP7]], i64* [[DOTOMP_COMB_UB]], align 8 // CHECK4-NEXT: store i64 1, i64* [[DOTOMP_STRIDE]], align 8 // CHECK4-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK4-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK4-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @__kmpc_get_hardware_num_threads_in_block() // CHECK4-NEXT: [[CONV11:%.*]] = zext i32 [[NVPTX_NUM_THREADS]] to i64 // CHECK4-NEXT: [[TMP8:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 // CHECK4-NEXT: [[TMP9:%.*]] = load i32, i32* [[TMP8]], align 4 @@ -24533,7 +24533,7 @@ int bar(int n){ // CHECK4-NEXT: store i32 [[TMP4]], i32* [[DOTOMP_COMB_UB]], align 4 // CHECK4-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 // CHECK4-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK4-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK4-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @__kmpc_get_hardware_num_threads_in_block() // CHECK4-NEXT: [[TMP5:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 // CHECK4-NEXT: [[TMP6:%.*]] = load i32, i32* [[TMP5]], align 4 // CHECK4-NEXT: call void @__kmpc_distribute_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP6]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]]) diff --git a/clang/test/OpenMP/nvptx_target_teams_distribute_parallel_for_generic_mode_codegen.cpp b/clang/test/OpenMP/nvptx_target_teams_distribute_parallel_for_generic_mode_codegen.cpp index ea13b9312cb1c..261b277cdc497 100644 --- a/clang/test/OpenMP/nvptx_target_teams_distribute_parallel_for_generic_mode_codegen.cpp +++ b/clang/test/OpenMP/nvptx_target_teams_distribute_parallel_for_generic_mode_codegen.cpp @@ -60,7 +60,7 @@ int main(int argc, char **argv) { // CHECK1-NEXT: store i32 [[TMP5]], i32* [[CONV3]], align 4 // CHECK1-NEXT: [[TMP6:%.*]] = load i64, i64* [[DOTCAPTURE_EXPR__CASTED]], align 8 // CHECK1-NEXT: store i32 [[TMP2]], i32* [[DOTTHREADID_TEMP_]], align 4 -// CHECK1-NEXT: call void @__omp_outlined__(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i64 [[TMP4]], i32* [[TMP0]], i64 [[TMP6]]) #[[ATTR3:[0-9]+]] +// CHECK1-NEXT: call void @__omp_outlined__(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i64 [[TMP4]], i32* [[TMP0]], i64 [[TMP6]]) #[[ATTR2:[0-9]+]] // CHECK1-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 2, i1 false) // CHECK1-NEXT: ret void // CHECK1: worker.exit: @@ -113,7 +113,7 @@ int main(int argc, char **argv) { // CHECK1-NEXT: store i32 [[TMP4]], i32* [[DOTOMP_COMB_UB]], align 4 // CHECK1-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 // CHECK1-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK1-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK1-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @__kmpc_get_hardware_num_threads_in_block() // CHECK1-NEXT: [[TMP5:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 // CHECK1-NEXT: [[TMP6:%.*]] = load i32, i32* [[TMP5]], align 4 // CHECK1-NEXT: call void @__kmpc_distribute_static_init_4(%struct.ident_t* @[[GLOB2:[0-9]+]], i32 [[TMP6]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]]) @@ -304,10 +304,10 @@ int main(int argc, char **argv) { // CHECK1-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP19]], 1 // CHECK1-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] // CHECK1-NEXT: store i32 [[ADD]], i32* [[I7]], align 4 -// CHECK1-NEXT: [[CALL:%.*]] = call i32 @_Z3fooPi(i32* [[I7]]) #[[ATTR5:[0-9]+]] -// CHECK1-NEXT: [[CALL13:%.*]] = call i32 @_Z3fooPi(i32* [[TMP0]]) #[[ATTR5]] +// CHECK1-NEXT: [[CALL:%.*]] = call i32 @_Z3fooPi(i32* [[I7]]) #[[ATTR4:[0-9]+]] +// CHECK1-NEXT: [[CALL13:%.*]] = call i32 @_Z3fooPi(i32* [[TMP0]]) #[[ATTR4]] // CHECK1-NEXT: [[ADD14:%.*]] = add nsw i32 [[CALL]], [[CALL13]] -// CHECK1-NEXT: [[CALL15:%.*]] = call i32 @_Z3fooPi(i32* [[CONV]]) #[[ATTR5]] +// CHECK1-NEXT: [[CALL15:%.*]] = call i32 @_Z3fooPi(i32* [[CONV]]) #[[ATTR4]] // CHECK1-NEXT: [[ADD16:%.*]] = add nsw i32 [[ADD14]], [[CALL15]] // CHECK1-NEXT: store i32 [[ADD16]], i32* [[TMP0]], align 4 // CHECK1-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] @@ -366,7 +366,7 @@ int main(int argc, char **argv) { // CHECK2-NEXT: store i32 [[TMP5]], i32* [[DOTCAPTURE_EXPR__CASTED]], align 4 // CHECK2-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR__CASTED]], align 4 // CHECK2-NEXT: store i32 [[TMP2]], i32* [[DOTTHREADID_TEMP_]], align 4 -// CHECK2-NEXT: call void @__omp_outlined__(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i32 [[TMP4]], i32* [[TMP0]], i32 [[TMP6]]) #[[ATTR3:[0-9]+]] +// CHECK2-NEXT: call void @__omp_outlined__(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i32 [[TMP4]], i32* [[TMP0]], i32 [[TMP6]]) #[[ATTR2:[0-9]+]] // CHECK2-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 2, i1 false) // CHECK2-NEXT: ret void // CHECK2: worker.exit: @@ -417,7 +417,7 @@ int main(int argc, char **argv) { // CHECK2-NEXT: store i32 [[TMP4]], i32* [[DOTOMP_COMB_UB]], align 4 // CHECK2-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 // CHECK2-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK2-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK2-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @__kmpc_get_hardware_num_threads_in_block() // CHECK2-NEXT: [[TMP5:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 // CHECK2-NEXT: [[TMP6:%.*]] = load i32, i32* [[TMP5]], align 4 // CHECK2-NEXT: call void @__kmpc_distribute_static_init_4(%struct.ident_t* @[[GLOB2:[0-9]+]], i32 [[TMP6]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]]) @@ -598,10 +598,10 @@ int main(int argc, char **argv) { // CHECK2-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP19]], 1 // CHECK2-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] // CHECK2-NEXT: store i32 [[ADD]], i32* [[I4]], align 4 -// CHECK2-NEXT: [[CALL:%.*]] = call i32 @_Z3fooPi(i32* [[I4]]) #[[ATTR5:[0-9]+]] -// CHECK2-NEXT: [[CALL8:%.*]] = call i32 @_Z3fooPi(i32* [[TMP0]]) #[[ATTR5]] +// CHECK2-NEXT: [[CALL:%.*]] = call i32 @_Z3fooPi(i32* [[I4]]) #[[ATTR4:[0-9]+]] +// CHECK2-NEXT: [[CALL8:%.*]] = call i32 @_Z3fooPi(i32* [[TMP0]]) #[[ATTR4]] // CHECK2-NEXT: [[ADD9:%.*]] = add nsw i32 [[CALL]], [[CALL8]] -// CHECK2-NEXT: [[CALL10:%.*]] = call i32 @_Z3fooPi(i32* [[ARGC_ADDR]]) #[[ATTR5]] +// CHECK2-NEXT: [[CALL10:%.*]] = call i32 @_Z3fooPi(i32* [[ARGC_ADDR]]) #[[ATTR4]] // CHECK2-NEXT: [[ADD11:%.*]] = add nsw i32 [[ADD9]], [[CALL10]] // CHECK2-NEXT: store i32 [[ADD11]], i32* [[TMP0]], align 4 // CHECK2-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] @@ -660,7 +660,7 @@ int main(int argc, char **argv) { // CHECK3-NEXT: store i32 [[TMP5]], i32* [[DOTCAPTURE_EXPR__CASTED]], align 4 // CHECK3-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR__CASTED]], align 4 // CHECK3-NEXT: store i32 [[TMP2]], i32* [[DOTTHREADID_TEMP_]], align 4 -// CHECK3-NEXT: call void @__omp_outlined__(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i32 [[TMP4]], i32* [[TMP0]], i32 [[TMP6]]) #[[ATTR3:[0-9]+]] +// CHECK3-NEXT: call void @__omp_outlined__(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i32 [[TMP4]], i32* [[TMP0]], i32 [[TMP6]]) #[[ATTR2:[0-9]+]] // CHECK3-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 2, i1 false) // CHECK3-NEXT: ret void // CHECK3: worker.exit: @@ -711,7 +711,7 @@ int main(int argc, char **argv) { // CHECK3-NEXT: store i32 [[TMP4]], i32* [[DOTOMP_COMB_UB]], align 4 // CHECK3-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 // CHECK3-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK3-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK3-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @__kmpc_get_hardware_num_threads_in_block() // CHECK3-NEXT: [[TMP5:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 // CHECK3-NEXT: [[TMP6:%.*]] = load i32, i32* [[TMP5]], align 4 // CHECK3-NEXT: call void @__kmpc_distribute_static_init_4(%struct.ident_t* @[[GLOB2:[0-9]+]], i32 [[TMP6]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]]) @@ -892,10 +892,10 @@ int main(int argc, char **argv) { // CHECK3-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP19]], 1 // CHECK3-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] // CHECK3-NEXT: store i32 [[ADD]], i32* [[I4]], align 4 -// CHECK3-NEXT: [[CALL:%.*]] = call i32 @_Z3fooPi(i32* [[I4]]) #[[ATTR5:[0-9]+]] -// CHECK3-NEXT: [[CALL8:%.*]] = call i32 @_Z3fooPi(i32* [[TMP0]]) #[[ATTR5]] +// CHECK3-NEXT: [[CALL:%.*]] = call i32 @_Z3fooPi(i32* [[I4]]) #[[ATTR4:[0-9]+]] +// CHECK3-NEXT: [[CALL8:%.*]] = call i32 @_Z3fooPi(i32* [[TMP0]]) #[[ATTR4]] // CHECK3-NEXT: [[ADD9:%.*]] = add nsw i32 [[CALL]], [[CALL8]] -// CHECK3-NEXT: [[CALL10:%.*]] = call i32 @_Z3fooPi(i32* [[ARGC_ADDR]]) #[[ATTR5]] +// CHECK3-NEXT: [[CALL10:%.*]] = call i32 @_Z3fooPi(i32* [[ARGC_ADDR]]) #[[ATTR4]] // CHECK3-NEXT: [[ADD11:%.*]] = add nsw i32 [[ADD9]], [[CALL10]] // CHECK3-NEXT: store i32 [[ADD11]], i32* [[TMP0]], align 4 // CHECK3-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] @@ -958,7 +958,7 @@ int main(int argc, char **argv) { // CHECK4-NEXT: store i32 [[TMP5]], i32* [[CONV3]], align 4 // CHECK4-NEXT: [[TMP6:%.*]] = load i64, i64* [[DOTCAPTURE_EXPR__CASTED]], align 8 // CHECK4-NEXT: store i32 [[TMP2]], i32* [[DOTTHREADID_TEMP_]], align 4 -// CHECK4-NEXT: call void @__omp_outlined__(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i64 [[TMP4]], i32* [[TMP0]], i64 [[TMP6]]) #[[ATTR3:[0-9]+]] +// CHECK4-NEXT: call void @__omp_outlined__(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i64 [[TMP4]], i32* [[TMP0]], i64 [[TMP6]]) #[[ATTR2:[0-9]+]] // CHECK4-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 2, i1 false) // CHECK4-NEXT: ret void // CHECK4: worker.exit: @@ -1011,7 +1011,7 @@ int main(int argc, char **argv) { // CHECK4-NEXT: store i32 [[TMP4]], i32* [[DOTOMP_COMB_UB]], align 4 // CHECK4-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 // CHECK4-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK4-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK4-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @__kmpc_get_hardware_num_threads_in_block() // CHECK4-NEXT: [[TMP5:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 // CHECK4-NEXT: [[TMP6:%.*]] = load i32, i32* [[TMP5]], align 4 // CHECK4-NEXT: call void @__kmpc_distribute_static_init_4(%struct.ident_t* @[[GLOB2:[0-9]+]], i32 [[TMP6]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]]) @@ -1202,10 +1202,10 @@ int main(int argc, char **argv) { // CHECK4-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP19]], 1 // CHECK4-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] // CHECK4-NEXT: store i32 [[ADD]], i32* [[I7]], align 4 -// CHECK4-NEXT: [[CALL:%.*]] = call i32 @_Z3fooPi(i32* [[I7]]) #[[ATTR5:[0-9]+]] -// CHECK4-NEXT: [[CALL13:%.*]] = call i32 @_Z3fooPi(i32* [[TMP0]]) #[[ATTR5]] +// CHECK4-NEXT: [[CALL:%.*]] = call i32 @_Z3fooPi(i32* [[I7]]) #[[ATTR4:[0-9]+]] +// CHECK4-NEXT: [[CALL13:%.*]] = call i32 @_Z3fooPi(i32* [[TMP0]]) #[[ATTR4]] // CHECK4-NEXT: [[ADD14:%.*]] = add nsw i32 [[CALL]], [[CALL13]] -// CHECK4-NEXT: [[CALL15:%.*]] = call i32 @_Z3fooPi(i32* [[CONV]]) #[[ATTR5]] +// CHECK4-NEXT: [[CALL15:%.*]] = call i32 @_Z3fooPi(i32* [[CONV]]) #[[ATTR4]] // CHECK4-NEXT: [[ADD16:%.*]] = add nsw i32 [[ADD14]], [[CALL15]] // CHECK4-NEXT: store i32 [[ADD16]], i32* [[TMP0]], align 4 // CHECK4-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] @@ -1264,7 +1264,7 @@ int main(int argc, char **argv) { // CHECK5-NEXT: store i32 [[TMP5]], i32* [[DOTCAPTURE_EXPR__CASTED]], align 4 // CHECK5-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR__CASTED]], align 4 // CHECK5-NEXT: store i32 [[TMP2]], i32* [[DOTTHREADID_TEMP_]], align 4 -// CHECK5-NEXT: call void @__omp_outlined__(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i32 [[TMP4]], i32* [[TMP0]], i32 [[TMP6]]) #[[ATTR3:[0-9]+]] +// CHECK5-NEXT: call void @__omp_outlined__(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i32 [[TMP4]], i32* [[TMP0]], i32 [[TMP6]]) #[[ATTR2:[0-9]+]] // CHECK5-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 2, i1 false) // CHECK5-NEXT: ret void // CHECK5: worker.exit: @@ -1315,7 +1315,7 @@ int main(int argc, char **argv) { // CHECK5-NEXT: store i32 [[TMP4]], i32* [[DOTOMP_COMB_UB]], align 4 // CHECK5-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 // CHECK5-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK5-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK5-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @__kmpc_get_hardware_num_threads_in_block() // CHECK5-NEXT: [[TMP5:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 // CHECK5-NEXT: [[TMP6:%.*]] = load i32, i32* [[TMP5]], align 4 // CHECK5-NEXT: call void @__kmpc_distribute_static_init_4(%struct.ident_t* @[[GLOB2:[0-9]+]], i32 [[TMP6]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]]) @@ -1496,10 +1496,10 @@ int main(int argc, char **argv) { // CHECK5-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP19]], 1 // CHECK5-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] // CHECK5-NEXT: store i32 [[ADD]], i32* [[I4]], align 4 -// CHECK5-NEXT: [[CALL:%.*]] = call i32 @_Z3fooPi(i32* [[I4]]) #[[ATTR5:[0-9]+]] -// CHECK5-NEXT: [[CALL8:%.*]] = call i32 @_Z3fooPi(i32* [[TMP0]]) #[[ATTR5]] +// CHECK5-NEXT: [[CALL:%.*]] = call i32 @_Z3fooPi(i32* [[I4]]) #[[ATTR4:[0-9]+]] +// CHECK5-NEXT: [[CALL8:%.*]] = call i32 @_Z3fooPi(i32* [[TMP0]]) #[[ATTR4]] // CHECK5-NEXT: [[ADD9:%.*]] = add nsw i32 [[CALL]], [[CALL8]] -// CHECK5-NEXT: [[CALL10:%.*]] = call i32 @_Z3fooPi(i32* [[ARGC_ADDR]]) #[[ATTR5]] +// CHECK5-NEXT: [[CALL10:%.*]] = call i32 @_Z3fooPi(i32* [[ARGC_ADDR]]) #[[ATTR4]] // CHECK5-NEXT: [[ADD11:%.*]] = add nsw i32 [[ADD9]], [[CALL10]] // CHECK5-NEXT: store i32 [[ADD11]], i32* [[TMP0]], align 4 // CHECK5-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] @@ -1558,7 +1558,7 @@ int main(int argc, char **argv) { // CHECK6-NEXT: store i32 [[TMP5]], i32* [[DOTCAPTURE_EXPR__CASTED]], align 4 // CHECK6-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR__CASTED]], align 4 // CHECK6-NEXT: store i32 [[TMP2]], i32* [[DOTTHREADID_TEMP_]], align 4 -// CHECK6-NEXT: call void @__omp_outlined__(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i32 [[TMP4]], i32* [[TMP0]], i32 [[TMP6]]) #[[ATTR3:[0-9]+]] +// CHECK6-NEXT: call void @__omp_outlined__(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i32 [[TMP4]], i32* [[TMP0]], i32 [[TMP6]]) #[[ATTR2:[0-9]+]] // CHECK6-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 2, i1 false) // CHECK6-NEXT: ret void // CHECK6: worker.exit: @@ -1609,7 +1609,7 @@ int main(int argc, char **argv) { // CHECK6-NEXT: store i32 [[TMP4]], i32* [[DOTOMP_COMB_UB]], align 4 // CHECK6-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 // CHECK6-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK6-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK6-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @__kmpc_get_hardware_num_threads_in_block() // CHECK6-NEXT: [[TMP5:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 // CHECK6-NEXT: [[TMP6:%.*]] = load i32, i32* [[TMP5]], align 4 // CHECK6-NEXT: call void @__kmpc_distribute_static_init_4(%struct.ident_t* @[[GLOB2:[0-9]+]], i32 [[TMP6]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]]) @@ -1790,10 +1790,10 @@ int main(int argc, char **argv) { // CHECK6-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP19]], 1 // CHECK6-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] // CHECK6-NEXT: store i32 [[ADD]], i32* [[I4]], align 4 -// CHECK6-NEXT: [[CALL:%.*]] = call i32 @_Z3fooPi(i32* [[I4]]) #[[ATTR5:[0-9]+]] -// CHECK6-NEXT: [[CALL8:%.*]] = call i32 @_Z3fooPi(i32* [[TMP0]]) #[[ATTR5]] +// CHECK6-NEXT: [[CALL:%.*]] = call i32 @_Z3fooPi(i32* [[I4]]) #[[ATTR4:[0-9]+]] +// CHECK6-NEXT: [[CALL8:%.*]] = call i32 @_Z3fooPi(i32* [[TMP0]]) #[[ATTR4]] // CHECK6-NEXT: [[ADD9:%.*]] = add nsw i32 [[CALL]], [[CALL8]] -// CHECK6-NEXT: [[CALL10:%.*]] = call i32 @_Z3fooPi(i32* [[ARGC_ADDR]]) #[[ATTR5]] +// CHECK6-NEXT: [[CALL10:%.*]] = call i32 @_Z3fooPi(i32* [[ARGC_ADDR]]) #[[ATTR4]] // CHECK6-NEXT: [[ADD11:%.*]] = add nsw i32 [[ADD9]], [[CALL10]] // CHECK6-NEXT: store i32 [[ADD11]], i32* [[TMP0]], align 4 // CHECK6-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] diff --git a/clang/test/OpenMP/nvptx_target_teams_distribute_parallel_for_simd_codegen.cpp b/clang/test/OpenMP/nvptx_target_teams_distribute_parallel_for_simd_codegen.cpp index 14172267626b8..89b808d037686 100644 --- a/clang/test/OpenMP/nvptx_target_teams_distribute_parallel_for_simd_codegen.cpp +++ b/clang/test/OpenMP/nvptx_target_teams_distribute_parallel_for_simd_codegen.cpp @@ -9803,7 +9803,7 @@ int bar(int n){ // CHECK1-NEXT: store i32 [[TMP4]], i32* [[DOTOMP_COMB_UB]], align 4 // CHECK1-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 // CHECK1-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK1-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK1-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @__kmpc_get_hardware_num_threads_in_block() // CHECK1-NEXT: [[TMP5:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 // CHECK1-NEXT: [[TMP6:%.*]] = load i32, i32* [[TMP5]], align 4 // CHECK1-NEXT: call void @__kmpc_distribute_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP6]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]]) @@ -10058,7 +10058,7 @@ int bar(int n){ // CHECK1-NEXT: store i32 9, i32* [[DOTOMP_COMB_UB]], align 4 // CHECK1-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 // CHECK1-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK1-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK1-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @__kmpc_get_hardware_num_threads_in_block() // CHECK1-NEXT: [[TMP1:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 // CHECK1-NEXT: [[TMP2:%.*]] = load i32, i32* [[TMP1]], align 4 // CHECK1-NEXT: call void @__kmpc_distribute_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP2]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]]) @@ -10274,7 +10274,7 @@ int bar(int n){ // CHECK1-NEXT: store i32 99, i32* [[DOTOMP_COMB_UB]], align 4 // CHECK1-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 // CHECK1-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK1-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK1-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @__kmpc_get_hardware_num_threads_in_block() // CHECK1-NEXT: [[TMP1:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 // CHECK1-NEXT: [[TMP2:%.*]] = load i32, i32* [[TMP1]], align 4 // CHECK1-NEXT: call void @__kmpc_distribute_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP2]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]]) @@ -10867,7 +10867,7 @@ int bar(int n){ // CHECK2-NEXT: store i32 [[TMP4]], i32* [[DOTOMP_COMB_UB]], align 4 // CHECK2-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 // CHECK2-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK2-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK2-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @__kmpc_get_hardware_num_threads_in_block() // CHECK2-NEXT: [[TMP5:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 // CHECK2-NEXT: [[TMP6:%.*]] = load i32, i32* [[TMP5]], align 4 // CHECK2-NEXT: call void @__kmpc_distribute_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP6]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]]) @@ -11114,7 +11114,7 @@ int bar(int n){ // CHECK2-NEXT: store i32 9, i32* [[DOTOMP_COMB_UB]], align 4 // CHECK2-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 // CHECK2-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK2-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK2-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @__kmpc_get_hardware_num_threads_in_block() // CHECK2-NEXT: [[TMP1:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 // CHECK2-NEXT: [[TMP2:%.*]] = load i32, i32* [[TMP1]], align 4 // CHECK2-NEXT: call void @__kmpc_distribute_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP2]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]]) @@ -11321,7 +11321,7 @@ int bar(int n){ // CHECK2-NEXT: store i32 99, i32* [[DOTOMP_COMB_UB]], align 4 // CHECK2-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 // CHECK2-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK2-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK2-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @__kmpc_get_hardware_num_threads_in_block() // CHECK2-NEXT: [[TMP1:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 // CHECK2-NEXT: [[TMP2:%.*]] = load i32, i32* [[TMP1]], align 4 // CHECK2-NEXT: call void @__kmpc_distribute_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP2]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]]) @@ -11905,7 +11905,7 @@ int bar(int n){ // CHECK3-NEXT: store i32 [[TMP4]], i32* [[DOTOMP_COMB_UB]], align 4 // CHECK3-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 // CHECK3-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK3-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK3-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @__kmpc_get_hardware_num_threads_in_block() // CHECK3-NEXT: [[TMP5:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 // CHECK3-NEXT: [[TMP6:%.*]] = load i32, i32* [[TMP5]], align 4 // CHECK3-NEXT: call void @__kmpc_distribute_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP6]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]]) @@ -12152,7 +12152,7 @@ int bar(int n){ // CHECK3-NEXT: store i32 9, i32* [[DOTOMP_COMB_UB]], align 4 // CHECK3-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 // CHECK3-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK3-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK3-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @__kmpc_get_hardware_num_threads_in_block() // CHECK3-NEXT: [[TMP1:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 // CHECK3-NEXT: [[TMP2:%.*]] = load i32, i32* [[TMP1]], align 4 // CHECK3-NEXT: call void @__kmpc_distribute_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP2]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]]) @@ -12359,7 +12359,7 @@ int bar(int n){ // CHECK3-NEXT: store i32 99, i32* [[DOTOMP_COMB_UB]], align 4 // CHECK3-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4 // CHECK3-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4 -// CHECK3-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK3-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @__kmpc_get_hardware_num_threads_in_block() // CHECK3-NEXT: [[TMP1:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 // CHECK3-NEXT: [[TMP2:%.*]] = load i32, i32* [[TMP1]], align 4 // CHECK3-NEXT: call void @__kmpc_distribute_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP2]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]]) diff --git a/openmp/libomptarget/DeviceRTL/include/Interface.h b/openmp/libomptarget/DeviceRTL/include/Interface.h index ff4f929345651..c7bfede8bbd42 100644 --- a/openmp/libomptarget/DeviceRTL/include/Interface.h +++ b/openmp/libomptarget/DeviceRTL/include/Interface.h @@ -203,6 +203,9 @@ void __kmpc_get_shared_variables(void ***GlobalArgs); /// External interface to get the thread ID. uint32_t __kmpc_get_hardware_thread_id_in_block(); +/// External interface to get the number of threads. +uint32_t __kmpc_get_hardware_num_threads_in_block(); + /// Kernel /// ///{ diff --git a/openmp/libomptarget/DeviceRTL/src/Mapping.cpp b/openmp/libomptarget/DeviceRTL/src/Mapping.cpp index 66089eaa7496c..c2041856b6cc6 100644 --- a/openmp/libomptarget/DeviceRTL/src/Mapping.cpp +++ b/openmp/libomptarget/DeviceRTL/src/Mapping.cpp @@ -84,8 +84,7 @@ uint32_t getNumberOfBlocks() { } uint32_t getNumberOfProcessorElements() { - // TODO - return mapping::getBlockSize(); + return getBlockSize(); } uint32_t getWarpId() { @@ -234,5 +233,9 @@ extern "C" { __attribute__((noinline)) uint32_t __kmpc_get_hardware_thread_id_in_block() { return mapping::getThreadIdInBlock(); } + +__attribute__((noinline)) uint32_t __kmpc_get_hardware_num_threads_in_block() { + return mapping::getNumberOfProcessorElements(); +} } #pragma omp end declare target diff --git a/openmp/libomptarget/DeviceRTL/src/Utils.cpp b/openmp/libomptarget/DeviceRTL/src/Utils.cpp index 1317bc69fc512..f11c54ed1c497 100644 --- a/openmp/libomptarget/DeviceRTL/src/Utils.cpp +++ b/openmp/libomptarget/DeviceRTL/src/Utils.cpp @@ -22,6 +22,7 @@ namespace _OMP { /// Helper to keep code alive without introducing a performance penalty. __attribute__((used, weak, optnone)) void keepAlive() { __kmpc_get_hardware_thread_id_in_block(); + __kmpc_get_hardware_num_threads_in_block(); __kmpc_barrier_simple_spmd(nullptr, 0); } } // namespace _OMP _______________________________________________ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits