[PATCH] D38976: [OpenMP] Add implicit data sharing support when offloading to NVIDIA GPUs using OpenMP device offloading
ABataev accepted this revision. ABataev added a comment. LG Comment at: lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp:474 CodeGenFunction CGF(CGM, /*suppressNewContext=*/true); CGF.disableDebugInfo(); CGF.StartFunction(GlobalDecl(), Ctx.VoidTy, WST.WorkerFn, *WST.CGFI, {}); Later we should remove it from the code. https://reviews.llvm.org/D38976 ___ cfe-commits mailing list cfe-commits@lists.llvm.org http://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[PATCH] D38976: [OpenMP] Add implicit data sharing support when offloading to NVIDIA GPUs using OpenMP device offloading
gtbercea updated this revision to Diff 121543. gtbercea edited the summary of this revision. gtbercea added a comment. Remove blocks. https://reviews.llvm.org/D38976 Files: lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp lib/CodeGen/CGOpenMPRuntimeNVPTX.h test/OpenMP/nvptx_data_sharing.cpp test/OpenMP/nvptx_parallel_codegen.cpp test/OpenMP/nvptx_target_teams_codegen.cpp Index: test/OpenMP/nvptx_target_teams_codegen.cpp === --- test/OpenMP/nvptx_target_teams_codegen.cpp +++ test/OpenMP/nvptx_target_teams_codegen.cpp @@ -60,7 +60,7 @@ // // CHECK: [[AWAIT_WORK]] // CHECK: call void @llvm.nvvm.barrier0() - // CHECK: [[KPR:%.+]] = call i1 @__kmpc_kernel_parallel(i8** [[OMP_WORK_FN]]) + // CHECK: [[KPR:%.+]] = call i1 @__kmpc_kernel_parallel(i8** [[OMP_WORK_FN]], i8*** %shared_args) // CHECK: [[KPRB:%.+]] = zext i1 [[KPR]] to i8 // store i8 [[KPRB]], i8* [[OMP_EXEC_STATUS]], align 1 // CHECK: [[WORK:%.+]] = load i8*, i8** [[OMP_WORK_FN]], @@ -146,7 +146,7 @@ // // CHECK: [[AWAIT_WORK]] // CHECK: call void @llvm.nvvm.barrier0() - // CHECK: [[KPR:%.+]] = call i1 @__kmpc_kernel_parallel(i8** [[OMP_WORK_FN]]) + // CHECK: [[KPR:%.+]] = call i1 @__kmpc_kernel_parallel(i8** [[OMP_WORK_FN]], i8*** %shared_args) // CHECK: [[KPRB:%.+]] = zext i1 [[KPR]] to i8 // store i8 [[KPRB]], i8* [[OMP_EXEC_STATUS]], align 1 // CHECK: [[WORK:%.+]] = load i8*, i8** [[OMP_WORK_FN]], Index: test/OpenMP/nvptx_parallel_codegen.cpp === --- test/OpenMP/nvptx_parallel_codegen.cpp +++ test/OpenMP/nvptx_parallel_codegen.cpp @@ -78,7 +78,7 @@ // // CHECK: [[AWAIT_WORK]] // CHECK: call void @llvm.nvvm.barrier0() - // CHECK: [[KPR:%.+]] = call i1 @__kmpc_kernel_parallel(i8** [[OMP_WORK_FN]]) + // CHECK: [[KPR:%.+]] = call i1 @__kmpc_kernel_parallel(i8** [[OMP_WORK_FN]], // CHECK: [[KPRB:%.+]] = zext i1 [[KPR]] to i8 // store i8 [[KPRB]], i8* [[OMP_EXEC_STATUS]], align 1 // CHECK: [[WORK:%.+]] = load i8*, i8** [[OMP_WORK_FN]], @@ -92,20 +92,20 @@ // // CHECK: [[EXEC_PARALLEL]] // CHECK: [[WF1:%.+]] = load i8*, i8** [[OMP_WORK_FN]], - // CHECK: [[WM1:%.+]] = icmp eq i8* [[WF1]], bitcast (void (i32*, i32*)* [[PARALLEL_FN1:@.+]] to i8*) + // CHECK: [[WM1:%.+]] = icmp eq i8* [[WF1]], bitcast (void (i16, i32, i8**)* [[PARALLEL_FN1:@.+]]_wrapper to i8*) // CHECK: br i1 [[WM1]], label {{%?}}[[EXEC_PFN1:.+]], label {{%?}}[[CHECK_NEXT1:.+]] // // CHECK: [[EXEC_PFN1]] - // CHECK: call void [[PARALLEL_FN1]]( + // CHECK: call void [[PARALLEL_FN1]]_wrapper( // CHECK: br label {{%?}}[[TERM_PARALLEL:.+]] // // CHECK: [[CHECK_NEXT1]] // CHECK: [[WF2:%.+]] = load i8*, i8** [[OMP_WORK_FN]], - // CHECK: [[WM2:%.+]] = icmp eq i8* [[WF2]], bitcast (void (i32*, i32*)* [[PARALLEL_FN2:@.+]] to i8*) + // CHECK: [[WM2:%.+]] = icmp eq i8* [[WF2]], bitcast (void (i16, i32, i8**)* [[PARALLEL_FN2:@.+]]_wrapper to i8*) // CHECK: br i1 [[WM2]], label {{%?}}[[EXEC_PFN2:.+]], label {{%?}}[[CHECK_NEXT2:.+]] // // CHECK: [[EXEC_PFN2]] - // CHECK: call void [[PARALLEL_FN2]]( + // CHECK: call void [[PARALLEL_FN2]]_wrapper( // CHECK: br label {{%?}}[[TERM_PARALLEL:.+]] // // CHECK: [[CHECK_NEXT2]] @@ -152,13 +152,13 @@ // CHECK-DAG: [[MWS:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() // CHECK: [[MTMP1:%.+]] = sub i32 [[MNTH]], [[MWS]] // CHECK: call void @__kmpc_kernel_init(i32 [[MTMP1]] - // CHECK: call void @__kmpc_kernel_prepare_parallel(i8* bitcast (void (i32*, i32*)* [[PARALLEL_FN1]] to i8*)) + // CHECK: call void @__kmpc_kernel_prepare_parallel(i8* bitcast (void (i16, i32, i8**)* [[PARALLEL_FN1]]_wrapper to i8*), // CHECK: call void @llvm.nvvm.barrier0() // CHECK: call void @llvm.nvvm.barrier0() // CHECK: call void @__kmpc_serialized_parallel( // CHECK: {{call|invoke}} void [[PARALLEL_FN3:@.+]]( // CHECK: call void @__kmpc_end_serialized_parallel( - // CHECK: call void @__kmpc_kernel_prepare_parallel(i8* bitcast (void (i32*, i32*)* [[PARALLEL_FN2]] to i8*)) + // CHECK: call void @__kmpc_kernel_prepare_parallel(i8* bitcast (void (i16, i32, i8**)* [[PARALLEL_FN2]]_wrapper to i8*), // CHECK: call void @llvm.nvvm.barrier0() // CHECK: call void @llvm.nvvm.barrier0() // CHECK-64-DAG: load i32, i32* [[REF_A]] @@ -203,7 +203,7 @@ // // CHECK: [[AWAIT_WORK]] // CHECK: call void @llvm.nvvm.barrier0() - // CHECK: [[KPR:%.+]] = call i1 @__kmpc_kernel_parallel(i8** [[OMP_WORK_FN]]) + // CHECK: [[KPR:%.+]] = call i1 @__kmpc_kernel_parallel(i8** [[OMP_WORK_FN]], // CHECK: [[KPRB:%.+]] = zext i1 [[KPR]] to i8 // store i8 [[KPRB]], i8* [[OMP_EXEC_STATUS]], align 1 // CHECK: [[WORK:%.+]] = load i8*, i8** [[OMP_WORK_FN]], @@ -217,11 +217,11 @@ // // CHECK: [[EXEC_PARALLEL]] // CHECK: [[WF:%.+]] = load i8*, i8** [[OMP_WORK_FN]], - // CHECK: [[WM:%.+]] = icmp eq i8* [[WF]], bitcast (
[PATCH] D38976: [OpenMP] Add implicit data sharing support when offloading to NVIDIA GPUs using OpenMP device offloading
gtbercea updated this revision to Diff 121538. Repository: rL LLVM https://reviews.llvm.org/D38976 Files: lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp lib/CodeGen/CGOpenMPRuntimeNVPTX.h test/OpenMP/nvptx_data_sharing.cpp test/OpenMP/nvptx_parallel_codegen.cpp test/OpenMP/nvptx_target_teams_codegen.cpp Index: test/OpenMP/nvptx_target_teams_codegen.cpp === --- test/OpenMP/nvptx_target_teams_codegen.cpp +++ test/OpenMP/nvptx_target_teams_codegen.cpp @@ -60,7 +60,7 @@ // // CHECK: [[AWAIT_WORK]] // CHECK: call void @llvm.nvvm.barrier0() - // CHECK: [[KPR:%.+]] = call i1 @__kmpc_kernel_parallel(i8** [[OMP_WORK_FN]]) + // CHECK: [[KPR:%.+]] = call i1 @__kmpc_kernel_parallel(i8** [[OMP_WORK_FN]], i8*** %shared_args) // CHECK: [[KPRB:%.+]] = zext i1 [[KPR]] to i8 // store i8 [[KPRB]], i8* [[OMP_EXEC_STATUS]], align 1 // CHECK: [[WORK:%.+]] = load i8*, i8** [[OMP_WORK_FN]], @@ -146,7 +146,7 @@ // // CHECK: [[AWAIT_WORK]] // CHECK: call void @llvm.nvvm.barrier0() - // CHECK: [[KPR:%.+]] = call i1 @__kmpc_kernel_parallel(i8** [[OMP_WORK_FN]]) + // CHECK: [[KPR:%.+]] = call i1 @__kmpc_kernel_parallel(i8** [[OMP_WORK_FN]], i8*** %shared_args) // CHECK: [[KPRB:%.+]] = zext i1 [[KPR]] to i8 // store i8 [[KPRB]], i8* [[OMP_EXEC_STATUS]], align 1 // CHECK: [[WORK:%.+]] = load i8*, i8** [[OMP_WORK_FN]], Index: test/OpenMP/nvptx_parallel_codegen.cpp === --- test/OpenMP/nvptx_parallel_codegen.cpp +++ test/OpenMP/nvptx_parallel_codegen.cpp @@ -78,7 +78,7 @@ // // CHECK: [[AWAIT_WORK]] // CHECK: call void @llvm.nvvm.barrier0() - // CHECK: [[KPR:%.+]] = call i1 @__kmpc_kernel_parallel(i8** [[OMP_WORK_FN]]) + // CHECK: [[KPR:%.+]] = call i1 @__kmpc_kernel_parallel(i8** [[OMP_WORK_FN]], // CHECK: [[KPRB:%.+]] = zext i1 [[KPR]] to i8 // store i8 [[KPRB]], i8* [[OMP_EXEC_STATUS]], align 1 // CHECK: [[WORK:%.+]] = load i8*, i8** [[OMP_WORK_FN]], @@ -92,20 +92,20 @@ // // CHECK: [[EXEC_PARALLEL]] // CHECK: [[WF1:%.+]] = load i8*, i8** [[OMP_WORK_FN]], - // CHECK: [[WM1:%.+]] = icmp eq i8* [[WF1]], bitcast (void (i32*, i32*)* [[PARALLEL_FN1:@.+]] to i8*) + // CHECK: [[WM1:%.+]] = icmp eq i8* [[WF1]], bitcast (void (i16, i32, i8**)* [[PARALLEL_FN1:@.+]]_wrapper to i8*) // CHECK: br i1 [[WM1]], label {{%?}}[[EXEC_PFN1:.+]], label {{%?}}[[CHECK_NEXT1:.+]] // // CHECK: [[EXEC_PFN1]] - // CHECK: call void [[PARALLEL_FN1]]( + // CHECK: call void [[PARALLEL_FN1]]_wrapper( // CHECK: br label {{%?}}[[TERM_PARALLEL:.+]] // // CHECK: [[CHECK_NEXT1]] // CHECK: [[WF2:%.+]] = load i8*, i8** [[OMP_WORK_FN]], - // CHECK: [[WM2:%.+]] = icmp eq i8* [[WF2]], bitcast (void (i32*, i32*)* [[PARALLEL_FN2:@.+]] to i8*) + // CHECK: [[WM2:%.+]] = icmp eq i8* [[WF2]], bitcast (void (i16, i32, i8**)* [[PARALLEL_FN2:@.+]]_wrapper to i8*) // CHECK: br i1 [[WM2]], label {{%?}}[[EXEC_PFN2:.+]], label {{%?}}[[CHECK_NEXT2:.+]] // // CHECK: [[EXEC_PFN2]] - // CHECK: call void [[PARALLEL_FN2]]( + // CHECK: call void [[PARALLEL_FN2]]_wrapper( // CHECK: br label {{%?}}[[TERM_PARALLEL:.+]] // // CHECK: [[CHECK_NEXT2]] @@ -152,13 +152,13 @@ // CHECK-DAG: [[MWS:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() // CHECK: [[MTMP1:%.+]] = sub i32 [[MNTH]], [[MWS]] // CHECK: call void @__kmpc_kernel_init(i32 [[MTMP1]] - // CHECK: call void @__kmpc_kernel_prepare_parallel(i8* bitcast (void (i32*, i32*)* [[PARALLEL_FN1]] to i8*)) + // CHECK: call void @__kmpc_kernel_prepare_parallel(i8* bitcast (void (i16, i32, i8**)* [[PARALLEL_FN1]]_wrapper to i8*), // CHECK: call void @llvm.nvvm.barrier0() // CHECK: call void @llvm.nvvm.barrier0() // CHECK: call void @__kmpc_serialized_parallel( // CHECK: {{call|invoke}} void [[PARALLEL_FN3:@.+]]( // CHECK: call void @__kmpc_end_serialized_parallel( - // CHECK: call void @__kmpc_kernel_prepare_parallel(i8* bitcast (void (i32*, i32*)* [[PARALLEL_FN2]] to i8*)) + // CHECK: call void @__kmpc_kernel_prepare_parallel(i8* bitcast (void (i16, i32, i8**)* [[PARALLEL_FN2]]_wrapper to i8*), // CHECK: call void @llvm.nvvm.barrier0() // CHECK: call void @llvm.nvvm.barrier0() // CHECK-64-DAG: load i32, i32* [[REF_A]] @@ -203,7 +203,7 @@ // // CHECK: [[AWAIT_WORK]] // CHECK: call void @llvm.nvvm.barrier0() - // CHECK: [[KPR:%.+]] = call i1 @__kmpc_kernel_parallel(i8** [[OMP_WORK_FN]]) + // CHECK: [[KPR:%.+]] = call i1 @__kmpc_kernel_parallel(i8** [[OMP_WORK_FN]], // CHECK: [[KPRB:%.+]] = zext i1 [[KPR]] to i8 // store i8 [[KPRB]], i8* [[OMP_EXEC_STATUS]], align 1 // CHECK: [[WORK:%.+]] = load i8*, i8** [[OMP_WORK_FN]], @@ -217,11 +217,11 @@ // // CHECK: [[EXEC_PARALLEL]] // CHECK: [[WF:%.+]] = load i8*, i8** [[OMP_WORK_FN]], - // CHECK: [[WM:%.+]] = icmp eq i8* [[WF]], bitcast (void (i32*, i32*)* [[PARALLEL_FN4:@.+]] to i8*) + // CHECK: [[WM:
[PATCH] D38976: [OpenMP] Add implicit data sharing support when offloading to NVIDIA GPUs using OpenMP device offloading
Hahnfeld added inline comments. Comment at: lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp:2393 + + // Level0(CGF); + Is this meant to be a comment or should there be something happening here? Comment at: lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp:2396 + CGF.EmitBranch(AfterBB); + if (NextBB) +CGF.EmitBlock(NextBB); Especially, `NextBB` can never be anything else than `nullptr`... Repository: rL LLVM https://reviews.llvm.org/D38976 ___ cfe-commits mailing list cfe-commits@lists.llvm.org http://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[PATCH] D38976: [OpenMP] Add implicit data sharing support when offloading to NVIDIA GPUs using OpenMP device offloading
ABataev accepted this revision. ABataev added a comment. This revision is now accepted and ready to land. LG Repository: rL LLVM https://reviews.llvm.org/D38976 ___ cfe-commits mailing list cfe-commits@lists.llvm.org http://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[PATCH] D38976: [OpenMP] Add implicit data sharing support when offloading to NVIDIA GPUs using OpenMP device offloading
gtbercea created this revision. Herald added a subscriber: jholewinski. This patch is part of the development effort to add support in the current OpenMP GPU offloading implementation for implicitly sharing variables between a target region executed by the team master thread and the worker threads within that team. This patch is the first of three required for successfully performing the implicit sharing of master thread variables with the worker threads within a team. The remaining two patches are: - a patch to the LLVM NVPTX backend which ensures the lowering of shared variables to an device memory which allows the sharing of references; - a runtime patch to libomptarget which ensures that a list of references to shared variables is properly maintained. A simple code snippet which illustrates an implicit data sharing situation is as follows: #pragma omp target { // master thread only int v; #pragma omp parallel { // worker threads // use v } } Variable v is implicitly shared from the team master thread which executes the code in between the target and parallel directives. The worker threads must operate on the latest version of v, including any updates performed by the master. The code generated in this patch relies on the LLVM NVPTX patch (mentioned above) which prevents v from being lowered in the thread local memory of the master thread thus making the reference to this variable un-shareable with the workers. This ensures that the code generated by this patch is correct. Since the parallel region is outlined the passing of arguments to the outlined regions must preserve the original order of arguments. The runtime therefore maintains a list of references to shared variables thus ensuring their passing in the correct order. The passing of arguments to the outlined parallel function is performed in a separate function which the data sharing infrastructure constructs in this patch. The function is inlined when optimizations are enabled. Repository: rL LLVM https://reviews.llvm.org/D38976 Files: lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp lib/CodeGen/CGOpenMPRuntimeNVPTX.h test/OpenMP/nvptx_data_sharing.cpp test/OpenMP/nvptx_parallel_codegen.cpp test/OpenMP/nvptx_target_teams_codegen.cpp Index: test/OpenMP/nvptx_target_teams_codegen.cpp === --- test/OpenMP/nvptx_target_teams_codegen.cpp +++ test/OpenMP/nvptx_target_teams_codegen.cpp @@ -60,7 +60,7 @@ // // CHECK: [[AWAIT_WORK]] // CHECK: call void @llvm.nvvm.barrier0() - // CHECK: [[KPR:%.+]] = call i1 @__kmpc_kernel_parallel(i8** [[OMP_WORK_FN]]) + // CHECK: [[KPR:%.+]] = call i1 @__kmpc_kernel_parallel(i8** [[OMP_WORK_FN]], i8*** %shared_args) // CHECK: [[KPRB:%.+]] = zext i1 [[KPR]] to i8 // store i8 [[KPRB]], i8* [[OMP_EXEC_STATUS]], align 1 // CHECK: [[WORK:%.+]] = load i8*, i8** [[OMP_WORK_FN]], @@ -146,7 +146,7 @@ // // CHECK: [[AWAIT_WORK]] // CHECK: call void @llvm.nvvm.barrier0() - // CHECK: [[KPR:%.+]] = call i1 @__kmpc_kernel_parallel(i8** [[OMP_WORK_FN]]) + // CHECK: [[KPR:%.+]] = call i1 @__kmpc_kernel_parallel(i8** [[OMP_WORK_FN]], i8*** %shared_args) // CHECK: [[KPRB:%.+]] = zext i1 [[KPR]] to i8 // store i8 [[KPRB]], i8* [[OMP_EXEC_STATUS]], align 1 // CHECK: [[WORK:%.+]] = load i8*, i8** [[OMP_WORK_FN]], Index: test/OpenMP/nvptx_parallel_codegen.cpp === --- test/OpenMP/nvptx_parallel_codegen.cpp +++ test/OpenMP/nvptx_parallel_codegen.cpp @@ -78,7 +78,7 @@ // // CHECK: [[AWAIT_WORK]] // CHECK: call void @llvm.nvvm.barrier0() - // CHECK: [[KPR:%.+]] = call i1 @__kmpc_kernel_parallel(i8** [[OMP_WORK_FN]]) + // CHECK: [[KPR:%.+]] = call i1 @__kmpc_kernel_parallel(i8** [[OMP_WORK_FN]], // CHECK: [[KPRB:%.+]] = zext i1 [[KPR]] to i8 // store i8 [[KPRB]], i8* [[OMP_EXEC_STATUS]], align 1 // CHECK: [[WORK:%.+]] = load i8*, i8** [[OMP_WORK_FN]], @@ -92,20 +92,20 @@ // // CHECK: [[EXEC_PARALLEL]] // CHECK: [[WF1:%.+]] = load i8*, i8** [[OMP_WORK_FN]], - // CHECK: [[WM1:%.+]] = icmp eq i8* [[WF1]], bitcast (void (i32*, i32*)* [[PARALLEL_FN1:@.+]] to i8*) + // CHECK: [[WM1:%.+]] = icmp eq i8* [[WF1]], bitcast (void (i16, i32, i8**)* [[PARALLEL_FN1:@.+]]_wrapper to i8*) // CHECK: br i1 [[WM1]], label {{%?}}[[EXEC_PFN1:.+]], label {{%?}}[[CHECK_NEXT1:.+]] // // CHECK: [[EXEC_PFN1]] - // CHECK: call void [[PARALLEL_FN1]]( + // CHECK: call void [[PARALLEL_FN1]]_wrapper( // CHECK: br label {{%?}}[[TERM_PARALLEL:.+]] // // CHECK: [[CHECK_NEXT1]] // CHECK: [[WF2:%.+]] = load i8*, i8** [[OMP_WORK_FN]], - // CHECK: [[WM2:%.+]] = icmp eq i8* [[WF2]], bitcast (void (i32*, i32*)* [[PARALLEL_FN2:@.+]] to i8*) + // CHECK: [[WM2:%.+]] = icmp eq i8* [[WF2]], bitcast (void (i16, i32, i8**)* [[PARALLEL_FN2:@.+]]_wrapper to i8*) // CHECK: br i1 [[WM2]], label {{%