[clang] [OpenMP] Associate the KernelEnvironment with the GenericKernelTy (PR #70383)
https://github.com/jdoerfert closed https://github.com/llvm/llvm-project/pull/70383 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [OpenMP] Associate the KernelEnvironment with the GenericKernelTy (PR #70383)
https://github.com/jdoerfert updated https://github.com/llvm/llvm-project/pull/70383 >From fa6d6d9cf6398915f911e06eecc78c7ba83d3623 Mon Sep 17 00:00:00 2001 From: Johannes Doerfert Date: Wed, 25 Oct 2023 16:46:01 -0700 Subject: [PATCH] [OpenMP] Associate the KernelEnvironment with the GenericKernelTy By associating the kernel environment with the generic kernel we can access middle-end information easily, including the launch bounds ranges that are acceptable. By constraining the number of threads accordingly, we now obey the user-provided bounds that were passed via attributes. --- clang/test/OpenMP/bug57757.cpp| 15 ++-- llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp | 4 +- .../plugins-nextgen/amdgpu/src/rtl.cpp| 8 +- .../PluginInterface/PluginInterface.cpp | 74 +++ .../common/PluginInterface/PluginInterface.h | 39 +- .../plugins-nextgen/cuda/src/rtl.cpp | 8 +- .../generic-elf-64bit/src/rtl.cpp | 20 ++--- .../test/offloading/default_thread_limit.c| 3 +- .../test/offloading/thread_state_1.c | 4 +- .../test/offloading/thread_state_2.c | 4 +- 10 files changed, 74 insertions(+), 105 deletions(-) diff --git a/clang/test/OpenMP/bug57757.cpp b/clang/test/OpenMP/bug57757.cpp index 7894796ac46284c..7acfe134ddd0baf 100644 --- a/clang/test/OpenMP/bug57757.cpp +++ b/clang/test/OpenMP/bug57757.cpp @@ -32,24 +32,23 @@ void foo() { // CHECK-NEXT: entry: // CHECK-NEXT:[[TMP2:%.*]] = getelementptr inbounds [[STRUCT_KMP_TASK_T:%.*]], ptr [[TMP1]], i64 0, i32 2 // CHECK-NEXT:tail call void @llvm.experimental.noalias.scope.decl(metadata [[META13:![0-9]+]]) -// CHECK-NEXT:tail call void @llvm.experimental.noalias.scope.decl(metadata [[META16:![0-9]+]]) -// CHECK-NEXT:[[TMP3:%.*]] = load i32, ptr [[TMP2]], align 4, !tbaa [[TBAA18:![0-9]+]], !alias.scope !13, !noalias !16 +// CHECK-NEXT:[[TMP3:%.*]] = load i32, ptr [[TMP2]], align 4, !tbaa [[TBAA16:![0-9]+]], !alias.scope !13, !noalias !17 // CHECK-NEXT:switch i32 [[TMP3]], label [[DOTOMP_OUTLINED__EXIT:%.*]] [ // CHECK-NEXT:i32 0, label 
[[DOTUNTIED_JMP__I:%.*]] // CHECK-NEXT:i32 1, label [[DOTUNTIED_NEXT__I:%.*]] // CHECK-NEXT:] // CHECK: .untied.jmp..i: -// CHECK-NEXT:store i32 1, ptr [[TMP2]], align 4, !tbaa [[TBAA18]], !alias.scope !13, !noalias !16 -// CHECK-NEXT:[[TMP4:%.*]] = tail call i32 @__kmpc_omp_task(ptr nonnull @[[GLOB1]], i32 [[TMP0]], ptr [[TMP1]]), !noalias !19 +// CHECK-NEXT:store i32 1, ptr [[TMP2]], align 4, !tbaa [[TBAA16]], !alias.scope !13, !noalias !17 +// CHECK-NEXT:[[TMP4:%.*]] = tail call i32 @__kmpc_omp_task(ptr nonnull @[[GLOB1]], i32 [[TMP0]], ptr [[TMP1]]), !noalias !13 // CHECK-NEXT:br label [[DOTOMP_OUTLINED__EXIT]] // CHECK: .untied.next..i: // CHECK-NEXT:[[TMP5:%.*]] = getelementptr inbounds [[STRUCT_KMP_TASK_T_WITH_PRIVATES:%.*]], ptr [[TMP1]], i64 0, i32 1 // CHECK-NEXT:[[TMP6:%.*]] = getelementptr inbounds [[STRUCT_KMP_TASK_T_WITH_PRIVATES]], ptr [[TMP1]], i64 0, i32 1, i32 2 // CHECK-NEXT:[[TMP7:%.*]] = getelementptr inbounds [[STRUCT_KMP_TASK_T_WITH_PRIVATES]], ptr [[TMP1]], i64 0, i32 1, i32 1 -// CHECK-NEXT:[[TMP8:%.*]] = load ptr, ptr [[TMP5]], align 8, !tbaa [[TBAA20:![0-9]+]], !alias.scope !16, !noalias !13 -// CHECK-NEXT:[[TMP9:%.*]] = load i32, ptr [[TMP7]], align 4, !tbaa [[TBAA18]], !alias.scope !16, !noalias !13 -// CHECK-NEXT:[[TMP10:%.*]] = load float, ptr [[TMP6]], align 4, !tbaa [[TBAA21:![0-9]+]], !alias.scope !16, !noalias !13 -// CHECK-NEXT:tail call void [[TMP8]](i32 noundef [[TMP9]], float noundef [[TMP10]]) #[[ATTR2:[0-9]+]], !noalias !19 +// CHECK-NEXT:[[TMP8:%.*]] = load ptr, ptr [[TMP5]], align 8, !tbaa [[TBAA19:![0-9]+]], !noalias !13 +// CHECK-NEXT:[[TMP9:%.*]] = load i32, ptr [[TMP7]], align 4, !tbaa [[TBAA16]], !noalias !13 +// CHECK-NEXT:[[TMP10:%.*]] = load float, ptr [[TMP6]], align 4, !tbaa [[TBAA20:![0-9]+]], !noalias !13 +// CHECK-NEXT:tail call void [[TMP8]](i32 noundef [[TMP9]], float noundef [[TMP10]]) #[[ATTR2:[0-9]+]], !noalias !13 // CHECK-NEXT:br label [[DOTOMP_OUTLINED__EXIT]] // CHECK: .omp_outlined..exit: // 
CHECK-NEXT:ret i32 0 diff --git a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp index 3e4e030f44c7fe0..b320d77652e1cba 100644 --- a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp +++ b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp @@ -4093,8 +4093,8 @@ OpenMPIRBuilder::createTargetInit(const LocationDescription &Loc, bool IsSPMD, Function *Kernel = Builder.GetInsertBlock()->getParent(); - /// Manifest the launch configuration in the metadata matching the kernel - /// environment. + // Manifest the launch configuration in the metadata matching the kernel + // environment. if (MinTeamsVal > 1 || MaxTeamsVal > 0) writeTeamsForKernel(T, *Kernel, MinTeamsVal, MaxTeams
[clang] [OpenMP] Associate the KernelEnvironment with the GenericKernelTy (PR #70383)
https://github.com/jhuber6 approved this pull request. Lots of churn but looks straightforward enough. Few nits. https://github.com/llvm/llvm-project/pull/70383 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [OpenMP] Associate the KernelEnvironment with the GenericKernelTy (PR #70383)
https://github.com/jhuber6 edited https://github.com/llvm/llvm-project/pull/70383 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [OpenMP] Associate the KernelEnvironment with the GenericKernelTy (PR #70383)
@@ -339,9 +339,33 @@ Error GenericKernelTy::init(GenericDeviceTy &GenericDevice, ImagePtr = &Image; - PreferredNumThreads = GenericDevice.getDefaultNumThreads(); + // Retrieve kernel environment object for the kernel. + GlobalTy KernelEnv(std::string(Name) + "_kernel_environment", + sizeof(KernelEnvironment), &KernelEnvironment); + GenericGlobalHandlerTy &GHandler = Plugin::get().getGlobalHandler(); + if (auto Err = + GHandler.readGlobalFromImage(GenericDevice, *ImagePtr, KernelEnv)) { +[[maybe_unused]] std::string ErrStr = toString(std::move(Err)); +DP("Failed to read kernel environment for '%s': %s\n" + "Using default SPMD (2) execution mode\n", + Name, ErrStr.data()); +KernelEnvironment.Configuration.ExecMode = OMP_TGT_EXEC_MODE_SPMD; +KernelEnvironment.Configuration.MayUseNestedParallelism = /* Unknown */ 2; jhuber6 wrote: ```suggestion KernelEnvironment.Configuration.MayUseNestedParallelism = /*Unknown=*/2; ``` https://github.com/llvm/llvm-project/pull/70383 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [OpenMP] Associate the KernelEnvironment with the GenericKernelTy (PR #70383)
@@ -4078,8 +4092,20 @@ OpenMPIRBuilder::createTargetInit(const LocationDescription &Loc, bool IsSPMD) { Constant *DebugIndentionLevelVal = ConstantInt::getSigned(Int16, 0); Function *Kernel = Builder.GetInsertBlock()->getParent(); - auto [MinThreadsVal, MaxThreadsVal] = readThreadBoundsForKernel(*Kernel); - auto [MinTeamsVal, MaxTeamsVal] = readTeamBoundsForKernel(*Kernel); + + /// Manifest the launch configuration in the metadata matching the kernel + /// environment. jhuber6 wrote: Doxygen comments? https://github.com/llvm/llvm-project/pull/70383 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [OpenMP] Associate the KernelEnvironment with the GenericKernelTy (PR #70383)
llvmbot wrote: @llvm/pr-subscribers-clang-codegen Author: Johannes Doerfert (jdoerfert) Changes By associating the kernel environment with the generic kernel we can access middle-end information easily, including the launch bounds ranges that are acceptable. By constraining the number of threads accordingly, we now obey the user-provided bounds that were passed via attributes. --- Patch is 1.54 MiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/70383.diff 25 Files Affected: - (modified) clang/lib/CodeGen/CGOpenMPRuntime.cpp (+45-48) - (modified) clang/lib/CodeGen/CGOpenMPRuntime.h (+9-1) - (modified) clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp (+20-10) - (modified) clang/lib/CodeGen/CGOpenMPRuntimeGPU.h (+2-2) - (modified) clang/test/OpenMP/distribute_simd_codegen.cpp (+508-508) - (modified) clang/test/OpenMP/target_num_teams_num_threads_attributes.cpp (+11-98) - (modified) clang/test/OpenMP/target_parallel_codegen.cpp (+110-110) - (modified) clang/test/OpenMP/target_parallel_debug_codegen.cpp (+420-420) - (modified) clang/test/OpenMP/target_parallel_for_codegen.cpp (+314-314) - (modified) clang/test/OpenMP/target_parallel_for_debug_codegen.cpp (+589-589) - (modified) clang/test/OpenMP/target_parallel_for_simd_codegen.cpp (+932-932) - (modified) clang/test/OpenMP/target_parallel_generic_loop_codegen-3.cpp (+589-589) - (modified) clang/test/OpenMP/target_teams_distribute_simd_codegen.cpp (+998-998) - (modified) clang/test/OpenMP/teams_distribute_simd_codegen.cpp (+206-206) - (modified) llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h (+27-26) - (modified) llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp (+55-60) - (modified) llvm/lib/Transforms/IPO/OpenMPOpt.cpp (+4-2) - (modified) openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp (+3-5) - (modified) openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.cpp (+27-47) - (modified) openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.h 
(+19-20) - (modified) openmp/libomptarget/plugins-nextgen/cuda/src/rtl.cpp (+3-5) - (modified) openmp/libomptarget/plugins-nextgen/generic-elf-64bit/src/rtl.cpp (+8-12) - (modified) openmp/libomptarget/test/offloading/default_thread_limit.c (+1-2) - (modified) openmp/libomptarget/test/offloading/thread_state_1.c (+2-2) - (modified) openmp/libomptarget/test/offloading/thread_state_2.c (+2-2) ```diff diff --git a/clang/lib/CodeGen/CGOpenMPRuntime.cpp b/clang/lib/CodeGen/CGOpenMPRuntime.cpp index 6262b3416a1730a..c1be7c2d0321589 100644 --- a/clang/lib/CodeGen/CGOpenMPRuntime.cpp +++ b/clang/lib/CodeGen/CGOpenMPRuntime.cpp @@ -6002,6 +6002,42 @@ void CGOpenMPRuntime::emitUsesAllocatorsFini(CodeGenFunction &CGF, {ThreadId, AllocatorVal}); } +void CGOpenMPRuntime::computeMinAndMaxThreadsAndTeams( +const OMPExecutableDirective &D, CodeGenFunction &CGF, +int32_t &MinThreadsVal, int32_t &MaxThreadsVal, int32_t &MinTeamsVal, +int32_t &MaxTeamsVal) { + + getNumTeamsExprForTargetDirective(CGF, D, MinTeamsVal, MaxTeamsVal); + getNumThreadsExprForTargetDirective(CGF, D, MaxThreadsVal, + /*UpperBoundOnly=*/true); + + for (auto *C : D.getClausesOfKind()) { +for (auto *A : C->getAttrs()) { + int32_t AttrMinThreadsVal = 1, AttrMaxThreadsVal = -1; + int32_t AttrMinBlocksVal = 1, AttrMaxBlocksVal = -1; + if (auto *Attr = dyn_cast(A)) +CGM.handleCUDALaunchBoundsAttr(nullptr, Attr, &AttrMaxThreadsVal, + &AttrMinBlocksVal, &AttrMaxBlocksVal); + else if (auto *Attr = dyn_cast(A)) +CGM.handleAMDGPUFlatWorkGroupSizeAttr( +nullptr, Attr, /*ReqdWGS=*/nullptr, &AttrMinThreadsVal, +&AttrMaxThreadsVal); + else +continue; + + MinThreadsVal = std::max(MinThreadsVal, AttrMinThreadsVal); + if (AttrMaxThreadsVal > 0) +MaxThreadsVal = MaxThreadsVal > 0 +? std::min(MaxThreadsVal, AttrMaxThreadsVal) +: AttrMaxThreadsVal; + MinTeamsVal = std::max(MinTeamsVal, AttrMinBlocksVal); + if (AttrMaxBlocksVal > 0) +MaxTeamsVal = MaxTeamsVal > 0 ? 
std::min(MaxTeamsVal, AttrMaxBlocksVal) + : AttrMaxBlocksVal; +} + } +} + void CGOpenMPRuntime::emitTargetOutlinedFunctionHelper( const OMPExecutableDirective &D, StringRef ParentName, llvm::Function *&OutlinedFn, llvm::Constant *&OutlinedFnID, @@ -6020,47 +6056,8 @@ void CGOpenMPRuntime::emitTargetOutlinedFunctionHelper( return CGF.GenerateOpenMPCapturedStmtFunction(CS, D.getBeginLoc()); }; - // Get NumTeams and ThreadLimit attributes - int32_t DefaultValMinTeams = 1; - int32_t DefaultValMaxTeams = -1; - uint32_t DefaultValMinThreads = 1; - uint32_t DefaultValMaxThreads = UINT32_MAX; - - getNumTeamsExprForTargetDirective(CGF, D, DefaultValMin