[clang] [OpenMP] Associate the KernelEnvironment with the GenericKernelTy (PR #70383)

2023-10-29 Thread Johannes Doerfert via cfe-commits

https://github.com/jdoerfert closed 
https://github.com/llvm/llvm-project/pull/70383
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [OpenMP] Associate the KernelEnvironment with the GenericKernelTy (PR #70383)

2023-10-29 Thread Johannes Doerfert via cfe-commits

https://github.com/jdoerfert updated 
https://github.com/llvm/llvm-project/pull/70383

>From fa6d6d9cf6398915f911e06eecc78c7ba83d3623 Mon Sep 17 00:00:00 2001
From: Johannes Doerfert 
Date: Wed, 25 Oct 2023 16:46:01 -0700
Subject: [PATCH] [OpenMP] Associate the KernelEnvironment with the
 GenericKernelTy

By associating the kernel environment with the generic kernel we can
access middle-end information easily, including the launch bounds ranges
that are acceptable. By constraining the number of threads accordingly,
we now obey the user-provided bounds that were passed via attributes.
---
 clang/test/OpenMP/bug57757.cpp| 15 ++--
 llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp |  4 +-
 .../plugins-nextgen/amdgpu/src/rtl.cpp|  8 +-
 .../PluginInterface/PluginInterface.cpp   | 74 +++
 .../common/PluginInterface/PluginInterface.h  | 39 +-
 .../plugins-nextgen/cuda/src/rtl.cpp  |  8 +-
 .../generic-elf-64bit/src/rtl.cpp | 20 ++---
 .../test/offloading/default_thread_limit.c|  3 +-
 .../test/offloading/thread_state_1.c  |  4 +-
 .../test/offloading/thread_state_2.c  |  4 +-
 10 files changed, 74 insertions(+), 105 deletions(-)

diff --git a/clang/test/OpenMP/bug57757.cpp b/clang/test/OpenMP/bug57757.cpp
index 7894796ac46284c..7acfe134ddd0baf 100644
--- a/clang/test/OpenMP/bug57757.cpp
+++ b/clang/test/OpenMP/bug57757.cpp
@@ -32,24 +32,23 @@ void foo() {
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:[[TMP2:%.*]] = getelementptr inbounds 
[[STRUCT_KMP_TASK_T:%.*]], ptr [[TMP1]], i64 0, i32 2
 // CHECK-NEXT:tail call void 
@llvm.experimental.noalias.scope.decl(metadata [[META13:![0-9]+]])
-// CHECK-NEXT:tail call void 
@llvm.experimental.noalias.scope.decl(metadata [[META16:![0-9]+]])
-// CHECK-NEXT:[[TMP3:%.*]] = load i32, ptr [[TMP2]], align 4, !tbaa 
[[TBAA18:![0-9]+]], !alias.scope !13, !noalias !16
+// CHECK-NEXT:[[TMP3:%.*]] = load i32, ptr [[TMP2]], align 4, !tbaa 
[[TBAA16:![0-9]+]], !alias.scope !13, !noalias !17
 // CHECK-NEXT:switch i32 [[TMP3]], label [[DOTOMP_OUTLINED__EXIT:%.*]] [
 // CHECK-NEXT:i32 0, label [[DOTUNTIED_JMP__I:%.*]]
 // CHECK-NEXT:i32 1, label [[DOTUNTIED_NEXT__I:%.*]]
 // CHECK-NEXT:]
 // CHECK:   .untied.jmp..i:
-// CHECK-NEXT:store i32 1, ptr [[TMP2]], align 4, !tbaa [[TBAA18]], 
!alias.scope !13, !noalias !16
-// CHECK-NEXT:[[TMP4:%.*]] = tail call i32 @__kmpc_omp_task(ptr nonnull 
@[[GLOB1]], i32 [[TMP0]], ptr [[TMP1]]), !noalias !19
+// CHECK-NEXT:store i32 1, ptr [[TMP2]], align 4, !tbaa [[TBAA16]], 
!alias.scope !13, !noalias !17
+// CHECK-NEXT:[[TMP4:%.*]] = tail call i32 @__kmpc_omp_task(ptr nonnull 
@[[GLOB1]], i32 [[TMP0]], ptr [[TMP1]]), !noalias !13
 // CHECK-NEXT:br label [[DOTOMP_OUTLINED__EXIT]]
 // CHECK:   .untied.next..i:
 // CHECK-NEXT:[[TMP5:%.*]] = getelementptr inbounds 
[[STRUCT_KMP_TASK_T_WITH_PRIVATES:%.*]], ptr [[TMP1]], i64 0, i32 1
 // CHECK-NEXT:[[TMP6:%.*]] = getelementptr inbounds 
[[STRUCT_KMP_TASK_T_WITH_PRIVATES]], ptr [[TMP1]], i64 0, i32 1, i32 2
 // CHECK-NEXT:[[TMP7:%.*]] = getelementptr inbounds 
[[STRUCT_KMP_TASK_T_WITH_PRIVATES]], ptr [[TMP1]], i64 0, i32 1, i32 1
-// CHECK-NEXT:[[TMP8:%.*]] = load ptr, ptr [[TMP5]], align 8, !tbaa 
[[TBAA20:![0-9]+]], !alias.scope !16, !noalias !13
-// CHECK-NEXT:[[TMP9:%.*]] = load i32, ptr [[TMP7]], align 4, !tbaa 
[[TBAA18]], !alias.scope !16, !noalias !13
-// CHECK-NEXT:[[TMP10:%.*]] = load float, ptr [[TMP6]], align 4, !tbaa 
[[TBAA21:![0-9]+]], !alias.scope !16, !noalias !13
-// CHECK-NEXT:tail call void [[TMP8]](i32 noundef [[TMP9]], float noundef 
[[TMP10]]) #[[ATTR2:[0-9]+]], !noalias !19
+// CHECK-NEXT:[[TMP8:%.*]] = load ptr, ptr [[TMP5]], align 8, !tbaa 
[[TBAA19:![0-9]+]], !noalias !13
+// CHECK-NEXT:[[TMP9:%.*]] = load i32, ptr [[TMP7]], align 4, !tbaa 
[[TBAA16]], !noalias !13
+// CHECK-NEXT:[[TMP10:%.*]] = load float, ptr [[TMP6]], align 4, !tbaa 
[[TBAA20:![0-9]+]], !noalias !13
+// CHECK-NEXT:tail call void [[TMP8]](i32 noundef [[TMP9]], float noundef 
[[TMP10]]) #[[ATTR2:[0-9]+]], !noalias !13
 // CHECK-NEXT:br label [[DOTOMP_OUTLINED__EXIT]]
 // CHECK:   .omp_outlined..exit:
 // CHECK-NEXT:ret i32 0
diff --git a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp 
b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
index 3e4e030f44c7fe0..b320d77652e1cba 100644
--- a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
+++ b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
@@ -4093,8 +4093,8 @@ OpenMPIRBuilder::createTargetInit(const 
LocationDescription &Loc, bool IsSPMD,
 
   Function *Kernel = Builder.GetInsertBlock()->getParent();
 
-  /// Manifest the launch configuration in the metadata matching the kernel
-  /// environment.
+  // Manifest the launch configuration in the metadata matching the kernel
+  // environment.
   if (MinTeamsVal > 1 || MaxTeamsVal > 0)
 writeTeamsForKernel(T, *Kernel, MinTeamsVal, MaxTeams

[clang] [OpenMP] Associate the KernelEnvironment with the GenericKernelTy (PR #70383)

2023-10-29 Thread Joseph Huber via cfe-commits

https://github.com/jhuber6 approved this pull request.

Lots of churn but looks straightforward enough. Few nits.

https://github.com/llvm/llvm-project/pull/70383
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [OpenMP] Associate the KernelEnvironment with the GenericKernelTy (PR #70383)

2023-10-29 Thread Joseph Huber via cfe-commits

https://github.com/jhuber6 edited 
https://github.com/llvm/llvm-project/pull/70383
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [OpenMP] Associate the KernelEnvironment with the GenericKernelTy (PR #70383)

2023-10-29 Thread Joseph Huber via cfe-commits


@@ -339,9 +339,33 @@ Error GenericKernelTy::init(GenericDeviceTy &GenericDevice,
 
   ImagePtr = &Image;
 
-  PreferredNumThreads = GenericDevice.getDefaultNumThreads();
+  // Retrieve kernel environment object for the kernel.
+  GlobalTy KernelEnv(std::string(Name) + "_kernel_environment",
+ sizeof(KernelEnvironment), &KernelEnvironment);
+  GenericGlobalHandlerTy &GHandler = Plugin::get().getGlobalHandler();
+  if (auto Err =
+  GHandler.readGlobalFromImage(GenericDevice, *ImagePtr, KernelEnv)) {
+[[maybe_unused]] std::string ErrStr = toString(std::move(Err));
+DP("Failed to read kernel environment for '%s': %s\n"
+   "Using default SPMD (2) execution mode\n",
+   Name, ErrStr.data());
+KernelEnvironment.Configuration.ExecMode = OMP_TGT_EXEC_MODE_SPMD;
+KernelEnvironment.Configuration.MayUseNestedParallelism = /* Unknown */ 2;

jhuber6 wrote:

```suggestion
KernelEnvironment.Configuration.MayUseNestedParallelism = /*Unknown=*/2;
```

https://github.com/llvm/llvm-project/pull/70383
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [OpenMP] Associate the KernelEnvironment with the GenericKernelTy (PR #70383)

2023-10-29 Thread Joseph Huber via cfe-commits


@@ -4078,8 +4092,20 @@ OpenMPIRBuilder::createTargetInit(const 
LocationDescription &Loc, bool IsSPMD) {
   Constant *DebugIndentionLevelVal = ConstantInt::getSigned(Int16, 0);
 
   Function *Kernel = Builder.GetInsertBlock()->getParent();
-  auto [MinThreadsVal, MaxThreadsVal] = readThreadBoundsForKernel(*Kernel);
-  auto [MinTeamsVal, MaxTeamsVal] = readTeamBoundsForKernel(*Kernel);
+
+  /// Manifest the launch configuration in the metadata matching the kernel
+  /// environment.

jhuber6 wrote:

Doxygen comments?

https://github.com/llvm/llvm-project/pull/70383
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [OpenMP] Associate the KernelEnvironment with the GenericKernelTy (PR #70383)

2023-10-26 Thread via cfe-commits

llvmbot wrote:




@llvm/pr-subscribers-clang-codegen

Author: Johannes Doerfert (jdoerfert)


Changes

By associating the kernel environment with the generic kernel we can
  access middle-end information easily, including the launch bounds ranges
  that are acceptable. By constraining the number of threads accordingly,
  we now obey the user-provided bounds that were passed via attributes.

---

Patch is 1.54 MiB, truncated to 20.00 KiB below, full version: 
https://github.com/llvm/llvm-project/pull/70383.diff


25 Files Affected:

- (modified) clang/lib/CodeGen/CGOpenMPRuntime.cpp (+45-48) 
- (modified) clang/lib/CodeGen/CGOpenMPRuntime.h (+9-1) 
- (modified) clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp (+20-10) 
- (modified) clang/lib/CodeGen/CGOpenMPRuntimeGPU.h (+2-2) 
- (modified) clang/test/OpenMP/distribute_simd_codegen.cpp (+508-508) 
- (modified) clang/test/OpenMP/target_num_teams_num_threads_attributes.cpp 
(+11-98) 
- (modified) clang/test/OpenMP/target_parallel_codegen.cpp (+110-110) 
- (modified) clang/test/OpenMP/target_parallel_debug_codegen.cpp (+420-420) 
- (modified) clang/test/OpenMP/target_parallel_for_codegen.cpp (+314-314) 
- (modified) clang/test/OpenMP/target_parallel_for_debug_codegen.cpp (+589-589) 
- (modified) clang/test/OpenMP/target_parallel_for_simd_codegen.cpp (+932-932) 
- (modified) clang/test/OpenMP/target_parallel_generic_loop_codegen-3.cpp 
(+589-589) 
- (modified) clang/test/OpenMP/target_teams_distribute_simd_codegen.cpp 
(+998-998) 
- (modified) clang/test/OpenMP/teams_distribute_simd_codegen.cpp (+206-206) 
- (modified) llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h (+27-26) 
- (modified) llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp (+55-60) 
- (modified) llvm/lib/Transforms/IPO/OpenMPOpt.cpp (+4-2) 
- (modified) openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp (+3-5) 
- (modified) 
openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.cpp 
(+27-47) 
- (modified) 
openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.h 
(+19-20) 
- (modified) openmp/libomptarget/plugins-nextgen/cuda/src/rtl.cpp (+3-5) 
- (modified) openmp/libomptarget/plugins-nextgen/generic-elf-64bit/src/rtl.cpp 
(+8-12) 
- (modified) openmp/libomptarget/test/offloading/default_thread_limit.c (+1-2) 
- (modified) openmp/libomptarget/test/offloading/thread_state_1.c (+2-2) 
- (modified) openmp/libomptarget/test/offloading/thread_state_2.c (+2-2) 


```diff
diff --git a/clang/lib/CodeGen/CGOpenMPRuntime.cpp 
b/clang/lib/CodeGen/CGOpenMPRuntime.cpp
index 6262b3416a1730a..c1be7c2d0321589 100644
--- a/clang/lib/CodeGen/CGOpenMPRuntime.cpp
+++ b/clang/lib/CodeGen/CGOpenMPRuntime.cpp
@@ -6002,6 +6002,42 @@ void 
CGOpenMPRuntime::emitUsesAllocatorsFini(CodeGenFunction &CGF,
   {ThreadId, AllocatorVal});
 }
 
+void CGOpenMPRuntime::computeMinAndMaxThreadsAndTeams(
+const OMPExecutableDirective &D, CodeGenFunction &CGF,
+int32_t &MinThreadsVal, int32_t &MaxThreadsVal, int32_t &MinTeamsVal,
+int32_t &MaxTeamsVal) {
+
+  getNumTeamsExprForTargetDirective(CGF, D, MinTeamsVal, MaxTeamsVal);
+  getNumThreadsExprForTargetDirective(CGF, D, MaxThreadsVal,
+  /*UpperBoundOnly=*/true);
+
+  for (auto *C : D.getClausesOfKind<OMPXAttributeClause>()) {
+for (auto *A : C->getAttrs()) {
+  int32_t AttrMinThreadsVal = 1, AttrMaxThreadsVal = -1;
+  int32_t AttrMinBlocksVal = 1, AttrMaxBlocksVal = -1;
+  if (auto *Attr = dyn_cast<CUDALaunchBoundsAttr>(A))
+CGM.handleCUDALaunchBoundsAttr(nullptr, Attr, &AttrMaxThreadsVal,
+   &AttrMinBlocksVal, &AttrMaxBlocksVal);
+  else if (auto *Attr = dyn_cast<AMDGPUFlatWorkGroupSizeAttr>(A))
+CGM.handleAMDGPUFlatWorkGroupSizeAttr(
+nullptr, Attr, /*ReqdWGS=*/nullptr, &AttrMinThreadsVal,
+&AttrMaxThreadsVal);
+  else
+continue;
+
+  MinThreadsVal = std::max(MinThreadsVal, AttrMinThreadsVal);
+  if (AttrMaxThreadsVal > 0)
+MaxThreadsVal = MaxThreadsVal > 0
+? std::min(MaxThreadsVal, AttrMaxThreadsVal)
+: AttrMaxThreadsVal;
+  MinTeamsVal = std::max(MinTeamsVal, AttrMinBlocksVal);
+  if (AttrMaxBlocksVal > 0)
+MaxTeamsVal = MaxTeamsVal > 0 ? std::min(MaxTeamsVal, AttrMaxBlocksVal)
+  : AttrMaxBlocksVal;
+}
+  }
+}
+
 void CGOpenMPRuntime::emitTargetOutlinedFunctionHelper(
 const OMPExecutableDirective &D, StringRef ParentName,
 llvm::Function *&OutlinedFn, llvm::Constant *&OutlinedFnID,
@@ -6020,47 +6056,8 @@ void CGOpenMPRuntime::emitTargetOutlinedFunctionHelper(
 return CGF.GenerateOpenMPCapturedStmtFunction(CS, D.getBeginLoc());
   };
 
-  // Get NumTeams and ThreadLimit attributes
-  int32_t DefaultValMinTeams = 1;
-  int32_t DefaultValMaxTeams = -1;
-  uint32_t DefaultValMinThreads = 1;
-  uint32_t DefaultValMaxThreads = UINT32_MAX;
-
-  getNumTeamsExprForTargetDirective(CGF, D, DefaultValMin