[PATCH] D108708: [openmp][amdgpu] Initial gfx10 offloading implementation

Jon Chesterfield via Phabricator via cfe-commits Wed, 25 Aug 2021 09:11:00 -0700

JonChesterfield created this revision.
JonChesterfield added reviewers: dpalermo, jdoerfert, tianshilei1992, ronlieb, 
ye-luo, grokos.
Herald added subscribers: kerbowa, guansong, t-tye, tpr, dstuttard, yaxunl, 
mgorny, nhaehnle, jvesely, kzhuravl.
JonChesterfield requested review of this revision.
Herald added subscribers: llvm-commits, openmp-commits, cfe-commits, sstefan1, 
wdng.
Herald added projects: clang, OpenMP, LLVM.


Lets wavefront size be 32 for amdgpu openmp, as well as 64.

Fixes up as little as possible to pass that through the libraries. This change
is end to end, as opposed to updating clang/devicertl/plugin separately. It can
be broken up for review/commit if preferred. Posting as-is so that others with
a gfx10 can try it out. It works roughly as well as gfx9 for me, but there are
probably bugs remaining as well as the todo: for letting grid values vary more.


Repository:
  rG LLVM Github Monorepo

https://reviews.llvm.org/D108708

Files:
  clang/lib/Basic/Targets/AMDGPU.h
  llvm/include/llvm/Frontend/OpenMP/OMPGridValues.h
  openmp/libomptarget/deviceRTLs/amdgcn/CMakeLists.txt
  openmp/libomptarget/deviceRTLs/amdgcn/src/target_impl.h
  openmp/libomptarget/plugins/amdgpu/src/rtl.cpp

Index: openmp/libomptarget/plugins/amdgpu/src/rtl.cpp
===================================================================
--- openmp/libomptarget/plugins/amdgpu/src/rtl.cpp
+++ openmp/libomptarget/plugins/amdgpu/src/rtl.cpp
@@ -436,8 +436,9 @@
   int MaxTeamsDefault;
 };
 
+template <uint32_t wavesize>
 static constexpr const llvm::omp::GV &getGridValue() {
-  return llvm::omp::AMDGPUGridValues;
+  return llvm::omp::getAMDGPUGridValues<wavesize>();
 }
 
 /// Class containing all the device information
@@ -508,10 +509,24 @@
   static const unsigned HardTeamLimit =
       (1 << 16) - 1; // 64K needed to fit in uint16
   static const int DefaultNumTeams = 128;
-  static const int Max_Teams = getGridValue().GV_Max_Teams;
-  static const int Warp_Size = getGridValue().GV_Warp_Size;
-  static const int Max_WG_Size = getGridValue().GV_Max_WG_Size;
-  static const int Default_WG_Size = getGridValue().GV_Default_WG_Size;
+
+  // These need to be per-device since different devices can have different
+  // wave sizes, but are currently the same number for each so that refactor
+  // can be postponed.
+  static_assert(getGridValue<32>().GV_Max_Teams ==
+                    getGridValue<64>().GV_Max_Teams,
+                "");
+  static const int Max_Teams = getGridValue<64>().GV_Max_Teams;
+
+  static_assert(getGridValue<32>().GV_Max_WG_Size ==
+                    getGridValue<64>().GV_Max_WG_Size,
+                "");
+  static const int Max_WG_Size = getGridValue<64>().GV_Max_WG_Size;
+
+  static_assert(getGridValue<32>().GV_Default_WG_Size ==
+                    getGridValue<64>().GV_Default_WG_Size,
+                "");
+  static const int Default_WG_Size = getGridValue<64>().GV_Default_WG_Size;
 
   using MemcpyFunc = hsa_status_t (*)(hsa_signal_t, void *, const void *,
                                       size_t size, hsa_agent_t,
@@ -1060,8 +1075,9 @@
     DP("Queried wavefront size: %d\n", wavefront_size);
     DeviceInfo.WarpSize[device_id] = wavefront_size;
   } else {
-    DP("Default wavefront size: %d\n", getGridValue().GV_Warp_Size);
-    DeviceInfo.WarpSize[device_id] = getGridValue().GV_Warp_Size;
+    // TODO: Burn the wavefront size into the code object
+    DP("Warning: Unknown wavefront size, assuming 64\n");
+    DeviceInfo.WarpSize[device_id] = 64;
   }
 
   // Adjust teams to the env variables
@@ -1886,9 +1902,10 @@
   int WorkgroupSize;
   int GridSize;
 };
-launchVals getLaunchVals(EnvironmentVariables Env, int ConstWGSize,
-                         int ExecutionMode, int num_teams, int thread_limit,
-                         uint64_t loop_tripcount, int DeviceNumTeams) {
+launchVals getLaunchVals(int WarpSize, EnvironmentVariables Env,
+                         int ConstWGSize, int ExecutionMode, int num_teams,
+                         int thread_limit, uint64_t loop_tripcount,
+                         int DeviceNumTeams) {
 
   int threadsPerGroup = RTLDeviceInfoTy::Default_WG_Size;
   int num_groups = 0;
@@ -1901,7 +1918,7 @@
   if (print_kernel_trace & STARTUP_DETAILS) {
     DP("RTLDeviceInfoTy::Max_Teams: %d\n", RTLDeviceInfoTy::Max_Teams);
     DP("Max_Teams: %d\n", Max_Teams);
-    DP("RTLDeviceInfoTy::Warp_Size: %d\n", RTLDeviceInfoTy::Warp_Size);
+    DP("RTLDeviceInfoTy::Warp_Size: %d\n", WarpSize);
     DP("RTLDeviceInfoTy::Max_WG_Size: %d\n", RTLDeviceInfoTy::Max_WG_Size);
     DP("RTLDeviceInfoTy::Default_WG_Size: %d\n",
        RTLDeviceInfoTy::Default_WG_Size);
@@ -1914,8 +1931,8 @@
     threadsPerGroup = thread_limit;
     DP("Setting threads per block to requested %d\n", thread_limit);
     if (ExecutionMode == GENERIC) { // Add master warp for GENERIC
-      threadsPerGroup += RTLDeviceInfoTy::Warp_Size;
-      DP("Adding master wavefront: +%d threads\n", RTLDeviceInfoTy::Warp_Size);
+      threadsPerGroup += WarpSize;
+      DP("Adding master wavefront: +%d threads\n", WarpSize);
     }
     if (threadsPerGroup > RTLDeviceInfoTy::Max_WG_Size) { // limit to max
       threadsPerGroup = RTLDeviceInfoTy::Max_WG_Size;
@@ -1951,7 +1968,7 @@
   // So we only handle constant thread_limits.
   if (threadsPerGroup >
       RTLDeviceInfoTy::Default_WG_Size) //  256 < threadsPerGroup <= 1024
-    // Should we round threadsPerGroup up to nearest RTLDeviceInfoTy::Warp_Size
+    // Should we round threadsPerGroup up to nearest WarpSize
     // here?
     num_groups = (Max_Teams * RTLDeviceInfoTy::Max_WG_Size) / threadsPerGroup;
 
@@ -2100,12 +2117,13 @@
   /*
    * Set limit based on ThreadsPerGroup and GroupsPerDevice
    */
-  launchVals LV = getLaunchVals(DeviceInfo.Env, KernelInfo->ConstWGSize,
-                                KernelInfo->ExecutionMode,
-                                num_teams,      // From run_region arg
-                                thread_limit,   // From run_region arg
-                                loop_tripcount, // From run_region arg
-                                DeviceInfo.NumTeams[KernelInfo->device_id]);
+  launchVals LV =
+      getLaunchVals(DeviceInfo.WarpSize[device_id], DeviceInfo.Env,
+                    KernelInfo->ConstWGSize, KernelInfo->ExecutionMode,
+                    num_teams,      // From run_region arg
+                    thread_limit,   // From run_region arg
+                    loop_tripcount, // From run_region arg
+                    DeviceInfo.NumTeams[KernelInfo->device_id]);
   const int GridSize = LV.GridSize;
   const int WorkgroupSize = LV.WorkgroupSize;
 
Index: openmp/libomptarget/deviceRTLs/amdgcn/src/target_impl.h
===================================================================
--- openmp/libomptarget/deviceRTLs/amdgcn/src/target_impl.h
+++ openmp/libomptarget/deviceRTLs/amdgcn/src/target_impl.h
@@ -34,7 +34,7 @@
 #include "llvm/Frontend/OpenMP/OMPGridValues.h"
 
 INLINE constexpr const llvm::omp::GV &getGridValue() {
-  return llvm::omp::AMDGPUGridValues;
+  return llvm::omp::getAMDGPUGridValues<__AMDGCN_WAVEFRONT_SIZE>();
 }
 
 ////////////////////////////////////////////////////////////////////////////////
Index: openmp/libomptarget/deviceRTLs/amdgcn/CMakeLists.txt
===================================================================
--- openmp/libomptarget/deviceRTLs/amdgcn/CMakeLists.txt
+++ openmp/libomptarget/deviceRTLs/amdgcn/CMakeLists.txt
@@ -107,7 +107,7 @@
 endif()
 
 # create libraries
-set(mcpus gfx700 gfx701 gfx801 gfx803 gfx900 gfx902 gfx906 gfx908)
+set(mcpus gfx700 gfx701 gfx801 gfx803 gfx900 gfx902 gfx906 gfx908 gfx1010 gfx1031)
 if (DEFINED LIBOMPTARGET_AMDGCN_GFXLIST)
   set(mcpus ${LIBOMPTARGET_AMDGCN_GFXLIST})
 endif()
Index: llvm/include/llvm/Frontend/OpenMP/OMPGridValues.h
===================================================================
--- llvm/include/llvm/Frontend/OpenMP/OMPGridValues.h
+++ llvm/include/llvm/Frontend/OpenMP/OMPGridValues.h
@@ -81,7 +81,7 @@
 };
 
 /// For AMDGPU GPUs
-static constexpr GV AMDGPUGridValues = {
+static constexpr GV AMDGPUGridValues64 = {
     256,  // GV_Slot_Size
     64,   // GV_Warp_Size
     128,  // GV_Max_Teams
@@ -90,6 +90,20 @@
     256,  // GV_Default_WG_Size
 };
 
+static constexpr GV AMDGPUGridValues32 = {
+    256,  // GV_Slot_Size
+    32,   // GV_Warp_Size
+    128,  // GV_Max_Teams
+    896,  // GV_SimpleBufferSize
+    1024, // GV_Max_WG_Size,
+    256,  // GV_Default_WG_Size
+};
+
+template <uint32_t wavesize> constexpr const GV &getAMDGPUGridValues() {
+  static_assert(wavesize == 32 || wavesize == 64, "");
+  return wavesize == 32 ? AMDGPUGridValues32 : AMDGPUGridValues64;
+}
+
 /// For Nvidia GPUs
 static constexpr GV NVPTXGridValues = {
     256,  // GV_Slot_Size
Index: clang/lib/Basic/Targets/AMDGPU.h
===================================================================
--- clang/lib/Basic/Targets/AMDGPU.h
+++ clang/lib/Basic/Targets/AMDGPU.h
@@ -371,7 +371,14 @@
   }
 
   const llvm::omp::GV &getGridValue() const override {
-    return llvm::omp::AMDGPUGridValues;
+    switch (WavefrontSize) {
+    case 32:
+      return llvm::omp::getAMDGPUGridValues<32>();
+    case 64:
+      return llvm::omp::getAMDGPUGridValues<64>();
+    default:
+      llvm_unreachable("getGridValue not implemented for this wavesize");
+    }
   }
 
   /// \returns Target specific vtbl ptr address space.

_______________________________________________
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits

[PATCH] D108708: [openmp][amdgpu] Initial gfx10 offloading implementation

Reply via email to