[clang] [llvm] [mlir] [openmp] [OpenMP][offload] Cross-team reductions with variable number of teams (PR #195102)

Johannes Doerfert via cfe-commits Sat, 09 May 2026 09:28:56 -0700

================
@@ -187,129 +246,56 @@ int32_t __kmpc_nvptx_teams_reduce_nowait_v2(
     ThreadId = 0;
   }
 
-  uint32_t &IterCnt = state::getKernelLaunchEnvironment().ReductionIterCnt;
-  uint32_t &Cnt = state::getKernelLaunchEnvironment().ReductionCnt;
-
   // In non-generic mode all workers participate in the teams reduction.
   // In generic mode only the team master participates in the teams
   // reduction because the workers are waiting for parallel work.
   uint32_t NumThreads = omp_get_num_threads();
   uint32_t TeamId = omp_get_team_num();
   uint32_t NumTeams = omp_get_num_teams();
-  [[clang::loader_uninitialized]] static Local<unsigned> Bound;
-  [[clang::loader_uninitialized]] static Local<unsigned> ChunkTeamCount;
-
-  // Block progress for teams greater than the current upper
-  // limit. We always only allow a number of teams less or equal
-  // to the number of slots in the buffer.
-  bool IsMaster = (ThreadId == 0);
-  while (IsMaster) {
-    Bound = atomic::load(&IterCnt, atomic::acquire);
-    if (TeamId < Bound + num_of_records)
-      break;
-  }
 
-  if (IsMaster) {
-    int ModBockId = TeamId % num_of_records;
-    if (TeamId < num_of_records) {
-      lgcpyFct(GlobalBuffer, ModBockId, reduce_data);
-    } else
-      lgredFct(GlobalBuffer, ModBockId, reduce_data);
-
-    // Propagate the memory writes above to the world.
-    fence::kernel(atomic::release);
-
-    // Increment team counter.
-    // This counter is incremented by all teams in the current
-    // num_of_records chunk.
-    ChunkTeamCount = atomic::inc(&Cnt, num_of_records - 1u, atomic::seq_cst,
-                                 atomic::MemScopeTy::device);
+  // Fast path for single-team kernels: no cross-team work required,
+  // the team-local reduction already produced the final result.
+  if (NumTeams <= 1)
+    return ThreadId == 0;
+
+  uint32_t &TeamsDone = state::getKernelLaunchEnvironment().ReductionTeamsDone;
+  void *GlobalBuffer = state::getKernelLaunchEnvironment().ReductionBuffer;
+  [[clang::loader_uninitialized]] static Local<uint32_t> TeamsDoneResult;
+
+  // Save the team's reduced value in the global buffer and atomically
+  // increment the teams-done counter.
+  if (ThreadId == 0) {
+    lgcpyFct(GlobalBuffer, TeamId, reduce_data);
+    TeamsDoneResult = atomic::inc(&TeamsDone, NumTeams - 1u, atomic::acq_rel,
+                                  atomic::MemScopeTy::device);
   }
 
-  // Synchronize in SPMD mode as in generic mode all but 1 threads are in the
-  // state machine.
+  // This sync is needed so that all threads from last team see the shared teams
+  // done counter value and know that they are in the last team.
   if (mapping::isSPMDMode())
     synchronize::threadsAligned(atomic::acq_rel);
 
-  // reduce_data is global or shared so before being reduced within the
-  // warp we need to bring it in local memory:
-  // local_reduce_data = reduce_data[i]
-  //
-  // Example for 3 reduction variables a, b, c (of potentially different
-  // types):
-  //
-  // buffer layout (struct of arrays):
-  // a, a, ..., a, b, b, ... b, c, c, ... c
-  // |__________|
-  //     num_of_records
-  //
-  // local_data_reduce layout (struct):
-  // a, b, c
-  //
-  // Each thread will have a local struct containing the values to be
-  // reduced:
-  //      1. do reduction within each warp.
-  //      2. do reduction across warps.
-  //      3. write the final result to the main reduction variable
-  //         by returning 1 in the thread holding the reduction result.
-
-  // Check if this is the very last team.
-  unsigned NumRecs = kmpcMin(NumTeams, uint32_t(num_of_records));
-  if (ChunkTeamCount == NumTeams - Bound - 1) {
-    // Ensure we see the global memory writes by other teams
-    fence::kernel(atomic::acquire);
-
-    //
-    // Last team processing.
-    //
-    if (ThreadId >= NumRecs)
-      return 0;
-    NumThreads = roundToWarpsize(kmpcMin(NumThreads, NumRecs));
-    if (ThreadId >= NumThreads)
-      return 0;
+  // If teams done counter reaches NumTeams-1, this is the last team.
+  if (TeamsDoneResult != NumTeams - 1u)
+    return 0;
 
-    // Load from buffer and reduce.
+  // The last team performs final reduction across all team values.
+  uint32_t ValidValues = NumThreads < NumTeams ? NumThreads : NumTeams;
+  if (ThreadId < ValidValues) {
+    // Make sure that global buffer is fresh.
+    fence::kernel(atomic::acquire);
+    // Get the team values from the global buffer.
     glcpyFct(GlobalBuffer, ThreadId, reduce_data);
-    for (uint32_t i = NumThreads + ThreadId; i < NumRecs; i += NumThreads)
-      glredFct(GlobalBuffer, i, reduce_data);
-
-    // Reduce across warps to the warp master.
-    if (NumThreads > 1) {
-      gpu_regular_warp_reduce(reduce_data, shflFct);
-
-      // When we have more than [mapping::getWarpSize()] number of threads
-      // a block reduction is performed here.
-      uint32_t ActiveThreads = kmpcMin(NumRecs, NumThreads);
-      if (ActiveThreads > mapping::getWarpSize()) {
-        uint32_t WarpsNeeded = (ActiveThreads + mapping::getWarpSize() - 1) /
-                               mapping::getWarpSize();
-        // Gather all the reduced values from each warp
-        // to the first warp.
-        cpyFct(reduce_data, WarpsNeeded);
-
-        uint32_t WarpId = ThreadId / mapping::getWarpSize();
-        if (WarpId == 0)
-          gpu_irregular_warp_reduce(reduce_data, shflFct, WarpsNeeded,
-                                    ThreadId);
-      }
-    }
-
-    if (IsMaster) {
-      Cnt = 0;
-      IterCnt = 0;
-      return 1;
-    }
-    return 0;
-  }
-  if (IsMaster && ChunkTeamCount == num_of_records - 1) {
-    // Allow SIZE number of teams to proceed writing their
-    // intermediate results to the global buffer.
-    atomic::add(&IterCnt, uint32_t(num_of_records), atomic::seq_cst);
+    // In case we have more teams than threads, we need to iterate over the
+    // remaining teams.
+    for (uint32_t I = NumThreads + ThreadId; I < NumTeams; I += NumThreads)
+      glredFct(GlobalBuffer, I, reduce_data);
   }
 
-  return 0;
-}
+  return gpu_block_reduce<false>(reduce_data, shflFct, cpyFct, ValidValues,
+                                 ThreadId);
 }
+} // extern "C"
 
 void *__kmpc_reduction_get_fixed_buffer() {
   return state::getKernelLaunchEnvironment().ReductionBuffer;
----------------
jdoerfert wrote:


This is not needed anymore, I think.

https://github.com/llvm/llvm-project/pull/195102
_______________________________________________
cfe-commits mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits

[clang] [llvm] [mlir] [openmp] [OpenMP][offload] Cross-team reductions with variable number of teams (PR #195102)

Reply via email to