[clang] [llvm] [mlir] [openmp] [OpenMP][offload] Cross-team reductions with variable number of teams (PR #195102)

Johannes Doerfert via cfe-commits Fri, 08 May 2026 07:20:17 -0700

================
@@ -61,6 +65,78 @@ static uint32_t gpu_irregular_simd_reduce(void *reduce_data,
   return (logical_lane_id == 0);
 }
 
+// Reduction within a block on the GPU.
+//
+// Template parameters:
+// - checkLiveness: Whether to check the liveness of the lanes. This is only
+//                  useful if gpu_block_reduce is called in a context where
+//                  L2 parallel regions are possible.
+// Parameters:
+// - reduce_data:   Pointer to the reduction data
+// - shflFct:       Shuffle reduction function
+// - cpyFct:        Inter-warp copy function (copies data from each warp's
+//                  thread 0 to the lanes of the zeroth warp)
+// - NumValues:     Number of values to reduce / threads to consider
+// - BlockThreadId: Thread ID in block (getThreadIdInBlock() in SPMD and 0 in
+//                  Generic mode)
+//
+// Returns:
+// - 1 if the thread is the zeroth thread of the block
+// - 0 otherwise
+template <bool checkLiveness = true>
+[[clang::always_inline]]
+static uint32_t gpu_block_reduce(void *reduce_data, ShuffleReductFnTy shflFct,
+                                 InterWarpCopyFnTy cpyFct, uint32_t NumValues,
+                                 uint32_t BlockThreadId) {
+  if (NumValues <= 1)
+    return BlockThreadId == 0;
+
+  uint32_t WarpId = BlockThreadId / mapping::getWarpSize();
+  uint32_t WarpOffset = WarpId * mapping::getWarpSize();
+  // Calculate how many values this warp has to deal with. Cap WarpId *
+  // mapping::getWarpSize() at NumValues to avoid underflow.
+  uint32_t ActiveLanes =
+      WarpOffset < NumValues
+          ? kmpc_min(NumValues - WarpOffset, mapping::getWarpSize())
+          : 0;
+
+  if constexpr (checkLiveness) {
+    __kmpc_impl_lanemask_t Liveness = mapping::activemask();
+    // Check for partial warp with non-contiguous lanes.
+    if (Liveness != lanes::All && (Liveness & (Liveness + 1))) {
+      // Only threads in L2 parallel region may enter here.
+      return gpu_irregular_simd_reduce(reduce_data, shflFct);
+    }
+    ActiveLanes = kmpc_min(ActiveLanes, utils::popc(Liveness));
+  }
+
+  if (ActiveLanes < mapping::getWarpSize())
+    gpu_irregular_warp_reduce(reduce_data, shflFct, ActiveLanes,
+                              BlockThreadId % mapping::getWarpSize());
+  else
+    gpu_regular_warp_reduce(reduce_data, shflFct);
+
+  // When we have more than [mapping::getWarpSize()] number of threads
+  // a block reduction is performed here.
+  //
+  // Only L1 parallel region can enter this if condition.
+
+  if (NumValues > mapping::getWarpSize()) {
+    uint32_t WarpsNeeded =
+        (NumValues + mapping::getWarpSize() - 1) / mapping::getWarpSize();
+    // Gather all the reduced values from each warp
+    // to the first warp.
+    cpyFct(reduce_data, WarpsNeeded);
----------------
jdoerfert wrote:


Why do we do this for any warp but 0?

https://github.com/llvm/llvm-project/pull/195102
_______________________________________________
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits

[clang] [llvm] [mlir] [openmp] [OpenMP][offload] Cross-team reductions with variable number of teams (PR #195102)

Reply via email to