================
@@ -61,6 +65,78 @@ static uint32_t gpu_irregular_simd_reduce(void *reduce_data,
return (logical_lane_id == 0);
}
+// Reduction within a block on the GPU.
+//
+// Template parameters:
+// - checkLiveness: Whether to check the liveness of the lanes. This is only
+// useful if gpu_block_reduce is called in a context where
+// L2 parallel regions are possible.
+// Parameters:
+// - reduce_data: Pointer to the reduction data
+// - shflFct: Shuffle reduction function
+// - cpyFct: Inter-warp copy function (copies data from each warp's thread
+// 0 to the lanes of the zeroth warp)
+// - NumValues: Number of values to reduce / threads to consider
+// - BlockThreadId: Thread ID in block (getThreadIdInBlock() in SPMD and 0
+// in Generic mode)
+//
+// Returns:
+// - 1 if the thread is the zeroth thread of the block
+// - 0 otherwise
+template <bool checkLiveness = true>
+[[clang::always_inline]]
+static uint32_t gpu_block_reduce(void *reduce_data, ShuffleReductFnTy shflFct,
+ InterWarpCopyFnTy cpyFct, uint32_t NumValues,
+ uint32_t BlockThreadId) {
+ if (NumValues <= 1)
+ return BlockThreadId == 0;
+
+ uint32_t WarpId = BlockThreadId / mapping::getWarpSize();
+ uint32_t WarpOffset = WarpId * mapping::getWarpSize();
+ // Calculate how many values this warp has to deal with. Cap WarpId *
+ // mapping::getWarpSize() at NumValues to avoid underflow.
+ uint32_t ActiveLanes =
+ WarpOffset < NumValues
+ ? kmpc_min(NumValues - WarpOffset, mapping::getWarpSize())
+ : 0;
+
+ if constexpr (checkLiveness) {
+ __kmpc_impl_lanemask_t Liveness = mapping::activemask();
+ // Check for partial warp with non-contiguous lanes.
+ if (Liveness != lanes::All && (Liveness & (Liveness + 1))) {
+ // Only threads in L2 parallel region may enter here.
+ return gpu_irregular_simd_reduce(reduce_data, shflFct);
+ }
+ ActiveLanes = kmpc_min(ActiveLanes, utils::popc(Liveness));
+ }
+
+ if (ActiveLanes < mapping::getWarpSize())
+ gpu_irregular_warp_reduce(reduce_data, shflFct, ActiveLanes,
+ BlockThreadId % mapping::getWarpSize());
+ else
+ gpu_regular_warp_reduce(reduce_data, shflFct);
+
+ // When we have more than [mapping::getWarpSize()] number of threads
+ // a block reduction is performed here.
+ //
+ // Only L1 parallel region can enter this if condition.
+
+ if (NumValues > mapping::getWarpSize()) {
+ uint32_t WarpsNeeded =
+ (NumValues + mapping::getWarpSize() - 1) / mapping::getWarpSize();
+ // Gather all the reduced values from each warp
+ // to the first warp.
+ cpyFct(reduce_data, WarpsNeeded);
----------------
jdoerfert wrote:
Why do we do this for any warp but 0?
https://github.com/llvm/llvm-project/pull/195102
_______________________________________________
cfe-commits mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits