================
@@ -187,129 +246,56 @@ int32_t __kmpc_nvptx_teams_reduce_nowait_v2(
ThreadId = 0;
}
- uint32_t &IterCnt = state::getKernelLaunchEnvironment().ReductionIterCnt;
- uint32_t &Cnt = state::getKernelLaunchEnvironment().ReductionCnt;
-
// In non-generic mode all workers participate in the teams reduction.
// In generic mode only the team master participates in the teams
// reduction because the workers are waiting for parallel work.
uint32_t NumThreads = omp_get_num_threads();
uint32_t TeamId = omp_get_team_num();
uint32_t NumTeams = omp_get_num_teams();
- [[clang::loader_uninitialized]] static Local<unsigned> Bound;
- [[clang::loader_uninitialized]] static Local<unsigned> ChunkTeamCount;
-
- // Block progress for teams greater than the current upper
- // limit. We always only allow a number of teams less or equal
- // to the number of slots in the buffer.
- bool IsMaster = (ThreadId == 0);
- while (IsMaster) {
- Bound = atomic::load(&IterCnt, atomic::acquire);
- if (TeamId < Bound + num_of_records)
- break;
- }
- if (IsMaster) {
- int ModBockId = TeamId % num_of_records;
- if (TeamId < num_of_records) {
- lgcpyFct(GlobalBuffer, ModBockId, reduce_data);
- } else
- lgredFct(GlobalBuffer, ModBockId, reduce_data);
-
- // Propagate the memory writes above to the world.
- fence::kernel(atomic::release);
-
- // Increment team counter.
- // This counter is incremented by all teams in the current
- // num_of_records chunk.
- ChunkTeamCount = atomic::inc(&Cnt, num_of_records - 1u, atomic::seq_cst,
- atomic::MemScopeTy::device);
+ // Fast path for single-team kernels: no cross-team work required,
+ // the team-local reduction already produced the final result.
+ if (NumTeams <= 1)
+ return ThreadId == 0;
+
+ uint32_t &TeamsDone = state::getKernelLaunchEnvironment().ReductionTeamsDone;
+ void *GlobalBuffer = state::getKernelLaunchEnvironment().ReductionBuffer;
+ [[clang::loader_uninitialized]] static Local<uint32_t> TeamsDoneResult;
+
+ // Save the team's reduced value in the global buffer and atomically
+ // increment the teams-done counter.
+ if (ThreadId == 0) {
+ lgcpyFct(GlobalBuffer, TeamId, reduce_data);
+ TeamsDoneResult = atomic::inc(&TeamsDone, NumTeams - 1u, atomic::acq_rel,
+ atomic::MemScopeTy::device);
}
- // Synchronize in SPMD mode as in generic mode all but 1 threads are in the
- // state machine.
+ // This sync is needed so that all threads from last team see the shared
teams
+ // done counter value and know that they are in the last team.
if (mapping::isSPMDMode())
synchronize::threadsAligned(atomic::acq_rel);
- // reduce_data is global or shared so before being reduced within the
- // warp we need to bring it in local memory:
- // local_reduce_data = reduce_data[i]
- //
- // Example for 3 reduction variables a, b, c (of potentially different
- // types):
- //
- // buffer layout (struct of arrays):
- // a, a, ..., a, b, b, ... b, c, c, ... c
- // |__________|
- // num_of_records
- //
- // local_data_reduce layout (struct):
- // a, b, c
- //
- // Each thread will have a local struct containing the values to be
- // reduced:
- // 1. do reduction within each warp.
- // 2. do reduction across warps.
- // 3. write the final result to the main reduction variable
- // by returning 1 in the thread holding the reduction result.
-
- // Check if this is the very last team.
- unsigned NumRecs = kmpcMin(NumTeams, uint32_t(num_of_records));
- if (ChunkTeamCount == NumTeams - Bound - 1) {
- // Ensure we see the global memory writes by other teams
- fence::kernel(atomic::acquire);
-
- //
- // Last team processing.
- //
- if (ThreadId >= NumRecs)
- return 0;
- NumThreads = roundToWarpsize(kmpcMin(NumThreads, NumRecs));
- if (ThreadId >= NumThreads)
- return 0;
+ // If teams done counter reaches NumTeams-1, this is the last team.
+ if (TeamsDoneResult != NumTeams - 1u)
+ return 0;
- // Load from buffer and reduce.
+ // The last team performs final reduction across all team values.
+ uint32_t ValidValues = NumThreads < NumTeams ? NumThreads : NumTeams;
+ if (ThreadId < ValidValues) {
+ // Make sure that global buffer is fresh.
+ fence::kernel(atomic::acquire);
+ // Get the team values from the global buffer.
glcpyFct(GlobalBuffer, ThreadId, reduce_data);
- for (uint32_t i = NumThreads + ThreadId; i < NumRecs; i += NumThreads)
- glredFct(GlobalBuffer, i, reduce_data);
-
- // Reduce across warps to the warp master.
- if (NumThreads > 1) {
- gpu_regular_warp_reduce(reduce_data, shflFct);
-
- // When we have more than [mapping::getWarpSize()] number of threads
- // a block reduction is performed here.
- uint32_t ActiveThreads = kmpcMin(NumRecs, NumThreads);
- if (ActiveThreads > mapping::getWarpSize()) {
- uint32_t WarpsNeeded = (ActiveThreads + mapping::getWarpSize() - 1) /
- mapping::getWarpSize();
- // Gather all the reduced values from each warp
- // to the first warp.
- cpyFct(reduce_data, WarpsNeeded);
-
- uint32_t WarpId = ThreadId / mapping::getWarpSize();
- if (WarpId == 0)
- gpu_irregular_warp_reduce(reduce_data, shflFct, WarpsNeeded,
- ThreadId);
- }
- }
-
- if (IsMaster) {
- Cnt = 0;
- IterCnt = 0;
- return 1;
- }
- return 0;
- }
- if (IsMaster && ChunkTeamCount == num_of_records - 1) {
- // Allow SIZE number of teams to proceed writing their
- // intermediate results to the global buffer.
- atomic::add(&IterCnt, uint32_t(num_of_records), atomic::seq_cst);
+ // In case we have more teams than threads, we need to iterate over the
+ // remaining teams.
+ for (uint32_t I = NumThreads + ThreadId; I < NumTeams; I += NumThreads)
+ glredFct(GlobalBuffer, I, reduce_data);
}
- return 0;
-}
+ return gpu_block_reduce<false>(reduce_data, shflFct, cpyFct, ValidValues,
+ ThreadId);
}
+} // extern "C"
void *__kmpc_reduction_get_fixed_buffer() {
return state::getKernelLaunchEnvironment().ReductionBuffer;
----------------
ro-i wrote:
removed
https://github.com/llvm/llvm-project/pull/195102
_______________________________________________
cfe-commits mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits