================
@@ -187,130 +248,57 @@ int32_t __kmpc_nvptx_teams_reduce_nowait_v2(
ThreadId = 0;
}
- uint32_t &IterCnt = state::getKernelLaunchEnvironment().ReductionIterCnt;
- uint32_t &Cnt = state::getKernelLaunchEnvironment().ReductionCnt;
-
// In non-generic mode all workers participate in the teams reduction.
// In generic mode only the team master participates in the teams
// reduction because the workers are waiting for parallel work.
uint32_t NumThreads = omp_get_num_threads();
uint32_t TeamId = omp_get_team_num();
uint32_t NumTeams = omp_get_num_teams();
- [[clang::loader_uninitialized]] static Local<unsigned> Bound;
- [[clang::loader_uninitialized]] static Local<unsigned> ChunkTeamCount;
-
- // Block progress for teams greater than the current upper
- // limit. We always only allow a number of teams less or equal
- // to the number of slots in the buffer.
- bool IsMaster = (ThreadId == 0);
- while (IsMaster) {
- Bound = atomic::load(&IterCnt, atomic::acquire);
- if (TeamId < Bound + num_of_records)
- break;
- }
- if (IsMaster) {
- int ModBockId = TeamId % num_of_records;
- if (TeamId < num_of_records) {
- lgcpyFct(GlobalBuffer, ModBockId, reduce_data);
- } else
- lgredFct(GlobalBuffer, ModBockId, reduce_data);
-
- // Propagate the memory writes above to the world.
- fence::kernel(atomic::release);
-
- // Increment team counter.
- // This counter is incremented by all teams in the current
- // num_of_records chunk.
- ChunkTeamCount = atomic::inc(&Cnt, num_of_records - 1u, atomic::seq_cst,
- atomic::MemScopeTy::device);
+ // Fast path for single-team kernels: no cross-team work required,
+ // the team-local reduction already produced the final result.
+ if (NumTeams <= 1)
+ return ThreadId == 0;
+
+ uint32_t &TeamsDone = state::getKernelLaunchEnvironment().ReductionTeamsDone;
+ void *GlobalBuffer = state::getKernelLaunchEnvironment().ReductionBuffer;
+ [[clang::loader_uninitialized]] static Local<uint32_t> TeamsDoneResult;
+
+ // Save the team's reduced value in the global buffer and atomically
+ // increment the teams-done counter.
+ if (ThreadId == 0) {
+ lgcpyFct(GlobalBuffer, TeamId, reduce_data);
+ TeamsDoneResult = atomic::inc(&TeamsDone, NumTeams - 1u, atomic::acq_rel,
+ atomic::MemScopeTy::device);
----------------
ro-i wrote:
"modulo" as-in: it wraps around if it's *larger* than the specified value (aka
larger than `NumTeams - 1`). The documentation for that is slightly misleading,
but it's specified here:
https://github.com/llvm/llvm-project/blob/1cb92d817468c6fbe1b9c6905bcf84f712de742c/llvm/include/llvm/IR/Instructions.h#L776-L778
https://github.com/llvm/llvm-project/pull/195102
_______________________________________________
cfe-commits mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits