https://github.com/ro-i created https://github.com/llvm/llvm-project/pull/196061
Significantly reduces register usage and removes register spilling in `offload/test/offloading/multiple-reductions.cpp`, for example. Provides a speedup of up to 5-10x for many reductions in such larger setups. Based on https://github.com/llvm/llvm-project/pull/195940. See also the discussion in https://github.com/llvm/llvm-project/pull/195102. From bb128b7689aedbc4a46d0578c25515ec8c0d16cc Mon Sep 17 00:00:00 2001 From: Robert Imschweiler <[email protected]> Date: Wed, 6 May 2026 07:02:23 -0500 Subject: [PATCH] [OpenMP][offload] Inline target reductions Significantly reduces register usage and removes register spilling in `offload/test/offloading/multiple-reductions.cpp`, for example. Provides speedup of up to 5-10x for a lot of reductions in such a larger setup. --- openmp/device/src/Reduction.cpp | 21 ++++++++++++++++----- 1 file changed, 16 insertions(+), 5 deletions(-) diff --git a/openmp/device/src/Reduction.cpp b/openmp/device/src/Reduction.cpp index f2a2d5e39aaa5..8a685d3bad885 100644 --- a/openmp/device/src/Reduction.cpp +++ b/openmp/device/src/Reduction.cpp @@ -22,15 +22,19 @@ using namespace ompx; namespace { -void gpu_regular_warp_reduce(void *reduce_data, ShuffleReductFnTy shflFct) { +[[clang::always_inline]] +static void gpu_regular_warp_reduce(void *reduce_data, + ShuffleReductFnTy shflFct) { for (uint32_t mask = mapping::getWarpSize() / 2; mask > 0; mask /= 2) { shflFct(reduce_data, /*LaneId - not used= */ 0, /*Offset = */ mask, /*AlgoVersion=*/0); } } -void gpu_irregular_warp_reduce(void *reduce_data, ShuffleReductFnTy shflFct, - uint32_t size, uint32_t tid) { +[[clang::always_inline]] +static void gpu_irregular_warp_reduce(void *reduce_data, + ShuffleReductFnTy shflFct, uint32_t size, + uint32_t tid) { uint32_t curr_size; uint32_t mask; curr_size = size; @@ -42,6 +46,7 @@ void gpu_irregular_warp_reduce(void *reduce_data, ShuffleReductFnTy shflFct, } } +[[clang::always_inline]] static uint32_t gpu_irregular_simd_reduce(void *reduce_data, 
ShuffleReductFnTy shflFct) { uint32_t size, remote_id, physical_lane_id; @@ -61,6 +66,7 @@ static uint32_t gpu_irregular_simd_reduce(void *reduce_data, return (logical_lane_id == 0); } +[[clang::always_inline]] static int32_t nvptx_parallel_reduce_nowait(void *reduce_data, ShuffleReductFnTy shflFct, InterWarpCopyFnTy cpyFct) { @@ -155,17 +161,21 @@ static int32_t nvptx_parallel_reduce_nowait(void *reduce_data, return BlockThreadId == 0; } -uint32_t roundToWarpsize(uint32_t s) { +[[clang::always_inline]] +static uint32_t roundToWarpsize(uint32_t s) { if (s < mapping::getWarpSize()) return 1; return (s & ~(unsigned)(mapping::getWarpSize() - 1)); } -uint32_t kmpcMin(uint32_t x, uint32_t y) { return x < y ? x : y; } +static constexpr uint32_t kmpcMin(uint32_t x, uint32_t y) { + return x < y ? x : y; +} } // namespace extern "C" { +[[clang::always_inline]] int32_t __kmpc_nvptx_parallel_reduce_nowait_v2(IdentTy *Loc, uint64_t reduce_data_size, void *reduce_data, @@ -174,6 +184,7 @@ int32_t __kmpc_nvptx_parallel_reduce_nowait_v2(IdentTy *Loc, return nvptx_parallel_reduce_nowait(reduce_data, shflFct, cpyFct); } +[[clang::always_inline]] int32_t __kmpc_nvptx_teams_reduce_nowait_v2( IdentTy *Loc, void *GlobalBuffer, uint32_t num_of_records, uint64_t reduce_data_size, void *reduce_data, ShuffleReductFnTy shflFct, _______________________________________________ llvm-branch-commits mailing list [email protected] https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
