https://github.com/JonChesterfield created https://github.com/llvm/llvm-project/pull/131141
Declare a few functions before including the target specific headers then define a fallback_match_any, used by amdgpu and by older nvptx. >From b9fdef141a83969eff8e7ac2dbc8c98163c0fbf5 Mon Sep 17 00:00:00 2001 From: Jon Chesterfield <jonathanchesterfi...@gmail.com> Date: Thu, 13 Mar 2025 13:23:38 +0000 Subject: [PATCH] [Headers][NFC] Deduplicate gpu_match_any between targets --- clang/lib/Headers/amdgpuintrin.h | 19 +------------ clang/lib/Headers/gpuintrin.h | 48 +++++++++++++++++++++++++++++++- clang/lib/Headers/nvptxintrin.h | 19 ++----------- 3 files changed, 50 insertions(+), 36 deletions(-) diff --git a/clang/lib/Headers/amdgpuintrin.h b/clang/lib/Headers/amdgpuintrin.h index 56748f6c3e818..74054068c9714 100644 --- a/clang/lib/Headers/amdgpuintrin.h +++ b/clang/lib/Headers/amdgpuintrin.h @@ -30,10 +30,6 @@ _Pragma("omp begin declare variant match(device = {arch(amdgcn)})"); // Attribute to declare a function as a kernel. #define __gpu_kernel __attribute__((amdgpu_kernel, visibility("protected"))) -// Defined in gpuintrin.h, used later in this file. -_DEFAULT_FN_ATTRS static __inline__ uint64_t -__gpu_read_first_lane_u64(uint64_t __lane_mask, uint64_t __x); - // Returns the number of workgroups in the 'x' dimension of the grid. _DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_num_blocks_x(void) { return __builtin_amdgcn_grid_size_x() / __builtin_amdgcn_workgroup_size_x(); @@ -146,20 +142,7 @@ __gpu_shuffle_idx_u32(uint64_t __lane_mask, uint32_t __idx, uint32_t __x, // Returns a bitmask marking all lanes that have the same value of __x. _DEFAULT_FN_ATTRS static __inline__ uint64_t __gpu_match_any_u32(uint64_t __lane_mask, uint32_t __x) { - uint32_t __match_mask = 0; - - bool __done = 0; - while (__gpu_ballot(__lane_mask, !__done)) { - if (!__done) { - uint32_t __first = __gpu_read_first_lane_u32(__lane_mask, __x); - if (__first == __x) { - __match_mask = __gpu_lane_mask(); - __done = 1; - } - } - } - __gpu_sync_lane(__lane_mask); - return __match_mask; + return __gpu_fallback_match_any_u32(__lane_mask, __x); } // Returns a bitmask marking all lanes that have the same value of __x. diff --git a/clang/lib/Headers/gpuintrin.h b/clang/lib/Headers/gpuintrin.h index ac79d685337c5..e4a9a49e10e1f 100644 --- a/clang/lib/Headers/gpuintrin.h +++ b/clang/lib/Headers/gpuintrin.h @@ -32,6 +32,52 @@ _Pragma("push_macro(\"bool\")"); #define bool _Bool #endif + +_Pragma("omp begin declare target device_type(nohost)"); +_Pragma("omp begin declare variant match(device = {kind(gpu)})"); + +// Returns the bit-mask of active threads in the current warp or wavefront. +_DEFAULT_FN_ATTRS static __inline__ uint64_t __gpu_lane_mask(void) { + +// Returns a bitmask of threads in the current lane for which \p x is true. +_DEFAULT_FN_ATTRS static __inline__ uint64_t __gpu_ballot(uint64_t __lane_mask, + bool __x); + +// Copies the value from the first active thread to the rest. +_DEFAULT_FN_ATTRS static __inline__ uint32_t +__gpu_read_first_lane_u32(uint64_t __lane_mask, uint32_t __x) { + + +// Copies the value from the first active thread to the rest. +_DEFAULT_FN_ATTRS static __inline__ uint64_t +__gpu_read_first_lane_u64(uint64_t __lane_mask, uint64_t __x); + + + +// Returns a bitmask marking all lanes that have the same value of __x. +_DEFAULT_FN_ATTRS static __inline__ uint64_t +__gpu_fallback_match_any_u32(uint64_t __lane_mask, uint32_t __x) { + uint32_t __match_mask = 0; + + bool __done = 0; + while (__gpu_ballot(__lane_mask, !__done)) { + if (!__done) { + uint32_t __first = __gpu_read_first_lane_u32(__lane_mask, __x); + if (__first == __x) { + __match_mask = __gpu_lane_mask(); + __done = 1; + } + } + } + __gpu_sync_lane(__lane_mask); + return __match_mask; +} + + +_Pragma("omp end declare variant"); +_Pragma("omp end declare target"); + + #if defined(__NVPTX__) #include <nvptxintrin.h> #elif defined(__AMDGPU__) @@ -115,7 +161,7 @@ __gpu_is_first_in_lane(uint64_t __lane_mask) { return __gpu_lane_id() == __gpu_first_lane_id(__lane_mask); } -// Copies the value from the first active thread in the wavefront to the rest. +// Copies the value from the first active thread to the rest. _DEFAULT_FN_ATTRS static __inline__ uint64_t __gpu_read_first_lane_u64(uint64_t __lane_mask, uint64_t __x) { uint32_t __hi = (uint32_t)(__x >> 32ull); diff --git a/clang/lib/Headers/nvptxintrin.h b/clang/lib/Headers/nvptxintrin.h index 10ad7a682d4cd..1da9402040b52 100644 --- a/clang/lib/Headers/nvptxintrin.h +++ b/clang/lib/Headers/nvptxintrin.h @@ -34,10 +34,6 @@ _Pragma("omp begin declare variant match(device = {arch(nvptx64)})"); // Attribute to declare a function as a kernel. #define __gpu_kernel __attribute__((nvptx_kernel, visibility("protected"))) -// Defined in gpuintrin.h, used later in this file. -_DEFAULT_FN_ATTRS static __inline__ uint64_t -__gpu_read_first_lane_u64(uint64_t __lane_mask, uint64_t __x); - // Returns the number of CUDA blocks in the 'x' dimension. _DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_num_blocks_x(void) { return __nvvm_read_ptx_sreg_nctaid_x(); @@ -156,20 +152,9 @@ __gpu_match_any_u32(uint64_t __lane_mask, uint32_t __x) { // Newer targets can use the dedicated CUDA support. #if __CUDA_ARCH__ >= 700 return __nvvm_match_any_sync_i32(__lane_mask, __x); +#else + return __gpu_fallback_match_any_u32(__lane_mask, __x); #endif - - uint32_t __match_mask = 0; - bool __done = 0; - while (__gpu_ballot(__lane_mask, !__done)) { - if (!__done) { - uint32_t __first = __gpu_read_first_lane_u32(__lane_mask, __x); - if (__first == __x) { - __match_mask = __gpu_lane_mask(); - __done = 1; - } - } - } - return __match_mask; } // Returns a bitmask marking all lanes that have the same value of __x. _______________________________________________ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits