[clang] [Headers][NFC] Steps to allow sharing code between gpu intrin.h headers (PR #131134)
https://github.com/JonChesterfield updated https://github.com/llvm/llvm-project/pull/131134 >From 7347ebc6a0aadd1b9676e329bdf7705dbfae7875 Mon Sep 17 00:00:00 2001 From: Jon Chesterfield Date: Thu, 13 Mar 2025 12:49:42 + Subject: [PATCH] [libc][nfc] Steps to allow sharing code between gpu intrin.h headers --- clang/lib/Headers/amdgpuintrin.h | 22 ++ clang/lib/Headers/gpuintrin.h| 26 +++--- clang/lib/Headers/nvptxintrin.h | 27 --- 3 files changed, 25 insertions(+), 50 deletions(-) diff --git a/clang/lib/Headers/amdgpuintrin.h b/clang/lib/Headers/amdgpuintrin.h index 839a05175cf3e..56748f6c3e818 100644 --- a/clang/lib/Headers/amdgpuintrin.h +++ b/clang/lib/Headers/amdgpuintrin.h @@ -13,11 +13,8 @@ #error "This file is intended for AMDGPU targets or offloading to AMDGPU" #endif -#include <stdint.h> - -#if !defined(__cplusplus) -_Pragma("push_macro(\"bool\")"); -#define bool _Bool +#ifndef __GPUINTRIN_H +#error "Never use <amdgpuintrin.h> directly; include <gpuintrin.h> instead" #endif _Pragma("omp begin declare target device_type(nohost)"); @@ -146,17 +143,6 @@ __gpu_shuffle_idx_u32(uint64_t __lane_mask, uint32_t __idx, uint32_t __x, return __builtin_amdgcn_ds_bpermute(__lane << 2, __x); } -// Shuffles the the lanes inside the wavefront according to the given index. -_DEFAULT_FN_ATTRS static __inline__ uint64_t -__gpu_shuffle_idx_u64(uint64_t __lane_mask, uint32_t __idx, uint64_t __x, - uint32_t __width) { - uint32_t __hi = (uint32_t)(__x >> 32ull); - uint32_t __lo = (uint32_t)(__x & 0xFFFFFFFF); - return ((uint64_t)__gpu_shuffle_idx_u32(__lane_mask, __idx, __hi, __width) - << 32ull) | - ((uint64_t)__gpu_shuffle_idx_u32(__lane_mask, __idx, __lo, __width)); -} - // Returns a bitmask marking all lanes that have the same value of __x. 
_DEFAULT_FN_ATTRS static __inline__ uint64_t __gpu_match_any_u32(uint64_t __lane_mask, uint32_t __x) { @@ -238,8 +224,4 @@ _DEFAULT_FN_ATTRS static __inline__ void __gpu_thread_suspend(void) { _Pragma("omp end declare variant"); _Pragma("omp end declare target"); -#if !defined(__cplusplus) -_Pragma("pop_macro(\"bool\")"); -#endif - #endif // __AMDGPUINTRIN_H diff --git a/clang/lib/Headers/gpuintrin.h b/clang/lib/Headers/gpuintrin.h index 4181628d18048..ac79d685337c5 100644 --- a/clang/lib/Headers/gpuintrin.h +++ b/clang/lib/Headers/gpuintrin.h @@ -25,6 +25,13 @@ #endif #endif +#include <stdint.h> + +#if !defined(__cplusplus) +_Pragma("push_macro(\"bool\")"); +#define bool _Bool +#endif + #if defined(__NVPTX__) #include <nvptxintrin.h> #elif defined(__AMDGPU__) @@ -33,13 +40,6 @@ #error "This header is only meant to be used on GPU architectures." #endif -#include <stdint.h> - -#if !defined(__cplusplus) -_Pragma("push_macro(\"bool\")"); -#define bool _Bool -#endif - _Pragma("omp begin declare target device_type(nohost)"); _Pragma("omp begin declare variant match(device = {kind(gpu)})"); @@ -141,6 +141,18 @@ __gpu_read_first_lane_f64(uint64_t __lane_mask, double __x) { __builtin_bit_cast(uint64_t, __x))); } +// Shuffles the the lanes according to the given index. +_DEFAULT_FN_ATTRS static __inline__ uint64_t +__gpu_shuffle_idx_u64(uint64_t __lane_mask, uint32_t __idx, uint64_t __x, + uint32_t __width) { + uint32_t __hi = (uint32_t)(__x >> 32ull); + uint32_t __lo = (uint32_t)(__x & 0xFFFFFFFF); + uint32_t __mask = (uint32_t)__lane_mask; + return ((uint64_t)__gpu_shuffle_idx_u32(__mask, __idx, __hi, __width) + << 32ull) | + ((uint64_t)__gpu_shuffle_idx_u32(__mask, __idx, __lo, __width)); +} + // Shuffles the the lanes according to the given index. 
_DEFAULT_FN_ATTRS static __inline__ float __gpu_shuffle_idx_f32(uint64_t __lane_mask, uint32_t __idx, float __x, diff --git a/clang/lib/Headers/nvptxintrin.h b/clang/lib/Headers/nvptxintrin.h index d00a5f6de3950..10ad7a682d4cd 100644 --- a/clang/lib/Headers/nvptxintrin.h +++ b/clang/lib/Headers/nvptxintrin.h @@ -13,15 +13,12 @@ #error "This file is intended for NVPTX targets or offloading to NVPTX" #endif -#ifndef __CUDA_ARCH__ -#define __CUDA_ARCH__ 0 +#ifndef __GPUINTRIN_H +#error "Never use <nvptxintrin.h> directly; include <gpuintrin.h> instead" #endif -#include <stdint.h> - -#if !defined(__cplusplus) -_Pragma("push_macro(\"bool\")"); -#define bool _Bool +#ifndef __CUDA_ARCH__ +#define __CUDA_ARCH__ 0 #endif _Pragma("omp begin declare target device_type(nohost)"); @@ -153,18 +150,6 @@ __gpu_shuffle_idx_u32(uint64_t __lane_mask, uint32_t __idx, uint32_t __x, ((__gpu_num_lanes() - __width) << 8u) | 0x1f); } -// Shuffles the the lanes inside the warp according to the given index. -_DEFAULT_FN_ATTRS static __inline__ uint64_t -__gpu_shuffle_idx_u64(uint64_t __lane_mask, uint32_t __idx, uint64_t __x, - uint32_t __width) { - uint32_t __hi = (uint32_t)(__x >> 32ull); - uint32_t __lo = (uint32_t)(__x & 0xFFFFFFFF); - uint32_t __mask =
[clang] [Headers][NFC] Steps to allow sharing code between gpu intrin.h headers (PR #131134)
https://github.com/JonChesterfield closed https://github.com/llvm/llvm-project/pull/131134 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [Headers][NFC] Steps to allow sharing code between gpu intrin.h headers (PR #131134)
https://github.com/jhuber6 edited https://github.com/llvm/llvm-project/pull/131134 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [Headers][NFC] Steps to allow sharing code between gpu intrin.h headers (PR #131134)
https://github.com/jhuber6 approved this pull request. LG https://github.com/llvm/llvm-project/pull/131134 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [Headers][NFC] Steps to allow sharing code between gpu intrin.h headers (PR #131134)
https://github.com/JonChesterfield edited https://github.com/llvm/llvm-project/pull/131134 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [Headers][NFC] Steps to allow sharing code between gpu intrin.h headers (PR #131134)
https://github.com/JonChesterfield edited https://github.com/llvm/llvm-project/pull/131134 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [Headers][NFC] Steps to allow sharing code between gpu intrin.h headers (PR #131134)
https://github.com/JonChesterfield updated https://github.com/llvm/llvm-project/pull/131134 >From 4c04f6979409642eb6bc9dc3c48b5e3636210ef0 Mon Sep 17 00:00:00 2001 From: Jon Chesterfield Date: Thu, 13 Mar 2025 12:49:42 + Subject: [PATCH] [libc][nfc] Steps to allow sharing code between gpu intrin.h headers --- clang/lib/Headers/amdgpuintrin.h | 24 ++-- clang/lib/Headers/gpuintrin.h| 32 +++- clang/lib/Headers/nvptxintrin.h | 29 - 3 files changed, 29 insertions(+), 56 deletions(-) diff --git a/clang/lib/Headers/amdgpuintrin.h b/clang/lib/Headers/amdgpuintrin.h index 839a05175cf3e..6a2b46b0c511e 100644 --- a/clang/lib/Headers/amdgpuintrin.h +++ b/clang/lib/Headers/amdgpuintrin.h @@ -13,14 +13,10 @@ #error "This file is intended for AMDGPU targets or offloading to AMDGPU" #endif -#include <stdint.h> - -#if !defined(__cplusplus) -_Pragma("push_macro(\"bool\")"); -#define bool _Bool +#ifndef __GPUINTRIN_H +#error "Never use <amdgpuintrin.h> directly; include <gpuintrin.h> instead" #endif -_Pragma("omp begin declare target device_type(nohost)"); _Pragma("omp begin declare variant match(device = {arch(amdgcn)})"); // Type aliases to the address spaces used by the AMDGPU backend. @@ -146,17 +142,6 @@ __gpu_shuffle_idx_u32(uint64_t __lane_mask, uint32_t __idx, uint32_t __x, return __builtin_amdgcn_ds_bpermute(__lane << 2, __x); } -// Shuffles the the lanes inside the wavefront according to the given index. -_DEFAULT_FN_ATTRS static __inline__ uint64_t -__gpu_shuffle_idx_u64(uint64_t __lane_mask, uint32_t __idx, uint64_t __x, - uint32_t __width) { - uint32_t __hi = (uint32_t)(__x >> 32ull); - uint32_t __lo = (uint32_t)(__x & 0xFFFFFFFF); - return ((uint64_t)__gpu_shuffle_idx_u32(__lane_mask, __idx, __hi, __width) - << 32ull) | - ((uint64_t)__gpu_shuffle_idx_u32(__lane_mask, __idx, __lo, __width)); -} - // Returns a bitmask marking all lanes that have the same value of __x. 
_DEFAULT_FN_ATTRS static __inline__ uint64_t __gpu_match_any_u32(uint64_t __lane_mask, uint32_t __x) { @@ -236,10 +221,5 @@ _DEFAULT_FN_ATTRS static __inline__ void __gpu_thread_suspend(void) { } _Pragma("omp end declare variant"); -_Pragma("omp end declare target"); - -#if !defined(__cplusplus) -_Pragma("pop_macro(\"bool\")"); -#endif #endif // __AMDGPUINTRIN_H diff --git a/clang/lib/Headers/gpuintrin.h b/clang/lib/Headers/gpuintrin.h index 4181628d18048..d369e60b2fcac 100644 --- a/clang/lib/Headers/gpuintrin.h +++ b/clang/lib/Headers/gpuintrin.h @@ -25,14 +25,6 @@ #endif #endif -#if defined(__NVPTX__) -#include <nvptxintrin.h> -#elif defined(__AMDGPU__) -#include <amdgpuintrin.h> -#elif !defined(_OPENMP) -#error "This header is only meant to be used on GPU architectures." -#endif - #include <stdint.h> #if !defined(__cplusplus) @@ -41,6 +33,15 @@ _Pragma("push_macro(\"bool\")"); #endif _Pragma("omp begin declare target device_type(nohost)"); + +#if defined(__NVPTX__) +#include <nvptxintrin.h> +#elif defined(__AMDGPU__) +#include <amdgpuintrin.h> +#elif !defined(_OPENMP) +#error "This header is only meant to be used on GPU architectures." +#endif + _Pragma("omp begin declare variant match(device = {kind(gpu)})"); #define __GPU_X_DIM 0 @@ -141,6 +142,18 @@ __gpu_read_first_lane_f64(uint64_t __lane_mask, double __x) { __builtin_bit_cast(uint64_t, __x))); } +// Shuffles the the lanes according to the given index. +_DEFAULT_FN_ATTRS static __inline__ uint64_t +__gpu_shuffle_idx_u64(uint64_t __lane_mask, uint32_t __idx, uint64_t __x, + uint32_t __width) { + uint32_t __hi = (uint32_t)(__x >> 32ull); + uint32_t __lo = (uint32_t)(__x & 0xFFFFFFFF); + uint32_t __mask = (uint32_t)__lane_mask; + return ((uint64_t)__gpu_shuffle_idx_u32(__mask, __idx, __hi, __width) + << 32ull) | + ((uint64_t)__gpu_shuffle_idx_u32(__mask, __idx, __lo, __width)); +} + // Shuffles the the lanes according to the given index. 
_DEFAULT_FN_ATTRS static __inline__ float __gpu_shuffle_idx_f32(uint64_t __lane_mask, uint32_t __idx, float __x, @@ -223,7 +236,8 @@ __DO_LANE_SUM(double, f64); // double __gpu_lane_sum_f64(m, x) #undef __DO_LANE_SUM _Pragma("omp end declare variant"); -_Pragma("omp end declare target"); + +_Pragma("omp end declare target"); // nohost #if !defined(__cplusplus) _Pragma("pop_macro(\"bool\")"); diff --git a/clang/lib/Headers/nvptxintrin.h b/clang/lib/Headers/nvptxintrin.h index d00a5f6de3950..8e6b0de72aae1 100644 --- a/clang/lib/Headers/nvptxintrin.h +++ b/clang/lib/Headers/nvptxintrin.h @@ -13,18 +13,14 @@ #error "This file is intended for NVPTX targets or offloading to NVPTX" #endif -#ifndef __CUDA_ARCH__ -#define __CUDA_ARCH__ 0 +#ifndef __GPUINTRIN_H +#error "Never use <nvptxintrin.h> directly; include <gpuintrin.h> instead" #endif -#include <stdint.h> - -#if !defined(__cplusplus) -_Pragma("push_macro(\"bool\")"); -#define bool _Bool +#ifndef __CUDA_ARCH__ +#define __CUDA_ARCH__ 0 #endif -_Pragma("omp begin declare tar