[clang] [Headers][NFC] Steps to allow sharing code between gpu intrin.h headers (PR #131134)

2025-03-13 Thread Jon Chesterfield via cfe-commits

https://github.com/JonChesterfield updated 
https://github.com/llvm/llvm-project/pull/131134

>From 7347ebc6a0aadd1b9676e329bdf7705dbfae7875 Mon Sep 17 00:00:00 2001
From: Jon Chesterfield 
Date: Thu, 13 Mar 2025 12:49:42 +0000
Subject: [PATCH] [libc][nfc] Steps to allow sharing code between gpu intrin.h
 headers

---
 clang/lib/Headers/amdgpuintrin.h | 22 ++
 clang/lib/Headers/gpuintrin.h| 26 +++---
 clang/lib/Headers/nvptxintrin.h  | 27 ---
 3 files changed, 25 insertions(+), 50 deletions(-)

diff --git a/clang/lib/Headers/amdgpuintrin.h b/clang/lib/Headers/amdgpuintrin.h
index 839a05175cf3e..56748f6c3e818 100644
--- a/clang/lib/Headers/amdgpuintrin.h
+++ b/clang/lib/Headers/amdgpuintrin.h
@@ -13,11 +13,8 @@
 #error "This file is intended for AMDGPU targets or offloading to AMDGPU"
 #endif
 
-#include <stdint.h>
-
-#if !defined(__cplusplus)
-_Pragma("push_macro(\"bool\")");
-#define bool _Bool
+#ifndef __GPUINTRIN_H
+#error "Never use <amdgpuintrin.h> directly; include <gpuintrin.h> instead"
 #endif
 
 _Pragma("omp begin declare target device_type(nohost)");
@@ -146,17 +143,6 @@ __gpu_shuffle_idx_u32(uint64_t __lane_mask, uint32_t 
__idx, uint32_t __x,
   return __builtin_amdgcn_ds_bpermute(__lane << 2, __x);
 }
 
-// Shuffles the the lanes inside the wavefront according to the given index.
-_DEFAULT_FN_ATTRS static __inline__ uint64_t
-__gpu_shuffle_idx_u64(uint64_t __lane_mask, uint32_t __idx, uint64_t __x,
-  uint32_t __width) {
-  uint32_t __hi = (uint32_t)(__x >> 32ull);
-  uint32_t __lo = (uint32_t)(__x & 0xFFFFFFFF);
-  return ((uint64_t)__gpu_shuffle_idx_u32(__lane_mask, __idx, __hi, __width)
-  << 32ull) |
- ((uint64_t)__gpu_shuffle_idx_u32(__lane_mask, __idx, __lo, __width));
-}
-
 // Returns a bitmask marking all lanes that have the same value of __x.
 _DEFAULT_FN_ATTRS static __inline__ uint64_t
 __gpu_match_any_u32(uint64_t __lane_mask, uint32_t __x) {
@@ -238,8 +224,4 @@ _DEFAULT_FN_ATTRS static __inline__ void 
__gpu_thread_suspend(void) {
 _Pragma("omp end declare variant");
 _Pragma("omp end declare target");
 
-#if !defined(__cplusplus)
-_Pragma("pop_macro(\"bool\")");
-#endif
-
 #endif // __AMDGPUINTRIN_H
diff --git a/clang/lib/Headers/gpuintrin.h b/clang/lib/Headers/gpuintrin.h
index 4181628d18048..ac79d685337c5 100644
--- a/clang/lib/Headers/gpuintrin.h
+++ b/clang/lib/Headers/gpuintrin.h
@@ -25,6 +25,13 @@
 #endif
 #endif
 
+#include <stdint.h>
+
+#if !defined(__cplusplus)
+_Pragma("push_macro(\"bool\")");
+#define bool _Bool
+#endif
+
 #if defined(__NVPTX__)
 #include <nvptxintrin.h>
 #elif defined(__AMDGPU__)
@@ -33,13 +40,6 @@
 #error "This header is only meant to be used on GPU architectures."
 #endif
 
-#include <stdint.h>
-
-#if !defined(__cplusplus)
-_Pragma("push_macro(\"bool\")");
-#define bool _Bool
-#endif
-
 _Pragma("omp begin declare target device_type(nohost)");
 _Pragma("omp begin declare variant match(device = {kind(gpu)})");
 
@@ -141,6 +141,18 @@ __gpu_read_first_lane_f64(uint64_t __lane_mask, double 
__x) {
 __builtin_bit_cast(uint64_t, __x)));
 }
 
+// Shuffles the the lanes according to the given index.
+_DEFAULT_FN_ATTRS static __inline__ uint64_t
+__gpu_shuffle_idx_u64(uint64_t __lane_mask, uint32_t __idx, uint64_t __x,
+  uint32_t __width) {
+  uint32_t __hi = (uint32_t)(__x >> 32ull);
+  uint32_t __lo = (uint32_t)(__x & 0xFFFFFFFF);
+  uint32_t __mask = (uint32_t)__lane_mask;
+  return ((uint64_t)__gpu_shuffle_idx_u32(__mask, __idx, __hi, __width)
+  << 32ull) |
+ ((uint64_t)__gpu_shuffle_idx_u32(__mask, __idx, __lo, __width));
+}
+
 // Shuffles the the lanes according to the given index.
 _DEFAULT_FN_ATTRS static __inline__ float
 __gpu_shuffle_idx_f32(uint64_t __lane_mask, uint32_t __idx, float __x,
diff --git a/clang/lib/Headers/nvptxintrin.h b/clang/lib/Headers/nvptxintrin.h
index d00a5f6de3950..10ad7a682d4cd 100644
--- a/clang/lib/Headers/nvptxintrin.h
+++ b/clang/lib/Headers/nvptxintrin.h
@@ -13,15 +13,12 @@
 #error "This file is intended for NVPTX targets or offloading to NVPTX"
 #endif
 
-#ifndef __CUDA_ARCH__
-#define __CUDA_ARCH__ 0
+#ifndef __GPUINTRIN_H
+#error "Never use <nvptxintrin.h> directly; include <gpuintrin.h> instead"
 #endif
 
-#include <stdint.h>
-
-#if !defined(__cplusplus)
-_Pragma("push_macro(\"bool\")");
-#define bool _Bool
+#ifndef __CUDA_ARCH__
+#define __CUDA_ARCH__ 0
 #endif
 
 _Pragma("omp begin declare target device_type(nohost)");
@@ -153,18 +150,6 @@ __gpu_shuffle_idx_u32(uint64_t __lane_mask, uint32_t 
__idx, uint32_t __x,
   ((__gpu_num_lanes() - __width) << 8u) | 
0x1f);
 }
 
-// Shuffles the the lanes inside the warp according to the given index.
-_DEFAULT_FN_ATTRS static __inline__ uint64_t
-__gpu_shuffle_idx_u64(uint64_t __lane_mask, uint32_t __idx, uint64_t __x,
-  uint32_t __width) {
-  uint32_t __hi = (uint32_t)(__x >> 32ull);
-  uint32_t __lo = (uint32_t)(__x & 0xFFFFFFFF);
-  uint32_t __mask =

[clang] [Headers][NFC] Steps to allow sharing code between gpu intrin.h headers (PR #131134)

2025-03-13 Thread Jon Chesterfield via cfe-commits

https://github.com/JonChesterfield closed 
https://github.com/llvm/llvm-project/pull/131134
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [Headers][NFC] Steps to allow sharing code between gpu intrin.h headers (PR #131134)

2025-03-13 Thread Joseph Huber via cfe-commits

https://github.com/jhuber6 edited 
https://github.com/llvm/llvm-project/pull/131134
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [Headers][NFC] Steps to allow sharing code between gpu intrin.h headers (PR #131134)

2025-03-13 Thread Joseph Huber via cfe-commits

https://github.com/jhuber6 approved this pull request.

LG

https://github.com/llvm/llvm-project/pull/131134
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [Headers][NFC] Steps to allow sharing code between gpu intrin.h headers (PR #131134)

2025-03-13 Thread Jon Chesterfield via cfe-commits

https://github.com/JonChesterfield edited 
https://github.com/llvm/llvm-project/pull/131134
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [Headers][NFC] Steps to allow sharing code between gpu intrin.h headers (PR #131134)

2025-03-13 Thread Jon Chesterfield via cfe-commits

https://github.com/JonChesterfield edited 
https://github.com/llvm/llvm-project/pull/131134
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [Headers][NFC] Steps to allow sharing code between gpu intrin.h headers (PR #131134)

2025-03-13 Thread Jon Chesterfield via cfe-commits

https://github.com/JonChesterfield updated 
https://github.com/llvm/llvm-project/pull/131134

>From 4c04f6979409642eb6bc9dc3c48b5e3636210ef0 Mon Sep 17 00:00:00 2001
From: Jon Chesterfield 
Date: Thu, 13 Mar 2025 12:49:42 +0000
Subject: [PATCH] [libc][nfc] Steps to allow sharing code between gpu intrin.h
 headers

---
 clang/lib/Headers/amdgpuintrin.h | 24 ++--
 clang/lib/Headers/gpuintrin.h| 32 +++-
 clang/lib/Headers/nvptxintrin.h  | 29 -
 3 files changed, 29 insertions(+), 56 deletions(-)

diff --git a/clang/lib/Headers/amdgpuintrin.h b/clang/lib/Headers/amdgpuintrin.h
index 839a05175cf3e..6a2b46b0c511e 100644
--- a/clang/lib/Headers/amdgpuintrin.h
+++ b/clang/lib/Headers/amdgpuintrin.h
@@ -13,14 +13,10 @@
 #error "This file is intended for AMDGPU targets or offloading to AMDGPU"
 #endif
 
-#include <stdint.h>
-
-#if !defined(__cplusplus)
-_Pragma("push_macro(\"bool\")");
-#define bool _Bool
+#ifndef __GPUINTRIN_H
+#error "Never use <amdgpuintrin.h> directly; include <gpuintrin.h> instead"
 #endif
 
-_Pragma("omp begin declare target device_type(nohost)");
 _Pragma("omp begin declare variant match(device = {arch(amdgcn)})");
 
 // Type aliases to the address spaces used by the AMDGPU backend.
@@ -146,17 +142,6 @@ __gpu_shuffle_idx_u32(uint64_t __lane_mask, uint32_t 
__idx, uint32_t __x,
   return __builtin_amdgcn_ds_bpermute(__lane << 2, __x);
 }
 
-// Shuffles the the lanes inside the wavefront according to the given index.
-_DEFAULT_FN_ATTRS static __inline__ uint64_t
-__gpu_shuffle_idx_u64(uint64_t __lane_mask, uint32_t __idx, uint64_t __x,
-  uint32_t __width) {
-  uint32_t __hi = (uint32_t)(__x >> 32ull);
-  uint32_t __lo = (uint32_t)(__x & 0xFFFFFFFF);
-  return ((uint64_t)__gpu_shuffle_idx_u32(__lane_mask, __idx, __hi, __width)
-  << 32ull) |
- ((uint64_t)__gpu_shuffle_idx_u32(__lane_mask, __idx, __lo, __width));
-}
-
 // Returns a bitmask marking all lanes that have the same value of __x.
 _DEFAULT_FN_ATTRS static __inline__ uint64_t
 __gpu_match_any_u32(uint64_t __lane_mask, uint32_t __x) {
@@ -236,10 +221,5 @@ _DEFAULT_FN_ATTRS static __inline__ void 
__gpu_thread_suspend(void) {
 }
 
 _Pragma("omp end declare variant");
-_Pragma("omp end declare target");
-
-#if !defined(__cplusplus)
-_Pragma("pop_macro(\"bool\")");
-#endif
 
 #endif // __AMDGPUINTRIN_H
diff --git a/clang/lib/Headers/gpuintrin.h b/clang/lib/Headers/gpuintrin.h
index 4181628d18048..d369e60b2fcac 100644
--- a/clang/lib/Headers/gpuintrin.h
+++ b/clang/lib/Headers/gpuintrin.h
@@ -25,14 +25,6 @@
 #endif
 #endif
 
-#if defined(__NVPTX__)
-#include <nvptxintrin.h>
-#elif defined(__AMDGPU__)
-#include <amdgpuintrin.h>
-#elif !defined(_OPENMP)
-#error "This header is only meant to be used on GPU architectures."
-#endif
-
 #include <stdint.h>
 
 #if !defined(__cplusplus)
@@ -41,6 +33,15 @@ _Pragma("push_macro(\"bool\")");
 #endif
 
 _Pragma("omp begin declare target device_type(nohost)");
+
+#if defined(__NVPTX__)
+#include <nvptxintrin.h>
+#elif defined(__AMDGPU__)
+#include <amdgpuintrin.h>
+#elif !defined(_OPENMP)
+#error "This header is only meant to be used on GPU architectures."
+#endif
+
 _Pragma("omp begin declare variant match(device = {kind(gpu)})");
 
 #define __GPU_X_DIM 0
@@ -141,6 +142,18 @@ __gpu_read_first_lane_f64(uint64_t __lane_mask, double 
__x) {
 __builtin_bit_cast(uint64_t, __x)));
 }
 
+// Shuffles the the lanes according to the given index.
+_DEFAULT_FN_ATTRS static __inline__ uint64_t
+__gpu_shuffle_idx_u64(uint64_t __lane_mask, uint32_t __idx, uint64_t __x,
+  uint32_t __width) {
+  uint32_t __hi = (uint32_t)(__x >> 32ull);
+  uint32_t __lo = (uint32_t)(__x & 0xFFFFFFFF);
+  uint32_t __mask = (uint32_t)__lane_mask;
+  return ((uint64_t)__gpu_shuffle_idx_u32(__mask, __idx, __hi, __width)
+  << 32ull) |
+ ((uint64_t)__gpu_shuffle_idx_u32(__mask, __idx, __lo, __width));
+}
+
 // Shuffles the the lanes according to the given index.
 _DEFAULT_FN_ATTRS static __inline__ float
 __gpu_shuffle_idx_f32(uint64_t __lane_mask, uint32_t __idx, float __x,
@@ -223,7 +236,8 @@ __DO_LANE_SUM(double, f64);   // double 
__gpu_lane_sum_f64(m, x)
 #undef __DO_LANE_SUM
 
 _Pragma("omp end declare variant");
-_Pragma("omp end declare target");
+
+_Pragma("omp end declare target"); // nohost
 
 #if !defined(__cplusplus)
 _Pragma("pop_macro(\"bool\")");
diff --git a/clang/lib/Headers/nvptxintrin.h b/clang/lib/Headers/nvptxintrin.h
index d00a5f6de3950..8e6b0de72aae1 100644
--- a/clang/lib/Headers/nvptxintrin.h
+++ b/clang/lib/Headers/nvptxintrin.h
@@ -13,18 +13,14 @@
 #error "This file is intended for NVPTX targets or offloading to NVPTX"
 #endif
 
-#ifndef __CUDA_ARCH__
-#define __CUDA_ARCH__ 0
+#ifndef __GPUINTRIN_H
+#error "Never use <nvptxintrin.h> directly; include <gpuintrin.h> instead"
 #endif
 
-#include <stdint.h>
-
-#if !defined(__cplusplus)
-_Pragma("push_macro(\"bool\")");
-#define bool _Bool
+#ifndef __CUDA_ARCH__
+#define __CUDA_ARCH__ 0
 #endif
 
-_Pragma("omp begin declare tar