[clang] [Headers][NFC] Deduplicate gpu_match_ between targets via inlining (PR #131141)

2025-03-13 Thread Joseph Huber via cfe-commits


@@ -32,6 +32,31 @@ _Pragma("push_macro(\"bool\")");
 #define bool _Bool
 #endif
 
+_Pragma("omp begin declare target device_type(nohost)");
+_Pragma("omp begin declare variant match(device = {kind(gpu)})");
+
+// Forward declare a few functions for the implementation header.
+
+// Returns a bitmask marking all lanes that have the same value of __x.
+_DEFAULT_FN_ATTRS static __inline__ uint64_t
+__gpu_fallback_match_any_u32(uint64_t __lane_mask, uint32_t __x);

jhuber6 wrote:

```suggestion
__gpu_match_any_u32_impl(uint64_t __lane_mask, uint32_t __x);
```

https://github.com/llvm/llvm-project/pull/131141
___
cfe-commits mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [Headers][NFC] Deduplicate gpu_match_ between targets via inlining (PR #131141)

2025-03-13 Thread LLVM Continuous Integration via cfe-commits

llvm-ci wrote:

LLVM Buildbot has detected a new failure on builder `openmp-s390x-linux` 
running on `systemz-1` while building `clang` at step 6 "test-openmp".

Full details are available at: 
https://lab.llvm.org/buildbot/#/builders/88/builds/9091


Here is the relevant piece of the build log for the reference

```
Step 6 (test-openmp) failure: test (failure)
 TEST 'libomp :: tasking/issue-94260-2.c' FAILED 

Exit Code: -11

Command Output (stdout):
--
# RUN: at line 1
/home/uweigand/sandbox/buildbot/openmp-s390x-linux/llvm.build/./bin/clang 
-fopenmp   -I 
/home/uweigand/sandbox/buildbot/openmp-s390x-linux/llvm.build/runtimes/runtimes-bins/openmp/runtime/src
 -I 
/home/uweigand/sandbox/buildbot/openmp-s390x-linux/llvm.src/openmp/runtime/test 
-L 
/home/uweigand/sandbox/buildbot/openmp-s390x-linux/llvm.build/runtimes/runtimes-bins/openmp/runtime/src
  -fno-omit-frame-pointer -mbackchain -I 
/home/uweigand/sandbox/buildbot/openmp-s390x-linux/llvm.src/openmp/runtime/test/ompt
 
/home/uweigand/sandbox/buildbot/openmp-s390x-linux/llvm.src/openmp/runtime/test/tasking/issue-94260-2.c
 -o 
/home/uweigand/sandbox/buildbot/openmp-s390x-linux/llvm.build/runtimes/runtimes-bins/openmp/runtime/test/tasking/Output/issue-94260-2.c.tmp
 -lm -latomic && 
/home/uweigand/sandbox/buildbot/openmp-s390x-linux/llvm.build/runtimes/runtimes-bins/openmp/runtime/test/tasking/Output/issue-94260-2.c.tmp
# executed command: 
/home/uweigand/sandbox/buildbot/openmp-s390x-linux/llvm.build/./bin/clang 
-fopenmp -I 
/home/uweigand/sandbox/buildbot/openmp-s390x-linux/llvm.build/runtimes/runtimes-bins/openmp/runtime/src
 -I 
/home/uweigand/sandbox/buildbot/openmp-s390x-linux/llvm.src/openmp/runtime/test 
-L 
/home/uweigand/sandbox/buildbot/openmp-s390x-linux/llvm.build/runtimes/runtimes-bins/openmp/runtime/src
 -fno-omit-frame-pointer -mbackchain -I 
/home/uweigand/sandbox/buildbot/openmp-s390x-linux/llvm.src/openmp/runtime/test/ompt
 
/home/uweigand/sandbox/buildbot/openmp-s390x-linux/llvm.src/openmp/runtime/test/tasking/issue-94260-2.c
 -o 
/home/uweigand/sandbox/buildbot/openmp-s390x-linux/llvm.build/runtimes/runtimes-bins/openmp/runtime/test/tasking/Output/issue-94260-2.c.tmp
 -lm -latomic
# executed command: 
/home/uweigand/sandbox/buildbot/openmp-s390x-linux/llvm.build/runtimes/runtimes-bins/openmp/runtime/test/tasking/Output/issue-94260-2.c.tmp
# note: command had no output on stdout or stderr
# error: command failed with exit status: -11

--




```



https://github.com/llvm/llvm-project/pull/131141
___
cfe-commits mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [Headers][NFC] Deduplicate gpu_match_ between targets via inlining (PR #131141)

2025-03-13 Thread Jon Chesterfield via cfe-commits

https://github.com/JonChesterfield updated 
https://github.com/llvm/llvm-project/pull/131141

>From fbeb177a750ca671a9cff9f37f57e58c6900e7fd Mon Sep 17 00:00:00 2001
From: Jon Chesterfield 
Date: Thu, 13 Mar 2025 13:23:38 +
Subject: [PATCH] [Headers][NFC] Deduplicate gpu_match_ between targets via
 inlining

---
 clang/lib/Headers/amdgpuintrin.h | 44 ++---
 clang/lib/Headers/gpuintrin.h| 82 +++-
 clang/lib/Headers/nvptxintrin.h  | 48 ---
 3 files changed, 93 insertions(+), 81 deletions(-)

diff --git a/clang/lib/Headers/amdgpuintrin.h b/clang/lib/Headers/amdgpuintrin.h
index 56748f6c3e818..f7fb8e2814180 100644
--- a/clang/lib/Headers/amdgpuintrin.h
+++ b/clang/lib/Headers/amdgpuintrin.h
@@ -30,10 +30,6 @@ _Pragma("omp begin declare variant match(device = 
{arch(amdgcn)})");
 // Attribute to declare a function as a kernel.
 #define __gpu_kernel __attribute__((amdgpu_kernel, visibility("protected")))
 
-// Defined in gpuintrin.h, used later in this file.
-_DEFAULT_FN_ATTRS static __inline__ uint64_t
-__gpu_read_first_lane_u64(uint64_t __lane_mask, uint64_t __x);
-
 // Returns the number of workgroups in the 'x' dimension of the grid.
 _DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_num_blocks_x(void) {
   return __builtin_amdgcn_grid_size_x() / __builtin_amdgcn_workgroup_size_x();
@@ -146,57 +142,25 @@ __gpu_shuffle_idx_u32(uint64_t __lane_mask, uint32_t 
__idx, uint32_t __x,
 // Returns a bitmask marking all lanes that have the same value of __x.
 _DEFAULT_FN_ATTRS static __inline__ uint64_t
 __gpu_match_any_u32(uint64_t __lane_mask, uint32_t __x) {
-  uint32_t __match_mask = 0;
-
-  bool __done = 0;
-  while (__gpu_ballot(__lane_mask, !__done)) {
-if (!__done) {
-  uint32_t __first = __gpu_read_first_lane_u32(__lane_mask, __x);
-  if (__first == __x) {
-__match_mask = __gpu_lane_mask();
-__done = 1;
-  }
-}
-  }
-  __gpu_sync_lane(__lane_mask);
-  return __match_mask;
+  return __gpu_match_any_u32_impl(__lane_mask, __x);
 }
 
 // Returns a bitmask marking all lanes that have the same value of __x.
 _DEFAULT_FN_ATTRS static __inline__ uint64_t
 __gpu_match_any_u64(uint64_t __lane_mask, uint64_t __x) {
-  uint64_t __match_mask = 0;
-
-  bool __done = 0;
-  while (__gpu_ballot(__lane_mask, !__done)) {
-if (!__done) {
-  uint64_t __first = __gpu_read_first_lane_u64(__lane_mask, __x);
-  if (__first == __x) {
-__match_mask = __gpu_lane_mask();
-__done = 1;
-  }
-}
-  }
-  __gpu_sync_lane(__lane_mask);
-  return __match_mask;
+  return __gpu_match_any_u64_impl(__lane_mask, __x);
 }
 
 // Returns the current lane mask if every lane contains __x.
 _DEFAULT_FN_ATTRS static __inline__ uint64_t
 __gpu_match_all_u32(uint64_t __lane_mask, uint32_t __x) {
-  uint32_t __first = __gpu_read_first_lane_u32(__lane_mask, __x);
-  uint64_t __ballot = __gpu_ballot(__lane_mask, __x == __first);
-  __gpu_sync_lane(__lane_mask);
-  return __ballot == __gpu_lane_mask() ? __gpu_lane_mask() : 0ull;
+  return __gpu_match_all_u32_impl(__lane_mask, __x);
 }
 
 // Returns the current lane mask if every lane contains __x.
 _DEFAULT_FN_ATTRS static __inline__ uint64_t
 __gpu_match_all_u64(uint64_t __lane_mask, uint64_t __x) {
-  uint64_t __first = __gpu_read_first_lane_u64(__lane_mask, __x);
-  uint64_t __ballot = __gpu_ballot(__lane_mask, __x == __first);
-  __gpu_sync_lane(__lane_mask);
-  return __ballot == __gpu_lane_mask() ? __gpu_lane_mask() : 0ull;
+  return __gpu_match_all_u64_impl(__lane_mask, __x);
 }
 
 // Returns true if the flat pointer points to AMDGPU 'shared' memory.
diff --git a/clang/lib/Headers/gpuintrin.h b/clang/lib/Headers/gpuintrin.h
index ac79d685337c5..0fb3916acac61 100644
--- a/clang/lib/Headers/gpuintrin.h
+++ b/clang/lib/Headers/gpuintrin.h
@@ -32,6 +32,30 @@ _Pragma("push_macro(\"bool\")");
 #define bool _Bool
 #endif
 
+_Pragma("omp begin declare target device_type(nohost)");
+_Pragma("omp begin declare variant match(device = {kind(gpu)})");
+
+// Forward declare a few functions for the implementation header.
+
+// Returns a bitmask marking all lanes that have the same value of __x.
+_DEFAULT_FN_ATTRS static __inline__ uint64_t
+__gpu_match_any_u32_impl(uint64_t __lane_mask, uint32_t __x);
+
+// Returns a bitmask marking all lanes that have the same value of __x.
+_DEFAULT_FN_ATTRS static __inline__ uint64_t
+__gpu_match_any_u64_impl(uint64_t __lane_mask, uint64_t __x);
+
+// Returns the current lane mask if every lane contains __x.
+_DEFAULT_FN_ATTRS static __inline__ uint64_t
+__gpu_match_all_u32_impl(uint64_t __lane_mask, uint32_t __x);
+
+// Returns the current lane mask if every lane contains __x.
+_DEFAULT_FN_ATTRS static __inline__ uint64_t
+__gpu_match_all_u64_impl(uint64_t __lane_mask, uint64_t __x);
+
+_Pragma("omp end declare variant");
+_Pragma("omp end declare target");
+
 #if defined(__NVPTX__)
 #include 
 #elif defined(__AMDGPU__)
@@ -115,7 +139,7 @@ __gpu_i

[clang] [Headers][NFC] Deduplicate gpu_match_ between targets via inlining (PR #131141)

2025-03-13 Thread Jon Chesterfield via cfe-commits

https://github.com/JonChesterfield closed 
https://github.com/llvm/llvm-project/pull/131141
___
cfe-commits mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [Headers][NFC] Deduplicate gpu_match_ between targets via inlining (PR #131141)

2025-03-13 Thread Jon Chesterfield via cfe-commits


@@ -32,6 +32,31 @@ _Pragma("push_macro(\"bool\")");
 #define bool _Bool
 #endif
 
+_Pragma("omp begin declare target device_type(nohost)");
+_Pragma("omp begin declare variant match(device = {kind(gpu)})");
+
+// Forward declare a few functions for the implementation header.
+
+// Returns a bitmask marking all lanes that have the same value of __x.
+_DEFAULT_FN_ATTRS static __inline__ uint64_t
+__gpu_fallback_match_any_u32(uint64_t __lane_mask, uint32_t __x);

JonChesterfield wrote:

Sure, done

https://github.com/llvm/llvm-project/pull/131141
___
cfe-commits mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [Headers][NFC] Deduplicate gpu_match_ between targets via inlining (PR #131141)

2025-03-13 Thread Joseph Huber via cfe-commits

https://github.com/jhuber6 approved this pull request.

LG

https://github.com/llvm/llvm-project/pull/131141
___
cfe-commits mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [Headers][NFC] Deduplicate gpu_match_ between targets via inlining (PR #131141)

2025-03-13 Thread Jon Chesterfield via cfe-commits

https://github.com/JonChesterfield updated 
https://github.com/llvm/llvm-project/pull/131141

>From 5e55b829eb3c7f4a4e674333cdde73b5bfe970f8 Mon Sep 17 00:00:00 2001
From: Jon Chesterfield 
Date: Thu, 13 Mar 2025 13:23:38 +
Subject: [PATCH] [Headers][NFC] Deduplicate gpu_match_ between targets via
 inlining

---
 clang/lib/Headers/amdgpuintrin.h | 44 ++---
 clang/lib/Headers/gpuintrin.h| 83 +++-
 clang/lib/Headers/nvptxintrin.h  | 48 +++---
 3 files changed, 94 insertions(+), 81 deletions(-)

diff --git a/clang/lib/Headers/amdgpuintrin.h b/clang/lib/Headers/amdgpuintrin.h
index 56748f6c3e818..5e7f9b967bd17 100644
--- a/clang/lib/Headers/amdgpuintrin.h
+++ b/clang/lib/Headers/amdgpuintrin.h
@@ -30,10 +30,6 @@ _Pragma("omp begin declare variant match(device = 
{arch(amdgcn)})");
 // Attribute to declare a function as a kernel.
 #define __gpu_kernel __attribute__((amdgpu_kernel, visibility("protected")))
 
-// Defined in gpuintrin.h, used later in this file.
-_DEFAULT_FN_ATTRS static __inline__ uint64_t
-__gpu_read_first_lane_u64(uint64_t __lane_mask, uint64_t __x);
-
 // Returns the number of workgroups in the 'x' dimension of the grid.
 _DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_num_blocks_x(void) {
   return __builtin_amdgcn_grid_size_x() / __builtin_amdgcn_workgroup_size_x();
@@ -146,57 +142,25 @@ __gpu_shuffle_idx_u32(uint64_t __lane_mask, uint32_t 
__idx, uint32_t __x,
 // Returns a bitmask marking all lanes that have the same value of __x.
 _DEFAULT_FN_ATTRS static __inline__ uint64_t
 __gpu_match_any_u32(uint64_t __lane_mask, uint32_t __x) {
-  uint32_t __match_mask = 0;
-
-  bool __done = 0;
-  while (__gpu_ballot(__lane_mask, !__done)) {
-if (!__done) {
-  uint32_t __first = __gpu_read_first_lane_u32(__lane_mask, __x);
-  if (__first == __x) {
-__match_mask = __gpu_lane_mask();
-__done = 1;
-  }
-}
-  }
-  __gpu_sync_lane(__lane_mask);
-  return __match_mask;
+  return __gpu_fallback_match_any_u32(__lane_mask, __x);
 }
 
 // Returns a bitmask marking all lanes that have the same value of __x.
 _DEFAULT_FN_ATTRS static __inline__ uint64_t
 __gpu_match_any_u64(uint64_t __lane_mask, uint64_t __x) {
-  uint64_t __match_mask = 0;
-
-  bool __done = 0;
-  while (__gpu_ballot(__lane_mask, !__done)) {
-if (!__done) {
-  uint64_t __first = __gpu_read_first_lane_u64(__lane_mask, __x);
-  if (__first == __x) {
-__match_mask = __gpu_lane_mask();
-__done = 1;
-  }
-}
-  }
-  __gpu_sync_lane(__lane_mask);
-  return __match_mask;
+  return __gpu_fallback_match_any_u64(__lane_mask, __x);
 }
 
 // Returns the current lane mask if every lane contains __x.
 _DEFAULT_FN_ATTRS static __inline__ uint64_t
 __gpu_match_all_u32(uint64_t __lane_mask, uint32_t __x) {
-  uint32_t __first = __gpu_read_first_lane_u32(__lane_mask, __x);
-  uint64_t __ballot = __gpu_ballot(__lane_mask, __x == __first);
-  __gpu_sync_lane(__lane_mask);
-  return __ballot == __gpu_lane_mask() ? __gpu_lane_mask() : 0ull;
+  return __gpu_fallback_match_all_u32(__lane_mask, __x);
 }
 
 // Returns the current lane mask if every lane contains __x.
 _DEFAULT_FN_ATTRS static __inline__ uint64_t
 __gpu_match_all_u64(uint64_t __lane_mask, uint64_t __x) {
-  uint64_t __first = __gpu_read_first_lane_u64(__lane_mask, __x);
-  uint64_t __ballot = __gpu_ballot(__lane_mask, __x == __first);
-  __gpu_sync_lane(__lane_mask);
-  return __ballot == __gpu_lane_mask() ? __gpu_lane_mask() : 0ull;
+  return __gpu_fallback_match_all_u64(__lane_mask, __x);
 }
 
 // Returns true if the flat pointer points to AMDGPU 'shared' memory.
diff --git a/clang/lib/Headers/gpuintrin.h b/clang/lib/Headers/gpuintrin.h
index ac79d685337c5..f231f3c519a34 100644
--- a/clang/lib/Headers/gpuintrin.h
+++ b/clang/lib/Headers/gpuintrin.h
@@ -32,6 +32,31 @@ _Pragma("push_macro(\"bool\")");
 #define bool _Bool
 #endif
 
+_Pragma("omp begin declare target device_type(nohost)");
+_Pragma("omp begin declare variant match(device = {kind(gpu)})");
+
+// Forward declare a few functions for the implementation header.
+
+// Returns a bitmask marking all lanes that have the same value of __x.
+_DEFAULT_FN_ATTRS static __inline__ uint64_t
+__gpu_fallback_match_any_u32(uint64_t __lane_mask, uint32_t __x);
+
+// Returns a bitmask marking all lanes that have the same value of __x.
+_DEFAULT_FN_ATTRS static __inline__ uint64_t
+__gpu_fallback_match_any_u64(uint64_t __lane_mask, uint64_t __x);
+
+// Returns the current lane mask if every lane contains __x.
+_DEFAULT_FN_ATTRS static __inline__ uint64_t
+__gpu_fallback_match_all_u32(uint64_t __lane_mask, uint32_t __x);
+
+// Returns the current lane mask if every lane contains __x.
+_DEFAULT_FN_ATTRS static __inline__ uint64_t
+__gpu_fallback_match_all_u64(uint64_t __lane_mask, uint64_t __x);
+
+
+_Pragma("omp end declare variant");
+_Pragma("omp end declare target");
+
 #if defined(__NVPTX__)
 #include 
 #elif defined(__AMDG

[clang] [Headers][NFC] Deduplicate gpu_match_ between targets via inlining (PR #131141)

2025-03-13 Thread Jon Chesterfield via cfe-commits

https://github.com/JonChesterfield edited 
https://github.com/llvm/llvm-project/pull/131141
___
cfe-commits mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [Headers][NFC] Deduplicate gpu_match_ between targets via inlining (PR #131141)

2025-03-13 Thread Jon Chesterfield via cfe-commits

https://github.com/JonChesterfield edited 
https://github.com/llvm/llvm-project/pull/131141
___
cfe-commits mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits