[clang] [libc][nfc] Use common implementation of read_first_lane_u64 (PR #131027)

2025-03-12 Thread LLVM Continuous Integration via cfe-commits

llvm-ci wrote:

LLVM Buildbot has detected a new failure on builder `openmp-s390x-linux` 
running on `systemz-1` while building `clang` at step 2 "checkout".

Full details are available at: 
https://lab.llvm.org/buildbot/#/builders/88/builds/9054


Here is the relevant piece of the build log for the reference

```
Step 2 (checkout) failure: update (failure)
git version 2.34.1
fatal: unable to access 'https://github.com/llvm/llvm-project.git/': Failed to 
connect to github.com port 443 after 133598 ms: Connection timed out
fatal: unable to access 'https://github.com/llvm/llvm-project.git/': GnuTLS 
recv error (-110): The TLS connection was non-properly terminated.

```



https://github.com/llvm/llvm-project/pull/131027
___
cfe-commits mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [libc][nfc] Use common implementation of read_first_lane_u64 (PR #131027)

2025-03-12 Thread Jon Chesterfield via cfe-commits

https://github.com/JonChesterfield closed 
https://github.com/llvm/llvm-project/pull/131027
___
cfe-commits mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [libc][nfc] Use common implementation of read_first_lane_u64 (PR #131027)

2025-03-12 Thread Jon Chesterfield via cfe-commits

https://github.com/JonChesterfield updated 
https://github.com/llvm/llvm-project/pull/131027

>From 68f09d0f3f7849b91cb39ce42ba48e3e4aafb488 Mon Sep 17 00:00:00 2001
From: Jon Chesterfield 
Date: Wed, 12 Mar 2025 20:31:39 +0000
Subject: [PATCH] [libc][nfc] Use common implementation of read_first_lane_u64,
 no codegen regression

---
 clang/lib/Headers/amdgpuintrin.h | 15 ++
 clang/lib/Headers/gpuintrin.h| 10 
 clang/lib/Headers/nvptxintrin.h  | 21 ++--
 clang/test/Headers/gpuintrin.c   | 87 +---
 4 files changed, 99 insertions(+), 34 deletions(-)

diff --git a/clang/lib/Headers/amdgpuintrin.h b/clang/lib/Headers/amdgpuintrin.h
index 15409eacf7716..839a05175cf3e 100644
--- a/clang/lib/Headers/amdgpuintrin.h
+++ b/clang/lib/Headers/amdgpuintrin.h
@@ -33,6 +33,10 @@ _Pragma("omp begin declare variant match(device = 
{arch(amdgcn)})");
 // Attribute to declare a function as a kernel.
 #define __gpu_kernel __attribute__((amdgpu_kernel, visibility("protected")))
 
+// Defined in gpuintrin.h, used later in this file.
+_DEFAULT_FN_ATTRS static __inline__ uint64_t
+__gpu_read_first_lane_u64(uint64_t __lane_mask, uint64_t __x);
+
 // Returns the number of workgroups in the 'x' dimension of the grid.
 _DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_num_blocks_x(void) {
   return __builtin_amdgcn_grid_size_x() / __builtin_amdgcn_workgroup_size_x();
@@ -115,15 +119,6 @@ __gpu_read_first_lane_u32(uint64_t __lane_mask, uint32_t 
__x) {
   return __builtin_amdgcn_readfirstlane(__x);
 }
 
-// Copies the value from the first active thread in the wavefront to the rest.
-_DEFAULT_FN_ATTRS __inline__ uint64_t
-__gpu_read_first_lane_u64(uint64_t __lane_mask, uint64_t __x) {
-  uint32_t __hi = (uint32_t)(__x >> 32ull);
-  uint32_t __lo = (uint32_t)(__x & 0xFFFFFFFF);
-  return ((uint64_t)__builtin_amdgcn_readfirstlane(__hi) << 32ull) |
- ((uint64_t)__builtin_amdgcn_readfirstlane(__lo) & 0xFFFFFFFF);
-}
-
 // Returns a bitmask of threads in the current lane for which \p x is true.
 _DEFAULT_FN_ATTRS static __inline__ uint64_t __gpu_ballot(uint64_t __lane_mask,
   bool __x) {
@@ -203,7 +198,7 @@ __gpu_match_any_u64(uint64_t __lane_mask, uint64_t __x) {
 // Returns the current lane mask if every lane contains __x.
 _DEFAULT_FN_ATTRS static __inline__ uint64_t
 __gpu_match_all_u32(uint64_t __lane_mask, uint32_t __x) {
-  uint32_t __first = __gpu_read_first_lane_u64(__lane_mask, __x);
+  uint32_t __first = __gpu_read_first_lane_u32(__lane_mask, __x);
   uint64_t __ballot = __gpu_ballot(__lane_mask, __x == __first);
   __gpu_sync_lane(__lane_mask);
   return __ballot == __gpu_lane_mask() ? __gpu_lane_mask() : 0ull;
diff --git a/clang/lib/Headers/gpuintrin.h b/clang/lib/Headers/gpuintrin.h
index efdc3d94ac0b3..4181628d18048 100644
--- a/clang/lib/Headers/gpuintrin.h
+++ b/clang/lib/Headers/gpuintrin.h
@@ -115,6 +115,16 @@ __gpu_is_first_in_lane(uint64_t __lane_mask) {
   return __gpu_lane_id() == __gpu_first_lane_id(__lane_mask);
 }
 
+// Copies the value from the first active thread in the wavefront to the rest.
+_DEFAULT_FN_ATTRS static __inline__ uint64_t
+__gpu_read_first_lane_u64(uint64_t __lane_mask, uint64_t __x) {
+  uint32_t __hi = (uint32_t)(__x >> 32ull);
+  uint32_t __lo = (uint32_t)(__x & 0xFFFFFFFFull);
+  return ((uint64_t)__gpu_read_first_lane_u32(__lane_mask, __hi) << 32ull) |
+ ((uint64_t)__gpu_read_first_lane_u32(__lane_mask, __lo) &
+  0xFFFFFFFFull);
+}
+
 // Gets the first floating point value from the active lanes.
 _DEFAULT_FN_ATTRS static __inline__ float
 __gpu_read_first_lane_f32(uint64_t __lane_mask, float __x) {
diff --git a/clang/lib/Headers/nvptxintrin.h b/clang/lib/Headers/nvptxintrin.h
index 73eb0af8b5926..d00a5f6de3950 100644
--- a/clang/lib/Headers/nvptxintrin.h
+++ b/clang/lib/Headers/nvptxintrin.h
@@ -37,6 +37,10 @@ _Pragma("omp begin declare variant match(device = 
{arch(nvptx64)})");
 // Attribute to declare a function as a kernel.
 #define __gpu_kernel __attribute__((nvptx_kernel, visibility("protected")))
 
+// Defined in gpuintrin.h, used later in this file.
+_DEFAULT_FN_ATTRS static __inline__ uint64_t
+__gpu_read_first_lane_u64(uint64_t __lane_mask, uint64_t __x);
+
 // Returns the number of CUDA blocks in the 'x' dimension.
 _DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_num_blocks_x(void) {
   return __nvvm_read_ptx_sreg_nctaid_x();
@@ -120,21 +124,6 @@ __gpu_read_first_lane_u32(uint64_t __lane_mask, uint32_t 
__x) {
   return __nvvm_shfl_sync_idx_i32(__mask, __x, __id, __gpu_num_lanes() - 1);
 }
 
-// Copies the value from the first active thread in the warp to the rest.
-_DEFAULT_FN_ATTRS static __inline__ uint64_t
-__gpu_read_first_lane_u64(uint64_t __lane_mask, uint64_t __x) {
-  uint32_t __hi = (uint32_t)(__x >> 32ull);
-  uint32_t __lo = (uint32_t)(__x & 0xFFFFFFFF);
-  uint32_t __mask = (uint32_t)__lane_mask;
-  uint32_t __id = __builtin_ffs(__mask) - 1;

[clang] [libc][nfc] Use common implementation of read_first_lane_u64 (PR #131027)

2025-03-12 Thread Joseph Huber via cfe-commits

https://github.com/jhuber6 approved this pull request.

This means that `nvptxintrin.h` and `amdgpuintrin.h` can't be included 
standalone, but I'm not sure it's a big deal since we already define a lot of 
the common functionality in the `gpuintrin.h` header.

https://github.com/llvm/llvm-project/pull/131027
___
cfe-commits mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [libc][nfc] Use common implementation of read_first_lane_u64 (PR #131027)

2025-03-12 Thread via cfe-commits

github-actions[bot] wrote:




:warning: C/C++ code formatter, clang-format found issues in your code. 
:warning:



You can test this locally with the following command:


```bash
git-clang-format --diff 15e6bb6224177805d8b6a8268f08a2b88ae4dd70 
a343ee96a2cadf4c508e12e74568d4cdf63ee75a --extensions c,h -- 
clang/lib/Headers/amdgpuintrin.h clang/lib/Headers/gpuintrin.h 
clang/lib/Headers/nvptxintrin.h clang/test/Headers/gpuintrin.c
```





View the diff from clang-format here.


```diff
diff --git a/clang/lib/Headers/gpuintrin.h b/clang/lib/Headers/gpuintrin.h
index b7b997c196..4181628d18 100644
--- a/clang/lib/Headers/gpuintrin.h
+++ b/clang/lib/Headers/gpuintrin.h
@@ -121,7 +121,8 @@ __gpu_read_first_lane_u64(uint64_t __lane_mask, uint64_t 
__x) {
   uint32_t __hi = (uint32_t)(__x >> 32ull);
  uint32_t __lo = (uint32_t)(__x & 0xFFFFFFFFull);
   return ((uint64_t)__gpu_read_first_lane_u32(__lane_mask, __hi) << 32ull) |
- ((uint64_t)__gpu_read_first_lane_u32(__lane_mask, __lo) & 
0xFFFFFFFFull);
+ ((uint64_t)__gpu_read_first_lane_u32(__lane_mask, __lo) &
+  0xFFFFFFFFull);
 }
 
 // Gets the first floating point value from the active lanes.

```




https://github.com/llvm/llvm-project/pull/131027
___
cfe-commits mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [libc][nfc] Use common implementation of read_first_lane_u64 (PR #131027)

2025-03-12 Thread via cfe-commits

llvmbot wrote:




@llvm/pr-subscribers-libc

Author: Jon Chesterfield (JonChesterfield)


Changes

No codegen regression on either target. The two builtin_ffs implied on nvptx 
CSE away.

```
define internal i64 @__gpu_read_first_lane_u64(i64 noundef 
%__lane_mask, i64 noundef %__x) #2 {
entry:
  %shr = lshr i64 %__x, 32
  %conv = trunc nuw i64 %shr to i32
  %conv1 = trunc i64 %__x to i32
  %conv2 = trunc i64 %__lane_mask to i32
  %0 = tail call range(i32 0, 33) i32 @llvm.cttz.i32(i32 %conv2, i1 
true)
  %iszero = icmp eq i32 %conv2, 0
  %sub = select i1 %iszero, i32 -1, i32 %0
  %1 = tail call i32 @llvm.nvvm.shfl.sync.idx.i32(i32 %conv2, i32 
%conv, i32 %sub, i32 31)
  %conv4 = sext i32 %1 to i64
  %shl = shl nsw i64 %conv4, 32
  %2 = tail call i32 @llvm.nvvm.shfl.sync.idx.i32(i32 %conv2, i32 
%conv1, i32 %sub, i32 31)
  %conv7 = zext i32 %2 to i64
  %or = or disjoint i64 %shl, %conv7
  ret i64 %or
}
; becomes

define internal i64 @__gpu_competing_read_first_lane_u64(i64 noundef 
%__lane_mask, i64 noundef %__x) #2 {
entry:
  %shr = lshr i64 %__x, 32
  %conv = trunc nuw i64 %shr to i32
  %conv1 = trunc i64 %__x to i32
  %conv.i = trunc i64 %__lane_mask to i32
  %0 = tail call range(i32 0, 33) i32 @llvm.cttz.i32(i32 %conv.i, i1 
true)
  %iszero = icmp eq i32 %conv.i, 0
  %sub.i = select i1 %iszero, i32 -1, i32 %0
  %1 = tail call i32 @llvm.nvvm.shfl.sync.idx.i32(i32 %conv.i, i32 
%conv, i32 %sub.i, i32 31)
  %conv4 = zext i32 %1 to i64
  %shl = shl nuw i64 %conv4, 32
  %2 = tail call i32 @llvm.nvvm.shfl.sync.idx.i32(i32 %conv.i, i32 
%conv1, i32 %sub.i, i32 31)
  %conv7 = zext i32 %2 to i64
  %or = or disjoint i64 %shl, %conv7
  ret i64 %or
}
```

The sext vs zext difference is vaguely interesting but since the bits are 
immediately discarded in either case it makes no odds. The amdgcn one doesn't 
need CSE, the readfirstlane function is a single call to an intrinsic.

Drive by fix to __gpu_match_all_u32, it was calling first_lane_u64 and could 
use first_lane_u32 instead. Added the missing call to gpuintrin.c test case and 
a stray missing static as well.

---
Full diff: https://github.com/llvm/llvm-project/pull/131027.diff


4 Files Affected:

- (modified) clang/lib/Headers/amdgpuintrin.h (+5-10) 
- (modified) clang/lib/Headers/gpuintrin.h (+9) 
- (modified) clang/lib/Headers/nvptxintrin.h (+5-16) 
- (modified) clang/test/Headers/gpuintrin.c (+79-8) 


```diff
diff --git a/clang/lib/Headers/amdgpuintrin.h b/clang/lib/Headers/amdgpuintrin.h
index 15409eacf7716..839a05175cf3e 100644
--- a/clang/lib/Headers/amdgpuintrin.h
+++ b/clang/lib/Headers/amdgpuintrin.h
@@ -33,6 +33,10 @@ _Pragma("omp begin declare variant match(device = 
{arch(amdgcn)})");
 // Attribute to declare a function as a kernel.
 #define __gpu_kernel __attribute__((amdgpu_kernel, visibility("protected")))
 
+// Defined in gpuintrin.h, used later in this file.
+_DEFAULT_FN_ATTRS static __inline__ uint64_t
+__gpu_read_first_lane_u64(uint64_t __lane_mask, uint64_t __x);
+
 // Returns the number of workgroups in the 'x' dimension of the grid.
 _DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_num_blocks_x(void) {
   return __builtin_amdgcn_grid_size_x() / __builtin_amdgcn_workgroup_size_x();
@@ -115,15 +119,6 @@ __gpu_read_first_lane_u32(uint64_t __lane_mask, uint32_t 
__x) {
   return __builtin_amdgcn_readfirstlane(__x);
 }
 
-// Copies the value from the first active thread in the wavefront to the rest.
-_DEFAULT_FN_ATTRS __inline__ uint64_t
-__gpu_read_first_lane_u64(uint64_t __lane_mask, uint64_t __x) {
-  uint32_t __hi = (uint32_t)(__x >> 32ull);
-  uint32_t __lo = (uint32_t)(__x & 0xFFFFFFFF);
-  return ((uint64_t)__builtin_amdgcn_readfirstlane(__hi) << 32ull) |
- ((uint64_t)__builtin_amdgcn_readfirstlane(__lo) & 0xFFFFFFFF);
-}
-
 // Returns a bitmask of threads in the current lane for which \p x is true.
 _DEFAULT_FN_ATTRS static __inline__ uint64_t __gpu_ballot(uint64_t __lane_mask,
   bool __x) {
@@ -203,7 +198,7 @@ __gpu_match_any_u64(uint64_t __lane_mask, uint64_t __x) {
 // Returns the current lane mask if every lane contains __x.
 _DEFAULT_FN_ATTRS static __inline__ uint64_t
 __gpu_match_all_u32(uint64_t __lane_mask, uint32_t __x) {
-  uint32_t __first = __gpu_read_first_lane_u64(__lane_mask, __x);
+  uint32_t __first = __gpu_read_first_lane_u32(__lane_mask, __x);
   uint64_t __ballot = __gpu_ballot(__lane_mask, __x == __first);
   __gpu_sync_lane(__lane_mask);
   return __ballot == __gpu_lane_mask() ? __gpu_lane_mask() : 0ull;
diff --git a/clang/lib/Headers/gpuintrin.h b/clang/lib/Headers/gpuintrin.h
index efdc3d94ac0b3..b7b997c1968c5 100644
--- a/clang/lib/Headers/gpuintrin.h
+++ b/clang/lib/Headers/gpuintrin.h
@@ -115,6 +115,15 @@ __gpu_is_first_in_lane(uint64_t __lane_mask) {
   return __gpu_lane_id() == __gpu_first_lane_id(__lane_mask);
 }
 
+// Copies the value from the first active thread in the wavefront to the rest.
+_DEFAULT_FN_ATTRS static __inline_

[clang] [libc][nfc] Use common implementation of read_first_lane_u64 (PR #131027)

2025-03-12 Thread Jon Chesterfield via cfe-commits

https://github.com/JonChesterfield created 
https://github.com/llvm/llvm-project/pull/131027

No codegen regression on either target. The two builtin_ffs implied on nvptx 
CSE away.

```
define internal i64 @__gpu_read_first_lane_u64(i64 noundef %__lane_mask, i64 
noundef %__x) #2 {
entry:
  %shr = lshr i64 %__x, 32
  %conv = trunc nuw i64 %shr to i32
  %conv1 = trunc i64 %__x to i32
  %conv2 = trunc i64 %__lane_mask to i32
  %0 = tail call range(i32 0, 33) i32 @llvm.cttz.i32(i32 %conv2, i1 true)
  %iszero = icmp eq i32 %conv2, 0
  %sub = select i1 %iszero, i32 -1, i32 %0
  %1 = tail call i32 @llvm.nvvm.shfl.sync.idx.i32(i32 %conv2, i32 %conv, i32 
%sub, i32 31)
  %conv4 = sext i32 %1 to i64
  %shl = shl nsw i64 %conv4, 32
  %2 = tail call i32 @llvm.nvvm.shfl.sync.idx.i32(i32 %conv2, i32 %conv1, i32 
%sub, i32 31)
  %conv7 = zext i32 %2 to i64
  %or = or disjoint i64 %shl, %conv7
  ret i64 %or
}
; becomes

define internal i64 @__gpu_competing_read_first_lane_u64(i64 noundef 
%__lane_mask, i64 noundef %__x) #2 {
entry:
  %shr = lshr i64 %__x, 32
  %conv = trunc nuw i64 %shr to i32
  %conv1 = trunc i64 %__x to i32
  %conv.i = trunc i64 %__lane_mask to i32
  %0 = tail call range(i32 0, 33) i32 @llvm.cttz.i32(i32 %conv.i, i1 true)
  %iszero = icmp eq i32 %conv.i, 0
  %sub.i = select i1 %iszero, i32 -1, i32 %0
  %1 = tail call i32 @llvm.nvvm.shfl.sync.idx.i32(i32 %conv.i, i32 %conv, i32 
%sub.i, i32 31)
  %conv4 = zext i32 %1 to i64
  %shl = shl nuw i64 %conv4, 32
  %2 = tail call i32 @llvm.nvvm.shfl.sync.idx.i32(i32 %conv.i, i32 %conv1, i32 
%sub.i, i32 31)
  %conv7 = zext i32 %2 to i64
  %or = or disjoint i64 %shl, %conv7
  ret i64 %or
}
```

The sext vs zext difference is vaguely interesting but since the bits are 
immediately discarded in either case it makes no odds. The amdgcn one doesn't 
need CSE, the readfirstlane function is a single call to an intrinsic.

Drive by fix to __gpu_match_all_u32, it was calling first_lane_u64 and could 
use first_lane_u32 instead. Added the missing call to gpuintrin.c test case and 
a stray missing static as well.

>From a343ee96a2cadf4c508e12e74568d4cdf63ee75a Mon Sep 17 00:00:00 2001
From: Jon Chesterfield 
Date: Wed, 12 Mar 2025 20:31:39 +0000
Subject: [PATCH] [libc][nfc] Use common implementation of read_first_lane_u64,
 no codegen regression

---
 clang/lib/Headers/amdgpuintrin.h | 15 ++
 clang/lib/Headers/gpuintrin.h|  9 
 clang/lib/Headers/nvptxintrin.h  | 21 ++--
 clang/test/Headers/gpuintrin.c   | 87 +---
 4 files changed, 98 insertions(+), 34 deletions(-)

diff --git a/clang/lib/Headers/amdgpuintrin.h b/clang/lib/Headers/amdgpuintrin.h
index 15409eacf7716..839a05175cf3e 100644
--- a/clang/lib/Headers/amdgpuintrin.h
+++ b/clang/lib/Headers/amdgpuintrin.h
@@ -33,6 +33,10 @@ _Pragma("omp begin declare variant match(device = 
{arch(amdgcn)})");
 // Attribute to declare a function as a kernel.
 #define __gpu_kernel __attribute__((amdgpu_kernel, visibility("protected")))
 
+// Defined in gpuintrin.h, used later in this file.
+_DEFAULT_FN_ATTRS static __inline__ uint64_t
+__gpu_read_first_lane_u64(uint64_t __lane_mask, uint64_t __x);
+
 // Returns the number of workgroups in the 'x' dimension of the grid.
 _DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_num_blocks_x(void) {
   return __builtin_amdgcn_grid_size_x() / __builtin_amdgcn_workgroup_size_x();
@@ -115,15 +119,6 @@ __gpu_read_first_lane_u32(uint64_t __lane_mask, uint32_t 
__x) {
   return __builtin_amdgcn_readfirstlane(__x);
 }
 
-// Copies the value from the first active thread in the wavefront to the rest.
-_DEFAULT_FN_ATTRS __inline__ uint64_t
-__gpu_read_first_lane_u64(uint64_t __lane_mask, uint64_t __x) {
-  uint32_t __hi = (uint32_t)(__x >> 32ull);
-  uint32_t __lo = (uint32_t)(__x & 0xFFFFFFFF);
-  return ((uint64_t)__builtin_amdgcn_readfirstlane(__hi) << 32ull) |
- ((uint64_t)__builtin_amdgcn_readfirstlane(__lo) & 0xFFFFFFFF);
-}
-
 // Returns a bitmask of threads in the current lane for which \p x is true.
 _DEFAULT_FN_ATTRS static __inline__ uint64_t __gpu_ballot(uint64_t __lane_mask,
   bool __x) {
@@ -203,7 +198,7 @@ __gpu_match_any_u64(uint64_t __lane_mask, uint64_t __x) {
 // Returns the current lane mask if every lane contains __x.
 _DEFAULT_FN_ATTRS static __inline__ uint64_t
 __gpu_match_all_u32(uint64_t __lane_mask, uint32_t __x) {
-  uint32_t __first = __gpu_read_first_lane_u64(__lane_mask, __x);
+  uint32_t __first = __gpu_read_first_lane_u32(__lane_mask, __x);
   uint64_t __ballot = __gpu_ballot(__lane_mask, __x == __first);
   __gpu_sync_lane(__lane_mask);
   return __ballot == __gpu_lane_mask() ? __gpu_lane_mask() : 0ull;
diff --git a/clang/lib/Headers/gpuintrin.h b/clang/lib/Headers/gpuintrin.h
index efdc3d94ac0b3..b7b997c1968c5 100644
--- a/clang/lib/Headers/gpuintrin.h
+++ b/clang/lib/Headers/gpuintrin.h
@@ -115,6 +115,15 @@ __gpu_is_first_in_lane(uint64_t __lane_mask) {
   re