https://github.com/jhuber6 updated https://github.com/llvm/llvm-project/pull/176202
>From 680b9d0e4cf1dc7c6cb92e0ef2fefcf8fbfb8d7d Mon Sep 17 00:00:00 2001
From: Joseph Huber <[email protected]>
Date: Thu, 15 Jan 2026 11:10:45 -0600
Subject: [PATCH] [Clang] Change ballot mask handling for GPU intrinsics

Summary:
NVIDIA's handling of ballot and similar operations is complicated
following Volta. These instructions return the set of converged lanes
*at the moment of the call*, and the active mask can grow during
execution. The assumption that the other targets' ballots must return a
submask of __lane_mask to match NVPTX is therefore wrong: the mask
returned by CUDA can grow beyond the local mask.

This leaves two options: remove the special handling to make the
targets consistent, or add it to NVPTX. This PR does the first, to keep
the interface to the builtin smaller. The second is arguably 'more
correct', since using the raw ballot directly could elect an unexpected
leader.

---
 clang/lib/Headers/amdgpuintrin.h |  4 +---
 clang/lib/Headers/spirvintrin.h  |  6 ++----
 clang/test/Headers/gpuintrin.c   | 20 ++++++++------------
 3 files changed, 11 insertions(+), 19 deletions(-)

diff --git a/clang/lib/Headers/amdgpuintrin.h b/clang/lib/Headers/amdgpuintrin.h
index f7fb8e2814180..fac6dd995eaca 100644
--- a/clang/lib/Headers/amdgpuintrin.h
+++ b/clang/lib/Headers/amdgpuintrin.h
@@ -115,9 +115,7 @@ __gpu_read_first_lane_u32(uint64_t __lane_mask, uint32_t __x) {
 // Returns a bitmask of threads in the current lane for which \p x is true.
 _DEFAULT_FN_ATTRS static __inline__ uint64_t
 __gpu_ballot(uint64_t __lane_mask, bool __x) {
-  // The lane_mask & gives the nvptx semantics when lane_mask is a subset of
-  // the active threads
-  return __lane_mask & __builtin_amdgcn_ballot_w64(__x);
+  return __builtin_amdgcn_ballot_w64(__x);
 }
 
 // Waits for all the threads in the block to converge and issues a fence.
diff --git a/clang/lib/Headers/spirvintrin.h b/clang/lib/Headers/spirvintrin.h
index 2a10a47adedde..10fb40db02daa 100644
--- a/clang/lib/Headers/spirvintrin.h
+++ b/clang/lib/Headers/spirvintrin.h
@@ -121,12 +121,10 @@ __gpu_read_first_lane_u32(uint64_t __lane_mask, uint32_t __x) {
 // implementation is incorrect if the target uses more than 64 lanes.
 _DEFAULT_FN_ATTRS static __inline__ uint64_t
 __gpu_ballot(uint64_t __lane_mask, bool __x) {
-  // The lane_mask & gives the nvptx semantics when lane_mask is a subset of
-  // the active threads.
   uint32_t [[clang::ext_vector_type(4)]] __mask =
       __builtin_spirv_subgroup_ballot(__x);
-  return __lane_mask & __builtin_bit_cast(uint64_t, __builtin_shufflevector(
-                                                        __mask, __mask, 0, 1));
+  return __builtin_bit_cast(uint64_t,
+                            __builtin_shufflevector(__mask, __mask, 0, 1));
 }
 
 // Waits for all the threads in the block to converge and issues a fence.
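For callers that depended on the old submask behavior, it can be
restored on top of the new interface. A minimal sketch in C, assuming
the __gpu_lane_mask() and __gpu_lane_id() helpers from the same
gpuintrin.h family; the __masked_ballot and __is_leader helpers below
are hypothetical, not part of the header:

  #include <gpuintrin.h>
  #include <stdbool.h>
  #include <stdint.h>

  // Hypothetical helper: recover the pre-patch submask semantics by
  // masking the ballot result explicitly. After this patch,
  // __gpu_ballot may report lanes outside of __lane_mask if the active
  // mask has grown since the caller captured it.
  static inline uint64_t __masked_ballot(uint64_t __lane_mask, bool __x) {
    return __gpu_ballot(__lane_mask, __x) & __lane_mask;
  }

  // Hypothetical leader election: the lowest set bit of the *raw*
  // ballot could name a lane outside of __lane_mask, i.e. the
  // 'unexpected leader' mentioned above; masking first keeps the
  // election within __lane_mask. Assumes the calling lane is set in
  // __lane_mask, so the ballot of 'true' is nonzero and
  // __builtin_ctzll is well defined.
  static inline bool __is_leader(uint64_t __lane_mask) {
    uint64_t __ballot = __masked_ballot(__lane_mask, true);
    return __gpu_lane_id() == (uint32_t)__builtin_ctzll(__ballot);
  }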
diff --git a/clang/test/Headers/gpuintrin.c b/clang/test/Headers/gpuintrin.c
index c8fe721c8c37c..1f6227a811267 100644
--- a/clang/test/Headers/gpuintrin.c
+++ b/clang/test/Headers/gpuintrin.c
@@ -446,12 +446,10 @@ __gpu_kernel void foo() {
 // AMDGPU-NEXT:    store i64 [[__LANE_MASK]], ptr [[__LANE_MASK_ADDR_ASCAST]], align 8
 // AMDGPU-NEXT:    [[STOREDV:%.*]] = zext i1 [[__X]] to i8
 // AMDGPU-NEXT:    store i8 [[STOREDV]], ptr [[__X_ADDR_ASCAST]], align 1
-// AMDGPU-NEXT:    [[TMP0:%.*]] = load i64, ptr [[__LANE_MASK_ADDR_ASCAST]], align 8
-// AMDGPU-NEXT:    [[TMP1:%.*]] = load i8, ptr [[__X_ADDR_ASCAST]], align 1
-// AMDGPU-NEXT:    [[LOADEDV:%.*]] = trunc i8 [[TMP1]] to i1
-// AMDGPU-NEXT:    [[TMP2:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 [[LOADEDV]])
-// AMDGPU-NEXT:    [[AND:%.*]] = and i64 [[TMP0]], [[TMP2]]
-// AMDGPU-NEXT:    ret i64 [[AND]]
+// AMDGPU-NEXT:    [[TMP0:%.*]] = load i8, ptr [[__X_ADDR_ASCAST]], align 1
+// AMDGPU-NEXT:    [[LOADEDV:%.*]] = trunc i8 [[TMP0]] to i1
+// AMDGPU-NEXT:    [[TMP1:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 [[LOADEDV]])
+// AMDGPU-NEXT:    ret i64 [[TMP1]]
 //
 //
 // AMDGPU-LABEL: define internal void @__gpu_sync_threads(
@@ -1337,14 +1335,12 @@ __gpu_kernel void foo() {
 // SPIRV-NEXT:    [[LOADEDV:%.*]] = trunc i8 [[TMP0]] to i1
 // SPIRV-NEXT:    [[TMP1:%.*]] = call <4 x i32> @llvm.spv.wave.ballot(i1 [[LOADEDV]])
 // SPIRV-NEXT:    store <4 x i32> [[TMP1]], ptr [[__MASK]], align 16
-// SPIRV-NEXT:    [[TMP2:%.*]] = load i64, ptr [[__LANE_MASK_ADDR]], align 8
+// SPIRV-NEXT:    [[TMP2:%.*]] = load <4 x i32>, ptr [[__MASK]], align 16
 // SPIRV-NEXT:    [[TMP3:%.*]] = load <4 x i32>, ptr [[__MASK]], align 16
-// SPIRV-NEXT:    [[TMP4:%.*]] = load <4 x i32>, ptr [[__MASK]], align 16
-// SPIRV-NEXT:    [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> [[TMP4]], <2 x i32> <i32 0, i32 1>
+// SPIRV-NEXT:    [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> [[TMP3]], <2 x i32> <i32 0, i32 1>
 // SPIRV-NEXT:    store <2 x i32> [[SHUFFLE]], ptr [[REF_TMP]], align 8
-// SPIRV-NEXT:    [[TMP5:%.*]] = load i64, ptr [[REF_TMP]], align 8
-// SPIRV-NEXT:    [[AND:%.*]] = and i64 [[TMP2]], [[TMP5]]
-// SPIRV-NEXT:    ret i64 [[AND]]
+// SPIRV-NEXT:    [[TMP4:%.*]] = load i64, ptr [[REF_TMP]], align 8
+// SPIRV-NEXT:    ret i64 [[TMP4]]
 //
 //
 // SPIRV-LABEL: define internal spir_func void @__gpu_sync_threads(

_______________________________________________
cfe-commits mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
