https://github.com/AlexVlx updated https://github.com/llvm/llvm-project/pull/179492
>From edf668446bdf50c27f8ec01ada9f7ab67157083f Mon Sep 17 00:00:00 2001 From: Alex Voicu <[email protected]> Date: Tue, 3 Feb 2026 16:19:51 +0000 Subject: [PATCH 1/2] Add `wave_id` and `wave_shuffle` Clang builtins. --- clang/include/clang/Basic/BuiltinsAMDGPU.td | 9 +++++++ clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp | 4 +++ clang/lib/Sema/SemaAMDGPU.cpp | 25 +++++++++++++++++++ clang/lib/Sema/SemaChecking.cpp | 4 ++- .../CodeGenOpenCL/builtins-amdgcn-gfx12.cl | 16 ++++++++++++ clang/test/CodeGenOpenCL/builtins-amdgcn.cl | 21 ++++++++++++++++ 6 files changed, 78 insertions(+), 1 deletion(-) diff --git a/clang/include/clang/Basic/BuiltinsAMDGPU.td b/clang/include/clang/Basic/BuiltinsAMDGPU.td index 1950757097fc6..a9acc1544ad53 100644 --- a/clang/include/clang/Basic/BuiltinsAMDGPU.td +++ b/clang/include/clang/Basic/BuiltinsAMDGPU.td @@ -67,6 +67,8 @@ def __builtin_amdgcn_mbcnt_lo : AMDGPUBuiltin<"unsigned int(unsigned int, unsign def __builtin_amdgcn_s_memtime : AMDGPUBuiltin<"uint64_t()", [], "s-memtime-inst">; +def __builtin_amdgcn_wave_id : AMDGPUBuiltin<"int32_t()", [Const], "architected-sgprs">; + //===----------------------------------------------------------------------===// // Instruction builtins. //===----------------------------------------------------------------------===// @@ -413,6 +415,13 @@ def __builtin_amdgcn_wave_reduce_fsub_f64 : AMDGPUBuiltin<"double(double, _Const def __builtin_amdgcn_wave_reduce_fmin_f64 : AMDGPUBuiltin<"double(double, _Constant int32_t)", [Const]>; def __builtin_amdgcn_wave_reduce_fmax_f64 : AMDGPUBuiltin<"double(double, _Constant int32_t)", [Const]>; +//===----------------------------------------------------------------------===// +// Wave Shuffle builtins. +//===----------------------------------------------------------------------===// + +// This is an overloaded builtin modelled after the atomic ones +def __builtin_amdgcn_wave_shuffle : AMDGPUBuiltin<"void(...)", [Const, CustomTypeChecking]>; + //===----------------------------------------------------------------------===// // R600-NI only builtins. //===----------------------------------------------------------------------===// diff --git a/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp b/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp index a096ed27a788e..619c9b4be9090 100644 --- a/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp +++ b/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp @@ -449,6 +449,10 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID, llvm::Function *F = CGM.getIntrinsic(IID, {Value->getType()}); return Builder.CreateCall(F, {Value, Strategy}); } + case AMDGPU::BI__builtin_amdgcn_wave_shuffle: + // TODO: can we unify this with wave_reduce? + return emitBuiltinWithOneOverloadedType<2>(*this, E, + Intrinsic::amdgcn_wave_shuffle); case AMDGPU::BI__builtin_amdgcn_div_scale: case AMDGPU::BI__builtin_amdgcn_div_scalef: { // Translate from the intrinsics's struct return to the builtin's out diff --git a/clang/lib/Sema/SemaAMDGPU.cpp b/clang/lib/Sema/SemaAMDGPU.cpp index 4261e1849133f..d5403f22eb7bb 100644 --- a/clang/lib/Sema/SemaAMDGPU.cpp +++ b/clang/lib/Sema/SemaAMDGPU.cpp @@ -296,6 +296,31 @@ bool SemaAMDGPU::CheckAMDGCNBuiltinFunctionCall(unsigned BuiltinID, } return false; } + case AMDGPU::BI__builtin_amdgcn_wave_shuffle: { + Expr *Val = TheCall->getArg(0); + QualType ValTy = Val->getType(); + + if ((!ValTy->isIntegerType() && !ValTy->isFloatingType()) || + SemaRef.getASTContext().getTypeSize(ValTy) > 32) + return Diag(Val->getExprLoc(), diag::err_builtin_invalid_arg_type) + << Val << /*scalar=*/1 << /*'int'=*/4 << /*floating point=*/2 + << ValTy; + + Expr *Idx = TheCall->getArg(1); + QualType IdxTy = Idx->getType(); + if (!IdxTy->isIntegerType()) + return Diag(Idx->getExprLoc(), diag::err_typecheck_expect_int) << IdxTy; + if (SemaRef.getASTContext().getTypeSize(IdxTy) > 32) + return Diag(Idx->getExprLoc(), diag::err_builtin_invalid_arg_type) + << Idx << /*scalar=*/1 << /*'int'=*/4 << /*floating point=*/0 + << IdxTy; + + // Resolve the overload here, now that we know that the invocation is + // correct: the intrinsic returns the type of the value argument. + TheCall->setType(ValTy); + + return false; + } default: return false; } diff --git a/clang/lib/Sema/SemaChecking.cpp b/clang/lib/Sema/SemaChecking.cpp index e2e1b37572364..9858264aa042d 100644 --- a/clang/lib/Sema/SemaChecking.cpp +++ b/clang/lib/Sema/SemaChecking.cpp @@ -2100,8 +2100,10 @@ bool Sema::CheckTSBuiltinFunctionCall(const TargetInfo &TI, unsigned BuiltinID, case llvm::Triple::spirv: case llvm::Triple::spirv32: case llvm::Triple::spirv64: - if (TI.getTriple().getOS() != llvm::Triple::OSType::AMDHSA) + if (TI.getTriple().getVendor() != llvm::Triple::VendorType::AMD) return SPIRV().CheckSPIRVBuiltinFunctionCall(TI, BuiltinID, TheCall); + else + return AMDGPU().CheckAMDGCNBuiltinFunctionCall(BuiltinID, TheCall); return false; case llvm::Triple::systemz: return SystemZ().CheckSystemZBuiltinFunctionCall(BuiltinID, TheCall); diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx12.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx12.cl index 8c02616780182..d39c4180178ad 100644 --- a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx12.cl +++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx12.cl @@ -317,3 +317,19 @@ void test_ds_bpermute_fi_b32(global int* out, int a, int b) { *out = __builtin_amdgcn_ds_bpermute_fi_b32(a, b); } + +__attribute__((target("architected-sgprs"))) +// CHECK-LABEL: @test_wave_id( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) +// CHECK-NEXT: [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT_ADDR]] to ptr +// CHECK-NEXT: store ptr addrspace(1) [[OUT:%.*]], ptr [[OUT_ADDR_ASCAST]], align 8 +// CHECK-NEXT: [[TMP0:%.*]] = call i32 @llvm.amdgcn.wave.id() +// CHECK-NEXT: [[TMP1:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store i32 [[TMP0]], ptr addrspace(1) [[TMP1]], align 4 +// CHECK-NEXT: ret void +// +void test_wave_id(global int* out) +{ + *out = __builtin_amdgcn_wave_id(); +} diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn.cl index 376105cb6594c..4755cd32a2e2c 100644 --- a/clang/test/CodeGenOpenCL/builtins-amdgcn.cl +++ b/clang/test/CodeGenOpenCL/builtins-amdgcn.cl @@ -937,6 +937,27 @@ void test_wave_reduce_max_u64_dpp(global int* out, long in) *out = __builtin_amdgcn_wave_reduce_max_u64(in, 2); } +// CHECK-LABEL: @test_wave_shuffle_u32 +// CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.wave.shuffle.i32 +void test_wave_shuffle_u32(global unsigned* out, unsigned in, int idx) +{ + *out = __builtin_amdgcn_wave_shuffle(in, idx); +} + +// CHECK-LABEL: @test_wave_shuffle_i32 +// CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.wave.shuffle.i32 +void test_wave_shuffle_i32(global int* out, int in, int idx) +{ + *out = __builtin_amdgcn_wave_shuffle(in, idx); +} + +// CHECK-LABEL: @test_wave_shuffle_f32 +// CHECK: {{.*}}call{{.*}} float @llvm.amdgcn.wave.shuffle.f32 +void test_wave_shuffle_f32(global float* out, float in, int idx) +{ + *out = __builtin_amdgcn_wave_shuffle(in, idx); +} + // CHECK-LABEL: @test_s_barrier // CHECK: {{.*}}call{{.*}} void @llvm.amdgcn.s.barrier( void test_s_barrier() >From deb0a0d3e8f39604aeca1c5fc148b90f6a93d4ba Mon Sep 17 00:00:00 2001 From: Alex Voicu <[email protected]> Date: Tue, 3 Feb 2026 16:25:23 +0000 Subject: [PATCH 2/2] Fix formatting. --- clang/lib/Sema/SemaAMDGPU.cpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/clang/lib/Sema/SemaAMDGPU.cpp b/clang/lib/Sema/SemaAMDGPU.cpp index d5403f22eb7bb..8fce0a56bc4f9 100644 --- a/clang/lib/Sema/SemaAMDGPU.cpp +++ b/clang/lib/Sema/SemaAMDGPU.cpp @@ -303,8 +303,8 @@ bool SemaAMDGPU::CheckAMDGCNBuiltinFunctionCall(unsigned BuiltinID, if ((!ValTy->isIntegerType() && !ValTy->isFloatingType()) || SemaRef.getASTContext().getTypeSize(ValTy) > 32) return Diag(Val->getExprLoc(), diag::err_builtin_invalid_arg_type) - << Val << /*scalar=*/1 << /*'int'=*/4 << /*floating point=*/2 - << ValTy; + << Val << /*scalar=*/1 << /*'int'=*/4 << /*floating point=*/2 + << ValTy; Expr *Idx = TheCall->getArg(1); QualType IdxTy = Idx->getType(); @@ -312,8 +312,8 @@ bool SemaAMDGPU::CheckAMDGCNBuiltinFunctionCall(unsigned BuiltinID, return Diag(Idx->getExprLoc(), diag::err_typecheck_expect_int) << IdxTy; if (SemaRef.getASTContext().getTypeSize(IdxTy) > 32) return Diag(Idx->getExprLoc(), diag::err_builtin_invalid_arg_type) - << Idx << /*scalar=*/1 << /*'int'=*/4 << /*floating point=*/0 - << IdxTy; + << Idx << /*scalar=*/1 << /*'int'=*/4 << /*floating point=*/0 + << IdxTy; // Resolve the overload here, now that we know that the invocation is // correct: the intrinsic returns the type of the value argument. _______________________________________________ cfe-commits mailing list [email protected] https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
