https://github.com/easyonaadit updated https://github.com/llvm/llvm-project/pull/194814
>From 82553f32f84b6eae33adfc2329274f804a0ed277 Mon Sep 17 00:00:00 2001 From: Aaditya <[email protected]> Date: Wed, 29 Apr 2026 10:07:32 +0530 Subject: [PATCH 1/2] [AMDGPU] Add builtins for wave reduction intrinsics Assisted by - Claude-sonnet:4.6 --- clang/include/clang/Basic/BuiltinsAMDGPU.td | 9 + clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp | 18 ++ clang/test/CodeGenOpenCL/builtins-amdgcn.cl | 189 ++++++++++++++++++++ 3 files changed, 216 insertions(+) diff --git a/clang/include/clang/Basic/BuiltinsAMDGPU.td b/clang/include/clang/Basic/BuiltinsAMDGPU.td index fc910123560a9..9c58805b353d3 100644 --- a/clang/include/clang/Basic/BuiltinsAMDGPU.td +++ b/clang/include/clang/Basic/BuiltinsAMDGPU.td @@ -538,6 +538,15 @@ def __builtin_amdgcn_is_invocable : AMDGPUBuiltin<"__amdgpu_feature_predicate_t( //===----------------------------------------------------------------------===// +def __builtin_amdgcn_wave_reduce_add_u16 : AMDGPUBuiltin<"unsigned short(unsigned short, _Constant int32_t)", [Const]>; +def __builtin_amdgcn_wave_reduce_sub_u16 : AMDGPUBuiltin<"unsigned short(unsigned short, _Constant int32_t)", [Const]>; +def __builtin_amdgcn_wave_reduce_min_i16 : AMDGPUBuiltin<"short(short, _Constant int32_t)", [Const]>; +def __builtin_amdgcn_wave_reduce_min_u16 : AMDGPUBuiltin<"unsigned short(unsigned short, _Constant int32_t)", [Const]>; +def __builtin_amdgcn_wave_reduce_max_i16 : AMDGPUBuiltin<"short(short, _Constant int32_t)", [Const]>; +def __builtin_amdgcn_wave_reduce_max_u16 : AMDGPUBuiltin<"unsigned short(unsigned short, _Constant int32_t)", [Const]>; +def __builtin_amdgcn_wave_reduce_and_b16 : AMDGPUBuiltin<"short(short, _Constant int32_t)", [Const]>; +def __builtin_amdgcn_wave_reduce_or_b16 : AMDGPUBuiltin<"short(short, _Constant int32_t)", [Const]>; +def __builtin_amdgcn_wave_reduce_xor_b16 : AMDGPUBuiltin<"short(short, _Constant int32_t)", [Const]>; def __builtin_amdgcn_wave_reduce_add_u32 : AMDGPUBuiltin<"uint32_t(uint32_t, _Constant int32_t)", [Const]>; def __builtin_amdgcn_wave_reduce_sub_u32 : AMDGPUBuiltin<"uint32_t(uint32_t, _Constant int32_t)", [Const]>; def __builtin_amdgcn_wave_reduce_min_i32 : AMDGPUBuiltin<"int32_t(int32_t, _Constant int32_t)", [Const]>; diff --git a/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp b/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp index cfad312d7535a..c3f358d6defc2 100644 --- a/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp +++ b/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp @@ -476,42 +476,51 @@ static Intrinsic::ID getIntrinsicIDforWaveReduction(unsigned BuiltinID) { switch (BuiltinID) { default: llvm_unreachable("Unknown BuiltinID for wave reduction"); + case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_add_u16: case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_add_u32: case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_add_u64: return Intrinsic::amdgcn_wave_reduce_add; case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_fadd_f32: case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_fadd_f64: return Intrinsic::amdgcn_wave_reduce_fadd; + case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_sub_u16: case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_sub_u32: case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_sub_u64: return Intrinsic::amdgcn_wave_reduce_sub; case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_fsub_f32: case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_fsub_f64: return Intrinsic::amdgcn_wave_reduce_fsub; + case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_min_i16: case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_min_i32: case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_min_i64: return Intrinsic::amdgcn_wave_reduce_min; case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_fmin_f32: case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_fmin_f64: return Intrinsic::amdgcn_wave_reduce_fmin; + case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_min_u16: case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_min_u32: case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_min_u64: return Intrinsic::amdgcn_wave_reduce_umin; + case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_max_i16: case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_max_i32: case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_max_i64: return Intrinsic::amdgcn_wave_reduce_max; case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_fmax_f32: case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_fmax_f64: return Intrinsic::amdgcn_wave_reduce_fmax; + case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_max_u16: case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_max_u32: case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_max_u64: return Intrinsic::amdgcn_wave_reduce_umax; + case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_and_b16: case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_and_b32: case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_and_b64: return Intrinsic::amdgcn_wave_reduce_and; + case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_or_b16: case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_or_b32: case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_or_b64: return Intrinsic::amdgcn_wave_reduce_or; + case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_xor_b16: case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_xor_b32: case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_xor_b64: return Intrinsic::amdgcn_wave_reduce_xor; @@ -523,22 +532,31 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID, llvm::AtomicOrdering AO = llvm::AtomicOrdering::SequentiallyConsistent; llvm::SyncScope::ID SSID; switch (BuiltinID) { + case AMDGPU::BI__builtin_amdgcn_wave_reduce_add_u16: case AMDGPU::BI__builtin_amdgcn_wave_reduce_add_u32: case AMDGPU::BI__builtin_amdgcn_wave_reduce_fadd_f32: case AMDGPU::BI__builtin_amdgcn_wave_reduce_fadd_f64: + case AMDGPU::BI__builtin_amdgcn_wave_reduce_sub_u16: case AMDGPU::BI__builtin_amdgcn_wave_reduce_sub_u32: case AMDGPU::BI__builtin_amdgcn_wave_reduce_fsub_f32: case AMDGPU::BI__builtin_amdgcn_wave_reduce_fsub_f64: + case AMDGPU::BI__builtin_amdgcn_wave_reduce_min_i16: case AMDGPU::BI__builtin_amdgcn_wave_reduce_min_i32: + case AMDGPU::BI__builtin_amdgcn_wave_reduce_min_u16: case AMDGPU::BI__builtin_amdgcn_wave_reduce_min_u32: case AMDGPU::BI__builtin_amdgcn_wave_reduce_fmin_f32: case AMDGPU::BI__builtin_amdgcn_wave_reduce_fmin_f64: + case AMDGPU::BI__builtin_amdgcn_wave_reduce_max_i16: case AMDGPU::BI__builtin_amdgcn_wave_reduce_max_i32: + case AMDGPU::BI__builtin_amdgcn_wave_reduce_max_u16: case AMDGPU::BI__builtin_amdgcn_wave_reduce_max_u32: case AMDGPU::BI__builtin_amdgcn_wave_reduce_fmax_f32: case AMDGPU::BI__builtin_amdgcn_wave_reduce_fmax_f64: + case AMDGPU::BI__builtin_amdgcn_wave_reduce_and_b16: case AMDGPU::BI__builtin_amdgcn_wave_reduce_and_b32: + case AMDGPU::BI__builtin_amdgcn_wave_reduce_or_b16: case AMDGPU::BI__builtin_amdgcn_wave_reduce_or_b32: + case AMDGPU::BI__builtin_amdgcn_wave_reduce_xor_b16: case AMDGPU::BI__builtin_amdgcn_wave_reduce_xor_b32: case AMDGPU::BI__builtin_amdgcn_wave_reduce_add_u64: case AMDGPU::BI__builtin_amdgcn_wave_reduce_sub_u64: diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn.cl index 2d645a968f2fd..71c0a9da840bf 100644 --- a/clang/test/CodeGenOpenCL/builtins-amdgcn.cl +++ b/clang/test/CodeGenOpenCL/builtins-amdgcn.cl @@ -944,6 +944,195 @@ void test_wave_reduce_max_u64_dpp(global int* out, long in) *out = __builtin_amdgcn_wave_reduce_max_u64(in, 2); } +// CHECK-LABEL: @test_wave_reduce_add_u16_default +// CHECK: {{.*}}call{{.*}} i16 @llvm.amdgcn.wave.reduce.add.i16( +void test_wave_reduce_add_u16_default(global short* out, short in) +{ + *out = __builtin_amdgcn_wave_reduce_add_u16(in, 0); +} + +// CHECK-LABEL: @test_wave_reduce_add_u16_iterative +// CHECK: {{.*}}call{{.*}} i16 @llvm.amdgcn.wave.reduce.add.i16( +void test_wave_reduce_add_u16_iterative(global short* out, short in) +{ + *out = __builtin_amdgcn_wave_reduce_add_u16(in, 1); +} + +// CHECK-LABEL: @test_wave_reduce_add_u16_dpp +// CHECK: {{.*}}call{{.*}} i16 @llvm.amdgcn.wave.reduce.add.i16( +void test_wave_reduce_add_u16_dpp(global short* out, short in) +{ + *out = __builtin_amdgcn_wave_reduce_add_u16(in, 2); +} + +// CHECK-LABEL: @test_wave_reduce_sub_u16_default +// CHECK: {{.*}}call{{.*}} i16 @llvm.amdgcn.wave.reduce.sub.i16( +void test_wave_reduce_sub_u16_default(global short* out, short in) +{ + *out = __builtin_amdgcn_wave_reduce_sub_u16(in, 0); +} + +// CHECK-LABEL: @test_wave_reduce_sub_u16_iterative +// CHECK: {{.*}}call{{.*}} i16 @llvm.amdgcn.wave.reduce.sub.i16( +void test_wave_reduce_sub_u16_iterative(global short* out, short in) +{ + *out = __builtin_amdgcn_wave_reduce_sub_u16(in, 1); +} + +// CHECK-LABEL: @test_wave_reduce_sub_u16_dpp +// CHECK: {{.*}}call{{.*}} i16 @llvm.amdgcn.wave.reduce.sub.i16( +void test_wave_reduce_sub_u16_dpp(global short* out, short in) +{ + *out = __builtin_amdgcn_wave_reduce_sub_u16(in, 2); +} + +// CHECK-LABEL: @test_wave_reduce_min_i16_default +// CHECK: {{.*}}call{{.*}} i16 @llvm.amdgcn.wave.reduce.min.i16( +void test_wave_reduce_min_i16_default(global short* out, short in) +{ + *out = __builtin_amdgcn_wave_reduce_min_i16(in, 0); +} + +// CHECK-LABEL: @test_wave_reduce_min_i16_iterative +// CHECK: {{.*}}call{{.*}} i16 @llvm.amdgcn.wave.reduce.min.i16( +void test_wave_reduce_min_i16_iterative(global short* out, short in) +{ + *out = __builtin_amdgcn_wave_reduce_min_i16(in, 1); +} + +// CHECK-LABEL: @test_wave_reduce_min_i16_dpp +// CHECK: {{.*}}call{{.*}} i16 @llvm.amdgcn.wave.reduce.min.i16( +void test_wave_reduce_min_i16_dpp(global short* out, short in) +{ + *out = __builtin_amdgcn_wave_reduce_min_i16(in, 2); +} + +// CHECK-LABEL: @test_wave_reduce_min_u16_default +// CHECK: {{.*}}call{{.*}} i16 @llvm.amdgcn.wave.reduce.umin.i16( +void test_wave_reduce_min_u16_default(global short* out, short in) +{ + *out = __builtin_amdgcn_wave_reduce_min_u16(in, 0); +} + +// CHECK-LABEL: @test_wave_reduce_min_u16_iterative +// CHECK: {{.*}}call{{.*}} i16 @llvm.amdgcn.wave.reduce.umin.i16( +void test_wave_reduce_min_u16_iterative(global short* out, short in) +{ + *out = __builtin_amdgcn_wave_reduce_min_u16(in, 1); +} + +// CHECK-LABEL: @test_wave_reduce_min_u16_dpp +// CHECK: {{.*}}call{{.*}} i16 @llvm.amdgcn.wave.reduce.umin.i16( +void test_wave_reduce_min_u16_dpp(global short* out, short in) +{ + *out = __builtin_amdgcn_wave_reduce_min_u16(in, 2); +} + +// CHECK-LABEL: @test_wave_reduce_max_i16_default +// CHECK: {{.*}}call{{.*}} i16 @llvm.amdgcn.wave.reduce.max.i16( +void test_wave_reduce_max_i16_default(global short* out, short in) +{ + *out = __builtin_amdgcn_wave_reduce_max_i16(in, 0); +} + +// CHECK-LABEL: @test_wave_reduce_max_i16_iterative +// CHECK: {{.*}}call{{.*}} i16 @llvm.amdgcn.wave.reduce.max.i16( +void test_wave_reduce_max_i16_iterative(global short* out, short in) +{ + *out = __builtin_amdgcn_wave_reduce_max_i16(in, 1); +} + +// CHECK-LABEL: @test_wave_reduce_max_i16_dpp +// CHECK: {{.*}}call{{.*}} i16 @llvm.amdgcn.wave.reduce.max.i16( +void test_wave_reduce_max_i16_dpp(global short* out, short in) +{ + *out = __builtin_amdgcn_wave_reduce_max_i16(in, 2); +} + +// CHECK-LABEL: @test_wave_reduce_max_u16_default +// CHECK: {{.*}}call{{.*}} i16 @llvm.amdgcn.wave.reduce.umax.i16( +void test_wave_reduce_max_u16_default(global short* out, short in) +{ + *out = __builtin_amdgcn_wave_reduce_max_u16(in, 0); +} + +// CHECK-LABEL: @test_wave_reduce_max_u16_iterative +// CHECK: {{.*}}call{{.*}} i16 @llvm.amdgcn.wave.reduce.umax.i16( +void test_wave_reduce_max_u16_iterative(global short* out, short in) +{ + *out = __builtin_amdgcn_wave_reduce_max_u16(in, 1); +} + +// CHECK-LABEL: @test_wave_reduce_max_u16_dpp +// CHECK: {{.*}}call{{.*}} i16 @llvm.amdgcn.wave.reduce.umax.i16( +void test_wave_reduce_max_u16_dpp(global short* out, short in) +{ + *out = __builtin_amdgcn_wave_reduce_max_u16(in, 2); +} + +// CHECK-LABEL: @test_wave_reduce_and_b16_default +// CHECK: {{.*}}call{{.*}} i16 @llvm.amdgcn.wave.reduce.and.i16( +void test_wave_reduce_and_b16_default(global short* out, short in) +{ + *out = __builtin_amdgcn_wave_reduce_and_b16(in, 0); +} + +// CHECK-LABEL: @test_wave_reduce_and_b16_iterative +// CHECK: {{.*}}call{{.*}} i16 @llvm.amdgcn.wave.reduce.and.i16( +void test_wave_reduce_and_b16_iterative(global short* out, short in) +{ + *out = __builtin_amdgcn_wave_reduce_and_b16(in, 1); +} + +// CHECK-LABEL: @test_wave_reduce_and_b16_dpp +// CHECK: {{.*}}call{{.*}} i16 @llvm.amdgcn.wave.reduce.and.i16( +void test_wave_reduce_and_b16_dpp(global short* out, short in) +{ + *out = __builtin_amdgcn_wave_reduce_and_b16(in, 2); +} + +// CHECK-LABEL: @test_wave_reduce_or_b16_default +// CHECK: {{.*}}call{{.*}} i16 @llvm.amdgcn.wave.reduce.or.i16( +void test_wave_reduce_or_b16_default(global short* out, short in) +{ + *out = __builtin_amdgcn_wave_reduce_or_b16(in, 0); +} + +// CHECK-LABEL: @test_wave_reduce_or_b16_iterative +// CHECK: {{.*}}call{{.*}} i16 @llvm.amdgcn.wave.reduce.or.i16( +void test_wave_reduce_or_b16_iterative(global short* out, short in) +{ + *out = __builtin_amdgcn_wave_reduce_or_b16(in, 1); +} + +// CHECK-LABEL: @test_wave_reduce_or_b16_dpp +// CHECK: {{.*}}call{{.*}} i16 @llvm.amdgcn.wave.reduce.or.i16( +void test_wave_reduce_or_b16_dpp(global short* out, short in) +{ + *out = __builtin_amdgcn_wave_reduce_or_b16(in, 2); +} + +// CHECK-LABEL: @test_wave_reduce_xor_b16_default +// CHECK: {{.*}}call{{.*}} i16 @llvm.amdgcn.wave.reduce.xor.i16( +void test_wave_reduce_xor_b16_default(global short* out, short in) +{ + *out = __builtin_amdgcn_wave_reduce_xor_b16(in, 0); +} + +// CHECK-LABEL: @test_wave_reduce_xor_b16_iterative +// CHECK: {{.*}}call{{.*}} i16 @llvm.amdgcn.wave.reduce.xor.i16( +void test_wave_reduce_xor_b16_iterative(global short* out, short in) +{ + *out = __builtin_amdgcn_wave_reduce_xor_b16(in, 1); +} + +// CHECK-LABEL: @test_wave_reduce_xor_b16_dpp +// CHECK: {{.*}}call{{.*}} i16 @llvm.amdgcn.wave.reduce.xor.i16( +void test_wave_reduce_xor_b16_dpp(global short* out, short in) +{ + *out = __builtin_amdgcn_wave_reduce_xor_b16(in, 2); +} + // CHECK-LABEL: @test_s_barrier // CHECK: {{.*}}call{{.*}} void @llvm.amdgcn.s.barrier( void test_s_barrier() >From 8b5b34cd6fca1691d1c496235bf494d2e881e33e Mon Sep 17 00:00:00 2001 From: Aaditya <[email protected]> Date: Mon, 4 May 2026 14:39:06 +0530 Subject: [PATCH 2/2] Missing SEMA tests --- .../wave-reduce-builtins-validate-amdgpu.cl | 26 +++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/clang/test/Sema/wave-reduce-builtins-validate-amdgpu.cl b/clang/test/Sema/wave-reduce-builtins-validate-amdgpu.cl index 0f1565f1272c1..373c771c178a3 100644 --- a/clang/test/Sema/wave-reduce-builtins-validate-amdgpu.cl +++ b/clang/test/Sema/wave-reduce-builtins-validate-amdgpu.cl @@ -3,6 +3,32 @@ // Test that the second argument (strategy) must be a constant integer +void test_wave_reduce_u16(unsigned short val, int strategy) { + (void)__builtin_amdgcn_wave_reduce_add_u16(val, 0); + (void)__builtin_amdgcn_wave_reduce_sub_u16(val, 1); + (void)__builtin_amdgcn_wave_reduce_min_u16(val, 0); + (void)__builtin_amdgcn_wave_reduce_max_u16(val, 0); + + (void)__builtin_amdgcn_wave_reduce_add_u16(val, strategy); // expected-error {{argument to '__builtin_amdgcn_wave_reduce_add_u16' must be a constant integer}} + (void)__builtin_amdgcn_wave_reduce_sub_u16(val, strategy); // expected-error {{argument to '__builtin_amdgcn_wave_reduce_sub_u16' must be a constant integer}} + (void)__builtin_amdgcn_wave_reduce_min_u16(val, strategy); // expected-error {{argument to '__builtin_amdgcn_wave_reduce_min_u16' must be a constant integer}} + (void)__builtin_amdgcn_wave_reduce_max_u16(val, strategy); // expected-error {{argument to '__builtin_amdgcn_wave_reduce_max_u16' must be a constant integer}} +} + +void test_wave_reduce_i16(short val, int strategy) { + (void)__builtin_amdgcn_wave_reduce_min_i16(val, 0); + (void)__builtin_amdgcn_wave_reduce_max_i16(val, 0); + (void)__builtin_amdgcn_wave_reduce_and_b16(val, 0); + (void)__builtin_amdgcn_wave_reduce_or_b16(val, 0); + (void)__builtin_amdgcn_wave_reduce_xor_b16(val, 0); + + (void)__builtin_amdgcn_wave_reduce_min_i16(val, strategy); // expected-error {{argument to '__builtin_amdgcn_wave_reduce_min_i16' must be a constant integer}} + (void)__builtin_amdgcn_wave_reduce_max_i16(val, strategy); // expected-error {{argument to '__builtin_amdgcn_wave_reduce_max_i16' must be a constant integer}} + (void)__builtin_amdgcn_wave_reduce_and_b16(val, strategy); // expected-error {{argument to '__builtin_amdgcn_wave_reduce_and_b16' must be a constant integer}} + (void)__builtin_amdgcn_wave_reduce_or_b16(val, strategy); // expected-error {{argument to '__builtin_amdgcn_wave_reduce_or_b16' must be a constant integer}} + (void)__builtin_amdgcn_wave_reduce_xor_b16(val, strategy); // expected-error {{argument to '__builtin_amdgcn_wave_reduce_xor_b16' must be a constant integer}} +} + void test_wave_reduce_u32(unsigned int val, int strategy) { (void)__builtin_amdgcn_wave_reduce_add_u32(val, 0); (void)__builtin_amdgcn_wave_reduce_sub_u32(val, 1); _______________________________________________ llvm-branch-commits mailing list [email protected] https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
