https://github.com/easyonaadit updated https://github.com/llvm/llvm-project/pull/170813
>From c776e9260d0ec70cdd2e2430801cec04c038b69f Mon Sep 17 00:00:00 2001 From: Aaditya <[email protected]> Date: Fri, 5 Dec 2025 14:03:36 +0530 Subject: [PATCH] [AMDGPU] Add builtins for wave reduction intrinsics --- clang/include/clang/Basic/BuiltinsAMDGPU.td | 4 + clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp | 8 ++ clang/test/CodeGenOpenCL/builtins-amdgcn.cl | 84 +++++++++++++++++++++ 3 files changed, 96 insertions(+) diff --git a/clang/include/clang/Basic/BuiltinsAMDGPU.td b/clang/include/clang/Basic/BuiltinsAMDGPU.td index 12ffad305e7c0..3aa7017fe7c6d 100644 --- a/clang/include/clang/Basic/BuiltinsAMDGPU.td +++ b/clang/include/clang/Basic/BuiltinsAMDGPU.td @@ -408,6 +408,10 @@ def __builtin_amdgcn_wave_reduce_fadd_f32 : AMDGPUBuiltin<"float(float, int32_t) def __builtin_amdgcn_wave_reduce_fsub_f32 : AMDGPUBuiltin<"float(float, int32_t)", [Const]>; def __builtin_amdgcn_wave_reduce_fmin_f32 : AMDGPUBuiltin<"float(float, int32_t)", [Const]>; def __builtin_amdgcn_wave_reduce_fmax_f32 : AMDGPUBuiltin<"float(float, int32_t)", [Const]>; +def __builtin_amdgcn_wave_reduce_fadd_f64 : AMDGPUBuiltin<"double(double, int32_t)", [Const]>; +def __builtin_amdgcn_wave_reduce_fsub_f64 : AMDGPUBuiltin<"double(double, int32_t)", [Const]>; +def __builtin_amdgcn_wave_reduce_fmin_f64 : AMDGPUBuiltin<"double(double, int32_t)", [Const]>; +def __builtin_amdgcn_wave_reduce_fmax_f64 : AMDGPUBuiltin<"double(double, int32_t)", [Const]>; //===----------------------------------------------------------------------===// // R600-NI only builtins. diff --git a/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp b/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp index a8a5bc348f00c..a096ed27a788e 100644 --- a/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp +++ b/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp @@ -374,16 +374,19 @@ static Intrinsic::ID getIntrinsicIDforWaveReduction(unsigned BuiltinID) { case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_add_u64: return Intrinsic::amdgcn_wave_reduce_add; case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_fadd_f32: + case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_fadd_f64: return Intrinsic::amdgcn_wave_reduce_fadd; case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_sub_u32: case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_sub_u64: return Intrinsic::amdgcn_wave_reduce_sub; case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_fsub_f32: + case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_fsub_f64: return Intrinsic::amdgcn_wave_reduce_fsub; case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_min_i32: case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_min_i64: return Intrinsic::amdgcn_wave_reduce_min; case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_fmin_f32: + case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_fmin_f64: return Intrinsic::amdgcn_wave_reduce_fmin; case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_min_u32: case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_min_u64: @@ -392,6 +395,7 @@ static Intrinsic::ID getIntrinsicIDforWaveReduction(unsigned BuiltinID) { case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_max_i64: return Intrinsic::amdgcn_wave_reduce_max; case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_fmax_f32: + case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_fmax_f64: return Intrinsic::amdgcn_wave_reduce_fmax; case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_max_u32: case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_max_u64: @@ -415,14 +419,18 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID, switch (BuiltinID) { case AMDGPU::BI__builtin_amdgcn_wave_reduce_add_u32: case AMDGPU::BI__builtin_amdgcn_wave_reduce_fadd_f32: + case AMDGPU::BI__builtin_amdgcn_wave_reduce_fadd_f64: case AMDGPU::BI__builtin_amdgcn_wave_reduce_sub_u32: case AMDGPU::BI__builtin_amdgcn_wave_reduce_fsub_f32: + case AMDGPU::BI__builtin_amdgcn_wave_reduce_fsub_f64: case AMDGPU::BI__builtin_amdgcn_wave_reduce_min_i32: case AMDGPU::BI__builtin_amdgcn_wave_reduce_min_u32: case AMDGPU::BI__builtin_amdgcn_wave_reduce_fmin_f32: + case AMDGPU::BI__builtin_amdgcn_wave_reduce_fmin_f64: case AMDGPU::BI__builtin_amdgcn_wave_reduce_max_i32: case AMDGPU::BI__builtin_amdgcn_wave_reduce_max_u32: case AMDGPU::BI__builtin_amdgcn_wave_reduce_fmax_f32: + case AMDGPU::BI__builtin_amdgcn_wave_reduce_fmax_f64: case AMDGPU::BI__builtin_amdgcn_wave_reduce_and_b32: case AMDGPU::BI__builtin_amdgcn_wave_reduce_or_b32: case AMDGPU::BI__builtin_amdgcn_wave_reduce_xor_b32: diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn.cl index 04140ed3f10b0..376105cb6594c 100644 --- a/clang/test/CodeGenOpenCL/builtins-amdgcn.cl +++ b/clang/test/CodeGenOpenCL/builtins-amdgcn.cl @@ -412,6 +412,13 @@ void test_wave_reduce_fadd_f32_default(global float* out, float in) *out = __builtin_amdgcn_wave_reduce_fadd_f32(in, 0); } +// CHECK-LABEL: @test_wave_reduce_fadd_f64_default +// CHECK: {{.*}}call{{.*}} double @llvm.amdgcn.wave.reduce.fadd.f64( +void test_wave_reduce_fadd_f64_default(global double* out, double in) +{ + *out = __builtin_amdgcn_wave_reduce_fadd_f64(in, 0); +} + // CHECK-LABEL: @test_wave_reduce_add_u32_iterative // CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.wave.reduce.add.i32( void test_wave_reduce_add_u32_iterative(global int* out, int in) @@ -433,6 +440,13 @@ void test_wave_reduce_fadd_f32_iterative(global float* out, float in) *out = __builtin_amdgcn_wave_reduce_fadd_f32(in, 0); } +// CHECK-LABEL: @test_wave_reduce_fadd_f64_iterative +// CHECK: {{.*}}call{{.*}} double @llvm.amdgcn.wave.reduce.fadd.f64( +void test_wave_reduce_fadd_f64_iterative(global double* out, double in) +{ + *out = __builtin_amdgcn_wave_reduce_fadd_f64(in, 0); +} + // CHECK-LABEL: @test_wave_reduce_add_u32_dpp // CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.wave.reduce.add.i32( void test_wave_reduce_add_u32_dpp(global int* out, int in) @@ -454,6 +468,13 @@ void test_wave_reduce_fadd_f32_dpp(global float* out, float in) *out = __builtin_amdgcn_wave_reduce_fadd_f32(in, 0); } +// CHECK-LABEL: @test_wave_reduce_fadd_f64_dpp +// CHECK: {{.*}}call{{.*}} double @llvm.amdgcn.wave.reduce.fadd.f64( +void test_wave_reduce_fadd_f64_dpp(global double* out, double in) +{ + *out = __builtin_amdgcn_wave_reduce_fadd_f64(in, 0); +} + // CHECK-LABEL: @test_wave_reduce_sub_u32_default // CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.wave.reduce.sub.i32( void test_wave_reduce_sub_u32_default(global int* out, int in) @@ -475,6 +496,13 @@ void test_wave_reduce_fsub_f32_default(global float* out, float in) *out = __builtin_amdgcn_wave_reduce_fsub_f32(in, 0); } +// CHECK-LABEL: @test_wave_reduce_fsub_f64_default +// CHECK: {{.*}}call{{.*}} double @llvm.amdgcn.wave.reduce.fsub.f64( +void test_wave_reduce_fsub_f64_default(global double* out, double in) +{ + *out = __builtin_amdgcn_wave_reduce_fsub_f64(in, 0); +} + // CHECK-LABEL: @test_wave_reduce_sub_u32_iterative // CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.wave.reduce.sub.i32( void test_wave_reduce_sub_u32_iterative(global int* out, int in) @@ -496,6 +524,13 @@ void test_wave_reduce_fsub_f32_iterative(global float* out, float in) *out = __builtin_amdgcn_wave_reduce_fsub_f32(in, 0); } +// CHECK-LABEL: @test_wave_reduce_fsub_f64_iterative +// CHECK: {{.*}}call{{.*}} double @llvm.amdgcn.wave.reduce.fsub.f64( +void test_wave_reduce_fsub_f64_iterative(global double* out, double in) +{ + *out = __builtin_amdgcn_wave_reduce_fsub_f64(in, 0); +} + // CHECK-LABEL: @test_wave_reduce_sub_u32_dpp // CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.wave.reduce.sub.i32( void test_wave_reduce_sub_u32_dpp(global int* out, int in) @@ -517,6 +552,13 @@ void test_wave_reduce_fsub_f32_dpp(global float* out, float in) *out = __builtin_amdgcn_wave_reduce_fsub_f32(in, 0); } +// CHECK-LABEL: @test_wave_reduce_fsub_f64_dpp +// CHECK: {{.*}}call{{.*}} double @llvm.amdgcn.wave.reduce.fsub.f64( +void test_wave_reduce_fsub_f64_dpp(global double* out, double in) +{ + *out = __builtin_amdgcn_wave_reduce_fsub_f64(in, 0); +} + // CHECK-LABEL: @test_wave_reduce_and_b32_default // CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.wave.reduce.and.i32( void test_wave_reduce_and_b32_default(global int* out, int in) @@ -664,6 +706,13 @@ void test_wave_reduce_fmin_f32_default(global float* out, float in) *out = __builtin_amdgcn_wave_reduce_fmin_f32(in, 0); } +// CHECK-LABEL: @test_wave_reduce_fmin_f64_default +// CHECK: {{.*}}call{{.*}} double @llvm.amdgcn.wave.reduce.fmin.f64( +void test_wave_reduce_fmin_f64_default(global double* out, double in) +{ + *out = __builtin_amdgcn_wave_reduce_fmin_f64(in, 0); +} + // CHECK-LABEL: @test_wave_reduce_min_i32_iterative // CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.wave.reduce.min.i32( void test_wave_reduce_min_i32_iterative(global int* out, int in) @@ -685,6 +734,13 @@ void test_wave_reduce_fmin_f32_iterative(global float* out, float in) *out = __builtin_amdgcn_wave_reduce_fmin_f32(in, 0); } +// CHECK-LABEL: @test_wave_reduce_fmin_f64_iterative +// CHECK: {{.*}}call{{.*}} double @llvm.amdgcn.wave.reduce.fmin.f64( +void test_wave_reduce_fmin_f64_iterative(global double* out, double in) +{ + *out = __builtin_amdgcn_wave_reduce_fmin_f64(in, 0); +} + // CHECK-LABEL: @test_wave_reduce_min_i32_dpp // CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.wave.reduce.min.i32( void test_wave_reduce_min_i32_dpp(global int* out, int in) @@ -706,6 +762,13 @@ void test_wave_reduce_fmin_f32_dpp(global float* out, float in) *out = __builtin_amdgcn_wave_reduce_fmin_f32(in, 0); } +// CHECK-LABEL: @test_wave_reduce_fmin_f64_dpp +// CHECK: {{.*}}call{{.*}} double @llvm.amdgcn.wave.reduce.fmin.f64( +void test_wave_reduce_fmin_f64_dpp(global double* out, double in) +{ + *out = __builtin_amdgcn_wave_reduce_fmin_f64(in, 0); +} + // CHECK-LABEL: @test_wave_reduce_min_u32_default // CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.wave.reduce.umin.i32( void test_wave_reduce_min_u32_default(global int* out, int in) @@ -769,6 +832,13 @@ void test_wave_reduce_fmax_f32_default(global float* out, float in) *out = __builtin_amdgcn_wave_reduce_fmax_f32(in, 0); } +// CHECK-LABEL: @test_wave_reduce_fmax_f64_default +// CHECK: {{.*}}call{{.*}} double @llvm.amdgcn.wave.reduce.fmax.f64( +void test_wave_reduce_fmax_f64_default(global double* out, double in) +{ + *out = __builtin_amdgcn_wave_reduce_fmax_f64(in, 0); +} + // CHECK-LABEL: @test_wave_reduce_max_i32_iterative // CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.wave.reduce.max.i32( void test_wave_reduce_max_i32_iterative(global int* out, int in) @@ -790,6 +860,13 @@ void test_wave_reduce_fmax_f32_iterative(global float* out, float in) *out = __builtin_amdgcn_wave_reduce_fmax_f32(in, 0); } +// CHECK-LABEL: @test_wave_reduce_fmax_f64_iterative +// CHECK: {{.*}}call{{.*}} double @llvm.amdgcn.wave.reduce.fmax.f64( +void test_wave_reduce_fmax_f64_iterative(global double* out, double in) +{ + *out = __builtin_amdgcn_wave_reduce_fmax_f64(in, 0); +} + // CHECK-LABEL: @test_wave_reduce_max_i32_dpp // CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.wave.reduce.max.i32( void test_wave_reduce_max_i32_dpp(global int* out, int in) @@ -811,6 +888,13 @@ void test_wave_reduce_fmax_f32_dpp(global float* out, float in) *out = __builtin_amdgcn_wave_reduce_fmax_f32(in, 0); } +// CHECK-LABEL: @test_wave_reduce_fmax_f64_dpp +// CHECK: {{.*}}call{{.*}} double @llvm.amdgcn.wave.reduce.fmax.f64( +void test_wave_reduce_fmax_f64_dpp(global double* out, double in) +{ + *out = __builtin_amdgcn_wave_reduce_fmax_f64(in, 0); +} + // CHECK-LABEL: @test_wave_reduce_max_u32_default // CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.wave.reduce.umax.i32( void test_wave_reduce_max_u32_default(global int* out, int in) _______________________________________________ llvm-branch-commits mailing list [email protected] https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
