https://github.com/shiltian created https://github.com/llvm/llvm-project/pull/149360
Co-authored-by: Mekhanoshin, Stanislav <stanislav.mekhanos...@amd.com> >From 2fa6c545f78a345feb30c1ac27e9874106b5870c Mon Sep 17 00:00:00 2001 From: Shilei Tian <i...@tianshilei.me> Date: Thu, 17 Jul 2025 13:03:14 -0400 Subject: [PATCH] [AMDGPU] Add support for `v_tanh_f32` on gfx1250 Co-authored-by: Mekhanoshin, Stanislav <stanislav.mekhanos...@amd.com> --- clang/include/clang/Basic/BuiltinsAMDGPU.def | 1 + .../CodeGenOpenCL/builtins-amdgcn-gfx1250.cl | 19 +++++ llvm/lib/Target/AMDGPU/AMDGPU.td | 10 +++ llvm/lib/Target/AMDGPU/GCNSubtarget.h | 3 + llvm/lib/Target/AMDGPU/VOP1Instructions.td | 4 + llvm/lib/TargetParser/TargetParser.cpp | 1 + llvm/test/CodeGen/AMDGPU/llvm.amdgcn.tanh.ll | 84 +++++++++++++++++++ 7 files changed, 122 insertions(+) diff --git a/clang/include/clang/Basic/BuiltinsAMDGPU.def b/clang/include/clang/Basic/BuiltinsAMDGPU.def index 3b6ad7d90be3c..4111837d962b5 100644 --- a/clang/include/clang/Basic/BuiltinsAMDGPU.def +++ b/clang/include/clang/Basic/BuiltinsAMDGPU.def @@ -669,6 +669,7 @@ TARGET_BUILTIN(__builtin_amdgcn_s_monitor_sleep, "vIs", "n", "gfx1250-insts") TARGET_BUILTIN(__builtin_amdgcn_s_wait_asynccnt, "vIUs", "n", "gfx1250-insts") TARGET_BUILTIN(__builtin_amdgcn_s_wait_tensorcnt, "vIUs", "n", "gfx1250-insts") +TARGET_BUILTIN(__builtin_amdgcn_tanhf, "ff", "nc", "tanh-insts") TARGET_BUILTIN(__builtin_amdgcn_tanh_bf16, "yy", "nc", "bf16-trans-insts") TARGET_BUILTIN(__builtin_amdgcn_rcp_bf16, "yy", "nc", "bf16-trans-insts") TARGET_BUILTIN(__builtin_amdgcn_rsq_bf16, "yy", "nc", "bf16-trans-insts") diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl index a1f984c129276..e120a46c6327b 100644 --- a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl +++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl @@ -42,6 +42,25 @@ void test_s_wait_tensorcnt() { __builtin_amdgcn_s_wait_tensorcnt(0); } +// CHECK-LABEL: @test_tanh_f32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) +// CHECK-NEXT: [[A_ADDR:%.*]] = alloca float, align 4, addrspace(5) +// CHECK-NEXT: [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT_ADDR]] to ptr +// CHECK-NEXT: [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A_ADDR]] to ptr +// CHECK-NEXT: store ptr addrspace(1) [[OUT:%.*]], ptr [[OUT_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store float [[A:%.*]], ptr [[A_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP0:%.*]] = load float, ptr [[A_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP1:%.*]] = call float @llvm.amdgcn.tanh.f32(float [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store float [[TMP1]], ptr addrspace(1) [[TMP2]], align 4 +// CHECK-NEXT: ret void +// +void test_tanh_f32(global float* out, float a) +{ + *out = __builtin_amdgcn_tanhf(a); +} + // CHECK-LABEL: @test_tanh_bf16( // CHECK-NEXT: entry: // CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td index faf59c1541fc0..0e0e83b7a6b54 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPU.td +++ b/llvm/lib/Target/AMDGPU/AMDGPU.td @@ -1118,6 +1118,12 @@ def FeatureBitOp3Insts : SubtargetFeature<"bitop3-insts", "Has v_bitop3_b32/v_bitop3_b16 instructions" >; +def FeatureTanhInsts : SubtargetFeature<"tanh-insts", + "HasTanhInsts", + "true", + "Has v_tanh_f32/f16 instructions" +>; + def FeatureTransposeLoadF4F6Insts : SubtargetFeature<"transpose-load-f4f6-insts", "HasTransposeLoadF4F6Insts", "true", @@ -1979,6 +1985,7 @@ def FeatureISAVersion12_50 : FeatureSet< FeatureScalarDwordx3Loads, FeatureDPPSrc1SGPR, FeatureBitOp3Insts, + FeatureTanhInsts, FeatureTransposeLoadF4F6Insts, FeatureBF16TransInsts, FeatureBF16ConversionInsts, @@ -2703,6 +2710,9 @@ def HasPseudoScalarTrans : Predicate<"Subtarget->hasPseudoScalarTrans()">, def HasBitOp3Insts : Predicate<"Subtarget->hasBitOp3Insts()">, AssemblerPredicate<(all_of FeatureBitOp3Insts)>; +def HasTanhInsts : Predicate<"Subtarget->hasTanhInsts()">, + AssemblerPredicate<(all_of FeatureTanhInsts)>; + def HasTransposeLoadF4F6Insts : Predicate<"Subtarget->hasTransposeLoadF4F6Insts()">, AssemblerPredicate<(all_of FeatureTransposeLoadF4F6Insts)>; diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h index 67c6daaa24c2a..268162bcada47 100644 --- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h +++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h @@ -234,6 +234,7 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo, bool HasRestrictedSOffset = false; bool Has64BitLiterals = false; bool HasBitOp3Insts = false; + bool HasTanhInsts = false; bool HasTransposeLoadF4F6Insts = false; bool HasPrngInst = false; bool HasBVHDualAndBVH8Insts = false; @@ -1380,6 +1381,8 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo, return HasMinimum3Maximum3F16; } + bool hasTanhInsts() const { return HasTanhInsts; } + bool hasAddPC64Inst() const { return GFX1250Insts; } bool hasMinimum3Maximum3PKF16() const { diff --git a/llvm/lib/Target/AMDGPU/VOP1Instructions.td b/llvm/lib/Target/AMDGPU/VOP1Instructions.td index ff89b8badeed0..8c35fea8259f4 100644 --- a/llvm/lib/Target/AMDGPU/VOP1Instructions.td +++ b/llvm/lib/Target/AMDGPU/VOP1Instructions.td @@ -366,6 +366,9 @@ defm V_SQRT_F64 : VOP1Inst <"v_sqrt_f64", VOP_F64_F64, int_amdgcn_sqrt>; let TRANS = 1, SchedRW = [WriteTrans32] in { defm V_SIN_F32 : VOP1Inst <"v_sin_f32", VOP_F32_F32, AMDGPUsin>; defm V_COS_F32 : VOP1Inst <"v_cos_f32", VOP_F32_F32, AMDGPUcos>; + +let SubtargetPredicate = HasTanhInsts in +defm V_TANH_F32 : VOP1Inst <"v_tanh_f32", VOP_F32_F32, int_amdgcn_tanh>; } // End TRANS = 1, SchedRW = [WriteTrans32] defm V_NOT_B32 : VOP1Inst <"v_not_b32", VOP_I32_I32>; @@ -1138,6 +1141,7 @@ defm V_CVT_F32_F16 : VOP1_Real_FULL_t16_and_fake16_gfx11_gfx12<0x00b>; defm V_MOV_B64 : VOP1_Real_FULL <GFX1250Gen, 0x1d>; +defm V_TANH_F32 : VOP1_Real_FULL<GFX1250Gen, 0x01e>; defm V_TANH_BF16 : VOP1_Real_FULL_t16_and_fake16_gfx1250<0x04a>; defm V_CVT_F32_BF16 : VOP1_Real_FULL_t16_and_fake16_gfx1250<0x072, "v_cvt_f32_bf16", "V_CVT_F32_BF16_gfx1250">; defm V_CVT_PK_F16_FP8 : VOP1_Real_FULL_t16_and_fake16_gfx1250<0x075>; diff --git a/llvm/lib/TargetParser/TargetParser.cpp b/llvm/lib/TargetParser/TargetParser.cpp index d7e206ef8cd4f..4ca7444a73b35 100644 --- a/llvm/lib/TargetParser/TargetParser.cpp +++ b/llvm/lib/TargetParser/TargetParser.cpp @@ -443,6 +443,7 @@ void AMDGPU::fillAMDGPUFeatureMap(StringRef GPU, const Triple &T, Features["gfx1250-insts"] = true; Features["bitop3-insts"] = true; Features["prng-inst"] = true; + Features["tanh-insts"] = true; Features["transpose-load-f4f6-insts"] = true; Features["bf16-trans-insts"] = true; Features["fp8-conversion-insts"] = true; diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.tanh.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.tanh.ll index 344c0112e4a54..91a2a0b651132 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.tanh.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.tanh.ll @@ -7,8 +7,92 @@ ; FIXME: t16 doesn't work at the moment because the store of s16 under t16 mode fails to select. ; FIXME: GlobalISel does not work with bf16 +declare float @llvm.amdgcn.tanh.f32(float) #0 declare bfloat @llvm.amdgcn.tanh.bf16(bfloat) #0 +define amdgpu_kernel void @tanh_f32(ptr addrspace(1) %out, float %src) #1 { +; SDAG-REAL16-LABEL: tanh_f32: +; SDAG-REAL16: ; %bb.0: +; SDAG-REAL16-NEXT: s_load_b96 s[0:2], s[4:5], 0x0 +; SDAG-REAL16-NEXT: v_mov_b32_e32 v1, 0 +; SDAG-REAL16-NEXT: s_wait_kmcnt 0x0 +; SDAG-REAL16-NEXT: v_tanh_f32_e32 v0, s2 +; SDAG-REAL16-NEXT: global_store_b32 v1, v0, s[0:1] +; SDAG-REAL16-NEXT: s_endpgm +; +; SDAG-FAKE16-LABEL: tanh_f32: +; SDAG-FAKE16: ; %bb.0: +; SDAG-FAKE16-NEXT: s_load_b96 s[0:2], s[4:5], 0x0 +; SDAG-FAKE16-NEXT: v_mov_b32_e32 v1, 0 +; SDAG-FAKE16-NEXT: s_wait_kmcnt 0x0 +; SDAG-FAKE16-NEXT: v_tanh_f32_e32 v0, s2 +; SDAG-FAKE16-NEXT: global_store_b32 v1, v0, s[0:1] +; SDAG-FAKE16-NEXT: s_endpgm + %tanh = call float @llvm.amdgcn.tanh.f32(float %src) #0 + store float %tanh, ptr addrspace(1) %out, align 4 + ret void +} + +; TODO: Really these should be constant folded +define amdgpu_kernel void @tanh_f32_constant_4.0(ptr addrspace(1) %out) #1 { +; SDAG-REAL16-LABEL: tanh_f32_constant_4.0: +; SDAG-REAL16: ; %bb.0: +; SDAG-REAL16-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; SDAG-REAL16-NEXT: v_tanh_f32_e32 v0, 4.0 +; SDAG-REAL16-NEXT: v_mov_b32_e32 v1, 0 +; SDAG-REAL16-NEXT: s_wait_kmcnt 0x0 +; SDAG-REAL16-NEXT: global_store_b32 v1, v0, s[0:1] +; SDAG-REAL16-NEXT: s_endpgm +; +; SDAG-FAKE16-LABEL: tanh_f32_constant_4.0: +; SDAG-FAKE16: ; %bb.0: +; SDAG-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; SDAG-FAKE16-NEXT: v_tanh_f32_e32 v0, 4.0 +; SDAG-FAKE16-NEXT: v_mov_b32_e32 v1, 0 +; SDAG-FAKE16-NEXT: s_wait_kmcnt 0x0 +; SDAG-FAKE16-NEXT: global_store_b32 v1, v0, s[0:1] +; SDAG-FAKE16-NEXT: s_endpgm + %tanh = call float @llvm.amdgcn.tanh.f32(float 4.0) #0 + store float %tanh, ptr addrspace(1) %out, align 4 + ret void +} + +define amdgpu_kernel void @tanh_f32_constant_100.0(ptr addrspace(1) %out) #1 { +; SDAG-REAL16-LABEL: tanh_f32_constant_100.0: +; SDAG-REAL16: ; %bb.0: +; SDAG-REAL16-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; SDAG-REAL16-NEXT: v_tanh_f32_e32 v0, 0x42c80000 +; SDAG-REAL16-NEXT: v_mov_b32_e32 v1, 0 +; SDAG-REAL16-NEXT: s_wait_kmcnt 0x0 +; SDAG-REAL16-NEXT: global_store_b32 v1, v0, s[0:1] +; SDAG-REAL16-NEXT: s_endpgm +; +; SDAG-FAKE16-LABEL: tanh_f32_constant_100.0: +; SDAG-FAKE16: ; %bb.0: +; SDAG-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; SDAG-FAKE16-NEXT: v_tanh_f32_e32 v0, 0x42c80000 +; SDAG-FAKE16-NEXT: v_mov_b32_e32 v1, 0 +; SDAG-FAKE16-NEXT: s_wait_kmcnt 0x0 +; SDAG-FAKE16-NEXT: global_store_b32 v1, v0, s[0:1] +; SDAG-FAKE16-NEXT: s_endpgm + %tanh = call float @llvm.amdgcn.tanh.f32(float 100.0) #0 + store float %tanh, ptr addrspace(1) %out, align 4 + ret void +} + +define amdgpu_kernel void @tanh_undef_f32(ptr addrspace(1) %out) #1 { +; SDAG-REAL16-LABEL: tanh_undef_f32: +; SDAG-REAL16: ; %bb.0: +; SDAG-REAL16-NEXT: s_endpgm +; +; SDAG-FAKE16-LABEL: tanh_undef_f32: +; SDAG-FAKE16: ; %bb.0: +; SDAG-FAKE16-NEXT: s_endpgm + %tanh = call float @llvm.amdgcn.tanh.f32(float undef) + store float %tanh, ptr addrspace(1) %out, align 4 + ret void +} + define amdgpu_kernel void @tanh_bf16(ptr addrspace(1) %out, bfloat %src) #1 { ; SDAG-REAL16-LABEL: tanh_bf16: ; SDAG-REAL16: ; %bb.0: _______________________________________________ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits