https://github.com/rovka created https://github.com/llvm/llvm-project/pull/149052
When using the `amdgcn.init.whole.wave` intrinsic, we add dummy VGPR arguments with the purpose of preserving their inactive lanes. The pattern may look something like this: ``` entry: call amdgcn.init.whole.wave branch to shader or tail shader: $vInactive = IMPLICIT_DEF ; Tells regalloc it's safe to use the active lanes actual code... tail: call amdgcn.cs.chain [...], implicit $vInactive ``` We should not report these VGPRs in the `.vgpr_count` metadata. This patch achieves that goal by ignoring IMPLICIT_DEFs and calls. This should be safe since if those registers are actually used in any other context, they will be counted there. It also reduces the scope of the code that counts unused function arguments to only work on entry functions, since only they need to handle hardware-initialized registers. This is a reworked version of #133242, which was reverted in #144039. From 3ccfa1e5284bcafe6f927d89096d26619c23bec1 Mon Sep 17 00:00:00 2001 From: Diana Picus <diana-magda.pi...@amd.com> Date: Wed, 16 Jul 2025 09:45:50 +0200 Subject: [PATCH] [AMDGPU] Ignore inactive VGPRs in .vgpr_count When using the `amdgcn.init.whole.wave` intrinsic, we add dummy VGPR arguments with the purpose of preserving their inactive lanes. The pattern may look something like this: ``` entry: call amdgcn.init.whole.wave branch to shader or tail shader: $vInactive = IMPLICIT_DEF ; Tells regalloc it's safe to use the active lanes actual code... tail: call amdgcn.cs.chain [...], implicit $vInactive ``` We should not report these VGPRs in the `.vgpr_count` metadata. This patch achieves that goal by ignoring IMPLICIT_DEFs and calls. This should be safe since if those registers are actually used in any other context, they will be counted there. It also reduces the scope of the code that counts unused function arguments to only work on entry functions, since only they need to handle hardware-initialized registers. This is a reworked version of #133242, which was reverted in #144039. 
--- llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp | 2 +- .../AMDGPU/AMDGPUResourceUsageAnalysis.cpp | 3 + llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h | 22 ++++++ .../init-whole-wave-vgpr-count-large.ll | 76 ++++++++++++++++++ .../AMDGPU/init-whole-wave-vgpr-count-leaf.ll | 50 ++++++++++++ ...init-whole-wave-vgpr-count-use-inactive.ll | 78 +++++++++++++++++++ .../AMDGPU/init-whole-wave-vgpr-count.ll | 75 ++++++++++++++++++ .../AMDGPU/unnamed-function-resource-info.ll | 2 +- .../CodeGen/AMDGPU/vgpr-agpr-limit-gfx90a.ll | 4 +- .../test/CodeGen/AMDGPU/vgpr-count-compute.ll | 30 +++++++ .../AMDGPU/vgpr-count-graphics-chain.ll | 27 +++++++ .../CodeGen/AMDGPU/vgpr-count-graphics.ll | 25 ++++++ 12 files changed, 390 insertions(+), 4 deletions(-) create mode 100644 llvm/test/CodeGen/AMDGPU/init-whole-wave-vgpr-count-large.ll create mode 100644 llvm/test/CodeGen/AMDGPU/init-whole-wave-vgpr-count-leaf.ll create mode 100644 llvm/test/CodeGen/AMDGPU/init-whole-wave-vgpr-count-use-inactive.ll create mode 100644 llvm/test/CodeGen/AMDGPU/init-whole-wave-vgpr-count.ll create mode 100644 llvm/test/CodeGen/AMDGPU/vgpr-count-compute.ll create mode 100644 llvm/test/CodeGen/AMDGPU/vgpr-count-graphics-chain.ll create mode 100644 llvm/test/CodeGen/AMDGPU/vgpr-count-graphics.ll diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp index c0920e3e71bee..14c392b2b2250 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp @@ -993,7 +993,7 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo, // dispatch registers are function args. 
unsigned WaveDispatchNumSGPR = 0, WaveDispatchNumVGPR = 0; - if (isShader(F.getCallingConv())) { + if (AMDGPU::shouldReportUnusedFuncArgs(F.getCallingConv())) { bool IsPixelShader = F.getCallingConv() == CallingConv::AMDGPU_PS && !STM.isAmdHsaOS(); diff --git a/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.cpp b/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.cpp index ec4daa2cf662a..bb0d2027c71f4 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.cpp @@ -213,6 +213,9 @@ AMDGPUResourceUsageAnalysis::analyzeResourceUsage( if (!RC || !TRI.isVGPRClass(RC)) continue; + if (MI.isCall() || MI.isImplicitDef()) + continue; + unsigned Width = divideCeil(TRI.getRegSizeInBits(*RC), 32); unsigned HWReg = TRI.getHWRegIndex(Reg); int MaxUsed = HWReg + Width - 1; diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h index e6078d6918ac2..bc06c68d968c6 100644 --- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h +++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h @@ -1367,6 +1367,28 @@ constexpr bool isEntryFunctionCC(CallingConv::ID CC) { } } +// Shaders that are entry functions need to count input arguments even if +// they're not used (i.e. not reported by AMDGPUResourceUsageAnalysis). Other +// functions can skip including them. This is especially important for shaders +// that use the init.whole.wave intrinsic, since they sometimes have VGPR +// arguments that are only added for the purpose of preserving their inactive +// lanes and should not be included in the vgpr-count. 
+LLVM_READNONE +constexpr bool shouldReportUnusedFuncArgs(CallingConv::ID CC) { + switch (CC) { + case CallingConv::AMDGPU_VS: + case CallingConv::AMDGPU_LS: + case CallingConv::AMDGPU_HS: + case CallingConv::AMDGPU_ES: + case CallingConv::AMDGPU_GS: + case CallingConv::AMDGPU_PS: + case CallingConv::AMDGPU_CS: + return true; + default: + return false; + } +} + LLVM_READNONE constexpr bool isChainCC(CallingConv::ID CC) { switch (CC) { diff --git a/llvm/test/CodeGen/AMDGPU/init-whole-wave-vgpr-count-large.ll b/llvm/test/CodeGen/AMDGPU/init-whole-wave-vgpr-count-large.ll new file mode 100644 index 0000000000000..e47f5e25ead3a --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/init-whole-wave-vgpr-count-large.ll @@ -0,0 +1,76 @@ +; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx1200 < %s | FileCheck %s + +; CHECK-LABEL: .shader_functions: + +; Use VGPRs above the input arguments. +; CHECK-LABEL: _miss_1: +; CHECK: .vgpr_count:{{.*}}0x1d{{$}} + +define amdgpu_cs_chain void @_miss_1(ptr inreg %next.callee, i32 inreg %global.table, i32 inreg %max.outgoing.vgpr.count, + i32 %vcr, { i32 } %system.data, + i32 %inactive.vgpr, i32 %inactive.vgpr1, i32 %inactive.vgpr2, i32 %inactive.vgpr3, + i32 %inactive.vgpr4, i32 %inactive.vgpr5, i32 %inactive.vgpr6, i32 %inactive.vgpr7, + i32 %inactive.vgpr8, i32 %inactive.vgpr9) + local_unnamed_addr { +entry: + %system.data.value = extractvalue { i32 } %system.data, 0 + %dead.val = call i32 @llvm.amdgcn.dead.i32() + %is.whole.wave = call i1 @llvm.amdgcn.init.whole.wave() + br i1 %is.whole.wave, label %shader, label %tail + +shader: + %system.data.extract = extractvalue { i32 } %system.data, 0 + %data.mul = mul i32 %system.data.extract, 2 + %data.add = add i32 %data.mul, 1 + call void asm sideeffect "; clobber v28", "~{v28}"() + br label %tail + +tail: + %final.vcr = phi i32 [ %vcr, %entry ], [ %data.mul, %shader ] + %final.sys.data = phi i32 [ %system.data.value, %entry ], [ %data.add, %shader ] + %final.inactive0 = phi i32 [ %inactive.vgpr, %entry 
], [ %dead.val, %shader ] + %final.inactive1 = phi i32 [ %inactive.vgpr1, %entry ], [ %dead.val, %shader ] + %final.inactive2 = phi i32 [ %inactive.vgpr2, %entry ], [ %dead.val, %shader ] + %final.inactive3 = phi i32 [ %inactive.vgpr3, %entry ], [ %dead.val, %shader ] + %final.inactive4 = phi i32 [ %inactive.vgpr4, %entry ], [ %dead.val, %shader ] + %final.inactive5 = phi i32 [ %inactive.vgpr5, %entry ], [ %dead.val, %shader ] + %final.inactive6 = phi i32 [ %inactive.vgpr6, %entry ], [ %dead.val, %shader ] + %final.inactive7 = phi i32 [ %inactive.vgpr7, %entry ], [ %dead.val, %shader ] + %final.inactive8 = phi i32 [ %inactive.vgpr8, %entry ], [ %dead.val, %shader ] + %final.inactive9 = phi i32 [ %inactive.vgpr9, %entry ], [ %dead.val, %shader ] + + %struct.init = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } poison, i32 %final.vcr, 0 + %struct.with.data = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.init, i32 %final.sys.data, 1 + %struct.with.inactive0 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.with.data, i32 %final.inactive0, 2 + %struct.with.inactive1 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.with.inactive0, i32 %final.inactive1, 3 + %struct.with.inactive2 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.with.inactive1, i32 %final.inactive2, 4 + %struct.with.inactive3 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.with.inactive2, i32 %final.inactive3, 5 + %struct.with.inactive4 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.with.inactive3, i32 %final.inactive4, 6 + %struct.with.inactive5 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.with.inactive4, i32 %final.inactive5, 7 + %struct.with.inactive6 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, 
i32 } %struct.with.inactive5, i32 %final.inactive6, 8 + %struct.with.inactive7 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.with.inactive6, i32 %final.inactive7, 9 + %struct.with.inactive8 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.with.inactive7, i32 %final.inactive8, 10 + %final.struct = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.with.inactive8, i32 %final.inactive9, 11 + + %vec.global = insertelement <4 x i32> poison, i32 %global.table, i64 0 + %vec.max.vgpr = insertelement <4 x i32> %vec.global, i32 %max.outgoing.vgpr.count, i64 1 + %vec.sys.data = insertelement <4 x i32> %vec.max.vgpr, i32 %final.sys.data, i64 2 + %final.vec = insertelement <4 x i32> %vec.sys.data, i32 0, i64 3 + + call void (ptr, i32, <4 x i32>, { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 }, i32, ...) + @llvm.amdgcn.cs.chain.p0.i32.v4i32.sl_i32i32i32i32i32i32i32i32i32i32i32i32s( + ptr %next.callee, i32 0, <4 x i32> inreg %final.vec, + { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %final.struct, + i32 1, i32 %max.outgoing.vgpr.count, i32 -1, ptr @retry_vgpr_alloc.v4i32) + unreachable +} + +declare i32 @llvm.amdgcn.dead.i32() +declare i1 @llvm.amdgcn.init.whole.wave() +declare void @llvm.amdgcn.cs.chain.p0.i32.v4i32.sl_i32i32i32i32i32i32i32i32i32i32i32i32s(ptr, i32, <4 x i32>, { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 }, i32 immarg, ...) 
+ +declare amdgpu_cs_chain void @retry_vgpr_alloc.v4i32(<4 x i32> inreg) + +!amdgpu.pal.metadata.msgpack = !{!0} + +!0 = !{!"\82\B0amdpal.pipelines\91\8B\A4.api\A6Vulkan\B2.compute_registers\85\AB.tg_size_en\C3\AA.tgid_x_en\C3\AA.tgid_y_en\C3\AA.tgid_z_en\C3\AF.tidig_comp_cnt\00\B0.hardware_stages\81\A3.cs\8D\AF.checksum_value\00\AB.debug_mode\00\AB.float_mode\CC\C0\A9.image_op\C2\AC.mem_ordered\C3\AB.sgpr_limitj\B7.threadgroup_dimensions\93 \01\01\AD.trap_present\00\B2.user_data_reg_map\90\AB.user_sgprs\10\AB.vgpr_limit\CD\01\00\AF.wavefront_size \AF.wg_round_robin\C2\B7.internal_pipeline_hash\92\CF|{2&\DCC\85M\CFep\8A\EDR\DE\D6\E1\B1.shader_functions\81\A7_miss_1\82\B4.frontend_stack_size\00\B4.outgoing_vgpr_countP\A8.shaders\81\A8.compute\82\B0.api_shader_hash\92\00\00\B1.hardware_mapping\91\A3.cs\B0.spill_threshold\CD\FF\FF\A5.type\A2Cs\B0.user_data_limit\01\A9.uses_cps\C3\AF.xgl_cache_info\82\B3.128_bit_cache_hash\92\CF\B4\AF\9D\0B\07\88\03\02\CF\01o\C9\CAf?)\DA\AD.llpc_version\A476.0\AEamdpal.version\92\03\00"} \ No newline at end of file diff --git a/llvm/test/CodeGen/AMDGPU/init-whole-wave-vgpr-count-leaf.ll b/llvm/test/CodeGen/AMDGPU/init-whole-wave-vgpr-count-leaf.ll new file mode 100644 index 0000000000000..5d7472fd3c56e --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/init-whole-wave-vgpr-count-leaf.ll @@ -0,0 +1,50 @@ +; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx1200 < %s | FileCheck %s + +; CHECK-LABEL: .shader_functions: + +; Make sure that .vgpr_count doesn't include the %inactive.vgpr registers. +; CHECK-LABEL: leaf_shader: +; CHECK: .vgpr_count:{{.*}}0xc{{$}} + +; Function without calls. 
+define amdgpu_cs_chain void @_leaf_shader(ptr %output.ptr, i32 inreg %input.value, + i32 %active.vgpr1, i32 %active.vgpr2, + i32 %inactive.vgpr1, i32 %inactive.vgpr2, i32 %inactive.vgpr3, + i32 %inactive.vgpr4, i32 %inactive.vgpr5, i32 %inactive.vgpr6) + local_unnamed_addr { +entry: + %dead.val = call i32 @llvm.amdgcn.dead.i32() + %is.whole.wave = call i1 @llvm.amdgcn.init.whole.wave() + br i1 %is.whole.wave, label %compute, label %merge + +compute: + ; Perform a more complex computation using active VGPRs + %square = mul i32 %active.vgpr1, %active.vgpr1 + %product = mul i32 %square, %active.vgpr2 + %sum = add i32 %product, %input.value + %result = add i32 %sum, 42 + br label %merge + +merge: + %final.result = phi i32 [ 0, %entry ], [ %result, %compute ] + %final.inactive1 = phi i32 [ %inactive.vgpr1, %entry ], [ %dead.val, %compute ] + %final.inactive2 = phi i32 [ %inactive.vgpr2, %entry ], [ %dead.val, %compute ] + %final.inactive3 = phi i32 [ %inactive.vgpr3, %entry ], [ %dead.val, %compute ] + %final.inactive4 = phi i32 [ %inactive.vgpr4, %entry ], [ %dead.val, %compute ] + %final.inactive5 = phi i32 [ %inactive.vgpr5, %entry ], [ %dead.val, %compute ] + %final.inactive6 = phi i32 [ %inactive.vgpr6, %entry ], [ %dead.val, %compute ] + + store i32 %final.result, ptr %output.ptr, align 4 + + ret void +} + +declare i32 @llvm.amdgcn.dead.i32() +declare i1 @llvm.amdgcn.init.whole.wave() +declare void @llvm.amdgcn.cs.chain.p0.i32.v4i32.sl_i32i32i32i32i32i32i32i32i32i32i32i32s(ptr, i32, <4 x i32>, { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 }, i32 immarg, ...) 
+ +declare amdgpu_cs_chain void @retry_vgpr_alloc.v4i32(<4 x i32> inreg) + +!amdgpu.pal.metadata.msgpack = !{!0} + +!0 = !{!"\82\B0amdpal.pipelines\91\8B\A4.api\A6Vulkan\B2.compute_registers\85\AB.tg_size_en\C3\AA.tgid_x_en\C3\AA.tgid_y_en\C3\AA.tgid_z_en\C3\AF.tidig_comp_cnt\00\B0.hardware_stages\81\A3.cs\8D\AF.checksum_value\00\AB.debug_mode\00\AB.float_mode\CC\C0\A9.image_op\C2\AC.mem_ordered\C3\AB.sgpr_limitj\B7.threadgroup_dimensions\93 \01\01\AD.trap_present\00\B2.user_data_reg_map\90\AB.user_sgprs\10\AB.vgpr_limit\CD\01\00\AF.wavefront_size \AF.wg_round_robin\C2\B7.internal_pipeline_hash\92\CF|{2&\DCC\85M\CFep\8A\EDR\DE\D6\E1\B1.shader_functions\81\A7_miss_1\82\B4.frontend_stack_size\00\B4.outgoing_vgpr_countP\A8.shaders\81\A8.compute\82\B0.api_shader_hash\92\00\00\B1.hardware_mapping\91\A3.cs\B0.spill_threshold\CD\FF\FF\A5.type\A2Cs\B0.user_data_limit\01\A9.uses_cps\C3\AF.xgl_cache_info\82\B3.128_bit_cache_hash\92\CF\B4\AF\9D\0B\07\88\03\02\CF\01o\C9\CAf?)\DA\AD.llpc_version\A476.0\AEamdpal.version\92\03\00"} diff --git a/llvm/test/CodeGen/AMDGPU/init-whole-wave-vgpr-count-use-inactive.ll b/llvm/test/CodeGen/AMDGPU/init-whole-wave-vgpr-count-use-inactive.ll new file mode 100644 index 0000000000000..f1f7fb22d44c6 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/init-whole-wave-vgpr-count-use-inactive.ll @@ -0,0 +1,78 @@ +; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx1200 < %s | FileCheck %s + +; CHECK-LABEL: .shader_functions: + +; Make sure that .vgpr_count doesn't include the %inactive.vgpr registers. +; The shader is free to use any of the VGPRs mapped to a %inactive.vgpr as long as it only touches its active lanes. 
+; In that case, the VGPR should be included in the .vgpr_count +; CHECK-LABEL: _miss_1: +; CHECK: .vgpr_count:{{.*}}0xd{{$}} + +define amdgpu_cs_chain void @_miss_1(ptr inreg %next.callee, i32 inreg %global.table, i32 inreg %max.outgoing.vgpr.count, + i32 %vcr, { i32 } %system.data, + i32 %inactive.vgpr, i32 %inactive.vgpr1, i32 %inactive.vgpr2, i32 %inactive.vgpr3, + i32 %inactive.vgpr4, i32 %inactive.vgpr5, i32 %inactive.vgpr6, i32 %inactive.vgpr7, + i32 %inactive.vgpr8, i32 %inactive.vgpr9) + local_unnamed_addr { +entry: + %system.data.value = extractvalue { i32 } %system.data, 0 + %dead.val = call i32 @llvm.amdgcn.dead.i32() + %is.whole.wave = call i1 @llvm.amdgcn.init.whole.wave() + br i1 %is.whole.wave, label %shader, label %tail + +shader: + %system.data.extract = extractvalue { i32 } %system.data, 0 + %data.mul = mul i32 %system.data.extract, 2 + %data.add = add i32 %data.mul, 1 + call void asm sideeffect "; use VGPR for %inactive.vgpr2", "~{v12}"() + br label %tail + +tail: + %final.vcr = phi i32 [ %vcr, %entry ], [ %data.mul, %shader ] + %final.sys.data = phi i32 [ %system.data.value, %entry ], [ %data.add, %shader ] + %final.inactive0 = phi i32 [ %inactive.vgpr, %entry ], [ %dead.val, %shader ] + %final.inactive1 = phi i32 [ %inactive.vgpr1, %entry ], [ %dead.val, %shader ] + %final.inactive2 = phi i32 [ %inactive.vgpr2, %entry ], [ %dead.val, %shader ] + %final.inactive3 = phi i32 [ %inactive.vgpr3, %entry ], [ %dead.val, %shader ] + %final.inactive4 = phi i32 [ %inactive.vgpr4, %entry ], [ %dead.val, %shader ] + %final.inactive5 = phi i32 [ %inactive.vgpr5, %entry ], [ %dead.val, %shader ] + %final.inactive6 = phi i32 [ %inactive.vgpr6, %entry ], [ %dead.val, %shader ] + %final.inactive7 = phi i32 [ %inactive.vgpr7, %entry ], [ %dead.val, %shader ] + %final.inactive8 = phi i32 [ %inactive.vgpr8, %entry ], [ %dead.val, %shader ] + %final.inactive9 = phi i32 [ %inactive.vgpr9, %entry ], [ %dead.val, %shader ] + + %struct.init = insertvalue { i32, i32, 
i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } poison, i32 %final.vcr, 0 + %struct.with.data = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.init, i32 %final.sys.data, 1 + %struct.with.inactive0 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.with.data, i32 %final.inactive0, 2 + %struct.with.inactive1 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.with.inactive0, i32 %final.inactive1, 3 + %struct.with.inactive2 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.with.inactive1, i32 %final.inactive2, 4 + %struct.with.inactive3 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.with.inactive2, i32 %final.inactive3, 5 + %struct.with.inactive4 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.with.inactive3, i32 %final.inactive4, 6 + %struct.with.inactive5 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.with.inactive4, i32 %final.inactive5, 7 + %struct.with.inactive6 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.with.inactive5, i32 %final.inactive6, 8 + %struct.with.inactive7 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.with.inactive6, i32 %final.inactive7, 9 + %struct.with.inactive8 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.with.inactive7, i32 %final.inactive8, 10 + %final.struct = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.with.inactive8, i32 %final.inactive9, 11 + + %vec.global = insertelement <4 x i32> poison, i32 %global.table, i64 0 + %vec.max.vgpr = insertelement <4 x i32> %vec.global, i32 %max.outgoing.vgpr.count, i64 1 + %vec.sys.data = insertelement <4 x i32> %vec.max.vgpr, i32 %final.sys.data, i64 2 + %final.vec = insertelement <4 x i32> 
%vec.sys.data, i32 0, i64 3 + + call void (ptr, i32, <4 x i32>, { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 }, i32, ...) + @llvm.amdgcn.cs.chain.p0.i32.v4i32.sl_i32i32i32i32i32i32i32i32i32i32i32i32s( + ptr %next.callee, i32 0, <4 x i32> inreg %final.vec, + { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %final.struct, + i32 1, i32 %max.outgoing.vgpr.count, i32 -1, ptr @retry_vgpr_alloc.v4i32) + unreachable +} + +declare i32 @llvm.amdgcn.dead.i32() +declare i1 @llvm.amdgcn.init.whole.wave() +declare void @llvm.amdgcn.cs.chain.p0.i32.v4i32.sl_i32i32i32i32i32i32i32i32i32i32i32i32s(ptr, i32, <4 x i32>, { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 }, i32 immarg, ...) + +declare amdgpu_cs_chain void @retry_vgpr_alloc.v4i32(<4 x i32> inreg) + +!amdgpu.pal.metadata.msgpack = !{!0} + +!0 = !{!"\82\B0amdpal.pipelines\91\8B\A4.api\A6Vulkan\B2.compute_registers\85\AB.tg_size_en\C3\AA.tgid_x_en\C3\AA.tgid_y_en\C3\AA.tgid_z_en\C3\AF.tidig_comp_cnt\00\B0.hardware_stages\81\A3.cs\8D\AF.checksum_value\00\AB.debug_mode\00\AB.float_mode\CC\C0\A9.image_op\C2\AC.mem_ordered\C3\AB.sgpr_limitj\B7.threadgroup_dimensions\93 \01\01\AD.trap_present\00\B2.user_data_reg_map\90\AB.user_sgprs\10\AB.vgpr_limit\CD\01\00\AF.wavefront_size \AF.wg_round_robin\C2\B7.internal_pipeline_hash\92\CF|{2&\DCC\85M\CFep\8A\EDR\DE\D6\E1\B1.shader_functions\81\A7_miss_1\82\B4.frontend_stack_size\00\B4.outgoing_vgpr_countP\A8.shaders\81\A8.compute\82\B0.api_shader_hash\92\00\00\B1.hardware_mapping\91\A3.cs\B0.spill_threshold\CD\FF\FF\A5.type\A2Cs\B0.user_data_limit\01\A9.uses_cps\C3\AF.xgl_cache_info\82\B3.128_bit_cache_hash\92\CF\B4\AF\9D\0B\07\88\03\02\CF\01o\C9\CAf?)\DA\AD.llpc_version\A476.0\AEamdpal.version\92\03\00"} diff --git a/llvm/test/CodeGen/AMDGPU/init-whole-wave-vgpr-count.ll b/llvm/test/CodeGen/AMDGPU/init-whole-wave-vgpr-count.ll new file mode 100644 index 0000000000000..b9130dd1b7ed4 --- /dev/null +++ 
b/llvm/test/CodeGen/AMDGPU/init-whole-wave-vgpr-count.ll @@ -0,0 +1,75 @@ +; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx1200 < %s | FileCheck %s + +; CHECK-LABEL: .shader_functions: + +; Make sure that .vgpr_count doesn't include the %inactive.vgpr registers. +; CHECK-LABEL: _miss_1: +; CHECK: .vgpr_count:{{.*}}0xa{{$}} + +define amdgpu_cs_chain void @_miss_1(ptr inreg %next.callee, i32 inreg %global.table, i32 inreg %max.outgoing.vgpr.count, + i32 %vcr, { i32 } %system.data, + i32 %inactive.vgpr, i32 %inactive.vgpr1, i32 %inactive.vgpr2, i32 %inactive.vgpr3, + i32 %inactive.vgpr4, i32 %inactive.vgpr5, i32 %inactive.vgpr6, i32 %inactive.vgpr7, + i32 %inactive.vgpr8, i32 %inactive.vgpr9) + local_unnamed_addr { +entry: + %system.data.value = extractvalue { i32 } %system.data, 0 + %dead.val = call i32 @llvm.amdgcn.dead.i32() + %is.whole.wave = call i1 @llvm.amdgcn.init.whole.wave() + br i1 %is.whole.wave, label %shader, label %tail + +shader: + %system.data.extract = extractvalue { i32 } %system.data, 0 + %data.mul = mul i32 %system.data.extract, 2 + %data.add = add i32 %data.mul, 1 + br label %tail + +tail: + %final.vcr = phi i32 [ %vcr, %entry ], [ %data.mul, %shader ] + %final.sys.data = phi i32 [ %system.data.value, %entry ], [ %data.add, %shader ] + %final.inactive0 = phi i32 [ %inactive.vgpr, %entry ], [ %dead.val, %shader ] + %final.inactive1 = phi i32 [ %inactive.vgpr1, %entry ], [ %dead.val, %shader ] + %final.inactive2 = phi i32 [ %inactive.vgpr2, %entry ], [ %dead.val, %shader ] + %final.inactive3 = phi i32 [ %inactive.vgpr3, %entry ], [ %dead.val, %shader ] + %final.inactive4 = phi i32 [ %inactive.vgpr4, %entry ], [ %dead.val, %shader ] + %final.inactive5 = phi i32 [ %inactive.vgpr5, %entry ], [ %dead.val, %shader ] + %final.inactive6 = phi i32 [ %inactive.vgpr6, %entry ], [ %dead.val, %shader ] + %final.inactive7 = phi i32 [ %inactive.vgpr7, %entry ], [ %dead.val, %shader ] + %final.inactive8 = phi i32 [ %inactive.vgpr8, %entry ], [ %dead.val, %shader ] + 
%final.inactive9 = phi i32 [ %inactive.vgpr9, %entry ], [ %dead.val, %shader ] + + %struct.init = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } poison, i32 %final.vcr, 0 + %struct.with.data = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.init, i32 %final.sys.data, 1 + %struct.with.inactive0 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.with.data, i32 %final.inactive0, 2 + %struct.with.inactive1 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.with.inactive0, i32 %final.inactive1, 3 + %struct.with.inactive2 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.with.inactive1, i32 %final.inactive2, 4 + %struct.with.inactive3 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.with.inactive2, i32 %final.inactive3, 5 + %struct.with.inactive4 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.with.inactive3, i32 %final.inactive4, 6 + %struct.with.inactive5 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.with.inactive4, i32 %final.inactive5, 7 + %struct.with.inactive6 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.with.inactive5, i32 %final.inactive6, 8 + %struct.with.inactive7 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.with.inactive6, i32 %final.inactive7, 9 + %struct.with.inactive8 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.with.inactive7, i32 %final.inactive8, 10 + %final.struct = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.with.inactive8, i32 %final.inactive9, 11 + + %vec.global = insertelement <4 x i32> poison, i32 %global.table, i64 0 + %vec.max.vgpr = insertelement <4 x i32> %vec.global, i32 %max.outgoing.vgpr.count, i64 1 + 
%vec.sys.data = insertelement <4 x i32> %vec.max.vgpr, i32 %final.sys.data, i64 2 + %final.vec = insertelement <4 x i32> %vec.sys.data, i32 0, i64 3 + + call void (ptr, i32, <4 x i32>, { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 }, i32, ...) + @llvm.amdgcn.cs.chain.p0.i32.v4i32.sl_i32i32i32i32i32i32i32i32i32i32i32i32s( + ptr %next.callee, i32 0, <4 x i32> inreg %final.vec, + { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %final.struct, + i32 1, i32 %max.outgoing.vgpr.count, i32 -1, ptr @retry_vgpr_alloc.v4i32) + unreachable +} + +declare i32 @llvm.amdgcn.dead.i32() +declare i1 @llvm.amdgcn.init.whole.wave() +declare void @llvm.amdgcn.cs.chain.p0.i32.v4i32.sl_i32i32i32i32i32i32i32i32i32i32i32i32s(ptr, i32, <4 x i32>, { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 }, i32 immarg, ...) + +declare amdgpu_cs_chain void @retry_vgpr_alloc.v4i32(<4 x i32> inreg) + +!amdgpu.pal.metadata.msgpack = !{!0} + +!0 = !{!"\82\B0amdpal.pipelines\91\8B\A4.api\A6Vulkan\B2.compute_registers\85\AB.tg_size_en\C3\AA.tgid_x_en\C3\AA.tgid_y_en\C3\AA.tgid_z_en\C3\AF.tidig_comp_cnt\00\B0.hardware_stages\81\A3.cs\8D\AF.checksum_value\00\AB.debug_mode\00\AB.float_mode\CC\C0\A9.image_op\C2\AC.mem_ordered\C3\AB.sgpr_limitj\B7.threadgroup_dimensions\93 \01\01\AD.trap_present\00\B2.user_data_reg_map\90\AB.user_sgprs\10\AB.vgpr_limit\CD\01\00\AF.wavefront_size \AF.wg_round_robin\C2\B7.internal_pipeline_hash\92\CF|{2&\DCC\85M\CFep\8A\EDR\DE\D6\E1\B1.shader_functions\81\A7_miss_1\82\B4.frontend_stack_size\00\B4.outgoing_vgpr_countP\A8.shaders\81\A8.compute\82\B0.api_shader_hash\92\00\00\B1.hardware_mapping\91\A3.cs\B0.spill_threshold\CD\FF\FF\A5.type\A2Cs\B0.user_data_limit\01\A9.uses_cps\C3\AF.xgl_cache_info\82\B3.128_bit_cache_hash\92\CF\B4\AF\9D\0B\07\88\03\02\CF\01o\C9\CAf?)\DA\AD.llpc_version\A476.0\AEamdpal.version\92\03\00"} diff --git a/llvm/test/CodeGen/AMDGPU/unnamed-function-resource-info.ll 
b/llvm/test/CodeGen/AMDGPU/unnamed-function-resource-info.ll index cf5b95a729974..bb0ec0d3ad3f8 100644 --- a/llvm/test/CodeGen/AMDGPU/unnamed-function-resource-info.ll +++ b/llvm/test/CodeGen/AMDGPU/unnamed-function-resource-info.ll @@ -16,7 +16,7 @@ entry: } ; CHECK-LABEL: __unnamed_2: -; CHECK: .set __unnamed_2.num_vgpr, max(32, __unnamed_1.num_vgpr) +; CHECK: .set __unnamed_2.num_vgpr, max(1, __unnamed_1.num_vgpr) ; CHECK: .set __unnamed_2.num_agpr, max(0, __unnamed_1.num_agpr) ; CHECK: .set __unnamed_2.numbered_sgpr, max(34, __unnamed_1.numbered_sgpr) ; CHECK: .set __unnamed_2.private_seg_size, 16+max(__unnamed_1.private_seg_size) diff --git a/llvm/test/CodeGen/AMDGPU/vgpr-agpr-limit-gfx90a.ll b/llvm/test/CodeGen/AMDGPU/vgpr-agpr-limit-gfx90a.ll index 2cb5e309c8c21..ee35dc4cddade 100644 --- a/llvm/test/CodeGen/AMDGPU/vgpr-agpr-limit-gfx90a.ll +++ b/llvm/test/CodeGen/AMDGPU/vgpr-agpr-limit-gfx90a.ll @@ -1264,9 +1264,9 @@ define amdgpu_kernel void @k1024_call_no_agprs_ub_callee() #1025 { } ; GCN-LABEL: {{^}}f1024_0: -; GFX90A: NumVgprs: 32 +; GFX90A: NumVgprs: 1 ; GFX90A: NumAgprs: 1 -; GFX90A: TotalNumVgprs: 33 +; GFX90A: TotalNumVgprs: 5 define void @f1024_0() #1024 { call void @foo() ret void diff --git a/llvm/test/CodeGen/AMDGPU/vgpr-count-compute.ll b/llvm/test/CodeGen/AMDGPU/vgpr-count-compute.ll new file mode 100644 index 0000000000000..8c8182db7b479 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/vgpr-count-compute.ll @@ -0,0 +1,30 @@ +; RUN: llc -mcpu=gfx1200 -o - < %s | FileCheck %s --check-prefixes=CHECK,PACKED +; RUN: llc -mcpu=gfx1030 -o - < %s | FileCheck %s --check-prefixes=CHECK,NOTPACKED +target triple = "amdgcn-amd-amdhsa" + +@global = addrspace(1) global i32 poison, align 4 + +; Carefully crafted kernel that uses v0 but never writes a VGPR or reads another VGPR. +; Only hardware-initialized VGPRs (v0) are read in this kernel. 
+ +; CHECK-LABEL: amdhsa.kernels: +; CHECK-LABEL: kernel_x +; CHECK: .vgpr_count: 1 +define amdgpu_kernel void @kernel_x(ptr addrspace(8) %rsrc) #0 { +entry: + %id = call i32 @llvm.amdgcn.workitem.id.x() + call void @llvm.amdgcn.raw.ptr.buffer.store.i32(i32 %id, ptr addrspace(8) %rsrc, i32 0, i32 0, i32 0) + ret void +} + +; CHECK-LABEL: kernel_z +; PACKED: .vgpr_count: 1 +; NOTPACKED: .vgpr_count: 3 +define amdgpu_kernel void @kernel_z(ptr addrspace(8) %rsrc) { +entry: + %id = call i32 @llvm.amdgcn.workitem.id.z() + call void @llvm.amdgcn.raw.ptr.buffer.store.i32(i32 %id, ptr addrspace(8) %rsrc, i32 0, i32 0, i32 0) + ret void +} + +attributes #0 = { "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" } diff --git a/llvm/test/CodeGen/AMDGPU/vgpr-count-graphics-chain.ll b/llvm/test/CodeGen/AMDGPU/vgpr-count-graphics-chain.ll new file mode 100644 index 0000000000000..9b8bd079958df --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/vgpr-count-graphics-chain.ll @@ -0,0 +1,27 @@ +; RUN: llc -mcpu=gfx1200 -o - < %s | FileCheck %s +; Check that reads of a VGPR in kernels count towards VGPR count, but in functions, only writes of VGPRs count towards VGPR count. +target triple = "amdgcn--amdpal" + +@global = addrspace(1) global i32 poison, align 4 + +; CHECK-LABEL: amdpal.pipelines: + +; Shouldn't report the part of %vgpr_args that's not used +; CHECK-LABEL: entry_point_symbol: cs_calling_chain +; CHECK: .vgpr_count: 0xa +define amdgpu_cs void @cs_calling_chain(i32 %vgpr, i32 inreg %sgpr) { + %vgpr_args = insertvalue {i32, i32, i32, i32} poison, i32 %vgpr, 1 + call void (ptr, i32, i32, {i32, i32, i32, i32}, i32, ...) @llvm.amdgcn.cs.chain.p0.i32.i32.s( + ptr @chain_func, i32 0, i32 inreg %sgpr, {i32, i32, i32, i32} %vgpr_args, i32 0) + unreachable +} + +; Neither uses nor writes a VGPR +; CHECK-LABEL: chain_func: +; CHECK: .vgpr_count: 0x1 +define amdgpu_cs_chain void @chain_func([32 x i32] %args) { +entry: + call void (ptr, i32, {}, [32 x i32], i32, ...) 
@llvm.amdgcn.cs.chain.p0.i32.s.a( + ptr @chain_func, i32 0, {} inreg {}, [32 x i32] %args, i32 0) + unreachable +} diff --git a/llvm/test/CodeGen/AMDGPU/vgpr-count-graphics.ll b/llvm/test/CodeGen/AMDGPU/vgpr-count-graphics.ll new file mode 100644 index 0000000000000..04c3005541fe1 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/vgpr-count-graphics.ll @@ -0,0 +1,25 @@ +; RUN: llc -mcpu=gfx1200 -o - < %s | FileCheck %s +; Check that reads of a VGPR in kernels count towards VGPR count, but in functions, only writes of VGPRs count towards VGPR count. +target triple = "amdgcn--amdpal" + +@global = addrspace(1) global i32 poison, align 4 + +; CHECK-LABEL: amdpal.pipelines: + +; Neither uses nor writes a VGPR, but the hardware initializes the VGPRs that the kernel receives, so they count as used. +; CHECK-LABEL: .entry_point_symbol: kernel_use +; CHECK: .vgpr_count: 0x20 +define amdgpu_cs void @kernel_use([32 x i32] %args) { +entry: + %a = extractvalue [32 x i32] %args, 14 + store i32 %a, ptr addrspace(1) @global + ret void +} + +; Neither uses nor writes a VGPR +; CHECK-LABEL: gfx_func: +; CHECK: .vgpr_count: 0x20 +define amdgpu_gfx [32 x i32] @gfx_func([32 x i32] %args) { +entry: + ret [32 x i32] %args +} _______________________________________________ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits