Author: theRonShark Date: 2026-05-11T20:05:41-04:00 New Revision: 896e337bfc2b4029ca46cee456769eff7bc40ef0
URL: https://github.com/llvm/llvm-project/commit/896e337bfc2b4029ca46cee456769eff7bc40ef0 DIFF: https://github.com/llvm/llvm-project/commit/896e337bfc2b4029ca46cee456769eff7bc40ef0.diff LOG: Revert "[AMDGPU] Account for inline asm size in inst_pref_size calculation (#…" This reverts commit 7ddee0b619f658cef905a69427ef9531fd1d229d. Added: Modified: llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp llvm/lib/Target/AMDGPU/GCNSubtarget.h llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCExpr.cpp llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCExpr.h llvm/lib/Target/AMDGPU/SIProgramInfo.cpp llvm/lib/Target/AMDGPU/SIProgramInfo.h llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h llvm/test/CodeGen/AMDGPU/inst-prefetch-hint.ll Removed: llvm/test/CodeGen/AMDGPU/inst-prefetch-inline-asm.ll ################################################################################ diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp index 3a2738d9fc498..ad61d8d084c7b 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp @@ -234,18 +234,6 @@ void AMDGPUAsmPrinter::emitFunctionBodyStart() { HSAMetadataStream->emitKernel(*MF, CurrentProgramInfo); } -/// Set bits in a kernel descriptor MCExpr field: -/// return ((Dst & ~Mask) | (Value << Shift)) -static const MCExpr *setBits(const MCExpr *Dst, const MCExpr *Value, - uint32_t Mask, uint32_t Shift, MCContext &Ctx) { - const auto *Shft = MCConstantExpr::create(Shift, Ctx); - const auto *Msk = MCConstantExpr::create(Mask, Ctx); - Dst = MCBinaryExpr::createAnd(Dst, MCUnaryExpr::createNot(Msk, Ctx), Ctx); - Dst = MCBinaryExpr::createOr(Dst, MCBinaryExpr::createShl(Value, Shft, Ctx), - Ctx); - return Dst; -} - void AMDGPUAsmPrinter::endFunction(const MachineFunction *MF) { const SIMachineFunctionInfo &MFI = *MF->getInfo<SIMachineFunctionInfo>(); if (!MFI.isEntryFunction()) @@ -253,29 +241,6 @@ void 
AMDGPUAsmPrinter::endFunction(const MachineFunction *MF) { assert(TM.getTargetTriple().getOS() == Triple::AMDHSA); - const GCNSubtarget &STM = MF->getSubtarget<GCNSubtarget>(); - MCContext &Ctx = MF->getContext(); - - AMDGPU::MCKernelDescriptor KD = - getAmdhsaKernelDescriptor(*MF, CurrentProgramInfo); - - // Compute inst_pref_size using MCExpr label subtraction for exact code - // size. At this point .Lfunc_end has been emitted (by the base AsmPrinter) - // right after the function code, so (Lfunc_end - func_sym) gives the - // exact function code size in bytes. - if (STM.hasInstPrefSize()) { - const MCExpr *CodeSizeExpr = MCBinaryExpr::createSub( - MCSymbolRefExpr::create(getFunctionEnd(), OutContext), - MCSymbolRefExpr::create(CurrentFnSym, OutContext), OutContext); - - uint32_t Mask, Shift, Width, CacheLineSize; - STM.getInstPrefSizeArgs(Mask, Shift, Width, CacheLineSize); - const MCExpr *InstPrefSize = - AMDGPUMCExpr::createInstPrefSize(CodeSizeExpr, Ctx); - KD.compute_pgm_rsrc3 = - setBits(KD.compute_pgm_rsrc3, InstPrefSize, Mask, Shift, Ctx); - } - auto &Streamer = getTargetStreamer()->getStreamer(); auto &Context = Streamer.getContext(); auto &ObjectFileInfo = *Context.getObjectFileInfo(); @@ -289,10 +254,13 @@ void AMDGPUAsmPrinter::endFunction(const MachineFunction *MF) { Streamer.emitValueToAlignment(Align(64), 0, 1, 0); ReadOnlySection.ensureMinAlignment(Align(64)); + const GCNSubtarget &STM = MF->getSubtarget<GCNSubtarget>(); + SmallString<128> KernelName; getNameWithPrefix(KernelName, &MF->getFunction()); getTargetStreamer()->EmitAmdhsaKernelDescriptor( - STM, KernelName, KD, CurrentProgramInfo.NumVGPRsForWavesPerEU, + STM, KernelName, getAmdhsaKernelDescriptor(*MF, CurrentProgramInfo), + CurrentProgramInfo.NumVGPRsForWavesPerEU, MCBinaryExpr::createSub( CurrentProgramInfo.NumSGPRsForWavesPerEU, AMDGPUMCExpr::createExtraSGPRs( @@ -1470,22 +1438,33 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo, ProgInfo.LdsSize = STM.isAmdHsaOS() ? 
0 : ProgInfo.LDSBlocks; ProgInfo.EXCPEnable = 0; + // return ((Dst & ~Mask) | (Value << Shift)) + auto SetBits = [&Ctx](const MCExpr *Dst, const MCExpr *Value, uint32_t Mask, + uint32_t Shift) { + const auto *Shft = MCConstantExpr::create(Shift, Ctx); + const auto *Msk = MCConstantExpr::create(Mask, Ctx); + Dst = MCBinaryExpr::createAnd(Dst, MCUnaryExpr::createNot(Msk, Ctx), Ctx); + Dst = MCBinaryExpr::createOr(Dst, MCBinaryExpr::createShl(Value, Shft, Ctx), + Ctx); + return Dst; + }; + if (STM.hasGFX90AInsts()) { ProgInfo.ComputePGMRSrc3 = - setBits(ProgInfo.ComputePGMRSrc3, ProgInfo.AccumOffset, + SetBits(ProgInfo.ComputePGMRSrc3, ProgInfo.AccumOffset, amdhsa::COMPUTE_PGM_RSRC3_GFX90A_ACCUM_OFFSET, - amdhsa::COMPUTE_PGM_RSRC3_GFX90A_ACCUM_OFFSET_SHIFT, Ctx); + amdhsa::COMPUTE_PGM_RSRC3_GFX90A_ACCUM_OFFSET_SHIFT); ProgInfo.ComputePGMRSrc3 = - setBits(ProgInfo.ComputePGMRSrc3, CreateExpr(ProgInfo.TgSplit), + SetBits(ProgInfo.ComputePGMRSrc3, CreateExpr(ProgInfo.TgSplit), amdhsa::COMPUTE_PGM_RSRC3_GFX90A_TG_SPLIT, - amdhsa::COMPUTE_PGM_RSRC3_GFX90A_TG_SPLIT_SHIFT, Ctx); + amdhsa::COMPUTE_PGM_RSRC3_GFX90A_TG_SPLIT_SHIFT); } if (STM.hasGFX1250Insts()) ProgInfo.ComputePGMRSrc3 = - setBits(ProgInfo.ComputePGMRSrc3, ProgInfo.NamedBarCnt, + SetBits(ProgInfo.ComputePGMRSrc3, ProgInfo.NamedBarCnt, amdhsa::COMPUTE_PGM_RSRC3_GFX125_NAMED_BAR_CNT, - amdhsa::COMPUTE_PGM_RSRC3_GFX125_NAMED_BAR_CNT_SHIFT, Ctx); + amdhsa::COMPUTE_PGM_RSRC3_GFX125_NAMED_BAR_CNT_SHIFT); ProgInfo.Occupancy = createOccupancy( STM.computeOccupancy(F, ProgInfo.LDSSize).second, @@ -1504,6 +1483,26 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo, ", final occupancy is " + Twine(Occupancy)); F.getContext().diagnose(Diag); } + + if (isGFX11Plus(STM)) { + uint32_t CodeSizeInBytes = (uint32_t)std::min( + ProgInfo.getFunctionCodeSize(MF, true /* IsLowerBound */), + (uint64_t)std::numeric_limits<uint32_t>::max()); + uint32_t CodeSizeInLines = divideCeil(CodeSizeInBytes, 128); + uint32_t 
Field, Shift, Width; + if (isGFX11(STM)) { + Field = amdhsa::COMPUTE_PGM_RSRC3_GFX11_INST_PREF_SIZE; + Shift = amdhsa::COMPUTE_PGM_RSRC3_GFX11_INST_PREF_SIZE_SHIFT; + Width = amdhsa::COMPUTE_PGM_RSRC3_GFX11_INST_PREF_SIZE_WIDTH; + } else { + Field = amdhsa::COMPUTE_PGM_RSRC3_GFX12_PLUS_INST_PREF_SIZE; + Shift = amdhsa::COMPUTE_PGM_RSRC3_GFX12_PLUS_INST_PREF_SIZE_SHIFT; + Width = amdhsa::COMPUTE_PGM_RSRC3_GFX12_PLUS_INST_PREF_SIZE_WIDTH; + } + uint64_t InstPrefSize = std::min(CodeSizeInLines, (1u << Width) - 1); + ProgInfo.ComputePGMRSrc3 = SetBits(ProgInfo.ComputePGMRSrc3, + CreateExpr(InstPrefSize), Field, Shift); + } } static unsigned getRsrcReg(CallingConv::ID CallConv) { diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h index 758e9b445d6dd..5f580ac0577d5 100644 --- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h +++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h @@ -21,7 +21,6 @@ #include "SIISelLowering.h" #include "SIInstrInfo.h" #include "Utils/AMDGPUBaseInfo.h" -#include "llvm/Support/AMDHSAKernelDescriptor.h" #include "llvm/Support/ErrorHandling.h" #define GET_SUBTARGETINFO_HEADER @@ -426,23 +425,6 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo, bool hasPrefetch() const { return HasGFX12Insts; } - bool hasInstPrefSize() const { return isGFX11Plus(); } - - void getInstPrefSizeArgs(uint32_t &Mask, uint32_t &Shift, uint32_t &Width, - uint32_t &CacheLineSize) const { - assert(isGFX11Plus()); - CacheLineSize = getInstCacheLineSize(); - if (getGeneration() == GFX11) { - Mask = amdhsa::COMPUTE_PGM_RSRC3_GFX11_INST_PREF_SIZE; - Shift = amdhsa::COMPUTE_PGM_RSRC3_GFX11_INST_PREF_SIZE_SHIFT; - Width = amdhsa::COMPUTE_PGM_RSRC3_GFX11_INST_PREF_SIZE_WIDTH; - } else { - Mask = amdhsa::COMPUTE_PGM_RSRC3_GFX12_PLUS_INST_PREF_SIZE; - Shift = amdhsa::COMPUTE_PGM_RSRC3_GFX12_PLUS_INST_PREF_SIZE_SHIFT; - Width = amdhsa::COMPUTE_PGM_RSRC3_GFX12_PLUS_INST_PREF_SIZE_WIDTH; - } - } - // Has s_cmpk_* instructions. 
bool hasSCmpK() const { return getGeneration() < GFX12; } diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCExpr.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCExpr.cpp index 4563803ad6577..fd0a2a6a77d7e 100644 --- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCExpr.cpp +++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCExpr.cpp @@ -12,12 +12,9 @@ #include "llvm/MC/MCAssembler.h" #include "llvm/MC/MCContext.h" #include "llvm/MC/MCStreamer.h" -#include "llvm/MC/MCSubtargetInfo.h" #include "llvm/MC/MCSymbol.h" #include "llvm/MC/MCValue.h" -#include "llvm/Support/AMDHSAKernelDescriptor.h" #include "llvm/Support/KnownBits.h" -#include "llvm/Support/MathExtras.h" #include "llvm/Support/raw_ostream.h" #include <functional> #include <optional> @@ -77,9 +74,6 @@ void AMDGPUMCExpr::printImpl(raw_ostream &OS, const MCAsmInfo *MAI) const { case AGVK_Occupancy: OS << "occupancy("; break; - case AGVK_InstPrefSize: - OS << "instprefsize("; - break; case AGVK_Lit: OS << "lit("; break; @@ -188,27 +182,6 @@ bool AMDGPUMCExpr::evaluateOccupancy(MCValue &Res, return true; } -/// Get the inst_pref_size field width for the given subtarget. 
-static unsigned getInstPrefSizeFieldWidth(const MCSubtargetInfo &STI) { - if (AMDGPU::isGFX12Plus(STI)) - return amdhsa::COMPUTE_PGM_RSRC3_GFX12_PLUS_INST_PREF_SIZE_WIDTH; - return amdhsa::COMPUTE_PGM_RSRC3_GFX11_INST_PREF_SIZE_WIDTH; -} - -bool AMDGPUMCExpr::evaluateInstPrefSize(MCValue &Res, - const MCAssembler *Asm) const { - uint64_t CodeSizeInBytes = 0; - if (!evaluateMCExprs(Args, Asm, {CodeSizeInBytes})) - return false; - const MCSubtargetInfo *STI = Ctx.getSubtargetInfo(); - unsigned FieldWidth = getInstPrefSizeFieldWidth(*STI); - unsigned CacheLineSize = AMDGPU::IsaInfo::getInstCacheLineSize(STI); - uint64_t CodeSizeInLines = divideCeil(CodeSizeInBytes, CacheLineSize); - uint64_t MaxVal = (1u << FieldWidth) - 1; - Res = MCValue::get(std::min(CodeSizeInLines, MaxVal)); - return true; -} - bool AMDGPUMCExpr::isSymbolUsedInExpression(const MCSymbol *Sym, const MCExpr *E) { switch (E->getKind()) { @@ -254,8 +227,6 @@ bool AMDGPUMCExpr::evaluateAsRelocatableImpl(MCValue &Res, return evaluateTotalNumVGPR(Res, Asm); case AGVK_Occupancy: return evaluateOccupancy(Res, Asm); - case AGVK_InstPrefSize: - return evaluateInstPrefSize(Res, Asm); case AGVK_Lit: case AGVK_Lit64: return Args[0]->evaluateAsRelocatable(Res, Asm); @@ -308,11 +279,6 @@ const AMDGPUMCExpr *AMDGPUMCExpr::createTotalNumVGPR(const MCExpr *NumAGPR, return create(AGVK_TotalNumVGPRs, {NumAGPR, NumVGPR}, Ctx); } -const AMDGPUMCExpr * -AMDGPUMCExpr::createInstPrefSize(const MCExpr *CodeSizeBytes, MCContext &Ctx) { - return create(AGVK_InstPrefSize, {CodeSizeBytes}, Ctx); -} - const AMDGPUMCExpr *AMDGPUMCExpr::createLit(LitModifier Lit, int64_t Value, MCContext &Ctx) { assert(Lit == LitModifier::Lit || Lit == LitModifier::Lit64); @@ -503,7 +469,6 @@ static void targetOpKnownBitsMapHelper(const MCExpr *Expr, KnownBitsMap &KBM, case AMDGPUMCExpr::VariantKind::AGVK_TotalNumVGPRs: case AMDGPUMCExpr::VariantKind::AGVK_AlignTo: case AMDGPUMCExpr::VariantKind::AGVK_Occupancy: - case 
AMDGPUMCExpr::VariantKind::AGVK_InstPrefSize: case AMDGPUMCExpr::VariantKind::AGVK_Lit: case AMDGPUMCExpr::VariantKind::AGVK_Lit64: { int64_t Val; @@ -512,16 +477,6 @@ static void targetOpKnownBitsMapHelper(const MCExpr *Expr, KnownBitsMap &KBM, KBM[Expr] = KnownBits::makeConstant(APValue); return; } - if (AGVK->getKind() == AMDGPUMCExpr::VariantKind::AGVK_InstPrefSize) { - // The result is clamped to (1 << FieldWidth) - 1, so upper bits are - // known zero. FieldWidth is derived from the subtarget. - const MCSubtargetInfo *STI = AGVK->getCtx().getSubtargetInfo(); - unsigned FieldWidth = getInstPrefSizeFieldWidth(*STI); - KnownBits KB(BitWidth); - KB.Zero.setBitsFrom(FieldWidth); - KBM[Expr] = KB; - return; - } KBM[Expr] = KnownBits(BitWidth); return; } diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCExpr.h b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCExpr.h index 4b1aa0c591a80..96bd8f4cf3c13 100644 --- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCExpr.h +++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCExpr.h @@ -38,7 +38,6 @@ class AMDGPUMCExpr : public MCTargetExpr { AGVK_TotalNumVGPRs, AGVK_AlignTo, AGVK_Occupancy, - AGVK_InstPrefSize, AGVK_Lit, AGVK_Lit64, }; @@ -70,7 +69,6 @@ class AMDGPUMCExpr : public MCTargetExpr { bool evaluateTotalNumVGPR(MCValue &Res, const MCAssembler *Asm) const; bool evaluateAlignTo(MCValue &Res, const MCAssembler *Asm) const; bool evaluateOccupancy(MCValue &Res, const MCAssembler *Asm) const; - bool evaluateInstPrefSize(MCValue &Res, const MCAssembler *Asm) const; public: static const AMDGPUMCExpr * @@ -99,18 +97,11 @@ class AMDGPUMCExpr : public MCTargetExpr { return create(VariantKind::AGVK_AlignTo, {Value, Align}, Ctx); } - /// Create an expression for instruction prefetch size computation: - /// min(divideCeil(CodeSizeBytes, CacheLineSize), (1 << FieldWidth) - 1) - /// FieldWidth and CacheLineSize are derived from the subtarget. 
- static const AMDGPUMCExpr *createInstPrefSize(const MCExpr *CodeSizeBytes, - MCContext &Ctx); - static const AMDGPUMCExpr *createLit(LitModifier Lit, int64_t Value, MCContext &Ctx); ArrayRef<const MCExpr *> getArgs() const { return Args; } VariantKind getKind() const { return Kind; } - MCContext &getCtx() const { return Ctx; } const MCExpr *getSubExpr(size_t Index) const; void printImpl(raw_ostream &OS, const MCAsmInfo *MAI) const override; diff --git a/llvm/lib/Target/AMDGPU/SIProgramInfo.cpp b/llvm/lib/Target/AMDGPU/SIProgramInfo.cpp index 27cef7a1b9158..99255e4060886 100644 --- a/llvm/lib/Target/AMDGPU/SIProgramInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIProgramInfo.cpp @@ -215,8 +215,9 @@ const MCExpr *SIProgramInfo::getPGMRSrc2(CallingConv::ID CC, return MCConstantExpr::create(0, Ctx); } -uint64_t SIProgramInfo::getFunctionCodeSize(const MachineFunction &MF) { - if (CodeSizeInBytes.has_value()) +uint64_t SIProgramInfo::getFunctionCodeSize(const MachineFunction &MF, + bool IsLowerBound) { + if (!IsLowerBound && CodeSizeInBytes.has_value()) return *CodeSizeInBytes; const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>(); @@ -225,7 +226,12 @@ uint64_t SIProgramInfo::getFunctionCodeSize(const MachineFunction &MF) { uint64_t CodeSize = 0; for (const MachineBasicBlock &MBB : MF) { - CodeSize = alignTo(CodeSize, MBB.getAlignment()); + // The amount of padding to align code can be both underestimated and + // overestimated. In case of inline asm used getInstSizeInBytes() will + // return a maximum size of a single instruction, where the real size may + // differ. At this point CodeSize may be already off. + if (!IsLowerBound) + CodeSize = alignTo(CodeSize, MBB.getAlignment()); for (const MachineInstr &MI : MBB) { // TODO: CodeSize should account for multiple functions. @@ -233,6 +239,11 @@ uint64_t SIProgramInfo::getFunctionCodeSize(const MachineFunction &MF) { if (MI.isMetaInstruction()) continue; + // We cannot properly estimate inline asm size. 
It can be as small as zero + // if that is just a comment. + if (IsLowerBound && MI.isInlineAsm()) + continue; + CodeSize += TII->getInstSizeInBytes(MI); } } diff --git a/llvm/lib/Target/AMDGPU/SIProgramInfo.h b/llvm/lib/Target/AMDGPU/SIProgramInfo.h index fb56ebf88c96f..947b473142a1f 100644 --- a/llvm/lib/Target/AMDGPU/SIProgramInfo.h +++ b/llvm/lib/Target/AMDGPU/SIProgramInfo.h @@ -105,7 +105,10 @@ struct LLVM_EXTERNAL_VISIBILITY SIProgramInfo { void reset(const MachineFunction &MF); // Get function code size and cache the value. - uint64_t getFunctionCodeSize(const MachineFunction &MF); + // If \p IsLowerBound is set it returns a minimal code size which is safe + // to address. + uint64_t getFunctionCodeSize(const MachineFunction &MF, + bool IsLowerBound = false); /// Compute the value of the ComputePGMRsrc1 register. const MCExpr *getComputePGMRSrc1(const GCNSubtarget &ST, diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp index dd67e77d0d9ed..b13aed2432602 100644 --- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp +++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp @@ -1182,14 +1182,6 @@ std::string AMDGPUTargetID::toString() const { return Str; } -unsigned getInstCacheLineSize(const MCSubtargetInfo *STI) { - if (STI->getFeatureBits().test(FeatureInstCacheLineSize128)) - return 128; - if (STI->getFeatureBits().test(FeatureInstCacheLineSize64)) - return 64; - return 64; -} - unsigned getWavefrontSize(const MCSubtargetInfo *STI) { if (STI->getFeatureBits().test(FeatureWavefrontSize16)) return 16; diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h index e1b36f0996331..49373f09ee460 100644 --- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h +++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h @@ -233,9 +233,6 @@ inline raw_ostream &operator<<(raw_ostream &OS, return OS; } -/// \returns Instruction cache line size in bytes for given subtarget \p 
STI. -unsigned getInstCacheLineSize(const MCSubtargetInfo *STI); - /// \returns Wavefront size for given subtarget \p STI. unsigned getWavefrontSize(const MCSubtargetInfo *STI); diff --git a/llvm/test/CodeGen/AMDGPU/inst-prefetch-hint.ll b/llvm/test/CodeGen/AMDGPU/inst-prefetch-hint.ll index b76ef7eac11c4..580167076e1f0 100644 --- a/llvm/test/CodeGen/AMDGPU/inst-prefetch-hint.ll +++ b/llvm/test/CodeGen/AMDGPU/inst-prefetch-hint.ll @@ -1,31 +1,11 @@ ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 --amdgpu-memcpy-loop-unroll=100000 < %s | FileCheck --check-prefixes=GCN,GFX11 %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 --amdgpu-memcpy-loop-unroll=100000 < %s | FileCheck --check-prefixes=GCN,GFX12 %s -;; Verify that inst_pref_size resolves to the correct value in the object file. -;; COMPUTE_PGM_RSRC3 is at offset 0x2C in each 64-byte kernel descriptor. -;; inst_pref_size is bits [9:4] on GFX11 (6-bit) and bits [11:4] on GFX12+ (8-bit). -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 --amdgpu-memcpy-loop-unroll=100000 -filetype=obj < %s -o %t.gfx11.o -; RUN: llvm-objdump -s -j .rodata %t.gfx11.o | FileCheck --check-prefix=OBJ-GFX11 %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 --amdgpu-memcpy-loop-unroll=100000 -filetype=obj < %s -o %t.gfx12.o -; RUN: llvm-objdump -s -j .rodata %t.gfx12.o | FileCheck --check-prefix=OBJ-GFX12 %s - -; The inst_pref_size is computed via MCExpr label subtraction, resolved at -; assembly/link time. 
In text output it appears as: -; ((instprefsize(<code_size>)<<Shift)&Mask)>>Shift -; where: -; <code_size> = .Lfunc_endN - func_sym (exact function code size in bytes) -; instprefsize = min(divideCeil(code_size, cache_line_size), (1 << field_width) - 1) -; field_width and cache_line_size are derived from the subtarget - ; GCN-LABEL: .amdhsa_kernel large -; GFX11: .amdhsa_inst_pref_size ((instprefsize(.Lfunc_end0-large)<<4)&1008)>>4 -; GFX11: codeLenInByte = {{[0-9]+}} -; GFX12: .amdhsa_inst_pref_size ((instprefsize(.Lfunc_end0-large)<<4)&4080)>>4 -; GFX12: codeLenInByte = {{[0-9]+}} -;; Object: kernel descriptor at 0x00, COMPUTE_PGM_RSRC3 at 0x2C: -;; gfx11 pref=3 (0x30), gfx12 pref=4 (0x40) -; OBJ-GFX11: 0020 {{.*}}30000000 -; OBJ-GFX12: 0020 {{.*}}40000000 +; GFX11: .amdhsa_inst_pref_size 3 +; GFX11: codeLenInByte = 3{{[0-9][0-9]$}} +; GFX12: .amdhsa_inst_pref_size 4 +; GFX12: codeLenInByte = 4{{[0-9][0-9]$}} define amdgpu_kernel void @large(ptr addrspace(1) %out, ptr addrspace(1) %in) { bb: call void @llvm.memcpy.p1.p3.i32(ptr addrspace(1) %out, ptr addrspace(1) %in, i32 256, i1 false) @@ -33,30 +13,18 @@ bb: } ; GCN-LABEL: .amdhsa_kernel small -; GFX11: .amdhsa_inst_pref_size ((instprefsize(.Lfunc_end1-small)<<4)&1008)>>4 -; GFX12: .amdhsa_inst_pref_size ((instprefsize(.Lfunc_end1-small)<<4)&4080)>>4 -; GCN: codeLenInByte = {{[0-9]+}} -;; Object: kernel descriptor at 0x40, COMPUTE_PGM_RSRC3 at 0x6C: -;; pref=1 (0x10) for both -; OBJ-GFX11: 0060 {{.*}}10000000 -; OBJ-GFX12: 0060 {{.*}}10000000 +; GCN: .amdhsa_inst_pref_size 1 +; GCN: codeLenInByte = {{[0-9]$}} define amdgpu_kernel void @small() { bb: ret void } -; Inline asm is accounted for via MCExpr label subtraction (exact code size). -; The MCExpr resolves to the correct inst_pref_size at assembly time. 
+; Ignore inline asm in size calculation ; GCN-LABEL: .amdhsa_kernel inline_asm -; GFX11: .amdhsa_inst_pref_size ((instprefsize(.Lfunc_end2-inline_asm)<<4)&1008)>>4 -; GFX12: .amdhsa_inst_pref_size ((instprefsize(.Lfunc_end2-inline_asm)<<4)&4080)>>4 -; GCN: codeLenInByte = {{[0-9]+}} -;; Object: kernel descriptor at 0x80, COMPUTE_PGM_RSRC3 at 0xAC: -;; pref=9 (0x90) for both -;; (.fill 256, 4, 0 = 1024 bytes + 4 s_endpgm = 1028 -> divideCeil(1028,128) = 9) -; OBJ-GFX11: 00a0 {{.*}}90000000 -; OBJ-GFX12: 00a0 {{.*}}90000000 +; GCN: .amdhsa_inst_pref_size 1 +; GCN: codeLenInByte = {{[0-9]$}} define amdgpu_kernel void @inline_asm() { bb: call void asm sideeffect ".fill 256, 4, 0", ""() diff --git a/llvm/test/CodeGen/AMDGPU/inst-prefetch-inline-asm.ll b/llvm/test/CodeGen/AMDGPU/inst-prefetch-inline-asm.ll deleted file mode 100644 index 287a30032230b..0000000000000 --- a/llvm/test/CodeGen/AMDGPU/inst-prefetch-inline-asm.ll +++ /dev/null @@ -1,154 +0,0 @@ -;; Verify that inline assembly is correctly accounted for in the -;; inst_pref_size calculation. The inst_pref_size is computed via MCExpr -;; label subtraction (.Lfunc_end - func_sym), giving exact code size. -;; See inst-prefetch-hint.ll for explanation of the instprefsize expression. 
- -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 < %s | FileCheck --check-prefix=GFX11 %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 < %s | FileCheck --check-prefix=GFX12 %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -filetype=obj < %s -o %t.gfx11.o -; RUN: llvm-objdump -s -j .rodata %t.gfx11.o | FileCheck --check-prefix=OBJ-GFX11 %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 -filetype=obj < %s -o %t.gfx12.o -; RUN: llvm-objdump -s -j .rodata %t.gfx12.o | FileCheck --check-prefix=OBJ-GFX12 %s - -;; --- .fill directive: .fill 256, 4, 0 => 1024 bytes + 4 (s_endpgm) = 1028 --- -;; pref_size = divideCeil(1028, 128) = 9 - -; GFX11-LABEL: .amdhsa_kernel test_fill -; GFX11: .amdhsa_inst_pref_size ((instprefsize(.Lfunc_end0-test_fill)<<4)&1008)>>4 -; GFX12-LABEL: .amdhsa_kernel test_fill -; GFX12: .amdhsa_inst_pref_size ((instprefsize(.Lfunc_end0-test_fill)<<4)&4080)>>4 -;; Object: kernel descriptor at 0x00, COMPUTE_PGM_RSRC3 at 0x2C: -;; pref_size=9 -> 9<<4 = 0x90 -; OBJ-GFX11: 0020 {{.*}}90000000 -; OBJ-GFX12: 0020 {{.*}}90000000 - -define amdgpu_kernel void @test_fill() { - call void asm sideeffect ".fill 256, 4, 0", ""() - ret void -} - -;; --- .space directive: .space 1024 => 1024 bytes + 4 = 1028 --- -;; pref_size = 9 - -; GFX11-LABEL: .amdhsa_kernel test_space -; GFX11: .amdhsa_inst_pref_size ((instprefsize(.Lfunc_end1-test_space)<<4)&1008)>>4 -; GFX12-LABEL: .amdhsa_kernel test_space -; GFX12: .amdhsa_inst_pref_size ((instprefsize(.Lfunc_end1-test_space)<<4)&4080)>>4 -;; Object: kernel descriptor at 0x40, COMPUTE_PGM_RSRC3 at 0x6C: -;; pref_size=9 -> 9<<4 = 0x90 -; OBJ-GFX11: 0060 {{.*}}90000000 -; OBJ-GFX12: 0060 {{.*}}90000000 - -define amdgpu_kernel void @test_space() { - call void asm sideeffect ".space 1024", ""() - ret void -} - -;; --- Instructions: 32 x s_nop (4 bytes each) = 128 + 4 = 132 --- -;; pref_size = divideCeil(132, 128) = 2 - -; GFX11-LABEL: .amdhsa_kernel test_instructions -; GFX11: .amdhsa_inst_pref_size 
((instprefsize(.Lfunc_end2-test_instructions)<<4)&1008)>>4 -; GFX12-LABEL: .amdhsa_kernel test_instructions -; GFX12: .amdhsa_inst_pref_size ((instprefsize(.Lfunc_end2-test_instructions)<<4)&4080)>>4 -;; Object: kernel descriptor at 0x80, COMPUTE_PGM_RSRC3 at 0xAC: -;; pref_size=2 -> 2<<4 = 0x20 -; OBJ-GFX11: 00a0 {{.*}}20000000 -; OBJ-GFX12: 00a0 {{.*}}20000000 - -define amdgpu_kernel void @test_instructions() { - call void asm sideeffect "s_nop 0\0As_nop 0\0As_nop 0\0As_nop 0\0As_nop 0\0As_nop 0\0As_nop 0\0As_nop 0\0As_nop 0\0As_nop 0\0As_nop 0\0As_nop 0\0As_nop 0\0As_nop 0\0As_nop 0\0As_nop 0\0As_nop 0\0As_nop 0\0As_nop 0\0As_nop 0\0As_nop 0\0As_nop 0\0As_nop 0\0As_nop 0\0As_nop 0\0As_nop 0\0As_nop 0\0As_nop 0\0As_nop 0\0As_nop 0\0As_nop 0\0As_nop 0", ""() - ret void -} - -;; --- Comments emit no bytes: only s_endpgm = 4 bytes --- -;; pref_size = 1 - -; GFX11-LABEL: .amdhsa_kernel test_comments -; GFX11: .amdhsa_inst_pref_size ((instprefsize(.Lfunc_end3-test_comments)<<4)&1008)>>4 -; GFX12-LABEL: .amdhsa_kernel test_comments -; GFX12: .amdhsa_inst_pref_size ((instprefsize(.Lfunc_end3-test_comments)<<4)&4080)>>4 -;; Object: kernel descriptor at 0xC0, COMPUTE_PGM_RSRC3 at 0xEC: -;; pref_size=1 -> 1<<4 = 0x10 -; OBJ-GFX11: 00e0 {{.*}}10000000 -; OBJ-GFX12: 00e0 {{.*}}10000000 - -define amdgpu_kernel void @test_comments() { - call void asm sideeffect "; comment 1\0A; comment 2\0A; comment 3", ""() - ret void -} - -;; --- Empty inline asm: only s_endpgm = 4 bytes --- -;; pref_size = 1 - -; GFX11-LABEL: .amdhsa_kernel test_empty_asm -; GFX11: .amdhsa_inst_pref_size ((instprefsize(.Lfunc_end4-test_empty_asm)<<4)&1008)>>4 -; GFX12-LABEL: .amdhsa_kernel test_empty_asm -; GFX12: .amdhsa_inst_pref_size ((instprefsize(.Lfunc_end4-test_empty_asm)<<4)&4080)>>4 -;; Object: kernel descriptor at 0x100, COMPUTE_PGM_RSRC3 at 0x12C: -;; pref_size=1 -> 1<<4 = 0x10 -; OBJ-GFX11: 0120 {{.*}}10000000 -; OBJ-GFX12: 0120 {{.*}}10000000 - -define amdgpu_kernel void @test_empty_asm() { - 
call void asm sideeffect "", ""() - ret void -} - -;; --- Multiple inline asm blocks: .fill (512) + .space (512) + s_endpgm (4) = 1028 --- -;; pref_size = divideCeil(1028, 128) = 9 - -; GFX11-LABEL: .amdhsa_kernel test_multiple_asm -; GFX11: .amdhsa_inst_pref_size ((instprefsize(.Lfunc_end5-test_multiple_asm)<<4)&1008)>>4 -; GFX12-LABEL: .amdhsa_kernel test_multiple_asm -; GFX12: .amdhsa_inst_pref_size ((instprefsize(.Lfunc_end5-test_multiple_asm)<<4)&4080)>>4 -;; Object: kernel descriptor at 0x140, COMPUTE_PGM_RSRC3 at 0x16C: -;; pref_size=9 -> 9<<4 = 0x90 -; OBJ-GFX11: 0160 {{.*}}90000000 -; OBJ-GFX12: 0160 {{.*}}90000000 - -define amdgpu_kernel void @test_multiple_asm() { - call void asm sideeffect ".fill 128, 4, 0", ""() - call void asm sideeffect ".space 512", ""() - ret void -} - -;; --- Large function that exceeds GFX11 6-bit field max (63) --- -;; .fill 2048, 4, 0 = 8192 bytes + 4 = 8196 bytes -;; divideCeil(8196, 128) = 65, but GFX11 max = (1<<6)-1 = 63 -;; pref_size should clamp to 63 - -; GFX11-LABEL: .amdhsa_kernel test_clamping -; GFX11: .amdhsa_inst_pref_size ((instprefsize(.Lfunc_end6-test_clamping)<<4)&1008)>>4 -; GFX12-LABEL: .amdhsa_kernel test_clamping -; GFX12: .amdhsa_inst_pref_size ((instprefsize(.Lfunc_end6-test_clamping)<<4)&4080)>>4 -;; Object: kernel descriptor at 0x180, COMPUTE_PGM_RSRC3 at 0x1AC: -;; gfx11: clamped to 63 -> 63<<4 = 0x3F0 -;; gfx12: no clamping, 65 -> 65<<4 = 0x410 -; OBJ-GFX11: 01a0 {{.*}}f0030000 -; OBJ-GFX12: 01a0 {{.*}}10040000 - -define amdgpu_kernel void @test_clamping() { - call void asm sideeffect ".fill 2048, 4, 0", ""() - ret void -} - -;; --- Large function that exceeds both GFX11 and GFX12 field max --- -;; .fill 8192, 4, 0 = 32768 bytes + 4 = 32772 bytes -;; divideCeil(32772, 128) = 257 -;; GFX11 max = 63, GFX12 max = 255 -> both clamp - -; GFX11-LABEL: .amdhsa_kernel test_clamping_both -; GFX11: .amdhsa_inst_pref_size ((instprefsize(.Lfunc_end7-test_clamping_both)<<4)&1008)>>4 -; GFX12-LABEL: .amdhsa_kernel 
test_clamping_both -; GFX12: .amdhsa_inst_pref_size ((instprefsize(.Lfunc_end7-test_clamping_both)<<4)&4080)>>4 -;; Object: kernel descriptor at 0x1C0, COMPUTE_PGM_RSRC3 at 0x1EC: -;; gfx11: clamped to 63 -> 63<<4 = 0x3F0 -;; gfx12: clamped to 255 -> 255<<4 = 0xFF0 -; OBJ-GFX11: 01e0 {{.*}}f0030000 -; OBJ-GFX12: 01e0 {{.*}}f00f0000 - -define amdgpu_kernel void @test_clamping_both() { - call void asm sideeffect ".fill 8192, 4, 0", ""() - ret void -} _______________________________________________ llvm-branch-commits mailing list [email protected] https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
