[llvm-branch-commits] [llvm] 896e337 - Revert "[AMDGPU] Account for inline asm size in inst_pref_size calculation (#…"

via llvm-branch-commits Wed, 13 May 2026 06:37:53 -0700

Author: theRonShark
Date: 2026-05-11T20:05:41-04:00
New Revision: 896e337bfc2b4029ca46cee456769eff7bc40ef0


URL: 
https://github.com/llvm/llvm-project/commit/896e337bfc2b4029ca46cee456769eff7bc40ef0
DIFF: 
https://github.com/llvm/llvm-project/commit/896e337bfc2b4029ca46cee456769eff7bc40ef0.diff

LOG: Revert "[AMDGPU] Account for inline asm size in inst_pref_size calculation 
(#…"

This reverts commit 7ddee0b619f658cef905a69427ef9531fd1d229d.

Added: 
    

Modified: 
    llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
    llvm/lib/Target/AMDGPU/GCNSubtarget.h
    llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCExpr.cpp
    llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCExpr.h
    llvm/lib/Target/AMDGPU/SIProgramInfo.cpp
    llvm/lib/Target/AMDGPU/SIProgramInfo.h
    llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
    llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
    llvm/test/CodeGen/AMDGPU/inst-prefetch-hint.ll

Removed: 
    llvm/test/CodeGen/AMDGPU/inst-prefetch-inline-asm.ll


################################################################################
diff  --git a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp 
b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
index 3a2738d9fc498..ad61d8d084c7b 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
@@ -234,18 +234,6 @@ void AMDGPUAsmPrinter::emitFunctionBodyStart() {
     HSAMetadataStream->emitKernel(*MF, CurrentProgramInfo);
 }
 
-/// Set bits in a kernel descriptor MCExpr field:
-///   return ((Dst & ~Mask) | (Value << Shift))
-static const MCExpr *setBits(const MCExpr *Dst, const MCExpr *Value,
-                             uint32_t Mask, uint32_t Shift, MCContext &Ctx) {
-  const auto *Shft = MCConstantExpr::create(Shift, Ctx);
-  const auto *Msk = MCConstantExpr::create(Mask, Ctx);
-  Dst = MCBinaryExpr::createAnd(Dst, MCUnaryExpr::createNot(Msk, Ctx), Ctx);
-  Dst = MCBinaryExpr::createOr(Dst, MCBinaryExpr::createShl(Value, Shft, Ctx),
-                               Ctx);
-  return Dst;
-}
-
 void AMDGPUAsmPrinter::endFunction(const MachineFunction *MF) {
   const SIMachineFunctionInfo &MFI = *MF->getInfo<SIMachineFunctionInfo>();
   if (!MFI.isEntryFunction())
@@ -253,29 +241,6 @@ void AMDGPUAsmPrinter::endFunction(const MachineFunction 
*MF) {
 
   assert(TM.getTargetTriple().getOS() == Triple::AMDHSA);
 
-  const GCNSubtarget &STM = MF->getSubtarget<GCNSubtarget>();
-  MCContext &Ctx = MF->getContext();
-
-  AMDGPU::MCKernelDescriptor KD =
-      getAmdhsaKernelDescriptor(*MF, CurrentProgramInfo);
-
-  // Compute inst_pref_size using MCExpr label subtraction for exact code
-  // size. At this point .Lfunc_end has been emitted (by the base AsmPrinter)
-  // right after the function code, so (Lfunc_end - func_sym) gives the
-  // exact function code size in bytes.
-  if (STM.hasInstPrefSize()) {
-    const MCExpr *CodeSizeExpr = MCBinaryExpr::createSub(
-        MCSymbolRefExpr::create(getFunctionEnd(), OutContext),
-        MCSymbolRefExpr::create(CurrentFnSym, OutContext), OutContext);
-
-    uint32_t Mask, Shift, Width, CacheLineSize;
-    STM.getInstPrefSizeArgs(Mask, Shift, Width, CacheLineSize);
-    const MCExpr *InstPrefSize =
-        AMDGPUMCExpr::createInstPrefSize(CodeSizeExpr, Ctx);
-    KD.compute_pgm_rsrc3 =
-        setBits(KD.compute_pgm_rsrc3, InstPrefSize, Mask, Shift, Ctx);
-  }
-
   auto &Streamer = getTargetStreamer()->getStreamer();
   auto &Context = Streamer.getContext();
   auto &ObjectFileInfo = *Context.getObjectFileInfo();
@@ -289,10 +254,13 @@ void AMDGPUAsmPrinter::endFunction(const MachineFunction 
*MF) {
   Streamer.emitValueToAlignment(Align(64), 0, 1, 0);
   ReadOnlySection.ensureMinAlignment(Align(64));
 
+  const GCNSubtarget &STM = MF->getSubtarget<GCNSubtarget>();
+
   SmallString<128> KernelName;
   getNameWithPrefix(KernelName, &MF->getFunction());
   getTargetStreamer()->EmitAmdhsaKernelDescriptor(
-      STM, KernelName, KD, CurrentProgramInfo.NumVGPRsForWavesPerEU,
+      STM, KernelName, getAmdhsaKernelDescriptor(*MF, CurrentProgramInfo),
+      CurrentProgramInfo.NumVGPRsForWavesPerEU,
       MCBinaryExpr::createSub(
           CurrentProgramInfo.NumSGPRsForWavesPerEU,
           AMDGPUMCExpr::createExtraSGPRs(
@@ -1470,22 +1438,33 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo 
&ProgInfo,
   ProgInfo.LdsSize = STM.isAmdHsaOS() ? 0 : ProgInfo.LDSBlocks;
   ProgInfo.EXCPEnable = 0;
 
+  // return ((Dst & ~Mask) | (Value << Shift))
+  auto SetBits = [&Ctx](const MCExpr *Dst, const MCExpr *Value, uint32_t Mask,
+                        uint32_t Shift) {
+    const auto *Shft = MCConstantExpr::create(Shift, Ctx);
+    const auto *Msk = MCConstantExpr::create(Mask, Ctx);
+    Dst = MCBinaryExpr::createAnd(Dst, MCUnaryExpr::createNot(Msk, Ctx), Ctx);
+    Dst = MCBinaryExpr::createOr(Dst, MCBinaryExpr::createShl(Value, Shft, Ctx),
+                                 Ctx);
+    return Dst;
+  };
+
   if (STM.hasGFX90AInsts()) {
     ProgInfo.ComputePGMRSrc3 =
-        setBits(ProgInfo.ComputePGMRSrc3, ProgInfo.AccumOffset,
+        SetBits(ProgInfo.ComputePGMRSrc3, ProgInfo.AccumOffset,
                 amdhsa::COMPUTE_PGM_RSRC3_GFX90A_ACCUM_OFFSET,
-                amdhsa::COMPUTE_PGM_RSRC3_GFX90A_ACCUM_OFFSET_SHIFT, Ctx);
+                amdhsa::COMPUTE_PGM_RSRC3_GFX90A_ACCUM_OFFSET_SHIFT);
     ProgInfo.ComputePGMRSrc3 =
-        setBits(ProgInfo.ComputePGMRSrc3, CreateExpr(ProgInfo.TgSplit),
+        SetBits(ProgInfo.ComputePGMRSrc3, CreateExpr(ProgInfo.TgSplit),
                 amdhsa::COMPUTE_PGM_RSRC3_GFX90A_TG_SPLIT,
-                amdhsa::COMPUTE_PGM_RSRC3_GFX90A_TG_SPLIT_SHIFT, Ctx);
+                amdhsa::COMPUTE_PGM_RSRC3_GFX90A_TG_SPLIT_SHIFT);
   }
 
   if (STM.hasGFX1250Insts())
     ProgInfo.ComputePGMRSrc3 =
-        setBits(ProgInfo.ComputePGMRSrc3, ProgInfo.NamedBarCnt,
+        SetBits(ProgInfo.ComputePGMRSrc3, ProgInfo.NamedBarCnt,
                 amdhsa::COMPUTE_PGM_RSRC3_GFX125_NAMED_BAR_CNT,
-                amdhsa::COMPUTE_PGM_RSRC3_GFX125_NAMED_BAR_CNT_SHIFT, Ctx);
+                amdhsa::COMPUTE_PGM_RSRC3_GFX125_NAMED_BAR_CNT_SHIFT);
 
   ProgInfo.Occupancy = createOccupancy(
       STM.computeOccupancy(F, ProgInfo.LDSSize).second,
@@ -1504,6 +1483,26 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo 
&ProgInfo,
             ", final occupancy is " + Twine(Occupancy));
     F.getContext().diagnose(Diag);
   }
+
+  if (isGFX11Plus(STM)) {
+    uint32_t CodeSizeInBytes = (uint32_t)std::min(
+        ProgInfo.getFunctionCodeSize(MF, true /* IsLowerBound */),
+        (uint64_t)std::numeric_limits<uint32_t>::max());
+    uint32_t CodeSizeInLines = divideCeil(CodeSizeInBytes, 128);
+    uint32_t Field, Shift, Width;
+    if (isGFX11(STM)) {
+      Field = amdhsa::COMPUTE_PGM_RSRC3_GFX11_INST_PREF_SIZE;
+      Shift = amdhsa::COMPUTE_PGM_RSRC3_GFX11_INST_PREF_SIZE_SHIFT;
+      Width = amdhsa::COMPUTE_PGM_RSRC3_GFX11_INST_PREF_SIZE_WIDTH;
+    } else {
+      Field = amdhsa::COMPUTE_PGM_RSRC3_GFX12_PLUS_INST_PREF_SIZE;
+      Shift = amdhsa::COMPUTE_PGM_RSRC3_GFX12_PLUS_INST_PREF_SIZE_SHIFT;
+      Width = amdhsa::COMPUTE_PGM_RSRC3_GFX12_PLUS_INST_PREF_SIZE_WIDTH;
+    }
+    uint64_t InstPrefSize = std::min(CodeSizeInLines, (1u << Width) - 1);
+    ProgInfo.ComputePGMRSrc3 = SetBits(ProgInfo.ComputePGMRSrc3,
+                                       CreateExpr(InstPrefSize), Field, Shift);
+  }
 }
 
 static unsigned getRsrcReg(CallingConv::ID CallConv) {

diff  --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h 
b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
index 758e9b445d6dd..5f580ac0577d5 100644
--- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h
+++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
@@ -21,7 +21,6 @@
 #include "SIISelLowering.h"
 #include "SIInstrInfo.h"
 #include "Utils/AMDGPUBaseInfo.h"
-#include "llvm/Support/AMDHSAKernelDescriptor.h"
 #include "llvm/Support/ErrorHandling.h"
 
 #define GET_SUBTARGETINFO_HEADER
@@ -426,23 +425,6 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
 
   bool hasPrefetch() const { return HasGFX12Insts; }
 
-  bool hasInstPrefSize() const { return isGFX11Plus(); }
-
-  void getInstPrefSizeArgs(uint32_t &Mask, uint32_t &Shift, uint32_t &Width,
-                           uint32_t &CacheLineSize) const {
-    assert(isGFX11Plus());
-    CacheLineSize = getInstCacheLineSize();
-    if (getGeneration() == GFX11) {
-      Mask = amdhsa::COMPUTE_PGM_RSRC3_GFX11_INST_PREF_SIZE;
-      Shift = amdhsa::COMPUTE_PGM_RSRC3_GFX11_INST_PREF_SIZE_SHIFT;
-      Width = amdhsa::COMPUTE_PGM_RSRC3_GFX11_INST_PREF_SIZE_WIDTH;
-    } else {
-      Mask = amdhsa::COMPUTE_PGM_RSRC3_GFX12_PLUS_INST_PREF_SIZE;
-      Shift = amdhsa::COMPUTE_PGM_RSRC3_GFX12_PLUS_INST_PREF_SIZE_SHIFT;
-      Width = amdhsa::COMPUTE_PGM_RSRC3_GFX12_PLUS_INST_PREF_SIZE_WIDTH;
-    }
-  }
-
   // Has s_cmpk_* instructions.
   bool hasSCmpK() const { return getGeneration() < GFX12; }
 

diff  --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCExpr.cpp 
b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCExpr.cpp
index 4563803ad6577..fd0a2a6a77d7e 100644
--- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCExpr.cpp
+++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCExpr.cpp
@@ -12,12 +12,9 @@
 #include "llvm/MC/MCAssembler.h"
 #include "llvm/MC/MCContext.h"
 #include "llvm/MC/MCStreamer.h"
-#include "llvm/MC/MCSubtargetInfo.h"
 #include "llvm/MC/MCSymbol.h"
 #include "llvm/MC/MCValue.h"
-#include "llvm/Support/AMDHSAKernelDescriptor.h"
 #include "llvm/Support/KnownBits.h"
-#include "llvm/Support/MathExtras.h"
 #include "llvm/Support/raw_ostream.h"
 #include <functional>
 #include <optional>
@@ -77,9 +74,6 @@ void AMDGPUMCExpr::printImpl(raw_ostream &OS, const MCAsmInfo 
*MAI) const {
   case AGVK_Occupancy:
     OS << "occupancy(";
     break;
-  case AGVK_InstPrefSize:
-    OS << "instprefsize(";
-    break;
   case AGVK_Lit:
     OS << "lit(";
     break;
@@ -188,27 +182,6 @@ bool AMDGPUMCExpr::evaluateOccupancy(MCValue &Res,
   return true;
 }
 
-/// Get the inst_pref_size field width for the given subtarget.
-static unsigned getInstPrefSizeFieldWidth(const MCSubtargetInfo &STI) {
-  if (AMDGPU::isGFX12Plus(STI))
-    return amdhsa::COMPUTE_PGM_RSRC3_GFX12_PLUS_INST_PREF_SIZE_WIDTH;
-  return amdhsa::COMPUTE_PGM_RSRC3_GFX11_INST_PREF_SIZE_WIDTH;
-}
-
-bool AMDGPUMCExpr::evaluateInstPrefSize(MCValue &Res,
-                                        const MCAssembler *Asm) const {
-  uint64_t CodeSizeInBytes = 0;
-  if (!evaluateMCExprs(Args, Asm, {CodeSizeInBytes}))
-    return false;
-  const MCSubtargetInfo *STI = Ctx.getSubtargetInfo();
-  unsigned FieldWidth = getInstPrefSizeFieldWidth(*STI);
-  unsigned CacheLineSize = AMDGPU::IsaInfo::getInstCacheLineSize(STI);
-  uint64_t CodeSizeInLines = divideCeil(CodeSizeInBytes, CacheLineSize);
-  uint64_t MaxVal = (1u << FieldWidth) - 1;
-  Res = MCValue::get(std::min(CodeSizeInLines, MaxVal));
-  return true;
-}
-
 bool AMDGPUMCExpr::isSymbolUsedInExpression(const MCSymbol *Sym,
                                             const MCExpr *E) {
   switch (E->getKind()) {
@@ -254,8 +227,6 @@ bool AMDGPUMCExpr::evaluateAsRelocatableImpl(MCValue &Res,
     return evaluateTotalNumVGPR(Res, Asm);
   case AGVK_Occupancy:
     return evaluateOccupancy(Res, Asm);
-  case AGVK_InstPrefSize:
-    return evaluateInstPrefSize(Res, Asm);
   case AGVK_Lit:
   case AGVK_Lit64:
     return Args[0]->evaluateAsRelocatable(Res, Asm);
@@ -308,11 +279,6 @@ const AMDGPUMCExpr *AMDGPUMCExpr::createTotalNumVGPR(const 
MCExpr *NumAGPR,
   return create(AGVK_TotalNumVGPRs, {NumAGPR, NumVGPR}, Ctx);
 }
 
-const AMDGPUMCExpr *
-AMDGPUMCExpr::createInstPrefSize(const MCExpr *CodeSizeBytes, MCContext &Ctx) {
-  return create(AGVK_InstPrefSize, {CodeSizeBytes}, Ctx);
-}
-
 const AMDGPUMCExpr *AMDGPUMCExpr::createLit(LitModifier Lit, int64_t Value,
                                             MCContext &Ctx) {
   assert(Lit == LitModifier::Lit || Lit == LitModifier::Lit64);
@@ -503,7 +469,6 @@ static void targetOpKnownBitsMapHelper(const MCExpr *Expr, 
KnownBitsMap &KBM,
   case AMDGPUMCExpr::VariantKind::AGVK_TotalNumVGPRs:
   case AMDGPUMCExpr::VariantKind::AGVK_AlignTo:
   case AMDGPUMCExpr::VariantKind::AGVK_Occupancy:
-  case AMDGPUMCExpr::VariantKind::AGVK_InstPrefSize:
   case AMDGPUMCExpr::VariantKind::AGVK_Lit:
   case AMDGPUMCExpr::VariantKind::AGVK_Lit64: {
     int64_t Val;
@@ -512,16 +477,6 @@ static void targetOpKnownBitsMapHelper(const MCExpr *Expr, 
KnownBitsMap &KBM,
       KBM[Expr] = KnownBits::makeConstant(APValue);
       return;
     }
-    if (AGVK->getKind() == AMDGPUMCExpr::VariantKind::AGVK_InstPrefSize) {
-      // The result is clamped to (1 << FieldWidth) - 1, so upper bits are
-      // known zero. FieldWidth is derived from the subtarget.
-      const MCSubtargetInfo *STI = AGVK->getCtx().getSubtargetInfo();
-      unsigned FieldWidth = getInstPrefSizeFieldWidth(*STI);
-      KnownBits KB(BitWidth);
-      KB.Zero.setBitsFrom(FieldWidth);
-      KBM[Expr] = KB;
-      return;
-    }
     KBM[Expr] = KnownBits(BitWidth);
     return;
   }

diff  --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCExpr.h 
b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCExpr.h
index 4b1aa0c591a80..96bd8f4cf3c13 100644
--- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCExpr.h
+++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCExpr.h
@@ -38,7 +38,6 @@ class AMDGPUMCExpr : public MCTargetExpr {
     AGVK_TotalNumVGPRs,
     AGVK_AlignTo,
     AGVK_Occupancy,
-    AGVK_InstPrefSize,
     AGVK_Lit,
     AGVK_Lit64,
   };
@@ -70,7 +69,6 @@ class AMDGPUMCExpr : public MCTargetExpr {
   bool evaluateTotalNumVGPR(MCValue &Res, const MCAssembler *Asm) const;
   bool evaluateAlignTo(MCValue &Res, const MCAssembler *Asm) const;
   bool evaluateOccupancy(MCValue &Res, const MCAssembler *Asm) const;
-  bool evaluateInstPrefSize(MCValue &Res, const MCAssembler *Asm) const;
 
 public:
   static const AMDGPUMCExpr *
@@ -99,18 +97,11 @@ class AMDGPUMCExpr : public MCTargetExpr {
     return create(VariantKind::AGVK_AlignTo, {Value, Align}, Ctx);
   }
 
-  /// Create an expression for instruction prefetch size computation:
-  /// min(divideCeil(CodeSizeBytes, CacheLineSize), (1 << FieldWidth) - 1)
-  /// FieldWidth and CacheLineSize are derived from the subtarget.
-  static const AMDGPUMCExpr *createInstPrefSize(const MCExpr *CodeSizeBytes,
-                                                MCContext &Ctx);
-
   static const AMDGPUMCExpr *createLit(LitModifier Lit, int64_t Value,
                                        MCContext &Ctx);
 
   ArrayRef<const MCExpr *> getArgs() const { return Args; }
   VariantKind getKind() const { return Kind; }
-  MCContext &getCtx() const { return Ctx; }
   const MCExpr *getSubExpr(size_t Index) const;
 
   void printImpl(raw_ostream &OS, const MCAsmInfo *MAI) const override;

diff  --git a/llvm/lib/Target/AMDGPU/SIProgramInfo.cpp 
b/llvm/lib/Target/AMDGPU/SIProgramInfo.cpp
index 27cef7a1b9158..99255e4060886 100644
--- a/llvm/lib/Target/AMDGPU/SIProgramInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIProgramInfo.cpp
@@ -215,8 +215,9 @@ const MCExpr *SIProgramInfo::getPGMRSrc2(CallingConv::ID CC,
   return MCConstantExpr::create(0, Ctx);
 }
 
-uint64_t SIProgramInfo::getFunctionCodeSize(const MachineFunction &MF) {
-  if (CodeSizeInBytes.has_value())
+uint64_t SIProgramInfo::getFunctionCodeSize(const MachineFunction &MF,
+                                            bool IsLowerBound) {
+  if (!IsLowerBound && CodeSizeInBytes.has_value())
     return *CodeSizeInBytes;
 
   const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>();
@@ -225,7 +226,12 @@ uint64_t SIProgramInfo::getFunctionCodeSize(const 
MachineFunction &MF) {
   uint64_t CodeSize = 0;
 
   for (const MachineBasicBlock &MBB : MF) {
-    CodeSize = alignTo(CodeSize, MBB.getAlignment());
+    // The amount of padding to align code can be both underestimated and
+    // overestimated. In case of inline asm used getInstSizeInBytes() will
+    // return a maximum size of a single instruction, where the real size may
+    // differ. At this point CodeSize may be already off.
+    if (!IsLowerBound)
+      CodeSize = alignTo(CodeSize, MBB.getAlignment());
 
     for (const MachineInstr &MI : MBB) {
       // TODO: CodeSize should account for multiple functions.
@@ -233,6 +239,11 @@ uint64_t SIProgramInfo::getFunctionCodeSize(const 
MachineFunction &MF) {
       if (MI.isMetaInstruction())
         continue;
 
+      // We cannot properly estimate inline asm size. It can be as small as zero
+      // if that is just a comment.
+      if (IsLowerBound && MI.isInlineAsm())
+        continue;
+
       CodeSize += TII->getInstSizeInBytes(MI);
     }
   }

diff  --git a/llvm/lib/Target/AMDGPU/SIProgramInfo.h 
b/llvm/lib/Target/AMDGPU/SIProgramInfo.h
index fb56ebf88c96f..947b473142a1f 100644
--- a/llvm/lib/Target/AMDGPU/SIProgramInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIProgramInfo.h
@@ -105,7 +105,10 @@ struct LLVM_EXTERNAL_VISIBILITY SIProgramInfo {
   void reset(const MachineFunction &MF);
 
   // Get function code size and cache the value.
-  uint64_t getFunctionCodeSize(const MachineFunction &MF);
+  // If \p IsLowerBound is set it returns a minimal code size which is safe
+  // to address.
+  uint64_t getFunctionCodeSize(const MachineFunction &MF,
+                               bool IsLowerBound = false);
 
   /// Compute the value of the ComputePGMRsrc1 register.
   const MCExpr *getComputePGMRSrc1(const GCNSubtarget &ST,

diff  --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp 
b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
index dd67e77d0d9ed..b13aed2432602 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
@@ -1182,14 +1182,6 @@ std::string AMDGPUTargetID::toString() const {
   return Str;
 }
 
-unsigned getInstCacheLineSize(const MCSubtargetInfo *STI) {
-  if (STI->getFeatureBits().test(FeatureInstCacheLineSize128))
-    return 128;
-  if (STI->getFeatureBits().test(FeatureInstCacheLineSize64))
-    return 64;
-  return 64;
-}
-
 unsigned getWavefrontSize(const MCSubtargetInfo *STI) {
   if (STI->getFeatureBits().test(FeatureWavefrontSize16))
     return 16;

diff  --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h 
b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
index e1b36f0996331..49373f09ee460 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
@@ -233,9 +233,6 @@ inline raw_ostream &operator<<(raw_ostream &OS,
   return OS;
 }
 
-/// \returns Instruction cache line size in bytes for given subtarget \p STI.
-unsigned getInstCacheLineSize(const MCSubtargetInfo *STI);
-
 /// \returns Wavefront size for given subtarget \p STI.
 unsigned getWavefrontSize(const MCSubtargetInfo *STI);
 

diff  --git a/llvm/test/CodeGen/AMDGPU/inst-prefetch-hint.ll 
b/llvm/test/CodeGen/AMDGPU/inst-prefetch-hint.ll
index b76ef7eac11c4..580167076e1f0 100644
--- a/llvm/test/CodeGen/AMDGPU/inst-prefetch-hint.ll
+++ b/llvm/test/CodeGen/AMDGPU/inst-prefetch-hint.ll
@@ -1,31 +1,11 @@
 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 
--amdgpu-memcpy-loop-unroll=100000 < %s | FileCheck --check-prefixes=GCN,GFX11 
%s
 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 
--amdgpu-memcpy-loop-unroll=100000 < %s | FileCheck --check-prefixes=GCN,GFX12 
%s
 
-;; Verify that inst_pref_size resolves to the correct value in the object file.
-;; COMPUTE_PGM_RSRC3 is at offset 0x2C in each 64-byte kernel descriptor.
-;; inst_pref_size is bits [9:4] on GFX11 (6-bit) and bits [11:4] on GFX12+ 
(8-bit).
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 
--amdgpu-memcpy-loop-unroll=100000 -filetype=obj < %s -o %t.gfx11.o
-; RUN: llvm-objdump -s -j .rodata %t.gfx11.o | FileCheck 
--check-prefix=OBJ-GFX11 %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 
--amdgpu-memcpy-loop-unroll=100000 -filetype=obj < %s -o %t.gfx12.o
-; RUN: llvm-objdump -s -j .rodata %t.gfx12.o | FileCheck 
--check-prefix=OBJ-GFX12 %s
-
-; The inst_pref_size is computed via MCExpr label subtraction, resolved at
-; assembly/link time. In text output it appears as:
-;   ((instprefsize(<code_size>)<<Shift)&Mask)>>Shift
-; where:
-;   <code_size>       = .Lfunc_endN - func_sym (exact function code size in 
bytes)
-;   instprefsize      = min(divideCeil(code_size, cache_line_size), (1 << 
field_width) - 1)
-;   field_width and cache_line_size are derived from the subtarget
-
 ; GCN-LABEL: .amdhsa_kernel large
-; GFX11: .amdhsa_inst_pref_size ((instprefsize(.Lfunc_end0-large)<<4)&1008)>>4
-; GFX11: codeLenInByte = {{[0-9]+}}
-; GFX12: .amdhsa_inst_pref_size ((instprefsize(.Lfunc_end0-large)<<4)&4080)>>4
-; GFX12: codeLenInByte = {{[0-9]+}}
-;; Object: kernel descriptor at 0x00, COMPUTE_PGM_RSRC3 at 0x2C:
-;; gfx11 pref=3 (0x30), gfx12 pref=4 (0x40)
-; OBJ-GFX11: 0020 {{.*}}30000000
-; OBJ-GFX12: 0020 {{.*}}40000000
+; GFX11: .amdhsa_inst_pref_size 3
+; GFX11: codeLenInByte = 3{{[0-9][0-9]$}}
+; GFX12: .amdhsa_inst_pref_size 4
+; GFX12: codeLenInByte = 4{{[0-9][0-9]$}}
 define amdgpu_kernel void @large(ptr addrspace(1) %out, ptr addrspace(1) %in) {
 bb:
   call void @llvm.memcpy.p1.p3.i32(ptr addrspace(1) %out, ptr addrspace(1) 
%in, i32 256, i1 false)
@@ -33,30 +13,18 @@ bb:
 }
 
 ; GCN-LABEL: .amdhsa_kernel small
-; GFX11: .amdhsa_inst_pref_size ((instprefsize(.Lfunc_end1-small)<<4)&1008)>>4
-; GFX12: .amdhsa_inst_pref_size ((instprefsize(.Lfunc_end1-small)<<4)&4080)>>4
-; GCN: codeLenInByte = {{[0-9]+}}
-;; Object: kernel descriptor at 0x40, COMPUTE_PGM_RSRC3 at 0x6C:
-;; pref=1 (0x10) for both
-; OBJ-GFX11: 0060 {{.*}}10000000
-; OBJ-GFX12: 0060 {{.*}}10000000
+; GCN: .amdhsa_inst_pref_size 1
+; GCN: codeLenInByte = {{[0-9]$}}
 define amdgpu_kernel void @small() {
 bb:
   ret void
 }
 
-; Inline asm is accounted for via MCExpr label subtraction (exact code size).
-; The MCExpr resolves to the correct inst_pref_size at assembly time.
+; Ignore inline asm in size calculation
 
 ; GCN-LABEL: .amdhsa_kernel inline_asm
-; GFX11: .amdhsa_inst_pref_size 
((instprefsize(.Lfunc_end2-inline_asm)<<4)&1008)>>4
-; GFX12: .amdhsa_inst_pref_size 
((instprefsize(.Lfunc_end2-inline_asm)<<4)&4080)>>4
-; GCN: codeLenInByte = {{[0-9]+}}
-;; Object: kernel descriptor at 0x80, COMPUTE_PGM_RSRC3 at 0xAC:
-;; pref=9 (0x90) for both
-;; (.fill 256, 4, 0 = 1024 bytes + 4 s_endpgm = 1028 -> divideCeil(1028,128) = 
9)
-; OBJ-GFX11: 00a0 {{.*}}90000000
-; OBJ-GFX12: 00a0 {{.*}}90000000
+; GCN: .amdhsa_inst_pref_size 1
+; GCN: codeLenInByte = {{[0-9]$}}
 define amdgpu_kernel void @inline_asm() {
 bb:
   call void asm sideeffect ".fill 256, 4, 0", ""()

diff  --git a/llvm/test/CodeGen/AMDGPU/inst-prefetch-inline-asm.ll 
b/llvm/test/CodeGen/AMDGPU/inst-prefetch-inline-asm.ll
deleted file mode 100644
index 287a30032230b..0000000000000
--- a/llvm/test/CodeGen/AMDGPU/inst-prefetch-inline-asm.ll
+++ /dev/null
@@ -1,154 +0,0 @@
-;; Verify that inline assembly is correctly accounted for in the
-;; inst_pref_size calculation. The inst_pref_size is computed via MCExpr
-;; label subtraction (.Lfunc_end - func_sym), giving exact code size.
-;; See inst-prefetch-hint.ll for explanation of the instprefsize expression.
-
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 < %s | FileCheck 
--check-prefix=GFX11 %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 < %s | FileCheck 
--check-prefix=GFX12 %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -filetype=obj < %s -o 
%t.gfx11.o
-; RUN: llvm-objdump -s -j .rodata %t.gfx11.o | FileCheck 
--check-prefix=OBJ-GFX11 %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 -filetype=obj < %s -o 
%t.gfx12.o
-; RUN: llvm-objdump -s -j .rodata %t.gfx12.o | FileCheck 
--check-prefix=OBJ-GFX12 %s
-
-;; --- .fill directive: .fill 256, 4, 0 => 1024 bytes + 4 (s_endpgm) = 1028 ---
-;; pref_size = divideCeil(1028, 128) = 9
-
-; GFX11-LABEL: .amdhsa_kernel test_fill
-; GFX11: .amdhsa_inst_pref_size 
((instprefsize(.Lfunc_end0-test_fill)<<4)&1008)>>4
-; GFX12-LABEL: .amdhsa_kernel test_fill
-; GFX12: .amdhsa_inst_pref_size 
((instprefsize(.Lfunc_end0-test_fill)<<4)&4080)>>4
-;; Object: kernel descriptor at 0x00, COMPUTE_PGM_RSRC3 at 0x2C:
-;; pref_size=9 -> 9<<4 = 0x90
-; OBJ-GFX11: 0020 {{.*}}90000000
-; OBJ-GFX12: 0020 {{.*}}90000000
-
-define amdgpu_kernel void @test_fill() {
-  call void asm sideeffect ".fill 256, 4, 0", ""()
-  ret void
-}
-
-;; --- .space directive: .space 1024 => 1024 bytes + 4 = 1028 ---
-;; pref_size = 9
-
-; GFX11-LABEL: .amdhsa_kernel test_space
-; GFX11: .amdhsa_inst_pref_size 
((instprefsize(.Lfunc_end1-test_space)<<4)&1008)>>4
-; GFX12-LABEL: .amdhsa_kernel test_space
-; GFX12: .amdhsa_inst_pref_size 
((instprefsize(.Lfunc_end1-test_space)<<4)&4080)>>4
-;; Object: kernel descriptor at 0x40, COMPUTE_PGM_RSRC3 at 0x6C:
-;; pref_size=9 -> 9<<4 = 0x90
-; OBJ-GFX11: 0060 {{.*}}90000000
-; OBJ-GFX12: 0060 {{.*}}90000000
-
-define amdgpu_kernel void @test_space() {
-  call void asm sideeffect ".space 1024", ""()
-  ret void
-}
-
-;; --- Instructions: 32 x s_nop (4 bytes each) = 128 + 4 = 132 ---
-;; pref_size = divideCeil(132, 128) = 2
-
-; GFX11-LABEL: .amdhsa_kernel test_instructions
-; GFX11: .amdhsa_inst_pref_size 
((instprefsize(.Lfunc_end2-test_instructions)<<4)&1008)>>4
-; GFX12-LABEL: .amdhsa_kernel test_instructions
-; GFX12: .amdhsa_inst_pref_size 
((instprefsize(.Lfunc_end2-test_instructions)<<4)&4080)>>4
-;; Object: kernel descriptor at 0x80, COMPUTE_PGM_RSRC3 at 0xAC:
-;; pref_size=2 -> 2<<4 = 0x20
-; OBJ-GFX11: 00a0 {{.*}}20000000
-; OBJ-GFX12: 00a0 {{.*}}20000000
-
-define amdgpu_kernel void @test_instructions() {
-  call void asm sideeffect "s_nop 0\0As_nop 0\0As_nop 0\0As_nop 0\0As_nop 
0\0As_nop 0\0As_nop 0\0As_nop 0\0As_nop 0\0As_nop 0\0As_nop 0\0As_nop 0\0As_nop 
0\0As_nop 0\0As_nop 0\0As_nop 0\0As_nop 0\0As_nop 0\0As_nop 0\0As_nop 0\0As_nop 
0\0As_nop 0\0As_nop 0\0As_nop 0\0As_nop 0\0As_nop 0\0As_nop 0\0As_nop 0\0As_nop 
0\0As_nop 0\0As_nop 0\0As_nop 0", ""()
-  ret void
-}
-
-;; --- Comments emit no bytes: only s_endpgm = 4 bytes ---
-;; pref_size = 1
-
-; GFX11-LABEL: .amdhsa_kernel test_comments
-; GFX11: .amdhsa_inst_pref_size 
((instprefsize(.Lfunc_end3-test_comments)<<4)&1008)>>4
-; GFX12-LABEL: .amdhsa_kernel test_comments
-; GFX12: .amdhsa_inst_pref_size 
((instprefsize(.Lfunc_end3-test_comments)<<4)&4080)>>4
-;; Object: kernel descriptor at 0xC0, COMPUTE_PGM_RSRC3 at 0xEC:
-;; pref_size=1 -> 1<<4 = 0x10
-; OBJ-GFX11: 00e0 {{.*}}10000000
-; OBJ-GFX12: 00e0 {{.*}}10000000
-
-define amdgpu_kernel void @test_comments() {
-  call void asm sideeffect "; comment 1\0A; comment 2\0A; comment 3", ""()
-  ret void
-}
-
-;; --- Empty inline asm: only s_endpgm = 4 bytes ---
-;; pref_size = 1
-
-; GFX11-LABEL: .amdhsa_kernel test_empty_asm
-; GFX11: .amdhsa_inst_pref_size 
((instprefsize(.Lfunc_end4-test_empty_asm)<<4)&1008)>>4
-; GFX12-LABEL: .amdhsa_kernel test_empty_asm
-; GFX12: .amdhsa_inst_pref_size 
((instprefsize(.Lfunc_end4-test_empty_asm)<<4)&4080)>>4
-;; Object: kernel descriptor at 0x100, COMPUTE_PGM_RSRC3 at 0x12C:
-;; pref_size=1 -> 1<<4 = 0x10
-; OBJ-GFX11: 0120 {{.*}}10000000
-; OBJ-GFX12: 0120 {{.*}}10000000
-
-define amdgpu_kernel void @test_empty_asm() {
-  call void asm sideeffect "", ""()
-  ret void
-}
-
-;; --- Multiple inline asm blocks: .fill (512) + .space (512) + s_endpgm (4) = 
1028 ---
-;; pref_size = divideCeil(1028, 128) = 9
-
-; GFX11-LABEL: .amdhsa_kernel test_multiple_asm
-; GFX11: .amdhsa_inst_pref_size 
((instprefsize(.Lfunc_end5-test_multiple_asm)<<4)&1008)>>4
-; GFX12-LABEL: .amdhsa_kernel test_multiple_asm
-; GFX12: .amdhsa_inst_pref_size 
((instprefsize(.Lfunc_end5-test_multiple_asm)<<4)&4080)>>4
-;; Object: kernel descriptor at 0x140, COMPUTE_PGM_RSRC3 at 0x16C:
-;; pref_size=9 -> 9<<4 = 0x90
-; OBJ-GFX11: 0160 {{.*}}90000000
-; OBJ-GFX12: 0160 {{.*}}90000000
-
-define amdgpu_kernel void @test_multiple_asm() {
-  call void asm sideeffect ".fill 128, 4, 0", ""()
-  call void asm sideeffect ".space 512", ""()
-  ret void
-}
-
-;; --- Large function that exceeds GFX11 6-bit field max (63) ---
-;; .fill 2048, 4, 0 = 8192 bytes + 4 = 8196 bytes
-;; divideCeil(8196, 128) = 65, but GFX11 max = (1<<6)-1 = 63
-;; pref_size should clamp to 63
-
-; GFX11-LABEL: .amdhsa_kernel test_clamping
-; GFX11: .amdhsa_inst_pref_size 
((instprefsize(.Lfunc_end6-test_clamping)<<4)&1008)>>4
-; GFX12-LABEL: .amdhsa_kernel test_clamping
-; GFX12: .amdhsa_inst_pref_size 
((instprefsize(.Lfunc_end6-test_clamping)<<4)&4080)>>4
-;; Object: kernel descriptor at 0x180, COMPUTE_PGM_RSRC3 at 0x1AC:
-;; gfx11: clamped to 63 -> 63<<4 = 0x3F0
-;; gfx12: no clamping, 65 -> 65<<4 = 0x410
-; OBJ-GFX11: 01a0 {{.*}}f0030000
-; OBJ-GFX12: 01a0 {{.*}}10040000
-
-define amdgpu_kernel void @test_clamping() {
-  call void asm sideeffect ".fill 2048, 4, 0", ""()
-  ret void
-}
-
-;; --- Large function that exceeds both GFX11 and GFX12 field max ---
-;; .fill 8192, 4, 0 = 32768 bytes + 4 = 32772 bytes
-;; divideCeil(32772, 128) = 257
-;; GFX11 max = 63, GFX12 max = 255 -> both clamp
-
-; GFX11-LABEL: .amdhsa_kernel test_clamping_both
-; GFX11: .amdhsa_inst_pref_size 
((instprefsize(.Lfunc_end7-test_clamping_both)<<4)&1008)>>4
-; GFX12-LABEL: .amdhsa_kernel test_clamping_both
-; GFX12: .amdhsa_inst_pref_size 
((instprefsize(.Lfunc_end7-test_clamping_both)<<4)&4080)>>4
-;; Object: kernel descriptor at 0x1C0, COMPUTE_PGM_RSRC3 at 0x1EC:
-;; gfx11: clamped to 63 -> 63<<4 = 0x3F0
-;; gfx12: clamped to 255 -> 255<<4 = 0xFF0
-; OBJ-GFX11: 01e0 {{.*}}f0030000
-; OBJ-GFX12: 01e0 {{.*}}f00f0000
-
-define amdgpu_kernel void @test_clamping_both() {
-  call void asm sideeffect ".fill 8192, 4, 0", ""()
-  ret void
-}


        
_______________________________________________
llvm-branch-commits mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits

[llvm-branch-commits] [llvm] 896e337 - Revert "[AMDGPU] Account for inline asm size in inst_pref_size calculation (#…"

Reply via email to