Author: Pankaj Dwivedi
Date: 2026-01-12T17:35:06+05:30
New Revision: 3dfb782333bf929945f63e5b0b1cad378b0bd87a

URL: 
https://github.com/llvm/llvm-project/commit/3dfb782333bf929945f63e5b0b1cad378b0bd87a
DIFF: 
https://github.com/llvm/llvm-project/commit/3dfb782333bf929945f63e5b0b1cad378b0bd87a.diff

LOG: [AMDGPU][SIInsertWaitcnt] Implement Waitcnt Expansion for Profiling 
(#169345)

Reference issue: https://github.com/ROCm/llvm-project/issues/67

This patch adds support for expanding s_waitcnt instructions into
sequences with decreasing counter values, enabling PC-sampling profilers
to identify which specific memory operation is causing a stall.

This is controlled via:
Clang flag: -mamdgpu-expand-waitcnt-profiling /
-mno-amdgpu-expand-waitcnt-profiling
Function attribute: "amdgpu-expand-waitcnt-profiling"

When enabled, instead of emitting a single waitcnt, the pass generates a
sequence that waits for each outstanding operation individually. For
example, if there are 5 outstanding memory operations and the target is
to wait until 2 remain:


**Original**: 
s_waitcnt vmcnt(2)

**Expanded**:  
s_waitcnt vmcnt(4)
s_waitcnt vmcnt(3)
s_waitcnt vmcnt(2)

The expansion starts from (Outstanding - 1) down to the target value,
since waitcnt(Outstanding) would be a no-op (the counter is already at
that value).

- Uses ScoreBrackets to determine the actual number of outstanding
operations
- Only expands when operations complete in-order
- Skips expansion for mixed event types (e.g., LDS+SMEM on same counter)
- Skips expansion for scalar memory (always out-of-order)

Related previous work, for reference:
- **PR**: llvm/llvm-project#79236 (related `-amdgpu-waitcnt-forcezero`)

---------

Co-authored-by: Pierre van Houtryve <[email protected]>

Added: 
    llvm/test/CodeGen/AMDGPU/expand-waitcnt-profiling.ll

Modified: 
    clang/include/clang/Basic/CodeGenOptions.def
    clang/include/clang/Options/Options.td
    clang/lib/CodeGen/Targets/AMDGPU.cpp
    llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
    llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
    llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h

Removed: 
    


################################################################################
diff  --git a/clang/include/clang/Basic/CodeGenOptions.def 
b/clang/include/clang/Basic/CodeGenOptions.def
index 6cdbffc456193..baf8b093c10e6 100644
--- a/clang/include/clang/Basic/CodeGenOptions.def
+++ b/clang/include/clang/Basic/CodeGenOptions.def
@@ -466,6 +466,10 @@ CODEGENOPT(AAPCSBitfieldWidth, 1, 1, Benign)
 /// propagate signaling NaN inputs per IEEE 754-2008 (AMDGPU Only)
 CODEGENOPT(EmitIEEENaNCompliantInsts, 1, 1, Benign)
 
+/// Enable expanded waitcnt for profiling (AMDGPU Only)
+/// Expands s_waitcnt instructions to help PC-sampling profilers identify 
stalls.
+CODEGENOPT(AMDGPUExpandWaitcntProfiling, 1, 0, Benign)
+
 // Whether to emit Swift Async function extended frame information: auto,
 // never, always.
 ENUM_CODEGENOPT(SwiftAsyncFramePointer, SwiftAsyncFramePointerKind, 2,

diff  --git a/clang/include/clang/Options/Options.td 
b/clang/include/clang/Options/Options.td
index 6a72931727a7c..d48ca15864060 100644
--- a/clang/include/clang/Options/Options.td
+++ b/clang/include/clang/Options/Options.td
@@ -5577,6 +5577,13 @@ defm amdgpu_ieee : BoolMOption<"amdgpu-ieee",
   "This option changes the ABI. (AMDGPU only)">,
   NegFlag<SetFalse, [], [ClangOption, CC1Option]>>;
 
+defm amdgpu_expand_waitcnt_profiling : 
BoolMOption<"amdgpu-expand-waitcnt-profiling",
+  CodeGenOpts<"AMDGPUExpandWaitcntProfiling">, DefaultFalse,
+  PosFlag<SetTrue, [], [ClangOption, CC1Option], "Expand s_waitcnt 
instructions to help "
+  "PC-sampling profilers identify memory stalls. Instead of a single 
waitcnt(target), "
+  "emits waitcnt(N-1), waitcnt(N-2), ..., waitcnt(target). (AMDGPU only)">,
+  NegFlag<SetFalse, [], [ClangOption]>>;
+
 def mcode_object_version_EQ : Joined<["-"], "mcode-object-version=">, 
Group<m_Group>,
   HelpText<"Specify code object ABI version. Defaults to 6. (AMDGPU only)">,
   Visibility<[ClangOption, FlangOption, CC1Option, FC1Option]>,

diff  --git a/clang/lib/CodeGen/Targets/AMDGPU.cpp 
b/clang/lib/CodeGen/Targets/AMDGPU.cpp
index 0ab6c753b8bad..4bc9557b26b52 100644
--- a/clang/lib/CodeGen/Targets/AMDGPU.cpp
+++ b/clang/lib/CodeGen/Targets/AMDGPU.cpp
@@ -443,6 +443,8 @@ void AMDGPUTargetCodeGenInfo::setTargetAttributes(
     setFunctionDeclAttributes(FD, F, M);
   if (!getABIInfo().getCodeGenOpts().EmitIEEENaNCompliantInsts)
     F->addFnAttr("amdgpu-ieee", "false");
+  if (getABIInfo().getCodeGenOpts().AMDGPUExpandWaitcntProfiling)
+    F->addFnAttr("amdgpu-expand-waitcnt-profiling");
 }
 
 unsigned AMDGPUTargetCodeGenInfo::getDeviceKernelCallingConv() const {

diff  --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp 
b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
index c0f9be77f3adc..4bf7c65daae72 100644
--- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
@@ -105,6 +105,35 @@ auto inst_counter_types(InstCounterType MaxCounter = 
NUM_INST_CNTS) {
   return enum_seq(LOAD_CNT, MaxCounter);
 }
 
+// Get the maximum wait count value for a given counter type.
+static unsigned getWaitCountMax(const AMDGPU::HardwareLimits &Limits,
+                                InstCounterType T) {
+  switch (T) {
+  case LOAD_CNT:
+    return Limits.LoadcntMax;
+  case DS_CNT:
+    return Limits.DscntMax;
+  case EXP_CNT:
+    return Limits.ExpcntMax;
+  case STORE_CNT:
+    return Limits.StorecntMax;
+  case SAMPLE_CNT:
+    return Limits.SamplecntMax;
+  case BVH_CNT:
+    return Limits.BvhcntMax;
+  case KM_CNT:
+    return Limits.KmcntMax;
+  case X_CNT:
+    return Limits.XcntMax;
+  case VA_VDST:
+    return Limits.VaVdstMax;
+  case VM_VSRC:
+    return Limits.VmVsrcMax;
+  default:
+    return 0;
+  }
+}
+
 /// Integer IDs used to track vector memory locations we may have to wait on.
 /// Encoded as u16 chunks:
 ///
@@ -140,19 +169,6 @@ static constexpr VMEMID toVMEMID(MCRegUnit RU) {
   return static_cast<unsigned>(RU);
 }
 
-struct HardwareLimits {
-  unsigned LoadcntMax; // Corresponds to VMcnt prior to gfx12.
-  unsigned ExpcntMax;
-  unsigned DscntMax;     // Corresponds to LGKMcnt prior to gfx12.
-  unsigned StorecntMax;  // Corresponds to VScnt in gfx10/gfx11.
-  unsigned SamplecntMax; // gfx12+ only.
-  unsigned BvhcntMax;    // gfx12+ only.
-  unsigned KmcntMax;     // gfx12+ only.
-  unsigned XcntMax;      // gfx1250.
-  unsigned VaVdstMax;    // gfx12+ expert mode only.
-  unsigned VmVsrcMax;    // gfx12+ expert mode only.
-};
-
 #define AMDGPU_DECLARE_WAIT_EVENTS(DECL)                                       
\
   DECL(VMEM_ACCESS) /* vmem read & write (pre-gfx10), vmem read (gfx10+) */    
\
   DECL(VMEM_SAMPLER_READ_ACCESS) /* vmem SAMPLER read (gfx12+ only) */         
\
@@ -314,19 +330,27 @@ class WaitcntGenerator {
   AMDGPU::IsaVersion IV;
   InstCounterType MaxCounter;
   bool OptNone;
+  bool ExpandWaitcntProfiling = false;
+  const AMDGPU::HardwareLimits *Limits = nullptr;
 
 public:
   WaitcntGenerator() = default;
-  WaitcntGenerator(const MachineFunction &MF, InstCounterType MaxCounter)
+  WaitcntGenerator(const MachineFunction &MF, InstCounterType MaxCounter,
+                   const AMDGPU::HardwareLimits *Limits)
       : ST(&MF.getSubtarget<GCNSubtarget>()), TII(ST->getInstrInfo()),
         IV(AMDGPU::getIsaVersion(ST->getCPU())), MaxCounter(MaxCounter),
         OptNone(MF.getFunction().hasOptNone() ||
-                MF.getTarget().getOptLevel() == CodeGenOptLevel::None) {}
+                MF.getTarget().getOptLevel() == CodeGenOptLevel::None),
+        ExpandWaitcntProfiling(
+            
MF.getFunction().hasFnAttribute("amdgpu-expand-waitcnt-profiling")),
+        Limits(Limits) {}
 
   // Return true if the current function should be compiled with no
   // optimization.
   bool isOptNone() const { return OptNone; }
 
+  const AMDGPU::HardwareLimits &getLimits() const { return *Limits; }
+
   // Edits an existing sequence of wait count instructions according
   // to an incoming Waitcnt value, which is itself updated to reflect
   // any new wait count instructions which may need to be generated by
@@ -348,9 +372,11 @@ class WaitcntGenerator {
 
   // Generates new wait count instructions according to the  value of
   // Wait, returning true if any new instructions were created.
+  // If ScoreBrackets is provided, it can be used for profiling expansion.
   virtual bool createNewWaitcnt(MachineBasicBlock &Block,
                                 MachineBasicBlock::instr_iterator It,
-                                AMDGPU::Waitcnt Wait) = 0;
+                                AMDGPU::Waitcnt Wait,
+                                WaitcntBrackets *ScoreBrackets = nullptr) = 0;
 
   // Returns an array of bit masks which can be used to map values in
   // WaitEventType to corresponding counter values in InstCounterType.
@@ -375,7 +401,10 @@ class WaitcntGenerator {
 
 class WaitcntGeneratorPreGFX12 : public WaitcntGenerator {
 public:
-  using WaitcntGenerator::WaitcntGenerator;
+  WaitcntGeneratorPreGFX12() = default;
+  WaitcntGeneratorPreGFX12(const MachineFunction &MF,
+                           const AMDGPU::HardwareLimits *Limits)
+      : WaitcntGenerator(MF, NUM_NORMAL_INST_CNTS, Limits) {}
 
   bool
   applyPreexistingWaitcnt(WaitcntBrackets &ScoreBrackets,
@@ -384,7 +413,8 @@ class WaitcntGeneratorPreGFX12 : public WaitcntGenerator {
 
   bool createNewWaitcnt(MachineBasicBlock &Block,
                         MachineBasicBlock::instr_iterator It,
-                        AMDGPU::Waitcnt Wait) override;
+                        AMDGPU::Waitcnt Wait,
+                        WaitcntBrackets *ScoreBrackets = nullptr) override;
 
   const unsigned *getWaitEventMask() const override {
     assert(ST);
@@ -416,8 +446,10 @@ class WaitcntGeneratorGFX12Plus : public WaitcntGenerator {
 public:
   WaitcntGeneratorGFX12Plus() = default;
   WaitcntGeneratorGFX12Plus(const MachineFunction &MF,
-                            InstCounterType MaxCounter, bool IsExpertMode)
-      : WaitcntGenerator(MF, MaxCounter), IsExpertMode(IsExpertMode) {}
+                            InstCounterType MaxCounter,
+                            const AMDGPU::HardwareLimits *Limits,
+                            bool IsExpertMode)
+      : WaitcntGenerator(MF, MaxCounter, Limits), IsExpertMode(IsExpertMode) {}
 
   bool
   applyPreexistingWaitcnt(WaitcntBrackets &ScoreBrackets,
@@ -426,7 +458,8 @@ class WaitcntGeneratorGFX12Plus : public WaitcntGenerator {
 
   bool createNewWaitcnt(MachineBasicBlock &Block,
                         MachineBasicBlock::instr_iterator It,
-                        AMDGPU::Waitcnt Wait) override;
+                        AMDGPU::Waitcnt Wait,
+                        WaitcntBrackets *ScoreBrackets = nullptr) override;
 
   const unsigned *getWaitEventMask() const override {
     assert(ST);
@@ -494,7 +527,7 @@ class SIInsertWaitcnts {
   // message.
   DenseSet<MachineInstr *> ReleaseVGPRInsts;
 
-  HardwareLimits Limits;
+  AMDGPU::HardwareLimits Limits;
 
 public:
   SIInsertWaitcnts(MachineLoopInfo *MLI, MachinePostDominatorTree *PDT,
@@ -505,33 +538,7 @@ class SIInsertWaitcnts {
     (void)ForceVMCounter;
   }
 
-  unsigned getWaitCountMax(InstCounterType T) const {
-    switch (T) {
-    case LOAD_CNT:
-      return Limits.LoadcntMax;
-    case DS_CNT:
-      return Limits.DscntMax;
-    case EXP_CNT:
-      return Limits.ExpcntMax;
-    case STORE_CNT:
-      return Limits.StorecntMax;
-    case SAMPLE_CNT:
-      return Limits.SamplecntMax;
-    case BVH_CNT:
-      return Limits.BvhcntMax;
-    case KM_CNT:
-      return Limits.KmcntMax;
-    case X_CNT:
-      return Limits.XcntMax;
-    case VA_VDST:
-      return Limits.VaVdstMax;
-    case VM_VSRC:
-      return Limits.VmVsrcMax;
-    default:
-      break;
-    }
-    return 0;
-  }
+  const AMDGPU::HardwareLimits &getLimits() const { return Limits; }
 
   bool shouldFlushVmCnt(MachineLoop *ML, const WaitcntBrackets &Brackets);
   bool isPreheaderToFlush(MachineBasicBlock &MBB,
@@ -762,7 +769,7 @@ class WaitcntBrackets {
 
   unsigned getPendingGDSWait() const {
     return std::min(getScoreUB(DS_CNT) - LastGDS,
-                    Context->getWaitCountMax(DS_CNT) - 1);
+                    getWaitCountMax(Context->getLimits(), DS_CNT) - 1);
   }
 
   void setPendingGDS() { LastGDS = ScoreUBs[DS_CNT]; }
@@ -789,8 +796,8 @@ class WaitcntBrackets {
   }
 
   void setStateOnFunctionEntryOrReturn() {
-    setScoreUB(STORE_CNT,
-               getScoreUB(STORE_CNT) + Context->getWaitCountMax(STORE_CNT));
+    setScoreUB(STORE_CNT, getScoreUB(STORE_CNT) +
+                              getWaitCountMax(Context->getLimits(), 
STORE_CNT));
     PendingEvents |= Context->WaitEventMaskForInst[STORE_CNT];
   }
 
@@ -846,8 +853,9 @@ class WaitcntBrackets {
     if (T != EXP_CNT)
       return;
 
-    if (getScoreRange(EXP_CNT) > Context->getWaitCountMax(EXP_CNT))
-      ScoreLBs[EXP_CNT] = ScoreUBs[EXP_CNT] - 
Context->getWaitCountMax(EXP_CNT);
+    if (getScoreRange(EXP_CNT) > getWaitCountMax(Context->getLimits(), 
EXP_CNT))
+      ScoreLBs[EXP_CNT] =
+          ScoreUBs[EXP_CNT] - getWaitCountMax(Context->getLimits(), EXP_CNT);
   }
 
   void setRegScore(MCPhysReg Reg, InstCounterType T, unsigned Val) {
@@ -1352,8 +1360,8 @@ void 
WaitcntBrackets::determineWaitForScore(InstCounterType T,
     } else {
       // If a counter has been maxed out avoid overflow by waiting for
       // MAX(CounterType) - 1 instead.
-      unsigned NeededWait =
-          std::min(UB - ScoreToWait, Context->getWaitCountMax(T) - 1);
+      unsigned NeededWait = std::min(
+          UB - ScoreToWait, getWaitCountMax(Context->getLimits(), T) - 1);
       addWait(Wait, T, NeededWait);
     }
   }
@@ -1682,38 +1690,109 @@ bool WaitcntGeneratorPreGFX12::applyPreexistingWaitcnt(
 /// required counters in \p Wait
 bool WaitcntGeneratorPreGFX12::createNewWaitcnt(
     MachineBasicBlock &Block, MachineBasicBlock::instr_iterator It,
-    AMDGPU::Waitcnt Wait) {
+    AMDGPU::Waitcnt Wait, WaitcntBrackets *ScoreBrackets) {
   assert(ST);
   assert(isNormalMode(MaxCounter));
 
   bool Modified = false;
   const DebugLoc &DL = Block.findDebugLoc(It);
 
+  // Helper to emit expanded waitcnt sequence for profiling.
+  // Emits waitcnts from (Outstanding-1) down to Target, or just Target if
+  // nothing to expand. The EmitWaitcnt callback emits a single waitcnt.
+  auto EmitExpandedWaitcnt = [&](unsigned Outstanding, unsigned Target,
+                                 auto EmitWaitcnt) {
+    if (Outstanding > Target) {
+      for (unsigned i = Outstanding - 1; i >= Target && i != ~0u; --i) {
+        EmitWaitcnt(i);
+        Modified = true;
+      }
+    } else {
+      EmitWaitcnt(Target);
+      Modified = true;
+    }
+  };
+
   // Waits for VMcnt, LKGMcnt and/or EXPcnt are encoded together into a
   // single instruction while VScnt has its own instruction.
   if (Wait.hasWaitExceptStoreCnt()) {
-    unsigned Enc = AMDGPU::encodeWaitcnt(IV, Wait);
-    [[maybe_unused]] auto SWaitInst =
+    // If profiling expansion is enabled and we have score brackets,
+    // emit an expanded sequence
+    if (ExpandWaitcntProfiling && ScoreBrackets) {
+      // Check if any of the counters to be waited on are out-of-order.
+      // If so, fall back to normal (non-expanded) behavior since expansion
+      // would provide misleading profiling information.
+      bool AnyOutOfOrder = false;
+      for (auto CT : {LOAD_CNT, DS_CNT, EXP_CNT}) {
+        unsigned &WaitCnt = getCounterRef(Wait, CT);
+        if (WaitCnt != ~0u && ScoreBrackets->counterOutOfOrder(CT)) {
+          AnyOutOfOrder = true;
+          break;
+        }
+      }
+
+      if (AnyOutOfOrder) {
+        // Fall back to non-expanded wait
+        unsigned Enc = AMDGPU::encodeWaitcnt(IV, Wait);
         BuildMI(Block, It, DL, TII->get(AMDGPU::S_WAITCNT)).addImm(Enc);
-    Modified = true;
+        Modified = true;
+      } else {
+        // All counters are in-order, safe to expand
+        for (auto CT : {LOAD_CNT, DS_CNT, EXP_CNT}) {
+          unsigned &WaitCnt = getCounterRef(Wait, CT);
+          if (WaitCnt == ~0u)
+            continue;
 
-    LLVM_DEBUG(dbgs() << "PreGFX12::createNewWaitcnt\n";
-               if (It != Block.instr_end()) dbgs() << "Old Instr: " << *It;
-               dbgs() << "New Instr: " << *SWaitInst << '\n');
+          unsigned Outstanding = std::min(ScoreBrackets->getScoreUB(CT) -
+                                              ScoreBrackets->getScoreLB(CT),
+                                          getWaitCountMax(getLimits(), CT) - 
1);
+          EmitExpandedWaitcnt(Outstanding, WaitCnt, [&](unsigned Count) {
+            AMDGPU::Waitcnt W;
+            getCounterRef(W, CT) = Count;
+            BuildMI(Block, It, DL, TII->get(AMDGPU::S_WAITCNT))
+                .addImm(AMDGPU::encodeWaitcnt(IV, W));
+          });
+        }
+      }
+    } else {
+      // Normal behavior: emit single combined waitcnt
+      unsigned Enc = AMDGPU::encodeWaitcnt(IV, Wait);
+      [[maybe_unused]] auto SWaitInst =
+          BuildMI(Block, It, DL, TII->get(AMDGPU::S_WAITCNT)).addImm(Enc);
+      Modified = true;
+
+      LLVM_DEBUG(dbgs() << "PreGFX12::createNewWaitcnt\n";
+                 if (It != Block.instr_end()) dbgs() << "Old Instr: " << *It;
+                 dbgs() << "New Instr: " << *SWaitInst << '\n');
+    }
   }
 
   if (Wait.hasWaitStoreCnt()) {
     assert(ST->hasVscnt());
 
-    [[maybe_unused]] auto SWaitInst =
+    if (ExpandWaitcntProfiling && ScoreBrackets && Wait.StoreCnt != ~0u &&
+        !ScoreBrackets->counterOutOfOrder(STORE_CNT)) {
+      // Only expand if counter is not out-of-order
+      unsigned Outstanding =
+          std::min(ScoreBrackets->getScoreUB(STORE_CNT) -
+                       ScoreBrackets->getScoreLB(STORE_CNT),
+                   getWaitCountMax(getLimits(), STORE_CNT) - 1);
+      EmitExpandedWaitcnt(Outstanding, Wait.StoreCnt, [&](unsigned Count) {
         BuildMI(Block, It, DL, TII->get(AMDGPU::S_WAITCNT_VSCNT))
             .addReg(AMDGPU::SGPR_NULL, RegState::Undef)
-            .addImm(Wait.StoreCnt);
-    Modified = true;
+            .addImm(Count);
+      });
+    } else {
+      [[maybe_unused]] auto SWaitInst =
+          BuildMI(Block, It, DL, TII->get(AMDGPU::S_WAITCNT_VSCNT))
+              .addReg(AMDGPU::SGPR_NULL, RegState::Undef)
+              .addImm(Wait.StoreCnt);
+      Modified = true;
 
-    LLVM_DEBUG(dbgs() << "PreGFX12::createNewWaitcnt\n";
-               if (It != Block.instr_end()) dbgs() << "Old Instr: " << *It;
-               dbgs() << "New Instr: " << *SWaitInst << '\n');
+      LLVM_DEBUG(dbgs() << "PreGFX12::createNewWaitcnt\n";
+                 if (It != Block.instr_end()) dbgs() << "Old Instr: " << *It;
+                 dbgs() << "New Instr: " << *SWaitInst << '\n');
+    }
   }
 
   return Modified;
@@ -2013,13 +2092,55 @@ bool WaitcntGeneratorGFX12Plus::applyPreexistingWaitcnt(
 /// Generate S_WAIT_*CNT instructions for any required counters in \p Wait
 bool WaitcntGeneratorGFX12Plus::createNewWaitcnt(
     MachineBasicBlock &Block, MachineBasicBlock::instr_iterator It,
-    AMDGPU::Waitcnt Wait) {
+    AMDGPU::Waitcnt Wait, WaitcntBrackets *ScoreBrackets) {
   assert(ST);
   assert(!isNormalMode(MaxCounter));
 
   bool Modified = false;
   const DebugLoc &DL = Block.findDebugLoc(It);
 
+  // Helper to emit expanded waitcnt sequence for profiling.
+  auto EmitExpandedWaitcnt = [&](unsigned Outstanding, unsigned Target,
+                                 auto EmitWaitcnt) {
+    if (Outstanding > Target) {
+      for (unsigned i = Outstanding - 1; i >= Target && i != ~0u; --i) {
+        EmitWaitcnt(i);
+        Modified = true;
+      }
+    } else {
+      EmitWaitcnt(Target);
+      Modified = true;
+    }
+  };
+
+  // For GFX12+, we use separate wait instructions, which makes expansion
+  // simpler
+  if (ExpandWaitcntProfiling && ScoreBrackets) {
+    for (auto CT : inst_counter_types(NUM_EXTENDED_INST_CNTS)) {
+      unsigned Count = getWait(Wait, CT);
+      if (Count == ~0u)
+        continue;
+
+      // Skip expansion for out-of-order counters - emit normal wait instead
+      if (ScoreBrackets->counterOutOfOrder(CT)) {
+        BuildMI(Block, It, DL, TII->get(instrsForExtendedCounterTypes[CT]))
+            .addImm(Count);
+        Modified = true;
+        continue;
+      }
+
+      unsigned Outstanding = std::min(ScoreBrackets->getScoreUB(CT) -
+                                          ScoreBrackets->getScoreLB(CT),
+                                      getWaitCountMax(getLimits(), CT) - 1);
+      EmitExpandedWaitcnt(Outstanding, Count, [&](unsigned Val) {
+        BuildMI(Block, It, DL, TII->get(instrsForExtendedCounterTypes[CT]))
+            .addImm(Val);
+      });
+    }
+    return Modified;
+  }
+
+  // Normal behavior (no expansion)
   // Check for opportunities to use combined wait instructions.
   if (Wait.DsCnt != ~0u) {
     MachineInstr *SWaitInst = nullptr;
@@ -2415,9 +2536,7 @@ bool SIInsertWaitcnts::generateWaitcnt(AMDGPU::Waitcnt 
Wait,
     Modified =
         WCG->applyPreexistingWaitcnt(ScoreBrackets, *OldWaitcntInstr, Wait, 
It);
 
-  // Any counts that could have been applied to any existing waitcnt
-  // instructions will have been done so, now deal with any remaining.
-  ScoreBrackets.applyWaitcnt(Wait);
+  AMDGPU::Waitcnt WaitForScore = Wait;
 
   // ExpCnt can be merged into VINTERP.
   if (Wait.ExpCnt != ~0u && It != Block.instr_end() &&
@@ -2434,9 +2553,13 @@ bool SIInsertWaitcnts::generateWaitcnt(AMDGPU::Waitcnt 
Wait,
                       << "Update Instr: " << *It);
   }
 
-  if (WCG->createNewWaitcnt(Block, It, Wait))
+  if (WCG->createNewWaitcnt(Block, It, Wait, &ScoreBrackets))
     Modified = true;
 
+  // Any counts that could have been applied to any existing waitcnt
+  // instructions will have been done so, now deal with any remaining.
+  ScoreBrackets.applyWaitcnt(WaitForScore);
+
   return Modified;
 }
 
@@ -3052,6 +3175,9 @@ bool SIInsertWaitcnts::run(MachineFunction &MF) {
 
   AMDGPU::IsaVersion IV = AMDGPU::getIsaVersion(ST->getCPU());
 
+  // Initialize hardware limits first, as they're needed by the generators.
+  Limits = AMDGPU::HardwareLimits(IV, ST->hasExtendedWaitCounts());
+
   if (ST->hasExtendedWaitCounts()) {
     IsExpertMode = ST->hasExpertSchedulingMode() &&
                    (ExpertSchedulingModeFlag.getNumOccurrences()
@@ -3060,11 +3186,12 @@ bool SIInsertWaitcnts::run(MachineFunction &MF) {
                               .getFnAttribute("amdgpu-expert-scheduling-mode")
                               .getValueAsBool());
     MaxCounter = IsExpertMode ? NUM_EXPERT_INST_CNTS : NUM_EXTENDED_INST_CNTS;
-    WCGGFX12Plus = WaitcntGeneratorGFX12Plus(MF, MaxCounter, IsExpertMode);
+    WCGGFX12Plus =
+        WaitcntGeneratorGFX12Plus(MF, MaxCounter, &Limits, IsExpertMode);
     WCG = &WCGGFX12Plus;
   } else {
     MaxCounter = NUM_NORMAL_INST_CNTS;
-    WCGPreGFX12 = WaitcntGeneratorPreGFX12(MF, MaxCounter);
+    WCGPreGFX12 = WaitcntGeneratorPreGFX12(MF, &Limits);
     WCG = &WCGPreGFX12;
   }
 
@@ -3075,22 +3202,6 @@ bool SIInsertWaitcnts::run(MachineFunction &MF) {
 
   SmemAccessCounter = eventCounter(WaitEventMaskForInst, SMEM_ACCESS);
 
-  if (ST->hasExtendedWaitCounts()) {
-    Limits.LoadcntMax = AMDGPU::getLoadcntBitMask(IV);
-    Limits.DscntMax = AMDGPU::getDscntBitMask(IV);
-  } else {
-    Limits.LoadcntMax = AMDGPU::getVmcntBitMask(IV);
-    Limits.DscntMax = AMDGPU::getLgkmcntBitMask(IV);
-  }
-  Limits.ExpcntMax = AMDGPU::getExpcntBitMask(IV);
-  Limits.StorecntMax = AMDGPU::getStorecntBitMask(IV);
-  Limits.SamplecntMax = AMDGPU::getSamplecntBitMask(IV);
-  Limits.BvhcntMax = AMDGPU::getBvhcntBitMask(IV);
-  Limits.KmcntMax = AMDGPU::getKmcntBitMask(IV);
-  Limits.XcntMax = AMDGPU::getXcntBitMask(IV);
-  Limits.VaVdstMax = AMDGPU::DepCtr::getVaVdstBitMask();
-  Limits.VmVsrcMax = AMDGPU::DepCtr::getVmVsrcBitMask();
-
   BlockInfos.clear();
   bool Modified = false;
 

diff  --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp 
b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
index 0646c63083195..4ad3a5cd1d727 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
@@ -1785,6 +1785,25 @@ unsigned getStorecntBitMask(const IsaVersion &Version) {
   return (1 << getStorecntBitWidth(Version.Major)) - 1;
 }
 
+HardwareLimits::HardwareLimits(const IsaVersion &IV,
+                               bool HasExtendedWaitCounts) {
+  if (HasExtendedWaitCounts) {
+    LoadcntMax = getLoadcntBitMask(IV);
+    DscntMax = getDscntBitMask(IV);
+  } else {
+    LoadcntMax = getVmcntBitMask(IV);
+    DscntMax = getLgkmcntBitMask(IV);
+  }
+  ExpcntMax = getExpcntBitMask(IV);
+  StorecntMax = getStorecntBitMask(IV);
+  SamplecntMax = getSamplecntBitMask(IV);
+  BvhcntMax = getBvhcntBitMask(IV);
+  KmcntMax = getKmcntBitMask(IV);
+  XcntMax = getXcntBitMask(IV);
+  VaVdstMax = DepCtr::getVaVdstBitMask();
+  VmVsrcMax = DepCtr::getVmVsrcBitMask();
+}
+
 unsigned getWaitcntBitMask(const IsaVersion &Version) {
   unsigned VmcntLo = getBitMask(getVmcntBitShiftLo(Version.Major),
                                 getVmcntBitWidthLo(Version.Major));

diff  --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h 
b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
index f6b95602644ca..770f9a86dc883 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
@@ -1131,6 +1131,26 @@ struct Waitcnt {
   friend raw_ostream &operator<<(raw_ostream &OS, const AMDGPU::Waitcnt &Wait);
 };
 
+/// Represents the hardware counter limits for 
diff erent wait count types.
+struct HardwareLimits {
+  unsigned LoadcntMax; // Corresponds to Vmcnt prior to gfx12.
+  unsigned ExpcntMax;
+  unsigned DscntMax;     // Corresponds to LGKMcnt prior to gfx12.
+  unsigned StorecntMax;  // Corresponds to VScnt in gfx10/gfx11.
+  unsigned SamplecntMax; // gfx12+ only.
+  unsigned BvhcntMax;    // gfx12+ only.
+  unsigned KmcntMax;     // gfx12+ only.
+  unsigned XcntMax;      // gfx1250.
+  unsigned VaVdstMax;    // gfx12+ expert mode only.
+  unsigned VmVsrcMax;    // gfx12+ expert mode only.
+
+  HardwareLimits() = default;
+
+  /// Initializes hardware limits from ISA version.
+  /// \p HasExtendedWaitCounts should be true for gfx12+.
+  HardwareLimits(const IsaVersion &IV, bool HasExtendedWaitCounts);
+};
+
 // The following methods are only meaningful on targets that support
 // S_WAITCNT.
 

diff  --git a/llvm/test/CodeGen/AMDGPU/expand-waitcnt-profiling.ll 
b/llvm/test/CodeGen/AMDGPU/expand-waitcnt-profiling.ll
new file mode 100644
index 0000000000000..848a9d07084ed
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/expand-waitcnt-profiling.ll
@@ -0,0 +1,944 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
+; RUN: sed 's/ATTRS/\"amdgpu-expand-waitcnt-profiling\"/g' %s | llc -mtriple=amdgcn -mcpu=gfx900 | FileCheck --check-prefix=GFX9-EXPAND %s
+; RUN: sed 's/ATTRS//g' %s | llc -mtriple=amdgcn -mcpu=gfx900 | FileCheck --check-prefix=GFX9-NOEXPAND %s
+; RUN: sed 's/ATTRS/\"amdgpu-expand-waitcnt-profiling\"/g' %s | llc -mtriple=amdgcn -mcpu=gfx1010 | FileCheck --check-prefix=GFX10-EXPAND %s
+; RUN: sed 's/ATTRS//g' %s | llc -mtriple=amdgcn -mcpu=gfx1010 | FileCheck --check-prefix=GFX10-NOEXPAND %s
+; RUN: sed 's/ATTRS/\"amdgpu-expand-waitcnt-profiling\"/g' %s | llc -mtriple=amdgcn -mcpu=gfx1100 | FileCheck --check-prefix=GFX11-EXPAND %s
+; RUN: sed 's/ATTRS//g' %s | llc -mtriple=amdgcn -mcpu=gfx1100 | FileCheck --check-prefix=GFX11-NOEXPAND %s
+; RUN: sed 's/ATTRS/\"amdgpu-expand-waitcnt-profiling\"/g' %s | llc -mtriple=amdgcn -mcpu=gfx1200 | FileCheck --check-prefix=GFX12-EXPAND %s
+; RUN: sed 's/ATTRS//g' %s | llc -mtriple=amdgcn -mcpu=gfx1200 | FileCheck --check-prefix=GFX12-NOEXPAND %s
+
+; When -amdgpu-expand-waitcnt-profiling is enabled and there are N outstanding
+; operations, instead of emitting a single waitcnt(target), we emit:
+;   waitcnt(N-1), waitcnt(N-2), ..., waitcnt(target)
+;
+; This allows PC-sampling profilers to identify which specific operation
+; is causing a stall by observing where the program counter is stuck.
+
+define amdgpu_kernel void @test_lgkmcnt_scalar_loads(ptr addrspace(4) %ptr_a, ptr addrspace(4) %ptr_b, ptr addrspace(4) %ptr_c, ptr addrspace(1) %out) #0 {
+; GFX9-EXPAND-LABEL: test_lgkmcnt_scalar_loads:
+; GFX9-EXPAND:       ; %bb.0:
+; GFX9-EXPAND-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x24
+; GFX9-EXPAND-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9-EXPAND-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-EXPAND-NEXT:    s_load_dword s0, s[8:9], 0x0
+; GFX9-EXPAND-NEXT:    s_load_dword s1, s[10:11], 0x0
+; GFX9-EXPAND-NEXT:    s_load_dword s2, s[12:13], 0x0
+; GFX9-EXPAND-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-EXPAND-NEXT:    s_add_i32 s0, s0, s1
+; GFX9-EXPAND-NEXT:    s_add_i32 s0, s0, s2
+; GFX9-EXPAND-NEXT:    v_mov_b32_e32 v1, s0
+; GFX9-EXPAND-NEXT:    global_store_dword v0, v1, s[14:15]
+; GFX9-EXPAND-NEXT:    s_endpgm
+;
+; GFX9-NOEXPAND-LABEL: test_lgkmcnt_scalar_loads:
+; GFX9-NOEXPAND:       ; %bb.0:
+; GFX9-NOEXPAND-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x24
+; GFX9-NOEXPAND-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9-NOEXPAND-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NOEXPAND-NEXT:    s_load_dword s0, s[8:9], 0x0
+; GFX9-NOEXPAND-NEXT:    s_load_dword s1, s[10:11], 0x0
+; GFX9-NOEXPAND-NEXT:    s_load_dword s2, s[12:13], 0x0
+; GFX9-NOEXPAND-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NOEXPAND-NEXT:    s_add_i32 s0, s0, s1
+; GFX9-NOEXPAND-NEXT:    s_add_i32 s0, s0, s2
+; GFX9-NOEXPAND-NEXT:    v_mov_b32_e32 v1, s0
+; GFX9-NOEXPAND-NEXT:    global_store_dword v0, v1, s[14:15]
+; GFX9-NOEXPAND-NEXT:    s_endpgm
+;
+; GFX10-EXPAND-LABEL: test_lgkmcnt_scalar_loads:
+; GFX10-EXPAND:       ; %bb.0:
+; GFX10-EXPAND-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x24
+; GFX10-EXPAND-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-EXPAND-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-EXPAND-NEXT:    s_load_dword s0, s[8:9], 0x0
+; GFX10-EXPAND-NEXT:    s_load_dword s1, s[10:11], 0x0
+; GFX10-EXPAND-NEXT:    s_load_dword s2, s[12:13], 0x0
+; GFX10-EXPAND-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-EXPAND-NEXT:    s_add_i32 s0, s0, s1
+; GFX10-EXPAND-NEXT:    s_add_i32 s0, s0, s2
+; GFX10-EXPAND-NEXT:    v_mov_b32_e32 v1, s0
+; GFX10-EXPAND-NEXT:    global_store_dword v0, v1, s[14:15]
+; GFX10-EXPAND-NEXT:    s_endpgm
+;
+; GFX10-NOEXPAND-LABEL: test_lgkmcnt_scalar_loads:
+; GFX10-NOEXPAND:       ; %bb.0:
+; GFX10-NOEXPAND-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x24
+; GFX10-NOEXPAND-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-NOEXPAND-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-NOEXPAND-NEXT:    s_load_dword s0, s[8:9], 0x0
+; GFX10-NOEXPAND-NEXT:    s_load_dword s1, s[10:11], 0x0
+; GFX10-NOEXPAND-NEXT:    s_load_dword s2, s[12:13], 0x0
+; GFX10-NOEXPAND-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-NOEXPAND-NEXT:    s_add_i32 s0, s0, s1
+; GFX10-NOEXPAND-NEXT:    s_add_i32 s0, s0, s2
+; GFX10-NOEXPAND-NEXT:    v_mov_b32_e32 v1, s0
+; GFX10-NOEXPAND-NEXT:    global_store_dword v0, v1, s[14:15]
+; GFX10-NOEXPAND-NEXT:    s_endpgm
+;
+; GFX11-EXPAND-LABEL: test_lgkmcnt_scalar_loads:
+; GFX11-EXPAND:       ; %bb.0:
+; GFX11-EXPAND-NEXT:    s_load_b256 s[0:7], s[4:5], 0x24
+; GFX11-EXPAND-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-EXPAND-NEXT:    s_load_b32 s0, s[0:1], 0x0
+; GFX11-EXPAND-NEXT:    s_load_b32 s1, s[2:3], 0x0
+; GFX11-EXPAND-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX11-EXPAND-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-EXPAND-NEXT:    s_add_i32 s0, s0, s1
+; GFX11-EXPAND-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-EXPAND-NEXT:    s_add_i32 s0, s0, s2
+; GFX11-EXPAND-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; GFX11-EXPAND-NEXT:    global_store_b32 v0, v1, s[6:7]
+; GFX11-EXPAND-NEXT:    s_endpgm
+;
+; GFX11-NOEXPAND-LABEL: test_lgkmcnt_scalar_loads:
+; GFX11-NOEXPAND:       ; %bb.0:
+; GFX11-NOEXPAND-NEXT:    s_load_b256 s[0:7], s[4:5], 0x24
+; GFX11-NOEXPAND-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-NOEXPAND-NEXT:    s_load_b32 s0, s[0:1], 0x0
+; GFX11-NOEXPAND-NEXT:    s_load_b32 s1, s[2:3], 0x0
+; GFX11-NOEXPAND-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX11-NOEXPAND-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-NOEXPAND-NEXT:    s_add_i32 s0, s0, s1
+; GFX11-NOEXPAND-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-NOEXPAND-NEXT:    s_add_i32 s0, s0, s2
+; GFX11-NOEXPAND-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; GFX11-NOEXPAND-NEXT:    global_store_b32 v0, v1, s[6:7]
+; GFX11-NOEXPAND-NEXT:    s_endpgm
+;
+; GFX12-EXPAND-LABEL: test_lgkmcnt_scalar_loads:
+; GFX12-EXPAND:       ; %bb.0:
+; GFX12-EXPAND-NEXT:    s_load_b256 s[0:7], s[4:5], 0x24
+; GFX12-EXPAND-NEXT:    s_wait_kmcnt 0x0
+; GFX12-EXPAND-NEXT:    s_load_b32 s0, s[0:1], 0x0
+; GFX12-EXPAND-NEXT:    s_load_b32 s1, s[2:3], 0x0
+; GFX12-EXPAND-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX12-EXPAND-NEXT:    s_wait_kmcnt 0x0
+; GFX12-EXPAND-NEXT:    s_add_co_i32 s0, s0, s1
+; GFX12-EXPAND-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX12-EXPAND-NEXT:    s_add_co_i32 s0, s0, s2
+; GFX12-EXPAND-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; GFX12-EXPAND-NEXT:    global_store_b32 v0, v1, s[6:7]
+; GFX12-EXPAND-NEXT:    s_endpgm
+;
+; GFX12-NOEXPAND-LABEL: test_lgkmcnt_scalar_loads:
+; GFX12-NOEXPAND:       ; %bb.0:
+; GFX12-NOEXPAND-NEXT:    s_load_b256 s[0:7], s[4:5], 0x24
+; GFX12-NOEXPAND-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NOEXPAND-NEXT:    s_load_b32 s0, s[0:1], 0x0
+; GFX12-NOEXPAND-NEXT:    s_load_b32 s1, s[2:3], 0x0
+; GFX12-NOEXPAND-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX12-NOEXPAND-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NOEXPAND-NEXT:    s_add_co_i32 s0, s0, s1
+; GFX12-NOEXPAND-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX12-NOEXPAND-NEXT:    s_add_co_i32 s0, s0, s2
+; GFX12-NOEXPAND-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; GFX12-NOEXPAND-NEXT:    global_store_b32 v0, v1, s[6:7]
+; GFX12-NOEXPAND-NEXT:    s_endpgm
+
+  %val_a = load i32, ptr addrspace(4) %ptr_a, align 4
+  %val_b = load i32, ptr addrspace(4) %ptr_b, align 4
+  %val_c = load i32, ptr addrspace(4) %ptr_c, align 4
+  %sum1 = add i32 %val_a, %val_b
+  %sum2 = add i32 %sum1, %val_c
+  store i32 %sum2, ptr addrspace(1) %out, align 4
+  ret void
+}
+
+define amdgpu_kernel void @test_vmcnt_global_loads(ptr addrspace(1) %buf, ptr addrspace(1) %out) #0 {
+; GFX9-EXPAND-LABEL: test_vmcnt_global_loads:
+; GFX9-EXPAND:       ; %bb.0:
+; GFX9-EXPAND-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX9-EXPAND-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-EXPAND-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-EXPAND-NEXT:    global_load_dword v1, v0, s[0:1]
+; GFX9-EXPAND-NEXT:    global_load_dword v2, v0, s[0:1] offset:256
+; GFX9-EXPAND-NEXT:    global_load_dword v3, v0, s[0:1] offset:512
+; GFX9-EXPAND-NEXT:    s_waitcnt vmcnt(2)
+; GFX9-EXPAND-NEXT:    s_waitcnt vmcnt(1)
+; GFX9-EXPAND-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-EXPAND-NEXT:    v_add3_u32 v1, v1, v2, v3
+; GFX9-EXPAND-NEXT:    global_store_dword v0, v1, s[2:3]
+; GFX9-EXPAND-NEXT:    s_endpgm
+;
+; GFX9-NOEXPAND-LABEL: test_vmcnt_global_loads:
+; GFX9-NOEXPAND:       ; %bb.0:
+; GFX9-NOEXPAND-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX9-NOEXPAND-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-NOEXPAND-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NOEXPAND-NEXT:    global_load_dword v1, v0, s[0:1]
+; GFX9-NOEXPAND-NEXT:    global_load_dword v2, v0, s[0:1] offset:256
+; GFX9-NOEXPAND-NEXT:    global_load_dword v3, v0, s[0:1] offset:512
+; GFX9-NOEXPAND-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NOEXPAND-NEXT:    v_add3_u32 v1, v1, v2, v3
+; GFX9-NOEXPAND-NEXT:    global_store_dword v0, v1, s[2:3]
+; GFX9-NOEXPAND-NEXT:    s_endpgm
+;
+; GFX10-EXPAND-LABEL: test_vmcnt_global_loads:
+; GFX10-EXPAND:       ; %bb.0:
+; GFX10-EXPAND-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX10-EXPAND-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX10-EXPAND-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-EXPAND-NEXT:    s_clause 0x2
+; GFX10-EXPAND-NEXT:    global_load_dword v1, v0, s[0:1]
+; GFX10-EXPAND-NEXT:    global_load_dword v2, v0, s[0:1] offset:256
+; GFX10-EXPAND-NEXT:    global_load_dword v3, v0, s[0:1] offset:512
+; GFX10-EXPAND-NEXT:    s_waitcnt vmcnt(2)
+; GFX10-EXPAND-NEXT:    s_waitcnt vmcnt(1)
+; GFX10-EXPAND-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-EXPAND-NEXT:    v_add3_u32 v1, v1, v2, v3
+; GFX10-EXPAND-NEXT:    global_store_dword v0, v1, s[2:3]
+; GFX10-EXPAND-NEXT:    s_endpgm
+;
+; GFX10-NOEXPAND-LABEL: test_vmcnt_global_loads:
+; GFX10-NOEXPAND:       ; %bb.0:
+; GFX10-NOEXPAND-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX10-NOEXPAND-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX10-NOEXPAND-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-NOEXPAND-NEXT:    s_clause 0x2
+; GFX10-NOEXPAND-NEXT:    global_load_dword v1, v0, s[0:1]
+; GFX10-NOEXPAND-NEXT:    global_load_dword v2, v0, s[0:1] offset:256
+; GFX10-NOEXPAND-NEXT:    global_load_dword v3, v0, s[0:1] offset:512
+; GFX10-NOEXPAND-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NOEXPAND-NEXT:    v_add3_u32 v1, v1, v2, v3
+; GFX10-NOEXPAND-NEXT:    global_store_dword v0, v1, s[2:3]
+; GFX10-NOEXPAND-NEXT:    s_endpgm
+;
+; GFX11-EXPAND-LABEL: test_vmcnt_global_loads:
+; GFX11-EXPAND:       ; %bb.0:
+; GFX11-EXPAND-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-EXPAND-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-EXPAND-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-EXPAND-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX11-EXPAND-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-EXPAND-NEXT:    s_clause 0x2
+; GFX11-EXPAND-NEXT:    global_load_b32 v1, v0, s[0:1]
+; GFX11-EXPAND-NEXT:    global_load_b32 v2, v0, s[0:1] offset:256
+; GFX11-EXPAND-NEXT:    global_load_b32 v3, v0, s[0:1] offset:512
+; GFX11-EXPAND-NEXT:    s_waitcnt vmcnt(2)
+; GFX11-EXPAND-NEXT:    s_waitcnt vmcnt(1)
+; GFX11-EXPAND-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-EXPAND-NEXT:    v_add3_u32 v1, v1, v2, v3
+; GFX11-EXPAND-NEXT:    global_store_b32 v0, v1, s[2:3]
+; GFX11-EXPAND-NEXT:    s_endpgm
+;
+; GFX11-NOEXPAND-LABEL: test_vmcnt_global_loads:
+; GFX11-NOEXPAND:       ; %bb.0:
+; GFX11-NOEXPAND-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-NOEXPAND-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-NOEXPAND-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NOEXPAND-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX11-NOEXPAND-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-NOEXPAND-NEXT:    s_clause 0x2
+; GFX11-NOEXPAND-NEXT:    global_load_b32 v1, v0, s[0:1]
+; GFX11-NOEXPAND-NEXT:    global_load_b32 v2, v0, s[0:1] offset:256
+; GFX11-NOEXPAND-NEXT:    global_load_b32 v3, v0, s[0:1] offset:512
+; GFX11-NOEXPAND-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NOEXPAND-NEXT:    v_add3_u32 v1, v1, v2, v3
+; GFX11-NOEXPAND-NEXT:    global_store_b32 v0, v1, s[2:3]
+; GFX11-NOEXPAND-NEXT:    s_endpgm
+;
+; GFX12-EXPAND-LABEL: test_vmcnt_global_loads:
+; GFX12-EXPAND:       ; %bb.0:
+; GFX12-EXPAND-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX12-EXPAND-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX12-EXPAND-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-EXPAND-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX12-EXPAND-NEXT:    s_wait_kmcnt 0x0
+; GFX12-EXPAND-NEXT:    s_clause 0x2
+; GFX12-EXPAND-NEXT:    global_load_b32 v1, v0, s[0:1]
+; GFX12-EXPAND-NEXT:    global_load_b32 v2, v0, s[0:1] offset:256
+; GFX12-EXPAND-NEXT:    global_load_b32 v3, v0, s[0:1] offset:512
+; GFX12-EXPAND-NEXT:    s_wait_loadcnt 0x2
+; GFX12-EXPAND-NEXT:    s_wait_loadcnt 0x1
+; GFX12-EXPAND-NEXT:    s_wait_loadcnt 0x0
+; GFX12-EXPAND-NEXT:    v_add3_u32 v1, v1, v2, v3
+; GFX12-EXPAND-NEXT:    global_store_b32 v0, v1, s[2:3]
+; GFX12-EXPAND-NEXT:    s_endpgm
+;
+; GFX12-NOEXPAND-LABEL: test_vmcnt_global_loads:
+; GFX12-NOEXPAND:       ; %bb.0:
+; GFX12-NOEXPAND-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX12-NOEXPAND-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX12-NOEXPAND-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NOEXPAND-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX12-NOEXPAND-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NOEXPAND-NEXT:    s_clause 0x2
+; GFX12-NOEXPAND-NEXT:    global_load_b32 v1, v0, s[0:1]
+; GFX12-NOEXPAND-NEXT:    global_load_b32 v2, v0, s[0:1] offset:256
+; GFX12-NOEXPAND-NEXT:    global_load_b32 v3, v0, s[0:1] offset:512
+; GFX12-NOEXPAND-NEXT:    s_wait_loadcnt 0x0
+; GFX12-NOEXPAND-NEXT:    v_add3_u32 v1, v1, v2, v3
+; GFX12-NOEXPAND-NEXT:    global_store_b32 v0, v1, s[2:3]
+; GFX12-NOEXPAND-NEXT:    s_endpgm
+
+  ; Use thread ID to create thread-varying addresses -> forces vector loads
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %tid64 = zext i32 %tid to i64
+
+  ; Three separate global loads with thread-varying addresses
+  ; Non-volatile loads allow multiple operations to be in-flight
+  %ptr0 = getelementptr i32, ptr addrspace(1) %buf, i64 %tid64
+  %val0 = load i32, ptr addrspace(1) %ptr0, align 4
+
+  %offset1 = add i64 %tid64, 64
+  %ptr1 = getelementptr i32, ptr addrspace(1) %buf, i64 %offset1
+  %val1 = load i32, ptr addrspace(1) %ptr1, align 4
+
+  %offset2 = add i64 %tid64, 128
+  %ptr2 = getelementptr i32, ptr addrspace(1) %buf, i64 %offset2
+  %val2 = load i32, ptr addrspace(1) %ptr2, align 4
+
+  %sum1 = add i32 %val0, %val1
+  %sum2 = add i32 %sum1, %val2
+
+  %out_ptr = getelementptr i32, ptr addrspace(1) %out, i64 %tid64
+  store i32 %sum2, ptr addrspace(1) %out_ptr, align 4
+  ret void
+}
+
+declare i32 @llvm.amdgcn.workitem.id.x()
+
+define amdgpu_kernel void @test_lgkmcnt_lds_operations(ptr addrspace(3) %lds_ptr, ptr addrspace(1) %out) #0 {
+; GFX9-EXPAND-LABEL: test_lgkmcnt_lds_operations:
+; GFX9-EXPAND:       ; %bb.0:
+; GFX9-EXPAND-NEXT:    s_load_dword s2, s[4:5], 0x24
+; GFX9-EXPAND-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x2c
+; GFX9-EXPAND-NEXT:    v_mov_b32_e32 v3, 0
+; GFX9-EXPAND-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-EXPAND-NEXT:    v_mov_b32_e32 v2, s2
+; GFX9-EXPAND-NEXT:    ds_read2_b32 v[0:1], v2 offset1:1
+; GFX9-EXPAND-NEXT:    ds_read_b32 v2, v2 offset:8
+; GFX9-EXPAND-NEXT:    s_waitcnt lgkmcnt(1)
+; GFX9-EXPAND-NEXT:    v_add_u32_e32 v0, v0, v1
+; GFX9-EXPAND-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-EXPAND-NEXT:    v_add_u32_e32 v0, v0, v2
+; GFX9-EXPAND-NEXT:    global_store_dword v3, v0, s[0:1]
+; GFX9-EXPAND-NEXT:    s_endpgm
+;
+; GFX9-NOEXPAND-LABEL: test_lgkmcnt_lds_operations:
+; GFX9-NOEXPAND:       ; %bb.0:
+; GFX9-NOEXPAND-NEXT:    s_load_dword s2, s[4:5], 0x24
+; GFX9-NOEXPAND-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x2c
+; GFX9-NOEXPAND-NEXT:    v_mov_b32_e32 v3, 0
+; GFX9-NOEXPAND-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NOEXPAND-NEXT:    v_mov_b32_e32 v2, s2
+; GFX9-NOEXPAND-NEXT:    ds_read2_b32 v[0:1], v2 offset1:1
+; GFX9-NOEXPAND-NEXT:    ds_read_b32 v2, v2 offset:8
+; GFX9-NOEXPAND-NEXT:    s_waitcnt lgkmcnt(1)
+; GFX9-NOEXPAND-NEXT:    v_add_u32_e32 v0, v0, v1
+; GFX9-NOEXPAND-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NOEXPAND-NEXT:    v_add_u32_e32 v0, v0, v2
+; GFX9-NOEXPAND-NEXT:    global_store_dword v3, v0, s[0:1]
+; GFX9-NOEXPAND-NEXT:    s_endpgm
+;
+; GFX10-EXPAND-LABEL: test_lgkmcnt_lds_operations:
+; GFX10-EXPAND:       ; %bb.0:
+; GFX10-EXPAND-NEXT:    s_clause 0x1
+; GFX10-EXPAND-NEXT:    s_load_dword s2, s[4:5], 0x24
+; GFX10-EXPAND-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x2c
+; GFX10-EXPAND-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-EXPAND-NEXT:    v_mov_b32_e32 v2, s2
+; GFX10-EXPAND-NEXT:    ds_read2_b32 v[0:1], v2 offset1:1
+; GFX10-EXPAND-NEXT:    ds_read_b32 v2, v2 offset:8
+; GFX10-EXPAND-NEXT:    s_waitcnt lgkmcnt(1)
+; GFX10-EXPAND-NEXT:    v_add_nc_u32_e32 v0, v0, v1
+; GFX10-EXPAND-NEXT:    v_mov_b32_e32 v1, 0
+; GFX10-EXPAND-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-EXPAND-NEXT:    v_add_nc_u32_e32 v0, v0, v2
+; GFX10-EXPAND-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX10-EXPAND-NEXT:    s_endpgm
+;
+; GFX10-NOEXPAND-LABEL: test_lgkmcnt_lds_operations:
+; GFX10-NOEXPAND:       ; %bb.0:
+; GFX10-NOEXPAND-NEXT:    s_clause 0x1
+; GFX10-NOEXPAND-NEXT:    s_load_dword s2, s[4:5], 0x24
+; GFX10-NOEXPAND-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x2c
+; GFX10-NOEXPAND-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-NOEXPAND-NEXT:    v_mov_b32_e32 v2, s2
+; GFX10-NOEXPAND-NEXT:    ds_read2_b32 v[0:1], v2 offset1:1
+; GFX10-NOEXPAND-NEXT:    ds_read_b32 v2, v2 offset:8
+; GFX10-NOEXPAND-NEXT:    s_waitcnt lgkmcnt(1)
+; GFX10-NOEXPAND-NEXT:    v_add_nc_u32_e32 v0, v0, v1
+; GFX10-NOEXPAND-NEXT:    v_mov_b32_e32 v1, 0
+; GFX10-NOEXPAND-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-NOEXPAND-NEXT:    v_add_nc_u32_e32 v0, v0, v2
+; GFX10-NOEXPAND-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX10-NOEXPAND-NEXT:    s_endpgm
+;
+; GFX11-EXPAND-LABEL: test_lgkmcnt_lds_operations:
+; GFX11-EXPAND:       ; %bb.0:
+; GFX11-EXPAND-NEXT:    s_clause 0x1
+; GFX11-EXPAND-NEXT:    s_load_b32 s2, s[4:5], 0x24
+; GFX11-EXPAND-NEXT:    s_load_b64 s[0:1], s[4:5], 0x2c
+; GFX11-EXPAND-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-EXPAND-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-EXPAND-NEXT:    ds_load_2addr_b32 v[0:1], v2 offset1:1
+; GFX11-EXPAND-NEXT:    ds_load_b32 v2, v2 offset:8
+; GFX11-EXPAND-NEXT:    s_waitcnt lgkmcnt(1)
+; GFX11-EXPAND-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_add_nc_u32 v0, v0, v1
+; GFX11-EXPAND-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-EXPAND-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-EXPAND-NEXT:    v_add_nc_u32_e32 v0, v0, v2
+; GFX11-EXPAND-NEXT:    global_store_b32 v1, v0, s[0:1]
+; GFX11-EXPAND-NEXT:    s_endpgm
+;
+; GFX11-NOEXPAND-LABEL: test_lgkmcnt_lds_operations:
+; GFX11-NOEXPAND:       ; %bb.0:
+; GFX11-NOEXPAND-NEXT:    s_clause 0x1
+; GFX11-NOEXPAND-NEXT:    s_load_b32 s2, s[4:5], 0x24
+; GFX11-NOEXPAND-NEXT:    s_load_b64 s[0:1], s[4:5], 0x2c
+; GFX11-NOEXPAND-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-NOEXPAND-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-NOEXPAND-NEXT:    ds_load_2addr_b32 v[0:1], v2 offset1:1
+; GFX11-NOEXPAND-NEXT:    ds_load_b32 v2, v2 offset:8
+; GFX11-NOEXPAND-NEXT:    s_waitcnt lgkmcnt(1)
+; GFX11-NOEXPAND-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_add_nc_u32 v0, v0, v1
+; GFX11-NOEXPAND-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-NOEXPAND-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NOEXPAND-NEXT:    v_add_nc_u32_e32 v0, v0, v2
+; GFX11-NOEXPAND-NEXT:    global_store_b32 v1, v0, s[0:1]
+; GFX11-NOEXPAND-NEXT:    s_endpgm
+;
+; GFX12-EXPAND-LABEL: test_lgkmcnt_lds_operations:
+; GFX12-EXPAND:       ; %bb.0:
+; GFX12-EXPAND-NEXT:    s_clause 0x1
+; GFX12-EXPAND-NEXT:    s_load_b32 s2, s[4:5], 0x24
+; GFX12-EXPAND-NEXT:    s_load_b64 s[0:1], s[4:5], 0x2c
+; GFX12-EXPAND-NEXT:    s_wait_kmcnt 0x0
+; GFX12-EXPAND-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-EXPAND-NEXT:    ds_load_2addr_b32 v[0:1], v2 offset1:1
+; GFX12-EXPAND-NEXT:    ds_load_b32 v2, v2 offset:8
+; GFX12-EXPAND-NEXT:    s_wait_dscnt 0x1
+; GFX12-EXPAND-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_add_nc_u32 v0, v0, v1
+; GFX12-EXPAND-NEXT:    s_wait_dscnt 0x0
+; GFX12-EXPAND-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-EXPAND-NEXT:    v_add_nc_u32_e32 v0, v0, v2
+; GFX12-EXPAND-NEXT:    global_store_b32 v1, v0, s[0:1]
+; GFX12-EXPAND-NEXT:    s_endpgm
+;
+; GFX12-NOEXPAND-LABEL: test_lgkmcnt_lds_operations:
+; GFX12-NOEXPAND:       ; %bb.0:
+; GFX12-NOEXPAND-NEXT:    s_clause 0x1
+; GFX12-NOEXPAND-NEXT:    s_load_b32 s2, s[4:5], 0x24
+; GFX12-NOEXPAND-NEXT:    s_load_b64 s[0:1], s[4:5], 0x2c
+; GFX12-NOEXPAND-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NOEXPAND-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-NOEXPAND-NEXT:    ds_load_2addr_b32 v[0:1], v2 offset1:1
+; GFX12-NOEXPAND-NEXT:    ds_load_b32 v2, v2 offset:8
+; GFX12-NOEXPAND-NEXT:    s_wait_dscnt 0x1
+; GFX12-NOEXPAND-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_add_nc_u32 v0, v0, v1
+; GFX12-NOEXPAND-NEXT:    s_wait_dscnt 0x0
+; GFX12-NOEXPAND-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NOEXPAND-NEXT:    v_add_nc_u32_e32 v0, v0, v2
+; GFX12-NOEXPAND-NEXT:    global_store_b32 v1, v0, s[0:1]
+; GFX12-NOEXPAND-NEXT:    s_endpgm
+
+  %ptr0 = getelementptr i32, ptr addrspace(3) %lds_ptr, i32 0
+  %ptr1 = getelementptr i32, ptr addrspace(3) %lds_ptr, i32 1
+  %ptr2 = getelementptr i32, ptr addrspace(3) %lds_ptr, i32 2
+  %val0 = load i32, ptr addrspace(3) %ptr0, align 4
+  %val1 = load i32, ptr addrspace(3) %ptr1, align 4
+  %val2 = load i32, ptr addrspace(3) %ptr2, align 4
+  %sum1 = add i32 %val0, %val1
+  %sum2 = add i32 %sum1, %val2
+  store i32 %sum2, ptr addrspace(1) %out, align 4
+  ret void
+}
+
+define amdgpu_kernel void @test_combined_vmcnt_lgkmcnt(ptr addrspace(4) %scalar_ptr_a, ptr addrspace(4) %scalar_ptr_b, ptr addrspace(1) %out) #0 {
+; GFX9-EXPAND-LABEL: test_combined_vmcnt_lgkmcnt:
+; GFX9-EXPAND:       ; %bb.0:
+; GFX9-EXPAND-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX9-EXPAND-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
+; GFX9-EXPAND-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9-EXPAND-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-EXPAND-NEXT:    s_load_dword s4, s[0:1], 0x0
+; GFX9-EXPAND-NEXT:    s_load_dword s5, s[2:3], 0x0
+; GFX9-EXPAND-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-EXPAND-NEXT:    s_add_i32 s0, s4, s5
+; GFX9-EXPAND-NEXT:    v_mov_b32_e32 v1, s0
+; GFX9-EXPAND-NEXT:    global_store_dword v0, v1, s[6:7]
+; GFX9-EXPAND-NEXT:    s_endpgm
+;
+; GFX9-NOEXPAND-LABEL: test_combined_vmcnt_lgkmcnt:
+; GFX9-NOEXPAND:       ; %bb.0:
+; GFX9-NOEXPAND-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX9-NOEXPAND-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
+; GFX9-NOEXPAND-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9-NOEXPAND-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NOEXPAND-NEXT:    s_load_dword s4, s[0:1], 0x0
+; GFX9-NOEXPAND-NEXT:    s_load_dword s5, s[2:3], 0x0
+; GFX9-NOEXPAND-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NOEXPAND-NEXT:    s_add_i32 s0, s4, s5
+; GFX9-NOEXPAND-NEXT:    v_mov_b32_e32 v1, s0
+; GFX9-NOEXPAND-NEXT:    global_store_dword v0, v1, s[6:7]
+; GFX9-NOEXPAND-NEXT:    s_endpgm
+;
+; GFX10-EXPAND-LABEL: test_combined_vmcnt_lgkmcnt:
+; GFX10-EXPAND:       ; %bb.0:
+; GFX10-EXPAND-NEXT:    s_clause 0x1
+; GFX10-EXPAND-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX10-EXPAND-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
+; GFX10-EXPAND-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-EXPAND-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-EXPAND-NEXT:    s_load_dword s4, s[0:1], 0x0
+; GFX10-EXPAND-NEXT:    s_load_dword s5, s[2:3], 0x0
+; GFX10-EXPAND-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-EXPAND-NEXT:    s_add_i32 s0, s4, s5
+; GFX10-EXPAND-NEXT:    v_mov_b32_e32 v1, s0
+; GFX10-EXPAND-NEXT:    global_store_dword v0, v1, s[6:7]
+; GFX10-EXPAND-NEXT:    s_endpgm
+;
+; GFX10-NOEXPAND-LABEL: test_combined_vmcnt_lgkmcnt:
+; GFX10-NOEXPAND:       ; %bb.0:
+; GFX10-NOEXPAND-NEXT:    s_clause 0x1
+; GFX10-NOEXPAND-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX10-NOEXPAND-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
+; GFX10-NOEXPAND-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-NOEXPAND-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-NOEXPAND-NEXT:    s_load_dword s4, s[0:1], 0x0
+; GFX10-NOEXPAND-NEXT:    s_load_dword s5, s[2:3], 0x0
+; GFX10-NOEXPAND-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-NOEXPAND-NEXT:    s_add_i32 s0, s4, s5
+; GFX10-NOEXPAND-NEXT:    v_mov_b32_e32 v1, s0
+; GFX10-NOEXPAND-NEXT:    global_store_dword v0, v1, s[6:7]
+; GFX10-NOEXPAND-NEXT:    s_endpgm
+;
+; GFX11-EXPAND-LABEL: test_combined_vmcnt_lgkmcnt:
+; GFX11-EXPAND:       ; %bb.0:
+; GFX11-EXPAND-NEXT:    s_clause 0x1
+; GFX11-EXPAND-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-EXPAND-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
+; GFX11-EXPAND-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-EXPAND-NEXT:    s_load_b32 s0, s[0:1], 0x0
+; GFX11-EXPAND-NEXT:    s_load_b32 s1, s[2:3], 0x0
+; GFX11-EXPAND-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-EXPAND-NEXT:    s_add_i32 s0, s0, s1
+; GFX11-EXPAND-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-EXPAND-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; GFX11-EXPAND-NEXT:    global_store_b32 v0, v1, s[4:5]
+; GFX11-EXPAND-NEXT:    s_endpgm
+;
+; GFX11-NOEXPAND-LABEL: test_combined_vmcnt_lgkmcnt:
+; GFX11-NOEXPAND:       ; %bb.0:
+; GFX11-NOEXPAND-NEXT:    s_clause 0x1
+; GFX11-NOEXPAND-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-NOEXPAND-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
+; GFX11-NOEXPAND-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-NOEXPAND-NEXT:    s_load_b32 s0, s[0:1], 0x0
+; GFX11-NOEXPAND-NEXT:    s_load_b32 s1, s[2:3], 0x0
+; GFX11-NOEXPAND-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-NOEXPAND-NEXT:    s_add_i32 s0, s0, s1
+; GFX11-NOEXPAND-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NOEXPAND-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; GFX11-NOEXPAND-NEXT:    global_store_b32 v0, v1, s[4:5]
+; GFX11-NOEXPAND-NEXT:    s_endpgm
+;
+; GFX12-EXPAND-LABEL: test_combined_vmcnt_lgkmcnt:
+; GFX12-EXPAND:       ; %bb.0:
+; GFX12-EXPAND-NEXT:    s_clause 0x1
+; GFX12-EXPAND-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX12-EXPAND-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
+; GFX12-EXPAND-NEXT:    s_wait_kmcnt 0x0
+; GFX12-EXPAND-NEXT:    s_load_b32 s0, s[0:1], 0x0
+; GFX12-EXPAND-NEXT:    s_load_b32 s1, s[2:3], 0x0
+; GFX12-EXPAND-NEXT:    s_wait_kmcnt 0x0
+; GFX12-EXPAND-NEXT:    s_add_co_i32 s0, s0, s1
+; GFX12-EXPAND-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-EXPAND-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; GFX12-EXPAND-NEXT:    global_store_b32 v0, v1, s[4:5]
+; GFX12-EXPAND-NEXT:    s_endpgm
+;
+; GFX12-NOEXPAND-LABEL: test_combined_vmcnt_lgkmcnt:
+; GFX12-NOEXPAND:       ; %bb.0:
+; GFX12-NOEXPAND-NEXT:    s_clause 0x1
+; GFX12-NOEXPAND-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX12-NOEXPAND-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
+; GFX12-NOEXPAND-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NOEXPAND-NEXT:    s_load_b32 s0, s[0:1], 0x0
+; GFX12-NOEXPAND-NEXT:    s_load_b32 s1, s[2:3], 0x0
+; GFX12-NOEXPAND-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NOEXPAND-NEXT:    s_add_co_i32 s0, s0, s1
+; GFX12-NOEXPAND-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NOEXPAND-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; GFX12-NOEXPAND-NEXT:    global_store_b32 v0, v1, s[4:5]
+; GFX12-NOEXPAND-NEXT:    s_endpgm
+
+  %scalar_val1 = load i32, ptr addrspace(4) %scalar_ptr_a, align 4
+  %scalar_val2 = load i32, ptr addrspace(4) %scalar_ptr_b, align 4
+
+  %result = add i32 %scalar_val1, %scalar_val2
+  store i32 %result, ptr addrspace(1) %out, align 4
+  ret void
+}
+
+; Test that expansion is NOT applied when counters are out-of-order (mixed event types).
+; In pre-GFX12, LDS and SMEM operations both use DS_CNT (lgkmcnt), but they can complete
+; out-of-order relative to each other. When both are in-flight, we should NOT expand
+; because the expansion would be misleading.
+define amdgpu_kernel void @test_outoforder_lds_and_smem(ptr addrspace(3) %lds_ptr, ptr addrspace(4) %smem_ptr, ptr addrspace(1) %out) #0 {
+; GFX9-EXPAND-LABEL: test_outoforder_lds_and_smem:
+; GFX9-EXPAND:       ; %bb.0:
+; GFX9-EXPAND-NEXT:    s_load_dword s6, s[4:5], 0x24
+; GFX9-EXPAND-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x2c
+; GFX9-EXPAND-NEXT:    v_mov_b32_e32 v2, 0
+; GFX9-EXPAND-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-EXPAND-NEXT:    v_mov_b32_e32 v0, s6
+; GFX9-EXPAND-NEXT:    ds_read2_b32 v[0:1], v0 offset1:1
+; GFX9-EXPAND-NEXT:    s_load_dword s0, s[0:1], 0x0
+; GFX9-EXPAND-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-EXPAND-NEXT:    v_add_u32_e32 v0, v0, v1
+; GFX9-EXPAND-NEXT:    v_add_u32_e32 v0, s0, v0
+; GFX9-EXPAND-NEXT:    global_store_dword v2, v0, s[2:3]
+; GFX9-EXPAND-NEXT:    s_endpgm
+;
+; GFX9-NOEXPAND-LABEL: test_outoforder_lds_and_smem:
+; GFX9-NOEXPAND:       ; %bb.0:
+; GFX9-NOEXPAND-NEXT:    s_load_dword s6, s[4:5], 0x24
+; GFX9-NOEXPAND-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x2c
+; GFX9-NOEXPAND-NEXT:    v_mov_b32_e32 v2, 0
+; GFX9-NOEXPAND-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NOEXPAND-NEXT:    v_mov_b32_e32 v0, s6
+; GFX9-NOEXPAND-NEXT:    ds_read2_b32 v[0:1], v0 offset1:1
+; GFX9-NOEXPAND-NEXT:    s_load_dword s0, s[0:1], 0x0
+; GFX9-NOEXPAND-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NOEXPAND-NEXT:    v_add_u32_e32 v0, v0, v1
+; GFX9-NOEXPAND-NEXT:    v_add_u32_e32 v0, s0, v0
+; GFX9-NOEXPAND-NEXT:    global_store_dword v2, v0, s[2:3]
+; GFX9-NOEXPAND-NEXT:    s_endpgm
+;
+; GFX10-EXPAND-LABEL: test_outoforder_lds_and_smem:
+; GFX10-EXPAND:       ; %bb.0:
+; GFX10-EXPAND-NEXT:    s_clause 0x1
+; GFX10-EXPAND-NEXT:    s_load_dword s6, s[4:5], 0x24
+; GFX10-EXPAND-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x2c
+; GFX10-EXPAND-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-EXPAND-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-EXPAND-NEXT:    s_load_dword s0, s[0:1], 0x0
+; GFX10-EXPAND-NEXT:    ds_read2_b32 v[0:1], v0 offset1:1
+; GFX10-EXPAND-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-EXPAND-NEXT:    v_add_nc_u32_e32 v0, v0, v1
+; GFX10-EXPAND-NEXT:    v_mov_b32_e32 v1, 0
+; GFX10-EXPAND-NEXT:    v_add_nc_u32_e32 v0, s0, v0
+; GFX10-EXPAND-NEXT:    global_store_dword v1, v0, s[2:3]
+; GFX10-EXPAND-NEXT:    s_endpgm
+;
+; GFX10-NOEXPAND-LABEL: test_outoforder_lds_and_smem:
+; GFX10-NOEXPAND:       ; %bb.0:
+; GFX10-NOEXPAND-NEXT:    s_clause 0x1
+; GFX10-NOEXPAND-NEXT:    s_load_dword s6, s[4:5], 0x24
+; GFX10-NOEXPAND-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x2c
+; GFX10-NOEXPAND-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-NOEXPAND-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-NOEXPAND-NEXT:    s_load_dword s0, s[0:1], 0x0
+; GFX10-NOEXPAND-NEXT:    ds_read2_b32 v[0:1], v0 offset1:1
+; GFX10-NOEXPAND-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-NOEXPAND-NEXT:    v_add_nc_u32_e32 v0, v0, v1
+; GFX10-NOEXPAND-NEXT:    v_mov_b32_e32 v1, 0
+; GFX10-NOEXPAND-NEXT:    v_add_nc_u32_e32 v0, s0, v0
+; GFX10-NOEXPAND-NEXT:    global_store_dword v1, v0, s[2:3]
+; GFX10-NOEXPAND-NEXT:    s_endpgm
+;
+; GFX11-EXPAND-LABEL: test_outoforder_lds_and_smem:
+; GFX11-EXPAND:       ; %bb.0:
+; GFX11-EXPAND-NEXT:    s_clause 0x1
+; GFX11-EXPAND-NEXT:    s_load_b32 s6, s[4:5], 0x24
+; GFX11-EXPAND-NEXT:    s_load_b128 s[0:3], s[4:5], 0x2c
+; GFX11-EXPAND-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-EXPAND-NEXT:    v_mov_b32_e32 v0, s6
+; GFX11-EXPAND-NEXT:    s_load_b32 s0, s[0:1], 0x0
+; GFX11-EXPAND-NEXT:    ds_load_2addr_b32 v[0:1], v0 offset1:1
+; GFX11-EXPAND-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-EXPAND-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_add_nc_u32 v0, v0, v1
+; GFX11-EXPAND-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-EXPAND-NEXT:    v_add_nc_u32_e32 v0, s0, v0
+; GFX11-EXPAND-NEXT:    global_store_b32 v1, v0, s[2:3]
+; GFX11-EXPAND-NEXT:    s_endpgm
+;
+; GFX11-NOEXPAND-LABEL: test_outoforder_lds_and_smem:
+; GFX11-NOEXPAND:       ; %bb.0:
+; GFX11-NOEXPAND-NEXT:    s_clause 0x1
+; GFX11-NOEXPAND-NEXT:    s_load_b32 s6, s[4:5], 0x24
+; GFX11-NOEXPAND-NEXT:    s_load_b128 s[0:3], s[4:5], 0x2c
+; GFX11-NOEXPAND-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-NOEXPAND-NEXT:    v_mov_b32_e32 v0, s6
+; GFX11-NOEXPAND-NEXT:    s_load_b32 s0, s[0:1], 0x0
+; GFX11-NOEXPAND-NEXT:    ds_load_2addr_b32 v[0:1], v0 offset1:1
+; GFX11-NOEXPAND-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-NOEXPAND-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_add_nc_u32 v0, v0, v1
+; GFX11-NOEXPAND-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NOEXPAND-NEXT:    v_add_nc_u32_e32 v0, s0, v0
+; GFX11-NOEXPAND-NEXT:    global_store_b32 v1, v0, s[2:3]
+; GFX11-NOEXPAND-NEXT:    s_endpgm
+;
+; GFX12-EXPAND-LABEL: test_outoforder_lds_and_smem:
+; GFX12-EXPAND:       ; %bb.0:
+; GFX12-EXPAND-NEXT:    s_clause 0x1
+; GFX12-EXPAND-NEXT:    s_load_b32 s6, s[4:5], 0x24
+; GFX12-EXPAND-NEXT:    s_load_b128 s[0:3], s[4:5], 0x2c
+; GFX12-EXPAND-NEXT:    s_wait_kmcnt 0x0
+; GFX12-EXPAND-NEXT:    v_mov_b32_e32 v0, s6
+; GFX12-EXPAND-NEXT:    s_load_b32 s0, s[0:1], 0x0
+; GFX12-EXPAND-NEXT:    ds_load_2addr_b32 v[0:1], v0 offset1:1
+; GFX12-EXPAND-NEXT:    s_wait_dscnt 0x0
+; GFX12-EXPAND-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_add_nc_u32 v0, v0, v1
+; GFX12-EXPAND-NEXT:    s_wait_kmcnt 0x0
+; GFX12-EXPAND-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-EXPAND-NEXT:    v_add_nc_u32_e32 v0, s0, v0
+; GFX12-EXPAND-NEXT:    global_store_b32 v1, v0, s[2:3]
+; GFX12-EXPAND-NEXT:    s_endpgm
+;
+; GFX12-NOEXPAND-LABEL: test_outoforder_lds_and_smem:
+; GFX12-NOEXPAND:       ; %bb.0:
+; GFX12-NOEXPAND-NEXT:    s_clause 0x1
+; GFX12-NOEXPAND-NEXT:    s_load_b32 s6, s[4:5], 0x24
+; GFX12-NOEXPAND-NEXT:    s_load_b128 s[0:3], s[4:5], 0x2c
+; GFX12-NOEXPAND-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NOEXPAND-NEXT:    v_mov_b32_e32 v0, s6
+; GFX12-NOEXPAND-NEXT:    s_load_b32 s0, s[0:1], 0x0
+; GFX12-NOEXPAND-NEXT:    ds_load_2addr_b32 v[0:1], v0 offset1:1
+; GFX12-NOEXPAND-NEXT:    s_wait_dscnt 0x0
+; GFX12-NOEXPAND-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_add_nc_u32 v0, v0, v1
+; GFX12-NOEXPAND-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NOEXPAND-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NOEXPAND-NEXT:    v_add_nc_u32_e32 v0, s0, v0
+; GFX12-NOEXPAND-NEXT:    global_store_b32 v1, v0, s[2:3]
+; GFX12-NOEXPAND-NEXT:    s_endpgm
+
+  %lds_val1 = load i32, ptr addrspace(3) %lds_ptr, align 4
+  %smem_val = load i32, ptr addrspace(4) %smem_ptr, align 4
+  %lds_ptr2 = getelementptr i32, ptr addrspace(3) %lds_ptr, i32 1
+  %lds_val2 = load i32, ptr addrspace(3) %lds_ptr2, align 4
+  %sum1 = add i32 %lds_val1, %lds_val2
+  %sum2 = add i32 %sum1, %smem_val
+  store i32 %sum2, ptr addrspace(1) %out, align 4
+  ret void
+}
+
+define amdgpu_kernel void @test_vscnt_global_stores(ptr addrspace(1) %buf) #0 {
+; Test vector memory stores (STORE_CNT/vscnt on GFX10-11, storecnt on GFX12+)
+; The EXPAND and NOEXPAND check sequences below are identical line-for-line on
+; every target: the only store-counter wait emitted is already at its final
+; value (vmcnt(0) / vscnt 0x0 / storecnt 0x0), so there is no decreasing
+; sequence of intermediate waits visible in the output.
+; NOTE(review): presumably the pass declines to expand here; confirm the exact
+; reason (store-counter handling) against SIInsertWaitcnts.cpp.
+; GFX9-EXPAND-LABEL: test_vscnt_global_stores:
+; GFX9-EXPAND:       ; %bb.0: ; %entry
+; GFX9-EXPAND-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9-EXPAND-NEXT:    v_mov_b32_e32 v1, 2
+; GFX9-EXPAND-NEXT:    v_mov_b32_e32 v2, 1
+; GFX9-EXPAND-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-EXPAND-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-EXPAND-NEXT:    global_store_dword v0, v2, s[0:1]
+; GFX9-EXPAND-NEXT:    global_store_dword v0, v1, s[0:1] offset:256
+; GFX9-EXPAND-NEXT:    v_mov_b32_e32 v1, 3
+; GFX9-EXPAND-NEXT:    global_store_dword v0, v1, s[0:1] offset:512
+; GFX9-EXPAND-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-EXPAND-NEXT:    s_endpgm
+;
+; GFX9-NOEXPAND-LABEL: test_vscnt_global_stores:
+; GFX9-NOEXPAND:       ; %bb.0: ; %entry
+; GFX9-NOEXPAND-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9-NOEXPAND-NEXT:    v_mov_b32_e32 v1, 2
+; GFX9-NOEXPAND-NEXT:    v_mov_b32_e32 v2, 1
+; GFX9-NOEXPAND-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-NOEXPAND-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NOEXPAND-NEXT:    global_store_dword v0, v2, s[0:1]
+; GFX9-NOEXPAND-NEXT:    global_store_dword v0, v1, s[0:1] offset:256
+; GFX9-NOEXPAND-NEXT:    v_mov_b32_e32 v1, 3
+; GFX9-NOEXPAND-NEXT:    global_store_dword v0, v1, s[0:1] offset:512
+; GFX9-NOEXPAND-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NOEXPAND-NEXT:    s_endpgm
+;
+; GFX10-EXPAND-LABEL: test_vscnt_global_stores:
+; GFX10-EXPAND:       ; %bb.0: ; %entry
+; GFX10-EXPAND-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX10-EXPAND-NEXT:    v_mov_b32_e32 v1, 1
+; GFX10-EXPAND-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX10-EXPAND-NEXT:    v_mov_b32_e32 v2, 2
+; GFX10-EXPAND-NEXT:    v_mov_b32_e32 v3, 3
+; GFX10-EXPAND-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-EXPAND-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX10-EXPAND-NEXT:    global_store_dword v0, v2, s[0:1] offset:256
+; GFX10-EXPAND-NEXT:    global_store_dword v0, v3, s[0:1] offset:512
+; GFX10-EXPAND-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-EXPAND-NEXT:    s_endpgm
+;
+; GFX10-NOEXPAND-LABEL: test_vscnt_global_stores:
+; GFX10-NOEXPAND:       ; %bb.0: ; %entry
+; GFX10-NOEXPAND-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX10-NOEXPAND-NEXT:    v_mov_b32_e32 v1, 1
+; GFX10-NOEXPAND-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX10-NOEXPAND-NEXT:    v_mov_b32_e32 v2, 2
+; GFX10-NOEXPAND-NEXT:    v_mov_b32_e32 v3, 3
+; GFX10-NOEXPAND-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-NOEXPAND-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX10-NOEXPAND-NEXT:    global_store_dword v0, v2, s[0:1] offset:256
+; GFX10-NOEXPAND-NEXT:    global_store_dword v0, v3, s[0:1] offset:512
+; GFX10-NOEXPAND-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-NOEXPAND-NEXT:    s_endpgm
+;
+; GFX11-EXPAND-LABEL: test_vscnt_global_stores:
+; GFX11-EXPAND:       ; %bb.0: ; %entry
+; GFX11-EXPAND-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-EXPAND-NEXT:    v_dual_mov_b32 v1, 1 :: v_dual_and_b32 v0, 0x3ff, v0
+; GFX11-EXPAND-NEXT:    v_dual_mov_b32 v2, 2 :: v_dual_mov_b32 v3, 3
+; GFX11-EXPAND-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-EXPAND-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX11-EXPAND-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-EXPAND-NEXT:    s_clause 0x2
+; GFX11-EXPAND-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX11-EXPAND-NEXT:    global_store_b32 v0, v2, s[0:1] offset:256
+; GFX11-EXPAND-NEXT:    global_store_b32 v0, v3, s[0:1] offset:512
+; GFX11-EXPAND-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-EXPAND-NEXT:    s_endpgm
+;
+; GFX11-NOEXPAND-LABEL: test_vscnt_global_stores:
+; GFX11-NOEXPAND:       ; %bb.0: ; %entry
+; GFX11-NOEXPAND-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-NOEXPAND-NEXT:    v_dual_mov_b32 v1, 1 :: v_dual_and_b32 v0, 0x3ff, v0
+; GFX11-NOEXPAND-NEXT:    v_dual_mov_b32 v2, 2 :: v_dual_mov_b32 v3, 3
+; GFX11-NOEXPAND-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-NOEXPAND-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX11-NOEXPAND-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-NOEXPAND-NEXT:    s_clause 0x2
+; GFX11-NOEXPAND-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX11-NOEXPAND-NEXT:    global_store_b32 v0, v2, s[0:1] offset:256
+; GFX11-NOEXPAND-NEXT:    global_store_b32 v0, v3, s[0:1] offset:512
+; GFX11-NOEXPAND-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NOEXPAND-NEXT:    s_endpgm
+;
+; GFX12-EXPAND-LABEL: test_vscnt_global_stores:
+; GFX12-EXPAND:       ; %bb.0: ; %entry
+; GFX12-EXPAND-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX12-EXPAND-NEXT:    v_dual_mov_b32 v1, 1 :: v_dual_and_b32 v0, 0x3ff, v0
+; GFX12-EXPAND-NEXT:    v_dual_mov_b32 v2, 2 :: v_dual_mov_b32 v3, 3
+; GFX12-EXPAND-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX12-EXPAND-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX12-EXPAND-NEXT:    s_wait_kmcnt 0x0
+; GFX12-EXPAND-NEXT:    s_clause 0x2
+; GFX12-EXPAND-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX12-EXPAND-NEXT:    global_store_b32 v0, v2, s[0:1] offset:256
+; GFX12-EXPAND-NEXT:    global_store_b32 v0, v3, s[0:1] offset:512
+; GFX12-EXPAND-NEXT:    global_wb scope:SCOPE_SYS
+; GFX12-EXPAND-NEXT:    s_wait_storecnt 0x0
+; GFX12-EXPAND-NEXT:    s_endpgm
+;
+; GFX12-NOEXPAND-LABEL: test_vscnt_global_stores:
+; GFX12-NOEXPAND:       ; %bb.0: ; %entry
+; GFX12-NOEXPAND-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX12-NOEXPAND-NEXT:    v_dual_mov_b32 v1, 1 :: v_dual_and_b32 v0, 0x3ff, v0
+; GFX12-NOEXPAND-NEXT:    v_dual_mov_b32 v2, 2 :: v_dual_mov_b32 v3, 3
+; GFX12-NOEXPAND-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX12-NOEXPAND-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX12-NOEXPAND-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NOEXPAND-NEXT:    s_clause 0x2
+; GFX12-NOEXPAND-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX12-NOEXPAND-NEXT:    global_store_b32 v0, v2, s[0:1] offset:256
+; GFX12-NOEXPAND-NEXT:    global_store_b32 v0, v3, s[0:1] offset:512
+; GFX12-NOEXPAND-NEXT:    global_wb scope:SCOPE_SYS
+; GFX12-NOEXPAND-NEXT:    s_wait_storecnt 0x0
+; GFX12-NOEXPAND-NEXT:    s_endpgm
+entry:
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %tid64 = zext i32 %tid to i64
+
+  ; Issue multiple stores
+  %ptr0 = getelementptr i32, ptr addrspace(1) %buf, i64 %tid64
+  store i32 1, ptr addrspace(1) %ptr0, align 4
+
+  ; Disjoint offsets keep the three stores independent of each other.
+  %offset1 = add i64 %tid64, 64
+  %ptr1 = getelementptr i32, ptr addrspace(1) %buf, i64 %offset1
+  store i32 2, ptr addrspace(1) %ptr1, align 4
+
+  %offset2 = add i64 %tid64, 128
+  %ptr2 = getelementptr i32, ptr addrspace(1) %buf, i64 %offset2
+  store i32 3, ptr addrspace(1) %ptr2, align 4
+
+  ; Memory fence forces wait for all stores
+  fence release
+  ret void
+}
+
+define amdgpu_ps void @test_expcnt_exports(float %x, float %y, float %z, float %w) #0 {
+; Test export operations (EXP_CNT/expcnt)
+; No expcnt wait is emitted on any target below, so the EXPAND and NOEXPAND
+; check sequences are identical line-for-line.
+; GFX9-EXPAND-LABEL: test_expcnt_exports:
+; GFX9-EXPAND:       ; %bb.0: ; %entry
+; GFX9-EXPAND-NEXT:    v_mov_b32_e32 v4, 1.0
+; GFX9-EXPAND-NEXT:    exp mrt0 v0, v1, v2, v3
+; GFX9-EXPAND-NEXT:    exp mrt1 v3, v2, v1, v0
+; GFX9-EXPAND-NEXT:    exp mrt2 v0, v3, v1, v2
+; GFX9-EXPAND-NEXT:    exp param0 v4, v4, v4, v4 done
+; GFX9-EXPAND-NEXT:    s_endpgm
+;
+; GFX9-NOEXPAND-LABEL: test_expcnt_exports:
+; GFX9-NOEXPAND:       ; %bb.0: ; %entry
+; GFX9-NOEXPAND-NEXT:    v_mov_b32_e32 v4, 1.0
+; GFX9-NOEXPAND-NEXT:    exp mrt0 v0, v1, v2, v3
+; GFX9-NOEXPAND-NEXT:    exp mrt1 v3, v2, v1, v0
+; GFX9-NOEXPAND-NEXT:    exp mrt2 v0, v3, v1, v2
+; GFX9-NOEXPAND-NEXT:    exp param0 v4, v4, v4, v4 done
+; GFX9-NOEXPAND-NEXT:    s_endpgm
+;
+; GFX10-EXPAND-LABEL: test_expcnt_exports:
+; GFX10-EXPAND:       ; %bb.0: ; %entry
+; GFX10-EXPAND-NEXT:    v_mov_b32_e32 v4, 1.0
+; GFX10-EXPAND-NEXT:    exp mrt0 v0, v1, v2, v3
+; GFX10-EXPAND-NEXT:    exp mrt1 v3, v2, v1, v0
+; GFX10-EXPAND-NEXT:    exp mrt2 v0, v3, v1, v2
+; GFX10-EXPAND-NEXT:    exp param0 v4, v4, v4, v4 done
+; GFX10-EXPAND-NEXT:    s_endpgm
+;
+; GFX10-NOEXPAND-LABEL: test_expcnt_exports:
+; GFX10-NOEXPAND:       ; %bb.0: ; %entry
+; GFX10-NOEXPAND-NEXT:    v_mov_b32_e32 v4, 1.0
+; GFX10-NOEXPAND-NEXT:    exp mrt0 v0, v1, v2, v3
+; GFX10-NOEXPAND-NEXT:    exp mrt1 v3, v2, v1, v0
+; GFX10-NOEXPAND-NEXT:    exp mrt2 v0, v3, v1, v2
+; GFX10-NOEXPAND-NEXT:    exp param0 v4, v4, v4, v4 done
+; GFX10-NOEXPAND-NEXT:    s_endpgm
+;
+; GFX11-EXPAND-LABEL: test_expcnt_exports:
+; GFX11-EXPAND:       ; %bb.0: ; %entry
+; GFX11-EXPAND-NEXT:    v_mov_b32_e32 v4, 1.0
+; GFX11-EXPAND-NEXT:    exp mrt0 v0, v1, v2, v3
+; GFX11-EXPAND-NEXT:    exp mrt1 v3, v2, v1, v0
+; GFX11-EXPAND-NEXT:    exp mrt2 v0, v3, v1, v2
+; GFX11-EXPAND-NEXT:    exp invalid_target_32 v4, v4, v4, v4 done
+; GFX11-EXPAND-NEXT:    s_endpgm
+;
+; GFX11-NOEXPAND-LABEL: test_expcnt_exports:
+; GFX11-NOEXPAND:       ; %bb.0: ; %entry
+; GFX11-NOEXPAND-NEXT:    v_mov_b32_e32 v4, 1.0
+; GFX11-NOEXPAND-NEXT:    exp mrt0 v0, v1, v2, v3
+; GFX11-NOEXPAND-NEXT:    exp mrt1 v3, v2, v1, v0
+; GFX11-NOEXPAND-NEXT:    exp mrt2 v0, v3, v1, v2
+; GFX11-NOEXPAND-NEXT:    exp invalid_target_32 v4, v4, v4, v4 done
+; GFX11-NOEXPAND-NEXT:    s_endpgm
+;
+; GFX12-EXPAND-LABEL: test_expcnt_exports:
+; GFX12-EXPAND:       ; %bb.0: ; %entry
+; GFX12-EXPAND-NEXT:    v_mov_b32_e32 v4, 1.0
+; GFX12-EXPAND-NEXT:    export mrt0 v0, v1, v2, v3
+; GFX12-EXPAND-NEXT:    export mrt1 v3, v2, v1, v0
+; GFX12-EXPAND-NEXT:    export mrt2 v0, v3, v1, v2
+; GFX12-EXPAND-NEXT:    export invalid_target_32 v4, v4, v4, v4 done
+; GFX12-EXPAND-NEXT:    s_endpgm
+;
+; GFX12-NOEXPAND-LABEL: test_expcnt_exports:
+; GFX12-NOEXPAND:       ; %bb.0: ; %entry
+; GFX12-NOEXPAND-NEXT:    v_mov_b32_e32 v4, 1.0
+; GFX12-NOEXPAND-NEXT:    export mrt0 v0, v1, v2, v3
+; GFX12-NOEXPAND-NEXT:    export mrt1 v3, v2, v1, v0
+; GFX12-NOEXPAND-NEXT:    export mrt2 v0, v3, v1, v2
+; GFX12-NOEXPAND-NEXT:    export invalid_target_32 v4, v4, v4, v4 done
+; GFX12-NOEXPAND-NEXT:    s_endpgm
+entry:
+  ; Multiple MRT exports
+  call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %x, float %y, float %z, float %w, i1 false, i1 false)
+  call void @llvm.amdgcn.exp.f32(i32 1, i32 15, float %w, float %z, float %y, float %x, i1 false, i1 false)
+  call void @llvm.amdgcn.exp.f32(i32 2, i32 15, float %x, float %w, float %y, float %z, i1 false, i1 false)
+  ; Final export with done bit
+  call void @llvm.amdgcn.exp.f32(i32 32, i32 15, float 1.0, float 1.0, float 1.0, float 1.0, i1 true, i1 false)
+  ret void
+}
+
+declare void @llvm.amdgcn.exp.f32(i32, i32, float, float, float, float, i1, i1)
+
+attributes #0 = { nounwind ATTRS }


        
_______________________________________________
cfe-commits mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits

Reply via email to