https://github.com/kerbowa created https://github.com/llvm/llvm-project/pull/169617
Implements a structural stall heuristic that considers both resource hazards and latency constraints when selecting instructions from the pending queue.

- Add getStructuralStallCycles() to GCNSchedStrategy that computes the number of cycles an instruction must wait due to:
  - Resource conflicts on unbuffered resources (from the SchedModel)
  - Sequence-dependent hazards (from GCNHazardRecognizer)
- Add getHazardWaitStates() to GCNHazardRecognizer that returns the number of wait states until all hazards for an instruction are resolved, providing cycle-accurate hazard information for scheduling heuristics.

>From bbf69d1b280b197c3f0a6e0fc9edf91f361ec407 Mon Sep 17 00:00:00 2001
From: Austin Kerbow <[email protected]>
Date: Tue, 25 Nov 2025 22:18:19 -0800
Subject: [PATCH] [AMDGPU] Add structural stall heuristic to scheduling strategies

Implements a structural stall heuristic that considers both resource hazards and latency constraints when selecting instructions from the pending queue.

- Add getStructuralStallCycles() to GCNSchedStrategy that computes the number of cycles an instruction must wait due to:
  - Resource conflicts on unbuffered resources (from the SchedModel)
  - Sequence-dependent hazards (from GCNHazardRecognizer)
- Add getHazardWaitStates() to GCNHazardRecognizer that returns the number of wait states until all hazards for an instruction are resolved, providing cycle-accurate hazard information for scheduling heuristics.
--- .../Target/AMDGPU/AMDGPUMLSchedStrategy.cpp | 72 +++++++++++++++++++ .../lib/Target/AMDGPU/AMDGPUMLSchedStrategy.h | 4 +- .../lib/Target/AMDGPU/GCNHazardRecognizer.cpp | 4 ++ llvm/lib/Target/AMDGPU/GCNHazardRecognizer.h | 6 ++ llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp | 40 +++++++++++ llvm/lib/Target/AMDGPU/GCNSchedStrategy.h | 8 ++- .../AMDGPU/ml-sched-effective-stall.mir | 8 ++- 7 files changed, 136 insertions(+), 6 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMLSchedStrategy.cpp b/llvm/lib/Target/AMDGPU/AMDGPUMLSchedStrategy.cpp index 6bad7cc172709..20d5ca61ac01d 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUMLSchedStrategy.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUMLSchedStrategy.cpp @@ -13,6 +13,10 @@ #include "AMDGPUMLSchedStrategy.h" +#include "llvm/Support/Debug.h" + +#define DEBUG_TYPE "machine-scheduler" + using namespace llvm; AMDGPUMLSchedStrategy::AMDGPUMLSchedStrategy(const MachineSchedContext *C) @@ -121,6 +125,74 @@ bool AMDGPUMLSchedStrategy::tryCandidate(SchedCandidate &Cand, return false; } +bool AMDGPUMLSchedStrategy::tryPendingCandidate(SchedCandidate &Cand, + SchedCandidate &TryCand, + SchedBoundary *Zone) const { + // Initialize the candidate if needed. + if (!Cand.isValid()) { + TryCand.Reason = NodeOrder; + return true; + } + + // Bias PhysReg Defs and copies to their uses and defined respectively. + if (tryGreater(biasPhysReg(TryCand.SU, TryCand.AtTop), + biasPhysReg(Cand.SU, Cand.AtTop), TryCand, Cand, PhysReg)) + return TryCand.Reason != NoCand; + + // Avoid exceeding the target's limit. + if (DAG->isTrackingPressure() && + tryPressure(TryCand.RPDelta.Excess, Cand.RPDelta.Excess, TryCand, Cand, + RegExcess, TRI, DAG->MF)) + return TryCand.Reason != NoCand; + + // Avoid increasing the max critical pressure in the scheduled region. 
+ if (DAG->isTrackingPressure() && + tryPressure(TryCand.RPDelta.CriticalMax, Cand.RPDelta.CriticalMax, + TryCand, Cand, RegCritical, TRI, DAG->MF)) + return TryCand.Reason != NoCand; + + bool SameBoundary = Zone != nullptr; + if (SameBoundary) { + // Compare effective stall cycles between candidates. + // Effective stall = max(structural stall, latency stall) + // - Structural stalls: resource/hazard constraints (HW not ready) + // - Latency stalls: data dependency constraints (operands not ready) + // + // This allows picking a pending instruction with structural stalls over + // an available instruction with higher latency stalls (e.g., scheduling + // a WMMA while waiting for a memory load result). + unsigned TryStructStall = getStructuralStallCycles(*Zone, TryCand.SU); + unsigned TryLatencyStall = Zone->getLatencyStallCycles(TryCand.SU); + unsigned TryEffectiveStall = std::max(TryStructStall, TryLatencyStall); + + unsigned CandStructStall = getStructuralStallCycles(*Zone, Cand.SU); + unsigned CandLatencyStall = Zone->getLatencyStallCycles(Cand.SU); + unsigned CandEffectiveStall = std::max(CandStructStall, CandLatencyStall); + + LLVM_DEBUG(if (TryEffectiveStall || CandEffectiveStall) { + dbgs() << "Effective stalls: try=" << TryEffectiveStall + << " (struct=" << TryStructStall << ", lat=" << TryLatencyStall + << ") cand=" << CandEffectiveStall + << " (struct=" << CandStructStall << ", lat=" << CandLatencyStall + << ")\n"; + }); + + if (tryLess(TryEffectiveStall, CandEffectiveStall, TryCand, Cand, Stall)) + return TryCand.Reason != NoCand; + + TryCand.initResourceDelta(DAG, SchedModel); + if (tryLess(TryCand.ResDelta.CritResources, Cand.ResDelta.CritResources, + TryCand, Cand, ResourceReduce)) + return TryCand.Reason != NoCand; + if (tryGreater(TryCand.ResDelta.DemandedResources, + Cand.ResDelta.DemandedResources, TryCand, Cand, + ResourceDemand)) + return TryCand.Reason != NoCand; + } + + return false; +} + AMDGPUMLPostSchedStrategy::AMDGPUMLPostSchedStrategy( 
const MachineSchedContext *C) : PostGenericScheduler(C) {} diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMLSchedStrategy.h b/llvm/lib/Target/AMDGPU/AMDGPUMLSchedStrategy.h index 1a6d042231942..b72b193c70786 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUMLSchedStrategy.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUMLSchedStrategy.h @@ -20,6 +20,8 @@ class AMDGPUMLSchedStrategy final : public GCNSchedStrategy { protected: bool tryCandidate(SchedCandidate &Cand, SchedCandidate &TryCand, SchedBoundary *Zone) const override; + bool tryPendingCandidate(SchedCandidate &Cand, SchedCandidate &TryCand, + SchedBoundary *Zone) const override; public: AMDGPUMLSchedStrategy(const MachineSchedContext *C); @@ -33,4 +35,4 @@ class AMDGPUMLPostSchedStrategy : public PostGenericScheduler { AMDGPUMLPostSchedStrategy(const MachineSchedContext *C); }; -} // End namespace llvm \ No newline at end of file +} // End namespace llvm diff --git a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp index 7a2f84a2f73eb..ec160d15da1ac 100644 --- a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp +++ b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp @@ -313,6 +313,10 @@ unsigned GCNHazardRecognizer::PreEmitNoops(MachineInstr *MI) { return std::max(W, NopPadding.getValue()); } +unsigned GCNHazardRecognizer::getHazardWaitStates(MachineInstr *MI) const { + return const_cast<GCNHazardRecognizer *>(this)->PreEmitNoopsCommon(MI); +} + unsigned GCNHazardRecognizer::PreEmitNoopsCommon(MachineInstr *MI) { if (MI->isBundle()) return 0; diff --git a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.h b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.h index 67beffadc0913..be914d8657870 100644 --- a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.h +++ b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.h @@ -145,6 +145,12 @@ class GCNHazardRecognizer final : public ScheduleHazardRecognizer { void EmitInstruction(SUnit *SU) override; void EmitInstruction(MachineInstr *MI) override; HazardType 
getHazardType(SUnit *SU, int Stalls) override; + + /// Returns the number of wait states until all hazards for \p MI are + /// resolved. This is useful for scheduling heuristics that want + /// cycle-accurate hazard information rather than just a boolean. Unlike + /// PreEmitNoops, this does not modify state or fix hazards. + unsigned getHazardWaitStates(MachineInstr *MI) const; void EmitNoop() override; unsigned PreEmitNoops(MachineInstr *) override; unsigned PreEmitNoopsCommon(MachineInstr *); diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp index d9cb80c7c1676..00d1a25d0b6c8 100644 --- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp +++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp @@ -25,6 +25,7 @@ #include "GCNSchedStrategy.h" #include "AMDGPUIGroupLP.h" +#include "GCNHazardRecognizer.h" #include "GCNRegPressure.h" #include "SIMachineFunctionInfo.h" #include "Utils/AMDGPUBaseInfo.h" @@ -218,6 +219,40 @@ void GCNSchedStrategy::getRegisterPressures( Pressure[AMDGPU::RegisterPressureSets::AGPR_32] = NewPressure.getAGPRNum(); } +unsigned GCNSchedStrategy::getStructuralStallCycles(SchedBoundary &Zone, + SUnit *SU) const { + // Only implemented for top-down scheduling currently. + if (!Zone.isTop() || !SU) + return 0; + + MachineInstr *MI = SU->getInstr(); + unsigned CurrCycle = Zone.getCurrCycle(); + unsigned Stall = 0; + + // Query SchedModel for resource stalls (unbuffered resources). + if (SchedModel->hasInstrSchedModel() && SU->hasReservedResource) { + const MCSchedClassDesc *SC = DAG->getSchedClass(SU); + for (const MCWriteProcResEntry &PE : + make_range(SchedModel->getWriteProcResBegin(SC), + SchedModel->getWriteProcResEnd(SC))) { + unsigned NextAvail = + Zone.getNextResourceCycle(SC, PE.ProcResourceIdx, PE.ReleaseAtCycle, + PE.AcquireAtCycle) + .first; + if (NextAvail > CurrCycle) + Stall = std::max(Stall, NextAvail - CurrCycle); + } + } + + // Query HazardRecognizer for sequence-dependent hazard penalties. 
+ if (Zone.HazardRec && Zone.HazardRec->isEnabled()) { + auto *HR = static_cast<GCNHazardRecognizer *>(Zone.HazardRec); + Stall = std::max(Stall, HR->getHazardWaitStates(MI)); + } + + return Stall; +} + void GCNSchedStrategy::initCandidate(SchedCandidate &Cand, SUnit *SU, bool AtTop, const RegPressureTracker &RPTracker, @@ -673,6 +708,11 @@ bool GCNSchedStrategy::tryPendingCandidate(SchedCandidate &Cand, bool SameBoundary = Zone != nullptr; if (SameBoundary) { + unsigned TryStructStall = getStructuralStallCycles(*Zone, TryCand.SU); + unsigned CandStructStall = getStructuralStallCycles(*Zone, Cand.SU); + if (tryLess(TryStructStall, CandStructStall, TryCand, Cand, Stall)) + return TryCand.Reason != NoCand; + TryCand.initResourceDelta(DAG, SchedModel); if (tryLess(TryCand.ResDelta.CritResources, Cand.ResDelta.CritResources, TryCand, Cand, ResourceReduce)) diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h index 367f47c3ca4ae..048eeecac0ab9 100644 --- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h +++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h @@ -56,6 +56,10 @@ class GCNSchedStrategy : public GenericScheduler { const SIRegisterInfo *SRI, unsigned SGPRPressure, unsigned VGPRPressure, bool IsBottomUp); + /// Estimate how many cycles \p SU must wait due to structural hazards at the + /// current boundary cycle. Returns zero when no stall is required. + unsigned getStructuralStallCycles(SchedBoundary &Zone, SUnit *SU) const; + /// Evaluates instructions in the pending queue using a subset of scheduling /// heuristics. /// @@ -64,8 +68,8 @@ class GCNSchedStrategy : public GenericScheduler { /// invisible to scheduling heuristics. However, in certain scenarios (such as /// avoiding register spilling), it may be beneficial to consider scheduling /// these not-yet-ready instructions. 
- bool tryPendingCandidate(SchedCandidate &Cand, SchedCandidate &TryCand, - SchedBoundary *Zone) const; + virtual bool tryPendingCandidate(SchedCandidate &Cand, SchedCandidate &TryCand, + SchedBoundary *Zone) const; void printCandidateDecision(const SchedCandidate &Current, const SchedCandidate &Preferred); diff --git a/llvm/test/CodeGen/AMDGPU/ml-sched-effective-stall.mir b/llvm/test/CodeGen/AMDGPU/ml-sched-effective-stall.mir index 6c6d1c5728a34..08d9626d69f90 100644 --- a/llvm/test/CodeGen/AMDGPU/ml-sched-effective-stall.mir +++ b/llvm/test/CodeGen/AMDGPU/ml-sched-effective-stall.mir @@ -10,6 +10,8 @@ attributes #1 = { "amdgpu-waves-per-eu"="1,1" } ... +# The scheduler should reorder the use of the global load after WMMAs to hide memory latency. + --- name: with_ml_workload_attr tracksRegLiveness: true @@ -29,8 +31,8 @@ body: | ; DEFAULT-NEXT: [[DEF10:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF ; DEFAULT-NEXT: [[GLOBAL_LOAD_DWORDX2_:%[0-9]+]]:vreg_64_align2 = GLOBAL_LOAD_DWORDX2 [[DEF10]], 0, 0, implicit $exec ; DEFAULT-NEXT: early-clobber %13:vreg_256_align2 = V_WMMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_w32_threeaddr [[DEF]], [[DEF1]], 0, [[DEF2]], [[DEF3]], [[DEF4]], 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec - ; DEFAULT-NEXT: [[V_PK_ADD_F32_:%[0-9]+]]:vreg_64_align2 = V_PK_ADD_F32 8, [[GLOBAL_LOAD_DWORDX2_]], 8, [[GLOBAL_LOAD_DWORDX2_]], 0, 0, 0, 0, 0, implicit $mode, implicit $exec ; DEFAULT-NEXT: early-clobber %14:vreg_256_align2 = V_WMMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_w32_threeaddr [[DEF5]], [[DEF6]], 0, [[DEF7]], [[DEF8]], [[DEF9]], 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec + ; DEFAULT-NEXT: [[V_PK_ADD_F32_:%[0-9]+]]:vreg_64_align2 = V_PK_ADD_F32 8, [[GLOBAL_LOAD_DWORDX2_]], 8, [[GLOBAL_LOAD_DWORDX2_]], 0, 0, 0, 0, 0, implicit $mode, implicit $exec ; DEFAULT-NEXT: S_ENDPGM 0, implicit [[V_PK_ADD_F32_]], implicit %13, implicit %14 ; ; ML-LABEL: name: with_ml_workload_attr @@ -47,8 +49,8 @@ body: | ; ML-NEXT: [[DEF10:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF ; 
ML-NEXT: [[GLOBAL_LOAD_DWORDX2_:%[0-9]+]]:vreg_64_align2 = GLOBAL_LOAD_DWORDX2 [[DEF10]], 0, 0, implicit $exec ; ML-NEXT: early-clobber %13:vreg_256_align2 = V_WMMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_w32_threeaddr [[DEF]], [[DEF1]], 0, [[DEF2]], [[DEF3]], [[DEF4]], 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec - ; ML-NEXT: [[V_PK_ADD_F32_:%[0-9]+]]:vreg_64_align2 = V_PK_ADD_F32 8, [[GLOBAL_LOAD_DWORDX2_]], 8, [[GLOBAL_LOAD_DWORDX2_]], 0, 0, 0, 0, 0, implicit $mode, implicit $exec ; ML-NEXT: early-clobber %14:vreg_256_align2 = V_WMMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_w32_threeaddr [[DEF5]], [[DEF6]], 0, [[DEF7]], [[DEF8]], [[DEF9]], 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec + ; ML-NEXT: [[V_PK_ADD_F32_:%[0-9]+]]:vreg_64_align2 = V_PK_ADD_F32 8, [[GLOBAL_LOAD_DWORDX2_]], 8, [[GLOBAL_LOAD_DWORDX2_]], 0, 0, 0, 0, 0, implicit $mode, implicit $exec ; ML-NEXT: S_ENDPGM 0, implicit [[V_PK_ADD_F32_]], implicit %13, implicit %14 %0:vreg_512_align2 = IMPLICIT_DEF %1:vreg_512_align2 = IMPLICIT_DEF @@ -99,8 +101,8 @@ body: | ; ML-NEXT: [[DEF3:%[0-9]+]]:vgpr_32_lo256 = IMPLICIT_DEF ; ML-NEXT: [[DEF4:%[0-9]+]]:vgpr_32_lo256 = IMPLICIT_DEF ; ML-NEXT: [[DEF5:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF - ; ML-NEXT: early-clobber %13:vreg_256_align2 = V_WMMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_w32_threeaddr [[DEF]], [[DEF1]], 0, [[DEF2]], [[DEF3]], [[DEF4]], 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec ; ML-NEXT: [[GLOBAL_LOAD_DWORDX2_:%[0-9]+]]:vreg_64_align2 = GLOBAL_LOAD_DWORDX2 [[DEF5]], 0, 0, implicit $exec + ; ML-NEXT: early-clobber %13:vreg_256_align2 = V_WMMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_w32_threeaddr [[DEF]], [[DEF1]], 0, [[DEF2]], [[DEF3]], [[DEF4]], 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec ; ML-NEXT: [[DEF6:%[0-9]+]]:vreg_512_align2 = IMPLICIT_DEF ; ML-NEXT: [[DEF7:%[0-9]+]]:vreg_512_align2 = IMPLICIT_DEF ; ML-NEXT: [[DEF8:%[0-9]+]]:vreg_256_align2 = IMPLICIT_DEF _______________________________________________ llvm-branch-commits mailing list [email protected] 
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
