llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT--> @llvm/pr-subscribers-backend-amdgpu Author: None (hidekisaito) <details> <summary>Changes</summary> Add insertDSPreheaderFlushes() to insert S_WAIT_DSCNT 0 in loop preheaders when DS wait relaxation was applied. Assisted-by: Cursor / claude-4.5-opus-high Depends on https://github.com/llvm/llvm-project/pull/171944 --- Full diff: https://github.com/llvm/llvm-project/pull/171948.diff 2 Files Affected: - (modified) llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp (+67) - (modified) llvm/test/CodeGen/AMDGPU/waitcnt-loop-ds-opt-eligible.mir (+4-2) ``````````diff diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp index 777491fb58b80..28bc57ed2db4e 100644 --- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp +++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp @@ -605,6 +605,7 @@ class SIInsertWaitcnts { std::optional<unsigned> getOptimalDSWaitCount(MachineBasicBlock *LoopHeader, const MachineInstr &MI) const; bool applyDSLoopWaitOpt(MachineInstr &MI, AMDGPU::Waitcnt &Wait); + bool insertDSPreheaderFlushes(MachineFunction &MF); }; // This objects maintains the current score brackets of each wait counter, and @@ -2904,6 +2905,68 @@ bool SIInsertWaitcnts::applyDSLoopWaitOpt(MachineInstr &MI, return true; } +// Insert DS_CNT flush in preheaders of loops where DS wait relaxation was +// applied. This is necessary because the relaxed wait counts inside the loop +// are computed based on the DS loads issued at the end of the previous +// iteration (via backedge), but the first iteration enters via the preheader. +// We must ensure all DS loads from the preheader are complete before entering +// the loop. +bool SIInsertWaitcnts::insertDSPreheaderFlushes(MachineFunction &MF) { + bool Modified = false; + + for (auto &[LoopHeader, Info] : LoopDSWaitOptCache) { + if (!Info.Valid || !Info.RelaxationApplied) + continue; + + MachineLoop *ML = MLI->getLoopFor(LoopHeader); + if (!ML) + continue; + + MachineBasicBlock *Preheader = ML->getLoopPreheader(); + if (!Preheader) + continue; + + // Insert s_wait_dscnt 0 at the end of the preheader (before the terminator) + MachineBasicBlock::iterator InsertPos = Preheader->getFirstTerminator(); + if (InsertPos == Preheader->end() && !Preheader->empty()) + InsertPos = std::prev(Preheader->end()); + + // Check if there's already a DS wait at this position + bool NeedInsert = true; + if (InsertPos != Preheader->end() && InsertPos != Preheader->begin()) { + auto CheckPos = std::prev(InsertPos); + if (CheckPos->getOpcode() == AMDGPU::S_WAIT_DSCNT_soft || + CheckPos->getOpcode() == AMDGPU::S_WAIT_DSCNT) { + if (CheckPos->getOperand(0).getImm() == 0) + NeedInsert = false; + else { + // Change existing wait to 0 + CheckPos->getOperand(0).setImm(0); + NeedInsert = false; + Modified = true; + LLVM_DEBUG(dbgs() << "DS Loop Opt: Changed existing DS_CNT wait to 0" + << " in preheader "; + Preheader->printName(dbgs()); dbgs() << "\n"); + } + } + } + + if (NeedInsert) { + DebugLoc DL; + if (InsertPos != Preheader->end()) + DL = InsertPos->getDebugLoc(); + BuildMI(*Preheader, InsertPos, DL, TII->get(AMDGPU::S_WAIT_DSCNT_soft)) + .addImm(0); + Modified = true; + LLVM_DEBUG(dbgs() << "DS Loop Opt: Inserted DS_CNT flush in preheader "; + Preheader->printName(dbgs()); dbgs() << " for loop at "; + LoopHeader->printName(dbgs()); dbgs() << "\n"); + } + } + + return Modified; +} + // Return true if it is better to flush the vmcnt counter in the preheader of // the given loop. We currently decide to flush in two situations: // 1. The loop contains vmem store(s), no vmem load and at least one use of a @@ -3250,6 +3313,10 @@ bool SIInsertWaitcnts::run(MachineFunction &MF) { } } } + + // Insert DS_CNT flushes in preheaders of loops that had wait counts relaxed. + Modified |= insertDSPreheaderFlushes(MF); + ReleaseVGPRInsts.clear(); PreheadersToFlush.clear(); LoopDSWaitOptCache.clear(); diff --git a/llvm/test/CodeGen/AMDGPU/waitcnt-loop-ds-opt-eligible.mir b/llvm/test/CodeGen/AMDGPU/waitcnt-loop-ds-opt-eligible.mir index 48fdabf255e6f..e6237338fda5b 100644 --- a/llvm/test/CodeGen/AMDGPU/waitcnt-loop-ds-opt-eligible.mir +++ b/llvm/test/CodeGen/AMDGPU/waitcnt-loop-ds-opt-eligible.mir @@ -17,6 +17,7 @@ # DBG: Loop DS Wait Opt: Loop at bb.1 - 16 DS loads, 8 WMMA/MFMA, {{[0-9]+}} total insts, eligible # DBG: Loop DS Wait Opt: Analyzed loop at bb.1 - 16 DS loads, HasBarrier=1, Valid=1 # DBG: DS Loop Opt: Relaxing DsCnt from 0 to 12 for: +# DBG: DS Loop Opt: Inserted DS_CNT flush in preheader bb.0 for loop at bb.1 --- | define amdgpu_kernel void @ds_loop_eligible() { ret void } @@ -31,9 +32,10 @@ machineFunctionInfo: isEntryFunction: true waveLimiter: false body: | + ; Check preheader: OPT adds S_WAIT_DSCNT 0 flush, NOOPT does not ; OPT: bb.0: - ; OPT-NOT: S_WAIT_DSCNT - ; OPT: S_BRANCH %bb.1 + ; OPT: S_WAIT_DSCNT_soft 0 + ; OPT-NEXT: S_BRANCH %bb.1 ; NOOPT: bb.0: ; NOOPT-NOT: S_WAIT_DSCNT `````````` </details> https://github.com/llvm/llvm-project/pull/171948 _______________________________________________ llvm-branch-commits mailing list [email protected] https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
