llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT-->

@llvm/pr-subscribers-backend-amdgpu

Author: None (hidekisaito)

<details>
<summary>Changes</summary>

…to handle them (4/4)

Add handling for same-iteration use/overwrite of DS load results:
- Track DS load destinations and detect when results are used or overwritten within the same iteration
- Compute FloorWaitCount for WMMAs that only use flushed loads

Add bailout for tensor_load_to_lds and LDS DMA writes after barrier

Add negative test based on profitability criteria

Assisted-by: Cursor / claude-4.5-opus-high

Depends on https://github.com/llvm/llvm-project/pull/171948
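To make the new bookkeeping concrete before diving into the patch: below is a minimal standalone sketch of the flush/renumber/floor computation. It is an illustration, not the pass itself; `std::map` and the free functions `flushThrough`/`finalizeTracking` are stand-ins for the pass's `DenseMap` and member state, and it assumes, as the patch does, that DS loads complete in FIFO order within an iteration.

```cpp
#include <cassert>
#include <map>
#include <utility>

// Standalone sketch of the same-iteration flush/renumber bookkeeping.
// Positions are 1-based load issue indices within one loop iteration.
struct LoopDSInfo {
  std::map<unsigned, unsigned> VGPRToLoadPosition; // reg unit -> load position
  unsigned TotalDSLoads = 0;
  unsigned FloorWaitCount = 0;
};

// A same-iteration use (or overwrite) of the load at `UsedPos` forces the
// baseline to wait for it; FIFO completion means loads 1..UsedPos all retire.
inline void flushThrough(unsigned UsedPos, unsigned &LastFlushed) {
  if (UsedPos > LastFlushed)
    LastFlushed = UsedPos;
}

// After scanning the block: drop flushed loads, renumber the survivors so the
// first surviving prefetch load becomes position 1, and record the counter
// "floor" left behind by the same-iteration wait.
inline void finalizeTracking(LoopDSInfo &Info, unsigned TotalLoads,
                             unsigned LastFlushed) {
  if (LastFlushed > 0) {
    std::map<unsigned, unsigned> NewMap;
    for (auto &[RegUnit, Pos] : Info.VGPRToLoadPosition)
      if (Pos > LastFlushed)
        NewMap[RegUnit] = Pos - LastFlushed;
    Info.VGPRToLoadPosition = std::move(NewMap);
    // Waiting for load N leaves (TotalLoads - N) loads in flight.
    Info.FloorWaitCount = TotalLoads - LastFlushed;
  }
  Info.TotalDSLoads = TotalLoads - LastFlushed;
}

int main() {
  LoopDSInfo Info;
  for (unsigned Pos = 1; Pos <= 16; ++Pos)
    Info.VGPRToLoadPosition[100 + Pos] = Pos; // fake reg units 101..116
  unsigned LastFlushed = 0;
  flushThrough(4, LastFlushed);              // load #4 used in-iteration
  finalizeTracking(Info, 16, LastFlushed);
  assert(Info.TotalDSLoads == 12);           // 12 prefetch loads survive
  assert(Info.FloorWaitCount == 12);         // the wait leaves 12 in flight
  assert(Info.VGPRToLoadPosition[105] == 1); // old #5 renumbered to #1
  return 0;
}
```

Overwrites are folded into the same path as uses because an overwritten destination forces the load to complete first (a write-after-write hazard), so the flush point is identical.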
---

Patch is 41.94 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/171952.diff

6 Files Affected:

- (modified) llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp (+93-6)
- (modified) llvm/test/CodeGen/AMDGPU/waitcnt-loop-ds-opt-eligible.mir (+1-1)
- (added) llvm/test/CodeGen/AMDGPU/waitcnt-loop-ds-opt-no-improvement.mir (+109)
- (added) llvm/test/CodeGen/AMDGPU/waitcnt-loop-ds-opt-same-iter-overwrite.mir (+111)
- (added) llvm/test/CodeGen/AMDGPU/waitcnt-loop-ds-opt-same-iter-use.mir (+107)
- (added) llvm/test/CodeGen/AMDGPU/waitcnt-loop-ds-opt-tensor-load.mir (+97)


``````````diff
diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
index 28bc57ed2db4e..55c0d72c125af 100644
--- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
@@ -468,6 +468,11 @@ class SIInsertWaitcnts {
     mutable bool RelaxationApplied = false;
     // Pointer to the last barrier in the loop (found during eligibility check)
     const MachineInstr *LastBarrier = nullptr;
+    // The wait count "floor" established by same-iteration uses/overwrites.
+    // When a DS load result is used in the same iteration, the baseline inserts
+    // a wait. This floor indicates the expected counter state after that wait.
+    // WMMAs that only use flushed loads can rely on this floor.
+    unsigned FloorWaitCount = 0;
   };
 
   // Cache of loop DS wait optimization info, keyed by loop header MBB.
@@ -2775,9 +2780,21 @@ void SIInsertWaitcnts::analyzeSingleBBLoopDSLoads(MachineLoop *ML) {
   // if one exists. LastBarrier was already found during eligibility check.
   // These are likely to be prefetch loads whose results are used in the next
   // iteration.
+  //
+  // If a load result is used or overwritten within the same iteration, the
+  // baseline will insert a wait before that instruction. Since DS loads
+  // complete in FIFO order, that wait also completes all earlier loads. So we
+  // can drop those "flushed" loads from our tracking and only consider
+  // subsequent loads as true prefetch loads. Overwrites also require the load
+  // to complete first to avoid write-after-write races.
   const MachineInstr *LastBarrier = Info.LastBarrier;
 
+  // Single pass: track DS load destinations, handle uses (which flush prior
+  // loads) and detect overwrites (which invalidate our analysis).
+  // TrackedLoads: (Register, Position) pairs for checking uses/overwrites
+  SmallVector<std::pair<Register, unsigned>, 64> TrackedLoads;
   unsigned LoadPosition = 0;
+  unsigned LastFlushedPosition = 0; // Loads up to this position will be flushed
   bool AfterLastBarrier = (LastBarrier == nullptr); // If no barrier, track all
 
   for (const MachineInstr &MI : *MBB) {
@@ -2789,6 +2806,42 @@ void SIInsertWaitcnts::analyzeSingleBBLoopDSLoads(MachineLoop *ML) {
     if (!AfterLastBarrier)
       continue;
 
+    // Check for instructions that write to LDS through DMA (global_load_lds,
+    // etc). These write to LDS but aren't DS instructions.
+    // Bail out if any appear after the barrier.
+    if (SIInstrInfo::mayWriteLDSThroughDMA(MI)) {
+      LLVM_DEBUG(
+          dbgs() << "Loop DS Wait Opt: LDS DMA write after last barrier, "
+                 << "skipping\n");
+      Info.Valid = false;
+      return;
+    }
+
+    // Check for tensor_load_to_lds instructions (MIMG, not caught by above)
+    if (MI.getOpcode() == AMDGPU::TENSOR_LOAD_TO_LDS ||
+        MI.getOpcode() == AMDGPU::TENSOR_LOAD_TO_LDS_D2) {
+      LLVM_DEBUG(dbgs() << "Loop DS Wait Opt: tensor_load_to_lds after last "
+                        << "barrier, skipping\n");
+      Info.Valid = false;
+      return;
+    }
+
+    // Check if this instruction uses or overwrites any tracked DS load
+    // destination. If so, baseline will have inserted a wait that flushes
+    // all loads up to that position (since DS loads complete in order).
+    // Overwrites also require the load to complete first to avoid races.
+    for (auto &[Reg, Position] : TrackedLoads) {
+      if (Position <= LastFlushedPosition)
+        continue; // Already flushed
+
+      if (MI.readsRegister(Reg, TRI) || MI.modifiesRegister(Reg, TRI)) {
+        LLVM_DEBUG(dbgs() << "Loop DS Wait Opt: DS load at position "
+                          << Position << " used/overwritten in same iteration, "
+                          << "flushing positions 1-" << Position << "\n");
+        LastFlushedPosition = std::max(LastFlushedPosition, Position);
+      }
+    }
+
     // Check DS instructions
     if (SIInstrInfo::isDS(MI)) {
       // DS stores after barrier not allowed - same counter, may complete
@@ -2806,6 +2859,7 @@ void SIInsertWaitcnts::analyzeSingleBBLoopDSLoads(MachineLoop *ML) {
       for (const MachineOperand &Op : MI.defs()) {
         if (Op.isReg() && Op.getReg().isPhysical() &&
             TRI->isVGPR(*MRI, Op.getReg())) {
+          TrackedLoads.emplace_back(Op.getReg(), LoadPosition);
           for (MCRegUnit Unit : TRI->regunits(Op.getReg())) {
             Info.VGPRToLoadPosition[static_cast<unsigned>(Unit)] =
                 LoadPosition;
@@ -2816,12 +2870,32 @@ void SIInsertWaitcnts::analyzeSingleBBLoopDSLoads(MachineLoop *ML) {
     }
   }
 
-  Info.TotalDSLoads = LoadPosition;
+  // Filter out flushed loads and renumber remaining ones
+  // Also compute the floor wait count - the wait established by same-iteration
+  // use
+  if (LastFlushedPosition > 0) {
+    DenseMap<unsigned, unsigned> NewMap;
+    for (auto &[RegUnit, Position] : Info.VGPRToLoadPosition) {
+      if (Position > LastFlushedPosition) {
+        NewMap[RegUnit] = Position - LastFlushedPosition;
+      }
+    }
+    Info.VGPRToLoadPosition = std::move(NewMap);
+    // FloorWaitCount: when same-iteration use waits for load N, it leaves
+    // (TotalLoads - N) loads in flight. For the next iteration's WMMAs,
+    // any that only use flushed loads are already covered by this wait.
+    Info.FloorWaitCount = LoadPosition - LastFlushedPosition;
+  } else {
+    Info.FloorWaitCount = 0;
+  }
+
+  Info.TotalDSLoads = LoadPosition - LastFlushedPosition;
   Info.Valid = Info.TotalDSLoads > 0;
 
   LLVM_DEBUG(dbgs() << "Loop DS Wait Opt: Analyzed loop at ";
             MBB->printName(dbgs());
-            dbgs() << " - " << Info.TotalDSLoads << " DS loads"
+            dbgs() << " - " << Info.TotalDSLoads << " DS loads, "
+                   << "FloorWaitCount=" << Info.FloorWaitCount
                    << ", HasBarrier=" << (LastBarrier != nullptr)
                    << ", Valid=" << Info.Valid << "\n");
 }
@@ -2851,13 +2925,26 @@ SIInsertWaitcnts::getOptimalDSWaitCount(MachineBasicBlock *LoopHeader,
     }
   }
 
-  if (MaxLoadPosition == 0)
-    return std::nullopt;
-
   // Optimal wait = TotalDSLoads - MaxLoadPosition
   // This means we wait until all loads up to and including MaxLoadPosition
   // have completed, but loads after it can still be in flight.
-  return Info.TotalDSLoads - MaxLoadPosition;
+  unsigned OptimalWait = Info.TotalDSLoads - MaxLoadPosition;
+
+  // If MaxLoadPosition == 0, this instruction only uses flushed loads
+  // (whose results are used in the same iteration). The same-iteration use
+  // will insert a wait that leaves FloorWaitCount loads in flight.
+  // So this instruction's needs are covered if OptimalWait >= FloorWaitCount.
+  // We return FloorWaitCount to indicate "can relax to this level".
+  if (MaxLoadPosition == 0 && Info.FloorWaitCount > 0) {
+    // All operands are from flushed loads - covered by same-iteration use's
+    // wait
+    return Info.FloorWaitCount;
+  }
+
+  if (MaxLoadPosition == 0)
+    return std::nullopt;
+
+  return OptimalWait;
 }
 
 // Try to apply DS loop wait optimization to relax conservative wait counts.
diff --git a/llvm/test/CodeGen/AMDGPU/waitcnt-loop-ds-opt-eligible.mir b/llvm/test/CodeGen/AMDGPU/waitcnt-loop-ds-opt-eligible.mir
index e6237338fda5b..ba0051ab03409 100644
--- a/llvm/test/CodeGen/AMDGPU/waitcnt-loop-ds-opt-eligible.mir
+++ b/llvm/test/CodeGen/AMDGPU/waitcnt-loop-ds-opt-eligible.mir
@@ -15,7 +15,7 @@
 # With opt: S_WAIT_DSCNT 12 (wait for only 4 loads, 12 remain in flight)
 #
 # DBG: Loop DS Wait Opt: Loop at bb.1 - 16 DS loads, 8 WMMA/MFMA, {{[0-9]+}} total insts, eligible
-# DBG: Loop DS Wait Opt: Analyzed loop at bb.1 - 16 DS loads, HasBarrier=1, Valid=1
+# DBG: Loop DS Wait Opt: Analyzed loop at bb.1 - 16 DS loads, FloorWaitCount=0, HasBarrier=1, Valid=1
 # DBG: DS Loop Opt: Relaxing DsCnt from 0 to 12 for:
 # DBG: DS Loop Opt: Inserted DS_CNT flush in preheader bb.0 for loop at bb.1
diff --git a/llvm/test/CodeGen/AMDGPU/waitcnt-loop-ds-opt-no-improvement.mir b/llvm/test/CodeGen/AMDGPU/waitcnt-loop-ds-opt-no-improvement.mir
new file mode 100644
index 0000000000000..3a4dd9924edaa
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/waitcnt-loop-ds-opt-no-improvement.mir
@@ -0,0 +1,109 @@
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 -run-pass=si-insert-waitcnts -amdgpu-waitcnt-loop-ds-opt=true -verify-machineinstrs -o - %s | FileCheck -check-prefix=OPT %s
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 -run-pass=si-insert-waitcnts -amdgpu-waitcnt-loop-ds-opt=false -verify-machineinstrs -o - %s | FileCheck -check-prefix=NOOPT %s
+
+# Test: preheader loads with slight reordering
+# Baseline is close to optimal, improvement below threshold of 4
+# Both OPT and NOOPT should produce the same wait count.
+
+--- |
+  define amdgpu_kernel void @ds_loop_no_improvement() { ret void }
+...
+
+---
+# OPT-LABEL: name: ds_loop_no_improvement
+# NOOPT-LABEL: name: ds_loop_no_improvement
+name: ds_loop_no_improvement
+tracksRegLiveness: true
+machineFunctionInfo:
+  isEntryFunction: true
+  waveLimiter: false
+body: |
+  ; Check preheader: neither OPT nor NOOPT adds flush (optimization doesn't apply)
+  ; OPT: bb.0:
+  ; OPT-NOT: S_WAIT_DSCNT
+  ; OPT: S_BRANCH %bb.1
+
+  ; NOOPT: bb.0:
+  ; NOOPT-NOT: S_WAIT_DSCNT
+  ; NOOPT: S_BRANCH %bb.1
+
+  bb.0:
+    successors: %bb.1
+    liveins: $sgpr0, $vgpr0
+
+    ; Preheader: DS loads with slight reordering (Pulled up Loads #5 and #6)
+    ; This creates a small mismatch with loop body order, so baseline is
+    ; close to optimal but not perfect. Improvement should be below threshold.
+    $vgpr26_vgpr27_vgpr28_vgpr29 = DS_READ_B128 $vgpr0, 64, 0, implicit $m0, implicit $exec
+    $vgpr30_vgpr31_vgpr32_vgpr33 = DS_READ_B128 $vgpr0, 80, 0, implicit $m0, implicit $exec
+    $vgpr10_vgpr11_vgpr12_vgpr13 = DS_READ_B128 $vgpr0, 0, 0, implicit $m0, implicit $exec
+    $vgpr14_vgpr15_vgpr16_vgpr17 = DS_READ_B128 $vgpr0, 16, 0, implicit $m0, implicit $exec
+    $vgpr18_vgpr19_vgpr20_vgpr21 = DS_READ_B128 $vgpr0, 32, 0, implicit $m0, implicit $exec
+    $vgpr22_vgpr23_vgpr24_vgpr25 = DS_READ_B128 $vgpr0, 48, 0, implicit $m0, implicit $exec
+    $vgpr34_vgpr35_vgpr36_vgpr37 = DS_READ_B128 $vgpr0, 96, 0, implicit $m0, implicit $exec
+    $vgpr38_vgpr39_vgpr40_vgpr41 = DS_READ_B128 $vgpr0, 112, 0, implicit $m0, implicit $exec
+    $vgpr42_vgpr43_vgpr44_vgpr45 = DS_READ_B128 $vgpr0, 128, 0, implicit $m0, implicit $exec
+    $vgpr46_vgpr47_vgpr48_vgpr49 = DS_READ_B128 $vgpr0, 144, 0, implicit $m0, implicit $exec
+    $vgpr50_vgpr51_vgpr52_vgpr53 = DS_READ_B128 $vgpr0, 160, 0, implicit $m0, implicit $exec
+    $vgpr54_vgpr55_vgpr56_vgpr57 = DS_READ_B128 $vgpr0, 176, 0, implicit $m0, implicit $exec
+    $vgpr58_vgpr59_vgpr60_vgpr61 = DS_READ_B128 $vgpr0, 192, 0, implicit $m0, implicit $exec
+    $vgpr62_vgpr63_vgpr64_vgpr65 = DS_READ_B128 $vgpr0, 208, 0, implicit $m0, implicit $exec
+    $vgpr66_vgpr67_vgpr68_vgpr69 = DS_READ_B128 $vgpr0, 224, 0, implicit $m0, implicit $exec
+    $vgpr70_vgpr71_vgpr72_vgpr73 = DS_READ_B128 $vgpr0, 240, 0, implicit $m0, implicit $exec
+    S_BRANCH %bb.1
+
+  ; Preheader has loads #5,#6 first, then #1-4, then rest.
+  ; First WMMA uses loads #1-4 (vgpr10-25), which are at positions 3-6 in preheader.
+  ; Baseline produces S_WAIT_DSCNT 10 (wait for first 6 loads to complete).
+  ; Optimal would be 12 (only need first 4 loads in loop body order).
+  ; Improvement = 12 - 10 = 2, which is below threshold of 4, so no relaxation.
+  ; OPT: bb.1:
+  ; OPT: S_WAIT_DSCNT 10
+  ; OPT-NEXT: early-clobber $vgpr80{{.*}} = V_WMMA
+
+  ; NOOPT: bb.1:
+  ; NOOPT: S_WAIT_DSCNT 10
+  ; NOOPT-NEXT: early-clobber $vgpr80{{.*}} = V_WMMA
+
+  bb.1:
+    successors: %bb.1, %bb.2
+    liveins: $sgpr0, $vgpr0, $vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17, $vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25, $vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33, $vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41, $vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49, $vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57, $vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65, $vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73, $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87, $vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95
+
+    ; First WMMA uses vgpr10-25 (loads #1-4)
+    early-clobber $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87 = V_WMMA_F32_16X16X32_F16_w32_twoaddr 8, $vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17, 8, $vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25, 8, killed $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87, 0, 0, 0, 0, implicit $exec
+    early-clobber $vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95 = V_WMMA_F32_16X16X32_F16_w32_twoaddr 8, $vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33, 8, $vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41, 8, killed $vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, 0, 0, 0, 0, implicit $exec
+    early-clobber $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87 = V_WMMA_F32_16X16X32_F16_w32_twoaddr 8, $vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49, 8, $vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57, 8, killed $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87, 0, 0, 0, 0, implicit $exec
+    early-clobber $vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95 = V_WMMA_F32_16X16X32_F16_w32_twoaddr 8, $vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65, 8, $vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73, 8, killed $vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, 0, 0, 0, 0, implicit $exec
+    early-clobber $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87 = V_WMMA_F32_16X16X32_F16_w32_twoaddr 8, $vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17, 8, $vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25, 8, killed $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87, 0, 0, 0, 0, implicit $exec
+    early-clobber $vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95 = V_WMMA_F32_16X16X32_F16_w32_twoaddr 8, $vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33, 8, $vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41, 8, killed $vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, 0, 0, 0, 0, implicit $exec
+    early-clobber $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87 = V_WMMA_F32_16X16X32_F16_w32_twoaddr 8, $vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49, 8, $vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57, 8, killed $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87, 0, 0, 0, 0, implicit $exec
+    early-clobber $vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95 = V_WMMA_F32_16X16X32_F16_w32_twoaddr 8, $vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65, 8, $vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73, 8, killed $vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, 0, 0, 0, 0, implicit $exec
+
+    S_BARRIER
+
+    ; Prefetch DS loads for next iteration (FORWARD order)
+    $vgpr10_vgpr11_vgpr12_vgpr13 = DS_READ_B128 $vgpr0, 0, 0, implicit $m0, implicit $exec
+    $vgpr14_vgpr15_vgpr16_vgpr17 = DS_READ_B128 $vgpr0, 16, 0, implicit $m0, implicit $exec
+    $vgpr18_vgpr19_vgpr20_vgpr21 = DS_READ_B128 $vgpr0, 32, 0, implicit $m0, implicit $exec
+    $vgpr22_vgpr23_vgpr24_vgpr25 = DS_READ_B128 $vgpr0, 48, 0, implicit $m0, implicit $exec
+    $vgpr26_vgpr27_vgpr28_vgpr29 = DS_READ_B128 $vgpr0, 64, 0, implicit $m0, implicit $exec
+    $vgpr30_vgpr31_vgpr32_vgpr33 = DS_READ_B128 $vgpr0, 80, 0, implicit $m0, implicit $exec
+    $vgpr34_vgpr35_vgpr36_vgpr37 = DS_READ_B128 $vgpr0, 96, 0, implicit $m0, implicit $exec
+    $vgpr38_vgpr39_vgpr40_vgpr41 = DS_READ_B128 $vgpr0, 112, 0, implicit $m0, implicit $exec
+    $vgpr42_vgpr43_vgpr44_vgpr45 = DS_READ_B128 $vgpr0, 128, 0, implicit $m0, implicit $exec
+    $vgpr46_vgpr47_vgpr48_vgpr49 = DS_READ_B128 $vgpr0, 144, 0, implicit $m0, implicit $exec
+    $vgpr50_vgpr51_vgpr52_vgpr53 = DS_READ_B128 $vgpr0, 160, 0, implicit $m0, implicit $exec
+    $vgpr54_vgpr55_vgpr56_vgpr57 = DS_READ_B128 $vgpr0, 176, 0, implicit $m0, implicit $exec
+    $vgpr58_vgpr59_vgpr60_vgpr61 = DS_READ_B128 $vgpr0, 192, 0, implicit $m0, implicit $exec
+    $vgpr62_vgpr63_vgpr64_vgpr65 = DS_READ_B128 $vgpr0, 208, 0, implicit $m0, implicit $exec
+    $vgpr66_vgpr67_vgpr68_vgpr69 = DS_READ_B128 $vgpr0, 224, 0, implicit $m0, implicit $exec
+    $vgpr70_vgpr71_vgpr72_vgpr73 = DS_READ_B128 $vgpr0, 240, 0, implicit $m0, implicit $exec
+
+    $sgpr0 = S_ADD_I32 $sgpr0, -1, implicit-def $scc
+    S_CBRANCH_SCC1 %bb.1, implicit $scc
+    S_BRANCH %bb.2
+
+  bb.2:
+    S_ENDPGM 0
+...
+
diff --git a/llvm/test/CodeGen/AMDGPU/waitcnt-loop-ds-opt-same-iter-overwrite.mir b/llvm/test/CodeGen/AMDGPU/waitcnt-loop-ds-opt-same-iter-overwrite.mir
new file mode 100644
index 0000000000000..f2a38ab7f3d8e
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/waitcnt-loop-ds-opt-same-iter-overwrite.mir
@@ -0,0 +1,111 @@
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 -run-pass=si-insert-waitcnts -amdgpu-waitcnt-loop-ds-opt=true -verify-machineinstrs -o - %s | FileCheck -check-prefix=OPT %s
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 -run-pass=si-insert-waitcnts -amdgpu-waitcnt-loop-ds-opt=false -verify-machineinstrs -o - %s | FileCheck -check-prefix=NOOPT %s
+
+# Test: same-iteration OVERWRITE of a DS load result (load #4 = vgpr22-25)
+# This flushes loads #1-4, leaving loads #5-16 (12 loads) as prefetch.
+# Tests that the analysis correctly handles same-iteration overwrites.
+
+--- |
+  define amdgpu_kernel void @ds_loop_same_iter_overwrite() { ret void }
+...
+
+---
+# OPT-LABEL: name: ds_loop_same_iter_overwrite
+# NOOPT-LABEL: name: ds_loop_same_iter_overwrite
+name: ds_loop_same_iter_overwrite
+tracksRegLiveness: true
+machineFunctionInfo:
+  isEntryFunction: true
+  waveLimiter: false
+body: |
+  ; OPT: bb.0:
+  ; OPT: S_WAIT_DSCNT_soft 0
+  ; OPT-NEXT: S_BRANCH %bb.1
+
+  ; NOOPT: bb.0:
+  ; NOOPT-NOT: S_WAIT_DSCNT
+  ; NOOPT: S_BRANCH %bb.1
+
+  bb.0:
+    successors: %bb.1
+    liveins: $sgpr0, $vgpr0
+
+    ; Preheader: DS loads in REVERSE order
+    $vgpr70_vgpr71_vgpr72_vgpr73 = DS_READ_B128 $vgpr0, 240, 0, implicit $m0, implicit $exec
+    $vgpr66_vgpr67_vgpr68_vgpr69 = DS_READ_B128 $vgpr0, 224, 0, implicit $m0, implicit $exec
+    $vgpr62_vgpr63_vgpr64_vgpr65 = DS_READ_B128 $vgpr0, 208, 0, implicit $m0, implicit $exec
+    $vgpr58_vgpr59_vgpr60_vgpr61 = DS_READ_B128 $vgpr0, 192, 0, implicit $m0, implicit $exec
+    $vgpr54_vgpr55_vgpr56_vgpr57 = DS_READ_B128 $vgpr0, 176, 0, implicit $m0, implicit $exec
+    $vgpr50_vgpr51_vgpr52_vgpr53 = DS_READ_B128 $vgpr0, 160, 0, implicit $m0, implicit $exec
+    $vgpr46_vgpr47_vgpr48_vgpr49 = DS_READ_B128 $vgpr0, 144, 0, implicit $m0, implicit $exec
+    $vgpr42_vgpr43_vgpr44_vgpr45 = DS_READ_B128 $vgpr0, 128, 0, implicit $m0, implicit $exec
+    $vgpr38_vgpr39_vgpr40_vgpr41 = DS_READ_B128 $vgpr0, 112, 0, implicit $m0, implicit $exec
+    $vgpr34_vgpr35_vgpr36_vgpr37 = DS_READ_B128 $vgpr0, 96, 0, implicit $m0, implicit $exec
+    $vgpr30_vgpr31_vgpr32_vgpr33 = DS_READ_B128 $vgpr0, 80, 0, implicit $m0, implicit $exec
+    $vgpr26_vgpr27_vgpr28_vgpr29 = DS_READ_B128 $vgpr0, 64, 0, implicit $m0, implicit $exec
+    $vgpr22_vgpr23_vgpr24_vgpr25 = DS_READ_B128 $vgpr0, 48, 0, implicit $m0, implicit $exec
+    $vgpr18_vgpr19_vgpr20_vgpr21 = DS_READ_B128 $vgpr0, 32, 0, implicit $m0, implicit $exec
+    $vgpr14_vgpr15_vgpr16_vgpr17 = DS_READ_B128 $vgpr0, 16, 0, implicit $m0, implicit $exec
+    $vgpr10_vgpr11_vgpr12_vgpr13 = DS_READ_B128 $vgpr0, 0, 0, implicit $m0, implicit $exec
+    S_BRANCH %bb.1
+
+  ; First WMMA uses flushed loads #1-4, gets FloorWaitCount = 12
+  ; Second WMMA uses loads #5-8 (positions 1-4), gets 12-4 = 8
+  ; OPT: bb.1:
+  ; OPT: S_WAIT_DSCNT 12
+  ; OPT-NEXT: early-clobber $vgpr80{{.*}} = V_WMMA
+  ; OPT: S_WAIT_DSCNT 8
+  ; OPT-NEXT: early-clobber $vgpr88{{.*}} = V_WMMA
+
+  ; NOOPT: bb.1:
+  ; NOOPT: S_WAIT_DSCNT 0
+  ; NOOPT-NEXT: early-clobber $vgpr80{{.*}} = V_WMMA
+
+  bb.1:
+    successors: %bb.1, %bb.2
+    liveins: $sgpr0, $vgpr0, $vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17, $vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25, $... [truncated]
``````````
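To connect the arithmetic to the CHECK lines in the overwrite test above, here is a minimal sketch of the wait-count selection as modified by this patch. `optimalDSWait` is an illustrative stand-in, not the pass's actual `getOptimalDSWaitCount` signature, and the numbers assume the overwrite test's shape: 16 loads with loads #1-4 flushed, leaving 12 prefetch loads and FloorWaitCount = 12.

```cpp
#include <cassert>
#include <optional>

// Sketch of the wait-count choice: a WMMA whose newest prefetch operand is
// load `MaxLoadPosition` only needs loads 1..MaxLoadPosition complete, so
// (TotalDSLoads - MaxLoadPosition) loads may still be in flight. If it uses
// only flushed loads (MaxLoadPosition == 0), the same-iteration wait already
// covers it, so the floor is returned instead.
std::optional<unsigned> optimalDSWait(unsigned TotalDSLoads,
                                      unsigned FloorWaitCount,
                                      unsigned MaxLoadPosition) {
  if (MaxLoadPosition == 0 && FloorWaitCount > 0)
    return FloorWaitCount; // covered by the same-iteration use's wait
  if (MaxLoadPosition == 0)
    return std::nullopt;   // no tracked operands, nothing to relax
  return TotalDSLoads - MaxLoadPosition;
}

int main() {
  // Overwrite test: 16 loads, #1-4 flushed -> 12 prefetch loads, floor 12.
  // First WMMA reads only flushed loads     -> S_WAIT_DSCNT 12.
  assert(*optimalDSWait(12, 12, 0) == 12);
  // Second WMMA reads renumbered loads #1-4 -> 12 - 4 = S_WAIT_DSCNT 8.
  assert(*optimalDSWait(12, 12, 4) == 8);
  return 0;
}
```

The first WMMA therefore keeps S_WAIT_DSCNT 12 (the floor), while the second relaxes to 8, matching the OPT checks in the test.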
</details>

https://github.com/llvm/llvm-project/pull/171952

_______________________________________________
llvm-branch-commits mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
