Author: Pankaj Dwivedi Date: 2026-01-12T17:35:06+05:30 New Revision: 3dfb782333bf929945f63e5b0b1cad378b0bd87a
URL: https://github.com/llvm/llvm-project/commit/3dfb782333bf929945f63e5b0b1cad378b0bd87a DIFF: https://github.com/llvm/llvm-project/commit/3dfb782333bf929945f63e5b0b1cad378b0bd87a.diff LOG: [AMDGPU][SIInsertWaitcnt] Implement Waitcnt Expansion for Profiling (#169345) Reference issue: https://github.com/ROCm/llvm-project/issues/67 This patch adds support for expanding s_waitcnt instructions into sequences with decreasing counter values, enabling PC-sampling profilers to identify which specific memory operation is causing a stall. This is controlled via: Clang flag: -mamdgpu-expand-waitcnt-profiling / -mno-amdgpu-expand-waitcnt-profiling Function attribute: "amdgpu-expand-waitcnt-profiling" When enabled, instead of emitting a single waitcnt, the pass generates a sequence that waits for each outstanding operation individually. For example, if there are 5 outstanding memory operations and the target is to wait until 2 remain: **Original**: s_waitcnt vmcnt(2) **Expanded**: s_waitcnt vmcnt(4) s_waitcnt vmcnt(3) s_waitcnt vmcnt(2) The expansion starts from (Outstanding - 1) down to the target value, since waitcnt(Outstanding) would be a no-op (the counter is already at that value). 
- Uses ScoreBrackets to determine the actual number of outstanding operations - Only expands when operations complete in-order - Skips expansion for mixed event types (e.g., LDS+SMEM on same counter) - Skips expansion for scalar memory (always out-of-order) Related previous work for reference - **PR**: llvm/llvm-project#79236 (related `-amdgpu-waitcnt-forcezero`) --------- Co-authored-by: Pierre van Houtryve <[email protected]> Added: llvm/test/CodeGen/AMDGPU/expand-waitcnt-profiling.ll Modified: clang/include/clang/Basic/CodeGenOptions.def clang/include/clang/Options/Options.td clang/lib/CodeGen/Targets/AMDGPU.cpp llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h Removed: ################################################################################ diff --git a/clang/include/clang/Basic/CodeGenOptions.def b/clang/include/clang/Basic/CodeGenOptions.def index 6cdbffc456193..baf8b093c10e6 100644 --- a/clang/include/clang/Basic/CodeGenOptions.def +++ b/clang/include/clang/Basic/CodeGenOptions.def @@ -466,6 +466,10 @@ CODEGENOPT(AAPCSBitfieldWidth, 1, 1, Benign) /// propagate signaling NaN inputs per IEEE 754-2008 (AMDGPU Only) CODEGENOPT(EmitIEEENaNCompliantInsts, 1, 1, Benign) +/// Enable expanded waitcnt for profiling (AMDGPU Only) +/// Expands s_waitcnt instructions to help PC-sampling profilers identify stalls. +CODEGENOPT(AMDGPUExpandWaitcntProfiling, 1, 0, Benign) + // Whether to emit Swift Async function extended frame information: auto, // never, always. ENUM_CODEGENOPT(SwiftAsyncFramePointer, SwiftAsyncFramePointerKind, 2, diff --git a/clang/include/clang/Options/Options.td b/clang/include/clang/Options/Options.td index 6a72931727a7c..d48ca15864060 100644 --- a/clang/include/clang/Options/Options.td +++ b/clang/include/clang/Options/Options.td @@ -5577,6 +5577,13 @@ defm amdgpu_ieee : BoolMOption<"amdgpu-ieee", "This option changes the ABI. 
(AMDGPU only)">, NegFlag<SetFalse, [], [ClangOption, CC1Option]>>; +defm amdgpu_expand_waitcnt_profiling : BoolMOption<"amdgpu-expand-waitcnt-profiling", + CodeGenOpts<"AMDGPUExpandWaitcntProfiling">, DefaultFalse, + PosFlag<SetTrue, [], [ClangOption, CC1Option], "Expand s_waitcnt instructions to help " + "PC-sampling profilers identify memory stalls. Instead of a single waitcnt(target), " + "emits waitcnt(N-1), waitcnt(N-2), ..., waitcnt(target). (AMDGPU only)">, + NegFlag<SetFalse, [], [ClangOption]>>; + def mcode_object_version_EQ : Joined<["-"], "mcode-object-version=">, Group<m_Group>, HelpText<"Specify code object ABI version. Defaults to 6. (AMDGPU only)">, Visibility<[ClangOption, FlangOption, CC1Option, FC1Option]>, diff --git a/clang/lib/CodeGen/Targets/AMDGPU.cpp b/clang/lib/CodeGen/Targets/AMDGPU.cpp index 0ab6c753b8bad..4bc9557b26b52 100644 --- a/clang/lib/CodeGen/Targets/AMDGPU.cpp +++ b/clang/lib/CodeGen/Targets/AMDGPU.cpp @@ -443,6 +443,8 @@ void AMDGPUTargetCodeGenInfo::setTargetAttributes( setFunctionDeclAttributes(FD, F, M); if (!getABIInfo().getCodeGenOpts().EmitIEEENaNCompliantInsts) F->addFnAttr("amdgpu-ieee", "false"); + if (getABIInfo().getCodeGenOpts().AMDGPUExpandWaitcntProfiling) + F->addFnAttr("amdgpu-expand-waitcnt-profiling"); } unsigned AMDGPUTargetCodeGenInfo::getDeviceKernelCallingConv() const { diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp index c0f9be77f3adc..4bf7c65daae72 100644 --- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp +++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp @@ -105,6 +105,35 @@ auto inst_counter_types(InstCounterType MaxCounter = NUM_INST_CNTS) { return enum_seq(LOAD_CNT, MaxCounter); } +// Get the maximum wait count value for a given counter type. 
+static unsigned getWaitCountMax(const AMDGPU::HardwareLimits &Limits, + InstCounterType T) { + switch (T) { + case LOAD_CNT: + return Limits.LoadcntMax; + case DS_CNT: + return Limits.DscntMax; + case EXP_CNT: + return Limits.ExpcntMax; + case STORE_CNT: + return Limits.StorecntMax; + case SAMPLE_CNT: + return Limits.SamplecntMax; + case BVH_CNT: + return Limits.BvhcntMax; + case KM_CNT: + return Limits.KmcntMax; + case X_CNT: + return Limits.XcntMax; + case VA_VDST: + return Limits.VaVdstMax; + case VM_VSRC: + return Limits.VmVsrcMax; + default: + return 0; + } +} + /// Integer IDs used to track vector memory locations we may have to wait on. /// Encoded as u16 chunks: /// @@ -140,19 +169,6 @@ static constexpr VMEMID toVMEMID(MCRegUnit RU) { return static_cast<unsigned>(RU); } -struct HardwareLimits { - unsigned LoadcntMax; // Corresponds to VMcnt prior to gfx12. - unsigned ExpcntMax; - unsigned DscntMax; // Corresponds to LGKMcnt prior to gfx12. - unsigned StorecntMax; // Corresponds to VScnt in gfx10/gfx11. - unsigned SamplecntMax; // gfx12+ only. - unsigned BvhcntMax; // gfx12+ only. - unsigned KmcntMax; // gfx12+ only. - unsigned XcntMax; // gfx1250. - unsigned VaVdstMax; // gfx12+ expert mode only. - unsigned VmVsrcMax; // gfx12+ expert mode only. 
-}; - #define AMDGPU_DECLARE_WAIT_EVENTS(DECL) \ DECL(VMEM_ACCESS) /* vmem read & write (pre-gfx10), vmem read (gfx10+) */ \ DECL(VMEM_SAMPLER_READ_ACCESS) /* vmem SAMPLER read (gfx12+ only) */ \ @@ -314,19 +330,27 @@ class WaitcntGenerator { AMDGPU::IsaVersion IV; InstCounterType MaxCounter; bool OptNone; + bool ExpandWaitcntProfiling = false; + const AMDGPU::HardwareLimits *Limits = nullptr; public: WaitcntGenerator() = default; - WaitcntGenerator(const MachineFunction &MF, InstCounterType MaxCounter) + WaitcntGenerator(const MachineFunction &MF, InstCounterType MaxCounter, + const AMDGPU::HardwareLimits *Limits) : ST(&MF.getSubtarget<GCNSubtarget>()), TII(ST->getInstrInfo()), IV(AMDGPU::getIsaVersion(ST->getCPU())), MaxCounter(MaxCounter), OptNone(MF.getFunction().hasOptNone() || - MF.getTarget().getOptLevel() == CodeGenOptLevel::None) {} + MF.getTarget().getOptLevel() == CodeGenOptLevel::None), + ExpandWaitcntProfiling( + MF.getFunction().hasFnAttribute("amdgpu-expand-waitcnt-profiling")), + Limits(Limits) {} // Return true if the current function should be compiled with no // optimization. bool isOptNone() const { return OptNone; } + const AMDGPU::HardwareLimits &getLimits() const { return *Limits; } + // Edits an existing sequence of wait count instructions according // to an incoming Waitcnt value, which is itself updated to reflect // any new wait count instructions which may need to be generated by @@ -348,9 +372,11 @@ class WaitcntGenerator { // Generates new wait count instructions according to the value of // Wait, returning true if any new instructions were created. + // If ScoreBrackets is provided, it can be used for profiling expansion. 
virtual bool createNewWaitcnt(MachineBasicBlock &Block, MachineBasicBlock::instr_iterator It, - AMDGPU::Waitcnt Wait) = 0; + AMDGPU::Waitcnt Wait, + WaitcntBrackets *ScoreBrackets = nullptr) = 0; // Returns an array of bit masks which can be used to map values in // WaitEventType to corresponding counter values in InstCounterType. @@ -375,7 +401,10 @@ class WaitcntGenerator { class WaitcntGeneratorPreGFX12 : public WaitcntGenerator { public: - using WaitcntGenerator::WaitcntGenerator; + WaitcntGeneratorPreGFX12() = default; + WaitcntGeneratorPreGFX12(const MachineFunction &MF, + const AMDGPU::HardwareLimits *Limits) + : WaitcntGenerator(MF, NUM_NORMAL_INST_CNTS, Limits) {} bool applyPreexistingWaitcnt(WaitcntBrackets &ScoreBrackets, @@ -384,7 +413,8 @@ class WaitcntGeneratorPreGFX12 : public WaitcntGenerator { bool createNewWaitcnt(MachineBasicBlock &Block, MachineBasicBlock::instr_iterator It, - AMDGPU::Waitcnt Wait) override; + AMDGPU::Waitcnt Wait, + WaitcntBrackets *ScoreBrackets = nullptr) override; const unsigned *getWaitEventMask() const override { assert(ST); @@ -416,8 +446,10 @@ class WaitcntGeneratorGFX12Plus : public WaitcntGenerator { public: WaitcntGeneratorGFX12Plus() = default; WaitcntGeneratorGFX12Plus(const MachineFunction &MF, - InstCounterType MaxCounter, bool IsExpertMode) - : WaitcntGenerator(MF, MaxCounter), IsExpertMode(IsExpertMode) {} + InstCounterType MaxCounter, + const AMDGPU::HardwareLimits *Limits, + bool IsExpertMode) + : WaitcntGenerator(MF, MaxCounter, Limits), IsExpertMode(IsExpertMode) {} bool applyPreexistingWaitcnt(WaitcntBrackets &ScoreBrackets, @@ -426,7 +458,8 @@ class WaitcntGeneratorGFX12Plus : public WaitcntGenerator { bool createNewWaitcnt(MachineBasicBlock &Block, MachineBasicBlock::instr_iterator It, - AMDGPU::Waitcnt Wait) override; + AMDGPU::Waitcnt Wait, + WaitcntBrackets *ScoreBrackets = nullptr) override; const unsigned *getWaitEventMask() const override { assert(ST); @@ -494,7 +527,7 @@ class SIInsertWaitcnts { // 
message. DenseSet<MachineInstr *> ReleaseVGPRInsts; - HardwareLimits Limits; + AMDGPU::HardwareLimits Limits; public: SIInsertWaitcnts(MachineLoopInfo *MLI, MachinePostDominatorTree *PDT, @@ -505,33 +538,7 @@ class SIInsertWaitcnts { (void)ForceVMCounter; } - unsigned getWaitCountMax(InstCounterType T) const { - switch (T) { - case LOAD_CNT: - return Limits.LoadcntMax; - case DS_CNT: - return Limits.DscntMax; - case EXP_CNT: - return Limits.ExpcntMax; - case STORE_CNT: - return Limits.StorecntMax; - case SAMPLE_CNT: - return Limits.SamplecntMax; - case BVH_CNT: - return Limits.BvhcntMax; - case KM_CNT: - return Limits.KmcntMax; - case X_CNT: - return Limits.XcntMax; - case VA_VDST: - return Limits.VaVdstMax; - case VM_VSRC: - return Limits.VmVsrcMax; - default: - break; - } - return 0; - } + const AMDGPU::HardwareLimits &getLimits() const { return Limits; } bool shouldFlushVmCnt(MachineLoop *ML, const WaitcntBrackets &Brackets); bool isPreheaderToFlush(MachineBasicBlock &MBB, @@ -762,7 +769,7 @@ class WaitcntBrackets { unsigned getPendingGDSWait() const { return std::min(getScoreUB(DS_CNT) - LastGDS, - Context->getWaitCountMax(DS_CNT) - 1); + getWaitCountMax(Context->getLimits(), DS_CNT) - 1); } void setPendingGDS() { LastGDS = ScoreUBs[DS_CNT]; } @@ -789,8 +796,8 @@ class WaitcntBrackets { } void setStateOnFunctionEntryOrReturn() { - setScoreUB(STORE_CNT, - getScoreUB(STORE_CNT) + Context->getWaitCountMax(STORE_CNT)); + setScoreUB(STORE_CNT, getScoreUB(STORE_CNT) + + getWaitCountMax(Context->getLimits(), STORE_CNT)); PendingEvents |= Context->WaitEventMaskForInst[STORE_CNT]; } @@ -846,8 +853,9 @@ class WaitcntBrackets { if (T != EXP_CNT) return; - if (getScoreRange(EXP_CNT) > Context->getWaitCountMax(EXP_CNT)) - ScoreLBs[EXP_CNT] = ScoreUBs[EXP_CNT] - Context->getWaitCountMax(EXP_CNT); + if (getScoreRange(EXP_CNT) > getWaitCountMax(Context->getLimits(), EXP_CNT)) + ScoreLBs[EXP_CNT] = + ScoreUBs[EXP_CNT] - getWaitCountMax(Context->getLimits(), EXP_CNT); } void 
setRegScore(MCPhysReg Reg, InstCounterType T, unsigned Val) { @@ -1352,8 +1360,8 @@ void WaitcntBrackets::determineWaitForScore(InstCounterType T, } else { // If a counter has been maxed out avoid overflow by waiting for // MAX(CounterType) - 1 instead. - unsigned NeededWait = - std::min(UB - ScoreToWait, Context->getWaitCountMax(T) - 1); + unsigned NeededWait = std::min( + UB - ScoreToWait, getWaitCountMax(Context->getLimits(), T) - 1); addWait(Wait, T, NeededWait); } } @@ -1682,38 +1690,109 @@ bool WaitcntGeneratorPreGFX12::applyPreexistingWaitcnt( /// required counters in \p Wait bool WaitcntGeneratorPreGFX12::createNewWaitcnt( MachineBasicBlock &Block, MachineBasicBlock::instr_iterator It, - AMDGPU::Waitcnt Wait) { + AMDGPU::Waitcnt Wait, WaitcntBrackets *ScoreBrackets) { assert(ST); assert(isNormalMode(MaxCounter)); bool Modified = false; const DebugLoc &DL = Block.findDebugLoc(It); + // Helper to emit expanded waitcnt sequence for profiling. + // Emits waitcnts from (Outstanding-1) down to Target, or just Target if + // nothing to expand. The EmitWaitcnt callback emits a single waitcnt. + auto EmitExpandedWaitcnt = [&](unsigned Outstanding, unsigned Target, + auto EmitWaitcnt) { + if (Outstanding > Target) { + for (unsigned i = Outstanding - 1; i >= Target && i != ~0u; --i) { + EmitWaitcnt(i); + Modified = true; + } + } else { + EmitWaitcnt(Target); + Modified = true; + } + }; + // Waits for VMcnt, LKGMcnt and/or EXPcnt are encoded together into a // single instruction while VScnt has its own instruction. if (Wait.hasWaitExceptStoreCnt()) { - unsigned Enc = AMDGPU::encodeWaitcnt(IV, Wait); - [[maybe_unused]] auto SWaitInst = + // If profiling expansion is enabled and we have score brackets, + // emit an expanded sequence + if (ExpandWaitcntProfiling && ScoreBrackets) { + // Check if any of the counters to be waited on are out-of-order. + // If so, fall back to normal (non-expanded) behavior since expansion + // would provide misleading profiling information. 
+ bool AnyOutOfOrder = false; + for (auto CT : {LOAD_CNT, DS_CNT, EXP_CNT}) { + unsigned &WaitCnt = getCounterRef(Wait, CT); + if (WaitCnt != ~0u && ScoreBrackets->counterOutOfOrder(CT)) { + AnyOutOfOrder = true; + break; + } + } + + if (AnyOutOfOrder) { + // Fall back to non-expanded wait + unsigned Enc = AMDGPU::encodeWaitcnt(IV, Wait); BuildMI(Block, It, DL, TII->get(AMDGPU::S_WAITCNT)).addImm(Enc); - Modified = true; + Modified = true; + } else { + // All counters are in-order, safe to expand + for (auto CT : {LOAD_CNT, DS_CNT, EXP_CNT}) { + unsigned &WaitCnt = getCounterRef(Wait, CT); + if (WaitCnt == ~0u) + continue; - LLVM_DEBUG(dbgs() << "PreGFX12::createNewWaitcnt\n"; - if (It != Block.instr_end()) dbgs() << "Old Instr: " << *It; - dbgs() << "New Instr: " << *SWaitInst << '\n'); + unsigned Outstanding = std::min(ScoreBrackets->getScoreUB(CT) - + ScoreBrackets->getScoreLB(CT), + getWaitCountMax(getLimits(), CT) - 1); + EmitExpandedWaitcnt(Outstanding, WaitCnt, [&](unsigned Count) { + AMDGPU::Waitcnt W; + getCounterRef(W, CT) = Count; + BuildMI(Block, It, DL, TII->get(AMDGPU::S_WAITCNT)) + .addImm(AMDGPU::encodeWaitcnt(IV, W)); + }); + } + } + } else { + // Normal behavior: emit single combined waitcnt + unsigned Enc = AMDGPU::encodeWaitcnt(IV, Wait); + [[maybe_unused]] auto SWaitInst = + BuildMI(Block, It, DL, TII->get(AMDGPU::S_WAITCNT)).addImm(Enc); + Modified = true; + + LLVM_DEBUG(dbgs() << "PreGFX12::createNewWaitcnt\n"; + if (It != Block.instr_end()) dbgs() << "Old Instr: " << *It; + dbgs() << "New Instr: " << *SWaitInst << '\n'); + } } if (Wait.hasWaitStoreCnt()) { assert(ST->hasVscnt()); - [[maybe_unused]] auto SWaitInst = + if (ExpandWaitcntProfiling && ScoreBrackets && Wait.StoreCnt != ~0u && + !ScoreBrackets->counterOutOfOrder(STORE_CNT)) { + // Only expand if counter is not out-of-order + unsigned Outstanding = + std::min(ScoreBrackets->getScoreUB(STORE_CNT) - + ScoreBrackets->getScoreLB(STORE_CNT), + getWaitCountMax(getLimits(), STORE_CNT) - 
1); + EmitExpandedWaitcnt(Outstanding, Wait.StoreCnt, [&](unsigned Count) { BuildMI(Block, It, DL, TII->get(AMDGPU::S_WAITCNT_VSCNT)) .addReg(AMDGPU::SGPR_NULL, RegState::Undef) - .addImm(Wait.StoreCnt); - Modified = true; + .addImm(Count); + }); + } else { + [[maybe_unused]] auto SWaitInst = + BuildMI(Block, It, DL, TII->get(AMDGPU::S_WAITCNT_VSCNT)) + .addReg(AMDGPU::SGPR_NULL, RegState::Undef) + .addImm(Wait.StoreCnt); + Modified = true; - LLVM_DEBUG(dbgs() << "PreGFX12::createNewWaitcnt\n"; - if (It != Block.instr_end()) dbgs() << "Old Instr: " << *It; - dbgs() << "New Instr: " << *SWaitInst << '\n'); + LLVM_DEBUG(dbgs() << "PreGFX12::createNewWaitcnt\n"; + if (It != Block.instr_end()) dbgs() << "Old Instr: " << *It; + dbgs() << "New Instr: " << *SWaitInst << '\n'); + } } return Modified; @@ -2013,13 +2092,55 @@ bool WaitcntGeneratorGFX12Plus::applyPreexistingWaitcnt( /// Generate S_WAIT_*CNT instructions for any required counters in \p Wait bool WaitcntGeneratorGFX12Plus::createNewWaitcnt( MachineBasicBlock &Block, MachineBasicBlock::instr_iterator It, - AMDGPU::Waitcnt Wait) { + AMDGPU::Waitcnt Wait, WaitcntBrackets *ScoreBrackets) { assert(ST); assert(!isNormalMode(MaxCounter)); bool Modified = false; const DebugLoc &DL = Block.findDebugLoc(It); + // Helper to emit expanded waitcnt sequence for profiling. 
+ auto EmitExpandedWaitcnt = [&](unsigned Outstanding, unsigned Target, + auto EmitWaitcnt) { + if (Outstanding > Target) { + for (unsigned i = Outstanding - 1; i >= Target && i != ~0u; --i) { + EmitWaitcnt(i); + Modified = true; + } + } else { + EmitWaitcnt(Target); + Modified = true; + } + }; + + // For GFX12+, we use separate wait instructions, which makes expansion + // simpler + if (ExpandWaitcntProfiling && ScoreBrackets) { + for (auto CT : inst_counter_types(NUM_EXTENDED_INST_CNTS)) { + unsigned Count = getWait(Wait, CT); + if (Count == ~0u) + continue; + + // Skip expansion for out-of-order counters - emit normal wait instead + if (ScoreBrackets->counterOutOfOrder(CT)) { + BuildMI(Block, It, DL, TII->get(instrsForExtendedCounterTypes[CT])) + .addImm(Count); + Modified = true; + continue; + } + + unsigned Outstanding = std::min(ScoreBrackets->getScoreUB(CT) - + ScoreBrackets->getScoreLB(CT), + getWaitCountMax(getLimits(), CT) - 1); + EmitExpandedWaitcnt(Outstanding, Count, [&](unsigned Val) { + BuildMI(Block, It, DL, TII->get(instrsForExtendedCounterTypes[CT])) + .addImm(Val); + }); + } + return Modified; + } + + // Normal behavior (no expansion) // Check for opportunities to use combined wait instructions. if (Wait.DsCnt != ~0u) { MachineInstr *SWaitInst = nullptr; @@ -2415,9 +2536,7 @@ bool SIInsertWaitcnts::generateWaitcnt(AMDGPU::Waitcnt Wait, Modified = WCG->applyPreexistingWaitcnt(ScoreBrackets, *OldWaitcntInstr, Wait, It); - // Any counts that could have been applied to any existing waitcnt - // instructions will have been done so, now deal with any remaining. - ScoreBrackets.applyWaitcnt(Wait); + AMDGPU::Waitcnt WaitForScore = Wait; // ExpCnt can be merged into VINTERP. 
if (Wait.ExpCnt != ~0u && It != Block.instr_end() && @@ -2434,9 +2553,13 @@ bool SIInsertWaitcnts::generateWaitcnt(AMDGPU::Waitcnt Wait, << "Update Instr: " << *It); } - if (WCG->createNewWaitcnt(Block, It, Wait)) + if (WCG->createNewWaitcnt(Block, It, Wait, &ScoreBrackets)) Modified = true; + // Any counts that could have been applied to any existing waitcnt + // instructions will have been done so, now deal with any remaining. + ScoreBrackets.applyWaitcnt(WaitForScore); + return Modified; } @@ -3052,6 +3175,9 @@ bool SIInsertWaitcnts::run(MachineFunction &MF) { AMDGPU::IsaVersion IV = AMDGPU::getIsaVersion(ST->getCPU()); + // Initialize hardware limits first, as they're needed by the generators. + Limits = AMDGPU::HardwareLimits(IV, ST->hasExtendedWaitCounts()); + if (ST->hasExtendedWaitCounts()) { IsExpertMode = ST->hasExpertSchedulingMode() && (ExpertSchedulingModeFlag.getNumOccurrences() @@ -3060,11 +3186,12 @@ bool SIInsertWaitcnts::run(MachineFunction &MF) { .getFnAttribute("amdgpu-expert-scheduling-mode") .getValueAsBool()); MaxCounter = IsExpertMode ? 
NUM_EXPERT_INST_CNTS : NUM_EXTENDED_INST_CNTS; - WCGGFX12Plus = WaitcntGeneratorGFX12Plus(MF, MaxCounter, IsExpertMode); + WCGGFX12Plus = + WaitcntGeneratorGFX12Plus(MF, MaxCounter, &Limits, IsExpertMode); WCG = &WCGGFX12Plus; } else { MaxCounter = NUM_NORMAL_INST_CNTS; - WCGPreGFX12 = WaitcntGeneratorPreGFX12(MF, MaxCounter); + WCGPreGFX12 = WaitcntGeneratorPreGFX12(MF, &Limits); WCG = &WCGPreGFX12; } @@ -3075,22 +3202,6 @@ bool SIInsertWaitcnts::run(MachineFunction &MF) { SmemAccessCounter = eventCounter(WaitEventMaskForInst, SMEM_ACCESS); - if (ST->hasExtendedWaitCounts()) { - Limits.LoadcntMax = AMDGPU::getLoadcntBitMask(IV); - Limits.DscntMax = AMDGPU::getDscntBitMask(IV); - } else { - Limits.LoadcntMax = AMDGPU::getVmcntBitMask(IV); - Limits.DscntMax = AMDGPU::getLgkmcntBitMask(IV); - } - Limits.ExpcntMax = AMDGPU::getExpcntBitMask(IV); - Limits.StorecntMax = AMDGPU::getStorecntBitMask(IV); - Limits.SamplecntMax = AMDGPU::getSamplecntBitMask(IV); - Limits.BvhcntMax = AMDGPU::getBvhcntBitMask(IV); - Limits.KmcntMax = AMDGPU::getKmcntBitMask(IV); - Limits.XcntMax = AMDGPU::getXcntBitMask(IV); - Limits.VaVdstMax = AMDGPU::DepCtr::getVaVdstBitMask(); - Limits.VmVsrcMax = AMDGPU::DepCtr::getVmVsrcBitMask(); - BlockInfos.clear(); bool Modified = false; diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp index 0646c63083195..4ad3a5cd1d727 100644 --- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp +++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp @@ -1785,6 +1785,25 @@ unsigned getStorecntBitMask(const IsaVersion &Version) { return (1 << getStorecntBitWidth(Version.Major)) - 1; } +HardwareLimits::HardwareLimits(const IsaVersion &IV, + bool HasExtendedWaitCounts) { + if (HasExtendedWaitCounts) { + LoadcntMax = getLoadcntBitMask(IV); + DscntMax = getDscntBitMask(IV); + } else { + LoadcntMax = getVmcntBitMask(IV); + DscntMax = getLgkmcntBitMask(IV); + } + ExpcntMax = getExpcntBitMask(IV); + StorecntMax = 
getStorecntBitMask(IV); + SamplecntMax = getSamplecntBitMask(IV); + BvhcntMax = getBvhcntBitMask(IV); + KmcntMax = getKmcntBitMask(IV); + XcntMax = getXcntBitMask(IV); + VaVdstMax = DepCtr::getVaVdstBitMask(); + VmVsrcMax = DepCtr::getVmVsrcBitMask(); +} + unsigned getWaitcntBitMask(const IsaVersion &Version) { unsigned VmcntLo = getBitMask(getVmcntBitShiftLo(Version.Major), getVmcntBitWidthLo(Version.Major)); diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h index f6b95602644ca..770f9a86dc883 100644 --- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h +++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h @@ -1131,6 +1131,26 @@ struct Waitcnt { friend raw_ostream &operator<<(raw_ostream &OS, const AMDGPU::Waitcnt &Wait); }; +/// Represents the hardware counter limits for different wait count types. +struct HardwareLimits { + unsigned LoadcntMax; // Corresponds to Vmcnt prior to gfx12. + unsigned ExpcntMax; + unsigned DscntMax; // Corresponds to LGKMcnt prior to gfx12. + unsigned StorecntMax; // Corresponds to VScnt in gfx10/gfx11. + unsigned SamplecntMax; // gfx12+ only. + unsigned BvhcntMax; // gfx12+ only. + unsigned KmcntMax; // gfx12+ only. + unsigned XcntMax; // gfx1250. + unsigned VaVdstMax; // gfx12+ expert mode only. + unsigned VmVsrcMax; // gfx12+ expert mode only. + + HardwareLimits() = default; + + /// Initializes hardware limits from ISA version. + /// \p HasExtendedWaitCounts should be true for gfx12+. + HardwareLimits(const IsaVersion &IV, bool HasExtendedWaitCounts); +}; + // The following methods are only meaningful on targets that support // S_WAITCNT. 
diff --git a/llvm/test/CodeGen/AMDGPU/expand-waitcnt-profiling.ll b/llvm/test/CodeGen/AMDGPU/expand-waitcnt-profiling.ll new file mode 100644 index 0000000000000..848a9d07084ed --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/expand-waitcnt-profiling.ll @@ -0,0 +1,944 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 +; RUN: sed 's/ATTRS/\"amdgpu-expand-waitcnt-profiling\"/g' %s | llc -mtriple=amdgcn -mcpu=gfx900 | FileCheck --check-prefix=GFX9-EXPAND %s +; RUN: sed 's/ATTRS//g' %s | llc -mtriple=amdgcn -mcpu=gfx900 | FileCheck --check-prefix=GFX9-NOEXPAND %s +; RUN: sed 's/ATTRS/\"amdgpu-expand-waitcnt-profiling\"/g' %s | llc -mtriple=amdgcn -mcpu=gfx1010 | FileCheck --check-prefix=GFX10-EXPAND %s +; RUN: sed 's/ATTRS//g' %s | llc -mtriple=amdgcn -mcpu=gfx1010 | FileCheck --check-prefix=GFX10-NOEXPAND %s +; RUN: sed 's/ATTRS/\"amdgpu-expand-waitcnt-profiling\"/g' %s | llc -mtriple=amdgcn -mcpu=gfx1100 | FileCheck --check-prefix=GFX11-EXPAND %s +; RUN: sed 's/ATTRS//g' %s | llc -mtriple=amdgcn -mcpu=gfx1100 | FileCheck --check-prefix=GFX11-NOEXPAND %s +; RUN: sed 's/ATTRS/\"amdgpu-expand-waitcnt-profiling\"/g' %s | llc -mtriple=amdgcn -mcpu=gfx1200 | FileCheck --check-prefix=GFX12-EXPAND %s +; RUN: sed 's/ATTRS//g' %s | llc -mtriple=amdgcn -mcpu=gfx1200 | FileCheck --check-prefix=GFX12-NOEXPAND %s + +; When -amdgpu-expand-waitcnt-profiling is enabled and there are N outstanding +; operations, instead of emitting a single waitcnt(target), we emit: +; waitcnt(N-1), waitcnt(N-2), ..., waitcnt(target) +; +; This allows PC-sampling profilers to identify which specific operation +; is causing a stall by observing where the program counter is stuck. 
+ +define amdgpu_kernel void @test_lgkmcnt_scalar_loads(ptr addrspace(4) %ptr_a, ptr addrspace(4) %ptr_b, ptr addrspace(4) %ptr_c, ptr addrspace(1) %out) #0 { +; GFX9-EXPAND-LABEL: test_lgkmcnt_scalar_loads: +; GFX9-EXPAND: ; %bb.0: +; GFX9-EXPAND-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 +; GFX9-EXPAND-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-EXPAND-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-EXPAND-NEXT: s_load_dword s0, s[8:9], 0x0 +; GFX9-EXPAND-NEXT: s_load_dword s1, s[10:11], 0x0 +; GFX9-EXPAND-NEXT: s_load_dword s2, s[12:13], 0x0 +; GFX9-EXPAND-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-EXPAND-NEXT: s_add_i32 s0, s0, s1 +; GFX9-EXPAND-NEXT: s_add_i32 s0, s0, s2 +; GFX9-EXPAND-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-EXPAND-NEXT: global_store_dword v0, v1, s[14:15] +; GFX9-EXPAND-NEXT: s_endpgm +; +; GFX9-NOEXPAND-LABEL: test_lgkmcnt_scalar_loads: +; GFX9-NOEXPAND: ; %bb.0: +; GFX9-NOEXPAND-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 +; GFX9-NOEXPAND-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NOEXPAND-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NOEXPAND-NEXT: s_load_dword s0, s[8:9], 0x0 +; GFX9-NOEXPAND-NEXT: s_load_dword s1, s[10:11], 0x0 +; GFX9-NOEXPAND-NEXT: s_load_dword s2, s[12:13], 0x0 +; GFX9-NOEXPAND-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NOEXPAND-NEXT: s_add_i32 s0, s0, s1 +; GFX9-NOEXPAND-NEXT: s_add_i32 s0, s0, s2 +; GFX9-NOEXPAND-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NOEXPAND-NEXT: global_store_dword v0, v1, s[14:15] +; GFX9-NOEXPAND-NEXT: s_endpgm +; +; GFX10-EXPAND-LABEL: test_lgkmcnt_scalar_loads: +; GFX10-EXPAND: ; %bb.0: +; GFX10-EXPAND-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 +; GFX10-EXPAND-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-EXPAND-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-EXPAND-NEXT: s_load_dword s0, s[8:9], 0x0 +; GFX10-EXPAND-NEXT: s_load_dword s1, s[10:11], 0x0 +; GFX10-EXPAND-NEXT: s_load_dword s2, s[12:13], 0x0 +; GFX10-EXPAND-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-EXPAND-NEXT: s_add_i32 s0, s0, s1 +; GFX10-EXPAND-NEXT: s_add_i32 s0, s0, s2 +; GFX10-EXPAND-NEXT: v_mov_b32_e32 v1, s0 +; 
GFX10-EXPAND-NEXT: global_store_dword v0, v1, s[14:15] +; GFX10-EXPAND-NEXT: s_endpgm +; +; GFX10-NOEXPAND-LABEL: test_lgkmcnt_scalar_loads: +; GFX10-NOEXPAND: ; %bb.0: +; GFX10-NOEXPAND-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 +; GFX10-NOEXPAND-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-NOEXPAND-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NOEXPAND-NEXT: s_load_dword s0, s[8:9], 0x0 +; GFX10-NOEXPAND-NEXT: s_load_dword s1, s[10:11], 0x0 +; GFX10-NOEXPAND-NEXT: s_load_dword s2, s[12:13], 0x0 +; GFX10-NOEXPAND-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NOEXPAND-NEXT: s_add_i32 s0, s0, s1 +; GFX10-NOEXPAND-NEXT: s_add_i32 s0, s0, s2 +; GFX10-NOEXPAND-NEXT: v_mov_b32_e32 v1, s0 +; GFX10-NOEXPAND-NEXT: global_store_dword v0, v1, s[14:15] +; GFX10-NOEXPAND-NEXT: s_endpgm +; +; GFX11-EXPAND-LABEL: test_lgkmcnt_scalar_loads: +; GFX11-EXPAND: ; %bb.0: +; GFX11-EXPAND-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 +; GFX11-EXPAND-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-EXPAND-NEXT: s_load_b32 s0, s[0:1], 0x0 +; GFX11-EXPAND-NEXT: s_load_b32 s1, s[2:3], 0x0 +; GFX11-EXPAND-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX11-EXPAND-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-EXPAND-NEXT: s_add_i32 s0, s0, s1 +; GFX11-EXPAND-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-EXPAND-NEXT: s_add_i32 s0, s0, s2 +; GFX11-EXPAND-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0 +; GFX11-EXPAND-NEXT: global_store_b32 v0, v1, s[6:7] +; GFX11-EXPAND-NEXT: s_endpgm +; +; GFX11-NOEXPAND-LABEL: test_lgkmcnt_scalar_loads: +; GFX11-NOEXPAND: ; %bb.0: +; GFX11-NOEXPAND-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 +; GFX11-NOEXPAND-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NOEXPAND-NEXT: s_load_b32 s0, s[0:1], 0x0 +; GFX11-NOEXPAND-NEXT: s_load_b32 s1, s[2:3], 0x0 +; GFX11-NOEXPAND-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX11-NOEXPAND-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NOEXPAND-NEXT: s_add_i32 s0, s0, s1 +; GFX11-NOEXPAND-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; 
GFX11-NOEXPAND-NEXT: s_add_i32 s0, s0, s2 +; GFX11-NOEXPAND-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0 +; GFX11-NOEXPAND-NEXT: global_store_b32 v0, v1, s[6:7] +; GFX11-NOEXPAND-NEXT: s_endpgm +; +; GFX12-EXPAND-LABEL: test_lgkmcnt_scalar_loads: +; GFX12-EXPAND: ; %bb.0: +; GFX12-EXPAND-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 +; GFX12-EXPAND-NEXT: s_wait_kmcnt 0x0 +; GFX12-EXPAND-NEXT: s_load_b32 s0, s[0:1], 0x0 +; GFX12-EXPAND-NEXT: s_load_b32 s1, s[2:3], 0x0 +; GFX12-EXPAND-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX12-EXPAND-NEXT: s_wait_kmcnt 0x0 +; GFX12-EXPAND-NEXT: s_add_co_i32 s0, s0, s1 +; GFX12-EXPAND-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-EXPAND-NEXT: s_add_co_i32 s0, s0, s2 +; GFX12-EXPAND-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0 +; GFX12-EXPAND-NEXT: global_store_b32 v0, v1, s[6:7] +; GFX12-EXPAND-NEXT: s_endpgm +; +; GFX12-NOEXPAND-LABEL: test_lgkmcnt_scalar_loads: +; GFX12-NOEXPAND: ; %bb.0: +; GFX12-NOEXPAND-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 +; GFX12-NOEXPAND-NEXT: s_wait_kmcnt 0x0 +; GFX12-NOEXPAND-NEXT: s_load_b32 s0, s[0:1], 0x0 +; GFX12-NOEXPAND-NEXT: s_load_b32 s1, s[2:3], 0x0 +; GFX12-NOEXPAND-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX12-NOEXPAND-NEXT: s_wait_kmcnt 0x0 +; GFX12-NOEXPAND-NEXT: s_add_co_i32 s0, s0, s1 +; GFX12-NOEXPAND-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NOEXPAND-NEXT: s_add_co_i32 s0, s0, s2 +; GFX12-NOEXPAND-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0 +; GFX12-NOEXPAND-NEXT: global_store_b32 v0, v1, s[6:7] +; GFX12-NOEXPAND-NEXT: s_endpgm + + %val_a = load i32, ptr addrspace(4) %ptr_a, align 4 + %val_b = load i32, ptr addrspace(4) %ptr_b, align 4 + %val_c = load i32, ptr addrspace(4) %ptr_c, align 4 + %sum1 = add i32 %val_a, %val_b + %sum2 = add i32 %sum1, %val_c + store i32 %sum2, ptr addrspace(1) %out, align 4 + ret void +} + +define amdgpu_kernel void @test_vmcnt_global_loads(ptr addrspace(1) 
%buf, ptr addrspace(1) %out) #0 { +; GFX9-EXPAND-LABEL: test_vmcnt_global_loads: +; GFX9-EXPAND: ; %bb.0: +; GFX9-EXPAND-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-EXPAND-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-EXPAND-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-EXPAND-NEXT: global_load_dword v1, v0, s[0:1] +; GFX9-EXPAND-NEXT: global_load_dword v2, v0, s[0:1] offset:256 +; GFX9-EXPAND-NEXT: global_load_dword v3, v0, s[0:1] offset:512 +; GFX9-EXPAND-NEXT: s_waitcnt vmcnt(2) +; GFX9-EXPAND-NEXT: s_waitcnt vmcnt(1) +; GFX9-EXPAND-NEXT: s_waitcnt vmcnt(0) +; GFX9-EXPAND-NEXT: v_add3_u32 v1, v1, v2, v3 +; GFX9-EXPAND-NEXT: global_store_dword v0, v1, s[2:3] +; GFX9-EXPAND-NEXT: s_endpgm +; +; GFX9-NOEXPAND-LABEL: test_vmcnt_global_loads: +; GFX9-NOEXPAND: ; %bb.0: +; GFX9-NOEXPAND-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-NOEXPAND-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-NOEXPAND-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NOEXPAND-NEXT: global_load_dword v1, v0, s[0:1] +; GFX9-NOEXPAND-NEXT: global_load_dword v2, v0, s[0:1] offset:256 +; GFX9-NOEXPAND-NEXT: global_load_dword v3, v0, s[0:1] offset:512 +; GFX9-NOEXPAND-NEXT: s_waitcnt vmcnt(0) +; GFX9-NOEXPAND-NEXT: v_add3_u32 v1, v1, v2, v3 +; GFX9-NOEXPAND-NEXT: global_store_dword v0, v1, s[2:3] +; GFX9-NOEXPAND-NEXT: s_endpgm +; +; GFX10-EXPAND-LABEL: test_vmcnt_global_loads: +; GFX10-EXPAND: ; %bb.0: +; GFX10-EXPAND-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX10-EXPAND-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX10-EXPAND-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-EXPAND-NEXT: s_clause 0x2 +; GFX10-EXPAND-NEXT: global_load_dword v1, v0, s[0:1] +; GFX10-EXPAND-NEXT: global_load_dword v2, v0, s[0:1] offset:256 +; GFX10-EXPAND-NEXT: global_load_dword v3, v0, s[0:1] offset:512 +; GFX10-EXPAND-NEXT: s_waitcnt vmcnt(2) +; GFX10-EXPAND-NEXT: s_waitcnt vmcnt(1) +; GFX10-EXPAND-NEXT: s_waitcnt vmcnt(0) +; GFX10-EXPAND-NEXT: v_add3_u32 v1, v1, v2, v3 +; GFX10-EXPAND-NEXT: global_store_dword v0, v1, s[2:3] +; GFX10-EXPAND-NEXT: 
s_endpgm +; +; GFX10-NOEXPAND-LABEL: test_vmcnt_global_loads: +; GFX10-NOEXPAND: ; %bb.0: +; GFX10-NOEXPAND-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX10-NOEXPAND-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX10-NOEXPAND-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NOEXPAND-NEXT: s_clause 0x2 +; GFX10-NOEXPAND-NEXT: global_load_dword v1, v0, s[0:1] +; GFX10-NOEXPAND-NEXT: global_load_dword v2, v0, s[0:1] offset:256 +; GFX10-NOEXPAND-NEXT: global_load_dword v3, v0, s[0:1] offset:512 +; GFX10-NOEXPAND-NEXT: s_waitcnt vmcnt(0) +; GFX10-NOEXPAND-NEXT: v_add3_u32 v1, v1, v2, v3 +; GFX10-NOEXPAND-NEXT: global_store_dword v0, v1, s[2:3] +; GFX10-NOEXPAND-NEXT: s_endpgm +; +; GFX11-EXPAND-LABEL: test_vmcnt_global_loads: +; GFX11-EXPAND: ; %bb.0: +; GFX11-EXPAND-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-EXPAND-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-EXPAND-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-EXPAND-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-EXPAND-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-EXPAND-NEXT: s_clause 0x2 +; GFX11-EXPAND-NEXT: global_load_b32 v1, v0, s[0:1] +; GFX11-EXPAND-NEXT: global_load_b32 v2, v0, s[0:1] offset:256 +; GFX11-EXPAND-NEXT: global_load_b32 v3, v0, s[0:1] offset:512 +; GFX11-EXPAND-NEXT: s_waitcnt vmcnt(2) +; GFX11-EXPAND-NEXT: s_waitcnt vmcnt(1) +; GFX11-EXPAND-NEXT: s_waitcnt vmcnt(0) +; GFX11-EXPAND-NEXT: v_add3_u32 v1, v1, v2, v3 +; GFX11-EXPAND-NEXT: global_store_b32 v0, v1, s[2:3] +; GFX11-EXPAND-NEXT: s_endpgm +; +; GFX11-NOEXPAND-LABEL: test_vmcnt_global_loads: +; GFX11-NOEXPAND: ; %bb.0: +; GFX11-NOEXPAND-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-NOEXPAND-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NOEXPAND-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NOEXPAND-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NOEXPAND-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NOEXPAND-NEXT: s_clause 0x2 +; GFX11-NOEXPAND-NEXT: global_load_b32 v1, v0, s[0:1] +; GFX11-NOEXPAND-NEXT: global_load_b32 v2, v0, s[0:1] offset:256 +; GFX11-NOEXPAND-NEXT: 
global_load_b32 v3, v0, s[0:1] offset:512 +; GFX11-NOEXPAND-NEXT: s_waitcnt vmcnt(0) +; GFX11-NOEXPAND-NEXT: v_add3_u32 v1, v1, v2, v3 +; GFX11-NOEXPAND-NEXT: global_store_b32 v0, v1, s[2:3] +; GFX11-NOEXPAND-NEXT: s_endpgm +; +; GFX12-EXPAND-LABEL: test_vmcnt_global_loads: +; GFX12-EXPAND: ; %bb.0: +; GFX12-EXPAND-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX12-EXPAND-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX12-EXPAND-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-EXPAND-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX12-EXPAND-NEXT: s_wait_kmcnt 0x0 +; GFX12-EXPAND-NEXT: s_clause 0x2 +; GFX12-EXPAND-NEXT: global_load_b32 v1, v0, s[0:1] +; GFX12-EXPAND-NEXT: global_load_b32 v2, v0, s[0:1] offset:256 +; GFX12-EXPAND-NEXT: global_load_b32 v3, v0, s[0:1] offset:512 +; GFX12-EXPAND-NEXT: s_wait_loadcnt 0x2 +; GFX12-EXPAND-NEXT: s_wait_loadcnt 0x1 +; GFX12-EXPAND-NEXT: s_wait_loadcnt 0x0 +; GFX12-EXPAND-NEXT: v_add3_u32 v1, v1, v2, v3 +; GFX12-EXPAND-NEXT: global_store_b32 v0, v1, s[2:3] +; GFX12-EXPAND-NEXT: s_endpgm +; +; GFX12-NOEXPAND-LABEL: test_vmcnt_global_loads: +; GFX12-NOEXPAND: ; %bb.0: +; GFX12-NOEXPAND-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX12-NOEXPAND-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX12-NOEXPAND-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NOEXPAND-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX12-NOEXPAND-NEXT: s_wait_kmcnt 0x0 +; GFX12-NOEXPAND-NEXT: s_clause 0x2 +; GFX12-NOEXPAND-NEXT: global_load_b32 v1, v0, s[0:1] +; GFX12-NOEXPAND-NEXT: global_load_b32 v2, v0, s[0:1] offset:256 +; GFX12-NOEXPAND-NEXT: global_load_b32 v3, v0, s[0:1] offset:512 +; GFX12-NOEXPAND-NEXT: s_wait_loadcnt 0x0 +; GFX12-NOEXPAND-NEXT: v_add3_u32 v1, v1, v2, v3 +; GFX12-NOEXPAND-NEXT: global_store_b32 v0, v1, s[2:3] +; GFX12-NOEXPAND-NEXT: s_endpgm + + ; Use thread ID to create thread-varying addresses -> forces vector loads + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %tid64 = zext i32 %tid to i64 + + ; Three separate global loads with thread-varying addresses + ; 
Non-volatile loads allow multiple operations to be in-flight + %ptr0 = getelementptr i32, ptr addrspace(1) %buf, i64 %tid64 + %val0 = load i32, ptr addrspace(1) %ptr0, align 4 + + %offset1 = add i64 %tid64, 64 + %ptr1 = getelementptr i32, ptr addrspace(1) %buf, i64 %offset1 + %val1 = load i32, ptr addrspace(1) %ptr1, align 4 + + %offset2 = add i64 %tid64, 128 + %ptr2 = getelementptr i32, ptr addrspace(1) %buf, i64 %offset2 + %val2 = load i32, ptr addrspace(1) %ptr2, align 4 + + %sum1 = add i32 %val0, %val1 + %sum2 = add i32 %sum1, %val2 + + %out_ptr = getelementptr i32, ptr addrspace(1) %out, i64 %tid64 + store i32 %sum2, ptr addrspace(1) %out_ptr, align 4 + ret void +} + +declare i32 @llvm.amdgcn.workitem.id.x() + +define amdgpu_kernel void @test_lgkmcnt_lds_operations(ptr addrspace(3) %lds_ptr, ptr addrspace(1) %out) #0 { +; GFX9-EXPAND-LABEL: test_lgkmcnt_lds_operations: +; GFX9-EXPAND: ; %bb.0: +; GFX9-EXPAND-NEXT: s_load_dword s2, s[4:5], 0x24 +; GFX9-EXPAND-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2c +; GFX9-EXPAND-NEXT: v_mov_b32_e32 v3, 0 +; GFX9-EXPAND-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-EXPAND-NEXT: v_mov_b32_e32 v2, s2 +; GFX9-EXPAND-NEXT: ds_read2_b32 v[0:1], v2 offset1:1 +; GFX9-EXPAND-NEXT: ds_read_b32 v2, v2 offset:8 +; GFX9-EXPAND-NEXT: s_waitcnt lgkmcnt(1) +; GFX9-EXPAND-NEXT: v_add_u32_e32 v0, v0, v1 +; GFX9-EXPAND-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-EXPAND-NEXT: v_add_u32_e32 v0, v0, v2 +; GFX9-EXPAND-NEXT: global_store_dword v3, v0, s[0:1] +; GFX9-EXPAND-NEXT: s_endpgm +; +; GFX9-NOEXPAND-LABEL: test_lgkmcnt_lds_operations: +; GFX9-NOEXPAND: ; %bb.0: +; GFX9-NOEXPAND-NEXT: s_load_dword s2, s[4:5], 0x24 +; GFX9-NOEXPAND-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2c +; GFX9-NOEXPAND-NEXT: v_mov_b32_e32 v3, 0 +; GFX9-NOEXPAND-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NOEXPAND-NEXT: v_mov_b32_e32 v2, s2 +; GFX9-NOEXPAND-NEXT: ds_read2_b32 v[0:1], v2 offset1:1 +; GFX9-NOEXPAND-NEXT: ds_read_b32 v2, v2 offset:8 +; GFX9-NOEXPAND-NEXT: s_waitcnt lgkmcnt(1) +; 
GFX9-NOEXPAND-NEXT: v_add_u32_e32 v0, v0, v1 +; GFX9-NOEXPAND-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NOEXPAND-NEXT: v_add_u32_e32 v0, v0, v2 +; GFX9-NOEXPAND-NEXT: global_store_dword v3, v0, s[0:1] +; GFX9-NOEXPAND-NEXT: s_endpgm +; +; GFX10-EXPAND-LABEL: test_lgkmcnt_lds_operations: +; GFX10-EXPAND: ; %bb.0: +; GFX10-EXPAND-NEXT: s_clause 0x1 +; GFX10-EXPAND-NEXT: s_load_dword s2, s[4:5], 0x24 +; GFX10-EXPAND-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2c +; GFX10-EXPAND-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-EXPAND-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-EXPAND-NEXT: ds_read2_b32 v[0:1], v2 offset1:1 +; GFX10-EXPAND-NEXT: ds_read_b32 v2, v2 offset:8 +; GFX10-EXPAND-NEXT: s_waitcnt lgkmcnt(1) +; GFX10-EXPAND-NEXT: v_add_nc_u32_e32 v0, v0, v1 +; GFX10-EXPAND-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-EXPAND-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-EXPAND-NEXT: v_add_nc_u32_e32 v0, v0, v2 +; GFX10-EXPAND-NEXT: global_store_dword v1, v0, s[0:1] +; GFX10-EXPAND-NEXT: s_endpgm +; +; GFX10-NOEXPAND-LABEL: test_lgkmcnt_lds_operations: +; GFX10-NOEXPAND: ; %bb.0: +; GFX10-NOEXPAND-NEXT: s_clause 0x1 +; GFX10-NOEXPAND-NEXT: s_load_dword s2, s[4:5], 0x24 +; GFX10-NOEXPAND-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2c +; GFX10-NOEXPAND-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NOEXPAND-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-NOEXPAND-NEXT: ds_read2_b32 v[0:1], v2 offset1:1 +; GFX10-NOEXPAND-NEXT: ds_read_b32 v2, v2 offset:8 +; GFX10-NOEXPAND-NEXT: s_waitcnt lgkmcnt(1) +; GFX10-NOEXPAND-NEXT: v_add_nc_u32_e32 v0, v0, v1 +; GFX10-NOEXPAND-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-NOEXPAND-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NOEXPAND-NEXT: v_add_nc_u32_e32 v0, v0, v2 +; GFX10-NOEXPAND-NEXT: global_store_dword v1, v0, s[0:1] +; GFX10-NOEXPAND-NEXT: s_endpgm +; +; GFX11-EXPAND-LABEL: test_lgkmcnt_lds_operations: +; GFX11-EXPAND: ; %bb.0: +; GFX11-EXPAND-NEXT: s_clause 0x1 +; GFX11-EXPAND-NEXT: s_load_b32 s2, s[4:5], 0x24 +; GFX11-EXPAND-NEXT: s_load_b64 s[0:1], s[4:5], 0x2c +; GFX11-EXPAND-NEXT: s_waitcnt lgkmcnt(0) +; 
GFX11-EXPAND-NEXT: v_mov_b32_e32 v2, s2 +; GFX11-EXPAND-NEXT: ds_load_2addr_b32 v[0:1], v2 offset1:1 +; GFX11-EXPAND-NEXT: ds_load_b32 v2, v2 offset:8 +; GFX11-EXPAND-NEXT: s_waitcnt lgkmcnt(1) +; GFX11-EXPAND-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_add_nc_u32 v0, v0, v1 +; GFX11-EXPAND-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-EXPAND-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-EXPAND-NEXT: v_add_nc_u32_e32 v0, v0, v2 +; GFX11-EXPAND-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11-EXPAND-NEXT: s_endpgm +; +; GFX11-NOEXPAND-LABEL: test_lgkmcnt_lds_operations: +; GFX11-NOEXPAND: ; %bb.0: +; GFX11-NOEXPAND-NEXT: s_clause 0x1 +; GFX11-NOEXPAND-NEXT: s_load_b32 s2, s[4:5], 0x24 +; GFX11-NOEXPAND-NEXT: s_load_b64 s[0:1], s[4:5], 0x2c +; GFX11-NOEXPAND-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NOEXPAND-NEXT: v_mov_b32_e32 v2, s2 +; GFX11-NOEXPAND-NEXT: ds_load_2addr_b32 v[0:1], v2 offset1:1 +; GFX11-NOEXPAND-NEXT: ds_load_b32 v2, v2 offset:8 +; GFX11-NOEXPAND-NEXT: s_waitcnt lgkmcnt(1) +; GFX11-NOEXPAND-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_add_nc_u32 v0, v0, v1 +; GFX11-NOEXPAND-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NOEXPAND-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NOEXPAND-NEXT: v_add_nc_u32_e32 v0, v0, v2 +; GFX11-NOEXPAND-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11-NOEXPAND-NEXT: s_endpgm +; +; GFX12-EXPAND-LABEL: test_lgkmcnt_lds_operations: +; GFX12-EXPAND: ; %bb.0: +; GFX12-EXPAND-NEXT: s_clause 0x1 +; GFX12-EXPAND-NEXT: s_load_b32 s2, s[4:5], 0x24 +; GFX12-EXPAND-NEXT: s_load_b64 s[0:1], s[4:5], 0x2c +; GFX12-EXPAND-NEXT: s_wait_kmcnt 0x0 +; GFX12-EXPAND-NEXT: v_mov_b32_e32 v2, s2 +; GFX12-EXPAND-NEXT: ds_load_2addr_b32 v[0:1], v2 offset1:1 +; GFX12-EXPAND-NEXT: ds_load_b32 v2, v2 offset:8 +; GFX12-EXPAND-NEXT: s_wait_dscnt 0x1 +; GFX12-EXPAND-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_add_nc_u32 v0, v0, v1 +; GFX12-EXPAND-NEXT: s_wait_dscnt 0x0 +; GFX12-EXPAND-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-EXPAND-NEXT: v_add_nc_u32_e32 v0, v0, v2 +; GFX12-EXPAND-NEXT: 
global_store_b32 v1, v0, s[0:1] +; GFX12-EXPAND-NEXT: s_endpgm +; +; GFX12-NOEXPAND-LABEL: test_lgkmcnt_lds_operations: +; GFX12-NOEXPAND: ; %bb.0: +; GFX12-NOEXPAND-NEXT: s_clause 0x1 +; GFX12-NOEXPAND-NEXT: s_load_b32 s2, s[4:5], 0x24 +; GFX12-NOEXPAND-NEXT: s_load_b64 s[0:1], s[4:5], 0x2c +; GFX12-NOEXPAND-NEXT: s_wait_kmcnt 0x0 +; GFX12-NOEXPAND-NEXT: v_mov_b32_e32 v2, s2 +; GFX12-NOEXPAND-NEXT: ds_load_2addr_b32 v[0:1], v2 offset1:1 +; GFX12-NOEXPAND-NEXT: ds_load_b32 v2, v2 offset:8 +; GFX12-NOEXPAND-NEXT: s_wait_dscnt 0x1 +; GFX12-NOEXPAND-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_add_nc_u32 v0, v0, v1 +; GFX12-NOEXPAND-NEXT: s_wait_dscnt 0x0 +; GFX12-NOEXPAND-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NOEXPAND-NEXT: v_add_nc_u32_e32 v0, v0, v2 +; GFX12-NOEXPAND-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX12-NOEXPAND-NEXT: s_endpgm + + %ptr0 = getelementptr i32, ptr addrspace(3) %lds_ptr, i32 0 + %ptr1 = getelementptr i32, ptr addrspace(3) %lds_ptr, i32 1 + %ptr2 = getelementptr i32, ptr addrspace(3) %lds_ptr, i32 2 + %val0 = load i32, ptr addrspace(3) %ptr0, align 4 + %val1 = load i32, ptr addrspace(3) %ptr1, align 4 + %val2 = load i32, ptr addrspace(3) %ptr2, align 4 + %sum1 = add i32 %val0, %val1 + %sum2 = add i32 %sum1, %val2 + store i32 %sum2, ptr addrspace(1) %out, align 4 + ret void +} + +define amdgpu_kernel void @test_combined_vmcnt_lgkmcnt(ptr addrspace(4) %scalar_ptr_a, ptr addrspace(4) %scalar_ptr_b, ptr addrspace(1) %out) #0 { +; GFX9-EXPAND-LABEL: test_combined_vmcnt_lgkmcnt: +; GFX9-EXPAND: ; %bb.0: +; GFX9-EXPAND-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-EXPAND-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GFX9-EXPAND-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-EXPAND-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-EXPAND-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX9-EXPAND-NEXT: s_load_dword s5, s[2:3], 0x0 +; GFX9-EXPAND-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-EXPAND-NEXT: s_add_i32 s0, s4, s5 +; GFX9-EXPAND-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-EXPAND-NEXT: 
global_store_dword v0, v1, s[6:7] +; GFX9-EXPAND-NEXT: s_endpgm +; +; GFX9-NOEXPAND-LABEL: test_combined_vmcnt_lgkmcnt: +; GFX9-NOEXPAND: ; %bb.0: +; GFX9-NOEXPAND-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-NOEXPAND-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GFX9-NOEXPAND-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NOEXPAND-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NOEXPAND-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX9-NOEXPAND-NEXT: s_load_dword s5, s[2:3], 0x0 +; GFX9-NOEXPAND-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NOEXPAND-NEXT: s_add_i32 s0, s4, s5 +; GFX9-NOEXPAND-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NOEXPAND-NEXT: global_store_dword v0, v1, s[6:7] +; GFX9-NOEXPAND-NEXT: s_endpgm +; +; GFX10-EXPAND-LABEL: test_combined_vmcnt_lgkmcnt: +; GFX10-EXPAND: ; %bb.0: +; GFX10-EXPAND-NEXT: s_clause 0x1 +; GFX10-EXPAND-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX10-EXPAND-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GFX10-EXPAND-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-EXPAND-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-EXPAND-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX10-EXPAND-NEXT: s_load_dword s5, s[2:3], 0x0 +; GFX10-EXPAND-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-EXPAND-NEXT: s_add_i32 s0, s4, s5 +; GFX10-EXPAND-NEXT: v_mov_b32_e32 v1, s0 +; GFX10-EXPAND-NEXT: global_store_dword v0, v1, s[6:7] +; GFX10-EXPAND-NEXT: s_endpgm +; +; GFX10-NOEXPAND-LABEL: test_combined_vmcnt_lgkmcnt: +; GFX10-NOEXPAND: ; %bb.0: +; GFX10-NOEXPAND-NEXT: s_clause 0x1 +; GFX10-NOEXPAND-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX10-NOEXPAND-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GFX10-NOEXPAND-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-NOEXPAND-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NOEXPAND-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX10-NOEXPAND-NEXT: s_load_dword s5, s[2:3], 0x0 +; GFX10-NOEXPAND-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NOEXPAND-NEXT: s_add_i32 s0, s4, s5 +; GFX10-NOEXPAND-NEXT: v_mov_b32_e32 v1, s0 +; GFX10-NOEXPAND-NEXT: global_store_dword v0, v1, s[6:7] +; GFX10-NOEXPAND-NEXT: s_endpgm +; +; 
GFX11-EXPAND-LABEL: test_combined_vmcnt_lgkmcnt: +; GFX11-EXPAND: ; %bb.0: +; GFX11-EXPAND-NEXT: s_clause 0x1 +; GFX11-EXPAND-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-EXPAND-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 +; GFX11-EXPAND-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-EXPAND-NEXT: s_load_b32 s0, s[0:1], 0x0 +; GFX11-EXPAND-NEXT: s_load_b32 s1, s[2:3], 0x0 +; GFX11-EXPAND-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-EXPAND-NEXT: s_add_i32 s0, s0, s1 +; GFX11-EXPAND-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-EXPAND-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0 +; GFX11-EXPAND-NEXT: global_store_b32 v0, v1, s[4:5] +; GFX11-EXPAND-NEXT: s_endpgm +; +; GFX11-NOEXPAND-LABEL: test_combined_vmcnt_lgkmcnt: +; GFX11-NOEXPAND: ; %bb.0: +; GFX11-NOEXPAND-NEXT: s_clause 0x1 +; GFX11-NOEXPAND-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-NOEXPAND-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 +; GFX11-NOEXPAND-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NOEXPAND-NEXT: s_load_b32 s0, s[0:1], 0x0 +; GFX11-NOEXPAND-NEXT: s_load_b32 s1, s[2:3], 0x0 +; GFX11-NOEXPAND-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NOEXPAND-NEXT: s_add_i32 s0, s0, s1 +; GFX11-NOEXPAND-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NOEXPAND-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0 +; GFX11-NOEXPAND-NEXT: global_store_b32 v0, v1, s[4:5] +; GFX11-NOEXPAND-NEXT: s_endpgm +; +; GFX12-EXPAND-LABEL: test_combined_vmcnt_lgkmcnt: +; GFX12-EXPAND: ; %bb.0: +; GFX12-EXPAND-NEXT: s_clause 0x1 +; GFX12-EXPAND-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX12-EXPAND-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 +; GFX12-EXPAND-NEXT: s_wait_kmcnt 0x0 +; GFX12-EXPAND-NEXT: s_load_b32 s0, s[0:1], 0x0 +; GFX12-EXPAND-NEXT: s_load_b32 s1, s[2:3], 0x0 +; GFX12-EXPAND-NEXT: s_wait_kmcnt 0x0 +; GFX12-EXPAND-NEXT: s_add_co_i32 s0, s0, s1 +; GFX12-EXPAND-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-EXPAND-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0 +; GFX12-EXPAND-NEXT: global_store_b32 v0, v1, s[4:5] +; GFX12-EXPAND-NEXT: s_endpgm 
+; +; GFX12-NOEXPAND-LABEL: test_combined_vmcnt_lgkmcnt: +; GFX12-NOEXPAND: ; %bb.0: +; GFX12-NOEXPAND-NEXT: s_clause 0x1 +; GFX12-NOEXPAND-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX12-NOEXPAND-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 +; GFX12-NOEXPAND-NEXT: s_wait_kmcnt 0x0 +; GFX12-NOEXPAND-NEXT: s_load_b32 s0, s[0:1], 0x0 +; GFX12-NOEXPAND-NEXT: s_load_b32 s1, s[2:3], 0x0 +; GFX12-NOEXPAND-NEXT: s_wait_kmcnt 0x0 +; GFX12-NOEXPAND-NEXT: s_add_co_i32 s0, s0, s1 +; GFX12-NOEXPAND-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NOEXPAND-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0 +; GFX12-NOEXPAND-NEXT: global_store_b32 v0, v1, s[4:5] +; GFX12-NOEXPAND-NEXT: s_endpgm + + %scalar_val1 = load i32, ptr addrspace(4) %scalar_ptr_a, align 4 + %scalar_val2 = load i32, ptr addrspace(4) %scalar_ptr_b, align 4 + + %result = add i32 %scalar_val1, %scalar_val2 + store i32 %result, ptr addrspace(1) %out, align 4 + ret void +} + +; Test that expansion is NOT applied when counters are out-of-order (mixed event types). +; In pre-GFX12, LDS and SMEM operations both use DS_CNT (lgkmcnt), but they can complete +; out-of-order relative to each other. When both are in-flight, we should NOT expand +; because the expansion would be misleading. 
+define amdgpu_kernel void @test_outoforder_lds_and_smem(ptr addrspace(3) %lds_ptr, ptr addrspace(4) %smem_ptr, ptr addrspace(1) %out) #0 { +; GFX9-EXPAND-LABEL: test_outoforder_lds_and_smem: +; GFX9-EXPAND: ; %bb.0: +; GFX9-EXPAND-NEXT: s_load_dword s6, s[4:5], 0x24 +; GFX9-EXPAND-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c +; GFX9-EXPAND-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-EXPAND-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-EXPAND-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-EXPAND-NEXT: ds_read2_b32 v[0:1], v0 offset1:1 +; GFX9-EXPAND-NEXT: s_load_dword s0, s[0:1], 0x0 +; GFX9-EXPAND-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-EXPAND-NEXT: v_add_u32_e32 v0, v0, v1 +; GFX9-EXPAND-NEXT: v_add_u32_e32 v0, s0, v0 +; GFX9-EXPAND-NEXT: global_store_dword v2, v0, s[2:3] +; GFX9-EXPAND-NEXT: s_endpgm +; +; GFX9-NOEXPAND-LABEL: test_outoforder_lds_and_smem: +; GFX9-NOEXPAND: ; %bb.0: +; GFX9-NOEXPAND-NEXT: s_load_dword s6, s[4:5], 0x24 +; GFX9-NOEXPAND-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c +; GFX9-NOEXPAND-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NOEXPAND-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NOEXPAND-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NOEXPAND-NEXT: ds_read2_b32 v[0:1], v0 offset1:1 +; GFX9-NOEXPAND-NEXT: s_load_dword s0, s[0:1], 0x0 +; GFX9-NOEXPAND-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NOEXPAND-NEXT: v_add_u32_e32 v0, v0, v1 +; GFX9-NOEXPAND-NEXT: v_add_u32_e32 v0, s0, v0 +; GFX9-NOEXPAND-NEXT: global_store_dword v2, v0, s[2:3] +; GFX9-NOEXPAND-NEXT: s_endpgm +; +; GFX10-EXPAND-LABEL: test_outoforder_lds_and_smem: +; GFX10-EXPAND: ; %bb.0: +; GFX10-EXPAND-NEXT: s_clause 0x1 +; GFX10-EXPAND-NEXT: s_load_dword s6, s[4:5], 0x24 +; GFX10-EXPAND-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c +; GFX10-EXPAND-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-EXPAND-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-EXPAND-NEXT: s_load_dword s0, s[0:1], 0x0 +; GFX10-EXPAND-NEXT: ds_read2_b32 v[0:1], v0 offset1:1 +; GFX10-EXPAND-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-EXPAND-NEXT: v_add_nc_u32_e32 v0, v0, v1 +; GFX10-EXPAND-NEXT: v_mov_b32_e32 v1, 0 +; 
GFX10-EXPAND-NEXT: v_add_nc_u32_e32 v0, s0, v0 +; GFX10-EXPAND-NEXT: global_store_dword v1, v0, s[2:3] +; GFX10-EXPAND-NEXT: s_endpgm +; +; GFX10-NOEXPAND-LABEL: test_outoforder_lds_and_smem: +; GFX10-NOEXPAND: ; %bb.0: +; GFX10-NOEXPAND-NEXT: s_clause 0x1 +; GFX10-NOEXPAND-NEXT: s_load_dword s6, s[4:5], 0x24 +; GFX10-NOEXPAND-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c +; GFX10-NOEXPAND-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NOEXPAND-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-NOEXPAND-NEXT: s_load_dword s0, s[0:1], 0x0 +; GFX10-NOEXPAND-NEXT: ds_read2_b32 v[0:1], v0 offset1:1 +; GFX10-NOEXPAND-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NOEXPAND-NEXT: v_add_nc_u32_e32 v0, v0, v1 +; GFX10-NOEXPAND-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-NOEXPAND-NEXT: v_add_nc_u32_e32 v0, s0, v0 +; GFX10-NOEXPAND-NEXT: global_store_dword v1, v0, s[2:3] +; GFX10-NOEXPAND-NEXT: s_endpgm +; +; GFX11-EXPAND-LABEL: test_outoforder_lds_and_smem: +; GFX11-EXPAND: ; %bb.0: +; GFX11-EXPAND-NEXT: s_clause 0x1 +; GFX11-EXPAND-NEXT: s_load_b32 s6, s[4:5], 0x24 +; GFX11-EXPAND-NEXT: s_load_b128 s[0:3], s[4:5], 0x2c +; GFX11-EXPAND-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-EXPAND-NEXT: v_mov_b32_e32 v0, s6 +; GFX11-EXPAND-NEXT: s_load_b32 s0, s[0:1], 0x0 +; GFX11-EXPAND-NEXT: ds_load_2addr_b32 v[0:1], v0 offset1:1 +; GFX11-EXPAND-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-EXPAND-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_add_nc_u32 v0, v0, v1 +; GFX11-EXPAND-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-EXPAND-NEXT: v_add_nc_u32_e32 v0, s0, v0 +; GFX11-EXPAND-NEXT: global_store_b32 v1, v0, s[2:3] +; GFX11-EXPAND-NEXT: s_endpgm +; +; GFX11-NOEXPAND-LABEL: test_outoforder_lds_and_smem: +; GFX11-NOEXPAND: ; %bb.0: +; GFX11-NOEXPAND-NEXT: s_clause 0x1 +; GFX11-NOEXPAND-NEXT: s_load_b32 s6, s[4:5], 0x24 +; GFX11-NOEXPAND-NEXT: s_load_b128 s[0:3], s[4:5], 0x2c +; GFX11-NOEXPAND-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NOEXPAND-NEXT: v_mov_b32_e32 v0, s6 +; GFX11-NOEXPAND-NEXT: s_load_b32 s0, s[0:1], 0x0 +; GFX11-NOEXPAND-NEXT: ds_load_2addr_b32 
v[0:1], v0 offset1:1 +; GFX11-NOEXPAND-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NOEXPAND-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_add_nc_u32 v0, v0, v1 +; GFX11-NOEXPAND-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NOEXPAND-NEXT: v_add_nc_u32_e32 v0, s0, v0 +; GFX11-NOEXPAND-NEXT: global_store_b32 v1, v0, s[2:3] +; GFX11-NOEXPAND-NEXT: s_endpgm +; +; GFX12-EXPAND-LABEL: test_outoforder_lds_and_smem: +; GFX12-EXPAND: ; %bb.0: +; GFX12-EXPAND-NEXT: s_clause 0x1 +; GFX12-EXPAND-NEXT: s_load_b32 s6, s[4:5], 0x24 +; GFX12-EXPAND-NEXT: s_load_b128 s[0:3], s[4:5], 0x2c +; GFX12-EXPAND-NEXT: s_wait_kmcnt 0x0 +; GFX12-EXPAND-NEXT: v_mov_b32_e32 v0, s6 +; GFX12-EXPAND-NEXT: s_load_b32 s0, s[0:1], 0x0 +; GFX12-EXPAND-NEXT: ds_load_2addr_b32 v[0:1], v0 offset1:1 +; GFX12-EXPAND-NEXT: s_wait_dscnt 0x0 +; GFX12-EXPAND-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_add_nc_u32 v0, v0, v1 +; GFX12-EXPAND-NEXT: s_wait_kmcnt 0x0 +; GFX12-EXPAND-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-EXPAND-NEXT: v_add_nc_u32_e32 v0, s0, v0 +; GFX12-EXPAND-NEXT: global_store_b32 v1, v0, s[2:3] +; GFX12-EXPAND-NEXT: s_endpgm +; +; GFX12-NOEXPAND-LABEL: test_outoforder_lds_and_smem: +; GFX12-NOEXPAND: ; %bb.0: +; GFX12-NOEXPAND-NEXT: s_clause 0x1 +; GFX12-NOEXPAND-NEXT: s_load_b32 s6, s[4:5], 0x24 +; GFX12-NOEXPAND-NEXT: s_load_b128 s[0:3], s[4:5], 0x2c +; GFX12-NOEXPAND-NEXT: s_wait_kmcnt 0x0 +; GFX12-NOEXPAND-NEXT: v_mov_b32_e32 v0, s6 +; GFX12-NOEXPAND-NEXT: s_load_b32 s0, s[0:1], 0x0 +; GFX12-NOEXPAND-NEXT: ds_load_2addr_b32 v[0:1], v0 offset1:1 +; GFX12-NOEXPAND-NEXT: s_wait_dscnt 0x0 +; GFX12-NOEXPAND-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_add_nc_u32 v0, v0, v1 +; GFX12-NOEXPAND-NEXT: s_wait_kmcnt 0x0 +; GFX12-NOEXPAND-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NOEXPAND-NEXT: v_add_nc_u32_e32 v0, s0, v0 +; GFX12-NOEXPAND-NEXT: global_store_b32 v1, v0, s[2:3] +; GFX12-NOEXPAND-NEXT: s_endpgm + + %lds_val1 = load i32, ptr addrspace(3) %lds_ptr, align 4 + %smem_val = load i32, ptr addrspace(4) %smem_ptr, 
align 4 + %lds_ptr2 = getelementptr i32, ptr addrspace(3) %lds_ptr, i32 1 + %lds_val2 = load i32, ptr addrspace(3) %lds_ptr2, align 4 + %sum1 = add i32 %lds_val1, %lds_val2 + %sum2 = add i32 %sum1, %smem_val + store i32 %sum2, ptr addrspace(1) %out, align 4 + ret void +} + +define amdgpu_kernel void @test_vscnt_global_stores(ptr addrspace(1) %buf) #0 { +; Test vector memory stores (STORE_CNT/vscnt on GFX10-11, storecnt on GFX12+) +; GFX9-EXPAND-LABEL: test_vscnt_global_stores: +; GFX9-EXPAND: ; %bb.0: ; %entry +; GFX9-EXPAND-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9-EXPAND-NEXT: v_mov_b32_e32 v1, 2 +; GFX9-EXPAND-NEXT: v_mov_b32_e32 v2, 1 +; GFX9-EXPAND-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-EXPAND-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-EXPAND-NEXT: global_store_dword v0, v2, s[0:1] +; GFX9-EXPAND-NEXT: global_store_dword v0, v1, s[0:1] offset:256 +; GFX9-EXPAND-NEXT: v_mov_b32_e32 v1, 3 +; GFX9-EXPAND-NEXT: global_store_dword v0, v1, s[0:1] offset:512 +; GFX9-EXPAND-NEXT: s_waitcnt vmcnt(0) +; GFX9-EXPAND-NEXT: s_endpgm +; +; GFX9-NOEXPAND-LABEL: test_vscnt_global_stores: +; GFX9-NOEXPAND: ; %bb.0: ; %entry +; GFX9-NOEXPAND-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9-NOEXPAND-NEXT: v_mov_b32_e32 v1, 2 +; GFX9-NOEXPAND-NEXT: v_mov_b32_e32 v2, 1 +; GFX9-NOEXPAND-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-NOEXPAND-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NOEXPAND-NEXT: global_store_dword v0, v2, s[0:1] +; GFX9-NOEXPAND-NEXT: global_store_dword v0, v1, s[0:1] offset:256 +; GFX9-NOEXPAND-NEXT: v_mov_b32_e32 v1, 3 +; GFX9-NOEXPAND-NEXT: global_store_dword v0, v1, s[0:1] offset:512 +; GFX9-NOEXPAND-NEXT: s_waitcnt vmcnt(0) +; GFX9-NOEXPAND-NEXT: s_endpgm +; +; GFX10-EXPAND-LABEL: test_vscnt_global_stores: +; GFX10-EXPAND: ; %bb.0: ; %entry +; GFX10-EXPAND-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX10-EXPAND-NEXT: v_mov_b32_e32 v1, 1 +; GFX10-EXPAND-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX10-EXPAND-NEXT: v_mov_b32_e32 v2, 2 +; GFX10-EXPAND-NEXT: v_mov_b32_e32 
v3, 3 +; GFX10-EXPAND-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-EXPAND-NEXT: global_store_dword v0, v1, s[0:1] +; GFX10-EXPAND-NEXT: global_store_dword v0, v2, s[0:1] offset:256 +; GFX10-EXPAND-NEXT: global_store_dword v0, v3, s[0:1] offset:512 +; GFX10-EXPAND-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-EXPAND-NEXT: s_endpgm +; +; GFX10-NOEXPAND-LABEL: test_vscnt_global_stores: +; GFX10-NOEXPAND: ; %bb.0: ; %entry +; GFX10-NOEXPAND-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX10-NOEXPAND-NEXT: v_mov_b32_e32 v1, 1 +; GFX10-NOEXPAND-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX10-NOEXPAND-NEXT: v_mov_b32_e32 v2, 2 +; GFX10-NOEXPAND-NEXT: v_mov_b32_e32 v3, 3 +; GFX10-NOEXPAND-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NOEXPAND-NEXT: global_store_dword v0, v1, s[0:1] +; GFX10-NOEXPAND-NEXT: global_store_dword v0, v2, s[0:1] offset:256 +; GFX10-NOEXPAND-NEXT: global_store_dword v0, v3, s[0:1] offset:512 +; GFX10-NOEXPAND-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NOEXPAND-NEXT: s_endpgm +; +; GFX11-EXPAND-LABEL: test_vscnt_global_stores: +; GFX11-EXPAND: ; %bb.0: ; %entry +; GFX11-EXPAND-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX11-EXPAND-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX11-EXPAND-NEXT: v_dual_mov_b32 v2, 2 :: v_dual_mov_b32 v3, 3 +; GFX11-EXPAND-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-EXPAND-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-EXPAND-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-EXPAND-NEXT: s_clause 0x2 +; GFX11-EXPAND-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-EXPAND-NEXT: global_store_b32 v0, v2, s[0:1] offset:256 +; GFX11-EXPAND-NEXT: global_store_b32 v0, v3, s[0:1] offset:512 +; GFX11-EXPAND-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-EXPAND-NEXT: s_endpgm +; +; GFX11-NOEXPAND-LABEL: test_vscnt_global_stores: +; GFX11-NOEXPAND: ; %bb.0: ; %entry +; GFX11-NOEXPAND-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX11-NOEXPAND-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX11-NOEXPAND-NEXT: v_dual_mov_b32 v2, 2 :: v_dual_mov_b32 v3, 3 
+; GFX11-NOEXPAND-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NOEXPAND-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NOEXPAND-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NOEXPAND-NEXT: s_clause 0x2 +; GFX11-NOEXPAND-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NOEXPAND-NEXT: global_store_b32 v0, v2, s[0:1] offset:256 +; GFX11-NOEXPAND-NEXT: global_store_b32 v0, v3, s[0:1] offset:512 +; GFX11-NOEXPAND-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NOEXPAND-NEXT: s_endpgm +; +; GFX12-EXPAND-LABEL: test_vscnt_global_stores: +; GFX12-EXPAND: ; %bb.0: ; %entry +; GFX12-EXPAND-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX12-EXPAND-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX12-EXPAND-NEXT: v_dual_mov_b32 v2, 2 :: v_dual_mov_b32 v3, 3 +; GFX12-EXPAND-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-EXPAND-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX12-EXPAND-NEXT: s_wait_kmcnt 0x0 +; GFX12-EXPAND-NEXT: s_clause 0x2 +; GFX12-EXPAND-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX12-EXPAND-NEXT: global_store_b32 v0, v2, s[0:1] offset:256 +; GFX12-EXPAND-NEXT: global_store_b32 v0, v3, s[0:1] offset:512 +; GFX12-EXPAND-NEXT: global_wb scope:SCOPE_SYS +; GFX12-EXPAND-NEXT: s_wait_storecnt 0x0 +; GFX12-EXPAND-NEXT: s_endpgm +; +; GFX12-NOEXPAND-LABEL: test_vscnt_global_stores: +; GFX12-NOEXPAND: ; %bb.0: ; %entry +; GFX12-NOEXPAND-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX12-NOEXPAND-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX12-NOEXPAND-NEXT: v_dual_mov_b32 v2, 2 :: v_dual_mov_b32 v3, 3 +; GFX12-NOEXPAND-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-NOEXPAND-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX12-NOEXPAND-NEXT: s_wait_kmcnt 0x0 +; GFX12-NOEXPAND-NEXT: s_clause 0x2 +; GFX12-NOEXPAND-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX12-NOEXPAND-NEXT: global_store_b32 v0, v2, s[0:1] offset:256 +; GFX12-NOEXPAND-NEXT: global_store_b32 v0, v3, s[0:1] offset:512 +; GFX12-NOEXPAND-NEXT: global_wb scope:SCOPE_SYS +; GFX12-NOEXPAND-NEXT: s_wait_storecnt 0x0 +; 
GFX12-NOEXPAND-NEXT: s_endpgm +entry: + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %tid64 = zext i32 %tid to i64 + + ; Issue multiple stores + %ptr0 = getelementptr i32, ptr addrspace(1) %buf, i64 %tid64 + store i32 1, ptr addrspace(1) %ptr0, align 4 + + %offset1 = add i64 %tid64, 64 + %ptr1 = getelementptr i32, ptr addrspace(1) %buf, i64 %offset1 + store i32 2, ptr addrspace(1) %ptr1, align 4 + + %offset2 = add i64 %tid64, 128 + %ptr2 = getelementptr i32, ptr addrspace(1) %buf, i64 %offset2 + store i32 3, ptr addrspace(1) %ptr2, align 4 + + ; Memory fence forces wait for all stores + fence release + ret void +} + +define amdgpu_ps void @test_expcnt_exports(float %x, float %y, float %z, float %w) #0 { +; Test export operations (EXP_CNT/expcnt) +; GFX9-EXPAND-LABEL: test_expcnt_exports: +; GFX9-EXPAND: ; %bb.0: ; %entry +; GFX9-EXPAND-NEXT: v_mov_b32_e32 v4, 1.0 +; GFX9-EXPAND-NEXT: exp mrt0 v0, v1, v2, v3 +; GFX9-EXPAND-NEXT: exp mrt1 v3, v2, v1, v0 +; GFX9-EXPAND-NEXT: exp mrt2 v0, v3, v1, v2 +; GFX9-EXPAND-NEXT: exp param0 v4, v4, v4, v4 done +; GFX9-EXPAND-NEXT: s_endpgm +; +; GFX9-NOEXPAND-LABEL: test_expcnt_exports: +; GFX9-NOEXPAND: ; %bb.0: ; %entry +; GFX9-NOEXPAND-NEXT: v_mov_b32_e32 v4, 1.0 +; GFX9-NOEXPAND-NEXT: exp mrt0 v0, v1, v2, v3 +; GFX9-NOEXPAND-NEXT: exp mrt1 v3, v2, v1, v0 +; GFX9-NOEXPAND-NEXT: exp mrt2 v0, v3, v1, v2 +; GFX9-NOEXPAND-NEXT: exp param0 v4, v4, v4, v4 done +; GFX9-NOEXPAND-NEXT: s_endpgm +; +; GFX10-EXPAND-LABEL: test_expcnt_exports: +; GFX10-EXPAND: ; %bb.0: ; %entry +; GFX10-EXPAND-NEXT: v_mov_b32_e32 v4, 1.0 +; GFX10-EXPAND-NEXT: exp mrt0 v0, v1, v2, v3 +; GFX10-EXPAND-NEXT: exp mrt1 v3, v2, v1, v0 +; GFX10-EXPAND-NEXT: exp mrt2 v0, v3, v1, v2 +; GFX10-EXPAND-NEXT: exp param0 v4, v4, v4, v4 done +; GFX10-EXPAND-NEXT: s_endpgm +; +; GFX10-NOEXPAND-LABEL: test_expcnt_exports: +; GFX10-NOEXPAND: ; %bb.0: ; %entry +; GFX10-NOEXPAND-NEXT: v_mov_b32_e32 v4, 1.0 +; GFX10-NOEXPAND-NEXT: exp mrt0 v0, v1, v2, v3 +; 
GFX10-NOEXPAND-NEXT: exp mrt1 v3, v2, v1, v0 +; GFX10-NOEXPAND-NEXT: exp mrt2 v0, v3, v1, v2 +; GFX10-NOEXPAND-NEXT: exp param0 v4, v4, v4, v4 done +; GFX10-NOEXPAND-NEXT: s_endpgm +; +; GFX11-EXPAND-LABEL: test_expcnt_exports: +; GFX11-EXPAND: ; %bb.0: ; %entry +; GFX11-EXPAND-NEXT: v_mov_b32_e32 v4, 1.0 +; GFX11-EXPAND-NEXT: exp mrt0 v0, v1, v2, v3 +; GFX11-EXPAND-NEXT: exp mrt1 v3, v2, v1, v0 +; GFX11-EXPAND-NEXT: exp mrt2 v0, v3, v1, v2 +; GFX11-EXPAND-NEXT: exp invalid_target_32 v4, v4, v4, v4 done +; GFX11-EXPAND-NEXT: s_endpgm +; +; GFX11-NOEXPAND-LABEL: test_expcnt_exports: +; GFX11-NOEXPAND: ; %bb.0: ; %entry +; GFX11-NOEXPAND-NEXT: v_mov_b32_e32 v4, 1.0 +; GFX11-NOEXPAND-NEXT: exp mrt0 v0, v1, v2, v3 +; GFX11-NOEXPAND-NEXT: exp mrt1 v3, v2, v1, v0 +; GFX11-NOEXPAND-NEXT: exp mrt2 v0, v3, v1, v2 +; GFX11-NOEXPAND-NEXT: exp invalid_target_32 v4, v4, v4, v4 done +; GFX11-NOEXPAND-NEXT: s_endpgm +; +; GFX12-EXPAND-LABEL: test_expcnt_exports: +; GFX12-EXPAND: ; %bb.0: ; %entry +; GFX12-EXPAND-NEXT: v_mov_b32_e32 v4, 1.0 +; GFX12-EXPAND-NEXT: export mrt0 v0, v1, v2, v3 +; GFX12-EXPAND-NEXT: export mrt1 v3, v2, v1, v0 +; GFX12-EXPAND-NEXT: export mrt2 v0, v3, v1, v2 +; GFX12-EXPAND-NEXT: export invalid_target_32 v4, v4, v4, v4 done +; GFX12-EXPAND-NEXT: s_endpgm +; +; GFX12-NOEXPAND-LABEL: test_expcnt_exports: +; GFX12-NOEXPAND: ; %bb.0: ; %entry +; GFX12-NOEXPAND-NEXT: v_mov_b32_e32 v4, 1.0 +; GFX12-NOEXPAND-NEXT: export mrt0 v0, v1, v2, v3 +; GFX12-NOEXPAND-NEXT: export mrt1 v3, v2, v1, v0 +; GFX12-NOEXPAND-NEXT: export mrt2 v0, v3, v1, v2 +; GFX12-NOEXPAND-NEXT: export invalid_target_32 v4, v4, v4, v4 done +; GFX12-NOEXPAND-NEXT: s_endpgm +entry: + ; Multiple MRT exports + call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %x, float %y, float %z, float %w, i1 false, i1 false) + call void @llvm.amdgcn.exp.f32(i32 1, i32 15, float %w, float %z, float %y, float %x, i1 false, i1 false) + call void @llvm.amdgcn.exp.f32(i32 2, i32 15, float %x, float %w, float %y, 
float %z, i1 false, i1 false) + ; Final export with done bit + call void @llvm.amdgcn.exp.f32(i32 32, i32 15, float 1.0, float 1.0, float 1.0, float 1.0, i1 true, i1 false) + ret void +} + +declare void @llvm.amdgcn.exp.f32(i32, i32, float, float, float, float, i1, i1) + +attributes #0 = { nounwind ATTRS } _______________________________________________ cfe-commits mailing list [email protected] https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
