llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT--> @llvm/pr-subscribers-clang-codegen Author: Pankaj Dwivedi (PankajDwivedi-25) <details> <summary>Changes</summary> Reverts llvm/llvm-project#<!-- -->169345 to address the reviews --- Patch is 68.32 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/175904.diff 7 Files Affected: - (modified) clang/include/clang/Basic/CodeGenOptions.def (-4) - (modified) clang/include/clang/Options/Options.td (-7) - (modified) clang/lib/CodeGen/Targets/AMDGPU.cpp (-2) - (modified) llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp (+92-203) - (modified) llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp (-19) - (modified) llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h (-20) - (removed) llvm/test/CodeGen/AMDGPU/expand-waitcnt-profiling.ll (-944) ``````````diff diff --git a/clang/include/clang/Basic/CodeGenOptions.def b/clang/include/clang/Basic/CodeGenOptions.def index baf8b093c10e6..6cdbffc456193 100644 --- a/clang/include/clang/Basic/CodeGenOptions.def +++ b/clang/include/clang/Basic/CodeGenOptions.def @@ -466,10 +466,6 @@ CODEGENOPT(AAPCSBitfieldWidth, 1, 1, Benign) /// propagate signaling NaN inputs per IEEE 754-2008 (AMDGPU Only) CODEGENOPT(EmitIEEENaNCompliantInsts, 1, 1, Benign) -/// Enable expanded waitcnt for profiling (AMDGPU Only) -/// Expands s_waitcnt instructions to help PC-sampling profilers identify stalls. -CODEGENOPT(AMDGPUExpandWaitcntProfiling, 1, 0, Benign) - // Whether to emit Swift Async function extended frame information: auto, // never, always. ENUM_CODEGENOPT(SwiftAsyncFramePointer, SwiftAsyncFramePointerKind, 2, diff --git a/clang/include/clang/Options/Options.td b/clang/include/clang/Options/Options.td index 2f57a5b13b917..5ad0ff2a773c8 100644 --- a/clang/include/clang/Options/Options.td +++ b/clang/include/clang/Options/Options.td @@ -5585,13 +5585,6 @@ defm amdgpu_ieee : BoolMOption<"amdgpu-ieee", "This option changes the ABI. (AMDGPU only)">, NegFlag<SetFalse, [], [ClangOption, CC1Option]>>; -defm amdgpu_expand_waitcnt_profiling : BoolMOption<"amdgpu-expand-waitcnt-profiling", - CodeGenOpts<"AMDGPUExpandWaitcntProfiling">, DefaultFalse, - PosFlag<SetTrue, [], [ClangOption, CC1Option], "Expand s_waitcnt instructions to help " - "PC-sampling profilers identify memory stalls. Instead of a single waitcnt(target), " - "emits waitcnt(N-1), waitcnt(N-2), ..., waitcnt(target). (AMDGPU only)">, - NegFlag<SetFalse, [], [ClangOption]>>; - def mcode_object_version_EQ : Joined<["-"], "mcode-object-version=">, Group<m_Group>, HelpText<"Specify code object ABI version. Defaults to 6. (AMDGPU only)">, Visibility<[ClangOption, FlangOption, CC1Option, FC1Option]>, diff --git a/clang/lib/CodeGen/Targets/AMDGPU.cpp b/clang/lib/CodeGen/Targets/AMDGPU.cpp index 4bc9557b26b52..0ab6c753b8bad 100644 --- a/clang/lib/CodeGen/Targets/AMDGPU.cpp +++ b/clang/lib/CodeGen/Targets/AMDGPU.cpp @@ -443,8 +443,6 @@ void AMDGPUTargetCodeGenInfo::setTargetAttributes( setFunctionDeclAttributes(FD, F, M); if (!getABIInfo().getCodeGenOpts().EmitIEEENaNCompliantInsts) F->addFnAttr("amdgpu-ieee", "false"); - if (getABIInfo().getCodeGenOpts().AMDGPUExpandWaitcntProfiling) - F->addFnAttr("amdgpu-expand-waitcnt-profiling"); } unsigned AMDGPUTargetCodeGenInfo::getDeviceKernelCallingConv() const { diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp index b3e834b66ad45..bf842e0ecb4af 100644 --- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp +++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp @@ -105,35 +105,6 @@ auto inst_counter_types(InstCounterType MaxCounter = NUM_INST_CNTS) { return enum_seq(LOAD_CNT, MaxCounter); } -// Get the maximum wait count value for a given counter type. -static unsigned getWaitCountMax(const AMDGPU::HardwareLimits &Limits, - InstCounterType T) { - switch (T) { - case LOAD_CNT: - return Limits.LoadcntMax; - case DS_CNT: - return Limits.DscntMax; - case EXP_CNT: - return Limits.ExpcntMax; - case STORE_CNT: - return Limits.StorecntMax; - case SAMPLE_CNT: - return Limits.SamplecntMax; - case BVH_CNT: - return Limits.BvhcntMax; - case KM_CNT: - return Limits.KmcntMax; - case X_CNT: - return Limits.XcntMax; - case VA_VDST: - return Limits.VaVdstMax; - case VM_VSRC: - return Limits.VmVsrcMax; - default: - return 0; - } -} - /// Integer IDs used to track vector memory locations we may have to wait on. /// Encoded as u16 chunks: /// @@ -169,6 +140,19 @@ static constexpr VMEMID toVMEMID(MCRegUnit RU) { return static_cast<unsigned>(RU); } +struct HardwareLimits { + unsigned LoadcntMax; // Corresponds to VMcnt prior to gfx12. + unsigned ExpcntMax; + unsigned DscntMax; // Corresponds to LGKMcnt prior to gfx12. + unsigned StorecntMax; // Corresponds to VScnt in gfx10/gfx11. + unsigned SamplecntMax; // gfx12+ only. + unsigned BvhcntMax; // gfx12+ only. + unsigned KmcntMax; // gfx12+ only. + unsigned XcntMax; // gfx1250. + unsigned VaVdstMax; // gfx12+ expert mode only. + unsigned VmVsrcMax; // gfx12+ expert mode only. +}; + #define AMDGPU_DECLARE_WAIT_EVENTS(DECL) \ DECL(VMEM_ACCESS) /* vmem read & write (pre-gfx10), vmem read (gfx10+) */ \ DECL(VMEM_SAMPLER_READ_ACCESS) /* vmem SAMPLER read (gfx12+ only) */ \ @@ -330,27 +314,19 @@ class WaitcntGenerator { AMDGPU::IsaVersion IV; InstCounterType MaxCounter; bool OptNone; - bool ExpandWaitcntProfiling = false; - const AMDGPU::HardwareLimits *Limits = nullptr; public: WaitcntGenerator() = default; - WaitcntGenerator(const MachineFunction &MF, InstCounterType MaxCounter, - const AMDGPU::HardwareLimits *Limits) + WaitcntGenerator(const MachineFunction &MF, InstCounterType MaxCounter) : ST(&MF.getSubtarget<GCNSubtarget>()), TII(ST->getInstrInfo()), IV(AMDGPU::getIsaVersion(ST->getCPU())), MaxCounter(MaxCounter), OptNone(MF.getFunction().hasOptNone() || - MF.getTarget().getOptLevel() == CodeGenOptLevel::None), - ExpandWaitcntProfiling( - MF.getFunction().hasFnAttribute("amdgpu-expand-waitcnt-profiling")), - Limits(Limits) {} + MF.getTarget().getOptLevel() == CodeGenOptLevel::None) {} // Return true if the current function should be compiled with no // optimization. bool isOptNone() const { return OptNone; } - const AMDGPU::HardwareLimits &getLimits() const { return *Limits; } - // Edits an existing sequence of wait count instructions according // to an incoming Waitcnt value, which is itself updated to reflect // any new wait count instructions which may need to be generated by @@ -372,11 +348,9 @@ class WaitcntGenerator { // Generates new wait count instructions according to the value of // Wait, returning true if any new instructions were created. - // If ScoreBrackets is provided, it can be used for profiling expansion. virtual bool createNewWaitcnt(MachineBasicBlock &Block, MachineBasicBlock::instr_iterator It, - AMDGPU::Waitcnt Wait, - WaitcntBrackets *ScoreBrackets = nullptr) = 0; + AMDGPU::Waitcnt Wait) = 0; // Returns an array of bit masks which can be used to map values in // WaitEventType to corresponding counter values in InstCounterType. @@ -401,10 +375,7 @@ class WaitcntGenerator { class WaitcntGeneratorPreGFX12 : public WaitcntGenerator { public: - WaitcntGeneratorPreGFX12() = default; - WaitcntGeneratorPreGFX12(const MachineFunction &MF, - const AMDGPU::HardwareLimits *Limits) - : WaitcntGenerator(MF, NUM_NORMAL_INST_CNTS, Limits) {} + using WaitcntGenerator::WaitcntGenerator; bool applyPreexistingWaitcnt(WaitcntBrackets &ScoreBrackets, @@ -413,8 +384,7 @@ class WaitcntGeneratorPreGFX12 : public WaitcntGenerator { bool createNewWaitcnt(MachineBasicBlock &Block, MachineBasicBlock::instr_iterator It, - AMDGPU::Waitcnt Wait, - WaitcntBrackets *ScoreBrackets = nullptr) override; + AMDGPU::Waitcnt Wait) override; const unsigned *getWaitEventMask() const override { assert(ST); @@ -446,10 +416,8 @@ class WaitcntGeneratorGFX12Plus : public WaitcntGenerator { public: WaitcntGeneratorGFX12Plus() = default; WaitcntGeneratorGFX12Plus(const MachineFunction &MF, - InstCounterType MaxCounter, - const AMDGPU::HardwareLimits *Limits, - bool IsExpertMode) - : WaitcntGenerator(MF, MaxCounter, Limits), IsExpertMode(IsExpertMode) {} + InstCounterType MaxCounter, bool IsExpertMode) + : WaitcntGenerator(MF, MaxCounter), IsExpertMode(IsExpertMode) {} bool applyPreexistingWaitcnt(WaitcntBrackets &ScoreBrackets, @@ -458,8 +426,7 @@ class WaitcntGeneratorGFX12Plus : public WaitcntGenerator { bool createNewWaitcnt(MachineBasicBlock &Block, MachineBasicBlock::instr_iterator It, - AMDGPU::Waitcnt Wait, - WaitcntBrackets *ScoreBrackets = nullptr) override; + AMDGPU::Waitcnt Wait) override; const unsigned *getWaitEventMask() const override { assert(ST); @@ -533,7 +500,7 @@ class SIInsertWaitcnts { // message. DenseSet<MachineInstr *> ReleaseVGPRInsts; - AMDGPU::HardwareLimits Limits; + HardwareLimits Limits; public: SIInsertWaitcnts(MachineLoopInfo *MLI, MachinePostDominatorTree *PDT, @@ -544,7 +511,33 @@ class SIInsertWaitcnts { (void)ForceVMCounter; } - const AMDGPU::HardwareLimits &getLimits() const { return Limits; } + unsigned getWaitCountMax(InstCounterType T) const { + switch (T) { + case LOAD_CNT: + return Limits.LoadcntMax; + case DS_CNT: + return Limits.DscntMax; + case EXP_CNT: + return Limits.ExpcntMax; + case STORE_CNT: + return Limits.StorecntMax; + case SAMPLE_CNT: + return Limits.SamplecntMax; + case BVH_CNT: + return Limits.BvhcntMax; + case KM_CNT: + return Limits.KmcntMax; + case X_CNT: + return Limits.XcntMax; + case VA_VDST: + return Limits.VaVdstMax; + case VM_VSRC: + return Limits.VmVsrcMax; + default: + break; + } + return 0; + } PreheaderFlushFlags getPreheaderFlushFlags(MachineLoop *ML, const WaitcntBrackets &Brackets); @@ -776,7 +769,7 @@ class WaitcntBrackets { unsigned getPendingGDSWait() const { return std::min(getScoreUB(DS_CNT) - LastGDS, - getWaitCountMax(Context->getLimits(), DS_CNT) - 1); + Context->getWaitCountMax(DS_CNT) - 1); } void setPendingGDS() { LastGDS = ScoreUBs[DS_CNT]; } @@ -803,8 +796,8 @@ class WaitcntBrackets { } void setStateOnFunctionEntryOrReturn() { - setScoreUB(STORE_CNT, getScoreUB(STORE_CNT) + - getWaitCountMax(Context->getLimits(), STORE_CNT)); + setScoreUB(STORE_CNT, + getScoreUB(STORE_CNT) + Context->getWaitCountMax(STORE_CNT)); PendingEvents |= Context->WaitEventMaskForInst[STORE_CNT]; } @@ -860,9 +853,8 @@ class WaitcntBrackets { if (T != EXP_CNT) return; - if (getScoreRange(EXP_CNT) > getWaitCountMax(Context->getLimits(), EXP_CNT)) - ScoreLBs[EXP_CNT] = - ScoreUBs[EXP_CNT] - getWaitCountMax(Context->getLimits(), EXP_CNT); + if (getScoreRange(EXP_CNT) > Context->getWaitCountMax(EXP_CNT)) + ScoreLBs[EXP_CNT] = ScoreUBs[EXP_CNT] - Context->getWaitCountMax(EXP_CNT); } void setRegScore(MCPhysReg Reg, InstCounterType T, unsigned Val) { @@ -1365,8 +1357,8 @@ void WaitcntBrackets::determineWaitForScore(InstCounterType T, } else { // If a counter has been maxed out avoid overflow by waiting for // MAX(CounterType) - 1 instead. - unsigned NeededWait = std::min( - UB - ScoreToWait, getWaitCountMax(Context->getLimits(), T) - 1); + unsigned NeededWait = + std::min(UB - ScoreToWait, Context->getWaitCountMax(T) - 1); addWait(Wait, T, NeededWait); } } @@ -1683,109 +1675,38 @@ bool WaitcntGeneratorPreGFX12::applyPreexistingWaitcnt( /// required counters in \p Wait bool WaitcntGeneratorPreGFX12::createNewWaitcnt( MachineBasicBlock &Block, MachineBasicBlock::instr_iterator It, - AMDGPU::Waitcnt Wait, WaitcntBrackets *ScoreBrackets) { + AMDGPU::Waitcnt Wait) { assert(ST); assert(isNormalMode(MaxCounter)); bool Modified = false; const DebugLoc &DL = Block.findDebugLoc(It); - // Helper to emit expanded waitcnt sequence for profiling. - // Emits waitcnts from (Outstanding-1) down to Target, or just Target if - // nothing to expand. The EmitWaitcnt callback emits a single waitcnt. - auto EmitExpandedWaitcnt = [&](unsigned Outstanding, unsigned Target, - auto EmitWaitcnt) { - if (Outstanding > Target) { - for (unsigned i = Outstanding - 1; i >= Target && i != ~0u; --i) { - EmitWaitcnt(i); - Modified = true; - } - } else { - EmitWaitcnt(Target); - Modified = true; - } - }; - // Waits for VMcnt, LKGMcnt and/or EXPcnt are encoded together into a // single instruction while VScnt has its own instruction. if (Wait.hasWaitExceptStoreCnt()) { - // If profiling expansion is enabled and we have score brackets, - // emit an expanded sequence - if (ExpandWaitcntProfiling && ScoreBrackets) { - // Check if any of the counters to be waited on are out-of-order. - // If so, fall back to normal (non-expanded) behavior since expansion - // would provide misleading profiling information. - bool AnyOutOfOrder = false; - for (auto CT : {LOAD_CNT, DS_CNT, EXP_CNT}) { - unsigned &WaitCnt = getCounterRef(Wait, CT); - if (WaitCnt != ~0u && ScoreBrackets->counterOutOfOrder(CT)) { - AnyOutOfOrder = true; - break; - } - } - - if (AnyOutOfOrder) { - // Fall back to non-expanded wait - unsigned Enc = AMDGPU::encodeWaitcnt(IV, Wait); + unsigned Enc = AMDGPU::encodeWaitcnt(IV, Wait); + [[maybe_unused]] auto SWaitInst = BuildMI(Block, It, DL, TII->get(AMDGPU::S_WAITCNT)).addImm(Enc); - Modified = true; - } else { - // All counters are in-order, safe to expand - for (auto CT : {LOAD_CNT, DS_CNT, EXP_CNT}) { - unsigned &WaitCnt = getCounterRef(Wait, CT); - if (WaitCnt == ~0u) - continue; - - unsigned Outstanding = std::min(ScoreBrackets->getScoreUB(CT) - - ScoreBrackets->getScoreLB(CT), - getWaitCountMax(getLimits(), CT) - 1); - EmitExpandedWaitcnt(Outstanding, WaitCnt, [&](unsigned Count) { - AMDGPU::Waitcnt W; - getCounterRef(W, CT) = Count; - BuildMI(Block, It, DL, TII->get(AMDGPU::S_WAITCNT)) - .addImm(AMDGPU::encodeWaitcnt(IV, W)); - }); - } - } - } else { - // Normal behavior: emit single combined waitcnt - unsigned Enc = AMDGPU::encodeWaitcnt(IV, Wait); - [[maybe_unused]] auto SWaitInst = - BuildMI(Block, It, DL, TII->get(AMDGPU::S_WAITCNT)).addImm(Enc); - Modified = true; + Modified = true; - LLVM_DEBUG(dbgs() << "PreGFX12::createNewWaitcnt\n"; - if (It != Block.instr_end()) dbgs() << "Old Instr: " << *It; - dbgs() << "New Instr: " << *SWaitInst << '\n'); - } + LLVM_DEBUG(dbgs() << "PreGFX12::createNewWaitcnt\n"; + if (It != Block.instr_end()) dbgs() << "Old Instr: " << *It; + dbgs() << "New Instr: " << *SWaitInst << '\n'); } if (Wait.hasWaitStoreCnt()) { assert(ST->hasVscnt()); - if (ExpandWaitcntProfiling && ScoreBrackets && Wait.StoreCnt != ~0u && - !ScoreBrackets->counterOutOfOrder(STORE_CNT)) { - // Only expand if counter is not out-of-order - unsigned Outstanding = - std::min(ScoreBrackets->getScoreUB(STORE_CNT) - - ScoreBrackets->getScoreLB(STORE_CNT), - getWaitCountMax(getLimits(), STORE_CNT) - 1); - EmitExpandedWaitcnt(Outstanding, Wait.StoreCnt, [&](unsigned Count) { + [[maybe_unused]] auto SWaitInst = BuildMI(Block, It, DL, TII->get(AMDGPU::S_WAITCNT_VSCNT)) .addReg(AMDGPU::SGPR_NULL, RegState::Undef) - .addImm(Count); - }); - } else { - [[maybe_unused]] auto SWaitInst = - BuildMI(Block, It, DL, TII->get(AMDGPU::S_WAITCNT_VSCNT)) - .addReg(AMDGPU::SGPR_NULL, RegState::Undef) - .addImm(Wait.StoreCnt); - Modified = true; + .addImm(Wait.StoreCnt); + Modified = true; - LLVM_DEBUG(dbgs() << "PreGFX12::createNewWaitcnt\n"; - if (It != Block.instr_end()) dbgs() << "Old Instr: " << *It; - dbgs() << "New Instr: " << *SWaitInst << '\n'); - } + LLVM_DEBUG(dbgs() << "PreGFX12::createNewWaitcnt\n"; + if (It != Block.instr_end()) dbgs() << "Old Instr: " << *It; + dbgs() << "New Instr: " << *SWaitInst << '\n'); } return Modified; @@ -2082,55 +2003,13 @@ bool WaitcntGeneratorGFX12Plus::applyPreexistingWaitcnt( /// Generate S_WAIT_*CNT instructions for any required counters in \p Wait bool WaitcntGeneratorGFX12Plus::createNewWaitcnt( MachineBasicBlock &Block, MachineBasicBlock::instr_iterator It, - AMDGPU::Waitcnt Wait, WaitcntBrackets *ScoreBrackets) { + AMDGPU::Waitcnt Wait) { assert(ST); assert(!isNormalMode(MaxCounter)); bool Modified = false; const DebugLoc &DL = Block.findDebugLoc(It); - // Helper to emit expanded waitcnt sequence for profiling. - auto EmitExpandedWaitcnt = [&](unsigned Outstanding, unsigned Target, - auto EmitWaitcnt) { - if (Outstanding > Target) { - for (unsigned i = Outstanding - 1; i >= Target && i != ~0u; --i) { - EmitWaitcnt(i); - Modified = true; - } - } else { - EmitWaitcnt(Target); - Modified = true; - } - }; - - // For GFX12+, we use separate wait instructions, which makes expansion - // simpler - if (ExpandWaitcntProfiling && ScoreBrackets) { - for (auto CT : inst_counter_types(NUM_EXTENDED_INST_CNTS)) { - unsigned Count = getWait(Wait, CT); - if (Count == ~0u) - continue; - - // Skip expansion for out-of-order counters - emit normal wait instead - if (ScoreBrackets->counterOutOfOrder(CT)) { - BuildMI(Block, It, DL, TII->get(instrsForExtendedCounterTypes[CT])) - .addImm(Count); - Modified = true; - continue; - } - - unsigned Outstanding = std::min(ScoreBrackets->getScoreUB(CT) - - ScoreBrackets->getScoreLB(CT), - getWaitCountMax(getLimits(), CT) - 1); - EmitExpandedWaitcnt(Outstanding, Count, [&](unsigned Val) { - BuildMI(Block, It, DL, TII->get(instrsForExtendedCounterTypes[CT])) - .addImm(Val); - }); - } - return Modified; - } - - // Normal behavior (no expansion) // Check for opportunities to use combined wait instructions. if (Wait.DsCnt != ~0u) { MachineInstr *SWaitInst = nullptr; @@ -2529,7 +2408,9 @@ bool SIInsertWaitcnts::generateWaitcnt(AMDGPU::Waitcnt Wait, Modified = WCG->applyPreexistingWaitcnt(ScoreBrackets, *OldWaitcntInstr, Wait, It); - AMDGPU::Waitcnt WaitForScore = Wait; + // Any counts that could have been applied to any existing waitcnt + // instructions will have been done so, now deal with any remaining. + ScoreBrackets.applyWaitcnt(Wait); // ExpCnt can be merged into VINTERP. if (Wait.ExpCnt != ~0u && It != Block.instr_end() && @@ -2546,13 +2427,9 @@ bool SIInsertWaitcnts::generateWaitcnt(AMDGPU::Waitcnt Wait, << "Update Instr: " << *It); } - if (WCG->createNewWaitcnt(Block, It, Wait, &ScoreBrackets)) + if (WCG->createNewWaitcnt(Block, It, Wait)) Modified = true; - // Any counts that could have been applied to any existing waitcnt - // instructions will have been done so, now deal with any remaining. - ScoreBrackets.applyWaitcnt(WaitForScore); - return Modified; } @@ -3259,9 +3136,6 @@ bool SIInsertWaitcnts::run(MachineFunction &MF) { AMDGPU::IsaVersion IV = AMDGPU::getIsaVersion(ST->getCPU()); - // Initialize hardware limits first, as they're needed by the generators. - Limits = AMDGPU::HardwareLimits(IV, ST->hasExtendedWaitCounts()); - if (ST->hasExtendedWaitCounts()) { IsExpertMode = ST->hasExpertSchedulingMode() && (ExpertSchedulingModeFlag.getNumOccurrences() @@ -3270,12 +3144,11 @@ bool SIInsertWaitcnts::run(MachineFunction &MF) { .getFnAttrib... [truncated] `````````` </details> https://github.com/llvm/llvm-project/pull/175904 _______________________________________________ cfe-commits mailing list [email protected] https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
