Author: Wael Yehia Date: 2026-06-18T10:43:47-04:00 New Revision: 3f7f7323401a51db5cd858e6511c7b85db420aa2
URL: https://github.com/llvm/llvm-project/commit/3f7f7323401a51db5cd858e6511c7b85db420aa2 DIFF: https://github.com/llvm/llvm-project/commit/3f7f7323401a51db5cd858e6511c7b85db420aa2.diff LOG: [PGO] Implement PGO counter promotion for atomic updates (#202487) Currently PGO counter updates are promoted/hoisted out of loops where possible, in order to reduce memory accesses. The promotion is implemented via the LoadAndStorePromoter and SSAUpdater classes. When the updates are relaxed atomic, however, hoisting doesn't happen. Reading the semantics of relaxed atomics, it should be legal to do similar promotions, but teaching LoadAndStorePromoter and SSAUpdater seems like alot of work and would touch common code used by alot of llvm optimizations such as SROA. An easier approach, implemented here, is to perform the promotions on non-atomic updates, then transform the promoted updates to (relaxed) atomic. I also added a flag-guarded sanity check, that a user can use to make sure all PGO counter updates have been made atomic (in case we miss some). On a subset of SPEC2017, which is single threaded, on a Power9 system, the training run times are as follows: ``` Below are data from 3 test runs: noatomic: -fprofile-generate atomic.nopromote: -fprofile-generate -fprofile-update=atomic -mllvm -do-counter-promotion=0 atomic.promote: -fprofile-generate -fprofile-update=atomic -mllvm -do-counter-promotion=1 The values are runtime in seconds of the -fprofile-generate instrumented binary (the train step). and the percentages are the ratio of the runtime over the noatomic runtime (the baseline). atomic.nopromote atomic.promote noatomic 508.namd_r 113.6 250.22% 52.2 114.98% 45.4 511.povray_r 45.8 551.81% 39.3 473.49% 8.3 519.lbm_r 44.6 228.72% 44.5 228.21% 19.5 538.imagick_r 565 1822.58% 46.6 150.32% 31 544.nab_r 824.4 611.12% 224.5 166.42% 134.9 500.perlbench_r 464.5 470.62% 525.3 532.22% 98.7 505.mcf_r 664.9 638.71% 537.7 516.52% 104.1 520.omnetpp_r 373.8 364.33% 271.8 264.91% 102.6 523.xalancbmk_r 1383.2 606.67% 1028.1 450.92% 228 525.x264_r 1097.7 3118.47% 179.7 510.51% 35.2 531.deepsjeng_r 913.1 844.68% 778.2 719.89% 108.1 541.leela_r 1753.3 1109.68% 1336.9 846.14% 158 557.xz_r 392.3 797.36% 224.7 456.71% 49.2 ============================================================================ average 878.07% 417.79% ``` --------- Co-authored-by: Wael Yehia <[email protected]> Added: llvm/test/Transforms/PGOProfile/atomic_counter_promote.ll Modified: clang/docs/ReleaseNotes.rst llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp llvm/test/Transforms/PGOProfile/counter_promo.ll Removed: ################################################################################ diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst index 8a6eb2003e5f0..f1e9b2d52ec97 100644 --- a/clang/docs/ReleaseNotes.rst +++ b/clang/docs/ReleaseNotes.rst @@ -420,6 +420,8 @@ Modified Compiler Flags by ``-unique-internal-linkage-names`` option. Now it uses a path that normalized in favor of the target system (same as the preprocessor does for the file macros) and allows the reproducable IDs on any build system. +- ``-fprofile-update=atomic`` will now promote counter updates out of loops, + similar to the non-atomic case ([#202487](https://github.com/llvm/llvm-project/pull/202487)). - The ``-cl`` ``/Brepro`` option was modified to match the original CL's option and now defines the standard macros __DATE__, __TIME__ and __TIMESTAMP__ to diff --git a/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp b/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp index 6f6ad89126024..6e06a9df2d799 100644 --- a/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp +++ b/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp @@ -38,6 +38,7 @@ #include "llvm/IR/GlobalValue.h" #include "llvm/IR/GlobalVariable.h" #include "llvm/IR/IRBuilder.h" +#include "llvm/IR/InstIterator.h" #include "llvm/IR/Instruction.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/IntrinsicInst.h" @@ -117,6 +118,12 @@ cl::opt<bool> AtomicCounterUpdateAll( cl::desc("Make all profile counter updates atomic (for testing only)"), cl::init(false)); +cl::opt<bool> VerifyAtomicPromotion( + "verify-atomic-counter-promoted", + cl::desc("Check that all profile counter updates were made atomic; no-op " + "if atomic updates are not requested (-fprofile-update=atomic)"), + cl::init(false)); + cl::opt<bool> AtomicCounterUpdatePromoted( "atomic-counter-update-promoted", cl::desc("Do counter update using atomic fetch add " @@ -225,6 +232,19 @@ static SampledInstrumentationConfig getSampledInstrumentationConfig() { using LoadStorePair = std::pair<Instruction *, Instruction *>; +static void makeAtomic(Instruction *Load, Instruction *Store) { + auto *Addition = dyn_cast<BinaryOperator>(Store->getOperand(0)); + assert(Addition && Addition->getOpcode() == Instruction::BinaryOps::Add); + auto *Addend = Addition->getOperand(1); + + IRBuilder<> Builder(Load); + Builder.CreateAtomicRMW(AtomicRMWInst::Add, Store->getOperand(1), Addend, + MaybeAlign(), AtomicOrdering::Monotonic); + Store->eraseFromParent(); + Addition->eraseFromParent(); + Load->eraseFromParent(); +} + static uint64_t getIntModuleFlagOrZero(const Module &M, StringRef Flag) { auto *MD = dyn_cast_or_null<ConstantAsMetadata>(M.getModuleFlag(Flag)); if (!MD) @@ -311,6 +331,9 @@ class InstrLowerer final { /// Returns true if profile counter update register promotion is enabled. bool isCounterPromotionEnabled() const; + /// Returns true if profile counter updates should be atomic. + bool isAtomic() const; + /// Return true if profile sampling is enabled. bool isSamplingEnabled() const; @@ -432,9 +455,10 @@ class PGOCounterPromoterHelper : public LoadAndStorePromoter { BasicBlock *PH, ArrayRef<BasicBlock *> ExitBlocks, ArrayRef<Instruction *> InsertPts, DenseMap<Loop *, SmallVector<LoadStorePair, 8>> &LoopToCands, - LoopInfo &LI) + LoopInfo &LI, bool IsAtomic) : LoadAndStorePromoter({L, S}, SSA), Store(S), ExitBlocks(ExitBlocks), - InsertPts(InsertPts), LoopToCandidates(LoopToCands), LI(LI) { + InsertPts(InsertPts), LoopToCandidates(LoopToCands), LI(LI), + IsAtomic(IsAtomic) { assert(isa<LoadInst>(L)); assert(isa<StoreInst>(S)); SSA.AddAvailableValue(PH, Init); @@ -464,23 +488,21 @@ class PGOCounterPromoterHelper : public LoadAndStorePromoter { Addr = Builder.CreateIntToPtr(BiasInst, PointerType::getUnqual(Ty->getContext())); } - if (AtomicCounterUpdatePromoted) - // automic update currently can only be promoted across the current - // loop, not the whole loop nest. + auto *TargetLoop = + IterativeCounterPromotion ? LI.getLoopFor(ExitBlock) : nullptr; + // Generate the relaxed atomic RMW if we've asked for it and no more + // promotion is possible. + if ((IsAtomic && !TargetLoop) || AtomicCounterUpdatePromoted) Builder.CreateAtomicRMW(AtomicRMWInst::Add, Addr, LiveInValue, - MaybeAlign(), - AtomicOrdering::SequentiallyConsistent); + MaybeAlign(), AtomicOrdering::Monotonic); else { LoadInst *OldVal = Builder.CreateLoad(Ty, Addr, "pgocount.promoted"); auto *NewVal = Builder.CreateAdd(OldVal, LiveInValue); auto *NewStore = Builder.CreateStore(NewVal, Addr); // Now update the parent loop's candidate list: - if (IterativeCounterPromotion) { - auto *TargetLoop = LI.getLoopFor(ExitBlock); - if (TargetLoop) - LoopToCandidates[TargetLoop].emplace_back(OldVal, NewStore); - } + if (TargetLoop) + LoopToCandidates[TargetLoop].emplace_back(OldVal, NewStore); } } } @@ -491,6 +513,7 @@ class PGOCounterPromoterHelper : public LoadAndStorePromoter { ArrayRef<Instruction *> InsertPts; DenseMap<Loop *, SmallVector<LoadStorePair, 8>> &LoopToCandidates; LoopInfo &LI; + const bool IsAtomic; }; /// A helper class to do register promotion for all profile counter @@ -500,8 +523,9 @@ class PGOCounterPromoter { public: PGOCounterPromoter( DenseMap<Loop *, SmallVector<LoadStorePair, 8>> &LoopToCands, - Loop &CurLoop, LoopInfo &LI, BlockFrequencyInfo *BFI) - : LoopToCandidates(LoopToCands), L(CurLoop), LI(LI), BFI(BFI) { + Loop &CurLoop, LoopInfo &LI, BlockFrequencyInfo *BFI, bool IsAtomic) + : LoopToCandidates(LoopToCands), L(CurLoop), LI(LI), BFI(BFI), + IsAtomic(IsAtomic) { // Skip collection of ExitBlocks and InsertPts for loops that will not be // able to have counters promoted. @@ -524,6 +548,16 @@ class PGOCounterPromoter { } bool run(int64_t *NumPromoted) { + bool RC = promoteCandidates(NumPromoted); + if (IsAtomic) + for (auto &Cand : LoopToCandidates[&L]) + if (Cand.first != nullptr && Cand.second != nullptr) + makeAtomic(Cand.first, Cand.second); + return RC; + } + +private: + bool promoteCandidates(int64_t *NumPromoted) { // Skip 'infinite' loops: if (ExitBlocks.size() == 0) return false; @@ -543,9 +577,9 @@ class PGOCounterPromoter { if (MaxProm == 0) return false; + const void *Ptr = LoopToCandidates.getPointerIntoBucketsArray(); unsigned Promoted = 0; for (auto &Cand : LoopToCandidates[&L]) { - SmallVector<PHINode *, 4> NewPHIs; SSAUpdater SSA(&NewPHIs); Value *InitVal = ConstantInt::get(Cand.first->getType(), 0); @@ -563,10 +597,15 @@ class PGOCounterPromoter { continue; } - PGOCounterPromoterHelper Promoter(Cand.first, Cand.second, SSA, InitVal, - L.getLoopPreheader(), ExitBlocks, - InsertPts, LoopToCandidates, LI); + PGOCounterPromoterHelper Promoter( + Cand.first, Cand.second, SSA, InitVal, L.getLoopPreheader(), + ExitBlocks, InsertPts, LoopToCandidates, LI, IsAtomic); Promoter.run(SmallVector<Instruction *, 2>({Cand.first, Cand.second})); + + assert(LoopToCandidates.isPointerIntoBucketsArray(Ptr) && + "References into LoopToCandidates might be invalid"); + Cand = {nullptr, nullptr}; + Promoted++; if (Promoted >= MaxProm) break; @@ -660,6 +699,7 @@ class PGOCounterPromoter { Loop &L; LoopInfo &LI; BlockFrequencyInfo *BFI; + const bool IsAtomic; }; enum class ValueProfilingCallType { @@ -870,10 +910,29 @@ bool InstrLowerer::isSamplingEnabled() const { bool InstrLowerer::isCounterPromotionEnabled() const { if (DoCounterPromotion.getNumOccurrences() > 0) return DoCounterPromotion; - return Options.DoCounterPromotion; } +bool InstrLowerer::isAtomic() const { + return Options.Atomic || AtomicCounterUpdateAll; +} + +static void doAtomicPromotionCheck(Function *F) { + for (const llvm::Instruction &I : llvm::instructions(F)) { + const Value *Addr = nullptr; + if (const LoadInst *LI = dyn_cast<LoadInst>(&I)) + Addr = LI->getOperand(0); + else if (const StoreInst *LI = dyn_cast<StoreInst>(&I)) + Addr = LI->getOperand(1); + + if (Addr && Addr->stripInBoundsOffsets()->getName().starts_with( + getInstrProfCountersVarPrefix())) { + LLVM_DEBUG(dbgs() << "Missed candidate: "; I.dump()); + report_fatal_error("Candidate load/store not converted to atomic"); + } + } +} + void InstrLowerer::promoteCounterLoadStores(Function *F) { if (!isCounterPromotionEnabled()) return; @@ -894,8 +953,11 @@ void InstrLowerer::promoteCounterLoadStores(Function *F) { auto *CounterStore = LoadStore.second; BasicBlock *BB = CounterLoad->getParent(); Loop *ParentLoop = LI.getLoopFor(BB); - if (!ParentLoop) + if (!ParentLoop) { + if (isAtomic()) + makeAtomic(CounterLoad, CounterStore); continue; + } LoopPromotionCandidates[ParentLoop].emplace_back(CounterLoad, CounterStore); } @@ -904,9 +966,13 @@ void InstrLowerer::promoteCounterLoadStores(Function *F) { // Do a post-order traversal of the loops so that counter updates can be // iteratively hoisted outside the loop nest. for (auto *Loop : llvm::reverse(Loops)) { - PGOCounterPromoter Promoter(LoopPromotionCandidates, *Loop, LI, BFI.get()); + PGOCounterPromoter Promoter(LoopPromotionCandidates, *Loop, LI, BFI.get(), + isAtomic()); Promoter.run(&TotalCountersPromoted); } + + if (isAtomic() && VerifyAtomicPromotion) + doAtomicPromotionCheck(F); } static bool needsRuntimeHookUnconditionally(const Triple &TT) { @@ -1215,7 +1281,7 @@ void InstrLowerer::lowerIncrement(InstrProfIncrementInst *Inc) { Value *StepI64 = Builder.CreateZExtOrTrunc(Inc->getStep(), Int64Ty, "step.i64"); Builder.CreateCall(Callee, {CastAddr, Uniform, StepI64}); - } else if (Options.Atomic || AtomicCounterUpdateAll || + } else if ((!isCounterPromotionEnabled() && isAtomic()) || (Inc->getIndex()->isNullValue() && AtomicFirstCounter)) { Builder.CreateAtomicRMW(AtomicRMWInst::Add, Addr, Inc->getStep(), MaybeAlign(), AtomicOrdering::Monotonic); @@ -1284,7 +1350,7 @@ void InstrLowerer::lowerMCDCTestVectorBitmapUpdate( // %mcdc.bits = load i8, ptr %4, align 1 auto *Bitmap = Builder.CreateLoad(Int8Ty, BitmapByteAddr, "mcdc.bits"); - if (Options.Atomic || AtomicCounterUpdateAll) { + if (isAtomic()) { // If ((Bitmap & Val) != Val), then execute atomic (Bitmap |= Val). // Note, just-loaded Bitmap might not be up-to-date. Use it just for // early testing. diff --git a/llvm/test/Transforms/PGOProfile/atomic_counter_promote.ll b/llvm/test/Transforms/PGOProfile/atomic_counter_promote.ll new file mode 100644 index 0000000000000..4dd3f3a249cf2 --- /dev/null +++ b/llvm/test/Transforms/PGOProfile/atomic_counter_promote.ll @@ -0,0 +1,61 @@ +; RUN: opt < %s -passes=instrprof -instrprof-atomic-counter-update-all -do-counter-promotion -S | FileCheck %s + +; CHECK: define i32 @foo(i32 %n) { +; CHECK: entry: +; CHECK: atomicrmw add {{.*}}ptr @__profc_foo +; +; CHECK: for.cond.for.cond.cleanup_crit_edge: +; CHECK-NOT: br +; CHECK: atomicrmw add {{.*}}ptr @__profc_foo +; CHECK-NEXT: atomicrmw add {{.*}}ptr @__profc_foo + +@__profn_foo = private constant [3 x i8] c"foo" + +define i32 @foo(i32 %n) { +entry: + call void @llvm.instrprof.increment(ptr @__profn_foo, i64 1124680652598534200, i32 3, i32 2) + %cmp16 = icmp slt i32 0, %n + br i1 %cmp16, label %for.cond1.preheader.lr.ph, label %for.cond.cleanup + +for.cond1.preheader.lr.ph: + br label %for.cond1.preheader + +for.cond1.preheader: + %i.018 = phi i32 [ 0, %for.cond1.preheader.lr.ph ], [ %inc6, %for.cond.cleanup3 ] + %x.017 = phi i32 [ 0, %for.cond1.preheader.lr.ph ], [ %x.1.lcssa, %for.cond.cleanup3 ] + %cmp213 = icmp slt i32 0, %n + br i1 %cmp213, label %for.body4.lr.ph, label %for.cond.cleanup3 + +for.body4.lr.ph: + br label %for.body4 + +for.cond.for.cond.cleanup_crit_edge: + %split19 = phi i32 [ %x.1.lcssa, %for.cond.cleanup3 ] + br label %for.cond.cleanup + +for.cond.cleanup: + %x.0.lcssa = phi i32 [ %split19, %for.cond.for.cond.cleanup_crit_edge ], [ 0, %entry ] + ret i32 %x.0.lcssa + +for.cond1.for.cond.cleanup3_crit_edge: + %split = phi i32 [ %add, %for.body4 ] + br label %for.cond.cleanup3 + +for.cond.cleanup3: + %x.1.lcssa = phi i32 [ %split, %for.cond1.for.cond.cleanup3_crit_edge ], [ %x.017, %for.cond1.preheader ] + call void @llvm.instrprof.increment(ptr @__profn_foo, i64 1124680652598534200, i32 3, i32 1) + %inc6 = add nuw nsw i32 %i.018, 1 + %cmp = icmp slt i32 %inc6, %n + br i1 %cmp, label %for.cond1.preheader, label %for.cond.for.cond.cleanup_crit_edge + +for.body4: + %j.015 = phi i32 [ 0, %for.body4.lr.ph ], [ %inc, %for.body4 ] + %x.114 = phi i32 [ %x.017, %for.body4.lr.ph ], [ %add, %for.body4 ] + call void @llvm.instrprof.increment(ptr @__profn_foo, i64 1124680652598534200, i32 3, i32 0) + %add = add nsw i32 %x.114, %j.015 + %inc = add nuw nsw i32 %j.015, 1 + %cmp2 = icmp slt i32 %inc, %n + br i1 %cmp2, label %for.body4, label %for.cond1.for.cond.cleanup3_crit_edge +} + +declare void @llvm.instrprof.increment(ptr, i64, i32, i32) diff --git a/llvm/test/Transforms/PGOProfile/counter_promo.ll b/llvm/test/Transforms/PGOProfile/counter_promo.ll index f4c4d2a8123a3..16ff170c107c6 100644 --- a/llvm/test/Transforms/PGOProfile/counter_promo.ll +++ b/llvm/test/Transforms/PGOProfile/counter_promo.ll @@ -55,9 +55,9 @@ bb12: ; preds = %bb9 ; NONATOMIC_PROMO-NEXT: %[[PROMO3:[a-z0-9.]+]] = load {{.*}} @__profc_foo{{.*}} 2) ; NONATOMIC_PROMO-NEXT: add {{.*}} %[[PROMO3]], %[[LIVEOUT3]] ; NONATOMIC_PROMO-NEXT: store {{.*}}@__profc_foo{{.*}}2) -; ATOMIC_PROMO: atomicrmw add {{.*}} @__profc_foo{{.*}}, i64 %[[LIVEOUT1]] seq_cst -; ATOMIC_PROMO-NEXT: atomicrmw add {{.*}} @__profc_foo{{.*}}1), i64 %[[LIVEOUT2]] seq_cst -; ATOMIC_PROMO-NEXT: atomicrmw add {{.*}} @__profc_foo{{.*}}2), i64 %[[LIVEOUT3]] seq_cst +; ATOMIC_PROMO: atomicrmw add {{.*}} @__profc_foo{{.*}}, i64 %[[LIVEOUT1]] monotonic +; ATOMIC_PROMO-NEXT: atomicrmw add {{.*}} @__profc_foo{{.*}}1), i64 %[[LIVEOUT2]] monotonic +; ATOMIC_PROMO-NEXT: atomicrmw add {{.*}} @__profc_foo{{.*}}2), i64 %[[LIVEOUT3]] monotonic ; PROMO-NOT: @__profc_foo{{.*}}) _______________________________________________ cfe-commits mailing list [email protected] https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
