https://github.com/mustartt created
https://github.com/llvm/llvm-project/pull/202004
When `-funique-internal-linkage-names` is supplied, a unique suffix is appended
even to a static function with an asm label, when it really shouldn't be.
```c
static int impl(int x) asm("__impl"); // __impl.__uniq.68358509610070717889884130747296293671
static int impl(int x) { return x + 1; }
```
>From b4bb7a5b02beb7812deee7cd10bd5e64075a936b Mon Sep 17 00:00:00 2001
From: Henry Jiang <[email protected]>
Date: Fri, 5 Jun 2026 10:18:20 -0700
Subject: [PATCH 1/2] wip: shorten live range?
---
llvm/lib/CodeGen/CodeGenPrepare.cpp | 133 +++++++++
llvm/lib/CodeGen/MachineSink.cpp | 256 +++++++++++++++++-
.../machine-sink-diamond-load-coexec.mir | 51 ++++
.../AArch64/machine-sink-diamond-load.mir | 64 +++++
.../AArch64/sink-diamond-load.ll | 68 +++++
5 files changed, 570 insertions(+), 2 deletions(-)
create mode 100644 llvm/test/CodeGen/AArch64/machine-sink-diamond-load-coexec.mir
create mode 100644 llvm/test/CodeGen/AArch64/machine-sink-diamond-load.mir
create mode 100644 llvm/test/Transforms/CodeGenPrepare/AArch64/sink-diamond-load.ll
diff --git a/llvm/lib/CodeGen/CodeGenPrepare.cpp b/llvm/lib/CodeGen/CodeGenPrepare.cpp
index 74a0502d8cb7c..a2d861471347f 100644
--- a/llvm/lib/CodeGen/CodeGenPrepare.cpp
+++ b/llvm/lib/CodeGen/CodeGenPrepare.cpp
@@ -140,6 +140,32 @@ static cl::opt<bool> DisableBranchOpts(
"disable-cgp-branch-opts", cl::Hidden, cl::init(false),
cl::desc("Disable branch optimizations in CodeGenPrepare"));
+// A load that dominates a diamond and is consumed in more than one arm has had
+// its live range stretched across the conditional (typically by SimplifyCFG's
+// hoistCommonCodeFromSuccessors). The clone is dynamically neutral -- exactly
+// one arm runs per pass -- so its only cost is static code size. Sinking it
+// with duplication into each arm shortens the live range before ISel and lets
+// later passes (e.g. load-pair formation) treat each arm independently. Gated
+// off by default.
+static cl::opt<bool> EnableSinkDiamondLoads(
+ "cgp-sink-diamond-loads", cl::Hidden, cl::init(false),
+ cl::desc("Sink-with-duplication a load that dominates a diamond into its "
+ "using arms to shorten its live range across the branch"));
+
+static cl::opt<unsigned> SinkDiamondLoadsMaxFanout(
+ "cgp-sink-diamond-loads-max-fanout", cl::Hidden, cl::init(2),
+ cl::desc("Maximum number of arm clones for diamond-load sinking in CGP"));
+
+static cl::opt<unsigned> SinkDiamondLoadsChainDepth(
+ "cgp-sink-diamond-loads-chain", cl::Hidden, cl::init(4),
+ cl::desc("Sink a diamond load when its value feeds a dependency chain of "
+ "at least this depth in an arm (e.g. a reduction)"));
+
+static cl::opt<bool> ForceSinkDiamondLoads(
+ "cgp-sink-diamond-loads-force", cl::Hidden, cl::init(false),
+ cl::desc("Skip the profitability heuristics for CGP diamond-load sinking "
+ "(testing only); safety/legality checks still apply"));
+
static cl::opt<bool>
DisableGCOpts("disable-cgp-gc-opts", cl::Hidden, cl::init(false),
cl::desc("Disable GC optimizations in CodeGenPrepare"));
@@ -431,6 +457,7 @@ class CodeGenPrepare {
bool optimizeExt(Instruction *&I);
bool optimizeExtUses(Instruction *I);
bool optimizeLoadExt(LoadInst *Load);
+ bool sinkLoadAcrossDiamond(LoadInst *Load);
bool optimizeShiftInst(BinaryOperator *BO);
bool optimizeFunnelShift(IntrinsicInst *Fsh);
bool optimizeSelectInst(SelectInst *SI);
@@ -7394,6 +7421,110 @@ bool CodeGenPrepare::optimizeExtUses(Instruction *I) {
return MadeChange;
}
+// True if the value V feeds a dependency chain of at least MinDepth
+// instructions. Such a value sits early in a long computation (e.g. a
+// reduction), where pinning a register for it across the branch is most
+// costly. Explores all users level by level (bounded), so it is not fooled by
+// a reduction DAG whose nodes have several users.
+static bool feedsDeepChain(Value *V, unsigned MinDepth) {
+ SmallVector<Value *, 16> Frontier{V};
+ SmallPtrSet<Value *, 32> Visited{V};
+ auto IsChainStep = [](Instruction *UI) {
+ return UI && !UI->getType()->isVoidTy() && !UI->isTerminator() &&
+ !isa<PHINode>(UI);
+ };
+ for (unsigned Depth = 0; Depth < MinDepth; ++Depth) {
+ SmallVector<Value *, 16> Next;
+ for (Value *Cur : Frontier)
+ for (User *U : Cur->users())
+ if (auto *UI = dyn_cast<Instruction>(U); IsChainStep(UI))
+ if (Visited.insert(UI).second && Visited.size() < 512)
+ Next.push_back(UI);
+ if (Next.empty())
+ return false;
+ Frontier = std::move(Next);
+ }
+ return true;
+}
+
+// Sink a load that dominates a diamond by cloning it into each arm that
+// consumes its result, then erasing the original. This shortens the loaded
+// value's live range when SimplifyCFG hoisted a load common to the arms up into
+// their shared predecessor, stretching it across the conditional. The clone is
+// dynamically neutral (exactly one arm runs per pass), so its only cost is
+// static code size. Gated by -cgp-sink-diamond-loads.
+bool CodeGenPrepare::sinkLoadAcrossDiamond(LoadInst *Load) {
+ if (!EnableSinkDiamondLoads)
+ return false;
+
+ // Only plain loads: volatile/atomic loads have observable position.
+ if (!Load->isSimple())
+ return false;
+
+ BasicBlock *BB = Load->getParent();
+
+ // Collect the distinct arms that use the load. Require a use in more than one
+ // successor (a single-successor use is ordinary sinking), no use in BB
+ // itself, no PHI use (those want the value in a predecessor), and each using
+ // arm to be a direct successor of BB whose *only* predecessor is BB. That
+ // sole-predecessor condition makes the arms mutually exclusive (exactly one
+ // clone runs per pass) and means the only path from the load to a clone site
+ // is BB's tail -> arm, so memory safety only has to consider BB's tail.
+ SmallPtrSet<BasicBlock *, 4> SuccSet(succ_begin(BB), succ_end(BB));
+ SmallSetVector<BasicBlock *, 4> Arms;
+ for (User *U : Load->users()) {
+ auto *UI = dyn_cast<Instruction>(U);
+ if (!UI || isa<PHINode>(UI))
+ return false;
+ BasicBlock *UseBB = UI->getParent();
+ if (UseBB == BB || !SuccSet.contains(UseBB))
+ return false;
+ Arms.insert(UseBB);
+ }
+ if (Arms.size() < 2 || Arms.size() > SinkDiamondLoadsMaxFanout)
+ return false;
+ for (BasicBlock *Arm : Arms)
+ if (Arm->getUniquePredecessor() != BB || Arm->isEHPad())
+ return false;
+
+ // Memory safety: cloning moves the load later. Given sole-predecessor arms,
+ // the only instructions between the load and its clones are BB's tail; reject
+ // if any of them may write memory or otherwise has observable side effects.
+ // (Moving a pure load to execute later -- or not at all on some path -- is
+ // fine; only an intervening aliasing write would change its value.)
+ for (Instruction &I : make_range(std::next(Load->getIterator()), BB->end()))
+ if (I.mayWriteToMemory() || I.mayHaveSideEffects())
+ return false;
+
+ // Profitability. The clone is dynamically neutral, so the bar is low: fire
+ // when the load recurs in a loop and feeds a meaningful computation in an arm
+ // (a reduction-style chain), where the cross-branch live range hurts the
+ // schedule. -force skips this for testing.
+ if (!ForceSinkDiamondLoads) {
+ if (!LI->getLoopFor(BB))
+ return false;
+ if (!feedsDeepChain(Load, SinkDiamondLoadsChainDepth))
+ return false;
+ }
+
+ // Clone the load into each arm before the arm's first use, and rewrite that
+ // arm's uses to the local clone.
+ for (BasicBlock *Arm : Arms) {
+ Instruction *InsertPt = &*Arm->getFirstInsertionPt();
+ auto *Clone = cast<LoadInst>(Load->clone());
+ Clone->insertBefore(InsertPt->getIterator());
+ if (Load->hasName())
+ Clone->setName(Load->getName() + ".sunk");
+ Load->replaceUsesWithIf(Clone, [&](Use &U) {
+ auto *UI = cast<Instruction>(U.getUser());
+ return UI->getParent() == Arm;
+ });
+ }
+
+ Load->eraseFromParent();
+ return true;
+}
+
// Find loads whose uses only use some of the loaded value's bits. Add an "and"
// just after the load if the target can fold this into one extload instruction,
// with the hope of eliminating some of the other later "and" instructions using
@@ -8975,6 +9106,8 @@ bool CodeGenPrepare::optimizeInst(Instruction *I, ModifyDT &ModifiedDT) {
return true;
if (LoadInst *LI = dyn_cast<LoadInst>(I)) {
+ if (sinkLoadAcrossDiamond(LI))
+ return true; // LI has been erased.
LI->setMetadata(LLVMContext::MD_invariant_group, nullptr);
bool Modified = optimizeLoadExt(LI);
unsigned AS = LI->getPointerAddressSpace();
diff --git a/llvm/lib/CodeGen/MachineSink.cpp b/llvm/lib/CodeGen/MachineSink.cpp
index b34ebc3a80886..28d127e436e1f 100644
--- a/llvm/lib/CodeGen/MachineSink.cpp
+++ b/llvm/lib/CodeGen/MachineSink.cpp
@@ -113,7 +113,49 @@ static cl::opt<unsigned> SinkIntoCycleLimit(
"The maximum number of instructions considered for cycle sinking."),
cl::init(50), cl::Hidden);
+// When a value-producing load dominates a diamond and its result is consumed in
+// more than one successor, ordinary sinking cannot move it (there is no single
+// sink target). Hoisting such a load -- as SimplifyCFG's
+// hoistCommonCodeFromSuccessors does -- is dynamically neutral but stretches
+// the loaded value's live range across the conditional, which hurts under
+// register pressure. When enabled, sink-with-duplication clones the load into
+// each using successor and removes the dominating copy, shortening the live
+// range. Gated off by default.
+static cl::opt<bool> EnableDiamondLoadSink(
+ "machine-sink-diamond-loads", cl::Hidden, cl::init(false),
+ cl::desc("Sink-with-duplication a load that dominates a diamond into its "
+ "using successors to shorten its live range across the branch"));
+
+// A clone into mutually-exclusive successors is dynamically neutral -- exactly
+// one copy runs per pass -- so unlike ordinary sinking its only cost is static
+// code size. The profitability bar is therefore low: it fires when the clone
+// shortens a live range at all, cheaply, in a loop, without pushing any target
+// over its own pressure limit. These knobs tune that bar; -force skips it
+// entirely for testing the mechanics.
+static cl::opt<bool> ForceDiamondLoadSink(
+ "machine-sink-diamond-loads-force", cl::Hidden, cl::init(false),
+ cl::desc("Skip the profitability heuristics for diamond-load sinking "
+ "(testing only); safety/legality checks still apply"));
+
+static cl::opt<unsigned> DiamondLoadSinkMaxFanout(
+ "machine-sink-diamond-loads-max-fanout", cl::Hidden, cl::init(2),
+ cl::desc("Maximum number of successor clones for diamond-load sinking "
+ "(bounds static code growth)"));
+
+static cl::opt<unsigned> DiamondLoadSinkSpanThreshold(
+ "machine-sink-diamond-loads-span", cl::Hidden, cl::init(8),
+ cl::desc("Sink a diamond load when the value would otherwise stay live "
+ "across at least this many instructions (def's block tail plus "
+ "the depth of the first use in an arm)"));
+
+static cl::opt<unsigned> DiamondLoadSinkChainDepth(
+ "machine-sink-diamond-loads-chain", cl::Hidden, cl::init(4),
+ cl::desc("Sink a diamond load when its value feeds a dependency chain of "
+ "at least this depth in an arm (e.g. a reduction)"));
+
STATISTIC(NumSunk, "Number of machine instructions sunk");
+STATISTIC(NumDiamondLoadsSunk,
+ "Number of diamond loads sunk-with-duplication into successors");
STATISTIC(NumCycleSunk, "Number of machine instructions sunk into a cycle");
STATISTIC(NumSplit, "Number of critical edges split");
STATISTIC(NumCoalesces, "Number of copies coalesced");
@@ -245,6 +287,13 @@ class MachineSinking {
bool SinkInstruction(MachineInstr &MI, bool &SawStore,
AllSuccsCache &AllSuccessors);
+ /// Sink a load that dominates a diamond by cloning it into each successor
+ /// block that consumes its result, then erasing the original. This shortens
+ /// the loaded value's live range when it would otherwise span the
+ /// conditional. Returns true if the load was sunk. Gated by
+ /// -machine-sink-diamond-loads.
+ bool sinkDiamondLoadToUsers(MachineInstr &MI, MachineBasicBlock *MBB);
+
/// If we sink a COPY inst, some debug users of it's destination may no
/// longer be dominated by the COPY, and will eventually be dropped.
/// This is easily rectified by forwarding the non-dominated debug uses
@@ -1875,8 +1924,13 @@ bool MachineSinking::SinkInstruction(MachineInstr &MI, bool &SawStore,
FindSuccToSinkTo(MI, ParentBlock, BreakPHIEdge, AllSuccessors);
// If there are no outputs, it must have side-effects.
- if (!SuccToSinkTo)
- return false;
+ if (!SuccToSinkTo) {
+ // FindSuccToSinkTo bails when the def is consumed by more than one
+ // successor (no single sink target). For a load that dominates a diamond,
+ // we can instead clone it into each using successor to shorten its live
+ // range. This is a no-op for non-candidates.
+ return sinkDiamondLoadToUsers(MI, ParentBlock);
+ }
// If the instruction to move defines a dead physical register which is live
// when leaving the basic block, don't move it because it could turn into a
@@ -2001,6 +2055,204 @@ bool MachineSinking::SinkInstruction(MachineInstr &MI, bool &SawStore,
return true;
}
+// Count non-debug instructions strictly after MI to the end of its block.
+static unsigned numNonDebugInstrsAfter(MachineInstr &MI) {
+ unsigned N = 0;
+ for (MachineInstr &O : make_range(std::next(MachineBasicBlock::iterator(MI)),
+ MI.getParent()->end()))
+ if (!O.isDebugOrPseudoInstr())
+ ++N;
+ return N;
+}
+
+// Number of non-debug instructions from the top of S up to the first
+// instruction that reads Reg. Returns ~0u if Reg is not read in S.
+static unsigned firstUseDepthInBlock(Register Reg, MachineBasicBlock &S) {
+ unsigned N = 0;
+ for (MachineInstr &I : S) {
+ if (I.isDebugOrPseudoInstr())
+ continue;
+ if (I.readsRegister(Reg, /*TRI=*/nullptr))
+ return N;
+ ++N;
+ }
+ return ~0u;
+}
+
+// True if the value in Reg feeds a dependency chain of at least MinDepth
+// value-producing instructions. Such a value sits early in a long computation
+// (e.g. a reduction), so pinning a register for it across the branch is most
+// costly -- exactly where shortening the live range pays off. Follows the first
+// single-def virtual-register user at each step, bounded.
+static bool feedsDeepChain(Register Reg, const MachineRegisterInfo *MRI,
+ unsigned MinDepth) {
+ Register Cur = Reg;
+ for (unsigned Depth = 0; Depth < MinDepth + 1;) {
+ MachineInstr *Next = nullptr;
+ for (MachineInstr &U : MRI->use_nodbg_instructions(Cur)) {
+ if (U.getNumExplicitDefs() != 1)
+ continue;
+ const MachineOperand &D = *U.defs().begin();
+ if (D.isReg() && D.getReg().isVirtual() && !D.getSubReg()) {
+ Next = &U;
+ break;
+ }
+ }
+ if (!Next)
+ return false;
+ if (++Depth >= MinDepth)
+ return true;
+ Cur = Next->defs().begin()->getReg();
+ }
+ return false;
+}
+
+bool MachineSinking::sinkDiamondLoadToUsers(MachineInstr &MI,
+ MachineBasicBlock *MBB) {
+ if (!EnableDiamondLoadSink)
+ return false;
+
+ // Only consider plain, side-effect-free loads the target is willing to sink.
+ if (!MI.mayLoad() || MI.mayStore() || MI.hasOrderedMemoryRef() ||
+ MI.isCall() || MI.hasUnmodeledSideEffects() || MI.isConvergent())
+ return false;
+ if (!TII->shouldSink(MI))
+ return false;
+
+ // Require exactly one explicit def: a virtual register, not dead, no subreg,
+ // with a single definition. Physreg uses are only safe to duplicate if they
+ // cannot be redefined on the path (mirrors FindSuccToSinkTo).
+ Register DefReg;
+ for (const MachineOperand &MO : MI.operands()) {
+ if (!MO.isReg() || !MO.getReg())
+ continue;
+ if (MO.isDef()) {
+ if (DefReg.isValid() || !MO.getReg().isVirtual() || MO.isDead() ||
+ MO.getSubReg())
+ return false;
+ DefReg = MO.getReg();
+ } else if (MO.getReg().isPhysical()) {
+ if (!MRI->isConstantPhysReg(MO.getReg()) && !TII->isIgnorableUse(MO))
+ return false;
+ }
+ }
+ if (!DefReg.isValid() || !MRI->hasOneDef(DefReg))
+ return false;
+ const TargetRegisterClass *RC = MRI->getRegClass(DefReg);
+ if (!TII->isSafeToMoveRegClassDefs(RC))
+ return false;
+
+ // Collect the distinct successor blocks that use DefReg. Require a use in
+ // more than one successor (single-successor uses are handled by ordinary
+ // sinking), no PHI use (those want the value in a predecessor), no use in MBB
+ // itself, and every using block to be a direct successor of MBB.
+ SmallPtrSet<MachineBasicBlock *, 4> SuccSet(MBB->succ_begin(),
+ MBB->succ_end());
+ SmallSetVector<MachineBasicBlock *, 4> UsingBlocks;
+ for (MachineInstr &Use : MRI->use_nodbg_instructions(DefReg)) {
+ if (Use.isPHI())
+ return false;
+ MachineBasicBlock *UseBB = Use.getParent();
+ if (UseBB == MBB || !SuccSet.contains(UseBB))
+ return false;
+ UsingBlocks.insert(UseBB);
+ }
+ if (UsingBlocks.size() < 2 || UsingBlocks.size() > DiamondLoadSinkMaxFanout)
+ return false;
+
+ // Each using block must have MBB as its *only* predecessor. That single
+ // condition buys two things at once:
+ // * Dynamic neutrality: with no other entry, an arm is reached only by
+ // MBB's branch, so exactly one arm (hence one clone) runs per pass --
+ // never two. The arms are mutually exclusive by construction.
+ // * A trivial path for memory safety: the only path from the load to a
+ // clone site is MBB-tail -> arm-top (the direct edge), with no
+ // intervening block. So we need only check MBB's tail for aliasing
+ // stores, not the arms (whose stores all follow the clone we insert at
+ // the top).
+ // Also reject odd sink targets.
+ for (MachineBasicBlock *S : UsingBlocks) {
+ if (S->pred_size() != 1 || S->isEHPad() ||
+ S->isInlineAsmBrIndirectTarget())
+ return false;
+ if (MachineCycle *C = CI->getCycle(S))
+ if (!C->isReducible() || C->getHeader() == S)
+ return false;
+ }
+
+ // Memory safety: cloning moves the load later on each path. Given the
+ // single-predecessor arms above, the only instructions between the load and
+ // its clones are MBB's tail; reject if any aliasing store, call, or
+ // side-effecting instruction appears there.
+ auto IsBarrier = [&](MachineInstr &O) {
+ if (O.isCall() || O.hasUnmodeledSideEffects())
+ return true;
+ return O.mayStore() && O.mayAlias(AA, MI, /*UseTBAA=*/false);
+ };
+ for (MachineInstr &O :
+ make_range(std::next(MachineBasicBlock::iterator(MI)), MBB->end()))
+ if (IsBarrier(O))
+ return false;
+
+ // Profitability. The clone is dynamically neutral, so its only cost is static
+ // code size; the bar is correspondingly low and is NOT the spill-pressure
+ // limit ordinary sinking uses (the value need never spill for the live range
+ // to hurt scheduling). We fire when the load recurs in a loop and shortens a
+ // meaningful live range. We deliberately do NOT gate on the consuming arm's
+ // pressure: the value is already live-in there (it is used in the arm), so
+ // sinking does not raise the arm's peak -- it only removes the value's live
+ // range across the branch in MBB, which is the whole point.
+ if (!ForceDiamondLoadSink) {
+ // (C) Recurrence: only worth the static cost where it repeats.
+ if (!CI->getCycle(MBB))
+ return false;
+
+ // (D) Non-trivial shortening: the value must currently stay live across a
+ // meaningful span (def's block tail + depth of the first use in an arm),
+ // or feed a deep dependency chain (a reduction-style consumer that pins a
+ // register). Either signal suffices.
+ unsigned Span = numNonDebugInstrsAfter(MI);
+ bool Worthwhile = feedsDeepChain(DefReg, MRI, DiamondLoadSinkChainDepth);
+ for (MachineBasicBlock *S : UsingBlocks) {
+ unsigned UseDepth = firstUseDepthInBlock(DefReg, *S);
+ if (UseDepth != ~0u && Span + UseDepth >= DiamondLoadSinkSpanThreshold)
+ Worthwhile = true;
+ }
+ if (!Worthwhile)
+ return false;
+ }
+
+ // Bail before mutating if any clone site has prologue interference.
+ for (MachineBasicBlock *S : UsingBlocks) {
+ MachineBasicBlock::iterator InsertPos = S->SkipPHIsAndLabels(S->begin());
+ if (blockPrologueInterferes(S, InsertPos, MI, TRI, TII, MRI))
+ return false;
+ }
+
+ // Clone the load into each using successor and rewrite that block's uses.
+ for (MachineBasicBlock *S : UsingBlocks) {
+ MachineBasicBlock::iterator InsertPos = S->SkipPHIsAndLabels(S->begin());
+ Register NewReg = MRI->createVirtualRegister(RC);
+ MachineInstr *Clone = MI.getMF()->CloneMachineInstr(&MI);
+ Clone->substituteRegister(DefReg, NewReg, /*SubIdx=*/0, *TRI);
+ S->insert(InsertPos, Clone);
+
+ // Rewrite all uses (including debug) of DefReg within S to the local clone.
+ for (MachineOperand &MO :
+ llvm::make_early_inc_range(MRI->use_operands(DefReg)))
+ if (MO.getParent()->getParent() == S)
+ MO.setReg(NewReg);
+
+ LLVM_DEBUG(dbgs() << "Diamond-load clone into " << printMBBReference(*S)
+ << ": " << *Clone);
+ }
+
+ LLVM_DEBUG(dbgs() << "Erasing dominating diamond load: " << MI);
+ MI.eraseFromParent();
+ ++NumDiamondLoadsSunk;
+ return true;
+}
+
void MachineSinking::SalvageUnsunkDebugUsersOfCopy(
MachineInstr &MI, MachineBasicBlock *TargetBlock) {
assert(MI.isCopy());
diff --git a/llvm/test/CodeGen/AArch64/machine-sink-diamond-load-coexec.mir b/llvm/test/CodeGen/AArch64/machine-sink-diamond-load-coexec.mir
new file mode 100644
index 0000000000000..e043aa79c037d
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/machine-sink-diamond-load-coexec.mir
@@ -0,0 +1,51 @@
+# RUN: llc -mtriple=aarch64 -run-pass=machine-sink -machine-sink-diamond-loads \
+# RUN: -machine-sink-diamond-loads-force -o - %s | FileCheck %s
+
+# bb.2 falls through to bb.3, so the two using arms are NOT mutually exclusive:
+# on the path bb.1 -> bb.2 -> bb.3 both clones would execute, turning one
+# dynamic load into two. The transform must decline even with -force, because
+# mutual exclusivity is a correctness-of-neutrality requirement, not a
+# profitability heuristic. The single dominating load stays in bb.1.
+
+# CHECK-LABEL: name: coexec
+# CHECK: bb.1:
+# CHECK: {{%[0-9]+}}:fpr64 = LDRDui %0, 0
+# CHECK: bb.2:
+# CHECK-NOT: LDRDui
+# CHECK: bb.3:
+# CHECK-NOT: LDRDui
+
+--- |
+ define double @coexec(ptr %p, i32 %c) { ret double 0.000000e+00 }
+...
+---
+name: coexec
+tracksRegLiveness: true
+body: |
+ bb.0:
+ successors: %bb.1(0x80000000)
+ liveins: $x0, $w1
+ %0:gpr64sp = COPY $x0
+ %1:gpr32 = COPY $w1
+ B %bb.1
+
+ bb.1:
+ successors: %bb.2(0x40000000), %bb.3(0x40000000)
+ %2:fpr64 = LDRDui %0, 0 :: (load (s64))
+ CBZW %1, %bb.3
+ B %bb.2
+
+ bb.2:
+ successors: %bb.3(0x80000000)
+ %3:fpr64 = nofpexcept FADDDrr %2, %2, implicit $fpcr
+ B %bb.3
+
+ bb.3:
+ successors: %bb.4(0x80000000)
+ %4:fpr64 = nofpexcept FMULDrr %2, %2, implicit $fpcr
+ B %bb.4
+
+ bb.4:
+ $d0 = COPY %4
+ RET_ReallyLR implicit $d0
+...
diff --git a/llvm/test/CodeGen/AArch64/machine-sink-diamond-load.mir b/llvm/test/CodeGen/AArch64/machine-sink-diamond-load.mir
new file mode 100644
index 0000000000000..afd653da22769
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/machine-sink-diamond-load.mir
@@ -0,0 +1,64 @@
+# RUN: llc -mtriple=aarch64 -run-pass=machine-sink -o - %s | FileCheck %s --check-prefix=OFF
+# RUN: llc -mtriple=aarch64 -run-pass=machine-sink -machine-sink-diamond-loads \
+# RUN: -machine-sink-diamond-loads-force -o - %s | FileCheck %s --check-prefix=ON
+
+# A load in bb.1 whose result is consumed by both successors (bb.2 and bb.3)
+# cannot be moved by ordinary sinking (there is no single sink target). With
+# -machine-sink-diamond-loads it is cloned into each using successor and the
+# dominating copy is erased, shortening its live range across the branch. This
+# test exercises the mechanics with -...-force (profitability heuristics off).
+
+--- |
+ define double @diamond(ptr %p, i32 %c) { ret double 0.000000e+00 }
+...
+---
+name: diamond
+tracksRegLiveness: true
+body: |
+ ; OFF-LABEL: name: diamond
+ ; OFF: bb.1:
+ ; OFF: %2:fpr64 = LDRDui %0, 0
+ ; OFF: bb.2:
+ ; OFF-NOT: LDRDui
+ ; OFF: FADDDrr %2, %2
+ ; OFF: bb.3:
+ ; OFF-NOT: LDRDui
+ ; OFF: FMULDrr %2, %2
+ ;
+ ; ON-LABEL: name: diamond
+ ; ON: bb.1:
+ ; ON-NOT: LDRDui
+ ; ON: bb.2:
+ ; ON: [[L1:%[0-9]+]]:fpr64 = LDRDui %0, 0
+ ; ON: FADDDrr [[L1]], [[L1]]
+ ; ON: bb.3:
+ ; ON: [[L2:%[0-9]+]]:fpr64 = LDRDui %0, 0
+ ; ON: FMULDrr [[L2]], [[L2]]
+ bb.0:
+ successors: %bb.1(0x80000000)
+ liveins: $x0, $w1
+ %0:gpr64sp = COPY $x0
+ %1:gpr32 = COPY $w1
+ B %bb.1
+
+ bb.1:
+ successors: %bb.2(0x40000000), %bb.3(0x40000000)
+ %2:fpr64 = LDRDui %0, 0 :: (load (s64))
+ CBZW %1, %bb.3
+ B %bb.2
+
+ bb.2:
+ successors: %bb.4(0x80000000)
+ %3:fpr64 = nofpexcept FADDDrr %2, %2, implicit $fpcr
+ B %bb.4
+
+ bb.3:
+ successors: %bb.4(0x80000000)
+ %4:fpr64 = nofpexcept FMULDrr %2, %2, implicit $fpcr
+ B %bb.4
+
+ bb.4:
+ %5:fpr64 = PHI %3, %bb.2, %4, %bb.3
+ $d0 = COPY %5
+ RET_ReallyLR implicit $d0
+...
diff --git a/llvm/test/Transforms/CodeGenPrepare/AArch64/sink-diamond-load.ll b/llvm/test/Transforms/CodeGenPrepare/AArch64/sink-diamond-load.ll
new file mode 100644
index 0000000000000..3c940851fb4b8
--- /dev/null
+++ b/llvm/test/Transforms/CodeGenPrepare/AArch64/sink-diamond-load.ll
@@ -0,0 +1,68 @@
+; RUN: opt -passes='require<profile-summary>,function(codegenprepare)' -mtriple=aarch64 -S < %s \
+; RUN: | FileCheck %s --check-prefix=OFF
+; RUN: opt -passes='require<profile-summary>,function(codegenprepare)' -mtriple=aarch64 \
+; RUN: -cgp-sink-diamond-loads -cgp-sink-diamond-loads-force -S < %s \
+; RUN: | FileCheck %s --check-prefix=FORCE
+; RUN: opt -passes='require<profile-summary>,function(codegenprepare)' -mtriple=aarch64 \
+; RUN: -cgp-sink-diamond-loads -S < %s | FileCheck %s --check-prefix=HEUR
+
+; The load %v in for.body dominates a diamond and is consumed in both arms
+; (the store in %then and the reduction chain in %els). SimplifyCFG hoisted it
+; up here, stretching its live range across the branch. With
+; -cgp-sink-diamond-loads it is cloned into each arm and erased from for.body.
+; The default heuristic additionally requires a loop and a deep consumer chain
+; (present here); -force skips the heuristic.
+
+define double @reduce(ptr %p, i32 %n) {
+; OFF-LABEL: @reduce(
+; OFF: for.body:
+; OFF: load double, ptr %p
+; OFF: then:
+; OFF-NOT: load double
+; OFF: els:
+; OFF-NOT: load double
+;
+; FORCE-LABEL: @reduce(
+; FORCE: for.body:
+; FORCE-NOT: %v = load double
+; FORCE: then:
+; FORCE: load double, ptr %p
+; FORCE: els:
+; FORCE: load double, ptr %p
+;
+; HEUR-LABEL: @reduce(
+; HEUR: for.body:
+; HEUR-NOT: %v = load double
+; HEUR: then:
+; HEUR: load double, ptr %p
+; HEUR: els:
+; HEUR: load double, ptr %p
+entry:
+ br label %for.body
+
+for.body:
+ %i = phi i32 [ 0, %entry ], [ %i.next, %latch ]
+ %v = load double, ptr %p, align 8
+ %c = icmp eq i32 %i, 7
+ br i1 %c, label %then, label %els
+
+then:
+ store double %v, ptr %p, align 8
+ br label %latch
+
+els:
+ %a = fadd double %v, 1.000000e+00
+ %b = fadd double %a, 1.000000e+00
+ %d = fadd double %b, 1.000000e+00
+ %e = fadd double %d, 1.000000e+00
+ store double %e, ptr %p, align 8
+ br label %latch
+
+latch:
+ %i.next = add i32 %i, 1
+ %cmp = icmp slt i32 %i.next, %n
+ br i1 %cmp, label %for.body, label %exit
+
+exit:
+ ret double 0.000000e+00
+}
>From 8778aade25fcd292d6d21e139b40fd0a875797da Mon Sep 17 00:00:00 2001
From: Henry Jiang <[email protected]>
Date: Fri, 5 Jun 2026 22:07:16 -0700
Subject: [PATCH 2/2] [clang] No unique linkage name when asm label present
---
clang/lib/CodeGen/CodeGenModule.cpp | 6 ++++++
clang/test/CodeGen/unique-internal-linkage-names.c | 11 +++++++++++
clang/test/CodeGen/unique-internal-linkage-names.cpp | 8 ++++++++
3 files changed, 25 insertions(+)
diff --git a/clang/lib/CodeGen/CodeGenModule.cpp b/clang/lib/CodeGen/CodeGenModule.cpp
index 236738e9975d3..1c6ceafdb3c88 100644
--- a/clang/lib/CodeGen/CodeGenModule.cpp
+++ b/clang/lib/CodeGen/CodeGenModule.cpp
@@ -2257,7 +2257,13 @@ static void AppendCPUSpecificCPUDispatchMangling(const CodeGenModule &CGM,
static bool isUniqueInternalLinkageDecl(GlobalDecl GD,
CodeGenModule &CGM) {
const Decl *D = GD.getDecl();
+ // An explicit asm label takes full control of the final symbol name and may
+ // be referenced under that exact name from elsewhere (e.g. a separate
+ // declaration in this or another translation unit). Appending the unique
+ // suffix would rename the definition out from under those references, so
+ // leave asm-labeled decls alone.
return !CGM.getModuleNameHash().empty() && isa<FunctionDecl>(D) &&
+ !D->hasAttr<AsmLabelAttr>() &&
(CGM.getFunctionLinkage(GD) == llvm::GlobalValue::InternalLinkage);
}
diff --git a/clang/test/CodeGen/unique-internal-linkage-names.c b/clang/test/CodeGen/unique-internal-linkage-names.c
index 0fd5e516eec41..888de0e176b81 100644
--- a/clang/test/CodeGen/unique-internal-linkage-names.c
+++ b/clang/test/CodeGen/unique-internal-linkage-names.c
@@ -5,6 +5,15 @@
inline void overloaded_external() {}
extern void overloaded_external();
+// A prototyped static function gets a unique suffix...
+// CHECK: define internal i32 @_ZL7uniquedv.__uniq.{{[0-9]+}}(
+static int uniqued(void) { return 0; }
+
+// Check that a static function with asm label keeps its original name.
+// CHECK: define internal i32 @"\01custom_label"
+static int asm_label(void) asm("custom_label");
+static int asm_label(void) { return 0; }
+
// CHECK: define internal void @overloaded_internal() [[ATTR:#[0-9]+]] {
static void overloaded_internal() {}
extern void overloaded_internal();
@@ -12,6 +21,8 @@ extern void overloaded_internal();
void markUsed() {
overloaded_external();
overloaded_internal();
+ uniqued();
+ asm_label();
}
// CHECK: attributes [[ATTR]] =
diff --git a/clang/test/CodeGen/unique-internal-linkage-names.cpp b/clang/test/CodeGen/unique-internal-linkage-names.cpp
index e847cea9d273c..070a6eb4960d6 100644
--- a/clang/test/CodeGen/unique-internal-linkage-names.cpp
+++ b/clang/test/CodeGen/unique-internal-linkage-names.cpp
@@ -54,6 +54,11 @@ void test() {
A a;
}
+// Check that a static function with an asm label keeps its original name.
+static int asm_label() asm("custom_label");
+static int asm_label() { return 0; }
+int call_asm_label() { return asm_label(); }
+
// PLAIN: @_ZL4glob = internal global
// PLAIN: @_ZZ8retAnonMvE5fGlob = internal global
// PLAIN: @_ZN12_GLOBAL__N_16anon_mE = internal global
@@ -62,6 +67,7 @@ void test() {
// PLAIN: define internal ptr @_ZL4mverv.resolver()
// PLAIN: define internal void @_ZN12_GLOBAL__N_11AC1Ev
// PLAIN: define internal void @_ZN12_GLOBAL__N_11AD1Ev
+// PLAIN: define internal noundef i32 @custom_label()
// PLAIN: define internal noundef i32 @_ZL4mverv()
// PLAIN: define internal noundef i32 @_ZL4mverv.sse4.2()
// PLAIN-NOT: "sample-profile-suffix-elision-policy"
@@ -73,6 +79,8 @@ void test() {
// UNIQUE: define internal ptr @_ZL4mverv.[[MODHASH]].resolver()
// UNIQUE: define internal void @_ZN12_GLOBAL__N_11AC1Ev.__uniq.68358509610070717889884130747296293671
// UNIQUE: define internal void @_ZN12_GLOBAL__N_11AD1Ev.__uniq.68358509610070717889884130747296293671
+// An explicit asm label keeps the user-specified name with no unique suffix.
+// UNIQUE: define internal noundef i32 @custom_label()
// UNIQUE: define internal noundef i32 @_ZL4mverv.[[MODHASH]]()
// UNIQUE: define internal noundef i32 @_ZL4mverv.[[MODHASH]].sse4.2
// UNIQUE: attributes #[[#ATTR]] = { {{.*}}"sample-profile-suffix-elision-policy"{{.*}} }
_______________________________________________
cfe-commits mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits