https://github.com/mustartt created 
https://github.com/llvm/llvm-project/pull/202004

When `-funique-internal-linkage-names` is supplied, a unique suffix is appended
even to a static function that has an asm label, although it should not be: an
explicit asm label is meant to determine the exact symbol name.

```c
static int impl(int x) asm("__impl"); // emitted as __impl.__uniq.68358509610070717889884130747296293671
static int impl(int x) { return x + 1; }
```


>From b4bb7a5b02beb7812deee7cd10bd5e64075a936b Mon Sep 17 00:00:00 2001
From: Henry Jiang <[email protected]>
Date: Fri, 5 Jun 2026 10:18:20 -0700
Subject: [PATCH 1/2] wip: shorten live range?

---
 llvm/lib/CodeGen/CodeGenPrepare.cpp           | 133 +++++++++
 llvm/lib/CodeGen/MachineSink.cpp              | 256 +++++++++++++++++-
 .../machine-sink-diamond-load-coexec.mir      |  51 ++++
 .../AArch64/machine-sink-diamond-load.mir     |  64 +++++
 .../AArch64/sink-diamond-load.ll              |  68 +++++
 5 files changed, 570 insertions(+), 2 deletions(-)
 create mode 100644 
llvm/test/CodeGen/AArch64/machine-sink-diamond-load-coexec.mir
 create mode 100644 llvm/test/CodeGen/AArch64/machine-sink-diamond-load.mir
 create mode 100644 
llvm/test/Transforms/CodeGenPrepare/AArch64/sink-diamond-load.ll

diff --git a/llvm/lib/CodeGen/CodeGenPrepare.cpp 
b/llvm/lib/CodeGen/CodeGenPrepare.cpp
index 74a0502d8cb7c..a2d861471347f 100644
--- a/llvm/lib/CodeGen/CodeGenPrepare.cpp
+++ b/llvm/lib/CodeGen/CodeGenPrepare.cpp
@@ -140,6 +140,32 @@ static cl::opt<bool> DisableBranchOpts(
     "disable-cgp-branch-opts", cl::Hidden, cl::init(false),
     cl::desc("Disable branch optimizations in CodeGenPrepare"));
 
+// A load that dominates a diamond and is consumed in more than one arm has had
+// its live range stretched across the conditional (typically by SimplifyCFG's
+// hoistCommonCodeFromSuccessors). The clone is dynamically neutral -- exactly
+// one arm runs per pass -- so its only cost is static code size. Sinking it
+// with duplication into each arm shortens the live range before ISel and lets
+// later passes (e.g. load-pair formation) treat each arm independently. Gated
+// off by default.
+static cl::opt<bool> EnableSinkDiamondLoads(
+    "cgp-sink-diamond-loads", cl::Hidden, cl::init(false),
+    cl::desc("Sink-with-duplication a load that dominates a diamond into its "
+             "using arms to shorten its live range across the branch"));
+
+static cl::opt<unsigned> SinkDiamondLoadsMaxFanout(
+    "cgp-sink-diamond-loads-max-fanout", cl::Hidden, cl::init(2),
+    cl::desc("Maximum number of arm clones for diamond-load sinking in CGP"));
+
+static cl::opt<unsigned> SinkDiamondLoadsChainDepth(
+    "cgp-sink-diamond-loads-chain", cl::Hidden, cl::init(4),
+    cl::desc("Sink a diamond load when its value feeds a dependency chain of "
+             "at least this depth in an arm (e.g. a reduction)"));
+
+static cl::opt<bool> ForceSinkDiamondLoads(
+    "cgp-sink-diamond-loads-force", cl::Hidden, cl::init(false),
+    cl::desc("Skip the profitability heuristics for CGP diamond-load sinking "
+             "(testing only); safety/legality checks still apply"));
+
 static cl::opt<bool>
     DisableGCOpts("disable-cgp-gc-opts", cl::Hidden, cl::init(false),
                   cl::desc("Disable GC optimizations in CodeGenPrepare"));
@@ -431,6 +457,7 @@ class CodeGenPrepare {
   bool optimizeExt(Instruction *&I);
   bool optimizeExtUses(Instruction *I);
   bool optimizeLoadExt(LoadInst *Load);
+  bool sinkLoadAcrossDiamond(LoadInst *Load);
   bool optimizeShiftInst(BinaryOperator *BO);
   bool optimizeFunnelShift(IntrinsicInst *Fsh);
   bool optimizeSelectInst(SelectInst *SI);
@@ -7394,6 +7421,110 @@ bool CodeGenPrepare::optimizeExtUses(Instruction *I) {
   return MadeChange;
 }
 
+// True if the value V feeds a dependency chain of at least MinDepth
+// instructions. Such a value sits early in a long computation (e.g. a
+// reduction), where pinning a register for it across the branch is most
+// costly. Explores all users level by level (bounded), so it is not fooled by
+// a reduction DAG whose nodes have several users.
+static bool feedsDeepChain(Value *V, unsigned MinDepth) {
+  SmallVector<Value *, 16> Frontier{V};
+  SmallPtrSet<Value *, 32> Visited{V};
+  auto IsChainStep = [](Instruction *UI) {
+    return UI && !UI->getType()->isVoidTy() && !UI->isTerminator() &&
+           !isa<PHINode>(UI);
+  };
+  for (unsigned Depth = 0; Depth < MinDepth; ++Depth) {
+    SmallVector<Value *, 16> Next;
+    for (Value *Cur : Frontier)
+      for (User *U : Cur->users())
+        if (auto *UI = dyn_cast<Instruction>(U); IsChainStep(UI))
+          if (Visited.insert(UI).second && Visited.size() < 512)
+            Next.push_back(UI);
+    if (Next.empty())
+      return false;
+    Frontier = std::move(Next);
+  }
+  return true;
+}
+
+// Sink a load that dominates a diamond by cloning it into each arm that
+// consumes its result, then erasing the original. This shortens the loaded
+// value's live range when SimplifyCFG hoisted a load common to the arms up into
+// their shared predecessor, stretching it across the conditional. The clone is
+// dynamically neutral (exactly one arm runs per pass), so its only cost is
+// static code size. Gated by -cgp-sink-diamond-loads.
+bool CodeGenPrepare::sinkLoadAcrossDiamond(LoadInst *Load) {
+  if (!EnableSinkDiamondLoads)
+    return false;
+
+  // Only plain loads: volatile/atomic loads have observable position.
+  if (!Load->isSimple())
+    return false;
+
+  BasicBlock *BB = Load->getParent();
+
+  // Collect the distinct arms that use the load. Require a use in more than one
+  // successor (a single-successor use is ordinary sinking), no use in BB
+  // itself, no PHI use (those want the value in a predecessor), and each using
+  // arm to be a direct successor of BB whose *only* predecessor is BB. That
+  // sole-predecessor condition makes the arms mutually exclusive (exactly one
+  // clone runs per pass) and means the only path from the load to a clone site
+  // is BB's tail -> arm, so memory safety only has to consider BB's tail.
+  SmallPtrSet<BasicBlock *, 4> SuccSet(succ_begin(BB), succ_end(BB));
+  SmallSetVector<BasicBlock *, 4> Arms;
+  for (User *U : Load->users()) {
+    auto *UI = dyn_cast<Instruction>(U);
+    if (!UI || isa<PHINode>(UI))
+      return false;
+    BasicBlock *UseBB = UI->getParent();
+    if (UseBB == BB || !SuccSet.contains(UseBB))
+      return false;
+    Arms.insert(UseBB);
+  }
+  if (Arms.size() < 2 || Arms.size() > SinkDiamondLoadsMaxFanout)
+    return false;
+  for (BasicBlock *Arm : Arms)
+    if (Arm->getUniquePredecessor() != BB || Arm->isEHPad())
+      return false;
+
+  // Memory safety: cloning moves the load later. Given sole-predecessor arms,
+  // the only instructions between the load and its clones are BB's tail; reject
+  // if any of them may write memory or otherwise has observable side effects.
+  // (Moving a pure load to execute later -- or not at all on some path -- is
+  // fine; only an intervening aliasing write would change its value.)
+  for (Instruction &I : make_range(std::next(Load->getIterator()), BB->end()))
+    if (I.mayWriteToMemory() || I.mayHaveSideEffects())
+      return false;
+
+  // Profitability. The clone is dynamically neutral, so the bar is low: fire
+  // when the load recurs in a loop and feeds a meaningful computation in an arm
+  // (a reduction-style chain), where the cross-branch live range hurts the
+  // schedule. -force skips this for testing.
+  if (!ForceSinkDiamondLoads) {
+    if (!LI->getLoopFor(BB))
+      return false;
+    if (!feedsDeepChain(Load, SinkDiamondLoadsChainDepth))
+      return false;
+  }
+
+  // Clone the load at each arm's first insertion point, which precedes every
+  // (non-PHI) use in that arm, and rewrite the arm's uses to the local clone.
+  for (BasicBlock *Arm : Arms) {
+    Instruction *InsertPt = &*Arm->getFirstInsertionPt();
+    auto *Clone = cast<LoadInst>(Load->clone());
+    Clone->insertBefore(InsertPt->getIterator());
+    if (Load->hasName())
+      Clone->setName(Load->getName() + ".sunk");
+    Load->replaceUsesWithIf(Clone, [&](Use &U) {
+      auto *UI = cast<Instruction>(U.getUser());
+      return UI->getParent() == Arm;
+    });
+  }
+
+  Load->eraseFromParent();
+  return true;
+}
+
 // Find loads whose uses only use some of the loaded value's bits.  Add an 
"and"
 // just after the load if the target can fold this into one extload 
instruction,
 // with the hope of eliminating some of the other later "and" instructions 
using
@@ -8975,6 +9106,8 @@ bool CodeGenPrepare::optimizeInst(Instruction *I, 
ModifyDT &ModifiedDT) {
       return true;
 
   if (LoadInst *LI = dyn_cast<LoadInst>(I)) {
+    if (sinkLoadAcrossDiamond(LI))
+      return true; // LI has been erased.
     LI->setMetadata(LLVMContext::MD_invariant_group, nullptr);
     bool Modified = optimizeLoadExt(LI);
     unsigned AS = LI->getPointerAddressSpace();
diff --git a/llvm/lib/CodeGen/MachineSink.cpp b/llvm/lib/CodeGen/MachineSink.cpp
index b34ebc3a80886..28d127e436e1f 100644
--- a/llvm/lib/CodeGen/MachineSink.cpp
+++ b/llvm/lib/CodeGen/MachineSink.cpp
@@ -113,7 +113,49 @@ static cl::opt<unsigned> SinkIntoCycleLimit(
         "The maximum number of instructions considered for cycle sinking."),
     cl::init(50), cl::Hidden);
 
+// When a value-producing load dominates a diamond and its result is consumed in
+// more than one successor, ordinary sinking cannot move it (there is no single
+// sink target). Hoisting such a load -- as SimplifyCFG's
+// hoistCommonCodeFromSuccessors does -- is dynamically neutral but stretches
+// the loaded value's live range across the conditional, which hurts under
+// register pressure. When enabled, sink-with-duplication clones the load into
+// each using successor and removes the dominating copy, shortening the live
+// range. Gated off by default.
+static cl::opt<bool> EnableDiamondLoadSink(
+    "machine-sink-diamond-loads", cl::Hidden, cl::init(false),
+    cl::desc("Sink-with-duplication a load that dominates a diamond into its "
+             "using successors to shorten its live range across the branch"));
+
+// A clone into mutually-exclusive successors is dynamically neutral -- exactly
+// one copy runs per pass -- so unlike ordinary sinking its only cost is static
+// code size. The profitability bar is therefore low: it fires when the clone
+// shortens a live range at all, cheaply, in a loop, without pushing any target
+// over its own pressure limit. These knobs tune that bar; -force skips it
+// entirely for testing the mechanics.
+static cl::opt<bool> ForceDiamondLoadSink(
+    "machine-sink-diamond-loads-force", cl::Hidden, cl::init(false),
+    cl::desc("Skip the profitability heuristics for diamond-load sinking "
+             "(testing only); safety/legality checks still apply"));
+
+static cl::opt<unsigned> DiamondLoadSinkMaxFanout(
+    "machine-sink-diamond-loads-max-fanout", cl::Hidden, cl::init(2),
+    cl::desc("Maximum number of successor clones for diamond-load sinking "
+             "(bounds static code growth)"));
+
+static cl::opt<unsigned> DiamondLoadSinkSpanThreshold(
+    "machine-sink-diamond-loads-span", cl::Hidden, cl::init(8),
+    cl::desc("Sink a diamond load when the value would otherwise stay live "
+             "across at least this many instructions (def's block tail plus "
+             "the depth of the first use in an arm)"));
+
+static cl::opt<unsigned> DiamondLoadSinkChainDepth(
+    "machine-sink-diamond-loads-chain", cl::Hidden, cl::init(4),
+    cl::desc("Sink a diamond load when its value feeds a dependency chain of "
+             "at least this depth in an arm (e.g. a reduction)"));
+
 STATISTIC(NumSunk, "Number of machine instructions sunk");
+STATISTIC(NumDiamondLoadsSunk,
+          "Number of diamond loads sunk-with-duplication into successors");
 STATISTIC(NumCycleSunk, "Number of machine instructions sunk into a cycle");
 STATISTIC(NumSplit, "Number of critical edges split");
 STATISTIC(NumCoalesces, "Number of copies coalesced");
@@ -245,6 +287,13 @@ class MachineSinking {
   bool SinkInstruction(MachineInstr &MI, bool &SawStore,
                        AllSuccsCache &AllSuccessors);
 
+  /// Sink a load that dominates a diamond by cloning it into each successor
+  /// block that consumes its result, then erasing the original. This shortens
+  /// the loaded value's live range when it would otherwise span the
+  /// conditional. Returns true if the load was sunk. Gated by
+  /// -machine-sink-diamond-loads.
+  bool sinkDiamondLoadToUsers(MachineInstr &MI, MachineBasicBlock *MBB);
+
   /// If we sink a COPY inst, some debug users of it's destination may no
   /// longer be dominated by the COPY, and will eventually be dropped.
   /// This is easily rectified by forwarding the non-dominated debug uses
@@ -1875,8 +1924,13 @@ bool MachineSinking::SinkInstruction(MachineInstr &MI, 
bool &SawStore,
       FindSuccToSinkTo(MI, ParentBlock, BreakPHIEdge, AllSuccessors);
 
   // If there are no outputs, it must have side-effects.
-  if (!SuccToSinkTo)
-    return false;
+  if (!SuccToSinkTo) {
+    // FindSuccToSinkTo bails when the def is consumed by more than one
+    // successor (no single sink target). For a load that dominates a diamond,
+    // we can instead clone it into each using successor to shorten its live
+    // range. This is a no-op for non-candidates.
+    return sinkDiamondLoadToUsers(MI, ParentBlock);
+  }
 
   // If the instruction to move defines a dead physical register which is live
   // when leaving the basic block, don't move it because it could turn into a
@@ -2001,6 +2055,204 @@ bool MachineSinking::SinkInstruction(MachineInstr &MI, 
bool &SawStore,
   return true;
 }
 
+// Count non-debug instructions strictly after MI to the end of its block.
+static unsigned numNonDebugInstrsAfter(MachineInstr &MI) {
+  unsigned N = 0;
+  for (MachineInstr &O : make_range(std::next(MachineBasicBlock::iterator(MI)),
+                                    MI.getParent()->end()))
+    if (!O.isDebugOrPseudoInstr())
+      ++N;
+  return N;
+}
+
+// Number of non-debug instructions from the top of S up to the first
+// instruction that reads Reg. Returns ~0u if Reg is not read in S.
+static unsigned firstUseDepthInBlock(Register Reg, MachineBasicBlock &S) {
+  unsigned N = 0;
+  for (MachineInstr &I : S) {
+    if (I.isDebugOrPseudoInstr())
+      continue;
+    if (I.readsRegister(Reg, /*TRI=*/nullptr))
+      return N;
+    ++N;
+  }
+  return ~0u;
+}
+
+// True if the value in Reg feeds a dependency chain of at least MinDepth
+// value-producing instructions. Such a value sits early in a long computation
+// (e.g. a reduction), so pinning a register for it across the branch is most
+// costly -- exactly where shortening the live range pays off. Follows the first
+// single-def virtual-register user at each step, bounded.
+static bool feedsDeepChain(Register Reg, const MachineRegisterInfo *MRI,
+                           unsigned MinDepth) {
+  Register Cur = Reg;
+  for (unsigned Depth = 0; Depth < MinDepth + 1;) {
+    MachineInstr *Next = nullptr;
+    for (MachineInstr &U : MRI->use_nodbg_instructions(Cur)) {
+      if (U.getNumExplicitDefs() != 1)
+        continue;
+      const MachineOperand &D = *U.defs().begin();
+      if (D.isReg() && D.getReg().isVirtual() && !D.getSubReg()) {
+        Next = &U;
+        break;
+      }
+    }
+    if (!Next)
+      return false;
+    if (++Depth >= MinDepth)
+      return true;
+    Cur = Next->defs().begin()->getReg();
+  }
+  return false;
+}
+
+bool MachineSinking::sinkDiamondLoadToUsers(MachineInstr &MI,
+                                            MachineBasicBlock *MBB) {
+  if (!EnableDiamondLoadSink)
+    return false;
+
+  // Only consider plain, side-effect-free loads the target is willing to sink.
+  if (!MI.mayLoad() || MI.mayStore() || MI.hasOrderedMemoryRef() ||
+      MI.isCall() || MI.hasUnmodeledSideEffects() || MI.isConvergent())
+    return false;
+  if (!TII->shouldSink(MI))
+    return false;
+
+  // Require exactly one explicit def: a virtual register, not dead, no subreg,
+  // with a single definition. Physreg uses are only safe to duplicate if they
+  // cannot be redefined on the path (mirrors FindSuccToSinkTo).
+  Register DefReg;
+  for (const MachineOperand &MO : MI.operands()) {
+    if (!MO.isReg() || !MO.getReg())
+      continue;
+    if (MO.isDef()) {
+      if (DefReg.isValid() || !MO.getReg().isVirtual() || MO.isDead() ||
+          MO.getSubReg())
+        return false;
+      DefReg = MO.getReg();
+    } else if (MO.getReg().isPhysical()) {
+      if (!MRI->isConstantPhysReg(MO.getReg()) && !TII->isIgnorableUse(MO))
+        return false;
+    }
+  }
+  if (!DefReg.isValid() || !MRI->hasOneDef(DefReg))
+    return false;
+  const TargetRegisterClass *RC = MRI->getRegClass(DefReg);
+  if (!TII->isSafeToMoveRegClassDefs(RC))
+    return false;
+
+  // Collect the distinct successor blocks that use DefReg. Require a use in
+  // more than one successor (single-successor uses are handled by ordinary
+  // sinking), no PHI use (those want the value in a predecessor), no use in MBB
+  // itself, and every using block to be a direct successor of MBB.
+  SmallPtrSet<MachineBasicBlock *, 4> SuccSet(MBB->succ_begin(),
+                                              MBB->succ_end());
+  SmallSetVector<MachineBasicBlock *, 4> UsingBlocks;
+  for (MachineInstr &Use : MRI->use_nodbg_instructions(DefReg)) {
+    if (Use.isPHI())
+      return false;
+    MachineBasicBlock *UseBB = Use.getParent();
+    if (UseBB == MBB || !SuccSet.contains(UseBB))
+      return false;
+    UsingBlocks.insert(UseBB);
+  }
+  if (UsingBlocks.size() < 2 || UsingBlocks.size() > DiamondLoadSinkMaxFanout)
+    return false;
+
+  // Each using block must have MBB as its *only* predecessor. That single
+  // condition buys two things at once:
+  //   * Dynamic neutrality: with no other entry, an arm is reached only by
+  //     MBB's branch, so exactly one arm (hence one clone) runs per pass --
+  //     never two. The arms are mutually exclusive by construction.
+  //   * A trivial path for memory safety: the only path from the load to a
+  //     clone site is MBB-tail -> arm-top (the direct edge), with no
+  //     intervening block. So we need only check MBB's tail for aliasing
+  //     stores, not the arms (whose stores all follow the clone we insert at
+  //     the top).
+  // Also reject odd sink targets.
+  for (MachineBasicBlock *S : UsingBlocks) {
+    if (S->pred_size() != 1 || S->isEHPad() ||
+        S->isInlineAsmBrIndirectTarget())
+      return false;
+    if (MachineCycle *C = CI->getCycle(S))
+      if (!C->isReducible() || C->getHeader() == S)
+        return false;
+  }
+
+  // Memory safety: cloning moves the load later on each path. Given the
+  // single-predecessor arms above, the only instructions between the load and
+  // its clones are MBB's tail; reject if any aliasing store, call, or
+  // side-effecting instruction appears there.
+  auto IsBarrier = [&](MachineInstr &O) {
+    if (O.isCall() || O.hasUnmodeledSideEffects())
+      return true;
+    return O.mayStore() && O.mayAlias(AA, MI, /*UseTBAA=*/false);
+  };
+  for (MachineInstr &O :
+       make_range(std::next(MachineBasicBlock::iterator(MI)), MBB->end()))
+    if (IsBarrier(O))
+      return false;
+
+  // Profitability. The clone is dynamically neutral, so its only cost is static
+  // code size; the bar is correspondingly low and is NOT the spill-pressure
+  // limit ordinary sinking uses (the value need never spill for the live range
+  // to hurt scheduling). We fire when the load recurs in a loop and shortens a
+  // meaningful live range. We deliberately do NOT gate on the consuming arm's
+  // pressure: the value is already live-in there (it is used in the arm), so
+  // sinking does not raise the arm's peak -- it only removes the value's live
+  // range across the branch in MBB, which is the whole point.
+  if (!ForceDiamondLoadSink) {
+    // (C) Recurrence: only worth the static cost where it repeats.
+    if (!CI->getCycle(MBB))
+      return false;
+
+    // (D) Non-trivial shortening: the value must currently stay live across a
+    // meaningful span (def's block tail + depth of the first use in an arm),
+    // or feed a deep dependency chain (a reduction-style consumer that pins a
+    // register). Either signal suffices.
+    unsigned Span = numNonDebugInstrsAfter(MI);
+    bool Worthwhile = feedsDeepChain(DefReg, MRI, DiamondLoadSinkChainDepth);
+    for (MachineBasicBlock *S : UsingBlocks) {
+      unsigned UseDepth = firstUseDepthInBlock(DefReg, *S);
+      if (UseDepth != ~0u && Span + UseDepth >= DiamondLoadSinkSpanThreshold)
+        Worthwhile = true;
+    }
+    if (!Worthwhile)
+      return false;
+  }
+
+  // Bail before mutating if any clone site has prologue interference.
+  for (MachineBasicBlock *S : UsingBlocks) {
+    MachineBasicBlock::iterator InsertPos = S->SkipPHIsAndLabels(S->begin());
+    if (blockPrologueInterferes(S, InsertPos, MI, TRI, TII, MRI))
+      return false;
+  }
+
+  // Clone the load into each using successor and rewrite that block's uses.
+  for (MachineBasicBlock *S : UsingBlocks) {
+    MachineBasicBlock::iterator InsertPos = S->SkipPHIsAndLabels(S->begin());
+    Register NewReg = MRI->createVirtualRegister(RC);
+    MachineInstr *Clone = MI.getMF()->CloneMachineInstr(&MI);
+    Clone->substituteRegister(DefReg, NewReg, /*SubIdx=*/0, *TRI);
+    S->insert(InsertPos, Clone);
+
+    // Rewrite all uses (including debug) of DefReg within S to the local clone.
+    for (MachineOperand &MO :
+         llvm::make_early_inc_range(MRI->use_operands(DefReg)))
+      if (MO.getParent()->getParent() == S)
+        MO.setReg(NewReg);
+
+    LLVM_DEBUG(dbgs() << "Diamond-load clone into " << printMBBReference(*S)
+                      << ": " << *Clone);
+  }
+
+  LLVM_DEBUG(dbgs() << "Erasing dominating diamond load: " << MI);
+  MI.eraseFromParent();
+  ++NumDiamondLoadsSunk;
+  return true;
+}
+
 void MachineSinking::SalvageUnsunkDebugUsersOfCopy(
     MachineInstr &MI, MachineBasicBlock *TargetBlock) {
   assert(MI.isCopy());
diff --git a/llvm/test/CodeGen/AArch64/machine-sink-diamond-load-coexec.mir 
b/llvm/test/CodeGen/AArch64/machine-sink-diamond-load-coexec.mir
new file mode 100644
index 0000000000000..e043aa79c037d
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/machine-sink-diamond-load-coexec.mir
@@ -0,0 +1,51 @@
+# RUN: llc -mtriple=aarch64 -run-pass=machine-sink -machine-sink-diamond-loads 
\
+# RUN:   -machine-sink-diamond-loads-force -o - %s | FileCheck %s
+
+# bb.2 branches on to bb.3, so the two using arms are NOT mutually exclusive:
+# on the path bb.1 -> bb.2 -> bb.3 both clones would execute, turning one
+# dynamic load into two. The transform must decline even with -force, because
+# mutual exclusivity is a correctness-of-neutrality requirement, not a
+# profitability heuristic. The single dominating load stays in bb.1.
+
+# CHECK-LABEL: name: coexec
+# CHECK:     bb.1:
+# CHECK:       {{%[0-9]+}}:fpr64 = LDRDui %0, 0
+# CHECK:     bb.2:
+# CHECK-NOT:   LDRDui
+# CHECK:     bb.3:
+# CHECK-NOT:   LDRDui
+
+--- |
+  define double @coexec(ptr %p, i32 %c) { ret double 0.000000e+00 }
+...
+---
+name:            coexec
+tracksRegLiveness: true
+body:             |
+  bb.0:
+    successors: %bb.1(0x80000000)
+    liveins: $x0, $w1
+    %0:gpr64sp = COPY $x0
+    %1:gpr32 = COPY $w1
+    B %bb.1
+
+  bb.1:
+    successors: %bb.2(0x40000000), %bb.3(0x40000000)
+    %2:fpr64 = LDRDui %0, 0 :: (load (s64))
+    CBZW %1, %bb.3
+    B %bb.2
+
+  bb.2:
+    successors: %bb.3(0x80000000)
+    %3:fpr64 = nofpexcept FADDDrr %2, %2, implicit $fpcr
+    B %bb.3
+
+  bb.3:
+    successors: %bb.4(0x80000000)
+    %4:fpr64 = nofpexcept FMULDrr %2, %2, implicit $fpcr
+    B %bb.4
+
+  bb.4:
+    $d0 = COPY %4
+    RET_ReallyLR implicit $d0
+...
diff --git a/llvm/test/CodeGen/AArch64/machine-sink-diamond-load.mir 
b/llvm/test/CodeGen/AArch64/machine-sink-diamond-load.mir
new file mode 100644
index 0000000000000..afd653da22769
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/machine-sink-diamond-load.mir
@@ -0,0 +1,64 @@
+# RUN: llc -mtriple=aarch64 -run-pass=machine-sink -o - %s | FileCheck %s 
--check-prefix=OFF
+# RUN: llc -mtriple=aarch64 -run-pass=machine-sink -machine-sink-diamond-loads 
\
+# RUN:   -machine-sink-diamond-loads-force -o - %s | FileCheck %s 
--check-prefix=ON
+
+# A load in bb.1 whose result is consumed by both successors (bb.2 and bb.3)
+# cannot be moved by ordinary sinking (there is no single sink target). With
+# -machine-sink-diamond-loads it is cloned into each using successor and the
+# dominating copy is erased, shortening its live range across the branch. This
+# test exercises the mechanics with -...-force (profitability heuristics off).
+
+--- |
+  define double @diamond(ptr %p, i32 %c) { ret double 0.000000e+00 }
+...
+---
+name:            diamond
+tracksRegLiveness: true
+body:             |
+  ; OFF-LABEL: name: diamond
+  ; OFF:      bb.1:
+  ; OFF:        %2:fpr64 = LDRDui %0, 0
+  ; OFF:      bb.2:
+  ; OFF-NOT:     LDRDui
+  ; OFF:        FADDDrr %2, %2
+  ; OFF:      bb.3:
+  ; OFF-NOT:     LDRDui
+  ; OFF:        FMULDrr %2, %2
+  ;
+  ; ON-LABEL: name: diamond
+  ; ON:      bb.1:
+  ; ON-NOT:    LDRDui
+  ; ON:      bb.2:
+  ; ON:        [[L1:%[0-9]+]]:fpr64 = LDRDui %0, 0
+  ; ON:        FADDDrr [[L1]], [[L1]]
+  ; ON:      bb.3:
+  ; ON:        [[L2:%[0-9]+]]:fpr64 = LDRDui %0, 0
+  ; ON:        FMULDrr [[L2]], [[L2]]
+  bb.0:
+    successors: %bb.1(0x80000000)
+    liveins: $x0, $w1
+    %0:gpr64sp = COPY $x0
+    %1:gpr32 = COPY $w1
+    B %bb.1
+
+  bb.1:
+    successors: %bb.2(0x40000000), %bb.3(0x40000000)
+    %2:fpr64 = LDRDui %0, 0 :: (load (s64))
+    CBZW %1, %bb.3
+    B %bb.2
+
+  bb.2:
+    successors: %bb.4(0x80000000)
+    %3:fpr64 = nofpexcept FADDDrr %2, %2, implicit $fpcr
+    B %bb.4
+
+  bb.3:
+    successors: %bb.4(0x80000000)
+    %4:fpr64 = nofpexcept FMULDrr %2, %2, implicit $fpcr
+    B %bb.4
+
+  bb.4:
+    %5:fpr64 = PHI %3, %bb.2, %4, %bb.3
+    $d0 = COPY %5
+    RET_ReallyLR implicit $d0
+...
diff --git a/llvm/test/Transforms/CodeGenPrepare/AArch64/sink-diamond-load.ll 
b/llvm/test/Transforms/CodeGenPrepare/AArch64/sink-diamond-load.ll
new file mode 100644
index 0000000000000..3c940851fb4b8
--- /dev/null
+++ b/llvm/test/Transforms/CodeGenPrepare/AArch64/sink-diamond-load.ll
@@ -0,0 +1,68 @@
+; RUN: opt -passes='require<profile-summary>,function(codegenprepare)' 
-mtriple=aarch64 -S < %s \
+; RUN:   | FileCheck %s --check-prefix=OFF
+; RUN: opt -passes='require<profile-summary>,function(codegenprepare)' 
-mtriple=aarch64 \
+; RUN:   -cgp-sink-diamond-loads -cgp-sink-diamond-loads-force -S < %s \
+; RUN:   | FileCheck %s --check-prefix=FORCE
+; RUN: opt -passes='require<profile-summary>,function(codegenprepare)' 
-mtriple=aarch64 \
+; RUN:   -cgp-sink-diamond-loads -S < %s | FileCheck %s --check-prefix=HEUR
+
+; The load %v in for.body dominates a diamond and is consumed in both arms
+; (the store in %then and the reduction chain in %els). SimplifyCFG hoisted it
+; up here, stretching its live range across the branch. With
+; -cgp-sink-diamond-loads it is cloned into each arm and erased from for.body.
+; The default heuristic additionally requires a loop and a deep consumer chain
+; (present here); -force skips the heuristic.
+
+define double @reduce(ptr %p, i32 %n) {
+; OFF-LABEL: @reduce(
+; OFF:       for.body:
+; OFF:         load double, ptr %p
+; OFF:       then:
+; OFF-NOT:     load double
+; OFF:       els:
+; OFF-NOT:     load double
+;
+; FORCE-LABEL: @reduce(
+; FORCE:       for.body:
+; FORCE-NOT:     %v = load double
+; FORCE:       then:
+; FORCE:         load double, ptr %p
+; FORCE:       els:
+; FORCE:         load double, ptr %p
+;
+; HEUR-LABEL:  @reduce(
+; HEUR:        for.body:
+; HEUR-NOT:      %v = load double
+; HEUR:        then:
+; HEUR:          load double, ptr %p
+; HEUR:        els:
+; HEUR:          load double, ptr %p
+entry:
+  br label %for.body
+
+for.body:
+  %i = phi i32 [ 0, %entry ], [ %i.next, %latch ]
+  %v = load double, ptr %p, align 8
+  %c = icmp eq i32 %i, 7
+  br i1 %c, label %then, label %els
+
+then:
+  store double %v, ptr %p, align 8
+  br label %latch
+
+els:
+  %a = fadd double %v, 1.000000e+00
+  %b = fadd double %a, 1.000000e+00
+  %d = fadd double %b, 1.000000e+00
+  %e = fadd double %d, 1.000000e+00
+  store double %e, ptr %p, align 8
+  br label %latch
+
+latch:
+  %i.next = add i32 %i, 1
+  %cmp = icmp slt i32 %i.next, %n
+  br i1 %cmp, label %for.body, label %exit
+
+exit:
+  ret double 0.000000e+00
+}

>From 8778aade25fcd292d6d21e139b40fd0a875797da Mon Sep 17 00:00:00 2001
From: Henry Jiang <[email protected]>
Date: Fri, 5 Jun 2026 22:07:16 -0700
Subject: [PATCH 2/2] [clang] No unique linkage name when asm label present

---
 clang/lib/CodeGen/CodeGenModule.cpp                  |  6 ++++++
 clang/test/CodeGen/unique-internal-linkage-names.c   | 11 +++++++++++
 clang/test/CodeGen/unique-internal-linkage-names.cpp |  8 ++++++++
 3 files changed, 25 insertions(+)

diff --git a/clang/lib/CodeGen/CodeGenModule.cpp 
b/clang/lib/CodeGen/CodeGenModule.cpp
index 236738e9975d3..1c6ceafdb3c88 100644
--- a/clang/lib/CodeGen/CodeGenModule.cpp
+++ b/clang/lib/CodeGen/CodeGenModule.cpp
@@ -2257,7 +2257,13 @@ static void AppendCPUSpecificCPUDispatchMangling(const 
CodeGenModule &CGM,
 static bool isUniqueInternalLinkageDecl(GlobalDecl GD,
                                         CodeGenModule &CGM) {
   const Decl *D = GD.getDecl();
+  // An explicit asm label takes full control of the final symbol name and may
+  // be referenced under that exact name from elsewhere (e.g. a separate
+  // declaration in this or another translation unit). Appending the unique
+  // suffix would rename the definition out from under those references, so
+  // leave asm-labeled decls alone.
   return !CGM.getModuleNameHash().empty() && isa<FunctionDecl>(D) &&
+         !D->hasAttr<AsmLabelAttr>() &&
          (CGM.getFunctionLinkage(GD) == llvm::GlobalValue::InternalLinkage);
 }
 
diff --git a/clang/test/CodeGen/unique-internal-linkage-names.c 
b/clang/test/CodeGen/unique-internal-linkage-names.c
index 0fd5e516eec41..888de0e176b81 100644
--- a/clang/test/CodeGen/unique-internal-linkage-names.c
+++ b/clang/test/CodeGen/unique-internal-linkage-names.c
@@ -5,6 +5,15 @@
 inline void overloaded_external() {}
 extern void overloaded_external();
 
+// A prototyped static function gets a unique suffix...
+// CHECK: define internal i32 @_ZL7uniquedv.__uniq.{{[0-9]+}}(
+static int uniqued(void) { return 0; }
+
+// Check that a static function with asm label keeps its original name.
+// CHECK: define internal i32 @"\01custom_label"
+static int asm_label(void) asm("custom_label");
+static int asm_label(void) { return 0; }
+
 // CHECK: define internal void @overloaded_internal() [[ATTR:#[0-9]+]] {
 static void overloaded_internal() {}
 extern void overloaded_internal();
@@ -12,6 +21,8 @@ extern void overloaded_internal();
 void markUsed() {
   overloaded_external();
   overloaded_internal();
+  uniqued();
+  asm_label();
 }
 
 // CHECK: attributes [[ATTR]] =
diff --git a/clang/test/CodeGen/unique-internal-linkage-names.cpp 
b/clang/test/CodeGen/unique-internal-linkage-names.cpp
index e847cea9d273c..070a6eb4960d6 100644
--- a/clang/test/CodeGen/unique-internal-linkage-names.cpp
+++ b/clang/test/CodeGen/unique-internal-linkage-names.cpp
@@ -54,6 +54,11 @@ void test() {
   A a;
 }
 
+// Check that a static function with an asm label keeps its original name.
+static int asm_label() asm("custom_label");
+static int asm_label() { return 0; }
+int call_asm_label() { return asm_label(); }
+
 // PLAIN: @_ZL4glob = internal global
 // PLAIN: @_ZZ8retAnonMvE5fGlob = internal global
 // PLAIN: @_ZN12_GLOBAL__N_16anon_mE = internal global
@@ -62,6 +67,7 @@ void test() {
 // PLAIN: define internal ptr @_ZL4mverv.resolver()
 // PLAIN: define internal void @_ZN12_GLOBAL__N_11AC1Ev
 // PLAIN: define internal void @_ZN12_GLOBAL__N_11AD1Ev
+// PLAIN: define internal noundef i32 @custom_label()
 // PLAIN: define internal noundef i32 @_ZL4mverv()
 // PLAIN: define internal noundef i32 @_ZL4mverv.sse4.2()
 // PLAIN-NOT: "sample-profile-suffix-elision-policy"
@@ -73,6 +79,8 @@ void test() {
 // UNIQUE: define internal ptr @_ZL4mverv.[[MODHASH]].resolver()
 // UNIQUE: define internal void 
@_ZN12_GLOBAL__N_11AC1Ev.__uniq.68358509610070717889884130747296293671
 // UNIQUE: define internal void 
@_ZN12_GLOBAL__N_11AD1Ev.__uniq.68358509610070717889884130747296293671
+// An explicit asm label keeps the user-specified name with no unique suffix.
+// UNIQUE: define internal noundef i32 @custom_label()
 // UNIQUE: define internal noundef i32 @_ZL4mverv.[[MODHASH]]()
 // UNIQUE: define internal noundef i32 @_ZL4mverv.[[MODHASH]].sse4.2
 // UNIQUE: attributes #[[#ATTR]] = { 
{{.*}}"sample-profile-suffix-elision-policy"{{.*}} }

_______________________________________________
cfe-commits mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits

Reply via email to