Author: Benjamin Maxwell
Date: 2026-02-04T09:30:44Z
New Revision: e4328b80992c2f63d58cc19ec70a33a62cde67a8

URL: 
https://github.com/llvm/llvm-project/commit/e4328b80992c2f63d58cc19ec70a33a62cde67a8
DIFF: 
https://github.com/llvm/llvm-project/commit/e4328b80992c2f63d58cc19ec70a33a62cde67a8.diff

LOG: [AArch64][SME] Limit where SME ABI optimizations apply (#179273)

These were added recently with a fairly complex propagation step;
however, these optimizations can cause regressions in some cases.

This patch limits the cross-block optimizations to the simple case of
picking a state that matches all incoming blocks. If any block doesn't
match, we fall back to using "ACTIVE", the default state.

Added: 
    

Modified: 
    llvm/lib/Target/AArch64/MachineSMEABIPass.cpp
    llvm/test/CodeGen/AArch64/sme-agnostic-za.ll
    llvm/test/CodeGen/AArch64/sme-new-za-function.ll
    llvm/test/CodeGen/AArch64/sme-za-control-flow.ll
    llvm/test/CodeGen/AArch64/sme-za-exceptions.ll
    llvm/test/CodeGen/AArch64/sme-za-lazy-save-buffer.ll

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Target/AArch64/MachineSMEABIPass.cpp 
b/llvm/lib/Target/AArch64/MachineSMEABIPass.cpp
index 823c754a0ac05..9b96bed823817 100644
--- a/llvm/lib/Target/AArch64/MachineSMEABIPass.cpp
+++ b/llvm/lib/Target/AArch64/MachineSMEABIPass.cpp
@@ -201,23 +201,6 @@ class EmitContext {
   Register AgnosticZABufferPtr = AArch64::NoRegister;
 };
 
-/// Checks if \p State is a legal edge bundle state. For a state to be a legal
-/// bundle state, it must be possible to transition from it to any other bundle
-/// state without losing any ZA state. This is the case for ACTIVE/LOCAL_SAVED,
-/// as you can transition between those states by saving/restoring ZA. The OFF
-/// state would not be legal, as transitioning to it drops the content of ZA.
-static bool isLegalEdgeBundleZAState(ZAState State) {
-  switch (State) {
-  case ZAState::ACTIVE:           // ZA state within the accumulator/ZT0.
-  case ZAState::ACTIVE_ZT0_SAVED: // ZT0 is saved (ZA is active).
-  case ZAState::LOCAL_SAVED:      // ZA state may be saved on the stack.
-  case ZAState::LOCAL_COMMITTED:  // ZA state is saved on the stack.
-    return true;
-  default:
-    return false;
-  }
-}
-
 StringRef getZAStateString(ZAState State) {
 #define MAKE_CASE(V)                                                           
\
   case V:                                                                      
\
@@ -325,11 +308,6 @@ struct MachineSMEABI : public MachineFunctionPass {
                           const EdgeBundles &Bundles,
                           ArrayRef<ZAState> BundleStates);
 
-  /// Propagates desired states forwards (from predecessors -> successors) if
-  /// \p Forwards, otherwise, propagates backwards (from successors ->
-  /// predecessors).
-  void propagateDesiredStates(FunctionInfo &FnInfo, bool Forwards = true);
-
   void emitZT0SaveRestore(EmitContext &, MachineBasicBlock &MBB,
                           MachineBasicBlock::iterator MBBI, bool IsSave);
 
@@ -526,110 +504,36 @@ FunctionInfo 
MachineSMEABI::collectNeededZAStates(SMEAttrs SMEFnAttrs) {
                       PhysLiveRegsAfterSMEPrologue};
 }
 
-void MachineSMEABI::propagateDesiredStates(FunctionInfo &FnInfo,
-                                           bool Forwards) {
-  // If `Forwards`, this propagates desired states from predecessors to
-  // successors, otherwise, this propagates states from successors to
-  // predecessors.
-  auto GetBlockState = [](BlockInfo &Block, bool Incoming) -> ZAState & {
-    return Incoming ? Block.DesiredIncomingState : Block.DesiredOutgoingState;
-  };
-
-  SmallVector<MachineBasicBlock *> Worklist;
-  for (auto [BlockID, BlockInfo] : enumerate(FnInfo.Blocks)) {
-    if (!isLegalEdgeBundleZAState(GetBlockState(BlockInfo, Forwards)))
-      Worklist.push_back(MF->getBlockNumbered(BlockID));
-  }
-
-  while (!Worklist.empty()) {
-    MachineBasicBlock *MBB = Worklist.pop_back_val();
-    BlockInfo &Block = FnInfo.Blocks[MBB->getNumber()];
-
-    // Pick a legal edge bundle state that matches the majority of
-    // predecessors/successors.
-    int StateCounts[ZAState::NUM_ZA_STATE] = {0};
-    for (MachineBasicBlock *PredOrSucc :
-         Forwards ? predecessors(MBB) : successors(MBB)) {
-      BlockInfo &PredOrSuccBlock = FnInfo.Blocks[PredOrSucc->getNumber()];
-      ZAState ZAState = GetBlockState(PredOrSuccBlock, !Forwards);
-      if (isLegalEdgeBundleZAState(ZAState))
-        StateCounts[ZAState]++;
-    }
-
-    ZAState PropagatedState = ZAState(max_element(StateCounts) - StateCounts);
-    ZAState &CurrentState = GetBlockState(Block, Forwards);
-    if (PropagatedState != CurrentState) {
-      CurrentState = PropagatedState;
-      ZAState &OtherState = GetBlockState(Block, !Forwards);
-      // Propagate to the incoming/outgoing state if that is also "ANY".
-      if (OtherState == ZAState::ANY)
-        OtherState = PropagatedState;
-      // Push any successors/predecessors that may need updating to the
-      // worklist.
-      for (MachineBasicBlock *SuccOrPred :
-           Forwards ? successors(MBB) : predecessors(MBB)) {
-        BlockInfo &SuccOrPredBlock = FnInfo.Blocks[SuccOrPred->getNumber()];
-        if (!isLegalEdgeBundleZAState(GetBlockState(SuccOrPredBlock, 
Forwards)))
-          Worklist.push_back(SuccOrPred);
-      }
-    }
-  }
-}
-
 /// Assigns each edge bundle a ZA state based on the needed states of blocks
-/// that have incoming or outgoing edges in that bundle.
+/// that have incoming or outgoing blocks in that bundle.
 SmallVector<ZAState>
 MachineSMEABI::assignBundleZAStates(const EdgeBundles &Bundles,
                                     const FunctionInfo &FnInfo) {
   SmallVector<ZAState> BundleStates(Bundles.getNumBundles());
   for (unsigned I = 0, E = Bundles.getNumBundles(); I != E; ++I) {
-    LLVM_DEBUG(dbgs() << "Assigning ZA state for edge bundle: " << I << '\n');
-
-    // Attempt to assign a ZA state for this bundle that minimizes state
-    // transitions. Edges within loops are given a higher weight as we assume
-    // they will be executed more than once.
-    int EdgeStateCounts[ZAState::NUM_ZA_STATE] = {0};
+    std::optional<ZAState> BundleState;
     for (unsigned BlockID : Bundles.getBlocks(I)) {
-      LLVM_DEBUG(dbgs() << "- bb." << BlockID);
-
       const BlockInfo &Block = FnInfo.Blocks[BlockID];
-      bool InEdge = Bundles.getBundle(BlockID, /*Out=*/false) == I;
-      bool OutEdge = Bundles.getBundle(BlockID, /*Out=*/true) == I;
-
-      bool LegalInEdge =
-          InEdge && isLegalEdgeBundleZAState(Block.DesiredIncomingState);
-      bool LegalOutEgde =
-          OutEdge && isLegalEdgeBundleZAState(Block.DesiredOutgoingState);
-      if (LegalInEdge) {
-        LLVM_DEBUG(dbgs() << " DesiredIncomingState: "
-                          << getZAStateString(Block.DesiredIncomingState));
-        EdgeStateCounts[Block.DesiredIncomingState]++;
-      }
-      if (LegalOutEgde) {
-        LLVM_DEBUG(dbgs() << " DesiredOutgoingState: "
-                          << getZAStateString(Block.DesiredOutgoingState));
-        EdgeStateCounts[Block.DesiredOutgoingState]++;
-      }
-      if (!LegalInEdge && !LegalOutEgde)
-        LLVM_DEBUG(dbgs() << " (no state preference)");
-      LLVM_DEBUG(dbgs() << '\n');
+      // Check if the block is an incoming block in the bundle. Note: We skip
+      // Block.FixedEntryState != ANY to ignore EH pads (which are only
+      // reachable via exceptions).
+      if (Block.FixedEntryState != ZAState::ANY ||
+          Bundles.getBundle(BlockID, /*Out=*/false) != I)
+        continue;
+
+      // Pick a state that matches all incoming blocks. Fallback to "ACTIVE" if
+      // any blocks doesn't match. This will hoist the state from incoming
+      // blocks to outgoing blocks.
+      if (!BundleState)
+        BundleState = Block.DesiredIncomingState;
+      else if (BundleState != Block.DesiredIncomingState)
+        BundleState = ZAState::ACTIVE;
     }
 
-    ZAState BundleState =
-        ZAState(max_element(EdgeStateCounts) - EdgeStateCounts);
-
-    if (BundleState == ZAState::ANY)
+    if (!BundleState || BundleState == ZAState::ANY)
       BundleState = ZAState::ACTIVE;
 
-    LLVM_DEBUG({
-      dbgs() << "Chosen ZA state: " << getZAStateString(BundleState) << '\n'
-             << "Edge counts:";
-      for (auto [State, Count] : enumerate(EdgeStateCounts))
-        dbgs() << " " << getZAStateString(ZAState(State)) << ": " << Count;
-      dbgs() << "\n\n";
-    });
-
-    BundleStates[I] = BundleState;
+    BundleStates[I] = *BundleState;
   }
 
   return BundleStates;
@@ -1268,42 +1172,6 @@ bool MachineSMEABI::runOnMachineFunction(MachineFunction 
&MF) {
 
   FunctionInfo FnInfo = collectNeededZAStates(SMEFnAttrs);
 
-  if (OptLevel != CodeGenOptLevel::None) {
-    // Propagate desired states forward, then backwards. Most of the 
propagation
-    // should be done in the forward step, and backwards propagation is then
-    // used to fill in the gaps. Note: Doing both in one step can give poor
-    // results. For example, consider this subgraph:
-    //
-    //    ┌─────┐
-    //  ┌─┤ BB0 ◄───┐
-    //  │ └─┬───┘   │
-    //  │ ┌─▼───◄──┐│
-    //  │ │ BB1 │  ││
-    //  │ └─┬┬──┘  ││
-    //  │   │└─────┘│
-    //  │ ┌─▼───┐   │
-    //  │ │ BB2 ├───┘
-    //  │ └─┬───┘
-    //  │ ┌─▼───┐
-    //  └─► BB3 │
-    //    └─────┘
-    //
-    // If:
-    // - "BB0" and "BB2" (outer loop) has no state preference
-    // - "BB1" (inner loop) desires the ACTIVE state on entry/exit
-    // - "BB3" desires the LOCAL_SAVED state on entry
-    //
-    // If we propagate forwards first, ACTIVE is propagated from BB1 to BB2,
-    // then from BB2 to BB0. Which results in the inner and outer loops having
-    // the "ACTIVE" state. This avoids any state changes in the loops.
-    //
-    // If we propagate backwards first, we _could_ propagate LOCAL_SAVED from
-    // BB3 to BB0, which would result in a transition from ACTIVE -> 
LOCAL_SAVED
-    // in the outer loop.
-    for (bool Forwards : {true, false})
-      propagateDesiredStates(FnInfo, Forwards);
-  }
-
   SmallVector<ZAState> BundleStates = assignBundleZAStates(Bundles, FnInfo);
 
   EmitContext Context;

diff  --git a/llvm/test/CodeGen/AArch64/sme-agnostic-za.ll 
b/llvm/test/CodeGen/AArch64/sme-agnostic-za.ll
index 344f1ef24b843..4a18b9f61d69f 100644
--- a/llvm/test/CodeGen/AArch64/sme-agnostic-za.ll
+++ b/llvm/test/CodeGen/AArch64/sme-agnostic-za.ll
@@ -352,61 +352,33 @@ define i64  @test_many_callee_arguments(
 }
 
 define void @agnostic_za_buffer_alloc_with_stack_probes() nounwind 
"aarch64_za_state_agnostic" "probe-stack"="inline-asm" 
"stack-probe-size"="65536"{
-; CHECK-SDAG-LABEL: agnostic_za_buffer_alloc_with_stack_probes:
-; CHECK-SDAG:       // %bb.0:
-; CHECK-SDAG-NEXT:    stp x29, x30, [sp, #-32]! // 16-byte Folded Spill
-; CHECK-SDAG-NEXT:    str x19, [sp, #16] // 8-byte Spill
-; CHECK-SDAG-NEXT:    mov x29, sp
-; CHECK-SDAG-NEXT:    bl __arm_sme_state_size
-; CHECK-SDAG-NEXT:    mov x8, sp
-; CHECK-SDAG-NEXT:    sub x19, x8, x0
-; CHECK-SDAG-NEXT:  .LBB7_1: // =>This Inner Loop Header: Depth=1
-; CHECK-SDAG-NEXT:    sub sp, sp, #16, lsl #12 // =65536
-; CHECK-SDAG-NEXT:    cmp sp, x19
-; CHECK-SDAG-NEXT:    b.le .LBB7_3
-; CHECK-SDAG-NEXT:  // %bb.2: // in Loop: Header=BB7_1 Depth=1
-; CHECK-SDAG-NEXT:    str xzr, [sp]
-; CHECK-SDAG-NEXT:    b .LBB7_1
-; CHECK-SDAG-NEXT:  .LBB7_3:
-; CHECK-SDAG-NEXT:    mov sp, x19
-; CHECK-SDAG-NEXT:    ldr xzr, [sp]
-; CHECK-SDAG-NEXT:    mov x0, x19
-; CHECK-SDAG-NEXT:    bl __arm_sme_save
-; CHECK-SDAG-NEXT:    bl private_za
-; CHECK-SDAG-NEXT:    mov x0, x19
-; CHECK-SDAG-NEXT:    bl __arm_sme_restore
-; CHECK-SDAG-NEXT:    mov sp, x29
-; CHECK-SDAG-NEXT:    ldr x19, [sp, #16] // 8-byte Reload
-; CHECK-SDAG-NEXT:    ldp x29, x30, [sp], #32 // 16-byte Folded Reload
-; CHECK-SDAG-NEXT:    ret
-;
-; CHECK-LABEL: agnostic_za_buffer_alloc_with_stack_probes:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    stp x29, x30, [sp, #-32]! // 16-byte Folded Spill
-; CHECK-NEXT:    str x19, [sp, #16] // 8-byte Spill
-; CHECK-NEXT:    mov x29, sp
-; CHECK-NEXT:    bl __arm_sme_state_size
-; CHECK-NEXT:    mov x8, sp
-; CHECK-NEXT:    sub x19, x8, x0
-; CHECK-NEXT:    mov x0, x19
-; CHECK-NEXT:    bl __arm_sme_save
-; CHECK-NEXT:  .LBB7_1: // =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    sub sp, sp, #16, lsl #12 // =65536
-; CHECK-NEXT:    cmp sp, x19
-; CHECK-NEXT:    b.le .LBB7_3
-; CHECK-NEXT:  // %bb.2: // in Loop: Header=BB7_1 Depth=1
-; CHECK-NEXT:    str xzr, [sp]
-; CHECK-NEXT:    b .LBB7_1
-; CHECK-NEXT:  .LBB7_3:
-; CHECK-NEXT:    mov sp, x19
-; CHECK-NEXT:    ldr xzr, [sp]
-; CHECK-NEXT:    bl private_za
-; CHECK-NEXT:    mov x0, x19
-; CHECK-NEXT:    bl __arm_sme_restore
-; CHECK-NEXT:    mov sp, x29
-; CHECK-NEXT:    ldr x19, [sp, #16] // 8-byte Reload
-; CHECK-NEXT:    ldp x29, x30, [sp], #32 // 16-byte Folded Reload
-; CHECK-NEXT:    ret
+; CHECK-COMMON-LABEL: agnostic_za_buffer_alloc_with_stack_probes:
+; CHECK-COMMON:       // %bb.0:
+; CHECK-COMMON-NEXT:    stp x29, x30, [sp, #-32]! // 16-byte Folded Spill
+; CHECK-COMMON-NEXT:    str x19, [sp, #16] // 8-byte Spill
+; CHECK-COMMON-NEXT:    mov x29, sp
+; CHECK-COMMON-NEXT:    bl __arm_sme_state_size
+; CHECK-COMMON-NEXT:    mov x8, sp
+; CHECK-COMMON-NEXT:    sub x19, x8, x0
+; CHECK-COMMON-NEXT:  .LBB7_1: // =>This Inner Loop Header: Depth=1
+; CHECK-COMMON-NEXT:    sub sp, sp, #16, lsl #12 // =65536
+; CHECK-COMMON-NEXT:    cmp sp, x19
+; CHECK-COMMON-NEXT:    b.le .LBB7_3
+; CHECK-COMMON-NEXT:  // %bb.2: // in Loop: Header=BB7_1 Depth=1
+; CHECK-COMMON-NEXT:    str xzr, [sp]
+; CHECK-COMMON-NEXT:    b .LBB7_1
+; CHECK-COMMON-NEXT:  .LBB7_3:
+; CHECK-COMMON-NEXT:    mov sp, x19
+; CHECK-COMMON-NEXT:    ldr xzr, [sp]
+; CHECK-COMMON-NEXT:    mov x0, x19
+; CHECK-COMMON-NEXT:    bl __arm_sme_save
+; CHECK-COMMON-NEXT:    bl private_za
+; CHECK-COMMON-NEXT:    mov x0, x19
+; CHECK-COMMON-NEXT:    bl __arm_sme_restore
+; CHECK-COMMON-NEXT:    mov sp, x29
+; CHECK-COMMON-NEXT:    ldr x19, [sp, #16] // 8-byte Reload
+; CHECK-COMMON-NEXT:    ldp x29, x30, [sp], #32 // 16-byte Folded Reload
+; CHECK-COMMON-NEXT:    ret
   call void @private_za()
   ret void
 }

diff  --git a/llvm/test/CodeGen/AArch64/sme-new-za-function.ll 
b/llvm/test/CodeGen/AArch64/sme-new-za-function.ll
index d2715b58439d8..6995cfae8e459 100644
--- a/llvm/test/CodeGen/AArch64/sme-new-za-function.ll
+++ b/llvm/test/CodeGen/AArch64/sme-new-za-function.ll
@@ -51,7 +51,6 @@ define void @private_za() "aarch64_new_za" {
 }
 
 ; Note: This test must run at -O0 as otherwise the multiple exits are 
optimized out.
-; TODO: We should be able to omit the ZA save here (as this function does not 
use ZA).
 define i32 @private_za_multiple_exit(i32 %a, i32 %b, i64 %cond) 
"aarch64_new_za" {
 ; CHECK-SDAG-LABEL: private_za_multiple_exit:
 ; CHECK-SDAG:       // %bb.0: // %prelude
@@ -99,33 +98,21 @@ define i32 @private_za_multiple_exit(i32 %a, i32 %b, i64 
%cond) "aarch64_new_za"
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    sub sp, sp, #16
 ; CHECK-NEXT:    .cfi_def_cfa_offset 16
-; CHECK-NEXT:    mrs x8, TPIDR2_EL0
-; CHECK-NEXT:    cbnz x8, .LBB1_1
-; CHECK-NEXT:    b .LBB1_2
-; CHECK-NEXT:  .LBB1_1: // %entry
-; CHECK-NEXT:    bl __arm_tpidr2_save
-; CHECK-NEXT:    msr TPIDR2_EL0, xzr
-; CHECK-NEXT:    zero {za}
-; CHECK-NEXT:    b .LBB1_2
-; CHECK-NEXT:  .LBB1_2: // %entry
-; CHECK-NEXT:    smstart za
 ; CHECK-NEXT:    str w1, [sp, #8] // 4-byte Spill
 ; CHECK-NEXT:    str w0, [sp, #12] // 4-byte Spill
 ; CHECK-NEXT:    subs x8, x2, #1
-; CHECK-NEXT:    b.ne .LBB1_4
-; CHECK-NEXT:    b .LBB1_3
-; CHECK-NEXT:  .LBB1_3: // %if.else
+; CHECK-NEXT:    b.ne .LBB1_2
+; CHECK-NEXT:    b .LBB1_1
+; CHECK-NEXT:  .LBB1_1: // %if.else
 ; CHECK-NEXT:    ldr w8, [sp, #12] // 4-byte Reload
 ; CHECK-NEXT:    ldr w9, [sp, #8] // 4-byte Reload
 ; CHECK-NEXT:    add w0, w8, w9
-; CHECK-NEXT:    smstop za
 ; CHECK-NEXT:    add sp, sp, #16
 ; CHECK-NEXT:    ret
-; CHECK-NEXT:  .LBB1_4: // %if.end
+; CHECK-NEXT:  .LBB1_2: // %if.end
 ; CHECK-NEXT:    ldr w8, [sp, #12] // 4-byte Reload
 ; CHECK-NEXT:    ldr w9, [sp, #8] // 4-byte Reload
 ; CHECK-NEXT:    subs w0, w8, w9
-; CHECK-NEXT:    smstop za
 ; CHECK-NEXT:    add sp, sp, #16
 ; CHECK-NEXT:    ret
 entry:

diff  --git a/llvm/test/CodeGen/AArch64/sme-za-control-flow.ll 
b/llvm/test/CodeGen/AArch64/sme-za-control-flow.ll
index 50449172ce85b..aae1d3b756f4e 100644
--- a/llvm/test/CodeGen/AArch64/sme-za-control-flow.ll
+++ b/llvm/test/CodeGen/AArch64/sme-za-control-flow.ll
@@ -49,36 +49,40 @@ define void @private_za_loop(i32 %n) "aarch64_inout_za" 
nounwind {
 ; CHECK-LABEL: private_za_loop:
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    stp x29, x30, [sp, #-32]! // 16-byte Folded Spill
-; CHECK-NEXT:    str x19, [sp, #16] // 8-byte Spill
+; CHECK-NEXT:    stp x20, x19, [sp, #16] // 16-byte Folded Spill
 ; CHECK-NEXT:    mov x29, sp
 ; CHECK-NEXT:    sub sp, sp, #16
 ; CHECK-NEXT:    rdsvl x8, #1
 ; CHECK-NEXT:    mov x9, sp
 ; CHECK-NEXT:    msub x9, x8, x8, x9
 ; CHECK-NEXT:    mov sp, x9
-; CHECK-NEXT:    sub x10, x29, #16
 ; CHECK-NEXT:    cmp w0, #1
 ; CHECK-NEXT:    stp x9, x8, [x29, #-16]
-; CHECK-NEXT:    msr TPIDR2_EL0, x10
-; CHECK-NEXT:    b.lt .LBB0_3
+; CHECK-NEXT:    b.lt .LBB0_5
 ; CHECK-NEXT:  // %bb.1: // %loop.preheader
 ; CHECK-NEXT:    mov w19, w0
+; CHECK-NEXT:    sub x20, x29, #16
+; CHECK-NEXT:    b .LBB0_3
 ; CHECK-NEXT:  .LBB0_2: // %loop
+; CHECK-NEXT:    // in Loop: Header=BB0_3 Depth=1
+; CHECK-NEXT:    msr TPIDR2_EL0, xzr
+; CHECK-NEXT:    cbz w19, .LBB0_5
+; CHECK-NEXT:  .LBB0_3: // %loop
 ; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    msr TPIDR2_EL0, x20
 ; CHECK-NEXT:    bl private_za_call
-; CHECK-NEXT:    subs w19, w19, #1
-; CHECK-NEXT:    b.ne .LBB0_2
-; CHECK-NEXT:  .LBB0_3: // %exit
+; CHECK-NEXT:    sub w19, w19, #1
 ; CHECK-NEXT:    smstart za
 ; CHECK-NEXT:    mrs x8, TPIDR2_EL0
 ; CHECK-NEXT:    sub x0, x29, #16
-; CHECK-NEXT:    cbnz x8, .LBB0_5
-; CHECK-NEXT:  // %bb.4: // %exit
+; CHECK-NEXT:    cbnz x8, .LBB0_2
+; CHECK-NEXT:  // %bb.4: // %loop
+; CHECK-NEXT:    // in Loop: Header=BB0_3 Depth=1
 ; CHECK-NEXT:    bl __arm_tpidr2_restore
+; CHECK-NEXT:    b .LBB0_2
 ; CHECK-NEXT:  .LBB0_5: // %exit
-; CHECK-NEXT:    msr TPIDR2_EL0, xzr
 ; CHECK-NEXT:    mov sp, x29
-; CHECK-NEXT:    ldr x19, [sp, #16] // 8-byte Reload
+; CHECK-NEXT:    ldp x20, x19, [sp, #16] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldp x29, x30, [sp], #32 // 16-byte Folded Reload
 ; CHECK-NEXT:    ret
 entry:

diff  --git a/llvm/test/CodeGen/AArch64/sme-za-exceptions.ll 
b/llvm/test/CodeGen/AArch64/sme-za-exceptions.ll
index 5243b8d7203d8..19ea1e47f84ff 100644
--- a/llvm/test/CodeGen/AArch64/sme-za-exceptions.ll
+++ b/llvm/test/CodeGen/AArch64/sme-za-exceptions.ll
@@ -63,17 +63,25 @@ define void @za_with_raii(i1 %fail) "aarch64_inout_za" 
personality ptr @__gxx_pe
 ; CHECK-NEXT:    ldr x1, [x1, :got_lo12:typeinfo_for_char_const_ptr]
 ; CHECK-NEXT:    bl __cxa_throw
 ; CHECK-NEXT:  .Ltmp1: // EH_LABEL
-; CHECK-NEXT:  // %bb.3: // %throw_fail
-; CHECK-NEXT:  .LBB0_4: // %unwind_dtors
+; CHECK-NEXT:    smstart za
+; CHECK-NEXT:    mrs x8, TPIDR2_EL0
+; CHECK-NEXT:    sub x0, x29, #16
+; CHECK-NEXT:    cbnz x8, .LBB0_4
+; CHECK-NEXT:  // %bb.3: // %throw_exception
+; CHECK-NEXT:    bl __arm_tpidr2_restore
+; CHECK-NEXT:  .LBB0_4: // %throw_exception
+; CHECK-NEXT:    msr TPIDR2_EL0, xzr
+; CHECK-NEXT:  // %bb.5: // %throw_fail
+; CHECK-NEXT:  .LBB0_6: // %unwind_dtors
 ; CHECK-NEXT:  .Ltmp2: // EH_LABEL
 ; CHECK-NEXT:    mov x19, x0
 ; CHECK-NEXT:    smstart za
 ; CHECK-NEXT:    mrs x8, TPIDR2_EL0
 ; CHECK-NEXT:    sub x0, x29, #16
-; CHECK-NEXT:    cbnz x8, .LBB0_6
-; CHECK-NEXT:  // %bb.5: // %unwind_dtors
+; CHECK-NEXT:    cbnz x8, .LBB0_8
+; CHECK-NEXT:  // %bb.7: // %unwind_dtors
 ; CHECK-NEXT:    bl __arm_tpidr2_restore
-; CHECK-NEXT:  .LBB0_6: // %unwind_dtors
+; CHECK-NEXT:  .LBB0_8: // %unwind_dtors
 ; CHECK-NEXT:    msr TPIDR2_EL0, xzr
 ; CHECK-NEXT:    bl shared_za_call
 ; CHECK-NEXT:    sub x8, x29, #16
@@ -224,15 +232,15 @@ define void @try_catch() "aarch64_inout_za" personality 
ptr @__gxx_personality_v
 ; CHECK-NEXT:    msr TPIDR2_EL0, x8
 ; CHECK-NEXT:    bl may_throw
 ; CHECK-NEXT:  .Ltmp4: // EH_LABEL
-; CHECK-NEXT:  .LBB1_1: // %after_catch
 ; CHECK-NEXT:    smstart za
 ; CHECK-NEXT:    mrs x8, TPIDR2_EL0
 ; CHECK-NEXT:    sub x0, x29, #16
-; CHECK-NEXT:    cbnz x8, .LBB1_3
-; CHECK-NEXT:  // %bb.2: // %after_catch
+; CHECK-NEXT:    cbnz x8, .LBB1_2
+; CHECK-NEXT:  // %bb.1:
 ; CHECK-NEXT:    bl __arm_tpidr2_restore
-; CHECK-NEXT:  .LBB1_3: // %after_catch
+; CHECK-NEXT:  .LBB1_2:
 ; CHECK-NEXT:    msr TPIDR2_EL0, xzr
+; CHECK-NEXT:  .LBB1_3: // %after_catch
 ; CHECK-NEXT:    mov sp, x29
 ; CHECK-NEXT:    ldp x29, x30, [sp], #16 // 16-byte Folded Reload
 ; CHECK-NEXT:    b shared_za_call
@@ -251,7 +259,15 @@ define void @try_catch() "aarch64_inout_za" personality 
ptr @__gxx_personality_v
 ; CHECK-NEXT:    sub x8, x29, #16
 ; CHECK-NEXT:    msr TPIDR2_EL0, x8
 ; CHECK-NEXT:    bl __cxa_end_catch
-; CHECK-NEXT:    b .LBB1_1
+; CHECK-NEXT:    smstart za
+; CHECK-NEXT:    mrs x8, TPIDR2_EL0
+; CHECK-NEXT:    sub x0, x29, #16
+; CHECK-NEXT:    cbnz x8, .LBB1_8
+; CHECK-NEXT:  // %bb.7: // %catch
+; CHECK-NEXT:    bl __arm_tpidr2_restore
+; CHECK-NEXT:  .LBB1_8: // %catch
+; CHECK-NEXT:    msr TPIDR2_EL0, xzr
+; CHECK-NEXT:    b .LBB1_3
 ;
 ; CHECK-SDAG-LABEL: try_catch:
 ; CHECK-SDAG:       .Lfunc_begin1:
@@ -387,8 +403,8 @@ define void @try_catch_shared_za_callee() "aarch64_new_za" 
personality ptr @__gx
 ; CHECK-NEXT:  .Ltmp6: // EH_LABEL
 ; CHECK-NEXT:    bl shared_za_call
 ; CHECK-NEXT:  .Ltmp7: // EH_LABEL
-; CHECK-NEXT:  .LBB2_3: // %exit
 ; CHECK-NEXT:    smstop za
+; CHECK-NEXT:  .LBB2_3: // %exit
 ; CHECK-NEXT:    mov sp, x29
 ; CHECK-NEXT:    ldp x29, x30, [sp], #16 // 16-byte Folded Reload
 ; CHECK-NEXT:    ret
@@ -408,6 +424,7 @@ define void @try_catch_shared_za_callee() "aarch64_new_za" 
personality ptr @__gx
 ; CHECK-NEXT:    msr TPIDR2_EL0, x8
 ; CHECK-NEXT:    bl __cxa_end_catch
 ; CHECK-NEXT:    msr TPIDR2_EL0, xzr
+; CHECK-NEXT:    smstop za
 ; CHECK-NEXT:    b .LBB2_3
 ;
 ; CHECK-SDAG-LABEL: try_catch_shared_za_callee:
@@ -636,9 +653,9 @@ define void @try_catch_agnostic_za() 
"aarch64_za_state_agnostic" personality ptr
 ; CHECK-NEXT:    bl __arm_sme_save
 ; CHECK-NEXT:    bl may_throw
 ; CHECK-NEXT:  .Ltmp13: // EH_LABEL
-; CHECK-NEXT:  .LBB4_1: // %exit
 ; CHECK-NEXT:    mov x0, x19
 ; CHECK-NEXT:    bl __arm_sme_restore
+; CHECK-NEXT:  .LBB4_1: // %exit
 ; CHECK-NEXT:    mov sp, x29
 ; CHECK-NEXT:    ldr x19, [sp, #16] // 8-byte Reload
 ; CHECK-NEXT:    ldp x29, x30, [sp], #32 // 16-byte Folded Reload
@@ -647,6 +664,8 @@ define void @try_catch_agnostic_za() 
"aarch64_za_state_agnostic" personality ptr
 ; CHECK-NEXT:  .Ltmp14: // EH_LABEL
 ; CHECK-NEXT:    bl __cxa_begin_catch
 ; CHECK-NEXT:    bl __cxa_end_catch
+; CHECK-NEXT:    mov x0, x19
+; CHECK-NEXT:    bl __arm_sme_restore
 ; CHECK-NEXT:    b .LBB4_1
 ;
 ; CHECK-SDAG-LABEL: try_catch_agnostic_za:
@@ -746,9 +765,9 @@ define void @try_catch_agnostic_za_invoke() 
"aarch64_za_state_agnostic" personal
 ; CHECK-NEXT:    bl __arm_sme_save
 ; CHECK-NEXT:    bl agnostic_za_call
 ; CHECK-NEXT:  .Ltmp16: // EH_LABEL
-; CHECK-NEXT:  .LBB5_1: // %exit
 ; CHECK-NEXT:    mov x0, x19
 ; CHECK-NEXT:    bl __arm_sme_restore
+; CHECK-NEXT:  .LBB5_1: // %exit
 ; CHECK-NEXT:    mov sp, x29
 ; CHECK-NEXT:    ldr x19, [sp, #16] // 8-byte Reload
 ; CHECK-NEXT:    ldp x29, x30, [sp], #32 // 16-byte Folded Reload
@@ -757,6 +776,8 @@ define void @try_catch_agnostic_za_invoke() 
"aarch64_za_state_agnostic" personal
 ; CHECK-NEXT:  .Ltmp17: // EH_LABEL
 ; CHECK-NEXT:    bl __cxa_begin_catch
 ; CHECK-NEXT:    bl __cxa_end_catch
+; CHECK-NEXT:    mov x0, x19
+; CHECK-NEXT:    bl __arm_sme_restore
 ; CHECK-NEXT:    b .LBB5_1
 ;
 ; CHECK-SDAG-LABEL: try_catch_agnostic_za_invoke:
@@ -845,15 +866,15 @@ define void @try_catch_inout_za_agnostic_za_callee() 
"aarch64_inout_za" personal
 ; CHECK-NEXT:    msr TPIDR2_EL0, x8
 ; CHECK-NEXT:    bl agnostic_za_call
 ; CHECK-NEXT:  .Ltmp19: // EH_LABEL
-; CHECK-NEXT:  .LBB6_1: // %exit
 ; CHECK-NEXT:    smstart za
 ; CHECK-NEXT:    mrs x8, TPIDR2_EL0
 ; CHECK-NEXT:    sub x0, x29, #16
-; CHECK-NEXT:    cbnz x8, .LBB6_3
-; CHECK-NEXT:  // %bb.2: // %exit
+; CHECK-NEXT:    cbnz x8, .LBB6_2
+; CHECK-NEXT:  // %bb.1: // %entry
 ; CHECK-NEXT:    bl __arm_tpidr2_restore
-; CHECK-NEXT:  .LBB6_3: // %exit
+; CHECK-NEXT:  .LBB6_2: // %entry
 ; CHECK-NEXT:    msr TPIDR2_EL0, xzr
+; CHECK-NEXT:  .LBB6_3: // %exit
 ; CHECK-NEXT:    mov sp, x29
 ; CHECK-NEXT:    ldp x29, x30, [sp], #16 // 16-byte Folded Reload
 ; CHECK-NEXT:    ret
@@ -861,7 +882,15 @@ define void @try_catch_inout_za_agnostic_za_callee() 
"aarch64_inout_za" personal
 ; CHECK-NEXT:  .Ltmp20: // EH_LABEL
 ; CHECK-NEXT:    bl __cxa_begin_catch
 ; CHECK-NEXT:    bl __cxa_end_catch
-; CHECK-NEXT:    b .LBB6_1
+; CHECK-NEXT:    smstart za
+; CHECK-NEXT:    mrs x8, TPIDR2_EL0
+; CHECK-NEXT:    sub x0, x29, #16
+; CHECK-NEXT:    cbnz x8, .LBB6_6
+; CHECK-NEXT:  // %bb.5: // %catch
+; CHECK-NEXT:    bl __arm_tpidr2_restore
+; CHECK-NEXT:  .LBB6_6: // %catch
+; CHECK-NEXT:    msr TPIDR2_EL0, xzr
+; CHECK-NEXT:    b .LBB6_3
 ;
 ; CHECK-SDAG-LABEL: try_catch_inout_za_agnostic_za_callee:
 ; CHECK-SDAG:       .Lfunc_begin6:
@@ -967,9 +996,9 @@ define void @try_catch_inout_zt0() "aarch64_inout_zt0" 
personality ptr @__gxx_pe
 ; CHECK-NEXT:    smstop za
 ; CHECK-NEXT:    bl may_throw
 ; CHECK-NEXT:  .Ltmp22: // EH_LABEL
-; CHECK-NEXT:  .LBB7_1: // %exit
 ; CHECK-NEXT:    smstart za
 ; CHECK-NEXT:    ldr zt0, [x19]
+; CHECK-NEXT:  .LBB7_1: // %exit
 ; CHECK-NEXT:    ldp x30, x19, [sp, #64] // 16-byte Folded Reload
 ; CHECK-NEXT:    add sp, sp, #80
 ; CHECK-NEXT:    ret
@@ -977,6 +1006,8 @@ define void @try_catch_inout_zt0() "aarch64_inout_zt0" 
personality ptr @__gxx_pe
 ; CHECK-NEXT:  .Ltmp23: // EH_LABEL
 ; CHECK-NEXT:    bl __cxa_begin_catch
 ; CHECK-NEXT:    bl __cxa_end_catch
+; CHECK-NEXT:    smstart za
+; CHECK-NEXT:    ldr zt0, [x19]
 ; CHECK-NEXT:    b .LBB7_1
 ;
 ; CHECK-SDAG-LABEL: try_catch_inout_zt0:

diff  --git a/llvm/test/CodeGen/AArch64/sme-za-lazy-save-buffer.ll 
b/llvm/test/CodeGen/AArch64/sme-za-lazy-save-buffer.ll
index d4840f77c5392..f5c11146a7ca6 100644
--- a/llvm/test/CodeGen/AArch64/sme-za-lazy-save-buffer.ll
+++ b/llvm/test/CodeGen/AArch64/sme-za-lazy-save-buffer.ll
@@ -1,52 +1,52 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py 
UTC_ARGS: --version 4
-; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme2 -aarch64-new-sme-abi=false 
< %s | FileCheck %s --check-prefixes=CHECK-COMMON,CHECK-SDAG
-; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme2 < %s | FileCheck %s 
--check-prefixes=CHECK-COMMON,CHECK
+; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme2 -aarch64-new-sme-abi=false 
< %s | FileCheck %s
+; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme2 < %s | FileCheck %s
 
 define i32 @no_tpidr2_save_required() "aarch64_inout_za" {
-; CHECK-COMMON-LABEL: no_tpidr2_save_required:
-; CHECK-COMMON:       // %bb.0: // %entry
-; CHECK-COMMON-NEXT:    mov w0, #42 // =0x2a
-; CHECK-COMMON-NEXT:    ret
+; CHECK-LABEL: no_tpidr2_save_required:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    mov w0, #42 // =0x2a
+; CHECK-NEXT:    ret
 entry:
   ret i32 42
 }
 
 define float @multi_bb_stpidr2_save_required(i32 %a, float %b, float %c) 
"aarch64_inout_za" {
-; CHECK-COMMON-LABEL: multi_bb_stpidr2_save_required:
-; CHECK-COMMON:       // %bb.0:
-; CHECK-COMMON-NEXT:    stp x29, x30, [sp, #-16]! // 16-byte Folded Spill
-; CHECK-COMMON-NEXT:    mov x29, sp
-; CHECK-COMMON-NEXT:    sub sp, sp, #16
-; CHECK-COMMON-NEXT:    .cfi_def_cfa w29, 16
-; CHECK-COMMON-NEXT:    .cfi_offset w30, -8
-; CHECK-COMMON-NEXT:    .cfi_offset w29, -16
-; CHECK-COMMON-NEXT:    rdsvl x8, #1
-; CHECK-COMMON-NEXT:    mov x9, sp
-; CHECK-COMMON-NEXT:    msub x9, x8, x8, x9
-; CHECK-COMMON-NEXT:    mov sp, x9
-; CHECK-COMMON-NEXT:    stp x9, x8, [x29, #-16]
-; CHECK-COMMON-NEXT:    cbz w0, .LBB1_2
-; CHECK-COMMON-NEXT:  // %bb.1: // %use_b
-; CHECK-COMMON-NEXT:    fmov s1, #4.00000000
-; CHECK-COMMON-NEXT:    fadd s0, s0, s1
-; CHECK-COMMON-NEXT:    b .LBB1_5
-; CHECK-COMMON-NEXT:  .LBB1_2: // %use_c
-; CHECK-COMMON-NEXT:    fmov s0, s1
-; CHECK-COMMON-NEXT:    sub x8, x29, #16
-; CHECK-COMMON-NEXT:    msr TPIDR2_EL0, x8
-; CHECK-COMMON-NEXT:    bl cosf
-; CHECK-COMMON-NEXT:    smstart za
-; CHECK-COMMON-NEXT:    mrs x8, TPIDR2_EL0
-; CHECK-COMMON-NEXT:    sub x0, x29, #16
-; CHECK-COMMON-NEXT:    cbnz x8, .LBB1_4
-; CHECK-COMMON-NEXT:  // %bb.3: // %use_c
-; CHECK-COMMON-NEXT:    bl __arm_tpidr2_restore
-; CHECK-COMMON-NEXT:  .LBB1_4: // %use_c
-; CHECK-COMMON-NEXT:    msr TPIDR2_EL0, xzr
-; CHECK-COMMON-NEXT:  .LBB1_5: // %exit
-; CHECK-COMMON-NEXT:    mov sp, x29
-; CHECK-COMMON-NEXT:    ldp x29, x30, [sp], #16 // 16-byte Folded Reload
-; CHECK-COMMON-NEXT:    ret
+; CHECK-LABEL: multi_bb_stpidr2_save_required:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    stp x29, x30, [sp, #-16]! // 16-byte Folded Spill
+; CHECK-NEXT:    mov x29, sp
+; CHECK-NEXT:    sub sp, sp, #16
+; CHECK-NEXT:    .cfi_def_cfa w29, 16
+; CHECK-NEXT:    .cfi_offset w30, -8
+; CHECK-NEXT:    .cfi_offset w29, -16
+; CHECK-NEXT:    rdsvl x8, #1
+; CHECK-NEXT:    mov x9, sp
+; CHECK-NEXT:    msub x9, x8, x8, x9
+; CHECK-NEXT:    mov sp, x9
+; CHECK-NEXT:    stp x9, x8, [x29, #-16]
+; CHECK-NEXT:    cbz w0, .LBB1_2
+; CHECK-NEXT:  // %bb.1: // %use_b
+; CHECK-NEXT:    fmov s1, #4.00000000
+; CHECK-NEXT:    fadd s0, s0, s1
+; CHECK-NEXT:    b .LBB1_5
+; CHECK-NEXT:  .LBB1_2: // %use_c
+; CHECK-NEXT:    fmov s0, s1
+; CHECK-NEXT:    sub x8, x29, #16
+; CHECK-NEXT:    msr TPIDR2_EL0, x8
+; CHECK-NEXT:    bl cosf
+; CHECK-NEXT:    smstart za
+; CHECK-NEXT:    mrs x8, TPIDR2_EL0
+; CHECK-NEXT:    sub x0, x29, #16
+; CHECK-NEXT:    cbnz x8, .LBB1_4
+; CHECK-NEXT:  // %bb.3: // %use_c
+; CHECK-NEXT:    bl __arm_tpidr2_restore
+; CHECK-NEXT:  .LBB1_4: // %use_c
+; CHECK-NEXT:    msr TPIDR2_EL0, xzr
+; CHECK-NEXT:  .LBB1_5: // %exit
+; CHECK-NEXT:    mov sp, x29
+; CHECK-NEXT:    ldp x29, x30, [sp], #16 // 16-byte Folded Reload
+; CHECK-NEXT:    ret
   %cmp = icmp ne i32 %a, 0
   br i1 %cmp, label %use_b, label %use_c
 
@@ -64,51 +64,6 @@ exit:
 }
 
 define float @multi_bb_stpidr2_save_required_stackprobe(i32 %a, float %b, 
float %c) "aarch64_inout_za" "probe-stack"="inline-asm" 
"stack-probe-size"="65536" {
-; CHECK-SDAG-LABEL: multi_bb_stpidr2_save_required_stackprobe:
-; CHECK-SDAG:       // %bb.0:
-; CHECK-SDAG-NEXT:    stp x29, x30, [sp, #-16]! // 16-byte Folded Spill
-; CHECK-SDAG-NEXT:    mov x29, sp
-; CHECK-SDAG-NEXT:    str xzr, [sp, #-16]!
-; CHECK-SDAG-NEXT:    .cfi_def_cfa w29, 16
-; CHECK-SDAG-NEXT:    .cfi_offset w30, -8
-; CHECK-SDAG-NEXT:    .cfi_offset w29, -16
-; CHECK-SDAG-NEXT:    rdsvl x8, #1
-; CHECK-SDAG-NEXT:    mov x9, sp
-; CHECK-SDAG-NEXT:    msub x9, x8, x8, x9
-; CHECK-SDAG-NEXT:  .LBB2_1: // =>This Inner Loop Header: Depth=1
-; CHECK-SDAG-NEXT:    sub sp, sp, #16, lsl #12 // =65536
-; CHECK-SDAG-NEXT:    cmp sp, x9
-; CHECK-SDAG-NEXT:    b.le .LBB2_3
-; CHECK-SDAG-NEXT:  // %bb.2: // in Loop: Header=BB2_1 Depth=1
-; CHECK-SDAG-NEXT:    str xzr, [sp]
-; CHECK-SDAG-NEXT:    b .LBB2_1
-; CHECK-SDAG-NEXT:  .LBB2_3:
-; CHECK-SDAG-NEXT:    mov sp, x9
-; CHECK-SDAG-NEXT:    ldr xzr, [sp]
-; CHECK-SDAG-NEXT:    stp x9, x8, [x29, #-16]
-; CHECK-SDAG-NEXT:    cbz w0, .LBB2_5
-; CHECK-SDAG-NEXT:  // %bb.4: // %use_b
-; CHECK-SDAG-NEXT:    fmov s1, #4.00000000
-; CHECK-SDAG-NEXT:    fadd s0, s0, s1
-; CHECK-SDAG-NEXT:    b .LBB2_8
-; CHECK-SDAG-NEXT:  .LBB2_5: // %use_c
-; CHECK-SDAG-NEXT:    fmov s0, s1
-; CHECK-SDAG-NEXT:    sub x8, x29, #16
-; CHECK-SDAG-NEXT:    msr TPIDR2_EL0, x8
-; CHECK-SDAG-NEXT:    bl cosf
-; CHECK-SDAG-NEXT:    smstart za
-; CHECK-SDAG-NEXT:    mrs x8, TPIDR2_EL0
-; CHECK-SDAG-NEXT:    sub x0, x29, #16
-; CHECK-SDAG-NEXT:    cbnz x8, .LBB2_7
-; CHECK-SDAG-NEXT:  // %bb.6: // %use_c
-; CHECK-SDAG-NEXT:    bl __arm_tpidr2_restore
-; CHECK-SDAG-NEXT:  .LBB2_7: // %use_c
-; CHECK-SDAG-NEXT:    msr TPIDR2_EL0, xzr
-; CHECK-SDAG-NEXT:  .LBB2_8: // %exit
-; CHECK-SDAG-NEXT:    mov sp, x29
-; CHECK-SDAG-NEXT:    ldp x29, x30, [sp], #16 // 16-byte Folded Reload
-; CHECK-SDAG-NEXT:    ret
-;
 ; CHECK-LABEL: multi_bb_stpidr2_save_required_stackprobe:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    stp x29, x30, [sp, #-16]! // 16-byte Folded Spill
@@ -119,9 +74,7 @@ define float @multi_bb_stpidr2_save_required_stackprobe(i32 
%a, float %b, float
 ; CHECK-NEXT:    .cfi_offset w29, -16
 ; CHECK-NEXT:    rdsvl x8, #1
 ; CHECK-NEXT:    mov x9, sp
-; CHECK-NEXT:    sub x10, x29, #16
 ; CHECK-NEXT:    msub x9, x8, x8, x9
-; CHECK-NEXT:    msr TPIDR2_EL0, x10
 ; CHECK-NEXT:  .LBB2_1: // =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    sub sp, sp, #16, lsl #12 // =65536
 ; CHECK-NEXT:    cmp sp, x9
@@ -137,19 +90,21 @@ define float 
@multi_bb_stpidr2_save_required_stackprobe(i32 %a, float %b, float
 ; CHECK-NEXT:  // %bb.4: // %use_b
 ; CHECK-NEXT:    fmov s1, #4.00000000
 ; CHECK-NEXT:    fadd s0, s0, s1
-; CHECK-NEXT:    b .LBB2_6
+; CHECK-NEXT:    b .LBB2_8
 ; CHECK-NEXT:  .LBB2_5: // %use_c
 ; CHECK-NEXT:    fmov s0, s1
+; CHECK-NEXT:    sub x8, x29, #16
+; CHECK-NEXT:    msr TPIDR2_EL0, x8
 ; CHECK-NEXT:    bl cosf
-; CHECK-NEXT:  .LBB2_6: // %exit
 ; CHECK-NEXT:    smstart za
 ; CHECK-NEXT:    mrs x8, TPIDR2_EL0
 ; CHECK-NEXT:    sub x0, x29, #16
-; CHECK-NEXT:    cbnz x8, .LBB2_8
-; CHECK-NEXT:  // %bb.7: // %exit
+; CHECK-NEXT:    cbnz x8, .LBB2_7
+; CHECK-NEXT:  // %bb.6: // %use_c
 ; CHECK-NEXT:    bl __arm_tpidr2_restore
-; CHECK-NEXT:  .LBB2_8: // %exit
+; CHECK-NEXT:  .LBB2_7: // %use_c
 ; CHECK-NEXT:    msr TPIDR2_EL0, xzr
+; CHECK-NEXT:  .LBB2_8: // %exit
 ; CHECK-NEXT:    mov sp, x29
 ; CHECK-NEXT:    ldp x29, x30, [sp], #16 // 16-byte Folded Reload
 ; CHECK-NEXT:    ret


        
_______________________________________________
llvm-branch-commits mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits

Reply via email to