Author: Benjamin Maxwell Date: 2026-02-04T09:30:44Z New Revision: e4328b80992c2f63d58cc19ec70a33a62cde67a8
URL: https://github.com/llvm/llvm-project/commit/e4328b80992c2f63d58cc19ec70a33a62cde67a8 DIFF: https://github.com/llvm/llvm-project/commit/e4328b80992c2f63d58cc19ec70a33a62cde67a8.diff LOG: [AArch64][SME] Limit where SME ABI optimizations apply (#179273) These optimizations were added recently with a fairly complex propagation step; however, they can cause regressions in some cases. This patch limits the cross-block optimizations to the simple case of picking a state that matches all incoming blocks. If any block doesn't match, we fall back to using "ACTIVE", the default state. Added: Modified: llvm/lib/Target/AArch64/MachineSMEABIPass.cpp llvm/test/CodeGen/AArch64/sme-agnostic-za.ll llvm/test/CodeGen/AArch64/sme-new-za-function.ll llvm/test/CodeGen/AArch64/sme-za-control-flow.ll llvm/test/CodeGen/AArch64/sme-za-exceptions.ll llvm/test/CodeGen/AArch64/sme-za-lazy-save-buffer.ll Removed: ################################################################################ diff --git a/llvm/lib/Target/AArch64/MachineSMEABIPass.cpp b/llvm/lib/Target/AArch64/MachineSMEABIPass.cpp index 823c754a0ac05..9b96bed823817 100644 --- a/llvm/lib/Target/AArch64/MachineSMEABIPass.cpp +++ b/llvm/lib/Target/AArch64/MachineSMEABIPass.cpp @@ -201,23 +201,6 @@ class EmitContext { Register AgnosticZABufferPtr = AArch64::NoRegister; }; -/// Checks if \p State is a legal edge bundle state. For a state to be a legal -/// bundle state, it must be possible to transition from it to any other bundle -/// state without losing any ZA state. This is the case for ACTIVE/LOCAL_SAVED, -/// as you can transition between those states by saving/restoring ZA. The OFF -/// state would not be legal, as transitioning to it drops the content of ZA. -static bool isLegalEdgeBundleZAState(ZAState State) { - switch (State) { - case ZAState::ACTIVE: // ZA state within the accumulator/ZT0. - case ZAState::ACTIVE_ZT0_SAVED: // ZT0 is saved (ZA is active). 
- case ZAState::LOCAL_SAVED: // ZA state may be saved on the stack. - case ZAState::LOCAL_COMMITTED: // ZA state is saved on the stack. - return true; - default: - return false; - } -} - StringRef getZAStateString(ZAState State) { #define MAKE_CASE(V) \ case V: \ @@ -325,11 +308,6 @@ struct MachineSMEABI : public MachineFunctionPass { const EdgeBundles &Bundles, ArrayRef<ZAState> BundleStates); - /// Propagates desired states forwards (from predecessors -> successors) if - /// \p Forwards, otherwise, propagates backwards (from successors -> - /// predecessors). - void propagateDesiredStates(FunctionInfo &FnInfo, bool Forwards = true); - void emitZT0SaveRestore(EmitContext &, MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, bool IsSave); @@ -526,110 +504,36 @@ FunctionInfo MachineSMEABI::collectNeededZAStates(SMEAttrs SMEFnAttrs) { PhysLiveRegsAfterSMEPrologue}; } -void MachineSMEABI::propagateDesiredStates(FunctionInfo &FnInfo, - bool Forwards) { - // If `Forwards`, this propagates desired states from predecessors to - // successors, otherwise, this propagates states from successors to - // predecessors. - auto GetBlockState = [](BlockInfo &Block, bool Incoming) -> ZAState & { - return Incoming ? Block.DesiredIncomingState : Block.DesiredOutgoingState; - }; - - SmallVector<MachineBasicBlock *> Worklist; - for (auto [BlockID, BlockInfo] : enumerate(FnInfo.Blocks)) { - if (!isLegalEdgeBundleZAState(GetBlockState(BlockInfo, Forwards))) - Worklist.push_back(MF->getBlockNumbered(BlockID)); - } - - while (!Worklist.empty()) { - MachineBasicBlock *MBB = Worklist.pop_back_val(); - BlockInfo &Block = FnInfo.Blocks[MBB->getNumber()]; - - // Pick a legal edge bundle state that matches the majority of - // predecessors/successors. - int StateCounts[ZAState::NUM_ZA_STATE] = {0}; - for (MachineBasicBlock *PredOrSucc : - Forwards ? 
predecessors(MBB) : successors(MBB)) { - BlockInfo &PredOrSuccBlock = FnInfo.Blocks[PredOrSucc->getNumber()]; - ZAState ZAState = GetBlockState(PredOrSuccBlock, !Forwards); - if (isLegalEdgeBundleZAState(ZAState)) - StateCounts[ZAState]++; - } - - ZAState PropagatedState = ZAState(max_element(StateCounts) - StateCounts); - ZAState &CurrentState = GetBlockState(Block, Forwards); - if (PropagatedState != CurrentState) { - CurrentState = PropagatedState; - ZAState &OtherState = GetBlockState(Block, !Forwards); - // Propagate to the incoming/outgoing state if that is also "ANY". - if (OtherState == ZAState::ANY) - OtherState = PropagatedState; - // Push any successors/predecessors that may need updating to the - // worklist. - for (MachineBasicBlock *SuccOrPred : - Forwards ? successors(MBB) : predecessors(MBB)) { - BlockInfo &SuccOrPredBlock = FnInfo.Blocks[SuccOrPred->getNumber()]; - if (!isLegalEdgeBundleZAState(GetBlockState(SuccOrPredBlock, Forwards))) - Worklist.push_back(SuccOrPred); - } - } - } -} - /// Assigns each edge bundle a ZA state based on the needed states of blocks -/// that have incoming or outgoing edges in that bundle. +/// that have incoming or outgoing blocks in that bundle. SmallVector<ZAState> MachineSMEABI::assignBundleZAStates(const EdgeBundles &Bundles, const FunctionInfo &FnInfo) { SmallVector<ZAState> BundleStates(Bundles.getNumBundles()); for (unsigned I = 0, E = Bundles.getNumBundles(); I != E; ++I) { - LLVM_DEBUG(dbgs() << "Assigning ZA state for edge bundle: " << I << '\n'); - - // Attempt to assign a ZA state for this bundle that minimizes state - // transitions. Edges within loops are given a higher weight as we assume - // they will be executed more than once. - int EdgeStateCounts[ZAState::NUM_ZA_STATE] = {0}; + std::optional<ZAState> BundleState; for (unsigned BlockID : Bundles.getBlocks(I)) { - LLVM_DEBUG(dbgs() << "- bb." 
<< BlockID); - const BlockInfo &Block = FnInfo.Blocks[BlockID]; - bool InEdge = Bundles.getBundle(BlockID, /*Out=*/false) == I; - bool OutEdge = Bundles.getBundle(BlockID, /*Out=*/true) == I; - - bool LegalInEdge = - InEdge && isLegalEdgeBundleZAState(Block.DesiredIncomingState); - bool LegalOutEgde = - OutEdge && isLegalEdgeBundleZAState(Block.DesiredOutgoingState); - if (LegalInEdge) { - LLVM_DEBUG(dbgs() << " DesiredIncomingState: " - << getZAStateString(Block.DesiredIncomingState)); - EdgeStateCounts[Block.DesiredIncomingState]++; - } - if (LegalOutEgde) { - LLVM_DEBUG(dbgs() << " DesiredOutgoingState: " - << getZAStateString(Block.DesiredOutgoingState)); - EdgeStateCounts[Block.DesiredOutgoingState]++; - } - if (!LegalInEdge && !LegalOutEgde) - LLVM_DEBUG(dbgs() << " (no state preference)"); - LLVM_DEBUG(dbgs() << '\n'); + // Check if the block is an incoming block in the bundle. Note: We skip + // Block.FixedEntryState != ANY to ignore EH pads (which are only + // reachable via exceptions). + if (Block.FixedEntryState != ZAState::ANY || + Bundles.getBundle(BlockID, /*Out=*/false) != I) + continue; + + // Pick a state that matches all incoming blocks. Fallback to "ACTIVE" if + // any blocks doesn't match. This will hoist the state from incoming + // blocks to outgoing blocks. 
+ if (!BundleState) + BundleState = Block.DesiredIncomingState; + else if (BundleState != Block.DesiredIncomingState) + BundleState = ZAState::ACTIVE; } - ZAState BundleState = - ZAState(max_element(EdgeStateCounts) - EdgeStateCounts); - - if (BundleState == ZAState::ANY) + if (!BundleState || BundleState == ZAState::ANY) BundleState = ZAState::ACTIVE; - LLVM_DEBUG({ - dbgs() << "Chosen ZA state: " << getZAStateString(BundleState) << '\n' - << "Edge counts:"; - for (auto [State, Count] : enumerate(EdgeStateCounts)) - dbgs() << " " << getZAStateString(ZAState(State)) << ": " << Count; - dbgs() << "\n\n"; - }); - - BundleStates[I] = BundleState; + BundleStates[I] = *BundleState; } return BundleStates; @@ -1268,42 +1172,6 @@ bool MachineSMEABI::runOnMachineFunction(MachineFunction &MF) { FunctionInfo FnInfo = collectNeededZAStates(SMEFnAttrs); - if (OptLevel != CodeGenOptLevel::None) { - // Propagate desired states forward, then backwards. Most of the propagation - // should be done in the forward step, and backwards propagation is then - // used to fill in the gaps. Note: Doing both in one step can give poor - // results. For example, consider this subgraph: - // - // ┌─────┐ - // ┌─┤ BB0 ◄───┐ - // │ └─┬───┘ │ - // │ ┌─▼───◄──┐│ - // │ │ BB1 │ ││ - // │ └─┬┬──┘ ││ - // │ │└─────┘│ - // │ ┌─▼───┐ │ - // │ │ BB2 ├───┘ - // │ └─┬───┘ - // │ ┌─▼───┐ - // └─► BB3 │ - // └─────┘ - // - // If: - // - "BB0" and "BB2" (outer loop) has no state preference - // - "BB1" (inner loop) desires the ACTIVE state on entry/exit - // - "BB3" desires the LOCAL_SAVED state on entry - // - // If we propagate forwards first, ACTIVE is propagated from BB1 to BB2, - // then from BB2 to BB0. Which results in the inner and outer loops having - // the "ACTIVE" state. This avoids any state changes in the loops. - // - // If we propagate backwards first, we _could_ propagate LOCAL_SAVED from - // BB3 to BB0, which would result in a transition from ACTIVE -> LOCAL_SAVED - // in the outer loop. 
- for (bool Forwards : {true, false}) - propagateDesiredStates(FnInfo, Forwards); - } - SmallVector<ZAState> BundleStates = assignBundleZAStates(Bundles, FnInfo); EmitContext Context; diff --git a/llvm/test/CodeGen/AArch64/sme-agnostic-za.ll b/llvm/test/CodeGen/AArch64/sme-agnostic-za.ll index 344f1ef24b843..4a18b9f61d69f 100644 --- a/llvm/test/CodeGen/AArch64/sme-agnostic-za.ll +++ b/llvm/test/CodeGen/AArch64/sme-agnostic-za.ll @@ -352,61 +352,33 @@ define i64 @test_many_callee_arguments( } define void @agnostic_za_buffer_alloc_with_stack_probes() nounwind "aarch64_za_state_agnostic" "probe-stack"="inline-asm" "stack-probe-size"="65536"{ -; CHECK-SDAG-LABEL: agnostic_za_buffer_alloc_with_stack_probes: -; CHECK-SDAG: // %bb.0: -; CHECK-SDAG-NEXT: stp x29, x30, [sp, #-32]! // 16-byte Folded Spill -; CHECK-SDAG-NEXT: str x19, [sp, #16] // 8-byte Spill -; CHECK-SDAG-NEXT: mov x29, sp -; CHECK-SDAG-NEXT: bl __arm_sme_state_size -; CHECK-SDAG-NEXT: mov x8, sp -; CHECK-SDAG-NEXT: sub x19, x8, x0 -; CHECK-SDAG-NEXT: .LBB7_1: // =>This Inner Loop Header: Depth=1 -; CHECK-SDAG-NEXT: sub sp, sp, #16, lsl #12 // =65536 -; CHECK-SDAG-NEXT: cmp sp, x19 -; CHECK-SDAG-NEXT: b.le .LBB7_3 -; CHECK-SDAG-NEXT: // %bb.2: // in Loop: Header=BB7_1 Depth=1 -; CHECK-SDAG-NEXT: str xzr, [sp] -; CHECK-SDAG-NEXT: b .LBB7_1 -; CHECK-SDAG-NEXT: .LBB7_3: -; CHECK-SDAG-NEXT: mov sp, x19 -; CHECK-SDAG-NEXT: ldr xzr, [sp] -; CHECK-SDAG-NEXT: mov x0, x19 -; CHECK-SDAG-NEXT: bl __arm_sme_save -; CHECK-SDAG-NEXT: bl private_za -; CHECK-SDAG-NEXT: mov x0, x19 -; CHECK-SDAG-NEXT: bl __arm_sme_restore -; CHECK-SDAG-NEXT: mov sp, x29 -; CHECK-SDAG-NEXT: ldr x19, [sp, #16] // 8-byte Reload -; CHECK-SDAG-NEXT: ldp x29, x30, [sp], #32 // 16-byte Folded Reload -; CHECK-SDAG-NEXT: ret -; -; CHECK-LABEL: agnostic_za_buffer_alloc_with_stack_probes: -; CHECK: // %bb.0: -; CHECK-NEXT: stp x29, x30, [sp, #-32]! 
// 16-byte Folded Spill -; CHECK-NEXT: str x19, [sp, #16] // 8-byte Spill -; CHECK-NEXT: mov x29, sp -; CHECK-NEXT: bl __arm_sme_state_size -; CHECK-NEXT: mov x8, sp -; CHECK-NEXT: sub x19, x8, x0 -; CHECK-NEXT: mov x0, x19 -; CHECK-NEXT: bl __arm_sme_save -; CHECK-NEXT: .LBB7_1: // =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: sub sp, sp, #16, lsl #12 // =65536 -; CHECK-NEXT: cmp sp, x19 -; CHECK-NEXT: b.le .LBB7_3 -; CHECK-NEXT: // %bb.2: // in Loop: Header=BB7_1 Depth=1 -; CHECK-NEXT: str xzr, [sp] -; CHECK-NEXT: b .LBB7_1 -; CHECK-NEXT: .LBB7_3: -; CHECK-NEXT: mov sp, x19 -; CHECK-NEXT: ldr xzr, [sp] -; CHECK-NEXT: bl private_za -; CHECK-NEXT: mov x0, x19 -; CHECK-NEXT: bl __arm_sme_restore -; CHECK-NEXT: mov sp, x29 -; CHECK-NEXT: ldr x19, [sp, #16] // 8-byte Reload -; CHECK-NEXT: ldp x29, x30, [sp], #32 // 16-byte Folded Reload -; CHECK-NEXT: ret +; CHECK-COMMON-LABEL: agnostic_za_buffer_alloc_with_stack_probes: +; CHECK-COMMON: // %bb.0: +; CHECK-COMMON-NEXT: stp x29, x30, [sp, #-32]! 
// 16-byte Folded Spill +; CHECK-COMMON-NEXT: str x19, [sp, #16] // 8-byte Spill +; CHECK-COMMON-NEXT: mov x29, sp +; CHECK-COMMON-NEXT: bl __arm_sme_state_size +; CHECK-COMMON-NEXT: mov x8, sp +; CHECK-COMMON-NEXT: sub x19, x8, x0 +; CHECK-COMMON-NEXT: .LBB7_1: // =>This Inner Loop Header: Depth=1 +; CHECK-COMMON-NEXT: sub sp, sp, #16, lsl #12 // =65536 +; CHECK-COMMON-NEXT: cmp sp, x19 +; CHECK-COMMON-NEXT: b.le .LBB7_3 +; CHECK-COMMON-NEXT: // %bb.2: // in Loop: Header=BB7_1 Depth=1 +; CHECK-COMMON-NEXT: str xzr, [sp] +; CHECK-COMMON-NEXT: b .LBB7_1 +; CHECK-COMMON-NEXT: .LBB7_3: +; CHECK-COMMON-NEXT: mov sp, x19 +; CHECK-COMMON-NEXT: ldr xzr, [sp] +; CHECK-COMMON-NEXT: mov x0, x19 +; CHECK-COMMON-NEXT: bl __arm_sme_save +; CHECK-COMMON-NEXT: bl private_za +; CHECK-COMMON-NEXT: mov x0, x19 +; CHECK-COMMON-NEXT: bl __arm_sme_restore +; CHECK-COMMON-NEXT: mov sp, x29 +; CHECK-COMMON-NEXT: ldr x19, [sp, #16] // 8-byte Reload +; CHECK-COMMON-NEXT: ldp x29, x30, [sp], #32 // 16-byte Folded Reload +; CHECK-COMMON-NEXT: ret call void @private_za() ret void } diff --git a/llvm/test/CodeGen/AArch64/sme-new-za-function.ll b/llvm/test/CodeGen/AArch64/sme-new-za-function.ll index d2715b58439d8..6995cfae8e459 100644 --- a/llvm/test/CodeGen/AArch64/sme-new-za-function.ll +++ b/llvm/test/CodeGen/AArch64/sme-new-za-function.ll @@ -51,7 +51,6 @@ define void @private_za() "aarch64_new_za" { } ; Note: This test must run at -O0 as otherwise the multiple exits are optimized out. -; TODO: We should be able to omit the ZA save here (as this function does not use ZA). 
define i32 @private_za_multiple_exit(i32 %a, i32 %b, i64 %cond) "aarch64_new_za" { ; CHECK-SDAG-LABEL: private_za_multiple_exit: ; CHECK-SDAG: // %bb.0: // %prelude @@ -99,33 +98,21 @@ define i32 @private_za_multiple_exit(i32 %a, i32 %b, i64 %cond) "aarch64_new_za" ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: sub sp, sp, #16 ; CHECK-NEXT: .cfi_def_cfa_offset 16 -; CHECK-NEXT: mrs x8, TPIDR2_EL0 -; CHECK-NEXT: cbnz x8, .LBB1_1 -; CHECK-NEXT: b .LBB1_2 -; CHECK-NEXT: .LBB1_1: // %entry -; CHECK-NEXT: bl __arm_tpidr2_save -; CHECK-NEXT: msr TPIDR2_EL0, xzr -; CHECK-NEXT: zero {za} -; CHECK-NEXT: b .LBB1_2 -; CHECK-NEXT: .LBB1_2: // %entry -; CHECK-NEXT: smstart za ; CHECK-NEXT: str w1, [sp, #8] // 4-byte Spill ; CHECK-NEXT: str w0, [sp, #12] // 4-byte Spill ; CHECK-NEXT: subs x8, x2, #1 -; CHECK-NEXT: b.ne .LBB1_4 -; CHECK-NEXT: b .LBB1_3 -; CHECK-NEXT: .LBB1_3: // %if.else +; CHECK-NEXT: b.ne .LBB1_2 +; CHECK-NEXT: b .LBB1_1 +; CHECK-NEXT: .LBB1_1: // %if.else ; CHECK-NEXT: ldr w8, [sp, #12] // 4-byte Reload ; CHECK-NEXT: ldr w9, [sp, #8] // 4-byte Reload ; CHECK-NEXT: add w0, w8, w9 -; CHECK-NEXT: smstop za ; CHECK-NEXT: add sp, sp, #16 ; CHECK-NEXT: ret -; CHECK-NEXT: .LBB1_4: // %if.end +; CHECK-NEXT: .LBB1_2: // %if.end ; CHECK-NEXT: ldr w8, [sp, #12] // 4-byte Reload ; CHECK-NEXT: ldr w9, [sp, #8] // 4-byte Reload ; CHECK-NEXT: subs w0, w8, w9 -; CHECK-NEXT: smstop za ; CHECK-NEXT: add sp, sp, #16 ; CHECK-NEXT: ret entry: diff --git a/llvm/test/CodeGen/AArch64/sme-za-control-flow.ll b/llvm/test/CodeGen/AArch64/sme-za-control-flow.ll index 50449172ce85b..aae1d3b756f4e 100644 --- a/llvm/test/CodeGen/AArch64/sme-za-control-flow.ll +++ b/llvm/test/CodeGen/AArch64/sme-za-control-flow.ll @@ -49,36 +49,40 @@ define void @private_za_loop(i32 %n) "aarch64_inout_za" nounwind { ; CHECK-LABEL: private_za_loop: ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: stp x29, x30, [sp, #-32]! 
// 16-byte Folded Spill -; CHECK-NEXT: str x19, [sp, #16] // 8-byte Spill +; CHECK-NEXT: stp x20, x19, [sp, #16] // 16-byte Folded Spill ; CHECK-NEXT: mov x29, sp ; CHECK-NEXT: sub sp, sp, #16 ; CHECK-NEXT: rdsvl x8, #1 ; CHECK-NEXT: mov x9, sp ; CHECK-NEXT: msub x9, x8, x8, x9 ; CHECK-NEXT: mov sp, x9 -; CHECK-NEXT: sub x10, x29, #16 ; CHECK-NEXT: cmp w0, #1 ; CHECK-NEXT: stp x9, x8, [x29, #-16] -; CHECK-NEXT: msr TPIDR2_EL0, x10 -; CHECK-NEXT: b.lt .LBB0_3 +; CHECK-NEXT: b.lt .LBB0_5 ; CHECK-NEXT: // %bb.1: // %loop.preheader ; CHECK-NEXT: mov w19, w0 +; CHECK-NEXT: sub x20, x29, #16 +; CHECK-NEXT: b .LBB0_3 ; CHECK-NEXT: .LBB0_2: // %loop +; CHECK-NEXT: // in Loop: Header=BB0_3 Depth=1 +; CHECK-NEXT: msr TPIDR2_EL0, xzr +; CHECK-NEXT: cbz w19, .LBB0_5 +; CHECK-NEXT: .LBB0_3: // %loop ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: msr TPIDR2_EL0, x20 ; CHECK-NEXT: bl private_za_call -; CHECK-NEXT: subs w19, w19, #1 -; CHECK-NEXT: b.ne .LBB0_2 -; CHECK-NEXT: .LBB0_3: // %exit +; CHECK-NEXT: sub w19, w19, #1 ; CHECK-NEXT: smstart za ; CHECK-NEXT: mrs x8, TPIDR2_EL0 ; CHECK-NEXT: sub x0, x29, #16 -; CHECK-NEXT: cbnz x8, .LBB0_5 -; CHECK-NEXT: // %bb.4: // %exit +; CHECK-NEXT: cbnz x8, .LBB0_2 +; CHECK-NEXT: // %bb.4: // %loop +; CHECK-NEXT: // in Loop: Header=BB0_3 Depth=1 ; CHECK-NEXT: bl __arm_tpidr2_restore +; CHECK-NEXT: b .LBB0_2 ; CHECK-NEXT: .LBB0_5: // %exit -; CHECK-NEXT: msr TPIDR2_EL0, xzr ; CHECK-NEXT: mov sp, x29 -; CHECK-NEXT: ldr x19, [sp, #16] // 8-byte Reload +; CHECK-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload ; CHECK-NEXT: ldp x29, x30, [sp], #32 // 16-byte Folded Reload ; CHECK-NEXT: ret entry: diff --git a/llvm/test/CodeGen/AArch64/sme-za-exceptions.ll b/llvm/test/CodeGen/AArch64/sme-za-exceptions.ll index 5243b8d7203d8..19ea1e47f84ff 100644 --- a/llvm/test/CodeGen/AArch64/sme-za-exceptions.ll +++ b/llvm/test/CodeGen/AArch64/sme-za-exceptions.ll @@ -63,17 +63,25 @@ define void @za_with_raii(i1 %fail) 
"aarch64_inout_za" personality ptr @__gxx_pe ; CHECK-NEXT: ldr x1, [x1, :got_lo12:typeinfo_for_char_const_ptr] ; CHECK-NEXT: bl __cxa_throw ; CHECK-NEXT: .Ltmp1: // EH_LABEL -; CHECK-NEXT: // %bb.3: // %throw_fail -; CHECK-NEXT: .LBB0_4: // %unwind_dtors +; CHECK-NEXT: smstart za +; CHECK-NEXT: mrs x8, TPIDR2_EL0 +; CHECK-NEXT: sub x0, x29, #16 +; CHECK-NEXT: cbnz x8, .LBB0_4 +; CHECK-NEXT: // %bb.3: // %throw_exception +; CHECK-NEXT: bl __arm_tpidr2_restore +; CHECK-NEXT: .LBB0_4: // %throw_exception +; CHECK-NEXT: msr TPIDR2_EL0, xzr +; CHECK-NEXT: // %bb.5: // %throw_fail +; CHECK-NEXT: .LBB0_6: // %unwind_dtors ; CHECK-NEXT: .Ltmp2: // EH_LABEL ; CHECK-NEXT: mov x19, x0 ; CHECK-NEXT: smstart za ; CHECK-NEXT: mrs x8, TPIDR2_EL0 ; CHECK-NEXT: sub x0, x29, #16 -; CHECK-NEXT: cbnz x8, .LBB0_6 -; CHECK-NEXT: // %bb.5: // %unwind_dtors +; CHECK-NEXT: cbnz x8, .LBB0_8 +; CHECK-NEXT: // %bb.7: // %unwind_dtors ; CHECK-NEXT: bl __arm_tpidr2_restore -; CHECK-NEXT: .LBB0_6: // %unwind_dtors +; CHECK-NEXT: .LBB0_8: // %unwind_dtors ; CHECK-NEXT: msr TPIDR2_EL0, xzr ; CHECK-NEXT: bl shared_za_call ; CHECK-NEXT: sub x8, x29, #16 @@ -224,15 +232,15 @@ define void @try_catch() "aarch64_inout_za" personality ptr @__gxx_personality_v ; CHECK-NEXT: msr TPIDR2_EL0, x8 ; CHECK-NEXT: bl may_throw ; CHECK-NEXT: .Ltmp4: // EH_LABEL -; CHECK-NEXT: .LBB1_1: // %after_catch ; CHECK-NEXT: smstart za ; CHECK-NEXT: mrs x8, TPIDR2_EL0 ; CHECK-NEXT: sub x0, x29, #16 -; CHECK-NEXT: cbnz x8, .LBB1_3 -; CHECK-NEXT: // %bb.2: // %after_catch +; CHECK-NEXT: cbnz x8, .LBB1_2 +; CHECK-NEXT: // %bb.1: ; CHECK-NEXT: bl __arm_tpidr2_restore -; CHECK-NEXT: .LBB1_3: // %after_catch +; CHECK-NEXT: .LBB1_2: ; CHECK-NEXT: msr TPIDR2_EL0, xzr +; CHECK-NEXT: .LBB1_3: // %after_catch ; CHECK-NEXT: mov sp, x29 ; CHECK-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload ; CHECK-NEXT: b shared_za_call @@ -251,7 +259,15 @@ define void @try_catch() "aarch64_inout_za" personality ptr @__gxx_personality_v ; 
CHECK-NEXT: sub x8, x29, #16 ; CHECK-NEXT: msr TPIDR2_EL0, x8 ; CHECK-NEXT: bl __cxa_end_catch -; CHECK-NEXT: b .LBB1_1 +; CHECK-NEXT: smstart za +; CHECK-NEXT: mrs x8, TPIDR2_EL0 +; CHECK-NEXT: sub x0, x29, #16 +; CHECK-NEXT: cbnz x8, .LBB1_8 +; CHECK-NEXT: // %bb.7: // %catch +; CHECK-NEXT: bl __arm_tpidr2_restore +; CHECK-NEXT: .LBB1_8: // %catch +; CHECK-NEXT: msr TPIDR2_EL0, xzr +; CHECK-NEXT: b .LBB1_3 ; ; CHECK-SDAG-LABEL: try_catch: ; CHECK-SDAG: .Lfunc_begin1: @@ -387,8 +403,8 @@ define void @try_catch_shared_za_callee() "aarch64_new_za" personality ptr @__gx ; CHECK-NEXT: .Ltmp6: // EH_LABEL ; CHECK-NEXT: bl shared_za_call ; CHECK-NEXT: .Ltmp7: // EH_LABEL -; CHECK-NEXT: .LBB2_3: // %exit ; CHECK-NEXT: smstop za +; CHECK-NEXT: .LBB2_3: // %exit ; CHECK-NEXT: mov sp, x29 ; CHECK-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload ; CHECK-NEXT: ret @@ -408,6 +424,7 @@ define void @try_catch_shared_za_callee() "aarch64_new_za" personality ptr @__gx ; CHECK-NEXT: msr TPIDR2_EL0, x8 ; CHECK-NEXT: bl __cxa_end_catch ; CHECK-NEXT: msr TPIDR2_EL0, xzr +; CHECK-NEXT: smstop za ; CHECK-NEXT: b .LBB2_3 ; ; CHECK-SDAG-LABEL: try_catch_shared_za_callee: @@ -636,9 +653,9 @@ define void @try_catch_agnostic_za() "aarch64_za_state_agnostic" personality ptr ; CHECK-NEXT: bl __arm_sme_save ; CHECK-NEXT: bl may_throw ; CHECK-NEXT: .Ltmp13: // EH_LABEL -; CHECK-NEXT: .LBB4_1: // %exit ; CHECK-NEXT: mov x0, x19 ; CHECK-NEXT: bl __arm_sme_restore +; CHECK-NEXT: .LBB4_1: // %exit ; CHECK-NEXT: mov sp, x29 ; CHECK-NEXT: ldr x19, [sp, #16] // 8-byte Reload ; CHECK-NEXT: ldp x29, x30, [sp], #32 // 16-byte Folded Reload @@ -647,6 +664,8 @@ define void @try_catch_agnostic_za() "aarch64_za_state_agnostic" personality ptr ; CHECK-NEXT: .Ltmp14: // EH_LABEL ; CHECK-NEXT: bl __cxa_begin_catch ; CHECK-NEXT: bl __cxa_end_catch +; CHECK-NEXT: mov x0, x19 +; CHECK-NEXT: bl __arm_sme_restore ; CHECK-NEXT: b .LBB4_1 ; ; CHECK-SDAG-LABEL: try_catch_agnostic_za: @@ -746,9 +765,9 @@ define 
void @try_catch_agnostic_za_invoke() "aarch64_za_state_agnostic" personal ; CHECK-NEXT: bl __arm_sme_save ; CHECK-NEXT: bl agnostic_za_call ; CHECK-NEXT: .Ltmp16: // EH_LABEL -; CHECK-NEXT: .LBB5_1: // %exit ; CHECK-NEXT: mov x0, x19 ; CHECK-NEXT: bl __arm_sme_restore +; CHECK-NEXT: .LBB5_1: // %exit ; CHECK-NEXT: mov sp, x29 ; CHECK-NEXT: ldr x19, [sp, #16] // 8-byte Reload ; CHECK-NEXT: ldp x29, x30, [sp], #32 // 16-byte Folded Reload @@ -757,6 +776,8 @@ define void @try_catch_agnostic_za_invoke() "aarch64_za_state_agnostic" personal ; CHECK-NEXT: .Ltmp17: // EH_LABEL ; CHECK-NEXT: bl __cxa_begin_catch ; CHECK-NEXT: bl __cxa_end_catch +; CHECK-NEXT: mov x0, x19 +; CHECK-NEXT: bl __arm_sme_restore ; CHECK-NEXT: b .LBB5_1 ; ; CHECK-SDAG-LABEL: try_catch_agnostic_za_invoke: @@ -845,15 +866,15 @@ define void @try_catch_inout_za_agnostic_za_callee() "aarch64_inout_za" personal ; CHECK-NEXT: msr TPIDR2_EL0, x8 ; CHECK-NEXT: bl agnostic_za_call ; CHECK-NEXT: .Ltmp19: // EH_LABEL -; CHECK-NEXT: .LBB6_1: // %exit ; CHECK-NEXT: smstart za ; CHECK-NEXT: mrs x8, TPIDR2_EL0 ; CHECK-NEXT: sub x0, x29, #16 -; CHECK-NEXT: cbnz x8, .LBB6_3 -; CHECK-NEXT: // %bb.2: // %exit +; CHECK-NEXT: cbnz x8, .LBB6_2 +; CHECK-NEXT: // %bb.1: // %entry ; CHECK-NEXT: bl __arm_tpidr2_restore -; CHECK-NEXT: .LBB6_3: // %exit +; CHECK-NEXT: .LBB6_2: // %entry ; CHECK-NEXT: msr TPIDR2_EL0, xzr +; CHECK-NEXT: .LBB6_3: // %exit ; CHECK-NEXT: mov sp, x29 ; CHECK-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload ; CHECK-NEXT: ret @@ -861,7 +882,15 @@ define void @try_catch_inout_za_agnostic_za_callee() "aarch64_inout_za" personal ; CHECK-NEXT: .Ltmp20: // EH_LABEL ; CHECK-NEXT: bl __cxa_begin_catch ; CHECK-NEXT: bl __cxa_end_catch -; CHECK-NEXT: b .LBB6_1 +; CHECK-NEXT: smstart za +; CHECK-NEXT: mrs x8, TPIDR2_EL0 +; CHECK-NEXT: sub x0, x29, #16 +; CHECK-NEXT: cbnz x8, .LBB6_6 +; CHECK-NEXT: // %bb.5: // %catch +; CHECK-NEXT: bl __arm_tpidr2_restore +; CHECK-NEXT: .LBB6_6: // %catch +; CHECK-NEXT: 
msr TPIDR2_EL0, xzr +; CHECK-NEXT: b .LBB6_3 ; ; CHECK-SDAG-LABEL: try_catch_inout_za_agnostic_za_callee: ; CHECK-SDAG: .Lfunc_begin6: @@ -967,9 +996,9 @@ define void @try_catch_inout_zt0() "aarch64_inout_zt0" personality ptr @__gxx_pe ; CHECK-NEXT: smstop za ; CHECK-NEXT: bl may_throw ; CHECK-NEXT: .Ltmp22: // EH_LABEL -; CHECK-NEXT: .LBB7_1: // %exit ; CHECK-NEXT: smstart za ; CHECK-NEXT: ldr zt0, [x19] +; CHECK-NEXT: .LBB7_1: // %exit ; CHECK-NEXT: ldp x30, x19, [sp, #64] // 16-byte Folded Reload ; CHECK-NEXT: add sp, sp, #80 ; CHECK-NEXT: ret @@ -977,6 +1006,8 @@ define void @try_catch_inout_zt0() "aarch64_inout_zt0" personality ptr @__gxx_pe ; CHECK-NEXT: .Ltmp23: // EH_LABEL ; CHECK-NEXT: bl __cxa_begin_catch ; CHECK-NEXT: bl __cxa_end_catch +; CHECK-NEXT: smstart za +; CHECK-NEXT: ldr zt0, [x19] ; CHECK-NEXT: b .LBB7_1 ; ; CHECK-SDAG-LABEL: try_catch_inout_zt0: diff --git a/llvm/test/CodeGen/AArch64/sme-za-lazy-save-buffer.ll b/llvm/test/CodeGen/AArch64/sme-za-lazy-save-buffer.ll index d4840f77c5392..f5c11146a7ca6 100644 --- a/llvm/test/CodeGen/AArch64/sme-za-lazy-save-buffer.ll +++ b/llvm/test/CodeGen/AArch64/sme-za-lazy-save-buffer.ll @@ -1,52 +1,52 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 -; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme2 -aarch64-new-sme-abi=false < %s | FileCheck %s --check-prefixes=CHECK-COMMON,CHECK-SDAG -; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme2 < %s | FileCheck %s --check-prefixes=CHECK-COMMON,CHECK +; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme2 -aarch64-new-sme-abi=false < %s | FileCheck %s +; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme2 < %s | FileCheck %s define i32 @no_tpidr2_save_required() "aarch64_inout_za" { -; CHECK-COMMON-LABEL: no_tpidr2_save_required: -; CHECK-COMMON: // %bb.0: // %entry -; CHECK-COMMON-NEXT: mov w0, #42 // =0x2a -; CHECK-COMMON-NEXT: ret +; CHECK-LABEL: no_tpidr2_save_required: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: 
mov w0, #42 // =0x2a +; CHECK-NEXT: ret entry: ret i32 42 } define float @multi_bb_stpidr2_save_required(i32 %a, float %b, float %c) "aarch64_inout_za" { -; CHECK-COMMON-LABEL: multi_bb_stpidr2_save_required: -; CHECK-COMMON: // %bb.0: -; CHECK-COMMON-NEXT: stp x29, x30, [sp, #-16]! // 16-byte Folded Spill -; CHECK-COMMON-NEXT: mov x29, sp -; CHECK-COMMON-NEXT: sub sp, sp, #16 -; CHECK-COMMON-NEXT: .cfi_def_cfa w29, 16 -; CHECK-COMMON-NEXT: .cfi_offset w30, -8 -; CHECK-COMMON-NEXT: .cfi_offset w29, -16 -; CHECK-COMMON-NEXT: rdsvl x8, #1 -; CHECK-COMMON-NEXT: mov x9, sp -; CHECK-COMMON-NEXT: msub x9, x8, x8, x9 -; CHECK-COMMON-NEXT: mov sp, x9 -; CHECK-COMMON-NEXT: stp x9, x8, [x29, #-16] -; CHECK-COMMON-NEXT: cbz w0, .LBB1_2 -; CHECK-COMMON-NEXT: // %bb.1: // %use_b -; CHECK-COMMON-NEXT: fmov s1, #4.00000000 -; CHECK-COMMON-NEXT: fadd s0, s0, s1 -; CHECK-COMMON-NEXT: b .LBB1_5 -; CHECK-COMMON-NEXT: .LBB1_2: // %use_c -; CHECK-COMMON-NEXT: fmov s0, s1 -; CHECK-COMMON-NEXT: sub x8, x29, #16 -; CHECK-COMMON-NEXT: msr TPIDR2_EL0, x8 -; CHECK-COMMON-NEXT: bl cosf -; CHECK-COMMON-NEXT: smstart za -; CHECK-COMMON-NEXT: mrs x8, TPIDR2_EL0 -; CHECK-COMMON-NEXT: sub x0, x29, #16 -; CHECK-COMMON-NEXT: cbnz x8, .LBB1_4 -; CHECK-COMMON-NEXT: // %bb.3: // %use_c -; CHECK-COMMON-NEXT: bl __arm_tpidr2_restore -; CHECK-COMMON-NEXT: .LBB1_4: // %use_c -; CHECK-COMMON-NEXT: msr TPIDR2_EL0, xzr -; CHECK-COMMON-NEXT: .LBB1_5: // %exit -; CHECK-COMMON-NEXT: mov sp, x29 -; CHECK-COMMON-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload -; CHECK-COMMON-NEXT: ret +; CHECK-LABEL: multi_bb_stpidr2_save_required: +; CHECK: // %bb.0: +; CHECK-NEXT: stp x29, x30, [sp, #-16]! 
// 16-byte Folded Spill +; CHECK-NEXT: mov x29, sp +; CHECK-NEXT: sub sp, sp, #16 +; CHECK-NEXT: .cfi_def_cfa w29, 16 +; CHECK-NEXT: .cfi_offset w30, -8 +; CHECK-NEXT: .cfi_offset w29, -16 +; CHECK-NEXT: rdsvl x8, #1 +; CHECK-NEXT: mov x9, sp +; CHECK-NEXT: msub x9, x8, x8, x9 +; CHECK-NEXT: mov sp, x9 +; CHECK-NEXT: stp x9, x8, [x29, #-16] +; CHECK-NEXT: cbz w0, .LBB1_2 +; CHECK-NEXT: // %bb.1: // %use_b +; CHECK-NEXT: fmov s1, #4.00000000 +; CHECK-NEXT: fadd s0, s0, s1 +; CHECK-NEXT: b .LBB1_5 +; CHECK-NEXT: .LBB1_2: // %use_c +; CHECK-NEXT: fmov s0, s1 +; CHECK-NEXT: sub x8, x29, #16 +; CHECK-NEXT: msr TPIDR2_EL0, x8 +; CHECK-NEXT: bl cosf +; CHECK-NEXT: smstart za +; CHECK-NEXT: mrs x8, TPIDR2_EL0 +; CHECK-NEXT: sub x0, x29, #16 +; CHECK-NEXT: cbnz x8, .LBB1_4 +; CHECK-NEXT: // %bb.3: // %use_c +; CHECK-NEXT: bl __arm_tpidr2_restore +; CHECK-NEXT: .LBB1_4: // %use_c +; CHECK-NEXT: msr TPIDR2_EL0, xzr +; CHECK-NEXT: .LBB1_5: // %exit +; CHECK-NEXT: mov sp, x29 +; CHECK-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload +; CHECK-NEXT: ret %cmp = icmp ne i32 %a, 0 br i1 %cmp, label %use_b, label %use_c @@ -64,51 +64,6 @@ exit: } define float @multi_bb_stpidr2_save_required_stackprobe(i32 %a, float %b, float %c) "aarch64_inout_za" "probe-stack"="inline-asm" "stack-probe-size"="65536" { -; CHECK-SDAG-LABEL: multi_bb_stpidr2_save_required_stackprobe: -; CHECK-SDAG: // %bb.0: -; CHECK-SDAG-NEXT: stp x29, x30, [sp, #-16]! // 16-byte Folded Spill -; CHECK-SDAG-NEXT: mov x29, sp -; CHECK-SDAG-NEXT: str xzr, [sp, #-16]! 
-; CHECK-SDAG-NEXT: .cfi_def_cfa w29, 16 -; CHECK-SDAG-NEXT: .cfi_offset w30, -8 -; CHECK-SDAG-NEXT: .cfi_offset w29, -16 -; CHECK-SDAG-NEXT: rdsvl x8, #1 -; CHECK-SDAG-NEXT: mov x9, sp -; CHECK-SDAG-NEXT: msub x9, x8, x8, x9 -; CHECK-SDAG-NEXT: .LBB2_1: // =>This Inner Loop Header: Depth=1 -; CHECK-SDAG-NEXT: sub sp, sp, #16, lsl #12 // =65536 -; CHECK-SDAG-NEXT: cmp sp, x9 -; CHECK-SDAG-NEXT: b.le .LBB2_3 -; CHECK-SDAG-NEXT: // %bb.2: // in Loop: Header=BB2_1 Depth=1 -; CHECK-SDAG-NEXT: str xzr, [sp] -; CHECK-SDAG-NEXT: b .LBB2_1 -; CHECK-SDAG-NEXT: .LBB2_3: -; CHECK-SDAG-NEXT: mov sp, x9 -; CHECK-SDAG-NEXT: ldr xzr, [sp] -; CHECK-SDAG-NEXT: stp x9, x8, [x29, #-16] -; CHECK-SDAG-NEXT: cbz w0, .LBB2_5 -; CHECK-SDAG-NEXT: // %bb.4: // %use_b -; CHECK-SDAG-NEXT: fmov s1, #4.00000000 -; CHECK-SDAG-NEXT: fadd s0, s0, s1 -; CHECK-SDAG-NEXT: b .LBB2_8 -; CHECK-SDAG-NEXT: .LBB2_5: // %use_c -; CHECK-SDAG-NEXT: fmov s0, s1 -; CHECK-SDAG-NEXT: sub x8, x29, #16 -; CHECK-SDAG-NEXT: msr TPIDR2_EL0, x8 -; CHECK-SDAG-NEXT: bl cosf -; CHECK-SDAG-NEXT: smstart za -; CHECK-SDAG-NEXT: mrs x8, TPIDR2_EL0 -; CHECK-SDAG-NEXT: sub x0, x29, #16 -; CHECK-SDAG-NEXT: cbnz x8, .LBB2_7 -; CHECK-SDAG-NEXT: // %bb.6: // %use_c -; CHECK-SDAG-NEXT: bl __arm_tpidr2_restore -; CHECK-SDAG-NEXT: .LBB2_7: // %use_c -; CHECK-SDAG-NEXT: msr TPIDR2_EL0, xzr -; CHECK-SDAG-NEXT: .LBB2_8: // %exit -; CHECK-SDAG-NEXT: mov sp, x29 -; CHECK-SDAG-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload -; CHECK-SDAG-NEXT: ret -; ; CHECK-LABEL: multi_bb_stpidr2_save_required_stackprobe: ; CHECK: // %bb.0: ; CHECK-NEXT: stp x29, x30, [sp, #-16]! 
// 16-byte Folded Spill @@ -119,9 +74,7 @@ define float @multi_bb_stpidr2_save_required_stackprobe(i32 %a, float %b, float ; CHECK-NEXT: .cfi_offset w29, -16 ; CHECK-NEXT: rdsvl x8, #1 ; CHECK-NEXT: mov x9, sp -; CHECK-NEXT: sub x10, x29, #16 ; CHECK-NEXT: msub x9, x8, x8, x9 -; CHECK-NEXT: msr TPIDR2_EL0, x10 ; CHECK-NEXT: .LBB2_1: // =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: sub sp, sp, #16, lsl #12 // =65536 ; CHECK-NEXT: cmp sp, x9 @@ -137,19 +90,21 @@ define float @multi_bb_stpidr2_save_required_stackprobe(i32 %a, float %b, float ; CHECK-NEXT: // %bb.4: // %use_b ; CHECK-NEXT: fmov s1, #4.00000000 ; CHECK-NEXT: fadd s0, s0, s1 -; CHECK-NEXT: b .LBB2_6 +; CHECK-NEXT: b .LBB2_8 ; CHECK-NEXT: .LBB2_5: // %use_c ; CHECK-NEXT: fmov s0, s1 +; CHECK-NEXT: sub x8, x29, #16 +; CHECK-NEXT: msr TPIDR2_EL0, x8 ; CHECK-NEXT: bl cosf -; CHECK-NEXT: .LBB2_6: // %exit ; CHECK-NEXT: smstart za ; CHECK-NEXT: mrs x8, TPIDR2_EL0 ; CHECK-NEXT: sub x0, x29, #16 -; CHECK-NEXT: cbnz x8, .LBB2_8 -; CHECK-NEXT: // %bb.7: // %exit +; CHECK-NEXT: cbnz x8, .LBB2_7 +; CHECK-NEXT: // %bb.6: // %use_c ; CHECK-NEXT: bl __arm_tpidr2_restore -; CHECK-NEXT: .LBB2_8: // %exit +; CHECK-NEXT: .LBB2_7: // %use_c ; CHECK-NEXT: msr TPIDR2_EL0, xzr +; CHECK-NEXT: .LBB2_8: // %exit ; CHECK-NEXT: mov sp, x29 ; CHECK-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload ; CHECK-NEXT: ret _______________________________________________ llvm-branch-commits mailing list [email protected] https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
