[clang] [AArch64] Stack probing for function prologues (PR #66524)
@@ -97,14 +97,45 @@ AArch64FunctionInfo::AArch64FunctionInfo(const Function &F, if (const auto *BTE = mdconst::extract_or_null( F.getParent()->getModuleFlag("branch-target-enforcement"))) BranchTargetEnforcement = BTE->getZExtValue(); -return; + } else { +const StringRef BTIEnable = +F.getFnAttribute("branch-target-enforcement").getValueAsString(); +assert(BTIEnable.equals_insensitive("true") || + BTIEnable.equals_insensitive("false")); efriedma-quic wrote: Is this something the verifier checks? If it isn't, we can't assert it. https://github.com/llvm/llvm-project/pull/66524 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [AArch64] Stack probing for function prologues (PR #66524)
@@ -1757,46 +1826,55 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF, } } - StackOffset AllocateBefore = SVEStackSize, AllocateAfter = {}; + StackOffset SVECalleeSavedSize = {}, SVELocalsSize = SVEStackSize; MachineBasicBlock::iterator CalleeSavesBegin = MBBI, CalleeSavesEnd = MBBI; // Process the SVE callee-saves to determine what space needs to be // allocated. if (int64_t CalleeSavedSize = AFI->getSVECalleeSavedStackSize()) { +LLVM_DEBUG(dbgs() << "SVECalleeSavedStackSize = " << CalleeSavedSize + << "\n"); // Find callee save instructions in frame. CalleeSavesBegin = MBBI; assert(IsSVECalleeSave(CalleeSavesBegin) && "Unexpected instruction"); while (IsSVECalleeSave(MBBI) && MBBI != MBB.getFirstTerminator()) ++MBBI; CalleeSavesEnd = MBBI; -AllocateBefore = StackOffset::getScalable(CalleeSavedSize); -AllocateAfter = SVEStackSize - AllocateBefore; +SVECalleeSavedSize = StackOffset::getScalable(CalleeSavedSize); +SVELocalsSize = SVEStackSize - SVECalleeSavedSize; + +// Allocate space for the SVE callee saves. +if (SVECalleeSavedSize) { + allocateSVEStackSpace( + MBB, CalleeSavesBegin, SVECalleeSavedSize, + StackOffset::getFixed((int64_t)MFI.getStackSize() - NumBytes), + EmitAsyncCFI && !HasFP); + if (EmitAsyncCFI) +emitCalleeSavedSVELocations(MBB, CalleeSavesEnd); +} } - // Allocate space for the callee saves (if any). - emitFrameOffset( - MBB, CalleeSavesBegin, DL, AArch64::SP, AArch64::SP, -AllocateBefore, TII, - MachineInstr::FrameSetup, false, false, nullptr, - EmitAsyncCFI && !HasFP && AllocateBefore, - StackOffset::getFixed((int64_t)MFI.getStackSize() - NumBytes)); - - if (EmitAsyncCFI) -emitCalleeSavedSVELocations(MBB, CalleeSavesEnd); - - // Finally allocate remaining SVE stack space. 
- emitFrameOffset(MBB, CalleeSavesEnd, DL, AArch64::SP, AArch64::SP, - -AllocateAfter, TII, MachineInstr::FrameSetup, false, false, - nullptr, EmitAsyncCFI && !HasFP && AllocateAfter, - AllocateBefore + StackOffset::getFixed( - (int64_t)MFI.getStackSize() - NumBytes)); + // Allocate stack space for the local SVE objects. + if (SVELocalsSize) +allocateSVEStackSpace( +MBB, CalleeSavesEnd, SVELocalsSize, +SVECalleeSavedSize + +StackOffset::getFixed((int64_t)MFI.getStackSize() - NumBytes), +EmitAsyncCFI && !HasFP); // Allocate space for the rest of the frame. if (NumBytes) { unsigned scratchSPReg = AArch64::SP; +bool NeedsStackProbe = TLI.hasInlineStackProbe(MF) && + (NumBytes > AArch64::StackProbeMaxUnprobedStack || +MFI.hasVarSizedObjects()); if (NeedsRealignment) { scratchSPReg = findScratchNonCalleeSaveRegister(&MBB); + NeedsStackProbe |= TLI.hasInlineStackProbe(MF) && + (NumBytes + MFI.getMaxAlign().value()) > efriedma-quic wrote: Can we use the RealignmentPadding we computed earlier? https://github.com/llvm/llvm-project/pull/66524 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [AArch64] Stack probing for function prologues (PR #66524)
@@ -1757,46 +1826,55 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF, } } - StackOffset AllocateBefore = SVEStackSize, AllocateAfter = {}; + StackOffset SVECalleeSavedSize = {}, SVELocalsSize = SVEStackSize; MachineBasicBlock::iterator CalleeSavesBegin = MBBI, CalleeSavesEnd = MBBI; // Process the SVE callee-saves to determine what space needs to be // allocated. if (int64_t CalleeSavedSize = AFI->getSVECalleeSavedStackSize()) { +LLVM_DEBUG(dbgs() << "SVECalleeSavedStackSize = " << CalleeSavedSize + << "\n"); // Find callee save instructions in frame. CalleeSavesBegin = MBBI; assert(IsSVECalleeSave(CalleeSavesBegin) && "Unexpected instruction"); while (IsSVECalleeSave(MBBI) && MBBI != MBB.getFirstTerminator()) ++MBBI; CalleeSavesEnd = MBBI; -AllocateBefore = StackOffset::getScalable(CalleeSavedSize); -AllocateAfter = SVEStackSize - AllocateBefore; +SVECalleeSavedSize = StackOffset::getScalable(CalleeSavedSize); +SVELocalsSize = SVEStackSize - SVECalleeSavedSize; + +// Allocate space for the SVE callee saves. +if (SVECalleeSavedSize) { + allocateSVEStackSpace( + MBB, CalleeSavesBegin, SVECalleeSavedSize, + StackOffset::getFixed((int64_t)MFI.getStackSize() - NumBytes), + EmitAsyncCFI && !HasFP); + if (EmitAsyncCFI) +emitCalleeSavedSVELocations(MBB, CalleeSavesEnd); +} } - // Allocate space for the callee saves (if any). - emitFrameOffset( - MBB, CalleeSavesBegin, DL, AArch64::SP, AArch64::SP, -AllocateBefore, TII, - MachineInstr::FrameSetup, false, false, nullptr, - EmitAsyncCFI && !HasFP && AllocateBefore, - StackOffset::getFixed((int64_t)MFI.getStackSize() - NumBytes)); - - if (EmitAsyncCFI) -emitCalleeSavedSVELocations(MBB, CalleeSavesEnd); - - // Finally allocate remaining SVE stack space. 
- emitFrameOffset(MBB, CalleeSavesEnd, DL, AArch64::SP, AArch64::SP, - -AllocateAfter, TII, MachineInstr::FrameSetup, false, false, - nullptr, EmitAsyncCFI && !HasFP && AllocateAfter, - AllocateBefore + StackOffset::getFixed( - (int64_t)MFI.getStackSize() - NumBytes)); + // Allocate stack space for the local SVE objects. + if (SVELocalsSize) +allocateSVEStackSpace( efriedma-quic wrote: It looks like the SVE probing doesn't work on Windows (that's not a regression, though). I think we want to try to integrate the SVE probing with non-SVE probing to some extent (if we're generating a loop, use one loop for both). That said, maybe we can leave those for a followup; I don't want to block the entire patch series on trying to improve this. https://github.com/llvm/llvm-project/pull/66524 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [AArch64] Stack probing for function prologues (PR #66524)
@@ -1076,6 +1076,16 @@ void CodeGenModule::Release() { "sign-return-address-with-bkey", 1); } + if (Arch == llvm::Triple::aarch64 || Arch == llvm::Triple::aarch64_be) { +auto *InlineAsm = llvm::MDString::get(TheModule.getContext(), "inline-asm"); +if (CodeGenOpts.StackClashProtector) + getModule().addModuleFlag(llvm::Module::Override, "probe-stack", +InlineAsm); efriedma-quic wrote: Why do we want to use a module flag for this, specifically on AArch64? I'd prefer to avoid architecture-specific code here unless there's actually some architecture-specific property involved. https://github.com/llvm/llvm-project/pull/66524 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [AArch64] Stack probing for function prologues (PR #66524)
oskarwirga wrote: Apologies for still not being able to create a reproducible example I can share but what I am seeing is the stack probe write overwriting the value at the tip of the stack when I step debug execution: ``` str xzr, [sp, #-0x10 {var_70}]! {0x0} ... stur x8, [x29, #-0x10 {var_70}] ... from the inlined function: str xzr, [x20] {0x0} mov sp, x20 ... ldur x8, [x29, #-0x10 {var_70}] << null deref ``` I also was able to isolate the issue to the non-fast register allocators. When building with optimized code, the greedy register allocator and the basic register allocator ended up choosing registers that were being clobbered (? don't know the term) by the stack probe write. > All the stack probing should have already finished before the call to > `malloc`. Only for the containing function, the functions which have their stack probes inlined will be in the middle of the function which then results in this null-deref. I think there's some re-arranging happening during optimization and inlining which causes the registers not to be expired (? don't know the term here) > Just to make things simpler, can you try disabling the shrink-wrapping and > see what happens? I haven't seen a noticeable difference with this, I tried passing it in with `-Wl,-mllvm,-enable-shrink-wrap=false` https://github.com/llvm/llvm-project/pull/66524 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [AArch64] Stack probing for function prologues (PR #66524)
momchil-velikov wrote: > Apologies for still not being able to create a reproducible example I can > share but what I am seeing is the stack probe write overwriting the value at > the tip of the stack when I step debug execution: Can you spot a place where the probe instruction is *not* immediately after a decrement of the stack (disregarding some random register-to-register arithmetic that may appear)? If you can't (and you should not find such a case), perhaps some instruction writes below the `sp`? That would be against the AArch64 ABI. Is redzone enabled (`--aarch64-redzone=true|false`)? If it is on, can you check if it makes a difference if you disable it? > > All the stack probing should have already finished before the call to > > `malloc`. > Only for the containing function, the functions which have their stack probes > inlined will be in the middle of the function which then results in this > null-deref. The stack probes (except `alloca` ones) are emitted by the PrologEpilogInsertion pass, which is very late in the pipeline. I didn't think inlining could happen after that pass. Can you share your compilation pipeline (`--debug-pass=Structure`)? https://github.com/llvm/llvm-project/pull/66524 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [AArch64] Stack probing for function prologues (PR #66524)
@@ -9460,6 +9461,94 @@ bool AArch64InstrInfo::isReallyTriviallyReMaterializable( return TargetInstrInfo::isReallyTriviallyReMaterializable(MI); } +MachineBasicBlock::iterator +AArch64InstrInfo::insertStackProbingLoop(MachineBasicBlock::iterator MBBI, + Register ScratchReg, + Register TargetReg) const { + MachineBasicBlock &MBB = *MBBI->getParent(); + MachineFunction &MF = *MBB.getParent(); + const AArch64InstrInfo *TII = + MF.getSubtarget().getInstrInfo(); + int64_t ProbeSize = MF.getInfo()->getStackProbeSize(); + DebugLoc DL = MBB.findDebugLoc(MBBI); + + MachineFunction::iterator MBBInsertPoint = std::next(MBB.getIterator()); + MachineBasicBlock *LoopTestMBB = + MF.CreateMachineBasicBlock(MBB.getBasicBlock()); + MF.insert(MBBInsertPoint, LoopTestMBB); + MachineBasicBlock *LoopBodyMBB = + MF.CreateMachineBasicBlock(MBB.getBasicBlock()); + MF.insert(MBBInsertPoint, LoopBodyMBB); + MachineBasicBlock *ExitMBB = MF.CreateMachineBasicBlock(MBB.getBasicBlock()); + MF.insert(MBBInsertPoint, ExitMBB); + + // LoopTest: + // SUB ScratchReg, ScratchReg, #ProbeSize + emitFrameOffset(*LoopTestMBB, LoopTestMBB->end(), DL, ScratchReg, ScratchReg, + StackOffset::getFixed(-ProbeSize), TII, + MachineInstr::FrameSetup); + + // CMP ScratchReg, TargetReg + AArch64CC::CondCode Cond = AArch64CC::LE; + Register Op1 = ScratchReg; + Register Op2 = TargetReg; + if (Op2 == AArch64::SP) { +assert(Op1 != AArch64::SP && "At most one of the registers can be SP"); +// CMP TargetReg, ScratchReg +std::swap(Op1, Op2); +Cond = AArch64CC::GT; + } + BuildMI(*LoopTestMBB, LoopTestMBB->end(), DL, TII->get(AArch64::SUBSXrx64), + AArch64::XZR) + .addReg(Op1) + .addReg(Op2) + .addImm(AArch64_AM::getArithExtendImm(AArch64_AM::UXTX, 0)) + .setMIFlags(MachineInstr::FrameSetup); + + // B. 
LoopExit + BuildMI(*LoopTestMBB, LoopTestMBB->end(), DL, TII->get(AArch64::Bcc)) + .addImm(Cond) + .addMBB(ExitMBB) + .setMIFlags(MachineInstr::FrameSetup); + + // STR XZR, [ScratchReg] + BuildMI(*LoopBodyMBB, LoopBodyMBB->end(), DL, TII->get(AArch64::STRXui)) + .addReg(AArch64::XZR) + .addReg(ScratchReg) + .addImm(0) + .setMIFlags(MachineInstr::FrameSetup); + + // B loop + BuildMI(*LoopBodyMBB, LoopBodyMBB->end(), DL, TII->get(AArch64::B)) + .addMBB(LoopTestMBB) + .setMIFlags(MachineInstr::FrameSetup); + + // LoopExit: + // STR XZR, [TargetReg] + BuildMI(*ExitMBB, ExitMBB->begin(), DL, TII->get(AArch64::STRXui)) + .addReg(AArch64::XZR) + .addReg(TargetReg) + .addImm(0) + .setMIFlags(MachineInstr::FrameSetup); oskarwirga wrote: > Can you spot a place where the probe instruction is not immediately after a > decrement of the stack (disregarding some random register-to-register > arithmetic that may appear)? This was the thread that led to me understanding what is happening: ``` sub sp, sp, #0x1, lsl #0xc cmp sp, x1 b.le0x557388 str xzr, [x1] {0x0} ``` We are probing the _old_ stack head! `x1` contains `0x7fee80` but `sp` is at `7fde80`! This means that the selection of the `x1` register instead of `sp` is incorrect. I confirmed this to be the case by fixing this probe here and testing again. ```suggestion // STR XZR, [ScratchReg] BuildMI(*ExitMBB, ExitMBB->begin(), DL, TII->get(AArch64::STRXui)) .addReg(AArch64::XZR) .addReg(ScratchReg) .addImm(0) .setMIFlags(MachineInstr::FrameSetup); ``` https://github.com/llvm/llvm-project/pull/66524 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [AArch64] Stack probing for function prologues (PR #66524)
github-actions[bot] wrote: :warning: C/C++ code formatter, clang-format found issues in your code. :warning: You can test this locally with the following command: ``bash git-clang-format --diff 8d59fc5fd1599bd7153817d2af903ae9a6103343 7b66d75f43bfd4e52f48307e9fcc2992eefb8bdd -- clang/lib/CodeGen/CodeGenModule.cpp clang/lib/Driver/ToolChains/Clang.cpp clang/test/CodeGen/stack-clash-protection.c llvm/lib/CodeGen/CFIFixup.cpp llvm/lib/Target/AArch64/AArch64FrameLowering.cpp llvm/lib/Target/AArch64/AArch64FrameLowering.h llvm/lib/Target/AArch64/AArch64ISelLowering.cpp llvm/lib/Target/AArch64/AArch64ISelLowering.h llvm/lib/Target/AArch64/AArch64InstrInfo.cpp llvm/lib/Target/AArch64/AArch64InstrInfo.h llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.cpp llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h `` View the diff from clang-format here. ``diff diff --git a/llvm/lib/CodeGen/CFIFixup.cpp b/llvm/lib/CodeGen/CFIFixup.cpp index 7269616e3fff..964a8d56511f 100644 --- a/llvm/lib/CodeGen/CFIFixup.cpp +++ b/llvm/lib/CodeGen/CFIFixup.cpp @@ -100,7 +100,8 @@ static bool containsEpilogue(const MachineBasicBlock &MBB) { static MachineBasicBlock * findPrologueEnd(MachineFunction &MF, MachineBasicBlock::iterator &PrologueEnd) { MachineBasicBlock *PrologueBlock = nullptr; - for (auto It = po_begin(&MF.front()), End = po_end(&MF.front()); It != End; ++It) { + for (auto It = po_begin(&MF.front()), End = po_end(&MF.front()); It != End; + ++It) { MachineBasicBlock *MBB = *It; llvm::for_each(MBB->instrs(), [&](MachineInstr &MI) { if (isPrologueCFIInstruction(MI)) { `` https://github.com/llvm/llvm-project/pull/66524 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [AArch64] Stack probing for function prologues (PR #66524)
momchil-velikov wrote: Ping? https://github.com/llvm/llvm-project/pull/66524 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [AArch64] Stack probing for function prologues (PR #66524)
oskarwirga wrote: > Upon function entry the caller guarantees that it has probed the stack (e.g. > performed a store) at some address [sp, #N], where 0 <= N <= 1024. I haven't been able to produce a minimal, sharable example as of yet, but I'm encountering a runtime error associated with an inlined function where stack probing is active. The error manifests as a null pointer dereference, originating from a stack value that is probed (and set to 0) before being subsequently dereferenced. The IR contributing to this runtime issue is somewhat complex and challenging to interpret, but here are my observations: - A value returned from `malloc(some_struct)` is stored in a stack variable. - This stack variable is passed as an argument to a function. - This function is later inlined, and within the inlined body, it attempts to set a value in the struct. - At runtime, when setting the value we get a null pointer dereference. I'm working to isolate this issue and will share a repro ASAP. In the meantime, any insights or suggestions based on this description would be greatly appreciated. Also, is it required to write to the value? Would reading the value be sufficient? https://github.com/llvm/llvm-project/pull/66524 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [AArch64] Stack probing for function prologues (PR #66524)
momchil-velikov wrote: > I haven't been able to produce a minimal, sharable example as of yet, but I'm > encountering a runtime error associated with an inlined function where stack > probing is active. The error manifests as a null pointer dereference, > originating from a stack value that is probed (and set to 0) before being > subsequently dereferenced. All the stack probing should have already finished before the call to `malloc`. > I'm working to isolate this issue and will share a repro ASAP. In the > meantime, any insights or suggestions based on this description would be > greatly appreciated. Just to make things simpler, can you try disabling the shrink-wrapping and see what happens? > Also is it required to write to the value? Would reading the value be > sufficient? I can't really see a compelling reason to prefer one over another. Maybe for the odd chance some kernel/runtime allocates read-only (as opposed to no access at all) guard regions. Anyway, changing that won't solve the problem here, at most it could hide it. https://github.com/llvm/llvm-project/pull/66524 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[llvm] [clang] [AArch64] Stack probing for function prologues (PR #66524)
@@ -26262,3 +26262,37 @@ bool AArch64TargetLowering::preferScalarizeSplat(SDNode *N) const { } return true; } + +bool AArch64TargetLowering::hasInlineStackProbe( +const MachineFunction &MF) const { + // If the function specifically requests inline stack probes, emit them. + if (MF.getFunction().hasFnAttribute("probe-stack")) { +if (MF.getFunction().getFnAttribute("probe-stack").getValueAsString() == +"inline-asm") + return true; +else + llvm_unreachable("Unsupported stack probing method"); + } + + return false; +} + +unsigned +AArch64TargetLowering::getStackProbeSize(const MachineFunction &MF) const { + const TargetFrameLowering *TFI = Subtarget->getFrameLowering(); + unsigned StackAlign = TFI->getStackAlignment(); + assert(StackAlign >= 1 && isPowerOf2_32(StackAlign) && + "Unexpected stack alignment"); + // The default stack probe size is 4096 if the function has no + // stack-probe-size attribute. This is a safe default because it is the + // smallest possible guard page size. + unsigned StackProbeSize = 4096; + const Function &Fn = MF.getFunction(); + if (Fn.hasFnAttribute("stack-probe-size")) momchil-velikov wrote: Some validation of the value would be useful. https://github.com/llvm/llvm-project/pull/66524 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[llvm] [clang] [AArch64] Stack probing for function prologues (PR #66524)
@@ -1827,12 +1908,36 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF, // FIXME: in the case of dynamic re-alignment, NumBytes doesn't have // the correct value here, as NumBytes also includes padding bytes, // which shouldn't be counted here. - emitFrameOffset( - MBB, MBBI, DL, scratchSPReg, AArch64::SP, - StackOffset::getFixed(-NumBytes), TII, MachineInstr::FrameSetup, - false, NeedsWinCFI, &HasWinCFI, EmitAsyncCFI && !HasFP, + StackOffset CFAOffset = SVEStackSize + - StackOffset::getFixed((int64_t)MFI.getStackSize() - NumBytes)); + StackOffset::getFixed((int64_t)MFI.getStackSize() - NumBytes); + if (NeedsStackProbe && !NeedsRealignment) { +// If we don't need to re-align the stack, we can use a more efficient +// sequence for stack probing. +Register ScratchReg = findScratchNonCalleeSaveRegister(&MBB); momchil-velikov wrote: Why do we need scratch reg here? https://github.com/llvm/llvm-project/pull/66524 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[llvm] [clang] [AArch64] Stack probing for function prologues (PR #66524)
@@ -4052,3 +4193,192 @@ void AArch64FrameLowering::orderFrameObjects( dbgs() << "\n"; }); } + +/// Emit a loop to decrement SP until it is equal to TargetReg, with probes at +/// least every ProbeSize bytes. Returns an iterator of the first instruction +/// after the loop. The difference between SP and TargetReg must be an exact +/// multiple of ProbeSize. +MachineBasicBlock::iterator +AArch64FrameLowering::inlineStackProbeLoopExactMultiple( +MachineBasicBlock::iterator MBBI, int64_t ProbeSize, +Register TargetReg) const { + MachineBasicBlock &MBB = *MBBI->getParent(); + MachineFunction &MF = *MBB.getParent(); + const AArch64InstrInfo *TII = + MF.getSubtarget().getInstrInfo(); + DebugLoc DL = MBB.findDebugLoc(MBBI); + + MachineFunction::iterator MBBInsertPoint = std::next(MBB.getIterator()); + MachineBasicBlock *LoopMBB = MF.CreateMachineBasicBlock(MBB.getBasicBlock()); + MF.insert(MBBInsertPoint, LoopMBB); + MachineBasicBlock *ExitMBB = MF.CreateMachineBasicBlock(MBB.getBasicBlock()); + MF.insert(MBBInsertPoint, ExitMBB); + + // SUB SP, SP, #ProbeSize (or equivalent if ProbeSize is not encodable + // in SUB). + emitFrameOffset(*LoopMBB, LoopMBB->end(), DL, AArch64::SP, AArch64::SP, + StackOffset::getFixed(-ProbeSize), TII, + MachineInstr::FrameSetup); + // STR XZR, [SP] + BuildMI(*LoopMBB, LoopMBB->end(), DL, TII->get(AArch64::STRXui)) + .addReg(AArch64::XZR) + .addReg(AArch64::SP) + .addImm(0) + .setMIFlags(MachineInstr::FrameSetup); + // CMP SP, TargetReg + BuildMI(*LoopMBB, LoopMBB->end(), DL, TII->get(AArch64::SUBSXrx64), + AArch64::XZR) + .addReg(AArch64::SP) + .addReg(TargetReg) + .addImm(AArch64_AM::getArithExtendImm(AArch64_AM::UXTX, 0)) + .setMIFlags(MachineInstr::FrameSetup); + // B.CC Loop + BuildMI(*LoopMBB, LoopMBB->end(), DL, TII->get(AArch64::Bcc)) + .addImm(AArch64CC::NE) + .addMBB(LoopMBB) + .setMIFlags(MachineInstr::FrameSetup); + + LoopMBB->addSuccessor(ExitMBB); + LoopMBB->addSuccessor(LoopMBB); + // Synthesize the exit MBB. 
+ ExitMBB->splice(ExitMBB->end(), &MBB, MBBI, MBB.end()); + ExitMBB->transferSuccessorsAndUpdatePHIs(&MBB); + MBB.addSuccessor(LoopMBB); + // Update liveins. + recomputeLiveIns(*LoopMBB); + recomputeLiveIns(*ExitMBB); + + return ExitMBB->begin(); +} + +MachineBasicBlock::iterator AArch64FrameLowering::inlineStackProbeFixed( +MachineBasicBlock::iterator MBBI, Register ScratchReg, int64_t FrameSize, +StackOffset CFAOffset) const { + MachineBasicBlock *MBB = MBBI->getParent(); + MachineFunction &MF = *MBB->getParent(); + const AArch64TargetLowering *TLI = + MF.getSubtarget().getTargetLowering(); + const AArch64InstrInfo *TII = + MF.getSubtarget().getInstrInfo(); + AArch64FunctionInfo *AFI = MF.getInfo(); + bool EmitAsyncCFI = AFI->needsAsyncDwarfUnwindInfo(MF); + bool HasFP = hasFP(MF); + + DebugLoc DL; + int64_t ProbeSize = TLI->getStackProbeSize(MF); + int64_t NumBlocks = FrameSize / ProbeSize; + int64_t ResidualSize = FrameSize % ProbeSize; + + LLVM_DEBUG(dbgs() << "Stack probing: total " << FrameSize << " bytes, " +<< NumBlocks << " blocks of " << ProbeSize +<< " bytes, plus " << ResidualSize << " bytes\n"); + + // Decrement SP by NumBlock * ProbeSize bytes, with either unrolled or + // ordinary loop. + if (NumBlocks <= AArch64::StackProbeMaxLoopUnroll) { +for (int i = 0; i < NumBlocks; ++i) { + // SUB SP, SP, #FrameSize (or equivalent if FrameSize is not momchil-velikov wrote: Wrong comment (code is OK): ProbeSize, not FrameSize. https://github.com/llvm/llvm-project/pull/66524 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[llvm] [clang] [AArch64] Stack probing for function prologues (PR #66524)
@@ -672,6 +673,74 @@ void AArch64FrameLowering::emitCalleeSavedSVERestores( emitCalleeSavedRestores(MBB, MBBI, true); } +void AArch64FrameLowering::allocateSVEStackSpace( +MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, +StackOffset AllocSize, StackOffset InitialOffset, bool EmitCFI) const { + DebugLoc DL; + MachineFunction &MF = *MBB.getParent(); + const AArch64Subtarget &Subtarget = MF.getSubtarget(); + const AArch64RegisterInfo &RegInfo = *Subtarget.getRegisterInfo(); + const AArch64TargetLowering &TLI = *Subtarget.getTargetLowering(); + const TargetInstrInfo &TII = *Subtarget.getInstrInfo(); + + // If not probing the stack or the (uknown) allocation size is less than the + // probe size decrement the stack pointer right away. This avoids having to + // emit a probing loop when allocating space for up to 16 SVE registers when + // using 4k probes. + + // The bit-length of SVE registers is architecturally limited. + const int64_t MAX_BYTES_PER_SCALABLE_BYTE = 16; + int64_t ProbeSize = TLI.getStackProbeSize(MF); + if (!TLI.hasInlineStackProbe(MF) || + AllocSize.getScalable() * MAX_BYTES_PER_SCALABLE_BYTE + + AllocSize.getFixed() <= + ProbeSize) { +emitFrameOffset(MBB, MBBI, DL, AArch64::SP, AArch64::SP, -AllocSize, &TII, +MachineInstr::FrameSetup, false, false, nullptr, EmitCFI, +InitialOffset); +if (TLI.hasInlineStackProbe(MF)) { + // Issue a probe at the top of the stack to prepare for subsequent + // allocations. + // STR XZR, [TargetReg] + BuildMI(MBB, MBBI, DL, TII.get(AArch64::STRXui)) + .addReg(AArch64::XZR) + .addReg(AArch64::SP) + .addImm(0) + .setMIFlags(MachineInstr::FrameSetup); +} +return; + } + + // If we can't be sure the allocation size if less than the probe size, we + // have to emit a stack probing loop. + Register ScratchReg = findScratchNonCalleeSaveRegister(&MBB); + assert(ScratchReg != AArch64::NoRegister); + // Get the new top of the stack into a scratch register. 
+ emitFrameOffset(MBB, MBBI, DL, ScratchReg, AArch64::SP, -AllocSize, &TII, + MachineInstr::FrameSetup, false, false, nullptr, EmitCFI, + InitialOffset); + // Arrange to emit a probing loop by decrementing SP until it reaches that + // new top of the stack. + BuildMI(MBB, MBBI, DL, TII.get(AArch64::PROBED_STACKALLOC_VAR), AArch64::SP) + .addReg(ScratchReg); + // Set SP to its new value. + // MOV SP, Xs + BuildMI(MBB, MBBI, DL, TII.get(AArch64::ADDXri), AArch64::SP) + .addReg(ScratchReg) + .addImm(0) + .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0)) + .setMIFlags(MachineInstr::FrameSetup); + if (EmitCFI) { momchil-velikov wrote: Taken care of in invocations of `allocateSVEStackSpace` https://github.com/llvm/llvm-project/pull/66524 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[llvm] [clang] [AArch64] Stack probing for function prologues (PR #66524)
@@ -9460,6 +9461,94 @@ bool AArch64InstrInfo::isReallyTriviallyReMaterializable( return TargetInstrInfo::isReallyTriviallyReMaterializable(MI); } +MachineBasicBlock::iterator +AArch64InstrInfo::insertStackProbingLoop(MachineBasicBlock::iterator MBBI, + Register ScratchReg, + Register TargetReg) const { + MachineBasicBlock &MBB = *MBBI->getParent(); + MachineFunction &MF = *MBB.getParent(); + const AArch64InstrInfo *TII = + MF.getSubtarget().getInstrInfo(); + int64_t ProbeSize = MF.getInfo()->getStackProbeSize(); + DebugLoc DL = MBB.findDebugLoc(MBBI); + + MachineFunction::iterator MBBInsertPoint = std::next(MBB.getIterator()); + MachineBasicBlock *LoopTestMBB = + MF.CreateMachineBasicBlock(MBB.getBasicBlock()); + MF.insert(MBBInsertPoint, LoopTestMBB); + MachineBasicBlock *LoopBodyMBB = + MF.CreateMachineBasicBlock(MBB.getBasicBlock()); + MF.insert(MBBInsertPoint, LoopBodyMBB); + MachineBasicBlock *ExitMBB = MF.CreateMachineBasicBlock(MBB.getBasicBlock()); + MF.insert(MBBInsertPoint, ExitMBB); + + // LoopTest: + // SUB ScratchReg, ScratchReg, #ProbeSize + emitFrameOffset(*LoopTestMBB, LoopTestMBB->end(), DL, ScratchReg, ScratchReg, + StackOffset::getFixed(-ProbeSize), TII, + MachineInstr::FrameSetup); + + // CMP ScratchReg, TargetReg + AArch64CC::CondCode Cond = AArch64CC::LE; + Register Op1 = ScratchReg; + Register Op2 = TargetReg; + if (Op2 == AArch64::SP) { +assert(Op1 != AArch64::SP && "At most one of the registers can be SP"); +// CMP TargetReg, ScratchReg +std::swap(Op1, Op2); +Cond = AArch64CC::GT; + } + BuildMI(*LoopTestMBB, LoopTestMBB->end(), DL, TII->get(AArch64::SUBSXrx64), + AArch64::XZR) + .addReg(Op1) + .addReg(Op2) + .addImm(AArch64_AM::getArithExtendImm(AArch64_AM::UXTX, 0)) + .setMIFlags(MachineInstr::FrameSetup); + + // B. 
LoopExit + BuildMI(*LoopTestMBB, LoopTestMBB->end(), DL, TII->get(AArch64::Bcc)) + .addImm(Cond) + .addMBB(ExitMBB) + .setMIFlags(MachineInstr::FrameSetup); + + // STR XZR, [ScratchReg] + BuildMI(*LoopBodyMBB, LoopBodyMBB->end(), DL, TII->get(AArch64::STRXui)) + .addReg(AArch64::XZR) + .addReg(ScratchReg) + .addImm(0) + .setMIFlags(MachineInstr::FrameSetup); + + // B loop + BuildMI(*LoopBodyMBB, LoopBodyMBB->end(), DL, TII->get(AArch64::B)) + .addMBB(LoopTestMBB) + .setMIFlags(MachineInstr::FrameSetup); + + // LoopExit: + // STR XZR, [TargetReg] + BuildMI(*ExitMBB, ExitMBB->begin(), DL, TII->get(AArch64::STRXui)) + .addReg(AArch64::XZR) + .addReg(TargetReg) + .addImm(0) + .setMIFlags(MachineInstr::FrameSetup); momchil-velikov wrote: > ``` > sub sp, sp, #0x1, lsl #0xc > cmp sp, x1 > b.le 0x557388 > str xzr, [x1] {0x0} > ``` > > We are probing the _old_ stack head! `x1` contains `0x7fee80` but `sp` is > at `7fde80`! This means that the selection of the `x1` register instead > of `sp` is incorrect. I can't quite see how it is possible to generate this code. This is part of the sequence for allocating a compile time unknown amount of stack space that is done by `AArch64InstrInfo::insertStackProbingLoop`. In this function `TargetReg` is the new top of the stack and right now [1] `ScratchReg` is always `AArch64::SP`. Thus first we have ``` // LoopTest: // SUB ScratchReg, ScratchReg, #ProbeSize emitFrameOffset(*LoopTestMBB, LoopTestMBB->end(), DL, ScratchReg, ScratchReg, StackOffset::getFixed(-ProbeSize), TII, MachineInstr::FrameSetup); ``` This is the code that emits the `sub sp, sp, #0x1, lsl #0xc`. Note, it uses `ScratchReg`. Then we emit the compare ``` // CMP ScratchReg, TargetReg AArch64CC::CondCode Cond = AArch64CC::LE; Register Op1 = ScratchReg; Register Op2 = TargetReg; if (Op2 == AArch64::SP) { // condition is false here // ... 
} BuildMI(*LoopTestMBB, LoopTestMBB->end(), DL, TII->get(AArch64::SUBSXrx64), AArch64::XZR) .addReg(Op1) .addReg(Op2) .addImm(AArch64_AM::getArithExtendImm(AArch64_AM::UXTX, 0)) .setMIFlags(MachineInstr::FrameSetup); ``` That is the `cmp sp, x1`. So, `Op2` is `TargetReg` and `TargetReg` is `x1`. Then we emit the loop exit branch: ``` // B. LoopExit BuildMI(*LoopTestMBB, LoopTestMBB->end(), DL, TII->get(AArch64::Bcc)) .addImm(Cond) .addMBB(ExitMBB) .setMIFlags(MachineInstr::FrameSetup); ``` This is the `b.le 0x557388` above. And then, still inside the probing loop, we emit a stack probe to `ScratchReg`, i.e. to `SP`. ``` // STR XZR, [ScratchReg] BuildMI(*LoopBodyMBB, LoopBodyMBB->end(), DL, TII->get(AArch64::STRXui)) .addReg(AArch64::XZR) .addReg(ScratchReg) .addImm(0) .setMIFlags(MachineInstr::FrameSetup); ``` However, i
[llvm] [clang] [AArch64] Stack probing for function prologues (PR #66524)
@@ -26262,3 +26262,37 @@ bool AArch64TargetLowering::preferScalarizeSplat(SDNode *N) const { } return true; } + +bool AArch64TargetLowering::hasInlineStackProbe( +const MachineFunction &MF) const { + // If the function specifically requests inline stack probes, emit them. + if (MF.getFunction().hasFnAttribute("probe-stack")) { +if (MF.getFunction().getFnAttribute("probe-stack").getValueAsString() == +"inline-asm") + return true; +else + llvm_unreachable("Unsupported stack probing method"); + } + + return false; +} + +unsigned +AArch64TargetLowering::getStackProbeSize(const MachineFunction &MF) const { + const TargetFrameLowering *TFI = Subtarget->getFrameLowering(); + unsigned StackAlign = TFI->getStackAlignment(); + assert(StackAlign >= 1 && isPowerOf2_32(StackAlign) && + "Unexpected stack alignment"); + // The default stack probe size is 4096 if the function has no + // stack-probe-size attribute. This is a safe default because it is the + // smallest possible guard page size. + unsigned StackProbeSize = 4096; + const Function &Fn = MF.getFunction(); + if (Fn.hasFnAttribute("stack-probe-size")) momchil-velikov wrote: The rounding to the stack alignment size is enough. Rounding down is the safer choice, zero is handled, and there's no requirement other than to be a multiple of the stack alignment. Some choices might not be appropriate for certain platforms, e.g. 5k or 8k probe size with 4k guard page size, but that's not something that can be validated here. Values greater than 64k might also be legit, e.g. a specific platform allocates 2 guard pages (128k) at the top of the stack to limit probing overhead. https://github.com/llvm/llvm-project/pull/66524 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[llvm] [clang] [AArch64] Stack probing for function prologues (PR #66524)
@@ -9460,6 +9461,94 @@ bool AArch64InstrInfo::isReallyTriviallyReMaterializable( return TargetInstrInfo::isReallyTriviallyReMaterializable(MI); } +MachineBasicBlock::iterator +AArch64InstrInfo::insertStackProbingLoop(MachineBasicBlock::iterator MBBI, + Register ScratchReg, + Register TargetReg) const { + MachineBasicBlock &MBB = *MBBI->getParent(); + MachineFunction &MF = *MBB.getParent(); + const AArch64InstrInfo *TII = + MF.getSubtarget().getInstrInfo(); + int64_t ProbeSize = MF.getInfo()->getStackProbeSize(); + DebugLoc DL = MBB.findDebugLoc(MBBI); + + MachineFunction::iterator MBBInsertPoint = std::next(MBB.getIterator()); + MachineBasicBlock *LoopTestMBB = + MF.CreateMachineBasicBlock(MBB.getBasicBlock()); + MF.insert(MBBInsertPoint, LoopTestMBB); + MachineBasicBlock *LoopBodyMBB = + MF.CreateMachineBasicBlock(MBB.getBasicBlock()); + MF.insert(MBBInsertPoint, LoopBodyMBB); + MachineBasicBlock *ExitMBB = MF.CreateMachineBasicBlock(MBB.getBasicBlock()); + MF.insert(MBBInsertPoint, ExitMBB); + + // LoopTest: + // SUB ScratchReg, ScratchReg, #ProbeSize + emitFrameOffset(*LoopTestMBB, LoopTestMBB->end(), DL, ScratchReg, ScratchReg, + StackOffset::getFixed(-ProbeSize), TII, + MachineInstr::FrameSetup); + + // CMP ScratchReg, TargetReg + AArch64CC::CondCode Cond = AArch64CC::LE; + Register Op1 = ScratchReg; + Register Op2 = TargetReg; + if (Op2 == AArch64::SP) { +assert(Op1 != AArch64::SP && "At most one of the registers can be SP"); +// CMP TargetReg, ScratchReg +std::swap(Op1, Op2); +Cond = AArch64CC::GT; + } + BuildMI(*LoopTestMBB, LoopTestMBB->end(), DL, TII->get(AArch64::SUBSXrx64), + AArch64::XZR) + .addReg(Op1) + .addReg(Op2) + .addImm(AArch64_AM::getArithExtendImm(AArch64_AM::UXTX, 0)) + .setMIFlags(MachineInstr::FrameSetup); + + // B. 
LoopExit + BuildMI(*LoopTestMBB, LoopTestMBB->end(), DL, TII->get(AArch64::Bcc)) + .addImm(Cond) + .addMBB(ExitMBB) + .setMIFlags(MachineInstr::FrameSetup); + + // STR XZR, [ScratchReg] + BuildMI(*LoopBodyMBB, LoopBodyMBB->end(), DL, TII->get(AArch64::STRXui)) + .addReg(AArch64::XZR) + .addReg(ScratchReg) + .addImm(0) + .setMIFlags(MachineInstr::FrameSetup); + + // B loop + BuildMI(*LoopBodyMBB, LoopBodyMBB->end(), DL, TII->get(AArch64::B)) + .addMBB(LoopTestMBB) + .setMIFlags(MachineInstr::FrameSetup); + + // LoopExit: + // STR XZR, [TargetReg] + BuildMI(*ExitMBB, ExitMBB->begin(), DL, TII->get(AArch64::STRXui)) + .addReg(AArch64::XZR) + .addReg(TargetReg) + .addImm(0) + .setMIFlags(MachineInstr::FrameSetup); momchil-velikov wrote: I have now fixed this issue. https://github.com/llvm/llvm-project/pull/66524 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[llvm] [clang] [AArch64] Stack probing for function prologues (PR #66524)
@@ -1076,6 +1076,16 @@ void CodeGenModule::Release() { "sign-return-address-with-bkey", 1); } + if (Arch == llvm::Triple::aarch64 || Arch == llvm::Triple::aarch64_be) { +auto *InlineAsm = llvm::MDString::get(TheModule.getContext(), "inline-asm"); +if (CodeGenOpts.StackClashProtector) + getModule().addModuleFlag(llvm::Module::Override, "probe-stack", +InlineAsm); momchil-velikov wrote: We would like to use a module flag so the stack clash protection is effective for functions created by LLVM (e.g., `asan.module_ctor`). It is not AArch64 specific in principle, but other backends which implement SCP still rely on function attributes. When/if other backends adopt this approach the condition can be removed. @serge-sans-paille https://github.com/llvm/llvm-project/pull/66524 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[llvm] [clang] [AArch64] Stack probing for function prologues (PR #66524)
@@ -688,6 +689,68 @@ void AArch64FrameLowering::emitCalleeSavedSVERestores( emitCalleeSavedRestores(MBB, MBBI, true); } +void AArch64FrameLowering::allocateSVEStackSpace( +MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, +StackOffset AllocSize, StackOffset InitialOffset, bool EmitCFI) const { + DebugLoc DL; + MachineFunction &MF = *MBB.getParent(); + const AArch64FunctionInfo &MFI = *MF.getInfo(); + const AArch64Subtarget &Subtarget = MF.getSubtarget(); + const AArch64RegisterInfo &RegInfo = *Subtarget.getRegisterInfo(); + const AArch64TargetLowering &TLI = *Subtarget.getTargetLowering(); + const TargetInstrInfo &TII = *Subtarget.getInstrInfo(); + + // If not probing the stack or the (unknown) allocation size is less than the + // probe size decrement the stack pointer right away. This avoids having to + // emit a probing loop when allocating space for up to 16 SVE registers when + // using 4k probes. + + // The bit-length of SVE registers is architecturally limited. + const int64_t MAX_BYTES_PER_SCALABLE_BYTE = 16; + int64_t ProbeSize = MFI.getStackProbeSize(); + if (!TLI.hasInlineStackProbe(MF) || + AllocSize.getScalable() * MAX_BYTES_PER_SCALABLE_BYTE + + AllocSize.getFixed() <= + ProbeSize) { +emitFrameOffset(MBB, MBBI, DL, AArch64::SP, AArch64::SP, -AllocSize, &TII, +MachineInstr::FrameSetup, false, false, nullptr, EmitCFI, +InitialOffset); +if (TLI.hasInlineStackProbe(MF)) { + // Issue a probe at the top of the stack to prepare for subsequent + // allocations. + // STR XZR, [TargetReg] oskarwirga wrote: ```suggestion // STR XZR, [SP] ``` https://github.com/llvm/llvm-project/pull/66524 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits