[clang] [AArch64] Stack probing for function prologues (PR #66524)

2023-10-25 Thread Eli Friedman via cfe-commits


@@ -97,14 +97,45 @@ AArch64FunctionInfo::AArch64FunctionInfo(const Function &F,
 if (const auto *BTE = mdconst::extract_or_null(
 F.getParent()->getModuleFlag("branch-target-enforcement")))
   BranchTargetEnforcement = BTE->getZExtValue();
-return;
+  } else {
+const StringRef BTIEnable =
+F.getFnAttribute("branch-target-enforcement").getValueAsString();
+assert(BTIEnable.equals_insensitive("true") ||
+   BTIEnable.equals_insensitive("false"));

efriedma-quic wrote:

Is this something the verifier checks?  If it isn't, we can't assert it.

https://github.com/llvm/llvm-project/pull/66524
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [AArch64] Stack probing for function prologues (PR #66524)

2023-10-25 Thread Eli Friedman via cfe-commits


@@ -1757,46 +1826,55 @@ void AArch64FrameLowering::emitPrologue(MachineFunction 
&MF,
 }
   }
 
-  StackOffset AllocateBefore = SVEStackSize, AllocateAfter = {};
+  StackOffset SVECalleeSavedSize = {}, SVELocalsSize = SVEStackSize;
   MachineBasicBlock::iterator CalleeSavesBegin = MBBI, CalleeSavesEnd = MBBI;
 
   // Process the SVE callee-saves to determine what space needs to be
   // allocated.
   if (int64_t CalleeSavedSize = AFI->getSVECalleeSavedStackSize()) {
+LLVM_DEBUG(dbgs() << "SVECalleeSavedStackSize = " << CalleeSavedSize
+  << "\n");
 // Find callee save instructions in frame.
 CalleeSavesBegin = MBBI;
 assert(IsSVECalleeSave(CalleeSavesBegin) && "Unexpected instruction");
 while (IsSVECalleeSave(MBBI) && MBBI != MBB.getFirstTerminator())
   ++MBBI;
 CalleeSavesEnd = MBBI;
 
-AllocateBefore = StackOffset::getScalable(CalleeSavedSize);
-AllocateAfter = SVEStackSize - AllocateBefore;
+SVECalleeSavedSize = StackOffset::getScalable(CalleeSavedSize);
+SVELocalsSize = SVEStackSize - SVECalleeSavedSize;
+
+// Allocate space for the SVE callee saves.
+if (SVECalleeSavedSize) {
+  allocateSVEStackSpace(
+  MBB, CalleeSavesBegin, SVECalleeSavedSize,
+  StackOffset::getFixed((int64_t)MFI.getStackSize() - NumBytes),
+  EmitAsyncCFI && !HasFP);
+  if (EmitAsyncCFI)
+emitCalleeSavedSVELocations(MBB, CalleeSavesEnd);
+}
   }
 
-  // Allocate space for the callee saves (if any).
-  emitFrameOffset(
-  MBB, CalleeSavesBegin, DL, AArch64::SP, AArch64::SP, -AllocateBefore, 
TII,
-  MachineInstr::FrameSetup, false, false, nullptr,
-  EmitAsyncCFI && !HasFP && AllocateBefore,
-  StackOffset::getFixed((int64_t)MFI.getStackSize() - NumBytes));
-
-  if (EmitAsyncCFI)
-emitCalleeSavedSVELocations(MBB, CalleeSavesEnd);
-
-  // Finally allocate remaining SVE stack space.
-  emitFrameOffset(MBB, CalleeSavesEnd, DL, AArch64::SP, AArch64::SP,
-  -AllocateAfter, TII, MachineInstr::FrameSetup, false, false,
-  nullptr, EmitAsyncCFI && !HasFP && AllocateAfter,
-  AllocateBefore + StackOffset::getFixed(
-   (int64_t)MFI.getStackSize() - 
NumBytes));
+  // Allocate stack space for the local SVE objects.
+  if (SVELocalsSize)
+allocateSVEStackSpace(
+MBB, CalleeSavesEnd, SVELocalsSize,
+SVECalleeSavedSize +
+StackOffset::getFixed((int64_t)MFI.getStackSize() - NumBytes),
+EmitAsyncCFI && !HasFP);
 
   // Allocate space for the rest of the frame.
   if (NumBytes) {
 unsigned scratchSPReg = AArch64::SP;
+bool NeedsStackProbe = TLI.hasInlineStackProbe(MF) &&
+   (NumBytes > AArch64::StackProbeMaxUnprobedStack ||
+MFI.hasVarSizedObjects());
 
 if (NeedsRealignment) {
   scratchSPReg = findScratchNonCalleeSaveRegister(&MBB);
+  NeedsStackProbe |= TLI.hasInlineStackProbe(MF) &&
+ (NumBytes + MFI.getMaxAlign().value()) >

efriedma-quic wrote:

Can we use the RealignmentPadding we computed earlier?

https://github.com/llvm/llvm-project/pull/66524
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [AArch64] Stack probing for function prologues (PR #66524)

2023-10-25 Thread Eli Friedman via cfe-commits


@@ -1757,46 +1826,55 @@ void AArch64FrameLowering::emitPrologue(MachineFunction 
&MF,
 }
   }
 
-  StackOffset AllocateBefore = SVEStackSize, AllocateAfter = {};
+  StackOffset SVECalleeSavedSize = {}, SVELocalsSize = SVEStackSize;
   MachineBasicBlock::iterator CalleeSavesBegin = MBBI, CalleeSavesEnd = MBBI;
 
   // Process the SVE callee-saves to determine what space needs to be
   // allocated.
   if (int64_t CalleeSavedSize = AFI->getSVECalleeSavedStackSize()) {
+LLVM_DEBUG(dbgs() << "SVECalleeSavedStackSize = " << CalleeSavedSize
+  << "\n");
 // Find callee save instructions in frame.
 CalleeSavesBegin = MBBI;
 assert(IsSVECalleeSave(CalleeSavesBegin) && "Unexpected instruction");
 while (IsSVECalleeSave(MBBI) && MBBI != MBB.getFirstTerminator())
   ++MBBI;
 CalleeSavesEnd = MBBI;
 
-AllocateBefore = StackOffset::getScalable(CalleeSavedSize);
-AllocateAfter = SVEStackSize - AllocateBefore;
+SVECalleeSavedSize = StackOffset::getScalable(CalleeSavedSize);
+SVELocalsSize = SVEStackSize - SVECalleeSavedSize;
+
+// Allocate space for the SVE callee saves.
+if (SVECalleeSavedSize) {
+  allocateSVEStackSpace(
+  MBB, CalleeSavesBegin, SVECalleeSavedSize,
+  StackOffset::getFixed((int64_t)MFI.getStackSize() - NumBytes),
+  EmitAsyncCFI && !HasFP);
+  if (EmitAsyncCFI)
+emitCalleeSavedSVELocations(MBB, CalleeSavesEnd);
+}
   }
 
-  // Allocate space for the callee saves (if any).
-  emitFrameOffset(
-  MBB, CalleeSavesBegin, DL, AArch64::SP, AArch64::SP, -AllocateBefore, 
TII,
-  MachineInstr::FrameSetup, false, false, nullptr,
-  EmitAsyncCFI && !HasFP && AllocateBefore,
-  StackOffset::getFixed((int64_t)MFI.getStackSize() - NumBytes));
-
-  if (EmitAsyncCFI)
-emitCalleeSavedSVELocations(MBB, CalleeSavesEnd);
-
-  // Finally allocate remaining SVE stack space.
-  emitFrameOffset(MBB, CalleeSavesEnd, DL, AArch64::SP, AArch64::SP,
-  -AllocateAfter, TII, MachineInstr::FrameSetup, false, false,
-  nullptr, EmitAsyncCFI && !HasFP && AllocateAfter,
-  AllocateBefore + StackOffset::getFixed(
-   (int64_t)MFI.getStackSize() - 
NumBytes));
+  // Allocate stack space for the local SVE objects.
+  if (SVELocalsSize)
+allocateSVEStackSpace(

efriedma-quic wrote:

It looks like the SVE probing doesn't work on Windows (that's not a regression, 
though).

I think we want to try to integrate the SVE probing with non-SVE probing to 
some extent (if we're generating a loop, use one loop for both).

That said, maybe we can leave those for a followup; I don't want to block the 
entire patch series on trying to improve this.

https://github.com/llvm/llvm-project/pull/66524
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [AArch64] Stack probing for function prologues (PR #66524)

2023-10-25 Thread Eli Friedman via cfe-commits


@@ -1076,6 +1076,16 @@ void CodeGenModule::Release() {
 "sign-return-address-with-bkey", 1);
   }
 
+  if (Arch == llvm::Triple::aarch64 || Arch == llvm::Triple::aarch64_be) {
+auto *InlineAsm = llvm::MDString::get(TheModule.getContext(), 
"inline-asm");
+if (CodeGenOpts.StackClashProtector)
+  getModule().addModuleFlag(llvm::Module::Override, "probe-stack",
+InlineAsm);

efriedma-quic wrote:

Why do we want to use a module flag for this, specifically on AArch64?  I'd 
prefer to avoid architecture-specific code here unless there's actually some 
architecture-specific property involved.

https://github.com/llvm/llvm-project/pull/66524
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [AArch64] Stack probing for function prologues (PR #66524)

2023-10-25 Thread Oskar Wirga via cfe-commits

oskarwirga wrote:

Apologies for still not being able to create a reproducible example I can share 
but what I am seeing is the stack probe write overwriting the value at  the tip 
of the stack when I step debug execution:
```
str xzr, [sp, #-0x10 {var_70}]!  {0x0}
...
sturx8, [x29, #-0x10 {var_70}]
...
from the inlined function:
str xzr, [x20]  {0x0}
mov sp, x20
...
ldurx8, [x29, #-0x10 {var_70}] << null deref
```

I also was able to isolate the issue to the non-fast register allocators. When 
building with optimized code, the greedy register allocator and the basic 
register allocator ended up choosing registers that were being clobbered (? 
don't know the term) by the stack probe write. 

> All the stack probing should have already finished before the call to 
> `malloc`.

Only for the containing function, the functions which have their stack probes 
inlined will be in the middle of the function which then results in this 
null-deref. I think there's some re-arranging happening during optimization and 
inlining which causes the registers not to be expired (? don't know the term 
here)

> Just to make things simpler, can you try disabling the shrink-wrapping and 
> see what happens?

I haven't seen noticeable difference with this, I tried passing it in with 
`-Wl,-mllvm,-enable-shrink-wrap=false`  

https://github.com/llvm/llvm-project/pull/66524
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [AArch64] Stack probing for function prologues (PR #66524)

2023-10-26 Thread Momchil Velikov via cfe-commits

momchil-velikov wrote:

> Apologies for still not being able to create a reproducible example I can 
> share but what I am seeing is the stack probe write overwriting the value at 
> the tip of the stack when I step debug execution:

Can you spot a place where the probe instruction is *not* immediately after a 
decrement of the stack (disregarding some random register-to-register 
arithmetic that may appear)?

If you can't (and you should not find such a case), perhaps some instruction 
writes below the `sp`? That would be against the AArch64.

Is redzone enabled (`--aarch64-redzone=true|false`) "? If it is on, can you 
check if it makes a difference if you disable it? 

> > All the stack probing should have already finished before the call to 
> > `malloc`.

> Only for the containing function, the functions which have their stack probes 
> inlined will be in the middle of the function which then results in this 
> null-deref.

The stack probes (except `alloca` ones)  are emitted by the 
PrologEpilogInsertion pass, which is very late in the pipeline.
I didn't think inlining could happen after that pass. Can you share yous 
compilation pipeline (`-- debug-pass=Structure`)?


https://github.com/llvm/llvm-project/pull/66524
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [AArch64] Stack probing for function prologues (PR #66524)

2023-10-26 Thread Oskar Wirga via cfe-commits


@@ -9460,6 +9461,94 @@ bool AArch64InstrInfo::isReallyTriviallyReMaterializable(
   return TargetInstrInfo::isReallyTriviallyReMaterializable(MI);
 }
 
+MachineBasicBlock::iterator
+AArch64InstrInfo::insertStackProbingLoop(MachineBasicBlock::iterator MBBI,
+ Register ScratchReg,
+ Register TargetReg) const {
+  MachineBasicBlock &MBB = *MBBI->getParent();
+  MachineFunction &MF = *MBB.getParent();
+  const AArch64InstrInfo *TII =
+  MF.getSubtarget().getInstrInfo();
+  int64_t ProbeSize = MF.getInfo()->getStackProbeSize();
+  DebugLoc DL = MBB.findDebugLoc(MBBI);
+
+  MachineFunction::iterator MBBInsertPoint = std::next(MBB.getIterator());
+  MachineBasicBlock *LoopTestMBB =
+  MF.CreateMachineBasicBlock(MBB.getBasicBlock());
+  MF.insert(MBBInsertPoint, LoopTestMBB);
+  MachineBasicBlock *LoopBodyMBB =
+  MF.CreateMachineBasicBlock(MBB.getBasicBlock());
+  MF.insert(MBBInsertPoint, LoopBodyMBB);
+  MachineBasicBlock *ExitMBB = MF.CreateMachineBasicBlock(MBB.getBasicBlock());
+  MF.insert(MBBInsertPoint, ExitMBB);
+
+  // LoopTest:
+  //   SUB ScratchReg, ScratchReg, #ProbeSize
+  emitFrameOffset(*LoopTestMBB, LoopTestMBB->end(), DL, ScratchReg, ScratchReg,
+  StackOffset::getFixed(-ProbeSize), TII,
+  MachineInstr::FrameSetup);
+
+  //   CMP ScratchReg, TargetReg
+  AArch64CC::CondCode Cond = AArch64CC::LE;
+  Register Op1 = ScratchReg;
+  Register Op2 = TargetReg;
+  if (Op2 == AArch64::SP) {
+assert(Op1 != AArch64::SP && "At most one of the registers can be SP");
+// CMP TargetReg, ScratchReg
+std::swap(Op1, Op2);
+Cond = AArch64CC::GT;
+  }
+  BuildMI(*LoopTestMBB, LoopTestMBB->end(), DL, TII->get(AArch64::SUBSXrx64),
+  AArch64::XZR)
+  .addReg(Op1)
+  .addReg(Op2)
+  .addImm(AArch64_AM::getArithExtendImm(AArch64_AM::UXTX, 0))
+  .setMIFlags(MachineInstr::FrameSetup);
+
+  //   B. LoopExit
+  BuildMI(*LoopTestMBB, LoopTestMBB->end(), DL, TII->get(AArch64::Bcc))
+  .addImm(Cond)
+  .addMBB(ExitMBB)
+  .setMIFlags(MachineInstr::FrameSetup);
+
+  //   STR XZR, [ScratchReg]
+  BuildMI(*LoopBodyMBB, LoopBodyMBB->end(), DL, TII->get(AArch64::STRXui))
+  .addReg(AArch64::XZR)
+  .addReg(ScratchReg)
+  .addImm(0)
+  .setMIFlags(MachineInstr::FrameSetup);
+
+  //   B loop
+  BuildMI(*LoopBodyMBB, LoopBodyMBB->end(), DL, TII->get(AArch64::B))
+  .addMBB(LoopTestMBB)
+  .setMIFlags(MachineInstr::FrameSetup);
+
+  // LoopExit:
+  //   STR XZR, [TargetReg]
+  BuildMI(*ExitMBB, ExitMBB->begin(), DL, TII->get(AArch64::STRXui))
+  .addReg(AArch64::XZR)
+  .addReg(TargetReg)
+  .addImm(0)
+  .setMIFlags(MachineInstr::FrameSetup);

oskarwirga wrote:

> Can you spot a place where the probe instruction is not immediately after a 
> decrement of the stack (disregarding some random register-to-register 
> arithmetic that may appear)?

This was the thread that led to me understanding what is happening:
```
sub sp, sp, #0x1, lsl #0xc
cmp sp, x1
b.le0x557388
str xzr, [x1]  {0x0}
```

We are probing the _old_ stack head! `x1` contains `0x7fee80` but `sp` is 
at `7fde80`! This means that the selection of the `x1` register instead of 
`sp` is incorrect.

I confirmed this to be the case by fixing this probe here and testing again.

```suggestion
  //   STR XZR, [ScratchReg]
  BuildMI(*ExitMBB, ExitMBB->begin(), DL, TII->get(AArch64::STRXui))
  .addReg(AArch64::XZR)
  .addReg(ScratchReg)
  .addImm(0)
  .setMIFlags(MachineInstr::FrameSetup);
```

https://github.com/llvm/llvm-project/pull/66524
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [AArch64] Stack probing for function prologues (PR #66524)

2023-10-13 Thread via cfe-commits

github-actions[bot] wrote:




:warning: C/C++ code formatter, clang-format found issues in your code. 
:warning:



You can test this locally with the following command:


``bash
git-clang-format --diff 8d59fc5fd1599bd7153817d2af903ae9a6103343 
7b66d75f43bfd4e52f48307e9fcc2992eefb8bdd -- clang/lib/CodeGen/CodeGenModule.cpp 
clang/lib/Driver/ToolChains/Clang.cpp 
clang/test/CodeGen/stack-clash-protection.c llvm/lib/CodeGen/CFIFixup.cpp 
llvm/lib/Target/AArch64/AArch64FrameLowering.cpp 
llvm/lib/Target/AArch64/AArch64FrameLowering.h 
llvm/lib/Target/AArch64/AArch64ISelLowering.cpp 
llvm/lib/Target/AArch64/AArch64ISelLowering.h 
llvm/lib/Target/AArch64/AArch64InstrInfo.cpp 
llvm/lib/Target/AArch64/AArch64InstrInfo.h 
llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.cpp 
llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h
``





View the diff from clang-format here.


``diff
diff --git a/llvm/lib/CodeGen/CFIFixup.cpp b/llvm/lib/CodeGen/CFIFixup.cpp
index 7269616e3fff..964a8d56511f 100644
--- a/llvm/lib/CodeGen/CFIFixup.cpp
+++ b/llvm/lib/CodeGen/CFIFixup.cpp
@@ -100,7 +100,8 @@ static bool containsEpilogue(const MachineBasicBlock &MBB) {
 static MachineBasicBlock *
 findPrologueEnd(MachineFunction &MF, MachineBasicBlock::iterator &PrologueEnd) 
{
   MachineBasicBlock *PrologueBlock = nullptr;
-  for (auto It = po_begin(&MF.front()), End = po_end(&MF.front()); It != End; 
++It) {
+  for (auto It = po_begin(&MF.front()), End = po_end(&MF.front()); It != End;
+   ++It) {
 MachineBasicBlock *MBB = *It;
 llvm::for_each(MBB->instrs(), [&](MachineInstr &MI) {
   if (isPrologueCFIInstruction(MI)) {

``




https://github.com/llvm/llvm-project/pull/66524
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [AArch64] Stack probing for function prologues (PR #66524)

2023-10-20 Thread Momchil Velikov via cfe-commits

momchil-velikov wrote:

Ping?

https://github.com/llvm/llvm-project/pull/66524
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [AArch64] Stack probing for function prologues (PR #66524)

2023-10-24 Thread Oskar Wirga via cfe-commits

oskarwirga wrote:

> Upon function entry the caller guarantees that it has probed the stack (e.g. 
> performed a store) at some address [sp, #N], where0 <= N <= 1024.

I haven't been able to produce a minimal, sharable example as of yet, but I'm 
encountering a runtime error associated with an inlined function where stack 
probing is active. The error manifests as a null pointer dereference, 
originating from a stack value that is probed (and set to 0) before being 
subsequently dereferenced.

The IR contributing to this runtime issue is somewhat complex and challenging 
to interpret, but here's my observations:

- A value returned from `malloc(some_struct)` is stored in a stack variable.
- This stack variable is passed as an argument to a function.
- This function is later inlined, and within the inlined body, it attempts to 
set a value in the struct.
- At runtime, when setting the value we get a null pointer dereference.

I'm working to isolate this issue and will share a repro ASAP. In the meantime, 
any insights or suggestions based on this description would be greatly 
appreciated.

Also is it required to write to the value? Would reading the value be 
sufficient?

https://github.com/llvm/llvm-project/pull/66524
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [AArch64] Stack probing for function prologues (PR #66524)

2023-10-25 Thread Momchil Velikov via cfe-commits

momchil-velikov wrote:


> I haven't been able to produce a minimal, sharable example as of yet, but I'm 
> encountering a runtime error associated with an inlined function where stack 
> probing is active. The error manifests as a null pointer dereference, 
> originating from a stack value that is probed (and set to 0) before being 
> subsequently dereferenced.

All the stack probing should have already finished before the call to `malloc`.

> I'm working to isolate this issue and will share a repro ASAP. In the 
> meantime, any insights or suggestions based on this description would be 
> greatly appreciated.

Just to make things simpler, can you try disabling the shrink-wrapping and see 
what happens?

> Also is it required to write to the value? Would reading the value be 
> sufficient?
I can't really see a compelling reason to prefer one over another. Maybe for 
the odd chance some
kernel/runtime allocates read-only (as opposed to no access at all) guard 
regions.

Anyway, changing that won't solve the problem here, at most it could hide it.


https://github.com/llvm/llvm-project/pull/66524
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[llvm] [clang] [AArch64] Stack probing for function prologues (PR #66524)

2023-11-21 Thread Momchil Velikov via cfe-commits


@@ -26262,3 +26262,37 @@ bool 
AArch64TargetLowering::preferScalarizeSplat(SDNode *N) const {
   }
   return true;
 }
+
+bool AArch64TargetLowering::hasInlineStackProbe(
+const MachineFunction &MF) const {
+  // If the function specifically requests inline stack probes, emit them.
+  if (MF.getFunction().hasFnAttribute("probe-stack")) {
+if (MF.getFunction().getFnAttribute("probe-stack").getValueAsString() ==
+"inline-asm")
+  return true;
+else
+  llvm_unreachable("Unsupported stack probing method");
+  }
+
+  return false;
+}
+
+unsigned
+AArch64TargetLowering::getStackProbeSize(const MachineFunction &MF) const {
+  const TargetFrameLowering *TFI = Subtarget->getFrameLowering();
+  unsigned StackAlign = TFI->getStackAlignment();
+  assert(StackAlign >= 1 && isPowerOf2_32(StackAlign) &&
+ "Unexpected stack alignment");
+  // The default stack probe size is 4096 if the function has no
+  // stack-probe-size attribute. This is a safe default because it is the
+  // smallest possible guard page size.
+  unsigned StackProbeSize = 4096;
+  const Function &Fn = MF.getFunction();
+  if (Fn.hasFnAttribute("stack-probe-size"))

momchil-velikov wrote:

Some validation of the value would be useful.

https://github.com/llvm/llvm-project/pull/66524
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[llvm] [clang] [AArch64] Stack probing for function prologues (PR #66524)

2023-11-21 Thread Momchil Velikov via cfe-commits


@@ -1827,12 +1908,36 @@ void AArch64FrameLowering::emitPrologue(MachineFunction 
&MF,
   // FIXME: in the case of dynamic re-alignment, NumBytes doesn't have
   // the correct value here, as NumBytes also includes padding bytes,
   // which shouldn't be counted here.
-  emitFrameOffset(
-  MBB, MBBI, DL, scratchSPReg, AArch64::SP,
-  StackOffset::getFixed(-NumBytes), TII, MachineInstr::FrameSetup,
-  false, NeedsWinCFI, &HasWinCFI, EmitAsyncCFI && !HasFP,
+  StackOffset CFAOffset =
   SVEStackSize +
-  StackOffset::getFixed((int64_t)MFI.getStackSize() - NumBytes));
+  StackOffset::getFixed((int64_t)MFI.getStackSize() - NumBytes);
+  if (NeedsStackProbe && !NeedsRealignment) {
+// If we don't need to re-align the stack, we can use a more efficient
+// sequence for stack probing.
+Register ScratchReg = findScratchNonCalleeSaveRegister(&MBB);

momchil-velikov wrote:

Why do we need scratch reg here?

https://github.com/llvm/llvm-project/pull/66524
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[llvm] [clang] [AArch64] Stack probing for function prologues (PR #66524)

2023-11-21 Thread Momchil Velikov via cfe-commits


@@ -4052,3 +4193,192 @@ void AArch64FrameLowering::orderFrameObjects(
 dbgs() << "\n";
   });
 }
+
+/// Emit a loop to decrement SP until it is equal to TargetReg, with probes at
+/// least every ProbeSize bytes. Returns an iterator of the first instruction
+/// after the loop. The difference between SP and TargetReg must be an exact
+/// multiple of ProbeSize.
+MachineBasicBlock::iterator
+AArch64FrameLowering::inlineStackProbeLoopExactMultiple(
+MachineBasicBlock::iterator MBBI, int64_t ProbeSize,
+Register TargetReg) const {
+  MachineBasicBlock &MBB = *MBBI->getParent();
+  MachineFunction &MF = *MBB.getParent();
+  const AArch64InstrInfo *TII =
+  MF.getSubtarget().getInstrInfo();
+  DebugLoc DL = MBB.findDebugLoc(MBBI);
+
+  MachineFunction::iterator MBBInsertPoint = std::next(MBB.getIterator());
+  MachineBasicBlock *LoopMBB = MF.CreateMachineBasicBlock(MBB.getBasicBlock());
+  MF.insert(MBBInsertPoint, LoopMBB);
+  MachineBasicBlock *ExitMBB = MF.CreateMachineBasicBlock(MBB.getBasicBlock());
+  MF.insert(MBBInsertPoint, ExitMBB);
+
+  // SUB SP, SP, #ProbeSize (or equivalent if ProbeSize is not encodable
+  // in SUB).
+  emitFrameOffset(*LoopMBB, LoopMBB->end(), DL, AArch64::SP, AArch64::SP,
+  StackOffset::getFixed(-ProbeSize), TII,
+  MachineInstr::FrameSetup);
+  // STR XZR, [SP]
+  BuildMI(*LoopMBB, LoopMBB->end(), DL, TII->get(AArch64::STRXui))
+  .addReg(AArch64::XZR)
+  .addReg(AArch64::SP)
+  .addImm(0)
+  .setMIFlags(MachineInstr::FrameSetup);
+  // CMP SP, TargetReg
+  BuildMI(*LoopMBB, LoopMBB->end(), DL, TII->get(AArch64::SUBSXrx64),
+  AArch64::XZR)
+  .addReg(AArch64::SP)
+  .addReg(TargetReg)
+  .addImm(AArch64_AM::getArithExtendImm(AArch64_AM::UXTX, 0))
+  .setMIFlags(MachineInstr::FrameSetup);
+  // B.CC Loop
+  BuildMI(*LoopMBB, LoopMBB->end(), DL, TII->get(AArch64::Bcc))
+  .addImm(AArch64CC::NE)
+  .addMBB(LoopMBB)
+  .setMIFlags(MachineInstr::FrameSetup);
+
+  LoopMBB->addSuccessor(ExitMBB);
+  LoopMBB->addSuccessor(LoopMBB);
+  // Synthesize the exit MBB.
+  ExitMBB->splice(ExitMBB->end(), &MBB, MBBI, MBB.end());
+  ExitMBB->transferSuccessorsAndUpdatePHIs(&MBB);
+  MBB.addSuccessor(LoopMBB);
+  // Update liveins.
+  recomputeLiveIns(*LoopMBB);
+  recomputeLiveIns(*ExitMBB);
+
+  return ExitMBB->begin();
+}
+
+MachineBasicBlock::iterator AArch64FrameLowering::inlineStackProbeFixed(
+MachineBasicBlock::iterator MBBI, Register ScratchReg, int64_t FrameSize,
+StackOffset CFAOffset) const {
+  MachineBasicBlock *MBB = MBBI->getParent();
+  MachineFunction &MF = *MBB->getParent();
+  const AArch64TargetLowering *TLI =
+  MF.getSubtarget().getTargetLowering();
+  const AArch64InstrInfo *TII =
+  MF.getSubtarget().getInstrInfo();
+  AArch64FunctionInfo *AFI = MF.getInfo();
+  bool EmitAsyncCFI = AFI->needsAsyncDwarfUnwindInfo(MF);
+  bool HasFP = hasFP(MF);
+
+  DebugLoc DL;
+  int64_t ProbeSize = TLI->getStackProbeSize(MF);
+  int64_t NumBlocks = FrameSize / ProbeSize;
+  int64_t ResidualSize = FrameSize % ProbeSize;
+
+  LLVM_DEBUG(dbgs() << "Stack probing: total " << FrameSize << " bytes, "
+<< NumBlocks << " blocks of " << ProbeSize
+<< " bytes, plus " << ResidualSize << " bytes\n");
+
+  // Decrement SP by NumBlock * ProbeSize bytes, with either unrolled or
+  // ordinary loop.
+  if (NumBlocks <= AArch64::StackProbeMaxLoopUnroll) {
+for (int i = 0; i < NumBlocks; ++i) {
+  // SUB SP, SP, #FrameSize (or equivalent if FrameSize is not

momchil-velikov wrote:

Wrong comment (code is OK): ProbeSize, not FrameSize.

https://github.com/llvm/llvm-project/pull/66524
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[llvm] [clang] [AArch64] Stack probing for function prologues (PR #66524)

2023-11-21 Thread Momchil Velikov via cfe-commits


@@ -672,6 +673,74 @@ void AArch64FrameLowering::emitCalleeSavedSVERestores(
   emitCalleeSavedRestores(MBB, MBBI, true);
 }
 
+void AArch64FrameLowering::allocateSVEStackSpace(
+MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
+StackOffset AllocSize, StackOffset InitialOffset, bool EmitCFI) const {
+  DebugLoc DL;
+  MachineFunction &MF = *MBB.getParent();
+  const AArch64Subtarget &Subtarget = MF.getSubtarget();
+  const AArch64RegisterInfo &RegInfo = *Subtarget.getRegisterInfo();
+  const AArch64TargetLowering &TLI = *Subtarget.getTargetLowering();
+  const TargetInstrInfo &TII = *Subtarget.getInstrInfo();
+
+  // If not probing the stack or the (uknown) allocation size is less than the
+  // probe size decrement the stack pointer right away. This avoids having to
+  // emit a probing loop when allocating space for up to 16 SVE registers when
+  // using 4k probes.
+
+  // The bit-length of SVE registers is architecturally limited.
+  const int64_t MAX_BYTES_PER_SCALABLE_BYTE = 16;
+  int64_t ProbeSize = TLI.getStackProbeSize(MF);
+  if (!TLI.hasInlineStackProbe(MF) ||
+  AllocSize.getScalable() * MAX_BYTES_PER_SCALABLE_BYTE +
+  AllocSize.getFixed() <=
+  ProbeSize) {
+emitFrameOffset(MBB, MBBI, DL, AArch64::SP, AArch64::SP, -AllocSize, &TII,
+MachineInstr::FrameSetup, false, false, nullptr, EmitCFI,
+InitialOffset);
+if (TLI.hasInlineStackProbe(MF)) {
+  // Issue a probe at the top of the stack to prepare for subsequent
+  // allocations.
+  // STR XZR, [TargetReg]
+  BuildMI(MBB, MBBI, DL, TII.get(AArch64::STRXui))
+  .addReg(AArch64::XZR)
+  .addReg(AArch64::SP)
+  .addImm(0)
+  .setMIFlags(MachineInstr::FrameSetup);
+}
+return;
+  }
+
+  // If we can't be sure the allocation size if less than the probe size, we
+  // have to emit a stack probing loop.
+  Register ScratchReg = findScratchNonCalleeSaveRegister(&MBB);
+  assert(ScratchReg != AArch64::NoRegister);
+  // Get the new top of the stack into a scratch register.
+  emitFrameOffset(MBB, MBBI, DL, ScratchReg, AArch64::SP, -AllocSize, &TII,
+  MachineInstr::FrameSetup, false, false, nullptr, EmitCFI,
+  InitialOffset);
+  // Arrange to emit a probing loop by decrementing SP until it reaches that
+  // new top of the stack.
+  BuildMI(MBB, MBBI, DL, TII.get(AArch64::PROBED_STACKALLOC_VAR), AArch64::SP)
+  .addReg(ScratchReg);
+  // Set SP to its new value.
+  // MOV SP, Xs
+  BuildMI(MBB, MBBI, DL, TII.get(AArch64::ADDXri), AArch64::SP)
+  .addReg(ScratchReg)
+  .addImm(0)
+  .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0))
+  .setMIFlags(MachineInstr::FrameSetup);
+  if (EmitCFI) {

momchil-velikov wrote:

Taken care of in invocations of `allocateSVEStackSpace`

https://github.com/llvm/llvm-project/pull/66524
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[llvm] [clang] [AArch64] Stack probing for function prologues (PR #66524)

2023-11-21 Thread Momchil Velikov via cfe-commits


@@ -9460,6 +9461,94 @@ bool AArch64InstrInfo::isReallyTriviallyReMaterializable(
   return TargetInstrInfo::isReallyTriviallyReMaterializable(MI);
 }
 
+MachineBasicBlock::iterator
+AArch64InstrInfo::insertStackProbingLoop(MachineBasicBlock::iterator MBBI,
+ Register ScratchReg,
+ Register TargetReg) const {
+  MachineBasicBlock &MBB = *MBBI->getParent();
+  MachineFunction &MF = *MBB.getParent();
+  const AArch64InstrInfo *TII =
+  MF.getSubtarget().getInstrInfo();
+  int64_t ProbeSize = MF.getInfo()->getStackProbeSize();
+  DebugLoc DL = MBB.findDebugLoc(MBBI);
+
+  MachineFunction::iterator MBBInsertPoint = std::next(MBB.getIterator());
+  MachineBasicBlock *LoopTestMBB =
+  MF.CreateMachineBasicBlock(MBB.getBasicBlock());
+  MF.insert(MBBInsertPoint, LoopTestMBB);
+  MachineBasicBlock *LoopBodyMBB =
+  MF.CreateMachineBasicBlock(MBB.getBasicBlock());
+  MF.insert(MBBInsertPoint, LoopBodyMBB);
+  MachineBasicBlock *ExitMBB = MF.CreateMachineBasicBlock(MBB.getBasicBlock());
+  MF.insert(MBBInsertPoint, ExitMBB);
+
+  // LoopTest:
+  //   SUB ScratchReg, ScratchReg, #ProbeSize
+  emitFrameOffset(*LoopTestMBB, LoopTestMBB->end(), DL, ScratchReg, ScratchReg,
+  StackOffset::getFixed(-ProbeSize), TII,
+  MachineInstr::FrameSetup);
+
+  //   CMP ScratchReg, TargetReg
+  AArch64CC::CondCode Cond = AArch64CC::LE;
+  Register Op1 = ScratchReg;
+  Register Op2 = TargetReg;
+  if (Op2 == AArch64::SP) {
+assert(Op1 != AArch64::SP && "At most one of the registers can be SP");
+// CMP TargetReg, ScratchReg
+std::swap(Op1, Op2);
+Cond = AArch64CC::GT;
+  }
+  BuildMI(*LoopTestMBB, LoopTestMBB->end(), DL, TII->get(AArch64::SUBSXrx64),
+  AArch64::XZR)
+  .addReg(Op1)
+  .addReg(Op2)
+  .addImm(AArch64_AM::getArithExtendImm(AArch64_AM::UXTX, 0))
+  .setMIFlags(MachineInstr::FrameSetup);
+
+  //   B. LoopExit
+  BuildMI(*LoopTestMBB, LoopTestMBB->end(), DL, TII->get(AArch64::Bcc))
+  .addImm(Cond)
+  .addMBB(ExitMBB)
+  .setMIFlags(MachineInstr::FrameSetup);
+
+  //   STR XZR, [ScratchReg]
+  BuildMI(*LoopBodyMBB, LoopBodyMBB->end(), DL, TII->get(AArch64::STRXui))
+  .addReg(AArch64::XZR)
+  .addReg(ScratchReg)
+  .addImm(0)
+  .setMIFlags(MachineInstr::FrameSetup);
+
+  //   B loop
+  BuildMI(*LoopBodyMBB, LoopBodyMBB->end(), DL, TII->get(AArch64::B))
+  .addMBB(LoopTestMBB)
+  .setMIFlags(MachineInstr::FrameSetup);
+
+  // LoopExit:
+  //   STR XZR, [TargetReg]
+  BuildMI(*ExitMBB, ExitMBB->begin(), DL, TII->get(AArch64::STRXui))
+  .addReg(AArch64::XZR)
+  .addReg(TargetReg)
+  .addImm(0)
+  .setMIFlags(MachineInstr::FrameSetup);

momchil-velikov wrote:

> ```
> sub sp, sp, #0x1, lsl #0xc
> cmp sp, x1
> b.le0x557388
> str xzr, [x1]  {0x0}
> ```
> 
> We are probing the _old_ stack head! `x1` contains `0x7fee80` but `sp` is 
> at `7fde80`! This means that the selection of the `x1` register instead 
> of `sp` is incorrect.

I can't quite see how it is possible to generate this code. This is part of the 
sequence for allocating a compile time unknown amount of stack space that is 
done by `AArch64InstrInfo::insertStackProbingLoop`. In this function 
`TargetReg` is the new
top of the stack and right now [1] `ScratchReg` is always `AArch64::SP` .


Thus first we have
```
  // LoopTest:
  //   SUB ScratchReg, ScratchReg, #ProbeSize
  emitFrameOffset(*LoopTestMBB, LoopTestMBB->end(), DL, ScratchReg, ScratchReg,
  StackOffset::getFixed(-ProbeSize), TII,
  MachineInstr::FrameSetup);
```

This is the code the emits the ` su sp, sp, #0x1, lsl #0xc`. Note, it uses 
`ScratchReg`.

Then we emit the compare
```
  //   CMP ScratchReg, TargetReg

  AArch64CC::CondCode Cond = AArch64CC::LE;
  Register Op1 = ScratchReg;
  Register Op2 = TargetReg;
  if (Op2 == AArch64::SP) { // condition is false here
  // ...
  }

  BuildMI(*LoopTestMBB, LoopTestMBB->end(), DL, TII->get(AArch64::SUBSXrx64),
  AArch64::XZR)
  .addReg(Op1)
  .addReg(Op2)
  .addImm(AArch64_AM::getArithExtendImm(AArch64_AM::UXTX, 0))
  .setMIFlags(MachineInstr::FrameSetup);
```

That is  the `cmp sp, x1`.  So, `Op2` is `TargetReg` and `TargetReg` is `x1`.

Then we emit the loop exit branch:
```
  //   B. LoopExit
  BuildMI(*LoopTestMBB, LoopTestMBB->end(), DL, TII->get(AArch64::Bcc))
  .addImm(Cond)
  .addMBB(ExitMBB)
  .setMIFlags(MachineInstr::FrameSetup);
```

This is the `b.le0x557388` above.

and then, still inside the probing loop, we emit a stack probe to `ScratchReg`, 
i.e. to `SP`.

```
  //   STR XZR, [ScratchReg]
  BuildMI(*LoopBodyMBB, LoopBodyMBB->end(), DL, TII->get(AArch64::STRXui))
  .addReg(AArch64::XZR)
  .addReg(ScratchReg)
  .addImm(0)
  .setMIFlags(MachineInstr::FrameSetup);
```

However, i

[llvm] [clang] [AArch64] Stack probing for function prologues (PR #66524)

2023-11-21 Thread Momchil Velikov via cfe-commits


@@ -26262,3 +26262,37 @@ bool 
AArch64TargetLowering::preferScalarizeSplat(SDNode *N) const {
   }
   return true;
 }
+
+bool AArch64TargetLowering::hasInlineStackProbe(
+const MachineFunction &MF) const {
+  // If the function specifically requests inline stack probes, emit them.
+  if (MF.getFunction().hasFnAttribute("probe-stack")) {
+if (MF.getFunction().getFnAttribute("probe-stack").getValueAsString() ==
+"inline-asm")
+  return true;
+else
+  llvm_unreachable("Unsupported stack probing method");
+  }
+
+  return false;
+}
+
+unsigned
+AArch64TargetLowering::getStackProbeSize(const MachineFunction &MF) const {
+  const TargetFrameLowering *TFI = Subtarget->getFrameLowering();
+  unsigned StackAlign = TFI->getStackAlignment();
+  assert(StackAlign >= 1 && isPowerOf2_32(StackAlign) &&
+ "Unexpected stack alignment");
+  // The default stack probe size is 4096 if the function has no
+  // stack-probe-size attribute. This is a safe default because it is the
+  // smallest possible guard page size.
+  unsigned StackProbeSize = 4096;
+  const Function &Fn = MF.getFunction();
+  if (Fn.hasFnAttribute("stack-probe-size"))

momchil-velikov wrote:

The rounding to the stack alignment size is enough. Rounding down is the safer 
choice, zero is handled, there's no requirement other than be a multiple of 
stack alignment. Some choices might not be appropriate for certain platforms, 
e.g. 
5k or 8k probe size with 4k guard page size, but that's not something that can 
be validated here. Values, greater than 64k might also be legit, e.g. a 
specific platform allocates 2 guard pages (128k) at the top of the stack to 
limit probing overhead.

https://github.com/llvm/llvm-project/pull/66524
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[llvm] [clang] [AArch64] Stack probing for function prologues (PR #66524)

2023-11-21 Thread Momchil Velikov via cfe-commits


@@ -9460,6 +9461,94 @@ bool AArch64InstrInfo::isReallyTriviallyReMaterializable(
   return TargetInstrInfo::isReallyTriviallyReMaterializable(MI);
 }
 
+MachineBasicBlock::iterator
+AArch64InstrInfo::insertStackProbingLoop(MachineBasicBlock::iterator MBBI,
+ Register ScratchReg,
+ Register TargetReg) const {
+  MachineBasicBlock &MBB = *MBBI->getParent();
+  MachineFunction &MF = *MBB.getParent();
+  const AArch64InstrInfo *TII =
+  MF.getSubtarget().getInstrInfo();
+  int64_t ProbeSize = MF.getInfo()->getStackProbeSize();
+  DebugLoc DL = MBB.findDebugLoc(MBBI);
+
+  MachineFunction::iterator MBBInsertPoint = std::next(MBB.getIterator());
+  MachineBasicBlock *LoopTestMBB =
+  MF.CreateMachineBasicBlock(MBB.getBasicBlock());
+  MF.insert(MBBInsertPoint, LoopTestMBB);
+  MachineBasicBlock *LoopBodyMBB =
+  MF.CreateMachineBasicBlock(MBB.getBasicBlock());
+  MF.insert(MBBInsertPoint, LoopBodyMBB);
+  MachineBasicBlock *ExitMBB = MF.CreateMachineBasicBlock(MBB.getBasicBlock());
+  MF.insert(MBBInsertPoint, ExitMBB);
+
+  // LoopTest:
+  //   SUB ScratchReg, ScratchReg, #ProbeSize
+  emitFrameOffset(*LoopTestMBB, LoopTestMBB->end(), DL, ScratchReg, ScratchReg,
+  StackOffset::getFixed(-ProbeSize), TII,
+  MachineInstr::FrameSetup);
+
+  //   CMP ScratchReg, TargetReg
+  AArch64CC::CondCode Cond = AArch64CC::LE;
+  Register Op1 = ScratchReg;
+  Register Op2 = TargetReg;
+  if (Op2 == AArch64::SP) {
+assert(Op1 != AArch64::SP && "At most one of the registers can be SP");
+// CMP TargetReg, ScratchReg
+std::swap(Op1, Op2);
+Cond = AArch64CC::GT;
+  }
+  BuildMI(*LoopTestMBB, LoopTestMBB->end(), DL, TII->get(AArch64::SUBSXrx64),
+  AArch64::XZR)
+  .addReg(Op1)
+  .addReg(Op2)
+  .addImm(AArch64_AM::getArithExtendImm(AArch64_AM::UXTX, 0))
+  .setMIFlags(MachineInstr::FrameSetup);
+
+  //   B. LoopExit
+  BuildMI(*LoopTestMBB, LoopTestMBB->end(), DL, TII->get(AArch64::Bcc))
+  .addImm(Cond)
+  .addMBB(ExitMBB)
+  .setMIFlags(MachineInstr::FrameSetup);
+
+  //   STR XZR, [ScratchReg]
+  BuildMI(*LoopBodyMBB, LoopBodyMBB->end(), DL, TII->get(AArch64::STRXui))
+  .addReg(AArch64::XZR)
+  .addReg(ScratchReg)
+  .addImm(0)
+  .setMIFlags(MachineInstr::FrameSetup);
+
+  //   B loop
+  BuildMI(*LoopBodyMBB, LoopBodyMBB->end(), DL, TII->get(AArch64::B))
+  .addMBB(LoopTestMBB)
+  .setMIFlags(MachineInstr::FrameSetup);
+
+  // LoopExit:
+  //   STR XZR, [TargetReg]
+  BuildMI(*ExitMBB, ExitMBB->begin(), DL, TII->get(AArch64::STRXui))
+  .addReg(AArch64::XZR)
+  .addReg(TargetReg)
+  .addImm(0)
+  .setMIFlags(MachineInstr::FrameSetup);

momchil-velikov wrote:

I have now fixed this issue.

https://github.com/llvm/llvm-project/pull/66524
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[llvm] [clang] [AArch64] Stack probing for function prologues (PR #66524)

2023-11-21 Thread Momchil Velikov via cfe-commits


@@ -1076,6 +1076,16 @@ void CodeGenModule::Release() {
 "sign-return-address-with-bkey", 1);
   }
 
+  if (Arch == llvm::Triple::aarch64 || Arch == llvm::Triple::aarch64_be) {
+auto *InlineAsm = llvm::MDString::get(TheModule.getContext(), 
"inline-asm");
+if (CodeGenOpts.StackClashProtector)
+  getModule().addModuleFlag(llvm::Module::Override, "probe-stack",
+InlineAsm);

momchil-velikov wrote:

We would like to use a module flag so the stack clash protection is effective 
for functions created by LLVM (e..g `asan.module_ctor`).
It is not AArch64 specific in principle, but other backends which implement SCP 
still rely on function attributes. When/if other backends adopt this approach 
the condition can be removed.
@serge-sans-paille 

https://github.com/llvm/llvm-project/pull/66524
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[llvm] [clang] [AArch64] Stack probing for function prologues (PR #66524)

2023-11-01 Thread Oskar Wirga via cfe-commits


@@ -688,6 +689,68 @@ void AArch64FrameLowering::emitCalleeSavedSVERestores(
   emitCalleeSavedRestores(MBB, MBBI, true);
 }
 
+void AArch64FrameLowering::allocateSVEStackSpace(
+MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
+StackOffset AllocSize, StackOffset InitialOffset, bool EmitCFI) const {
+  DebugLoc DL;
+  MachineFunction &MF = *MBB.getParent();
+  const AArch64FunctionInfo &MFI = *MF.getInfo();
+  const AArch64Subtarget &Subtarget = MF.getSubtarget();
+  const AArch64RegisterInfo &RegInfo = *Subtarget.getRegisterInfo();
+  const AArch64TargetLowering &TLI = *Subtarget.getTargetLowering();
+  const TargetInstrInfo &TII = *Subtarget.getInstrInfo();
+
+  // If not probing the stack or the (uknown) allocation size is less than the
+  // probe size decrement the stack pointer right away. This avoids having to
+  // emit a probing loop when allocating space for up to 16 SVE registers when
+  // using 4k probes.
+
+  // The bit-length of SVE registers is architecturally limited.
+  const int64_t MAX_BYTES_PER_SCALABLE_BYTE = 16;
+  int64_t ProbeSize = MFI.getStackProbeSize();
+  if (!TLI.hasInlineStackProbe(MF) ||
+  AllocSize.getScalable() * MAX_BYTES_PER_SCALABLE_BYTE +
+  AllocSize.getFixed() <=
+  ProbeSize) {
+emitFrameOffset(MBB, MBBI, DL, AArch64::SP, AArch64::SP, -AllocSize, &TII,
+MachineInstr::FrameSetup, false, false, nullptr, EmitCFI,
+InitialOffset);
+if (TLI.hasInlineStackProbe(MF)) {
+  // Issue a probe at the top of the stack to prepare for subsequent
+  // allocations.
+  // STR XZR, [TargetReg]

oskarwirga wrote:

```suggestion
  // STR XZR, [SP]
```

https://github.com/llvm/llvm-project/pull/66524
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits