Date: Sunday, January 10, 2021 @ 00:30:27 Author: foutrelis Revision: 816853
upgpkg: lib32-llvm 11.0.1-1: new upstream release Modified: lib32-llvm/trunk/PKGBUILD Deleted: lib32-llvm/trunk/stack-clash-fixes.patch -------------------------+ PKGBUILD | 15 stack-clash-fixes.patch | 870 ---------------------------------------------- 2 files changed, 5 insertions(+), 880 deletions(-) Modified: PKGBUILD =================================================================== --- PKGBUILD 2021-01-10 00:28:23 UTC (rev 816852) +++ PKGBUILD 2021-01-10 00:30:27 UTC (rev 816853) @@ -3,8 +3,8 @@ # Contributor: Jan "heftig" Steffens <jan.steff...@gmail.com> pkgname=('lib32-llvm' 'lib32-llvm-libs') -pkgver=11.0.0 -pkgrel=2 +pkgver=11.0.1 +pkgrel=1 arch=('x86_64') url="https://llvm.org/" license=('custom:Apache 2.0 with LLVM Exception') @@ -12,11 +12,9 @@ 'lib32-libxml2') options=('staticlibs') _source_base=https://github.com/llvm/llvm-project/releases/download/llvmorg-$pkgver -source=($_source_base/llvm-$pkgver.src.tar.xz{,.sig} - stack-clash-fixes.patch) -sha256sums=('913f68c898dfb4a03b397c5e11c6a2f39d0f22ed7665c9cefa87a34423a72469' - 'SKIP' - 'bdcaa7559223bd42a381086f7cc23fc73f88ebb1966a7c235f897db0f73b7d20') +source=($_source_base/llvm-$pkgver.src.tar.xz{,.sig}) +sha256sums=('ccd87c254b6aebc5077e4e6977d08d4be888e7eb672c6630a26a15d58b59b528' + 'SKIP') validpgpkeys+=('B6C8F98282B944E3B0D5C2530FC3042E345AD05D') # Hans Wennborg <h...@chromium.org> validpgpkeys+=('474E22316ABF4785A88C6E8EA2C794A986419D8A') # Tom Stellard <tstel...@redhat.com> @@ -23,9 +21,6 @@ prepare() { cd "$srcdir/llvm-$pkgver.src" mkdir build - - # https://bugs.llvm.org/show_bug.cgi?id=48007 - patch -Np2 -i ../stack-clash-fixes.patch } build() { Deleted: stack-clash-fixes.patch =================================================================== --- stack-clash-fixes.patch 2021-01-10 00:28:23 UTC (rev 816852) +++ stack-clash-fixes.patch 2021-01-10 00:30:27 UTC (rev 816853) @@ -1,870 +0,0 @@ -From a1e0363c7402f7aa58e24e0e6dfa447ebabc1910 Mon Sep 17 00:00:00 2001 -From: serge-sans-paille <sguel...@redhat.com> -Date: Wed, 30 Sep 2020 11:35:00 +0200 -Subject: [PATCH 1/3] Fix limit behavior of dynamic alloca - -When the allocation size is 0, we shouldn't probe. Within [1, PAGE_SIZE], we -should probe once etc. - -This fixes https://bugs.llvm.org/show_bug.cgi?id=47657 - -Differential Revision: https://reviews.llvm.org/D88548 - -(cherry picked from commit 9573c9f2a363da71b2c07a3add4e52721e6028a0) ---- - llvm/lib/Target/X86/X86ISelLowering.cpp | 2 +- - llvm/test/CodeGen/X86/stack-clash-dynamic-alloca.ll | 8 ++++---- - 2 files changed, 5 insertions(+), 5 deletions(-) - -diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp -index fd1e6517dfac..f68ae4461fe3 100644 ---- a/llvm/lib/Target/X86/X86ISelLowering.cpp -+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp -@@ -31876,7 +31876,7 @@ X86TargetLowering::EmitLoweredProbedAlloca(MachineInstr &MI, - - BuildMI(testMBB, DL, TII->get(X86::JCC_1)) - .addMBB(tailMBB) -- .addImm(X86::COND_L); -+ .addImm(X86::COND_LE); - testMBB->addSuccessor(blockMBB); - testMBB->addSuccessor(tailMBB); - -diff --git a/llvm/test/CodeGen/X86/stack-clash-dynamic-alloca.ll b/llvm/test/CodeGen/X86/stack-clash-dynamic-alloca.ll -index bc4678564083..82fd67842c8a 100644 ---- a/llvm/test/CodeGen/X86/stack-clash-dynamic-alloca.ll -+++ b/llvm/test/CodeGen/X86/stack-clash-dynamic-alloca.ll -@@ -24,12 +24,12 @@ attributes #0 = {"probe-stack"="inline-asm"} - ; CHECK-X86-64-NEXT: andq $-16, %rcx - ; CHECK-X86-64-NEXT: subq %rcx, %rax - ; CHECK-X86-64-NEXT: cmpq %rsp, %rax --; CHECK-X86-64-NEXT: jl .LBB0_3 -+; CHECK-X86-64-NEXT: jle .LBB0_3 - ; CHECK-X86-64-NEXT: .LBB0_2: # =>This Inner Loop Header: Depth=1 - ; CHECK-X86-64-NEXT: movq $0, (%rsp) - ; CHECK-X86-64-NEXT: subq $4096, %rsp # imm = 0x1000 - ; CHECK-X86-64-NEXT: cmpq %rsp, %rax --; CHECK-X86-64-NEXT: jge .LBB0_2 -+; CHECK-X86-64-NEXT: jg .LBB0_2 - ; CHECK-X86-64-NEXT: .LBB0_3: - ; CHECK-X86-64-NEXT: movq %rax, %rsp - ; CHECK-X86-64-NEXT: movl $1, 4792(%rax) -@@ -54,12 +54,12 @@ attributes #0 = {"probe-stack"="inline-asm"} - ; CHECK-X86-32-NEXT: andl $-16, %ecx - ; CHECK-X86-32-NEXT: subl %ecx, %eax - ; CHECK-X86-32-NEXT: cmpl %esp, %eax --; CHECK-X86-32-NEXT: jl .LBB0_3 -+; CHECK-X86-32-NEXT: jle .LBB0_3 - ; CHECK-X86-32-NEXT: .LBB0_2: # =>This Inner Loop Header: Depth=1 - ; CHECK-X86-32-NEXT: movl $0, (%esp) - ; CHECK-X86-32-NEXT: subl $4096, %esp # imm = 0x1000 - ; CHECK-X86-32-NEXT: cmpl %esp, %eax --; CHECK-X86-32-NEXT: jge .LBB0_2 -+; CHECK-X86-32-NEXT: jg .LBB0_2 - ; CHECK-X86-32-NEXT: .LBB0_3: - ; CHECK-X86-32-NEXT: movl %eax, %esp - ; CHECK-X86-32-NEXT: movl $1, 4792(%eax) - -From aac36687f7978f33751daf2870b5c812124ebfaf Mon Sep 17 00:00:00 2001 -From: serge-sans-paille <sguel...@redhat.com> -Date: Thu, 23 Jul 2020 16:22:48 +0200 -Subject: [PATCH 2/3] Fix interaction between stack alignment and inline-asm - stack clash protection - -As reported in https://github.com/rust-lang/rust/issues/70143 alignment is not -taken into account when doing the probing. Fix that by adjusting the first probe -if the stack align is small, or by extending the dynamic probing if the -alignment is large. - -Differential Revision: https://reviews.llvm.org/D84419 - -(cherry picked from commit f2c6bfa350de142e4d63808d03335f69bd136d6a) ---- - llvm/lib/Target/X86/X86FrameLowering.cpp | 222 ++++++++++++++++-- - llvm/lib/Target/X86/X86FrameLowering.h | 8 +- - .../X86/stack-clash-large-large-align.ll | 88 +++++++ - .../CodeGen/X86/stack-clash-no-free-probe.ll | 27 --- - .../stack-clash-small-alloc-medium-align.ll | 135 +++++++++++ - .../X86/stack-clash-small-large-align.ll | 83 +++++++ - 6 files changed, 512 insertions(+), 51 deletions(-) - create mode 100644 llvm/test/CodeGen/X86/stack-clash-large-large-align.ll - delete mode 100644 llvm/test/CodeGen/X86/stack-clash-no-free-probe.ll - create mode 100644 llvm/test/CodeGen/X86/stack-clash-small-alloc-medium-align.ll - create mode 100644 llvm/test/CodeGen/X86/stack-clash-small-large-align.ll - -diff --git a/llvm/lib/Target/X86/X86FrameLowering.cpp b/llvm/lib/Target/X86/X86FrameLowering.cpp -index c7ca6fb2a4fc..db6b68659493 100644 ---- a/llvm/lib/Target/X86/X86FrameLowering.cpp -+++ b/llvm/lib/Target/X86/X86FrameLowering.cpp -@@ -586,29 +586,55 @@ void X86FrameLowering::emitStackProbeInlineGeneric( - const uint64_t StackProbeSize = TLI.getStackProbeSize(MF); - uint64_t ProbeChunk = StackProbeSize * 8; - -+ uint64_t MaxAlign = -+ TRI->needsStackRealignment(MF) ? calculateMaxStackAlign(MF) : 0; -+ - // Synthesize a loop or unroll it, depending on the number of iterations. -+ // BuildStackAlignAND ensures that only MaxAlign % StackProbeSize bits left -+ // between the unaligned rsp and current rsp. - if (Offset > ProbeChunk) { -- emitStackProbeInlineGenericLoop(MF, MBB, MBBI, DL, Offset); -+ emitStackProbeInlineGenericLoop(MF, MBB, MBBI, DL, Offset, -+ MaxAlign % StackProbeSize); - } else { -- emitStackProbeInlineGenericBlock(MF, MBB, MBBI, DL, Offset); -+ emitStackProbeInlineGenericBlock(MF, MBB, MBBI, DL, Offset, -+ MaxAlign % StackProbeSize); - } - } - - void X86FrameLowering::emitStackProbeInlineGenericBlock( - MachineFunction &MF, MachineBasicBlock &MBB, -- MachineBasicBlock::iterator MBBI, const DebugLoc &DL, -- uint64_t Offset) const { -+ MachineBasicBlock::iterator MBBI, const DebugLoc &DL, uint64_t Offset, -+ uint64_t AlignOffset) const { - - const X86Subtarget &STI = MF.getSubtarget<X86Subtarget>(); - const X86TargetLowering &TLI = *STI.getTargetLowering(); - const unsigned Opc = getSUBriOpcode(Uses64BitFramePtr, Offset); - const unsigned MovMIOpc = Is64Bit ? X86::MOV64mi32 : X86::MOV32mi; - const uint64_t StackProbeSize = TLI.getStackProbeSize(MF); -+ - uint64_t CurrentOffset = 0; -- // 0 Thanks to return address being saved on the stack -- uint64_t CurrentProbeOffset = 0; - -- // For the first N - 1 pages, just probe. I tried to take advantage of -+ assert(AlignOffset < StackProbeSize); -+ -+ // If the offset is so small it fits within a page, there's nothing to do. -+ if (StackProbeSize < Offset + AlignOffset) { -+ -+ MachineInstr *MI = BuildMI(MBB, MBBI, DL, TII.get(Opc), StackPtr) -+ .addReg(StackPtr) -+ .addImm(StackProbeSize - AlignOffset) -+ .setMIFlag(MachineInstr::FrameSetup); -+ MI->getOperand(3).setIsDead(); // The EFLAGS implicit def is dead. -+ -+ addRegOffset(BuildMI(MBB, MBBI, DL, TII.get(MovMIOpc)) -+ .setMIFlag(MachineInstr::FrameSetup), -+ StackPtr, false, 0) -+ .addImm(0) -+ .setMIFlag(MachineInstr::FrameSetup); -+ NumFrameExtraProbe++; -+ CurrentOffset = StackProbeSize - AlignOffset; -+ } -+ -+ // For the next N - 1 pages, just probe. I tried to take advantage of - // natural probes but it implies much more logic and there was very few - // interesting natural probes to interleave. - while (CurrentOffset + StackProbeSize < Offset) { -@@ -626,9 +652,9 @@ void X86FrameLowering::emitStackProbeInlineGenericBlock( - .setMIFlag(MachineInstr::FrameSetup); - NumFrameExtraProbe++; - CurrentOffset += StackProbeSize; -- CurrentProbeOffset += StackProbeSize; - } - -+ // No need to probe the tail, it is smaller than a Page. - uint64_t ChunkSize = Offset - CurrentOffset; - MachineInstr *MI = BuildMI(MBB, MBBI, DL, TII.get(Opc), StackPtr) - .addReg(StackPtr) -@@ -639,8 +665,8 @@ void X86FrameLowering::emitStackProbeInlineGenericBlock( - - void X86FrameLowering::emitStackProbeInlineGenericLoop( - MachineFunction &MF, MachineBasicBlock &MBB, -- MachineBasicBlock::iterator MBBI, const DebugLoc &DL, -- uint64_t Offset) const { -+ MachineBasicBlock::iterator MBBI, const DebugLoc &DL, uint64_t Offset, -+ uint64_t AlignOffset) const { - assert(Offset && "null offset"); - - const X86Subtarget &STI = MF.getSubtarget<X86Subtarget>(); -@@ -648,6 +674,26 @@ void X86FrameLowering::emitStackProbeInlineGenericLoop( - const unsigned MovMIOpc = Is64Bit ? X86::MOV64mi32 : X86::MOV32mi; - const uint64_t StackProbeSize = TLI.getStackProbeSize(MF); - -+ if (AlignOffset) { -+ if (AlignOffset < StackProbeSize) { -+ // Perform a first smaller allocation followed by a probe. -+ const unsigned SUBOpc = getSUBriOpcode(Uses64BitFramePtr, AlignOffset); -+ MachineInstr *MI = BuildMI(MBB, MBBI, DL, TII.get(SUBOpc), StackPtr) -+ .addReg(StackPtr) -+ .addImm(AlignOffset) -+ .setMIFlag(MachineInstr::FrameSetup); -+ MI->getOperand(3).setIsDead(); // The EFLAGS implicit def is dead. -+ -+ addRegOffset(BuildMI(MBB, MBBI, DL, TII.get(MovMIOpc)) -+ .setMIFlag(MachineInstr::FrameSetup), -+ StackPtr, false, 0) -+ .addImm(0) -+ .setMIFlag(MachineInstr::FrameSetup); -+ NumFrameExtraProbe++; -+ Offset -= AlignOffset; -+ } -+ } -+ - // Synthesize a loop - NumFrameLoopProbe++; - const BasicBlock *LLVM_BB = MBB.getBasicBlock(); -@@ -666,8 +712,8 @@ void X86FrameLowering::emitStackProbeInlineGenericLoop( - - // save loop bound - { -- const unsigned Opc = getSUBriOpcode(Uses64BitFramePtr, Offset); -- BuildMI(MBB, MBBI, DL, TII.get(Opc), FinalStackProbed) -+ const unsigned SUBOpc = getSUBriOpcode(Uses64BitFramePtr, Offset); -+ BuildMI(MBB, MBBI, DL, TII.get(SUBOpc), FinalStackProbed) - .addReg(FinalStackProbed) - .addImm(Offset / StackProbeSize * StackProbeSize) - .setMIFlag(MachineInstr::FrameSetup); -@@ -675,8 +721,8 @@ void X86FrameLowering::emitStackProbeInlineGenericLoop( - - // allocate a page - { -- const unsigned Opc = getSUBriOpcode(Uses64BitFramePtr, StackProbeSize); -- BuildMI(testMBB, DL, TII.get(Opc), StackPtr) -+ const unsigned SUBOpc = getSUBriOpcode(Uses64BitFramePtr, StackProbeSize); -+ BuildMI(testMBB, DL, TII.get(SUBOpc), StackPtr) - .addReg(StackPtr) - .addImm(StackProbeSize) - .setMIFlag(MachineInstr::FrameSetup); -@@ -1052,13 +1098,149 @@ void X86FrameLowering::BuildStackAlignAND(MachineBasicBlock &MBB, - uint64_t MaxAlign) const { - uint64_t Val = -MaxAlign; - unsigned AndOp = getANDriOpcode(Uses64BitFramePtr, Val); -- MachineInstr *MI = BuildMI(MBB, MBBI, DL, TII.get(AndOp), Reg) -- .addReg(Reg) -- .addImm(Val) -- .setMIFlag(MachineInstr::FrameSetup); - -- // The EFLAGS implicit def is dead. -- MI->getOperand(3).setIsDead(); -+ MachineFunction &MF = *MBB.getParent(); -+ const X86Subtarget &STI = MF.getSubtarget<X86Subtarget>(); -+ const X86TargetLowering &TLI = *STI.getTargetLowering(); -+ const uint64_t StackProbeSize = TLI.getStackProbeSize(MF); -+ const bool EmitInlineStackProbe = TLI.hasInlineStackProbe(MF); -+ -+ // We want to make sure that (in worst case) less than StackProbeSize bytes -+ // are not probed after the AND. This assumption is used in -+ // emitStackProbeInlineGeneric. -+ if (Reg == StackPtr && EmitInlineStackProbe && MaxAlign >= StackProbeSize) { -+ { -+ NumFrameLoopProbe++; -+ MachineBasicBlock *entryMBB = -+ MF.CreateMachineBasicBlock(MBB.getBasicBlock()); -+ MachineBasicBlock *headMBB = -+ MF.CreateMachineBasicBlock(MBB.getBasicBlock()); -+ MachineBasicBlock *bodyMBB = -+ MF.CreateMachineBasicBlock(MBB.getBasicBlock()); -+ MachineBasicBlock *footMBB = -+ MF.CreateMachineBasicBlock(MBB.getBasicBlock()); -+ -+ MachineFunction::iterator MBBIter = MBB.getIterator(); -+ MF.insert(MBBIter, entryMBB); -+ MF.insert(MBBIter, headMBB); -+ MF.insert(MBBIter, bodyMBB); -+ MF.insert(MBBIter, footMBB); -+ const unsigned MovMIOpc = Is64Bit ? X86::MOV64mi32 : X86::MOV32mi; -+ Register FinalStackProbed = Uses64BitFramePtr ? X86::R11 : X86::R11D; -+ -+ // Setup entry block -+ { -+ -+ entryMBB->splice(entryMBB->end(), &MBB, MBB.begin(), MBBI); -+ BuildMI(entryMBB, DL, TII.get(TargetOpcode::COPY), FinalStackProbed) -+ .addReg(StackPtr) -+ .setMIFlag(MachineInstr::FrameSetup); -+ MachineInstr *MI = -+ BuildMI(entryMBB, DL, TII.get(AndOp), FinalStackProbed) -+ .addReg(FinalStackProbed) -+ .addImm(Val) -+ .setMIFlag(MachineInstr::FrameSetup); -+ -+ // The EFLAGS implicit def is dead. -+ MI->getOperand(3).setIsDead(); -+ -+ BuildMI(entryMBB, DL, -+ TII.get(Uses64BitFramePtr ? X86::CMP64rr : X86::CMP32rr)) -+ .addReg(FinalStackProbed) -+ .addReg(StackPtr) -+ .setMIFlag(MachineInstr::FrameSetup); -+ BuildMI(entryMBB, DL, TII.get(X86::JCC_1)) -+ .addMBB(&MBB) -+ .addImm(X86::COND_E) -+ .setMIFlag(MachineInstr::FrameSetup); -+ entryMBB->addSuccessor(headMBB); -+ entryMBB->addSuccessor(&MBB); -+ } -+ -+ // Loop entry block -+ -+ { -+ const unsigned SUBOpc = -+ getSUBriOpcode(Uses64BitFramePtr, StackProbeSize); -+ BuildMI(headMBB, DL, TII.get(SUBOpc), StackPtr) -+ .addReg(StackPtr) -+ .addImm(StackProbeSize) -+ .setMIFlag(MachineInstr::FrameSetup); -+ -+ BuildMI(headMBB, DL, -+ TII.get(Uses64BitFramePtr ? X86::CMP64rr : X86::CMP32rr)) -+ .addReg(FinalStackProbed) -+ .addReg(StackPtr) -+ .setMIFlag(MachineInstr::FrameSetup); -+ -+ // jump -+ BuildMI(headMBB, DL, TII.get(X86::JCC_1)) -+ .addMBB(footMBB) -+ .addImm(X86::COND_B) -+ .setMIFlag(MachineInstr::FrameSetup); -+ -+ headMBB->addSuccessor(bodyMBB); -+ headMBB->addSuccessor(footMBB); -+ } -+ -+ // setup loop body -+ { -+ addRegOffset(BuildMI(bodyMBB, DL, TII.get(MovMIOpc)) -+ .setMIFlag(MachineInstr::FrameSetup), -+ StackPtr, false, 0) -+ .addImm(0) -+ .setMIFlag(MachineInstr::FrameSetup); -+ -+ const unsigned SUBOpc = -+ getSUBriOpcode(Uses64BitFramePtr, StackProbeSize); -+ BuildMI(bodyMBB, DL, TII.get(SUBOpc), StackPtr) -+ .addReg(StackPtr) -+ .addImm(StackProbeSize) -+ .setMIFlag(MachineInstr::FrameSetup); -+ -+ // cmp with stack pointer bound -+ BuildMI(bodyMBB, DL, -+ TII.get(Uses64BitFramePtr ? X86::CMP64rr : X86::CMP32rr)) -+ .addReg(FinalStackProbed) -+ .addReg(StackPtr) -+ .setMIFlag(MachineInstr::FrameSetup); -+ -+ // jump -+ BuildMI(bodyMBB, DL, TII.get(X86::JCC_1)) -+ .addMBB(bodyMBB) -+ .addImm(X86::COND_B) -+ .setMIFlag(MachineInstr::FrameSetup); -+ bodyMBB->addSuccessor(bodyMBB); -+ bodyMBB->addSuccessor(footMBB); -+ } -+ -+ // setup loop footer -+ { -+ BuildMI(footMBB, DL, TII.get(TargetOpcode::COPY), StackPtr) -+ .addReg(FinalStackProbed) -+ .setMIFlag(MachineInstr::FrameSetup); -+ addRegOffset(BuildMI(footMBB, DL, TII.get(MovMIOpc)) -+ .setMIFlag(MachineInstr::FrameSetup), -+ StackPtr, false, 0) -+ .addImm(0) -+ .setMIFlag(MachineInstr::FrameSetup); -+ footMBB->addSuccessor(&MBB); -+ } -+ -+ recomputeLiveIns(*headMBB); -+ recomputeLiveIns(*bodyMBB); -+ recomputeLiveIns(*footMBB); -+ recomputeLiveIns(MBB); -+ } -+ } else { -+ MachineInstr *MI = BuildMI(MBB, MBBI, DL, TII.get(AndOp), Reg) -+ .addReg(Reg) -+ .addImm(Val) -+ .setMIFlag(MachineInstr::FrameSetup); -+ -+ // The EFLAGS implicit def is dead. -+ MI->getOperand(3).setIsDead(); -+ } - } - - bool X86FrameLowering::has128ByteRedZone(const MachineFunction& MF) const { -diff --git a/llvm/lib/Target/X86/X86FrameLowering.h b/llvm/lib/Target/X86/X86FrameLowering.h -index c0b4be95f88d..bb2e83205e71 100644 ---- a/llvm/lib/Target/X86/X86FrameLowering.h -+++ b/llvm/lib/Target/X86/X86FrameLowering.h -@@ -213,14 +213,14 @@ private: - void emitStackProbeInlineGenericBlock(MachineFunction &MF, - MachineBasicBlock &MBB, - MachineBasicBlock::iterator MBBI, -- const DebugLoc &DL, -- uint64_t Offset) const; -+ const DebugLoc &DL, uint64_t Offset, -+ uint64_t Align) const; - - void emitStackProbeInlineGenericLoop(MachineFunction &MF, - MachineBasicBlock &MBB, - MachineBasicBlock::iterator MBBI, -- const DebugLoc &DL, -- uint64_t Offset) const; -+ const DebugLoc &DL, uint64_t Offset, -+ uint64_t Align) const; - - /// Emit a stub to later inline the target stack probe. - MachineInstr *emitStackProbeInlineStub(MachineFunction &MF, -diff --git a/llvm/test/CodeGen/X86/stack-clash-large-large-align.ll b/llvm/test/CodeGen/X86/stack-clash-large-large-align.ll -new file mode 100644 -index 000000000000..6c981cb4ac91 ---- /dev/null -+++ b/llvm/test/CodeGen/X86/stack-clash-large-large-align.ll -@@ -0,0 +1,88 @@ -+; RUN: llc < %s | FileCheck %s -+ -+ -+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" -+target triple = "x86_64-unknown-linux-gnu" -+ -+define i32 @foo_noprotect() local_unnamed_addr { -+; CHECK-LABEL: foo_noprotect: -+; CHECK: # %bb.0: -+; CHECK-NEXT: pushq %rbp -+; CHECK-NEXT: .cfi_def_cfa_offset 16 -+; CHECK-NEXT: .cfi_offset %rbp, -16 -+; CHECK-NEXT: movq %rsp, %rbp -+; CHECK-NEXT: .cfi_def_cfa_register %rbp -+; CHECK-NEXT: andq $-4096, %rsp # imm = 0xF000 -+; CHECK-NEXT: subq $73728, %rsp # imm = 0x12000 -+; CHECK-NEXT: movl $1, 392(%rsp) -+; CHECK-NEXT: movl $1, 28792(%rsp) -+; CHECK-NEXT: movl (%rsp), %eax -+; CHECK-NEXT: movq %rbp, %rsp -+; CHECK-NEXT: popq %rbp -+; CHECK-NEXT: .cfi_def_cfa %rsp, 8 -+; CHECK-NEXT: retq -+ -+ -+ %a = alloca i32, i64 18000, align 4096 -+ %b0 = getelementptr inbounds i32, i32* %a, i64 98 -+ %b1 = getelementptr inbounds i32, i32* %a, i64 7198 -+ store volatile i32 1, i32* %b0 -+ store volatile i32 1, i32* %b1 -+ %c = load volatile i32, i32* %a -+ ret i32 %c -+} -+ -+define i32 @foo_protect() local_unnamed_addr #0 { -+; CHECK-LABEL: foo_protect: -+; CHECK: # %bb.0: -+; CHECK-NEXT: pushq %rbp -+; CHECK-NEXT: .cfi_def_cfa_offset 16 -+; CHECK-NEXT: .cfi_offset %rbp, -16 -+; CHECK-NEXT: movq %rsp, %rbp -+; CHECK-NEXT: .cfi_def_cfa_register %rbp -+; CHECK-NEXT: movq %rsp, %r11 -+; CHECK-NEXT: andq $-4096, %r11 # imm = 0xF000 -+; CHECK-NEXT: cmpq %rsp, %r11 -+; CHECK-NEXT: je .LBB1_4 -+; CHECK-NEXT:# %bb.1: -+; CHECK-NEXT: subq $4096, %rsp # imm = 0x1000 -+; CHECK-NEXT: cmpq %rsp, %r11 -+; CHECK-NEXT: jb .LBB1_3 -+; CHECK-NEXT:.LBB1_2: # =>This Inner Loop Header: Depth=1 -+; CHECK-NEXT: movq $0, (%rsp) -+; CHECK-NEXT: subq $4096, %rsp # imm = 0x1000 -+; CHECK-NEXT: cmpq %rsp, %r11 -+; CHECK-NEXT: jb .LBB1_2 -+; CHECK-NEXT:.LBB1_3: -+; CHECK-NEXT: movq %r11, %rsp -+; CHECK-NEXT: movq $0, (%rsp) -+; CHECK-NEXT:.LBB1_4: -+; CHECK-NEXT: movq %rsp, %r11 -+; CHECK-NEXT: subq $73728, %r11 # imm = 0x12000 -+; CHECK-NEXT:.LBB1_5: # =>This Inner Loop Header: Depth=1 -+; CHECK-NEXT: subq $4096, %rsp # imm = 0x1000 -+; CHECK-NEXT: movq $0, (%rsp) -+; CHECK-NEXT: cmpq %r11, %rsp -+; CHECK-NEXT: jne .LBB1_5 -+; CHECK-NEXT:# %bb.6: -+; CHECK-NEXT: movl $1, 392(%rsp) -+; CHECK-NEXT: movl $1, 28792(%rsp) -+; CHECK-NEXT: movl (%rsp), %eax -+; CHECK-NEXT: movq %rbp, %rsp -+; CHECK-NEXT: popq %rbp -+; CHECK-NEXT: .cfi_def_cfa %rsp, 8 -+; CHECK-NEXT: retq -+ -+ -+ -+ -+ %a = alloca i32, i64 18000, align 4096 -+ %b0 = getelementptr inbounds i32, i32* %a, i64 98 -+ %b1 = getelementptr inbounds i32, i32* %a, i64 7198 -+ store volatile i32 1, i32* %b0 -+ store volatile i32 1, i32* %b1 -+ %c = load volatile i32, i32* %a -+ ret i32 %c -+} -+ -+attributes #0 = {"probe-stack"="inline-asm"} -diff --git a/llvm/test/CodeGen/X86/stack-clash-no-free-probe.ll b/llvm/test/CodeGen/X86/stack-clash-no-free-probe.ll -deleted file mode 100644 -index 652acbdf00ba..000000000000 ---- a/llvm/test/CodeGen/X86/stack-clash-no-free-probe.ll -+++ /dev/null -@@ -1,27 +0,0 @@ --; RUN: llc < %s | FileCheck %s -- --target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" --target triple = "x86_64-unknown-linux-gnu" -- --define i32 @foo(i64 %i) local_unnamed_addr #0 { --; CHECK-LABEL: foo: --; CHECK: # %bb.0: --; CHECK-NEXT: subq $4096, %rsp # imm = 0x1000 --; CHECK-NEXT: movq $0, (%rsp) --; CHECK-NEXT: subq $3784, %rsp # imm = 0xEC8 --; CHECK-NEXT: .cfi_def_cfa_offset 7888 --; CHECK-NEXT: movl $1, -128(%rsp,%rdi,4) --; CHECK-NEXT: movl -128(%rsp), %eax --; CHECK-NEXT: addq $7880, %rsp # imm = 0x1EC8 --; CHECK-NEXT: .cfi_def_cfa_offset 8 --; CHECK-NEXT: retq -- -- %a = alloca i32, i32 2000, align 16 -- %b = getelementptr inbounds i32, i32* %a, i64 %i -- store volatile i32 1, i32* %b -- %c = load volatile i32, i32* %a -- ret i32 %c --} -- --attributes #0 = {"probe-stack"="inline-asm"} -- -diff --git a/llvm/test/CodeGen/X86/stack-clash-small-alloc-medium-align.ll b/llvm/test/CodeGen/X86/stack-clash-small-alloc-medium-align.ll -new file mode 100644 -index 000000000000..eafa86f1eba9 ---- /dev/null -+++ b/llvm/test/CodeGen/X86/stack-clash-small-alloc-medium-align.ll -@@ -0,0 +1,135 @@ -+; RUN: llc < %s | FileCheck %s -+ -+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" -+target triple = "x86_64-unknown-linux-gnu" -+ -+; | case1 | alloca + align < probe_size -+define i32 @foo1(i64 %i) local_unnamed_addr #0 { -+; CHECK-LABEL: foo1: -+; CHECK: # %bb.0: -+; CHECK-NEXT: pushq %rbp -+; CHECK-NEXT: .cfi_def_cfa_offset 16 -+; CHECK-NEXT: .cfi_offset %rbp, -16 -+; CHECK-NEXT: movq %rsp, %rbp -+; CHECK-NEXT: .cfi_def_cfa_register %rbp -+; CHECK-NEXT: andq $-64, %rsp -+; CHECK-NEXT: subq $832, %rsp # imm = 0x340 -+; CHECK-NEXT: movl $1, (%rsp,%rdi,4) -+; CHECK-NEXT: movl (%rsp), %eax -+; CHECK-NEXT: movq %rbp, %rsp -+; CHECK-NEXT: popq %rbp -+; CHECK-NEXT: .cfi_def_cfa %rsp, 8 -+; CHECK-NEXT: retq -+ -+ %a = alloca i32, i32 200, align 64 -+ %b = getelementptr inbounds i32, i32* %a, i64 %i -+ store volatile i32 1, i32* %b -+ %c = load volatile i32, i32* %a -+ ret i32 %c -+} -+ -+; | case2 | alloca > probe_size, align > probe_size -+define i32 @foo2(i64 %i) local_unnamed_addr #0 { -+; CHECK-LABEL: foo2: -+; CHECK: # %bb.0: -+; CHECK-NEXT: pushq %rbp -+; CHECK-NEXT: .cfi_def_cfa_offset 16 -+; CHECK-NEXT: .cfi_offset %rbp, -16 -+; CHECK-NEXT: movq %rsp, %rbp -+; CHECK-NEXT: .cfi_def_cfa_register %rbp -+; CHECK-NEXT: andq $-2048, %rsp # imm = 0xF800 -+; CHECK-NEXT: subq $2048, %rsp # imm = 0x800 -+; CHECK-NEXT: movq $0, (%rsp) -+; CHECK-NEXT: subq $4096, %rsp # imm = 0x1000 -+; CHECK-NEXT: movq $0, (%rsp) -+; CHECK-NEXT: subq $2048, %rsp # imm = 0x800 -+; CHECK-NEXT: movl $1, (%rsp,%rdi,4) -+; CHECK-NEXT: movl (%rsp), %eax -+; CHECK-NEXT: movq %rbp, %rsp -+; CHECK-NEXT: popq %rbp -+; CHECK-NEXT: .cfi_def_cfa %rsp, 8 -+; CHECK-NEXT: retq -+ -+ %a = alloca i32, i32 2000, align 2048 -+ %b = getelementptr inbounds i32, i32* %a, i64 %i -+ store volatile i32 1, i32* %b -+ %c = load volatile i32, i32* %a -+ ret i32 %c -+} -+ -+; | case3 | alloca < probe_size, align < probe_size, alloca + align > probe_size -+define i32 @foo3(i64 %i) local_unnamed_addr #0 { -+; CHECK-LABEL: foo3: -+; CHECK: # %bb.0: -+; CHECK-NEXT: pushq %rbp -+; CHECK-NEXT: .cfi_def_cfa_offset 16 -+; CHECK-NEXT: .cfi_offset %rbp, -16 -+; CHECK-NEXT: movq %rsp, %rbp -+; CHECK-NEXT: .cfi_def_cfa_register %rbp -+; CHECK-NEXT: andq $-1024, %rsp # imm = 0xFC00 -+; CHECK-NEXT: subq $3072, %rsp # imm = 0xC00 -+; CHECK-NEXT: movq $0, (%rsp) -+; CHECK-NEXT: subq $1024, %rsp # imm = 0x400 -+; CHECK-NEXT: movl $1, (%rsp,%rdi,4) -+; CHECK-NEXT: movl (%rsp), %eax -+; CHECK-NEXT: movq %rbp, %rsp -+; CHECK-NEXT: popq %rbp -+; CHECK-NEXT: .cfi_def_cfa %rsp, 8 -+; CHECK-NEXT: retq -+ -+ -+ %a = alloca i32, i32 1000, align 1024 -+ %b = getelementptr inbounds i32, i32* %a, i64 %i -+ store volatile i32 1, i32* %b -+ %c = load volatile i32, i32* %a -+ ret i32 %c -+} -+ -+; | case4 | alloca + probe_size < probe_size, followed by dynamic alloca -+define i32 @foo4(i64 %i) local_unnamed_addr #0 { -+; CHECK-LABEL: foo4: -+; CHECK: # %bb.0: -+; CHECK-NEXT: pushq %rbp -+; CHECK-NEXT: .cfi_def_cfa_offset 16 -+; CHECK-NEXT: .cfi_offset %rbp, -16 -+; CHECK-NEXT: movq %rsp, %rbp -+; CHECK-NEXT: .cfi_def_cfa_register %rbp -+; CHECK-NEXT: pushq %rbx -+; CHECK-NEXT: andq $-64, %rsp -+; CHECK-NEXT: subq $896, %rsp # imm = 0x380 -+; CHECK-NEXT: movq %rsp, %rbx -+; CHECK-NEXT: .cfi_offset %rbx, -24 -+; CHECK-NEXT: movl $1, (%rbx,%rdi,4) -+; CHECK-NEXT: movl (%rbx), %ecx -+; CHECK-NEXT: movq %rsp, %rax -+; CHECK-NEXT: leaq 15(,%rcx,4), %rcx -+; CHECK-NEXT: andq $-16, %rcx -+; CHECK-NEXT: subq %rcx, %rax -+; CHECK-NEXT: cmpq %rsp, %rax -+; CHECK-NEXT: jle .LBB3_3 -+; CHECK-NEXT:.LBB3_2: # =>This Inner Loop Header: Depth=1 -+; CHECK-NEXT: movq $0, (%rsp) -+; CHECK-NEXT: subq $4096, %rsp # imm = 0x1000 -+; CHECK-NEXT: cmpq %rsp, %rax -+; CHECK-NEXT: jg .LBB3_2 -+; CHECK-NEXT:.LBB3_3: -+; CHECK-NEXT: andq $-64, %rax -+; CHECK-NEXT: movq %rax, %rsp -+; CHECK-NEXT: movl (%rax), %eax -+; CHECK-NEXT: leaq -8(%rbp), %rsp -+; CHECK-NEXT: popq %rbx -+; CHECK-NEXT: popq %rbp -+; CHECK-NEXT: .cfi_def_cfa %rsp, 8 -+; CHECK-NEXT: retq -+ -+ %a = alloca i32, i32 200, align 64 -+ %b = getelementptr inbounds i32, i32* %a, i64 %i -+ store volatile i32 1, i32* %b -+ %c = load volatile i32, i32* %a -+ %d = alloca i32, i32 %c, align 64 -+ %e = load volatile i32, i32* %d -+ ret i32 %e -+} -+ -+attributes #0 = {"probe-stack"="inline-asm"} -+ -diff --git a/llvm/test/CodeGen/X86/stack-clash-small-large-align.ll b/llvm/test/CodeGen/X86/stack-clash-small-large-align.ll -new file mode 100644 -index 000000000000..e608bab90415 ---- /dev/null -+++ b/llvm/test/CodeGen/X86/stack-clash-small-large-align.ll -@@ -0,0 +1,83 @@ -+; RUN: llc < %s | FileCheck %s -+ -+ -+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" -+target triple = "x86_64-unknown-linux-gnu" -+ -+define i32 @foo_noprotect() local_unnamed_addr { -+; CHECK-LABEL: foo_noprotect: -+; CHECK: # %bb.0: -+; CHECK-NEXT: pushq %rbp -+; CHECK-NEXT: .cfi_def_cfa_offset 16 -+; CHECK-NEXT: .cfi_offset %rbp, -16 -+; CHECK-NEXT: movq %rsp, %rbp -+; CHECK-NEXT: .cfi_def_cfa_register %rbp -+; CHECK-NEXT: andq $-65536, %rsp -+; CHECK-NEXT: subq $65536, %rsp -+; CHECK-NEXT: movl $1, 392(%rsp) -+; CHECK-NEXT: movl (%rsp), %eax -+; CHECK-NEXT: movq %rbp, %rsp -+; CHECK-NEXT: popq %rbp -+; CHECK-NEXT: .cfi_def_cfa %rsp, 8 -+; CHECK-NEXT: retq -+ -+ -+ -+ %a = alloca i32, i64 100, align 65536 -+ %b = getelementptr inbounds i32, i32* %a, i64 98 -+ store volatile i32 1, i32* %b -+ %c = load volatile i32, i32* %a -+ ret i32 %c -+} -+ -+define i32 @foo_protect() local_unnamed_addr #0 { -+; CHECK-LABEL: foo_protect: -+; CHECK: # %bb.0: -+; CHECK-NEXT: pushq %rbp -+; CHECK-NEXT: .cfi_def_cfa_offset 16 -+; CHECK-NEXT: .cfi_offset %rbp, -16 -+; CHECK-NEXT: movq %rsp, %rbp -+; CHECK-NEXT: .cfi_def_cfa_register %rbp -+; CHECK-NEXT: movq %rsp, %r11 -+; CHECK-NEXT: andq $-65536, %r11 # imm = 0xFFFF0000 -+; CHECK-NEXT: cmpq %rsp, %r11 -+; CHECK-NEXT: je .LBB1_4 -+; CHECK-NEXT:# %bb.1: -+; CHECK-NEXT: subq $4096, %rsp # imm = 0x1000 -+; CHECK-NEXT: cmpq %rsp, %r11 -+; CHECK-NEXT: jb .LBB1_3 -+; CHECK-NEXT:.LBB1_2: # =>This Inner Loop Header: Depth=1 -+; CHECK-NEXT: movq $0, (%rsp) -+; CHECK-NEXT: subq $4096, %rsp # imm = 0x1000 -+; CHECK-NEXT: cmpq %rsp, %r11 -+; CHECK-NEXT: jb .LBB1_2 -+; CHECK-NEXT:.LBB1_3: -+; CHECK-NEXT: movq %r11, %rsp -+; CHECK-NEXT: movq $0, (%rsp) -+; CHECK-NEXT:.LBB1_4: -+; CHECK-NEXT: movq %rsp, %r11 -+; CHECK-NEXT: subq $65536, %r11 # imm = 0x10000 -+; CHECK-NEXT:.LBB1_5: # =>This Inner Loop Header: Depth=1 -+; CHECK-NEXT: subq $4096, %rsp # imm = 0x1000 -+; CHECK-NEXT: movq $0, (%rsp) -+; CHECK-NEXT: cmpq %r11, %rsp -+; CHECK-NEXT: jne .LBB1_5 -+; CHECK-NEXT:# %bb.6: -+; CHECK-NEXT: movl $1, 392(%rsp) -+; CHECK-NEXT: movl (%rsp), %eax -+; CHECK-NEXT: movq %rbp, %rsp -+; CHECK-NEXT: popq %rbp -+; CHECK-NEXT: .cfi_def_cfa %rsp, 8 -+; CHECK-NEXT: retq -+ -+ -+ -+ -+ %a = alloca i32, i64 100, align 65536 -+ %b = getelementptr inbounds i32, i32* %a, i64 98 -+ store volatile i32 1, i32* %b -+ %c = load volatile i32, i32* %a -+ ret i32 %c -+} -+ -+attributes #0 = {"probe-stack"="inline-asm"} - -From bbe6cbbed8c7460a7e8477373b9250543362e771 Mon Sep 17 00:00:00 2001 -From: serge-sans-paille <sguel...@redhat.com> -Date: Tue, 27 Oct 2020 10:59:42 +0100 -Subject: [PATCH 3/3] [stack-clash] Fix probing of dynamic alloca - -- Perform the probing in the correct direction. - Related to https://github.com/rust-lang/rust/pull/77885#issuecomment-711062924 - -- The first touch on a dynamic alloca cannot use a mov because it clobbers - existing space. Use a xor 0 instead - -Differential Revision: https://reviews.llvm.org/D90216 - -(cherry picked from commit 0f60bcc36c34522618bd1425a45f8c6006568fb6) ---- - llvm/lib/Target/X86/X86ISelLowering.cpp | 8 ++++---- - llvm/test/CodeGen/X86/stack-clash-dynamic-alloca.ll | 12 ++++++------ - .../X86/stack-clash-small-alloc-medium-align.ll | 6 +++--- - 3 files changed, 13 insertions(+), 13 deletions(-) - -diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp -index f68ae4461fe3..afe470cc6e0b 100644 ---- a/llvm/lib/Target/X86/X86ISelLowering.cpp -+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp -@@ -31876,7 +31876,7 @@ X86TargetLowering::EmitLoweredProbedAlloca(MachineInstr &MI, - - BuildMI(testMBB, DL, TII->get(X86::JCC_1)) - .addMBB(tailMBB) -- .addImm(X86::COND_LE); -+ .addImm(X86::COND_GE); - testMBB->addSuccessor(blockMBB); - testMBB->addSuccessor(tailMBB); - -@@ -31892,9 +31892,9 @@ X86TargetLowering::EmitLoweredProbedAlloca(MachineInstr &MI, - // - // The property we want to enforce is to never have more than [page alloc] between two probes. - -- const unsigned MovMIOpc = -- TFI.Uses64BitFramePtr ? X86::MOV64mi32 : X86::MOV32mi; -- addRegOffset(BuildMI(blockMBB, DL, TII->get(MovMIOpc)), physSPReg, false, 0) -+ const unsigned XORMIOpc = -+ TFI.Uses64BitFramePtr ? X86::XOR64mi8 : X86::XOR32mi8; -+ addRegOffset(BuildMI(blockMBB, DL, TII->get(XORMIOpc)), physSPReg, false, 0) - .addImm(0); - - BuildMI(blockMBB, DL, -diff --git a/llvm/test/CodeGen/X86/stack-clash-dynamic-alloca.ll b/llvm/test/CodeGen/X86/stack-clash-dynamic-alloca.ll -index 82fd67842c8a..6dd8b6ab5897 100644 ---- a/llvm/test/CodeGen/X86/stack-clash-dynamic-alloca.ll -+++ b/llvm/test/CodeGen/X86/stack-clash-dynamic-alloca.ll -@@ -24,12 +24,12 @@ attributes #0 = {"probe-stack"="inline-asm"} - ; CHECK-X86-64-NEXT: andq $-16, %rcx - ; CHECK-X86-64-NEXT: subq %rcx, %rax - ; CHECK-X86-64-NEXT: cmpq %rsp, %rax --; CHECK-X86-64-NEXT: jle .LBB0_3 -+; CHECK-X86-64-NEXT: jge .LBB0_3 - ; CHECK-X86-64-NEXT: .LBB0_2: # =>This Inner Loop Header: Depth=1 --; CHECK-X86-64-NEXT: movq $0, (%rsp) -+; CHECK-X86-64-NEXT: xorq $0, (%rsp) - ; CHECK-X86-64-NEXT: subq $4096, %rsp # imm = 0x1000 - ; CHECK-X86-64-NEXT: cmpq %rsp, %rax --; CHECK-X86-64-NEXT: jg .LBB0_2 -+; CHECK-X86-64-NEXT: jl .LBB0_2 - ; CHECK-X86-64-NEXT: .LBB0_3: - ; CHECK-X86-64-NEXT: movq %rax, %rsp - ; CHECK-X86-64-NEXT: movl $1, 4792(%rax) -@@ -54,12 +54,12 @@ attributes #0 = {"probe-stack"="inline-asm"} - ; CHECK-X86-32-NEXT: andl $-16, %ecx - ; CHECK-X86-32-NEXT: subl %ecx, %eax - ; CHECK-X86-32-NEXT: cmpl %esp, %eax --; CHECK-X86-32-NEXT: jle .LBB0_3 -+; CHECK-X86-32-NEXT: jge .LBB0_3 - ; CHECK-X86-32-NEXT: .LBB0_2: # =>This Inner Loop Header: Depth=1 --; CHECK-X86-32-NEXT: movl $0, (%esp) -+; CHECK-X86-32-NEXT: xorl $0, (%esp) - ; CHECK-X86-32-NEXT: subl $4096, %esp # imm = 0x1000 - ; CHECK-X86-32-NEXT: cmpl %esp, %eax --; CHECK-X86-32-NEXT: jg .LBB0_2 -+; CHECK-X86-32-NEXT: jl .LBB0_2 - ; CHECK-X86-32-NEXT: .LBB0_3: - ; CHECK-X86-32-NEXT: movl %eax, %esp - ; CHECK-X86-32-NEXT: movl $1, 4792(%eax) -diff --git a/llvm/test/CodeGen/X86/stack-clash-small-alloc-medium-align.ll b/llvm/test/CodeGen/X86/stack-clash-small-alloc-medium-align.ll -index eafa86f1eba9..39b6c3640a60 100644 ---- a/llvm/test/CodeGen/X86/stack-clash-small-alloc-medium-align.ll -+++ b/llvm/test/CodeGen/X86/stack-clash-small-alloc-medium-align.ll -@@ -106,12 +106,12 @@ define i32 @foo4(i64 %i) local_unnamed_addr #0 { - ; CHECK-NEXT: andq $-16, %rcx - ; CHECK-NEXT: subq %rcx, %rax - ; CHECK-NEXT: cmpq %rsp, %rax --; CHECK-NEXT: jle .LBB3_3 -+; CHECK-NEXT: jge .LBB3_3 - ; CHECK-NEXT:.LBB3_2: # =>This Inner Loop Header: Depth=1 --; CHECK-NEXT: movq $0, (%rsp) -+; CHECK-NEXT: xorq $0, (%rsp) - ; CHECK-NEXT: subq $4096, %rsp # imm = 0x1000 - ; CHECK-NEXT: cmpq %rsp, %rax --; CHECK-NEXT: jg .LBB3_2 -+; CHECK-NEXT: jl .LBB3_2 - ; CHECK-NEXT:.LBB3_3: - ; CHECK-NEXT: andq $-64, %rax - ; CHECK-NEXT: movq %rax, %rsp