https://github.com/dpaoliello updated https://github.com/llvm/llvm-project/pull/160604
>From 002a75fd06b670246cc9a5834dd528a343208e85 Mon Sep 17 00:00:00 2001 From: Daniel Paoliello <[email protected]> Date: Fri, 12 Dec 2025 09:45:37 -0800 Subject: [PATCH] [win][x64] Fix import call optimization for calls to dllimports and global function pointers --- .../CodeGenCXX/microsoft-abi-eh-ip2state.cpp | 2 +- llvm/lib/Target/X86/X86AsmPrinter.cpp | 4 +- llvm/lib/Target/X86/X86AsmPrinter.h | 2 +- llvm/lib/Target/X86/X86ExpandPseudo.cpp | 8 +- llvm/lib/Target/X86/X86FastISel.cpp | 16 +- llvm/lib/Target/X86/X86FrameLowering.cpp | 4 +- llvm/lib/Target/X86/X86ISelLowering.cpp | 2 - llvm/lib/Target/X86/X86ISelLowering.h | 4 - llvm/lib/Target/X86/X86ISelLoweringCall.cpp | 16 +- llvm/lib/Target/X86/X86InstrCompiler.td | 11 +- llvm/lib/Target/X86/X86InstrControl.td | 10 +- llvm/lib/Target/X86/X86InstrFragments.td | 4 - llvm/lib/Target/X86/X86InstrInfo.cpp | 15 +- llvm/lib/Target/X86/X86InstrPredicates.td | 2 - llvm/lib/Target/X86/X86MCInstLower.cpp | 114 +++++++--- llvm/lib/Target/X86/X86RegisterInfo.cpp | 1 - llvm/lib/Target/X86/X86RegisterInfo.td | 4 - .../Inputs/reference_x86_vocab_print.txt | 2 - .../reference_x86_vocab_wo=0.5_print.txt | 2 - .../win-import-call-optimization-cfguard.ll | 154 +++++++++++-- .../win-import-call-optimization-jumptable.ll | 61 ++++-- .../X86/win-import-call-optimization.ll | 202 ++++++++++++++---- 22 files changed, 463 insertions(+), 177 deletions(-) diff --git a/clang/test/CodeGenCXX/microsoft-abi-eh-ip2state.cpp b/clang/test/CodeGenCXX/microsoft-abi-eh-ip2state.cpp index 0b7b406e2ba8e..541789fc9d339 100644 --- a/clang/test/CodeGenCXX/microsoft-abi-eh-ip2state.cpp +++ b/clang/test/CodeGenCXX/microsoft-abi-eh-ip2state.cpp @@ -40,7 +40,7 @@ void case_calls_dll_import() NO_TAIL { // CHECK: .seh_endprologue // CHECK: .Limpcall{{[0-9]+}}: // CHECK-NEXT: rex64 -// CHECK-NEXT: call __imp_some_dll_import +// CHECK-NEXT: call qword ptr [rip + __imp_some_dll_import] // CHECK-NEXT: nop dword ptr {{\[.*\]}} // CHECK-NEXT: nop // CHECK-NEXT: .seh_startepilogue diff --git a/llvm/lib/Target/X86/X86AsmPrinter.cpp b/llvm/lib/Target/X86/X86AsmPrinter.cpp index 84b921222a116..6a876b8963545 100644 --- a/llvm/lib/Target/X86/X86AsmPrinter.cpp +++ b/llvm/lib/Target/X86/X86AsmPrinter.cpp @@ -479,8 +479,8 @@ static bool isIndirectBranchOrTailCall(const MachineInstr &MI) { Opc == X86::TCRETURNri || Opc == X86::TCRETURN_WIN64ri || Opc == X86::TCRETURN_HIPE32ri || Opc == X86::TCRETURNmi || Opc == X86::TCRETURN_WINmi64 || Opc == X86::TCRETURNri64 || - Opc == X86::TCRETURNmi64 || Opc == X86::TCRETURNri64_ImpCall || - Opc == X86::TAILJMPr64_REX || Opc == X86::TAILJMPm64_REX; + Opc == X86::TCRETURNmi64 || Opc == X86::TAILJMPr64_REX || + Opc == X86::TAILJMPm64_REX; } void X86AsmPrinter::emitBasicBlockEnd(const MachineBasicBlock &MBB) { diff --git a/llvm/lib/Target/X86/X86AsmPrinter.h b/llvm/lib/Target/X86/X86AsmPrinter.h index e02b5562d3b5e..7c55f06e86d4b 100644 --- a/llvm/lib/Target/X86/X86AsmPrinter.h +++ b/llvm/lib/Target/X86/X86AsmPrinter.h @@ -53,7 +53,7 @@ class LLVM_LIBRARY_VISIBILITY X86AsmPrinter : public AsmPrinter { MCSymbol *CalleeSymbol; ImportCallKind Kind; }; - DenseMap<MCSection *, std::vector<ImportCallInfo>> + MapVector<MCSection *, std::vector<ImportCallInfo>> SectionToImportedFunctionCalls; // This utility class tracks the length of a stackmap instruction's 'shadow'. 
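For context on the hunks above and below: a minimal sketch (not taken from this patch; the callee name and label number are illustrative) of the call-site shape the updated clang and llc tests expect for a direct call to a dllimport function when the import-call-optimization module flag is set. The call must use the memory form through the import address table, carry a rex64 prefix, and be followed by a 5-byte nop that the loader is allowed to overwrite:

declare dllimport void @imported()

define void @caller() {
entry:
  call void @imported()
  ret void
}

!llvm.module.flags = !{!0}
!0 = !{i32 1, !"import-call-optimization", i32 1}

; Expected call-site lowering (per the CHECK/ASM lines in the tests in this patch):
;   .Limpcall0:
;     rex64
;     callq *__imp_imported(%rip)
;     nopl (%rax,%rax)              ; 5-byte nop (0f 1f 44 00 00)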
diff --git a/llvm/lib/Target/X86/X86ExpandPseudo.cpp b/llvm/lib/Target/X86/X86ExpandPseudo.cpp index 2f5ee9d2c9a13..6574bd2d974a8 100644 --- a/llvm/lib/Target/X86/X86ExpandPseudo.cpp +++ b/llvm/lib/Target/X86/X86ExpandPseudo.cpp @@ -290,7 +290,6 @@ bool X86ExpandPseudoImpl::expandMI(MachineBasicBlock &MBB, case X86::TCRETURNdi64: case X86::TCRETURNdi64cc: case X86::TCRETURNri64: - case X86::TCRETURNri64_ImpCall: case X86::TCRETURNmi64: case X86::TCRETURN_WINmi64: { bool isMem = Opcode == X86::TCRETURNmi || Opcode == X86::TCRETURNmi64 || @@ -366,9 +365,7 @@ bool X86ExpandPseudoImpl::expandMI(MachineBasicBlock &MBB, MachineInstrBuilder MIB = BuildMI(MBB, MBBI, DL, TII->get(Op)); for (unsigned i = 0; i != X86::AddrNumOperands; ++i) MIB.add(MBBI->getOperand(i)); - } else if (Opcode == X86::TCRETURNri64 || - Opcode == X86::TCRETURNri64_ImpCall || - Opcode == X86::TCRETURN_WIN64ri) { + } else if (Opcode == X86::TCRETURNri64 || Opcode == X86::TCRETURN_WIN64ri) { JumpTarget.setIsKill(); BuildMI(MBB, MBBI, DL, TII->get(IsX64 ? X86::TAILJMPr64_REX : X86::TAILJMPr64)) @@ -722,9 +719,6 @@ bool X86ExpandPseudoImpl::expandMI(MachineBasicBlock &MBB, case X86::CALL64m_RVMARKER: expandCALL_RVMARKER(MBB, MBBI); return true; - case X86::CALL64r_ImpCall: - MI.setDesc(TII->get(X86::CALL64r)); - return true; case X86::ADD32mi_ND: case X86::ADD64mi32_ND: case X86::SUB32mi_ND: diff --git a/llvm/lib/Target/X86/X86FastISel.cpp b/llvm/lib/Target/X86/X86FastISel.cpp index c69ca77031495..26b247c797b00 100644 --- a/llvm/lib/Target/X86/X86FastISel.cpp +++ b/llvm/lib/Target/X86/X86FastISel.cpp @@ -3326,11 +3326,6 @@ bool X86FastISel::fastLowerCall(CallLoweringInfo &CLI) { if (Flag.isSwiftError() || Flag.isPreallocated()) return false; - // Can't handle import call optimization. - if (Is64Bit && - MF->getFunction().getParent()->getModuleFlag("import-call-optimization")) - return false; - SmallVector<MVT, 16> OutVTs; SmallVector<Type *, 16> ArgTys; SmallVector<Register, 16> ArgRegs; @@ -3572,6 +3567,17 @@ bool X86FastISel::fastLowerCall(CallLoweringInfo &CLI) { if (CalleeOp) { // Register-indirect call. unsigned CallOpc = Is64Bit ? X86::CALL64r : X86::CALL32r; + + const Module *M = FuncInfo.MF->getFunction().getParent(); + if (CalleeOp != X86::RAX && Is64Bit && + M->getModuleFlag("import-call-optimization")) { + // Import call optimization requires all indirect calls to be via RAX. 
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD, + TII.get(TargetOpcode::COPY), X86::RAX) + .addReg(CalleeOp); + CalleeOp = X86::RAX; + } + MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD, TII.get(CallOpc)) .addReg(CalleeOp); } else { diff --git a/llvm/lib/Target/X86/X86FrameLowering.cpp b/llvm/lib/Target/X86/X86FrameLowering.cpp index 8bca6344d6521..7494f756de68a 100644 --- a/llvm/lib/Target/X86/X86FrameLowering.cpp +++ b/llvm/lib/Target/X86/X86FrameLowering.cpp @@ -2400,8 +2400,8 @@ static bool isTailCallOpcode(unsigned Opc) { return Opc == X86::TCRETURNri || Opc == X86::TCRETURN_WIN64ri || Opc == X86::TCRETURN_HIPE32ri || Opc == X86::TCRETURNdi || Opc == X86::TCRETURNmi || Opc == X86::TCRETURNri64 || - Opc == X86::TCRETURNri64_ImpCall || Opc == X86::TCRETURNdi64 || - Opc == X86::TCRETURNmi64 || Opc == X86::TCRETURN_WINmi64; + Opc == X86::TCRETURNdi64 || Opc == X86::TCRETURNmi64 || + Opc == X86::TCRETURN_WINmi64; } void X86FrameLowering::emitEpilogue(MachineFunction &MF, diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index ef94c198558c7..3d8c455af01ab 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -35396,7 +35396,6 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const { NODE_NAME_CASE(FST) NODE_NAME_CASE(CALL) NODE_NAME_CASE(CALL_RVMARKER) - NODE_NAME_CASE(IMP_CALL) NODE_NAME_CASE(BT) NODE_NAME_CASE(CMP) NODE_NAME_CASE(FCMP) @@ -63310,7 +63309,6 @@ X86TargetLowering::EmitKCFICheck(MachineBasicBlock &MBB, Register TargetReg; switch (MBBI->getOpcode()) { case X86::CALL64r: - case X86::CALL64r_ImpCall: case X86::CALL64r_NT: case X86::TAILJMPr64: case X86::TAILJMPr64_REX: diff --git a/llvm/lib/Target/X86/X86ISelLowering.h b/llvm/lib/Target/X86/X86ISelLowering.h index 08d5e2331727b..a24c8dde6497b 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.h +++ b/llvm/lib/Target/X86/X86ISelLowering.h @@ -94,10 +94,6 @@ namespace llvm { /// POP_FROM_X87_REG (which may remove a required FPU stack pop). POP_FROM_X87_REG, - // Pseudo for a call to an imported function to ensure the correct machine - // instruction is emitted for Import Call Optimization. - IMP_CALL, - /// X86 compare and logical compare instructions. CMP, FCMP, diff --git a/llvm/lib/Target/X86/X86ISelLoweringCall.cpp b/llvm/lib/Target/X86/X86ISelLoweringCall.cpp index f4de2f8c6c22e..f2fb5c685f348 100644 --- a/llvm/lib/Target/X86/X86ISelLoweringCall.cpp +++ b/llvm/lib/Target/X86/X86ISelLoweringCall.cpp @@ -2585,6 +2585,12 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, "CFG Function load should not have an offset"); Callee = DAG.getTargetGlobalAddress( GA->getGlobal(), dl, GA->getValueType(0), 0, X86II::MO_NO_FLAG); + } else if (M->getModuleFlag("import-call-optimization")) { + // When import call optimization is enabled, all register indirect calls + // must use RAX. + Chain = DAG.getCopyToReg(Chain, dl, X86::RAX, Callee, InGlue); + InGlue = Chain.getValue(1); + Callee = DAG.getRegister(X86::RAX, Callee.getValueType()); } SmallVector<SDValue, 8> Ops; @@ -2689,8 +2695,8 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, // should be computed from returns not tail calls. Consider a void // function making a tail call to a function returning int. MF.getFrameInfo().setHasTailCall(); - auto Opcode = - IsCFGuardCall ? X86ISD::TC_RETURN_GLOBALADDR : X86ISD::TC_RETURN; + auto Opcode = (IsCFGuardCall || IsImpCall) ? 
X86ISD::TC_RETURN_GLOBALADDR + : X86ISD::TC_RETURN; SDValue Ret = DAG.getNode(Opcode, dl, MVT::Other, Ops); if (IsCFICall) @@ -2703,11 +2709,9 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, // Returns a chain & a glue for retval copy to use. SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue); - if (IsImpCall) { - Chain = DAG.getNode(X86ISD::IMP_CALL, dl, NodeTys, Ops); - } else if (IsNoTrackIndirectCall) { + if (IsNoTrackIndirectCall) { Chain = DAG.getNode(X86ISD::NT_CALL, dl, NodeTys, Ops); - } else if (IsCFGuardCall) { + } else if (IsCFGuardCall || IsImpCall) { Chain = DAG.getNode(X86ISD::CALL_GLOBALADDR, dl, NodeTys, Ops); } else if (CLI.CB && objcarc::hasAttachedCallOpBundle(CLI.CB)) { // Calls with a "clang.arc.attachedcall" bundle are special. They should be diff --git a/llvm/lib/Target/X86/X86InstrCompiler.td b/llvm/lib/Target/X86/X86InstrCompiler.td index 3e07db678809d..3ca60135784de 100644 --- a/llvm/lib/Target/X86/X86InstrCompiler.td +++ b/llvm/lib/Target/X86/X86InstrCompiler.td @@ -1322,9 +1322,6 @@ def : Pat<(X86call_rvmarker (i64 tglobaladdr:$rvfunc), (i64 texternalsym:$dst)), def : Pat<(X86call_rvmarker (i64 tglobaladdr:$rvfunc), (i64 tglobaladdr:$dst)), (CALL64pcrel32_RVMARKER tglobaladdr:$rvfunc, tglobaladdr:$dst)>; -def : Pat<(X86imp_call (i64 tglobaladdr:$dst)), - (CALL64pcrel32 tglobaladdr:$dst)>; - // Tailcall stuff. The TCRETURN instructions execute after the epilog, so they // can never use callee-saved registers. That is the purpose of the GR64_TC // register classes. @@ -1359,15 +1356,11 @@ def : Pat<(X86tcret (i32 texternalsym:$dst), timm:$off), def : Pat<(X86tcret ptr_rc_tailcall:$dst, timm:$off), (TCRETURNri64 ptr_rc_tailcall:$dst, timm:$off)>, - Requires<[In64BitMode, IsNotWin64CCFunc, NotUseIndirectThunkCalls, ImportCallOptimizationDisabled]>; + Requires<[In64BitMode, IsNotWin64CCFunc, NotUseIndirectThunkCalls]>; def : Pat<(X86tcret GR64_TCW64:$dst, timm:$off), (TCRETURN_WIN64ri GR64_TCW64:$dst, timm:$off)>, - Requires<[IsWin64CCFunc, NotUseIndirectThunkCalls, ImportCallOptimizationDisabled]>; - -def : Pat<(X86tcret ptr_rc_tailcall:$dst, timm:$off), - (TCRETURNri64_ImpCall ptr_rc_tailcall:$dst, timm:$off)>, - Requires<[In64BitMode, NotUseIndirectThunkCalls, ImportCallOptimizationEnabled]>; + Requires<[IsWin64CCFunc, NotUseIndirectThunkCalls]>; // Don't fold loads into X86tcret requiring more than 6 regs. // There wouldn't be enough scratch registers for base+index. 
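As a companion to the TableGen changes above and below: with the GR64_A register class and the *_ImpCall pseudos removed, indirect calls under import call optimization are no longer selected through dedicated instructions; instead the lowering code copies the callee pointer into RAX, so the ordinary CALL64r and TAILJMPr64_REX forms are emitted. A minimal sketch (function name and source register are illustrative, not from the patch):

define void @indirect(ptr %fp) {
entry:
  call void %fp()
  ret void
}

!llvm.module.flags = !{!0}
!0 = !{i32 1, !"import-call-optimization", i32 1}

; Expected call-site lowering (per the ASM lines in the updated tests):
;   movq %rcx, %rax        ; callee forced into RAX
;   .Limpcall0:
;     callq *%rax
;     nopl (%rax)          ; 3-byte nop (0f 1f 00) after an indirect call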
diff --git a/llvm/lib/Target/X86/X86InstrControl.td b/llvm/lib/Target/X86/X86InstrControl.td index c67feb7668234..aed7df993880a 100644 --- a/llvm/lib/Target/X86/X86InstrControl.td +++ b/llvm/lib/Target/X86/X86InstrControl.td @@ -331,7 +331,7 @@ let isCall = 1, Uses = [RSP, SSP], SchedRW = [WriteJump] in { Requires<[In64BitMode]>; def CALL64r : I<0xFF, MRM2r, (outs), (ins GR64:$dst), "call{q}\t{*}$dst", [(X86call GR64:$dst)]>, - Requires<[In64BitMode,NotUseIndirectThunkCalls,ImportCallOptimizationDisabled]>; + Requires<[In64BitMode,NotUseIndirectThunkCalls]>; def CALL64m : I<0xFF, MRM2m, (outs), (ins i64mem:$dst), "call{q}\t{*}$dst", [(X86call (loadi64 addr:$dst))]>, Requires<[In64BitMode,FavorMemIndirectCall, @@ -364,10 +364,6 @@ let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1, def TCRETURN_WIN64ri : PseudoI<(outs), (ins GR64_TCW64:$dst, i32imm:$offset), []>, Sched<[WriteJump]>; - def TCRETURNri64_ImpCall : PseudoI<(outs), - (ins GR64_A:$dst, i32imm:$offset), - []>, Sched<[WriteJump]>; - let mayLoad = 1 in def TCRETURNmi64 : PseudoI<(outs), (ins i64mem_TC:$dst, i32imm:$offset), @@ -433,10 +429,6 @@ let isPseudo = 1, isCall = 1, isCodeGenOnly = 1, def CALL64pcrel32_RVMARKER : PseudoI<(outs), (ins i64imm:$rvfunc, i64i32imm_brtarget:$dst), []>, Requires<[In64BitMode]>; - - def CALL64r_ImpCall : - PseudoI<(outs), (ins GR64_A:$dst), [(X86call GR64_A:$dst)]>, - Requires<[In64BitMode,NotUseIndirectThunkCalls,ImportCallOptimizationEnabled]>; } // Conditional tail calls are similar to the above, but they are branches diff --git a/llvm/lib/Target/X86/X86InstrFragments.td b/llvm/lib/Target/X86/X86InstrFragments.td index 38ab02667317e..29bf4c46ae69c 100644 --- a/llvm/lib/Target/X86/X86InstrFragments.td +++ b/llvm/lib/Target/X86/X86InstrFragments.td @@ -214,10 +214,6 @@ def X86call_globaladdr : SDNode<"X86ISD::CALL_GLOBALADDR", SDT_X86Call, [SDNPHasChain, SDNPOutGlue, SDNPOptInGlue, SDNPVariadic]>; -def X86imp_call : SDNode<"X86ISD::IMP_CALL", SDT_X86Call, - [SDNPHasChain, SDNPOutGlue, SDNPOptInGlue, - SDNPVariadic]>; - def X86NoTrackCall : SDNode<"X86ISD::NT_CALL", SDT_X86Call, [SDNPHasChain, SDNPOutGlue, SDNPOptInGlue, SDNPVariadic]>; diff --git a/llvm/lib/Target/X86/X86InstrInfo.cpp b/llvm/lib/Target/X86/X86InstrInfo.cpp index 53b148c11c4e1..ebe60922abd9f 100644 --- a/llvm/lib/Target/X86/X86InstrInfo.cpp +++ b/llvm/lib/Target/X86/X86InstrInfo.cpp @@ -3700,7 +3700,6 @@ bool X86InstrInfo::isUnconditionalTailCall(const MachineInstr &MI) const { case X86::TCRETURNmi: case X86::TCRETURNdi64: case X86::TCRETURNri64: - case X86::TCRETURNri64_ImpCall: case X86::TCRETURNmi64: return true; default: @@ -3731,9 +3730,16 @@ bool X86InstrInfo::canMakeTailCallConditional( return false; } - if (Subtarget.isTargetWin64() && MF->hasWinCFI()) { + if (Subtarget.isTargetWin64()) { // Conditional tail calls confuse the Win64 unwinder. - return false; + if (MF->hasWinCFI()) + return false; + + // Conditional tail calls cannot be encoded in the Import Call Optimization + // metadata. + if (MF->getFunction().getParent()->getModuleFlag( + "import-call-optimization")) + return false; } assert(BranchCond.size() == 1); @@ -7496,8 +7502,7 @@ MachineInstr *X86InstrInfo::foldMemoryOperandImpl( // do not fold loads into calls or pushes, unless optimizing for size // aggressively. 
if (isSlowTwoMemOps && !MF.getFunction().hasMinSize() && - (Opc == X86::CALL32r || Opc == X86::CALL64r || - Opc == X86::CALL64r_ImpCall || Opc == X86::PUSH16r || + (Opc == X86::CALL32r || Opc == X86::CALL64r || Opc == X86::PUSH16r || Opc == X86::PUSH32r || Opc == X86::PUSH64r)) return nullptr; diff --git a/llvm/lib/Target/X86/X86InstrPredicates.td b/llvm/lib/Target/X86/X86InstrPredicates.td index 21e6bacbacee2..1d23604d66d2c 100644 --- a/llvm/lib/Target/X86/X86InstrPredicates.td +++ b/llvm/lib/Target/X86/X86InstrPredicates.td @@ -237,8 +237,6 @@ let RecomputePerFunction = 1 in { "shouldOptForSize(MF)">; def NoSSE41_Or_OptForSize : Predicate<"shouldOptForSize(MF) || " "!Subtarget->hasSSE41()">; - def ImportCallOptimizationEnabled : Predicate<"MF->getFunction().getParent()->getModuleFlag(\"import-call-optimization\")">; - def ImportCallOptimizationDisabled : Predicate<"!MF->getFunction().getParent()->getModuleFlag(\"import-call-optimization\")">; def IsWin64CCFunc : Predicate<"Subtarget->isCallingConvWin64(MF->getFunction().getCallingConv())">; def IsNotWin64CCFunc : Predicate<"!Subtarget->isCallingConvWin64(MF->getFunction().getCallingConv())">; diff --git a/llvm/lib/Target/X86/X86MCInstLower.cpp b/llvm/lib/Target/X86/X86MCInstLower.cpp index 2287a921a19c0..0a70f1ad7b8f8 100644 --- a/llvm/lib/Target/X86/X86MCInstLower.cpp +++ b/llvm/lib/Target/X86/X86MCInstLower.cpp @@ -2311,10 +2311,9 @@ void X86AsmPrinter::emitInstruction(const MachineInstr *MI) { if (IndCSPrefix && MI->hasRegisterImplicitUseOperand(X86::R11)) EmitAndCountInstruction(MCInstBuilder(X86::CS_PREFIX)); - if (EnableImportCallOptimization && isImportedFunction(MI->getOperand(0))) { - emitLabelAndRecordForImportCallOptimization( - IMAGE_RETPOLINE_AMD64_IMPORT_BR); - } + if (EnableImportCallOptimization && isImportedFunction(MI->getOperand(0))) + reportFatalInternalError( + "Tail jumps to imported functions must use TAILJMPm64_REX"); // Lower this as normal, but add a comment. OutStreamer->AddComment("TAILCALL"); @@ -2329,8 +2328,9 @@ void X86AsmPrinter::emitInstruction(const MachineInstr *MI) { case X86::TAILJMPm64: case X86::TAILJMPd64_CC: if (EnableImportCallOptimization) - report_fatal_error("Unexpected TAILJMP instruction was emitted when " - "import call optimization was enabled"); + reportFatalInternalError( + "Unexpected TAILJMP instruction was emitted when " + "import call optimization was enabled"); // Lower these as normal, but add some comments. OutStreamer->AddComment("TAILCALL"); @@ -2338,9 +2338,22 @@ void X86AsmPrinter::emitInstruction(const MachineInstr *MI) { break; case X86::TAILJMPm64_REX: - if (EnableImportCallOptimization && isCallToCFGuardFunction(MI)) { - emitLabelAndRecordForImportCallOptimization( - IMAGE_RETPOLINE_AMD64_CFG_BR_REX); + if (EnableImportCallOptimization) { + if (isCallToCFGuardFunction(MI)) { + emitLabelAndRecordForImportCallOptimization( + IMAGE_RETPOLINE_AMD64_CFG_BR_REX); + } else if (isImportedFunction(MI->getOperand(3))) { + emitLabelAndRecordForImportCallOptimization( + IMAGE_RETPOLINE_AMD64_IMPORT_BR); + MCInst TmpInst; + MCInstLowering.Lower(MI, TmpInst); + emitCallInstruction(TmpInst); + + // Must be followed by five int3 instructions. 
+ for (int i = 0; i < 5; ++i) + EmitAndCountInstruction(MCInstBuilder(X86::INT3)); + return; + } } OutStreamer->AddComment("TAILCALL"); @@ -2349,11 +2362,20 @@ void X86AsmPrinter::emitInstruction(const MachineInstr *MI) { case X86::TAILJMPr64_REX: { if (EnableImportCallOptimization) { - assert(MI->getOperand(0).getReg() == X86::RAX && - "Indirect tail calls with impcall enabled must go through RAX (as " - "enforced by TCRETURNImpCallri64)"); + if (MI->getOperand(0).getReg() != X86::RAX) + reportFatalInternalError( + "Indirect tail calls with impcall enabled must go through RAX (as " + "enforced by TCRETURNImpCallri64)"); emitLabelAndRecordForImportCallOptimization( - IMAGE_RETPOLINE_AMD64_INDIR_BR); + IMAGE_RETPOLINE_AMD64_INDIR_BR_REX); + MCInst TmpInst; + MCInstLowering.Lower(MI, TmpInst); + emitCallInstruction(TmpInst); + + // Must be followed by 2 int3 instructions. + for (int i = 0; i < 2; ++i) + EmitAndCountInstruction(MCInstBuilder(X86::INT3)); + return; } OutStreamer->AddComment("TAILCALL"); @@ -2369,6 +2391,14 @@ void X86AsmPrinter::emitInstruction(const MachineInstr *MI) { emitLabelAndRecordForImportCallOptimization( (ImportCallKind)(IMAGE_RETPOLINE_AMD64_SWITCHTABLE_FIRST + EncodedReg)); + MCInst TmpInst; + MCInstLowering.Lower(MI, TmpInst); + emitCallInstruction(TmpInst); + + // Must be followed by 4 int3 instructions. + for (int i = 0; i < 4; ++i) + EmitAndCountInstruction(MCInstBuilder(X86::INT3)); + return; } break; @@ -2378,7 +2408,7 @@ void X86AsmPrinter::emitInstruction(const MachineInstr *MI) { case X86::JMP32m: case X86::JMP64m: if (EnableImportCallOptimization && hasJumpTableInfoInBlock(MI)) - report_fatal_error( + reportFatalInternalError( "Unexpected JMP instruction was emitted for a jump-table when import " "call optimization was enabled"); break; @@ -2550,29 +2580,19 @@ void X86AsmPrinter::emitInstruction(const MachineInstr *MI) { if (IndCSPrefix && MI->hasRegisterImplicitUseOperand(X86::R11)) EmitAndCountInstruction(MCInstBuilder(X86::CS_PREFIX)); - if (EnableImportCallOptimization && isImportedFunction(MI->getOperand(0))) { - emitLabelAndRecordForImportCallOptimization( - IMAGE_RETPOLINE_AMD64_IMPORT_CALL); - - MCInst TmpInst; - MCInstLowering.Lower(MI, TmpInst); - - // For Import Call Optimization to work, we need a the call instruction - // with a rex prefix, and a 5-byte nop after the call instruction. 
- EmitAndCountInstruction(MCInstBuilder(X86::REX64_PREFIX)); - emitCallInstruction(TmpInst); - emitNop(*OutStreamer, 5, Subtarget); - maybeEmitNopAfterCallForWindowsEH(MI); - return; - } + if (EnableImportCallOptimization && isImportedFunction(MI->getOperand(0))) + reportFatalInternalError( + "Calls to imported functions with import call optimization " + "should be lowered to CALL64m via CALL64_ImpCall"); break; case X86::CALL64r: if (EnableImportCallOptimization) { - assert(MI->getOperand(0).getReg() == X86::RAX && - "Indirect calls with impcall enabled must go through RAX (as " - "enforced by CALL64r_ImpCall)"); + if (MI->getOperand(0).getReg() != X86::RAX) + reportFatalInternalError( + "Indirect calls with import call optimization enabled must go " + "through RAX"); emitLabelAndRecordForImportCallOptimization( IMAGE_RETPOLINE_AMD64_INDIR_CALL); @@ -2589,9 +2609,33 @@ void X86AsmPrinter::emitInstruction(const MachineInstr *MI) { break; case X86::CALL64m: - if (EnableImportCallOptimization && isCallToCFGuardFunction(MI)) { - emitLabelAndRecordForImportCallOptimization( - IMAGE_RETPOLINE_AMD64_CFG_CALL); + if (EnableImportCallOptimization) { + if (isCallToCFGuardFunction(MI)) { + emitLabelAndRecordForImportCallOptimization( + IMAGE_RETPOLINE_AMD64_CFG_CALL); + } else if (isImportedFunction(MI->getOperand(3))) { + emitLabelAndRecordForImportCallOptimization( + IMAGE_RETPOLINE_AMD64_IMPORT_CALL); + + MCInst TmpInst; + MCInstLowering.Lower(MI, TmpInst); + + // For Import Call Optimization to work, we need a the call instruction + // with a rex prefix, and a 5-byte nop after the call instruction. + EmitAndCountInstruction(MCInstBuilder(X86::REX64_PREFIX)); + emitCallInstruction(TmpInst); + // MSVC Linker is *very* picky about the exact nop to use. + MCInst Nop = MCInstBuilder(X86::NOOPL) + .addReg(X86::RAX) + .addImm(1) + .addReg(X86::RAX) + .addImm(0) + .addReg(0); + Nop.setFlags(X86::IP_USE_DISP8); + EmitAndCountInstruction(Nop); + maybeEmitNopAfterCallForWindowsEH(MI); + return; + } } break; diff --git a/llvm/lib/Target/X86/X86RegisterInfo.cpp b/llvm/lib/Target/X86/X86RegisterInfo.cpp index 72f38133e21ff..5878a0f7a61d3 100644 --- a/llvm/lib/Target/X86/X86RegisterInfo.cpp +++ b/llvm/lib/Target/X86/X86RegisterInfo.cpp @@ -983,7 +983,6 @@ unsigned X86RegisterInfo::findDeadCallerSavedReg( case X86::TCRETURNmi: case X86::TCRETURNdi64: case X86::TCRETURNri64: - case X86::TCRETURNri64_ImpCall: case X86::TCRETURNmi64: case X86::TCRETURN_WINmi64: case X86::EH_RETURN: diff --git a/llvm/lib/Target/X86/X86RegisterInfo.td b/llvm/lib/Target/X86/X86RegisterInfo.td index 692e42ae5e752..a513371506038 100644 --- a/llvm/lib/Target/X86/X86RegisterInfo.td +++ b/llvm/lib/Target/X86/X86RegisterInfo.td @@ -728,10 +728,6 @@ def GR32_SIDI : RegisterClass<"X86", [i32], 32, (add ESI, EDI)>; def GR32_DIBP : RegisterClass<"X86", [i32], 32, (add EDI, EBP)>; def GR32_BPSP : RegisterClass<"X86", [i32], 32, (add EBP, ESP)>; -// Class to support Windows Import Call Optimization: all indirect jumps must -// happen through RAX. -def GR64_A : RegisterClass<"X86", [i64], 64, (add RAX)>; - // Scalar SSE2 floating point registers. 
def FR32 : RegisterClass<"X86", [f32], 32, (sequence "XMM%u", 0, 15)>; diff --git a/llvm/test/CodeGen/MIR2Vec/Inputs/reference_x86_vocab_print.txt b/llvm/test/CodeGen/MIR2Vec/Inputs/reference_x86_vocab_print.txt index 62e07445ad12e..f6bca667d5078 100644 --- a/llvm/test/CodeGen/MIR2Vec/Inputs/reference_x86_vocab_print.txt +++ b/llvm/test/CodeGen/MIR2Vec/Inputs/reference_x86_vocab_print.txt @@ -7001,7 +7001,6 @@ Key: PhyReg_GR64_with_sub_32bit_in_GR32_BSI: [ 0.00 0.00 ] Key: PhyReg_GR64_with_sub_32bit_in_GR32_CB: [ 0.00 0.00 ] Key: PhyReg_GR64_with_sub_32bit_in_GR32_DIBP: [ 0.00 0.00 ] Key: PhyReg_GR64_with_sub_32bit_in_GR32_SIDI: [ 0.00 0.00 ] -Key: PhyReg_GR64_A: [ 0.00 0.00 ] Key: PhyReg_GR64_ArgRef_and_GR64_TC: [ 0.00 0.00 ] Key: PhyReg_GR64_and_LOW32_ADDR_ACCESS: [ 0.00 0.00 ] Key: PhyReg_GR64_with_sub_32bit_in_GR32_ABCD_and_GR32_BSI: [ 0.00 0.00 ] @@ -7136,7 +7135,6 @@ Key: VirtReg_GR64_with_sub_32bit_in_GR32_BSI: [ 0.00 0.00 ] Key: VirtReg_GR64_with_sub_32bit_in_GR32_CB: [ 0.00 0.00 ] Key: VirtReg_GR64_with_sub_32bit_in_GR32_DIBP: [ 0.00 0.00 ] Key: VirtReg_GR64_with_sub_32bit_in_GR32_SIDI: [ 0.00 0.00 ] -Key: VirtReg_GR64_A: [ 0.00 0.00 ] Key: VirtReg_GR64_ArgRef_and_GR64_TC: [ 0.00 0.00 ] Key: VirtReg_GR64_and_LOW32_ADDR_ACCESS: [ 0.00 0.00 ] Key: VirtReg_GR64_with_sub_32bit_in_GR32_ABCD_and_GR32_BSI: [ 0.00 0.00 ] diff --git a/llvm/test/CodeGen/MIR2Vec/Inputs/reference_x86_vocab_wo=0.5_print.txt b/llvm/test/CodeGen/MIR2Vec/Inputs/reference_x86_vocab_wo=0.5_print.txt index 03a3fafc6b801..4da4adf82951e 100644 --- a/llvm/test/CodeGen/MIR2Vec/Inputs/reference_x86_vocab_wo=0.5_print.txt +++ b/llvm/test/CodeGen/MIR2Vec/Inputs/reference_x86_vocab_wo=0.5_print.txt @@ -7001,7 +7001,6 @@ Key: PhyReg_GR64_with_sub_32bit_in_GR32_BSI: [ 0.00 0.00 ] Key: PhyReg_GR64_with_sub_32bit_in_GR32_CB: [ 0.00 0.00 ] Key: PhyReg_GR64_with_sub_32bit_in_GR32_DIBP: [ 0.00 0.00 ] Key: PhyReg_GR64_with_sub_32bit_in_GR32_SIDI: [ 0.00 0.00 ] -Key: PhyReg_GR64_A: [ 0.00 0.00 ] Key: PhyReg_GR64_ArgRef_and_GR64_TC: [ 0.00 0.00 ] Key: PhyReg_GR64_and_LOW32_ADDR_ACCESS: [ 0.00 0.00 ] Key: PhyReg_GR64_with_sub_32bit_in_GR32_ABCD_and_GR32_BSI: [ 0.00 0.00 ] @@ -7136,7 +7135,6 @@ Key: VirtReg_GR64_with_sub_32bit_in_GR32_BSI: [ 0.00 0.00 ] Key: VirtReg_GR64_with_sub_32bit_in_GR32_CB: [ 0.00 0.00 ] Key: VirtReg_GR64_with_sub_32bit_in_GR32_DIBP: [ 0.00 0.00 ] Key: VirtReg_GR64_with_sub_32bit_in_GR32_SIDI: [ 0.00 0.00 ] -Key: VirtReg_GR64_A: [ 0.00 0.00 ] Key: VirtReg_GR64_ArgRef_and_GR64_TC: [ 0.00 0.00 ] Key: VirtReg_GR64_and_LOW32_ADDR_ACCESS: [ 0.00 0.00 ] Key: VirtReg_GR64_with_sub_32bit_in_GR32_ABCD_and_GR32_BSI: [ 0.00 0.00 ] diff --git a/llvm/test/CodeGen/X86/win-import-call-optimization-cfguard.ll b/llvm/test/CodeGen/X86/win-import-call-optimization-cfguard.ll index 12be910d68ee9..39d5a2596b5b6 100644 --- a/llvm/test/CodeGen/X86/win-import-call-optimization-cfguard.ll +++ b/llvm/test/CodeGen/X86/win-import-call-optimization-cfguard.ll @@ -1,33 +1,151 @@ -; RUN: llc -mtriple=x86_64-pc-windows-msvc < %s | FileCheck %s --check-prefix=CHECK +; RUN: llc -mtriple=x86_64-pc-windows-msvc -o - %s | FileCheck %s --check-prefix ASM +; RUN: llc --fast-isel -mtriple=x86_64-pc-windows-msvc -o - %s | FileCheck %s --check-prefix ASM +; RUN: llc --global-isel --global-isel-abort=2 -mtriple=x86_64-pc-windows-msvc -o - %s | \ +; RUN: FileCheck %s --check-prefix ASM +; RUN: llc -mtriple=x86_64-pc-windows-msvc --filetype=obj -o - %s | llvm-objdump - --disassemble \ +; RUN: | FileCheck %s --check-prefix OBJ + +@global_func_ptr = external 
dso_local local_unnamed_addr global ptr, align 8 +declare dllimport void @a() local_unnamed_addr +declare dllimport void @b() local_unnamed_addr +declare dso_local i32 @__C_specific_handler(...) define dso_local void @normal_call(ptr noundef readonly %func_ptr) local_unnamed_addr section "nc_sect" { entry: + call void @a() + call void @a() call void %func_ptr() + %0 = load ptr, ptr @global_func_ptr, align 8 + call void %0() + ret void +} +; ASM-LABEL: normal_call: +; ASM: movq %rcx, %rsi +; ASM-NEXT: .Limpcall0: +; ASM-NEXT: rex64 +; ASM-NEXT: callq *__imp_a(%rip) +; ASM-NEXT: nopl (%rax,%rax) +; ASM-NEXT: .Limpcall1: +; ASM-NEXT: rex64 +; ASM-NEXT: callq *__imp_a(%rip) +; ASM-NEXT: nopl (%rax,%rax) +; ASM-NEXT: movq %rsi, %rax +; ASM-NEXT: .Limpcall2: +; ASM-NEXT: callq *__guard_dispatch_icall_fptr(%rip) +; ASM-NEXT: movq global_func_ptr(%rip), %rax +; ASM-NEXT: .Limpcall3: +; ASM-NEXT: callq *__guard_dispatch_icall_fptr(%rip) +; ASM-NEXT: nop + +define dso_local void @tail_call() local_unnamed_addr section "tc_sect" { +entry: + tail call void @b() ret void } -; CHECK-LABEL: normal_call: -; CHECK: .Limpcall0: -; CHECK-NEXT: callq *__guard_dispatch_icall_fptr(%rip) +; ASM-LABEL: tail_call: +; ASM: .Limpcall4: +; ASM-NEXT: rex64 jmpq *__imp_b(%rip) define dso_local void @tail_call_fp(ptr noundef readonly %func_ptr) local_unnamed_addr section "tc_sect" { entry: tail call void %func_ptr() ret void } -; CHECK-LABEL: tail_call_fp: -; CHECK: .Limpcall1: -; CHECK-NEXT: rex64 jmpq *__guard_dispatch_icall_fptr(%rip) - -; CHECK-LABEL .section .retplne,"yi" -; CHECK-NEXT .asciz "RetpolineV1" -; CHECK-NEXT .long 16 -; CHECK-NEXT .secnum tc_sect -; CHECK-NEXT .long 10 -; CHECK-NEXT .secoffset .Limpcall1 -; CHECK-NEXT .long 16 -; CHECK-NEXT .secnum nc_sect -; CHECK-NEXT .long 9 -; CHECK-NEXT .secoffset .Limpcall0 +; ASM-LABEL: tail_call_fp: +; ASM: movq %rcx, %rax +; ASM-NEXT: .Limpcall5: +; ASM-NEXT: rex64 jmpq *__guard_dispatch_icall_fptr(%rip) + +define dso_local void @tail_call_global_fp(ptr noundef readonly %func_ptr) local_unnamed_addr section "tc_sect" { +entry: + %0 = load ptr, ptr @global_func_ptr, align 8 + tail call void %0() + ret void +} +; ASM-LABEL: tail_call_global_fp: +; ASM: movq global_func_ptr(%rip), %rax +; ASM-NEXT: .Limpcall6: +; ASM-NEXT: rex64 jmpq *__guard_dispatch_icall_fptr(%rip) + +; Regression test: the call to the CFG Guard was being indirected via a register, which is not +; permitted when retpoline is enabled. 
+define dso_local i32 @might_call_global_func_ptr(ptr %0, ptr %1, i32 %2) { +3: + %4 = icmp eq i32 %2, 0 + br i1 %4, label %5, label %8 + +5: ; preds = %11 + %6 = load ptr, ptr @global_func_ptr, align 8 + %7 = tail call i32 %6(ptr noundef %1) + br label %8 + +8: + %9 = phi i32 [ %7, %5 ], [ -1, %3 ] + ret i32 %9 +} +; ASM-LABEL: might_call_global_func_ptr: +; ASM: movq global_func_ptr(%rip), %rax +; ASM-NEXT: movq %rdx, %rcx +; ASM-NEXT: .Limpcall7: +; ASM-NEXT: rex64 jmpq *__guard_dispatch_icall_fptr(%rip) + +define dso_local void @invoke_many_args(ptr %0, ptr %1, ptr %2) personality ptr @__C_specific_handler { + %4 = alloca ptr, align 8 + %5 = alloca ptr, align 8 + %6 = alloca ptr, align 8 + invoke void %0(ptr %1, ptr %2, ptr %4, ptr %5, ptr %6) + to label %7 unwind label %8 + +7: + ret void + +8: + %9 = cleanuppad within none [] + cleanupret from %9 unwind to caller +} +; ASM-LABEL: invoke_many_args: +; ASM: .Limpcall8: +; ASM-NEXT: callq *__guard_dispatch_icall_fptr(%rip) +; ASM-NEXT: nop + +; ASM-LABEL .section .retplne,"yi" +; ASM-NEXT .asciz "RetpolineV1" +; ASM-NEXT .long 24 +; ASM-NEXT .secnum .text +; ASM-NEXT .long 10 +; ASM-NEXT .secoffset .Limpcall7 +; ASM-NEXT .long 9 +; ASM-NEXT .secoffset .Limpcall8 +; ASM-NEXT .long 40 +; ASM-NEXT .secnum nc_sect +; ASM-NEXT .long 3 +; ASM-NEXT .secoffset .Limpcall0 +; ASM-NEXT .long 3 +; ASM-NEXT .secoffset .Limpcall1 +; ASM-NEXT .long 9 +; ASM-NEXT .secoffset .Limpcall2 +; ASM-NEXT .long 9 +; ASM-NEXT .secoffset .Limpcall3 +; ASM-NEXT .long 32 +; ASM-NEXT .secnum tc_sect +; ASM-NEXT .long 2 +; ASM-NEXT .secoffset .Limpcall4 +; ASM-NEXT .long 10 +; ASM-NEXT .secoffset .Limpcall5 +; ASM-NEXT .long 10 +; ASM-NEXT .secoffset .Limpcall6 + +; The loader assumes an exact sequence of instructions/bytes at each marked site since it may +; replace the instruction(s) with new instruction(s), and the MSVC linker validates these at link +; time. 
+ +; Kind = 9 (IMAGE_RETPOLINE_AMD64_CFG_CALL) +; OBJ-LABEL: <normal_call>: +; OBJ: : ff 15 00 00 00 00 callq *(%rip) + +; Kind = 10 (IMAGE_RETPOLINE_AMD64_CFG_BR_REX) +; OBJ-LABEL: <tc_sect>: +; OBJ: : 48 ff 25 00 00 00 00 jmpq *(%rip) !llvm.module.flags = !{!0, !1} !0 = !{i32 1, !"import-call-optimization", i32 1} diff --git a/llvm/test/CodeGen/X86/win-import-call-optimization-jumptable.ll b/llvm/test/CodeGen/X86/win-import-call-optimization-jumptable.ll index fe22b251685e6..fb628fc34bdb5 100644 --- a/llvm/test/CodeGen/X86/win-import-call-optimization-jumptable.ll +++ b/llvm/test/CodeGen/X86/win-import-call-optimization-jumptable.ll @@ -1,8 +1,15 @@ -; RUN: llc -mtriple=x86_64-pc-windows-msvc < %s | FileCheck %s - -; CHECK-LABEL: uses_rax: -; CHECK: .Limpcall0: -; CHECK-NEXT: jmpq *%rax +; RUN: llc -mtriple=x86_64-pc-windows-msvc < %s | FileCheck %s --check-prefix ASM +; RUN: llc --fast-isel -mtriple=x86_64-pc-windows-msvc -o - %s | FileCheck %s --check-prefix ASM +; RUN: llc -mtriple=x86_64-pc-windows-msvc --filetype=obj -o - %s | llvm-objdump - --disassemble \ +; RUN: | FileCheck %s --check-prefix OBJ + +; ASM-LABEL: uses_rax: +; ASM: .Limpcall0: +; ASM-NEXT: jmpq *%rax +; ASM-NEXT: int3 +; ASM-NEXT: int3 +; ASM-NEXT: int3 +; ASM-NEXT: int3 define void @uses_rax(i32 %x) { entry: @@ -34,9 +41,13 @@ sw.epilog: ret void } -; CHECK-LABEL: uses_rcx: -; CHECK: .Limpcall1: -; CHECK-NEXT: jmpq *%rcx +; ASM-LABEL: uses_rcx: +; ASM: .Limpcall1: +; ASM-NEXT: jmpq *%rcx +; ASM-NEXT: int3 +; ASM-NEXT: int3 +; ASM-NEXT: int3 +; ASM-NEXT: int3 define void @uses_rcx(i32 %x) { entry: @@ -70,14 +81,32 @@ sw.epilog: declare void @g(i32) -; CHECK-LABEL: .section .retplne,"yi" -; CHECK-NEXT: .asciz "RetpolineV1" -; CHECK-NEXT: .long 24 -; CHECK-NEXT: .secnum .text -; CHECK-NEXT: .long 16 -; CHECK-NEXT: .secoffset .Limpcall0 -; CHECK-NEXT: .long 17 -; CHECK-NEXT: .secoffset .Limpcall1 +; ASM-LABEL: .section .retplne,"yi" +; ASM-NEXT: .asciz "RetpolineV1" +; ASM-NEXT: .long 24 +; ASM-NEXT: .secnum .text +; ASM-NEXT: .long 16 +; ASM-NEXT: .secoffset .Limpcall0 +; ASM-NEXT: .long 17 +; ASM-NEXT: .secoffset .Limpcall1 + +; The loader assumes an exact sequence of instructions/bytes at each marked site since it may +; replace the instruction(s) with new instruction(s), and the MSVC linker validates these at link +; time. 
+ +; Kind = 16-31 (IMAGE_RETPOLINE_AMD64_SWITCHTABLE_*) +; OBJ-LABEL: <uses_rax>: +; OBJ: : ff e0 jmpq *%rax +; OBJ-NEXT: : cc int3 +; OBJ-NEXT: : cc int3 +; OBJ-NEXT: : cc int3 +; OBJ-NEXT: : cc int3 +; OBJ-LABEL: <uses_rcx>: +; OBJ: : ff e1 jmpq *%rcx +; OBJ-NEXT: : cc int3 +; OBJ-NEXT: : cc int3 +; OBJ-NEXT: : cc int3 +; OBJ-NEXT: : cc int3 !llvm.module.flags = !{!0} !0 = !{i32 1, !"import-call-optimization", i32 1} diff --git a/llvm/test/CodeGen/X86/win-import-call-optimization.ll b/llvm/test/CodeGen/X86/win-import-call-optimization.ll index cc7e1a9f81e34..0d62779cb444b 100644 --- a/llvm/test/CodeGen/X86/win-import-call-optimization.ll +++ b/llvm/test/CodeGen/X86/win-import-call-optimization.ll @@ -1,67 +1,189 @@ -; RUN: llc -mtriple=x86_64-pc-windows-msvc < %s | FileCheck %s --check-prefix=CHECK -; RUN: llc --fast-isel -mtriple=x86_64-pc-windows-msvc < %s | FileCheck %s --check-prefix=CHECK -; RUN: llc --global-isel --global-isel-abort=2 -mtriple=x86_64-pc-windows-msvc < %s | FileCheck %s --check-prefix=CHECK +; RUN: llc -mtriple=x86_64-pc-windows-msvc -o - %s | FileCheck %s --check-prefix ASM +; RUN: llc --fast-isel -mtriple=x86_64-pc-windows-msvc -o - %s | FileCheck %s --check-prefix ASM +; RUN: llc --global-isel --global-isel-abort=2 -mtriple=x86_64-pc-windows-msvc -o - %s | \ +; RUN: FileCheck %s --check-prefix ASM +; RUN: llc -mtriple=x86_64-pc-windows-msvc --filetype=obj -o - %s | llvm-objdump - --disassemble \ +; RUN: | FileCheck %s --check-prefix OBJ + +@global_func_ptr = external dso_local local_unnamed_addr global ptr, align 8 define dso_local void @normal_call(ptr noundef readonly %func_ptr) local_unnamed_addr section "nc_sect" { entry: call void @a() call void @a() call void %func_ptr() + %0 = load ptr, ptr @global_func_ptr, align 8 + call void %0() ret void } -; CHECK-LABEL: normal_call: -; CHECK: .Limpcall0: -; CHECK-NEXT: rex64 -; CHECK-NEXT: callq __imp_a -; CHECK-NEXT: nopl 8(%rax,%rax) -; CHECK-NEXT: .Limpcall1: -; CHECK-NEXT: rex64 -; CHECK-NEXT: callq __imp_a -; CHECK-NEXT: nopl 8(%rax,%rax) -; CHECK-NEXT: movq %rsi, %rax -; CHECK-NEXT: .Limpcall2: -; CHECK-NEXT: callq *%rax -; CHECK-NEXT: nopl (%rax) -; CHECK-NEXT: nop +; ASM-LABEL: normal_call: +; ASM: .Limpcall0: +; ASM-NEXT: rex64 +; ASM-NEXT: callq *__imp_a(%rip) +; ASM-NEXT: nopl (%rax,%rax) +; ASM-NEXT: .Limpcall1: +; ASM-NEXT: rex64 +; ASM-NEXT: callq *__imp_a(%rip) +; ASM-NEXT: nopl (%rax,%rax) +; ASM-NEXT: movq %rsi, %rax +; ASM-NEXT: .Limpcall2: +; ASM-NEXT: callq *%rax +; ASM-NEXT: nopl (%rax) +; ASM-NEXT: movq global_func_ptr(%rip), %rax +; ASM-NEXT: .Limpcall3: +; ASM-NEXT: callq *%rax +; ASM-NEXT: nopl (%rax) +; ASM-NEXT: nop define dso_local void @tail_call() local_unnamed_addr section "tc_sect" { entry: tail call void @b() ret void } -; CHECK-LABEL: tail_call: -; CHECK: .Limpcall3: -; CHECK-NEXT: jmp __imp_b +; ASM-LABEL: tail_call: +; ASM: .Limpcall4: +; ASM-NEXT: rex64 jmpq *__imp_b(%rip) +; ASM-NEXT: int3 +; ASM-NEXT: int3 +; ASM-NEXT: int3 +; ASM-NEXT: int3 +; ASM-NEXT: int3 define dso_local void @tail_call_fp(ptr noundef readonly %func_ptr) local_unnamed_addr section "tc_sect" { entry: tail call void %func_ptr() ret void } -; CHECK-LABEL: tail_call_fp: -; CHECK: movq %rcx, %rax -; CHECK-NEXT: .Limpcall4: -; CHECK-NEXT: rex64 jmpq *%rax +; ASM-LABEL: tail_call_fp: +; ASM: movq %rcx, %rax +; ASM-NEXT: .Limpcall5: +; ASM-NEXT: rex64 jmpq *%rax +; ASM-NEXT: int3 +; ASM-NEXT: int3 + +define dso_local void @tail_call_global_fp(ptr noundef readonly %func_ptr) local_unnamed_addr section "tc_sect" { 
+entry: + %0 = load ptr, ptr @global_func_ptr, align 8 + tail call void %0() + ret void +} +; ASM-LABEL: tail_call_global_fp: +; ASM: movq global_func_ptr(%rip), %rax +; ASM-NEXT: .Limpcall6: +; ASM-NEXT: rex64 jmpq *%rax +; ASM-NEXT: int3 +; ASM-NEXT: int3 + +; Regression test: conditional tail calls can't be encoded, so make sure they aren't emitted. +define void @might_call(i1 %4) local_unnamed_addr { + br i1 %4, label %makecall, label %finish + +makecall: + tail call void @a() + br label %finish + +finish: + ret void +} +; ASM-LABEL: might_call: +; ASM: .Limpcall7: +; ASM-NEXT: rex64 jmpq *__imp_a(%rip) +; ASM-NEXT: int3 +; ASM-NEXT: int3 +; ASM-NEXT: int3 +; ASM-NEXT: int3 +; ASM-NEXT: int3 + +; Regression test: this particular sequence caused a cycle in DAG scheduling due +; to the requirement to use RAX for register-indirect calls. We now explicitly +; copy to RAX which breaks the cycle. +define dso_local i32 @not_scheduled_repro(ptr %0, ptr %1, ptr %2) local_unnamed_addr { + %4 = load i64, ptr %0, align 8 + %5 = inttoptr i64 %4 to ptr + %6 = tail call i64 %5(ptr noundef %1) + store i64 %6, ptr %2, align 8 + ret i32 0 +} +; ASM-LABEL: not_scheduled_repro: +; ASM: movq (%rcx), %rax +; ASM-NEXT: movq %rdx, %rcx +; ASM-NEXT: .Limpcall8: +; ASM-NEXT: callq *%rax +; ASM-NEXT: nopl (%rax) + +define dso_local void @not_scheduled_repro_tc(ptr %0, ptr %1) local_unnamed_addr { + %4 = load i64, ptr %0, align 8 + %5 = inttoptr i64 %4 to ptr + tail call void %5(ptr noundef %1) + ret void +} +; ASM-LABEL: not_scheduled_repro_tc: +; ASM: movq (%rcx), %rax +; ASM-NEXT: movq %rdx, %rcx +; ASM-NEXT: .Limpcall9: +; ASM-NEXT: rex64 jmpq *%rax +; ASM-NEXT: int3 +; ASM-NEXT: int3 declare dllimport void @a() local_unnamed_addr declare dllimport void @b() local_unnamed_addr -; CHECK-LABEL .section .retplne,"yi" -; CHECK-NEXT .asciz "RetpolineV1" -; CHECK-NEXT .long 24 -; CHECK-NEXT .secnum tc_sect -; CHECK-NEXT .long 3 -; CHECK-NEXT .secoffset .Limpcall3 -; CHECK-NEXT .long 5 -; CHECK-NEXT .secoffset .Limpcall4 -; CHECK-NEXT .long 32 -; CHECK-NEXT .secnum nc_sect -; CHECK-NEXT .long 3 -; CHECK-NEXT .secoffset .Limpcall0 -; CHECK-NEXT .long 3 -; CHECK-NEXT .secoffset .Limpcall1 -; CHECK-NEXT .long 5 -; CHECK-NEXT .secoffset .Limpcall2 +; ASM-LABEL .section .retplne,"yi" +; ASM-NEXT .asciz "RetpolineV1" +; ASM-NEXT .long 32 +; ASM-NEXT .secnum tc_sect +; ASM-NEXT .long 2 +; ASM-NEXT .secoffset .Limpcall4 +; ASM-NEXT .long 6 +; ASM-NEXT .secoffset .Limpcall5 +; ASM-NEXT .long 6 +; ASM-NEXT .secoffset .Limpcall6 +; ASM-NEXT .long 40 +; ASM-NEXT .secnum nc_sect +; ASM-NEXT .long 3 +; ASM-NEXT .secoffset .Limpcall0 +; ASM-NEXT .long 3 +; ASM-NEXT .secoffset .Limpcall1 +; ASM-NEXT .long 5 +; ASM-NEXT .secoffset .Limpcall2 +; ASM-NEXT .long 5 +; ASM-NEXT .secoffset .Limpcall3 +; ASM-NEXT .long 32 +; ASM-NEXT .secnum .text +; ASM-NEXT .long 2 +; ASM-NEXT .secoffset .Limpcall7 +; ASM-NEXT .long 5 +; ASM-NEXT .secoffset .Limpcall8 +; ASM-NEXT .long 6 +; ASM-NEXT .secoffset .Limpcall9 + +; The loader assumes an exact sequence of instructions/bytes at each marked site since it may +; replace the instruction(s) with new instruction(s), and the MSVC linker validates these at link +; time. 
+ + ; Kind = 3 (IMAGE_RETPOLINE_AMD64_IMPORT_CALL) + ; OBJ-LABEL: <normal_call>: + ; OBJ: : 48 ff 15 00 00 00 00 callq *(%rip) + ; OBJ-NEXT: : 0f 1f 44 00 00 nopl (%rax,%rax) + + ; Kind = 5 (IMAGE_RETPOLINE_AMD64_INDIR_CALL) + ; OBJ: : ff d0 callq *%rax + ; OBJ-NEXT: : 0f 1f 00 nopl (%rax) + + ; Kind = 2 (IMAGE_RETPOLINE_AMD64_IMPORT_BR) + ; OBJ-LABEL: <tc_sect>: + ; OBJ: : 48 ff 25 00 00 00 00 jmpq *(%rip) + ; OBJ-NEXT: : cc int3 + ; OBJ-NEXT: : cc int3 + ; OBJ-NEXT: : cc int3 + ; OBJ-NEXT: : cc int3 + ; OBJ-NEXT: : cc int3 + + ; Kind = 6 (IMAGE_RETPOLINE_AMD64_INDIR_BR) + ; OBJ-LABEL: <tail_call_fp>: + ; OBJ: : 48 ff e0 jmpq *%rax + ; OBJ-NEXT: : cc int3 + ; OBJ-NEXT: : cc int3 !llvm.module.flags = !{!0} !0 = !{i32 1, !"import-call-optimization", i32 1}
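For reference, a hedged reading of the .retplne metadata that the ASM checks above spell out (the layout is inferred from the test expectations in this patch, not from a separate specification): the section begins with the "RetpolineV1" version string, followed by one block per code section; each block appears to start with its byte size and a .secnum directive naming the section, then lists one (kind, .secoffset) pair per marked call site, where the kinds match the IMAGE_RETPOLINE_AMD64_* values noted in the OBJ comments (3 = IMPORT_CALL, 5 = INDIR_CALL, 2 = IMPORT_BR, 6 = INDIR_BR, 9 = CFG_CALL, 10 = CFG_BR_REX, 16-31 = SWITCHTABLE_*). An illustrative excerpt with two entries:

  .section   .retplne,"yi"
  .asciz     "RetpolineV1"
  .long      24                  # appears to be the block's byte size (8 + 8 per entry)
  .secnum    tc_sect             # code section the entries below refer to
  .long      2                   # kind: IMAGE_RETPOLINE_AMD64_IMPORT_BR
  .secoffset .Limpcall4          # label of the marked call site
  .long      6                   # kind: IMAGE_RETPOLINE_AMD64_INDIR_BR
  .secoffset .Limpcall5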
