https://github.com/Saieiei updated https://github.com/llvm/llvm-project/pull/183108
>From 540e92d408de1d1b30ce083df748ecd28e5b8336 Mon Sep 17 00:00:00 2001 From: Sairudra More <[email protected]> Date: Tue, 24 Feb 2026 11:21:17 -0600 Subject: [PATCH] [flang] Add runtime trampoline pool for W^X compliance Replace LLVM's llvm.init.trampoline / llvm.adjust.trampoline intrinsics with a runtime-managed trampoline pool that keeps the stack non-executable, satisfying W^X (Write XOR Execute) policies enforced by modern toolchains and security-hardened kernels. The new -fenable-runtime-trampoline flag activates this path in BoxedProcedure, which emits calls to _FortranATrampolineInit, _FortranATrampolineAdjust, and _FortranATrampolineFree instead of the legacy LLVM trampoline intrinsics. Closes #182813 --- clang/include/clang/Options/Options.td | 5 + clang/lib/Driver/ToolChains/Flang.cpp | 14 + .../include/flang-rt/runtime/trampoline.h | 69 +++ flang-rt/lib/runtime/CMakeLists.txt | 1 + flang-rt/lib/runtime/trampoline.cpp | 425 ++++++++++++++++ .../Driver/runtime-trampoline-gnustack.f90 | 45 ++ flang/docs/RuntimeEnvironment.md | 12 + .../include/flang/Frontend/CodeGenOptions.def | 1 + .../Optimizer/Builder/Runtime/RTBuilder.h | 4 + .../Optimizer/Builder/Runtime/Trampoline.h | 47 ++ .../flang/Optimizer/CodeGen/CGPasses.td | 16 +- .../flang/Optimizer/Passes/CommandLineOpts.h | 1 + .../flang/Optimizer/Passes/Pipelines.h | 3 +- flang/include/flang/Runtime/trampoline.h | 69 +++ flang/include/flang/Tools/CrossToolHelpers.h | 2 + flang/lib/Frontend/CompilerInvocation.cpp | 4 + flang/lib/Optimizer/Builder/CMakeLists.txt | 1 + .../Optimizer/Builder/Runtime/Trampoline.cpp | 49 ++ .../lib/Optimizer/CodeGen/BoxedProcedure.cpp | 464 ++++++++++-------- .../lib/Optimizer/Passes/CommandLineOpts.cpp | 2 + flang/lib/Optimizer/Passes/Pipelines.cpp | 15 +- flang/test/Driver/fruntime-trampoline.f90 | 15 + flang/test/Fir/boxproc-runtime-trampoline.fir | 67 +++ flang/test/Lower/runtime-trampoline.f90 | 41 ++ 24 files changed, 1170 insertions(+), 202 deletions(-) create mode 
100644 flang-rt/include/flang-rt/runtime/trampoline.h create mode 100644 flang-rt/lib/runtime/trampoline.cpp create mode 100644 flang-rt/test/Driver/runtime-trampoline-gnustack.f90 create mode 100644 flang/include/flang/Optimizer/Builder/Runtime/Trampoline.h create mode 100644 flang/include/flang/Runtime/trampoline.h create mode 100644 flang/lib/Optimizer/Builder/Runtime/Trampoline.cpp create mode 100644 flang/test/Driver/fruntime-trampoline.f90 create mode 100644 flang/test/Fir/boxproc-runtime-trampoline.fir create mode 100644 flang/test/Lower/runtime-trampoline.f90 diff --git a/clang/include/clang/Options/Options.td b/clang/include/clang/Options/Options.td index 4ac812e92e2cb..859292d3fc6ab 100644 --- a/clang/include/clang/Options/Options.td +++ b/clang/include/clang/Options/Options.td @@ -7567,6 +7567,11 @@ defm stack_arrays : BoolOptionWithoutMarshalling<"f", "stack-arrays", PosFlag<SetTrue, [], [ClangOption], "Attempt to allocate array temporaries on the stack, no matter their size">, NegFlag<SetFalse, [], [ClangOption], "Allocate array temporaries on the heap (default)">>; +defm runtime_trampoline : BoolOptionWithoutMarshalling<"f", + "runtime-trampoline", + PosFlag<SetTrue, [], [ClangOption], "Use W^X compliant runtime trampoline pool for internal procedures">, + NegFlag<SetFalse, [], [ClangOption], "Use stack-based trampolines for internal procedures (default)">>; + defm loop_versioning : BoolOptionWithoutMarshalling<"f", "version-loops-for-stride", PosFlag<SetTrue, [], [ClangOption], "Create unit-strided versions of loops">, NegFlag<SetFalse, [], [ClangOption], "Do not create unit-strided loops (default)">>; diff --git a/clang/lib/Driver/ToolChains/Flang.cpp b/clang/lib/Driver/ToolChains/Flang.cpp index 8425f8fec62a4..c90a2d1a5d947 100644 --- a/clang/lib/Driver/ToolChains/Flang.cpp +++ b/clang/lib/Driver/ToolChains/Flang.cpp @@ -203,6 +203,20 @@ void Flang::addCodegenOptions(const ArgList &Args, 
!stackArrays->getOption().matches(options::OPT_fno_stack_arrays)) CmdArgs.push_back("-fstack-arrays"); + if (Args.hasFlag(options::OPT_fruntime_trampoline, + options::OPT_fno_runtime_trampoline, false)) { + const llvm::Triple &T = getToolChain().getTriple(); + if (T.getArch() == llvm::Triple::x86_64 || + T.getArch() == llvm::Triple::aarch64 || + T.getArch() == llvm::Triple::aarch64_be) { + CmdArgs.push_back("-fruntime-trampoline"); + } else { + getToolChain().getDriver().Diag( + diag::warn_drv_unsupported_option_for_target) + << "-fruntime-trampoline" << T.str(); + } + } + // -fno-protect-parens is the default for -Ofast. if (!Args.hasFlag(options::OPT_fprotect_parens, options::OPT_fno_protect_parens, diff --git a/flang-rt/include/flang-rt/runtime/trampoline.h b/flang-rt/include/flang-rt/runtime/trampoline.h new file mode 100644 index 0000000000000..3b3ddff7a0587 --- /dev/null +++ b/flang-rt/include/flang-rt/runtime/trampoline.h @@ -0,0 +1,69 @@ +//===-- flang-rt/runtime/trampoline.h ----------------------------*- C++-*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// Internal declarations for the W^X-compliant trampoline pool. +// +//===----------------------------------------------------------------------===// + +#ifndef FLANG_RT_RUNTIME_TRAMPOLINE_H_ +#define FLANG_RT_RUNTIME_TRAMPOLINE_H_ + +#include <cstddef> +#include <cstdint> + +namespace Fortran::runtime::trampoline { + +/// Per-trampoline data entry. Stored in a writable (non-executable) region. +/// Each entry is paired with a trampoline code stub in the executable region. +struct TrampolineData { + const void *calleeAddress; + const void *staticChainAddress; +}; + +/// Default number of trampoline slots in the pool. 
+/// Can be overridden via FLANG_TRAMPOLINE_POOL_SIZE environment variable. +constexpr std::size_t kDefaultPoolSize = 1024; + +/// Size of each trampoline code stub in bytes (platform-specific). +#if defined(__x86_64__) || defined(_M_X64) +// x86-64 trampoline stub: +// movq TDATA_OFFSET(%rip), %r10 # load static chain from TDATA +// movabsq $0, %r11 # placeholder for callee address +// jmpq *%r11 +// Actually we use an indirect approach through the TDATA pointer: +// movq (%r10), %r10 # load static chain (8 bytes) +// -- but we need the TDATA pointer first +// Simplified approach for x86-64: +// leaq tdata_entry(%rip), %r11 # get TDATA entry address +// movq 8(%r11), %r10 # load static chain +// jmpq *(%r11) # jump to callee +constexpr std::size_t kTrampolineStubSize = 32; +constexpr int kNestRegister = 10; // %r10 is the nest/static chain register +#elif defined(__aarch64__) || defined(_M_ARM64) +// AArch64 trampoline stub: +// adr x17, tdata_entry # get TDATA entry address +// ldr x18, [x17, #8] # load static chain +// ldr x17, [x17] # load callee address +// br x17 +constexpr std::size_t kTrampolineStubSize = 32; +constexpr int kNestRegister = 18; // x18 is the platform register +#elif defined(__powerpc64__) || defined(__ppc64__) +constexpr std::size_t kTrampolineStubSize = 48; +constexpr int kNestRegister = 11; // r11 +#else +// Fallback: generous size +constexpr std::size_t kTrampolineStubSize = 64; +constexpr int kNestRegister = 0; +#endif + +/// Alignment requirement for trampoline code stubs. 
+constexpr std::size_t kTrampolineAlignment = 16; + +} // namespace Fortran::runtime::trampoline + +#endif // FLANG_RT_RUNTIME_TRAMPOLINE_H_ diff --git a/flang-rt/lib/runtime/CMakeLists.txt b/flang-rt/lib/runtime/CMakeLists.txt index 9fa8376e9b99c..d5e89a169255c 100644 --- a/flang-rt/lib/runtime/CMakeLists.txt +++ b/flang-rt/lib/runtime/CMakeLists.txt @@ -88,6 +88,7 @@ set(host_sources stop.cpp temporary-stack.cpp time-intrinsic.cpp + trampoline.cpp unit-map.cpp ) if (TARGET llvm-libc-common-utilities) diff --git a/flang-rt/lib/runtime/trampoline.cpp b/flang-rt/lib/runtime/trampoline.cpp new file mode 100644 index 0000000000000..a1f6fa2cea783 --- /dev/null +++ b/flang-rt/lib/runtime/trampoline.cpp @@ -0,0 +1,425 @@ +//===-- lib/runtime/trampoline.cpp -------------------------------*- C++-*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// W^X-compliant trampoline pool implementation. +// +// This file implements a runtime trampoline pool that maintains separate +// memory regions for executable code (RX) and writable data (RW). +// +// On Linux the code region transitions RW → RX (never simultaneously W+X). +// On macOS Apple Silicon the code region uses MAP_JIT with per-thread W^X +// toggling via pthread_jit_write_protect_np, so the mapping permissions +// include both W and X but hardware enforces that only one is active at +// a time on any given thread. +// +// Architecture: +// - Code region (RX): Contains pre-assembled trampoline stubs that load +// callee address and static chain from a paired TDATA entry, then jump +// to the callee with the static chain in the appropriate register. 
+// - Data region (RW): Contains TrampolineData entries with {callee_address, +// static_chain_address} pairs, one per trampoline slot. +// - Free list: Tracks available trampoline slots for O(1) alloc/free. +// +// Thread safety: Uses Fortran::runtime::Lock (pthreads on POSIX, +// CRITICAL_SECTION on Windows) — not std::mutex — to avoid C++ runtime +// library dependence. A single global lock serializes pool operations. +// This is a deliberate V1 design choice to keep the initial W^X +// architectural change minimal. Per-thread lock-free pools are deferred +// to a future optimization patch. +// +// AddressSanitizer note: The trampoline code region is allocated via +// mmap (not malloc/new), so ASan does not track it. The data region +// and handles are allocated via malloc (through AllocateMemoryOrCrash), +// which ASan intercepts normally. No special annotations are needed. +// +// See flang/docs/InternalProcedureTrampolines.md for design details. +// +//===----------------------------------------------------------------------===// + +#include "flang/Runtime/trampoline.h" +#include "flang-rt/runtime/lock.h" +#include "flang-rt/runtime/memory.h" +#include "flang-rt/runtime/terminator.h" +#include "flang-rt/runtime/trampoline.h" + +#include <cassert> +#include <cstdint> +#include <cstdlib> +#include <cstring> +#include <new> // For placement-new only (no operator new/delete dependency) + +// Platform-specific headers for memory mapping. +#if defined(_WIN32) +#include <windows.h> +#else +#include <sys/mman.h> +#include <unistd.h> +#endif + +// macOS Apple Silicon requires MAP_JIT and pthread_jit_write_protect_np +// to create executable memory under the hardened runtime. +#if defined(__APPLE__) && defined(__aarch64__) +#include <libkern/OSCacheControl.h> +#include <pthread.h> +#endif + +// Architecture support check. Stub generators exist only for x86-64 and +// AArch64. 
On other architectures the file compiles but the runtime API +// functions crash with a diagnostic if actually called, so that building +// flang-rt on e.g. RISC-V or PPC64 never fails. +#if defined(__x86_64__) || defined(_M_X64) || defined(__aarch64__) || \ + defined(_M_ARM64) +#define TRAMPOLINE_ARCH_SUPPORTED 1 +#else +#define TRAMPOLINE_ARCH_SUPPORTED 0 +#endif + +namespace Fortran::runtime::trampoline { + +/// A handle returned to the caller. Contains enough info to find +/// both the trampoline stub and its data entry. +struct TrampolineHandle { + void *codePtr; // Pointer to the trampoline stub in the RX region. + TrampolineData *dataPtr; // Pointer to the data entry in the RW region. + std::size_t slotIndex; // Index in the pool for free-list management. +}; + +// Namespace-scope globals following Flang runtime conventions: +// - Lock is trivially constructible (pthread_mutex_t / CRITICAL_SECTION) +// - Pool pointer starts null; initialized under lock (double-checked locking) +class TrampolinePool; // Forward declaration for pointer below. +static Lock poolLock; +static TrampolinePool *poolInstance{nullptr}; + +/// The global trampoline pool. +class TrampolinePool { +public: + static TrampolinePool &instance() { + if (poolInstance) { + return *poolInstance; + } + CriticalSection critical{poolLock}; + if (poolInstance) { + return *poolInstance; + } + // Allocate pool using malloc + placement new (trivial constructor). + Terminator terminator{__FILE__, __LINE__}; + void *storage = AllocateMemoryOrCrash(terminator, sizeof(TrampolinePool)); + poolInstance = new (storage) TrampolinePool(); + return *poolInstance; + } + + /// Allocate a trampoline slot and initialize it. + TrampolineHandle *allocate( + const void *calleeAddress, const void *staticChainAddress) { + CriticalSection critical{lock_}; + ensureInitialized(); + + if (freeHead_ == kInvalidIndex) { + // Pool exhausted — fixed size by design for V1. 
+ // The pool capacity is controlled by FLANG_TRAMPOLINE_POOL_SIZE + // (default 1024). Dynamic slab growth can be added in a follow-up + // patch if real workloads demonstrate a need for it. + Terminator terminator{__FILE__, __LINE__}; + terminator.Crash("Trampoline pool exhausted (max %zu slots). " + "Set FLANG_TRAMPOLINE_POOL_SIZE to increase.", + poolSize_); + } + + std::size_t index = freeHead_; + freeHead_ = freeList_[index]; + + // Initialize the data entry. + dataRegion_[index].calleeAddress = calleeAddress; + dataRegion_[index].staticChainAddress = staticChainAddress; + + // Create handle using malloc + placement new. + Terminator terminator{__FILE__, __LINE__}; + void *mem = AllocateMemoryOrCrash(terminator, sizeof(TrampolineHandle)); + auto *handle = new (mem) TrampolineHandle(); + handle->codePtr = + static_cast<char *>(codeRegion_) + index * kTrampolineStubSize; + handle->dataPtr = &dataRegion_[index]; + handle->slotIndex = index; + + return handle; + } + + /// Get the callable address of a trampoline. + void *getCallableAddress(TrampolineHandle *handle) { return handle->codePtr; } + + /// Free a trampoline slot. + void free(TrampolineHandle *handle) { + CriticalSection critical{lock_}; + + std::size_t index = handle->slotIndex; + + // Poison the data entry so that any dangling call through a freed + // trampoline traps immediately. We use a non-null, obviously-invalid + // address (0xDEAD...) so that the resulting fault is distinguishable + // from a null-pointer dereference when debugging. + dataRegion_[index].calleeAddress = reinterpret_cast<const void *>( + static_cast<uintptr_t>(~uintptr_t{0} - 1)); + dataRegion_[index].staticChainAddress = nullptr; + + // Return slot to free list. 
+ freeList_[index] = freeHead_; + freeHead_ = index; + + FreeMemory(handle); + } + +private: + static constexpr std::size_t kInvalidIndex = ~std::size_t{0}; + + TrampolinePool() = default; + + void ensureInitialized() { + if (initialized_) + return; + initialized_ = true; + + // Check environment variable for pool size override. + // Fixed-size pool by design (V1): avoids complexity of dynamic growth + // and re-protection of code pages. The default (1024 slots) is + // sufficient for typical Fortran programs. Users can override via: + // export FLANG_TRAMPOLINE_POOL_SIZE=4096 + poolSize_ = kDefaultPoolSize; + if (const char *envSize = std::getenv("FLANG_TRAMPOLINE_POOL_SIZE")) { + long val = std::strtol(envSize, nullptr, 10); + if (val > 0) + poolSize_ = static_cast<std::size_t>(val); + } + + // Allocate the data region (RW). + dataRegion_ = static_cast<TrampolineData *>( + std::calloc(poolSize_, sizeof(TrampolineData))); + assert(dataRegion_ && "Failed to allocate trampoline data region"); + + // Allocate the code region (initially RW for writing stubs, then RX). + std::size_t codeSize = poolSize_ * kTrampolineStubSize; +#if defined(_WIN32) + codeRegion_ = VirtualAlloc( + nullptr, codeSize, MEM_COMMIT | MEM_RESERVE, PAGE_READWRITE); +#elif defined(__APPLE__) && defined(__aarch64__) + // macOS Apple Silicon: MAP_JIT is required for pages that will become + // executable. Use pthread_jit_write_protect_np to toggle W↔X. + codeRegion_ = mmap(nullptr, codeSize, PROT_READ | PROT_WRITE | PROT_EXEC, + MAP_PRIVATE | MAP_ANONYMOUS | MAP_JIT, -1, 0); + if (codeRegion_ == MAP_FAILED) + codeRegion_ = nullptr; + if (codeRegion_) { + // Enable writing on this thread (MAP_JIT defaults to execute). 
+ pthread_jit_write_protect_np(0); // 0 = writable + } +#else + codeRegion_ = mmap(nullptr, codeSize, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + if (codeRegion_ == MAP_FAILED) + codeRegion_ = nullptr; +#endif + assert(codeRegion_ && "Failed to allocate trampoline code region"); + + // Generate trampoline stubs. + generateStubs(); + + // Flush instruction cache. Required on architectures with non-coherent + // I-cache/D-cache (AArch64, PPC, etc.). On x86-64 this is a no-op + // but harmless. Without this, AArch64 may execute stale instructions. +#if defined(__APPLE__) && defined(__aarch64__) + // On macOS, use sys_icache_invalidate (from libkern/OSCacheControl.h). + sys_icache_invalidate(codeRegion_, codeSize); +#elif defined(_WIN32) + FlushInstructionCache(GetCurrentProcess(), codeRegion_, codeSize); +#else + __builtin___clear_cache(static_cast<char *>(codeRegion_), + static_cast<char *>(codeRegion_) + codeSize); +#endif + + // Make code region executable and non-writable (W^X). +#if defined(_WIN32) + DWORD oldProtect; + VirtualProtect(codeRegion_, codeSize, PAGE_EXECUTE_READ, &oldProtect); +#elif defined(__APPLE__) && defined(__aarch64__) + // Switch back to execute-only (MAP_JIT manages per-thread W^X). + pthread_jit_write_protect_np(1); // 1 = executable +#else + mprotect(codeRegion_, codeSize, PROT_READ | PROT_EXEC); +#endif + + // Initialize free list. + freeList_ = static_cast<std::size_t *>( + std::malloc(poolSize_ * sizeof(std::size_t))); + assert(freeList_ && "Failed to allocate trampoline free list"); + + for (std::size_t i = 0; i < poolSize_ - 1; ++i) + freeList_[i] = i + 1; + freeList_[poolSize_ - 1] = kInvalidIndex; + freeHead_ = 0; + } + + /// Generate platform-specific trampoline stubs in the code region. + /// Each stub loads callee address and static chain from its paired + /// TDATA entry and jumps to the callee. 
+ void generateStubs() { +#if defined(__x86_64__) || defined(_M_X64) + generateStubsX86_64(); +#elif defined(__aarch64__) || defined(_M_ARM64) + generateStubsAArch64(); +#else + // Unsupported architecture — should never be reached because the + // extern "C" API functions guard with TRAMPOLINE_ARCH_SUPPORTED. + // Fill with trap bytes as a safety net. + std::memset(codeRegion_, 0, poolSize_ * kTrampolineStubSize); +#endif + } + +#if defined(__x86_64__) || defined(_M_X64) + /// Generate x86-64 trampoline stubs. + /// + /// Each stub does: + /// movabsq $dataEntry, %r11 ; load TDATA entry address + /// movq 8(%r11), %r10 ; load static chain -> nest register + /// jmpq *(%r11) ; jump to callee address + /// + /// Total: 10 + 4 + 3 = 17 bytes, padded to kTrampolineStubSize. + void generateStubsX86_64() { + auto *code = static_cast<uint8_t *>(codeRegion_); + + for (std::size_t i = 0; i < poolSize_; ++i) { + uint8_t *stub = code + i * kTrampolineStubSize; + + // Address of the corresponding TDATA entry. + auto dataAddr = reinterpret_cast<uint64_t>(&dataRegion_[i]); + + std::size_t off = 0; + + // movabsq $dataAddr, %r11 (REX.W + B, opcode 0xBB for r11) + stub[off++] = 0x49; // REX.WB + stub[off++] = 0xBB; // MOV r11, imm64 + std::memcpy(&stub[off], &dataAddr, 8); + off += 8; + + // movq 8(%r11), %r10 (load staticChainAddress into r10) + stub[off++] = 0x4D; // REX.WRB + stub[off++] = 0x8B; // MOV r/m64 -> r64 + stub[off++] = 0x53; // ModRM: [r11 + disp8], r10 + stub[off++] = 0x08; // disp8 = 8 + + // jmpq *(%r11) (jump to calleeAddress) + stub[off++] = 0x41; // REX.B + stub[off++] = 0xFF; // JMP r/m64 + stub[off++] = 0x23; // ModRM: [r11], opcode extension 4 + + // Pad the rest with INT3 (0xCC) for safety. + while (off < kTrampolineStubSize) + stub[off++] = 0xCC; + } + } +#endif + +#if defined(__aarch64__) || defined(_M_ARM64) + /// Generate AArch64 trampoline stubs. 
+ /// + /// Each stub does: + /// ldr x17, .Ldata_addr ; load TDATA entry address + /// ldr x18, [x17, #8] ; load static chain -> x18 (nest reg) + /// ldr x17, [x17] ; load callee address + /// br x17 ; jump to callee + /// .Ldata_addr: + /// .quad <address of dataRegion_[i]> + /// + /// Total: 4*4 + 8 = 24 bytes, padded to kTrampolineStubSize. + void generateStubsAArch64() { + auto *code = static_cast<uint8_t *>(codeRegion_); + + for (std::size_t i = 0; i < poolSize_; ++i) { + auto *stub = reinterpret_cast<uint32_t *>(code + i * kTrampolineStubSize); + + // Address of the corresponding TDATA entry. + auto dataAddr = reinterpret_cast<uint64_t>(&dataRegion_[i]); + + // ldr x17, .Ldata_addr (PC-relative load, offset = 4 instructions = 16 + // bytes) LDR (literal): opc=01, V=0, imm19=(16/4)=4, Rt=17 + stub[0] = 0x58000091; // ldr x17, #16 (imm19=4, shifted left 2 = 16) + // Encoding: 0101 1000 0000 0000 0000 0000 1001 0001 + + // ldr x18, [x17, #8] (load static chain) + // LDR (unsigned offset): size=11, V=0, opc=01, imm12=1(×8), Rn=17, Rt=18 + stub[1] = 0xF9400632; // ldr x18, [x17, #8] + + // ldr x17, [x17] (load callee address) + // LDR (unsigned offset): size=11, V=0, opc=01, imm12=0, Rn=17, Rt=17 + stub[2] = 0xF9400231; // ldr x17, [x17, #0] + + // br x17 + stub[3] = 0xD61F0220; // br x17 + + // .Ldata_addr: .quad dataRegion_[i] + std::memcpy(&stub[4], &dataAddr, 8); + + // Pad remaining with BRK #0 (trap) for safety. 
+ std::size_t usedWords = 4 + 2; // 4 instructions + 1 quad (2 words) + for (std::size_t w = usedWords; + w < kTrampolineStubSize / sizeof(uint32_t); ++w) + stub[w] = 0xD4200000; // brk #0 + } + } +#endif + + Lock lock_; + bool initialized_{false}; + std::size_t poolSize_{0}; + + void *codeRegion_{nullptr}; // RX after initialization + TrampolineData *dataRegion_{nullptr}; // RW always + std::size_t *freeList_{nullptr}; // Intrusive free list + std::size_t freeHead_{kInvalidIndex}; +}; + +} // namespace Fortran::runtime::trampoline + +namespace Fortran::runtime { +extern "C" { + +// Helper: crash with a clear message on unsupported architectures. +// This is only reached if -fruntime-trampoline was used on a target +// that lacks stub generators. The driver should emit a warning and +// ignore the flag on unsupported architectures, but the runtime +// provides a safety net. +static inline void crashIfUnsupported() { +#if !TRAMPOLINE_ARCH_SUPPORTED + Terminator terminator{__FILE__, __LINE__}; + terminator.Crash("Runtime trampolines are not supported on this " + "architecture. 
Remove -fruntime-trampoline " + "or use the legacy stack-trampoline path."); +#endif +} + +void *RTDEF(TrampolineInit)( + void *scratch, const void *calleeAddress, const void *staticChainAddress) { + crashIfUnsupported(); + auto &pool = trampoline::TrampolinePool::instance(); + return pool.allocate(calleeAddress, staticChainAddress); +} + +void *RTDEF(TrampolineAdjust)(void *handle) { + crashIfUnsupported(); + auto &pool = trampoline::TrampolinePool::instance(); + return pool.getCallableAddress( + static_cast<trampoline::TrampolineHandle *>(handle)); +} + +void RTDEF(TrampolineFree)(void *handle) { + crashIfUnsupported(); + auto &pool = trampoline::TrampolinePool::instance(); + pool.free(static_cast<trampoline::TrampolineHandle *>(handle)); +} + +} // extern "C" +} // namespace Fortran::runtime diff --git a/flang-rt/test/Driver/runtime-trampoline-gnustack.f90 b/flang-rt/test/Driver/runtime-trampoline-gnustack.f90 new file mode 100644 index 0000000000000..ee55f28691bf2 --- /dev/null +++ b/flang-rt/test/Driver/runtime-trampoline-gnustack.f90 @@ -0,0 +1,45 @@ +! UNSUPPORTED: system-windows +! UNSUPPORTED: offload-cuda +! UNSUPPORTED: system-darwin + +! Verify that -fruntime-trampoline produces an executable whose +! GNU_STACK program header is RW (not RWE), proving W^X compliance. +! The legacy stack-trampoline path requires an executable stack; the +! runtime trampoline pool does not. + +! RUN: %flang %isysroot -fruntime-trampoline -L"%libdir" %s -o %t +! RUN: llvm-readelf -lW %t | FileCheck %s + +! Ensure GNU_STACK exists and has RW flags (no E). +! CHECK: GNU_STACK +! CHECK-SAME: RW +! 
CHECK-NOT: RWE + +subroutine host_proc(x, res) + implicit none + integer, intent(in) :: x + integer, intent(out) :: res + + interface + function f_iface() result(r) + integer :: r + end function + end interface + + procedure(f_iface), pointer :: fptr + fptr => inner + res = fptr() + +contains + function inner() result(r) + integer :: r + r = x + 1 + end function +end subroutine + +program test_gnustack + implicit none + integer :: result + call host_proc(1, result) + print *, result +end program diff --git a/flang/docs/RuntimeEnvironment.md b/flang/docs/RuntimeEnvironment.md index e762ce770fd83..6d70a7ef49746 100644 --- a/flang/docs/RuntimeEnvironment.md +++ b/flang/docs/RuntimeEnvironment.md @@ -66,3 +66,15 @@ when output takes place to a sequential unit after executing a `BACKSPACE` or `REWIND` statement. Truncation of a stream-access unit is common to several other compilers, but it is not mentioned in the standard. + +## `FLANG_TRAMPOLINE_POOL_SIZE` + +Set `FLANG_TRAMPOLINE_POOL_SIZE` to an integer value to control the maximum +number of runtime trampoline slots available when `-fruntime-trampoline` is +enabled. Each slot is 32 bytes of executable code backed by a writable data +entry. The default is 1024 slots, which is sufficient for typical Fortran +programs. If more internal-procedure closures are alive simultaneously than +the pool can hold, the runtime terminates with a diagnostic message that +includes the current pool capacity. + +Example: `export FLANG_TRAMPOLINE_POOL_SIZE=4096` diff --git a/flang/include/flang/Frontend/CodeGenOptions.def b/flang/include/flang/Frontend/CodeGenOptions.def index 05ee0e28bcaa6..2b31047f79cd5 100644 --- a/flang/include/flang/Frontend/CodeGenOptions.def +++ b/flang/include/flang/Frontend/CodeGenOptions.def @@ -42,6 +42,7 @@ CODEGENOPT(PrepareForThinLTO , 1, 0) ///< Set when -flto=thin is enabled on the ///< compile step. 
CODEGENOPT(ProtectParens, 1, 1) ///< -fprotect-parens (enable parenthesis protection) CODEGENOPT(StackArrays, 1, 0) ///< -fstack-arrays (enable the stack-arrays pass) +CODEGENOPT(EnableRuntimeTrampoline, 1, 0) ///< -fruntime-trampoline (W^X compliant trampoline pool) CODEGENOPT(VectorizeLoop, 1, 0) ///< Enable loop vectorization. CODEGENOPT(VectorizeSLP, 1, 0) ///< Enable SLP vectorization. CODEGENOPT(InterchangeLoops, 1, 0) ///< Enable loop interchange. diff --git a/flang/include/flang/Optimizer/Builder/Runtime/RTBuilder.h b/flang/include/flang/Optimizer/Builder/Runtime/RTBuilder.h index 2c74ab29f75e8..89d1aff4b26a8 100644 --- a/flang/include/flang/Optimizer/Builder/Runtime/RTBuilder.h +++ b/flang/include/flang/Optimizer/Builder/Runtime/RTBuilder.h @@ -244,6 +244,10 @@ constexpr TypeBuilderFunc getModel<void *>() { }; } template <> +constexpr TypeBuilderFunc getModel<const void *>() { + return getModel<void *>(); +} +template <> constexpr TypeBuilderFunc getModel<void (*)(int)>() { return [](mlir::MLIRContext *context) -> mlir::Type { return fir::LLVMPointerType::get( diff --git a/flang/include/flang/Optimizer/Builder/Runtime/Trampoline.h b/flang/include/flang/Optimizer/Builder/Runtime/Trampoline.h new file mode 100644 index 0000000000000..45e079818c1fc --- /dev/null +++ b/flang/include/flang/Optimizer/Builder/Runtime/Trampoline.h @@ -0,0 +1,47 @@ +//===-- Trampoline.h - Runtime trampoline pool builder ----------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// Builder routines for generating calls to the Fortran runtime trampoline +// pool APIs (_FortranATrampolineInit, _FortranATrampolineAdjust, +// _FortranATrampolineFree). 
+// +//===----------------------------------------------------------------------===// + +#ifndef FORTRAN_OPTIMIZER_BUILDER_RUNTIME_TRAMPOLINE_H +#define FORTRAN_OPTIMIZER_BUILDER_RUNTIME_TRAMPOLINE_H + +namespace mlir { +class Value; +class Location; +} // namespace mlir + +namespace fir { +class FirOpBuilder; +} + +namespace fir::runtime { + +/// Generate a call to _FortranATrampolineInit. +/// Returns an opaque handle (void*) for the trampoline. +mlir::Value genTrampolineInit(fir::FirOpBuilder &builder, mlir::Location loc, + mlir::Value scratch, mlir::Value calleeAddress, + mlir::Value staticChainAddress); + +/// Generate a call to _FortranATrampolineAdjust. +/// Returns the callable function pointer for the trampoline. +mlir::Value genTrampolineAdjust(fir::FirOpBuilder &builder, mlir::Location loc, + mlir::Value handle); + +/// Generate a call to _FortranATrampolineFree. +/// Frees the trampoline slot. +void genTrampolineFree(fir::FirOpBuilder &builder, mlir::Location loc, + mlir::Value handle); + +} // namespace fir::runtime + +#endif // FORTRAN_OPTIMIZER_BUILDER_RUNTIME_TRAMPOLINE_H diff --git a/flang/include/flang/Optimizer/CodeGen/CGPasses.td b/flang/include/flang/Optimizer/CodeGen/CGPasses.td index df0ecf5540776..783d73c7e4b4e 100644 --- a/flang/include/flang/Optimizer/CodeGen/CGPasses.td +++ b/flang/include/flang/Optimizer/CodeGen/CGPasses.td @@ -91,12 +91,18 @@ def TargetRewritePass : Pass<"target-rewrite", "mlir::ModuleOp"> { } def BoxedProcedurePass : Pass<"boxed-procedure", "mlir::ModuleOp"> { - let options = [ - Option<"useThunks", "use-thunks", - "bool", /*default=*/"true", + let options = + [Option< + "useThunks", "use-thunks", "bool", /*default=*/"true", "Convert procedure pointer abstractions to a single code pointer, " - "deploying thunks wherever required."> - ]; + "deploying thunks wherever required.">, + Option< + "useRuntimeTrampoline", "use-runtime-trampoline", "bool", + /*default=*/"false", + "Use runtime trampoline pool instead of 
stack-based trampolines " + "for W^X compliance. When enabled, internal procedure pointers " + "use a runtime-managed pool of executable trampolines with " + "separate data region, avoiding the need for an executable stack.">]; } def LowerRepackArraysPass : Pass<"lower-repack-arrays", "mlir::ModuleOp"> { diff --git a/flang/include/flang/Optimizer/Passes/CommandLineOpts.h b/flang/include/flang/Optimizer/Passes/CommandLineOpts.h index 1dc8e4f079d72..8bf652cd051cb 100644 --- a/flang/include/flang/Optimizer/Passes/CommandLineOpts.h +++ b/flang/include/flang/Optimizer/Passes/CommandLineOpts.h @@ -65,6 +65,7 @@ extern llvm::cl::opt<bool> disableDebugInfo; extern llvm::cl::opt<bool> disableFirToLlvmIr; extern llvm::cl::opt<bool> disableLlvmIrToLlvm; extern llvm::cl::opt<bool> disableBoxedProcedureRewrite; +extern llvm::cl::opt<bool> enableRuntimeTrampoline; extern llvm::cl::opt<bool> disableExternalNameConversion; extern llvm::cl::opt<bool> enableConstantArgumentGlobalisation; diff --git a/flang/include/flang/Optimizer/Passes/Pipelines.h b/flang/include/flang/Optimizer/Passes/Pipelines.h index 1a7ff4ff3dfa2..21b009cc42eb7 100644 --- a/flang/include/flang/Optimizer/Passes/Pipelines.h +++ b/flang/include/flang/Optimizer/Passes/Pipelines.h @@ -93,7 +93,8 @@ void addTargetRewritePass(mlir::PassManager &pm); mlir::LLVM::DIEmissionKind getEmissionKind(llvm::codegenoptions::DebugInfoKind kind); -void addBoxedProcedurePass(mlir::PassManager &pm); +void addBoxedProcedurePass(mlir::PassManager &pm, + bool enableRuntimeTrampoline = false); void addExternalNameConversionPass(mlir::PassManager &pm, bool appendUnderscore = true); diff --git a/flang/include/flang/Runtime/trampoline.h b/flang/include/flang/Runtime/trampoline.h new file mode 100644 index 0000000000000..3322df8b1b340 --- /dev/null +++ b/flang/include/flang/Runtime/trampoline.h @@ -0,0 +1,69 @@ +//===-- include/flang/Runtime/trampoline.h ----------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the 
Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// Runtime support for W^X-compliant trampoline pool management. +// +// This provides an alternative to stack-based trampolines for internal +// procedures with host association. Instead of requiring the stack to be +// both writable and executable (violating W^X security policies), this +// implementation uses a pool of pre-assembled trampolines in a separate +// executable (but not writable) memory region, paired with writable (but +// not executable) data entries. +// +// See flang/docs/InternalProcedureTrampolines.md for design details. +// +//===----------------------------------------------------------------------===// + +#ifndef FORTRAN_RUNTIME_TRAMPOLINE_H_ +#define FORTRAN_RUNTIME_TRAMPOLINE_H_ + +#include "flang/Runtime/entry-names.h" + +namespace Fortran::runtime { +extern "C" { + +/// Initializes a new trampoline and returns its internal handle. +/// +/// Allocates a trampoline entry from the pool, configuring it to call +/// \p calleeAddress with the static chain pointer \p staticChainAddress +/// set in the appropriate register (per target ABI). +/// +/// \p scratch is reserved for future use (e.g., fallback to stack +/// trampolines). Pass nullptr for pool-based allocation. +/// +/// The returned handle must be passed to TrampolineFree() when the +/// host procedure exits. +/// +/// Pool capacity: The pool is fixed-size (default 1024 slots, configurable +/// via FLANG_TRAMPOLINE_POOL_SIZE env var). If all slots are in use, the +/// runtime issues a fatal error. Dynamic slab growth may be added later. +/// +/// Architecture support: Currently x86-64 and AArch64. On unsupported +/// architectures, calling this function issues a fatal diagnostic. 
+void *RTDECL(TrampolineInit)( + void *scratch, const void *calleeAddress, const void *staticChainAddress); + +/// Returns the callable trampoline address for the given handle. +/// +/// \p handle is a value returned by TrampolineInit(). +/// The result is a function pointer that can be called directly; it will +/// set up the static chain register and jump to the original callee. +void *RTDECL(TrampolineAdjust)(void *handle); + +/// Frees the trampoline entry associated with the given handle. +/// +/// Must be called at every exit from the host procedure to return the +/// trampoline slot to the pool. After this call, any function pointer +/// previously obtained via TrampolineAdjust() for this handle becomes +/// invalid. +void RTDECL(TrampolineFree)(void *handle); + +} // extern "C" +} // namespace Fortran::runtime + +#endif // FORTRAN_RUNTIME_TRAMPOLINE_H_ diff --git a/flang/include/flang/Tools/CrossToolHelpers.h b/flang/include/flang/Tools/CrossToolHelpers.h index 44fb252d2b366..a2fadb72ad212 100644 --- a/flang/include/flang/Tools/CrossToolHelpers.h +++ b/flang/include/flang/Tools/CrossToolHelpers.h @@ -88,6 +88,7 @@ struct MLIRToLLVMPassPipelineConfig : public FlangEPCallBacks { const Fortran::common::MathOptionsBase &mathOpts) { OptLevel = level; StackArrays = opts.StackArrays; + EnableRuntimeTrampoline = opts.EnableRuntimeTrampoline; Underscoring = opts.Underscoring; LoopVersioning = opts.LoopVersioning; DebugInfo = opts.getDebugInfo(); @@ -114,6 +115,7 @@ struct MLIRToLLVMPassPipelineConfig : public FlangEPCallBacks { llvm::OptimizationLevel OptLevel; ///< optimisation level bool StackArrays = false; ///< convert memory allocations to alloca. + bool EnableRuntimeTrampoline = false; ///< Use runtime trampoline pool (W^X). bool Underscoring = true; ///< add underscores to function names. bool LoopVersioning = false; ///< Run the version loop pass. bool AliasAnalysis = false; ///< Add TBAA tags to generated LLVMIR. 
diff --git a/flang/lib/Frontend/CompilerInvocation.cpp b/flang/lib/Frontend/CompilerInvocation.cpp index fc4975f9592eb..b0bfede63257a 100644 --- a/flang/lib/Frontend/CompilerInvocation.cpp +++ b/flang/lib/Frontend/CompilerInvocation.cpp @@ -284,6 +284,10 @@ static void parseCodeGenArgs(Fortran::frontend::CodeGenOptions &opts, clang::options::OPT_fno_stack_arrays, false)) opts.StackArrays = 1; + if (args.hasFlag(clang::options::OPT_fruntime_trampoline, + clang::options::OPT_fno_runtime_trampoline, false)) + opts.EnableRuntimeTrampoline = 1; + if (args.getLastArg(clang::options::OPT_floop_interchange)) opts.InterchangeLoops = 1; diff --git a/flang/lib/Optimizer/Builder/CMakeLists.txt b/flang/lib/Optimizer/Builder/CMakeLists.txt index d966c52b29c33..8e5b5deff47c1 100644 --- a/flang/lib/Optimizer/Builder/CMakeLists.txt +++ b/flang/lib/Optimizer/Builder/CMakeLists.txt @@ -34,6 +34,7 @@ add_flang_library(FIRBuilder Runtime/Stop.cpp Runtime/Support.cpp Runtime/TemporaryStack.cpp + Runtime/Trampoline.cpp Runtime/Transformational.cpp TemporaryStorage.cpp diff --git a/flang/lib/Optimizer/Builder/Runtime/Trampoline.cpp b/flang/lib/Optimizer/Builder/Runtime/Trampoline.cpp new file mode 100644 index 0000000000000..4c33ef8e0a148 --- /dev/null +++ b/flang/lib/Optimizer/Builder/Runtime/Trampoline.cpp @@ -0,0 +1,49 @@ +//===-- Trampoline.cpp - Runtime trampoline pool builder --------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "flang/Optimizer/Builder/Runtime/Trampoline.h" +#include "flang/Optimizer/Builder/FIRBuilder.h" +#include "flang/Optimizer/Builder/Runtime/RTBuilder.h" +#include "flang/Runtime/trampoline.h" + +using namespace Fortran::runtime; +using namespace fir::runtime; + +mlir::Value fir::runtime::genTrampolineInit(fir::FirOpBuilder &builder, + mlir::Location loc, + mlir::Value scratch, + mlir::Value calleeAddress, + mlir::Value staticChainAddress) { + mlir::func::FuncOp func{ + getRuntimeFunc<mkRTKey(TrampolineInit)>(loc, builder)}; + mlir::FunctionType fTy = func.getFunctionType(); + llvm::SmallVector<mlir::Value> args = createArguments( + builder, loc, fTy, scratch, calleeAddress, staticChainAddress); + return fir::CallOp::create(builder, loc, func, args).getResult(0); +} + +mlir::Value fir::runtime::genTrampolineAdjust(fir::FirOpBuilder &builder, + mlir::Location loc, + mlir::Value handle) { + mlir::func::FuncOp func{ + getRuntimeFunc<mkRTKey(TrampolineAdjust)>(loc, builder)}; + mlir::FunctionType fTy = func.getFunctionType(); + llvm::SmallVector<mlir::Value> args = + createArguments(builder, loc, fTy, handle); + return fir::CallOp::create(builder, loc, func, args).getResult(0); +} + +void fir::runtime::genTrampolineFree(fir::FirOpBuilder &builder, + mlir::Location loc, mlir::Value handle) { + mlir::func::FuncOp func{ + getRuntimeFunc<mkRTKey(TrampolineFree)>(loc, builder)}; + mlir::FunctionType fTy = func.getFunctionType(); + llvm::SmallVector<mlir::Value> args = + createArguments(builder, loc, fTy, handle); + fir::CallOp::create(builder, loc, func, args); +} diff --git a/flang/lib/Optimizer/CodeGen/BoxedProcedure.cpp b/flang/lib/Optimizer/CodeGen/BoxedProcedure.cpp index 61d6d2ae6329a..d1d0201a5ba24 100644 --- a/flang/lib/Optimizer/CodeGen/BoxedProcedure.cpp +++ b/flang/lib/Optimizer/CodeGen/BoxedProcedure.cpp @@ 
-10,6 +10,7 @@ #include "flang/Optimizer/Builder/FIRBuilder.h" #include "flang/Optimizer/Builder/LowLevelIntrinsics.h" +#include "flang/Optimizer/Builder/Runtime/Trampoline.h" #include "flang/Optimizer/Dialect/FIRDialect.h" #include "flang/Optimizer/Dialect/FIROps.h" #include "flang/Optimizer/Dialect/FIRType.h" @@ -20,6 +21,7 @@ #include "mlir/Pass/Pass.h" #include "mlir/Transforms/DialectConversion.h" #include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/SmallVector.h" namespace fir { #define GEN_PASS_DEF_BOXEDPROCEDUREPASS @@ -31,12 +33,6 @@ namespace fir { using namespace fir; namespace { -/// Options to the procedure pointer pass. -struct BoxedProcedureOptions { - // Lower the boxproc abstraction to function pointers and thunks where - // required. - bool useThunks = true; -}; /// This type converter rewrites all `!fir.boxproc<Func>` types to `Func` types. class BoxprocTypeRewriter : public mlir::TypeConverter { @@ -219,200 +215,284 @@ class BoxedProcedurePass inline mlir::ModuleOp getModule() { return getOperation(); } void runOnOperation() override final { - if (options.useThunks) { + if (useThunks) { auto *context = &getContext(); mlir::IRRewriter rewriter(context); BoxprocTypeRewriter typeConverter(mlir::UnknownLoc::get(context)); - getModule().walk([&](mlir::Operation *op) { - bool opIsValid = true; - typeConverter.setLocation(op->getLoc()); - if (auto addr = mlir::dyn_cast<BoxAddrOp>(op)) { - mlir::Type ty = addr.getVal().getType(); - mlir::Type resTy = addr.getResult().getType(); - if (llvm::isa<mlir::FunctionType>(ty) || - llvm::isa<fir::BoxProcType>(ty)) { - // Rewrite all `fir.box_addr` ops on values of type `!fir.boxproc` - // or function type to be `fir.convert` ops. 
- rewriter.setInsertionPoint(addr); - rewriter.replaceOpWithNewOp<ConvertOp>( - addr, typeConverter.convertType(addr.getType()), addr.getVal()); - opIsValid = false; - } else if (typeConverter.needsConversion(resTy)) { - rewriter.startOpModification(op); - op->getResult(0).setType(typeConverter.convertType(resTy)); - rewriter.finalizeOpModification(op); - } - } else if (auto func = mlir::dyn_cast<mlir::func::FuncOp>(op)) { - mlir::FunctionType ty = func.getFunctionType(); - if (typeConverter.needsConversion(ty)) { - rewriter.startOpModification(func); - auto toTy = - mlir::cast<mlir::FunctionType>(typeConverter.convertType(ty)); - if (!func.empty()) - for (auto e : llvm::enumerate(toTy.getInputs())) { - unsigned i = e.index(); - auto &block = func.front(); - block.insertArgument(i, e.value(), func.getLoc()); - block.getArgument(i + 1).replaceAllUsesWith( - block.getArgument(i)); - block.eraseArgument(i + 1); - } - func.setType(toTy); - rewriter.finalizeOpModification(func); - } - } else if (auto embox = mlir::dyn_cast<EmboxProcOp>(op)) { - // Rewrite all `fir.emboxproc` ops to either `fir.convert` or a thunk - // as required. - mlir::Type toTy = typeConverter.convertType( - mlir::cast<BoxProcType>(embox.getType()).getEleTy()); - rewriter.setInsertionPoint(embox); - if (embox.getHost()) { - // Create the thunk. - auto module = embox->getParentOfType<mlir::ModuleOp>(); - FirOpBuilder builder(rewriter, module); - const auto triple{fir::getTargetTriple(module)}; - auto loc = embox.getLoc(); - mlir::Type i8Ty = builder.getI8Type(); - mlir::Type i8Ptr = builder.getRefType(i8Ty); - // For PPC32 and PPC64, the thunk is populated by a call to - // __trampoline_setup, which is defined in - // compiler-rt/lib/builtins/trampoline_setup.c and requires the - // thunk size greater than 32 bytes. For AArch64, RISCV and x86_64, - // the thunk setup doesn't go through __trampoline_setup and fits in - // 32 bytes. 
- fir::SequenceType::Extent thunkSize = triple.getTrampolineSize(); - mlir::Type buffTy = SequenceType::get({thunkSize}, i8Ty); - auto buffer = AllocaOp::create(builder, loc, buffTy); - mlir::Value closure = - builder.createConvert(loc, i8Ptr, embox.getHost()); - mlir::Value tramp = builder.createConvert(loc, i8Ptr, buffer); - mlir::Value func = - builder.createConvert(loc, i8Ptr, embox.getFunc()); - fir::CallOp::create( - builder, loc, factory::getLlvmInitTrampoline(builder), - llvm::ArrayRef<mlir::Value>{tramp, func, closure}); - auto adjustCall = fir::CallOp::create( - builder, loc, factory::getLlvmAdjustTrampoline(builder), - llvm::ArrayRef<mlir::Value>{tramp}); - rewriter.replaceOpWithNewOp<ConvertOp>(embox, toTy, - adjustCall.getResult(0)); - opIsValid = false; - } else { - // Just forward the function as a pointer. - rewriter.replaceOpWithNewOp<ConvertOp>(embox, toTy, - embox.getFunc()); - opIsValid = false; - } - } else if (auto global = mlir::dyn_cast<GlobalOp>(op)) { - auto ty = global.getType(); - if (typeConverter.needsConversion(ty)) { - rewriter.startOpModification(global); - auto toTy = typeConverter.convertType(ty); - global.setType(toTy); - rewriter.finalizeOpModification(global); - } - } else if (auto mem = mlir::dyn_cast<AllocaOp>(op)) { - auto ty = mem.getType(); - if (typeConverter.needsConversion(ty)) { - rewriter.setInsertionPoint(mem); - auto toTy = typeConverter.convertType(unwrapRefType(ty)); - bool isPinned = mem.getPinned(); - llvm::StringRef uniqName = - mem.getUniqName().value_or(llvm::StringRef()); - llvm::StringRef bindcName = - mem.getBindcName().value_or(llvm::StringRef()); - rewriter.replaceOpWithNewOp<AllocaOp>( - mem, toTy, uniqName, bindcName, isPinned, mem.getTypeparams(), - mem.getShape()); - opIsValid = false; - } - } else if (auto mem = mlir::dyn_cast<AllocMemOp>(op)) { - auto ty = mem.getType(); - if (typeConverter.needsConversion(ty)) { - rewriter.setInsertionPoint(mem); - auto toTy = 
typeConverter.convertType(unwrapRefType(ty)); - llvm::StringRef uniqName = - mem.getUniqName().value_or(llvm::StringRef()); - llvm::StringRef bindcName = - mem.getBindcName().value_or(llvm::StringRef()); - rewriter.replaceOpWithNewOp<AllocMemOp>( - mem, toTy, uniqName, bindcName, mem.getTypeparams(), - mem.getShape()); - opIsValid = false; - } - } else if (auto coor = mlir::dyn_cast<CoordinateOp>(op)) { - auto ty = coor.getType(); - mlir::Type baseTy = coor.getBaseType(); - if (typeConverter.needsConversion(ty) || - typeConverter.needsConversion(baseTy)) { - rewriter.setInsertionPoint(coor); - auto toTy = typeConverter.convertType(ty); - auto toBaseTy = typeConverter.convertType(baseTy); - rewriter.replaceOpWithNewOp<CoordinateOp>( - coor, toTy, coor.getRef(), coor.getCoor(), toBaseTy, - coor.getFieldIndicesAttr()); - opIsValid = false; - } - } else if (auto index = mlir::dyn_cast<FieldIndexOp>(op)) { - auto ty = index.getType(); - mlir::Type onTy = index.getOnType(); - if (typeConverter.needsConversion(ty) || - typeConverter.needsConversion(onTy)) { - rewriter.setInsertionPoint(index); - auto toTy = typeConverter.convertType(ty); - auto toOnTy = typeConverter.convertType(onTy); - rewriter.replaceOpWithNewOp<FieldIndexOp>( - index, toTy, index.getFieldId(), toOnTy, index.getTypeparams()); - opIsValid = false; - } - } else if (auto index = mlir::dyn_cast<LenParamIndexOp>(op)) { - auto ty = index.getType(); - mlir::Type onTy = index.getOnType(); - if (typeConverter.needsConversion(ty) || - typeConverter.needsConversion(onTy)) { - rewriter.setInsertionPoint(index); - auto toTy = typeConverter.convertType(ty); - auto toOnTy = typeConverter.convertType(onTy); - rewriter.replaceOpWithNewOp<LenParamIndexOp>( - index, toTy, index.getFieldId(), toOnTy, index.getTypeparams()); - opIsValid = false; + + // When using runtime trampolines, we need to track handles per + // function so we can insert FreeTrampoline calls at each return. 
+ // Process functions individually to manage this state. + if (useRuntimeTrampoline) { + getModule().walk([&](mlir::func::FuncOp funcOp) { + trampolineHandles.clear(); + processFunction(funcOp, rewriter, typeConverter); + insertTrampolineFrees(funcOp, rewriter); + }); + // Also process non-function ops at module level (globals, etc.) + processModuleLevelOps(rewriter, typeConverter); + } else { + getModule().walk([&](mlir::Operation *op) { + processOp(op, rewriter, typeConverter); + }); + } + } + } + +private: + /// Trampoline handles collected while processing a function. + /// Each entry is a Value representing the opaque handle returned + /// by _FortranATrampolineInit, which must be freed before the + /// function returns. + llvm::SmallVector<mlir::Value> trampolineHandles; + + /// Process all ops within a function. + void processFunction(mlir::func::FuncOp funcOp, mlir::IRRewriter &rewriter, + BoxprocTypeRewriter &typeConverter) { + funcOp.walk( + [&](mlir::Operation *op) { processOp(op, rewriter, typeConverter); }); + } + + /// Process non-function ops at module level (globals, etc.) + void processModuleLevelOps(mlir::IRRewriter &rewriter, + BoxprocTypeRewriter &typeConverter) { + for (auto &op : getModule().getBody()->getOperations()) { + if (!mlir::isa<mlir::func::FuncOp>(op)) + processOp(&op, rewriter, typeConverter); + } + } + + /// Insert _FortranATrampolineFree calls before every return in the function. + void insertTrampolineFrees(mlir::func::FuncOp funcOp, + mlir::IRRewriter &rewriter) { + if (trampolineHandles.empty()) + return; + + auto module = funcOp->getParentOfType<mlir::ModuleOp>(); + // Insert TrampolineFree calls before every func.return in this function. + // At this pass stage (after CFGConversion), func.return is the only + // terminator that exits the function. 
Other terminators are either + // intra-function branches (cf.br, cf.cond_br, fir.select*) or + // fir.unreachable (after STOP/ERROR STOP), which don't need cleanup + // since the process is terminating. + funcOp.walk([&](mlir::func::ReturnOp retOp) { + rewriter.setInsertionPoint(retOp); + FirOpBuilder builder(rewriter, module); + auto loc = retOp.getLoc(); + for (mlir::Value handle : trampolineHandles) { + fir::runtime::genTrampolineFree(builder, loc, handle); + } + }); + } + + /// Process a single operation for boxproc type rewriting. + void processOp(mlir::Operation *op, mlir::IRRewriter &rewriter, + BoxprocTypeRewriter &typeConverter) { + bool opIsValid = true; + typeConverter.setLocation(op->getLoc()); + if (auto addr = mlir::dyn_cast<BoxAddrOp>(op)) { + mlir::Type ty = addr.getVal().getType(); + mlir::Type resTy = addr.getResult().getType(); + if (llvm::isa<mlir::FunctionType>(ty) || + llvm::isa<fir::BoxProcType>(ty)) { + // Rewrite all `fir.box_addr` ops on values of type `!fir.boxproc` + // or function type to be `fir.convert` ops. 
+ rewriter.setInsertionPoint(addr); + rewriter.replaceOpWithNewOp<ConvertOp>( + addr, typeConverter.convertType(addr.getType()), addr.getVal()); + opIsValid = false; + } else if (typeConverter.needsConversion(resTy)) { + rewriter.startOpModification(op); + op->getResult(0).setType(typeConverter.convertType(resTy)); + rewriter.finalizeOpModification(op); + } + } else if (auto func = mlir::dyn_cast<mlir::func::FuncOp>(op)) { + mlir::FunctionType ty = func.getFunctionType(); + if (typeConverter.needsConversion(ty)) { + rewriter.startOpModification(func); + auto toTy = + mlir::cast<mlir::FunctionType>(typeConverter.convertType(ty)); + if (!func.empty()) + for (auto e : llvm::enumerate(toTy.getInputs())) { + unsigned i = e.index(); + auto &block = func.front(); + block.insertArgument(i, e.value(), func.getLoc()); + block.getArgument(i + 1).replaceAllUsesWith(block.getArgument(i)); + block.eraseArgument(i + 1); } - } else { - rewriter.startOpModification(op); - // Convert the operands if needed - for (auto i : llvm::enumerate(op->getResultTypes())) - if (typeConverter.needsConversion(i.value())) { - auto toTy = typeConverter.convertType(i.value()); - op->getResult(i.index()).setType(toTy); - } + func.setType(toTy); + rewriter.finalizeOpModification(func); + } + } else if (auto embox = mlir::dyn_cast<EmboxProcOp>(op)) { + // Rewrite all `fir.emboxproc` ops to either `fir.convert` or a thunk + // as required. 
+ mlir::Type toTy = typeConverter.convertType( + mlir::cast<BoxProcType>(embox.getType()).getEleTy()); + rewriter.setInsertionPoint(embox); + if (embox.getHost()) { + auto module = embox->getParentOfType<mlir::ModuleOp>(); + FirOpBuilder builder(rewriter, module); + auto loc = embox.getLoc(); + mlir::Type i8Ty = builder.getI8Type(); + mlir::Type i8Ptr = builder.getRefType(i8Ty); - // Convert the type attributes if needed - for (const mlir::NamedAttribute &attr : op->getAttrDictionary()) - if (auto tyAttr = llvm::dyn_cast<mlir::TypeAttr>(attr.getValue())) - if (typeConverter.needsConversion(tyAttr.getValue())) { - auto toTy = typeConverter.convertType(tyAttr.getValue()); - op->setAttr(attr.getName(), mlir::TypeAttr::get(toTy)); - } - rewriter.finalizeOpModification(op); + if (useRuntimeTrampoline) { + // Runtime trampoline pool path (W^X compliant). + // Instead of allocating a writable+executable buffer on the + // stack, call the runtime to allocate from a pre-initialized + // pool with separate RX (code) and RW (data) regions. + mlir::Value nullPtr = builder.createNullConstant(loc, i8Ptr); + mlir::Value closure = + builder.createConvert(loc, i8Ptr, embox.getHost()); + mlir::Value func = builder.createConvert(loc, i8Ptr, embox.getFunc()); + + // _FortranATrampolineInit(nullptr, func, closure) -> handle + mlir::Value handle = fir::runtime::genTrampolineInit( + builder, loc, nullPtr, func, closure); + + // _FortranATrampolineAdjust(handle) -> callable address + mlir::Value callableAddr = + fir::runtime::genTrampolineAdjust(builder, loc, handle); + + // Track the handle so we can free it at function exits. + trampolineHandles.push_back(handle); + + rewriter.replaceOpWithNewOp<ConvertOp>(embox, toTy, callableAddr); + } else { + // Legacy stack-based trampoline path. 
+ const auto triple{fir::getTargetTriple(module)}; + // For PPC32 and PPC64, the thunk is populated by a call to + // __trampoline_setup, which is defined in + // compiler-rt/lib/builtins/trampoline_setup.c and requires the + // thunk size greater than 32 bytes. For AArch64, RISCV and + // x86_64, the thunk setup doesn't go through + // __trampoline_setup and fits in 32 bytes. + fir::SequenceType::Extent thunkSize = triple.getTrampolineSize(); + mlir::Type buffTy = SequenceType::get({thunkSize}, i8Ty); + auto buffer = AllocaOp::create(builder, loc, buffTy); + mlir::Value closure = + builder.createConvert(loc, i8Ptr, embox.getHost()); + mlir::Value tramp = builder.createConvert(loc, i8Ptr, buffer); + mlir::Value func = builder.createConvert(loc, i8Ptr, embox.getFunc()); + fir::CallOp::create( + builder, loc, factory::getLlvmInitTrampoline(builder), + llvm::ArrayRef<mlir::Value>{tramp, func, closure}); + auto adjustCall = fir::CallOp::create( + builder, loc, factory::getLlvmAdjustTrampoline(builder), + llvm::ArrayRef<mlir::Value>{tramp}); + rewriter.replaceOpWithNewOp<ConvertOp>(embox, toTy, + adjustCall.getResult(0)); } - // Ensure block arguments are updated if needed. - if (opIsValid && op->getNumRegions() != 0) { - rewriter.startOpModification(op); - for (mlir::Region ®ion : op->getRegions()) - for (mlir::Block &block : region.getBlocks()) - for (mlir::BlockArgument blockArg : block.getArguments()) - if (typeConverter.needsConversion(blockArg.getType())) { - mlir::Type toTy = - typeConverter.convertType(blockArg.getType()); - blockArg.setType(toTy); - } - rewriter.finalizeOpModification(op); + opIsValid = false; + } else { + // Just forward the function as a pointer. 
+ rewriter.replaceOpWithNewOp<ConvertOp>(embox, toTy, embox.getFunc()); + opIsValid = false; + } + } else if (auto global = mlir::dyn_cast<GlobalOp>(op)) { + auto ty = global.getType(); + if (typeConverter.needsConversion(ty)) { + rewriter.startOpModification(global); + auto toTy = typeConverter.convertType(ty); + global.setType(toTy); + rewriter.finalizeOpModification(global); + } + } else if (auto mem = mlir::dyn_cast<AllocaOp>(op)) { + auto ty = mem.getType(); + if (typeConverter.needsConversion(ty)) { + rewriter.setInsertionPoint(mem); + auto toTy = typeConverter.convertType(unwrapRefType(ty)); + bool isPinned = mem.getPinned(); + llvm::StringRef uniqName = + mem.getUniqName().value_or(llvm::StringRef()); + llvm::StringRef bindcName = + mem.getBindcName().value_or(llvm::StringRef()); + rewriter.replaceOpWithNewOp<AllocaOp>(mem, toTy, uniqName, bindcName, + isPinned, mem.getTypeparams(), + mem.getShape()); + opIsValid = false; + } + } else if (auto mem = mlir::dyn_cast<AllocMemOp>(op)) { + auto ty = mem.getType(); + if (typeConverter.needsConversion(ty)) { + rewriter.setInsertionPoint(mem); + auto toTy = typeConverter.convertType(unwrapRefType(ty)); + llvm::StringRef uniqName = + mem.getUniqName().value_or(llvm::StringRef()); + llvm::StringRef bindcName = + mem.getBindcName().value_or(llvm::StringRef()); + rewriter.replaceOpWithNewOp<AllocMemOp>(mem, toTy, uniqName, bindcName, + mem.getTypeparams(), + mem.getShape()); + opIsValid = false; + } + } else if (auto coor = mlir::dyn_cast<CoordinateOp>(op)) { + auto ty = coor.getType(); + mlir::Type baseTy = coor.getBaseType(); + if (typeConverter.needsConversion(ty) || + typeConverter.needsConversion(baseTy)) { + rewriter.setInsertionPoint(coor); + auto toTy = typeConverter.convertType(ty); + auto toBaseTy = typeConverter.convertType(baseTy); + rewriter.replaceOpWithNewOp<CoordinateOp>(coor, toTy, coor.getRef(), + coor.getCoor(), toBaseTy, + coor.getFieldIndicesAttr()); + opIsValid = false; + } + } else if (auto index 
= mlir::dyn_cast<FieldIndexOp>(op)) { + auto ty = index.getType(); + mlir::Type onTy = index.getOnType(); + if (typeConverter.needsConversion(ty) || + typeConverter.needsConversion(onTy)) { + rewriter.setInsertionPoint(index); + auto toTy = typeConverter.convertType(ty); + auto toOnTy = typeConverter.convertType(onTy); + rewriter.replaceOpWithNewOp<FieldIndexOp>( + index, toTy, index.getFieldId(), toOnTy, index.getTypeparams()); + opIsValid = false; + } + } else if (auto index = mlir::dyn_cast<LenParamIndexOp>(op)) { + auto ty = index.getType(); + mlir::Type onTy = index.getOnType(); + if (typeConverter.needsConversion(ty) || + typeConverter.needsConversion(onTy)) { + rewriter.setInsertionPoint(index); + auto toTy = typeConverter.convertType(ty); + auto toOnTy = typeConverter.convertType(onTy); + rewriter.replaceOpWithNewOp<LenParamIndexOp>( + index, toTy, index.getFieldId(), toOnTy, index.getTypeparams()); + opIsValid = false; + } + } else { + rewriter.startOpModification(op); + // Convert the operands if needed + for (auto i : llvm::enumerate(op->getResultTypes())) + if (typeConverter.needsConversion(i.value())) { + auto toTy = typeConverter.convertType(i.value()); + op->getResult(i.index()).setType(toTy); } - }); + + // Convert the type attributes if needed + for (const mlir::NamedAttribute &attr : op->getAttrDictionary()) + if (auto tyAttr = llvm::dyn_cast<mlir::TypeAttr>(attr.getValue())) + if (typeConverter.needsConversion(tyAttr.getValue())) { + auto toTy = typeConverter.convertType(tyAttr.getValue()); + op->setAttr(attr.getName(), mlir::TypeAttr::get(toTy)); + } + rewriter.finalizeOpModification(op); + } + // Ensure block arguments are updated if needed. 
+ if (opIsValid && op->getNumRegions() != 0) { + rewriter.startOpModification(op); + for (mlir::Region ®ion : op->getRegions()) + for (mlir::Block &block : region.getBlocks()) + for (mlir::BlockArgument blockArg : block.getArguments()) + if (typeConverter.needsConversion(blockArg.getType())) { + mlir::Type toTy = typeConverter.convertType(blockArg.getType()); + blockArg.setType(toTy); + } + rewriter.finalizeOpModification(op); } } - -private: - BoxedProcedureOptions options; }; } // namespace diff --git a/flang/lib/Optimizer/Passes/CommandLineOpts.cpp b/flang/lib/Optimizer/Passes/CommandLineOpts.cpp index 75e818d330b51..794dffebc38bb 100644 --- a/flang/lib/Optimizer/Passes/CommandLineOpts.cpp +++ b/flang/lib/Optimizer/Passes/CommandLineOpts.cpp @@ -71,6 +71,8 @@ DisableOption(FirToLlvmIr, "fir-to-llvmir", "FIR to LLVM-IR dialect"); DisableOption(LlvmIrToLlvm, "llvm", "conversion to LLVM"); DisableOption(BoxedProcedureRewrite, "boxed-procedure-rewrite", "rewrite boxed procedures"); +EnableOption(RuntimeTrampoline, "runtime-trampoline", + "W^X compliant runtime trampoline pool"); DisableOption(ExternalNameConversion, "external-name-interop", "convert names with external convention"); diff --git a/flang/lib/Optimizer/Passes/Pipelines.cpp b/flang/lib/Optimizer/Passes/Pipelines.cpp index 5927fff960270..1187e8a27baa2 100644 --- a/flang/lib/Optimizer/Passes/Pipelines.cpp +++ b/flang/lib/Optimizer/Passes/Pipelines.cpp @@ -140,9 +140,16 @@ void addLLVMDialectToLLVMPass(mlir::PassManager &pm, }); } -void addBoxedProcedurePass(mlir::PassManager &pm) { - addPassConditionally(pm, disableBoxedProcedureRewrite, - [&]() { return fir::createBoxedProcedurePass(); }); +void addBoxedProcedurePass(mlir::PassManager &pm, + bool enableRuntimeTrampolineFromConfig) { + addPassConditionally(pm, disableBoxedProcedureRewrite, [&]() { + fir::BoxedProcedurePassOptions opts; + // Support both the frontend -fruntime-trampoline flag (via config) + // and the cl::opt --runtime-trampoline (for 
fir-opt/tco tools). + opts.useRuntimeTrampoline = + enableRuntimeTrampolineFromConfig || enableRuntimeTrampoline; + return fir::createBoxedProcedurePass(opts); + }); } void addExternalNameConversionPass(mlir::PassManager &pm, @@ -371,7 +378,7 @@ void createDefaultFIRCodeGenPassPipeline(mlir::PassManager &pm, MLIRToLLVMPassPipelineConfig config, llvm::StringRef inputFilename) { pm.addPass(fir::createMIFOpConversion()); - fir::addBoxedProcedurePass(pm); + fir::addBoxedProcedurePass(pm, config.EnableRuntimeTrampoline); if (config.OptLevel.isOptimizingForSpeed() && config.AliasAnalysis && !disableFirAliasTags && !useOldAliasTags) pm.addPass(fir::createAddAliasTags()); diff --git a/flang/test/Driver/fruntime-trampoline.f90 b/flang/test/Driver/fruntime-trampoline.f90 new file mode 100644 index 0000000000000..4feff99bb3f98 --- /dev/null +++ b/flang/test/Driver/fruntime-trampoline.f90 @@ -0,0 +1,15 @@ +! Test that -fruntime-trampoline is properly forwarded from driver to +! frontend, and that -fno-runtime-trampoline (default) works. + +! REQUIRES: x86-registered-target + +! RUN: %flang -### -fruntime-trampoline %s -o %t 2>&1 | FileCheck %s --check-prefix=CHECK-ON +! RUN: %flang -### -fno-runtime-trampoline %s -o %t 2>&1 | FileCheck %s --check-prefix=CHECK-OFF +! RUN: %flang -### %s -o %t 2>&1 | FileCheck %s --check-prefix=CHECK-DEFAULT + +! CHECK-ON: "-fruntime-trampoline" +! CHECK-OFF-NOT: "-fruntime-trampoline" +! 
CHECK-DEFAULT-NOT: "-fruntime-trampoline" + +program dummy +end program diff --git a/flang/test/Fir/boxproc-runtime-trampoline.fir b/flang/test/Fir/boxproc-runtime-trampoline.fir new file mode 100644 index 0000000000000..6dd061dce420e --- /dev/null +++ b/flang/test/Fir/boxproc-runtime-trampoline.fir @@ -0,0 +1,67 @@ +// RUN: fir-opt --boxed-procedure="use-runtime-trampoline=true" %s | FileCheck %s + +// Test that the --boxed-procedure pass with use-runtime-trampoline=true +// generates calls to _FortranATrampolineInit, _FortranATrampolineAdjust, +// and _FortranATrampolineFree instead of llvm.init.trampoline and +// llvm.adjust.trampoline intrinsics. + +// CHECK-LABEL: func.func @_QPtest_proc_dummy() +// CHECK: fir.zero_bits !fir.ref<i8> +// CHECK: fir.convert {{.*}} : {{.*}} -> !fir.ref<i8> +// CHECK: fir.convert {{.*}} : {{.*}} -> !fir.ref<i8> +// CHECK: fir.convert {{.*}} : {{.*}} -> !fir.llvm_ptr<i8> +// CHECK: fir.convert {{.*}} : {{.*}} -> !fir.llvm_ptr<i8> +// CHECK: fir.convert {{.*}} : {{.*}} -> !fir.llvm_ptr<i8> +// CHECK: %[[HANDLE:.*]] = fir.call @_FortranATrampolineInit({{.*}}) : (!fir.llvm_ptr<i8>, !fir.llvm_ptr<i8>, !fir.llvm_ptr<i8>) -> !fir.llvm_ptr<i8> +// CHECK: %[[ADDR:.*]] = fir.call @_FortranATrampolineAdjust(%[[HANDLE]]) : (!fir.llvm_ptr<i8>) -> !fir.llvm_ptr<i8> +// CHECK: %[[FPTR:.*]] = fir.convert %[[ADDR]] +// CHECK: fir.call @_QPtest_proc_dummy_other(%[[FPTR]]) +// CHECK: fir.call @_FortranATrampolineFree(%[[HANDLE]]) +// CHECK: return + +func.func @_QPtest_proc_dummy() { + %c0_i32 = arith.constant 0 : i32 + %c1_i32 = arith.constant 1 : i32 + %c-1_i32 = arith.constant -1 : i32 + %c5_i32 = arith.constant 5 : i32 + %0 = fir.alloca i32 {bindc_name = "i", uniq_name = "_QFtest_proc_dummyEi"} + %1 = fir.alloca tuple<!fir.ref<i32>> + %2 = fir.coordinate_of %1, %c0_i32 : (!fir.ref<tuple<!fir.ref<i32>>>, i32) -> !fir.llvm_ptr<!fir.ref<i32>> + fir.store %0 to %2 : !fir.llvm_ptr<!fir.ref<i32>> + fir.store %c1_i32 to %0 : !fir.ref<i32> + %3 = 
fir.address_of(@_QFtest_proc_dummyPtest_proc_dummy_a) : (!fir.ref<i32>, !fir.ref<tuple<!fir.ref<i32>>>) -> () + %4 = fir.emboxproc %3, %1 : ((!fir.ref<i32>, !fir.ref<tuple<!fir.ref<i32>>>) -> (), !fir.ref<tuple<!fir.ref<i32>>>) -> !fir.boxproc<() -> ()> + fir.call @_QPtest_proc_dummy_other(%4) : (!fir.boxproc<() -> ()>) -> () + %5 = fir.address_of(@_QQclX2E2F682E66393000) : !fir.ref<!fir.char<1,8>> + %6 = fir.convert %5 : (!fir.ref<!fir.char<1,8>>) -> !fir.ref<i8> + %7 = fir.call @_FortranAioBeginExternalListOutput(%c-1_i32, %6, %c5_i32) : (i32, !fir.ref<i8>, i32) -> !fir.ref<i8> + %8 = fir.load %0 : !fir.ref<i32> + %9 = fir.call @_FortranAioOutputInteger32(%7, %8) : (!fir.ref<i8>, i32) -> i1 + %10 = fir.call @_FortranAioEndIoStatement(%7) : (!fir.ref<i8>) -> i32 + return +} +func.func @_QFtest_proc_dummyPtest_proc_dummy_a(%arg0: !fir.ref<i32> {fir.bindc_name = "j"}, %arg1: !fir.ref<tuple<!fir.ref<i32>>> {fir.host_assoc}) { + %c0_i32 = arith.constant 0 : i32 + %0 = fir.coordinate_of %arg1, %c0_i32 : (!fir.ref<tuple<!fir.ref<i32>>>, i32) -> !fir.llvm_ptr<!fir.ref<i32>> + %1 = fir.load %0 : !fir.llvm_ptr<!fir.ref<i32>> + %2 = fir.load %1 : !fir.ref<i32> + %3 = fir.load %arg0 : !fir.ref<i32> + %4 = arith.addi %2, %3 : i32 + fir.store %4 to %1 : !fir.ref<i32> + return +} +func.func @_QPtest_proc_dummy_other(%arg0: !fir.boxproc<() -> ()>) { + %c4_i32 = arith.constant 4 : i32 + %0 = fir.alloca i32 {adapt.valuebyref} + fir.store %c4_i32 to %0 : !fir.ref<i32> + %1 = fir.box_addr %arg0 : (!fir.boxproc<() -> ()>) -> ((!fir.ref<i32>) -> ()) + fir.call %1(%0) : (!fir.ref<i32>) -> () + return +} +fir.global linkonce @_QQclX2E2F682E66393000 constant : !fir.char<1,8> { + %0 = fir.string_lit "./h.f90\00"(8) : !fir.char<1,8> + fir.has_value %0 : !fir.char<1,8> +} +func.func private @_FortranAioOutputInteger32(!fir.ref<i8>, i32) -> i1 attributes {fir.io, fir.runtime} +func.func private @_FortranAioBeginExternalListOutput(i32, !fir.ref<i8>, i32) -> !fir.ref<i8> attributes {fir.io, 
fir.runtime} +func.func private @_FortranAioEndIoStatement(!fir.ref<i8>) -> i32 attributes {fir.io, fir.runtime} diff --git a/flang/test/Lower/runtime-trampoline.f90 b/flang/test/Lower/runtime-trampoline.f90 new file mode 100644 index 0000000000000..a197f5b0b05bf --- /dev/null +++ b/flang/test/Lower/runtime-trampoline.f90 @@ -0,0 +1,41 @@ +! RUN: %flang -fruntime-trampoline -emit-llvm -S -o - %s | FileCheck %s +! +! Test that -fruntime-trampoline generates calls to the runtime +! trampoline pool instead of stack-based trampolines. + +! CHECK: call {{.*}}@_FortranATrampolineInit +! CHECK: call {{.*}}@_FortranATrampolineAdjust +! CHECK: call {{.*}}@_FortranATrampolineFree + +module other + abstract interface + function callback() + integer :: callback + end function callback + end interface + contains + subroutine foo(fptr) + procedure(callback), pointer :: fptr + print *, fptr() + end subroutine foo +end module other + +subroutine host(local) + use other + integer :: local + procedure(callback), pointer :: fptr + fptr => callee + call foo(fptr) + return + + contains + + function callee() + integer :: callee + callee = local + end function callee +end subroutine host + +program main + call host(10) +end program main _______________________________________________ cfe-commits mailing list [email protected] https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
