https://github.com/Saieiei updated 
https://github.com/llvm/llvm-project/pull/183108

>From 540e92d408de1d1b30ce083df748ecd28e5b8336 Mon Sep 17 00:00:00 2001
From: Sairudra More <[email protected]>
Date: Tue, 24 Feb 2026 11:21:17 -0600
Subject: [PATCH] [flang] Add runtime trampoline pool for W^X compliance

Replace LLVM's llvm.init.trampoline / llvm.adjust.trampoline intrinsics
with a runtime-managed trampoline pool that keeps the stack non-executable,
satisfying W^X (Write XOR Execute) policies enforced by modern toolchains
and security-hardened kernels.

The new -fruntime-trampoline flag activates this path in
BoxedProcedure, which emits calls to _FortranATrampolineInit,
_FortranATrampolineAdjust, and _FortranATrampolineFree instead of
the legacy LLVM trampoline intrinsics.

Closes #182813
---
 clang/include/clang/Options/Options.td        |   5 +
 clang/lib/Driver/ToolChains/Flang.cpp         |  14 +
 .../include/flang-rt/runtime/trampoline.h     |  69 +++
 flang-rt/lib/runtime/CMakeLists.txt           |   1 +
 flang-rt/lib/runtime/trampoline.cpp           | 425 ++++++++++++++++
 .../Driver/runtime-trampoline-gnustack.f90    |  45 ++
 flang/docs/RuntimeEnvironment.md              |  12 +
 .../include/flang/Frontend/CodeGenOptions.def |   1 +
 .../Optimizer/Builder/Runtime/RTBuilder.h     |   4 +
 .../Optimizer/Builder/Runtime/Trampoline.h    |  47 ++
 .../flang/Optimizer/CodeGen/CGPasses.td       |  16 +-
 .../flang/Optimizer/Passes/CommandLineOpts.h  |   1 +
 .../flang/Optimizer/Passes/Pipelines.h        |   3 +-
 flang/include/flang/Runtime/trampoline.h      |  69 +++
 flang/include/flang/Tools/CrossToolHelpers.h  |   2 +
 flang/lib/Frontend/CompilerInvocation.cpp     |   4 +
 flang/lib/Optimizer/Builder/CMakeLists.txt    |   1 +
 .../Optimizer/Builder/Runtime/Trampoline.cpp  |  49 ++
 .../lib/Optimizer/CodeGen/BoxedProcedure.cpp  | 464 ++++++++++--------
 .../lib/Optimizer/Passes/CommandLineOpts.cpp  |   2 +
 flang/lib/Optimizer/Passes/Pipelines.cpp      |  15 +-
 flang/test/Driver/fruntime-trampoline.f90     |  15 +
 flang/test/Fir/boxproc-runtime-trampoline.fir |  67 +++
 flang/test/Lower/runtime-trampoline.f90       |  41 ++
 24 files changed, 1170 insertions(+), 202 deletions(-)
 create mode 100644 flang-rt/include/flang-rt/runtime/trampoline.h
 create mode 100644 flang-rt/lib/runtime/trampoline.cpp
 create mode 100644 flang-rt/test/Driver/runtime-trampoline-gnustack.f90
 create mode 100644 flang/include/flang/Optimizer/Builder/Runtime/Trampoline.h
 create mode 100644 flang/include/flang/Runtime/trampoline.h
 create mode 100644 flang/lib/Optimizer/Builder/Runtime/Trampoline.cpp
 create mode 100644 flang/test/Driver/fruntime-trampoline.f90
 create mode 100644 flang/test/Fir/boxproc-runtime-trampoline.fir
 create mode 100644 flang/test/Lower/runtime-trampoline.f90

diff --git a/clang/include/clang/Options/Options.td 
b/clang/include/clang/Options/Options.td
index 4ac812e92e2cb..859292d3fc6ab 100644
--- a/clang/include/clang/Options/Options.td
+++ b/clang/include/clang/Options/Options.td
@@ -7567,6 +7567,11 @@ defm stack_arrays : BoolOptionWithoutMarshalling<"f", 
"stack-arrays",
   PosFlag<SetTrue, [], [ClangOption], "Attempt to allocate array temporaries 
on the stack, no matter their size">,
   NegFlag<SetFalse, [], [ClangOption], "Allocate array temporaries on the heap 
(default)">>;
 
+defm runtime_trampoline : BoolOptionWithoutMarshalling<"f",
+  "runtime-trampoline",
+  PosFlag<SetTrue, [], [ClangOption], "Use W^X compliant runtime trampoline 
pool for internal procedures">,
+  NegFlag<SetFalse, [], [ClangOption], "Use stack-based trampolines for 
internal procedures (default)">>;
+
 defm loop_versioning : BoolOptionWithoutMarshalling<"f", 
"version-loops-for-stride",
   PosFlag<SetTrue, [], [ClangOption], "Create unit-strided versions of loops">,
    NegFlag<SetFalse, [], [ClangOption], "Do not create unit-strided loops 
(default)">>;
diff --git a/clang/lib/Driver/ToolChains/Flang.cpp 
b/clang/lib/Driver/ToolChains/Flang.cpp
index 8425f8fec62a4..c90a2d1a5d947 100644
--- a/clang/lib/Driver/ToolChains/Flang.cpp
+++ b/clang/lib/Driver/ToolChains/Flang.cpp
@@ -203,6 +203,20 @@ void Flang::addCodegenOptions(const ArgList &Args,
       !stackArrays->getOption().matches(options::OPT_fno_stack_arrays))
     CmdArgs.push_back("-fstack-arrays");
 
+  if (Args.hasFlag(options::OPT_fruntime_trampoline,
+                   options::OPT_fno_runtime_trampoline, false)) {
+    const llvm::Triple &T = getToolChain().getTriple();
+    if (T.getArch() == llvm::Triple::x86_64 ||
+        T.getArch() == llvm::Triple::aarch64 ||
+        T.getArch() == llvm::Triple::aarch64_be) {
+      CmdArgs.push_back("-fruntime-trampoline");
+    } else {
+      getToolChain().getDriver().Diag(
+          diag::warn_drv_unsupported_option_for_target)
+          << "-fruntime-trampoline" << T.str();
+    }
+  }
+
   // -fno-protect-parens is the default for -Ofast.
   if (!Args.hasFlag(options::OPT_fprotect_parens,
                     options::OPT_fno_protect_parens,
diff --git a/flang-rt/include/flang-rt/runtime/trampoline.h 
b/flang-rt/include/flang-rt/runtime/trampoline.h
new file mode 100644
index 0000000000000..3b3ddff7a0587
--- /dev/null
+++ b/flang-rt/include/flang-rt/runtime/trampoline.h
@@ -0,0 +1,69 @@
+//===-- flang-rt/runtime/trampoline.h ----------------------------*- 
C++-*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM 
Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Internal declarations for the W^X-compliant trampoline pool.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef FLANG_RT_RUNTIME_TRAMPOLINE_H_
+#define FLANG_RT_RUNTIME_TRAMPOLINE_H_
+
+#include <cstddef>
+#include <cstdint>
+
+namespace Fortran::runtime::trampoline {
+
+/// Per-trampoline data entry. Stored in a writable (non-executable) region.
+/// Each entry is paired with a trampoline code stub in the executable region.
+struct TrampolineData {
+  const void *calleeAddress;
+  const void *staticChainAddress;
+};
+
+/// Default number of trampoline slots in the pool.
+/// Can be overridden via FLANG_TRAMPOLINE_POOL_SIZE environment variable.
+constexpr std::size_t kDefaultPoolSize = 1024;
+
+/// Size of each trampoline code stub in bytes (platform-specific).
+#if defined(__x86_64__) || defined(_M_X64)
+// x86-64 trampoline stub (kept in sync with generateStubsX86_64 in
+// trampoline.cpp). Each slot's stub embeds the absolute address of its
+// paired TDATA entry as a 64-bit immediate, loads the static chain from
+// that entry, and performs an indirect jump to the callee:
+//   movabsq $tdata_entry, %r11       # load TDATA entry address
+//   movq    8(%r11), %r10            # load static chain -> %r10 (nest)
+//   jmpq    *(%r11)                  # jump to callee address
+// That is 17 encoded bytes; the rest of the slot is padded with INT3 so
+// a stray jump into padding traps immediately. A %rip-relative leaq is
+// not used because the mmap'd code region and the heap-allocated data
+// region are not guaranteed to lie within a 32-bit displacement.
+constexpr std::size_t kTrampolineStubSize = 32;
+constexpr int kNestRegister = 10; // %r10 is the nest/static chain register
+#elif defined(__aarch64__) || defined(_M_ARM64)
+// AArch64 trampoline stub:
+//   ldr x17, .Ldata_addr            # load TDATA entry address (literal)
+//   ldr x18, [x17, #8]              # load static chain
+//   ldr x17, [x17]                  # load callee address
+//   br x17
+constexpr std::size_t kTrampolineStubSize = 32;
+constexpr int kNestRegister = 18; // x18: static chain (nest) register used by the stubs. NOTE(review): x18 is the reserved platform register on some ABIs (e.g. Apple, Windows) — confirm target ABI compatibility.
+#elif defined(__powerpc64__) || defined(__ppc64__)
+constexpr std::size_t kTrampolineStubSize = 48;
+constexpr int kNestRegister = 11; // r11
+#else
+// Fallback: generous size
+constexpr std::size_t kTrampolineStubSize = 64;
+constexpr int kNestRegister = 0;
+#endif
+
+/// Alignment requirement for trampoline code stubs.
+constexpr std::size_t kTrampolineAlignment = 16;
+
+} // namespace Fortran::runtime::trampoline
+
+#endif // FLANG_RT_RUNTIME_TRAMPOLINE_H_
diff --git a/flang-rt/lib/runtime/CMakeLists.txt 
b/flang-rt/lib/runtime/CMakeLists.txt
index 9fa8376e9b99c..d5e89a169255c 100644
--- a/flang-rt/lib/runtime/CMakeLists.txt
+++ b/flang-rt/lib/runtime/CMakeLists.txt
@@ -88,6 +88,7 @@ set(host_sources
   stop.cpp
   temporary-stack.cpp
   time-intrinsic.cpp
+  trampoline.cpp
   unit-map.cpp
 )
 if (TARGET llvm-libc-common-utilities)
diff --git a/flang-rt/lib/runtime/trampoline.cpp 
b/flang-rt/lib/runtime/trampoline.cpp
new file mode 100644
index 0000000000000..a1f6fa2cea783
--- /dev/null
+++ b/flang-rt/lib/runtime/trampoline.cpp
@@ -0,0 +1,425 @@
+//===-- lib/runtime/trampoline.cpp -------------------------------*- 
C++-*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM 
Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// W^X-compliant trampoline pool implementation.
+//
+// This file implements a runtime trampoline pool that maintains separate
+// memory regions for executable code (RX) and writable data (RW).
+//
+// On Linux the code region transitions RW → RX (never simultaneously W+X).
+// On macOS Apple Silicon the code region uses MAP_JIT with per-thread W^X
+// toggling via pthread_jit_write_protect_np, so the mapping permissions
+// include both W and X but hardware enforces that only one is active at
+// a time on any given thread.
+//
+// Architecture:
+//   - Code region (RX): Contains pre-assembled trampoline stubs that load
+//     callee address and static chain from a paired TDATA entry, then jump
+//     to the callee with the static chain in the appropriate register.
+//   - Data region (RW): Contains TrampolineData entries with {callee_address,
+//     static_chain_address} pairs, one per trampoline slot.
+//   - Free list: Tracks available trampoline slots for O(1) alloc/free.
+//
+// Thread safety: Uses Fortran::runtime::Lock (pthreads on POSIX,
+// CRITICAL_SECTION on Windows) — not std::mutex — to avoid C++ runtime
+// library dependence. A single global lock serializes pool operations.
+// This is a deliberate V1 design choice to keep the initial W^X
+// architectural change minimal. Per-thread lock-free pools are deferred
+// to a future optimization patch.
+//
+// AddressSanitizer note: The trampoline code region is allocated via
+// mmap (not malloc/new), so ASan does not track it. The data region
+// and handles are allocated via malloc (through AllocateMemoryOrCrash),
+// which ASan intercepts normally. No special annotations are needed.
+//
+// See flang/docs/InternalProcedureTrampolines.md for design details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "flang/Runtime/trampoline.h"
+#include "flang-rt/runtime/lock.h"
+#include "flang-rt/runtime/memory.h"
+#include "flang-rt/runtime/terminator.h"
+#include "flang-rt/runtime/trampoline.h"
+
+#include <cassert>
+#include <cstdint>
+#include <cstdlib>
+#include <cstring>
+#include <new> // For placement-new only (no operator new/delete dependency)
+
+// Platform-specific headers for memory mapping.
+#if defined(_WIN32)
+#include <windows.h>
+#else
+#include <sys/mman.h>
+#include <unistd.h>
+#endif
+
+// macOS Apple Silicon requires MAP_JIT and pthread_jit_write_protect_np
+// to create executable memory under the hardened runtime.
+#if defined(__APPLE__) && defined(__aarch64__)
+#include <libkern/OSCacheControl.h>
+#include <pthread.h>
+#endif
+
+// Architecture support check. Stub generators exist only for x86-64 and
+// AArch64. On other architectures the file compiles but the runtime API
+// functions crash with a diagnostic if actually called, so that building
+// flang-rt on e.g. RISC-V or PPC64 never fails.
+#if defined(__x86_64__) || defined(_M_X64) || defined(__aarch64__) || \
+    defined(_M_ARM64)
+#define TRAMPOLINE_ARCH_SUPPORTED 1
+#else
+#define TRAMPOLINE_ARCH_SUPPORTED 0
+#endif
+
+namespace Fortran::runtime::trampoline {
+
+/// A handle returned to the caller. Contains enough info to find
+/// both the trampoline stub and its data entry.
+struct TrampolineHandle {
+  void *codePtr; // Pointer to the trampoline stub in the RX region.
+  TrampolineData *dataPtr; // Pointer to the data entry in the RW region.
+  std::size_t slotIndex; // Index in the pool for free-list management.
+};
+
+// Namespace-scope globals following Flang runtime conventions:
+// - Lock is trivially constructible (pthread_mutex_t / CRITICAL_SECTION)
+// - Pool pointer starts null; initialized under lock. NOTE(review): the unlocked fast-path read of poolInstance in instance() is a data race under the C++ memory model — consider an atomic pointer or a call-once idiom.
+class TrampolinePool; // Forward declaration for pointer below.
+static Lock poolLock;
+static TrampolinePool *poolInstance{nullptr};
+
+/// The global trampoline pool.
+class TrampolinePool {
+public:
+  static TrampolinePool &instance() {
+    if (poolInstance) {
+      return *poolInstance;
+    }
+    CriticalSection critical{poolLock};
+    if (poolInstance) {
+      return *poolInstance;
+    }
+    // Allocate pool using malloc + placement new (trivial constructor).
+    Terminator terminator{__FILE__, __LINE__};
+    void *storage = AllocateMemoryOrCrash(terminator, sizeof(TrampolinePool));
+    poolInstance = new (storage) TrampolinePool();
+    return *poolInstance;
+  }
+
+  /// Allocate a trampoline slot and initialize it.
+  TrampolineHandle *allocate(
+      const void *calleeAddress, const void *staticChainAddress) {
+    CriticalSection critical{lock_};
+    ensureInitialized();
+
+    if (freeHead_ == kInvalidIndex) {
+      // Pool exhausted — fixed size by design for V1.
+      // The pool capacity is controlled by FLANG_TRAMPOLINE_POOL_SIZE
+      // (default 1024). Dynamic slab growth can be added in a follow-up
+      // patch if real workloads demonstrate a need for it.
+      Terminator terminator{__FILE__, __LINE__};
+      terminator.Crash("Trampoline pool exhausted (max %zu slots). "
+                       "Set FLANG_TRAMPOLINE_POOL_SIZE to increase.",
+          poolSize_);
+    }
+
+    std::size_t index = freeHead_;
+    freeHead_ = freeList_[index];
+
+    // Initialize the data entry.
+    dataRegion_[index].calleeAddress = calleeAddress;
+    dataRegion_[index].staticChainAddress = staticChainAddress;
+
+    // Create handle using malloc + placement new.
+    Terminator terminator{__FILE__, __LINE__};
+    void *mem = AllocateMemoryOrCrash(terminator, sizeof(TrampolineHandle));
+    auto *handle = new (mem) TrampolineHandle();
+    handle->codePtr =
+        static_cast<char *>(codeRegion_) + index * kTrampolineStubSize;
+    handle->dataPtr = &dataRegion_[index];
+    handle->slotIndex = index;
+
+    return handle;
+  }
+
+  /// Get the callable address of a trampoline.
+  void *getCallableAddress(TrampolineHandle *handle) { return handle->codePtr; 
}
+
+  /// Free a trampoline slot.
+  void free(TrampolineHandle *handle) {
+    CriticalSection critical{lock_};
+
+    std::size_t index = handle->slotIndex;
+
+    // Poison the data entry so that any dangling call through a freed
+    // trampoline faults immediately. We use a non-null, obviously-invalid
+    // address (all-ones minus one, 0xFFFF...FFFE) so the resulting fault
+    // is distinguishable from a null-pointer dereference when debugging.
+    dataRegion_[index].calleeAddress = reinterpret_cast<const void *>(
+        static_cast<uintptr_t>(~uintptr_t{0} - 1));
+    dataRegion_[index].staticChainAddress = nullptr;
+
+    // Return slot to free list.
+    freeList_[index] = freeHead_;
+    freeHead_ = index;
+
+    FreeMemory(handle);
+  }
+
+private:
+  static constexpr std::size_t kInvalidIndex = ~std::size_t{0};
+
+  TrampolinePool() = default;
+
+  void ensureInitialized() {
+    if (initialized_)
+      return;
+    initialized_ = true;
+
+    // Check environment variable for pool size override.
+    // Fixed-size pool by design (V1): avoids complexity of dynamic growth
+    // and re-protection of code pages. The default (1024 slots) is
+    // sufficient for typical Fortran programs. Users can override via:
+    //   export FLANG_TRAMPOLINE_POOL_SIZE=4096
+    poolSize_ = kDefaultPoolSize;
+    if (const char *envSize = std::getenv("FLANG_TRAMPOLINE_POOL_SIZE")) {
+      long val = std::strtol(envSize, nullptr, 10);
+      if (val > 0)
+        poolSize_ = static_cast<std::size_t>(val);
+    }
+
+    // Allocate the data region (RW).
+    dataRegion_ = static_cast<TrampolineData *>(
+        std::calloc(poolSize_, sizeof(TrampolineData)));
+    assert(dataRegion_ && "Failed to allocate trampoline data region");
+
+    // Allocate the code region (initially RW for writing stubs, then RX).
+    std::size_t codeSize = poolSize_ * kTrampolineStubSize;
+#if defined(_WIN32)
+    codeRegion_ = VirtualAlloc(
+        nullptr, codeSize, MEM_COMMIT | MEM_RESERVE, PAGE_READWRITE);
+#elif defined(__APPLE__) && defined(__aarch64__)
+    // macOS Apple Silicon: MAP_JIT is required for pages that will become
+    // executable. Use pthread_jit_write_protect_np to toggle W↔X.
+    codeRegion_ = mmap(nullptr, codeSize, PROT_READ | PROT_WRITE | PROT_EXEC,
+        MAP_PRIVATE | MAP_ANONYMOUS | MAP_JIT, -1, 0);
+    if (codeRegion_ == MAP_FAILED)
+      codeRegion_ = nullptr;
+    if (codeRegion_) {
+      // Enable writing on this thread (MAP_JIT defaults to execute).
+      pthread_jit_write_protect_np(0); // 0 = writable
+    }
+#else
+    codeRegion_ = mmap(nullptr, codeSize, PROT_READ | PROT_WRITE,
+        MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+    if (codeRegion_ == MAP_FAILED)
+      codeRegion_ = nullptr;
+#endif
+    assert(codeRegion_ && "Failed to allocate trampoline code region");
+
+    // Generate trampoline stubs.
+    generateStubs();
+
+    // Flush instruction cache. Required on architectures with non-coherent
+    // I-cache/D-cache (AArch64, PPC, etc.). On x86-64 this is a no-op
+    // but harmless. Without this, AArch64 may execute stale instructions.
+#if defined(__APPLE__) && defined(__aarch64__)
+    // On macOS, use sys_icache_invalidate (from libkern/OSCacheControl.h).
+    sys_icache_invalidate(codeRegion_, codeSize);
+#elif defined(_WIN32)
+    FlushInstructionCache(GetCurrentProcess(), codeRegion_, codeSize);
+#else
+    __builtin___clear_cache(static_cast<char *>(codeRegion_),
+        static_cast<char *>(codeRegion_) + codeSize);
+#endif
+
+    // Make code region executable and non-writable (W^X).
+#if defined(_WIN32)
+    DWORD oldProtect;
+    VirtualProtect(codeRegion_, codeSize, PAGE_EXECUTE_READ, &oldProtect);
+#elif defined(__APPLE__) && defined(__aarch64__)
+    // Switch back to execute-only (MAP_JIT manages per-thread W^X).
+    pthread_jit_write_protect_np(1); // 1 = executable
+#else
+    mprotect(codeRegion_, codeSize, PROT_READ | PROT_EXEC);
+#endif
+
+    // Initialize free list.
+    freeList_ = static_cast<std::size_t *>(
+        std::malloc(poolSize_ * sizeof(std::size_t)));
+    assert(freeList_ && "Failed to allocate trampoline free list");
+
+    for (std::size_t i = 0; i < poolSize_ - 1; ++i)
+      freeList_[i] = i + 1;
+    freeList_[poolSize_ - 1] = kInvalidIndex;
+    freeHead_ = 0;
+  }
+
+  /// Generate platform-specific trampoline stubs in the code region.
+  /// Each stub loads callee address and static chain from its paired
+  /// TDATA entry and jumps to the callee.
+  void generateStubs() {
+#if defined(__x86_64__) || defined(_M_X64)
+    generateStubsX86_64();
+#elif defined(__aarch64__) || defined(_M_ARM64)
+    generateStubsAArch64();
+#else
+    // Unsupported architecture — should never be reached because the
+    // extern "C" API functions guard with TRAMPOLINE_ARCH_SUPPORTED.
+    // Fill with trap bytes as a safety net.
+    std::memset(codeRegion_, 0, poolSize_ * kTrampolineStubSize);
+#endif
+  }
+
+#if defined(__x86_64__) || defined(_M_X64)
+  /// Generate x86-64 trampoline stubs.
+  ///
+  /// Each stub does:
+  ///   movabsq $dataEntry, %r11         ; load TDATA entry address
+  ///   movq    8(%r11), %r10            ; load static chain -> nest register
+  ///   jmpq    *(%r11)                  ; jump to callee address
+  ///
+  /// Total: 10 + 4 + 3 = 17 bytes, padded to kTrampolineStubSize.
+  void generateStubsX86_64() {
+    auto *code = static_cast<uint8_t *>(codeRegion_);
+
+    for (std::size_t i = 0; i < poolSize_; ++i) {
+      uint8_t *stub = code + i * kTrampolineStubSize;
+
+      // Address of the corresponding TDATA entry.
+      auto dataAddr = reinterpret_cast<uint64_t>(&dataRegion_[i]);
+
+      std::size_t off = 0;
+
+      // movabsq $dataAddr, %r11    (REX.W + B, opcode 0xBB for r11)
+      stub[off++] = 0x49; // REX.WB
+      stub[off++] = 0xBB; // MOV r11, imm64
+      std::memcpy(&stub[off], &dataAddr, 8);
+      off += 8;
+
+      // movq 8(%r11), %r10         (load staticChainAddress into r10)
+      stub[off++] = 0x4D; // REX.WRB
+      stub[off++] = 0x8B; // MOV r/m64 -> r64
+      stub[off++] = 0x53; // ModRM: [r11 + disp8], r10
+      stub[off++] = 0x08; // disp8 = 8
+
+      // jmpq *(%r11)               (jump to calleeAddress)
+      stub[off++] = 0x41; // REX.B
+      stub[off++] = 0xFF; // JMP r/m64
+      stub[off++] = 0x23; // ModRM: [r11], opcode extension 4
+
+      // Pad the rest with INT3 (0xCC) for safety.
+      while (off < kTrampolineStubSize)
+        stub[off++] = 0xCC;
+    }
+  }
+#endif
+
+#if defined(__aarch64__) || defined(_M_ARM64)
+  /// Generate AArch64 trampoline stubs.
+  ///
+  /// Each stub does:
+  ///   ldr x17, .Ldata_addr         ; load TDATA entry address
+  ///   ldr x18, [x17, #8]           ; load static chain -> x18 (nest reg)
+  ///   ldr x17, [x17]               ; load callee address
+  ///   br  x17                      ; jump to callee
+  ///   .Ldata_addr:
+  ///     .quad <address of dataRegion_[i]>
+  ///
+  /// Total: 4*4 + 8 = 24 bytes, padded to kTrampolineStubSize.
+  void generateStubsAArch64() {
+    auto *code = static_cast<uint8_t *>(codeRegion_);
+
+    for (std::size_t i = 0; i < poolSize_; ++i) {
+      auto *stub = reinterpret_cast<uint32_t *>(code + i * 
kTrampolineStubSize);
+
+      // Address of the corresponding TDATA entry.
+      auto dataAddr = reinterpret_cast<uint64_t>(&dataRegion_[i]);
+
+      // ldr x17, .Ldata_addr (PC-relative load, offset = 4 instructions = 16
+      // bytes) LDR (literal): opc=01, V=0, imm19=(16/4)=4, Rt=17
+      stub[0] = 0x58000091; // ldr x17, #16  (imm19=4, shifted left 2 = 16)
+                            // Encoding: 0101 1000 0000 0000 0000 0000 1001 
0001
+
+      // ldr x18, [x17, #8]  (load static chain)
+      // LDR (unsigned offset): size=11, V=0, opc=01, imm12=1(×8), Rn=17, Rt=18
+      stub[1] = 0xF9400632; // ldr x18, [x17, #8]
+
+      // ldr x17, [x17]      (load callee address)
+      // LDR (unsigned offset): size=11, V=0, opc=01, imm12=0, Rn=17, Rt=17
+      stub[2] = 0xF9400231; // ldr x17, [x17, #0]
+
+      // br x17
+      stub[3] = 0xD61F0220; // br x17
+
+      // .Ldata_addr: .quad dataRegion_[i]
+      std::memcpy(&stub[4], &dataAddr, 8);
+
+      // Pad remaining with BRK #0 (trap) for safety.
+      std::size_t usedWords = 4 + 2; // 4 instructions + 1 quad (2 words)
+      for (std::size_t w = usedWords;
+          w < kTrampolineStubSize / sizeof(uint32_t); ++w)
+        stub[w] = 0xD4200000; // brk #0
+    }
+  }
+#endif
+
+  Lock lock_;
+  bool initialized_{false};
+  std::size_t poolSize_{0};
+
+  void *codeRegion_{nullptr}; // RX after initialization
+  TrampolineData *dataRegion_{nullptr}; // RW always
+  std::size_t *freeList_{nullptr}; // Intrusive free list
+  std::size_t freeHead_{kInvalidIndex};
+};
+
+} // namespace Fortran::runtime::trampoline
+
+namespace Fortran::runtime {
+extern "C" {
+
+// Helper: crash with a clear message on unsupported architectures.
+// This is only reached if -fruntime-trampoline was used on a target
+// that lacks stub generators. The driver should emit a warning and
+// ignore the flag on unsupported architectures, but the runtime
+// provides a safety net.
+static inline void crashIfUnsupported() {
+#if !TRAMPOLINE_ARCH_SUPPORTED
+  Terminator terminator{__FILE__, __LINE__};
+  terminator.Crash("Runtime trampolines are not supported on this "
+                   "architecture. Remove -fruntime-trampoline "
+                   "or use the legacy stack-trampoline path.");
+#endif
+}
+
+void *RTDEF(TrampolineInit)(
+    void *scratch, const void *calleeAddress, const void *staticChainAddress) {
+  crashIfUnsupported();
+  auto &pool = trampoline::TrampolinePool::instance();
+  return pool.allocate(calleeAddress, staticChainAddress);
+}
+
+void *RTDEF(TrampolineAdjust)(void *handle) {
+  crashIfUnsupported();
+  auto &pool = trampoline::TrampolinePool::instance();
+  return pool.getCallableAddress(
+      static_cast<trampoline::TrampolineHandle *>(handle));
+}
+
+void RTDEF(TrampolineFree)(void *handle) {
+  crashIfUnsupported();
+  auto &pool = trampoline::TrampolinePool::instance();
+  pool.free(static_cast<trampoline::TrampolineHandle *>(handle));
+}
+
+} // extern "C"
+} // namespace Fortran::runtime
diff --git a/flang-rt/test/Driver/runtime-trampoline-gnustack.f90 
b/flang-rt/test/Driver/runtime-trampoline-gnustack.f90
new file mode 100644
index 0000000000000..ee55f28691bf2
--- /dev/null
+++ b/flang-rt/test/Driver/runtime-trampoline-gnustack.f90
@@ -0,0 +1,45 @@
+! UNSUPPORTED: system-windows
+! UNSUPPORTED: offload-cuda
+! UNSUPPORTED: system-darwin
+
+! Verify that -fruntime-trampoline produces an executable whose
+! GNU_STACK program header is RW (not RWE), proving W^X compliance.
+! The legacy stack-trampoline path requires an executable stack; the
+! runtime trampoline pool does not.
+
+! RUN: %flang %isysroot -fruntime-trampoline -L"%libdir" %s -o %t
+! RUN: llvm-readelf -lW %t | FileCheck %s
+
+! Ensure GNU_STACK exists and has RW flags (no E).
+! CHECK: GNU_STACK
+! CHECK-SAME: RW
+! CHECK-NOT: RWE
+
+subroutine host_proc(x, res)
+  implicit none
+  integer, intent(in) :: x
+  integer, intent(out) :: res
+
+  interface
+    function f_iface() result(r)
+      integer :: r
+    end function
+  end interface
+
+  procedure(f_iface), pointer :: fptr
+  fptr => inner
+  res = fptr()
+
+contains
+  function inner() result(r)
+    integer :: r
+    r = x + 1
+  end function
+end subroutine
+
+program test_gnustack
+  implicit none
+  integer :: result
+  call host_proc(1, result)
+  print *, result
+end program
diff --git a/flang/docs/RuntimeEnvironment.md b/flang/docs/RuntimeEnvironment.md
index e762ce770fd83..6d70a7ef49746 100644
--- a/flang/docs/RuntimeEnvironment.md
+++ b/flang/docs/RuntimeEnvironment.md
@@ -66,3 +66,15 @@ when output takes place to a sequential unit after
 executing a `BACKSPACE` or `REWIND` statement.
 Truncation of a stream-access unit is common to several other
 compilers, but it is not mentioned in the standard.
+
+## `FLANG_TRAMPOLINE_POOL_SIZE`
+
+Set `FLANG_TRAMPOLINE_POOL_SIZE` to an integer value to control the maximum
+number of runtime trampoline slots available when `-fruntime-trampoline` is
+enabled. Each slot is 32 bytes of executable code backed by a writable data
+entry. The default is 1024 slots, which is sufficient for typical Fortran
+programs. If more internal-procedure closures are alive simultaneously than
+the pool can hold, the runtime terminates with a diagnostic message that
+includes the current pool capacity.
+
+Example: `export FLANG_TRAMPOLINE_POOL_SIZE=4096`
diff --git a/flang/include/flang/Frontend/CodeGenOptions.def 
b/flang/include/flang/Frontend/CodeGenOptions.def
index 05ee0e28bcaa6..2b31047f79cd5 100644
--- a/flang/include/flang/Frontend/CodeGenOptions.def
+++ b/flang/include/flang/Frontend/CodeGenOptions.def
@@ -42,6 +42,7 @@ CODEGENOPT(PrepareForThinLTO , 1, 0) ///< Set when -flto=thin 
is enabled on the
                                      ///< compile step.
 CODEGENOPT(ProtectParens, 1, 1) ///< -fprotect-parens (enable parenthesis 
protection)
 CODEGENOPT(StackArrays, 1, 0) ///< -fstack-arrays (enable the stack-arrays 
pass)
+CODEGENOPT(EnableRuntimeTrampoline, 1, 0) ///< -fruntime-trampoline (W^X 
compliant trampoline pool)
 CODEGENOPT(VectorizeLoop, 1, 0) ///< Enable loop vectorization.
 CODEGENOPT(VectorizeSLP, 1, 0) ///< Enable SLP vectorization.
 CODEGENOPT(InterchangeLoops, 1, 0) ///< Enable loop interchange.
diff --git a/flang/include/flang/Optimizer/Builder/Runtime/RTBuilder.h 
b/flang/include/flang/Optimizer/Builder/Runtime/RTBuilder.h
index 2c74ab29f75e8..89d1aff4b26a8 100644
--- a/flang/include/flang/Optimizer/Builder/Runtime/RTBuilder.h
+++ b/flang/include/flang/Optimizer/Builder/Runtime/RTBuilder.h
@@ -244,6 +244,10 @@ constexpr TypeBuilderFunc getModel<void *>() {
   };
 }
 template <>
+constexpr TypeBuilderFunc getModel<const void *>() {
+  return getModel<void *>();
+}
+template <>
 constexpr TypeBuilderFunc getModel<void (*)(int)>() {
   return [](mlir::MLIRContext *context) -> mlir::Type {
     return fir::LLVMPointerType::get(
diff --git a/flang/include/flang/Optimizer/Builder/Runtime/Trampoline.h 
b/flang/include/flang/Optimizer/Builder/Runtime/Trampoline.h
new file mode 100644
index 0000000000000..45e079818c1fc
--- /dev/null
+++ b/flang/include/flang/Optimizer/Builder/Runtime/Trampoline.h
@@ -0,0 +1,47 @@
+//===-- Trampoline.h - Runtime trampoline pool builder ----------*- C++ 
-*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM 
Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Builder routines for generating calls to the Fortran runtime trampoline
+// pool APIs (_FortranATrampolineInit, _FortranATrampolineAdjust,
+// _FortranATrampolineFree).
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef FORTRAN_OPTIMIZER_BUILDER_RUNTIME_TRAMPOLINE_H
+#define FORTRAN_OPTIMIZER_BUILDER_RUNTIME_TRAMPOLINE_H
+
+namespace mlir {
+class Value;
+class Location;
+} // namespace mlir
+
+namespace fir {
+class FirOpBuilder;
+}
+
+namespace fir::runtime {
+
+/// Generate a call to _FortranATrampolineInit.
+/// Returns an opaque handle (void*) for the trampoline.
+mlir::Value genTrampolineInit(fir::FirOpBuilder &builder, mlir::Location loc,
+                              mlir::Value scratch, mlir::Value calleeAddress,
+                              mlir::Value staticChainAddress);
+
+/// Generate a call to _FortranATrampolineAdjust.
+/// Returns the callable function pointer for the trampoline.
+mlir::Value genTrampolineAdjust(fir::FirOpBuilder &builder, mlir::Location loc,
+                                mlir::Value handle);
+
+/// Generate a call to _FortranATrampolineFree.
+/// Frees the trampoline slot.
+void genTrampolineFree(fir::FirOpBuilder &builder, mlir::Location loc,
+                       mlir::Value handle);
+
+} // namespace fir::runtime
+
+#endif // FORTRAN_OPTIMIZER_BUILDER_RUNTIME_TRAMPOLINE_H
diff --git a/flang/include/flang/Optimizer/CodeGen/CGPasses.td b/flang/include/flang/Optimizer/CodeGen/CGPasses.td
index df0ecf5540776..783d73c7e4b4e 100644
--- a/flang/include/flang/Optimizer/CodeGen/CGPasses.td
+++ b/flang/include/flang/Optimizer/CodeGen/CGPasses.td
@@ -91,12 +91,18 @@ def TargetRewritePass : Pass<"target-rewrite", "mlir::ModuleOp"> {
 }
 
 def BoxedProcedurePass : Pass<"boxed-procedure", "mlir::ModuleOp"> {
-  let options = [
-    Option<"useThunks", "use-thunks",
-           "bool", /*default=*/"true",
+  let options =
+      [Option<
+           "useThunks", "use-thunks", "bool", /*default=*/"true",
            "Convert procedure pointer abstractions to a single code pointer, "
-           "deploying thunks wherever required.">
-  ];
+           "deploying thunks wherever required.">,
+       Option<
+           "useRuntimeTrampoline", "use-runtime-trampoline", "bool",
+           /*default=*/"false",
+           "Use runtime trampoline pool instead of stack-based trampolines "
+           "for W^X compliance. When enabled, internal procedure pointers "
+           "use a runtime-managed pool of executable trampolines with "
+           "separate data region, avoiding the need for an executable stack.">];
 }
 
 def LowerRepackArraysPass : Pass<"lower-repack-arrays", "mlir::ModuleOp"> {
diff --git a/flang/include/flang/Optimizer/Passes/CommandLineOpts.h b/flang/include/flang/Optimizer/Passes/CommandLineOpts.h
index 1dc8e4f079d72..8bf652cd051cb 100644
--- a/flang/include/flang/Optimizer/Passes/CommandLineOpts.h
+++ b/flang/include/flang/Optimizer/Passes/CommandLineOpts.h
@@ -65,6 +65,7 @@ extern llvm::cl::opt<bool> disableDebugInfo;
 extern llvm::cl::opt<bool> disableFirToLlvmIr;
 extern llvm::cl::opt<bool> disableLlvmIrToLlvm;
 extern llvm::cl::opt<bool> disableBoxedProcedureRewrite;
+extern llvm::cl::opt<bool> enableRuntimeTrampoline;
 
 extern llvm::cl::opt<bool> disableExternalNameConversion;
 extern llvm::cl::opt<bool> enableConstantArgumentGlobalisation;
diff --git a/flang/include/flang/Optimizer/Passes/Pipelines.h b/flang/include/flang/Optimizer/Passes/Pipelines.h
index 1a7ff4ff3dfa2..21b009cc42eb7 100644
--- a/flang/include/flang/Optimizer/Passes/Pipelines.h
+++ b/flang/include/flang/Optimizer/Passes/Pipelines.h
@@ -93,7 +93,8 @@ void addTargetRewritePass(mlir::PassManager &pm);
 mlir::LLVM::DIEmissionKind
 getEmissionKind(llvm::codegenoptions::DebugInfoKind kind);
 
-void addBoxedProcedurePass(mlir::PassManager &pm);
+void addBoxedProcedurePass(mlir::PassManager &pm,
+                           bool enableRuntimeTrampoline = false);
 
 void addExternalNameConversionPass(mlir::PassManager &pm,
                                    bool appendUnderscore = true);
diff --git a/flang/include/flang/Runtime/trampoline.h b/flang/include/flang/Runtime/trampoline.h
new file mode 100644
index 0000000000000..3322df8b1b340
--- /dev/null
+++ b/flang/include/flang/Runtime/trampoline.h
@@ -0,0 +1,69 @@
+//===-- include/flang/Runtime/trampoline.h ----------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Runtime support for W^X-compliant trampoline pool management.
+//
+// This provides an alternative to stack-based trampolines for internal
+// procedures with host association. Instead of requiring the stack to be
+// both writable and executable (violating W^X security policies), this
+// implementation uses a pool of pre-assembled trampolines in a separate
+// executable (but not writable) memory region, paired with writable (but
+// not executable) data entries.
+//
+// See flang/docs/InternalProcedureTrampolines.md for design details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef FORTRAN_RUNTIME_TRAMPOLINE_H_
+#define FORTRAN_RUNTIME_TRAMPOLINE_H_
+
+#include "flang/Runtime/entry-names.h"
+
+namespace Fortran::runtime {
+extern "C" {
+
+/// Initializes a new trampoline and returns its internal handle.
+///
+/// Allocates a trampoline entry from the pool, configuring it to call
+/// \p calleeAddress with the static chain pointer \p staticChainAddress
+/// set in the appropriate register (per target ABI).
+///
+/// \p scratch is reserved for future use (e.g., fallback to stack
+/// trampolines). Pass nullptr for pool-based allocation.
+///
+/// The returned handle must be passed to TrampolineFree() when the
+/// host procedure exits.
+///
+/// Pool capacity: The pool is fixed-size (default 1024 slots, configurable
+/// via FLANG_TRAMPOLINE_POOL_SIZE env var). If all slots are in use, the
+/// runtime issues a fatal error. Dynamic slab growth may be added later.
+///
+/// Architecture support: Currently x86-64 and AArch64. On unsupported
+/// architectures, calling this function issues a fatal diagnostic.
+void *RTDECL(TrampolineInit)(
+    void *scratch, const void *calleeAddress, const void *staticChainAddress);
+
+/// Returns the callable trampoline address for the given handle.
+///
+/// \p handle is a value returned by TrampolineInit().
+/// The result is a function pointer that can be called directly; it will
+/// set up the static chain register and jump to the original callee.
+void *RTDECL(TrampolineAdjust)(void *handle);
+
+/// Frees the trampoline entry associated with the given handle.
+///
+/// Must be called at every exit from the host procedure to return the
+/// trampoline slot to the pool. After this call, any function pointer
+/// previously obtained via TrampolineAdjust() for this handle becomes
+/// invalid.
+void RTDECL(TrampolineFree)(void *handle);
+
+} // extern "C"
+} // namespace Fortran::runtime
+
+#endif // FORTRAN_RUNTIME_TRAMPOLINE_H_
diff --git a/flang/include/flang/Tools/CrossToolHelpers.h b/flang/include/flang/Tools/CrossToolHelpers.h
index 44fb252d2b366..a2fadb72ad212 100644
--- a/flang/include/flang/Tools/CrossToolHelpers.h
+++ b/flang/include/flang/Tools/CrossToolHelpers.h
@@ -88,6 +88,7 @@ struct MLIRToLLVMPassPipelineConfig : public FlangEPCallBacks {
       const Fortran::common::MathOptionsBase &mathOpts) {
     OptLevel = level;
     StackArrays = opts.StackArrays;
+    EnableRuntimeTrampoline = opts.EnableRuntimeTrampoline;
     Underscoring = opts.Underscoring;
     LoopVersioning = opts.LoopVersioning;
     DebugInfo = opts.getDebugInfo();
@@ -114,6 +115,7 @@ struct MLIRToLLVMPassPipelineConfig : public FlangEPCallBacks {
 
   llvm::OptimizationLevel OptLevel; ///< optimisation level
   bool StackArrays = false; ///< convert memory allocations to alloca.
+  bool EnableRuntimeTrampoline = false; ///< Use runtime trampoline pool (W^X).
   bool Underscoring = true; ///< add underscores to function names.
   bool LoopVersioning = false; ///< Run the version loop pass.
   bool AliasAnalysis = false; ///< Add TBAA tags to generated LLVMIR.
diff --git a/flang/lib/Frontend/CompilerInvocation.cpp b/flang/lib/Frontend/CompilerInvocation.cpp
index fc4975f9592eb..b0bfede63257a 100644
--- a/flang/lib/Frontend/CompilerInvocation.cpp
+++ b/flang/lib/Frontend/CompilerInvocation.cpp
@@ -284,6 +284,10 @@ static void parseCodeGenArgs(Fortran::frontend::CodeGenOptions &opts,
                    clang::options::OPT_fno_stack_arrays, false))
     opts.StackArrays = 1;
 
+  if (args.hasFlag(clang::options::OPT_fruntime_trampoline,
+                   clang::options::OPT_fno_runtime_trampoline, false))
+    opts.EnableRuntimeTrampoline = 1;
+
   if (args.getLastArg(clang::options::OPT_floop_interchange))
     opts.InterchangeLoops = 1;
 
diff --git a/flang/lib/Optimizer/Builder/CMakeLists.txt b/flang/lib/Optimizer/Builder/CMakeLists.txt
index d966c52b29c33..8e5b5deff47c1 100644
--- a/flang/lib/Optimizer/Builder/CMakeLists.txt
+++ b/flang/lib/Optimizer/Builder/CMakeLists.txt
@@ -34,6 +34,7 @@ add_flang_library(FIRBuilder
   Runtime/Stop.cpp
   Runtime/Support.cpp
   Runtime/TemporaryStack.cpp
+  Runtime/Trampoline.cpp
   Runtime/Transformational.cpp
   TemporaryStorage.cpp
 
diff --git a/flang/lib/Optimizer/Builder/Runtime/Trampoline.cpp b/flang/lib/Optimizer/Builder/Runtime/Trampoline.cpp
new file mode 100644
index 0000000000000..4c33ef8e0a148
--- /dev/null
+++ b/flang/lib/Optimizer/Builder/Runtime/Trampoline.cpp
@@ -0,0 +1,49 @@
+//===-- Trampoline.cpp - Runtime trampoline pool builder --------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "flang/Optimizer/Builder/Runtime/Trampoline.h"
+#include "flang/Optimizer/Builder/FIRBuilder.h"
+#include "flang/Optimizer/Builder/Runtime/RTBuilder.h"
+#include "flang/Runtime/trampoline.h"
+
+using namespace Fortran::runtime;
+using namespace fir::runtime;
+
+mlir::Value fir::runtime::genTrampolineInit(fir::FirOpBuilder &builder,
+                                            mlir::Location loc,
+                                            mlir::Value scratch,
+                                            mlir::Value calleeAddress,
+                                            mlir::Value staticChainAddress) {
+  mlir::func::FuncOp func{
+      getRuntimeFunc<mkRTKey(TrampolineInit)>(loc, builder)};
+  mlir::FunctionType fTy = func.getFunctionType();
+  llvm::SmallVector<mlir::Value> args = createArguments(
+      builder, loc, fTy, scratch, calleeAddress, staticChainAddress);
+  return fir::CallOp::create(builder, loc, func, args).getResult(0);
+}
+
+mlir::Value fir::runtime::genTrampolineAdjust(fir::FirOpBuilder &builder,
+                                              mlir::Location loc,
+                                              mlir::Value handle) {
+  mlir::func::FuncOp func{
+      getRuntimeFunc<mkRTKey(TrampolineAdjust)>(loc, builder)};
+  mlir::FunctionType fTy = func.getFunctionType();
+  llvm::SmallVector<mlir::Value> args =
+      createArguments(builder, loc, fTy, handle);
+  return fir::CallOp::create(builder, loc, func, args).getResult(0);
+}
+
+void fir::runtime::genTrampolineFree(fir::FirOpBuilder &builder,
+                                     mlir::Location loc, mlir::Value handle) {
+  mlir::func::FuncOp func{
+      getRuntimeFunc<mkRTKey(TrampolineFree)>(loc, builder)};
+  mlir::FunctionType fTy = func.getFunctionType();
+  llvm::SmallVector<mlir::Value> args =
+      createArguments(builder, loc, fTy, handle);
+  fir::CallOp::create(builder, loc, func, args);
+}
diff --git a/flang/lib/Optimizer/CodeGen/BoxedProcedure.cpp b/flang/lib/Optimizer/CodeGen/BoxedProcedure.cpp
index 61d6d2ae6329a..d1d0201a5ba24 100644
--- a/flang/lib/Optimizer/CodeGen/BoxedProcedure.cpp
+++ b/flang/lib/Optimizer/CodeGen/BoxedProcedure.cpp
@@ -10,6 +10,7 @@
 
 #include "flang/Optimizer/Builder/FIRBuilder.h"
 #include "flang/Optimizer/Builder/LowLevelIntrinsics.h"
+#include "flang/Optimizer/Builder/Runtime/Trampoline.h"
 #include "flang/Optimizer/Dialect/FIRDialect.h"
 #include "flang/Optimizer/Dialect/FIROps.h"
 #include "flang/Optimizer/Dialect/FIRType.h"
@@ -20,6 +21,7 @@
 #include "mlir/Pass/Pass.h"
 #include "mlir/Transforms/DialectConversion.h"
 #include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/SmallVector.h"
 
 namespace fir {
 #define GEN_PASS_DEF_BOXEDPROCEDUREPASS
@@ -31,12 +33,6 @@ namespace fir {
 using namespace fir;
 
 namespace {
-/// Options to the procedure pointer pass.
-struct BoxedProcedureOptions {
-  // Lower the boxproc abstraction to function pointers and thunks where
-  // required.
-  bool useThunks = true;
-};
 
 /// This type converter rewrites all `!fir.boxproc<Func>` types to `Func` types.
 class BoxprocTypeRewriter : public mlir::TypeConverter {
@@ -219,200 +215,284 @@ class BoxedProcedurePass
   inline mlir::ModuleOp getModule() { return getOperation(); }
 
   void runOnOperation() override final {
-    if (options.useThunks) {
+    if (useThunks) {
       auto *context = &getContext();
       mlir::IRRewriter rewriter(context);
       BoxprocTypeRewriter typeConverter(mlir::UnknownLoc::get(context));
-      getModule().walk([&](mlir::Operation *op) {
-        bool opIsValid = true;
-        typeConverter.setLocation(op->getLoc());
-        if (auto addr = mlir::dyn_cast<BoxAddrOp>(op)) {
-          mlir::Type ty = addr.getVal().getType();
-          mlir::Type resTy = addr.getResult().getType();
-          if (llvm::isa<mlir::FunctionType>(ty) ||
-              llvm::isa<fir::BoxProcType>(ty)) {
-            // Rewrite all `fir.box_addr` ops on values of type `!fir.boxproc`
-            // or function type to be `fir.convert` ops.
-            rewriter.setInsertionPoint(addr);
-            rewriter.replaceOpWithNewOp<ConvertOp>(
-                addr, typeConverter.convertType(addr.getType()), addr.getVal());
-            opIsValid = false;
-          } else if (typeConverter.needsConversion(resTy)) {
-            rewriter.startOpModification(op);
-            op->getResult(0).setType(typeConverter.convertType(resTy));
-            rewriter.finalizeOpModification(op);
-          }
-        } else if (auto func = mlir::dyn_cast<mlir::func::FuncOp>(op)) {
-          mlir::FunctionType ty = func.getFunctionType();
-          if (typeConverter.needsConversion(ty)) {
-            rewriter.startOpModification(func);
-            auto toTy =
-                mlir::cast<mlir::FunctionType>(typeConverter.convertType(ty));
-            if (!func.empty())
-              for (auto e : llvm::enumerate(toTy.getInputs())) {
-                unsigned i = e.index();
-                auto &block = func.front();
-                block.insertArgument(i, e.value(), func.getLoc());
-                block.getArgument(i + 1).replaceAllUsesWith(
-                    block.getArgument(i));
-                block.eraseArgument(i + 1);
-              }
-            func.setType(toTy);
-            rewriter.finalizeOpModification(func);
-          }
-        } else if (auto embox = mlir::dyn_cast<EmboxProcOp>(op)) {
-          // Rewrite all `fir.emboxproc` ops to either `fir.convert` or a thunk
-          // as required.
-          mlir::Type toTy = typeConverter.convertType(
-              mlir::cast<BoxProcType>(embox.getType()).getEleTy());
-          rewriter.setInsertionPoint(embox);
-          if (embox.getHost()) {
-            // Create the thunk.
-            auto module = embox->getParentOfType<mlir::ModuleOp>();
-            FirOpBuilder builder(rewriter, module);
-            const auto triple{fir::getTargetTriple(module)};
-            auto loc = embox.getLoc();
-            mlir::Type i8Ty = builder.getI8Type();
-            mlir::Type i8Ptr = builder.getRefType(i8Ty);
-            // For PPC32 and PPC64, the thunk is populated by a call to
-            // __trampoline_setup, which is defined in
-            // compiler-rt/lib/builtins/trampoline_setup.c and requires the
-            // thunk size greater than 32 bytes.  For AArch64, RISCV and x86_64,
-            // the thunk setup doesn't go through __trampoline_setup and fits in
-            // 32 bytes.
-            fir::SequenceType::Extent thunkSize = triple.getTrampolineSize();
-            mlir::Type buffTy = SequenceType::get({thunkSize}, i8Ty);
-            auto buffer = AllocaOp::create(builder, loc, buffTy);
-            mlir::Value closure =
-                builder.createConvert(loc, i8Ptr, embox.getHost());
-            mlir::Value tramp = builder.createConvert(loc, i8Ptr, buffer);
-            mlir::Value func =
-                builder.createConvert(loc, i8Ptr, embox.getFunc());
-            fir::CallOp::create(
-                builder, loc, factory::getLlvmInitTrampoline(builder),
-                llvm::ArrayRef<mlir::Value>{tramp, func, closure});
-            auto adjustCall = fir::CallOp::create(
-                builder, loc, factory::getLlvmAdjustTrampoline(builder),
-                llvm::ArrayRef<mlir::Value>{tramp});
-            rewriter.replaceOpWithNewOp<ConvertOp>(embox, toTy,
-                                                   adjustCall.getResult(0));
-            opIsValid = false;
-          } else {
-            // Just forward the function as a pointer.
-            rewriter.replaceOpWithNewOp<ConvertOp>(embox, toTy,
-                                                   embox.getFunc());
-            opIsValid = false;
-          }
-        } else if (auto global = mlir::dyn_cast<GlobalOp>(op)) {
-          auto ty = global.getType();
-          if (typeConverter.needsConversion(ty)) {
-            rewriter.startOpModification(global);
-            auto toTy = typeConverter.convertType(ty);
-            global.setType(toTy);
-            rewriter.finalizeOpModification(global);
-          }
-        } else if (auto mem = mlir::dyn_cast<AllocaOp>(op)) {
-          auto ty = mem.getType();
-          if (typeConverter.needsConversion(ty)) {
-            rewriter.setInsertionPoint(mem);
-            auto toTy = typeConverter.convertType(unwrapRefType(ty));
-            bool isPinned = mem.getPinned();
-            llvm::StringRef uniqName =
-                mem.getUniqName().value_or(llvm::StringRef());
-            llvm::StringRef bindcName =
-                mem.getBindcName().value_or(llvm::StringRef());
-            rewriter.replaceOpWithNewOp<AllocaOp>(
-                mem, toTy, uniqName, bindcName, isPinned, mem.getTypeparams(),
-                mem.getShape());
-            opIsValid = false;
-          }
-        } else if (auto mem = mlir::dyn_cast<AllocMemOp>(op)) {
-          auto ty = mem.getType();
-          if (typeConverter.needsConversion(ty)) {
-            rewriter.setInsertionPoint(mem);
-            auto toTy = typeConverter.convertType(unwrapRefType(ty));
-            llvm::StringRef uniqName =
-                mem.getUniqName().value_or(llvm::StringRef());
-            llvm::StringRef bindcName =
-                mem.getBindcName().value_or(llvm::StringRef());
-            rewriter.replaceOpWithNewOp<AllocMemOp>(
-                mem, toTy, uniqName, bindcName, mem.getTypeparams(),
-                mem.getShape());
-            opIsValid = false;
-          }
-        } else if (auto coor = mlir::dyn_cast<CoordinateOp>(op)) {
-          auto ty = coor.getType();
-          mlir::Type baseTy = coor.getBaseType();
-          if (typeConverter.needsConversion(ty) ||
-              typeConverter.needsConversion(baseTy)) {
-            rewriter.setInsertionPoint(coor);
-            auto toTy = typeConverter.convertType(ty);
-            auto toBaseTy = typeConverter.convertType(baseTy);
-            rewriter.replaceOpWithNewOp<CoordinateOp>(
-                coor, toTy, coor.getRef(), coor.getCoor(), toBaseTy,
-                coor.getFieldIndicesAttr());
-            opIsValid = false;
-          }
-        } else if (auto index = mlir::dyn_cast<FieldIndexOp>(op)) {
-          auto ty = index.getType();
-          mlir::Type onTy = index.getOnType();
-          if (typeConverter.needsConversion(ty) ||
-              typeConverter.needsConversion(onTy)) {
-            rewriter.setInsertionPoint(index);
-            auto toTy = typeConverter.convertType(ty);
-            auto toOnTy = typeConverter.convertType(onTy);
-            rewriter.replaceOpWithNewOp<FieldIndexOp>(
-                index, toTy, index.getFieldId(), toOnTy, index.getTypeparams());
-            opIsValid = false;
-          }
-        } else if (auto index = mlir::dyn_cast<LenParamIndexOp>(op)) {
-          auto ty = index.getType();
-          mlir::Type onTy = index.getOnType();
-          if (typeConverter.needsConversion(ty) ||
-              typeConverter.needsConversion(onTy)) {
-            rewriter.setInsertionPoint(index);
-            auto toTy = typeConverter.convertType(ty);
-            auto toOnTy = typeConverter.convertType(onTy);
-            rewriter.replaceOpWithNewOp<LenParamIndexOp>(
-                index, toTy, index.getFieldId(), toOnTy, index.getTypeparams());
-            opIsValid = false;
+
+      // When using runtime trampolines, we need to track handles per
+      // function so we can insert FreeTrampoline calls at each return.
+      // Process functions individually to manage this state.
+      if (useRuntimeTrampoline) {
+        getModule().walk([&](mlir::func::FuncOp funcOp) {
+          trampolineHandles.clear();
+          processFunction(funcOp, rewriter, typeConverter);
+          insertTrampolineFrees(funcOp, rewriter);
+        });
+        // Also process non-function ops at module level (globals, etc.)
+        processModuleLevelOps(rewriter, typeConverter);
+      } else {
+        getModule().walk([&](mlir::Operation *op) {
+          processOp(op, rewriter, typeConverter);
+        });
+      }
+    }
+  }
+
+private:
+  /// Trampoline handles collected while processing a function.
+  /// Each entry is a Value representing the opaque handle returned
+  /// by _FortranATrampolineInit, which must be freed before the
+  /// function returns.
+  llvm::SmallVector<mlir::Value> trampolineHandles;
+
+  /// Process all ops within a function.
+  void processFunction(mlir::func::FuncOp funcOp, mlir::IRRewriter &rewriter,
+                       BoxprocTypeRewriter &typeConverter) {
+    funcOp.walk(
+        [&](mlir::Operation *op) { processOp(op, rewriter, typeConverter); });
+  }
+
+  /// Process non-function ops at module level (globals, etc.)
+  void processModuleLevelOps(mlir::IRRewriter &rewriter,
+                             BoxprocTypeRewriter &typeConverter) {
+    for (auto &op : getModule().getBody()->getOperations()) {
+      if (!mlir::isa<mlir::func::FuncOp>(op))
+        processOp(&op, rewriter, typeConverter);
+    }
+  }
+
+  /// Insert _FortranATrampolineFree calls before every return in the function.
+  void insertTrampolineFrees(mlir::func::FuncOp funcOp,
+                             mlir::IRRewriter &rewriter) {
+    if (trampolineHandles.empty())
+      return;
+
+    auto module = funcOp->getParentOfType<mlir::ModuleOp>();
+    // Insert TrampolineFree calls before every func.return in this function.
+    // At this pass stage (after CFGConversion), func.return is the only
+    // terminator that exits the function. Other terminators are either
+    // intra-function branches (cf.br, cf.cond_br, fir.select*) or
+    // fir.unreachable (after STOP/ERROR STOP), which don't need cleanup
+    // since the process is terminating.
+    funcOp.walk([&](mlir::func::ReturnOp retOp) {
+      rewriter.setInsertionPoint(retOp);
+      FirOpBuilder builder(rewriter, module);
+      auto loc = retOp.getLoc();
+      for (mlir::Value handle : trampolineHandles) {
+        fir::runtime::genTrampolineFree(builder, loc, handle);
+      }
+    });
+  }
+
+  /// Process a single operation for boxproc type rewriting.
+  void processOp(mlir::Operation *op, mlir::IRRewriter &rewriter,
+                 BoxprocTypeRewriter &typeConverter) {
+    bool opIsValid = true;
+    typeConverter.setLocation(op->getLoc());
+    if (auto addr = mlir::dyn_cast<BoxAddrOp>(op)) {
+      mlir::Type ty = addr.getVal().getType();
+      mlir::Type resTy = addr.getResult().getType();
+      if (llvm::isa<mlir::FunctionType>(ty) ||
+          llvm::isa<fir::BoxProcType>(ty)) {
+        // Rewrite all `fir.box_addr` ops on values of type `!fir.boxproc`
+        // or function type to be `fir.convert` ops.
+        rewriter.setInsertionPoint(addr);
+        rewriter.replaceOpWithNewOp<ConvertOp>(
+            addr, typeConverter.convertType(addr.getType()), addr.getVal());
+        opIsValid = false;
+      } else if (typeConverter.needsConversion(resTy)) {
+        rewriter.startOpModification(op);
+        op->getResult(0).setType(typeConverter.convertType(resTy));
+        rewriter.finalizeOpModification(op);
+      }
+    } else if (auto func = mlir::dyn_cast<mlir::func::FuncOp>(op)) {
+      mlir::FunctionType ty = func.getFunctionType();
+      if (typeConverter.needsConversion(ty)) {
+        rewriter.startOpModification(func);
+        auto toTy =
+            mlir::cast<mlir::FunctionType>(typeConverter.convertType(ty));
+        if (!func.empty())
+          for (auto e : llvm::enumerate(toTy.getInputs())) {
+            unsigned i = e.index();
+            auto &block = func.front();
+            block.insertArgument(i, e.value(), func.getLoc());
+            block.getArgument(i + 1).replaceAllUsesWith(block.getArgument(i));
+            block.eraseArgument(i + 1);
           }
-        } else {
-          rewriter.startOpModification(op);
-          // Convert the operands if needed
-          for (auto i : llvm::enumerate(op->getResultTypes()))
-            if (typeConverter.needsConversion(i.value())) {
-              auto toTy = typeConverter.convertType(i.value());
-              op->getResult(i.index()).setType(toTy);
-            }
+        func.setType(toTy);
+        rewriter.finalizeOpModification(func);
+      }
+    } else if (auto embox = mlir::dyn_cast<EmboxProcOp>(op)) {
+      // Rewrite all `fir.emboxproc` ops to either `fir.convert` or a thunk
+      // as required.
+      mlir::Type toTy = typeConverter.convertType(
+          mlir::cast<BoxProcType>(embox.getType()).getEleTy());
+      rewriter.setInsertionPoint(embox);
+      if (embox.getHost()) {
+        auto module = embox->getParentOfType<mlir::ModuleOp>();
+        FirOpBuilder builder(rewriter, module);
+        auto loc = embox.getLoc();
+        mlir::Type i8Ty = builder.getI8Type();
+        mlir::Type i8Ptr = builder.getRefType(i8Ty);
 
-          // Convert the type attributes if needed
-          for (const mlir::NamedAttribute &attr : op->getAttrDictionary())
-            if (auto tyAttr = llvm::dyn_cast<mlir::TypeAttr>(attr.getValue()))
-              if (typeConverter.needsConversion(tyAttr.getValue())) {
-                auto toTy = typeConverter.convertType(tyAttr.getValue());
-                op->setAttr(attr.getName(), mlir::TypeAttr::get(toTy));
-              }
-          rewriter.finalizeOpModification(op);
+        if (useRuntimeTrampoline) {
+          // Runtime trampoline pool path (W^X compliant).
+          // Instead of allocating a writable+executable buffer on the
+          // stack, call the runtime to allocate from a pre-initialized
+          // pool with separate RX (code) and RW (data) regions.
+          mlir::Value nullPtr = builder.createNullConstant(loc, i8Ptr);
+          mlir::Value closure =
+              builder.createConvert(loc, i8Ptr, embox.getHost());
+          mlir::Value func = builder.createConvert(loc, i8Ptr, embox.getFunc());
+
+          // _FortranATrampolineInit(nullptr, func, closure) -> handle
+          mlir::Value handle = fir::runtime::genTrampolineInit(
+              builder, loc, nullPtr, func, closure);
+
+          // _FortranATrampolineAdjust(handle) -> callable address
+          mlir::Value callableAddr =
+              fir::runtime::genTrampolineAdjust(builder, loc, handle);
+
+          // Track the handle so we can free it at function exits.
+          trampolineHandles.push_back(handle);
+
+          rewriter.replaceOpWithNewOp<ConvertOp>(embox, toTy, callableAddr);
+        } else {
+          // Legacy stack-based trampoline path.
+          const auto triple{fir::getTargetTriple(module)};
+          // For PPC32 and PPC64, the thunk is populated by a call to
+          // __trampoline_setup, which is defined in
+          // compiler-rt/lib/builtins/trampoline_setup.c and requires the
+          // thunk size greater than 32 bytes.  For AArch64, RISCV and
+          // x86_64, the thunk setup doesn't go through
+          // __trampoline_setup and fits in 32 bytes.
+          fir::SequenceType::Extent thunkSize = triple.getTrampolineSize();
+          mlir::Type buffTy = SequenceType::get({thunkSize}, i8Ty);
+          auto buffer = AllocaOp::create(builder, loc, buffTy);
+          mlir::Value closure =
+              builder.createConvert(loc, i8Ptr, embox.getHost());
+          mlir::Value tramp = builder.createConvert(loc, i8Ptr, buffer);
+          mlir::Value func = builder.createConvert(loc, i8Ptr, embox.getFunc());
+          fir::CallOp::create(
+              builder, loc, factory::getLlvmInitTrampoline(builder),
+              llvm::ArrayRef<mlir::Value>{tramp, func, closure});
+          auto adjustCall = fir::CallOp::create(
+              builder, loc, factory::getLlvmAdjustTrampoline(builder),
+              llvm::ArrayRef<mlir::Value>{tramp});
+          rewriter.replaceOpWithNewOp<ConvertOp>(embox, toTy,
+                                                 adjustCall.getResult(0));
         }
-        // Ensure block arguments are updated if needed.
-        if (opIsValid && op->getNumRegions() != 0) {
-          rewriter.startOpModification(op);
-          for (mlir::Region &region : op->getRegions())
-            for (mlir::Block &block : region.getBlocks())
-              for (mlir::BlockArgument blockArg : block.getArguments())
-                if (typeConverter.needsConversion(blockArg.getType())) {
-                  mlir::Type toTy =
-                      typeConverter.convertType(blockArg.getType());
-                  blockArg.setType(toTy);
-                }
-          rewriter.finalizeOpModification(op);
+        opIsValid = false;
+      } else {
+        // Just forward the function as a pointer.
+        rewriter.replaceOpWithNewOp<ConvertOp>(embox, toTy, embox.getFunc());
+        opIsValid = false;
+      }
+    } else if (auto global = mlir::dyn_cast<GlobalOp>(op)) {
+      auto ty = global.getType();
+      if (typeConverter.needsConversion(ty)) {
+        rewriter.startOpModification(global);
+        auto toTy = typeConverter.convertType(ty);
+        global.setType(toTy);
+        rewriter.finalizeOpModification(global);
+      }
+    } else if (auto mem = mlir::dyn_cast<AllocaOp>(op)) {
+      auto ty = mem.getType();
+      if (typeConverter.needsConversion(ty)) {
+        rewriter.setInsertionPoint(mem);
+        auto toTy = typeConverter.convertType(unwrapRefType(ty));
+        bool isPinned = mem.getPinned();
+        llvm::StringRef uniqName =
+            mem.getUniqName().value_or(llvm::StringRef());
+        llvm::StringRef bindcName =
+            mem.getBindcName().value_or(llvm::StringRef());
+        rewriter.replaceOpWithNewOp<AllocaOp>(mem, toTy, uniqName, bindcName,
+                                              isPinned, mem.getTypeparams(),
+                                              mem.getShape());
+        opIsValid = false;
+      }
+    } else if (auto mem = mlir::dyn_cast<AllocMemOp>(op)) {
+      auto ty = mem.getType();
+      if (typeConverter.needsConversion(ty)) {
+        rewriter.setInsertionPoint(mem);
+        auto toTy = typeConverter.convertType(unwrapRefType(ty));
+        llvm::StringRef uniqName =
+            mem.getUniqName().value_or(llvm::StringRef());
+        llvm::StringRef bindcName =
+            mem.getBindcName().value_or(llvm::StringRef());
+        rewriter.replaceOpWithNewOp<AllocMemOp>(mem, toTy, uniqName, bindcName,
+                                                mem.getTypeparams(),
+                                                mem.getShape());
+        opIsValid = false;
+      }
+    } else if (auto coor = mlir::dyn_cast<CoordinateOp>(op)) {
+      auto ty = coor.getType();
+      mlir::Type baseTy = coor.getBaseType();
+      if (typeConverter.needsConversion(ty) ||
+          typeConverter.needsConversion(baseTy)) {
+        rewriter.setInsertionPoint(coor);
+        auto toTy = typeConverter.convertType(ty);
+        auto toBaseTy = typeConverter.convertType(baseTy);
+        rewriter.replaceOpWithNewOp<CoordinateOp>(coor, toTy, coor.getRef(),
+                                                  coor.getCoor(), toBaseTy,
+                                                  coor.getFieldIndicesAttr());
+        opIsValid = false;
+      }
+    } else if (auto index = mlir::dyn_cast<FieldIndexOp>(op)) {
+      auto ty = index.getType();
+      mlir::Type onTy = index.getOnType();
+      if (typeConverter.needsConversion(ty) ||
+          typeConverter.needsConversion(onTy)) {
+        rewriter.setInsertionPoint(index);
+        auto toTy = typeConverter.convertType(ty);
+        auto toOnTy = typeConverter.convertType(onTy);
+        rewriter.replaceOpWithNewOp<FieldIndexOp>(
+            index, toTy, index.getFieldId(), toOnTy, index.getTypeparams());
+        opIsValid = false;
+      }
+    } else if (auto index = mlir::dyn_cast<LenParamIndexOp>(op)) {
+      auto ty = index.getType();
+      mlir::Type onTy = index.getOnType();
+      if (typeConverter.needsConversion(ty) ||
+          typeConverter.needsConversion(onTy)) {
+        rewriter.setInsertionPoint(index);
+        auto toTy = typeConverter.convertType(ty);
+        auto toOnTy = typeConverter.convertType(onTy);
+        rewriter.replaceOpWithNewOp<LenParamIndexOp>(
+            index, toTy, index.getFieldId(), toOnTy, index.getTypeparams());
+        opIsValid = false;
+      }
+    } else {
+      rewriter.startOpModification(op);
+      // Convert the operands if needed
+      for (auto i : llvm::enumerate(op->getResultTypes()))
+        if (typeConverter.needsConversion(i.value())) {
+          auto toTy = typeConverter.convertType(i.value());
+          op->getResult(i.index()).setType(toTy);
         }
-      });
+
+      // Convert the type attributes if needed
+      for (const mlir::NamedAttribute &attr : op->getAttrDictionary())
+        if (auto tyAttr = llvm::dyn_cast<mlir::TypeAttr>(attr.getValue()))
+          if (typeConverter.needsConversion(tyAttr.getValue())) {
+            auto toTy = typeConverter.convertType(tyAttr.getValue());
+            op->setAttr(attr.getName(), mlir::TypeAttr::get(toTy));
+          }
+      rewriter.finalizeOpModification(op);
+    }
+    // Ensure block arguments are updated if needed.
+    if (opIsValid && op->getNumRegions() != 0) {
+      rewriter.startOpModification(op);
+      for (mlir::Region &region : op->getRegions())
+        for (mlir::Block &block : region.getBlocks())
+          for (mlir::BlockArgument blockArg : block.getArguments())
+            if (typeConverter.needsConversion(blockArg.getType())) {
+              mlir::Type toTy = typeConverter.convertType(blockArg.getType());
+              blockArg.setType(toTy);
+            }
+      rewriter.finalizeOpModification(op);
     }
   }
-
-private:
-  BoxedProcedureOptions options;
 };
 } // namespace
diff --git a/flang/lib/Optimizer/Passes/CommandLineOpts.cpp b/flang/lib/Optimizer/Passes/CommandLineOpts.cpp
index 75e818d330b51..794dffebc38bb 100644
--- a/flang/lib/Optimizer/Passes/CommandLineOpts.cpp
+++ b/flang/lib/Optimizer/Passes/CommandLineOpts.cpp
@@ -71,6 +71,8 @@ DisableOption(FirToLlvmIr, "fir-to-llvmir", "FIR to LLVM-IR 
dialect");
 DisableOption(LlvmIrToLlvm, "llvm", "conversion to LLVM");
 DisableOption(BoxedProcedureRewrite, "boxed-procedure-rewrite",
               "rewrite boxed procedures");
+EnableOption(RuntimeTrampoline, "runtime-trampoline",
+             "W^X compliant runtime trampoline pool");
 
 DisableOption(ExternalNameConversion, "external-name-interop",
               "convert names with external convention");
diff --git a/flang/lib/Optimizer/Passes/Pipelines.cpp 
b/flang/lib/Optimizer/Passes/Pipelines.cpp
index 5927fff960270..1187e8a27baa2 100644
--- a/flang/lib/Optimizer/Passes/Pipelines.cpp
+++ b/flang/lib/Optimizer/Passes/Pipelines.cpp
@@ -140,9 +140,16 @@ void addLLVMDialectToLLVMPass(mlir::PassManager &pm,
   });
 }
 
-void addBoxedProcedurePass(mlir::PassManager &pm) {
-  addPassConditionally(pm, disableBoxedProcedureRewrite,
-                       [&]() { return fir::createBoxedProcedurePass(); });
+void addBoxedProcedurePass(mlir::PassManager &pm,
+                           bool enableRuntimeTrampolineFromConfig) {
+  addPassConditionally(pm, disableBoxedProcedureRewrite, [&]() {
+    fir::BoxedProcedurePassOptions opts;
+    // Support both the frontend -fruntime-trampoline flag (via config)
+    // and the cl::opt --runtime-trampoline (for fir-opt/tco tools).
+    opts.useRuntimeTrampoline =
+        enableRuntimeTrampolineFromConfig || enableRuntimeTrampoline;
+    return fir::createBoxedProcedurePass(opts);
+  });
 }
 
 void addExternalNameConversionPass(mlir::PassManager &pm,
@@ -371,7 +378,7 @@ void createDefaultFIRCodeGenPassPipeline(mlir::PassManager 
&pm,
                                          MLIRToLLVMPassPipelineConfig config,
                                          llvm::StringRef inputFilename) {
   pm.addPass(fir::createMIFOpConversion());
-  fir::addBoxedProcedurePass(pm);
+  fir::addBoxedProcedurePass(pm, config.EnableRuntimeTrampoline);
   if (config.OptLevel.isOptimizingForSpeed() && config.AliasAnalysis &&
       !disableFirAliasTags && !useOldAliasTags)
     pm.addPass(fir::createAddAliasTags());
diff --git a/flang/test/Driver/fruntime-trampoline.f90 
b/flang/test/Driver/fruntime-trampoline.f90
new file mode 100644
index 0000000000000..4feff99bb3f98
--- /dev/null
+++ b/flang/test/Driver/fruntime-trampoline.f90
@@ -0,0 +1,15 @@
+! Test that -fruntime-trampoline is properly forwarded from the driver to
+! the frontend, and that -fno-runtime-trampoline (the default) omits it.
+
+! REQUIRES: x86-registered-target
+
+! RUN: %flang -### -fruntime-trampoline %s -o %t 2>&1 | FileCheck %s 
--check-prefix=CHECK-ON
+! RUN: %flang -### -fno-runtime-trampoline %s -o %t 2>&1 | FileCheck %s 
--check-prefix=CHECK-OFF
+! RUN: %flang -### %s -o %t 2>&1 | FileCheck %s --check-prefix=CHECK-DEFAULT
+
+! CHECK-ON: "-fruntime-trampoline"
+! CHECK-OFF-NOT: "-fruntime-trampoline"
+! CHECK-DEFAULT-NOT: "-fruntime-trampoline"
+
+program dummy
+end program
diff --git a/flang/test/Fir/boxproc-runtime-trampoline.fir 
b/flang/test/Fir/boxproc-runtime-trampoline.fir
new file mode 100644
index 0000000000000..6dd061dce420e
--- /dev/null
+++ b/flang/test/Fir/boxproc-runtime-trampoline.fir
@@ -0,0 +1,67 @@
+// RUN: fir-opt --boxed-procedure="use-runtime-trampoline=true" %s | FileCheck 
%s
+
+// Test that the --boxed-procedure pass with use-runtime-trampoline=true
+// generates calls to _FortranATrampolineInit, _FortranATrampolineAdjust,
+// and _FortranATrampolineFree instead of llvm.init.trampoline and
+// llvm.adjust.trampoline intrinsics.
+
+// CHECK-LABEL: func.func @_QPtest_proc_dummy()
+// CHECK:         fir.zero_bits !fir.ref<i8>
+// CHECK:         fir.convert {{.*}} : {{.*}} -> !fir.ref<i8>
+// CHECK:         fir.convert {{.*}} : {{.*}} -> !fir.ref<i8>
+// CHECK:         fir.convert {{.*}} : {{.*}} -> !fir.llvm_ptr<i8>
+// CHECK:         fir.convert {{.*}} : {{.*}} -> !fir.llvm_ptr<i8>
+// CHECK:         fir.convert {{.*}} : {{.*}} -> !fir.llvm_ptr<i8>
+// CHECK:         %[[HANDLE:.*]] = fir.call @_FortranATrampolineInit({{.*}}) : 
(!fir.llvm_ptr<i8>, !fir.llvm_ptr<i8>, !fir.llvm_ptr<i8>) -> !fir.llvm_ptr<i8>
+// CHECK:         %[[ADDR:.*]] = fir.call 
@_FortranATrampolineAdjust(%[[HANDLE]]) : (!fir.llvm_ptr<i8>) -> 
!fir.llvm_ptr<i8>
+// CHECK:         %[[FPTR:.*]] = fir.convert %[[ADDR]]
+// CHECK:         fir.call @_QPtest_proc_dummy_other(%[[FPTR]])
+// CHECK:         fir.call @_FortranATrampolineFree(%[[HANDLE]])
+// CHECK:         return
+
+func.func @_QPtest_proc_dummy() {
+  %c0_i32 = arith.constant 0 : i32
+  %c1_i32 = arith.constant 1 : i32
+  %c-1_i32 = arith.constant -1 : i32
+  %c5_i32 = arith.constant 5 : i32
+  %0 = fir.alloca i32 {bindc_name = "i", uniq_name = "_QFtest_proc_dummyEi"}
+  %1 = fir.alloca tuple<!fir.ref<i32>>
+  %2 = fir.coordinate_of %1, %c0_i32 : (!fir.ref<tuple<!fir.ref<i32>>>, i32) 
-> !fir.llvm_ptr<!fir.ref<i32>>
+  fir.store %0 to %2 : !fir.llvm_ptr<!fir.ref<i32>>
+  fir.store %c1_i32 to %0 : !fir.ref<i32>
+  %3 = fir.address_of(@_QFtest_proc_dummyPtest_proc_dummy_a) : (!fir.ref<i32>, 
!fir.ref<tuple<!fir.ref<i32>>>) -> ()
+  %4 = fir.emboxproc %3, %1 : ((!fir.ref<i32>, !fir.ref<tuple<!fir.ref<i32>>>) 
-> (), !fir.ref<tuple<!fir.ref<i32>>>) -> !fir.boxproc<() -> ()>
+  fir.call @_QPtest_proc_dummy_other(%4) : (!fir.boxproc<() -> ()>) -> ()
+  %5 = fir.address_of(@_QQclX2E2F682E66393000) : !fir.ref<!fir.char<1,8>>
+  %6 = fir.convert %5 : (!fir.ref<!fir.char<1,8>>) -> !fir.ref<i8>
+  %7 = fir.call @_FortranAioBeginExternalListOutput(%c-1_i32, %6, %c5_i32) : 
(i32, !fir.ref<i8>, i32) -> !fir.ref<i8>
+  %8 = fir.load %0 : !fir.ref<i32>
+  %9 = fir.call @_FortranAioOutputInteger32(%7, %8) : (!fir.ref<i8>, i32) -> i1
+  %10 = fir.call @_FortranAioEndIoStatement(%7) : (!fir.ref<i8>) -> i32
+  return
+}
+func.func @_QFtest_proc_dummyPtest_proc_dummy_a(%arg0: !fir.ref<i32> 
{fir.bindc_name = "j"}, %arg1: !fir.ref<tuple<!fir.ref<i32>>> {fir.host_assoc}) 
{
+  %c0_i32 = arith.constant 0 : i32
+  %0 = fir.coordinate_of %arg1, %c0_i32 : (!fir.ref<tuple<!fir.ref<i32>>>, 
i32) -> !fir.llvm_ptr<!fir.ref<i32>>
+  %1 = fir.load %0 : !fir.llvm_ptr<!fir.ref<i32>>
+  %2 = fir.load %1 : !fir.ref<i32>
+  %3 = fir.load %arg0 : !fir.ref<i32>
+  %4 = arith.addi %2, %3 : i32
+  fir.store %4 to %1 : !fir.ref<i32>
+  return
+}
+func.func @_QPtest_proc_dummy_other(%arg0: !fir.boxproc<() -> ()>) {
+  %c4_i32 = arith.constant 4 : i32
+  %0 = fir.alloca i32 {adapt.valuebyref}
+  fir.store %c4_i32 to %0 : !fir.ref<i32>
+  %1 = fir.box_addr %arg0 : (!fir.boxproc<() -> ()>) -> ((!fir.ref<i32>) -> ())
+  fir.call %1(%0) : (!fir.ref<i32>) -> ()
+  return
+}
+fir.global linkonce @_QQclX2E2F682E66393000 constant : !fir.char<1,8> {
+  %0 = fir.string_lit "./h.f90\00"(8) : !fir.char<1,8>
+  fir.has_value %0 : !fir.char<1,8>
+}
+func.func private @_FortranAioOutputInteger32(!fir.ref<i8>, i32) -> i1 
attributes {fir.io, fir.runtime}
+func.func private @_FortranAioBeginExternalListOutput(i32, !fir.ref<i8>, i32) 
-> !fir.ref<i8> attributes {fir.io, fir.runtime}
+func.func private @_FortranAioEndIoStatement(!fir.ref<i8>) -> i32 attributes 
{fir.io, fir.runtime}
diff --git a/flang/test/Lower/runtime-trampoline.f90 
b/flang/test/Lower/runtime-trampoline.f90
new file mode 100644
index 0000000000000..a197f5b0b05bf
--- /dev/null
+++ b/flang/test/Lower/runtime-trampoline.f90
@@ -0,0 +1,41 @@
+! RUN: %flang -fruntime-trampoline -emit-llvm -S -o - %s | FileCheck %s
+!
+! Test that -fruntime-trampoline generates calls to the runtime
+! trampoline pool instead of stack-based trampolines.
+
+! CHECK: call {{.*}}@_FortranATrampolineInit
+! CHECK: call {{.*}}@_FortranATrampolineAdjust
+! CHECK: call {{.*}}@_FortranATrampolineFree
+
+module other
+  abstract interface
+     function callback()
+       integer :: callback
+     end function callback
+  end interface
+  contains
+  subroutine foo(fptr)
+    procedure(callback), pointer :: fptr
+    print *, fptr()
+  end subroutine foo
+end module other
+
+subroutine host(local)
+  use other
+  integer :: local
+  procedure(callback), pointer :: fptr
+  fptr => callee
+  call foo(fptr)
+  return
+
+  contains
+
+  function callee()
+    integer :: callee
+    callee = local
+  end function callee
+end subroutine host
+
+program main
+  call host(10)
+end program main

_______________________________________________
cfe-commits mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits

Reply via email to