================
@@ -0,0 +1,432 @@
+//===-- lib/runtime/trampoline.cpp -------------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// W^X-compliant trampoline pool implementation.
+//
+// This file implements a runtime trampoline pool that maintains separate
+// memory regions for executable code (RX) and writable data (RW).
+//
+// On Linux the code region transitions RW → RX (never simultaneously W+X).
+// On macOS Apple Silicon the code region uses MAP_JIT with per-thread W^X
+// toggling via pthread_jit_write_protect_np, so the mapping permissions
+// include both W and X but hardware enforces that only one is active at
+// a time on any given thread.
+//
+// Architecture:
+// - Code region (RX): Contains pre-assembled trampoline stubs that load
+// callee address and static chain from a paired TDATA entry, then jump
+// to the callee with the static chain in the appropriate register.
+// - Data region (RW): Contains TrampolineData entries with {callee_address,
+// static_chain_address} pairs, one per trampoline slot.
+// - Free list: Tracks available trampoline slots for O(1) alloc/free.
+//
+// Thread safety: Uses Fortran::runtime::Lock (pthreads on POSIX,
+// CRITICAL_SECTION on Windows) — not std::mutex — to avoid C++ runtime
+// library dependence. A single global lock serializes pool operations.
+// This is a deliberate V1 design choice to keep the initial W^X
+// architectural change minimal. Per-thread lock-free pools are deferred
+// to a future optimization patch.
+//
+// AddressSanitizer note: The trampoline code region is allocated via
+// mmap (not malloc/new), so ASan does not track it. The data region
+// and handles are allocated via malloc (through AllocateMemoryOrCrash),
+// which ASan intercepts normally. No special annotations are needed.
+//
+// See flang/docs/InternalProcedureTrampolines.md for design details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "flang/Runtime/trampoline.h"
+#include "flang-rt/runtime/lock.h"
+#include "flang-rt/runtime/memory.h"
+#include "flang-rt/runtime/terminator.h"
+#include "flang-rt/runtime/trampoline.h"
+#include "flang/Runtime/freestanding-tools.h"
+
+#include <atomic>
+#include <cassert>
+#include <cstdint>
+#include <cstdlib>
+#include <cstring>
+
+// Platform-specific headers for memory mapping.
+#if defined(_WIN32)
+#include <windows.h>
+#else
+#include <sys/mman.h>
+#include <unistd.h>
+#endif
+
+// macOS Apple Silicon requires MAP_JIT and pthread_jit_write_protect_np
+// to create executable memory under the hardened runtime.
+#if defined(__APPLE__) && defined(__aarch64__)
+#include <libkern/OSCacheControl.h>
+#include <pthread.h>
+#endif
+
+// Architecture support check. Stub generators exist only for x86-64 and
+// AArch64. On other architectures the file compiles but the runtime API
+// functions crash with a diagnostic if actually called, so that building
+// flang-rt on e.g. RISC-V or PPC64 never fails.
+#if defined(__x86_64__) || defined(_M_X64) || defined(__aarch64__) || \
+ defined(_M_ARM64)
+#define TRAMPOLINE_ARCH_SUPPORTED 1
+#else
+#define TRAMPOLINE_ARCH_SUPPORTED 0
+#endif
+
+namespace Fortran::runtime::trampoline {
+
+/// A handle returned to the caller. Contains enough info to find
+/// both the trampoline stub and its data entry.
+struct TrampolineHandle {
+ void *codePtr{nullptr}; // Pointer to the trampoline stub in the RX region.
+ TrampolineData *dataPtr{
+ nullptr}; // Pointer to the data entry in the RW region.
+ std::size_t slotIndex{0}; // Index in the pool for free-list management.
+};
+
+// Namespace-scope globals following Flang runtime conventions:
+// - Lock is trivially constructible (pthread_mutex_t / CRITICAL_SECTION)
+// - Pool pointer uses std::atomic for safe double-checked locking
+// NOTE(review): no teardown of the pool is visible in this chunk — the
+// instance appears to live for the lifetime of the process (intentional
+// leak); confirm there is no ordering issue with other runtime globals.
+class TrampolinePool; // Forward declaration for pointer below.
+static Lock poolLock;
+static std::atomic<TrampolinePool *> poolInstance{nullptr};
+
+/// The global trampoline pool.
+class TrampolinePool {
+public:
+ TrampolinePool() = default;
+
+ static TrampolinePool &instance() {
+ TrampolinePool *p = poolInstance.load(std::memory_order_acquire);
+ if (p) {
+ return *p;
+ }
+ CriticalSection critical{poolLock};
+ p = poolInstance.load(std::memory_order_relaxed);
+ if (p) {
+ return *p;
+ }
+ // Allocate pool using SizedNew (malloc + placement new).
+ Terminator terminator{__FILE__, __LINE__};
+ auto owning = SizedNew<TrampolinePool>{terminator}(sizeof(TrampolinePool));
+ p = owning.release();
+ poolInstance.store(p, std::memory_order_release);
+ return *p;
+ }
+
+ /// Allocate a trampoline slot and initialize it.
+ TrampolineHandle *allocate(
+ const void *calleeAddress, const void *staticChainAddress) {
+ CriticalSection critical{lock_};
+ ensureInitialized();
+
+ if (freeHead_ == kInvalidIndex) {
+ // Pool exhausted — fixed size by design for V1.
+ // The pool capacity is controlled by FLANG_TRAMPOLINE_POOL_SIZE
+ // (default 1024). Dynamic slab growth can be added in a follow-up
+ // patch if real workloads demonstrate a need for it.
+ Terminator terminator{__FILE__, __LINE__};
+ terminator.Crash("Trampoline pool exhausted (max %zu slots). "
+ "Set FLANG_TRAMPOLINE_POOL_SIZE to increase.",
+ poolSize_);
+ }
+
+ std::size_t index = freeHead_;
+ freeHead_ = freeList_[index];
+
+ // Initialize the data entry.
+ dataRegion_[index].calleeAddress = calleeAddress;
+ dataRegion_[index].staticChainAddress = staticChainAddress;
+
+ // Create handle using SizedNew (malloc + placement new).
+ Terminator terminator{__FILE__, __LINE__};
+ auto owning = New<TrampolineHandle>{terminator}();
+ TrampolineHandle *handle = owning.release();
+ handle->codePtr =
+ static_cast<char *>(codeRegion_) + index * kTrampolineStubSize;
+ handle->dataPtr = &dataRegion_[index];
+ handle->slotIndex = index;
+
+ return handle;
+ }
+
+ /// Get the callable address of a trampoline.
+ void *getCallableAddress(TrampolineHandle *handle) { return handle->codePtr;
}
+
+ /// Free a trampoline slot.
+ void free(TrampolineHandle *handle) {
+ CriticalSection critical{lock_};
+
+ std::size_t index = handle->slotIndex;
+
+ // Poison the data entry so that any dangling call through a freed
+ // trampoline traps immediately. Setting to NULL means the stub will
+ // jump to address 0, which is unmapped on all supported platforms
+ // and produces SIGSEGV/SIGBUS immediately.
+ dataRegion_[index].calleeAddress = nullptr;
+ dataRegion_[index].staticChainAddress = nullptr;
+
+ // Return slot to free list.
+ freeList_[index] = freeHead_;
+ freeHead_ = index;
+
+ FreeMemory(handle);
+ }
+
+private:
+ static constexpr std::size_t kInvalidIndex = ~std::size_t{0};
+
+ void ensureInitialized() {
+ if (initialized_) {
+ return;
+ }
+ initialized_ = true;
+
+ // Check environment variable for pool size override.
+ // Fixed-size pool by design (V1): avoids complexity of dynamic growth
+ // and re-protection of code pages. The default (1024 slots) is
+ // sufficient for typical Fortran programs. Users can override via:
+ // export FLANG_TRAMPOLINE_POOL_SIZE=4096
+ poolSize_ = kDefaultPoolSize;
+ if (const char *envSize = std::getenv("FLANG_TRAMPOLINE_POOL_SIZE")) {
+ long val = std::strtol(envSize, nullptr, 10);
+ if (val > 0)
+ poolSize_ = static_cast<std::size_t>(val);
+ }
+
+ // Allocate the data region (RW).
+ Terminator terminator{__FILE__, __LINE__};
+ dataRegion_ = static_cast<TrampolineData *>(
+ AllocateMemoryOrCrash(terminator, poolSize_ * sizeof(TrampolineData)));
+ runtime::memset(dataRegion_, 0, poolSize_ * sizeof(TrampolineData));
+
+ // Allocate the code region (initially RW for writing stubs, then RX).
+ std::size_t codeSize = poolSize_ * kTrampolineStubSize;
+#if defined(_WIN32)
+ codeRegion_ = VirtualAlloc(
+ nullptr, codeSize, MEM_COMMIT | MEM_RESERVE, PAGE_READWRITE);
+#elif defined(__APPLE__) && defined(__aarch64__)
+ // macOS Apple Silicon: MAP_JIT is required for pages that will become
+ // executable. Use pthread_jit_write_protect_np to toggle W↔X.
+ codeRegion_ = mmap(nullptr, codeSize, PROT_READ | PROT_WRITE | PROT_EXEC,
+ MAP_PRIVATE | MAP_ANONYMOUS | MAP_JIT, -1, 0);
+ if (codeRegion_ == MAP_FAILED)
+ codeRegion_ = nullptr;
+ if (codeRegion_) {
+ // Enable writing on this thread (MAP_JIT defaults to execute).
+ pthread_jit_write_protect_np(0); // 0 = writable
+ }
+#else
+ codeRegion_ = mmap(nullptr, codeSize, PROT_READ | PROT_WRITE,
+ MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+ if (codeRegion_ == MAP_FAILED)
+ codeRegion_ = nullptr;
+#endif
+ if (!codeRegion_) {
+ terminator.Crash("Failed to allocate trampoline code region");
+ }
+
+ // Generate trampoline stubs.
+ generateStubs();
+
+ // Flush instruction cache. Required on architectures with non-coherent
+ // I-cache/D-cache (AArch64, PPC, etc.). On x86-64 this is a no-op
+ // but harmless. Without this, AArch64 may execute stale instructions.
+#if defined(__APPLE__) && defined(__aarch64__)
+ // On macOS, use sys_icache_invalidate (from libkern/OSCacheControl.h).
+ sys_icache_invalidate(codeRegion_, codeSize);
+#elif defined(_WIN32)
+ FlushInstructionCache(GetCurrentProcess(), codeRegion_, codeSize);
+#else
+ __builtin___clear_cache(static_cast<char *>(codeRegion_),
+ static_cast<char *>(codeRegion_) + codeSize);
+#endif
+
+ // Make code region executable and non-writable (W^X).
+#if defined(_WIN32)
+ DWORD oldProtect;
+ VirtualProtect(codeRegion_, codeSize, PAGE_EXECUTE_READ, &oldProtect);
+#elif defined(__APPLE__) && defined(__aarch64__)
+ // Switch back to execute-only (MAP_JIT manages per-thread W^X).
+ pthread_jit_write_protect_np(1); // 1 = executable
+#else
+ mprotect(codeRegion_, codeSize, PROT_READ | PROT_EXEC);
+#endif
+
+ // Initialize free list.
+ freeList_ = static_cast<std::size_t *>(
+ AllocateMemoryOrCrash(terminator, poolSize_ * sizeof(std::size_t)));
+
+ for (std::size_t i = 0; i < poolSize_ - 1; ++i)
+ freeList_[i] = i + 1;
----------------
Saieiei wrote:
Done.
Converted all local variable initializations to brace init (`{}`) across the
runtime (`trampoline.h`, `trampoline.cpp`), the builder (`BoxedProcedure.cpp`)
and `CrossToolHelpers.h`.
This also exposed a `size_t` → `unsigned` narrowing conversion, which is now fixed. Latest push:
[c71773ae](https://github.com/llvm/llvm-project/compare/95239b46c85d9e8c1deb7fd70189b119e5159e73..c71773aeaf919087e69b6a91b93204f19a4a22ed).
https://github.com/llvm/llvm-project/pull/183108
_______________________________________________
cfe-commits mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits