(tvm-ffi) branch main updated: [BUILD] TVM_FFI_COLD_CODE / TVM_FFI_PREDICT_FALSE macros and cold-marking of error paths (#589)

tqchen Wed, 13 May 2026 19:21:46 -0700

This is an automated email from the ASF dual-hosted git repository.

tqchen pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tvm-ffi.git



The following commit(s) were added to refs/heads/main by this push:
     new d73a267  [BUILD] TVM_FFI_COLD_CODE / TVM_FFI_PREDICT_FALSE macros and cold-marking of error paths (#589)
d73a267 is described below

commit d73a26783488430986fd855ae67ff7a9fb016413
Author: Tianqi Chen <[email protected]>
AuthorDate: Wed May 13 22:21:33 2026 -0400

    [BUILD] TVM_FFI_COLD_CODE / TVM_FFI_PREDICT_FALSE macros and cold-marking of error paths (#589)
    
    ## Summary
    
    Adds three header-only macros (`TVM_FFI_COLD_CODE`,
    `TVM_FFI_PREDICT_FALSE`, `TVM_FFI_PREDICT_TRUE`) in
    `tvm/ffi/base_details.h` and applies them to a small audited set of
    error-only helpers. No CMake changes.
    
    ## What
    
    `TVM_FFI_COLD_CODE`: `[[gnu::cold]]` on GCC/Clang, no-op on MSVC.
    `TVM_FFI_PREDICT_FALSE` / `TVM_FFI_PREDICT_TRUE`: `__builtin_expect` on
    GCC/Clang, no-op on MSVC.
    
    Cold-marked (error / setup / teardown only):
    
    - `details::ErrorBuilder` ctors and the `[[noreturn]]` destructor
    - `TVMFFISegFaultHandler`
    - `TVMFFIInstallSignalHandler`
    - `ForwardPyErrorToFFI`
    
    C ABI exports are deliberately left unmarked (not cold) because they
    form the cross-DSO surface: the `TVMFFIError*` family,
    `TVMFFIBacktrace`, and the `SafeCallContext` setters all remain
    ordinary entry points. Deleters are not cold-marked either: they run
    on every callback destruction during normal program operation, not on
    an error path.
    
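    `TVM_FFI_PREDICT_FALSE` is applied to `TVM_FFI_CHECK_SAFE_CALL`,
    `TVM_FFI_CHECK`, ~17 error-check branches in the Python→FFI
    dispatchers in `tvm_ffi_python_helpers.h`, and the already-registered
    checks in `GlobalFunctionTable::Update` (`src/ffi/function.cc`).
    `TVM_FFI_PREDICT_TRUE` is applied to the cached-setter lookup hit in
    `tvm_ffi_python_helpers.h`.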
    
    ## Mechanism
    
    GCC and Clang emit cold-marked functions into per-TU `.text.unlikely`.
    The default GNU linker script's `*(.text.unlikely .text.*_unlikely
    .text.unlikely.*)` rule gathers them into a contiguous slot inside
    `.text`. No `-ffunction-sections` flag required.
    
    Detailed measurements (binary-size matrix, isolation study, perf table,
    cold-cluster bounds) posted as a follow-up comment.
---
 include/tvm/ffi/base_details.h                 | 38 +++++++++++++++
 include/tvm/ffi/error.h                        |  7 ++-
 include/tvm/ffi/function.h                     |  2 +-
 python/tvm_ffi/cython/tvm_ffi_python_helpers.h | 65 ++++++++++++++++++--------
 src/ffi/backtrace.cc                           |  2 +
 src/ffi/error.cc                               |  1 +
 src/ffi/function.cc                            |  4 +-
 7 files changed, 94 insertions(+), 25 deletions(-)

diff --git a/include/tvm/ffi/base_details.h b/include/tvm/ffi/base_details.h
index acbd652..c2ac2b8 100644
--- a/include/tvm/ffi/base_details.h
+++ b/include/tvm/ffi/base_details.h
@@ -76,6 +76,44 @@
 #define TVM_FFI_UNREACHABLE() __builtin_unreachable()
 #endif
 
+/*!
+ * \brief Mark a function as cold so the toolchain places it in a
+ *        separate cold region of `.text`. Apply to functions that only
+ *        run on error / setup / teardown paths.
+ *
+ * On GCC and Clang, expands to `[[gnu::cold]]`, which emits the
+ * function into a per-TU `.text.unlikely` section. The default GNU
+ * linker script gathers `.text.unlikely.*` into a contiguous slot
+ * inside `.text`, so cold-marked functions cluster away from hot
+ * code without any additional CMake flags. On MSVC the attribute
+ * does not exist and the macro is a no-op.
+ */
+#if defined(__GNUC__) || defined(__clang__)
+#define TVM_FFI_COLD_CODE [[gnu::cold]]
+#else
+#define TVM_FFI_COLD_CODE
+#endif
+
+/*!
+ * \brief Branch-prediction / layout hint that the condition is unlikely
+ *        to be true. Use on error-checking branches to keep the hot
+ *        fall-through contiguous and push the error-handling block to
+ *        the function tail.
+ *
+ *   if (TVM_FFI_PREDICT_FALSE(rc != 0)) { ...error... }
+ *
+ * On GCC/Clang, expands to `__builtin_expect((cond), 0)`. On MSVC,
+ * expands to `(cond)` (no equivalent builtin; modern MSVC does its own
+ * profile-driven block reordering).
+ */
+#if defined(__GNUC__) || defined(__clang__)
+#define TVM_FFI_PREDICT_FALSE(cond) (__builtin_expect(static_cast<bool>(cond), 
0))
+#define TVM_FFI_PREDICT_TRUE(cond) (__builtin_expect(static_cast<bool>(cond), 
1))
+#else
+#define TVM_FFI_PREDICT_FALSE(cond) (cond)
+#define TVM_FFI_PREDICT_TRUE(cond) (cond)
+#endif
+
 #define TVM_FFI_STR_CONCAT_(__x, __y) __x##__y
 #define TVM_FFI_STR_CONCAT(__x, __y) TVM_FFI_STR_CONCAT_(__x, __y)
 
diff --git a/include/tvm/ffi/error.h b/include/tvm/ffi/error.h
index 1b6cb47..4931703 100644
--- a/include/tvm/ffi/error.h
+++ b/include/tvm/ffi/error.h
@@ -338,11 +338,13 @@ TVM_FFI_INLINE void SetSafeCallRaised(const Error& error) 
{
 
 class ErrorBuilder {
  public:
+  TVM_FFI_COLD_CODE
   explicit ErrorBuilder(std::string kind, std::string backtrace, bool 
log_before_throw)
       : kind_(std::move(kind)),
         backtrace_(std::move(backtrace)),
         log_before_throw_(log_before_throw) {}
 
+  TVM_FFI_COLD_CODE
   explicit ErrorBuilder(std::string kind, const TVMFFIByteArray* backtrace, 
bool log_before_throw)
       : ErrorBuilder(std::move(kind), std::string(backtrace->data, 
backtrace->size),
                      log_before_throw) {}
@@ -353,7 +355,7 @@ class ErrorBuilder {
 #pragma warning(disable : 4722)
 #endif
   // avoid inline to reduce binary size, error throw path do not need to be 
fast
-  [[noreturn]] ~ErrorBuilder() noexcept(false) {
+  [[noreturn]] TVM_FFI_COLD_CODE ~ErrorBuilder() noexcept(false) {
     ::tvm::ffi::Error error(std::move(kind_), stream_.str(), 
std::move(backtrace_));
     if (log_before_throw_) {
       std::cerr << error.FullMessage();
@@ -456,7 +458,8 @@ TVM_FFI_CHECK_FUNC(_NE, !=)
   TVM_FFI_THROW(ErrorKind) << "Check failed: " << #x " " #op " " #y << 
*__tvm_ffi_log_err << ": "
 
 #define TVM_FFI_CHECK(cond, ErrorKind) \
-  if (!(cond)) TVM_FFI_THROW(ErrorKind) << "Check failed: (" #cond << ") is 
false: "
+  if (TVM_FFI_PREDICT_FALSE(!(cond)))  \
+  TVM_FFI_THROW(ErrorKind) << "Check failed: (" #cond << ") is false: "
 
 #define TVM_FFI_CHECK_LT(x, y, ErrorKind) TVM_FFI_CHECK_BINARY_OP(_LT, <, x, 
y, ErrorKind)
 #define TVM_FFI_CHECK_GT(x, y, ErrorKind) TVM_FFI_CHECK_BINARY_OP(_GT, >, x, 
y, ErrorKind)
diff --git a/include/tvm/ffi/function.h b/include/tvm/ffi/function.h
index 4ec0e00..1100603 100644
--- a/include/tvm/ffi/function.h
+++ b/include/tvm/ffi/function.h
@@ -101,7 +101,7 @@ namespace ffi {
 #define TVM_FFI_CHECK_SAFE_CALL(func)                      \
   {                                                        \
     int ret_code = (func);                                 \
-    if (ret_code != 0) {                                   \
+    if (TVM_FFI_PREDICT_FALSE(ret_code != 0)) {            \
       throw ::tvm::ffi::details::MoveFromSafeCallRaised(); \
     }                                                      \
   }
diff --git a/python/tvm_ffi/cython/tvm_ffi_python_helpers.h 
b/python/tvm_ffi/cython/tvm_ffi_python_helpers.h
index 79494d5..c82b09c 100644
--- a/python/tvm_ffi/cython/tvm_ffi_python_helpers.h
+++ b/python/tvm_ffi/cython/tvm_ffi_python_helpers.h
@@ -36,6 +36,29 @@
 #endif
 #endif
 
+// Local mirror of TVM_FFI_COLD_CODE / TVM_FFI_PREDICT_* from
+// <tvm/ffi/base_details.h>. The Cython helper deliberately avoids that header
+// (keeps the include surface c-headers-only), so we duplicate the macro
+// definitions here. Keep these in sync with base_details.h: same expansion on
+// GCC/Clang, no-op on MSVC.
+#ifndef TVM_FFI_COLD_CODE
+#if defined(__GNUC__) || defined(__clang__)
+#define TVM_FFI_COLD_CODE [[gnu::cold]]
+#else
+#define TVM_FFI_COLD_CODE
+#endif
+#endif
+
+#ifndef TVM_FFI_PREDICT_FALSE
+#if defined(__GNUC__) || defined(__clang__)
+#define TVM_FFI_PREDICT_FALSE(cond) (__builtin_expect(static_cast<bool>(cond), 
0))
+#define TVM_FFI_PREDICT_TRUE(cond) (__builtin_expect(static_cast<bool>(cond), 
1))
+#else
+#define TVM_FFI_PREDICT_FALSE(cond) (cond)
+#define TVM_FFI_PREDICT_TRUE(cond) (cond)
+#endif
+#endif
+
 #include <cstring>
 #include <exception>
 #include <iostream>
@@ -252,7 +275,7 @@ int TVMFFIPyArgSetterInt_(TVMFFIPyArgSetter*, 
TVMFFIPyCallContext*, PyObject* ar
   out->type_index = kTVMFFIInt;
   out->v_int64 = PyLong_AsLongLongAndOverflow(arg, &overflow);
 
-  if (overflow != 0) {
+  if (TVM_FFI_PREDICT_FALSE(overflow != 0)) {
     PyErr_SetString(PyExc_OverflowError, "Python int too large to convert to 
int64_t");
     return -1;
   }
@@ -454,7 +477,7 @@ class TVMFFIPyCallManager {
                               int* c_api_ret_code, bool release_gil,
                               const DLPackExchangeAPI** 
optional_out_ctx_dlpack_api) {
     int64_t num_args = PyTuple_Size(py_arg_tuple);
-    if (num_args == -1) return -1;
+    if (TVM_FFI_PREDICT_FALSE(num_args == -1)) return -1;
     try {
       // allocate a call stack
       TVMFFIPyCallContext ctx(&call_stack_, num_args);
@@ -462,7 +485,7 @@ class TVMFFIPyCallManager {
       for (int64_t i = 0; i < num_args; ++i) {
         PyObject* py_arg = PyTuple_GetItem(py_arg_tuple, i);
         TVMFFIAny* c_arg = ctx.packed_args + i;
-        if (SetArgument(&ctx, py_arg, c_arg) != 0) return -1;
+        if (TVM_FFI_PREDICT_FALSE(SetArgument(&ctx, py_arg, c_arg) != 0)) 
return -1;
       }
       TVMFFIStreamHandle prev_stream = nullptr;
       DLPackManagedTensorAllocator prev_tensor_allocator = nullptr;
@@ -471,13 +494,13 @@ class TVMFFIPyCallManager {
         c_api_ret_code[0] =
             TVMFFIEnvSetStream(ctx.device_type, ctx.device_id, ctx.stream, 
&prev_stream);
         // setting failed, directly return
-        if (c_api_ret_code[0] != 0) return 0;
+        if (TVM_FFI_PREDICT_FALSE(c_api_ret_code[0] != 0)) return 0;
       }
       if (ctx.dlpack_c_exchange_api != nullptr &&
           ctx.dlpack_c_exchange_api->managed_tensor_allocator != nullptr) {
         c_api_ret_code[0] = TVMFFIEnvSetDLPackManagedTensorAllocator(
             ctx.dlpack_c_exchange_api->managed_tensor_allocator, 0, 
&prev_tensor_allocator);
-        if (c_api_ret_code[0] != 0) return 0;
+        if (TVM_FFI_PREDICT_FALSE(c_api_ret_code[0] != 0)) return 0;
       }
       // call the function
       if (release_gil) {
@@ -491,7 +514,8 @@ class TVMFFIPyCallManager {
       // restore the original stream
       if (ctx.device_type != -1 && prev_stream != ctx.stream) {
         // always try recover first, even if error happens
-        if (TVMFFIEnvSetStream(ctx.device_type, ctx.device_id, prev_stream, 
nullptr) != 0) {
+        if (TVM_FFI_PREDICT_FALSE(
+                TVMFFIEnvSetStream(ctx.device_type, ctx.device_id, 
prev_stream, nullptr) != 0)) {
           // recover failed, set python error
           PyErr_SetString(PyExc_RuntimeError, "Failed to recover stream");
           return -1;
@@ -502,12 +526,13 @@ class TVMFFIPyCallManager {
           prev_tensor_allocator != 
ctx.dlpack_c_exchange_api->managed_tensor_allocator) {
         // note: we cannot set the error value to c_api_ret_code[0] here 
because it
         // will be overwritten by the error value from the function call
-        if (TVMFFIEnvSetDLPackManagedTensorAllocator(prev_tensor_allocator, 0, 
nullptr) != 0) {
+        if (TVM_FFI_PREDICT_FALSE(
+                
TVMFFIEnvSetDLPackManagedTensorAllocator(prev_tensor_allocator, 0, nullptr) != 
0)) {
           PyErr_SetString(PyExc_RuntimeError, "Failed to recover DLPack 
managed tensor allocator");
           return -1;
         }
         // return error after
-        if (c_api_ret_code[0] != 0) return 0;
+        if (TVM_FFI_PREDICT_FALSE(c_api_ret_code[0] != 0)) return 0;
       }
       if (optional_out_ctx_dlpack_api != nullptr && ctx.dlpack_c_exchange_api 
!= nullptr) {
         *optional_out_ctx_dlpack_api = ctx.dlpack_c_exchange_api;
@@ -540,7 +565,7 @@ class TVMFFIPyCallManager {
   TVM_FFI_INLINE int ConstructorCall(void* func_handle, PyObject* 
py_arg_tuple, TVMFFIAny* result,
                                      int* c_api_ret_code, TVMFFIPyCallContext* 
parent_ctx) {
     int64_t num_args = PyTuple_Size(py_arg_tuple);
-    if (num_args == -1) return -1;
+    if (TVM_FFI_PREDICT_FALSE(num_args == -1)) return -1;
     try {
       // allocate a call stack
       TVMFFIPyCallContext ctx(&call_stack_, num_args);
@@ -548,7 +573,7 @@ class TVMFFIPyCallManager {
       for (int64_t i = 0; i < num_args; ++i) {
         PyObject* py_arg = PyTuple_GetItem(py_arg_tuple, i);
         TVMFFIAny* c_arg = ctx.packed_args + i;
-        if (SetArgument(&ctx, py_arg, c_arg) != 0) return -1;
+        if (TVM_FFI_PREDICT_FALSE(SetArgument(&ctx, py_arg, c_arg) != 0)) 
return -1;
       }
       c_api_ret_code[0] = TVMFFIFunctionCall(func_handle, ctx.packed_args, 
num_args, result);
       // propagate the call context to the parent context
@@ -577,7 +602,7 @@ class TVMFFIPyCallManager {
     try {
       TVMFFIPyCallContext ctx(&call_stack_, 1);
       TVMFFIAny* c_arg = ctx.packed_args;
-      if (SetArgument(&ctx, py_arg, c_arg) != 0) return -1;
+      if (TVM_FFI_PREDICT_FALSE(SetArgument(&ctx, py_arg, c_arg) != 0)) return 
-1;
       if (!(field_flags & kTVMFFIFieldFlagBitSetterIsFunctionObj)) {
         auto setter = reinterpret_cast<TVMFFIFieldSetter>(field_setter);
         c_api_ret_code[0] = (*setter)(field_ptr, c_arg);
@@ -603,7 +628,7 @@ class TVMFFIPyCallManager {
     try {
       TVMFFIPyCallContext ctx(&call_stack_, 1);
       TVMFFIAny* c_arg = ctx.packed_args;
-      if (SetArgument(&ctx, py_arg, c_arg) != 0) return -1;
+      if (TVM_FFI_PREDICT_FALSE(SetArgument(&ctx, py_arg, c_arg) != 0)) return 
-1;
       c_api_ret_code[0] = TVMFFIAnyViewToOwnedAny(c_arg, out);
       return 0;
     } catch (const std::exception& ex) {
@@ -629,20 +654,20 @@ class TVMFFIPyCallManager {
     // find the pre-cached setter
     // This class is thread-local, so we don't need to worry about race 
condition
     auto it = arg_dispatch_map_.find(py_type);
-    if (it != arg_dispatch_map_.end()) {
+    if (TVM_FFI_PREDICT_TRUE(it != arg_dispatch_map_.end())) {
       TVMFFIPyArgSetter setter = it->second;
       // if error happens, propagate it back
-      if (setter(ctx, py_arg, out) != 0) return -1;
+      if (TVM_FFI_PREDICT_FALSE(setter(ctx, py_arg, out) != 0)) return -1;
     } else {
       // no dispatch found, query and create a new one.
       TVMFFIPyArgSetter setter;
       // propagate python error back
-      if (TVMFFICyArgSetterFactory(py_arg, &setter) != 0) {
+      if (TVM_FFI_PREDICT_FALSE(TVMFFICyArgSetterFactory(py_arg, &setter) != 
0)) {
         return -1;
       }
       // update dispatch table
       arg_dispatch_map_.emplace(py_type, setter);
-      if (setter(ctx, py_arg, out) != 0) return -1;
+      if (TVM_FFI_PREDICT_FALSE(setter(ctx, py_arg, out) != 0)) return -1;
     }
     return 0;
   }
@@ -706,8 +731,8 @@ class TVMFFIPyCallManager {
       TVMFFIPyCallbackContext cb_ctx(&call_stack_, num_args);
       // Step 1: Convert each packed arg (borrowed AnyView) to a PyObject*
       for (int32_t i = 0; i < num_args; ++i) {
-        if (SetPyCallbackArg(closure->dlpack_exchange_api, &packed_args[i], 
&cb_ctx.py_args[i]) !=
-            0) {
+        if 
(TVM_FFI_PREDICT_FALSE(SetPyCallbackArg(closure->dlpack_exchange_api, 
&packed_args[i],
+                                                   &cb_ctx.py_args[i]) != 0)) {
           ForwardPyErrorToFFI();
           return -1;
         }
@@ -749,7 +774,7 @@ class TVMFFIPyCallManager {
         // The guard's destructor runs AFTER the return value is computed.
         TVMFFIPyCallContext ret_ctx(&call_stack_, 1);
         TVMFFIAny* view = ret_ctx.packed_args;
-        if (SetArgument(&ret_ctx, py_result.p, view) != 0) {
+        if (TVM_FFI_PREDICT_FALSE(SetArgument(&ret_ctx, py_result.p, view) != 
0)) {
           ForwardPyErrorToFFI();
           return -1;
         }
@@ -776,7 +801,7 @@ class TVMFFIPyCallManager {
    * returned by PyErr_Occurred()) so that set_last_ffi_error can access the
    * message and traceback.
    */
-  static void ForwardPyErrorToFFI() noexcept {
+  TVM_FFI_COLD_CODE static void ForwardPyErrorToFFI() noexcept {
 #if PY_VERSION_HEX >= 0x030C0000
     // Python 3.12+: PyErr_Fetch / PyErr_NormalizeException are deprecated.
     // PyErr_GetRaisedException returns an already-normalized exception
diff --git a/src/ffi/backtrace.cc b/src/ffi/backtrace.cc
index 0e2e9b4..8a349d2 100644
--- a/src/ffi/backtrace.cc
+++ b/src/ffi/backtrace.cc
@@ -147,6 +147,7 @@ const TVMFFIByteArray* TVMFFIBacktrace(const char* 
filename, int lineno, const c
 }
 
 #if TVM_FFI_BACKTRACE_ON_SEGFAULT
+TVM_FFI_COLD_CODE
 void TVMFFISegFaultHandler(int sig) {
   // Technically we shouldn't do any allocation in a signal handler, but
   // Backtrace may allocate. What's the worst it could do? We're already
@@ -163,6 +164,7 @@ void TVMFFISegFaultHandler(int sig) {
   raise(sig);
 }
 
+TVM_FFI_COLD_CODE
 __attribute__((constructor)) void TVMFFIInstallSignalHandler() {
   // this may override already installed signal handlers
   std::signal(SIGSEGV, TVMFFISegFaultHandler);
diff --git a/src/ffi/error.cc b/src/ffi/error.cc
index 8dd209c..058002f 100644
--- a/src/ffi/error.cc
+++ b/src/ffi/error.cc
@@ -20,6 +20,7 @@
  * \file src/ffi/error.cc
  * \brief Error handling implementation
  */
+#include <tvm/ffi/base_details.h>
 #include <tvm/ffi/c_api.h>
 #include <tvm/ffi/error.h>
 
diff --git a/src/ffi/function.cc b/src/ffi/function.cc
index 4b378f7..15ffd80 100644
--- a/src/ffi/function.cc
+++ b/src/ffi/function.cc
@@ -83,7 +83,7 @@ class GlobalFunctionTable {
   };
 
   void Update(const String& name, Function func, bool can_override) {
-    if (table_.count(name)) {
+    if (TVM_FFI_PREDICT_FALSE(table_.count(name) != 0)) {
       if (!can_override) {
         TVM_FFI_THROW(RuntimeError) << "Global Function `" << name << "` is 
already registered";
       }
@@ -93,7 +93,7 @@ class GlobalFunctionTable {
 
   void Update(const TVMFFIMethodInfo* method_info, bool can_override) {
     String name(method_info->name.data, method_info->name.size);
-    if (table_.count(name)) {
+    if (TVM_FFI_PREDICT_FALSE(table_.count(name) != 0)) {
       if (!can_override) {
         TVM_FFI_LOG_AND_THROW(RuntimeError)
             << "Global Function `" << name << "` is already registered, 
possible causes:\n"

(tvm-ffi) branch main updated: [BUILD] TVM_FFI_COLD_CODE / TVM_FFI_PREDICT_FALSE macros and cold-marking of error paths (#589)

Reply via email to