This change attempts to track the actual available stack and stop
early if the flow translation logic is about to overflow it.

Unlike the recursion depth counter, this approach allows to track the
actual stack usage and bail early even if the recursion depth is not
reached yet.  This is important because different actions have vastly
different stack requirements and different systems have different
amount of stack allocated per thread by default.

Should work with both GCC and Clang, will likely not work on Windows.
The change should have no effect on platforms / compilers that do not
support checking the current stack frame address via builtins.

The main thread is not treated fairly.  At least on Linux the main
thread can grow its stack if the limit is dynamically increased.
That is not normally true for other threads.  However, this patch
sticks to initial stack size even for the main thread.  This should
not be a problem for OVS though, as vast majority of all the packet
processing is normally done by handlers, revalidators or PMD threads.

Unlike the previous RFC version of this change [1], we're not trying
to work around the stack limits by recirculating packets through the
datapath.  Practice shows that such techniques may lead to self-DoS,
overwhelming OVS with upcalls.  See the ovn-controller self-DoS issue
fixed recently in OVN:
  https://mail.openvswitch.org/pipermail/ovs-dev/2025-October/426746.html
So, it's safer to just drop the packets and only execute actions that
we can translate safely.  All in all, stack exhaustion usually means
a loop or otherwise a very inefficient OpenFlow pipeline that should
be fixed by the users.  Attempts to process the whole thing would only
mask the problem.

It seems hard to create a unit test for this, as support for measuring
the actual stack depth as well as the amount of space each stack frame
takes and the ways to limit the stack largely depend on a platform and
a compiler.  Can be tested manually with an infinite resubmit case:

  make -j8
  (ulimit -s 386; make sandbox)
  ovs-vsctl add-br br0
  ovs-vsctl add-port br0 p1
  ovs-ofctl del-flows br0
  (for i in $(seq 0 64); do
      j=$(expr $i + 1);
      echo "table=$i, actions=local,resubmit(,$j),local,resubmit(,$j),local";
   done;
   echo "table=65, actions=resubmit(,0)") > ./resubmits.txt
  ovs-ofctl add-flows br0 ./resubmits.txt
  ovs-appctl ofproto/trace br0 'in_port=1' > ./trace.txt

[1] https://mail.openvswitch.org/pipermail/ovs-dev/2024-February/412048.html

Signed-off-by: Ilya Maximets <[email protected]>
---
 include/linux/openvswitch.h    |  1 +
 include/openvswitch/compiler.h | 11 ++++++++
 lib/odp-execute.c              |  4 +++
 lib/ovs-thread.c               | 47 ++++++++++++++++++++++++++++++++++
 lib/ovs-thread.h               |  8 ++++++
 lib/sat-math.h                 |  5 +---
 ofproto/ofproto-dpif-xlate.c   | 10 ++++++--
 vswitchd/ovs-vswitchd.c        |  1 +
 8 files changed, 81 insertions(+), 6 deletions(-)

diff --git a/include/linux/openvswitch.h b/include/linux/openvswitch.h
index e69fed4b7..8e0b9ad46 100644
--- a/include/linux/openvswitch.h
+++ b/include/linux/openvswitch.h
@@ -475,6 +475,7 @@ enum xlate_error {
        XLATE_TUNNEL_OUTPUT_NO_ETHERNET,
        XLATE_TUNNEL_NEIGH_CACHE_MISS,
        XLATE_TUNNEL_HEADER_BUILD_FAILED,
+       XLATE_THREAD_STACK_EXHAUSTED,
        XLATE_MAX,
 };
 #endif
diff --git a/include/openvswitch/compiler.h b/include/openvswitch/compiler.h
index bd30369a7..352980e19 100644
--- a/include/openvswitch/compiler.h
+++ b/include/openvswitch/compiler.h
@@ -29,6 +29,9 @@
 #ifndef __has_attribute
   #define __has_attribute(x) 0
 #endif
+#ifndef __has_builtin
+  #define __has_builtin(x) 0
+#endif
 
 /* To make OVS_NO_RETURN portable across gcc/clang and MSVC, it should be
  * added at the beginning of the function declaration. */
@@ -317,5 +320,13 @@
 #define BUILD_ASSERT_DECL_GCCONLY(EXPR) ((void) 0)
 #endif
 
+/* OVS_FRAME_ADDRESS can be used to get the address of the current stack frame.
+ * Note: Attempts to get address of any frame beside the current one (0) are
+ * dangerous and can lead to crashes according to GCC documentation.  */
+#if __has_builtin(__builtin_frame_address)
+#define OVS_FRAME_ADDRESS() ((char *) __builtin_frame_address(0))
+#else
+#define OVS_FRAME_ADDRESS() ((char *) 0)
+#endif
 
 #endif /* compiler.h */
diff --git a/lib/odp-execute.c b/lib/odp-execute.c
index ecbda8c01..2271e7d72 100644
--- a/lib/odp-execute.c
+++ b/lib/odp-execute.c
@@ -60,6 +60,7 @@ COVERAGE_DEFINE(drop_action_tunnel_routing_failed);
 COVERAGE_DEFINE(drop_action_tunnel_output_no_ethernet);
 COVERAGE_DEFINE(drop_action_tunnel_neigh_cache_miss);
 COVERAGE_DEFINE(drop_action_tunnel_header_build_failed);
+COVERAGE_DEFINE(drop_action_thread_stack_exhausted);
 
 static void
 dp_update_drop_action_counter(enum xlate_error drop_reason,
@@ -116,6 +117,9 @@ dp_update_drop_action_counter(enum xlate_error drop_reason,
    case XLATE_TUNNEL_HEADER_BUILD_FAILED:
         COVERAGE_ADD(drop_action_tunnel_header_build_failed, delta);
         break;
+   case XLATE_THREAD_STACK_EXHAUSTED:
+        COVERAGE_ADD(drop_action_thread_stack_exhausted, delta);
+        break;
    case XLATE_MAX:
    default:
         VLOG_ERR_RL(&rl, "Invalid Drop reason type: %d", drop_reason);
diff --git a/lib/ovs-thread.c b/lib/ovs-thread.c
index 243791de8..a390ed4f5 100644
--- a/lib/ovs-thread.c
+++ b/lib/ovs-thread.c
@@ -404,6 +404,52 @@ ovsthread_id_init(void)
     return *ovsthread_id_get() = atomic_count_inc(&next_id);
 }
 
+DEFINE_STATIC_PER_THREAD_DATA(uintptr_t, ovsthread_stack_base, 0);
+DEFINE_STATIC_PER_THREAD_DATA(size_t, ovsthread_stack_size, 0);
+
+void
+ovsthread_stack_init(void)
+{
+    pthread_attr_t attr;
+    size_t stacksize;
+    int error;
+
+    ovs_assert(*ovsthread_stack_base_get() == 0);
+    *ovsthread_stack_base_get() = (uintptr_t) OVS_FRAME_ADDRESS();
+
+    pthread_attr_init(&attr);
+
+    error = pthread_attr_getstacksize(&attr, &stacksize);
+    if (error) {
+        VLOG_ABORT("pthread_attr_getstacksize failed: %s",
+                   ovs_strerror(error));
+    }
+    *ovsthread_stack_size_get() = stacksize;
+    pthread_attr_destroy(&attr);
+}
+
+bool
+ovsthread_low_on_stack(void)
+{
+    uintptr_t curr = (uintptr_t) OVS_FRAME_ADDRESS();
+    uintptr_t base = *ovsthread_stack_base_get();
+    size_t size = *ovsthread_stack_size_get();
+    size_t used;
+
+    if (!curr || !base || !size) {
+        /* Either not initialized or not supported. */
+        return false;
+    }
+
+    used = (base > curr) ? base - curr : curr - base;
+
+    /* Consider 'low' as less than a 100 KB. */
+    if (size < used + 100 * 1024) {
+        return true;
+    }
+    return false;
+}
+
 static void *
 ovsthread_wrapper(void *aux_)
 {
@@ -412,6 +458,7 @@ ovsthread_wrapper(void *aux_)
     unsigned int id;
 
     id = ovsthread_id_init();
+    ovsthread_stack_init();
 
     aux = *auxp;
     free(auxp);
diff --git a/lib/ovs-thread.h b/lib/ovs-thread.h
index d37f4ffc4..55c9859b9 100644
--- a/lib/ovs-thread.h
+++ b/lib/ovs-thread.h
@@ -474,6 +474,14 @@ ovsthread_id_self(void)
     return id;
 }
 
+/* Enables use of ovsthread_low_on_stack().  Must be called by a new thread
+ * early after the thread creation, e.g. called from ovs_thread_create(). */
+void ovsthread_stack_init(void);
+
+/* Returns 'true' if the current thread used up most of its stack space.
+ * Can be used, for example, to check if recursion has to be stopped. */
+bool ovsthread_low_on_stack(void);
+
 /* Simulated global counter.
  *
  * Incrementing such a counter is meant to be cheaper than incrementing a
diff --git a/lib/sat-math.h b/lib/sat-math.h
index d66872387..34b939277 100644
--- a/lib/sat-math.h
+++ b/lib/sat-math.h
@@ -18,12 +18,9 @@
 #define SAT_MATH_H 1
 
 #include <limits.h>
+#include "openvswitch/compiler.h"
 #include "openvswitch/util.h"
 
-#ifndef __has_builtin
-#define __has_builtin(x) 0
-#endif
-
 /* Returns x + y, clamping out-of-range results into the range of the return
  * type. */
 static inline unsigned int
diff --git a/ofproto/ofproto-dpif-xlate.c b/ofproto/ofproto-dpif-xlate.c
index 2c8197fb7..1af2c4118 100644
--- a/ofproto/ofproto-dpif-xlate.c
+++ b/ofproto/ofproto-dpif-xlate.c
@@ -463,6 +463,8 @@ const char *xlate_strerror(enum xlate_error error)
         return "Tunnel neighbor cache miss";
     case XLATE_TUNNEL_HEADER_BUILD_FAILED:
         return "Native tunnel header build failed";
+    case XLATE_THREAD_STACK_EXHAUSTED:
+        return "Thread stack exhausted";
     case XLATE_MAX:
         break;
     }
@@ -4702,6 +4704,9 @@ xlate_resubmit_resource_check(struct xlate_ctx *ctx)
     } else if (ctx->stack.size >= 65536) {
         xlate_report_error(ctx, "resubmits yielded over 64 kB of stack");
         ctx->error = XLATE_STACK_TOO_DEEP;
+    } else if (ovsthread_low_on_stack()) {
+        xlate_report_error(ctx, "thread stack exhausted.");
+        ctx->error = XLATE_THREAD_STACK_EXHAUSTED;
     } else {
         return true;
     }
@@ -4906,8 +4911,9 @@ xlate_group_bucket(struct xlate_ctx *ctx, struct 
ofputil_bucket *bucket,
      *
      * Exception to above is errors which are system limits to protect
      * translation from running too long or occupy too much space. These errors
-     * should not be masked. XLATE_RECURSION_TOO_DEEP, XLATE_TOO_MANY_RESUBMITS
-     * and XLATE_STACK_TOO_DEEP fall in this category. */
+     * should not be masked. Following errors fall in this category:
+     * XLATE_RECURSION_TOO_DEEP, XLATE_TOO_MANY_RESUBMITS,
+     * XLATE_STACK_TOO_DEEP and XLATE_THREAD_STACK_EXHAUSTED. */
     if (ctx->error == XLATE_TOO_MANY_MPLS_LABELS ||
         ctx->error == XLATE_UNSUPPORTED_PACKET_TYPE) {
         /* reset the error and continue processing other buckets */
diff --git a/vswitchd/ovs-vswitchd.c b/vswitchd/ovs-vswitchd.c
index 6d90c73b8..99a090c15 100644
--- a/vswitchd/ovs-vswitchd.c
+++ b/vswitchd/ovs-vswitchd.c
@@ -94,6 +94,7 @@ main(int argc, char *argv[])
     fatal_ignore_sigpipe();
 
     daemonize_start(true, hw_rawio_access);
+    ovsthread_stack_init();
 
     if (want_mlockall) {
 #ifdef HAVE_MLOCKALL
-- 
2.51.0

_______________________________________________
dev mailing list
[email protected]
https://mail.openvswitch.org/mailman/listinfo/ovs-dev

Reply via email to