This is an automated email from the ASF dual-hosted git repository.
guangmingchen pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/brpc.git
The following commit(s) were added to refs/heads/master by this push:
new bea48d75 Bugfix: SignalTrace mode has memory and deadlock issues
(#3019)
bea48d75 is described below
commit bea48d75fe8d57f5dd6dd3e9d0b0678d9d6d53a4
Author: Bright Chen <[email protected]>
AuthorDate: Fri Jul 18 23:35:00 2025 +0800
Bugfix: SignalTrace mode has memory and deadlock issues (#3019)
* Bugfix: SignalTrace mode has memory and deadlock issues
* Bugfix: Memory leak of SignalSync and wrong status of global priority
bthread
---
.github/actions/init-ut-make-config/action.yml | 10 +-
.../actions/install-all-dependencies/action.yml | 10 +-
.github/workflows/ci-linux.yml | 4 +-
BUILD.bazel | 15 +-
CMakeLists.txt | 6 +-
MODULE.bazel | 2 +-
config_brpc.sh | 7 +
docs/cn/bthread_tracer.md | 7 +-
src/bthread/task_control.cpp | 2 +-
src/bthread/task_group.cpp | 13 +-
src/bthread/task_tracer.cpp | 335 ++++++++-------------
src/bthread/task_tracer.h | 56 ++--
test/bthread_unittest.cpp | 30 +-
13 files changed, 206 insertions(+), 291 deletions(-)
diff --git a/.github/actions/init-ut-make-config/action.yml
b/.github/actions/init-ut-make-config/action.yml
index 891e45b2..9df81036 100644
--- a/.github/actions/init-ut-make-config/action.yml
+++ b/.github/actions/init-ut-make-config/action.yml
@@ -7,6 +7,8 @@ runs:
steps:
- run: |
sudo apt-get update && sudo apt-get install -y clang-12 lldb-12 lld-12
libgtest-dev cmake gdb libstdc++6-11-dbg
+ shell: bash
+ - run: |
cd /usr/src/gtest && export CC=clang-12 && export CXX=clang++-12 && sudo
cmake -DCMAKE_POLICY_VERSION_MINIMUM=3.5 .
sudo make -j ${{env.proc_num}} && sudo mv lib/libgtest* /usr/lib/
shell: bash
@@ -22,5 +24,11 @@ runs:
sudo ./autogen.sh && sudo CC=clang-12 CXX=clang++-12 ./configure
--prefix=/gperftools --enable-frame-pointers
sudo make -j ${{env.proc_num}} && sudo make install
shell: bash
- - run: sh config_brpc.sh --headers="/libunwind/include /gperftools/include
/usr/include" --libs="/libunwind/lib /gperftools/lib /usr/lib /usr/lib64"
${{inputs.options}}
+ - run: |
+ sudo git clone https://github.com/abseil/abseil-cpp.git
+ cd abseil-cpp && sudo git checkout lts_2022_06_23 && sudo mkdir -p
/abseil-cpp
+ sudo CC=clang-12 CXX=clang++-12 cmake -DBUILD_TESTING=OFF
-DCMAKE_POSITION_INDEPENDENT_CODE=ON -DCMAKE_INSTALL_PREFIX=/abseil-cpp
-DCMAKE_POLICY_VERSION_MINIMUM=3.5 .
+ sudo make -j ${{env.proc_num}} && sudo sudo make install
+ shell: bash
+ - run: sh config_brpc.sh --headers="/libunwind/include /gperftools/include
/abseil-cpp/include /usr/include" --libs="/libunwind/lib /gperftools/lib
/abseil-cpp/lib /usr/lib /usr/lib64" --cc=clang-12 --cxx=clang++-12
${{inputs.options}} && cat config.mk
shell: bash
diff --git a/.github/actions/install-all-dependencies/action.yml
b/.github/actions/install-all-dependencies/action.yml
index cb4d0002..179f86cd 100644
--- a/.github/actions/install-all-dependencies/action.yml
+++ b/.github/actions/install-all-dependencies/action.yml
@@ -4,7 +4,13 @@ runs:
- uses: ./.github/actions/install-essential-dependencies
- run: sudo apt-get install -y libunwind-dev libgoogle-glog-dev automake
bison flex libboost-all-dev libevent-dev libtool pkg-config libibverbs1
libibverbs-dev
shell: bash
- - run: wget
https://archive.apache.org/dist/thrift/0.11.0/thrift-0.11.0.tar.gz && tar -xf
thrift-0.11.0.tar.gz
+ - run: |
+ wget
https://archive.apache.org/dist/thrift/0.11.0/thrift-0.11.0.tar.gz && tar -xf
thrift-0.11.0.tar.gz && cd thrift-0.11.0/
+ ./configure --prefix=/usr --with-rs=no --with-ruby=no --with-python=no
--with-java=no --with-go=no --with-perl=no --with-php=no --with-csharp=no
--with-erlang=no --with-lua=no --with-nodejs=no --with-haskell=no
--with-dotnetcore=no CXXFLAGS="-Wno-unused-variable"
+ make -j ${{env.proc_num}} && sudo make install
shell: bash
- - run: cd thrift-0.11.0/ && ./configure --prefix=/usr --with-rs=no
--with-ruby=no --with-python=no --with-java=no --with-go=no --with-perl=no
--with-php=no --with-csharp=no --with-erlang=no --with-lua=no --with-nodejs=no
--with-haskell=no --with-dotnetcore=no CXXFLAGS="-Wno-unused-variable" && make
-j ${{env.proc_num}} && sudo make install
+ - run: |
+ git clone https://github.com/abseil/abseil-cpp.git && cd abseil-cpp &&
git checkout lts_2022_06_23
+ cmake -DBUILD_TESTING=OFF -DCMAKE_POSITION_INDEPENDENT_CODE=ON
-DCMAKE_INSTALL_PREFIX=/usr -DCMAKE_POLICY_VERSION_MINIMUM=3.5 .
+ make -j ${{env.proc_num}} && sudo make install
shell: bash
diff --git a/.github/workflows/ci-linux.yml b/.github/workflows/ci-linux.yml
index dbceac1a..b1704aa4 100644
--- a/.github/workflows/ci-linux.yml
+++ b/.github/workflows/ci-linux.yml
@@ -176,7 +176,7 @@ jobs:
- uses: ./.github/actions/install-essential-dependencies
- uses: ./.github/actions/init-ut-make-config
with:
- options: --cc=clang-12 --cxx=clang++-12 --with-bthread-tracer
+ options: --with-bthread-tracer
- name: compile tests
run: |
cat config.mk
@@ -194,7 +194,7 @@ jobs:
- uses: ./.github/actions/install-essential-dependencies
- uses: ./.github/actions/init-ut-make-config
with:
- options: --cc=clang-12 --cxx=clang++-12 --with-bthread-tracer
--with-asan
+ options: --with-bthread-tracer --with-asan
- name: compile tests
run: |
cat config.mk
diff --git a/BUILD.bazel b/BUILD.bazel
index f6be0807..d2b3d946 100644
--- a/BUILD.bazel
+++ b/BUILD.bazel
@@ -50,9 +50,6 @@ COPTS = [
}) + select({
"//bazel/config:brpc_with_debug_lock": ["-DBRPC_DEBUG_LOCK=1"],
"//conditions:default": ["-DBRPC_DEBUG_LOCK=0"],
-}) + select({
- "//bazel/config:brpc_with_bthread_tracer": ["-DBRPC_BTHREAD_TRACER"],
- "//conditions:default": [],
}) + select({
"//bazel/config:brpc_with_asan": ["-fsanitize=address"],
"//conditions:default": [""],
@@ -402,6 +399,12 @@ cc_library(
"src/bthread/*.h",
"src/bthread/*.list",
]),
+ defines = [] + select({
+ "//bazel/config:brpc_with_bthread_tracer": [
+ "-DBRPC_BTHREAD_TRACER",
+ ],
+ "//conditions:default": [],
+ }),
copts = COPTS,
includes = [
"src/",
@@ -412,7 +415,11 @@ cc_library(
":butil",
":bvar",
] + select({
- "//bazel/config:brpc_with_bthread_tracer":
["@com_github_libunwind_libunwind//:libunwind"],
+ "//bazel/config:brpc_with_bthread_tracer": [
+ "@com_github_libunwind_libunwind//:libunwind",
+ "@com_google_absl//absl/debugging:stacktrace",
+ "@com_google_absl//absl/debugging:symbolize",
+ ],
"//conditions:default": [],
}),
)
diff --git a/CMakeLists.txt b/CMakeLists.txt
index c72a1646..bcffdb9b 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -93,6 +93,8 @@ if (WITH_BTHREAD_TRACER)
if (NOT LIBUNWIND_INCLUDE_PATH OR NOT LIBUNWIND_LIB)
message(FATAL_ERROR "Fail to find libunwind, which is needed by
bthread tracer")
endif()
+ find_package(absl REQUIRED CONFIG)
+ set(bthread_tracer_ABSL_USED_TARGETS absl::base absl::stacktrace
absl::symbolize)
add_definitions(-DBRPC_BTHREAD_TRACER)
include_directories(${LIBUNWIND_INCLUDE_PATH})
endif ()
@@ -329,8 +331,8 @@ if(WITH_SNAPPY)
endif()
if (WITH_BTHREAD_TRACER)
- set(DYNAMIC_LIB ${DYNAMIC_LIB} ${LIBUNWIND_LIB} ${LIBUNWIND_X86_64_LIB})
- set(BRPC_PRIVATE_LIBS "${BRPC_PRIVATE_LIBS} -lunwind -lunwind-x86_64")
+ set(DYNAMIC_LIB ${DYNAMIC_LIB} ${LIBUNWIND_LIB} ${LIBUNWIND_X86_64_LIB}
${bthread_tracer_ABSL_USED_TARGETS})
+ set(BRPC_PRIVATE_LIBS "${BRPC_PRIVATE_LIBS} -lunwind -lunwind-x86_64
-labsl_stacktrace -labsl_symbolize -labsl_debugging_internal
-labsl_demangle_internal -labsl_malloc_internal -labsl_raw_logging_internal
-labsl_spinlock_wait -labsl_base")
endif()
if(CMAKE_SYSTEM_NAME STREQUAL "Linux")
diff --git a/MODULE.bazel b/MODULE.bazel
index b9474156..2439a881 100644
--- a/MODULE.bazel
+++ b/MODULE.bazel
@@ -16,7 +16,7 @@ bazel_dep(name = "apple_support", version = "1.17.1")
bazel_dep(name = 'rules_cc', version = '0.0.1')
bazel_dep(name = 'rules_proto', version = '4.0.0')
bazel_dep(name = 'zlib', version = '1.3.1.bcr.5', repo_name =
'com_github_madler_zlib')
-bazel_dep(name = "libunwind", version = "1.8.1", repo_name =
'com_github_libunwind_libunwind')
+bazel_dep(name = 'libunwind', version = '1.8.1', repo_name =
'com_github_libunwind_libunwind')
# --registry=https://baidu.github.io/babylon/registry
bazel_dep(name = 'leveldb', version = '1.23', repo_name =
'com_github_google_leveldb')
diff --git a/config_brpc.sh b/config_brpc.sh
index a16bab81..75826452 100755
--- a/config_brpc.sh
+++ b/config_brpc.sh
@@ -363,6 +363,8 @@ if [ $WITH_BTHREAD_TRACER != 0 ]; then
fi
LIBUNWIND_HDR=$(find_dir_of_header_or_die libunwind.h)
LIBUNWIND_LIB=$(find_dir_of_lib_or_die unwind)
+ ABSL_HDR=$(find_dir_of_header_or_die absl/base/config.h)
+ ABSL_LIB=$(find_dir_of_lib_or_die absl_symbolize)
CPPFLAGS="${CPPFLAGS} -DBRPC_BTHREAD_TRACER"
@@ -371,6 +373,11 @@ if [ $WITH_BTHREAD_TRACER != 0 ]; then
else
STATIC_LINKINGS="$STATIC_LINKINGS -lunwind -lunwind-x86_64"
fi
+ if [ -f "$ABSL_LIB/libabsl_base.$SO" ]; then
+ DYNAMIC_LINKINGS="$DYNAMIC_LINKINGS -labsl_stacktrace -labsl_symbolize
-labsl_debugging_internal -labsl_demangle_internal -labsl_malloc_internal
-labsl_raw_logging_internal -labsl_spinlock_wait -labsl_base"
+ else
+ STATIC_LINKINGS="$STATIC_LINKINGS -labsl_stacktrace -labsl_symbolize
-labsl_debugging_internal -labsl_demangle_internal -labsl_malloc_internal
-labsl_raw_logging_internal -labsl_spinlock_wait -labsl_base"
+ fi
fi
HDRS=$($ECHO
"$LIBUNWIND_HDR\n$GFLAGS_HDR\n$PROTOBUF_HDR\n$ABSL_HDR\n$LEVELDB_HDR\n$OPENSSL_HDR"
| sort | uniq)
diff --git a/docs/cn/bthread_tracer.md b/docs/cn/bthread_tracer.md
index 7758bee1..cc6e3a1e 100644
--- a/docs/cn/bthread_tracer.md
+++ b/docs/cn/bthread_tracer.md
@@ -58,8 +58,8 @@ jump_stack是bthread挂起或者运行的必经之路,也是STB的拦截点。
# 使用方法
-1. 下载安装libunwind。
-2.
给config_brpc.sh增加`--with-bthread-tracer`选项或者给cmake增加`-DWITH_BTHREAD_TRACER=ON`选项。
+1. 下载安装libunwind和abseil-cpp。
+2.
给config_brpc.sh增加`--with-bthread-tracer`选项或者给cmake增加`-DWITH_BTHREAD_TRACER=ON`选项或者给bazel(Bzlmod模式)增加`--define
with_bthread_tracer=true`选项。
3.
访问服务的内置服务:`http://ip:port/bthreads/<bthread_id>?st=1`或者代码里调用`bthread::stack_trace()`函数。
4.
如果希望追踪pthread的调用栈,在对应pthread上调用`bthread::init_for_pthread_stack_trace()`函数获取一个伪bthread_t,然后使用步骤3即可获取pthread调用栈。
@@ -75,5 +75,4 @@ jump_stack是bthread挂起或者运行的必经之路,也是STB的拦截点。
# 相关flag
--
`enable_fast_unwind`:是否启用快速回溯功能,默认为true。大多数情况下,不需要关闭快速回溯功能。除非你关注的调用栈函数名转换失败,显示为`<unknown>`,则可以尝试关闭快速回溯功能,但这会导致性能下降。以包含30帧的调用栈举例,快速回溯基本上在200us以内就可以完成,而关闭快速回溯则需要4ms左右,性能下降了近20倍。
--
`signal_trace_timeout_ms`:信号追踪模式的超时时间,默认为50ms。虽然libunwind文档显示回溯功能是异步信号安全的,但是[gpertools社区发现libunwind在某些情况下会死锁](https://github.com/gperftools/gperftools/issues/775),所以TaskTracer会设置了超时时间,超时后会放弃回溯,打破死锁。
\ No newline at end of file
+- `signal_trace_timeout_ms`:信号追踪模式的超时时间,默认为50ms。
\ No newline at end of file
diff --git a/src/bthread/task_control.cpp b/src/bthread/task_control.cpp
index 47b2199b..66307d32 100644
--- a/src/bthread/task_control.cpp
+++ b/src/bthread/task_control.cpp
@@ -209,7 +209,7 @@ int TaskControl::init(int concurrency) {
"bthread_worker_usage", tag_str, _tagged_cumulated_worker_time[i],
1));
_tagged_nbthreads.push_back(new bvar::Adder<int64_t>("bthread_count",
tag_str));
if (_priority_queues[i].init(BTHREAD_MAX_CONCURRENCY) != 0) {
- LOG(FATAL) << "Fail to init _priority_q";
+ LOG(ERROR) << "Fail to init _priority_q";
return -1;
}
}
diff --git a/src/bthread/task_group.cpp b/src/bthread/task_group.cpp
index c18f7507..0d3e473e 100644
--- a/src/bthread/task_group.cpp
+++ b/src/bthread/task_group.cpp
@@ -496,7 +496,7 @@ int TaskGroup::start_foreground(TaskGroup** pg,
(bool)(using_attr.flags & BTHREAD_NOSIGNAL)
};
g->set_remained(fn, &args);
- TaskGroup::sched_to(pg, m->tid);
+ sched_to(pg, m->tid);
}
return 0;
}
@@ -870,6 +870,10 @@ void TaskGroup::ready_to_run_in_worker_ignoresignal(void*
args_in) {
void TaskGroup::priority_to_run(void* args_in) {
ReadyToRunArgs* args = static_cast<ReadyToRunArgs*>(args_in);
+#ifdef BRPC_BTHREAD_TRACER
+ tls_task_group->_control->_task_tracer.set_status(
+ TASK_STATUS_READY, args->meta);
+#endif // BRPC_BTHREAD_TRACER
return tls_task_group->control()->push_priority_queue(args->tag,
args->meta->tid);
}
@@ -1031,14 +1035,15 @@ int TaskGroup::interrupt(bthread_t tid, TaskControl* c,
bthread_tag_t tag) {
}
} else if (sleep_id != 0) {
if (get_global_timer_thread()->unschedule(sleep_id) == 0) {
- bthread::TaskGroup* g = bthread::tls_task_group;
+ TaskGroup* g = tls_task_group;
+ TaskMeta* m = address_meta(tid);
if (g) {
- g->ready_to_run(TaskGroup::address_meta(tid));
+ g->ready_to_run(m);
} else {
if (!c) {
return EINVAL;
}
-
c->choose_one_group(tag)->ready_to_run_remote(TaskGroup::address_meta(tid));
+ c->choose_one_group(tag)->ready_to_run_remote(m);
}
}
}
diff --git a/src/bthread/task_tracer.cpp b/src/bthread/task_tracer.cpp
index 1c75136c..cbd45d43 100644
--- a/src/bthread/task_tracer.cpp
+++ b/src/bthread/task_tracer.cpp
@@ -21,9 +21,12 @@
#include <unistd.h>
#include <poll.h>
#include <gflags/gflags.h>
+#include <absl/debugging/stacktrace.h>
+#include <absl/debugging/symbolize.h>
#include "butil/debug/stack_trace.h"
#include "butil/memory/scope_guard.h"
#include "butil/reloadable_flags.h"
+#include "butil/fd_utility.h"
#include "bthread/task_group.h"
#include "bthread/processor.h"
@@ -36,13 +39,11 @@ BUTIL_VALIDATE_GFLAG(signal_trace_timeout_ms,
butil::PositiveInteger<uint32_t>);
extern BAIDU_THREAD_LOCAL TaskMeta* pthread_fake_meta;
TaskTracer::SignalSync::~SignalSync() {
- if (_pipe_init) {
+ if (pipe_fds[0] >= 0) {
close(pipe_fds[0]);
- close(pipe_fds[1]);
}
-
- if (_sem_init) {
- sem_destroy(&sem);
+ if (pipe_fds[1] >= 0) {
+ close(pipe_fds[1]);
}
}
@@ -59,88 +60,60 @@ bool TaskTracer::SignalSync::Init() {
PLOG(ERROR) << "Fail to make_non_blocking";
return false;
}
- _pipe_init = true;
-
- if (sem_init(&sem, 0, 0) != 0) {
- PLOG(ERROR) << "Fail to sem_init";
- return false;
- }
- _sem_init = true;
return true;
}
-std::string TaskTracer::Result::OutputToString() {
+std::string TaskTracer::Result::OutputToString() const {
std::string str;
- if (err_count > 0 || frame_count > 0) {
+ if (frame_count > 0) {
str.reserve(1024);
}
+ char symbol_name[128];
+ char unknown_symbol_name[] = "<unknown>";
if (frame_count > 0) {
- if (fast_unwind) {
- butil::debug::StackTrace stack_trace((void**)&ips, frame_count);
- stack_trace.OutputToString(str);
- } else {
- for (size_t i = 0; i < frame_count; ++i) {
- butil::string_appendf(&str, "#%zu 0x%016lx ", i, ips[i]);
- if (mangled[i][0] == '\0') {
- str.append("<unknown>");
- } else {
- str.append(butil::demangle(mangled[i]));
- }
- if (i + 1 < frame_count) {
- str.push_back('\n');
- }
+ for (size_t i = 0; i < frame_count; ++i) {
+ butil::string_appendf(&str, "#%zu 0x%16p ", i, ips[i]);
+ if (absl::Symbolize(ips[i], symbol_name, arraysize(symbol_name))) {
+ str.append(symbol_name);
+ } else {
+ str.append(unknown_symbol_name);
+ }
+ if (i + 1 < frame_count) {
+ str.push_back('\n');
}
}
} else {
str.append("No frame");
}
-
- if (err_count > 0) {
- str.append("\nError message:\n");
- }
- for (size_t i = 0; i < err_count; ++i) {
- str.append(err_msg[i]);
- if (i + 1 < err_count) {
- str.push_back('\n');
- }
+ if (error) {
+ str.append("\nError message: ");
+ str.append(err_msg);
}
return str;
}
-void TaskTracer::Result::OutputToStream(std::ostream& os) {
+void TaskTracer::Result::OutputToStream(std::ostream& os) const {
+ char symbol_name[128];
+ char unknown_symbol_name[] = "<unknown>";
if (frame_count > 0) {
- if (fast_unwind) {
- butil::debug::StackTrace stack_trace((void**)&ips, frame_count);
- stack_trace.OutputToStream(&os);
- } else {
- for (size_t i = 0; i < frame_count; ++i) {
- os << "# " << i << " 0x" << std::hex << ips[i] << std::dec <<
" ";
- if (mangled[i][0] == '\0') {
- os << "<unknown>";
- } else {
- os << butil::demangle(mangled[i]);
- }
- if (i + 1 < frame_count) {
- os << '\n';
- }
+ for (size_t i = 0; i < frame_count; ++i) {
+ os << "# " << i << " 0x" << std::hex << ips[i] << std::dec << " ";
+ if (absl::Symbolize(ips[i], symbol_name, arraysize(symbol_name))) {
+ os << symbol_name;
+ } else {
+ os << unknown_symbol_name;
+ }
+ if (i + 1 < frame_count) {
+ os << '\n';
}
}
} else {
os << "No frame";
}
-
- if (err_count == 0) {
- return;
- }
-
- os << "\nError message:\n";
- for (size_t i = 0; i < err_count; ++i) {
- os << err_msg[i];
- if (i + 1 < err_count) {
- os << '\n';
- }
+ if (error) {
+ os << "\nError message: " << err_msg;
}
}
@@ -153,8 +126,8 @@ bool TaskTracer::Init() {
}
// Warm up the libunwind.
unw_cursor_t cursor;
- if (unw_getcontext(&_context) == 0 && unw_init_local(&cursor, &_context)
== 0) {
- butil::ignore_result(TraceCore(cursor));
+ if (0 == unw_getcontext(&_context) && unw_init_local(&cursor, &_context)
== 0) {
+ butil::ignore_result(TraceByLibunwind(cursor));
}
return true;
}
@@ -245,7 +218,7 @@ TaskTracer::Result TaskTracer::TraceImpl(bthread_t tid) {
if (tid == bthread_self() ||
(NULL != pthread_fake_meta && tid == pthread_fake_meta->tid)) {
- return Result::MakeErrorResult("Can not trace self=%d", tid);
+ return Result::MakeErrorResult("Forbid to trace self=%d", tid);
}
// Make sure only one bthread is traced at a time.
@@ -297,14 +270,13 @@ TaskTracer::Result TaskTracer::TraceImpl(bthread_t tid) {
}
// After jumping, the status may be RUNNING, SUSPENDED, or READY, which is
traceable.
- Result result{};
if (TASK_STATUS_RUNNING == status) {
- result = SignalTrace(worker_tid);
+ return SignalTrace(worker_tid);
} else if (TASK_STATUS_SUSPENDED == status || TASK_STATUS_READY == status)
{
- result = ContextTrace(m->stack->context);
+ return ContextTrace(m->stack->context);
}
- return result;
+ return Result::MakeErrorResult("Invalid TaskStatus=%d", status);
}
// Instruct ASan to ignore this function.
@@ -335,7 +307,28 @@ unw_cursor_t TaskTracer::MakeCursor(bthread_fcontext_t
fcontext) {
TaskTracer::Result TaskTracer::ContextTrace(bthread_fcontext_t fcontext) {
unw_cursor_t cursor = MakeCursor(fcontext);
- return TraceCore(cursor);
+ return TraceByLibunwind(cursor);
+}
+
+TaskTracer::Result TaskTracer::TraceByLibunwind(unw_cursor_t& cursor) {
+ Result result;
+ while (result.frame_count < arraysize(result.ips)) {
+ int rc = unw_step(&cursor);
+ if (0 == rc) {
+ break;
+ } else if (rc < 0) {
+ return Result::MakeErrorResult("Fail to unw_step, rc=%d", rc);
+ }
+
+ unw_word_t ip = 0;
+ if (0 != unw_get_reg(&cursor, UNW_REG_IP, &ip)) {
+ continue;
+ }
+ result.ips[result.frame_count] = reinterpret_cast<void*>(ip);
+ ++result.frame_count;
+ }
+
+ return result;
}
bool TaskTracer::RegisterSignalHandler() {
@@ -360,87 +353,25 @@ bool TaskTracer::RegisterSignalHandler() {
// Caution: This function should be async-signal-safety.
void TaskTracer::SignalHandler(int, siginfo_t* info, void* context) {
ErrnoGuard guard;
+ // Ref has been taken before the signal is sent, so no need to add ref
here.
butil::intrusive_ptr<SignalSync> signal_sync(
- static_cast<SignalSync*>(info->si_value.sival_ptr));
- if (NULL == signal_sync) {
+ static_cast<SignalSync*>(info->si_value.sival_ptr), false);
+ if (NULL == signal_sync || NULL == signal_sync->result) {
// The signal is not from Tracer, such as TaskControl, do nothing.
return;
}
-
- signal_sync->context = static_cast<unw_context_t*>(context);
- // Notify SignalTrace that SignalTraceHandler has started.
- // Binary semaphore do not fail, so no need to check return value.
- // sem_post() is async-signal-safe.
- sem_post(&signal_sync->sem);
-
- butil::Timer timer;
- if (FLAGS_signal_trace_timeout_ms > 0) {
- timer.start();
- }
- int timeout = -1;
- pollfd poll_fd = {signal_sync->pipe_fds[0], POLLIN, 0};
- // Wait for tracing to complete.
- while (true) {
- if (FLAGS_signal_trace_timeout_ms > 0) {
- timer.stop();
- // At least 1ms timeout.
- timeout = std::max(
- (int64_t)FLAGS_signal_trace_timeout_ms - timer.m_elapsed(),
(int64_t)1);
- }
- // poll() is async-signal-safe.
- // Similar to self-pipe trick:
https://man7.org/tlpi/code/online/dist/altio/self_pipe.c.html
- int rc = poll(&poll_fd, 1, timeout);
- if (-1 == rc && EINTR == errno) {
- continue;
- }
- // No need to read the pipe or handle errors, just return.
- return;
- }
-}
-
-// Caution: This fnction should be async-signal-safety.
-bool TaskTracer::WaitForSignalHandler(butil::intrusive_ptr<SignalSync>
signal_sync,
- const timespec* abs_timeout, Result&
result) {
- // It is safe to sem_timedwait() here and sem_post() in SignalHandler.
- while (sem_timedwait(&signal_sync->sem, abs_timeout) != 0) {
- if (EINTR == errno) {
- continue;
- }
- if (ETIMEDOUT == errno) {
- result.SetError("Timeout exceed %dms",
FLAGS_signal_trace_timeout_ms);
- } else {
- // During the process of signal handler,
- // can not use berro() which is not async-signal-safe.
- result.SetError("Fail to sem_timedwait, errno=%d", errno);
- }
- return false;
- }
- return true;
-}
-
-// Caution: This fnction should be async-signal-safety.
-void TaskTracer::WakeupSignalHandler(butil::intrusive_ptr<SignalSync>
signal_sync, Result& result) {
- while (true) {
- ssize_t nw = write(signal_sync->pipe_fds[1], "1", 1);
- if (0 < nw) {
- break;
- } else if (-1 == nw && EINTR == errno) {
- // Only EINTR is allowed. Even EAGAIN should not be returned.
- continue;
- }
- // During the process of signal handler,
- // can not use berro() which is not async-signal-safe.
- result.SetError("Fail to write pipe to notify signal handler,
errno=%d", errno);
- }
+ Result* result = signal_sync->result;
+ // Skip the first frame, which is the signal handler itself.
+ result->frame_count = absl::DefaultStackUnwinder(&result->ips[0], NULL,
+ arraysize(result->ips),
+ 1, context, NULL);
+ // write() is async-signal-safe.
+ // Don't care about the return value.
+ butil::ignore_result(write(signal_sync->pipe_fds[1], "1", 1));
}
TaskTracer::Result TaskTracer::SignalTrace(pid_t tid) {
// CAUTION:
- // The signal handler will wait for the backtrace to complete.
- // If the worker thread is interrupted when holding a resource(lock, etc),
- // and this function waits for the resource during capturing backtraces,
- // it may cause a deadlock.
- //
//
https://github.com/gperftools/gperftools/wiki/gperftools'-stacktrace-capturing-methods-and-their-issues#libunwind
// Generally, libunwind promises async-signal-safety for capturing
backtraces.
// But in practice, it is only partially async-signal-safe due to reliance
on
@@ -472,99 +403,69 @@ TaskTracer::Result TaskTracer::SignalTrace(pid_t tid) {
// #19 0x00007f0d390088dc in __cxa_throw () from
/lib/x86_64-linux-gnu/libstdc++.so.6
// #20 0x00007f0d3b5b2245 in __cxxabiv1::__cxa_throw
(thrownException=0x7f0d114ea8c0, type=0x7f0d3d6dd830 <typeinfo for
rockset::GRPCError>, destructor=<optimized out>) at
/src/folly/folly/experimental/exception_tracer/ExceptionTracerLib.cpp:107
//
- // Therefore, we do not capture backtracks in the signal handler to avoid
mutex
- // reentry and deadlock. Instead, we capture backtracks in this function
and
- // ends the signal handler after capturing backtraces is complete.
- // Even so, there is still a deadlock problem:
- // the worker thread is interrupted when during an existing
dl_iterate_phdr call,
- // and wait for the capturing backtraces to complete. This function capture
- // backtracks with dl_iterate_phdr. We introduce a timeout mechanism in
signal
- // handler to avoid deadlock.
+ // Therefore, use async-signal-safe absl::DefaultStackUnwinder instead of
libunwind.
+
+ // Remove unused SignalSyncs.
+ auto iter = std::remove_if(
+ _inuse_signal_syncs.begin(), _inuse_signal_syncs.end(),
+ [](butil::intrusive_ptr<SignalSync>& sync) {
+ return sync->ref_count() == 1;
+ });
+ _inuse_signal_syncs.erase(iter, _inuse_signal_syncs.end());
// Each signal trace has an independent SignalSync to
// prevent the previous SignalHandler from affecting the new SignalTrace.
- butil::intrusive_ptr<SignalSync> signal_sync(new SignalSync());
+ Result result;
+ butil::intrusive_ptr<SignalSync> signal_sync(new SignalSync(&result));
if (!signal_sync->Init()) {
- return Result::MakeErrorResult("Fail to init SignalSync");
+ result.SetError("Fail to init SignalSync");
+ return result;
}
+
// Add reference for SignalHandler.
signal_sync->AddRefManually();
- union sigval value{};
+ sigval value{};
value.sival_ptr = signal_sync.get();
size_t sigqueue_try = 0;
while (sigqueue(tid, SIGURG, value) != 0) {
if (errno != EAGAIN || sigqueue_try++ >= 3) {
- return Result::MakeErrorResult("Fail to sigqueue: %s", berror());
+ // Remove reference for SignalHandler.
+ signal_sync->RemoveRefManually();
+ result.SetError("Fail to sigqueue: %s", berror());
+ return result;
}
}
+ _inuse_signal_syncs.push_back(signal_sync);
- // Caution: Start here, need to ensure async-signal-safety.
- Result result;
- // Wakeup the signal handler at the end.
- BRPC_SCOPE_EXIT {
- WakeupSignalHandler(signal_sync, result);
- };
-
- timespec abs_timeout{};
- timespec* abs_timeout_ptr = NULL;
+ // Wait for the signal handler to complete.
+ butil::Timer timer;
if (FLAGS_signal_trace_timeout_ms > 0) {
- abs_timeout =
butil::milliseconds_from_now(FLAGS_signal_trace_timeout_ms);
- abs_timeout_ptr = &abs_timeout;
- }
- // Wait for the signal handler to start.
- if (!WaitForSignalHandler(signal_sync, abs_timeout_ptr, result)) {
- return result;
- }
-
- if (NULL == signal_sync->context) {
- result.SetError("context is NULL");
- return result;
- }
- unw_cursor_t cursor;
- int rc = unw_init_local(&cursor, signal_sync->context);
- if (0 != rc) {
- result.SetError("Failed to init local, rc=%d", rc);
- return result;
+ timer.start();
}
-
- return TraceCore(cursor);
-}
-
-TaskTracer::Result TaskTracer::TraceCore(unw_cursor_t& cursor) {
- Result result{};
- result.fast_unwind = FLAGS_enable_fast_unwind;
- for (result.frame_count = 0; result.frame_count < arraysize(result.ips);
++result.frame_count) {
- int rc = unw_step(&cursor);
- if (0 == rc) {
- break;
- } else if (rc < 0) {
- return Result::MakeErrorResult("Fail to unw_step, rc=%d", rc);
- }
-
- unw_word_t ip = 0;
- // Fast unwind do not care about the return value.
- rc = unw_get_reg(&cursor, UNW_REG_IP, &ip);
- result.ips[result.frame_count] = ip;
-
- if (result.fast_unwind) {
- continue;
- }
-
- if (0 != rc) {
- result.mangled[result.frame_count][0] = '\0';
- continue;
+ pollfd poll_fd = {signal_sync->pipe_fds[0], POLLIN, 0};
+ // Wait for tracing to complete.
+ while (true) {
+ int timeout = -1;
+ if (FLAGS_signal_trace_timeout_ms > 0) {
+ timer.stop();
+ if (timer.m_elapsed() >= FLAGS_signal_trace_timeout_ms) {
+ result.SetError("Timeout exceed %dms",
FLAGS_signal_trace_timeout_ms);
+ break;
+ }
+ timeout = (int64_t)FLAGS_signal_trace_timeout_ms -
timer.m_elapsed();
}
-
- // Slow path.
- rc = unw_get_proc_name(&cursor, result.mangled[result.frame_count],
- sizeof(result.mangled[result.frame_count]),
NULL);
- // UNW_ENOMEM is OK.
- if (0 != rc && UNW_ENOMEM != rc) {
- result.mangled[result.frame_count][0] = '\0';
+ // poll() is async-signal-safe.
+ // Self-pipe trick:
https://man7.org/tlpi/code/online/dist/altio/self_pipe.c.html
+ int rc = poll(&poll_fd, 1, timeout);
+ if (-1 == rc) {
+ if (EINTR == errno) {
+ continue;
+ }
+ result.SetError("Fail to poll: %s", berror());
}
+ break;
}
-
return result;
}
diff --git a/src/bthread/task_tracer.h b/src/bthread/task_tracer.h
index f154c3ee..12a5b59c 100644
--- a/src/bthread/task_tracer.h
+++ b/src/bthread/task_tracer.h
@@ -21,14 +21,12 @@
#ifdef BRPC_BTHREAD_TRACER
#include <signal.h>
-#include <semaphore.h>
#include <vector>
-#include <algorithm>
#include <libunwind.h>
-#include "butil/strings/safe_sprintf.h"
#include "butil/synchronization/condition_variable.h"
+#include "butil/intrusive_ptr.hpp"
+#include "butil/strings/safe_sprintf.h"
#include "butil/shared_object.h"
-#include "butil/fd_utility.h"
#include "bthread/task_meta.h"
#include "bthread/mutex.h"
@@ -63,47 +61,38 @@ private:
struct Result {
template<typename... Args>
- static Result MakeErrorResult(const char* fmt, Args... args) {
- Result result{};
+ static Result MakeErrorResult(const char* fmt, Args&&... args) {
+ Result result;
result.SetError(fmt, std::forward<Args>(args)...);
return result;
}
template<typename... Args>
- void SetError(const char* fmt, Args... args) {
- err_count = std::max(err_count + 1, MAX_ERROR_NUM);
- butil::strings::SafeSPrintf(err_msg[err_count - 1], fmt, args...);
+ void SetError(const char* fmt, Args&&... args) {
+ error = true;
+ butil::ignore_result(butil::strings::SafeSPrintf(
+ err_msg, fmt, std::forward<Args>(args)...));
}
- std::string OutputToString();
- void OutputToStream(std::ostream& os);
-
- bool OK() const { return err_count == 0; }
+ std::string OutputToString() const;
+ void OutputToStream(std::ostream& os) const;
static constexpr size_t MAX_TRACE_NUM = 64;
- static constexpr size_t MAX_ERROR_NUM = 2;
- unw_word_t ips[MAX_TRACE_NUM];
- char mangled[MAX_TRACE_NUM][256]{};
+ void* ips[MAX_TRACE_NUM];
size_t frame_count{0};
- char err_msg[MAX_ERROR_NUM][64]{};
- size_t err_count{0};
-
- bool fast_unwind{false};
+ char err_msg[64];
+ bool error{false};
};
// For signal trace.
struct SignalSync : public butil::SharedObject {
+ explicit SignalSync(Result* result) : result(result) {}
~SignalSync() override;
bool Init();
- unw_context_t* context{NULL};
- sem_t sem{};
- int pipe_fds[2]{};
-
- private:
- bool _pipe_init{false};
- bool _sem_init{false};
+ int pipe_fds[2]{-1, -1};
+ Result* result{NULL};
};
static TaskStatus WaitForJumping(TaskMeta* m);
@@ -111,19 +100,14 @@ private:
unw_cursor_t MakeCursor(bthread_fcontext_t fcontext);
Result ContextTrace(bthread_fcontext_t fcontext);
+ static Result TraceByLibunwind(unw_cursor_t& cursor);
static bool RegisterSignalHandler();
static void SignalHandler(int sig, siginfo_t* info, void* context);
- static bool WaitForSignalHandler(butil::intrusive_ptr<SignalSync>
signal_sync,
- const timespec* abs_timeout, Result&
result);
- static void WakeupSignalHandler(
- butil::intrusive_ptr<SignalSync> signal_sync, Result& result);
Result SignalTrace(pid_t worker_tid);
- static Result TraceCore(unw_cursor_t& cursor);
-
// Make sure only one bthread is traced at a time.
- bthread::Mutex _trace_request_mutex;
+ Mutex _trace_request_mutex;
// For signal trace.
// Make sure bthread does not jump stack when it is being traced.
@@ -133,6 +117,10 @@ private:
// For context trace.
unw_context_t _context{};
+ // Hold SignalSync and wait until no one is using it before releasing it.
+ // This will avoid deadlock because it will not be released in the signal
handler.
+ std::vector<butil::intrusive_ptr<SignalSync>> _inuse_signal_syncs;
+
bvar::LatencyRecorder _trace_time{"bthread_trace_time"};
};
diff --git a/test/bthread_unittest.cpp b/test/bthread_unittest.cpp
index eaa16fa5..dcb8d873 100644
--- a/test/bthread_unittest.cpp
+++ b/test/bthread_unittest.cpp
@@ -34,7 +34,6 @@ int main(int argc, char* argv[]) {
namespace bthread {
extern __thread bthread::LocalStorage tls_bls;
-DECLARE_bool(enable_fast_unwind);
#ifdef BRPC_BTHREAD_TRACER
extern std::string stack_trace(bthread_t tid);
#endif // BRPC_BTHREAD_TRACER
@@ -639,22 +638,18 @@ void spin_and_log_trace() {
while (!start) {
usleep(10 * 1000);
}
- bthread::FLAGS_enable_fast_unwind = false;
+
std::string st1 = bthread::stack_trace(th);
LOG(INFO) << "spin_and_log stack trace:\n" << st1;
+ ok = st1.find("spin_and_log") != std::string::npos;
- bthread::FLAGS_enable_fast_unwind = true;
- std::string st2 = bthread::stack_trace(th);
- LOG(INFO) << "fast_unwind spin_and_log stack trace:\n" << st2;
stop = true;
ASSERT_EQ(0, bthread_join(th, NULL));
- std::string st3 = bthread::stack_trace(th);
- LOG(INFO) << "ended bthread stack trace:\n" << st3;
- ASSERT_NE(std::string::npos, st3.find("not exist now"));
+ std::string st2 = bthread::stack_trace(th);
+ LOG(INFO) << "ended bthread stack trace:\n" << st2;
+ ASSERT_NE(std::string::npos, st2.find("not exist now"));
- ok = st1.find("spin_and_log") != std::string::npos &&
- st2.find("spin_and_log") != std::string::npos;
if (ok) {
break;
}
@@ -672,21 +667,18 @@ void repeated_sleep_trace() {
while (!start) {
usleep(10 * 1000);
}
- bthread::FLAGS_enable_fast_unwind = false;
+
std::string st1 = bthread::stack_trace(th);
LOG(INFO) << "repeated_sleep stack trace:\n" << st1;
+ ok = st1.find("repeated_sleep") != std::string::npos;
- bthread::FLAGS_enable_fast_unwind = true;
- std::string st2 = bthread::stack_trace(th);
- LOG(INFO) << "fast_unwind repeated_sleep stack trace:\n" << st2;
stop = true;
ASSERT_EQ(0, bthread_join(th, NULL));
- std::string st3 = bthread::stack_trace(th);
- LOG(INFO) << "ended bthread stack trace:\n" << st3;
- ASSERT_NE(std::string::npos, st3.find("not exist now"));
- ok = st1.find("repeated_sleep") != std::string::npos &&
- st2.find("repeated_sleep") != std::string::npos;
+ std::string st2 = bthread::stack_trace(th);
+ LOG(INFO) << "ended bthread stack trace:\n" << st2;
+ ASSERT_NE(std::string::npos, st2.find("not exist now"));
+
if (ok) {
break;
}
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]