thread: show thread limit info when thread creation fails

It seems useful to return the number of outstanding threads and the ulimit nproc when thread creation fails, to help an administrator diagnose the cause of the fork()/clone() failure. This patch adds that info to the error Status returned from Thread::Create().
That necessitated making the ReadThreadsRunning() method of ThreadMgr public. Tested manually on Linux. Example: $ ps -efwwwL | grep mpercy | wc -l 2357 $ ulimit -u 2370 $ ./bin/kudu-tserver --fs-wal-dir $(pwd)/wal --logtostderr ... F1018 14:16:00.312577 21557 service_pool.cc:93] Check failed: _s.ok() Bad status: Runtime error: Could not create thread (63 Kudu-managed threads running in this process, 2370 max processes allowed for current user): Resource temporarily unavailable (error 11) *** Check failure stack trace: *** *** Aborted at 1539897360 (unix time) try "date -d @1539897360" if you are using GNU date *** PC: @ 0x7f503d9d9e97 gsignal *** SIGABRT (@0x3e800005435) received by PID 21557 (TID 0x7f503cf34900) from PID 21557; stack trace: *** @ 0x7f5042455890 (unknown) @ 0x7f503d9d9e97 gsignal @ 0x7f503d9db801 abort @ 0x7f504024c309 kudu::AbortFailureFunction() @ 0x7f503f645d0d google::LogMessage::Fail() @ 0x7f503f647ce4 google::LogMessage::SendToLog() @ 0x7f503f64582d google::LogMessage::Flush() @ 0x7f503f6486b9 google::LogMessageFatal::~LogMessageFatal() @ 0x7f504172688e kudu::rpc::ServicePool::Init() @ 0x7f50456b3eed kudu::RpcServer::RegisterService() @ 0x7f50456bfc59 kudu::server::ServerBase::RegisterService() @ 0x7f50459004ac kudu::tserver::TabletServer::Start() @ 0x40683e kudu::tserver::TabletServerMain() @ 0x4060a2 main @ 0x7f503d9bcb97 __libc_start_main @ 0x405fba _start Aborted (core dumped) Change-Id: I8e0bd0d0776142e8feff18bffe15e61ca1ba5816 Reviewed-on: http://gerrit.cloudera.org:8080/11726 Reviewed-by: Andrew Wong <aw...@cloudera.com> Reviewed-by: Adar Dembo <a...@cloudera.com> Tested-by: Mike Percy <mpe...@apache.org> Project: http://git-wip-us.apache.org/repos/asf/kudu/repo Commit: http://git-wip-us.apache.org/repos/asf/kudu/commit/a33e5733 Tree: http://git-wip-us.apache.org/repos/asf/kudu/tree/a33e5733 Diff: http://git-wip-us.apache.org/repos/asf/kudu/diff/a33e5733 Branch: refs/heads/master Commit: a33e5733c16bf7da7eae2b813086faf7d01b0e2f 
Parents: 8a0d160 Author: Mike Percy <mpe...@apache.org> Authored: Thu Oct 18 13:19:12 2018 -0700 Committer: Mike Percy <mpe...@apache.org> Committed: Fri Oct 19 23:16:35 2018 +0000 ---------------------------------------------------------------------- src/kudu/util/thread.cc | 26 +++++++++++++++++++------- 1 file changed, 19 insertions(+), 7 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/kudu/blob/a33e5733/src/kudu/util/thread.cc ---------------------------------------------------------------------- diff --git a/src/kudu/util/thread.cc b/src/kudu/util/thread.cc index 4abc7c1..bb53f5a 100644 --- a/src/kudu/util/thread.cc +++ b/src/kudu/util/thread.cc @@ -48,6 +48,7 @@ #include "kudu/gutil/once.h" #include "kudu/gutil/port.h" #include "kudu/gutil/strings/substitute.h" +#include "kudu/util/env.h" #include "kudu/util/flag_tags.h" #include "kudu/util/kernel_stack_watchdog.h" #include "kudu/util/logging.h" @@ -139,7 +140,7 @@ static uint64_t GetInVoluntaryContextSwitches() { class ThreadMgr; -__thread Thread* Thread::tls_ = NULL; +__thread Thread* Thread::tls_ = nullptr; // Singleton instance of ThreadMgr. Only visible in this file, used only by Thread. // The Thread class adds a reference to thread_manager while it is supervising a thread so @@ -178,6 +179,9 @@ class ThreadMgr { // already been removed, this is a no-op. void RemoveThread(const pthread_t& pthread_id, const string& category); + // Metric callback for number of threads running. Also used for error messages. + uint64_t ReadThreadsRunning(); + private: // Container class for any details we want to capture about a thread // TODO: Add start-time. @@ -219,9 +223,8 @@ class ThreadMgr { uint64_t threads_started_metric_; uint64_t threads_running_metric_; - // Metric callbacks. + // Metric callback for number of threads started. 
uint64_t ReadThreadsStarted(); - uint64_t ReadThreadsRunning(); // Webpage callback; prints all threads by category. void ThreadPathHandler(const WebCallbackRegistry::WebRequest& req, @@ -475,7 +478,7 @@ Status ThreadJoiner::Join() { // Unconditionally join before returning, to guarantee that any TLS // has been destroyed (pthread_key_create() destructors only run // after a pthread's user method has returned). - int ret = pthread_join(thread_->thread_, NULL); + int ret = pthread_join(thread_->thread_, nullptr); CHECK_EQ(ret, 0); thread_->joinable_ = false; return Status::OK(); @@ -553,9 +556,18 @@ Status Thread::StartThread(const std::string& category, const std::string& name, { SCOPED_LOG_SLOW_EXECUTION_PREFIX(WARNING, 500 /* ms */, log_prefix, "creating pthread"); SCOPED_WATCH_STACK((flags & NO_STACK_WATCHDOG) ? 0 : 250); - int ret = pthread_create(&t->thread_, NULL, &Thread::SuperviseThread, t.get()); + int ret = pthread_create(&t->thread_, nullptr, &Thread::SuperviseThread, t.get()); if (ret) { - return Status::RuntimeError("Could not create thread", strerror(ret), ret); + string msg = ""; + if (ret == EAGAIN) { + uint64_t rlimit_nproc = Env::Default()->GetResourceLimit( + Env::ResourceLimitType::RUNNING_THREADS_PER_EUID); + uint64_t num_threads = thread_manager->ReadThreadsRunning(); + msg = Substitute(" ($0 Kudu-managed threads running in this process, " + "$1 max processes allowed for current user)", + num_threads, rlimit_nproc); + } + return Status::RuntimeError(Substitute("Could not create thread$0", msg), strerror(ret), ret); } } @@ -603,7 +615,7 @@ void* Thread::SuperviseThread(void* arg) { t->functor_(); pthread_cleanup_pop(true); - return NULL; + return nullptr; } void Thread::FinishThread(void* arg) {