James Peach created MESOS-7122:
----------------------------------

             Summary: Process reaper should have a dedicated thread to avoid 
deadlock.
                 Key: MESOS-7122
                 URL: https://issues.apache.org/jira/browse/MESOS-7122
             Project: Mesos
          Issue Type: Bug
          Components: libprocess
            Reporter: James Peach


In a test environment, we saw that libprocess can deadlock when the process 
reaper is unable to run. 

This happens in the Mesos HDFS client, which synchronously runs a {{hadoop}} 
subprocess. If this happens too many times, the {{ReaperProcess}} is never 
scheduled to reap the subprocess statuses. Since the HDFS {{Future}} never 
completes, we deadlock with all the threads in the call stack below. If there 
were a dedicated thread for the {{ReaperProcess}} to run on, or some other way 
to ensure that it is scheduled, we could avoid the deadlock.

{noformat}
#0  0x00007f67b6ffc68c in pthread_cond_wait@@GLIBC_2.3.2 () from 
/lib64/libpthread.so.0
#1  0x00007f67b6da12fc in 
std::condition_variable::wait(std::unique_lock<std::mutex>&) () from 
/usr/lib64/libstdc++.so.6
#2  0x00007f67b8b864f6 in process::ProcessManager::wait(process::UPID const&) 
() from /usr/lib64/libmesos-1.2.0.so
#3  0x00007f67b8b8d347 in process::wait(process::UPID const&, Duration const&) 
() from /usr/lib64/libmesos-1.2.0.so
#4  0x00007f67b8b51a85 in process::Latch::await(Duration const&) () from 
/usr/lib64/libmesos-1.2.0.so
#5  0x00007f67b834fc9f in process::Future<Bytes>::await(Duration const&) const 
() from /usr/lib64/libmesos-1.2.0.so
#6  0x00007f67b833d700 in 
mesos::internal::slave::fetchSize(std::basic_string<char, 
std::char_traits<char>, std::allocator<char> > const&, 
Option<std::basic_string<char, std::char_traits<char>, std::allocator<char> > > 
const&) () from /usr/lib64/libmesos-1.2.0.so
#7  0x00007f67b833df5e in 
std::result_of<mesos::internal::slave::FetcherProcess::fetch(mesos::ContainerID 
const&, mesos::CommandInfo const&, std::basic_string<char, 
std::char_traits<char>, std::allocator<char> > const&, 
Option<std::basic_string<char, std::char_traits<char>, std::allocator<char> > > 
const&, mesos::SlaveID const&, mesos::internal::slave::Flags 
const&)::{lambda()#2} ()()>::type 
process::AsyncExecutorProcess::execute<mesos::internal::slave::FetcherProcess::fetch(mesos::ContainerID
 const&, mesos::CommandInfo const&, std::basic_string<char, 
std::char_traits<char>, std::allocator<char> > const&, 
Option<std::basic_string<char, std::char_traits<char>, std::allocator<char> > > 
const&, mesos::SlaveID const&, mesos::internal::slave::Flags 
const&)::{lambda()#2}>(std::result_of const&, boost::disable_if<std::result_of 
const&::is_void<std::result_of<mesos::internal::slave::FetcherProcess::fetch(mesos::ContainerID
 const&, mesos::CommandInfo const&, std::basic_string<char, 
std::char_traits<char>, std::allocator<char> > const&, 
Option<std::basic_string<char, std::char_traits<char>, std::allocator<char> > > 
const&, mesos::SlaveID const&, mesos::internal::slave::Flags 
const&)::{lambda()#2} ()()> >, void>::type*) () from 
/usr/lib64/libmesos-1.2.0.so
#8  0x00007f67b833a3d5 in std::_Function_handler<void 
()(process::ProcessBase*), process::Future<Try<Bytes, Error> > 
process::dispatch<Try<Bytes, Error>, process::AsyncExecutorProcess, 
mesos::internal::slave::FetcherProcess::fetch(mesos::ContainerID const&, 
mesos::CommandInfo const&, std::basic_string<char, std::char_traits<char>, 
std::allocator<char> > const&, Option<std::basic_string<char, 
std::char_traits<char>, std::allocator<char> > > const&, mesos::SlaveID const&, 
mesos::internal::slave::Flags const&)::{lambda()#2} const&, void*, 
{lambda()#2}, mesos::internal::slave::FetcherProcess::fetch(mesos::ContainerID 
const&, mesos::CommandInfo const&, std::basic_string<char, 
std::char_traits<char>, std::allocator<char> > const&, 
Option<std::basic_string<char, std::char_traits<char>, std::allocator<char> > > 
const&, mesos::SlaveID const&, mesos::internal::slave::Flags 
const&)::{lambda()#2} const&>(process::PID<process::AsyncExecutorProcess> 
const&, process::Future 
(process::PID::*)(mesos::internal::slave::FetcherProcess::fetch(mesos::ContainerID
 const&, mesos::CommandInfo const&, std::basic_string<char, 
std::char_traits<char>, std::allocator<char> > const&, 
Option<std::basic_string<char, std::char_traits<char>, std::allocator<char> > > 
const&, mesos::SlaveID const&, mesos::internal::slave::Flags 
const&)::{lambda()#2} const&, void*), {lambda()#2}, 
mesos::internal::slave::FetcherProcess::fetch(mesos::ContainerID const&, 
mesos::CommandInfo const&, std::basic_string<char, std::char_traits<char>, 
std::allocator<char> > const&, Option<std::basic_string<char, 
std::char_traits<char>, std::allocator<char> > > const&, mesos::SlaveID const&, 
mesos::internal::slave::Flags const&)::{lambda()#2} 
const&)::{lambda(process::ProcessBase*)#1}>::_M_invoke(std::_Any_data const&, 
process::ProcessBase*) () from /usr/lib64/libmesos-1.2.0.so
#9  0x00007f67b8b85ede in 
process::ProcessManager::resume(process::ProcessBase*) () from 
/usr/lib64/libmesos-1.2.0.so
#10 0x00007f67b8b8fc8f in 
std::thread::_Impl<std::_Bind_simple<process::ProcessManager::init_threads()::{unnamed
 type#1} ()()> >::_M_run() () from /usr/lib64/libmesos-1.2.0.so
#11 0x00007f67b6da1470 in ?? () from /usr/lib64/libstdc++.so.6
#12 0x00007f67b6ff8aa1 in start_thread () from /lib64/libpthread.so.0
#13 0x00007f67b6a3faad in clone () from /lib64/libc.so.6
{noformat}



--
This message was sent by Atlassian JIRA
(v6.3.15#6346)

Reply via email to