[ https://issues.apache.org/jira/browse/IMPALA-8322?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=16808271#comment-16808271 ]

Joe McDonnell commented on IMPALA-8322:
---------------------------------------

I created a diagnostic change so that if the coordinator sees the "timed out 
waiting for receiver" error, it sends an extra field on the cancel RPC for 
that query, which causes the executor to LOG(FATAL) and produce a minidump. 
The problem reproduced, and we now have a minidump/core for the executor 
impalad that wasn't starting the receiver.
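
For context, the diagnostic hook is roughly of this shape (a standalone 
sketch, not the actual patch; the field name and wiring here are 
hypothetical):
{noformat}
// Standalone sketch of the diagnostic change, not the committed code.
// The field name and plumbing are made up for illustration.
#include <cstdlib>
#include <iostream>

struct CancelRequest {
  bool dump_stuck_executor = false;  // hypothetical diagnostic-only field
};

// Stand-in for the executor-side ControlService::CancelQueryFInstances().
void HandleCancel(const CancelRequest& req) {
  if (req.dump_stuck_executor) {
    // In impalad this would be LOG(FATAL), which aborts the process and
    // triggers the minidump/core machinery.
    std::cerr << "coordinator requested a minidump of the stuck executor\n";
    std::abort();
  }
  // ... normal cancellation path: QueryState::Cancel(), etc. ...
}

int main() {
  HandleCancel(CancelRequest{});  // normal cancel, no crash
  return 0;
}
{noformat}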

Some threads doing Cancel() are stuck in QueryState::WaitForPrepare():
{noformat}
0 libpthread-2.17.so + 0xb945
1 impalad!impala::Promise<bool, (impala::PromiseMode)0>::Get() [promise.h : 87 + 0x13]
2 impalad!impala::TypedCountingBarrier<bool>::Wait() [counting-barrier.h : 59 + 0xc]
3 impalad!impala::CountingBarrier::Wait() [counting-barrier.h : 95 + 0xc]
4 impalad!impala::QueryState::WaitForPrepare() [query-state.cc : 478 + 0x1a]
5 impalad!impala::QueryState::Cancel() [query-state.cc : 659 + 0x16]
6 impalad!impala::ControlService::CancelQueryFInstances(impala::CancelQueryFInstancesRequestPB const*, impala::CancelQueryFInstancesResponsePB*, kudu::rpc::RpcContext*) [control-service.cc : 183 + 0x17]
7 impalad!impala::ControlServiceIf::ControlServiceIf(scoped_refptr<kudu::MetricEntity> const&, scoped_refptr<kudu::rpc::ResultTracker> const&)::{lambda(google::protobuf::Message const*, google::protobuf::Message*, kudu::rpc::RpcContext*)#4}::operator()(google::protobuf::Message const*, google::protobuf::Message*, kudu::rpc::RpcContext*) const + 0x3e
8 impalad!std::_Function_handler<void(const google::protobuf::Message*, google::protobuf::Message*, kudu::rpc::RpcContext*), impala::ControlServiceIf::ControlServiceIf(const scoped_refptr<kudu::MetricEntity>&, const scoped_refptr<kudu::rpc::ResultTracker>&)::<lambda(const google::protobuf::Message*, google::protobuf::Message*, kudu::rpc::RpcContext*)> >::_M_invoke [functional : 2039 + 0x20]
...{noformat}
Those threads are stuck there because a number of other threads (25 in this 
dump) are stuck in ThreadResourceMgr::CreatePool():
{noformat}
0 libpthread-2.17.so + 0xe42d
1 impalad!boost::unique_lock<boost::mutex>::lock() [lock_types.hpp : 346 + 0xf]
2 impalad!boost::unique_lock<boost::mutex>::unique_lock(boost::mutex&) [lock_types.hpp : 124 + 0xc]
3 impalad!impala::ThreadResourceMgr::CreatePool() [thread-resource-mgr.cc : 52 + 0x17]
4 impalad!impala::RuntimeState::Init() [runtime-state.cc : 117 + 0x22]
5 impalad!impala::RuntimeState::RuntimeState(impala::QueryState*, impala::TPlanFragmentCtx const&, impala::TPlanFragmentInstanceCtx const&, impala::ExecEnv*) [runtime-state.cc : 79 + 0xc]
6 impalad!impala::FragmentInstanceState::Prepare() [fragment-instance-state.cc : 137 + 0x4b]
7 impalad!impala::FragmentInstanceState::Exec() [fragment-instance-state.cc : 73 + 0x19]
8 impalad!impala::QueryState::ExecFInstance(impala::FragmentInstanceState*) [query-state.cc : 639 + 0x19]
9 impalad!impala::QueryState::StartFInstances()::{lambda()#1}::operator()() const + 0x26
...{noformat}
Those threads are waiting on a lock held by a thread in 
ThreadResourceMgr::DestroyPool():
{noformat}
0 libpthread-2.17.so + 0xe42d
1 impalad!boost::unique_lock<boost::mutex>::lock() [lock_types.hpp : 346 + 0xf]
2 impalad!boost::unique_lock<boost::mutex>::unique_lock(boost::mutex&) [lock_types.hpp : 124 + 0xc]
3 impalad!impala::HdfsScanNode::ThreadTokenAvailableCb(impala::ThreadResourcePool*) [hdfs-scan-node.cc : 296 + 0x20]
4 impalad!boost::_mfi::mf1<void, impala::HdfsScanNode, impala::ThreadResourcePool*>::operator()(impala::HdfsScanNode*, impala::ThreadResourcePool*) const [mem_fn_template.hpp : 165 + 0x1e]
5 impalad!void boost::_bi::list2<boost::_bi::value<impala::HdfsScanNode*>, boost::arg<1> >::operator()<boost::_mfi::mf1<void, impala::HdfsScanNode, impala::ThreadResourcePool*>, boost::_bi::list1<impala::ThreadResourcePool*&> >(boost::_bi::type<void>, boost::_mfi::mf1<void, impala::HdfsScanNode, impala::ThreadResourcePool*>&, boost::_bi::list1<impala::ThreadResourcePool*&>&, int) [bind.hpp : 313 + 0x4e]
6 impalad!void boost::_bi::bind_t<void, boost::_mfi::mf1<void, impala::HdfsScanNode, impala::ThreadResourcePool*>, boost::_bi::list2<boost::_bi::value<impala::HdfsScanNode*>, boost::arg<1> > >::operator()<impala::ThreadResourcePool*>(impala::ThreadResourcePool*&) [bind_template.hpp : 32 + 0x22]
7 impalad!boost::detail::function::void_function_obj_invoker1<boost::_bi::bind_t<void, boost::_mfi::mf1<void, impala::HdfsScanNode, impala::ThreadResourcePool*>, boost::_bi::list2<boost::_bi::value<impala::HdfsScanNode*>, boost::arg<1> > >, void, impala::ThreadResourcePool*>::invoke(boost::detail::function::function_buffer&, impala::ThreadResourcePool*) [function_template.hpp : 153 + 0x13]
8 impalad!boost::function1<void, impala::ThreadResourcePool*>::operator()(impala::ThreadResourcePool*) const [function_template.hpp : 767 + 0x18]
9 impalad!impala::ThreadResourcePool::InvokeCallbacks() [thread-resource-mgr.cc : 105 + 0x19]
10 impalad!impala::ThreadResourceMgr::UpdatePoolQuotas(impala::ThreadResourcePool*) [thread-resource-mgr.cc : 120 + 0xc]
11 impalad!impala::ThreadResourceMgr::DestroyPool(std::unique_ptr<impala::ThreadResourcePool, std::default_delete<impala::ThreadResourcePool> >) [thread-resource-mgr.cc : 71 + 0x14]
12 impalad!impala::RuntimeState::ReleaseResources() [runtime-state.cc : 281 + 0x46]
13 impalad!impala::FragmentInstanceState::Close() [fragment-instance-state.cc : 406 + 0x13]
14 impalad!impala::FragmentInstanceState::Exec() [fragment-instance-state.cc : 99 + 0xf]
15 impalad!impala::QueryState::ExecFInstance(impala::FragmentInstanceState*) [query-state.cc : 639 + 0x19]
16 impalad!impala::QueryState::StartFInstances()::{lambda()#1}::operator()() const + 0x26
...{noformat}
I'm still exploring below that level, but there is a tangle of locks and 
waiting threads.
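
To make the shape of the problem easier to see, here is a minimal standalone 
sketch (purely illustrative, not Impala code) of what the three stacks above 
suggest:
{noformat}
// Illustrative sketch only; names and signatures do not match Impala's.
// DestroyPool() runs per-node callbacks while still holding the manager-wide
// mutex. If a callback blocks on another lock (as ThreadTokenAvailableCb
// appears to in the stack above), every thread that needs the manager-wide
// mutex (e.g. CreatePool() from Prepare()) queues up behind it, and Cancel()
// in turn waits on the prepare barrier.
#include <functional>
#include <mutex>
#include <vector>

class ThreadResourceMgrSketch {
 public:
  using Callback = std::function<void()>;

  // Called from RuntimeState::Init() during fragment Prepare().
  void CreatePool() {
    std::unique_lock<std::mutex> l(mgr_lock_);
    // ... register a new pool ...
  }

  // Called from RuntimeState::ReleaseResources() during fragment Close().
  // The dangerous part: callbacks are invoked with mgr_lock_ still held.
  void DestroyPool(const std::vector<Callback>& callbacks) {
    std::unique_lock<std::mutex> l(mgr_lock_);
    for (const auto& cb : callbacks) {
      cb();  // if this blocks (e.g. on a scan-node lock), mgr_lock_ stays held
    }
    // ... unregister the pool ...
  }

 private:
  std::mutex mgr_lock_;  // serializes all pool creation/destruction
};
{noformat}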

> S3 tests encounter "timed out waiting for receiver fragment instance"
> ---------------------------------------------------------------------
>
>                 Key: IMPALA-8322
>                 URL: https://issues.apache.org/jira/browse/IMPALA-8322
>             Project: IMPALA
>          Issue Type: Bug
>          Components: Backend
>    Affects Versions: Impala 3.3.0
>            Reporter: Joe McDonnell
>            Priority: Blocker
>              Labels: broken-build
>         Attachments: run_tests_swimlane.json.gz
>
>
> This has been seen multiple times when running s3 tests:
> {noformat}
> query_test/test_join_queries.py:57: in test_basic_joins
>     self.run_test_case('QueryTest/joins', new_vector)
> common/impala_test_suite.py:472: in run_test_case
>     result = self.__execute_query(target_impalad_client, query, user=user)
> common/impala_test_suite.py:699: in __execute_query
>     return impalad_client.execute(query, user=user)
> common/impala_connection.py:174: in execute
>     return self.__beeswax_client.execute(sql_stmt, user=user)
> beeswax/impala_beeswax.py:183: in execute
>     handle = self.__execute_query(query_string.strip(), user=user)
> beeswax/impala_beeswax.py:360: in __execute_query
>     self.wait_for_finished(handle)
> beeswax/impala_beeswax.py:381: in wait_for_finished
>     raise ImpalaBeeswaxException("Query aborted:" + error_log, None)
> E   ImpalaBeeswaxException: ImpalaBeeswaxException:
> E    Query aborted:Sender 127.0.0.1 timed out waiting for receiver fragment 
> instance: 6c40d992bb87af2f:0ce96e5d00000007, dest node: 4{noformat}
> This is related to IMPALA-6818. On a bad run, there are various timeouts in 
> the impalad logs:
> {noformat}
> I0316 10:47:16.359313 20175 krpc-data-stream-mgr.cc:354] Sender 127.0.0.1 
> timed out waiting for receiver fragment instance: 
> ef4a5dc32a6565bd:a8720b8500000007, dest node: 5
> I0316 10:47:16.359345 20175 rpcz_store.cc:265] Call 
> impala.DataStreamService.TransmitData from 127.0.0.1:40030 (request call id 
> 14881) took 120182ms. Request Metrics: {}
> I0316 10:47:16.359380 20175 krpc-data-stream-mgr.cc:354] Sender 127.0.0.1 
> timed out waiting for receiver fragment instance: 
> d148d83e11a4603d:54dc35f700000004, dest node: 3
> I0316 10:47:16.359395 20175 rpcz_store.cc:265] Call 
> impala.DataStreamService.TransmitData from 127.0.0.1:40030 (request call id 
> 14880) took 123097ms. Request Metrics: {}
> ... various messages ...
> I0316 10:47:56.364990 20154 kudu-util.h:108] Cancel() RPC failed: Timed out: 
> CancelQueryFInstances RPC to 127.0.0.1:27000 timed out after 10.000s (SENT)
> ... various messages ...
> W0316 10:48:15.056421 20150 rpcz_store.cc:251] Call 
> impala.ControlService.CancelQueryFInstances from 127.0.0.1:40912 (request 
> call id 202) took 48695ms (client timeout 10000).
> W0316 10:48:15.056473 20150 rpcz_store.cc:255] Trace:
> 0316 10:47:26.361265 (+ 0us) impala-service-pool.cc:165] Inserting onto call 
> queue
> 0316 10:47:26.361285 (+ 20us) impala-service-pool.cc:245] Handling call
> 0316 10:48:15.056398 (+48695113us) inbound_call.cc:162] Queueing success 
> response
> Metrics: {}
> I0316 10:48:15.057087 20139 connection.cc:584] Got response to call id 202 
> after client already timed out or cancelled{noformat}
> So far, this has only happened on S3. The system load at the time is not 
> higher than normal; if anything, it is lower than normal.


