Mostafa Mokhtar created IMPALA-6787:
---------------------------------------

             Summary: On large secure clusters the connection setup thread 
becomes a bottleneck at warmup and causes occasional timeout failures
                 Key: IMPALA-6787
                 URL: https://issues.apache.org/jira/browse/IMPALA-6787
             Project: IMPALA
          Issue Type: Bug
          Components: Distributed Exec
    Affects Versions: Impala 2.12.0
            Reporter: Mostafa Mokhtar


On clusters with 200+ nodes, a single connection setup thread is not 
sufficient and ends up being a bottleneck for a while, which appears to cause 
queries to fail with:
{code}
I0401 20:20:55.032140 1806361 thrift-util.cc:123] TSocket::open() connect() 
<Host: va1007.foo.com Port: 22000>Connection timed out
I0401 20:20:55.032346 1806361 thrift-client.cc:78] Couldn't open transport for 
va1007.foo.com:22000 (connect() failed: Connection timed out)
I0401 20:20:55.032364 1806361 thrift-client.cc:94] Unable to connect to 
va1007.foo.com:22000
{code}

{code}
// Only using one thread here is sufficient for performance, and it avoids 
potential
  // thread safety issues with the thrift code called in SetupConnection.
  constexpr int CONNECTION_SETUP_POOL_SIZE = 1;

  // New - this is the thread pool used to process the internal accept queue.
  ThreadPool<shared_ptr<TTransport>> connection_setup_pool("setup-server", 
"setup-worker",
      CONNECTION_SETUP_POOL_SIZE, FLAGS_accepted_cnxn_queue_depth,
      [this](int tid, const shared_ptr<TTransport>& item) {
        this->SetupConnection(item);
      });
{code}

{code}
#0  0x00007fd927de8e20 in krb5int_MD5Update () from /lib64/libk5crypto.so.3
#1  0x00007fd927de7bca in k5_md5_hash () from /lib64/libk5crypto.so.3
#2  0x00007fd927e01e32 in krb5int_hmac_keyblock () from /lib64/libk5crypto.so.3
#3  0x00007fd927dfc448 in usage_key.isra.2 () from /lib64/libk5crypto.so.3
#4  0x00007fd927dfc9fc in krb5int_arcfour_decrypt () from 
/lib64/libk5crypto.so.3
#5  0x00007fd927df97e4 in krb5_k_decrypt () from /lib64/libk5crypto.so.3
#6  0x00007fd927df98bd in krb5_c_decrypt () from /lib64/libk5crypto.so.3
#7  0x00007fd9297191fb in rd_req_decoded_opt () from /lib64/libkrb5.so.3
#8  0x00007fd92971a1da in krb5_rd_req_decoded () from /lib64/libkrb5.so.3
#9  0x00007fd9282371df in kg_accept_krb5 () from /lib64/libgssapi_krb5.so.2
#10 0x00007fd9282388ca in krb5_gss_accept_sec_context_ext () from 
/lib64/libgssapi_krb5.so.2
#11 0x00007fd928238a29 in krb5_gss_accept_sec_context () from 
/lib64/libgssapi_krb5.so.2
#12 0x00007fd92822607a in gss_accept_sec_context () from 
/lib64/libgssapi_krb5.so.2
#13 0x00007fd92653aedc in gssapi_server_mech_step () from 
/usr/lib64/sasl2/libgssapiv2.so
#14 0x00007fd92bc27b9b in sasl_server_step () from /lib64/libsasl2.so.3
#15 0x0000000000caf3b1 in 
sasl::TSaslServer::evaluateChallengeOrResponse(unsigned char const*, unsigned 
int, unsigned int*) ()
#16 0x0000000000cb3040 in 
apache::thrift::transport::TSaslTransport::doSaslNegotiation() ()
#17 0x0000000000cb1488 in 
apache::thrift::transport::TSaslServerTransport::Factory::getTransport(boost::shared_ptr<apache::thrift::transport::TTransport>)
 ()
#18 0x0000000000b143c7 in 
apache::thrift::server::TAcceptQueueServer::SetupConnection(boost::shared_ptr<apache::thrift::transport::TTransport>)
 ()
#19 0x0000000000b14eb2 in 
boost::detail::function::void_function_obj_invoker2<apache::thrift::server::TAcceptQueueServer::serve()::{lambda(int,
 boost::shared_ptr<apache::thrift::transport::TTransport> const&)#1}, void, 
int, boost::shared_ptr<apache::thrift::transport::TTransport> 
const&>::invoke(boost::detail::function::function_buffer&, int, 
boost::shared_ptr<apache::thrift::transport::TTransport> const&) ()
#20 0x0000000000b17d79 in 
impala::ThreadPool<boost::shared_ptr<apache::thrift::transport::TTransport> 
>::WorkerThread(int) ()
#21 0x0000000000d6049f in impala::Thread::SuperviseThread(std::string const&, 
std::string const&, boost::function<void ()>, impala::ThreadDebugInfo const*, 
impala::Promise<long>*) ()
#22 0x0000000000d60c9a in boost::detail::thread_data<boost::_bi::bind_t<void, 
void (*)(std::string const&, std::string const&, boost::function<void ()>, 
impala::ThreadDebugInfo const*, impala::Promise<long>*), 
boost::_bi::list5<boost::_bi::value<std::string>, 
boost::_bi::value<std::string>, boost::_bi::value<boost::function<void ()> >, 
boost::_bi::value<impala::ThreadDebugInfo*>, 
boost::_bi::value<impala::Promise<long>*> > > >::run() ()
#23 0x00000000012d794a in thread_proxy ()
#24 0x00007fd928c7ddc5 in start_thread () from /lib64/libpthread.so.0
#25 0x00007fd9289aaced in clone () from /lib64/libc.so.6
{code}



--
This message was sent by Atlassian JIRA
(v7.6.3#76005)

Reply via email to