Mostafa Mokhtar created IMPALA-6359:
---------------------------------------

             Summary: Blocking in Service threads due to tcmalloc contention 
due to memory allocations for RowBatch::Deserialize
                 Key: IMPALA-6359
                 URL: https://issues.apache.org/jira/browse/IMPALA-6359
             Project: IMPALA
          Issue Type: Sub-task
            Reporter: Mostafa Mokhtar
            Priority: Critical


This is a regression compared to thrift with KRPC is enabled.
Results are based on running 48 concurrent TPCDS queries on a 40 node cluster, 
regression is in the order of 2-4x (Will update the JIRA with a more accurate 
number soon). 

Service threads stall on memory allocation in the stack below, looking at the 
Service threads memory allocation/deallocation I believe they won't benefit 
much from the per thread cache as there is more allocations than deallocations 
on this threads as a result they go to the CentralFreeList for most 
allocations. 

{code}
#0  0x00000000020931e7 in base::internal::SpinLockDelay(int volatile*, int, 
int) ()
#1  0x00000000020dde29 in SpinLock::SlowLock() ()
#2  0x0000000002121668 in tcmalloc::CentralFreeList::Populate() ()
#3  0x0000000002121768 in tcmalloc::CentralFreeList::FetchFromOneSpansSafe(int, 
void**, void**) ()
#4  0x00000000021217f4 in tcmalloc::CentralFreeList::RemoveRange(void**, 
void**, int) ()
#5  0x000000000212ef53 in tcmalloc::ThreadCache::FetchFromCentralCache(unsigned 
long, unsigned long) ()
#6  0x00000000021d37fb in tc_malloc ()
#7  0x0000000000b27c3a in impala::MemPool::FindChunk(long, bool) ()
#8  0x0000000000b354e6 in impala::MemPool::Allocate(long) ()
#9  0x0000000000b34409 in impala::RowBatch::Deserialize(kudu::Slice const&, 
kudu::Slice const&, long, bool) ()
#10 0x0000000000b347fb in impala::RowBatch::RowBatch(impala::RowDescriptor 
const*, impala::RowBatchHeaderPB const&, kudu::Slice const&, kudu::Slice 
const&, impala::MemTracker*) ()
#11 0x0000000000b15d68 in 
impala::KrpcDataStreamRecvr::SenderQueue::AddBatchWork(long, 
impala::RowBatchHeaderPB const&, kudu::Slice const&, kudu::Slice const&, 
boost::unique_lock<impala::SpinLock>*) ()
#12 0x0000000000b18715 in 
impala::KrpcDataStreamRecvr::SenderQueue::AddBatch(impala::TransmitDataRequestPB
 const*, kudu::rpc::RpcContext*) ()
#13 0x0000000000b0e750 in 
impala::KrpcDataStreamMgr::AddData(impala::TransmitDataRequestPB const*, 
kudu::rpc::RpcContext*) ()
#14 0x0000000001183305 in 
kudu::rpc::GeneratedServiceIf::Handle(kudu::rpc::InboundCall*) ()
#15 0x00000000011bad7d in impala::ImpalaServicePool::RunThread() ()
#16 0x0000000000d18a73 in 
impala::Thread::SuperviseThread(std::basic_string<char, std::char_traits<char>, 
std::allocator<char> > const&, std::basic_string<char, std::char_traits<char>, 
std::allocator<char> > const&, boost::function<void ()()>, 
impala::Promise<long>*) ()
{code}

I added logging to track allocations that take a long time and found that in 
the extreme case some allocations take up to 10 seconds when allocating 16KB of 
memory. 

{code}
 MonotonicStopWatch timer;
    timer.Start();
    SCOPED_TIMER(recvr_->deserialize_row_batch_timer_);
    // At this point, the row batch will be inserted into batch_queue_. Close() 
will
    // handle deleting any unconsumed batches from batch_queue_. Close() cannot 
proceed
    // until there are no pending insertion to batch_queue_.
    batch.reset(new RowBatch(recvr_->row_desc(), header, tuple_offsets, 
tuple_data,
        recvr_->mem_tracker()));
    timer.Stop();
    if (timer.ElapsedTime() > 1000000) {
      LOG(INFO) << "TTT RowBatch allocation size:" << 
header.uncompressed_size() << " allocation time " << timer.ElapsedTime();
    }
{code}

Call stacks from other threads that release RowBatch memory which also are 
affected by the tcmalloc contention 

{code}
#0  0x000000000212c1f4 in tcmalloc::PageHeap::MayMergeSpans(tcmalloc::Span*, 
tcmalloc::Span*) ()
#1  0x000000000212c54d in 
tcmalloc::PageHeap::MergeIntoFreeList(tcmalloc::Span*) ()
#2  0x000000000212c874 in tcmalloc::PageHeap::Delete(tcmalloc::Span*) ()
#3  0x0000000002121090 in tcmalloc::CentralFreeList::ReleaseToSpans(void*) ()
#4  0x000000000212111b in tcmalloc::CentralFreeList::ReleaseListToSpans(void*) 
()
#5  0x000000000212138c in tcmalloc::CentralFreeList::InsertRange(void*, void*, 
int) ()
#6  0x000000000212f0f4 in 
tcmalloc::ThreadCache::ReleaseToCentralCache(tcmalloc::ThreadCache::FreeList*, 
unsigned long, int) ()
#7  0x000000000212f1bc in 
tcmalloc::ThreadCache::ListTooLong(tcmalloc::ThreadCache::FreeList*, unsigned 
long) ()
#8  0x00000000021d3d10 in tc_free ()
#9  0x0000000000b26af8 in impala::MemPool::FreeAll() ()
#10 0x0000000000b339b1 in impala::RowBatch::Reset() ()
#11 0x0000000000fea7b1 in 
impala::PartitionedAggregationNode::Open(impala::RuntimeState*) ()
#12 0x0000000000b77dfd in impala::FragmentInstanceState::Open() ()
#13 0x0000000000b7943b in impala::FragmentInstanceState::Exec() ()
#14 0x0000000000b68a0a in 
impala::QueryState::ExecFInstance(impala::FragmentInstanceState*) ()
{code}

{code}
Thread 682 (Thread 0x7f61ebec4700 (LWP 28762)):
#0  0x00000000020931e7 in base::internal::SpinLockDelay(int volatile*, int, 
int) ()
#1  0x00000000020dde29 in SpinLock::SlowLock() ()
#2  0x000000000212140f in tcmalloc::CentralFreeList::InsertRange(void*, void*, 
int) ()
#3  0x000000000212f0f4 in 
tcmalloc::ThreadCache::ReleaseToCentralCache(tcmalloc::ThreadCache::FreeList*, 
unsigned long, int) ()
#4  0x000000000212f1bc in 
tcmalloc::ThreadCache::ListTooLong(tcmalloc::ThreadCache::FreeList*, unsigned 
long) ()
#5  0x00000000021d3d10 in tc_free ()
#6  0x0000000000b338b1 in impala::RowBatch::~RowBatch() ()
#7  0x0000000000d8d1e3 in 
impala::HdfsScanNode::GetNextInternal(impala::RuntimeState*, impala::RowBatch*, 
bool*) ()
#8  0x0000000000d8ebf2 in impala::HdfsScanNode::GetNext(impala::RuntimeState*, 
impala::RowBatch*, bool*) ()
#9  0x0000000000fee76e in 
impala::PartitionedHashJoinNode::NextProbeRowBatch(impala::RuntimeState*, 
impala::RowBatch*) ()
#10 0x0000000000ff43b1 in 
impala::PartitionedHashJoinNode::GetNext(impala::RuntimeState*, 
impala::RowBatch*, bool*) ()
#11 0x0000000000fee76e in 
impala::PartitionedHashJoinNode::NextProbeRowBatch(impala::RuntimeState*, 
impala::RowBatch*) ()
#12 0x0000000000ff43b1 in 
impala::PartitionedHashJoinNode::GetNext(impala::RuntimeState*, 
impala::RowBatch*, bool*) ()
#13 0x0000000000fee76e in 
impala::PartitionedHashJoinNode::NextProbeRowBatch(impala::RuntimeState*, 
impala::RowBatch*) ()
#14 0x0000000000ff43b1 in 
impala::PartitionedHashJoinNode::GetNext(impala::RuntimeState*, 
impala::RowBatch*, bool*) ()
#15 0x0000000000fea54d in 
impala::PartitionedAggregationNode::Open(impala::RuntimeState*) ()
#16 0x0000000000b77dfd in impala::FragmentInstanceState::Open() ()
#17 0x0000000000b7943b in impala::FragmentInstanceState::Exec() ()
#18 0x0000000000b68a0a in 
impala::QueryState::ExecFInstance(impala::FragmentInstanceState*) ()
#19 0x0000000000d18a73 in 
impala::Thread::SuperviseThread(std::basic_string<char, std::char_traits<char>, 
std::allocator<char> > const&, std::basic_string<char, std::char_traits<char>, 
std::allocator<char> > const&, boost::function<void ()()>, 
impala::Promise<long>*) ()
#20 0x0000000000d19204 in boost::detail::thread_data<boost::_bi::bind_t<void, 
void (*)(std::basic_string<char, std::char_traits<char>, std::allocator<char> > 
const&, std::basic_string<char, std::char_traits<char>, std::allocator<char> > 
const&, boost::function<void ()()>, impala::Promise<long>*), 
boost::_bi::list4<boost::_bi::value<std::basic_string<char, 
std::char_traits<char>, std::allocator<char> > >, 
boost::_bi::value<std::basic_string<char, std::char_traits<char>, 
std::allocator<char> > >, boost::_bi::value<boost::function<void ()()> >, 
boost::_bi::value<impala::Promise<long>*> > > >::run() ()
#21 0x000000000129bf5a in thread_proxy ()
#22 0x000000326c607aa1 in start_thread () from /lib64/libpthread.so.0
#23 0x000000326c2e893d in clone () from /lib64/libc.so.6
{code}

This regression was observed with the tcmalloc options below
{code}
TCMALLOC_MAX_TOTAL_THREAD_CACHE_BYTES=2147483648
TCMALLOC_TRANSFER_NUM_OBJ=40
{code}



--
This message was sent by Atlassian JIRA
(v6.4.14#64029)

Reply via email to