kof02guy opened a new issue #1136:
URL: https://github.com/apache/incubator-brpc/issues/1136


   我们用 braft + brpc 实现强一致性方案。目前在压力大且持续时间长的情况下会出现死锁,通过 gdb 的 bt 直接看到的线程栈可以分为下面两类。
   第 1 类栈在等待第 2 类栈持有的锁;而第 2 类栈在执行过程中已经拿到了第 1 类栈所需要的锁,在试图往 _rq 中 push 时,由于 _rq 已满而失败,然后 usleep 等待。而 _rq 里的 task 被 pop 出来执行时,很可能也是 log_manager 的相关操作,同样需要拿同一把锁。这样就形成了循环等待,即死锁。
   我想到的解决思路有:
   1. 在 _rq 满时,是否可以不再 usleep 等待入队,而是直接执行该 task?
   2. 在 _rq 满时,是否可以通过某种方式通知用户,由用户进行处理?
   
   1.
   `#0  0x00007f9edd212334 in __lll_lock_wait () from /lib64/libpthread.so.0
   #1  0x00007f9edd20d5d8 in _L_lock_854 () from /lib64/libpthread.so.0
   #2  0x00007f9edd20d4a7 in pthread_mutex_lock () from /lib64/libpthread.so.0
   #3  0x0000000000df637b in pthread_mutex_lock_impl (__mutex=0x7f89f75ff4e8) 
at xxx/deps/incubator-brpc/src/bthread/mutex.cpp:555
   #4  pthread_mutex_lock (__mutex=0x7f89f75ff4e8) at 
xxx/deps/incubator-brpc/src/bthread/mutex.cpp:813
   #5  0x0000000000d14ca0 in lock (this=0x7f89f75ff4b0, index=14541) at 
/usr/include/butil/synchronization/lock.h:69
   #6  lock (this=0x7f89f75ff4b0, index=14541) at 
/usr/include/c++/4.9.2/mutex:474
   #7  unique_lock (this=0x7f89f75ff4b0, index=14541) at 
/usr/include/c++/4.9.2/mutex:406
   #8  braft::LogManager::get_term (this=0x7f89f75ff4b0, index=14541) at 
xxx/src/braft/log_manager.cpp:787
   #9  0x0000000000d2ad2d in braft::Replicator::_fill_common_fields 
(this=0x7f86cc92c460, request=0x7f86a8f9d360, prev_log_index=14541, 
is_heartbeat=Unhandled dwarf expression opcode 0xf3
   ) at xxx/src/braft/replicator.cpp:496
   #10 0x0000000000d3063c in braft::Replicator::_send_entries 
(this=0x7f86cc92c460) at xxx/src/braft/replicator.cpp:611
   #11 0x0000000000d326bf in braft::Replicator::_on_rpc_returned (id=Unhandled 
dwarf expression opcode 0xf3
   ) at xxx/src/braft/replicator.cpp:489
   #12 0x0000000000d33b36 in brpc::internal::FunctionClosure5<unsigned long, 
brpc::Controller*, braft::AppendEntriesRequest*, braft::AppendEntriesResponse*, 
long>::Run (this=0x7f86d16ad950) at /usr/include/brpc/callback.h:339
   #13 0x0000000000e03ef8 in brpc::Controller::EndRPC (this=0x7f86d16ad5d0, 
info=Unhandled dwarf expression opcode 0xf3
   ) at xxx/deps/incubator-brpc/src/brpc/controller.cpp:893
   #14 0x0000000000e05bc4 in brpc::Controller::OnVersionedRPCReturned 
(this=0x7f86d16ad5d0, info=..., new_bthread=Unhandled dwarf expression opcode 
0xf3
   ) at xxx/deps/incubator-brpc/src/brpc/controller.cpp:676
   #15 0x0000000000e6de9a in OnResponse (msg_base=0x7f897544b140) at 
xxx/deps/incubator-brpc/src/brpc/details/controller_private_accessor.h:48
   #16 brpc::policy::ProcessRpcResponse (msg_base=0x7f897544b140) at 
xxx/deps/incubator-brpc/src/brpc/policy/baidu_rpc_protocol.cpp:618 
   #17 0x0000000000e5b9fa in brpc::ProcessInputMessage (void_arg=Unhandled 
dwarf expression opcode 0xf3
   ) at xxx/deps/incubator-brpc/src/brpc/input_messenger.cpp:136
   #18 0x0000000000decee1 in bthread::TaskGroup::task_runner 
(skip_remained=Unhandled dwarf expression opcode 0xf3
   ) at xxx/deps/incubator-brpc/src/bthread/task_group.cpp:297
   #19 0x0000000000e3d5b1 in bthread_make_fcontext ()
   #20 0x0000000000000000 in ?? ()`
   
   2.
   `#0  0x00007f9edb9c1cbd in nanosleep () from /lib64/libc.so.6
   #1  0x00007f9edb9f6f14 in usleep () from /lib64/libc.so.6
   #2  0x0000000000deb891 in bthread::TaskGroup::ready_to_run_remote 
(this=0x7f8b6c0008c0, tid=142391050791690, nosignal=Unhandled dwarf expression 
opcode 0xf3
   ) at xxx/deps/incubator-brpc/src/bthread/task_group.cpp:675
   #3  0x0000000000dee52a in bthread::TaskGroup::start_background<true> 
(this=Unhandled dwarf expression opcode 0xf3
   ) at xxx/deps/incubator-brpc/src/bthread/task_group.cpp:448
   #4  0x0000000000df4d6c in start_from_non_worker (tid=0x7f8aed7e9800, 
attr=0x7f89f7582988, fn=0xddcbd0 
<bthread::ExecutionQueueBase::_execute_tasks(void*)>, arg=0x7f8511942ef0)
       at xxx/deps/incubator-brpc/src/bthread/bthread.cpp:146
   #5  bthread_start_background (tid=0x7f8aed7e9800, attr=0x7f89f7582988, 
fn=0xddcbd0 <bthread::ExecutionQueueBase::_execute_tasks(void*)>, 
arg=0x7f8511942ef0)
       at xxx/deps/incubator-brpc/src/bthread/bthread.cpp:194
   #6  0x0000000000dddb97 in bthread::ExecutionQueueBase::start_execute 
(this=0x7f89f75828d0, node=0x7f8511942ef0) at 
xxx/deps/incubator-brpc/src/bthread/execution_queue.cpp:115
   #7  0x0000000000d1a898 in execute (id=<value optimized out>, 
task=@0x7f8aed7e98d8, options=0x0, handle=0x0) at 
/usr/include/bthread/execution_queue_inl.h:318
   #8  bthread::execution_queue_execute<braft::LogManager::StableClosure*> 
(id=<value optimized out>, task=@0x7f8aed7e98d8, options=0x0, handle=0x0) at 
/usr/include/bthread/execution_queue_inl.h:363
   #9  0x0000000000d16251 in 
execution_queue_execute<braft::LogManager::StableClosure*> 
(this=0x7f89f75ff4b0, entries=0x7f8aed7e9a70, done=0x7f84d06daa00) at 
/usr/include/bthread/execution_queue_inl.h:352
   #10 execution_queue_execute<braft::LogManager::StableClosure*> 
(this=0x7f89f75ff4b0, entries=0x7f8aed7e9a70, done=0x7f84d06daa00) at 
/usr/include/bthread/execution_queue_inl.h:345
   #11 braft::LogManager::append_entries (this=0x7f89f75ff4b0, 
entries=0x7f8aed7e9a70, done=0x7f84d06daa00) at 
xxx/src/braft/log_manager.cpp:485
   #12 0x0000000000cf3821 in braft::NodeImpl::apply (this=0x7f89f75feb50, 
tasks=Unhandled dwarf expression opcode 0xf3
   ) at xxx/src/braft/node.cpp:1959
   #13 0x0000000000cf3b1e in braft::NodeImpl::execute_applying_tasks 
(meta=0x7f89f75feb50, iter=...) at xxx/src/braft/node.cpp:724
   #14 0x0000000000dda82d in bthread::ExecutionQueueBase::_execute 
(this=0x7f89f75827d0, head=0x7f8572168a90, high_priority=Unhandled dwarf 
expression opcode 0xf3
   ) at xxx/deps/incubator-brpc/src/bthread/execution_queue.cpp:273
   #15 0x0000000000dddc08 in bthread::ExecutionQueueBase::start_execute 
(this=0x7f89f75827d0, node=0x7f8572168a90) at 
xxx/deps/incubator-brpc/src/bthread/execution_queue.cpp:95
   #16 0x0000000000c3ba4c in cellar::raft::BucketStateMachine::AsyncApply 
(this=0x3efba20, op_type=Unhandled dwarf expression opcode 0xf3
   ) at bucket_state_machine.cpp:87
   ... 
   ...`
   
   通过 gdb_bthread_stack.py 拿到的 bthread 栈,绝大多数(约 99%)都是下面这种:
   `#0  0x0000000000dea8a8 in jump_stack (pg=0x0, next_meta=0x5a7f77d934dc6) at 
xxx/deps/incubator-brpc/src/bthread/stack_inl.h:133
   #1  bthread::TaskGroup::sched_to (pg=0x0, next_meta=0x5a7f77d934dc6) at 
xxx/deps/incubator-brpc/src/bthread/task_group.cpp:605
   #2  0x0000000000deadce in sched_to (pg=0x7f894ebeba08) at 
xxx/deps/incubator-brpc/src/bthread/task_group_inl.h:80
   #3  bthread::TaskGroup::sched (pg=0x7f894ebeba08) at 
xxx/deps/incubator-brpc/src/bthread/task_group.cpp:563
   #4  0x0000000000de1cc7 in bthread::butex_wait (arg=0x7f87bceb60c0, 
expected_value=1, abstime=0x0) at 
xxx/deps/incubator-brpc/src/bthread/butex.cpp:660
   #5  0x0000000000de93a0 in bthread::CountdownEvent::wait 
(this=0x7f894ebebb48) at 
xxx/deps/incubator-brpc/src/bthread/countdown_event.cpp:65
   #6  0x0000000000d13991 in wait (this=Unhandled dwarf expression opcode 0xf3
   ) at xxx/src/braft/log_manager.cpp:164
   #7  braft::LogManager::last_log_id (this=Unhandled dwarf expression opcode 
0xf3
   ) at xxx/src/braft/log_manager.cpp:201
   #8  0x0000000000cf1d7a in braft::NodeImpl::handle_pre_vote_request 
(this=0x7f89d42a3c30, request=0x7f87bc3c30a0, response=0x7f87bd29ef20) at 
xxx/src/braft/node.cpp:2027
   #9  0x0000000000d7d892 in braft::RaftServiceImpl::pre_vote (this=Unhandled 
dwarf expression opcode 0xf3
   ) at xxx/src/braft/raft_service.cpp:62
   #10 0x0000000000d67c6b in braft::RaftService::CallMethod (this=Unhandled 
dwarf expression opcode 0xf3
   ) at xxx/bld/braft/raft.pb.cc:5130
   #11 0x0000000000e70cab in brpc::policy::ProcessRpcRequest 
(msg_base=Unhandled dwarf expression opcode 0xf3
   ) at xxx/deps/incubator-brpc/src/brpc/policy/baidu_rpc_protocol.cpp:499
   #12 0x0000000000e5b9fa in brpc::ProcessInputMessage (void_arg=Unhandled 
dwarf expression opcode 0xf3
   ) at xxx/deps/incubator-brpc/src/brpc/input_messenger.cpp:136
   #13 0x0000000000decee1 in bthread::TaskGroup::task_runner 
(skip_remained=Unhandled dwarf expression opcode 0xf3
   ) at xxx/deps/incubator-brpc/src/bthread/task_group.cpp:297
   #14 0x0000000000e3d5b1 in bthread_make_fcontext ()
   #15 0x0000000000000000 in ?? ()`
   


----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org



---------------------------------------------------------------------
To unsubscribe, e-mail: dev-unsubscr...@brpc.apache.org
For additional commands, e-mail: dev-h...@brpc.apache.org

Reply via email to