chenBright opened a new pull request, #3291:
URL: https://github.com/apache/brpc/pull/3291

   ### What problem does this PR solve?
   
   Issue Number: regression introduced by #2949 
   
   Problem Summary:
   
   After PR #2949 ("Fix thread safety of AgentCombiner") changed
   `Agent::combiner` from a raw pointer to a `weak_ptr`, the
   `MultiDimensionTest.shared` UT reproducibly fails under 
   AddressSanitizer with a heap-use-after-free in `clear_all_agents()` 
   called from `~AgentCombiner`:
   
   ```
   [ RUN      ] MultiDimensionTest.shared
   =================================================================
   ==38807==ERROR: AddressSanitizer: heap-use-after-free on address 
0x62100065cd00 at pc 0x000000506883 bp 0x7f85a8a343f0 sp 0x7f85a8a343e8
   WRITE of size 8 at 0x62100065cd00 thread T1272
       #0 0x506882 in butil::LinkNode<bvar::detail::AgentCombiner<int, int, 
bvar::detail::AddTo<int> >::Agent>::RemoveFromList() 
/home/runner/work/brpc/brpc/test/../src/butil/containers/linked_list.h:130:28
       #1 0x5060de in bvar::detail::AgentCombiner<int, int, 
bvar::detail::AddTo<int> >::clear_all_agents() 
/home/runner/work/brpc/brpc/test/../src/bvar/detail/combiner.h:330:19
       #2 0x505d66 in bvar::detail::AgentCombiner<int, int, 
bvar::detail::AddTo<int> >::~AgentCombiner() 
/home/runner/work/brpc/brpc/test/../src/bvar/detail/combiner.h:236:13
       #3 0x505cf4 in void std::_Destroy<bvar::detail::AgentCombiner<int, int, 
bvar::detail::AddTo<int> > >(bvar::detail::AgentCombiner<int, int, 
bvar::detail::AddTo<int> >*) 
/usr/bin/../lib/gcc/x86_64-linux-gnu/12/../../../../include/c++/12/bits/stl_construct.h:151:19
       #4 0x505c78 in void std::allocator_traits<std::allocator<void> 
>::destroy<bvar::detail::AgentCombiner<int, int, bvar::detail::AddTo<int> > 
>(std::allocator<void>&, bvar::detail::AgentCombiner<int, int, 
bvar::detail::AddTo<int> >*) 
/usr/bin/../lib/gcc/x86_64-linux-gnu/12/../../../../include/c++/12/bits/alloc_traits.h:648:4
       #5 0x504e35 in 
std::_Sp_counted_ptr_inplace<bvar::detail::AgentCombiner<int, int, 
bvar::detail::AddTo<int> >, std::allocator<void>, 
(__gnu_cxx::_Lock_policy)2>::_M_dispose() 
/usr/bin/../lib/gcc/x86_64-linux-gnu/12/../../../../include/c++/12/bits/shared_ptr_base.h:613:2
       #6 0x507467 in 
std::_Sp_counted_base<(__gnu_cxx::_Lock_policy)2>::_M_release_last_use() 
/usr/bin/../lib/gcc/x86_64-linux-gnu/12/../../../../include/c++/12/bits/shared_ptr_base.h:175:2
       #7 0x5073f4 in 
std::_Sp_counted_base<(__gnu_cxx::_Lock_policy)2>::_M_release_last_use_cold() 
/usr/bin/../lib/gcc/x86_64-linux-gnu/12/../../../../include/c++/12/bits/shared_ptr_base.h:199:9
       #8 0x5073a1 in 
std::_Sp_counted_base<(__gnu_cxx::_Lock_policy)2>::_M_release() 
/usr/bin/../lib/gcc/x86_64-linux-gnu/12/../../../../include/c++/12/bits/shared_ptr_base.h:353:8
       #9 0x506f75 in 
std::__shared_count<(__gnu_cxx::_Lock_policy)2>::~__shared_count() 
/usr/bin/../lib/gcc/x86_64-linux-gnu/12/../../../../include/c++/12/bits/shared_ptr_base.h:1071:11
       #10 0x506f08 in std::__shared_ptr<bvar::detail::AgentCombiner<int, int, 
bvar::detail::AddTo<int> >, (__gnu_cxx::_Lock_policy)2>::~__shared_ptr() 
/usr/bin/../lib/gcc/x86_64-linux-gnu/12/../../../../include/c++/12/bits/shared_ptr_base.h:1524:31
       #11 0x506734 in std::shared_ptr<bvar::detail::AgentCombiner<int, int, 
bvar::detail::AddTo<int> > >::~shared_ptr() 
/usr/bin/../lib/gcc/x86_64-linux-gnu/12/../../../../include/c++/12/bits/shared_ptr.h:175:11
       #12 0x503b22 in bvar::Reducer<int, bvar::detail::AddTo<int>, 
bvar::detail::MinusFrom<int> >::~Reducer() 
/home/runner/work/brpc/brpc/test/../src/bvar/reducer.h:218:5
       #13 0x502c5a in bvar::Adder<int, void>::~Adder() 
/home/runner/work/brpc/brpc/test/../src/bvar/reducer.h:349:43
       #14 0x84ddf0 in void std::_Destroy<bvar::Adder<int, void> 
>(bvar::Adder<int, void>*) 
/usr/bin/../lib/gcc/x86_64-linux-gnu/12/../../../../include/c++/12/bits/stl_construct.h:151:19
       #15 0x84dd48 in void std::allocator_traits<std::allocator<void> 
>::destroy<bvar::Adder<int, void> >(std::allocator<void>&, bvar::Adder<int, 
void>*) 
/usr/bin/../lib/gcc/x86_64-linux-gnu/12/../../../../include/c++/12/bits/alloc_traits.h:648:4
       #16 0x84dac5 in std::_Sp_counted_ptr_inplace<bvar::Adder<int, void>, 
std::allocator<void>, (__gnu_cxx::_Lock_policy)2>::_M_dispose() 
/usr/bin/../lib/gcc/x86_64-linux-gnu/12/../../../../include/c++/12/bits/shared_ptr_base.h:613:2
       #17 0x507140 in 
std::_Sp_counted_base<(__gnu_cxx::_Lock_policy)2>::_M_release() 
/usr/bin/../lib/gcc/x86_64-linux-gnu/12/../../../../include/c++/12/bits/shared_ptr_base.h:346:8
       #18 0x506f75 in 
std::__shared_count<(__gnu_cxx::_Lock_policy)2>::~__shared_count() 
/usr/bin/../lib/gcc/x86_64-linux-gnu/12/../../../../include/c++/12/bits/shared_ptr_base.h:1071:11
       #19 0x84e138 in std::__shared_ptr<bvar::Adder<int, void>, 
(__gnu_cxx::_Lock_policy)2>::~__shared_ptr() 
/usr/bin/../lib/gcc/x86_64-linux-gnu/12/../../../../include/c++/12/bits/shared_ptr_base.h:1524:31
       #20 0x5a02f4 in std::shared_ptr<bvar::Adder<int, void> >::~shared_ptr() 
/usr/bin/../lib/gcc/x86_64-linux-gnu/12/../../../../include/c++/12/bits/shared_ptr.h:175:11
       #21 0x5953f7 in get_shared_adder_thread(void*) 
/home/runner/work/brpc/brpc/test/bvar_multi_dimension_unittest.cpp:623:5
       #22 0x7f85b4094ac2 in start_thread nptl/./nptl/pthread_create.c:442:8
       #23 0x7f85b41268cf  misc/../sysdeps/unix/sysv/linux/x86_64/clone3.S:81
   
   0x62100065cd00 is located 0 bytes inside of 4160-byte region 
[0x62100065cd00,0x62100065dd40)
   freed by thread T1267 here:
       #0 0x4f368d in operator delete(void*) 
(/home/runner/work/brpc/brpc/test/test_bvar+0x4f368d)
       #1 0x62c8f7 in bvar::detail::AgentGroup<bvar::detail::AgentCombiner<int, 
int, bvar::detail::AddTo<int> >::Agent>::_destroy_tls_blocks() 
/home/runner/work/brpc/brpc/test/../src/bvar/detail/agent_group.h:166:13
       #2 0x9b22bf in butil::detail::ThreadExitHelper::~ThreadExitHelper() 
/home/runner/work/brpc/brpc/src/butil/thread_local.cpp:41:13
       #3 0x9b22bf in butil::detail::delete_thread_exit_helper(void*) 
/home/runner/work/brpc/brpc/src/butil/thread_local.cpp:77:5
       #4 0x7f85b4091690 in __GI___nptl_deallocate_tsd 
nptl/./nptl/nptl_deallocate_tsd.c:73:29
       #5 0x7f85b4091690 in __GI___nptl_deallocate_tsd 
nptl/./nptl/nptl_deallocate_tsd.c:22:1
   
   previously allocated by thread T1267 here:
       #0 0x4f304d in operator new(unsigned long, std::nothrow_t const&) 
(/home/runner/work/brpc/brpc/test/test_bvar+0x4f304d)
       #1 0x62c377 in bvar::detail::AgentGroup<bvar::detail::AgentCombiner<int, 
int, bvar::detail::AddTo<int> >::Agent>::get_or_create_tls_agent(int) 
/home/runner/work/brpc/brpc/test/../src/bvar/detail/agent_group.h:150:38
       #2 0x62a43c in bvar::detail::AgentCombiner<int, int, 
bvar::detail::AddTo<int> >::get_or_create_tls_agent() 
/home/runner/work/brpc/brpc/test/../src/bvar/detail/combiner.h:304:21
       #3 0x59a35c in bvar::Reducer<int, bvar::detail::AddTo<int>, 
bvar::detail::MinusFrom<int> >::operator<<(int) 
/home/runner/work/brpc/brpc/test/../src/bvar/reducer.h:304:36
       #4 0x5953e9 in get_shared_adder_thread(void*) 
/home/runner/work/brpc/brpc/test/bvar_multi_dimension_unittest.cpp:622:16
       #5 0x7f85b4094ac2 in start_thread nptl/./nptl/pthread_create.c:442:8
   
   Thread T1272 created by T0 here:
       #0 0x4ac45c in pthread_create 
(/home/runner/work/brpc/brpc/test/test_bvar+0x4ac45c)
       #1 0x597629 in MultiDimensionTest_shared_Test::TestBody() 
/home/runner/work/brpc/brpc/test/bvar_multi_dimension_unittest.cpp:654:9
       #2 0x983f7e in void 
testing::internal::HandleExceptionsInMethodIfSupported<testing::Test, 
void>(testing::Test*, void (testing::Test::*)(), char const*) 
(/home/runner/work/brpc/brpc/test/test_bvar+0x983f7e)
   
   Thread T1267 created by T0 here:
       #0 0x4ac45c in pthread_create 
(/home/runner/work/brpc/brpc/test/test_bvar+0x4ac45c)
       #1 0x597629 in MultiDimensionTest_shared_Test::TestBody() 
/home/runner/work/brpc/brpc/test/bvar_multi_dimension_unittest.cpp:654:9
       #2 0x983f7e in void 
testing::internal::HandleExceptionsInMethodIfSupported<testing::Test, 
void>(testing::Test*, void (testing::Test::*)(), char const*) 
(/home/runner/work/brpc/brpc/test/test_bvar+0x983f7e)
   
   SUMMARY: AddressSanitizer: heap-use-after-free 
/home/runner/work/brpc/brpc/test/../src/butil/containers/linked_list.h:130:28 
in butil::LinkNode<bvar::detail::AgentCombiner<int, int, 
bvar::detail::AddTo<int> >::Agent>::RemoveFromList()
   Shadow bytes around the buggy address:
     0x0c42800c3950: fa fa fa fa fa fa fa fa fa fa fa fa fa fa fa fa
     0x0c42800c3960: fa fa fa fa fa fa fa fa fa fa fa fa fa fa fa fa
     0x0c42800c3970: fa fa fa fa fa fa fa fa fa fa fa fa fa fa fa fa
     0x0c42800c3980: fa fa fa fa fa fa fa fa fa fa fa fa fa fa fa fa
     0x0c42800c3990: fa fa fa fa fa fa fa fa fa fa fa fa fa fa fa fa
   =>0x0c42800c39a0:[fd]fd fd fd fd fd fd fd fd fd fd fd fd fd fd fd
     0x0c42800c39b0: fd fd fd fd fd fd fd fd fd fd fd fd fd fd fd fd
     0x0c42800c39c0: fd fd fd fd fd fd fd fd fd fd fd fd fd fd fd fd
     0x0c42800c39d0: fd fd fd fd fd fd fd fd fd fd fd fd fd fd fd fd
     0x0c42800c39e0: fd fd fd fd fd fd fd fd fd fd fd fd fd fd fd fd
     0x0c42800c39f0: fd fd fd fd fd fd fd fd fd fd fd fd fd fd fd fd
   Shadow byte legend (one shadow byte represents 8 application bytes):
     Addressable:           00
     Partially addressable: 01 02 03 04 05 06 07 
     Heap left redzone:       fa
     Freed heap region:       fd
     Stack left redzone:      f1
     Stack mid redzone:       f2
     Stack right redzone:     f3
     Stack after return:      f5
     Stack use after scope:   f8
     Global redzone:          f9
     Global init order:       f6
     Poisoned by user:        f7
     Container overflow:      fc
     Array cookie:            ac
     Intra object redzone:    bb
     ASan internal:           fe
     Left alloca redzone:     ca
     Right alloca redzone:    cb
     Shadow gap:              cc
   ==38807==ABORTING
   [runtest] 'test_bvar' failed, 1 succeeded
   ```
   
   **Root cause**: std::shared_ptr makes its weak_ptr expire the instant 
use_count drops to 0, while ~AgentCombiner only starts running after that. The 
two events therefore form a race window between two threads:
   
       * thread A: last shared_ptr<AgentCombiner> released
                   use_count -> 0   -> weak_ptr expired
                   ~AgentCombiner -> clear_all_agents() takes _lock and
                                     starts walking _agents
       * thread B: thread exits, _destroy_tls_blocks() deletes ThreadBlock,
                   each ~Agent calls combiner.lock() which now returns null
                   (weak_ptr already expired), so commit_and_erase() is
                   skipped and the Agent is freed while still linked into
                   A's _agents list
       * thread A: dereferences a freed LinkNode in clear_all_agents() ->
                   heap-use-after-free
   
   ### What is changed and the side effects?
   
   Changed:
   
   Don't traverse _agents from ~AgentCombiner. Letting _agents go out of scope 
is safe because:
   
     * butil::LinkedList and butil::LinkNode have trivial destructors and never 
traverse on destruction, so tearing down _agents does not dereference any agent 
node.
     * Surviving Agents simply observe combiner.expired() == true in ~Agent and 
skip commit_and_erase, leaving prev_/next_ dangling but never read.
     * If the freed agent_id is later reused by a new combiner and the same TLS 
slot is taken, get_or_create_tls_agent calls Agent::reset and Append's it into 
the new combiner's _agents. LinkNode::InsertBefore only writes prev_/next_ (it 
never reads their stale values), so the dangling pointers are safely 
overwritten.
     * Agent::element is destroyed together with its ThreadBlock, so any 
non-POD resource it holds is still released; if the agent slot is reused, 
Agent::reset rewrites the element value before it is observed again.
   
   The body of clear_all_agents() is kept (commented out) for reference, 
together with a NOTE explaining why it must not be re-enabled from 
~AgentCombiner.
   
   Side effects:
   - Performance effects:
   
   - Breaking backward compatibility: 
   
   ---
   ### Check List:
   - Please make sure your changes are compilable.
   - When providing us with a new feature, it is best to add related tests.
   - Please follow [Contributor Covenant Code of 
Conduct](https://github.com/apache/brpc/blob/master/CODE_OF_CONDUCT.md).
   


-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to