[ 
https://issues.apache.org/jira/browse/HDFS-9890?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel
 ]

James Clampffer updated HDFS-9890:
----------------------------------
    Attachment: hs_err_pid4944.log
                hs_err_pid26832.log

I was able to get two crashes when I ran this that look pretty similar; the 
crash logs are attached.  It seems like a race condition or something similar; 
I haven't been able to reproduce the crashes since the first two.  Error 
simulation was effectively turned off (probability set to 1E-9).

Here are the tops of the stacks, where things look roughly the same.  I cut off 
the parts showing stack offsets to make them fit in the comment better.

In DataNodeConnectionImpl
{code}
0) void asio::detail::op_queue_access::next<asio::detail::reactor_op, 
asio::detail::reactor_op>(asio::detail::reactor_op*&, 
asio::detail::reactor_op*)+0x17
1) 
asio::detail::op_queue<asio::detail::reactor_op>::push(asio::detail::reactor_op*)+0x45
2) asio::detail::epoll_reactor::start_op(int, int, 
asio::detail::epoll_reactor::descriptor_state*&, asio::detail::reactor_op*, 
bool, bool)+0x2b8
3) 
asio::detail::reactive_socket_service_base::start_op(asio::detail::reactive_socket_service_base::base_implementation_type&,
 int, asio::detail::reactor_op*, bool, bool, bool)+0xb6
4) void 
asio::detail::reactive_socket_service_base::async_receive<asio::mutable_buffers_1,
 std::function<void ()(std::error_code const&, unsigned long)> 
>(asio::detail::reactive_socket_service_base::base_implementation_type&, 
asio::mutable_buffers_1 const&, int, std::function<void ()(std::error_code 
const&, unsigned long)>&)+0x131
5) asio::async_result<asio::handler_type<std::function<void ()(std::error_code 
const&, unsigned long)>&, void ()(std::error_code, unsigned long)>::type>::type 
asio::stream_socket_service<asio::ip::tcp>::async_receive<asio::mutable_buffers_1,
 std::function<void ()(std::error_code const&, unsigned 
long)>&>(asio::detail::reactive_socket_service<asio::ip::tcp>::implementation_type&,
 asio::mutable_buffers_1 const&, int, std::function<void ()(std::error_code 
const&, unsigned long)>&&&)+0x60
6) asio::async_result<asio::handler_type<std::function<void ()(std::error_code 
const&, unsigned long)>&, void ()(std::error_code, unsigned long)>::type>::type 
asio::basic_stream_socket<asio::ip::tcp, 
asio::stream_socket_service<asio::ip::tcp> 
>::async_read_some<asio::mutable_buffers_1, std::function<void 
()(std::error_code const&, unsigned long)>&>(asio::mutable_buffers_1 const&, 
std::function<void ()(std::error_code const&, unsigned long)>&&&)+0x4b
7) hdfs::DataNodeConnectionImpl::async_read_some(asio::mutable_buffers_1 
const&, std::function<void ()(std::error_code const&, unsigned long)>)+0xb2
8)  
_ZN4asio6detail7read_opIN4hdfs11AsyncStreamENS_17mutable_buffers_1ESt5_BindIFSt7_Mem_fnIMNS2_12continuation34ReadDelimitedPBMessageContinuationIS3_Lm16384EEEFmRKSt10error_codemEEPS9_St12_PlaceholderILi1EESH_ILi2EEEEZNS9_3RunERKSt8functionIFvRKNS2_6StatusEEEEUlSC_mE_EclESC_mi+0x143
{code}

In RpcEngine:
{code}
0) asio::detail::wait_op* 
asio::detail::op_queue_access::next<asio::detail::wait_op>(asio::detail::wait_op*)+0xc
1) asio::detail::op_queue<asio::detail::wait_op>::pop()+0x32
2) asio::detail::op_queue<asio::detail::wait_op>::~op_queue()+0x2a
3) 
asio::detail::timer_queue<asio::detail::chrono_time_traits<std::chrono::_V2::system_clock,
 asio::wait_traits<std::chrono::_V2::system_clock> > 
>::per_timer_data::~per_timer_data()+0x18
4) 
asio::detail::deadline_timer_service<asio::detail::chrono_time_traits<std::chrono::_V2::system_clock,
 asio::wait_traits<std::chrono::_V2::system_clock> > 
>::implementation_type::~implementation_type()+0x1c
5) 
asio::basic_io_object<asio::deadline_timer_service<std::chrono::_V2::system_clock,
 asio::detail::chrono_time_traits<std::chrono::_V2::system_clock, 
asio::wait_traits<std::chrono::_V2::system_clock> > >, 
false>::~basic_io_object()+0x36
6) asio::basic_deadline_timer<std::chrono::_V2::system_clock, 
asio::detail::chrono_time_traits<std::chrono::_V2::system_clock, 
asio::wait_traits<std::chrono::_V2::system_clock> >, 
asio::deadline_timer_service<std::chrono::_V2::system_clock, 
asio::detail::chrono_time_traits<std::chrono::_V2::system_clock, 
asio::wait_traits<std::chrono::_V2::system_clock> > > 
>::~basic_deadline_timer()+0x18
7) hdfs::Request::~Request()+0x3c
8) void 
__gnu_cxx::new_allocator<hdfs::Request>::destroy<hdfs::Request>(hdfs::Request*)+0x1c
9) std::enable_if<std::allocator_traits<std::allocator<hdfs::Request> 
>::__destroy_helper<hdfs::Request>::value, void>::type 
std::allocator_traits<std::allocator<hdfs::Request> 
>::_S_destroy<hdfs::Request>(std::allocator<hdfs::Request>&, 
hdfs::Request*)+0x23
10) void std::allocator_traits<std::allocator<hdfs::Request> 
>::destroy<hdfs::Request>(std::allocator<hdfs::Request>&, hdfs::Request*)+0x23
11) std::_Sp_counted_ptr_inplace<hdfs::Request, std::allocator<hdfs::Request>, 
(__gnu_cxx::_Lock_policy)2>::_M_dispose()+0x27
12) std::_Sp_counted_base<(__gnu_cxx::_Lock_policy)2>::_M_release()+0x42
13) std::__shared_count<(__gnu_cxx::_Lock_policy)2>::~__shared_count()+0x27
14) std::__shared_ptr<hdfs::Request, 
(__gnu_cxx::_Lock_policy)2>::~__shared_ptr()+0x1c
15) std::shared_ptr<hdfs::Request>::~shared_ptr()+0x18
16) _ZNKSs4sizeEv@@GLIBCXX_3.4+0x30f79c
17) _ZNKSs4sizeEv@@GLIBCXX_3.4+0x31226
{code}

I can't really figure out what's going on here (from second stack):
{code}
std::shared_ptr<hdfs::Request>::~shared_ptr()
_ZNKSs4sizeEv@@GLIBCXX_3.4+0x30f79c = std::string::size() const
{code}
No idea why it looks like string::size is doing anything to a 
shared_ptr<Request>.

The tops of the stacks are both due to a linked-list operation gone wrong; the 
relevant code is from include/asio/detail/op_queue.hpp:
{code}
    template <typename Operation>
    static Operation* next(Operation* o)
    {
      return static_cast<Operation*>(o->next_);
    }
{code}


> libhdfs++: Add test suite to simulate network issues
> ----------------------------------------------------
>
>                 Key: HDFS-9890
>                 URL: https://issues.apache.org/jira/browse/HDFS-9890
>             Project: Hadoop HDFS
>          Issue Type: Sub-task
>          Components: hdfs-client
>            Reporter: James Clampffer
>            Assignee: Xiaowei Zhu
>         Attachments: HDFS-9890.HDFS-8707.000.patch, 
> HDFS-9890.HDFS-8707.001.patch, HDFS-9890.HDFS-8707.002.patch, 
> HDFS-9890.HDFS-8707.003.patch, HDFS-9890.HDFS-8707.004.patch, 
> HDFS-9890.HDFS-8707.005.patch, HDFS-9890.HDFS-8707.006.patch, 
> HDFS-9890.HDFS-8707.007.patch, hs_err_pid26832.log, hs_err_pid4944.log
>
>
> I propose adding a test suite to simulate various network issues/failures in 
> order to get good test coverage on some of the retry paths that aren't easy 
> to hit in mock unit tests.
> At the moment the only things that hit the retry paths are the gmock unit 
> tests.  The gmock are only as good as their mock implementations which do a 
> great job of simulating protocol correctness but not more complex 
> interactions.  They also can't really simulate the types of lock contention 
> and subtle memory stomps that show up while doing hundreds or thousands of 
> concurrent reads.   We should add a new minidfscluster test that focuses on 
> heavy read/seek load and then randomly convert error codes returned by 
> network functions into errors.
> List of things to simulate(while heavily loaded), roughly in order of how 
> badly I think they need to be tested at the moment:
> -Rpc connection disconnect
> -Rpc connection slowed down enough to cause a timeout and trigger retry
> -DN connection disconnect



--
This message was sent by Atlassian JIRA
(v6.3.4#6332)

---------------------------------------------------------------------
To unsubscribe, e-mail: hdfs-issues-unsubscr...@hadoop.apache.org
For additional commands, e-mail: hdfs-issues-h...@hadoop.apache.org

Reply via email to