Hi,

As discussed on IRC with sustrik: I run streamer devices that get
connections from ~2800 httpd workers. When the httpd workers get
restarted en masse, the devices all stop at once or become extremely slow,
and our system stops working properly.

The backtrace looks like this:

> (gdb) bt
> #0  0xb772342e in send () from /lib/libpthread.so.0
> #1  0xb7758468 in zmq::signaler_t::send(zmq::command_t const&) () from 
> /usr/lib/libzmq.so.0
> #2  0xb7745e4d in zmq::ctx_t::send_command(unsigned int, zmq::command_t 
> const&) () from /usr/lib/libzmq.so.0
> #3  0xb774d5aa in zmq::object_t::send_command(zmq::command_t&) () from 
> /usr/lib/libzmq.so.0
> #4  0xb774d631 in zmq::object_t::send_term(zmq::owned_t*) () from 
> /usr/lib/libzmq.so.0
> #5  0xb77588fb in zmq::socket_base_t::process_term_req(zmq::owned_t*) () from 
> /usr/lib/libzmq.so.0
> #6  0xb774e22f in zmq::object_t::process_command(zmq::command_t&) () from 
> /usr/lib/libzmq.so.0
> #7  0xb774528f in zmq::app_thread_t::process_commands(bool, bool) () from 
> /usr/lib/libzmq.so.0
> #8  0xb7758b0d in zmq::socket_base_t::recv(zmq_msg_t*, int) () from 
> /usr/lib/libzmq.so.0
> #9  0xb7749c01 in zmq::forwarder(zmq::socket_base_t*, zmq::socket_base_t*) () 
> from /usr/lib/libzmq.so.0
> #10 0xb7760fcc in zmq_device () from /usr/lib/libzmq.so.0
> #11 0x0804f707 in ?? ()
> #12 0xb7495b86 in __libc_start_main () from /lib/libc.so.6
> #13 0x08048f11 in ?? ()
> (gdb) info threads
>   2 Thread 0xb747cb70 (LWP 14032)  0xb772342e in send () from 
> /lib/libpthread.so.0
> * 1 Thread 0xb747d710 (LWP 14031)  0xb772342e in send () from 
> /lib/libpthread.so.0
> (gdb) thread 2
> [Switching to thread 2 (Thread 0xb747cb70 (LWP 14032))]#0  0xb772342e in send 
> () from /lib/libpthread.so.0
> (gdb) info threads
> * 2 Thread 0xb747cb70 (LWP 14032)  0xb772342e in send () from 
> /lib/libpthread.so.0
>   1 Thread 0xb747d710 (LWP 14031)  0xb772342e in send () from 
> /lib/libpthread.so.0
> (gdb) bt
> #0  0xb772342e in send () from /lib/libpthread.so.0
> #1  0xb7758468 in zmq::signaler_t::send(zmq::command_t const&) () from 
> /usr/lib/libzmq.so.0
> #2  0xb7745e4d in zmq::ctx_t::send_command(unsigned int, zmq::command_t 
> const&) () from /usr/lib/libzmq.so.0
> #3  0xb774d5aa in zmq::object_t::send_command(zmq::command_t&) () from 
> /usr/lib/libzmq.so.0
> #4  0xb774d781 in zmq::object_t::send_revive(zmq::object_t*) () from 
> /usr/lib/libzmq.so.0
> #5  0xb774fe36 in zmq::writer_t::flush() () from /usr/lib/libzmq.so.0
> #6  0xb7751084 in zmq::writer_t::write_delimiter() () from 
> /usr/lib/libzmq.so.0
> #7  0xb775151e in zmq::writer_t::term() () from /usr/lib/libzmq.so.0
> #8  0xb77574db in zmq::session_t::process_unplug() () from 
> /usr/lib/libzmq.so.0
> #9  0xb774ead0 in zmq::owned_t::finalise() () from /usr/lib/libzmq.so.0
> #10 0xb774eb57 in zmq::owned_t::process_term() () from /usr/lib/libzmq.so.0
> #11 0xb774e0a0 in zmq::object_t::process_command(zmq::command_t&) () from 
> /usr/lib/libzmq.so.0
> #12 0xb774a98f in zmq::io_thread_t::in_event() () from /usr/lib/libzmq.so.0
> #13 0xb7749721 in zmq::epoll_t::loop() () from /usr/lib/libzmq.so.0
> #14 0xb774982d in zmq::epoll_t::worker_routine(void*) () from 
> /usr/lib/libzmq.so.0
> #15 0xb775d977 in zmq::thread_t::thread_routine(void*) () from 
> /usr/lib/libzmq.so.0
> #16 0xb771b830 in start_thread () from /lib/libpthread.so.0
> #17 0xb755f15e in clone () from /lib/libc.so.6
> (gdb) quit

When I attach strace to the streamer, it seems that it always blocks on
fd #3:
> send(3, "`!\370\10\n\0\0\0\210\266\203\266\270\fw\t;\204u\267\30\3w\267"..., 
> 24, 0) = 24
> recv(7, "\320I\5\10\t\0\0\0\3604w\266\364\337]\267\200\363]\2670\7q\267"..., 
> 24, MSG_DONTWAIT) = 24
> send(3, "\3604w\266\n\0\0\0X\267{\10\270\fw\t;\204u\267\30\3w\267"..., 24, 0) 
> = 24
> recv(7, "\320I\5\10\t\0\0\0X\365\374\n\364\337]\267\20\0\260\2660\7q\267"..., 
> 24, MSG_DONTWAIT) = 24
> send(3, "X\365\374\n\n\0\0\0P\321\203\266\270\fw\t;\204u\267\30\3w\267"..., 
> 24, 0) = 24
> recv(7, 
> "\320I\5\10\t\0\0\0\340\206[\10\364\337]\267\200\363]\2670\7q\267"..., 24, 
> MSG_DONTWAIT) = 24
> send(3, "\340\206[\10\n\0\0\0(\252\252\266\270\fw\t;\204u\267\30\3w\267"..., 
> 24, 0) = 24
> recv(7, 
> "\320I\5\10\t\0\0\0\330\343\t\v\364\337]\267\200\363]\2670\7q\267"..., 24, 
> MSG_DONTWAIT) = 24
> send(3, 
> "\330\343\t\v\n\0\0\0\20\325\203\266\270\fw\t;\204u\267\30\3w\267"..., 24, 0) 
> = 24
> recv(7, 
> "\320I\5\10\t\0\0\0\310\311R\266\364\337]\267\200\363]\2670\7q\267"..., 24, 
> MSG_DONTWAIT) = 24
> send(3, "\310\311R\266\n\0\0\0\210\307S\v\270\fw\t;\204u\267\30\3w\267"..., 
> 24, 0)  <unfinished ...>


I've also reproduced it on a dev server - the attached file contains the
scripts and instructions for reproducing it.

This is using ZeroMQ v2.0.10 on Slackware (i686)

Thanks

--
Thomas

Attachment: streamer_bug.tar.bz2
Description: application/bzip

_______________________________________________
zeromq-dev mailing list
[email protected]
http://lists.zeromq.org/mailman/listinfo/zeromq-dev

Reply via email to