On Thu, 2012-08-16 at 23:40 +0000, Hefty, Sean wrote:
> I don't have RoCE installed. With IB, I haven't been able to see this
> problem after dozens of attempts. The performance isn't divided equally, but
> I usually see between 3-10 Gbps out of each connection. I'm running with
> some additional patches, but I don't see where those would affect any hang.
>
> During development, I've seen issues where specific transfer patterns just
> happen to fall on boundary conditions that result in hangs or slowness.
> Hopefully that's not the case because those are a pain to identify.
>
> Is this something that you would be able to use a debugger on? Dumping the
> contents of the struct rsocket * would be useful in troubleshooting if we're
> waiting on credits, buffer space, the app, or something else.
>

Here are the dumps of struct rsocket of a hung netperf connection.

netserver: hang in recv()
(gdb) bt #0 0x0000003286ed83f0 in __read_nocancel () from /lib64/libc.so.6 #1 0x0000003b7220a1c4 in ibv_get_cq_event () from /usr/lib64/libibverbs.so.1 #2 0x00007fee00642197 in rs_get_cq_event (rs=0x18139c0) at src/rsocket.c:941 #3 0x00007fee00643567 in rs_process_cq (rs=0x18139c0, nonblock=0, test=0x7fee006414b0 <rs_conn_have_rdata>) at src/rsocket.c:989 #4 0x00007fee006438bd in rs_get_comp (rs=0x18139c0, nonblock=<value optimized out>, test=0x7fee006414b0 <rs_conn_have_rdata>) at src/rsocket.c:1019 #5 0x00007fee00643b51 in rrecv (socket=<value optimized out>, buf=0x17fdd10, len=87380, flags=<value optimized out>) at src/rsocket.c:1136 #6 0x000000000042d6e3 in recv_data () #7 0x0000000000430da9 in recv_omni () #8 0x0000000000403628 in process_requests () #9 0x00000000004037b0 in spawn_child () #10 0x00000000004038f0 in accept_connection () #11 0x0000000000403a46 in accept_connections () #12 0x000000000040407a in main () (gdb) p *(struct rsocket *)0x18139c0 $1 = {cm_id = 0x1813b80, slock = {sem = {__size = "\000\000\000\000\200", '\000' <repeats 26 times>, __align = 549755813888}, cnt = 0}, rlock = {sem = { __size = "\000\000\000\000\200", '\000' <repeats 26 times>, __align = 549755813888}, cnt = 1}, cq_lock = {sem = {__size = "\000\000\000\000\200", '\000' <repeats 26 times>, __align = 549755813888}, cnt = 0}, cq_wait_lock = {sem = { __size = "\000\000\000\000\200", '\000' <repeats 26 times>, __align = 549755813888}, cnt = 1}, opts = 0, fd_flags = 2, so_opts = 4, tcp_opts = 2, ipv6_opts = 0, state = 1792, cq_armed = 1, retries = 0, err = 0, index = 16, ctrl_avail = 3, sqe_avail = 1020, sbuf_bytes_avail = 131072, sseq_no = 0, sseq_comp = 1024, sq_size = 1024, sq_inline = 64, rq_size = 1024, rseq_no = 3515, rseq_comp = 4026, rbuf_bytes_avail = 63488, rbuf_free_offset = 65536, rbuf_offset = 129024, rmsg_head = 440, rmsg_tail = 440, rmsg = 0x1813dd0, remote_sge = 1, remote_sgl = {addr = 6784856, key = 2550145917, length = 2}, target_mr = 0x1815e60, target_sge = 0, 
target_sgl = {{ addr = 140617248624656, key = 2550146173, length = 65536}, {addr = 140617248690192, key = 2550146173, length = 65536}}, rbuf_size = 131072, rmr = 0x17dd680, rbuf = 0x7fee00347010 "netperf", sbuf_size = 131072, smr = 0x17dffd0, ssgl = {{ addr = 140660182515728, length = 0, lkey = 3758105205}, {addr = 140660182515728, length = 0, lkey = 3758105205}}, sbuf = 0x7fee00368010 ""} netperf: hang in send() (gdb) bt #0 0x0000003120ad83f0 in __read_nocancel () from /lib64/libc.so.6 #1 0x0000003da700a1c4 in ibv_get_cq_event () from /usr/lib64/libibverbs.so.1 #2 0x00007fe40157f197 in rs_get_cq_event (rs=0x678610) at src/rsocket.c:941 #3 0x00007fe401580567 in rs_process_cq (rs=0x678610, nonblock=0, test=0x7fe40157e430 <rs_conn_can_send>) at src/rsocket.c:989 #4 0x00007fe4015808bd in rs_get_comp (rs=0x678610, nonblock=<value optimized out>, test=0x7fe40157e430 <rs_conn_can_send>) at src/rsocket.c:1019 #5 0x00007fe401581b92 in rsend (socket=<value optimized out>, buf=<value optimized out>, len=65536, flags=<value optimized out>) at src/rsocket.c:1244 #6 0x000000000042ba6d in send_data () #7 0x000000000042d158 in send_omni_inner () #8 0x000000000042fbf1 in send_tcp_stream () #9 0x000000000040239d in main () (gdb) p *(struct rsocket *)0x678610 $1 = {cm_id = 0x6a2af0, slock = {sem = { __size = "\000\000\000\000\200", '\000' <repeats 26 times>, __align = 549755813888}, cnt = 1}, rlock = {sem = { __size = "\000\000\000\000\200", '\000' <repeats 26 times>, __align = 549755813888}, cnt = 0}, cq_lock = {sem = { __size = "\000\000\000\000\200", '\000' <repeats 26 times>, __align = 549755813888}, cnt = 0}, cq_wait_lock = {sem = { __size = "\000\000\000\000\200", '\000' <repeats 26 times>, __align = 549755813888}, cnt = 1}, opts = 0, fd_flags = 2, so_opts = 4, tcp_opts = 0, ipv6_opts = 0, state = 1792, cq_armed = 1, retries = 0, err = 0, index = 12, ctrl_avail = 4, sqe_avail = 1020, sbuf_bytes_avail = 131072, sseq_no = 3522, sseq_comp = 4538, sq_size = 1024, sq_inline = 64, 
rq_size = 1024, rseq_no = 0, rseq_comp = 512, rbuf_bytes_avail = 0, rbuf_free_offset = 0, rbuf_offset = 0, rmsg_head = 0, rmsg_tail = 0, rmsg = 0x6a2d90, remote_sge = 0, remote_sgl = {addr = 25246472, key = 3758105461, length = 2}, target_mr = 0x6a4e20, target_sge = 1, target_sgl = {{addr = 140660182446096, key = 3758105717, length = 0}, { addr = 140660182511632, key = 3758105717, length = 0}}, rbuf_size = 131072, rmr = 0x677c80, rbuf = 0x7fe401275010 "", sbuf_size = 131072, smr = 0x678980, ssgl = {{addr = 140617248825360, length = 2048, lkey = 2550145661}, { addr = 140617248759824, length = 0, lkey = 2550145661}}, sbuf = 0x7fe401296010 "netperf"} -- To unsubscribe from this list: send the line "unsubscribe linux-rdma" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html