Sasha, I am working on making libibnetdisc a parallel implementation. As a result I have found that _do_madrpc is not thread safe. The following patch fixes this. However, I don't know you want to do...
If one only uses mad_rpc and mad_rpc_rmpp then the patch works. However, if someone is using mad_send_via at the same time _do_madrpc will still fail. Is it by design that some responses will be lost while _do_madrpc is looking for it's response via the TID? Also, according to C13-18.1.1 and C13-19.1.1 you must use the SGID (or SLID) and the MgmtClass in addition to the TID to determine the uniqueness of a message. The SGID (or SLID) is of course the same but should the MgmtClass be checked here as well? Finally, why does _do_madrpc cast the transaction id to a 32 bit value? Confused, Ira From: Ira Weiny <[email protected]> Date: Wed, 8 Jul 2009 15:01:11 -0700 Subject: [PATCH] Lock _do_madrpc for thread safety Signed-off-by: Ira Weiny <[email protected]> --- libibmad/configure.in | 2 ++ libibmad/src/rpc.c | 18 +++++++++++++++--- 2 files changed, 17 insertions(+), 3 deletions(-) diff --git a/libibmad/configure.in b/libibmad/configure.in index b4f5c41..5d3e304 100644 --- a/libibmad/configure.in +++ b/libibmad/configure.in @@ -43,6 +43,8 @@ then AC_CHECK_HEADER(infiniband/umad.h, [], AC_MSG_ERROR([<infiniband/umad.h> not found. libibmad requires libibumad.]) ) +AC_CHECK_HEADER(pthread.h, [], + AC_MSG_ERROR([<pthread.h> not found. libibmad requires pthread.h])) fi dnl Checks for library functions diff --git a/libibmad/src/rpc.c b/libibmad/src/rpc.c index 07b623d..83bd4f0 100644 --- a/libibmad/src/rpc.c +++ b/libibmad/src/rpc.c @@ -43,6 +43,7 @@ #include <infiniband/umad.h> #include <infiniband/mad.h> +#include <pthread.h> #include "mad_internal.h" @@ -58,6 +59,8 @@ static int madrpc_timeout = MAD_DEF_TIMEOUT_MS; static void *save_mad; static int save_mad_len = 256; +static pthread_mutex_t snd_rcv_lock = PTHREAD_MUTEX_INITIALIZER; + #undef DEBUG #define DEBUG if (ibdebug) IBWARN #define ERRS(fmt, ...) do { \ @@ -127,6 +130,7 @@ static int _do_madrpc(int port_id, void *sndbuf, void *rcvbuf, int agentid, int len, int timeout, int max_retries) { + uint32_t trid_rcv; uint32_t trid; /* only low 32 bits */ int retries; int length, status; @@ -150,6 +154,8 @@ _do_madrpc(int port_id, void *sndbuf, void *rcvbuf, int agentid, int len, ERRS("retry %d (timeout %d ms)", retries, timeout); length = len; + + pthread_mutex_lock(&snd_rcv_lock); if (umad_send(port_id, agentid, sndbuf, length, timeout, 0) < 0) { IBWARN("send failed; %m"); return -1; @@ -159,6 +165,7 @@ _do_madrpc(int port_id, void *sndbuf, void *rcvbuf, int agentid, int len, /* send packet is lost somewhere. */ do { if (umad_recv(port_id, rcvbuf, &length, timeout) < 0) { + pthread_mutex_unlock(&snd_rcv_lock); IBWARN("recv failed: %m"); return -1; } @@ -168,9 +175,14 @@ _do_madrpc(int port_id, void *sndbuf, void *rcvbuf, int agentid, int len, xdump(stderr, "rcv buf\n", umad_get_mad(rcvbuf), IB_MAD_SIZE); } - } while ((uint32_t) - mad_get_field64(umad_get_mad(rcvbuf), 0, - IB_MAD_TRID_F) != trid); + + trid_rcv = (uint32_t) mad_get_field64(umad_get_mad(rcvbuf), + 0, IB_MAD_TRID_F); + if (trid_rcv != trid) + IBWARN("trid_rcv(%x) != trid (%x)\n", trid_rcv, trid); + + } while (trid_rcv != trid); + pthread_mutex_unlock(&snd_rcv_lock); status = umad_status(rcvbuf); if (!status) -- 1.5.4.5 _______________________________________________ general mailing list [email protected] http://lists.openfabrics.org/cgi-bin/mailman/listinfo/general To unsubscribe, please visit http://openib.org/mailman/listinfo/openib-general
