Sasha,

I am working on making libibnetdisc a parallel implementation.  As a result I
have found that _do_madrpc is not thread safe.  The following patch fixes
this.  However, I don't know you want to do...

If one only uses mad_rpc and mad_rpc_rmpp then the patch works.  However, if
someone is using mad_send_via at the same time _do_madrpc will still fail.  Is
it by design that some responses will be lost while _do_madrpc is looking for
it's response via the TID?
   
Also, according to C13-18.1.1 and C13-19.1.1 you must use the SGID (or SLID)
and the MgmtClass in addition to the TID to determine the uniqueness of a
message.  The SGID (or SLID) is of course the same but should the MgmtClass be
checked here as well?

Finally, why does _do_madrpc cast the transaction id to a 32 bit value?

Confused,
Ira

From: Ira Weiny <[email protected]>
Date: Wed, 8 Jul 2009 15:01:11 -0700
Subject: [PATCH] Lock _do_madrpc for thread safety


Signed-off-by: Ira Weiny <[email protected]>
---
 libibmad/configure.in |    2 ++
 libibmad/src/rpc.c    |   18 +++++++++++++++---
 2 files changed, 17 insertions(+), 3 deletions(-)

diff --git a/libibmad/configure.in b/libibmad/configure.in
index b4f5c41..5d3e304 100644
--- a/libibmad/configure.in
+++ b/libibmad/configure.in
@@ -43,6 +43,8 @@ then
 AC_CHECK_HEADER(infiniband/umad.h, [],
        AC_MSG_ERROR([<infiniband/umad.h> not found. libibmad requires 
libibumad.])
 )
+AC_CHECK_HEADER(pthread.h, [],
+       AC_MSG_ERROR([<pthread.h> not found.  libibmad requires pthread.h]))
 fi
 
 dnl Checks for library functions
diff --git a/libibmad/src/rpc.c b/libibmad/src/rpc.c
index 07b623d..83bd4f0 100644
--- a/libibmad/src/rpc.c
+++ b/libibmad/src/rpc.c
@@ -43,6 +43,7 @@
 
 #include <infiniband/umad.h>
 #include <infiniband/mad.h>
+#include <pthread.h>
 
 #include "mad_internal.h"
 
@@ -58,6 +59,8 @@ static int madrpc_timeout = MAD_DEF_TIMEOUT_MS;
 static void *save_mad;
 static int save_mad_len = 256;
 
+static pthread_mutex_t snd_rcv_lock = PTHREAD_MUTEX_INITIALIZER;
+
 #undef DEBUG
 #define DEBUG  if (ibdebug)    IBWARN
 #define ERRS(fmt, ...) do {    \
@@ -127,6 +130,7 @@ static int
 _do_madrpc(int port_id, void *sndbuf, void *rcvbuf, int agentid, int len,
           int timeout, int max_retries)
 {
+       uint32_t trid_rcv;
        uint32_t trid;          /* only low 32 bits */
        int retries;
        int length, status;
@@ -150,6 +154,8 @@ _do_madrpc(int port_id, void *sndbuf, void *rcvbuf, int 
agentid, int len,
                        ERRS("retry %d (timeout %d ms)", retries, timeout);
 
                length = len;
+
+               pthread_mutex_lock(&snd_rcv_lock);
                if (umad_send(port_id, agentid, sndbuf, length, timeout, 0) < 
0) {
                        IBWARN("send failed; %m");
                        return -1;
@@ -159,6 +165,7 @@ _do_madrpc(int port_id, void *sndbuf, void *rcvbuf, int 
agentid, int len,
                /* send packet is lost somewhere. */
                do {
                        if (umad_recv(port_id, rcvbuf, &length, timeout) < 0) {
+                               pthread_mutex_unlock(&snd_rcv_lock);
                                IBWARN("recv failed: %m");
                                return -1;
                        }
@@ -168,9 +175,14 @@ _do_madrpc(int port_id, void *sndbuf, void *rcvbuf, int 
agentid, int len,
                                xdump(stderr, "rcv buf\n", umad_get_mad(rcvbuf),
                                      IB_MAD_SIZE);
                        }
-               } while ((uint32_t)
-                        mad_get_field64(umad_get_mad(rcvbuf), 0,
-                                        IB_MAD_TRID_F) != trid);
+
+                       trid_rcv = (uint32_t) 
mad_get_field64(umad_get_mad(rcvbuf),
+                                       0, IB_MAD_TRID_F);
+                       if (trid_rcv != trid)
+                               IBWARN("trid_rcv(%x) != trid (%x)\n", trid_rcv, 
trid);
+
+               } while (trid_rcv != trid);
+               pthread_mutex_unlock(&snd_rcv_lock);
 
                status = umad_status(rcvbuf);
                if (!status)
-- 
1.5.4.5

_______________________________________________
general mailing list
[email protected]
http://lists.openfabrics.org/cgi-bin/mailman/listinfo/general

To unsubscribe, please visit http://openib.org/mailman/listinfo/openib-general

Reply via email to