James,

Here is a small uDAPL patch that should go into 1.0. It fixes some issues that
we just found during MPI scale-out testing on OpenIB: the QP was not being
destroyed in some cases, and there were hca_close issues with the async work
thread. I am still working on one other elusive disconnect problem that may
require another small patch.

Thanks,

-arlin

Signed-off-by: Arlin Davis <[EMAIL PROTECTED]>

Index: dapl/openib_cma/dapl_ib_util.c
===================================================================
--- dapl/openib_cma/dapl_ib_util.c      (revision 5489)
+++ dapl/openib_cma/dapl_ib_util.c      (working copy)
@@ -330,6 +330,13 @@ DAT_RETURN dapls_ib_close_hca(IN DAPL_HC
                hca_ptr->ib_hca_handle = IB_INVALID_HANDLE;
        }
 
+       dapl_os_lock(&g_hca_lock);
+       if (g_ib_thread_state != IB_THREAD_RUN) {
+               dapl_os_unlock(&g_hca_lock);
+               goto bail;
+       }
+       dapl_os_unlock(&g_hca_lock);
+
        /* 
         * Remove hca from async and CQ event processing list
         * Wakeup work thread to remove from polling list
@@ -342,10 +349,12 @@ DAT_RETURN dapls_ib_close_hca(IN DAPL_HC
                struct timespec sleep, remain;
                sleep.tv_sec = 0;
                sleep.tv_nsec = 10000000; /* 10 ms */
+               write(g_ib_pipe[1], "w", sizeof "w");
                dapl_dbg_log(DAPL_DBG_TYPE_UTIL, 
                             " ib_thread_destroy: wait on hca %p destroy\n");
                nanosleep (&sleep, &remain);
        }
+bail:
        return (DAT_SUCCESS);
 }
   
Index: dapl/openib_cma/dapl_ib_cm.c
===================================================================
--- dapl/openib_cma/dapl_ib_cm.c        (revision 5489)
+++ dapl/openib_cma/dapl_ib_cm.c        (working copy)
@@ -306,15 +306,6 @@ static int dapli_cm_active_cb(struct dap
        destroy = conn->destroy;
        conn->in_callback = conn->destroy;
        dapl_os_unlock(&conn->lock);
-       if (destroy) {
-               dapl_dbg_log(DAPL_DBG_TYPE_CM, 
-                            " active_cb: DESTROY conn %p id %d \n",
-                            conn, conn->cm_id );
-               if (conn->ep)
-                       conn->ep->cm_handle = IB_INVALID_HANDLE;
-               
-               dapl_os_free(conn, sizeof(*conn));
-       }
        return(destroy);
 }
 
@@ -389,12 +380,6 @@ static int dapli_cm_passive_cb(struct da
        destroy = conn->destroy;
        conn->in_callback = conn->destroy;
        dapl_os_unlock(&conn->lock);
-       if (destroy) {
-               if (conn->ep)
-                       conn->ep->cm_handle = IB_INVALID_HANDLE;
-
-               dapl_os_free(conn, sizeof(*conn));
-       }
        return(destroy);
 }
 
@@ -1080,10 +1065,21 @@ void dapli_cma_event_cb(void)
                                ret = dapli_cm_passive_cb(conn,event);
                        else 
                                ret = dapli_cm_active_cb(conn,event);
-                       
-                       if (ret) 
+
+                       /* destroy both qp and cm_id */
+                       if (ret) {
+                               dapl_dbg_log(DAPL_DBG_TYPE_CM, 
+                                            " cma_cb: DESTROY conn %p" 
+                                            " cm_id %p qp %p\n",
+                                            conn, conn->cm_id, 
+                                            conn->cm_id->qp);
+       
+                               if (conn->cm_id->qp)
+                                       rdma_destroy_qp(conn->cm_id);
+
                                rdma_destroy_id(conn->cm_id);
-                       
+                               dapl_os_free(conn, sizeof(*conn));
+                       }
                        break;
                case RDMA_CM_EVENT_CONNECT_RESPONSE:
                default:
@@ -1095,7 +1091,7 @@ void dapli_cma_event_cb(void)
                }
                rdma_ack_cm_event(event);
        } else {
-               dapl_dbg_log(DAPL_DBG_TYPE_WARN,
+               dapl_dbg_log(DAPL_DBG_TYPE_CM,
                        " cm_event: ERROR: rdma_get_cm_event() %d %d %s\n",
                        ret, errno, strerror(errno));
        }
Index: dapl/openib_cma/dapl_ib_util.h
===================================================================
--- dapl/openib_cma/dapl_ib_util.h      (revision 5489)
+++ dapl/openib_cma/dapl_ib_util.h      (working copy)
@@ -295,7 +295,8 @@ dapl_convert_errno( IN int err, IN const
     if (!err)  return DAT_SUCCESS;
        
 #if DAPL_DBG
-    if ((err != EAGAIN) && (err != ETIME) && (err != ETIMEDOUT))
+    if ((err != EAGAIN) && (err != ETIME) && 
+       (err != ETIMEDOUT) && (err != EINTR))
        dapl_dbg_log (DAPL_DBG_TYPE_ERR," %s %s\n", str, strerror(err));
 #endif 
 
Index: dapl/openib_cma/dapl_ib_cq.c
===================================================================
--- dapl/openib_cma/dapl_ib_cq.c        (revision 5489)
+++ dapl/openib_cma/dapl_ib_cq.c        (working copy)
@@ -498,7 +498,10 @@ dapls_ib_wait_object_wait(IN ib_wait_obj
        if (timeout != DAT_TIMEOUT_INFINITE)
                timeout_ms = timeout/1000;
 
-       status = poll(&cq_fd, 1, timeout_ms);
+       /* restart syscall */
+       while ((status = poll(&cq_fd, 1, timeout_ms)) == -1 )
+               if (errno == EINTR)
+                       continue;
 
        /* returned event */
        if (status > 0) {
@@ -511,13 +514,15 @@ dapls_ib_wait_object_wait(IN ib_wait_obj
        /* timeout */
        } else if (status == 0) 
                status = ETIMEDOUT;
+       else 
+               status = errno;
        
        dapl_dbg_log(DAPL_DBG_TYPE_UTIL, 
                     " cq_object_wait: RET evd %p ibv_cq %p ibv_ctx %p %s\n",
                     evd_ptr, ibv_cq,ibv_ctx,strerror(errno));
        
        return(dapl_convert_errno(status,"cq_wait_object_wait"));
-       
+
 }
 #endif
 


Attachment: udapl_patch_1.0
Description: Binary data

_______________________________________________
openib-general mailing list
openib-general@openib.org
http://openib.org/mailman/listinfo/openib-general

To unsubscribe, please visit http://openib.org/mailman/listinfo/openib-general

Reply via email to