James, Here is a small uDAPL patch that should go into 1.0; it fixes some issues that we just found during MPI scale-out testing on OpenIB. The QP was not being destroyed in some cases, and there were hca_close issues with the async work thread. I am still working on one other elusive disconnect problem that may require another small patch.
Thanks, -arlin Signed-off by: Arlin Davis <[EMAIL PROTECTED]> Index: dapl/openib_cma/dapl_ib_util.c =================================================================== --- dapl/openib_cma/dapl_ib_util.c (revision 5489) +++ dapl/openib_cma/dapl_ib_util.c (working copy) @@ -330,6 +330,13 @@ DAT_RETURN dapls_ib_close_hca(IN DAPL_HC hca_ptr->ib_hca_handle = IB_INVALID_HANDLE; } + dapl_os_lock(&g_hca_lock); + if (g_ib_thread_state != IB_THREAD_RUN) { + dapl_os_unlock(&g_hca_lock); + goto bail; + } + dapl_os_unlock(&g_hca_lock); + /* * Remove hca from async and CQ event processing list * Wakeup work thread to remove from polling list @@ -342,10 +349,12 @@ DAT_RETURN dapls_ib_close_hca(IN DAPL_HC struct timespec sleep, remain; sleep.tv_sec = 0; sleep.tv_nsec = 10000000; /* 10 ms */ + write(g_ib_pipe[1], "w", sizeof "w"); dapl_dbg_log(DAPL_DBG_TYPE_UTIL, " ib_thread_destroy: wait on hca %p destroy\n"); nanosleep (&sleep, &remain); } +bail: return (DAT_SUCCESS); } Index: dapl/openib_cma/dapl_ib_cm.c =================================================================== --- dapl/openib_cma/dapl_ib_cm.c (revision 5489) +++ dapl/openib_cma/dapl_ib_cm.c (working copy) @@ -306,15 +306,6 @@ static int dapli_cm_active_cb(struct dap destroy = conn->destroy; conn->in_callback = conn->destroy; dapl_os_unlock(&conn->lock); - if (destroy) { - dapl_dbg_log(DAPL_DBG_TYPE_CM, - " active_cb: DESTROY conn %p id %d \n", - conn, conn->cm_id ); - if (conn->ep) - conn->ep->cm_handle = IB_INVALID_HANDLE; - - dapl_os_free(conn, sizeof(*conn)); - } return(destroy); } @@ -389,12 +380,6 @@ static int dapli_cm_passive_cb(struct da destroy = conn->destroy; conn->in_callback = conn->destroy; dapl_os_unlock(&conn->lock); - if (destroy) { - if (conn->ep) - conn->ep->cm_handle = IB_INVALID_HANDLE; - - dapl_os_free(conn, sizeof(*conn)); - } return(destroy); } @@ -1080,10 +1065,21 @@ void dapli_cma_event_cb(void) ret = dapli_cm_passive_cb(conn,event); else ret = dapli_cm_active_cb(conn,event); - - if (ret) + + 
/* destroy both qp and cm_id */ + if (ret) { + dapl_dbg_log(DAPL_DBG_TYPE_CM, + " cma_cb: DESTROY conn %p" + " cm_id %p qp %p\n", + conn, conn->cm_id, + conn->cm_id->qp); + + if (conn->cm_id->qp) + rdma_destroy_qp(conn->cm_id); + rdma_destroy_id(conn->cm_id); - + dapl_os_free(conn, sizeof(*conn)); + } break; case RDMA_CM_EVENT_CONNECT_RESPONSE: default: @@ -1095,7 +1091,7 @@ void dapli_cma_event_cb(void) } rdma_ack_cm_event(event); } else { - dapl_dbg_log(DAPL_DBG_TYPE_WARN, + dapl_dbg_log(DAPL_DBG_TYPE_CM, " cm_event: ERROR: rdma_get_cm_event() %d %d %s\n", ret, errno, strerror(errno)); } Index: dapl/openib_cma/dapl_ib_util.h =================================================================== --- dapl/openib_cma/dapl_ib_util.h (revision 5489) +++ dapl/openib_cma/dapl_ib_util.h (working copy) @@ -295,7 +295,8 @@ dapl_convert_errno( IN int err, IN const if (!err) return DAT_SUCCESS; #if DAPL_DBG - if ((err != EAGAIN) && (err != ETIME) && (err != ETIMEDOUT)) + if ((err != EAGAIN) && (err != ETIME) && + (err != ETIMEDOUT) && (err != EINTR)) dapl_dbg_log (DAPL_DBG_TYPE_ERR," %s %s\n", str, strerror(err)); #endif Index: dapl/openib_cma/dapl_ib_cq.c =================================================================== --- dapl/openib_cma/dapl_ib_cq.c (revision 5489) +++ dapl/openib_cma/dapl_ib_cq.c (working copy) @@ -498,7 +498,10 @@ dapls_ib_wait_object_wait(IN ib_wait_obj if (timeout != DAT_TIMEOUT_INFINITE) timeout_ms = timeout/1000; - status = poll(&cq_fd, 1, timeout_ms); + /* restart syscall */ + while ((status = poll(&cq_fd, 1, timeout_ms)) == -1 ) + if (errno == EINTR) + continue; /* returned event */ if (status > 0) { @@ -511,13 +514,15 @@ dapls_ib_wait_object_wait(IN ib_wait_obj /* timeout */ } else if (status == 0) status = ETIMEDOUT; + else + status = errno; dapl_dbg_log(DAPL_DBG_TYPE_UTIL, " cq_object_wait: RET evd %p ibv_cq %p ibv_ctx %p %s\n", evd_ptr, ibv_cq,ibv_ctx,strerror(errno)); return(dapl_convert_errno(status,"cq_wait_object_wait")); - + } #endif
udapl_patch_1.0
Description: Binary data
_______________________________________________ openib-general mailing list openib-general@openib.org http://openib.org/mailman/listinfo/openib-general To unsubscribe, please visit http://openib.org/mailman/listinfo/openib-general