James and Steve, Here is revised patch that was tested (free and debug build versions) with dtest, dapltest, and Intel MPI test suites. The rdma_destroy_id will block until we acknowledge the event so there was no need to add our own wait objects for synchronization. This will never be called from the async event thread so there is no chance of deadlock conditions. I also made some changes to build with configure enable-debug. Some unused variables that were deleted are actually used in the debug messages. Please review the changes.
Steve, can you test this version and see if it works for your iWARP device. Thanks, -arlin Signed-off by: Arlin Davis [EMAIL PROTECTED] Index: dapl/openib_cma/dapl_ib_cm.c =================================================================== --- dapl/openib_cma/dapl_ib_cm.c (revision 6305) +++ dapl/openib_cma/dapl_ib_cm.c (working copy) @@ -62,9 +62,9 @@ /* local prototypes */ static struct dapl_cm_id * dapli_req_recv(struct dapl_cm_id *conn, struct rdma_cm_event *event); -static int dapli_cm_active_cb(struct dapl_cm_id *conn, +static void dapli_cm_active_cb(struct dapl_cm_id *conn, struct rdma_cm_event *event); -static int dapli_cm_passive_cb(struct dapl_cm_id *conn, +static void dapli_cm_passive_cb(struct dapl_cm_id *conn, struct rdma_cm_event *event); static void dapli_addr_resolve(struct dapl_cm_id *conn); static void dapli_route_resolve(struct dapl_cm_id *conn); @@ -87,7 +87,9 @@ static inline uint64_t cpu_to_be64(uint6 static void dapli_addr_resolve(struct dapl_cm_id *conn) { int ret; - +#ifdef DAPL_DBG + struct rdma_addr *ipaddr = &conn->cm_id->route.addr; +#endif dapl_dbg_log(DAPL_DBG_TYPE_CM, " addr_resolve: cm_id %p SRC %x DST %x\n", conn->cm_id, @@ -110,8 +112,10 @@ static void dapli_addr_resolve(struct da static void dapli_route_resolve(struct dapl_cm_id *conn) { int ret; - struct rdma_cm_id *cm_id = conn->cm_id; - +#ifdef DAPL_DBG + struct rdma_addr *ipaddr = &conn->cm_id->route.addr; + struct ib_addr *ibaddr = &conn->cm_id->route.addr.addr.ibaddr; +#endif dapl_dbg_log(DAPL_DBG_TYPE_CM, " route_resolve: cm_id %p SRC %x DST %x PORT %d\n", conn->cm_id, @@ -158,37 +162,51 @@ bail: NULL, conn->ep); } +/* + * Called from consumer thread via dat_ep_free(). + * CANNOT be called from the async event processing thread + * dapli_cma_event_cb() since a cm_id reference is held and + * a deadlock will occur. + */ void dapli_destroy_conn(struct dapl_cm_id *conn) { - int in_callback; + struct rdma_cm_id *cm_id; dapl_dbg_log(DAPL_DBG_TYPE_CM, " destroy_conn: conn %p id %d\n", conn,conn->cm_id); - + dapl_os_lock(&conn->lock); conn->destroy = 1; - in_callback = conn->in_callback; + + if (conn->ep) + conn->ep->cm_handle = IB_INVALID_HANDLE; + + cm_id = conn->cm_id; + conn->cm_id = NULL; dapl_os_unlock(&conn->lock); - if (!in_callback) { - if (conn->ep) - conn->ep->cm_handle = IB_INVALID_HANDLE; - if (conn->cm_id) { - if (conn->cm_id->qp) - rdma_destroy_qp(conn->cm_id); - rdma_destroy_id(conn->cm_id); - } - - conn->cm_id = NULL; - dapl_os_free(conn, sizeof(*conn)); + /* + * rdma_destroy_id will force synchronization with async CM event + * thread since it blocks until the in-process event reference + * is cleared during our event processing call exit. + */ + if (cm_id) { + if (cm_id->qp) + rdma_destroy_qp(cm_id); + + rdma_destroy_id(cm_id); } + dapl_os_free(conn, sizeof(*conn)); } static struct dapl_cm_id * dapli_req_recv(struct dapl_cm_id *conn, struct rdma_cm_event *event) { struct dapl_cm_id *new_conn; +#ifdef DAPL_DBG + struct rdma_addr *ipaddr = &event->id->route.addr; +#endif if (conn->sp == NULL) { dapl_dbg_log(DAPL_DBG_TYPE_ERR, @@ -239,11 +257,9 @@ static struct dapl_cm_id * dapli_req_rec return new_conn; } -static int dapli_cm_active_cb(struct dapl_cm_id *conn, +static void dapli_cm_active_cb(struct dapl_cm_id *conn, struct rdma_cm_event *event) { - int destroy; - dapl_dbg_log(DAPL_DBG_TYPE_CM, " active_cb: conn %p id %d event %d\n", conn, conn->cm_id, event->event ); @@ -251,9 +267,8 @@ static int dapli_cm_active_cb(struct dap dapl_os_lock(&conn->lock); if (conn->destroy) { dapl_os_unlock(&conn->lock); - return 0; + return; } - conn->in_callback = 1; dapl_os_unlock(&conn->lock); switch (event->event) { @@ -298,17 +313,12 @@ static int dapli_cm_active_cb(struct dap break; } - dapl_os_lock(&conn->lock); - destroy = conn->destroy; - conn->in_callback = conn->destroy; - dapl_os_unlock(&conn->lock); - return(destroy); + return; } -static int dapli_cm_passive_cb(struct dapl_cm_id *conn, +static void dapli_cm_passive_cb(struct dapl_cm_id *conn, struct rdma_cm_event *event) { - int destroy; struct dapl_cm_id *new_conn; dapl_dbg_log(DAPL_DBG_TYPE_CM, @@ -318,9 +328,8 @@ static int dapli_cm_passive_cb(struct da dapl_os_lock(&conn->lock); if (conn->destroy) { dapl_os_unlock(&conn->lock); - return 0; + return; } - conn->in_callback = 1; dapl_os_unlock(&conn->lock); switch (event->event) { @@ -372,11 +381,7 @@ static int dapli_cm_passive_cb(struct da break; } - dapl_os_lock(&conn->lock); - destroy = conn->destroy; - conn->in_callback = conn->destroy; - dapl_os_unlock(&conn->lock); - return(destroy); + return; } @@ -1008,16 +1013,12 @@ dapls_ib_get_cm_event(IN DAT_EVENT_NUMBE void dapli_cma_event_cb(void) { struct rdma_cm_event *event; - int ret; - - dapl_dbg_log(DAPL_DBG_TYPE_UTIL, " cm_event()\n"); - ret = rdma_get_cm_event(&event); + dapl_dbg_log(DAPL_DBG_TYPE_UTIL, " cm_event()\n"); /* process one CM event, fairness */ - if(!ret) { + if(!rdma_get_cm_event(&event)) { struct dapl_cm_id *conn; - int ret; /* set proper conn from cm_id context*/ if (event->event == RDMA_CM_EVENT_CONNECT_REQUEST) @@ -1055,26 +1056,9 @@ void dapli_cma_event_cb(void) case RDMA_CM_EVENT_DISCONNECTED: /* passive or active */ if (conn->sp) - ret = dapli_cm_passive_cb(conn,event); + dapli_cm_passive_cb(conn,event); else - ret = dapli_cm_active_cb(conn,event); - - /* destroy both qp and cm_id */ - if (ret) { - dapl_dbg_log(DAPL_DBG_TYPE_CM, - " cma_cb: DESTROY conn %p" - " cm_id %p qp %p\n", - conn, conn->cm_id, - conn->cm_id->qp); - - if (conn->cm_id->qp) - rdma_destroy_qp(conn->cm_id); - - rdma_ack_cm_event(event); - rdma_destroy_id(conn->cm_id); - dapl_os_free(conn, sizeof(*conn)); - return; - } + dapli_cm_active_cb(conn,event); break; case RDMA_CM_EVENT_CONNECT_RESPONSE: default: @@ -1084,12 +1068,9 @@ void dapli_cma_event_cb(void) event->id->context); break; } + /* ack event, unblocks destroy_cm_id in consumer threads */ rdma_ack_cm_event(event); - } else { - dapl_dbg_log(DAPL_DBG_TYPE_CM, - " cm_event: ERROR: rdma_get_cm_event() %d %d %s\n", - ret, errno, strerror(errno)); - } + } } /* _______________________________________________ openib-general mailing list openib-general@openib.org http://openib.org/mailman/listinfo/openib-general To unsubscribe, please visit http://openib.org/mailman/listinfo/openib-general