Fix some timeout and long disconnect delay issues discovered during scale-out testing. Add support for retrying rdma_cm address and route resolution, with configuration options. Call rdma_disconnect() when a disconnect request is received, to guarantee a disconnect reply and event on the remote side. Previously, rdma_disconnect() was not being called from dat_ep_disconnect() because the event callback had already changed the state to DISCONNECTED. The new options (environment variables) and their default settings are: DAPL_CM_ARP_TIMEOUT_MS = 4000, DAPL_CM_ARP_RETRY_COUNT = 15, DAPL_CM_ROUTE_TIMEOUT_MS = 4000, DAPL_CM_ROUTE_RETRY_COUNT = 15. Signed-off-by: Arlin Davis [EMAIL PROTECTED]
Index: dapl/openib_cma/dapl_ib_cm.c =================================================================== --- dapl/openib_cma/dapl_ib_cm.c (revision 10032) +++ dapl/openib_cma/dapl_ib_cm.c (working copy) @@ -58,6 +58,9 @@ #include "dapl_ib_util.h" #include <sys/poll.h> #include <signal.h> +#include <sys/socket.h> +#include <netinet/in.h> +#include <arpa/inet.h> #include <rdma/rdma_cma_ib.h> extern struct rdma_event_channel *g_cm_events; @@ -99,8 +102,8 @@ static void dapli_addr_resolve(struct da &ipaddr->src_addr)->sin_addr.s_addr), ntohl(((struct sockaddr_in *) &ipaddr->dst_addr)->sin_addr.s_addr)); - - ret = rdma_resolve_route(conn->cm_id, 2000); + + ret = rdma_resolve_route(conn->cm_id, conn->route_timeout); if (ret) { dapl_dbg_log(DAPL_DBG_TYPE_ERR, " rdma_connect failed: %s\n",strerror(errno)); @@ -120,6 +123,7 @@ static void dapli_route_resolve(struct d struct rdma_addr *ipaddr = &conn->cm_id->route.addr; struct ib_addr *ibaddr = &conn->cm_id->route.addr.addr.ibaddr; #endif + dapl_dbg_log(DAPL_DBG_TYPE_CM, " route_resolve: cm_id %p SRC %x DST %x PORT %d\n", conn->cm_id, @@ -331,21 +335,17 @@ static void dapli_cm_active_cb(struct da case RDMA_CM_EVENT_UNREACHABLE: case RDMA_CM_EVENT_CONNECT_ERROR: { - ib_cm_events_t cm_event; - dapl_dbg_log( + dapl_dbg_log( DAPL_DBG_TYPE_WARN, " dapli_cm_active_handler: CONN_ERR " " event=0x%x status=%d %s\n", event->event, event->status, (event->status == -ETIMEDOUT)?"TIMEOUT":"" ); - /* no device type specified so assume IB for now */ - if (event->status == -ETIMEDOUT) /* IB timeout */ - cm_event = IB_CME_TIMEOUT; - else - cm_event = IB_CME_DESTINATION_UNREACHABLE; - - dapl_evd_connection_callback(conn, cm_event, NULL, conn->ep); + /* per DAT SPEC provider always returns UNREACHABLE */ + dapl_evd_connection_callback(conn, + IB_CME_DESTINATION_UNREACHABLE, + NULL, conn->ep); break; } case RDMA_CM_EVENT_REJECTED: @@ -381,6 +381,7 @@ static void dapli_cm_active_cb(struct da break; case RDMA_CM_EVENT_DISCONNECTED: + 
rdma_disconnect(conn->cm_id); /* force the DREP */ /* validate EP handle */ if (!DAPL_BAD_HANDLE(conn->ep, DAPL_MAGIC_EP)) dapl_evd_connection_callback(conn, @@ -494,6 +495,7 @@ static void dapli_cm_passive_cb(struct d break; case RDMA_CM_EVENT_DISCONNECTED: + rdma_disconnect(conn->cm_id); /* force the DREP */ /* validate SP handle context */ if (!DAPL_BAD_HANDLE(conn->sp, DAPL_MAGIC_PSP) || !DAPL_BAD_HANDLE(conn->sp, DAPL_MAGIC_RSP)) @@ -543,7 +545,8 @@ DAT_RETURN dapls_ib_connect(IN DAT_EP_HA IN void *p_data) { struct dapl_ep *ep_ptr = ep_handle; - + struct dapl_cm_id *conn; + /* Sanity check */ if (NULL == ep_ptr) return DAT_SUCCESS; @@ -552,36 +555,38 @@ DAT_RETURN dapls_ib_connect(IN DAT_EP_HA r_qual,p_data,p_size); /* rdma conn and cm_id pre-bound; reference via qp_handle */ - ep_ptr->cm_handle = ep_ptr->qp_handle; + conn = ep_ptr->cm_handle = ep_ptr->qp_handle; /* Setup QP/CM parameters and private data in cm_id */ - (void)dapl_os_memzero(&ep_ptr->cm_handle->params, - sizeof(ep_ptr->cm_handle->params)); - ep_ptr->cm_handle->params.responder_resources = IB_TARGET_MAX; - ep_ptr->cm_handle->params.initiator_depth = IB_INITIATOR_DEPTH; - ep_ptr->cm_handle->params.flow_control = 1; - ep_ptr->cm_handle->params.rnr_retry_count = IB_RNR_RETRY_COUNT; - ep_ptr->cm_handle->params.retry_count = IB_RC_RETRY_COUNT; + (void)dapl_os_memzero(&conn->params, sizeof(conn->params)); + conn->params.responder_resources = IB_TARGET_MAX; + conn->params.initiator_depth = IB_INITIATOR_DEPTH; + conn->params.flow_control = 1; + conn->params.rnr_retry_count = IB_RNR_RETRY_COUNT; + conn->params.retry_count = IB_RC_RETRY_COUNT; if (p_size) { - dapl_os_memcpy(ep_ptr->cm_handle->p_data, p_data, p_size); - ep_ptr->cm_handle->params.private_data = - ep_ptr->cm_handle->p_data; - ep_ptr->cm_handle->params.private_data_len = p_size; + dapl_os_memcpy(conn->p_data, p_data, p_size); + conn->params.private_data = conn->p_data; + conn->params.private_data_len = p_size; } + /* copy in remote address, 
need a copy for retry attempts */ + dapl_os_memcpy(&conn->r_addr, r_addr, sizeof(*r_addr)); + /* Resolve remote address, src already bound during QP create */ - ((struct sockaddr_in*)r_addr)->sin_port = htons(MAKE_PORT(r_qual)); - if (rdma_resolve_addr(ep_ptr->cm_handle->cm_id, - NULL, (struct sockaddr *)r_addr, 2000)) + ((struct sockaddr_in*)&conn->r_addr)->sin_port = htons(MAKE_PORT(r_qual)); + ((struct sockaddr_in*)&conn->r_addr)->sin_family = AF_INET; + + if (rdma_resolve_addr(conn->cm_id, NULL, + (struct sockaddr *)&conn->r_addr, + conn->arp_timeout)) return dapl_convert_errno(errno,"ib_connect"); dapl_dbg_log(DAPL_DBG_TYPE_CM, - " connect: resolve_addr: cm_id %p SRC %x DST %x port %d\n", - ep_ptr->cm_handle->cm_id, - ntohl(((struct sockaddr_in *) - &ep_ptr->cm_handle->hca->hca_address)->sin_addr.s_addr), - ntohl(((struct sockaddr_in *)r_addr)->sin_addr.s_addr), - MAKE_PORT(r_qual) ); + " connect: resolve_addr: cm_id %p -> %s port %d\n", + conn->cm_id, + inet_ntoa(((struct sockaddr_in *)&conn->r_addr)->sin_addr), + ((struct sockaddr_in*)&conn->r_addr)->sin_port ); return DAT_SUCCESS; } @@ -1163,15 +1168,60 @@ void dapli_cma_event_cb(void) case RDMA_CM_EVENT_ADDR_RESOLVED: dapli_addr_resolve(conn); break; + case RDMA_CM_EVENT_ROUTE_RESOLVED: dapli_route_resolve(conn); break; + case RDMA_CM_EVENT_ADDR_ERROR: + dapl_dbg_log(DAPL_DBG_TYPE_WARN, + " CM ADDR ERROR: -> %s retry (%d)..\n", + inet_ntoa(((struct sockaddr_in *) + &conn->r_addr)->sin_addr), + conn->arp_retries); + + /* retry address resolution */ + if ((--conn->arp_retries) && + (event->status == -ETIMEDOUT)) { + int ret; + ret = rdma_resolve_addr( + conn->cm_id, NULL, + (struct sockaddr *)&conn->r_addr, + conn->arp_timeout); + if (!ret) + break; + else { + dapl_dbg_log( + DAPL_DBG_TYPE_WARN, + " ERROR: rdma_resolve_addr = " + "%d %s\n", + ret,strerror(errno)); + } + } + /* retries exhausted or resolve_addr failed */ + dapl_evd_connection_callback( + conn, IB_CME_DESTINATION_UNREACHABLE, + NULL, 
conn->ep); + break; + + case RDMA_CM_EVENT_ROUTE_ERROR: - dapl_evd_connection_callback(conn, - IB_CME_DESTINATION_UNREACHABLE, - NULL, conn->ep); + dapl_dbg_log(DAPL_DBG_TYPE_WARN, + " CM ROUTE ERROR: -> %s retry (%d)..\n", + inet_ntoa(((struct sockaddr_in *) + &conn->r_addr)->sin_addr), + conn->route_retries ); + + /* retry route resolution */ + if ((--conn->route_retries) && + (event->status == -ETIMEDOUT)) + dapli_addr_resolve(conn); + else + dapl_evd_connection_callback( conn, + IB_CME_DESTINATION_UNREACHABLE, + NULL, conn->ep); break; + case RDMA_CM_EVENT_DEVICE_REMOVAL: dapl_evd_connection_callback(conn, IB_CME_LOCAL_FAILURE, Index: dapl/openib_cma/dapl_ib_qp.c =================================================================== --- dapl/openib_cma/dapl_ib_qp.c (revision 10032) +++ dapl/openib_cma/dapl_ib_qp.c (working copy) @@ -160,6 +168,17 @@ DAT_RETURN dapls_ib_qp_alloc(IN DAPL_IA conn->cm_id = cm_id; conn->ep = ep_ptr; conn->hca = ia_ptr->hca_ptr; + + /* setup timers for address and route resolution */ + conn->arp_timeout = dapl_os_get_env_val("DAPL_CM_ARP_TIMEOUT_MS", + IB_ARP_TIMEOUT); + conn->arp_retries = dapl_os_get_env_val("DAPL_CM_ARP_RETRY_COUNT", + IB_ARP_RETRY_COUNT); + conn->route_timeout = dapl_os_get_env_val("DAPL_CM_ROUTE_TIMEOUT_MS", + IB_ROUTE_TIMEOUT); + conn->route_retries = dapl_os_get_env_val("DAPL_CM_ROUTE_RETRY_COUNT", + IB_ROUTE_RETRY_COUNT); + ep_ptr->qp_handle = conn; ep_ptr->qp_state = IB_QP_STATE_INIT; Index: dapl/openib_cma/dapl_ib_util.h =================================================================== --- dapl/openib_cma/dapl_ib_util.h (revision 10032) +++ dapl/openib_cma/dapl_ib_util.h (working copy) @@ -67,8 +67,12 @@ typedef ib_hca_handle_t dapl_ibal_ca_t; #define IB_RC_RETRY_COUNT 7 #define IB_RNR_RETRY_COUNT 7 -#define IB_CM_RESPONSE_TIMEOUT 20 /* 4 sec */ -#define IB_CM_RETRIES 15 +#define IB_CM_RESPONSE_TIMEOUT 23 /* 16 sec */ +#define IB_CM_RETRIES 15 /* 240 sec total default */ +#define IB_ARP_TIMEOUT 4000 /* 4 sec 
*/ +#define IB_ARP_RETRY_COUNT 15 /* 60 sec total */ +#define IB_ROUTE_TIMEOUT 4000 /* 4 sec */ +#define IB_ROUTE_RETRY_COUNT 15 /* 60 sec total */ #define IB_REQ_MRA_TIMEOUT 27 /* a little over 9 minutes */ #define IB_MAX_AT_RETRY 3 #define IB_TARGET_MAX 4 /* max_qp_ous_rd_atom */ @@ -177,12 +181,17 @@ struct ib_llist_entry struct dapl_cm_id { DAPL_OS_LOCK lock; int destroy; + int arp_retries; + int arp_timeout; + int route_retries; + int route_timeout; int in_callback; struct rdma_cm_id *cm_id; struct dapl_hca *hca; struct dapl_sp *sp; struct dapl_ep *ep; struct rdma_conn_param params; + DAT_SOCK_ADDR6 r_addr; int p_len; unsigned char p_data[IB_MAX_DREP_PDATA_SIZE]; }; _______________________________________________ openib-general mailing list openib-general@openib.org http://openib.org/mailman/listinfo/openib-general To unsubscribe, please visit http://openib.org/mailman/listinfo/openib-general