Here is version 3 of the patch. Judging from Steve's and Sean's comments, there seems to be no universally accepted answer, which is why it would be nice if the underlying system could provide good defaults for user-mode programs. However, that isn't available yet, and I am not prepared to try to create such a thing, so I have redone this patch to allow the timeout values and retry counts to be specified on the command line. The default timeout values are the same as in the original code (2000 ms), and the default retry counts are both 10.
Dave The timeouts in rdma_resolve_addr and rdma_resolve_route actually happen in some fabrics. This adds command line options to set the number of retries for each of the calls, with a default of 10. Since there may be cases where larger timeouts are desired, probably along with fewer retries, this patch also adds the ability to specify the timeout values on the command line. If none of the command line options are chosen, it will now do the retries and not fail in the larger and busier fabrics. Signed-off-by: David A. McMillen <[email protected]> --- rdma_bw.c | 52 ++++++++++++++++++++++++++++++++++++++++++++++++- rdma_lat.c | 62 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++- 2 files changed, 110 insertions(+), 4 deletions(-) diff --git a/rdma_bw.c b/rdma_bw.c index 2628ac4..14ff80b 100755 --- a/rdma_bw.c +++ b/rdma_bw.c @@ -61,6 +61,10 @@ #define PINGPONG_RDMA_WRID 3 static int sl = 0; +static int addr_timeout = 2000; +static int addr_retries = 10; +static int route_timeout = 2000; +static int route_retries = 10; static int page_size; static pid_t pid; @@ -152,8 +156,9 @@ static struct pingpong_context *pp_client_connect(struct pp_data *data) sin.sin_addr.s_addr = ((struct sockaddr_in*)res->ai_addr)->sin_addr.s_addr; sin.sin_family = AF_INET; sin.sin_port = htons(data->port); +retry_addr: if (rdma_resolve_addr(data->cm_id, NULL, - (struct sockaddr *)&sin, 2000)) { + (struct sockaddr *)&sin, addr_retries)) { fprintf(stderr, "%d:%s: rdma_resolve_addr failed\n", pid, __func__ ); goto err2; @@ -162,6 +167,13 @@ static struct pingpong_context *pp_client_connect(struct pp_data *data) if (rdma_get_cm_event(data->cm_channel, &event)) goto err2; + + if (event->event == RDMA_CM_EVENT_ADDR_ERROR + && addr_retries-- > 0) { + rdma_ack_cm_event(event); + goto retry_addr; + } + if (event->event != RDMA_CM_EVENT_ADDR_RESOLVED) { fprintf(stderr, "%d:%s: unexpected CM event %d\n", pid, __func__, event->event); @@ -169,7 +181,8 @@ static struct pingpong_context 
*pp_client_connect(struct pp_data *data) } rdma_ack_cm_event(event); - if (rdma_resolve_route(data->cm_id, 2000)) { +retry_route: + if (rdma_resolve_route(data->cm_id, route_timeout)) { fprintf(stderr, "%d:%s: rdma_resolve_route failed\n", pid, __func__); goto err2; @@ -178,6 +191,12 @@ static struct pingpong_context *pp_client_connect(struct pp_data *data) if (rdma_get_cm_event(data->cm_channel, &event)) goto err2; + if (event->event == RDMA_CM_EVENT_ROUTE_ERROR + && route_retries-- > 0) { + rdma_ack_cm_event(event); + goto retry_route; + } + if (event->event != RDMA_CM_EVENT_ROUTE_RESOLVED) { fprintf(stderr, "%d:%s: unexpected CM event %d\n", pid, __func__, event->event); @@ -863,6 +882,10 @@ static void usage(const char *argv0) printf(" -S, --sl=<sl> SL (default 0)\n"); printf(" -b, --bidirectional measure bidirectional bandwidth (default unidirectional)\n"); printf(" -c, --cma use RDMA CM\n"); + printf(" --addr-timeout=<ms> RDMA CM resolve_addr timeout ms (default 2000)\n"); + printf(" --addr-retries=<num> RDMA CM resolve_addr retry count (default 10)\n"); + printf(" --route-timeout=<ms> RDMA CM resolve_route timeout ms (default 2000)\n"); + printf(" --route-retries=<num> RDMA CM resolve_route retry count (default 10)\n"); } static void print_report(unsigned int iters, unsigned size, int duplex, @@ -949,6 +972,10 @@ int main(int argc, char *argv[]) { .name = "sl", .has_arg = 1, .val = 'S' }, { .name = "bidirectional", .has_arg = 0, .val = 'b' }, { .name = "cma", .has_arg = 0, .val = 'c' }, + { .name = "addr-timeout", .has_arg = 1, .val = 1 }, + { .name = "addr-retries", .has_arg = 1, .val = 2 }, + { .name = "route-timeout", .has_arg = 1, .val = 3 }, + { .name = "route-retries", .has_arg = 1, .val = 4 }, { 0 } }; @@ -1011,6 +1038,27 @@ int main(int argc, char *argv[]) case 'c': data.use_cma = 1; break; + + case 1: + addr_timeout = strtol(optarg, NULL, 0); + if (addr_timeout <= 0) { usage(argv[0]); return 1; } + break; + + case 2: + addr_retries = strtol(optarg, 
NULL, 0); + if (addr_retries < 0) { usage(argv[0]); return 1; } + break; + + case 3: + route_timeout = strtol(optarg, NULL, 0); + if (route_timeout <= 0) { usage(argv[0]); return 1; } + break; + + case 4: + route_retries = strtol(optarg, NULL, 0); + if (route_retries < 0) { usage(argv[0]); return 1; } + break; + default: usage(argv[0]); return 1; diff --git a/rdma_lat.c b/rdma_lat.c index 3681b35..cb5a6e4 100755 --- a/rdma_lat.c +++ b/rdma_lat.c @@ -63,6 +63,10 @@ static int inline_size = MAX_INLINE; static int sl = 0; +static int addr_timeout = 2000; +static int addr_retries = 10; +static int route_timeout = 2000; +static int route_retries = 10; static int page_size; static pid_t pid; @@ -228,8 +232,9 @@ static struct pingpong_context *pp_client_connect(struct pp_data *data) sin.sin_addr.s_addr = ((struct sockaddr_in*)res->ai_addr)->sin_addr.s_addr; sin.sin_family = AF_INET; sin.sin_port = htons(data->port); +retry_addr: if (rdma_resolve_addr(data->cm_id, NULL, - (struct sockaddr *)&sin, 2000)) { + (struct sockaddr *)&sin, addr_timeout)) { fprintf(stderr, "%d:%s: rdma_resolve_addr failed\n", pid, __func__ ); goto err2; @@ -238,6 +243,12 @@ static struct pingpong_context *pp_client_connect(struct pp_data *data) if (rdma_get_cm_event(data->cm_channel, &event)) goto err2; + if (event->event == RDMA_CM_EVENT_ADDR_ERROR + && addr_retries-- > 0) { + rdma_ack_cm_event(event); + goto retry_addr; + } + if (event->event != RDMA_CM_EVENT_ADDR_RESOLVED) { fprintf(stderr, "%d:%s: unexpected CM event %d\n", pid, __func__, event->event); @@ -245,7 +256,8 @@ static struct pingpong_context *pp_client_connect(struct pp_data *data) } rdma_ack_cm_event(event); - if (rdma_resolve_route(data->cm_id, 2000)) { +retry_route: + if (rdma_resolve_route(data->cm_id, route_timeout)) { fprintf(stderr, "%d:%s: rdma_resolve_route failed\n", pid, __func__); goto err2; @@ -254,6 +266,12 @@ static struct pingpong_context *pp_client_connect(struct pp_data *data) if 
(rdma_get_cm_event(data->cm_channel, &event)) goto err2; + if (event->event == RDMA_CM_EVENT_ROUTE_ERROR + && route_retries-- > 0) { + rdma_ack_cm_event(event); + goto retry_route; + } + if (event->event != RDMA_CM_EVENT_ROUTE_RESOLVED) { fprintf(stderr, "%d:%s: unexpected CM event %d\n", pid, __func__, event->event); @@ -929,6 +947,10 @@ static void usage(const char *argv0) printf(" -H, --report-histogram print out all results (default print summary only)\n"); printf(" -U, --report-unsorted (implies -H) print out unsorted results (default sorted)\n"); printf(" -c, --cma Use the RDMA CMA to setup the RDMA connection\n"); + printf(" --addr-timeout=<ms> RDMA CM resolve_addr timeout ms (default 2000)\n"); + printf(" --addr-retries=<num> RDMA CM resolve_addr retry count (default 10)\n"); + printf(" --route-timeout=<ms> RDMA CM resolve_route timeout ms (default 2000)\n"); + printf(" --route-retries=<num> RDMA CM resolve_route retry count (default 10)\n"); } /* @@ -1052,6 +1074,10 @@ int main(int argc, char *argv[]) { .name = "report-histogram",.has_arg = 0, .val = 'H' }, { .name = "report-unsorted",.has_arg = 0, .val = 'U' }, { .name = "cma", .has_arg = 0, .val = 'c' }, + { .name = "addr-timeout", .has_arg = 1, .val = 1 }, + { .name = "addr-retries", .has_arg = 1, .val = 2 }, + { .name = "route-timeout", .has_arg = 1, .val = 3 }, + { .name = "route-retries", .has_arg = 1, .val = 4 }, { 0 } }; @@ -1123,6 +1149,38 @@ int main(int argc, char *argv[]) data.use_cma = 1; break; + case 1: + addr_timeout = strtol(optarg, NULL, 0); + if (addr_timeout <= 0) { + usage(argv[0]); + return 7; + } + break; + + case 2: + addr_retries = strtol(optarg, NULL, 0); + if (addr_retries < 0) { + usage(argv[0]); + return 7; + } + break; + + case 3: + route_timeout = strtol(optarg, NULL, 0); + if (route_timeout <= 0) { + usage(argv[0]); + return 7; + } + break; + + case 4: + route_retries = strtol(optarg, NULL, 0); + if (route_retries < 0) { + usage(argv[0]); + return 7; + } + break; + default: 
usage(argv[0]); return 7; _______________________________________________ general mailing list [email protected] http://lists.openfabrics.org/cgi-bin/mailman/listinfo/general To unsubscribe, please visit http://openib.org/mailman/listinfo/openib-general
