Here is version 2 of the patch.  Based on what I observed in testing, I believe
Steve Wise's comments are reasonable, so I removed the rdma_resolve_addr
retry and simply increased the timeout value.  Feel free to use whichever one
of these patches you like best.  However, I urge you to apply one of them,
since the programs fail in a large, busy fabric.

Dave


  The timeouts in rdma_resolve_addr and rdma_resolve_route actually happen in
  some fabrics.  Steve Wise pointed out that increasing the rdma_resolve_addr
  timeout value should be as good as doing retries, so this patch changes
  that timeout from 2 seconds to 20 seconds.  As far as getting path records,
  there is no retry in rdma_cm, so this patch adds up to 10 process-level
  retries of the rdma_resolve_route call.

Signed-off-by: David A. McMillen <[email protected]>
---
 rdma_bw.c  |   10 +++++++++-
 rdma_lat.c |   10 +++++++++-
 2 files changed, 18 insertions(+), 2 deletions(-)

diff --git a/rdma_bw.c b/rdma_bw.c
index 2628ac4..84ccf94 100755
--- a/rdma_bw.c
+++ b/rdma_bw.c
@@ -131,6 +131,7 @@ static struct pingpong_context *pp_client_connect(struct 
pp_data *data)
        char *service;
        int n;
        int sockfd = -1;
+       int n_retries = 10;
        struct rdma_cm_event *event;
        struct sockaddr_in sin;
        struct pingpong_context *ctx = NULL;
@@ -153,7 +154,7 @@ static struct pingpong_context *pp_client_connect(struct 
pp_data *data)
                sin.sin_family = AF_INET;
                sin.sin_port = htons(data->port);
                if (rdma_resolve_addr(data->cm_id, NULL,
-                                        (struct sockaddr *)&sin, 2000)) {
+                                        (struct sockaddr *)&sin, 20000)) {
                        fprintf(stderr, "%d:%s: rdma_resolve_addr failed\n",
                                         pid, __func__ );
                        goto err2;
@@ -169,6 +170,7 @@ static struct pingpong_context *pp_client_connect(struct 
pp_data *data)
                }
                rdma_ack_cm_event(event);
        
+retry_route:
                if (rdma_resolve_route(data->cm_id, 2000)) {
                        fprintf(stderr, "%d:%s: rdma_resolve_route failed\n", 
                                                pid, __func__);
@@ -178,6 +180,12 @@ static struct pingpong_context *pp_client_connect(struct 
pp_data *data)
                if (rdma_get_cm_event(data->cm_channel, &event))
                        goto err2;
 
+               if (event->event == RDMA_CM_EVENT_ROUTE_ERROR
+                && n_retries-- > 0) {
+                       rdma_ack_cm_event(event);
+                       goto retry_route;
+               }
+
                if (event->event != RDMA_CM_EVENT_ROUTE_RESOLVED) {
                        fprintf(stderr, "%d:%s: unexpected CM event %d\n", 
                                        pid, __func__, event->event);
diff --git a/rdma_lat.c b/rdma_lat.c
index 3681b35..818e924 100755
--- a/rdma_lat.c
+++ b/rdma_lat.c
@@ -207,6 +207,7 @@ static struct pingpong_context *pp_client_connect(struct 
pp_data *data)
        char *service;
        int n;
        int sockfd = -1;
+       int n_retries = 10;
        struct rdma_cm_event *event;
        struct sockaddr_in sin;
        struct pingpong_context *ctx = NULL;
@@ -229,7 +230,7 @@ static struct pingpong_context *pp_client_connect(struct 
pp_data *data)
                sin.sin_family = AF_INET;
                sin.sin_port = htons(data->port);
                if (rdma_resolve_addr(data->cm_id, NULL,
-                                        (struct sockaddr *)&sin, 2000)) {
+                                        (struct sockaddr *)&sin, 20000)) {
                        fprintf(stderr, "%d:%s: rdma_resolve_addr failed\n",
                                         pid, __func__ );
                        goto err2;
@@ -245,6 +246,7 @@ static struct pingpong_context *pp_client_connect(struct 
pp_data *data)
                }
                rdma_ack_cm_event(event);
        
+retry_route:
                if (rdma_resolve_route(data->cm_id, 2000)) {
                        fprintf(stderr, "%d:%s: rdma_resolve_route failed\n", 
                                                pid, __func__);
@@ -254,6 +256,12 @@ static struct pingpong_context *pp_client_connect(struct 
pp_data *data)
                if (rdma_get_cm_event(data->cm_channel, &event))
                        goto err2;
 
+               if (event->event == RDMA_CM_EVENT_ROUTE_ERROR
+                && n_retries-- > 0) {
+                       rdma_ack_cm_event(event);
+                       goto retry_route;
+               }
+
                if (event->event != RDMA_CM_EVENT_ROUTE_RESOLVED) {
                        fprintf(stderr, "%d:%s: unexpected CM event %d\n", 
                                        pid, __func__, event->event);
_______________________________________________
general mailing list
[email protected]
http://lists.openfabrics.org/cgi-bin/mailman/listinfo/general

To unsubscribe, please visit http://openib.org/mailman/listinfo/openib-general

Reply via email to