The attached patch should resolve the long-pending issue tracked at https://svn.open-mpi.org/trac/ompi/ticket/1912. The issue: As part of OpenIB BTL creation we also create a set of SRQs, and the corresponding receive fragments are allocated and posted on all SRQs. This means that a process that has no active communication will still keep a bunch of unused memory posted on its SRQs.

The solution: The patch modifies the openib BTL to pre-post only a very limited number of receive fragments on each SRQ. If the number of receive buffers turns out not to be enough, the openib BTL will get an SRQ limit event and will pre-post additional fragments.



Please review.

diff -r a5938d9dcada ompi/mca/btl/openib/btl_openib.c
--- a/ompi/mca/btl/openib/btl_openib.c  Mon Nov 23 19:00:16 2009 -0800
+++ b/ompi/mca/btl/openib/btl_openib.c  Wed Dec 02 16:24:55 2009 +0200
@@ -214,6 +214,7 @@
 static int create_srq(mca_btl_openib_module_t *openib_btl)
 {
     int qp;
+    int32_t rd_num, rd_curr_num; 

     /* create the SRQ's */
     for(qp = 0; qp < mca_btl_openib_component.num_qps; qp++) {
@@ -242,6 +243,24 @@
                                                
ibv_get_device_name(openib_btl->device->ib_dev));
                 return OMPI_ERROR;
             }
+
+            rd_num = mca_btl_openib_component.qp_infos[qp].rd_num;
+            rd_curr_num = openib_btl->qps[qp].u.srq_qp.rd_curr_num = 
mca_btl_openib_component.qp_infos[qp].u.srq_qp.rd_init;
+
+            if(true == mca_btl_openib_component.enable_srq_resize) {
+                if(0 == rd_curr_num) {
+                    openib_btl->qps[qp].u.srq_qp.rd_curr_num = 1;
+                }
+
+                openib_btl->qps[qp].u.srq_qp.rd_low_local = rd_curr_num - 
(rd_curr_num >> 2);
+                openib_btl->qps[qp].u.srq_qp.srq_limit_event_flag = true;
+            } else {
+                openib_btl->qps[qp].u.srq_qp.rd_curr_num = rd_num;
+                openib_btl->qps[qp].u.srq_qp.rd_low_local = 
mca_btl_openib_component.qp_infos[qp].rd_low;
+                /* Not used in this case, but we don't need a garbage */
+                mca_btl_openib_component.qp_infos[qp].u.srq_qp.srq_limit = 0;
+                openib_btl->qps[qp].u.srq_qp.srq_limit_event_flag = false;
+            }
         }
     }

diff -r a5938d9dcada ompi/mca/btl/openib/btl_openib.h
--- a/ompi/mca/btl/openib/btl_openib.h  Mon Nov 23 19:00:16 2009 -0800
+++ b/ompi/mca/btl/openib/btl_openib.h  Wed Dec 02 16:24:55 2009 +0200
@@ -87,6 +87,12 @@

 struct mca_btl_openib_srq_qp_info_t {
     int32_t sd_max;
+    /* The init value for rd_curr_num variables of all SRQs */
+    int32_t rd_init;
+    /* The watermark, threshold - if the number of WQEs in SRQ is less then 
this value =>
+       the SRQ limit event (IBV_EVENT_SRQ_LIMIT_REACHED) will be generated on 
corresponding SRQ.
+       As result the maximal number of pre-posted WQEs on the SRQ will be 
increased */
+    int32_t srq_limit;
 }; typedef struct mca_btl_openib_srq_qp_info_t mca_btl_openib_srq_qp_info_t;

 struct mca_btl_openib_qp_info_t {
@@ -254,6 +260,8 @@
     ompi_free_list_t recv_user_free;
     /**< frags for coalesced massages */
     ompi_free_list_t send_free_coalesced;
+    /**< Whether we want a dynamically resizing srq, enabled by default */
+    bool enable_srq_resize;
 }; typedef struct mca_btl_openib_component_t mca_btl_openib_component_t;

 OMPI_MODULE_DECLSPEC extern mca_btl_openib_component_t 
mca_btl_openib_component;
@@ -348,6 +356,16 @@
     int32_t sd_credits;  /* the max number of outstanding sends on a QP when 
using SRQ */
                          /*  i.e. the number of frags that  can be outstanding 
(down counter) */
     opal_list_t pending_frags[2];    /**< list of high/low prio frags */
+    /**< The number of max rd that we can post in the current time.
+         The value may be increased in the IBV_EVENT_SRQ_LIMIT_REACHED
+         event handler. The value starts from (rd_num / 4) and increased up to 
rd_num */
+    int32_t rd_curr_num;
+    /**< We post additional WQEs only if a number of WQEs (in specific SRQ) is 
less of this value.
+         The value increased together with rd_curr_num. The value is unique 
for every SRQ. */
+    int32_t rd_low_local;
+    /**< The flag points if we want to get the 
+         IBV_EVENT_SRQ_LIMIT_REACHED events for dynamically resizing SRQ */
+    bool srq_limit_event_flag;
 }; typedef struct mca_btl_openib_module_srq_qp_t 
mca_btl_openib_module_srq_qp_t;

 struct mca_btl_openib_module_qp_t {
diff -r a5938d9dcada ompi/mca/btl/openib/btl_openib_async.c
--- a/ompi/mca/btl/openib/btl_openib_async.c    Mon Nov 23 19:00:16 2009 -0800
+++ b/ompi/mca/btl/openib/btl_openib_async.c    Wed Dec 02 16:24:55 2009 +0200
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2008 Mellanox Technologies. All rights reserved.
+ * Copyright (c) 2008-2009 Mellanox Technologies. All rights reserved.
  * Copyright (c) 2007-2009 Cisco Systems, Inc.  All rights reserved.
  * Copyright (c) 2006-2007 Voltaire All rights reserved.
  * $COPYRIGHT$
@@ -226,10 +226,51 @@
     return OMPI_SUCCESS;
 }

+/* The main idea of resizing SRQ algorithm - 
+   We create a SRQ with size = rd_num, but for efficient usage of resources
+   the number of WQEs that we post = rd_curr_num < rd_num and this value is
+   increased (by needs) in IBV_EVENT_SRQ_LIMIT_REACHED event handler (i.e. in 
this function),
+   the event will thrown by device if number of WQEs in SRQ will be less than 
srq_limit */
+static int btl_openib_async_srq_limit_event(struct ibv_srq* srq, 
+                                              mca_btl_openib_module_t 
*openib_btl)
+{
+    int qp;
+
+    for(qp = 0; qp < mca_btl_openib_component.num_qps; qp++) {
+        if (!BTL_OPENIB_QP_TYPE_PP(qp)) {
+            if(openib_btl->qps[qp].u.srq_qp.srq == srq) {
+                break;
+            }
+        }
+    }
+
+    if(qp >= mca_btl_openib_component.num_qps) {
+        BTL_ERROR(("The srq doesn't found on %s.", 
ibv_get_device_name(openib_btl->device->ib_dev)));
+        return OMPI_ERROR;
+    }
+
+    /* dynamically re-size the SRQ to be larger */
+    openib_btl->qps[qp].u.srq_qp.rd_curr_num <<= 1;
+
+    if(openib_btl->qps[qp].u.srq_qp.rd_curr_num >= 
mca_btl_openib_component.qp_infos[qp].rd_num) {
+        openib_btl->qps[qp].u.srq_qp.rd_curr_num = 
mca_btl_openib_component.qp_infos[qp].rd_num;
+        openib_btl->qps[qp].u.srq_qp.rd_low_local = 
mca_btl_openib_component.qp_infos[qp].rd_low;
+
+        openib_btl->qps[qp].u.srq_qp.srq_limit_event_flag = false;
+
+        return OMPI_SUCCESS;
+    }
+
+    openib_btl->qps[qp].u.srq_qp.rd_low_local <<= 1;
+    openib_btl->qps[qp].u.srq_qp.srq_limit_event_flag = true;
+
+    return OMPI_SUCCESS;
+}
+
 /* Function handle async device events */
 static int btl_openib_async_deviceh(struct mca_btl_openib_async_poll 
*devices_poll, int index)
 {
-    int j;
+    int j, btl_index = 0;
     mca_btl_openib_device_t *device = NULL;
     struct ibv_async_event event;
     bool xrc_event = false;
@@ -240,6 +281,8 @@
         if 
(mca_btl_openib_component.openib_btls[j]->device->ib_dev_context->async_fd ==
                 devices_poll->async_pollfd[index].fd ) {
             device = mca_btl_openib_component.openib_btls[j]->device;
+            btl_index = j;
+
             break;
         }
     }
@@ -306,7 +349,15 @@
 #if HAVE_DECL_IBV_EVENT_CLIENT_REREGISTER
             case IBV_EVENT_CLIENT_REREGISTER:
 #endif
+                break;
+            /* The event is signaled when number of prepost receive WQEs is 
going
+                                            under predefined threshold - 
srq_limit */
             case IBV_EVENT_SRQ_LIMIT_REACHED:
+                if(OMPI_SUCCESS != 
btl_openib_async_srq_limit_event(event.element.srq, 
+                                     
mca_btl_openib_component.openib_btls[btl_index])) {
+                    return OMPI_ERROR;
+                }
+
                 break;
             default:
                 orte_show_help("help-mpi-btl-openib.txt", "of unknown event",
diff -r a5938d9dcada ompi/mca/btl/openib/btl_openib_component.c
--- a/ompi/mca/btl/openib/btl_openib_component.c        Mon Nov 23 19:00:16 
2009 -0800
+++ b/ompi/mca/btl/openib/btl_openib_component.c        Wed Dec 02 16:24:55 
2009 +0200
@@ -1361,8 +1361,8 @@
                         true, rd_win, rd_num - rd_low);
             }
         } else {
-            int32_t sd_max;
-            if (count < 3 || count > 5) {
+            int32_t sd_max, rd_init, srq_limit;
+            if (count < 3 || count > 7) {
                 orte_show_help("help-mpi-btl-openib.txt",
                                "invalid srq specification", true,
                                orte_process_info.nodename, queues[qp]);
@@ -1376,15 +1376,47 @@
             /* by default set rd_low to be 3/4 of rd_num */
             rd_low = atoi_param(P(3), rd_num - (rd_num / 4));
             sd_max = atoi_param(P(4), rd_low / 4);
-            BTL_VERBOSE(("srq: rd_num is %d rd_low is %d sd_max is %d",
-                         rd_num, rd_low, sd_max));
+            /* rd_init is initial value for rd_curr_num of all SRQs, 1/4 of 
rd_num by default */
+            rd_init = atoi_param(P(5), rd_num / 4);
+            /* by default set srq_limit to be 3/16 of rd_init (it's 1/4 of 
rd_low_local,
+               the value of rd_low_local we calculate in create_srq function) 
*/
+            srq_limit = atoi_param(P(6), (rd_init - (rd_init / 4)) / 4);
+
+            /* If we set srq_limit less or greater than rd_init
+               (init value for rd_curr_num) => we receive the 
IBV_EVENT_SRQ_LIMIT_REACHED
+               event immediately and the value of rd_curr_num will be 
increased */
+
+            /* If we set srq_limit to zero, but size of SRQ greater than 1 and
+               it is not a user request (param number 6 in --mca 
btl_openib_receive_queues) => set it to be 1 */
+            if((0 == srq_limit) && (1 < rd_num) && (0 != P(6))) {
+                srq_limit = 1;
+            }
+
+            BTL_VERBOSE(("srq: rd_num is %d rd_low is %d sd_max is %d rd_max 
is %d srq_limit is %d",
+                         rd_num, rd_low, sd_max, rd_init, srq_limit));

             /* Calculate the smallest freelist size that can be allowed */
             if (rd_num > min_freelist_size) {
                 min_freelist_size = rd_num;
             }

+            if (rd_num < rd_init) {
+                orte_show_help("help-mpi-btl-openib.txt", "rd_num must be >= 
rd_init",
+                        true, orte_process_info.nodename, queues[qp]);
+                ret = OMPI_ERR_BAD_PARAM;
+                goto error;
+            }
+
+            if (rd_num < srq_limit) {
+                orte_show_help("help-mpi-btl-openib.txt", "srq_limit must be > 
rd_num",
+                        true, orte_process_info.nodename, queues[qp]);
+                ret = OMPI_ERR_BAD_PARAM;
+                goto error;
+            }
+
             mca_btl_openib_component.qp_infos[qp].u.srq_qp.sd_max = sd_max;
+            mca_btl_openib_component.qp_infos[qp].u.srq_qp.rd_init = rd_init;
+            mca_btl_openib_component.qp_infos[qp].u.srq_qp.srq_limit = 
srq_limit;
         }

         if (rd_num <= rd_low) {
@@ -3185,19 +3217,19 @@

 int mca_btl_openib_post_srr(mca_btl_openib_module_t* openib_btl, const int qp)
 {
-    int rd_low = mca_btl_openib_component.qp_infos[qp].rd_low;
-    int rd_num = mca_btl_openib_component.qp_infos[qp].rd_num;
+    int rd_low_local = openib_btl->qps[qp].u.srq_qp.rd_low_local;
+    int rd_curr_num = openib_btl->qps[qp].u.srq_qp.rd_curr_num;
     int num_post, i, rc;
     struct ibv_recv_wr *bad_wr, *wr_list = NULL, *wr = NULL;

     assert(!BTL_OPENIB_QP_TYPE_PP(qp));

     OPAL_THREAD_LOCK(&openib_btl->ib_lock);
-    if(openib_btl->qps[qp].u.srq_qp.rd_posted > rd_low) {
+    if(openib_btl->qps[qp].u.srq_qp.rd_posted > rd_low_local) {
         OPAL_THREAD_UNLOCK(&openib_btl->ib_lock);
         return OMPI_SUCCESS;
     }
-    num_post = rd_num - openib_btl->qps[qp].u.srq_qp.rd_posted;
+    num_post = rd_curr_num - openib_btl->qps[qp].u.srq_qp.rd_posted;

     for(i = 0; i < num_post; i++) {
         ompi_free_list_item_t* item;
@@ -3214,7 +3246,26 @@

     rc = ibv_post_srq_recv(openib_btl->qps[qp].u.srq_qp.srq, wr_list, &bad_wr);
     if(OPAL_LIKELY(0 == rc)) {
+        struct ibv_srq_attr srq_attr;
+
         OPAL_THREAD_ADD32(&openib_btl->qps[qp].u.srq_qp.rd_posted, num_post);
+
+        if(true == openib_btl->qps[qp].u.srq_qp.srq_limit_event_flag) {
+            srq_attr.max_wr = openib_btl->qps[qp].u.srq_qp.rd_curr_num;
+            srq_attr.max_sge = 1;
+            srq_attr.srq_limit = 
mca_btl_openib_component.qp_infos[qp].u.srq_qp.srq_limit;
+
+            openib_btl->qps[qp].u.srq_qp.srq_limit_event_flag = false;
+            if(ibv_modify_srq(openib_btl->qps[qp].u.srq_qp.srq, &srq_attr, 
IBV_SRQ_LIMIT)) {
+                BTL_ERROR(("Failed to request limit event for srq on  %s.  "
+                   "Fatal error, stoping asynch event thread",
+                   ibv_get_device_name(openib_btl->device->ib_dev)));
+
+                OPAL_THREAD_UNLOCK(&openib_btl->ib_lock);
+                return OMPI_ERROR;
+            }
+        }
+
         OPAL_THREAD_UNLOCK(&openib_btl->ib_lock);
         return OMPI_SUCCESS;
     }
diff -r a5938d9dcada ompi/mca/btl/openib/btl_openib_mca.c
--- a/ompi/mca/btl/openib/btl_openib_mca.c      Mon Nov 23 19:00:16 2009 -0800
+++ b/ompi/mca/btl/openib/btl_openib_mca.c      Wed Dec 02 16:24:55 2009 +0200
@@ -10,7 +10,7 @@
  * Copyright (c) 2004-2005 The Regents of the University of California.
  *                         All rights reserved.
  * Copyright (c) 2006-2008 Cisco Systems, Inc.  All rights reserved.
- * Copyright (c) 2006-2008 Mellanox Technologies. All rights reserved.
+ * Copyright (c) 2006-2009 Mellanox Technologies. All rights reserved.
  * Copyright (c) 2006-2007 Los Alamos National Security, LLC.  All rights
  *                         reserved.
  * Copyright (c) 2006-2007 Voltaire All rights reserved.
@@ -163,6 +163,11 @@
                   1, &ival, 0));
     mca_btl_openib_component.warn_nonexistent_if = (0 != ival);

+    CHECK(reg_int("enable_srq_resize", NULL,
+                  "Enable/Disable on demand SRQ resize. "
+                  "(0 = without resizing, nonzero = with resizing)", 1, &ival, 
0));
+    mca_btl_openib_component.enable_srq_resize = (0 != ival);
+
     if (OMPI_HAVE_IBV_FORK_INIT) {
         ival2 = -1;
     } else {
diff -r a5938d9dcada ompi/mca/btl/openib/help-mpi-btl-openib.txt
--- a/ompi/mca/btl/openib/help-mpi-btl-openib.txt       Mon Nov 23 19:00:16 
2009 -0800
+++ b/ompi/mca/btl/openib/help-mpi-btl-openib.txt       Wed Dec 02 16:24:55 
2009 +0200
@@ -11,7 +11,7 @@
 # Copyright (c) 2004-2006 The Regents of the University of California.
 #                         All rights reserved.
 # Copyright (c) 2006-2009 Cisco Systems, Inc.  All rights reserved.
-# Copyright (c) 2007-2008 Mellanox Technologies. All rights reserved.
+# Copyright (c) 2007-2009 Mellanox Technologies. All rights reserved.
 # Copyright (c) 2009      Sun Microsystems, Inc.  All rights reserved.
 # $COPYRIGHT$
 # 
@@ -414,6 +414,24 @@
   Local host: %s
   Bad queue specification: %s
 #
+[rd_num must be >= rd_init]
+WARNING: The number of buffers for a queue pair specified via the
+btl_openib_receive_queues MCA parameter (parametr #2) must be
+greater or equal to the init srq size (parametr #5).
+The OpenFabrics (openib) BTL will therefore be deactivated for this run.
+
+  Local host: %s
+  Bad queue specification: %s
+#
+[srq_limit must be > rd_num]
+WARNING: The number of buffers for a queue pair specified via the
+btl_openib_receive_queues MCA parameter (parametr #2) must be greater than the 
limit
+buffer count (parametr #6).  The OpenFabrics (openib) BTL will therefore
+be deactivated for this run.
+
+  Local host: %s
+  Bad queue specification: %s
+#
 [biggest qp size is too small]
 WARNING: The largest queue pair buffer size specified in the
 btl_openib_receive_queues MCA parameter is smaller than the maximum

Reply via email to