Hi,

We would like to add SDP support to Open MPI.

 

SDP (Sockets Direct Protocol) is a byte-stream transport protocol that
implements TCP SOCK_STREAM semantics while utilizing the transport
offloading capabilities of the InfiniBand fabric

(http://www.mellanox.com/pdf/whitepapers/SDP_Whitepaper.pdf,
http://www.openfabrics.org/archives/aug2005datacenter/das_SDP_Linux.pdf
).

 

SDP can be used to accelerate job start (OOB over SDP) and to improve on
IPoIB performance.

 

The main idea is to use the AF_INET_SDP address family instead of AF_INET
and AF_INET6 when opening sockets.

SDP is used in the OOB and the BTL when the appropriate MCA parameters are set:

            -mca btl_tcp_sdp_enable 1

            -mca oob_tcp_sdp_enable 1

 

Since not all functions support this family, the changes were made only
in critical sections of the code.

 

Since SDP support is relevant only for InfiniBand fabrics, you need to
configure SDP support with the --enable-sdp flag. SDP will be disabled by
default.

./configure --enable-sdp

 

Bandwidth and latency test results for SDP on 2 DDR nodes:

 

BW size      VERBS      IPoIB connected    IPoIB datagram    btl SDP

1000000      1507.68    665.70             425.21            1272.37

 

LT size      VERBS      IPoIB connected    IPoIB datagram    btl SDP

5            3.82       28.83              28.24             25.73

 

 

 

 

 

Index: opal/include/opal_config_bottom.h

===================================================================

--- opal/include/opal_config_bottom.h     (revision 17027)

+++ opal/include/opal_config_bottom.h     (working copy)

@@ -509,7 +509,15 @@

 #if !HAVE_DECL_PF_INET6

 #define PF_INET6 PF_UNSPEC

 #endif

+#if !HAVE_DECL_AF_INET_SDP

+#define AF_INET_SDP 27

+#endif

 

+#if OPAL_ENABLE_SDP

+#define OPAL_WANT_SDP 1

+#else

+#define OPAL_WANT_SDP 0

+#endif

 #if defined(__APPLE__) && defined(HAVE_INTTYPES_H)

 /* Prior to Mac OS X 10.3, the length modifier "ll" wasn't

    supported, but "q" was for long long.  This isn't ANSI

Index: configure.ac

===================================================================

--- configure.ac  (revision 17027)

+++ configure.ac  (working copy)

@@ -674,7 +674,7 @@

 #include <netinet/in.h>

 #endif])

 

-AC_CHECK_DECLS([AF_UNSPEC, PF_UNSPEC, AF_INET6, PF_INET6], 

+AC_CHECK_DECLS([AF_UNSPEC, PF_UNSPEC, AF_INET6, PF_INET6, AF_INET_SDP],


                [], [], [AC_INCLUDES_DEFAULT

 #if HAVE_SYS_SOCKET_H

 #include <sys/socket.h>

Index: ompi/mca/btl/tcp/btl_tcp_component.c

===================================================================

--- ompi/mca/btl/tcp/btl_tcp_component.c  (revision 17027)

+++ ompi/mca/btl/tcp/btl_tcp_component.c  (working copy)

@@ -263,6 +263,10 @@

 

     mca_btl_tcp_component.tcp_disable_family =

         mca_btl_tcp_param_register_int ("disable_family", NULL, 0);

+#if OPAL_WANT_SDP

+    mca_btl_tcp_component.sdp_enable =

+        mca_btl_tcp_param_register_int("sdp_enable", "Enable SDP for
TCP connections", 0);

+#endif

     return OMPI_SUCCESS;

 }

 

@@ -527,6 +531,11 @@

 

         memset (&hints, 0, sizeof(hints));

         hints.ai_family = af_family;

+#if OPAL_WANT_SDP

+     if ( mca_btl_tcp_component.sdp_enable ) {

+         hints.ai_family = AF_INET6;

+     }

+#endif

         hints.ai_socktype = SOCK_STREAM;

         hints.ai_flags = AI_PASSIVE;

 

@@ -555,7 +564,7 @@

 #endif /* IPV6_V6ONLY */

     }

 #else

-    ((struct sockaddr_in*) &inaddr)->sin_family = AF_INET;

+    ((struct sockaddr_in*) &inaddr)->sin_family = af_family;

     ((struct sockaddr_in*) &inaddr)->sin_addr.s_addr = INADDR_ANY;

     addrlen = sizeof(struct sockaddr_in);

 #endif

@@ -600,7 +609,11 @@

             }

             goto socket_binded;

         }

-        if( AF_INET == af_family ) {

+#if OPAL_WANT_SDP

+     if( AF_INET == af_family|| AF_INET_SDP == af_family) {

+#else

+        if( AF_INET == af_family) {

+#endif

             BTL_ERROR(("bind() failed: no port available in the range
[%d..%d]",

                        mca_btl_tcp_component.tcp_port_min,

                        mca_btl_tcp_component.tcp_port_min + range));

@@ -624,7 +637,11 @@

         return OMPI_ERROR;

     }

 

-    if (AF_INET == af_family) {

+#if OPAL_WANT_SDP

+        if( AF_INET == af_family|| AF_INET_SDP == af_family) {

+#else

+        if( AF_INET == af_family) {

+#endif

         mca_btl_tcp_component.tcp_listen_port = ((struct sockaddr_in*)
&inaddr)->sin_port;

         mca_btl_tcp_component.tcp_listen_sd = sd;

     }

@@ -660,7 +677,11 @@

     }

 

     /* register listen port */

-    if (AF_INET == af_family) {

+#if OPAL_WANT_SDP

+        if( AF_INET == af_family|| AF_INET_SDP == af_family) {

+#else

+        if( AF_INET == af_family) {

+#endif

         opal_event_set( &mca_btl_tcp_component.tcp_recv_event,

                         mca_btl_tcp_component.tcp_listen_sd,

                         OPAL_EV_READ|OPAL_EV_PERSIST,

@@ -822,6 +843,12 @@

     }

 

     /* create a TCP listen socket for incoming connection attempts */

+#if OPAL_WANT_SDP

+    if (mca_btl_tcp_component.sdp_enable) {

+        if(OMPI_SUCCESS != (ret =
mca_btl_tcp_component_create_listen(AF_INET_SDP) )) {

+           return 0;

+        }

+    } else {

     if(OMPI_SUCCESS != (ret =
mca_btl_tcp_component_create_listen(AF_INET) )) {

         return 0;

     }

@@ -833,7 +860,21 @@

         }

     }

 #endif

+    }

 

+#else

+    if(OMPI_SUCCESS != (ret =
mca_btl_tcp_component_create_listen(AF_INET) )) {

+        return 0;

+    }

+#if OPAL_WANT_IPV6

+    if((ret = mca_btl_tcp_component_create_listen(AF_INET6)) !=
OMPI_SUCCESS) {

+        if (!(OMPI_ERR_IN_ERRNO == ret && EAFNOSUPPORT ==
opal_socket_errno)) {

+            opal_output (0, "mca_btl_tcp_component: IPv6 listening
socket failed\n");

+            return 0;

+        }

+    }

+#endif

+#endif

     /* publish TCP parameters with the MCA framework */

     if(OMPI_SUCCESS != (ret = mca_btl_tcp_component_exchange() )) {

         return 0;

Index: ompi/mca/btl/tcp/btl_tcp_endpoint.c

===================================================================

--- ompi/mca/btl/tcp/btl_tcp_endpoint.c   (revision 17027)

+++ ompi/mca/btl/tcp/btl_tcp_endpoint.c   (working copy)

@@ -535,7 +535,14 @@

         addrlen = sizeof (struct sockaddr_in6);

     }

 #endif

-    

+

+#if OPAL_WANT_SDP

+   if ( mca_btl_tcp_component.sdp_enable){

+       af_family = AF_INET_SDP;

+       addrlen = sizeof(struct sockaddr_in);

+    }

+#endif

+

     btl_endpoint->endpoint_sd = socket(af_family, SOCK_STREAM, 0);

     if (btl_endpoint->endpoint_sd < 0) {

         btl_endpoint->endpoint_retries++;

Index: ompi/mca/btl/tcp/btl_tcp.h

===================================================================

--- ompi/mca/btl/tcp/btl_tcp.h      (revision 17027)

+++ ompi/mca/btl/tcp/btl_tcp.h      (working copy)

@@ -90,6 +90,9 @@

     int    tcp_sndbuf;                      /**< socket sndbuf size */

     int    tcp_rcvbuf;                      /**< socket rcvbuf size */

     int    tcp_disable_family;              /**< disabled AF_family */

+#if OPAL_WANT_SDP

+    int    sdp_enable;                      /**< enable SDP         */

+#endif /* OPAL_WANT_SDP */

 

     /* free list of fragment descriptors */

     ompi_free_list_t tcp_frag_eager;

Index: config/ompi_configure_options.m4

===================================================================

--- config/ompi_configure_options.m4      (revision 17027)

+++ config/ompi_configure_options.m4      (working copy)

@@ -683,6 +683,23 @@

                    [Enable IPv6 support, but only if the underlying
system supports it])

 

 #

+# Do we want to enable SDP support?

+#

+AC_MSG_CHECKING([if want SDP support])

+AC_ARG_ENABLE([sdp],

+    [AC_HELP_STRING([--enable-sdp],

+        [Enable SDP support (default: disabled)])])

+if test "$enable_sdp" = "yes"; then

+    AC_MSG_RESULT([yes])

+    opal_want_sdp=1

+else

+    AC_MSG_RESULT([no])

+    opal_want_sdp=0

+fi

+AC_DEFINE_UNQUOTED([OPAL_ENABLE_SDP], [$opal_want_sdp],

+                   [Enable SDP support])

+

+#

 # Do we want orterun's --prefix behavior to be enabled by default?

 #

 AC_MSG_CHECKING([if want orterun "--prefix" behavior to be enabled by
default])

Index: orte/mca/oob/tcp/oob_tcp_peer.c

===================================================================

--- orte/mca/oob/tcp/oob_tcp_peer.c (revision 17027)

+++ orte/mca/oob/tcp/oob_tcp_peer.c (working copy)

@@ -371,7 +371,15 @@

                         opal_net_get_port((struct sockaddr*) &inaddr));

         }

 

-        rc = mca_oob_tcp_peer_create_socket(peer, inaddr.ss_family);

+#if OPAL_WANT_SDP

+     if (mca_oob_tcp_component.sdp_enable) {

+            rc = mca_oob_tcp_peer_create_socket(peer, AF_INET_SDP);

+     } else {

+         rc = mca_oob_tcp_peer_create_socket(peer, inaddr.ss_family);

+     }

+#else

+     rc = mca_oob_tcp_peer_create_socket(peer, inaddr.ss_family);

+#endif

         if (ORTE_SUCCESS != rc) {

             struct timeval tv = { 1,0 };

             opal_evtimer_add(&peer->peer_timer_event, &tv);

Index: orte/mca/oob/tcp/oob_tcp.c

===================================================================

--- orte/mca/oob/tcp/oob_tcp.c      (revision 17027)

+++ orte/mca/oob/tcp/oob_tcp.c      (working copy)

@@ -380,6 +380,13 @@

     mca_oob_tcp_component.tcp6_listen_sd = -1;

 #endif  /* OPAL_WANT_IPV6 */

 

+#if OPAL_WANT_SDP

+    mca_base_param_reg_int(&mca_oob_tcp_component.super.oob_base,

+                           "sdp_enable","Enable SDP for TCP
connections",

+                            false, false,

+                            0,

+                            &mca_oob_tcp_component.sdp_enable);

+#endif

     /* initialize state */

     mca_oob_tcp_component.tcp_shutdown = false;

     mca_oob_tcp_component.tcp_listen_sd = -1;

@@ -514,7 +521,7 @@

         int error;

 

         memset(&hints, 0, sizeof(hints));

-        hints.ai_family = af_family;

+        hints.ai_family = AF_INET6;

         hints.ai_socktype = SOCK_STREAM;

         hints.ai_flags = AI_PASSIVE;

         

@@ -542,9 +549,6 @@

     }

 #endif /* IPV6_V6ONLY */

 #else

-    if (AF_INET != af_family) {

-        return ORTE_ERROR;

-    }

     ((struct sockaddr_in*) &inaddr)->sin_family = af_family;

     ((struct sockaddr_in*) &inaddr)->sin_addr.s_addr = INADDR_ANY;

     addrlen = sizeof(struct sockaddr_in);

@@ -590,7 +594,11 @@

             }

             goto socket_binded;

         }

-        if( AF_INET == af_family ) {

+#if OPAL_WANT_SDP

+        if( AF_INET == af_family || AF_INET_SDP == af_family) {

+#else

+     if( AF_INET == af_family) {

+#endif

             opal_output( 0, "bind() failed: no port available in the
range [%d..%d]",

                          mca_oob_tcp_component.tcp_port_min,

                          mca_oob_tcp_component.tcp_port_min + range);

@@ -614,7 +622,11 @@

         return ORTE_ERROR;

     }

 

-    if (AF_INET == af_family) {

+#if OPAL_WANT_SDP

+        if( AF_INET == af_family || AF_INET_SDP == af_family) {

+#else

+        if( AF_INET == af_family) {

+#endif

         mca_oob_tcp_component.tcp_listen_port = ((struct sockaddr_in*)
&inaddr)->sin_port;

         mca_oob_tcp_component.tcp_listen_sd = *target_sd;

     }

@@ -647,7 +659,11 @@

     }

 

     /* register listen port */

-    if (AF_INET == af_family) {

+#if OPAL_WANT_SDP

+        if( AF_INET == af_family || AF_INET_SDP == af_family) {

+#else

+        if( AF_INET == af_family) {

+#endif

         opal_event_set(&mca_oob_tcp_component.tcp_recv_event,

                        *target_sd,

                        OPAL_EV_READ|OPAL_EV_PERSIST,

@@ -822,6 +838,7 @@

     int flags;

 

     /* create a listen socket for incoming connections */

+    /* FIXME add support for SDP */

     mca_oob_tcp_component.tcp_listen_sd = socket(AF_INET, SOCK_STREAM,
0);

     if(mca_oob_tcp_component.tcp_listen_sd < 0) {

         opal_output(0,"mca_oob_tcp_component_init: socket() failed: %s
(%d)",

@@ -1194,26 +1211,41 @@

            but can't do that since we weren't the HNP. */

         mca_oob_tcp_component.tcp_listen_type = OOB_TCP_EVENT;

 

-        rc =
mca_oob_tcp_create_listen(&mca_oob_tcp_component.tcp_listen_sd,

-                                       AF_INET);

-        if (ORTE_SUCCESS != rc && 

-            (EAFNOSUPPORT != opal_socket_errno ||

-             mca_oob_tcp_component.tcp_debug >= OOB_TCP_DEBUG_CONNECT))
{

-            opal_output(0,

-                        "mca_oob_tcp_init: unable to create IPv4 listen
socket: %s\n",

+#if OPAL_WANT_SDP

+        if ( mca_oob_tcp_component.sdp_enable){

+                rc =
mca_oob_tcp_create_listen(&mca_oob_tcp_component.tcp_listen_sd,

+                                       AF_INET_SDP);

+         if (ORTE_SUCCESS != rc &&

+             (EAFNOSUPPORT != opal_socket_errno ||

+              mca_oob_tcp_component.tcp_debug >=
OOB_TCP_DEBUG_CONNECT)) {

+              opal_output(0,

+                        "mca_oob_tcp_init: unable to create SDP listen
socket: %s\n",

                         opal_strerror(rc));

-        }

+            }

+        } else 

+#endif

+     {

+           rc =
mca_oob_tcp_create_listen(&mca_oob_tcp_component.tcp_listen_sd,

+                                         AF_INET);

+        if (ORTE_SUCCESS != rc && 

+              (EAFNOSUPPORT != opal_socket_errno ||

+               mca_oob_tcp_component.tcp_debug >=
OOB_TCP_DEBUG_CONNECT)) {

+               opal_output(0,

+                         "mca_oob_tcp_init: unable to create IPv4
listen socket: %s\n",

+                         opal_strerror(rc));

+         }

 #if OPAL_WANT_IPV6

-        rc =
mca_oob_tcp_create_listen(&mca_oob_tcp_component.tcp6_listen_sd,

-                                       AF_INET6);

-        if (ORTE_SUCCESS != rc && 

-            (EAFNOSUPPORT != opal_socket_errno ||

-             mca_oob_tcp_component.tcp_debug >= OOB_TCP_DEBUG_CONNECT))
{

-            opal_output(0,

-                        "mca_oob_tcp_init: unable to create IPv6 listen
socket: %s\n",

-                        opal_strerror(rc));

-        }

+           rc =
mca_oob_tcp_create_listen(&mca_oob_tcp_component.tcp6_listen_sd,

+                                         AF_INET6);

+           if (ORTE_SUCCESS != rc && 

+              (EAFNOSUPPORT != opal_socket_errno ||

+              mca_oob_tcp_component.tcp_debug >=
OOB_TCP_DEBUG_CONNECT)) {

+              opal_output(0,

+                         "mca_oob_tcp_init: unable to create IPv6
listen socket: %s\n",

+                         opal_strerror(rc));

+           }

 #endif

+     }

         if (mca_oob_tcp_component.tcp_debug >= OOB_TCP_DEBUG_INFO) {

             opal_output(0, "%s accepting connections via event
library",

                         ORTE_NAME_PRINT(orte_process_info.my_name));

Index: orte/mca/oob/tcp/oob_tcp.h

===================================================================

--- orte/mca/oob/tcp/oob_tcp.h      (revision 17027)

+++ orte/mca/oob/tcp/oob_tcp.h      (working copy)

@@ -217,6 +217,9 @@

     int                tcp6_port_min;        /**< Minimum allowed port
for the OOB listen socket */

     int                tcp6_port_range;      /**< Range of allowed TCP
ports */

 #endif  /* OPAL_WANT_IPV6 */

+#if OPAL_WANT_SDP

+    int                sdp_enable;           /**< support for SDP */   

+#endif /* OPAL_WANT_SDP */

     opal_mutex_t       tcp_lock;             /**< lock for accessing
module state */

     opal_list_t        tcp_events;           /**< list of pending
events (accepts) */

     opal_list_t        tcp_msg_post;         /**< list of recieves user
has posted */

 

 

 

Thanks,

Verkhovsky Lenny.

 

Reply via email to