[OMPI devel] SDP support for OPEN-MPI
Hi, We would like to add SDP support for OPENMPI. SDP - Socket Direct Protocol is a byte-stream transport protocol implementing the TCP SOCK_STREAM semantics utilizing transport offloading capabilities of the InfiniBand fabric (http://www.mellanox.com/pdf/whitepapers/SDP_Whitepaper.pdf, http://www.openfabrics.org/archives/aug2005datacenter/das_SDP_Linux.pdf ). SDP can be used to accelerate job start ( oob over sdp ) and IPoIB performance. The main idea is to use AF_INET_SDP protocol family instead of AF_INET and AF_INET6 when opening sockets. SDP will be used in OOB and BTL with appropriate mca parameters: -mca btl_tcp_sdp_enable 1 -mca oob_tcp_sdp_enable 1 Since not all functions support this family, the changes were made only in critical sections of the code. Since SDP support is relevant only for InfiniBand Fabrics, you need to configure sdp support with the --enable-sdp flag. SDP will be disabled by default. ./configure --enable-sdp Test results of running bandwidth and latency of SDP on 2 DDR nodes. BWsize VERBS IPoIB connected IPoIB datagram btl SDP 1001507.68 665.70 425.21 1272.37 LTsize 5 3.82 28.83 28.24 25.73 Index: opal/include/opal_config_bottom.h === --- opal/include/opal_config_bottom.h (revision 17027) +++ opal/include/opal_config_bottom.h (working copy) @@ -509,7 +509,15 @@ #if !HAVE_DECL_PF_INET6 #define PF_INET6 PF_UNSPEC #endif +#if !HAVE_DECL_AF_INET_SDP +#define AF_INET_SDP 27 +#endif +#if OPAL_ENABLE_SDP +#define OPAL_WANT_SDP 1 +#else +#define OPAL_WANT_SDP 0 +#endif #if defined(__APPLE__) && defined(HAVE_INTTYPES_H) /* Prior to Mac OS X 10.3, the length modifier "ll" wasn't supported, but "q" was for long long. 
This isn't ANSI Index: configure.ac === --- configure.ac (revision 17027) +++ configure.ac (working copy) @@ -674,7 +674,7 @@ #include #endif]) -AC_CHECK_DECLS([AF_UNSPEC, PF_UNSPEC, AF_INET6, PF_INET6], +AC_CHECK_DECLS([AF_UNSPEC, PF_UNSPEC, AF_INET6, PF_INET6, AF_INET_SDP], [], [], [AC_INCLUDES_DEFAULT #if HAVE_SYS_SOCKET_H #include Index: ompi/mca/btl/tcp/btl_tcp_component.c === --- ompi/mca/btl/tcp/btl_tcp_component.c (revision 17027) +++ ompi/mca/btl/tcp/btl_tcp_component.c (working copy) @@ -263,6 +263,10 @@ mca_btl_tcp_component.tcp_disable_family = mca_btl_tcp_param_register_int ("disable_family", NULL, 0); +#if OPAL_WANT_SDP +mca_btl_tcp_component.sdp_enable = +mca_btl_tcp_param_register_int("sdp_enable", "Enable SDP for TCP connections", 0); +#endif return OMPI_SUCCESS; } @@ -527,6 +531,11 @@ memset (&hints, 0, sizeof(hints)); hints.ai_family = af_family; +#if OPAL_WANT_SDP + if ( mca_btl_tcp_component.sdp_enable ) { + hints.ai_family = AF_INET6; + } +#endif hints.ai_socktype = SOCK_STREAM; hints.ai_flags = AI_PASSIVE; @@ -555,7 +564,7 @@ #endif /* IPV6_V6ONLY */ } #else -((struct sockaddr_in*) &inaddr)->sin_family = AF_INET; +((struct sockaddr_in*) &inaddr)->sin_family = af_family; ((struct sockaddr_in*) &inaddr)->sin_addr.s_addr = INADDR_ANY; addrlen = sizeof(struct sockaddr_in); #endif @@ -600,7 +609,11 @@ } goto socket_binded; } -if( AF_INET == af_family ) { +#if OPAL_WANT_SDP + if( AF_INET == af_family|| AF_INET_SDP == af_family) { +#else +if( AF_INET == af_family) { +#endif BTL_ERROR(("bind() failed: no port available in the range [%d..%d]", mca_btl_tcp_component.tcp_port_min, mca_btl_tcp_component.tcp_port_min + range)); @@ -624,7 +637,11 @@ return OMPI_ERROR; } -if (AF_INET == af_family) { +#if OPAL_WANT_SDP +if( AF_INET == af_family|| AF_INET_SDP == af_family) { +#else +if( AF_INET == af_family) { +#endif mca_btl_tcp_component.tcp_listen_port = ((struct sockaddr_in*) &inaddr)->sin_port; mca_btl_tcp_component.tcp_listen_sd = sd; } @@ -660,7 
+677,11 @@ } /* register listen port */ -if (AF_INET == af_family) { +#if OPAL_WANT_SDP +if( AF_INET == af_family|| AF_INET_SDP == af_family) { +#else +if( AF_INET == af_family) { +#endif opal_event_set( &mca_btl_tcp_component.tcp_recv_event, mca_btl_tcp_component.tcp_listen_sd, OPAL_EV_READ|OPAL_EV_PERSIST, @@ -822,6 +843,12 @@ } /* create a TCP l
Re: [OMPI devel] SDP support for OPEN-MPI
Since I used to be the OOB guy, I wanted to throw my $0.02 out there. I think this is the right approach for adding such support. I haven't tested it, but if it works see no reason not to commit. Brian On Dec 31, 2007, at 1:41 AM, Lenny Verkhovsky wrote: Hi, We would like to add SDP support for OPENMPI. SDP - Socket Direct Protocol is a byte-stream transport protocol implementing the TCP SOCK_STREAM semantics utilizing transport offloading capabilities of the InfiniBand fabric (http://www.mellanox.com/pdf/whitepapers/SDP_Whitepaper.pdf, http:// www.openfabrics.org/archives/aug2005datacenter/das_SDP_Linux.pdf ). SDP can be used to accelerate job start ( oob over sdp ) and IPoIB performance. The main idea is to use AF_INET_SDP protocol family instead of AF_INET and AF_INET6 when opening sockets. SDP will be used in OOB and BTL with appropriate mca parameters – -mca btl_tcp_sdp_enable 1 -mca oob_tcp_sdp_enable 1 Since not all functions support this family, the changes were maid only in critical sections of the code Since SDP support is relevant only for InfiniBand Fabrics you need to configure sdp support with –enable-sdp flag. SDP will be disabled by default. ./configure –enable-sdp Test results of running bandwidth and latency of SDP on 2 DDR nodes. BWsize VERBS IPoIB connected IPoIB datagram btl SDP 1001507.68 665.70 425.21 1272.37 LTsize 5 3.82 28.8328.24 25.73 Index: opal/include/opal_config_bottom.h === --- opal/include/opal_config_bottom.h (revision 17027) +++ opal/include/opal_config_bottom.h (working copy) @@ -509,7 +509,15 @@ #if !HAVE_DECL_PF_INET6 #define PF_INET6 PF_UNSPEC #endif +#if !HAVE_DECL_AF_INET_SDP +#define AF_INET_SDP 27 +#endif +#if OPAL_ENABLE_SDP +#define OPAL_WANT_SDP 1 +#else +#define OPAL_WANT_SDP 0 +#endif #if defined(__APPLE__) && defined(HAVE_INTTYPES_H) /* Prior to Mac OS X 10.3, the length modifier "ll" wasn't supported, but "q" was for long long. 
This isn't ANSI Index: configure.ac === --- configure.ac (revision 17027) +++ configure.ac (working copy) @@ -674,7 +674,7 @@ #include #endif]) -AC_CHECK_DECLS([AF_UNSPEC, PF_UNSPEC, AF_INET6, PF_INET6], +AC_CHECK_DECLS([AF_UNSPEC, PF_UNSPEC, AF_INET6, PF_INET6, AF_INET_SDP], [], [], [AC_INCLUDES_DEFAULT #if HAVE_SYS_SOCKET_H #include Index: ompi/mca/btl/tcp/btl_tcp_component.c === --- ompi/mca/btl/tcp/btl_tcp_component.c (revision 17027) +++ ompi/mca/btl/tcp/btl_tcp_component.c (working copy) @@ -263,6 +263,10 @@ mca_btl_tcp_component.tcp_disable_family = mca_btl_tcp_param_register_int ("disable_family", NULL, 0); +#if OPAL_WANT_SDP +mca_btl_tcp_component.sdp_enable = +mca_btl_tcp_param_register_int("sdp_enable", "Enable SDP for TCP connections", 0); +#endif return OMPI_SUCCESS; } @@ -527,6 +531,11 @@ memset (&hints, 0, sizeof(hints)); hints.ai_family = af_family; +#if OPAL_WANT_SDP + if ( mca_btl_tcp_component.sdp_enable ) { + hints.ai_family = AF_INET6; + } +#endif hints.ai_socktype = SOCK_STREAM; hints.ai_flags = AI_PASSIVE; @@ -555,7 +564,7 @@ #endif /* IPV6_V6ONLY */ } #else -((struct sockaddr_in*) &inaddr)->sin_family = AF_INET; +((struct sockaddr_in*) &inaddr)->sin_family = af_family; ((struct sockaddr_in*) &inaddr)->sin_addr.s_addr = INADDR_ANY; addrlen = sizeof(struct sockaddr_in); #endif @@ -600,7 +609,11 @@ } goto socket_binded; } -if( AF_INET == af_family ) { +#if OPAL_WANT_SDP + if( AF_INET == af_family|| AF_INET_SDP == af_family) { +#else +if( AF_INET == af_family) { +#endif BTL_ERROR(("bind() failed: no port available in the range [%d..%d]", mca_btl_tcp_component.tcp_port_min, mca_btl_tcp_component.tcp_port_min + range)); @@ -624,7 +637,11 @@ return OMPI_ERROR; } -if (AF_INET == af_family) { +#if OPAL_WANT_SDP +if( AF_INET == af_family|| AF_INET_SDP == af_family) { +#else +if( AF_INET == af_family) { +#endif mca_btl_tcp_component.tcp_listen_port = ((struct sockaddr_in*) &inaddr)->sin_port; mca_btl_tcp_component.tcp_listen_sd = sd; } @@ -660,7 
+677,11 @@ } /* register listen port */ -if (AF_INET == af_family) { +#if OPAL_WANT_SDP +if( AF_INE
[OMPI devel] Cisco MTT runs
In case you hadn't noticed, Cisco resumed running MTT literally right before the holiday weekend -- I got about 9 days of runs: http://www.open-mpi.org/mtt/stats/index.php?dates=2007-12-01+-+2007-12-31&org_name=all&platform_name=all&os_name=all&mpi_install_compiler_name=all&mpi_get_name=all&test_suite=all I also just bumped up the number of variants we're running per some recent openib btl activity on the trunk, so our nightly contribution should be going up. We used to run 9 variants on both v1.2 and the trunk; we're now running 8 variants on v1.2 and 12 variants on the trunk. I will likely add more as some other internal test clusters [finally] come on-line in the new year... :-) Happy holidays! -- Jeff Squyres Cisco Systems
[OMPI devel] Minor patch for !IPV6_V6ONLY
I just tried today to build the OMPI trunk on an old RH8 box and found that for OPAL_WANT_IPV6 && !defined(IPV6_V6ONLY) the file oob_tcp.c fails to compile due to unbalanced braces. Swapping an #endif with a closing brace (patch below) fixed the problem for me. -Paul --- orte/mca/oob/tcp/oob_tcp.c (revision 17027) +++ orte/mca/oob/tcp/oob_tcp.c (working copy) @@ -539,8 +539,8 @@ "mca_oob_tcp_create_listen: unable to disable v4-mapped addresses\n"); } } +#endif /* IPV6_V6ONLY */ } -#endif /* IPV6_V6ONLY */ #else if (AF_INET != af_family) { return ORTE_ERROR; -- Paul H. Hargrove phhargr...@lbl.gov Future Technologies Group HPC Research Department Tel: +1-510-495-2352 Lawrence Berkeley National Laboratory Fax: +1-510-486-6900
[OMPI devel] patch for building gm btl
I tried today to build the OMPI trunk on a system w/ GM libs installed (I tried both GM-2.0.16 and GM-1.6.4) and found that the GM BTL won't even compile, due to unbalanced parens. The patch below reintroduces the parens that were apparently lost in r16633: r16633 | rlgraham | 2007-11-01 15:38:50 -0800 (Thu, 01 Nov 2007) | 3 lines change all instances of ompi_free_list_init to ompi_free_list_init_new. Header and payload data are specified separately at this stage. The fact that this has gone unfixed for 2 months suggests to me that nobody is building the GM BTL. So, how would I go about checking ... a) ...if there exists any periodic build of the GM BTL via MTT? b) ...if such builds, if any, experience the same error(s) as I do c) ...which GM library versions such builds, if any, compile against d) ...if anybody wants to help set up an MTT for GM on my system (NOTE: Jeff Squyres, Brian Barrett and George Bosilca all have existing accounts on my cluster, though possibly expired/disabled). -Paul --- ompi/mca/btl/gm/btl_gm_component.c (revision 17027) +++ ompi/mca/btl/gm/btl_gm_component.c (working copy) @@ -285,7 +285,7 @@ sizeof (mca_btl_gm_frag_eager_t), CACHE_LINE_SIZE, OBJ_CLASS (mca_btl_gm_frag_eager_t), -1 << mca_btl_gm_component.gm_eager_frag_size) + sizeof (uintptr_t), +(1 << mca_btl_gm_component.gm_eager_frag_size) + sizeof (uintptr_t), CACHE_LINE_SIZE, btl->gm_max_send_tokens, mca_btl_gm_component.gm_free_list_max, @@ -296,7 +296,7 @@ sizeof (mca_btl_gm_frag_max_t), CACHE_LINE_SIZE, OBJ_CLASS (mca_btl_gm_frag_max_t), -1 << mca_btl_gm_component.gm_max_frag_size) + sizeof (uintptr_t), +(1 << mca_btl_gm_component.gm_max_frag_size) + sizeof (uintptr_t), CACHE_LINE_SIZE, btl->gm_max_recv_tokens, mca_btl_gm_component.gm_free_list_max, -- Paul H. Hargrove phhargr...@lbl.gov Future Technologies Group HPC Research Department Tel: +1-510-495-2352 Lawrence Berkeley National Laboratory Fax: +1-510-486-6900