For some reason, the MX btl sets btl_bandwidth in megabits/s instead of
megabytes/s. So we get crazy btl_weights in case of heterogeneous
multirail. And --mca btl_mx_bandwidth <width> cannot work around the
problem (it probably doesn't help because it's overridden by the runtime
link width detection anyway?).
Signed-off-by: Brice Goglin <[email protected]>
Index: ompi/mca/btl/mx/btl_mx_component.c
===================================================================
--- ompi/mca/btl/mx/btl_mx_component.c (révision 23711)
+++ ompi/mca/btl/mx/btl_mx_component.c (copie de travail)
@@ -159,7 +159,7 @@
MCA_BTL_FLAGS_PUT |
MCA_BTL_FLAGS_SEND |
MCA_BTL_FLAGS_RDMA_MATCHED);
- mca_btl_mx_module.super.btl_bandwidth = 2000;
+ mca_btl_mx_module.super.btl_bandwidth = 250;
mca_btl_mx_module.super.btl_latency = 5;
mca_btl_base_param_register(&mca_btl_mx_component.super.btl_version,
&mca_btl_mx_module.super);
@@ -357,7 +357,7 @@
mx_btl->mx_endpoint = mx_endpoint;
mx_btl->mx_endpoint_addr = mx_endpoint_addr;
- mx_btl->super.btl_bandwidth = 2000; /* whatever */
+ mx_btl->super.btl_bandwidth = 250; /* whatever */
mx_btl->super.btl_latency = 10;
#if defined(MX_HAS_NET_TYPE)
{
@@ -370,11 +370,11 @@
} else {
if( MX_SPEED_2G == value ) {
mx_unique_network_id |= 0xaa000000;
- mx_btl->super.btl_bandwidth = 2000;
+ mx_btl->super.btl_bandwidth = 250;
mx_btl->super.btl_latency = 5;
} else if( MX_SPEED_10G == value ) {
mx_unique_network_id |= 0xbb000000;
- mx_btl->super.btl_bandwidth = 10000;
+ mx_btl->super.btl_bandwidth = 1250;
mx_btl->super.btl_latency = 3;
} else {
mx_unique_network_id |= 0xcc000000;
Index: ompi/mca/btl/mx/btl_mx_component.c
===================================================================
--- ompi/mca/btl/mx/btl_mx_component.c (révision 23711)
+++ ompi/mca/btl/mx/btl_mx_component.c (copie de travail)
@@ -159,7 +159,7 @@
MCA_BTL_FLAGS_PUT |
MCA_BTL_FLAGS_SEND |
MCA_BTL_FLAGS_RDMA_MATCHED);
- mca_btl_mx_module.super.btl_bandwidth = 2000;
+ mca_btl_mx_module.super.btl_bandwidth = 250;
mca_btl_mx_module.super.btl_latency = 5;
mca_btl_base_param_register(&mca_btl_mx_component.super.btl_version,
&mca_btl_mx_module.super);
@@ -357,7 +357,7 @@
mx_btl->mx_endpoint = mx_endpoint;
mx_btl->mx_endpoint_addr = mx_endpoint_addr;
- mx_btl->super.btl_bandwidth = 2000; /* whatever */
+ mx_btl->super.btl_bandwidth = 250; /* whatever */
mx_btl->super.btl_latency = 10;
#if defined(MX_HAS_NET_TYPE)
{
@@ -370,11 +370,11 @@
} else {
if( MX_SPEED_2G == value ) {
mx_unique_network_id |= 0xaa000000;
- mx_btl->super.btl_bandwidth = 2000;
+ mx_btl->super.btl_bandwidth = 250;
mx_btl->super.btl_latency = 5;
} else if( MX_SPEED_10G == value ) {
mx_unique_network_id |= 0xbb000000;
- mx_btl->super.btl_bandwidth = 10000;
+ mx_btl->super.btl_bandwidth = 1250;
mx_btl->super.btl_latency = 3;
} else {
mx_unique_network_id |= 0xcc000000;