You know, I've been looking around the code base, and I cannot find this MCA 
param registered anywhere, nor does ompi_info show it:

> rmaps_base_dist_hca


Is some code missing? According to your code, you use a device name that is 
obtained from the standard "--map-by dist:device" option. So did you mean to 
add another variable and then realize one wasn't required?



On Aug 28, 2013, at 9:23 AM, svn-commit-mai...@open-mpi.org wrote:

> Author: jladd (Joshua Ladd)
> Date: 2013-08-28 12:23:33 EDT (Wed, 28 Aug 2013)
> New Revision: 29079
> URL: https://svn.open-mpi.org/trac/ompi/changeset/29079
> 
> Log:
> Add support for autodetecting a MLNX HCA in the rmaps min distance feature. 
> In this way, .ini files distributed with software stacks need not specify a 
> particular HCA but instead may select the key word auto which will 
> automatically select the discovered device. To use this feature, simply pass 
> the keyword auto instead of a specific device name, --mca rmaps_base_dist_hca 
> auto. If more than one card is installed, the mapper will inform the user of 
> this and, at this point, the user will then need to specify which card via 
> the normal route, e.g. --mca rmaps_base_dist_hca <dev_name>. This should be 
> added to \ncmr=v1.7.4:reviewer=rhc:subject=Autodetect logic for min dist 
> mapping
> 
> Text files modified: 
>  trunk/opal/mca/hwloc/base/base.h                    |     4 ++--             
>                        
>  trunk/opal/mca/hwloc/base/hwloc_base_util.c         |    40 
> ++++++++++++++++++++++++++++++++++++----
>  trunk/orte/mca/rmaps/mindist/help-orte-rmaps-md.txt |     8 ++++++++         
>                        
>  trunk/orte/mca/rmaps/mindist/rmaps_mindist_module.c |    11 +++++++++--      
>                        
>  4 files changed, 55 insertions(+), 8 deletions(-)
> 
> Modified: trunk/opal/mca/hwloc/base/base.h
> ==============================================================================
> --- trunk/opal/mca/hwloc/base/base.h  Wed Aug 28 12:03:23 2013        (r29078)
> +++ trunk/opal/mca/hwloc/base/base.h  2013-08-28 12:23:33 EDT (Wed, 28 Aug 
> 2013)      (r29079)
> @@ -169,8 +169,8 @@
>                                                       hwloc_obj_t obj,
>                                                       
> opal_hwloc_resource_type_t rtype);
> 
> -OPAL_DECLSPEC void opal_hwloc_get_sorted_numa_list(hwloc_topology_t topo, 
> -                                    const char* device_name, 
> +OPAL_DECLSPEC int opal_hwloc_get_sorted_numa_list(hwloc_topology_t topo, 
> +                                    char* device_name, 
>                                    opal_list_t *sorted_list);
> 
> /**
> 
> Modified: trunk/opal/mca/hwloc/base/hwloc_base_util.c
> ==============================================================================
> --- trunk/opal/mca/hwloc/base/hwloc_base_util.c       Wed Aug 28 12:03:23 
> 2013        (r29078)
> +++ trunk/opal/mca/hwloc/base/hwloc_base_util.c       2013-08-28 12:23:33 EDT 
> (Wed, 28 Aug 2013)      (r29079)
> @@ -1729,7 +1729,7 @@
>    }
> }
> 
> -static void sort_by_dist(hwloc_topology_t topo, const char* device_name, 
> opal_list_t *sorted_list)
> +static void sort_by_dist(hwloc_topology_t topo, char* device_name, 
> opal_list_t *sorted_list)
> {
>    hwloc_obj_t device_obj = NULL;
>    hwloc_obj_t obj = NULL, root = NULL;
> @@ -1751,6 +1751,9 @@
>                    obj = obj->parent;
>                }
>                if (obj == NULL) {
> +                    opal_output_verbose(5, 
> opal_hwloc_base_framework.framework_output,
> +                            "hwloc:base:get_sorted_numa_list: NUMA node 
> closest to %s wasn't found.",
> +                            device_name);
>                    return;
>                } else {
>                    close_node_index = obj->logical_index;
> @@ -1762,6 +1765,8 @@
>                    /* we can try to find distances under group object. This 
> info can be there. */
>                    depth = hwloc_get_type_depth(topo, HWLOC_OBJ_NODE);
>                    if (depth < 0) {
> +                        opal_output_verbose(5, 
> opal_hwloc_base_framework.framework_output,
> +                                "hwloc:base:get_sorted_numa_list: There is 
> no information about distances on the node.");
>                        return;
>                    }
>                    root = hwloc_get_root_obj(topo);
> @@ -1779,6 +1784,8 @@
>                }
>                /* find all distances for our close node with logical index = 
> close_node_index as close_node_index + nbobjs*j */
>                if ((NULL == distances) || (0 == distances->nbobjs)) {
> +                    opal_output_verbose(5, 
> opal_hwloc_base_framework.framework_output,
> +                            "hwloc:base:get_sorted_numa_list: There is no 
> information about distances on the node.");
>                    return;
>                }
>                /* fill list of numa nodes */
> @@ -1797,13 +1804,28 @@
>    }
> }
> 
> -void opal_hwloc_get_sorted_numa_list(hwloc_topology_t topo, const char* 
> device_name, opal_list_t *sorted_list)
> +static int find_devices(hwloc_topology_t topo, char* device_name) 
> +{
> +    hwloc_obj_t device_obj = NULL;
> +    int count = 0;
> +    for (device_obj = hwloc_get_obj_by_type(topo, HWLOC_OBJ_OS_DEVICE, 0); 
> device_obj; device_obj = hwloc_get_next_osdev(topo, device_obj)) {
> +        if (device_obj->attr->osdev.type == HWLOC_OBJ_OSDEV_OPENFABRICS) {
> +            count++;
> +            free(device_name);
> +            device_name = strdup(device_obj->name);
> +        }
> +    }
> +    return count;
> +}
> +
> +int opal_hwloc_get_sorted_numa_list(hwloc_topology_t topo, char* 
> device_name, opal_list_t *sorted_list)
> {
>    hwloc_obj_t obj;
>    opal_list_item_t *item;
>    opal_hwloc_summary_t *sum;
>    opal_hwloc_topo_data_t *data;
>    orte_rmaps_numa_node_t *numa, *copy_numa;
> +    int count;
> 
>    obj = hwloc_get_root_obj(topo);
> 
> @@ -1823,9 +1845,19 @@
>                        copy_numa->dist_from_closed = numa->dist_from_closed;
>                        opal_list_append(sorted_list, &copy_numa->super);
>                    }
> -                    return;
> +                    return 0;
>                }else {
>                    /* don't already know it - go get it */
> +                    /* firstly we check if we need to autodetect OpenFabrics 
>  devices or we have the specified one */
> +                    if (!strcmp(device_name, "auto")) {
> +                        count = find_devices(topo, device_name);
> +                       if (count > 1) {
> +                           return count;
> +                       }
> +                    }
> +                    if (!device_name || (strlen(device_name) == 0)) {
> +                        return 1;
> +                    }
>                    sort_by_dist(topo, device_name, sorted_list);
>                    /* store this info in summary object for later usage */
>                    OPAL_LIST_FOREACH(numa, sorted_list, 
> orte_rmaps_numa_node_t) {
> @@ -1834,7 +1866,7 @@
>                        copy_numa->dist_from_closed = numa->dist_from_closed;
>                        opal_list_append(&(sum->sorted_by_dist_list), 
> &copy_numa->super);
>                    }
> -                    return;
> +                    return 0;
>                }
>            }
>        }
> 
> Modified: trunk/orte/mca/rmaps/mindist/help-orte-rmaps-md.txt
> ==============================================================================
> --- trunk/orte/mca/rmaps/mindist/help-orte-rmaps-md.txt       Wed Aug 28 
> 12:03:23 2013        (r29078)
> +++ trunk/orte/mca/rmaps/mindist/help-orte-rmaps-md.txt       2013-08-28 
> 12:23:33 EDT (Wed, 28 Aug 2013)      (r29079)
> @@ -29,3 +29,11 @@
>  Node: %s
> 
> Open MPI therefore cannot mapp the application as specified.
> +#
> +[orte-rmaps-mindist:several-hca-devices]
> +There are several OpenFabrics devices found on at least one node. Please 
> specify the definite one.
> +
> +  Devices: %d
> +  Node: %s
> +
> +Open MPI therefore cannot mapp the application as specified.
> 
> Modified: trunk/orte/mca/rmaps/mindist/rmaps_mindist_module.c
> ==============================================================================
> --- trunk/orte/mca/rmaps/mindist/rmaps_mindist_module.c       Wed Aug 28 
> 12:03:23 2013        (r29078)
> +++ trunk/orte/mca/rmaps/mindist/rmaps_mindist_module.c       2013-08-28 
> 12:23:33 EDT (Wed, 28 Aug 2013)      (r29079)
> @@ -71,6 +71,7 @@
>    mca_base_component_t *c = &mca_rmaps_mindist_component.base_version;
>    bool initial_map=true;
>    bool bynode = false;
> +    int ret;
> 
>    /* this mapper can only handle initial launch
>     * when mindist mapping is desired
> @@ -245,7 +246,13 @@
>             * so we call opal_hwloc_base_get_nbobjs_by_type */
>            opal_hwloc_base_get_nbobjs_by_type(node->topology, HWLOC_OBJ_NODE, 
> 0, OPAL_HWLOC_AVAILABLE);
>            OBJ_CONSTRUCT(&numa_list, opal_list_t);
> -            opal_hwloc_get_sorted_numa_list(node->topology, 
> orte_rmaps_base.device, &numa_list);
> +            ret = opal_hwloc_get_sorted_numa_list(node->topology, 
> orte_rmaps_base.device, &numa_list);
> +            if (ret > 1) {
> +                orte_show_help("help-orte-rmaps-md.txt", 
> "orte-rmaps-mindist:several-hca-devices",
> +                        true, ret, node->name);
> +                rc = ORTE_ERR_SILENT;
> +                goto error;
> +            }
>            if (opal_list_get_size(&numa_list) > 0) {
>                j = 0;
>                required = 0;
> @@ -390,7 +397,7 @@
>        }
>        OBJ_DESTRUCT(&node_list);
>    }
> -
> +    free(orte_rmaps_base.device);
>    return ORTE_SUCCESS;
> 
> error:
> _______________________________________________
> svn mailing list
> s...@open-mpi.org
> http://www.open-mpi.org/mailman/listinfo.cgi/svn

Reply via email to