I have no objections to this.

Josh

-----Original Message-----
From: devel [mailto:devel-boun...@open-mpi.org] On Behalf Of Jeff Squyres 
(jsquyres)
Sent: Wednesday, August 28, 2013 12:37 PM
To: <de...@open-mpi.org>
Subject: Re: [OMPI devel] [OMPI svn-full] svn:open-mpi r29079 - in trunk: 
opal/mca/hwloc/base orte/mca/rmaps/mindist

Can we rename rmaps_base_dist_hca to something that is less specific to IB?

E.g., rmaps_base_dist_verbs_device?  (admittedly, that's a little long, but...)



On Aug 28, 2013, at 12:23 PM, <svn-commit-mai...@open-mpi.org> wrote:

> Author: jladd (Joshua Ladd)
> Date: 2013-08-28 12:23:33 EDT (Wed, 28 Aug 2013) New Revision: 29079
> URL: https://svn.open-mpi.org/trac/ompi/changeset/29079
> 
> Log:
> Add support for autodetecting an MLNX HCA in the rmaps min distance 
> feature. In this way, .ini files distributed with software stacks need 
> not specify a particular HCA but instead may select the keyword auto, 
> which will automatically select the discovered device. To use this 
> feature, simply pass the keyword auto instead of a specific device 
> name: --mca rmaps_base_dist_hca auto. If more than one card is 
> installed, the mapper will inform the user of this and, at this point, 
> the user will then need to specify which card via the normal route, 
> e.g. --mca rmaps_base_dist_hca <dev_name>. This should be added to 
> \ncmr=v1.7.4:reviewer=rhc:subject=Autodetect logic for min dist 
> mapping
> 
> Text files modified: 
>   trunk/opal/mca/hwloc/base/base.h                    |     4 ++--
>   trunk/opal/mca/hwloc/base/hwloc_base_util.c         |    40 ++++++++++++++++++++++++++++++++++++----
>   trunk/orte/mca/rmaps/mindist/help-orte-rmaps-md.txt |     8 ++++++++
>   trunk/orte/mca/rmaps/mindist/rmaps_mindist_module.c |    11 +++++++++--
>   4 files changed, 55 insertions(+), 8 deletions(-)
> 
> Modified: trunk/opal/mca/hwloc/base/base.h 
> ==============================================================================
> --- trunk/opal/mca/hwloc/base/base.h  Wed Aug 28 12:03:23 2013        (r29078)
> +++ trunk/opal/mca/hwloc/base/base.h  2013-08-28 12:23:33 EDT (Wed, 28 Aug 
> 2013)      (r29079)
> @@ -169,8 +169,8 @@
>                                                        hwloc_obj_t obj,
>                                                        
> opal_hwloc_resource_type_t rtype);
> 
> -OPAL_DECLSPEC void opal_hwloc_get_sorted_numa_list(hwloc_topology_t topo, 
> -                                    const char* device_name, 
> +OPAL_DECLSPEC int opal_hwloc_get_sorted_numa_list(hwloc_topology_t topo, 
> +                                    char* device_name,
>                                     opal_list_t *sorted_list);
> 
> /**
> 
> Modified: trunk/opal/mca/hwloc/base/hwloc_base_util.c
> ==============================================================================
> --- trunk/opal/mca/hwloc/base/hwloc_base_util.c       Wed Aug 28 12:03:23 
> 2013        (r29078)
> +++ trunk/opal/mca/hwloc/base/hwloc_base_util.c       2013-08-28 12:23:33 EDT 
> (Wed, 28 Aug 2013)      (r29079)
> @@ -1729,7 +1729,7 @@
>     }
> }
> 
> -static void sort_by_dist(hwloc_topology_t topo, const char* 
> device_name, opal_list_t *sorted_list)
> +static void sort_by_dist(hwloc_topology_t topo, char* device_name, 
> +opal_list_t *sorted_list)
> {
>     hwloc_obj_t device_obj = NULL;
>     hwloc_obj_t obj = NULL, root = NULL; @@ -1751,6 +1751,9 @@
>                     obj = obj->parent;
>                 }
>                 if (obj == NULL) {
> +                    opal_output_verbose(5, 
> opal_hwloc_base_framework.framework_output,
> +                            "hwloc:base:get_sorted_numa_list: NUMA node 
> closest to %s wasn't found.",
> +                            device_name);
>                     return;
>                 } else {
>                     close_node_index = obj->logical_index; @@ -1762,6 
> +1765,8 @@
>                     /* we can try to find distances under group object. This 
> info can be there. */
>                     depth = hwloc_get_type_depth(topo, HWLOC_OBJ_NODE);
>                     if (depth < 0) {
> +                        opal_output_verbose(5, 
> opal_hwloc_base_framework.framework_output,
> +                                "hwloc:base:get_sorted_numa_list: 
> + There is no information about distances on the node.");
>                         return;
>                     }
>                     root = hwloc_get_root_obj(topo); @@ -1779,6 
> +1784,8 @@
>                 }
>                 /* find all distances for our close node with logical index = 
> close_node_index as close_node_index + nbobjs*j */
>                 if ((NULL == distances) || (0 == distances->nbobjs)) {
> +                    opal_output_verbose(5, 
> opal_hwloc_base_framework.framework_output,
> +                            "hwloc:base:get_sorted_numa_list: There 
> + is no information about distances on the node.");
>                     return;
>                 }
>                 /* fill list of numa nodes */ @@ -1797,13 +1804,28 @@
>     }
> }
> 
> -void opal_hwloc_get_sorted_numa_list(hwloc_topology_t topo, const 
> char* device_name, opal_list_t *sorted_list)
> +static int find_devices(hwloc_topology_t topo, char* device_name) {
> +    hwloc_obj_t device_obj = NULL;
> +    int count = 0;
> +    for (device_obj = hwloc_get_obj_by_type(topo, HWLOC_OBJ_OS_DEVICE, 0); 
> device_obj; device_obj = hwloc_get_next_osdev(topo, device_obj)) {
> +        if (device_obj->attr->osdev.type == HWLOC_OBJ_OSDEV_OPENFABRICS) {
> +            count++;
> +            free(device_name);
> +            device_name = strdup(device_obj->name);
> +        }
> +    }
> +    return count;
> +}
> +
> +int opal_hwloc_get_sorted_numa_list(hwloc_topology_t topo, char* 
> +device_name, opal_list_t *sorted_list)
> {
>     hwloc_obj_t obj;
>     opal_list_item_t *item;
>     opal_hwloc_summary_t *sum;
>     opal_hwloc_topo_data_t *data;
>     orte_rmaps_numa_node_t *numa, *copy_numa;
> +    int count;
> 
>     obj = hwloc_get_root_obj(topo);
> 
> @@ -1823,9 +1845,19 @@
>                         copy_numa->dist_from_closed = numa->dist_from_closed;
>                         opal_list_append(sorted_list, &copy_numa->super);
>                     }
> -                    return;
> +                    return 0;
>                 }else {
>                     /* don't already know it - go get it */
> +                    /* firstly we check if we need to autodetect OpenFabrics 
>  devices or we have the specified one */
> +                    if (!strcmp(device_name, "auto")) {
> +                        count = find_devices(topo, device_name);
> +                       if (count > 1) {
> +                           return count;
> +                       }
> +                    }
> +                    if (!device_name || (strlen(device_name) == 0)) {
> +                        return 1;
> +                    }
>                     sort_by_dist(topo, device_name, sorted_list);
>                     /* store this info in summary object for later usage */
>                     OPAL_LIST_FOREACH(numa, sorted_list, 
> orte_rmaps_numa_node_t) { @@ -1834,7 +1866,7 @@
>                         copy_numa->dist_from_closed = numa->dist_from_closed;
>                         opal_list_append(&(sum->sorted_by_dist_list), 
> &copy_numa->super);
>                     }
> -                    return;
> +                    return 0;
>                 }
>             }
>         }
> 
> Modified: trunk/orte/mca/rmaps/mindist/help-orte-rmaps-md.txt
> ==============================================================================
> --- trunk/orte/mca/rmaps/mindist/help-orte-rmaps-md.txt       Wed Aug 28 
> 12:03:23 2013        (r29078)
> +++ trunk/orte/mca/rmaps/mindist/help-orte-rmaps-md.txt       2013-08-28 
> 12:23:33 EDT (Wed, 28 Aug 2013)      (r29079)
> @@ -29,3 +29,11 @@
>   Node: %s
> 
> Open MPI therefore cannot mapp the application as specified.
> +#
> +[orte-rmaps-mindist:several-hca-devices]
> +There are several OpenFabrics devices found on at least one node. Please 
> specify the definite one.
> +
> +  Devices: %d
> +  Node: %s
> +
> +Open MPI therefore cannot mapp the application as specified.
> 
> Modified: trunk/orte/mca/rmaps/mindist/rmaps_mindist_module.c
> ==============================================================================
> --- trunk/orte/mca/rmaps/mindist/rmaps_mindist_module.c       Wed Aug 28 
> 12:03:23 2013        (r29078)
> +++ trunk/orte/mca/rmaps/mindist/rmaps_mindist_module.c       2013-08-28 
> 12:23:33 EDT (Wed, 28 Aug 2013)      (r29079)
> @@ -71,6 +71,7 @@
>     mca_base_component_t *c = &mca_rmaps_mindist_component.base_version;
>     bool initial_map=true;
>     bool bynode = false;
> +    int ret;
> 
>     /* this mapper can only handle initial launch
>      * when mindist mapping is desired @@ -245,7 +246,13 @@
>              * so we call opal_hwloc_base_get_nbobjs_by_type */
>             opal_hwloc_base_get_nbobjs_by_type(node->topology, 
> HWLOC_OBJ_NODE, 0, OPAL_HWLOC_AVAILABLE);
>             OBJ_CONSTRUCT(&numa_list, opal_list_t);
> -            opal_hwloc_get_sorted_numa_list(node->topology, 
> orte_rmaps_base.device, &numa_list);
> +            ret = opal_hwloc_get_sorted_numa_list(node->topology, 
> orte_rmaps_base.device, &numa_list);
> +            if (ret > 1) {
> +                orte_show_help("help-orte-rmaps-md.txt", 
> "orte-rmaps-mindist:several-hca-devices",
> +                        true, ret, node->name);
> +                rc = ORTE_ERR_SILENT;
> +                goto error;
> +            }
>             if (opal_list_get_size(&numa_list) > 0) {
>                 j = 0;
>                 required = 0;
> @@ -390,7 +397,7 @@
>         }
>         OBJ_DESTRUCT(&node_list);
>     }
> -
> +    free(orte_rmaps_base.device);
>     return ORTE_SUCCESS;
> 
> error:
> _______________________________________________
> svn-full mailing list
> svn-f...@open-mpi.org
> http://www.open-mpi.org/mailman/listinfo.cgi/svn-full


--
Jeff Squyres
jsquy...@cisco.com
For corporate legal information go to: 
http://www.cisco.com/web/about/doing_business/legal/cri/

_______________________________________________
devel mailing list
de...@open-mpi.org
http://www.open-mpi.org/mailman/listinfo.cgi/devel

Reply via email to