Please also change the name - it is being declared in a component, and thus
"base" should not be in the name.

On Aug 28, 2013, at 9:40 AM, Nathan Hjelm <hje...@lanl.gov> wrote:

> Would something even more generic, such as rmaps_base_dist_network_device,
> be better? We could update the code to detect Gemini or Ares, for example.
> 
> -Nathan
> 
> On Wed, Aug 28, 2013 at 04:36:22PM +0000, Jeff Squyres (jsquyres) wrote:
>> Can we rename rmaps_base_dist_hca to something that is less specific to IB?
>> 
>> E.g., rmaps_base_dist_verbs_device?  (admittedly, that's a little long, 
>> but...)
>> 
>> 
>> 
>> On Aug 28, 2013, at 12:23 PM, <svn-commit-mai...@open-mpi.org> wrote:
>> 
>>> Author: jladd (Joshua Ladd)
>>> Date: 2013-08-28 12:23:33 EDT (Wed, 28 Aug 2013)
>>> New Revision: 29079
>>> URL: https://svn.open-mpi.org/trac/ompi/changeset/29079
>>> 
>>> Log:
>>> Add support for autodetecting a MLNX HCA in the rmaps min distance feature. 
>>> In this way, .ini files distributed with software stacks need not specify a 
>>> particular HCA but instead may select the key word auto which will 
>>> automatically select the discovered device. To use this feature, simply 
>>> pass the keyword auto instead of a specific device name, --mca 
>>> rmaps_base_dist_hca auto. If more than one card is installed, the mapper 
>>> will inform the user of this and, at this point, the user will then need to 
>>> specify which card via the normal route, e.g. --mca rmaps_base_dist_hca 
>>> <dev_name>. This should be added to 
>>> \ncmr=v1.7.4:reviewer=rhc:subject=Autodetect logic for min dist mapping
>>> 
>>> Text files modified: 
>>>  trunk/opal/mca/hwloc/base/base.h                    |     4 ++--           
>>>                          
>>>  trunk/opal/mca/hwloc/base/hwloc_base_util.c         |    40 
>>> ++++++++++++++++++++++++++++++++++++----
>>>  trunk/orte/mca/rmaps/mindist/help-orte-rmaps-md.txt |     8 ++++++++       
>>>                          
>>>  trunk/orte/mca/rmaps/mindist/rmaps_mindist_module.c |    11 +++++++++--    
>>>                          
>>>  4 files changed, 55 insertions(+), 8 deletions(-)
>>> 
>>> Modified: trunk/opal/mca/hwloc/base/base.h
>>> ==============================================================================
>>> --- trunk/opal/mca/hwloc/base/base.h        Wed Aug 28 12:03:23 2013        
>>> (r29078)
>>> +++ trunk/opal/mca/hwloc/base/base.h        2013-08-28 12:23:33 EDT (Wed, 
>>> 28 Aug 2013)      (r29079)
>>> @@ -169,8 +169,8 @@
>>>                                                       hwloc_obj_t obj,
>>>                                                       
>>> opal_hwloc_resource_type_t rtype);
>>> 
>>> -OPAL_DECLSPEC void opal_hwloc_get_sorted_numa_list(hwloc_topology_t topo, 
>>> -                                    const char* device_name, 
>>> +OPAL_DECLSPEC int opal_hwloc_get_sorted_numa_list(hwloc_topology_t topo, 
>>> +                                    char* device_name, 
>>>                                    opal_list_t *sorted_list);
>>> 
>>> /**
>>> 
>>> Modified: trunk/opal/mca/hwloc/base/hwloc_base_util.c
>>> ==============================================================================
>>> --- trunk/opal/mca/hwloc/base/hwloc_base_util.c     Wed Aug 28 12:03:23 
>>> 2013        (r29078)
>>> +++ trunk/opal/mca/hwloc/base/hwloc_base_util.c     2013-08-28 12:23:33 EDT 
>>> (Wed, 28 Aug 2013)      (r29079)
>>> @@ -1729,7 +1729,7 @@
>>>    }
>>> }
>>> 
>>> -static void sort_by_dist(hwloc_topology_t topo, const char* device_name, 
>>> opal_list_t *sorted_list)
>>> +static void sort_by_dist(hwloc_topology_t topo, char* device_name, 
>>> opal_list_t *sorted_list)
>>> {
>>>    hwloc_obj_t device_obj = NULL;
>>>    hwloc_obj_t obj = NULL, root = NULL;
>>> @@ -1751,6 +1751,9 @@
>>>                    obj = obj->parent;
>>>                }
>>>                if (obj == NULL) {
>>> +                    opal_output_verbose(5, 
>>> opal_hwloc_base_framework.framework_output,
>>> +                            "hwloc:base:get_sorted_numa_list: NUMA node 
>>> closest to %s wasn't found.",
>>> +                            device_name);
>>>                    return;
>>>                } else {
>>>                    close_node_index = obj->logical_index;
>>> @@ -1762,6 +1765,8 @@
>>>                    /* we can try to find distances under group object. This 
>>> info can be there. */
>>>                    depth = hwloc_get_type_depth(topo, HWLOC_OBJ_NODE);
>>>                    if (depth < 0) {
>>> +                        opal_output_verbose(5, 
>>> opal_hwloc_base_framework.framework_output,
>>> +                                "hwloc:base:get_sorted_numa_list: There is 
>>> no information about distances on the node.");
>>>                        return;
>>>                    }
>>>                    root = hwloc_get_root_obj(topo);
>>> @@ -1779,6 +1784,8 @@
>>>                }
>>>                /* find all distances for our close node with logical index 
>>> = close_node_index as close_node_index + nbobjs*j */
>>>                if ((NULL == distances) || (0 == distances->nbobjs)) {
>>> +                    opal_output_verbose(5, 
>>> opal_hwloc_base_framework.framework_output,
>>> +                            "hwloc:base:get_sorted_numa_list: There is no 
>>> information about distances on the node.");
>>>                    return;
>>>                }
>>>                /* fill list of numa nodes */
>>> @@ -1797,13 +1804,28 @@
>>>    }
>>> }
>>> 
>>> -void opal_hwloc_get_sorted_numa_list(hwloc_topology_t topo, const char* 
>>> device_name, opal_list_t *sorted_list)
>>> +static int find_devices(hwloc_topology_t topo, char* device_name) 
>>> +{
>>> +    hwloc_obj_t device_obj = NULL;
>>> +    int count = 0;
>>> +    for (device_obj = hwloc_get_obj_by_type(topo, HWLOC_OBJ_OS_DEVICE, 0); 
>>> device_obj; device_obj = hwloc_get_next_osdev(topo, device_obj)) {
>>> +        if (device_obj->attr->osdev.type == HWLOC_OBJ_OSDEV_OPENFABRICS) {
>>> +            count++;
>>> +            free(device_name);
>>> +            device_name = strdup(device_obj->name);
>>> +        }
>>> +    }
>>> +    return count;
>>> +}
>>> +
>>> +int opal_hwloc_get_sorted_numa_list(hwloc_topology_t topo, char* 
>>> device_name, opal_list_t *sorted_list)
>>> {
>>>    hwloc_obj_t obj;
>>>    opal_list_item_t *item;
>>>    opal_hwloc_summary_t *sum;
>>>    opal_hwloc_topo_data_t *data;
>>>    orte_rmaps_numa_node_t *numa, *copy_numa;
>>> +    int count;
>>> 
>>>    obj = hwloc_get_root_obj(topo);
>>> 
>>> @@ -1823,9 +1845,19 @@
>>>                        copy_numa->dist_from_closed = numa->dist_from_closed;
>>>                        opal_list_append(sorted_list, &copy_numa->super);
>>>                    }
>>> -                    return;
>>> +                    return 0;
>>>                }else {
>>>                    /* don't already know it - go get it */
>>> +                    /* firstly we check if we need to autodetect 
>>> OpenFabrics  devices or we have the specified one */
>>> +                    if (!strcmp(device_name, "auto")) {
>>> +                        count = find_devices(topo, device_name);
>>> +                       if (count > 1) {
>>> +                           return count;
>>> +                       }
>>> +                    }
>>> +                    if (!device_name || (strlen(device_name) == 0)) {
>>> +                        return 1;
>>> +                    }
>>>                    sort_by_dist(topo, device_name, sorted_list);
>>>                    /* store this info in summary object for later usage */
>>>                    OPAL_LIST_FOREACH(numa, sorted_list, 
>>> orte_rmaps_numa_node_t) {
>>> @@ -1834,7 +1866,7 @@
>>>                        copy_numa->dist_from_closed = numa->dist_from_closed;
>>>                        opal_list_append(&(sum->sorted_by_dist_list), 
>>> &copy_numa->super);
>>>                    }
>>> -                    return;
>>> +                    return 0;
>>>                }
>>>            }
>>>        }
>>> 
>>> Modified: trunk/orte/mca/rmaps/mindist/help-orte-rmaps-md.txt
>>> ==============================================================================
>>> --- trunk/orte/mca/rmaps/mindist/help-orte-rmaps-md.txt     Wed Aug 28 
>>> 12:03:23 2013        (r29078)
>>> +++ trunk/orte/mca/rmaps/mindist/help-orte-rmaps-md.txt     2013-08-28 
>>> 12:23:33 EDT (Wed, 28 Aug 2013)      (r29079)
>>> @@ -29,3 +29,11 @@
>>>  Node: %s
>>> 
>>> Open MPI therefore cannot mapp the application as specified.
>>> +#
>>> +[orte-rmaps-mindist:several-hca-devices]
>>> +There are several OpenFabrics devices found on at least one node. Please 
>>> specify the definite one.
>>> +
>>> +  Devices: %d
>>> +  Node: %s
>>> +
>>> +Open MPI therefore cannot mapp the application as specified.
>>> 
>>> Modified: trunk/orte/mca/rmaps/mindist/rmaps_mindist_module.c
>>> ==============================================================================
>>> --- trunk/orte/mca/rmaps/mindist/rmaps_mindist_module.c     Wed Aug 28 
>>> 12:03:23 2013        (r29078)
>>> +++ trunk/orte/mca/rmaps/mindist/rmaps_mindist_module.c     2013-08-28 
>>> 12:23:33 EDT (Wed, 28 Aug 2013)      (r29079)
>>> @@ -71,6 +71,7 @@
>>>    mca_base_component_t *c = &mca_rmaps_mindist_component.base_version;
>>>    bool initial_map=true;
>>>    bool bynode = false;
>>> +    int ret;
>>> 
>>>    /* this mapper can only handle initial launch
>>>     * when mindist mapping is desired
>>> @@ -245,7 +246,13 @@
>>>             * so we call opal_hwloc_base_get_nbobjs_by_type */
>>>            opal_hwloc_base_get_nbobjs_by_type(node->topology, 
>>> HWLOC_OBJ_NODE, 0, OPAL_HWLOC_AVAILABLE);
>>>            OBJ_CONSTRUCT(&numa_list, opal_list_t);
>>> -            opal_hwloc_get_sorted_numa_list(node->topology, 
>>> orte_rmaps_base.device, &numa_list);
>>> +            ret = opal_hwloc_get_sorted_numa_list(node->topology, 
>>> orte_rmaps_base.device, &numa_list);
>>> +            if (ret > 1) {
>>> +                orte_show_help("help-orte-rmaps-md.txt", 
>>> "orte-rmaps-mindist:several-hca-devices",
>>> +                        true, ret, node->name);
>>> +                rc = ORTE_ERR_SILENT;
>>> +                goto error;
>>> +            }
>>>            if (opal_list_get_size(&numa_list) > 0) {
>>>                j = 0;
>>>                required = 0;
>>> @@ -390,7 +397,7 @@
>>>        }
>>>        OBJ_DESTRUCT(&node_list);
>>>    }
>>> -
>>> +    free(orte_rmaps_base.device);
>>>    return ORTE_SUCCESS;
>>> 
>>> error:
>>> _______________________________________________
>>> svn-full mailing list
>>> svn-f...@open-mpi.org
>>> http://www.open-mpi.org/mailman/listinfo.cgi/svn-full
>> 
>> 
>> -- 
>> Jeff Squyres
>> jsquy...@cisco.com
>> For corporate legal information go to: 
>> http://www.cisco.com/web/about/doing_business/legal/cri/
>> 
>> _______________________________________________
>> devel mailing list
>> de...@open-mpi.org
>> http://www.open-mpi.org/mailman/listinfo.cgi/devel
> _______________________________________________
> devel mailing list
> de...@open-mpi.org
> http://www.open-mpi.org/mailman/listinfo.cgi/devel

Reply via email to