Please also change the name - it is being declared in a component, and thus "base" should not be in the name.
On Aug 28, 2013, at 9:40 AM, Nathan Hjelm <hje...@lanl.gov> wrote: > Would something even more generic such as rmaps_base_dist_network_device > be better. Could update the code to detect Gemini or Ares for example. > > -Nathan > > On Wed, Aug 28, 2013 at 04:36:22PM +0000, Jeff Squyres (jsquyres) wrote: >> Can we rename rmaps_base_dist_hca to something that is less specific to IB? >> >> E.g., rmaps_base_dist_verbs_device? (admittedly, that's a little long, >> but...) >> >> >> >> On Aug 28, 2013, at 12:23 PM, <svn-commit-mai...@open-mpi.org> wrote: >> >>> Author: jladd (Joshua Ladd) >>> Date: 2013-08-28 12:23:33 EDT (Wed, 28 Aug 2013) >>> New Revision: 29079 >>> URL: https://svn.open-mpi.org/trac/ompi/changeset/29079 >>> >>> Log: >>> Add support for autodetecting a MLNX HCA in the rmaps min distance feature. >>> In this way, .ini files distributed with software stacks need not specify a >>> particular HCA but instead may select the key word auto which will >>> automatically select the discovered device. To use this feature, simply >>> pass the keyword auto instead of a specific device name, --mca >>> rmaps_base_dist_hca auto. If more than one card is installed, the mapper >>> will inform the user of this and, at this point, the user will then need to >>> specify which card via the normal route, e.g. --mca rmaps_base_dist_hca >>> <dev_name>. 
This should be added to >>> \ncmr=v1.7.4:reviewer=rhc:subject=Autodetect logic for min dist mapping >>> >>> Text files modified: >>> trunk/opal/mca/hwloc/base/base.h | 4 ++-- >>> >>> trunk/opal/mca/hwloc/base/hwloc_base_util.c | 40 >>> ++++++++++++++++++++++++++++++++++++---- >>> trunk/orte/mca/rmaps/mindist/help-orte-rmaps-md.txt | 8 ++++++++ >>> >>> trunk/orte/mca/rmaps/mindist/rmaps_mindist_module.c | 11 +++++++++-- >>> >>> 4 files changed, 55 insertions(+), 8 deletions(-) >>> >>> Modified: trunk/opal/mca/hwloc/base/base.h >>> ============================================================================== >>> --- trunk/opal/mca/hwloc/base/base.h Wed Aug 28 12:03:23 2013 >>> (r29078) >>> +++ trunk/opal/mca/hwloc/base/base.h 2013-08-28 12:23:33 EDT (Wed, >>> 28 Aug 2013) (r29079) >>> @@ -169,8 +169,8 @@ >>> hwloc_obj_t obj, >>> >>> opal_hwloc_resource_type_t rtype); >>> >>> -OPAL_DECLSPEC void opal_hwloc_get_sorted_numa_list(hwloc_topology_t topo, >>> - const char* device_name, >>> +OPAL_DECLSPEC int opal_hwloc_get_sorted_numa_list(hwloc_topology_t topo, >>> + char* device_name, >>> opal_list_t *sorted_list); >>> >>> /** >>> >>> Modified: trunk/opal/mca/hwloc/base/hwloc_base_util.c >>> ============================================================================== >>> --- trunk/opal/mca/hwloc/base/hwloc_base_util.c Wed Aug 28 12:03:23 >>> 2013 (r29078) >>> +++ trunk/opal/mca/hwloc/base/hwloc_base_util.c 2013-08-28 12:23:33 EDT >>> (Wed, 28 Aug 2013) (r29079) >>> @@ -1729,7 +1729,7 @@ >>> } >>> } >>> >>> -static void sort_by_dist(hwloc_topology_t topo, const char* device_name, >>> opal_list_t *sorted_list) >>> +static void sort_by_dist(hwloc_topology_t topo, char* device_name, >>> opal_list_t *sorted_list) >>> { >>> hwloc_obj_t device_obj = NULL; >>> hwloc_obj_t obj = NULL, root = NULL; >>> @@ -1751,6 +1751,9 @@ >>> obj = obj->parent; >>> } >>> if (obj == NULL) { >>> + opal_output_verbose(5, >>> opal_hwloc_base_framework.framework_output, >>> + 
"hwloc:base:get_sorted_numa_list: NUMA node >>> closest to %s wasn't found.", >>> + device_name); >>> return; >>> } else { >>> close_node_index = obj->logical_index; >>> @@ -1762,6 +1765,8 @@ >>> /* we can try to find distances under group object. This >>> info can be there. */ >>> depth = hwloc_get_type_depth(topo, HWLOC_OBJ_NODE); >>> if (depth < 0) { >>> + opal_output_verbose(5, >>> opal_hwloc_base_framework.framework_output, >>> + "hwloc:base:get_sorted_numa_list: There is >>> no information about distances on the node."); >>> return; >>> } >>> root = hwloc_get_root_obj(topo); >>> @@ -1779,6 +1784,8 @@ >>> } >>> /* find all distances for our close node with logical index >>> = close_node_index as close_node_index + nbobjs*j */ >>> if ((NULL == distances) || (0 == distances->nbobjs)) { >>> + opal_output_verbose(5, >>> opal_hwloc_base_framework.framework_output, >>> + "hwloc:base:get_sorted_numa_list: There is no >>> information about distances on the node."); >>> return; >>> } >>> /* fill list of numa nodes */ >>> @@ -1797,13 +1804,28 @@ >>> } >>> } >>> >>> -void opal_hwloc_get_sorted_numa_list(hwloc_topology_t topo, const char* >>> device_name, opal_list_t *sorted_list) >>> +static int find_devices(hwloc_topology_t topo, char* device_name) >>> +{ >>> + hwloc_obj_t device_obj = NULL; >>> + int count = 0; >>> + for (device_obj = hwloc_get_obj_by_type(topo, HWLOC_OBJ_OS_DEVICE, 0); >>> device_obj; device_obj = hwloc_get_next_osdev(topo, device_obj)) { >>> + if (device_obj->attr->osdev.type == HWLOC_OBJ_OSDEV_OPENFABRICS) { >>> + count++; >>> + free(device_name); >>> + device_name = strdup(device_obj->name); >>> + } >>> + } >>> + return count; >>> +} >>> + >>> +int opal_hwloc_get_sorted_numa_list(hwloc_topology_t topo, char* >>> device_name, opal_list_t *sorted_list) >>> { >>> hwloc_obj_t obj; >>> opal_list_item_t *item; >>> opal_hwloc_summary_t *sum; >>> opal_hwloc_topo_data_t *data; >>> orte_rmaps_numa_node_t *numa, *copy_numa; >>> + int count; >>> >>> obj = 
hwloc_get_root_obj(topo); >>> >>> @@ -1823,9 +1845,19 @@ >>> copy_numa->dist_from_closed = numa->dist_from_closed; >>> opal_list_append(sorted_list, &copy_numa->super); >>> } >>> - return; >>> + return 0; >>> }else { >>> /* don't already know it - go get it */ >>> + /* firstly we check if we need to autodetect >>> OpenFabrics devices or we have the specified one */ >>> + if (!strcmp(device_name, "auto")) { >>> + count = find_devices(topo, device_name); >>> + if (count > 1) { >>> + return count; >>> + } >>> + } >>> + if (!device_name || (strlen(device_name) == 0)) { >>> + return 1; >>> + } >>> sort_by_dist(topo, device_name, sorted_list); >>> /* store this info in summary object for later usage */ >>> OPAL_LIST_FOREACH(numa, sorted_list, >>> orte_rmaps_numa_node_t) { >>> @@ -1834,7 +1866,7 @@ >>> copy_numa->dist_from_closed = numa->dist_from_closed; >>> opal_list_append(&(sum->sorted_by_dist_list), >>> &copy_numa->super); >>> } >>> - return; >>> + return 0; >>> } >>> } >>> } >>> >>> Modified: trunk/orte/mca/rmaps/mindist/help-orte-rmaps-md.txt >>> ============================================================================== >>> --- trunk/orte/mca/rmaps/mindist/help-orte-rmaps-md.txt Wed Aug 28 >>> 12:03:23 2013 (r29078) >>> +++ trunk/orte/mca/rmaps/mindist/help-orte-rmaps-md.txt 2013-08-28 >>> 12:23:33 EDT (Wed, 28 Aug 2013) (r29079) >>> @@ -29,3 +29,11 @@ >>> Node: %s >>> >>> Open MPI therefore cannot mapp the application as specified. >>> +# >>> +[orte-rmaps-mindist:several-hca-devices] >>> +There are several OpenFabrics devices found on at least one node. Please >>> specify the definite one. >>> + >>> + Devices: %d >>> + Node: %s >>> + >>> +Open MPI therefore cannot mapp the application as specified. 
>>> >>> Modified: trunk/orte/mca/rmaps/mindist/rmaps_mindist_module.c >>> ============================================================================== >>> --- trunk/orte/mca/rmaps/mindist/rmaps_mindist_module.c Wed Aug 28 >>> 12:03:23 2013 (r29078) >>> +++ trunk/orte/mca/rmaps/mindist/rmaps_mindist_module.c 2013-08-28 >>> 12:23:33 EDT (Wed, 28 Aug 2013) (r29079) >>> @@ -71,6 +71,7 @@ >>> mca_base_component_t *c = &mca_rmaps_mindist_component.base_version; >>> bool initial_map=true; >>> bool bynode = false; >>> + int ret; >>> >>> /* this mapper can only handle initial launch >>> * when mindist mapping is desired >>> @@ -245,7 +246,13 @@ >>> * so we call opal_hwloc_base_get_nbobjs_by_type */ >>> opal_hwloc_base_get_nbobjs_by_type(node->topology, >>> HWLOC_OBJ_NODE, 0, OPAL_HWLOC_AVAILABLE); >>> OBJ_CONSTRUCT(&numa_list, opal_list_t); >>> - opal_hwloc_get_sorted_numa_list(node->topology, >>> orte_rmaps_base.device, &numa_list); >>> + ret = opal_hwloc_get_sorted_numa_list(node->topology, >>> orte_rmaps_base.device, &numa_list); >>> + if (ret > 1) { >>> + orte_show_help("help-orte-rmaps-md.txt", >>> "orte-rmaps-mindist:several-hca-devices", >>> + true, ret, node->name); >>> + rc = ORTE_ERR_SILENT; >>> + goto error; >>> + } >>> if (opal_list_get_size(&numa_list) > 0) { >>> j = 0; >>> required = 0; >>> @@ -390,7 +397,7 @@ >>> } >>> OBJ_DESTRUCT(&node_list); >>> } >>> - >>> + free(orte_rmaps_base.device); >>> return ORTE_SUCCESS; >>> >>> error: >>> _______________________________________________ >>> svn-full mailing list >>> svn-f...@open-mpi.org >>> http://www.open-mpi.org/mailman/listinfo.cgi/svn-full >> >> >> -- >> Jeff Squyres >> jsquy...@cisco.com >> For corporate legal information go to: >> http://www.cisco.com/web/about/doing_business/legal/cri/ >> >> _______________________________________________ >> devel mailing list >> de...@open-mpi.org >> http://www.open-mpi.org/mailman/listinfo.cgi/devel > _______________________________________________ > devel mailing list 
> de...@open-mpi.org > http://www.open-mpi.org/mailman/listinfo.cgi/devel