On Oct 18, 2011, at 7:35 AM, TERRY DONTJE wrote:

>> Strange - it ran fine for me on multiple tests. I'll check to see if
>> something strange got into the mix and recommit.
>>
> Not sure it is the same issue but it looks like all my MTT tests on the trunk
> r25308 are timing out.
Okay - sorry about that. I'm looking into it now. I tested it with a multi-node setup, but it's always possible that something got in there after the tests (and sounds like it did). > --td > >> On Oct 17, 2011, at 8:51 PM, George Bosilca wrote: >> >>> This commit put the mpirun process in an infinite loop for the simple case >>> mpirun -np 2 --mca orte_default_hostfile machinefile --bynode *my_app* >>> >>> george. >>> >>> On Oct 17, 2011, at 15:49 , r...@osl.iu.edu wrote: >>> >>>> Author: rhc >>>> Date: 2011-10-17 15:49:04 EDT (Mon, 17 Oct 2011) >>>> New Revision: 25302 >>>> URL: https://svn.open-mpi.org/trac/ompi/changeset/25302 >>>> >>>> Log: >>>> Fix the mapping algo for computing vpids - it was borked for bynode >>>> operations when using nperxxx directives >>>> >>>> Text files modified: >>>> trunk/orte/mca/rmaps/base/rmaps_base_support_fns.c | 67 >>>> ++++++++++++++++++++------------------- >>>> 1 files changed, 34 insertions(+), 33 deletions(-) >>>> >>>> Modified: trunk/orte/mca/rmaps/base/rmaps_base_support_fns.c >>>> ============================================================================== >>>> --- trunk/orte/mca/rmaps/base/rmaps_base_support_fns.c (original) >>>> +++ trunk/orte/mca/rmaps/base/rmaps_base_support_fns.c 2011-10-17 >>>> 15:49:04 EDT (Mon, 17 Oct 2011) >>>> @@ -527,7 +527,7 @@ >>>> int orte_rmaps_base_compute_vpids(orte_job_t *jdata) >>>> { >>>> orte_job_map_t *map; >>>> - orte_vpid_t vpid; >>>> + orte_vpid_t vpid, cnt; >>>> int i, j; >>>> orte_node_t *node; >>>> orte_proc_t *proc; >>>> @@ -539,6 +539,7 @@ >>>> ORTE_MAPPING_BYSOCKET & map->policy || >>>> ORTE_MAPPING_BYBOARD & map->policy) { >>>> /* assign the ranks sequentially */ >>>> + vpid = 0; >>>> for (i=0; i < map->nodes->size; i++) { >>>> if (NULL == (node = >>>> (orte_node_t*)opal_pointer_array_get_item(map->nodes, i))) { >>>> continue; >>>> @@ -553,12 +554,10 @@ >>>> } >>>> if (ORTE_VPID_INVALID == proc->name.vpid) { >>>> /* find the next available vpid */ >>>> - for (vpid=0; 
vpid < jdata->num_procs; vpid++) { >>>> - if (NULL == >>>> opal_pointer_array_get_item(jdata->procs, vpid)) { >>>> - break; >>>> - } >>>> + while (NULL != >>>> opal_pointer_array_get_item(jdata->procs, vpid)) { >>>> + vpid++; >>>> } >>>> - proc->name.vpid = vpid; >>>> + proc->name.vpid = vpid++; >>>> ORTE_EPOCH_SET(proc->name.epoch,ORTE_EPOCH_INVALID); >>>> >>>> ORTE_EPOCH_SET(proc->name.epoch,orte_ess.proc_get_epoch(&proc->name)); >>>> >>>> @@ -580,39 +579,41 @@ >>>> >>>> if (ORTE_MAPPING_BYNODE & map->policy) { >>>> /* assign the ranks round-robin across nodes */ >>>> - for (i=0; i < map->nodes->size; i++) { >>>> - if (NULL == (node = >>>> (orte_node_t*)opal_pointer_array_get_item(map->nodes, i))) { >>>> - continue; >>>> - } >>>> - for (j=0; j < node->procs->size; j++) { >>>> - if (NULL == (proc = >>>> (orte_proc_t*)opal_pointer_array_get_item(node->procs, j))) { >>>> + cnt = 0; >>>> + vpid = 0; >>>> + do { >>>> + for (i=0; i < map->nodes->size; i++) { >>>> + if (NULL == (node = >>>> (orte_node_t*)opal_pointer_array_get_item(map->nodes, i))) { >>>> continue; >>>> } >>>> - /* ignore procs from other jobs */ >>>> - if (proc->name.jobid != jdata->jobid) { >>>> - continue; >>>> - } >>>> - if (ORTE_VPID_INVALID == proc->name.vpid) { >>>> - /* find the next available vpid */ >>>> - vpid = i; >>>> - while (NULL != >>>> opal_pointer_array_get_item(jdata->procs, vpid)) { >>>> - vpid += map->num_nodes; >>>> - if (jdata->num_procs <= vpid) { >>>> - vpid = vpid - jdata->num_procs; >>>> + for (j=0; j < node->procs->size; j++) { >>>> + if (NULL == (proc = >>>> (orte_proc_t*)opal_pointer_array_get_item(node->procs, j))) { >>>> + continue; >>>> + } >>>> + /* ignore procs from other jobs */ >>>> + if (proc->name.jobid != jdata->jobid) { >>>> + continue; >>>> + } >>>> + if (ORTE_VPID_INVALID == proc->name.vpid) { >>>> + /* find next available vpid */ >>>> + while (NULL != >>>> opal_pointer_array_get_item(jdata->procs, vpid)) { >>>> + vpid++; >>>> + } >>>> + proc->name.vpid = 
vpid++; >>>> + >>>> ORTE_EPOCH_SET(proc->name.epoch,ORTE_EPOCH_INVALID); >>>> + >>>> ORTE_EPOCH_SET(proc->name.epoch,orte_ess.proc_get_epoch(&proc->name)); >>>> + if (ORTE_SUCCESS != (rc = >>>> opal_pointer_array_set_item(jdata->procs, >>>> + >>>> proc->name.vpid, proc))) { >>>> + ORTE_ERROR_LOG(rc); >>>> + return rc; >>>> } >>>> + cnt++; >>>> + break; /* move to next node */ >>>> } >>>> - proc->name.vpid = vpid; >>>> - ORTE_EPOCH_SET(proc->name.epoch,ORTE_EPOCH_INVALID); >>>> - >>>> ORTE_EPOCH_SET(proc->name.epoch,orte_ess.proc_get_epoch(&proc->name)); >>>> - } >>>> - if (NULL == opal_pointer_array_get_item(jdata->procs, >>>> proc->name.vpid)) { >>>> - if (ORTE_SUCCESS != (rc = >>>> opal_pointer_array_set_item(jdata->procs, proc->name.vpid, proc))) { >>>> - ORTE_ERROR_LOG(rc); >>>> - return rc; >>>> - } >>>> } >>>> } >>>> - } >>>> + } while (cnt < jdata->num_procs); >>>> + >>>> return ORTE_SUCCESS; >>>> } >>>> >>>> _______________________________________________ >>>> svn mailing list >>>> s...@open-mpi.org >>>> http://www.open-mpi.org/mailman/listinfo.cgi/svn >>> >>> _______________________________________________ >>> devel mailing list >>> de...@open-mpi.org >>> http://www.open-mpi.org/mailman/listinfo.cgi/devel >> >> _______________________________________________ >> devel mailing list >> de...@open-mpi.org >> http://www.open-mpi.org/mailman/listinfo.cgi/devel > > -- > <Mail Attachment.gif> > Terry D. Dontje | Principal Software Engineer > Developer Tools Engineering | +1.781.442.2631 > Oracle - Performance Technologies > 95 Network Drive, Burlington, MA 01803 > Email terry.don...@oracle.com > > > > _______________________________________________ > devel mailing list > de...@open-mpi.org > http://www.open-mpi.org/mailman/listinfo.cgi/devel