On Oct 18, 2011, at 7:35 AM, TERRY DONTJE wrote:

>> Strange - it ran fine for me on multiple tests. I'll check to see if 
>> something strange got into the mix and recommit.
>> 
> Not sure whether it is the same issue, but it looks like all my MTT tests on 
> the trunk r25308 are timing out.

Okay - sorry about that. I'm looking into it now. I tested it with a multi-node 
setup, but it's always possible that something got in there after the tests 
(and it sounds like it did).

> --td
> 
>> On Oct 17, 2011, at 8:51 PM, George Bosilca wrote:
>> 
>>> This commit put the mpirun process in an infinite loop for the simple case 
>>> mpirun -np 2 --mca orte_default_hostfile machinefile --bynode *my_app*
>>> 
>>>  george.
>>> 
>>> On Oct 17, 2011, at 15:49 , r...@osl.iu.edu wrote:
>>> 
>>>> Author: rhc
>>>> Date: 2011-10-17 15:49:04 EDT (Mon, 17 Oct 2011)
>>>> New Revision: 25302
>>>> URL: https://svn.open-mpi.org/trac/ompi/changeset/25302
>>>> 
>>>> Log:
>>>> Fix the mapping algo for computing vpids - it was borked for bynode 
>>>> operations when using nperxxx directives
>>>> 
>>>> Text files modified: 
>>>>  trunk/orte/mca/rmaps/base/rmaps_base_support_fns.c |    67 
>>>> ++++++++++++++++++++------------------- 
>>>>  1 files changed, 34 insertions(+), 33 deletions(-)
>>>> 
>>>> Modified: trunk/orte/mca/rmaps/base/rmaps_base_support_fns.c
>>>> ==============================================================================
>>>> --- trunk/orte/mca/rmaps/base/rmaps_base_support_fns.c     (original)
>>>> +++ trunk/orte/mca/rmaps/base/rmaps_base_support_fns.c     2011-10-17 
>>>> 15:49:04 EDT (Mon, 17 Oct 2011)
>>>> @@ -527,7 +527,7 @@
>>>> int orte_rmaps_base_compute_vpids(orte_job_t *jdata)
>>>> {
>>>>    orte_job_map_t *map;
>>>> -    orte_vpid_t vpid;
>>>> +    orte_vpid_t vpid, cnt;
>>>>    int i, j;
>>>>    orte_node_t *node;
>>>>    orte_proc_t *proc;
>>>> @@ -539,6 +539,7 @@
>>>>        ORTE_MAPPING_BYSOCKET & map->policy ||
>>>>        ORTE_MAPPING_BYBOARD & map->policy) {
>>>>        /* assign the ranks sequentially */
>>>> +        vpid = 0;
>>>>        for (i=0; i < map->nodes->size; i++) {
>>>>            if (NULL == (node = 
>>>> (orte_node_t*)opal_pointer_array_get_item(map->nodes, i))) {
>>>>                continue;
>>>> @@ -553,12 +554,10 @@
>>>>                }
>>>>                if (ORTE_VPID_INVALID == proc->name.vpid) {
>>>>                    /* find the next available vpid */
>>>> -                    for (vpid=0; vpid < jdata->num_procs; vpid++) {
>>>> -                        if (NULL == 
>>>> opal_pointer_array_get_item(jdata->procs, vpid)) {
>>>> -                            break;
>>>> -                        }
>>>> +                    while (NULL != 
>>>> opal_pointer_array_get_item(jdata->procs, vpid)) {
>>>> +                        vpid++;
>>>>                    }
>>>> -                    proc->name.vpid = vpid;
>>>> +                    proc->name.vpid = vpid++;
>>>>                    ORTE_EPOCH_SET(proc->name.epoch,ORTE_EPOCH_INVALID);
>>>>                    
>>>> ORTE_EPOCH_SET(proc->name.epoch,orte_ess.proc_get_epoch(&proc->name));
>>>> 
>>>> @@ -580,39 +579,41 @@
>>>> 
>>>>    if (ORTE_MAPPING_BYNODE & map->policy) {
>>>>        /* assign the ranks round-robin across nodes */
>>>> -        for (i=0; i < map->nodes->size; i++) {
>>>> -            if (NULL == (node = 
>>>> (orte_node_t*)opal_pointer_array_get_item(map->nodes, i))) {
>>>> -                continue;
>>>> -            }
>>>> -            for (j=0; j < node->procs->size; j++) {
>>>> -                if (NULL == (proc = 
>>>> (orte_proc_t*)opal_pointer_array_get_item(node->procs, j))) {
>>>> +        cnt = 0;
>>>> +        vpid = 0;
>>>> +        do {
>>>> +            for (i=0; i < map->nodes->size; i++) {
>>>> +                if (NULL == (node = 
>>>> (orte_node_t*)opal_pointer_array_get_item(map->nodes, i))) {
>>>>                    continue;
>>>>                }
>>>> -                /* ignore procs from other jobs */
>>>> -                if (proc->name.jobid != jdata->jobid) {
>>>> -                    continue;
>>>> -                }
>>>> -                if (ORTE_VPID_INVALID == proc->name.vpid) {
>>>> -                    /* find the next available vpid */
>>>> -                    vpid = i;
>>>> -                    while (NULL != 
>>>> opal_pointer_array_get_item(jdata->procs, vpid)) {
>>>> -                        vpid += map->num_nodes;
>>>> -                        if (jdata->num_procs <= vpid) {
>>>> -                            vpid = vpid - jdata->num_procs;
>>>> +                for (j=0; j < node->procs->size; j++) {
>>>> +                    if (NULL == (proc = 
>>>> (orte_proc_t*)opal_pointer_array_get_item(node->procs, j))) {
>>>> +                        continue;
>>>> +                    }
>>>> +                    /* ignore procs from other jobs */
>>>> +                    if (proc->name.jobid != jdata->jobid) {
>>>> +                        continue;
>>>> +                    }
>>>> +                    if (ORTE_VPID_INVALID == proc->name.vpid) {
>>>> +                        /* find next available vpid */
>>>> +                        while (NULL != 
>>>> opal_pointer_array_get_item(jdata->procs, vpid)) {
>>>> +                            vpid++;
>>>> +                        }
>>>> +                        proc->name.vpid = vpid++;
>>>> +                        
>>>> ORTE_EPOCH_SET(proc->name.epoch,ORTE_EPOCH_INVALID);
>>>> +                        
>>>> ORTE_EPOCH_SET(proc->name.epoch,orte_ess.proc_get_epoch(&proc->name));
>>>> +                        if (ORTE_SUCCESS != (rc = 
>>>> opal_pointer_array_set_item(jdata->procs,
>>>> +                                                                          
>>>>     proc->name.vpid, proc))) {
>>>> +                            ORTE_ERROR_LOG(rc);
>>>> +                            return rc;
>>>>                        }
>>>> +                        cnt++;
>>>> +                        break;  /* move to next node */
>>>>                    }
>>>> -                    proc->name.vpid = vpid;
>>>> -                    ORTE_EPOCH_SET(proc->name.epoch,ORTE_EPOCH_INVALID);
>>>> -                    
>>>> ORTE_EPOCH_SET(proc->name.epoch,orte_ess.proc_get_epoch(&proc->name));
>>>> -                }
>>>> -                if (NULL == opal_pointer_array_get_item(jdata->procs, 
>>>> proc->name.vpid)) {
>>>> -                    if (ORTE_SUCCESS != (rc = 
>>>> opal_pointer_array_set_item(jdata->procs, proc->name.vpid, proc))) {
>>>> -                        ORTE_ERROR_LOG(rc);
>>>> -                        return rc;
>>>> -                    }                    
>>>>                }
>>>>            }
>>>> -        }
>>>> +        } while (cnt < jdata->num_procs);
>>>> +
>>>>        return ORTE_SUCCESS;
>>>>    }
>>>> 
>>>> _______________________________________________
>>>> svn mailing list
>>>> s...@open-mpi.org
>>>> http://www.open-mpi.org/mailman/listinfo.cgi/svn
>>> 
>>> _______________________________________________
>>> devel mailing list
>>> de...@open-mpi.org
>>> http://www.open-mpi.org/mailman/listinfo.cgi/devel
>> 
>> _______________________________________________
>> devel mailing list
>> de...@open-mpi.org
>> http://www.open-mpi.org/mailman/listinfo.cgi/devel
> 
> -- 
> <Mail Attachment.gif>
> Terry D. Dontje | Principal Software Engineer
> Developer Tools Engineering | +1.781.442.2631
> Oracle - Performance Technologies
> 95 Network Drive, Burlington, MA 01803
> Email terry.don...@oracle.com
> 
> 
> 
> _______________________________________________
> devel mailing list
> de...@open-mpi.org
> http://www.open-mpi.org/mailman/listinfo.cgi/devel

Reply via email to