Just to clarify, as this commit message is somewhat misleading: the nested-loop 
problem would cause a failure whenever the system had a specified limit (that 
we had sensed) on the number of files a process could have open, and starting 
another process would have exceeded that limit. It had nothing to do with 
comm_spawn_multiple or any other specific MPI command, which is why it 
has passed MTT for so long.


On Jul 14, 2011, at 2:10 PM, eug...@osl.iu.edu wrote:

> Author: eugene
> Date: 2011-07-14 16:10:48 EDT (Thu, 14 Jul 2011)
> New Revision: 24903
> URL: https://svn.open-mpi.org/trac/ompi/changeset/24903
> 
> Log:
> Clean up the computations of num_procs_alive.  Do some code
> refactoring to improve readability and to compute num_procs_alive
> correctly and to remove the use of loop iteration variables for
> two loops nested one inside another (causing MPI_Comm_spawn_multiple
> to fail).
> 
> 
> Text files modified: 
>   trunk/orte/mca/odls/base/odls_base_default_fns.c |    62 
> ++++++++++++++++++++--------------------
>   1 files changed, 31 insertions(+), 31 deletions(-)
> 
> Modified: trunk/orte/mca/odls/base/odls_base_default_fns.c
> ==============================================================================
> --- trunk/orte/mca/odls/base/odls_base_default_fns.c  (original)
> +++ trunk/orte/mca/odls/base/odls_base_default_fns.c  2011-07-14 16:10:48 EDT 
> (Thu, 14 Jul 2011)
> @@ -9,7 +9,7 @@
>  *                         University of Stuttgart.  All rights reserved.
>  * Copyright (c) 2004-2005 The Regents of the University of California.
>  *                         All rights reserved.
> - * Copyright (c) 2007-2010 Oracle and/or its affiliates.  All rights 
> reserved. 
> + * Copyright (c) 2007-2011 Oracle and/or its affiliates.  All rights 
> reserved. 
>  * Copyright (c) 2011      Oak Ridge National Labs.  All rights reserved.
>  * Copyright (c) 2011      Los Alamos National Security, LLC.
>  *                         All rights reserved.
> @@ -1240,6 +1240,28 @@
>     time_is_up = true;
> }
> 
> +static int compute_num_procs_alive(orte_jobid_t *job)
> +{
> +    opal_list_item_t *item;
> +    orte_odls_child_t *child;
> +    int num_procs_alive = 0, match_job;
> +
> +    for (item  = opal_list_get_first(&orte_local_children);
> +         item != opal_list_get_end  (&orte_local_children);
> +         item  = opal_list_get_next(item)) {
> +        child = (orte_odls_child_t*)item;
> +        if ( NULL != job ) {
> +            match_job = ( OPAL_EQUAL == opal_dss.compare(job, 
> &(child->name->jobid), ORTE_JOBID) );
> +        } else {
> +            match_job = 0;
> +        }
> +        if (child->alive || match_job) {
> +            num_procs_alive++;
> +        }
> +    }
> +    return num_procs_alive;
> +}
> +
> int orte_odls_base_default_launch_local(orte_jobid_t job,
>                                         orte_odls_base_fork_local_proc_fn_t 
> fork_local)
> {
> @@ -1371,16 +1393,7 @@
>         /* compute the number of local procs alive or about to be launched
>          * as part of this job
>          */
> -        num_procs_alive = 0;
> -        for (item = opal_list_get_first(&orte_local_children);
> -             item != opal_list_get_end(&orte_local_children);
> -             item = opal_list_get_next(item)) {
> -            child = (orte_odls_child_t*)item;
> -            if (child->alive ||
> -                OPAL_EQUAL == opal_dss.compare(&job, &(child->name->jobid), 
> ORTE_JOBID)) {
> -                num_procs_alive++;
> -            }
> -        }
> +        num_procs_alive = compute_num_procs_alive(&job);
>         /* get the number of local processors */
>         if (ORTE_SUCCESS != (rc = 
> opal_paffinity_base_get_processor_info(&num_processors))) {
>             /* if we cannot find the number of local processors, we have no 
> choice
> @@ -1409,6 +1422,9 @@
>     /* setup to report the proc state to the HNP */
>     OBJ_CONSTRUCT(&alert, opal_buffer_t);
> 
> +    /* compute the num procs alive */
> +    num_procs_alive = compute_num_procs_alive(NULL);
> +
>     for (j=0; j < jobdat->apps.size; j++) {
>         if (NULL == (app = 
> (orte_app_context_t*)opal_pointer_array_get_item(&jobdat->apps, j))) {
>             continue;
> @@ -1438,15 +1454,7 @@
>                 /* wait */
>                 ORTE_PROGRESSED_WAIT(time_is_up, 0, 1);
>                 /* recompute the num procs alive */
> -                num_procs_alive = 0;
> -                for (item = opal_list_get_first(&orte_local_children);
> -                     item != opal_list_get_end(&orte_local_children);
> -                     item = opal_list_get_next(item)) {
> -                    child = (orte_odls_child_t*)item;
> -                    if (child->alive) {
> -                        num_procs_alive++;
> -                    }
> -                }
> +                num_procs_alive = compute_num_procs_alive(NULL);
>                 /* see if we still have a problem */
>                 limit = num_procs_alive + app->num_procs;
>                 OPAL_OUTPUT_VERBOSE((10,  orte_odls_globals.output,
> @@ -1600,7 +1608,7 @@
>              */
>             if (0 < opal_sys_limits.num_files) {
>                 int limit;
> -                limit = (4*num_procs_alive)+6;
> +                limit = 4*(num_procs_alive + app->num_procs)+6;
>                 OPAL_OUTPUT_VERBOSE((10,  orte_odls_globals.output,
>                                      "%s checking limit on file descriptors 
> %d need %d",
>                                      ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
> @@ -1612,17 +1620,9 @@
>                     /* wait */
>                     ORTE_PROGRESSED_WAIT(time_is_up, 0, 1);
>                     /* recompute the num procs alive */
> -                    num_procs_alive = 0;
> -                    for (item = opal_list_get_first(&orte_local_children);
> -                         item != opal_list_get_end(&orte_local_children);
> -                         item = opal_list_get_next(item)) {
> -                        child = (orte_odls_child_t*)item;
> -                        if (child->alive) {
> -                            num_procs_alive++;
> -                        }
> -                    }
> +                    num_procs_alive = compute_num_procs_alive(NULL);
>                     /* see if we still have a problem */
> -                    limit = (4*num_procs_alive)+6;
> +                    limit = 4*(num_procs_alive + app->num_procs)+6;
>                     OPAL_OUTPUT_VERBOSE((10,  orte_odls_globals.output,
>                                          "%s rechecking limit on file 
> descriptors %d need %d",
>                                          ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
> _______________________________________________
> svn mailing list
> s...@open-mpi.org
> http://www.open-mpi.org/mailman/listinfo.cgi/svn


Reply via email to