Just to clarify, since this commit message is somewhat misleading: the nested-loop bug would cause a failure whenever the system had a specified limit (one that we had sensed) on the number of files a process could have open, and starting another process would have exceeded that limit. It had nothing to do with comm_spawn_multiple or any other specific MPI command, which is why it has passed MTT for so long.
On Jul 14, 2011, at 2:10 PM, eug...@osl.iu.edu wrote: > Author: eugene > Date: 2011-07-14 16:10:48 EDT (Thu, 14 Jul 2011) > New Revision: 24903 > URL: https://svn.open-mpi.org/trac/ompi/changeset/24903 > > Log: > Clean up the computations of num_procs_alive. Do some code > refactoring to improve readability and to compute num_procs_alive > correctly and to remove the use of loop iteration variables for > two loops nested one inside another (causing MPI_Comm_spawn_multiple > to fail). > > > Text files modified: > trunk/orte/mca/odls/base/odls_base_default_fns.c | 62 > ++++++++++++++++++++-------------------- > 1 files changed, 31 insertions(+), 31 deletions(-) > > Modified: trunk/orte/mca/odls/base/odls_base_default_fns.c > ============================================================================== > --- trunk/orte/mca/odls/base/odls_base_default_fns.c (original) > +++ trunk/orte/mca/odls/base/odls_base_default_fns.c 2011-07-14 16:10:48 EDT > (Thu, 14 Jul 2011) > @@ -9,7 +9,7 @@ > * University of Stuttgart. All rights reserved. > * Copyright (c) 2004-2005 The Regents of the University of California. > * All rights reserved. > - * Copyright (c) 2007-2010 Oracle and/or its affiliates. All rights > reserved. > + * Copyright (c) 2007-2011 Oracle and/or its affiliates. All rights > reserved. > * Copyright (c) 2011 Oak Ridge National Labs. All rights reserved. > * Copyright (c) 2011 Los Alamos National Security, LLC. > * All rights reserved. 
> @@ -1240,6 +1240,28 @@ > time_is_up = true; > } > > +static int compute_num_procs_alive(orte_jobid_t *job) > +{ > + opal_list_item_t *item; > + orte_odls_child_t *child; > + int num_procs_alive = 0, match_job; > + > + for (item = opal_list_get_first(&orte_local_children); > + item != opal_list_get_end (&orte_local_children); > + item = opal_list_get_next(item)) { > + child = (orte_odls_child_t*)item; > + if ( NULL != job ) { > + match_job = ( OPAL_EQUAL == opal_dss.compare(job, > &(child->name->jobid), ORTE_JOBID) ); > + } else { > + match_job = 0; > + } > + if (child->alive || match_job) { > + num_procs_alive++; > + } > + } > + return num_procs_alive; > +} > + > int orte_odls_base_default_launch_local(orte_jobid_t job, > orte_odls_base_fork_local_proc_fn_t > fork_local) > { > @@ -1371,16 +1393,7 @@ > /* compute the number of local procs alive or about to be launched > * as part of this job > */ > - num_procs_alive = 0; > - for (item = opal_list_get_first(&orte_local_children); > - item != opal_list_get_end(&orte_local_children); > - item = opal_list_get_next(item)) { > - child = (orte_odls_child_t*)item; > - if (child->alive || > - OPAL_EQUAL == opal_dss.compare(&job, &(child->name->jobid), > ORTE_JOBID)) { > - num_procs_alive++; > - } > - } > + num_procs_alive = compute_num_procs_alive(&job); > /* get the number of local processors */ > if (ORTE_SUCCESS != (rc = > opal_paffinity_base_get_processor_info(&num_processors))) { > /* if we cannot find the number of local processors, we have no > choice > @@ -1409,6 +1422,9 @@ > /* setup to report the proc state to the HNP */ > OBJ_CONSTRUCT(&alert, opal_buffer_t); > > + /* compute the num procs alive */ > + num_procs_alive = compute_num_procs_alive(NULL); > + > for (j=0; j < jobdat->apps.size; j++) { > if (NULL == (app = > (orte_app_context_t*)opal_pointer_array_get_item(&jobdat->apps, j))) { > continue; > @@ -1438,15 +1454,7 @@ > /* wait */ > ORTE_PROGRESSED_WAIT(time_is_up, 0, 1); > /* recompute the num procs alive 
*/ > - num_procs_alive = 0; > - for (item = opal_list_get_first(&orte_local_children); > - item != opal_list_get_end(&orte_local_children); > - item = opal_list_get_next(item)) { > - child = (orte_odls_child_t*)item; > - if (child->alive) { > - num_procs_alive++; > - } > - } > + num_procs_alive = compute_num_procs_alive(NULL); > /* see if we still have a problem */ > limit = num_procs_alive + app->num_procs; > OPAL_OUTPUT_VERBOSE((10, orte_odls_globals.output, > @@ -1600,7 +1608,7 @@ > */ > if (0 < opal_sys_limits.num_files) { > int limit; > - limit = (4*num_procs_alive)+6; > + limit = 4*(num_procs_alive + app->num_procs)+6; > OPAL_OUTPUT_VERBOSE((10, orte_odls_globals.output, > "%s checking limit on file descriptors > %d need %d", > ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), > @@ -1612,17 +1620,9 @@ > /* wait */ > ORTE_PROGRESSED_WAIT(time_is_up, 0, 1); > /* recompute the num procs alive */ > - num_procs_alive = 0; > - for (item = opal_list_get_first(&orte_local_children); > - item != opal_list_get_end(&orte_local_children); > - item = opal_list_get_next(item)) { > - child = (orte_odls_child_t*)item; > - if (child->alive) { > - num_procs_alive++; > - } > - } > + num_procs_alive = compute_num_procs_alive(NULL); > /* see if we still have a problem */ > - limit = (4*num_procs_alive)+6; > + limit = 4*(num_procs_alive + app->num_procs)+6; > OPAL_OUTPUT_VERBOSE((10, orte_odls_globals.output, > "%s rechecking limit on file > descriptors %d need %d", > ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), > _______________________________________________ > svn mailing list > s...@open-mpi.org > http://www.open-mpi.org/mailman/listinfo.cgi/svn