Sorry for the delay - the local network was down and I couldn't commit the one-line fix :-(

Turns out that there was a bug in the rsh launcher (the daemons *always* declared a failed launch) that was previously being ignored and was now exposed, resulting in a possible race condition. Fixed now.

Thanks
Ralph

On Jul 26, 2012, at 8:39 AM, TERRY DONTJE wrote:

> Interestingly enough it worked for me for a while and then after many runs I started seeing the below too.
> 
> --td
> 
> On 7/26/2012 11:07 AM, Ralph Castain wrote:
>> 
>> Hmmm...it was working for me, but I'll recheck. Thanks!
>> 
>> On Jul 26, 2012, at 8:04 AM, George Bosilca wrote:
>> 
>>> r26868 seems to have some issues. It works well as long as all processes are started on the same node (aka. there is a single daemon), but it breaks with the error message attached below if there are more than two daemons.
>>> 
>>> $ mpirun -np 2 --bynode ./runme
>>> [node01:07767] [[21341,0],1] ORTE_ERROR_LOG: A message is attempting to be sent to a process whose contact information is unknown in file ../../../../../ompi/orte/mca/rml/oob/rml_oob_send.c at line 362
>>> [node01:07767] [[21341,0],1] attempted to send to [[21341,0],2]: tag 15
>>> [node01:07767] [[21341,0],1] ORTE_ERROR_LOG: A message is attempting to be sent to a process whose contact information is unknown in file ../../../../ompi/orte/mca/grpcomm/base/grpcomm_base_xcast.c at line 157
>>> 
>>> I confirm that applying the reverted commit brings the trunk to a normal state.
>>> 
>>> Please - a tad more care in what gets committed??
>>> 
>>>   george.
>>> 
>>> 
>>> On Jul 25, 2012, at 23:46, svn-commit-mai...@open-mpi.org wrote:
>>> 
>>>> Author: rhc (Ralph Castain)
>>>> Date: 2012-07-25 17:46:45 EDT (Wed, 25 Jul 2012)
>>>> New Revision: 26868
>>>> URL: https://svn.open-mpi.org/trac/ompi/changeset/26868
>>>> 
>>>> Log:
>>>> Reconnect the rsh/ssh error reporting code for remote spawns to report failure to launch. Ensure the HNP correctly reports non-zero exit status when ssh encounters a problem.
>>>> 
>>>> Thanks to Terry for spotting it!
>>>> 
>>>> Text files modified:
>>>>    trunk/orte/mca/plm/base/plm_base_launch_support.c |  44 ++++++++++++++++++++++++++++++++++++++++
>>>>    trunk/orte/mca/plm/base/plm_base_receive.c        |   6 +++++
>>>>    trunk/orte/mca/plm/base/plm_private.h             |   4 +++
>>>>    trunk/orte/mca/plm/rsh/plm_rsh_module.c           |  18 +++++++---------
>>>>    4 files changed, 62 insertions(+), 10 deletions(-)
>>>> 
>>>> Modified: trunk/orte/mca/plm/base/plm_base_launch_support.c
>>>> ==============================================================================
>>>> --- trunk/orte/mca/plm/base/plm_base_launch_support.c  Wed Jul 25 12:32:51 2012        (r26867)
>>>> +++ trunk/orte/mca/plm/base/plm_base_launch_support.c  2012-07-25 17:46:45 EDT (Wed, 25 Jul 2012)  (r26868)
>>>> @@ -741,6 +741,50 @@
>>>>  
>>>>  }
>>>>  
>>>> +void orte_plm_base_daemon_failed(int st, orte_process_name_t* sender,
>>>> +                                 opal_buffer_t *buffer,
>>>> +                                 orte_rml_tag_t tag, void *cbdata)
>>>> +{
>>>> +    int status, rc;
>>>> +    int32_t n;
>>>> +    orte_vpid_t vpid;
>>>> +    orte_proc_t *daemon;
>>>> +
>>>> +    /* get the daemon job, if necessary */
>>>> +    if (NULL == jdatorted) {
>>>> +        jdatorted = orte_get_job_data_object(ORTE_PROC_MY_NAME->jobid);
>>>> +    }
>>>> +
>>>> +    /* unpack the daemon that failed */
>>>> +    n=1;
>>>> +    if (OPAL_SUCCESS != (rc = opal_dss.unpack(buffer, &vpid, &n, ORTE_VPID))) {
>>>> +        ORTE_ERROR_LOG(rc);
>>>> +        ORTE_UPDATE_EXIT_STATUS(ORTE_ERROR_DEFAULT_EXIT_CODE);
>>>> +        goto finish;
>>>> +    }
>>>> +
>>>> +    /* unpack the exit status */
>>>> +    n=1;
>>>> +    if (OPAL_SUCCESS != (rc = opal_dss.unpack(buffer, &status, &n, OPAL_INT))) {
>>>> +        ORTE_ERROR_LOG(rc);
>>>> +        status = ORTE_ERROR_DEFAULT_EXIT_CODE;
>>>> +        ORTE_UPDATE_EXIT_STATUS(ORTE_ERROR_DEFAULT_EXIT_CODE);
>>>> +    } else {
>>>> +        ORTE_UPDATE_EXIT_STATUS(WEXITSTATUS(status));
>>>> +    }
>>>> +
>>>> +    /* find the daemon and update its state/status */
>>>> +    if (NULL == (daemon = (orte_proc_t*)opal_pointer_array_get_item(jdatorted->procs, vpid))) {
>>>> +        ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
>>>> +        goto finish;
>>>> +    }
>>>> +    daemon->state = ORTE_PROC_STATE_FAILED_TO_START;
>>>> +    daemon->exit_code = status;
>>>> +
>>>> + finish:
>>>> +    ORTE_ACTIVATE_PROC_STATE(&daemon->name, ORTE_PROC_STATE_FAILED_TO_START);
>>>> +}
>>>> +
>>>>  int orte_plm_base_setup_orted_cmd(int *argc, char ***argv)
>>>>  {
>>>>      int i, loc;
>>>> 
>>>> Modified: trunk/orte/mca/plm/base/plm_base_receive.c
>>>> ==============================================================================
>>>> --- trunk/orte/mca/plm/base/plm_base_receive.c  Wed Jul 25 12:32:51 2012        (r26867)
>>>> +++ trunk/orte/mca/plm/base/plm_base_receive.c  2012-07-25 17:46:45 EDT (Wed, 25 Jul 2012)  (r26868)
>>>> @@ -87,6 +87,12 @@
>>>>                                                            orte_plm_base_daemon_callback, NULL))) {
>>>>              ORTE_ERROR_LOG(rc);
>>>>          }
>>>> +        if (ORTE_SUCCESS != (rc = orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD,
>>>> +                                                          ORTE_RML_TAG_REPORT_REMOTE_LAUNCH,
>>>> +                                                          ORTE_RML_PERSISTENT,
>>>> +                                                          orte_plm_base_daemon_failed, NULL))) {
>>>> +            ORTE_ERROR_LOG(rc);
>>>> +        }
>>>>      }
>>>>      recv_issued = true;
>>>> 
>>>> 
>>>> Modified: trunk/orte/mca/plm/base/plm_private.h
>>>> ==============================================================================
>>>> --- trunk/orte/mca/plm/base/plm_private.h  Wed Jul 25 12:32:51 2012        (r26867)
>>>> +++ trunk/orte/mca/plm/base/plm_private.h  2012-07-25 17:46:45 EDT (Wed, 25 Jul 2012)  (r26868)
>>>> @@ -78,6 +78,10 @@
>>>>  ORTE_DECLSPEC void orte_plm_base_daemon_callback(int status,
>>>>                                                  orte_process_name_t* sender,
>>>>                                                  opal_buffer_t *buffer,
>>>>                                                  orte_rml_tag_t tag, void *cbdata);
>>>> +ORTE_DECLSPEC void orte_plm_base_daemon_failed(int status, orte_process_name_t* sender,
>>>> +                                               opal_buffer_t *buffer,
>>>> +                                               orte_rml_tag_t tag, void *cbdata);
>>>> +
>>>>  ORTE_DECLSPEC int orte_plm_base_create_jobid(orte_job_t *jdata);
>>>>  ORTE_DECLSPEC int orte_plm_base_set_hnp_name(void);
>>>>  ORTE_DECLSPEC void orte_plm_base_reset_job(orte_job_t *jdata);
>>>> 
>>>> Modified: trunk/orte/mca/plm/rsh/plm_rsh_module.c
>>>> ==============================================================================
>>>> --- trunk/orte/mca/plm/rsh/plm_rsh_module.c  Wed Jul 25 12:32:51 2012        (r26867)
>>>> +++ trunk/orte/mca/plm/rsh/plm_rsh_module.c  2012-07-25 17:46:45 EDT (Wed, 25 Jul 2012)  (r26868)
>>>> @@ -258,8 +258,6 @@
>>>>   */
>>>>  static void rsh_wait_daemon(pid_t pid, int status, void* cbdata)
>>>>  {
>>>> -    orte_std_cntr_t cnt=1;
>>>> -    uint8_t flag;
>>>>      orte_job_t *jdata;
>>>>      orte_plm_rsh_caddy_t *caddy=(orte_plm_rsh_caddy_t*)cbdata;
>>>>      orte_proc_t *daemon=caddy->daemon;
>>>> @@ -283,10 +281,8 @@
>>>>                                  ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
>>>>                                  (int)daemon->name.vpid, WEXITSTATUS(status)));
>>>>             buf = OBJ_NEW(opal_buffer_t);
>>>> -           opal_dss.pack(buf, &cnt, 1, ORTE_STD_CNTR);
>>>> -           flag = 1;
>>>> -           opal_dss.pack(buf, &flag, 1, OPAL_UINT8);
>>>>             opal_dss.pack(buf, &(daemon->name.vpid), 1, ORTE_VPID);
>>>> +           opal_dss.pack(buf, &status, 1, OPAL_INT);
>>>>             orte_rml.send_buffer_nb(ORTE_PROC_MY_HNP, buf,
>>>>                                     ORTE_RML_TAG_REPORT_REMOTE_LAUNCH, 0,
>>>>                                     orte_rml_send_callback, NULL);
>>>> @@ -297,6 +293,8 @@
>>>>                                  "%s daemon %d failed with status %d",
>>>>                                  ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
>>>>                                  (int)daemon->name.vpid, WEXITSTATUS(status)));
>>>> +           /* set the exit status */
>>>> +           ORTE_UPDATE_EXIT_STATUS(WEXITSTATUS(status));
>>>>             /* note that this daemon failed */
>>>>             daemon->state = ORTE_PROC_STATE_FAILED_TO_START;
>>>>             /* increment the #daemons terminated so we will exit properly */
>>>> @@ -735,7 +733,7 @@
>>>>      char **argv = NULL;
>>>>      char *prefix, *hostname, *var;
>>>>      int argc;
>>>> -    int rc;
>>>> +    int rc=ORTE_SUCCESS;
>>>>      bool failed_launch = true;
>>>>      orte_std_cntr_t n;
>>>>      opal_byte_object_t *bo;
>>>> @@ -748,6 +746,9 @@
>>>>                           "%s plm:rsh: remote spawn called",
>>>>                           ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
>>>>  
>>>> +    /* if we hit any errors, tell the HNP it was us */
>>>> +    target.vpid = ORTE_PROC_MY_NAME->vpid;
>>>> +
>>>>      /* extract the prefix from the launch buffer */
>>>>      n = 1;
>>>>      if (ORTE_SUCCESS != (rc = opal_dss.unpack(launch, &prefix, &n, OPAL_STRING))) {
>>>> @@ -867,12 +868,9 @@
>>>>      if (failed_launch) {
>>>>          /* report cannot launch this daemon to HNP */
>>>>          opal_buffer_t *buf;
>>>> -        orte_std_cntr_t cnt=1;
>>>> -        uint8_t flag=1;
>>>>          buf = OBJ_NEW(opal_buffer_t);
>>>> -        opal_dss.pack(buf, &cnt, 1, ORTE_STD_CNTR);
>>>> -        opal_dss.pack(buf, &flag, 1, OPAL_UINT8);
>>>>          opal_dss.pack(buf, &target.vpid, 1, ORTE_VPID);
>>>> +        opal_dss.pack(buf, &rc, 1, OPAL_INT);
>>>>          orte_rml.send_buffer_nb(ORTE_PROC_MY_HNP, buf,
>>>>                                  ORTE_RML_TAG_REPORT_REMOTE_LAUNCH, 0,
>>>>                                  orte_rml_send_callback, NULL);
>>> 
>> 
> 
> -- 
> Terry D. Dontje | Principal Software Engineer
> Developer Tools Engineering | +1.781.442.2631
> Oracle - Performance Technologies
> 95 Network Drive, Burlington, MA 01803
> Email terry.don...@oracle.com
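
For anyone reading the thread without the repository at hand, here is a minimal, self-contained sketch of the class of bug Ralph describes at the top: a launcher whose "failed launch" flag is never cleared on the success path, so a failure report goes to the HNP even when every daemon started fine. This is illustrative only - remote_spawn_buggy, remote_spawn_fixed, and report_failed_launch are made-up stand-ins, not the actual ORTE code, and the thread does not show the real one-line fix.

/*
 * Illustrative only -- NOT the actual ORTE rsh launcher, and not necessarily
 * the exact one-line fix. It demonstrates a "failed_launch" flag that starts
 * out true and is never cleared on the success path, so a failure report is
 * sent even when the launch worked.
 */
#include <stdbool.h>
#include <stdio.h>

/* Hypothetical stand-in for packing the daemon's vpid and exit status and
 * sending them to the HNP on ORTE_RML_TAG_REPORT_REMOTE_LAUNCH. */
static void report_failed_launch(int vpid, int status)
{
    printf("daemon %d reports failed launch (status %d)\n", vpid, status);
}

/* Buggy variant: the success path never clears failed_launch, so the
 * failure report below is sent on every launch, successful or not. */
static void remote_spawn_buggy(int vpid)
{
    bool failed_launch = true;
    int rc = 0;                     /* pretend the ssh child started fine */

    /* ... spawn the child here; this path "forgets" to clear the flag ... */

    if (failed_launch) {            /* always true -> spurious report */
        report_failed_launch(vpid, rc);
    }
}

/* Fixed variant: the success path clears the flag, so the HNP only hears
 * about launches that genuinely failed. */
static void remote_spawn_fixed(int vpid)
{
    bool failed_launch = true;
    int rc = 0;

    /* ... spawn the child here ... */
    failed_launch = false;          /* success path clears the flag */

    if (failed_launch) {
        report_failed_launch(vpid, rc);
    }
}

int main(void)
{
    remote_spawn_buggy(1);          /* prints a spurious failure report */
    remote_spawn_fixed(1);          /* prints nothing */
    return 0;
}

Before r26868 such spurious reports were simply ignored; once the HNP registered a receive for ORTE_RML_TAG_REPORT_REMOTE_LAUNCH (the plm_base_receive.c hunk above), each report marked a daemon as FAILED_TO_START while the launch was still under way, which would explain the "contact information is unknown" race George hit.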