Author: rhc (Ralph Castain)
Date: 2012-07-25 17:46:45 EDT (Wed, 25 Jul 2012)
New Revision: 26868
URL: https://svn.open-mpi.org/trac/ompi/changeset/26868
Log:
Reconnect the rsh/ssh error reporting code for remote spawns to report failure
to launch. Ensure the HNP correctly reports non-zero exit status when ssh
encounters a problem.
Thanks to Terry for spotting it!
Text files modified:
trunk/orte/mca/plm/base/plm_base_launch_support.c | 44 ++++++++++++++++++++++++++++++++++++++++
trunk/orte/mca/plm/base/plm_base_receive.c | 6 +++++
trunk/orte/mca/plm/base/plm_private.h | 4 +++
trunk/orte/mca/plm/rsh/plm_rsh_module.c | 18 +++++++---------
4 files changed, 62 insertions(+), 10 deletions(-)
Modified: trunk/orte/mca/plm/base/plm_base_launch_support.c
==============================================================================
--- trunk/orte/mca/plm/base/plm_base_launch_support.c Wed Jul 25 12:32:51 2012 (r26867)
+++ trunk/orte/mca/plm/base/plm_base_launch_support.c 2012-07-25 17:46:45 EDT (Wed, 25 Jul 2012) (r26868)
@@ -741,6 +741,50 @@
}
+void orte_plm_base_daemon_failed(int st, orte_process_name_t* sender,
+ opal_buffer_t *buffer,
+ orte_rml_tag_t tag, void *cbdata)
+{
+ int status, rc;
+ int32_t n;
+ orte_vpid_t vpid;
+ orte_proc_t *daemon;
+
+ /* get the daemon job, if necessary */
+ if (NULL == jdatorted) {
+ jdatorted = orte_get_job_data_object(ORTE_PROC_MY_NAME->jobid);
+ }
+
+ /* unpack the daemon that failed */
+ n=1;
+ if (OPAL_SUCCESS != (rc = opal_dss.unpack(buffer,&vpid,&n, ORTE_VPID))) {
+ ORTE_ERROR_LOG(rc);
+ ORTE_UPDATE_EXIT_STATUS(ORTE_ERROR_DEFAULT_EXIT_CODE);
+ goto finish;
+ }
+
+ /* unpack the exit status */
+ n=1;
+ if (OPAL_SUCCESS != (rc = opal_dss.unpack(buffer,&status,&n, OPAL_INT))) {
+ ORTE_ERROR_LOG(rc);
+ status = ORTE_ERROR_DEFAULT_EXIT_CODE;
+ ORTE_UPDATE_EXIT_STATUS(ORTE_ERROR_DEFAULT_EXIT_CODE);
+ } else {
+ ORTE_UPDATE_EXIT_STATUS(WEXITSTATUS(status));
+ }
+
+ /* find the daemon and update its state/status */
+ if (NULL == (daemon = (orte_proc_t*)opal_pointer_array_get_item(jdatorted->procs, vpid))) {
+ ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
+ goto finish;
+ }
+ daemon->state = ORTE_PROC_STATE_FAILED_TO_START;
+ daemon->exit_code = status;
+
+ finish:
+ ORTE_ACTIVATE_PROC_STATE(&daemon->name, ORTE_PROC_STATE_FAILED_TO_START);
+}
+
int orte_plm_base_setup_orted_cmd(int *argc, char ***argv)
{
int i, loc;
Modified: trunk/orte/mca/plm/base/plm_base_receive.c
==============================================================================
--- trunk/orte/mca/plm/base/plm_base_receive.c Wed Jul 25 12:32:51 2012 (r26867)
+++ trunk/orte/mca/plm/base/plm_base_receive.c 2012-07-25 17:46:45 EDT (Wed, 25 Jul 2012) (r26868)
@@ -87,6 +87,12 @@
orte_plm_base_daemon_callback, NULL))) {
ORTE_ERROR_LOG(rc);
}
+ if (ORTE_SUCCESS != (rc = orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD,
+ ORTE_RML_TAG_REPORT_REMOTE_LAUNCH,
+ ORTE_RML_PERSISTENT,
+ orte_plm_base_daemon_failed, NULL))) {
+ ORTE_ERROR_LOG(rc);
+ }
}
recv_issued = true;
Modified: trunk/orte/mca/plm/base/plm_private.h
==============================================================================
--- trunk/orte/mca/plm/base/plm_private.h Wed Jul 25 12:32:51 2012 (r26867)
+++ trunk/orte/mca/plm/base/plm_private.h 2012-07-25 17:46:45 EDT (Wed, 25 Jul 2012) (r26868)
@@ -78,6 +78,10 @@
ORTE_DECLSPEC void orte_plm_base_daemon_callback(int status, orte_process_name_t* sender,
opal_buffer_t *buffer,
orte_rml_tag_t tag, void *cbdata);
+ORTE_DECLSPEC void orte_plm_base_daemon_failed(int status, orte_process_name_t* sender,
+ opal_buffer_t *buffer,
+ orte_rml_tag_t tag, void *cbdata);
+
ORTE_DECLSPEC int orte_plm_base_create_jobid(orte_job_t *jdata);
ORTE_DECLSPEC int orte_plm_base_set_hnp_name(void);
ORTE_DECLSPEC void orte_plm_base_reset_job(orte_job_t *jdata);
Modified: trunk/orte/mca/plm/rsh/plm_rsh_module.c
==============================================================================
--- trunk/orte/mca/plm/rsh/plm_rsh_module.c Wed Jul 25 12:32:51 2012 (r26867)
+++ trunk/orte/mca/plm/rsh/plm_rsh_module.c 2012-07-25 17:46:45 EDT (Wed, 25 Jul 2012) (r26868)
@@ -258,8 +258,6 @@
*/
static void rsh_wait_daemon(pid_t pid, int status, void* cbdata)
{
- orte_std_cntr_t cnt=1;
- uint8_t flag;
orte_job_t *jdata;
orte_plm_rsh_caddy_t *caddy=(orte_plm_rsh_caddy_t*)cbdata;
orte_proc_t *daemon=caddy->daemon;
@@ -283,10 +281,8 @@
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
(int)daemon->name.vpid, WEXITSTATUS(status)));
buf = OBJ_NEW(opal_buffer_t);
- opal_dss.pack(buf,&cnt, 1, ORTE_STD_CNTR);
- flag = 1;
- opal_dss.pack(buf,&flag, 1, OPAL_UINT8);
opal_dss.pack(buf,&(daemon->name.vpid), 1, ORTE_VPID);
+ opal_dss.pack(buf,&status, 1, OPAL_INT);
orte_rml.send_buffer_nb(ORTE_PROC_MY_HNP, buf,
ORTE_RML_TAG_REPORT_REMOTE_LAUNCH, 0,
orte_rml_send_callback, NULL);
@@ -297,6 +293,8 @@
"%s daemon %d failed with status %d",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
(int)daemon->name.vpid, WEXITSTATUS(status)));
+ /* set the exit status */
+ ORTE_UPDATE_EXIT_STATUS(WEXITSTATUS(status));
/* note that this daemon failed */
daemon->state = ORTE_PROC_STATE_FAILED_TO_START;
/* increment the #daemons terminated so we will exit properly */
@@ -735,7 +733,7 @@
char **argv = NULL;
char *prefix, *hostname, *var;
int argc;
- int rc;
+ int rc=ORTE_SUCCESS;
bool failed_launch = true;
orte_std_cntr_t n;
opal_byte_object_t *bo;
@@ -748,6 +746,9 @@
"%s plm:rsh: remote spawn called",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
+ /* if we hit any errors, tell the HNP it was us */
+ target.vpid = ORTE_PROC_MY_NAME->vpid;
+
/* extract the prefix from the launch buffer */
n = 1;
if (ORTE_SUCCESS != (rc = opal_dss.unpack(launch,&prefix,&n, OPAL_STRING))) {
@@ -867,12 +868,9 @@
if (failed_launch) {
/* report cannot launch this daemon to HNP */
opal_buffer_t *buf;
- orte_std_cntr_t cnt=1;
- uint8_t flag=1;
buf = OBJ_NEW(opal_buffer_t);
- opal_dss.pack(buf,&cnt, 1, ORTE_STD_CNTR);
- opal_dss.pack(buf,&flag, 1, OPAL_UINT8);
opal_dss.pack(buf,&target.vpid, 1, ORTE_VPID);
+ opal_dss.pack(buf,&rc, 1, OPAL_INT);
orte_rml.send_buffer_nb(ORTE_PROC_MY_HNP, buf,
ORTE_RML_TAG_REPORT_REMOTE_LAUNCH, 0,
orte_rml_send_callback, NULL);
_______________________________________________
svn mailing list
s...@open-mpi.org
http://www.open-mpi.org/mailman/listinfo.cgi/svn