Interestingly enough, it worked for me for a while, and then after many runs I started seeing the error below too.

--td

On 7/26/2012 11:07 AM, Ralph Castain wrote:
Hmmm...it was working for me, but I'll recheck. Thanks!

On Jul 26, 2012, at 8:04 AM, George Bosilca wrote:

r26868 seems to have some issues. It works well as long as all processes are 
started on the same node (i.e., there is a single daemon), but it breaks with 
the error message attached below if there are more than two daemons.

$ mpirun -np 2 --bynode ./runme
[node01:07767] [[21341,0],1] ORTE_ERROR_LOG: A message is attempting to be sent 
to a process whose contact information is unknown in file 
../../../../../ompi/orte/mca/rml/oob/rml_oob_send.c at line 362
[node01:07767] [[21341,0],1] attempted to send to [[21341,0],2]: tag 15
[node01:07767] [[21341,0],1] ORTE_ERROR_LOG: A message is attempting to be sent 
to a process whose contact information is unknown in file 
../../../../ompi/orte/mca/grpcomm/base/grpcomm_base_xcast.c at line 157

I confirm that reverting the commit brings the trunk back to a normal state.

Please - a tad more care in what gets committed??

  george.


On Jul 25, 2012, at 23:46 , svn-commit-mai...@open-mpi.org wrote:

Author: rhc (Ralph Castain)
Date: 2012-07-25 17:46:45 EDT (Wed, 25 Jul 2012)
New Revision: 26868
URL: https://svn.open-mpi.org/trac/ompi/changeset/26868

Log:
Reconnect the rsh/ssh error reporting code for remote spawns to report failure 
to launch. Ensure the HNP correctly reports non-zero exit status when ssh 
encounters a problem.

Thanks to Terry for spotting it!

Text files modified:
  trunk/orte/mca/plm/base/plm_base_launch_support.c |    44 
++++++++++++++++++++++++++++++++++++++++
  trunk/orte/mca/plm/base/plm_base_receive.c        |     6 +++++
  trunk/orte/mca/plm/base/plm_private.h             |     4 +++
  trunk/orte/mca/plm/rsh/plm_rsh_module.c           |    18 +++++++---------
  4 files changed, 62 insertions(+), 10 deletions(-)

Modified: trunk/orte/mca/plm/base/plm_base_launch_support.c
==============================================================================
--- trunk/orte/mca/plm/base/plm_base_launch_support.c   Wed Jul 25 12:32:51 
2012        (r26867)
+++ trunk/orte/mca/plm/base/plm_base_launch_support.c   2012-07-25 17:46:45 EDT 
(Wed, 25 Jul 2012)      (r26868)
@@ -741,6 +741,50 @@

}

+void orte_plm_base_daemon_failed(int st, orte_process_name_t* sender,
+                                 opal_buffer_t *buffer,
+                                 orte_rml_tag_t tag, void *cbdata)
+{
+    int status, rc;
+    int32_t n;
+    orte_vpid_t vpid;
+    orte_proc_t *daemon;
+
+    /* get the daemon job, if necessary */
+    if (NULL == jdatorted) {
+        jdatorted = orte_get_job_data_object(ORTE_PROC_MY_NAME->jobid);
+    }
+
+    /* unpack the daemon that failed */
+    n=1;
+    if (OPAL_SUCCESS != (rc = opal_dss.unpack(buffer,&vpid,&n, ORTE_VPID))) {
+        ORTE_ERROR_LOG(rc);
+        ORTE_UPDATE_EXIT_STATUS(ORTE_ERROR_DEFAULT_EXIT_CODE);
+        goto finish;
+    }
+
+    /* unpack the exit status */
+    n=1;
+    if (OPAL_SUCCESS != (rc = opal_dss.unpack(buffer,&status,&n, OPAL_INT))) {
+        ORTE_ERROR_LOG(rc);
+        status = ORTE_ERROR_DEFAULT_EXIT_CODE;
+        ORTE_UPDATE_EXIT_STATUS(ORTE_ERROR_DEFAULT_EXIT_CODE);
+    } else {
+        ORTE_UPDATE_EXIT_STATUS(WEXITSTATUS(status));
+    }
+
+    /* find the daemon and update its state/status */
+    if (NULL == (daemon = 
(orte_proc_t*)opal_pointer_array_get_item(jdatorted->procs, vpid))) {
+        ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
+        goto finish;
+    }
+    daemon->state = ORTE_PROC_STATE_FAILED_TO_START;
+    daemon->exit_code = status;
+
+ finish:
+    ORTE_ACTIVATE_PROC_STATE(&daemon->name, ORTE_PROC_STATE_FAILED_TO_START);
+}
+
int orte_plm_base_setup_orted_cmd(int *argc, char ***argv)
{
    int i, loc;

Modified: trunk/orte/mca/plm/base/plm_base_receive.c
==============================================================================
--- trunk/orte/mca/plm/base/plm_base_receive.c  Wed Jul 25 12:32:51 2012        
(r26867)
+++ trunk/orte/mca/plm/base/plm_base_receive.c  2012-07-25 17:46:45 EDT (Wed, 
25 Jul 2012)      (r26868)
@@ -87,6 +87,12 @@
                                                          
orte_plm_base_daemon_callback, NULL))) {
            ORTE_ERROR_LOG(rc);
        }
+        if (ORTE_SUCCESS != (rc = orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD,
+                                                          
ORTE_RML_TAG_REPORT_REMOTE_LAUNCH,
+                                                          ORTE_RML_PERSISTENT,
+                                                          
orte_plm_base_daemon_failed, NULL))) {
+            ORTE_ERROR_LOG(rc);
+        }
    }
    recv_issued = true;


Modified: trunk/orte/mca/plm/base/plm_private.h
==============================================================================
--- trunk/orte/mca/plm/base/plm_private.h       Wed Jul 25 12:32:51 2012        
(r26867)
+++ trunk/orte/mca/plm/base/plm_private.h       2012-07-25 17:46:45 EDT (Wed, 
25 Jul 2012)      (r26868)
@@ -78,6 +78,10 @@
ORTE_DECLSPEC void orte_plm_base_daemon_callback(int status, 
orte_process_name_t* sender,
                                                 opal_buffer_t *buffer,
                                                 orte_rml_tag_t tag, void 
*cbdata);
+ORTE_DECLSPEC void orte_plm_base_daemon_failed(int status, 
orte_process_name_t* sender,
+                                               opal_buffer_t *buffer,
+                                               orte_rml_tag_t tag, void 
*cbdata);
+
ORTE_DECLSPEC int orte_plm_base_create_jobid(orte_job_t *jdata);
ORTE_DECLSPEC int orte_plm_base_set_hnp_name(void);
ORTE_DECLSPEC void orte_plm_base_reset_job(orte_job_t *jdata);

Modified: trunk/orte/mca/plm/rsh/plm_rsh_module.c
==============================================================================
--- trunk/orte/mca/plm/rsh/plm_rsh_module.c     Wed Jul 25 12:32:51 2012        
(r26867)
+++ trunk/orte/mca/plm/rsh/plm_rsh_module.c     2012-07-25 17:46:45 EDT (Wed, 
25 Jul 2012)      (r26868)
@@ -258,8 +258,6 @@
*/
static void rsh_wait_daemon(pid_t pid, int status, void* cbdata)
{
-    orte_std_cntr_t cnt=1;
-    uint8_t flag;
    orte_job_t *jdata;
    orte_plm_rsh_caddy_t *caddy=(orte_plm_rsh_caddy_t*)cbdata;
    orte_proc_t *daemon=caddy->daemon;
@@ -283,10 +281,8 @@
                                 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                 (int)daemon->name.vpid, WEXITSTATUS(status)));
            buf = OBJ_NEW(opal_buffer_t);
-            opal_dss.pack(buf,&cnt, 1, ORTE_STD_CNTR);
-            flag = 1;
-            opal_dss.pack(buf,&flag, 1, OPAL_UINT8);
            opal_dss.pack(buf,&(daemon->name.vpid), 1, ORTE_VPID);
+            opal_dss.pack(buf,&status, 1, OPAL_INT);
            orte_rml.send_buffer_nb(ORTE_PROC_MY_HNP, buf,
                                    ORTE_RML_TAG_REPORT_REMOTE_LAUNCH, 0,
                                    orte_rml_send_callback, NULL);
@@ -297,6 +293,8 @@
                                 "%s daemon %d failed with status %d",
                                 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                 (int)daemon->name.vpid, WEXITSTATUS(status)));
+            /* set the exit status */
+            ORTE_UPDATE_EXIT_STATUS(WEXITSTATUS(status));
            /* note that this daemon failed */
            daemon->state = ORTE_PROC_STATE_FAILED_TO_START;
            /* increment the #daemons terminated so we will exit properly */
@@ -735,7 +733,7 @@
    char **argv = NULL;
    char *prefix, *hostname, *var;
    int argc;
-    int rc;
+    int rc=ORTE_SUCCESS;
    bool failed_launch = true;
    orte_std_cntr_t n;
    opal_byte_object_t *bo;
@@ -748,6 +746,9 @@
                         "%s plm:rsh: remote spawn called",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));

+    /* if we hit any errors, tell the HNP it was us */
+    target.vpid = ORTE_PROC_MY_NAME->vpid;
+
    /* extract the prefix from the launch buffer */
    n = 1;
    if (ORTE_SUCCESS != (rc = opal_dss.unpack(launch,&prefix,&n, OPAL_STRING))) 
{
@@ -867,12 +868,9 @@
    if (failed_launch) {
        /* report cannot launch this daemon to HNP */
        opal_buffer_t *buf;
-        orte_std_cntr_t cnt=1;
-        uint8_t flag=1;
        buf = OBJ_NEW(opal_buffer_t);
-        opal_dss.pack(buf,&cnt, 1, ORTE_STD_CNTR);
-        opal_dss.pack(buf,&flag, 1, OPAL_UINT8);
        opal_dss.pack(buf,&target.vpid, 1, ORTE_VPID);
+        opal_dss.pack(buf,&rc, 1, OPAL_INT);
        orte_rml.send_buffer_nb(ORTE_PROC_MY_HNP, buf,
                                ORTE_RML_TAG_REPORT_REMOTE_LAUNCH, 0,
                                orte_rml_send_callback, NULL);
_______________________________________________
svn mailing list
s...@open-mpi.org
http://www.open-mpi.org/mailman/listinfo.cgi/svn

_______________________________________________
devel mailing list
de...@open-mpi.org
http://www.open-mpi.org/mailman/listinfo.cgi/devel

_______________________________________________
devel mailing list
de...@open-mpi.org
http://www.open-mpi.org/mailman/listinfo.cgi/devel

--
Terry D. Dontje | Principal Software Engineer
Developer Tools Engineering | +1.781.442.2631
Oracle *- Performance Technologies*
95 Network Drive, Burlington, MA 01803
Email terry.don...@oracle.com <mailto:terry.don...@oracle.com>



Reply via email to