Then I guess you will be happy to learn that, instead of eating your multi-word arguments, we now pass them to your srun as expected.

  george.

On Jun 24, 2009, at 16:18 , Jeff Squyres wrote:

As a non-rsh'er (I run all my jobs in SLURM), this is very important to me.

Please revert.



On Jun 24, 2009, at 4:15 PM, Ralph Castain wrote:

Yo George

This commit is going to break non-rsh launchers. While it is true that the rsh launcher may handle multi-word options by putting them in quotes, we specifically avoided it here because it breaks SLURM, Torque, and others.

This is why we specifically put the inclusion of multi-word options in the rsh plm module, and not here. Would you please move it back there?

Thanks
Ralph


On Wed, Jun 24, 2009 at 1:51 PM, <bosi...@osl.iu.edu> wrote:
Author: bosilca
Date: 2009-06-24 15:51:52 EDT (Wed, 24 Jun 2009)
New Revision: 21513
URL: https://svn.open-mpi.org/trac/ompi/changeset/21513

Log:
When we get a report from an orted about its state, don't use the sender of the message to update the structures, but instead use the information from the URI. The reason is that even the launch report messages can get routed.

Deal with the orted_cmd_line in a single location.

Text files modified:
trunk/orte/mca/plm/base/plm_base_launch_support.c | 69 +++++++++++++++++++++++----------------
 1 files changed, 41 insertions(+), 28 deletions(-)

Modified: trunk/orte/mca/plm/base/plm_base_launch_support.c
===================================================================
--- trunk/orte/mca/plm/base/plm_base_launch_support.c   (original)
+++ trunk/orte/mca/plm/base/plm_base_launch_support.c 2009-06-24 15:51:52 EDT (Wed, 24 Jun 2009)
@@ -433,7 +433,8 @@
{
   orte_message_event_t *mev = (orte_message_event_t*)data;
   opal_buffer_t *buffer = mev->buffer;
-    char *rml_uri;
+    orte_process_name_t peer;
+    char *rml_uri = NULL;
   int rc, idx;
   int32_t arch;
   orte_node_t **nodes;
@@ -442,19 +443,11 @@
   int64_t setupsec, setupusec;
   int64_t startsec, startusec;

-    OPAL_OUTPUT_VERBOSE((5, orte_plm_globals.output,
- "%s plm:base:orted_report_launch from daemon %s",
-                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
-                         ORTE_NAME_PRINT(&mev->sender)));
-
   /* see if we need to timestamp this receipt */
   if (orte_timing) {
       gettimeofday(&recvtime, NULL);
   }

-    /* update state */
-    pdatorted[mev->sender.vpid]->state = ORTE_PROC_STATE_RUNNING;
-
   /* unpack its contact info */
   idx = 1;
if (ORTE_SUCCESS != (rc = opal_dss.unpack(buffer, &rml_uri, &idx, OPAL_STRING))) {
@@ -466,13 +459,26 @@
   /* set the contact info into the hash table */
   if (ORTE_SUCCESS != (rc = orte_rml.set_contact_info(rml_uri))) {
       ORTE_ERROR_LOG(rc);
-        free(rml_uri);
       orted_failed_launch = true;
       goto CLEANUP;
   }
-    /* lookup and record this daemon's contact info */
-    pdatorted[mev->sender.vpid]->rml_uri = strdup(rml_uri);
-    free(rml_uri);
+
+    rc = orte_rml_base_parse_uris(rml_uri, &peer, NULL );
+    if( ORTE_SUCCESS != rc ) {
+        ORTE_ERROR_LOG(rc);
+        orted_failed_launch = true;
+        goto CLEANUP;
+    }
+
+    OPAL_OUTPUT_VERBOSE((5, orte_plm_globals.output,
+ "%s plm:base:orted_report_launch from daemon %s via %s",
+                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
+                         ORTE_NAME_PRINT(&peer),
+                         ORTE_NAME_PRINT(&mev->sender)));
+
+    /* update state and record for this daemon contact info */
+    pdatorted[peer.vpid]->state = ORTE_PROC_STATE_RUNNING;
+    pdatorted[peer.vpid]->rml_uri = rml_uri;

   /* get the remote arch */
   idx = 1;
@@ -555,31 +561,33 @@

   /* lookup the node */
   nodes = (orte_node_t**)orte_node_pool->addr;
-    if (NULL == nodes[mev->sender.vpid]) {
+    if (NULL == nodes[peer.vpid]) {
       ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
       orted_failed_launch = true;
       goto CLEANUP;
   }
   /* store the arch */
-    nodes[mev->sender.vpid]->arch = arch;
+    nodes[peer.vpid]->arch = arch;

   /* if a tree-launch is underway, send the cmd back */
   if (NULL != orte_tree_launch_cmd) {
-        orte_rml.send_buffer(&mev->sender, orte_tree_launch_cmd, ORTE_RML_TAG_DAEMON, 0);
+        orte_rml.send_buffer(&peer, orte_tree_launch_cmd, ORTE_RML_TAG_DAEMON, 0);
   }

CLEANUP:

   OPAL_OUTPUT_VERBOSE((5, orte_plm_globals.output,
-                         "%s plm:base:orted_report_launch %s for daemon %s at contact %s",
+                         "%s plm:base:orted_report_launch %s for daemon %s (via %s) at contact %s",
                        ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                        orted_failed_launch ? "failed" : "completed",
- ORTE_NAME_PRINT(&mev->sender), pdatorted[mev->sender.vpid]->rml_uri));
+                         ORTE_NAME_PRINT(&peer),
+ ORTE_NAME_PRINT(&mev->sender), pdatorted[peer.vpid]->rml_uri));

   /* release the message */
   OBJ_RELEASE(mev);

   if (orted_failed_launch) {
+        if( NULL != rml_uri ) free(rml_uri);
orte_errmgr.incomplete_start(ORTE_PROC_MY_NAME->jobid, ORTE_ERROR_DEFAULT_EXIT_CODE);
   } else {
       orted_num_callback++;
@@ -1133,18 +1141,23 @@
    * being sure to "purge" any that would cause problems
    * on backend nodes
    */
-    if (ORTE_PROC_IS_HNP) {
+    if (ORTE_PROC_IS_HNP || ORTE_PROC_IS_DAEMON) {
       cnt = opal_argv_count(orted_cmd_line);
       for (i=0; i < cnt; i+=3) {
-            /* if the specified option is more than one word, we don't
-             * have a generic way of passing it as some environments ignore
-             * any quotes we add, while others don't - so we ignore any
-             * such options. In most cases, this won't be a problem as
-             * they typically only apply to things of interest to the HNP.
-             * Individual environments can add these back into the cmd line
-             * as they know if it can be supported
-             */
-            if (NULL != strchr(orted_cmd_line[i+2], ' ')) {
+             /* in the rsh environment, we can append multi-word arguments
+              * by enclosing them in quotes. Check for any multi-word
+              * mca params passed to mpirun and include them
+              */
+             if (NULL != strchr(orted_cmd_line[i+2], ' ')) {
+                char* param;
+
+                /* must add quotes around it */
+                asprintf(&param, "\"%s\"", orted_cmd_line[i+2]);
+                /* now pass it along */
+                opal_argv_append(argc, argv, orted_cmd_line[i]);
+                opal_argv_append(argc, argv, orted_cmd_line[i+1]);
+                opal_argv_append(argc, argv, param);
+                free(param);
               continue;
           }
           /* The daemon will attempt to open the PLM on the remote
_______________________________________________
svn mailing list
s...@open-mpi.org
http://www.open-mpi.org/mailman/listinfo.cgi/svn

_______________________________________________
devel mailing list
de...@open-mpi.org
http://www.open-mpi.org/mailman/listinfo.cgi/devel


--
Jeff Squyres
Cisco Systems

_______________________________________________
devel mailing list
de...@open-mpi.org
http://www.open-mpi.org/mailman/listinfo.cgi/devel

Reply via email to