I can't guarantee this for all PLMs, but I can confirm that rsh and
slurm (1.3.12) work well with this.
We tried with and without Open MPI, and the outcome is the same.
[bosilca@dancer c]$ srun -n 4 echo "1 2 3 4 5 it works"
1 2 3 4 5 it works
1 2 3 4 5 it works
1 2 3 4 5 it works
1 2 3 4 5 it works
[bosilca@dancer c]$ srun -N 2 -c 2 mpirun --mca plm slurm --mca
orte_launch_agent "orted -s" --mca plm_rsh_tree_spawn 1 --bynode --mca
pml ob1 --mca orte_daemon_spin 0 ./hello
Hello, world, I am 0 of 2 on node03
Hello, world, I am 1 of 2 on node04
*after releasing the orted from their spin.
In fact, what I find strange is the old behavior. Dropping arguments
without even letting the user know about it is certainly not a
desirable approach.
george.
On Jun 24, 2009, at 16:15 , Ralph Castain wrote:
> Yo George
>
> This commit is going to break non-rsh launchers. While it is true
> that the rsh launcher may handle multi-word options by putting them
> in quotes, we specifically avoided it here because it breaks SLURM,
> Torque, and others.
>
> This is why we specifically put the inclusion of multi-word options
> in the rsh plm module, and not here. Would you please move it back
> there?
>
> Thanks
> Ralph
>
>
> On Wed, Jun 24, 2009 at 1:51 PM, <bosi...@osl.iu.edu> wrote:
> Author: bosilca
> Date: 2009-06-24 15:51:52 EDT (Wed, 24 Jun 2009)
> New Revision: 21513
> URL: https://svn.open-mpi.org/trac/ompi/changeset/21513
>
> Log:
> When we get a report from an orted about its state, don't use the
> sender of
> the message to update the structures, but instead use the
> information from
> the URI. The reason is that even the launch report messages can get
> routed.
>
> Deal with the orted_cmd_line in a single location.
>
> Text files modified:
> trunk/orte/mca/plm/base/plm_base_launch_support.c | 69 +++++++
++
> ++++++++++++++----------------
> 1 files changed, 41 insertions(+), 28 deletions(-)
>
> Modified: trunk/orte/mca/plm/base/plm_base_launch_support.c
> =
> =
> =
> =
> =
> =
> =
> =
>
======================================================================
> --- trunk/orte/mca/plm/base/plm_base_launch_support.c (original)
> +++ trunk/orte/mca/plm/base/plm_base_launch_support.c 2009-06-24
> 15:51:52 EDT (Wed, 24 Jun 2009)
> @@ -433,7 +433,8 @@
> {
> orte_message_event_t *mev = (orte_message_event_t*)data;
> opal_buffer_t *buffer = mev->buffer;
> - char *rml_uri;
> + orte_process_name_t peer;
> + char *rml_uri = NULL;
> int rc, idx;
> int32_t arch;
> orte_node_t **nodes;
> @@ -442,19 +443,11 @@
> int64_t setupsec, setupusec;
> int64_t startsec, startusec;
>
> - OPAL_OUTPUT_VERBOSE((5, orte_plm_globals.output,
> - "%s plm:base:orted_report_launch from
> daemon %s",
> - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
> - ORTE_NAME_PRINT(&mev->sender)));
> -
> /* see if we need to timestamp this receipt */
> if (orte_timing) {
> gettimeofday(&recvtime, NULL);
> }
>
> - /* update state */
> - pdatorted[mev->sender.vpid]->state = ORTE_PROC_STATE_RUNNING;
> -
> /* unpack its contact info */
> idx = 1;
> if (ORTE_SUCCESS != (rc = opal_dss.unpack(buffer, &rml_uri,
> &idx, OPAL_STRING))) {
> @@ -466,13 +459,26 @@
> /* set the contact info into the hash table */
> if (ORTE_SUCCESS != (rc = orte_rml.set_contact_info(rml_uri))) {
> ORTE_ERROR_LOG(rc);
> - free(rml_uri);
> orted_failed_launch = true;
> goto CLEANUP;
> }
> - /* lookup and record this daemon's contact info */
> - pdatorted[mev->sender.vpid]->rml_uri = strdup(rml_uri);
> - free(rml_uri);
> +
> + rc = orte_rml_base_parse_uris(rml_uri, &peer, NULL );
> + if( ORTE_SUCCESS != rc ) {
> + ORTE_ERROR_LOG(rc);
> + orted_failed_launch = true;
> + goto CLEANUP;
> + }
> +
> + OPAL_OUTPUT_VERBOSE((5, orte_plm_globals.output,
> + "%s plm:base:orted_report_launch from
> daemon %s via %s",
> + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
> + ORTE_NAME_PRINT(&peer),
> + ORTE_NAME_PRINT(&mev->sender)));
> +
> + /* update state and record for this daemon contact info */
> + pdatorted[peer.vpid]->state = ORTE_PROC_STATE_RUNNING;
> + pdatorted[peer.vpid]->rml_uri = rml_uri;
>
> /* get the remote arch */
> idx = 1;
> @@ -555,31 +561,33 @@
>
> /* lookup the node */
> nodes = (orte_node_t**)orte_node_pool->addr;
> - if (NULL == nodes[mev->sender.vpid]) {
> + if (NULL == nodes[peer.vpid]) {
> ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
> orted_failed_launch = true;
> goto CLEANUP;
> }
> /* store the arch */
> - nodes[mev->sender.vpid]->arch = arch;
> + nodes[peer.vpid]->arch = arch;
>
> /* if a tree-launch is underway, send the cmd back */
> if (NULL != orte_tree_launch_cmd) {
> - orte_rml.send_buffer(&mev->sender, orte_tree_launch_cmd,
> ORTE_RML_TAG_DAEMON, 0);
> + orte_rml.send_buffer(&peer, orte_tree_launch_cmd,
> ORTE_RML_TAG_DAEMON, 0);
> }
>
> CLEANUP:
>
> OPAL_OUTPUT_VERBOSE((5, orte_plm_globals.output,
> - "%s plm:base:orted_report_launch %s for
> daemon %s at contact %s",
> + "%s plm:base:orted_report_launch %s for
> daemon %s (via %s) at contact %s",
> ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
> orted_failed_launch ? "failed" :
"completed",
> - ORTE_NAME_PRINT(&mev->sender),
> pdatorted[mev->sender.vpid]->rml_uri));
> + ORTE_NAME_PRINT(&peer),
> + ORTE_NAME_PRINT(&mev->sender),
> pdatorted[peer.vpid]->rml_uri));
>
> /* release the message */
> OBJ_RELEASE(mev);
>
> if (orted_failed_launch) {
> + if( NULL != rml_uri ) free(rml_uri);
> orte_errmgr.incomplete_start(ORTE_PROC_MY_NAME->jobid,
> ORTE_ERROR_DEFAULT_EXIT_CODE);
> } else {
> orted_num_callback++;
> @@ -1133,18 +1141,23 @@
> * being sure to "purge" any that would cause problems
> * on backend nodes
> */
> - if (ORTE_PROC_IS_HNP) {
> + if (ORTE_PROC_IS_HNP || ORTE_PROC_IS_DAEMON) {
> cnt = opal_argv_count(orted_cmd_line);
> for (i=0; i < cnt; i+=3) {
> - /* if the specified option is more than one word, we
> don't
> - * have a generic way of passing it as some
> environments ignore
> - * any quotes we add, while others don't - so we ignore
> any
> - * such options. In most cases, this won't be a problem
> as
> - * they typically only apply to things of interest to
> the HNP.
> - * Individual environments can add these back into the
> cmd line
> - * as they know if it can be supported
> - */
> - if (NULL != strchr(orted_cmd_line[i+2], ' ')) {
> + /* in the rsh environment, we can append multi-word
> arguments
> + * by enclosing them in quotes. Check for any multi-
word
> + * mca params passed to mpirun and include them
> + */
> + if (NULL != strchr(orted_cmd_line[i+2], ' ')) {
> + char* param;
> +
> + /* must add quotes around it */
> + asprintf(&param, "\"%s\"", orted_cmd_line[i+2]);
> + /* now pass it along */
> + opal_argv_append(argc, argv, orted_cmd_line[i]);
> + opal_argv_append(argc, argv, orted_cmd_line[i+1]);
> + opal_argv_append(argc, argv, param);
> + free(param);
> continue;
> }
> /* The daemon will attempt to open the PLM on the remote
> _______________________________________________
> svn mailing list
> s...@open-mpi.org
> http://www.open-mpi.org/mailman/listinfo.cgi/svn
>
> _______________________________________________
> devel mailing list
> de...@open-mpi.org
> http://www.open-mpi.org/mailman/listinfo.cgi/devel
_______________________________________________
devel mailing list
de...@open-mpi.org
http://www.open-mpi.org/mailman/listinfo.cgi/devel