Folks,

Since r32672 (trunk), grpcomm/rcd is the default module. The attached spawn.c test program is a trimmed version of the spawn_with_env_vars.c test case from the ibm test suite.
When invoked on two nodes:
- the program hangs with -np 2
- the program can crash with np > 2; the error message is
  [node0:30701] [[42913,0],0] TWO RECEIVES WITH SAME PEER [[42913,0],1] AND TAG -33 - ABORTING

Here is my full command line (from node0):
mpirun -host node0,node1 -np 2 --oversubscribe --mca btl tcp,self --mca coll ^ml ./spawn

A simple workaround is to add the following extra parameter to the mpirun command line:
--mca grpcomm_rcd_priority 0

My understanding is that the race condition occurs when all the processes call MPI_Finalize(): internally, the pmix module will have mpirun/orted issue two ALLGATHERs involving mpirun and orted (one for job 1, aka the parent, and one for job 2, aka the spawned tasks). The error message is very explicit: this is not (currently) supported.

I wrote the attached rml.patch, which is really a workaround and not a fix: in this case, each job will invoke an ALLGATHER but with a different tag /* that works for a limited number of jobs only */

I did not commit this patch since this is not a fix. Could someone (Ralph?) please review the issue and comment?

Cheers,
Gilles
/* * $HEADER$ * * Program to test MPI_Comm_spawn with environment variables. */ #include <stdio.h> #include <stdlib.h> #include <string.h> #include "mpi.h" static void do_parent(char *cmd, int rank, int count) { int *errcode, err; int i; MPI_Comm child_inter; MPI_Comm intra; FILE *fp; int found; int size; /* First, see if cmd exists on all ranks */ fp = fopen(cmd, "r"); if (NULL == fp) { found = 0; } else { fclose(fp); found = 1; } MPI_Comm_size(MPI_COMM_WORLD, &size); MPI_Allreduce(&found, &count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD); if (count != size) { if (rank == 0) { MPI_Abort(MPI_COMM_WORLD, 77); } return; } /* Now try the spawn if it's found anywhere */ errcode = malloc(sizeof(int) * count); if (NULL == errcode) { MPI_Abort(MPI_COMM_WORLD, 1); } memset(errcode, -1, count); MPI_Comm_spawn(cmd, MPI_ARGV_NULL, count, MPI_INFO_NULL, 0, MPI_COMM_WORLD, &child_inter, errcode); /* Clean up */ MPI_Barrier(child_inter); MPI_Comm_disconnect(&child_inter); free(errcode); } static void do_target(MPI_Comm parent) { MPI_Barrier(parent); MPI_Comm_disconnect(&parent); } int main(int argc, char *argv[]) { int rank, size; MPI_Comm parent; /* Ok, we're good. Proceed with the test. */ MPI_Init(&argc, &argv); MPI_Comm_size(MPI_COMM_WORLD, &size); MPI_Comm_rank(MPI_COMM_WORLD, &rank); /* Check to see if we *were* spawned -- because this is a test, we can only assume the existence of this one executable. Hence, we both mpirun it and spawn it. */ parent = MPI_COMM_NULL; MPI_Comm_get_parent(&parent); if (parent != MPI_COMM_NULL) { do_target(parent); } else { do_parent(argv[0], rank, size); } MPI_Comm_size(MPI_COMM_WORLD, &size); MPI_Comm_rank(MPI_COMM_WORLD, &rank); if (0 < rank) sleep(3); MPI_Finalize(); /* All done */ return 0; }
Index: orte/mca/grpcomm/brks/grpcomm_brks.c =================================================================== --- orte/mca/grpcomm/brks/grpcomm_brks.c (revision 32688) +++ orte/mca/grpcomm/brks/grpcomm_brks.c (working copy) @@ -6,6 +6,8 @@ * Copyright (c) 2011-2013 Los Alamos National Security, LLC. All * rights reserved. * Copyright (c) 2014 Intel, Inc. All rights reserved. + * Copyright (c) 2014 Research Organization for Information Science + * and Technology (RIST). All rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -111,6 +113,7 @@ static int brks_allgather_send_dist(orte_grpcomm_coll_t *coll, orte_vpid_t distance) { orte_process_name_t peer_send, peer_recv; opal_buffer_t *send_buf; + orte_rml_tag_t tag; int rc; peer_send.jobid = ORTE_PROC_MY_NAME->jobid; @@ -174,8 +177,14 @@ ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(&peer_send))); + if (1 != coll->sig->sz || ORTE_VPID_WILDCARD != coll->sig->signature[0].vpid) { + tag = ORTE_RML_TAG_ALLGATHER; + } else { + tag = ORTE_RML_TAG_JOB_ALLGATHER + ORTE_LOCAL_JOBID(coll->sig->signature[0].jobid) % (ORTE_RML_TAG_MAX-ORTE_RML_TAG_JOB_ALLGATHER); + } + if (0 > (rc = orte_rml.send_buffer_nb(&peer_send, send_buf, - -ORTE_RML_TAG_ALLGATHER, + -tag, orte_rml_send_callback, NULL))) { ORTE_ERROR_LOG(rc); OBJ_RELEASE(send_buf); @@ -189,7 +198,7 @@ /* setup recv for distance data */ orte_rml.recv_buffer_nb(&peer_recv, - -ORTE_RML_TAG_ALLGATHER, + -tag, ORTE_RML_NON_PERSISTENT, brks_allgather_recv_dist, NULL); Index: orte/mca/grpcomm/rcd/grpcomm_rcd.c =================================================================== --- orte/mca/grpcomm/rcd/grpcomm_rcd.c (revision 32688) +++ orte/mca/grpcomm/rcd/grpcomm_rcd.c (working copy) @@ -6,6 +6,8 @@ * Copyright (c) 2011-2013 Los Alamos National Security, LLC. All * rights reserved. * Copyright (c) 2014 Intel, Inc. All rights reserved. + * Copyright (c) 2014 Research Organization for Information Science + * and Technology (RIST). All rights reserved. 
* $COPYRIGHT$ * * Additional copyrights may follow @@ -105,6 +107,7 @@ static int rcd_allgather_send_dist(orte_grpcomm_coll_t *coll, orte_vpid_t distance) { orte_process_name_t peer; opal_buffer_t *send_buf; + orte_rml_tag_t tag; int rc; peer.jobid = ORTE_PROC_MY_NAME->jobid; @@ -163,8 +166,14 @@ ORTE_NAME_PRINT(&peer))); + if (1 != coll->sig->sz || ORTE_VPID_WILDCARD != coll->sig->signature[0].vpid) { + tag = ORTE_RML_TAG_ALLGATHER; + } else { + tag = ORTE_RML_TAG_JOB_ALLGATHER + ORTE_LOCAL_JOBID(coll->sig->signature[0].jobid) % (ORTE_RML_TAG_MAX-ORTE_RML_TAG_JOB_ALLGATHER); + } + if (0 > (rc = orte_rml.send_buffer_nb(&peer, send_buf, - -ORTE_RML_TAG_ALLGATHER, + -tag, orte_rml_send_callback, NULL))) { ORTE_ERROR_LOG(rc); OBJ_RELEASE(send_buf); @@ -178,7 +187,7 @@ /* setup recv for distance data */ orte_rml.recv_buffer_nb(&peer, - -ORTE_RML_TAG_ALLGATHER, + -tag, ORTE_RML_NON_PERSISTENT, rcd_allgather_recv_dist, NULL); Index: orte/mca/grpcomm/direct/grpcomm_direct.c =================================================================== --- orte/mca/grpcomm/direct/grpcomm_direct.c (revision 32688) +++ orte/mca/grpcomm/direct/grpcomm_direct.c (working copy) @@ -6,6 +6,8 @@ * Copyright (c) 2011-2013 Los Alamos National Security, LLC. All * rights reserved. * Copyright (c) 2014 Intel, Inc. All rights reserved. + * Copyright (c) 2014 Research Organization for Information Science + * and Technology (RIST). All rights reserved. 
* $COPYRIGHT$ * * Additional copyrights may follow @@ -70,6 +72,7 @@ */ static int init(void) { + orte_rml_tag_t tag; OBJ_CONSTRUCT(&tracker, opal_list_t); /* post the receives */ @@ -81,6 +84,12 @@ ORTE_RML_TAG_ALLGATHER, ORTE_RML_PERSISTENT, allgather_recv, NULL); + for (tag=ORTE_RML_TAG_JOB_ALLGATHER; tag<ORTE_RML_TAG_MAX; tag++) { + orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD, + tag, + ORTE_RML_PERSISTENT, + allgather_recv, NULL); + } /* setup recv for barrier release */ orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD, ORTE_RML_TAG_COLL_RELEASE, @@ -125,6 +134,7 @@ int rc, ret; opal_buffer_t *relay; orte_job_t *jdata; + orte_rml_tag_t tag; uint64_t nprocs; OPAL_OUTPUT_VERBOSE((1, orte_grpcomm_base_framework.framework_output, @@ -189,8 +199,15 @@ ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); /* send the info to the HNP for tracking */ + + if (1 != coll->sig->sz || ORTE_VPID_WILDCARD != coll->sig->signature[0].vpid) { + tag = ORTE_RML_TAG_ALLGATHER; + } else { + tag = ORTE_RML_TAG_JOB_ALLGATHER + ORTE_LOCAL_JOBID(coll->sig->signature[0].jobid) % (ORTE_RML_TAG_MAX-ORTE_RML_TAG_JOB_ALLGATHER); + } + rc = orte_rml.send_buffer_nb(ORTE_PROC_MY_HNP, relay, - ORTE_RML_TAG_ALLGATHER, + tag, orte_rml_send_callback, NULL); return rc; } Index: orte/mca/rml/rml_types.h =================================================================== --- orte/mca/rml/rml_types.h (revision 32688) +++ orte/mca/rml/rml_types.h (working copy) @@ -12,6 +12,8 @@ * Copyright (c) 2007-2012 Los Alamos National Security, LLC. All rights * reserved. * Copyright (c) 2009-2011 Cisco Systems, Inc. All rights reserved. + * Copyright (c) 2014 Research Organization for Information Science + * and Technology (RIST). All rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -152,6 +154,8 @@ /* global collective ID request */ #define ORTE_RML_TAG_FULL_COLL_ID 51 +#define ORTE_RML_TAG_JOB_ALLGATHER 52 + #define ORTE_RML_TAG_MAX 100