Hi, I am trying to write a trivial master-slave program. The master simply creates the slaves and sends them a string; they print it out and exit. Everything works just fine; however, when I add a delay (2 seconds or more) before calling MPI_Init in the slave, MPI fails with MPI_ERR_SPAWN. I am pretty sure that MPI_Comm_spawn has some kind of timeout while waiting for the slaves to call MPI_Init, and if they fail to respond in time, it returns an error.
I believe there is a way to change this behaviour, but I wasn't able to find any suggestions/ideas in the internet. I would appreciate if someone could help with this. --- --- terminal command i use to run program: mpirun -n 1 hello 2 2 // the first argument to "hello" is number of slaves, the second is delay in seconds --- Error message I get when delay is >=2 sec: [host:2231] *** An error occurred in MPI_Comm_spawn [host:2231] *** reported by process [3453419521,0] [host:2231] *** on communicator MPI_COMM_SELF [host:2231] *** MPI_ERR_SPAWN: could not spawn processes [host:2231] *** MPI_ERRORS_ARE_FATAL (processes in this communicator will now abort, [host:2231] *** and potentially your MPI job) --- The program itself: #include "stdlib.h" #include "stdio.h" #include "mpi.h" #include "unistd.h" MPI_Comm slave_comm; MPI_Comm new_world; #define MESSAGE_SIZE 40 void slave() { printf("Slave initialized; "); MPI_Comm_get_parent(&slave_comm); MPI_Intercomm_merge(slave_comm, 1, &new_world); int slave_rank; MPI_Comm_rank(new_world, &slave_rank); char message[MESSAGE_SIZE]; MPI_Bcast(message, MESSAGE_SIZE, MPI_CHAR, 0, new_world); printf("Slave %d received message from master: %s\n", slave_rank, message); } void master(int slave_count, char* executable, char* delay) { char* slave_argv[] = { delay, NULL }; MPI_Comm_spawn( executable, slave_argv, slave_count, MPI_INFO_NULL, 0, MPI_COMM_SELF, &slave_comm, MPI_ERRCODES_IGNORE); MPI_Intercomm_merge(slave_comm, 0, &new_world); char* helloWorld = "Hello New World!\0"; MPI_Bcast(helloWorld, MESSAGE_SIZE, MPI_CHAR, 0, new_world); printf("Processes spawned!\n"); } int main(int argc, char* argv[]) { if (argc > 2) { MPI_Init(&argc, &argv); master(atoi(argv[1]), argv[0], argv[2]); } else { sleep(atoi(argv[1])); /// delay MPI_Init(&argc, &argv); slave(); } MPI_Comm_free(&new_world); MPI_Comm_free(&slave_comm); MPI_Finalize(); } Thank you, Andrew Elistratov _______________________________________________ users mailing list 
users@lists.open-mpi.org https://rfd.newmexicoconsortium.org/mailman/listinfo/users