Hi,

I am trying to write a trivial master-slave program. The master simply creates
the slaves and sends them a string; they print it out and exit. Everything works
just fine; however, when I add a delay (2 seconds or more) before calling
MPI_Init in the slave, MPI fails with MPI_ERR_SPAWN. I am pretty sure that
MPI_Comm_spawn has some kind of timeout while waiting for the slaves to call
MPI_Init, and if they fail to respond in time, it returns an error.

I believe there is a way to change this behaviour, but I wasn't able to
find any suggestions or ideas on the internet.
I would appreciate it if someone could help with this.

---
--- Terminal command I use to run the program:
mpirun -n 1 hello 2 2 // the first argument to "hello" is the number of
slaves, the second is the delay in seconds

--- Error message I get when delay is >=2 sec:
[host:2231] *** An error occurred in MPI_Comm_spawn
[host:2231] *** reported by process [3453419521,0]
[host:2231] *** on communicator MPI_COMM_SELF
[host:2231] *** MPI_ERR_SPAWN: could not spawn processes
[host:2231] *** MPI_ERRORS_ARE_FATAL (processes in this communicator will
now abort,
[host:2231] ***    and potentially your MPI job)

--- The program itself:
#include "stdlib.h"
#include "stdio.h"
#include "mpi.h"
#include "unistd.h"

/* Communicator linking master and slaves: set by MPI_Comm_spawn in master()
 * and by MPI_Comm_get_parent in slave(). */
MPI_Comm slave_comm;
/* Communicator produced by MPI_Intercomm_merge on slave_comm; used for the
 * broadcast in both master() and slave(). */
MPI_Comm new_world;
/* Number of MPI_CHAR elements broadcast, and the size of the slave's
 * receive buffer. */
#define MESSAGE_SIZE 40

/*
 * Slave side of the program.
 *
 * Obtains the parent communicator into slave_comm, merges it into new_world
 * (passing high = 1), receives MESSAGE_SIZE chars broadcast from rank 0 of
 * new_world and prints them together with this process's rank in new_world.
 */
void slave() {
        char buffer[MESSAGE_SIZE];
        int rank;

        printf("Slave initialized; ");

        /* Connect to the spawning process and join the merged communicator. */
        MPI_Comm_get_parent(&slave_comm);
        MPI_Intercomm_merge(slave_comm, 1, &new_world);
        MPI_Comm_rank(new_world, &rank);

        /* Receive the message sent by rank 0 of new_world. */
        MPI_Bcast(buffer, MESSAGE_SIZE, MPI_CHAR, 0, new_world);

        printf("Slave %d received message from master: %s\n", rank, buffer);
}

/*
 * Master side of the program.
 *
 * Spawns slave_count copies of executable, merges the resulting
 * intercommunicator (stored in slave_comm) into new_world, and broadcasts a
 * greeting of MESSAGE_SIZE chars from rank 0 of new_world.
 *
 * slave_count - number of processes to spawn
 * executable  - command handed to MPI_Comm_spawn
 * delay       - passed through as the only command-line argument of each
 *               spawned process
 */
void master(int slave_count, char* executable, char* delay) {
        char* slave_argv[] = { delay, NULL };

        /*
         * MPI_Bcast below reads exactly MESSAGE_SIZE chars from the send
         * buffer, so the buffer must really be that large. Broadcasting
         * straight from the (much shorter) string literal reads past its end.
         * The array initializer zero-fills the bytes after the text.
         */
        char message[MESSAGE_SIZE] = "Hello New World!";

        MPI_Comm_spawn( executable,
                        slave_argv,
                        slave_count,
                        MPI_INFO_NULL,
                        0,
                        MPI_COMM_SELF,
                        &slave_comm,
                        MPI_ERRCODES_IGNORE);

        /* high = 0 here (the slaves pass 1). */
        MPI_Intercomm_merge(slave_comm, 0, &new_world);

        MPI_Bcast(message, MESSAGE_SIZE, MPI_CHAR, 0, new_world);
        printf("Processes spawned!\n");
}

/*
 * Entry point shared by both roles.
 *
 * With two or more arguments (<slave_count> <delay_seconds>) the process acts
 * as the master. With exactly one argument (<delay_seconds>) it acts as a
 * slave: it sleeps for that many seconds, then initializes MPI.
 * With no arguments it prints a usage message and exits with failure.
 */
int main(int argc, char* argv[]) {
        /* Without this guard argv[1] is NULL and atoi(NULL) is undefined. */
        if (argc < 2) {
                fprintf(stderr,
                        "usage: mpirun -n 1 %s <slave_count> <delay_seconds>\n",
                        argc > 0 ? argv[0] : "hello");
                return EXIT_FAILURE;
        }

        if (argc > 2) {
                MPI_Init(&argc, &argv);
                master(atoi(argv[1]), argv[0], argv[2]);
        } else {
                /* Delay before MPI_Init; a negative value would otherwise be
                 * converted to a huge unsigned sleep time. */
                int delay = atoi(argv[1]);
                if (delay > 0) {
                        sleep((unsigned int)delay);
                }
                MPI_Init(&argc, &argv);
                slave();
        }

        MPI_Comm_free(&new_world);
        MPI_Comm_free(&slave_comm);
        MPI_Finalize();
        return EXIT_SUCCESS;
}


Thank you,

Andrew Elistratov


_______________________________________________
users mailing list
users@lists.open-mpi.org
https://rfd.newmexicoconsortium.org/mailman/listinfo/users

Reply via email to