Hi,
I have verified that the Open MPI version is 1.2.7 on all the nodes, and that
"ompi_info | grep thread" reports "Thread support: posix (mpi: no, progress: no)"
on these machines.
I get the error with and without -mca oob_tcp_listen_mode listen_thread.
Sometimes, the startup takes too long with the listen_thread enabled and I
have to resort to killing and restarting the program.
Would the following matter in any way?
1. The head node (node where I start the mpi process) being a part of the
cluster
2. The head node also being the root node (node with vpid 0)
3. The head node not being a part of the cluster
I am currently trying the configurations above, along with other combinations
such as tweaking -mca oob_tcp_thread_max_size.
The test program I run is the following:
#include <boost/mpi.hpp>
#include <iostream>
/**
 * Boost.MPI smoke test: every rank prints its host name and rank, then all
 * ranks sum their rank numbers with all_reduce and rank 0 prints the total.
 *
 * @param argc  argument count, forwarded to the MPI environment
 * @param argv  argument vector, forwarded to the MPI environment
 * @return 0 on success, -1 if the MPI environment could not be initialized
 */
int
main(int argc, char **argv)
{
    // Initialize the MPI environment; it is finalized when env is destructed.
    boost::mpi::environment env(argc, argv);
    if (!env.initialized()) {
        std::cout << "Could not initialize MPI environment!" << std::endl;
        return -1;
    }

    boost::mpi::communicator world;
    // Identity of this process and total process count in the default communicator.
    const int myrank = world.rank();
    const int ntasks = world.size();

    // Zero-fill the buffer: POSIX leaves null termination unspecified when the
    // host name is truncated, and the final byte is never written by the call
    // because we pass sizeof(hn) - 1. gethostname is declared in <unistd.h>.
    char hn[256] = {0};
    gethostname(hn, sizeof(hn) - 1);
    std::cout << hn << " is node " << myrank << " of " << ntasks << std::endl;

    const int allranks = boost::mpi::all_reduce(world, myrank, std::plus<int>());
    world.barrier();
    if (myrank == 0) {
        std::cout << "ranks sum to " << allranks << std::endl;
    }
    return 0;
}
I also tried a version without Boost.MPI, with the same results:
#include <stdio.h>
#include <mpi.h>
/*
 * Plain-MPI smoke test: every rank prints its host name and rank, then all
 * ranks sum their rank numbers and rank 0 prints the total.
 *
 * Returns 0 after MPI_Finalize.
 */
int main (int argc, char* argv[])
{
    int rank, size;
    MPI_Init (&argc, &argv);                /* starts MPI */
    MPI_Comm_rank (MPI_COMM_WORLD, &rank);  /* get current process id */
    MPI_Comm_size (MPI_COMM_WORLD, &size);  /* get number of processes */

    /*
     * Zero-fill the buffer: POSIX leaves null termination unspecified when the
     * host name is truncated, and the last byte is never written by the call
     * because we pass sizeof(hn) - 1. gethostname is declared in <unistd.h>.
     */
    char hn[256] = {0};
    gethostname(hn, sizeof(hn) - 1);
    printf( "%s is node %d of %d\n", hn, rank, size );

    /*
     * Sum one int per rank with MPI_Allreduce. The previous MPI_Reduce_scatter
     * call relied on a fixed 1024-entry receive-count array ({1, 0, 0, ...}),
     * which capped the job at 1024 ranks and left all_ranks unset on every
     * rank except 0.
     */
    int all_ranks = 0;
    MPI_Allreduce (&rank, &all_ranks, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);
    MPI_Barrier (MPI_COMM_WORLD);
    if (rank == 0)
        printf( "ranks sum to %d\n", all_ranks );

    MPI_Finalize();
    return 0;
}
Regards,
Prasanna.