Running the following program on two nodes connected by InfiniBand, with OMPI 1.5.1 (openib BTL) and one task per node:

#include <mpi.h>
#include <stdlib.h>

int main(int argc, char** argv) {
  int rank, size, i, *buf1, *buf2;
  MPI_Request* reqs;
  MPI_Init(&argc, &argv);
  MPI_Comm_rank(MPI_COMM_WORLD, &rank);
  MPI_Comm_size(MPI_COMM_WORLD, &size);
  buf1 = (int*)calloc(1000 * size, sizeof(int));
  if (!buf1) abort();
  buf2 = (int*)calloc(1000 * size, sizeof(int));
  if (!buf2) abort();
  reqs = (MPI_Request*)malloc(2 * size * sizeof(MPI_Request));
  if (!reqs) abort();
  for (i = 0; i < size; ++i) {
    /* Post a nonblocking send and receive to/from peer (i + rank) % size. */
    MPI_Isend(buf1 + 1000 * i, 1000, MPI_INT, (i + rank) % size, 0,
              MPI_COMM_WORLD, &reqs[i]);
    MPI_Irecv(buf2 + 1000 * i, 1000, MPI_INT, (i + rank) % size, 0,
              MPI_COMM_WORLD, &reqs[size + i]);
  }
  MPI_Waitall(2 * size, reqs, MPI_STATUSES_IGNORE);
  free(buf1);
  free(buf2);
  free(reqs);
  MPI_Finalize();
  return 0;
}

produces an infinite loop in MPI_Waitall. When run under valgrind, the MPI_Waitall call instead dereferences a NULL pointer and segfaults. The code works correctly with Open MPI 1.4.1, and with 1.5.1 when using the TCP BTL.
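
For reference, the failing and working runs were launched with commands along these lines (the hostfile name and binary name below are placeholders rather than my literal command lines; "hosts" lists the two nodes with one slot each):

# openib BTL, one task per node -- hangs in MPI_Waitall with 1.5.1
mpirun -np 2 --hostfile hosts --mca btl openib,self ./isend_test

# TCP BTL on 1.5.1 -- completes normally
mpirun -np 2 --hostfile hosts --mca btl tcp,self ./isend_test

# same openib run under valgrind -- segfaults inside MPI_Waitall
mpirun -np 2 --hostfile hosts --mca btl openib,self valgrind ./isend_test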

-- Jeremiah Willcock
