
The OpenMpi I used is 1.8.4. I just tried to run a test program to see if
the CUDA aware feature works. But I got the following errors.

ss@ss-Inspiron-5439:~/cuda-workspace/cuda_mpi_ex1$ mpirun -np 2 s1
[ss-Inspiron-5439:32514] *** Process received signal ***
[ss-Inspiron-5439:32514] Signal: Segmentation fault (11)
[ss-Inspiron-5439:32514] Signal code: Address not mapped (1)
[ss-Inspiron-5439:32514] Failing at address: 0x3
[ss-Inspiron-5439:32514] [ 0]
[ss-Inspiron-5439:32514] [ 1]
[ss-Inspiron-5439:32514] [ 2]
[ss-Inspiron-5439:32514] [ 3]
[ss-Inspiron-5439:32514] [ 4]
[ss-Inspiron-5439:32514] [ 5]
[ss-Inspiron-5439:32514] [ 6]
[ss-Inspiron-5439:32514] [ 7] s1[0x408b1e]
[ss-Inspiron-5439:32514] [ 8]
[ss-Inspiron-5439:32514] [ 9] s1[0x4088e9]
[ss-Inspiron-5439:32514] *** End of error message ***
mpirun noticed that process rank 0 with PID 32514 on node ss-Inspiron-5439
exited on signal 11 (Segmentation fault).

Looks like MPI_Send can not send CUDA buffer. But I already did  the
command      ./configure --with-cuda for OpenMPI.

The command I uesd is.

ss@ss-Inspiron-5439:~/cuda-workspace/cuda_mpi_ex1$ nvcc -c k1.cu
ss@ss-Inspiron-5439:~/cuda-workspace/cuda_mpi_ex1$ mpic++ -c main.cc
ss@ss-Inspiron-5439:~/cuda-workspace/cuda_mpi_ex1$ mpic++ -o s1 main.o k1.o
-L/usr/local/cuda/lib64/ -lcudart
ss@ss-Inspiron-5439:~/cuda-workspace/cuda_mpi_ex1$ mpirun -np 2 ./s1

The code I'm running is

main.cc file
using namespace std;
#define vect_len 16
const int blocksize = 16;

int main(int argv, char *argc[])
int numprocs, myid;
MPI_Status status;
const int vect_size = vect_len*sizeof(int);

int *vect1 = new int[vect_size];
int *vect2 = new int[vect_size];
int *result = new int[vect_size];
  bool flag;

  int *ad;
  int *bd;

MPI_Init(&argv, &argc);
MPI_Comm_rank(MPI_COMM_WORLD, &myid);
MPI_Comm_size(MPI_COMM_WORLD, &numprocs);
  if(myid == 0)
for(int i = 0; i < vect_len; i++)
vect1[i] = i;
vect2[i] = 2 * i;
for(int i = 0; i < vect_len; i++)
vect1[i] = 2 * i;
vect2[i] = i;

  initializeGPU(vect1, vect2, ad, bd, vect_size);

if(myid == 0)
for(int i = 0; i < numprocs; i++)
MPI_Send(ad,vect_len, MPI_INT, i, 99, MPI_COMM_WORLD );
MPI_Send(bd,vect_len, MPI_INT, i, 99, MPI_COMM_WORLD );
MPI_Recv(ad,vect_len, MPI_INT, 0, 99, MPI_COMM_WORLD, &status );
MPI_Recv(bd,vect_len, MPI_INT, 0, 99, MPI_COMM_WORLD, &status );

computeGPU(blocksize, vect_len, ad, bd, result, vect_size);

flag = true;

for(int i = 0; i < vect_len; i++)
if (i < 8)
vect1[i] += vect2[i];
vect1[i] -= vect2[i];


for(int i = 0; i < vect_len; i++)
if( result[i] != vect1[i] )
cout<<"the result ["<<i<<"] by m2s is"<<result[i]<<endl;
cout<<"the result ["<<i<<"] of vector is"<<vect1[i]<<endl;
cout << "Verification fail at " << i << endl;
flag = false;

cout << "Verification passes." <<endl;
// free device memory
/* cudaFree( ad );
cudaFree( bd );

k1.h file

void initializeGPU(int *hostptr1, int *hostptr2, int *ad, int *bd, int

void computeGPU(int block_size, int vect_len, int *ptr1, int *ptr2, int*
result, int vect_size);

k1.cu file


__global__ void vect_add(int *a, int *b, int n)

int id = threadIdx.x;

if (id < n)
a[id] = a[id] + b[id];
a[id] = a[id] - b[id];

void initializeGPU(int *hostptr1, int *hostptr2, int *ad, int *bd, int

// initialize device memory
cudaMalloc( (void**)&ad, vect_size );
cudaMalloc( (void**)&bd, vect_size );

// copy data to device
cudaMemcpy( ad, hostptr1, vect_size, cudaMemcpyHostToDevice );
cudaMemcpy( bd, hostptr2, vect_size, cudaMemcpyHostToDevice );


void computeGPU(int block_size, int vect_len, int *ptr1, int *ptr2, int*
result, int vect_size)
// setup block and grid size
dim3 dimBlock( block_size, 1, 1);
dim3 dimGrid( vect_len/block_size, 1 , 1);
vect_add<<<dimGrid, dimBlock>>>(ptr1, ptr2, 8);

cudaMemcpy( result, ptr1 , vect_size, cudaMemcpyDeviceToHost );

Many Thanks for help,

Reply via email to