Hi, The OpenMpi I used is 1.8.4. I just tried to run a test program to see if the CUDA aware feature works. But I got the following errors.
ss@ss-Inspiron-5439:~/cuda-workspace/cuda_mpi_ex1$ mpirun -np 2 s1 [ss-Inspiron-5439:32514] *** Process received signal *** [ss-Inspiron-5439:32514] Signal: Segmentation fault (11) [ss-Inspiron-5439:32514] Signal code: Address not mapped (1) [ss-Inspiron-5439:32514] Failing at address: 0x3 [ss-Inspiron-5439:32514] [ 0] /lib/x86_64-linux-gnu/libc.so.6(+0x36c30)[0x7f74d7048c30] [ss-Inspiron-5439:32514] [ 1] /lib/x86_64-linux-gnu/libc.so.6(+0x98a70)[0x7f74d70aaa70] [ss-Inspiron-5439:32514] [ 2] /usr/local/openmpi-1.8.4/lib/libopen-pal.so.6(opal_convertor_pack+0x187)[0x7f74d673f097] [ss-Inspiron-5439:32514] [ 3] /usr/local/openmpi-1.8.4/lib/openmpi/mca_btl_self.so(mca_btl_self_prepare_src+0xb8)[0x7f74ce196888] [ss-Inspiron-5439:32514] [ 4] /usr/local/openmpi-1.8.4/lib/openmpi/mca_pml_ob1.so(mca_pml_ob1_send_request_start_prepare+0x4c)[0x7f74cd2c183c] [ss-Inspiron-5439:32514] [ 5] /usr/local/openmpi-1.8.4/lib/openmpi/mca_pml_ob1.so(mca_pml_ob1_send+0x5ba)[0x7f74cd2b78aa] [ss-Inspiron-5439:32514] [ 6] /usr/local/openmpi-1.8.4/lib/libmpi.so.1(PMPI_Send+0xf2)[0x7f74d79602a2] [ss-Inspiron-5439:32514] [ 7] s1[0x408b1e] [ss-Inspiron-5439:32514] [ 8] /lib/x86_64-linux-gnu/libc.so.6(__libc_start_main+0xf5)[0x7f74d7033ec5] [ss-Inspiron-5439:32514] [ 9] s1[0x4088e9] [ss-Inspiron-5439:32514] *** End of error message *** -------------------------------------------------------------------------- mpirun noticed that process rank 0 with PID 32514 on node ss-Inspiron-5439 exited on signal 11 (Segmentation fault). Looks like MPI_Send can not send CUDA buffer. But I already did the command ./configure --with-cuda for OpenMPI. The command I uesd is. ss@ss-Inspiron-5439:~/cuda-workspace/cuda_mpi_ex1$ nvcc -c k1.cu ss@ss-Inspiron-5439:~/cuda-workspace/cuda_mpi_ex1$ mpic++ -c main.cc ss@ss-Inspiron-5439:~/cuda-workspace/cuda_mpi_ex1$ mpic++ -o s1 main.o k1.o -L/usr/local/cuda/lib64/ -lcudart ss@ss-Inspiron-5439:~/cuda-workspace/cuda_mpi_ex1$ mpirun -np 2 ./s1 The code I'm running is main.cc file #include<iostream> using namespace std; #include<mpi.h> #include"k1.h" #define vect_len 16 const int blocksize = 16; int main(int argv, char *argc[]) { int numprocs, myid; MPI_Status status; const int vect_size = vect_len*sizeof(int); int *vect1 = new int[vect_size]; int *vect2 = new int[vect_size]; int *result = new int[vect_size]; bool flag; int *ad; int *bd; MPI_Init(&argv, &argc); MPI_Comm_rank(MPI_COMM_WORLD, &myid); MPI_Comm_size(MPI_COMM_WORLD, &numprocs); if(myid == 0) { for(int i = 0; i < vect_len; i++) { vect1[i] = i; vect2[i] = 2 * i; } } else { for(int i = 0; i < vect_len; i++) { vect1[i] = 2 * i; vect2[i] = i; } } initializeGPU(vect1, vect2, ad, bd, vect_size); if(myid == 0) { for(int i = 0; i < numprocs; i++) { MPI_Send(ad,vect_len, MPI_INT, i, 99, MPI_COMM_WORLD ); MPI_Send(bd,vect_len, MPI_INT, i, 99, MPI_COMM_WORLD ); } } else { MPI_Recv(ad,vect_len, MPI_INT, 0, 99, MPI_COMM_WORLD, &status ); MPI_Recv(bd,vect_len, MPI_INT, 0, 99, MPI_COMM_WORLD, &status ); } computeGPU(blocksize, vect_len, ad, bd, result, vect_size); //Verify flag = true; for(int i = 0; i < vect_len; i++) { if (i < 8) vect1[i] += vect2[i]; else vect1[i] -= vect2[i]; } for(int i = 0; i < vect_len; i++) { if( result[i] != vect1[i] ) { cout<<"the result ["<<i<<"] by m2s is"<<result[i]<<endl; cout<<"the result ["<<i<<"] of vector is"<<vect1[i]<<endl; cout << "Verification fail at " << i << endl; flag = false; break; } } if(flag) cout << "Verification passes." <<endl; // free device memory /* cudaFree( ad ); cudaFree( bd ); free(vect1); free(vect2); free(result); */ MPI_Finalize(); } k1.h file void initializeGPU(int *hostptr1, int *hostptr2, int *ad, int *bd, int vect_size); void computeGPU(int block_size, int vect_len, int *ptr1, int *ptr2, int* result, int vect_size); k1.cu file #include"k1.h" __global__ void vect_add(int *a, int *b, int n) { int id = threadIdx.x; if (id < n) a[id] = a[id] + b[id]; else a[id] = a[id] - b[id]; } void initializeGPU(int *hostptr1, int *hostptr2, int *ad, int *bd, int vect_size) { // initialize device memory cudaMalloc( (void**)&ad, vect_size ); cudaMalloc( (void**)&bd, vect_size ); // copy data to device cudaMemcpy( ad, hostptr1, vect_size, cudaMemcpyHostToDevice ); cudaMemcpy( bd, hostptr2, vect_size, cudaMemcpyHostToDevice ); } void computeGPU(int block_size, int vect_len, int *ptr1, int *ptr2, int* result, int vect_size) { // setup block and grid size dim3 dimBlock( block_size, 1, 1); dim3 dimGrid( vect_len/block_size, 1 , 1); vect_add<<<dimGrid, dimBlock>>>(ptr1, ptr2, 8); cudaMemcpy( result, ptr1 , vect_size, cudaMemcpyDeviceToHost ); } Many Thanks for help, Xun