I think I found a bug in your program in how you were allocating the GPU
buffers. I will send you a version off-list with the fix.
Also, there is no need to rerun with the flags I mentioned below.
Rolf
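
The fix itself went off-list, so it is not shown in this thread. Judging from the code
further down, a likely candidate is that initializeGPU() takes the device pointers ad
and bd by value, so the addresses returned by cudaMalloc() never make it back to main()
and MPI_Send() is then called on an uninitialized pointer (consistent with the
"Address not mapped" fault at 0x3). A minimal sketch of that kind of change, assuming
this is indeed the problem, is to pass the pointers by address:

void initializeGPU(int *hostptr1, int *hostptr2, int **ad, int **bd, int vect_size)
{
    // Allocate device memory through the output parameters so the
    // caller's ad and bd actually receive the device addresses.
    cudaMalloc( (void**)ad, vect_size );
    cudaMalloc( (void**)bd, vect_size );

    // Copy the host vectors to the device.
    cudaMemcpy( *ad, hostptr1, vect_size, cudaMemcpyHostToDevice );
    cudaMemcpy( *bd, hostptr2, vect_size, cudaMemcpyHostToDevice );
}

The corresponding call in main.cc would then be initializeGPU(vect1, vect2, &ad, &bd,
vect_size), with a matching prototype in k1.h.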


From: Rolf vandeVaart
Sent: Monday, January 12, 2015 9:38 AM
To: us...@open-mpi.org
Subject: RE: [OMPI users] Segmentation fault when using CUDA Aware feature

That is strange; I am not sure why that is happening. I will try to reproduce it with
your program on my system. Also, perhaps you could rerun with --mca
mpi_common_cuda_verbose 100 and send me that output.
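For example, with the same launch line as below, that would be something like:

mpirun -np 2 --mca mpi_common_cuda_verbose 100 ./s1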

Thanks

From: users [mailto:users-boun...@open-mpi.org] On Behalf Of Xun Gong
Sent: Sunday, January 11, 2015 11:41 PM
To: us...@open-mpi.org
Subject: [OMPI users] Segmentation fault when using CUDA Aware feature

Hi,

The Open MPI version I am using is 1.8.4. I tried to run a test program to see if the
CUDA-aware feature works, but I got the following errors.

ss@ss-Inspiron-5439:~/cuda-workspace/cuda_mpi_ex1$ mpirun -np 2 s1
[ss-Inspiron-5439:32514] *** Process received signal ***
[ss-Inspiron-5439:32514] Signal: Segmentation fault (11)
[ss-Inspiron-5439:32514] Signal code: Address not mapped (1)
[ss-Inspiron-5439:32514] Failing at address: 0x3
[ss-Inspiron-5439:32514] [ 0] 
/lib/x86_64-linux-gnu/libc.so.6(+0x36c30)[0x7f74d7048c30]
[ss-Inspiron-5439:32514] [ 1] 
/lib/x86_64-linux-gnu/libc.so.6(+0x98a70)[0x7f74d70aaa70]
[ss-Inspiron-5439:32514] [ 2] 
/usr/local/openmpi-1.8.4/lib/libopen-pal.so.6(opal_convertor_pack+0x187)[0x7f74d673f097]
[ss-Inspiron-5439:32514] [ 3] 
/usr/local/openmpi-1.8.4/lib/openmpi/mca_btl_self.so(mca_btl_self_prepare_src+0xb8)[0x7f74ce196888]
[ss-Inspiron-5439:32514] [ 4] 
/usr/local/openmpi-1.8.4/lib/openmpi/mca_pml_ob1.so(mca_pml_ob1_send_request_start_prepare+0x4c)[0x7f74cd2c183c]
[ss-Inspiron-5439:32514] [ 5] 
/usr/local/openmpi-1.8.4/lib/openmpi/mca_pml_ob1.so(mca_pml_ob1_send+0x5ba)[0x7f74cd2b78aa]
[ss-Inspiron-5439:32514] [ 6] 
/usr/local/openmpi-1.8.4/lib/libmpi.so.1(PMPI_Send+0xf2)[0x7f74d79602a2]
[ss-Inspiron-5439:32514] [ 7] s1[0x408b1e]
[ss-Inspiron-5439:32514] [ 8] 
/lib/x86_64-linux-gnu/libc.so.6(__libc_start_main+0xf5)[0x7f74d7033ec5]
[ss-Inspiron-5439:32514] [ 9] s1[0x4088e9]
[ss-Inspiron-5439:32514] *** End of error message ***
--------------------------------------------------------------------------
mpirun noticed that process rank 0 with PID 32514 on node ss-Inspiron-5439 
exited on signal 11 (Segmentation fault).

It looks like MPI_Send cannot send a CUDA buffer, but I already configured Open MPI
with ./configure --with-cuda.
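
As a sanity check (not something from the original thread), the standard way to confirm
that an installed Open MPI build actually has CUDA support compiled in is:

ompi_info --parsable --all | grep mpi_built_with_cuda_support:value

which should report a value of true for a CUDA-aware build.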


The commands I used are:

ss@ss-Inspiron-5439:~/cuda-workspace/cuda_mpi_ex1$ nvcc -c k1.cu
ss@ss-Inspiron-5439:~/cuda-workspace/cuda_mpi_ex1$ mpic++ -c main.cc
ss@ss-Inspiron-5439:~/cuda-workspace/cuda_mpi_ex1$ mpic++ -o s1 main.o k1.o -L/usr/local/cuda/lib64/ -lcudart
ss@ss-Inspiron-5439:~/cuda-workspace/cuda_mpi_ex1$ mpirun -np 2 ./s1



The code I'm running is

main.cc file
#include<iostream>
using namespace std;
#include<mpi.h>
#include"k1.h"
#define vect_len 16
const int blocksize = 16;

int main(int argv, char *argc[])
{
          int numprocs, myid;
          MPI_Status status;
          const int vect_size = vect_len*sizeof(int);

          int *vect1 = new int[vect_size];
          int *vect2 = new int[vect_size];
          int *result = new int[vect_size];
          bool flag;

          int *ad;
          int *bd;

          MPI_Init(&argv, &argc);
          MPI_Comm_rank(MPI_COMM_WORLD, &myid);
          MPI_Comm_size(MPI_COMM_WORLD, &numprocs);
          if(myid == 0)
          {
                      for(int i = 0; i < vect_len; i++)
                      {
                                  vect1[i] = i;
                                  vect2[i] = 2 * i;
                      }
          }
          else
          {
                      for(int i = 0; i < vect_len; i++)
                      {
                                  vect1[i] = 2 * i;
                                  vect2[i] = i;
                      }
          }

          initializeGPU(vect1, vect2, ad, bd, vect_size);

          if(myid == 0)
          {
                      for(int i = 0; i < numprocs; i++)
                      {
                                  MPI_Send(ad, vect_len, MPI_INT, i, 99, MPI_COMM_WORLD);
                                  MPI_Send(bd, vect_len, MPI_INT, i, 99, MPI_COMM_WORLD);
                      }
          }
          else
          {
                      MPI_Recv(ad, vect_len, MPI_INT, 0, 99, MPI_COMM_WORLD, &status);
                      MPI_Recv(bd, vect_len, MPI_INT, 0, 99, MPI_COMM_WORLD, &status);
          }



          computeGPU(blocksize, vect_len, ad, bd, result, vect_size);

          //Verify
          flag = true;

          for(int i = 0; i < vect_len; i++)
          {
                      if (i < 8)
                      vect1[i] += vect2[i];
                      else
                      vect1[i] -= vect2[i];

          }

          for(int i = 0; i < vect_len; i++)
          {
                      if( result[i] != vect1[i] )
                      {
                                  cout<<"the result ["<<i<<"] by m2s 
is"<<result[i]<<endl;
                                  cout<<"the result ["<<i<<"] of vector 
is"<<vect1[i]<<endl;
                                  cout << "Verification fail at " << i << endl;
                                  flag = false;
                                  break;
                      }

          }
          if(flag)
                      cout << "Verification passes." <<endl;
          // free device memory
/*       cudaFree( ad );
          cudaFree( bd );
          free(vect1);
          free(vect2);
          free(result);
*/
          MPI_Finalize();
}


k1.h file

void initializeGPU(int *hostptr1, int *hostptr2, int *ad, int *bd, int 
vect_size);

void computeGPU(int block_size, int vect_len, int *ptr1, int *ptr2, int* 
result, int vect_size);



k1.cu file

#include"k1.h"

__global__ void vect_add(int *a, int *b, int n)
{

          int id = threadIdx.x;

          if (id < n)
                      a[id] = a[id] + b[id];
          else
                      a[id] = a[id] - b[id];
}

void initializeGPU(int *hostptr1, int *hostptr2, int *ad, int *bd, int 
vect_size)
{

          // initialize device memory
          cudaMalloc( (void**)&ad, vect_size );
          cudaMalloc( (void**)&bd, vect_size );

          // copy data to device
          cudaMemcpy( ad, hostptr1, vect_size, cudaMemcpyHostToDevice );
          cudaMemcpy( bd, hostptr2, vect_size, cudaMemcpyHostToDevice );

}

void computeGPU(int block_size, int vect_len, int *ptr1, int *ptr2, int* 
result, int vect_size)
{
          // setup block and grid size
          dim3 dimBlock( block_size, 1, 1);
          dim3 dimGrid( vect_len/block_size, 1 , 1);
          vect_add<<<dimGrid, dimBlock>>>(ptr1, ptr2, 8);

          cudaMemcpy( result, ptr1 , vect_size, cudaMemcpyDeviceToHost );
}


Many thanks for the help,
Xun





