Hi,

The OpenMPI version I am using is 1.8.4. I just tried to run a test program to
see whether the CUDA-aware feature works, but I got the following errors.

ss@ss-Inspiron-5439:~/cuda-workspace/cuda_mpi_ex1$ mpirun -np 2 s1
[ss-Inspiron-5439:32514] *** Process received signal ***
[ss-Inspiron-5439:32514] Signal: Segmentation fault (11)
[ss-Inspiron-5439:32514] Signal code: Address not mapped (1)
[ss-Inspiron-5439:32514] Failing at address: 0x3
[ss-Inspiron-5439:32514] [ 0]
/lib/x86_64-linux-gnu/libc.so.6(+0x36c30)[0x7f74d7048c30]
[ss-Inspiron-5439:32514] [ 1]
/lib/x86_64-linux-gnu/libc.so.6(+0x98a70)[0x7f74d70aaa70]
[ss-Inspiron-5439:32514] [ 2]
/usr/local/openmpi-1.8.4/lib/libopen-pal.so.6(opal_convertor_pack+0x187)[0x7f74d673f097]
[ss-Inspiron-5439:32514] [ 3]
/usr/local/openmpi-1.8.4/lib/openmpi/mca_btl_self.so(mca_btl_self_prepare_src+0xb8)[0x7f74ce196888]
[ss-Inspiron-5439:32514] [ 4]
/usr/local/openmpi-1.8.4/lib/openmpi/mca_pml_ob1.so(mca_pml_ob1_send_request_start_prepare+0x4c)[0x7f74cd2c183c]
[ss-Inspiron-5439:32514] [ 5]
/usr/local/openmpi-1.8.4/lib/openmpi/mca_pml_ob1.so(mca_pml_ob1_send+0x5ba)[0x7f74cd2b78aa]
[ss-Inspiron-5439:32514] [ 6]
/usr/local/openmpi-1.8.4/lib/libmpi.so.1(PMPI_Send+0xf2)[0x7f74d79602a2]
[ss-Inspiron-5439:32514] [ 7] s1[0x408b1e]
[ss-Inspiron-5439:32514] [ 8]
/lib/x86_64-linux-gnu/libc.so.6(__libc_start_main+0xf5)[0x7f74d7033ec5]
[ss-Inspiron-5439:32514] [ 9] s1[0x4088e9]
[ss-Inspiron-5439:32514] *** End of error message ***
--------------------------------------------------------------------------
mpirun noticed that process rank 0 with PID 32514 on node ss-Inspiron-5439
exited on signal 11 (Segmentation fault).

It looks like MPI_Send cannot send a CUDA device buffer, even though I already
configured OpenMPI with ./configure --with-cuda.


The commands I used are:

ss@ss-Inspiron-5439:~/cuda-workspace/cuda_mpi_ex1$ nvcc -c k1.cu
ss@ss-Inspiron-5439:~/cuda-workspace/cuda_mpi_ex1$ mpic++ -c main.cc
ss@ss-Inspiron-5439:~/cuda-workspace/cuda_mpi_ex1$ mpic++ -o s1 main.o k1.o
-L/usr/local/cuda/lib64/ -lcudart
ss@ss-Inspiron-5439:~/cuda-workspace/cuda_mpi_ex1$ mpirun -np 2 ./s1



The code I'm running is

main.cc file
#include<iostream>
using namespace std;
#include<mpi.h>
#include"k1.h"
// Number of int elements in each vector (NOT bytes).
#define vect_len 16
// CUDA threads per block, forwarded to computeGPU() as block_size.
const int blocksize = 16;

// Driver: each rank fills two host vectors, uploads them to the GPU,
// rank 0 distributes its device copies to the other ranks via MPI
// (exercising CUDA-aware MPI), every rank runs the add/sub kernel, and
// the result is checked against a CPU reference computed in vect1.
int main(int argc, char *argv[])
{
    int numprocs, myid;
    MPI_Status status;
    // Size of each vector in BYTES -- used for cudaMalloc/cudaMemcpy in k1.cu.
    const int vect_size = vect_len * sizeof(int);

    // Host buffers hold vect_len ints.  The original allocated vect_size
    // (= vect_len * sizeof(int)) ELEMENTS, over-allocating by 4x.
    int *vect1  = new int[vect_len];
    int *vect2  = new int[vect_len];
    int *result = new int[vect_len];
    bool flag;

    // Device pointers.  NOTE(review): initializeGPU() receives these by
    // VALUE, so the cudaMalloc() it performs never propagates back here --
    // ad/bd remain uninitialized in this scope, which is exactly why the
    // later MPI_Send segfaults ("Address not mapped").  The k1.h signature
    // needs int** (or references) to fix this; flagged rather than changed
    // so this edit stays compatible with the existing header.
    int *ad;
    int *bd;

    MPI_Init(&argc, &argv);
    MPI_Comm_rank(MPI_COMM_WORLD, &myid);
    MPI_Comm_size(MPI_COMM_WORLD, &numprocs);

    // Rank 0 and the other ranks initialize with swapped contents so the
    // transfer is observable.
    if (myid == 0)
    {
        for (int i = 0; i < vect_len; i++)
        {
            vect1[i] = i;
            vect2[i] = 2 * i;
        }
    }
    else
    {
        for (int i = 0; i < vect_len; i++)
        {
            vect1[i] = 2 * i;
            vect2[i] = i;
        }
    }

    initializeGPU(vect1, vect2, ad, bd, vect_size);

    if (myid == 0)
    {
        // Start at rank 1: the original loop also sent to rank 0 itself,
        // but no matching self-receive is ever posted, so those messages
        // could never complete.  Passing device pointers to MPI_Send
        // requires a CUDA-aware MPI build -- verify with
        // `ompi_info --parsable --all | grep cuda_support`.
        for (int i = 1; i < numprocs; i++)
        {
            MPI_Send(ad, vect_len, MPI_INT, i, 99, MPI_COMM_WORLD);
            MPI_Send(bd, vect_len, MPI_INT, i, 99, MPI_COMM_WORLD);
        }
    }
    else
    {
        MPI_Recv(ad, vect_len, MPI_INT, 0, 99, MPI_COMM_WORLD, &status);
        MPI_Recv(bd, vect_len, MPI_INT, 0, 99, MPI_COMM_WORLD, &status);
    }

    computeGPU(blocksize, vect_len, ad, bd, result, vect_size);

    // CPU reference: first 8 elements are added, the rest subtracted,
    // mirroring the kernel's hard-coded threshold of 8.
    flag = true;
    for (int i = 0; i < vect_len; i++)
    {
        if (i < 8)
            vect1[i] += vect2[i];
        else
            vect1[i] -= vect2[i];
    }

    for (int i = 0; i < vect_len; i++)
    {
        if (result[i] != vect1[i])
        {
            cout << "the result [" << i << "] by m2s is" << result[i] << endl;
            cout << "the result [" << i << "] of vector is" << vect1[i] << endl;
            cout << "Verification fail at " << i << endl;
            flag = false;
            break;
        }
    }
    if (flag)
        cout << "Verification passes." << endl;

    // Host buffers came from new[], so release them with delete[] (the
    // commented-out cleanup in the original used free(), the wrong
    // deallocator).  The device buffers cannot be freed here: this
    // translation unit is compiled by mpic++ without the CUDA runtime, so
    // cudaFree belongs in a cleanup helper inside k1.cu.
    delete[] vect1;
    delete[] vect2;
    delete[] result;

    MPI_Finalize();
    return 0;
}


k1.h file

// Allocates device buffers for two vect_size-byte vectors and copies the
// host data into them.
// NOTE(review): ad and bd are passed BY VALUE, so the device pointers
// allocated inside this function are lost when it returns -- the caller's
// pointers stay uninitialized.  The parameters should be int** (or C++
// references) for the allocation to reach the caller.
void initializeGPU(int *hostptr1, int *hostptr2, int *ad, int *bd, int
vect_size);

// Launches the add/subtract kernel on device buffers ptr1/ptr2 and copies
// vect_size bytes of output back into the host buffer `result`.
void computeGPU(int block_size, int vect_len, int *ptr1, int *ptr2, int*
result, int vect_size);



k1.cu file

#include"k1.h"

// Element-wise combine of a and b into a, with a split behavior:
// elements whose global index is below the threshold n are added,
// the rest are subtracted (a[id] -= b[id]).
//
// Expects a 1-D launch covering the vector length.  NOTE(review): there is
// no upper-bounds guard, so the grid must not launch more threads than the
// buffers hold -- the launch config in computeGPU() must guarantee this.
__global__ void vect_add(int *a, int *b, int n)
{
	// Global thread index.  The original used only threadIdx.x, which is
	// wrong for any grid with more than one block: every block would race
	// on elements 0..blockDim.x-1 and the tail would never be processed.
	// (Identical behavior to the original for the single-block launch
	// used in this test.)
	int id = blockIdx.x * blockDim.x + threadIdx.x;

	if (id < n)
		a[id] = a[id] + b[id];
	else
		a[id] = a[id] - b[id];
}

// Allocates two vect_size-byte device buffers and fills them from the host
// arrays hostptr1/hostptr2.
//
// BUG(review): ad and bd are received BY VALUE.  cudaMalloc overwrites the
// local copies only, so the caller's pointers are never updated and remain
// uninitialized after this returns -- this is the direct cause of the
// "Address not mapped" segfault when main() later hands ad/bd to MPI_Send.
// Fix requires changing the signature (int **ad, int **bd) in k1.h and here.
// Return values of cudaMalloc/cudaMemcpy are also unchecked, so allocation
// failures go unnoticed.
void initializeGPU(int *hostptr1, int *hostptr2, int *ad, int *bd, int
vect_size)
{

// initialize device memory
cudaMalloc( (void**)&ad, vect_size );
cudaMalloc( (void**)&bd, vect_size );

// copy data to device
cudaMemcpy( ad, hostptr1, vect_size, cudaMemcpyHostToDevice );
cudaMemcpy( bd, hostptr2, vect_size, cudaMemcpyHostToDevice );

}

// Launches vect_add on device buffers ptr1/ptr2 (in place into ptr1) and
// copies vect_size bytes of the result back to the host buffer `result`.
//
// NOTE(review): the grid size uses truncating integer division, so any
// vect_len that is not a multiple of block_size leaves the tail elements
// unprocessed -- and vect_len < block_size yields a grid dimension of 0,
// an invalid launch.  Use (vect_len + block_size - 1) / block_size once
// the kernel has a proper bounds guard.  The threshold passed to the
// kernel is hard-coded to 8 (presumably vect_len/2, to match main()'s
// verification loop -- confirm before generalizing).  No cudaGetLastError
// check follows the launch, so launch failures are silent; the blocking
// cudaMemcpy below is the only implicit synchronization point.
void computeGPU(int block_size, int vect_len, int *ptr1, int *ptr2, int*
result, int vect_size)
{
// setup block and grid size
dim3 dimBlock( block_size, 1, 1);
dim3 dimGrid( vect_len/block_size, 1 , 1);
vect_add<<<dimGrid, dimBlock>>>(ptr1, ptr2, 8);

// blocking device-to-host copy; also surfaces any asynchronous kernel error
cudaMemcpy( result, ptr1 , vect_size, cudaMemcpyDeviceToHost );
}


Many Thanks for help,
Xun

Reply via email to