You should be running with one GPU per MPI process. If I understand correctly,
you have a 3-node cluster and each node has a GPU, so you should run with np=3.
Maybe you can try that and see if your numbers come out better.
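For example, with a hostfile listing your three nodes (the file and executable
names here are just placeholders), something like

    mpirun -np 3 -npernode 1 -hostfile my_hosts ./mpi_thrust_sum

gives each rank exclusive use of its node's GPU instead of several ranks
sharing one card.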


From: users-boun...@open-mpi.org [mailto:users-boun...@open-mpi.org] On Behalf 
Of Rohan Deshpande
Sent: Monday, May 07, 2012 9:38 PM
To: Open MPI Users
Subject: [OMPI users] GPU and CPU timing - OpenMPI and Thrust

I am running MPI and Thrust code on a cluster and measuring the time taken by
the calculations.

My MPI code -

#include "mpi.h"
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>
#include <sys/time.h>
#include <sys/resource.h>

#define  MASTER 0
#define ARRAYSIZE 20000000

/* run_kernel0 is implemented in the CUDA/Thrust source file below. */
extern int run_kernel0(int array[], int nelements, int taskid, char hostname[]);

int *masterarray, *onearray, *twoarray, *threearray, *fourarray,
    *fivearray, *sixarray, *sevenarray, *eightarray, *ninearray;

int main(int argc, char* argv[])
{
  int   numtasks, taskid, namelen;
  int   mysum, one, two, three, four, five, six, seven, eight, nine;

  char myname[MPI_MAX_PROCESSOR_NAME];
  MPI_Status status;
  int a,b,c,d,e,f,g,h,i,j;

  /***** Initializations *****/
  MPI_Init(&argc, &argv);
  MPI_Comm_size(MPI_COMM_WORLD, &numtasks);
  MPI_Comm_rank(MPI_COMM_WORLD, &taskid);
  MPI_Get_processor_name(myname, &namelen);
  printf("MPI task %d has started on host %s...\n", taskid, myname);

  /* Every rank allocates all ten arrays, although it only uses its own. */
  masterarray = malloc(ARRAYSIZE * sizeof(int));
  onearray    = malloc(ARRAYSIZE * sizeof(int));
  twoarray    = malloc(ARRAYSIZE * sizeof(int));
  threearray  = malloc(ARRAYSIZE * sizeof(int));
  fourarray   = malloc(ARRAYSIZE * sizeof(int));
  fivearray   = malloc(ARRAYSIZE * sizeof(int));
  sixarray    = malloc(ARRAYSIZE * sizeof(int));
  sevenarray  = malloc(ARRAYSIZE * sizeof(int));
  eightarray  = malloc(ARRAYSIZE * sizeof(int));
  ninearray   = malloc(ARRAYSIZE * sizeof(int));

  /***** Master task only ******/
  if (taskid == MASTER) {
    for (a = 0; a < ARRAYSIZE; a++) {
      masterarray[a] = 1;
    }
    mysum = run_kernel0(masterarray, ARRAYSIZE, taskid, myname);
  }  /* end of master section */

  if (taskid > MASTER) {
    if (taskid == 1) {
      for (b = 0; b < ARRAYSIZE; b++) {
        onearray[b] = 1;
      }
      one = run_kernel0(onearray, ARRAYSIZE, taskid, myname);
    }
    if (taskid == 2) {
      for (c = 0; c < ARRAYSIZE; c++) {
        twoarray[c] = 1;
      }
      two = run_kernel0(twoarray, ARRAYSIZE, taskid, myname);
    }
    if (taskid == 3) {
      for (d = 0; d < ARRAYSIZE; d++) {
        threearray[d] = 1;
      }
      three = run_kernel0(threearray, ARRAYSIZE, taskid, myname);
    }
    if (taskid == 4) {
      for (e = 0; e < ARRAYSIZE; e++) {
        fourarray[e] = 1;
      }
      four = run_kernel0(fourarray, ARRAYSIZE, taskid, myname);
    }
    if (taskid == 5) {
      for (f = 0; f < ARRAYSIZE; f++) {
        fivearray[f] = 1;
      }
      five = run_kernel0(fivearray, ARRAYSIZE, taskid, myname);
    }
    if (taskid == 6) {
      for (g = 0; g < ARRAYSIZE; g++) {
        sixarray[g] = 1;
      }
      six = run_kernel0(sixarray, ARRAYSIZE, taskid, myname);
    }
    if (taskid == 7) {
      for (h = 0; h < ARRAYSIZE; h++) {
        sevenarray[h] = 1;
      }
      seven = run_kernel0(sevenarray, ARRAYSIZE, taskid, myname);
    }
    if (taskid == 8) {
      for (i = 0; i < ARRAYSIZE; i++) {
        eightarray[i] = 1;
      }
      eight = run_kernel0(eightarray, ARRAYSIZE, taskid, myname);
    }
    if (taskid == 9) {
      for (j = 0; j < ARRAYSIZE; j++) {
        ninearray[j] = 1;
      }
      nine = run_kernel0(ninearray, ARRAYSIZE, taskid, myname);
    }
  }
  MPI_Finalize();

  return 0;
}

All the tasks just initialize their own array and then calculate its sum using
CUDA Thrust.
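(Since every rank does the same work on its own data, the ten if(taskid == N)
branches could be collapsed into one block per rank; a minimal sketch, using
localarray/localsum as illustrative names:

  /* every rank, including MASTER, initializes and reduces its own array */
  int *localarray = malloc(ARRAYSIZE * sizeof(int));
  int k, localsum;
  for (k = 0; k < ARRAYSIZE; k++)
      localarray[k] = 1;
  localsum = run_kernel0(localarray, ARRAYSIZE, taskid, myname);
  printf("Task %d sum = %d\n", taskid, localsum);
  free(localarray);

This would replace the body of main() between the startup printf and
MPI_Finalize().)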
My CUDA Thrust code -

#include <stdio.h>
#include <thrust/version.h>
#include <thrust/generate.h>
#include <thrust/host_vector.h>
#include <thrust/device_vector.h>
#include <thrust/functional.h>
#include <thrust/transform_reduce.h>
#include <thrust/reduce.h>
#include <thrust/extrema.h>
#include <time.h>
#include <sys/time.h>
#include <sys/resource.h>

extern "C"
int run_kernel0(int array[], int nelements, int taskid, char hostname[])
{
    float elapsedTime;
    int result = 0;
    int threshold = 25000000;

    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);
    cudaEventRecord(start, 0);

    thrust::device_vector<int> gpuarray;
    int *begin = array;
    int *end = array + nelements;
    while (begin != end)
    {
        int chunk_size = thrust::min(threshold, (int)(end - begin));
        gpuarray.assign(begin, begin + chunk_size);   /* host-to-device copy */
        result += thrust::reduce(gpuarray.begin(), gpuarray.end());
        begin += chunk_size;
    }

    cudaEventRecord(stop, 0);
    cudaEventSynchronize(stop);
    cudaEventElapsedTime(&elapsedTime, start, stop);
    cudaEventDestroy(start);
    cudaEventDestroy(stop);

    printf(" Task %d has sum (on GPU): %d  Time for the kernel: %f ms\n",
           taskid, result, elapsedTime);

    return result;
}

I also calculate the sum on the CPU; that code is below -

  struct timespec time1, time2, temp_time;

  clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &time1);
  int i;
  int cpu_sum = 0;
  long diff = 0;

  for (i = 0; i < nelements; i++) {
    cpu_sum += array[i];
  }
  clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &time2);
  temp_time.tv_sec = time2.tv_sec - time1.tv_sec;
  temp_time.tv_nsec = time2.tv_nsec - time1.tv_nsec;
  diff = temp_time.tv_sec * 1000000000 + temp_time.tv_nsec;
  printf("Task %d calculated sum: %d using CPU in %lf ms \n", taskid, cpu_sum, 
(double) diff/1000000);
  return cpu_sum;

Now when I run the job on the cluster with 10 MPI tasks and compare the CPU and
GPU timings, I get weird results: the GPU time is much higher than the CPU
time. But it should be the opposite, shouldn't it?

The CPU time is almost the same for every task, but the GPU time increases.

Just wondering what might be the cause of this, or are these results correct?
Is anything wrong with the MPI code?
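
(One thing that might help in diagnosing this: the cudaEvent timing in
run_kernel0 spans both the host-to-device copy done by gpuarray.assign() and
the reduction itself. A sketch of how the two could be timed separately with
the same cudaEvent calls, using copyMs/reduceMs as illustrative names:

  float copyMs = 0.0f, reduceMs = 0.0f, ms;
  cudaEvent_t t0, t1, t2;
  cudaEventCreate(&t0);  cudaEventCreate(&t1);  cudaEventCreate(&t2);

  /* inside the existing while loop, per chunk: */
  cudaEventRecord(t0, 0);
  gpuarray.assign(begin, begin + chunk_size);      /* host-to-device copy */
  cudaEventRecord(t1, 0);
  result += thrust::reduce(gpuarray.begin(), gpuarray.end());
  cudaEventRecord(t2, 0);
  cudaEventSynchronize(t2);
  cudaEventElapsedTime(&ms, t0, t1);  copyMs   += ms;
  cudaEventElapsedTime(&ms, t1, t2);  reduceMs += ms;

  /* after the loop: */
  printf("Task %d: copy %f ms, reduce %f ms\n", taskid, copyMs, reduceMs);
  cudaEventDestroy(t0);  cudaEventDestroy(t1);  cudaEventDestroy(t2);

That would show how much of the reported GPU time goes to the copy versus the
reduction.)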

My cluster has 3 machines. 4 MPI tasks run on each of 2 machines and 2 tasks
run on the third machine.
Each machine has 1 GPU - a GeForce 9500 GT with 512 MB of memory.

Can anyone please help me with this?

Thanks
--



