// mpi_pinned.c
//
// To run:
//  /usr/mpi/gcc/openmpi-1.2.8/bin/mpirun  -x LD_LIBRARY_PATH 
//           -np 2 -host 172.16.175.55,redforge.nvidia.com
//           ${e}/infiniband/precompiled/mpi_pinned_centos5.3_64bit
//
//////////////////////////////////////////////////////////////////////
#include <stdio.h>
#include <stdlib.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include <sys/time.h>
#include <mpi.h>

#define NREPEAT 1
//#define NBYTES  268435456 //10.e6
#define NBYTES 20971520
#define BUF_SIZE (4*1024*1024)
#define NUM_BUFS 2
#define PINNED

#define CPY2DEV 0 // Enable copy to device memory?

/*
 * Print a short command-line usage hint to stdout.
 *
 * name: text substituted for the %s in the message (main passes argv[0]).
 */
void usage(const char * name)
{
    static const char usage_fmt[] =
        "Usage: %s -x LD_LIBRARY_PATH \\ \n"
        "          -np 2 -host computer1,computer2 _out/<arch>/mpi_pinned\n";

    printf(usage_fmt, name);
}

int main (int argc, char *argv[])
{
    int i, rank, size, n, len;
    int result;
    void *a_h, *a_d;
    struct timeval time[2];
    double bandwidth;
    char hostname[MPI_MAX_PROCESSOR_NAME];
    //MPI_Status status;
    CUresult cuError;

    MPI_Init (&argc, &argv);
    MPI_Comm_rank (MPI_COMM_WORLD, &rank);
    MPI_Comm_size (MPI_COMM_WORLD, &size);

    MPI_Get_processor_name(hostname, &len);
    printf("Process %d is on %s\n", rank, hostname);

    if (argc > 1 && *argv[1] == 'h')
    {
        usage(argv[0]);
        exit(0);
    }

    cuError = cuInit(0);
    if (cuError != CUDA_SUCCESS) {
        fprintf(stderr, "cuInit() failed.\n");
    }

#ifdef PINNED
    cudaMallocHost( (void **) &a_h, NBYTES);
#else
    a_h = malloc(NBYTES);
#endif

    result = cudaMalloc( (void **) &a_d, NBYTES);
    if (result)
    {
        printf("ERROR: %s: cudaMalloc failed, error code: %d, which means: %s\n",
               hostname, result, cudaGetErrorString(result));
        exit(1);
    }

    /* Test host -> device bandwidth. */

    gettimeofday(&time[0], NULL);
    for (n=0; n<NREPEAT; n++)
    {
        result = cudaMemcpy(a_d, a_h, NBYTES, cudaMemcpyHostToDevice);
        if (result)
        {
            printf("ERROR: %s: cudaMemcpy failed, error code: %d, which means: %s\n",
                   hostname, result, cudaGetErrorString(result));
            exit(1);
        }

    }
    gettimeofday(&time[1], NULL);

    bandwidth  =        time[1].tv_sec  - time[0].tv_sec;
    bandwidth += 1.e-6*(time[1].tv_usec - time[0].tv_usec);
    bandwidth  = NBYTES*(NREPEAT/1.e6/bandwidth);

    printf("Host->device bandwidth for process %d: %f MB/sec\n",rank,bandwidth);

    /* Test MPI send/recv bandwidth. */

    MPI_Barrier(MPI_COMM_WORLD);

    gettimeofday(&time[0], NULL);
    for (n=0; n<NREPEAT; n++)
    {
        if (rank == 0) {
            /* Sender */
            int size = NBYTES;
            int rest, sent = 0;

            /* Send complete blocks first */
            rest = size;
            for (i = 0; i < (rest / BUF_SIZE); i++) {
                MPI_Send(a_h + sent, BUF_SIZE, MPI_CHAR, 1, 0, MPI_COMM_WORLD);
                sent += BUF_SIZE;
                printf(" * complete block sent: %d\n", BUF_SIZE);
            }
            rest = size - sent;
            /* Send rest if necessary */
            if (rest) {
                MPI_Send(a_h + sent, rest, MPI_CHAR, 1, 0, MPI_COMM_WORLD);
                printf(" * rest sent: %d\n", rest);
            }

            //MPI_Send(a_h, NBYTES/sizeof(int), MPI_INT, 1, 0, MPI_COMM_WORLD);
        }
        else {
            /* Receiver */
            char *pipeline_buf[NUM_BUFS];
            CUstream stream[NUM_BUFS];
            int size, rest, curr, copied;

            /* Create streams (1 for each buffer) */
            for (i = 0; i < NUM_BUFS; i++) {
                cuError = cuStreamCreate(&stream[i], 0);
                if (cuError != CUDA_SUCCESS) {
                    fprintf(stderr, "cuStreamCreate() failed.\n");
                } 
            }

            /* Set start addresses for pipeline buffers */
            for (i = 0; i < NUM_BUFS; i++) {
                pipeline_buf[i] = (char *) (a_h + (i * BUF_SIZE));
                //pipeline_buf[i] = (char *) (a_h);
            }

            size = NBYTES;
            curr = 0;        // stream to use
            copied = 0;      // # bytes already copied to GPU

            rest = size;
            /* Recv complete blocks first */
            for (i = 0; i < (rest / BUF_SIZE); i++) {
                cuStreamSynchronize(stream[curr]); // wait for stream to get ready

                MPI_Recv(pipeline_buf[curr], BUF_SIZE, MPI_CHAR, 0, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
                //MPI_Recv(a_h, BUF_SIZE, MPI_CHAR, 0, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);

#if CPY2DEV
                /* Copy block to GPU */
                cuError = cuMemcpyHtoDAsync((CUdeviceptr) a_d + copied, pipeline_buf[curr], BUF_SIZE, stream[curr]);
                if (CUDA_SUCCESS != cuError) {
                    fprintf(stderr, "cuMemcpyHtoDAsync() failed.\n");
                    exit(1);
                }
                printf(" - complete block copied: %d\n", BUF_SIZE);
                //ac_memcopy_h2d_async(addr + copied, pipeline_buf[curr], BUF_SIZE, stream[curr]);
#endif

                copied += BUF_SIZE;
                curr    = (curr + 1) % NUM_BUFS; // index of next block and stream
                printf(" - complete block received: %d\n", BUF_SIZE);
            }
            rest = size - copied;
            /* Recv rest if necessary */
            if (rest) {
                cuStreamSynchronize(stream[curr]); // wait for stream to get ready

                MPI_Recv(pipeline_buf[curr], rest, MPI_CHAR, 0, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
                //MPI_Recv(a_h, rest, MPI_CHAR, 0, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);

#if CPY2DEV
                /* Copy block to GPU */
                cuError = cuMemcpyHtoDAsync((CUdeviceptr) a_d + copied, pipeline_buf[curr], rest, stream[curr]);
                if (CUDA_SUCCESS != cuError) {
                    fprintf(stderr, "cuMemcpyHtoDAsync() failed.\n");
                    exit(1);
                }
                //ac_memcopy_h2d_async(addr + copied, pipeline_buf[curr], rest, stream[curr]);
#endif

                copied += rest;
                printf(" - rest received + copied: %d\n", rest);
                rest   = size - copied;
            }
            /* Wait for all streams */
            for (i = 0; i < NUM_BUFS; i++) {
                cuStreamSynchronize(stream[i]);
            }

            /* Free streams */
            for (i = 0; i < NUM_BUFS; i++) {
                cuError = cuStreamDestroy(stream[i]);
                if (cuError != CUDA_SUCCESS) {
                    fprintf(stderr, "cuStreamDestroy() failed.\n");
                } 
            }

            //MPI_Recv(a_h, NBYTES/sizeof(int), MPI_INT, 0, 0, MPI_COMM_WORLD, &status);
        }
    }
    gettimeofday(&time[1], NULL);

    bandwidth  =        time[1].tv_sec  - time[0].tv_sec;
    bandwidth += 1.e-6*(time[1].tv_usec - time[0].tv_usec);
    bandwidth  = NBYTES*(NREPEAT/1.e6/bandwidth);

    if (rank == 0)
        printf("MPI send/recv bandwidth: %f MB/sec\n", bandwidth);

    cudaFree(a_d);

#ifdef PINNED
    cudaFreeHost(a_h);
#else
    free(a_h);
#endif

    MPI_Finalize();
    return 0;
}
