Hello everyone,

I recently had to write an MPI application that sends std::vector contents around a distributed environment. To compare approaches, I implemented both a one-sided and a two-sided point-to-point communication scheme: the first uses an MPI_Win window with MPI_Get, the second uses MPI_Sendrecv.

I had a hard time figuring out why my MPI_Get implementation was between 10 and 100 times slower, and I finally found out that MPI_Get is abnormally slow when one sends custom datatypes that include padding.

Attached is a short example where I send a struct {double, int} (12 bytes of data + 4 bytes of padding) vs. a struct {double, int, int} (16 bytes of data, 0 bytes of padding), each with both MPI_Sendrecv and MPI_Get. I got these results:

mpirun -np 4 ./compareGetWithSendRecv
{double, int} SendRecv : 0.0303547 s
{double, int} Get : 1.9196 s
{double, int, int} SendRecv : 0.0164659 s
{double, int, int} Get : 0.0147757 s
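
For reference, here are the two layouts in C++ (the sizes assume the usual x86-64 alignment, where double is 8-byte aligned, so treat them as an assumption rather than a guarantee):

struct Test1 { double a; int b; };    //8 + 4 bytes of data, 4 bytes of tail padding
struct Test2 { double a; int b, c; }; //8 + 4 + 4 bytes of data, no padding
static_assert(sizeof(Test1) == 16, "assumes 8-byte alignment of double");
static_assert(sizeof(Test2) == 16, "assumes 8-byte alignment of double");

Both structs occupy 16 bytes; the only difference is whether the last 4 bytes hold real data or padding.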

I ran it with both Open MPI 4.1.2 and Intel MPI 2021.6 and got the same results.

Is this result normal? Do I have any solution other than adding garbage at the end of the struct, or at the end of the MPI_Datatype, to avoid the padding?
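
To be explicit, this is the kind of datatype-side workaround I mean (a sketch only; the displacements assume the layout above, and paddedType is just an illustrative name):

    //describe the 4 padding bytes as a dummy MPI_INT block, so the committed
    //type covers all 16 bytes and has no interior gap
    MPI_Datatype paddedType;
    int blocklengths[3] = {1, 1, 1};
    MPI_Aint displacements[3] = {0, 8, 12};
    MPI_Datatype types[3] = {MPI_DOUBLE, MPI_INT, MPI_INT}; //last block only covers padding
    MPI_Type_create_struct(3, blocklengths, displacements, types, &paddedType);
    MPI_Type_commit(&paddedType);

This makes the type effectively contiguous, but it transfers 4 bytes of garbage per element, which is what I would like to avoid.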

Regards,

Antoine Motte
#include <mpi.h>
#include <chrono>
#include <iostream>
#include <cassert>
#include <type_traits>
#include <vector>
#include <cstddef>

struct Test1
{
    double a;
    int b;
};

bool operator!=(const Test1& x, const Test1& y) {return x.a != y.a or x.b != y.b;}

struct Test2
{
    double a;
    int b, c;
};

bool operator!=(const Test2& x, const Test2& y) {return x.a != y.a or x.b != y.b or x.c != y.c;}

const int maxWorldSize = 8;

//each proc sends an array of sz elements of type T to every other proc with SendRecv
template<class T, int sz>
double SendRecv(int world_size, int world_rank, MPI_Datatype dt)
{
    //heap-allocate: sz = 1e6 elements would overflow the stack as plain C arrays
    std::vector<T> sendBuffer(sz), recvBuffer(maxWorldSize*sz);

    T element;
    if constexpr (std::is_same_v<T, Test1>) element = {1., 2};
    else if constexpr (std::is_same_v<T, Test2>) element = {1., 2, 3};

    for (int i = 0; i < sz; i++)
    {
        sendBuffer[i] = element;
    }

    auto startTime = std::chrono::steady_clock::now();
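    //ring schedule: at step offset, send to (rank + offset) and receive from (rank - offset)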
    for (int offset = 1; offset < world_size; offset++)
    {
        int dest = (world_rank + offset)%world_size;
        int source = (world_rank - offset + world_size)%world_size;
        MPI_Status s;

        MPI_Sendrecv(sendBuffer.data(), sz, dt, dest, 0, recvBuffer.data() + source*sz, sz, dt, source, 0, MPI_COMM_WORLD, &s);
    }
    auto endTime = std::chrono::steady_clock::now();

    //ensure the result is used somewhere
    for (int i = 0; i < world_size; i++)
    {
        if (i != world_rank)
        {
            for (int j = 0; j < sz; j++)
            {
                if (recvBuffer[i*sz + j] != element) std::cout << "Problem" << std::endl;
            }
        } 
    }

    return std::chrono::duration<double>(endTime - startTime).count();
}

//each proc exposes an array of sz elements of type T in a window and fetches from every other proc with MPI_Get
template<class T, int sz>
double Get(int world_size, int world_rank, MPI_Datatype dt)
{
    //heap-allocate: sz = 1e6 elements would overflow the stack as plain C arrays
    std::vector<T> sendBuffer(sz), recvBuffer(maxWorldSize*sz);

    T element;
    if constexpr (std::is_same_v<T, Test1>) element = {1., 2};
    else if constexpr (std::is_same_v<T, Test2>) element = {1., 2, 3};

    for (int i = 0; i < sz; i++)
    {
        sendBuffer[i] = element;
    }

    MPI_Win window;
    MPI_Info infos;

    MPI_Info_create(&infos);
    MPI_Win_create(sendBuffer.data(), sz*sizeof(T), sizeof(T), infos, MPI_COMM_WORLD, &window);
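    //the first fence opens the RMA access epoch for the Gets below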
    MPI_Win_fence(0, window);

    auto startTime = std::chrono::steady_clock::now();
    for (int offset = 1; offset < world_size; offset++)
    {
        int source = (world_rank - offset + world_size)%world_size;
        MPI_Get(recvBuffer.data() + source*sz, sz, dt, source, 0, sz, dt, window);
    }
    MPI_Win_fence(0, window); //the Gets are only guaranteed complete once this closing fence returns
    auto endTime = std::chrono::steady_clock::now();
    MPI_Win_free(&window);
    MPI_Info_free(&infos);

    //ensure the result is used somewhere
    for (int i = 0; i < world_size; i++)
    {
        if (i != world_rank)
        {
            for (int j = 0; j < sz; j++)
            {
                if (recvBuffer[i*sz + j] != element) std::cout << "Problem" << std::endl;
            }
        } 
    }

    return std::chrono::duration<double>(endTime - startTime).count();
}

int main(int argc, char **argv)
{
    MPI_Init(&argc, &argv);

    int world_size, world_rank;
    MPI_Comm_size(MPI_COMM_WORLD, &world_size);
    MPI_Comm_rank(MPI_COMM_WORLD, &world_rank);

    if (world_size > maxWorldSize) std::cout << "SendRecv and Get will access out-of-bounds memory if world_size > " << maxWorldSize << std::endl;
    if (world_size == 1) std::cout << "the benchmark needs at least 2 processes" << std::endl;
    assert(world_size <= maxWorldSize && world_size > 1);

    const int size = 1e6;

    MPI_Datatype datatype1;
    int array_of_blocklengths1[2] = {1, 1};
    MPI_Aint array_of_displacements1[2] = {0, offsetof(Test1, b)};
    MPI_Datatype array_of_types1[2] = {MPI_DOUBLE, MPI_INT};
    MPI_Type_create_struct(2, array_of_blocklengths1, array_of_displacements1, array_of_types1, &datatype1);
    MPI_Type_commit(&datatype1);
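    //per the MPI standard, the extent of this struct type is rounded up to a
    //multiple of the strictest member alignment (8 for MPI_DOUBLE on typical
    //implementations), i.e. 16 bytes, matching sizeof(Test1) including the gap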
    
    //send {double, int} with MPI_Sendrecv
    auto time = SendRecv<Test1, size>(world_size, world_rank, datatype1);
    if (world_rank == 0) std::cout << "{double, int} SendRecv : " << time << " s" << std::endl;

    //send {double, int} with an MPI_Win window and MPI_Get
    time = Get<Test1, size>(world_size, world_rank, datatype1);
    if (world_rank == 0) std::cout << "{double, int} Get : " << time << " s" << std::endl;

    MPI_Datatype datatype2;
    int array_of_blocklengths2[2] = {1, 2};
    MPI_Aint array_of_displacements2[2] = {0, offsetof(Test2, b)};
    MPI_Datatype array_of_types2[2] = {MPI_DOUBLE, MPI_INT};
    MPI_Type_create_struct(2, array_of_blocklengths2, array_of_displacements2, array_of_types2, &datatype2);
    MPI_Type_commit(&datatype2);
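    //here the two MPI_INT blocks fill bytes 8..15 with data, so the 16-byte extent has no gap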
    
    //send {double, int, int} with MPI_Sendrecv
    time = SendRecv<Test2, size>(world_size, world_rank, datatype2);
    if (world_rank == 0) std::cout << "{double, int, int} SendRecv : " << time << " s" << std::endl;

    //send {double, int, int} with an MPI_Win window and MPI_Get
    time = Get<Test2, size>(world_size, world_rank, datatype2);
    if (world_rank == 0) std::cout << "{double, int, int} Get : " << time << " s" << std::endl;

    MPI_Finalize();
}
