Hello everyone,
I recently had to code an MPI application where I send std::vector
contents in a distributed environment. In order to try different
approaches I coded both 1-sided and 2-sided point-to-point communication
schemes, the first one uses MPI_Window and MPI_Get, the second one uses
MPI_SendRecv.
I had a hard time figuring out why my implementation with MPI_Get was
between 10 and 100 times slower, and I finally found out that MPI_Get is
abnormally slow when one tries to send custom datatypes including
padding.
Here is a short example attached, where I send a struct {double, int}
(12 bytes of data + 4 bytes of padding) vs a struct {double, int, int}
(16 bytes of data, 0 bytes of padding) with both MPI_SendRecv and
MPI_Get. I got these results:
mpirun -np 4 ./compareGetWithSendRecv
{double, int} SendRecv : 0.0303547 s
{double, int} Get : 1.9196 s
{double, int, int} SendRecv : 0.0164659 s
{double, int, int} Get : 0.0147757 s
I ran it with both Open MPI 4.1.2 and Intel MPI 2021.6 and got the
same results.
Is this result normal? Do I have any solution other than adding garbage
at the end of the struct or at the end of the MPI_Datatype to avoid
padding?
Regards,
Antoine Motte
#include <mpi.h>

#include <algorithm>
#include <cassert>
#include <chrono>
#include <cstddef>
#include <iostream>
#include <type_traits>
#include <vector>
struct Test1
{
double a;
int b;
};
bool operator!=(const Test1 x, const Test1 y) {return x.a != y.a or x.b != y.b;};
struct Test2
{
double a;
int b, c;
};
bool operator!=(const Test2 x, const Test2 y) {return x.a != y.a or x.b != y.b or x.c != y.c;};
const int maxWorldSize = 8;
//each proc sends an array of sz elements of type T to every other proc with SendRecv
//each proc sends an array of sz elements of type T to every other proc with SendRecv
//Returns the wall-clock time (seconds) spent in the exchange loop on this rank.
template<class T, int sz>
double SendRecv(int world_size, int world_rank, MPI_Datatype dt)
{
// Heap-allocate the buffers: with sz = 1e6 the original stack arrays were
// roughly 16 MB (send) + 128 MB (recv) for a 16-byte T, which overflows the
// usual 8 MB stack limit and is undefined behavior.
std::vector<T> sendBuffer(sz);
std::vector<T> recvBuffer(static_cast<std::size_t>(maxWorldSize) * sz);
T element;
if constexpr (std::is_same_v<T, Test1>) element = {1., 2};
else if constexpr (std::is_same_v<T, Test2>) element = {1., 2, 3};
std::fill(sendBuffer.begin(), sendBuffer.end(), element);
// steady_clock is monotonic, unlike system_clock, so the measured interval
// cannot be distorted by wall-clock adjustments.
auto startTime = std::chrono::steady_clock::now();
// Ring exchange: at step "offset" every rank sends to rank+offset and
// receives from rank-offset, so the sends and receives pair up deadlock-free.
for (int offset = 1; offset < world_size; offset++)
{
int dest = (world_rank + offset) % world_size;
int source = (world_rank - offset + world_size) % world_size;
MPI_Status s;
MPI_Sendrecv(sendBuffer.data(), sz, dt, dest, 0,
             recvBuffer.data() + static_cast<std::size_t>(source) * sz, sz, dt, source, 0,
             MPI_COMM_WORLD, &s);
}
auto endTime = std::chrono::steady_clock::now();
// Touch every received element so the compiler cannot discard the exchange.
for (int i = 0; i < world_size; i++)
{
if (i != world_rank)
{
for (int j = 0; j < sz; j++)
{
if (recvBuffer[static_cast<std::size_t>(i) * sz + j] != element) std::cout << "Problem" << std::endl;
}
}
}
return std::chrono::duration<double>(endTime - startTime).count();
}
//each proc sends an array of sz elements of type T to every other proc with Window and Get
template<class T, int sz>
double Get(int world_size, int world_rank, MPI_Datatype dt)
{
T sendBuffer[sz], recvBuffer[maxWorldSize*sz];
T element;
if constexpr (std::is_same_v<T, Test1>) element = {1., 2};
else if constexpr (std::is_same_v<T, Test2>) element = {1., 2, 3};
for (int i = 0; i < sz; i++)
{
sendBuffer[i] = element;
}
MPI_Win window;
MPI_Info infos;
MPI_Info_create(&infos);
MPI_Win_create(sendBuffer, sz*sizeof(T), sizeof(T), infos, MPI_COMM_WORLD, &window);
MPI_Win_fence(0, window);
auto startTime = std::chrono::system_clock::now();
for (int offset = 1; offset < world_size; offset++)
{
int source = (world_rank - offset + world_size)%world_size;
MPI_Get(recvBuffer + source*sz, sz, dt, source, 0, sz, dt, window);
}
auto endTime = std::chrono::system_clock::now();
MPI_Win_fence(0, window);
MPI_Win_free(&window);
MPI_Info_free(&infos);
//ensure the result is used somewhere
for (int i = 0; i < world_size; i++)
{
if (i != world_rank)
{
for (int j = 0; j < sz; j++)
{
if (recvBuffer[i*sz + j] != element) std::cout << "Problem" << std::endl;
}
}
}
return std::chrono::duration<double>(endTime - startTime).count();
}
int main(int argc, char **argv)
{
MPI_Init(&argc, &argv);
int world_size, world_rank;
MPI_Comm_size(MPI_COMM_WORLD, &world_size);
MPI_Comm_rank(MPI_COMM_WORLD, &world_rank);
if (world_size > maxWorldSize) std::cout << "SendRecv and Get function will seg fault if world_size > " << maxWorldSize << std::endl;
if (world_size == 1) std::cout << "MPI_Window won't work if world_size == 1" << std::endl;
assert(world_size <= maxWorldSize && world_size > 1);
const int size = 1e6;
MPI_Datatype datatype1;
int array_of_blocklengths1[2] = {1, 1};
MPI_Aint array_of_displacements1[2] = {0, 8};
MPI_Datatype array_of_types1[2] = {MPI_DOUBLE, MPI_INT};
MPI_Type_create_struct(2, array_of_blocklengths1, array_of_displacements1, array_of_types1, &datatype1);
MPI_Type_commit(&datatype1);
//send {double, int} with MPI_SendRecv
auto time = SendRecv<Test1, size>(world_size, world_rank, datatype1);
if (world_rank == 0) std::cout << "{double, int} SendRecv : " << time << " s" << std::endl;
//send {double, int} with MPI_Window and MPI_Get
time = Get<Test1, size>(world_size, world_rank, datatype1);
if (world_rank == 0) std::cout << "{double, int} Get : " << time << " s" << std::endl;
MPI_Datatype datatype2;
int array_of_blocklengths2[2] = {1, 2};
MPI_Aint array_of_displacements2[2] = {0, 8};
MPI_Datatype array_of_types2[2] = {MPI_DOUBLE, MPI_INT};
MPI_Type_create_struct(2, array_of_blocklengths2, array_of_displacements2, array_of_types2, &datatype2);
MPI_Type_commit(&datatype2);
//send {double, int, int} with MPI_SendRecv
time = SendRecv<Test1, size>(world_size, world_rank, datatype2);
if (world_rank == 0) std::cout << "{double, int, int} SendRecv : " << time << " s" << std::endl;
//send {double, int, int} with MPI_Window and MPI_Get
time = Get<Test1, size>(world_size, world_rank, datatype2);
if (world_rank == 0) std::cout << "{double, int, int} Get : " << time << " s" << std::endl;
MPI_Finalize();
}