Dear all,

I have a small piece of code (attached at the end of this message) that initializes a multidimensional Fortran array and then performs:
- a non-blocking MPI_Iallreduce immediately followed by an MPI_Wait
- a blocking MPI_Allreduce
After both calls, it displays a few elements of the input and output buffers.
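The relevant calls, excerpted from the full program attached at the end of this message, are:

  call mpi_iallreduce( buff_in( 1, :, :, :, : ), buff_out, N*N*N*N, MPI_INTEGER, MPI_SUM, mpi_comm_world, req, err )
  call mpi_wait( req, MPI_STATUS_IGNORE, err )

and, for the blocking case:

  call mpi_allreduce( buff_in( 1, :, :, :, : ), buff_out, N*N*N*N, MPI_INTEGER, MPI_SUM, mpi_comm_world, err )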

In the output shown below, the first column gives the indices of the displayed element, the second column the corresponding element of the input array, and the third column the corresponding element of the output array. All processes have the same input array, so each output element should simply be the input element multiplied by the number of processes: for example, element (1,1,1,1) holds 5 on every rank, so with 4 processes the reduced value should be 20.
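As a sanity check, one could append the following loop after either reduction (a sketch that reuses the variables already declared in the attached program) to flag every element of buff_out that differs from the expected value, instead of the handful of elements I print:

  ! sketch: exhaustive check that every element of buff_out is
  ! size times the corresponding element of the input section
  do j = 1, N
     do k = 1, N
        do l = 1, N
           do m = 1, N
              if ( buff_out( j, k, l, m ) /= size * buff_in( 1, j, k, l, m ) ) then
                 write( 6, * ) "mismatch at", j, k, l, m, buff_out( j, k, l, m )
              end if
           end do
        end do
     end do
  end do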

When I compile and execute it with Open MPI 4.0.1 on a single node, I get:

coti@xxx:~$ mpiexec -n 4 test_allreduce
 Rank           3  /            4
 Rank           1  /            4
 Rank           0  /            4
 Rank           2  /            4
 Non-blocking
 1,1,1,1           5  1252991616
 1,1,1,2           6          24
 1,1,1,3           7          28
 1,1,1,4           8          32
 1,1,1,5           9          36
 ----
 1,1,2,1           6          24
 1,2,1,1           6          24
 2,1,1,1           6       21197
 ----
 Blocking
 1,1,1,1           5          20
 1,1,1,2           6          24
 1,1,1,3           7          28
 1,1,1,4           8          32
 1,1,1,5           9          36
 ----
 1,1,2,1           6          24
 1,2,1,1           6          24
 2,1,1,1           6          24
 ----

I also cloned the master branch of the Open MPI Git repository and compiled it (hash db52da40c379610360676f225cd7c767e5a964d3) with the following configure line:
  $ ./configure --prefix=<....> --enable-mpi-fortran=usempi

With that build, I get:

coti@yyy:~$ mpiexec --mca btl vader,self -n 4 ./test_allreduce
 Rank           0  /            4
 Rank           1  /            4
 Rank           2  /            4
 Rank           3  /            4
 Non-blocking
 1,1,1,1           5 -1092661536
 1,1,1,2           6          24
 1,1,1,3           7          28
 1,1,1,4           8          32
 1,1,1,5           9          36
 ----
 1,1,2,1           6          24
 1,2,1,1           6 -1354461780
 2,1,1,1           6      130622
 ----
 Blocking
 1,1,1,1           5          20
 1,1,1,2           6          24
 1,1,1,3           7          28
 1,1,1,4           8          32
 1,1,1,5           9          36
 ----
 1,1,2,1           6          24
 1,2,1,1           6          24
 2,1,1,1           6          24
 ----

I have also tried it with other MPI implementations (Intel MPI 19 and MPICH 3.3), and both gave the same, correct output for the blocking and non-blocking calls:

coti@yyy:~$ mpiexec -n 4 ./test_allreduce
 Rank           0  /            4
 Rank           1  /            4
 Rank           2  /            4
 Rank           3  /            4
 Non-blocking
 1,1,1,1           5          20
 1,1,1,2           6          24
 1,1,1,3           7          28
 1,1,1,4           8          32
 1,1,1,5           9          36
 ----
 1,1,2,1           6          24
 1,2,1,1           6          24
 2,1,1,1           6          24
 ----
 Blocking
 1,1,1,1           5          20
 1,1,1,2           6          24
 1,1,1,3           7          28
 1,1,1,4           8          32
 1,1,1,5           9          36
 ----
 1,1,2,1           6          24
 1,2,1,1           6          24
 2,1,1,1           6          24
 ----

Is there anything wrong with my call to MPI_Iallreduce/MPI_Wait?
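One thing I was wondering about: buff_in( 1, :, :, :, : ) is a non-contiguous array section, so with the mpi module the compiler may pass a temporary copy that is freed as soon as MPI_Iallreduce returns, before MPI_Wait completes. A variant that stages the section in a contiguous buffer first would look like this (just a sketch, untested; tmp_in is a new variable whose declaration would go with the others):

  integer, allocatable, dimension(:,:,:,:) :: tmp_in  ! contiguous staging buffer

  allocate( tmp_in( N, N, N, N ) )
  tmp_in = buff_in( 1, :, :, :, : )  ! explicit copy that stays allocated across the wait
  call mpi_iallreduce( tmp_in, buff_out, N*N*N*N, MPI_INTEGER, MPI_SUM, mpi_comm_world, req, err )
  call mpi_wait( req, MPI_STATUS_IGNORE, err )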

Thanks,
Camille


$ cat test_allreduce.f90
program main
  use mpi
  implicit none

  integer, allocatable, dimension(:,:,:,:,:) :: buff_in
  integer, allocatable, dimension(:,:,:,:) :: buff_out
  integer :: N, rank, size, err, i, j, k, l, m
  integer :: req

  N = 8

  allocate( buff_in( N, N, N, N, N ) )
  allocate( buff_out( N, N, N, N ) )

  call mpi_init( err )
  call mpi_comm_rank( mpi_comm_world, rank, err )
  call mpi_comm_size( mpi_comm_world, size, err )

  write( 6, * ) "Rank", rank, " / ", size

  ! every rank fills the input buffer with the same values
  do i=1, N
     do j=1, N
        do k=1, N
           do l=1, N
              do m=1, N
                 buff_in( i, j, k, l, m ) = i + j + k + l + m
              end do
           end do
        end do
     end do
  end do

  buff_out( :,:,:,: ) = 0

! non-blocking

  ! the send buffer is the array section buff_in( 1, :, :, :, : ), which is not contiguous
  call mpi_iallreduce( buff_in( 1, :, :, :, : ), buff_out, N*N*N*N, MPI_INTEGER, MPI_SUM, mpi_comm_world, req, err )
  call mpi_wait( req, MPI_STATUS_IGNORE, err )

  if( 0 == rank ) then
     write( 6, * ) "Non-blocking"
     write( 6, * ) "1,1,1,1", buff_in( 1, 1, 1, 1, 1 ), buff_out( 1, 1, 1, 1 )
     write( 6, * ) "1,1,1,2", buff_in( 1, 1, 1, 1, 2 ), buff_out( 1, 1, 1, 2 )
     write( 6, * ) "1,1,1,3", buff_in( 1, 1, 1, 1, 3 ), buff_out( 1, 1, 1, 3 )
     write( 6, * ) "1,1,1,4", buff_in( 1, 1, 1, 1, 4 ), buff_out( 1, 1, 1, 4 )
     write( 6, * ) "1,1,1,5", buff_in( 1, 1, 1, 1, 5 ), buff_out( 1, 1, 1, 5 )
     write( 6, * ) "----"
     write( 6, * ) "1,1,2,1", buff_in( 1, 1, 1, 2, 1 ), buff_out( 1, 1, 2, 1 )
     write( 6, * ) "1,2,1,1", buff_in( 1, 1, 2, 1, 1 ), buff_out( 1, 2, 1, 1 )
     write( 6, * ) "2,1,1,1", buff_in( 1, 2, 1, 1, 1 ), buff_out( 2, 1, 1, 1 )
     write( 6, * ) "----"
  end if

! blocking

  buff_out( :,:,:,: ) = 0
  call mpi_allreduce( buff_in( 1, :, :, :, : ), buff_out, N*N*N*N, MPI_INTEGER, MPI_SUM, mpi_comm_world, err )

  if( 0 == rank ) then
     write( 6, * ) "Blocking"
     write( 6, * ) "1,1,1,1", buff_in( 1, 1, 1, 1, 1 ), buff_out( 1, 1, 1, 1 )
     write( 6, * ) "1,1,1,2", buff_in( 1, 1, 1, 1, 2 ), buff_out( 1, 1, 1, 2 )
     write( 6, * ) "1,1,1,3", buff_in( 1, 1, 1, 1, 3 ), buff_out( 1, 1, 1, 3 )
     write( 6, * ) "1,1,1,4", buff_in( 1, 1, 1, 1, 4 ), buff_out( 1, 1, 1, 4 )
     write( 6, * ) "1,1,1,5", buff_in( 1, 1, 1, 1, 5 ), buff_out( 1, 1, 1, 5 )
     write( 6, * ) "----"
     write( 6, * ) "1,1,2,1", buff_in( 1, 1, 1, 2, 1 ), buff_out( 1, 1, 2, 1 )
     write( 6, * ) "1,2,1,1", buff_in( 1, 1, 2, 1, 1 ), buff_out( 1, 2, 1, 1 )
     write( 6, * ) "2,1,1,1", buff_in( 1, 2, 1, 1, 1 ), buff_out( 2, 1, 1, 1 )
     write( 6, * ) "----"
  end if

  call mpi_finalize( err )
end program main
