https://gcc.gnu.org/bugzilla/show_bug.cgi?id=52473
--- Comment #10 from Thomas Koenig <tkoenig at gcc dot gnu.org> --- Also see the discussion at https://groups.google.com/forum/#!topic/comp.lang.fortran/AI0F1Vpkc3I There is one thing that I do not understand. For the following test code, which compares straightforward DO loops for shifting a three-dimensional array by 1, the library version is actually faster than straightforward loops for dim=1. module kinds integer, parameter :: sp = selected_real_kind(6) ! Single precision integer, parameter :: dp = selected_real_kind(15) ! Double precision end module kinds module replacements use kinds contains subroutine cshift_sp_3_v1 (array, shift, dim, res) integer, parameter :: wp = sp real(kind=wp), dimension(:,:,:), intent(in), contiguous :: array integer, intent(in) :: shift, dim real(kind=wp), dimension(:,:,:), intent(out), contiguous :: res integer :: i,j,k integer :: sh, rsh integer :: n res = 0 if (dim == 1) then n = size(array,1) sh = modulo(shift, n) rsh = n - sh do k=1, size(array,3) do j=1, size(array,2) do i=1, rsh res(i,j,k) = array(i+sh,j,k) end do do i=rsh+1,n res(i,j,k) = array(i-rsh,j,k) end do end do end do else if (dim == 2) then n = size(array,2) sh = modulo(shift,n) rsh = n - sh do k=1, size(array,3) do j=1, rsh do i=1, size(array,1) res(i,j,k) = array(i,j+sh, k) end do end do do j=rsh+1, n do i=1, size(array,1) res(i,j,k) = array(i,j-rsh, k) end do end do end do else if (dim == 3) then n = size(array,3) sh = modulo(shift, n) rsh = n - sh do k=1, rsh do j=1, size(array,2) do i=1, size(array,1) res(i,j,k) = array(i, j, k+sh) end do end do end do do k=rsh+1, n do j=1, size(array,2) do i=1, size(array,1) res(i,j, k) = array(i, j, k-rsh) end do end do end do else stop "Wrong argument to dim" end if end subroutine cshift_sp_3_v1 end module replacements program testme use kinds use replacements implicit none integer, parameter :: wp = sp ! Working precision INTEGER, PARAMETER :: n = 200 real(kind=wp) :: a(n,n,n), b(n,n,n) integer i, j, k real t1, t2 print *,"Testing explicit DO loops" call random_number(a) do k = 1,3 call cpu_time ( t1 ) do j = 1, 100 call cshift_sp_3_v1 (a, 1, k, b) end do call cpu_time ( t2 ) write ( *, * ) 'Dim = ', k, ' Elapsed CPU time = ', t2-t1 end do print *,"Testing built-in cshift" do k = 1,3 call cpu_time ( t1 ) do j = 1, 100 b = cshift(a,1,k) end do call cpu_time ( t2 ) write ( *, * ) 'Dim = ', k, ' Elapsed CPU time = ', t2-t1 end do end program testme