I think that a forall statement must be at least as fast as the equivalent do ... end do construct. But the following program (a variant of LU decomposition) shows that the fragment containing the forall statement is approximately 2.5(!) times slower than the fragment with do ... end do.
! Benchmark from the bug report: the same in-place elimination with row
! pivoting is run twice on identical random data.  The first pass updates the
! trailing columns with a do loop, the second with a forall statement, and the
! CPU time of each pass is printed.
program test
  implicit none

  integer, parameter :: n  = 2000
  integer, parameter :: dp = kind(1.0d0)    ! same kind as double precision

  real(dp), dimension(n, n) :: a, a_saved   ! working matrix / untouched copy
  real(dp), dimension(n)    :: row_tmp      ! scratch space for the row swap
  real    :: t_start, t_stop
  integer :: k, piv, col

  intrinsic :: random_number, maxloc, cpu_time

  call random_number(a)
  a_saved = a

  ! ---- Pass 1: trailing-column update written as a do loop ----
  call cpu_time(t_start)
  do k = 1, n - 1
    ! Row (in absolute indexing) of the largest magnitude in column k, rows k..n.
    piv = maxloc(abs(a(k:, k)), dim=1) + k - 1
    if (a(piv, k) == 0.0) stop 'Zero pivot'

    ! Swap rows k and piv, columns k..n only.
    if (piv /= k) then
      row_tmp(k:n) = a(k, k:n)
      a(k, k:n)    = a(piv, k:n)
      a(piv, k:n)  = row_tmp(k:n)
    end if

    ! Scale the sub-column by the pivot, then update every trailing column.
    a(k+1:, k) = a(k+1:, k) / a(k, k)
    do col = k + 1, n
      a(k+1:, col) = a(k+1:, col) - a(k, col) * a(k+1:, k)
    end do
  end do
  call cpu_time(t_stop)
  print *, 'Time of operation was ', t_stop - t_start, ' seconds'

  ! ---- Pass 2: identical work, trailing-column update written as forall ----
  a = a_saved
  call cpu_time(t_start)
  do k = 1, n - 1
    piv = maxloc(abs(a(k:, k)), dim=1) + k - 1
    if (a(piv, k) == 0.0) stop 'Zero pivot'

    if (piv /= k) then
      row_tmp(k:n) = a(k, k:n)
      a(k, k:n)    = a(piv, k:n)
      a(piv, k:n)  = row_tmp(k:n)
    end if

    a(k+1:, k) = a(k+1:, k) / a(k, k)
    forall (col = k+1:n) a(k+1:, col) = a(k+1:, col) - a(k, col) * a(k+1:, k)
  end do
  call cpu_time(t_stop)
  print *, 'Time of operation was ', t_stop - t_start, ' seconds'
end program test

! GCC version 4.4.2. Windows Vista SP2, CPU: Intel Core 2 Quad Q6600, RAM: 3 GB
! Gfortran O3 file_name.f95
!
! --
! Summary: Slow forall
! Product: gcc
! Version: 4.4.2
! Status: UNCONFIRMED
! Severity: normal
! Priority: P3
! Component: fortran
! AssignedTo: unassigned at gcc dot gnu dot org
! ReportedBy: gretsov at gmail dot com
! http://gcc.gnu.org/bugzilla/show_bug.cgi?id=42118