------- Comment #6 from bonzini at gnu dot org 2009-02-04 12:02 -------

We have:

                              4.3     4.4
    -O2                       3.36    3.32
    -O3                       3.02    3.50
    -O3 -funroll-all-loops    2.92    3.45

Interestingly enough, if I _really_ hand-unroll the loop as suggested by the
wrong summary, 4.4 is faster than 4.3:

                              4.3     4.4
    -O2                       3.53    3.23
    -O3                       3.53    3.23
    -O3 -funroll-all-loops    3.45    3.18

This means:

1) that -funroll-all-loops is not really affecting the benchmark, but only the
   test code

2) that the regression is actually at -O3, not at -O2, and there's really no
   guarantee of what performance you get past -O2

3) ultimately, that this is not a regression, but just an enhancement request:
   4.4 -O2 is better, but not as good as 4.3 -O3.

---

here is the hand-unrolled loop code:

module foo
  implicit none
contains
  subroutine unroll(a,b,c)
    real, dimension(3,3) :: a,b,c
    c(1,1) = a(1,1)*b(1,1) + a(1,2)*b(2,1) + a(1,3)*b(3,1)
    c(1,2) = a(1,1)*b(1,2) + a(1,2)*b(2,2) + a(1,3)*b(3,2)
    c(1,3) = a(1,1)*b(1,3) + a(1,2)*b(2,3) + a(1,3)*b(3,3)
    c(2,1) = a(2,1)*b(1,1) + a(2,2)*b(2,1) + a(2,3)*b(3,1)
    c(2,2) = a(2,1)*b(1,2) + a(2,2)*b(2,2) + a(2,3)*b(3,2)
    c(2,3) = a(2,1)*b(1,3) + a(2,2)*b(2,3) + a(2,3)*b(3,3)
    c(3,1) = a(3,1)*b(1,1) + a(3,2)*b(2,1) + a(3,3)*b(3,1)
    c(3,2) = a(3,1)*b(1,2) + a(3,2)*b(2,2) + a(3,3)*b(3,2)
    c(3,3) = a(3,1)*b(1,3) + a(3,2)*b(2,3) + a(3,3)*b(3,3)
  end subroutine unroll
end module foo

program main
  use foo
  implicit none
  integer :: k
  real, dimension(3,3) :: a,b,c, a0, b0
  real :: t1, t2
  character(len=90) :: line
  call random_number(a0)
  call random_number(b0)
  a = a0
  b = b0
  call cpu_time(t1)
  do k=1,10**8
     call unroll(a,b,c)
  end do
  call cpu_time(t2)
  print *,"subroutine with explicit interface and unroll(1): ",t2-t1, " s"
  write (unit=line,fmt='(9G10.3)') c
end program main

--

bonzini at gnu dot org changed:

           What    |Removed                     |Added
----------------------------------------------------------------------------
           Priority|P2                          |P3
   Last reconfirmed|2008-12-10 13:36:41         |2009-02-04 12:02:14
               date|                            |
            Summary|[4.4 Regression] speed      |big performance boost from
                   |regression with hand-       |4.3's -O3 vs. -O2 (in any
                   |unrolled matmul             |version) on 3x3 matmul
   Target Milestone|4.4.0                       |---

http://gcc.gnu.org/bugzilla/show_bug.cgi?id=38434