------- Comment #6 from bonzini at gnu dot org  2009-02-04 12:02 -------
We have:

                         4.3    4.4
-O2                     3.36   3.32
-O3                     3.02   3.50
-O3 -funroll-all-loops  2.92   3.45

Interestingly enough, if I _really_ hand-unroll the loop, as the (incorrect) bug
summary suggests, 4.4 is faster than 4.3:

                         4.3    4.4
-O2                     3.53   3.23
-O3                     3.53   3.23
-O3 -funroll-all-loops  3.45   3.18

This means:

1) that -funroll-all-loops is not really affecting the benchmark, but only the
test code

2) that the regression is actually at -O3, not at -O2, and there's really no
guarantee of what performance you get past -O2

3) ultimately, that this is not a regression, but just an enhancement request:
4.4 -O2 is better, but not as good as 4.3 -O3.

--- here is the hand-unrolled loop code:

!> Benchmark kernel: a fully hand-unrolled product of two 3x3 default-real
!> matrices, written out element by element with no loops.
module foo
  implicit none
contains
  !> Compute the matrix product c = a*b for 3x3 matrices.
  !!
  !! Each element is written out explicitly as
  !! c(i,j) = a(i,1)*b(1,j) + a(i,2)*b(2,j) + a(i,3)*b(3,j).
  !!
  !! a, b : input 3x3 matrices (not modified).
  !! c    : output 3x3 matrix; all nine elements are assigned.
  subroutine unroll(a,b,c)
    real, dimension(3,3), intent(in)  :: a,b
    real, dimension(3,3), intent(out) :: c
    ! The nine assignments are kept exactly as hand-unrolled; they are the
    ! code under measurement, so do not fold them back into loops or matmul.
    c(1,1) = a(1,1)*b(1,1) + a(1,2)*b(2,1) + a(1,3)*b(3,1)
    c(1,2) = a(1,1)*b(1,2) + a(1,2)*b(2,2) + a(1,3)*b(3,2)
    c(1,3) = a(1,1)*b(1,3) + a(1,2)*b(2,3) + a(1,3)*b(3,3)
    c(2,1) = a(2,1)*b(1,1) + a(2,2)*b(2,1) + a(2,3)*b(3,1)
    c(2,2) = a(2,1)*b(1,2) + a(2,2)*b(2,2) + a(2,3)*b(3,2)
    c(2,3) = a(2,1)*b(1,3) + a(2,2)*b(2,3) + a(2,3)*b(3,3)
    c(3,1) = a(3,1)*b(1,1) + a(3,2)*b(2,1) + a(3,3)*b(3,1)
    c(3,2) = a(3,1)*b(1,2) + a(3,2)*b(2,2) + a(3,3)*b(3,2)
    c(3,3) = a(3,1)*b(1,3) + a(3,2)*b(2,3) + a(3,3)*b(3,3)
  end subroutine unroll
end module foo

! Timing driver for the hand-unrolled 3x3 product in module foo.
! Fills two 3x3 matrices with random numbers, calls unroll 10**8 times
! between two cpu_time samples, and prints the elapsed CPU time.
program main
  use foo
  implicit none
  integer :: k
  real, dimension(3,3) :: a,b,c, a0, b0
  real :: t1, t2
  ! 90 characters = 9 fields of width 10, matching the (9G10.3) format below.
  character(len=90) :: line

  ! Random inputs are generated into a0/b0 and then copied to the working
  ! arrays a/b that are actually passed to the kernel.
  call random_number(a0)
  call random_number(b0)
  a = a0
  b = b0

  ! Start of the timed region.
  call cpu_time(t1)

  ! Same inputs on every iteration; only the call cost is being measured.
  do k=1,10**8
     call unroll(a,b,c)
  end do

  ! End of the timed region.
  call cpu_time(t2)
  print *,"subroutine with explicit interface and unroll(1): ",t2-t1, " s"
  ! Internal write of c into a string that is never printed afterwards.
  ! NOTE(review): presumably this exists only to keep c "used" so the
  ! optimizer cannot discard the computation -- confirm before removing.
  write (unit=line,fmt='(9G10.3)') c

end program main


-- 

bonzini at gnu dot org changed:

           What    |Removed                     |Added
----------------------------------------------------------------------------
           Priority|P2                          |P3
   Last reconfirmed|2008-12-10 13:36:41         |2009-02-04 12:02:14
               date|                            |
            Summary|[4.4 Regression] speed      |big performance boost from
                   |regression with hand-       |4.3's -O3 vs. -O2 (in any
                   |unrolled matmul             |version) on 3x3 matmul
   Target Milestone|4.4.0                       |---


http://gcc.gnu.org/bugzilla/show_bug.cgi?id=38434

Reply via email to