------- Comment #7 from dominiq at lps dot ens dot fr  2009-05-22 20:52 -------
I had a closer look at the code and found that the inner loop

               DO k = 0 , Np(i)
                  uxt = uxt + D(j,k+1)*U(jmin+k,jm)
               ENDDO

is unrolled 8 times, but Np(i) is always equal to 4, so the relevant part of
the assembly is

...
        je      L951
        testl   %esi, %esi
        je      L915
        cmpl    $1, %esi
        je      L945
        cmpl    $2, %esi
        .p2align 4,,5
        je      L946
        cmpl    $3, %esi
        .p2align 4,,5
        je      L947
        cmpl    $4, %esi
        .p2align 4,,5
        je      L948
        cmpl    $5, %esi
        .p2align 4,,5
        je      L949
        cmpl    $6, %esi
        .p2align 4,,5
        je      L950
...

where the jump for $5 is the relevant one, since Np(i) = 4 gives five iterations
(k = 0..4). This does not look like an optimal way to handle the preamble.

I have also done some profiling and found that 'pow$fenv_access_off' in
libSystem.B.dylib  (PowerInner for ppc) takes a significant amount of time for
the executable compiled with -fwhole-file.

Any idea why? Note that derivx and derivy are inlined with -fwhole-file and,
looking at the *.s files attached in comments #5 and #6, everything looks normal
at this point.

                        i686-apple-darwin9

[ibook-dhum] lin/test% gfc -m64 -O3 -ffast-math -funroll-loops air.f90
[ibook-dhum] lin/test% rm -f tmp ; time a.out > tmp
8.451u 0.116s 0:08.61 99.4%     0+0k 0+6io 0pf+0w

+ 99.5%, start, a.out
| + 99.5%, main, a.out
| | + 99.4%, MAIN__, a.out
| | |   12.8%, derivy_, a.out
| | |   11.3%, derivx_, a.out
| | |   5.1%, fvsplty2_, a.out
| | |   4.1%, state_, a.out
| | |   3.1%, fvspltx2_, a.out
| | | - 2.8%, _gfortrani_list_formatted_write, libgfortran.3.dylib
| | | + 0.6%, botwall_, a.out
| | | |   0.2%, pow$fenv_access_off, libSystem.B.dylib
| | | |   0.0%, exp, libSystem.B.dylib
| | | |   0.0%, dyld_stub_exp, a.out
| | | + 0.6%, topwall_, a.out
| | | |   0.4%, pow$fenv_access_off, libSystem.B.dylib
| | | |   0.1%, exp, libSystem.B.dylib
| | | |   0.0%, dyld_stub_pow, a.out
| | | + 0.3%, aexit_, a.out
| | | |   0.1%, exp, libSystem.B.dylib
| | | + 0.2%, inlet_, a.out
| | | |   0.1%, exp, libSystem.B.dylib
| | | |   0.0%, log$fenv_access_off, libSystem.B.dylib
| | |   0.2%, log$fenv_access_off, libSystem.B.dylib
| | | - 0.1%, _gfortran_st_write_done, libgfortran.3.dylib
| | | - 0.1%, data_transfer_init, libgfortran.3.dylib
| | | - 0.1%, formatted_transfer, libgfortran.3.dylib
| | |   0.0%, _gfortran_transfer_real, libgfortran.3.dylib
| |   0.0%, _gfortran_st_write, libgfortran.3.dylib


[ibook-dhum] lin/test% gfc -m64 -O3 -ffast-math -funroll-loops -fwhole-file air.f90
[ibook-dhum] lin/test% rm -f tmp ; time a.out > tmp
9.752u 0.096s 0:09.90 99.3%     0+0k 0+6io 0pf+0w

+ 99.5%, start, a.out
| + 99.5%, main, a.out
| | + 99.5%, MAIN__, a.out
| | | + 15.0%, pow$fenv_access_off, libSystem.B.dylib             <==== Why?
| | | |   0.4%, floorl$fenv_access_off, libSystem.B.dylib
| | | |   0.2%, dyld_stub_fabs, libSystem.B.dylib
| | | |   0.1%, dyld_stub_floorl, libSystem.B.dylib
| | | |   0.1%, fabs$fenv_access_off, libSystem.B.dylib
| | |   4.6%, fvsplty2_, a.out
| | |   3.5%, state_.clone.2, a.out
| | | - 2.9%, _gfortrani_list_formatted_write, libgfortran.3.dylib
| | |   2.8%, fvspltx2_, a.out
| | | + 0.4%, topwall_, a.out
| | | |   0.2%, pow$fenv_access_off, libSystem.B.dylib
| | | |   0.1%, exp, libSystem.B.dylib
| | | + 0.4%, botwall_.clone.3, a.out
| | | |   0.2%, pow$fenv_access_off, libSystem.B.dylib
| | | |   0.0%, exp, libSystem.B.dylib
| | | + 0.3%, aexit_.clone.4, a.out
| | | |   0.1%, exp, libSystem.B.dylib
| | | |   0.0%, log$fenv_access_off, libSystem.B.dylib
| | |   0.3%, dyld_stub_pow, a.out
| | | + 0.2%, inlet_, a.out
| | | |   0.1%, exp, libSystem.B.dylib
| | | |   0.0%, dyld_stub_log, a.out
| | | - 0.2%, _gfortran_st_write_done, libgfortran.3.dylib
| | | - 0.1%, formatted_transfer, libgfortran.3.dylib
| | | - 0.1%, data_transfer_init, libgfortran.3.dylib
| | |   0.1%, log$fenv_access_off, libSystem.B.dylib
| | |   0.0%, _gfortrani_flush_if_preconnected, libgfortran.3.dylib
| |   0.0%, pow$fenv_access_off, libSystem.B.dylib
| |   0.0%, _gfortrani_free_internal_unit, libgfortran.3.dylib


                        powerpc-apple-darwin9

gfc -m64 -O3 -ffast-math -funroll-loops air.f90

- 75.5%, MAIN__, a.out
- 5.9%, derivy_, a.out
- 5.4%, derivx_, a.out
- 4.7%, fvsplty2_, a.out
- 4.2%, fvspltx2_, a.out
- 2.1%, state_, a.out
- 0.6%, dyld_stub_sqrt, a.out
- 0.5%, ml_set_interrupts_enabled, mach_kernel
- 0.2%, sqrt, libSystem.B.dylib
- 0.2%, exp, libSystem.B.dylib
- 0.2%, log, libSystem.B.dylib
- 0.1%, PowerInner, libSystem.B.dylib
- 0.1%, inlet_, a.out
- 0.0%, aexit_, a.out
- 0.0%, dyld_stub_pow, a.out
- 0.0%, botwall_, a.out
- 0.0%, topwall_, a.out
- 0.0%, pow, libSystem.B.dylib
- 0.0%, dyld_stub_log, a.out
- 0.0%, __dtoa, libSystem.B.dylib
- 0.0%, next_format0, libgfortran.3.dylib
- 0.0%, log10, libSystem.B.dylib
- 0.0%, dyld_stub_memset, libSystem.B.dylib
- 0.0%, dyld_stub_memcpy, libgfortran.3.dylib
- 0.0%, dyld_stub_exp, a.out
- 0.0%, dyld_stub___sfvwrite, libSystem.B.dylib
- 0.0%, __vfprintf, libSystem.B.dylib
- 0.0%, __quorem_D2A, libSystem.B.dylib
- 0.0%, __Bfree_D2A, libSystem.B.dylib

gfc -m64 -O3 -ffast-math -funroll-loops -fwhole-file air.f90

- 82.6%, MAIN__, a.out
- 5.3%, PowerInner, libSystem.B.dylib                             <==== Why?
- 4.3%, fvsplty2_, a.out
- 3.2%, fvspltx2_, a.out
- 1.9%, state_.clone.2, a.out
- 1.3%, pow, libSystem.B.dylib
- 0.4%, ml_set_interrupts_enabled, mach_kernel
- 0.4%, dyld_stub_sqrt, a.out
- 0.1%, log, libSystem.B.dylib
- 0.1%, dyld_stub_pow, a.out
- 0.1%, sqrt, libSystem.B.dylib
- 0.1%, exp, libSystem.B.dylib
- 0.0%, inlet_, a.out
- 0.0%, botwall_.clone.3, a.out
- 0.0%, topwall_, a.out
- 0.0%, aexit_.clone.4, a.out
- 0.0%, dyld_stub_log, a.out
- 0.0%, dyld_stub_localeconv_l, libSystem.B.dylib
- 0.0%, dyld_stub_exp, a.out
- 0.0%, dyld_stub___pow5mult_D2A, libSystem.B.dylib
- 0.0%, data_transfer_init, libgfortran.3.dylib
- 0.0%, __umodti3, libgfortran.3.dylib
- 0.0%, __dtoa, libSystem.B.dylib
- 0.0%, __Bfree_D2A, libSystem.B.dylib
- 0.0%, __Balloc_D2A, libSystem.B.dylib


-- 

dominiq at lps dot ens dot fr changed:

           What    |Removed                     |Added
----------------------------------------------------------------------------
                 CC|                            |jh at suse dot cz


http://gcc.gnu.org/bugzilla/show_bug.cgi?id=40106

Reply via email to