------- Comment #69 from lucier at math dot purdue dot edu 2009-05-07 15:57 ------- Well, adding -frename-registers by itself to -O1 (without -fforward-propagate or -fno-move-loop-invariants) doesn't help (the loop is given below, along with the complete compile options); the time is
140 ms cpu time (140 user, 0 system), and adding -frename-registers and -fno-move-loop-invariants without -fforward-propagate doesn't help either (the loop is again given below); it gets 140 ms cpu time (140 user, 0 system). Adding all three gives a very consistent time this morning of 120 ms cpu time (120 user, 0 system), which is the same as the 4.2.4 time without any of these options (this morning). But -fforward-propagate is not a viable option in general for this type of code; here are some times for the testcase from PR 31957 with various options on a 2.something GHz Xeon server: pythagoras-45% time /pkgs/gcc-mainline/bin/gcc -save-temps -I../include -I. -Wall -W -Wno-unused -O1 -fno-math-errno -fschedule-insns2 -fno-trapping-math -fno-strict-aliasing -fwrapv -fomit-frame-pointer -fPIC -fno-common -mieee-fp -frename-registers -DHAVE_CONFIG_H -D___PRIMAL -D___LIBRARY -c compiler.i -ftime-report -fmem-report >& rename-report 252.987u 9.592s 4:23.20 99.7% 0+0k 0+0io 0pf+0w pythagoras-46% time /pkgs/gcc-mainline/bin/gcc -save-temps -I../include -I. -Wall -W -Wno-unused -O1 -fno-math-errno -fschedule-insns2 -fno-trapping-math -fno-strict-aliasing -fwrapv -fomit-frame-pointer -fPIC -fno-common -mieee-fp -DHAVE_CONFIG_H -D___PRIMAL -D___LIBRARY -c compiler.i -ftime-report -fmem-report > & no-rename-report 249.875u 10.544s 4:21.73 99.4% 0+0k 0+0io 0pf+0w pythagoras-47% time /pkgs/gcc-mainline/bin/gcc -save-temps -I../include -I. -Wall -W -Wno-unused -O1 -fno-math-errno -fschedule-insns2 -fno-trapping-math -fno-strict-aliasing -fwrapv -fomit-frame-pointer -fPIC -fno-common -mieee-fp -frename-registers -fno-move-loop-invariants -DHAVE_CONFIG_H -D___PRIMAL -D___LIBRARY -c compiler.i -ftime-report -fmem-report > & rename-no-move-loop-invariants-report 246.663u 10.484s 4:18.30 99.5% 0+0k 0+0io 0pf+0w pythagoras-48% time /pkgs/gcc-mainline/bin/gcc -save-temps -I../include -I. 
-Wall -W -Wno-unused -O1 -fno-math-errno -fschedule-insns2 -fno-trapping-math -fno-strict-aliasing -fwrapv -fomit-frame-pointer -fPIC -fno-common -mieee-fp -frename-registers -fno-move-loop-invariants -fforward-propagate -DHAVE_CONFIG_H -D___PRIMAL -D___LIBRARY -c compiler.i -ftime-report -fmem-report > & rename-no-move-loop-invariants-forward-propagate-report 357.830u 28.417s 6:27.81 99.5% 0+0k 0+0io 11pf+0w With -fforward-propagate the memory required went up to at least 21GB. I'll attach the time reports for the various options, but the compiler wasn't configured to provide detailed memory reports. Brad Loop with -frename-registers /pkgs/gcc-mainline/bin/gcc -save-temps -I../include -I. -Wall -W -Wno-unused -O1 -fno-math-errno -fschedule-insns2 -fno-trapping-math -fno-strict-aliasing -fwrapv -fomit-frame-pointer -fPIC -fno-common -mieee-fp -frename-registers -DHAVE_CONFIG_H -D___PRIMAL -D___LIBRARY -D___GAMBCDIR="\"/usr/local/Gambit-C/v4.1.2\"" -D___SYS_TYPE_CPU="\"x86_64\"" -D___SYS_TYPE_VENDOR="\"unknown\"" -D___SYS_TYPE_OS="\"linux-gnu\"" -c _num.c movq %rdx, %r12 addq (%r11), %r12 leaq 4(%rdx), %r14 movq %r12, (%rsi) addq $4, %r12 movq %r12, (%r10) movq (%r11), %rcx addq (%rsi), %rcx movq %rcx, (%rbx) addq $4, %rcx movq %rcx, (%r9) movq (%r11), %r13 addq (%rbx), %r13 movq %r13, (%r8) addq $4, %r13 movq %r13, (%r15) movq (%rax), %rcx movq (%r8), %r12 addq $7, %rcx movsd (%rcx,%r12,2), %xmm10 movq (%rbx), %r12 movsd (%rcx,%r13,2), %xmm13 movq (%r9), %r13 movsd (%rcx,%r12,2), %xmm6 movq (%rsi), %r12 movsd (%rcx,%r13,2), %xmm5 movq (%r10), %r13 movsd (%rcx,%r12,2), %xmm9 leaq (%r14,%r14), %r12 movsd (%rcx,%r13,2), %xmm11 leaq (%rcx,%rdx,2), %r13 movsd (%rcx,%r12), %xmm3 movq 24(%rdi), %rcx movsd (%r13), %xmm4 addq $8, %rdx movsd 15(%rcx), %xmm14 movsd 7(%rcx), %xmm15 movapd %xmm14, %xmm8 movapd %xmm14, %xmm7 movapd %xmm15, %xmm12 mulsd %xmm10, %xmm8 mulsd %xmm13, %xmm12 mulsd %xmm15, %xmm10 mulsd %xmm14, %xmm13 movsd 31(%rcx), %xmm2 addsd %xmm8, %xmm12 movapd 
%xmm15, %xmm8 mulsd %xmm6, %xmm7 mulsd %xmm5, %xmm14 subsd %xmm13, %xmm10 mulsd %xmm5, %xmm8 movapd %xmm2, %xmm13 mulsd %xmm6, %xmm15 movapd %xmm4, %xmm6 xorpd .LC5(%rip), %xmm13 movapd %xmm3, %xmm5 addsd %xmm7, %xmm8 movapd %xmm11, %xmm7 subsd %xmm14, %xmm15 movapd %xmm9, %xmm14 movsd 23(%rcx), %xmm0 subsd %xmm12, %xmm7 subsd %xmm10, %xmm14 movapd %xmm13, %xmm1 addsd %xmm11, %xmm12 movapd %xmm2, %xmm11 subsd %xmm15, %xmm6 addsd %xmm4, %xmm15 movapd %xmm0, %xmm4 mulsd %xmm7, %xmm1 addsd %xmm9, %xmm10 mulsd %xmm14, %xmm4 subsd %xmm8, %xmm5 mulsd %xmm0, %xmm7 addsd %xmm3, %xmm8 mulsd %xmm13, %xmm14 movapd %xmm15, %xmm9 mulsd %xmm10, %xmm11 mulsd %xmm0, %xmm10 addsd %xmm1, %xmm4 movapd %xmm8, %xmm3 movapd %xmm5, %xmm1 subsd %xmm7, %xmm14 movapd %xmm0, %xmm7 mulsd %xmm12, %xmm7 addsd %xmm4, %xmm1 mulsd %xmm2, %xmm12 movapd %xmm6, %xmm2 subsd %xmm14, %xmm6 addsd %xmm14, %xmm2 addsd %xmm11, %xmm7 subsd %xmm12, %xmm10 subsd %xmm4, %xmm5 addsd %xmm7, %xmm3 addsd %xmm10, %xmm9 subsd %xmm10, %xmm15 subsd %xmm7, %xmm8 movsd %xmm9, (%r13) movq (%rax), %rcx movsd %xmm3, 7(%r12,%rcx) movq (%rsi), %r13 movq (%rax), %rcx movsd %xmm15, 7(%rcx,%r13,2) movq (%r10), %r12 movq (%rax), %r13 movsd %xmm8, 7(%r13,%r12,2) movq (%rbx), %rcx movq (%rax), %r13 movsd %xmm2, 7(%r13,%rcx,2) movq (%r9), %r12 movq (%rax), %rcx movsd %xmm1, 7(%rcx,%r12,2) movq (%r8), %r13 movq (%rax), %rcx movsd %xmm6, 7(%rcx,%r13,2) movq (%r15), %r12 movq (%rax), %r13 movsd %xmm5, 7(%r13,%r12,2) cmpq %rdx, -104(%rsp) jg .L2941 Loop with -frename-registers -fno-move-loop-invariants /pkgs/gcc-mainline/bin/gcc -save-temps -I../include -I. 
-Wall -W -Wno-unused -O1 -fno-math-errno -fschedule-insns2 -fno-trapping-math -fno-strict-aliasing -fwrapv -fomit-frame-pointer -fPIC -fno-common -mieee-fp -frename-registers -fno-move-loop-invariants -DHAVE_CONFIG_H -D___PRIMAL -D___LIBRARY -D___GAMBCDIR="\"/usr/local/Gambit-C/v4.1.2\"" -D___SYS_TYPE_CPU="\"x86_64\"" -D___SYS_TYPE_VENDOR="\"unknown\"" -D___SYS_TYPE_OS="\"linux-gnu\"" -c _num.c .L2755: leaq 8(%rax), %rdx movq %rcx, %r13 leaq -16(%rax), %r9 leaq -8(%rax), %r10 leaq -24(%rax), %r8 leaq -32(%rax), %rdi addq (%rdx), %r13 leaq 4(%rcx), %r14 leaq 4(%r13), %rsi movq %r13, (%r10) movq %rsi, (%r9) addq (%rdx), %r13 leaq -40(%rax), %rsi leaq 4(%r13), %r11 movq %r13, (%r8) movq %r11, (%rdi) addq (%rdx), %r13 leaq -48(%rax), %r11 leaq 40(%rax), %rdx movq %r13, (%rsi) addq $4, %r13 movq %r13, (%r11) movq (%rdx), %rbx movq (%rsi), %r12 addq $7, %rbx movsd (%rbx,%r12,2), %xmm11 movq (%r8), %r12 movsd (%rbx,%r13,2), %xmm9 movq (%rdi), %r13 movsd (%rbx,%r12,2), %xmm7 movq (%r10), %r12 movsd (%rbx,%r13,2), %xmm5 movq (%r9), %r13 movsd (%rbx,%r12,2), %xmm6 leaq (%r14,%r14), %r12 movsd (%rbx,%r13,2), %xmm14 leaq (%rbx,%rcx,2), %r13 movsd (%rbx,%r12), %xmm8 movq 24(%rax), %rbx movapd %xmm6, %xmm13 addq $8, %rcx movsd (%r13), %xmm4 cmpq %rcx, %r15 movsd 15(%rbx), %xmm1 movsd 7(%rbx), %xmm2 movapd %xmm1, %xmm3 movsd 31(%rbx), %xmm0 movapd %xmm2, %xmm10 mulsd %xmm11, %xmm3 movapd %xmm2, %xmm12 mulsd %xmm9, %xmm10 mulsd %xmm2, %xmm11 mulsd %xmm1, %xmm9 mulsd %xmm7, %xmm2 addsd %xmm10, %xmm3 mulsd %xmm5, %xmm12 movapd %xmm14, %xmm10 movsd 23(%rbx), %xmm15 subsd %xmm9, %xmm11 movapd %xmm1, %xmm9 mulsd %xmm5, %xmm1 movapd %xmm8, %xmm5 mulsd %xmm7, %xmm9 subsd %xmm3, %xmm10 movapd %xmm4, %xmm7 subsd %xmm11, %xmm13 addsd %xmm6, %xmm11 movsd .LC5(%rip), %xmm6 subsd %xmm1, %xmm2 xorpd %xmm0, %xmm6 addsd %xmm14, %xmm3 addsd %xmm12, %xmm9 movapd %xmm15, %xmm14 movapd %xmm0, %xmm12 subsd %xmm2, %xmm7 mulsd %xmm13, %xmm14 addsd %xmm4, %xmm2 movapd %xmm6, %xmm4 subsd %xmm9, %xmm5 
mulsd %xmm3, %xmm0 addsd %xmm8, %xmm9 mulsd %xmm10, %xmm4 movapd %xmm15, %xmm8 mulsd %xmm15, %xmm10 mulsd %xmm11, %xmm15 movapd %xmm7, %xmm1 mulsd %xmm13, %xmm6 mulsd %xmm3, %xmm8 movapd %xmm9, %xmm3 mulsd %xmm11, %xmm12 addsd %xmm14, %xmm4 subsd %xmm0, %xmm15 movapd %xmm5, %xmm0 subsd %xmm10, %xmm6 movapd %xmm2, %xmm10 addsd %xmm12, %xmm8 addsd %xmm15, %xmm10 subsd %xmm15, %xmm2 addsd %xmm6, %xmm1 addsd %xmm8, %xmm3 movsd %xmm10, (%r13) movq (%rdx), %rbx subsd %xmm8, %xmm9 addsd %xmm4, %xmm0 subsd %xmm6, %xmm7 movsd %xmm3, 7(%r12,%rbx) movq (%r10), %r10 movq (%rdx), %r13 subsd %xmm4, %xmm5 movsd %xmm2, 7(%r13,%r10,2) movq (%r9), %rbx movq (%rdx), %r12 movsd %xmm9, 7(%r12,%rbx,2) movq (%r8), %r13 movq (%rdx), %r10 movsd %xmm1, 7(%r10,%r13,2) movq (%rdi), %r9 movq (%rdx), %rbx movsd %xmm0, 7(%rbx,%r9,2) movq (%rsi), %rsi movq (%rdx), %r8 movsd %xmm7, 7(%r8,%rsi,2) movq (%r11), %rdi movq (%rdx), %r12 movsd %xmm5, 7(%r12,%rdi,2) jg .L2755 -- http://gcc.gnu.org/bugzilla/show_bug.cgi?id=33928