[Bug tree-optimization/33928] [4.3/4.4/4.5 Regression] 30% performance slowdown in floating-point code caused by r118475

2009-04-26 Thread lucier at math dot purdue dot edu


--- Comment #52 from lucier at math dot purdue dot edu  2009-04-26 18:27 ---
I narrowed the new performance regression down to code added some time around
March 12, 2009, so I changed the subject line of this PR back to cover only the
regression caused by the code added on 2006-11-03, and opened a new PR,

http://gcc.gnu.org/bugzilla/show_bug.cgi?id=39914

to track the effects of the March 2009 code.


-- 

lucier at math dot purdue dot edu changed:

           What    |Removed                     |Added
----------------------------------------------------------------------------
            Summary|[4.3/4.4/4.5 Regression] 79%|[4.3/4.4/4.5 Regression] 30%
                   |performance slowdown in     |performance slowdown in
                   |floating-point code         |floating-point code caused
                   |partially caused by r118475 |by r118475


http://gcc.gnu.org/bugzilla/show_bug.cgi?id=33928



[Bug tree-optimization/33928] [4.3/4.4/4.5 Regression] 30% performance slowdown in floating-point code caused by r118475

2009-05-05 Thread lucier at math dot purdue dot edu


--- Comment #53 from lucier at math dot purdue dot edu  2009-05-06 03:43 ---
I posted a possible fix to gcc-patches with the subject line

Possible fix for 30% performance regression in PR 33928

Here's the assembly for the main loop after the changes I proposed:

.L4230:
movq    %r11, %rdi
addq    8(%r10), %rdi
movq    8(%r10), %rsi
movq    8(%r10), %rdx
movq    40(%r10), %rax
leaq    4(%r11), %rbx
addq    %rdi, %rsi
leaq    4(%rdi), %r9
movq    %rdi, -8(%r10)
addq    %rsi, %rdx
leaq    4(%rsi), %r8
movq    %rsi, -24(%r10)
leaq    4(%rdx), %rcx
movq    %r9, -16(%r10)
movq    %rdx, -40(%r10)
movq    %r8, -32(%r10)
addq    $7, %rax
movq    %rcx, -48(%r10)
movsd   (%rax,%rcx,2), %xmm12
leaq    (%rbx,%rbx), %rcx
movsd   (%rax,%rdx,2), %xmm3
leaq    (%rax,%r11,2), %rdx
addq    $8, %r11
movsd   (%rax,%r8,2), %xmm14
cmpq    %r11, %r13
movsd   (%rax,%rsi,2), %xmm13
movsd   (%rax,%r9,2), %xmm11
movsd   (%rax,%rdi,2), %xmm10
movsd   (%rax,%rcx), %xmm8
movq    24(%r10), %rax
movsd   (%rdx), %xmm7
movsd   15(%rax), %xmm2
movsd   7(%rax), %xmm1
movapd  %xmm2, %xmm0
movsd   31(%rax), %xmm9
movapd  %xmm1, %xmm6
mulsd   %xmm3, %xmm0
movapd  %xmm1, %xmm4
mulsd   %xmm12, %xmm6
mulsd   %xmm3, %xmm4
movapd  %xmm1, %xmm3
mulsd   %xmm13, %xmm1
mulsd   %xmm14, %xmm3
addsd   %xmm0, %xmm6
movapd  %xmm2, %xmm0
movsd   23(%rax), %xmm5
mulsd   %xmm12, %xmm0
movapd  %xmm7, %xmm12
subsd   %xmm0, %xmm4
movapd  %xmm2, %xmm0
mulsd   %xmm14, %xmm2
movapd  %xmm8, %xmm14
mulsd   %xmm13, %xmm0
movapd  %xmm11, %xmm13
addsd   %xmm6, %xmm11
subsd   %xmm6, %xmm13
subsd   %xmm2, %xmm1
movapd  %xmm10, %xmm2
addsd   %xmm0, %xmm3
movapd  %xmm5, %xmm0
subsd   %xmm4, %xmm2
addsd   %xmm4, %xmm10
subsd   %xmm1, %xmm12
addsd   %xmm1, %xmm7
movapd  %xmm9, %xmm1
subsd   %xmm3, %xmm14
mulsd   %xmm2, %xmm0
xorpd   .LC5(%rip), %xmm1
addsd   %xmm3, %xmm8
movapd  %xmm1, %xmm3
mulsd   %xmm2, %xmm1
movapd  %xmm5, %xmm2
mulsd   %xmm13, %xmm3
mulsd   %xmm11, %xmm2
addsd   %xmm0, %xmm3
movapd  %xmm5, %xmm0
mulsd   %xmm10, %xmm5
mulsd   %xmm13, %xmm0
subsd   %xmm0, %xmm1
movapd  %xmm9, %xmm0
mulsd   %xmm11, %xmm9
mulsd   %xmm10, %xmm0
subsd   %xmm9, %xmm5
addsd   %xmm0, %xmm2
movapd  %xmm7, %xmm0
addsd   %xmm5, %xmm0
subsd   %xmm5, %xmm7
movsd   %xmm0, (%rdx)
movapd  %xmm8, %xmm0
movq    40(%r10), %rax
subsd   %xmm2, %xmm8
addsd   %xmm2, %xmm0
movsd   %xmm0, 7(%rcx,%rax)
movq    -8(%r10), %rdx
movq    40(%r10), %rax
movapd  %xmm12, %xmm0
subsd   %xmm1, %xmm12
movsd   %xmm7, 7(%rax,%rdx,2)
movq    -16(%r10), %rdx
movq    40(%r10), %rax
addsd   %xmm1, %xmm0
movsd   %xmm8, 7(%rax,%rdx,2)
movq    -24(%r10), %rdx
movq    40(%r10), %rax
movsd   %xmm0, 7(%rax,%rdx,2)
movapd  %xmm14, %xmm0
movq    -32(%r10), %rdx
movq    40(%r10), %rax
subsd   %xmm3, %xmm14
addsd   %xmm3, %xmm0
movsd   %xmm0, 7(%rax,%rdx,2)
movq    -40(%r10), %rdx
movq    40(%r10), %rax
movsd   %xmm12, 7(%rax,%rdx,2)
movq    -48(%r10), %rdx
movq    40(%r10), %rax
movsd   %xmm14, 7(%rax,%rdx,2)
jg      .L4230
movq    %rbx, %r13
.L4228:


-- 


http://gcc.gnu.org/bugzilla/show_bug.cgi?id=33928



[Bug tree-optimization/33928] [4.3/4.4/4.5 Regression] 30% performance slowdown in floating-point code caused by r118475

2009-05-05 Thread lucier at math dot purdue dot edu


--- Comment #54 from lucier at math dot purdue dot edu  2009-05-06 03:50 ---
Created an attachment (id=17805)
 --> (http://gcc.gnu.org/bugzilla/attachment.cgi?id=17805&action=view)
svn diff of cse.c to fix the performance regression

This partially reverts r118475 and adds code to call find_best_address for MEMs
in fold_rtx.


-- 


http://gcc.gnu.org/bugzilla/show_bug.cgi?id=33928



[Bug tree-optimization/33928] [4.3/4.4/4.5 Regression] 30% performance slowdown in floating-point code caused by r118475

2009-05-06 Thread bonzini at gnu dot org


--- Comment #55 from bonzini at gnu dot org  2009-05-06 09:20 ---
Created an attachment (id=17807)
 --> (http://gcc.gnu.org/bugzilla/attachment.cgi?id=17807&action=view)
svn diff of cse.c to "fix" the performance regression (updated)


-- 

bonzini at gnu dot org changed:

           What    |Removed                     |Added
----------------------------------------------------------------------------
  Attachment #17805|0                           |1
        is obsolete|                            |


http://gcc.gnu.org/bugzilla/show_bug.cgi?id=33928



[Bug tree-optimization/33928] [4.3/4.4/4.5 Regression] 30% performance slowdown in floating-point code caused by r118475

2009-05-06 Thread bonzini at gnu dot org


--- Comment #56 from bonzini at gnu dot org  2009-05-06 09:31 ---
Created an attachment (id=17808)
 --> (http://gcc.gnu.org/bugzilla/attachment.cgi?id=17808&action=view)
usable testcase

OK, I managed to produce a reasonably readable testcase (un-include the stdlib
files, remove unused gambit stuff and the ___ prefixes, simplify some
expressions), find the heavy loops, annotate them with asm statements (see
comment #18, 2007-11-30), and measure the length of each loop.

                  4.2     4.5     4.5 + patch
LOOP 1            ~190    ~230    ~190
INNER LOOP 1.1    ~120    ~130    ~120
LOOP 2            33      36      31
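
For anyone who wants to reproduce these counts, here is a minimal sketch of the
asm-annotation idea mentioned above (the marker strings and the loop body are
hypothetical stand-ins; comment #18 may use different markers).  The volatile
asm statements survive into the generated .s file, so the instructions
belonging to each loop can be located and counted there:

void scale (double *a, const double *b, long n)
{
  /* Hypothetical marker: it only emits a comment into the assembly output
     so the loop is easy to find.  Counts are approximate because the
     compiler may move code across the markers.  */
  __asm__ volatile ("# LOOP 1 BEGIN");
  for (long i = 0; i < n; i++)
    a[i] = a[i] * b[i] + b[i];          /* stand-in for the real hot loop */
  __asm__ volatile ("# LOOP 1 END");
}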

I am thus obsoleting (almost) everything that was posted earlier and is no
longer relevant.  Let's start from scratch with the new testcase.


-- 

bonzini at gnu dot org changed:

           What    |Removed                     |Added
----------------------------------------------------------------------------
  Attachment #14418|0                           |1
        is obsolete|                            |
  Attachment #14423|0                           |1
        is obsolete|                            |
  Attachment #14424|0                           |1
        is obsolete|                            |
  Attachment #14425|0                           |1
        is obsolete|                            |
  Attachment #14426|0                           |1
        is obsolete|                            |
  Attachment #14534|0                           |1
        is obsolete|                            |
  Attachment #14535|0                           |1
        is obsolete|                            |
  Attachment #14536|0                           |1
        is obsolete|                            |
  Attachment #14997|0                           |1
        is obsolete|                            |


http://gcc.gnu.org/bugzilla/show_bug.cgi?id=33928



[Bug tree-optimization/33928] [4.3/4.4/4.5 Regression] 30% performance slowdown in floating-point code caused by r118475

2009-05-06 Thread jakub at gcc dot gnu dot org


--- Comment #57 from jakub at gcc dot gnu dot org  2009-05-06 09:49 ---
Why do you need any #include lines at all in the reduced testcase?  Compiles
just fine even without them...


-- 


http://gcc.gnu.org/bugzilla/show_bug.cgi?id=33928



[Bug tree-optimization/33928] [4.3/4.4/4.5 Regression] 30% performance slowdown in floating-point code caused by r118475

2009-05-06 Thread bonzini at gnu dot org


--- Comment #58 from bonzini at gnu dot org  2009-05-06 09:56 ---
Uhm, it's better to run unpatched 4.5 with -O1 -fforward-propagate to get a
fair comparison.  Also, I was counting the loop headers, which are not part of
the hot code.

                  4.2 -O1    4.5 -O1 -ffw-prop    4.5 + patch -O1
LOOP 1            181        201                  180
INNER LOOP 1.1    117        118                  113
LOOP 2            27         27                   26

This shows that you should compare the runtime of the code (you can use
direct.i) compiled by 4.2 with -O1 against 4.5 with -O1 -fforward-propagate.
This is very important; otherwise you're comparing apples to oranges.

fwprop creates excessive register pressure by creating offsets like these in
the loop header:

leaq    -8(%r12), %rsi
leaq    8(%r12), %r10
leaq    -16(%r12), %r9
leaq    -24(%r12), %rbx
leaq    -32(%r12), %rbp
leaq    -40(%r12), %rdi
leaq    -48(%r12), %r11
leaq    40(%r12), %rdx

The additional register pressure then causes the bad scheduling we see in the
fast assembly outputs:

movq    (%rdx), %rax
movsd   (%rax,%r15,2), %xmm7
movq    (%rdi), %r15
movsd   (%rax,%r15,2), %xmm10
movq    (%rbp), %r15
movsd   (%rax,%r15,2), %xmm5
movq    (%rbx), %r15
movsd   (%rax,%r15,2), %xmm6
movq    (%r9), %r15
movsd   (%rax,%r15,2), %xmm15
movq    (%rsi), %r15
movsd   (%rax,%r15,2), %xmm11
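
To see why those hoisted addresses hurt, here is a hypothetical source-level
caricature (not the real testcase) of the two loop shapes being compared.
Precomputing an offset pointer for every access keeps all of those pointers
live across the whole loop body, while folding the offsets back into the
accesses leaves only the base pointer live:

/* Hypothetical illustration only: eight pointers are hoisted in the real
   dump; four are enough to show the register-pressure difference.  */
static double sum_hoisted (const double *base, long n)
{
  const double *p0 = base - 1, *p1 = base + 1, *p2 = base - 2, *p3 = base - 3;
  double s = 0.0;
  for (long i = 0; i < n; i++)
    s += p0[i] + p1[i] + p2[i] + p3[i];   /* p0..p3 all live every iteration */
  return s;
}

/* Same computation with the offsets folded into the addressing; only
   'base' stays live, at the cost of recomputing base+offset per access.  */
static double sum_folded (const double *base, long n)
{
  double s = 0.0;
  for (long i = 0; i < n; i++)
    s += base[i - 1] + base[i + 1] + base[i - 2] + base[i - 3];
  return s;
}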


-- 


http://gcc.gnu.org/bugzilla/show_bug.cgi?id=33928



[Bug tree-optimization/33928] [4.3/4.4/4.5 Regression] 30% performance slowdown in floating-point code caused by r118475

2009-05-06 Thread bonzini at gnu dot org


--- Comment #59 from bonzini at gnu dot org  2009-05-06 09:59 ---
Created an attachment (id=17809)
 --> (http://gcc.gnu.org/bugzilla/attachment.cgi?id=17809&action=view)
usable testcase

Without the includes, as Jakub suggested.


-- 

bonzini at gnu dot org changed:

           What    |Removed                     |Added
----------------------------------------------------------------------------
  Attachment #17808|0                           |1
        is obsolete|                            |


http://gcc.gnu.org/bugzilla/show_bug.cgi?id=33928



[Bug tree-optimization/33928] [4.3/4.4/4.5 Regression] 30% performance slowdown in floating-point code caused by r118475

2009-05-06 Thread bonzini at gnu dot org


--- Comment #60 from bonzini at gnu dot org  2009-05-06 10:47 ---
Actually those are created by -fmove-loop-invariants.  With -O1
-fforward-propagate -fno-move-loop-invariants I get:

                  4.5 -O1 -ffw-prop -fno-move-loop-inv
LOOP 1            183
INNER LOOP 1.1    116
LOOP 2            25

You should be able to get performance close to 4.2 or better with the options
"-O1 -fforward-propagate -fno-move-loop-invariants -fschedule-insns2".  If you
do, this means two things:

1) The bug is in the register pressure estimation of -fmove-loop-invariants,
and it is merely exposed by the fwprop patch.

2) Maybe you should start from -O2 and work backwards, removing the
optimizations that do not help you or that cost too much compilation time,
instead of starting from -O1.


-- 

bonzini at gnu dot org changed:

           What    |Removed                     |Added
----------------------------------------------------------------------------
             Status|NEW                         |WAITING


http://gcc.gnu.org/bugzilla/show_bug.cgi?id=33928