I triggered this in the inner loop of the CPU emulation code of openMSX (http://openmsx.sf.net/). I tried to reduce the code. Below is the smallest code I could come up with that still shows the problem:
------------------------------------------- struct Clock { void f(); void add(unsigned n) { a += n; } int a; }; struct CPU : Clock { virtual ~CPU(); unsigned char readSlow(); void execute(); void delay() { add(2); } unsigned char readFast() { if (unsigned char* p = ptrs[addr >> 8]) { // fast-path delay(); // ### 1 delay(); // ### 2 return p[addr & 255]; } else { // slow-path return readSlow(); } } typedef void (CPU::*FuncPtr)(); static FuncPtr tab[256]; unsigned char* ptrs[256]; unsigned addr; }; void CPU::execute() { f(); while (true) { unsigned char b = readFast(); delay(); // # 3 (this->*tab[b])(); } } ---------------------------------------- When compiled with SVN revision 128037 on a linux x86_64 machine: > g++ -O3 -S CPU.ii > cat -n CPU.s 1 .file "CPU.ii" 2 .text 3 .align 2 4 .p2align 4,,15 5 .globl _ZN3CPU7executeEv 6 .type _ZN3CPU7executeEv, @function 7 _ZN3CPU7executeEv: 8 .LFB5: 9 pushq %rbp 10 .LCFI0: 11 leaq 8(%rdi), %rbp 12 pushq %rbx 13 .LCFI1: 14 movq %rdi, %rbx 15 movq %rbp, %rdi 16 subq $8, %rsp 17 .LCFI2: 18 call _ZN5Clock1fEv 19 .p2align 4,,10 20 .p2align 3 21 .L6: 22 movl 2064(%rbx), %eax 23 shrl $8, %eax 24 mov %eax, %eax 25 movq 16(%rbx,%rax,8), %rdx 26 testq %rdx, %rdx 27 je .L2 28 movl 8(%rbx), %eax ### 29 addl $2, %eax ### 1 30 movl %eax, (%rbp) ### 31 movl 8(%rbx), %eax ### 32 addl $2, %eax ### 2 33 movl %eax, (%rbp) ### 34 movzbl 2064(%rbx), %eax 35 movzbl (%rdx,%rax), %edx 36 .L3: 37 movl 8(%rbx), %eax # 38 addl $2, %eax # 3 39 movl %eax, (%rbp) # 40 movzbl %dl, %eax 41 salq $4, %rax 42 movq _ZN3CPU3tabE(%rax), %rdx 43 testb $1, %dl 44 jne .L4 45 movq %rbx, %rdi 46 addq _ZN3CPU3tabE+8(%rax), %rdi 47 call *%rdx 48 jmp .L6 49 .p2align 4,,10 50 .p2align 3 51 .L4: 52 movq %rbx, %rdi 53 addq _ZN3CPU3tabE+8(%rax), %rdi 54 movq (%rdi), %rax 55 movq -1(%rdx,%rax), %rdx 56 call *%rdx 57 jmp .L6 58 .L2: 59 movq %rbx, %rdi 60 call _ZN3CPU8readSlowEv 61 movl %eax, %edx 62 .p2align 4,,4 63 .p2align 3 64 jmp .L3 [skipped the rest of the output] The 
missed optimization is visible in lines 28-33. It's also strange to me that reading the variable is done via 8(%rbx) while writing is done via (%rbp). gcc-4.2.1 does a better job on this: it optimizes the two consecutive delay() calls to just: addl $4, 8(%rbx) Additionally I would have preferred that all three delay() calls be collapsed into a single instruction in the fast code path (and partly duplicated as a+=4; readSlow(); a+=2; in the slow path). But I understand this might be more difficult to implement. -- Summary: a+=2; a+=2 not simplified to a+=4; with -O3 (ok with gcc-4.2.1) Product: gcc Version: 4.3.0 Status: UNCONFIRMED Severity: minor Priority: P3 Component: tree-optimization AssignedTo: unassigned at gcc dot gnu dot org ReportedBy: wouter dot vermaelen at scarlet dot be http://gcc.gnu.org/bugzilla/show_bug.cgi?id=33291