http://gcc.gnu.org/bugzilla/show_bug.cgi?id=53355
Richard Guenther <rguenth at gcc dot gnu.org> changed:

           What    |Removed                     |Added
----------------------------------------------------------------------------
             Status|UNCONFIRMED                 |ASSIGNED
   Last reconfirmed|                            |2012-05-15
          Component|middle-end                  |tree-optimization
         AssignedTo|unassigned at gcc dot       |rguenth at gcc dot gnu.org
                   |gnu.org                     |
     Ever Confirmed|0                           |1

--- Comment #1 from Richard Guenther <rguenth at gcc dot gnu.org> 2012-05-15 09:40:45 UTC ---
Confirmed. Btw, it does not check for misalignment again but for whether
there is a remaining scalar iteration to be done.

On trunk (for GCC 4.8) we already improve somewhat - but the odd induction
variable choices remain.

foo:
.LFB0:
        .cfi_startproc
        movq    %rdi, %rax
        salq    $60, %rax
        shrq    $63, %rax
        testl   %eax, %eax
        je      .L2
        movsd   (%rdi), %xmm0
        movl    $100000, %r8d
        movsd   .LC0(%rip), %xmm1
        subl    %eax, %r8d
        movl    %r8d, %esi
        movl    $99999, %r11d
        addsd   %xmm1, %xmm0
        shrl    %esi
        movl    %esi, %r9d
        addl    %r9d, %r9d
        movsd   %xmm0, (%rdi)
        je      .L9
        movl    $1, %r10d
.L8:
        movapd  .LC1(%rip), %xmm1
        leaq    (%rdi,%rax,8), %rcx
        xorl    %edx, %edx
        xorl    %eax, %eax
        .p2align 4,,10
        .p2align 3
.L7:
        movapd  (%rcx,%rax), %xmm0
        addl    $1, %edx
        addpd   %xmm1, %xmm0
        movapd  %xmm0, (%rcx,%rax)
        addq    $16, %rax
        cmpl    %esi, %edx
        jb      .L7
        subl    %r9d, %r11d
        cmpl    %r9d, %r8d
        leal    (%r10,%r9), %edx
        je      .L1
        movsd   .LC0(%rip), %xmm1
.L3:
        leal    -1(%r11), %ecx
        movslq  %edx, %rdx
        leaq    0(,%rdx,8), %rax
        leaq    1(%rdx,%rcx), %rdx
        salq    $3, %rdx
        .p2align 4,,10
        .p2align 3
.L6:
        movsd   (%rdi,%rax), %xmm0
        addsd   %xmm1, %xmm0
        movsd   %xmm0, (%rdi,%rax)
        addq    $8, %rax
        cmpq    %rdx, %rax
        jne     .L6
        rep ret
.L1:
        ret
.L2:
        movl    $100000, %r8d
        movl    $50000, %esi
        movl    $100000, %r9d
        movl    $100000, %r11d
        xorl    %r10d, %r10d
        jmp     .L8
.L9:
        movl    $1, %edx
        jmp     .L3

What we also fail to optimize are the entry checks of the vectorized loops
after the prologue loop:

  # BLOCK 2 loop_depth:0 freq:100
  # PRED: ENTRY [100.0%] (fallthru,exec)
  D.1738_23 = (unsigned long) x_2(D);
  D.1739_24 = D.1738_23 & 15;
  D.1740_25 = D.1739_24 >> 3;
  D.1741_26 = -D.1740_25;
  D.1742_27 = (unsigned int) D.1741_26;
  D.1802_18 = D.1741_26 & 1;
  prolog_loop_niters.16_28 = (unsigned int) D.1802_18;
  if (prolog_loop_niters.16_28 == 0)
    goto <bb 12>;
  else
    goto <bb 3>;
  # SUCC: 3 [100.0%] (false,exec) 12 (true,exec)

  # BLOCK 3 loop_depth:0 freq:100
  # PRED: 2 [100.0%] (false,exec)
  x_65 = x_2(D);
  D.1718_37 = MEM[base: x_65, offset: 0B];
  D.1719_38 = D.1718_37 + 2.0e+0;
  MEM[base: x_65, offset: 0B] = D.1719_38;
  prolog_loop_adjusted_niters.18_52 = D.1741_26 & 1;
  niters.19_53 = 100000 - prolog_loop_niters.16_28;
  bnd.20_54 = niters.19_53 >> 1;
  ratio_mult_vf.21_55 = bnd.20_54 << 1;
  if (ratio_mult_vf.21_55 == 0)
    goto <bb 7>;
  else
    goto <bb 4>;
  # SUCC: 4 [100.0%] (false,exec) 7 (true,exec)

We fail to see that ratio_mult_vf.21_55 is never zero. Of course that is
because of the awkward representation of the checks.

I will look into this.