http://gcc.gnu.org/bugzilla/show_bug.cgi?id=59262
Bug ID: 59262 Summary: __attribute__ ((optimize())) broken (and corrupts optimization of the whole compilation unit) Product: gcc Version: 4.9.0 Status: UNCONFIRMED Severity: normal Priority: P3 Component: tree-optimization Assignee: unassigned at gcc dot gnu.org Reporter: vincenzo.innocente at cern dot ch Seen in the latest 4.9, and in 4.8.1 too. Take: cat attribute.cc inline float sum(float x, float y) { return x+y;} #ifdef OPT1 float foo1() __attribute__ ((optimize("O3", "fast-math"))); #endif #ifdef OPT2 float foo2() __attribute__ ((optimize("fast-math"))); #endif #ifdef OPT3 float foo3() __attribute__ ((optimize("O3"))); #endif float x[1024], y[1024]; float foo1() { float ret=0; for (int i=0; i<1024; ++i) ret += sum(x[i],y[i]); return ret; } float foo2() { float ret=0; for (int i=0; i<1024; ++i) ret += sum(x[i],y[i]); return ret; } float foo3() { float ret=0; for (int i=0; i<1024; ++i) ret += sum(x[i],y[i]); return ret; } float bar() { float ret=0; for (int i=0; i<1024; ++i) ret += sum(x[i],y[i]); return ret; } c++ -O2 -ftree-vectorize -S attribute.cc -march=corei7 -DOPT1 ; cat attribute.s .file "attribute.cc" .section .text._Z3sumff,"axG",@progbits,_Z3sumff,comdat .p2align 4,,15 .weak _Z3sumff .type _Z3sumff, @function _Z3sumff: .LFB0: .cfi_startproc addss %xmm1, %xmm0 ret .cfi_endproc .LFE0: .size _Z3sumff, .-_Z3sumff .text .p2align 4,,-1 .globl _Z4foo1v .type _Z4foo1v, @function _Z4foo1v: .LFB1: .cfi_startproc pushq %rbx .cfi_def_cfa_offset 16 .cfi_offset 3, -16 pxor %xmm3, %xmm3 xorl %ebx, %ebx subq $16, %rsp .cfi_def_cfa_offset 32 movss %xmm3, 12(%rsp) .p2align 4,,10 .p2align 3 .L3: movss x(%rbx), %xmm0 addq $4, %rbx movss y-4(%rbx), %xmm1 call _Z3sumff addss 12(%rsp), %xmm0 movss %xmm0, 12(%rsp) cmpq $4096, %rbx jne .L3 addq $16, %rsp .cfi_def_cfa_offset 16 popq %rbx .cfi_def_cfa_offset 8 ret .cfi_endproc .LFE1: .size _Z4foo1v, .-_Z4foo1v .p2align 4,,15 .globl _Z4foo2v .type _Z4foo2v, @function _Z4foo2v: .LFB2: .cfi_startproc xorl %eax, %eax pxor %xmm0, %xmm0 
.p2align 4,,10 .p2align 3 .L8: movaps x(%rax), %xmm1 addq $16, %rax addps y-16(%rax), %xmm1 addps %xmm1, %xmm0 cmpq $4096, %rax jne .L8 haddps %xmm0, %xmm0 haddps %xmm0, %xmm0 addss .LC0(%rip), %xmm0 ret .cfi_endproc .LFE2: .size _Z4foo2v, .-_Z4foo2v .p2align 4,,15 .globl _Z4foo3v .type _Z4foo3v, @function _Z4foo3v: .LFB3: .cfi_startproc xorl %eax, %eax pxor %xmm0, %xmm0 .p2align 4,,10 .p2align 3 .L11: movaps x(%rax), %xmm1 addq $16, %rax addps y-16(%rax), %xmm1 addps %xmm1, %xmm0 cmpq $4096, %rax jne .L11 haddps %xmm0, %xmm0 haddps %xmm0, %xmm0 addss .LC0(%rip), %xmm0 ret .cfi_endproc .LFE3: .size _Z4foo3v, .-_Z4foo3v .p2align 4,,15 .globl _Z3barv .type _Z3barv, @function _Z3barv: .LFB4: .cfi_startproc xorl %eax, %eax pxor %xmm0, %xmm0 .p2align 4,,10 .p2align 3 .L14: movaps x(%rax), %xmm1 addq $16, %rax addps y-16(%rax), %xmm1 addps %xmm1, %xmm0 cmpq $4096, %rax jne .L14 haddps %xmm0, %xmm0 haddps %xmm0, %xmm0 addss .LC0(%rip), %xmm0 ret .cfi_endproc .LFE4: .size _Z3barv, .-_Z3barv .globl y .bss .align 32 .type y, @object .size y, 4096 y: .zero 4096 .globl x .align 32 .type x, @object .size x, 4096 x: .zero 4096 .section .rodata.cst4,"aM",@progbits,4 .align 4 .LC0: .long 0 .ident "GCC: (GNU) 4.9.0 20131110 (experimental) [trunk revision 204623]" .section .note.GNU-stack,"",@progbits c++ -O2 -ftree-vectorize -S attribute.cc -march=corei7 -DOPT2 ; cat attribute.s .file "attribute.cc" .text .p2align 4,,15 .globl _Z4foo1v .type _Z4foo1v, @function _Z4foo1v: .LFB1: .cfi_startproc xorl %eax, %eax pxor %xmm0, %xmm0 .p2align 4,,10 .p2align 3 .L2: movaps x(%rax), %xmm1 addq $16, %rax addps y-16(%rax), %xmm1 addps %xmm1, %xmm0 cmpq $4096, %rax jne .L2 haddps %xmm0, %xmm0 haddps %xmm0, %xmm0 addss .LC0(%rip), %xmm0 ret .cfi_endproc .LFE1: .size _Z4foo1v, .-_Z4foo1v .p2align 4,,-1 .globl _Z4foo2v .type _Z4foo2v, @function _Z4foo2v: .LFB2: .cfi_startproc xorl %eax, %eax pxor %xmm0, %xmm0 .p2align 4,,10 .p2align 3 .L6: movss x(%rax), %xmm1 addq $4, %rax addss y-4(%rax), %xmm1 
addss %xmm1, %xmm0 cmpq $4096, %rax jne .L6 ret .cfi_endproc .LFE2: .size _Z4foo2v, .-_Z4foo2v .p2align 4,,15 .globl _Z4foo3v .type _Z4foo3v, @function _Z4foo3v: .LFB3: .cfi_startproc xorl %eax, %eax pxor %xmm0, %xmm0 .p2align 4,,10 .p2align 3 .L9: movaps x(%rax), %xmm1 addq $16, %rax addps y-16(%rax), %xmm1 addps %xmm1, %xmm0 cmpq $4096, %rax jne .L9 haddps %xmm0, %xmm0 haddps %xmm0, %xmm0 addss .LC0(%rip), %xmm0 ret .cfi_endproc .LFE3: .size _Z4foo3v, .-_Z4foo3v .p2align 4,,15 .globl _Z3barv .type _Z3barv, @function _Z3barv: .LFB4: .cfi_startproc xorl %eax, %eax pxor %xmm0, %xmm0 .p2align 4,,10 .p2align 3 .L12: movaps x(%rax), %xmm1 addq $16, %rax addps y-16(%rax), %xmm1 addps %xmm1, %xmm0 cmpq $4096, %rax jne .L12 haddps %xmm0, %xmm0 haddps %xmm0, %xmm0 addss .LC0(%rip), %xmm0 ret .cfi_endproc .LFE4: .size _Z3barv, .-_Z3barv .globl y .bss .align 32 .type y, @object .size y, 4096 y: .zero 4096 .globl x .align 32 .type x, @object .size x, 4096 x: .zero 4096 .section .rodata.cst4,"aM",@progbits,4 .align 4 .LC0: .long 0 .ident "GCC: (GNU) 4.9.0 20131110 (experimental) [trunk revision 204623]" .section .note.GNU-stack,"",@progbits [innocent@vinavx2 bugs48]$ c++ -O2 -ftree-vectorize -S attribute.cc -march=corei7 -DOPT3 ; cat attribute.s .file "attribute.cc" .section .text._Z3sumff,"axG",@progbits,_Z3sumff,comdat .p2align 4,,15 .weak _Z3sumff .type _Z3sumff, @function _Z3sumff: .LFB0: .cfi_startproc addss %xmm1, %xmm0 ret .cfi_endproc .LFE0: .size _Z3sumff, .-_Z3sumff .text .p2align 4,,15 .globl _Z4foo1v .type _Z4foo1v, @function _Z4foo1v: .LFB1: .cfi_startproc xorl %eax, %eax pxor %xmm0, %xmm0 .p2align 4,,10 .p2align 3 .L3: movss y(%rax), %xmm1 addq $4, %rax addss x-4(%rax), %xmm1 addss %xmm1, %xmm0 cmpq $4096, %rax jne .L3 ret .cfi_endproc .LFE1: .size _Z4foo1v, .-_Z4foo1v .p2align 4,,15 .globl _Z4foo2v .type _Z4foo2v, @function _Z4foo2v: .LFB2: .cfi_startproc xorl %eax, %eax pxor %xmm0, %xmm0 .p2align 4,,10 .p2align 3 .L7: movss y(%rax), %xmm1 addq $4, %rax addss 
x-4(%rax), %xmm1 addss %xmm1, %xmm0 cmpq $4096, %rax jne .L7 ret .cfi_endproc .LFE2: .size _Z4foo2v, .-_Z4foo2v .p2align 4,,-1 .globl _Z4foo3v .type _Z4foo3v, @function _Z4foo3v: .LFB3: .cfi_startproc pushq %rbx .cfi_def_cfa_offset 16 .cfi_offset 3, -16 pxor %xmm3, %xmm3 xorl %ebx, %ebx subq $16, %rsp .cfi_def_cfa_offset 32 movss %xmm3, 12(%rsp) .p2align 4,,10 .p2align 3 .L10: movss x(%rbx), %xmm0 addq $4, %rbx movss y-4(%rbx), %xmm1 call _Z3sumff addss 12(%rsp), %xmm0 movss %xmm0, 12(%rsp) cmpq $4096, %rbx jne .L10 addq $16, %rsp .cfi_def_cfa_offset 16 popq %rbx .cfi_def_cfa_offset 8 ret .cfi_endproc .LFE3: .size _Z4foo3v, .-_Z4foo3v .p2align 4,,15 .globl _Z3barv .type _Z3barv, @function _Z3barv: .LFB4: .cfi_startproc xorl %eax, %eax pxor %xmm0, %xmm0 .p2align 4,,10 .p2align 3 .L14: movss y(%rax), %xmm1 addq $4, %rax addss x-4(%rax), %xmm1 addss %xmm1, %xmm0 cmpq $4096, %rax jne .L14 ret .cfi_endproc .LFE4: .size _Z3barv, .-_Z3barv .globl y .bss .align 32 .type y, @object .size y, 4096 y: .zero 4096 .globl x .align 32 .type x, @object .size x, 4096 x: .zero 4096 .ident "GCC: (GNU) 4.9.0 20131110 (experimental) [trunk revision 204623]" .section .note.GNU-stack,"",@progbits Notice how float foo1() __attribute__ ((optimize("O3", "fast-math"))); manages to vectorize foo2, foo3 and bar while preventing inlining in foo1 itself... float foo2() __attribute__ ((optimize("fast-math"))); instead vectorizes all the others BUT foo2.