http://gcc.gnu.org/bugzilla/show_bug.cgi?id=59262

            Bug ID: 59262
           Summary: __attribute__ ((optimize()))  broken (and corrupts
                    optimization of the whole compilation unit)
           Product: gcc
           Version: 4.9.0
            Status: UNCONFIRMED
          Severity: normal
          Priority: P3
         Component: tree-optimization
          Assignee: unassigned at gcc dot gnu.org
          Reporter: vincenzo.innocente at cern dot ch

Seen with the latest 4.9 trunk; also seen with 4.8.1.
Take the following test case:
cat attribute.cc
// Trivial inline helper returning x + y; every loop below calls it once per
// element, so the listings show whether it was inlined into each caller.
inline float sum(float x, float y) { return x+y;}


// Each -DOPTn macro attaches an optimize attribute to one function only
// (foo1, foo2 or foo3). The report builds with one macro at a time.
#ifdef OPT1
// foo1 only: O3 plus fast-math.
float foo1() __attribute__ ((optimize("O3", "fast-math")));
#endif
#ifdef OPT2
// foo2 only: fast-math.
float foo2() __attribute__ ((optimize("fast-math")));
#endif
#ifdef OPT3
// foo3 only: O3.
float foo3() __attribute__ ((optimize("O3")));
#endif

// Global input arrays read by all four loops below.
float x[1024], y[1024];

// Returns the sum over i of sum(x[i], y[i]) for all 1024 elements.
// Declared with optimize("O3", "fast-math") when built with -DOPT1.
float foo1() {
  float ret=0;
  for (int i=0; i<1024; ++i) 
     ret += sum(x[i],y[i]);
  return ret;
}


// Same loop body as foo1.
// Declared with optimize("fast-math") when built with -DOPT2.
float foo2() {
  float ret=0;
  for (int i=0; i<1024; ++i)
     ret += sum(x[i],y[i]);
  return ret;
}


// Same loop body as foo1.
// Declared with optimize("O3") when built with -DOPT3.
float foo3() {
  float ret=0;
  for (int i=0; i<1024; ++i)
     ret += sum(x[i],y[i]);
  return ret;
}


// Same loop body as foo1/foo2/foo3, but never given an optimize attribute
// under any of the OPTn macros, so it only sees the command-line options.
float bar() {
  float ret=0;
  for (int i=0; i<1024; ++i)
     ret += sum(x[i],y[i]);
  return ret;
}

c++ -O2 -ftree-vectorize -S attribute.cc -march=corei7 -DOPT1 ; cat attribute.s
    .file    "attribute.cc"
    # -DOPT1 build: out-of-line copy of sum(float, float), emitted as a weak
    # symbol in its own comdat section. It adds %xmm1 into %xmm0 and returns.
    .section    .text._Z3sumff,"axG",@progbits,_Z3sumff,comdat
    .p2align 4,,15
    .weak    _Z3sumff
    .type    _Z3sumff, @function
_Z3sumff:
.LFB0:
    .cfi_startproc
    addss    %xmm1, %xmm0
    ret
    .cfi_endproc
.LFE0:
    .size    _Z3sumff, .-_Z3sumff
    # -DOPT1 build: foo1(), the function carrying the optimize attribute.
    # sum() is NOT inlined here: the loop calls _Z3sumff once per element
    # (4 bytes per iteration) and keeps the running total in the stack slot
    # 12(%rsp) across each call.
    # Note the max-skip operand of -1 on the first .p2align below; the other
    # functions in this listing use 15.
    .text
    .p2align 4,,-1
    .globl    _Z4foo1v
    .type    _Z4foo1v, @function
_Z4foo1v:
.LFB1:
    .cfi_startproc
    pushq    %rbx
    .cfi_def_cfa_offset 16
    .cfi_offset 3, -16
    pxor    %xmm3, %xmm3
    xorl    %ebx, %ebx
    subq    $16, %rsp
    .cfi_def_cfa_offset 32
    movss    %xmm3, 12(%rsp)
    .p2align 4,,10
    .p2align 3
.L3:
    movss    x(%rbx), %xmm0
    addq    $4, %rbx
    movss    y-4(%rbx), %xmm1
    call    _Z3sumff
    addss    12(%rsp), %xmm0
    movss    %xmm0, 12(%rsp)
    cmpq    $4096, %rbx
    jne    .L3
    addq    $16, %rsp
    .cfi_def_cfa_offset 16
    popq    %rbx
    .cfi_def_cfa_offset 8
    ret
    .cfi_endproc
.LFE1:
    .size    _Z4foo1v, .-_Z4foo1v
    # -DOPT1 build: foo2(), which has no attribute in this build.
    # sum() is inlined (no call) and the loop is vectorized: each iteration
    # adds 16 bytes of x and y with packed addps; the four partial sums are
    # then reduced with two haddps and .LC0 is added to the result.
    .p2align 4,,15
    .globl    _Z4foo2v
    .type    _Z4foo2v, @function
_Z4foo2v:
.LFB2:
    .cfi_startproc
    xorl    %eax, %eax
    pxor    %xmm0, %xmm0
    .p2align 4,,10
    .p2align 3
.L8:
    movaps    x(%rax), %xmm1
    addq    $16, %rax
    addps    y-16(%rax), %xmm1
    addps    %xmm1, %xmm0
    cmpq    $4096, %rax
    jne    .L8
    haddps    %xmm0, %xmm0
    haddps    %xmm0, %xmm0
    addss    .LC0(%rip), %xmm0
    ret
    .cfi_endproc
.LFE2:
    .size    _Z4foo2v, .-_Z4foo2v
    # -DOPT1 build: foo3(), which has no attribute in this build.
    # sum() is inlined and the loop is vectorized (packed addps, 16 bytes per
    # iteration, followed by a haddps reduction).
    .p2align 4,,15
    .globl    _Z4foo3v
    .type    _Z4foo3v, @function
_Z4foo3v:
.LFB3:
    .cfi_startproc
    xorl    %eax, %eax
    pxor    %xmm0, %xmm0
    .p2align 4,,10
    .p2align 3
.L11:
    movaps    x(%rax), %xmm1
    addq    $16, %rax
    addps    y-16(%rax), %xmm1
    addps    %xmm1, %xmm0
    cmpq    $4096, %rax
    jne    .L11
    haddps    %xmm0, %xmm0
    haddps    %xmm0, %xmm0
    addss    .LC0(%rip), %xmm0
    ret
    .cfi_endproc
.LFE3:
    .size    _Z4foo3v, .-_Z4foo3v
    # -DOPT1 build: bar(), which never carries an attribute.
    # sum() is inlined and the loop is vectorized (packed addps, 16 bytes per
    # iteration, followed by a haddps reduction).
    .p2align 4,,15
    .globl    _Z3barv
    .type    _Z3barv, @function
_Z3barv:
.LFB4:
    .cfi_startproc
    xorl    %eax, %eax
    pxor    %xmm0, %xmm0
    .p2align 4,,10
    .p2align 3
.L14:
    movaps    x(%rax), %xmm1
    addq    $16, %rax
    addps    y-16(%rax), %xmm1
    addps    %xmm1, %xmm0
    cmpq    $4096, %rax
    jne    .L14
    haddps    %xmm0, %xmm0
    haddps    %xmm0, %xmm0
    addss    .LC0(%rip), %xmm0
    ret
    .cfi_endproc
.LFE4:
    .size    _Z3barv, .-_Z3barv
    # -DOPT1 build: storage for the global arrays y and x (4096 zero bytes
    # each in .bss, declared with .align 32), plus the 4-byte constant .LC0
    # (value 0) that the vectorized functions add after their reduction.
    .globl    y
    .bss
    .align 32
    .type    y, @object
    .size    y, 4096
y:
    .zero    4096
    .globl    x
    .align 32
    .type    x, @object
    .size    x, 4096
x:
    .zero    4096
    .section    .rodata.cst4,"aM",@progbits,4
    .align 4
.LC0:
    .long    0
    .ident    "GCC: (GNU) 4.9.0 20131110 (experimental) [trunk revision
204623]"
    .section    .note.GNU-stack,"",@progbits
 c++ -O2 -ftree-vectorize -S attribute.cc -march=corei7 -DOPT2 ; cat
attribute.s
    .file    "attribute.cc"
    # -DOPT2 build: foo1(), which has no attribute in this build.
    # sum() is inlined and the loop is vectorized (packed addps, 16 bytes per
    # iteration, followed by a haddps reduction and the addition of .LC0).
    .text
    .p2align 4,,15
    .globl    _Z4foo1v
    .type    _Z4foo1v, @function
_Z4foo1v:
.LFB1:
    .cfi_startproc
    xorl    %eax, %eax
    pxor    %xmm0, %xmm0
    .p2align 4,,10
    .p2align 3
.L2:
    movaps    x(%rax), %xmm1
    addq    $16, %rax
    addps    y-16(%rax), %xmm1
    addps    %xmm1, %xmm0
    cmpq    $4096, %rax
    jne    .L2
    haddps    %xmm0, %xmm0
    haddps    %xmm0, %xmm0
    addss    .LC0(%rip), %xmm0
    ret
    .cfi_endproc
.LFE1:
    .size    _Z4foo1v, .-_Z4foo1v
    # -DOPT2 build: foo2(), the function carrying optimize("fast-math").
    # sum() is inlined (no call), but the loop is NOT vectorized: it uses
    # scalar movss/addss and advances 4 bytes (one float) per iteration.
    # Note the max-skip operand of -1 on the first .p2align below; the other
    # functions in this listing use 15.
    .p2align 4,,-1
    .globl    _Z4foo2v
    .type    _Z4foo2v, @function
_Z4foo2v:
.LFB2:
    .cfi_startproc
    xorl    %eax, %eax
    pxor    %xmm0, %xmm0
    .p2align 4,,10
    .p2align 3
.L6:
    movss    x(%rax), %xmm1
    addq    $4, %rax
    addss    y-4(%rax), %xmm1
    addss    %xmm1, %xmm0
    cmpq    $4096, %rax
    jne    .L6
    ret
    .cfi_endproc
.LFE2:
    .size    _Z4foo2v, .-_Z4foo2v
    # -DOPT2 build: foo3(), which has no attribute in this build.
    # sum() is inlined and the loop is vectorized (packed addps, 16 bytes per
    # iteration, followed by a haddps reduction).
    .p2align 4,,15
    .globl    _Z4foo3v
    .type    _Z4foo3v, @function
_Z4foo3v:
.LFB3:
    .cfi_startproc
    xorl    %eax, %eax
    pxor    %xmm0, %xmm0
    .p2align 4,,10
    .p2align 3
.L9:
    movaps    x(%rax), %xmm1
    addq    $16, %rax
    addps    y-16(%rax), %xmm1
    addps    %xmm1, %xmm0
    cmpq    $4096, %rax
    jne    .L9
    haddps    %xmm0, %xmm0
    haddps    %xmm0, %xmm0
    addss    .LC0(%rip), %xmm0
    ret
    .cfi_endproc
.LFE3:
    .size    _Z4foo3v, .-_Z4foo3v
    # -DOPT2 build: bar(), which never carries an attribute.
    # sum() is inlined and the loop is vectorized (packed addps, 16 bytes per
    # iteration, followed by a haddps reduction).
    .p2align 4,,15
    .globl    _Z3barv
    .type    _Z3barv, @function
_Z3barv:
.LFB4:
    .cfi_startproc
    xorl    %eax, %eax
    pxor    %xmm0, %xmm0
    .p2align 4,,10
    .p2align 3
.L12:
    movaps    x(%rax), %xmm1
    addq    $16, %rax
    addps    y-16(%rax), %xmm1
    addps    %xmm1, %xmm0
    cmpq    $4096, %rax
    jne    .L12
    haddps    %xmm0, %xmm0
    haddps    %xmm0, %xmm0
    addss    .LC0(%rip), %xmm0
    ret
    .cfi_endproc
.LFE4:
    .size    _Z3barv, .-_Z3barv
    # -DOPT2 build: storage for the global arrays y and x (4096 zero bytes
    # each in .bss, declared with .align 32), plus the 4-byte constant .LC0
    # (value 0) that the vectorized functions add after their reduction.
    .globl    y
    .bss
    .align 32
    .type    y, @object
    .size    y, 4096
y:
    .zero    4096
    .globl    x
    .align 32
    .type    x, @object
    .size    x, 4096
x:
    .zero    4096
    .section    .rodata.cst4,"aM",@progbits,4
    .align 4
.LC0:
    .long    0
    .ident    "GCC: (GNU) 4.9.0 20131110 (experimental) [trunk revision
204623]"
    .section    .note.GNU-stack,"",@progbits

[innocent@vinavx2 bugs48]$ c++ -O2 -ftree-vectorize -S attribute.cc
-march=corei7 -DOPT3 ; cat attribute.s
    .file    "attribute.cc"
    # -DOPT3 build: out-of-line copy of sum(float, float), emitted as a weak
    # symbol in its own comdat section. It adds %xmm1 into %xmm0 and returns.
    .section    .text._Z3sumff,"axG",@progbits,_Z3sumff,comdat
    .p2align 4,,15
    .weak    _Z3sumff
    .type    _Z3sumff, @function
_Z3sumff:
.LFB0:
    .cfi_startproc
    addss    %xmm1, %xmm0
    ret
    .cfi_endproc
.LFE0:
    .size    _Z3sumff, .-_Z3sumff
    # -DOPT3 build: foo1(), which has no attribute in this build.
    # sum() is inlined (no call) and the loop is scalar: movss/addss,
    # advancing 4 bytes (one float) per iteration.
    .text
    .p2align 4,,15
    .globl    _Z4foo1v
    .type    _Z4foo1v, @function
_Z4foo1v:
.LFB1:
    .cfi_startproc
    xorl    %eax, %eax
    pxor    %xmm0, %xmm0
    .p2align 4,,10
    .p2align 3
.L3:
    movss    y(%rax), %xmm1
    addq    $4, %rax
    addss    x-4(%rax), %xmm1
    addss    %xmm1, %xmm0
    cmpq    $4096, %rax
    jne    .L3
    ret
    .cfi_endproc
.LFE1:
    .size    _Z4foo1v, .-_Z4foo1v
    # -DOPT3 build: foo2(), which has no attribute in this build.
    # sum() is inlined (no call) and the loop is scalar: movss/addss,
    # advancing 4 bytes (one float) per iteration.
    .p2align 4,,15
    .globl    _Z4foo2v
    .type    _Z4foo2v, @function
_Z4foo2v:
.LFB2:
    .cfi_startproc
    xorl    %eax, %eax
    pxor    %xmm0, %xmm0
    .p2align 4,,10
    .p2align 3
.L7:
    movss    y(%rax), %xmm1
    addq    $4, %rax
    addss    x-4(%rax), %xmm1
    addss    %xmm1, %xmm0
    cmpq    $4096, %rax
    jne    .L7
    ret
    .cfi_endproc
.LFE2:
    .size    _Z4foo2v, .-_Z4foo2v
    # -DOPT3 build: foo3(), the function carrying optimize("O3").
    # sum() is NOT inlined here: the loop calls _Z3sumff once per element
    # (4 bytes per iteration) and keeps the running total in the stack slot
    # 12(%rsp) across each call.
    # Note the max-skip operand of -1 on the first .p2align below; the other
    # functions in this listing use 15.
    .p2align 4,,-1
    .globl    _Z4foo3v
    .type    _Z4foo3v, @function
_Z4foo3v:
.LFB3:
    .cfi_startproc
    pushq    %rbx
    .cfi_def_cfa_offset 16
    .cfi_offset 3, -16
    pxor    %xmm3, %xmm3
    xorl    %ebx, %ebx
    subq    $16, %rsp
    .cfi_def_cfa_offset 32
    movss    %xmm3, 12(%rsp)
    .p2align 4,,10
    .p2align 3
.L10:
    movss    x(%rbx), %xmm0
    addq    $4, %rbx
    movss    y-4(%rbx), %xmm1
    call    _Z3sumff
    addss    12(%rsp), %xmm0
    movss    %xmm0, 12(%rsp)
    cmpq    $4096, %rbx
    jne    .L10
    addq    $16, %rsp
    .cfi_def_cfa_offset 16
    popq    %rbx
    .cfi_def_cfa_offset 8
    ret
    .cfi_endproc
.LFE3:
    .size    _Z4foo3v, .-_Z4foo3v
    # -DOPT3 build: bar(), which never carries an attribute.
    # sum() is inlined (no call) and the loop is scalar: movss/addss,
    # advancing 4 bytes (one float) per iteration.
    .p2align 4,,15
    .globl    _Z3barv
    .type    _Z3barv, @function
_Z3barv:
.LFB4:
    .cfi_startproc
    xorl    %eax, %eax
    pxor    %xmm0, %xmm0
    .p2align 4,,10
    .p2align 3
.L14:
    movss    y(%rax), %xmm1
    addq    $4, %rax
    addss    x-4(%rax), %xmm1
    addss    %xmm1, %xmm0
    cmpq    $4096, %rax
    jne    .L14
    ret
    .cfi_endproc
.LFE4:
    .size    _Z3barv, .-_Z3barv
    # -DOPT3 build: storage for the global arrays y and x (4096 zero bytes
    # each in .bss, declared with .align 32). Unlike the other two listings,
    # no .LC0 constant is emitted here.
    .globl    y
    .bss
    .align 32
    .type    y, @object
    .size    y, 4096
y:
    .zero    4096
    .globl    x
    .align 32
    .type    x, @object
    .size    x, 4096
x:
    .zero    4096
    .ident    "GCC: (GNU) 4.9.0 20131110 (experimental) [trunk revision
204623]"
    .section    .note.GNU-stack,"",@progbits


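Notice how
float foo1() __attribute__ ((optimize("O3", "fast-math")));
causes foo2, foo3 and bar to be vectorized, while preventing the inlining of sum() in foo1 itself...
float foo2() __attribute__ ((optimize("fast-math")));
instead causes all the other functions to be vectorized, BUT not foo2.
Likewise, with
float foo3() __attribute__ ((optimize("O3")));
sum() is not inlined in foo3 itself, while foo1, foo2 and bar are compiled as scalar loops.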

Reply via email to