http://gcc.gnu.org/bugzilla/show_bug.cgi?id=55760



             Bug #: 55760

           Summary: scalar code not using rsqrtss and rcpss

    Classification: Unclassified

           Product: gcc

           Version: 4.8.0

            Status: UNCONFIRMED

          Severity: normal

          Priority: P3

         Component: tree-optimization

        AssignedTo: unassig...@gcc.gnu.org

        ReportedBy: vincenzo.innoce...@cern.ch





Is there any reason why rsqrtss and rcpss are not used for scalar code, while

rsqrtps and rcpps are used for loops?



cat scalar.cc

#include<cmath>

// Scalar (non-loop) test case: overwrites `a` with its square root and `b`
// with its reciprocal. Both are passed by reference and updated in place.
void scalar(float& a, float& b) {

  a = std::sqrt(a);  // single-float square root

  b = 1.f/b;  // single-float reciprocal, written as a division

}



// Global float arrays of 1024 elements each; vector() updates both in place.
float v[1024];

float w[1024];



// Loop test case: applies the same two operations as scalar() (square root
// and reciprocal), element by element, to the global arrays v and w.
void vector() {

  for(int i=0;i!=1024;++i) {  // 1024 iterations, matching the declared size of v and w

    v[i] = std::sqrt(v[i]);  // in-place square root of each element of v

    w[i] = 1.f/w[i];  // in-place reciprocal of each element of w

  }

}



c++ -std=c++11 -Ofast -march=corei7 -S scalar.cc -ftree-vectorizer-verbose=1 -ftree-loop-if-convert-stores; cat scalar.s | c++filt





scalar(float&, float&):

LFB221:

    sqrtss    (%rdi), %xmm0

    movss    %xmm0, (%rdi)

    movss    LC0(%rip), %xmm0

    divss    (%rsi), %xmm0

    movss    %xmm0, (%rsi)

    ret

LFE221:

    .align 4,0x90

    .globl vector()

vector():

LFB222:

    movaps    LC1(%rip), %xmm5

    leaq    void(%rip), %rax

    xorps    %xmm3, %xmm3

    movaps    LC2(%rip), %xmm4

    leaq    wchar_t(%rip), %rdx

    leaq    4096+void(%rip), %rcx

    .align 4,0x90

L4:

    movaps    (%rax), %xmm1

    movaps    %xmm3, %xmm2

    addq    $16, %rax

    addq    $16, %rdx

    rsqrtps    %xmm1, %xmm0

    cmpneqps    %xmm1, %xmm2

    andps    %xmm2, %xmm0

    mulps    %xmm0, %xmm1

    mulps    %xmm1, %xmm0

    mulps    %xmm4, %xmm1

    addps    %xmm5, %xmm0

    mulps    %xmm1, %xmm0

    movaps    %xmm0, -16(%rax)

    movaps    -16(%rdx), %xmm1

    rcpps    %xmm1, %xmm0

    mulps    %xmm0, %xmm1

    mulps    %xmm0, %xmm1

    addps    %xmm0, %xmm0

    subps    %xmm1, %xmm0

    movaps    %xmm0, -16(%rdx)

    cmpq    %rcx, %rax

    jne    L4

    rep; ret

Reply via email to