http://gcc.gnu.org/bugzilla/show_bug.cgi?id=55760
Bug #: 55760 Summary: scalar code not using rsqrtss and rcpss Classification: Unclassified Product: gcc Version: 4.8.0 Status: UNCONFIRMED Severity: normal Priority: P3 Component: tree-optimization AssignedTo: unassig...@gcc.gnu.org ReportedBy: vincenzo.innoce...@cern.ch Is there any reason why rsqrtss and rcpss are not used for scalar code while rsqrtps and rcpps are used for loops? cat scalar.cc #include<cmath> void scalar(float& a, float& b) { a = std::sqrt(a); b = 1.f/b; } float v[1024]; float w[1024]; void vector() { for(int i=0;i!=1024;++i) { v[i] = std::sqrt(v[i]); w[i] = 1.f/w[i]; } } c++ -std=c++11 -Ofast -march=corei7 -S scalar.cc -ftree-vectorizer-verbose=1 -ftree-loop-if-convert-stores; cat scalar.s | c++filt scalar(float&, float&): LFB221: sqrtss (%rdi), %xmm0 movss %xmm0, (%rdi) movss LC0(%rip), %xmm0 divss (%rsi), %xmm0 movss %xmm0, (%rsi) ret LFE221: .align 4,0x90 .globl vector() vector(): LFB222: movaps LC1(%rip), %xmm5 leaq void(%rip), %rax xorps %xmm3, %xmm3 movaps LC2(%rip), %xmm4 leaq wchar_t(%rip), %rdx leaq 4096+void(%rip), %rcx .align 4,0x90 L4: movaps (%rax), %xmm1 movaps %xmm3, %xmm2 addq $16, %rax addq $16, %rdx rsqrtps %xmm1, %xmm0 cmpneqps %xmm1, %xmm2 andps %xmm2, %xmm0 mulps %xmm0, %xmm1 mulps %xmm1, %xmm0 mulps %xmm4, %xmm1 addps %xmm5, %xmm0 mulps %xmm1, %xmm0 movaps %xmm0, -16(%rax) movaps -16(%rdx), %xmm1 rcpps %xmm1, %xmm0 mulps %xmm0, %xmm1 mulps %xmm0, %xmm1 addps %xmm0, %xmm0 subps %xmm1, %xmm0 movaps %xmm0, -16(%rdx) cmpq %rcx, %rax jne L4 rep; ret