http://gcc.gnu.org/bugzilla/show_bug.cgi?id=55723
             Bug #: 55723
            Summary: SLP vectorization vs loop: SLP more efficient!
     Classification: Unclassified
            Product: gcc
            Version: 4.8.0
             Status: UNCONFIRMED
           Severity: normal
           Priority: P3
          Component: tree-optimization
         AssignedTo: unassig...@gcc.gnu.org
         ReportedBy: vincenzo.innoce...@cern.ch

In the following code, basic-block (SLP) vectorization seems to be more efficient
than standard loop vectorization (I measure the SLP version about 20% faster).
Is the loop vectorization computing the polynomial twice?

gcc version 4.8.0 20121215 (experimental) [trunk revision 194522] (GCC)

cat AtanT.cc
typedef float __attribute__( ( vector_size( 16 ) ) ) float32x4_t;

template<typename Float>
inline Float atan(Float t) {
  constexpr float PIO4F = 0.7853981633974483096f;
  Float z = (t > 0.4142135623730950f) ? (t-1.0f)/(t+1.0f) : t;
  Float z2 = z * z;
  Float ret = ((( 8.05374449538e-2f  * z2
                - 1.38776856032E-1f) * z2
                + 1.99777106478E-1f) * z2
                - 3.33329491539E-1f) * z2 * z
                + z;
  // move back in place
  return ( t > 0.4142135623730950f ) ? ret+PIO4F : ret;
  return ret;
}

float32x4_t va[1024];
float32x4_t vb[1024];
float a[4*1024];
float b[4*1024];

void computeV() {
  for (int i=0;i!=1024;++i) vb[i]=atan(va[i]);
}

//inline
void computeL() {
  for (int i=0;i!=4*1024;++i) b[i]=atan(a[i]);
}

Vincenzos-MacBook-Pro:floatPrec innocent$ c++ -std=c++11 -Ofast -march=corei7 -S AtanT.cc; cat AtanT.s
        .text
        .align 4,0x90
        .globl __Z8computeVv
__Z8computeVv:
LFB1:
        movaps  LC1(%rip), %xmm4
        leaq    _va(%rip), %rcx
        xorl    %eax, %eax
        movaps  LC0(%rip), %xmm10
        leaq    _vb(%rip), %rdx
        movaps  LC2(%rip), %xmm9
        movaps  LC3(%rip), %xmm8
        movaps  LC4(%rip), %xmm7
        movaps  LC5(%rip), %xmm6
        movaps  LC6(%rip), %xmm5
        .align 4,0x90
L3:
        movaps  (%rcx,%rax), %xmm1
        movaps  %xmm1, %xmm3
        movaps  %xmm1, %xmm2
        addps   %xmm4, %xmm3
        subps   %xmm4, %xmm2
        rcpps   %xmm3, %xmm0
        mulps   %xmm0, %xmm3
        mulps   %xmm0, %xmm3
        addps   %xmm0, %xmm0
        subps   %xmm3, %xmm0
        movaps  %xmm1, %xmm3
        mulps   %xmm0, %xmm2
        movaps  %xmm10, %xmm0
        cmpltps %xmm1, %xmm0
        blendvps        %xmm0, %xmm2, %xmm3
        movaps  %xmm3, %xmm2
        mulps   %xmm3, %xmm2
        movaps  %xmm2, %xmm1
        mulps   %xmm9, %xmm1
        subps   %xmm8, %xmm1
        mulps   %xmm2, %xmm1
        addps   %xmm7, %xmm1
        mulps   %xmm2, %xmm1
        subps   %xmm6, %xmm1
        mulps   %xmm2, %xmm1
        addps   %xmm4, %xmm1
        mulps   %xmm3, %xmm1
        movaps  %xmm1, %xmm2
        addps   %xmm5, %xmm2
        blendvps        %xmm0, %xmm2, %xmm1
        movaps  %xmm1, (%rdx,%rax)
        addq    $16, %rax
        cmpq    $16384, %rax
        jne     L3
        rep; ret
LFE1:
        .align 4,0x90
        .globl __Z8computeLv
__Z8computeLv:
LFB2:
        movaps  LC1(%rip), %xmm5
        leaq    _a(%rip), %rcx
        xorl    %eax, %eax
        movaps  LC0(%rip), %xmm11
        leaq    _b(%rip), %rdx
        movaps  LC2(%rip), %xmm9
        movaps  LC7(%rip), %xmm8
        movaps  LC4(%rip), %xmm7
        movaps  LC8(%rip), %xmm6
        movaps  LC6(%rip), %xmm10
        .align 4,0x90
L7:
        movaps  (%rcx,%rax), %xmm0
        movaps  %xmm0, %xmm3
        movaps  %xmm0, %xmm1
        addps   %xmm5, %xmm3
        subps   %xmm5, %xmm1
        rcpps   %xmm3, %xmm2
        mulps   %xmm2, %xmm3
        mulps   %xmm2, %xmm3
        addps   %xmm2, %xmm2
        subps   %xmm3, %xmm2
        movaps  %xmm0, %xmm3
        mulps   %xmm0, %xmm3
        mulps   %xmm2, %xmm1
        movaps  %xmm1, %xmm4
        mulps   %xmm1, %xmm4
        movaps  %xmm4, %xmm2
        mulps   %xmm9, %xmm2
        addps   %xmm8, %xmm2
        mulps   %xmm4, %xmm2
        addps   %xmm7, %xmm2
        mulps   %xmm4, %xmm2
        addps   %xmm6, %xmm2
        mulps   %xmm4, %xmm2
        movaps  %xmm11, %xmm4
        cmpltps %xmm0, %xmm4
        addps   %xmm5, %xmm2
        mulps   %xmm1, %xmm2
        movaps  %xmm3, %xmm1
        mulps   %xmm9, %xmm1
        addps   %xmm10, %xmm2
        addps   %xmm8, %xmm1
        mulps   %xmm3, %xmm1
        addps   %xmm7, %xmm1
        mulps   %xmm3, %xmm1
        addps   %xmm6, %xmm1
        mulps   %xmm3, %xmm1
        addps   %xmm5, %xmm1
        mulps   %xmm0, %xmm1
        movaps  %xmm4, %xmm0
        blendvps        %xmm0, %xmm2, %xmm1
        movaps  %xmm1, (%rdx,%rax)
        addq    $16, %rax
        cmpq    $16384, %rax
        jne     L7
        rep; ret
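
For reference, here is a scalar sketch of what the two loops above seem to compute,
based on my reading of the asm (the helper names poly, atan_slp and atan_loop are
made up for illustration, they are not in the testcase):

inline float poly(float z2) {
  // degree-4 polynomial in z2 from atan() above
  return ((( 8.05374449538e-2f  * z2
           - 1.38776856032E-1f) * z2
           + 1.99777106478E-1f) * z2
           - 3.33329491539E-1f) * z2;
}

constexpr float PIO4F = 0.7853981633974483096f;

// what the SLP-vectorized computeV (loop L3) appears to do:
// select z first, then evaluate the polynomial once on the blended value
inline float atan_slp(float t) {
  bool big = t > 0.4142135623730950f;
  float z  = big ? (t-1.0f)/(t+1.0f) : t;
  float r  = poly(z*z)*z + z;              // one polynomial evaluation
  return big ? r + PIO4F : r;
}

// what the loop-vectorized computeL (loop L7) appears to do:
// evaluate the polynomial on both arms, then blend the results
inline float atan_loop(float t) {
  bool big = t > 0.4142135623730950f;
  float z1 = (t-1.0f)/(t+1.0f);
  float r1 = poly(z1*z1)*z1 + z1 + PIO4F;  // polynomial evaluation #1
  float r2 = poly(t*t)*t + t;              // polynomial evaluation #2
  return big ? r1 : r2;                    // final blendvps
}

If that reading is correct, computeL really does evaluate the polynomial twice per
element (once per arm of the ternary) and only blends at the end, which would be
consistent with the ~20% difference I measure.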