http://gcc.gnu.org/bugzilla/show_bug.cgi?id=55723
             Bug #: 55723
            Summary: SLP vectorization vs loop: SLP more efficient!
     Classification: Unclassified
            Product: gcc
            Version: 4.8.0
             Status: UNCONFIRMED
           Severity: normal
           Priority: P3
          Component: tree-optimization
         AssignedTo: unassig...@gcc.gnu.org
         ReportedBy: vincenzo.innoce...@cern.ch

In the following code, basic-block (SLP) vectorization seems to be more efficient
than standard loop vectorization (I measure the SLP version about 20% faster).
Is the loop vectorization computing the polynomial twice?

gcc version 4.8.0 20121215 (experimental) [trunk revision 194522] (GCC)

cat AtanT.cc
typedef float __attribute__( ( vector_size( 16 ) ) ) float32x4_t;

template<typename Float>
inline Float atan(Float t) {
  constexpr float PIO4F = 0.7853981633974483096f;
  Float z = (t > 0.4142135623730950f) ? (t-1.0f)/(t+1.0f) : t;
  Float z2 = z * z;
  Float ret = ((( 8.05374449538e-2f  * z2
                - 1.38776856032E-1f) * z2
                + 1.99777106478E-1f) * z2
                - 3.33329491539E-1f) * z2 * z
                + z;
  // move back in place
  return ( t > 0.4142135623730950f ) ? ret+PIO4F : ret;
  return ret;
}

float32x4_t va[1024];
float32x4_t vb[1024];
float a[4*1024];
float b[4*1024];

void computeV() {
  for (int i=0;i!=1024;++i) vb[i]=atan(va[i]);
}

//inline
void computeL() {
  for (int i=0;i!=4*1024;++i) b[i]=atan(a[i]);
}

Vincenzos-MacBook-Pro:floatPrec innocent$ c++ -std=c++11 -Ofast -march=corei7 -S AtanT.cc; cat AtanT.s
        .text
        .align 4,0x90
        .globl __Z8computeVv
__Z8computeVv:
LFB1:
        movaps  LC1(%rip), %xmm4
        leaq    _va(%rip), %rcx
        xorl    %eax, %eax
        movaps  LC0(%rip), %xmm10
        leaq    _vb(%rip), %rdx
        movaps  LC2(%rip), %xmm9
        movaps  LC3(%rip), %xmm8
        movaps  LC4(%rip), %xmm7
        movaps  LC5(%rip), %xmm6
        movaps  LC6(%rip), %xmm5
        .align 4,0x90
L3:
        movaps  (%rcx,%rax), %xmm1
        movaps  %xmm1, %xmm3
        movaps  %xmm1, %xmm2
        addps   %xmm4, %xmm3
        subps   %xmm4, %xmm2
        rcpps   %xmm3, %xmm0
        mulps   %xmm0, %xmm3
        mulps   %xmm0, %xmm3
        addps   %xmm0, %xmm0
        subps   %xmm3, %xmm0
        movaps  %xmm1, %xmm3
        mulps   %xmm0, %xmm2
        movaps  %xmm10, %xmm0
        cmpltps %xmm1, %xmm0
        blendvps        %xmm0, %xmm2, %xmm3
        movaps  %xmm3, %xmm2
        mulps   %xmm3, %xmm2
        movaps  %xmm2, %xmm1
        mulps   %xmm9, %xmm1
        subps   %xmm8, %xmm1
        mulps   %xmm2, %xmm1
        addps   %xmm7, %xmm1
        mulps   %xmm2, %xmm1
        subps   %xmm6, %xmm1
        mulps   %xmm2, %xmm1
        addps   %xmm4, %xmm1
        mulps   %xmm3, %xmm1
        movaps  %xmm1, %xmm2
        addps   %xmm5, %xmm2
        blendvps        %xmm0, %xmm2, %xmm1
        movaps  %xmm1, (%rdx,%rax)
        addq    $16, %rax
        cmpq    $16384, %rax
        jne     L3
        rep; ret
LFE1:
        .align 4,0x90
        .globl __Z8computeLv
__Z8computeLv:
LFB2:
        movaps  LC1(%rip), %xmm5
        leaq    _a(%rip), %rcx
        xorl    %eax, %eax
        movaps  LC0(%rip), %xmm11
        leaq    _b(%rip), %rdx
        movaps  LC2(%rip), %xmm9
        movaps  LC7(%rip), %xmm8
        movaps  LC4(%rip), %xmm7
        movaps  LC8(%rip), %xmm6
        movaps  LC6(%rip), %xmm10
        .align 4,0x90
L7:
        movaps  (%rcx,%rax), %xmm0
        movaps  %xmm0, %xmm3
        movaps  %xmm0, %xmm1
        addps   %xmm5, %xmm3
        subps   %xmm5, %xmm1
        rcpps   %xmm3, %xmm2
        mulps   %xmm2, %xmm3
        mulps   %xmm2, %xmm3
        addps   %xmm2, %xmm2
        subps   %xmm3, %xmm2
        movaps  %xmm0, %xmm3
        mulps   %xmm0, %xmm3
        mulps   %xmm2, %xmm1
        movaps  %xmm1, %xmm4
        mulps   %xmm1, %xmm4
        movaps  %xmm4, %xmm2
        mulps   %xmm9, %xmm2
        addps   %xmm8, %xmm2
        mulps   %xmm4, %xmm2
        addps   %xmm7, %xmm2
        mulps   %xmm4, %xmm2
        addps   %xmm6, %xmm2
        mulps   %xmm4, %xmm2
        movaps  %xmm11, %xmm4
        cmpltps %xmm0, %xmm4
        addps   %xmm5, %xmm2
        mulps   %xmm1, %xmm2
        movaps  %xmm3, %xmm1
        mulps   %xmm9, %xmm1
        addps   %xmm10, %xmm2
        addps   %xmm8, %xmm1
        mulps   %xmm3, %xmm1
        addps   %xmm7, %xmm1
        mulps   %xmm3, %xmm1
        addps   %xmm6, %xmm1
        mulps   %xmm3, %xmm1
        addps   %xmm5, %xmm1
        mulps   %xmm0, %xmm1
        movaps  %xmm4, %xmm0
        blendvps        %xmm0, %xmm2, %xmm1
        movaps  %xmm1, (%rdx,%rax)
        addq    $16, %rax
        cmpq    $16384, %rax
        jne     L7
        rep; ret
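
For reference, here is a scalar sketch of what the two loops above seem to compute,
based on my reading of the asm (the helper names poly, atan_slp and atan_loop are
made up for illustration, they are not in the testcase):

inline float poly(float z2) {
  // degree-4 polynomial in z2 from atan() above
  return ((( 8.05374449538e-2f  * z2
           - 1.38776856032E-1f) * z2
           + 1.99777106478E-1f) * z2
           - 3.33329491539E-1f) * z2;
}

constexpr float PIO4F = 0.7853981633974483096f;

// what the SLP-vectorized computeV (loop L3) appears to do:
// select z first, then evaluate the polynomial once on the blended value
inline float atan_slp(float t) {
  bool big = t > 0.4142135623730950f;
  float z  = big ? (t-1.0f)/(t+1.0f) : t;
  float r  = poly(z*z)*z + z;              // one polynomial evaluation
  return big ? r + PIO4F : r;
}

// what the loop-vectorized computeL (loop L7) appears to do:
// evaluate the polynomial on both arms, then blend the results
inline float atan_loop(float t) {
  bool big = t > 0.4142135623730950f;
  float z1 = (t-1.0f)/(t+1.0f);
  float r1 = poly(z1*z1)*z1 + z1 + PIO4F;  // polynomial evaluation #1
  float r2 = poly(t*t)*t + t;              // polynomial evaluation #2
  return big ? r1 : r2;                    // final blendvps
}

If that reading is correct, computeL really does evaluate the polynomial twice per
element (once per arm of the ternary) and only blends at the end, which would be
consistent with the ~20% difference I measure.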