[Bug tree-optimization/61747] New: min,max pattern not always properly optimized (for sse4 targets)

vincenzo.innocente at cern dot ch Tue, 08 Jul 2014 06:35:33 -0700

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=61747


            Bug ID: 61747
           Summary: min,max pattern not always properly optimized (for
                    sse4 targets)
           Product: gcc
           Version: 4.9.0
            Status: UNCONFIRMED
          Severity: normal
          Priority: P3
         Component: tree-optimization
          Assignee: unassigned at gcc dot gnu.org
          Reporter: vincenzo.innocente at cern dot ch

I was expecting gcc to substitute a min/max instruction for (a>b) ? a : b and
(a<b) ? a : b, even at -O2.
This is not always the case: only -Ofast provides consistently optimized code
(even if sometimes with a redundant move). -ffinite-math-only makes the code
worse for vector arguments...

cat vmin.cc 
// 16-byte vector of floats (four lanes), declared with GCC's vector_size attribute.
typedef float __attribute__( ( vector_size( 16 ) ) ) float32x4_t;

  // Maximum written as a plain compare-and-select: yields a where a>b, else b.
  // Instantiated below for both float and float32x4_t; this exact ternary form
  // is the pattern the report expects to be turned into a max instruction.
  template<typename V1>
  V1 vmax(V1 a, V1 b) {
    return (a>b) ? a : b;
  }
  // Minimum written as a plain compare-and-select: yields a where a<b, else b.
  // Instantiated below for both float and float32x4_t; this exact ternary form
  // is the pattern the report expects to be turned into a min instruction.
  template<typename V1>
  V1 vmin(V1 a, V1 b) {
    return (a<b) ? a : b;
  }


// Scalar case with three runtime operands: vmin(vmax(a,b),c).
float foo(float a, float b, float c) {
  return vmin(vmax(a,b),c);
}

// Vector case with three runtime operands: vmin(vmax(a,b),c) on float32x4_t.
float32x4_t foo(float32x4_t a, float32x4_t b, float32x4_t c) {
  return vmin(vmax(a,b),c);
}

// Same min/max pattern as foo, but with compile-time constant bounds:
// computes vmin(vmax(a, zt), it) where zt is zero-3.f and it is zero+4.f.
// NOTE(review): for the float32x4_t instantiation this presumably relies on
// GCC's vector extension to broadcast the scalar constants to every lane and
// on zero{0.f} zero-filling the remaining lanes -- confirm against GCC docs.
template<typename Float>
Float bart(Float a) { 
  constexpr Float zero{0.f};
  constexpr Float it = zero+4.f;  // upper bound (4)
  constexpr Float zt = zero-3.f;  // lower bound (-3)
  return vmin(vmax(a,zt),it);
}


// Scalar instantiation of bart (constant-bounds case).
float bar(float a) {
   return bart(a);
}
// Vector instantiation of bart (constant-bounds case).
float32x4_t bar(float32x4_t a) {
   return bart(a);
}

I see
c++ -std=c++11 -O2  -msse4.2 -s vmin.cc -S; cat vmin.s

__Z3foofff:
LFB2:
    maxss    %xmm1, %xmm0
    minss    %xmm2, %xmm0
    ret

__Z3fooDv4_fS_S_:
LFB3:
    maxps    %xmm1, %xmm0
    minps    %xmm2, %xmm0
    ret

__Z3barf:
LFB5:
    ucomiss    LC3(%rip), %xmm0
    jbe    L12
    minss    LC2(%rip), %xmm0
    ret
    .align 4,0x90
L12:
    movss    LC3(%rip), %xmm0
    ret

__Z3barDv4_f:
LFB6:
    movaps    LC5(%rip), %xmm1
    movaps    %xmm0, %xmm2
    movaps    %xmm1, %xmm0
    cmpltps    %xmm2, %xmm0
    blendvps    %xmm0, %xmm2, %xmm1
    movaps    LC6(%rip), %xmm2
    movaps    %xmm1, %xmm0
    cmpltps    %xmm2, %xmm0
    blendvps    %xmm0, %xmm1, %xmm2
    movaps    %xmm2, %xmm0
    ret

-----------------
c++ -std=c++11 -O2  -msse4.2 -s vmin.cc -S -ffinite-math-only; cat vmin.s
__Z3foofff:
LFB2:
    maxss    %xmm0, %xmm1
    minss    %xmm2, %xmm1
    movaps    %xmm1, %xmm0
    ret
__Z3fooDv4_fS_S_:
LFB3:
    maxps    %xmm1, %xmm0
    movaps    %xmm0, %xmm1
    movaps    %xmm2, %xmm0
    cmpleps    %xmm1, %xmm0
    blendvps    %xmm0, %xmm2, %xmm1
    movaps    %xmm1, %xmm0
    ret

__Z3barf:
LFB5:
    maxss    LC2(%rip), %xmm0
    minss    LC3(%rip), %xmm0
    ret

__Z3barDv4_f:
LFB6:
    movaps    LC5(%rip), %xmm1
    movaps    %xmm0, %xmm2
    movaps    %xmm1, %xmm0
    cmpltps    %xmm2, %xmm0
    blendvps    %xmm0, %xmm2, %xmm1
    movaps    LC6(%rip), %xmm2
    movaps    %xmm1, %xmm0
    cmpltps    %xmm2, %xmm0
    blendvps    %xmm0, %xmm1, %xmm2
    movaps    %xmm2, %xmm0
    ret
LFE6:

--------------
finally
c++ -std=c++11 -Ofast  -msse4.2 -s vmin.cc -S; cat vmin.s

__Z3foofff:
LFB2:
    maxss    %xmm0, %xmm1
    minss    %xmm2, %xmm1
    movaps    %xmm1, %xmm0
    ret

__Z3fooDv4_fS_S_:
LFB3:
    maxps    %xmm0, %xmm1
    minps    %xmm2, %xmm1
    movaps    %xmm1, %xmm0
    ret

__Z3barf:
LFB5:
    maxss    LC2(%rip), %xmm0
    minss    LC3(%rip), %xmm0
    ret
__Z3barDv4_f:
LFB6:
    maxps    LC5(%rip), %xmm0
    minps    LC6(%rip), %xmm0
    ret

[Bug tree-optimization/61747] New: min,max pattern not always properly optimized (for sse4 targets)

Reply via email to