https://gcc.gnu.org/bugzilla/show_bug.cgi?id=91818

            Bug ID: 91818
           Summary: SSE optimization flaw with float vs. double
           Product: gcc
           Version: 9.2.0
            Status: UNCONFIRMED
          Severity: normal
          Priority: P3
         Component: c++
          Assignee: unassigned at gcc dot gnu.org
          Reporter: warp at iki dot fi
  Target Milestone: ---

Consider the following code:

//-------------------------------------------------
#include <cmath>
#include <array>

using Float = std::array<double, 4>;

// Reproducer for the report: returns an array whose element i is
// sqrt(a[i]*a[i] + b[i]*b[i]).
// Both arguments are taken by value and the result is returned by value.
// `Float` is the std::array alias declared just above; the report varies only
// that alias (element type and length) between the compared variants.
Float p(Float a, Float b)
{
    Float result;
    // One iteration per array element; every element of result is written
    // before the return.
    for(unsigned i = 0; i < result.size(); ++i)
        result[i] = std::sqrt(a[i]*a[i] + b[i]*b[i]);
    return result;
}
//-------------------------------------------------

When compiled with gcc 9.2, using -Ofast -march=skylake, it produces the
following result:

//-------------------------------------------------
        push    rbp
        mov     rax, rdi
        mov     rbp, rsp
        vmovupd ymm1, YMMWORD PTR [rbp+48]
        vmovupd ymm0, YMMWORD PTR [rbp+16]
        vmulpd  ymm1, ymm1, ymm1
        vfmadd132pd     ymm0, ymm1, ymm0
        vsqrtpd ymm0, ymm0
        vmovupd YMMWORD PTR [rdi], ymm0
        vzeroupper
        pop     rbp
        ret
//-------------------------------------------------

Besides the surrounding boilerplate (which might or might not be necessary, I'm
not knowledgeable enough to fully understand this), the actual operations are
sensible.

However, consider what happens if we change the type alias to:

using Float = std::array<float, 8>;

One would think the result would be almost identical, yet this is produced:

//-------------------------------------------------
        push    rbp
        vxorps  xmm2, xmm2, xmm2
        mov     rax, rdi
        mov     rbp, rsp
        vmovups ymm1, YMMWORD PTR [rbp+48]
        vmovups ymm0, YMMWORD PTR [rbp+16]
        vmulps  ymm1, ymm1, ymm1
        vfmadd132ps     ymm0, ymm1, ymm0
        vrsqrtps        ymm1, ymm0
        vcmpneqps       ymm2, ymm2, ymm0
        vandps  ymm1, ymm1, ymm2
        vmulps  ymm0, ymm1, ymm0
        vmulps  ymm1, ymm0, ymm1
        vmulps  ymm0, ymm0, YMMWORD PTR .LC1[rip]
        vaddps  ymm1, ymm1, YMMWORD PTR .LC0[rip]
        vmulps  ymm0, ymm1, ymm0
        vmovups YMMWORD PTR [rdi], ymm0
        vzeroupper
        pop     rbp
        ret
.LC0:
        .long   3225419776
        .long   3225419776
        .long   3225419776
        .long   3225419776
        .long   3225419776
        .long   3225419776
        .long   3225419776
        .long   3225419776
.LC1:
        .long   3204448256
        .long   3204448256
        .long   3204448256
        .long   3204448256
        .long   3204448256
        .long   3204448256
        .long   3204448256
        .long   3204448256
//-------------------------------------------------

This is not a consequence of the array having 8 elements (8 loop iterations), as

using Float = std::array<float, 4>;

produces a very similar result.

Note that clang 8.0 produces this (from the <float, 8> version of the code):

//-------------------------------------------------
        mov     rax, rdi
        vmovups ymm0, ymmword ptr [rsp + 8]
        vmulps  ymm0, ymm0, ymm0
        vmovups ymm1, ymmword ptr [rsp + 40]
        vfmadd213ps     ymm1, ymm1, ymm0
        vsqrtps ymm0, ymm1
        vmovups ymmword ptr [rdi], ymm0
        vzeroupper
        ret
//-------------------------------------------------

Reply via email to