[Bug tree-optimization/38682] New: [4.4 Regression] speed regression with sse intrinsics and -ffast-math

tim at klingt dot org Wed, 31 Dec 2008 02:55:16 -0800

There is a speed regression in gcc-4.4, relative to gcc-4.3, with the following code:

// Adds a linear ramp to a float buffer, four elements per iteration: each
// pass loads four floats from in1, adds the current ramp vector, stores the
// four sums to out, then advances the ramp by 4 * slope.
//
//   out, in1 - destination / source buffers, accessed four floats at a time
//              through _mm_store_ps / _mm_load_ps
//   in2      - first ramp value
//   slope    - increment between successive ramp values
//   n        - element count; only n / 4 whole groups of four are processed,
//              so any n % 4 trailing elements are left untouched
//
// NOTE(review): the do/while body runs before `--loops` is tested, so for
// n < 4 (loops == 0) the unsigned counter wraps around rather than the loop
// being skipped. Presumably callers guarantee n >= 4 -- confirm.
void bench_3(float * out, float * in1, float in2, float slope, unsigned int n)
{
    // Initial ramp vector: in2, in2+slope, in2+2*slope, in2+3*slope, listed
    // here in _mm_set_ps argument order.
    __m128 arg2 = _mm_set_ps(in2, in2+slope, in2+slope+slope,
in2+slope+slope+slope);
    // Per-iteration ramp step: 4 * slope replicated into every lane.
    const __m128 vslope = _mm_set_ps1(slope+slope+slope+slope);


    // Number of four-float groups to process.
    std::size_t loops = n / 4;

    do {
        __m128 arg1 = _mm_load_ps(in1);
        __m128 result = _mm_add_ps(arg1, arg2);
        // Advance the ramp for the next group of four.
        arg2 = _mm_add_ps(arg2, vslope);
        _mm_store_ps(out, result);
        // Step both buffers past the four floats just handled.
        in1+=4;
        out+=4;
    } while (--loops);
}

gcc-4.3 generates the code:
0000000000400f00 <bench_3(float*, float*, float, float, unsigned int)>:
  400f00:       0f 28 e1                movaps %xmm1,%xmm4
  400f03:       c1 ea 02                shr    $0x2,%edx
  400f06:       f3 0f 58 e0             addss  %xmm0,%xmm4
  400f0a:       89 d2                   mov    %edx,%edx
  400f0c:       0f 28 dc                movaps %xmm4,%xmm3
  400f0f:       31 c0                   xor    %eax,%eax
  400f11:       f3 0f 58 d9             addss  %xmm1,%xmm3
  400f15:       0f 14 e0                unpcklps %xmm0,%xmm4
  400f18:       0f 28 d3                movaps %xmm3,%xmm2
  400f1b:       f3 0f 58 d1             addss  %xmm1,%xmm2
  400f1f:       f3 0f 59 0d 79 17 00    mulss  0x1779(%rip),%xmm1        # 4026a0 <_IO_stdin_used+0xa0>
  400f26:       00 
  400f27:       0f 14 d3                unpcklps %xmm3,%xmm2
  400f2a:       0f c6 c9 00             shufps $0x0,%xmm1,%xmm1
  400f2e:       0f 16 d4                movlhps %xmm4,%xmm2
  400f31:       0f 1f 80 00 00 00 00    nopl   0x0(%rax)
  400f38:       0f 28 04 06             movaps (%rsi,%rax,1),%xmm0
  400f3c:       0f 58 c2                addps  %xmm2,%xmm0
  400f3f:       0f 58 d1                addps  %xmm1,%xmm2
  400f42:       0f 29 04 07             movaps %xmm0,(%rdi,%rax,1)
  400f46:       48 83 c0 10             add    $0x10,%rax
  400f4a:       48 ff ca                dec    %rdx
  400f4d:       75 e9                   jne    400f38 <bench_3(float*, float*, float, float, unsigned int)+0x38>
  400f4f:       f3 c3                   repz retq 
  400f51:       66 66 66 66 66 66 2e    nopw   %cs:0x0(%rax,%rax,1)
  400f58:       0f 1f 84 00 00 00 00 
  400f5f:       00 

while gcc-4.4 generates:
0000000000400ea0 <bench_3(float*, float*, float, float, unsigned int)>:
  400ea0:       0f 28 d9                movaps %xmm1,%xmm3
  400ea3:       c1 ea 02                shr    $0x2,%edx
  400ea6:       f3 0f 58 d8             addss  %xmm0,%xmm3
  400eaa:       89 d2                   mov    %edx,%edx
  400eac:       0f 28 d3                movaps %xmm3,%xmm2
  400eaf:       31 c0                   xor    %eax,%eax
  400eb1:       f3 0f 58 d1             addss  %xmm1,%xmm2
  400eb5:       0f 14 d8                unpcklps %xmm0,%xmm3
  400eb8:       0f 28 e2                movaps %xmm2,%xmm4
  400ebb:       f3 0f 58 e1             addss  %xmm1,%xmm4
  400ebf:       f3 0f 59 0d 39 1e 00    mulss  0x1e39(%rip),%xmm1        # 402d00 <_IO_stdin_used+0xa0>
  400ec6:       00 
  400ec7:       0f 28 c4                movaps %xmm4,%xmm0
  400eca:       0f c6 c9 00             shufps $0x0,%xmm1,%xmm1
  400ece:       0f 14 c2                unpcklps %xmm2,%xmm0
  400ed1:       0f 16 c3                movlhps %xmm3,%xmm0
  400ed4:       0f 1f 40 00             nopl   0x0(%rax)
  400ed8:       0f 28 d0                movaps %xmm0,%xmm2
  400edb:       0f 58 c1                addps  %xmm1,%xmm0
  400ede:       0f 58 14 06             addps  (%rsi,%rax,1),%xmm2
  400ee2:       0f 29 14 07             movaps %xmm2,(%rdi,%rax,1)
  400ee6:       48 83 c0 10             add    $0x10,%rax
  400eea:       48 ff ca                dec    %rdx
  400eed:       75 e9                   jne    400ed8 <bench_3(float*, float*, float, float, unsigned int)+0x38>
  400eef:       f3 c3                   repz retq 
  400ef1:       66 66 66 66 66 66 2e    nopw   %cs:0x0(%rax,%rax,1)
  400ef8:       0f 1f 84 00 00 00 00 
  400eff:       00 

The movaps at address 400ec7 is not generated by gcc-4.3. The code generated by
gcc-4.4 runs about 7% slower on a Core 2 (x86_64).

gcc -v:
Using built-in specs.
Target: x86_64-linux-gnu
Configured with: ../gcc-4.4-20081226/configure -v
--with-bugurl=file:///usr/share/doc/gcc-snapshot/README.Bugs
--enable-languages=c,c++ --prefix=/usr/local/lib/gcc-snapshot --enable-shared
--with-system-zlib --disable-nls --enable-clocale=gnu --enable-libstdcxx-debug
--enable-gtk-cairo --disable-plugin --enable-objc-gc --enable-mpfr
--disable-werror --build=x86_64-linux-gnu --host=x86_64-linux-gnu
--target=x86_64-linux-gnu
Thread model: posix
gcc version 4.4.0 20081226 (experimental) (GCC)


-- 
           Summary: [4.4 Regression] speed regression with sse intrinsics
                    and -ffast-math
           Product: gcc
           Version: unknown
            Status: UNCONFIRMED
          Severity: enhancement
          Priority: P3
         Component: tree-optimization
        AssignedTo: unassigned at gcc dot gnu dot org
        ReportedBy: tim at klingt dot org
GCC target triplet: x86_64-linux-gnu


http://gcc.gnu.org/bugzilla/show_bug.cgi?id=38682

[Bug tree-optimization/38682] New: [4.4 Regression] speed regression with sse intrinsics and -ffast-math

Reply via email to