There is a speed regression in gcc-4.4 (compared with gcc-4.3) with the following code: void bench_3(float * out, float * in1, float in2, float slope, unsigned int n) { __m128 arg2 = _mm_set_ps(in2, in2+slope, in2+slope+slope, in2+slope+slope+slope); const __m128 vslope = _mm_set_ps1(slope+slope+slope+slope);
std::size_t loops = n / 4; do { __m128 arg1 = _mm_load_ps(in1); __m128 result = _mm_add_ps(arg1, arg2); arg2 = _mm_add_ps(arg2, vslope); _mm_store_ps(out, result); in1+=4; out+=4; } while (--loops); } gcc-4.3 generates the code: 0000000000400f00 <bench_3(float*, float*, float, float, unsigned int)>: 400f00: 0f 28 e1 movaps %xmm1,%xmm4 400f03: c1 ea 02 shr $0x2,%edx 400f06: f3 0f 58 e0 addss %xmm0,%xmm4 400f0a: 89 d2 mov %edx,%edx 400f0c: 0f 28 dc movaps %xmm4,%xmm3 400f0f: 31 c0 xor %eax,%eax 400f11: f3 0f 58 d9 addss %xmm1,%xmm3 400f15: 0f 14 e0 unpcklps %xmm0,%xmm4 400f18: 0f 28 d3 movaps %xmm3,%xmm2 400f1b: f3 0f 58 d1 addss %xmm1,%xmm2 400f1f: f3 0f 59 0d 79 17 00 mulss 0x1779(%rip),%xmm1 # 4026a0 <_IO_stdin_used+0xa0> 400f26: 00 400f27: 0f 14 d3 unpcklps %xmm3,%xmm2 400f2a: 0f c6 c9 00 shufps $0x0,%xmm1,%xmm1 400f2e: 0f 16 d4 movlhps %xmm4,%xmm2 400f31: 0f 1f 80 00 00 00 00 nopl 0x0(%rax) 400f38: 0f 28 04 06 movaps (%rsi,%rax,1),%xmm0 400f3c: 0f 58 c2 addps %xmm2,%xmm0 400f3f: 0f 58 d1 addps %xmm1,%xmm2 400f42: 0f 29 04 07 movaps %xmm0,(%rdi,%rax,1) 400f46: 48 83 c0 10 add $0x10,%rax 400f4a: 48 ff ca dec %rdx 400f4d: 75 e9 jne 400f38 <bench_3(float*, float*, float, float, unsigned int)+0x38> 400f4f: f3 c3 repz retq 400f51: 66 66 66 66 66 66 2e nopw %cs:0x0(%rax,%rax,1) 400f58: 0f 1f 84 00 00 00 00 400f5f: 00 while gcc-4.4 generates: 0000000000400ea0 <bench_3(float*, float*, float, float, unsigned int)>: 400ea0: 0f 28 d9 movaps %xmm1,%xmm3 400ea3: c1 ea 02 shr $0x2,%edx 400ea6: f3 0f 58 d8 addss %xmm0,%xmm3 400eaa: 89 d2 mov %edx,%edx 400eac: 0f 28 d3 movaps %xmm3,%xmm2 400eaf: 31 c0 xor %eax,%eax 400eb1: f3 0f 58 d1 addss %xmm1,%xmm2 400eb5: 0f 14 d8 unpcklps %xmm0,%xmm3 400eb8: 0f 28 e2 movaps %xmm2,%xmm4 400ebb: f3 0f 58 e1 addss %xmm1,%xmm4 400ebf: f3 0f 59 0d 39 1e 00 mulss 0x1e39(%rip),%xmm1 # 402d00 <_IO_stdin_used+0xa0> 400ec6: 00 400ec7: 0f 28 c4 movaps %xmm4,%xmm0 400eca: 0f c6 c9 00 shufps $0x0,%xmm1,%xmm1 400ece: 0f 14 c2 unpcklps %xmm2,%xmm0 
400ed1: 0f 16 c3 movlhps %xmm3,%xmm0 400ed4: 0f 1f 40 00 nopl 0x0(%rax) 400ed8: 0f 28 d0 movaps %xmm0,%xmm2 400edb: 0f 58 c1 addps %xmm1,%xmm0 400ede: 0f 58 14 06 addps (%rsi,%rax,1),%xmm2 400ee2: 0f 29 14 07 movaps %xmm2,(%rdi,%rax,1) 400ee6: 48 83 c0 10 add $0x10,%rax 400eea: 48 ff ca dec %rdx 400eed: 75 e9 jne 400ed8 <bench_3(float*, float*, float, float, unsigned int)+0x38> 400eef: f3 c3 repz retq 400ef1: 66 66 66 66 66 66 2e nopw %cs:0x0(%rax,%rax,1) 400ef8: 0f 1f 84 00 00 00 00 400eff: 00 The movaps at 400ec7 is not generated by gcc-4.3. The code generated by gcc-4.4 runs about 7% slower on a Core 2 (x86_64). gcc -v: Using built-in specs. Target: x86_64-linux-gnu Configured with: ../gcc-4.4-20081226/configure -v --with-bugurl=file:///usr/share/doc/gcc-snapshot/README.Bugs --enable-languages=c,c++ --prefix=/usr/local/lib/gcc-snapshot --enable-shared --with-system-zlib --disable-nls --enable-clocale=gnu --enable-libstdcxx-debug --enable-gtk-cairo --disable-plugin --enable-objc-gc --enable-mpfr --disable-werror --build=x86_64-linux-gnu --host=x86_64-linux-gnu --target=x86_64-linux-gnu Thread model: posix gcc version 4.4.0 20081226 (experimental) (GCC) -- Summary: [4.4 Regression] speed regression with sse intrinsics and -ffast-math Product: gcc Version: unknown Status: UNCONFIRMED Severity: enhancement Priority: P3 Component: tree-optimization AssignedTo: unassigned at gcc dot gnu dot org ReportedBy: tim at klingt dot org GCC target triplet: x86_64-linux-gnu http://gcc.gnu.org/bugzilla/show_bug.cgi?id=38682