http://gcc.gnu.org/bugzilla/show_bug.cgi?id=58450
Bug ID: 58450 Summary: -fno-trapping-math causes decrease in performance Product: gcc Version: 4.8.1 Status: UNCONFIRMED Severity: normal Priority: P3 Component: target Assignee: unassigned at gcc dot gnu.org Reporter: ylow at graphlab dot com The program computes the Gregory-Leibniz Pi approximation for 100M iterations. The algorithm is simple: double pi_apx() { double val = 0.0; for (size_t i = 0;i < PI_ITERATIONS; ++i) { if (i % 2 == 0) { val += 4.0 / (2 * i + 1); } else { val -= 4.0 / (2 * i + 1); } } return val; } Compiling with g++ -std=c++11 t.cpp -O3 outputs 3.14159 364339 microseconds And adding the -fno-trapping-math option roughly doubles runtime with output: 3.14159 698326 microseconds Disassembling the output: -O3 only: 0x0000000000400980 <+0>: mov $0x1,%edx 0x0000000000400985 <+5>: xor %eax,%eax 0x0000000000400987 <+7>: xorpd %xmm0,%xmm0 0x000000000040098b <+11>: movsd 0x105(%rip),%xmm1 # 0x400a98 0x0000000000400993 <+19>: jmp 0x4009b4 <_Z6pi_apxv+52> 0x0000000000400995 <+21>: nopl (%rax) 0x0000000000400998 <+24>: movapd %xmm1,%xmm3 0x000000000040099c <+28>: add $0x1,%rax 0x00000000004009a0 <+32>: add $0x2,%rdx 0x00000000004009a4 <+36>: cmp $0x5f5e100,%rax 0x00000000004009aa <+42>: divsd %xmm2,%xmm3 0x00000000004009ae <+46>: addsd %xmm3,%xmm0 0x00000000004009b2 <+50>: je 0x4009d9 <_Z6pi_apxv+89> 0x00000000004009b4 <+52>: test $0x1,%al 0x00000000004009b6 <+54>: cvtsi2sd %rdx,%xmm2 0x00000000004009bb <+59>: je 0x400998 <_Z6pi_apxv+24> 0x00000000004009bd <+61>: movapd %xmm1,%xmm4 0x00000000004009c1 <+65>: add $0x1,%rax 0x00000000004009c5 <+69>: add $0x2,%rdx 0x00000000004009c9 <+73>: cmp $0x5f5e100,%rax 0x00000000004009cf <+79>: divsd %xmm2,%xmm4 0x00000000004009d3 <+83>: subsd %xmm4,%xmm0 0x00000000004009d7 <+87>: jne 0x4009b4 <_Z6pi_apxv+52> 0x00000000004009d9 <+89>: repz retq -O3 -fno-trapping-math: 0x0000000000400980 <+0>: xorpd %xmm1,%xmm1 0x0000000000400984 <+4>: mov $0x1,%edx 0x0000000000400989 <+9>: movsd 0x107(%rip),%xmm3 # 0x400a98 
0x0000000000400991 <+17>: xor %eax,%eax 0x0000000000400993 <+19>: nopl 0x0(%rax,%rax,1) 0x0000000000400998 <+24>: cvtsi2sd %rdx,%xmm0 0x000000000040099d <+29>: test $0x1,%al 0x000000000040099f <+31>: movapd %xmm3,%xmm4 0x00000000004009a3 <+35>: divsd %xmm0,%xmm4 0x00000000004009a7 <+39>: movapd %xmm4,%xmm2 0x00000000004009ab <+43>: addsd %xmm1,%xmm2 0x00000000004009af <+47>: subsd %xmm4,%xmm1 0x00000000004009b3 <+51>: movapd %xmm1,%xmm0 0x00000000004009b7 <+55>: jne 0x4009bd <_Z6pi_apxv+61> 0x00000000004009b9 <+57>: movapd %xmm2,%xmm0 0x00000000004009bd <+61>: add $0x1,%rax 0x00000000004009c1 <+65>: add $0x2,%rdx 0x00000000004009c5 <+69>: cmp $0x5f5e100,%rax 0x00000000004009cb <+75>: movapd %xmm0,%xmm1 0x00000000004009cf <+79>: jne 0x400998 <_Z6pi_apxv+24> 0x00000000004009d1 <+81>: repz retq The optimization options that turn on -fno-trapping-math (-funsafe-math-optimizations and -ffast-math) also produce the slowdown. Interestingly, adding -march=core-avx-i (the native CPU type of my machine) also causes the slowdown, even in combination with -mtune=core-avx-i Disassembly of -O3 -march=core-avx-i 0x0000000000400980 <+0>: mov $0x1,%edx 0x0000000000400985 <+5>: xor %eax,%eax 0x0000000000400987 <+7>: vxorpd %xmm0,%xmm0,%xmm0 0x000000000040098b <+11>: vmovsd 0xf5(%rip),%xmm1 # 0x400a88 0x0000000000400993 <+19>: jmp 0x4009ac <_Z6pi_apxv+44> 0x0000000000400995 <+21>: nopl (%rax) 0x0000000000400998 <+24>: add $0x1,%rax 0x000000000040099c <+28>: vaddsd %xmm2,%xmm0,%xmm0 0x00000000004009a0 <+32>: add $0x2,%rdx 0x00000000004009a4 <+36>: cmp $0x5f5e100,%rax 0x00000000004009aa <+42>: je 0x4009cd <_Z6pi_apxv+77> 0x00000000004009ac <+44>: vcvtsi2sd %rdx,%xmm2,%xmm2 0x00000000004009b1 <+49>: test $0x1,%al 0x00000000004009b3 <+51>: vdivsd %xmm2,%xmm1,%xmm2 0x00000000004009b7 <+55>: je 0x400998 <_Z6pi_apxv+24> 0x00000000004009b9 <+57>: add $0x1,%rax 0x00000000004009bd <+61>: vsubsd %xmm2,%xmm0,%xmm0 0x00000000004009c1 <+65>: add $0x2,%rdx 0x00000000004009c5 <+69>: cmp 
$0x5f5e100,%rax 0x00000000004009cb <+75>: jne 0x4009ac <_Z6pi_apxv+44> 0x00000000004009cd <+77>: repz retq Other Information: gcc -v Using built-in specs. COLLECT_GCC=gcc COLLECT_LTO_WRAPPER=/usr/local/libexec/gcc/x86_64-unknown-linux-gnu/4.8.1/lto-wrapper Target: x86_64-unknown-linux-gnu Configured with: ../gcc-4.8.1/configure --program-suffix=-4.8 --enable-threads --enable-lto --disable-bootstrap --enable-languages=c,c++,go --disable-multilib Thread model: posix gcc version 4.8.1 (GCC) System: OS: Linux Mint 15 CPU: Intel Core i7-3770 @ 3.40 GHz