[Bug target/58450] New: -fno-trapping-math causes decrease in performance

ylow at graphlab dot com Tue, 17 Sep 2013 13:12:59 -0700

http://gcc.gnu.org/bugzilla/show_bug.cgi?id=58450


            Bug ID: 58450
           Summary: -fno-trapping-math causes decrease in performance
           Product: gcc
           Version: 4.8.1
            Status: UNCONFIRMED
          Severity: normal
          Priority: P3
         Component: target
          Assignee: unassigned at gcc dot gnu.org
          Reporter: ylow at graphlab dot com

The program computes the Gregory-Leibniz Pi approximation for 100M iterations.
The algorithm is simple:

// Approximates pi by summing PI_ITERATIONS terms of the alternating series
// 4/1 - 4/3 + 4/5 - 4/7 + ...  Term i is 4 / (2*i + 1); it is added when i is
// even and subtracted when i is odd.
// NOTE(review): PI_ITERATIONS is defined outside this snippet; the report text
// and the loop bound in the disassembly (0x5f5e100) suggest 100,000,000.
double pi_apx() {
  double val = 0.0;  // running partial sum of the series
  for (size_t i = 0;i < PI_ITERATIONS; ++i) {
    // The two branches differ only in the sign applied to the same quotient.
    if (i % 2 == 0) {
      val += 4.0 / (2 * i + 1);
    } else {
      val -= 4.0 / (2 * i + 1);
    }
  }
  return val;
}


Compiling with g++ -std=c++11 t.cpp -O3 outputs 

  3.14159
  364339 microseconds

And adding the -fno-trapping-math option roughly doubles runtime with output:

  3.14159
  698326 microseconds

Disassembling the output:

-O3 only:
   0x0000000000400980 <+0>:    mov    $0x1,%edx
   0x0000000000400985 <+5>:    xor    %eax,%eax
   0x0000000000400987 <+7>:    xorpd  %xmm0,%xmm0
   0x000000000040098b <+11>:    movsd  0x105(%rip),%xmm1        # 0x400a98
   0x0000000000400993 <+19>:    jmp    0x4009b4 <_Z6pi_apxv+52>
   0x0000000000400995 <+21>:    nopl   (%rax)
   0x0000000000400998 <+24>:    movapd %xmm1,%xmm3
   0x000000000040099c <+28>:    add    $0x1,%rax
   0x00000000004009a0 <+32>:    add    $0x2,%rdx
   0x00000000004009a4 <+36>:    cmp    $0x5f5e100,%rax
   0x00000000004009aa <+42>:    divsd  %xmm2,%xmm3
   0x00000000004009ae <+46>:    addsd  %xmm3,%xmm0
   0x00000000004009b2 <+50>:    je     0x4009d9 <_Z6pi_apxv+89>
   0x00000000004009b4 <+52>:    test   $0x1,%al
   0x00000000004009b6 <+54>:    cvtsi2sd %rdx,%xmm2
   0x00000000004009bb <+59>:    je     0x400998 <_Z6pi_apxv+24>
   0x00000000004009bd <+61>:    movapd %xmm1,%xmm4
   0x00000000004009c1 <+65>:    add    $0x1,%rax
   0x00000000004009c5 <+69>:    add    $0x2,%rdx
   0x00000000004009c9 <+73>:    cmp    $0x5f5e100,%rax
   0x00000000004009cf <+79>:    divsd  %xmm2,%xmm4
   0x00000000004009d3 <+83>:    subsd  %xmm4,%xmm0
   0x00000000004009d7 <+87>:    jne    0x4009b4 <_Z6pi_apxv+52>
   0x00000000004009d9 <+89>:    repz retq 

-O3 -fno-trapping-math:
   0x0000000000400980 <+0>:    xorpd  %xmm1,%xmm1
   0x0000000000400984 <+4>:    mov    $0x1,%edx
   0x0000000000400989 <+9>:    movsd  0x107(%rip),%xmm3        # 0x400a98
   0x0000000000400991 <+17>:    xor    %eax,%eax
   0x0000000000400993 <+19>:    nopl   0x0(%rax,%rax,1)
   0x0000000000400998 <+24>:    cvtsi2sd %rdx,%xmm0
   0x000000000040099d <+29>:    test   $0x1,%al
   0x000000000040099f <+31>:    movapd %xmm3,%xmm4
   0x00000000004009a3 <+35>:    divsd  %xmm0,%xmm4
   0x00000000004009a7 <+39>:    movapd %xmm4,%xmm2
   0x00000000004009ab <+43>:    addsd  %xmm1,%xmm2
   0x00000000004009af <+47>:    subsd  %xmm4,%xmm1
   0x00000000004009b3 <+51>:    movapd %xmm1,%xmm0
   0x00000000004009b7 <+55>:    jne    0x4009bd <_Z6pi_apxv+61>
   0x00000000004009b9 <+57>:    movapd %xmm2,%xmm0
   0x00000000004009bd <+61>:    add    $0x1,%rax
   0x00000000004009c1 <+65>:    add    $0x2,%rdx
   0x00000000004009c5 <+69>:    cmp    $0x5f5e100,%rax
   0x00000000004009cb <+75>:    movapd %xmm0,%xmm1
   0x00000000004009cf <+79>:    jne    0x400998 <_Z6pi_apxv+24>
   0x00000000004009d1 <+81>:    repz retq 

The optimization options that turn on -fno-trapping-math
(-funsafe-math-optimizations and -ffast-math) also produce the slowdown.



Interestingly, adding -march=core-avx-i (the native CPU type of my machine)
also causes the slowdown, even in combination with -mtune=core-avx-i.

Disassembly of -O3 -march=core-avx-i
   0x0000000000400980 <+0>:    mov    $0x1,%edx
   0x0000000000400985 <+5>:    xor    %eax,%eax
   0x0000000000400987 <+7>:    vxorpd %xmm0,%xmm0,%xmm0
   0x000000000040098b <+11>:    vmovsd 0xf5(%rip),%xmm1        # 0x400a88
   0x0000000000400993 <+19>:    jmp    0x4009ac <_Z6pi_apxv+44>
   0x0000000000400995 <+21>:    nopl   (%rax)
   0x0000000000400998 <+24>:    add    $0x1,%rax
   0x000000000040099c <+28>:    vaddsd %xmm2,%xmm0,%xmm0
   0x00000000004009a0 <+32>:    add    $0x2,%rdx
   0x00000000004009a4 <+36>:    cmp    $0x5f5e100,%rax
   0x00000000004009aa <+42>:    je     0x4009cd <_Z6pi_apxv+77>
   0x00000000004009ac <+44>:    vcvtsi2sd %rdx,%xmm2,%xmm2
   0x00000000004009b1 <+49>:    test   $0x1,%al
   0x00000000004009b3 <+51>:    vdivsd %xmm2,%xmm1,%xmm2
   0x00000000004009b7 <+55>:    je     0x400998 <_Z6pi_apxv+24>
   0x00000000004009b9 <+57>:    add    $0x1,%rax
   0x00000000004009bd <+61>:    vsubsd %xmm2,%xmm0,%xmm0
   0x00000000004009c1 <+65>:    add    $0x2,%rdx
   0x00000000004009c5 <+69>:    cmp    $0x5f5e100,%rax
   0x00000000004009cb <+75>:    jne    0x4009ac <_Z6pi_apxv+44>
   0x00000000004009cd <+77>:    repz retq 




Other Information:

gcc -v

Using built-in specs.
COLLECT_GCC=gcc
COLLECT_LTO_WRAPPER=/usr/local/libexec/gcc/x86_64-unknown-linux-gnu/4.8.1/lto-wrapper
Target: x86_64-unknown-linux-gnu
Configured with: ../gcc-4.8.1/configure --program-suffix=-4.8 --enable-threads
--enable-lto --disable-bootstrap --enable-languages=c,c++,go --disable-multilib
Thread model: posix
gcc version 4.8.1 (GCC) 

System:
OS: Linux Mint 15 
CPU: Intel Core i7-3770 @ 3.40 GHz

[Bug target/58450] New: -fno-trapping-math causes decrease in performance

Reply via email to