On 5/30/2014 9:30 AM, bearophile wrote:
/// Starts from a total of 1.346346 and performs `nSteps` additions,
/// alternating between `p0` and `p1` on successive iterations (beginning
/// with `p0`), then returns the accumulated total.
double plus(in uint nSteps) pure nothrow @safe /*@nogc*/ {
    // Compile-time constants; note that p0 + p1 == 1.00045452.
    enum double p0 = 0.0045;
    enum double p1 = 1.00045452-p0;

    double tot = 1.346346;
    // Selects which constant is added; toggled at the end of every iteration.
    auto b = true;

    foreach (immutable i; 0 .. nSteps) {
        // Both values of `b` are handled explicitly and there is no
        // `default` case.
        final switch (b) {
            case true:
                tot += p0;
                break;
            case false:
                tot += p1;
                break;
        }

        // Flip the selector so the next iteration adds the other constant.
        b = !b;
    }

    return tot;
}

And this is the 32-bit x86 asm generated by ldc2 for the plus function:

# Compiler output for the D function `plus` shown above.
# Register roles as used below: %eax = loop bound, %ecx = loop counter,
# %dl = the boolean selector, %xmm0 = running total.
# NOTE(review): %eax presumably carries `nSteps` on entry -- confirm against
# the calling convention ldc2 uses for this target.
__D4test4plusFNaNbNfxkZd:
     # Prologue: set up the frame, save %esi, align the stack to 8 bytes
     # and reserve 24 bytes of scratch space.
     pushl    %ebp
     movl    %esp, %ebp
     pushl    %esi
     andl    $-8, %esp
     subl    $24, %esp
     # Load the initial total (LCPI0_0 is presumably 1.346346; the constant
     # pool is not shown in this listing).
     movsd    LCPI0_0, %xmm0
     # A zero loop bound skips the loop entirely.
     testl    %eax, %eax
     je    LBB0_8
     # counter = 0, selector = 1 (true).
     xorl    %ecx, %ecx
     movb    $1, %dl
     # Hoist the two addends into registers. %xmm1 is added when the selector
     # bit is clear and %xmm2 when it is set, so these presumably hold p1 and
     # p0 respectively.
     movsd    LCPI0_1, %xmm1
     movsd    LCPI0_2, %xmm2
     .align    16, 0x90
LBB0_2:
     # Loop head: branch on the low bit of the selector.
     testb    $1, %dl
     jne    LBB0_3
     # Selector bit clear: add %xmm1 to the total.
     addsd    %xmm1, %xmm0
     jmp    LBB0_7
     .align    16, 0x90
LBB0_3:
     # Selector bit set. The low bit is tested a second time here; since
     # this label is only reached via the `jne` above, the `je` to the
     # switch-error path below cannot be taken from this point.
     movzbl    %dl, %esi
     andl    $1, %esi
     je    LBB0_5
     # Add %xmm2 to the total.
     addsd    %xmm2, %xmm0
LBB0_7:
     # Toggle the selector, bump the counter, and loop while
     # counter < bound (unsigned compare).
     xorb    $1, %dl
     incl    %ecx
     cmpl    %eax, %ecx
     jb    LBB0_2
LBB0_8:
     # Spill the total to the stack and reload it onto the x87 stack
     # (presumably because the double result is returned in st(0) --
     # confirm against the ABI), then restore the saved registers and return.
     movsd    %xmm0, 8(%esp)
     fldl    8(%esp)
     leal    -4(%ebp), %esp
     popl    %esi
     popl    %ebp
     ret
LBB0_5:
     # Switch-error path: calls __d_switch_error with the module info and
     # the constant 11 (presumably the source line number of the switch).
     movl    $11, 4(%esp)
     movl    $__D4test12__ModuleInfoZ, (%esp)
     calll    __d_switch_error

Bye,
bearophile


Well, I'd argue that in fact neither the C++ nor the D compiler generated
the fastest possible code here, as this code will result in at least 3,
likely more, and potentially even every branch being mispredicted. I would
argue, after checking the throughput numbers for fadd (I only checked
Haswell), that the fastest code here would actually compute both sides
of the branch and use a set of 4 cmov's (because it's x86 and we're
working with doubles) to determine which result we need to use going
forward.

Reply via email to