On 5/30/2014 9:30 AM, bearophile wrote:
// Starts from 1.346346 and performs nSteps additions, alternating between the
// increments p0 and p1 (p0 on the first iteration), then returns the total.
// When nSteps is 0 the loop body never runs and 1.346346 is returned unchanged.
double plus(in uint nSteps) pure nothrow @safe /*@nogc*/ {
// Compile-time constants: the two increments; p1 is defined relative to p0.
enum double p0 = 0.0045;
enum double p1 = 1.00045452-p0;
// Running total that is returned at the end.
double tot = 1.346346;
// Selects which increment the current iteration adds; flipped after every step.
auto b = true;
foreach (immutable i; 0 .. nSteps) {
// Both bool values are listed explicitly and there is no default case.
final switch (b) {
case true:
tot += p0;
break;
case false:
tot += p1;
break;
}
// Alternate the branch taken on the next iteration.
b = !b;
}
return tot;
}
And this is the 32-bit x86 asm generated by ldc2 for the plus function:
# Compiler-generated 32-bit x86 code (AT&T syntax) for the D function `plus`.
# NOTE(review): eax appears to carry nSteps on entry, and LCPI0_0/1/2 are
# constant-pool entries whose values are not shown here -- presumably the
# initial total, p1 and p0 respectively; confirm against the full listing.
__D4test4plusFNaNbNfxkZd:
# Prologue: set up the frame, save esi, align the stack to 8 bytes and
# reserve 24 bytes of scratch space.
pushl %ebp
movl %esp, %ebp
pushl %esi
andl $-8, %esp
subl $24, %esp
# xmm0 is the running sum for the whole function.
movsd LCPI0_0, %xmm0
# Skip the loop entirely when the count in eax is zero.
testl %eax, %eax
je LBB0_8
# ecx = loop counter starting at 0; dl = the boolean selector, starting at 1.
xorl %ecx, %ecx
movb $1, %dl
# Keep both addends in registers for the duration of the loop.
movsd LCPI0_1, %xmm1
movsd LCPI0_2, %xmm2
# Pad to a 16-byte boundary with 0x90 bytes before the loop head.
.align 16, 0x90
LBB0_2:
# Loop head: branch on bit 0 of the selector.
testb $1, %dl
jne LBB0_3
# Selector is 0: add xmm1 to the sum and go to the common loop tail.
addsd %xmm1, %xmm0
jmp LBB0_7
.align 16, 0x90
LBB0_3:
# Selector bit 0 is set. The bit is tested a second time here; since dl has not
# changed since the test at LBB0_2, the je below cannot be taken on this path.
# It leads to the switch-error call at LBB0_5.
movzbl %dl, %esi
andl $1, %esi
je LBB0_5
# Selector is 1: add xmm2 to the sum.
addsd %xmm2, %xmm0
LBB0_7:
# Common tail: flip the selector, bump the counter and loop while
# ecx < eax (unsigned comparison).
xorb $1, %dl
incl %ecx
cmpl %eax, %ecx
jb LBB0_2
LBB0_8:
# Epilogue: spill the sum to the stack and reload it onto the x87 stack
# (presumably because this calling convention returns double in st(0) --
# confirm), then restore esp to the saved-esi slot at ebp-4 and unwind.
movsd %xmm0, 8(%esp)
fldl 8(%esp)
leal -4(%ebp), %esp
popl %esi
popl %ebp
ret
LBB0_5:
# Switch-error path: calls __d_switch_error with the module's ModuleInfo and
# the constant 11 (presumably a source line number -- confirm). No code follows
# the call, so it is presumably not expected to return.
movl $11, 4(%esp)
movl $__D4test12__ModuleInfoZ, (%esp)
calll __d_switch_error
Bye,
bearophile
Well, I'd argue that in fact neither the C++ nor the D version compiled to the
fastest possible code here, as this code will result in at least 3,
likely more, and potentially even all, of its branches being mispredicted. I would
argue, after checking the throughput numbers for fadd (I only checked
Haswell), that the fastest code here would actually compute both sides
of the branch and use a set of 4 cmovs (because it's x86 and
we're working with doubles) to select the result we need to
carry forward.