On Thursday, 25 February 2021 at 11:28:14 UTC, z wrote:
...
It seems that using static foreach with pointer parameters
exclusively is the best way to "guide" LDC into optimizing
code.(using arr1[] += arr2[] syntax resulted in worse performance
for me.)
However, AVX512 support seems limited to being able to use the 16
other YMM registers, rather than using the same code template but
changed to use ZMM registers and double the offsets to take
advantage of the new size.
Compiled with «-g -enable-unsafe-fp-math -enable-no-infs-fp-math
-ffast-math -O -release -mcpu=skylake» :
__gshared simdf init = [0f,0f,0f,0f,0f,0f,0f,0f];
alias simdf = float[8]
extern(C)//with extern(D)(the default), the assembly output uses
one register for two pointers.
void vEUCLIDpsptr_void(simdf* a0, simdf* a1, simdf* a2, simdf*
a3, simdf* b1, simdf* b2, simdf* b3) {
simdf amm0 = init;//returned simdf
simdf amm1 = *a1;
simdf amm2 = *a2;
simdf amm3 = *a3;
static foreach(size_t i; 0..simdlength) {
//Needs to be interleaved like this, otherwise LDC generates
worse code.
amm1[i] -= (*b1)[i];
amm1[i] *= amm1[i];
amm2[i] -= (*b2)[i];
amm2[i] *= amm2[i];
amm3[i] -= (*b3)[i];
amm3[i] *= amm3[i];
amm0[i] += amm1[i];
amm0[i] += amm2[i];
amm0[i] += amm3[i];
amm0[i] = sqrt(amm0[i]);
}
*a0 = amm0;
return;
}
mov r10,qword ptr ss:[rsp+38]
mov r11,qword ptr ss:[rsp+30]
mov rax,qword ptr ss:[rsp+28]
vmovups ymm0,yword ptr ds:[rdx]
vmovups ymm1,yword ptr ds:[r8]
vsubps ymm0,ymm0,yword ptr ds:[rax]
vmovups ymm2,yword ptr ds:[r9]
vfmadd213ps ymm0,ymm0,yword ptr ds:[<_D12euclideandst4initG8f>]
vsubps ymm1,ymm1,yword ptr ds:[r11]
vfmadd213ps ymm1,ymm1,ymm0
vsubps ymm0,ymm2,yword ptr ds:[r10]
vfmadd213ps ymm0,ymm0,ymm1
vsqrtps ymm0,ymm0
vmovups yword ptr ds:[rcx],ymm0
vzeroupper ret
The speed difference is near 400% for the same amount of
distances compared with the single distance function example.
However, the assembly generated isn't the fastest, for example
removing vzeroupper and using the unused and known-zeroed YMM15
register as a zero-filled register operand for the first
vfmadd213ps instruction improves performance by 10%(70 vs 78ms
for 256 million distances...)
The function can then be improved further to use pointer offsets
and more registers, this is more efficient and results in a 500%~
improvement :
extern(C)
void vEUCLIDpsptr_void_40(simdf* a0, simdf* a1, simdf* a2,
simdf* a3, simdf* b1, simdf* b2, simdf* b3) {
simdf amm0 = init;
simdf amm1 = *a1;
simdf amm2 = *a2;
simdf amm3 = *a3;
simdf emm0 = init;
simdf emm1 = amm1;
simdf emm2 = amm2;//mirror AMM for positions
simdf emm3 = amm3;
simdf imm0 = init;
simdf imm1 = emm1;
simdf imm2 = emm2;
simdf imm3 = emm3;
simdf omm0 = init;
simdf omm1 = emm1;
simdf omm2 = emm2;
simdf omm3 = emm3;
simdf umm0 = init;
simdf umm1 = omm1;
simdf umm2 = omm2;
simdf umm3 = omm3;
//cascading assignment may not be the fastest way, especially
compared to just loading from the pointer!
static foreach(size_t i; 0..simdlength) {
amm1[i] -= (b1[0])[i];
amm1[i] *= amm1[i];
amm0[i] += amm1[i];
amm2[i] -= (b2[0])[i];
amm2[i] *= amm2[i];
amm0[i] += amm2[i];
amm3[i] -= (b3[0])[i];
amm3[i] *= amm3[i];
amm0[i] += amm3[i];
amm0[i] = sqrt(amm0[i]);
//template
emm1[i] -= (b1[1])[i];
emm1[i] *= emm1[i];
emm0[i] += emm1[i];
emm2[i] -= (b2[1])[i];
emm2[i] *= emm2[i];
emm0[i] += emm2[i];
emm3[i] -= (b3[1])[i];
emm3[i] *= emm3[i];
emm0[i] += emm3[i];
emm0[i] = sqrt(emm0[i]);
//
imm1[i] -= (b1[2])[i];
imm1[i] *= imm1[i];
imm0[i] += imm1[i];
imm2[i] -= (b2[2])[i];
imm2[i] *= imm2[i];
imm0[i] += imm2[i];
imm3[i] -= (b3[2])[i];
imm3[i] *= imm3[i];
imm0[i] += imm3[i];
imm0[i] = sqrt(imm0[i]);
//
omm1[i] -= (b1[3])[i];
omm1[i] *= omm1[i];
omm0[i] += omm1[i];
omm2[i] -= (b2[3])[i];
omm2[i] *= omm2[i];
omm0[i] += omm2[i];
omm3[i] -= (b3[3])[i];
omm3[i] *= omm3[i];
omm0[i] += omm3[i];
omm0[i] = sqrt(omm0[i]);
//
umm1[i] -= (b1[4])[i];
umm1[i] *= umm1[i];
umm0[i] += umm1[i];
umm2[i] -= (b2[4])[i];
umm2[i] *= umm2[i];
umm0[i] += umm2[i];
umm3[i] -= (b3[4])[i];
umm3[i] *= umm3[i];
umm0[i] += umm3[i];
umm0[i] = sqrt(umm0[i]);
}
//Sadly, writing to the arrays from within the static foreach
causes LDC to not use SIMD at all
*a0 = amm0;
*(a0+1) = emm0;
*(a0+2) = imm0;
*(a0+3) = omm0;
*(a0+4) = umm0;
return;
}
sub rsp,38
vmovaps xmmword ptr ss:[rsp+20],xmm8
vmovaps xmmword ptr ss:[rsp+10],xmm7
vmovaps xmmword ptr ss:[rsp],xmm6
mov r10,qword ptr ss:[rsp+70]
mov r11,qword ptr ss:[rsp+68]
mov rax,qword ptr ss:[rsp+60]
vmovaps ymm1,yword ptr ds:[<_D12euclideandst4initG8f>]
vmovups ymm3,yword ptr ds:[rdx]
vmovups ymm2,yword ptr ds:[r8]
vmovups ymm0,yword ptr ds:[r9]
vsubps ymm4,ymm3,yword ptr ds:[rax]
vfmadd213ps ymm4,ymm4,ymm1
vsubps ymm5,ymm2,yword ptr ds:[r11]
vfmadd213ps ymm5,ymm5,ymm4
vsubps ymm4,ymm0,yword ptr ds:[r10]
vfmadd213ps ymm4,ymm4,ymm5
vsqrtps ymm4,ymm4
vsubps ymm5,ymm3,yword ptr ds:[rax+20]
vfmadd213ps ymm5,ymm5,ymm1
vsubps ymm6,ymm2,yword ptr ds:[r11+20]
vfmadd213ps ymm6,ymm6,ymm5
vsubps ymm5,ymm0,yword ptr ds:[r10+20]
vfmadd213ps ymm5,ymm5,ymm6
vsqrtps ymm5,ymm5
vsubps ymm6,ymm3,yword ptr ds:[rax+40]
vsubps ymm7,ymm2,yword ptr ds:[r11+40]
vfmadd213ps ymm6,ymm6,ymm1
vfmadd213ps ymm7,ymm7,ymm6
vsubps ymm6,ymm0,yword ptr ds:[r10+40]
vfmadd213ps ymm6,ymm6,ymm7
vsqrtps ymm6,ymm6
vsubps ymm7,ymm3,yword ptr ds:[rax+60]
vfmadd213ps ymm7,ymm7,ymm1
vsubps ymm8,ymm2,yword ptr ds:[r11+60]
vfmadd213ps ymm8,ymm8,ymm7
vsubps ymm7,ymm0,yword ptr ds:[r10+60]
vfmadd213ps ymm7,ymm7,ymm8
vsqrtps ymm7,ymm7
vsubps ymm3,ymm3,yword ptr ds:[rax+80]
vfmadd213ps ymm3,ymm3,ymm1
vsubps ymm1,ymm2,yword ptr ds:[r11+80]
vfmadd213ps ymm1,ymm1,ymm3
vsubps ymm0,ymm0,yword ptr ds:[r10+80]
vfmadd213ps ymm0,ymm0,ymm1
vsqrtps ymm0,ymm0
vmovups yword ptr ds:[rcx],ymm4
vmovups yword ptr ds:[rcx+20],ymm5
vmovups yword ptr ds:[rcx+40],ymm6
vmovups yword ptr ds:[rcx+60],ymm7
vmovups yword ptr ds:[rcx+80],ymm0
vmovaps xmm6,xmmword ptr ss:[rsp]
vmovaps xmm7,xmmword ptr ss:[rsp+10]
vmovaps xmm8,xmmword ptr ss:[rsp+20]
add rsp,38
vzeroupper ret
This is slightly more efficient(54 vs 78ms for the same amount of
distances(256 million)) but still far from perfect, also i do not
know what it is doing with XMM registers knowing i can safely
remove the instructions with a debugger and it doesn't crash or
corrupt data.(sometimes it was doing this for 0x100 bytes)