On Thursday, 25 February 2021 at 11:28:14 UTC, z wrote:
...
It seems that using static foreach with pointer parameters exclusively is the best way to "guide" LDC into optimizing the code. (Using arr1[] += arr2[] slice syntax resulted in worse performance for me; a sketch of that variant follows the assembly listing below.) However, AVX512 support seems limited to making the 16 additional YMM registers available, rather than reusing the same code template with ZMM registers and doubled offsets to take advantage of the wider size. Compiled with «-g -enable-unsafe-fp-math -enable-no-infs-fp-math -ffast-math -O -release -mcpu=skylake»:
alias simdf = float[8];
enum simdlength = 8; //element count of simdf
import std.math : sqrt;

__gshared simdf init = [0f,0f,0f,0f,0f,0f,0f,0f];

extern(C) //with extern(D) (the default), the assembly output uses one register for two pointers.
void vEUCLIDpsptr_void(simdf* a0, simdf* a1, simdf* a2, simdf* a3, simdf* b1, simdf* b2, simdf* b3) {
        simdf amm0 = init;//returned simdf
        simdf amm1 = *a1;
        simdf amm2 = *a2;
        simdf amm3 = *a3;
        static foreach(size_t i; 0..simdlength) {
//Needs to be interleaved like this, otherwise LDC generates worse code.
                amm1[i] -= (*b1)[i];
                amm1[i] *= amm1[i];
                amm2[i] -= (*b2)[i];
                amm2[i] *= amm2[i];
                amm3[i] -= (*b3)[i];
                amm3[i] *= amm3[i];
                amm0[i] += amm1[i];
                amm0[i] += amm2[i];
                amm0[i] += amm3[i];
                amm0[i] = sqrt(amm0[i]);
        }
        *a0 = amm0;
        return;
}

mov r10,qword ptr ss:[rsp+38]
mov r11,qword ptr ss:[rsp+30]
mov rax,qword ptr ss:[rsp+28]
vmovups ymm0,yword ptr ds:[rdx]
vmovups ymm1,yword ptr ds:[r8]
vsubps ymm0,ymm0,yword ptr ds:[rax]
vmovups ymm2,yword ptr ds:[r9]
vfmadd213ps ymm0,ymm0,yword ptr ds:[<_D12euclideandst4initG8f>]
vsubps ymm1,ymm1,yword ptr ds:[r11]
vfmadd213ps ymm1,ymm1,ymm0
vsubps ymm0,ymm2,yword ptr ds:[r10]
vfmadd213ps ymm0,ymm0,ymm1
vsqrtps ymm0,ymm0
vmovups yword ptr ds:[rcx],ymm0
vzeroupper
ret
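
For reference, this is roughly what the slower slice-operation variant mentioned above looks like (a sketch only; the function name is mine, and simdf, init and sqrt are as defined above):

extern(C)
void vEUCLIDpsslice(simdf* a0, simdf* a1, simdf* a2, simdf* a3, simdf* b1, simdf* b2, simdf* b3) { //hypothetical name
        simdf amm0 = init;
        simdf amm1 = *a1;
        simdf amm2 = *a2;
        simdf amm3 = *a3;
        amm1[] -= (*b1)[];
        amm1[] *= amm1[];
        amm2[] -= (*b2)[];
        amm2[] *= amm2[];
        amm3[] -= (*b3)[];
        amm3[] *= amm3[];
        amm0[] += amm1[];
        amm0[] += amm2[];
        amm0[] += amm3[];
        foreach(ref e; amm0) //no elementwise sqrt for slices, so loop over the result
                e = sqrt(e);
        *a0 = amm0;
}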

The speed difference is nearly 400% for the same number of distances compared with the single-distance function example. However, the generated assembly isn't the fastest: for example, removing vzeroupper and using the unused, known-zeroed YMM15 register as the zero-filled register operand for the first vfmadd213ps instruction improves performance by 10% (70 vs 78 ms for 256 million distances). The function can be improved further with pointer offsets and more registers; this is more efficient and results in a ~500% improvement:
extern(C)
void vEUCLIDpsptr_void_40(simdf* a0, simdf* a1, simdf* a2, simdf* a3, simdf* b1, simdf* b2, simdf* b3) {
        simdf amm0 = init;
        simdf amm1 = *a1;
        simdf amm2 = *a2;
        simdf amm3 = *a3;
        simdf emm0 = init;
        simdf emm1 = amm1;
        simdf emm2 = amm2;//mirror AMM for positions
        simdf emm3 = amm3;
        simdf imm0 = init;
        simdf imm1 = emm1;
        simdf imm2 = emm2;
        simdf imm3 = emm3;
        simdf omm0 = init;
        simdf omm1 = emm1;
        simdf omm2 = emm2;
        simdf omm3 = emm3;
        simdf umm0 = init;
        simdf umm1 = omm1;
        simdf umm2 = omm2;
        simdf umm3 = omm3;
//cascading assignment may not be the fastest way, especially compared to just loading from the pointer!

        static foreach(size_t i; 0..simdlength) {
                amm1[i] -= (b1[0])[i];
                amm1[i] *= amm1[i];
                amm0[i] += amm1[i];

                amm2[i] -= (b2[0])[i];
                amm2[i] *= amm2[i];
                amm0[i] += amm2[i];

                amm3[i] -= (b3[0])[i];
                amm3[i] *= amm3[i];
                amm0[i] += amm3[i];

                amm0[i] = sqrt(amm0[i]);
                //template
                emm1[i] -= (b1[1])[i];
                emm1[i] *= emm1[i];
                emm0[i] += emm1[i];

                emm2[i] -= (b2[1])[i];
                emm2[i] *= emm2[i];
                emm0[i] += emm2[i];

                emm3[i] -= (b3[1])[i];
                emm3[i] *= emm3[i];
                emm0[i] += emm3[i];

                emm0[i] = sqrt(emm0[i]);
                //
                imm1[i] -= (b1[2])[i];
                imm1[i] *= imm1[i];
                imm0[i] += imm1[i];

                imm2[i] -= (b2[2])[i];
                imm2[i] *= imm2[i];
                imm0[i] += imm2[i];

                imm3[i] -= (b3[2])[i];
                imm3[i] *= imm3[i];
                imm0[i] += imm3[i];

                imm0[i] = sqrt(imm0[i]);
                //
                omm1[i] -= (b1[3])[i];
                omm1[i] *= omm1[i];
                omm0[i] += omm1[i];

                omm2[i] -= (b2[3])[i];
                omm2[i] *= omm2[i];
                omm0[i] += omm2[i];

                omm3[i] -= (b3[3])[i];
                omm3[i] *= omm3[i];
                omm0[i] += omm3[i];

                omm0[i] = sqrt(omm0[i]);
                //
                umm1[i] -= (b1[4])[i];
                umm1[i] *= umm1[i];
                umm0[i] += umm1[i];

                umm2[i] -= (b2[4])[i];
                umm2[i] *= umm2[i];
                umm0[i] += umm2[i];

                umm3[i] -= (b3[4])[i];
                umm3[i] *= umm3[i];
                umm0[i] += umm3[i];

                umm0[i] = sqrt(umm0[i]);

        }

//Sadly, writing to the arrays from within the static foreach causes LDC to not use SIMD at all
        *a0 = amm0;
        *(a0+1) = emm0;
        *(a0+2) = imm0;
        *(a0+3) = omm0;
        *(a0+4) = umm0;
        return;

}

sub rsp,38
vmovaps xmmword ptr ss:[rsp+20],xmm8
vmovaps xmmword ptr ss:[rsp+10],xmm7
vmovaps xmmword ptr ss:[rsp],xmm6
mov r10,qword ptr ss:[rsp+70]
mov r11,qword ptr ss:[rsp+68]
mov rax,qword ptr ss:[rsp+60]
vmovaps ymm1,yword ptr ds:[<_D12euclideandst4initG8f>]
vmovups ymm3,yword ptr ds:[rdx]
vmovups ymm2,yword ptr ds:[r8]
vmovups ymm0,yword ptr ds:[r9]
vsubps ymm4,ymm3,yword ptr ds:[rax]
vfmadd213ps ymm4,ymm4,ymm1
vsubps ymm5,ymm2,yword ptr ds:[r11]
vfmadd213ps ymm5,ymm5,ymm4
vsubps ymm4,ymm0,yword ptr ds:[r10]
vfmadd213ps ymm4,ymm4,ymm5
vsqrtps ymm4,ymm4
vsubps ymm5,ymm3,yword ptr ds:[rax+20]
vfmadd213ps ymm5,ymm5,ymm1
vsubps ymm6,ymm2,yword ptr ds:[r11+20]
vfmadd213ps ymm6,ymm6,ymm5
vsubps ymm5,ymm0,yword ptr ds:[r10+20]
vfmadd213ps ymm5,ymm5,ymm6
vsqrtps ymm5,ymm5
vsubps ymm6,ymm3,yword ptr ds:[rax+40]
vsubps ymm7,ymm2,yword ptr ds:[r11+40]
vfmadd213ps ymm6,ymm6,ymm1
vfmadd213ps ymm7,ymm7,ymm6
vsubps ymm6,ymm0,yword ptr ds:[r10+40]
vfmadd213ps ymm6,ymm6,ymm7
vsqrtps ymm6,ymm6
vsubps ymm7,ymm3,yword ptr ds:[rax+60]
vfmadd213ps ymm7,ymm7,ymm1
vsubps ymm8,ymm2,yword ptr ds:[r11+60]
vfmadd213ps ymm8,ymm8,ymm7
vsubps ymm7,ymm0,yword ptr ds:[r10+60]
vfmadd213ps ymm7,ymm7,ymm8
vsqrtps ymm7,ymm7
vsubps ymm3,ymm3,yword ptr ds:[rax+80]
vfmadd213ps ymm3,ymm3,ymm1
vsubps ymm1,ymm2,yword ptr ds:[r11+80]
vfmadd213ps ymm1,ymm1,ymm3
vsubps ymm0,ymm0,yword ptr ds:[r10+80]
vfmadd213ps ymm0,ymm0,ymm1
vsqrtps ymm0,ymm0
vmovups yword ptr ds:[rcx],ymm4
vmovups yword ptr ds:[rcx+20],ymm5
vmovups yword ptr ds:[rcx+40],ymm6
vmovups yword ptr ds:[rcx+60],ymm7
vmovups yword ptr ds:[rcx+80],ymm0
vmovaps xmm6,xmmword ptr ss:[rsp]
vmovaps xmm7,xmmword ptr ss:[rsp+10]
vmovaps xmm8,xmmword ptr ss:[rsp+20]
add rsp,38
vzeroupper
ret

This is slightly more efficient (54 vs 78 ms for the same number of distances, 256 million) but still far from perfect. I also do not know what it is doing with the XMM registers, given that I can safely remove those instructions with a debugger and it doesn't crash or corrupt data (sometimes it was doing this for 0x100 bytes).
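
A minimal timing harness along these lines can reproduce this kind of measurement (a sketch under my own assumptions about buffer setup and iteration count, not the exact harness behind the numbers above; the name benchmark and the buffers are invented here):

import std.datetime.stopwatch : StopWatch, AutoStart;
import std.stdio : writeln;

void benchmark() { //hypothetical harness
        enum totalDistances = 256_000_000;
        enum perCall = simdlength * 5; //5 result vectors of 8 floats per call
        simdf[5] results;
        simdf x = init, y = init, z = init;
        simdf[5] bx, by, bz; //b1..b3 are indexed 0..4 inside the function; contents don't matter for timing
        auto sw = StopWatch(AutoStart.yes);
        foreach (_; 0 .. totalDistances / perCall)
                vEUCLIDpsptr_void_40(results.ptr, &x, &y, &z, bx.ptr, by.ptr, bz.ptr);
        writeln(sw.peek.total!"msecs", " ms");
}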
