Mini numerical optimizer in D:
https://github.com/S6Regen/Dopt
I also have this Walsh Hadamard code in D for Linux AMD 64:
// Linux AMD64
extern(C) void hsixteen(float* x,ulong n,float scale){
asm{
naked;
shufps XMM0,XMM0,0;
align 16;
h16:
sub RSI,16;
movups XMM1,[RDI];
movups XMM2,[RDI+16];
movups XMM3,[RDI+2*16];
movups XMM4,[RDI+3*16];
movups XMM5,XMM1;
movups XMM6,XMM3;
haddps XMM1,XMM2;
haddps XMM3,XMM4;
hsubps XMM5,XMM2;
hsubps XMM6,XMM4;
movups XMM2,XMM1;
movups XMM4,XMM3;
haddps XMM1,XMM5;
haddps XMM3,XMM6;
hsubps XMM2,XMM5;
hsubps XMM4,XMM6;
movups XMM5,XMM1;
movups XMM6,XMM3;
haddps XMM1,XMM2;
haddps XMM3,XMM4;
hsubps XMM5,XMM2;
hsubps XMM6,XMM4;
movups XMM2,XMM1;
movups XMM4,XMM5;
addps XMM1,XMM3;
addps XMM5,XMM6;
subps XMM2,XMM3;
subps XMM4,XMM6;
mulps XMM1,XMM0;
mulps XMM5,XMM0;
mulps XMM2,XMM0;
mulps XMM4,XMM0;
movups [RDI],XMM1;
movups [RDI+16],XMM5;
movups [RDI+2*16],XMM2;
movups [RDI+3*16],XMM4;
lea RDI,[RDI+64];
jnz h16;
ret;
}
}
extern(C) void hgap(float* x,ulong gap,ulong n){
asm{
naked;
mov RCX,RSI;
lea R8,[RDI+4*RSI];
shr RDX,1;
align 16;
hgaploop:
sub RCX,16;
movups XMM0,[RDI];
movups XMM1,[RDI+16];
movups XMM2,[RDI+2*16];
movups XMM3,[RDI+3*16];
movups XMM8,[R8];
movups XMM9,[R8+16];
movups XMM10,[R8+2*16];
movups XMM11,[R8+3*16];
movups XMM4,XMM0;
movups XMM5,XMM1;
movups XMM6,XMM2;
movups XMM7,XMM3;
addps XMM0,XMM8;
addps XMM1,XMM9;
addps XMM2,XMM10;
addps XMM3,XMM11;
subps XMM4,XMM8;
subps XMM5,XMM9;
subps XMM6,XMM10;
subps XMM7,XMM11;
movups [RDI],XMM0;
movups [RDI+16],XMM1;
movups [RDI+2*16],XMM2;
movups [RDI+3*16],XMM3;
movups [R8],XMM4;
movups [R8+16],XMM5;
movups [R8+2*16],XMM6;
movups [R8+3*16],XMM7;
lea RDI,[RDI+64];
lea R8,[R8+64];
jnz hgaploop;
sub RDX,RSI;
mov RCX,RSI;
mov RDI,R8;
lea R8,[R8+4*RSI];
jnz hgaploop;
ret;
}
}
void wht(float[] vec){
const ulong lim=8192;
const ulong n=vec.length;
ulong gap,k;
float scale=1f/sqrt(to!float(n));
k=n;
if( k>lim) k=lim;
for(ulong i=0;i<n;i+=lim){
hsixteen(&vec[i],k,scale);
gap=16;
while (gap<k){
hgap(&vec[i],gap,k);
gap+=gap;
}
}
while(gap<n){
hgap(&vec[0],gap,n);
gap+=gap;
}
}
It is the simplest algorithm in computer science least known, as
I like to say.
It actually has many uses. Eg.
https://github.com/FALCONN-LIB/FFHT