Mini numerical optimizer in D:
https://github.com/S6Regen/Dopt
I also have this Walsh Hadamard code in D for Linux AMD 64:

// Linux AMD64
extern(C) void hsixteen(float* x,ulong n,float scale){
        asm{
                naked;
                shufps XMM0,XMM0,0;
                align 16;
h16:
        sub RSI,16;
        movups XMM1,[RDI];
        movups XMM2,[RDI+16];
        movups XMM3,[RDI+2*16];
        movups XMM4,[RDI+3*16];
        movups XMM5,XMM1;
        movups XMM6,XMM3;
        haddps XMM1,XMM2;
        haddps XMM3,XMM4;
        hsubps XMM5,XMM2;
        hsubps XMM6,XMM4;
        movups XMM2,XMM1;
        movups XMM4,XMM3;
        haddps XMM1,XMM5;
        haddps XMM3,XMM6;
        hsubps XMM2,XMM5;
        hsubps XMM4,XMM6;
        movups XMM5,XMM1;
        movups XMM6,XMM3;
        haddps XMM1,XMM2;
        haddps XMM3,XMM4;
        hsubps XMM5,XMM2;
        hsubps XMM6,XMM4;
        movups XMM2,XMM1;
        movups XMM4,XMM5;
        addps XMM1,XMM3;
        addps XMM5,XMM6;
        subps XMM2,XMM3;
        subps XMM4,XMM6;
        mulps XMM1,XMM0;
        mulps XMM5,XMM0;
        mulps XMM2,XMM0;
        mulps XMM4,XMM0;
        movups [RDI],XMM1;
        movups [RDI+16],XMM5;
        movups [RDI+2*16],XMM2;
        movups [RDI+3*16],XMM4;
        lea RDI,[RDI+64];
        jnz h16;
        ret;
        }
}

extern(C) void hgap(float* x,ulong gap,ulong n){
        asm{
                naked;
                mov RCX,RSI;
                lea R8,[RDI+4*RSI];
                shr RDX,1;
                align 16;       
        hgaploop:
                sub RCX,16;
                movups XMM0,[RDI];
                movups XMM1,[RDI+16];
                movups XMM2,[RDI+2*16];
                movups XMM3,[RDI+3*16];
                movups XMM8,[R8];
                movups XMM9,[R8+16];
                movups XMM10,[R8+2*16];
                movups XMM11,[R8+3*16];
                movups XMM4,XMM0;
                movups XMM5,XMM1;
                movups XMM6,XMM2;
                movups XMM7,XMM3;
                addps XMM0,XMM8;
                addps XMM1,XMM9;
                addps XMM2,XMM10;
                addps XMM3,XMM11;
                subps XMM4,XMM8;
                subps XMM5,XMM9;
                subps XMM6,XMM10;
                subps XMM7,XMM11;
                movups [RDI],XMM0;
                movups [RDI+16],XMM1;
                movups [RDI+2*16],XMM2;
                movups [RDI+3*16],XMM3;
                movups [R8],XMM4;
                movups [R8+16],XMM5;
                movups [R8+2*16],XMM6;
                movups [R8+3*16],XMM7;
                lea RDI,[RDI+64];
                lea R8,[R8+64];
                jnz hgaploop;
                sub RDX,RSI;
                mov RCX,RSI;
                mov RDI,R8;
                lea R8,[R8+4*RSI];
                jnz hgaploop;
                ret;
        }
}
void wht(float[] vec){
           const ulong lim=8192;
           const ulong n=vec.length;
           ulong gap,k;
           float scale=1f/sqrt(to!float(n));
           k=n;
           if( k>lim) k=lim;
           for(ulong i=0;i<n;i+=lim){
                   hsixteen(&vec[i],k,scale);
                   gap=16;
                   while (gap<k){
                          hgap(&vec[i],gap,k);
                          gap+=gap;
                   }
                }
                while(gap<n){
                        hgap(&vec[0],gap,n);
                        gap+=gap;
                }       
}

It is the simplest algorithm in computer science least known, as I like to say.
It actually has many uses. Eg.
https://github.com/FALCONN-LIB/FFHT

Reply via email to