It seems I found a better solution hidden in the docs:

/**
 * Transforms the coordinate pair `xy` using SSE2 integer instructions
 * (only the X86 variant is visible here; other variants are elided).
 *
 * Per the author's inline comments, the intended computation is
 *     (([A,B,C,D] * ([x,y] + [sX,sY] - [x_0,y_0])) >> 16) + [x_0,y_0]
 * where sX, sY, xy0, ac and bd are members read through `this`.
 * The `>> 16` suggests 16 fractional bits of fixed-point precision
 * (presumably 16.16 -- confirm against the field definitions).
 *
 * NOTE(review): the block is `naked`, so no prologue/epilogue is generated;
 *   EBX is overwritten and never restored -- confirm against the D x86 ABI
 *   whether the callee must preserve it.
 * NOTE(review): the result is left in XMM0 at `ret`; confirm that this is
 *   how the caller expects an int[2] to be returned.
 * NOTE(review): `movq` loads only 64 bits of ac/bd, while the comments
 *   describe [A,0,C,0] / [0,B,0,D] layouts -- confirm the field layout.
 */
@nogc protected int[2] transformFunc(int[2] xy){
version(X86){
asm @nogc{
        naked;
        mov             EBX, this;
        // Build the scroll offset vector from sX and sY.
        // NOTE(review): MOVSS with a memory source clears the upper lanes of
        // the destination register -- verify that sX survives this sequence.
        movd            XMM1, sX[EBX];
        pslldq          XMM1, 4;
        movss           XMM1, sY[EBX];
        movq            XMM0, xy;
        paddd           XMM0, XMM1;     // [x,y] + [sX,sY]
        movq            XMM3, xy0[EBX];
        psubd           XMM0, XMM3;     // ([x,y] + [sX,sY] - [x_0,y_0])
        movq            XMM1, ac[EBX];
        movq            XMM2, bd[EBX];
        pmuludq         XMM1, XMM0;     // [A,0,C,0] * ([x,y] + [sX,sY] - [x_0,y_0])
        psrlq           XMM1, 16;       // ([A,0,C,0] * ([x,y] + [sX,sY] - [x_0,y_0]))>>16
        // Rotate the difference vector by one dword so the other coordinate
        // lands in the lanes PMULUDQ reads.
        movups          XMM4, XMM0;
        psrldq          XMM4, 4;
        pslldq          XMM0, 4;
        por             XMM4, XMM0;
        pmuludq         XMM2, XMM4;     // [0,B,0,D] * ([x,y] + [sX,sY] - [x_0,y_0])
        psrlq           XMM2, 16;       // ([0,B,0,D] * ([x,y] + [sX,sY] - [x_0,y_0]))>>16
        paddq           XMM1, XMM2;     // ([A,B,C,D] * ([x,y] + [sX,sY] - [x_0,y_0]))>>16
        // Widen [x_0,y_0] to 64-bit lanes by interleaving with zeros; XMM7 is
        // cleared explicitly because nothing else in this function sets it.
        pxor            XMM7, XMM7;
        punpckldq       XMM3, XMM7;
        paddq           XMM1, XMM3;     // ([A,B,C,D] * ([x,y] + [sX,sY] - [x_0,y_0]))>>16 + [x_0,y_0]
        movups          XMM0, XMM1;     // Convert 64 bit vectors into 32 bit ones
        psrldq          XMM0, 4;
        por             XMM0, XMM1;
        ret             ;
}
}(...)
}

Reply via email to