The code is right. If possible, I suggest doing "dec srcq" before the loop, so that [srcq - 1] can be changed to [srcq]; that is faster and one byte shorter.
At 2013-11-12 23:06:55,[email protected] wrote: ># HG changeset patch ># User Nabajit Deka ># Date 1384268074 -19800 ># Tue Nov 12 20:24:34 2013 +0530 ># Node ID c9851effbce88c9a70f712fbfaf7e83616c5615f ># Parent 968f6df6d50f70d2a4cf569a8c0426f65d927b00 >asm: routines for chroma hps filter functions for 2xN, 4xN, 6x8 and 12x16 >block sizes. > >diff -r 968f6df6d50f -r c9851effbce8 source/common/x86/ipfilter8.asm >--- a/source/common/x86/ipfilter8.asm Tue Nov 12 17:34:19 2013 +0530 >+++ b/source/common/x86/ipfilter8.asm Tue Nov 12 20:24:34 2013 +0530 >@@ -3417,3 +3417,186 @@ > FILTER_VER_CHROMA_SP_W8_H2 8, 8 > FILTER_VER_CHROMA_SP_W8_H2 8, 16 > FILTER_VER_CHROMA_SP_W8_H2 8, 32 >+ >+%macro PROCESS_CHROMA_W2 3 >+ movh %2, [srcq - 1] >+ pshufb %2, %2, Tm0 >+ movh %1, [srcq + srcstrideq - 1] >+ pshufb %1, %1, Tm0 >+ punpcklqdq %2, %1 >+ pmaddubsw %2, coef2 >+ phaddw %2, %2 >+ psubw %2, %3 >+ movd [dstq], %2 >+ pshufd %2, %2, 1 >+ movd [dstq + dststrideq], %2 >+%endmacro >+ >+;------------------------------------------------------------------------------------------------------------- >+; void interp_4tap_horiz_ps_2x%2(pixel *src, intptr_t srcStride, int16_t >*dst, intptr_t dstStride, int coeffIdx) >+;------------------------------------------------------------------------------------------------------------- >+%macro FILTER_HORIZ_CHROMA_2xN 2 >+INIT_XMM sse4 >+cglobal interp_4tap_horiz_ps_%1x%2, 4, 6, 6, src, srcstride, dst, dststride >+%define coef2 m5 >+%define Tm0 m4 >+%define Tm1 m3 >+%define t2 m2 >+%define t1 m1 >+%define t0 m0 >+ >+mov r4d, r4m >+add dststrided, dststrided >+ >+%ifdef PIC >+lea r5, [tab_ChromaCoeff] >+movd coef2, [r5 + r4 * 4] >+%else >+movd coef2, [tab_ChromaCoeff + r4 * 4] >+%endif >+ >+pshufd coef2, coef2, 0 >+mova t2, [tab_c_8192] >+mova Tm0, [tab_Tm] >+ >+%rep %2/2 >+PROCESS_CHROMA_W2 t0, t1, t2 >+lea srcq, [srcq + srcstrideq * 2] >+lea dstq, [dstq + dststrideq * 2] >+%endrep >+ >+RET >+%endmacro >+ >+FILTER_HORIZ_CHROMA_2xN 2, 4 
>+FILTER_HORIZ_CHROMA_2xN 2, 8 >+ >+ >+%macro PROCESS_CHROMA_W4 3 >+ movh %2, [srcq - 1] >+ pshufb %2, %2, Tm0 >+ pmaddubsw %2, coef2 >+ movh %1, [srcq + srcstrideq - 1] >+ pshufb %1, %1, Tm0 >+ pmaddubsw %1, coef2 >+ phaddw %2, %1 >+ psubw %2, %3 >+ movlps [dstq], %2 >+ movhps [dstq + dststrideq], %2 >+%endmacro >+ >+;------------------------------------------------------------------------------------------------------------- >+; void interp_4tap_horiz_ps_4x%2(pixel *src, intptr_t srcStride, int16_t >*dst, intptr_t dstStride, int coeffIdx) >+;------------------------------------------------------------------------------------------------------------- >+%macro FILTER_HORIZ_CHROMA_4xN 2 >+INIT_XMM sse4 >+cglobal interp_4tap_horiz_ps_%1x%2, 4, 6, 6, src, srcstride, dst, dststride >+%define coef2 m5 >+%define Tm0 m4 >+%define Tm1 m3 >+%define t2 m2 >+%define t1 m1 >+%define t0 m0 >+ >+mov r4d, r4m >+add dststrided, dststrided >+ >+%ifdef PIC >+lea r5, [tab_ChromaCoeff] >+movd coef2, [r5 + r4 * 4] >+%else >+movd coef2, [tab_ChromaCoeff + r4 * 4] >+%endif >+ >+pshufd coef2, coef2, 0 >+mova t2, [tab_c_8192] >+mova Tm0, [tab_Tm] >+ >+%rep %2/2 >+PROCESS_CHROMA_W4 t0, t1, t2 >+lea srcq, [srcq + srcstrideq * 2] >+lea dstq, [dstq + dststrideq * 2] >+%endrep >+ >+RET >+%endmacro >+ >+FILTER_HORIZ_CHROMA_4xN 4, 2 >+FILTER_HORIZ_CHROMA_4xN 4, 4 >+FILTER_HORIZ_CHROMA_4xN 4, 8 >+FILTER_HORIZ_CHROMA_4xN 4, 16 >+ >+ >+%macro PROCESS_CHROMA_W6 3 >+ movu %1, [srcq - 1] >+ pshufb %2, %1, Tm0 >+ pmaddubsw %2, coef2 >+ pshufb %1, %1, Tm1 >+ pmaddubsw %1, coef2 >+ phaddw %2, %1 >+ psubw %2, %3 >+ movh [dstq], %2 >+ pshufd %2, %2, 2 >+ movd [dstq + 8], %2 >+%endmacro >+ >+%macro PROCESS_CHROMA_W12 3 >+ movu %1, [srcq - 1] >+ pshufb %2, %1, Tm0 >+ pmaddubsw %2, coef2 >+ pshufb %1, %1, Tm1 >+ pmaddubsw %1, coef2 >+ phaddw %2, %1 >+ psubw %2, %3 >+ movu [dstq], %2 >+ movu %1, [srcq - 1 + 8] >+ pshufb %1, %1, Tm0 >+ pmaddubsw %1, coef2 >+ phaddw %1, %1 >+ psubw %1, %3 >+ movh [dstq + 16], %1 
>+%endmacro >+ >+;----------------------------------------------------------------------------- >+; void interp_4tap_horiz_ps_6x%2(pixel *src, intptr_t srcStride, pixel *dst, >intptr_t dstStride, int coeffIdx) >+;----------------------------------------------------------------------------- >+%macro FILTER_HORIZ_CHROMA 2 >+INIT_XMM sse4 >+cglobal interp_4tap_horiz_ps_%1x%2, 4, 6, 6, src, srcstride, dst, dststride >+%define coef2 m5 >+%define Tm0 m4 >+%define Tm1 m3 >+%define t2 m2 >+%define t1 m1 >+%define t0 m0 >+ >+mov r4d, r4m >+add dststrided, dststrided >+ >+%ifdef PIC >+lea r5, [tab_ChromaCoeff] >+movd coef2, [r5 + r4 * 4] >+%else >+movd coef2, [tab_ChromaCoeff + r4 * 4] >+%endif >+ >+mov r5d, %2 >+ >+pshufd coef2, coef2, 0 >+mova t2, [tab_c_8192] >+mova Tm0, [tab_Tm] >+mova Tm1, [tab_Tm + 16] >+ >+.loop >+PROCESS_CHROMA_W%1 t0, t1, t2 >+add srcq, srcstrideq >+add dstq, dststrideq >+ >+dec r5d >+jnz .loop >+ >+RET >+%endmacro >+ >+FILTER_HORIZ_CHROMA 6, 8 >+FILTER_HORIZ_CHROMA 12, 16 >_______________________________________________ >x265-devel mailing list >[email protected] >https://mailman.videolan.org/listinfo/x265-devel
_______________________________________________ x265-devel mailing list [email protected] https://mailman.videolan.org/listinfo/x265-devel
