This code does not pass the testbench on my PC (VS2008, Windows 7 x64).
At 2014-11-20 11:13:31,[email protected] wrote: ># HG changeset patch ># User David T Yuen <[email protected]> ># Date 1416451149 28800 ># Node ID 37392ba74268210aafa8123d9f7c12d46a22c152 ># Parent d059cfa88f1ac79b319bd8a05bc70704d454f0ba >idct8 sse2 > >Based on the gcc of Debian 4.7.2-5 by the following command > >c++ -S -masm=intel -DX265_ARCH_X86=1 -DX86_64=1 -DHAVE_INT_TYPES_H=1 >-D__STDC_LIMIT_MACROS=1 -DHIGH_BIT_DEPTH=0 -O3 -DNDEBUG >-I/home/shakezula/Development/x265/source/. >-I/home/shakezula/Development/x265/source/Lib >-I/home/shakezula/Development/x265/source/common >-I/home/shakezula/Development/x265/source/encoder >-I/home/shakezula/Development/x265/build/linux -Wall -Wextra -Wshadow -fPIC >-ffast-math -mstackrealign -fno-exceptions -Wno-unused-parameter -msse3 -o >~/Development/dct-sse3.asm -c >/home/shakezula/Development/x265/source/common/vec/dct-sse3.cpp > >It has been tweeked for better register usage with fewer values written to the >stack and better setup of r2 indexing for write > >diff -r d059cfa88f1a -r 37392ba74268 source/common/x86/asm-primitives.cpp >--- a/source/common/x86/asm-primitives.cpp Tue Nov 18 14:11:12 2014 -0600 >+++ b/source/common/x86/asm-primitives.cpp Wed Nov 19 18:39:09 2014 -0800 >@@ -1377,6 +1377,7 @@ > p.dct[DCT_4x4] = x265_dct4_sse2; > p.idct[IDCT_4x4] = x265_idct4_sse2; > p.idct[IDST_4x4] = x265_idst4_sse2; >+ p.idct[IDCT_8x8] = x265_idct8_sse2; > > LUMA_SS_FILTERS(_sse2); > } >@@ -1567,6 +1568,7 @@ > p.ssim_end_4 = x265_pixel_ssim_end4_sse2; > p.dct[DCT_4x4] = x265_dct4_sse2; > p.idct[IDCT_4x4] = x265_idct4_sse2; >+ p.idct[IDCT_8x8] = x265_idct8_sse2; > p.idct[IDST_4x4] = x265_idst4_sse2; > p.planecopy_sp = x265_downShift_16_sse2; > p.copy_shl[BLOCK_4x4] = x265_copy_shl_4_sse2; >diff -r d059cfa88f1a -r 37392ba74268 source/common/x86/dct8.asm >--- a/source/common/x86/dct8.asm Tue Nov 18 14:11:12 2014 -0600 >+++ b/source/common/x86/dct8.asm Wed Nov 19 18:39:09 2014 -0800 >@@ -302,6 +302,19 @@ > > pb_idct8odd: db 2, 3, 
6, 7, 10, 11, 14, 15, 2, 3, 6, 7, 10, 11, 14, 15 > >+tab_idct8: times 4 dw 89, 75 >+ times 4 dw 50, 18 >+ times 4 dw 75, -18 >+ times 4 dw -89, -50 >+ times 4 dw 50, -89 >+ times 4 dw 18, 75 >+ times 4 dw 18, -50 >+ times 4 dw 75, -89 >+ times 4 dw 64, 64 >+ times 4 dw 64, -64 >+ times 4 dw 83, 36 >+ times 4 dw 36, -83 >+ > SECTION .text > cextern pd_1 > cextern pd_2 >@@ -976,6 +989,387 @@ > ;------------------------------------------------------- > ; void idct8(int32_t *src, int16_t *dst, intptr_t stride) > ;------------------------------------------------------- >+INIT_XMM sse2 >+ >+%if BIT_DEPTH == 10 >+ %define IDCT_SHIFT 10 >+ %define IDCT_ADD pd_512 >+%elif BIT_DEPTH == 8 >+ %define IDCT_SHIFT 12 >+ %define IDCT_ADD pd_2048 >+%else >+ %error Unsupported BIT_DEPTH! >+%endif >+ >+cglobal idct8, 3,7, 16 >+ lea r2, [r2 + r2] ;set r2 to index of 1 >+ lea r4, [r2 + r2] ;set r4 to index of 2 >+ lea r3, [r4 + r2] ;set r3 to index of 3 >+ lea r4, [r4 + r3] ;set r4 to index of 5 >+ mova m9, [r0 + 32] >+ packssdw m9, [r0 + 48] >+ movu m1, [r0 + 96] >+ packssdw m1, [r0 + 112] >+ mova m7, m9 >+ punpcklwd m7, m1 >+ punpckhwd m9, m1 >+ mova m14, [tab_idct8] >+ mova m3, m14 >+ pmaddwd m14, m7 >+ pmaddwd m3, m9 >+ mova m0, [r0 + 160] >+ packssdw m0, [r0 + 176] >+ mova m10, [r0 + 224] >+ packssdw m10, [r0 + 240] >+ mova m2, m0 >+ punpcklwd m2, m10 >+ punpckhwd m0, m10 >+ mova m15, [tab_idct8 + 16] >+ mova m11, [tab_idct8 + 16] >+ pmaddwd m15, m2 >+ mova m4, [tab_idct8 + 32] >+ pmaddwd m11, m0 >+ mova m1, [tab_idct8 + 32] >+ paddd m15, m14 >+ mova m5, [tab_idct8 + 64] >+ mova m12, [tab_idct8 + 64] >+ paddd m11, m3 >+ mova [rsp - 72], m11 >+ mova [rsp - 88], m15 >+ pmaddwd m4, m7 >+ pmaddwd m1, m9 >+ mova m14, [tab_idct8 + 48] >+ mova m3, [tab_idct8 + 48] >+ pmaddwd m14, m2 >+ pmaddwd m3, m0 >+ paddd m14, m4 >+ paddd m3, m1 >+ mova [rsp - 40], m3 >+ pmaddwd m5, m9 >+ pmaddwd m9, [tab_idct8 + 96] >+ mova m6, [tab_idct8 + 80] >+ pmaddwd m12, m7 >+ pmaddwd m7, [tab_idct8 + 96] >+ 
mova m4, [tab_idct8 + 80] >+ pmaddwd m6, m2 >+ paddd m6, m12 >+ pmaddwd m2, [tab_idct8 + 112] >+ paddd m7, m2 >+ mova [rsp - 24], m6 >+ pmaddwd m4, m0 >+ pmaddwd m0, [tab_idct8 + 112] >+ paddd m9, m0 >+ paddd m5, m4 >+ mova m6, [r0] >+ packssdw m6, [r0 + 16] >+ mova m0, [r0 + 128] >+ packssdw m0, [r0 + 144] >+ mova m4, m6 >+ mova m12, [r0 + 64] >+ punpcklwd m4, m0 >+ punpckhwd m6, m0 >+ packssdw m12, [r0 + 80] >+ mova m0, [r0 + 192] >+ packssdw m0, [r0 + 208] >+ mova m13, m12 >+ mova m8, [tab_idct8 + 128] >+ punpcklwd m13, m0 >+ mova m10, [tab_idct8 + 128] >+ punpckhwd m12, m0 >+ pmaddwd m8, m4 >+ mova m3, m8 >+ pmaddwd m4, [tab_idct8 + 144] >+ pmaddwd m10, m6 >+ mova m2, [tab_idct8 + 160] >+ mova m1, m10 >+ pmaddwd m6, [tab_idct8 + 144] >+ mova m0, [tab_idct8 + 160] >+ pmaddwd m2, m13 >+ paddd m3, m2 >+ psubd m8, m2 >+ mova m2, m6 >+ pmaddwd m13, [tab_idct8 + 176] >+ pmaddwd m0, m12 >+ paddd m1, m0 >+ psubd m10, m0 >+ mova m0, m4 >+ pmaddwd m12, [tab_idct8 + 176] >+ paddd m3, [pd_64] >+ paddd m1, [pd_64] >+ paddd m8, [pd_64] >+ paddd m10, [pd_64] >+ paddd m0, m13 >+ paddd m2, m12 >+ paddd m0, [pd_64] >+ paddd m2, [pd_64] >+ psubd m4, m13 >+ psubd m6, m12 >+ paddd m4, [pd_64] >+ paddd m6, [pd_64] >+ mova m12, m8 >+ psubd m8, m7 >+ psrad m8, 7 >+ paddd m15, m3 >+ psubd m3, [rsp - 88] >+ psrad m15, 7 >+ paddd m12, m7 >+ psrad m12, 7 >+ paddd m11, m1 >+ mova m13, m14 >+ psrad m11, 7 >+ packssdw m15, m11 >+ psubd m1, [rsp - 72] >+ psrad m1, 7 >+ mova m11, [rsp - 40] >+ paddd m14, m0 >+ psrad m14, 7 >+ psubd m0, m13 >+ psrad m0, 7 >+ paddd m11, m2 >+ mova m13, [rsp - 24] >+ psrad m11, 7 >+ packssdw m14, m11 >+ mova m11, m6 >+ psubd m6, m5 >+ paddd m13, m4 >+ psrad m13, 7 >+ psrad m6, 7 >+ paddd m11, m5 >+ psrad m11, 7 >+ packssdw m13, m11 >+ mova m11, m10 >+ psubd m4, [rsp - 24] >+ psubd m10, m9 >+ psrad m4, 7 >+ psrad m10, 7 >+ packssdw m4, m6 >+ packssdw m8, m10 >+ paddd m11, m9 >+ psrad m11, 7 >+ packssdw m12, m11 >+ psubd m2, [rsp - 40] >+ mova m5, m15 >+ psrad m2, 
7 >+ packssdw m0, m2 >+ mova m2, m14 >+ psrad m3, 7 >+ packssdw m3, m1 >+ mova m6, m13 >+ punpcklwd m5, m8 >+ punpcklwd m2, m4 >+ mova m1, m12 >+ punpcklwd m6, m0 >+ punpcklwd m1, m3 >+ mova m9, m5 >+ punpckhwd m13, m0 >+ mova m0, m2 >+ punpcklwd m9, m6 >+ punpckhwd m5, m6 >+ punpcklwd m0, m1 >+ punpckhwd m2, m1 >+ punpckhwd m15, m8 >+ mova m1, m5 >+ punpckhwd m14, m4 >+ punpckhwd m12, m3 >+ mova m6, m9 >+ punpckhwd m9, m0 >+ punpcklwd m1, m2 >+ mova m4, [tab_idct8] >+ punpckhwd m5, m2 >+ punpcklwd m6, m0 >+ mova m2, m15 >+ mova m0, m14 >+ mova m7, m9 >+ punpcklwd m2, m13 >+ punpcklwd m0, m12 >+ punpcklwd m7, m5 >+ punpckhwd m14, m12 >+ mova m10, m2 >+ punpckhwd m15, m13 >+ punpckhwd m9, m5 >+ pmaddwd m4, m7 >+ mova m13, m1 >+ punpckhwd m2, m0 >+ punpcklwd m10, m0 >+ mova m0, m15 >+ punpckhwd m15, m14 >+ mova m12, m1 >+ mova m3, [tab_idct8] >+ punpcklwd m0, m14 >+ pmaddwd m3, m9 >+ mova m11, m2 >+ punpckhwd m2, m15 >+ punpcklwd m11, m15 >+ mova m8, [tab_idct8 + 16] >+ punpcklwd m13, m0 >+ punpckhwd m12, m0 >+ pmaddwd m8, m11 >+ paddd m8, m4 >+ mova [rsp - 88], m8 >+ mova m4, [tab_idct8 + 32] >+ pmaddwd m4, m7 >+ mova m15, [tab_idct8 + 32] >+ mova m5, [tab_idct8 + 16] >+ pmaddwd m15, m9 >+ pmaddwd m5, m2 >+ paddd m5, m3 >+ mova [rsp - 72], m5 >+ mova m14, [tab_idct8 + 48] >+ mova m5, [tab_idct8 + 48] >+ pmaddwd m14, m11 >+ paddd m14, m4 >+ mova [rsp - 56], m14 >+ pmaddwd m5, m2 >+ paddd m5, m15 >+ mova [rsp - 40], m5 >+ mova m15, [tab_idct8 + 64] >+ mova m5, [tab_idct8 + 64] >+ pmaddwd m15, m7 >+ pmaddwd m7, [tab_idct8 + 96] >+ pmaddwd m5, m9 >+ pmaddwd m9, [tab_idct8 + 96] >+ mova m4, [tab_idct8 + 80] >+ pmaddwd m4, m2 >+ paddd m5, m4 >+ mova m4, m6 >+ mova m8, [tab_idct8 + 80] >+ punpckhwd m6, m10 >+ pmaddwd m2, [tab_idct8 + 112] >+ punpcklwd m4, m10 >+ paddd m9, m2 >+ pmaddwd m8, m11 >+ mova m10, [tab_idct8 + 128] >+ paddd m8, m15 >+ pmaddwd m11, [tab_idct8 + 112] >+ paddd m7, m11 >+ mova [rsp - 24], m8 >+ pmaddwd m10, m6 >+ pmaddwd m6, [tab_idct8 + 144] >+ mova 
m1, m10 >+ mova m8, [tab_idct8 + 128] >+ mova m3, [tab_idct8 + 160] >+ pmaddwd m8, m4 >+ pmaddwd m4, [tab_idct8 + 144] >+ mova m0, m8 >+ mova m2, [tab_idct8 + 160] >+ pmaddwd m3, m13 >+ psubd m8, m3 >+ paddd m0, m3 >+ mova m3, m6 >+ pmaddwd m13, [tab_idct8 + 176] >+ pmaddwd m2, m12 >+ paddd m1, m2 >+ psubd m10, m2 >+ mova m2, m4 >+ pmaddwd m12, [tab_idct8 + 176] >+ paddd m0, [IDCT_ADD] >+ paddd m1, [IDCT_ADD] >+ paddd m8, [IDCT_ADD] >+ paddd m10, [IDCT_ADD] >+ paddd m2, m13 >+ paddd m3, m12 >+ paddd m2, [IDCT_ADD] >+ paddd m3, [IDCT_ADD] >+ psubd m4, m13 >+ psubd m6, m12 >+ paddd m4, [IDCT_ADD] >+ paddd m6, [IDCT_ADD] >+ mova m15, [rsp - 88] >+ mova m12, m8 >+ psubd m8, m7 >+ psrad m8, IDCT_SHIFT >+ mova m11, [rsp - 72] >+ paddd m15, m0 >+ psrad m15, IDCT_SHIFT >+ psubd m0, [rsp - 88] >+ psrad m0, IDCT_SHIFT >+ paddd m12, m7 >+ paddd m11, m1 >+ mova m14, [rsp - 56] >+ psrad m11, IDCT_SHIFT >+ packssdw m15, m11 >+ psubd m1, [rsp - 72] >+ psrad m1, IDCT_SHIFT >+ mova m11, [rsp - 40] >+ paddd m14, m2 >+ psrad m14, IDCT_SHIFT >+ packssdw m0, m1 >+ psrad m12, IDCT_SHIFT >+ psubd m2, [rsp - 56] >+ paddd m11, m3 >+ mova m13, [rsp - 24] >+ psrad m11, IDCT_SHIFT >+ packssdw m14, m11 >+ mova m11, m6 >+ psubd m6, m5 >+ paddd m13, m4 >+ psrad m13, IDCT_SHIFT >+ mova m1, m15 >+ paddd m11, m5 >+ psrad m11, IDCT_SHIFT >+ packssdw m13, m11 >+ mova m11, m10 >+ psubd m10, m9 >+ psrad m10, IDCT_SHIFT >+ packssdw m8, m10 >+ psrad m6, IDCT_SHIFT >+ psubd m4, [rsp - 24] >+ paddd m11, m9 >+ psrad m11, IDCT_SHIFT >+ packssdw m12, m11 >+ punpcklwd m1, m14 >+ mova m5, m13 >+ psrad m4, IDCT_SHIFT >+ packssdw m4, m6 >+ psubd m3, [rsp - 40] >+ psrad m2, IDCT_SHIFT >+ mova m6, m8 >+ psrad m3, IDCT_SHIFT >+ punpcklwd m5, m12 >+ packssdw m2, m3 >+ punpcklwd m6, m4 >+ punpckhwd m8, m4 >+ mova m4, m1 >+ mova m3, m2 >+ punpckhdq m1, m5 >+ punpckldq m4, m5 >+ punpcklwd m3, m0 >+ punpckhwd m2, m0 >+ mova m0, m6 >+ lea r0, [r4 + r2 * 2] ;set r0 to index of 7 >+ movq [r1], m4 >+ punpckhwd m15, m14 >+ 
movhps [r1 + r2], m4 >+ punpckhdq m0, m3 >+ movq [r1 + r2 * 2], m1 >+ punpckhwd m13, m12 >+ movhps [r1 + r3], m1 >+ mova m1, m6 >+ punpckldq m1, m3 >+ movq [r1 + 8], m1 >+ movhps [r1 + r2 + 8], m1 >+ movq [r1 + r2 * 2 + 8], m0 >+ movhps [r1 + r3 + 8], m0 >+ mova m0, m15 >+ punpckhdq m15, m13 >+ punpckldq m0, m13 >+ movq [r1 + r2 * 4], m0 >+ movhps [r1 + r4], m0 >+ mova m0, m8 >+ punpckhdq m8, m2 >+ movq [r1 + r3 * 2], m15 >+ punpckldq m0, m2 >+ movhps [r1 + r0], m15 >+ movq [r1 + r2 * 4 + 8], m0 >+ movhps [r1 + r4 + 8], m0 >+ movq [r1 + r3 * 2 + 8], m8 >+ movhps [r1 + r0 + 8], m8 >+ RET >+%undef IDCT_SHIFT >+ >+;------------------------------------------------------- >+; void idct8(int32_t *src, int16_t *dst, intptr_t stride) >+;------------------------------------------------------- > INIT_XMM ssse3 > > cglobal patial_butterfly_inverse_internal_pass1 >diff -r d059cfa88f1a -r 37392ba74268 source/common/x86/dct8.h >--- a/source/common/x86/dct8.h Tue Nov 18 14:11:12 2014 -0600 >+++ b/source/common/x86/dct8.h Wed Nov 19 18:39:09 2014 -0800 >@@ -35,6 +35,7 @@ > void x265_idst4_sse2(int32_t *src, int16_t *dst, intptr_t stride); > void x265_idct4_sse2(int32_t *src, int16_t *dst, intptr_t stride); > void x265_idct4_avx2(int32_t *src, int16_t *dst, intptr_t stride); >+void x265_idct8_sse2(int32_t *src, int16_t *dst, intptr_t stride); > void x265_idct8_ssse3(int32_t *src, int16_t *dst, intptr_t stride); > void x265_idct8_avx2(int32_t *src, int16_t *dst, intptr_t stride); > void x265_idct16_avx2(int32_t *src, int16_t *dst, intptr_t stride); >_______________________________________________ >x265-devel mailing list >[email protected] >https://mailman.videolan.org/listinfo/x265-devel
_______________________________________________ x265-devel mailing list [email protected] https://mailman.videolan.org/listinfo/x265-devel
