# HG changeset patch
# User Sumalatha Polureddy
# Date 1430807066 -19800
#      Tue May 05 11:54:26 2015 +0530
# Node ID 1cee00d68a2ace6b4da51d4a18a3052bd94987fc
# Parent  f32e6464225afa02983af1b1905f50cdccae5244
asm: avx2 code for sad 16x4,16x8,16x12,16x16,16x32 for 10bpp

sse2:
sad[ 16x4]  2.78x    274.25     761.77
sad[ 16x8]  3.08x    455.14     1401.20
sad[16x12]  3.16x    644.93     2039.52
sad[16x16]  3.28x    830.14     2725.86
sad[16x32]  3.38x    1603.12    5415.46

avx2:
sad[ 16x4]  5.38x    140.91     758.03
sad[ 16x8]  6.14x    214.04     1313.60
sad[16x12]  6.44x    316.88     2039.84
sad[16x16]  6.82x    396.83     2705.34
sad[16x32]  7.13x    738.81     5268.11

diff -r f32e6464225a -r 1cee00d68a2a source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp	Mon May 04 15:15:42 2015 -0500
+++ b/source/common/x86/asm-primitives.cpp	Tue May 05 11:54:26 2015 +0530
@@ -1239,7 +1239,11 @@
         p.chroma[X265_CSP_I422].cu[BLOCK_422_16x32].sub_ps = x265_pixel_sub_ps_16x32_avx2;
         p.chroma[X265_CSP_I422].cu[BLOCK_422_32x64].sub_ps = x265_pixel_sub_ps_32x64_avx2;
 
+        p.pu[LUMA_16x4].sad = x265_pixel_sad_16x4_avx2;
         p.pu[LUMA_16x8].sad = x265_pixel_sad_16x8_avx2;
+        p.pu[LUMA_16x12].sad = x265_pixel_sad_16x12_avx2;
+        p.pu[LUMA_16x16].sad = x265_pixel_sad_16x16_avx2;
+        p.pu[LUMA_16x32].sad = x265_pixel_sad_16x32_avx2;
 
         p.pu[LUMA_16x4].convert_p2s = x265_filterPixelToShort_16x4_avx2;
         p.pu[LUMA_16x8].convert_p2s = x265_filterPixelToShort_16x8_avx2;
diff -r f32e6464225a -r 1cee00d68a2a source/common/x86/sad16-a.asm
--- a/source/common/x86/sad16-a.asm	Mon May 04 15:15:42 2015 -0500
+++ b/source/common/x86/sad16-a.asm	Tue May 05 11:54:26 2015 +0530
@@ -385,6 +385,13 @@
 SAD 8, 16
 SAD 8, 32
 
+INIT_YMM avx2
+SAD 16, 4
+SAD 16, 8
+SAD 16, 12
+SAD 16, 16
+SAD 16, 32
+
 ;------------------------------------------------------------------
 ; int pixel_sad_32xN( uint16_t *, intptr_t, uint16_t *, intptr_t )
 ;------------------------------------------------------------------
@@ -772,58 +779,6 @@
 %endif
     movd            eax, xm0
     RET
-
-INIT_YMM avx2
-cglobal pixel_sad_16x8, 4,7,7
-    xorps           m0, m0
-    xorps           m6, m6
-    movu            m5, [pw_1]
-    mov             r4d, 8/4
-    add             r1d, r1d
-    add             r3d, r3d
-    lea             r5, [r1 * 3]
-    lea             r6, [r3 * 3]
-.loop
-    movu            m1, [r0]               ; row 0 of pix0
-    movu            m2, [r2]               ; row 0 of pix1
-    movu            m3, [r0 + r1]          ; row 1 of pix0
-    movu            m4, [r2 + r3]          ; row 1 of pix1
-    psubw           m1, m2
-    pabsw           m1, m1
-    pmaddwd         m1, m5
-    psubw           m3, m4
-    pabsw           m3, m3
-    pmaddwd         m3, m5
-    paddd           m0, m1
-    paddd           m6, m3
-
-    movu            m1, [r0 + 2 * r1]      ; row 2 of pix0
-    movu            m2, [r2 + 2 * r3]      ; row 2 of pix1
-    movu            m3, [r0 + r5]          ; row 3 of pix0
-    movu            m4, [r2 + r6]          ; row 3 of pix1
-    psubw           m1, m2
-    pabsw           m1, m1
-    pmaddwd         m1, m5
-    psubw           m3, m4
-    pabsw           m3, m3
-    pmaddwd         m3, m5
-    paddd           m0, m1
-    paddd           m6, m3
-    lea             r2, [r2 + 4 * r3]
-    lea             r0, [r0 + 4 * r1]
-
-    dec             r4d
-    jnz             .loop
-
-    paddd           m0, m6
-    vextracti128    xm1, m0, 1
-    paddd           xm0, xm1
-    movhlps         xm1, xm0
-    paddd           xm0, xm1
-    pshufd          xm1, xm0, 1
-    paddd           xm0, xm1
-    movd            eax, xm0
-    RET
 ;-----------------------------------------------------------------------------
 ; void pixel_sad_xN_WxH( uint16_t *fenc, uint16_t *pix0, uint16_t *pix1,
 ;                        uint16_t *pix2, intptr_t i_stride, int scores[3] )
_______________________________________________
x265-devel mailing list
x265-devel@videolan.org
https://mailman.videolan.org/listinfo/x265-devel