# HG changeset patch
# User Divya Manivannan <di...@multicorewareinc.com>
# Date 1416467833 -19800
#      Thu Nov 20 12:47:13 2014 +0530
# Node ID 02bc16b116ebfdb61c91a516291f1b19b259bcbf
# Parent  a6e1b125424acc727f9ba464ccc530550203d407
asm: luma_vpp[8x16, 8x32] in avx2: improve 1139c->774c, 1968c->1452c

diff -r a6e1b125424a -r 02bc16b116eb source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp Thu Nov 20 12:23:05 2014 +0530
+++ b/source/common/x86/asm-primitives.cpp Thu Nov 20 12:47:13 2014 +0530
@@ -1802,6 +1802,8 @@
         p.luma_hpp[LUMA_4x4] = x265_interp_8tap_horiz_pp_4x4_avx2;
         p.luma_vpp[LUMA_4x4] = x265_interp_8tap_vert_pp_4x4_avx2;
         p.luma_vpp[LUMA_8x8] = x265_interp_8tap_vert_pp_8x8_avx2;
+        p.luma_vpp[LUMA_8x16] = x265_interp_8tap_vert_pp_8x16_avx2;
+        p.luma_vpp[LUMA_8x32] = x265_interp_8tap_vert_pp_8x32_avx2;
     }
 #endif // if HIGH_BIT_DEPTH
 }
diff -r a6e1b125424a -r 02bc16b116eb source/common/x86/ipfilter8.asm
--- a/source/common/x86/ipfilter8.asm Thu Nov 20 12:23:05 2014 +0530
+++ b/source/common/x86/ipfilter8.asm Thu Nov 20 12:47:13 2014 +0530
@@ -3729,6 +3729,53 @@
     movhps          [r2 + r4], xm4
     RET
 
+%macro FILTER_VER_LUMA_AVX2_8xN 2
+INIT_YMM avx2
+cglobal interp_8tap_vert_pp_%1x%2, 4, 7, 8, 0-gprsize
+    mov             r4d, r4m                        ; r4d = coeffIdx (5th argument, fetched manually)
+    shl             r4d, 7                          ; r4 = coeffIdx * 128 = byte offset into the coefficient table
+
+%ifdef PIC
+    lea             r5, [tab_LumaCoeffVer_32]       ; PIC: take the table base first ...
+    add             r5, r4                          ; ... then add the offset; r5 = coefficients for coeffIdx
+%else
+    lea             r5, [tab_LumaCoeffVer_32 + r4]  ; r5 = coefficients for coeffIdx
+%endif
+
+    lea             r4, [r1 * 3]                    ; r4 = 3 * srcStride
+    sub             r0, r4                          ; move src back by three rows
+    lea             r6, [r1 * 4]                    ; r6 = 4 * srcStride, used to rewind src at the end of each pass
+    mov             word [rsp], %2 / 8              ; loop counter lives in the stack slot: height / 8 passes
+    mova            m7, [pw_512]                    ; m7 = pw_512 constant, the pmulhrsw multiplier below
+
+.loop:                                              ; each pass stores 8 rows of 8 bytes to dst
+    PROCESS_LUMA_AVX2_W8_8R
+    pmulhrsw        m5, m7                          ; m5 = word: row 0, row 1
+    pmulhrsw        m2, m7                          ; m2 = word: row 2, row 3
+    pmulhrsw        m1, m7                          ; m1 = word: row 4, row 5
+    pmulhrsw        m4, m7                          ; m4 = word: row 6, row 7
+    packuswb        m5, m2                          ; m5 = byte: low lane = row 0 | row 2, high lane = row 1 | row 3
+    packuswb        m1, m4                          ; m1 = byte: low lane = row 4 | row 6, high lane = row 5 | row 7
+    vextracti128    xm2, m5, 1                      ; xm2 = row 1 | row 3
+    vextracti128    xm4, m1, 1                      ; xm4 = row 5 | row 7
+    movq            [r2], xm5                       ; store row 0
+    movq            [r2 + r3], xm2                  ; store row 1
+    lea             r2, [r2 + r3 * 2]               ; dst += 2 * dstStride
+    movhps          [r2], xm5                       ; store row 2
+    movhps          [r2 + r3], xm2                  ; store row 3
+    lea             r2, [r2 + r3 * 2]               ; dst += 2 * dstStride
+    movq            [r2], xm1                       ; store row 4
+    movq            [r2 + r3], xm4                  ; store row 5
+    lea             r2, [r2 + r3 * 2]               ; dst += 2 * dstStride
+    movhps          [r2], xm1                       ; store row 6
+    movhps          [r2 + r3], xm4                  ; store row 7
+    lea             r2, [r2 + r3 * 2]               ; dst += 2 * dstStride
+    sub             r0, r6                          ; rewind src by 4 rows; NOTE(review): net advance depends on PROCESS_LUMA_AVX2_W8_8R (not shown here)
+    dec             word [rsp]                      ; one pass of 8 rows done
+    jnz             .loop                           ; repeat until the counter reaches zero
+    RET
+%endmacro
+
 ;-------------------------------------------------------------------------------------------------------------
 ; void interp_8tap_vert_pp_8x4(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
 ;-------------------------------------------------------------------------------------------------------------
@@ -3743,11 +3790,13 @@
 ; void interp_8tap_vert_pp_8x16(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
 ;-------------------------------------------------------------------------------------------------------------
 FILTER_VER_LUMA_8xN 8, 16, pp
+FILTER_VER_LUMA_AVX2_8xN 8, 16
 
 ;-------------------------------------------------------------------------------------------------------------
 ; void interp_8tap_vert_pp_8x32(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
 ;-------------------------------------------------------------------------------------------------------------
 FILTER_VER_LUMA_8xN 8, 32, pp
+FILTER_VER_LUMA_AVX2_8xN 8, 32
 
 ;-------------------------------------------------------------------------------------------------------------
 ; void interp_8tap_vert_ps_8x4(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
_______________________________________________
x265-devel mailing list
x265-devel@videolan.org
https://mailman.videolan.org/listinfo/x265-devel