# HG changeset patch
# User Vignesh Vijayakumar <vign...@multicorewareinc.com>
# Date 1510120943 -19800
#      Wed Nov 08 11:32:23 2017 +0530
# Node ID d6b9a214bbbf62e6052e231ac9110f256d836204
# Parent  3f4b7399d14ba72aba0692e61681276f09df8ada
x86: AVX512 interp_4tap_vert_ss_64xN for high bit depth
i444
Size    | AVX2 performance | AVX512 performance
----------------------------------------------
64x16   | 25.33x           | 39.28x
64x32   | 25.52x           | 39.09x
64x48   | 25.51x           | 39.10x
64x64   | 25.45x           | 39.46x

diff -r 3f4b7399d14b -r d6b9a214bbbf source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp	Wed Nov 15 12:30:26 2017 +0530
+++ b/source/common/x86/asm-primitives.cpp	Wed Nov 08 11:32:23 2017 +0530
@@ -2647,6 +2647,10 @@
         p.chroma[X265_CSP_I444].pu[LUMA_64x32].filter_vsp = PFX(interp_4tap_vert_sp_64x32_avx512);
         p.chroma[X265_CSP_I444].pu[LUMA_64x48].filter_vsp = PFX(interp_4tap_vert_sp_64x48_avx512);
         p.chroma[X265_CSP_I444].pu[LUMA_64x64].filter_vsp = PFX(interp_4tap_vert_sp_64x64_avx512);
+        p.chroma[X265_CSP_I444].pu[LUMA_64x16].filter_vss = PFX(interp_4tap_vert_ss_64x16_avx512);
+        p.chroma[X265_CSP_I444].pu[LUMA_64x32].filter_vss = PFX(interp_4tap_vert_ss_64x32_avx512);
+        p.chroma[X265_CSP_I444].pu[LUMA_64x48].filter_vss = PFX(interp_4tap_vert_ss_64x48_avx512);
+        p.chroma[X265_CSP_I444].pu[LUMA_64x64].filter_vss = PFX(interp_4tap_vert_ss_64x64_avx512);
 
         p.chroma[X265_CSP_I444].pu[LUMA_48x64].filter_vpp = PFX(interp_4tap_vert_pp_48x64_avx512);
         p.chroma[X265_CSP_I444].pu[LUMA_48x64].filter_vps = PFX(interp_4tap_vert_ps_48x64_avx512);
diff -r 3f4b7399d14b -r d6b9a214bbbf source/common/x86/ipfilter16.asm
--- a/source/common/x86/ipfilter16.asm	Wed Nov 15 12:30:26 2017 +0530
+++ b/source/common/x86/ipfilter16.asm	Wed Nov 08 11:32:23 2017 +0530
@@ -8272,7 +8272,7 @@
     FILTER_VER_PS_CHROMA_64xN_AVX512 64
 %endif
 
-%macro PROCESS_CHROMA_VERT_SP_64x2_AVX512 0
+%macro PROCESS_CHROMA_VERT_S_64x2_AVX512 1
     movu m1, [r0]
     movu m3, [r0 + r1]
     punpcklwd m0, m1, m3
@@ -8332,6 +8332,7 @@
     pmaddwd m13, m16
     paddd m11, m13
 
+%ifidn %1,sp
     paddd m0, m7
     paddd m1, m7
     paddd m2, m7
@@ -8349,7 +8350,16 @@
     psrad m9, INTERP_SHIFT_SP
     psrad m10, INTERP_SHIFT_SP
     psrad m11, INTERP_SHIFT_SP
-
+%else
+    psrad m0, 6
+    psrad m1, 6
+    psrad m2, 6
+    psrad m3, 6
+    psrad m8, 6
+    psrad m9, 6
+    psrad m10, 6
+    psrad m11, 6
+%endif
     packssdw m0, m1
     packssdw m2, m3
     packssdw m8, m9
@@ -8362,9 +8372,9 @@
 ;-----------------------------------------------------------------------------------------------------------------
 ; void interp_4tap_vert(int16_t *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)
 ;-----------------------------------------------------------------------------------------------------------------
-%macro FILTER_VER_SP_CHROMA_64xN_AVX512 1
+%macro FILTER_VER_S_CHROMA_64xN_AVX512 2
 INIT_ZMM avx512
-cglobal interp_4tap_vert_sp_64x%1, 5, 7, 17
+cglobal interp_4tap_vert_%1_64x%2, 5, 7, 17
     add r1d, r1d
     add r3d, r3d
     sub r0, r1
@@ -8377,23 +8387,29 @@
     lea r5, [tab_ChromaCoeffV_avx512 + r4]
 %endif
 
+%ifidn %1, sp
     vbroadcasti32x4 m7, [INTERP_OFFSET_SP]
-    mova m15, [r5]
-    mova m16, [r5 + mmsize]
-
-%rep %1/2 - 1
-    PROCESS_CHROMA_VERT_SP_64x2_AVX512
+%endif
+    mova m15, [r5]
+    mova m16, [r5 + mmsize]
+
+%rep %2/2 - 1
+    PROCESS_CHROMA_VERT_S_64x2_AVX512 %1
     lea r2, [r2 + 2 * r3]
 %endrep
-    PROCESS_CHROMA_VERT_SP_64x2_AVX512
+    PROCESS_CHROMA_VERT_S_64x2_AVX512 %1
     RET
 %endmacro
 
 %if ARCH_X86_64
-    FILTER_VER_SP_CHROMA_64xN_AVX512 16
-    FILTER_VER_SP_CHROMA_64xN_AVX512 32
-    FILTER_VER_SP_CHROMA_64xN_AVX512 48
-    FILTER_VER_SP_CHROMA_64xN_AVX512 64
+    FILTER_VER_S_CHROMA_64xN_AVX512 ss, 16
+    FILTER_VER_S_CHROMA_64xN_AVX512 ss, 32
+    FILTER_VER_S_CHROMA_64xN_AVX512 ss, 48
+    FILTER_VER_S_CHROMA_64xN_AVX512 ss, 64
+    FILTER_VER_S_CHROMA_64xN_AVX512 sp, 16
+    FILTER_VER_S_CHROMA_64xN_AVX512 sp, 32
+    FILTER_VER_S_CHROMA_64xN_AVX512 sp, 48
+    FILTER_VER_S_CHROMA_64xN_AVX512 sp, 64
 %endif
 ;-------------------------------------------------------------------------------------------------------------
 ;ipfilter_chroma_avx512 code end
_______________________________________________
x265-devel mailing list
x265-devel@videolan.org
https://mailman.videolan.org/listinfo/x265-devel