# HG changeset patch
# User Vignesh Vijayakumar <vign...@multicorewareinc.com>
# Date 1512380104 -19800
#      Mon Dec 04 15:05:04 2017 +0530
# Node ID ae75b2d09d10f28391d573507c13512360593386
# Parent  3e8615bc86537e07754a1c023ade702a837042a8
x86: AVX512 optimise interp_4tap_vert_ss_8xN
i444 8x4 AVX2 performance : 10.61x AVX512 performance : 18.93x diff -r 3e8615bc8653 -r ae75b2d09d10 source/common/x86/asm-primitives.cpp --- a/source/common/x86/asm-primitives.cpp Mon Dec 04 14:23:30 2017 +0530 +++ b/source/common/x86/asm-primitives.cpp Mon Dec 04 15:05:04 2017 +0530 @@ -4903,6 +4903,7 @@ p.chroma[X265_CSP_I420].pu[CHROMA_420_32x24].filter_vpp = PFX(interp_4tap_vert_pp_32x24_avx512); p.chroma[X265_CSP_I420].pu[CHROMA_420_32x32].filter_vpp = PFX(interp_4tap_vert_pp_32x32_avx512); + p.chroma[X265_CSP_I420].pu[CHROMA_420_8x4].filter_vss = PFX(interp_4tap_vert_ss_8x4_avx512); p.chroma[X265_CSP_I420].pu[CHROMA_420_8x8].filter_vss = PFX(interp_4tap_vert_ss_8x8_avx512); p.chroma[X265_CSP_I420].pu[CHROMA_420_8x16].filter_vss = PFX(interp_4tap_vert_ss_8x16_avx512); p.chroma[X265_CSP_I420].pu[CHROMA_420_8x32].filter_vss = PFX(interp_4tap_vert_ss_8x32_avx512); @@ -4938,7 +4939,9 @@ p.chroma[X265_CSP_I422].pu[CHROMA_422_32x48].filter_vpp = PFX(interp_4tap_vert_pp_32x48_avx512); p.chroma[X265_CSP_I422].pu[CHROMA_422_32x64].filter_vpp = PFX(interp_4tap_vert_pp_32x64_avx512); + p.chroma[X265_CSP_I422].pu[CHROMA_422_8x4].filter_vss = PFX(interp_4tap_vert_ss_8x4_avx512); p.chroma[X265_CSP_I422].pu[CHROMA_422_8x8].filter_vss = PFX(interp_4tap_vert_ss_8x8_avx512); + p.chroma[X265_CSP_I422].pu[CHROMA_422_8x12].filter_vss = PFX(interp_4tap_vert_ss_8x12_avx512); p.chroma[X265_CSP_I422].pu[CHROMA_422_8x16].filter_vss = PFX(interp_4tap_vert_ss_8x16_avx512); p.chroma[X265_CSP_I422].pu[CHROMA_422_8x32].filter_vss = PFX(interp_4tap_vert_ss_8x32_avx512); p.chroma[X265_CSP_I422].pu[CHROMA_422_8x64].filter_vss = PFX(interp_4tap_vert_ss_8x64_avx512); @@ -4979,6 +4982,7 @@ p.chroma[X265_CSP_I444].pu[LUMA_64x32].filter_vpp = PFX(interp_4tap_vert_pp_64x32_avx512); p.chroma[X265_CSP_I444].pu[LUMA_64x16].filter_vpp = PFX(interp_4tap_vert_pp_64x16_avx512); + p.chroma[X265_CSP_I444].pu[LUMA_8x4].filter_vss = PFX(interp_4tap_vert_ss_8x4_avx512); 
p.chroma[X265_CSP_I444].pu[LUMA_8x8].filter_vss = PFX(interp_4tap_vert_ss_8x8_avx512); p.chroma[X265_CSP_I444].pu[LUMA_8x16].filter_vss = PFX(interp_4tap_vert_ss_8x16_avx512); p.chroma[X265_CSP_I444].pu[LUMA_8x32].filter_vss = PFX(interp_4tap_vert_ss_8x32_avx512); diff -r 3e8615bc8653 -r ae75b2d09d10 source/common/x86/ipfilter8.asm --- a/source/common/x86/ipfilter8.asm Mon Dec 04 14:23:30 2017 +0530 +++ b/source/common/x86/ipfilter8.asm Mon Dec 04 15:05:04 2017 +0530 @@ -11146,78 +11146,54 @@ ;------------------------------------------------------------------------------------------------------------- ;avx512 chroma_vss code start ;------------------------------------------------------------------------------------------------------------- -%macro PROCESS_CHROMA_VERT_SS_8x8_AVX512 0 +%macro PROCESS_CHROMA_VERT_SS_8x4_AVX512 0 + lea r5, [r0 + 4 * r1] movu xm1, [r0] - lea r6, [r0 + 2 * r1] - lea r8, [r0 + 4 * r1] - lea r9, [r8 + 2 * r1] - vinserti32x4 m1, [r6], 1 - vinserti32x4 m1, [r8], 2 - vinserti32x4 m1, [r9], 3 movu xm3, [r0 + r1] - vinserti32x4 m3, [r6 + r1], 1 - vinserti32x4 m3, [r8 + r1], 2 - vinserti32x4 m3, [r9 + r1], 3 + vinserti32x4 m1, [r0 + r1], 1 + vinserti32x4 m3, [r0 + 2 * r1], 1 + vinserti32x4 m1, [r0 + 2 * r1], 2 + vinserti32x4 m3, [r0 + r6], 2 + vinserti32x4 m1, [r0 + r6], 3 + vinserti32x4 m3, [r0 + 4 * r1], 3 + punpcklwd m0, m1, m3 pmaddwd m0, m8 punpckhwd m1, m3 pmaddwd m1, m8 movu xm4, [r0 + 2 * r1] - vinserti32x4 m4, [r6 + 2 * r1], 1 - vinserti32x4 m4, [r8 + 2 * r1], 2 - vinserti32x4 m4, [r9 + 2 * r1], 3 - punpcklwd m2, m3, m4 - pmaddwd m2, m8 - punpckhwd m3, m4 - pmaddwd m3, m8 - - movu xm5, [r0 + r10] - vinserti32x4 m5, [r6 + r10], 1 - vinserti32x4 m5, [r8 + r10], 2 - vinserti32x4 m5, [r9 + r10], 3 - punpcklwd m6, m4, m5 - pmaddwd m6, m9 - paddd m0, m6 + movu xm5, [r0 + r6] + vinserti32x4 m4, [r0 + r6], 1 + vinserti32x4 m5, [r5], 1 + vinserti32x4 m4, [r5], 2 + vinserti32x4 m5, [r5 + r1], 2 + vinserti32x4 m4, [r5 + r1], 3 + vinserti32x4 m5, 
[r5 + 2 * r1], 3 + + punpcklwd m3, m4, m5 + pmaddwd m3, m9 punpckhwd m4, m5 pmaddwd m4, m9 + + paddd m0, m3 paddd m1, m4 - movu xm4, [r0 + 4 * r1] - vinserti32x4 m4, [r6 + 4 * r1], 1 - vinserti32x4 m4, [r8 + 4 * r1], 2 - vinserti32x4 m4, [r9 + 4 * r1], 3 - punpcklwd m6, m5, m4 - pmaddwd m6, m9 - paddd m2, m6 - punpckhwd m5, m4 - pmaddwd m5, m9 - paddd m3, m5 - psrad m0, 6 psrad m1, 6 - psrad m2, 6 - psrad m3, 6 packssdw m0, m1 - packssdw m2, m3 - movu [r2], xm0 - movu [r2 + r3], xm2 - vextracti32x4 [r2 + 2 * r3], m0, 1 - vextracti32x4 [r2 + r7], m2, 1 - lea r2, [r2 + 4 * r3] - vextracti32x4 [r2], m0, 2 - vextracti32x4 [r2 + r3], m2, 2 - vextracti32x4 [r2 + 2 * r3], m0, 3 - vextracti32x4 [r2 + r7], m2, 3 + vextracti32x4 [r2 + r3], m0, 1 + vextracti32x4 [r2 + 2 * r3], m0, 2 + vextracti32x4 [r2 + r7], m0, 3 %endmacro ;----------------------------------------------------------------------------------------------------------------- ; void interp_4tap_vert(int16_t *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx) ;----------------------------------------------------------------------------------------------------------------- -%if ARCH_X86_64 +%macro FILTER_VER_SS_CHROMA_8xN_AVX512 1 INIT_ZMM avx512 -cglobal interp_4tap_vert_ss_8x8, 5, 11, 10 +cglobal interp_4tap_vert_ss_8x%1, 5, 8, 10 add r1d, r1d add r3d, r3d sub r0, r1 @@ -11231,41 +11207,22 @@ mova m8, [r5] mova m9, [r5 + mmsize] %endif - lea r10, [3 * r1] + lea r6, [3 * r1] lea r7, [3 * r3] - PROCESS_CHROMA_VERT_SS_8x8_AVX512 - RET -%endif - -%macro FILTER_VER_SS_CHROMA_8xN_AVX512 1 -INIT_ZMM avx512 -cglobal interp_4tap_vert_ss_8x%1, 5, 11, 10 - add r1d, r1d - add r3d, r3d - sub r0, r1 - shl r4d, 7 -%ifdef PIC - lea r5, [pw_ChromaCoeffVer_32_avx512] - mova m8, [r5 + r4] - mova m9, [r5 + r4 + mmsize] -%else - lea r5, [pw_ChromaCoeffVer_32_avx512 + r4] - mova m8, [r5] - mova m9, [r5 + mmsize] -%endif - lea r10, [3 * r1] - lea r7, [3 * r3] - -%rep %1/8 - 1 - PROCESS_CHROMA_VERT_SS_8x8_AVX512 - 
lea r0, [r8 + 4 * r1] + +%rep %1/4 - 1 + PROCESS_CHROMA_VERT_SS_8x4_AVX512 + lea r0, [r0 + 4 * r1] lea r2, [r2 + 4 * r3] %endrep - PROCESS_CHROMA_VERT_SS_8x8_AVX512 + PROCESS_CHROMA_VERT_SS_8x4_AVX512 RET %endmacro %if ARCH_X86_64 + FILTER_VER_SS_CHROMA_8xN_AVX512 4 + FILTER_VER_SS_CHROMA_8xN_AVX512 8 + FILTER_VER_SS_CHROMA_8xN_AVX512 12 FILTER_VER_SS_CHROMA_8xN_AVX512 16 FILTER_VER_SS_CHROMA_8xN_AVX512 32 FILTER_VER_SS_CHROMA_8xN_AVX512 64 _______________________________________________ x265-devel mailing list x265-devel@videolan.org https://mailman.videolan.org/listinfo/x265-devel