# HG changeset patch # User Jayashri Murugan # Date 1502430475 25200 # Thu Aug 10 22:47:55 2017 -0700 # Node ID 951e9a16296e5d1e528c0083630fde8122bd15c1 # Parent 3d8c45642752803c560891fdfbe0a8b5c03ca76a x86: AVX512 interp_4tap_horiz_ps_64xN
Size | AVX2 performance | AVX512 performance ---------------------------------------------- 64x16 | 26.50x | 35.13x 64x32 | 25.48x | 38.62x 64x48 | 27.52x | 40.34x 64x64 | 27.85x | 40.43x diff -r 3d8c45642752 -r 951e9a16296e source/common/x86/asm-primitives.cpp --- a/source/common/x86/asm-primitives.cpp Fri Aug 11 14:36:18 2017 +0530 +++ b/source/common/x86/asm-primitives.cpp Thu Aug 10 22:47:55 2017 -0700 @@ -4029,6 +4029,11 @@ p.chroma[X265_CSP_I422].pu[CHROMA_422_32x48].filter_hpp = PFX(interp_4tap_horiz_pp_32x48_avx512); p.weight_pp = PFX(weight_pp_avx512); + //i444 chroma_hps + p.chroma[X265_CSP_I444].pu[LUMA_64x64].filter_hps = PFX(interp_4tap_horiz_ps_64x64_avx512); + p.chroma[X265_CSP_I444].pu[LUMA_64x32].filter_hps = PFX(interp_4tap_horiz_ps_64x32_avx512); + p.chroma[X265_CSP_I444].pu[LUMA_64x48].filter_hps = PFX(interp_4tap_horiz_ps_64x48_avx512); + p.chroma[X265_CSP_I444].pu[LUMA_64x16].filter_hps = PFX(interp_4tap_horiz_ps_64x16_avx512); } #endif } diff -r 3d8c45642752 -r 951e9a16296e source/common/x86/ipfilter8.asm --- a/source/common/x86/ipfilter8.asm Fri Aug 11 14:36:18 2017 +0530 +++ b/source/common/x86/ipfilter8.asm Thu Aug 10 22:47:55 2017 -0700 @@ -26,7 +26,7 @@ %include "x86inc.asm" %include "x86util.asm" -SECTION_RODATA 32 +SECTION_RODATA 64 const tab_Tm, db 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 db 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 db 8, 9,10,11, 9,10,11,12,10,11,12,13,11,12,13, 14 @@ -152,6 +152,9 @@ const interp4_horiz_shuf_store1_avx512, dd 0 ,8, 1, 9, 4, 12, 5, 13, 2, 10, 3, 11, 6, 14, 7, 15 +ALIGN 64 +const interp8_hps_shuf_avx512, dq 0, 4, 1, 5, 2, 6, 3, 7 + SECTION .text cextern pb_128 @@ -9836,7 +9839,7 @@ FILTER_VER_LUMA_S_AVX2_32x24 sp FILTER_VER_LUMA_S_AVX2_32x24 ss ;------------------------------------------------------------------------------------------------------------- -;ipfilter_chroma_pp_avx512 code start +;ipfilter_chroma_avx512 code start ;------------------------------------------------------------------------------------------------------------- %macro PROCESS_IPFILTER_CHROMA_PP_64x1_AVX512 0 ; register map @@ -9976,6 +9979,86 @@ IPFILTER_CHROMA_PP_32xN_AVX512 32 IPFILTER_CHROMA_PP_32xN_AVX512 64 IPFILTER_CHROMA_PP_32xN_AVX512 48 + +%macro PROCESS_IPFILTER_CHROMA_PS_64x1_AVX512 0 + movu ym6, [r0] + vinserti32x8 m6, [r0 + 4], 1 + pshufb m7, m6, m2 + pshufb m6, m1 + pmaddubsw m6, m0 + pmaddubsw m7, m0 + pmaddwd m6, m3 + pmaddwd m7, m3 + + movu ym8, [r0 + 32] + vinserti32x8 m8, [r0 + 36], 1 + pshufb m9, m8, m2 + pshufb m8, m1 + pmaddubsw m8, m0 + pmaddubsw m9, m0 + pmaddwd m8, m3 + pmaddwd m9, m3 + + packssdw m6, m7 + packssdw m8, m9 + psubw m6, m4 + psubw m8, m4 + vpermq m6, m10, m6 + vpermq m8, m10, m8 + movu [r2], m6 + movu [r2 + mmsize],m8 +%endmacro + ;------------------------------------------------------------------------------------------------------------- -;ipfilter_chroma_pp_avx512 code end +; void interp_horiz_ps_c(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt) ;------------------------------------------------------------------------------------------------------------- +%macro IPFILTER_CHROMA_PS_64xN_AVX512 1 +INIT_ZMM avx512 +cglobal interp_4tap_horiz_ps_64x%1, 4,7,11 + mov r4d, r4m + mov r5d, r5m + +%ifdef PIC + lea r6, [tab_ChromaCoeff] + vpbroadcastd m0, [r6 + r4 * 4] +%else + vpbroadcastd m0, [tab_ChromaCoeff + r4 * 4] +%endif + + vbroadcasti32x8 m1, [interp4_horiz_shuf_load1_avx512] + vbroadcasti32x8 m2, [interp4_horiz_shuf_load2_avx512] + vbroadcasti32x8 m3, [pw_1] + vbroadcasti32x8 m4, [pw_2000] + mova m10, [interp8_hps_shuf_avx512] + + ; register map + ; m0 - interpolate coeff + ; m1,m2 - load shuffle order table + ; m3 - constant word 1 + ; m4 - constant word 2000 + ; m10 - store shuffle order table + + mov r6d, %1 + dec r0 + test r5d, r5d + je .loop + sub r0, r1 + add r6d, 3 + +.loop: + PROCESS_IPFILTER_CHROMA_PS_64x1_AVX512 + lea r2, [r2 + 2 * r3] + lea r0, [r0 + r1] + dec r6d + jnz .loop + RET +%endmacro + + IPFILTER_CHROMA_PS_64xN_AVX512 64 + IPFILTER_CHROMA_PS_64xN_AVX512 32 + IPFILTER_CHROMA_PS_64xN_AVX512 48 + IPFILTER_CHROMA_PS_64xN_AVX512 16 + +;------------------------------------------------------------------------------------------------------------- +;ipfilter_chroma_avx512 code end +;------------------------------------------------------------------------------------------------------------- _______________________________________________ x265-devel mailing list x265-devel@videolan.org https://mailman.videolan.org/listinfo/x265-devel