# HG changeset patch
# User Gopi Satykrishna Akisetty <gopi.satykris...@multicorewareinc.com>
# Date 1511947290 -19800
#      Wed Nov 29 14:51:30 2017 +0530
# Node ID 3e2058cec6c6f4ad49d92f9df7fbc110a54f4b4b
# Parent  d7af8d747bffacafa5dfe8f4d513bbd09314ad63
[x265-avx512]x86: AVX512 nquant
AVX2 Performance : 21.42x AVX512 Performance : 25.60x diff -r d7af8d747bff -r 3e2058cec6c6 source/common/x86/asm-primitives.cpp --- a/source/common/x86/asm-primitives.cpp Thu Nov 30 15:29:18 2017 +0530 +++ b/source/common/x86/asm-primitives.cpp Wed Nov 29 14:51:30 2017 +0530 @@ -2887,8 +2887,7 @@ p.cu[BLOCK_16x16].idct = PFX(idct16_avx512); p.cu[BLOCK_32x32].idct = PFX(idct32_avx512); p.quant = PFX(quant_avx512); - - + p.nquant = PFX(nquant_avx512); p.chroma[X265_CSP_I420].pu[CHROMA_420_32x32].filter_hps = PFX(interp_4tap_horiz_ps_32x32_avx512); p.chroma[X265_CSP_I420].pu[CHROMA_420_32x16].filter_hps = PFX(interp_4tap_horiz_ps_32x16_avx512); p.chroma[X265_CSP_I420].pu[CHROMA_420_32x24].filter_hps = PFX(interp_4tap_horiz_ps_32x24_avx512); @@ -5015,7 +5014,7 @@ p.cu[BLOCK_16x16].idct = PFX(idct16_avx512); p.cu[BLOCK_32x32].idct = PFX(idct32_avx512); p.quant = PFX(quant_avx512); - + p.nquant = PFX(nquant_avx512); } #endif } diff -r d7af8d747bff -r 3e2058cec6c6 source/common/x86/pixel-util8.asm --- a/source/common/x86/pixel-util8.asm Thu Nov 30 15:29:18 2017 +0530 +++ b/source/common/x86/pixel-util8.asm Wed Nov 29 14:51:30 2017 +0530 @@ -1277,7 +1277,101 @@ paddd xm5, xm0 movd eax, xm5 RET - +%if ARCH_X86_64 == 1 +INIT_ZMM avx512 +cglobal nquant, 3,5,22 +%if UNIX64 == 0 + vpbroadcastd m4, r4m +%else ; Mac + movd xm4, r4m + vpbroadcastd m4, xm4 +%endif + + vbroadcasti32x8 m6, [pw_1] + mov r4d, r5m + pxor m5, m5 + movd xm3, r3m + sub r4d, 16 + je .coeff16 + add r4d, 16 + shr r4d, 5 + jmp .loop + +.coeff16: + pmovsxwd m16, [r0] + pabsd m17, m16 + pmulld m17, [r1] + paddd m17, m4 + psrad m17, xm3 + + vextracti64x4 ym19, m17, 1 + vextracti64x4 ym20, m16, 1 + psignd ym17, ym16 + psignd ym19, ym20 + packssdw ym17, ym19 + vpermq ym17, ym17, q3120 + pabsw ym17, ym17 + movu [r2], ym17 + pminuw ym17, ym6 + paddw ym5, ym17 + pxor m0, m0 + psadbw ym5, ym0 + vextracti128 xm0, ym5, 1 + paddd xm5, xm0 + pshufd xm0, xm5, 2 + paddd xm5, xm0 + movd eax, xm5 + RET + +.loop: + pmovsxwd 
m16, [r0] + pabsd m17, m16 + pmulld m17, [r1] + paddd m17, m4 + psrad m17, xm3 + vextracti64x4 ym19, m17, 1 + vextracti64x4 ym20, m16, 1 + psignd ym17, ym16 + psignd ym19, ym20 + packssdw ym17, ym19 + + pmovsxwd m16, [r0 + mmsize/2] + pabsd m18, m16 + pmulld m18, [r1 + mmsize] + paddd m18, m4 + psrad m18, xm3 + vextracti64x4 ym21, m18, 1 + vextracti64x4 ym20, m16, 1 + psignd ym18, ym16 + psignd ym21, ym20 + packssdw ym18, ym21 + vinserti64x4 m17, m17, ym18, 1 + vpermq m17, m17, q3120 + + pabsw m17, m17 + movu [r2], m17 + + add r0, mmsize + add r1, mmsize * 2 + add r2, mmsize + + pminuw m17, m6 + paddw m5, m17 + + dec r4d + jnz .loop + + pxor m0, m0 + psadbw m5, m0 + vextracti32x8 ym1, m5, 1 + paddd ym5, ym1 + vextracti64x2 xm1, m5, 1 + paddd xm5, xm1 + pshufd xm1, xm5, 2 + paddd xm5, xm1 + movd eax, xm5 + RET +%endif ; ARCH_X86_64 == 1 ;----------------------------------------------------------------------------- ; void dequant_normal(const int16_t* quantCoef, int32_t* coef, int num, int scale, int shift) diff -r d7af8d747bff -r 3e2058cec6c6 source/test/mbdstharness.cpp --- a/source/test/mbdstharness.cpp Thu Nov 30 15:29:18 2017 +0530 +++ b/source/test/mbdstharness.cpp Wed Nov 29 14:51:30 2017 +0530 @@ -252,12 +252,10 @@ bool MBDstHarness::check_nquant_primitive(nquant_t ref, nquant_t opt) { int j = 0; - for (int i = 0; i < ITERS; i++) { - int width = (rand() % 4 + 1) * 4; + int width = 1 << (rand() % 4 + 2); int height = width; - uint32_t optReturnValue = 0; uint32_t refReturnValue = 0; _______________________________________________ x265-devel mailing list x265-devel@videolan.org https://mailman.videolan.org/listinfo/x265-devel