# HG changeset patch
# User Vignesh Vijayakumar
# Date 1501593145 -19800
#      Tue Aug 01 18:42:25 2017 +0530
# Node ID fabc3475654f222b469c57b6cf8fd41b334d71be
# Parent  ef7fd93923fa24a8f77a557817b03078356443e7
x86: AVX512 pixel_avg_weight_W64 for high bit depth
Size  | AVX2 performance | AVX512 performance
----------------------------------------------
64x16 |      11.78x      |      20.54x
64x32 |      12.08x      |      23.01x
64x48 |      12.26x      |      22.62x
64x64 |      12.35x      |      22.67x

diff -r ef7fd93923fa -r fabc3475654f source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp Tue Aug 01 18:27:37 2017 +0530
+++ b/source/common/x86/asm-primitives.cpp Tue Aug 01 18:42:25 2017 +0530
@@ -2301,6 +2301,10 @@
         p.pu[LUMA_32x24].pixelavg_pp = PFX(pixel_avg_32x24_avx512);
         p.pu[LUMA_32x32].pixelavg_pp = PFX(pixel_avg_32x32_avx512);
         p.pu[LUMA_32x64].pixelavg_pp = PFX(pixel_avg_32x64_avx512);
+        p.pu[LUMA_64x16].pixelavg_pp = PFX(pixel_avg_64x16_avx512);
+        p.pu[LUMA_64x32].pixelavg_pp = PFX(pixel_avg_64x32_avx512);
+        p.pu[LUMA_64x48].pixelavg_pp = PFX(pixel_avg_64x48_avx512);
+        p.pu[LUMA_64x64].pixelavg_pp = PFX(pixel_avg_64x64_avx512);
     }
 }
 
diff -r ef7fd93923fa -r fabc3475654f source/common/x86/mc-a.asm
--- a/source/common/x86/mc-a.asm Tue Aug 01 18:27:37 2017 +0530
+++ b/source/common/x86/mc-a.asm Tue Aug 01 18:42:25 2017 +0530
@@ -5676,6 +5676,71 @@
     movu    [r0 + r8], m2
 %endmacro
 
+;-----------------------------------------------------------------------------
+; PROCESS_PIXELAVG_64x4_HBD_AVX512
+; Writes four 64-sample rows of dst = pavgw(src0, src1), i.e. the rounded
+; unsigned average of 16-bit samples. Each row is 2 * mmsize bytes wide.
+; In:    r0 = dst, r2 = src0, r4 = src1 (each pointing at the first row)
+;        r1 / r3 / r5 = byte stride of dst / src0 / src1
+;        r8 / r6 / r7 = 3 * r1 / 3 * r3 / 3 * r5
+; Clobb: m0-m3; no general-purpose register is modified
+;-----------------------------------------------------------------------------
+%macro PROCESS_PIXELAVG_64x4_HBD_AVX512 0
+    ; rows 0-1, first mmsize bytes
+    movu    m0, [r2]
+    movu    m1, [r4]
+    movu    m2, [r2 + r3]
+    movu    m3, [r4 + r5]
+    pavgw   m0, m1
+    pavgw   m2, m3
+    movu    [r0], m0
+    movu    [r0 + r1], m2
+
+    ; rows 0-1, second mmsize bytes
+    movu    m0, [r2 + mmsize]
+    movu    m1, [r4 + mmsize]
+    movu    m2, [r2 + r3 + mmsize]
+    movu    m3, [r4 + r5 + mmsize]
+    pavgw   m0, m1
+    pavgw   m2, m3
+    movu    [r0 + mmsize], m0
+    movu    [r0 + r1 + mmsize], m2
+
+    ; rows 2-3, first mmsize bytes
+    movu    m0, [r2 + r3 * 2]
+    movu    m1, [r4 + r5 * 2]
+    movu    m2, [r2 + r6]
+    movu    m3, [r4 + r7]
+    pavgw   m0, m1
+    pavgw   m2, m3
+    movu    [r0 + r1 * 2], m0
+    movu    [r0 + r8], m2
+
+    ; rows 2-3, second mmsize bytes
+    movu    m0, [r2 + r3 * 2 + mmsize]
+    movu    m1, [r4 + r5 * 2 + mmsize]
+    movu    m2, [r2 + r6 + mmsize]
+    movu    m3, [r4 + r7 + mmsize]
+    pavgw   m0, m1
+    pavgw   m2, m3
+    movu    [r0 + r1 * 2 + mmsize], m0
+    movu    [r0 + r8 + mmsize], m2
+%endmacro
+
+;-----------------------------------------------------------------------------
+; PROCESS_PIXELAVG_64x8_HBD_AVX512
+; Averages eight 64-sample rows as two 64x4 passes. Same register contract as
+; PROCESS_PIXELAVG_64x4_HBD_AVX512, except that r0, r2 and r4 are left pointing
+; at row 4 of the block, so callers add four more strides to reach the next one.
+;-----------------------------------------------------------------------------
+%macro PROCESS_PIXELAVG_64x8_HBD_AVX512 0
+    PROCESS_PIXELAVG_64x4_HBD_AVX512
+    lea     r0, [r0 + 4 * r1]
+    lea     r2, [r2 + 4 * r3]
+    lea     r4, [r4 + 4 * r5]
+    PROCESS_PIXELAVG_64x4_HBD_AVX512
+%endmacro
+
 %macro PIXEL_AVG_HBD_W32 1
 INIT_ZMM avx512
 cglobal pixel_avg_32x%1, 6,9,4
@@ -5701,6 +5766,42 @@
 PIXEL_AVG_HBD_W32 24
 PIXEL_AVG_HBD_W32 32
 PIXEL_AVG_HBD_W32 64
+
+;-----------------------------------------------------------------------------
+; pixel_avg_64xN, AVX-512 high bit depth: dst = pavgw(src0, src1) over N rows.
+; In:    r0 = dst, r1 = dst stride, r2 = src0, r3 = src0 stride,
+;        r4 = src1, r5 = src1 stride; instantiated for N = 16, 32, 48 and 64
+; Strides are doubled on entry, presumably converting a count of 16-bit samples
+; into bytes (TODO confirm against the C prototype).
+; NOTE(review): the 32-bit adds zero-extend, so a negative stride would be lost;
+; confirm callers never pass one.
+;-----------------------------------------------------------------------------
+%macro PIXEL_AVG_HBD_W64 1
+INIT_ZMM avx512
+cglobal pixel_avg_64x%1, 6,9,4
+    add     r1d, r1d
+    add     r3d, r3d
+    add     r5d, r5d
+    lea     r6, [r3 * 3]
+    lea     r7, [r5 * 3]
+    lea     r8, [r1 * 3]
+
+    ; every band but the last: average 8 rows, then finish the step to row 8
+%rep %1/8 - 1
+    PROCESS_PIXELAVG_64x8_HBD_AVX512
+    lea     r0, [r0 + 4 * r1]
+    lea     r2, [r2 + 4 * r3]
+    lea     r4, [r4 + 4 * r5]
+%endrep
+    ; final band: nothing follows, so the pointers are not advanced
+    PROCESS_PIXELAVG_64x8_HBD_AVX512
+    RET
+%endmacro
+
+PIXEL_AVG_HBD_W64 16
+PIXEL_AVG_HBD_W64 32
+PIXEL_AVG_HBD_W64 48
+PIXEL_AVG_HBD_W64 64
 ;-----------------------------------------------------------------------------
 ;pixel_avg_pp avx512 high bit depth code end
 ;-----------------------------------------------------------------------------
_______________________________________________
x265-devel mailing list
x265-devel@videolan.org
https://mailman.videolan.org/listinfo/x265-devel