# HG changeset patch # User Vignesh Vijayakumar # Date 1501589225 -19800 # Tue Aug 01 17:37:05 2017 +0530 # Node ID aac415b7223acced7fc844c4a07225704b811df0 # Parent ad756cf6d35f0d1460c5a079bea8781ffd67b7c7 x86: AVX512 addAvg_48x64 for high bit depth
AVX2 performance: 10.61x AVX512 performance: 13.18x diff -r ad756cf6d35f -r aac415b7223a source/common/x86/asm-primitives.cpp --- a/source/common/x86/asm-primitives.cpp Mon Aug 07 16:30:18 2017 +0530 +++ b/source/common/x86/asm-primitives.cpp Tue Aug 01 17:37:05 2017 +0530 @@ -2276,6 +2276,7 @@ p.pu[LUMA_32x24].addAvg = PFX(addAvg_32x24_avx512); p.pu[LUMA_32x32].addAvg = PFX(addAvg_32x32_avx512); p.pu[LUMA_32x64].addAvg = PFX(addAvg_32x64_avx512); + p.pu[LUMA_48x64].addAvg = PFX(addAvg_48x64_avx512); p.chroma[X265_CSP_I420].pu[CHROMA_420_32x8].addAvg = PFX(addAvg_32x8_avx512); p.chroma[X265_CSP_I420].pu[CHROMA_420_32x16].addAvg = PFX(addAvg_32x16_avx512); p.chroma[X265_CSP_I420].pu[CHROMA_420_32x24].addAvg = PFX(addAvg_32x24_avx512); diff -r ad756cf6d35f -r aac415b7223a source/common/x86/mc-a.asm --- a/source/common/x86/mc-a.asm Mon Aug 07 16:30:18 2017 +0530 +++ b/source/common/x86/mc-a.asm Tue Aug 01 17:37:05 2017 +0530 @@ -1812,6 +1812,79 @@ movu [r2 + r8 + mmsize], m0 %endmacro +%macro PROCESS_ADDAVG_48x4_HBD_AVX512 0 + movu m0, [r0] + movu m1, [r1] + paddw m0, m1 + pmulhrsw m0, m3 + paddw m0, m4 + pmaxsw m0, m2 + pminsw m0, m5 + movu [r2], m0 + + movu ym0, [r0 + mmsize] + movu ym1, [r1 + mmsize] + paddw ym0, ym1 + pmulhrsw ym0, ym3 + paddw ym0, ym4 + pmaxsw ym0, ym2 + pminsw ym0, ym5 + movu [r2 + mmsize], ym0 + + movu m0, [r0 + r3] + movu m1, [r1 + r4] + paddw m0, m1 + pmulhrsw m0, m3 + paddw m0, m4 + pmaxsw m0, m2 + pminsw m0, m5 + movu [r2 + r5], m0 + + movu ym0, [r0 + r3 + mmsize] + movu ym1, [r1 + r4 + mmsize] + paddw ym0, ym1 + pmulhrsw ym0, ym3 + paddw ym0, ym4 + pmaxsw ym0, ym2 + pminsw ym0, ym5 + movu [r2 + r5 + mmsize], ym0 + + movu m0, [r0 + 2 * r3] + movu m1, [r1 + 2 * r4] + paddw m0, m1 + pmulhrsw m0, m3 + paddw m0, m4 + pmaxsw m0, m2 + pminsw m0, m5 + movu [r2 + 2 * r5], m0 + + movu ym0, [r0 + 2 * r3 + mmsize] + movu ym1, [r1 + 2 * r4 + mmsize] + paddw ym0, ym1 + pmulhrsw ym0, ym3 + paddw ym0, ym4 + pmaxsw ym0, ym2 + pminsw ym0, ym5 + movu [r2 + 2 * r5 + mmsize], ym0 + + movu m0, [r0 + r6] + movu m1, [r1 + r7] + paddw m0, m1 + pmulhrsw m0, m3 + paddw m0, m4 + pmaxsw m0, m2 + pminsw m0, m5 + movu [r2 + r8], m0 + + movu ym0, [r0 + r6 + mmsize] + movu ym1, [r1 + r7 + mmsize] + paddw ym0, ym1 + pmulhrsw ym0, ym3 + paddw ym0, ym4 + pmaxsw ym0, ym2 + pminsw ym0, ym5 + movu [r2 + r8 + mmsize], ym0 +%endmacro ;----------------------------------------------------------------------------- ;void addAvg (int16_t* src0, int16_t* src1, pixel* dst, intptr_t src0Stride, intptr_t src1Stride, intptr_t dstStride) ;----------------------------------------------------------------------------- @@ -1874,6 +1947,28 @@ ADDAVG_W64_HBD_AVX512 32 ADDAVG_W64_HBD_AVX512 48 ADDAVG_W64_HBD_AVX512 64 + +INIT_ZMM avx512 +cglobal addAvg_48x64, 6,9,6 + vbroadcasti32x8 m4, [pw_ %+ ADDAVG_ROUND] + vbroadcasti32x8 m5, [pw_pixel_max] + vbroadcasti32x8 m3, [pw_ %+ ADDAVG_FACTOR] + pxor m2, m2 + add r3, r3 + add r4, r4 + add r5, r5 + lea r6, [3 * r3] + lea r7, [3 * r4] + lea r8, [3 * r5] + +%rep 15 + PROCESS_ADDAVG_48x4_HBD_AVX512 + lea r2, [r2 + 4 * r5] + lea r0, [r0 + 4 * r3] + lea r1, [r1 + 4 * r4] +%endrep + PROCESS_ADDAVG_48x4_HBD_AVX512 + RET ;----------------------------------------------------------------------------- ;addAvg avx512 high bit depth code end ;----------------------------------------------------------------------------- _______________________________________________ x265-devel mailing list x265-devel@videolan.org https://mailman.videolan.org/listinfo/x265-devel