# HG changeset patch # User Vignesh Vijayakumar<vign...@multicorewareinc.com> # Date 1512723573 -19800 # Fri Dec 08 14:29:33 2017 +0530 # Node ID ddd64f4b2ff382d05e86708750b20332ed93f3c9 # Parent fa954ed4a1e7ce2741f3cac14006f78c3199191b x86: AVX512 intra_pred_dc32 for high bit depth
AVX2 performance : 15.53x
AVX512 performance : 23.96x

diff -r fa954ed4a1e7 -r ddd64f4b2ff3 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp Fri Dec 08 12:12:43 2017 +0530
+++ b/source/common/x86/asm-primitives.cpp Fri Dec 08 14:29:33 2017 +0530
@@ -3053,6 +3053,7 @@
         p.chroma[X265_CSP_I422].pu[CHROMA_422_32x64].satd = PFX(pixel_satd_32x64_avx512);
         p.chroma[X265_CSP_I422].pu[CHROMA_422_32x32].satd = PFX(pixel_satd_32x32_avx512);
         p.chroma[X265_CSP_I422].pu[CHROMA_422_32x16].satd = PFX(pixel_satd_32x16_avx512);
+        p.cu[BLOCK_32x32].intra_pred[DC_IDX] = PFX(intra_pred_dc32_avx512);
     }
 #endif
 
diff -r fa954ed4a1e7 -r ddd64f4b2ff3 source/common/x86/intrapred.h
--- a/source/common/x86/intrapred.h Fri Dec 08 12:12:43 2017 +0530
+++ b/source/common/x86/intrapred.h Fri Dec 08 14:29:33 2017 +0530
@@ -76,7 +76,7 @@
 FUNCDEF_TU_S2(void, intra_pred_dc, sse2, pixel* dst, intptr_t dstStride, const pixel*srcPix, int, int filter);
 FUNCDEF_TU_S2(void, intra_pred_dc, sse4, pixel* dst, intptr_t dstStride, const pixel*srcPix, int, int filter);
 FUNCDEF_TU_S2(void, intra_pred_dc, avx2, pixel* dst, intptr_t dstStride, const pixel*srcPix, int, int filter);
-
+FUNCDEF_TU_S2(void, intra_pred_dc, avx512, pixel* dst, intptr_t dstStride, const pixel*srcPix, int, int filter);
 FUNCDEF_TU_S2(void, intra_pred_planar, sse2, pixel* dst, intptr_t dstStride, const pixel*srcPix, int, int filter);
 FUNCDEF_TU_S2(void, intra_pred_planar, sse4, pixel* dst, intptr_t dstStride, const pixel*srcPix, int, int filter);
 FUNCDEF_TU_S2(void, intra_pred_planar, avx2, pixel* dst, intptr_t dstStride, const pixel*srcPix, int, int filter);
diff -r fa954ed4a1e7 -r ddd64f4b2ff3 source/common/x86/intrapred16.asm
--- a/source/common/x86/intrapred16.asm Fri Dec 08 12:12:43 2017 +0530
+++ b/source/common/x86/intrapred16.asm Fri Dec 08 14:29:33 2017 +0530
@@ -688,6 +688,68 @@
     movu [r0 + r2 * 1 + 0], m0
     movu [r0 + r2 * 1 + mmsize], m0
     RET
+
+INIT_ZMM avx512                           ; 512-bit ZMM mode: mN are ZMM registers, mmsize = 64 bytes
+cglobal intra_pred_dc32, 3,3,17           ; DC intra prediction, 32x32; presumably r0 = dst, r1 = dstStride, r2 = srcPix per the intrapred.h prototype and x86inc argument mapping - TODO confirm
+    add r2, 2                             ; advance r2 by 2 bytes (presumably skips the top-left 16-bit sample - confirm against srcPix layout)
+    add r1d, r1d                          ; double the stride (presumably converts a pixel stride to bytes for 16-bit pixels)
+    movu m16, [r2]                        ; load mmsize bytes (32 words) of neighbour samples at r2
+    movu m1, [r2 + 2 * mmsize]            ; load a second 32-word run 2*mmsize bytes further on (presumably the left neighbours)
+    paddw m16, m1                         ; word-wise add of the two runs: 32 partial sums
+    vextracti32x8 ym1, m16, 1             ; take the upper 256 bits of the partial sums
+    paddw ym16, ym1                       ; fold 512 -> 256 bits: 16 word partial sums
+    vextracti32x4 xm1, m16, 1             ; take 128-bit lane 1
+    paddw xm16, xm1                       ; fold 256 -> 128 bits: 8 word partial sums
+    pmaddwd xm16, [pw_1]                  ; multiply words by pw_1 and add adjacent pairs -> 4 dword sums (pw_1 presumably all-ones words; defined elsewhere)
+    movhlps xm1, xm16                     ; copy the high 64 bits of xm16 into the low half of xm1
+    paddd xm16, xm1                       ; low two dwords now hold the 2 remaining partial sums
+    phaddd xm16, xm16                     ; horizontal add: low dword = total; NOTE(review): phaddd has no EVEX encoding, so this relies on how x86inc maps m16 in AVX-512 mode - confirm
+    paddd xm16, [pd_32]                   ; sum = sum + 32
+    psrld xm16, 6                         ; sum = sum / 64
+    vpbroadcastw m0, xm16                 ; replicate the low word (the DC value) into every word lane of m0
+
+    lea r2, [r1 * 3]                      ; r2 = 3 * stride; the source pointer is no longer needed
+    ; store DC 32x32
+    movu [r0 + r1 * 0 + 0], m0            ; each store writes one full ZMM (32 words) to a row
+    movu [r0 + r1 * 1 + 0], m0
+    movu [r0 + r1 * 2 + 0], m0
+    movu [r0 + r2 * 1 + 0], m0            ; row 3 uses the precomputed 3 * stride
+    lea r0, [r0 + r1 * 4]                 ; advance the destination by 4 rows; 8 such groups cover 32 rows
+    movu [r0 + r1 * 0 + 0], m0
+    movu [r0 + r1 * 1 + 0], m0
+    movu [r0 + r1 * 2 + 0], m0
+    movu [r0 + r2 * 1 + 0], m0
+    lea r0, [r0 + r1 * 4]
+    movu [r0 + r1 * 0 + 0], m0
+    movu [r0 + r1 * 1 + 0], m0
+    movu [r0 + r1 * 2 + 0], m0
+    movu [r0 + r2 * 1 + 0], m0
+    lea r0, [r0 + r1 * 4]
+    movu [r0 + r1 * 0 + 0], m0
+    movu [r0 + r1 * 1 + 0], m0
+    movu [r0 + r1 * 2 + 0], m0
+    movu [r0 + r2 * 1 + 0], m0
+    lea r0, [r0 + r1 * 4]
+    movu [r0 + r1 * 0 + 0], m0
+    movu [r0 + r1 * 1 + 0], m0
+    movu [r0 + r1 * 2 + 0], m0
+    movu [r0 + r2 * 1 + 0], m0
+    lea r0, [r0 + r1 * 4]
+    movu [r0 + r1 * 0 + 0], m0
+    movu [r0 + r1 * 1 + 0], m0
+    movu [r0 + r1 * 2 + 0], m0
+    movu [r0 + r2 * 1 + 0], m0
+    lea r0, [r0 + r1 * 4]
+    movu [r0 + r1 * 0 + 0], m0
+    movu [r0 + r1 * 1 + 0], m0
+    movu [r0 + r1 * 2 + 0], m0
+    movu [r0 + r2 * 1 + 0], m0
+    lea r0, [r0 + r1 * 4]
+    movu [r0 + r1 * 0 + 0], m0
+    movu [r0 + r1 * 1 + 0], m0
+    movu [r0 + r1 * 2 + 0], m0
+    movu [r0 + r2 * 1 + 0], m0            ; last of the 32 rows
+    RET
 %endif
 
 ;---------------------------------------------------------------------------------------
_______________________________________________
x265-devel mailing list
x265-devel@videolan.org
https://mailman.videolan.org/listinfo/x265-devel