# HG changeset patch # User Vignesh Vijayakumar # Date 1503557407 -19800 # Thu Aug 24 12:20:07 2017 +0530 # Node ID 0355f0128b7d713c4a21c91d3cc5bed1e8b43c47 # Parent 31a180bcef33fae436ad7e3aa4378b283a86d56a x86: AVX512 copy_cnt_32 and copy_cnt_16
Size | BitDepth | AVX2 performance | AVX512 performance ------------------------------------------------------- 16x16| 8 | 6.92x | 8.07x 16x16| 10 | 6.72x | 7.75x 32x32| 8 | 6.08x | 10.33x 32x32| 10 | 6.04x | 10.16x diff -r 31a180bcef33 -r 0355f0128b7d source/common/x86/asm-primitives.cpp --- a/source/common/x86/asm-primitives.cpp Wed Aug 23 10:06:01 2017 +0530 +++ b/source/common/x86/asm-primitives.cpp Thu Aug 24 12:20:07 2017 +0530 @@ -2342,6 +2342,9 @@ p.cu[BLOCK_32x32].cpy1Dto2D_shl = PFX(cpy1Dto2D_shl_32_avx512); p.weight_pp = PFX(weight_pp_avx512); + p.cu[BLOCK_32x32].copy_cnt = PFX(copy_cnt_32_avx512); + p.cu[BLOCK_16x16].copy_cnt = PFX(copy_cnt_16_avx512); + } } #else // if HIGH_BIT_DEPTH @@ -4054,6 +4057,9 @@ p.cu[BLOCK_32x32].cpy2Dto1D_shl = PFX(cpy2Dto1D_shl_32_avx512); p.cu[BLOCK_32x32].cpy1Dto2D_shl = PFX(cpy1Dto2D_shl_32_avx512); + p.cu[BLOCK_32x32].copy_cnt = PFX(copy_cnt_32_avx512); + p.cu[BLOCK_16x16].copy_cnt = PFX(copy_cnt_16_avx512); + //i444 chroma_hpp p.chroma[X265_CSP_I444].pu[LUMA_64x64].filter_hpp = PFX(interp_4tap_horiz_pp_64x64_avx512); p.chroma[X265_CSP_I444].pu[LUMA_64x32].filter_hpp = PFX(interp_4tap_horiz_pp_64x32_avx512); diff -r 31a180bcef33 -r 0355f0128b7d source/common/x86/blockcopy8.asm --- a/source/common/x86/blockcopy8.asm Wed Aug 23 10:06:01 2017 +0530 +++ b/source/common/x86/blockcopy8.asm Thu Aug 24 12:20:07 2017 +0530 @@ -5958,7 +5958,91 @@ movd eax, xm4 RET - +;-------------------------------------------------------------------------------------- +; copy_cnt avx512 code start +;-------------------------------------------------------------------------------------- +%macro PROCESS_COPY_CNT_32x4_AVX512 0 + movu m0, [r1] + movu m1, [r1 + r2] + movu [r0], m0 + movu [r0 + mmsize], m1 + packsswb m0, m1 + pminub m0, m3 + + movu m1, [r1 + 2 * r2] + movu m2, [r1 + r3] + movu [r0 + 2 * mmsize], m1 + movu [r0 + 3 * mmsize], m2 + packsswb m1, m2 + pminub m1, m3 + + paddb m0, m1 + paddb m4, m0 +%endmacro + +%macro 
PROCESS_COPY_CNT_16x4_AVX512 0 + movu ym0, [r1] + vinserti32x8 m0, [r1 + r2], 1 + movu ym1, [r1 + 2 * r2] + vinserti32x8 m1, [r1 + r3], 1 + movu [r0], m0 + movu [r0 + mmsize], m1 + packsswb m0, m1 + pminub m0, m3 + paddb m4, m0 +%endmacro + +%macro PROCESS_COPY_CNT_END_AVX512 0 + pxor m0, m0 + vextracti32x8 ym1, m4, 1 + paddb ym4, ym1 + vextracti32x4 xm1, ym4, 1 + paddb xm4, xm1 + psadbw xm4, xm0 + movhlps xm1, xm4 + paddd xm4, xm1 + movd eax, xm4 +%endmacro + +;-------------------------------------------------------------------------------------- +; uint32_t copy_cnt(int16_t* dst, const int16_t* src, intptr_t stride); +;-------------------------------------------------------------------------------------- +INIT_ZMM avx512 +cglobal copy_cnt_32, 3, 4, 5 + add r2d, r2d + lea r3, [3 * r2] + + vbroadcasti32x8 m3, [pb_1] + pxor m4, m4 + +%rep 7 + PROCESS_COPY_CNT_32x4_AVX512 + add r0, 4 * mmsize + lea r1, [r1 + 4 * r2] +%endrep + PROCESS_COPY_CNT_32x4_AVX512 + PROCESS_COPY_CNT_END_AVX512 + RET + +INIT_ZMM avx512 +cglobal copy_cnt_16, 3, 4, 5 + add r2d, r2d + lea r3, [3 * r2] + + vbroadcasti32x8 m3, [pb_1] + pxor m4, m4 + +%rep 3 + PROCESS_COPY_CNT_16x4_AVX512 + add r0, 2 * mmsize + lea r1, [r1 + 4 * r2] +%endrep + PROCESS_COPY_CNT_16x4_AVX512 + PROCESS_COPY_CNT_END_AVX512 + RET +;-------------------------------------------------------------------------------------- +; copy_cnt avx512 code end +;-------------------------------------------------------------------------------------- ;-------------------------------------------------------------------------------------- ; void cpy2Dto1D_shl(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift); ;-------------------------------------------------------------------------------------- diff -r 31a180bcef33 -r 0355f0128b7d source/common/x86/blockcopy8.h --- a/source/common/x86/blockcopy8.h Wed Aug 23 10:06:01 2017 +0530 +++ b/source/common/x86/blockcopy8.h Thu Aug 24 12:20:07 2017 +0530 @@ -46,6 +46,7 @@ 
FUNCDEF_TU_S(uint32_t, copy_cnt, sse2, int16_t* dst, const int16_t* src, intptr_t srcStride); FUNCDEF_TU_S(uint32_t, copy_cnt, sse4, int16_t* dst, const int16_t* src, intptr_t srcStride); FUNCDEF_TU_S(uint32_t, copy_cnt, avx2, int16_t* dst, const int16_t* src, intptr_t srcStride); +FUNCDEF_TU_S(uint32_t, copy_cnt, avx512, int16_t* dst, const int16_t* src, intptr_t srcStride); FUNCDEF_TU(void, blockfill_s, sse2, int16_t* dst, intptr_t dstride, int16_t val); FUNCDEF_TU(void, blockfill_s, avx2, int16_t* dst, intptr_t dstride, int16_t val); _______________________________________________ x265-devel mailing list x265-devel@videolan.org https://mailman.videolan.org/listinfo/x265-devel