# HG changeset patch # User Vignesh Vijayakumar # Date 1502186111 -19800 # Tue Aug 08 15:25:11 2017 +0530 # Node ID 7d7f2a4e771c7c2b573db9bc298d1a35bb72f32d # Parent ce93c1b1894ae7d789e451f65479f018ba90ec76 x86: AVX512 cpy1Dto2D_shl_32
Size | BitDepth | AVX2 performance | AVX512 performance ------------------------------------------------------- 32x32| 8 | 16.03x | 28.94x 32x32| 10 | 14.12x | 24.99x diff -r ce93c1b1894a -r 7d7f2a4e771c source/common/x86/asm-primitives.cpp --- a/source/common/x86/asm-primitives.cpp Wed Aug 02 14:11:31 2017 +0530 +++ b/source/common/x86/asm-primitives.cpp Tue Aug 08 15:25:11 2017 +0530 @@ -2311,6 +2311,8 @@ p.cu[BLOCK_16x16].cpy2Dto1D_shl = PFX(cpy2Dto1D_shl_16_avx512); p.cu[BLOCK_32x32].cpy2Dto1D_shl = PFX(cpy2Dto1D_shl_32_avx512); + p.cu[BLOCK_32x32].cpy1Dto2D_shl = PFX(cpy1Dto2D_shl_32_avx512); + } } #else // if HIGH_BIT_DEPTH @@ -3992,6 +3994,7 @@ p.cu[BLOCK_32x32].calcresidual = PFX(getResidual32_avx512); p.cu[BLOCK_16x16].cpy2Dto1D_shl = PFX(cpy2Dto1D_shl_16_avx512); p.cu[BLOCK_32x32].cpy2Dto1D_shl = PFX(cpy2Dto1D_shl_32_avx512); + p.cu[BLOCK_32x32].cpy1Dto2D_shl = PFX(cpy1Dto2D_shl_32_avx512); } #endif diff -r ce93c1b1894a -r 7d7f2a4e771c source/common/x86/blockcopy8.asm --- a/source/common/x86/blockcopy8.asm Wed Aug 02 14:11:31 2017 +0530 +++ b/source/common/x86/blockcopy8.asm Tue Aug 08 15:25:11 2017 +0530 @@ -5513,7 +5513,62 @@ jnz .loop RET - +;-------------------------------------------------------------------------------------- +; cpy_1Dto2D_shl avx512 code start +;-------------------------------------------------------------------------------------- +%macro PROCESS_CPY1Dto2D_SHL_32x8_AVX512 0 + movu m1, [r1 + 0 * mmsize] + movu m2, [r1 + 1 * mmsize] + movu m3, [r1 + 2 * mmsize] + movu m4, [r1 + 3 * mmsize] + psllw m1, xm0 + psllw m2, xm0 + psllw m3, xm0 + psllw m4, xm0 + movu [r0], m1 + movu [r0 + r2], m2 + movu [r0 + 2 * r2], m3 + movu [r0 + r3], m4 + + add r1, 4 * mmsize + lea r0, [r0 + r2 * 4] + + movu m1, [r1 + 0 * mmsize] + movu m2, [r1 + 1 * mmsize] + movu m3, [r1 + 2 * mmsize] + movu m4, [r1 + 3 * mmsize] + psllw m1, xm0 + psllw m2, xm0 + psllw m3, xm0 + psllw m4, xm0 + movu [r0], m1 + movu [r0 + r2], m2 + movu [r0 + 2 * r2], m3 + movu [r0 + 
r3], m4 +%endmacro +;-------------------------------------------------------------------------------------- +; void cpy1Dto2D_shl(int16_t* dst, const int16_t* src, intptr_t dstStride, int shift) +;-------------------------------------------------------------------------------------- +INIT_ZMM avx512 +cglobal cpy1Dto2D_shl_32, 4, 4, 5 + add r2d, r2d + movd xm0, r3d + lea r3, [3 * r2] + + PROCESS_CPY1Dto2D_SHL_32x8_AVX512 + add r1, 4 * mmsize + lea r0, [r0 + r2 * 4] + PROCESS_CPY1Dto2D_SHL_32x8_AVX512 + add r1, 4 * mmsize + lea r0, [r0 + r2 * 4] + PROCESS_CPY1Dto2D_SHL_32x8_AVX512 + add r1, 4 * mmsize + lea r0, [r0 + r2 * 4] + PROCESS_CPY1Dto2D_SHL_32x8_AVX512 + RET +;-------------------------------------------------------------------------------------- +; cpy_1Dto2D_shl avx512 code end +;-------------------------------------------------------------------------------------- ;-------------------------------------------------------------------------------------- ; uint32_t copy_cnt(int16_t* dst, const int16_t* src, intptr_t srcStride); ;-------------------------------------------------------------------------------------- diff -r ce93c1b1894a -r 7d7f2a4e771c source/common/x86/blockcopy8.h --- a/source/common/x86/blockcopy8.h Wed Aug 02 14:11:31 2017 +0530 +++ b/source/common/x86/blockcopy8.h Tue Aug 08 15:25:11 2017 +0530 @@ -37,6 +37,7 @@ FUNCDEF_TU_S(void, cpy1Dto2D_shl, sse2, int16_t* dst, const int16_t* src, intptr_t srcStride, int shift); FUNCDEF_TU_S(void, cpy1Dto2D_shl, sse4, int16_t* dst, const int16_t* src, intptr_t srcStride, int shift); FUNCDEF_TU_S(void, cpy1Dto2D_shl, avx2, int16_t* dst, const int16_t* src, intptr_t srcStride, int shift); +FUNCDEF_TU_S(void, cpy1Dto2D_shl, avx512, int16_t* dst, const int16_t* src, intptr_t srcStride, int shift); FUNCDEF_TU_S(void, cpy1Dto2D_shr, sse2, int16_t* dst, const int16_t* src, intptr_t srcStride, int shift); FUNCDEF_TU_S(void, cpy1Dto2D_shr, sse4, int16_t* dst, const int16_t* src, intptr_t srcStride, int shift); 
_______________________________________________ x265-devel mailing list x265-devel@videolan.org https://mailman.videolan.org/listinfo/x265-devel