# HG changeset patch # User Vignesh Vijayakumar # Date 1503462961 -19800 # Wed Aug 23 10:06:01 2017 +0530 # Node ID 31a180bcef33fae436ad7e3aa4378b283a86d56a # Parent 7868f1cb521d554dc77d768ec1f838e0b29824e4 x86: AVX512 copy_pp_32xN
Size | AVX2 performance | AVX512 performance
----------------------------------------------
32x16 | 1.63x | 2.58x
32x24 | 2.51x | 2.87x
32x32 | 2.48x | 2.95x
32x64 | 2.03x | 2.53x

This patch also cleans up the code for 64xN.

diff -r 7868f1cb521d -r 31a180bcef33 source/common/x86/asm-primitives.cpp --- a/source/common/x86/asm-primitives.cpp Tue Aug 22 13:51:33 2017 +0530 +++ b/source/common/x86/asm-primitives.cpp Wed Aug 23 10:06:01 2017 +0530 @@ -3965,6 +3965,18 @@ p.pu[LUMA_64x32].copy_pp = PFX(blockcopy_pp_64x32_avx512); p.pu[LUMA_64x48].copy_pp = PFX(blockcopy_pp_64x48_avx512); p.pu[LUMA_64x16].copy_pp = PFX(blockcopy_pp_64x16_avx512); + p.pu[LUMA_32x16].copy_pp = PFX(blockcopy_pp_32x16_avx512); + p.pu[LUMA_32x24].copy_pp = PFX(blockcopy_pp_32x24_avx512); + p.pu[LUMA_32x32].copy_pp = PFX(blockcopy_pp_32x32_avx512); + p.pu[LUMA_32x64].copy_pp = PFX(blockcopy_pp_32x64_avx512); + + p.chroma[X265_CSP_I420].pu[CHROMA_420_32x16].copy_pp = PFX(blockcopy_pp_32x16_avx512); + p.chroma[X265_CSP_I420].pu[CHROMA_420_32x24].copy_pp = PFX(blockcopy_pp_32x24_avx512); + p.chroma[X265_CSP_I420].pu[CHROMA_420_32x32].copy_pp = PFX(blockcopy_pp_32x32_avx512); + p.chroma[X265_CSP_I422].pu[CHROMA_422_32x16].copy_pp = PFX(blockcopy_pp_32x16_avx512); + p.chroma[X265_CSP_I422].pu[CHROMA_422_32x32].copy_pp = PFX(blockcopy_pp_32x32_avx512); + p.chroma[X265_CSP_I422].pu[CHROMA_422_32x48].copy_pp = PFX(blockcopy_pp_32x48_avx512); + p.chroma[X265_CSP_I422].pu[CHROMA_422_32x64].copy_pp = PFX(blockcopy_pp_32x64_avx512); p.cu[BLOCK_64x64].copy_sp = PFX(blockcopy_sp_64x64_avx512); p.cu[BLOCK_32x32].copy_sp = PFX(blockcopy_sp_32x32_avx512); diff -r 7868f1cb521d -r 31a180bcef33 source/common/x86/blockcopy8.asm --- a/source/common/x86/blockcopy8.asm Tue Aug 22 13:51:33 2017 +0530 +++ b/source/common/x86/blockcopy8.asm Wed Aug 23 10:06:01 2017 +0530 @@ -1107,7 +1107,7 @@ BLOCKCOPY_PP_W64_H4_avx 64, 64 ;---------------------------------------------------------------------------------------------- -; Macro
to calculate blockcopy_pp_64x4_avx512 +; blockcopy_pp avx512 code start ;---------------------------------------------------------------------------------------------- %macro PROCESS_BLOCKCOPY_PP_64X4_avx512 0 movu m0, [r2] @@ -1121,16 +1121,28 @@ movu [r0 + r5] , m3 %endmacro +%macro PROCESS_BLOCKCOPY_PP_32X4_avx512 0 +movu ym0, [r2] +vinserti32x8 m0, [r2 + r3], 1 +movu ym1, [r2 + 2 * r3] +vinserti32x8 m1, [r2 + r4], 1 + +movu [r0] , ym0 +vextracti32x8 [r0 + r1] , m0, 1 +movu [r0 + 2 * r1] , ym1 +vextracti32x8 [r0 + r5] , m1, 1 +%endmacro + ;---------------------------------------------------------------------------------------------- ; void blockcopy_pp_64x%1(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride) ;---------------------------------------------------------------------------------------------- %macro BLOCKCOPY_PP_W64_H4_avx512 1 INIT_ZMM avx512 -cglobal blockcopy_pp_64x%1, 4, 4, 6 +cglobal blockcopy_pp_64x%1, 4, 6, 4 lea r4, [3 * r3] lea r5, [3 * r1] -%rep %1/4 - 1 +%rep %1/4 - 1 PROCESS_BLOCKCOPY_PP_64X4_avx512 lea r2, [r2 + 4 * r3] lea r0, [r0 + 4 * r1] @@ -1145,7 +1157,30 @@ BLOCKCOPY_PP_W64_H4_avx512 48 BLOCKCOPY_PP_W64_H4_avx512 64 - +%macro BLOCKCOPY_PP_W32_H4_avx512 1 +INIT_ZMM avx512 +cglobal blockcopy_pp_32x%1, 4, 6, 2 + lea r4, [3 * r3] + lea r5, [3 * r1] + +%rep %1/4 - 1 + PROCESS_BLOCKCOPY_PP_32X4_avx512 + lea r2, [r2 + 4 * r3] + lea r0, [r0 + 4 * r1] +%endrep + PROCESS_BLOCKCOPY_PP_32X4_avx512 + RET +%endmacro + +BLOCKCOPY_PP_W32_H4_avx512 8 +BLOCKCOPY_PP_W32_H4_avx512 16 +BLOCKCOPY_PP_W32_H4_avx512 24 +BLOCKCOPY_PP_W32_H4_avx512 32 +BLOCKCOPY_PP_W32_H4_avx512 48 +BLOCKCOPY_PP_W32_H4_avx512 64 +;---------------------------------------------------------------------------------------------- +; blockcopy_pp avx512 code end +;---------------------------------------------------------------------------------------------- ;----------------------------------------------------------------------------- ; void 
blockcopy_sp_2x4(pixel* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride) _______________________________________________ x265-devel mailing list x265-devel@videolan.org https://mailman.videolan.org/listinfo/x265-devel