# HG changeset patch
# User Vignesh Vijayakumar<vign...@multicorewareinc.com>
# Date 1512622521 -19800
#      Thu Dec 07 10:25:21 2017 +0530
# Node ID 9bd38bd06850914d1cbf617063ea0e1e60f66219
# Parent  2d298099a8d6b266a32b975de4b6a369988d3887
x86: AVX512 pixel_satd_64xN and 32xN for high bit depth
Size  | AVX2 performance | AVX512 performance
----------------------------------------------
32x8  |      10.99x      |      17.98x
32x16 |      12.18x      |      17.05x
32x24 |      13.11x      |      19.70x
32x32 |      13.21x      |      18.36x
32x64 |      13.27x      |      19.04x
64x16 |      12.36x      |      17.15x
64x32 |      11.63x      |      17.78x
64x48 |      12.00x      |      19.23x
64x64 |      12.12x      |      19.20x

diff -r 2d298099a8d6 -r 9bd38bd06850 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp	Wed Dec 06 10:53:15 2017 +0530
+++ b/source/common/x86/asm-primitives.cpp	Thu Dec 07 10:25:21 2017 +0530
@@ -3015,6 +3015,24 @@
         //Luma_hps_48x64
         p.pu[LUMA_48x64].luma_hps = PFX(interp_8tap_horiz_ps_48x64_avx512);
+        p.pu[LUMA_32x8].satd = PFX(pixel_satd_32x8_avx512);
+        p.pu[LUMA_32x16].satd = PFX(pixel_satd_32x16_avx512);
+        p.pu[LUMA_32x24].satd = PFX(pixel_satd_32x24_avx512);
+        p.pu[LUMA_32x32].satd = PFX(pixel_satd_32x32_avx512);
+        p.pu[LUMA_32x64].satd = PFX(pixel_satd_32x64_avx512);
+        p.pu[LUMA_64x16].satd = PFX(pixel_satd_64x16_avx512);
+        p.pu[LUMA_64x32].satd = PFX(pixel_satd_64x32_avx512);
+        p.pu[LUMA_64x48].satd = PFX(pixel_satd_64x48_avx512);
+        p.pu[LUMA_64x64].satd = PFX(pixel_satd_64x64_avx512);
+
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_32x32].satd = PFX(pixel_satd_32x32_avx512);
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_32x16].satd = PFX(pixel_satd_32x16_avx512);
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_32x24].satd = PFX(pixel_satd_32x24_avx512);
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_32x8].satd = PFX(pixel_satd_32x8_avx512);
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_32x64].satd = PFX(pixel_satd_32x64_avx512);
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_32x32].satd = PFX(pixel_satd_32x32_avx512);
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_32x16].satd = PFX(pixel_satd_32x16_avx512);
+
     }
 #endif
 }

diff -r 2d298099a8d6 -r 9bd38bd06850 source/common/x86/pixel-a.asm
--- a/source/common/x86/pixel-a.asm	Wed Dec 06 10:53:15 2017 +0530
+++ b/source/common/x86/pixel-a.asm	Thu Dec 07 10:25:21 2017 +0530
@@ -13958,6 +13958,192 @@
     paddd           xm6, xm7
     movd            eax, xm6
     RET
+
+%macro SATD_HBD_AVX512_END 0                        ; reduce the 16 dword partial sums in m6 to a single total in eax
+    vextracti32x8   ym7, m6, 1                      ; fold upper 256 bits into lower 256
+    paddd           ym6, ym7
+    vextracti128    xm7, ym6, 1                     ; fold upper 128 bits
+    paddd           xm6, xm7
+    pxor            xm7, xm7
+    movhlps         xm7, xm6                        ; fold upper 64 bits
+    paddd           xm6, xm7
+    pshufd          xm7, xm6, 1                     ; fold the remaining dword pair
+    paddd           xm6, xm7
+    movd            eax, xm6
+%endmacro
+
+%macro PROCESS_SATD_32x8_HBD_AVX512 0 ; function to compute satd cost for 32 columns, 8 rows
+    ; rows 0-3
+    movu            m0, [r0]                        ; one zmm = 32 high-bit-depth (16-bit) pixels
+    movu            m4, [r2]
+    psubw           m0, m4                          ; m0 = row0 residual (src - ref)
+    movu            m1, [r0 + r1]
+    movu            m5, [r2 + r3]
+    psubw           m1, m5                          ; m1 = row1 residual
+    movu            m2, [r0 + r1 * 2]
+    movu            m4, [r2 + r3 * 2]
+    psubw           m2, m4                          ; m2 = row2 residual
+    movu            m3, [r0 + r4]                   ; r4 = 3 * src stride (bytes)
+    movu            m5, [r2 + r5]                   ; r5 = 3 * ref stride (bytes)
+    psubw           m3, m5                          ; m3 = row3 residual
+    lea             r0, [r0 + r1 * 4]               ; advance both pointers by 4 rows
+    lea             r2, [r2 + r3 * 4]
+    paddw           m4, m0, m1                      ; vertical Hadamard butterfly (rows 0/1, 2/3)
+    psubw           m1, m0
+    paddw           m0, m2, m3
+    psubw           m3, m2
+    punpckhwd       m2, m4, m1                      ; interleave words: start of in-register transpose
+    punpcklwd       m4, m1
+    punpckhwd       m1, m0, m3
+    punpcklwd       m0, m3
+    paddw           m3, m4, m0                      ; second butterfly stage
+    psubw           m0, m4
+    paddw           m4, m2, m1
+    psubw           m1, m2
+    punpckhdq       m2, m3, m0                      ; interleave dwords
+    punpckldq       m3, m0
+    paddw           m0, m3, m2                      ; third butterfly stage
+    psubw           m2, m3
+    punpckhdq       m3, m4, m1
+    punpckldq       m4, m1
+    paddw           m1, m4, m3
+    psubw           m3, m4
+    punpckhqdq      m4, m0, m1                      ; interleave qwords for the last stage
+    punpcklqdq      m0, m1
+    pabsw           m0, m0
+    pabsw           m4, m4
+    pmaxsw          m0, m0, m4                      ; max-of-abs stands in for the final butterfly (|a+b|+|a-b| == 2*max(|a|,|b|), standard SATD idiom)
+    punpckhqdq      m1, m2, m3
+    punpcklqdq      m2, m3
+    pabsw           m2, m2
+    pabsw           m1, m1
+    pmaxsw          m2, m1
+    pxor            m7, m7                          ; widen word sums to dwords and accumulate into m6
+    mova            m1, m0
+    punpcklwd       m1, m7
+    paddd           m6, m1
+    mova            m1, m0
+    punpckhwd       m1, m7
+    paddd           m6, m1
+    pxor            m7, m7
+    mova            m1, m2
+    punpcklwd       m1, m7
+    paddd           m6, m1
+    mova            m1, m2
+    punpckhwd       m1, m7
+    paddd           m6, m1
+    ; rows 4-7 (same transform as rows 0-3)
+    movu            m0, [r0]
+    movu            m4, [r2]
+    psubw           m0, m4
+    movu            m1, [r0 + r1]
+    movu            m5, [r2 + r3]
+    psubw           m1, m5
+    movu            m2, [r0 + r1 * 2]
+    movu            m4, [r2 + r3 * 2]
+    psubw           m2, m4
+    movu            m3, [r0 + r4]
+    movu            m5, [r2 + r5]
+    psubw           m3, m5
+    lea             r0, [r0 + r1 * 4]               ; leave pointers at the next 8-row group for %rep callers
+    lea             r2, [r2 + r3 * 4]
+    paddw           m4, m0, m1
+    psubw           m1, m0
+    paddw           m0, m2, m3
+    psubw           m3, m2
+    punpckhwd       m2, m4, m1
+    punpcklwd       m4, m1
+    punpckhwd       m1, m0, m3
+    punpcklwd       m0, m3
+    paddw           m3, m4, m0
+    psubw           m0, m4
+    paddw           m4, m2, m1
+    psubw           m1, m2
+    punpckhdq       m2, m3, m0
+    punpckldq       m3, m0
+    paddw           m0, m3, m2
+    psubw           m2, m3
+    punpckhdq       m3, m4, m1
+    punpckldq       m4, m1
+    paddw           m1, m4, m3
+    psubw           m3, m4
+    punpckhqdq      m4, m0, m1
+    punpcklqdq      m0, m1
+    pabsw           m0, m0
+    pabsw           m4, m4
+    pmaxsw          m0, m0, m4
+    punpckhqdq      m1, m2, m3
+    punpcklqdq      m2, m3
+    pabsw           m2, m2
+    pabsw           m1, m1
+    pmaxsw          m2, m1
+    pxor            m7, m7
+    mova            m1, m0
+    punpcklwd       m1, m7
+    paddd           m6, m1
+    mova            m1, m0
+    punpckhwd       m1, m7
+    paddd           m6, m1
+    pxor            m7, m7
+    mova            m1, m2
+    punpcklwd       m1, m7
+    paddd           m6, m1
+    mova            m1, m2
+    punpckhwd       m1, m7
+    paddd           m6, m1
+%endmacro
+
+%macro SATD_32xN_HBD_AVX512 1                       ; int pixel_satd_32x%1(const pixel*, intptr_t, const pixel*, intptr_t)
+INIT_ZMM avx512
+cglobal pixel_satd_32x%1, 4,8,8
+    add             r1d, r1d                        ; strides arrive in pixels; double them for 16-bit samples
+    add             r3d, r3d
+    lea             r4, [3 * r1]                    ; precompute 3*stride for the row-3 loads
+    lea             r5, [3 * r3]
+    pxor            m6, m6                          ; m6 = running satd accumulator (dwords)
+    mov             r6, r0                          ; NOTE(review): r6/r7 are never read in the 32xN path; only the 64xN variant rereads the bases
+    mov             r7, r2
+
+%rep %1/8
+    PROCESS_SATD_32x8_HBD_AVX512
+%endrep
+    SATD_HBD_AVX512_END
+    RET
+%endmacro
+
+SATD_32xN_HBD_AVX512 8
+SATD_32xN_HBD_AVX512 16
+SATD_32xN_HBD_AVX512 24
+SATD_32xN_HBD_AVX512 32
+SATD_32xN_HBD_AVX512 64
+
+%macro SATD_64xN_HBD_AVX512 1                       ; 64-wide = two 32-wide passes (left half, then right half)
+INIT_ZMM avx512
+cglobal pixel_satd_64x%1, 4,8,8
+    add             r1d, r1d                        ; pixel strides -> byte strides
+    add             r3d, r3d
+    lea             r4, [3 * r1]
+    lea             r5, [3 * r3]
+    pxor            m6, m6                          ; single accumulator shared by both halves
+    mov             r6, r0                          ; keep base pointers: the second pass restarts from them
+    mov             r7, r2
+
+%rep %1/8
+    PROCESS_SATD_32x8_HBD_AVX512                    ; left 32 columns
+%endrep
+    lea             r0, [r6 + mmsize]               ; mmsize (64 bytes) = 32 HBD pixels to the right
+    lea             r2, [r7 + mmsize]
+%rep %1/8
+    PROCESS_SATD_32x8_HBD_AVX512                    ; right 32 columns
+%endrep
+    SATD_HBD_AVX512_END
+    RET
+%endmacro
+
+SATD_64xN_HBD_AVX512 16
+SATD_64xN_HBD_AVX512 32
+SATD_64xN_HBD_AVX512 48
+SATD_64xN_HBD_AVX512 64
 %endif ; ARCH_X86_64 == 1 && HIGH_BIT_DEPTH == 1
_______________________________________________
x265-devel mailing list
x265-devel@videolan.org
https://mailman.videolan.org/listinfo/x265-devel