# HG changeset patch
# User Vignesh Vijayakumar<vign...@multicorewareinc.com>
# Date 1512648175 -19800
#      Thu Dec 07 17:32:55 2017 +0530
# Node ID 86d3d34de566d7696028b5e798a79b9de3a6e62b
# Parent  617aa7cf2c76368cb8a3b252175c1b3d6f716915
x86: pixel_satd_16xN for high bit depth
Size  | AVX2 performance | AVX512 performance
----------------------------------------------
16x8  |      9.62x       |      14.03x
16x16 |     12.07x       |      13.57x
16x32 |     12.82x       |      16.03x
16x64 |     12.92x       |      15.76x

This patch also cleans up the existing satd AVX512 code: the 8-bit-only
kernels in pixel-a.asm are now guarded by %if HIGH_BIT_DEPTH==0.
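For context, satd is the sum of absolute transformed differences: the
residual is put through a 4x4 Hadamard transform and the absolute
coefficients are summed over the whole partition. Below is a minimal scalar
sketch of that definition, assuming the usual x264-style ">> 1"
normalization of each 4x4 sum; the helper names are illustrative, this is
not the x265 C reference itself.

    #include <cstdint>
    #include <cstdlib>

    // satd of one 4x4 block: 2-D Hadamard transform of the residual, then
    // the sum of absolute coefficients, halved (x264-derived normalization).
    static int satd_4x4_ref(const uint16_t* pix1, intptr_t stride1,
                            const uint16_t* pix2, intptr_t stride2)
    {
        int d[4][4];
        for (int i = 0; i < 4; i++)
            for (int j = 0; j < 4; j++)
                d[i][j] = (int)pix1[i * stride1 + j] - (int)pix2[i * stride2 + j];

        for (int i = 0; i < 4; i++)   // horizontal 4-point Hadamard per row
        {
            int t0 = d[i][0] + d[i][1], t1 = d[i][0] - d[i][1];
            int t2 = d[i][2] + d[i][3], t3 = d[i][2] - d[i][3];
            d[i][0] = t0 + t2; d[i][2] = t0 - t2;
            d[i][1] = t1 + t3; d[i][3] = t1 - t3;
        }

        int sum = 0;
        for (int j = 0; j < 4; j++)   // vertical pass, summing |coefficients|
        {
            int t0 = d[0][j] + d[1][j], t1 = d[0][j] - d[1][j];
            int t2 = d[2][j] + d[3][j], t3 = d[2][j] - d[3][j];
            sum += std::abs(t0 + t2) + std::abs(t0 - t2)
                 + std::abs(t1 + t3) + std::abs(t1 - t3);
        }
        return sum >> 1;
    }

    // A 16xN (or 32xN) satd is the 4x4 cost summed over the partition.
    static int satd_MxN_ref(const uint16_t* pix1, intptr_t stride1,
                            const uint16_t* pix2, intptr_t stride2, int w, int h)
    {
        int sum = 0;
        for (int y = 0; y < h; y += 4)
            for (int x = 0; x < w; x += 4)
                sum += satd_4x4_ref(pix1 + y * stride1 + x, stride1,
                                    pix2 + y * stride2 + x, stride2);
        return sum;
    }

The AVX512 kernels in this patch compute the same result, but transform
many 4x4 blocks per instruction on 512-bit registers.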
diff -r 617aa7cf2c76 -r 86d3d34de566 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp	Fri Dec 08 14:12:55 2017 +0530
+++ b/source/common/x86/asm-primitives.cpp	Thu Dec 07 17:32:55 2017 +0530
@@ -3026,7 +3026,10 @@
         p.pu[LUMA_16x64].luma_hps = PFX(interp_8tap_horiz_ps_16x64_avx512);
         //Luma_hps_48x64
         p.pu[LUMA_48x64].luma_hps = PFX(interp_8tap_horiz_ps_48x64_avx512);
-
+        p.pu[LUMA_16x8].satd = PFX(pixel_satd_16x8_avx512);
+        p.pu[LUMA_16x16].satd = PFX(pixel_satd_16x16_avx512);
+        p.pu[LUMA_16x32].satd = PFX(pixel_satd_16x32_avx512);
+        p.pu[LUMA_16x64].satd = PFX(pixel_satd_16x64_avx512);
         p.pu[LUMA_32x8].satd = PFX(pixel_satd_32x8_avx512);
         p.pu[LUMA_32x16].satd = PFX(pixel_satd_32x16_avx512);
         p.pu[LUMA_32x24].satd = PFX(pixel_satd_32x24_avx512);
@@ -3036,11 +3039,17 @@
         p.pu[LUMA_64x32].satd = PFX(pixel_satd_64x32_avx512);
         p.pu[LUMA_64x48].satd = PFX(pixel_satd_64x48_avx512);
         p.pu[LUMA_64x64].satd = PFX(pixel_satd_64x64_avx512);
-
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_16x32].satd = PFX(pixel_satd_16x32_avx512);
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_16x16].satd = PFX(pixel_satd_16x16_avx512);
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_16x8].satd = PFX(pixel_satd_16x8_avx512);
         p.chroma[X265_CSP_I420].pu[CHROMA_420_32x32].satd = PFX(pixel_satd_32x32_avx512);
         p.chroma[X265_CSP_I420].pu[CHROMA_420_32x16].satd = PFX(pixel_satd_32x16_avx512);
         p.chroma[X265_CSP_I420].pu[CHROMA_420_32x24].satd = PFX(pixel_satd_32x24_avx512);
         p.chroma[X265_CSP_I420].pu[CHROMA_420_32x8].satd = PFX(pixel_satd_32x8_avx512);
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_16x64].satd = PFX(pixel_satd_16x64_avx512);
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_16x32].satd = PFX(pixel_satd_16x32_avx512);
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_16x16].satd = PFX(pixel_satd_16x16_avx512);
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_16x8].satd = PFX(pixel_satd_16x8_avx512);
         p.chroma[X265_CSP_I422].pu[CHROMA_422_32x64].satd = PFX(pixel_satd_32x64_avx512);
         p.chroma[X265_CSP_I422].pu[CHROMA_422_32x32].satd = PFX(pixel_satd_32x32_avx512);
         p.chroma[X265_CSP_I422].pu[CHROMA_422_32x16].satd = PFX(pixel_satd_32x16_avx512);
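The pixel-a.asm changes below add PROCESS_SATD_16x8_HBD_AVX512, which keeps
the 512-bit lanes full even though a 16-pixel high-bit-depth row is only
256 bits wide: rows 0-3 go in the low 256-bit half of each ZMM register and
rows 4-7 (via the r6/r7 pointers, four rows ahead) in the high half, so one
Hadamard butterfly sequence transforms both 16x4 halves at once. Roughly,
in intrinsics (hypothetical helper for illustration only;
_mm512_inserti64x4 moves the same bits as the patch's vinserti32x8):

    #include <immintrin.h>
    #include <cstdint>

    // Illustrative counterpart of "movu ym0, [r0]" / "vinserti32x8 m0, [r6], 1"
    // / "psubw m0, m4": row 'row' lands in the low half of a ZMM register,
    // row 'row + 4' in the high half, and a single psubw forms both rows'
    // residuals at once.
    static inline __m512i hbd_diff_two_rows(const uint16_t* pix1, intptr_t stride1,
                                            const uint16_t* pix2, intptr_t stride2,
                                            int row)
    {
        // 16 pixels x 16 bits = one 256-bit row.
        __m256i a_lo = _mm256_loadu_si256((const __m256i*)(pix1 + row * stride1));
        __m256i b_lo = _mm256_loadu_si256((const __m256i*)(pix2 + row * stride2));
        __m256i a_hi = _mm256_loadu_si256((const __m256i*)(pix1 + (row + 4) * stride1));
        __m256i b_hi = _mm256_loadu_si256((const __m256i*)(pix2 + (row + 4) * stride2));

        __m512i a = _mm512_inserti64x4(_mm512_castsi256_si512(a_lo), a_hi, 1);
        __m512i b = _mm512_inserti64x4(_mm512_castsi256_si512(b_lo), b_hi, 1);

        return _mm512_sub_epi16(a, b);   // 10-bit inputs cannot overflow int16
    }

This is also why the SATD_16xN wrappers are declared with eight GPRs
(cglobal pixel_satd_16x%1, 4,8,8): r6/r7 hold the rows 4-7 pointers and the
loop re-bases r0/r2 from them. For 32xN, one row already fills a ZMM
register, so the old 32x8 body is simply split into the reusable
PROCESS_SATD_32x4_HBD_AVX512 step.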
diff -r 617aa7cf2c76 -r 86d3d34de566 source/common/x86/pixel-a.asm
--- a/source/common/x86/pixel-a.asm	Fri Dec 08 14:12:55 2017 +0530
+++ b/source/common/x86/pixel-a.asm	Thu Dec 07 17:32:55 2017 +0530
@@ -8227,7 +8227,7 @@
     pmaxsw     m%1, m%3
     pmaxsw     m%2, m%4
 %endmacro
-
+%if HIGH_BIT_DEPTH==0
 INIT_ZMM avx512
 cglobal pixel_satd_16x8_internal
     vbroadcasti64x4 m6, [hmul_16p]
@@ -8381,7 +8381,7 @@
     SUMSUB_BA  w, 0, 1, 2
     HMAXABSW2  0, 1, 2, 3
     SATD_AVX512_END 1
-
+%endif
 ; Input 10bit, Output 8bit
 ;------------------------------------------------------------------------------------------------------------------------
 ;void planecopy_sc(uint16_t *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int width, int height, int shift, uint16_t mask)
@@ -13971,82 +13971,31 @@
     paddd           xm6, xm7
     movd            eax, xm6
 %endmacro
-
-%macro PROCESS_SATD_32x8_HBD_AVX512 0        ; function to compute satd cost for 32 columns, 8 rows
+%macro PROCESS_SATD_16x8_HBD_AVX512 0        ; function to compute satd cost for 16 columns, 8 rows
     ; rows 0-3
-    movu            m0, [r0]
-    movu            m4, [r2]
+    lea             r6, [r0 + r1 * 4]
+    lea             r7, [r2 + r3 * 4]
+    movu            ym0, [r0]
+    movu            ym4, [r2]
+    vinserti32x8    m0, [r6], 1
+    vinserti32x8    m4, [r7], 1
     psubw           m0, m4
-    movu            m1, [r0 + r1]
-    movu            m5, [r2 + r3]
+    movu            ym1, [r0 + r1]
+    movu            ym5, [r2 + r3]
+    vinserti32x8    m1, [r6 + r1], 1
+    vinserti32x8    m5, [r7 + r3], 1
     psubw           m1, m5
-    movu            m2, [r0 + r1 * 2]
-    movu            m4, [r2 + r3 * 2]
+    movu            ym2, [r0 + r1 * 2]
+    movu            ym4, [r2 + r3 * 2]
+    vinserti32x8    m2, [r6 + r1 * 2], 1
+    vinserti32x8    m4, [r7 + r3 * 2], 1
     psubw           m2, m4
-    movu            m3, [r0 + r4]
-    movu            m5, [r2 + r5]
+    movu            ym3, [r0 + r4]
+    movu            ym5, [r2 + r5]
+    vinserti32x8    m3, [r6 + r4], 1
+    vinserti32x8    m5, [r7 + r5], 1
     psubw           m3, m5
-    lea             r0, [r0 + r1 * 4]
-    lea             r2, [r2 + r3 * 4]
-    paddw           m4, m0, m1
-    psubw           m1, m0
-    paddw           m0, m2, m3
-    psubw           m3, m2
-    punpckhwd       m2, m4, m1
-    punpcklwd       m4, m1
-    punpckhwd       m1, m0, m3
-    punpcklwd       m0, m3
-    paddw           m3, m4, m0
-    psubw           m0, m4
-    paddw           m4, m2, m1
-    psubw           m1, m2
-    punpckhdq       m2, m3, m0
-    punpckldq       m3, m0
-    paddw           m0, m3, m2
-    psubw           m2, m3
-    punpckhdq       m3, m4, m1
-    punpckldq       m4, m1
-    paddw           m1, m4, m3
-    psubw           m3, m4
-    punpckhqdq      m4, m0, m1
-    punpcklqdq      m0, m1
-    pabsw           m0, m0
-    pabsw           m4, m4
-    pmaxsw          m0, m0, m4
-    punpckhqdq      m1, m2, m3
-    punpcklqdq      m2, m3
-    pabsw           m2, m2
-    pabsw           m1, m1
-    pmaxsw          m2, m1
-    pxor            m7, m7
-    mova            m1, m0
-    punpcklwd       m1, m7
-    paddd           m6, m1
-    mova            m1, m0
-    punpckhwd       m1, m7
-    paddd           m6, m1
-    pxor            m7, m7
-    mova            m1, m2
-    punpcklwd       m1, m7
-    paddd           m6, m1
-    mova            m1, m2
-    punpckhwd       m1, m7
-    paddd           m6, m1
-    ; rows 4-7
-    movu            m0, [r0]
-    movu            m4, [r2]
-    psubw           m0, m4
-    movu            m1, [r0 + r1]
-    movu            m5, [r2 + r3]
-    psubw           m1, m5
-    movu            m2, [r0 + r1 * 2]
-    movu            m4, [r2 + r3 * 2]
-    psubw           m2, m4
-    movu            m3, [r0 + r4]
-    movu            m5, [r2 + r5]
-    psubw           m3, m5
-    lea             r0, [r0 + r1 * 4]
-    lea             r2, [r2 + r3 * 4]
+
     paddw           m4, m0, m1
     psubw           m1, m0
     paddw           m0, m2, m3
@@ -14092,6 +14041,89 @@
     punpckhwd       m1, m7
     paddd           m6, m1
 %endmacro
+%macro PROCESS_SATD_32x4_HBD_AVX512 0        ; function to compute satd cost for 32 columns, 4 rows
+    ; rows 0-3
+    movu            m0, [r0]
+    movu            m4, [r2]
+    psubw           m0, m4
+    movu            m1, [r0 + r1]
+    movu            m5, [r2 + r3]
+    psubw           m1, m5
+    movu            m2, [r0 + r1 * 2]
+    movu            m4, [r2 + r3 * 2]
+    psubw           m2, m4
+    movu            m3, [r0 + r4]
+    movu            m5, [r2 + r5]
+    psubw           m3, m5
+    paddw           m4, m0, m1
+    psubw           m1, m0
+    paddw           m0, m2, m3
+    psubw           m3, m2
+    punpckhwd       m2, m4, m1
+    punpcklwd       m4, m1
+    punpckhwd       m1, m0, m3
+    punpcklwd       m0, m3
+    paddw           m3, m4, m0
+    psubw           m0, m4
+    paddw           m4, m2, m1
+    psubw           m1, m2
+    punpckhdq       m2, m3, m0
+    punpckldq       m3, m0
+    paddw           m0, m3, m2
+    psubw           m2, m3
+    punpckhdq       m3, m4, m1
+    punpckldq       m4, m1
+    paddw           m1, m4, m3
+    psubw           m3, m4
+    punpckhqdq      m4, m0, m1
+    punpcklqdq      m0, m1
+    pabsw           m0, m0
+    pabsw           m4, m4
+    pmaxsw          m0, m0, m4
+    punpckhqdq      m1, m2, m3
+    punpcklqdq      m2, m3
+    pabsw           m2, m2
+    pabsw           m1, m1
+    pmaxsw          m2, m1
+    pxor            m7, m7
+    mova            m1, m0
+    punpcklwd       m1, m7
+    paddd           m6, m1
+    mova            m1, m0
+    punpckhwd       m1, m7
+    paddd           m6, m1
+    pxor            m7, m7
+    mova            m1, m2
+    punpcklwd       m1, m7
+    paddd           m6, m1
+    mova            m1, m2
+    punpckhwd       m1, m7
+    paddd           m6, m1
+%endmacro
+
+%macro SATD_16xN_HBD_AVX512 1
+INIT_ZMM avx512
+cglobal pixel_satd_16x%1, 4,8,8
+    add             r1d, r1d
+    add             r3d, r3d
+    lea             r4, [3 * r1]
+    lea             r5, [3 * r3]
+    pxor            m6, m6
+
+%rep %1/8 - 1
+    PROCESS_SATD_16x8_HBD_AVX512
+    lea             r0, [r6 + 4 * r1]
+    lea             r2, [r7 + 4 * r3]
+%endrep
+    PROCESS_SATD_16x8_HBD_AVX512
+    SATD_HBD_AVX512_END
+    RET
+%endmacro
+
+SATD_16xN_HBD_AVX512 8
+SATD_16xN_HBD_AVX512 16
+SATD_16xN_HBD_AVX512 32
+SATD_16xN_HBD_AVX512 64
 
 %macro SATD_32xN_HBD_AVX512 1
 INIT_ZMM avx512
@@ -14103,10 +14135,12 @@
     pxor            m6, m6
     mov             r6, r0
     mov             r7, r2
-
-%rep %1/8
-    PROCESS_SATD_32x8_HBD_AVX512
+%rep %1/4 - 1
+    PROCESS_SATD_32x4_HBD_AVX512
+    lea             r0, [r0 + 4 * r1]
+    lea             r2, [r2 + 4 * r3]
 %endrep
+    PROCESS_SATD_32x4_HBD_AVX512
     SATD_HBD_AVX512_END
     RET
 %endmacro
@@ -14127,15 +14161,20 @@
     pxor            m6, m6
     mov             r6, r0
     mov             r7, r2
-
-%rep %1/8
-    PROCESS_SATD_32x8_HBD_AVX512
+%rep %1/4 - 1
+    PROCESS_SATD_32x4_HBD_AVX512
+    lea             r0, [r0 + 4 * r1]
+    lea             r2, [r2 + 4 * r3]
 %endrep
+    PROCESS_SATD_32x4_HBD_AVX512
     lea             r0, [r6 + mmsize]
     lea             r2, [r7 + mmsize]
-%rep %1/8
-    PROCESS_SATD_32x8_HBD_AVX512
+%rep %1/4 - 1
+    PROCESS_SATD_32x4_HBD_AVX512
+    lea             r0, [r0 + 4 * r1]
+    lea             r2, [r2 + 4 * r3]
 %endrep
+    PROCESS_SATD_32x4_HBD_AVX512
     SATD_HBD_AVX512_END
     RET
 %endmacro

_______________________________________________
x265-devel mailing list
x265-devel@videolan.org
https://mailman.videolan.org/listinfo/x265-devel