# HG changeset patch
# User Vignesh Vijayakumar<vign...@multicorewareinc.com>
# Date 1515575222 -19800
#      Wed Jan 10 14:37:02 2018 +0530
# Node ID 59e596ff83801d7c3e3e01f6d6f64d26b2e8010f
# Parent  a4d60c45fdce6797486f25f5f319615b25bd86f0
x86: AVX512 intra_pred_ang32 mode 9 and 27 for high bit depth

TODO: optimise TRANSPOSE_STORE macro for AVX512 code

AVX2 performance   : 12.63x
AVX512 performance : 16.73x
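For reviewers unfamiliar with these modes: HEVC modes 9 and 27 both use a displacement (intraPredAngle) of +2, so each predicted row (mode 27, vertical family) or column (mode 9, horizontal family) is a two-tap blend of adjacent reference samples. A minimal scalar sketch of the arithmetic the new kernel vectorises, for orientation only (function and parameter names are illustrative, not x265's C primitive):

    #include <stdint.h>

    /* Angle-2 angular prediction on 16-bit (high bit depth) samples.
     * ref[0] is the corner sample; ref[1..2*size] is the projected
     * reference row (mode 27) or column (mode 9). Mode 9 additionally
     * transposes the result, which is what TRANSPOSE_STORE_AVX2 does
     * in the assembly below. */
    static void ang_pred_angle2(uint16_t *dst, intptr_t stride,
                                const uint16_t *ref, int size)
    {
        for (int y = 0; y < size; y++)
        {
            int pos  = (y + 1) * 2;   /* intraPredAngle = 2          */
            int idx  = pos >> 5;      /* whole-sample displacement   */
            int fact = pos & 31;      /* fraction: 2, 4, ..., 30, 0  */
            for (int x = 0; x < size; x++)
                dst[y * stride + x] =
                    (uint16_t)(((32 - fact) * ref[x + idx + 1] +
                                fact * ref[x + idx + 2] + 16) >> 5);
        }
    }

The fraction sequence 2, 4, ..., 30, 0 is what the coefficient loads from [r3 - 14 * 32] through [r3 + 14 * 32] in ang16_mode_9_27 select, and the final fact = 0 row degenerates to a copy of ref + 1, which is why the kernel loads [r2 + 4] directly for its last row.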
diff -r a4d60c45fdce -r 59e596ff8380 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp	Wed Jan 10 10:15:39 2018 +0530
+++ b/source/common/x86/asm-primitives.cpp	Wed Jan 10 14:37:02 2018 +0530
@@ -3097,9 +3097,11 @@
         p.cu[BLOCK_32x32].intra_pred[DC_IDX] = PFX(intra_pred_dc32_avx512);
         p.cu[BLOCK_32x32].intra_pred[2] = PFX(intra_pred_ang32_2_avx512);
         p.cu[BLOCK_32x32].intra_pred[34] = PFX(intra_pred_ang32_2_avx512);
+        p.cu[BLOCK_32x32].intra_pred[9] = PFX(intra_pred_ang32_9_avx512);
         p.cu[BLOCK_32x32].intra_pred[10] = PFX(intra_pred_ang32_10_avx512);
         p.cu[BLOCK_32x32].intra_pred[18] = PFX(intra_pred_ang32_18_avx512);
         p.cu[BLOCK_32x32].intra_pred[26] = PFX(intra_pred_ang32_26_avx512);
+        p.cu[BLOCK_32x32].intra_pred[27] = PFX(intra_pred_ang32_27_avx512);
 
         p.pu[LUMA_64x64].luma_hvpp = interp_8tap_hv_pp_cpu<LUMA_64x64>;
         p.pu[LUMA_64x48].luma_hvpp = interp_8tap_hv_pp_cpu<LUMA_64x48>;
diff -r a4d60c45fdce -r 59e596ff8380 source/common/x86/intrapred16.asm
--- a/source/common/x86/intrapred16.asm	Wed Jan 10 10:15:39 2018 +0530
+++ b/source/common/x86/intrapred16.asm	Wed Jan 10 14:37:02 2018 +0530
@@ -11125,35 +11125,35 @@
 %macro TRANSPOSE_STORE_AVX2 11
     jnz .skip%11
 
-    punpckhwd    m%9, m%1, m%2
-    punpcklwd    m%1, m%2
-    punpckhwd    m%2, m%3, m%4
-    punpcklwd    m%3, m%4
-
-    punpckldq    m%4, m%1, m%3
-    punpckhdq    m%1, m%3
-    punpckldq    m%3, m%9, m%2
-    punpckhdq    m%9, m%2
-
-    punpckhwd    m%10, m%5, m%6
-    punpcklwd    m%5, m%6
-    punpckhwd    m%6, m%7, m%8
-    punpcklwd    m%7, m%8
-
-    punpckldq    m%8, m%5, m%7
-    punpckhdq    m%5, m%7
-    punpckldq    m%7, m%10, m%6
-    punpckhdq    m%10, m%6
-
-    punpcklqdq   m%6, m%4, m%8
-    punpckhqdq   m%2, m%4, m%8
-    punpcklqdq   m%4, m%1, m%5
-    punpckhqdq   m%8, m%1, m%5
-
-    punpcklqdq   m%1, m%3, m%7
-    punpckhqdq   m%5, m%3, m%7
-    punpcklqdq   m%3, m%9, m%10
-    punpckhqdq   m%7, m%9, m%10
+    punpckhwd    ym%9, ym%1, ym%2
+    punpcklwd    ym%1, ym%2
+    punpckhwd    ym%2, ym%3, ym%4
+    punpcklwd    ym%3, ym%4
+
+    punpckldq    ym%4, ym%1, ym%3
+    punpckhdq    ym%1, ym%3
+    punpckldq    ym%3, ym%9, ym%2
+    punpckhdq    ym%9, ym%2
+
+    punpckhwd    ym%10, ym%5, ym%6
+    punpcklwd    ym%5, ym%6
+    punpckhwd    ym%6, ym%7, ym%8
+    punpcklwd    ym%7, ym%8
+
+    punpckldq    ym%8, ym%5, ym%7
+    punpckhdq    ym%5, ym%7
+    punpckldq    ym%7, ym%10, ym%6
+    punpckhdq    ym%10, ym%6
+
+    punpcklqdq   ym%6, ym%4, ym%8
+    punpckhqdq   ym%2, ym%4, ym%8
+    punpcklqdq   ym%4, ym%1, ym%5
+    punpckhqdq   ym%8, ym%1, ym%5
+
+    punpcklqdq   ym%1, ym%3, ym%7
+    punpckhqdq   ym%5, ym%3, ym%7
+    punpcklqdq   ym%3, ym%9, ym%10
+    punpckhqdq   ym%7, ym%9, ym%10
 
     movu         [r0 + r1 * 0 + %11], xm%6
     movu         [r0 + r1 * 1 + %11], xm%2
     movu         [r0 + r1 * 2 + %11], xm%4
@@ -11167,28 +11167,28 @@
     movu         [r5 + r4 * 1 + %11], xm%7
     lea          r5, [r5 + r1 * 4]
 
-    vextracti128 [r5 + r1 * 0 + %11], m%6, 1
-    vextracti128 [r5 + r1 * 1 + %11], m%2, 1
-    vextracti128 [r5 + r1 * 2 + %11], m%4, 1
-    vextracti128 [r5 + r4 * 1 + %11], m%8, 1
+    vextracti128 [r5 + r1 * 0 + %11], ym%6, 1
+    vextracti128 [r5 + r1 * 1 + %11], ym%2, 1
+    vextracti128 [r5 + r1 * 2 + %11], ym%4, 1
+    vextracti128 [r5 + r4 * 1 + %11], ym%8, 1
 
     lea          r5, [r5 + r1 * 4]
-    vextracti128 [r5 + r1 * 0 + %11], m%1, 1
-    vextracti128 [r5 + r1 * 1 + %11], m%5, 1
-    vextracti128 [r5 + r1 * 2 + %11], m%3, 1
-    vextracti128 [r5 + r4 * 1 + %11], m%7, 1
+    vextracti128 [r5 + r1 * 0 + %11], ym%1, 1
+    vextracti128 [r5 + r1 * 1 + %11], ym%5, 1
+    vextracti128 [r5 + r1 * 2 + %11], ym%3, 1
+    vextracti128 [r5 + r4 * 1 + %11], ym%7, 1
     jmp .end%11
 
 .skip%11:
-    movu         [r0 + r1 * 0], m%1
-    movu         [r0 + r1 * 1], m%2
-    movu         [r0 + r1 * 2], m%3
-    movu         [r0 + r4 * 1], m%4
+    movu         [r0 + r1 * 0], ym%1
+    movu         [r0 + r1 * 1], ym%2
+    movu         [r0 + r1 * 2], ym%3
+    movu         [r0 + r4 * 1], ym%4
     lea          r0, [r0 + r1 * 4]
-    movu         [r0 + r1 * 0], m%5
-    movu         [r0 + r1 * 1], m%6
-    movu         [r0 + r1 * 2], m%7
-    movu         [r0 + r4 * 1], m%8
+    movu         [r0 + r1 * 0], ym%5
+    movu         [r0 + r1 * 1], ym%6
+    movu         [r0 + r1 * 2], ym%7
+    movu         [r0 + r4 * 1], ym%8
     lea          r0, [r0 + r1 * 4]
 .end%11:
 %endmacro
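The TRANSPOSE_STORE_AVX2 hunk above only renames m%N to ym%N, so the macro keeps operating on 256-bit views now that the calling kernels use AVX512 register numbering; the TODO in the commit message presumably refers to widening it to full zmm registers later. For readers new to the shuffle pattern itself, it is the classic three-stage unpack transpose (words, then dwords, then qwords). A self-contained C intrinsics sketch of the same idea on one 8x8 block of 16-bit values (illustrative, not the macro itself):

    #include <emmintrin.h>

    /* Transpose an 8x8 block of 16-bit values held in r[0..7].
     * Stage 1 interleaves words of row pairs, stage 2 interleaves
     * dwords, stage 3 interleaves qwords -- mirroring the
     * punpck{l,h}wd / punpck{l,h}dq / punpck{l,h}qdq stages of
     * TRANSPOSE_STORE_AVX2 within each 128-bit lane. */
    static void transpose8x8_epi16(__m128i r[8])
    {
        __m128i a0 = _mm_unpacklo_epi16(r[0], r[1]);
        __m128i a1 = _mm_unpackhi_epi16(r[0], r[1]);
        __m128i a2 = _mm_unpacklo_epi16(r[2], r[3]);
        __m128i a3 = _mm_unpackhi_epi16(r[2], r[3]);
        __m128i a4 = _mm_unpacklo_epi16(r[4], r[5]);
        __m128i a5 = _mm_unpackhi_epi16(r[4], r[5]);
        __m128i a6 = _mm_unpacklo_epi16(r[6], r[7]);
        __m128i a7 = _mm_unpackhi_epi16(r[6], r[7]);

        __m128i b0 = _mm_unpacklo_epi32(a0, a2);
        __m128i b1 = _mm_unpackhi_epi32(a0, a2);
        __m128i b2 = _mm_unpacklo_epi32(a1, a3);
        __m128i b3 = _mm_unpackhi_epi32(a1, a3);
        __m128i b4 = _mm_unpacklo_epi32(a4, a6);
        __m128i b5 = _mm_unpackhi_epi32(a4, a6);
        __m128i b6 = _mm_unpacklo_epi32(a5, a7);
        __m128i b7 = _mm_unpackhi_epi32(a5, a7);

        r[0] = _mm_unpacklo_epi64(b0, b4);
        r[1] = _mm_unpackhi_epi64(b0, b4);
        r[2] = _mm_unpacklo_epi64(b1, b5);
        r[3] = _mm_unpackhi_epi64(b1, b5);
        r[4] = _mm_unpacklo_epi64(b2, b6);
        r[5] = _mm_unpackhi_epi64(b2, b6);
        r[6] = _mm_unpacklo_epi64(b3, b7);
        r[7] = _mm_unpackhi_epi64(b3, b7);
    }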
@@ -18640,6 +18640,145 @@
     movu [r0 + r1 * 2], m0
     movu [r0 + r2], m0
     RET
+
+;; angle 16, modes 9 and 27
+cglobal ang16_mode_9_27
+    test            r6d, r6d
+
+    vbroadcasti32x8 m0, [r2 + 2]                ; [16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1]
+    vbroadcasti32x8 m1, [r2 + 4]                ; [17 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2]
+
+    punpcklwd       m3, m0, m1                  ; [13 12 12 11 11 10 10 9 5 4 4 3 3 2 2 1]
+    punpckhwd       m0, m1                      ; [17 16 16 15 15 14 14 13 9 8 8 7 7 6 6 5]
+
+    vbroadcasti32x8 m2, [r2 + 18]               ; [24 23 22 21 20 19 18 17 16 15 14 13 12 11 10 9]
+    vbroadcasti32x8 m4, [r2 + 20]               ; [25 24 23 22 21 20 19 18 17 16 15 14 13 12 11 10]
+    punpcklwd       m2, m4                      ; [21 20 20 19 19 18 18 17 13 12 12 11 11 10 10 9]
+
+    movu            ym16, [r3 - 14 * 32]        ; [2]
+    vinserti32x8    m16, [r3 - 12 * 32], 1      ; [4]
+    pmaddwd         m4, m3, m16
+    paddd           m4, m15
+    psrld           m4, 5
+    pmaddwd         m5, m0, m16
+    paddd           m5, m15
+    psrld           m5, 5
+    packusdw        m4, m5
+    vextracti32x8   ym5, m4, 1
+    movu            ym16, [r3 - 10 * 32]        ; [6]
+    vinserti32x8    m16, [r3 - 8 * 32], 1       ; [8]
+    pmaddwd         m6, m3, m16
+    paddd           m6, m15
+    psrld           m6, 5
+    pmaddwd         m9, m0, m16
+    paddd           m9, m15
+    psrld           m9, 5
+    packusdw        m6, m9
+    vextracti32x8   ym7, m6, 1
+    movu            ym16, [r3 - 6 * 32]         ; [10]
+    vinserti32x8    m16, [r3 - 4 * 32], 1       ; [12]
+    pmaddwd         m8, m3, m16
+    paddd           m8, m15
+    psrld           m8, 5
+    pmaddwd         m9, m0, m16
+    paddd           m9, m15
+    psrld           m9, 5
+    packusdw        m8, m9
+    vextracti32x8   ym9, m8, 1
+    movu            ym16, [r3 - 2 * 32]         ; [14]
+    vinserti32x8    m16, [r3], 1                ; [16]
+    pmaddwd         m10, m3, m16
+    paddd           m10, m15
+    psrld           m10, 5
+    pmaddwd         m1, m0, m16
+    paddd           m1, m15
+    psrld           m1, 5
+    packusdw        m10, m1
+    vextracti32x8   ym11, m10, 1
+
+    TRANSPOSE_STORE_AVX2 4, 5, 6, 7, 8, 9, 10, 11, 2, 1, 0
+
+    movu            ym16, [r3 + 2 * 32]         ; [18]
+    vinserti32x8    m16, [r3 + 4 * 32], 1       ; [20]
+    pmaddwd         m4, m3, m16
+    paddd           m4, m15
+    psrld           m4, 5
+    pmaddwd         m5, m0, m16
+    paddd           m5, m15
+    psrld           m5, 5
+    packusdw        m4, m5
+    vextracti32x8   ym5, m4, 1
+    movu            ym16, [r3 + 6 * 32]         ; [22]
+    vinserti32x8    m16, [r3 + 8 * 32], 1       ; [24]
+    pmaddwd         m6, m3, m16
+    paddd           m6, m15
+    psrld           m6, 5
+    pmaddwd         m8, m0, m16
+    paddd           m8, m15
+    psrld           m8, 5
+    packusdw        m6, m8
+    vextracti32x8   ym7, m6, 1
+    movu            ym16, [r3 + 10 * 32]        ; [26]
+    vinserti32x8    m16, [r3 + 12 * 32], 1      ; [28]
+    pmaddwd         m8, m3, m16
+    paddd           m8, m15
+    psrld           m8, 5
+    pmaddwd         m9, m0, m16
+    paddd           m9, m15
+    psrld           m9, 5
+    packusdw        m8, m9
+    vextracti32x8   ym9, m8, 1
+    movu            ym16, [r3 + 14 * 32]        ; [30]
+    pmaddwd         ym3, ym16
+    paddd           ym3, ym15
+    psrld           ym3, 5
+    pmaddwd         ym0, ym16
+    paddd           ym0, ym15
+    psrld           ym0, 5
+    packusdw        ym3, ym0
+
+    movu            ym1, [r2 + 4]
+    TRANSPOSE_STORE_AVX2 4, 5, 6, 7, 8, 9, 3, 1, 0, 2, 16
+    ret
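Each pmaddwd/paddd/psrld/packusdw group in the kernel above produces one interpolated row: punpcklwd/punpckhwd pair every reference sample with its right neighbour, the coefficient rows loaded from [r3 +/- k * 32] hold interleaved (32 - fact, fact) words, so pmaddwd yields all the 32-bit weighted sums in one instruction, m15 (pd_16) supplies the rounding term, and the shift by 5 plus packusdw returns to 16-bit samples. A scalar model of one such dword lane (helper name is illustrative):

    #include <stdint.h>

    /* One dword lane of the pmaddwd-based filter: 'lo'/'hi' are a
     * sample and its right neighbour as paired by punpck{l,h}wd,
     * (w0, w1) is the (32 - fact, fact) word pair from the table. */
    static uint16_t filter_lane(uint16_t lo, uint16_t hi,
                                int16_t w0, int16_t w1)
    {
        int32_t acc = (int32_t)lo * w0 + (int32_t)hi * w1; /* pmaddwd      */
        acc += 16;                                         /* paddd m15    */
        return (uint16_t)(acc >> 5);                       /* psrld 5 +    */
    }                                                      /* packusdw     */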
+
+cglobal intra_pred_ang32_9, 3,8,17
+    add             r2, 128
+    xor             r6d, r6d
+    lea             r3, [ang_table_avx2 + 16 * 32]
+    shl             r1d, 1
+    lea             r4, [r1 * 3]
+    lea             r7, [r0 + 8 * r1]
+    vbroadcasti32x8 m15, [pd_16]
+
+    call            ang16_mode_9_27
+    add             r2, 2
+    lea             r0, [r0 + 32]
+    call            ang16_mode_9_27
+    add             r2, 30
+    lea             r0, [r7 + 8 * r1]
+    call            ang16_mode_9_27
+    add             r2, 2
+    lea             r0, [r0 + 32]
+    call            ang16_mode_9_27
+    RET
+
+cglobal intra_pred_ang32_27, 3,7,17
+    xor             r6d, r6d
+    inc             r6d
+    lea             r3, [ang_table_avx2 + 16 * 32]
+    shl             r1d, 1
+    lea             r4, [r1 * 3]
+    lea             r5, [r0 + 32]
+    vbroadcasti32x8 m15, [pd_16]
+
+    call            ang16_mode_9_27
+    add             r2, 2
+    call            ang16_mode_9_27
+    add             r2, 30
+    mov             r0, r5
+    call            ang16_mode_9_27
+    add             r2, 2
+    call            ang16_mode_9_27
+    RET
 ;-------------------------------------------------------------------------------------------------------
 ; avx512 code for intra_pred_ang32 mode 2 to 34 end
 ;-------------------------------------------------------------------------------------------------------