# HG changeset patch # User Gopi Satykrishna Akisetty <gopi.satykris...@multicorewareinc.com> # Date 1511154277 -19800 # Mon Nov 20 10:34:37 2017 +0530 # Node ID 664d45353792c5014a714a5ddc8d618b01391deb # Parent 3369cc99e3e0e23f0711dda22196fda4ca9b4913 [x265-avx512]x86: AVX512 idct32x32
AVX2 Performance : 6.43x AVX512 Performance : 6.97x diff -r 3369cc99e3e0 -r 664d45353792 source/common/x86/asm-primitives.cpp --- a/source/common/x86/asm-primitives.cpp Thu Apr 05 18:18:47 2018 -0700 +++ b/source/common/x86/asm-primitives.cpp Mon Nov 20 10:34:37 2017 +0530 @@ -2844,6 +2844,7 @@ p.cu[BLOCK_8x8].dct = PFX(dct8_avx512); p.cu[BLOCK_8x8].idct = PFX(idct8_avx512); p.cu[BLOCK_16x16].idct = PFX(idct16_avx512); + p.cu[BLOCK_32x32].idct = PFX(idct32_avx512); p.chroma[X265_CSP_I420].pu[CHROMA_420_32x32].filter_hps = PFX(interp_4tap_horiz_ps_32x32_avx512); @@ -4907,6 +4908,7 @@ p.cu[BLOCK_8x8].dct = PFX(dct8_avx512); p.cu[BLOCK_8x8].idct = PFX(idct8_avx512); p.cu[BLOCK_16x16].idct = PFX(idct16_avx512); + p.cu[BLOCK_32x32].idct = PFX(idct32_avx512); } #endif diff -r 3369cc99e3e0 -r 664d45353792 source/common/x86/dct8.asm --- a/source/common/x86/dct8.asm Thu Apr 05 18:18:47 2018 -0700 +++ b/source/common/x86/dct8.asm Mon Nov 20 10:34:37 2017 +0530 @@ -293,6 +293,71 @@ dw 64, -87, 75, -57, 36, -9, -18, 43, -64, 80, -89, 90, -83, 70, -50, 25 dw 64, -90, 89, -87, 83, -80, 75, -70, 64, -57, 50, -43, 36, -25, 18, -9 + +tab_idct32_AVX512_1: dw 90 ,90 ,88 ,85, 82, 78, 73, 67, 90 ,90 ,88 ,85, 82, 78, 73, 67, 90, 82, 67, 46, 22, -4, -31, -54, 90, 82, 67, 46, 22, -4, -31, -54 + dw 61, 54, 46, 38, 31, 22, 13, 4, 61, 54, 46, 38, 31, 22, 13, 4, -73, -85, -90, -88, -78, -61, -38, -13, -73, -85, -90, -88, -78, -61, -38, -13 + dw 88, 67, 31, -13, -54, -82, -90, -78, 88, 67, 31, -13, -54, -82, -90, -78, 85, 46, -13, -67, -90, -73, -22, 38, 85, 46, -13, -67, -90, -73, -22, 38 + dw -46, -4, 38, 73, 90, 85, 61, 22, -46, -4, 38, 73, 90, 85, 61, 22, 82, 88, 54, -4, -61, -90, -78, -31, 82, 88, 54, -4, -61, -90, -78, -31 + dw 82, 22, -54, -90, -61, 13, 78, 85, 82, 22, -54, -90, -61, 13, 78, 85, 78, -4, -82, -73, 13, 85, 67, -22, 78, -4, -82, -73, 13, 85, 67, -22 + dw 31, -46, -90, -67, 4, 73, 88, 38, 31, -46, -90, -67, 4, 73, 88, 38, -88, -61, 31, 90, 54, -38, -90, -46, -88, -61, 31, 
90, 54, -38, -90, -46 + dw 73, -31, -90, -22, 78, 67, -38, -90, 73, -31, -90, -22, 78, 67, -38, -90, 67, -54, -78, 38, 85, -22, -90, 4, 67, -54, -78, 38, 85, -22, -90, 4 + dw -13, 82, 61, -46, -88, -4, 85, 54, -13, 82, 61, -46, -88, -4, 85, 54, 90, 13, -88, -31, 82, 46, -73, -61, 90, 13, -88, -31, 82, 46, -73, -61 + +tab_idct32_AVX512_5: dw 4, -13, 22, -31, 38, -46, 54, -61, 4, -13, 22, -31, 38, -46, 54, -61, 13, -38, 61, -78, 88, -90, 85, -73, 13, -38, 61, -78, 88, -90, 85, -73 + dw 67, -73, 78, -82, 85, -88, 90, -90, 67, -73, 78, -82, 85, -88, 90, -90, 54, -31, 4, 22, -46, 67, -82, 90, 54, -31, 4, 22, -46, 67, -82, 90 + dw 22, -61, 85, -90, 73, -38, -4, 46, 22, -61, 85, -90, 73, -38, -4, 46, 31, -78, 90, -61, 4, 54, -88, 82, 31, -78, 90, -61, 4, 54, -88, 82 + dw -78, 90, -82, 54, -13, -31, 67, -88, -78, 90, -82, 54, -13, -31, 67, -88, -38, -22, 73, -90, 67, -13, -46, 85, -38, -22, 73, -90, 67, -13, -46, 85 + dw 38, -88, 73, -4, -67, 90, -46, -31, 38, -88, 73, -4, -67, 90, -46, -31, 46, -90, 38, 54, -90, 31, 61, -88, 46, -90, 38, 54, -90, 31, 61, -88 + dw 85, -78, 13, 61, -90, 54, 22, -82, 85, -78, 13, 61, -90, 54, 22, -82, 22, 67, -85, 13, 73, -82, 4, 78, 22, 67, -85, 13, 73, -82, 4, 78 + dw 54, -85, -4, 88, -46, -61, 82, 13, 54, -85, -4, 88, -46, -61, 82, 13, 61, -73, -46, 82, 31, -88, -13, 90, 61, -73, -46, 82, 31, -88, -13, 90 + dw -90, 38, 67, -78, -22, 90, -31, -73, -90, 38, 67, -78, -22, 90, -31, -73, -4, -90, 22, 85, -38, -78, 54, 67, -4, -90, 22, 85, -38, -78, 54, 67 + + +tab_idct32_AVX512_2: dw 64, 89, 83, 75, 64, 50, 36, 18, 64, 89, 83, 75, 64, 50, 36, 18, 64, 75, 36, -18, -64, -89, -83, -50, 64, 75, 36, -18, -64, -89, -83, -50 + dw 64, 50, -36, -89, -64, 18, 83, 75, 64, 50, -36, -89, -64, 18, 83, 75, 64, 18, -83, -50, 64, 75, -36, -89, 64, 18, -83, -50, 64, 75, -36, -89 + dw 64, -18, -83, 50, 64, -75, -36, 89, 64, -18, -83, 50, 64, -75, -36, 89, 64, -50, -36, 89, -64, -18, 83, -75, 64, -50, -36, 89, -64, -18, 83, -75 + dw 64, -75, 36, 18, -64, 89, -83, 
50, 64, -75, 36, 18, -64, 89, -83, 50, 64, -89, 83, -75, 64, -50, 36, -18, 64, -89, 83, -75, 64, -50, 36, -18 + +tab_idct32_AVX512_3: dw 90, 87, 80, 70, 57, 43, 25, 9, 90, 87, 80, 70, 57, 43, 25, 9, 87, 57, 9, -43, -80, -90, -70, -25, 87, 57, 9, -43, -80, -90, -70, -25 + dw 80, 9, -70, -87, -25, 57, 90, 43, 80, 9, -70, -87, -25, 57, 90, 43, 70, -43, -87, 9, 90, 25, -80, -57, 70, -43, -87, 9, 90, 25, -80, -57 + dw 57, -80, -25, 90, -9, -87, 43, 70, 57, -80, -25, 90, -9, -87, 43, 70, 43, -90, 57, 25, -87, 70, 9, -80, 43, -90, 57, 25, -87, 70, 9, -80 + dw 25, -70, 90, -80, 43, 9, -57, 87, 25, -70, 90, -80, 43, 9, -57, 87, 9, -25, 43, -57, 70, -80, 87, -90, 9, -25, 43, -57, 70, -80, 87, -90 + +tab_idct32_AVX512_4: dw 90 ,90 ,88 ,85, 82, 78, 73, 67, 61, 54, 46, 38, 31, 22, 13, 4, 90 ,90 ,88 ,85, 82, 78, 73, 67, 61, 54, 46, 38, 31, 22, 13, 4 + dw 90, 82, 67, 46, 22, -4, -31, -54, -73, -85, -90, -88, -78, -61, -38, -13, 90, 82, 67, 46, 22, -4, -31, -54, -73, -85, -90, -88, -78, -61, -38, -13 + dw 88, 67, 31, -13, -54, -82, -90, -78, -46, -4, 38, 73, 90, 85, 61, 22, 88, 67, 31, -13, -54, -82, -90, -78, -46, -4, 38, 73, 90, 85, 61, 22 + dw 85, 46, -13, -67, -90, -73, -22, 38, 82, 88, 54, -4, -61, -90, -78, -31, 85, 46, -13, -67, -90, -73, -22, 38, 82, 88, 54, -4, -61, -90, -78, -31 + dw 82, 22, -54, -90, -61, 13, 78, 85, 31, -46, -90, -67, 4, 73, 88, 38, 82, 22, -54, -90, -61, 13, 78, 85, 31, -46, -90, -67, 4, 73, 88, 38 + dw 78, -4, -82, -73, 13, 85, 67, -22, -88, -61, 31, 90, 54, -38, -90, -46, 78, -4, -82, -73, 13, 85, 67, -22, -88, -61, 31, 90, 54, -38, -90, -46 + dw 73, -31, -90, -22, 78, 67, -38, -90, -13, 82, 61, -46, -88, -4, 85, 54, 73, -31, -90, -22, 78, 67, -38, -90, -13, 82, 61, -46, -88, -4, 85, 54 + dw 67, -54, -78, 38, 85, -22, -90, 4, 90, 13, -88, -31, 82, 46, -73, -61, 67, -54, -78, 38, 85, -22, -90, 4, 90, 13, -88, -31, 82, 46, -73, -61 + dw 61, -73, -46, 82, 31, -88, -13, 90, -4, -90, 22, 85, -38, -78, 54, 67, 61, -73, -46, 82, 31, -88, -13, 90, -4, -90, 
22, 85, -38, -78, 54, 67 + dw 54, -85, -4, 88, -46, -61, 82, 13, -90, 38, 67, -78, -22, 90, -31, -73, 54, -85, -4, 88, -46, -61, 82, 13, -90, 38, 67, -78, -22, 90, -31, -73 + dw 46, -90, 38, 54, -90, 31, 61, -88, 22, 67, -85, 13, 73, -82, 4, 78, 46, -90, 38, 54, -90, 31, 61, -88, 22, 67, -85, 13, 73, -82, 4, 78 + dw 38, -88, 73, -4, -67, 90, -46, -31, 85, -78, 13, 61, -90, 54, 22, -82, 38, -88, 73, -4, -67, 90, -46, -31, 85, -78, 13, 61, -90, 54, 22, -82 + dw 31, -78, 90, -61, 4, 54, -88, 82, -38, -22, 73, -90, 67, -13, -46, 85, 31, -78, 90, -61, 4, 54, -88, 82, -38, -22, 73, -90, 67, -13, -46, 85 + dw 22, -61, 85, -90, 73, -38, -4, 46, -78, 90, -82, 54, -13, -31, 67, -88, 22, -61, 85, -90, 73, -38, -4, 46, -78, 90, -82, 54, -13, -31, 67, -88 + dw 13, -38, 61, -78, 88, -90, 85, -73, 54, -31, 4, 22, -46, 67, -82, 90, 13, -38, 61, -78, 88, -90, 85, -73, 54, -31, 4, 22, -46, 67, -82, 90 + dw 4, -13, 22, -31, 38, -46, 54, -61, 67, -73, 78, -82, 85, -88, 90, -90, 4, -13, 22, -31, 38, -46, 54, -61, 67, -73, 78, -82, 85, -88, 90, -90 + +tab_idct32_AVX512_6: dw 64, 90, 89, 87, 83, 80, 75, 70, 64, 57, 50, 43, 36, 25, 18, 9, 64, 90, 89, 87, 83, 80, 75, 70, 64, 57, 50, 43, 36, 25, 18, 9 + dw 64, 87, 75, 57, 36, 9, -18, -43, -64, -80, -89, -90, -83, -70, -50, -25, 64, 87, 75, 57, 36, 9, -18, -43, -64, -80, -89, -90, -83, -70, -50, -25 + dw 64, 80, 50, 9, -36, -70, -89, -87, -64, -25, 18, 57, 83, 90, 75, 43, 64, 80, 50, 9, -36, -70, -89, -87, -64, -25, 18, 57, 83, 90, 75, 43 + dw 64, 70, 18, -43, -83, -87, -50, 9, 64, 90, 75, 25, -36, -80, -89, -57, 64, 70, 18, -43, -83, -87, -50, 9, 64, 90, 75, 25, -36, -80, -89, -57 + dw 64, 57, -18, -80, -83, -25, 50, 90, 64, -9, -75, -87, -36, 43, 89, 70, 64, 57, -18, -80, -83, -25, 50, 90, 64, -9, -75, -87, -36, 43, 89, 70 + dw 64, 43, -50, -90, -36, 57, 89, 25, -64, -87, -18, 70, 83, 9, -75, -80, 64, 43, -50, -90, -36, 57, 89, 25, -64, -87, -18, 70, 83, 9, -75, -80 + dw 64, 25, -75, -70, 36, 90, 18, -80, -64, 43, 89, 9, -83, -57, 50, 87, 
64, 25, -75, -70, 36, 90, 18, -80, -64, 43, 89, 9, -83, -57, 50, 87 + dw 64, 9, -89, -25, 83, 43, -75, -57, 64, 70, -50, -80, 36, 87, -18, -90, 64, 9, -89, -25, 83, 43, -75, -57, 64, 70, -50, -80, 36, 87, -18, -90 + dw 64, -9, -89, 25, 83, -43, -75, 57, 64, -70, -50, 80, 36, -87, -18, 90, 64, -9, -89, 25, 83, -43, -75, 57, 64, -70, -50, 80, 36, -87, -18, 90 + dw 64, -25, -75, 70, 36, -90, 18, 80, -64, -43, 89, -9, -83, 57, 50, -87, 64, -25, -75, 70, 36, -90, 18, 80, -64, -43, 89, -9, -83, 57, 50, -87 + dw 64, -43, -50, 90, -36, -57, 89, -25, -64, 87, -18, -70, 83, -9, -75, 80, 64, -43, -50, 90, -36, -57, 89, -25, -64, 87, -18, -70, 83, -9, -75, 80 + dw 64, -57, -18, 80, -83, 25, 50, -90, 64, 9, -75, 87, -36, -43, 89, -70, 64, -57, -18, 80, -83, 25, 50, -90, 64, 9, -75, 87, -36, -43, 89, -70 + dw 64, -70, 18, 43, -83, 87, -50, -9, 64, -90, 75, -25, -36, 80, -89, 57, 64, -70, 18, 43, -83, 87, -50, -9, 64, -90, 75, -25, -36, 80, -89, 57 + dw 64, -80, 50, -9, -36, 70, -89, 87, -64, 25, 18, -57, 83, -90, 75, -43, 64, -80, 50, -9, -36, 70, -89, 87, -64, 25, 18, -57, 83, -90, 75, -43 + dw 64, -87, 75, -57, 36, -9, -18, 43, -64, 80, -89, 90, -83, 70, -50, 25, 64, -87, 75, -57, 36, -9, -18, 43, -64, 80, -89, 90, -83, 70, -50, 25 + dw 64, -90, 89, -87, 83, -80, 75, -70, 64, -57, 50, -43, 36, -25, 18, -9, 64, -90, 89, -87, 83, -80, 75, -70, 64, -57, 50, -43, 36, -25, 18, -9 + + avx2_dct4: dw 64, 64, 64, 64, 64, 64, 64, 64, 64, -64, 64, -64, 64, -64, 64, -64 dw 83, 36, 83, 36, 83, 36, 83, 36, 36, -83, 36, -83, 36, -83, 36, -83 @@ -4649,6 +4714,612 @@ jnz .pass2 RET + +%macro IDCT32_AVX512_PASS1 5 + pmaddwd m9, m8, m%4 + pmaddwd m10, m7, m%5 + + vpsrldq m0, m9, 4 + paddd m9, m0 + vpslldq m5, m10, 4 + paddd m10, m5 + vmovdqu32 m9 {k1}, m10 + + pmaddwd m10, m4, m%4 + pmaddwd m11, m1, m%5 + + vpsrldq m0, m10, 4 + paddd m10, m0 + vpslldq m5, m11, 4 + paddd m11, m5 + vmovdqu32 m10 {k1}, m11 + + vpsrldq m0, m9, 8 + paddd m9, m0 + vpslldq m5, m10, 8 + paddd m10, m5 + vmovdqu32 m9 
{k2}, m10 + + movu m6, [tab_idct32_AVX512_5 + %1 * 64] + movu m5, [tab_idct32_AVX512_5 + %1 * 64 + 64] + + pmaddwd m10, m8, m6 + pmaddwd m11, m7, m5 + + vpsrldq m0, m10, 4 + paddd m10, m0 + vpslldq m5, m11, 4 + paddd m11, m5 + vmovdqu32 m10 {k1}, m11 + + pmaddwd m11, m4, m6 + pmaddwd m12, m1, [tab_idct32_AVX512_5 + %1 * 64 + 64] + + vpsrldq m0, m11, 4 + paddd m11, m0 + vpslldq m5, m12, 4 + paddd m12, m5 + vmovdqu32 m11 {k1}, m12 + + vpsrldq m0, m10, 8 + paddd m10, m0 + vpslldq m5, m11, 8 + paddd m11, m5 + vmovdqu32 m10 {k2}, m11 + + pshufd m0, m9, q2301 + pshufd m5, m10, q2301 + paddd m9, m0 + paddd m10, m5 + punpckhdq m0, m9, m10 + punpckldq m5, m9, m10 + punpckhdq m9, m5, m0 + + pmaddwd m10, m3, m%2 + pmaddwd m11, m14, m%2 + + vpsrldq m0, m10, 4 + paddd m10, m0 + vpslldq m5, m11, 4 + paddd m11, m5 + vmovdqu32 m10 {k1}, m11 + + vpsrldq m0, m10, 8 + paddd m10, m0 + + pmaddwd m11, m2, m%3 + pmaddwd m12, m13, m%3 + + vpsrldq m0, m11, 4 + paddd m11, m0 + vpslldq m5, m12, 4 + paddd m12, m5 + vmovdqu32 m11 {k1}, m12 + + vpsrldq m0, m11, 8 + paddd m11, m0 + + paddd m12, m10, m11 + psubd m10, m11 + + punpcklqdq m12, m10 + paddd m10, m9, m12 + paddd m10, m15 + psrad m10, IDCT_SHIFT1 + + psubd m12, m9 + paddd m12, m15 + psrad m12, IDCT_SHIFT1 + + packssdw m10, m12 + vextracti128 xm12, m10, 1 + vextracti64x4 ym5, m10, 1 + vextracti128 xm0, ym5, 1 + + movd [r3 + %1 * 64], xm10 + movd [r3 + 32 + %1 * 64], xm12 + pextrd [r4 - %1 * 64], xm10, 1 + pextrd [r4+ 32 - %1 * 64], xm12, 1 + pextrd [r3 + 16 * 64 + %1 *64], xm10, 3 + pextrd [r3 + 16 * 64 + 32 + %1 * 64], xm12, 3 + pextrd [r4 + 16 * 64 - %1 * 64], xm10, 2 + pextrd [r4 + 16 * 64 + 32 - %1 * 64], xm12, 2 + + movd [r3 + (%1 + 1) * 64], xm5 + movd [r3 + 32 + (%1 + 1) * 64], xm0 + pextrd [r4 - (%1 + 1) * 64], xm5, 1 + pextrd [r4+ 32 - (%1 + 1) * 64], xm0, 1 + pextrd [r3 + 16 * 64 + (%1 + 1) * 64], xm5, 3 + pextrd [r3 + 16 * 64 + 32 + (%1 + 1) * 64], xm0, 3 + pextrd [r4 + 16 * 64 - (%1 + 1) * 64], xm5, 2 + pextrd [r4 + 16 * 64 + 
32 - (%1 + 1) * 64], xm0, 2 +%endmacro + +%macro IDCT32_AVX512_PASS2 0 + pmaddwd m2, m0, m7 + pmaddwd m3, m0, m8 + + vpsrldq m24, m2, 4 + paddd m2, m24 + vpslldq m25, m3, 4 + paddd m3, m25 + vmovdqu32 m2 {k1}, m3 + + pmaddwd m3, m0, m9 + pmaddwd m4, m0, m10 + + vpsrldq m24, m3, 4 + paddd m3, m24 + vpslldq m25, m4, 4 + paddd m4, m25 + vmovdqu32 m3 {k1}, m4 + + vpsrldq m24, m2, 8 + paddd m2, m24 + vpslldq m25, m3, 8 + paddd m3, m25 + vmovdqu32 m2 {k2}, m3 + + pmaddwd m3, m0, m11 + pmaddwd m4, m0, m12 + + vpsrldq m24, m3, 4 + paddd m3, m24 + vpslldq m25, m4, 4 + paddd m4, m25 + vmovdqu32 m3 {k1}, m4 + + pmaddwd m4, m0, m13 + pmaddwd m5, m0, m14 + + vpsrldq m24, m4, 4 + paddd m4, m24 + vpslldq m25, m5, 4 + paddd m5, m25 + vmovdqu32 m4 {k1}, m5 + + vpsrldq m24, m3, 8 + paddd m3, m24 + vpslldq m25, m4, 8 + paddd m4, m25 + vmovdqu32 m3 {k2}, m4 + + movu m24, [idct16_AVX512_shuff3] + movu m25, [idct16_AVX512_shuff2] + vpermi2q m24, m2, m3 + vpermi2q m25, m2, m3 + paddd m2, m25, m24 + + pmaddwd m3, m0, m16 + pmaddwd m4, m0, m17 + + vpsrldq m24, m3, 4 + paddd m3, m24 + vpslldq m25, m4, 4 + paddd m4, m25 + vmovdqu32 m3 {k1}, m4 + + pmaddwd m4, m0, m18 + pmaddwd m5, m0, m19 + + vpsrldq m24, m4, 4 + paddd m4, m24 + vpslldq m25, m5, 4 + paddd m5, m25 + vmovdqu32 m4 {k1}, m5 + + vpsrldq m24, m3, 8 + paddd m3, m24 + vpslldq m25, m4, 8 + paddd m4, m25 + vmovdqu32 m3 {k2}, m4 + + pmaddwd m4, m0, m20 + pmaddwd m5, m0, m21 + + vpsrldq m24, m4, 4 + paddd m4, m24 + vpslldq m25, m5, 4 + paddd m5, m25 + vmovdqu32 m4 {k1}, m5 + + pmaddwd m5, m0, m22 + pmaddwd m0, m23 + + vpsrldq m24, m5, 4 + paddd m5, m24 + vpslldq m25, m0, 4 + paddd m0, m25 + vmovdqu32 m5 {k1}, m0 + + vpsrldq m24, m4, 8 + paddd m4, m24 + vpslldq m25, m5, 8 + paddd m5, m25 + vmovdqu32 m4 {k2}, m5 + + movu m24, [idct16_AVX512_shuff3] + movu m25, [idct16_AVX512_shuff2] + vpermi2q m24, m3, m4 + vpermi2q m25, m3, m4 + paddd m3, m25, m24 + + pmaddwd m4, m1, m26 + pmaddwd m0, m1, m27 + + vpsrldq m24, m4, 4 + paddd m4, m24 + 
vpslldq m25, m0, 4 + paddd m0, m25 + vmovdqu32 m4 {k1}, m0 + + pmaddwd m5, m1, m28 + pmaddwd m0, m1, m29 + + vpsrldq m24, m5, 4 + paddd m5, m24 + vpslldq m25, m0, 4 + paddd m0, m25 + vmovdqu32 m5 {k1}, m0 + + + vpsrldq m24, m4, 8 + paddd m4, m24 + vpslldq m25, m5, 8 + paddd m5, m25 + vmovdqu32 m4 {k2}, m5 + + pmaddwd m5, m1, m30 + pmaddwd m0, m1, m31 + + vpsrldq m24, m5, 4 + paddd m5, m24 + vpslldq m25, m0, 4 + paddd m0, m25 + vmovdqu32 m5 {k1}, m0 + + pmaddwd m6, m1, [tab_idct32_AVX512_4 + 6 * mmsize] + pmaddwd m0, m1, [tab_idct32_AVX512_4 + 7 * mmsize] + + vpsrldq m24, m6, 4 + paddd m6, m24 + vpslldq m25, m0, 4 + paddd m0, m25 + vmovdqu32 m6 {k1}, m0 + + vpsrldq m24, m5, 8 + paddd m5, m24 + vpslldq m25, m6, 8 + paddd m6, m25 + vmovdqu32 m5 {k2}, m6 + + movu m24, [idct16_AVX512_shuff3] + movu m25, [idct16_AVX512_shuff2] + vpermi2q m24, m4, m5 + vpermi2q m25, m4, m5 + paddd m4, m25, m24 + + pmaddwd m5, m1, [tab_idct32_AVX512_4 + 8 * mmsize] + pmaddwd m0, m1, [tab_idct32_AVX512_4 + 9 * mmsize] + + vpsrldq m24, m5, 4 + paddd m5, m24 + vpslldq m25, m0, 4 + paddd m0, m25 + vmovdqu32 m5 {k1}, m0 + + pmaddwd m6, m1, [tab_idct32_AVX512_4 + 10 * mmsize] + pmaddwd m0, m1, [tab_idct32_AVX512_4 + 11 * mmsize] + + vpsrldq m24, m6, 4 + paddd m6, m24 + vpslldq m25, m0, 4 + paddd m0, m25 + vmovdqu32 m6 {k1}, m0 + + vpsrldq m24, m5, 8 + paddd m5, m24 + vpslldq m25, m6, 8 + paddd m6, m25 + vmovdqu32 m5 {k2}, m6 + + pmaddwd m6, m1, [tab_idct32_AVX512_4 + 12 * mmsize] + pmaddwd m0, m1, [tab_idct32_AVX512_4 + 13 * mmsize] + + vpsrldq m24, m6, 4 + paddd m6, m24 + vpslldq m25, m0, 4 + paddd m0, m25 + vmovdqu32 m6 {k1}, m0 + + pmaddwd m0, m1, [tab_idct32_AVX512_4 + 14 * mmsize] + pmaddwd m1, [tab_idct32_AVX512_4 + 15 * mmsize] + + vpsrldq m24, m0, 4 + paddd m0, m24 + vpslldq m25, m1, 4 + paddd m1, m25 + vmovdqu32 m0 {k1}, m1 + + vpsrldq m24, m6, 8 + paddd m6, m24 + vpslldq m25, m0, 8 + paddd m0, m25 + vmovdqu32 m6 {k2}, m0 + + movu m24, [idct16_AVX512_shuff3] + movu m25, 
[idct16_AVX512_shuff2] + vpermi2q m24, m5, m6 + vpermi2q m25, m5, m6 + paddd m5, m25, m24 + + paddd m6, m2, m4 + paddd m6, m15 + psrad m6, IDCT_SHIFT2 + + psubd m2, m4 + paddd m2, m15 + psrad m2, IDCT_SHIFT2 + + paddd m4, m3, m5 + paddd m4, m15 + psrad m4, IDCT_SHIFT2 + + psubd m3, m5 + paddd m3, m15 + psrad m3, IDCT_SHIFT2 + + packssdw m6, m4 + packssdw m2, m3 + + vpermq m6, m6, 0xD8 + vpermq m2, m2, 0x8D + pshufb m2, [idct16_AVX512_shuff6] +%endmacro + +;------------------------------------------------------------------- +; void idct32(const int16_t* src, int16_t* dst, intptr_t dstStride) +;------------------------------------------------------------------- + +INIT_ZMM avx512 +cglobal idct32, 3, 8, 32, 0-32*64 + +%define IDCT_SHIFT1 7 + + vbroadcasti128 m15, [pd_64] + + mov r3, rsp + lea r4, [r3 + 15 * 64] + mov r5d, 8 + mov r7d, 0xAAAA + kmovd k1, r7d + mov r7d, 0xCCCC + kmovd k2, r7d + + + movu m16, [tab_idct32_AVX512_2 + 0 * 64] + movu m17, [tab_idct32_AVX512_2 + 1 * 64] + movu m18, [tab_idct32_AVX512_2 + 2 * 64] + movu m19, [tab_idct32_AVX512_2 + 3 * 64] + + movu m20, [tab_idct32_AVX512_3 + 0 * 64] + movu m21, [tab_idct32_AVX512_3 + 1 * 64] + movu m22, [tab_idct32_AVX512_3 + 2 * 64] + movu m23, [tab_idct32_AVX512_3 + 3 * 64] + + movu m24, [tab_idct32_AVX512_1 + 0 * 64] + movu m25, [tab_idct32_AVX512_1 + 1 * 64] + movu m26, [tab_idct32_AVX512_1 + 2 * 64] + movu m27, [tab_idct32_AVX512_1 + 3 * 64] + movu m28, [tab_idct32_AVX512_1 + 4 * 64] + movu m29, [tab_idct32_AVX512_1 + 5 * 64] + movu m30, [tab_idct32_AVX512_1 + 6 * 64] + movu m31, [tab_idct32_AVX512_1 + 7 * 64] + +.pass1: + movq xm0, [r0 + 2 * 64] + movq xm1, [r0 + 18 * 64] + punpcklqdq xm0, xm0, xm1 + movq xm1, [r0 + 0 * 64] + movq xm2, [r0 + 16 * 64] + punpcklqdq xm1, xm1, xm2 + vinserti128 ym0, ym0, xm1, 1 ;[2 18 0 16] + + movq xm1, [r0 + 1 * 64] + movq xm2, [r0 + 9 * 64] + punpcklqdq xm1, xm1, xm2 + movq xm2, [r0 + 17 * 64] + movq xm3, [r0 + 25 * 64] + punpcklqdq xm2, xm2, xm3 + vinserti128 ym1, ym1, 
xm2, 1 ;[1 9 17 25] + + movq xm2, [r0 + 6 * 64] + movq xm3, [r0 + 22 * 64] + punpcklqdq xm2, xm2, xm3 + movq xm3, [r0 + 4 * 64] + movq xm4, [r0 + 20 * 64] + punpcklqdq xm3, xm3, xm4 + vinserti128 ym2, ym2, xm3, 1 ;[6 22 4 20] + + movq xm3, [r0 + 3 * 64] + movq xm4, [r0 + 11 * 64] + punpcklqdq xm3, xm3, xm4 + movq xm4, [r0 + 19 * 64] + movq xm5, [r0 + 27 * 64] + punpcklqdq xm4, xm4, xm5 + vinserti128 ym3, ym3, xm4, 1 ;[3 11 19 27] + + movq xm4, [r0 + 10 * 64] + movq xm5, [r0 + 26 * 64] + punpcklqdq xm4, xm4, xm5 + movq xm5, [r0 + 8 * 64] + movq xm6, [r0 + 24 * 64] + punpcklqdq xm5, xm5, xm6 + vinserti128 ym4, ym4, xm5, 1 ;[10 26 8 24] + + movq xm5, [r0 + 5 * 64] + movq xm6, [r0 + 13 * 64] + punpcklqdq xm5, xm5, xm6 + movq xm6, [r0 + 21 * 64] + movq xm7, [r0 + 29 * 64] + punpcklqdq xm6, xm6, xm7 + vinserti128 ym5, ym5, xm6, 1 ;[5 13 21 29] + + movq xm6, [r0 + 14 * 64] + movq xm7, [r0 + 30 * 64] + punpcklqdq xm6, xm6, xm7 + movq xm7, [r0 + 12 * 64] + movq xm8, [r0 + 28 * 64] + punpcklqdq xm7, xm7, xm8 + vinserti128 ym6, ym6, xm7, 1 ;[14 30 12 28] + + movq xm7, [r0 + 7 * 64] + movq xm8, [r0 + 15 * 64] + punpcklqdq xm7, xm7, xm8 + movq xm8, [r0 + 23 * 64] + movq xm9, [r0 + 31 * 64] + punpcklqdq xm8, xm8, xm9 + vinserti128 ym7, ym7, xm8, 1 ;[7 15 23 31] + + punpckhwd ym8, ym0, ym2 ;[18 22 16 20] + punpcklwd ym0, ym2 ;[2 6 0 4] + + punpckhwd ym2, ym1, ym3 ;[9 11 25 27] + punpcklwd ym1, ym3 ;[1 3 17 19] + + punpckhwd ym3, ym4, ym6 ;[26 30 24 28] + punpcklwd ym4, ym6 ;[10 14 8 12] + + punpckhwd ym6, ym5, ym7 ;[13 15 29 31] + punpcklwd ym5, ym7 ;[5 7 21 23] + + punpckhdq ym7, ym0, ym4 ;[22 62 102 142 23 63 103 143 02 42 82 122 03 43 83 123] + punpckldq ym0, ym4 ;[20 60 100 140 21 61 101 141 00 40 80 120 01 41 81 121] + + punpckhdq ym4, ym8, ym3 ;[182 222 262 302 183 223 263 303 162 202 242 282 163 203 243 283] + punpckldq ym8, ym3 ;[180 220 260 300 181 221 261 301 160 200 240 280 161 201 241 281] + + punpckhdq ym3, ym1, ym5 ;[12 32 52 72 13 33 53 73 172 192 212 232 173 193
213 233] + punpckldq ym1, ym5 ;[10 30 50 70 11 31 51 71 170 190 210 230 171 191 211 231] + + punpckhdq ym5, ym2, ym6 ;[92 112 132 152 93 113 133 153 252 272 292 312 253 273 293 313] + punpckldq ym2, ym6 ;[90 110 130 150 91 111 131 151 250 270 290 310 251 271 291 311] + + punpckhqdq ym6, ym0, ym8 ;[21 61 101 141 181 221 261 301 01 41 81 121 161 201 241 281] + punpcklqdq ym0, ym8 ;[20 60 100 140 180 220 260 300 00 40 80 120 160 200 240 280] + + punpckhqdq ym8, ym7, ym4 ;[23 63 103 143 183 223 263 303 03 43 83 123 163 203 243 283] + punpcklqdq ym7, ym4 ;[22 62 102 142 182 222 262 302 02 42 82 122 162 202 242 282] + + punpckhqdq ym4, ym1, ym2 ;[11 31 51 71 91 111 131 151 171 191 211 231 251 271 291 311] + punpcklqdq ym1, ym2 ;[10 30 50 70 90 110 130 150 170 190 210 230 250 270 290 310] + + punpckhqdq ym2, ym3, ym5 ;[13 33 53 73 93 113 133 153 173 193 213 233 253 273 293 313] + punpcklqdq ym3, ym5 ;[12 32 52 72 92 112 132 152 172 192 212 232 252 272 292 312] + + vinserti64x4 m7, m7, ym7, 1 + vinserti64x4 m8, m8, ym8, 1 + movu m13, [idct16_AVX512_shuff2] + movu m14, [idct16_AVX512_shuff3] + vpermi2q m13, m7, m8 + vpermi2q m14, m7, m8 + + vinserti64x4 m1, m1, ym1, 1 + vinserti64x4 m4, m4, ym4, 1 + movu m7, [idct16_AVX512_shuff3] + movu m8, [idct16_AVX512_shuff2] + vpermi2q m7, m1, m4 + vpermi2q m8, m1, m4 + + vinserti64x4 m3, m3, ym3, 1 + vinserti64x4 m2, m2, ym2, 1 + movu m1, [idct16_AVX512_shuff3] + movu m4, [idct16_AVX512_shuff2] + vpermi2q m1, m3, m2 + vpermi2q m4, m3, m2 + + vinserti64x4 m0, m0, ym0, 1 + vinserti64x4 m6, m6, ym6, 1 + movu m2, [idct16_AVX512_shuff2] + movu m3, [idct16_AVX512_shuff3] + vpermi2q m2, m0, m6 + vpermi2q m3, m0, m6 + + + IDCT32_AVX512_PASS1 0, 16, 20, 24, 25 + IDCT32_AVX512_PASS1 2, 17, 21, 26, 27 + IDCT32_AVX512_PASS1 4, 18, 22, 28, 29 + IDCT32_AVX512_PASS1 6, 19, 23, 30, 31 + + add r0, 8 + add r3, 4 + add r4, 4 + dec r5d + jnz .pass1 + +%if BIT_DEPTH == 12 + %define IDCT_SHIFT2 8 + vpbroadcastd m15, [pd_128] +%elif BIT_DEPTH == 10 + 
%define IDCT_SHIFT2 10 + vpbroadcastd m15, [pd_512] +%elif BIT_DEPTH == 8 + %define IDCT_SHIFT2 12 + vpbroadcastd m15, [pd_2048] +%else + %error Unsupported BIT_DEPTH! +%endif + + mov r3, rsp + add r2d, r2d + mov r4d, 16 + mov r6d, 0xFFFF0000 + kmovd k3, r6d + + movu m7, [tab_idct32_AVX512_6] + movu m8, [tab_idct32_AVX512_6 + 1 * mmsize] + movu m9, [tab_idct32_AVX512_6 + 2 * mmsize] + movu m10, [tab_idct32_AVX512_6 + 3 * mmsize] + movu m11, [tab_idct32_AVX512_6 + 4 * mmsize] + movu m12, [tab_idct32_AVX512_6 + 5 * mmsize] + movu m13, [tab_idct32_AVX512_6 + 6 * mmsize] + movu m14, [tab_idct32_AVX512_6 + 7 * mmsize] + movu m16, [tab_idct32_AVX512_6 + 8 * mmsize] + movu m17, [tab_idct32_AVX512_6 + 9 * mmsize] + movu m18, [tab_idct32_AVX512_6 + 10 * mmsize] + movu m19, [tab_idct32_AVX512_6 + 11 * mmsize] + movu m20, [tab_idct32_AVX512_6 + 12 * mmsize] + movu m21, [tab_idct32_AVX512_6 + 13 * mmsize] + movu m22, [tab_idct32_AVX512_6 + 14 * mmsize] + movu m23, [tab_idct32_AVX512_6 + 15 * mmsize] + movu m26, [tab_idct32_AVX512_4] + movu m27, [tab_idct32_AVX512_4 + 1 * mmsize] + movu m28, [tab_idct32_AVX512_4 + 2 * mmsize] + movu m29, [tab_idct32_AVX512_4 + 3 * mmsize] + movu m30, [tab_idct32_AVX512_4 + 4 * mmsize] + movu m31, [tab_idct32_AVX512_4 + 5 * mmsize] + +.pass2: + movu ym0, [r3] + movu ym1, [r3 + 32] + vmovdqu16 m0 {k3}, [r3 + 32] + vmovdqu16 m1 {k3}, [r3 + 64] + + IDCT32_AVX512_PASS2 + movu [r1], ym6 + movu [r1 + 32], ym2 + vextracti64x4 ym24, m6, 1 + vextracti64x4 ym25, m2, 1 + add r1, r2 + movu [r1 ], ym24 + movu [r1 + 32], ym25 + + add r1, r2 + add r3, 128 + dec r4d + jnz .pass2 + RET + ;------------------------------------------------------- ; void idct4(const int16_t* src, int16_t* dst, intptr_t dstStride) ;------------------------------------------------------- diff -r 3369cc99e3e0 -r 664d45353792 source/common/x86/dct8.h --- a/source/common/x86/dct8.h Thu Apr 05 18:18:47 2018 -0700 +++ b/source/common/x86/dct8.h Mon Nov 20 10:34:37 2017 +0530 @@ -46,5 +46,6 
@@ void PFX(dct8_avx512)(const int16_t* src, int16_t* dst, intptr_t srcStride); void PFX(idct8_avx512)(const int16_t* src, int16_t* dst, intptr_t dstStride); void PFX(idct16_avx512)(const int16_t* src, int16_t* dst, intptr_t dstStride); +void PFX(idct32_avx512)(const int16_t* src, int16_t* dst, intptr_t dstStride); #endif // ifndef X265_DCT8_H _______________________________________________ x265-devel mailing list x265-devel@videolan.org https://mailman.videolan.org/listinfo/x265-devel