# HG changeset patch
# User Gopi Satykrishna Akisetty <gopi.satykris...@multicorewareinc.com>
# Date 1509948596 -19800
#      Mon Nov 06 11:39:56 2017 +0530
# Node ID 8bbcc1bd3c1381e936695a6eff30a17cc2633b6f
# Parent  df3c576cd32c50b0412ad3d70eeebfe8fb511da1
[x265-avx512]x86: AVX512 idct16x16
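The kernel keeps the usual two-pass IDCT structure; per pass, the rounding it
applies reduces to the scalar arithmetic sketched below (shift and round
constants taken from the %if BIT_DEPTH block in this patch; the helper name is
illustrative, not x265 API):

    // Scalar sketch of the per-pass rounding, assuming the constants from
    // this patch: pass 1 uses IDCT_SHIFT1 = 7 with pd_64, pass 2 uses
    // IDCT_SHIFT2 = 20 - BIT_DEPTH with pd_2048 / pd_512 / pd_128.
    #include <cstdint>

    static int16_t roundShiftSat(int32_t acc, int shift)
    {
        int32_t v = (acc + (1 << (shift - 1))) >> shift; // paddd + psrad
        if (v >  32767) v =  32767;                      // packssdw saturates
        if (v < -32768) v = -32768;
        return (int16_t)v;
    }

    // Pass 1 (rows):    roundShiftSat(acc, 7)
    // Pass 2 (columns): roundShiftSat(acc, 20 - BIT_DEPTH),
    //                   e.g. (acc + 2048) >> 12 for 8-bit builds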
AVX2 Performance   : 11.67x
AVX512 Performance : 12.80x

diff -r df3c576cd32c -r 8bbcc1bd3c13 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp	Mon Nov 13 16:02:40 2017 +0530
+++ b/source/common/x86/asm-primitives.cpp	Mon Nov 06 11:39:56 2017 +0530
@@ -2837,6 +2837,8 @@
         p.cu[BLOCK_8x8].dct = PFX(dct8_avx512);
         p.cu[BLOCK_8x8].idct = PFX(idct8_avx512);
+        p.cu[BLOCK_16x16].idct = PFX(idct16_avx512);
+
         p.chroma[X265_CSP_I420].pu[CHROMA_420_32x32].filter_hps = PFX(interp_4tap_horiz_ps_32x32_avx512);
         p.chroma[X265_CSP_I420].pu[CHROMA_420_32x16].filter_hps = PFX(interp_4tap_horiz_ps_32x16_avx512);
@@ -4835,6 +4837,7 @@
         p.cu[BLOCK_8x8].dct = PFX(dct8_avx512);
         p.cu[BLOCK_8x8].idct = PFX(idct8_avx512);
+        p.cu[BLOCK_16x16].idct = PFX(idct16_avx512);
     }
 #endif
diff -r df3c576cd32c -r 8bbcc1bd3c13 source/common/x86/dct8.asm
--- a/source/common/x86/dct8.asm	Mon Nov 13 16:02:40 2017 +0530
+++ b/source/common/x86/dct8.asm	Mon Nov 06 11:39:56 2017 +0530
@@ -218,6 +218,27 @@
 idct16_shuff1: dd 2, 6, 0, 4, 3, 7, 1, 5
+
+tab_AVX512_idct16_1: dw 90, 87, 80, 70, 57, 43, 25, 9, 90, 87, 80, 70, 57, 43, 25, 9, 80, 9, -70, -87, -25, 57, 90, 43, 80, 9, -70, -87, -25, 57, 90, 43
+                     dw 87, 57, 9, -43, -80, -90, -70, -25, 87, 57, 9, -43, -80, -90, -70, -25, 70, -43, -87, 9, 90, 25, -80, -57, 70, -43, -87, 9, 90, 25, -80, -57
+                     dw 57, -80, -25, 90, -9, -87, 43, 70, 57, -80, -25, 90, -9, -87, 43, 70, 25, -70, 90, -80, 43, 9, -57, 87, 25, -70, 90, -80, 43, 9, -57, 87
+                     dw 43, -90, 57, 25, -87, 70, 9, -80, 43, -90, 57, 25, -87, 70, 9, -80, 9, -25, 43, -57, 70, -80, 87, -90, 9, -25, 43, -57, 70, -80, 87, -90
+
+tab_AVX512_idct16_2: dw 64, 89, 83, 75, 64, 50, 36, 18, 64, 89, 83, 75, 64, 50, 36, 18, 64, 50, -36, -89, -64, 18, 83, 75, 64, 50, -36, -89, -64, 18, 83, 75
+                     dw 64, 75, 36, -18, -64, -89, -83, -50, 64, 75, 36, -18, -64, -89, -83, -50, 64, 18, -83, -50, 64, 75, -36, -89, 64, 18, -83, -50, 64, 75, -36, -89
+                     dw 64, -18, -83, 50, 64, -75, -36, 89, 64, -18, -83, 50, 64, -75, -36, 89, 64, -75, 36, 18, -64, 89, -83, 50, 64, -75, 36, 18, -64, 89, -83, 50
+                     dw 64, -50, -36, 89, -64, -18, 83, -75, 64, -50, -36, 89, -64, -18, 83, -75, 64, -89, 83, -75, 64, -50, 36, -18, 64, -89, 83, -75, 64, -50, 36, -18
+
+idct16_AVX512_shuff: dd 0, 4, 2, 6, 1, 5, 3, 7, 8, 12, 10, 14, 9, 13, 11, 15
+
+idct16_AVX512_shuff1: dd 2, 6, 0, 4, 3, 7, 1, 5, 10, 14, 8, 12, 11, 15, 9, 13
+
+idct16_AVX512_shuff2: dq 0, 1, 8, 9, 4, 5, 12, 13
+idct16_AVX512_shuff3: dq 2, 3, 10, 11, 6, 7, 14, 15
+idct16_AVX512_shuff4: dq 4, 5, 12, 13, 0, 1, 8, 9
+idct16_AVX512_shuff5: dq 6, 7, 14, 15, 2, 3, 10, 11
+idct16_AVX512_shuff6: times 4 db 14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1
+
 tab_idct32_1: dw 90 ,90 ,88 ,85, 82, 78, 73, 67, 61, 54, 46, 38, 31, 22, 13, 4
               dw 90, 82, 67, 46, 22, -4, -31, -54, -73, -85, -90, -88, -78, -61, -38, -13
               dw 88, 67, 31, -13, -54, -82, -90, -78, -46, -4, 38, 73, 90, 85, 61, 22
@@ -3671,6 +3692,599 @@
     jnz             .pass2
     RET
+
+%macro IDCT16_AVX512_PASS1 3
+    movu            m5, [tab_AVX512_idct16_2 + %1 * 64]
+    pmaddwd         m9, m0, m5
+    pmaddwd         m10, m7, m5
+
+    vpsrldq         m16, m9, 4
+    paddd           m9, m16
+    vpslldq         m17, m10, 4
+    paddd           m10, m17
+    vmovdqu32       m9 {k1}, m10
+
+    pmaddwd         m10, m6, m5
+    pmaddwd         m11, m8, m5
+
+    vpsrldq         m16, m10, 4
+    paddd           m10, m16
+    vpslldq         m17, m11, 4
+    paddd           m11, m17
+    vmovdqu32       m10 {k1}, m11
+
+    vpsrldq         m16, m9, 8
+    paddd           m9, m16
+    vpslldq         m17, m10, 8
+    paddd           m10, m17
+    vmovdqu32       m9 {k2}, m10
+
+    movu            m5, [tab_AVX512_idct16_1 + %1 * 64]
+    pmaddwd         m10, m1, m5
+    pmaddwd         m11, m3, m5
+
+    vpsrldq         m16, m10, 4
+    paddd           m10, m16
+    vpslldq         m17, m11, 4
+    paddd           m11, m17
+    vmovdqu32       m10 {k1}, m11
+
+    pmaddwd         m11, m4, m5
+    pmaddwd         m12, m2, m5
+
+    vpsrldq         m16, m11, 4
+    paddd           m11, m16
+    vpslldq         m17, m12, 4
+    paddd           m12, m17
+    vmovdqu32       m11 {k1}, m12
+
+    vpsrldq         m16, m10, 8
+    paddd           m10, m16
+    vpslldq         m17, m11, 8
+    paddd           m11, m17
+    vmovdqu32       m10 {k2}, m11
+
+    paddd           m11, m9, m10
+    paddd           m11, m14
+    psrad           m11, IDCT_SHIFT1
+
+    psubd           m9, m10
+    paddd           m9, m14
+    psrad           m9, IDCT_SHIFT1
+
+    movu            m5, [tab_AVX512_idct16_2 + %1 * 64 + 64]
+    pmaddwd         m10, m0, m5
+    pmaddwd         m12, m7, m5
+
+    vpsrldq         m16, m10, 4
+    paddd           m10, m16
+    vpslldq         m17, m12, 4
+    paddd           m12, m17
+    vmovdqu32       m10 {k1}, m12
+
+    pmaddwd         m12, m6, m5
+    pmaddwd         m13, m8, m5
+
+    vpsrldq         m16, m12, 4
+    paddd           m12, m16
+    vpslldq         m17, m13, 4
+    paddd           m13, m17
+    vmovdqu32       m12 {k1}, m13
+
+    vpsrldq         m16, m10, 8
+    paddd           m10, m16
+    vpslldq         m17, m12, 8
+    paddd           m12, m17
+    vmovdqu32       m10 {k2}, m12
+
+    movu            m5, [tab_AVX512_idct16_1 + %1 * 64 + 64]
+    pmaddwd         m12, m1, m5
+    pmaddwd         m13, m3, m5
+
+    vpsrldq         m16, m12, 4
+    paddd           m12, m16
+    vpslldq         m17, m13, 4
+    paddd           m13, m17
+    vmovdqu32       m12 {k1}, m13
+
+    pmaddwd         m13, m4, m5
+    pmaddwd         m5, m2
+
+    vpsrldq         m16, m13, 4
+    paddd           m13, m16
+    vpslldq         m17, m5, 4
+    paddd           m5, m17
+    vmovdqu32       m13 {k1}, m5
+
+    vpsrldq         m16, m12, 8
+    paddd           m12, m16
+    vpslldq         m17, m13, 8
+    paddd           m13, m17
+    vmovdqu32       m12 {k2}, m13
+
+    paddd           m5, m10, m12
+    paddd           m5, m14
+    psrad           m5, IDCT_SHIFT1
+
+    psubd           m10, m12
+    paddd           m10, m14
+    psrad           m10, IDCT_SHIFT1
+
+    packssdw        m11, m5
+    packssdw        m9, m10
+
+    movu            m10, [idct16_AVX512_shuff]
+    movu            m5, [idct16_AVX512_shuff1]
+
+    vpermd          m%2, m10, m11
+    vpermd          m%3, m5, m9
+%endmacro
+
+%macro IDCT16_AVX512_PASS2 2
+    vpermq          m0, m%1, 0xD8
+
+    pmaddwd         m1, m0, m7
+    pmaddwd         m2, m0, m8
+
+    vpsrldq         m14, m1, 4
+    paddd           m1, m14
+    vpslldq         m31, m2, 4
+    paddd           m2, m31
+    vmovdqu32       m1 {k1}, m2
+
+    pmaddwd         m2, m0, m9
+    pmaddwd         m3, m0, m10
+
+    vpsrldq         m14, m2, 4
+    paddd           m2, m14
+    vpslldq         m31, m3, 4
+    paddd           m3, m31
+    vmovdqu32       m2 {k1}, m3
+
+    vpsrldq         m14, m1, 8
+    paddd           m1, m14
+    vpslldq         m31, m2, 8
+    paddd           m2, m31
+    vmovdqu32       m1 {k2}, m2
+
+    pmaddwd         m2, m0, m11
+    pmaddwd         m3, m0, m12
+
+    vpsrldq         m14, m2, 4
+    paddd           m2, m14
+    vpslldq         m31, m3, 4
+    paddd           m3, m31
+    vmovdqu32       m2 {k1}, m3
+
+    vbroadcasti64x2 m14, [r5 + 112]
+    pmaddwd         m3, m0, m13
+    pmaddwd         m4, m0, m14
+
+    vpsrldq         m14, m3, 4
+    paddd           m3, m14
+    vpslldq         m31, m4, 4
+    paddd           m4, m31
+    vmovdqu32       m3 {k1}, m4
+
+    vpsrldq         m14, m2, 8
+    paddd           m2, m14
+    vpslldq         m31, m3, 8
+    paddd           m3, m31
+    vmovdqu32       m2 {k2}, m3
+
+    vpermq          m0, m%2, 0xD8
+    pmaddwd         m3, m0, m16
+    pmaddwd         m4, m0, m17
+
+    vpsrldq         m14, m3, 4
+    paddd           m3, m14
+    vpslldq         m31, m4, 4
+    paddd           m4, m31
+    vmovdqu32       m3 {k1}, m4
+
+    pmaddwd         m4, m0, m19
+    pmaddwd         m5, m0, m23
+
+    vpsrldq         m14, m4, 4
+    paddd           m4, m14
+    vpslldq         m31, m5, 4
+    paddd           m5, m31
+    vmovdqu32       m4 {k1}, m5
+
+    vpsrldq         m14, m3, 8
+    paddd           m3, m14
+    vpslldq         m31, m4, 8
+    paddd           m4, m31
+    vmovdqu32       m3 {k2}, m4
+
+    pmaddwd         m4, m0, m28
+    pmaddwd         m5, m0, m29
+
+    vpsrldq         m14, m4, 4
+    paddd           m4, m14
+    vpslldq         m31, m5, 4
+    paddd           m5, m31
+    vmovdqu32       m4 {k1}, m5
+
+    pmaddwd         m6, m0, m30
+    vbroadcasti64x2 m31, [r6 + 112]
+    pmaddwd         m0, m31
+
+    vpsrldq         m14, m6, 4
+    paddd           m6, m14
+    vpslldq         m31, m0, 4
+    paddd           m0, m31
+    vmovdqu32       m6 {k1}, m0
+
+    vpsrldq         m14, m4, 8
+    paddd           m4, m14
+    vpslldq         m31, m6, 8
+    paddd           m6, m31
+    vmovdqu32       m4 {k2}, m6
+
+    paddd           m5, m1, m3
+    paddd           m5, m15
+    psrad           m5, IDCT_SHIFT2
+
+    psubd           m1, m3
+    paddd           m1, m15
+    psrad           m1, IDCT_SHIFT2
+
+    paddd           m6, m2, m4
+    paddd           m6, m15
+    psrad           m6, IDCT_SHIFT2
+
+    psubd           m2, m4
+    paddd           m2, m15
+    psrad           m2, IDCT_SHIFT2
+
+    packssdw        m5, m6
+    packssdw        m1, m2
+    pshufb          m2, m1, [idct16_AVX512_shuff6]
+%endmacro
+
+;-------------------------------------------------------
+; void idct16(const int16_t* src, int16_t* dst, intptr_t dstStride)
+;-------------------------------------------------------
+INIT_ZMM avx512
+cglobal idct16, 3, 8, 32
+%if BIT_DEPTH == 12
+    %define IDCT_SHIFT2 8
+    vpbroadcastd    m15, [pd_128]
+%elif BIT_DEPTH == 10
+    %define IDCT_SHIFT2 10
+    vpbroadcastd    m15, [pd_512]
+%elif BIT_DEPTH == 8
+    %define IDCT_SHIFT2 12
+    vpbroadcastd    m15, [pd_2048]
+%else
+    %error Unsupported BIT_DEPTH!
+%endif
+%define IDCT_SHIFT1 7
+
+    vpbroadcastd    m14, [pd_64]
+
+    add             r2d, r2d
+
+    mov             r7d, 0xAAAA
+    kmovd           k1, r7d
+    mov             r7d, 0xCCCC
+    kmovd           k2, r7d
+
+.pass1:
+    movu            xm0, [r0 + 0 * 32]
+    movu            xm1, [r0 + 8 * 32]
+    punpckhqdq      xm2, xm0, xm1
+    punpcklqdq      xm0, xm1
+    vinserti128     ym0, ym0, xm2, 1
+
+    movu            xm1, [r0 + 1 * 32]
+    movu            xm2, [r0 + 9 * 32]
+    punpckhqdq      xm3, xm1, xm2
+    punpcklqdq      xm1, xm2
+    vinserti128     ym1, ym1, xm3, 1
+
+    movu            xm2, [r0 + 2 * 32]
+    movu            xm3, [r0 + 10 * 32]
+    punpckhqdq      xm4, xm2, xm3
+    punpcklqdq      xm2, xm3
+    vinserti128     ym2, ym2, xm4, 1
+
+    movu            xm3, [r0 + 3 * 32]
+    movu            xm4, [r0 + 11 * 32]
+    punpckhqdq      xm5, xm3, xm4
+    punpcklqdq      xm3, xm4
+    vinserti128     ym3, ym3, xm5, 1
+
+    movu            xm4, [r0 + 4 * 32]
+    movu            xm5, [r0 + 12 * 32]
+    punpckhqdq      xm6, xm4, xm5
+    punpcklqdq      xm4, xm5
+    vinserti128     ym4, ym4, xm6, 1
+
+    movu            xm5, [r0 + 5 * 32]
+    movu            xm6, [r0 + 13 * 32]
+    punpckhqdq      xm7, xm5, xm6
+    punpcklqdq      xm5, xm6
+    vinserti128     ym5, ym5, xm7, 1
+
+    movu            xm6, [r0 + 6 * 32]
+    movu            xm7, [r0 + 14 * 32]
+    punpckhqdq      xm8, xm6, xm7
+    punpcklqdq      xm6, xm7
+    vinserti128     ym6, ym6, xm8, 1
+
+    movu            xm7, [r0 + 7 * 32]
+    movu            xm8, [r0 + 15 * 32]
+    punpckhqdq      xm9, xm7, xm8
+    punpcklqdq      xm7, xm8
+    vinserti128     ym7, ym7, xm9, 1
+
+    punpckhwd       ym8, ym0, ym2            ;[8 10]
+    punpcklwd       ym0, ym2                 ;[0 2]
+
+    punpckhwd       ym2, ym1, ym3            ;[9 11]
+    punpcklwd       ym1, ym3                 ;[1 3]
+
+    punpckhwd       ym3, ym4, ym6            ;[12 14]
+    punpcklwd       ym4, ym6                 ;[4 6]
+
+    punpckhwd       ym6, ym5, ym7            ;[13 15]
+    punpcklwd       ym5, ym7                 ;[5 7]
+
+    punpckhdq       ym7, ym0, ym4            ;[02 22 42 62 03 23 43 63 06 26 46 66 07 27 47 67]
+    punpckldq       ym0, ym4                 ;[00 20 40 60 01 21 41 61 04 24 44 64 05 25 45 65]
+
+    punpckhdq       ym4, ym8, ym3            ;[82 102 122 142 83 103 123 143 86 106 126 146 87 107 127 147]
+    punpckldq       ym8, ym3                 ;[80 100 120 140 81 101 121 141 84 104 124 144 85 105 125 145]
+
+    punpckhdq       ym3, ym1, ym5            ;[12 32 52 72 13 33 53 73 16 36 56 76 17 37 57 77]
+    punpckldq       ym1, ym5                 ;[10 30 50 70 11 31 51 71 14 34 54 74 15 35 55 75]
+
+    punpckhdq       ym5, ym2, ym6            ;[92 112 132 152 93 113 133 153 96 116 136 156 97 117 137 157]
+    punpckldq       ym2, ym6                 ;[90 110 130 150 91 111 131 151 94 114 134 154 95 115 135 155]
+
+    punpckhqdq      ym6, ym0, ym8            ;[01 21 41 61 81 101 121 141 05 25 45 65 85 105 125 145]
+    punpcklqdq      ym0, ym8                 ;[00 20 40 60 80 100 120 140 04 24 44 64 84 104 124 144]
+
+    punpckhqdq      ym8, ym7, ym4            ;[03 23 43 63 83 103 123 143 07 27 47 67 87 107 127 147]
+    punpcklqdq      ym7, ym4                 ;[02 22 42 62 82 102 122 142 06 26 46 66 86 106 126 146]
+
+    punpckhqdq      ym4, ym1, ym2            ;[11 31 51 71 91 111 131 151 15 35 55 75 95 115 135 155]
+    punpcklqdq      ym1, ym2                 ;[10 30 50 70 90 110 130 150 14 34 54 74 94 114 134 154]
+
+    punpckhqdq      ym2, ym3, ym5            ;[13 33 53 73 93 113 133 153 17 37 57 77 97 117 137 157]
+    punpcklqdq      ym3, ym5                 ;[12 32 52 72 92 112 132 152 16 36 56 76 96 116 136 156]
+
+    vinserti64x4    m6, m6, ym6, 1
+    vinserti64x4    m0, m0, ym0, 1
+    vinserti64x4    m8, m8, ym8, 1
+    vinserti64x4    m7, m7, ym7, 1
+    vinserti64x4    m4, m4, ym4, 1
+    vinserti64x4    m1, m1, ym1, 1
+    vinserti64x4    m2, m2, ym2, 1
+    vinserti64x4    m3, m3, ym3, 1
+
+    IDCT16_AVX512_PASS1 0, 18, 19
+    IDCT16_AVX512_PASS1 2, 20, 21
+
+    add             r0, 16
+
+    movu            xm0, [r0 + 0 * 32]
+    movu            xm1, [r0 + 8 * 32]
+    punpckhqdq      xm2, xm0, xm1
+    punpcklqdq      xm0, xm1
+    vinserti128     ym0, ym0, xm2, 1
+
+    movu            xm1, [r0 + 1 * 32]
+    movu            xm2, [r0 + 9 * 32]
+    punpckhqdq      xm3, xm1, xm2
+    punpcklqdq      xm1, xm2
+    vinserti128     ym1, ym1, xm3, 1
+
+    movu            xm2, [r0 + 2 * 32]
+    movu            xm3, [r0 + 10 * 32]
+    punpckhqdq      xm4, xm2, xm3
+    punpcklqdq      xm2, xm3
+    vinserti128     ym2, ym2, xm4, 1
+
+    movu            xm3, [r0 + 3 * 32]
+    movu            xm4, [r0 + 11 * 32]
+    punpckhqdq      xm5, xm3, xm4
+    punpcklqdq      xm3, xm4
+    vinserti128     ym3, ym3, xm5, 1
+
+    movu            xm4, [r0 + 4 * 32]
+    movu            xm5, [r0 + 12 * 32]
+    punpckhqdq      xm6, xm4, xm5
+    punpcklqdq      xm4, xm5
+    vinserti128     ym4, ym4, xm6, 1
+
+    movu            xm5, [r0 + 5 * 32]
+    movu            xm6, [r0 + 13 * 32]
+    punpckhqdq      xm7, xm5, xm6
+    punpcklqdq      xm5, xm6
+    vinserti128     ym5, ym5, xm7, 1
+
+    movu            xm6, [r0 + 6 * 32]
+    movu            xm7, [r0 + 14 * 32]
+    punpckhqdq      xm8, xm6, xm7
+    punpcklqdq      xm6, xm7
+    vinserti128     ym6, ym6, xm8, 1
+
+    movu            xm7, [r0 + 7 * 32]
+    movu            xm8, [r0 + 15 * 32]
+    punpckhqdq      xm9, xm7, xm8
+    punpcklqdq      xm7, xm8
+    vinserti128     ym7, ym7, xm9, 1
+
+    punpckhwd       ym8, ym0, ym2            ;[8 10]
+    punpcklwd       ym0, ym2                 ;[0 2]
+
+    punpckhwd       ym2, ym1, ym3            ;[9 11]
+    punpcklwd       ym1, ym3                 ;[1 3]
+
+    punpckhwd       ym3, ym4, ym6            ;[12 14]
+    punpcklwd       ym4, ym6                 ;[4 6]
+
+    punpckhwd       ym6, ym5, ym7            ;[13 15]
+    punpcklwd       ym5, ym7                 ;[5 7]
+
+    punpckhdq       ym7, ym0, ym4            ;[02 22 42 62 03 23 43 63 06 26 46 66 07 27 47 67]
+    punpckldq       ym0, ym4                 ;[00 20 40 60 01 21 41 61 04 24 44 64 05 25 45 65]
+
+    punpckhdq       ym4, ym8, ym3            ;[82 102 122 142 83 103 123 143 86 106 126 146 87 107 127 147]
+    punpckldq       ym8, ym3                 ;[80 100 120 140 81 101 121 141 84 104 124 144 85 105 125 145]
+
+    punpckhdq       ym3, ym1, ym5            ;[12 32 52 72 13 33 53 73 16 36 56 76 17 37 57 77]
+    punpckldq       ym1, ym5                 ;[10 30 50 70 11 31 51 71 14 34 54 74 15 35 55 75]
+
+    punpckhdq       ym5, ym2, ym6            ;[92 112 132 152 93 113 133 153 96 116 136 156 97 117 137 157]
+    punpckldq       ym2, ym6                 ;[90 110 130 150 91 111 131 151 94 114 134 154 95 115 135 155]
+
+    punpckhqdq      ym6, ym0, ym8            ;[01 21 41 61 81 101 121 141 05 25 45 65 85 105 125 145]
+    punpcklqdq      ym0, ym8                 ;[00 20 40 60 80 100 120 140 04 24 44 64 84 104 124 144]
+
+    punpckhqdq      ym8, ym7, ym4            ;[03 23 43 63 83 103 123 143 07 27 47 67 87 107 127 147]
+    punpcklqdq      ym7, ym4                 ;[02 22 42 62 82 102 122 142 06 26 46 66 86 106 126 146]
+
+    punpckhqdq      ym4, ym1, ym2            ;[11 31 51 71 91 111 131 151 15 35 55 75 95 115 135 155]
+    punpcklqdq      ym1, ym2                 ;[10 30 50 70 90 110 130 150 14 34 54 74 94 114 134 154]
+
+    punpckhqdq      ym2, ym3, ym5            ;[13 33 53 73 93 113 133 153 17 37 57 77 97 117 137 157]
+    punpcklqdq      ym3, ym5                 ;[12 32 52 72 92 112 132 152 16 36 56 76 96 116 136 156]
+
+    vinserti64x4    m6, m6, ym6, 1
+    vinserti64x4    m0, m0, ym0, 1
+    vinserti64x4    m8, m8, ym8, 1
+    vinserti64x4    m7, m7, ym7, 1
+    vinserti64x4    m4, m4, ym4, 1
+    vinserti64x4    m1, m1, ym1, 1
+    vinserti64x4    m2, m2, ym2, 1
+    vinserti64x4    m3, m3, ym3, 1
+
+    IDCT16_AVX512_PASS1 0, 22, 23
+    IDCT16_AVX512_PASS1 2, 24, 25
+
+    movu            m26, [idct16_AVX512_shuff2]
+    movu            m27, [idct16_AVX512_shuff3]
+    vpermi2q        m26, m18, m22
+    vpermi2q        m27, m18, m22
+    movu            m18, [idct16_AVX512_shuff2]
+    movu            m22, [idct16_AVX512_shuff3]
+    vpermi2q        m18, m20, m24
+    vpermi2q        m22, m20, m24
+    movu            m20, [idct16_AVX512_shuff4]
+    movu            m24, [idct16_AVX512_shuff5]
+    vpermi2q        m20, m21, m25
+    vpermi2q        m24, m21, m25
+    movu            m21, [idct16_AVX512_shuff4]
+    movu            m25, [idct16_AVX512_shuff5]
+    vpermi2q        m21, m19, m23
+    vpermi2q        m25, m19, m23
+
+    lea             r5, [tab_idct16_2]
+    lea             r6, [tab_idct16_1]
+
+    vbroadcasti64x2 m7, [r5]
+    vbroadcasti64x2 m8, [r5 + 16]
+    vbroadcasti64x2 m9, [r5 + 32]
+    vbroadcasti64x2 m10, [r5 + 48]
+    vbroadcasti64x2 m11, [r5 + 64]
+    vbroadcasti64x2 m12, [r5 + 80]
+    vbroadcasti64x2 m13, [r5 + 96]
+
+    vbroadcasti64x2 m16, [r6]
+    vbroadcasti64x2 m17, [r6 + 16]
+    vbroadcasti64x2 m19, [r6 + 32]
+    vbroadcasti64x2 m23, [r6 + 48]
+    vbroadcasti64x2 m28, [r6 + 64]
+    vbroadcasti64x2 m29, [r6 + 80]
+    vbroadcasti64x2 m30, [r6 + 96]
+
+    IDCT16_AVX512_PASS2 26, 27
+    mova            [r1], xm5
+    mova            [r1 + 16], xm2
+    vextracti128    [r1 + r2], ym5, 1
+    vextracti128    [r1 + r2 + 16], ym2, 1
+    vextracti64x4   ym14, m5, 1
+    vextracti64x4   ym31, m2, 1
+    lea             r1, [r1 + 2 * r2]
+    mova            [r1], xm14
+    mova            [r1 + 16], xm31
+    vextracti128    [r1 + r2], ym14, 1
+    vextracti128    [r1 + r2 + 16], ym31, 1
+
+    IDCT16_AVX512_PASS2 18, 22
+    lea             r1, [r1 + 2 * r2]
+    mova            [r1], xm5
+    mova            [r1 + 16], xm2
+    vextracti128    [r1 + r2], ym5, 1
+    vextracti128    [r1 + r2 + 16], ym2, 1
+    vextracti64x4   ym14, m5, 1
+    vextracti64x4   ym31, m2, 1
+    lea             r1, [r1 + 2 * r2]
+    mova            [r1], xm14
+    mova            [r1 + 16], xm31
+    vextracti128    [r1 + r2], ym14, 1
+    vextracti128    [r1 + r2 + 16], ym31, 1
+
+    IDCT16_AVX512_PASS2 20, 24
+    lea             r1, [r1 + 2 * r2]
+    mova            [r1], xm5
+    mova            [r1 + 16], xm2
+    vextracti128    [r1 + r2], ym5, 1
+    vextracti128    [r1 + r2 + 16], ym2, 1
+    vextracti64x4   ym14, m5, 1
+    vextracti64x4   ym31, m2, 1
+    lea             r1, [r1 + 2 * r2]
+    mova            [r1], xm14
+    mova            [r1 + 16], xm31
+    vextracti128    [r1 + r2], ym14, 1
+    vextracti128    [r1 + r2 + 16], ym31, 1
+
+    IDCT16_AVX512_PASS2 21, 25
+    lea             r1, [r1 + 2 * r2]
+    mova            [r1], xm5
+    mova            [r1 + 16], xm2
+    vextracti128    [r1 + r2], ym5, 1
+    vextracti128    [r1 + r2 + 16], ym2, 1
+    vextracti64x4   ym14, m5, 1
+    vextracti64x4   ym31, m2, 1
+    lea             r1, [r1 + 2 * r2]
+    mova            [r1], xm14
+    mova            [r1 + 16], xm31
+    vextracti128    [r1 + r2], ym14, 1
+    vextracti128    [r1 + r2 + 16], ym31, 1
+    RET
+
+
 %macro IDCT32_PASS1 1
     vbroadcasti128  m3, [tab_idct32_1 + %1 * 32]
     vbroadcasti128  m13, [tab_idct32_1 + %1 * 32 + 16]
diff -r df3c576cd32c -r 8bbcc1bd3c13 source/common/x86/dct8.h
--- a/source/common/x86/dct8.h	Mon Nov 13 16:02:40 2017 +0530
+++ b/source/common/x86/dct8.h	Mon Nov 06 11:39:56 2017 +0530
@@ -45,5 +45,6 @@
 void PFX(dct8_avx512)(const int16_t* src, int16_t* dst, intptr_t srcStride);
 void PFX(idct8_avx512)(const int16_t* src, int16_t* dst, intptr_t dstStride);
+void PFX(idct16_avx512)(const int16_t* src, int16_t* dst, intptr_t dstStride);
 
 #endif // ifndef X265_DCT8_H
_______________________________________________
x265-devel mailing list
x265-devel@videolan.org
https://mailman.videolan.org/listinfo/x265-devel
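For context, a minimal usage sketch of the new primitive (hedged: the wrapper
and the expanded symbol name are illustrative; PFX() prepends the build's
namespace prefix, x265_ by default, and dstStride appears to be counted in
int16_t elements, since the asm doubles r2d to convert it to bytes):

    #include <cstdint>

    // Declaration mirrors the dct8.h prototype added by this patch; the
    // expanded name assumes the default x265_ namespace prefix.
    extern "C" void x265_idct16_avx512(const int16_t* src, int16_t* dst,
                                       intptr_t dstStride);

    // Hypothetical caller: with this patch applied and AVX512 enabled,
    // p.cu[BLOCK_16x16].idct points at the same function.
    void invTransform16x16(const int16_t coeff[16 * 16], int16_t residual[16 * 16])
    {
        x265_idct16_avx512(coeff, residual, 16);   // 16x16 block, stride = one row
    }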