# HG changeset patch # User Praveen Tiwari <prav...@multicorewareinc.com> # Date 1429268008 -19800 # Fri Apr 17 16:23:28 2015 +0530 # Node ID c8ea565afa9a8e7934ada36f76a0bb79f34d59b2 # Parent 7be1172ec816298c32f588908e1b6f0fa214d349 asm: intra_allangs4x4 improved by ~61% over SSE4
AVX2: intra_allangs4x4 31.17x 1070.01 33353.50
SSE4: intra_allangs4x4 12.04x 2746.58 33061.69

diff -r 7be1172ec816 -r c8ea565afa9a source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp Thu Apr 16 11:38:32 2015 +0530
+++ b/source/common/x86/asm-primitives.cpp Fri Apr 17 16:23:28 2015 +0530
@@ -1909,6 +1909,9 @@
         p.cu[BLOCK_32x32].intra_pred[21] = x265_intra_pred_ang32_21_avx2;
         p.cu[BLOCK_32x32].intra_pred[18] = x265_intra_pred_ang32_18_avx2;
 
+        // all_angs primitives (4x4 only; this patch adds no other AVX2 all-angles sizes)
+        p.cu[BLOCK_4x4].intra_pred_allangs = x265_all_angs_pred_4x4_avx2;
+
         // copy_sp primitives
         p.cu[BLOCK_16x16].copy_sp = x265_blockcopy_sp_16x16_avx2;
         p.chroma[X265_CSP_I420].cu[BLOCK_420_16x16].copy_sp = x265_blockcopy_sp_16x16_avx2;
diff -r 7be1172ec816 -r c8ea565afa9a source/common/x86/intrapred.h
--- a/source/common/x86/intrapred.h Thu Apr 16 11:38:32 2015 +0530
+++ b/source/common/x86/intrapred.h Fri Apr 17 16:23:28 2015 +0530
@@ -283,4 +283,5 @@
 void x265_all_angs_pred_8x8_sse4(pixel *dest, pixel *refPix, pixel *filtPix, int bLuma);
 void x265_all_angs_pred_16x16_sse4(pixel *dest, pixel *refPix, pixel *filtPix, int bLuma);
 void x265_all_angs_pred_32x32_sse4(pixel *dest, pixel *refPix, pixel *filtPix, int bLuma);
+void x265_all_angs_pred_4x4_avx2(pixel *dest, pixel *refPix, pixel *filtPix, int bLuma); // implemented in intrapred8_allangs.asm
 #endif // ifndef X265_INTRAPRED_H
diff -r 7be1172ec816 -r c8ea565afa9a source/common/x86/intrapred8_allangs.asm
--- a/source/common/x86/intrapred8_allangs.asm Thu Apr 16 11:38:32 2015 +0530
+++ b/source/common/x86/intrapred8_allangs.asm Fri Apr 17 16:23:28 2015 +0530
@@ -27,6 +27,64 @@
 
 SECTION_RODATA 32
 
+all_ang4_shuff: db 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6, 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 ; row 0: mode 2. Each 32-byte row is a pshufb mask read by all_angs_pred_4x4 (avx2) at [r3 + k * mmsize]
+                db 0, 1, 1, 2, 2, 3, 3, 4, 1, 2, 2, 3, 3, 4, 4, 5, 2, 3, 3, 4, 4, 5, 5, 6, 3, 4, 4, 5, 5, 6, 6, 7 ; row 1: mode 3
+                db 0, 1, 1, 2, 2, 3, 3, 4, 1, 2, 2, 3, 3, 4, 4, 5, 1, 2, 2, 3, 3, 4, 4, 5, 2, 3, 3, 4, 4, 5, 5, 6 ; row 2: modes 4 and 5
+                db 0, 1, 1, 2, 2, 3, 3, 4, 0, 1, 1, 2, 2, 3, 3, 4, 1, 2, 2, 3, 3, 4, 4, 5, 1, 2, 2, 3, 3, 4, 4, 5 ; row 3: mode 6
+                db 0, 1, 1, 2, 2, 3, 3, 4, 0, 1, 1, 2, 2, 3, 3, 4, 0, 1, 1, 2, 2, 3, 3, 4, 1, 2, 2, 3, 3, 4, 4, 5 ; row 4: mode 7
+                db 0, 1, 1, 2, 2, 3, 3, 4, 0, 1, 1, 2, 2, 3, 3, 4, 0, 1, 1, 2, 2, 3, 3, 4, 0, 1, 1, 2, 2, 3, 3, 4 ; row 5: modes 8 and 9
+                db 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3 ; row 6: mode 10 (low 16 bytes used)
+                db 0, 9, 9, 10, 10, 11, 11, 12, 0, 9, 9, 10, 10, 11, 11, 12, 0, 9, 9, 10, 10, 11, 11, 12, 0, 9, 9, 10, 10, 11, 11, 12 ; row 7: modes 11 and 12 (source reloaded from refPix + 0 from here on)
+                db 0, 9, 9, 10, 10, 11, 11, 12, 0, 9, 9, 10, 10, 11, 11, 12, 0, 9, 9, 10, 10, 11, 11, 12, 4, 0, 0, 9, 9, 10, 10, 11 ; row 8: mode 13
+                db 0, 9, 9, 10, 10, 11, 11, 12, 0, 9, 9, 10, 10, 11, 11, 12, 2, 0, 0, 9, 9, 10, 10, 11, 2, 0, 0, 9, 9, 10, 10, 11 ; row 9: mode 14
+                db 0, 9, 9, 10, 10, 11, 11, 12, 2, 0, 0, 9, 9, 10, 10, 11, 2, 0, 0, 9, 9, 10, 10, 11, 4, 2, 2, 0, 0, 9, 9, 10 ; row 10: mode 15
+                db 0, 9, 9, 10, 10, 11, 11, 12, 2, 0, 0, 9, 9, 10, 10, 11, 2, 0, 0, 9, 9, 10, 10, 11, 3, 2, 2, 0, 0, 9, 9, 10 ; row 11: mode 16
+                db 0, 9, 9, 10, 10, 11, 11, 12, 1, 0, 0, 9, 9, 10, 10, 11, 2, 1, 1, 0, 0, 9, 9, 10, 4, 2, 2, 1, 1, 0, 0, 9 ; row 12: mode 17
+                db 0, 1, 2, 3, 9, 0, 1, 2, 10, 9, 0, 1, 11, 10, 9, 0, 0, 1, 2, 3, 9, 0, 1, 2, 10, 9, 0, 1, 11, 10, 9, 0 ; row 13: mode 18 (pure shuffle, no weights)
+                db 0, 1, 1, 2, 2, 3, 3, 4, 9, 0, 0, 1, 1, 2, 2, 3, 10, 9, 9, 0, 0, 1, 1, 2, 12, 10, 10, 9, 9, 0, 0, 1 ; row 14: mode 19
+                db 0, 1, 1, 2, 2, 3, 3, 4, 10, 0, 0, 1, 1, 2, 2, 3, 10, 0, 0, 1, 1, 2, 2, 3, 11, 10, 10, 0, 0, 1, 1, 2 ; row 15: mode 20
+                db 0, 1, 1, 2, 2, 3, 3, 4, 10, 0, 0, 1, 1, 2, 2, 3, 10, 0, 0, 1, 1, 2, 2, 3, 12, 10, 10, 0, 0, 1, 1, 2 ; row 16: mode 21
+                db 0, 1, 1, 2, 2, 3, 3, 4, 0, 1, 1, 2, 2, 3, 3, 4, 10, 0, 0, 1, 1, 2, 2, 3, 10, 0, 0, 1, 1, 2, 2, 3 ; row 17: mode 22
+                db 0, 1, 1, 2, 2, 3, 3, 4, 0, 1, 1, 2, 2, 3, 3, 4, 0, 1, 1, 2, 2, 3, 3, 4, 12, 0, 0, 1, 1, 2, 2, 3 ; row 18: mode 23
+                db 0, 1, 1, 2, 2, 3, 3, 4, 0, 1, 1, 2, 2, 3, 3, 4, 0, 1, 1, 2, 2, 3, 3, 4, 0, 1, 1, 2, 2, 3, 3, 4 ; row 19: modes 24 and 25
+                db 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4 ; row 20: mode 26 (low 16 bytes used)
+                db 1, 2, 2, 3, 3, 4, 4, 5, 1, 2, 2, 3, 3, 4, 4, 5, 1, 2, 2, 3, 3, 4, 4, 5, 1, 2, 2, 3, 3, 4, 4, 5 ; row 21: modes 27 and 28
+                db 1, 2, 2, 3, 3, 4, 4, 5, 1, 2, 2, 3, 3, 4, 4, 5, 1, 2, 2, 3, 3, 4, 4, 5, 2, 3, 3, 4, 4, 5, 5, 6 ; row 22: mode 29
+                db 1, 2, 2, 3, 3, 4, 4, 5, 1, 2, 2, 3, 3, 4, 4, 5, 2, 3, 3, 4, 4, 5, 5, 6, 2, 3, 3, 4, 4, 5, 5, 6 ; row 23: mode 30
+                db 1, 2, 2, 3, 3, 4, 4, 5, 2, 3, 3, 4, 4, 5, 5, 6, 2, 3, 3, 4, 4, 5, 5, 6, 3, 4, 4, 5, 5, 6, 6, 7 ; row 24: modes 31 and 32
+                db 1, 2, 2, 3, 3, 4, 4, 5, 2, 3, 3, 4, 4, 5, 5, 6, 3, 4, 4, 5, 5, 6, 6, 7, 4, 5, 5, 6, 6, 7, 7, 8 ; row 25: mode 33
+                db 2, 3, 4, 5, 3, 4, 5, 6, 4, 5, 6, 7, 5, 6, 7, 8, 2, 3, 4, 5, 3, 4, 5, 6, 4, 5, 6, 7, 5, 6, 7, 8 ; row 26: mode 34 (pure shuffle, no weights)
+; all_ang4: pmaddubsw weight rows read at [r2 + k * mmsize]; every byte pair in every row sums to 32, and each pair is repeated 4 times per 8-byte group
+all_ang4:       db 6, 26, 6, 26, 6, 26, 6, 26, 12, 20, 12, 20, 12, 20, 12, 20, 18, 14, 18, 14, 18, 14, 18, 14, 24, 8, 24, 8, 24, 8, 24, 8 ; row 0: mode 3
+                db 11, 21, 11, 21, 11, 21, 11, 21, 22, 10, 22, 10, 22, 10, 22, 10, 1, 31, 1, 31, 1, 31, 1, 31, 12, 20, 12, 20, 12, 20, 12, 20 ; row 1: mode 4
+                db 15, 17, 15, 17, 15, 17, 15, 17, 30, 2, 30, 2, 30, 2, 30, 2, 13, 19, 13, 19, 13, 19, 13, 19, 28, 4, 28, 4, 28, 4, 28, 4 ; row 2: mode 5
+                db 19, 13, 19, 13, 19, 13, 19, 13, 6, 26, 6, 26, 6, 26, 6, 26, 25, 7, 25, 7, 25, 7, 25, 7, 12, 20, 12, 20, 12, 20, 12, 20 ; row 3: mode 6
+                db 23, 9, 23, 9, 23, 9, 23, 9, 14, 18, 14, 18, 14, 18, 14, 18, 5, 27, 5, 27, 5, 27, 5, 27, 28, 4, 28, 4, 28, 4, 28, 4 ; row 4: mode 7
+                db 27, 5, 27, 5, 27, 5, 27, 5, 22, 10, 22, 10, 22, 10, 22, 10, 17, 15, 17, 15, 17, 15, 17, 15, 12, 20, 12, 20, 12, 20, 12, 20 ; row 5: mode 8
+                db 30, 2, 30, 2, 30, 2, 30, 2, 28, 4, 28, 4, 28, 4, 28, 4, 26, 6, 26, 6, 26, 6, 26, 6, 24, 8, 24, 8, 24, 8, 24, 8 ; row 6: mode 9
+                db 2, 30, 2, 30, 2, 30, 2, 30, 4, 28, 4, 28, 4, 28, 4, 28, 6, 26, 6, 26, 6, 26, 6, 26, 8, 24, 8, 24, 8, 24, 8, 24 ; row 7: mode 11 (mode 10 uses no weights)
+                db 5, 27, 5, 27, 5, 27, 5, 27, 10, 22, 10, 22, 10, 22, 10, 22, 15, 17, 15, 17, 15, 17, 15, 17, 20, 12, 20, 12, 20, 12, 20, 12 ; row 8: mode 12
+                db 9, 23, 9, 23, 9, 23, 9, 23, 18, 14, 18, 14, 18, 14, 18, 14, 27, 5, 27, 5, 27, 5, 27, 5, 4, 28, 4, 28, 4, 28, 4, 28 ; row 9: mode 13
+                db 13, 19, 13, 19, 13, 19, 13, 19, 26, 6, 26, 6, 26, 6, 26, 6, 7, 25, 7, 25, 7, 25, 7, 25, 20, 12, 20, 12, 20, 12, 20, 12 ; row 10: mode 14
+                db 17, 15, 17, 15, 17, 15, 17, 15, 2, 30, 2, 30, 2, 30, 2, 30, 19, 13, 19, 13, 19, 13, 19, 13, 4, 28, 4, 28, 4, 28, 4, 28 ; row 11: mode 15
+                db 21, 11, 21, 11, 21, 11, 21, 11, 10, 22, 10, 22, 10, 22, 10, 22, 31, 1, 31, 1, 31, 1, 31, 1, 20, 12, 20, 12, 20, 12, 20, 12 ; row 12: mode 16
+                db 26, 6, 26, 6, 26, 6, 26, 6, 20, 12, 20, 12, 20, 12, 20, 12, 14, 18, 14, 18, 14, 18, 14, 18, 8, 24, 8, 24, 8, 24, 8, 24 ; row 13: mode 17
+                db 26, 6, 26, 6, 26, 6, 26, 6, 20, 12, 20, 12, 20, 12, 20, 12, 14, 18, 14, 18, 14, 18, 14, 18, 8, 24, 8, 24, 8, 24, 8, 24 ; row 14: mode 19 (mode 18 uses no weights)
+                db 21, 11, 21, 11, 21, 11, 21, 11, 10, 22, 10, 22, 10, 22, 10, 22, 31, 1, 31, 1, 31, 1, 31, 1, 20, 12, 20, 12, 20, 12, 20, 12 ; row 15: mode 20
+                db 17, 15, 17, 15, 17, 15, 17, 15, 2, 30, 2, 30, 2, 30, 2, 30, 19, 13, 19, 13, 19, 13, 19, 13, 4, 28, 4, 28, 4, 28, 4, 28 ; row 16: mode 21
+                db 13, 19, 13, 19, 13, 19, 13, 19, 26, 6, 26, 6, 26, 6, 26, 6, 7, 25, 7, 25, 7, 25, 7, 25, 20, 12, 20, 12, 20, 12, 20, 12 ; row 17: mode 22
+                db 9, 23, 9, 23, 9, 23, 9, 23, 18, 14, 18, 14, 18, 14, 18, 14, 27, 5, 27, 5, 27, 5, 27, 5, 4, 28, 4, 28, 4, 28, 4, 28 ; row 18: mode 23
+                db 5, 27, 5, 27, 5, 27, 5, 27, 10, 22, 10, 22, 10, 22, 10, 22, 15, 17, 15, 17, 15, 17, 15, 17, 20, 12, 20, 12, 20, 12, 20, 12 ; row 19: mode 24
+                db 2, 30, 2, 30, 2, 30, 2, 30, 4, 28, 4, 28, 4, 28, 4, 28, 6, 26, 6, 26, 6, 26, 6, 26, 8, 24, 8, 24, 8, 24, 8, 24 ; row 20: mode 25
+                db 30, 2, 30, 2, 30, 2, 30, 2, 28, 4, 28, 4, 28, 4, 28, 4, 26, 6, 26, 6, 26, 6, 26, 6, 24, 8, 24, 8, 24, 8, 24, 8 ; row 21: mode 27 (mode 26 uses no weights)
+                db 27, 5, 27, 5, 27, 5, 27, 5, 22, 10, 22, 10, 22, 10, 22, 10, 17, 15, 17, 15, 17, 15, 17, 15, 12, 20, 12, 20, 12, 20, 12, 20 ; row 22: mode 28
+                db 23, 9, 23, 9, 23, 9, 23, 9, 14, 18, 14, 18, 14, 18, 14, 18, 5, 27, 5, 27, 5, 27, 5, 27, 28, 4, 28, 4, 28, 4, 28, 4 ; row 23: mode 29
+                db 19, 13, 19, 13, 19, 13, 19, 13, 6, 26, 6, 26, 6, 26, 6, 26, 25, 7, 25, 7, 25, 7, 25, 7, 12, 20, 12, 20, 12, 20, 12, 20 ; row 24: mode 30
+                db 15, 17, 15, 17, 15, 17, 15, 17, 30, 2, 30, 2, 30, 2, 30, 2, 13, 19, 13, 19, 13, 19, 13, 19, 28, 4, 28, 4, 28, 4, 28, 4 ; row 25: mode 31
+                db 11, 21, 11, 21, 11, 21, 11, 21, 22, 10, 22, 10, 22, 10, 22, 10, 1, 31, 1, 31, 1, 31, 1, 31, 12, 20, 12, 20, 12, 20, 12, 20 ; row 26: mode 32
+                db 6, 26, 6, 26, 6, 26, 6, 26, 12, 20, 12, 20, 12, 20, 12, 20, 18, 14, 18, 14, 18, 14, 18, 14, 24, 8, 24, 8, 24, 8, 24, 8 ; row 27: mode 33
+
+
 SECTION .text
 
 ; global constant
@@ -23012,6 +23070,324 @@
 
     movu [r0 + 2111 * 16], m4
RET
+
+;-----------------------------------------------------------------------------
+; void all_angs_pred_4x4(pixel *dest, pixel *refPix, pixel *filtPix, int bLuma)
+;-----------------------------------------------------------------------------
+INIT_YMM avx2
+cglobal all_angs_pred_4x4, 4, 4, 6 ; uses r0-r3 and m0-m5 (x86inc: 4 args, 4 GPRs, 6 vector regs)
+; Writes modes 2..34 in one pass: mode N's 16 bytes are stored at dest + (N - 2) * 16. r0 = dest, r1 = refPix.
+    mova           m5, [pw_1024]            ; pmulhrsw multiplier; pw_1024 is defined elsewhere, presumably words of 1024 so the result is (x + 16) >> 5 -- confirm
+    lea            r2, [all_ang4]           ; r2 is overwritten with the weight-table pointer, so the filtPix argument is never read
+    lea            r3, [all_ang4_shuff]     ; r3 is overwritten with the shuffle-table pointer, so the bLuma argument is never read
+
+; mode 2: pure byte shuffle of refPix[10..], no weighting
+
+    vbroadcasti128 m0, [r1 + 9]             ; m0 = 16 bytes starting at refPix + 9, copied into both 128-bit lanes (source for modes 2..10)
+    mova           xm1, xm0
+    psrldq         xm1, 1                   ; drop byte 0 so index 0 of the mask refers to refPix[10]
+    pshufb         xm1, [r3]                ; shuffle row 0
+    movu           [r0], xm1                ; (2 - 2) * 16 = 0
+
+; mode 3: shuffle row 1, weight row 0
+
+    pshufb         m1, m0, [r3 + 1 * mmsize] ; gather the byte pairs to be blended
+    pmaddubsw      m1, [r2]                 ; each pair -> one word: a * w0 + b * w1 (w0 + w1 = 32 in all_ang4)
+    pmulhrsw       m1, m5                   ; rounded scale by m5 (see the pw_1024 note above)
+
+; mode 4: shuffle row 2, weight row 1
+
+    pshufb         m2, m0, [r3 + 2 * mmsize]
+    pmaddubsw      m2, [r2 + 1 * mmsize]
+    pmulhrsw       m2, m5
+    packuswb       m1, m2                   ; words -> bytes, per lane: [m1.lo | m2.lo] [m1.hi | m2.hi]
+    vpermq         m1, m1, 11011000b        ; qword order 0,2,1,3: low lane = all of mode 3, high lane = all of mode 4
+    movu           [r0 + (3 - 2) * 16], m1  ; 32-byte store covers modes 3 and 4
+
+; mode 5: shuffle row 2 (same mask as mode 4), weight row 2
+
+    pshufb         m1, m0, [r3 + 2 * mmsize]
+    pmaddubsw      m1, [r2 + 2 * mmsize]
+    pmulhrsw       m1, m5
+
+; mode 6: shuffle row 3, weight row 3
+
+    pshufb         m2, m0, [r3 + 3 * mmsize]
+    pmaddubsw      m2, [r2 + 3 * mmsize]
+    pmulhrsw       m2, m5
+    packuswb       m1, m2
+    vpermq         m1, m1, 11011000b
+    movu           [r0 + (5 - 2) * 16], m1  ; modes 5 and 6
+
+    add            r3, 4 * mmsize           ; r3 -> shuffle row 4
+    add            r2, 4 * mmsize           ; r2 -> weight row 4
+
+; mode 7: shuffle row 4, weight row 4
+
+    pshufb         m1, m0, [r3 + 0 * mmsize]
+    pmaddubsw      m1, [r2 + 0 * mmsize]
+    pmulhrsw       m1, m5
+
+; mode 8: shuffle row 5, weight row 5
+
+    pshufb         m2, m0, [r3 + 1 * mmsize]
+    pmaddubsw      m2, [r2 + 1 * mmsize]
+    pmulhrsw       m2, m5
+    packuswb       m1, m2
+    vpermq         m1, m1, 11011000b
+    movu           [r0 + (7 - 2) * 16], m1  ; modes 7 and 8
+
+; mode 9: shuffle row 5 (same mask as mode 8), weight row 6
+
+    pshufb         m1, m0, [r3 + 1 * mmsize]
+    pmaddubsw      m1, [r2 + 2 * mmsize]
+    pmulhrsw       m1, m5
+    packuswb       m1, m1                   ; single mode: pack with itself
+    vpermq         m1, m1, 11011000b        ; bring both halves of the mode into the low lane
+    movu           [r0 + (9 - 2) * 16], xm1 ; 16-byte store, mode 9 only
+
+; mode 10: pure shuffle (row 6), then 4 bytes are overwritten with a gradient-adjusted value
+
+    pshufb         xm1, xm0, [r3 + 2 * mmsize]
+    movu           [r0 + (10 - 2) * 16], xm1
+
+    pxor           xm1, xm1                 ; xm1 = 0: zero for unpacking and an all-zero pshufb mask (broadcast byte 0)
+    movd           xm2, [r1 + 1]            ; refPix[1..4]
+    pshufd         xm3, xm2, 0
+    punpcklbw      xm3, xm1                 ; xm3 = refPix[1..4] widened to words
+    pinsrb         xm2, [r1], 0             ; byte 0 = refPix[0]
+    pshufb         xm4, xm2, xm1            ; broadcast refPix[0]
+    punpcklbw      xm4, xm1
+    psubw          xm3, xm4                 ; refPix[1 + i] - refPix[0]
+    psraw          xm3, 1                   ; arithmetic >> 1 keeps the sign
+    pshufb         xm4, xm0, xm1            ; broadcast byte 0 of xm0 = refPix[9]
+    punpcklbw      xm4, xm1
+    paddw          xm3, xm4                 ; refPix[9] + ((refPix[1 + i] - refPix[0]) >> 1)
+    packuswb       xm3, xm1                 ; saturate to 0..255
+
+    pextrb         [r0 + 128], xm3, 0       ; 128 = (10 - 2) * 16: bytes 0, 4, 8, 12 of the mode-10 block
+    pextrb         [r0 + 132], xm3, 1
+    pextrb         [r0 + 136], xm3, 2
+    pextrb         [r0 + 140], xm3, 3
+
+; mode 11: shuffle row 7, weight row 7
+
+    vbroadcasti128 m0, [r1]                 ; reload: m0 = 16 bytes starting at refPix + 0 (source for modes 11..34)
+    pshufb         m1, m0, [r3 + 3 * mmsize]
+    pmaddubsw      m1, [r2 + 3 * mmsize]
+    pmulhrsw       m1, m5
+
+; mode 12: shuffle row 7 (same mask as mode 11), weight row 8
+
+    add            r2, 4 * mmsize           ; r2 -> weight row 8 (r3 still points at shuffle row 4)
+
+    pshufb         m2, m0, [r3 + 3 * mmsize]
+    pmaddubsw      m2, [r2 + 0 * mmsize]
+    pmulhrsw       m2, m5
+    packuswb       m1, m2
+    vpermq         m1, m1, 11011000b
+    movu           [r0 + (11 - 2) * 16], m1 ; modes 11 and 12
+
+; mode 13: shuffle row 8, weight row 9
+
+    add            r3, 4 * mmsize           ; r3 -> shuffle row 8
+
+    pshufb         m1, m0, [r3 + 0 * mmsize]
+    pmaddubsw      m1, [r2 + 1 * mmsize]
+    pmulhrsw       m1, m5
+
+; mode 14: shuffle row 9, weight row 10
+
+    pshufb         m2, m0, [r3 + 1 * mmsize]
+    pmaddubsw      m2, [r2 + 2 * mmsize]
+    pmulhrsw       m2, m5
+    packuswb       m1, m2
+    vpermq         m1, m1, 11011000b
+    movu           [r0 + (13 - 2) * 16], m1 ; modes 13 and 14
+
+; mode 15: shuffle row 10, weight row 11
+
+    pshufb         m1, m0, [r3 + 2 * mmsize]
+    pmaddubsw      m1, [r2 + 3 * mmsize]
+    pmulhrsw       m1, m5
+
+; mode 16: shuffle row 11, weight row 12
+
+    add            r2, 4 * mmsize           ; r2 -> weight row 12
+
+    pshufb         m2, m0, [r3 + 3 * mmsize]
+    pmaddubsw      m2, [r2 + 0 * mmsize]
+    pmulhrsw       m2, m5
+    packuswb       m1, m2
+    vpermq         m1, m1, 11011000b
+    movu           [r0 + (15 - 2) * 16], m1 ; modes 15 and 16
+
+; mode 17: shuffle row 12, weight row 13
+
+    add            r3, 4 * mmsize           ; r3 -> shuffle row 12
+
+    pshufb         m1, m0, [r3 + 0 * mmsize]
+    pmaddubsw      m1, [r2 + 1 * mmsize]
+    pmulhrsw       m1, m5
+    packuswb       m1, m1
+    vpermq         m1, m1, 11011000b        ; mode 17 now fills the low lane
+
+; mode 18: pure shuffle (row 13), no weighting
+
+    pshufb         m2, m0, [r3 + 1 * mmsize]
+    vinserti128    m1, m1, xm2, 1           ; high lane = mode 18, low lane keeps mode 17
+    movu           [r0 + (17 - 2) * 16], m1 ; modes 17 and 18
+
+; mode 19: shuffle row 14, weight row 14
+
+    pshufb         m1, m0, [r3 + 2 * mmsize]
+    pmaddubsw      m1, [r2 + 2 * mmsize]
+    pmulhrsw       m1, m5
+
+; mode 20: shuffle row 15, weight row 15
+
+    pshufb         m2, m0, [r3 + 3 * mmsize]
+    pmaddubsw      m2, [r2 + 3 * mmsize]
+    pmulhrsw       m2, m5
+    packuswb       m1, m2
+    vpermq         m1, m1, 11011000b
+    movu           [r0 + (19 - 2) * 16], m1 ; modes 19 and 20
+
+; mode 21: shuffle row 16, weight row 16
+
+    add            r2, 4 * mmsize           ; r2 -> weight row 16
+    add            r3, 4 * mmsize           ; r3 -> shuffle row 16
+
+    pshufb         m1, m0, [r3 + 0 * mmsize]
+    pmaddubsw      m1, [r2 + 0 * mmsize]
+    pmulhrsw       m1, m5
+
+; mode 22: shuffle row 17, weight row 17
+
+    pshufb         m2, m0, [r3 + 1 * mmsize]
+    pmaddubsw      m2, [r2 + 1 * mmsize]
+    pmulhrsw       m2, m5
+    packuswb       m1, m2
+    vpermq         m1, m1, 11011000b
+    movu           [r0 + (21 - 2) * 16], m1 ; modes 21 and 22
+
+; mode 23: shuffle row 18, weight row 18
+
+    pshufb         m1, m0, [r3 + 2 * mmsize]
+    pmaddubsw      m1, [r2 + 2 * mmsize]
+    pmulhrsw       m1, m5
+
+; mode 24: shuffle row 19, weight row 19
+
+    pshufb         m2, m0, [r3 + 3 * mmsize]
+    pmaddubsw      m2, [r2 + 3 * mmsize]
+    pmulhrsw       m2, m5
+    packuswb       m1, m2
+    vpermq         m1, m1, 11011000b
+    movu           [r0 + (23 - 2) * 16], m1 ; modes 23 and 24
+
+; mode 25: shuffle row 19 (same mask as mode 24), weight row 20
+
+    add            r2, 4 * mmsize           ; r2 -> weight row 20 (r3 still points at shuffle row 16)
+
+    pshufb         m1, m0, [r3 + 3 * mmsize]
+    pmaddubsw      m1, [r2 + 0 * mmsize]
+    pmulhrsw       m1, m5
+    packuswb       m1, m1
+    vpermq         m1, m1, 11011000b
+    movu           [r0 + (25 - 2) * 16], xm1 ; 16-byte store, mode 25 only
+
+; mode 26: pure shuffle (row 20), then 4 bytes are overwritten with a gradient-adjusted value
+
+    add            r3, 4 * mmsize           ; r3 -> shuffle row 20
+
+    pshufb         xm1, xm0, [r3 + 0 * mmsize]
+    movu           [r0 + (26 - 2) * 16], xm1
+
+    pxor           xm1, xm1                 ; xm1 = 0: zero for unpacking and an all-zero pshufb mask
+    movd           xm2, [r1 + 9]            ; refPix[9..12]
+    pshufd         xm3, xm2, 0
+    punpcklbw      xm3, xm1                 ; xm3 = refPix[9..12] widened to words
+    pinsrb         xm4, [r1 + 0], 0         ; only byte 0 of xm4 is set; the other bytes are stale but unused
+    pshufb         xm4, xm1                 ; broadcast refPix[0] (the zero mask selects byte 0 everywhere)
+    punpcklbw      xm4, xm1
+    psubw          xm3, xm4                 ; refPix[9 + i] - refPix[0]
+    psraw          xm3, 1
+    psrldq         xm2, xm0, 1              ; byte 0 = refPix[1] (xm0 now starts at refPix + 0)
+    pshufb         xm2, xm1                 ; broadcast refPix[1]
+    punpcklbw      xm2, xm1
+    paddw          xm3, xm2                 ; refPix[1] + ((refPix[9 + i] - refPix[0]) >> 1)
+    packuswb       xm3, xm1                 ; saturate to 0..255
+
+    pextrb         [r0 + 384], xm3, 0       ; 384 = (26 - 2) * 16: bytes 0, 4, 8, 12 of the mode-26 block
+    pextrb         [r0 + 388], xm3, 1
+    pextrb         [r0 + 392], xm3, 2
+    pextrb         [r0 + 396], xm3, 3
+
+; mode 27: shuffle row 21, weight row 21
+
+    pshufb         m1, m0, [r3 + 1 * mmsize]
+    pmaddubsw      m1, [r2 + 1 * mmsize]
+    pmulhrsw       m1, m5
+
+; mode 28: shuffle row 21 (same mask as mode 27), weight row 22
+
+    pshufb         m2, m0, [r3 + 1 * mmsize]
+    pmaddubsw      m2, [r2 + 2 * mmsize]
+    pmulhrsw       m2, m5
+    packuswb       m1, m2
+    vpermq         m1, m1, 11011000b
+    movu           [r0 + (27 - 2) * 16], m1 ; modes 27 and 28
+
+; mode 29: shuffle row 22, weight row 23
+
+    pshufb         m1, m0, [r3 + 2 * mmsize]
+    pmaddubsw      m1, [r2 + 3 * mmsize]
+    pmulhrsw       m1, m5
+
+; mode 30: shuffle row 23, weight row 24
+
+    add            r2, 4 * mmsize           ; r2 -> weight row 24
+
+    pshufb         m2, m0, [r3 + 3 * mmsize]
+    pmaddubsw      m2, [r2 + 0 * mmsize]
+    pmulhrsw       m2, m5
+    packuswb       m1, m2
+    vpermq         m1, m1, 11011000b
+    movu           [r0 + (29 - 2) * 16], m1 ; modes 29 and 30
+
+; mode 31: shuffle row 24, weight row 25
+
+    add            r3, 4 * mmsize           ; r3 -> shuffle row 24
+
+    pshufb         m1, m0, [r3 + 0 * mmsize]
+    pmaddubsw      m1, [r2 + 1 * mmsize]
+    pmulhrsw       m1, m5
+
+; mode 32: shuffle row 24 (same mask as mode 31), weight row 26
+
+    pshufb         m2, m0, [r3 + 0 * mmsize]
+    pmaddubsw      m2, [r2 + 2 * mmsize]
+    pmulhrsw       m2, m5
+    packuswb       m1, m2
+    vpermq         m1, m1, 11011000b
+    movu           [r0 + (31 - 2) * 16], m1 ; modes 31 and 32
+
+; mode 33: shuffle row 25, weight row 27
+
+    pshufb         m1, m0, [r3 + 1 * mmsize]
+    pmaddubsw      m1, [r2 + 3 * mmsize]
+    pmulhrsw       m1, m5
+    packuswb       m1, m2                   ; m2 still holds mode 32 words; after vpermq they sit in the high lane, which vinserti128 replaces below
+    vpermq         m1, m1, 11011000b        ; low lane = mode 33
+
+; mode 34: pure shuffle (row 26), no weighting
+
+    pshufb         m0, [r3 + 2 * mmsize]    ; m0 is consumed here; no later mode needs the source bytes
+    vinserti128    m1, m1, xm0, 1           ; high lane = mode 34, low lane keeps mode 33
+    movu           [r0 + (33 - 2) * 16], m1 ; modes 33 and 34
+    RET
+
;----------------------------------------------------------------------------- ; void all_angs_pred_4x4(pixel *dest, pixel *refPix, pixel *filtPix, int bLuma) ;----------------------------------------------------------------------------- _______________________________________________ x265-devel mailing list x265-devel@videolan.org https://mailman.videolan.org/listinfo/x265-devel