# HG changeset patch # User Praveen Tiwari <prav...@multicorewareinc.com> # Date 1510574887 28800 # Mon Nov 13 04:08:07 2017 -0800 # Node ID a7ce91c5db95ac0eb3f58b5c993ace3bfe0bbe2f # Parent 4b01781203a4e7a08cee94346f52a24ac78a3478 x86: dct8 PASS1 optimize for shuffle instructions
diff -r 4b01781203a4 -r a7ce91c5db95 source/common/x86/dct8.asm --- a/source/common/x86/dct8.asm Sun Nov 12 23:17:58 2017 -0800 +++ b/source/common/x86/dct8.asm Mon Nov 13 04:08:07 2017 -0800 @@ -34,6 +34,7 @@ dct8_shuf6_AVX512: dq 0, 2, 4, 6, 1, 3, 5, 7 dct8_shuf8_AVX512: dd 0, 2, 8, 10, 4, 6, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15 dct8_shuf4_AVX512: times 2 dd 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15 +dct8_shuf7_AVX512: dw 0, 2, 16, 18, 8, 10, 24, 26, 4, 6, 20, 22, 12, 14, 28, 30 dct8_shuf: times 2 db 6, 7, 4, 5, 2, 3, 0, 1, 14, 15, 12, 13, 10, 11, 8, 9 dct8_shuf_AVX512: times 2 db 4, 5, 6, 7, 0, 1, 2, 3, 12, 13, 14, 15, 8, 9, 10, 11 @@ -2306,20 +2307,19 @@ %macro DCT8_AVX512_PASS_1 4 vpmaddwd m%2, m3, m%1 - vpshufb m8, m%2, m6 + vpsrlq m8, m%2, 32 vpaddd m%2, m8 - vpermd m%2, m17, m%2 - - vpmaddwd m%4, m2, m%3 - vpshufb m8, m%4, m6 - vpaddd m%4, m8 - vpermd m%4, m17, m%4 - - vinserti64x4 m%2, m%2, ym%4, 1 vpaddd m%2, m5 vpsrad m%2, DCT8_SHIFT1 - vpackssdw m%2, m%2 - vpermq m%2, m19, m%2 + + vpmaddwd m%4, m2, m%3 + vpsrlq m8, m%4, 32 + vpaddd m%4, m8 + vpaddd m%4, m5 + vpsrad m%4, DCT8_SHIFT1 + + vpackssdw m%2, m%4 + vpermw m%2, m1, m%2 %endmacro %macro DCT8_AVX512_PASS_2 4 @@ -2423,6 +2423,8 @@ vpaddw m3, m2, m0 vpsubw m2, m0 + vbroadcasti32x8 m1, [dct8_shuf7_AVX512] + ; Load all the coefficients togather for better caching vpbroadcastq m20, [r6 + 0 * 8] vpbroadcastq m21, [r6 + 1 * 8] _______________________________________________ x265-devel mailing list x265-devel@videolan.org https://mailman.videolan.org/listinfo/x265-devel