At 2015-02-02 16:55:16, [email protected] wrote: ># HG changeset patch ># User Praveen Tiwari ># Date 1422867249 -19800 ># Branch stable ># Node ID 2618352a21d5917ee8c1f79bcc159e858dd19daa ># Parent e2c958ff874e2bf8992ba22605e993530e8a2d8c >blockfill_s_8x8 sse2 asm code optimization > >improved, 100.04c -> 90.05c > >diff -r e2c958ff874e -r 2618352a21d5 source/common/x86/blockcopy8.asm >--- a/source/common/x86/blockcopy8.asm Sat Jan 31 13:48:34 2015 -0600 >+++ b/source/common/x86/blockcopy8.asm Mon Feb 02 14:24:09 2015 +0530 >@@ -1748,9 +1748,10 @@ > ; void blockfill_s_8x8(int16_t* dst, intptr_t dstride, int16_t val) > ;----------------------------------------------------------------------------- > INIT_XMM sse2 >-cglobal blockfill_s_8x8, 3, 3, 1, dst, dstStride, val >+cglobal blockfill_s_8x8, 3, 4, 1, dst, dstStride, val > > add r1, r1 >+lea r3, [3 * r1] > > movd m0, r2d > pshuflw m0, m0, 0 >@@ -1760,17 +1761,13 @@ > movu [r0 + r1], m0 > movu [r0 + 2 * r1], m0 > >-lea r0, [r0 + 2 * r1] >+movu [r0 + r3], m0 >+movu [r0 + 4 * r1], m0 >+ >+lea r0, [r0 + 4 * r1] Swap this LEA with the movu above it; you will get fewer bytes of binary code. > movu [r0 + r1], m0 > movu [r0 + 2 * r1], m0 >- >-lea r0, [r0 + 2 * r1] >-movu [r0 + r1], m0 >-movu [r0 + 2 * r1], m0 >- >-lea r0, [r0 + 2 * r1] >-movu [r0 + r1], m0 >- >+movu [r0 + r3], m0 > RET > > ;----------------------------------------------------------------------------- >_______________________________________________ >x265-devel mailing list >[email protected] >https://mailman.videolan.org/listinfo/x265-devel
_______________________________________________ x265-devel mailing list [email protected] https://mailman.videolan.org/listinfo/x265-devel
