The new SSE2 version of diff_bytes is 4% to 35% faster, depending on the width. --- libavcodec/x86/huffyuvencdsp.asm | 31 ++++++++++++++++++++----------- libavcodec/x86/huffyuvencdsp_mmx.c | 8 +++++++- 2 files changed, 27 insertions(+), 12 deletions(-)
diff --git a/libavcodec/x86/huffyuvencdsp.asm b/libavcodec/x86/huffyuvencdsp.asm index 97de7e9..9625fbe 100644 --- a/libavcodec/x86/huffyuvencdsp.asm +++ b/libavcodec/x86/huffyuvencdsp.asm @@ -27,27 +27,27 @@ section .text -INIT_MMX mmx ; void ff_diff_bytes_mmx(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, ; intptr_t w); -cglobal diff_bytes, 4,6,0, dst, src1, src2, w, i +%macro DIFF_BYTES 0 +cglobal diff_bytes, 4,6,2, dst, src1, src2, w, i xor iq, iq - cmp wq, 16 + cmp wq, mmsize * 2 jb .loop2 - sub wq, 15 + sub wq, mmsize * 2 - 1 .loop: - mova m0, [src2q + iq] - mova m1, [src1q + iq] + movu m0, [src2q + iq] + movu m1, [src1q + iq] psubb m1, m0 mova [iq + dstq], m1 - mova m0, [src2q + iq + 8] - mova m1, [src1q + iq + 8] + movu m0, [src2q + iq + mmsize] + movu m1, [src1q + iq + mmsize] psubb m1, m0 - mova [8 + iq + dstq], m1 - add iq, 16 + mova [mmsize + iq + dstq], m1 + add iq, mmsize * 2 cmp iq, wq jb .loop - add wq, 15 + add wq, mmsize * 2 - 1 .loop2: mov r6b, byte [src1q + iq] sub r6b, byte [src2q + iq] @@ -56,3 +56,12 @@ cglobal diff_bytes, 4,6,0, dst, src1, src2, w, i cmp iq, wq jb .loop2 REP_RET +%endmacro + +%if ARCH_X86_32 +INIT_MMX mmx +DIFF_BYTES +%endif + +INIT_XMM sse2 +DIFF_BYTES diff --git a/libavcodec/x86/huffyuvencdsp_mmx.c b/libavcodec/x86/huffyuvencdsp_mmx.c index c5f81c8..9af5305 100644 --- a/libavcodec/x86/huffyuvencdsp_mmx.c +++ b/libavcodec/x86/huffyuvencdsp_mmx.c @@ -31,6 +31,8 @@ void ff_diff_bytes_mmx(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, intptr_t w); +void ff_diff_bytes_sse2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, + intptr_t w); #if HAVE_INLINE_ASM @@ -80,11 +82,15 @@ av_cold void ff_huffyuvencdsp_init_x86(HuffYUVEncDSPContext *c) { av_unused int cpu_flags = av_get_cpu_flags(); - if (EXTERNAL_MMX(cpu_flags)) { + if (ARCH_X86_32 && EXTERNAL_MMX(cpu_flags)) { c->diff_bytes = ff_diff_bytes_mmx; } if (INLINE_MMXEXT(cpu_flags)) { c->sub_hfyu_median_pred = sub_hfyu_median_pred_mmxext; } + + if 
(EXTERNAL_SSE2(cpu_flags)) { + c->diff_bytes = ff_diff_bytes_sse2; + } } -- 1.9.1 _______________________________________________ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org http://ffmpeg.org/mailman/listinfo/ffmpeg-devel