4% to 35% faster depending on the width.
---
libavcodec/x86/huffyuvencdsp.asm | 31 ++++++++++++++++++++-----------
libavcodec/x86/huffyuvencdsp_mmx.c | 8 +++++++-
2 files changed, 27 insertions(+), 12 deletions(-)
diff --git a/libavcodec/x86/huffyuvencdsp.asm b/libavcodec/x86/huffyuvencdsp.asm
index 97de7e9..9625fbe 100644
--- a/libavcodec/x86/huffyuvencdsp.asm
+++ b/libavcodec/x86/huffyuvencdsp.asm
@@ -27,27 +27,27 @@
section .text
-INIT_MMX mmx
; void ff_diff_bytes_mmx(uint8_t *dst, const uint8_t *src1, const uint8_t *src2,
;                        intptr_t w);
-cglobal diff_bytes, 4,6,0, dst, src1, src2, w, i
+%macro DIFF_BYTES 0
+cglobal diff_bytes, 4,6,2, dst, src1, src2, w, i
xor iq, iq
-cmp wq, 16
+cmp wq, mmsize * 2
jb .loop2
-sub wq, 15
+sub wq, mmsize * 2 - 1
.loop:
-mova m0, [src2q + iq]
-mova m1, [src1q + iq]
+movu m0, [src2q + iq]
+movu m1, [src1q + iq]
psubb m1, m0
mova [iq + dstq], m1
-mova m0, [src2q + iq + 8]
-mova m1, [src1q + iq + 8]
+movu m0, [src2q + iq + mmsize]
+movu m1, [src1q + iq + mmsize]
psubb m1, m0
-mova [8 + iq + dstq], m1
-add iq, 16
+mova [mmsize + iq + dstq], m1
+add iq, mmsize * 2
cmp iq, wq
jb .loop
-add wq, 15
+add wq, mmsize * 2 - 1
.loop2:
mov r6b, byte [src1q + iq]
sub r6b, byte [src2q + iq]
@@ -56,3 +56,12 @@ cglobal diff_bytes, 4,6,0, dst, src1, src2, w, i
cmp iq, wq
jb .loop2
REP_RET
+%endmacro
+
+%if ARCH_X86_32
+INIT_MMX mmx
+DIFF_BYTES
+%endif
+
+INIT_XMM sse2
+DIFF_BYTES
diff --git a/libavcodec/x86/huffyuvencdsp_mmx.c b/libavcodec/x86/huffyuvencdsp_mmx.c
index c5f81c8..9af5305 100644
--- a/libavcodec/x86/huffyuvencdsp_mmx.c
+++ b/libavcodec/x86/huffyuvencdsp_mmx.c
@@ -31,6 +31,8 @@
void ff_diff_bytes_mmx(uint8_t *dst, const uint8_t *src1, const uint8_t *src2,
intptr_t w);
+void ff_diff_bytes_sse2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2,
+intptr_t w);
#if HAVE_INLINE_ASM
@@ -80,11 +82,15 @@ av_cold void ff_huffyuvencdsp_init_x86(HuffYUVEncDSPContext *c)
{
av_unused int cpu_flags = av_get_cpu_flags();
-if (EXTERNAL_MMX(cpu_flags)) {
+if (ARCH_X86_32 && EXTERNAL_MMX(cpu_flags)) {
c->diff_bytes = ff_diff_bytes_mmx;
}
if (INLINE_MMXEXT(cpu_flags)) {
c->sub_hfyu_median_pred = sub_hfyu_median_pred_mmxext;
}
+
+if (EXTERNAL_SSE2(cpu_flags)) {
+c->diff_bytes = ff_diff_bytes_sse2;
+}
}
--
1.9.1
___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
http://ffmpeg.org/mailman/listinfo/ffmpeg-devel