Re: [FFmpeg-devel] [PATCH 3/4] huffyuvencdsp: Add ff_diff_bytes_sse2

2015-10-19 Thread James Almer
On 10/19/2015 5:00 PM, Timothy Gu wrote:
> 4% to 35% faster depending on the width.
> ---
>  libavcodec/x86/huffyuvencdsp.asm   | 31 ---
>  libavcodec/x86/huffyuvencdsp_mmx.c |  8 +++-
>  2 files changed, 27 insertions(+), 12 deletions(-)
> 
> diff --git a/libavcodec/x86/huffyuvencdsp.asm 
> b/libavcodec/x86/huffyuvencdsp.asm
> index 97de7e9..9625fbe 100644
> --- a/libavcodec/x86/huffyuvencdsp.asm
> +++ b/libavcodec/x86/huffyuvencdsp.asm
> @@ -27,27 +27,27 @@
>  
>  section .text
>  
> -INIT_MMX mmx
>  ; void ff_diff_bytes_mmx(uint8_t *dst, const uint8_t *src1, const uint8_t 
> *src2,
>  ;intptr_t w);
> -cglobal diff_bytes, 4,6,0, dst, src1, src2, w, i
> +%macro DIFF_BYTES 0
> +cglobal diff_bytes, 4,6,2, dst, src1, src2, w, i
>  xor   iq, iq
> -cmp   wq, 16
> +cmp   wq, mmsize * 2
>  jb .loop2
> -sub   wq, 15
> +sub   wq, mmsize * 2 - 1
>  .loop:
> -mova  m0, [src2q + iq]
> -mova  m1, [src1q + iq]
> +movu  m0, [src2q + iq]
> +movu  m1, [src1q + iq]

If dst and/or src can sometimes be aligned, check how ff_add_hfyu_left_pred
(in huffyuvdsp.asm) handles the aligned and unaligned cases.

___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
http://ffmpeg.org/mailman/listinfo/ffmpeg-devel


[FFmpeg-devel] [PATCH 3/4] huffyuvencdsp: Add ff_diff_bytes_sse2

2015-10-19 Thread Timothy Gu
4% to 35% faster depending on the width.
---
 libavcodec/x86/huffyuvencdsp.asm   | 31 ---
 libavcodec/x86/huffyuvencdsp_mmx.c |  8 +++-
 2 files changed, 27 insertions(+), 12 deletions(-)

diff --git a/libavcodec/x86/huffyuvencdsp.asm b/libavcodec/x86/huffyuvencdsp.asm
index 97de7e9..9625fbe 100644
--- a/libavcodec/x86/huffyuvencdsp.asm
+++ b/libavcodec/x86/huffyuvencdsp.asm
@@ -27,27 +27,27 @@
 
 section .text
 
-INIT_MMX mmx
 ; void ff_diff_bytes_mmx(uint8_t *dst, const uint8_t *src1, const uint8_t 
*src2,
 ;intptr_t w);
-cglobal diff_bytes, 4,6,0, dst, src1, src2, w, i
+%macro DIFF_BYTES 0
+cglobal diff_bytes, 4,6,2, dst, src1, src2, w, i
 xor   iq, iq
-cmp   wq, 16
+cmp   wq, mmsize * 2
 jb .loop2
-sub   wq, 15
+sub   wq, mmsize * 2 - 1
 .loop:
-mova  m0, [src2q + iq]
-mova  m1, [src1q + iq]
+movu  m0, [src2q + iq]
+movu  m1, [src1q + iq]
 psubb m1, m0
 mova [iq + dstq], m1
-mova  m0, [src2q + iq + 8]
-mova  m1, [src1q + iq + 8]
+movu  m0, [src2q + iq + mmsize]
+movu  m1, [src1q + iq + mmsize]
 psubb m1, m0
-mova [8 + iq + dstq], m1
-add   iq, 16
+mova [mmsize + iq + dstq], m1
+add   iq, mmsize * 2
 cmp   iq, wq
 jb .loop
-add   wq, 15
+add   wq, mmsize * 2 - 1
 .loop2:
 mov  r6b, byte [src1q + iq]
 sub  r6b, byte [src2q + iq]
@@ -56,3 +56,12 @@ cglobal diff_bytes, 4,6,0, dst, src1, src2, w, i
 cmp   iq, wq
 jb .loop2
 REP_RET
+%endmacro
+
+%if ARCH_X86_32
+INIT_MMX mmx
+DIFF_BYTES
+%endif
+
+INIT_XMM sse2
+DIFF_BYTES
diff --git a/libavcodec/x86/huffyuvencdsp_mmx.c 
b/libavcodec/x86/huffyuvencdsp_mmx.c
index c5f81c8..9af5305 100644
--- a/libavcodec/x86/huffyuvencdsp_mmx.c
+++ b/libavcodec/x86/huffyuvencdsp_mmx.c
@@ -31,6 +31,8 @@
 
 void ff_diff_bytes_mmx(uint8_t *dst, const uint8_t *src1, const uint8_t *src2,
intptr_t w);
+void ff_diff_bytes_sse2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2,
+intptr_t w);
 
 #if HAVE_INLINE_ASM
 
@@ -80,11 +82,15 @@ av_cold void ff_huffyuvencdsp_init_x86(HuffYUVEncDSPContext 
*c)
 {
 av_unused int cpu_flags = av_get_cpu_flags();
 
-if (EXTERNAL_MMX(cpu_flags)) {
+if (ARCH_X86_32 && EXTERNAL_MMX(cpu_flags)) {
 c->diff_bytes = ff_diff_bytes_mmx;
 }
 
 if (INLINE_MMXEXT(cpu_flags)) {
 c->sub_hfyu_median_pred = sub_hfyu_median_pred_mmxext;
 }
+
+if (EXTERNAL_SSE2(cpu_flags)) {
+c->diff_bytes = ff_diff_bytes_sse2;
+}
 }
-- 
1.9.1

___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
http://ffmpeg.org/mailman/listinfo/ffmpeg-devel