[libav-devel] [PATCH] x86: Port gradfun to yasm

2013-10-21 Thread Daniel Kang
---
 libavfilter/x86/Makefile|1 +
 libavfilter/x86/vf_gradfun.c|  168 +--
 libavfilter/x86/vf_gradfun_yasm.asm |  144 ++
 3 files changed, 165 insertions(+), 148 deletions(-)
 create mode 100644 libavfilter/x86/vf_gradfun_yasm.asm

diff --git a/libavfilter/x86/Makefile b/libavfilter/x86/Makefile
index 59cefe8..b50b373 100644
--- a/libavfilter/x86/Makefile
+++ b/libavfilter/x86/Makefile
@@ -3,6 +3,7 @@ OBJS-$(CONFIG_HQDN3D_FILTER) += 
x86/vf_hqdn3d_init.o
 OBJS-$(CONFIG_VOLUME_FILTER) += x86/af_volume_init.o
 OBJS-$(CONFIG_YADIF_FILTER)  += x86/vf_yadif_init.o
 
+YASM-OBJS-$(CONFIG_GRADFUN_FILTER)   += x86/vf_gradfun_yasm.o
 YASM-OBJS-$(CONFIG_HQDN3D_FILTER)+= x86/vf_hqdn3d.o
 YASM-OBJS-$(CONFIG_VOLUME_FILTER)+= x86/af_volume.o
 YASM-OBJS-$(CONFIG_YADIF_FILTER) += x86/vf_yadif.o
diff --git a/libavfilter/x86/vf_gradfun.c b/libavfilter/x86/vf_gradfun.c
index e571af7..3bac16d 100644
--- a/libavfilter/x86/vf_gradfun.c
+++ b/libavfilter/x86/vf_gradfun.c
@@ -24,12 +24,10 @@
 #include "libavutil/x86/asm.h"
 #include "libavfilter/gradfun.h"
 
-#if HAVE_INLINE_ASM
-
-DECLARE_ALIGNED(16, static const uint16_t, pw_7f)[8] = 
{0x7F,0x7F,0x7F,0x7F,0x7F,0x7F,0x7F,0x7F};
-DECLARE_ALIGNED(16, static const uint16_t, pw_ff)[8] = 
{0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF};
-
-#if HAVE_MMXEXT_INLINE
+#if HAVE_YASM
+void ff_gradfun_filter_line_mmxext(intptr_t x, uint8_t *dst, uint8_t *src,
+   uint16_t *dc, int thresh,
+   const uint16_t *dithers);
 static void gradfun_filter_line_mmxext(uint8_t *dst, uint8_t *src, uint16_t 
*dc,
int width, int thresh,
const uint16_t *dithers)
@@ -41,72 +39,13 @@ static void gradfun_filter_line_mmxext(uint8_t *dst, 
uint8_t *src, uint16_t *dc,
 width = x;
 }
 x = -width;
-__asm__ volatile(
-movd  %4, %%mm5 \n
-pxor   %%mm7, %%mm7 \n
-pshufw $0, %%mm5, %%mm5 \n
-movq  %6, %%mm6 \n
-movq  (%5), %%mm3 \n
-movq 8(%5), %%mm4 \n
-
-1: \n
-movd (%2,%0), %%mm0 \n
-movd (%3,%0), %%mm1 \n
-punpcklbw  %%mm7, %%mm0 \n
-punpcklwd  %%mm1, %%mm1 \n
-psllw $7, %%mm0 \n
-pxor   %%mm2, %%mm2 \n
-psubw  %%mm0, %%mm1 \n // delta = dc - pix
-psubw  %%mm1, %%mm2 \n
-pmaxsw %%mm1, %%mm2 \n
-pmulhuw %%mm5, %%mm2 \n // m = abs(delta) * thresh >> 16
-psubw  %%mm6, %%mm2 \n
-pminsw %%mm7, %%mm2 \n // m = -max(0, 127-m)
-pmullw %%mm2, %%mm2 \n
-paddw  %%mm3, %%mm0 \n // pix += dither
-psllw $2, %%mm1 \n // m = m*m*delta >> 14
-pmulhw %%mm2, %%mm1 \n
-paddw  %%mm1, %%mm0 \n // pix += m
-psraw $7, %%mm0 \n
-packuswb   %%mm0, %%mm0 \n
-movd   %%mm0, (%1,%0) \n // dst = clip(pix>>7)
-add   $4, %0 \n
-jnl 2f \n
-
-movd (%2,%0), %%mm0 \n
-movd (%3,%0), %%mm1 \n
-punpcklbw  %%mm7, %%mm0 \n
-punpcklwd  %%mm1, %%mm1 \n
-psllw $7, %%mm0 \n
-pxor   %%mm2, %%mm2 \n
-psubw  %%mm0, %%mm1 \n // delta = dc - pix
-psubw  %%mm1, %%mm2 \n
-pmaxsw %%mm1, %%mm2 \n
-pmulhuw %%mm5, %%mm2 \n // m = abs(delta) * thresh >> 16
-psubw  %%mm6, %%mm2 \n
-pminsw %%mm7, %%mm2 \n // m = -max(0, 127-m)
-pmullw %%mm2, %%mm2 \n
-paddw  %%mm4, %%mm0 \n // pix += dither
-psllw $2, %%mm1 \n // m = m*m*delta >> 14
-pmulhw %%mm2, %%mm1 \n
-paddw  %%mm1, %%mm0 \n // pix += m
-psraw $7, %%mm0 \n
-packuswb   %%mm0, %%mm0 \n
-movd   %%mm0, (%1,%0) \n // dst = clip(pix>>7)
-add   $4, %0 \n
-jl 1b \n
-
-2: \n
-emms \n
-:+r(x)
-:r(dst+width), r(src+width), r(dc+width/2),
- rm(thresh), r(dithers), m(*pw_7f)
-:memory
-);
+ff_gradfun_filter_line_mmxext(x, dst+width, src+width, dc+width/2,
+  thresh, dithers);
 }
-#endif
 
-#if HAVE_SSSE3_INLINE
+void ff_gradfun_filter_line_ssse3(intptr_t x, uint8_t *dst, uint8_t *src,
+  uint16_t *dc, int thresh,
+  const uint16_t *dithers);
 static void gradfun_filter_line_ssse3(uint8_t *dst, uint8_t *src, uint16_t 
*dc, int width, int thresh, const uint16_t *dithers)
 {
 intptr_t x;
@@ -117,100 +56,33 @@ static void gradfun_filter_line_ssse3(uint8_t *dst, 
uint8_t *src, uint16_t *dc,
 width = x;
 }
 x = -width;
-__asm__ volatile(
-

[libav-devel] [PATCH] x86: Port gradfun to yasm

2013-10-21 Thread Daniel Kang
---
Fixed all of Diego's comments
---
 libavfilter/x86/Makefile  |3 +-
 libavfilter/x86/vf_gradfun.asm|  144 
 libavfilter/x86/vf_gradfun.c  |  217 -
 libavfilter/x86/vf_gradfun_init.c |   93 
 4 files changed, 239 insertions(+), 218 deletions(-)
 create mode 100644 libavfilter/x86/vf_gradfun.asm
 delete mode 100644 libavfilter/x86/vf_gradfun.c
 create mode 100644 libavfilter/x86/vf_gradfun_init.c

diff --git a/libavfilter/x86/Makefile b/libavfilter/x86/Makefile
index 59cefe8..16b1307 100644
--- a/libavfilter/x86/Makefile
+++ b/libavfilter/x86/Makefile
@@ -1,8 +1,9 @@
-OBJS-$(CONFIG_GRADFUN_FILTER)+= x86/vf_gradfun.o
+OBJS-$(CONFIG_GRADFUN_FILTER)+= x86/vf_gradfun_init.o
 OBJS-$(CONFIG_HQDN3D_FILTER) += x86/vf_hqdn3d_init.o
 OBJS-$(CONFIG_VOLUME_FILTER) += x86/af_volume_init.o
 OBJS-$(CONFIG_YADIF_FILTER)  += x86/vf_yadif_init.o
 
+YASM-OBJS-$(CONFIG_GRADFUN_FILTER)   += x86/vf_gradfun.o
 YASM-OBJS-$(CONFIG_HQDN3D_FILTER)+= x86/vf_hqdn3d.o
 YASM-OBJS-$(CONFIG_VOLUME_FILTER)+= x86/af_volume.o
 YASM-OBJS-$(CONFIG_YADIF_FILTER) += x86/vf_yadif.o
diff --git a/libavfilter/x86/vf_gradfun.asm b/libavfilter/x86/vf_gradfun.asm
new file mode 100644
index 000..e1737dd
--- /dev/null
+++ b/libavfilter/x86/vf_gradfun.asm
@@ -0,0 +1,144 @@
+;*
+;* x86-optimized functions for gradfun filter
+;*
+;* This file is part of Libav.
+;*
+;* Libav is free software; you can redistribute it and/or modify
+;* it under the terms of the GNU General Public License as published by
+;* the Free Software Foundation; either version 2 of the License, or
+;* (at your option) any later version.
+;*
+;* Libav is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+;* GNU General Public License for more details.
+;*
+;* You should have received a copy of the GNU General Public License along
+;* with Libav; if not, write to the Free Software Foundation, Inc.,
+;* 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+;**
+
+%include "libavutil/x86/x86util.asm"
+
+SECTION_RODATA
+
+pw_7f: times 8 dw 0x7F
+pw_ff: times 8 dw 0xFF
+
+SECTION .text
+
+INIT_MMX mmxext
+cglobal gradfun_filter_line, 6, 6
+movh  m5, r4d
+pxor  m7, m7
+pshufw    m5, m5, 0
+mova  m6, [pw_7f]
+mova  m3, [r5]
+mova  m4, [r5+8]
+.loop:
+movh  m0, [r2+r0]
+movh  m1, [r3+r0]
+punpcklbw m0, m7
+punpcklwd m1, m1
+psllw m0, 7
+pxor  m2, m2
+psubw m1, m0
+psubw m2, m1
+pmaxsw    m2, m1
+pmulhuw   m2, m5
+psubw     m2, m6
+pminsw    m2, m7
+pmullw    m2, m2
+paddw     m0, m3
+psllw     m1, 2
+pmulhw    m1, m2
+paddw m0, m1
+psraw m0, 7
+packuswb  m0, m0
+movh [r1+r0], m0
+add   r0, 4
+jge .end
+movh  m0, [r2+r0]
+movh  m1, [r3+r0]
+punpcklbw m0, m7
+punpcklwd m1, m1
+psllw m0, 7
+pxor  m2, m2
+psubw m1, m0
+psubw m2, m1
+pmaxsw    m2, m1
+pmulhuw   m2, m5
+psubw     m2, m6
+pminsw    m2, m7
+pmullw    m2, m2
+paddw     m0, m4
+psllw     m1, 2
+pmulhw    m1, m2
+paddw m0, m1
+psraw m0, 7
+packuswb  m0, m0
+movh [r1+r0], m0
+add   r0, 4
+jl .loop
+.end:
+REP_RET
+
+INIT_XMM ssse3
+cglobal gradfun_filter_line, 6, 6, 8
+movd   m5, r4d
+pxor   m7, m7
+pshuflw    m5, m5, 0
+mova   m6, [pw_7f]
+punpcklqdq m5, m5
+mova   m4, [r5]
+.loop:
+movh   m0, [r2+r0]
+movh   m1, [r3+r0]
+punpcklbw  m0, m7
+punpcklwd  m1, m1
+psllw  m0, 7
+psubw  m1, m0
+pabsw  m2, m1
+pmulhuw    m2, m5
+psubw  m2, m6
+pminsw m2, m7
+pmullw m2, m2
+psllw  m1, 2
+paddw  m0, m4
+pmulhw m1, m2
+paddw  m0, m1
+psraw  m0, 7
+packuswb   m0, m0
+movh  [r1+r0], m0
+addr0, 8
+jl .loop
+REP_RET
+
+%macro BLUR_LINE 1
+cglobal gradfun_blur_line_%1, 6, 6, 8
+mova    m7, [pw_ff]
+.loop:
+%1  m0, [r4+r0]
+%1  m1, [r5+r0]
+mova    m2, m0
+mova    m3, m1
+psrlw   m0, 8
+psrlw   m1, 8
+pand    m2, m7
+pand    m3, m7
+paddw   m0, m1
+paddw   m2, m3
+paddw   m0, m2
+paddw   m0, [r2+r0]
+mova    m1, [r1+r0]
+mova   [r1+r0], m0
+psubw   m0, m1
+mova   [r3+r0], m0
+add r0, 16
+jl .loop
+REP_RET
+%endmacro
+
+INIT_XMM 

Re: [libav-devel] [PATCH] x86: Port gradfun to yasm

2013-10-21 Thread Daniel Kang
On Mon, Oct 21, 2013 at 9:50 AM, Diego Biurrun di...@biurrun.de wrote:

 On Mon, Oct 21, 2013 at 09:30:23AM -0400, Daniel Kang wrote:
  --- a/libavfilter/x86/Makefile
  +++ b/libavfilter/x86/Makefile
  @@ -3,6 +3,7 @@ OBJS-$(CONFIG_HQDN3D_FILTER) += 
  x86/vf_hqdn3d_init.o
   OBJS-$(CONFIG_VOLUME_FILTER) += x86/af_volume_init.o
   OBJS-$(CONFIG_YADIF_FILTER)  += x86/vf_yadif_init.o
 
  +YASM-OBJS-$(CONFIG_GRADFUN_FILTER)   += x86/vf_gradfun_yasm.o
   YASM-OBJS-$(CONFIG_HQDN3D_FILTER)+= x86/vf_hqdn3d.o
   YASM-OBJS-$(CONFIG_VOLUME_FILTER)+= x86/af_volume.o
   YASM-OBJS-$(CONFIG_YADIF_FILTER) += x86/vf_yadif.o

 The file with the init code should be renamed to vf_gradfun_init.c instead
 and vf_gradfun.asm should contain the assembly.

  --- a/libavfilter/x86/vf_gradfun.c
  +++ b/libavfilter/x86/vf_gradfun.c
  @@ -41,72 +39,13 @@ static void gradfun_filter_line_mmxext(uint8_t *dst, 
  uint8_t *src, uint16_t *dc,
   width = x;
   }
   x = -width;
  +ff_gradfun_filter_line_mmxext(x, dst+width, src+width, dc+width/2,
  +  thresh, dithers);

 spaces around operators

  @@ -117,100 +56,33 @@ static void gradfun_filter_line_ssse3(uint8_t *dst, 
  uint8_t *src, uint16_t *dc,
   width = x;
   }
   x = -width;
  +ff_gradfun_filter_line_ssse3(x, dst+width, src+width, dc+width/2,
  + thresh, dithers);

 same

   av_cold void ff_gradfun_init_x86(GradFunContext *gf)
   {
  -#if HAVE_MMXEXT_INLINE
  +#if HAVE_YASM
   int cpu_flags = av_get_cpu_flags();
  -
    if (cpu_flags & AV_CPU_FLAG_MMXEXT)

 Keep the empty line.

    gf->filter_line = gradfun_filter_line_mmxext;
  -#endif
  -#if HAVE_SSSE3_INLINE
    if (cpu_flags & AV_CPU_FLAG_SSSE3)
    gf->filter_line = gradfun_filter_line_ssse3;
  -#endif
  -#if HAVE_SSE2_INLINE
    if (cpu_flags & AV_CPU_FLAG_SSE2)
    gf->blur_line = gradfun_blur_line_sse2;
   #endif

 Please comment the #endif.

 Look at vf_yadif_init.c to see how to replace the ifdefs with the
 right macros.

All done.
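
For reference, the vf_yadif_init.c pattern boils down to roughly this (a
sketch only, using the EXTERNAL_* helpers from libavutil/x86/cpu.h; the
gradfun_filter_line_*/gradfun_blur_line_sse2 wrappers are the static
functions from the init file, and the exact pushed code may differ):

#include "libavutil/attributes.h"
#include "libavutil/cpu.h"
#include "libavutil/x86/cpu.h"
#include "libavfilter/gradfun.h"

av_cold void ff_gradfun_init_x86(GradFunContext *gf)
{
    int cpu_flags = av_get_cpu_flags();

    /* EXTERNAL_* already folds in the yasm availability check,
     * so no explicit #if HAVE_YASM block is needed around it. */
    if (EXTERNAL_MMXEXT(cpu_flags))
        gf->filter_line = gradfun_filter_line_mmxext;
    if (EXTERNAL_SSSE3(cpu_flags))
        gf->filter_line = gradfun_filter_line_ssse3;
    if (EXTERNAL_SSE2(cpu_flags))
        gf->blur_line = gradfun_blur_line_sse2;
}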
___
libav-devel mailing list
libav-devel@libav.org
https://lists.libav.org/mailman/listinfo/libav-devel


[libav-devel] [PATCH] x86: Port gradfun to yasm

2013-10-21 Thread Daniel Kang
---
Fix licence to LGPL, add newline for readability.

Loren's update.
---
 libavfilter/x86/Makefile  |3 +-
 libavfilter/x86/vf_gradfun.asm|  110 +++
 libavfilter/x86/vf_gradfun.c  |  217 -
 libavfilter/x86/vf_gradfun_init.c |   94 
 4 files changed, 206 insertions(+), 218 deletions(-)
 create mode 100644 libavfilter/x86/vf_gradfun.asm
 delete mode 100644 libavfilter/x86/vf_gradfun.c
 create mode 100644 libavfilter/x86/vf_gradfun_init.c

diff --git a/libavfilter/x86/Makefile b/libavfilter/x86/Makefile
index 59cefe8..16b1307 100644
--- a/libavfilter/x86/Makefile
+++ b/libavfilter/x86/Makefile
@@ -1,8 +1,9 @@
-OBJS-$(CONFIG_GRADFUN_FILTER)+= x86/vf_gradfun.o
+OBJS-$(CONFIG_GRADFUN_FILTER)+= x86/vf_gradfun_init.o
 OBJS-$(CONFIG_HQDN3D_FILTER) += x86/vf_hqdn3d_init.o
 OBJS-$(CONFIG_VOLUME_FILTER) += x86/af_volume_init.o
 OBJS-$(CONFIG_YADIF_FILTER)  += x86/vf_yadif_init.o
 
+YASM-OBJS-$(CONFIG_GRADFUN_FILTER)   += x86/vf_gradfun.o
 YASM-OBJS-$(CONFIG_HQDN3D_FILTER)+= x86/vf_hqdn3d.o
 YASM-OBJS-$(CONFIG_VOLUME_FILTER)+= x86/af_volume.o
 YASM-OBJS-$(CONFIG_YADIF_FILTER) += x86/vf_yadif.o
diff --git a/libavfilter/x86/vf_gradfun.asm b/libavfilter/x86/vf_gradfun.asm
new file mode 100644
index 000..00fcb16
--- /dev/null
+++ b/libavfilter/x86/vf_gradfun.asm
@@ -0,0 +1,110 @@
+;**
+;* x86-optimized functions for gradfun filter
+;*
+;* This file is part of Libav.
+;*
+;* Libav is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* Libav is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with Libav; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;**
+
+%include "libavutil/x86/x86util.asm"
+
+SECTION_RODATA
+
+pw_7f: times 8 dw 0x7F
+pw_ff: times 8 dw 0xFF
+
+SECTION .text
+
+%macro FILTER_LINE 1
+movh   m0, [r2+r0]
+movh   m1, [r3+r0]
+punpcklbw  m0, m7
+punpcklwd  m1, m1
+psllw  m0, 7
+psubw  m1, m0
+PABSW  m2, m1
+pmulhuw    m2, m5
+psubw  m2, m6
+pminsw m2, m7
+pmullw m2, m2
+psllw  m1, 2
+paddw  m0, %1
+pmulhw m1, m2
+paddw  m0, m1
+psraw  m0, 7
+packuswb   m0, m0
+movh  [r1+r0], m0
+%endmacro
+
+INIT_MMX mmxext
+cglobal gradfun_filter_line, 6, 6
+movh  m5, r4d
+pxor  m7, m7
+pshufw    m5, m5, 0
+mova  m6, [pw_7f]
+mova  m3, [r5]
+mova  m4, [r5+8]
+.loop:
+FILTER_LINE m3
+add   r0, 4
+jge .end
+FILTER_LINE m4
+add   r0, 4
+jl .loop
+.end:
+REP_RET
+
+INIT_XMM ssse3
+cglobal gradfun_filter_line, 6, 6, 8
+movd   m5, r4d
+pxor   m7, m7
+pshuflw    m5, m5, 0
+mova   m6, [pw_7f]
+punpcklqdq m5, m5
+mova   m4, [r5]
+.loop:
+FILTER_LINE m4
+addr0, 8
+jl .loop
+REP_RET
+
+%macro BLUR_LINE 1
+cglobal gradfun_blur_line_%1, 6, 6, 8
+mova    m7, [pw_ff]
+.loop:
+%1  m0, [r4+r0]
+%1  m1, [r5+r0]
+mova    m2, m0
+mova    m3, m1
+psrlw   m0, 8
+psrlw   m1, 8
+pand    m2, m7
+pand    m3, m7
+paddw   m0, m1
+paddw   m2, m3
+paddw   m0, m2
+paddw   m0, [r2+r0]
+mova    m1, [r1+r0]
+mova   [r1+r0], m0
+psubw   m0, m1
+mova   [r3+r0], m0
+add r0, 16
+jl .loop
+REP_RET
+%endmacro
+
+INIT_XMM sse2
+BLUR_LINE movdqa
+BLUR_LINE movdqu
diff --git a/libavfilter/x86/vf_gradfun.c b/libavfilter/x86/vf_gradfun.c
deleted file mode 100644
index e571af7..000
--- a/libavfilter/x86/vf_gradfun.c
+++ /dev/null
@@ -1,217 +0,0 @@
-/*
- * Copyright (C) 2009 Loren Merritt lor...@u.washington.edu
- *
- * This file is part of Libav.
- *
- * Libav is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * Libav is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY 

[libav-devel] [PATCH] dsputil: x86: Fix compile error by conditionally compiling code.

2013-02-18 Thread Daniel Kang
Specifically related to the H263 encoder/decoder.
---
 libavcodec/x86/dsputil.asm |4 
 1 file changed, 4 insertions(+)

diff --git a/libavcodec/x86/dsputil.asm b/libavcodec/x86/dsputil.asm
index 7ea796d..d245300 100644
--- a/libavcodec/x86/dsputil.asm
+++ b/libavcodec/x86/dsputil.asm
@@ -727,6 +727,7 @@ BSWAP32_BUF
 INIT_MMX mmx
 ; void h263_v_loop_filter(uint8_t *src, int stride, int qscale)
 cglobal h263_v_loop_filter, 3,5
+%if CONFIG_H263_DECODER || CONFIG_H263_ENCODER
 movsxdifnidn r1, r1d
 movsxdifnidn r2, r2d
 
@@ -745,6 +746,7 @@ cglobal h263_v_loop_filter, 3,5
 mova   [r0], m4
 mova   [r4], m5
 mova[r0+r1], m6
+%endif
 RET
 
 %macro TRANSPOSE4X4 2
@@ -769,6 +771,7 @@ cglobal h263_v_loop_filter, 3,5
 ; void h263_h_loop_filter(uint8_t *src, int stride, int qscale)
 INIT_MMX mmx
 cglobal h263_h_loop_filter, 3,5,0,32
+%if CONFIG_H263_DECODER || CONFIG_H263_ENCODER
 movsxdifnidn r1, r1d
 movsxdifnidn r2, r2d
 
@@ -810,4 +813,5 @@ cglobal h263_h_loop_filter, 3,5,0,32
 movd  [r4+r1*2], m6
 punpckhdqm6, m6
 movd[r4+r3], m6
+%endif
 RET
-- 
1.7.10.4

___
libav-devel mailing list
libav-devel@libav.org
https://lists.libav.org/mailman/listinfo/libav-devel


Re: [libav-devel] [PATCH] dsputil: x86: Fix int -> ptrdiff_t

2013-02-18 Thread Daniel Kang
On Fri, Feb 15, 2013 at 3:13 PM, Daniel Kang daniel.d.k...@gmail.com wrote:
 ---
  libavcodec/x86/dsputil_mmx.c |2 +-
  1 file changed, 1 insertion(+), 1 deletion(-)

ping
___
libav-devel mailing list
libav-devel@libav.org
https://lists.libav.org/mailman/listinfo/libav-devel


Re: [libav-devel] [PATCH] dsputil: x86: Fix compile error by conditionally compiling code.

2013-02-18 Thread Daniel Kang
On Mon, Feb 18, 2013 at 9:21 AM, Diego Biurrun di...@biurrun.de wrote:
 On Mon, Feb 18, 2013 at 09:08:35AM -0500, Daniel Kang wrote:
 Specifically related to the H263 encoder/decoder.
 ---
  libavcodec/x86/dsputil.asm |4 
  1 file changed, 4 insertions(+)

 Note that it's linking, not a compilation failure.

 --- a/libavcodec/x86/dsputil.asm
 +++ b/libavcodec/x86/dsputil.asm
 @@ -727,6 +727,7 @@ BSWAP32_BUF
  INIT_MMX mmx
  ; void h263_v_loop_filter(uint8_t *src, int stride, int qscale)
  cglobal h263_v_loop_filter, 3,5
 +%if CONFIG_H263_DECODER || CONFIG_H263_ENCODER
  movsxdifnidn r1, r1d
  movsxdifnidn r2, r2d

 @@ -745,6 +746,7 @@ cglobal h263_v_loop_filter, 3,5
  mova   [r0], m4
  mova   [r4], m5
  mova[r0+r1], m6
 +%endif
  RET

  %macro TRANSPOSE4X4 2
 @@ -769,6 +771,7 @@ cglobal h263_v_loop_filter, 3,5
  ; void h263_h_loop_filter(uint8_t *src, int stride, int qscale)
  INIT_MMX mmx
  cglobal h263_h_loop_filter, 3,5,0,32
 +%if CONFIG_H263_DECODER || CONFIG_H263_ENCODER
  movsxdifnidn r1, r1d
  movsxdifnidn r2, r2d

 @@ -810,4 +813,5 @@ cglobal h263_h_loop_filter, 3,5,0,32
  movd  [r4+r1*2], m6
  punpckhdqm6, m6
  movd[r4+r3], m6
 +%endif
  RET

 Can't you move the H.263 code to a separate file?  That would be much
 cleaner than this ifdeffery.

A cleaner solution would be to just remove this from dsputil since
it's only called in h263 specific contexts, but that falls outside the
scope of my work.
___
libav-devel mailing list
libav-devel@libav.org
https://lists.libav.org/mailman/listinfo/libav-devel


Re: [libav-devel] [PATCH] hpeldsp: x86: Convert dsputil_rnd_template to yasm

2013-02-18 Thread Daniel Kang
On Fri, Feb 15, 2013 at 6:33 PM, Loren Merritt lor...@u.washington.edu wrote:
 On Fri, 15 Feb 2013, Daniel Kang wrote:

 +%macro PAVGBP_MMX 6
 +mova   %3, %1
 +mova   %6, %4
 +por%3, %2
 +por%6, %5
 +pxor   %2, %1
 +pxor   %5, %4
 +pand   %2, m6
 +pand   %5, m6
 +psrlq  %2, 1
 +psrlq  %5, 1
 +psubb  %3, %2
 +psubb  %6, %5
 +%endmacro
 +
 +%macro PAVGBP_NO_RND_MMX 6
 +mova %3, %1
 +mova %6, %4
 +pand %3, %2
 +pand %6, %5
 +pxor %2, %1
 +pxor %5, %4
 +pand %2, m6
 +pand %5, m6
 +psrlq%2, 1
 +psrlq%5, 1
 +paddb%3, %2
 +paddb%6, %5
 +%endmacro

 Does this need to be interleaved, not just two calls to PAVGB_OP_MMX?

No, fixed.

 +; put_pixels8_xy2(uint8_t *block, const uint8_t *pixels, int line_size, int 
 h)
 +%macro PUT_PIXELS8_XY2_MMX 0-1
 +cglobal put%1_pixels8_xy2, 4,5
 +pxor m7, m7
 +SET_RND(m6)
 +mova m0, [r1]
 +mova m4, [r1+1]
 +mova m1, m0
 +mova m5, m4
 +punpcklbwm0, m7
 +punpcklbwm4, m7
 +punpckhbwm1, m7
 +punpckhbwm5, m7
 +paddusw  m4, m0
 +paddusw  m5, m1
 +xor  r4, r4
 +add  r1, r2
 +.loop:
 +mova m0, [r1+r4]
 +mova m2, [r1+r4+1]
 +mova m1, m0
 +mova m3, m2
 +punpcklbwm0, m7
 +punpcklbwm2, m7
 +punpckhbwm1, m7
 +punpckhbwm3, m7
 +paddusw  m0, m2
 +paddusw  m1, m3
 +paddusw  m4, m6
 +paddusw  m5, m6
 +paddusw  m4, m0
 +paddusw  m5, m1
 +psrlwm4, 2
 +psrlwm5, 2
 +packuswb m4, m5
 +mova[r0+r4], m4
 +add  r4, r2
 +mova m2, [r1+r4]
 +mova m3, [r1+r4+1]
 +mova m3, m2
 +mova m5, m4
 +punpcklbwm2, m7
 +punpcklbwm4, m7
 +punpckhbwm3, m7
 +punpckhbwm5, m7
 +paddusw  m4, m2
 +paddusw  m5, m3
 +paddusw  m0, m6
 +paddusw  m1, m6
 +paddusw  m0, m4
 +paddusw  m1, m5
 +psrlwm0, 2
 +psrlwm1, 2
 +packuswb m0, m1
 +mova[r0+r4], m0
 +add  r4, r2
 +sub r3d, 2
 +jne .loop
 +RET
 +%endmacro

 Does this and similar functions really need to be unrolled? If so, use
 %rep.

Yes, due to the way this is written. I rep'd the one I could.
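
For reference, %rep just stamps the body out at assembly time, so an
unrolled loop does not need its second copy written by hand. A sketch with
an illustrative body (these lines are not taken from the patch):

%rep 2
    mova          m0, [r1+r4]
    mova    [r0+r4], m0
    add           r4, r2
%endrep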
___
libav-devel mailing list
libav-devel@libav.org
https://lists.libav.org/mailman/listinfo/libav-devel


[libav-devel] [PATCH] dsputil: x86: Fix linker error with h263 loop filter.

2013-02-18 Thread Daniel Kang
This was caused by referencing a conditionally compiled table. Now the
code is also compiled conditionally.
---
 libavcodec/x86/Makefile|2 +
 libavcodec/x86/dsputil.asm |  162 --
 libavcodec/x86/h263_lf.asm |  187 
 3 files changed, 189 insertions(+), 162 deletions(-)
 create mode 100644 libavcodec/x86/h263_lf.asm

diff --git a/libavcodec/x86/Makefile b/libavcodec/x86/Makefile
index c740573..24a96a5 100644
--- a/libavcodec/x86/Makefile
+++ b/libavcodec/x86/Makefile
@@ -44,6 +44,8 @@ YASM-OBJS-$(CONFIG_AC3DSP) += x86/ac3dsp.o
 YASM-OBJS-$(CONFIG_DCT)+= x86/dct32.o
 YASM-OBJS-$(CONFIG_ENCODERS)   += x86/dsputilenc.o
 YASM-OBJS-$(CONFIG_FFT)+= x86/fft.o
+YASM-OBJS-$(CONFIG_H263_ENCODER)   += x86/h263_lf.o
+YASM-OBJS-$(CONFIG_H263_DECODER)   += x86/h263_lf.o
 YASM-OBJS-$(CONFIG_H264CHROMA) += x86/h264_chromamc.o   \
   x86/h264_chromamc_10bit.o
 YASM-OBJS-$(CONFIG_H264DSP)+= x86/h264_deblock.o\
diff --git a/libavcodec/x86/dsputil.asm b/libavcodec/x86/dsputil.asm
index 7ea796d..4539e5c 100644
--- a/libavcodec/x86/dsputil.asm
+++ b/libavcodec/x86/dsputil.asm
@@ -22,8 +22,6 @@
 %include "libavutil/x86/x86util.asm"
 
 SECTION_RODATA
-cextern pb_FC
-cextern h263_loop_filter_strength
 pb_f: times 16 db 15
 pb_: times 8 db -1
 pb_7: times 8 db 7
@@ -651,163 +649,3 @@ BSWAP32_BUF
 INIT_XMM ssse3
 BSWAP32_BUF
 
-
-%macro H263_LOOP_FILTER 5
-pxor m7, m7
-mova m0, [%1]
-mova m1, [%1]
-mova m2, [%4]
-mova m3, [%4]
-punpcklbwm0, m7
-punpckhbwm1, m7
-punpcklbwm2, m7
-punpckhbwm3, m7
-psubwm0, m2
-psubwm1, m3
-mova m2, [%2]
-mova m3, [%2]
-mova m4, [%3]
-mova m5, [%3]
-punpcklbwm2, m7
-punpckhbwm3, m7
-punpcklbwm4, m7
-punpckhbwm5, m7
-psubwm4, m2
-psubwm5, m3
-psllwm4, 2
-psllwm5, 2
-paddwm4, m0
-paddwm5, m1
-pxor m6, m6
-pcmpgtw  m6, m4
-pcmpgtw  m7, m5
-pxor m4, m6
-pxor m5, m7
-psubwm4, m6
-psubwm5, m7
-psrlwm4, 3
-psrlwm5, 3
-packuswb m4, m5
-packsswb m6, m7
-pxor m7, m7
-movd m2, %5
-punpcklbwm2, m2
-punpcklbwm2, m2
-punpcklbwm2, m2
-psubusb  m2, m4
-mova m3, m2
-psubusb  m3, m4
-psubbm2, m3
-mova m3, [%2]
-mova m4, [%3]
-pxor m3, m6
-pxor m4, m6
-paddusb  m3, m2
-psubusb  m4, m2
-pxor m3, m6
-pxor m4, m6
-paddusb  m2, m2
-packsswb m0, m1
-pcmpgtb  m7, m0
-pxor m0, m7
-psubbm0, m7
-mova m1, m0
-psubusb  m0, m2
-psubbm1, m0
-pand m1, [pb_FC]
-psrlwm1, 2
-pxor m1, m7
-psubbm1, m7
-mova m5, [%1]
-mova m6, [%4]
-psubbm5, m1
-paddbm6, m1
-%endmacro
-
-INIT_MMX mmx
-; void h263_v_loop_filter(uint8_t *src, int stride, int qscale)
-cglobal h263_v_loop_filter, 3,5
-movsxdifnidn r1, r1d
-movsxdifnidn r2, r2d
-
-lea  r4, [h263_loop_filter_strength]
-movzx   r3d, BYTE [r4+r2]
-movsxr2, r3b
-shl  r2, 1
-
-mov  r3, r0
-sub  r3, r1
-mov  r4, r3
-sub  r4, r1
-H263_LOOP_FILTER r4, r3, r0, r0+r1, r2d
-
-mova   [r3], m3
-mova   [r0], m4
-mova   [r4], m5
-mova[r0+r1], m6
-RET
-
-%macro TRANSPOSE4X4 2
-movd  m0, [%1]
-movd  m1, [%1+r1]
-movd  m2, [%1+r1*2]
-movd  m3, [%1+r3]
-punpcklbw m0, m1
-punpcklbw m2, m3
-mova  m1, m0
-punpcklwd m0, m2
-punpckhwd m1, m2
-movd [%2+ 0], m0
-punpckhdq m0, m0
-movd [%2+ 8], m0
-movd [%2+16], m1
-punpckhdq m1, m1
-movd [%2+24], m1
-%endmacro
-
-
-; void h263_h_loop_filter(uint8_t *src, int stride, int qscale)
-INIT_MMX mmx
-cglobal h263_h_loop_filter, 3,5,0,32
-movsxdifnidn r1, r1d
-movsxdifnidn r2, r2d
-
-lea  r4, [h263_loop_filter_strength]
-movzx   r3d, BYTE [r4+r2]
-movsxr2, r3b
-shl  r2, 1
-
-sub  r0, 2
-lea  r3, [r1*3]
-
-TRANSPOSE4X4 r0, rsp
-lea  r4, [r0+r1*4]
-TRANSPOSE4X4 r4, rsp+4
-
-H263_LOOP_FILTER rsp, rsp+8, rsp+16, rsp+24, r2d
-
-mova m1, m5
-mova m0, m4
-punpcklbwm5, m3
-punpcklbwm4, m6
-punpckhbwm1, m3
-punpckhbwm0, m6
-mova m3, m5
-mova   

[libav-devel] [PATCH] dsputil: x86: Fix linker error with h263 loop filter.

2013-02-18 Thread Daniel Kang
This was caused by referencing a conditionally compiled table. Now the
code is also compiled conditionally.
---
Change order and filename
---
 libavcodec/x86/Makefile|2 +
 libavcodec/x86/dsputil.asm |  162 ---
 libavcodec/x86/h263_loopfilter.asm |  187 
 3 files changed, 189 insertions(+), 162 deletions(-)
 create mode 100644 libavcodec/x86/h263_loopfilter.asm

diff --git a/libavcodec/x86/Makefile b/libavcodec/x86/Makefile
index c740573..a759e6e 100644
--- a/libavcodec/x86/Makefile
+++ b/libavcodec/x86/Makefile
@@ -44,6 +44,8 @@ YASM-OBJS-$(CONFIG_AC3DSP) += x86/ac3dsp.o
 YASM-OBJS-$(CONFIG_DCT)+= x86/dct32.o
 YASM-OBJS-$(CONFIG_ENCODERS)   += x86/dsputilenc.o
 YASM-OBJS-$(CONFIG_FFT)+= x86/fft.o
+YASM-OBJS-$(CONFIG_H263_DECODER)   += x86/h263_loopfilter.o
+YASM-OBJS-$(CONFIG_H263_ENCODER)   += x86/h263_loopfilter.o
 YASM-OBJS-$(CONFIG_H264CHROMA) += x86/h264_chromamc.o   \
   x86/h264_chromamc_10bit.o
 YASM-OBJS-$(CONFIG_H264DSP)+= x86/h264_deblock.o\
diff --git a/libavcodec/x86/dsputil.asm b/libavcodec/x86/dsputil.asm
index 7ea796d..4539e5c 100644
--- a/libavcodec/x86/dsputil.asm
+++ b/libavcodec/x86/dsputil.asm
@@ -22,8 +22,6 @@
 %include "libavutil/x86/x86util.asm"
 
 SECTION_RODATA
-cextern pb_FC
-cextern h263_loop_filter_strength
 pb_f: times 16 db 15
 pb_: times 8 db -1
 pb_7: times 8 db 7
@@ -651,163 +649,3 @@ BSWAP32_BUF
 INIT_XMM ssse3
 BSWAP32_BUF
 
-
-%macro H263_LOOP_FILTER 5
-pxor m7, m7
-mova m0, [%1]
-mova m1, [%1]
-mova m2, [%4]
-mova m3, [%4]
-punpcklbwm0, m7
-punpckhbwm1, m7
-punpcklbwm2, m7
-punpckhbwm3, m7
-psubwm0, m2
-psubwm1, m3
-mova m2, [%2]
-mova m3, [%2]
-mova m4, [%3]
-mova m5, [%3]
-punpcklbwm2, m7
-punpckhbwm3, m7
-punpcklbwm4, m7
-punpckhbwm5, m7
-psubwm4, m2
-psubwm5, m3
-psllwm4, 2
-psllwm5, 2
-paddwm4, m0
-paddwm5, m1
-pxor m6, m6
-pcmpgtw  m6, m4
-pcmpgtw  m7, m5
-pxor m4, m6
-pxor m5, m7
-psubwm4, m6
-psubwm5, m7
-psrlwm4, 3
-psrlwm5, 3
-packuswb m4, m5
-packsswb m6, m7
-pxor m7, m7
-movd m2, %5
-punpcklbwm2, m2
-punpcklbwm2, m2
-punpcklbwm2, m2
-psubusb  m2, m4
-mova m3, m2
-psubusb  m3, m4
-psubbm2, m3
-mova m3, [%2]
-mova m4, [%3]
-pxor m3, m6
-pxor m4, m6
-paddusb  m3, m2
-psubusb  m4, m2
-pxor m3, m6
-pxor m4, m6
-paddusb  m2, m2
-packsswb m0, m1
-pcmpgtb  m7, m0
-pxor m0, m7
-psubbm0, m7
-mova m1, m0
-psubusb  m0, m2
-psubbm1, m0
-pand m1, [pb_FC]
-psrlwm1, 2
-pxor m1, m7
-psubbm1, m7
-mova m5, [%1]
-mova m6, [%4]
-psubbm5, m1
-paddbm6, m1
-%endmacro
-
-INIT_MMX mmx
-; void h263_v_loop_filter(uint8_t *src, int stride, int qscale)
-cglobal h263_v_loop_filter, 3,5
-movsxdifnidn r1, r1d
-movsxdifnidn r2, r2d
-
-lea  r4, [h263_loop_filter_strength]
-movzx   r3d, BYTE [r4+r2]
-movsxr2, r3b
-shl  r2, 1
-
-mov  r3, r0
-sub  r3, r1
-mov  r4, r3
-sub  r4, r1
-H263_LOOP_FILTER r4, r3, r0, r0+r1, r2d
-
-mova   [r3], m3
-mova   [r0], m4
-mova   [r4], m5
-mova[r0+r1], m6
-RET
-
-%macro TRANSPOSE4X4 2
-movd  m0, [%1]
-movd  m1, [%1+r1]
-movd  m2, [%1+r1*2]
-movd  m3, [%1+r3]
-punpcklbw m0, m1
-punpcklbw m2, m3
-mova  m1, m0
-punpcklwd m0, m2
-punpckhwd m1, m2
-movd [%2+ 0], m0
-punpckhdq m0, m0
-movd [%2+ 8], m0
-movd [%2+16], m1
-punpckhdq m1, m1
-movd [%2+24], m1
-%endmacro
-
-
-; void h263_h_loop_filter(uint8_t *src, int stride, int qscale)
-INIT_MMX mmx
-cglobal h263_h_loop_filter, 3,5,0,32
-movsxdifnidn r1, r1d
-movsxdifnidn r2, r2d
-
-lea  r4, [h263_loop_filter_strength]
-movzx   r3d, BYTE [r4+r2]
-movsxr2, r3b
-shl  r2, 1
-
-sub  r0, 2
-lea  r3, [r1*3]
-
-TRANSPOSE4X4 r0, rsp
-lea  r4, [r0+r1*4]
-TRANSPOSE4X4 r4, rsp+4
-
-H263_LOOP_FILTER rsp, rsp+8, rsp+16, rsp+24, r2d
-
-mova m1, m5
-mova m0, m4
-punpcklbwm5, m3
-punpcklbwm4, m6
-punpckhbwm1, m3
-  

[libav-devel] [PATCH] dsputil: x86: Fix int -> ptrdiff_t

2013-02-15 Thread Daniel Kang
---
 libavcodec/x86/dsputil_mmx.c |2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/libavcodec/x86/dsputil_mmx.c b/libavcodec/x86/dsputil_mmx.c
index c011a21..fbc4b01 100644
--- a/libavcodec/x86/dsputil_mmx.c
+++ b/libavcodec/x86/dsputil_mmx.c
@@ -147,7 +147,7 @@ void ff_avg_pixels8_xy2_3dnow(uint8_t *block, const uint8_t 
*pixels,
 
 void ff_put_pixels8_mmxext(uint8_t *block, const uint8_t *pixels, ptrdiff_t 
line_size, int h);
 static void ff_put_pixels16_mmxext(uint8_t *block, const uint8_t *pixels,
-   int line_size, int h)
+   ptrdiff_t line_size, int h)
 {
 ff_put_pixels8_mmxext(block, pixels, line_size, h);
 ff_put_pixels8_mmxext(block + 8, pixels + 8, line_size, h);
-- 
1.7.10.4

___
libav-devel mailing list
libav-devel@libav.org
https://lists.libav.org/mailman/listinfo/libav-devel


Re: [libav-devel] [PATCH] hpeldsp: x86: Convert dsputil_rnd_template to yasm

2013-02-15 Thread Daniel Kang
On Fri, Feb 15, 2013 at 1:53 PM, Diego Biurrun di...@biurrun.de wrote:

 On Fri, Feb 15, 2013 at 12:53:44AM -0500, Daniel Kang wrote:
  On Thu, Feb 14, 2013 at 7:59 AM, Diego Biurrun di...@biurrun.de wrote:
   On Wed, Feb 13, 2013 at 05:53:36PM -0500, Daniel Kang wrote:
  
   --- a/libavcodec/x86/cavsdsp.c
   +++ b/libavcodec/x86/cavsdsp.c
   @@ -475,12 +481,18 @@ CAVS_MC(put_, 8, 3dnow)
CAVS_MC(put_, 16,3dnow)
CAVS_MC(avg_, 8, 3dnow)
CAVS_MC(avg_, 16,3dnow)
   +#endif /* HAVE_AMD3DNOW_INLINE */
  
static av_cold void ff_cavsdsp_init_3dnow(CAVSDSPContext *c,
  AVCodecContext *avctx)
{
   +#if HAVE_YASM
   +c->put_cavs_qpel_pixels_tab[0][0] = ff_put_cavs_qpel16_mc00_mmxext;
   +c->put_cavs_qpel_pixels_tab[1][0] = ff_put_cavs_qpel16_mc00_mmxext;
   +#endif
  
   +#if HAVE_INLINE_ASM
#define dspfunc(PFX, IDX, NUM) \
   -c-PFX ## _pixels_tab[IDX][ 0] = ff_ ## PFX ## NUM ##
   _mc00_mmxext; \
c-PFX ## _pixels_tab[IDX][ 2] = ff_ ## PFX ## NUM ##
   _mc20_3dnow; \
c-PFX ## _pixels_tab[IDX][ 4] = ff_ ## PFX ## NUM ##
   _mc01_3dnow; \
  
   mmxext functions in the 3dnow init function?
 
  Yes, this is correct. It does not contain any mmxext-specific instructions.

 That's not my definition of correct.  It is not wrongly placed, just
 wrongly named then.  Please fix the name.

What do you suggest?

   --- a/libavcodec/x86/dsputil_mmx.c
   +++ b/libavcodec/x86/dsputil_mmx.c
   @@ -128,26 +136,45 @@ void
   ff_put_no_rnd_pixels8_y2_exact_mmxext(uint8_t *block,
void ff_put_pixels8_mmxext(uint8_t *block, const uint8_t *pixels,
   ptrdiff_t line_size, int h);
static void ff_put_pixels16_mmxext(uint8_t *block, const uint8_t
   *pixels,
   -   int line_size, int h)
   +   ptrdiff_t line_size, int h)
  
   Is there a reason not to do this separately, i.e. right away?
 
  No.

 So let's go ahead and change it separately. :)

Sure

   --- a/libavcodec/x86/dsputil_rnd_template.c
   +++ b/libavcodec/x86/dsputil_rnd_template.c
   @@ -25,570 +25,28 @@
  
//FIXME optimize
   -static void DEF(put, pixels16_y2)(uint8_t *block, const uint8_t
   *pixels, ptrdiff_t line_size, int h){
   -DEF(put, pixels8_y2)(block  , pixels  , line_size, h);
   -DEF(put, pixels8_y2)(block+8, pixels+8, line_size, h);
   +static void DEF(ff_put, pixels16_y2)(uint8_t *block, const uint8_t
   *pixels, ptrdiff_t line_size, int h){
   +DEF(ff_put, pixels8_y2)(block  , pixels  , line_size, h);
   +DEF(ff_put, pixels8_y2)(block+8, pixels+8, line_size, h);
}
  
   Is the FIXME comment still valid in some way?
 
  Yes and no. There are mmxext versions of the same thing and they're
  faster anyway.

 So it's cruft more than anything else, please delete it.

Done

   +lea  r1, [r1+r2*2]
   +lea  r0, [r0+r2*2]
   +sub r3d, 4
   +jne .loop
   +RET
  
   Weird placement of .loop; I suggest aligning it with the rest.
   Probably it is handled inconsistently throughout...
 
  All my code I've written has the loop in that placement. It's very
  possible it's inconsistent across files.

 I suggest no idiosyncratic formatting for jump instructions and/or
 maintaining the style of the file.

The majority of the code in that file already has loops in that
format. The exceptions are the two functions at the bottom (the sse2
ones).
___
libav-devel mailing list
libav-devel@libav.org
https://lists.libav.org/mailman/listinfo/libav-devel


Re: [libav-devel] [PATCH] hpeldsp: x86: Convert dsputil_rnd_template to yasm

2013-02-14 Thread Daniel Kang
On Thu, Feb 14, 2013 at 7:59 AM, Diego Biurrun di...@biurrun.de wrote:
 On Wed, Feb 13, 2013 at 05:53:36PM -0500, Daniel Kang wrote:

 --- a/libavcodec/x86/cavsdsp.c
 +++ b/libavcodec/x86/cavsdsp.c
 @@ -475,12 +481,18 @@ CAVS_MC(put_, 8, 3dnow)
  CAVS_MC(put_, 16,3dnow)
  CAVS_MC(avg_, 8, 3dnow)
  CAVS_MC(avg_, 16,3dnow)
 +#endif /* HAVE_AMD3DNOW_INLINE */

  static av_cold void ff_cavsdsp_init_3dnow(CAVSDSPContext *c,
AVCodecContext *avctx)
  {
 +#if HAVE_YASM
 +c->put_cavs_qpel_pixels_tab[0][0] = ff_put_cavs_qpel16_mc00_mmxext;
 +c->put_cavs_qpel_pixels_tab[1][0] = ff_put_cavs_qpel16_mc00_mmxext;
 +#endif

 +#if HAVE_INLINE_ASM
  #define dspfunc(PFX, IDX, NUM) \
 -c->PFX ## _pixels_tab[IDX][ 0] = ff_ ## PFX ## NUM ## _mc00_mmxext; \
  c->PFX ## _pixels_tab[IDX][ 2] = ff_ ## PFX ## NUM ## _mc20_3dnow; \
  c->PFX ## _pixels_tab[IDX][ 4] = ff_ ## PFX ## NUM ## _mc01_3dnow; \

 mmxext functions in the 3dnow init function?

Yes, this is correct. It does not contain any mmxext-specific instructions.

 --- a/libavcodec/x86/dsputil_mmx.c
 +++ b/libavcodec/x86/dsputil_mmx.c
 @@ -128,26 +136,45 @@ void ff_put_no_rnd_pixels8_y2_exact_mmxext(uint8_t 
 *block,
  void ff_put_pixels8_mmxext(uint8_t *block, const uint8_t *pixels, ptrdiff_t 
 line_size, int h);
  static void ff_put_pixels16_mmxext(uint8_t *block, const uint8_t *pixels,
 -   int line_size, int h)
 +   ptrdiff_t line_size, int h)

 Is there a reason not to do this separately, i.e. right away?

No.

 --- a/libavcodec/x86/dsputil_rnd_template.c
 +++ b/libavcodec/x86/dsputil_rnd_template.c
 @@ -25,570 +25,28 @@

  //FIXME optimize
 -static void DEF(put, pixels16_y2)(uint8_t *block, const uint8_t *pixels, 
 ptrdiff_t line_size, int h){
 -DEF(put, pixels8_y2)(block  , pixels  , line_size, h);
 -DEF(put, pixels8_y2)(block+8, pixels+8, line_size, h);
 +static void DEF(ff_put, pixels16_y2)(uint8_t *block, const uint8_t *pixels, 
 ptrdiff_t line_size, int h){
 +DEF(ff_put, pixels8_y2)(block  , pixels  , line_size, h);
 +DEF(ff_put, pixels8_y2)(block+8, pixels+8, line_size, h);
  }

 Is the FIXME comment still valid in some way?

Yes and no. There are mmxext versions of the same thing and they're
faster anyway.

 Please prettyprint those lines that you are changing anyway, I think
 I gave you an sed expression to do it automatically the last time.
 It should still work and/or be easy to adopt.

Done.

 @@ -56,6 +107,44 @@ PUT_PIXELS8_X2

 +%macro PUT_PIXELS8_X2_MMX 0-1
 +%if %0 == 1
 +cglobal put%1_pixels8_x2, 4,4
 +%else
 +cglobal put_pixels8_x2, 4,4
 +%endif

 IIRC you don't need the %if, but you can just pass an empty
 first parameter and it should do the right thing.
 .. more below ..

I tried this (and Ronald's suggestion) and I get the error:

libavcodec/x86/hpeldsp.asm:142: error: (cglobal_internal:8) `%ifndef'
expects macro identifiers

I suspect this has to do with putting the %1 in the middle of the
string. Suggestions appreciated.

 +lea  r1, [r1+r2*2]
 +lea  r0, [r0+r2*2]
 +sub r3d, 4
 +jne .loop
 +RET

 Weird placement of .loop; I suggest aligning it with the rest.
 Probably it is handled inconsistently throughout...

All my code I've written has the loop in that placement. It's very
possible it's inconsistent across files.

 @@ -453,6 +753,201 @@ AVG_PIXELS8_XY2

 +%macro AVG_PIXELS8_XY2_MMX 0-1

 Some macros have comments with the C functions they implement, some
 don't.  Please add the comments everywhere, I consider them helpful.

Done.
___
libav-devel mailing list
libav-devel@libav.org
https://lists.libav.org/mailman/listinfo/libav-devel


[libav-devel] [PATCH] mpeg4qpel: Make movsxifnidn do the right thing

2013-02-11 Thread Daniel Kang
Fixes an instruction that does nothing by changing the
source to dword
---
 libavcodec/x86/mpeg4qpel.asm |2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/libavcodec/x86/mpeg4qpel.asm b/libavcodec/x86/mpeg4qpel.asm
index 6b5d203..df20ea9 100644
--- a/libavcodec/x86/mpeg4qpel.asm
+++ b/libavcodec/x86/mpeg4qpel.asm
@@ -100,7 +100,7 @@ PUT_NO_RND_PIXELS8_L2
 ; put_no_rnd_pixels16_l2(uint8_t *dst, uint8_t *src1, uint8_t *src2, int 
dstStride, int src1Stride, int h)
 %macro PUT_NO_RND_PIXELS16_l2 0
 cglobal put_no_rnd_pixels16_l2, 6,6
-movsxdifnidn r3, r3
+movsxdifnidn r3, r3d
 movsxdifnidn r4, r4d
 pcmpeqb  m6, m6
 testr5d, 1
-- 
1.7.10.4

___
libav-devel mailing list
libav-devel@libav.org
https://lists.libav.org/mailman/listinfo/libav-devel


Re: [libav-devel] [PATCH] dsputil: x86: Convert h263 loop filter to yasm

2013-02-06 Thread Daniel Kang
On Tue, Feb 5, 2013 at 1:23 PM, Diego Biurrun di...@biurrun.de wrote:
 On Fri, Feb 01, 2013 at 06:28:30PM -0500, Daniel Kang wrote:

 --- a/libavcodec/x86/dsputil.asm
 +++ b/libavcodec/x86/dsputil.asm
 @@ -648,3 +650,160 @@ BSWAP32_BUF
 +
 +INIT_MMX mmx
 +cglobal h263_v_loop_filter, 3,5
 +movsxdifnidn r1, r1
 +movsxdifnidn r2, r2
 +
 +INIT_MMX mmx
 +cglobal h263_h_loop_filter, 3,5,0,32
 +movsxdifnidn r1, r1
 +movsxdifnidn r2, r2

 Is the sign extension still needed after all the ptrdiff_t changes I did?

Yes. Either you didn't touch the h263 loop filter, or your changes
haven't been pushed.
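
For context, the reason it is still needed (a sketch, not lines from the
patch): the loop filter prototypes still pass the stride as int, so on
x86-64 only the low 32 bits of the argument register are defined and the
value has to be widened before it can be used in an address. Once the
prototype takes ptrdiff_t, the line can simply be dropped.

movsxdifnidn r1, r1d   ; sign-extend the 32-bit int stride on x86-64
mova         m0, [r0+r1]
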
___
libav-devel mailing list
libav-devel@libav.org
https://lists.libav.org/mailman/listinfo/libav-devel


Re: [libav-devel] [PATCH] dsputil: x86: Convert h263 loop filter to yasm

2013-02-05 Thread Daniel Kang
On Fri, Feb 1, 2013 at 6:28 PM, Daniel Kang daniel.d.k...@gmail.com wrote:
 ---
 I am very skeptical when assembly works on the first time. More testing would 
 be appreciated.
 ---
  libavcodec/x86/dsputil.asm   |  159 
  libavcodec/x86/dsputil_mmx.c |  185 
 ++
  2 files changed, 167 insertions(+), 177 deletions(-)

Ping?
___
libav-devel mailing list
libav-devel@libav.org
https://lists.libav.org/mailman/listinfo/libav-devel


Re: [libav-devel] [PATCH] dsputil: x86: Convert h263 loop filter to yasm

2013-02-05 Thread Daniel Kang
On Tue, Feb 5, 2013 at 9:16 AM, Luca Barbato lu_z...@gentoo.org wrote:
 On 02/02/13 00:28, Daniel Kang wrote:
 ---
 I am very skeptical when assembly works on the first time. More testing 
 would be appreciated.
 ---
  libavcodec/x86/dsputil.asm   |  159 
  libavcodec/x86/dsputil_mmx.c |  185 
 ++
  2 files changed, 167 insertions(+), 177 deletions(-)

 /usr/lib/gcc/x86_64-pc-linux-gnu/4.6.3/../../../../x86_64-pc-linux-gnu/bin/ld:
 libavcodec/libavcodec.a(dsputil.o): relocation R_X86_64_32 against
 `ff_h263_loop_filter_strength' can not be used when making a shared
 object; recompile with -fPIC
 libavcodec/libavcodec.a: could not read symbols: Bad value

 Looks like something is broken for x86_64.

What configure options are you using? ./configure && make works just
fine for me with a clean build, on master.
___
libav-devel mailing list
libav-devel@libav.org
https://lists.libav.org/mailman/listinfo/libav-devel


Re: [libav-devel] [PATCH] dsputil: x86: Convert h263 loop filter to yasm

2013-02-05 Thread Daniel Kang
On Tue, Feb 5, 2013 at 1:04 PM, Luca Barbato lu_z...@gentoo.org wrote:
 On 05/02/13 18:31, Daniel Kang wrote:
 On Tue, Feb 5, 2013 at 9:16 AM, Luca Barbato lu_z...@gentoo.org wrote:
 On 02/02/13 00:28, Daniel Kang wrote:
 ---
 I am very skeptical when assembly works on the first time. More testing 
 would be appreciated.
 ---
  libavcodec/x86/dsputil.asm   |  159 
  libavcodec/x86/dsputil_mmx.c |  185 
 ++
  2 files changed, 167 insertions(+), 177 deletions(-)

 /usr/lib/gcc/x86_64-pc-linux-gnu/4.6.3/../../../../x86_64-pc-linux-gnu/bin/ld:
 libavcodec/libavcodec.a(dsputil.o): relocation R_X86_64_32 against
 `ff_h263_loop_filter_strength' can not be used when making a shared
 object; recompile with -fPIC
 libavcodec/libavcodec.a: could not read symbols: Bad value

 Looks like something is broken for x86_64.

 What configure options are you using? ./configure && make works just
 fine for me with a clean build, on master.

 Here does not and it is a clean checkout with that patch on top.

 Which compiler are you using?

 lu

ddkang@ddkang ~/code/libav $ gcc --version
gcc (Ubuntu/Linaro 4.7.2-2ubuntu1) 4.7.2
Copyright (C) 2012 Free Software Foundation, Inc.
This is free software; see the source for copying conditions.  There is NO
warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
___
libav-devel mailing list
libav-devel@libav.org
https://lists.libav.org/mailman/listinfo/libav-devel


[libav-devel] [PATCH] dsputil: x86: Convert h263 loop filter to yasm

2013-02-01 Thread Daniel Kang
---
I am very skeptical when assembly works on the first time. More testing would 
be appreciated.
---
 libavcodec/x86/dsputil.asm   |  159 
 libavcodec/x86/dsputil_mmx.c |  185 ++
 2 files changed, 167 insertions(+), 177 deletions(-)

diff --git a/libavcodec/x86/dsputil.asm b/libavcodec/x86/dsputil.asm
index 8002779..b7b7046 100644
--- a/libavcodec/x86/dsputil.asm
+++ b/libavcodec/x86/dsputil.asm
@@ -22,6 +22,8 @@
 %include "libavutil/x86/x86util.asm"
 
 SECTION_RODATA
+cextern pb_FC
+cextern h263_loop_filter_strength
 pb_f: times 16 db 15
 pb_: times 8 db -1
 pb_7: times 8 db 7
@@ -648,3 +650,160 @@ BSWAP32_BUF
 
 INIT_XMM ssse3
 BSWAP32_BUF
+
+
+%macro H263_LOOP_FILTER 5
+pxor m7, m7
+mova m0, [%1]
+mova m1, [%1]
+mova m2, [%4]
+mova m3, [%4]
+punpcklbwm0, m7
+punpckhbwm1, m7
+punpcklbwm2, m7
+punpckhbwm3, m7
+psubwm0, m2
+psubwm1, m3
+mova m2, [%2]
+mova m3, [%2]
+mova m4, [%3]
+mova m5, [%3]
+punpcklbwm2, m7
+punpckhbwm3, m7
+punpcklbwm4, m7
+punpckhbwm5, m7
+psubwm4, m2
+psubwm5, m3
+psllwm4, 2
+psllwm5, 2
+paddwm4, m0
+paddwm5, m1
+pxor m6, m6
+pcmpgtw  m6, m4
+pcmpgtw  m7, m5
+pxor m4, m6
+pxor m5, m7
+psubwm4, m6
+psubwm5, m7
+psrlwm4, 3
+psrlwm5, 3
+packuswb m4, m5
+packsswb m6, m7
+pxor m7, m7
+movd m2, %5
+punpcklbwm2, m2
+punpcklbwm2, m2
+punpcklbwm2, m2
+psubusb  m2, m4
+mova m3, m2
+psubusb  m3, m4
+psubbm2, m3
+mova m3, [%2]
+mova m4, [%3]
+pxor m3, m6
+pxor m4, m6
+paddusb  m3, m2
+psubusb  m4, m2
+pxor m3, m6
+pxor m4, m6
+paddusb  m2, m2
+packsswb m0, m1
+pcmpgtb  m7, m0
+pxor m0, m7
+psubbm0, m7
+mova m1, m0
+psubusb  m0, m2
+psubbm1, m0
+pand m1, [pb_FC]
+psrlwm1, 2
+pxor m1, m7
+psubbm1, m7
+mova m5, [%1]
+mova m6, [%4]
+psubbm5, m1
+paddbm6, m1
+%endmacro
+
+INIT_MMX mmx
+cglobal h263_v_loop_filter, 3,5
+movsxdifnidn r1, r1
+movsxdifnidn r2, r2
+
+movzx   r3d, BYTE [ff_h263_loop_filter_strength+r2]
+movsxr2, r3b
+shl  r2, 1
+
+mov  r3, r0
+sub  r3, r1
+mov  r4, r3
+sub  r4, r1
+H263_LOOP_FILTER r4, r3, r0, r0+r1, r2d
+
+mova   [r3], m3
+mova   [r0], m4
+mova   [r4], m5
+mova[r0+r1], m6
+RET
+
+%macro TRANSPOSE4X4 2
+movd  m0, [%1]
+movd  m1, [%1+r1]
+movd  m2, [%1+r1*2]
+movd  m3, [%1+r3]
+punpcklbw m0, m1
+punpcklbw m2, m3
+mova  m1, m0
+punpcklwd m0, m2
+punpckhwd m1, m2
+movd [%2+ 0], m0
+punpckhdq m0, m0
+movd [%2+ 8], m0
+movd [%2+16], m1
+punpckhdq m1, m1
+movd [%2+24], m1
+%endmacro
+
+
+INIT_MMX mmx
+cglobal h263_h_loop_filter, 3,5,0,32
+movsxdifnidn r1, r1
+movsxdifnidn r2, r2
+
+movzx   r3d, BYTE [ff_h263_loop_filter_strength+r2]
+movsxr2, r3b
+shl  r2, 1
+
+sub  r0, 2
+lea  r3, [r1*3]
+
+TRANSPOSE4X4 r0, rsp
+lea  r4, [r0+r1*4]
+TRANSPOSE4X4 r4, rsp+4
+
+H263_LOOP_FILTER rsp, rsp+8, rsp+16, rsp+24, r2d
+
+mova m1, m5
+mova m0, m4
+punpcklbwm5, m3
+punpcklbwm4, m6
+punpckhbwm1, m3
+punpckhbwm0, m6
+mova m3, m5
+mova m6, m1
+punpcklwdm5, m4
+punpcklwdm1, m0
+punpckhwdm3, m4
+punpckhwdm6, m0
+movd   [r0], m5
+punpckhdqm5, m5
+movd  [r0+r1*1], m5
+movd  [r0+r1*2], m3
+punpckhdqm3, m3
+movd[r0+r3], m3
+movd   [r4], m1
+punpckhdqm1, m1
+movd  [r4+r1*1], m1
+movd  [r4+r1*2], m6
+punpckhdqm6, m6
+movd[r4+r3], m6
+RET
diff --git a/libavcodec/x86/dsputil_mmx.c b/libavcodec/x86/dsputil_mmx.c
index 3ccef62..7039095 100644
--- a/libavcodec/x86/dsputil_mmx.c
+++ b/libavcodec/x86/dsputil_mmx.c
@@ -650,181 +650,12 @@ static void add_hfyu_median_prediction_cmov(uint8_t 
*dst, const uint8_t *top,
 *left_top = tl;
 }
 #endif
+#endif /* HAVE_INLINE_ASM */
 
-static inline void transpose4x4(uint8_t *dst, uint8_t *src, x86_reg 
dst_stride, x86_reg src_stride){
-__asm__ volatile( //FIXME could save 1 instruction if done as 8x4 ...
-movd  (%1), %%mm0  \n\t
-add   %3, 

[libav-devel] [PATCH] dsputil: Fix error by not using redzone

2013-01-27 Thread Daniel Kang
---
I currently have no way of testing if this fixes the mingw32 failures or not. 
Testing would be appreciated.
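
For context, the change amounts to this (sketch; the function name is
illustrative): let cglobal reserve the scratch space instead of storing
below the stack pointer, since Win64 (and hence the mingw targets) has no
red zone there.

; sketch: the extra cglobal parameter reserves 16 bytes of stack scratch,
; so the spill goes to [rsp+8] instead of the red zone at [rsp-8]
cglobal example_qpel16_h_lowpass, 5, 5, 0, 16
    mova [rsp+8], m0
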
---
 libavcodec/x86/mpeg4qpel.asm |6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/libavcodec/x86/mpeg4qpel.asm b/libavcodec/x86/mpeg4qpel.asm
index a5525d2..6b5d203 100644
--- a/libavcodec/x86/mpeg4qpel.asm
+++ b/libavcodec/x86/mpeg4qpel.asm
@@ -168,7 +168,7 @@ INIT_MMX 3dnow
 PUT_NO_RND_PIXELS16_l2
 
 %macro MPEG4_QPEL16_H_LOWPASS 1
-cglobal %1_mpeg4_qpel16_h_lowpass, 5, 5, 0, 8
+cglobal %1_mpeg4_qpel16_h_lowpass, 5, 5, 0, 16
 movsxdifnidn r2, r2d
 movsxdifnidn r3, r3d
 pxor m7, m7
@@ -201,7 +201,7 @@ cglobal %1_mpeg4_qpel16_h_lowpass, 5, 5, 0, 8
 paddwm6, [PW_ROUND]
 paddwm0, m6
 psrawm0, 5
-mova[rsp-8], m0
+mova[rsp+8], m0
 mova m0, [r1+5]
 mova m5, m0
 mova m6, m0
@@ -225,7 +225,7 @@ cglobal %1_mpeg4_qpel16_h_lowpass, 5, 5, 0, 8
 paddwm1, [PW_ROUND]
 paddwm3, m1
 psrawm3, 5
-mova m1, [rsp-8]
+mova m1, [rsp+8]
 packuswb m1, m3
 OP_MOV [r0], m1, m4
 mova m1, [r1+9]
-- 
1.7.10.4

___
libav-devel mailing list
libav-devel@libav.org
https://lists.libav.org/mailman/listinfo/libav-devel


Re: [libav-devel] [PATCH v2] dsputil: x86: Convert mpeg4 qpel and dsputil avg to yasm

2013-01-26 Thread Daniel Kang
On Sat, Jan 26, 2013 at 1:25 PM, Diego Biurrun di...@biurrun.de wrote:
 On Sat, Jan 26, 2013 at 01:01:09PM -0500, Daniel Kang wrote:
 On Sat, Jan 26, 2013 at 3:23 AM, Diego Biurrun di...@biurrun.de wrote:
  On Sat, Jan 26, 2013 at 12:32:16AM -0500, Daniel Kang wrote:
  --- a/libavcodec/x86/dsputil.asm
  +++ b/libavcodec/x86/dsputil.asm
  @@ -879,3 +884,984 @@ cglobal avg_pixels16, 4,5,4
  +; mpeg4 qpel
  +
  +%macro MPEG4_QPEL16_H_LOWPASS 1
  +cglobal %1_mpeg4_qpel16_h_lowpass, 5, 5, 0, 8
 
  So it seems like dsputil.asm is becoming the new dumping ground for
  functions of all kind.  It doubles in size after your patch and at
  around 2k lines it starts to work against our current efforts of
  splitting dsputil into sensibly-sized pieces.  If you continue your
  porting efforts, it will probably end up around 5k lines or so.
 
  Whenever there is an opportunity to make dsputil less monolithic comes
  up, we should exploit it.  That seems to be the case here.

 I was trying to avoid drama and bikeshedding re: file names and save
 that for another patch. I guess I could split it in this patch if you
 want.

 Come on, don't blow the issue out of proportion.  Just come up with a
 suitable name, maybe ask one or two other people that know the code
 for suitable suggestions.  My suggestion would be mpeg4qpel.asm, maybe
 h263qpel.asm, but the former is probably more fitting, not sure.

 Even in case you should get three different suggestions and change to one
 after the other, it's easy enough with git and will not hinder your
 workflow at all.

 However, going back and forth after your patch has been pushed just
 creates unnecessary churn and annoyance.

Very well, moved to mpeg4qpel.asm

  +%macro QPEL_V_LOW 5
  +paddw  m0, m1
  +mova   m4, [pw_20]
  +pmullw m4, m0
  +mova   m0, %4
  +mova   m5, %1
  +paddw  m5, m0
  +psubw  m4, m5
  +mova   m5, %2
  +mova   m6, %3
  +paddw  m5, m3
  +paddw  m6, m2
  +paddw  m6, m6
  +psubw  m5, m6
  +pmullw m5, [pw_3]
  +paddw  m4, [PW_ROUND]
  +paddw  m5, m4
  +psraw  m5, 5
  +packuswb   m5, m5
  +OP_MOV %5, m5, m7
  +SWAP 0,1,2,3
  +%endmacro
 
  nit: SWAP is not special, format its arguments like the rest of the
  macro instructions.

 I disagree on this one, I think SWAP is special.

 The rest of the codebase disagrees with you then.  In the rest of the
 files SWAP has spaces after comma and arguments aligned with the other
 instructions.

Only some of it does, but changed.

  --- a/libavcodec/x86/dsputil_avg_template.c
  +++ b/libavcodec/x86/dsputil_avg_template.c
  @@ -24,781 +24,32 @@
   //FIXME the following could be optimized too ...
  +static void DEF(ff_put_no_rnd_pixels16_x2)(uint8_t *block, const uint8_t 
  *pixels, int line_size, int h){
  +DEF(ff_put_no_rnd_pixels8_x2)(block  , pixels  , line_size, h);
  +DEF(ff_put_no_rnd_pixels8_x2)(block+8, pixels+8, line_size, h);
   }
  +static void DEF(ff_put_pixels16_y2)(uint8_t *block, const uint8_t 
  *pixels, int line_size, int h){
  +DEF(ff_put_pixels8_y2)(block  , pixels  , line_size, h);
  +DEF(ff_put_pixels8_y2)(block+8, pixels+8, line_size, h);
   }
  +static void DEF(ff_put_no_rnd_pixels16_y2)(uint8_t *block, const uint8_t 
  *pixels, int line_size, int h){
  +DEF(ff_put_no_rnd_pixels8_y2)(block  , pixels  , line_size, h);
  +DEF(ff_put_no_rnd_pixels8_y2)(block+8, pixels+8, line_size, h);
   }
  +static void DEF(ff_avg_pixels16)(uint8_t *block, const uint8_t *pixels, 
  int line_size, int h){
  +DEF(ff_avg_pixels8)(block  , pixels  , line_size, h);
  +DEF(ff_avg_pixels8)(block+8, pixels+8, line_size, h);
   }
  +static void DEF(ff_avg_pixels16_x2)(uint8_t *block, const uint8_t 
  *pixels, int line_size, int h){
  +DEF(ff_avg_pixels8_x2)(block  , pixels  , line_size, h);
  +DEF(ff_avg_pixels8_x2)(block+8, pixels+8, line_size, h);
   }
  +static void DEF(ff_avg_pixels16_y2)(uint8_t *block, const uint8_t 
  *pixels, int line_size, int h){
  +DEF(ff_avg_pixels8_y2)(block  , pixels  , line_size, h);
  +DEF(ff_avg_pixels8_y2)(block+8, pixels+8, line_size, h);
   }
  +static void DEF(ff_avg_pixels16_xy2)(uint8_t *block, const uint8_t 
  *pixels, int line_size, int h){
  +DEF(ff_avg_pixels8_xy2)(block  , pixels  , line_size, h);
  +DEF(ff_avg_pixels8_xy2)(block+8, pixels+8, line_size, h);
   }
 
  If you feel motivated, you could fix the formatting as you are changing
  all lines anyway.

 Fixed.

 Hehe, sort of :)

 Try running the following (GNU) sed command on your tree:

   sed -i -e 's/+/ + /g' -e 's/  ,/,/g' 
 libavcodec/x86/dsputil_avg_template.c

 That should prettyprint it nicely.

Fixed.

  --- a/libavcodec/x86/dsputil_mmx.c
  +++ b/libavcodec/x86/dsputil_mmx.c
  @@ -80,6 +80,143 @@ DECLARE_ALIGNED(16, const xmm_reg,  ff_pb_FE)   = { 
  0xFEFEFEFEFEFEFEFEULL, 0xFEF

[libav-devel] [PATCH] dsputil: Fix compile error

2013-01-26 Thread Daniel Kang
Fixes under --disable-optimizations --disable-yasm --disable-inline-asm

Due to misplaced HAVE_YASM guard
---
 libavcodec/x86/dsputil_mmx.c |2 ++
 1 file changed, 2 insertions(+)

diff --git a/libavcodec/x86/dsputil_mmx.c b/libavcodec/x86/dsputil_mmx.c
index 743a7c1..3ccef62 100644
--- a/libavcodec/x86/dsputil_mmx.c
+++ b/libavcodec/x86/dsputil_mmx.c
@@ -81,6 +81,7 @@ DECLARE_ALIGNED(16, const double, ff_pd_1)[2] = { 1.0, 1.0 };
 DECLARE_ALIGNED(16, const double, ff_pd_2)[2] = { 2.0, 2.0 };
 
 
+#if HAVE_YASM
 void ff_put_pixels8_x2_mmxext(uint8_t *block, const uint8_t *pixels,
   int line_size, int h);
 void ff_put_pixels8_x2_3dnow(uint8_t *block, const uint8_t *pixels,
@@ -179,6 +180,7 @@ void ff_put_no_rnd_mpeg4_qpel8_v_lowpass_mmxext(uint8_t 
*dst, uint8_t *src,
 int dstStride, int srcStride);
 #define ff_put_no_rnd_pixels16_mmxext ff_put_pixels16_mmxext
 #define ff_put_no_rnd_pixels8_mmxext ff_put_pixels8_mmxext
+#endif /* HAVE_YASM */
 
 
 #if HAVE_INLINE_ASM
-- 
1.7.10.4

___
libav-devel mailing list
libav-devel@libav.org
https://lists.libav.org/mailman/listinfo/libav-devel


[libav-devel] [PATCH] dsputil: Fix error with wrong number of registers

2013-01-26 Thread Daniel Kang
Allocated 5 instead of 6 registers
---
 libavcodec/x86/mpeg4qpel.asm |2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/libavcodec/x86/mpeg4qpel.asm b/libavcodec/x86/mpeg4qpel.asm
index 39c9fc8..a5525d2 100644
--- a/libavcodec/x86/mpeg4qpel.asm
+++ b/libavcodec/x86/mpeg4qpel.asm
@@ -99,7 +99,7 @@ PUT_NO_RND_PIXELS8_L2
 
 ; put_no_rnd_pixels16_l2(uint8_t *dst, uint8_t *src1, uint8_t *src2, int 
dstStride, int src1Stride, int h)
 %macro PUT_NO_RND_PIXELS16_l2 0
-cglobal put_no_rnd_pixels16_l2, 5,5
+cglobal put_no_rnd_pixels16_l2, 6,6
 movsxdifnidn r3, r3
 movsxdifnidn r4, r4d
 pcmpeqb  m6, m6
-- 
1.7.10.4

___
libav-devel mailing list
libav-devel@libav.org
https://lists.libav.org/mailman/listinfo/libav-devel


Re: [libav-devel] [PATCH v2] dsputil: x86: Convert mpeg4 qpel and dsputil avg to yasm

2013-01-23 Thread Daniel Kang
On Wed, Jan 23, 2013 at 12:36 PM, Ronald S. Bultje rsbul...@gmail.com wrote:
 Hi Daniel,

 On Tue, Jan 22, 2013 at 11:19 PM, Daniel Kang daniel.d.k...@gmail.com wrote:
 @@ -1330,10 +1087,12 @@ static void OPNAME ## qpel8_mc12_ ## MMX(uint8_t 
 *dst, uint8_t *src,\
  {   \
  uint64_t half[8 + 9];   \
  uint8_t * const halfH = ((uint8_t*)half);   \
 -put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8,  \
 -stride, 9); \
 -put ## RND ## pixels8_l2_ ## MMX(halfH, src, halfH, 8, stride, 9);  \
 -OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH, stride, 8); \
 +ff_put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8,   \
 +   stride, 9);  \
 +ff_put ## RND ## pixels8_l2_ ## MMX(halfH, src, halfH,  \
 +8, stride, 9);  \
 +ff_ ## OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH,  \
 +   stride, 8);  \
  }   \

 So, for all cases like this, does this actually affect speed? I mean,
 previously this could be inlined, now it no longer can be. I wonder if
 that has any effect on speed (i.e. was it ever inlined previously?).

Depending on the architecture (??) the functions are inlined, but are
often not. I suspect GCC's insane method of reordering registers
swallows any overhead from calling these functions, but due to macro
hell, I'm not sure of the best way to test this.
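
One crude way to measure it (a sketch, assuming the START_TIMER/STOP_TIMER
macros from libavutil/timer.h; the call shown is a placeholder for whichever
wrapper is being timed, with the same arguments as in the macro above):

#include "libavutil/timer.h"

/* wrap one call site and compare the printed cycle counts for the
 * inline-asm build vs. the yasm build */
START_TIMER
ff_put_mpeg4_qpel8_h_lowpass_mmxext(halfH, src, 8, stride, 9);
STOP_TIMER("qpel8_h_lowpass")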
___
libav-devel mailing list
libav-devel@libav.org
https://lists.libav.org/mailman/listinfo/libav-devel


Re: [libav-devel] [PATCH v2] dsputil: x86: Convert mpeg4 qpel and dsputil avg to yasm

2013-01-23 Thread Daniel Kang
On Wed, Jan 23, 2013 at 4:14 PM, Daniel Kang daniel.d.k...@gmail.com wrote:
 On Wed, Jan 23, 2013 at 12:36 PM, Ronald S. Bultje rsbul...@gmail.com wrote:
 Hi Daniel,

 On Tue, Jan 22, 2013 at 11:19 PM, Daniel Kang daniel.d.k...@gmail.com 
 wrote:
 @@ -1330,10 +1087,12 @@ static void OPNAME ## qpel8_mc12_ ## MMX(uint8_t 
 *dst, uint8_t *src,\
  {   \
  uint64_t half[8 + 9];   \
  uint8_t * const halfH = ((uint8_t*)half);   \
 -put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8,  \
 -stride, 9); \
 -put ## RND ## pixels8_l2_ ## MMX(halfH, src, halfH, 8, stride, 9);  \
 -OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH, stride, 8); \
 +ff_put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8,   \
 +   stride, 9);  \
 +ff_put ## RND ## pixels8_l2_ ## MMX(halfH, src, halfH,  \
 +8, stride, 9);  \
 +ff_ ## OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH,  \
 +   stride, 8);  \
  }   \

 So, for all cases like this, does this actually affect speed? I mean,
 previously this could be inlined, now it no longer can be. I wonder if
 that has any effect on speed (i.e. was it ever inlined previously?).

 Depending on the architecture (??) the functions are inlined, but are
 often not. I suspect GCC's insane method of reordering registers
 swallows any overhead from calling these functions, but due to macro
 hell, I'm not sure of the best way to test this.

Sorry, this was not very clear. I think the yasm version is faster
despite calling overhead, because GCC uses some ridiculous method of
reordering registers for the inline assembly.
___
libav-devel mailing list
libav-devel@libav.org
https://lists.libav.org/mailman/listinfo/libav-devel


Re: [libav-devel] [PATCH] dsputil: x86: Convert some inline asm to yasm

2013-01-22 Thread Daniel Kang
On Tue, Jan 22, 2013 at 5:10 PM, Diego Biurrun di...@biurrun.de wrote:
 On Tue, Jan 22, 2013 at 04:40:34PM -0500, Daniel Kang wrote:
 --- a/libavcodec/x86/dsputil_avg_template.c
 +++ b/libavcodec/x86/dsputil_avg_template.c
 @@ -24,781 +24,32 @@

  //FIXME the following could be optimized too ...
 +static void DEF(ff_put_no_rnd_pixels16_x2)(uint8_t *block, const uint8_t 
 *pixels, int line_size, int h){
 +DEF(ff_put_no_rnd_pixels8_x2)(block  , pixels  , line_size, h);
 +DEF(ff_put_no_rnd_pixels8_x2)(block+8, pixels+8, line_size, h);
  }
 +static void DEF(ff_put_pixels16_y2)(uint8_t *block, const uint8_t *pixels, 
 int line_size, int h){
 +DEF(ff_put_pixels8_y2)(block  , pixels  , line_size, h);
 +DEF(ff_put_pixels8_y2)(block+8, pixels+8, line_size, h);
  }
 +static void DEF(ff_put_no_rnd_pixels16_y2)(uint8_t *block, const uint8_t 
 *pixels, int line_size, int h){
 +DEF(ff_put_no_rnd_pixels8_y2)(block  , pixels  , line_size, h);
 +DEF(ff_put_no_rnd_pixels8_y2)(block+8, pixels+8, line_size, h);
  }
 +static void DEF(ff_avg_pixels16)(uint8_t *block, const uint8_t *pixels, int 
 line_size, int h){
 +DEF(ff_avg_pixels8)(block  , pixels  , line_size, h);
 +DEF(ff_avg_pixels8)(block+8, pixels+8, line_size, h);
  }
 +static void DEF(ff_avg_pixels16_x2)(uint8_t *block, const uint8_t *pixels, 
 int line_size, int h){
 +DEF(ff_avg_pixels8_x2)(block  , pixels  , line_size, h);
 +DEF(ff_avg_pixels8_x2)(block+8, pixels+8, line_size, h);
  }
 +static void DEF(ff_avg_pixels16_y2)(uint8_t *block, const uint8_t *pixels, 
 int line_size, int h){
 +DEF(ff_avg_pixels8_y2)(block  , pixels  , line_size, h);
 +DEF(ff_avg_pixels8_y2)(block+8, pixels+8, line_size, h);
  }
 +static void DEF(ff_avg_pixels16_xy2)(uint8_t *block, const uint8_t *pixels, 
 int line_size, int h){
 +DEF(ff_avg_pixels8_xy2)(block  , pixels  , line_size, h);
 +DEF(ff_avg_pixels8_xy2)(block+8, pixels+8, line_size, h);
  }

 Moving this to a macro and deleting the file seems saner to me.
 Maybe there are other opinions though...

I was trying to avoid more macro hell in dsputil. Suggestions appreciated.
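
For reference, the macro route being suggested could look roughly like the
sketch below. PIXELS16_WRAPPER is an illustrative name; it assumes the DEF()
suffix macro and the 8-pixel primitives from dsputil_avg_template.c are
already available:

#include <stdint.h>

/* Generates a 16-pixel wrapper from the corresponding 8-pixel primitive,
 * replacing the hand-written bodies in dsputil_avg_template.c. DEF() is
 * assumed to paste the instruction-set suffix (_mmxext or _3dnow). */
#define PIXELS16_WRAPPER(PREFIX, SUFFIX)                                  \
static void DEF(PREFIX ## 16 ## SUFFIX)(uint8_t *block,                   \
                                        const uint8_t *pixels,            \
                                        int line_size, int h)             \
{                                                                          \
    DEF(PREFIX ## 8 ## SUFFIX)(block,     pixels,     line_size, h);       \
    DEF(PREFIX ## 8 ## SUFFIX)(block + 8, pixels + 8, line_size, h);       \
}

PIXELS16_WRAPPER(ff_avg_pixels, _x2)   /* ff_avg_pixels16_x2_<suffix> */
PIXELS16_WRAPPER(ff_avg_pixels, _y2)
PIXELS16_WRAPPER(ff_avg_pixels, _xy2)

The plain ff_avg_pixels16 case would need a variant without the SUFFIX paste
(or an empty argument), which is part of why this quickly turns into the
macro hell being avoided.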

 --- a/libavcodec/x86/dsputil_mmx.c
 +++ b/libavcodec/x86/dsputil_mmx.c
 @@ -83,6 +83,147 @@ DECLARE_ALIGNED(16, const xmm_reg,  ff_pb_FE)   = { 
 0xFEFEFEFEFEFEFEFEULL, 0xFEF

 +#if HAVE_YASM
 +/* VC-1-specific */
 +#define ff_put_pixels8_mmx ff_put_pixels8_mmxext
 +void ff_put_vc1_mspel_mc00_mmx(uint8_t *dst, const uint8_t *src,
 +   int stride, int rnd)
 +{
 +ff_put_pixels8_mmx(dst, src, stride, 8);
 +}
 +
 +void ff_avg_vc1_mspel_mc00_mmxext(uint8_t *dst, const uint8_t *src,
 +  int stride, int rnd)
 +{
 +ff_avg_pixels8_mmxext(dst, src, stride, 8);
 +}

 Is this used outside of VC-1?  If no, this should be split out and moved
 to a VC-1-specific file.

The avg and put pixels functions are. I am fairly confident the others aren't.

 +/***/
 +/* 3Dnow specific */
 +
 +#define DEF(x) x ## _3dnow
 +
 +#include dsputil_avg_template.c
 +
 +#undef DEF
 +
 +/***/
 +/* MMXEXT specific */
 +
 +#define DEF(x) x ## _mmxext
 +
 +#include dsputil_avg_template.c
 +
 +#undef DEF
 +
 +
 +
 +#endif /* HAVE_YASM */
 +
 +
 +
 +
  #if HAVE_INLINE_ASM

 nit: stray large amount of empty lines

Fixed.

 --- a/libavcodec/x86/dsputil.asm
 +++ b/libavcodec/x86/dsputil.asm
 @@ -879,3 +884,986 @@ cglobal avg_pixels16, 4,5,4
  lea  r0, [r0+r2*4]
  jnz   .loop
  REP_RET
 +
 +
 +
 +
 +; HPEL mmxext
 +%macro PAVGB_OP 2

 nit: 4 empty lines looks slightly weird; in that file 2 empty lines
 between unrelated blocks seem to be the norm.

Fixed.
___
libav-devel mailing list
libav-devel@libav.org
https://lists.libav.org/mailman/listinfo/libav-devel


Re: [libav-devel] [PATCH v2] dsputilenc: x86: Convert pixel inline asm to yasm

2013-01-20 Thread Daniel Kang
On Wed, Jan 16, 2013 at 2:41 AM, Daniel Kang daniel.d.k...@gmail.com wrote:
 ---
Fixed movu -> mova comment from Loren
 ---
  libavcodec/x86/dsputilenc.asm   |  152 +
  libavcodec/x86/dsputilenc_mmx.c |  201 
 ---
  2 files changed, 172 insertions(+), 181 deletions(-)

Ping?
___
libav-devel mailing list
libav-devel@libav.org
https://lists.libav.org/mailman/listinfo/libav-devel


[libav-devel] [PATCH v2] dsputilenc: x86: Convert pixel inline asm to yasm

2013-01-15 Thread Daniel Kang
---
Fixed movu -> mova comment from Loren
---
 libavcodec/x86/dsputilenc.asm   |  152 +
 libavcodec/x86/dsputilenc_mmx.c |  201 ---
 2 files changed, 172 insertions(+), 181 deletions(-)

diff --git a/libavcodec/x86/dsputilenc.asm b/libavcodec/x86/dsputilenc.asm
index a2cb7f9..7b8763c 100644
--- a/libavcodec/x86/dsputilenc.asm
+++ b/libavcodec/x86/dsputilenc.asm
@@ -333,3 +333,155 @@ cglobal sse16, 5, 5, 8
 paddd m7, m1
 movd eax, m7 ; return value
 RET
+
+INIT_MMX mmx
+; get_pixels_mmx(DCTELEM *block, const uint8_t *pixels, int line_size)
+cglobal get_pixels, 3,4
+movsxdifnidn r2, r2d
+add  r0, 128
+mov  r3, -128
+pxor m7, m7
+.loop:
+mova m0, [r1]
+mova m2, [r1+r2]
+mova m1, m0
+mova m3, m2
+punpcklbwm0, m7
+punpckhbwm1, m7
+punpcklbwm2, m7
+punpckhbwm3, m7
+mova [r0+r3+ 0], m0
+mova [r0+r3+ 8], m1
+mova [r0+r3+16], m2
+mova [r0+r3+24], m3
+lea  r1, [r1+r2*2]
+add  r3, 32
+js .loop
+REP_RET
+
+INIT_XMM sse2
+cglobal get_pixels, 3, 4
+movsxdifnidn r2, r2d
+lea  r3, [r2*3]
+pxor m4, m4
+movh m0, [r1]
+movh m1, [r1+r2]
+movh m2, [r1+r2*2]
+movh m3, [r1+r3]
+lea  r1, [r1+r2*4]
+punpcklbwm0, m4
+punpcklbwm1, m4
+punpcklbwm2, m4
+punpcklbwm3, m4
+mova   [r0], m0
+mova  [r0+0x10], m1
+mova  [r0+0x20], m2
+mova  [r0+0x30], m3
+movh m0, [r1]
+movh m1, [r1+r2*1]
+movh m2, [r1+r2*2]
+movh m3, [r1+r3]
+punpcklbwm0, m4
+punpcklbwm1, m4
+punpcklbwm2, m4
+punpcklbwm3, m4
+mova  [r0+0x40], m0
+mova  [r0+0x50], m1
+mova  [r0+0x60], m2
+mova  [r0+0x70], m3
+RET
+
+INIT_MMX mmx
+; diff_pixels_mmx(DCTELEM *block, const uint8_t *s1, const uint8_t *s2, stride)
+cglobal diff_pixels, 4,5
+movsxdifnidn r3, r3d
+pxor m7, m7
+add  r0,  128
+mov  r4, -128
+.loop:
+mova m0, [r1]
+mova m2, [r2]
+mova m1, m0
+mova m3, m2
+punpcklbwm0, m7
+punpckhbwm1, m7
+punpcklbwm2, m7
+punpckhbwm3, m7
+psubwm0, m2
+psubwm1, m3
+mova  [r0+r4+0], m0
+mova  [r0+r4+8], m1
+add  r1, r3
+add  r2, r3
+add  r4, 16
+jne .loop
+REP_RET
+
+INIT_MMX mmx
+; pix_sum16_mmx(uint8_t * pix, int line_size)
+cglobal pix_sum16, 2, 3
+movsxdifnidn r1, r1d
+mov  r2, r1
+neg  r2
+shl  r2, 4
+sub  r0, r2
+pxor m7, m7
+pxor m6, m6
+.loop:
+mova m0, [r0+r2+0]
+mova m1, [r0+r2+0]
+mova m2, [r0+r2+8]
+mova m3, [r0+r2+8]
+punpcklbw m0, m7
+punpckhbw m1, m7
+punpcklbw m2, m7
+punpckhbw m3, m7
+paddw m1, m0
+paddw m3, m2
+paddw m3, m1
+paddw m6, m3
+add  r2, r1
+js .loop
+mova m5, m6
+psrlq m6, 32
+paddw m6, m5
+mova m5, m6
+psrlq m6, 16
+paddw m6, m5
+movd eax, m6
+and eax, 0xffff
+RET
+
+INIT_MMX mmx
+; pix_norm1_mmx(uint8_t *pix, int line_size)
+cglobal pix_norm1, 2, 4
+movsxdifnidn r1, r1d
+mov  r2, 16
+pxor m0, m0
+pxor m7, m7
+.loop:
+mova m2, [r0+0]
+mova m3, [r0+8]
+mova m1, m2
+punpckhbwm1, m0
+punpcklbwm2, m0
+mova m4, m3
+punpckhbwm3, m0
+punpcklbwm4, m0
+pmaddwd  m1, m1
+pmaddwd  m2, m2
+pmaddwd  m3, m3
+pmaddwd  m4, m4
+padddm2, m1
+padddm4, m3
+padddm7, m2
+add  r0, r1
+padddm7, m4
+dec r2
+jne .loop
+mova m1, m7
+psrlqm7, 32
+padddm1, m7
+movdeax, m1
+RET
+
diff --git a/libavcodec/x86/dsputilenc_mmx.c b/libavcodec/x86/dsputilenc_mmx.c
index e5d2473..fa126d6 100644
--- a/libavcodec/x86/dsputilenc_mmx.c
+++ b/libavcodec/x86/dsputilenc_mmx.c
@@ -30,181 +30,14 @@
 #include libavcodec/mathops.h
 #include dsputil_mmx.h
 
+void ff_get_pixels_mmx(DCTELEM *block, const uint8_t *pixels, int line_size);
+void ff_get_pixels_sse2(DCTELEM *block, const uint8_t *pixels, int line_size);
+void ff_diff_pixels_mmx(DCTELEM *block, const uint8_t *s1, const uint8_t *s2, 
int stride);
+int ff_pix_sum16_mmx(uint8_t * pix, int line_size);
+int ff_pix_norm1_mmx(uint8_t *pix, int line_size);
 
 #if HAVE_INLINE_ASM
 
-static void get_pixels_mmx(DCTELEM *block, const uint8_t *pixels, int 
line_size)
-{
-__asm__ volatile(
-mov 

[libav-devel] [PATCH] dsputilenc: x86: Convert pixel inline asm to yasm

2013-01-14 Thread Daniel Kang
---
Tested on a variety of configs, but that pesky emms bug prevents full testing 
of x86_32 --disable-asm. So, more testing would be appreciated (and on MSVC).
---
 libavcodec/x86/dsputilenc.asm   |  152 +
 libavcodec/x86/dsputilenc_mmx.c |  201 ---
 2 files changed, 172 insertions(+), 181 deletions(-)

diff --git a/libavcodec/x86/dsputilenc.asm b/libavcodec/x86/dsputilenc.asm
index 3bb1f2f..d5a0206 100644
--- a/libavcodec/x86/dsputilenc.asm
+++ b/libavcodec/x86/dsputilenc.asm
@@ -335,3 +335,155 @@ cglobal sse16, 5, 5, 8
 paddd m7, m1
 movd eax, m7 ; return value
 RET
+
+INIT_MMX mmx
+; get_pixels_mmx(DCTELEM *block, const uint8_t *pixels, int line_size)
+cglobal get_pixels, 3,4
+movsxdifnidn r2, r2d
+add  r0, 128
+mov  r3, -128
+pxor m7, m7
+.loop:
+movu m0, [r1]
+movu m2, [r1+r2]
+movu m1, m0
+movu m3, m2
+punpcklbwm0, m7
+punpckhbwm1, m7
+punpcklbwm2, m7
+punpckhbwm3, m7
+movu [r0+r3+ 0], m0
+movu [r0+r3+ 8], m1
+movu [r0+r3+16], m2
+movu [r0+r3+24], m3
+lea  r1, [r1+r2*2]
+add  r3, 32
+js .loop
+REP_RET
+
+INIT_XMM sse2
+cglobal get_pixels, 3, 4
+movsxdifnidn r2, r2d
+lea  r3, [r2*3]
+pxor m4, m4
+movh m0, [r1]
+movh m1, [r1+r2]
+movh m2, [r1+r2*2]
+movh m3, [r1+r3]
+lea  r1, [r1+r2*4]
+punpcklbwm0, m4
+punpcklbwm1, m4
+punpcklbwm2, m4
+punpcklbwm3, m4
+mova   [r0], m0
+mova  [r0+0x10], m1
+mova  [r0+0x20], m2
+mova  [r0+0x30], m3
+movh m0, [r1]
+movh m1, [r1+r2*1]
+movh m2, [r1+r2*2]
+movh m3, [r1+r3]
+punpcklbwm0, m4
+punpcklbwm1, m4
+punpcklbwm2, m4
+punpcklbwm3, m4
+mova  [r0+0x40], m0
+mova  [r0+0x50], m1
+mova  [r0+0x60], m2
+mova  [r0+0x70], m3
+RET
+
+INIT_MMX mmx
+; diff_pixels_mmx(DCTELEM *block, const uint8_t *s1, const uint8_t *s2, stride)
+cglobal diff_pixels, 4,5
+movsxdifnidn r3, r3d
+pxor m7, m7
+add  r0,  128
+mov  r4, -128
+.loop:
+mova m0, [r1]
+mova m2, [r2]
+mova m1, m0
+mova m3, m2
+punpcklbwm0, m7
+punpckhbwm1, m7
+punpcklbwm2, m7
+punpckhbwm3, m7
+psubwm0, m2
+psubwm1, m3
+mova  [r0+r4+0], m0
+mova  [r0+r4+8], m1
+add  r1, r3
+add  r2, r3
+add  r4, 16
+jne .loop
+REP_RET
+
+INIT_MMX mmx
+; pix_sum16_mmx(uint8_t * pix, int line_size)
+cglobal pix_sum16, 2, 3
+movsxdifnidn r1, r1d
+mov  r2, r1
+neg  r2
+shl  r2, 4
+sub  r0, r2
+pxor m7, m7
+pxor m6, m6
+.loop:
+mova m0, [r0+r2+0]
+mova m1, [r0+r2+0]
+mova m2, [r0+r2+8]
+mova m3, [r0+r2+8]
+punpcklbw m0, m7
+punpckhbw m1, m7
+punpcklbw m2, m7
+punpckhbw m3, m7
+paddw m1, m0
+paddw m3, m2
+paddw m3, m1
+paddw m6, m3
+add  r2, r1
+js .loop
+mova m5, m6
+psrlq m6, 32
+paddw m6, m5
+mova m5, m6
+psrlq m6, 16
+paddw m6, m5
+movd eax, m6
+and eax, 0xffff
+RET
+
+INIT_MMX mmx
+; pix_norm1_mmx(uint8_t *pix, int line_size)
+cglobal pix_norm1, 2, 4
+movsxdifnidn r1, r1d
+mov  r2, 16
+pxor m0, m0
+pxor m7, m7
+.loop:
+mova m2, [r0+0]
+mova m3, [r0+8]
+mova m1, m2
+punpckhbwm1, m0
+punpcklbwm2, m0
+mova m4, m3
+punpckhbwm3, m0
+punpcklbwm4, m0
+pmaddwd  m1, m1
+pmaddwd  m2, m2
+pmaddwd  m3, m3
+pmaddwd  m4, m4
+padddm2, m1
+padddm4, m3
+padddm7, m2
+add  r0, r1
+padddm7, m4
+dec r2
+jne .loop
+mova m1, m7
+psrlqm7, 32
+padddm1, m7
+movdeax, m1
+RET
+
diff --git a/libavcodec/x86/dsputilenc_mmx.c b/libavcodec/x86/dsputilenc_mmx.c
index e5d2473..fa126d6 100644
--- a/libavcodec/x86/dsputilenc_mmx.c
+++ b/libavcodec/x86/dsputilenc_mmx.c
@@ -30,181 +30,14 @@
 #include libavcodec/mathops.h
 #include dsputil_mmx.h
 
+void ff_get_pixels_mmx(DCTELEM *block, const uint8_t *pixels, int line_size);
+void ff_get_pixels_sse2(DCTELEM *block, const uint8_t *pixels, int line_size);
+void ff_diff_pixels_mmx(DCTELEM *block, const uint8_t *s1, const uint8_t *s2, 
int stride);
+int ff_pix_sum16_mmx(uint8_t * pix, int line_size);
+int ff_pix_norm1_mmx(uint8_t *pix, int line_size);
 
 #if HAVE_INLINE_ASM
 

[libav-devel] [PATCH v4] yadif: x86: fix build for compilers without aligned stack

2013-01-13 Thread Daniel Kang
Manually load registers to avoid using 8 registers with
compilers that do not align the stack (e.g. MSVC).
---
Now with named args.
---
 libavfilter/x86/yadif.asm |   56 +++--
 1 file changed, 34 insertions(+), 22 deletions(-)

diff --git a/libavfilter/x86/yadif.asm b/libavfilter/x86/yadif.asm
index 5e406a4..bc4b3ce 100644
--- a/libavfilter/x86/yadif.asm
+++ b/libavfilter/x86/yadif.asm
@@ -31,8 +31,8 @@ pw_1: times  8 dw 1
 SECTION .text
 
 %macro CHECK 2
-movu  m2, [curq+mrefsq+%1]
-movu  m3, [curq+prefsq+%2]
+movu  m2, [curq+t1+%1]
+movu  m3, [curq+t0+%2]
 mova  m4, m2
 mova  m5, m2
 pxor  m4, m3
@@ -97,8 +97,8 @@ SECTION .text
 %macro FILTER 3
 .loop%1:
 pxor m7, m7
-LOAD  0, [curq+mrefsq]
-LOAD  1, [curq+prefsq]
+LOAD  0, [curq+t1]
+LOAD  1, [curq+t0]
 LOAD  2, [%2]
 LOAD  3, [%3]
 mova m4, m3
@@ -109,8 +109,8 @@ SECTION .text
 mova   [rsp+32], m1
 psubwm2, m4
 ABS1 m2, m4
-LOAD  3, [prevq+mrefsq]
-LOAD  4, [prevq+prefsq]
+LOAD  3, [prevq+t1]
+LOAD  4, [prevq+t0]
 psubwm3, m0
 psubwm4, m1
 ABS1 m3, m5
@@ -119,8 +119,8 @@ SECTION .text
 psrlwm2, 1
 psrlwm3, 1
 pmaxsw   m2, m3
-LOAD  3, [nextq+mrefsq]
-LOAD  4, [nextq+prefsq]
+LOAD  3, [nextq+t1]
+LOAD  4, [nextq+t0]
 psubwm3, m0
 psubwm4, m1
 ABS1 m3, m5
@@ -136,8 +136,8 @@ SECTION .text
 psrlwm1, 1
 ABS1 m0, m2
 
-movu m2, [curq+mrefsq-1]
-movu m3, [curq+prefsq-1]
+movu m2, [curq+t1-1]
+movu m3, [curq+t0-1]
 mova m4, m2
 psubusb  m2, m3
 psubusb  m3, m4
@@ -164,12 +164,12 @@ SECTION .text
 CHECK2
 
 mova m6, [rsp+48]
-cmp DWORD modem, 2
+cmp   DWORD r8m, 2
 jge .end%1
-LOAD  2, [%2+mrefsq*2]
-LOAD  4, [%3+mrefsq*2]
-LOAD  3, [%2+prefsq*2]
-LOAD  5, [%3+prefsq*2]
+LOAD  2, [%2+t1*2]
+LOAD  4, [%3+t1*2]
+LOAD  3, [%2+t0*2]
+LOAD  5, [%3+t0*2]
 paddwm2, m4
 paddwm3, m5
 psrlwm2, 1
@@ -208,19 +208,31 @@ SECTION .text
 add   prevq, mmsize/2
 addcurq, mmsize/2
 add   nextq, mmsize/2
-sub  wd, mmsize/2
+sub   DWORD r4m, mmsize/2
 jg .loop%1
 %endmacro
 
 %macro YADIF 0
-cglobal yadif_filter_line, 7, 7, 8, 16*5, dst, prev, cur, next, w, prefs, \
-  mrefs, parity, mode
-test wq, wq
+%if ARCH_X86_32
+cglobal yadif_filter_line, 4, 6, 8, 80, dst, prev, cur, next, w, prefs, \
+mrefs, parity, mode
+%else
+cglobal yadif_filter_line, 4, 7, 8, 80, dst, prev, cur, next, w, prefs, \
+mrefs, parity, mode
+%endif
+cmp  DWORD wm, 0
 jle .ret
-movsxdifnidn prefsq, prefsd
-movsxdifnidn mrefsq, mrefsd
+%if ARCH_X86_32
+movr4, r5mp
+movr5, r6mp
+DECLARE_REG_TMP 4,5
+%else
+movsxd r5, DWORD r5m
+movsxd r6, DWORD r6m
+DECLARE_REG_TMP 5,6
+%endif
 
-cmp   DWORD paritym, 0
+cmp DWORD paritym, 0
 je .parity0
 FILTER 1, prevq, curq
 jmp .ret
-- 
1.7.10.4

___
libav-devel mailing list
libav-devel@libav.org
https://lists.libav.org/mailman/listinfo/libav-devel


[libav-devel] [PATCH] yadif: x86: fix build for compilers without aligned stack

2013-01-13 Thread Daniel Kang
Manually load registers to avoid using 8 registers on x86_32 with
compilers that do not align the stack (e.g. MSVC).
---
Add which platform it fixes
---
 libavfilter/x86/yadif.asm |   56 +++--
 1 file changed, 34 insertions(+), 22 deletions(-)

diff --git a/libavfilter/x86/yadif.asm b/libavfilter/x86/yadif.asm
index 5e406a4..bc4b3ce 100644
--- a/libavfilter/x86/yadif.asm
+++ b/libavfilter/x86/yadif.asm
@@ -31,8 +31,8 @@ pw_1: times  8 dw 1
 SECTION .text
 
 %macro CHECK 2
-movu  m2, [curq+mrefsq+%1]
-movu  m3, [curq+prefsq+%2]
+movu  m2, [curq+t1+%1]
+movu  m3, [curq+t0+%2]
 mova  m4, m2
 mova  m5, m2
 pxor  m4, m3
@@ -97,8 +97,8 @@ SECTION .text
 %macro FILTER 3
 .loop%1:
 pxor m7, m7
-LOAD  0, [curq+mrefsq]
-LOAD  1, [curq+prefsq]
+LOAD  0, [curq+t1]
+LOAD  1, [curq+t0]
 LOAD  2, [%2]
 LOAD  3, [%3]
 mova m4, m3
@@ -109,8 +109,8 @@ SECTION .text
 mova   [rsp+32], m1
 psubwm2, m4
 ABS1 m2, m4
-LOAD  3, [prevq+mrefsq]
-LOAD  4, [prevq+prefsq]
+LOAD  3, [prevq+t1]
+LOAD  4, [prevq+t0]
 psubwm3, m0
 psubwm4, m1
 ABS1 m3, m5
@@ -119,8 +119,8 @@ SECTION .text
 psrlwm2, 1
 psrlwm3, 1
 pmaxsw   m2, m3
-LOAD  3, [nextq+mrefsq]
-LOAD  4, [nextq+prefsq]
+LOAD  3, [nextq+t1]
+LOAD  4, [nextq+t0]
 psubwm3, m0
 psubwm4, m1
 ABS1 m3, m5
@@ -136,8 +136,8 @@ SECTION .text
 psrlwm1, 1
 ABS1 m0, m2
 
-movu m2, [curq+mrefsq-1]
-movu m3, [curq+prefsq-1]
+movu m2, [curq+t1-1]
+movu m3, [curq+t0-1]
 mova m4, m2
 psubusb  m2, m3
 psubusb  m3, m4
@@ -164,12 +164,12 @@ SECTION .text
 CHECK2
 
 mova m6, [rsp+48]
-cmp DWORD modem, 2
+cmp   DWORD r8m, 2
 jge .end%1
-LOAD  2, [%2+mrefsq*2]
-LOAD  4, [%3+mrefsq*2]
-LOAD  3, [%2+prefsq*2]
-LOAD  5, [%3+prefsq*2]
+LOAD  2, [%2+t1*2]
+LOAD  4, [%3+t1*2]
+LOAD  3, [%2+t0*2]
+LOAD  5, [%3+t0*2]
 paddwm2, m4
 paddwm3, m5
 psrlwm2, 1
@@ -208,19 +208,31 @@ SECTION .text
 add   prevq, mmsize/2
 addcurq, mmsize/2
 add   nextq, mmsize/2
-sub  wd, mmsize/2
+sub   DWORD r4m, mmsize/2
 jg .loop%1
 %endmacro
 
 %macro YADIF 0
-cglobal yadif_filter_line, 7, 7, 8, 16*5, dst, prev, cur, next, w, prefs, \
-  mrefs, parity, mode
-test wq, wq
+%if ARCH_X86_32
+cglobal yadif_filter_line, 4, 6, 8, 80, dst, prev, cur, next, w, prefs, \
+mrefs, parity, mode
+%else
+cglobal yadif_filter_line, 4, 7, 8, 80, dst, prev, cur, next, w, prefs, \
+mrefs, parity, mode
+%endif
+cmp  DWORD wm, 0
 jle .ret
-movsxdifnidn prefsq, prefsd
-movsxdifnidn mrefsq, mrefsd
+%if ARCH_X86_32
+movr4, r5mp
+movr5, r6mp
+DECLARE_REG_TMP 4,5
+%else
+movsxd r5, DWORD r5m
+movsxd r6, DWORD r6m
+DECLARE_REG_TMP 5,6
+%endif
 
-cmp   DWORD paritym, 0
+cmp DWORD paritym, 0
 je .parity0
 FILTER 1, prevq, curq
 jmp .ret
-- 
1.7.10.4

___
libav-devel mailing list
libav-devel@libav.org
https://lists.libav.org/mailman/listinfo/libav-devel


[libav-devel] [PATCH] yadif: Fix bug with x86_32 MSVC

2013-01-12 Thread Daniel Kang
---
More testing would be appreciated
---
 libavfilter/x86/yadif.asm |   83 -
 1 file changed, 45 insertions(+), 38 deletions(-)

diff --git a/libavfilter/x86/yadif.asm b/libavfilter/x86/yadif.asm
index 5e406a4..da0d6eb 100644
--- a/libavfilter/x86/yadif.asm
+++ b/libavfilter/x86/yadif.asm
@@ -31,8 +31,8 @@ pw_1: times  8 dw 1
 SECTION .text
 
 %macro CHECK 2
-movu  m2, [curq+mrefsq+%1]
-movu  m3, [curq+prefsq+%2]
+movu  m2, [r2+t1+%1]
+movu  m3, [r2+t0+%2]
 mova  m4, m2
 mova  m5, m2
 pxor  m4, m3
@@ -97,20 +97,20 @@ SECTION .text
 %macro FILTER 3
 .loop%1:
 pxor m7, m7
-LOAD  0, [curq+mrefsq]
-LOAD  1, [curq+prefsq]
+LOAD  0, [r2+t1]
+LOAD  1, [r2+t0]
 LOAD  2, [%2]
 LOAD  3, [%3]
 mova m4, m3
 paddwm3, m2
 psrawm3, 1
-mova   [rsp+ 0], m0
-mova   [rsp+16], m3
-mova   [rsp+32], m1
+mova   [rsp- 0], m0
+mova   [rsp-16], m3
+mova   [rsp-32], m1
 psubwm2, m4
 ABS1 m2, m4
-LOAD  3, [prevq+mrefsq]
-LOAD  4, [prevq+prefsq]
+LOAD  3, [r1+t1]
+LOAD  4, [r1+t0]
 psubwm3, m0
 psubwm4, m1
 ABS1 m3, m5
@@ -119,8 +119,8 @@ SECTION .text
 psrlwm2, 1
 psrlwm3, 1
 pmaxsw   m2, m3
-LOAD  3, [nextq+mrefsq]
-LOAD  4, [nextq+prefsq]
+LOAD  3, [r3+t1]
+LOAD  4, [r3+t0]
 psubwm3, m0
 psubwm4, m1
 ABS1 m3, m5
@@ -128,7 +128,7 @@ SECTION .text
 paddwm3, m4
 psrlwm3, 1
 pmaxsw   m2, m3
-mova   [rsp+48], m2
+mova   [rsp-48], m2
 
 paddwm1, m0
 paddwm0, m0
@@ -136,8 +136,8 @@ SECTION .text
 psrlwm1, 1
 ABS1 m0, m2
 
-movu m2, [curq+mrefsq-1]
-movu m3, [curq+prefsq-1]
+movu m2, [r2+t1-1]
+movu m3, [r2+t0-1]
 mova m4, m2
 psubusb  m2, m3
 psubusb  m3, m4
@@ -163,20 +163,20 @@ SECTION .text
 CHECK 1, -3
 CHECK2
 
-mova m6, [rsp+48]
-cmp DWORD modem, 2
+mova m6, [rsp-48]
+cmp   DWORD r8m, 2
 jge .end%1
-LOAD  2, [%2+mrefsq*2]
-LOAD  4, [%3+mrefsq*2]
-LOAD  3, [%2+prefsq*2]
-LOAD  5, [%3+prefsq*2]
+LOAD  2, [%2+t1*2]
+LOAD  4, [%3+t1*2]
+LOAD  3, [%2+t0*2]
+LOAD  5, [%3+t0*2]
 paddwm2, m4
 paddwm3, m5
 psrlwm2, 1
 psrlwm3, 1
-mova m4, [rsp+ 0]
-mova m5, [rsp+16]
-mova m7, [rsp+32]
+mova m4, [rsp- 0]
+mova m5, [rsp-16]
+mova m7, [rsp-32]
 psubwm2, m4
 psubwm3, m7
 mova m0, m5
@@ -195,7 +195,7 @@ SECTION .text
 pmaxsw   m6, m4
 
 .end%1:
-mova m2, [rsp+16]
+mova m2, [rsp-16]
 mova m3, m2
 psubwm2, m6
 paddwm3, m6
@@ -203,30 +203,37 @@ SECTION .text
 pminsw   m1, m3
 packuswb m1, m1
 
-movh [dstq], m1
-adddstq, mmsize/2
-add   prevq, mmsize/2
-addcurq, mmsize/2
-add   nextq, mmsize/2
-sub  wd, mmsize/2
+movh   [r0], m1
+add  r0, mmsize/2
+add  r1, mmsize/2
+add  r2, mmsize/2
+add  r3, mmsize/2
+sub   DWORD r4m, mmsize/2
 jg .loop%1
 %endmacro
 
+%assign PAD -1*80
 %macro YADIF 0
-cglobal yadif_filter_line, 7, 7, 8, 16*5, dst, prev, cur, next, w, prefs, \
-  mrefs, parity, mode
-test wq, wq
+cglobal yadif_filter_line, 4, 7, 8, PAD
+cmp DWORD r4m, 0
 jle .ret
-movsxdifnidn prefsq, prefsd
-movsxdifnidn mrefsq, mrefsd
+%if ARCH_X86_32
+movifnidn  r4, r5mp
+movifnidn  r5, r6mp
+   DECLARE_REG_TMP 4,5
+%else
+movsxdifnidn r5, DWORD r5m
+movsxdifnidn r6, DWORD r6m
+DECLARE_REG_TMP 5,6
+%endif
 
-cmp   DWORD paritym, 0
+cmp DWORD r7m, 0
 je .parity0
-FILTER 1, prevq, curq
+FILTER 1, r1, r2
 jmp .ret
 
 .parity0:
-FILTER 0, curq, nextq
+FILTER 0, r2, r3
 
 .ret:
 RET
-- 
1.7.10.4

___
libav-devel mailing list
libav-devel@libav.org
https://lists.libav.org/mailman/listinfo/libav-devel


[libav-devel] [PATCH v2] yadif: x86: fix building with automatic stack alignment

2013-01-12 Thread Daniel Kang
Manually reload registers to avoid trying to use 8 registers with
compilers that do not align the stack. MSVC is among those.
---
Update based on suggestion and commit message.
---
 libavfilter/x86/yadif.asm |   83 -
 1 file changed, 45 insertions(+), 38 deletions(-)

diff --git a/libavfilter/x86/yadif.asm b/libavfilter/x86/yadif.asm
index 5e406a4..d2b4be5 100644
--- a/libavfilter/x86/yadif.asm
+++ b/libavfilter/x86/yadif.asm
@@ -31,8 +31,8 @@ pw_1: times  8 dw 1
 SECTION .text
 
 %macro CHECK 2
-movu  m2, [curq+mrefsq+%1]
-movu  m3, [curq+prefsq+%2]
+movu  m2, [r2+t1+%1]
+movu  m3, [r2+t0+%2]
 mova  m4, m2
 mova  m5, m2
 pxor  m4, m3
@@ -97,20 +97,20 @@ SECTION .text
 %macro FILTER 3
 .loop%1:
 pxor m7, m7
-LOAD  0, [curq+mrefsq]
-LOAD  1, [curq+prefsq]
+LOAD  0, [r2+t1]
+LOAD  1, [r2+t0]
 LOAD  2, [%2]
 LOAD  3, [%3]
 mova m4, m3
 paddwm3, m2
 psrawm3, 1
-mova   [rsp+ 0], m0
-mova   [rsp+16], m3
-mova   [rsp+32], m1
+mova   [rsp- 0], m0
+mova   [rsp-16], m3
+mova   [rsp-32], m1
 psubwm2, m4
 ABS1 m2, m4
-LOAD  3, [prevq+mrefsq]
-LOAD  4, [prevq+prefsq]
+LOAD  3, [r1+t1]
+LOAD  4, [r1+t0]
 psubwm3, m0
 psubwm4, m1
 ABS1 m3, m5
@@ -119,8 +119,8 @@ SECTION .text
 psrlwm2, 1
 psrlwm3, 1
 pmaxsw   m2, m3
-LOAD  3, [nextq+mrefsq]
-LOAD  4, [nextq+prefsq]
+LOAD  3, [r3+t1]
+LOAD  4, [r3+t0]
 psubwm3, m0
 psubwm4, m1
 ABS1 m3, m5
@@ -128,7 +128,7 @@ SECTION .text
 paddwm3, m4
 psrlwm3, 1
 pmaxsw   m2, m3
-mova   [rsp+48], m2
+mova   [rsp-48], m2
 
 paddwm1, m0
 paddwm0, m0
@@ -136,8 +136,8 @@ SECTION .text
 psrlwm1, 1
 ABS1 m0, m2
 
-movu m2, [curq+mrefsq-1]
-movu m3, [curq+prefsq-1]
+movu m2, [r2+t1-1]
+movu m3, [r2+t0-1]
 mova m4, m2
 psubusb  m2, m3
 psubusb  m3, m4
@@ -163,20 +163,20 @@ SECTION .text
 CHECK 1, -3
 CHECK2
 
-mova m6, [rsp+48]
-cmp DWORD modem, 2
+mova m6, [rsp-48]
+cmp   DWORD r8m, 2
 jge .end%1
-LOAD  2, [%2+mrefsq*2]
-LOAD  4, [%3+mrefsq*2]
-LOAD  3, [%2+prefsq*2]
-LOAD  5, [%3+prefsq*2]
+LOAD  2, [%2+t1*2]
+LOAD  4, [%3+t1*2]
+LOAD  3, [%2+t0*2]
+LOAD  5, [%3+t0*2]
 paddwm2, m4
 paddwm3, m5
 psrlwm2, 1
 psrlwm3, 1
-mova m4, [rsp+ 0]
-mova m5, [rsp+16]
-mova m7, [rsp+32]
+mova m4, [rsp- 0]
+mova m5, [rsp-16]
+mova m7, [rsp-32]
 psubwm2, m4
 psubwm3, m7
 mova m0, m5
@@ -195,7 +195,7 @@ SECTION .text
 pmaxsw   m6, m4
 
 .end%1:
-mova m2, [rsp+16]
+mova m2, [rsp-16]
 mova m3, m2
 psubwm2, m6
 paddwm3, m6
@@ -203,30 +203,37 @@ SECTION .text
 pminsw   m1, m3
 packuswb m1, m1
 
-movh [dstq], m1
-adddstq, mmsize/2
-add   prevq, mmsize/2
-addcurq, mmsize/2
-add   nextq, mmsize/2
-sub  wd, mmsize/2
+movh   [r0], m1
+add  r0, mmsize/2
+add  r1, mmsize/2
+add  r2, mmsize/2
+add  r3, mmsize/2
+sub   DWORD r4m, mmsize/2
 jg .loop%1
 %endmacro
 
+%assign PAD -1*80
 %macro YADIF 0
-cglobal yadif_filter_line, 7, 7, 8, 16*5, dst, prev, cur, next, w, prefs, \
-  mrefs, parity, mode
-test wq, wq
+cglobal yadif_filter_line, 4, 7, 8, PAD
+cmp   DWORD r4m, 0
 jle .ret
-movsxdifnidn prefsq, prefsd
-movsxdifnidn mrefsq, mrefsd
+%if ARCH_X86_32
+mov  r4, r5mp
+mov  r5, r6mp
+DECLARE_REG_TMP 4,5
+%else
+movsxdifnidn r5, DWORD r5m
+movsxdifnidn r6, DWORD r6m
+DECLARE_REG_TMP 5,6
+%endif
 
-cmp   DWORD paritym, 0
+cmp   DWORD r7m, 0
 je .parity0
-FILTER 1, prevq, curq
+FILTER 1, r1, r2
 jmp .ret
 
 .parity0:
-FILTER 0, curq, nextq
+FILTER 0, r2, r3
 
 .ret:
 RET
-- 
1.7.10.4

___
libav-devel mailing list
libav-devel@libav.org
https://lists.libav.org/mailman/listinfo/libav-devel


Re: [libav-devel] [PATCH v2] yadif: x86: fix building with automatic stack alignment

2013-01-12 Thread Daniel Kang
On Sat, Jan 12, 2013 at 4:13 PM, Ronald S. Bultje rsbul...@gmail.com wrote:
 Hi,

 On Sat, Jan 12, 2013 at 12:03 PM, Daniel Kang daniel.d.k...@gmail.com wrote:
 Manually reload registers to avoid trying to use 8 registers with
 compilers that do not align the stack. MSVC is among those.
 [..]
 -adddstq, mmsize/2
 -add   prevq, mmsize/2
 -addcurq, mmsize/2
 -add   nextq, mmsize/2
 -sub  wd, mmsize/2
 [..]
 +add  r0, mmsize/2
 +add  r1, mmsize/2
 +add  r2, mmsize/2
 +add  r3, mmsize/2
 +sub   DWORD r4m, mmsize/2
 [..]
 -cglobal yadif_filter_line, 7, 7, 8, 16*5, dst, prev, cur, next, w, prefs, \
 -  mrefs, parity, mode
 [..]
 +cglobal yadif_filter_line, 4, 7, 8, PAD

 Do you think it's possible to somehow keep the named arguments, at
 least for the ones where you load arguments from the stack (mrefsm,
 paritym, etc) and for the first 3-4 arguments that are not temp-based?

I tried when making the patch and I think it makes things more confusing.
___
libav-devel mailing list
libav-devel@libav.org
https://lists.libav.org/mailman/listinfo/libav-devel


Re: [libav-devel] [PATCH] yadif: Fix bug with x86_32 MSVC

2013-01-12 Thread Daniel Kang
On Sat, Jan 12, 2013 at 10:36 PM, Loren Merritt lor...@u.washington.edu wrote:
 On Sat, 12 Jan 2013, Daniel Kang wrote:

 -mova   [rsp+ 0], m0
 -mova   [rsp+16], m3
 -mova   [rsp+32], m1
 +mova   [rsp- 0], m0
 +mova   [rsp-16], m3
 +mova   [rsp-32], m1

 You can't do that on x86_32.

What do I do instead?

Also, this seemed to work in my tests; why won't it work on x86_32?

 +%assign PAD -1*80

 Unused?

cglobal errors if I put -80 for the stack space.

  %macro YADIF 0
 -cglobal yadif_filter_line, 7, 7, 8, 16*5, dst, prev, cur, next, w, prefs, \
 -  mrefs, parity, mode
 -test wq, wq
 +cglobal yadif_filter_line, 4, 7, 8, PAD

 Do you have a reason for removing all the named args?

I can't use half of the named args, and I thought it was less
confusing if I didn't use them at all.

 +cmp DWORD r4m, 0
  jle .ret
 -movsxdifnidn prefsq, prefsd
 -movsxdifnidn mrefsq, mrefsd
 +%if ARCH_X86_32
 +movifnidn  r4, r5mp
 +movifnidn  r5, r6mp
 +DECLARE_REG_TMP 4,5
 +%else
 +movsxdifnidn r5, DWORD r5m
 +movsxdifnidn r6, DWORD r6m
 +DECLARE_REG_TMP 5,6
 +%endif

 No ifnidn. After your change, they will not in fact be identical.

Will fix.
___
libav-devel mailing list
libav-devel@libav.org
https://lists.libav.org/mailman/listinfo/libav-devel


[libav-devel] [PATCH v3] yadif: x86: fix build for compilers without aligned stack

2013-01-12 Thread Daniel Kang
Manually load registers to avoid using 8 registers with
compilers that do not align the stack (e.g. MSVC).
---
Better commit message and avoid redzone (Loren's comments).
---
 libavfilter/x86/yadif.asm |   68 ++---
 1 file changed, 39 insertions(+), 29 deletions(-)

diff --git a/libavfilter/x86/yadif.asm b/libavfilter/x86/yadif.asm
index 5e406a4..adfd3db 100644
--- a/libavfilter/x86/yadif.asm
+++ b/libavfilter/x86/yadif.asm
@@ -31,8 +31,8 @@ pw_1: times  8 dw 1
 SECTION .text
 
 %macro CHECK 2
-movu  m2, [curq+mrefsq+%1]
-movu  m3, [curq+prefsq+%2]
+movu  m2, [r2+t1+%1]
+movu  m3, [r2+t0+%2]
 mova  m4, m2
 mova  m5, m2
 pxor  m4, m3
@@ -97,8 +97,8 @@ SECTION .text
 %macro FILTER 3
 .loop%1:
 pxor m7, m7
-LOAD  0, [curq+mrefsq]
-LOAD  1, [curq+prefsq]
+LOAD  0, [r2+t1]
+LOAD  1, [r2+t0]
 LOAD  2, [%2]
 LOAD  3, [%3]
 mova m4, m3
@@ -109,8 +109,8 @@ SECTION .text
 mova   [rsp+32], m1
 psubwm2, m4
 ABS1 m2, m4
-LOAD  3, [prevq+mrefsq]
-LOAD  4, [prevq+prefsq]
+LOAD  3, [r1+t1]
+LOAD  4, [r1+t0]
 psubwm3, m0
 psubwm4, m1
 ABS1 m3, m5
@@ -119,8 +119,8 @@ SECTION .text
 psrlwm2, 1
 psrlwm3, 1
 pmaxsw   m2, m3
-LOAD  3, [nextq+mrefsq]
-LOAD  4, [nextq+prefsq]
+LOAD  3, [r3+t1]
+LOAD  4, [r3+t0]
 psubwm3, m0
 psubwm4, m1
 ABS1 m3, m5
@@ -136,8 +136,8 @@ SECTION .text
 psrlwm1, 1
 ABS1 m0, m2
 
-movu m2, [curq+mrefsq-1]
-movu m3, [curq+prefsq-1]
+movu m2, [r2+t1-1]
+movu m3, [r2+t0-1]
 mova m4, m2
 psubusb  m2, m3
 psubusb  m3, m4
@@ -164,12 +164,12 @@ SECTION .text
 CHECK2
 
 mova m6, [rsp+48]
-cmp DWORD modem, 2
+cmp   DWORD r8m, 2
 jge .end%1
-LOAD  2, [%2+mrefsq*2]
-LOAD  4, [%3+mrefsq*2]
-LOAD  3, [%2+prefsq*2]
-LOAD  5, [%3+prefsq*2]
+LOAD  2, [%2+t1*2]
+LOAD  4, [%3+t1*2]
+LOAD  3, [%2+t0*2]
+LOAD  5, [%3+t0*2]
 paddwm2, m4
 paddwm3, m5
 psrlwm2, 1
@@ -203,30 +203,40 @@ SECTION .text
 pminsw   m1, m3
 packuswb m1, m1
 
-movh [dstq], m1
-adddstq, mmsize/2
-add   prevq, mmsize/2
-addcurq, mmsize/2
-add   nextq, mmsize/2
-sub  wd, mmsize/2
+movh   [r0], m1
+add  r0, mmsize/2
+add  r1, mmsize/2
+add  r2, mmsize/2
+add  r3, mmsize/2
+sub   DWORD r4m, mmsize/2
 jg .loop%1
 %endmacro
 
 %macro YADIF 0
-cglobal yadif_filter_line, 7, 7, 8, 16*5, dst, prev, cur, next, w, prefs, \
-  mrefs, parity, mode
-test wq, wq
+%if ARCH_X86_32
+cglobal yadif_filter_line, 4, 6, 8, 80
+%else
+cglobal yadif_filter_line, 4, 7, 8, 80
+%endif
+cmp   DWORD r4m, 0
 jle .ret
-movsxdifnidn prefsq, prefsd
-movsxdifnidn mrefsq, mrefsd
+%if ARCH_X86_32
+mov  r4, r5mp
+mov  r5, r6mp
+DECLARE_REG_TMP 4,5
+%else
+movsxd   r5, DWORD r5m
+movsxd   r6, DWORD r6m
+DECLARE_REG_TMP 5,6
+%endif
 
-cmp   DWORD paritym, 0
+cmp   DWORD r7m, 0
 je .parity0
-FILTER 1, prevq, curq
+FILTER 1, r1, r2
 jmp .ret
 
 .parity0:
-FILTER 0, curq, nextq
+FILTER 0, r2, r3
 
 .ret:
 RET
-- 
1.7.10.4

___
libav-devel mailing list
libav-devel@libav.org
https://lists.libav.org/mailman/listinfo/libav-devel


[libav-devel] [PATCH] yadif: Port inline assembly to YASM

2013-01-09 Thread Daniel Kang
---
Updated with Loren's suggestion.
---
 libavfilter/x86/Makefile |3 +-
 libavfilter/x86/yadif.asm|  242 +++
 libavfilter/x86/yadif.c  |   74 ---
 libavfilter/x86/yadif_init.c |   54 
 libavfilter/x86/yadif_template.c |  261 --
 5 files changed, 298 insertions(+), 336 deletions(-)
 create mode 100644 libavfilter/x86/yadif.asm
 delete mode 100644 libavfilter/x86/yadif.c
 create mode 100644 libavfilter/x86/yadif_init.c
 delete mode 100644 libavfilter/x86/yadif_template.c

diff --git a/libavfilter/x86/Makefile b/libavfilter/x86/Makefile
index 0f08e39..47569cf 100644
--- a/libavfilter/x86/Makefile
+++ b/libavfilter/x86/Makefile
@@ -1,6 +1,7 @@
 OBJS-$(CONFIG_GRADFUN_FILTER)+= x86/gradfun.o
 OBJS-$(CONFIG_VOLUME_FILTER) += x86/af_volume_init.o
-OBJS-$(CONFIG_YADIF_FILTER)  += x86/yadif.o
+OBJS-$(CONFIG_YADIF_FILTER)  += x86/yadif_init.o
 
 YASM-OBJS-$(CONFIG_HQDN3D_FILTER)+= x86/hqdn3d.o
 YASM-OBJS-$(CONFIG_VOLUME_FILTER)+= x86/af_volume.o
+YASM-OBJS-$(CONFIG_YADIF_FILTER) += x86/yadif.o
diff --git a/libavfilter/x86/yadif.asm b/libavfilter/x86/yadif.asm
new file mode 100644
index 000..5e406a4
--- /dev/null
+++ b/libavfilter/x86/yadif.asm
@@ -0,0 +1,242 @@
+;*
+;* x86-optimized functions for yadif filter
+;*
+;* Copyright (C) 2006 Michael Niedermayer michae...@gmx.at
+;* Copyright (c) 2013 Daniel Kang daniel.d.k...@gmail.com
+;*
+;* This file is part of Libav.
+;*
+;* Libav is free software; you can redistribute it and/or modify
+;* it under the terms of the GNU General Public License as published by
+;* the Free Software Foundation; either version 2 of the License, or
+;* (at your option) any later version.
+;*
+;* Libav is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+;* GNU General Public License for more details.
+;*
+;* You should have received a copy of the GNU General Public License along
+;* with Libav; if not, write to the Free Software Foundation, Inc.,
+;* 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+;**
+
+%include libavutil/x86/x86util.asm
+
+SECTION_RODATA
+
+pb_1: times 16 db 1
+pw_1: times  8 dw 1
+
+SECTION .text
+
+%macro CHECK 2
+movu  m2, [curq+mrefsq+%1]
+movu  m3, [curq+prefsq+%2]
+mova  m4, m2
+mova  m5, m2
+pxor  m4, m3
+pavgb m5, m3
+pand  m4, [pb_1]
+psubusb   m5, m4
+%if mmsize == 16
+psrldqm5, 1
+%else
+psrlq m5, 8
+%endif
+punpcklbw m5, m7
+mova  m4, m2
+psubusb   m2, m3
+psubusb   m3, m4
+pmaxubm2, m3
+mova  m3, m2
+mova  m4, m2
+%if mmsize == 16
+psrldqm3, 1
+psrldqm4, 2
+%else
+psrlq m3, 8
+psrlq m4, 16
+%endif
+punpcklbw m2, m7
+punpcklbw m3, m7
+punpcklbw m4, m7
+paddw m2, m3
+paddw m2, m4
+%endmacro
+
+%macro CHECK1 0
+movam3, m0
+pcmpgtw m3, m2
+pminsw  m0, m2
+movam6, m3
+pandm5, m3
+pandn   m3, m1
+por m3, m5
+movam1, m3
+%endmacro
+
+%macro CHECK2 0
+paddw   m6, [pw_1]
+psllw   m6, 14
+paddsw  m2, m6
+movam3, m0
+pcmpgtw m3, m2
+pminsw  m0, m2
+pandm5, m3
+pandn   m3, m1
+por m3, m5
+movam1, m3
+%endmacro
+
+%macro LOAD 2
+movh  m%1, %2
+punpcklbw m%1, m7
+%endmacro
+
+%macro FILTER 3
+.loop%1:
+pxor m7, m7
+LOAD  0, [curq+mrefsq]
+LOAD  1, [curq+prefsq]
+LOAD  2, [%2]
+LOAD  3, [%3]
+mova m4, m3
+paddwm3, m2
+psrawm3, 1
+mova   [rsp+ 0], m0
+mova   [rsp+16], m3
+mova   [rsp+32], m1
+psubwm2, m4
+ABS1 m2, m4
+LOAD  3, [prevq+mrefsq]
+LOAD  4, [prevq+prefsq]
+psubwm3, m0
+psubwm4, m1
+ABS1 m3, m5
+ABS1 m4, m5
+paddwm3, m4
+psrlwm2, 1
+psrlwm3, 1
+pmaxsw   m2, m3
+LOAD  3, [nextq+mrefsq]
+LOAD  4, [nextq+prefsq]
+psubwm3, m0
+psubwm4, m1
+ABS1 m3, m5
+ABS1 m4, m5
+paddwm3, m4
+psrlwm3, 1
+pmaxsw   m2, m3
+mova   [rsp+48], m2
+
+paddwm1, m0
+paddwm0, m0
+psubwm0, m1
+psrlwm1, 1
+ABS1 m0, m2
+
+movu m2, [curq+mrefsq-1]
+movu m3, [curq+prefsq-1]
+mova m4, m2
+psubusb  m2, m3
+psubusb  m3, m4
+pmaxub

Re: [libav-devel] [PATCH v2] YADIF: Port inline assembly to YASM

2013-01-08 Thread Daniel Kang
On Sun, Jan 6, 2013 at 12:54 PM, Diego Biurrun di...@biurrun.de wrote:
 On Sun, Jan 06, 2013 at 11:32:15AM -0600, Daniel Kang wrote:
 ---
 Updated to use ABS1
 ---
  libavfilter/x86/Makefile |1 +
  libavfilter/x86/yadif.c  |   60 +++--
  libavfilter/x86/yadif_template.c |  261 
 --
  libavfilter/x86/yadif_yasm.asm   |  241 +++
  4 files changed, 262 insertions(+), 301 deletions(-)
  delete mode 100644 libavfilter/x86/yadif_template.c
  create mode 100644 libavfilter/x86/yadif_yasm.asm

 No further issues to be noticed from me.  Does this pass the test script
 that I gave you?

While the script doesn't work for unrelated reasons, it builds with
every configuration the script tests. It also passes on x86_32 and
x86_64 for the yadif test for me.
___
libav-devel mailing list
libav-devel@libav.org
https://lists.libav.org/mailman/listinfo/libav-devel


Re: [libav-devel] [PATCH v2] YADIF: Port inline assembly to YASM

2013-01-08 Thread Daniel Kang
On Tue, Jan 8, 2013 at 10:04 AM, Daniel Kang daniel.d.k...@gmail.com wrote:
 On Sun, Jan 6, 2013 at 12:54 PM, Diego Biurrun di...@biurrun.de wrote:
 On Sun, Jan 06, 2013 at 11:32:15AM -0600, Daniel Kang wrote:
 ---
 Updated to use ABS1
 ---
  libavfilter/x86/Makefile |1 +
  libavfilter/x86/yadif.c  |   60 +++--
  libavfilter/x86/yadif_template.c |  261 
 --
  libavfilter/x86/yadif_yasm.asm   |  241 +++
  4 files changed, 262 insertions(+), 301 deletions(-)
  delete mode 100644 libavfilter/x86/yadif_template.c
  create mode 100644 libavfilter/x86/yadif_yasm.asm

 No further issues to be noticed from me.  Does this pass the test script
 that I gave you?

 While the script doesn't work for unrelated reasons, it builds with
 every configuration the script tests. It also passes on x86_32 and
 x86_64 for the yadif test for me.

Oops ignore that.
___
libav-devel mailing list
libav-devel@libav.org
https://lists.libav.org/mailman/listinfo/libav-devel


[libav-devel] [PATCH v3] YADIF: Port inline assembly to YASM

2013-01-08 Thread Daniel Kang
\
-psubusb   MM3, MM2 \n\t\
-psubusb   MM4, MM3 \n\t\
-pmaxubMM3, MM2 \n\t\
-PSHUF(MM3, MM2) \
-punpcklbw MM7, MM2 \n\t /* ABS(cur[x-refs-1] - 
cur[x+refs-1]) */\
-punpcklbw MM7, MM3 \n\t /* ABS(cur[x-refs+1] - 
cur[x+refs+1]) */\
-paddw MM2, MM0 \n\t\
-paddw MM3, MM0 \n\t\
-psubwMANGLE(pw_1), MM0 \n\t /* spatial_score */\
-\
-CHECK(-2,0)\
-CHECK1\
-CHECK(-3,1)\
-CHECK2\
-CHECK(0,-2)\
-CHECK1\
-CHECK(1,-3)\
-CHECK2\
-\
-/* if(p-mode2) ... */\
-MOVQ 48(%[tmp]), MM6 \n\t /* diff */\
-cmpl  $2, %[mode] \n\t\
-jge   1f \n\t\
-LOAD((%[prev2],%[mrefs],2), MM2) /* prev2[x-2*refs] */\
-LOAD((%[next2],%[mrefs],2), MM4) /* next2[x-2*refs] */\
-LOAD((%[prev2],%[prefs],2), MM3) /* prev2[x+2*refs] */\
-LOAD((%[next2],%[prefs],2), MM5) /* next2[x+2*refs] */\
-paddw MM4, MM2 \n\t\
-paddw MM5, MM3 \n\t\
-psrlw $1,MM2 \n\t /* b */\
-psrlw $1,MM3 \n\t /* f */\
-MOVQ   (%[tmp]), MM4 \n\t /* c */\
-MOVQ 16(%[tmp]), MM5 \n\t /* d */\
-MOVQ 32(%[tmp]), MM7 \n\t /* e */\
-psubw MM4, MM2 \n\t /* b-c */\
-psubw MM7, MM3 \n\t /* f-e */\
-MOVQ  MM5, MM0 \n\t\
-psubw MM4, MM5 \n\t /* d-c */\
-psubw MM7, MM0 \n\t /* d-e */\
-MOVQ  MM2, MM4 \n\t\
-pminswMM3, MM2 \n\t\
-pmaxswMM4, MM3 \n\t\
-pmaxswMM5, MM2 \n\t\
-pminswMM5, MM3 \n\t\
-pmaxswMM0, MM2 \n\t /* max */\
-pminswMM0, MM3 \n\t /* min */\
-pxor  MM4, MM4 \n\t\
-pmaxswMM3, MM6 \n\t\
-psubw MM2, MM4 \n\t /* -max */\
-pmaxswMM4, MM6 \n\t /* diff= MAX3(diff, min, -max); */\
-1: \n\t\
-\
-MOVQ 16(%[tmp]), MM2 \n\t /* d */\
-MOVQ  MM2, MM3 \n\t\
-psubw MM6, MM2 \n\t /* d-diff */\
-paddw MM6, MM3 \n\t /* d+diff */\
-pmaxswMM2, MM1 \n\t\
-pminswMM3, MM1 \n\t /* d = clip(spatial_pred, d-diff, 
d+diff); */\
-packuswb  MM1, MM1 \n\t\
-\
-::[prev] r(prev),\
- [cur]  r(cur),\
- [next] r(next),\
- [prefs]r((x86_reg)prefs),\
- [mrefs]r((x86_reg)mrefs),\
- [mode] g(mode),\
- [tmp]  r(tmp)\
-);\
-__asm__ volatile(MOV MM1, %0 :=m(*dst));\
-dst += STEP;\
-prev+= STEP;\
-cur += STEP;\
-next+= STEP;\
-}
-
-if (parity) {
-#define prev2 prev
-#define next2 cur
-FILTER
-#undef prev2
-#undef next2
-} else {
-#define prev2 cur
-#define next2 next
-FILTER
-#undef prev2
-#undef next2
-}
-}
-#undef STEP
-#undef MM
-#undef MOV
-#undef MOVQ
-#undef MOVQU
-#undef PSHUF
-#undef PSRL1
-#undef PSRL2
-#undef LOAD
-#undef PABS
-#undef CHECK
-#undef CHECK1
-#undef CHECK2
-#undef FILTER
diff --git a/libavfilter/x86/yadif_yasm.asm b/libavfilter/x86/yadif_yasm.asm
new file mode 100644
index 000..79265e6
--- /dev/null
+++ b/libavfilter/x86/yadif_yasm.asm
@@ -0,0 +1,241 @@
+;*
+;* x86-optimized functions for yadif filter
+;* Copyright (C) 2006 Michael Niedermayer michae...@gmx.at
+;* Copyright (c) 2013 Daniel Kang daniel.d.k...@gmail.com
+;*
+;* This file is part of Libav.
+;*
+;* Libav is free software; you can redistribute it and/or modify
+;* it under the terms of the GNU General Public License as published by
+;* the Free Software Foundation; either version 2 of the License, or
+;* (at your option) any later version.
+;*
+;* Libav is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+;* GNU General Public License for more details.
+;*
+;* You should have received a copy of the GNU General Public License along
+;* with Libav; if not, write to the Free Software Foundation, Inc.,
+;* 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+;**
+
+%include libavutil/x86/x86util.asm
+
+SECTION_RODATA
+
+pb_1: times 16 db 1
+pw_1: times  8 dw 1
+
+SECTION .text
+
+%macro CHECK 2
+movu  m2, [curq+mrefsq+%1]
+movu  m3, [curq+prefsq+%2]
+mova  m4, m2
+mova  m5, m2
+pxor  m4, m3
+pavgb m5, m3
+pand  m4, [pb_1]
+psubusb   m5, m4
+%if mmsize == 16
+psrldqm5, 1
+%else
+psrlq m5, 8
+%endif
+punpcklbw m5, m7
+mova  m4

[libav-devel] [PATCH v2] YADIF: Port inline assembly to YASM

2013-01-06 Thread Daniel Kang
\
-pmaxubMM3, MM2 \n\t\
-PSHUF(MM3, MM2) \
-punpcklbw MM7, MM2 \n\t /* ABS(cur[x-refs-1] - 
cur[x+refs-1]) */\
-punpcklbw MM7, MM3 \n\t /* ABS(cur[x-refs+1] - 
cur[x+refs+1]) */\
-paddw MM2, MM0 \n\t\
-paddw MM3, MM0 \n\t\
-psubwMANGLE(pw_1), MM0 \n\t /* spatial_score */\
-\
-CHECK(-2,0)\
-CHECK1\
-CHECK(-3,1)\
-CHECK2\
-CHECK(0,-2)\
-CHECK1\
-CHECK(1,-3)\
-CHECK2\
-\
-/* if(p-mode2) ... */\
-MOVQ 48(%[tmp]), MM6 \n\t /* diff */\
-cmpl  $2, %[mode] \n\t\
-jge   1f \n\t\
-LOAD((%[prev2],%[mrefs],2), MM2) /* prev2[x-2*refs] */\
-LOAD((%[next2],%[mrefs],2), MM4) /* next2[x-2*refs] */\
-LOAD((%[prev2],%[prefs],2), MM3) /* prev2[x+2*refs] */\
-LOAD((%[next2],%[prefs],2), MM5) /* next2[x+2*refs] */\
-paddw MM4, MM2 \n\t\
-paddw MM5, MM3 \n\t\
-psrlw $1,MM2 \n\t /* b */\
-psrlw $1,MM3 \n\t /* f */\
-MOVQ   (%[tmp]), MM4 \n\t /* c */\
-MOVQ 16(%[tmp]), MM5 \n\t /* d */\
-MOVQ 32(%[tmp]), MM7 \n\t /* e */\
-psubw MM4, MM2 \n\t /* b-c */\
-psubw MM7, MM3 \n\t /* f-e */\
-MOVQ  MM5, MM0 \n\t\
-psubw MM4, MM5 \n\t /* d-c */\
-psubw MM7, MM0 \n\t /* d-e */\
-MOVQ  MM2, MM4 \n\t\
-pminswMM3, MM2 \n\t\
-pmaxswMM4, MM3 \n\t\
-pmaxswMM5, MM2 \n\t\
-pminswMM5, MM3 \n\t\
-pmaxswMM0, MM2 \n\t /* max */\
-pminswMM0, MM3 \n\t /* min */\
-pxor  MM4, MM4 \n\t\
-pmaxswMM3, MM6 \n\t\
-psubw MM2, MM4 \n\t /* -max */\
-pmaxswMM4, MM6 \n\t /* diff= MAX3(diff, min, -max); */\
-1: \n\t\
-\
-MOVQ 16(%[tmp]), MM2 \n\t /* d */\
-MOVQ  MM2, MM3 \n\t\
-psubw MM6, MM2 \n\t /* d-diff */\
-paddw MM6, MM3 \n\t /* d+diff */\
-pmaxswMM2, MM1 \n\t\
-pminswMM3, MM1 \n\t /* d = clip(spatial_pred, d-diff, 
d+diff); */\
-packuswb  MM1, MM1 \n\t\
-\
-::[prev] r(prev),\
- [cur]  r(cur),\
- [next] r(next),\
- [prefs]r((x86_reg)prefs),\
- [mrefs]r((x86_reg)mrefs),\
- [mode] g(mode),\
- [tmp]  r(tmp)\
-);\
-__asm__ volatile(MOV MM1, %0 :=m(*dst));\
-dst += STEP;\
-prev+= STEP;\
-cur += STEP;\
-next+= STEP;\
-}
-
-if (parity) {
-#define prev2 prev
-#define next2 cur
-FILTER
-#undef prev2
-#undef next2
-} else {
-#define prev2 cur
-#define next2 next
-FILTER
-#undef prev2
-#undef next2
-}
-}
-#undef STEP
-#undef MM
-#undef MOV
-#undef MOVQ
-#undef MOVQU
-#undef PSHUF
-#undef PSRL1
-#undef PSRL2
-#undef LOAD
-#undef PABS
-#undef CHECK
-#undef CHECK1
-#undef CHECK2
-#undef FILTER
diff --git a/libavfilter/x86/yadif_yasm.asm b/libavfilter/x86/yadif_yasm.asm
new file mode 100644
index 000..6519de8
--- /dev/null
+++ b/libavfilter/x86/yadif_yasm.asm
@@ -0,0 +1,241 @@
+;*
+;* x86-optimized functions for yadif filter
+;* Copyright (C) 2006 Michael Niedermayer michae...@gmx.at
+;* Copyright (c) 2013 Daniel Kang daniel.d.k...@gmail.com
+;*
+;* This file is part of Libav.
+;*
+;* Libav is free software; you can redistribute it and/or modify
+;* it under the terms of the GNU General Public License as published by
+;* the Free Software Foundation; either version 2 of the License, or
+;* (at your option) any later version.
+;*
+;* Libav is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+;* GNU General Public License for more details.
+;*
+;* You should have received a copy of the GNU General Public License along
+;* with Libav; if not, write to the Free Software Foundation, Inc.,
+;* 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+;**
+
+%include libavutil/x86/x86util.asm
+
+SECTION_RODATA
+
+pb_1: times 16 db 1
+pw_1: times  8 dw 1
+
+SECTION .text
+
+%macro CHECK 2
+movu  m2, [curq+mrefsq+%1]
+movu  m3, [curq+prefsq+%2]
+mova  m4, m2
+mova  m5, m2
+pxor  m4, m3
+pavgb m5, m3
+pand  m4, [pb_1]
+psubusb   m5, m4
+%if mmsize == 16
+psrldqm5, 1
+%else
+psrlq m5, 8
+%endif
+punpcklbw m5, m7
+mova  m4, m2
+psubusb   m2, m3
+psubusb   m3, m4
+pmaxubm2, m3
+mova  m3

[libav-devel] [PATCH] YADIF: Port inline assembly to YASM

2013-01-05 Thread Daniel Kang
\
-psubwMANGLE(pw_1), MM0 \n\t /* spatial_score */\
-\
-CHECK(-2,0)\
-CHECK1\
-CHECK(-3,1)\
-CHECK2\
-CHECK(0,-2)\
-CHECK1\
-CHECK(1,-3)\
-CHECK2\
-\
-/* if(p-mode2) ... */\
-MOVQ 48(%[tmp]), MM6 \n\t /* diff */\
-cmpl  $2, %[mode] \n\t\
-jge   1f \n\t\
-LOAD((%[prev2],%[mrefs],2), MM2) /* prev2[x-2*refs] */\
-LOAD((%[next2],%[mrefs],2), MM4) /* next2[x-2*refs] */\
-LOAD((%[prev2],%[prefs],2), MM3) /* prev2[x+2*refs] */\
-LOAD((%[next2],%[prefs],2), MM5) /* next2[x+2*refs] */\
-paddw MM4, MM2 \n\t\
-paddw MM5, MM3 \n\t\
-psrlw $1,MM2 \n\t /* b */\
-psrlw $1,MM3 \n\t /* f */\
-MOVQ   (%[tmp]), MM4 \n\t /* c */\
-MOVQ 16(%[tmp]), MM5 \n\t /* d */\
-MOVQ 32(%[tmp]), MM7 \n\t /* e */\
-psubw MM4, MM2 \n\t /* b-c */\
-psubw MM7, MM3 \n\t /* f-e */\
-MOVQ  MM5, MM0 \n\t\
-psubw MM4, MM5 \n\t /* d-c */\
-psubw MM7, MM0 \n\t /* d-e */\
-MOVQ  MM2, MM4 \n\t\
-pminswMM3, MM2 \n\t\
-pmaxswMM4, MM3 \n\t\
-pmaxswMM5, MM2 \n\t\
-pminswMM5, MM3 \n\t\
-pmaxswMM0, MM2 \n\t /* max */\
-pminswMM0, MM3 \n\t /* min */\
-pxor  MM4, MM4 \n\t\
-pmaxswMM3, MM6 \n\t\
-psubw MM2, MM4 \n\t /* -max */\
-pmaxswMM4, MM6 \n\t /* diff= MAX3(diff, min, -max); */\
-1: \n\t\
-\
-MOVQ 16(%[tmp]), MM2 \n\t /* d */\
-MOVQ  MM2, MM3 \n\t\
-psubw MM6, MM2 \n\t /* d-diff */\
-paddw MM6, MM3 \n\t /* d+diff */\
-pmaxswMM2, MM1 \n\t\
-pminswMM3, MM1 \n\t /* d = clip(spatial_pred, d-diff, 
d+diff); */\
-packuswb  MM1, MM1 \n\t\
-\
-::[prev] r(prev),\
- [cur]  r(cur),\
- [next] r(next),\
- [prefs]r((x86_reg)prefs),\
- [mrefs]r((x86_reg)mrefs),\
- [mode] g(mode),\
- [tmp]  r(tmp)\
-);\
-__asm__ volatile(MOV MM1, %0 :=m(*dst));\
-dst += STEP;\
-prev+= STEP;\
-cur += STEP;\
-next+= STEP;\
-}
-
-if (parity) {
-#define prev2 prev
-#define next2 cur
-FILTER
-#undef prev2
-#undef next2
-} else {
-#define prev2 cur
-#define next2 next
-FILTER
-#undef prev2
-#undef next2
-}
-}
-#undef STEP
-#undef MM
-#undef MOV
-#undef MOVQ
-#undef MOVQU
-#undef PSHUF
-#undef PSRL1
-#undef PSRL2
-#undef LOAD
-#undef PABS
-#undef CHECK
-#undef CHECK1
-#undef CHECK2
-#undef FILTER
diff --git a/libavfilter/x86/yadif_yasm.asm b/libavfilter/x86/yadif_yasm.asm
new file mode 100644
index 000..ce76ff3
--- /dev/null
+++ b/libavfilter/x86/yadif_yasm.asm
@@ -0,0 +1,246 @@
+;*
+;* x86-optimized functions for volume filter
+;* Copyright (c) 2013 Daniel Kang daniel.d.k...@gmail.com
+;*
+;* This file is part of Libav.
+;*
+;* Libav is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* Libav is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with Libav; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;**
+
+%include libavutil/x86/x86util.asm
+
+SECTION .text
+
+cextern pb_1
+cextern pw_1
+
+%macro CHECK 2
+movu  m2, [curq+mrefsq+%1]
+movu  m3, [curq+prefsq+%2]
+mova  m4, m2
+mova  m5, m2
+pxor  m4, m3
+pavgb m5, m3
+pand  m4, [pb_1]
+psubusb   m5, m4
+%if mmsize == 16
+psrldqm5, 1
+%else
+psrlq m5, 8
+%endif
+punpcklbw m5, m7
+mova  m4, m2
+psubusb   m2, m3
+psubusb   m3, m4
+pmaxubm2, m3
+mova  m3, m2
+mova  m4, m2
+%if mmsize == 16
+psrldqm3, 1
+psrldqm4, 2
+%else
+psrlq m3, 8
+psrlq m4, 16
+%endif
+punpcklbw m2, m7
+punpcklbw m3, m7
+punpcklbw m4, m7
+paddw m2, m3
+paddw m2, m4
+%endmacro
+
+%macro CHECK1 0
+movam3, m0
+pcmpgtw m3, m2
+pminsw  m0, m2
+movam6, m3
+pand

Re: [libav-devel] [PATCH] YADIF: Port inline assembly to YASM

2013-01-05 Thread Daniel Kang
If someone could test this on msvc 32/64-bit that would be great.
___
libav-devel mailing list
libav-devel@libav.org
https://lists.libav.org/mailman/listinfo/libav-devel


Re: [libav-devel] [PATCH] YADIF: Port inline assembly to YASM

2013-01-05 Thread Daniel Kang
On Sat, Jan 5, 2013 at 12:17 PM, Ronald S. Bultje rsbul...@gmail.com wrote:
 Hi,

 On Sat, Jan 5, 2013 at 9:01 AM, Daniel Kang daniel.d.k...@gmail.com wrote:
 --- a/libavfilter/x86/yadif.c
 +++ b/libavfilter/x86/yadif.c
 @@ -26,49 +26,34 @@
  #include libavcodec/x86/dsputil_mmx.h
  #include libavfilter/yadif.h

 -#if HAVE_INLINE_ASM
 +#if HAVE_YASM

  DECLARE_ASM_CONST(16, const xmm_reg, pb_1) = {0x0101010101010101ULL, 
 0x0101010101010101ULL};
  DECLARE_ASM_CONST(16, const xmm_reg, pw_1) = {0x0001000100010001ULL, 
 0x0001000100010001ULL};

 Move to .asm file also (SECTION_RODATA). Just remove the whole _mmx.c
 file except for the glue bits.

Fixed.

 +%macro YADIF 0
 +cglobal yadif_filter_line, 9, 9, 8, dst, prev, cur, next, w, prefs, \
 +mrefs, parity, mode
 +%assign pad 16*5-gprsize-(stack_offset15)
 +SUBrsp, pad
 [..]
 +ADDrsp, pad
 +RET
 +%endmacro

 cglobal yadif_filter_line, 9, 9, 8, 16*5, names...

Fixed.

 That way stack alignment works on msvc also. Now, this is harder
 because you'll need to use only 6 regs on msvc (instead of 7), because
 the 7th one needs to hold the stack pointer. You can test locally by
 changing HAVE_INLINE_ASM from 1 to 0 in your config.{mak,h,asm}.

Sorry, how does changing that test MSVC?

 +INIT_XMM sse2
 +YADIF
 +INIT_MMX mmxext
 +YADIF

 %if ARCH_X86_32
 INIT_MMX mmxext
 YADIF
 %endif

 Same change in c wrapper glue, gives smaller object files.

Fixed.

 Is there a fate test?

Yes, filter-yadif-mode1 and filter-yadif-mode0
___
libav-devel mailing list
libav-devel@libav.org
https://lists.libav.org/mailman/listinfo/libav-devel


Re: [libav-devel] [PATCH] YADIF: Port inline assembly to YASM

2013-01-05 Thread Daniel Kang
On Sat, Jan 5, 2013 at 12:29 PM, Diego Biurrun di...@biurrun.de wrote:
 On Sat, Jan 05, 2013 at 11:01:19AM -0600, Daniel Kang wrote:

 --- a/libavfilter/x86/Makefile
 +++ b/libavfilter/x86/Makefile
 @@ -4,3 +4,4 @@ OBJS-$(CONFIG_YADIF_FILTER)  += x86/yadif.o

  YASM-OBJS-$(CONFIG_HQDN3D_FILTER)+= x86/hqdn3d.o
  YASM-OBJS-$(CONFIG_VOLUME_FILTER)+= x86/af_volume.o
 +YASM-OBJS-$(CONFIG_VOLUME_FILTER)+= x86/yadif_yasm.o

 copy and paste typo :)

Fixed.

 As a result, your code is probably untested, please check again.

Interestingly enough it is tested (adding an int 3 kills the program
as expected). I'm not sure why.

 --- a/libavfilter/x86/yadif.c
 +++ b/libavfilter/x86/yadif.c
 @@ -26,49 +26,34 @@

 -#if HAVE_INLINE_ASM
 +#if HAVE_YASM

 -#if HAVE_MMXEXT_INLINE
 -#undef RENAME
 -#define RENAME(a) a ## _mmxext
 -#include yadif_template.c
  #endif

 -#endif /* HAVE_INLINE_ASM */

 Please comment the #endif, these files have a tendency to collect
 a lot of them and then the commented endifs help keep track.

Fixed.

  av_cold void ff_yadif_init_x86(YADIFContext *yadif)
  {
  int cpu_flags = av_get_cpu_flags();

 -#if HAVE_MMXEXT_INLINE
 +#if HAVE_YASM
  if (cpu_flags & AV_CPU_FLAG_MMXEXT)
 -yadif->filter_line = yadif_filter_line_mmxext;
 -#endif
 -#if HAVE_SSE2_INLINE
 +yadif->filter_line = ff_yadif_filter_line_mmxext;
  if (cpu_flags & AV_CPU_FLAG_SSE2)
 -yadif->filter_line = yadif_filter_line_sse2;
 -#endif
 -#if HAVE_SSSE3_INLINE
 +yadif->filter_line = ff_yadif_filter_line_sse2;
  if (cpu_flags & AV_CPU_FLAG_SSSE3)
 -yadif->filter_line = yadif_filter_line_ssse3;
 +yadif->filter_line = ff_yadif_filter_line_ssse3;
  #endif

 These could likely use HAVE_EXTERNAL_MMXEXT, etc...

Maybe I'm missing something?

AVX@AVX-PC /cygdrive/c/Code/libav
$ git grep HAVE_EXTERNAL
nothing

 --- /dev/null
 +++ b/libavfilter/x86/yadif_yasm.asm
 @@ -0,0 +1,246 @@
 +;*
 +;* x86-optimized functions for volume filter

 volume?

Copy/paste fail. Fixed.

 +;* Copyright (c) 2013 Daniel Kang daniel.d.k...@gmail.com

 Keep the previous copyright line as well.

Fixed.

 +%macro CHECK 2
 +%if mmsize == 16
 +psrldqm5, 1
 +%else
 +psrlq m5, 8
 +%endif
 +%if mmsize == 16
 +psrldqm3, 1
 +psrldqm4, 2
 +%else
 +psrlq m3, 8
 +psrlq m4, 16
 +%endif

 idea (untested):

 %macro PSRLQ 2
 %if mmsize == 16
 psrlq %1, %2 * 8
 %else
 psrlq %1, %2
 %endif

I don't think it's worth it for those couple of lines.

 +%macro CHECK1 0
 +%endmacro
 +
 +%macro CHECK2 0
 +%endmacro

 These names are not terribly descriptive.

They weren't in the original file. I have no idea what to call them.

 +%macro LOAD 2
 +movh  m%1, %2
 +punpcklbw m%1, m7
 +%endmacro
 +
 +%macro ABSY 1-2
 +%if cpuflag(ssse3)
 +pabsw %1, %1
 +%else
 +ABS1_MMXEXT %1, %2
 +%endif
 +%endmacro

 Is this a candidate for simply extending ABS1_MMXEXT?

No. The MMXEXT is there for a reason.

 +%macro FILTER 3
 +.loop%1:
 +pxorm7, m7
 +LOAD 0, [curq+mrefsq]
 +LOAD 1, [curq+prefsq]
 +LOAD 2, [%2]
 +LOAD 3, [%3]
 +movam4, m3
 +paddw   m3, m2
 +psraw   m3, 1
 +mova [rsp+ 0], m0
 +mova [rsp+16], m3
 +mova [rsp+32], m1

 Indentation is off by one char, keep the ',' aligned.

Oops, fixed.

 +%if mmsize == 16
 +mova   m3, m2
 +psrldq m3, 2
 +%else
 +pshufw m3, m2, 9
 +%endif

 Didn't we have a macro for this?

If we do, I can't find it.
___
libav-devel mailing list
libav-devel@libav.org
https://lists.libav.org/mailman/listinfo/libav-devel


[libav-devel] [PATCH] YADIF: Port inline assembly to YASM

2013-01-05 Thread Daniel Kang
\
-PSHUF(MM3, MM2) \
-punpcklbw MM7, MM2 \n\t /* ABS(cur[x-refs-1] - 
cur[x+refs-1]) */\
-punpcklbw MM7, MM3 \n\t /* ABS(cur[x-refs+1] - 
cur[x+refs+1]) */\
-paddw MM2, MM0 \n\t\
-paddw MM3, MM0 \n\t\
-psubwMANGLE(pw_1), MM0 \n\t /* spatial_score */\
-\
-CHECK(-2,0)\
-CHECK1\
-CHECK(-3,1)\
-CHECK2\
-CHECK(0,-2)\
-CHECK1\
-CHECK(1,-3)\
-CHECK2\
-\
-/* if(p-mode2) ... */\
-MOVQ 48(%[tmp]), MM6 \n\t /* diff */\
-cmpl  $2, %[mode] \n\t\
-jge   1f \n\t\
-LOAD((%[prev2],%[mrefs],2), MM2) /* prev2[x-2*refs] */\
-LOAD((%[next2],%[mrefs],2), MM4) /* next2[x-2*refs] */\
-LOAD((%[prev2],%[prefs],2), MM3) /* prev2[x+2*refs] */\
-LOAD((%[next2],%[prefs],2), MM5) /* next2[x+2*refs] */\
-paddw MM4, MM2 \n\t\
-paddw MM5, MM3 \n\t\
-psrlw $1,MM2 \n\t /* b */\
-psrlw $1,MM3 \n\t /* f */\
-MOVQ   (%[tmp]), MM4 \n\t /* c */\
-MOVQ 16(%[tmp]), MM5 \n\t /* d */\
-MOVQ 32(%[tmp]), MM7 \n\t /* e */\
-psubw MM4, MM2 \n\t /* b-c */\
-psubw MM7, MM3 \n\t /* f-e */\
-MOVQ  MM5, MM0 \n\t\
-psubw MM4, MM5 \n\t /* d-c */\
-psubw MM7, MM0 \n\t /* d-e */\
-MOVQ  MM2, MM4 \n\t\
-pminswMM3, MM2 \n\t\
-pmaxswMM4, MM3 \n\t\
-pmaxswMM5, MM2 \n\t\
-pminswMM5, MM3 \n\t\
-pmaxswMM0, MM2 \n\t /* max */\
-pminswMM0, MM3 \n\t /* min */\
-pxor  MM4, MM4 \n\t\
-pmaxswMM3, MM6 \n\t\
-psubw MM2, MM4 \n\t /* -max */\
-pmaxswMM4, MM6 \n\t /* diff= MAX3(diff, min, -max); */\
-1: \n\t\
-\
-MOVQ 16(%[tmp]), MM2 \n\t /* d */\
-MOVQ  MM2, MM3 \n\t\
-psubw MM6, MM2 \n\t /* d-diff */\
-paddw MM6, MM3 \n\t /* d+diff */\
-pmaxswMM2, MM1 \n\t\
-pminswMM3, MM1 \n\t /* d = clip(spatial_pred, d-diff, 
d+diff); */\
-packuswb  MM1, MM1 \n\t\
-\
-::[prev] r(prev),\
- [cur]  r(cur),\
- [next] r(next),\
- [prefs]r((x86_reg)prefs),\
- [mrefs]r((x86_reg)mrefs),\
- [mode] g(mode),\
- [tmp]  r(tmp)\
-);\
-__asm__ volatile(MOV MM1, %0 :=m(*dst));\
-dst += STEP;\
-prev+= STEP;\
-cur += STEP;\
-next+= STEP;\
-}
-
-if (parity) {
-#define prev2 prev
-#define next2 cur
-FILTER
-#undef prev2
-#undef next2
-} else {
-#define prev2 cur
-#define next2 next
-FILTER
-#undef prev2
-#undef next2
-}
-}
-#undef STEP
-#undef MM
-#undef MOV
-#undef MOVQ
-#undef MOVQU
-#undef PSHUF
-#undef PSRL1
-#undef PSRL2
-#undef LOAD
-#undef PABS
-#undef CHECK
-#undef CHECK1
-#undef CHECK2
-#undef FILTER
diff --git a/libavfilter/x86/yadif_yasm.asm b/libavfilter/x86/yadif_yasm.asm
new file mode 100644
index 000..91ced62
--- /dev/null
+++ b/libavfilter/x86/yadif_yasm.asm
@@ -0,0 +1,250 @@
+;*
+;* x86-optimized functions for yadif filter
+;* Copyright (C) 2006 Michael Niedermayer michae...@gmx.at
+;* Copyright (c) 2013 Daniel Kang daniel.d.k...@gmail.com
+;*
+;* This file is part of Libav.
+;*
+;* Libav is free software; you can redistribute it and/or modify
+;* it under the terms of the GNU General Public License as published by
+;* the Free Software Foundation; either version 2 of the License, or
+;* (at your option) any later version.
+;*
+;* Libav is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+;* GNU General Public License for more details.
+;*
+;* You should have received a copy of the GNU General Public License along
+;* with Libav; if not, write to the Free Software Foundation, Inc.,
+;* 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+;**
+
+%include libavutil/x86/x86util.asm
+
+SECTION_RODATA
+
+pb_1: times 16 db 1
+pw_1: times  8 dw 1
+
+SECTION .text
+
+%macro CHECK 2
+movu  m2, [curq+mrefsq+%1]
+movu  m3, [curq+prefsq+%2]
+mova  m4, m2
+mova  m5, m2
+pxor  m4, m3
+pavgb m5, m3
+pand  m4, [pb_1]
+psubusb   m5, m4
+%if mmsize == 16
+psrldqm5, 1
+%else
+psrlq m5, 8
+%endif
+punpcklbw m5, m7
+mova  m4, m2
+psubusb   m2, m3
+psubusb   m3, m4
+pmaxubm2, m3
+mova  m3, m2
+mova  m4, m2
+%if mmsize

Re: [libav-devel] [PATCH] YADIF: Port inline assembly to YASM

2013-01-05 Thread Daniel Kang
On Sat, Jan 5, 2013 at 5:47 PM, Diego Biurrun di...@biurrun.de wrote:
 On Sat, Jan 05, 2013 at 12:48:58PM -0500, Daniel Kang wrote:
 On Sat, Jan 5, 2013 at 12:29 PM, Diego Biurrun di...@biurrun.de wrote:
  On Sat, Jan 05, 2013 at 11:01:19AM -0600, Daniel Kang wrote:
 
  --- a/libavfilter/x86/yadif.c
  +++ b/libavfilter/x86/yadif.c
   av_cold void ff_yadif_init_x86(YADIFContext *yadif)
   {
   int cpu_flags = av_get_cpu_flags();
 
  -#if HAVE_MMXEXT_INLINE
  +#if HAVE_YASM
   if (cpu_flags & AV_CPU_FLAG_MMXEXT)
  -yadif->filter_line = yadif_filter_line_mmxext;
  -#endif
  -#if HAVE_SSE2_INLINE
  +yadif->filter_line = ff_yadif_filter_line_mmxext;
   if (cpu_flags & AV_CPU_FLAG_SSE2)
  -yadif->filter_line = yadif_filter_line_sse2;
  -#endif
  -#if HAVE_SSSE3_INLINE
  +yadif->filter_line = ff_yadif_filter_line_sse2;
   if (cpu_flags & AV_CPU_FLAG_SSSE3)
  -yadif->filter_line = yadif_filter_line_ssse3;
  +yadif->filter_line = ff_yadif_filter_line_ssse3;
   #endif
 
  These could likely use HAVE_EXTERNAL_MMXEXT, etc...

 Maybe I'm missing something?

 AVX@AVX-PC /cygdrive/c/Code/libav
 $ git grep HAVE_EXTERNAL
 nothing

 I confused the order, it's HAVE_MMXEXT_EXTERNAL, etc...

Fixed.
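
For context, with the corrected guard names the init glue would look roughly
as below. This is a sketch only, assuming the configure-generated
HAVE_*_EXTERNAL macros and that the ff_yadif_filter_line_* prototypes are
declared earlier in the file; it is not the final committed code:

#include "libavutil/attributes.h"
#include "libavutil/cpu.h"
#include "libavfilter/yadif.h"

av_cold void ff_yadif_init_x86(YADIFContext *yadif)
{
    int cpu_flags = av_get_cpu_flags();

#if HAVE_MMXEXT_EXTERNAL
    if (cpu_flags & AV_CPU_FLAG_MMXEXT)
        yadif->filter_line = ff_yadif_filter_line_mmxext;
#endif
#if HAVE_SSE2_EXTERNAL
    if (cpu_flags & AV_CPU_FLAG_SSE2)
        yadif->filter_line = ff_yadif_filter_line_sse2;
#endif
#if HAVE_SSSE3_EXTERNAL
    if (cpu_flags & AV_CPU_FLAG_SSSE3)
        yadif->filter_line = ff_yadif_filter_line_ssse3;
#endif
}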

  +%macro CHECK1 0
  +%endmacro
  +
  +%macro CHECK2 0
  +%endmacro
 
  These names are not terribly descriptive.

 They weren't in the original file. I have no idea what to call them.

 Maybe ask Loren?

Asked.

  +%macro LOAD 2
  +movh  m%1, %2
  +punpcklbw m%1, m7
  +%endmacro
  +
  +%macro ABSY 1-2
  +%if cpuflag(ssse3)
  +pabsw %1, %1
  +%else
  +ABS1_MMXEXT %1, %2
  +%endif
  +%endmacro
 
  Is this a candidate for simply extending ABS1_MMXEXT?

 No. The MMXEXT is there for a reason.

 I was being totally unclear, as I was referring to some changes I only
 have locally.  Please review the following patch:

 http://patches.libav.org/patch/25264/

That looks right.

  +%if mmsize == 16
  +mova   m3, m2
  +psrldq m3, 2
  +%else
  +pshufw m3, m2, 9
  +%endif
 
  Didn't we have a macro for this?

 If we do, I can't find it.

 h264_intrapred.asm has some very similar code that likely should be
 extracted into a common macro.

In another patch? This patch is supposed to be a port.
___
libav-devel mailing list
libav-devel@libav.org
https://lists.libav.org/mailman/listinfo/libav-devel


[libav-devel] [PATCH] YADIF: Port inline assembly to YASM

2013-01-05 Thread Daniel Kang
\
-psubusb   MM4, MM3 \n\t\
-pmaxubMM3, MM2 \n\t\
-PSHUF(MM3, MM2) \
-punpcklbw MM7, MM2 \n\t /* ABS(cur[x-refs-1] - 
cur[x+refs-1]) */\
-punpcklbw MM7, MM3 \n\t /* ABS(cur[x-refs+1] - 
cur[x+refs+1]) */\
-paddw MM2, MM0 \n\t\
-paddw MM3, MM0 \n\t\
-psubwMANGLE(pw_1), MM0 \n\t /* spatial_score */\
-\
-CHECK(-2,0)\
-CHECK1\
-CHECK(-3,1)\
-CHECK2\
-CHECK(0,-2)\
-CHECK1\
-CHECK(1,-3)\
-CHECK2\
-\
-/* if(p-mode2) ... */\
-MOVQ 48(%[tmp]), MM6 \n\t /* diff */\
-cmpl  $2, %[mode] \n\t\
-jge   1f \n\t\
-LOAD((%[prev2],%[mrefs],2), MM2) /* prev2[x-2*refs] */\
-LOAD((%[next2],%[mrefs],2), MM4) /* next2[x-2*refs] */\
-LOAD((%[prev2],%[prefs],2), MM3) /* prev2[x+2*refs] */\
-LOAD((%[next2],%[prefs],2), MM5) /* next2[x+2*refs] */\
-paddw MM4, MM2 \n\t\
-paddw MM5, MM3 \n\t\
-psrlw $1,MM2 \n\t /* b */\
-psrlw $1,MM3 \n\t /* f */\
-MOVQ   (%[tmp]), MM4 \n\t /* c */\
-MOVQ 16(%[tmp]), MM5 \n\t /* d */\
-MOVQ 32(%[tmp]), MM7 \n\t /* e */\
-psubw MM4, MM2 \n\t /* b-c */\
-psubw MM7, MM3 \n\t /* f-e */\
-MOVQ  MM5, MM0 \n\t\
-psubw MM4, MM5 \n\t /* d-c */\
-psubw MM7, MM0 \n\t /* d-e */\
-MOVQ  MM2, MM4 \n\t\
-pminswMM3, MM2 \n\t\
-pmaxswMM4, MM3 \n\t\
-pmaxswMM5, MM2 \n\t\
-pminswMM5, MM3 \n\t\
-pmaxswMM0, MM2 \n\t /* max */\
-pminswMM0, MM3 \n\t /* min */\
-pxor  MM4, MM4 \n\t\
-pmaxswMM3, MM6 \n\t\
-psubw MM2, MM4 \n\t /* -max */\
-pmaxswMM4, MM6 \n\t /* diff= MAX3(diff, min, -max); */\
-1: \n\t\
-\
-MOVQ 16(%[tmp]), MM2 \n\t /* d */\
-MOVQ  MM2, MM3 \n\t\
-psubw MM6, MM2 \n\t /* d-diff */\
-paddw MM6, MM3 \n\t /* d+diff */\
-pmaxswMM2, MM1 \n\t\
-pminswMM3, MM1 \n\t /* d = clip(spatial_pred, d-diff, 
d+diff); */\
-packuswb  MM1, MM1 \n\t\
-\
-::[prev] r(prev),\
- [cur]  r(cur),\
- [next] r(next),\
- [prefs]r((x86_reg)prefs),\
- [mrefs]r((x86_reg)mrefs),\
- [mode] g(mode),\
- [tmp]  r(tmp)\
-);\
-__asm__ volatile(MOV MM1, %0 :=m(*dst));\
-dst += STEP;\
-prev+= STEP;\
-cur += STEP;\
-next+= STEP;\
-}
-
-if (parity) {
-#define prev2 prev
-#define next2 cur
-FILTER
-#undef prev2
-#undef next2
-} else {
-#define prev2 cur
-#define next2 next
-FILTER
-#undef prev2
-#undef next2
-}
-}
-#undef STEP
-#undef MM
-#undef MOV
-#undef MOVQ
-#undef MOVQU
-#undef PSHUF
-#undef PSRL1
-#undef PSRL2
-#undef LOAD
-#undef PABS
-#undef CHECK
-#undef CHECK1
-#undef CHECK2
-#undef FILTER
diff --git a/libavfilter/x86/yadif_yasm.asm b/libavfilter/x86/yadif_yasm.asm
new file mode 100644
index 000..76553a3
--- /dev/null
+++ b/libavfilter/x86/yadif_yasm.asm
@@ -0,0 +1,249 @@
+;*
+;* x86-optimized functions for yadif filter
+;* Copyright (C) 2006 Michael Niedermayer michae...@gmx.at
+;* Copyright (c) 2013 Daniel Kang daniel.d.k...@gmail.com
+;*
+;* This file is part of Libav.
+;*
+;* Libav is free software; you can redistribute it and/or modify
+;* it under the terms of the GNU General Public License as published by
+;* the Free Software Foundation; either version 2 of the License, or
+;* (at your option) any later version.
+;*
+;* Libav is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+;* GNU General Public License for more details.
+;*
+;* You should have received a copy of the GNU General Public License along
+;* with Libav; if not, write to the Free Software Foundation, Inc.,
+;* 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+;**
+
+%include "libavutil/x86/x86util.asm"
+
+SECTION_RODATA
+
+pb_1: times 16 db 1
+pw_1: times  8 dw 1
+
+SECTION .text
+
+%macro CHECK 2
+movu  m2, [curq+mrefsq+%1]
+movu  m3, [curq+prefsq+%2]
+mova  m4, m2
+mova  m5, m2
+pxor  m4, m3
+pavgb m5, m3
+pand  m4, [pb_1]
+psubusb   m5, m4
+%if mmsize == 16
+psrldqm5, 1
+%else
+psrlq m5, 8
+%endif
+punpcklbw m5, m7
+mova  m4, m2
+psubusb   m2, m3
+psubusb   m3, m4

[libav-devel] [PATCH] YADIF: Port inline assembly to YASM

2013-01-05 Thread Daniel Kang
\
-pmaxubMM3, MM2 \n\t\
-PSHUF(MM3, MM2) \
-punpcklbw MM7, MM2 \n\t /* ABS(cur[x-refs-1] - 
cur[x+refs-1]) */\
-punpcklbw MM7, MM3 \n\t /* ABS(cur[x-refs+1] - 
cur[x+refs+1]) */\
-paddw MM2, MM0 \n\t\
-paddw MM3, MM0 \n\t\
-psubwMANGLE(pw_1), MM0 \n\t /* spatial_score */\
-\
-CHECK(-2,0)\
-CHECK1\
-CHECK(-3,1)\
-CHECK2\
-CHECK(0,-2)\
-CHECK1\
-CHECK(1,-3)\
-CHECK2\
-\
-/* if(p-mode2) ... */\
-MOVQ 48(%[tmp]), MM6 \n\t /* diff */\
-cmpl  $2, %[mode] \n\t\
-jge   1f \n\t\
-LOAD((%[prev2],%[mrefs],2), MM2) /* prev2[x-2*refs] */\
-LOAD((%[next2],%[mrefs],2), MM4) /* next2[x-2*refs] */\
-LOAD((%[prev2],%[prefs],2), MM3) /* prev2[x+2*refs] */\
-LOAD((%[next2],%[prefs],2), MM5) /* next2[x+2*refs] */\
-paddw MM4, MM2 \n\t\
-paddw MM5, MM3 \n\t\
-psrlw $1,MM2 \n\t /* b */\
-psrlw $1,MM3 \n\t /* f */\
-MOVQ   (%[tmp]), MM4 \n\t /* c */\
-MOVQ 16(%[tmp]), MM5 \n\t /* d */\
-MOVQ 32(%[tmp]), MM7 \n\t /* e */\
-psubw MM4, MM2 \n\t /* b-c */\
-psubw MM7, MM3 \n\t /* f-e */\
-MOVQ  MM5, MM0 \n\t\
-psubw MM4, MM5 \n\t /* d-c */\
-psubw MM7, MM0 \n\t /* d-e */\
-MOVQ  MM2, MM4 \n\t\
-pminswMM3, MM2 \n\t\
-pmaxswMM4, MM3 \n\t\
-pmaxswMM5, MM2 \n\t\
-pminswMM5, MM3 \n\t\
-pmaxswMM0, MM2 \n\t /* max */\
-pminswMM0, MM3 \n\t /* min */\
-pxor  MM4, MM4 \n\t\
-pmaxswMM3, MM6 \n\t\
-psubw MM2, MM4 \n\t /* -max */\
-pmaxswMM4, MM6 \n\t /* diff= MAX3(diff, min, -max); */\
-1: \n\t\
-\
-MOVQ 16(%[tmp]), MM2 \n\t /* d */\
-MOVQ  MM2, MM3 \n\t\
-psubw MM6, MM2 \n\t /* d-diff */\
-paddw MM6, MM3 \n\t /* d+diff */\
-pmaxswMM2, MM1 \n\t\
-pminswMM3, MM1 \n\t /* d = clip(spatial_pred, d-diff, 
d+diff); */\
-packuswb  MM1, MM1 \n\t\
-\
-::[prev] r(prev),\
- [cur]  r(cur),\
- [next] r(next),\
- [prefs]r((x86_reg)prefs),\
- [mrefs]r((x86_reg)mrefs),\
- [mode] g(mode),\
- [tmp]  r(tmp)\
-);\
-__asm__ volatile(MOV MM1, %0 :=m(*dst));\
-dst += STEP;\
-prev+= STEP;\
-cur += STEP;\
-next+= STEP;\
-}
-
-if (parity) {
-#define prev2 prev
-#define next2 cur
-FILTER
-#undef prev2
-#undef next2
-} else {
-#define prev2 cur
-#define next2 next
-FILTER
-#undef prev2
-#undef next2
-}
-}
-#undef STEP
-#undef MM
-#undef MOV
-#undef MOVQ
-#undef MOVQU
-#undef PSHUF
-#undef PSRL1
-#undef PSRL2
-#undef LOAD
-#undef PABS
-#undef CHECK
-#undef CHECK1
-#undef CHECK2
-#undef FILTER
diff --git a/libavfilter/x86/yadif_yasm.asm b/libavfilter/x86/yadif_yasm.asm
new file mode 100644
index 000..e51ed7e
--- /dev/null
+++ b/libavfilter/x86/yadif_yasm.asm
@@ -0,0 +1,249 @@
+;*
+;* x86-optimized functions for yadif filter
+;* Copyright (C) 2006 Michael Niedermayer michae...@gmx.at
+;* Copyright (c) 2013 Daniel Kang daniel.d.k...@gmail.com
+;*
+;* This file is part of Libav.
+;*
+;* Libav is free software; you can redistribute it and/or modify
+;* it under the terms of the GNU General Public License as published by
+;* the Free Software Foundation; either version 2 of the License, or
+;* (at your option) any later version.
+;*
+;* Libav is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+;* GNU General Public License for more details.
+;*
+;* You should have received a copy of the GNU General Public License along
+;* with Libav; if not, write to the Free Software Foundation, Inc.,
+;* 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+;**
+
+%include "libavutil/x86/x86util.asm"
+
+SECTION_RODATA
+
+pb_1: times 16 db 1
+pw_1: times  8 dw 1
+
+SECTION .text
+
+%macro CHECK 2
+movu  m2, [curq+mrefsq+%1]
+movu  m3, [curq+prefsq+%2]
+mova  m4, m2
+mova  m5, m2
+pxor  m4, m3
+pavgb m5, m3
+pand  m4, [pb_1]
+psubusb   m5, m4
+%if mmsize == 16
+psrldqm5, 1
+%else
+psrlq m5, 8
+%endif
+punpcklbw m5, m7
+mova  m4, m2
+psubusb   m2, m3
+psubusb   m3, m4
+pmaxubm2, m3
+mova

Re: [libav-devel] [PATCH] x86: h264_qpel: sign-extend stride arguments

2012-11-14 Thread Daniel Kang
On Tue, Nov 6, 2012 at 7:34 AM, Diego Biurrun di...@biurrun.de wrote:

 ---

 Now sign-extends all stride arguments, not just put_pixels16_sse2().

 Also adds a colon to a jump label to make NASM happy.  Since this patch
 is intended for squashing, I did not separate it from the rest.

  libavcodec/x86/dsputil.asm|   11 ++-
  libavcodec/x86/h264_qpel_8bit.asm |   30 ++
  2 files changed, 40 insertions(+), 1 deletions(-)

If it works, the patch looks okay to me?
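
As a side note on why the sign extension matters (an illustrative sketch,
not part of the patch): the x86-64 ABI leaves the upper 32 bits of a
register passed as a 32-bit int unspecified, so a negative stride used
directly in 64-bit addressing can land far from the intended line. The
movsxd in the asm prologue performs the same promotion as the explicit cast
here; the helper name is made up.

#include <stdint.h>

static const uint8_t *next_line(const uint8_t *p, int stride)
{
    return p + (intptr_t)stride;   /* promote before the pointer arithmetic */
}
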
___
libav-devel mailing list
libav-devel@libav.org
https://lists.libav.org/mailman/listinfo/libav-devel


[libav-devel] [PATCH 1/2] H264: Remove 3dnow qpel code.

2012-10-13 Thread Daniel Kang
The only CPUs that have 3dnow and don't have mmxext are 12 years old.

Moreover, AMD has deprecated 3dnow.
---
 libavcodec/x86/dsputil_avg_template.c |8 +-
 libavcodec/x86/dsputil_mmx.c  |  142 +
 libavcodec/x86/h264_qpel.c|4 -
 3 files changed, 8 insertions(+), 146 deletions(-)

diff --git a/libavcodec/x86/dsputil_avg_template.c 
b/libavcodec/x86/dsputil_avg_template.c
index 8b116b7..b514746 100644
--- a/libavcodec/x86/dsputil_avg_template.c
+++ b/libavcodec/x86/dsputil_avg_template.c
@@ -55,6 +55,7 @@ static void DEF(put_pixels8_x2)(uint8_t *block, const uint8_t 
*pixels, int line_
 :%REG_a, memory);
 }
 
+#ifndef SKIP_FOR_3DNOW
 static void DEF(put_pixels4_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, 
int dstStride, int src1Stride, int h)
 {
 __asm__ volatile(
@@ -104,7 +105,7 @@ static void DEF(put_pixels4_l2)(uint8_t *dst, uint8_t 
*src1, uint8_t *src2, int
 :S((x86_reg)src1Stride), D((x86_reg)dstStride)
 :memory);
 }
-
+#endif
 
 static void DEF(put_pixels8_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, 
int dstStride, int src1Stride, int h)
 {
@@ -226,6 +227,7 @@ static void DEF(put_no_rnd_pixels8_l2)(uint8_t *dst, 
uint8_t *src1, uint8_t *src
 :memory);*/
 }
 
+#ifndef SKIP_FOR_3DNOW
 static void DEF(avg_pixels4_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, 
int dstStride, int src1Stride, int h)
 {
 __asm__ volatile(
@@ -276,7 +278,7 @@ static void DEF(avg_pixels4_l2)(uint8_t *dst, uint8_t 
*src1, uint8_t *src2, int
 :S((x86_reg)src1Stride), D((x86_reg)dstStride)
 :memory);
 }
-
+#endif
 
 static void DEF(avg_pixels8_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, 
int dstStride, int src1Stride, int h)
 {
@@ -872,6 +874,7 @@ static void DEF(avg_pixels8_xy2)(uint8_t *block, const 
uint8_t *pixels, int line
 :%REG_a,  memory);
 }
 
+#ifndef SKIP_FOR_3DNOW
 static void DEF(avg_pixels4)(uint8_t *block, const uint8_t *pixels, int 
line_size, int h)
 {
 do {
@@ -896,6 +899,7 @@ static void DEF(avg_pixels4)(uint8_t *block, const uint8_t 
*pixels, int line_siz
 h -= 4;
 } while(h  0);
 }
+#endif
 
 //FIXME the following could be optimized too ...
 static void DEF(put_no_rnd_pixels16_x2)(uint8_t *block, const uint8_t *pixels, 
int line_size, int h){
diff --git a/libavcodec/x86/dsputil_mmx.c b/libavcodec/x86/dsputil_mmx.c
index 86a08cb..a0231b7 100644
--- a/libavcodec/x86/dsputil_mmx.c
+++ b/libavcodec/x86/dsputil_mmx.c
@@ -197,12 +197,14 @@ DECLARE_ALIGNED(16, const double, ff_pd_2)[2] = { 2.0, 
2.0 };
 #define DEF(x) x ## _3dnow
 #define PAVGB pavgusb
 #define OP_AVG PAVGB
+#define SKIP_FOR_3DNOW
 
 #include dsputil_avg_template.c
 
 #undef DEF
 #undef PAVGB
 #undef OP_AVG
+#undef SKIP_FOR_3DNOW
 
 /***/
 /* MMX2 specific */
@@ -1051,73 +1053,6 @@ static void OPNAME ## 
mpeg4_qpel16_h_lowpass_mmx2(uint8_t *dst,   \
 );\
 } \
   \
-static void OPNAME ## mpeg4_qpel16_h_lowpass_3dnow(uint8_t *dst,  \
-   uint8_t *src,  \
-   int dstStride, \
-   int srcStride, \
-   int h) \
-{ \
-int i;\
-int16_t temp[16]; \
-/* quick HACK, XXX FIXME MUST be optimized */ \
-for (i = 0; i  h; i++) { \
-temp[ 0] = (src[ 0] + src[ 1]) * 20 - (src[ 0] + src[ 2]) * 6 +   \
-   (src[ 1] + src[ 3]) *  3 - (src[ 2] + src[ 4]);\
-temp[ 1] = (src[ 1] + src[ 2]) * 20 - (src[ 0] + src[ 3]) * 6 +   \
-   (src[ 0] + src[ 4]) *  3 - (src[ 1] + src[ 5]);\
-temp[ 2] = (src[ 2] + src[ 3]) * 20 - (src[ 1] + src[ 4]) * 6 +   \
-   (src[ 0] + src[ 5]) *  3 - (src[ 0] + src[ 6]);\
-temp[ 3] = (src[ 3] + src[ 4]) * 20 - (src[ 2] + src[ 5]) * 6 +   \
-   (src[ 1] + src[ 6]) *  3 - (src[ 0] + src[ 7]);\
-temp[ 4] = (src[ 4] + src[ 5]) * 20 - (src[ 3] + src[ 6]) * 6 +   \
-   (src[ 2] + src[ 7]) *  3 - (src[ 1] + src[ 8]);\
-temp[ 5] = (src[ 5] + src[ 6]) * 20 - (src[ 4] + src[ 7]) * 6 +   \
-   (src[ 3] + src[ 8]) *  3 - (src[ 2] + src[ 9]);\
-temp[ 6] = (src[ 6] + src[ 7]) * 20 - (src[ 5] + src[ 8]) * 6 +   \
-   (src[ 4] + src[ 9]) *  3 - 

Re: [libav-devel] [PATCH 2/2] H.264: Convert 8-bit qpel inlined assembly to yasm

2012-10-13 Thread Daniel Kang
On Sat, Oct 13, 2012 at 11:04 AM, Daniel Kang daniel.d.k...@gmail.com wrote:
 ---
  libavcodec/x86/Makefile   |4 +-
  libavcodec/x86/dsputil.asm|  222 +++
  libavcodec/x86/dsputil_avg_template.c |  136 +---
  libavcodec/x86/dsputil_mmx.c  |  105 +--
  libavcodec/x86/h264_qpel.c| 1138 
 +
  libavcodec/x86/h264_qpel_8bit.asm |  833 
  6 files changed, 1239 insertions(+), 1199 deletions(-)
  create mode 100644 libavcodec/x86/h264_qpel_8bit.asm

This introduces many unused function warnings, but the functions are
being called...
___
libav-devel mailing list
libav-devel@libav.org
https://lists.libav.org/mailman/listinfo/libav-devel


Re: [libav-devel] [PATCH 2/2] H.264: Convert 8-bit qpel inlined assembly to yasm

2012-10-13 Thread Daniel Kang
On Sat, Oct 13, 2012 at 11:31 AM, Måns Rullgård m...@mansr.com wrote:
 Daniel Kang daniel.d.k...@gmail.com writes:

 On Sat, Oct 13, 2012 at 11:04 AM, Daniel Kang daniel.d.k...@gmail.com 
 wrote:
 ---
  libavcodec/x86/Makefile   |4 +-
  libavcodec/x86/dsputil.asm|  222 +++
  libavcodec/x86/dsputil_avg_template.c |  136 +---
  libavcodec/x86/dsputil_mmx.c  |  105 +--
  libavcodec/x86/h264_qpel.c| 1138 
 +
  libavcodec/x86/h264_qpel_8bit.asm |  833 
  6 files changed, 1239 insertions(+), 1199 deletions(-)
  create mode 100644 libavcodec/x86/h264_qpel_8bit.asm

 This introduces many unused function warnings, but the functions are
 being called...

 Obviously they are not.  GCC does not simply make things like that up.

If I add an exit in the appropriate places, it exits.
___
libav-devel mailing list
libav-devel@libav.org
https://lists.libav.org/mailman/listinfo/libav-devel


Re: [libav-devel] [PATCH] H.264: Convert 8-bit qpel inlined assembly to yasm

2012-08-23 Thread Daniel Kang
On Wed, Aug 22, 2012 at 11:30 PM, Loren Merritt lor...@u.washington.edu wrote:

 On Wed, 22 Aug 2012, daniel.d.k...@gmail.com wrote:

  +; void pixels8_l2_mmx2(uint8_t *dst, uint8_t *src1, uint8_t *src2, int
  dstStride, int src1Stride, int h)
  +%macro PIXELS8_L2 1
  +%define OP op_%1
  +cglobal %1_pixels8_l2, 6,6
  +test   r5d, 1
  +je .loop
  +movam0, [r1]
  +movam1, [r2]
  +add r1, r4
  +add r2, 8
  +pavgb   m0, m1
  +OP  m0, [r0]
  +add r0, r3
  +decr5d
  +.loop:
  +movam0, [r1]
  +add r1, r4
  +movam1, [r1]
  +add r1, r4
  +pavgb   m0, [r2]
  +pavgb   m1, [r2+8]
  +OP  m0, [r0]
  +add r0, r3
  +OP  m1, [r0]
  +add r0, r3
  +movam0, [r1]
  +add r1, r4
  +movam1, [r1]
  +add r1, r4
  +pavgb   m0, [r2+16]
  +pavgb   m1, [r2+24]
  +OP  m0, [r0]
  +add r0, r3
  +OP  m1, [r0]
  +add r0, r3
  +add r2, 32
  +subr5d, 4
  +jne .loop
  +REP_RET
  +%endmacro

 More adds than necessary. Use [r1+r4].

Fixed locally.

  +%macro QPEL4_H_LOWPASS_OP 1
  +cglobal %1_h264_qpel4_h_lowpass, 4,5 ; dst, src, dstStride, srcStride
  +%define OP op_%1h

 I don't think this define clarifies anything, and it's only used once or
 twice in each function.

What do you suggest I do instead?

  +%macro QPEL8_H_LOWPASS_OP_XMM 1
  +%define OP op_%1h
  +cglobal %1_h264_qpel8_h_lowpass, 4,5,7 ; dst, src, dstStride, srcStride
  +mov  r4d,  8
  +pxor  m7, m7
  +mova  m6, [pw_5]
  +.loop:
  +lddqu m1, [r1-2]
  +mova  m0, m1
  +punpckhbw m1, m7
  +punpcklbw m0, m7
  +mova  m2, m1
  +mova  m3, m1
  +mova  m4, m1
  +mova  m5, m1
  +palignr   m4, m0, 2
  +palignr   m3, m0, 4
  +palignr   m2, m0, 6
  +palignr   m1, m0, 8
  +palignr   m5, m0, 10
  +paddw m0, m5
  +paddw m2, m3
  +paddw m1, m4
  +psllw m2, 2
  +psubw m2, m1
  +paddw m0, [pw_16]
  +pmullwm2, m6
  +paddw m2, m0
  +psraw m2, 5
  +packuswb  m2, m2
  +OPm2, [r0], m4
  +add   r1, r3
  +add   r0, r2
  +dec r4d
  +jne .loop
  +REP_RET
  +%endmacro
  +
  +INIT_XMM ssse3
  +QPEL8_H_LOWPASS_OP_XMM put
  +QPEL8_H_LOWPASS_OP_XMM avg

 There aren't any cpus that have both lddqu and ssse3. Use movu instead,
 since that's what lddqu actually does on everything other than pentium4.

Fixed locally.
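
For readers following the macro above: it implements the standard H.264
6-tap luma half-pel filter. A scalar reference of the same operation (a
sketch; the function name, width/height parameters and the written-out
clipping are for illustration and are not part of the patch):

#include <stdint.h>

static void h264_qpel_h_lowpass_ref(uint8_t *dst, const uint8_t *src,
                                    int dst_stride, int src_stride,
                                    int w, int h)
{
    for (int y = 0; y < h; y++) {
        for (int x = 0; x < w; x++) {
            int v = 20 * (src[x]     + src[x + 1])
                  -  5 * (src[x - 1] + src[x + 2])
                  +       (src[x - 2] + src[x + 3]);
            v = (v + 16) >> 5;                       /* round and shift */
            dst[x] = v < 0 ? 0 : v > 255 ? 255 : v;  /* clip to 8 bit */
        }
        dst += dst_stride;
        src += src_stride;
    }
}
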
___
libav-devel mailing list
libav-devel@libav.org
https://lists.libav.org/mailman/listinfo/libav-devel


Re: [libav-devel] [PATCH] H.264: Convert 8-bit qpel inlined assembly to yasm

2012-08-23 Thread Daniel Kang
On Thu, Aug 23, 2012 at 12:22 PM, Derek Buitenhuis
derek.buitenh...@gmail.com wrote:

 On 22/08/2012 10:56 PM, daniel.d.k...@gmail.com wrote:
  From: Daniel Kang daniel.d.k...@gmail.com
 
  ---
   libavcodec/x86/Makefile   |3 +-
   libavcodec/x86/dsputil.asm|  138 
   libavcodec/x86/dsputil_mmx.c  |   79 +-
   libavcodec/x86/dsputil_mmx_avg_template.c |8 +-
   libavcodec/x86/h264_qpel.asm  |  849 ++
   libavcodec/x86/h264_qpel_mmx.c| 1107
  -
   6 files changed, 1149 insertions(+), 1035 deletions(-)
   create mode 100644 libavcodec/x86/h264_qpel.asm

 Doesn't this break !HAVE_INLINE_ASM? My tests indicate it does...

No idea. There's no way to configure !HAVE_INLINE_ASM (before you say
disable in config.h, you also have to disable HAVE_EBX, blah blah).

Get me a system with VS or a configure option and I'll fix the breakage.

 A little birdie told me you haven't switched over all funcs to YASM
 versions yet.

Depends on what you mean by that. I have yasm versions for everything needed.
___
libav-devel mailing list
libav-devel@libav.org
https://lists.libav.org/mailman/listinfo/libav-devel


Re: [libav-devel] [PATCH 2/2] H.264: Convert 8-bit qpel inlined assembly to yasm

2012-08-22 Thread Daniel Kang
On Wed, Aug 22, 2012 at 5:01 AM, Diego Biurrun di...@biurrun.de wrote:

 On Tue, Aug 21, 2012 at 10:08:03PM -0500, daniel.d.k...@gmail.com wrote:
  From: Daniel Kang daniel.d.k...@gmail.com
 
  ---
   libavcodec/x86/Makefile   |3 +-
   libavcodec/x86/dsputil.asm|  138 
   libavcodec/x86/dsputil_mmx.c  |   79 +-
   libavcodec/x86/dsputil_mmx_avg_template.c |8 +-
   libavcodec/x86/h264_qpel.asm  |  853 ++
   libavcodec/x86/h264_qpel_mmx.c| 1107
  -
   6 files changed, 1153 insertions(+), 1035 deletions(-)
   create mode 100644 libavcodec/x86/h264_qpel.asm

 What changed?

 Again: Please annotate your patches with the --annotate option of
 git-send-email.

My bad -- I forgot to do this. I fixed the first patch to shut up
compiler warnings.

  --- /dev/null
  +++ b/libavcodec/x86/h264_qpel.asm
  @@ -0,0 +1,853 @@
  +
  +%macro op_avgh 3 ; op_avgh
  +movh   %3, %2
  +pavgb  %1, %3
  +movh   %2, %1
  +%endmacro
  +
  +%macro op_avg 3 ; op_avg
  +pavgb  %1, %2
  +mova   %2, %1
  +%endmacro
  +
  +%macro op_puth 3 ; op_puth
  +movh   %2, %1
  +%endmacro
  +
  +%macro op_put 3 ; op_put
  +mova   %2, %1
  +%endmacro

 The the comments comments look look very very redundant redundant.

Added for debugging and forgot to change. Fixed.

  +%if ARCH_X86_64
  +; Is there a has ssse3 flag?

 Yes of course, why do you ask in a comment buried deep in the code?

Fixed.
___
libav-devel mailing list
libav-devel@libav.org
https://lists.libav.org/mailman/listinfo/libav-devel


Re: [libav-devel] [PATCH 1/2] H264: Remove 3dnow qpel code.

2012-08-22 Thread Daniel Kang
On Wed, Aug 22, 2012 at 9:14 AM, Diego Biurrun di...@biurrun.de wrote:

 On Tue, Aug 21, 2012 at 10:08:02PM -0500, daniel.d.k...@gmail.com wrote:
  From: Daniel Kang daniel.d.k...@gmail.com
 
  The only CPUs that have 3dnow and don't have mmxext are 12 years old.
 
  Moreover, AMD has deprecated 3dnow.
  ---
   libavcodec/x86/dsputil_mmx.c  |  142 
  +
   libavcodec/x86/dsputil_mmx_avg_template.c |8 +-
   libavcodec/x86/h264_qpel_mmx.c|4 -
   3 files changed, 8 insertions(+), 146 deletions(-)

 You have sent this patch before, we have discussed this before and my
 last round of questions remains unanswered.

What questions? I fixed the SKIP_FOR_3DNOW
___
libav-devel mailing list
libav-devel@libav.org
https://lists.libav.org/mailman/listinfo/libav-devel


Re: [libav-devel] [PATCH 1/2] H264: Remove 3dnow qpel code.

2012-08-22 Thread Daniel Kang
On Wed, Aug 22, 2012 at 10:19 AM, Diego Biurrun di...@biurrun.de wrote:
 On Wed, Aug 22, 2012 at 10:15:14AM -0700, Daniel Kang wrote:
 On Wed, Aug 22, 2012 at 9:14 AM, Diego Biurrun di...@biurrun.de wrote:
 
  On Tue, Aug 21, 2012 at 10:08:02PM -0500, daniel.d.k...@gmail.com wrote:
   From: Daniel Kang daniel.d.k...@gmail.com
  
   The only CPUs that have 3dnow and don't have mmxext are 12 years old.
  
   Moreover, AMD has deprecated 3dnow.
   ---
libavcodec/x86/dsputil_mmx.c  |  142 
   +
libavcodec/x86/dsputil_mmx_avg_template.c |8 +-
libavcodec/x86/h264_qpel_mmx.c|4 -
3 files changed, 8 insertions(+), 146 deletions(-)
 
  You have sent this patch before, we have discussed this before and my
  last round of questions remains unanswered.

 What questions? I fixed the SKIP_FOR_3DNOW

 See my mail from July 25th.

Size:
The binary is about 418kb smaller.

Why drop this:
As mentioned in the commit message, 3dnow is deprecated by AMD. I see
no reason to support a deprecated instruction set. Also, it's a
maintenance burden for anyone who makes functional changes to the
code.
___
libav-devel mailing list
libav-devel@libav.org
https://lists.libav.org/mailman/listinfo/libav-devel


Re: [libav-devel] [PATCH 1/2] H264: Remove 3dnow qpel code.

2012-08-22 Thread Daniel Kang
On Wed, Aug 22, 2012 at 4:04 PM, Diego Biurrun di...@biurrun.de wrote:
 On Wed, Aug 22, 2012 at 08:56:46PM +0200, Luca Barbato wrote:
 On 8/22/12 8:45 PM, Måns Rullgård wrote:
 Daniel Kang daniel.d.k...@gmail.com writes:
 
 Why drop this:
 As mentioned in the commit message, 3dnow is deprecated by AMD. I see
 no reason to support a deprecated instruction set. Also, it's a
 maintenance burden for anyone who makes functional changes to the
 code.
 
 It is deprecated on current CPUs, not on the ancient ones predating SSE.

 We should mention which cpu might have a regression in performance
 (yes I can only think only of one) the rest should be fine.

 Do we have mmxext or sse for all functions that we have 3dnow for?

For everything I removed, there's an mmxext counterpart.
___
libav-devel mailing list
libav-devel@libav.org
https://lists.libav.org/mailman/listinfo/libav-devel


Re: [libav-devel] [PATCH 1/4] vc1dec: Fix motion vector scaling for field pictures

2012-08-19 Thread Daniel Kang
On Sun, Aug 19, 2012 at 1:10 PM, Diego Biurrun di...@biurrun.de wrote:

 On Sun, Aug 19, 2012 at 07:00:32PM +0200, Hendrik Leppkes wrote:
  On Sun, Aug 19, 2012 at 6:52 PM, Diego Biurrun di...@biurrun.de wrote:
  
--- a/libavcodec/vc1dec.c
+++ b/libavcodec/vc1dec.c
@@ -1164,177 +1164,57 @@ static av_always_inline void
   get_mvdata_interlaced(VC1Context *v, int *dmv_x,
   
+static void scale_field_mv(VC1Context *v, int16_t *mv, int opp, int 
dir)
 {
+px = (px * scale) >> 8;
+py = (py * scale) >> 8;
+
+if (FFABS(px) < scalezone1_x)
+px = (px * scale1) >> 8;
+else if (px < 0)
+px = ((px * scale2) >> 8) - zone1offset_x;
+ else
+px = ((px * scale2) >> 8) + zone1offset_x;
+
+if (FFABS(py) < scalezone1_y)
+py = (py * scale1) >> 8;
+else if (py < 0)
+py = ((py * scale2) >> 8) - zone1offset_y;
+ else
+py = ((py * scale2) >> 8) + zone1offset_y;
  
   many unnecessary ()
 
  Just because they are not necessary for the syntax does not mean that they
  don't greatly help the readability of the code.

 Sure, but clearly not the case here IMO.  The () around the multiplications
 just clutter the code, which will in any case be evaluated left to right,
 matching reading direction.  Myself I looked twice to see why the () were
 added, only to note that they were added for no reason.

I suggest we paint the bikeshed bright pink.

Please, there's no need to reject a functional patch on the basis of too many ()
___
libav-devel mailing list
libav-devel@libav.org
https://lists.libav.org/mailman/listinfo/libav-devel


Re: [libav-devel] [PATCH 2/3] x86: avcodec: Consistently name all init files

2012-08-15 Thread Daniel Kang
On Wed, Aug 15, 2012 at 2:40 AM, Diego Biurrun di...@biurrun.de wrote:
 On Tue, Aug 14, 2012 at 02:58:52PM +0200, Janne Grunau wrote:
 On 2012-08-13 13:42:57 +0200, Diego Biurrun wrote:
  ---
   libavcodec/x86/Makefile|6 +++---
   libavcodec/x86/{pngdsp-init.c = pngdsp_init.c}|0
   .../x86/{proresdsp-init.c = proresdsp_init.c} |0
   libavcodec/x86/{vp8dsp-init.c = vp8dsp_init.c}|0
   4 files changed, 3 insertions(+), 3 deletions(-)
   rename libavcodec/x86/{pngdsp-init.c = pngdsp_init.c} (100%)
   rename libavcodec/x86/{proresdsp-init.c = proresdsp_init.c} (100%)
   rename libavcodec/x86/{vp8dsp-init.c = vp8dsp_init.c} (100%)

 ok, since you obviously care. do we have a rule in the developer docs to
 use underscores instead of dashes in filenames? I'm just asking because
 it annoys me in the Linux kernel that both are used.

 We don't have a rule.  We could of course add one if you want.  I haven't
 given a general rule much thought yet, but, as you may have guessed,
 inconsistencies between similar files in the same subdirectory irked me.

Can we choose something to prevent potential multiple renames of files?
___
libav-devel mailing list
libav-devel@libav.org
https://lists.libav.org/mailman/listinfo/libav-devel


Re: [libav-devel] [PATCH 2/3] x86: avcodec: Consistently name all init files

2012-08-15 Thread Daniel Kang
On Wed, Aug 15, 2012 at 3:41 PM, Diego Biurrun di...@biurrun.de wrote:
 On Wed, Aug 15, 2012 at 09:47:28AM -0700, Daniel Kang wrote:
 On Wed, Aug 15, 2012 at 2:40 AM, Diego Biurrun di...@biurrun.de wrote:
  On Tue, Aug 14, 2012 at 02:58:52PM +0200, Janne Grunau wrote:
  On 2012-08-13 13:42:57 +0200, Diego Biurrun wrote:
   ---
libavcodec/x86/Makefile|6 +++---
libavcodec/x86/{pngdsp-init.c = pngdsp_init.c}|0
.../x86/{proresdsp-init.c = proresdsp_init.c} |0
libavcodec/x86/{vp8dsp-init.c = vp8dsp_init.c}|0
4 files changed, 3 insertions(+), 3 deletions(-)
rename libavcodec/x86/{pngdsp-init.c = pngdsp_init.c} (100%)
rename libavcodec/x86/{proresdsp-init.c = proresdsp_init.c} (100%)
rename libavcodec/x86/{vp8dsp-init.c = vp8dsp_init.c} (100%)
 
  ok, since you obviously care. do we have a rule in the developer docs to
  use underscores instead of dashes in filenames? I'm just asking because
  it annoys me in the Linux kernel that both are used.
 
  We don't have a rule.  We could of course add one if you want.  I haven't
  given a general rule much thought yet, but, as you may have guessed,
  inconsistencies between similar files in the same subdirectory irked me.

 Can we choose something to prevent potential multiple renames of files?

 Which file is at risk of being renamed multiple times and what
 would be the problem?

I am not worried about a particular file, but about the case where all
files in one directory conform to one standard and we later decide to
adopt a global standard.

I think deciding on a particular format now would save trouble down the road.
___
libav-devel mailing list
libav-devel@libav.org
https://lists.libav.org/mailman/listinfo/libav-devel


Re: [libav-devel] [PATCH 4/5] x86: fix rNmp macros with nasm

2012-08-06 Thread Daniel Kang
On Sun, Aug 5, 2012 at 7:36 PM, Mans Rullgard m...@mansr.com wrote:
 For some reason, nasm requires this.  No harm done to yasm.

 Signed-off-by: Mans Rullgard m...@mansr.com
 ---
  libavutil/x86/x86inc.asm | 4 ++--
  1 file changed, 2 insertions(+), 2 deletions(-)

Has this been synced with x264?
___
libav-devel mailing list
libav-devel@libav.org
https://lists.libav.org/mailman/listinfo/libav-devel


Re: [libav-devel] [PATCH 07/45] x86: mmx2 --- mmxext in asm constructs

2012-08-06 Thread Daniel Kang
On Sun, Aug 5, 2012 at 10:20 AM, Ronald S. Bultje rsbul...@gmail.com wrote:
 Plus, I didn't say it was a good idea, I said I could live with it if
 others want it. Right now, it seems others (i.e. Loren) don't.

FYI, it makes my life harder. Also I agree with Loren.
___
libav-devel mailing list
libav-devel@libav.org
https://lists.libav.org/mailman/listinfo/libav-devel


Re: [libav-devel] [PATCH 2/2] h264: convert 8-bit qpel inlined assembly to yasm

2012-08-04 Thread Daniel Kang
As it turns out, the qpel functions use stuff from dsputil.

I'll just rewrite dsputil while I'm at it, so this will be somewhat delayed.
___
libav-devel mailing list
libav-devel@libav.org
https://lists.libav.org/mailman/listinfo/libav-devel


Re: [libav-devel] [PATCH 1/2] H264: Remove 3dnow qpel code.

2012-08-04 Thread Daniel Kang
On Sat, Aug 4, 2012 at 1:10 PM, Diego Biurrun di...@biurrun.de wrote:

 On Thu, Aug 02, 2012 at 12:30:48AM -0500, daniel.d.k...@gmail.com wrote:
 
  --- a/libavcodec/x86/dsputil_mmx.c
  +++ b/libavcodec/x86/dsputil_mmx.c
  @@ -198,12 +198,14 @@ DECLARE_ALIGNED(16, const double, ff_pd_2)[2] = { 
  2.0, 2.0 };
   #define DEF(x) x ## _3dnow
   #define PAVGB pavgusb
   #define OP_AVG PAVGB
  +#define IS_3DNOW
 
   #include dsputil_mmx_avg_template.c
 
   #undef DEF
   #undef PAVGB
   #undef OP_AVG
  +#undef IS_3DNOW

 IS_3DNOW supposedly stands for - what?
 SKIP_FOR_3DNOW would be a much more sensible name IMO.

Fixed locally. Will resend once I port the rest of the parts of
dsputil I need to.
___
libav-devel mailing list
libav-devel@libav.org
https://lists.libav.org/mailman/listinfo/libav-devel


Re: [libav-devel] [PATCH 2/2] h264: convert 8-bit qpel inlined assembly to yasm

2012-08-01 Thread Daniel Kang
On Wed, Aug 1, 2012 at 1:03 AM, Loren Merritt lor...@u.washington.eduwrote:

 On Wed, 1 Aug 2012, Luca Barbato wrote:

  +%macro OP_MOVH_MMX 3
  +movh   %3, %2
  +pavgb  %1, %3
  +movh   %2, %1
  +%endmacro
  +
  +%macro MOVH_MMX 3
  +movh   %2, %1
  +%endmacro
  +
  +%macro OP_MOV_MMX 3
  +mova   %3, %2
  +pavgb  %1, %3
  +mova   %2, %1

 pavgb %1, %2
 mova  %2, %1
 (Just for the full width one)


Fixed locally.

 +%endmacro
  +
  +%macro MOV_MMX 3
  +mova   %2, %1
  +%endmacro

 It's op_put vs op_avg (or mov vs avg), not mov vs op_mov.
 Plus, naming them put vs avg would allow you to exploit the same put vs
 avg that's already in all the function names, rather than a separate
 %define OP.


Fixed locally.
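
At the pixel level the put/avg split means the following (a C sketch of the
semantics only, not of the macros; names are illustrative): "put" stores the
result, while "avg" rounds the mean with what is already in the destination,
which is exactly what pavgb computes.

#include <stdint.h>

static void op_put_ref(uint8_t *dst, const uint8_t *src, int n)
{
    for (int i = 0; i < n; i++)
        dst[i] = src[i];
}

static void op_avg_ref(uint8_t *dst, const uint8_t *src, int n)
{
    for (int i = 0; i < n; i++)
        dst[i] = (dst[i] + src[i] + 1) >> 1;   /* pavgb rounds upwards */
}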

 +%macro QPEL8OR16_V_LOWPASS_OP 1

 +cglobal %1_h264_qpel8or16_v_lowpass, 5,5,7 ; dst, src, dstStride,
 srcStride, h
  +%if cpuflag(sse2)
  +sub   r1, r3
  +sub   r1, r3
  +%endif
  +pxor  m7, m7
  +movh  m0, [r1]
  +movh  m1, [r1+r3]
  +lea   r1, [r1+2*r3]
  +movh  m2, [r1]
  +movh  m3, [r1+r3]
  +lea   r1, [r1+2*r3]
  +movh  m4, [r1]
  +add   r1, r3
  +punpcklbw m0, m7
  +punpcklbw m1, m7
  +punpcklbw m2, m7
  +punpcklbw m3, m7
  +punpcklbw m4, m7
  +FILT_V
  +FILT_V
  +FILT_V
  +FILT_V
  +FILT_V
  +FILT_V
  +FILT_V
  +FILT_V
  +cmp r4d, 16
  +jne .end
  +FILT_V
  +FILT_V
  +FILT_V
  +FILT_V
  +FILT_V
  +FILT_V
  +FILT_V
  +FILT_V
  +.end:
  +RET
  +%endmacro

 (and other cases of this)
 REP_RET


Fixed locally.

 ... and I'll skip the suggestions for improvement, since Daniel Kang has a

separate branch for that.


Thank you. I'll get to rewriting the qpel eventually.
___
libav-devel mailing list
libav-devel@libav.org
https://lists.libav.org/mailman/listinfo/libav-devel


Re: [libav-devel] [PATCH 2/2] h264: convert 8-bit qpel inlined assembly to yasm

2012-08-01 Thread Daniel Kang
On Wed, Aug 1, 2012 at 5:22 AM, Måns Rullgård m...@mansr.com wrote:

 Daniel Kang daniel.d.k...@gmail.com writes:

  Can I have access to a setup that doesn't have inlined assembly?

 --extra-cflags=-D__asm__=error should make it fail nicely.


This gives me bizarre errors like:

/usr/include/x86_64-linux-gnu/asm/posix_types_64.h:25:14: error: storage
class specified for parameter ‘__kernel_clockid_t’

Is there a way to just disable inlined assembly in libav?
___
libav-devel mailing list
libav-devel@libav.org
https://lists.libav.org/mailman/listinfo/libav-devel


Re: [libav-devel] [PATCH 2/2] H.264: Convert 8-bit qpel inlined assembly to yasm

2012-08-01 Thread Daniel Kang
On Thu, Aug 2, 2012 at 1:30 AM, daniel.d.k...@gmail.com wrote:

 From: Daniel Kang daniel.d.k...@gmail.com

 ---
  libavcodec/x86/Makefile |3 +-
  libavcodec/x86/dsputil_mmx.c|   81 +---
  libavcodec/x86/dsputil_yasm.asm |   42 ++
  libavcodec/x86/h264_qpel.asm|  850 +++
  libavcodec/x86/h264_qpel_mmx.c  |  946
 ---
  5 files changed, 984 insertions(+), 938 deletions(-)
  create mode 100644 libavcodec/x86/h264_qpel.asm


Compiles and passes fate-h264 for me with --disable-yasm and with yasm.
___
libav-devel mailing list
libav-devel@libav.org
https://lists.libav.org/mailman/listinfo/libav-devel


Re: [libav-devel] [PATCH 1/2] h264: remove 3dnow qpel code

2012-07-31 Thread Daniel Kang
On Wed, Aug 1, 2012 at 12:57 AM, Loren Merritt lor...@u.washington.eduwrote:

 On Wed, 1 Aug 2012, Luca Barbato wrote:

  From: Daniel Kang daniel.d.k...@gmail.com
 
  Remove the code to eases porting the other qpel optimizations to
  yasm.
 
  AMD has deprecated 3dnow and the only CPUs that have 3dnow and
  do not have mmxext are 12 years old.

 libavcodec/x86/dsputil_mmx_avg_template.c:58:1: warning:
 `put_pixels4_l2_3dnow' defined but not used
 libavcodec/x86/dsputil_mmx_avg_template.c:229:1: warning:
 `avg_pixels4_l2_3dnow' defined but not used
 libavcodec/x86/dsputil_mmx_avg_template.c:875:1: warning:
 `avg_pixels4_3dnow' defined but not used


Those errors are fairly difficult to fix. Whoever wrote the original
inlined assembly made it very hard to disable only certain functions at
specific CPU types. Which would you prefer, a hack-ish workaround or to
wait until I port the rest over to yasm?
___
libav-devel mailing list
libav-devel@libav.org
https://lists.libav.org/mailman/listinfo/libav-devel


Re: [libav-devel] [PATCH 1/2] h264: remove 3dnow qpel code

2012-07-31 Thread Daniel Kang
On Wed, Aug 1, 2012 at 1:03 AM, Daniel Kang daniel.d.k...@gmail.com wrote:

 On Wed, Aug 1, 2012 at 12:57 AM, Loren Merritt lor...@u.washington.eduwrote:

 libavcodec/x86/dsputil_mmx_avg_template.c:58:1: warning:
 `put_pixels4_l2_3dnow' defined but not used
 libavcodec/x86/dsputil_mmx_avg_template.c:229:1: warning:
 `avg_pixels4_l2_3dnow' defined but not used
 libavcodec/x86/dsputil_mmx_avg_template.c:875:1: warning:
 `avg_pixels4_3dnow' defined but not used


 Those errors are fairly difficult to fix. Whoever wrote the original
 inlined assembly made it very hard to disable only certain functions at
 specific CPU types. Which would you prefer, a hack-ish workaround or to
 wait until I port the rest over to yasm?


I'm thinking something along  the lines of:

diff --git a/libavcodec/x86/dsputil_mmx.c b/libavcodec/x86/dsputil_mmx.c
index 47d99f9..126a0b9 100644
--- a/libavcodec/x86/dsputil_mmx.c
+++ b/libavcodec/x86/dsputil_mmx.c
@@ -198,12 +198,14 @@ DECLARE_ALIGNED(16, const double, ff_pd_2)[2] = {
2.0, 2.0 };
 #define DEF(x) x ## _3dnow
 #define PAVGB pavgusb
 #define OP_AVG PAVGB
+#define IS_3DNOW

 #include dsputil_mmx_avg_template.c

 #undef DEF
 #undef PAVGB
 #undef OP_AVG
+#undef IS_3DNOW

 /***/
 /* MMX2 specific */
diff --git a/libavcodec/x86/dsputil_mmx_avg_template.c
b/libavcodec/x86/dsputil_mmx_avg_template.c
index 8b116b7..29d0e51 100644
--- a/libavcodec/x86/dsputil_mmx_avg_template.c
+++ b/libavcodec/x86/dsputil_mmx_avg_template.c
@@ -55,6 +55,7 @@ static void DEF(put_pixels8_x2)(uint8_t *block, const
uint8_t *pixels, int line_
 :%REG_a, memory);
 }

+#ifndef IS_3DNOW
 static void DEF(put_pixels4_l2)(uint8_t *dst, uint8_t *src1, uint8_t
*src2, int dstStride, int src1Stride, int h)
 {
 __asm__ volatile(
@@ -104,7 +105,7 @@ static void DEF(put_pixels4_l2)(uint8_t *dst, uint8_t
*src1, uint8_t *src2, int
 :S((x86_reg)src1Stride), D((x86_reg)dstStride)
 :memory);
 }
-
+#endif

 static void DEF(put_pixels8_l2)(uint8_t *dst, uint8_t *src1, uint8_t
*src2, int dstStride, int src1Stride, int h)
 {
@@ -226,6 +227,7 @@ static void DEF(put_no_rnd_pixels8_l2)(uint8_t *dst,
uint8_t *src1, uint8_t *src
 :memory);*/
 }

+#ifndef IS_3DNOW
 static void DEF(avg_pixels4_l2)(uint8_t *dst, uint8_t *src1, uint8_t
*src2, int dstStride, int src1Stride, int h)
 {
 __asm__ volatile(
@@ -276,7 +278,7 @@ static void DEF(avg_pixels4_l2)(uint8_t *dst, uint8_t
*src1, uint8_t *src2, int
 :S((x86_reg)src1Stride), D((x86_reg)dstStride)
 :memory);
 }
-
+#endif

 static void DEF(avg_pixels8_l2)(uint8_t *dst, uint8_t *src1, uint8_t
*src2, int dstStride, int src1Stride, int h)
 {
@@ -872,6 +874,7 @@ static void DEF(avg_pixels8_xy2)(uint8_t *block, const
uint8_t *pixels, int line
 :%REG_a,  memory);
 }

+#ifndef IS_3DNOW
 static void DEF(avg_pixels4)(uint8_t *block, const uint8_t *pixels, int
line_size, int h)
 {
 do {
@@ -896,6 +899,7 @@ static void DEF(avg_pixels4)(uint8_t *block, const
uint8_t *pixels, int line_siz
 h -= 4;
 } while(h  0);
 }
+#endif

 //FIXME the following could be optimized too ...
 static void DEF(put_no_rnd_pixels16_x2)(uint8_t *block, const uint8_t
*pixels, int line_size, int h){
___
libav-devel mailing list
libav-devel@libav.org
https://lists.libav.org/mailman/listinfo/libav-devel


Re: [libav-devel] [PATCH 2/2] h264: convert 8-bit qpel inlined assembly to yasm

2012-07-31 Thread Daniel Kang
On Wed, Aug 1, 2012 at 12:28 AM, Ronald S. Bultje rsbul...@gmail.comwrote:

 Hi,

 On Tue, Jul 31, 2012 at 9:12 PM, Luca Barbato lu_z...@gentoo.org wrote:
  @@ -2879,10 +2879,9 @@ static void dsputil_init_sse2(DSPContext *c,
 AVCodecContext *avctx,
 int mm_flags)
   {
   const int bit_depth  = avctx->bits_per_raw_sample;
  -
  -#if HAVE_INLINE_ASM
   const int high_bit_depth = bit_depth > 8;
  
  +#if HAVE_INLINE_ASM
   if (!(mm_flags & AV_CPU_FLAG_SSE2SLOW)) {
   // these functions are slower than mmx on AMD, but faster on
 Intel
   if (!high_bit_depth) {
  @@ -2893,7 +2892,9 @@ static void dsputil_init_sse2(DSPContext *c,
 AVCodecContext *avctx,
   H264_QPEL_FUNCS(0, 0, sse2);
   }
   }
  +#endif /* HAVE_INLINE_ASM */

 The H264_QPEL_FUNCS() call should go outside HAVE_INLINE_ASM.


Fixed locally.


 +;*

 +;* MMX/SSE2/SSSE3-optimized H.264 qpel code
 
 +;*
  +;* Copyright (C) 2012 Daniel Kang
  +;*
  +;* Authors: Daniel Kang daniel.d.k...@gmail.com

 You told me you based this substantially on the existing
 implementation; if that's the case, you should acknowledge the
 original author(s) of that code also.


Fixed locally.


Can I have access to a setup that doesn't have inlined assembly?
___
libav-devel mailing list
libav-devel@libav.org
https://lists.libav.org/mailman/listinfo/libav-devel


Re: [libav-devel] [PATCH] H.264: Convert 8-bit qpel inlined assembly to yasm

2012-07-29 Thread Daniel Kang
0.1% slower, probably due to alignment issues and a very small amount of
calling overhead.
___
libav-devel mailing list
libav-devel@libav.org
https://lists.libav.org/mailman/listinfo/libav-devel


[libav-devel] [PATCH] H264: Remove 3dnow qpel code.

2012-07-23 Thread Daniel Kang
From: Daniel Kang daniel.d.k...@gmail.com

The only CPUs that have 3dnow and don't have mmxext are 12 years old.
---
 libavcodec/x86/dsputil_mmx.c   |9 -
 libavcodec/x86/h264_qpel_mmx.c |4 
 2 files changed, 0 insertions(+), 13 deletions(-)

diff --git a/libavcodec/x86/dsputil_mmx.c b/libavcodec/x86/dsputil_mmx.c
index 5eb4a24..f7dbb0b 100644
--- a/libavcodec/x86/dsputil_mmx.c
+++ b/libavcodec/x86/dsputil_mmx.c
@@ -2783,15 +2783,6 @@ static void dsputil_init_3dnow(DSPContext *c, 
AVCodecContext *avctx,
 SET_QPEL_FUNCS(avg_qpel,0, 16, 3dnow, );
 SET_QPEL_FUNCS(avg_qpel,1,  8, 3dnow, );
 
-if (!high_bit_depth) {
-SET_QPEL_FUNCS(put_h264_qpel, 0, 16, 3dnow, );
-SET_QPEL_FUNCS(put_h264_qpel, 1,  8, 3dnow, );
-SET_QPEL_FUNCS(put_h264_qpel, 2,  4, 3dnow, );
-SET_QPEL_FUNCS(avg_h264_qpel, 0, 16, 3dnow, );
-SET_QPEL_FUNCS(avg_h264_qpel, 1,  8, 3dnow, );
-SET_QPEL_FUNCS(avg_h264_qpel, 2,  4, 3dnow, );
-}
-
 SET_QPEL_FUNCS(put_2tap_qpel, 0, 16, 3dnow, );
 SET_QPEL_FUNCS(put_2tap_qpel, 1,  8, 3dnow, );
 SET_QPEL_FUNCS(avg_2tap_qpel, 0, 16, 3dnow, );
diff --git a/libavcodec/x86/h264_qpel_mmx.c b/libavcodec/x86/h264_qpel_mmx.c
index 85ae07e..cd7ea99 100644
--- a/libavcodec/x86/h264_qpel_mmx.c
+++ b/libavcodec/x86/h264_qpel_mmx.c
@@ -1161,9 +1161,6 @@ QPEL(put_, 16,XMM, 16)\
 QPEL(avg_, 8, XMM, 16)\
 QPEL(avg_, 16,XMM, 16)\
 
-#define PAVGB pavgusb
-QPEL_H264(put_,   PUT_OP, 3dnow)
-QPEL_H264(avg_, AVG_3DNOW_OP, 3dnow)
 #undef PAVGB
 #define PAVGB pavgb
 QPEL_H264(put_,   PUT_OP, mmx2)
@@ -1182,7 +1179,6 @@ QPEL_H264_HV_XMM(avg_,  AVG_MMX2_OP, ssse3)
 #endif
 #undef PAVGB
 
-H264_MC_4816(3dnow)
 H264_MC_4816(mmx2)
 H264_MC_816(H264_MC_V, sse2)
 H264_MC_816(H264_MC_HV, sse2)
-- 
1.7.7.3

___
libav-devel mailing list
libav-devel@libav.org
https://lists.libav.org/mailman/listinfo/libav-devel


Re: [libav-devel] [PATCH] H264: Remove 3dnow qpel code.

2012-07-23 Thread Daniel Kang
On Mon, Jul 23, 2012 at 5:21 PM, Diego Biurrun di...@biurrun.de wrote:

 On Mon, Jul 23, 2012 at 05:12:23PM -0700, Daniel Kang wrote:
  From: Daniel Kang daniel.d.k...@gmail.com
 
  The only CPUs that have 3dnow and don't have mmxext are 12 years old.
  ---
   libavcodec/x86/dsputil_mmx.c   |9 -
   libavcodec/x86/h264_qpel_mmx.c |4 
   2 files changed, 0 insertions(+), 13 deletions(-)

 What sort of maintenance burden does this relieve us from?
 I'm writing this mail on a system fitting the description
 you mention, my trusty old K6-III.


1. Decreases binary size.
2. We don't support Windows ME (12 years old), I don't see a reason to keep
this cruft.
3. 3dnow becomes a pain when I'm trying to port code to yasm.
4. You can probably decode 260p H.264 with a K6-III. Who seriously would
use this?
___
libav-devel mailing list
libav-devel@libav.org
https://lists.libav.org/mailman/listinfo/libav-devel


Re: [libav-devel] [PATCH] VP8: Implement sliced threading.

2012-07-13 Thread Daniel Kang
On Thu, Jul 12, 2012 at 1:07 PM, Daniel Kang daniel.d.k...@gmail.comwrote:

 Testing gives 25-30% gain on HD clips with two threads and up to 50% gain
 with eight threads.

 Sliced threading uses more memory than single or frame threading.
 ---
  libavcodec/vp8.c |  514
 ++
  libavcodec/vp8.h |   63 ---
  2 files changed, 399 insertions(+), 178 deletions(-)


This should have addressed all of Luca's and Diego's comments. Ping?
___
libav-devel mailing list
libav-devel@libav.org
https://lists.libav.org/mailman/listinfo/libav-devel


Re: [libav-devel] [PATCH] VP8: Implement sliced threading.

2012-07-13 Thread Daniel Kang
On Fri, Jul 13, 2012 at 10:34 AM, Luca Barbato lu_z...@gentoo.org wrote:

 This should have addressed all of Luca's and Diego's comments. Ping

 Looks fine for me, shall we push it in 4 hours?


If no one else has objections, fine by me.
___
libav-devel mailing list
libav-devel@libav.org
https://lists.libav.org/mailman/listinfo/libav-devel


[libav-devel] [PATCH] VP8: Implement sliced threading.

2012-07-12 Thread Daniel Kang
Testing gives 25-30% gain on HD clips with two threads and up to 50% gain with 
eight threads.

Sliced threading uses more memory than single or frame threading.
---
 libavcodec/vp8.c |  514 ++
 libavcodec/vp8.h |   63 ---
 2 files changed, 399 insertions(+), 178 deletions(-)
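
The core of the slice synchronization in this patch is one lock/condition
pair per thread plus a published macroblock position. A minimal sketch of
that idea (type, field and function names here are illustrative, not the
patch's): a thread working on row N blocks until the thread on row N-1 has
advanced a couple of macroblocks past N's current x position.

#include <pthread.h>

typedef struct RowSync {
    pthread_mutex_t lock;
    pthread_cond_t  cond;
    int             mb_pos;          /* progress, e.g. (mb_y << 16) | mb_x */
} RowSync;

static void wait_for_row_above(RowSync *above, int needed_pos)
{
    pthread_mutex_lock(&above->lock);
    while (above->mb_pos < needed_pos)
        pthread_cond_wait(&above->cond, &above->lock);
    pthread_mutex_unlock(&above->lock);
}

static void publish_progress(RowSync *self, int pos)
{
    pthread_mutex_lock(&self->lock);
    self->mb_pos = pos;
    pthread_cond_broadcast(&self->cond);
    pthread_mutex_unlock(&self->lock);
}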

diff --git a/libavcodec/vp8.c b/libavcodec/vp8.c
index 2181976..936c16a 100644
--- a/libavcodec/vp8.c
+++ b/libavcodec/vp8.c
@@ -4,6 +4,7 @@
  * Copyright (C) 2010 David Conrad
  * Copyright (C) 2010 Ronald S. Bultje
  * Copyright (C) 2010 Jason Garrett-Glaser
+ * Copyright (C) 2012 Daniel Kang
  *
  * This file is part of Libav.
  *
@@ -30,17 +31,28 @@
 #include rectangle.h
 #include thread.h
 
+#if HAVE_PTHREADS
+#include pthread.h
+#elif HAVE_W32THREADS
+#include w32pthreads.h
+#endif
+
 #if ARCH_ARM
 #   include arm/vp8.h
 #endif
 
 static void free_buffers(VP8Context *s)
 {
+int i;
+if (s-thread_data)
+for (i = 0; i  MAX_THREADS; i++) {
+av_freep(s-thread_data[i].filter_strength);
+av_freep(s-thread_data[i].edge_emu_buffer);
+}
+av_freep(s-thread_data);
 av_freep(s-macroblocks_base);
-av_freep(s-filter_strength);
 av_freep(s-intra4x4_pred_mode_top);
 av_freep(s-top_nnz);
-av_freep(s-edge_emu_buffer);
 av_freep(s-top_border);
 
 s-macroblocks = NULL;
@@ -108,6 +120,9 @@ static void vp8_decode_flush(AVCodecContext *avctx)
 
 static int update_dimensions(VP8Context *s, int width, int height)
 {
+AVCodecContext *avctx = s-avctx;
+int i;
+
 if (width  != s-avctx-width ||
 height != s-avctx-height) {
 if (av_image_check_size(width, height, 0, s-avctx))
@@ -121,14 +136,25 @@ static int update_dimensions(VP8Context *s, int width, 
int height)
 s-mb_width  = (s-avctx-coded_width +15) / 16;
 s-mb_height = (s-avctx-coded_height+15) / 16;
 
-s-macroblocks_base= 
av_mallocz((s-mb_width+s-mb_height*2+1)*sizeof(*s-macroblocks));
-s-filter_strength = 
av_mallocz(s-mb_width*sizeof(*s-filter_strength));
-s-intra4x4_pred_mode_top  = av_mallocz(s-mb_width*4);
-s-top_nnz = av_mallocz(s-mb_width*sizeof(*s-top_nnz));
-s-top_border  = 
av_mallocz((s-mb_width+1)*sizeof(*s-top_border));
+s-mb_layout = (avctx-active_thread_type == FF_THREAD_SLICE)  
(FFMIN(s-num_coeff_partitions, avctx-thread_count)  1);
+if (s-mb_layout == 0) { // Frame threading and one thread
+s-macroblocks_base   = 
av_mallocz((s-mb_width+s-mb_height*2+1)*sizeof(*s-macroblocks));
+s-intra4x4_pred_mode_top = av_mallocz(s-mb_width*4);
+}
+else // Sliced threading
+s-macroblocks_base   = 
av_mallocz((s-mb_width+2)*(s-mb_height+2)*sizeof(*s-macroblocks));
+s-top_nnz= 
av_mallocz(s-mb_width*sizeof(*s-top_nnz));
+s-top_border = 
av_mallocz((s-mb_width+1)*sizeof(*s-top_border));
+s-thread_data= 
av_mallocz(MAX_THREADS*sizeof(VP8ThreadData));
 
-if (!s-macroblocks_base || !s-filter_strength || 
!s-intra4x4_pred_mode_top ||
-!s-top_nnz || !s-top_border)
+for (i = 0; i  MAX_THREADS; i++) {
+s-thread_data[i].filter_strength = 
av_mallocz(s-mb_width*sizeof(*s-thread_data[0].filter_strength));
+pthread_mutex_init(s-thread_data[i].lock, NULL);
+pthread_cond_init(s-thread_data[i].cond, NULL);
+}
+
+if (!s-macroblocks_base || !s-top_nnz || !s-top_border ||
+(!s-intra4x4_pred_mode_top  s-mb_layout == 0))
 return AVERROR(ENOMEM);
 
 s-macroblocks= s-macroblocks_base + 1;
@@ -332,12 +358,6 @@ static int decode_frame_header(VP8Context *s, const 
uint8_t *buf, int buf_size)
 memset(s-segmentation, 0, sizeof(s-segmentation));
 }
 
-if (!s-macroblocks_base || /* first frame */
-width != s-avctx-width || height != s-avctx-height) {
-if ((ret = update_dimensions(s, width, height))  0)
-return ret;
-}
-
 ff_vp56_init_range_decoder(c, buf, header_size);
 buf  += header_size;
 buf_size -= header_size;
@@ -366,6 +386,12 @@ static int decode_frame_header(VP8Context *s, const 
uint8_t *buf, int buf_size)
 return AVERROR_INVALIDDATA;
 }
 
+if (!s-macroblocks_base || /* first frame */
+width != s-avctx-width || height != s-avctx-height) {
+if ((ret = update_dimensions(s, width, height))  0)
+return ret;
+}
+
 get_quants(s);
 
 if (!s-keyframe) {
@@ -468,19 +494,26 @@ const uint8_t *get_submv_prob(uint32_t left, uint32_t top)
  * @returns the number of motion vectors parsed (2, 4 or 16)
  */
 static av_always_inline
-int decode_splitmvs(VP8Context *s, VP56RangeCoder *c, VP8Macroblock *mb)
+int decode_splitmvs(VP8Context *s, VP56RangeCoder *c, VP8Macroblock *mb, int 
layout)
 {
 int part_idx;
 int n, num;
-VP8Macroblock *top_mb  = mb[2];
+VP8Macroblock *top_mb

Re: [libav-devel] [PATCH 3/3] VP8: Implement sliced threading.

2012-07-12 Thread Daniel Kang
On Thu, Jul 12, 2012 at 12:27 PM, Måns Rullgård m...@mansr.com wrote:

 Luca Barbato lu_z...@gentoo.org writes:

  On 07/12/2012 09:42 AM, Måns Rullgård wrote:
  Are you still increasing the memory usage by a huge amount?  If so, I'm
  a bit concerned about how that will affect performance on systems with
  relatively small caches.
 
  Not sure if those systems would use slice threading, single thread and
  frame threading should keep the previous memory usage.

 It was my understanding that some version of this patch increased memory
 usage for all modes.  If that is no longer the case, great.


This is no longer the case.
___
libav-devel mailing list
libav-devel@libav.org
https://lists.libav.org/mailman/listinfo/libav-devel


[libav-devel] [PATCH] VP8: Implement sliced threading.

2012-07-12 Thread Daniel Kang
Testing gives 25-30% gain on HD clips with two threads and up to 50% gain with 
eight threads.

Sliced threading uses more memory than single or frame threading.
---
 libavcodec/vp8.c |  514 ++
 libavcodec/vp8.h |   63 ---
 2 files changed, 399 insertions(+), 178 deletions(-)

diff --git a/libavcodec/vp8.c b/libavcodec/vp8.c
index 2181976..ee954fc 100644
--- a/libavcodec/vp8.c
+++ b/libavcodec/vp8.c
@@ -4,6 +4,7 @@
  * Copyright (C) 2010 David Conrad
  * Copyright (C) 2010 Ronald S. Bultje
  * Copyright (C) 2010 Jason Garrett-Glaser
+ * Copyright (C) 2012 Daniel Kang
  *
  * This file is part of Libav.
  *
@@ -30,17 +31,28 @@
 #include rectangle.h
 #include thread.h
 
+#if HAVE_PTHREADS
+#include pthread.h
+#elif HAVE_W32THREADS
+#include w32pthreads.h
+#endif
+
 #if ARCH_ARM
 #   include arm/vp8.h
 #endif
 
 static void free_buffers(VP8Context *s)
 {
+int i;
+if (s-thread_data)
+for (i = 0; i  MAX_THREADS; i++) {
+av_freep(s-thread_data[i].filter_strength);
+av_freep(s-thread_data[i].edge_emu_buffer);
+}
+av_freep(s-thread_data);
 av_freep(s-macroblocks_base);
-av_freep(s-filter_strength);
 av_freep(s-intra4x4_pred_mode_top);
 av_freep(s-top_nnz);
-av_freep(s-edge_emu_buffer);
 av_freep(s-top_border);
 
 s-macroblocks = NULL;
@@ -108,6 +120,9 @@ static void vp8_decode_flush(AVCodecContext *avctx)
 
 static int update_dimensions(VP8Context *s, int width, int height)
 {
+AVCodecContext *avctx = s-avctx;
+int i;
+
 if (width  != s-avctx-width ||
 height != s-avctx-height) {
 if (av_image_check_size(width, height, 0, s-avctx))
@@ -121,14 +136,25 @@ static int update_dimensions(VP8Context *s, int width, 
int height)
 s-mb_width  = (s-avctx-coded_width +15) / 16;
 s-mb_height = (s-avctx-coded_height+15) / 16;
 
-s-macroblocks_base= 
av_mallocz((s-mb_width+s-mb_height*2+1)*sizeof(*s-macroblocks));
-s-filter_strength = 
av_mallocz(s-mb_width*sizeof(*s-filter_strength));
-s-intra4x4_pred_mode_top  = av_mallocz(s-mb_width*4);
-s-top_nnz = av_mallocz(s-mb_width*sizeof(*s-top_nnz));
-s-top_border  = 
av_mallocz((s-mb_width+1)*sizeof(*s-top_border));
+s-mb_layout = (avctx-active_thread_type == FF_THREAD_SLICE)  
(FFMIN(s-num_coeff_partitions, avctx-thread_count)  1);
+if (!s-mb_layout) { // Frame threading and one thread
+s-macroblocks_base   = 
av_mallocz((s-mb_width+s-mb_height*2+1)*sizeof(*s-macroblocks));
+s-intra4x4_pred_mode_top = av_mallocz(s-mb_width*4);
+}
+else // Sliced threading
+s-macroblocks_base   = 
av_mallocz((s-mb_width+2)*(s-mb_height+2)*sizeof(*s-macroblocks));
+s-top_nnz= 
av_mallocz(s-mb_width*sizeof(*s-top_nnz));
+s-top_border = 
av_mallocz((s-mb_width+1)*sizeof(*s-top_border));
+s-thread_data= 
av_mallocz(MAX_THREADS*sizeof(VP8ThreadData));
 
-if (!s-macroblocks_base || !s-filter_strength || 
!s-intra4x4_pred_mode_top ||
-!s-top_nnz || !s-top_border)
+for (i = 0; i  MAX_THREADS; i++) {
+s-thread_data[i].filter_strength = 
av_mallocz(s-mb_width*sizeof(*s-thread_data[0].filter_strength));
+pthread_mutex_init(s-thread_data[i].lock, NULL);
+pthread_cond_init(s-thread_data[i].cond, NULL);
+}
+
+if (!s-macroblocks_base || !s-top_nnz || !s-top_border ||
+(!s-intra4x4_pred_mode_top  !s-mb_layout))
 return AVERROR(ENOMEM);
 
 s-macroblocks= s-macroblocks_base + 1;
@@ -332,12 +358,6 @@ static int decode_frame_header(VP8Context *s, const 
uint8_t *buf, int buf_size)
 memset(s-segmentation, 0, sizeof(s-segmentation));
 }
 
-if (!s-macroblocks_base || /* first frame */
-width != s-avctx-width || height != s-avctx-height) {
-if ((ret = update_dimensions(s, width, height))  0)
-return ret;
-}
-
 ff_vp56_init_range_decoder(c, buf, header_size);
 buf  += header_size;
 buf_size -= header_size;
@@ -366,6 +386,12 @@ static int decode_frame_header(VP8Context *s, const 
uint8_t *buf, int buf_size)
 return AVERROR_INVALIDDATA;
 }
 
+if (!s-macroblocks_base || /* first frame */
+width != s-avctx-width || height != s-avctx-height) {
+if ((ret = update_dimensions(s, width, height))  0)
+return ret;
+}
+
 get_quants(s);
 
 if (!s-keyframe) {
@@ -468,19 +494,26 @@ const uint8_t *get_submv_prob(uint32_t left, uint32_t top)
  * @returns the number of motion vectors parsed (2, 4 or 16)
  */
 static av_always_inline
-int decode_splitmvs(VP8Context *s, VP56RangeCoder *c, VP8Macroblock *mb)
+int decode_splitmvs(VP8Context *s, VP56RangeCoder *c, VP8Macroblock *mb, int 
layout)
 {
 int part_idx;
 int n, num;
-VP8Macroblock *top_mb  = mb[2];
+VP8Macroblock *top_mb

[libav-devel] [PATCH 1/3] VP8: Refactor decoding a single mb_row.

2012-07-11 Thread Daniel Kang
This is in preparation for sliced threading.
---
 libavcodec/vp8.c |  164 --
 1 files changed, 86 insertions(+), 78 deletions(-)

diff --git a/libavcodec/vp8.c b/libavcodec/vp8.c
index 94200f6..8ebc445 100644
--- a/libavcodec/vp8.c
+++ b/libavcodec/vp8.c
@@ -1574,11 +1574,95 @@ static void release_queued_segmaps(VP8Context *s, int 
is_close)
 s-maps_are_invalid = 0;
 }
 
+#define MARGIN (16 << 2)
+static void vp8_decode_mb_row(AVCodecContext *avctx, AVFrame *curframe,
+  AVFrame *prev_frame, int mb_y)
+{
+VP8Context *s = avctx-priv_data;
+VP56RangeCoder *c = s-coeff_partition[mb_y  
(s-num_coeff_partitions-1)];
+VP8Macroblock *mb = s-macroblocks + (s-mb_height - mb_y - 1)*2;
+int i, y, mb_x, mb_xy = mb_y*s-mb_width;
+uint8_t *dst[3] = {
+curframe-data[0] + 16*mb_y*s-linesize,
+curframe-data[1] +  8*mb_y*s-uvlinesize,
+curframe-data[2] +  8*mb_y*s-uvlinesize
+};
+
+memset(mb - 1, 0, sizeof(*mb));   // zero left macroblock
+memset(s-left_nnz, 0, sizeof(s-left_nnz));
+AV_WN32A(s-intra4x4_pred_mode_left, DC_PRED*0x01010101);
+
+// left edge of 129 for intra prediction
+if (!(avctx-flags  CODEC_FLAG_EMU_EDGE)) {
+for (i = 0; i  3; i++)
+for (y = 0; y  16!!i; y++)
+dst[i][y*curframe-linesize[i]-1] = 129;
+if (mb_y == 1) // top left edge is also 129
+s-top_border[0][15] = s-top_border[0][23] = s-top_border[0][31] 
= 129;
+}
+
+s-mv_min.x = -MARGIN;
+s-mv_max.x = ((s-mb_width  - 1)  6) + MARGIN;
+
+for (mb_x = 0; mb_x  s-mb_width; mb_x++, mb_xy++, mb++) {
+/* Prefetch the current frame, 4 MBs ahead */
+s-dsp.prefetch(dst[0] + (mb_x3)*4*s-linesize + 64, s-linesize, 4);
+s-dsp.prefetch(dst[1] + (mb_x7)*s-uvlinesize + 64, dst[2] - dst[1], 
2);
+
+decode_mb_mode(s, mb, mb_x, mb_y, curframe-ref_index[0] + mb_xy,
+   prev_frame  prev_frame-ref_index[0] ? 
prev_frame-ref_index[0] + mb_xy : NULL);
+
+prefetch_motion(s, mb, mb_x, mb_y, mb_xy, VP56_FRAME_PREVIOUS);
+
+if (!mb-skip)
+decode_mb_coeffs(s, c, mb, s-top_nnz[mb_x], s-left_nnz);
+
+if (mb-mode = MODE_I4x4)
+intra_predict(s, dst, mb, mb_x, mb_y);
+else
+inter_predict(s, dst, mb, mb_x, mb_y);
+
+prefetch_motion(s, mb, mb_x, mb_y, mb_xy, VP56_FRAME_GOLDEN);
+
+if (!mb-skip) {
+idct_mb(s, dst, mb);
+} else {
+AV_ZERO64(s-left_nnz);
+AV_WN64(s-top_nnz[mb_x], 0);   // array of 9, so unaligned
+
+// Reset DC block predictors if they would exist if the mb had 
coefficients
+if (mb-mode != MODE_I4x4  mb-mode != VP8_MVMODE_SPLIT) {
+s-left_nnz[8]  = 0;
+s-top_nnz[mb_x][8] = 0;
+}
+}
+
+if (s-deblock_filter)
+filter_level_for_mb(s, mb, s-filter_strength[mb_x]);
+
+prefetch_motion(s, mb, mb_x, mb_y, mb_xy, VP56_FRAME_GOLDEN2);
+
+dst[0] += 16;
+dst[1] += 8;
+dst[2] += 8;
+s-mv_min.x -= 64;
+s-mv_max.x -= 64;
+}
+if (s-deblock_filter) {
+if (s-filter.simple)
+filter_mb_row_simple(s, curframe, mb_y);
+else
+filter_mb_row(s, curframe, mb_y);
+}
+s-mv_min.y -= 64;
+s-mv_max.y -= 64;
+}
+
 static int vp8_decode_frame(AVCodecContext *avctx, void *data, int *data_size,
 AVPacket *avpkt)
 {
 VP8Context *s = avctx-priv_data;
-int ret, mb_x, mb_y, i, y, referenced;
+int ret, mb_y, i, referenced;
 enum AVDiscard skip_thresh;
 AVFrame *av_uninit(curframe), *prev_frame;
 
@@ -1686,90 +1770,14 @@ static int vp8_decode_frame(AVCodecContext *avctx, void 
*data, int *data_size,
 if (s-keyframe)
 memset(s-intra4x4_pred_mode_top, DC_PRED, s-mb_width*4);
 
-#define MARGIN (16 << 2)
 s-mv_min.y = -MARGIN;
 s-mv_max.y = ((s-mb_height - 1)  6) + MARGIN;
 
 for (mb_y = 0; mb_y  s-mb_height; mb_y++) {
-VP56RangeCoder *c = s-coeff_partition[mb_y  
(s-num_coeff_partitions-1)];
-VP8Macroblock *mb = s-macroblocks + (s-mb_height - mb_y - 1)*2;
-int mb_xy = mb_y*s-mb_width;
-uint8_t *dst[3] = {
-curframe-data[0] + 16*mb_y*s-linesize,
-curframe-data[1] +  8*mb_y*s-uvlinesize,
-curframe-data[2] +  8*mb_y*s-uvlinesize
-};
-
-memset(mb - 1, 0, sizeof(*mb));   // zero left macroblock
-memset(s-left_nnz, 0, sizeof(s-left_nnz));
-AV_WN32A(s-intra4x4_pred_mode_left, DC_PRED*0x01010101);
-
-// left edge of 129 for intra prediction
-if (!(avctx-flags  CODEC_FLAG_EMU_EDGE)) {
-for (i = 0; i  3; i++)
-for (y = 0; y  16!!i; y++)
-dst[i][y*curframe-linesize[i]-1] = 129;
-if (mb_y 
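
For readers skimming the archive: the net effect of the hunk above is that all per-row work
moves behind vp8_decode_mb_row(), so the caller's loop collapses to a few lines. A minimal
sketch of that caller, using the names from the diff (the literal patch text is truncated here,
so this is an illustration rather than the patch itself):

static void decode_all_rows(AVCodecContext *avctx, AVFrame *curframe,
                            AVFrame *prev_frame)
{
    VP8Context *s = avctx->priv_data;
    int mb_y;

    /* per-frame motion-vector clamping window, set up once as before */
    s->mv_min.y = -MARGIN;
    s->mv_max.y = ((s->mb_height - 1) << 6) + MARGIN;

    /* each iteration is now self-contained, which is what makes handing
       rows to worker threads possible later in this series */
    for (mb_y = 0; mb_y < s->mb_height; mb_y++)
        vp8_decode_mb_row(avctx, curframe, prev_frame, mb_y);
}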

[libav-devel] [PATCH 2/3] VP8: Move data from VP8Context->VP8Macroblock

2012-07-11 Thread Daniel Kang
In preparation for sliced threading.
---
 libavcodec/vp8.c |   25 +
 libavcodec/vp8.h |7 ---
 2 files changed, 17 insertions(+), 15 deletions(-)

diff --git a/libavcodec/vp8.c b/libavcodec/vp8.c
index 8ebc445..2181976 100644
--- a/libavcodec/vp8.c
+++ b/libavcodec/vp8.c
@@ -622,10 +622,11 @@ void decode_mvs(VP8Context *s, VP8Macroblock *mb, int mb_x, int mb_y)
 }
 
 static av_always_inline
-void decode_intra4x4_modes(VP8Context *s, VP56RangeCoder *c,
+void decode_intra4x4_modes(VP8Context *s, VP56RangeCoder *c, VP8Macroblock *mb,
                            int mb_x, int keyframe)
 {
-    uint8_t *intra4x4 = s->intra4x4_pred_mode_mb;
+    uint8_t *intra4x4 = mb->intra4x4_pred_mode_mb;
+
     if (keyframe) {
         int x, y;
         uint8_t* const top = s->intra4x4_pred_mode_top + 4 * mb_x;
@@ -655,7 +656,7 @@ void decode_mb_mode(VP8Context *s, VP8Macroblock *mb, int mb_x, int mb_y, uint8_
         *segment = vp8_rac_get_tree(c, vp8_segmentid_tree, s->prob->segmentid);
     else if (s->segmentation.enabled)
         *segment = ref ? *ref : *segment;
-    s->segment = *segment;
+    mb->segment = *segment;
 
     mb->skip = s->mbskip_enabled ? vp56_rac_get_prob(c, s->prob->mbskip) : 0;
 
@@ -663,14 +664,14 @@ void decode_mb_mode(VP8Context *s, VP8Macroblock *mb, int mb_x, int mb_y, uint8_
         mb->mode = vp8_rac_get_tree(c, vp8_pred16x16_tree_intra, vp8_pred16x16_prob_intra);
 
         if (mb->mode == MODE_I4x4) {
-            decode_intra4x4_modes(s, c, mb_x, 1);
+            decode_intra4x4_modes(s, c, mb, mb_x, 1);
         } else {
             const uint32_t modes = vp8_pred4x4_mode[mb->mode] * 0x01010101u;
             AV_WN32A(s->intra4x4_pred_mode_top + 4 * mb_x, modes);
             AV_WN32A(s->intra4x4_pred_mode_left, modes);
         }
 
-        s->chroma_pred_mode = vp8_rac_get_tree(c, vp8_pred8x8c_tree, vp8_pred8x8c_prob_intra);
+        mb->chroma_pred_mode = vp8_rac_get_tree(c, vp8_pred8x8c_tree, vp8_pred8x8c_prob_intra);
         mb->ref_frame = VP56_FRAME_CURRENT;
     } else if (vp56_rac_get_prob_branchy(c, s->prob->intra)) {
         // inter MB, 16.2
@@ -688,9 +689,9 @@ void decode_mb_mode(VP8Context *s, VP8Macroblock *mb, int mb_x, int mb_y, uint8_
         mb->mode = vp8_rac_get_tree(c, vp8_pred16x16_tree_inter, s->prob->pred16x16);
 
         if (mb->mode == MODE_I4x4)
-            decode_intra4x4_modes(s, c, mb_x, 0);
+            decode_intra4x4_modes(s, c, mb, mb_x, 0);
 
-        s->chroma_pred_mode = vp8_rac_get_tree(c, vp8_pred8x8c_tree, s->prob->pred8x8c);
+        mb->chroma_pred_mode = vp8_rac_get_tree(c, vp8_pred8x8c_tree, s->prob->pred8x8c);
         mb->ref_frame = VP56_FRAME_CURRENT;
         mb->partitioning = VP8_SPLITMVMODE_NONE;
         AV_ZERO32(&mb->bmv[0]);
@@ -791,7 +792,7 @@ void decode_mb_coeffs(VP8Context *s, VP56RangeCoder *c, VP8Macroblock *mb,
 {
     int i, x, y, luma_start = 0, luma_ctx = 3;
     int nnz_pred, nnz, nnz_total = 0;
-    int segment = s->segment;
+    int segment = mb->segment;
     int block_dc = 0;
 
     if (mb->mode != MODE_I4x4 && mb->mode != VP8_MVMODE_SPLIT) {
@@ -1002,7 +1003,7 @@ void intra_predict(VP8Context *s, uint8_t *dst[3], VP8Macroblock *mb,
         s->hpc.pred16x16[mode](dst[0], s->linesize);
     } else {
         uint8_t *ptr = dst[0];
-        uint8_t *intra4x4 = s->intra4x4_pred_mode_mb;
+        uint8_t *intra4x4 = mb->intra4x4_pred_mode_mb;
         uint8_t tr_top[4] = { 127, 127, 127, 127 };
 
         // all blocks on the right edge of the macroblock use bottom edge
@@ -1087,9 +1088,9 @@ void intra_predict(VP8Context *s, uint8_t *dst[3], VP8Macroblock *mb,
     }
 
     if (avctx->flags & CODEC_FLAG_EMU_EDGE) {
-        mode = check_intra_pred8x8_mode_emuedge(s->chroma_pred_mode, mb_x, mb_y);
+        mode = check_intra_pred8x8_mode_emuedge(mb->chroma_pred_mode, mb_x, mb_y);
    } else {
-        mode = check_intra_pred8x8_mode(s->chroma_pred_mode, mb_x, mb_y);
+        mode = check_intra_pred8x8_mode(mb->chroma_pred_mode, mb_x, mb_y);
     }
     s->hpc.pred8x8[mode](dst[1], s->uvlinesize);
     s->hpc.pred8x8[mode](dst[2], s->uvlinesize);
@@ -1408,7 +1409,7 @@ static av_always_inline void filter_level_for_mb(VP8Context *s, VP8Macroblock *mb,
     int interior_limit, filter_level;
 
     if (s->segmentation.enabled) {
-        filter_level = s->segmentation.filter_level[s->segment];
+        filter_level = s->segmentation.filter_level[mb->segment];
         if (!s->segmentation.absolute_vals)
             filter_level += s->filter.level;
     } else
diff --git a/libavcodec/vp8.h b/libavcodec/vp8.h
index a738cb7..2f2cb80 100644
--- a/libavcodec/vp8.h
+++ b/libavcodec/vp8.h
@@ -79,6 +79,10 @@ typedef struct {
 uint8_t mode;
 uint8_t ref_frame;
 uint8_t partitioning;
+uint8_t chroma_pred_mode;
+uint8_t segment;
+uint8_t intra4x4_pred_mode_mb[16];
+uint8_t intra4x4_pred_mode_top[4];
 VP56mv mv;
 VP56mv bmv[16];
 } VP8Macroblock;
@@ -97,8 +101,6 @@ typedef struct {
 uint8_t keyframe;
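
As a side note on the patch above: once segment and chroma_pred_mode live on the macroblock,
helpers can be handed just the VP8Macroblock they operate on, which is what removes shared
VP8Context state from the per-row path. A reduced illustration, using the field names from the
diff and deliberately omitting the absolute_vals adjustment (illustration only, not patch code):

static int mb_base_filter_level(const VP8Context *s, const VP8Macroblock *mb)
{
    if (s->segmentation.enabled)
        return s->segmentation.filter_level[mb->segment]; /* was s->segment */
    return s->filter.level;
}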

[libav-devel] [PATCH 3/3] VP8: Implement sliced threading.

2012-07-11 Thread Daniel Kang
Testing gives 25-30% gain on HD clips with two threads and up to 50% gain with 
eight threads.

Sliced threading uses more memory than single or frame threading.
---
 libavcodec/vp8.c |  521 ++
 libavcodec/vp8.h |   63 ---
 2 files changed, 407 insertions(+), 177 deletions(-)

diff --git a/libavcodec/vp8.c b/libavcodec/vp8.c
index 2181976..756714e 100644
--- a/libavcodec/vp8.c
+++ b/libavcodec/vp8.c
@@ -4,6 +4,7 @@
  * Copyright (C) 2010 David Conrad
  * Copyright (C) 2010 Ronald S. Bultje
  * Copyright (C) 2010 Jason Garrett-Glaser
+ * Copyright (C) 2012 Daniel Kang
  *
  * This file is part of Libav.
  *
@@ -22,6 +23,8 @@
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
+#include <pthread.h>
+
 #include "libavutil/imgutils.h"
 #include "avcodec.h"
 #include "internal.h"
@@ -36,11 +39,16 @@
 
 static void free_buffers(VP8Context *s)
 {
+    int i;
+    if (s->thread_data)
+        for (i = 0; i < MAX_THREADS; i++) {
+            av_freep(&s->thread_data[i].filter_strength);
+            av_freep(&s->thread_data[i].edge_emu_buffer);
+        }
+    av_freep(&s->thread_data);
     av_freep(&s->macroblocks_base);
-    av_freep(&s->filter_strength);
     av_freep(&s->intra4x4_pred_mode_top);
     av_freep(&s->top_nnz);
-    av_freep(&s->edge_emu_buffer);
     av_freep(&s->top_border);
 
     s->macroblocks = NULL;
@@ -108,6 +116,9 @@ static void vp8_decode_flush(AVCodecContext *avctx)
 
 static int update_dimensions(VP8Context *s, int width, int height)
 {
+    AVCodecContext *avctx = s->avctx;
+    int i;
+
     if (width  != s->avctx->width ||
         height != s->avctx->height) {
         if (av_image_check_size(width, height, 0, s->avctx))
@@ -121,14 +132,25 @@ static int update_dimensions(VP8Context *s, int width, int height)
     s->mb_width  = (s->avctx->coded_width +15) / 16;
     s->mb_height = (s->avctx->coded_height+15) / 16;
 
-    s->macroblocks_base        = av_mallocz((s->mb_width+s->mb_height*2+1)*sizeof(*s->macroblocks));
-    s->filter_strength         = av_mallocz(s->mb_width*sizeof(*s->filter_strength));
-    s->intra4x4_pred_mode_top  = av_mallocz(s->mb_width*4);
-    s->top_nnz                 = av_mallocz(s->mb_width*sizeof(*s->top_nnz));
-    s->top_border              = av_mallocz((s->mb_width+1)*sizeof(*s->top_border));
+    s->mlayout = (avctx->active_thread_type == FF_THREAD_SLICE) && (FFMIN(s->num_coeff_partitions, avctx->thread_count) > 1);
+    if (s->mlayout == 0) { // Frame threading and one thread
+        s->macroblocks_base       = av_mallocz((s->mb_width+s->mb_height*2+1)*sizeof(*s->macroblocks));
+        s->intra4x4_pred_mode_top = av_mallocz(s->mb_width*4);
+    }
+    else // Sliced threading
+        s->macroblocks_base       = av_mallocz((s->mb_width+2)*(s->mb_height+2)*sizeof(*s->macroblocks));
+    s->top_nnz     = av_mallocz(s->mb_width*sizeof(*s->top_nnz));
+    s->top_border  = av_mallocz((s->mb_width+1)*sizeof(*s->top_border));
+    s->thread_data = av_mallocz(MAX_THREADS*sizeof(VP8ThreadData));
 
-    if (!s->macroblocks_base || !s->filter_strength || !s->intra4x4_pred_mode_top ||
-        !s->top_nnz || !s->top_border)
+    for (i = 0; i < MAX_THREADS; i++) {
+        s->thread_data[i].filter_strength = av_mallocz(s->mb_width*sizeof(*s->thread_data[0].filter_strength));
+        //pthread_mutex_init(s->thread_data[i]->lock, NULL);
+        //pthread_cond_init(s->thread_data[i]->cond, NULL);
+    }
+
+    if (!s->macroblocks_base || !s->top_nnz || !s->top_border ||
+        (!s->intra4x4_pred_mode_top && s->mlayout == 0))
         return AVERROR(ENOMEM);
 
     s->macroblocks        = s->macroblocks_base + 1;
@@ -332,12 +354,6 @@ static int decode_frame_header(VP8Context *s, const 
uint8_t *buf, int buf_size)
 memset(s-segmentation, 0, sizeof(s-segmentation));
 }
 
-    if (!s->macroblocks_base || /* first frame */
-        width != s->avctx->width || height != s->avctx->height) {
-        if ((ret = update_dimensions(s, width, height)) < 0)
-            return ret;
-    }
-
     ff_vp56_init_range_decoder(c, buf, header_size);
     buf      += header_size;
     buf_size -= header_size;
@@ -366,6 +382,12 @@ static int decode_frame_header(VP8Context *s, const uint8_t *buf, int buf_size)
             return AVERROR_INVALIDDATA;
     }
 
+    if (!s->macroblocks_base || /* first frame */
+        width != s->avctx->width || height != s->avctx->height) {
+        if ((ret = update_dimensions(s, width, height)) < 0)
+            return ret;
+    }
+
     get_quants(s);
 
     if (!s->keyframe) {
@@ -468,19 +490,26 @@ const uint8_t *get_submv_prob(uint32_t left, uint32_t top)
  * @returns the number of motion vectors parsed (2, 4 or 16)
  */
 static av_always_inline
-int decode_splitmvs(VP8Context *s, VP56RangeCoder *c, VP8Macroblock *mb)
+int decode_splitmvs(VP8Context *s, VP56RangeCoder *c, VP8Macroblock *mb, int layout)
 {
     int part_idx;
     int n, num;
-    VP8Macroblock *top_mb  = &mb[2];
+VP8Macroblock
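
The diff is cut off above, but the claim in the commit message (25-50% speedup at the cost of
extra memory) rests on letting several workers decode interleaved macroblock rows while each
waits for the row above to stay a couple of macroblocks ahead. The standalone sketch below
shows only that synchronisation pattern; the names, the constants and the sched_yield()
back-off are illustrative and are not the patch's actual VP8ThreadData code.

#include <pthread.h>
#include <sched.h>
#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

#define MB_HEIGHT   16
#define MB_WIDTH    20
#define NUM_THREADS 4

/* progress[y] = number of macroblocks of row y decoded so far */
static _Atomic int progress[MB_HEIGHT];

static void decode_one_mb(int mb_y, int mb_x)
{
    (void)mb_y; (void)mb_x;   /* stand-in for the real per-macroblock work */
}

static void *worker(void *arg)
{
    int thread_nr = (int)(intptr_t)arg;
    for (int mb_y = thread_nr; mb_y < MB_HEIGHT; mb_y += NUM_THREADS) {
        for (int mb_x = 0; mb_x < MB_WIDTH; mb_x++) {
            /* prediction/filtering reads into the row above, so wait until
               it is at least two macroblocks ahead (or already finished) */
            if (mb_y > 0)
                while (progress[mb_y - 1] < mb_x + 2 &&
                       progress[mb_y - 1] < MB_WIDTH)
                    sched_yield();
            decode_one_mb(mb_y, mb_x);
            progress[mb_y] = mb_x + 1;
        }
    }
    return NULL;
}

int main(void)
{
    pthread_t th[NUM_THREADS];
    for (int i = 0; i < NUM_THREADS; i++)
        pthread_create(&th[i], NULL, worker, (void *)(intptr_t)i);
    for (int i = 0; i < NUM_THREADS; i++)
        pthread_join(th[i], NULL);
    printf("decoded %d rows of %d macroblocks\n", MB_HEIGHT, MB_WIDTH);
    return 0;
}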

[libav-devel] [PATCH] VP8: Implement sliced threading.

2012-07-11 Thread Daniel Kang
Testing gives 25-30% gain on HD clips with two threads and up to 50% gain with 
eight threads.

Sliced threading uses more memory than single or frame threading.
---
 libavcodec/vp8.c |  526 ++
 libavcodec/vp8.h |   63 ---
 2 files changed, 412 insertions(+), 177 deletions(-)

diff --git a/libavcodec/vp8.c b/libavcodec/vp8.c
index 2181976..a0040d0 100644
--- a/libavcodec/vp8.c
+++ b/libavcodec/vp8.c
@@ -4,6 +4,7 @@
  * Copyright (C) 2010 David Conrad
  * Copyright (C) 2010 Ronald S. Bultje
  * Copyright (C) 2010 Jason Garrett-Glaser
+ * Copyright (C) 2012 Daniel Kang
  *
  * This file is part of Libav.
  *
@@ -22,6 +23,7 @@
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
+
 #include "libavutil/imgutils.h"
 #include "avcodec.h"
 #include "internal.h"
@@ -30,17 +32,28 @@
 #include "rectangle.h"
 #include "thread.h"
 
+#if HAVE_PTHREADS
+#include <pthread.h>
+#elif HAVE_W32THREADS
+#include "w32pthreads.h"
+#endif
+
 #if ARCH_ARM
 #   include "arm/vp8.h"
 #endif
 
 static void free_buffers(VP8Context *s)
 {
+int i;
+if (s-thread_data)
+for (i = 0; i  MAX_THREADS; i++) {
+av_freep(s-thread_data[i].filter_strength);
+av_freep(s-thread_data[i].edge_emu_buffer);
+}
+av_freep(s-thread_data);
 av_freep(s-macroblocks_base);
-av_freep(s-filter_strength);
 av_freep(s-intra4x4_pred_mode_top);
 av_freep(s-top_nnz);
-av_freep(s-edge_emu_buffer);
 av_freep(s-top_border);
 
 s-macroblocks = NULL;
@@ -108,6 +121,9 @@ static void vp8_decode_flush(AVCodecContext *avctx)
 
 static int update_dimensions(VP8Context *s, int width, int height)
 {
+AVCodecContext *avctx = s-avctx;
+int i;
+
 if (width  != s-avctx-width ||
 height != s-avctx-height) {
 if (av_image_check_size(width, height, 0, s-avctx))
@@ -121,14 +137,25 @@ static int update_dimensions(VP8Context *s, int width, 
int height)
 s-mb_width  = (s-avctx-coded_width +15) / 16;
 s-mb_height = (s-avctx-coded_height+15) / 16;
 
-s-macroblocks_base= 
av_mallocz((s-mb_width+s-mb_height*2+1)*sizeof(*s-macroblocks));
-s-filter_strength = 
av_mallocz(s-mb_width*sizeof(*s-filter_strength));
-s-intra4x4_pred_mode_top  = av_mallocz(s-mb_width*4);
-s-top_nnz = av_mallocz(s-mb_width*sizeof(*s-top_nnz));
-s-top_border  = 
av_mallocz((s-mb_width+1)*sizeof(*s-top_border));
+s-mlayout = (avctx-active_thread_type == FF_THREAD_SLICE)  
(FFMIN(s-num_coeff_partitions, avctx-thread_count)  1);
+if (s-mlayout == 0) { // Frame threading and one thread
+s-macroblocks_base   = 
av_mallocz((s-mb_width+s-mb_height*2+1)*sizeof(*s-macroblocks));
+s-intra4x4_pred_mode_top = av_mallocz(s-mb_width*4);
+}
+else // Sliced threading
+s-macroblocks_base   = 
av_mallocz((s-mb_width+2)*(s-mb_height+2)*sizeof(*s-macroblocks));
+s-top_nnz= 
av_mallocz(s-mb_width*sizeof(*s-top_nnz));
+s-top_border = 
av_mallocz((s-mb_width+1)*sizeof(*s-top_border));
+s-thread_data= 
av_mallocz(MAX_THREADS*sizeof(VP8ThreadData));
+
+for (i = 0; i  MAX_THREADS; i++) {
+s-thread_data[i].filter_strength = 
av_mallocz(s-mb_width*sizeof(*s-thread_data[0].filter_strength));
+//pthread_mutex_init(s-thread_data[i]-lock, NULL);
+//pthread_cond_init(s-thread_data[i]-cond, NULL);
+}
 
-if (!s-macroblocks_base || !s-filter_strength || 
!s-intra4x4_pred_mode_top ||
-!s-top_nnz || !s-top_border)
+if (!s-macroblocks_base || !s-top_nnz || !s-top_border ||
+(!s-intra4x4_pred_mode_top  s-mlayout == 0))
 return AVERROR(ENOMEM);
 
 s-macroblocks= s-macroblocks_base + 1;
@@ -332,12 +359,6 @@ static int decode_frame_header(VP8Context *s, const 
uint8_t *buf, int buf_size)
 memset(s-segmentation, 0, sizeof(s-segmentation));
 }
 
-if (!s-macroblocks_base || /* first frame */
-width != s-avctx-width || height != s-avctx-height) {
-if ((ret = update_dimensions(s, width, height))  0)
-return ret;
-}
-
 ff_vp56_init_range_decoder(c, buf, header_size);
 buf  += header_size;
 buf_size -= header_size;
@@ -366,6 +387,12 @@ static int decode_frame_header(VP8Context *s, const 
uint8_t *buf, int buf_size)
 return AVERROR_INVALIDDATA;
 }
 
+if (!s-macroblocks_base || /* first frame */
+width != s-avctx-width || height != s-avctx-height) {
+if ((ret = update_dimensions(s, width, height))  0)
+return ret;
+}
+
 get_quants(s);
 
 if (!s-keyframe) {
@@ -468,19 +495,26 @@ const uint8_t *get_submv_prob(uint32_t left, uint32_t top)
  * @returns the number of motion vectors parsed (2, 4 or 16)
  */
 static av_always_inline
-int decode_splitmvs(VP8Context *s, VP56RangeCoder *c, VP8Macroblock *mb)
+int decode_splitmvs

Re: [libav-devel] [PATCH 3/3] VP8: Implement sliced threading.

2012-07-11 Thread Daniel Kang
On Wed, Jul 11, 2012 at 5:22 PM, Luca Barbato <lu_z...@gentoo.org> wrote:

> On 07/11/2012 08:34 PM, Daniel Kang wrote:
>
>> +#include <pthread.h>
>> +
>
> Check pthread.c, we apparently have non-pthread users, for the rest I'm
> not sure which comments you addressed.


This addresses Mans' objection to sched_yield() (I no longer use it).
Refactoring thread.h code no longer applies, so I have no outstanding
comments to address (I think).
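
For context on the non-pthread concern above: the series replaces sched_yield() with a
pause_hint() macro plus a portable ff_thread_sleep(). libavcodec/x86/thread.h itself is not
quoted in these mails, so the definition below is only a plausible guess at what such a pause
hint looks like, and the back-off loop is a made-up usage example rather than code from the
patch.

#include <sched.h>

/* Assumed definition -- the real libavcodec/x86/thread.h is not shown here.
 * "rep; nop" encodes the x86 PAUSE instruction, which tells the CPU that the
 * thread is in a spin-wait loop. */
static inline void x86_pause_hint(void)
{
    __asm__ volatile("rep; nop");
}

/* Invented example of how a row-sync wait might combine the pause hint with
 * an occasional yield so a waiting slice thread does not starve the others. */
static inline void wait_for_progress(const volatile int *progress, int needed)
{
    int spins = 0;
    while (*progress < needed) {
        x86_pause_hint();
        if (++spins > 10000) {
            sched_yield();
            spins = 0;
        }
    }
}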


[libav-devel] [PATCH 2/3] VP8: Move data from VP8Context->VP8Macroblock

2012-06-28 Thread Daniel Kang
In preparation for sliced threading.
---
 libavcodec/vp8.c |   25 +
 libavcodec/vp8.h |7 ---
 2 files changed, 17 insertions(+), 15 deletions(-)

diff --git a/libavcodec/vp8.c b/libavcodec/vp8.c
index 8ebc445..2181976 100644
--- a/libavcodec/vp8.c
+++ b/libavcodec/vp8.c
@@ -622,10 +622,11 @@ void decode_mvs(VP8Context *s, VP8Macroblock *mb, int 
mb_x, int mb_y)
 }
 
 static av_always_inline
-void decode_intra4x4_modes(VP8Context *s, VP56RangeCoder *c,
+void decode_intra4x4_modes(VP8Context *s, VP56RangeCoder *c, VP8Macroblock *mb,
int mb_x, int keyframe)
 {
-uint8_t *intra4x4 = s-intra4x4_pred_mode_mb;
+uint8_t *intra4x4 = mb-intra4x4_pred_mode_mb;
+
 if (keyframe) {
 int x, y;
 uint8_t* const top = s-intra4x4_pred_mode_top + 4 * mb_x;
@@ -655,7 +656,7 @@ void decode_mb_mode(VP8Context *s, VP8Macroblock *mb, int 
mb_x, int mb_y, uint8_
 *segment = vp8_rac_get_tree(c, vp8_segmentid_tree, s-prob-segmentid);
 else if (s-segmentation.enabled)
 *segment = ref ? *ref : *segment;
-s-segment = *segment;
+mb-segment = *segment;
 
 mb-skip = s-mbskip_enabled ? vp56_rac_get_prob(c, s-prob-mbskip) : 0;
 
@@ -663,14 +664,14 @@ void decode_mb_mode(VP8Context *s, VP8Macroblock *mb, int 
mb_x, int mb_y, uint8_
 mb-mode = vp8_rac_get_tree(c, vp8_pred16x16_tree_intra, 
vp8_pred16x16_prob_intra);
 
 if (mb-mode == MODE_I4x4) {
-decode_intra4x4_modes(s, c, mb_x, 1);
+decode_intra4x4_modes(s, c, mb, mb_x, 1);
 } else {
 const uint32_t modes = vp8_pred4x4_mode[mb-mode] * 0x01010101u;
 AV_WN32A(s-intra4x4_pred_mode_top + 4 * mb_x, modes);
 AV_WN32A(s-intra4x4_pred_mode_left, modes);
 }
 
-s-chroma_pred_mode = vp8_rac_get_tree(c, vp8_pred8x8c_tree, 
vp8_pred8x8c_prob_intra);
+mb-chroma_pred_mode = vp8_rac_get_tree(c, vp8_pred8x8c_tree, 
vp8_pred8x8c_prob_intra);
 mb-ref_frame = VP56_FRAME_CURRENT;
 } else if (vp56_rac_get_prob_branchy(c, s-prob-intra)) {
 // inter MB, 16.2
@@ -688,9 +689,9 @@ void decode_mb_mode(VP8Context *s, VP8Macroblock *mb, int 
mb_x, int mb_y, uint8_
 mb-mode = vp8_rac_get_tree(c, vp8_pred16x16_tree_inter, 
s-prob-pred16x16);
 
 if (mb-mode == MODE_I4x4)
-decode_intra4x4_modes(s, c, mb_x, 0);
+decode_intra4x4_modes(s, c, mb, mb_x, 0);
 
-s-chroma_pred_mode = vp8_rac_get_tree(c, vp8_pred8x8c_tree, 
s-prob-pred8x8c);
+mb-chroma_pred_mode = vp8_rac_get_tree(c, vp8_pred8x8c_tree, 
s-prob-pred8x8c);
 mb-ref_frame = VP56_FRAME_CURRENT;
 mb-partitioning = VP8_SPLITMVMODE_NONE;
 AV_ZERO32(mb-bmv[0]);
@@ -791,7 +792,7 @@ void decode_mb_coeffs(VP8Context *s, VP56RangeCoder *c, 
VP8Macroblock *mb,
 {
 int i, x, y, luma_start = 0, luma_ctx = 3;
 int nnz_pred, nnz, nnz_total = 0;
-int segment = s-segment;
+int segment = mb-segment;
 int block_dc = 0;
 
 if (mb-mode != MODE_I4x4  mb-mode != VP8_MVMODE_SPLIT) {
@@ -1002,7 +1003,7 @@ void intra_predict(VP8Context *s, uint8_t *dst[3], 
VP8Macroblock *mb,
 s-hpc.pred16x16[mode](dst[0], s-linesize);
 } else {
 uint8_t *ptr = dst[0];
-uint8_t *intra4x4 = s-intra4x4_pred_mode_mb;
+uint8_t *intra4x4 = mb-intra4x4_pred_mode_mb;
 uint8_t tr_top[4] = { 127, 127, 127, 127 };
 
 // all blocks on the right edge of the macroblock use bottom edge
@@ -1087,9 +1088,9 @@ void intra_predict(VP8Context *s, uint8_t *dst[3], 
VP8Macroblock *mb,
 }
 
 if (avctx-flags  CODEC_FLAG_EMU_EDGE) {
-mode = check_intra_pred8x8_mode_emuedge(s-chroma_pred_mode, mb_x, 
mb_y);
+mode = check_intra_pred8x8_mode_emuedge(mb-chroma_pred_mode, mb_x, 
mb_y);
 } else {
-mode = check_intra_pred8x8_mode(s-chroma_pred_mode, mb_x, mb_y);
+mode = check_intra_pred8x8_mode(mb-chroma_pred_mode, mb_x, mb_y);
 }
 s-hpc.pred8x8[mode](dst[1], s-uvlinesize);
 s-hpc.pred8x8[mode](dst[2], s-uvlinesize);
@@ -1408,7 +1409,7 @@ static av_always_inline void 
filter_level_for_mb(VP8Context *s, VP8Macroblock *m
 int interior_limit, filter_level;
 
 if (s-segmentation.enabled) {
-filter_level = s-segmentation.filter_level[s-segment];
+filter_level = s-segmentation.filter_level[mb-segment];
 if (!s-segmentation.absolute_vals)
 filter_level += s-filter.level;
 } else
diff --git a/libavcodec/vp8.h b/libavcodec/vp8.h
index a738cb7..2f2cb80 100644
--- a/libavcodec/vp8.h
+++ b/libavcodec/vp8.h
@@ -79,6 +79,10 @@ typedef struct {
 uint8_t mode;
 uint8_t ref_frame;
 uint8_t partitioning;
+uint8_t chroma_pred_mode;
+uint8_t segment;
+uint8_t intra4x4_pred_mode_mb[16];
+uint8_t intra4x4_pred_mode_top[4];
 VP56mv mv;
 VP56mv bmv[16];
 } VP8Macroblock;
@@ -97,8 +101,6 @@ typedef struct {
 uint8_t keyframe;

[libav-devel] [PATCH 1/3] VP8: Refactor decoding a single mb_row.

2012-06-28 Thread Daniel Kang
This is in preparation for sliced threading.
---
Splitting long lines to please Diego.
---
 libavcodec/vp8.c |  164 --
 1 files changed, 86 insertions(+), 78 deletions(-)

diff --git a/libavcodec/vp8.c b/libavcodec/vp8.c
index 94200f6..8ebc445 100644
--- a/libavcodec/vp8.c
+++ b/libavcodec/vp8.c
@@ -1574,11 +1574,95 @@ static void release_queued_segmaps(VP8Context *s, int is_close)
     s->maps_are_invalid = 0;
 }
 
+#define MARGIN (16 << 2)
+static void vp8_decode_mb_row(AVCodecContext *avctx, AVFrame *curframe,
+                              AVFrame *prev_frame, int mb_y)
+{
+    VP8Context *s = avctx->priv_data;
+    VP56RangeCoder *c = &s->coeff_partition[mb_y & (s->num_coeff_partitions-1)];
+    VP8Macroblock *mb = s->macroblocks + (s->mb_height - mb_y - 1)*2;
+    int i, y, mb_x, mb_xy = mb_y*s->mb_width;
+    uint8_t *dst[3] = {
+        curframe->data[0] + 16*mb_y*s->linesize,
+        curframe->data[1] +  8*mb_y*s->uvlinesize,
+        curframe->data[2] +  8*mb_y*s->uvlinesize
+    };
+
+    memset(mb - 1, 0, sizeof(*mb));   // zero left macroblock
+    memset(s->left_nnz, 0, sizeof(s->left_nnz));
+    AV_WN32A(s->intra4x4_pred_mode_left, DC_PRED*0x01010101);
+
+    // left edge of 129 for intra prediction
+    if (!(avctx->flags & CODEC_FLAG_EMU_EDGE)) {
+        for (i = 0; i < 3; i++)
+            for (y = 0; y < 16>>!!i; y++)
+                dst[i][y*curframe->linesize[i]-1] = 129;
+        if (mb_y == 1) // top left edge is also 129
+            s->top_border[0][15] = s->top_border[0][23] = s->top_border[0][31] = 129;
+    }
+
+    s->mv_min.x = -MARGIN;
+    s->mv_max.x = ((s->mb_width  - 1) << 6) + MARGIN;
+
+    for (mb_x = 0; mb_x < s->mb_width; mb_x++, mb_xy++, mb++) {
+        /* Prefetch the current frame, 4 MBs ahead */
+        s->dsp.prefetch(dst[0] + (mb_x&3)*4*s->linesize + 64, s->linesize, 4);
+        s->dsp.prefetch(dst[1] + (mb_x&7)*s->uvlinesize + 64, dst[2] - dst[1], 2);
+
+        decode_mb_mode(s, mb, mb_x, mb_y, curframe->ref_index[0] + mb_xy,
+                       prev_frame && prev_frame->ref_index[0] ? prev_frame->ref_index[0] + mb_xy : NULL);
+
+        prefetch_motion(s, mb, mb_x, mb_y, mb_xy, VP56_FRAME_PREVIOUS);
+
+        if (!mb->skip)
+            decode_mb_coeffs(s, c, mb, s->top_nnz[mb_x], s->left_nnz);
+
+        if (mb->mode <= MODE_I4x4)
+            intra_predict(s, dst, mb, mb_x, mb_y);
+        else
+            inter_predict(s, dst, mb, mb_x, mb_y);
+
+        prefetch_motion(s, mb, mb_x, mb_y, mb_xy, VP56_FRAME_GOLDEN);
+
+        if (!mb->skip) {
+            idct_mb(s, dst, mb);
+        } else {
+            AV_ZERO64(s->left_nnz);
+            AV_WN64(s->top_nnz[mb_x], 0);   // array of 9, so unaligned
+
+            // Reset DC block predictors if they would exist if the mb had coefficients
+            if (mb->mode != MODE_I4x4 && mb->mode != VP8_MVMODE_SPLIT) {
+                s->left_nnz[8]      = 0;
+                s->top_nnz[mb_x][8] = 0;
+            }
+        }
+
+        if (s->deblock_filter)
+            filter_level_for_mb(s, mb, &s->filter_strength[mb_x]);
+
+        prefetch_motion(s, mb, mb_x, mb_y, mb_xy, VP56_FRAME_GOLDEN2);
+
+        dst[0] += 16;
+        dst[1] += 8;
+        dst[2] += 8;
+        s->mv_min.x -= 64;
+        s->mv_max.x -= 64;
+    }
+    if (s->deblock_filter) {
+        if (s->filter.simple)
+            filter_mb_row_simple(s, curframe, mb_y);
+        else
+            filter_mb_row(s, curframe, mb_y);
+    }
+    s->mv_min.y -= 64;
+    s->mv_max.y -= 64;
+}
+
 static int vp8_decode_frame(AVCodecContext *avctx, void *data, int *data_size,
 AVPacket *avpkt)
 {
     VP8Context *s = avctx->priv_data;
-    int ret, mb_x, mb_y, i, y, referenced;
+    int ret, mb_y, i, referenced;
     enum AVDiscard skip_thresh;
     AVFrame *av_uninit(curframe), *prev_frame;
 
@@ -1686,90 +1770,14 @@ static int vp8_decode_frame(AVCodecContext *avctx, void *data, int *data_size,
     if (s->keyframe)
         memset(s->intra4x4_pred_mode_top, DC_PRED, s->mb_width*4);
 
-#define MARGIN (16 << 2)
     s->mv_min.y = -MARGIN;
     s->mv_max.y = ((s->mb_height - 1) << 6) + MARGIN;
 
     for (mb_y = 0; mb_y < s->mb_height; mb_y++) {
-        VP56RangeCoder *c = &s->coeff_partition[mb_y & (s->num_coeff_partitions-1)];
-        VP8Macroblock *mb = s->macroblocks + (s->mb_height - mb_y - 1)*2;
-        int mb_xy = mb_y*s->mb_width;
-        uint8_t *dst[3] = {
-            curframe->data[0] + 16*mb_y*s->linesize,
-            curframe->data[1] +  8*mb_y*s->uvlinesize,
-            curframe->data[2] +  8*mb_y*s->uvlinesize
-        };
-
-        memset(mb - 1, 0, sizeof(*mb));   // zero left macroblock
-        memset(s->left_nnz, 0, sizeof(s->left_nnz));
-        AV_WN32A(s->intra4x4_pred_mode_left, DC_PRED*0x01010101);
-
-        // left edge of 129 for intra prediction
-        if (!(avctx->flags & CODEC_FLAG_EMU_EDGE)) {
-            for (i = 0; i < 3; i++)
-                for (y = 0; y < 16>>!!i; y++)
-

[libav-devel] [PATCH 3/3] VP8: Implement sliced threading.

2012-06-28 Thread Daniel Kang
Testing gives 25-30% gain on HD clips with two threads and up to 50% gain with 
eight threads.

Sliced threading uses more memory than single or frame threading.
---
Some cosmetic changes to please Diego.
---
 libavcodec/pthread.c|   11 +
 libavcodec/thread.h |   11 +
 libavcodec/vp8.c|  522 +--
 libavcodec/vp8.h|   61 ---
 libavcodec/x86/thread.h |   24 +++
 5 files changed, 452 insertions(+), 177 deletions(-)
 create mode 100644 libavcodec/x86/thread.h

diff --git a/libavcodec/pthread.c b/libavcodec/pthread.c
index c7edb9e..a7aff31 100644
--- a/libavcodec/pthread.c
+++ b/libavcodec/pthread.c
@@ -1053,3 +1053,14 @@ void ff_thread_free(AVCodecContext *avctx)
 else
 thread_free(avctx);
 }
+
+void ff_thread_sleep(int nms)
+{
+#if defined(_WIN32)
+Sleep(nms);
+#elif defined(__OS2__)
+DosSleep(nms);
+#else // If it's not Windows, give up and say it's pthreads.
+sched_yield();
+#endif
+}
diff --git a/libavcodec/thread.h b/libavcodec/thread.h
index 7f018fc..d037ea3 100644
--- a/libavcodec/thread.h
+++ b/libavcodec/thread.h
@@ -29,6 +29,15 @@
 
 #include "config.h"
 #include "avcodec.h"
+#if ARCH_X86
+#include "libavcodec/x86/thread.h"
+#endif
+
+#if ARCH_X86
+#define pause_hint() x86_pause_hint()
+#else
+#define pause_hint()
+#endif
 
 /**
  * Wait for decoding threads to finish and reset internal state.
@@ -113,4 +122,6 @@ void ff_thread_release_buffer(AVCodecContext *avctx, 
AVFrame *f);
 int ff_thread_init(AVCodecContext *s);
 void ff_thread_free(AVCodecContext *s);
 
+void ff_thread_sleep(int nms);
+
 #endif /* AVCODEC_THREAD_H */
diff --git a/libavcodec/vp8.c b/libavcodec/vp8.c
index 2181976..4e2a9e8 100644
--- a/libavcodec/vp8.c
+++ b/libavcodec/vp8.c
@@ -4,6 +4,7 @@
  * Copyright (C) 2010 David Conrad
  * Copyright (C) 2010 Ronald S. Bultje
  * Copyright (C) 2010 Jason Garrett-Glaser
+ * Copyright (C) 2012 Daniel Kang
  *
  * This file is part of Libav.
  *
@@ -36,11 +37,16 @@
 
 static void free_buffers(VP8Context *s)
 {
+int i;
+if (s-thread_data)
+for (i = 0; i  MAX_THREADS; i++) {
+av_freep(s-thread_data[i].filter_strength);
+av_freep(s-thread_data[i].edge_emu_buffer);
+}
+av_freep(s-thread_data);
 av_freep(s-macroblocks_base);
-av_freep(s-filter_strength);
 av_freep(s-intra4x4_pred_mode_top);
 av_freep(s-top_nnz);
-av_freep(s-edge_emu_buffer);
 av_freep(s-top_border);
 
 s-macroblocks = NULL;
@@ -108,6 +114,9 @@ static void vp8_decode_flush(AVCodecContext *avctx)
 
 static int update_dimensions(VP8Context *s, int width, int height)
 {
+AVCodecContext *avctx = s-avctx;
+int i;
+
 if (width  != s-avctx-width ||
 height != s-avctx-height) {
 if (av_image_check_size(width, height, 0, s-avctx))
@@ -121,14 +130,25 @@ static int update_dimensions(VP8Context *s, int width, 
int height)
 s-mb_width  = (s-avctx-coded_width +15) / 16;
 s-mb_height = (s-avctx-coded_height+15) / 16;
 
-s-macroblocks_base= 
av_mallocz((s-mb_width+s-mb_height*2+1)*sizeof(*s-macroblocks));
-s-filter_strength = 
av_mallocz(s-mb_width*sizeof(*s-filter_strength));
-s-intra4x4_pred_mode_top  = av_mallocz(s-mb_width*4);
-s-top_nnz = av_mallocz(s-mb_width*sizeof(*s-top_nnz));
-s-top_border  = 
av_mallocz((s-mb_width+1)*sizeof(*s-top_border));
+s-mlayout = (avctx-active_thread_type == FF_THREAD_SLICE)  
(FFMIN(s-num_coeff_partitions, avctx-thread_count)  1);
+if (s-mlayout == 0) { // Frame threading and one thread
+s-macroblocks_base   = 
av_mallocz((s-mb_width+s-mb_height*2+1)*sizeof(*s-macroblocks));
+s-intra4x4_pred_mode_top = av_mallocz(s-mb_width*4);
+}
+else // Sliced threading
+s-macroblocks_base   = 
av_mallocz((s-mb_width+2)*(s-mb_height+2)*sizeof(*s-macroblocks));
+s-top_nnz= 
av_mallocz(s-mb_width*sizeof(*s-top_nnz));
+s-top_border = 
av_mallocz((s-mb_width+1)*sizeof(*s-top_border));
+s-thread_data= 
av_mallocz(MAX_THREADS*sizeof(VP8ThreadData));
+
+for (i = 0; i  MAX_THREADS; i++) {
+s-thread_data[i].filter_strength = 
av_mallocz(s-mb_width*sizeof(*s-thread_data[0].filter_strength));
+//pthread_mutex_init(s-thread_data[i]-lock, NULL);
+//pthread_cond_init(s-thread_data[i]-cond, NULL);
+}
 
-if (!s-macroblocks_base || !s-filter_strength || 
!s-intra4x4_pred_mode_top ||
-!s-top_nnz || !s-top_border)
+if (!s-macroblocks_base || !s-top_nnz || !s-top_border ||
+(!s-intra4x4_pred_mode_top  s-mlayout == 0))
 return AVERROR(ENOMEM);
 
 s-macroblocks= s-macroblocks_base + 1;
@@ -332,12 +352,6 @@ static int decode_frame_header(VP8Context *s, const 
uint8_t *buf, int buf_size)
 memset(s-segmentation, 0, sizeof(s-segmentation));
 }
 
-if (!s-macroblocks_base || /* first

[libav-devel] [PATCH 3/5] VP8: Move data from VP8Context->VP8Macroblock

2012-06-27 Thread Daniel Kang
In preparation for sliced threading.
---
 libavcodec/vp8.c |   46 +-
 libavcodec/vp8.h |8 
 2 files changed, 29 insertions(+), 25 deletions(-)

diff --git a/libavcodec/vp8.c b/libavcodec/vp8.c
index 6ab4b26..9d10827 100644
--- a/libavcodec/vp8.c
+++ b/libavcodec/vp8.c
@@ -38,7 +38,6 @@ static void free_buffers(VP8Context *s)
 {
 av_freep(s-macroblocks_base);
 av_freep(s-filter_strength);
-av_freep(s-intra4x4_pred_mode_top);
 av_freep(s-top_nnz);
 av_freep(s-edge_emu_buffer);
 av_freep(s-top_border);
@@ -123,15 +122,14 @@ static int update_dimensions(VP8Context *s, int width, 
int height)
 
 s-macroblocks_base= 
av_mallocz((s-mb_width+2)*(s-mb_height+2)*sizeof(*s-macroblocks));
 s-filter_strength = 
av_mallocz(s-mb_width*sizeof(*s-filter_strength));
-s-intra4x4_pred_mode_top  = av_mallocz(s-mb_width*4);
 s-top_nnz = av_mallocz(s-mb_width*sizeof(*s-top_nnz));
 s-top_border  = 
av_mallocz((s-mb_width+1)*sizeof(*s-top_border));
 
-if (!s-macroblocks_base || !s-filter_strength || 
!s-intra4x4_pred_mode_top ||
+if (!s-macroblocks_base || !s-filter_strength ||
 !s-top_nnz || !s-top_border)
 return AVERROR(ENOMEM);
 
-s-macroblocks= s-macroblocks_base + s-mb_width + 2;
+s-macroblocks= s-macroblocks_base + s-mb_width + 1;
 
 return 0;
 }
@@ -622,14 +620,17 @@ void decode_mvs(VP8Context *s, VP8Macroblock *mb, int mb_x, int mb_y)
 }
 
 static av_always_inline
-void decode_intra4x4_modes(VP8Context *s, VP56RangeCoder *c,
+void decode_intra4x4_modes(VP8Context *s, VP56RangeCoder *c, VP8Macroblock *mb,
                            int mb_x, int keyframe)
 {
-    uint8_t *intra4x4 = s->intra4x4_pred_mode_mb;
+    VP8Macroblock *mb_top  = mb - s->mb_width - 1;
+    uint8_t *intra4x4 = mb->intra4x4_pred_mode_mb;
+
+    memcpy(mb->intra4x4_pred_mode_top,  mb_top->intra4x4_pred_mode_top,   4);
     if (keyframe) {
         int x, y;
-        uint8_t* const top = s->intra4x4_pred_mode_top + 4 * mb_x;
         uint8_t* const left = s->intra4x4_pred_mode_left;
+        uint8_t* const top = mb->intra4x4_pred_mode_top;
         for (y = 0; y < 4; y++) {
             for (x = 0; x < 4; x++) {
                 const uint8_t *ctx;
@@ -655,7 +656,7 @@ void decode_mb_mode(VP8Context *s, VP8Macroblock *mb, int 
mb_x, int mb_y, uint8_
 *segment = vp8_rac_get_tree(c, vp8_segmentid_tree, s-prob-segmentid);
 else if (s-segmentation.enabled)
 *segment = ref ? *ref : *segment;
-s-segment = *segment;
+mb-segment = *segment;
 
 mb-skip = s-mbskip_enabled ? vp56_rac_get_prob(c, s-prob-mbskip) : 0;
 
@@ -663,14 +664,14 @@ void decode_mb_mode(VP8Context *s, VP8Macroblock *mb, int 
mb_x, int mb_y, uint8_
 mb-mode = vp8_rac_get_tree(c, vp8_pred16x16_tree_intra, 
vp8_pred16x16_prob_intra);
 
 if (mb-mode == MODE_I4x4) {
-decode_intra4x4_modes(s, c, mb_x, 1);
+decode_intra4x4_modes(s, c, mb, mb_x, 1);
 } else {
 const uint32_t modes = vp8_pred4x4_mode[mb-mode] * 0x01010101u;
-AV_WN32A(s-intra4x4_pred_mode_top + 4 * mb_x, modes);
 AV_WN32A(s-intra4x4_pred_mode_left, modes);
+AV_WN32A(mb-intra4x4_pred_mode_top,  modes);
 }
 
-s-chroma_pred_mode = vp8_rac_get_tree(c, vp8_pred8x8c_tree, 
vp8_pred8x8c_prob_intra);
+mb-chroma_pred_mode = vp8_rac_get_tree(c, vp8_pred8x8c_tree, 
vp8_pred8x8c_prob_intra);
 mb-ref_frame = VP56_FRAME_CURRENT;
 } else if (vp56_rac_get_prob_branchy(c, s-prob-intra)) {
 // inter MB, 16.2
@@ -688,9 +689,9 @@ void decode_mb_mode(VP8Context *s, VP8Macroblock *mb, int 
mb_x, int mb_y, uint8_
 mb-mode = vp8_rac_get_tree(c, vp8_pred16x16_tree_inter, 
s-prob-pred16x16);
 
 if (mb-mode == MODE_I4x4)
-decode_intra4x4_modes(s, c, mb_x, 0);
+decode_intra4x4_modes(s, c, mb, mb_x, 0);
 
-s-chroma_pred_mode = vp8_rac_get_tree(c, vp8_pred8x8c_tree, 
s-prob-pred8x8c);
+mb-chroma_pred_mode = vp8_rac_get_tree(c, vp8_pred8x8c_tree, 
s-prob-pred8x8c);
 mb-ref_frame = VP56_FRAME_CURRENT;
 mb-partitioning = VP8_SPLITMVMODE_NONE;
 AV_ZERO32(mb-bmv[0]);
@@ -791,7 +792,7 @@ void decode_mb_coeffs(VP8Context *s, VP56RangeCoder *c, 
VP8Macroblock *mb,
 {
 int i, x, y, luma_start = 0, luma_ctx = 3;
 int nnz_pred, nnz, nnz_total = 0;
-int segment = s-segment;
+int segment = mb-segment;
 int block_dc = 0;
 
 if (mb-mode != MODE_I4x4  mb-mode != VP8_MVMODE_SPLIT) {
@@ -1002,7 +1003,7 @@ void intra_predict(VP8Context *s, uint8_t *dst[3], 
VP8Macroblock *mb,
 s-hpc.pred16x16[mode](dst[0], s-linesize);
 } else {
 uint8_t *ptr = dst[0];
-uint8_t *intra4x4 = s-intra4x4_pred_mode_mb;
+uint8_t *intra4x4 = mb-intra4x4_pred_mode_mb;
 uint8_t tr_top[4] = { 127, 127, 127, 127 

[libav-devel] [PATCH 5/5] VP8: Implement sliced threading.

2012-06-27 Thread Daniel Kang
Testing gives 25-30% gain on HD clips with two threads and up to 50% gain with 
eight threads.

Also allow frame/single-thread to use less memory than sliced threading.
---
 libavcodec/pthread.c|   11 +
 libavcodec/thread.h |   11 +
 libavcodec/vp8.c|  498 ---
 libavcodec/vp8.h|   62 ---
 libavcodec/x86/thread.h |   24 +++
 5 files changed, 422 insertions(+), 184 deletions(-)
 create mode 100644 libavcodec/x86/thread.h

diff --git a/libavcodec/pthread.c b/libavcodec/pthread.c
index c7edb9e..a7aff31 100644
--- a/libavcodec/pthread.c
+++ b/libavcodec/pthread.c
@@ -1053,3 +1053,14 @@ void ff_thread_free(AVCodecContext *avctx)
 else
 thread_free(avctx);
 }
+
+void ff_thread_sleep(int nms)
+{
+#if defined(_WIN32)
+Sleep(nms);
+#elif defined(__OS2__)
+DosSleep(nms);
+#else // If it's not Windows, give up and say it's pthreads.
+sched_yield();
+#endif
+}
diff --git a/libavcodec/thread.h b/libavcodec/thread.h
index 7f018fc..d037ea3 100644
--- a/libavcodec/thread.h
+++ b/libavcodec/thread.h
@@ -29,6 +29,15 @@
 
 #include config.h
 #include avcodec.h
+#if ARCH_X86
+#include libavcodec/x86/thread.h
+#endif
+
+#if ARCH_X86
+#define pause_hint() x86_pause_hint()
+#else
+#define pause_hint()
+#endif
 
 /**
  * Wait for decoding threads to finish and reset internal state.
@@ -113,4 +122,6 @@ void ff_thread_release_buffer(AVCodecContext *avctx, 
AVFrame *f);
 int ff_thread_init(AVCodecContext *s);
 void ff_thread_free(AVCodecContext *s);
 
+void ff_thread_sleep(int nms);
+
 #endif /* AVCODEC_THREAD_H */
diff --git a/libavcodec/vp8.c b/libavcodec/vp8.c
index 0d845d0..5696898 100644
--- a/libavcodec/vp8.c
+++ b/libavcodec/vp8.c
@@ -4,6 +4,7 @@
  * Copyright (C) 2010 David Conrad
  * Copyright (C) 2010 Ronald S. Bultje
  * Copyright (C) 2010 Jason Garrett-Glaser
+ * Copyright (C) 2012 Daniel Kang
  *
  * This file is part of Libav.
  *
@@ -36,10 +37,16 @@
 
 static void free_buffers(VP8Context *s)
 {
+int i;
+if (s-thread_data)
+for (i = 0; i  MAX_THREADS; i++) {
+av_freep(s-thread_data[i].filter_strength);
+av_freep(s-thread_data[i].edge_emu_buffer);
+}
+av_freep(s-thread_data);
 av_freep(s-macroblocks_base);
-av_freep(s-filter_strength);
+av_freep(s-intra4x4_pred_mode_top);
 av_freep(s-top_nnz);
-av_freep(s-edge_emu_buffer);
 av_freep(s-top_border);
 
 s-macroblocks = NULL;
@@ -107,6 +114,9 @@ static void vp8_decode_flush(AVCodecContext *avctx)
 
 static int update_dimensions(VP8Context *s, int width, int height)
 {
+AVCodecContext *avctx = s-avctx;
+int i;
+
 if (width  != s-avctx-width ||
 height != s-avctx-height) {
 if (av_image_check_size(width, height, 0, s-avctx))
@@ -120,16 +130,28 @@ static int update_dimensions(VP8Context *s, int width, 
int height)
 s-mb_width  = (s-avctx-coded_width +15) / 16;
 s-mb_height = (s-avctx-coded_height+15) / 16;
 
-s-macroblocks_base= 
av_mallocz((s-mb_width+2)*(s-mb_height+2)*sizeof(*s-macroblocks));
-s-filter_strength = 
av_mallocz(s-mb_width*sizeof(*s-filter_strength));
-s-top_nnz = av_mallocz(s-mb_width*sizeof(*s-top_nnz));
-s-top_border  = 
av_mallocz((s-mb_width+1)*sizeof(*s-top_border));
+s-mlayout = (avctx-active_thread_type == FF_THREAD_SLICE)  
(FFMIN(s-num_coeff_partitions, avctx-thread_count)  1);
+if (s-mlayout == 0) { // Frame threading and one thread
+s-macroblocks_base   = 
av_mallocz((s-mb_width+s-mb_height*2+1)*sizeof(*s-macroblocks));
+s-intra4x4_pred_mode_top = av_mallocz(s-mb_width*4);
+}
+else // Sliced threading
+s-macroblocks_base   = 
av_mallocz((s-mb_width+2)*(s-mb_height+2)*sizeof(*s-macroblocks));
+s-top_nnz= 
av_mallocz(s-mb_width*sizeof(*s-top_nnz));
+s-top_border = 
av_mallocz((s-mb_width+1)*sizeof(*s-top_border));
+s-thread_data= 
av_mallocz(MAX_THREADS*sizeof(VP8ThreadData));
+
+for (i = 0; i  MAX_THREADS; i++) {
+s-thread_data[i].filter_strength = 
av_mallocz(s-mb_width*sizeof(*s-thread_data[0].filter_strength));
+//pthread_mutex_init(s-thread_data[i]-lock, NULL);
+//pthread_cond_init(s-thread_data[i]-cond, NULL);
+}
 
-if (!s-macroblocks_base || !s-filter_strength ||
-!s-top_nnz || !s-top_border)
+if (!s-macroblocks_base || !s-top_nnz || !s-top_border ||
+(!s-intra4x4_pred_mode_top  s-mlayout == 0))
 return AVERROR(ENOMEM);
 
-s-macroblocks= s-macroblocks_base + s-mb_width + 1;
+s-macroblocks= s-macroblocks_base + 1;
 
 return 0;
 }
@@ -330,12 +352,6 @@ static int decode_frame_header(VP8Context *s, const 
uint8_t *buf, int buf_size)
 memset(s-segmentation, 0, sizeof(s-segmentation));
 }
 
-if (!s-macroblocks_base || /* first frame */
-width != s-avctx-width

[libav-devel] [PATCH 1/5] VP8: Refactor decoding a single mb_row.

2012-06-27 Thread Daniel Kang
This is in preparation for sliced threading.
---
 libavcodec/vp8.c |  162 --
 1 files changed, 84 insertions(+), 78 deletions(-)

diff --git a/libavcodec/vp8.c b/libavcodec/vp8.c
index 94200f6..7a8a0c6 100644
--- a/libavcodec/vp8.c
+++ b/libavcodec/vp8.c
@@ -1574,11 +1574,93 @@ static void release_queued_segmaps(VP8Context *s, int 
is_close)
 s-maps_are_invalid = 0;
 }
 
+#define MARGIN (16  2)
+static void vp8_decode_mb_row(AVCodecContext *avctx, AVFrame *curframe, 
AVFrame *prev_frame, int mb_y) {
+VP8Context *s = avctx-priv_data;
+VP56RangeCoder *c = s-coeff_partition[mb_y  
(s-num_coeff_partitions-1)];
+VP8Macroblock *mb = s-macroblocks + (s-mb_height - mb_y - 1)*2;
+int i, y, mb_x, mb_xy = mb_y*s-mb_width;
+uint8_t *dst[3] = {
+curframe-data[0] + 16*mb_y*s-linesize,
+curframe-data[1] +  8*mb_y*s-uvlinesize,
+curframe-data[2] +  8*mb_y*s-uvlinesize
+};
+
+memset(mb - 1, 0, sizeof(*mb));   // zero left macroblock
+memset(s-left_nnz, 0, sizeof(s-left_nnz));
+AV_WN32A(s-intra4x4_pred_mode_left, DC_PRED*0x01010101);
+
+// left edge of 129 for intra prediction
+if (!(avctx-flags  CODEC_FLAG_EMU_EDGE)) {
+for (i = 0; i  3; i++)
+for (y = 0; y  16!!i; y++)
+dst[i][y*curframe-linesize[i]-1] = 129;
+if (mb_y == 1) // top left edge is also 129
+s-top_border[0][15] = s-top_border[0][23] = s-top_border[0][31] 
= 129;
+}
+
+s-mv_min.x = -MARGIN;
+s-mv_max.x = ((s-mb_width  - 1)  6) + MARGIN;
+
+for (mb_x = 0; mb_x  s-mb_width; mb_x++, mb_xy++, mb++) {
+/* Prefetch the current frame, 4 MBs ahead */
+s-dsp.prefetch(dst[0] + (mb_x3)*4*s-linesize + 64, s-linesize, 4);
+s-dsp.prefetch(dst[1] + (mb_x7)*s-uvlinesize + 64, dst[2] - dst[1], 
2);
+
+decode_mb_mode(s, mb, mb_x, mb_y, curframe-ref_index[0] + mb_xy,
+   prev_frame  prev_frame-ref_index[0] ? 
prev_frame-ref_index[0] + mb_xy : NULL);
+
+prefetch_motion(s, mb, mb_x, mb_y, mb_xy, VP56_FRAME_PREVIOUS);
+
+if (!mb-skip)
+decode_mb_coeffs(s, c, mb, s-top_nnz[mb_x], s-left_nnz);
+
+if (mb-mode = MODE_I4x4)
+intra_predict(s, dst, mb, mb_x, mb_y);
+else
+inter_predict(s, dst, mb, mb_x, mb_y);
+
+prefetch_motion(s, mb, mb_x, mb_y, mb_xy, VP56_FRAME_GOLDEN);
+
+if (!mb-skip) {
+idct_mb(s, dst, mb);
+} else {
+AV_ZERO64(s-left_nnz);
+AV_WN64(s-top_nnz[mb_x], 0);   // array of 9, so unaligned
+
+// Reset DC block predictors if they would exist if the mb had 
coefficients
+if (mb-mode != MODE_I4x4  mb-mode != VP8_MVMODE_SPLIT) {
+s-left_nnz[8]  = 0;
+s-top_nnz[mb_x][8] = 0;
+}
+}
+
+if (s-deblock_filter)
+filter_level_for_mb(s, mb, s-filter_strength[mb_x]);
+
+prefetch_motion(s, mb, mb_x, mb_y, mb_xy, VP56_FRAME_GOLDEN2);
+
+dst[0] += 16;
+dst[1] += 8;
+dst[2] += 8;
+s-mv_min.x -= 64;
+s-mv_max.x -= 64;
+}
+if (s-deblock_filter) {
+if (s-filter.simple)
+filter_mb_row_simple(s, curframe, mb_y);
+else
+filter_mb_row(s, curframe, mb_y);
+}
+s-mv_min.y -= 64;
+s-mv_max.y -= 64;
+}
+
 static int vp8_decode_frame(AVCodecContext *avctx, void *data, int *data_size,
 AVPacket *avpkt)
 {
 VP8Context *s = avctx-priv_data;
-int ret, mb_x, mb_y, i, y, referenced;
+int ret, mb_y, i, referenced;
 enum AVDiscard skip_thresh;
 AVFrame *av_uninit(curframe), *prev_frame;
 
@@ -1686,90 +1768,14 @@ static int vp8_decode_frame(AVCodecContext *avctx, void 
*data, int *data_size,
 if (s-keyframe)
 memset(s-intra4x4_pred_mode_top, DC_PRED, s-mb_width*4);
 
-#define MARGIN (16  2)
 s-mv_min.y = -MARGIN;
 s-mv_max.y = ((s-mb_height - 1)  6) + MARGIN;
 
 for (mb_y = 0; mb_y  s-mb_height; mb_y++) {
-VP56RangeCoder *c = s-coeff_partition[mb_y  
(s-num_coeff_partitions-1)];
-VP8Macroblock *mb = s-macroblocks + (s-mb_height - mb_y - 1)*2;
-int mb_xy = mb_y*s-mb_width;
-uint8_t *dst[3] = {
-curframe-data[0] + 16*mb_y*s-linesize,
-curframe-data[1] +  8*mb_y*s-uvlinesize,
-curframe-data[2] +  8*mb_y*s-uvlinesize
-};
-
-memset(mb - 1, 0, sizeof(*mb));   // zero left macroblock
-memset(s-left_nnz, 0, sizeof(s-left_nnz));
-AV_WN32A(s-intra4x4_pred_mode_left, DC_PRED*0x01010101);
-
-// left edge of 129 for intra prediction
-if (!(avctx-flags  CODEC_FLAG_EMU_EDGE)) {
-for (i = 0; i  3; i++)
-for (y = 0; y  16!!i; y++)
-dst[i][y*curframe-linesize[i]-1] = 129;
-if (mb_y == 1) // top left edge is also 

[libav-devel] [PATCH 2/3] VP8: Move data from VP8Context->VP8Macroblock

2012-06-27 Thread Daniel Kang
In preparation for sliced threading.
---
 libavcodec/vp8.c |   25 +
 libavcodec/vp8.h |7 ---
 2 files changed, 17 insertions(+), 15 deletions(-)

diff --git a/libavcodec/vp8.c b/libavcodec/vp8.c
index 7a8a0c6..b70e87e 100644
--- a/libavcodec/vp8.c
+++ b/libavcodec/vp8.c
@@ -622,10 +622,11 @@ void decode_mvs(VP8Context *s, VP8Macroblock *mb, int 
mb_x, int mb_y)
 }
 
 static av_always_inline
-void decode_intra4x4_modes(VP8Context *s, VP56RangeCoder *c,
+void decode_intra4x4_modes(VP8Context *s, VP56RangeCoder *c, VP8Macroblock *mb,
int mb_x, int keyframe)
 {
-uint8_t *intra4x4 = s-intra4x4_pred_mode_mb;
+uint8_t *intra4x4 = mb-intra4x4_pred_mode_mb;
+
 if (keyframe) {
 int x, y;
 uint8_t* const top = s-intra4x4_pred_mode_top + 4 * mb_x;
@@ -655,7 +656,7 @@ void decode_mb_mode(VP8Context *s, VP8Macroblock *mb, int 
mb_x, int mb_y, uint8_
 *segment = vp8_rac_get_tree(c, vp8_segmentid_tree, s-prob-segmentid);
 else if (s-segmentation.enabled)
 *segment = ref ? *ref : *segment;
-s-segment = *segment;
+mb-segment = *segment;
 
 mb-skip = s-mbskip_enabled ? vp56_rac_get_prob(c, s-prob-mbskip) : 0;
 
@@ -663,14 +664,14 @@ void decode_mb_mode(VP8Context *s, VP8Macroblock *mb, int 
mb_x, int mb_y, uint8_
 mb-mode = vp8_rac_get_tree(c, vp8_pred16x16_tree_intra, 
vp8_pred16x16_prob_intra);
 
 if (mb-mode == MODE_I4x4) {
-decode_intra4x4_modes(s, c, mb_x, 1);
+decode_intra4x4_modes(s, c, mb, mb_x, 1);
 } else {
 const uint32_t modes = vp8_pred4x4_mode[mb-mode] * 0x01010101u;
 AV_WN32A(s-intra4x4_pred_mode_top + 4 * mb_x, modes);
 AV_WN32A(s-intra4x4_pred_mode_left, modes);
 }
 
-s-chroma_pred_mode = vp8_rac_get_tree(c, vp8_pred8x8c_tree, 
vp8_pred8x8c_prob_intra);
+mb-chroma_pred_mode = vp8_rac_get_tree(c, vp8_pred8x8c_tree, 
vp8_pred8x8c_prob_intra);
 mb-ref_frame = VP56_FRAME_CURRENT;
 } else if (vp56_rac_get_prob_branchy(c, s-prob-intra)) {
 // inter MB, 16.2
@@ -688,9 +689,9 @@ void decode_mb_mode(VP8Context *s, VP8Macroblock *mb, int 
mb_x, int mb_y, uint8_
 mb-mode = vp8_rac_get_tree(c, vp8_pred16x16_tree_inter, 
s-prob-pred16x16);
 
 if (mb-mode == MODE_I4x4)
-decode_intra4x4_modes(s, c, mb_x, 0);
+decode_intra4x4_modes(s, c, mb, mb_x, 0);
 
-s-chroma_pred_mode = vp8_rac_get_tree(c, vp8_pred8x8c_tree, 
s-prob-pred8x8c);
+mb-chroma_pred_mode = vp8_rac_get_tree(c, vp8_pred8x8c_tree, 
s-prob-pred8x8c);
 mb-ref_frame = VP56_FRAME_CURRENT;
 mb-partitioning = VP8_SPLITMVMODE_NONE;
 AV_ZERO32(mb-bmv[0]);
@@ -791,7 +792,7 @@ void decode_mb_coeffs(VP8Context *s, VP56RangeCoder *c, 
VP8Macroblock *mb,
 {
 int i, x, y, luma_start = 0, luma_ctx = 3;
 int nnz_pred, nnz, nnz_total = 0;
-int segment = s-segment;
+int segment = mb-segment;
 int block_dc = 0;
 
 if (mb-mode != MODE_I4x4  mb-mode != VP8_MVMODE_SPLIT) {
@@ -1002,7 +1003,7 @@ void intra_predict(VP8Context *s, uint8_t *dst[3], 
VP8Macroblock *mb,
 s-hpc.pred16x16[mode](dst[0], s-linesize);
 } else {
 uint8_t *ptr = dst[0];
-uint8_t *intra4x4 = s-intra4x4_pred_mode_mb;
+uint8_t *intra4x4 = mb-intra4x4_pred_mode_mb;
 uint8_t tr_top[4] = { 127, 127, 127, 127 };
 
 // all blocks on the right edge of the macroblock use bottom edge
@@ -1087,9 +1088,9 @@ void intra_predict(VP8Context *s, uint8_t *dst[3], 
VP8Macroblock *mb,
 }
 
 if (avctx-flags  CODEC_FLAG_EMU_EDGE) {
-mode = check_intra_pred8x8_mode_emuedge(s-chroma_pred_mode, mb_x, 
mb_y);
+mode = check_intra_pred8x8_mode_emuedge(mb-chroma_pred_mode, mb_x, 
mb_y);
 } else {
-mode = check_intra_pred8x8_mode(s-chroma_pred_mode, mb_x, mb_y);
+mode = check_intra_pred8x8_mode(mb-chroma_pred_mode, mb_x, mb_y);
 }
 s-hpc.pred8x8[mode](dst[1], s-uvlinesize);
 s-hpc.pred8x8[mode](dst[2], s-uvlinesize);
@@ -1408,7 +1409,7 @@ static av_always_inline void 
filter_level_for_mb(VP8Context *s, VP8Macroblock *m
 int interior_limit, filter_level;
 
 if (s-segmentation.enabled) {
-filter_level = s-segmentation.filter_level[s-segment];
+filter_level = s-segmentation.filter_level[mb-segment];
 if (!s-segmentation.absolute_vals)
 filter_level += s-filter.level;
 } else
diff --git a/libavcodec/vp8.h b/libavcodec/vp8.h
index a738cb7..2f2cb80 100644
--- a/libavcodec/vp8.h
+++ b/libavcodec/vp8.h
@@ -79,6 +79,10 @@ typedef struct {
 uint8_t mode;
 uint8_t ref_frame;
 uint8_t partitioning;
+uint8_t chroma_pred_mode;
+uint8_t segment;
+uint8_t intra4x4_pred_mode_mb[16];
+uint8_t intra4x4_pred_mode_top[4];
 VP56mv mv;
 VP56mv bmv[16];
 } VP8Macroblock;
@@ -97,8 +101,6 @@ typedef struct {
 uint8_t keyframe;

[libav-devel] [PATCH 3/3] VP8: Implement sliced threading.

2012-06-27 Thread Daniel Kang
Testing gives 25-30% gain on HD clips with two threads and up to 50% gain with 
eight threads.

Also allow frame/single-thread to use less memory than sliced threading.
---
 libavcodec/pthread.c|   11 +
 libavcodec/thread.h |   11 +
 libavcodec/vp8.c|  514 +--
 libavcodec/vp8.h|   61 ---
 libavcodec/x86/thread.h |   24 +++
 5 files changed, 445 insertions(+), 176 deletions(-)
 create mode 100644 libavcodec/x86/thread.h

diff --git a/libavcodec/pthread.c b/libavcodec/pthread.c
index c7edb9e..a7aff31 100644
--- a/libavcodec/pthread.c
+++ b/libavcodec/pthread.c
@@ -1053,3 +1053,14 @@ void ff_thread_free(AVCodecContext *avctx)
 else
 thread_free(avctx);
 }
+
+void ff_thread_sleep(int nms)
+{
+#if defined(_WIN32)
+Sleep(nms);
+#elif defined(__OS2__)
+DosSleep(nms);
+#else // If it's not Windows, give up and say it's pthreads.
+sched_yield();
+#endif
+}
diff --git a/libavcodec/thread.h b/libavcodec/thread.h
index 7f018fc..d037ea3 100644
--- a/libavcodec/thread.h
+++ b/libavcodec/thread.h
@@ -29,6 +29,15 @@
 
 #include config.h
 #include avcodec.h
+#if ARCH_X86
+#include libavcodec/x86/thread.h
+#endif
+
+#if ARCH_X86
+#define pause_hint() x86_pause_hint()
+#else
+#define pause_hint()
+#endif
 
 /**
  * Wait for decoding threads to finish and reset internal state.
@@ -113,4 +122,6 @@ void ff_thread_release_buffer(AVCodecContext *avctx, 
AVFrame *f);
 int ff_thread_init(AVCodecContext *s);
 void ff_thread_free(AVCodecContext *s);
 
+void ff_thread_sleep(int nms);
+
 #endif /* AVCODEC_THREAD_H */
diff --git a/libavcodec/vp8.c b/libavcodec/vp8.c
index b70e87e..4233eda 100644
--- a/libavcodec/vp8.c
+++ b/libavcodec/vp8.c
@@ -4,6 +4,7 @@
  * Copyright (C) 2010 David Conrad
  * Copyright (C) 2010 Ronald S. Bultje
  * Copyright (C) 2010 Jason Garrett-Glaser
+ * Copyright (C) 2012 Daniel Kang
  *
  * This file is part of Libav.
  *
@@ -36,11 +37,16 @@
 
 static void free_buffers(VP8Context *s)
 {
+int i;
+if (s-thread_data)
+for (i = 0; i  MAX_THREADS; i++) {
+av_freep(s-thread_data[i].filter_strength);
+av_freep(s-thread_data[i].edge_emu_buffer);
+}
+av_freep(s-thread_data);
 av_freep(s-macroblocks_base);
-av_freep(s-filter_strength);
 av_freep(s-intra4x4_pred_mode_top);
 av_freep(s-top_nnz);
-av_freep(s-edge_emu_buffer);
 av_freep(s-top_border);
 
 s-macroblocks = NULL;
@@ -108,6 +114,9 @@ static void vp8_decode_flush(AVCodecContext *avctx)
 
 static int update_dimensions(VP8Context *s, int width, int height)
 {
+AVCodecContext *avctx = s-avctx;
+int i;
+
 if (width  != s-avctx-width ||
 height != s-avctx-height) {
 if (av_image_check_size(width, height, 0, s-avctx))
@@ -121,14 +130,25 @@ static int update_dimensions(VP8Context *s, int width, 
int height)
 s-mb_width  = (s-avctx-coded_width +15) / 16;
 s-mb_height = (s-avctx-coded_height+15) / 16;
 
-s-macroblocks_base= 
av_mallocz((s-mb_width+s-mb_height*2+1)*sizeof(*s-macroblocks));
-s-filter_strength = 
av_mallocz(s-mb_width*sizeof(*s-filter_strength));
-s-intra4x4_pred_mode_top  = av_mallocz(s-mb_width*4);
-s-top_nnz = av_mallocz(s-mb_width*sizeof(*s-top_nnz));
-s-top_border  = 
av_mallocz((s-mb_width+1)*sizeof(*s-top_border));
+s-mlayout = (avctx-active_thread_type == FF_THREAD_SLICE)  
(FFMIN(s-num_coeff_partitions, avctx-thread_count)  1);
+if (s-mlayout == 0) { // Frame threading and one thread
+s-macroblocks_base   = 
av_mallocz((s-mb_width+s-mb_height*2+1)*sizeof(*s-macroblocks));
+s-intra4x4_pred_mode_top = av_mallocz(s-mb_width*4);
+}
+else // Sliced threading
+s-macroblocks_base   = 
av_mallocz((s-mb_width+2)*(s-mb_height+2)*sizeof(*s-macroblocks));
+s-top_nnz= 
av_mallocz(s-mb_width*sizeof(*s-top_nnz));
+s-top_border = 
av_mallocz((s-mb_width+1)*sizeof(*s-top_border));
+s-thread_data= 
av_mallocz(MAX_THREADS*sizeof(VP8ThreadData));
+
+for (i = 0; i  MAX_THREADS; i++) {
+s-thread_data[i].filter_strength = 
av_mallocz(s-mb_width*sizeof(*s-thread_data[0].filter_strength));
+//pthread_mutex_init(s-thread_data[i]-lock, NULL);
+//pthread_cond_init(s-thread_data[i]-cond, NULL);
+}
 
-if (!s-macroblocks_base || !s-filter_strength || 
!s-intra4x4_pred_mode_top ||
-!s-top_nnz || !s-top_border)
+if (!s-macroblocks_base || !s-top_nnz || !s-top_border ||
+(!s-intra4x4_pred_mode_top  s-mlayout == 0))
 return AVERROR(ENOMEM);
 
 s-macroblocks= s-macroblocks_base + 1;
@@ -332,12 +352,6 @@ static int decode_frame_header(VP8Context *s, const 
uint8_t *buf, int buf_size)
 memset(s-segmentation, 0, sizeof(s-segmentation));
 }
 
-if (!s-macroblocks_base || /* first frame */
-width != s-avctx

Re: [libav-devel] [PATCH 3/5] VP8: Move data from VP8Context->VP8Macroblock

2012-06-25 Thread Daniel Kang
On Fri, Jun 22, 2012 at 3:33 PM, Luca Barbato <lu_z...@gentoo.org> wrote:

>> I am testing Jason's idea, which only needs a little more memory. However,
>> from the timings and preliminary tests I have done, I don't think it scales
>> nearly as well.
>
> I see


I've decided to stick with this approach for now, since it scales better at
a lower number of threads.


>> A modification of Jason's idea might scale better, but it will require as
>> much memory.
>
> Could you try to swap layouts depending on which threading system is used?


Tried this, and the speed hit is now ~1.5% for frame- and single-threaded decoding.

I'll try profiling and see where the issues are.

Apparently I'm not allowed to attach large files, so here's my WIP patch
if anyone has suggestions, etc.: http://privatepaste.com/dbeafd7e68
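
To put a rough number on the layout swap discussed here (and on the "uses more memory" note in
the commit messages): with the allocation sizes visible in update_dimensions(), frame/single
threading keeps roughly a two-row strip of VP8Macroblock state while sliced threading keeps the
whole padded grid. A quick back-of-the-envelope, with the struct size picked only for
illustration:

#include <stdio.h>

int main(void)
{
    const int mb_width  = (1920 + 15) / 16;   /* 120 MBs across a 1080p frame */
    const int mb_height = (1080 + 15) / 16;   /*  68 MB rows */
    const size_t mb_size = 96;                /* assumed sizeof(VP8Macroblock) */

    /* frame/single threading: (mb_width + mb_height*2 + 1) macroblocks */
    size_t strip = (size_t)(mb_width + mb_height * 2 + 1) * mb_size;
    /* sliced threading: (mb_width + 2) * (mb_height + 2) macroblocks */
    size_t grid  = (size_t)(mb_width + 2) * (mb_height + 2) * mb_size;

    printf("strip layout: %zu bytes, full grid: %zu bytes\n", strip, grid);
    return 0;
}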


Re: [libav-devel] [PATCH 3/5] VP8: Move data from VP8Context->VP8Macroblock

2012-06-22 Thread Daniel Kang
On Fri, Jun 22, 2012 at 12:26 AM, Luca Barbato <lu_z...@gentoo.org> wrote:

 On 06/22/2012 04:19 AM, Daniel Kang wrote:
  In preparation for sliced threading.
  ---
   libavcodec/vp8.c |   54
 ++
   libavcodec/vp8.h |   11 +--
   2 files changed, 35 insertions(+), 30 deletions(-)
 
  diff --git a/libavcodec/vp8.c b/libavcodec/vp8.c
  index 6ab4b26..bc2476e 100644
  --- a/libavcodec/vp8.c
  +++ b/libavcodec/vp8.c
  @@ -38,7 +38,6 @@ static void free_buffers(VP8Context *s)
   {
   av_freep(s-macroblocks_base);
   av_freep(s-filter_strength);
  -av_freep(s-intra4x4_pred_mode_top);
   av_freep(s-top_nnz);
   av_freep(s-edge_emu_buffer);
   av_freep(s-top_border);
  @@ -123,15 +122,14 @@ static int update_dimensions(VP8Context *s, int
 width, int height)
 
   s-macroblocks_base=
 av_mallocz((s-mb_width+2)*(s-mb_height+2)*sizeof(*s-macroblocks));
   s-filter_strength =
 av_mallocz(s-mb_width*sizeof(*s-filter_strength));
  -s-intra4x4_pred_mode_top  = av_mallocz(s-mb_width*4);
   s-top_nnz =
 av_mallocz(s-mb_width*sizeof(*s-top_nnz));
   s-top_border  =
 av_mallocz((s-mb_width+1)*sizeof(*s-top_border));
 
  -if (!s-macroblocks_base || !s-filter_strength ||
 !s-intra4x4_pred_mode_top ||
  +if (!s-macroblocks_base || !s-filter_strength ||
   !s-top_nnz || !s-top_border)
   return AVERROR(ENOMEM);
 
  -s-macroblocks= s-macroblocks_base + s-mb_width + 2;
  +s-macroblocks= s-macroblocks_base + s-mb_width + 1;
 
   return 0;
   }
  @@ -622,14 +620,19 @@ void decode_mvs(VP8Context *s, VP8Macroblock *mb,
 int mb_x, int mb_y)
   }
 
   static av_always_inline
  -void decode_intra4x4_modes(VP8Context *s, VP56RangeCoder *c,
  +void decode_intra4x4_modes(VP8Context *s, VP56RangeCoder *c,
 VP8Macroblock *mb,
  int mb_x, int keyframe)
   {
  -uint8_t *intra4x4 = s-intra4x4_pred_mode_mb;
  +VP8Macroblock *mb_top  = mb - s-mb_width - 1;
  +VP8Macroblock *mb_left = mb - 1;
  +uint8_t *intra4x4 = mb-intra4x4_pred_mode_mb;
  +
  +memcpy(mb-intra4x4_pred_mode_left,
 mb_left-intra4x4_pred_mode_left, 4);
  +memcpy(mb-intra4x4_pred_mode_top,  mb_top-intra4x4_pred_mode_top,
   4);
   if (keyframe) {
   int x, y;
  -uint8_t* const top = s-intra4x4_pred_mode_top + 4 * mb_x;
  -uint8_t* const left = s-intra4x4_pred_mode_left;
  +uint8_t* const top = mb-intra4x4_pred_mode_top;
  +uint8_t* const left = mb-intra4x4_pred_mode_left;
   for (y = 0; y  4; y++) {
   for (x = 0; x  4; x++) {
   const uint8_t *ctx;
  @@ -655,7 +658,7 @@ void decode_mb_mode(VP8Context *s, VP8Macroblock
 *mb, int mb_x, int mb_y, uint8_
   *segment = vp8_rac_get_tree(c, vp8_segmentid_tree,
 s-prob-segmentid);
   else if (s-segmentation.enabled)
   *segment = ref ? *ref : *segment;
  -s-segment = *segment;
  +mb-segment = *segment;
 
   mb-skip = s-mbskip_enabled ? vp56_rac_get_prob(c,
 s-prob-mbskip) : 0;
 
  @@ -663,14 +666,14 @@ void decode_mb_mode(VP8Context *s, VP8Macroblock
 *mb, int mb_x, int mb_y, uint8_
   mb-mode = vp8_rac_get_tree(c, vp8_pred16x16_tree_intra,
 vp8_pred16x16_prob_intra);
 
   if (mb-mode == MODE_I4x4) {
  -decode_intra4x4_modes(s, c, mb_x, 1);
  +decode_intra4x4_modes(s, c, mb, mb_x, 1);
   } else {
   const uint32_t modes = vp8_pred4x4_mode[mb-mode] *
 0x01010101u;
  -AV_WN32A(s-intra4x4_pred_mode_top + 4 * mb_x, modes);
  -AV_WN32A(s-intra4x4_pred_mode_left, modes);
  +AV_WN32A(mb-intra4x4_pred_mode_top,  modes);
  +AV_WN32A(mb-intra4x4_pred_mode_left, modes);
   }
 
  -s-chroma_pred_mode = vp8_rac_get_tree(c, vp8_pred8x8c_tree,
 vp8_pred8x8c_prob_intra);
  +mb-chroma_pred_mode = vp8_rac_get_tree(c, vp8_pred8x8c_tree,
 vp8_pred8x8c_prob_intra);
   mb-ref_frame = VP56_FRAME_CURRENT;
   } else if (vp56_rac_get_prob_branchy(c, s-prob-intra)) {
   // inter MB, 16.2
  @@ -688,9 +691,9 @@ void decode_mb_mode(VP8Context *s, VP8Macroblock
 *mb, int mb_x, int mb_y, uint8_
   mb-mode = vp8_rac_get_tree(c, vp8_pred16x16_tree_inter,
 s-prob-pred16x16);
 
   if (mb-mode == MODE_I4x4)
  -decode_intra4x4_modes(s, c, mb_x, 0);
  +decode_intra4x4_modes(s, c, mb, mb_x, 0);
 
  -s-chroma_pred_mode = vp8_rac_get_tree(c, vp8_pred8x8c_tree,
 s-prob-pred8x8c);
  +mb-chroma_pred_mode = vp8_rac_get_tree(c, vp8_pred8x8c_tree,
 s-prob-pred8x8c);
   mb-ref_frame = VP56_FRAME_CURRENT;
   mb-partitioning = VP8_SPLITMVMODE_NONE;
   AV_ZERO32(mb-bmv[0]);
  @@ -791,7 +794,7 @@ void decode_mb_coeffs(VP8Context *s, VP56RangeCoder
 *c, VP8Macroblock *mb,
   {
   int i, x, y, luma_start = 0

[libav-devel] [PATCH 2/4] VP8: Move data from VP8Context->VP8Macroblock

2012-06-21 Thread Daniel Kang
In preparation for sliced threading.
---
 libavcodec/vp8.c |   54 ++
 libavcodec/vp8.h |   11 +--
 2 files changed, 35 insertions(+), 30 deletions(-)

diff --git a/libavcodec/vp8.c b/libavcodec/vp8.c
index 6ab4b26..bc2476e 100644
--- a/libavcodec/vp8.c
+++ b/libavcodec/vp8.c
@@ -38,7 +38,6 @@ static void free_buffers(VP8Context *s)
 {
 av_freep(s-macroblocks_base);
 av_freep(s-filter_strength);
-av_freep(s-intra4x4_pred_mode_top);
 av_freep(s-top_nnz);
 av_freep(s-edge_emu_buffer);
 av_freep(s-top_border);
@@ -123,15 +122,14 @@ static int update_dimensions(VP8Context *s, int width, 
int height)
 
 s-macroblocks_base= 
av_mallocz((s-mb_width+2)*(s-mb_height+2)*sizeof(*s-macroblocks));
 s-filter_strength = 
av_mallocz(s-mb_width*sizeof(*s-filter_strength));
-s-intra4x4_pred_mode_top  = av_mallocz(s-mb_width*4);
 s-top_nnz = av_mallocz(s-mb_width*sizeof(*s-top_nnz));
 s-top_border  = 
av_mallocz((s-mb_width+1)*sizeof(*s-top_border));
 
-if (!s-macroblocks_base || !s-filter_strength || 
!s-intra4x4_pred_mode_top ||
+if (!s-macroblocks_base || !s-filter_strength ||
 !s-top_nnz || !s-top_border)
 return AVERROR(ENOMEM);
 
-s-macroblocks= s-macroblocks_base + s-mb_width + 2;
+s-macroblocks= s-macroblocks_base + s-mb_width + 1;
 
 return 0;
 }
@@ -622,14 +620,19 @@ void decode_mvs(VP8Context *s, VP8Macroblock *mb, int 
mb_x, int mb_y)
 }
 
 static av_always_inline
-void decode_intra4x4_modes(VP8Context *s, VP56RangeCoder *c,
+void decode_intra4x4_modes(VP8Context *s, VP56RangeCoder *c, VP8Macroblock *mb,
int mb_x, int keyframe)
 {
-uint8_t *intra4x4 = s-intra4x4_pred_mode_mb;
+VP8Macroblock *mb_top  = mb - s-mb_width - 1;
+VP8Macroblock *mb_left = mb - 1;
+uint8_t *intra4x4 = mb-intra4x4_pred_mode_mb;
+
+memcpy(mb-intra4x4_pred_mode_left, mb_left-intra4x4_pred_mode_left, 4);
+memcpy(mb-intra4x4_pred_mode_top,  mb_top-intra4x4_pred_mode_top,   4);
 if (keyframe) {
 int x, y;
-uint8_t* const top = s-intra4x4_pred_mode_top + 4 * mb_x;
-uint8_t* const left = s-intra4x4_pred_mode_left;
+uint8_t* const top = mb-intra4x4_pred_mode_top;
+uint8_t* const left = mb-intra4x4_pred_mode_left;
 for (y = 0; y  4; y++) {
 for (x = 0; x  4; x++) {
 const uint8_t *ctx;
@@ -655,7 +658,7 @@ void decode_mb_mode(VP8Context *s, VP8Macroblock *mb, int 
mb_x, int mb_y, uint8_
 *segment = vp8_rac_get_tree(c, vp8_segmentid_tree, s-prob-segmentid);
 else if (s-segmentation.enabled)
 *segment = ref ? *ref : *segment;
-s-segment = *segment;
+mb-segment = *segment;
 
 mb-skip = s-mbskip_enabled ? vp56_rac_get_prob(c, s-prob-mbskip) : 0;
 
@@ -663,14 +666,14 @@ void decode_mb_mode(VP8Context *s, VP8Macroblock *mb, int 
mb_x, int mb_y, uint8_
 mb-mode = vp8_rac_get_tree(c, vp8_pred16x16_tree_intra, 
vp8_pred16x16_prob_intra);
 
 if (mb-mode == MODE_I4x4) {
-decode_intra4x4_modes(s, c, mb_x, 1);
+decode_intra4x4_modes(s, c, mb, mb_x, 1);
 } else {
 const uint32_t modes = vp8_pred4x4_mode[mb-mode] * 0x01010101u;
-AV_WN32A(s-intra4x4_pred_mode_top + 4 * mb_x, modes);
-AV_WN32A(s-intra4x4_pred_mode_left, modes);
+AV_WN32A(mb-intra4x4_pred_mode_top,  modes);
+AV_WN32A(mb-intra4x4_pred_mode_left, modes);
 }
 
-s-chroma_pred_mode = vp8_rac_get_tree(c, vp8_pred8x8c_tree, 
vp8_pred8x8c_prob_intra);
+mb-chroma_pred_mode = vp8_rac_get_tree(c, vp8_pred8x8c_tree, 
vp8_pred8x8c_prob_intra);
 mb-ref_frame = VP56_FRAME_CURRENT;
 } else if (vp56_rac_get_prob_branchy(c, s-prob-intra)) {
 // inter MB, 16.2
@@ -688,9 +691,9 @@ void decode_mb_mode(VP8Context *s, VP8Macroblock *mb, int 
mb_x, int mb_y, uint8_
 mb-mode = vp8_rac_get_tree(c, vp8_pred16x16_tree_inter, 
s-prob-pred16x16);
 
 if (mb-mode == MODE_I4x4)
-decode_intra4x4_modes(s, c, mb_x, 0);
+decode_intra4x4_modes(s, c, mb, mb_x, 0);
 
-s-chroma_pred_mode = vp8_rac_get_tree(c, vp8_pred8x8c_tree, 
s-prob-pred8x8c);
+mb-chroma_pred_mode = vp8_rac_get_tree(c, vp8_pred8x8c_tree, 
s-prob-pred8x8c);
 mb-ref_frame = VP56_FRAME_CURRENT;
 mb-partitioning = VP8_SPLITMVMODE_NONE;
 AV_ZERO32(mb-bmv[0]);
@@ -791,7 +794,7 @@ void decode_mb_coeffs(VP8Context *s, VP56RangeCoder *c, 
VP8Macroblock *mb,
 {
 int i, x, y, luma_start = 0, luma_ctx = 3;
 int nnz_pred, nnz, nnz_total = 0;
-int segment = s-segment;
+int segment = mb-segment;
 int block_dc = 0;
 
 if (mb-mode != MODE_I4x4  mb-mode != VP8_MVMODE_SPLIT) {
@@ -1002,7 +1005,7 @@ void intra_predict(VP8Context *s, uint8_t *dst[3], 
VP8Macroblock *mb,
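
To make the data movement in this patch easier to follow: the top and left 4x4
prediction modes now travel with each VP8Macroblock instead of living in shared
VP8Context arrays, so every macroblock first seeds its own copies from its
neighbours. A minimal sketch of that propagation, with VP8Macroblock reduced to
just the fields involved (the real struct has many more members):

#include <stddef.h>
#include <stdint.h>
#include <string.h>

/* Sketch only: VP8Macroblock stripped down to the prediction-mode fields
 * touched by this patch. */
typedef struct MBSketch {
    uint8_t intra4x4_pred_mode_top[4];  /* bottom row of 4x4 modes, read by the MB below       */
    uint8_t intra4x4_pred_mode_left[4]; /* rightmost column of 4x4 modes, read by the next MB  */
} MBSketch;

/* mb points into an array with a guard row above and a guard column to the
 * left; stride corresponds to the "mb - s->mb_width - 1" top-neighbour offset
 * used in the diff. */
static void seed_pred_modes(MBSketch *mb, ptrdiff_t stride)
{
    const MBSketch *mb_top  = mb - stride;
    const MBSketch *mb_left = mb - 1;

    memcpy(mb->intra4x4_pred_mode_left, mb_left->intra4x4_pred_mode_left, 4);
    memcpy(mb->intra4x4_pred_mode_top,  mb_top->intra4x4_pred_mode_top,   4);
}

Once a macroblock is decoded, its own mode bytes are overwritten in place (the
AV_WN32A stores above), so the macroblock to its right and the one below it
pick them up through the same two memcpy calls.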
 

[libav-devel] [PATCH 3/4] VP8: Decode mvs and mb modes separately.

2012-06-21 Thread Daniel Kang
In preparation for sliced threading.
---
 libavcodec/vp8.c |   37 -
 1 files changed, 32 insertions(+), 5 deletions(-)

diff --git a/libavcodec/vp8.c b/libavcodec/vp8.c
index bc2476e..db4a875 100644
--- a/libavcodec/vp8.c
+++ b/libavcodec/vp8.c
@@ -1578,6 +1578,32 @@ static void release_queued_segmaps(VP8Context *s, int 
is_close)
 }
 
 #define MARGIN (16 << 2)
+static void vp8_decode_mv_mb_modes(AVCodecContext *avctx, AVFrame *curframe, AVFrame *prev_frame) {
+    VP8Context *s = avctx->priv_data;
+    int mb_x, mb_y;
+
+    s->mv_min.y = -MARGIN;
+    s->mv_max.y = ((s->mb_height - 1) << 6) + MARGIN;
+    for (mb_y = 0; mb_y < s->mb_height; mb_y++) {
+        VP8Macroblock *mb = s->macroblocks_base + ((s->mb_width+1)*(mb_y + 1) + 1);
+        int mb_xy = mb_y*s->mb_width;
+
+        memset(mb - 1, 0, sizeof(*mb));   // zero left macroblock
+        AV_WN32A((mb-1)->intra4x4_pred_mode_left, DC_PRED*0x01010101);
+
+        s->mv_min.x = -MARGIN;
+        s->mv_max.x = ((s->mb_width  - 1) << 6) + MARGIN;
+        for (mb_x = 0; mb_x < s->mb_width; mb_x++, mb_xy++, mb++) {
+            decode_mb_mode(s, mb, mb_x, mb_y, curframe->ref_index[0] + mb_xy,
+                           prev_frame && prev_frame->ref_index[0] ? prev_frame->ref_index[0] + mb_xy : NULL);
+            s->mv_min.x -= 64;
+            s->mv_max.x -= 64;
+        }
+        s->mv_min.y -= 64;
+        s->mv_max.y -= 64;
+    }
+}
+
 static void vp8_decode_mb_row(AVCodecContext *avctx, AVFrame *curframe, 
AVFrame *prev_frame, int mb_y) {
 VP8Context *s = avctx-priv_data;
 VP56RangeCoder *c = s-coeff_partition[mb_y  
(s-num_coeff_partitions-1)];
@@ -1589,9 +1615,7 @@ static void vp8_decode_mb_row(AVCodecContext *avctx, 
AVFrame *curframe, AVFrame
 curframe-data[2] +  8*mb_y*s-uvlinesize
 };
 
-memset(mb - 1, 0, sizeof(*mb));   // zero left macroblock
 memset(s-left_nnz, 0, sizeof(s-left_nnz));
-AV_WN32A((mb-1)-intra4x4_pred_mode_left, DC_PRED*0x01010101);
 
 // left edge of 129 for intra prediction
 if (!(avctx-flags  CODEC_FLAG_EMU_EDGE)) {
@@ -1610,9 +1634,6 @@ static void vp8_decode_mb_row(AVCodecContext *avctx, 
AVFrame *curframe, AVFrame
 s-dsp.prefetch(dst[0] + (mb_x3)*4*s-linesize + 64, s-linesize, 4);
 s-dsp.prefetch(dst[1] + (mb_x7)*s-uvlinesize + 64, dst[2] - dst[1], 
2);
 
-decode_mb_mode(s, mb, mb_x, mb_y, curframe-ref_index[0] + mb_xy,
-   prev_frame  prev_frame-ref_index[0] ? 
prev_frame-ref_index[0] + mb_xy : NULL);
-
 prefetch_motion(s, mb, mb_x, mb_y, mb_xy, VP56_FRAME_PREVIOUS);
 
 if (!mb-skip)
@@ -1774,6 +1795,12 @@ static int vp8_decode_frame(AVCodecContext *avctx, void 
*data, int *data_size,
 memset(mb-intra4x4_pred_mode_top, DC_PRED, 4);
 }
 
+    // Make sure the previous frame has read its segmentation map,
+    // if we re-use the same map.
+    if (prev_frame && s->segmentation.enabled && !s->segmentation.update_map)
+        ff_thread_await_progress(prev_frame, 1, 0);
+    vp8_decode_mv_mb_modes(avctx, curframe, prev_frame);
+
 s-mv_min.y = -MARGIN;
 s-mv_max.y = ((s-mb_height - 1)  6) + MARGIN;
 
-- 
1.7.7.3

___
libav-devel mailing list
libav-devel@libav.org
https://lists.libav.org/mailman/listinfo/libav-devel
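
Why this reordering helps: once vp8_decode_mv_mb_modes() has walked the whole
frame, the per-row pass in vp8_decode_mb_row() only reads the coefficient
partitions, so rows can later be handed to slice threads. A toy, self-contained
illustration of the two-pass structure (all names below are stand-ins, not the
libav API):

typedef struct { int mode; int skip; } ToyMB;

/* pass 1: follows bitstream order, inherently serial */
static void toy_decode_mode(ToyMB *mb, int x, int y)
{
    mb->mode = (x + y) & 3;
    mb->skip = 0;
}

/* pass 2: depends only on the metadata produced by pass 1 */
static void toy_decode_pixels(const ToyMB *mb)
{
    (void)mb; /* reconstruct pixels for this macroblock */
}

static void toy_decode_frame(ToyMB *mbs, int mb_width, int mb_height)
{
    int x, y;

    for (y = 0; y < mb_height; y++)        /* pass 1 over the whole frame */
        for (x = 0; x < mb_width; x++)
            toy_decode_mode(&mbs[y * mb_width + x], x, y);

    for (y = 0; y < mb_height; y++)        /* pass 2, sliceable per row   */
        for (x = 0; x < mb_width; x++)
            toy_decode_pixels(&mbs[y * mb_width + x]);
}

The ff_thread_await_progress() call added to vp8_decode_frame() covers the case
where the segmentation map is reused: the wait makes sure the previous frame
has read its segmentation map before pass 1 of this frame starts.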


[libav-devel] [PATCH 1/4] VP8: Change mb memory layout for sliced threading.

2012-06-21 Thread Daniel Kang
---
 libavcodec/vp8.c |   14 +++---
 1 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/libavcodec/vp8.c b/libavcodec/vp8.c
index 7a8a0c6..6ab4b26 100644
--- a/libavcodec/vp8.c
+++ b/libavcodec/vp8.c
@@ -121,7 +121,7 @@ static int update_dimensions(VP8Context *s, int width, int 
height)
 s-mb_width  = (s-avctx-coded_width +15) / 16;
 s-mb_height = (s-avctx-coded_height+15) / 16;
 
-s-macroblocks_base= 
av_mallocz((s-mb_width+s-mb_height*2+1)*sizeof(*s-macroblocks));
+s-macroblocks_base= 
av_mallocz((s-mb_width+2)*(s-mb_height+2)*sizeof(*s-macroblocks));
 s-filter_strength = 
av_mallocz(s-mb_width*sizeof(*s-filter_strength));
 s-intra4x4_pred_mode_top  = av_mallocz(s-mb_width*4);
 s-top_nnz = av_mallocz(s-mb_width*sizeof(*s-top_nnz));
@@ -131,7 +131,7 @@ static int update_dimensions(VP8Context *s, int width, int 
height)
 !s-top_nnz || !s-top_border)
 return AVERROR(ENOMEM);
 
-s-macroblocks= s-macroblocks_base + 1;
+s-macroblocks= s-macroblocks_base + s-mb_width + 2;
 
 return 0;
 }
@@ -472,7 +472,7 @@ int decode_splitmvs(VP8Context *s, VP56RangeCoder *c, 
VP8Macroblock *mb)
 {
 int part_idx;
 int n, num;
-VP8Macroblock *top_mb  = mb[2];
+VP8Macroblock *top_mb  = mb[-s-mb_width-1];
 VP8Macroblock *left_mb = mb[-1];
 const uint8_t *mbsplits_left = vp8_mbsplits[left_mb-partitioning],
   *mbsplits_top = vp8_mbsplits[top_mb-partitioning],
@@ -534,9 +534,9 @@ int decode_splitmvs(VP8Context *s, VP56RangeCoder *c, 
VP8Macroblock *mb)
 static av_always_inline
 void decode_mvs(VP8Context *s, VP8Macroblock *mb, int mb_x, int mb_y)
 {
-VP8Macroblock *mb_edge[3] = { mb + 2 /* top */,
+VP8Macroblock *mb_edge[3] = { mb - s-mb_width-1 /* top */,
   mb - 1 /* left */,
-  mb + 1 /* top-left */ };
+  mb - s-mb_width-2 /* top-left */ };
 enum { CNT_ZERO, CNT_NEAREST, CNT_NEAR, CNT_SPLITMV };
 enum { VP8_EDGE_TOP, VP8_EDGE_LEFT, VP8_EDGE_TOPLEFT };
 int idx = CNT_ZERO;
@@ -1578,7 +1578,7 @@ static void release_queued_segmaps(VP8Context *s, int 
is_close)
 static void vp8_decode_mb_row(AVCodecContext *avctx, AVFrame *curframe, 
AVFrame *prev_frame, int mb_y) {
 VP8Context *s = avctx-priv_data;
 VP56RangeCoder *c = s-coeff_partition[mb_y  
(s-num_coeff_partitions-1)];
-VP8Macroblock *mb = s-macroblocks + (s-mb_height - mb_y - 1)*2;
+VP8Macroblock *mb = s-macroblocks + ((s-mb_width+1)*(mb_y + 1) + 1);
 int i, y, mb_x, mb_xy = mb_y*s-mb_width;
 uint8_t *dst[3] = {
 curframe-data[0] + 16*mb_y*s-linesize,
@@ -1757,7 +1757,7 @@ static int vp8_decode_frame(AVCodecContext *avctx, void 
*data, int *data_size,
 memset(s-top_nnz, 0, s-mb_width*sizeof(*s-top_nnz));
 
 /* Zero macroblock structures for top/top-left prediction from outside the 
frame. */
-memset(s-macroblocks + s-mb_height*2 - 1, 0, 
(s-mb_width+1)*sizeof(*s-macroblocks));
+memset(s-macroblocks_base, 0, (s-mb_width+1)*sizeof(*s-macroblocks));
 
 // top edge of 127 for intra prediction
 if (!(avctx-flags  CODEC_FLAG_EMU_EDGE)) {
-- 
1.7.7.3

___
libav-devel mailing list
libav-devel@libav.org
https://lists.libav.org/mailman/listinfo/libav-devel
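
The new layout is easiest to read as a 2-D grid with a one-macroblock guard
border, so the top, left and top-left neighbours become fixed offsets from the
current macroblock instead of the old mb[2]/mb[1] addressing. A small sketch of
the index arithmetic implied by the hunks above (the base offset differs
slightly between this patch and the follow-ups, so treat it as illustrative):

#include <stddef.h>

/* One guard column sits to the left of each row, so vertically adjacent
 * macroblocks are (mb_width + 1) entries apart. */
static size_t mb_row_stride(int mb_width)
{
    return (size_t)mb_width + 1;
}

/* First real macroblock of row mb_y; compare
 * "s->macroblocks + ((s->mb_width+1)*(mb_y + 1) + 1)" in vp8_decode_mb_row(). */
static size_t mb_row_start(int mb_width, int mb_y)
{
    return mb_row_stride(mb_width) * (size_t)(mb_y + 1) + 1;
}

/* Neighbour offsets relative to the current macroblock, matching
 * mb[-s->mb_width - 1] (top), mb - 1 (left) and mb - s->mb_width - 2
 * (top-left) in the diff. */
static ptrdiff_t mb_top_offset(int mb_width)     { return -(ptrdiff_t)mb_row_stride(mb_width); }
static ptrdiff_t mb_left_offset(void)            { return -1; }
static ptrdiff_t mb_topleft_offset(int mb_width) { return -(ptrdiff_t)mb_row_stride(mb_width) - 1; }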


[libav-devel] [PATCH 4/4] [WIP] VP8: Implement sliced threading.

2012-06-21 Thread Daniel Kang
---
Jason has some ideas to improve speed.

This hurts frame-threaded and single-threaded decoding by ~0.5% (expected).

With two threads I get a ~30% speed increase, with 4 threads ~45%, and with 8 threads ~50%.
---
 libavcodec/vp8.c |  394 +++---
 libavcodec/vp8.h |   56 +
 2 files changed, 288 insertions(+), 162 deletions(-)

diff --git a/libavcodec/vp8.c b/libavcodec/vp8.c
index db4a875..56e40ea 100644
--- a/libavcodec/vp8.c
+++ b/libavcodec/vp8.c
@@ -4,6 +4,7 @@
  * Copyright (C) 2010 David Conrad
  * Copyright (C) 2010 Ronald S. Bultje
  * Copyright (C) 2010 Jason Garrett-Glaser
+ * Copyright (C) 2012 Daniel Kang
  *
  * This file is part of Libav.
  *
@@ -22,6 +23,8 @@
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
+#include pthread.h
+
 #include libavutil/imgutils.h
 #include avcodec.h
 #include internal.h
@@ -36,10 +39,13 @@
 
 static void free_buffers(VP8Context *s)
 {
+int i;
+for (i = 0; i  MAX_THREADS; i++) {
+av_freep(s-thread_data[i]);
+}
 av_freep(s-macroblocks_base);
 av_freep(s-filter_strength);
 av_freep(s-top_nnz);
-av_freep(s-edge_emu_buffer);
 av_freep(s-top_border);
 
 s-macroblocks = NULL;
@@ -107,6 +113,8 @@ static void vp8_decode_flush(AVCodecContext *avctx)
 
 static int update_dimensions(VP8Context *s, int width, int height)
 {
+int i;
+
 if (width  != s-avctx-width ||
 height != s-avctx-height) {
 if (av_image_check_size(width, height, 0, s-avctx))
@@ -125,8 +133,14 @@ static int update_dimensions(VP8Context *s, int width, int 
height)
 s-top_nnz = av_mallocz(s-mb_width*sizeof(*s-top_nnz));
 s-top_border  = 
av_mallocz((s-mb_width+1)*sizeof(*s-top_border));
 
-if (!s-macroblocks_base || !s-filter_strength ||
-!s-top_nnz || !s-top_border)
+for (i = 0; i  MAX_THREADS; i++) {
+s-thread_data[i] = av_mallocz(sizeof(VP8ThreadData));
+s-thread_data[i]-filter_strength = 
av_mallocz(s-mb_width*sizeof(*s-thread_data[0]-filter_strength));
+//pthread_mutex_init(s-thread_data[i]-lock, NULL);
+//pthread_cond_init(s-thread_data[i]-cond, NULL);
+}
+
+if (!s-macroblocks_base || !s-top_nnz || !s-top_border)
 return AVERROR(ENOMEM);
 
 s-macroblocks= s-macroblocks_base + s-mb_width + 1;
@@ -624,15 +638,13 @@ void decode_intra4x4_modes(VP8Context *s, VP56RangeCoder 
*c, VP8Macroblock *mb,
int mb_x, int keyframe)
 {
 VP8Macroblock *mb_top  = mb - s-mb_width - 1;
-VP8Macroblock *mb_left = mb - 1;
 uint8_t *intra4x4 = mb-intra4x4_pred_mode_mb;
 
-memcpy(mb-intra4x4_pred_mode_left, mb_left-intra4x4_pred_mode_left, 4);
 memcpy(mb-intra4x4_pred_mode_top,  mb_top-intra4x4_pred_mode_top,   4);
 if (keyframe) {
 int x, y;
 uint8_t* const top = mb-intra4x4_pred_mode_top;
-uint8_t* const left = mb-intra4x4_pred_mode_left;
+uint8_t* const left = s-intra4x4_pred_mode_left;
 for (y = 0; y  4; y++) {
 for (x = 0; x  4; x++) {
 const uint8_t *ctx;
@@ -670,7 +682,7 @@ void decode_mb_mode(VP8Context *s, VP8Macroblock *mb, int 
mb_x, int mb_y, uint8_
 } else {
 const uint32_t modes = vp8_pred4x4_mode[mb-mode] * 0x01010101u;
 AV_WN32A(mb-intra4x4_pred_mode_top,  modes);
-AV_WN32A(mb-intra4x4_pred_mode_left, modes);
+AV_WN32A( s-intra4x4_pred_mode_left, modes);
 }
 
 mb-chroma_pred_mode = vp8_rac_get_tree(c, vp8_pred8x8c_tree, 
vp8_pred8x8c_prob_intra);
@@ -789,7 +801,7 @@ int decode_block_coeffs(VP56RangeCoder *c, DCTELEM 
block[16],
 }
 
 static av_always_inline
-void decode_mb_coeffs(VP8Context *s, VP56RangeCoder *c, VP8Macroblock *mb,
+void decode_mb_coeffs(VP8Context *s, VP8ThreadData *td, VP56RangeCoder *c, 
VP8Macroblock *mb,
   uint8_t t_nnz[9], uint8_t l_nnz[9])
 {
 int i, x, y, luma_start = 0, luma_ctx = 3;
@@ -801,16 +813,16 @@ void decode_mb_coeffs(VP8Context *s, VP56RangeCoder *c, 
VP8Macroblock *mb,
 nnz_pred = t_nnz[8] + l_nnz[8];
 
 // decode DC values and do hadamard
-nnz = decode_block_coeffs(c, s-block_dc, s-prob-token[1], 0, 
nnz_pred,
+nnz = decode_block_coeffs(c, td-block_dc, s-prob-token[1], 0, 
nnz_pred,
   s-qmat[segment].luma_dc_qmul);
 l_nnz[8] = t_nnz[8] = !!nnz;
 if (nnz) {
 nnz_total += nnz;
 block_dc = 1;
 if (nnz == 1)
-s-vp8dsp.vp8_luma_dc_wht_dc(s-block, s-block_dc);
+s-vp8dsp.vp8_luma_dc_wht_dc(td-block, td-block_dc);
 else
-s-vp8dsp.vp8_luma_dc_wht(s-block, s-block_dc);
+s-vp8dsp.vp8_luma_dc_wht(td-block, td-block_dc);
 }
 luma_start = 1;
 luma_ctx = 0;
@@ -820,10 +832,10 @@ void decode_mb_coeffs(VP8Context *s
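
For orientation while reading the rest of this patch: the per-thread state that
replaces the old VP8Context fields looks roughly like the struct below. Only
the members visible in the hunks above are included, the element types are
simplified to plain C (the real vp8.h uses DCTELEM, DECLARE_ALIGNED and
VP8FilterStrength), and the array shapes are assumed from the context fields
they replace:

#include <stdint.h>
#include <pthread.h>

/* Hypothetical sketch of VP8ThreadData, reduced to what this patch touches. */
typedef struct VP8ThreadDataSketch {
    int16_t block[6][4][16];    /* per-thread coefficient blocks (was s->block)       */
    int16_t block_dc[16];       /* per-thread DC block for the WHT (was s->block_dc)  */
    void   *filter_strength;    /* per-row filter strengths, now allocated per thread */
    pthread_mutex_t lock;       /* row-progress synchronisation; the init calls are   */
    pthread_cond_t  cond;       /*   still commented out in this WIP revision         */
} VP8ThreadDataSketch;

In update_dimensions() above, each of the MAX_THREADS slots gets one of these
plus its filter_strength array, so decode_mb_coeffs() can write td->block and
td->block_dc without touching shared context state.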

Re: [libav-devel] [PATCH 1/4] VP8: Change mb memory layout for sliced threading.

2012-06-21 Thread Daniel Kang
On Thu, Jun 21, 2012 at 6:52 PM, Daniel Kang <daniel.d.k...@gmail.com> wrote:

 ---
  libavcodec/vp8.c |   14 +++---
  1 files changed, 7 insertions(+), 7 deletions(-)


Oops I accidentally a patch -_-

Will re-send later.
___
libav-devel mailing list
libav-devel@libav.org
https://lists.libav.org/mailman/listinfo/libav-devel

