Re: [libav-devel] [PATCH] x86: Port gradfun to yasm

2013-10-21 Thread Loren Merritt
---
 libavfilter/x86/vf_gradfun.asm | 82 +-
 1 file changed, 24 insertions(+), 58 deletions(-)

diff --git a/libavfilter/x86/vf_gradfun.asm b/libavfilter/x86/vf_gradfun.asm
index e1737dd..0ba051d 100644
--- a/libavfilter/x86/vf_gradfun.asm
+++ b/libavfilter/x86/vf_gradfun.asm
@@ -27,6 +27,27 @@ pw_ff: times 8 dw 0xFF
 
 SECTION .text
 
+%macro FILTER_LINE 1
+movh   m0, [r2+r0]
+movh   m1, [r3+r0]
+punpcklbw  m0, m7
+punpcklwd  m1, m1
+psllw  m0, 7
+psubw  m1, m0
+PABSW  m2, m1
+pmulhuwm2, m5
+psubw  m2, m6
+pminsw m2, m7
+pmullw m2, m2
+psllw  m1, 2
+paddw  m0, %1
+pmulhw m1, m2
+paddw  m0, m1
+psraw  m0, 7
+packuswb   m0, m0
+movh  [r1+r0], m0
+%endmacro
+
 INIT_MMX mmxext
 cglobal gradfun_filter_line, 6, 6
 movh  m5, r4d
@@ -36,48 +57,10 @@ cglobal gradfun_filter_line, 6, 6
 mova  m3, [r5]
 mova  m4, [r5+8]
 .loop:
-movh  m0, [r2+r0]
-movh  m1, [r3+r0]
-punpcklbw m0, m7
-punpcklwd m1, m1
-psllw m0, 7
-pxor  m2, m2
-psubw m1, m0
-psubw m2, m1
-pmaxswm2, m1
-pmulhuw   m2, m5
-psubw m2, m6
-pminswm2, m7
-pmullwm2, m2
-paddw m0, m3
-psllw m1, 2
-pmulhwm1, m2
-paddw m0, m1
-psraw m0, 7
-packuswb  m0, m0
-movh [r1+r0], m0
+FILTER_LINE m3
 add   r0, 4
 jge .end
-movh  m0, [r2+r0]
-movh  m1, [r3+r0]
-punpcklbw m0, m7
-punpcklwd m1, m1
-psllw m0, 7
-pxor  m2, m2
-psubw m1, m0
-psubw m2, m1
-pmaxswm2, m1
-pmulhuw   m2, m5
-psubw m2, m6
-pminswm2, m7
-pmullwm2, m2
-paddw m0, m4
-psllw m1, 2
-pmulhwm1, m2
-paddw m0, m1
-psraw m0, 7
-packuswb  m0, m0
-movh [r1+r0], m0
+FILTER_LINE m4
 add   r0, 4
 jl .loop
 .end:
@@ -92,24 +75,7 @@ cglobal gradfun_filter_line, 6, 6, 8
 punpcklqdq m5, m5
 mova   m4, [r5]
 .loop:
-movh   m0, [r2+r0]
-movh   m1, [r3+r0]
-punpcklbw  m0, m7
-punpcklwd  m1, m1
-psllw  m0, 7
-psubw  m1, m0
-pabsw  m2, m1
-pmulhuwm2, m5
-psubw  m2, m6
-pminsw m2, m7
-pmullw m2, m2
-psllw  m1, 2
-paddw  m0, m4
-pmulhw m1, m2
-paddw  m0, m1
-psraw  m0, 7
-packuswb   m0, m0
-movh  [r1+r0], m0
+FILTER_LINE m4
 addr0, 8
 jl .loop
 REP_RET
-- 
1.8.3.2

___
libav-devel mailing list
libav-devel@libav.org
https://lists.libav.org/mailman/listinfo/libav-devel


Re: [libav-devel] [PATCH 01/10] x86inc: activate REP_RET automatically

2013-10-01 Thread Loren Merritt
On Mon, 30 Sep 2013, Diego Biurrun wrote:

 On Mon, Sep 30, 2013 at 04:38:01PM +0100, Derek Buitenhuis wrote:
  On 9/30/2013 4:28 PM, Diego Biurrun wrote:
   Not sure what you are trying to achieve with the if, $(STRIP) should
   always be set ...
  
   If we bring back stripping, we'll need the rest of the portability
   parts from e0be794 as well.
 
  For now, I do not think this patch is necessary, no?

 Are you referring to 01/10 or the build system patch?

 If the latter, then I don't know what the side effect of keeping
 those labels around is ...

The effect of keeping those labels around is making debugging harder.
Because those labels are meaningless, and complicate the disassembly.
Also, gdb can't tell the difference between them and function entrypoints.

This new strip command is irrelevant to any usage of libav that would
have used the old fully stripped version, because the old one was for
non-debug use.

--Loren Merritt
___
libav-devel mailing list
libav-devel@libav.org
https://lists.libav.org/mailman/listinfo/libav-devel


Re: [libav-devel] [PATCH 01/10] x86inc: activate REP_RET automatically

2013-09-12 Thread Loren Merritt
On Wed, 11 Sep 2013, Hendrik Leppkes wrote:

 From: Loren Merritt pengv...@akuvian.org

 Now RET checks whether it immediately follows a branch, so the programmer 
 doesn't have to keep track of that condition.
 REP_RET is still needed manually when it's a branch target, but that's much 
 rarer.

 The implementation involves lots of spurious labels, but that's ok because we 
 strip them.

That's true of x264's buildsystem, but not libav yet. So import that patch too:

---
 configure   | 3 +++
 library.mak | 1 +
 2 files changed, 4 insertions(+)

diff --git a/configure b/configure
index e6f8b52..fce4d25 100755
--- a/configure
+++ b/configure
@@ -1973,6 +1973,7 @@ nm_default=nm -g
 objformat=elf
 pkg_config_default=pkg-config
 ranlib=ranlib
+strip=strip
 yasmexe=yasm
 
 nogas=:
@@ -2231,6 +2232,7 @@ cc_default=${cross_prefix}${cc_default}
 nm_default=${cross_prefix}${nm_default}
 pkg_config_default=${cross_prefix}${pkg_config_default}
 ranlib=${cross_prefix}${ranlib}
+strip=${cross_prefix}${strip}
 
 sysinclude_default=${sysroot}/usr/include
 
@@ -4140,6 +4142,7 @@ AR=$ar
 ARFLAGS=$arflags
 AR_O=$ar_o
 RANLIB=$ranlib
+STRIP=$strip
 LN_S=$ln_s
 CPPFLAGS=$CPPFLAGS
 CFLAGS=$CFLAGS
diff --git a/library.mak b/library.mak
index 88d33dc..58f6360 100644
--- a/library.mak
+++ b/library.mak
@@ -25,6 +25,7 @@ $(SUBDIR)%-test.i: $(SUBDIR)%.c
 $(SUBDIR)x86/%.o: $(SUBDIR)x86/%.asm
$(DEPYASM) $(YASMFLAGS) -I $(D)/ -M -o $@ $  $(@:.o=.d)
$(YASM) $(YASMFLAGS) -I $(D)/ -o $@ $
+   -@ $(if $(STRIP), $(STRIP) -wN '..@*' $@)
 
 LIBOBJS := $(OBJS) $(SUBDIR)%.h.o $(TESTOBJS)
 $(LIBOBJS) $(LIBOBJS:.o=.i):   CPPFLAGS += -DHAVE_AV_CONFIG_H
-- 
1.8.3.2

___
libav-devel mailing list
libav-devel@libav.org
https://lists.libav.org/mailman/listinfo/libav-devel


[libav-devel] [PATCH 1/4] lpc: remove decay argument

2013-06-18 Thread Loren Merritt
We never used the rolling-average mode, and this makes av_update_lls 15% faster.
---
 libavcodec/lpc.c | 2 +-
 libavutil/lls.c  | 7 +++
 libavutil/lls.h  | 2 +-
 3 files changed, 5 insertions(+), 6 deletions(-)

diff --git a/libavcodec/lpc.c b/libavcodec/lpc.c
index bada368..c098b0f 100644
--- a/libavcodec/lpc.c
+++ b/libavcodec/lpc.c
@@ -222,7 +222,7 @@ int ff_lpc_calc_coefs(LPCContext *s,
 }else
 weight++;
 
-avpriv_update_lls(m[pass1], var, 1.0);
+avpriv_update_lls(m[pass1], var);
 }
 avpriv_solve_lls(m[pass1], 0.001, 0);
 }
diff --git a/libavutil/lls.c b/libavutil/lls.c
index 246189b..2061e6a 100644
--- a/libavutil/lls.c
+++ b/libavutil/lls.c
@@ -38,13 +38,12 @@ av_cold void avpriv_init_lls(LLSModel *m, int indep_count)
 m-indep_count = indep_count;
 }
 
-void avpriv_update_lls(LLSModel *m, double *var, double decay)
+void avpriv_update_lls(LLSModel *m, double *var)
 {
 int i, j;
 
 for (i = 0; i = m-indep_count; i++) {
 for (j = i; j = m-indep_count; j++) {
-m-covariance[i][j] *= decay;
 m-covariance[i][j] += var[i] * var[j];
 }
 }
@@ -125,7 +124,7 @@ av_cold void av_init_lls(LLSModel *m, int indep_count)
 }
 void av_update_lls(LLSModel *m, double *param, double decay)
 {
-avpriv_update_lls(m, param, decay);
+avpriv_update_lls(m, param);
 }
 void av_solve_lls(LLSModel *m, double threshold, int min_order)
 {
@@ -160,7 +159,7 @@ int main(void)
 var[1] = var[0] + av_lfg_get(lfg) / (double) UINT_MAX - 0.5;
 var[2] = var[1] + av_lfg_get(lfg) / (double) UINT_MAX - 0.5;
 var[3] = var[2] + av_lfg_get(lfg) / (double) UINT_MAX - 0.5;
-avpriv_update_lls(m, var, 0.99);
+avpriv_update_lls(m, var);
 avpriv_solve_lls(m, 0.001, 0);
 for (order = 0; order  3; order++) {
 eval = avpriv_evaluate_lls(m, var + 1, order);
diff --git a/libavutil/lls.h b/libavutil/lls.h
index f493076..9c71cf9 100644
--- a/libavutil/lls.h
+++ b/libavutil/lls.h
@@ -40,7 +40,7 @@ typedef struct LLSModel {
 } LLSModel;
 
 void avpriv_init_lls(LLSModel *m, int indep_count);
-void avpriv_update_lls(LLSModel *m, double *param, double decay);
+void avpriv_update_lls(LLSModel *m, double *param);
 void avpriv_solve_lls(LLSModel *m, double threshold, unsigned short min_order);
 double avpriv_evaluate_lls(LLSModel *m, double *param, int order);
 
-- 
1.8.1.5

___
libav-devel mailing list
libav-devel@libav.org
https://lists.libav.org/mailman/listinfo/libav-devel


[libav-devel] [PATCH 0/4] lpc optimizations v2

2013-06-18 Thread Loren Merritt
changes:
Converted API to function pointers, like we usually use for DSP functions.
Removed the unused rolling-average mode from the C implementation. My asm
already didn't support it.
Updated doxygen, headers, and function prefixes.

(Not reposting the unchanged patches.)

___
libav-devel mailing list
libav-devel@libav.org
https://lists.libav.org/mailman/listinfo/libav-devel


[libav-devel] [PATCH 4/4] x86: lpc: simd av_evaluate_lls

2013-06-18 Thread Loren Merritt
1.5x-1.8x faster on sandybridge
---
 libavutil/x86/lls.asm| 38 ++
 libavutil/x86/lls_init.c |  3 +++
 2 files changed, 41 insertions(+)

diff --git a/libavutil/x86/lls.asm b/libavutil/x86/lls.asm
index 92c00fc..92b7f95 100644
--- a/libavutil/x86/lls.asm
+++ b/libavutil/x86/lls.asm
@@ -194,3 +194,41 @@ cglobal update_lls, 3,6,8, ctx, var, count, i, j, count2
 jle .loop2x1
 .ret:
 REP_RET
+
+
+INIT_XMM sse2
+cglobal evaluate_lls, 2,4,2, ctx, var, order, i
+; This function is often called on the same buffer as update_lls, but with
+; an offset. They can't both be aligned.
+; Load halves rather than movu to avoid store-forwarding stalls, since the
+; input was initialized immediately prior to this function using scalar 
math.
+%define coefsq ctxq
+mov id, orderd
+imulorderd, MAX_VARS
+lea coefsq, [ctxq + LLSModel.coeff + orderq*8]
+movsd   m0, [varq]
+movhpd  m0, [varq + 8]
+mulpd   m0, [coefsq]
+lea coefsq, [coefsq + iq*8]
+lea   varq, [varq + iq*8]
+neg iq
+add iq, 2
+.loop:
+movsd   m1, [varq + iq*8]
+movhpd  m1, [varq + iq*8 + 8]
+mulpd   m1, [coefsq + iq*8]
+addpd   m0, m1
+add iq, 2
+jl .loop
+jg .skip1
+movsd   m1, [varq + iq*8]
+mulsd   m1, [coefsq + iq*8]
+addpd   m0, m1
+.skip1:
+movhlps m1, m0
+addsd   m0, m1
+%if ARCH_X86_32
+movsd  r0m, m0
+fld   qword r0m
+%endif
+RET
diff --git a/libavutil/x86/lls_init.c b/libavutil/x86/lls_init.c
index 8a80f83..888bc54 100644
--- a/libavutil/x86/lls_init.c
+++ b/libavutil/x86/lls_init.c
@@ -25,12 +25,15 @@
 
 void ff_update_lls_sse2(LLSModel *m, double *var);
 void ff_update_lls_avx(LLSModel *m, double *var);
+double ff_evaluate_lls_sse2(LLSModel *m, double *var, int order);
 
 av_cold void ff_init_lls_x86(LLSModel *m)
 {
 int cpu_flags = av_get_cpu_flags();
 if (EXTERNAL_SSE2(cpu_flags)) {
 m-update_lls = ff_update_lls_sse2;
+if (m-indep_count = 4)
+m-evaluate_lls = ff_evaluate_lls_sse2;
 }
 if (EXTERNAL_AVX(cpu_flags)) {
 m-update_lls = ff_update_lls_avx;
-- 
1.8.1.5

___
libav-devel mailing list
libav-devel@libav.org
https://lists.libav.org/mailman/listinfo/libav-devel


[libav-devel] [PATCH 2/4] lpc: use function pointers, in preparation for asm

2013-06-18 Thread Loren Merritt
---
 libavcodec/lpc.c |  4 ++--
 libavutil/lls.c  | 26 ++
 libavutil/lls.h  | 15 +--
 3 files changed, 29 insertions(+), 16 deletions(-)

diff --git a/libavcodec/lpc.c b/libavcodec/lpc.c
index c098b0f..f60976b 100644
--- a/libavcodec/lpc.c
+++ b/libavcodec/lpc.c
@@ -212,7 +212,7 @@ int ff_lpc_calc_coefs(LPCContext *s,
 
 if(pass){
 double eval, inv, rinv;
-eval= avpriv_evaluate_lls(m[(pass-1)1], var+1, 
max_order-1);
+eval= m[(pass-1)1].evaluate_lls(m[(pass-1)1], var+1, 
max_order-1);
 eval= (512pass) + fabs(eval - var[0]);
 inv = 1/eval;
 rinv = sqrt(inv);
@@ -222,7 +222,7 @@ int ff_lpc_calc_coefs(LPCContext *s,
 }else
 weight++;
 
-avpriv_update_lls(m[pass1], var);
+m[pass1].update_lls(m[pass1], var);
 }
 avpriv_solve_lls(m[pass1], 0.001, 0);
 }
diff --git a/libavutil/lls.c b/libavutil/lls.c
index 2061e6a..5a3e448 100644
--- a/libavutil/lls.c
+++ b/libavutil/lls.c
@@ -32,13 +32,7 @@
 #include version.h
 #include lls.h
 
-av_cold void avpriv_init_lls(LLSModel *m, int indep_count)
-{
-memset(m, 0, sizeof(LLSModel));
-m-indep_count = indep_count;
-}
-
-void avpriv_update_lls(LLSModel *m, double *var)
+static void update_lls(LLSModel *m, double *var)
 {
 int i, j;
 
@@ -106,7 +100,7 @@ void avpriv_solve_lls(LLSModel *m, double threshold, 
unsigned short min_order)
 }
 }
 
-double avpriv_evaluate_lls(LLSModel *m, double *param, int order)
+static double evaluate_lls(LLSModel *m, double *param, int order)
 {
 int i;
 double out = 0;
@@ -117,6 +111,14 @@ double avpriv_evaluate_lls(LLSModel *m, double *param, int 
order)
 return out;
 }
 
+av_cold void avpriv_init_lls(LLSModel *m, int indep_count)
+{
+memset(m, 0, sizeof(LLSModel));
+m-indep_count = indep_count;
+m-update_lls = update_lls;
+m-evaluate_lls = evaluate_lls;
+}
+
 #if FF_API_LLS_PRIVATE
 av_cold void av_init_lls(LLSModel *m, int indep_count)
 {
@@ -124,7 +126,7 @@ av_cold void av_init_lls(LLSModel *m, int indep_count)
 }
 void av_update_lls(LLSModel *m, double *param, double decay)
 {
-avpriv_update_lls(m, param);
+m-update_lls(m, param);
 }
 void av_solve_lls(LLSModel *m, double threshold, int min_order)
 {
@@ -132,7 +134,7 @@ void av_solve_lls(LLSModel *m, double threshold, int 
min_order)
 }
 double av_evaluate_lls(LLSModel *m, double *param, int order)
 {
-return avpriv_evaluate_lls(m, param, order);
+return m-evaluate_lls(m, param, order);
 }
 #endif /* FF_API_LLS_PRIVATE */
 
@@ -159,10 +161,10 @@ int main(void)
 var[1] = var[0] + av_lfg_get(lfg) / (double) UINT_MAX - 0.5;
 var[2] = var[1] + av_lfg_get(lfg) / (double) UINT_MAX - 0.5;
 var[3] = var[2] + av_lfg_get(lfg) / (double) UINT_MAX - 0.5;
-avpriv_update_lls(m, var);
+m.update_lls(m, var);
 avpriv_solve_lls(m, 0.001, 0);
 for (order = 0; order  3; order++) {
-eval = avpriv_evaluate_lls(m, var + 1, order);
+eval = m.evaluate_lls(m, var + 1, order);
 printf(real:%9f order:%d pred:%9f var:%f coeffs:%f %9f %9f\n,
var[0], order, eval, sqrt(m.variance[order] / (i + 1)),
m.coeff[order][0], m.coeff[order][1],
diff --git a/libavutil/lls.h b/libavutil/lls.h
index 9c71cf9..8183440 100644
--- a/libavutil/lls.h
+++ b/libavutil/lls.h
@@ -37,12 +37,23 @@ typedef struct LLSModel {
 double coeff[MAX_VARS][MAX_VARS];
 double variance[MAX_VARS];
 int indep_count;
+/**
+ * Take the outer-product of var[] with itself, and add to the covariance 
matrix.
+ * @param m this context
+ * @param var training samples, starting with the value to be predicted
+ */
+void (*update_lls)(struct LLSModel *m, double *var);
+/**
+ * Inner product of var[] and the LPC coefs.
+ * @param m this context
+ * @param var training samples, excluding the value to be predicted
+ * @param order lpc order
+ */
+double (*evaluate_lls)(struct LLSModel *m, double *var, int order);
 } LLSModel;
 
 void avpriv_init_lls(LLSModel *m, int indep_count);
-void avpriv_update_lls(LLSModel *m, double *param);
 void avpriv_solve_lls(LLSModel *m, double threshold, unsigned short min_order);
-double avpriv_evaluate_lls(LLSModel *m, double *param, int order);
 
 #if FF_API_LLS_PRIVATE
 void av_init_lls(LLSModel *m, int indep_count);
-- 
1.8.1.5

___
libav-devel mailing list
libav-devel@libav.org
https://lists.libav.org/mailman/listinfo/libav-devel


[libav-devel] [PATCH 3/4] x86: lpc: simd av_update_lls

2013-06-18 Thread Loren Merritt
4x-6x faster on sandybridge
---
 libavcodec/lpc.c |   4 +-
 libavutil/lls.c  |   8 +-
 libavutil/lls.h  |  12 ++-
 libavutil/x86/Makefile   |   2 +
 libavutil/x86/lls.asm| 196 +++
 libavutil/x86/lls_init.c |  38 +
 6 files changed, 253 insertions(+), 7 deletions(-)
 create mode 100644 libavutil/x86/lls.asm
 create mode 100644 libavutil/x86/lls_init.c

diff --git a/libavcodec/lpc.c b/libavcodec/lpc.c
index f60976b..144bbed 100644
--- a/libavcodec/lpc.c
+++ b/libavcodec/lpc.c
@@ -200,7 +200,9 @@ int ff_lpc_calc_coefs(LPCContext *s,
 ref[i] = fabs(lpc[i][i]);
 } else if (lpc_type == FF_LPC_TYPE_CHOLESKY) {
 LLSModel m[2];
-double var[MAX_LPC_ORDER+1], av_uninit(weight);
+LOCAL_ALIGNED(32, double, var, [FFALIGN(MAX_LPC_ORDER+1,4)]);
+double av_uninit(weight);
+memset(var, 0, FFALIGN(MAX_LPC_ORDER+1,4)*sizeof(*var));
 
 for(pass=0; passlpc_passes; pass++){
 avpriv_init_lls(m[pass1], max_order);
diff --git a/libavutil/lls.c b/libavutil/lls.c
index 5a3e448..f87c2cd 100644
--- a/libavutil/lls.c
+++ b/libavutil/lls.c
@@ -46,8 +46,8 @@ static void update_lls(LLSModel *m, double *var)
 void avpriv_solve_lls(LLSModel *m, double threshold, unsigned short min_order)
 {
 int i, j, k;
-double (*factor)[MAX_VARS + 1] = (void *) m-covariance[1][0];
-double (*covar) [MAX_VARS + 1] = (void *) m-covariance[1][1];
+double (*factor)[MAX_VARS_ALIGN] = (void *) m-covariance[1][0];
+double (*covar) [MAX_VARS_ALIGN] = (void *) m-covariance[1][1];
 double *covar_y= m-covariance[0];
 int count  = m-indep_count;
 
@@ -117,6 +117,8 @@ av_cold void avpriv_init_lls(LLSModel *m, int indep_count)
 m-indep_count = indep_count;
 m-update_lls = update_lls;
 m-evaluate_lls = evaluate_lls;
+if (ARCH_X86)
+ff_init_lls_x86(m);
 }
 
 #if FF_API_LLS_PRIVATE
@@ -154,7 +156,7 @@ int main(void)
 avpriv_init_lls(m, 3);
 
 for (i = 0; i  100; i++) {
-double var[4];
+LOCAL_ALIGNED(32, double, var, [4]);
 double eval;
 
 var[0] = (av_lfg_get(lfg) / (double) UINT_MAX - 0.5) * 2;
diff --git a/libavutil/lls.h b/libavutil/lls.h
index 8183440..27c0d5e 100644
--- a/libavutil/lls.h
+++ b/libavutil/lls.h
@@ -23,9 +23,12 @@
 #ifndef AVUTIL_LLS_H
 #define AVUTIL_LLS_H
 
+#include common.h
+#include mem.h
 #include version.h
 
 #define MAX_VARS 32
+#define MAX_VARS_ALIGN FFALIGN(MAX_VARS+1,4)
 
 //FIXME avoid direct access to LLSModel from outside
 
@@ -33,26 +36,29 @@
  * Linear least squares model.
  */
 typedef struct LLSModel {
-double covariance[MAX_VARS + 1][MAX_VARS + 1];
-double coeff[MAX_VARS][MAX_VARS];
+DECLARE_ALIGNED(32, double, covariance[MAX_VARS_ALIGN][MAX_VARS_ALIGN]);
+DECLARE_ALIGNED(32, double, coeff[MAX_VARS][MAX_VARS]);
 double variance[MAX_VARS];
 int indep_count;
 /**
  * Take the outer-product of var[] with itself, and add to the covariance 
matrix.
  * @param m this context
  * @param var training samples, starting with the value to be predicted
+ *32-byte aligned, and any padding elements must be initialized
+ *(i.e not denormal/nan).
  */
 void (*update_lls)(struct LLSModel *m, double *var);
 /**
  * Inner product of var[] and the LPC coefs.
  * @param m this context
- * @param var training samples, excluding the value to be predicted
+ * @param var training samples, excluding the value to be predicted. 
unaligned.
  * @param order lpc order
  */
 double (*evaluate_lls)(struct LLSModel *m, double *var, int order);
 } LLSModel;
 
 void avpriv_init_lls(LLSModel *m, int indep_count);
+void ff_init_lls_x86(LLSModel *m);
 void avpriv_solve_lls(LLSModel *m, double threshold, unsigned short min_order);
 
 #if FF_API_LLS_PRIVATE
diff --git a/libavutil/x86/Makefile b/libavutil/x86/Makefile
index ae07470..1e19082 100644
--- a/libavutil/x86/Makefile
+++ b/libavutil/x86/Makefile
@@ -1,6 +1,8 @@
 OBJS += x86/cpu.o   \
 x86/float_dsp_init.o\
+x86/lls_init.o  \
 
 YASM-OBJS += x86/cpuid.o\
  x86/emms.o \
  x86/float_dsp.o\
+ x86/lls.o  \
diff --git a/libavutil/x86/lls.asm b/libavutil/x86/lls.asm
new file mode 100644
index 000..92c00fc
--- /dev/null
+++ b/libavutil/x86/lls.asm
@@ -0,0 +1,196 @@
+;**
+;* linear least squares model
+;*
+;* Copyright (c) 2013 Loren Merritt
+;*
+;* This file is part of Libav

Re: [libav-devel] [PATCH 2/5] x86: lpc: simd av_evaluate_lls

2013-06-16 Thread Loren Merritt
On Sun, 16 Jun 2013, Ronald S. Bultje wrote:
 On Sat, Jun 15, 2013 at 5:53 PM, Loren Merritt lor...@u.washington.eduwrote:

 1.5x-1.8x faster on sandybridge
 ---
  libavutil/lls.c  |  3 +++
  libavutil/lls.h  |  1 +
  libavutil/x86/lls.asm| 31 +++
  libavutil/x86/lls_init.c |  6 +-
  4 files changed, 40 insertions(+), 1 deletion(-)

 diff --git a/libavutil/lls.c b/libavutil/lls.c
 index eb500af..8f1aff1 100644
 --- a/libavutil/lls.c
 +++ b/libavutil/lls.c
 @@ -119,6 +119,9 @@ double avpriv_evaluate_lls(LLSModel *m, double *param,
 int order)
  int i;
  double out = 0;

 +if (m-evaluate_lls)
 +return m-evaluate_lls(m-coeff[order], param, order);

 Is there a special reason you didn't assign the default code as default
 implementation for evaluate_lls (as in: evaluate_lls_c), as is commonly
 done?

No special reason, I just forgot to update those when I realised I didn't
have to preserve the api.

--Loren Merritt
___
libav-devel mailing list
libav-devel@libav.org
https://lists.libav.org/mailman/listinfo/libav-devel


Re: [libav-devel] [PATCH 1/5] x86: lpc: simd av_update_lls

2013-06-16 Thread Loren Merritt
On Sun, 16 Jun 2013, Anton Khirnov wrote:
On Sun, 16 Jun 2013 00:53:43 +, Loren Merritt lor...@u.washington.edu 
wrote:

 FIXME: This adds alignment constraints and changes a struct that used to be
 public. External use of LLS is already deprecated, but I guess we can't
 actually enable this optimization until the next major version bump.

 I don't think the header was ever installed, so it wasn't really public.
 I wouldn't bother with compatibility in this case.

Ok with me. But what does FF_API_LLS_PRIVATE mean, if not these functions
were public, and are now transitioning to private? Why were they ifdeffed
rather than removed when someone decided they shouldn't have been
exported?

--Loren Merritt
___
libav-devel mailing list
libav-devel@libav.org
https://lists.libav.org/mailman/listinfo/libav-devel


Re: [libav-devel] [PATCH 2/5] x86: lpc: simd av_evaluate_lls

2013-06-16 Thread Loren Merritt
On Sun, 16 Jun 2013, Jason Garrett-Glaser wrote:

 +movsd   m0, [varq]
 +movhpd  m0, [varq+8]

 movu isn't faster or at least the same?  I'd think it'd be better on
 Haswell at least, but I'm surprised it hurts on Sandy Bridge (if it
 does?)

movu can store-forwarding stall, since the parent function initializes
that array with scalar ops.

--Loren Merritt
___
libav-devel mailing list
libav-devel@libav.org
https://lists.libav.org/mailman/listinfo/libav-devel


[libav-devel] [PATCH 1/5] x86: lpc: simd av_update_lls

2013-06-15 Thread Loren Merritt
/float_dsp.o\
+ x86/lls.o  \
diff --git a/libavutil/x86/lls.asm b/libavutil/x86/lls.asm
new file mode 100644
index 000..f44f7ae
--- /dev/null
+++ b/libavutil/x86/lls.asm
@@ -0,0 +1,184 @@
+;**
+;* linear least squares model
+;*
+;* Copyright (c) 2013 Loren Merritt
+;*
+;* This file is part of Libav.
+;*
+;* Libav is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* Libav is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with Libav; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;**
+
+%include x86util.asm
+
+SECTION .text
+
+%define MAX_VARS 32
+%define COVAR_STRIDE (MAX_VARS+4)*8
+%define COVAR(x,y) [covarq+(x)*8+y*COVAR_STRIDE]
+
+%macro ADDPD_MEM 2
+%if cpuflag(avx)
+vaddpd %2, %1
+%else
+addpd  %2, %1
+%endif
+mova   %1, %2
+%endmacro
+
+INIT_XMM sse2
+%define movdqa movaps
+cglobal update_lls, 3,5,8, covar, var, i, j, covar2
+lea   varq, [varq+iq*8]
+neg iq
+mov covar2q, covarq
+.loopi:
+; Compute all 3 pairwise products of a 2x2 block that lies on the diagonal
+movam1, [varq+iq*8]
+movam3, [varq+iq*8+16]
+pshufd  m4, m1, q1010
+pshufd  m5, m1, q3232
+pshufd  m6, m3, q1010
+pshufd  m7, m3, q3232
+mulpd   m0, m1, m4
+mulpd   m1, m1, m5
+lea covarq, [covar2q+16]
+ADDPD_MEM COVAR(-2,0), m0
+ADDPD_MEM COVAR(-2,1), m1
+lea jq, [iq+2]
+cmp jd, -2
+jg .skip4x4
+.loop4x4:
+; Compute all 16 pairwise products of a 4x4 block
+mulpd   m0, m4, m3
+mulpd   m1, m5, m3
+mulpd   m2, m6, m3
+mulpd   m3, m3, m7
+ADDPD_MEM COVAR(0,0), m0
+ADDPD_MEM COVAR(0,1), m1
+ADDPD_MEM COVAR(0,2), m2
+ADDPD_MEM COVAR(0,3), m3
+movam3, [varq+jq*8+16]
+mulpd   m0, m4, m3
+mulpd   m1, m5, m3
+mulpd   m2, m6, m3
+mulpd   m3, m3, m7
+ADDPD_MEM COVAR(2,0), m0
+ADDPD_MEM COVAR(2,1), m1
+ADDPD_MEM COVAR(2,2), m2
+ADDPD_MEM COVAR(2,3), m3
+movam3, [varq+jq*8+32]
+add covarq, 32
+add jq, 4
+cmp jd, -2
+jle .loop4x4
+.skip4x4:
+testjd, jd
+jg .skip2x4
+mulpd   m4, m3
+mulpd   m5, m3
+mulpd   m6, m3
+mulpd   m7, m3
+ADDPD_MEM COVAR(0,0), m4
+ADDPD_MEM COVAR(0,1), m5
+ADDPD_MEM COVAR(0,2), m6
+ADDPD_MEM COVAR(0,3), m7
+.skip2x4:
+add iq, 4
+add covar2q, 4*COVAR_STRIDE+32
+cmp id, -2
+jle .loopi
+testid, id
+jg .ret
+mov jq, iq
+%define covarq covar2q
+.loop2x1:
+movsd   m0, [varq+iq*8]
+movlhps m0, m0
+mulpd   m0, [varq+jq*8]
+ADDPD_MEM COVAR(0,0), m0
+inc iq
+add covarq, COVAR_STRIDE
+testid, id
+jle .loop2x1
+.ret:
+REP_RET
+
+INIT_YMM avx
+cglobal update_lls, 3,6,8, covar, var, count, i, j, count2
+lea count2d, [countq-2]
+xor id, id
+.loopi:
+; Compute all 10 pairwise products of a 4x4 block that lies on the diagonal
+movaymm1, [varq+iq*8]
+vbroadcastsd ymm4, [varq+iq*8]
+vbroadcastsd ymm5, [varq+iq*8+8]
+vbroadcastsd ymm6, [varq+iq*8+16]
+vbroadcastsd ymm7, [varq+iq*8+24]
+vextractf128 xmm3, ymm1, 1
+vmulpd  ymm0, ymm1, ymm4
+vmulpd  ymm1, ymm1, ymm5
+vmulpd  xmm2, xmm3, xmm6
+vmulpd  xmm3, xmm3, xmm7
+ADDPD_MEM COVAR(iq  ,0), ymm0
+ADDPD_MEM COVAR(iq  ,1), ymm1
+ADDPD_MEM COVAR(iq+2,2), xmm2
+ADDPD_MEM COVAR(iq+2,3), xmm3
+lea jd, [iq+4]
+cmp jd, count2d
+jg .skip4x4
+.loop4x4:
+; Compute all 16 pairwise products of a 4x4 block
+movaymm3, [varq+jq*8]
+vmulpd  ymm0, ymm3, ymm4
+vmulpd  ymm1, ymm3, ymm5
+vmulpd  ymm2, ymm3, ymm6
+vmulpd  ymm3, ymm3, ymm7
+ADDPD_MEM COVAR(jq,0), ymm0
+ADDPD_MEM COVAR(jq,1), ymm1
+ADDPD_MEM COVAR(jq,2), ymm2
+ADDPD_MEM COVAR(jq,3), ymm3
+add jd, 4
+cmp jd, count2d
+jle .loop4x4
+.skip4x4:
+cmp jd, countd
+jg .skip2x4
+movaxmm3, [varq+jq*8]
+vmulpd  xmm0, xmm3, xmm4
+vmulpd  xmm1, xmm3, xmm5
+vmulpd  xmm2, xmm3, xmm6
+vmulpd  xmm3, xmm3, xmm7
+ADDPD_MEM COVAR(jq,0), xmm0
+ADDPD_MEM COVAR(jq,1), xmm1
+ADDPD_MEM COVAR(jq,2), xmm2
+ADDPD_MEM COVAR(jq,3), xmm3
+.skip2x4

[libav-devel] [PATCH 4/5] x86: lpc: optimize lpc_compute_autocorr_sse2

2013-06-15 Thread Loren Merritt
4% faster on sandybridge
---
 libavcodec/x86/lpc.c | 10 ++
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/libavcodec/x86/lpc.c b/libavcodec/x86/lpc.c
index 7a37b88..902d364 100644
--- a/libavcodec/x86/lpc.c
+++ b/libavcodec/x86/lpc.c
@@ -124,11 +124,13 @@ static void lpc_compute_autocorr_sse2(const double *data, 
int len, int lag,
 movsdMANGLE(pd_1), %%xmm1\n\t
 1: \n\t
 movapd   (%3,%0), %%xmm3   \n\t
-movupd -8(%4,%0), %%xmm4   \n\t
+movapd   (%4,%0), %%xmm4   \n\t
+movsd  -8(%4,%0), %%xmm5   \n\t
+movlhps   %%xmm4, %%xmm5   \n\t
 mulpd %%xmm3, %%xmm4   \n\t
-mulpd(%4,%0), %%xmm3   \n\t
-addpd %%xmm4, %%xmm1   \n\t
-addpd %%xmm3, %%xmm0   \n\t
+mulpd %%xmm3, %%xmm5   \n\t
+addpd %%xmm4, %%xmm0   \n\t
+addpd %%xmm5, %%xmm1   \n\t
 add   $16,%0   \n\t
 jl 1b  \n\t
 movhlps   %%xmm0, %%xmm3   \n\t
-- 
1.8.1.5

___
libav-devel mailing list
libav-devel@libav.org
https://lists.libav.org/mailman/listinfo/libav-devel


[libav-devel] [PATCH 2/5] x86: lpc: simd av_evaluate_lls

2013-06-15 Thread Loren Merritt
1.5x-1.8x faster on sandybridge
---
 libavutil/lls.c  |  3 +++
 libavutil/lls.h  |  1 +
 libavutil/x86/lls.asm| 31 +++
 libavutil/x86/lls_init.c |  6 +-
 4 files changed, 40 insertions(+), 1 deletion(-)

diff --git a/libavutil/lls.c b/libavutil/lls.c
index eb500af..8f1aff1 100644
--- a/libavutil/lls.c
+++ b/libavutil/lls.c
@@ -119,6 +119,9 @@ double avpriv_evaluate_lls(LLSModel *m, double *param, int 
order)
 int i;
 double out = 0;
 
+if (m-evaluate_lls)
+return m-evaluate_lls(m-coeff[order], param, order);
+
 for (i = 0; i = order; i++)
 out += param[i] * m-coeff[order][i];
 
diff --git a/libavutil/lls.h b/libavutil/lls.h
index 76ff10c..8a4d318 100644
--- a/libavutil/lls.h
+++ b/libavutil/lls.h
@@ -39,6 +39,7 @@ typedef struct LLSModel {
 double variance[MAX_VARS];
 int indep_count;
 void (*update_lls)(struct LLSModel *m, double *var, int order);
+double (*evaluate_lls)(double *coefs, double *var, int order);
 } LLSModel;
 
 void avpriv_init_lls(LLSModel *m, int indep_count);
diff --git a/libavutil/x86/lls.asm b/libavutil/x86/lls.asm
index f44f7ae..b5e04d9 100644
--- a/libavutil/x86/lls.asm
+++ b/libavutil/x86/lls.asm
@@ -182,3 +182,34 @@ cglobal update_lls, 3,6,8, covar, var, count, i, j, count2
 jle .loop2x1
 .ret:
 REP_RET
+
+
+INIT_XMM sse2
+cglobal evaluate_lls, 3,3,3, coefs, var, i
+; This function is often called on the same buffer as update_lls, but with 
an offset. They can't both be aligned.
+movsd   m0, [varq]
+movhpd  m0, [varq+8]
+mulpd   m0, [coefsq]
+lea coefsq, [coefsq+iq*8]
+lea   varq, [varq+iq*8]
+neg iq
+add iq, 2
+.loop:
+movsd   m1, [varq+iq*8]
+movhpd  m1, [varq+iq*8+8]
+mulpd   m1, [coefsq+iq*8]
+addpd   m0, m1
+add iq, 2
+jl .loop
+jg .skip1
+movsd   m1, [varq+iq*8]
+mulsd   m1, [coefsq+iq*8]
+addpd   m0, m1
+.skip1:
+movhlps m1, m0
+addsd   m0, m1
+%if ARCH_X86_32
+movsd  r0m, m0
+fld   qword r0m
+%endif
+RET
diff --git a/libavutil/x86/lls_init.c b/libavutil/x86/lls_init.c
index 1215b14..d65ddc8 100644
--- a/libavutil/x86/lls_init.c
+++ b/libavutil/x86/lls_init.c
@@ -25,12 +25,16 @@
 
 void ff_update_lls_sse2(LLSModel *m, double *var, int order);
 void ff_update_lls_avx(LLSModel *m, double *var, int order);
+double ff_evaluate_lls_sse2(double *coefs, double *var, int order);
 
 void avpriv_init_lls_x86(LLSModel *m)
 {
 int cpu_flags = av_get_cpu_flags();
-if (EXTERNAL_SSE2(cpu_flags))
+if (EXTERNAL_SSE2(cpu_flags)) {
 m-update_lls = ff_update_lls_sse2;
+if (m-indep_count = 4)
+m-evaluate_lls = ff_evaluate_lls_sse2;
+}
 if (EXTERNAL_AVX(cpu_flags))
 m-update_lls = ff_update_lls_avx;
 }
-- 
1.8.1.5

___
libav-devel mailing list
libav-devel@libav.org
https://lists.libav.org/mailman/listinfo/libav-devel


[libav-devel] [PATCH 3/5] x86: lpc: optimize av_solve_lls

2013-06-15 Thread Loren Merritt
1.2x-1.4x faster on sandybridge
---
 libavutil/lls.c | 8 
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/libavutil/lls.c b/libavutil/lls.c
index 8f1aff1..bcaa901 100644
--- a/libavutil/lls.c
+++ b/libavutil/lls.c
@@ -75,9 +75,9 @@ void avpriv_solve_lls(LLSModel *m, double threshold, unsigned 
short min_order)
 if (i == j) {
 if (sum  threshold)
 sum = 1.0;
-factor[i][i] = sqrt(sum);
+factor[i][i] = 1.0 / sqrt(sum);
 } else {
-factor[j][i] = sum / factor[i][i];
+factor[j][i] = sum * factor[i][i];
 }
 }
 }
@@ -88,7 +88,7 @@ void avpriv_solve_lls(LLSModel *m, double threshold, unsigned 
short min_order)
 for (k = i - 1; k = 0; k--)
 sum -= factor[i][k] * m-coeff[0][k];
 
-m-coeff[0][i] = sum / factor[i][i];
+m-coeff[0][i] = sum * factor[i][i];
 }
 
 for (j = count - 1; j = min_order; j--) {
@@ -98,7 +98,7 @@ void avpriv_solve_lls(LLSModel *m, double threshold, unsigned 
short min_order)
 for (k = i + 1; k = j; k++)
 sum -= factor[k][i] * m-coeff[j][k];
 
-m-coeff[j][i] = sum / factor[i][i];
+m-coeff[j][i] = sum * factor[i][i];
 }
 
 m-variance[j] = covar_y[0];
-- 
1.8.1.5

___
libav-devel mailing list
libav-devel@libav.org
https://lists.libav.org/mailman/listinfo/libav-devel


[libav-devel] [PATCH 5/5] lpc: use levinson for the first pass of multipass cholesky

2013-06-15 Thread Loren Merritt
Levinson is faster, and cholesky is only needed if we want to apply different
weights to different samples, which doesn't happen on the first pass.
---
 libavcodec/lpc.c | 29 -
 1 file changed, 16 insertions(+), 13 deletions(-)

diff --git a/libavcodec/lpc.c b/libavcodec/lpc.c
index 7247a76..58c7585 100644
--- a/libavcodec/lpc.c
+++ b/libavcodec/lpc.c
@@ -176,7 +176,7 @@ int ff_lpc_calc_coefs(LPCContext *s,
 double autoc[MAX_LPC_ORDER+1];
 double ref[MAX_LPC_ORDER];
 double lpc[MAX_LPC_ORDER][MAX_LPC_ORDER];
-int i, j, pass;
+int i, j, pass = 0;
 int opt_order;
 
 assert(max_order = MIN_LPC_ORDER  max_order = MAX_LPC_ORDER 
@@ -189,7 +189,7 @@ int ff_lpc_calc_coefs(LPCContext *s,
 ff_lpc_init(s, blocksize, max_order, lpc_type);
 }
 
-if (lpc_type == FF_LPC_TYPE_LEVINSON) {
+if (lpc_type == FF_LPC_TYPE_LEVINSON || (lpc_type == FF_LPC_TYPE_CHOLESKY 
 lpc_passes  1)) {
 s-lpc_apply_welch_window(samples, blocksize, s-windowed_samples);
 
 s-lpc_compute_autocorr(s-windowed_samples, blocksize, max_order, 
autoc);
@@ -198,13 +198,20 @@ int ff_lpc_calc_coefs(LPCContext *s,
 
 for(i=0; i<max_order; i++)
 ref[i] = fabs(lpc[i][i]);
-} else if (lpc_type == FF_LPC_TYPE_CHOLESKY) {
+
+pass++;
+}
+
+if (lpc_type == FF_LPC_TYPE_CHOLESKY) {
 LLSModel m[2];
 LOCAL_ALIGNED(32, double, var, [FFALIGN(MAX_LPC_ORDER+1,4)]);
 double av_uninit(weight);
 memset(var, 0, FFALIGN(MAX_LPC_ORDER+1,4)*sizeof(*var));
 
-for(pass=0; pass<lpc_passes; pass++){
+for(j=0; j<max_order; j++)
+m[0].coeff[max_order-1][j] = -lpc[max_order-1][j];
+
+for(; pass<lpc_passes; pass++){
 avpriv_init_lls(&m[pass&1], max_order);
 
 weight=0;
@@ -259,15 +266,11 @@ av_cold int ff_lpc_init(LPCContext *s, int blocksize, int 
max_order,
 s-max_order = max_order;
 s-lpc_type  = lpc_type;
 
-if (lpc_type == FF_LPC_TYPE_LEVINSON) {
-s->windowed_buffer = av_mallocz((blocksize + 2 + FFALIGN(max_order, 
4)) *
-sizeof(*s->windowed_samples));
-if (!s->windowed_buffer)
-return AVERROR(ENOMEM);
-s->windowed_samples = s->windowed_buffer + FFALIGN(max_order, 4);
-} else {
-s->windowed_samples = NULL;
-}
+s->windowed_buffer = av_mallocz((blocksize + 2 + FFALIGN(max_order, 4)) *
+sizeof(*s->windowed_samples));
+if (!s->windowed_buffer)
+return AVERROR(ENOMEM);
+s->windowed_samples = s->windowed_buffer + FFALIGN(max_order, 4);
 
 s->lpc_apply_welch_window = lpc_apply_welch_window_c;
 s->lpc_compute_autocorr   = lpc_compute_autocorr_c;
-- 
1.8.1.5

___
libav-devel mailing list
libav-devel@libav.org
https://lists.libav.org/mailman/listinfo/libav-devel


Re: [libav-devel] [libav-api] High CPU load of the avcodec_decode_video2() function

2013-03-26 Thread Loren Merritt
On Tue, 26 Mar 2013, Ondřej Perutka wrote:

 Hello again,
 I think I found the problem (correct me if I'm wrong :-)). I checked all
 functions with AVX 256 instructions and there is really no such function
 which is called from my application and does not have the vzeroupper
 instruction.

 After a few days of disassembling I've discovered there is quite a lot of
 functions which mix both SSE and AVX128 instructions. The
 ff_deblock_v_luma_8_avx, I mentioned before, is one of them. It is not a
 problem unless there is a non-zero in the upper half of YMM registers or
 the registers are in the state C (using Agner's terminology). In such case
 the registers would oscillate between states B and C and both of these
 transitions are time consuming.

 So here is my theory: Somewhere in the JVM or its dependent libraries is an
 AVX 256 code with missing vzeroupper instruction. This code leaves YMM
 registers in the state B. After that there could be also some SSE code
 which would leave YMM registers in the state C. Everything is fine until
 the ff_deblock_v_luma_8_avx (or similar function) is called. After that the
 registers oscillate between state B and C as the ff_deblock_v_luma_8_avx
 transits between SSE and AVX 128 instructions. It also corresponds to my
 previous profiling results.

 Knowing all this I tried to solve it. I added following macro into the
 libavutil/x86inc.asm:

 %macro vzeroupper 0
 %if avx_enabled
  vzeroupper
 %endif
 %endmacro

 
 and I also updated all macros used after INIT_XMM avx in all files so
 there is vzeroupper after each cglobal in these macros. Everything works

Wrong solution. vzeroupper is not free, so don't just sprinkle it
everywhere. You should either fix JVM, or put a single vzeroupper on
entry to the whole libav.

--Loren Merritt
___
libav-devel mailing list
libav-devel@libav.org
https://lists.libav.org/mailman/listinfo/libav-devel


Re: [libav-devel] [PATCH 1/1] x86: consistently use unaligned movs in the unaligned bswap

2013-03-24 Thread Loren Merritt
On Sun, 24 Mar 2013, Janne Grunau wrote:

 Fixes fate errors in asv1, ffvhuff and huffyuv on x86_32.
 ---
  libavcodec/x86/dsputil.asm | 18 +-
  1 file changed, 9 insertions(+), 9 deletions(-)

 diff --git a/libavcodec/x86/dsputil.asm b/libavcodec/x86/dsputil.asm
 index a1aaaf5..686cec8 100644
 --- a/libavcodec/x86/dsputil.asm
 +++ b/libavcodec/x86/dsputil.asm
 @@ -552,23 +552,23 @@ VECTOR_CLIP_INT32 6, 1, 0, 0
  %if cpuflag(ssse3)
  pshufb   m0, m2
  pshufb   m1, m2
 -mova [r0 +  0], m0
 -mova [r0 + 16], m1
 +mov%1[r0 +  0], m0
 +mov%1[r0 + 16], m1
  %else
  pshuflw  m0, m0, 10110001b
  pshuflw  m1, m1, 10110001b
  pshufhw  m0, m0, 10110001b
  pshufhw  m1, m1, 10110001b
 -mova m2, m0
 -mova m3, m1
 +mov%1m2, m0
 +mov%1m3, m1

register-to-register move is always aligned.

  psllwm0, 8
  psllwm1, 8
  psrlwm2, 8
  psrlwm3, 8
  por  m2, m0
  por  m3, m1
 -mova [r0 +  0], m2
 -mova [r0 + 16], m3
 +mov%1[r0 +  0], m2
 +mov%1[r0 + 16], m3
  %endif
  add  r0, 32
  add  r1, 32
 @@ -581,15 +581,15 @@ VECTOR_CLIP_INT32 6, 1, 0, 0
  mov%1m0, [r1]
  %if cpuflag(ssse3)
  pshufb   m0, m2
 -mova [r0], m0
 +mov%1[r0], m0
  %else
  pshuflw  m0, m0, 10110001b
  pshufhw  m0, m0, 10110001b
 -mova m2, m0
 +mov%1m2, m0

same here.

  psllwm0, 8
  psrlwm2, 8
  por  m2, m0
 -mova [r0], m2
 +mov%1[r0], m2
  %endif
  add  r1, 16
  add  r0, 16

--Loren Merritt
___
libav-devel mailing list
libav-devel@libav.org
https://lists.libav.org/mailman/listinfo/libav-devel


Re: [libav-devel] [libav-api] High CPU load of the avcodec_decode_video2() function

2013-03-17 Thread Loren Merritt
On Sun, 17 Mar 2013, Jason Garrett-Glaser wrote:
 On Sun, Mar 17, 2013 at 5:48 PM, Ondrej Perutka perutka.ond...@gmail.com 
 wrote:

 ... and according to the Intel reference:

 http://software.intel.com/sites/default/files/319433-014.pdf

 the VZEROUPPER or the VZEROALL instruction should be executed before and
 after any usage of VEX-encoded instructions (i.e. 256-bit AVX). It means
 both transitions SSE -> AVX and AVX -> SSE (not only before return from AVX
 function). See section 2.8.1.

 That's not what it says, and is not correct.  x264 does not use
 vzeroupper before any of its AVX functions and the official Intel
 emulator records no false SSE/AVX transition problems.

And if you (Ondrej) want to know how a both transitions can have
performance penalties (as the Intel doc says), while still needing only a
single vzeroupper to fix both, then see Agner's optimization guide
http://agner.org/optimize/optimizing_assembly.pdf section 13.6.

--Loren Merritt
___
libav-devel mailing list
libav-devel@libav.org
https://lists.libav.org/mailman/listinfo/libav-devel


Re: [libav-devel] [PATCH] hpeldsp: x86: Convert dsputil_rnd_template to yasm

2013-02-19 Thread Loren Merritt
On Mon, 18 Feb 2013, Daniel Kang wrote:

 +%macro PAVGBP_MMX 6
 +mova   %3, %1
 +mova   %6, %4
 +por%3, %2
 +por%6, %5
 +pxor   %2, %1
 +pxor   %5, %4
 +pand   %2, m6
 +pand   %5, m6
 +psrlq  %2, 1
 +psrlq  %5, 1
 +psubb  %3, %2
 +psubb  %6, %5
 +%endmacro
 +
 +%macro PAVGB_NRND_OP_MMX 4
 +mova   %3, %1
 +pand   %3, %2
 +pxor   %2, %1
 +pand   %2, %4
 +psrlq  %2, 1
 +paddb  %3, %2
 +%endmacro
 +
 +%macro PAVGBP_NO_RND_MMX 6
 +PAVGB_NRND_OP_MMX %1, %2, %3, m6
 +PAVGB_NRND_OP_MMX %4, %5, %6, m6
 +%endmacro
 +
 +%macro PAVGB_OP_MMX 4
 +mova %3, %1
 +por  %3, %2
 +pxor %2, %1
 +pand %2, %4
 +psrlq%2, 1
 +psubb%3, %2
 +%endmacro

I meant eliminate PAVGBP_MMX and PAVGBP_NO_RND_MMX entirely, and instead
call PAVGB_OP_MMX or PAVGB_NRND_OP_MMX twice from the functions that used
to use them.

 +; put_pixels8_x2(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, 
 int h)
 +%macro PUT_PIXELS8_X2_MMX 0-1
 +cglobal put%1_pixels8_x2, 4,4
 +pcmpeqd  m6, m6
 +paddbm6, m6
 +.loop:
 +mova m0, [r1]
 +mova m1, [r1+1]
 +mova m2, [r1+r2]
 +mova m3, [r1+r2+1]
 +PAVGBP   m0, m1, m4, m2, m3, m5
 +mova   [r0], m4
 +mova  [r0+r2*1], m5
 +lea  r1, [r1+r2*2]
 +lea  r0, [r0+r2*2]
 +mova m0, [r1]
 +mova m1, [r1+1]
 +mova m2, [r1+r2]
 +mova m3, [r1+r2+1]
 +PAVGBP   m0, m1, m4, m2, m3, m5
 +mova   [r0], m4
 +mova  [r0+r2*1], m5
 +lea  r1, [r1+r2*2]
 +lea  r0, [r0+r2*2]
 +sub r3d, 4
 +jne .loop
 +RET
 +%endmacro

%rep.
That said, I would have guessed that the original purpose of the
unrolling in most of the functions in this patch was to allow manual
register renaming to eliminate a few moves. In which case the functions
where %rep works are precisely the ones that shouldn't have been
unrolled.

--Loren Merritt
___
libav-devel mailing list
libav-devel@libav.org
https://lists.libav.org/mailman/listinfo/libav-devel


Re: [libav-devel] [PATCH] hpeldsp: x86: Convert dsputil_rnd_template to yasm

2013-02-15 Thread Loren Merritt
On Fri, 15 Feb 2013, Daniel Kang wrote:

 +%macro PAVGBP_MMX 6
 +mova   %3, %1
 +mova   %6, %4
 +por%3, %2
 +por%6, %5
 +pxor   %2, %1
 +pxor   %5, %4
 +pand   %2, m6
 +pand   %5, m6
 +psrlq  %2, 1
 +psrlq  %5, 1
 +psubb  %3, %2
 +psubb  %6, %5
 +%endmacro
 +
 +%macro PAVGBP_NO_RND_MMX 6
 +mova %3, %1
 +mova %6, %4
 +pand %3, %2
 +pand %6, %5
 +pxor %2, %1
 +pxor %5, %4
 +pand %2, m6
 +pand %5, m6
 +psrlq%2, 1
 +psrlq%5, 1
 +paddb%3, %2
 +paddb%6, %5
 +%endmacro

Does this need to be interleaved, not just two calls to PAVGB_OP_MMX?

 +; put_pixels8_xy2(uint8_t *block, const uint8_t *pixels, int line_size, int 
 h)
 +%macro PUT_PIXELS8_XY2_MMX 0-1
 +cglobal put%1_pixels8_xy2, 4,5
 +pxor m7, m7
 +SET_RND(m6)
 +mova m0, [r1]
 +mova m4, [r1+1]
 +mova m1, m0
 +mova m5, m4
 +punpcklbwm0, m7
 +punpcklbwm4, m7
 +punpckhbwm1, m7
 +punpckhbwm5, m7
 +paddusw  m4, m0
 +paddusw  m5, m1
 +xor  r4, r4
 +add  r1, r2
 +.loop:
 +mova m0, [r1+r4]
 +mova m2, [r1+r4+1]
 +mova m1, m0
 +mova m3, m2
 +punpcklbwm0, m7
 +punpcklbwm2, m7
 +punpckhbwm1, m7
 +punpckhbwm3, m7
 +paddusw  m0, m2
 +paddusw  m1, m3
 +paddusw  m4, m6
 +paddusw  m5, m6
 +paddusw  m4, m0
 +paddusw  m5, m1
 +psrlwm4, 2
 +psrlwm5, 2
 +packuswb m4, m5
 +mova[r0+r4], m4
 +add  r4, r2
 +mova m2, [r1+r4]
 +mova m3, [r1+r4+1]
 +mova m3, m2
 +mova m5, m4
 +punpcklbwm2, m7
 +punpcklbwm4, m7
 +punpckhbwm3, m7
 +punpckhbwm5, m7
 +paddusw  m4, m2
 +paddusw  m5, m3
 +paddusw  m0, m6
 +paddusw  m1, m6
 +paddusw  m0, m4
 +paddusw  m1, m5
 +psrlwm0, 2
 +psrlwm1, 2
 +packuswb m0, m1
 +mova[r0+r4], m0
 +add  r4, r2
 +sub r3d, 2
 +jne .loop
 +RET
 +%endmacro

Does this and similar functions really need to be unrolled? If so, use
%rep.

--Loren Merritt
___
libav-devel mailing list
libav-devel@libav.org
https://lists.libav.org/mailman/listinfo/libav-devel


Re: [libav-devel] [PATCH] hpeldsp: x86: Convert dsputil_rnd_template to yasm

2013-02-14 Thread Loren Merritt
On Thu, 14 Feb 2013, Ronald S. Bultje wrote:
 On Feb 14, 2013 4:59 AM, Diego Biurrun di...@biurrun.de wrote:
On Wed, Feb 13, 2013 at 05:53:36PM -0500, Daniel Kang wrote:
 @@ -56,6 +107,44 @@ PUT_PIXELS8_X2

 +%macro PUT_PIXELS8_X2_MMX 0-1
 +%if %0 == 1
 +cglobal put%1_pixels8_x2, 4,4
 +%else
 +cglobal put_pixels8_x2, 4,4
 +%endif

 IIRC you don't need the %if, but you can just pass an empty
 first parameter and it should do the right thing.
 .. more below ..

 MACRO 0-1 "" sets an empty string by default.

True, but strings aren't good for much in asm.
MACRO 0-1 sets an empty *token* by default, which is what you want if
you're going to concatenate it into a function name.
MACRO 0-1 {} works too if you want to explicitly specify the empty token.

--Loren Merritt
___
libav-devel mailing list
libav-devel@libav.org
https://lists.libav.org/mailman/listinfo/libav-devel


Re: [libav-devel] [PATCH] dsputil: x86: Convert h263 loop filter to yasm

2013-02-05 Thread Loren Merritt
On Tue, 5 Feb 2013, Luca Barbato wrote:

 On 02/02/13 00:28, Daniel Kang wrote:
  ---
  I am very skeptical when assembly works on the first time. More testing 
  would be appreciated.
  ---
   libavcodec/x86/dsputil.asm   |  159 
   libavcodec/x86/dsputil_mmx.c |  185 
  ++
   2 files changed, 167 insertions(+), 177 deletions(-)

 /usr/lib/gcc/x86_64-pc-linux-gnu/4.6.3/../../../../x86_64-pc-linux-gnu/bin/ld:
 libavcodec/libavcodec.a(dsputil.o): relocation R_X86_64_32 against
 `ff_h263_loop_filter_strength' can not be used when making a shared
 object; recompile with -fPIC
 libavcodec/libavcodec.a: could not read symbols: Bad value

 Looks like something is broken for x86_64.

The offending line is
movzx r3d, BYTE [ff_h263_loop_filter_strength+r2]
since you can't have both a register offset and a PIC offset in the same
address.

--Loren Merritt
___
libav-devel mailing list
libav-devel@libav.org
https://lists.libav.org/mailman/listinfo/libav-devel


Re: [libav-devel] [PATCH] vorbisdsp: convert x86 simd functions from inline asm to yasm.

2013-01-22 Thread Loren Merritt
On Mon, 21 Jan 2013, Ronald S. Bultje wrote:

 From: Ronald S. Bultje rsbul...@gmail.com

 ---
  libavcodec/x86/Makefile |  1 +
  libavcodec/x86/dsputil_mmx.c|  3 --
  libavcodec/x86/dsputil_mmx.h|  2 -
  libavcodec/x86/vorbisdsp.asm| 83 
 +
  libavcodec/x86/vorbisdsp_init.c | 77 --
  5 files changed, 92 insertions(+), 74 deletions(-)
  create mode 100644 libavcodec/x86/vorbisdsp.asm

LGTM.

--Loren Merritt
___
libav-devel mailing list
libav-devel@libav.org
https://lists.libav.org/mailman/listinfo/libav-devel


Re: [libav-devel] [PATCH v2] dsputilenc: x86: Convert pixel inline asm to yasm

2013-01-20 Thread Loren Merritt
On Wed, 16 Jan 2013, Daniel Kang wrote:

 ---
 Fixed movu - mova comment from Loren
 ---
  libavcodec/x86/dsputilenc.asm   |  152 +
  libavcodec/x86/dsputilenc_mmx.c |  201 
 ---
  2 files changed, 172 insertions(+), 181 deletions(-)

LGTM.

--Loren Merritt
___
libav-devel mailing list
libav-devel@libav.org
https://lists.libav.org/mailman/listinfo/libav-devel


Re: [libav-devel] [PATCH] dsputilenc: x86: Convert pixel inline asm to yasm

2013-01-15 Thread Loren Merritt
On Mon, 14 Jan 2013, Daniel Kang wrote:

 +INIT_MMX mmx
 +; get_pixels_mmx(DCTELEM *block, const uint8_t *pixels, int line_size)
 +cglobal get_pixels, 3,4
 +movsxdifnidn r2, r2d
 +add  r0, 128
 +mov  r3, -128
 +pxor m7, m7
 +.loop:
 +movu m0, [r1]
 +movu m2, [r1+r2]
 +movu m1, m0
 +movu m3, m2
 +punpcklbwm0, m7
 +punpckhbwm1, m7
 +punpcklbwm2, m7
 +punpckhbwm3, m7
 +movu [r0+r3+ 0], m0
 +movu [r0+r3+ 8], m1
 +movu [r0+r3+16], m2
 +movu [r0+r3+24], m3
 +lea  r1, [r1+r2*2]
 +add  r3, 32
 +js .loop
 +REP_RET

Is this really unaligned?

--Loren Merritt
___
libav-devel mailing list
libav-devel@libav.org
https://lists.libav.org/mailman/listinfo/libav-devel


Re: [libav-devel] [PATCH] x86inc: Add cvisible macro for C functions with public prefix

2013-01-14 Thread Loren Merritt
On Sun, 13 Jan 2013, Diego Biurrun wrote:

 ---

 Now using local variables for HIDDEN and FUNCTION_PREFIX, as suggested
 by Loren.

  libavutil/x86/x86inc.asm  |   40 
  libavutil/x86/x86util.asm |1 +
  2 files changed, 29 insertions(+), 12 deletions(-)

Patch OK.

--Loren Merritt
___
libav-devel mailing list
libav-devel@libav.org
https://lists.libav.org/mailman/listinfo/libav-devel


Re: [libav-devel] [PATCH] yadif: Fix bug with x86_32 MSVC

2013-01-12 Thread Loren Merritt
On Sat, 12 Jan 2013, Daniel Kang wrote:

 -mova   [rsp+ 0], m0
 -mova   [rsp+16], m3
 -mova   [rsp+32], m1
 +mova   [rsp- 0], m0
 +mova   [rsp-16], m3
 +mova   [rsp-32], m1

You can't do that on x86_32.

 +%assign PAD -1*80

Unused?

  %macro YADIF 0
 -cglobal yadif_filter_line, 7, 7, 8, 16*5, dst, prev, cur, next, w, prefs, \
 -  mrefs, parity, mode
 -test wq, wq
 +cglobal yadif_filter_line, 4, 7, 8, PAD

Do you have a reason for removing all the named args?

 +cmp DWORD r4m, 0
  jle .ret
 -movsxdifnidn prefsq, prefsd
 -movsxdifnidn mrefsq, mrefsd
 +%if ARCH_X86_32
 +movifnidn  r4, r5mp
 +movifnidn  r5, r6mp
 +DECLARE_REG_TMP 4,5
 +%else
 +movsxdifnidn r5, DWORD r5m
 +movsxdifnidn r6, DWORD r6m
 +DECLARE_REG_TMP 5,6
 +%endif

No ifnidn. After your change, they will not in fact be identical.

--Loren Merritt
___
libav-devel mailing list
libav-devel@libav.org
https://lists.libav.org/mailman/listinfo/libav-devel


Re: [libav-devel] [PATCH] yadif: Fix bug with x86_32 MSVC

2013-01-12 Thread Loren Merritt
On Sat, 12 Jan 2013, Daniel Kang wrote:
 On Sat, Jan 12, 2013 at 10:36 PM, Loren Merritt lor...@u.washington.edu 
 wrote:

 Do you have a reason for removing all the named args?

 I can't use half of the named args, and I thought it was less
 confusing if I didn't use them at all.

Half? The only ones you need to avoid are the 2 args you're moving.

--Loren Merritt
___
libav-devel mailing list
libav-devel@libav.org
https://lists.libav.org/mailman/listinfo/libav-devel


Re: [libav-devel] [PATCH 2/3] x86inc: Add cvisible macro for C functions with public prefix

2013-01-10 Thread Loren Merritt
On Wed, 9 Jan 2013, Diego Biurrun wrote:

 +%ifidn __OUTPUT_FORMAT__,elf && HIDDEN

That compares __OUTPUT_FORMAT__ against the string "elf && HIDDEN".
"&&" can only be used in %if, not %ifidn or any of the other conditionals.

--Loren Merritt
___
libav-devel mailing list
libav-devel@libav.org
https://lists.libav.org/mailman/listinfo/libav-devel


Re: [libav-devel] [PATCH v4] yadif: Port inline assembly to YASM

2013-01-09 Thread Loren Merritt
On Tue, 8 Jan 2013, Daniel Kang wrote:

 +cglobal yadif_filter_line, 9, 9, 8, 16*5, dst, prev, cur, next, w, prefs, \
 +  mrefs, parity, mode

If you never use parity and mode as regs, then reduce register allocation.

 +test wq, wq
 +jle .ret
 +movsxdifnidn prefsq, prefsd
 +movsxdifnidn mrefsq, mrefsd
 +
 +cmp   DWORD paritym, 0
 +je .parity0
 +FILTER 1, prevq, curq
 +jmp .ret
 +
 +.parity0:
 +FILTER 0, curq, nextq
 +
 +.ret:
 +RET

--Loren Merritt
___
libav-devel mailing list
libav-devel@libav.org
https://lists.libav.org/mailman/listinfo/libav-devel


[libav-devel] [PATCH] x86inc: activate REP_RET automatically

2012-12-19 Thread Loren Merritt
Now RET checks whether it immediately follows a branch, so the programmer 
dosen't have to keep track of that condition.
REP_RET is still needed manually when it's a branch target, but that's much 
rarer.

The implementation involves lots of spurious labels, so get rid of them with 
`strip`.
---
 configure   |  3 +++
 library.mak |  1 +
 libavutil/x86/x86inc.asm| 36 +++
 libavcodec/x86/ac3dsp.asm   |  6 ++---
 libavcodec/x86/deinterlace.asm  |  2 +-
 libavcodec/x86/dsputil.asm  | 22 -
 libavcodec/x86/fft.asm  |  8 +++---
 libavcodec/x86/fmtconvert.asm   | 12 -
 libavcodec/x86/h264_chromamc.asm| 18 +++---
 libavcodec/x86/h264_chromamc_10bit.asm  | 10 
 libavcodec/x86/h264_deblock.asm |  2 +-
 libavcodec/x86/h264_deblock_10bit.asm   | 10 
 libavcodec/x86/h264_idct.asm| 16 ++--
 libavcodec/x86/h264_intrapred.asm   | 28 ++---
 libavcodec/x86/h264_intrapred_10bit.asm | 16 ++--
 libavcodec/x86/h264_qpel_10bit.asm  |  2 +-
 libavcodec/x86/h264_qpel_8bit.asm   | 20 +++
 libavcodec/x86/h264_weight.asm  | 16 ++--
 libavcodec/x86/h264_weight_10bit.asm| 12 -
 libavcodec/x86/pngdsp.asm   |  2 +-
 libavcodec/x86/rv34dsp.asm  |  2 +-
 libavcodec/x86/rv40dsp.asm  | 10 
 libavcodec/x86/sbrdsp.asm   |  2 +-
 libavcodec/x86/vp8dsp.asm   | 40 +++---
 libavfilter/x86/af_volume.asm   |  6 ++---
 libavfilter/x86/hqdn3d.asm  |  2 +-
 libavresample/x86/audio_convert.asm | 44 -
 libavresample/x86/audio_mix.asm | 10 
 libavutil/x86/float_dsp.asm |  8 +++---
 libswscale/x86/input.asm| 14 +--
 libswscale/x86/output.asm   |  8 +++---
 libswscale/x86/scale.asm|  2 +-
 32 files changed, 210 insertions(+), 180 deletions(-)

diff --git a/configure b/configure
index 08f1d82..ac056b3 100755
--- a/configure
+++ b/configure
@@ -1802,6 +1802,7 @@ nm_default=nm -g
 objformat=elf
 pkg_config_default=pkg-config
 ranlib=ranlib
+strip=strip
 yasmexe=yasm
 
 nogas=:
@@ -2051,6 +2052,7 @@ cc_default=${cross_prefix}${cc_default}
 nm_default=${cross_prefix}${nm_default}
 pkg_config_default=${cross_prefix}${pkg_config_default}
 ranlib=${cross_prefix}${ranlib}
+strip=${cross_prefix}${strip}
 
 sysinclude_default=${sysroot}/usr/include
 
@@ -3842,6 +3844,7 @@ ARCH=$arch
 CC=$cc
 AS=$as
 LD=$ld
+STRIP=$strip
 DEPCC=$dep_cc
 DEPCCFLAGS=$DEPCCFLAGS \$(CPPFLAGS)
 DEPAS=$as
diff --git a/library.mak b/library.mak
index 3b4bd2d..23ee5fa 100644
--- a/library.mak
+++ b/library.mak
@@ -26,6 +26,7 @@ $(SUBDIR)%-test.i: $(SUBDIR)%.c
 $(SUBDIR)x86/%.o: $(SUBDIR)x86/%.asm
$(DEPYASM) $(YASMFLAGS) -I $(D)/ -M -o $@ $  $(@:.o=.d)
$(YASM) $(YASMFLAGS) -I $(D)/ -o $@ $
+   -@ $(if $(STRIP), $(STRIP) -wN '..@*' $@)
 
 LIBOBJS := $(OBJS) $(SUBDIR)%.h.o $(TESTOBJS)
 $(LIBOBJS) $(LIBOBJS:.o=.i):   CPPFLAGS += -DHAVE_AV_CONFIG_H
diff --git a/libavutil/x86/x86inc.asm b/libavutil/x86/x86inc.asm
index 2617cdf..5594db4 100644
--- a/libavutil/x86/x86inc.asm
+++ b/libavutil/x86/x86inc.asm
@@ -131,8 +131,7 @@ CPUNOP amdnop
 ; Pops anything that was pushed by PROLOGUE, and returns.
 
 ; REP_RET:
-; Same, but if it doesn't pop anything it becomes a 2-byte ret, for athlons
-; which are slow when a normal ret follows a branch.
+; Use this instead of RET if it's a branch target.
 
 ; registers:
 ; rN and rNq are the native-size register holding function argument N
@@ -480,7 +479,7 @@ DECLARE_REG 14, R15, 120
 %if mmsize == 32
 vzeroupper
 %endif
-ret
+AUTO_REP_RET
 %endmacro
 
 %elif ARCH_X86_64 ; *nix x64 ;=
@@ -527,7 +526,7 @@ DECLARE_REG 14, R15, 72
 %if mmsize == 32
 vzeroupper
 %endif
-ret
+AUTO_REP_RET
 %endmacro
 
 %else ; X86_32 ;==
@@ -583,7 +582,7 @@ DECLARE_ARG 7, 8, 9, 10, 11, 12, 13, 14
 %if mmsize == 32
 vzeroupper
 %endif
-ret
+AUTO_REP_RET
 %endmacro
 
 %endif ;==
@@ -597,6 +596,10 @@ DECLARE_ARG 7, 8, 9, 10, 11, 12, 13, 14
 %endmacro
 %endif
 
+; On AMD cpus <=K10, an ordinary ret is slow if it immediately follows either
+; a branch or a branch target. So switch to a 2-byte form of ret in that case.
+; We can automatically detect "follows a branch", but not a branch target.
+; (SSSE3 is a sufficient condition to know that your cpu doesn't have this 
problem.)
 %macro REP_RET 0
 %if has_epilogue
 RET
@@ -605,6 +608,29 @@ DECLARE_ARG 7, 8, 9, 10, 11, 12, 13, 14
 %endif
 %endmacro
 
+%define last_branch_adr $$

Re: [libav-devel] [PATCH 2/4] lavr: x86: add SSE2 and FMA4 quantization for non-filtering dither methods

2012-12-16 Thread Loren Merritt
On Sat, 15 Dec 2012, Justin Ruggles wrote:

 +;--
 +; void ff_quantize_int_rectangular/triangular(int16_t *dst, float *src,
 +; int *dither, int len);
 +;--
 +
 +%macro QUANTIZE_INT 1
 +cglobal quantize_int_%1, 4,5,6, dst, src, dither0, len, dither1
 +lea lenq, [2*lend]
 +add dstq, lenq
 +lea srcq, [srcq+2*lenq]
 +%ifidn %1, triangular
 +lea dither1q, [dither0q+4*lenq]
 +%endif
 +lea dither0q, [dither0q+2*lenq]
 +neg lenq
 +mova  m4, [pf_dither_scale]
 +mova  m5, [pf_s16_scale]
 +.loop:
 +cvtdq2ps  m0, [dither0q+2*lenq]
 +cvtdq2ps  m1, [dither0q+2*lenq+mmsize]

Do you have to store dither in a format different than the asm works with?

 +%ifidn %1, triangular
 +cvtdq2ps  m2, [dither1q+2*lenq]
 +cvtdq2ps  m3, [dither1q+2*lenq+mmsize]
 +addps m0, m2
 +addps m1, m3

Can you add before converting, or does that overflow?

 +%endif
 +mulps m2, m5, [srcq+2*lenq]
 +mulps m3, m5, [srcq+2*lenq+mmsize]
 +fmaddps   m0, m0, m4, m2, m3
 +fmaddps   m1, m1, m4, m3, m2
 +cvtps2dq  m0, m0
 +cvtps2dq  m1, m1
 +packssdw  m0, m1
 +mova [dstq+lenq], m0
 +add lenq, mmsize
 +jl .loop
 +REP_RET
 +%endmacro

--Loren Merritt
___
libav-devel mailing list
libav-devel@libav.org
https://lists.libav.org/mailman/listinfo/libav-devel


Re: [libav-devel] [PATCH 2/4] lavr: x86: add SSE2 and FMA4 quantization for non-filtering dither methods

2012-12-16 Thread Loren Merritt
On Sun, 16 Dec 2012, Justin Ruggles wrote:
 On 12/16/2012 06:27 AM, Loren Merritt wrote:

 Do you have to store dither in a format different than the asm works with?

 That's the whole point of these functions. av_get_lfg() returns an
 unsigned int, so instead of converting each value to float and scaling
 individually, we can use SSE2 to do the conversion/scaling of multiple
 values along with adding the noise to the samples.

 For the triangular dither with high-pass it actually does convert/scale
 the dither noise separately, filters it, then adds it into the samples.
 But for the dither methods that don't require filtering, we can do it
 all at once.

Why is av_get_lfg in the inner loop? Is there an audible difference
between that and repeating a constant array?

--Loren Merritt
___
libav-devel mailing list
libav-devel@libav.org
https://lists.libav.org/mailman/listinfo/libav-devel


Re: [libav-devel] [PATCH 6/7] x86: af_volume: add SSE2/SSSE3/AVX-optimized s32 volume scaling

2012-12-04 Thread Loren Merritt
On Mon, 3 Dec 2012, Justin Ruggles wrote:
 On 12/03/2012 04:19 PM, Loren Merritt wrote:
 On Sun, 2 Dec 2012, Justin Ruggles wrote:

 +; NOTE: This is not bit-identical with the C version because it clips to
 +;   [-INT_MAX, INT_MAX] instead of [INT_MIN, INT_MAX]
 +
 +INIT_XMM ssse3

 INIT_XMM ssse3, atom

 Why is that needed? We don't have 2 different ssse3 versions of that
 function, and that cpuflag isn't used in the function itself.

Not really needed, but I thought it was more aesthetic to have the
function name reflect what it's optimized for. Normally I expect foo_ssse3
to be an improvemenet over foo_sse2.

--Loren Merritt
___
libav-devel mailing list
libav-devel@libav.org
https://lists.libav.org/mailman/listinfo/libav-devel


Re: [libav-devel] [PATCH 6/7] x86: af_volume: add SSE2/SSSE3/AVX-optimized s32 volume scaling

2012-12-03 Thread Loren Merritt
On Sun, 2 Dec 2012, Justin Ruggles wrote:

 +; NOTE: This is not bit-identical with the C version because it clips to
 +;   [-INT_MAX, INT_MAX] instead of [INT_MIN, INT_MAX]
 +
 +INIT_XMM ssse3

INIT_XMM ssse3, atom

 +cglobal scale_samples_s32, 4,4,8, dst, src, len, volume
 +movdm4, volumem
 +pshufd  m4, m4, 0
 +movam5, [pq_128]
 +pxorm6, m6
 +lea   lenq, [lend*4-mmsize]
 +.loop:
 +; src[i] = av_clipl_int32((src[i] * volume + 128)  8);
 +movam7, [srcq+lenq]
 +pabsd   m3, m7
 +pshufd  m0, m3, q0100
 +pshufd  m1, m3, q0302
 +pmuludq m0, m4
 +pmuludq m1, m4
 +paddq   m0, m5
 +paddq   m1, m5
 +psrlq   m0, 7
 +psrlq   m1, 7
 +shufps  m2, m0, m1, q3131
 +shufps  m0, m0, m1, q2020
 +pcmpgtd m2, m6
 +por m0, m2
 +psrld   m0, 1
 +psignd  m0, m7
 +mova  [dstq+lenq], m0
 +sub   lenq, mmsize
 +jge .loop
 +REP_RET

 --- a/libavutil/x86/x86inc.asm
 +++ b/libavutil/x86/x86inc.asm
 @@ -957,6 +957,7 @@ AVX_INSTR cmpsd, 1, 0, 0
  AVX_INSTR cmpss, 1, 0, 0
  AVX_INSTR cvtdq2ps, 1, 0, 0
  AVX_INSTR cvtps2dq, 1, 0, 0
 +AVX_INSTR cvtpd2dq, 1, 0, 0
  AVX_INSTR divpd, 1, 0, 0
  AVX_INSTR divps, 1, 0, 0
  AVX_INSTR divsd, 1, 0, 0

Alphabetical order

--Loren Merritt
___
libav-devel mailing list
libav-devel@libav.org
https://lists.libav.org/mailman/listinfo/libav-devel


Re: [libav-devel] [PATCH 10/10] SBR DSP x86: implement SSE hf_apply_noise

2012-12-01 Thread Loren Merritt
On Sat, 1 Dec 2012, Christophe Gisquet wrote:
 2012/11/30 Loren Merritt lor...@u.washington.edu:

 +cglobal sbr_hf_apply_noise_0, 4,5,8, Y,s_m,q_filt,noise,kx,m_max
 +  mova   m0, [ps_noise0]
 +  mov   r4d, m_maxm
 +  call  hf_apply_noise_main
 +  RET

 TAIL_CALL hf_apply_noise_main, 1

 Which makes me think that every caller should have the same epilog
 (same stack offset etc). Is there a way I just do a jmp here and let
 the jumpee do the epilog.

Move the callee after the callers in sourcecode order, and make it a
plain label, not a cglobal. Then it doesn't reset the registers in use, so
RET does the same thing it would have in the previous function. Also don't
have to repeat DECLARE_ARGS that way. One of the callers can fallthrough
rather than jump.

 Another thing I'm wondering (can't make sure for the next 4 days):
 mov   r4d, m_maxm
 If I'm not mistaken, m_max should already be in r5 for linux
 x86_64/amd64 ABI (whatever I should call it).
 So I could save that mov and have instead hf_apply_noise_main use r5
 under that condition.

Yes. But probably negligible.

--Loren Merritt
___
libav-devel mailing list
libav-devel@libav.org
https://lists.libav.org/mailman/listinfo/libav-devel


Re: [libav-devel] [PATCH 01/10] SBR DSP x86: implement SSE sbr_hf_gen

2012-11-30 Thread Loren Merritt
On Fri, 30 Nov 2012, Christophe Gisquet wrote:

 +movam0, [X_lowq + start]
 +movlhps m1, m1 ; (a2 a3 a2 a3)
 +movlhps m2, m2 ; (a0 a1 a0 a1)
 +shufps  m3, m3, 00010001b  ; (a3 a2 a3 a2)
 +shufps  m4, m4, 00010001b  ; (a1 a0 a1 a0)
 +xorps   m3, m7 ; (-a3 a2 -a3 a2)
 +xorps   m4, m7 ; (-a1 a0 -a1 a0)
 +.loop2:
 +movam5, m0
 +movam6, m0
 +shufps  m0, m0, 1010b ; {Xl[-2][0],,Xl[-1][0],}
 +shufps  m5, m5, 0101b ; {Xl[-2][1],,Xl[-1][1],}
 +mulps   m0, m2
 +mulps   m5, m4
 +movam7, m6
 +addps   m5, m0
 +movam0, [X_lowq + start + 2*2*4]
 +shufps  m6, m0, 1010b ; {Xl[-1][0],,Xl[0][0],}
 +shufps  m7, m0, 0101b ; {Xl[-1][1],,Xl[1][1],}

Recommend using base-4 for shuffle constants.

--Loren Merritt
___
libav-devel mailing list
libav-devel@libav.org
https://lists.libav.org/mailman/listinfo/libav-devel


Re: [libav-devel] [PATCH 02/10] SBR DSP x86: implement SSE sum64x5

2012-11-30 Thread Loren Merritt
On Fri, 30 Nov 2012, Christophe Gisquet wrote:

 698 to 174 cycles on penrynn. Unrolling is a 6 cycles gain.

 ---
  libavcodec/x86/sbrdsp.asm|   22 ++
  libavcodec/x86/sbrdsp_init.c |2 ++
  2 files changed, 24 insertions(+), 0 deletions(-)

LGTM.

--Loren Merritt
___
libav-devel mailing list
libav-devel@libav.org
https://lists.libav.org/mailman/listinfo/libav-devel


Re: [libav-devel] [PATCH 03/10] SBR DSP x86: implement SSE qmf_post_shuffle

2012-11-30 Thread Loren Merritt
On Fri, 30 Nov 2012, Christophe Gisquet wrote:

 +cglobal sbr_qmf_post_shuffle, 2,3,3,W,z
 +  lea   r2q, [zq + (64-4)*4]
 +.loop:
 +  mova   m0, [r2q]
 +  mova   m1, [zq ]
 +  xorps  m0, [ps_neg]
 +  shufps m0, m0, 0x1B
 +  mova   m2, m0
 +  unpcklps   m0, m1
 +  unpckhps   m2, m1
 +  mova  [Wq +  0], m0
 +  mova  [Wq + 16], m2
 +  addWq, 32
 +  sub   r2q, 16
 +  addzq, 16
 +  cmpzq, r2q
 +  jl  .loop
 +  REP_RET

If you increment an index into W and z rather than the pointers
themselves, then you can eliminate an add and a cmp.

4 space tabs.

--Loren Merritt
___
libav-devel mailing list
libav-devel@libav.org
https://lists.libav.org/mailman/listinfo/libav-devel


Re: [libav-devel] [PATCH 10/10] SBR DSP x86: implement SSE hf_apply_noise

2012-11-30 Thread Loren Merritt
On Fri, 30 Nov 2012, Christophe Gisquet wrote:

 497 to 253 cycles under Win64.

cpu is more relevant than os.

 +; r0q=Y   r1q=s_m   r2q=q_filt   r3q=noise  r4q=max_m
 +cglobal hf_apply_noise_main

You can invoke DEFINE_ARGS even if not generating a prologue.

 +  dec   r3q
 +  shl   r4q, 2
 +  lea   r0q, [r0q + 2*r4q]
 +  add   r1q, r4q
 +  add   r2q, r4q
 +  shl   r3q, 3
 +  xorps  m5, m5
 +  neg   r4q
 +.loop:
 +  add   r3q, 16
 +  and   r3q, 0x1ff<<3
 +  movh   m1, [r2q + r4q]
 +  movu   m3, [r3q + sbr_noise_table]
 +  movh   m2, [r2q + r4q + 8]
 +  add   r3q, 16
 +  and   r3q, 0x1ff<<3
 +  movu   m4, [r3q + sbr_noise_table]
 +  unpcklps   m1, m1
 +  unpcklps   m2, m2
 +  mulps  m1, m3 ; m2 = q_filt[m] * ff_sbr_noise_table[noise]
 +  mulps  m2, m4 ; m2 = q_filt[m] * ff_sbr_noise_table[noise]
 +  movh   m3, [r1q + r4q]
 +  movh   m4, [r1q + r4q + 8]

Can these be a single aligned load?

 +  unpcklps   m3, m3
 +  unpcklps   m4, m4
 +  mova   m6, m3
 +  mova   m7, m4
 +  mulps  m3, m0 ; s_m[m] * phi_sign
 +  mulps  m4, m0 ; s_m[m] * phi_sign
 +  cmpps  m6, m5, 0 ; m1 == 0
 +  cmpps  m7, m5, 0 ; m1 == 0

You mean m7 == 0?

 +  andps  m1, m6
 +  andps  m2, m7
 +  movu   m6, [r0q + 2*r4q]
 +  movu   m7, [r0q + 2*r4q + 16]
 +  addps  m6, m1
 +  addps  m7, m2
 +  addps  m6, m3
 +  addps  m7, m4
 +  movu[r0q + 2*r4q], m6
 +  movu[r0q + 2*r4q + 16], m7
 +  add   r4q, 16
 +  jl  .loop
 +  ret
 +
 +; sbr_hf_apply_noise_0(float (*Y)[2], const float *s_m,
 +;  const float *q_filt, int noise,
 +;  int kx, int m_max)
 +cglobal sbr_hf_apply_noise_0, 4,5,8, Y,s_m,q_filt,noise,kx,m_max
 +  mova   m0, [ps_noise0]
 +  mov   r4d, m_maxm
 +  call  hf_apply_noise_main
 +  RET

TAIL_CALL hf_apply_noise_main, 1

--Loren Merritt
___
libav-devel mailing list
libav-devel@libav.org
https://lists.libav.org/mailman/listinfo/libav-devel


Re: [libav-devel] [PATCH 1/1] h264: reset has_b_frames after enabling low_delay from SPS

2012-11-21 Thread Loren Merritt
On Wed, 21 Nov 2012, Janne Grunau wrote:
 On 2012-11-16 18:14:29 -0800, Ronald S. Bultje wrote:
  On Fri, Nov 16, 2012 at 8:43 AM, Janne Grunau janne-li...@jannau.netwrote:
 
   Fixes a crash in fuzzed file nasa-8s2.ts_s20033 caused by a too large
   has_b_frames value. low_delay keeps getting re-enabled from the
   presumably broken SPS.
   ---
libavcodec/h264.c | 4 +++-
1 file changed, 3 insertions(+), 1 deletion(-)
  
   diff --git a/libavcodec/h264.c b/libavcodec/h264.c
   index c30c478..7935fe6 100644
   --- a/libavcodec/h264.c
   +++ b/libavcodec/h264.c
   @@ -2346,8 +2346,10 @@ static int h264_set_parameter_from_sps(H264Context
   *h)
  
 if (s->flags & CODEC_FLAG_LOW_DELAY ||
 (h->sps.bitstream_restriction_flag &&
    - !h->sps.num_reorder_frames))
    + !h->sps.num_reorder_frames)) {
 s->low_delay = 1;
    +s->avctx->has_b_frames = 0;
    +}
  
 
  So I'm going to have to wonder what happens to the delayed frames already
  cached in h->delayed_pics[]? Are they still output?

 yes, delayed pictures are returned as long as h->delayed_pics[0] is not
 NULL.

And when you return something from delayed_pics[] instead of the new
frame that just got decoded, the new frame gets delayed, so that you
don't ever actually switch to low delay mode?

--Loren Merritt
___
libav-devel mailing list
libav-devel@libav.org
https://lists.libav.org/mailman/listinfo/libav-devel


Re: [libav-devel] [PATCH 35/45] x86: PSIGNW: port to cpuflags

2012-11-20 Thread Loren Merritt
On Sun, 18 Nov 2012, Justin Ruggles wrote:

 On 07/31/2012 06:17 PM, Diego Biurrun wrote:
  ---
   libavutil/x86/x86util.asm |   10 +-
   1 files changed, 5 insertions(+), 5 deletions(-)
 
  diff --git a/libavutil/x86/x86util.asm b/libavutil/x86/x86util.asm
  index b844131..db8899b 100644
  --- a/libavutil/x86/x86util.asm
  +++ b/libavutil/x86/x86util.asm
  @@ -160,13 +160,13 @@
   %endif
   %endmacro
 
  -%macro PSIGNW_MMX 2
  +%macro PSIGNW 2
  +%if cpuflag(ssse3)
  +psignw %1, %2
  +%else

 %elif mmsize == 8 || cpuflag(sse2)

   pxor   %1, %2
   psubw  %1, %2
  -%endmacro
  -
  -%macro PSIGNW_SSSE3 2
  -psignw %1, %2
  +%endif

 %else
 %error PSIGNW with XMM requires SSE2
 %endif

Do you have a use-case for this error message? None of our other macros
have one, because I assume that you just don't try to instantiate sse1
versions of functions that do integer simd.

--Loren Merritt
___
libav-devel mailing list
libav-devel@libav.org
https://lists.libav.org/mailman/listinfo/libav-devel


Re: [libav-devel] [RFC PATCH 1/2] h264: set parameters from SPS whenever it changes

2012-11-17 Thread Loren Merritt
On Fri, 16 Nov 2012, Ronald S. Bultje wrote:

 So ... I originally added code to not support changing bitdepth at all,
 basically to prevent this and alike issues. E.g., what if the reference is
 8bit but the next frame is 10bit?

The standard allows SPS to change only at IDR-frames, i.e. where there
aren't any old references remaining accessible.

--Loren Merritt
___
libav-devel mailing list
libav-devel@libav.org
https://lists.libav.org/mailman/listinfo/libav-devel


Re: [libav-devel] [PATCH] x86: dsputil: port to cpuflags

2012-11-15 Thread Loren Merritt
On Wed, 14 Nov 2012, Diego Biurrun wrote:

 ---
 Now addressing the points from Loren's review and more refactoring
 around bswap_buf.

  libavcodec/x86/dsputil.asm   |  220 
 --
  libavcodec/x86/dsputil_mmx.c |   26 +++---
  2 files changed, 119 insertions(+), 127 deletions(-)

LGTM.

--Loren Merritt
___
libav-devel mailing list
libav-devel@libav.org
https://lists.libav.org/mailman/listinfo/libav-devel


Re: [libav-devel] [PATCH 25/45] x86: h264_weight_10bit: port to cpuflags

2012-11-13 Thread Loren Merritt
On Wed, 1 Aug 2012, Diego Biurrun wrote:

  libavcodec/x86/h264_weight_10bit.asm |  132 
 ++
  1 files changed, 69 insertions(+), 63 deletions(-)

LGTM.

--Loren Merritt
___
libav-devel mailing list
libav-devel@libav.org
https://lists.libav.org/mailman/listinfo/libav-devel


Re: [libav-devel] [PATCH 12/45] x86: dsputil_yasm: port to cpuflags

2012-11-13 Thread Loren Merritt
On Wed, 1 Aug 2012, Diego Biurrun wrote:

 -%macro APPLY_WINDOW_INT16 3 ; %1=instruction set, %2=mmxext/sse2 bit exact 
 version, %3=has_ssse3
 -cglobal apply_window_int16_%1, 4,5,6, output, input, window, offset, offset2
 +%macro APPLY_WINDOW_INT16 1 ; %1=mmxext/sse2 bitexact version

Comment is no longer true, you changed it to all of the bitexact versions.

 +%if %1
 +cglobal apply_window_int16, 4, 5, 6, output, input, window, offset, offset2
 +%else
 +cglobal apply_window_int16_round, 4, 5, 6, output, input, window, offset, 
 offset2
 +%endif
  lea offset2q, [offsetq-mmsize]
 -%if %2
 -mova  m5, [pd_16384]
 -%elifidn %1, ssse3
 +%if cpuflag(ssse3)
  mova  m5, [pb_revwords]
  ALIGN 16
 +%elif %1
 +mova  m5, [pd_16384]
  %endif

Atom version uses neither of these constants.

 @@ -426,17 +431,9 @@ cglobal add_hfyu_median_prediction_mmxext, 6,6,0, dst, 
 top, diff, w, left, left_
  %endmacro

  ; int add_hfyu_left_prediction(uint8_t *dst, const uint8_t *src, int w, int 
 left)
 -INIT_MMX
 -cglobal add_hfyu_left_prediction_ssse3, 3,3,7, dst, src, w, left
 -.skip_prologue:
 -movam5, [pb_7]
 -movam4, [pb_]
 -movam3, [pb_zz11zz55zz99zzdd]
 -movdm0, leftm
 -psllq   m0, 56
 -ADD_HFYU_LEFT_LOOP 1
 -
 -INIT_XMM
 +%macro ADD_HFYU_LEFT_PREDICTION 0
 +cglobal add_hfyu_left_prediction, 3,3,7, dst, src, w, left
 +%if cpuflag(sse4)
  cglobal add_hfyu_left_prediction_sse4, 3,3,7, dst, src, w, left
  movam5, [pb_f]
  movam6, [pb_]
 @@ -451,10 +448,25 @@ cglobal add_hfyu_left_prediction_sse4, 3,3,7, dst, src, 
 w, left
  ADD_HFYU_LEFT_LOOP 1
  .unaligned:
  ADD_HFYU_LEFT_LOOP 0
 +%else
 +.skip_prologue:
 +movam5, [pb_7]
 +movam4, [pb_]
 +movam3, [pb_zz11zz55zz99zzdd]
 +movdm0, leftm
 +psllq   m0, 56
 +ADD_HFYU_LEFT_LOOP 1
 +%endif
 +%endmacro

 +INIT_MMX ssse3
 +ADD_HFYU_LEFT_PREDICTION
 +INIT_XMM sse4
 +ADD_HFYU_LEFT_PREDICTION

There's no code being shared here, why add a superfluous level of
indirection? Likewise with bswap_buf.

--Loren Merritt
___
libav-devel mailing list
libav-devel@libav.org
https://lists.libav.org/mailman/listinfo/libav-devel


Re: [libav-devel] [PATCH 24/45] x86: h264_qpel_10bit: drop unused parameter from MC10/MC20/MC30 macros

2012-11-10 Thread Loren Merritt
On Fri, 9 Nov 2012, Diego Biurrun wrote:
 On Wed, Aug 01, 2012 at 12:17:48AM +0200, Diego Biurrun wrote:
  ---
   libavcodec/x86/h264_qpel_10bit.asm |   10 +-
   1 files changed, 5 insertions(+), 5 deletions(-)

 .. ping .. (this is rather simple and trivial)

LGTM.

--Loren Merritt
___
libav-devel mailing list
libav-devel@libav.org
https://lists.libav.org/mailman/listinfo/libav-devel


Re: [libav-devel] [PATCH 08/10] x86: af_volume: add SSE2/SSSE3/AVX-optimized s32 volume scaling

2012-11-09 Thread Loren Merritt
On Fri, 9 Nov 2012, Christophe Gisquet wrote:
 2012/11/8 Loren Merritt lor...@u.washington.edu:

  This code shouldn't be latency-bound anyway. Or if it is, all you need is
  more unrolling.

 But you do notice the discrepancy in timings? Atom's SSSE3 version is
 twice as fast as SSE2, but SB SSSE3 is slower than SSE2. Using
 opannotate may help lay the blame if that is the issue.

You mean Atom's SSE2 is twice as slow as SSSE3. Of course there's a
discrepancy when mulpd is 1 uop on SB and 9 cycles on Atom; and similarly
for all the other double instructions.

--Loren Merritt
___
libav-devel mailing list
libav-devel@libav.org
https://lists.libav.org/mailman/listinfo/libav-devel


Re: [libav-devel] [PATCH] x86: h264_qpel_10bit: port to cpuflags

2012-11-09 Thread Loren Merritt
On Thu, 8 Nov 2012, Diego Biurrun wrote:

 ---
 This version incorporates the changes suggested by Loren.

  libavcodec/x86/h264_qpel_10bit.asm |  314 
 ++--
  1 files changed, 155 insertions(+), 159 deletions(-)

LGTM.

--Loren Merritt
___
libav-devel mailing list
libav-devel@libav.org
https://lists.libav.org/mailman/listinfo/libav-devel


Re: [libav-devel] [PATCH 08/10] x86: af_volume: add SSE2/SSSE3/AVX-optimized s32 volume scaling

2012-11-08 Thread Loren Merritt
On Wed, 7 Nov 2012, Christophe Gisquet wrote:

 err, mail sent before I finished editing it...

 2012/11/7 Christophe Gisquet christophe.gisq...@gmail.com:
  Nehalem

 Nehalem has 2 cycles delay, SB 0 or 1, Atom 0 when mixing such instruction.

 So that mix of instructions may cause issues.

This code shouldn't be latency-bound anyway. Or if it is, all you need is
more unrolling.

--Loren Merritt
___
libav-devel mailing list
libav-devel@libav.org
https://lists.libav.org/mailman/listinfo/libav-devel


Re: [libav-devel] [PATCH 23/45] x86: h264_qpel_10bit: port to cpuflags

2012-11-08 Thread Loren Merritt
On Wed, 1 Aug 2012, Diego Biurrun wrote:

 -%macro MCAxA 8
 +%macro MCAxA 7
  %if ARCH_X86_64
 -%ifnidn %1,mmxext
 -MCAxA_OP %1,%2,%3,%4,%5,%6,%7,%8
 +%ifnidn SUFFIX, mmxext
 +MCAxA_OP %1, %2, %3, %4, %5, %6, %7
  %endif
  %else
 -MCAxA_OP %1,%2,%3,%4,%5,%6,%7,%8
 +MCAxA_OP %1, %2, %3, %4, %5, %6, %7
  %endif
  %endmacro

%macro MCAxA 7
%if ARCH_X86_32 || cpuflag(sse2)
MCAxA_OP %1, %2, %3, %4, %5, %6, %7
%endif
%endmacro

and could be inlined into cglobal_mc.

 -%macro MCAxA_OP 8
 +%macro MCAxA_OP 7
  %if ARCH_X86_32
 -cglobal %2_h264_qpel%5_%3_10_%1, %6,%7,%8
 -call stub_%2_h264_qpel%4_%3_10_%1
 +cglobal %1_h264_qpel%4_%2_10, %5, %6, %7
 +call stub_%1_h264_qpel%3_%2_10 %+ cpuname
  mov  r0, r0m
  mov  r1, r1m
 -add  r0, %4*2
 +add  r0, %3*2
  add  r1, %4*2
 -call stub_%2_h264_qpel%4_%3_10_%1
 +call stub_%1_h264_qpel%3_%2_10 %+ cpuname
  mov  r0, r0m
  mov  r1, r1m
 -lea  r0, [r0+r2*%4]
 -lea  r1, [r1+r2*%4]
 -call stub_%2_h264_qpel%4_%3_10_%1
 +lea  r0, [r0+r2*%3]
 +lea  r1, [r1+r2*%3]
 +call stub_%1_h264_qpel%3_%2_10 %+ cpuname
  mov  r0, r0m
  mov  r1, r1m
 -lea  r0, [r0+r2*%4+%4*2]
 -lea  r1, [r1+r2*%4+%4*2]
 -call stub_%2_h264_qpel%4_%3_10_%1
 +lea  r0, [r0+r2*%3+%3*2]
 +lea  r1, [r1+r2*%3+%3*2]
 +call stub_%1_h264_qpel%3_%2_10 %+ cpuname
  RET

All your uses of cpuname should instead be SUFFIX.

--Loren Merritt
___
libav-devel mailing list
libav-devel@libav.org
https://lists.libav.org/mailman/listinfo/libav-devel


Re: [libav-devel] [PATCH 6/6] x86: af_volume: add SSSE3/AVX-optimized s32 volume scaling

2012-11-06 Thread Loren Merritt
On Mon, 5 Nov 2012, Justin Ruggles wrote:

 Yeah, it works for SSE2 and AVX. Unfortunately it's not too fast. Not
 sure how I should handle this in terms of checking flags to determine
 when to use the SSE2 version. I suppose it depends on how well it
 performs on Intel cpus that have SSE2 but not SSSE3.

 Athlon64:
 C- 6267
 SSE2 - 4332

 Sandy Bridge:
 C- 4001
 SSE2 - 4560
 SSSE3- 2033
 AVX (int)- 1937
 AVX (double) - 44619

 Here is the code I tested for SSE2 and AVX:

 INIT_XMM sse2
 cglobal scale_samples_s32, 4,4,8, dst, src, len, volume
 movdm2, volumem
 pshufd  m2, m2, 0
 cvtdq2pdm2, m2
 mulpd   m2, [pd_1_256]
 movam3, [pd_int32_max]
 lea   lenq, [lend*4-mmsize]
 .loop:
 movam0, [srcq+lenq]
 movhlps m1, m0
 cvtdq2pdm0, m0
 cvtdq2pdm1, m1
 mulpd   m0, m2
 mulpd   m1, m2
 minpd   m0, m3
 minpd   m1, m3
 cvtpd2dqm0, m0
 cvtpd2dqm1, m1
 punpcklqdq  m0, m1
 mova [dstq+lenq], m0
 sub   lenq, mmsize
 jge .loop
 REP_RET

 INIT_XMM avx
 cglobal scale_samples_s32, 4,4,8, dst, src, len, volume
 movd  xmm1, volumem
 pshufdxmm1, xmm1, 0
 vcvtdq2pd ymm1, xmm1
 vmulpdymm1, ymm1, [pd_1_256]
 vmovapd   ymm2, [pd_int32_max]
 lea   lenq, [lend*4-mmsize]
 .loop:
 vcvtdq2pd ymm0, [srcq+lenq]
 vmulpdymm0, ymm0, ymm1
 vminpdymm0, ymm0, ymm2
 vcvtpd2dq xmm0, ymm0
 mova [dstq+lenq], xmm0

vmovdqa.
You can't mix ymm instructions with xmm instructions that don't begin
with a v.
Or rather, you can because intel didn't even have the decency to make that
combination crash so that you'd notice it; instead it's very slow.

 sub   lenq, mmsize
 jge .loop
 vzeroupper
 RET

--Loren Merritt
___
libav-devel mailing list
libav-devel@libav.org
https://lists.libav.org/mailman/listinfo/libav-devel


Re: [libav-devel] [PATCH] x86inc: Set program_name and cpuflags_3dnowext outside of x86inc.asm

2012-11-05 Thread Loren Merritt
On Sat, 3 Nov 2012, Diego Biurrun wrote:

 On Sat, Nov 03, 2012 at 06:13:31AM +, Loren Merritt wrote:
 On Fri, 2 Nov 2012, Diego Biurrun wrote:

 This reduces the local difference to the x264 upstream version.
 ---

 Now also sets cpuflags_3dnowext locally, further reducing the difference.

 x264 doesn't use cpuflags_3dnow* at all, it only exists because it's in
 libav.

 I was about to send you some x264 patches to sync those files further.
 Do you want me to drop the 3dnow cpuflags on the x264 side while at it
 now that we have a means of setting them on the libav side w/o patching
 x86inc.asm?

No preference. Just saying it should be either synched or dropped.

--Loren Merritt
___
libav-devel mailing list
libav-devel@libav.org
https://lists.libav.org/mailman/listinfo/libav-devel


Re: [libav-devel] [PATCH] x86inc: Set program_name and cpuflags_3dnowext outside of x86inc.asm

2012-11-03 Thread Loren Merritt
On Fri, 2 Nov 2012, Diego Biurrun wrote:

 This reduces the local difference to the x264 upstream version.
 ---

 Now also sets cpuflags_3dnowext locally, further reducing the difference.

x264 doesn't use cpuflags_3dnow* at all, it only exists because it's in
libav.

--Loren Merritt
___
libav-devel mailing list
libav-devel@libav.org
https://lists.libav.org/mailman/listinfo/libav-devel


Re: [libav-devel] [PATCH] x86inc: Add cpuflags_mmxext alias for cpuflags_mmx2

2012-10-30 Thread Loren Merritt
On Mon, 29 Oct 2012, Ronald S. Bultje wrote:
 On Mon, Oct 29, 2012 at 5:15 PM, Diego Biurrun di...@biurrun.de wrote:

 This allows using mmxext as name in Libav while staying compatible
 with changes to the YASM macro infrastructure imported from x264.
 ---
  libavutil/x86/x86inc.asm |1 +
  1 files changed, 1 insertions(+), 0 deletions(-)

 diff --git a/libavutil/x86/x86inc.asm b/libavutil/x86/x86inc.asm
 index 1fe9f55..2655154 100644
 --- a/libavutil/x86/x86inc.asm
 +++ b/libavutil/x86/x86inc.asm
 @@ -35,6 +35,7 @@
  ; to x264-de...@videolan.org .

  %define program_name ff
 +%define cpuflags_mmxext cpuflags_mmx2

 Any such a change to x86inc.asm is OK if and only if x264 accepts this
 change upstream. Loren?

I'll accept the addition of the synonym.

(This doesn't change my objections to the name, I'm just allowing an inert
line of code to live in the x264 repo if that simplifies merging.)

--Loren Merritt
___
libav-devel mailing list
libav-devel@libav.org
https://lists.libav.org/mailman/listinfo/libav-devel


Re: [libav-devel] [PATCH 6/6] x86: af_volume: add SSSE3/AVX-optimized s32 volume scaling

2012-10-02 Thread Loren Merritt
On Sat, 29 Sep 2012, Justin Ruggles wrote:

 +cglobal scale_samples_s32, 4,4,8, dst, src, len, volume
 +movdm4, volumem
 +pshufd  m4, m4, 0
 +movam5, [pq_128]
 +pxorm6, m6
 +lea   lenq, [lend*4-mmsize]
 +.loop:
 +; src[i] = av_clipl_int32((src[i] * volume + 128) >> 8);
 +movam7, [srcq+lenq]
 +pabsd   m3, m7
 +pshufd  m0, m3, q0100
 +pshufd  m1, m3, q0302
 +pmuludq m0, m4
 +pmuludq m1, m4
 +paddq   m0, m5
 +paddq   m1, m5
 +psrlq   m0, 7
 +psrlq   m1, 7
 +shufps  m2, m0, m1, q3131
 +shufps  m0, m0, m1, q2020
 +pcmpgtd m2, m6
 +por m0, m2
 +psrld   m0, 1
 +psignd  m0, m7
 +mova  [dstq+lenq], m0
 +sub   lenq, mmsize
 +jge .loop
 +REP_RET
 +%endmacro

.loop:
vcvtdq2pd ymm0, [srcq+lenq]
vmulpdymm0, ymm1
vminpdymm0, ymm2 ; no max needed, since underflows convert to INT_MIN 
anyway
vcvtpd2dq xmm0, ymm0
vmovdqa [dstq+lenq], xmm0
sub   lenq, mmsize
jge .loop

Should also work for sse2; I don't know if that'll be faster than ssse3 int.

--Loren Merritt
___
libav-devel mailing list
libav-devel@libav.org
https://lists.libav.org/mailman/listinfo/libav-devel


Re: [libav-devel] [PATCH 6/8] x86: af_volume: add SSE2-optimized s16 volume scaling

2012-09-28 Thread Loren Merritt
On Fri, 28 Sep 2012, Justin Ruggles wrote:

 --- /dev/null
 +++ b/libavfilter/x86/af_volume_init.c

 +void ff_volume_init_x86(VolumeContext *vol)
 +{
 +int mm_flags = av_get_cpu_flags();
 +enum AVSampleFormat sample_fmt = 
 av_get_packed_sample_fmt(vol->sample_fmt);
 +
 +if (sample_fmt == AV_SAMPLE_FMT_S16) {
 +if (EXTERNAL_SSE2(mm_flags) && vol->volume_i < 65536) {

32768

 +vol->scale_samples_int = ff_scale_samples_s16_sse2;
 +vol-samples_align = 8;
 +}
 +}
 +}

--Loren Merritt
___
libav-devel mailing list
libav-devel@libav.org
https://lists.libav.org/mailman/listinfo/libav-devel


Re: [libav-devel] [PATCH 7/9] x86: af_volume: add SSE4-optimized s16 volume scaling

2012-09-25 Thread Loren Merritt
On Tue, 25 Sep 2012, Justin Ruggles wrote:

 +;--
 +; void ff_scale_samples_s16(VolumeContext *ctx, uint8_t *src, int len,
 +;   int volume)
 +;--
 +
 +INIT_XMM sse4
 +cglobal scale_samples_s16, 4,4,4, ctx, src, len, volume
 +movdm0, volumed
 +pshufd  m0, m0, 0
 +movam1, [pd_128]
 +lea   lenq, [lend*2-mmsize]
 +.loop:
 +; src[i] = av_clip_int16((src[i] * volume + 128) >> 8);
 +movam2, [srcq+lenq]
 +pmovsxwdm3, m2
 +psrldq  m2, 8
 +pmovsxwdm2, m2
 +pmulld  m2, m0
 +pmulld  m3, m0
 +paddd   m2, m1
 +paddd   m3, m1
 +psrad   m2, 8
 +psrad   m3, 8
 +packssdwm3, m2
 +mova  [srcq+lenq], m3
 +sublenq, mmsize
 +jge .loop
 +REP_RET

INIT_XMM sse2
cglobal scale_samples_s16, 3,3,4, ctx, src, len, volume
movdm0, volumem
pshuflw m0, m0, 0
punpcklwd   m0, [pw_1]
movam1, [pw_128]
lea   lenq, [lend*2-mmsize]
.loop:
; src[i] = av_clip_int16((src[i] * volume + 128) >> 8);
movam2, [srcq+lenq]
punpcklwd   m3, m2, m1
punpckhwd   m2, m1
pmaddwd m3, m0
pmaddwd m2, m0
psrad   m3, 8
psrad   m2, 8
packssdwm3, m2
mova  [srcq+lenq], m3
sublenq, mmsize
jge .loop
REP_RET

(untested)

--Loren Merritt
___
libav-devel mailing list
libav-devel@libav.org
https://lists.libav.org/mailman/listinfo/libav-devel


Re: [libav-devel] [PATCH 9/9] x86: af_volume: add SSSE3/AVX-optimized s32 volume scaling

2012-09-25 Thread Loren Merritt
On Tue, 25 Sep 2012, Justin Ruggles wrote:

  SECTION_RODATA 32

  pd_128: times 4 dd 128
 +pq_128: times 2 dq 128
 +pq_maxsq: times 2 dq  549755813632 ; INT_MAX  8
 +pb_packqd_rsh8: db 1, 2, 3, 4, 9, 10, 11, 12, -1, -1, -1, -1, -1, -1, -1, -1

  SECTION_TEXT

 @@ -55,3 +58,58 @@ cglobal scale_samples_s16, 4,4,4, ctx, src, len, volume
  sublenq, mmsize
  jge .loop
  REP_RET
 +
 +;--
 +; void ff_scale_samples_s32(VolumeContext *ctx, uint8_t *src, int len,
 +;   int volume)
 +;--
 +
 +; NOTE: This is not bit-identical with the C version because it clips to
 +;   [-INT_MAX, INT_MAX] instead of [INT_MIN, INT_MAX]
 +
 +%macro PMINSQ 3 ; dst, src, tmp
 +pcmpgtq   %3, %2, %1
 +pxor  %1, %2
 +pand  %1, %3
 +pxor  %1, %2
 +%endmacro
 +
 +%macro SCALE_SAMPLES_S32 0
 +cglobal scale_samples_s32, 4,4,8, ctx, src, len, volume
 +movdm0, volumed
 +pshufd  m0, m0, 0
 +movam3, [pq_128]
 +movam4, [pq_maxsq]
 +movam5, [pb_packqd_rsh8]
 +lea   lenq, [lend*4-mmsize]
 +.loop:
 +; src[i] = av_clipl_int32((src[i] * volume + 128) >> 8);
 +movam1, [srcq+lenq]
 +movam6, m1
 +movam2, m1
 +punpckhdq   m2, m1, m1
 +punpckldq   m1, m1, m1
 +pabsd   m1, m1
 +pabsd   m2, m2
 +pmuludq m1, m0
 +pmuludq m2, m0
 +paddq   m1, m3
 +paddq   m2, m3
 +PMINSQ  m1, m4, m7
 +PMINSQ  m2, m4, m7
 +pshufb  m1, m5
 +pshufb  m2, m5
 +punpcklqdq  m1, m2
 +psignd  m1, m6
 +mova  [srcq+lenq], m1
 +sub   lenq, mmsize
 +jge .loop
 +REP_RET
 +%endmacro

cglobal scale_samples_s32, 3,3,7, ctx, src, len, volume
movdm4, volumem
pshufd  m4, m4, 0
movam5, [pq_128]
pxorm6, m6
lea   lenq, [lend*4-mmsize]
.loop:
; src[i] = av_clipl_int32((src[i] * volume + 128) >> 8);
pabsd   m3, [srcq+lenq]
pshufd  m0, m3, q0100
pshufd  m1, m3, q0302
pmuludq m0, m4
pmuludq m1, m4
paddq   m0, m5
paddq   m1, m5
psrlq   m0, 7
psrlq   m1, 7
shufps  m2, m0, m1, q3131
shufps  m0, m0, m1, q2020
pcmpgtd m2, m6
por m0, m2
psrld   m0, 1
psignd  m0, m3
mova  [srcq+lenq], m0
sub   lenq, mmsize
jge .loop
REP_RET

(untested)

--Loren Merritt
___
libav-devel mailing list
libav-devel@libav.org
https://lists.libav.org/mailman/listinfo/libav-devel


Re: [libav-devel] [PATCH] libx264: change i_qfactor to use x264cli's default

2012-09-25 Thread Loren Merritt
On Tue, 25 Sep 2012, John Van Sickle wrote:

 On Tue, Sep 25, 2012 at 2:12 PM, Luca Barbato lu_z...@gentoo.org wrote:
  On 09/25/2012 06:29 PM, John Van Sickle wrote:
  -x4->params.rc.f_ip_factor = 1 /
  fabs(avctx->i_quant_factor);
  +if (avctx->i_quant_factor >= 0)
  +x4->params.rc.f_ip_factor = avctx->i_quant_factor;
 
  Looks strange to me. before it was 1/qf...

 As discussed on irc, I have no idea why it was previously: 1 /
 fabs(avctx->i_quant_factor);

 With my patch the option now behaves the same way as x264cli. The
 default is 1.40, and using -tune grain correctly sets ipratio to
 1.10. If you set --ipratio .80 using x264cli, that's what you get. Not
 1/.80 like avconv is currently doing. Do this make sense?

The meaning that x264 assigns to f_ip_factor is the reciprocal of the
meaning that libav codecs assign to i_quant_factor.

--Loren Merritt
___
libav-devel mailing list
libav-devel@libav.org
https://lists.libav.org/mailman/listinfo/libav-devel


Re: [libav-devel] [PATCH] vorbisdec: ensure FASTDIV denominator is never 1

2012-09-13 Thread Loren Merritt
On Thu, 13 Sep 2012, Luca Barbato wrote:

 Both in usage of FASTDIV the denominator might be 1.

 Using a branch would make the function slower than using a normal
 division.

Which CPU do you have in mind where a division is faster than a branch
miss, let alone a predictable branch?
(No objection to the patch, it's only the commit message that's wrong.)

--Loren Merritt
___
libav-devel mailing list
libav-devel@libav.org
https://lists.libav.org/mailman/listinfo/libav-devel


Re: [libav-devel] [PATCH 1/2] x86: dsputil: Move SSE4 function pointer setting to SSE4 section

2012-09-06 Thread Loren Merritt
On Thu, 6 Sep 2012, Diego Elio Pettenò wrote:
 On 06/09/2012 13:20, Diego Biurrun wrote:

  -if (mm_flags & AV_CPU_FLAG_SSE4) // not really sse4, just slow on 
  Conroe
  -c->add_hfyu_left_prediction = ff_add_hfyu_left_prediction_sse4;

 This looks like it might have to be tied to something else -- AMD also
 has ssse3 don't they?

Semantically it should check SSSE3 && SHUFFLE_IS_FAST rather than SSE4. If
anyone wants to update the set of cpuflags to include the things (such as
SHUFFLE_IS_FAST) that I invented for x264 to represent performance
characteristics that aren't instructions sets.

But the extensional equivalence of SHUFFLE_IS_FAST and SSE4 remains true
on AMD too.

--Loren Merritt___
libav-devel mailing list
libav-devel@libav.org
https://lists.libav.org/mailman/listinfo/libav-devel


Re: [libav-devel] [PATCH] x86: allow using add_hfyu_median_prediction_cmov on any cpu with cmov

2012-09-02 Thread Loren Merritt
On Sun, 2 Sep 2012, Mans Rullgard wrote:

 For some reasion add_hfyu_median_prediction_cmov is only selected
 on 3Dnow-capable CPUs, even though it uses no 3Dnow instructions.
 This patch allows it to be selected on any cpu with cmov with the
 possibility of being overridden by the mmxext version.

As long as mmxext doesn't override it on AMD, where cmov is faster.

--Loren Merritt
___
libav-devel mailing list
libav-devel@libav.org
https://lists.libav.org/mailman/listinfo/libav-devel


Re: [libav-devel] [PATCH 7/7] vf_hqdn3d: x86 asm

2012-08-26 Thread Loren Merritt
On Thu, 9 Aug 2012, Kieran Kunhya wrote:
 On Sat, Jul 28, 2012 at 8:55 PM, Loren Merritt lor...@u.washington.edu 
 wrote:

 13% faster on penryn, 16% on sandybridge, 15% on bulldozer
 Not simd; a compiler should have generated this, but gcc didn't.
 ---
  libavfilter/vf_hqdn3d.c|   27 ++--
  libavfilter/x86/Makefile   |1 +
  libavfilter/x86/hqdn3d.asm |  106 
 
  libavutil/x86/x86inc.asm   |1 +
  4 files changed, 131 insertions(+), 4 deletions(-)
  create mode 100644 libavfilter/x86/hqdn3d.asm

 Looks ok to me.

Pushed.

--Loren Merritt
___
libav-devel mailing list
libav-devel@libav.org
https://lists.libav.org/mailman/listinfo/libav-devel


Re: [libav-devel] [PATCH] H.264: Convert 8-bit qpel inlined assembly to yasm

2012-08-23 Thread Loren Merritt
On Thu, 23 Aug 2012, Daniel Kang wrote:

 On Wed, Aug 22, 2012 at 11:30 PM, Loren Merritt lor...@u.washington.edu 
 wrote:
 
  On Wed, 22 Aug 2012, daniel.d.k...@gmail.com wrote:
 
   +%macro QPEL4_H_LOWPASS_OP 1
   +cglobal %1_h264_qpel4_h_lowpass, 4,5 ; dst, src, dstStride, srcStride
   +%define OP op_%1h
 
  I don't think this define clarifies anything, and it's only used once or
  twice each function.

 What do you suggest I do instead?

Write op_%1h in the place where it's used.

--Loren Merritt
___
libav-devel mailing list
libav-devel@libav.org
https://lists.libav.org/mailman/listinfo/libav-devel


Re: [libav-devel] [PATCH] H.264: Convert 8-bit qpel inlined assembly to yasm

2012-08-22 Thread Loren Merritt
On Wed, 22 Aug 2012, daniel.d.k...@gmail.com wrote:

 +; void pixels8_l2_mmx2(uint8_t *dst, uint8_t *src1, uint8_t *src2, int 
 dstStride, int src1Stride, int h)
 +%macro PIXELS8_L2 1
 +%define OP op_%1
 +cglobal %1_pixels8_l2, 6,6
 +test   r5d, 1
 +je .loop
 +movam0, [r1]
 +movam1, [r2]
 +add r1, r4
 +add r2, 8
 +pavgb   m0, m1
 +OP  m0, [r0]
 +add r0, r3
 +decr5d
 +.loop:
 +movam0, [r1]
 +add r1, r4
 +movam1, [r1]
 +add r1, r4
 +pavgb   m0, [r2]
 +pavgb   m1, [r2+8]
 +OP  m0, [r0]
 +add r0, r3
 +OP  m1, [r0]
 +add r0, r3
 +movam0, [r1]
 +add r1, r4
 +movam1, [r1]
 +add r1, r4
 +pavgb   m0, [r2+16]
 +pavgb   m1, [r2+24]
 +OP  m0, [r0]
 +add r0, r3
 +OP  m1, [r0]
 +add r0, r3
 +add r2, 32
 +subr5d, 4
 +jne .loop
 +REP_RET
 +%endmacro

More adds than necessary. Use [r1+r4].

 +%macro QPEL4_H_LOWPASS_OP 1
 +cglobal %1_h264_qpel4_h_lowpass, 4,5 ; dst, src, dstStride, srcStride
 +%define OP op_%1h

I don't think this define clarifies anything, and it's only used once or
twice each function.

 +%macro QPEL8_H_LOWPASS_OP_XMM 1
 +%define OP op_%1h
 +cglobal %1_h264_qpel8_h_lowpass, 4,5,7 ; dst, src, dstStride, srcStride
 +mov  r4d,  8
 +pxor  m7, m7
 +mova  m6, [pw_5]
 +.loop:
 +lddqu m1, [r1-2]
 +mova  m0, m1
 +punpckhbw m1, m7
 +punpcklbw m0, m7
 +mova  m2, m1
 +mova  m3, m1
 +mova  m4, m1
 +mova  m5, m1
 +palignr   m4, m0, 2
 +palignr   m3, m0, 4
 +palignr   m2, m0, 6
 +palignr   m1, m0, 8
 +palignr   m5, m0, 10
 +paddw m0, m5
 +paddw m2, m3
 +paddw m1, m4
 +psllw m2, 2
 +psubw m2, m1
 +paddw m0, [pw_16]
 +pmullwm2, m6
 +paddw m2, m0
 +psraw m2, 5
 +packuswb  m2, m2
 +OPm2, [r0], m4
 +add   r1, r3
 +add   r0, r2
 +dec r4d
 +jne .loop
 +REP_RET
 +%endmacro
 +
 +INIT_XMM ssse3
 +QPEL8_H_LOWPASS_OP_XMM put
 +QPEL8_H_LOWPASS_OP_XMM avg

There aren't any cpus that have both lddqu and ssse3. Use movu instead,
since that's what lddqu actually does on everything other than pentium4.

--Loren Merritt
___
libav-devel mailing list
libav-devel@libav.org
https://lists.libav.org/mailman/listinfo/libav-devel


Re: [libav-devel] [PATCH] x86: Drop silly _yasm suffixes from filenames

2012-08-08 Thread Loren Merritt
On Wed, 8 Aug 2012, Måns Rullgård wrote:
 Diego Biurrun di...@biurrun.de writes:

  libavcodec/x86/Makefile|6 +++---
  libavcodec/x86/{dsputil_yasm.asm = dsputil.asm}   |0
  .../x86/{dsputilenc_yasm.asm = dsputilenc.asm}|0
  libavcodec/x86/{vc1dsp_yasm.asm = vc1dsp.asm} |0
  4 files changed, 3 insertions(+), 3 deletions(-)
  rename libavcodec/x86/{dsputil_yasm.asm = dsputil.asm} (100%)
  rename libavcodec/x86/{dsputilenc_yasm.asm = dsputilenc.asm} (100%)
  rename libavcodec/x86/{vc1dsp_yasm.asm = vc1dsp.asm} (100%)

 Makes sense.  _yasm doesn't convey any information not already given by
 the .asm suffix, given that all the x86 .asm files use yasm syntax.

Same for _mmx.
Do we have a standard for what to do when we have both a dsputil.c and a
dsputil.asm, where the .o's would collide if we gave both their natural
basename?

--Loren Merritt___
libav-devel mailing list
libav-devel@libav.org
https://lists.libav.org/mailman/listinfo/libav-devel


Re: [libav-devel] [PATCH 06/15] lavr: x86: optimized 6-channel fltp to s16 conversion

2012-08-06 Thread Loren Merritt
On Mon, 6 Aug 2012, Justin Ruggles wrote:

 --- a/libavresample/x86/audio_convert.asm
 +++ b/libavresample/x86/audio_convert.asm
 @@ -581,6 +581,120 @@ CONV_FLTP_TO_S16_2CH
  INIT_XMM ssse3
  CONV_FLTP_TO_S16_2CH

 +;--
 +; void ff_conv_fltp_to_s16_6ch(int16_t *dst, float *const *src, int len,
 +;  int channels);
 +;--
 +
 +%macro CONV_FLTP_TO_S16_6CH 0
 +%if ARCH_X86_64
 +cglobal conv_fltp_to_s16_6ch, 3,8,7, dst, src, len, src1, src2, src3, src4, 
 src5
 +%else
 +cglobal conv_fltp_to_s16_6ch, 2,7,7, dst, src, src1, src2, src3, src4, src5
 +%define lend dword r2m
 +%endif
 +movsrc1q, [srcq+1*gprsize]
 +movsrc2q, [srcq+2*gprsize]
 +movsrc3q, [srcq+3*gprsize]
 +movsrc4q, [srcq+4*gprsize]
 +movsrc5q, [srcq+5*gprsize]
 +mov srcq, [srcq]
 +subsrc1q, srcq
 +subsrc2q, srcq
 +subsrc3q, srcq
 +subsrc4q, srcq
 +subsrc5q, srcq
 +movaps  xmm6, [pf_s16_scale]
 +.loop:
 +%if cpuflag(sse2)
 +mulps m0, m6, [srcq  ]
 +mulps m1, m6, [srcq+src1q]
 +mulps m2, m6, [srcq+src2q]
 +mulps m3, m6, [srcq+src3q]
 +mulps m4, m6, [srcq+src4q]
 +mulps m5, m6, [srcq+src5q]
 +cvtps2dq  m0, m0
 +cvtps2dq  m1, m1
 +cvtps2dq  m2, m2
 +cvtps2dq  m3, m3
 +cvtps2dq  m4, m4
 +cvtps2dq  m5, m5
 +packssdw  m0, m3; m0 =  0,  6, 12, 18,  3,  9, 15, 21
 +packssdw  m1, m4; m1 =  1,  7, 13, 19,  4, 10, 16, 22
 +packssdw  m2, m5; m2 =  2,  8, 14, 20,  5, 11, 17, 23
 +; unpack words:
 +movhlps   m3, m0; m3 =  3,  9, 15, 21,  x,  x,  x,  x
 +punpcklwd m0, m1; m0 =  0,  1,  6,  7, 12, 13, 18, 19
 +punpckhwd m1, m2; m1 =  4,  5, 10, 11, 16, 17, 22, 23
 +punpcklwd m2, m3; m2 =  2,  3,  8,  9, 14, 15, 20, 21
 +; blend dwords:
 +shufpsm3, m0, m2, q2020 ; m3 =  0,  1, 12, 13,  2,  3, 14, 15
 +shufpsm0, m1, q2031 ; m0 =  6,  7, 18, 19,  4,  5, 16, 17
 +shufpsm2, m1, q3131 ; m2 =  8,  9, 20, 21, 10, 11, 22, 23
 +; shuffle dwords:
 +shufpsm1, m2, m3, q3120 ; m1 =  8,  9, 10, 11, 12, 13, 14, 15
 +shufpsm3, m0, q0220 ; m3 =  0,  1,  2,  3,  4,  5,  6,  7
 +shufpsm0, m2, q3113 ; m0 = 16, 17, 18, 19, 20, 21, 22, 23
 +mova  [dstq+0*mmsize], m3
 +mova  [dstq+1*mmsize], m1
 +mova  [dstq+2*mmsize], m0
 +%else ; sse
 +movaxmm0, [srcq  ]
 +movaxmm1, [srcq+src1q]
 +movaxmm2, [srcq+src2q]
 +movaxmm3, [srcq+src3q]
 +movaxmm4, [srcq+src4q]
 +movaxmm5, [srcq+src5q]

Even aside from the weirdness of seeing a mova that doesn't move the whole
register, movq xmm, mem is a sse2 instruction. You need movlps.

--Loren Merritt
___
libav-devel mailing list
libav-devel@libav.org
https://lists.libav.org/mailman/listinfo/libav-devel


Re: [libav-devel] [PATCH 10/15] lavr: x86: optimized 2-channel s16 to fltp conversion

2012-08-06 Thread Loren Merritt
On Mon, 6 Aug 2012, Justin Ruggles wrote:

 ---
  libavresample/x86/audio_convert.asm|   39 
 
  libavresample/x86/audio_convert_init.c |   13 ++
  2 files changed, 52 insertions(+), 0 deletions(-)

 diff --git a/libavresample/x86/audio_convert.asm 
 b/libavresample/x86/audio_convert.asm
 index 09c4e7f..ea0debf 100644
 --- a/libavresample/x86/audio_convert.asm
 +++ b/libavresample/x86/audio_convert.asm
 @@ -923,3 +923,42 @@ CONV_S16_TO_S16P_6CH
  INIT_XMM avx
  CONV_S16_TO_S16P_6CH
  %endif
 +
 +;--
 +; void ff_conv_s16_to_fltp_2ch(float *const *dst, int16_t *src, int len,
 +;  int channels);
 +;--
 +
 +%macro CONV_S16_TO_FLTP_2CH 0
 +cglobal conv_s16_to_fltp_2ch, 3,4,4, dst0, src, len, dst1
 +lea   lenq, [4*lend]
 +mov  dst1q, [dst0q+gprsize]
 +mov  dst0q, [dst0q]
 +add   srcq, lenq
 +add  dst0q, lenq
 +add  dst1q, lenq
 +neg   lenq
 +movam3, [pf_s16_inv_scale]
 +.loop:
 +mova   m0, [srcq+lenq]
 +S16_TO_S32_SX 0, 1
 +cvtdq2ps   m0, m0
 +cvtdq2ps   m1, m1
 +mulps  m0, m0, m3
 +mulps  m1, m1, m3
 +DEINT2_PS   0, 1, 2

pslld  m1, m0, 16
psrad  m0, m0, 16
psrad  m1, m1, 16
cvtdq2ps   m0, m0
cvtdq2ps   m1, m1
mulps  m0, m0, m3
mulps  m1, m1, m3

 +mova  [dst0q+lenq], m0
 +mova  [dst1q+lenq], m1
 +add  lenq, mmsize
 +jl .loop
 +REP_RET
 +%endmacro

--Loren Merritt
___
libav-devel mailing list
libav-devel@libav.org
https://lists.libav.org/mailman/listinfo/libav-devel


Re: [libav-devel] [PATCH 07/45] x86: mmx2 --- mmxext in asm constructs

2012-08-04 Thread Loren Merritt
On Sat, 4 Aug 2012, Diego Biurrun wrote:

 On Sat, Aug 04, 2012 at 03:11:50PM -0400, Justin Ruggles wrote:
 On 07/31/2012 06:17 PM, Diego Biurrun wrote:
 ---
  libavcodec/x86/ac3dsp.asm  |4 +-
  libavcodec/x86/ac3dsp_mmx.c|4 +-
  libavcodec/x86/dsputil_yasm.asm|8 ++--
  libavcodec/x86/dsputilenc_mmx.c|6 +-
  libavcodec/x86/dsputilenc_yasm.asm |2 +-
  libavcodec/x86/h264_chromamc.asm   |   14 +++---
  libavcodec/x86/h264_chromamc_10bit.asm |4 +-
  libavcodec/x86/h264_deblock.asm|   20 
  libavcodec/x86/h264_deblock_10bit.asm  |4 +-
  libavcodec/x86/h264_idct.asm   |   46 +
  libavcodec/x86/h264_idct_10bit.asm |2 +-
  libavcodec/x86/h264_intrapred.asm  |   26 +-
  libavcodec/x86/h264_intrapred_init.c   |   40 +++---
  libavcodec/x86/h264_weight.asm |   12 ++--
  libavcodec/x86/pngdsp-init.c   |6 +-
  libavcodec/x86/pngdsp.asm  |2 +-
  libavcodec/x86/rv34dsp.asm |6 +-
  libavcodec/x86/rv34dsp_init.c  |   10 ++--
  libavcodec/x86/rv40dsp.asm |4 +-
  libavcodec/x86/rv40dsp_init.c  |   30 ++--
  libavcodec/x86/vc1dsp_mmx.c|   10 ++--
  libavcodec/x86/vc1dsp_yasm.asm |   10 ++--
  libavcodec/x86/vp3dsp.asm  |4 +-
  libavcodec/x86/vp3dsp_init.c   |   16 +++---
  libavcodec/x86/vp8dsp-init.c   |   86 
 
  libavcodec/x86/vp8dsp.asm  |   30 ++--
  libavutil/x86/x86inc.asm   |6 +-
  libavutil/x86/x86util.asm  |2 +-
  libswscale/x86/output.asm  |4 +-
  libswscale/x86/swscale.c   |8 ++--
  30 files changed, 215 insertions(+), 211 deletions(-)

 Looks ok, but probably should get other opinions on this as well. I know
 Ronald was trying to keep x86inc.asm synchronized with x264, and trying
 to do so after this change would likely require similar extensive
 cpuflag modifications in x264.

 I volunteer to patch x264 if such a change would be accepted on their
 side.

Rejected. I like mmx2 better.
However, I wouldn't be opposed to dropping mmx1 entirely and using
the name mmx to refer to mmx2. (x264 doesn't actually support mmx1
anyway; we use mmx2 inline asm that's actually inlined in places where
runtime cpu detection is impossible.) But that wouldn't help
synchronization if libav doesn't do so.

--Loren Merritt
___
libav-devel mailing list
libav-devel@libav.org
https://lists.libav.org/mailman/listinfo/libav-devel


Re: [libav-devel] [PATCH] dsputil: make add_hfyu_left_prediction_sse4() support unaligned src.

2012-08-03 Thread Loren Merritt
On Fri, 3 Aug 2012, Hendrik Leppkes wrote:

 From: Ronald S. Bultje rsbul...@gmail.com

 This makes add_hfyu_left_prediction_sse4() handle sources that are not
 16-byte aligned in its own function rather than by proxying the call to
 add_hfyu_left_prediction_ssse3(). This fixes a crash on Win64, since the
 sse4 version clobbers xmm6, but the ssse3 version (which uses MMX regs)
 does not restore it, thus leading to XMM clobbering and RSP being off.

 Fixes bug 342.
 ---
 The previous patch assumed that dst would be aligned if src was not,
 causing another crash. Instead, assume both are not aligned.

  libavcodec/x86/dsputil_yasm.asm |   14 ++
  1 file changed, 10 insertions(+), 4 deletions(-)

 diff --git a/libavcodec/x86/dsputil_yasm.asm b/libavcodec/x86/dsputil_yasm.asm
 index 70a0aa1..58e4ca0 100644
 --- a/libavcodec/x86/dsputil_yasm.asm
 +++ b/libavcodec/x86/dsputil_yasm.asm
 @@ -388,12 +388,16 @@ cglobal add_hfyu_median_prediction_mmx2, 6,6,0, dst, 
 top, diff, w, left, left_to
  RET


 -%macro ADD_HFYU_LEFT_LOOP 1 ; %1 = is_aligned
 +%macro ADD_HFYU_LEFT_LOOP 1-2 ; %1 = is_aligned, %2 = src_is_unaligned

The two have opposite semantics?
And there's no reason to use variable number of arguments here.

--Loren Merritt
___
libav-devel mailing list
libav-devel@libav.org
https://lists.libav.org/mailman/listinfo/libav-devel


Re: [libav-devel] [PATCH] dsputil: make add_hfyu_left_prediction_sse4() support unaligned src.

2012-08-03 Thread Loren Merritt
On Fri, 3 Aug 2012, Ronald S. Bultje wrote:

 From: Ronald S. Bultje rsbul...@gmail.com

 This makes add_hfyu_left_prediction_sse4() handle sources that are not
 16-byte aligned in its own function rather than by proxying the call to
 add_hfyu_left_prediction_ssse3(). This fixes a crash on Win64, since the
 sse4 version clobbers xmm6, but the ssse3 version (which uses MMX regs)
 does not restore it, thus leading to XMM clobbering and RSP being off.

 Fixes bug 342.
 ---
  libavcodec/x86/dsputil_yasm.asm |   20 +---
  1 file changed, 13 insertions(+), 7 deletions(-)

LGTM

--Loren Merritt
___
libav-devel mailing list
libav-devel@libav.org
https://lists.libav.org/mailman/listinfo/libav-devel


Re: [libav-devel] [PATCH] x86: Refactor PSWAPD fallback implementations and port to cpuflags

2012-08-01 Thread Loren Merritt
On Thu, 2 Aug 2012, Diego Biurrun wrote:
 On Wed, Aug 01, 2012 at 07:41:01AM -0700, Ronald S. Bultje wrote:
 On Wed, Aug 1, 2012 at 5:30 AM, Diego Biurrun di...@biurrun.de wrote:
 --- a/libavcodec/x86/fft_mmx.asm
 +++ b/libavcodec/x86/fft_mmx.asm
 @@ -105,7 +105,8 @@ SECTION_TEXT
  pxor %3, [ps_m1p1] ; {t8,t7}
  mova %6, %1
 -pswapd   %3, %3
 +movd [r0+12], %3
 +punpckhdq %3, [r0+8]

 Needs rebase?

 No, it's the only caller with two identical arguments, so I moved the
 macro branch for identical arguments out of the macro and into the caller.

But then it doesn't generate pswapd in the 3dnow2 instantiation.

Or you could make it not have two identical arguments; there's a spare
mmreg (%4) at that point.

--Loren Merritt
___
libav-devel mailing list
libav-devel@libav.org
https://lists.libav.org/mailman/listinfo/libav-devel


Re: [libav-devel] [PATCH 07/15] lavr: x86: optimized 2-channel fltp to flt conversion

2012-07-31 Thread Loren Merritt
On Mon, 30 Jul 2012, Justin Ruggles wrote:

 --- a/libavutil/x86/x86util.asm
 +++ b/libavutil/x86/x86util.asm
 @@ -47,6 +47,12 @@
  SWAP %1, %3, %2
  %endmacro

 +%macro SBUTTERFLYPS2 3
 +unpcklps m%3, m%1, m%2
 +unpckhps m%1, m%1, m%2
 +SWAP %1, %3, %2
 +%endmacro

What's the difference between this and SBUTTERFLYPS?

--Loren Merritt
___
libav-devel mailing list
libav-devel@libav.org
https://lists.libav.org/mailman/listinfo/libav-devel


Re: [libav-devel] [PATCH 43/45] x86: Refactor PSWAPD fallback implementations and port to cpuflags

2012-07-31 Thread Loren Merritt
On Wed, 1 Aug 2012, Diego Biurrun wrote:

 diff --git a/libavutil/x86/x86util.asm b/libavutil/x86/x86util.asm
 index 6911a9c..f358257 100644
 --- a/libavutil/x86/x86util.asm
 +++ b/libavutil/x86/x86util.asm
 @@ -306,6 +306,23 @@
  %endif
  %endmacro

 +%macro PSWAPD 2
 +%if cpuflag(sse)
 +pshufw %1, %2, 0x4e

That's mmx2.
The macro was called sse because it was in a float function which thus
didn't have a mmx2 version.

 +%elif cpuflag(3dnowext)
 +pswapd %1, %2
 +%else
 +%ifidn %1, %2
 +movd [r0+12], %1
 +punpckhdq %1, [r0+8]
 +%else
 +movq  %1, %2
 +psrlq %1, 32
 +punpckldq %1, %2
 +%endif
 +%endif
 +%endmacro

--Loren Merritt
___
libav-devel mailing list
libav-devel@libav.org
https://lists.libav.org/mailman/listinfo/libav-devel


Re: [libav-devel] [PATCH 1/2] h264: remove 3dnow qpel code

2012-07-31 Thread Loren Merritt
On Wed, 1 Aug 2012, Luca Barbato wrote:

 From: Daniel Kang daniel.d.k...@gmail.com

 Remove the code to ease porting the other qpel optimizations to
 yasm.

 AMD has deprecated 3dnow and the only CPUs that have 3dnow and
 do not have mmxext are 12 years old.

libavcodec/x86/dsputil_mmx_avg_template.c:58:1: warning: `put_pixels4_l2_3dnow' 
defined but not used
libavcodec/x86/dsputil_mmx_avg_template.c:229:1: warning: 
`avg_pixels4_l2_3dnow' defined but not used
libavcodec/x86/dsputil_mmx_avg_template.c:875:1: warning: `avg_pixels4_3dnow' 
defined but not used

--Loren Merritt
___
libav-devel mailing list
libav-devel@libav.org
https://lists.libav.org/mailman/listinfo/libav-devel


Re: [libav-devel] [PATCH 2/2] h264: convert 8-bit qpel inlined assembly to yasm

2012-07-31 Thread Loren Merritt
On Wed, 1 Aug 2012, Luca Barbato wrote:

 +%macro OP_MOVH_MMX 3
 +movh   %3, %2
 +pavgb  %1, %3
 +movh   %2, %1
 +%endmacro
 +
 +%macro MOVH_MMX 3
 +movh   %2, %1
 +%endmacro
 +
 +%macro OP_MOV_MMX 3
 +mova   %3, %2
 +pavgb  %1, %3
 +mova   %2, %1

pavgb %1, %2
mova  %2, %1
(Just for the full width one)

 +%endmacro
 +
 +%macro MOV_MMX 3
 +mova   %2, %1
 +%endmacro

It's op_put vs op_avg (or mov vs avg), not mov vs op_mov.
Plus, naming them put vs avg would allow you to exploit the same put vs
avg that's already in all the function names, rather than a separate
%define OP.

 +%macro QPEL8OR16_V_LOWPASS_OP 1
 +cglobal %1_h264_qpel8or16_v_lowpass, 5,5,7 ; dst, src, dstStride, srcStride, 
 h
 +%if cpuflag(sse2)
 +sub   r1, r3
 +sub   r1, r3
 +%endif
 +pxor  m7, m7
 +movh  m0, [r1]
 +movh  m1, [r1+r3]
 +lea   r1, [r1+2*r3]
 +movh  m2, [r1]
 +movh  m3, [r1+r3]
 +lea   r1, [r1+2*r3]
 +movh  m4, [r1]
 +add   r1, r3
 +punpcklbw m0, m7
 +punpcklbw m1, m7
 +punpcklbw m2, m7
 +punpcklbw m3, m7
 +punpcklbw m4, m7
 +FILT_V
 +FILT_V
 +FILT_V
 +FILT_V
 +FILT_V
 +FILT_V
 +FILT_V
 +FILT_V
 +cmp r4d, 16
 +jne .end
 +FILT_V
 +FILT_V
 +FILT_V
 +FILT_V
 +FILT_V
 +FILT_V
 +FILT_V
 +FILT_V
 +.end:
 +RET
 +%endmacro

(and other cases of this)
REP_RET

... and I'll skip the suggestions for improvement, since Daniel Kang has a
separate branch for that.

--Loren Merritt
___
libav-devel mailing list
libav-devel@libav.org
https://lists.libav.org/mailman/listinfo/libav-devel


Re: [libav-devel] [PATCH 1/2] fft: port FFT and IMDCT functions for 3dnow to yasm.

2012-07-29 Thread Loren Merritt
On Sun, 29 Jul 2012, Ronald S. Bultje wrote:

 +%macro PSWAPD 2
 +%if cpuflag(3dnow2)
 +pswapd%1, %2
 +%else
 +mova  %1, %2
 +psrlq %1, 32
 +punpckldq %1, %2
 +%endif
 +%endmacro

Merge with the macro named pswapd.

--Loren Merritt
___
libav-devel mailing list
libav-devel@libav.org
https://lists.libav.org/mailman/listinfo/libav-devel


Re: [libav-devel] [PATCH] h264: convert loop filter strength dsp function to yasm.

2012-07-28 Thread Loren Merritt
On Sat, 28 Jul 2012, Ronald S. Bultje wrote:
 On Fri, Jul 27, 2012 at 1:01 PM, Loren Merritt lor...@u.washington.edu 
 wrote:

 %%.b_idx_loop:
 Automatically generates a different label for each instantiation of the 
 macro.

 My disassembly now looks like this:

 0x0001004c43b4 ff_h264_loop_filter_strength_mmx2.nofield+25:jne
0x1004c44d1 ff_h264_loop_filter_strength_mmx2.bidir
 0x0001004c43ba ff_h264_loop_filter_strength_mmx2.nofield+31:xor
%r8d,%r8d
 0x0001004c43bd ..@5001..b_idx_loop+0:   pxor   %mm0,%mm0
 0x0001004c43c0 ..@5001..b_idx_loop+3:   test   %r11d,%r8d

 Can I somehow keep the function name in it? I find that somewhat
 useful when debugging.

Nope. And I've patched my objdump to remove function names prefixed to
local labels.

The bigger problem is that gdb doesn't know the difference between local
labels and functions, so disassembly will fail to encompass a whole
function if it contains any labels, regardless of what they're named. For
this reason, I sometimes find it easier to `strip -x` before using gdb.

--Loren Merritt
___
libav-devel mailing list
libav-devel@libav.org
https://lists.libav.org/mailman/listinfo/libav-devel


[libav-devel] [PATCH 7/7] vf_hqdn3d: x86 asm

2012-07-28 Thread Loren Merritt
13% faster on penryn, 16% on sandybridge, 15% on bulldozer
Not simd; a compiler should have generated this, but gcc didn't.
---
 libavfilter/vf_hqdn3d.c|   27 ++--
 libavfilter/x86/Makefile   |1 +
 libavfilter/x86/hqdn3d.asm |  106 
 libavutil/x86/x86inc.asm   |1 +
 4 files changed, 131 insertions(+), 4 deletions(-)
 create mode 100644 libavfilter/x86/hqdn3d.asm

diff --git a/libavfilter/vf_hqdn3d.c b/libavfilter/vf_hqdn3d.c
index d263cff..ef59691 100644
--- a/libavfilter/vf_hqdn3d.c
+++ b/libavfilter/vf_hqdn3d.c
@@ -40,8 +40,14 @@ typedef struct {
 double strength[4];
 int hsub, vsub;
 int depth;
+void (*denoise_row[17])(uint8_t *src, uint8_t *dst, uint16_t *line_ant, 
uint16_t *frame_ant, ptrdiff_t w, int16_t *spatial, int16_t *temporal);
 } HQDN3DContext;
 
+void ff_hqdn3d_row_8_x86(uint8_t *src, uint8_t *dst, uint16_t *line_ant, 
uint16_t *frame_ant, ptrdiff_t w, int16_t *spatial, int16_t *temporal);
+void ff_hqdn3d_row_9_x86(uint8_t *src, uint8_t *dst, uint16_t *line_ant, 
uint16_t *frame_ant, ptrdiff_t w, int16_t *spatial, int16_t *temporal);
+void ff_hqdn3d_row_10_x86(uint8_t *src, uint8_t *dst, uint16_t *line_ant, 
uint16_t *frame_ant, ptrdiff_t w, int16_t *spatial, int16_t *temporal);
+void ff_hqdn3d_row_16_x86(uint8_t *src, uint8_t *dst, uint16_t *line_ant, 
uint16_t *frame_ant, ptrdiff_t w, int16_t *spatial, int16_t *temporal);
+
 #define LUT_BITS (depth==16 ? 8 : 4)
 #define RIGHTSHIFT(a,b) (((a)+(((1(b))-1)1))(b))
 #define LOAD(x) ((depth==8 ? src[x] : AV_RN16A(src+(x)*2))  (16-depth))
@@ -78,7 +84,8 @@ static void denoise_temporal(uint8_t *src, uint8_t *dst,
 }
 
 av_always_inline
-static void denoise_spatial(uint8_t *src, uint8_t *dst,
+static void denoise_spatial(HQDN3DContext *hqdn3d,
+uint8_t *src, uint8_t *dst,
 uint16_t *line_ant, uint16_t *frame_ant,
 int w, int h, int sstride, int dstride,
 int16_t *spatial, int16_t *temporal, int depth)
@@ -103,6 +110,10 @@ static void denoise_spatial(uint8_t *src, uint8_t *dst,
 src += sstride;
 dst += dstride;
 frame_ant += w;
+if (hqdn3d-denoise_row[depth]) {
+hqdn3d-denoise_row[depth](src, dst, line_ant, frame_ant, w, 
spatial, temporal);
+continue;
+}
 pixel_ant = LOAD(0);
 for (x = 0; x  w-1; x++) {
 line_ant[x] = tmp = lowpass(line_ant[x], pixel_ant, spatial, 
depth);
@@ -117,7 +128,8 @@ static void denoise_spatial(uint8_t *src, uint8_t *dst,
 }
 
 av_always_inline
-static void denoise_depth(uint8_t *src, uint8_t *dst,
+static void denoise_depth(HQDN3DContext *hqdn3d,
+  uint8_t *src, uint8_t *dst,
   uint16_t *line_ant, uint16_t **frame_ant_ptr,
   int w, int h, int sstride, int dstride,
   int16_t *spatial, int16_t *temporal, int depth)
@@ -137,7 +149,7 @@ static void denoise_depth(uint8_t *src, uint8_t *dst,
 }
 
 if (spatial[0])
-denoise_spatial(src, dst, line_ant, frame_ant,
+denoise_spatial(hqdn3d, src, dst, line_ant, frame_ant,
 w, h, sstride, dstride, spatial, temporal, depth);
 else
 denoise_temporal(src, dst, frame_ant,
@@ -297,6 +309,13 @@ static int config_input(AVFilterLink *inlink)
 }
 }
 
+#if HAVE_YASM
+hqdn3d-denoise_row[ 8] = ff_hqdn3d_row_8_x86;
+hqdn3d-denoise_row[ 9] = ff_hqdn3d_row_9_x86;
+hqdn3d-denoise_row[10] = ff_hqdn3d_row_10_x86;
+hqdn3d-denoise_row[16] = ff_hqdn3d_row_16_x86;
+#endif
+
 return 0;
 }
 
@@ -314,7 +333,7 @@ static int end_frame(AVFilterLink *inlink)
 int ret, c;
 
 for (c = 0; c  3; c++) {
-denoise(inpic-data[c], outpic-data[c],
+denoise(hqdn3d, inpic-data[c], outpic-data[c],
 hqdn3d-line, hqdn3d-frame_prev[c],
 inpic-video-w  (!!c * hqdn3d-hsub),
 inpic-video-h  (!!c * hqdn3d-vsub),
diff --git a/libavfilter/x86/Makefile b/libavfilter/x86/Makefile
index e98693d..46fc84f 100644
--- a/libavfilter/x86/Makefile
+++ b/libavfilter/x86/Makefile
@@ -1,2 +1,3 @@
 MMX-OBJS-$(CONFIG_YADIF_FILTER)  += x86/yadif.o
 MMX-OBJS-$(CONFIG_GRADFUN_FILTER)+= x86/gradfun.o
+YASM-OBJS-$(CONFIG_HQDN3D_FILTER)+= x86/hqdn3d.o
diff --git a/libavfilter/x86/hqdn3d.asm b/libavfilter/x86/hqdn3d.asm
new file mode 100644
index 000..7254194
--- /dev/null
+++ b/libavfilter/x86/hqdn3d.asm
@@ -0,0 +1,106 @@
+;**
+;* Copyright (c) 2012 Loren Merritt
+;*
+;* This file is part of Libav.
+;*
+;* Libav is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1

Re: [libav-devel] [PATCH] h264_loopfilter: port x86 simd to cpuflags.

2012-07-27 Thread Loren Merritt
On Thu, 26 Jul 2012, Ronald S. Bultje wrote:

 From: Ronald S. Bultje rsbul...@gmail.com

 ---
  libavcodec/x86/h264_deblock.asm   |  124 
 +++--
  libavcodec/x86/h264_deblock_10bit.asm |   77 ++--
  libavcodec/x86/h264dsp_mmx.c  |   60 
  3 files changed, 139 insertions(+), 122 deletions(-)

 diff --git a/libavcodec/x86/h264_deblock.asm b/libavcodec/x86/h264_deblock.asm
 index 1982dc4..b5e81e7 100644
 --- a/libavcodec/x86/h264_deblock.asm
 +++ b/libavcodec/x86/h264_deblock.asm
 @@ -282,8 +282,8 @@ cextern pb_A1
  
 ;-
  ; void deblock_v_luma( uint8_t *pix, int stride, int alpha, int beta, int8_t 
 *tc0 )
  
 ;-
 -%macro DEBLOCK_LUMA 1
 -cglobal deblock_v_luma_8_%1, 5,5,10
 +%macro DEBLOCK_V_LUMA 0
 +cglobal deblock_v_luma_8, 5,5,10
  movdm8, [r4] ; tc0
  lea r4, [r1*3]
  dec r2d; alpha-1
 @@ -323,12 +323,13 @@ cglobal deblock_v_luma_8_%1, 5,5,10
  mova[r4+2*r1], m1
  mova[r0], m2
  RET
 +%endmacro

No indent.

  
 ;-
  ; void deblock_h_luma( uint8_t *pix, int stride, int alpha, int beta, int8_t 
 *tc0 )
  
 ;-
 -INIT_MMX
 -cglobal deblock_h_luma_8_%1, 5,9
 +%macro DEBLOCK_H_LUMA 0
 +cglobal deblock_h_luma_8, 5,9
  movsxd r7,  r1d
  lear8,  [r7+r7*2]
  lear6,  [r0-4]
 @@ -355,7 +356,7 @@ cglobal deblock_h_luma_8_%1, 5,9
  %if WIN64
  mov[rsp+0x20], r4
  %endif
 -call   deblock_v_luma_8_%1
 +call   deblock_v_luma_8 %+ SUFFIX

call automatically appends SUFFIX if there is a function by that name.

--Loren Merritt
___
libav-devel mailing list
libav-devel@libav.org
https://lists.libav.org/mailman/listinfo/libav-devel


Re: [libav-devel] [PATCH] h264: convert loop filter strength dsp function to yasm.

2012-07-27 Thread Loren Merritt
On Fri, 27 Jul 2012, Ronald S. Bultje wrote:

 From: Ronald S. Bultje rsbul...@gmail.com

 This completes the conversion of h264dsp to yasm; note that h264 also
 uses some dsputil functions, most notably qpel. Performance-wise, the
 yasm-version is ~10 cycles faster (182-172) on x86-64, and ~8 cycles
 faster (201-193) on x86-32.
 ---
  libavcodec/x86/h264_deblock.asm |  168 
 +++
  libavcodec/x86/h264dsp_mmx.c|  162 ++---
  2 files changed, 175 insertions(+), 155 deletions(-)

 diff --git a/libavcodec/x86/h264_deblock.asm b/libavcodec/x86/h264_deblock.asm
 index 1982dc4..77b25d2 100644
 --- a/libavcodec/x86/h264_deblock.asm
 +++ b/libavcodec/x86/h264_deblock.asm
 @@ -27,6 +27,10 @@
  %include x86inc.asm
  %include x86util.asm

 +SECTION_RODATA
 +
 +pb_3_1: times 4 db 3, 1
 +
  SECTION .text

  cextern pb_0
 @@ -911,3 +915,167 @@ ff_chroma_intra_body_mmxext:
  paddb  m1, m5
  paddb  m2, m6
  ret
 +
 +;-
 +; void h264_loop_filter_strength(int16_t bs[2][4][4], uint8_t nnz[40],
 +;int8_t ref[2][40], int16_t mv[2][40][2],
 +;int bidir,int edges,int step,
 +;int mask_mv0, int mask_mv1, int field);
 +;
 +; bidiris 0 or 1
 +; edgesis 1 or 4
 +; step is 1 or 2
 +; mask_mv0 is 0 or 3
 +; mask_mv1 is 0 or 1
 +; fieldis 0 or 1
 +;-
 +%macro loop_filter_strength_iteration 7 ; edges, step, mask_mv,
 +; dir, d_idx, mask_dir, bidir
 +%define edgesd%1
 +%define stepd %2
 +%define mask_mvd  %3
 +%define dir   %4
 +%define d_idx %5
 +%define mask_dir  %6
 +%define bidir %7
 +xor  b_idxd, b_idxd ; for (b_idx = 0; b_idx  edges; b_idx += 
 step)
 +.b_idx_loop_ %+ dir %+ _ %+ bidir:

%%.b_idx_loop:
Automatically generates a different label for each instantiation of the macro.

--Loren Merritt
___
libav-devel mailing list
libav-devel@libav.org
https://lists.libav.org/mailman/listinfo/libav-devel


Re: [libav-devel] [PATCH] x86inc: automatically insert vzeroupper for YMM functions.

2012-07-26 Thread Loren Merritt
On Wed, 25 Jul 2012, Ronald S. Bultje wrote:

 diff --git a/libavcodec/x86/dct32_sse.asm b/libavcodec/x86/dct32_sse.asm
 index e3c8a45..351c88d 100644
 --- a/libavcodec/x86/dct32_sse.asm
 +++ b/libavcodec/x86/dct32_sse.asm
 @@ -278,8 +278,6 @@ cglobal dct32_float_avx, 2,3,8, out, in, tmp
  vperm2f128  m0, m1, m1, 0x31
  vmovaps [outq+96], m1

 -vzeroupper
 -
  ;pass 6, no SIMD...
  INIT_XMM
  PASS6_AND_PERMUTE

This one is followed by xmm instructions in the same function, so moving
it to RET doesn't work. And the INIT_XMM means it doesn't happen at RET
either. So just don't remove it.

--Loren Merritt
___
libav-devel mailing list
libav-devel@libav.org
https://lists.libav.org/mailman/listinfo/libav-devel


[libav-devel] [PATCH 1/7] factor identical ff_inplace_start_frame out of two filters

2012-07-26 Thread Loren Merritt
---
 libavfilter/vf_delogo.c  |   38 +-
 libavfilter/vf_gradfun.c |   37 +
 libavfilter/video.c  |   36 
 libavfilter/video.h  |1 +
 4 files changed, 39 insertions(+), 73 deletions(-)

diff --git a/libavfilter/vf_delogo.c b/libavfilter/vf_delogo.c
index 343585f..e8b5431 100644
--- a/libavfilter/vf_delogo.c
+++ b/libavfilter/vf_delogo.c
@@ -214,42 +214,6 @@ static av_cold int init(AVFilterContext *ctx, const char 
*args)
 return 0;
 }
 
-static int start_frame(AVFilterLink *inlink, AVFilterBufferRef *inpicref)
-{
-AVFilterLink *outlink = inlink-dst-outputs[0];
-AVFilterBufferRef *outpicref = NULL, *for_next_filter;
-int ret = 0;
-
-if (inpicref-perms  AV_PERM_PRESERVE) {
-outpicref = ff_get_video_buffer(outlink, AV_PERM_WRITE,
-outlink-w, outlink-h);
-if (!outpicref)
-return AVERROR(ENOMEM);
-
-avfilter_copy_buffer_ref_props(outpicref, inpicref);
-outpicref-video-w = outlink-w;
-outpicref-video-h = outlink-h;
-} else {
-outpicref = avfilter_ref_buffer(inpicref, ~0);
-if (!outpicref)
-return AVERROR(ENOMEM);
-}
-
-for_next_filter = avfilter_ref_buffer(outpicref, ~0);
-if (for_next_filter)
-ret = ff_start_frame(outlink, for_next_filter);
-else
-ret = AVERROR(ENOMEM);
-
-if (ret  0) {
-avfilter_unref_bufferp(outpicref);
-return ret;
-}
-
-outlink-out_buf = outpicref;
-return 0;
-}
-
 static int null_draw_slice(AVFilterLink *link, int y, int h, int slice_dir)
 {
 return 0;
@@ -296,7 +260,7 @@ AVFilter avfilter_vf_delogo = {
 .inputs= (const AVFilterPad[]) {{ .name = default,
   .type = 
AVMEDIA_TYPE_VIDEO,
   .get_video_buffer = 
ff_null_get_video_buffer,
-  .start_frame  = start_frame,
+  .start_frame  = 
ff_inplace_start_frame,
   .draw_slice   = null_draw_slice,
   .end_frame= end_frame,
   .min_perms= AV_PERM_WRITE | 
AV_PERM_READ,
diff --git a/libavfilter/vf_gradfun.c b/libavfilter/vf_gradfun.c
index 52dcb70..f8896b5 100644
--- a/libavfilter/vf_gradfun.c
+++ b/libavfilter/vf_gradfun.c
@@ -180,41 +180,6 @@ static int config_input(AVFilterLink *inlink)
 return 0;
 }
 
-static int start_frame(AVFilterLink *inlink, AVFilterBufferRef *inpicref)
-{
-AVFilterLink *outlink = inlink-dst-outputs[0];
-AVFilterBufferRef *outpicref = NULL, *for_next_filter;
-int ret = 0;
-
-if (inpicref-perms  AV_PERM_PRESERVE) {
-outpicref = ff_get_video_buffer(outlink, AV_PERM_WRITE, outlink-w, 
outlink-h);
-if (!outpicref)
-return AVERROR(ENOMEM);
-
-avfilter_copy_buffer_ref_props(outpicref, inpicref);
-outpicref-video-w = outlink-w;
-outpicref-video-h = outlink-h;
-} else {
-outpicref = avfilter_ref_buffer(inpicref, ~0);
-if (!outpicref)
-return AVERROR(ENOMEM);
-}
-
-for_next_filter = avfilter_ref_buffer(outpicref, ~0);
-if (for_next_filter)
-ret = ff_start_frame(outlink, for_next_filter);
-else
-ret = AVERROR(ENOMEM);
-
-if (ret  0) {
-avfilter_unref_bufferp(outpicref);
-return ret;
-}
-
-outlink-out_buf = outpicref;
-return 0;
-}
-
 static int null_draw_slice(AVFilterLink *link, int y, int h, int slice_dir)
 {
 return 0;
@@ -261,7 +226,7 @@ AVFilter avfilter_vf_gradfun = {
 .inputs= (const AVFilterPad[]) {{ .name = default,
   .type = 
AVMEDIA_TYPE_VIDEO,
   .config_props = config_input,
-  .start_frame  = start_frame,
+  .start_frame  = 
ff_inplace_start_frame,
   .draw_slice   = null_draw_slice,
   .end_frame= end_frame,
   .min_perms= AV_PERM_READ, },
diff --git a/libavfilter/video.c b/libavfilter/video.c
index 6e50637..ebbbc34 100644
--- a/libavfilter/video.c
+++ b/libavfilter/video.c
@@ -168,6 +168,42 @@ int ff_null_start_frame(AVFilterLink *link, 
AVFilterBufferRef *picref)
 return ff_start_frame(link-dst-outputs[0], buf_out);
 }
 
+// for filters that support (but don't require) outpic==inpic
+int ff_inplace_start_frame(AVFilterLink *inlink, AVFilterBufferRef *inpicref)
+{
+AVFilterLink *outlink = inlink-dst-outputs[0];
+AVFilterBufferRef 

[libav-devel] [PATCH 2/7] vf_hqdn3d: cosmetics

2012-07-26 Thread Loren Merritt
Change code style to match the rest of libav.
---
 libavfilter/vf_hqdn3d.c |  308 +++
 1 files changed, 152 insertions(+), 156 deletions(-)

diff --git a/libavfilter/vf_hqdn3d.c b/libavfilter/vf_hqdn3d.c
index 9e01606..1fa70c6 100644
--- a/libavfilter/vf_hqdn3d.c
+++ b/libavfilter/vf_hqdn3d.c
@@ -32,165 +32,161 @@
 #include video.h
 
 typedef struct {
-int Coefs[4][512*16];
-unsigned int *Line;
-unsigned short *Frame[3];
+int coefs[4][512*16];
+uint32_t *line;
+uint16_t *frame_prev[3];
 int hsub, vsub;
 } HQDN3DContext;
 
-static inline unsigned int LowPassMul(unsigned int PrevMul, unsigned int 
CurrMul, int *Coef)
+static inline uint32_t lowpass(unsigned int prev, unsigned int cur, int *coef)
 {
-//int dMul= (PrevMul0xFF)-(CurrMul0xFF);
-int dMul= PrevMul-CurrMul;
-unsigned int d=((dMul+0x10007FF)12);
-return CurrMul + Coef[d];
+int dmul = prev-cur;
+unsigned int d = (dmul+0x10007FF)12; // 0x1000 to convert to unsigned, 
7FF for rounding
+return cur + coef[d];
 }
 
-static void deNoiseTemporal(unsigned char *FrameSrc,
-unsigned char *FrameDest,
-unsigned short *FrameAnt,
-int W, int H, int sStride, int dStride,
-int *Temporal)
+static void denoise_temporal(uint8_t *src, uint8_t *dst,
+ uint16_t *frame_ant,
+ int w, int h, int sstride, int dstride,
+ int *temporal)
 {
-long X, Y;
-unsigned int PixelDst;
-
-for (Y = 0; Y  H; Y++) {
-for (X = 0; X  W; X++) {
-PixelDst = LowPassMul(FrameAnt[X]8, FrameSrc[X]16, Temporal);
-FrameAnt[X] = ((PixelDst+0x107F)8);
-FrameDest[X]= ((PixelDst+0x10007FFF)16);
+long x, y;
+uint32_t pixel;
+
+for (y = 0; y  h; y++) {
+for (x = 0; x  w; x++) {
+pixel = lowpass(frame_ant[x]8, src[x]16, temporal);
+frame_ant[x] = ((pixel+0x107F)8);
+dst[x]= ((pixel+0x10007FFF)16);
 }
-FrameSrc  += sStride;
-FrameDest += dStride;
-FrameAnt += W;
+src += sstride;
+dst += dstride;
+frame_ant += w;
 }
 }
 
-static void deNoiseSpacial(unsigned char *Frame,
-   unsigned char *FrameDest,
-   unsigned int *LineAnt,
-   int W, int H, int sStride, int dStride,
-   int *Horizontal, int *Vertical)
+static void denoise_spatial(uint8_t *src, uint8_t *dst,
+uint32_t *line_ant,
+int w, int h, int sstride, int dstride,
+int *horizontal, int *vertical)
 {
-long X, Y;
-long sLineOffs = 0, dLineOffs = 0;
-unsigned int PixelAnt;
-unsigned int PixelDst;
+long x, y;
+long sline_offs = 0, dline_offs = 0;
+uint32_t pixel_ant;
+uint32_t pixel;
 
 /* First pixel has no left nor top neighbor. */
-PixelDst = LineAnt[0] = PixelAnt = Frame[0]16;
-FrameDest[0]= ((PixelDst+0x10007FFF)16);
+pixel = line_ant[0] = pixel_ant = src[0]16;
+dst[0]= ((pixel+0x10007FFF)16);
 
 /* First line has no top neighbor, only left. */
-for (X = 1; X  W; X++) {
-PixelDst = LineAnt[X] = LowPassMul(PixelAnt, Frame[X]16, Horizontal);
-FrameDest[X]= ((PixelDst+0x10007FFF)16);
+for (x = 1; x  w; x++) {
+pixel = line_ant[x] = lowpass(pixel_ant, src[x]16, horizontal);
+dst[x]= ((pixel+0x10007FFF)16);
 }
 
-for (Y = 1; Y  H; Y++) {
-unsigned int PixelAnt;
-sLineOffs += sStride, dLineOffs += dStride;
+for (y = 1; y  h; y++) {
+uint32_t pixel_ant;
+sline_offs += sstride, dline_offs += dstride;
 /* First pixel on each line doesn't have previous pixel */
-PixelAnt = Frame[sLineOffs]16;
-PixelDst = LineAnt[0] = LowPassMul(LineAnt[0], PixelAnt, Vertical);
-FrameDest[dLineOffs]= ((PixelDst+0x10007FFF)16);
+pixel_ant = src[sline_offs]16;
+pixel = line_ant[0] = lowpass(line_ant[0], pixel_ant, vertical);
+dst[dline_offs]= ((pixel+0x10007FFF)16);
 
-for (X = 1; X  W; X++) {
-unsigned int PixelDst;
+for (x = 1; x  w; x++) {
+uint32_t pixel;
 /* The rest are normal */
-PixelAnt = LowPassMul(PixelAnt, Frame[sLineOffs+X]16, 
Horizontal);
-PixelDst = LineAnt[X] = LowPassMul(LineAnt[X], PixelAnt, Vertical);
-FrameDest[dLineOffs+X]= ((PixelDst+0x10007FFF)16);
+pixel_ant = lowpass(pixel_ant, src[sline_offs+x]16, horizontal);
+pixel = line_ant[x] = lowpass(line_ant[x], pixel_ant, vertical);
+dst[dline_offs+x]= ((pixel+0x10007FFF)16);
 }
 }
 }
 
-static void deNoise(unsigned char *Frame,
-  

[libav-devel] [PATCH 3/7] vf_hqdn3d: simplify and optimize

2012-07-26 Thread Loren Merritt
14% faster on penryn, 2% on sandybridge, 9% on bulldozer
---
 libavfilter/vf_hqdn3d.c |  157 +++---
 1 files changed, 51 insertions(+), 106 deletions(-)

diff --git a/libavfilter/vf_hqdn3d.c b/libavfilter/vf_hqdn3d.c
index 1fa70c6..bb81b79 100644
--- a/libavfilter/vf_hqdn3d.c
+++ b/libavfilter/vf_hqdn3d.c
@@ -1,6 +1,7 @@
 /*
  * Copyright (c) 2003 Daniel Moreno comac AT comac DOT darktech DOT org
  * Copyright (c) 2010 Baptiste Coudurier
+ * Copyright (c) 2012 Loren Merritt
  *
  * This file is part of Libav, ported from MPlayer.
  *
@@ -51,13 +52,13 @@ static void denoise_temporal(uint8_t *src, uint8_t *dst,
  int *temporal)
 {
 long x, y;
-uint32_t pixel;
+uint32_t tmp;
 
 for (y = 0; y  h; y++) {
 for (x = 0; x  w; x++) {
-pixel = lowpass(frame_ant[x]8, src[x]16, temporal);
-frame_ant[x] = ((pixel+0x107F)8);
-dst[x]= ((pixel+0x10007FFF)16);
+tmp = lowpass(frame_ant[x]8, src[x]16, temporal);
+frame_ant[x] = (tmp+0x7F)8;
+dst[x] = (tmp+0x7FFF)16;
 }
 src += sstride;
 dst += dstride;
@@ -66,111 +67,66 @@ static void denoise_temporal(uint8_t *src, uint8_t *dst,
 }
 
 static void denoise_spatial(uint8_t *src, uint8_t *dst,
-uint32_t *line_ant,
+uint32_t *line_ant, uint16_t *frame_ant,
 int w, int h, int sstride, int dstride,
-int *horizontal, int *vertical)
+int *spatial, int *temporal)
 {
 long x, y;
-long sline_offs = 0, dline_offs = 0;
 uint32_t pixel_ant;
-uint32_t pixel;
+uint32_t tmp;
 
-/* First pixel has no left nor top neighbor. */
-pixel = line_ant[0] = pixel_ant = src[0]16;
-dst[0]= ((pixel+0x10007FFF)16);
-
-/* First line has no top neighbor, only left. */
-for (x = 1; x  w; x++) {
-pixel = line_ant[x] = lowpass(pixel_ant, src[x]16, horizontal);
-dst[x]= ((pixel+0x10007FFF)16);
+/* First line has no top neighbor. Only left one for each tmp and
+ * last frame */
+pixel_ant = src[0]16;
+for (x = 0; x  w; x++) {
+line_ant[x] = tmp = pixel_ant = lowpass(pixel_ant, src[x]16, 
spatial);
+tmp = lowpass(frame_ant[x]8, tmp, temporal);
+frame_ant[x] = (tmp+0x7F)8;
+dst[x] = (tmp+0x7FFF)16;
 }
 
 for (y = 1; y  h; y++) {
-uint32_t pixel_ant;
-sline_offs += sstride, dline_offs += dstride;
-/* First pixel on each line doesn't have previous pixel */
-pixel_ant = src[sline_offs]16;
-pixel = line_ant[0] = lowpass(line_ant[0], pixel_ant, vertical);
-dst[dline_offs]= ((pixel+0x10007FFF)16);
-
-for (x = 1; x  w; x++) {
-uint32_t pixel;
-/* The rest are normal */
-pixel_ant = lowpass(pixel_ant, src[sline_offs+x]16, horizontal);
-pixel = line_ant[x] = lowpass(line_ant[x], pixel_ant, vertical);
-dst[dline_offs+x]= ((pixel+0x10007FFF)16);
+src += sstride;
+dst += dstride;
+frame_ant += w;
+pixel_ant = src[0]16;
+for (x = 0; x  w-1; x++) {
+line_ant[x] = tmp = lowpass(line_ant[x], pixel_ant, spatial);
+pixel_ant = lowpass(pixel_ant, src[x+1]16, spatial);
+tmp = lowpass(frame_ant[x]8, tmp, temporal);
+frame_ant[x] = (tmp+0x7F)8;
+dst[x] = (tmp+0x7FFF)16;
 }
+line_ant[x] = tmp = lowpass(line_ant[x], pixel_ant, spatial);
+tmp = lowpass(frame_ant[x]8, tmp, temporal);
+frame_ant[x] = (tmp+0x7F)8;
+dst[x] = (tmp+0x7FFF)16;
 }
 }
 
 static void denoise(uint8_t *src, uint8_t *dst,
 uint32_t *line_ant, uint16_t **frame_ant_ptr,
 int w, int h, int sstride, int dstride,
-int *horizontal, int *vertical, int *temporal)
+int *spatial, int *temporal)
 {
 long x, y;
-long sline_offs = 0, dline_offs = 0;
-uint32_t pixel_ant;
-uint32_t pixel;
 uint16_t *frame_ant = *frame_ant_ptr;
-
 if (!frame_ant) {
+uint8_t *frame_src = src;
 *frame_ant_ptr = frame_ant = av_malloc(w*h*sizeof(uint16_t));
-for (y = 0; y  h; y++) {
-uint16_t *frame_dst = frame_ant+y*w;
-uint8_t *frame_src = src+y*sstride;
+for (y = 0; y  h; y++, src += sstride, frame_ant += w)
 for (x = 0; x  w; x++)
-frame_dst[x] = frame_src[x]8;
-}
+frame_ant[x] = src[x]8;
+src = frame_src;
+frame_ant = *frame_ant_ptr;
 }
 
-if (!horizontal[0]  !vertical[0]) {
+if (spatial[0])
+denoise_spatial(src, dst, line_ant, frame_ant,
+w, h, sstride, dstride, spatial, temporal);
+else
 denoise_temporal(src, dst, frame_ant

[libav-devel] [PATCH 5/7] vf_hqdn3d: support 10bit colordepth

2012-07-26 Thread Loren Merritt
---
 libavfilter/vf_hqdn3d.c |   68 +-
 1 files changed, 49 insertions(+), 19 deletions(-)

diff --git a/libavfilter/vf_hqdn3d.c b/libavfilter/vf_hqdn3d.c
index 46902fa..02f0cd1 100644
--- a/libavfilter/vf_hqdn3d.c
+++ b/libavfilter/vf_hqdn3d.c
@@ -27,6 +27,7 @@
  */
 
 #include "libavutil/pixdesc.h"
+#include "libavutil/intreadwrite.h"
 #include "avfilter.h"
 #include "formats.h"
 #include "internal.h"
@@ -37,18 +38,25 @@ typedef struct {
 uint16_t *line;
 uint16_t *frame_prev[3];
 int hsub, vsub;
+int depth;
 } HQDN3DContext;
 
+#define RIGHTSHIFT(a,b) (((a)+(((1<<(b))-1)>>1))>>(b))
+#define LOAD(x) ((depth==8 ? src[x] : AV_RN16A(src+(x)*2)) << (16-depth))
+#define STORE(x,val) (depth==8 ? dst[x] = RIGHTSHIFT(val, 16-depth)\
+: AV_WN16A(dst+(x)*2, RIGHTSHIFT(val, 16-depth)))
+
 static inline uint32_t lowpass(int prev, int cur, int16_t *coef)
 {
 int d = (prev-cur)>>4;
 return cur + coef[d];
 }
 
+av_always_inline
 static void denoise_temporal(uint8_t *src, uint8_t *dst,
  uint16_t *frame_ant,
  int w, int h, int sstride, int dstride,
- int16_t *temporal)
+ int16_t *temporal, int depth)
 {
 long x, y;
 uint32_t tmp;
@@ -57,8 +65,8 @@ static void denoise_temporal(uint8_t *src, uint8_t *dst,
 
 for (y = 0; y  h; y++) {
 for (x = 0; x  w; x++) {
-frame_ant[x] = tmp = lowpass(frame_ant[x], src[x]8, temporal);
-dst[x] = (tmp+0x7F)8;
+frame_ant[x] = tmp = lowpass(frame_ant[x], LOAD(x), temporal);
+STORE(x, tmp);
 }
 src += sstride;
 dst += dstride;
@@ -66,10 +74,11 @@ static void denoise_temporal(uint8_t *src, uint8_t *dst,
 }
 }
 
+av_always_inline
 static void denoise_spatial(uint8_t *src, uint8_t *dst,
 uint16_t *line_ant, uint16_t *frame_ant,
 int w, int h, int sstride, int dstride,
-int16_t *spatial, int16_t *temporal)
+int16_t *spatial, int16_t *temporal, int depth)
 {
 long x, y;
 uint32_t pixel_ant;
@@ -80,34 +89,35 @@ static void denoise_spatial(uint8_t *src, uint8_t *dst,
 
 /* First line has no top neighbor. Only left one for each tmp and
  * last frame */
-pixel_ant = src[0]8;
+pixel_ant = LOAD(0);
 for (x = 0; x  w; x++) {
-line_ant[x] = tmp = pixel_ant = lowpass(pixel_ant, src[x]8, spatial);
+line_ant[x] = tmp = pixel_ant = lowpass(pixel_ant, LOAD(x), spatial);
 frame_ant[x] = tmp = lowpass(frame_ant[x], tmp, temporal);
-dst[x] = (tmp+0x7F)8;
+STORE(x, tmp);
 }
 
 for (y = 1; y  h; y++) {
 src += sstride;
 dst += dstride;
 frame_ant += w;
-pixel_ant = src[0]8;
+pixel_ant = LOAD(0);
 for (x = 0; x  w-1; x++) {
 line_ant[x] = tmp = lowpass(line_ant[x], pixel_ant, spatial);
-pixel_ant = lowpass(pixel_ant, src[x+1]8, spatial);
+pixel_ant = lowpass(pixel_ant, LOAD(x+1), spatial);
 frame_ant[x] = tmp = lowpass(frame_ant[x], tmp, temporal);
-dst[x] = (tmp+0x7F)8;
+STORE(x, tmp);
 }
 line_ant[x] = tmp = lowpass(line_ant[x], pixel_ant, spatial);
 frame_ant[x] = tmp = lowpass(frame_ant[x], tmp, temporal);
-dst[x] = (tmp+0x7F)8;
+STORE(x, tmp);
 }
 }
 
-static void denoise(uint8_t *src, uint8_t *dst,
-uint16_t *line_ant, uint16_t **frame_ant_ptr,
-int w, int h, int sstride, int dstride,
-int16_t *spatial, int16_t *temporal)
+av_always_inline
+static void denoise_depth(uint8_t *src, uint8_t *dst,
+  uint16_t *line_ant, uint16_t **frame_ant_ptr,
+  int w, int h, int sstride, int dstride,
+  int16_t *spatial, int16_t *temporal, int depth)
 {
 long x, y;
 uint16_t *frame_ant = *frame_ant_ptr;
@@ -116,19 +126,25 @@ static void denoise(uint8_t *src, uint8_t *dst,
 *frame_ant_ptr = frame_ant = av_malloc(w*h*sizeof(uint16_t));
 for (y = 0; y  h; y++, src += sstride, frame_ant += w)
 for (x = 0; x  w; x++)
-frame_ant[x] = src[x]8;
+frame_ant[x] = LOAD(x);
 src = frame_src;
 frame_ant = *frame_ant_ptr;
 }
 
 if (spatial[0])
 denoise_spatial(src, dst, line_ant, frame_ant,
-w, h, sstride, dstride, spatial, temporal);
+w, h, sstride, dstride, spatial, temporal, depth);
 else
 denoise_temporal(src, dst, frame_ant,
- w, h, sstride, dstride, temporal);
+ w, h, sstride, dstride, temporal, depth);
 }
 
+#define denoise(...) \
+switch (hqdn3d->depth) {\
+case  8: 

[libav-devel] [PATCH 6/7] vf_hqdn3d: support 16bit colordepth

2012-07-26 Thread Loren Merritt
---
 libavfilter/vf_hqdn3d.c |   70 +++
 1 files changed, 46 insertions(+), 24 deletions(-)

diff --git a/libavfilter/vf_hqdn3d.c b/libavfilter/vf_hqdn3d.c
index 02f0cd1..521fef8 100644
--- a/libavfilter/vf_hqdn3d.c
+++ b/libavfilter/vf_hqdn3d.c
@@ -34,21 +34,24 @@
 #include video.h
 
 typedef struct {
-int16_t coefs[4][512*16];
+int16_t *coefs[4];
 uint16_t *line;
 uint16_t *frame_prev[3];
+double strength[4];
 int hsub, vsub;
 int depth;
 } HQDN3DContext;
 
+#define LUT_BITS (depth==16 ? 8 : 4)
 #define RIGHTSHIFT(a,b) (((a)+(((1<<(b))-1)>>1))>>(b))
 #define LOAD(x) ((depth==8 ? src[x] : AV_RN16A(src+(x)*2)) << (16-depth))
 #define STORE(x,val) (depth==8 ? dst[x] = RIGHTSHIFT(val, 16-depth)\
 : AV_WN16A(dst+(x)*2, RIGHTSHIFT(val, 16-depth)))
 
-static inline uint32_t lowpass(int prev, int cur, int16_t *coef)
+av_always_inline
+static inline uint32_t lowpass(int prev, int cur, int16_t *coef, int depth)
 {
-int d = (prev-cur)>>4;
+int d = (prev - cur) >> (8 - LUT_BITS);
 return cur + coef[d];
 }
 
@@ -61,11 +64,11 @@ static void denoise_temporal(uint8_t *src, uint8_t *dst,
 long x, y;
 uint32_t tmp;
 
-temporal += 0x1000;
+temporal += 256 << LUT_BITS;
 
 for (y = 0; y  h; y++) {
 for (x = 0; x  w; x++) {
-frame_ant[x] = tmp = lowpass(frame_ant[x], LOAD(x), temporal);
+frame_ant[x] = tmp = lowpass(frame_ant[x], LOAD(x), temporal, 
depth);
 STORE(x, tmp);
 }
 src += sstride;
@@ -84,15 +87,15 @@ static void denoise_spatial(uint8_t *src, uint8_t *dst,
 uint32_t pixel_ant;
 uint32_t tmp;
 
-spatial  += 0x1000;
-temporal += 0x1000;
+spatial  += 256 << LUT_BITS;
+temporal += 256 << LUT_BITS;
 
 /* First line has no top neighbor. Only left one for each tmp and
  * last frame */
 pixel_ant = LOAD(0);
 for (x = 0; x  w; x++) {
-line_ant[x] = tmp = pixel_ant = lowpass(pixel_ant, LOAD(x), spatial);
-frame_ant[x] = tmp = lowpass(frame_ant[x], tmp, temporal);
+line_ant[x] = tmp = pixel_ant = lowpass(pixel_ant, LOAD(x), spatial, 
depth);
+frame_ant[x] = tmp = lowpass(frame_ant[x], tmp, temporal, depth);
 STORE(x, tmp);
 }
 
@@ -102,13 +105,13 @@ static void denoise_spatial(uint8_t *src, uint8_t *dst,
 frame_ant += w;
 pixel_ant = LOAD(0);
 for (x = 0; x  w-1; x++) {
-line_ant[x] = tmp = lowpass(line_ant[x], pixel_ant, spatial);
-pixel_ant = lowpass(pixel_ant, LOAD(x+1), spatial);
-frame_ant[x] = tmp = lowpass(frame_ant[x], tmp, temporal);
+line_ant[x] = tmp = lowpass(line_ant[x], pixel_ant, spatial, 
depth);
+pixel_ant = lowpass(pixel_ant, LOAD(x+1), spatial, depth);
+frame_ant[x] = tmp = lowpass(frame_ant[x], tmp, temporal, depth);
 STORE(x, tmp);
 }
-line_ant[x] = tmp = lowpass(line_ant[x], pixel_ant, spatial);
-frame_ant[x] = tmp = lowpass(frame_ant[x], tmp, temporal);
+line_ant[x] = tmp = lowpass(line_ant[x], pixel_ant, spatial, depth);
+frame_ant[x] = tmp = lowpass(frame_ant[x], tmp, temporal, depth);
 STORE(x, tmp);
 }
 }
@@ -119,6 +122,8 @@ static void denoise_depth(uint8_t *src, uint8_t *dst,
   int w, int h, int sstride, int dstride,
   int16_t *spatial, int16_t *temporal, int depth)
 {
+// FIXME: For 16bit depth, frame_ant could be a pointer to the previous
+// filtered frame rather than a separate buffer.
 long x, y;
 uint16_t *frame_ant = *frame_ant_ptr;
 if (!frame_ant) {
@@ -143,24 +148,26 @@ static void denoise_depth(uint8_t *src, uint8_t *dst,
 switch (hqdn3d->depth) {\
 case  8: denoise_depth(__VA_ARGS__,  8); break;\
 case 10: denoise_depth(__VA_ARGS__, 10); break;\
+case 16: denoise_depth(__VA_ARGS__, 16); break;\
 }
 
-static void precalc_coefs(int16_t *ct, double dist25)
+static int16_t *precalc_coefs(double dist25, int depth)
 {
 int i;
 double gamma, simil, C;
+int16_t *ct = av_malloc((512<<LUT_BITS)*sizeof(int16_t));
 
 gamma = log(0.25) / log(1.0 - FFMIN(dist25,252.0)/255.0 - 0.1);
 
-for (i = -255*16; i <= 255*16; i++) {
-// lowpass() truncates (not rounds) the diff, so +15/32 for the 
midpoint of the bin.
-double f = (i + 15.0/32.0) / 16.0;
+for (i = -255<<LUT_BITS; i <= 255<<LUT_BITS; i++) {
+double f = ((i<<(9-LUT_BITS)) + (1<<(8-LUT_BITS)) - 1) / 512.0; // 
midpoint of the bin
 simil = 1.0 - FFABS(f) / 255.0;
 C = pow(simil, gamma) * 256.0 * f;
-ct[16*256+i] = lrint(C);
+ct[(256<<LUT_BITS)+i] = lrint(C);
 }
 
 ct[0] = !!dist25;
+return ct;
 }
 
 #define PARAM1_DEFAULT 4.0
@@ -208,6 +215,11 @@ static int init(AVFilterContext *ctx, const char *args)
 }
 }
 
+hqdn3d-strength[0] = lum_spac;
+

[libav-devel] [PATCH 7/7] vf_hqdn3d: x86 asm

2012-07-26 Thread Loren Merritt
13% faster on penryn, 16% on sandybridge, 15% on bulldozer
Not simd; a compiler should have generated this, but gcc didn't.
---
 libavfilter/vf_hqdn3d.c|   25 +--
 libavfilter/x86/Makefile   |1 +
 libavfilter/x86/hqdn3d.asm |  105 
 libavutil/x86/x86inc.asm   |1 +
 4 files changed, 128 insertions(+), 4 deletions(-)
 create mode 100644 libavfilter/x86/hqdn3d.asm

diff --git a/libavfilter/vf_hqdn3d.c b/libavfilter/vf_hqdn3d.c
index 521fef8..98529ce 100644
--- a/libavfilter/vf_hqdn3d.c
+++ b/libavfilter/vf_hqdn3d.c
@@ -40,8 +40,13 @@ typedef struct {
 double strength[4];
 int hsub, vsub;
 int depth;
+void (*denoise_row[17])(uint8_t *src, uint8_t *dst, uint16_t *line_ant, 
uint16_t *frame_ant, ptrdiff_t w, int16_t *spatial, int16_t *temporal);
 } HQDN3DContext;
 
+void ff_hqdn3d_row_8_x86(uint8_t *src, uint8_t *dst, uint16_t *line_ant, 
uint16_t *frame_ant, ptrdiff_t w, int16_t *spatial, int16_t *temporal);
+void ff_hqdn3d_row_10_x86(uint8_t *src, uint8_t *dst, uint16_t *line_ant, 
uint16_t *frame_ant, ptrdiff_t w, int16_t *spatial, int16_t *temporal);
+void ff_hqdn3d_row_16_x86(uint8_t *src, uint8_t *dst, uint16_t *line_ant, 
uint16_t *frame_ant, ptrdiff_t w, int16_t *spatial, int16_t *temporal);
+
 #define LUT_BITS (depth==16 ? 8 : 4)
 #define RIGHTSHIFT(a,b) (((a)+(((1(b))-1)1))(b))
 #define LOAD(x) ((depth==8 ? src[x] : AV_RN16A(src+(x)*2))  (16-depth))
@@ -78,7 +83,8 @@ static void denoise_temporal(uint8_t *src, uint8_t *dst,
 }
 
 av_always_inline
-static void denoise_spatial(uint8_t *src, uint8_t *dst,
+static void denoise_spatial(HQDN3DContext *hqdn3d,
+uint8_t *src, uint8_t *dst,
 uint16_t *line_ant, uint16_t *frame_ant,
 int w, int h, int sstride, int dstride,
 int16_t *spatial, int16_t *temporal, int depth)
@@ -103,6 +109,10 @@ static void denoise_spatial(uint8_t *src, uint8_t *dst,
 src += sstride;
 dst += dstride;
 frame_ant += w;
+if (hqdn3d->denoise_row[depth]) {
+hqdn3d->denoise_row[depth](src, dst, line_ant, frame_ant, w, 
spatial, temporal);
+continue;
+}
 pixel_ant = LOAD(0);
 for (x = 0; x  w-1; x++) {
 line_ant[x] = tmp = lowpass(line_ant[x], pixel_ant, spatial, 
depth);
@@ -117,7 +127,8 @@ static void denoise_spatial(uint8_t *src, uint8_t *dst,
 }
 
 av_always_inline
-static void denoise_depth(uint8_t *src, uint8_t *dst,
+static void denoise_depth(HQDN3DContext *hqdn3d,
+  uint8_t *src, uint8_t *dst,
   uint16_t *line_ant, uint16_t **frame_ant_ptr,
   int w, int h, int sstride, int dstride,
   int16_t *spatial, int16_t *temporal, int depth)
@@ -137,7 +148,7 @@ static void denoise_depth(uint8_t *src, uint8_t *dst,
 }
 
 if (spatial[0])
-denoise_spatial(src, dst, line_ant, frame_ant,
+denoise_spatial(hqdn3d, src, dst, line_ant, frame_ant,
 w, h, sstride, dstride, spatial, temporal, depth);
 else
 denoise_temporal(src, dst, frame_ant,
@@ -293,6 +304,12 @@ static int config_input(AVFilterLink *inlink)
 }
 }
 
+#if HAVE_YASM
+hqdn3d-denoise_row[ 8] = ff_hqdn3d_row_8_x86;
+hqdn3d-denoise_row[10] = ff_hqdn3d_row_10_x86;
+hqdn3d-denoise_row[16] = ff_hqdn3d_row_16_x86;
+#endif
+
 return 0;
 }
 
@@ -310,7 +327,7 @@ static int end_frame(AVFilterLink *inlink)
 int ret, c;
 
 for (c = 0; c  3; c++) {
-denoise(inpic->data[c], outpic->data[c],
+denoise(hqdn3d, inpic->data[c], outpic->data[c],
 hqdn3d->line, hqdn3d->frame_prev[c],
 inpic->video->w >> (!!c * hqdn3d->hsub),
 inpic->video->h >> (!!c * hqdn3d->vsub),
diff --git a/libavfilter/x86/Makefile b/libavfilter/x86/Makefile
index e98693d..46fc84f 100644
--- a/libavfilter/x86/Makefile
+++ b/libavfilter/x86/Makefile
@@ -1,2 +1,3 @@
 MMX-OBJS-$(CONFIG_YADIF_FILTER)  += x86/yadif.o
 MMX-OBJS-$(CONFIG_GRADFUN_FILTER)+= x86/gradfun.o
+YASM-OBJS-$(CONFIG_HQDN3D_FILTER)+= x86/hqdn3d.o
diff --git a/libavfilter/x86/hqdn3d.asm b/libavfilter/x86/hqdn3d.asm
new file mode 100644
index 000..c15c119
--- /dev/null
+++ b/libavfilter/x86/hqdn3d.asm
@@ -0,0 +1,105 @@
+;**
+;* Copyright (c) 2012 Loren Merritt
+;*
+;* This file is part of Libav.
+;*
+;* Libav is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* Libav is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty

[libav-devel] [PATCH 4/7] vf_hqdn3d: reduce intermediate precision

2012-07-26 Thread Loren Merritt
11% faster on penryn, 7% on sandybridge, 5% on bulldozer
Negligible change to output.
---
 libavfilter/vf_hqdn3d.c |   62 --
 1 files changed, 32 insertions(+), 30 deletions(-)

diff --git a/libavfilter/vf_hqdn3d.c b/libavfilter/vf_hqdn3d.c
index bb81b79..46902fa 100644
--- a/libavfilter/vf_hqdn3d.c
+++ b/libavfilter/vf_hqdn3d.c
@@ -33,32 +33,32 @@
 #include video.h
 
 typedef struct {
-int coefs[4][512*16];
-uint32_t *line;
+int16_t coefs[4][512*16];
+uint16_t *line;
 uint16_t *frame_prev[3];
 int hsub, vsub;
 } HQDN3DContext;
 
-static inline uint32_t lowpass(unsigned int prev, unsigned int cur, int *coef)
+static inline uint32_t lowpass(int prev, int cur, int16_t *coef)
 {
-int dmul = prev-cur;
-unsigned int d = (dmul+0x10007FF)12; // 0x1000 to convert to unsigned, 
7FF for rounding
+int d = (prev-cur)4;
 return cur + coef[d];
 }
 
 static void denoise_temporal(uint8_t *src, uint8_t *dst,
  uint16_t *frame_ant,
  int w, int h, int sstride, int dstride,
- int *temporal)
+ int16_t *temporal)
 {
 long x, y;
 uint32_t tmp;
 
+temporal += 0x1000;
+
 for (y = 0; y  h; y++) {
 for (x = 0; x  w; x++) {
-tmp = lowpass(frame_ant[x]<<8, src[x]<<16, temporal);
-frame_ant[x] = (tmp+0x7F)>>8;
-dst[x] = (tmp+0x7FFF)>>16;
+frame_ant[x] = tmp = lowpass(frame_ant[x], src[x]<<8, temporal);
+dst[x] = (tmp+0x7F)>>8;
 }
 src += sstride;
 dst += dstride;
@@ -67,47 +67,47 @@ static void denoise_temporal(uint8_t *src, uint8_t *dst,
 }
 
 static void denoise_spatial(uint8_t *src, uint8_t *dst,
-uint32_t *line_ant, uint16_t *frame_ant,
+uint16_t *line_ant, uint16_t *frame_ant,
 int w, int h, int sstride, int dstride,
-int *spatial, int *temporal)
+int16_t *spatial, int16_t *temporal)
 {
 long x, y;
 uint32_t pixel_ant;
 uint32_t tmp;
 
+spatial  += 0x1000;
+temporal += 0x1000;
+
 /* First line has no top neighbor. Only left one for each tmp and
  * last frame */
-pixel_ant = src[0]16;
+pixel_ant = src[0]8;
 for (x = 0; x  w; x++) {
-line_ant[x] = tmp = pixel_ant = lowpass(pixel_ant, src[x]16, 
spatial);
-tmp = lowpass(frame_ant[x]8, tmp, temporal);
-frame_ant[x] = (tmp+0x7F)8;
-dst[x] = (tmp+0x7FFF)16;
+line_ant[x] = tmp = pixel_ant = lowpass(pixel_ant, src[x]8, spatial);
+frame_ant[x] = tmp = lowpass(frame_ant[x], tmp, temporal);
+dst[x] = (tmp+0x7F)8;
 }
 
 for (y = 1; y  h; y++) {
 src += sstride;
 dst += dstride;
 frame_ant += w;
-pixel_ant = src[0]16;
+pixel_ant = src[0]8;
 for (x = 0; x  w-1; x++) {
 line_ant[x] = tmp = lowpass(line_ant[x], pixel_ant, spatial);
-pixel_ant = lowpass(pixel_ant, src[x+1]16, spatial);
-tmp = lowpass(frame_ant[x]8, tmp, temporal);
-frame_ant[x] = (tmp+0x7F)8;
-dst[x] = (tmp+0x7FFF)16;
+pixel_ant = lowpass(pixel_ant, src[x+1]8, spatial);
+frame_ant[x] = tmp = lowpass(frame_ant[x], tmp, temporal);
+dst[x] = (tmp+0x7F)8;
 }
 line_ant[x] = tmp = lowpass(line_ant[x], pixel_ant, spatial);
-tmp = lowpass(frame_ant[x]8, tmp, temporal);
-frame_ant[x] = (tmp+0x7F)8;
-dst[x] = (tmp+0x7FFF)16;
+frame_ant[x] = tmp = lowpass(frame_ant[x], tmp, temporal);
+dst[x] = (tmp+0x7F)8;
 }
 }
 
 static void denoise(uint8_t *src, uint8_t *dst,
-uint32_t *line_ant, uint16_t **frame_ant_ptr,
+uint16_t *line_ant, uint16_t **frame_ant_ptr,
 int w, int h, int sstride, int dstride,
-int *spatial, int *temporal)
+int16_t *spatial, int16_t *temporal)
 {
 long x, y;
 uint16_t *frame_ant = *frame_ant_ptr;
@@ -129,16 +129,18 @@ static void denoise(uint8_t *src, uint8_t *dst,
  w, h, sstride, dstride, temporal);
 }
 
-static void precalc_coefs(int *ct, double dist25)
+static void precalc_coefs(int16_t *ct, double dist25)
 {
 int i;
 double gamma, simil, C;
 
-gamma = log(0.25) / log(1.0 - dist25/255.0 - 0.1);
+gamma = log(0.25) / log(1.0 - FFMIN(dist25,252.0)/255.0 - 0.1);
 
 for (i = -255*16; i = 255*16; i++) {
-simil = 1.0 - FFABS(i) / (16*255.0);
-C = pow(simil, gamma) * 65536.0 * i / 16.0;
+// lowpass() truncates (not rounds) the diff, so +15/32 for the 
midpoint of the bin.
+double f = (i + 15.0/32.0) / 16.0;
+simil = 1.0 - FFABS(f) / 255.0;
+C = pow(simil, gamma) * 256.0 * f;
 

Re: [libav-devel] [PATCH 3/7] vf_hqdn3d: simplify and optimize

2012-07-26 Thread Loren Merritt
On Thu, 26 Jul 2012, Ronald S. Bultje wrote:

 Hi,

 On Thu, Jul 26, 2012 at 3:51 PM, Loren Merritt lor...@u.washington.edu 
 wrote:
  14% faster on penryn, 2% on sandybridge, 9% on bulldozer
  ---
   libavfilter/vf_hqdn3d.c |  157 
  +++---
   1 files changed, 51 insertions(+), 106 deletions(-)

 Looks good.

 I am going to ask a very stupid question: why is this faster? I see a
 lot of simplification, which is good, but I'm not quite sure which
 part actually has a clear speed impact.

Old code's sline_offs and dline_offs confused gcc into incrementing the
src and dst pointers rather than using x as an index reg.

Old code did horizontal(x), vertical(x), temporal(x). There's a dependency
chain between those 3 filters, so you need to interleave multiple loop
iterations to get maximum throughput. OOE might theoretically handle that,
but doesn't do so perfectly on the CPUs I tested.
New code does vertical(x), horizontal(x+1), temporal(x); which requires
less OOE.

--Loren Merritt
___
libav-devel mailing list
libav-devel@libav.org
https://lists.libav.org/mailman/listinfo/libav-devel


[libav-devel] [PATCH 5/7] vf_hqdn3d: support 9 and 10bit colordepth

2012-07-26 Thread Loren Merritt
---
 libavfilter/vf_hqdn3d.c |   72 ++
 1 files changed, 53 insertions(+), 19 deletions(-)

diff --git a/libavfilter/vf_hqdn3d.c b/libavfilter/vf_hqdn3d.c
index 46902fa..138b527 100644
--- a/libavfilter/vf_hqdn3d.c
+++ b/libavfilter/vf_hqdn3d.c
@@ -27,6 +27,7 @@
  */
 
 #include libavutil/pixdesc.h
+#include libavutil/intreadwrite.h
 #include avfilter.h
 #include formats.h
 #include internal.h
@@ -37,18 +38,25 @@ typedef struct {
 uint16_t *line;
 uint16_t *frame_prev[3];
 int hsub, vsub;
+int depth;
 } HQDN3DContext;
 
+#define RIGHTSHIFT(a,b) (((a)+(((1(b))-1)1))(b))
+#define LOAD(x) ((depth==8 ? src[x] : AV_RN16A(src+(x)*2))  (16-depth))
+#define STORE(x,val) (depth==8 ? dst[x] = RIGHTSHIFT(val, 16-depth)\
+: AV_WN16A(dst+(x)*2, RIGHTSHIFT(val, 16-depth)))
+
 static inline uint32_t lowpass(int prev, int cur, int16_t *coef)
 {
 int d = (prev-cur)4;
 return cur + coef[d];
 }
 
+av_always_inline
 static void denoise_temporal(uint8_t *src, uint8_t *dst,
  uint16_t *frame_ant,
  int w, int h, int sstride, int dstride,
- int16_t *temporal)
+ int16_t *temporal, int depth)
 {
 long x, y;
 uint32_t tmp;
@@ -57,8 +65,8 @@ static void denoise_temporal(uint8_t *src, uint8_t *dst,
 
 for (y = 0; y  h; y++) {
 for (x = 0; x  w; x++) {
-frame_ant[x] = tmp = lowpass(frame_ant[x], src[x]8, temporal);
-dst[x] = (tmp+0x7F)8;
+frame_ant[x] = tmp = lowpass(frame_ant[x], LOAD(x), temporal);
+STORE(x, tmp);
 }
 src += sstride;
 dst += dstride;
@@ -66,10 +74,11 @@ static void denoise_temporal(uint8_t *src, uint8_t *dst,
 }
 }
 
+av_always_inline
 static void denoise_spatial(uint8_t *src, uint8_t *dst,
 uint16_t *line_ant, uint16_t *frame_ant,
 int w, int h, int sstride, int dstride,
-int16_t *spatial, int16_t *temporal)
+int16_t *spatial, int16_t *temporal, int depth)
 {
 long x, y;
 uint32_t pixel_ant;
@@ -80,34 +89,35 @@ static void denoise_spatial(uint8_t *src, uint8_t *dst,
 
 /* First line has no top neighbor. Only left one for each tmp and
  * last frame */
-pixel_ant = src[0]8;
+pixel_ant = LOAD(0);
 for (x = 0; x  w; x++) {
-line_ant[x] = tmp = pixel_ant = lowpass(pixel_ant, src[x]8, spatial);
+line_ant[x] = tmp = pixel_ant = lowpass(pixel_ant, LOAD(x), spatial);
 frame_ant[x] = tmp = lowpass(frame_ant[x], tmp, temporal);
-dst[x] = (tmp+0x7F)8;
+STORE(x, tmp);
 }
 
 for (y = 1; y  h; y++) {
 src += sstride;
 dst += dstride;
 frame_ant += w;
-pixel_ant = src[0]8;
+pixel_ant = LOAD(0);
 for (x = 0; x  w-1; x++) {
 line_ant[x] = tmp = lowpass(line_ant[x], pixel_ant, spatial);
-pixel_ant = lowpass(pixel_ant, src[x+1]8, spatial);
+pixel_ant = lowpass(pixel_ant, LOAD(x+1), spatial);
 frame_ant[x] = tmp = lowpass(frame_ant[x], tmp, temporal);
-dst[x] = (tmp+0x7F)8;
+STORE(x, tmp);
 }
 line_ant[x] = tmp = lowpass(line_ant[x], pixel_ant, spatial);
 frame_ant[x] = tmp = lowpass(frame_ant[x], tmp, temporal);
-dst[x] = (tmp+0x7F)8;
+STORE(x, tmp);
 }
 }
 
-static void denoise(uint8_t *src, uint8_t *dst,
-uint16_t *line_ant, uint16_t **frame_ant_ptr,
-int w, int h, int sstride, int dstride,
-int16_t *spatial, int16_t *temporal)
+av_always_inline
+static void denoise_depth(uint8_t *src, uint8_t *dst,
+  uint16_t *line_ant, uint16_t **frame_ant_ptr,
+  int w, int h, int sstride, int dstride,
+  int16_t *spatial, int16_t *temporal, int depth)
 {
 long x, y;
 uint16_t *frame_ant = *frame_ant_ptr;
@@ -116,19 +126,26 @@ static void denoise(uint8_t *src, uint8_t *dst,
 *frame_ant_ptr = frame_ant = av_malloc(w*h*sizeof(uint16_t));
 for (y = 0; y  h; y++, src += sstride, frame_ant += w)
 for (x = 0; x  w; x++)
-frame_ant[x] = src[x]8;
+frame_ant[x] = LOAD(x);
 src = frame_src;
 frame_ant = *frame_ant_ptr;
 }
 
 if (spatial[0])
 denoise_spatial(src, dst, line_ant, frame_ant,
-w, h, sstride, dstride, spatial, temporal);
+w, h, sstride, dstride, spatial, temporal, depth);
 else
 denoise_temporal(src, dst, frame_ant,
- w, h, sstride, dstride, temporal);
+ w, h, sstride, dstride, temporal, depth);
 }
 
+#define denoise(...) \
+switch (hqdn3d->depth) {\
+case  8: 

Re: [libav-devel] [PATCH 2/2] lavr: add x86-optimized mixing functions

2012-07-25 Thread Loren Merritt
On Tue, 24 Jul 2012, Justin Ruggles wrote:

 Adds optimized functions for mixing 3 through 8 input channels to 1 and 2
 output channels in fltp or s16p format with flt coeffs.
 ---
  libavresample/utils.c  |7 +-
  libavresample/x86/audio_mix.asm|  293 
 
  libavresample/x86/audio_mix_init.c |  130 
  3 files changed, 427 insertions(+), 3 deletions(-)

LGTM.

--Loren Merritt
___
libav-devel mailing list
libav-devel@libav.org
https://lists.libav.org/mailman/listinfo/libav-devel


Re: [libav-devel] [libav-commits] x86: fft: convert sse inline asm to yasm

2012-07-25 Thread Loren Merritt
On Wed, 25 Jul 2012, Luca Barbato wrote:
 On 07/25/2012 07:40 AM, Jason Garrett-Glaser wrote:

 Do the x264 functions sign-extend all their integer arguments? Or put
 differently, does the problem occur for 32-bit builds also, or only
 for 64-bit builds?

 Yes they do, and such a problem wouldn't target solely _avx functions;
 that wouldn't make any sense.

 How about start by checking for a missing vzeroupper?

 Is there a simple way to do that? We could fix that part sooner than later.

Done.
... And it does find a hit, ff_mix_1_to_2_fltp_flt_avx.

--Loren Merritt#!/usr/bin/perl -w
$exe = $ARGV[0];
@ARGV==1 and -f $exe and -x $exe or die
"usage: missing_vzeroupper.pl avconv\n".
"Finds functions that use ymm and fail to reset the ymm state with vzeroupper,\n".
"which thus incur a large speed penalty when mixed with non-avx xmm.\n";

open FH, "-|", "objdump", "-d", "-M", "intel", $exe or die "failed to run objdump: $!\n";
@funcs = split /^[0-9a-f]+ <([^. ]+)>:/m, join '', <FH>;
close FH or die;

# The fft functions are ok because the vzeroupper happens in fft_dispatch.
$exceptions = qr/^_?(fft\d+|pass)_/;

$err = 0;
shift @funcs;
while(@funcs) {
my $funcname = shift @funcs;
my $asm = shift @funcs;
if($asm =~ /\bymm\d/ and $asm !~ /\bvzeroupper\b/ and $funcname !~ $exceptions) {
print $funcname\n;
$err = 1;
}
}
exit $err;
___
libav-devel mailing list
libav-devel@libav.org
https://lists.libav.org/mailman/listinfo/libav-devel


Re: [libav-devel] [PATCH 14/15] lavr: x86: optimized 2-channel flt to fltp conversion

2012-07-25 Thread Loren Merritt
On Wed, 25 Jul 2012, Justin Ruggles wrote:

 On 07/25/2012 01:03 AM, Ronald S. Bultje wrote:

 (Can this be implemented in YMM with the current instructions available?)

 Not really sure if it would help.
 Maybe something like this?

 vpermilps m0, [srcq+2*lenq], q0213
 vextractf128  m1, m0, 1
 SBUTTERFLY2  qdq, m0, m1, m2
 movdqa  [dst0q+lenq], m0
 movdqa  [dst1q+lenq], m1

 Is that even worth testing?

No. This function should be memory-bound. And 32byte memory ops aren't any
higher thoughput than 16byte memory ops unless they come in the ratio of
2 loads per 1 store.

--Loren Merritt
___
libav-devel mailing list
libav-devel@libav.org
https://lists.libav.org/mailman/listinfo/libav-devel


Re: [libav-devel] [PATCH] x86inc: automatically insert vzeroupper for YMM functions.

2012-07-25 Thread Loren Merritt
On Wed, 25 Jul 2012, Ronald S. Bultje wrote:

 diff --git a/libavutil/x86/x86inc.asm b/libavutil/x86/x86inc.asm
 index 42ba97a..4b523e9 100644
 --- a/libavutil/x86/x86inc.asm
 +++ b/libavutil/x86/x86inc.asm
 @@ -369,11 +369,14 @@ DECLARE_REG 14, R15, R15D, R15W, R15B, 120
  %macro RET 0
  WIN64_RESTORE_XMM_INTERNAL rsp
  POP_IF_USED 14, 13, 12, 11, 10, 9, 8, 7
 +%if mmsize == 32
 +vzeroupper
 +%endif
  ret
  %endmacro

  %macro REP_RET 0
 -%if regs_used  7 || xmm_regs_used  6
 +%if regs_used  7 || xmm_regs_used  6 || mmsize == 32
  RET
  %else
  rep ret

There's 3 implementations of RET, for the 3 ABIs.

--Loren Merritt
___
libav-devel mailing list
libav-devel@libav.org
https://lists.libav.org/mailman/listinfo/libav-devel


[libav-devel] [PATCH 1/7] vf_hqdn3d: cosmetics

2012-07-24 Thread Loren Merritt
Change code style to match the rest of libav.
---
 libavfilter/vf_hqdn3d.c |  309 +++
 1 files changed, 153 insertions(+), 156 deletions(-)

diff --git a/libavfilter/vf_hqdn3d.c b/libavfilter/vf_hqdn3d.c
index af69d41..8e2a6a2 100644
--- a/libavfilter/vf_hqdn3d.c
+++ b/libavfilter/vf_hqdn3d.c
@@ -32,165 +32,162 @@
 #include video.h
 
 typedef struct {
-int Coefs[4][512*16];
-unsigned int *Line;
-unsigned short *Frame[3];
+int coefs[4][512*16];
+uint32_t *line;
+uint16_t *frame_prev[3];
 int hsub, vsub;
 } HQDN3DContext;
 
-static inline unsigned int LowPassMul(unsigned int PrevMul, unsigned int 
CurrMul, int *Coef)
+static inline uint32_t lowpass(unsigned int prev, unsigned int cur, int *coef)
 {
-//int dMul= (PrevMul0xFF)-(CurrMul0xFF);
-int dMul= PrevMul-CurrMul;
-unsigned int d=((dMul+0x10007FF)12);
-return CurrMul + Coef[d];
+//int dmul= (prev0xFF)-(cur0xFF);
+int dmul = prev-cur;
+uint32_t d = ((dmul+0x10007FF)12);
+return cur + coef[d];
 }
 
-static void deNoiseTemporal(unsigned char *FrameSrc,
-unsigned char *FrameDest,
-unsigned short *FrameAnt,
-int W, int H, int sStride, int dStride,
-int *Temporal)
+static void denoise_temporal(uint8_t *src, uint8_t *dst,
+ uint16_t *frame_ant,
+ int w, int h, int sstride, int dstride,
+ int *temporal)
 {
-long X, Y;
-unsigned int PixelDst;
-
-for (Y = 0; Y  H; Y++) {
-for (X = 0; X  W; X++) {
-PixelDst = LowPassMul(FrameAnt[X]8, FrameSrc[X]16, Temporal);
-FrameAnt[X] = ((PixelDst+0x107F)8);
-FrameDest[X]= ((PixelDst+0x10007FFF)16);
+long x, y;
+uint32_t pixel;
+
+for (y = 0; y  h; y++) {
+for (x = 0; x  w; x++) {
+pixel = lowpass(frame_ant[x]8, src[x]16, temporal);
+frame_ant[x] = ((pixel+0x107F)8);
+dst[x]= ((pixel+0x10007FFF)16);
 }
-FrameSrc  += sStride;
-FrameDest += dStride;
-FrameAnt += W;
+src += sstride;
+dst += dstride;
+frame_ant += w;
 }
 }
 
-static void deNoiseSpacial(unsigned char *Frame,
-   unsigned char *FrameDest,
-   unsigned int *LineAnt,
-   int W, int H, int sStride, int dStride,
-   int *Horizontal, int *Vertical)
+static void denoise_spatial(uint8_t *src, uint8_t *dst,
+uint32_t *line_ant,
+int w, int h, int sstride, int dstride,
+int *horizontal, int *vertical)
 {
-long X, Y;
-long sLineOffs = 0, dLineOffs = 0;
-unsigned int PixelAnt;
-unsigned int PixelDst;
+long x, y;
+long sline_offs = 0, dline_offs = 0;
+uint32_t pixel_ant;
+uint32_t pixel;
 
 /* First pixel has no left nor top neighbor. */
-PixelDst = LineAnt[0] = PixelAnt = Frame[0]16;
-FrameDest[0]= ((PixelDst+0x10007FFF)16);
+pixel = line_ant[0] = pixel_ant = src[0]16;
+dst[0]= ((pixel+0x10007FFF)16);
 
 /* First line has no top neighbor, only left. */
-for (X = 1; X  W; X++) {
-PixelDst = LineAnt[X] = LowPassMul(PixelAnt, Frame[X]16, Horizontal);
-FrameDest[X]= ((PixelDst+0x10007FFF)16);
+for (x = 1; x  w; x++) {
+pixel = line_ant[x] = lowpass(pixel_ant, src[x]16, horizontal);
+dst[x]= ((pixel+0x10007FFF)16);
 }
 
-for (Y = 1; Y  H; Y++) {
-unsigned int PixelAnt;
-sLineOffs += sStride, dLineOffs += dStride;
+for (y = 1; y  h; y++) {
+uint32_t pixel_ant;
+sline_offs += sstride, dline_offs += dstride;
 /* First pixel on each line doesn't have previous pixel */
-PixelAnt = Frame[sLineOffs]16;
-PixelDst = LineAnt[0] = LowPassMul(LineAnt[0], PixelAnt, Vertical);
-FrameDest[dLineOffs]= ((PixelDst+0x10007FFF)16);
+pixel_ant = src[sline_offs]16;
+pixel = line_ant[0] = lowpass(line_ant[0], pixel_ant, vertical);
+dst[dline_offs]= ((pixel+0x10007FFF)16);
 
-for (X = 1; X  W; X++) {
-unsigned int PixelDst;
+for (x = 1; x  w; x++) {
+uint32_t pixel;
 /* The rest are normal */
-PixelAnt = LowPassMul(PixelAnt, Frame[sLineOffs+X]16, 
Horizontal);
-PixelDst = LineAnt[X] = LowPassMul(LineAnt[X], PixelAnt, Vertical);
-FrameDest[dLineOffs+X]= ((PixelDst+0x10007FFF)16);
+pixel_ant = lowpass(pixel_ant, src[sline_offs+x]16, horizontal);
+pixel = line_ant[x] = lowpass(line_ant[x], pixel_ant, vertical);
+dst[dline_offs+x]= ((pixel+0x10007FFF)16);
 }
 }
 }
 
-static void deNoise(unsigned char *Frame,
- 

[libav-devel] [PATCH 2/7] vf_hqdn3d: simplify and optimize

2012-07-24 Thread Loren Merritt
14% faster on penryn, 2% on sandybridge, 9% on bulldozer
---
 libavfilter/vf_hqdn3d.c |  166 +++---
 1 files changed, 54 insertions(+), 112 deletions(-)

diff --git a/libavfilter/vf_hqdn3d.c b/libavfilter/vf_hqdn3d.c
index 8e2a6a2..fdb2ecb 100644
--- a/libavfilter/vf_hqdn3d.c
+++ b/libavfilter/vf_hqdn3d.c
@@ -1,6 +1,7 @@
 /*
  * Copyright (c) 2003 Daniel Moreno comac AT comac DOT darktech DOT org
  * Copyright (c) 2010 Baptiste Coudurier
+ * Copyright (c) 2012 Loren Merritt
  *
  * This file is part of Libav, ported from MPlayer.
  *
@@ -40,9 +41,8 @@ typedef struct {
 
 static inline uint32_t lowpass(unsigned int prev, unsigned int cur, int *coef)
 {
-//int dmul= (prev0xFF)-(cur0xFF);
 int dmul = prev-cur;
-uint32_t d = ((dmul+0x10007FF)12);
+unsigned int d = (dmul+0x10007FF)12; // 0x1000 to convert to unsigned, 
7FF for rounding
 return cur + coef[d];
 }
 
@@ -51,14 +51,13 @@ static void denoise_temporal(uint8_t *src, uint8_t *dst,
  int w, int h, int sstride, int dstride,
  int *temporal)
 {
-long x, y;
-uint32_t pixel;
+uint32_t tmp;
 
-for (y = 0; y  h; y++) {
-for (x = 0; x  w; x++) {
-pixel = lowpass(frame_ant[x]8, src[x]16, temporal);
-frame_ant[x] = ((pixel+0x107F)8);
-dst[x]= ((pixel+0x10007FFF)16);
+for (long y = 0; y  h; y++) {
+for (long x = 0; x  w; x++) {
+tmp = lowpass(frame_ant[x]8, src[x]16, temporal);
+frame_ant[x] = (tmp+0x7F)8;
+dst[x] = (tmp+0x7FFF)16;
 }
 src += sstride;
 dst += dstride;
@@ -66,112 +65,67 @@ static void denoise_temporal(uint8_t *src, uint8_t *dst,
 }
 }
 
+av_always_inline
 static void denoise_spatial(uint8_t *src, uint8_t *dst,
-uint32_t *line_ant,
+uint32_t *line_ant, uint16_t *frame_ant,
 int w, int h, int sstride, int dstride,
-int *horizontal, int *vertical)
+int *spatial, int *temporal)
 {
 long x, y;
-long sline_offs = 0, dline_offs = 0;
 uint32_t pixel_ant;
-uint32_t pixel;
-
-/* First pixel has no left nor top neighbor. */
-pixel = line_ant[0] = pixel_ant = src[0]<<16;
-dst[0]= ((pixel+0x10007FFF)>>16);
+uint32_t tmp;

-/* First line has no top neighbor, only left. */
-for (x = 1; x < w; x++) {
-pixel = line_ant[x] = lowpass(pixel_ant, src[x]<<16, horizontal);
-dst[x]= ((pixel+0x10007FFF)>>16);
+/* First line has no top neighbor. Only left one for each tmp and
+ * last frame */
+pixel_ant = src[0]<<16;
+for (x = 0; x < w; x++) {
+line_ant[x] = tmp = pixel_ant = lowpass(pixel_ant, src[x]<<16, spatial);
+tmp = lowpass(frame_ant[x]<<8, tmp, temporal);
+frame_ant[x] = (tmp+0x7F)>>8;
+dst[x] = (tmp+0x7FFF)>>16;
 }
 
 for (y = 1; y < h; y++) {
-uint32_t pixel_ant;
-sline_offs += sstride, dline_offs += dstride;
-/* First pixel on each line doesn't have previous pixel */
-pixel_ant = src[sline_offs]<<16;
-pixel = line_ant[0] = lowpass(line_ant[0], pixel_ant, vertical);
-dst[dline_offs]= ((pixel+0x10007FFF)>>16);
-
-for (x = 1; x < w; x++) {
-uint32_t pixel;
-/* The rest are normal */
-pixel_ant = lowpass(pixel_ant, src[sline_offs+x]<<16, horizontal);
-pixel = line_ant[x] = lowpass(line_ant[x], pixel_ant, vertical);
-dst[dline_offs+x]= ((pixel+0x10007FFF)>>16);
+src += sstride;
+dst += dstride;
+frame_ant += w;
+pixel_ant = src[0]<<16;
+for (x = 0; x < w-1; x++) {
+line_ant[x] = tmp = lowpass(line_ant[x], pixel_ant, spatial);
+pixel_ant = lowpass(pixel_ant, src[x+1]<<16, spatial);
+tmp = lowpass(frame_ant[x]<<8, tmp, temporal);
+frame_ant[x] = (tmp+0x7F)>>8;
+dst[x] = (tmp+0x7FFF)>>16;
 }
+line_ant[x] = tmp = lowpass(line_ant[x], pixel_ant, spatial);
+tmp = lowpass(frame_ant[x]<<8, tmp, temporal);
+frame_ant[x] = (tmp+0x7F)>>8;
+dst[x] = (tmp+0x7FFF)>>16;
 }
 }
 
 static void denoise(uint8_t *src, uint8_t *dst,
 uint32_t *line_ant, uint16_t **frame_ant_ptr,
 int w, int h, int sstride, int dstride,
-int *horizontal, int *vertical, int *temporal)
+int *spatial, int *temporal)
 {
-long x, y;
-long sline_offs = 0, dline_offs = 0;
-uint32_t pixel_ant;
-uint32_t pixel;
 uint16_t *frame_ant = *frame_ant_ptr;
-
 if (!frame_ant) {
+uint8_t *frame_src = src;
 *frame_ant_ptr = frame_ant = av_malloc(w*h*sizeof(uint16_t));
-for (y = 0; y < h; y++) {
-uint16_t *frame_dst = frame_ant+y*w;
-uint8_t *frame_src

[libav-devel] [PATCH 3/7] vf_hqdn3d: reduce intermediate precision

2012-07-24 Thread Loren Merritt
11% faster on penryn, 7% on sandybridge, 5% on bulldozer
Negligible change to output.
---
 libavfilter/vf_hqdn3d.c |   62 --
 1 files changed, 32 insertions(+), 30 deletions(-)

diff --git a/libavfilter/vf_hqdn3d.c b/libavfilter/vf_hqdn3d.c
index fdb2ecb..505a8ab 100644
--- a/libavfilter/vf_hqdn3d.c
+++ b/libavfilter/vf_hqdn3d.c
@@ -33,31 +33,31 @@
 #include "video.h"
 
 typedef struct {
-int coefs[4][512*16];
-uint32_t *line;
+int16_t coefs[4][512*16];
+uint16_t *line;
 uint16_t *frame_prev[3];
 int hsub, vsub;
 } HQDN3DContext;
 
-static inline uint32_t lowpass(unsigned int prev, unsigned int cur, int *coef)
+static inline uint32_t lowpass(int prev, int cur, int16_t *coef)
 {
-int dmul = prev-cur;
-unsigned int d = (dmul+0x10007FF)>>12; // 0x1000 to convert to unsigned, 7FF for rounding
+int d = (prev-cur)>>4;
 return cur + coef[d];
 }
 
 static void denoise_temporal(uint8_t *src, uint8_t *dst,
  uint16_t *frame_ant,
  int w, int h, int sstride, int dstride,
- int *temporal)
+ int16_t *temporal)
 {
 uint32_t tmp;
 
+temporal += 0x1000;
+
 for (long y = 0; y < h; y++) {
 for (long x = 0; x < w; x++) {
-tmp = lowpass(frame_ant[x]<<8, src[x]<<16, temporal);
-frame_ant[x] = (tmp+0x7F)>>8;
-dst[x] = (tmp+0x7FFF)>>16;
+frame_ant[x] = tmp = lowpass(frame_ant[x], src[x]<<8, temporal);
+dst[x] = (tmp+0x7F)>>8;
 }
 src += sstride;
 dst += dstride;
@@ -67,47 +67,47 @@ static void denoise_temporal(uint8_t *src, uint8_t *dst,
 
 av_always_inline
 static void denoise_spatial(uint8_t *src, uint8_t *dst,
-uint32_t *line_ant, uint16_t *frame_ant,
+uint16_t *line_ant, uint16_t *frame_ant,
 int w, int h, int sstride, int dstride,
-int *spatial, int *temporal)
+int16_t *spatial, int16_t *temporal)
 {
 long x, y;
 uint32_t pixel_ant;
 uint32_t tmp;
 
+spatial  += 0x1000;
+temporal += 0x1000;
+
 /* First line has no top neighbor. Only left one for each tmp and
  * last frame */
-pixel_ant = src[0]<<16;
+pixel_ant = src[0]<<8;
 for (x = 0; x < w; x++) {
-line_ant[x] = tmp = pixel_ant = lowpass(pixel_ant, src[x]<<16, spatial);
-tmp = lowpass(frame_ant[x]<<8, tmp, temporal);
-frame_ant[x] = (tmp+0x7F)>>8;
-dst[x] = (tmp+0x7FFF)>>16;
+line_ant[x] = tmp = pixel_ant = lowpass(pixel_ant, src[x]<<8, spatial);
+frame_ant[x] = tmp = lowpass(frame_ant[x], tmp, temporal);
+dst[x] = (tmp+0x7F)>>8;
 }
 
 for (y = 1; y < h; y++) {
 src += sstride;
 dst += dstride;
 frame_ant += w;
-pixel_ant = src[0]<<16;
+pixel_ant = src[0]<<8;
 for (x = 0; x < w-1; x++) {
 line_ant[x] = tmp = lowpass(line_ant[x], pixel_ant, spatial);
-pixel_ant = lowpass(pixel_ant, src[x+1]<<16, spatial);
-tmp = lowpass(frame_ant[x]<<8, tmp, temporal);
-frame_ant[x] = (tmp+0x7F)>>8;
-dst[x] = (tmp+0x7FFF)>>16;
+pixel_ant = lowpass(pixel_ant, src[x+1]<<8, spatial);
+frame_ant[x] = tmp = lowpass(frame_ant[x], tmp, temporal);
+dst[x] = (tmp+0x7F)>>8;
 }
 line_ant[x] = tmp = lowpass(line_ant[x], pixel_ant, spatial);
-tmp = lowpass(frame_ant[x]<<8, tmp, temporal);
-frame_ant[x] = (tmp+0x7F)>>8;
-dst[x] = (tmp+0x7FFF)>>16;
+frame_ant[x] = tmp = lowpass(frame_ant[x], tmp, temporal);
+dst[x] = (tmp+0x7F)>>8;
 }
 }
 
 static void denoise(uint8_t *src, uint8_t *dst,
-uint32_t *line_ant, uint16_t **frame_ant_ptr,
+uint16_t *line_ant, uint16_t **frame_ant_ptr,
 int w, int h, int sstride, int dstride,
-int *spatial, int *temporal)
+int16_t *spatial, int16_t *temporal)
 {
 uint16_t *frame_ant = *frame_ant_ptr;
 if (!frame_ant) {
@@ -128,16 +128,18 @@ static void denoise(uint8_t *src, uint8_t *dst,
  w, h, sstride, dstride, temporal);
 }
 
-static void precalc_coefs(int *ct, double dist25)
+static void precalc_coefs(int16_t *ct, double dist25)
 {
 int i;
 double gamma, simil, C;
 
-gamma = log(0.25) / log(1.0 - dist25/255.0 - 0.1);
+gamma = log(0.25) / log(1.0 - FFMIN(dist25,252.0)/255.0 - 0.1);
 
 for (i = -255*16; i <= 255*16; i++) {
-simil = 1.0 - FFABS(i) / (16*255.0);
-C = pow(simil, gamma) * 65536.0 * i / 16.0;
+// lowpass() truncates (not rounds) the diff, so +15/32 for the 
midpoint of the bin.
+double f = (i + 15.0/32.0) / 16.0;
+simil = 1.0 - FFABS(f) / 255.0;
+C = pow(simil, gamma) * 256.0 * f;

  1   2   >