Re: [libav-devel] [PATCH 1/3] x86/synth_filter: add synth_filter_sse

2014-03-15 Thread James Almer
On 14/03/14 7:56 AM, Christophe Gisquet wrote:
> Hi,
>
> 2014-03-04 3:25 GMT+01:00 James Almer jamr...@gmail.com:
>> -INIT_XMM sse2
>> +%macro SETZERO 1
>> +%if cpuflag(sse2)
>> +pxor  %1, %1
>> +%else
>> +xorps %1, %1, %1
>> +%endif
>> +%endmacro
>> +
>> +%macro SHUF 2
>> +%if cpuflag(sse2)
>> +pshufd    %1, %2, q0123
>> +%else
>> +mova  %1, %2
>> +shufps    %1, %1, q0123
>> +%endif
>> +%endmacro
>
> We already discussed this, and indeed it is worth having SSE2
> (integer) instructions instead of pure (float) SSE ones for the SSE2
> version as they are actually faster. OK from me then for the asm.
>
> Not sure if the C part still applies cleanly, but this should be minor.

It doesn't. I'll rebase and send the patchset again with some other changes 
later.
___
libav-devel mailing list
libav-devel@libav.org
https://lists.libav.org/mailman/listinfo/libav-devel


Re: [libav-devel] [PATCH 1/3] x86/synth_filter: add synth_filter_sse

2014-03-14 Thread Christophe Gisquet
Hi,

2014-03-04 3:25 GMT+01:00 James Almer jamr...@gmail.com:
> -INIT_XMM sse2
> +%macro SETZERO 1
> +%if cpuflag(sse2)
> +pxor  %1, %1
> +%else
> +xorps %1, %1, %1
> +%endif
> +%endmacro
> +
> +%macro SHUF 2
> +%if cpuflag(sse2)
> +pshufd    %1, %2, q0123
> +%else
> +mova  %1, %2
> +shufps    %1, %1, q0123
> +%endif
> +%endmacro

We already discussed this, and indeed it is worth having SSE2
(integer) instructions instead of pure (float) SSE ones for the SSE2
version as they are actually faster. OK from me then for the asm.

Not sure if the C part still applies cleanly, but this should be minor.

-- 
Christophe
___
libav-devel mailing list
libav-devel@libav.org
https://lists.libav.org/mailman/listinfo/libav-devel


[libav-devel] [PATCH 1/3] x86/synth_filter: add synth_filter_sse

2014-03-03 Thread James Almer
Build only on x86_32 targets.

Signed-off-by: James Almer jamr...@gmail.com
---
 libavcodec/x86/dcadsp.asm| 55 +---
 libavcodec/x86/dcadsp_init.c | 44 +--
 2 files changed, 69 insertions(+), 30 deletions(-)

diff --git a/libavcodec/x86/dcadsp.asm b/libavcodec/x86/dcadsp.asm
index 56039ba..970ec3d 100644
--- a/libavcodec/x86/dcadsp.asm
+++ b/libavcodec/x86/dcadsp.asm
@@ -199,15 +199,31 @@ INIT_XMM sse
 DCA_LFE_FIR 0
 DCA_LFE_FIR 1
 
-INIT_XMM sse2
+%macro SETZERO 1
+%if cpuflag(sse2)
+pxor  %1, %1
+%else
+xorps %1, %1, %1
+%endif
+%endmacro
+
+%macro SHUF 2
+%if cpuflag(sse2)
+pshufd    %1, %2, q0123
+%else
+mova  %1, %2
+shufps    %1, %1, q0123
+%endif
+%endmacro
+
 %macro INNER_LOOP   1
 ; reading backwards:  ptr1 = synth_buf + j + i; ptr2 = synth_buf + j - i
 ;~ a += window[i + j]  * (-synth_buf[15 - i + j])
 ;~ b += window[i + j + 16] * (synth_buf[i + j])
-pshufd    m5, [ptr2 + j + (15 - 3) * 4], q0123
+SHUF  m5, [ptr2 + j + (15 - 3) * 4]
 mova  m6, [ptr1 + j]
 %if ARCH_X86_64
-pshufd   m11, [ptr2 + j + (15 - 3) * 4 - mmsize], q0123
+SHUF m11, [ptr2 + j + (15 - 3) * 4 - mmsize]
 mova m12, [ptr1 + j + mmsize]
 %endif
 mulps m6, [win  + %1 + j + 16 * 4]
@@ -224,10 +240,10 @@ INIT_XMM sse2
 %endif
 ;~ c += window[i + j + 32] * (synth_buf[16 + i + j])
 ;~ d += window[i + j + 48] * (synth_buf[31 - i + j])
-pshufd    m6, [ptr2 + j + (31 - 3) * 4], q0123
+SHUF  m6, [ptr2 + j + (31 - 3) * 4]
 mova  m5, [ptr1 + j + 16 * 4]
 %if ARCH_X86_64
-pshufd   m12, [ptr2 + j + (31 - 3) * 4 - mmsize], q0123
+SHUF m12, [ptr2 + j + (31 - 3) * 4 - mmsize]
 mova m11, [ptr1 + j + mmsize + 16 * 4]
 %endif
 mulps m5, [win  + %1 + j + 32 * 4]
@@ -245,20 +261,25 @@ INIT_XMM sse2
 sub        j, 64 * 4
 %endmacro
 
-; void ff_synth_filter_inner_sse2(float *synth_buf, float synth_buf2[32],
-; const float window[512], float out[32],
-; intptr_t offset, float scale)
+; void ff_synth_filter_inner_opt(float *synth_buf, float synth_buf2[32],
+;  const float window[512], float out[32],
+;  intptr_t offset, float scale)
+%macro SYNTH_FILTER 0
 cglobal synth_filter_inner, 0, 6 + 4 * ARCH_X86_64, 7 + 6 * ARCH_X86_64, \
   synth_buf, synth_buf2, window, out, off, scale
 %define scale m0
 %if ARCH_X86_32 || WIN64
+%if cpuflag(sse2)
 movd   scale, scalem
+%else
+movss  scale, scalem
+%endif
 ; Make sure offset is in a register and not on the stack
 %define OFFQ  r4q
 %else
 %define OFFQ  offq
 %endif
-pshufd    m0, m0, 0
+SPLATD    m0
 ; prepare inner counter limit 1
 mov  r5q, 480
 sub  r5q, offmp
@@ -274,8 +295,8 @@ cglobal synth_filter_inner, 0, 6 + 4 * ARCH_X86_64, 7 + 6 * ARCH_X86_64, \
 %endif
 .mainloop
 ; m1 = a  m2 = b  m3 = c  m4 = d
-pxor  m3, m3
-pxor  m4, m4
+SETZERO   m3
+SETZERO   m4
 mova  m1, [buf2 + i]
 mova  m2, [buf2 + i + 16 * 4]
 %if ARCH_X86_32
@@ -292,8 +313,8 @@ cglobal synth_filter_inner, 0, 6 + 4 * ARCH_X86_64, 7 + 6 * ARCH_X86_64, \
 %define ptr2 r7q ; must be loaded
 %define win  r8q
 %define j    r9q
-pxor  m9, m9
-pxor m10, m10
+SETZERO   m9
+SETZERO  m10
 mova  m7, [buf2 + i + mmsize]
 mova  m8, [buf2 + i + mmsize + 16 * 4]
 lea  win, [windowq + i]
@@ -350,3 +371,11 @@ cglobal synth_filter_inner, 0, 6 + 4 * ARCH_X86_64, 7 + 6 * ARCH_X86_64, \
 sub        i, (ARCH_X86_64 + 1) * mmsize
 jge        .mainloop
 RET
+%endmacro
+
+%if ARCH_X86_32
+INIT_XMM sse
+SYNTH_FILTER
+%endif
+INIT_XMM sse2
+SYNTH_FILTER
diff --git a/libavcodec/x86/dcadsp_init.c b/libavcodec/x86/dcadsp_init.c
index 3821892..f8dd9b1 100644
--- a/libavcodec/x86/dcadsp_init.c
+++ b/libavcodec/x86/dcadsp_init.c
@@ -56,29 +56,39 @@ av_cold void ff_dcadsp_init_x86(DCADSPContext *s)
 }
 }
 
-void ff_synth_filter_inner_sse2(float *synth_buf_ptr, float synth_buf2[32],
-const float window[512],
-float out[32], intptr_t offset, float scale);
+#define SYNTH_FILTER_FUNC(opt) \
+void ff_synth_filter_inner_##opt(float *synth_buf_ptr, float synth_buf2[32],   \
+ const float window[512],  \
+ float out[32], intptr_t offset, float scale); \
+static void synth_filter_##opt(FFTContext *imdct,  \
+   float *synth_buf_ptr, int