Signed-off-by: James Almer <jamr...@gmail.com> --- This should make replacing the current dca decoder with foo86's easier, without having to disable/re-enable the compilation of the synth_filter files between the removal and addition commits.
aarch64 and arm changes untested. configure | 3 +- libavcodec/Makefile | 3 +- libavcodec/aarch64/Makefile | 5 +- libavcodec/aarch64/dcadsp_init.c | 21 -- .../aarch64/{dcadsp_init.c => synth_filter_init.c} | 16 +- libavcodec/arm/Makefile | 9 +- libavcodec/arm/dcadsp_init_arm.c | 22 -- .../dcadsp_init.c => arm/synth_filter_init_arm.c} | 33 +-- libavcodec/x86/Makefile | 2 + libavcodec/x86/dcadsp.asm | 222 --------------------- libavcodec/x86/dcadsp_init.c | 51 ----- libavcodec/x86/{dcadsp.asm => synth_filter.asm} | 99 --------- .../x86/{dcadsp_init.c => synth_filter_init.c} | 21 +- 13 files changed, 26 insertions(+), 481 deletions(-) copy libavcodec/aarch64/{dcadsp_init.c => synth_filter_init.c} (78%) copy libavcodec/{aarch64/dcadsp_init.c => arm/synth_filter_init_arm.c} (66%) copy libavcodec/x86/{dcadsp.asm => synth_filter.asm} (76%) copy libavcodec/x86/{dcadsp_init.c => synth_filter_init.c} (83%) diff --git a/configure b/configure index 8f4642b..1719a5b 100755 --- a/configure +++ b/configure @@ -2039,6 +2039,7 @@ CONFIG_EXTRA=" sinewin snappy startcode + synth_filter texturedsp texturedspenc tpeldsp @@ -2276,7 +2277,7 @@ comfortnoise_encoder_select="lpc" cook_decoder_select="audiodsp mdct sinewin" cscd_decoder_select="lzo" cscd_decoder_suggest="zlib" -dca_decoder_select="fmtconvert mdct" +dca_decoder_select="fmtconvert mdct synth_filter" dds_decoder_select="texturedsp" dirac_decoder_select="dirac_parse dwt golomb videodsp mpegvideoenc" dnxhd_decoder_select="blockdsp idctdsp" diff --git a/libavcodec/Makefile b/libavcodec/Makefile index b9ffdb9..3a6a453 100644 --- a/libavcodec/Makefile +++ b/libavcodec/Makefile @@ -115,6 +115,7 @@ OBJS-$(CONFIG_SHARED) += log2_tab.o reverse.o OBJS-$(CONFIG_SINEWIN) += sinewin.o sinewin_fixed.o OBJS-$(CONFIG_SNAPPY) += snappy.o OBJS-$(CONFIG_STARTCODE) += startcode.o +OBJS-$(CONFIG_SYNTH_FILTER) += synth_filter.o OBJS-$(CONFIG_TEXTUREDSP) += texturedsp.o OBJS-$(CONFIG_TEXTUREDSPENC) += texturedspenc.o OBJS-$(CONFIG_TPELDSP) += 
tpeldsp.o @@ -223,7 +224,7 @@ OBJS-$(CONFIG_CSCD_DECODER) += cscd.o OBJS-$(CONFIG_CYUV_DECODER) += cyuv.o OBJS-$(CONFIG_DCA_DECODER) += dcadec.o dca.o dcadsp.o \ dcadata.o dca_exss.o \ - dca_xll.o synth_filter.o + dca_xll.o OBJS-$(CONFIG_DCA_ENCODER) += dcaenc.o dca.o dcadata.o OBJS-$(CONFIG_DDS_DECODER) += dds.o OBJS-$(CONFIG_DIRAC_DECODER) += diracdec.o dirac.o diracdsp.o \ diff --git a/libavcodec/aarch64/Makefile b/libavcodec/aarch64/Makefile index 022ed84..2df6325 100644 --- a/libavcodec/aarch64/Makefile +++ b/libavcodec/aarch64/Makefile @@ -10,6 +10,7 @@ OBJS-$(CONFIG_IMDCT15) += aarch64/imdct15_init.o OBJS-$(CONFIG_MPEGAUDIODSP) += aarch64/mpegaudiodsp_init.o OBJS-$(CONFIG_NEON_CLOBBER_TEST) += aarch64/neontest.o OBJS-$(CONFIG_VIDEODSP) += aarch64/videodsp_init.o +OBJS-$(CONFIG_SYNTH_FILTER) += aarch64/synth_filter_init.o OBJS-$(CONFIG_RV40_DECODER) += aarch64/rv40dsp_init_aarch64.o OBJS-$(CONFIG_VC1_DECODER) += aarch64/vc1dsp_init_aarch64.o @@ -17,8 +18,7 @@ OBJS-$(CONFIG_VORBIS_DECODER) += aarch64/vorbisdsp_init.o ARMV8-OBJS-$(CONFIG_VIDEODSP) += aarch64/videodsp.o -NEON-OBJS-$(CONFIG_DCA_DECODER) += aarch64/dcadsp_neon.o \ - aarch64/synth_filter_neon.o +NEON-OBJS-$(CONFIG_DCA_DECODER) += aarch64/dcadsp_neon.o NEON-OBJS-$(CONFIG_FFT) += aarch64/fft_neon.o NEON-OBJS-$(CONFIG_FMTCONVERT) += aarch64/fmtconvert_neon.o NEON-OBJS-$(CONFIG_H264CHROMA) += aarch64/h264cmc_neon.o @@ -31,5 +31,6 @@ NEON-OBJS-$(CONFIG_HPELDSP) += aarch64/hpeldsp_neon.o NEON-OBJS-$(CONFIG_IMDCT15) += aarch64/imdct15_neon.o NEON-OBJS-$(CONFIG_MPEGAUDIODSP) += aarch64/mpegaudiodsp_neon.o NEON-OBJS-$(CONFIG_MDCT) += aarch64/mdct_neon.o +NEON-OBJS-$(CONFIG_SYNTH_FILTER) += aarch64/synth_filter_neon.o NEON-OBJS-$(CONFIG_VORBIS_DECODER) += aarch64/vorbisdsp_neon.o diff --git a/libavcodec/aarch64/dcadsp_init.c b/libavcodec/aarch64/dcadsp_init.c index 78642a5..4440e4b 100644 --- a/libavcodec/aarch64/dcadsp_init.c +++ b/libavcodec/aarch64/dcadsp_init.c @@ -24,23 +24,10 @@ #include 
"libavutil/attributes.h" #include "libavutil/internal.h" #include "libavcodec/dcadsp.h" -#include "libavcodec/fft.h" - -#include "asm-offsets.h" - -#if HAVE_NEON || HAVE_VFP -AV_CHECK_OFFSET(FFTContext, imdct_half, IMDCT_HALF); -#endif void ff_dca_lfe_fir0_neon(float *out, const float *in, const float *coefs); void ff_dca_lfe_fir1_neon(float *out, const float *in, const float *coefs); -void ff_synth_filter_float_neon(FFTContext *imdct, - float *synth_buf_ptr, int *synth_buf_offset, - float synth_buf2[32], const float window[512], - float out[32], const float in[32], - float scale); - av_cold void ff_dcadsp_init_aarch64(DCADSPContext *s) { int cpu_flags = av_get_cpu_flags(); @@ -50,11 +37,3 @@ av_cold void ff_dcadsp_init_aarch64(DCADSPContext *s) s->lfe_fir[1] = ff_dca_lfe_fir1_neon; } } - -av_cold void ff_synth_filter_init_aarch64(SynthFilterContext *s) -{ - int cpu_flags = av_get_cpu_flags(); - - if (have_neon(cpu_flags)) - s->synth_filter_float = ff_synth_filter_float_neon; -} diff --git a/libavcodec/aarch64/dcadsp_init.c b/libavcodec/aarch64/synth_filter_init.c similarity index 78% copy from libavcodec/aarch64/dcadsp_init.c copy to libavcodec/aarch64/synth_filter_init.c index 78642a5..df51ae8 100644 --- a/libavcodec/aarch64/dcadsp_init.c +++ b/libavcodec/aarch64/synth_filter_init.c @@ -23,8 +23,7 @@ #include "libavutil/aarch64/cpu.h" #include "libavutil/attributes.h" #include "libavutil/internal.h" -#include "libavcodec/dcadsp.h" -#include "libavcodec/fft.h" +#include "libavcodec/synth_filter.h" #include "asm-offsets.h" @@ -32,25 +31,12 @@ AV_CHECK_OFFSET(FFTContext, imdct_half, IMDCT_HALF); #endif -void ff_dca_lfe_fir0_neon(float *out, const float *in, const float *coefs); -void ff_dca_lfe_fir1_neon(float *out, const float *in, const float *coefs); - void ff_synth_filter_float_neon(FFTContext *imdct, float *synth_buf_ptr, int *synth_buf_offset, float synth_buf2[32], const float window[512], float out[32], const float in[32], float scale); -av_cold void 
ff_dcadsp_init_aarch64(DCADSPContext *s) -{ - int cpu_flags = av_get_cpu_flags(); - - if (have_neon(cpu_flags)) { - s->lfe_fir[0] = ff_dca_lfe_fir0_neon; - s->lfe_fir[1] = ff_dca_lfe_fir1_neon; - } -} - av_cold void ff_synth_filter_init_aarch64(SynthFilterContext *s) { int cpu_flags = av_get_cpu_flags(); diff --git a/libavcodec/arm/Makefile b/libavcodec/arm/Makefile index cdd35b0..f18e800 100644 --- a/libavcodec/arm/Makefile +++ b/libavcodec/arm/Makefile @@ -29,6 +29,7 @@ OBJS-$(CONFIG_MPEGVIDEOENC) += arm/mpegvideoencdsp_init_arm.o OBJS-$(CONFIG_NEON_CLOBBER_TEST) += arm/neontest.o OBJS-$(CONFIG_PIXBLOCKDSP) += arm/pixblockdsp_init_arm.o OBJS-$(CONFIG_RV34DSP) += arm/rv34dsp_init_arm.o +OBJS-$(CONFIG_SYNTH_FILTER) += arm/synth_filter_init_arm.o OBJS-$(CONFIG_VIDEODSP) += arm/videodsp_init_arm.o OBJS-$(CONFIG_VP3DSP) += arm/vp3dsp_init_arm.o OBJS-$(CONFIG_VP8DSP) += arm/vp8dsp_init_arm.o @@ -85,10 +86,10 @@ ARMV6-OBJS-$(CONFIG_STARTCODE) += arm/startcode_armv6.o VFP-OBJS-$(CONFIG_FFT) += arm/fft_vfp.o VFP-OBJS-$(CONFIG_FMTCONVERT) += arm/fmtconvert_vfp.o VFP-OBJS-$(CONFIG_MDCT) += arm/mdct_vfp.o +VFP-OBJS-$(CONFIG_SYNTH_FILTER) += arm/synth_filter_vfp.o # decoders/encoders -VFP-OBJS-$(CONFIG_DCA_DECODER) += arm/dcadsp_vfp.o \ - arm/synth_filter_vfp.o +VFP-OBJS-$(CONFIG_DCA_DECODER) += arm/dcadsp_vfp.o # NEON optimizations @@ -119,6 +120,7 @@ NEON-OBJS-$(CONFIG_MDCT) += arm/mdct_neon.o \ arm/mdct_fixed_neon.o NEON-OBJS-$(CONFIG_MPEGVIDEO) += arm/mpegvideo_neon.o NEON-OBJS-$(CONFIG_RDFT) += arm/rdft_neon.o +NEON-OBJS-$(CONFIG_SYNTH_FILTER) += arm/synth_filter_neon.o NEON-OBJS-$(CONFIG_VP3DSP) += arm/vp3dsp_neon.o NEON-OBJS-$(CONFIG_VP8DSP) += arm/vp8dsp_init_neon.o \ arm/vp8dsp_neon.o @@ -127,8 +129,7 @@ NEON-OBJS-$(CONFIG_VP8DSP) += arm/vp8dsp_init_neon.o \ NEON-OBJS-$(CONFIG_AAC_DECODER) += arm/aacpsdsp_neon.o \ arm/sbrdsp_neon.o NEON-OBJS-$(CONFIG_LLAUDDSP) += arm/lossless_audiodsp_neon.o -NEON-OBJS-$(CONFIG_DCA_DECODER) += arm/dcadsp_neon.o \ - 
arm/synth_filter_neon.o +NEON-OBJS-$(CONFIG_DCA_DECODER) += arm/dcadsp_neon.o NEON-OBJS-$(CONFIG_HEVC_DECODER) += arm/hevcdsp_init_neon.o \ arm/hevcdsp_deblock_neon.o \ arm/hevcdsp_idct_neon.o \ diff --git a/libavcodec/arm/dcadsp_init_arm.c b/libavcodec/arm/dcadsp_init_arm.c index 0f2e4c4..febb444 100644 --- a/libavcodec/arm/dcadsp_init_arm.c +++ b/libavcodec/arm/dcadsp_init_arm.c @@ -37,18 +37,6 @@ void ff_dca_qmf_32_subbands_vfp(float samples_in[32][8], int sb_act, const float window[512], float *samples_out, float raXin[32], float scale); -void ff_synth_filter_float_vfp(FFTContext *imdct, - float *synth_buf_ptr, int *synth_buf_offset, - float synth_buf2[32], const float window[512], - float out[32], const float in[32], - float scale); - -void ff_synth_filter_float_neon(FFTContext *imdct, - float *synth_buf_ptr, int *synth_buf_offset, - float synth_buf2[32], const float window[512], - float out[32], const float in[32], - float scale); - av_cold void ff_dcadsp_init_arm(DCADSPContext *s) { int cpu_flags = av_get_cpu_flags(); @@ -63,13 +51,3 @@ av_cold void ff_dcadsp_init_arm(DCADSPContext *s) s->lfe_fir[1] = ff_dca_lfe_fir1_neon; } } - -av_cold void ff_synth_filter_init_arm(SynthFilterContext *s) -{ - int cpu_flags = av_get_cpu_flags(); - - if (have_vfp_vm(cpu_flags)) - s->synth_filter_float = ff_synth_filter_float_vfp; - if (have_neon(cpu_flags)) - s->synth_filter_float = ff_synth_filter_float_neon; -} diff --git a/libavcodec/aarch64/dcadsp_init.c b/libavcodec/arm/synth_filter_init_arm.c similarity index 66% copy from libavcodec/aarch64/dcadsp_init.c copy to libavcodec/arm/synth_filter_init_arm.c index 78642a5..3dae5b9 100644 --- a/libavcodec/aarch64/dcadsp_init.c +++ b/libavcodec/arm/synth_filter_init_arm.c @@ -20,20 +20,15 @@ #include "config.h" -#include "libavutil/aarch64/cpu.h" +#include "libavutil/arm/cpu.h" #include "libavutil/attributes.h" -#include "libavutil/internal.h" -#include "libavcodec/dcadsp.h" -#include "libavcodec/fft.h" +#include 
"libavcodec/synth_filter.h" -#include "asm-offsets.h" - -#if HAVE_NEON || HAVE_VFP -AV_CHECK_OFFSET(FFTContext, imdct_half, IMDCT_HALF); -#endif - -void ff_dca_lfe_fir0_neon(float *out, const float *in, const float *coefs); -void ff_dca_lfe_fir1_neon(float *out, const float *in, const float *coefs); +void ff_synth_filter_float_vfp(FFTContext *imdct, + float *synth_buf_ptr, int *synth_buf_offset, + float synth_buf2[32], const float window[512], + float out[32], const float in[32], + float scale); void ff_synth_filter_float_neon(FFTContext *imdct, float *synth_buf_ptr, int *synth_buf_offset, @@ -41,20 +36,12 @@ void ff_synth_filter_float_neon(FFTContext *imdct, float out[32], const float in[32], float scale); -av_cold void ff_dcadsp_init_aarch64(DCADSPContext *s) -{ - int cpu_flags = av_get_cpu_flags(); - - if (have_neon(cpu_flags)) { - s->lfe_fir[0] = ff_dca_lfe_fir0_neon; - s->lfe_fir[1] = ff_dca_lfe_fir1_neon; - } -} - -av_cold void ff_synth_filter_init_aarch64(SynthFilterContext *s) +av_cold void ff_synth_filter_init_arm(SynthFilterContext *s) { int cpu_flags = av_get_cpu_flags(); + if (have_vfp_vm(cpu_flags)) + s->synth_filter_float = ff_synth_filter_float_vfp; if (have_neon(cpu_flags)) s->synth_filter_float = ff_synth_filter_float_neon; } diff --git a/libavcodec/x86/Makefile b/libavcodec/x86/Makefile index 0d09fe6..90ff29d 100644 --- a/libavcodec/x86/Makefile +++ b/libavcodec/x86/Makefile @@ -31,6 +31,7 @@ OBJS-$(CONFIG_MPEGVIDEOENC) += x86/mpegvideoenc.o \ OBJS-$(CONFIG_PIXBLOCKDSP) += x86/pixblockdsp_init.o OBJS-$(CONFIG_QPELDSP) += x86/qpeldsp_init.o OBJS-$(CONFIG_RV34DSP) += x86/rv34dsp_init.o +OBJS-$(CONFIG_SYNTH_FILTER) += x86/synth_filter_init.o OBJS-$(CONFIG_VIDEODSP) += x86/videodsp_init.o OBJS-$(CONFIG_VP3DSP) += x86/vp3dsp_init.o OBJS-$(CONFIG_VP8DSP) += x86/vp8dsp_init.o @@ -119,6 +120,7 @@ YASM-OBJS-$(CONFIG_QPELDSP) += x86/qpeldsp.o \ x86/fpel.o \ x86/qpel.o YASM-OBJS-$(CONFIG_RV34DSP) += x86/rv34dsp.o +YASM-OBJS-$(CONFIG_SYNTH_FILTER) += 
x86/synth_filter.o YASM-OBJS-$(CONFIG_IDCTDSP) += x86/simple_idct10.o YASM-OBJS-$(CONFIG_VIDEODSP) += x86/videodsp.o YASM-OBJS-$(CONFIG_VP3DSP) += x86/vp3dsp.o diff --git a/libavcodec/x86/dcadsp.asm b/libavcodec/x86/dcadsp.asm index 502b70a..55e73bc 100644 --- a/libavcodec/x86/dcadsp.asm +++ b/libavcodec/x86/dcadsp.asm @@ -121,225 +121,3 @@ DCA_LFE_FIR 1 INIT_XMM fma3 DCA_LFE_FIR 0 %endif - -%macro SETZERO 1 -%if cpuflag(sse2) && notcpuflag(avx) - pxor %1, %1 -%else - xorps %1, %1, %1 -%endif -%endmacro - -%macro SHUF 3 -%if cpuflag(avx) - mova %3, [%2 - 16] - vperm2f128 %1, %3, %3, 1 - vshufps %1, %1, %1, q0123 -%elif cpuflag(sse2) - pshufd %1, [%2], q0123 -%else - mova %1, [%2] - shufps %1, %1, q0123 -%endif -%endmacro - -%macro INNER_LOOP 1 - ; reading backwards: ptr1 = synth_buf + j + i; ptr2 = synth_buf + j - i - ;~ a += window[i + j] * (-synth_buf[15 - i + j]) - ;~ b += window[i + j + 16] * (synth_buf[i + j]) - SHUF m5, ptr2 + j + (15 - 3) * 4, m6 - mova m6, [ptr1 + j] -%if ARCH_X86_64 - SHUF m11, ptr2 + j + (15 - 3) * 4 - mmsize, m12 - mova m12, [ptr1 + j + mmsize] -%endif -%if cpuflag(fma3) - fmaddps m2, m6, [win + %1 + j + 16 * 4], m2 - fnmaddps m1, m5, [win + %1 + j], m1 -%if ARCH_X86_64 - fmaddps m8, m12, [win + %1 + j + mmsize + 16 * 4], m8 - fnmaddps m7, m11, [win + %1 + j + mmsize], m7 -%endif -%else ; non-FMA - mulps m6, m6, [win + %1 + j + 16 * 4] - mulps m5, m5, [win + %1 + j] -%if ARCH_X86_64 - mulps m12, m12, [win + %1 + j + mmsize + 16 * 4] - mulps m11, m11, [win + %1 + j + mmsize] -%endif - addps m2, m2, m6 - subps m1, m1, m5 -%if ARCH_X86_64 - addps m8, m8, m12 - subps m7, m7, m11 -%endif -%endif ; cpuflag(fma3) - ;~ c += window[i + j + 32] * (synth_buf[16 + i + j]) - ;~ d += window[i + j + 48] * (synth_buf[31 - i + j]) - SHUF m6, ptr2 + j + (31 - 3) * 4, m5 - mova m5, [ptr1 + j + 16 * 4] -%if ARCH_X86_64 - SHUF m12, ptr2 + j + (31 - 3) * 4 - mmsize, m11 - mova m11, [ptr1 + j + mmsize + 16 * 4] -%endif -%if cpuflag(fma3) - fmaddps m3, m5, [win 
+ %1 + j + 32 * 4], m3 - fmaddps m4, m6, [win + %1 + j + 48 * 4], m4 -%if ARCH_X86_64 - fmaddps m9, m11, [win + %1 + j + mmsize + 32 * 4], m9 - fmaddps m10, m12, [win + %1 + j + mmsize + 48 * 4], m10 -%endif -%else ; non-FMA - mulps m5, m5, [win + %1 + j + 32 * 4] - mulps m6, m6, [win + %1 + j + 48 * 4] -%if ARCH_X86_64 - mulps m11, m11, [win + %1 + j + mmsize + 32 * 4] - mulps m12, m12, [win + %1 + j + mmsize + 48 * 4] -%endif - addps m3, m3, m5 - addps m4, m4, m6 -%if ARCH_X86_64 - addps m9, m9, m11 - addps m10, m10, m12 -%endif -%endif ; cpuflag(fma3) - sub j, 64 * 4 -%endmacro - -; void ff_synth_filter_inner_<opt>(float *synth_buf, float synth_buf2[32], -; const float window[512], float out[32], -; intptr_t offset, float scale) -%macro SYNTH_FILTER 0 -cglobal synth_filter_inner, 0, 6 + 4 * ARCH_X86_64, 7 + 6 * ARCH_X86_64, \ - synth_buf, synth_buf2, window, out, off, scale -%define scale m0 -%if ARCH_X86_32 || WIN64 -%if cpuflag(sse2) && notcpuflag(avx) - movd scale, scalem - SPLATD m0 -%else - VBROADCASTSS m0, scalem -%endif -; Make sure offset is in a register and not on the stack -%define OFFQ r4q -%else - SPLATD xmm0 -%if cpuflag(avx) - vinsertf128 m0, m0, xmm0, 1 -%endif -%define OFFQ offq -%endif - ; prepare inner counter limit 1 - mov r5q, 480 - sub r5q, offmp - and r5q, -64 - shl r5q, 2 -%if ARCH_X86_32 || notcpuflag(avx) - mov OFFQ, r5q -%define i r5q - mov i, 16 * 4 - (ARCH_X86_64 + 1) * mmsize ; main loop counter -%else -%define i 0 -%define OFFQ r5q -%endif - -%define buf2 synth_buf2q -%if ARCH_X86_32 - mov buf2, synth_buf2mp -%endif -.mainloop: - ; m1 = a m2 = b m3 = c m4 = d - SETZERO m3 - SETZERO m4 - mova m1, [buf2 + i] - mova m2, [buf2 + i + 16 * 4] -%if ARCH_X86_32 -%define ptr1 r0q -%define ptr2 r1q -%define win r2q -%define j r3q - mov win, windowm - mov ptr1, synth_bufm -%if ARCH_X86_32 || notcpuflag(avx) - add win, i - add ptr1, i -%endif -%else ; ARCH_X86_64 -%define ptr1 r6q -%define ptr2 r7q ; must be loaded -%define win r8q -%define j 
r9q - SETZERO m9 - SETZERO m10 - mova m7, [buf2 + i + mmsize] - mova m8, [buf2 + i + mmsize + 16 * 4] - lea win, [windowq + i] - lea ptr1, [synth_bufq + i] -%endif - mov ptr2, synth_bufmp - ; prepare the inner loop counter - mov j, OFFQ -%if ARCH_X86_32 || notcpuflag(avx) - sub ptr2, i -%endif -.loop1: - INNER_LOOP 0 - jge .loop1 - - mov j, 448 * 4 - sub j, OFFQ - jz .end - sub ptr1, j - sub ptr2, j - add win, OFFQ ; now at j-64, so define OFFSET - sub j, 64 * 4 -.loop2: - INNER_LOOP 64 * 4 - jge .loop2 - -.end: -%if ARCH_X86_32 - mov buf2, synth_buf2m ; needed for next iteration anyway - mov outq, outmp ; j, which will be set again during it -%endif - ;~ out[i] = a * scale; - ;~ out[i + 16] = b * scale; - mulps m1, m1, scale - mulps m2, m2, scale -%if ARCH_X86_64 - mulps m7, m7, scale - mulps m8, m8, scale -%endif - ;~ synth_buf2[i] = c; - ;~ synth_buf2[i + 16] = d; - mova [buf2 + i + 0 * 4], m3 - mova [buf2 + i + 16 * 4], m4 -%if ARCH_X86_64 - mova [buf2 + i + 0 * 4 + mmsize], m9 - mova [buf2 + i + 16 * 4 + mmsize], m10 -%endif - ;~ out[i] = a; - ;~ out[i + 16] = a; - mova [outq + i + 0 * 4], m1 - mova [outq + i + 16 * 4], m2 -%if ARCH_X86_64 - mova [outq + i + 0 * 4 + mmsize], m7 - mova [outq + i + 16 * 4 + mmsize], m8 -%endif -%if ARCH_X86_32 || notcpuflag(avx) - sub i, (ARCH_X86_64 + 1) * mmsize - jge .mainloop -%endif - RET -%endmacro - -%if ARCH_X86_32 -INIT_XMM sse -SYNTH_FILTER -%endif -INIT_XMM sse2 -SYNTH_FILTER -INIT_YMM avx -SYNTH_FILTER -INIT_YMM fma3 -SYNTH_FILTER diff --git a/libavcodec/x86/dcadsp_init.c b/libavcodec/x86/dcadsp_init.c index 1321dda..c27c045 100644 --- a/libavcodec/x86/dcadsp_init.c +++ b/libavcodec/x86/dcadsp_init.c @@ -40,54 +40,3 @@ av_cold void ff_dcadsp_init_x86(DCADSPContext *s) s->lfe_fir[0] = ff_dca_lfe_fir0_fma3; } } - - -#define SYNTH_FILTER_FUNC(opt) \ -void ff_synth_filter_inner_##opt(float *synth_buf_ptr, float synth_buf2[32], \ - const float window[512], \ - float out[32], intptr_t offset, float scale); \ -static void 
synth_filter_##opt(FFTContext *imdct, \ - float *synth_buf_ptr, int *synth_buf_offset, \ - float synth_buf2[32], const float window[512], \ - float out[32], const float in[32], float scale) \ -{ \ - float *synth_buf= synth_buf_ptr + *synth_buf_offset; \ - \ - imdct->imdct_half(imdct, synth_buf, in); \ - \ - ff_synth_filter_inner_##opt(synth_buf, synth_buf2, window, \ - out, *synth_buf_offset, scale); \ - \ - *synth_buf_offset = (*synth_buf_offset - 32) & 511; \ -} \ - -#if HAVE_YASM -#if ARCH_X86_32 -SYNTH_FILTER_FUNC(sse) -#endif -SYNTH_FILTER_FUNC(sse2) -SYNTH_FILTER_FUNC(avx) -SYNTH_FILTER_FUNC(fma3) -#endif /* HAVE_YASM */ - -av_cold void ff_synth_filter_init_x86(SynthFilterContext *s) -{ -#if HAVE_YASM - int cpu_flags = av_get_cpu_flags(); - -#if ARCH_X86_32 - if (EXTERNAL_SSE(cpu_flags)) { - s->synth_filter_float = synth_filter_sse; - } -#endif - if (EXTERNAL_SSE2(cpu_flags)) { - s->synth_filter_float = synth_filter_sse2; - } - if (EXTERNAL_AVX_FAST(cpu_flags)) { - s->synth_filter_float = synth_filter_avx; - } - if (EXTERNAL_FMA3(cpu_flags) && !(cpu_flags & AV_CPU_FLAG_AVXSLOW)) { - s->synth_filter_float = synth_filter_fma3; - } -#endif /* HAVE_YASM */ -} diff --git a/libavcodec/x86/dcadsp.asm b/libavcodec/x86/synth_filter.asm similarity index 76% copy from libavcodec/x86/dcadsp.asm copy to libavcodec/x86/synth_filter.asm index 502b70a..bc1a48f 100644 --- a/libavcodec/x86/dcadsp.asm +++ b/libavcodec/x86/synth_filter.asm @@ -21,107 +21,8 @@ %include "libavutil/x86/x86util.asm" -SECTION_RODATA -pf_inv16: times 4 dd 0x3D800000 ; 1/16 - SECTION .text -; %1=v0/v1 %2=in1 %3=in2 -%macro FIR_LOOP 2-3 -.loop%1: -%define va m1 -%define vb m2 -%if %1 -%define OFFSET 0 -%else -%define OFFSET NUM_COEF*count -%endif -; for v0, incrementing and for v1, decrementing - mova va, [cf0q + OFFSET] - mova vb, [cf0q + OFFSET + 4*NUM_COEF] -%if %0 == 3 - mova m4, [cf0q + OFFSET + mmsize] - mova m0, [cf0q + OFFSET + 4*NUM_COEF + mmsize] -%endif - mulps va, %2 - mulps vb, %2 -%if %0 
== 3 -%if cpuflag(fma3) - fmaddps va, m4, %3, va - fmaddps vb, m0, %3, vb -%else - mulps m4, %3 - mulps m0, %3 - addps va, m4 - addps vb, m0 -%endif -%endif - ; va = va1 va2 va3 va4 - ; vb = vb1 vb2 vb3 vb4 -%if %1 - SWAP va, vb -%endif - mova m4, va - unpcklps va, vb ; va3 vb3 va4 vb4 - unpckhps m4, vb ; va1 vb1 va2 vb2 - addps m4, va ; va1+3 vb1+3 va2+4 vb2+4 - movhlps vb, m4 ; va1+3 vb1+3 - addps vb, m4 ; va0..4 vb0..4 - movlps [outq + count], vb -%if %1 - sub cf0q, 8*NUM_COEF -%endif - add count, 8 - jl .loop%1 -%endmacro - -; void dca_lfe_fir(float *out, float *in, float *coefs) -%macro DCA_LFE_FIR 1 -cglobal dca_lfe_fir%1, 3,3,6-%1, out, in, cf0 -%define IN1 m3 -%define IN2 m5 -%define count inq -%define NUM_COEF 4*(2-%1) -%define NUM_OUT 32*(%1+1) - - movu IN1, [inq + 4 - 1*mmsize] - shufps IN1, IN1, q0123 -%if %1 == 0 - movu IN2, [inq + 4 - 2*mmsize] - shufps IN2, IN2, q0123 -%endif - - mov count, -4*NUM_OUT - add cf0q, 4*NUM_COEF*NUM_OUT - add outq, 4*NUM_OUT - ; compute v0 first -%if %1 == 0 - FIR_LOOP 0, IN1, IN2 -%else - FIR_LOOP 0, IN1 -%endif - shufps IN1, IN1, q0123 - mov count, -4*NUM_OUT - ; cf1 already correctly positioned - add outq, 4*NUM_OUT ; outq now at out2 - sub cf0q, 8*NUM_COEF -%if %1 == 0 - shufps IN2, IN2, q0123 - FIR_LOOP 1, IN2, IN1 -%else - FIR_LOOP 1, IN1 -%endif - RET -%endmacro - -INIT_XMM sse -DCA_LFE_FIR 0 -DCA_LFE_FIR 1 -%if HAVE_FMA3_EXTERNAL -INIT_XMM fma3 -DCA_LFE_FIR 0 -%endif - %macro SETZERO 1 %if cpuflag(sse2) && notcpuflag(avx) pxor %1, %1 diff --git a/libavcodec/x86/dcadsp_init.c b/libavcodec/x86/synth_filter_init.c similarity index 83% copy from libavcodec/x86/dcadsp_init.c copy to libavcodec/x86/synth_filter_init.c index 1321dda..0649ea2 100644 --- a/libavcodec/x86/dcadsp_init.c +++ b/libavcodec/x86/synth_filter_init.c @@ -21,26 +21,7 @@ #include "libavutil/attributes.h" #include "libavutil/cpu.h" #include "libavutil/x86/cpu.h" -#include "libavcodec/dcadsp.h" - -void ff_dca_lfe_fir0_sse(float *out, const float *in, 
const float *coefs); -void ff_dca_lfe_fir1_sse(float *out, const float *in, const float *coefs); -void ff_dca_lfe_fir0_fma3(float *out, const float *in, const float *coefs); - -av_cold void ff_dcadsp_init_x86(DCADSPContext *s) -{ - int cpu_flags = av_get_cpu_flags(); - - if (EXTERNAL_SSE(cpu_flags)) { - s->lfe_fir[0] = ff_dca_lfe_fir0_sse; - s->lfe_fir[1] = ff_dca_lfe_fir1_sse; - } - - if (EXTERNAL_FMA3(cpu_flags)) { - s->lfe_fir[0] = ff_dca_lfe_fir0_fma3; - } -} - +#include "libavcodec/synth_filter.h" #define SYNTH_FILTER_FUNC(opt) \ void ff_synth_filter_inner_##opt(float *synth_buf_ptr, float synth_buf2[32], \ -- 2.7.0 _______________________________________________ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org http://ffmpeg.org/mailman/listinfo/ffmpeg-devel