On Sun, Nov 01, 2015 at 09:16:22PM -0300, James Almer wrote:
> On 10/28/2015 11:38 AM, Diego Biurrun wrote:
> > --- /dev/null
> > +++ b/libavcodec/x86/ac3dsp_downmix.asm
> > @@ -0,0 +1,199 @@
> > +cglobal ac3_downmix_%1_to_%2, 
> > 3,in_channels+1,total_mmregs,0-matrix_elements_stack*mmsize, src0, src1, 
> > len, src2, src3, src4, src5
> > +
> > +; load matrix pointers
> > +%define matrix0q r1q
> > +%define matrix1q r3q
> > +%if stereo
> > +    mov      matrix1q, [matrix0q+gprsize]
> > +%endif
> > +    mov      matrix0q, [matrix0q]
> > +
> > +; define matrix coeff names
> > +%assign %%i 0
> > +%assign %%j needed_mmregs
> > +%rep in_channels
> > +    %if %%i >= matrix_elements_mm
> > +        CAT_XDEFINE mx_stack_0_, %%i, 1
> > +        CAT_XDEFINE mx_0_, %%i, [rsp+(%%i-matrix_elements_mm)*mmsize]
> > +    %else
> > +        CAT_XDEFINE mx_stack_0_, %%i, 0
> > +        CAT_XDEFINE mx_0_, %%i, m %+ %%j
> > +        %assign %%j %%j+1
> > +    %endif
> > +    %assign %%i %%i+1
> > +%endrep
> > +%if stereo
> > +%assign %%i 0
> > +%rep in_channels
> > +    %if in_channels + %%i >= matrix_elements_mm
> > +        CAT_XDEFINE mx_stack_1_, %%i, 1
> > +        CAT_XDEFINE mx_1_, %%i, 
> > [rsp+(in_channels+%%i-matrix_elements_mm)*mmsize]
> > +    %else
> > +        CAT_XDEFINE mx_stack_1_, %%i, 0
> > +        CAT_XDEFINE mx_1_, %%i, m %+ %%j
> > +        %assign %%j %%j+1
> > +    %endif
> > +    %assign %%i %%i+1
> > +%endrep
> > +%endif
> > +
> > +; load/splat matrix coeffs
> > +%assign %%i 0
> > +%rep in_channels
> > +    %if mx_stack_0_ %+ %%i
> > +        VBROADCASTSS m0, [matrix0q+4*%%i]
> > +        mova  mx_0_ %+ %%i, m0
> > +    %else
> > +        VBROADCASTSS mx_0_ %+ %%i, [matrix0q+4*%%i]
> > +    %endif
> > +    %if stereo
> > +    %if mx_stack_1_ %+ %%i
> > +        VBROADCASTSS m0, [matrix1q+4*%%i]
> > +        mova  mx_1_ %+ %%i, m0
> > +    %else
> > +        VBROADCASTSS mx_1_ %+ %%i, [matrix1q+4*%%i]
> > +    %endif
> > +    %endif
> > +    %assign %%i %%i+1
> > +%endrep
> > +
> > +    lea          lenq, [4*r2d]
> > +    ; load channel pointers to registers
> > +%assign %%i 1
> > +%rep (in_channels - 1)
> > +    mov         src %+ %%i %+ q, [src0q+%%i*gprsize]
> > +    add         src %+ %%i %+ q, lenq
> > +    %assign %%i %%i+1
> > +%endrep
> > +    mov         src0q, [src0q]
> > +    add         src0q, lenq
> > +    neg          lenq
> > +.loop:
> > +    %if stereo || mx_stack_0_0
> > +    mova           m0, [src0q+lenq]
> > +    %endif
> > +    %if stereo
> > +    mulps          m1, m0, mx_1_0
> > +    %endif
> > +    %if stereo || mx_stack_0_0
> > +    mulps          m0, m0, mx_0_0
> > +    %else
> > +    mulps          m0, mx_0_0, [src0q+lenq]
> > +    %endif
> > +%assign %%i 1
> > +%rep (in_channels - 1)
> > +    %define src_ptr src %+ %%i %+ q
> > +    ; avoid extra load for mono if matrix is in a mm register
> > +    %if stereo || mx_stack_0_ %+ %%i
> > +    mova           m2, [src_ptr+lenq]
> > +    %endif
> > +    %if stereo
> > +    FMULADD_PS     m1, m2, mx_1_ %+ %%i, m1, m3
> > +    %endif
> > +    %if stereo || mx_stack_0_ %+ %%i
> > +    FMULADD_PS     m0, m2, mx_0_ %+ %%i, m0, m2
> > +    %else
> > +    FMULADD_PS     m0, mx_0_ %+ %%i, [src_ptr+lenq], m0, m1
> > +    %endif
> > +    %assign %%i %%i+1
> > +%endrep
> > +    mova [src0q+lenq], m0
> > +    %if stereo
> > +    mova [src1q+lenq], m1
> > +    %endif
> > +
> > +    add          lenq, mmsize
> > +    jl .loop
> > +    RET
> > +%endmacro
> > +
> > +%macro AC3_DOWNMIX_FUNCS 0
> > +%assign %%i 3
> > +%rep 4
> > +    INIT_XMM sse
> > +    AC3_DOWNMIX %%i, 1
> > +    AC3_DOWNMIX %%i, 2
> > +    ; Do not use ymm AVX or FMA4 on x86-32 for 6 channels due to stack 
> > alignment
> > +    ; issues. We are not guaranteed a 32-byte aligned stack, so we have to 
> > use
> > +    ; an extra register to save/restore the stack pointer. For 6 channels 
> > on
> > +    ; x86-32 we do not have an extra register available.
> > +    %if ARCH_X86_64 || %%i < 6
> > +    INIT_YMM avx
> > +    %else
> > +    INIT_XMM avx
> > +    %endif
> > +    AC3_DOWNMIX %%i, 1
> > +    AC3_DOWNMIX %%i, 2
> > +    %if HAVE_FMA4_EXTERNAL
> > +    %if ARCH_X86_64 || %%i < 6
> > +    INIT_YMM fma4
> 
> FMA4 is AMD-exclusive, deprecated, and will not be featured on their upcoming 
> architecture.
> Also, the only CPUs that support it don't benefit from asm using 32-byte-wide 
> regs (ymm).
> 
> You have two options. One is to change this into FMA3 and call it a day. FMA3 
> works on both
> Intel and AMD CPUs (except for first-gen Bulldozer, which is FMA4-only). The 
> other is making
> two versions, one FMA3 and one FMA4, the latter using exclusively xmm regs, 
> and the former
> either xmm or ymm as needed (what you already wrote).

I'd drop FMA4 then.  Nowadays 3DNow! is more of a liability than an asset, and
the situation appears to be repeating itself, albeit much more quickly.

> The first option is the simplest, so I'd say go with that. You'll, however, 
> have to check for
> the avxslow flag for any function using ymm regs (avx or fma) so they are not 
> used on
> Bulldozer-based CPUs. On those, an xmm variant should be used instead.
> 
> This code looks a lot like what's already in avresample, so the same should 
> be done there if
> possible.

Patches welcome.  I'll not have time to look into this in the next few weeks.

Diego
_______________________________________________
libav-devel mailing list
libav-devel@libav.org
https://lists.libav.org/mailman/listinfo/libav-devel

Reply via email to