On Sun, Nov 01, 2015 at 09:16:22PM -0300, James Almer wrote: > On 10/28/2015 11:38 AM, Diego Biurrun wrote: > > --- /dev/null > > +++ b/libavcodec/x86/ac3dsp_downmix.asm > > @@ -0,0 +1,199 @@ > > +cglobal ac3_downmix_%1_to_%2, > > 3,in_channels+1,total_mmregs,0-matrix_elements_stack*mmsize, src0, src1, > > len, src2, src3, src4, src5 > > + > > +; load matrix pointers > > +%define matrix0q r1q > > +%define matrix1q r3q > > +%if stereo > > + mov matrix1q, [matrix0q+gprsize] > > +%endif > > + mov matrix0q, [matrix0q] > > + > > +; define matrix coeff names > > +%assign %%i 0 > > +%assign %%j needed_mmregs > > +%rep in_channels > > + %if %%i >= matrix_elements_mm > > + CAT_XDEFINE mx_stack_0_, %%i, 1 > > + CAT_XDEFINE mx_0_, %%i, [rsp+(%%i-matrix_elements_mm)*mmsize] > > + %else > > + CAT_XDEFINE mx_stack_0_, %%i, 0 > > + CAT_XDEFINE mx_0_, %%i, m %+ %%j > > + %assign %%j %%j+1 > > + %endif > > + %assign %%i %%i+1 > > +%endrep > > +%if stereo > > +%assign %%i 0 > > +%rep in_channels > > + %if in_channels + %%i >= matrix_elements_mm > > + CAT_XDEFINE mx_stack_1_, %%i, 1 > > + CAT_XDEFINE mx_1_, %%i, > > [rsp+(in_channels+%%i-matrix_elements_mm)*mmsize] > > + %else > > + CAT_XDEFINE mx_stack_1_, %%i, 0 > > + CAT_XDEFINE mx_1_, %%i, m %+ %%j > > + %assign %%j %%j+1 > > + %endif > > + %assign %%i %%i+1 > > +%endrep > > +%endif > > + > > +; load/splat matrix coeffs > > +%assign %%i 0 > > +%rep in_channels > > + %if mx_stack_0_ %+ %%i > > + VBROADCASTSS m0, [matrix0q+4*%%i] > > + mova mx_0_ %+ %%i, m0 > > + %else > > + VBROADCASTSS mx_0_ %+ %%i, [matrix0q+4*%%i] > > + %endif > > + %if stereo > > + %if mx_stack_1_ %+ %%i > > + VBROADCASTSS m0, [matrix1q+4*%%i] > > + mova mx_1_ %+ %%i, m0 > > + %else > > + VBROADCASTSS mx_1_ %+ %%i, [matrix1q+4*%%i] > > + %endif > > + %endif > > + %assign %%i %%i+1 > > +%endrep > > + > > + lea lenq, [4*r2d] > > + ; load channel pointers to registers > > +%assign %%i 1 > > +%rep (in_channels - 1) > > + mov src %+ %%i %+ q, [src0q+%%i*gprsize] > 
> + add src %+ %%i %+ q, lenq > > + %assign %%i %%i+1 > > +%endrep > > + mov src0q, [src0q] > > + add src0q, lenq > > + neg lenq > > +.loop: > > + %if stereo || mx_stack_0_0 > > + mova m0, [src0q+lenq] > > + %endif > > + %if stereo > > + mulps m1, m0, mx_1_0 > > + %endif > > + %if stereo || mx_stack_0_0 > > + mulps m0, m0, mx_0_0 > > + %else > > + mulps m0, mx_0_0, [src0q+lenq] > > + %endif > > +%assign %%i 1 > > +%rep (in_channels - 1) > > + %define src_ptr src %+ %%i %+ q > > + ; avoid extra load for mono if matrix is in a mm register > > + %if stereo || mx_stack_0_ %+ %%i > > + mova m2, [src_ptr+lenq] > > + %endif > > + %if stereo > > + FMULADD_PS m1, m2, mx_1_ %+ %%i, m1, m3 > > + %endif > > + %if stereo || mx_stack_0_ %+ %%i > > + FMULADD_PS m0, m2, mx_0_ %+ %%i, m0, m2 > > + %else > > + FMULADD_PS m0, mx_0_ %+ %%i, [src_ptr+lenq], m0, m1 > > + %endif > > + %assign %%i %%i+1 > > +%endrep > > + mova [src0q+lenq], m0 > > + %if stereo > > + mova [src1q+lenq], m1 > > + %endif > > + > > + add lenq, mmsize > > + jl .loop > > + RET > > +%endmacro > > + > > +%macro AC3_DOWNMIX_FUNCS 0 > > +%assign %%i 3 > > +%rep 4 > > + INIT_XMM sse > > + AC3_DOWNMIX %%i, 1 > > + AC3_DOWNMIX %%i, 2 > > + ; Do not use ymm AVX or FMA4 on x86-32 for 6 channels due to stack > > alignment > > + ; issues. We are not guaranteed a 32-byte aligned stack, so we have to > > use > > + ; an extra register to save/restore the stack pointer. For 6 channels > > on > > + ; x86-32 we do not have an extra register available. > > + %if ARCH_X86_64 || %%i < 6 > > + INIT_YMM avx > > + %else > > + INIT_XMM avx > > + %endif > > + AC3_DOWNMIX %%i, 1 > > + AC3_DOWNMIX %%i, 2 > > + %if HAVE_FMA4_EXTERNAL > > + %if ARCH_X86_64 || %%i < 6 > > + INIT_YMM fma4 > > FMA4 is AMD exclusive, deprecated, and will not be featured on their upcoming > architecture. > Also, the only CPUs that support it don't benefit from asm using 32byte wide > regs (ymm). > > You have two options. 
One is to change this into FMA3 and call it a day. FMA3 > works on both > Intel and AMD CPUs (except for first-gen Bulldozer, which is FMA4-only). The > other is making > two versions, one FMA3 and one FMA4, the latter using exclusively xmm regs, > and the former > either xmm or ymm as needed (What you already wrote).
I'd drop FMA4 then. Nowadays 3DNow! is more of a liability than an asset, and the situation appears to be repeating itself, albeit much more quickly. > The first option is the simplest, so I'd say go with that. You'll however > have to check for > the avxslow flag for any function using ymm regs (avx or fma) so they are not > used on > Bulldozer-based CPUs. On those, an xmm variant should be used instead. > > This code looks a lot like what's already in avresample, so the same should > be done there if > possible. Patches welcome. I'll not have time to look into this in the next few weeks. Diego _______________________________________________ libav-devel mailing list libav-devel@libav.org https://lists.libav.org/mailman/listinfo/libav-devel