Hi, On Jan 16, 2013 8:10 AM, "Justin Ruggles" <justin.rugg...@gmail.com> wrote: > > On 01/15/2013 08:00 PM, Ronald S. Bultje wrote: > > From: "Ronald S. Bultje" <rsbul...@gmail.com> > > > > --- > > libavcodec/x86/Makefile | 3 +- > > libavcodec/x86/vorbisdsp.asm | 84 +++++++++++++++++++++++++++++++++++++++++ > > libavcodec/x86/vorbisdsp_init.c | 71 +++------------------------------- > > 3 files changed, 91 insertions(+), 67 deletions(-) > > create mode 100644 libavcodec/x86/vorbisdsp.asm > > > > diff --git a/libavcodec/x86/Makefile b/libavcodec/x86/Makefile > > index 3b3e3e2..964d0b7 100644 > > --- a/libavcodec/x86/Makefile > > +++ b/libavcodec/x86/Makefile > > @@ -61,7 +61,8 @@ YASM-OBJS-$(CONFIG_RV40_DECODER) += x86/rv34dsp.o \ > > x86/rv40dsp.o > > YASM-OBJS-$(CONFIG_VC1_DECODER) += x86/vc1dsp.o > > YASM-OBJS-$(CONFIG_VIDEODSP) += x86/videodsp.o > > -MMX-OBJS-$(CONFIG_VORBIS_DECODER) += x86/vorbisdsp_init.o > > +MMX-OBJS-$(CONFIG_VORBIS_DECODER) += x86/vorbisdsp_init.o \ > > + x86/vorbisdsp.o > > YASM-OBJS-$(CONFIG_VP3DSP) += x86/vp3dsp.o > > YASM-OBJS-$(CONFIG_VP6_DECODER) += x86/vp56dsp.o > > YASM-OBJS-$(CONFIG_VP8_DECODER) += x86/vp8dsp.o > > diff --git a/libavcodec/x86/vorbisdsp.asm b/libavcodec/x86/vorbisdsp.asm > > new file mode 100644 > > index 0000000..b259548 > > --- /dev/null > > +++ b/libavcodec/x86/vorbisdsp.asm > > @@ -0,0 +1,84 @@ > > +;****************************************************************************** > > +;* Vorbis x86 optimizations > > +;* Copyright (C) 2006 Loren Merritt <lor...@u.washington.edu> > > +;* > > +;* This file is part of Libav. > > +;* > > +;* Libav is free software; you can redistribute it and/or > > +;* modify it under the terms of the GNU Lesser General Public > > +;* License as published by the Free Software Foundation; either > > +;* version 2.1 of the License, or (at your option) any later version. 
> > +;* > > +;* Libav is distributed in the hope that it will be useful, > > +;* but WITHOUT ANY WARRANTY; without even the implied warranty of > > +;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > > +;* Lesser General Public License for more details. > > +;* > > +;* You should have received a copy of the GNU Lesser General Public > > +;* License along with Libav; if not, write to the Free Software > > +;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA > > +;****************************************************************************** > > + > > +%include "libavutil/x86/x86util.asm" > > + > > +SECTION_RODATA > > + > > +cextern pdw_80000000 > > vorbis_inverse_coupling is the only function that uses ff_pdw_80000000 > so why not just move it here? > > > + > > +SECTION .text > > + > > +%if ARCH_X86_32 > > +INIT_MMX 3dnow > > +cglobal vorbis_inverse_coupling, 3, 3, 6, mag, ang, block_size > > + pxor m7, m7 > > + lea magq, [magq+block_sizeq*4] > > + lea angq, [angq+block_sizeq*4] > > + neg block_sizeq > > +.loop: > > + mova m0, [magq+block_sizeq*4] > > + mova m1, [angq+block_sizeq*4] > > + mova m2, m0 > > + mova m3, m1 > > + pfcmpgq m2, m7 ; m <= 0.0 > > + pfcmpgq m3, m7 ; a <= 0.0 > > + pslld m2, 31 ; keep only the sign bit > > + pxor m1, m2 > > + mova m4, m3 > > + pand m3, m1 > > + pandn m4, m1 > > + pfadd m3, m0 ; a = m + ((a < 0) & (a ^ sign(m))) > > + pfsub m0, m4 ; m = m + ((a > 0) & (a ^ sign(m))) > > + mova [angq+block_sizeq*4], m3 > > + mova [magq+block_sizeq*4], m0 > > + add block_sizeq, 2 > > + jl .loop > > + femms > > + RET > > +%endif > > Does it make any difference if you multiply block_size by 4 at the top, > then increment by mmsize? This would also allow for zero-extending by > using lea.
A later patch moves this to intptr_t, which seems equally easy? > > #if ARCH_X86_32 > > if (mm_flags & AV_CPU_FLAG_3DNOW) > > - dsp->vorbis_inverse_coupling = vorbis_inverse_coupling_3dnow; > > + dsp->vorbis_inverse_coupling = ff_vorbis_inverse_coupling_3dnow; > > #endif > > if (mm_flags & AV_CPU_FLAG_SSE) > > - dsp->vorbis_inverse_coupling = vorbis_inverse_coupling_sse; > > + dsp->vorbis_inverse_coupling = ff_vorbis_inverse_coupling_sse; > > #endif > > } > > Why put 3dnow only under ARCH_X86_32? All x86-64 CPUs have SSE. Ronald
_______________________________________________ libav-devel mailing list libav-devel@libav.org https://lists.libav.org/mailman/listinfo/libav-devel