On 09/29/2012 01:17 AM, Justin Ruggles wrote: > Include x86-optimized versions for SSE2 and AVX. > --- > libavutil/float_dsp.c | 9 +++++++++ > libavutil/float_dsp.h | 15 +++++++++++++++ > libavutil/x86/float_dsp.asm | 40 > ++++++++++++++++++++++++++++++++++++++++ > libavutil/x86/float_dsp_init.c | 9 +++++++++ > libavutil/x86/x86util.asm | 11 +++++++++++ > 5 files changed, 84 insertions(+), 0 deletions(-) > > diff --git a/libavutil/float_dsp.c b/libavutil/float_dsp.c > index b6b1181..22139de 100644 > --- a/libavutil/float_dsp.c > +++ b/libavutil/float_dsp.c > @@ -44,11 +44,20 @@ static void vector_fmul_scalar_c(float *dst, const float > *src, float mul, > dst[i] = src[i] * mul; > } > > +static void vector_dmul_scalar_c(double *dst, const double *src, double mul, > + int len) > +{ > + int i; > + for (i = 0; i < len; i++) > + dst[i] = src[i] * mul; > +} > + > void avpriv_float_dsp_init(AVFloatDSPContext *fdsp, int bit_exact) > { > fdsp->vector_fmul = vector_fmul_c; > fdsp->vector_fmac_scalar = vector_fmac_scalar_c; > fdsp->vector_fmul_scalar = vector_fmul_scalar_c; > + fdsp->vector_dmul_scalar = vector_dmul_scalar_c; > > #if ARCH_ARM > ff_float_dsp_init_arm(fdsp); > diff --git a/libavutil/float_dsp.h b/libavutil/float_dsp.h > index cb4b28f..41b73c5 100644 > --- a/libavutil/float_dsp.h > +++ b/libavutil/float_dsp.h > @@ -66,6 +66,21 @@ typedef struct AVFloatDSPContext { > */ > void (*vector_fmul_scalar)(float *dst, const float *src, float mul, > int len); > + > + /** > + * Multiply a vector of double by a scalar double. Source and > + * destination vectors must overlap exactly or not at all. 
> + * > + * @param dst result vector > + * constraints: 32-byte aligned > + * @param src input vector > + * constraints: 32-byte aligned > + * @param mul scalar value > + * @param len length of vector > + * constraints: multiple of 8 > + */ > + void (*vector_dmul_scalar)(double *dst, const double *src, double mul, > + int len); > } AVFloatDSPContext; > > /** > diff --git a/libavutil/x86/float_dsp.asm b/libavutil/x86/float_dsp.asm > index 7201ded..e2e023c 100644 > --- a/libavutil/x86/float_dsp.asm > +++ b/libavutil/x86/float_dsp.asm > @@ -119,3 +119,43 @@ cglobal vector_fmul_scalar, 4,4,3, dst, src, mul, len > > INIT_XMM sse > VECTOR_FMUL_SCALAR > + > +;------------------------------------------------------------------------------ > +; void ff_vector_dmul_scalar(double *dst, const double *src, double mul, > +; int len) > +;------------------------------------------------------------------------------ > + > +%macro VECTOR_DMUL_SCALAR 0 > +%if UNIX64 > +cglobal vector_dmul_scalar, 3,3,3, dst, src, len > +%else > +cglobal vector_dmul_scalar, 4,4,3, dst, src, mul, len > +%endif > +%if ARCH_X86_32 > + VBROADCASTSD xmm0, mulm > +%else > +%if WIN64 > + movsd xmm0, xmm2 > +%endif > + movlhps xmm0, xmm0 > +%if cpuflag(avx) > + vinsertf128 ymm0, ymm0, xmm0, 1 > +%endif > +%endif > + lea lenq, [lend*8-2*mmsize] > +.loop: > + mulpd m1, m0, [srcq+lenq ] > + mulpd m2, m0, [srcq+lenq+mmsize] > + mova [dstq+lenq ], m1 > + mova [dstq+lenq+mmsize], m2 > + sub lenq, 2*mmsize > + jge .loop > + REP_RET > +%endmacro > + > +INIT_XMM sse2 > +VECTOR_DMUL_SCALAR > +%if HAVE_AVX_EXTERNAL > +INIT_YMM avx > +VECTOR_DMUL_SCALAR > +%endif > diff --git a/libavutil/x86/float_dsp_init.c b/libavutil/x86/float_dsp_init.c > index d14ec6a..b3b7ff4 100644 > --- a/libavutil/x86/float_dsp_init.c > +++ b/libavutil/x86/float_dsp_init.c > @@ -35,6 +35,11 @@ extern void ff_vector_fmac_scalar_avx(float *dst, const > float *src, float mul, > extern void ff_vector_fmul_scalar_sse(float *dst, const float *src, 
float > mul, > int len); > > +extern void ff_vector_dmul_scalar_sse2(double *dst, const double *src, > + double mul, int len); > +extern void ff_vector_dmul_scalar_avx(double *dst, const double *src, > + double mul, int len); > + > void ff_float_dsp_init_x86(AVFloatDSPContext *fdsp) > { > int mm_flags = av_get_cpu_flags(); > @@ -44,8 +49,12 @@ void ff_float_dsp_init_x86(AVFloatDSPContext *fdsp) > fdsp->vector_fmac_scalar = ff_vector_fmac_scalar_sse; > fdsp->vector_fmul_scalar = ff_vector_fmul_scalar_sse; > } > + if (EXTERNAL_SSE2(mm_flags)) { > + fdsp->vector_dmul_scalar = ff_vector_dmul_scalar_sse2; > + } > if (EXTERNAL_AVX(mm_flags)) { > fdsp->vector_fmul = ff_vector_fmul_avx; > fdsp->vector_fmac_scalar = ff_vector_fmac_scalar_avx; > + fdsp->vector_dmul_scalar = ff_vector_dmul_scalar_avx; > } > } > diff --git a/libavutil/x86/x86util.asm b/libavutil/x86/x86util.asm > index a5d89a1..571cc02 100644 > --- a/libavutil/x86/x86util.asm > +++ b/libavutil/x86/x86util.asm > @@ -626,6 +626,17 @@ > %endif > %endmacro > > +%macro VBROADCASTSD 2 ; dst xmm/ymm, src m64 > +%if cpuflag(avx) && mmsize == 32 > + vbroadcastsd %1, %2 > +%elif cpuflag(sse3) > + movddup %1, %2 > +%else ; sse2 > + movsd %1, %2 > + movlhps %1, %1 > +%endif > +%endmacro > + > %macro SHUFFLE_MASK_W 8 > %rep 8 > %if %1>=0x80
ping

-Justin
_______________________________________________
libav-devel mailing list
libav-devel@libav.org
https://lists.libav.org/mailman/listinfo/libav-devel