On 09/29/2012 01:17 AM, Justin Ruggles wrote:
> Include x86-optimized versions for SSE2 and AVX.
> ---
>  libavutil/float_dsp.c          |    9 +++++++++
>  libavutil/float_dsp.h          |   15 +++++++++++++++
>  libavutil/x86/float_dsp.asm    |   40 ++++++++++++++++++++++++++++++++++++++++
>  libavutil/x86/float_dsp_init.c |    9 +++++++++
>  libavutil/x86/x86util.asm      |   11 +++++++++++
>  5 files changed, 84 insertions(+), 0 deletions(-)
> 
> diff --git a/libavutil/float_dsp.c b/libavutil/float_dsp.c
> index b6b1181..22139de 100644
> --- a/libavutil/float_dsp.c
> +++ b/libavutil/float_dsp.c
> @@ -44,11 +44,20 @@ static void vector_fmul_scalar_c(float *dst, const float *src, float mul,
>          dst[i] = src[i] * mul;
>  }
>  
> +static void vector_dmul_scalar_c(double *dst, const double *src, double mul,
> +                                 int len)
> +{
> +    int i;
> +    for (i = 0; i < len; i++)
> +        dst[i] = src[i] * mul;
> +}
> +
>  void avpriv_float_dsp_init(AVFloatDSPContext *fdsp, int bit_exact)
>  {
>      fdsp->vector_fmul = vector_fmul_c;
>      fdsp->vector_fmac_scalar = vector_fmac_scalar_c;
>      fdsp->vector_fmul_scalar = vector_fmul_scalar_c;
> +    fdsp->vector_dmul_scalar = vector_dmul_scalar_c;
>  
>  #if ARCH_ARM
>      ff_float_dsp_init_arm(fdsp);
> diff --git a/libavutil/float_dsp.h b/libavutil/float_dsp.h
> index cb4b28f..41b73c5 100644
> --- a/libavutil/float_dsp.h
> +++ b/libavutil/float_dsp.h
> @@ -66,6 +66,21 @@ typedef struct AVFloatDSPContext {
>       */
>      void (*vector_fmul_scalar)(float *dst, const float *src, float mul,
>                                 int len);
> +
> +    /**
> +     * Multiply a vector of double by a scalar double.  Source and
> +     * destination vectors must overlap exactly or not at all.
> +     *
> +     * @param dst result vector
> +     *            constraints: 32-byte aligned
> +     * @param src input vector
> +     *            constraints: 32-byte aligned
> +     * @param mul scalar value
> +     * @param len length of vector
> +     *            constraints: multiple of 8
> +     */
> +    void (*vector_dmul_scalar)(double *dst, const double *src, double mul,
> +                               int len);
>  } AVFloatDSPContext;
>  
>  /**
> diff --git a/libavutil/x86/float_dsp.asm b/libavutil/x86/float_dsp.asm
> index 7201ded..e2e023c 100644
> --- a/libavutil/x86/float_dsp.asm
> +++ b/libavutil/x86/float_dsp.asm
> @@ -119,3 +119,43 @@ cglobal vector_fmul_scalar, 4,4,3, dst, src, mul, len
>  
>  INIT_XMM sse
>  VECTOR_FMUL_SCALAR
> +
> +;------------------------------------------------------------------------------
> +; void ff_vector_dmul_scalar(double *dst, const double *src, double mul,
> +;                            int len)
> +;------------------------------------------------------------------------------
> +
> +%macro VECTOR_DMUL_SCALAR 0
> +%if UNIX64
> +cglobal vector_dmul_scalar, 3,3,3, dst, src, len
> +%else
> +cglobal vector_dmul_scalar, 4,4,3, dst, src, mul, len
> +%endif
> +%if ARCH_X86_32
> +    VBROADCASTSD xmm0, mulm
> +%else
> +%if WIN64
> +    movsd        xmm0, xmm2
> +%endif
> +    movlhps      xmm0, xmm0
> +%if cpuflag(avx)
> +    vinsertf128  ymm0, ymm0, xmm0, 1
> +%endif
> +%endif
> +    lea          lenq, [lend*8-2*mmsize]
> +.loop:
> +    mulpd          m1, m0, [srcq+lenq       ]
> +    mulpd          m2, m0, [srcq+lenq+mmsize]
> +    mova   [dstq+lenq       ], m1
> +    mova   [dstq+lenq+mmsize], m2
> +    sub          lenq, 2*mmsize
> +    jge .loop
> +    REP_RET
> +%endmacro
> +
> +INIT_XMM sse2
> +VECTOR_DMUL_SCALAR
> +%if HAVE_AVX_EXTERNAL
> +INIT_YMM avx
> +VECTOR_DMUL_SCALAR
> +%endif
> diff --git a/libavutil/x86/float_dsp_init.c b/libavutil/x86/float_dsp_init.c
> index d14ec6a..b3b7ff4 100644
> --- a/libavutil/x86/float_dsp_init.c
> +++ b/libavutil/x86/float_dsp_init.c
> @@ -35,6 +35,11 @@ extern void ff_vector_fmac_scalar_avx(float *dst, const float *src, float mul,
>  extern void ff_vector_fmul_scalar_sse(float *dst, const float *src, float mul,
>                                        int len);
>  
> +extern void ff_vector_dmul_scalar_sse2(double *dst, const double *src,
> +                                       double mul, int len);
> +extern void ff_vector_dmul_scalar_avx(double *dst, const double *src,
> +                                      double mul, int len);
> +
>  void ff_float_dsp_init_x86(AVFloatDSPContext *fdsp)
>  {
>      int mm_flags = av_get_cpu_flags();
> @@ -44,8 +49,12 @@ void ff_float_dsp_init_x86(AVFloatDSPContext *fdsp)
>          fdsp->vector_fmac_scalar = ff_vector_fmac_scalar_sse;
>          fdsp->vector_fmul_scalar = ff_vector_fmul_scalar_sse;
>      }
> +    if (EXTERNAL_SSE2(mm_flags)) {
> +        fdsp->vector_dmul_scalar = ff_vector_dmul_scalar_sse2;
> +    }
>      if (EXTERNAL_AVX(mm_flags)) {
>          fdsp->vector_fmul = ff_vector_fmul_avx;
>          fdsp->vector_fmac_scalar = ff_vector_fmac_scalar_avx;
> +        fdsp->vector_dmul_scalar = ff_vector_dmul_scalar_avx;
>      }
>  }
> diff --git a/libavutil/x86/x86util.asm b/libavutil/x86/x86util.asm
> index a5d89a1..571cc02 100644
> --- a/libavutil/x86/x86util.asm
> +++ b/libavutil/x86/x86util.asm
> @@ -626,6 +626,17 @@
>  %endif
>  %endmacro
>  
> +%macro VBROADCASTSD 2 ; dst xmm/ymm, src m64
> +%if cpuflag(avx) && mmsize == 32
> +    vbroadcastsd %1, %2
> +%elif cpuflag(sse3)
> +    movddup      %1, %2
> +%else ; sse2
> +    movsd        %1, %2
> +    movlhps      %1, %1
> +%endif
> +%endmacro
> +
>  %macro SHUFFLE_MASK_W 8
>      %rep 8
>          %if %1>=0x80

ping

-Justin
_______________________________________________
libav-devel mailing list
libav-devel@libav.org
https://lists.libav.org/mailman/listinfo/libav-devel

Reply via email to