Re: [FFmpeg-devel] [PATCH v2] mdct15: add assembly optimizations for the 15-point FFT

James Almer Thu, 22 Jun 2017 15:05:46 -0700

On 6/22/2017 5:40 PM, Rostislav Pehlivanov wrote:
> c:    1802 decicycles in fft15,16774635 runs,   2581 skips
> fma3:  935 decicycles in fft15,16775893 runs,   1323 skips
> ---
>  libavcodec/mdct15.c          | 186 
> ++++++++++++++++++++++---------------------
>  libavcodec/mdct15.h          |  26 +++---
>  libavcodec/x86/Makefile      |   2 +
>  libavcodec/x86/mdct15.asm    | 148 ++++++++++++++++++++++++++++++++++
>  libavcodec/x86/mdct15_init.c |  91 +++++++++++++++++++++
>  5 files changed, 344 insertions(+), 109 deletions(-)
>  create mode 100644 libavcodec/x86/mdct15.asm
>  create mode 100644 libavcodec/x86/mdct15_init.c
> 
> diff --git a/libavcodec/mdct15.c b/libavcodec/mdct15.c
> index 8c42ece483..6eb428c250 100644
> --- a/libavcodec/mdct15.c
> +++ b/libavcodec/mdct15.c
> @@ -57,11 +57,6 @@ av_cold void ff_mdct15_uninit(MDCT15Context **ps)
>      av_freep(ps);
>  }
>  
> -static void mdct15(MDCT15Context *s, float *dst, const float *src, ptrdiff_t 
> stride);
> -
> -static void imdct15_half(MDCT15Context *s, float *dst, const float *src,
> -                         ptrdiff_t stride, float scale);
> -
>  static inline int init_pfa_reindex_tabs(MDCT15Context *s)
>  {
>      int i, j;
> @@ -93,88 +88,8 @@ static inline int init_pfa_reindex_tabs(MDCT15Context *s)
>      return 0;
>  }
>  
> -av_cold int ff_mdct15_init(MDCT15Context **ps, int inverse, int N, double 
> scale)
> -{
> -    MDCT15Context *s;
> -    double alpha, theta;
> -    int len2 = 15 * (1 << N);
> -    int len  = 2 * len2;
> -    int i;
> -
> -    /* Tested and verified to work on everything in between */
> -    if ((N < 2) || (N > 13))
> -        return AVERROR(EINVAL);
> -
> -    s = av_mallocz(sizeof(*s));
> -    if (!s)
> -        return AVERROR(ENOMEM);
> -
> -    s->fft_n = N - 1;
> -    s->len4 = len2 / 2;
> -    s->len2 = len2;
> -    s->inverse = inverse;
> -    s->mdct = mdct15;
> -    s->imdct_half = imdct15_half;
> -
> -    if (ff_fft_init(&s->ptwo_fft, N - 1, s->inverse) < 0)
> -        goto fail;
> -
> -    if (init_pfa_reindex_tabs(s))
> -        goto fail;
> -
> -    s->tmp  = av_malloc_array(len, 2 * sizeof(*s->tmp));
> -    if (!s->tmp)
> -        goto fail;
> -
> -    s->twiddle_exptab  = av_malloc_array(s->len4, 
> sizeof(*s->twiddle_exptab));
> -    if (!s->twiddle_exptab)
> -        goto fail;
> -
> -    theta = 0.125f + (scale < 0 ? s->len4 : 0);
> -    scale = sqrt(fabs(scale));
> -    for (i = 0; i < s->len4; i++) {
> -        alpha = 2 * M_PI * (i + theta) / len;
> -        s->twiddle_exptab[i].re = cos(alpha) * scale;
> -        s->twiddle_exptab[i].im = sin(alpha) * scale;
> -    }
> -
> -    /* 15-point FFT exptab */
> -    for (i = 0; i < 19; i++) {
> -        if (i < 15) {
> -            double theta = (2.0f * M_PI * i) / 15.0f;
> -            if (!s->inverse)
> -                theta *= -1;
> -            s->exptab[i].re = cos(theta);
> -            s->exptab[i].im = sin(theta);
> -        } else { /* Wrap around to simplify fft15 */
> -            s->exptab[i] = s->exptab[i - 15];
> -        }
> -    }
> -
> -    /* 5-point FFT exptab */
> -    s->exptab[19].re = cos(2.0f * M_PI / 5.0f);
> -    s->exptab[19].im = sin(2.0f * M_PI / 5.0f);
> -    s->exptab[20].re = cos(1.0f * M_PI / 5.0f);
> -    s->exptab[20].im = sin(1.0f * M_PI / 5.0f);
> -
> -    /* Invert the phase for an inverse transform, do nothing for a forward 
> transform */
> -    if (s->inverse) {
> -        s->exptab[19].im *= -1;
> -        s->exptab[20].im *= -1;
> -    }
> -
> -    *ps = s;
> -
> -    return 0;
> -
> -fail:
> -    ff_mdct15_uninit(&s);
> -    return AVERROR(ENOMEM);
> -}
> -
>  /* Stride is hardcoded to 3 */
> -static inline void fft5(const FFTComplex exptab[2], FFTComplex *out,
> -                        const FFTComplex *in)
> +static inline void fft5(FFTComplex *out, FFTComplex *in, FFTComplex 
> exptab[2])
>  {
>      FFTComplex z0[4], t[6];
>  
> @@ -219,14 +134,14 @@ static inline void fft5(const FFTComplex exptab[2], 
> FFTComplex *out,
>      out[4].im = in[0].im + z0[3].im;
>  }
>  
> -static void fft15(const FFTComplex exptab[22], FFTComplex *out, const 
> FFTComplex *in, size_t stride)
> +static void fft15_c(FFTComplex *out, FFTComplex *in, FFTComplex *exptab, 
> ptrdiff_t stride)
>  {
>      int k;
>      FFTComplex tmp1[5], tmp2[5], tmp3[5];
>  
> -    fft5(exptab + 19, tmp1, in + 0);
> -    fft5(exptab + 19, tmp2, in + 1);
> -    fft5(exptab + 19, tmp3, in + 2);
> +    fft5(tmp1, in + 0, exptab + 19);
> +    fft5(tmp2, in + 1, exptab + 19);
> +    fft5(tmp3, in + 2, exptab + 19);
>  
>      for (k = 0; k < 5; k++) {
>          FFTComplex t[2];
> @@ -269,7 +184,7 @@ static void mdct15(MDCT15Context *s, float *dst, const 
> float *src, ptrdiff_t str
>              }
>              CMUL(fft15in[j].re, fft15in[j].im, re, im, 
> s->twiddle_exptab[k].re, -s->twiddle_exptab[k].im);
>          }
> -        fft15(s->exptab, s->tmp + s->ptwo_fft.revtab[i], fft15in, l_ptwo);
> +        s->fft15(s->tmp + s->ptwo_fft.revtab[i], fft15in, s->exptab, l_ptwo);
>      }
>  
>      /* Then a 15xN FFT (where N is a power of two) */
> @@ -291,6 +206,8 @@ static void mdct15(MDCT15Context *s, float *dst, const 
> float *src, ptrdiff_t str
>      }
>  }
>  
> +#include "libavutil/timer.h"


Remove this.

> +
>  static void imdct15_half(MDCT15Context *s, float *dst, const float *src,
>                           ptrdiff_t stride, float scale)
>  {
> @@ -306,7 +223,9 @@ static void imdct15_half(MDCT15Context *s, float *dst, 
> const float *src,
>              FFTComplex tmp = { *(in2 - 2*k*stride), *(in1 + 2*k*stride) };
>              CMUL3(fft15in[j], tmp, s->twiddle_exptab[k]);
>          }
> -        fft15(s->exptab, s->tmp + s->ptwo_fft.revtab[i], fft15in, l_ptwo);
> +        START_TIMER("fft15");
> +        s->fft15(s->tmp + s->ptwo_fft.revtab[i], fft15in, s->exptab, l_ptwo);
> +        STOP_TIMER("fft15");

Also this, of course.

>      }
>  
>      /* Then a 15xN FFT (where N is a power of two) */
> @@ -327,3 +246,86 @@ static void imdct15_half(MDCT15Context *s, float *dst, 
> const float *src,
>          z[i0].im = scale * im1;
>      }
>  }
> +
> +av_cold int ff_mdct15_init(MDCT15Context **ps, int inverse, int N, double 
> scale)
> +{
> +    MDCT15Context *s;
> +    double alpha, theta;
> +    int len2 = 15 * (1 << N);
> +    int len  = 2 * len2;
> +    int i;
> +
> +    /* Tested and verified to work on everything in between */
> +    if ((N < 2) || (N > 13))
> +        return AVERROR(EINVAL);
> +
> +    s = av_mallocz(sizeof(*s));
> +    if (!s)
> +        return AVERROR(ENOMEM);
> +
> +    s->fft_n      = N - 1;
> +    s->len4       = len2 / 2;
> +    s->len2       = len2;
> +    s->inverse    = inverse;
> +    s->fft15      = fft15_c;
> +    s->mdct       = mdct15;
> +    s->imdct_half = imdct15_half;
> +
> +    if (ff_fft_init(&s->ptwo_fft, N - 1, s->inverse) < 0)
> +        goto fail;
> +
> +    if (init_pfa_reindex_tabs(s))
> +        goto fail;
> +
> +    s->tmp  = av_malloc_array(len, 2 * sizeof(*s->tmp));
> +    if (!s->tmp)
> +        goto fail;
> +
> +    s->twiddle_exptab  = av_malloc_array(s->len4, 
> sizeof(*s->twiddle_exptab));
> +    if (!s->twiddle_exptab)
> +        goto fail;
> +
> +    theta = 0.125f + (scale < 0 ? s->len4 : 0);
> +    scale = sqrt(fabs(scale));
> +    for (i = 0; i < s->len4; i++) {
> +        alpha = 2 * M_PI * (i + theta) / len;
> +        s->twiddle_exptab[i].re = cosf(alpha) * scale;
> +        s->twiddle_exptab[i].im = sinf(alpha) * scale;
> +    }
> +
> +    /* 15-point FFT exptab */
> +    for (i = 0; i < 19; i++) {
> +        if (i < 15) {
> +            double theta = (2.0f * M_PI * i) / 15.0f;
> +            if (!s->inverse)
> +                theta *= -1;
> +            s->exptab[i].re = cosf(theta);
> +            s->exptab[i].im = sinf(theta);
> +        } else { /* Wrap around to simplify fft15 */
> +            s->exptab[i] = s->exptab[i - 15];
> +        }
> +    }
> +
> +    /* 5-point FFT exptab */
> +    s->exptab[19].re = cosf(2.0f * M_PI / 5.0f);
> +    s->exptab[19].im = sinf(2.0f * M_PI / 5.0f);
> +    s->exptab[20].re = cosf(1.0f * M_PI / 5.0f);
> +    s->exptab[20].im = sinf(1.0f * M_PI / 5.0f);
> +
> +    /* Invert the phase for an inverse transform, do nothing for a forward 
> transform */
> +    if (s->inverse) {
> +        s->exptab[19].im *= -1;
> +        s->exptab[20].im *= -1;
> +    }
> +
> +    if (ARCH_X86)
> +        ff_mdct15_init_x86(s);
> +
> +    *ps = s;
> +
> +    return 0;
> +
> +fail:
> +    ff_mdct15_uninit(&s);
> +    return AVERROR(ENOMEM);
> +}
> diff --git a/libavcodec/mdct15.h b/libavcodec/mdct15.h
> index ef94edff6c..7d83f3ebdf 100644
> --- a/libavcodec/mdct15.h
> +++ b/libavcodec/mdct15.h
> @@ -34,34 +34,26 @@ typedef struct MDCT15Context {
>      int *pfa_postreindex;
>  
>      FFTContext ptwo_fft;
> -
>      FFTComplex *tmp;
> -
>      FFTComplex *twiddle_exptab;
>  
> -    /* 0 - 18: fft15 twiddles, 19 - 20: fft5 twiddles */
> -    FFTComplex exptab[21];
> +    DECLARE_ALIGNED(32, FFTComplex, exptab)[64];
>  
> -    /**
> -     * Calculate a full 2N -> N MDCT
> -     */
> +    /* 15-point FFT */
> +    void (*fft15)(FFTComplex *out, FFTComplex *in, FFTComplex *exptab, 
> ptrdiff_t stride);
> +
> +    /* Calculate a full 2N -> N MDCT */
>      void (*mdct)(struct MDCT15Context *s, float *dst, const float *src, 
> ptrdiff_t stride);
>  
> -    /**
> -     * Calculate the middle half of the iMDCT
> -     */
> +    /* Calculate the middle half of the iMDCT */
>      void (*imdct_half)(struct MDCT15Context *s, float *dst, const float *src,
>                         ptrdiff_t src_stride, float scale);
>  } MDCT15Context;
>  
> -/**
> - * Init an (i)MDCT of the length 2 * 15 * (2^N)
> - */
> +/* Init an (i)MDCT of the length 2 * 15 * (2^N) */
>  int ff_mdct15_init(MDCT15Context **ps, int inverse, int N, double scale);
> -
> -/**
> - * Frees a context
> - */
>  void ff_mdct15_uninit(MDCT15Context **ps);
>  
> +void ff_mdct15_init_x86(MDCT15Context *s);
> +
>  #endif /* AVCODEC_MDCT15_H */
> diff --git a/libavcodec/x86/Makefile b/libavcodec/x86/Makefile
> index e65118d134..7cfcc8d96a 100644
> --- a/libavcodec/x86/Makefile
> +++ b/libavcodec/x86/Makefile
> @@ -25,6 +25,7 @@ OBJS-$(CONFIG_HUFFYUVDSP)              += 
> x86/huffyuvdsp_init.o
>  OBJS-$(CONFIG_HUFFYUVENCDSP)           += x86/huffyuvencdsp_init.o
>  OBJS-$(CONFIG_IDCTDSP)                 += x86/idctdsp_init.o
>  OBJS-$(CONFIG_LPC)                     += x86/lpc.o
> +OBJS-$(CONFIG_MDCT15)                  += x86/mdct15_init.o
>  OBJS-$(CONFIG_ME_CMP)                  += x86/me_cmp_init.o
>  OBJS-$(CONFIG_MPEGAUDIODSP)            += x86/mpegaudiodsp.o
>  OBJS-$(CONFIG_MPEGVIDEO)               += x86/mpegvideo.o              \
> @@ -158,6 +159,7 @@ X86ASM-OBJS-$(CONFIG_HEVC_DECODER)     += 
> x86/hevc_add_res.o            \
>                                            x86/hevc_sao.o                \
>                                            x86/hevc_sao_10bit.o
>  X86ASM-OBJS-$(CONFIG_JPEG2000_DECODER) += x86/jpeg2000dsp.o
> +X86ASM-OBJS-$(CONFIG_MDCT15)           += x86/mdct15.o

Again, move it to the subsystem section.

>  X86ASM-OBJS-$(CONFIG_MLP_DECODER)      += x86/mlpdsp.o
>  X86ASM-OBJS-$(CONFIG_MPEG4_DECODER)    += x86/xvididct.o
>  X86ASM-OBJS-$(CONFIG_PNG_DECODER)      += x86/pngdsp.o
> diff --git a/libavcodec/x86/mdct15.asm b/libavcodec/x86/mdct15.asm
> new file mode 100644
> index 0000000000..ad39365b5b
> --- /dev/null
> +++ b/libavcodec/x86/mdct15.asm
> @@ -0,0 +1,148 @@
> +;******************************************************************************
> +;* SIMD optimized non-power-of-two MDCT functions
> +;*
> +;* Copyright (C) 2017 Rostislav Pehlivanov <atomnu...@gmail.com>
> +;*
> +;* This file is part of FFmpeg.
> +;*
> +;* FFmpeg is free software; you can redistribute it and/or
> +;* modify it under the terms of the GNU Lesser General Public
> +;* License as published by the Free Software Foundation; either
> +;* version 2.1 of the License, or (at your option) any later version.
> +;*
> +;* FFmpeg is distributed in the hope that it will be useful,
> +;* but WITHOUT ANY WARRANTY; without even the implied warranty of
> +;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +;* Lesser General Public License for more details.
> +;*
> +;* You should have received a copy of the GNU Lesser General Public
> +;* License along with FFmpeg; if not, write to the Free Software
> +;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 
> USA
> +;******************************************************************************
> +
> +%include "libavutil/x86/x86util.asm"
> +
> +SECTION_RODATA 32

No need to make it 32 aligned anymore.

> +
> +sign_adjust_5: dd 0x00000000, 0x80000000, 0x80000000, 0x00000000
> +
> +SECTION .text
> +
> +;                %1    %2     %3                 %4     %5        %6        
> %7
> +%macro FFT5 7 ; src, offset, dst1 (64bit used), dst2, exptab1, exptab2, 
> signadjust (uses m0-m4)
> +    movddup xm0, [%1q + 0*16 + 0 + %2]  ; in[ 0].re, in[ 0].im, in[ 0].re, 
> in[ 0].im
> +    movsd   xm1, [%1q + 1*16 + 8 + %2]  ; in[ 3].re, in[ 3].im,         0,   
>       0
> +    movups  xm2, [%1q + 2*16 + 8 + %2]  ; in[ 5].re, in[ 5].im, in[ 6].re, 
> in[ 6].im
> +    movups  xm3, [%1q + 4*16 + 0 + %2]  ; in[ 8].re, in[ 8].im, in[ 9].re, 
> in[ 9].im

Make these two movsd (Adding eight bytes to the offset)...

> +    movsd   xm4, [%1q + 6*16 + 0 + %2]  ; in[12].re, in[12].im,         0,   
>       0
> +
> +    vinsertf128  m0,  xm0, 1
> +
> +    shufps      xm1,  xm2, q3210        ; in[ 3].re, in[ 3].im, in[ 6].re, 
> in[ 6].im
> +    shufps      xm4,  xm3, q3210        ; in[12].re, in[12].im, in[ 9].re, 
> in[ 9].im

...and these q1010. It makes the function about 20 decicycles faster for me.

> +
> +    subps       xm2,  xm1, xm4          ; t[2].im, t[2].re, t[3].im, t[3].re
> +    addps       xm1,  xm4               ; t[0].re, t[0].im, t[1].re, t[1].im
> +
> +    movhlps     %3,   xm1               ; t[0].re, t[1].re, t[0].im, t[1].im
> +    addps       %3,   xm1
> +    addps       %3,   xm0               ; DC[0].re, DC[0].im, junk...
> +    movlhps     %3,   %3                ; DC[0].re, DC[0].im, DC[0].re, 
> DC[0].im
> +
> +    shufps      xm3,  xm1, xm2, q0110   ; t[0].re, t[0].im, t[2].re, t[2].im
> +    shufps      xm1,  xm2, q2332        ; t[1].re, t[1].im, t[3].re, t[3].im
> +
> +    mulps       xm%4, xm1, %5
> +    mulps       xm4,  xm3, %6
> +    mulps       xm1,  %6
> +
> +    xorps       xm1,  %7
> +    fmaddsubps  xm3,  xm3, %5, xm1      ; t[0].re, t[0].im, t[2].re, t[2].im

This is the only remaining fma instruction. At this point i assume it
alone is not going to make the function measurably faster than just
using mulps + addsubps.

IMO, just replace it and make the function AVX. That way more CPUs will
be able to run it (SandyBridge/Bulldozer onward, instead of
Haswell/PileDriver).

> +    subps       xm%4, xm4               ; t[4].re, t[4].im, t[5].re, t[5].im
> +
> +    movhlps     xm2, xm%4, xm3          ; t[2].re, t[2].im, t[5].re, t[5].im
> +    movlhps     xm3, xm%4               ; t[0].re, t[0].im, t[4].re, t[4].im
> +
> +    xorps       xm2,  %7
> +    addps       xm%4, xm2, xm3
> +    subps       xm3,  xm2
> +
> +    shufps      xm3,  xm3, q1032
> +    vinsertf128 m%4,  xm3, 1            ; All ACs (tmp[1] through to tmp[4])
> +    addps       m%4,  m0                ; Finally offset with DCs
> +%endmacro
> +
> +;                           %1          %2        %3    %4    %5   %6
> +%macro BUTTERFLIES_DC 6 ; exptab, exptab_offset, src1, src2, src3, out (uses 
> m0-m1)
> +    movaps m0, [%1q + %2]
> +    vextractf128 xm1, m0, 1
> +
> +    mulps   xm1, %5
> +    mulps   xm0, %4
> +
> +    haddps  xm0,  xm1
> +    movhlps xm1,  xm0                   ; t[0].re, t[1].re, t[0].im, t[1].im
> + 
> +    addps   xm0,  xm1
> +    addps   xm0,  %3
> +
> +    movsd [%6q], xm0
> +%endmacro
> +
> +;                           %1          %2        %3    %4    %5   %6
> +%macro BUTTERFLIES_AC 6 ; exptab, exptab_offset, src1, src2, src3, out (uses 
> m0-m3)
> +    mulps m0, %4, [%1q + 64*0 + 0*mmsize + %2]
> +    mulps m1, %4, [%1q + 64*0 + 1*mmsize + %2]
> +    mulps m2, %5, [%1q + 64*1 + 0*mmsize + %2]
> +    mulps m3, %5, [%1q + 64*1 + 1*mmsize + %2]
> +
> +    shufps m1, m1, q2301
> +    shufps m3, m3, q2301
> +
> +    addps m0, m1
> +    addps m2, m3
> +    addps m0, m2
> +    addps m0, %3
> +
> +    vextractf128 xm1, m0, 1
> +
> +    movlps [%6q + strideq*1], xm0
> +    movhps [%6q + strideq*2], xm0
> +    movlps [%6q +  stride3q], xm1
> +    movhps [%6q + strideq*4], xm1
> +%endmacro
> +
> +;******************************************************************************************
> +;void ff_fft15_fma3(FFTComplex *out, FFTComplex *in, FFTComplex *exptab, 
> ptrdiff_t stride);
> +;******************************************************************************************
> +INIT_YMM fma3
> +cglobal fft15, 4, 6, 14, out, in, exptab, stride, stride3, stride5
> +%define out0q inq
> +    shl strideq, 3
> +
> +    movaps m5, [exptabq + 480]
> +    vextractf128 xm6, m5, 1
> +    movaps xm7, [sign_adjust_5]
> +
> +    FFT5 in,  0,  xm8, 11, xm5, xm6, xm7
> +    FFT5 in,  8,  xm9, 12, xm5, xm6, xm7
> +    FFT5 in, 16, xm10, 13, xm5, xm6, xm7

The first and last three arguments never change, so why not just
hardcoding them in the macro?

> +
> +    lea stride3q, [strideq  + strideq*2]
> +    lea stride5q, [stride3q + strideq*2]
> +
> +    mov out0q, outq
> +
> +    BUTTERFLIES_DC exptab, (8*6 + 4*0)*2*4, xm8, xm9, xm10, out0
> +    lea outq, [out0q + stride5q*1]
> +    BUTTERFLIES_DC exptab, (8*6 + 4*1)*2*4, xm8, xm9, xm10, out
> +    lea outq, [out0q + stride5q*2]
> +    BUTTERFLIES_DC exptab, (8*6 + 4*2)*2*4, xm8, xm9, xm10, out
> +
> +    BUTTERFLIES_AC exptab, (8*0)*2*4, m11, m12, m13, out0
> +    lea outq, [out0q + stride5q*1]
> +    BUTTERFLIES_AC exptab, (8*2)*2*4, m11, m12, m13, out
> +    lea outq, [out0q + stride5q*2]
> +    BUTTERFLIES_AC exptab, (8*4)*2*4, m11, m12, m13, out

Same here.

> +
> +    RET
> diff --git a/libavcodec/x86/mdct15_init.c b/libavcodec/x86/mdct15_init.c
> new file mode 100644
> index 0000000000..39106d0047
> --- /dev/null
> +++ b/libavcodec/x86/mdct15_init.c
> @@ -0,0 +1,91 @@
> +/*
> + * SIMD optimized non-power-of-two MDCT functions
> + *
> + * Copyright (C) 2017 Rostislav Pehlivanov <atomnu...@gmail.com>
> + *
> + * This file is part of FFmpeg.
> + *
> + * FFmpeg is free software; you can redistribute it and/or
> + * modify it under the terms of the GNU Lesser General Public
> + * License as published by the Free Software Foundation; either
> + * version 2.1 of the License, or (at your option) any later version.
> + *
> + * FFmpeg is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> + * Lesser General Public License for more details.
> + *
> + * You should have received a copy of the GNU Lesser General Public
> + * License along with FFmpeg; if not, write to the Free Software
> + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 
> USA
> + */
> +
> +#include "config.h"
> +
> +#include "libavutil/x86/cpu.h"
> +#include "libavcodec/mdct15.h"
> +
> +void ff_fft15_fma3(FFTComplex *out, FFTComplex *in, FFTComplex *exptab, 
> ptrdiff_t stride);
> +
> +static void perm_twiddles(MDCT15Context *s)
> +{
> +    int k;
> +
> +    FFTComplex tmp[21], tmp2[30];
> +    memcpy(tmp, s->exptab, sizeof(FFTComplex)*21);
> +
> +    /* 15-point FFT twiddles */
> +    for (k = 0; k < 5; k++) {
> +        tmp2[6*k + 0] = tmp[k +  0];
> +        tmp2[6*k + 2] = tmp[k +  5];
> +        tmp2[6*k + 4] = tmp[k + 10];
> +
> +        tmp2[6*k + 1] = tmp[2 * (k + 0)];
> +        tmp2[6*k + 3] = tmp[2 * (k + 5)];
> +        tmp2[6*k + 5] = tmp[2 *  k + 5 ];
> +    }
> +
> +    for (k = 0; k < 6; k++) {
> +        FFTComplex ac_exp[] = {
> +            { tmp2[6*1 + k].re,  tmp2[6*1 + k].re },
> +            { tmp2[6*2 + k].re,  tmp2[6*2 + k].re },
> +            { tmp2[6*3 + k].re,  tmp2[6*3 + k].re },
> +            { tmp2[6*4 + k].re,  tmp2[6*4 + k].re },
> +            { tmp2[6*1 + k].im, -tmp2[6*1 + k].im },
> +            { tmp2[6*2 + k].im, -tmp2[6*2 + k].im },
> +            { tmp2[6*3 + k].im, -tmp2[6*3 + k].im },
> +            { tmp2[6*4 + k].im, -tmp2[6*4 + k].im },
> +        };
> +        memcpy(s->exptab + 8*k, ac_exp, 8*sizeof(FFTComplex));
> +    }
> +
> +    /* Specialcase when k = 0 */
> +    for (k = 0; k < 3; k++) {
> +        FFTComplex dc_exp[] = {
> +            { tmp2[2*k + 0].re, -tmp2[2*k + 0].im },
> +            { tmp2[2*k + 0].im,  tmp2[2*k + 0].re },
> +            { tmp2[2*k + 1].re, -tmp2[2*k + 1].im },
> +            { tmp2[2*k + 1].im,  tmp2[2*k + 1].re },
> +        };
> +        memcpy(s->exptab + 8*6 + 4*k, dc_exp, 4*sizeof(FFTComplex));
> +    }
> +
> +    /* 5-point FFT twiddles */
> +    FFTComplex exp_5point[] = {

/ffmpeg/src/libavcodec/x86/mdct15_init.c:74:5: warning: ISO C90 forbids
mixed declarations and code [-Wdeclaration-after-statement]
     FFTComplex exp_5point[] = {
     ^~~~~~~~~~

> +        { tmp[19].re, tmp[19].re },
> +        { tmp[19].im, tmp[19].im },
> +        { tmp[20].re, tmp[20].re },
> +        { tmp[20].im, tmp[20].im },
> +    };
> +    memcpy(s->exptab + 8*6 + 4*3, exp_5point, 4*sizeof(FFTComplex));
> +}
> +
> +av_cold void ff_mdct15_init_x86(MDCT15Context *s)
> +{
> +    int cpu_flags = av_get_cpu_flags();
> +
> +    if (ARCH_X86_64 && EXTERNAL_FMA3(cpu_flags)) {
> +        perm_twiddles(s);

perm_twiddles() should be called once. If more versions of fft15 were to
be added, you'll have to find a way to do that.

> +        s->fft15 = ff_fft15_fma3;
> +    }
> +}
> 

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
http://ffmpeg.org/mailman/listinfo/ffmpeg-devel

Re: [FFmpeg-devel] [PATCH v2] mdct15: add assembly optimizations for the 15-point FFT

Reply via email to