On Sun, Oct 30, 2011 at 05:15:47AM -0400, Justin Ruggles wrote:
> ---
> Benchmarks for the two calls combined.
>
> Athlon64:
> C - 11851
> SSE - 6603
>
> SandyBridge:
> C - 5655
> SSE - 2380
>
> libavcodec/dsputil.c | 13 +++++++++++++
> libavcodec/dsputil.h | 17 +++++++++++++++++
> libavcodec/twinvq.c | 34 ++++++++++++++++------------------
> libavcodec/x86/dsputil_mmx.c | 4 ++++
> libavcodec/x86/dsputil_yasm.asm | 31 +++++++++++++++++++++++++++++++
> 5 files changed, 81 insertions(+), 18 deletions(-)
>
> diff --git a/libavcodec/dsputil.c b/libavcodec/dsputil.c
> index 182063c..9123857 100644
> --- a/libavcodec/dsputil.c
> +++ b/libavcodec/dsputil.c
> @@ -2509,6 +2509,18 @@ static void butterflies_float_c(float *restrict v1,
> float *restrict v2,
> }
> }
>
> +static void butterflies_float_interleave_c(float *dst, const float *src0,
> + const float *src1, int len)
> +{
> + int i;
> + for (i = 0; i < len; i++) {
> + float f1 = src0[i];
> + float f2 = src1[i];
> + dst[2*i ] = f1 + f2;
> + dst[2*i + 1] = f1 - f2;
> + }
> +}
> +
> static float scalarproduct_float_c(const float *v1, const float *v2, int len)
> {
> float p = 0.0;
> @@ -3036,6 +3048,7 @@ av_cold void dsputil_init(DSPContext* c, AVCodecContext
> *avctx)
> c->vector_clip_int32 = vector_clip_int32_c;
> c->scalarproduct_float = scalarproduct_float_c;
> c->butterflies_float = butterflies_float_c;
> + c->butterflies_float_interleave = butterflies_float_interleave_c;
> c->vector_fmul_scalar = vector_fmul_scalar_c;
> c->vector_fmac_scalar = vector_fmac_scalar_c;
>
> diff --git a/libavcodec/dsputil.h b/libavcodec/dsputil.h
> index acb2041..587a54d 100644
> --- a/libavcodec/dsputil.h
> +++ b/libavcodec/dsputil.h
> @@ -453,6 +453,23 @@ typedef struct DSPContext {
> */
> void (*butterflies_float)(float *restrict v1, float *restrict v2, int
> len);
>
> + /**
> + * Calculate the sum and difference of two vectors of floats and interleave
> + * results into a separate output vector of floats, with each sum
> + * positioned before the corresponding difference.
> + *
> + * @param dst output vector
> + * constraints: 16-byte aligned
> + * @param src0 first input vector
> + * constraints: 16-byte aligned
> + * @param src1 second input vector
> + * constraints: 16-byte aligned
> + * @param len number of elements in the input
> + * constraints: multiple of 4
> + */
> + void (*butterflies_float_interleave)(float *dst, const float *src0,
> + const float *src1, int len);
> +
> /* (I)DCT */
> void (*fdct)(DCTELEM *block/* align 16*/);
> void (*fdct248)(DCTELEM *block/* align 16*/);
> diff --git a/libavcodec/twinvq.c b/libavcodec/twinvq.c
> index 73eb7c1..a285156 100644
> --- a/libavcodec/twinvq.c
> +++ b/libavcodec/twinvq.c
> @@ -665,8 +665,9 @@ static void imdct_output(TwinContext *tctx, enum
> FrameType ftype, int wtype,
> float *out)
> {
> const ModeTab *mtab = tctx->mtab;
> + int size1, size2;
> float *prev_buf = tctx->prev_frame + tctx->last_block_pos[0];
> - int i, j;
> + int i;
>
> for (i = 0; i < tctx->avctx->channels; i++) {
> imdct_and_window(tctx, ftype, wtype,
> @@ -675,27 +676,24 @@ static void imdct_output(TwinContext *tctx, enum
> FrameType ftype, int wtype,
> i);
> }
>
> + size2 = tctx->last_block_pos[0];
> + size1 = mtab->size - size2;
> if (tctx->avctx->channels == 2) {
> - for (i = 0; i < mtab->size - tctx->last_block_pos[0]; i++) {
> - float f1 = prev_buf[ i];
> - float f2 = prev_buf[2*mtab->size + i];
> - out[2*i ] = f1 + f2;
> - out[2*i + 1] = f1 - f2;
> - }
> - for (j = 0; i < mtab->size; j++,i++) {
> - float f1 = tctx->curr_frame[ j];
> - float f2 = tctx->curr_frame[2*mtab->size + j];
> - out[2*i ] = f1 + f2;
> - out[2*i + 1] = f1 - f2;
> - }
> + tctx->dsp.butterflies_float_interleave(out, prev_buf,
> + &prev_buf[2*mtab->size],
> + size1);
> +
> + out += 2 * size1;
> +
> + tctx->dsp.butterflies_float_interleave(out, tctx->curr_frame,
> + &tctx->curr_frame[2*mtab->size],
> + size2);
> } else {
> - memcpy(out, prev_buf,
> - (mtab->size - tctx->last_block_pos[0]) * sizeof(*out));
> + memcpy(out, prev_buf, size1 * sizeof(*out));
>
> - out += mtab->size - tctx->last_block_pos[0];
> + out += size1;
>
> - memcpy(out, tctx->curr_frame,
> - (tctx->last_block_pos[0]) * sizeof(*out));
> + memcpy(out, tctx->curr_frame, size2 * sizeof(*out));
> }
>
> }
> diff --git a/libavcodec/x86/dsputil_mmx.c b/libavcodec/x86/dsputil_mmx.c
> index 959a2c2..8e0376d 100644
> --- a/libavcodec/x86/dsputil_mmx.c
> +++ b/libavcodec/x86/dsputil_mmx.c
> @@ -2424,6 +2424,9 @@ void ff_vector_clip_int32_sse2_int(int32_t *dst, const
> int32_t *src, int32_t min
> void ff_vector_clip_int32_sse41 (int32_t *dst, const int32_t *src, int32_t
> min,
> int32_t max, unsigned int len);
>
> +extern void ff_butterflies_float_interleave_sse(float *dst, const float *src0,
> + const float *src1, int len);
> +
> void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx)
> {
> int mm_flags = av_get_cpu_flags();
> @@ -2868,6 +2871,7 @@ void dsputil_init_mmx(DSPContext* c, AVCodecContext
> *avctx)
> c->vector_clipf = vector_clipf_sse;
> #if HAVE_YASM
> c->scalarproduct_float = ff_scalarproduct_float_sse;
> + c->butterflies_float_interleave = ff_butterflies_float_interleave_sse;
> #endif
> }
> if (HAVE_AMD3DNOW && (mm_flags & AV_CPU_FLAG_3DNOW))
> diff --git a/libavcodec/x86/dsputil_yasm.asm b/libavcodec/x86/dsputil_yasm.asm
> index fe96d8b..d1b9ce3 100644
> --- a/libavcodec/x86/dsputil_yasm.asm
> +++ b/libavcodec/x86/dsputil_yasm.asm
> @@ -1123,3 +1123,34 @@ VECTOR_CLIP_INT32 sse41, 11, 1, 1
> %else
> VECTOR_CLIP_INT32 sse41, 6, 1, 0
> %endif
> +
> +;-----------------------------------------------------------------------------
> +; void ff_butterflies_float_interleave(float *dst, const float *src0,
> +; const float *src1, int len);
> +;-----------------------------------------------------------------------------
> +
> +INIT_XMM
> +cglobal butterflies_float_interleave_sse, 4,4,4, dst, src0, src1, len
> + test lenq, lenq
> + jz .end
> + shl lenq, 2
> + lea src0q, [src0q + lenq]
> + lea src1q, [src1q + lenq]
> + lea dstq, [ dstq + 2*lenq]
> + neg lenq
> +.loop:
> + mova m0, [src0q + lenq]
> + mova m2, [src1q + lenq]
> + mova m1, m0
> + mova m3, m2
> + addps m0, m2
> + subps m1, m3
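Use "subps m1, m2" here and drop the "mova m3, m2" above: addps only writes m0, so m2 is still intact and the extra copy in m3 is unnecessary.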
> + mova m2, m0
> + unpcklps m0, m1
> + unpckhps m2, m1
> + mova [dstq + 2*lenq ], m0
> + mova [dstq + 2*lenq + mmsize], m2
> + add lenq, mmsize
> + jl .loop
> +.end:
> + REP_RET
> --
Apart from the nit above, in general - LGTM.
_______________________________________________
libav-devel mailing list
[email protected]
https://lists.libav.org/mailman/listinfo/libav-devel