PR #22583 opened by hassanhany URL: https://code.ffmpeg.org/FFmpeg/FFmpeg/pulls/22583 Patch URL: https://code.ffmpeg.org/FFmpeg/FFmpeg/pulls/22583.patch
ac3_sum_square_butterfly_int32_c:     1186.9 ( 1.00x)
ac3_sum_square_butterfly_int32_sse4:   382.7 ( 3.10x)
ac3_sum_square_butterfly_float_c:     1114.2 ( 1.00x)
ac3_sum_square_butterfly_float_sse3:   373.0 ( 2.99x)

>From e7b506a096424f7ca962af85cc004c4f5d7dfdd0 Mon Sep 17 00:00:00 2001
From: Hassan Hany <[email protected]>
Date: Mon, 23 Mar 2026 00:45:25 +0200
Subject: [PATCH 1/2] avcodec/x86: implement ac3_sum_square_butterfly_int32_sse4

ac3_sum_square_butterfly_int32_c:     1186.9 ( 1.00x)
ac3_sum_square_butterfly_int32_sse4:   382.7 ( 3.10x)
---
 libavcodec/x86/ac3dsp.asm    | 73 ++++++++++++++++++++++++++++++++++++
 libavcodec/x86/ac3dsp_init.c |  4 ++
 2 files changed, 77 insertions(+)

diff --git a/libavcodec/x86/ac3dsp.asm b/libavcodec/x86/ac3dsp.asm
index 21f59708b7..fcf63f0d85 100644
--- a/libavcodec/x86/ac3dsp.asm
+++ b/libavcodec/x86/ac3dsp.asm
@@ -248,6 +248,79 @@ cglobal ac3_extract_exponents, 3, 3, 4, exp, coef, len
     RET
 %endmacro
 
+
+;------------------------------------------------------------------------------
+; void ff_ac3_sum_square_butterfly_int32_sse4(int64_t sum[4],
+;                                             const int32_t *coef0,
+;                                             const int32_t *coef1, int len)
+;
+; Writes sum[0] = sum(coef0[i]^2), sum[1] = sum(coef1[i]^2),
+; sum[2] = sum[0] + sum[1] + 2*sum(coef0[i]*coef1[i])  (sum of (c0+c1)^2) and
+; sum[3] = sum[0] + sum[1] - 2*sum(coef0[i]*coef1[i])  (sum of (c0-c1)^2).
+; NOTE: the loop consumes 4 coefficients per iteration and always runs at
+; least once, so len is assumed to be a positive multiple of 4.
+;------------------------------------------------------------------------------
+
+INIT_XMM sse4
+
+; SQUARE_AND_ACCUM src, acc, tmp
+; acc.q0 += src.d0^2 + src.d1^2;  acc.q1 += src.d2^2 + src.d3^2
+%macro SQUARE_AND_ACCUM 3
+    mova      %3, %1
+    pmuldq    %3, %3            ; signed squares of the even dwords (0 and 2)
+    paddq     %2, %3
+    pshufd    %3, %1, 0xb1      ; swap adjacent dwords: odd lanes -> even lanes
+    pmuldq    %3, %3            ; signed squares of the odd dwords (1 and 3)
+    paddq     %2, %3
+%endmacro
+
+; MUL_AND_ACCUM src0, src1, acc, tmp0, tmp1
+; acc.q0 += a.d0*b.d0 + a.d1*b.d1;  acc.q1 += a.d2*b.d2 + a.d3*b.d3
+%macro MUL_AND_ACCUM 5
+    mova      %4, %1
+    pmuldq    %4, %2            ; products of the even dwords
+    paddq     %3, %4
+    pshufd    %4, %1, 0xb1      ; bring the odd dwords of both inputs
+    pshufd    %5, %2, 0xb1      ; into the even lanes
+    pmuldq    %4, %5            ; products of the odd dwords
+    paddq     %3, %4
+%endmacro
+
+cglobal ac3_sum_square_butterfly_int32, 4, 4, 8, sum, coef0, coef1, len
+    pxor      m0, m0            ; m0 accumulates coef0^2
+    pxor      m1, m1            ; m1 accumulates coef1^2
+    pxor      m2, m2            ; m2 accumulates coef0*coef1
+    shl       lend, 2           ; len in bytes
+    add       coef0q, lenq      ; point past the end and index with a
+    add       coef1q, lenq      ; negative offset that counts up to zero
+    neg       lenq
+.loop:
+    movu      m4, [coef0q + lenq] ; unaligned: no 16-byte alignment is assumed
+    movu      m5, [coef1q + lenq]
+    SQUARE_AND_ACCUM m4, m0, m7
+    SQUARE_AND_ACCUM m5, m1, m6
+    MUL_AND_ACCUM    m4, m5, m2, m7, m6
+    add       lenq, 16
+    jl .loop
+    movhlps   m7, m0            ; fold the high qword of each accumulator
+    paddq     m0, m7            ; into its low qword
+    movhlps   m7, m1
+    paddq     m1, m7
+    movhlps   m7, m2
+    paddq     m2, m7
+    paddq     m2, m2            ; m2 = 2 * sum(coef0*coef1)
+    mova      m7, m0
+    paddq     m7, m1            ; m7 = sum[0] + sum[1]
+    mova      m3, m7
+    psubq     m3, m2            ; m3 = sum[0] + sum[1] - 2*sum(coef0*coef1)
+    paddq     m2, m7            ; m2 = sum[0] + sum[1] + 2*sum(coef0*coef1)
+    movq      [sumq], m0
+    movq      [sumq+8], m1
+    movq      [sumq+16], m2
+    movq      [sumq+24], m3
+    RET
+
+
 %if HAVE_SSE2_EXTERNAL
 INIT_XMM sse2
 AC3_EXTRACT_EXPONENTS
diff --git a/libavcodec/x86/ac3dsp_init.c b/libavcodec/x86/ac3dsp_init.c
index 353cf38f86..068c90359e 100644
--- a/libavcodec/x86/ac3dsp_init.c
+++ b/libavcodec/x86/ac3dsp_init.c
@@ -31,6 +31,7 @@ void ff_float_to_fixed24_avx (int32_t *dst, const float *src, size_t len);
 
 int ff_ac3_compute_mantissa_size_sse2(uint16_t mant_cnt[6][16]);
 
+void ff_ac3_sum_square_butterfly_int32_sse4(int64_t sum[4], const int32_t *coef0, const int32_t *coef1, int len);
 void ff_ac3_extract_exponents_sse2 (uint8_t *exp, int32_t *coef, int nb_coefs);
 void ff_ac3_extract_exponents_ssse3(uint8_t *exp, int32_t *coef, int nb_coefs);
 
@@ -49,6 +50,9 @@ av_cold void ff_ac3dsp_init_x86(AC3DSPContext *c)
         if (!(cpu_flags & AV_CPU_FLAG_ATOM))
             c->extract_exponents = ff_ac3_extract_exponents_ssse3;
     }
+    if (EXTERNAL_SSE4(cpu_flags)) {
+        c->sum_square_butterfly_int32 = ff_ac3_sum_square_butterfly_int32_sse4;
+    }
     if (EXTERNAL_AVX_FAST(cpu_flags)) {
         c->float_to_fixed24 = ff_float_to_fixed24_avx;
     }
-- 
2.52.0


>From 7b11d9a1240fe2db34d2b040d9f901bd5447341f Mon Sep 17 00:00:00 2001
From: Hassan Hany <[email protected]>
Date: Mon, 23 Mar 2026 01:27:32 +0200
Subject: [PATCH 2/2] avcodec/x86: Implement ac3_sum_square_butterfly_float_sse3

ac3_sum_square_butterfly_float_c:     1114.2 ( 1.00x)
ac3_sum_square_butterfly_float_sse3:   373.0 ( 2.99x)
---
 libavcodec/x86/ac3dsp.asm    | 44 ++++++++++++++++++++++++++++++++++++
 libavcodec/x86/ac3dsp_init.c |  4 ++++
 2 files changed, 48 insertions(+)

diff --git a/libavcodec/x86/ac3dsp.asm b/libavcodec/x86/ac3dsp.asm
index fcf63f0d85..7b286c1774 100644
--- a/libavcodec/x86/ac3dsp.asm
+++ b/libavcodec/x86/ac3dsp.asm
@@ -320,6 +320,50 @@ cglobal ac3_sum_square_butterfly_int32, 4, 4, 8, sum, coef0, coef1, len
     movq      [sumq+24], m3
     RET
 
+;------------------------------------------------------------------------------
+; void ff_ac3_sum_square_butterfly_float_sse3(float sum[4],
+;                                             const float *coef0,
+;                                             const float *coef1, int len)
+;
+; Writes sum[0] = sum(coef0[i]^2), sum[1] = sum(coef1[i]^2),
+; sum[2] = sum((coef0[i]+coef1[i])^2), sum[3] = sum((coef1[i]-coef0[i])^2).
+; NOTE: the loop consumes 4 coefficients per iteration and always runs at
+; least once, so len is assumed to be a positive multiple of 4.
+;------------------------------------------------------------------------------
+
+INIT_XMM sse3
+
+cglobal ac3_sum_square_butterfly_float, 4, 4, 8, sum, coef0, coef1, len
+    pxor      m0, m0            ; m0 accumulates coef0^2
+    pxor      m1, m1            ; m1 accumulates coef1^2
+    pxor      m2, m2            ; m2 accumulates (coef0+coef1)^2
+    pxor      m3, m3            ; m3 accumulates (coef1-coef0)^2
+    shl       lend, 2           ; len in bytes
+    add       coef0q, lenq      ; point past the end and index with a
+    add       coef1q, lenq      ; negative offset that counts up to zero
+    neg       lenq
+.loop:
+    movups    m4, [coef0q + lenq] ; unaligned: no 16-byte alignment is assumed
+    movups    m5, [coef1q + lenq]
+    movaps    m6, m4
+    addps     m6, m5            ; coef0 + coef1
+    movaps    m7, m5
+    subps     m7, m4            ; coef1 - coef0
+    mulps     m4, m4
+    mulps     m5, m5
+    mulps     m6, m6
+    mulps     m7, m7
+    addps     m0, m4
+    addps     m1, m5
+    addps     m2, m6
+    addps     m3, m7
+    add       lenq, 16
+    jl .loop
+    haddps    m0, m1            ; three horizontal adds reduce the four
+    haddps    m2, m3            ; accumulators to one float each, ordered
+    haddps    m0, m2            ; sum[0], sum[1], sum[2], sum[3]
+    movups    [sumq], m0        ; unaligned: sum[] alignment is not assumed
+    RET
 
 %if HAVE_SSE2_EXTERNAL
 INIT_XMM sse2
diff --git a/libavcodec/x86/ac3dsp_init.c b/libavcodec/x86/ac3dsp_init.c
index 068c90359e..e8ab1945b7 100644
--- a/libavcodec/x86/ac3dsp_init.c
+++ b/libavcodec/x86/ac3dsp_init.c
@@ -32,6 +32,7 @@ void ff_float_to_fixed24_avx (int32_t *dst, const float *src, size_t len);
 int ff_ac3_compute_mantissa_size_sse2(uint16_t mant_cnt[6][16]);
 
 void ff_ac3_sum_square_butterfly_int32_sse4(int64_t sum[4], const int32_t *coef0, const int32_t *coef1, int len);
+void ff_ac3_sum_square_butterfly_float_sse3(float sum[4], const float *coef0, const float *coef1, int len);
 void ff_ac3_extract_exponents_sse2 (uint8_t *exp, int32_t *coef, int nb_coefs);
 void ff_ac3_extract_exponents_ssse3(uint8_t *exp, int32_t *coef, int nb_coefs);
 
@@ -50,6 +51,9 @@ av_cold void ff_ac3dsp_init_x86(AC3DSPContext *c)
         if (!(cpu_flags & AV_CPU_FLAG_ATOM))
             c->extract_exponents = ff_ac3_extract_exponents_ssse3;
     }
+    if (EXTERNAL_SSE3(cpu_flags)) {
+        c->sum_square_butterfly_float = ff_ac3_sum_square_butterfly_float_sse3;
+    }
     if (EXTERNAL_SSE4(cpu_flags)) {
         c->sum_square_butterfly_int32 = ff_ac3_sum_square_butterfly_int32_sse4;
     }
-- 
2.52.0
_______________________________________________ ffmpeg-devel mailing list -- [email protected] To unsubscribe send an email to [email protected]
