PR #22583 opened by hassanhany
URL: https://code.ffmpeg.org/FFmpeg/FFmpeg/pulls/22583
Patch URL: https://code.ffmpeg.org/FFmpeg/FFmpeg/pulls/22583.patch

ac3_sum_square_butterfly_int32_c:                     1186.9 ( 1.00x)
ac3_sum_square_butterfly_int32_sse4:                   382.7 ( 3.10x)
ac3_sum_square_butterfly_float_c:                     1114.2 ( 1.00x)
ac3_sum_square_butterfly_float_ssse3:                  373.0 ( 2.99x)



>From e7b506a096424f7ca962af85cc004c4f5d7dfdd0 Mon Sep 17 00:00:00 2001
From: Hassan Hany <[email protected]>
Date: Mon, 23 Mar 2026 00:45:25 +0200
Subject: [PATCH 1/2] avcodec/x86: implement
 ac3_sum_square_butterfly_int32_sse4

ac3_sum_square_butterfly_int32_c:                     1186.9 ( 1.00x)
ac3_sum_square_butterfly_int32_sse4:                   382.7 ( 3.10x)
---
 libavcodec/x86/ac3dsp.asm    | 66 ++++++++++++++++++++++++++++++++++++
 libavcodec/x86/ac3dsp_init.c |  4 +++
 2 files changed, 70 insertions(+)

diff --git a/libavcodec/x86/ac3dsp.asm b/libavcodec/x86/ac3dsp.asm
index 21f59708b7..fcf63f0d85 100644
--- a/libavcodec/x86/ac3dsp.asm
+++ b/libavcodec/x86/ac3dsp.asm
@@ -248,6 +248,72 @@ cglobal ac3_extract_exponents, 3, 3, 4, exp, coef, len
     RET
 %endmacro
 
+
+;------------------------------------------------------------------------------
+;void ff_ac3_sum_square_butterfly_int32_sse4(int64_t sum[4], const int32_t *coef0, const int32_t *coef1, int len)
+;------------------------------------------------------------------------------
+
+
+INIT_XMM sse4
+
+%macro SQUARE_AND_ACCUM 3 ; %1 = 4 x int32 source, %2 = 2 x int64 accumulator, %3 = scratch (clobbered)
+    pshufd    %3, %1, 0xb1              ; swap dwords within each qword: lanes 1, 3 -> even slots
+    pmuldq    %3, %3                    ; signed 32x32->64 squares of lanes 1 and 3
+    paddq     %2, %3
+    movdqa    %3, %1                    ; keep %1 intact; pmuldq reads the even dwords only
+    pmuldq    %3, %3                    ; signed 32x32->64 squares of lanes 0 and 2
+    paddq     %2, %3
+%endmacro
+
+; Add the four signed 64-bit products %1[i] * %2[i] into the 2 x int64 sums in %3.
+%macro MUL_AND_ACCUM 5 ; %1 = source a, %2 = source b, %3 = accumulator, %4/%5 = scratch (clobbered)
+    movdqa    %4, %1
+    pmuldq    %4, %2                    ; a0*b0 and a2*b2 (pmuldq uses the even dwords)
+    paddq     %3, %4
+    pshufd    %4, %1, 0xb1              ; lanes 1, 3 of a -> even slots
+    pshufd    %5, %2, 0xb1              ; same for b; shuffle straight from %2, no copy needed
+    pmuldq    %4, %5                    ; a1*b1 and a3*b3
+    paddq     %3, %4
+%endmacro
+
+
+cglobal ac3_sum_square_butterfly_int32, 4, 4, 8, sum, coef0, coef1, len
+    ; m0 / m1 / m2 accumulate sum(c0*c0), sum(c1*c1), sum(c0*c1) as 2 x int64 each
+    pxor      m0, m0
+    pxor      m1, m1
+    pxor      m2, m2
+    shl       lend, 2                   ; element count -> byte count
+    add       coef0q, lenq              ; point both inputs at their end ...
+    add       coef1q, lenq
+    neg       lenq                      ; ... and walk a negative offset up to 0
+.loop:                                  ; NOTE(review): runs at least once, 4 coefs per pass - confirm len > 0, len % 4 == 0
+    movdqu    m4, [coef0q + lenq]       ; unaligned loads: the C prototype does not
+    movdqu    m5, [coef1q + lenq]       ; promise 16-byte aligned coef0/coef1
+    SQUARE_AND_ACCUM m4, m0, m7
+    SQUARE_AND_ACCUM m5, m1, m6
+    MUL_AND_ACCUM    m4, m5, m2, m7, m6
+    add       lenq, 16
+    jl        .loop
+    movhlps   m7, m0                    ; fold the high qword onto the low qword
+    paddq     m0, m7
+    movhlps   m7, m1
+    paddq     m1, m7
+    movhlps   m7, m2
+    paddq     m2, m7
+    ; from here on only the low qword of each register is meaningful
+    paddq     m2, m2                    ; 2 * sum(c0*c1)
+    movdqa    m7, m0
+    paddq     m7, m1                    ; sum(c0*c0) + sum(c1*c1)
+    movdqa    m3, m7
+    psubq     m3, m2                    ; sum[3] = sum((c0 - c1)^2), expanded
+    paddq     m2, m7                    ; sum[2] = sum((c0 + c1)^2), expanded
+    movq      [sumq],    m0
+    movq      [sumq+8],  m1
+    movq      [sumq+16], m2
+    movq      [sumq+24], m3
+    RET
+
+
 %if HAVE_SSE2_EXTERNAL
 INIT_XMM sse2
 AC3_EXTRACT_EXPONENTS
diff --git a/libavcodec/x86/ac3dsp_init.c b/libavcodec/x86/ac3dsp_init.c
index 353cf38f86..068c90359e 100644
--- a/libavcodec/x86/ac3dsp_init.c
+++ b/libavcodec/x86/ac3dsp_init.c
@@ -31,6 +31,7 @@ void ff_float_to_fixed24_avx  (int32_t *dst, const float 
*src, size_t len);
 
 int ff_ac3_compute_mantissa_size_sse2(uint16_t mant_cnt[6][16]);
 
+void ff_ac3_sum_square_butterfly_int32_sse4(int64_t sum[4],const int32_t 
*coef0, const int32_t *coef1, int len);
 void ff_ac3_extract_exponents_sse2 (uint8_t *exp, int32_t *coef, int nb_coefs);
 void ff_ac3_extract_exponents_ssse3(uint8_t *exp, int32_t *coef, int nb_coefs);
 
@@ -49,6 +50,9 @@ av_cold void ff_ac3dsp_init_x86(AC3DSPContext *c)
         if (!(cpu_flags & AV_CPU_FLAG_ATOM))
             c->extract_exponents = ff_ac3_extract_exponents_ssse3;
     }
+    if (EXTERNAL_SSE4(cpu_flags)) {
+        c->sum_square_butterfly_int32 = ff_ac3_sum_square_butterfly_int32_sse4;
+    }
     if (EXTERNAL_AVX_FAST(cpu_flags)) {
         c->float_to_fixed24 = ff_float_to_fixed24_avx;
     }
-- 
2.52.0


>From 7b11d9a1240fe2db34d2b040d9f901bd5447341f Mon Sep 17 00:00:00 2001
From: Hassan Hany <[email protected]>
Date: Mon, 23 Mar 2026 01:27:32 +0200
Subject: [PATCH 2/2] avcodec/x86: Implement
 ac3_sum_square_butterfly_float_sse3

ac3_sum_square_butterfly_float_c:                     1114.2 ( 1.00x)
ac3_sum_square_butterfly_float_ssse3:                  373.0 ( 2.99x)
---
 libavcodec/x86/ac3dsp.asm    | 38 ++++++++++++++++++++++++++++++++++++
 libavcodec/x86/ac3dsp_init.c |  2 ++
 2 files changed, 40 insertions(+)

diff --git a/libavcodec/x86/ac3dsp.asm b/libavcodec/x86/ac3dsp.asm
index fcf63f0d85..7b286c1774 100644
--- a/libavcodec/x86/ac3dsp.asm
+++ b/libavcodec/x86/ac3dsp.asm
@@ -313,6 +313,44 @@ cglobal ac3_sum_square_butterfly_int32, 4, 4, 8, sum, 
coef0, coef1, len
     movq      [sumq+24], m3
     RET
 
+;------------------------------------------------------------------------------
+;void ff_ac3_sum_square_butterfly_float_sse3(float sum[4], const float *coef0, const float *coef1, int len);
+;------------------------------------------------------------------------------
+
+
+INIT_XMM sse3
+; m0..m3 each hold 4 partial float sums for sum[0]..sum[3]; haddps reduces them at the end.
+cglobal ac3_sum_square_butterfly_float, 4, 4, 8, sum, coef0, coef1, len
+    pxor      m0, m0
+    pxor      m1, m1
+    pxor      m2, m2
+    pxor      m3, m3
+    shl       lend, 2                   ; element count -> byte count
+    add       coef0q, lenq              ; point both inputs at their end ...
+    add       coef1q, lenq
+    neg       lenq                      ; ... and walk a negative offset up to 0
+.loop:                                  ; NOTE(review): runs at least once, 4 floats per pass - confirm len > 0, len % 4 == 0
+    movups    m4, [coef0q + lenq]       ; unaligned loads: the C prototype does not
+    movups    m5, [coef1q + lenq]       ; promise 16-byte aligned coef0/coef1
+    movaps    m6, m4
+    addps     m6, m5                    ; c0 + c1
+    movaps    m7, m5
+    subps     m7, m4                    ; c1 - c0, whose square equals (c0 - c1)^2
+    mulps     m4, m4
+    mulps     m5, m5
+    mulps     m6, m6
+    mulps     m7, m7
+    addps     m0, m4
+    addps     m1, m5
+    addps     m2, m6
+    addps     m3, m7
+    add       lenq, 16
+    jl        .loop
+    haddps    m0, m1                    ; { m0 pair sums, m1 pair sums }
+    haddps    m2, m3                    ; { m2 pair sums, m3 pair sums }
+    haddps    m0, m2                    ; m0 = { total0, total1, total2, total3 }
+    movups    [sumq], m0                ; float sum[4] is not guaranteed 16-byte aligned
+    RET
 
 %if HAVE_SSE2_EXTERNAL
 INIT_XMM sse2
diff --git a/libavcodec/x86/ac3dsp_init.c b/libavcodec/x86/ac3dsp_init.c
index 068c90359e..e8ab1945b7 100644
--- a/libavcodec/x86/ac3dsp_init.c
+++ b/libavcodec/x86/ac3dsp_init.c
@@ -32,6 +32,7 @@ void ff_float_to_fixed24_avx  (int32_t *dst, const float 
*src, size_t len);
 int ff_ac3_compute_mantissa_size_sse2(uint16_t mant_cnt[6][16]);
 
 void ff_ac3_sum_square_butterfly_int32_sse4(int64_t sum[4],const int32_t 
*coef0, const int32_t *coef1, int len);
+void ff_ac3_sum_square_butterfly_float_sse3(float sump[4], const float *coef0, 
const float *coef1, int len);
 void ff_ac3_extract_exponents_sse2 (uint8_t *exp, int32_t *coef, int nb_coefs);
 void ff_ac3_extract_exponents_ssse3(uint8_t *exp, int32_t *coef, int nb_coefs);
 
@@ -49,6 +50,7 @@ av_cold void ff_ac3dsp_init_x86(AC3DSPContext *c)
     if (EXTERNAL_SSSE3(cpu_flags)) {
         if (!(cpu_flags & AV_CPU_FLAG_ATOM))
             c->extract_exponents = ff_ac3_extract_exponents_ssse3;
+        c->sum_square_butterfly_float = ff_ac3_sum_square_butterfly_float_sse3;
     }
     if (EXTERNAL_SSE4(cpu_flags)) {
         c->sum_square_butterfly_int32 = ff_ac3_sum_square_butterfly_int32_sse4;
-- 
2.52.0

_______________________________________________
ffmpeg-devel mailing list -- [email protected]
To unsubscribe send an email to [email protected]

Reply via email to