This is an automated email from the git hooks/post-receive script.

Git pushed a commit to branch master
in repository ffmpeg.

commit 415b466d41ac81856abc76d7a9341132b0f668b0
Author:     Andreas Rheinhardt <[email protected]>
AuthorDate: Sun Apr 12 23:50:52 2026 +0200
Commit:     Andreas Rheinhardt <[email protected]>
CommitDate: Sun Apr 19 08:21:17 2026 +0200

    avcodec/x86/vp3dsp: Port ff_vp3_idct_dc_add_mmxext to SSE2
    
    This change should improve performance on Skylake and later
    Intel CPUs (which have only half the ports for saturated adds/subs
    for mmx registers compared to xmm registers): llvm-mca predicts
    a 25% performance improvement on Skylake.
    
    Signed-off-by: Andreas Rheinhardt <[email protected]>
---
 libavcodec/x86/vp3dsp.asm    | 12 ++++++------
 libavcodec/x86/vp3dsp_init.c |  7 ++-----
 tests/checkasm/vp3dsp.c      |  2 +-
 3 files changed, 9 insertions(+), 12 deletions(-)

diff --git a/libavcodec/x86/vp3dsp.asm b/libavcodec/x86/vp3dsp.asm
index 096c47057a..e9bc184800 100644
--- a/libavcodec/x86/vp3dsp.asm
+++ b/libavcodec/x86/vp3dsp.asm
@@ -1,5 +1,5 @@
 ;******************************************************************************
-;* MMX/SSE2-optimized functions for the VP3 decoder
+;* SSE2-optimized functions for the VP3 decoder
 ;* Copyright (c) 2007 Aurelien Jacobs <[email protected]>
 ;*
 ;* This file is part of FFmpeg.
@@ -21,7 +21,7 @@
 
 %include "libavutil/x86/x86util.asm"
 
-; MMX-optimized functions cribbed from the original VP3 source code.
+; ASM-optimized functions cribbed from the original VP3 source code.
 
 SECTION_RODATA
 
@@ -421,9 +421,9 @@ cglobal vp3_idct_add, 3, 4, 9, dst, s, block, s3
     movq [dstq+s3q ], m5
 %endmacro
 
-INIT_MMX mmxext
-; void ff_vp3_idct_dc_add_mmxext(uint8_t *dest, ptrdiff_t stride, int16_t *block)
-cglobal vp3_idct_dc_add, 3, 4, 0, dst, s, block, dc
+INIT_XMM sse2
+; void ff_vp3_idct_dc_add_sse2(uint8_t *dest, ptrdiff_t stride, int16_t *block)
+cglobal vp3_idct_dc_add, 3, 4, 6, dst, s, block, dc
     movsx        dcd, word [blockq]
     mov word [blockq], 0
 %define s3q blockq
@@ -431,7 +431,7 @@ cglobal vp3_idct_dc_add, 3, 4, 0, dst, s, block, dc
     add          dcd, 15
     sar          dcd, 5
     movd          m0, dcd
-    pshufw        m0, m0, 0x0
+    SPLATW        m0, m0
     pxor          m1, m1
     psubw         m1, m0
     packuswb      m0, m0
diff --git a/libavcodec/x86/vp3dsp_init.c b/libavcodec/x86/vp3dsp_init.c
index 4a416c6f9a..0ba4f00ed6 100644
--- a/libavcodec/x86/vp3dsp_init.c
+++ b/libavcodec/x86/vp3dsp_init.c
@@ -29,7 +29,7 @@
 void ff_vp3_idct_put_sse2(uint8_t *dest, ptrdiff_t stride, int16_t *block);
 void ff_vp3_idct_add_sse2(uint8_t *dest, ptrdiff_t stride, int16_t *block);
 
-void ff_vp3_idct_dc_add_mmxext(uint8_t *dest, ptrdiff_t stride, int16_t *block);
+void ff_vp3_idct_dc_add_sse2(uint8_t *dest, ptrdiff_t stride, int16_t *block);
 
 void ff_vp3_v_loop_filter_sse2(uint8_t *src, ptrdiff_t stride,
                                int *bounding_values);
@@ -44,15 +44,12 @@ av_cold void ff_vp3dsp_init_x86(VP3DSPContext *c)
 {
     int cpu_flags = av_get_cpu_flags();
 
-    if (EXTERNAL_MMXEXT(cpu_flags)) {
-        c->idct_dc_add = ff_vp3_idct_dc_add_mmxext;
-    }
-
     if (EXTERNAL_SSE2(cpu_flags)) {
         c->put_no_rnd_pixels_l2 = ff_vp3_put_no_rnd_pixels8_l2_sse2;
 
         c->idct_put  = ff_vp3_idct_put_sse2;
         c->idct_add  = ff_vp3_idct_add_sse2;
+        c->idct_dc_add = ff_vp3_idct_dc_add_sse2;
 
         c->v_loop_filter = c->v_loop_filter_unaligned = ff_vp3_v_loop_filter_sse2;
         c->h_loop_filter = c->h_loop_filter_unaligned = ff_vp3_h_loop_filter_sse2;
diff --git a/tests/checkasm/vp3dsp.c b/tests/checkasm/vp3dsp.c
index f75e4a0617..7890364ee5 100644
--- a/tests/checkasm/vp3dsp.c
+++ b/tests/checkasm/vp3dsp.c
@@ -92,7 +92,7 @@ static void vp3_check_idct(int nb_bits)
         BUF_SIZE     = MAX_STRIDE * (NB_LINES - 1) + WIDTH,
     };
 
-    declare_func_emms(AV_CPU_FLAG_MMXEXT, void, uint8_t *dest, ptrdiff_t stride, int16_t *block);
+    declare_func(void, uint8_t *dest, ptrdiff_t stride, int16_t *block);
 
     DECLARE_ALIGNED(16, int16_t, block_new)[64];
     DECLARE_ALIGNED(16, int16_t, block_ref)[64];

_______________________________________________
ffmpeg-cvslog mailing list -- [email protected]
To unsubscribe send an email to [email protected]

Reply via email to