vp3dsp: Port ff_put_vp_no_rnd_pixels8_l2_mmx to SSE2

Andreas Rheinhardt via ffmpeg-cvslog Sun, 19 Apr 2026 00:04:01 -0700

This is an automated email from the git hooks/post-receive script.

Git pushed a commit to branch master
in repository ffmpeg.


commit 84b9de0633850721e5a3fecf906535226fd19ee9
Author:     Andreas Rheinhardt <[email protected]>
AuthorDate: Sun Apr 12 22:39:04 2026 +0200
Commit:     Andreas Rheinhardt <[email protected]>
CommitDate: Sun Apr 19 08:15:54 2026 +0200

    avcodec/x86/vp3dsp: Port ff_put_vp_no_rnd_pixels8_l2_mmx to SSE2
    
    This allows using pavgb to reduce the number of instructions needed
    to calculate the average; processing two rows at once via movhps
    reduces the number of pxor and pavgb instructions even further and
    turned out to be beneficial.
    This patch also avoids a load, as the constant used here can easily
    be generated at runtime.
    
    Old benchmarks:
    put_no_rnd_pixels_l2_c:                                 13.3 ( 1.00x)
    put_no_rnd_pixels_l2_mmx:                               11.6 ( 1.15x)
    
    New benchmarks:
    put_no_rnd_pixels_l2_c:                                 13.4 ( 1.00x)
    put_no_rnd_pixels_l2_sse2:                               7.5 ( 1.77x)
    
    Signed-off-by: Andreas Rheinhardt <[email protected]>
---
 libavcodec/x86/vp3dsp.asm    | 50 +++++++++++++++++++-------------------------
 libavcodec/x86/vp3dsp_init.c | 12 +++++------
 tests/checkasm/vp3dsp.c      |  2 +-
 3 files changed, 28 insertions(+), 36 deletions(-)

diff --git a/libavcodec/x86/vp3dsp.asm b/libavcodec/x86/vp3dsp.asm
index b79477288a..87943cb302 100644
--- a/libavcodec/x86/vp3dsp.asm
+++ b/libavcodec/x86/vp3dsp.asm
@@ -34,7 +34,6 @@ vp3_idct_data: times 8 dw 64277
                times 8 dw 12785
 
 cextern pb_80
-cextern pb_FE
 
 cextern pw_4
 cextern pw_8
@@ -155,40 +154,35 @@ cglobal vp3_h_loop_filter, 3, 4, 6
     RET
 
 %macro PAVGB_NO_RND 0
-    mova   m4, m0
-    mova   m5, m2
-    pand   m4, m1
-    pand   m5, m3
-    pxor   m1, m0
-    pxor   m3, m2
-    pand   m1, m6
-    pand   m3, m6
-    psrlq  m1, 1
-    psrlq  m3, 1
-    paddb  m4, m1
-    paddb  m5, m3
+    pxor          m0, m4
+    pxor          m1, m4
+    pavgb         m0, m1
+    pxor          m0, m4
 %endmacro
 
-INIT_MMX mmx
-cglobal put_vp_no_rnd_pixels8_l2, 5, 6, 0, dst, src1, src2, stride, h, stride3
-    mova   m6, [pb_FE]
+INIT_XMM sse2
+; void ff_vp3_put_no_rnd_pixels8_l2_sse2(uint8_t *dst, const uint8_t *a,
+;                                        const uint8_t *b, ptrdiff_t stride,
+;                                        int h)
+cglobal vp3_put_no_rnd_pixels8_l2, 5, 6, 5, dst, src1, src2, stride, h, stride3
     lea    stride3q,[strideq+strideq*2]
+    pcmpeqb       m4, m4
 .loop:
-    mova   m0, [src1q]
-    mova   m1, [src2q]
-    mova   m2, [src1q+strideq]
-    mova   m3, [src2q+strideq]
+    movq          m0, [src1q]
+    movq          m1, [src2q]
+    movhps        m0, [src1q+strideq]
+    movhps        m1, [src2q+strideq]
     PAVGB_NO_RND
-    mova   [dstq], m4
-    mova   [dstq+strideq], m5
+    movq      [dstq], m0
+    movhps [dstq+strideq], m0
 
-    mova   m0, [src1q+strideq*2]
-    mova   m1, [src2q+strideq*2]
-    mova   m2, [src1q+stride3q]
-    mova   m3, [src2q+stride3q]
+    movq          m0, [src1q+strideq*2]
+    movq          m1, [src2q+strideq*2]
+    movhps        m0, [src1q+stride3q]
+    movhps        m1, [src2q+stride3q]
     PAVGB_NO_RND
-    mova   [dstq+strideq*2], m4
-    mova   [dstq+stride3q],  m5
+    movq   [dstq+strideq*2], m0
+    movhps [dstq+stride3q],  m0
 
     lea    src1q, [src1q+strideq*4]
     lea    src2q, [src2q+strideq*4]
diff --git a/libavcodec/x86/vp3dsp_init.c b/libavcodec/x86/vp3dsp_init.c
index 42daf99981..4a416c6f9a 100644
--- a/libavcodec/x86/vp3dsp_init.c
+++ b/libavcodec/x86/vp3dsp_init.c
@@ -36,23 +36,21 @@ void ff_vp3_v_loop_filter_sse2(uint8_t *src, ptrdiff_t stride,
 void ff_vp3_h_loop_filter_sse2(uint8_t *src, ptrdiff_t stride,
                                int *bounding_values);
 
-void ff_put_vp_no_rnd_pixels8_l2_mmx(uint8_t *dst, const uint8_t *a,
-                                     const uint8_t *b, ptrdiff_t stride,
-                                     int h);
+void ff_vp3_put_no_rnd_pixels8_l2_sse2(uint8_t *dst, const uint8_t *a,
+                                       const uint8_t *b, ptrdiff_t stride,
+                                       int h);
 
 av_cold void ff_vp3dsp_init_x86(VP3DSPContext *c)
 {
     int cpu_flags = av_get_cpu_flags();
 
-    if (EXTERNAL_MMX(cpu_flags)) {
-        c->put_no_rnd_pixels_l2 = ff_put_vp_no_rnd_pixels8_l2_mmx;
-    }
-
     if (EXTERNAL_MMXEXT(cpu_flags)) {
         c->idct_dc_add = ff_vp3_idct_dc_add_mmxext;
     }
 
     if (EXTERNAL_SSE2(cpu_flags)) {
+        c->put_no_rnd_pixels_l2 = ff_vp3_put_no_rnd_pixels8_l2_sse2;
+
         c->idct_put  = ff_vp3_idct_put_sse2;
         c->idct_add  = ff_vp3_idct_add_sse2;
 
diff --git a/tests/checkasm/vp3dsp.c b/tests/checkasm/vp3dsp.c
index 44910b515b..b48a7514de 100644
--- a/tests/checkasm/vp3dsp.c
+++ b/tests/checkasm/vp3dsp.c
@@ -47,7 +47,7 @@ static void vp3_check_put_no_rnd_pixels_l2(const VP3DSPContext *const vp3dsp)
         BUF_SIZE     = MAX_STRIDE * (HEIGHT - 1) + WIDTH,
         SRC_BUF_SIZE = BUF_SIZE + (WIDTH - 1), ///< WIDTH-1 to use misaligned input
     };
-    declare_func_emms(AV_CPU_FLAG_MMX, void, uint8_t *dst,
+    declare_func(void, uint8_t *dst,
                  const uint8_t *a, const uint8_t *b,
                  ptrdiff_t stride, int h);
 

_______________________________________________
ffmpeg-cvslog mailing list -- [email protected]
To unsubscribe send an email to [email protected]

[FFmpeg-cvslog] [ffmpeg] 02/07: avcodec/x86/vp3dsp: Port ff_put_vp_no_rnd_pixels8_l2_mmx to SSE2

Reply via email to