vc1dsp_mc: Add SSSE3 version, remove superseded MMX version (PR #23020)

mkver via ffmpeg-devel Tue, 05 May 2026 03:53:00 -0700

PR #23020 opened by mkver
URL: https://code.ffmpeg.org/FFmpeg/FFmpeg/pulls/23020
Patch URL: https://code.ffmpeg.org/FFmpeg/FFmpeg/pulls/23020.patch


For now, this covers only the purely horizontal and vertical functions. The 
mixed ones will be added soon.


>From 9e5aa77d72b642b70968a4e049472a803c23ae69 Mon Sep 17 00:00:00 2001
From: Andreas Rheinhardt <[email protected]>
Date: Wed, 15 Oct 2025 16:29:08 +0200
Subject: [PATCH 01/11] tests/checkasm/vc1dsp: Improve mspel test

Up until now, only the fullpel functions (i.e. the ones without pixel
interpolation) have been tested at all.

Signed-off-by: Andreas Rheinhardt <[email protected]>
---
 tests/checkasm/vc1dsp.c | 78 +++++++++++++++++++++++++++++++----------
 1 file changed, 59 insertions(+), 19 deletions(-)

diff --git a/tests/checkasm/vc1dsp.c b/tests/checkasm/vc1dsp.c
index dda6d36257..63112256f0 100644
--- a/tests/checkasm/vc1dsp.c
+++ b/tests/checkasm/vc1dsp.c
@@ -441,34 +441,74 @@ static void check_unescape(void)
 
 static void check_mspel_pixels(void)
 {
-    LOCAL_ALIGNED_16(uint8_t, src0, [32 * 32]);
-    LOCAL_ALIGNED_16(uint8_t, src1, [32 * 32]);
-    LOCAL_ALIGNED_16(uint8_t, dst0, [32 * 32]);
-    LOCAL_ALIGNED_16(uint8_t, dst1, [32 * 32]);
+    enum {
+        MAX_BLOCK_SIZE = 16,
+        MAX_STRIDE     = 64,
+        /// BUF_SIZE is bigger than necessary in order to test strides > block width.
+        BUF_SIZE       = (MAX_BLOCK_SIZE - 1) * MAX_STRIDE + MAX_BLOCK_SIZE,
+        /**
+         * Due to qpel interpolation the input needs one extra line at the top
+         * and two at the bottom; horizontal interpolation also needs one pixel
+         * to the left and two to the right. At least the x86 implementation
+         * actually accesses three pixels to the right.
+         * The input is not subject to alignment requirements; making the input buffer
+         * bigger (by MAX_BLOCK_SIZE - 1) allows us to use a random misalignment.
+         */
+        INPUT_BUF_SIZE = (MAX_BLOCK_SIZE - 1) + 1 +
+                         (MAX_BLOCK_SIZE + 1 + 2 - 1) * MAX_STRIDE + 
MAX_BLOCK_SIZE + 2 + 1,
+    };
+    DECLARE_ALIGNED(16, uint8_t, dstbuf0)[BUF_SIZE];
+    DECLARE_ALIGNED(16, uint8_t, dstbuf1)[BUF_SIZE];
+    uint8_t srcbuf0[INPUT_BUF_SIZE];
+    uint8_t srcbuf1[INPUT_BUF_SIZE];
 
     VC1DSPContext h;
 
-    const test tests[] = {
-        VC1DSP_SIZED_TEST(put_vc1_mspel_pixels_tab[0][0], 16, 16)
-        VC1DSP_SIZED_TEST(put_vc1_mspel_pixels_tab[1][0], 8, 8)
-        VC1DSP_SIZED_TEST(avg_vc1_mspel_pixels_tab[0][0], 16, 16)
-        VC1DSP_SIZED_TEST(avg_vc1_mspel_pixels_tab[1][0], 8, 8)
+    const struct MSPelTest {
+        const char *name;
+        size_t offset;
+    } tests[] = {
+#define MSPEL_TEST(elem) { .name = #elem, offsetof(VC1DSPContext, elem) }
+        MSPEL_TEST(put_vc1_mspel_pixels_tab),
+        MSPEL_TEST(avg_vc1_mspel_pixels_tab),
     };
 
     ff_vc1dsp_init(&h);
 
     for (size_t t = 0; t < FF_ARRAY_ELEMS(tests); ++t) {
-        void (*func)(uint8_t *, const uint8_t*, ptrdiff_t, int) = *(void 
**)((intptr_t) &h + tests[t].offset);
-        if (check_func(func, "vc1dsp.%s", tests[t].name)) {
-            declare_func_emms(AV_CPU_FLAG_MMX, void, uint8_t *, const 
uint8_t*, ptrdiff_t, int);
-            RANDOMIZE_BUFFER8(dst, 32 * 32);
-            RANDOMIZE_BUFFER8(src, 32 * 32);
-            call_ref(dst0, src0, 32, 0);
-            call_new(dst1, src1, 32, 0);
-            if (memcmp(dst0, dst1, 32 * 32)) {
-                fail();
+        const vc1op_pixels_func (*func)[16] = 
(vc1op_pixels_func(*)[16])((char*)&h + tests[t].offset);
+        for (unsigned j = 0; j < 2; ++j) {
+            const unsigned blocksize = 16 >> j;
+
+            for (unsigned dxy = 0; dxy < 16; ++dxy) {
+                if (check_func(func[j][dxy], "vc1dsp.%s_mc%u%u_%u", 
tests[t].name, dxy & 3, dxy >> 2, blocksize)) {
+                    declare_func_emms(AV_CPU_FLAG_MMX, void, uint8_t *, const 
uint8_t*, ptrdiff_t, int);
+                    size_t dst_offset = (rnd() % (MAX_BLOCK_SIZE / blocksize)) 
* blocksize;
+                    ptrdiff_t stride  = (rnd() % (MAX_STRIDE / blocksize) + 1) 
* blocksize;
+                    size_t src_offset = 1 + stride + rnd() % MAX_BLOCK_SIZE;
+                    const uint8_t *src0 = srcbuf0 + src_offset, *src1 = 
srcbuf1 + src_offset;
+                    uint8_t *dst0 = dstbuf0 + dst_offset, *dst1 = dstbuf1 + 
dst_offset;
+
+                    if (rnd() & 1) {
+                        // Flip stride.
+                        dst1  += (blocksize - 1) * stride;
+                        dst0  += (blocksize - 1) * stride;
+                        // We need one line above src and two lines below the block,
+                        // hence blocksize * stride.
+                        src0  += blocksize * stride;
+                        src1  += blocksize * stride;
+                        stride = -stride;
+                    }
+                    RANDOMIZE_BUFFER8(dstbuf, sizeof(dstbuf0));
+                    RANDOMIZE_BUFFER8(srcbuf, sizeof(srcbuf0));
+                    call_ref(dst0, src0, stride, 0);
+                    call_new(dst1, src1, stride, 0);
+                    if (memcmp(dstbuf0, dstbuf1, sizeof(dstbuf0))) {
+                        fail();
+                    }
+                    bench_new(dst1, src1, stride, 0);
+                }
             }
-            bench_new(dst1, src0, 32, 0);
         }
     }
 }
-- 
2.52.0


>From 457d06272bbd4d7557ff17b6c7fbe22450e8361d Mon Sep 17 00:00:00 2001
From: Andreas Rheinhardt <[email protected]>
Date: Sun, 12 Oct 2025 15:14:37 +0200
Subject: [PATCH 02/11] avcodec/x86/vc1dsp_mc: Remove unnecessary movsxdifnidn

Forgotten in 5a49097b42cbc3eab888d15a91eeaf5520b5c381
merging Libav commit 2ec9fa5ec60dcd10e1cb10d8b4e4437e634ea428
(which changed the stride to ptrdiff_t).

Signed-off-by: Andreas Rheinhardt <[email protected]>
---
 libavcodec/x86/vc1dsp_mc.asm | 1 -
 1 file changed, 1 deletion(-)

diff --git a/libavcodec/x86/vc1dsp_mc.asm b/libavcodec/x86/vc1dsp_mc.asm
index c1b3ed1bc3..b5f6bcec9b 100644
--- a/libavcodec/x86/vc1dsp_mc.asm
+++ b/libavcodec/x86/vc1dsp_mc.asm
@@ -194,7 +194,6 @@ HOR_16B_SHIFT2 OP_AVG, avg
 %endif ; HAVE_MMX_INLINE
 
 %macro INV_TRANS_INIT 0
-    movsxdifnidn linesizeq, linesized
     movd       m0, blockd
     SPLATW     m0, m0
     pxor       m1, m1
-- 
2.52.0


>From c6be9ff77d60bf7cf013367b4215b2668ec1e906 Mon Sep 17 00:00:00 2001
From: Andreas Rheinhardt <[email protected]>
Date: Mon, 4 May 2026 17:45:13 +0200
Subject: [PATCH 03/11] tests/checkasm/vc1dsp: Fix shadowing

Signed-off-by: Andreas Rheinhardt <[email protected]>
---
 tests/checkasm/vc1dsp.c | 30 +++++++++++++++---------------
 1 file changed, 15 insertions(+), 15 deletions(-)

diff --git a/tests/checkasm/vc1dsp.c b/tests/checkasm/vc1dsp.c
index 63112256f0..ece810aab1 100644
--- a/tests/checkasm/vc1dsp.c
+++ b/tests/checkasm/vc1dsp.c
@@ -306,18 +306,18 @@ static void check_inv_trans_adding(void)
 
     ff_vc1dsp_init(&h);
 
-    for (size_t t = 0; t < FF_ARRAY_ELEMS(tests); ++t) {
-        void (*func)(uint8_t *, ptrdiff_t, int16_t *) = *(void **)((intptr_t) 
&h + tests[t].offset);
-        if (check_func(func, "vc1dsp.%s", tests[t].name)) {
+    for (size_t k = 0; k < FF_ARRAY_ELEMS(tests); ++k) {
+        void (*func)(uint8_t *, ptrdiff_t, int16_t *) = *(void **)((intptr_t) 
&h + tests[k].offset);
+        if (check_func(func, "vc1dsp.%s", tests[k].name)) {
             matrix *coeffs;
             declare_func_emms(AV_CPU_FLAG_MMX, void, uint8_t *, ptrdiff_t, 
int16_t *);
             RANDOMIZE_BUFFER16(inv_trans_in, 8 * 8);
             RANDOMIZE_BUFFER8(inv_trans_out, 10 * 24);
-            coeffs = 
generate_inverse_quantized_transform_coefficients(tests[t].width, 
tests[t].height);
-            for (int j = 0; j < tests[t].height; ++j)
-                for (int i = 0; i < tests[t].width; ++i) {
+            coeffs = 
generate_inverse_quantized_transform_coefficients(tests[k].width, 
tests[k].height);
+            for (int j = 0; j < tests[k].height; ++j)
+                for (int i = 0; i < tests[k].width; ++i) {
                     int idx = j * 8 + i;
-                    inv_trans_in1[idx] = inv_trans_in0[idx] = coeffs->d[j * 
tests[t].width + i];
+                    inv_trans_in1[idx] = inv_trans_in0[idx] = coeffs->d[j * 
tests[k].width + i];
                 }
             call_ref(inv_trans_out0 + 24 + 8, 24, inv_trans_in0);
             call_new(inv_trans_out1 + 24 + 8, 24, inv_trans_in1);
@@ -352,10 +352,10 @@ static void check_loop_filter(void)
 
     ff_vc1dsp_init(&h);
 
-    for (size_t t = 0; t < FF_ARRAY_ELEMS(tests); ++t) {
-        void (*func)(uint8_t *, ptrdiff_t, int) = *(void **)((intptr_t) &h + 
tests[t].offset);
+    for (size_t k = 0; k < FF_ARRAY_ELEMS(tests); ++k) {
+        void (*func)(uint8_t *, ptrdiff_t, int) = *(void **)((intptr_t) &h + 
tests[k].offset);
         declare_func_emms(AV_CPU_FLAG_MMX, void, uint8_t *, ptrdiff_t, int);
-        if (check_func(func, "vc1dsp.%s", tests[t].name)) {
+        if (check_func(func, "vc1dsp.%s", tests[k].name)) {
             for (int count = 1000; count > 0; --count) {
                 int pq = rnd() % 31 + 1;
                 RANDOMIZE_BUFFER8_MID_WEIGHTED(filter_buf, 24 * 48);
@@ -368,9 +368,9 @@ static void check_loop_filter(void)
         for (int j = 0; j < 24; ++j)
             for (int i = 0; i < 48; ++i)
                 filter_buf1[j * 48 + i] = 0x60 + 0x40 * (i >= 16 && j >= 4);
-        if (check_func(func, "vc1dsp.%s_bestcase", tests[t].name))
+        if (check_func(func, "vc1dsp.%s_bestcase", tests[k].name))
             bench_new(filter_buf1 + 4 * 48 + 16, 48, 1);
-        if (check_func(func, "vc1dsp.%s_worstcase", tests[t].name))
+        if (check_func(func, "vc1dsp.%s_worstcase", tests[k].name))
             bench_new(filter_buf1 + 4 * 48 + 16, 48, 31);
     }
 }
@@ -475,13 +475,13 @@ static void check_mspel_pixels(void)
 
     ff_vc1dsp_init(&h);
 
-    for (size_t t = 0; t < FF_ARRAY_ELEMS(tests); ++t) {
-        const vc1op_pixels_func (*func)[16] = 
(vc1op_pixels_func(*)[16])((char*)&h + tests[t].offset);
+    for (size_t k = 0; k < FF_ARRAY_ELEMS(tests); ++k) {
+        const vc1op_pixels_func (*func)[16] = 
(vc1op_pixels_func(*)[16])((char*)&h + tests[k].offset);
         for (unsigned j = 0; j < 2; ++j) {
             const unsigned blocksize = 16 >> j;
 
             for (unsigned dxy = 0; dxy < 16; ++dxy) {
-                if (check_func(func[j][dxy], "vc1dsp.%s_mc%u%u_%u", 
tests[t].name, dxy & 3, dxy >> 2, blocksize)) {
+                if (check_func(func[j][dxy], "vc1dsp.%s_mc%u%u_%u", 
tests[k].name, dxy & 3, dxy >> 2, blocksize)) {
                     declare_func_emms(AV_CPU_FLAG_MMX, void, uint8_t *, const 
uint8_t*, ptrdiff_t, int);
                     size_t dst_offset = (rnd() % (MAX_BLOCK_SIZE / blocksize)) 
* blocksize;
                     ptrdiff_t stride  = (rnd() % (MAX_STRIDE / blocksize) + 1) 
* blocksize;
-- 
2.52.0


>From 25decfcd2e4c3edede90329c01106e72ec4bccfe Mon Sep 17 00:00:00 2001
From: Andreas Rheinhardt <[email protected]>
Date: Mon, 4 May 2026 18:31:19 +0200
Subject: [PATCH 04/11] avcodec/x86/vc1dsp_mc: Move inverse transform into a
 file of its own

Signed-off-by: Andreas Rheinhardt <[email protected]>
---
 libavcodec/x86/Makefile             |   3 +-
 libavcodec/x86/vc1dsp_inv_trans.asm | 121 ++++++++++++++++++++++++++++
 libavcodec/x86/vc1dsp_mc.asm        |  97 ----------------------
 3 files changed, 123 insertions(+), 98 deletions(-)
 create mode 100644 libavcodec/x86/vc1dsp_inv_trans.asm

diff --git a/libavcodec/x86/Makefile b/libavcodec/x86/Makefile
index e87cb750f4..5d746d24c8 100644
--- a/libavcodec/x86/Makefile
+++ b/libavcodec/x86/Makefile
@@ -126,7 +126,8 @@ X86ASM-OBJS-$(CONFIG_QPELDSP)          += x86/qpeldsp.o     
            \
                                           x86/fpel.o                    \
                                           x86/qpel.o
 X86ASM-OBJS-$(CONFIG_RV34DSP)          += x86/rv34dsp.o
-X86ASM-OBJS-$(CONFIG_VC1DSP)           += x86/vc1dsp_loopfilter.o       \
+X86ASM-OBJS-$(CONFIG_VC1DSP)           += x86/vc1dsp_inv_trans.o        \
+                                          x86/vc1dsp_loopfilter.o       \
                                           x86/vc1dsp_mc.o x86/fpel.o
 ifdef ARCH_X86_64
 X86ASM-OBJS-$(CONFIG_IDCTDSP)          += x86/simple_idct10.o
diff --git a/libavcodec/x86/vc1dsp_inv_trans.asm 
b/libavcodec/x86/vc1dsp_inv_trans.asm
new file mode 100644
index 0000000000..e1b74de6c4
--- /dev/null
+++ b/libavcodec/x86/vc1dsp_inv_trans.asm
@@ -0,0 +1,121 @@
+;******************************************************************************
+;* VC1 inverse transform
+;* Copyright (c) 2009 Fiona Glaser
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with FFmpeg; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;******************************************************************************
+
+%include "libavutil/x86/x86util.asm"
+
+SECTION .text
+
+%macro INV_TRANS_INIT 0
+    movd       m0, blockd
+    SPLATW     m0, m0
+    pxor       m1, m1
+    psubw      m1, m0
+    packuswb   m0, m0
+    packuswb   m1, m1
+
+    DEFINE_ARGS dest, linesize, linesize3
+    lea    linesize3q, [linesizeq*3]
+%endmacro
+
+%macro INV_TRANS_PROCESS 1
+    mov%1                  m2, [destq+linesizeq*0]
+    mov%1                  m3, [destq+linesizeq*1]
+    mov%1                  m4, [destq+linesizeq*2]
+    mov%1                  m5, [destq+linesize3q]
+    paddusb                m2, m0
+    paddusb                m3, m0
+    paddusb                m4, m0
+    paddusb                m5, m0
+    psubusb                m2, m1
+    psubusb                m3, m1
+    psubusb                m4, m1
+    psubusb                m5, m1
+    mov%1 [linesizeq*0+destq], m2
+    mov%1 [linesizeq*1+destq], m3
+    mov%1 [linesizeq*2+destq], m4
+    mov%1 [linesize3q +destq], m5
+%endmacro
+
+; ff_vc1_inv_trans_?x?_dc_mmxext(uint8_t *dest, ptrdiff_t linesize, int16_t *block)
+INIT_MMX mmxext
+cglobal vc1_inv_trans_4x4_dc, 3,4,0, dest, linesize, block
+    movsx         r3d, WORD [blockq]
+    mov        blockd, r3d             ; dc
+    shl        blockd, 4               ; 16 * dc
+    lea        blockd, [blockq+r3+4]   ; 17 * dc + 4
+    sar        blockd, 3               ; >> 3
+    mov           r3d, blockd          ; dc
+    shl        blockd, 4               ; 16 * dc
+    lea        blockd, [blockq+r3+64]  ; 17 * dc + 64
+    sar        blockd, 7               ; >> 7
+
+    INV_TRANS_INIT
+
+    INV_TRANS_PROCESS h
+    RET
+
+INIT_MMX mmxext
+cglobal vc1_inv_trans_4x8_dc, 3,4,0, dest, linesize, block
+    movsx         r3d, WORD [blockq]
+    mov        blockd, r3d             ; dc
+    shl        blockd, 4               ; 16 * dc
+    lea        blockd, [blockq+r3+4]   ; 17 * dc + 4
+    sar        blockd, 3               ; >> 3
+    shl        blockd, 2               ;  4 * dc
+    lea        blockd, [blockq*3+64]   ; 12 * dc + 64
+    sar        blockd, 7               ; >> 7
+
+    INV_TRANS_INIT
+
+    INV_TRANS_PROCESS h
+    lea         destq, [destq+linesizeq*4]
+    INV_TRANS_PROCESS h
+    RET
+
+INIT_MMX mmxext
+cglobal vc1_inv_trans_8x4_dc, 3,4,0, dest, linesize, block
+    movsx      blockd, WORD [blockq]   ; dc
+    lea        blockd, [blockq*3+1]    ;  3 * dc + 1
+    sar        blockd, 1               ; >> 1
+    mov           r3d, blockd          ; dc
+    shl        blockd, 4               ; 16 * dc
+    lea        blockd, [blockq+r3+64]  ; 17 * dc + 64
+    sar        blockd, 7               ; >> 7
+
+    INV_TRANS_INIT
+
+    INV_TRANS_PROCESS a
+    RET
+
+INIT_MMX mmxext
+cglobal vc1_inv_trans_8x8_dc, 3,3,0, dest, linesize, block
+    movsx      blockd, WORD [blockq]   ; dc
+    lea        blockd, [blockq*3+1]    ;  3 * dc + 1
+    sar        blockd, 1               ; >> 1
+    lea        blockd, [blockq*3+16]   ;  3 * dc + 16
+    sar        blockd, 5               ; >> 5
+
+    INV_TRANS_INIT
+
+    INV_TRANS_PROCESS a
+    lea         destq, [destq+linesizeq*4]
+    INV_TRANS_PROCESS a
+    RET
diff --git a/libavcodec/x86/vc1dsp_mc.asm b/libavcodec/x86/vc1dsp_mc.asm
index b5f6bcec9b..1cb62ac409 100644
--- a/libavcodec/x86/vc1dsp_mc.asm
+++ b/libavcodec/x86/vc1dsp_mc.asm
@@ -192,100 +192,3 @@ HOR_16B_SHIFT2 OP_PUT, put
 INIT_MMX mmxext
 HOR_16B_SHIFT2 OP_AVG, avg
 %endif ; HAVE_MMX_INLINE
-
-%macro INV_TRANS_INIT 0
-    movd       m0, blockd
-    SPLATW     m0, m0
-    pxor       m1, m1
-    psubw      m1, m0
-    packuswb   m0, m0
-    packuswb   m1, m1
-
-    DEFINE_ARGS dest, linesize, linesize3
-    lea    linesize3q, [linesizeq*3]
-%endmacro
-
-%macro INV_TRANS_PROCESS 1
-    mov%1                  m2, [destq+linesizeq*0]
-    mov%1                  m3, [destq+linesizeq*1]
-    mov%1                  m4, [destq+linesizeq*2]
-    mov%1                  m5, [destq+linesize3q]
-    paddusb                m2, m0
-    paddusb                m3, m0
-    paddusb                m4, m0
-    paddusb                m5, m0
-    psubusb                m2, m1
-    psubusb                m3, m1
-    psubusb                m4, m1
-    psubusb                m5, m1
-    mov%1 [linesizeq*0+destq], m2
-    mov%1 [linesizeq*1+destq], m3
-    mov%1 [linesizeq*2+destq], m4
-    mov%1 [linesize3q +destq], m5
-%endmacro
-
-; ff_vc1_inv_trans_?x?_dc_mmxext(uint8_t *dest, ptrdiff_t linesize, int16_t 
*block)
-INIT_MMX mmxext
-cglobal vc1_inv_trans_4x4_dc, 3,4,0, dest, linesize, block
-    movsx         r3d, WORD [blockq]
-    mov        blockd, r3d             ; dc
-    shl        blockd, 4               ; 16 * dc
-    lea        blockd, [blockq+r3+4]   ; 17 * dc + 4
-    sar        blockd, 3               ; >> 3
-    mov           r3d, blockd          ; dc
-    shl        blockd, 4               ; 16 * dc
-    lea        blockd, [blockq+r3+64]  ; 17 * dc + 64
-    sar        blockd, 7               ; >> 7
-
-    INV_TRANS_INIT
-
-    INV_TRANS_PROCESS h
-    RET
-
-INIT_MMX mmxext
-cglobal vc1_inv_trans_4x8_dc, 3,4,0, dest, linesize, block
-    movsx         r3d, WORD [blockq]
-    mov        blockd, r3d             ; dc
-    shl        blockd, 4               ; 16 * dc
-    lea        blockd, [blockq+r3+4]   ; 17 * dc + 4
-    sar        blockd, 3               ; >> 3
-    shl        blockd, 2               ;  4 * dc
-    lea        blockd, [blockq*3+64]   ; 12 * dc + 64
-    sar        blockd, 7               ; >> 7
-
-    INV_TRANS_INIT
-
-    INV_TRANS_PROCESS h
-    lea         destq, [destq+linesizeq*4]
-    INV_TRANS_PROCESS h
-    RET
-
-INIT_MMX mmxext
-cglobal vc1_inv_trans_8x4_dc, 3,4,0, dest, linesize, block
-    movsx      blockd, WORD [blockq]   ; dc
-    lea        blockd, [blockq*3+1]    ;  3 * dc + 1
-    sar        blockd, 1               ; >> 1
-    mov           r3d, blockd          ; dc
-    shl        blockd, 4               ; 16 * dc
-    lea        blockd, [blockq+r3+64]  ; 17 * dc + 64
-    sar        blockd, 7               ; >> 7
-
-    INV_TRANS_INIT
-
-    INV_TRANS_PROCESS a
-    RET
-
-INIT_MMX mmxext
-cglobal vc1_inv_trans_8x8_dc, 3,3,0, dest, linesize, block
-    movsx      blockd, WORD [blockq]   ; dc
-    lea        blockd, [blockq*3+1]    ;  3 * dc + 1
-    sar        blockd, 1               ; >> 1
-    lea        blockd, [blockq*3+16]   ;  3 * dc + 16
-    sar        blockd, 5               ; >> 5
-
-    INV_TRANS_INIT
-
-    INV_TRANS_PROCESS a
-    lea         destq, [destq+linesizeq*4]
-    INV_TRANS_PROCESS a
-    RET
-- 
2.52.0


>From f37bd0241ee904f282d9e580213b315fa6abb246 Mon Sep 17 00:00:00 2001
From: Andreas Rheinhardt <[email protected]>
Date: Mon, 4 May 2026 19:51:29 +0200
Subject: [PATCH 05/11] avcodec/x86/vc1dsp_mc: Add size 8 horizontal SSSE3 mc
 functions

pmaddubsw strikes again.

vc1dsp.avg_vc1_mspel_pixels_tab_mc10_8_c:              150.2 ( 1.00x)
vc1dsp.avg_vc1_mspel_pixels_tab_mc10_8_mmxext:          44.5 ( 3.38x)
vc1dsp.avg_vc1_mspel_pixels_tab_mc10_8_ssse3:           18.5 ( 8.12x)
vc1dsp.avg_vc1_mspel_pixels_tab_mc20_8_c:              288.2 ( 1.00x)
vc1dsp.avg_vc1_mspel_pixels_tab_mc20_8_mmxext:          37.7 ( 7.64x)
vc1dsp.avg_vc1_mspel_pixels_tab_mc20_8_ssse3:           18.1 (15.97x)
vc1dsp.avg_vc1_mspel_pixels_tab_mc30_8_c:              155.4 ( 1.00x)
vc1dsp.avg_vc1_mspel_pixels_tab_mc30_8_mmxext:          46.5 ( 3.34x)
vc1dsp.avg_vc1_mspel_pixels_tab_mc30_8_ssse3:           18.1 ( 8.60x)
vc1dsp.put_vc1_mspel_pixels_tab_mc10_8_c:              282.2 ( 1.00x)
vc1dsp.put_vc1_mspel_pixels_tab_mc10_8_mmx:             42.7 ( 6.61x)
vc1dsp.put_vc1_mspel_pixels_tab_mc10_8_ssse3:           16.4 (17.16x)
vc1dsp.put_vc1_mspel_pixels_tab_mc20_8_c:              223.4 ( 1.00x)
vc1dsp.put_vc1_mspel_pixels_tab_mc20_8_mmx:             36.3 ( 6.15x)
vc1dsp.put_vc1_mspel_pixels_tab_mc20_8_ssse3:           16.4 (13.59x)
vc1dsp.put_vc1_mspel_pixels_tab_mc30_8_c:              255.2 ( 1.00x)
vc1dsp.put_vc1_mspel_pixels_tab_mc30_8_mmx:             43.6 ( 5.85x)
vc1dsp.put_vc1_mspel_pixels_tab_mc30_8_ssse3:           16.4 (15.52x)

Signed-off-by: Andreas Rheinhardt <[email protected]>
---
 libavcodec/x86/vc1dsp_init.c | 12 +++++++
 libavcodec/x86/vc1dsp_mc.asm | 65 ++++++++++++++++++++++++++++++++++++
 2 files changed, 77 insertions(+)

diff --git a/libavcodec/x86/vc1dsp_init.c b/libavcodec/x86/vc1dsp_init.c
index 9f80048791..0345d0e8c8 100644
--- a/libavcodec/x86/vc1dsp_init.c
+++ b/libavcodec/x86/vc1dsp_init.c
@@ -89,6 +89,14 @@ void ff_vc1_inv_trans_8x4_dc_mmxext(uint8_t *dest, ptrdiff_t 
linesize,
 void ff_vc1_inv_trans_8x8_dc_mmxext(uint8_t *dest, ptrdiff_t linesize,
                                     int16_t *block);
 
+#define MSPEL_FUNC(OP, X, Y, SIZE, XMM)                                     \
+    void ff_vc1_ ## OP ## _mspel_mc ## X ## Y ## _ ## SIZE ##_ ## XMM       \
+             (uint8_t *dst, const uint8_t *src, ptrdiff_t stride, int rnd); \
+    dsp->OP ## _vc1_mspel_pixels_tab[SIZE == 8][X + 4 * Y] =                \
+        ff_vc1_ ## OP ## _mspel_mc ## X ## Y## _ ## SIZE ##_ ## XMM
+#define MSPEL_FUNCS_SIZE(X, Y, SIZE, XMM) \
+    MSPEL_FUNC(put, X, Y, SIZE, XMM);     \
+    MSPEL_FUNC(avg, X, Y, SIZE, XMM)
 
 av_cold void ff_vc1dsp_init_x86(VC1DSPContext *dsp)
 {
@@ -133,6 +141,10 @@ av_cold void ff_vc1dsp_init_x86(VC1DSPContext *dsp)
         ASSIGN_LF816(ssse3);
         dsp->put_no_rnd_vc1_chroma_pixels_tab[0] = 
ff_put_vc1_chroma_mc8_nornd_ssse3;
         dsp->avg_no_rnd_vc1_chroma_pixels_tab[0] = 
ff_avg_vc1_chroma_mc8_nornd_ssse3;
+
+        MSPEL_FUNCS_SIZE(1, 0, 8, ssse3);
+        MSPEL_FUNCS_SIZE(2, 0, 8, ssse3);
+        MSPEL_FUNCS_SIZE(3, 0, 8, ssse3);
     }
     if (EXTERNAL_SSE4(cpu_flags)) {
         dsp->vc1_h_loop_filter8  = ff_vc1_h_loop_filter8_sse4;
diff --git a/libavcodec/x86/vc1dsp_mc.asm b/libavcodec/x86/vc1dsp_mc.asm
index 1cb62ac409..b5bcdcaf0d 100644
--- a/libavcodec/x86/vc1dsp_mc.asm
+++ b/libavcodec/x86/vc1dsp_mc.asm
@@ -21,6 +21,12 @@
 
 %include "libavutil/x86/x86util.asm"
 
+SECTION_RODATA
+
+pb_m4_36: times 8 db -4, 36
+pb_m4_53: times 8 db -4, 53
+pb_m3_18: times 8 db -3, 18
+
 cextern pw_9
 cextern pw_128
 
@@ -192,3 +198,62 @@ HOR_16B_SHIFT2 OP_PUT, put
 INIT_MMX mmxext
 HOR_16B_SHIFT2 OP_AVG, avg
 %endif ; HAVE_MMX_INLINE
+
+INIT_XMM ssse3
+%macro HOR_8B 2
+%define MOVU  movq
+%define MOVA  movq
+
+cglobal vc1_%1_mspel_mc10_%2, 4, 4, 6, dst, src, stride, rnd
+    mova              m1, [pb_m4_53]
+    mova              m2, [pb_m3_18]
+    sub             rndd, 32
+    jmp               vc1_%1_mspel_mc30_%2_after_prologue
+
+cglobal vc1_%1_mspel_mc20_%2, 4, 4, 6, dst, src, stride, rnd
+    mova              m1, [pb_m4_36]
+    lea             rndd, [4*rndd-32]
+    mova              m2, m1
+    jmp               vc1_%1_mspel_mc30_%2_after_prologue
+
+cglobal vc1_%1_mspel_mc30_%2, 4, 4, 6, dst, src, stride, rnd
+    mova              m2, [pb_m4_53]
+    mova              m1, [pb_m3_18]
+    sub             rndd, 32
+
+vc1_%1_mspel_mc30_%2_after_prologue:
+    movd              m0, rndd
+    WIN64_SPILL_XMM    7
+%define hd  rndd
+    mov               hd, %2
+    SPLATW            m0, m0
+.loop:
+    MOVU              m3, [srcq-1]
+    MOVU              m4, [srcq]
+    MOVU              m5, [srcq+1]
+    MOVU              m6, [srcq+2]
+
+    punpcklbw         m3, m4
+    pmaddubsw         m3, m1
+%ifidn %1,avg
+    movq              m4, [dstq]
+%endif
+    punpcklbw         m6, m5
+    pmaddubsw         m6, m2
+    add             srcq, strideq
+    psubw             m3, m0
+    paddw             m3, m6
+    psraw             m3, 6
+    packuswb          m3, m3
+%ifidn %1,avg
+    pavgb             m3, m4
+%endif
+    MOVA          [dstq], m3
+    add             dstq, strideq
+    dec               hd
+    jnz            .loop
+    RET
+%endmacro
+
+HOR_8B put, 8
+HOR_8B avg, 8
-- 
2.52.0


>From 08d22243ce115e821c9689b033dbca33330a308e Mon Sep 17 00:00:00 2001
From: Andreas Rheinhardt <[email protected]>
Date: Mon, 4 May 2026 20:28:37 +0200
Subject: [PATCH 06/11] avcodec/x86/vc1dsp_mc: Add size 16 horizontal SSSE3 mc
 functions

vc1dsp.avg_vc1_mspel_pixels_tab_mc10_16_c:             309.1 ( 1.00x)
vc1dsp.avg_vc1_mspel_pixels_tab_mc10_16_mmxext:        177.3 ( 1.74x)
vc1dsp.avg_vc1_mspel_pixels_tab_mc10_16_ssse3:          52.3 ( 5.91x)
vc1dsp.avg_vc1_mspel_pixels_tab_mc20_16_c:             279.6 ( 1.00x)
vc1dsp.avg_vc1_mspel_pixels_tab_mc20_16_mmxext:        148.8 ( 1.88x)
vc1dsp.avg_vc1_mspel_pixels_tab_mc20_16_ssse3:          52.1 ( 5.37x)
vc1dsp.avg_vc1_mspel_pixels_tab_mc30_16_c:             332.6 ( 1.00x)
vc1dsp.avg_vc1_mspel_pixels_tab_mc30_16_mmxext:        177.3 ( 1.88x)
vc1dsp.avg_vc1_mspel_pixels_tab_mc30_16_ssse3:          52.5 ( 6.33x)
vc1dsp.put_vc1_mspel_pixels_tab_mc10_16_c:             288.8 ( 1.00x)
vc1dsp.put_vc1_mspel_pixels_tab_mc10_16_mmx:           170.3 ( 1.70x)
vc1dsp.put_vc1_mspel_pixels_tab_mc10_16_ssse3:          51.3 ( 5.63x)
vc1dsp.put_vc1_mspel_pixels_tab_mc20_16_c:             236.2 ( 1.00x)
vc1dsp.put_vc1_mspel_pixels_tab_mc20_16_mmx:           144.1 ( 1.64x)
vc1dsp.put_vc1_mspel_pixels_tab_mc20_16_ssse3:          51.3 ( 4.61x)
vc1dsp.put_vc1_mspel_pixels_tab_mc30_16_c:             286.6 ( 1.00x)
vc1dsp.put_vc1_mspel_pixels_tab_mc30_16_mmx:           170.1 ( 1.69x)
vc1dsp.put_vc1_mspel_pixels_tab_mc30_16_ssse3:          51.2 ( 5.60x)

Signed-off-by: Andreas Rheinhardt <[email protected]>
---
 libavcodec/x86/vc1dsp_init.c |  9 ++++++---
 libavcodec/x86/vc1dsp_mc.asm | 30 +++++++++++++++++++++++++++++-
 2 files changed, 35 insertions(+), 4 deletions(-)

diff --git a/libavcodec/x86/vc1dsp_init.c b/libavcodec/x86/vc1dsp_init.c
index 0345d0e8c8..b6fc7d3bfc 100644
--- a/libavcodec/x86/vc1dsp_init.c
+++ b/libavcodec/x86/vc1dsp_init.c
@@ -97,6 +97,9 @@ void ff_vc1_inv_trans_8x8_dc_mmxext(uint8_t *dest, ptrdiff_t 
linesize,
 #define MSPEL_FUNCS_SIZE(X, Y, SIZE, XMM) \
     MSPEL_FUNC(put, X, Y, SIZE, XMM);     \
     MSPEL_FUNC(avg, X, Y, SIZE, XMM)
+#define MSPEL_FUNCS(X, Y, XMM)            \
+    MSPEL_FUNCS_SIZE(X, Y,  8, XMM);      \
+    MSPEL_FUNCS_SIZE(X, Y, 16, XMM)
 
 av_cold void ff_vc1dsp_init_x86(VC1DSPContext *dsp)
 {
@@ -142,9 +145,9 @@ av_cold void ff_vc1dsp_init_x86(VC1DSPContext *dsp)
         dsp->put_no_rnd_vc1_chroma_pixels_tab[0] = 
ff_put_vc1_chroma_mc8_nornd_ssse3;
         dsp->avg_no_rnd_vc1_chroma_pixels_tab[0] = 
ff_avg_vc1_chroma_mc8_nornd_ssse3;
 
-        MSPEL_FUNCS_SIZE(1, 0, 8, ssse3);
-        MSPEL_FUNCS_SIZE(2, 0, 8, ssse3);
-        MSPEL_FUNCS_SIZE(3, 0, 8, ssse3);
+        MSPEL_FUNCS(1, 0, ssse3);
+        MSPEL_FUNCS(2, 0, ssse3);
+        MSPEL_FUNCS(3, 0, ssse3);
     }
     if (EXTERNAL_SSE4(cpu_flags)) {
         dsp->vc1_h_loop_filter8  = ff_vc1_h_loop_filter8_sse4;
diff --git a/libavcodec/x86/vc1dsp_mc.asm b/libavcodec/x86/vc1dsp_mc.asm
index b5bcdcaf0d..8bdc86627a 100644
--- a/libavcodec/x86/vc1dsp_mc.asm
+++ b/libavcodec/x86/vc1dsp_mc.asm
@@ -201,8 +201,13 @@ HOR_16B_SHIFT2 OP_AVG, avg
 
 INIT_XMM ssse3
 %macro HOR_8B 2
+%if %2 == 8
 %define MOVU  movq
 %define MOVA  movq
+%else
+%define MOVU  movu
+%define MOVA  mova
+%endif
 
 cglobal vc1_%1_mspel_mc10_%2, 4, 4, 6, dst, src, stride, rnd
     mova              m1, [pb_m4_53]
@@ -223,7 +228,7 @@ cglobal vc1_%1_mspel_mc30_%2, 4, 4, 6, dst, src, stride, rnd
 
 vc1_%1_mspel_mc30_%2_after_prologue:
     movd              m0, rndd
-    WIN64_SPILL_XMM    7
+    WIN64_SPILL_XMM    7+(%2>>4)
 %define hd  rndd
     mov               hd, %2
     SPLATW            m0, m0
@@ -233,6 +238,7 @@ vc1_%1_mspel_mc30_%2_after_prologue:
     MOVU              m5, [srcq+1]
     MOVU              m6, [srcq+2]
 
+%if %2 == 8
     punpcklbw         m3, m4
     pmaddubsw         m3, m1
 %ifidn %1,avg
@@ -247,6 +253,25 @@ vc1_%1_mspel_mc30_%2_after_prologue:
     packuswb          m3, m3
 %ifidn %1,avg
     pavgb             m3, m4
+%endif
+%else
+    SBUTTERFLY        bw, 3, 4, 7
+    pmaddubsw         m3, m1
+    pmaddubsw         m4, m1
+    SBUTTERFLY        bw, 6, 5, 7
+    pmaddubsw         m6, m2
+    pmaddubsw         m5, m2
+    add             srcq, strideq
+    psubw             m3, m0
+    psubw             m4, m0
+    paddw             m3, m6
+    paddw             m4, m5
+    psraw             m3, 6
+    psraw             m4, 6
+    packuswb          m3, m4
+%ifidn %1, avg
+    pavgb             m3, [dstq]
+%endif
 %endif
     MOVA          [dstq], m3
     add             dstq, strideq
@@ -257,3 +282,6 @@ vc1_%1_mspel_mc30_%2_after_prologue:
 
 HOR_8B put, 8
 HOR_8B avg, 8
+
+HOR_8B put, 16
+HOR_8B avg, 16
-- 
2.52.0


>From bc1986cbcf509a76874cf8748980a9c2aa8e846b Mon Sep 17 00:00:00 2001
From: Andreas Rheinhardt <[email protected]>
Date: Mon, 4 May 2026 20:55:31 +0200
Subject: [PATCH 07/11] avcodec/x86/vc1dsp_mmx: Remove purely horizontal MMX mc
 functions

Superseded by SSSE3. By the way, the SSSE3 functions occupy
816B, but the functions removed now occupied 2304B with GCC
and 6512B with Clang (which inlined everything).

Signed-off-by: Andreas Rheinhardt <[email protected]>
---
 libavcodec/x86/vc1dsp_mmx.c | 14 --------------
 1 file changed, 14 deletions(-)

diff --git a/libavcodec/x86/vc1dsp_mmx.c b/libavcodec/x86/vc1dsp_mmx.c
index 3c771feb4b..f89becff64 100644
--- a/libavcodec/x86/vc1dsp_mmx.c
+++ b/libavcodec/x86/vc1dsp_mmx.c
@@ -351,7 +351,6 @@ static void OP ## vc1_mspel_mc(uint8_t *dst, const uint8_t 
*src, int stride,\
         ::: "memory"\
     );\
 \
-    if (vmode) { /* Vertical filter to apply */\
         if (hmode) { /* Horizontal filter to apply, output to tmp */\
             static const int shift_value[] = { 0, 5, 1, 5 };\
             int              shift = 
(shift_value[hmode]+shift_value[vmode])>>1;\
@@ -368,10 +367,6 @@ static void OP ## vc1_mspel_mc(uint8_t *dst, const uint8_t 
*src, int stride,\
             vc1_put_shift_8bits[vmode](dst, src, stride, 1-rnd, stride);\
             return;\
         }\
-    }\
-\
-    /* Horizontal mode with no vertical mode */\
-    vc1_put_shift_8bits[hmode](dst, src, stride, rnd, 1);\
 } \
 static void OP ## vc1_mspel_mc_16(uint8_t *dst, const uint8_t *src, \
                                   int stride, int hmode, int vmode, int rnd)\
@@ -421,17 +416,14 @@ DECLARE_FUNCTION(0, 1)
 DECLARE_FUNCTION(0, 2)
 DECLARE_FUNCTION(0, 3)
 
-DECLARE_FUNCTION(1, 0)
 DECLARE_FUNCTION(1, 1)
 DECLARE_FUNCTION(1, 2)
 DECLARE_FUNCTION(1, 3)
 
-DECLARE_FUNCTION(2, 0)
 DECLARE_FUNCTION(2, 1)
 DECLARE_FUNCTION(2, 2)
 DECLARE_FUNCTION(2, 3)
 
-DECLARE_FUNCTION(3, 0)
 DECLARE_FUNCTION(3, 1)
 DECLARE_FUNCTION(3, 2)
 DECLARE_FUNCTION(3, 3)
@@ -446,17 +438,14 @@ av_cold void ff_vc1dsp_init_mmx(VC1DSPContext *dsp)
     FN_ASSIGN(put_, 0, 2, _mmx);
     FN_ASSIGN(put_, 0, 3, _mmx);
 
-    FN_ASSIGN(put_, 1, 0, _mmx);
     FN_ASSIGN(put_, 1, 1, _mmx);
     FN_ASSIGN(put_, 1, 2, _mmx);
     FN_ASSIGN(put_, 1, 3, _mmx);
 
-    FN_ASSIGN(put_, 2, 0, _mmx);
     FN_ASSIGN(put_, 2, 1, _mmx);
     FN_ASSIGN(put_, 2, 2, _mmx);
     FN_ASSIGN(put_, 2, 3, _mmx);
 
-    FN_ASSIGN(put_, 3, 0, _mmx);
     FN_ASSIGN(put_, 3, 1, _mmx);
     FN_ASSIGN(put_, 3, 2, _mmx);
     FN_ASSIGN(put_, 3, 3, _mmx);
@@ -468,17 +457,14 @@ av_cold void ff_vc1dsp_init_mmxext(VC1DSPContext *dsp)
     FN_ASSIGN(avg_, 0, 2, _mmxext);
     FN_ASSIGN(avg_, 0, 3, _mmxext);
 
-    FN_ASSIGN(avg_, 1, 0, _mmxext);
     FN_ASSIGN(avg_, 1, 1, _mmxext);
     FN_ASSIGN(avg_, 1, 2, _mmxext);
     FN_ASSIGN(avg_, 1, 3, _mmxext);
 
-    FN_ASSIGN(avg_, 2, 0, _mmxext);
     FN_ASSIGN(avg_, 2, 1, _mmxext);
     FN_ASSIGN(avg_, 2, 2, _mmxext);
     FN_ASSIGN(avg_, 2, 3, _mmxext);
 
-    FN_ASSIGN(avg_, 3, 0, _mmxext);
     FN_ASSIGN(avg_, 3, 1, _mmxext);
     FN_ASSIGN(avg_, 3, 2, _mmxext);
     FN_ASSIGN(avg_, 3, 3, _mmxext);
-- 
2.52.0


>From 9b9e68911c9d2e59ca48cff7278fcd76a0ceb7f5 Mon Sep 17 00:00:00 2001
From: Andreas Rheinhardt <[email protected]>
Date: Mon, 4 May 2026 22:11:38 +0200
Subject: [PATCH 08/11] avcodec/x86/vc1dsp_mc: Add size 8 vertical SSSE3 mc
 functions

vc1dsp.avg_vc1_mspel_pixels_tab_mc01_8_c:              165.6 ( 1.00x)
vc1dsp.avg_vc1_mspel_pixels_tab_mc01_8_mmxext:          44.4 ( 3.73x)
vc1dsp.avg_vc1_mspel_pixels_tab_mc01_8_ssse3:           18.5 ( 8.97x)
vc1dsp.avg_vc1_mspel_pixels_tab_mc02_8_c:              152.5 ( 1.00x)
vc1dsp.avg_vc1_mspel_pixels_tab_mc02_8_mmxext:          37.3 ( 4.09x)
vc1dsp.avg_vc1_mspel_pixels_tab_mc02_8_ssse3:           18.5 ( 8.25x)
vc1dsp.avg_vc1_mspel_pixels_tab_mc03_8_c:              162.9 ( 1.00x)
vc1dsp.avg_vc1_mspel_pixels_tab_mc03_8_mmxext:          44.1 ( 3.69x)
vc1dsp.avg_vc1_mspel_pixels_tab_mc03_8_ssse3:           18.3 ( 8.88x)
vc1dsp.put_vc1_mspel_pixels_tab_mc01_8_c:              150.5 ( 1.00x)
vc1dsp.put_vc1_mspel_pixels_tab_mc01_8_mmx:             42.4 ( 3.55x)
vc1dsp.put_vc1_mspel_pixels_tab_mc01_8_ssse3:           16.5 ( 9.11x)
vc1dsp.put_vc1_mspel_pixels_tab_mc02_8_c:               78.4 ( 1.00x)
vc1dsp.put_vc1_mspel_pixels_tab_mc02_8_mmx:             36.1 ( 2.17x)
vc1dsp.put_vc1_mspel_pixels_tab_mc02_8_ssse3:           16.5 ( 4.76x)
vc1dsp.put_vc1_mspel_pixels_tab_mc03_8_c:              144.7 ( 1.00x)
vc1dsp.put_vc1_mspel_pixels_tab_mc03_8_mmx:             42.6 ( 3.40x)
vc1dsp.put_vc1_mspel_pixels_tab_mc03_8_ssse3:           16.3 ( 8.89x)

Signed-off-by: Andreas Rheinhardt <[email protected]>
---
 libavcodec/x86/vc1dsp_init.c |  3 ++
 libavcodec/x86/vc1dsp_mc.asm | 72 ++++++++++++++++++++++++++++++++++++
 2 files changed, 75 insertions(+)

diff --git a/libavcodec/x86/vc1dsp_init.c b/libavcodec/x86/vc1dsp_init.c
index b6fc7d3bfc..883edbe117 100644
--- a/libavcodec/x86/vc1dsp_init.c
+++ b/libavcodec/x86/vc1dsp_init.c
@@ -145,6 +145,9 @@ av_cold void ff_vc1dsp_init_x86(VC1DSPContext *dsp)
         dsp->put_no_rnd_vc1_chroma_pixels_tab[0] = 
ff_put_vc1_chroma_mc8_nornd_ssse3;
         dsp->avg_no_rnd_vc1_chroma_pixels_tab[0] = 
ff_avg_vc1_chroma_mc8_nornd_ssse3;
 
+        MSPEL_FUNCS_SIZE(0, 1, 8, ssse3);
+        MSPEL_FUNCS_SIZE(0, 2, 8, ssse3);
+        MSPEL_FUNCS_SIZE(0, 3, 8, ssse3);
         MSPEL_FUNCS(1, 0, ssse3);
         MSPEL_FUNCS(2, 0, ssse3);
         MSPEL_FUNCS(3, 0, ssse3);
diff --git a/libavcodec/x86/vc1dsp_mc.asm b/libavcodec/x86/vc1dsp_mc.asm
index 8bdc86627a..924aaee54d 100644
--- a/libavcodec/x86/vc1dsp_mc.asm
+++ b/libavcodec/x86/vc1dsp_mc.asm
@@ -23,7 +23,12 @@
 
 SECTION_RODATA
 
+pb_m4_18: times 8 db -4, 18
+pb_53_m3: times 8 db 53, -3
+pb_m3_53: times 8 db -3, 53
+pb_18_m4: times 8 db 18, -4
 pb_m4_36: times 8 db -4, 36
+pb_36_m4: times 8 db 36, -4
 pb_m4_53: times 8 db -4, 53
 pb_m3_18: times 8 db -3, 18
 
@@ -285,3 +290,70 @@ HOR_8B avg, 8
 
 HOR_8B put, 16
 HOR_8B avg, 16
+
+%macro VER_8B 2
+%define MOVU  movq
+%define MOVA  movq
+
+cglobal vc1_%1_mspel_mc01_%2, 4, 4, 6, dst, src, stride, rnd
+    mova              m1, [pb_m4_18]
+    mova              m2, [pb_53_m3]
+    add             rndd, 31
+    jmp               vc1_%1_mspel_mc03_%2_after_prologue
+
+cglobal vc1_%1_mspel_mc02_%2, 4, 4, 6, dst, src, stride, rnd
+    mova              m1, [pb_m4_36]
+    mova              m2, [pb_36_m4]
+    lea             rndd, [4*rndd+28]
+    jmp               vc1_%1_mspel_mc03_%2_after_prologue
+
+cglobal vc1_%1_mspel_mc03_%2, 4, 4, 6, dst, src, stride, rnd
+    mova              m1, [pb_m3_53]
+    mova              m2, [pb_18_m4]
+    add             rndd, 31
+
+vc1_%1_mspel_mc03_%2_after_prologue:
+    neg          strideq
+    movd              m0, rndd
+    MOVU              m3, [srcq+strideq]
+    neg          strideq
+    MOVU              m4, [srcq]
+    MOVU              m5, [srcq+strideq]
+    SPLATW            m0, m0
+    WIN64_SPILL_XMM    8
+    lea             srcq, [srcq+2*strideq]
+%define hd  rndd
+    punpcklbw         m3, m5
+    mov               hd, %2
+
+.loop:
+    MOVU              m6, [srcq]
+    pmaddubsw         m3, m1
+    punpcklbw         m4, m6
+    paddw             m3, m0
+    pmaddubsw         m7, m4, m2
+    add             srcq, strideq
+    paddw             m7, m3
+    mova              m3, m4
+%ifidn %1, avg
+    movq              m4, [dstq]
+%endif
+    psraw             m7, 6
+%ifnidn %1, avg
+    mova              m4, m5
+%endif
+    packuswb          m7, m7
+%ifidn %1, avg
+    pavgb             m7, m4
+    mova              m4, m5
+%endif
+    MOVA          [dstq], m7
+    add             dstq, strideq
+    mova              m5, m6
+    dec               hd
+    jnz            .loop
+    RET
+%endmacro
+
+VER_8B put, 8
+VER_8B avg, 8
-- 
2.52.0


>From 634815b0709ba2888f7021d166f8ab2c609efd0c Mon Sep 17 00:00:00 2001
From: Andreas Rheinhardt <[email protected]>
Date: Thu, 16 Oct 2025 20:18:41 +0200
Subject: [PATCH 09/11] avcodec/x86/vc1dsp_mc: Don't xor unnecessarily

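m0 is not used at all in vc1_%2_hor_16b_shift2, so zeroing it
there is unnecessary.

While at it, move the addition of the rounder (paddw m3, m0) in the
vertical loop of vc1_%1_mspel_mc03_%2 after the second pmaddubsw;
m0 is still needed there.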

Signed-off-by: Andreas Rheinhardt <[email protected]>
---
 libavcodec/x86/vc1dsp_mc.asm | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/libavcodec/x86/vc1dsp_mc.asm b/libavcodec/x86/vc1dsp_mc.asm
index 924aaee54d..1481a6e3e6 100644
--- a/libavcodec/x86/vc1dsp_mc.asm
+++ b/libavcodec/x86/vc1dsp_mc.asm
@@ -169,7 +169,6 @@ cglobal vc1_%2_hor_16b_shift2, 4, 5, 0, dst, stride, src, 
rnd, h
     LOAD_ROUNDER_MMX rndd
     mova               m5, [pw_9]
     mova               m6, [pw_128]
-    pxor               m0, m0
 
 .loop:
     mova               m1, [srcq + 2 * 0]
@@ -330,8 +329,8 @@ vc1_%1_mspel_mc03_%2_after_prologue:
     MOVU              m6, [srcq]
     pmaddubsw         m3, m1
     punpcklbw         m4, m6
-    paddw             m3, m0
     pmaddubsw         m7, m4, m2
+    paddw             m3, m0
     add             srcq, strideq
     paddw             m7, m3
     mova              m3, m4
-- 
2.52.0


>From 4c8bf3044b01953c92040313f7fcee085ddb8316 Mon Sep 17 00:00:00 2001
From: Andreas Rheinhardt <[email protected]>
Date: Tue, 5 May 2026 12:17:30 +0200
Subject: [PATCH 10/11] avcodec/x86/vc1dsp_mc: Add size 16 vertical SSSE3 mc
 functions

vc1dsp.avg_vc1_mspel_pixels_tab_mc01_16_c:             334.4 ( 1.00x)
vc1dsp.avg_vc1_mspel_pixels_tab_mc01_16_mmxext:        177.5 ( 1.88x)
vc1dsp.avg_vc1_mspel_pixels_tab_mc01_16_ssse3:          52.3 ( 6.40x)
vc1dsp.avg_vc1_mspel_pixels_tab_mc02_16_c:             306.9 ( 1.00x)
vc1dsp.avg_vc1_mspel_pixels_tab_mc02_16_mmxext:        149.2 ( 2.06x)
vc1dsp.avg_vc1_mspel_pixels_tab_mc02_16_ssse3:          52.2 ( 5.88x)
vc1dsp.avg_vc1_mspel_pixels_tab_mc03_16_c:             334.2 ( 1.00x)
vc1dsp.avg_vc1_mspel_pixels_tab_mc03_16_mmxext:        176.5 ( 1.89x)
vc1dsp.avg_vc1_mspel_pixels_tab_mc03_16_ssse3:          51.9 ( 6.44x)
vc1dsp.put_vc1_mspel_pixels_tab_mc01_16_c:             311.9 ( 1.00x)
vc1dsp.put_vc1_mspel_pixels_tab_mc01_16_mmx:           169.8 ( 1.84x)
vc1dsp.put_vc1_mspel_pixels_tab_mc01_16_ssse3:          51.6 ( 6.04x)
vc1dsp.put_vc1_mspel_pixels_tab_mc02_16_c:             279.3 ( 1.00x)
vc1dsp.put_vc1_mspel_pixels_tab_mc02_16_mmx:           144.1 ( 1.94x)
vc1dsp.put_vc1_mspel_pixels_tab_mc02_16_ssse3:          51.7 ( 5.40x)
vc1dsp.put_vc1_mspel_pixels_tab_mc03_16_c:             310.6 ( 1.00x)
vc1dsp.put_vc1_mspel_pixels_tab_mc03_16_mmx:           171.0 ( 1.82x)
vc1dsp.put_vc1_mspel_pixels_tab_mc03_16_ssse3:          51.6 ( 6.02x)

Signed-off-by: Andreas Rheinhardt <[email protected]>
---
 libavcodec/x86/vc1dsp_init.c |  6 +--
 libavcodec/x86/vc1dsp_mc.asm | 91 ++++++++++++++++++++++++++++++------
 2 files changed, 80 insertions(+), 17 deletions(-)

diff --git a/libavcodec/x86/vc1dsp_init.c b/libavcodec/x86/vc1dsp_init.c
index 883edbe117..7108c9052b 100644
--- a/libavcodec/x86/vc1dsp_init.c
+++ b/libavcodec/x86/vc1dsp_init.c
@@ -145,9 +145,9 @@ av_cold void ff_vc1dsp_init_x86(VC1DSPContext *dsp)
         dsp->put_no_rnd_vc1_chroma_pixels_tab[0] = 
ff_put_vc1_chroma_mc8_nornd_ssse3;
         dsp->avg_no_rnd_vc1_chroma_pixels_tab[0] = 
ff_avg_vc1_chroma_mc8_nornd_ssse3;
 
-        MSPEL_FUNCS_SIZE(0, 1, 8, ssse3);
-        MSPEL_FUNCS_SIZE(0, 2, 8, ssse3);
-        MSPEL_FUNCS_SIZE(0, 3, 8, ssse3);
+        MSPEL_FUNCS(0, 1, ssse3);
+        MSPEL_FUNCS(0, 2, ssse3);
+        MSPEL_FUNCS(0, 3, ssse3);
         MSPEL_FUNCS(1, 0, ssse3);
         MSPEL_FUNCS(2, 0, ssse3);
         MSPEL_FUNCS(3, 0, ssse3);
diff --git a/libavcodec/x86/vc1dsp_mc.asm b/libavcodec/x86/vc1dsp_mc.asm
index 1481a6e3e6..a3d1a9ba5c 100644
--- a/libavcodec/x86/vc1dsp_mc.asm
+++ b/libavcodec/x86/vc1dsp_mc.asm
@@ -290,25 +290,42 @@ HOR_8B avg, 8
 HOR_8B put, 16
 HOR_8B avg, 16
 
-%macro VER_8B 2
-%define MOVU  movq
-%define MOVA  movq
+%macro SETUP_COEFFS 3 ; width, coeff1, coeff2
+%if ARCH_X86_64 || (%1 == 8)
+    mova          m1, [%2]
+    mova          m2, [%3]
+%define COEFF0 m1
+%define COEFF1 m2
+%define M8 m8
+%define M9 m9
+%else
+    lea           r4, [%2]
+%define COEFF0 [r4]
+%define COEFF1 [r4+(%3-%2)]
+%define M8 m1
+%define M9 m2
+%endif
+%endmacro
 
-cglobal vc1_%1_mspel_mc01_%2, 4, 4, 6, dst, src, stride, rnd
-    mova              m1, [pb_m4_18]
-    mova              m2, [pb_53_m3]
+%macro VER_8B 2
+%if %2 == 8
+%define MOVU  movq
+%else
+%define MOVU  movu
+%endif
+
+cglobal vc1_%1_mspel_mc01_%2, 4, 4+ARCH_X86_32*(%2>>4), 6, dst, src, stride, 
rnd
+    SETUP_COEFFS %2, pb_m4_18, pb_53_m3
     add             rndd, 31
     jmp               vc1_%1_mspel_mc03_%2_after_prologue
 
-cglobal vc1_%1_mspel_mc02_%2, 4, 4, 6, dst, src, stride, rnd
-    mova              m1, [pb_m4_36]
-    mova              m2, [pb_36_m4]
+cglobal vc1_%1_mspel_mc02_%2, 4, 4+ARCH_X86_32*(%2>>4), 6, dst, src, stride, 
rnd
+    SETUP_COEFFS %2, pb_m4_36, pb_36_m4
     lea             rndd, [4*rndd+28]
     jmp               vc1_%1_mspel_mc03_%2_after_prologue
 
-cglobal vc1_%1_mspel_mc03_%2, 4, 4, 6, dst, src, stride, rnd
-    mova              m1, [pb_m3_53]
-    mova              m2, [pb_18_m4]
+cglobal vc1_%1_mspel_mc03_%2, 4, 4+ARCH_X86_32*(%2>>4), 6, dst, src, stride, 
rnd
+    SETUP_COEFFS %2, pb_m3_53, pb_18_m4
     add             rndd, 31
 
 vc1_%1_mspel_mc03_%2_after_prologue:
@@ -319,14 +336,20 @@ vc1_%1_mspel_mc03_%2_after_prologue:
     MOVU              m4, [srcq]
     MOVU              m5, [srcq+strideq]
     SPLATW            m0, m0
-    WIN64_SPILL_XMM    8
+    WIN64_SPILL_XMM    8+3*(%2>>4)
     lea             srcq, [srcq+2*strideq]
 %define hd  rndd
+%if %2 == 8
     punpcklbw         m3, m5
+%else
+    punpcklbw         m7, m3, m5
+    punpckhbw         m3, m5
+%endif
     mov               hd, %2
 
 .loop:
     MOVU              m6, [srcq]
+%if %2 == 8
     pmaddubsw         m3, m1
     punpcklbw         m4, m6
     pmaddubsw         m7, m4, m2
@@ -346,7 +369,44 @@ vc1_%1_mspel_mc03_%2_after_prologue:
     pavgb             m7, m4
     mova              m4, m5
 %endif
-    MOVA          [dstq], m7
+    movq          [dstq], m7
+%else
+    pmaddubsw         m7, COEFF0
+    pmaddubsw         m3, COEFF0
+    punpcklbw         M8, m4, m6
+    punpckhbw         m4, m6
+    pmaddubsw         M9, M8, COEFF1
+    paddw             m7, m0
+%if ARCH_X86_64
+    pmaddubsw        m10, m4, m2
+    paddw             m3, m0
+    paddw             m9, m7
+    mova              m7, m8
+    psraw             m9, 6
+    paddw            m10, m3
+%else
+    paddw             m3, m0
+    paddw             M9, m7
+    mova              m7, M8
+    pmaddubsw         M8, m4, COEFF1
+    psraw             M9, 6
+    paddw             M8, m3
+%endif
+    add             srcq, strideq
+    mova              m3, m4
+%if ARCH_X86_64
+    psraw            m10, 6
+    packuswb          m9, m10
+%else
+    psraw             M8, 6
+    packuswb          M9, M8
+%endif
+%ifidn %1, avg
+    pavgb             M9, [dstq]
+%endif
+    mova              m4, m5
+    mova          [dstq], M9
+%endif
     add             dstq, strideq
     mova              m5, m6
     dec               hd
@@ -356,3 +416,6 @@ vc1_%1_mspel_mc03_%2_after_prologue:
 
 VER_8B put, 8
 VER_8B avg, 8
+
+VER_8B put, 16
+VER_8B avg, 16
-- 
2.52.0


>From 4717c4a89ada55f9ca0bfb2f470171bfdfae8a6a Mon Sep 17 00:00:00 2001
From: Andreas Rheinhardt <[email protected]>
Date: Tue, 5 May 2026 12:31:28 +0200
Subject: [PATCH 11/11] avcodec/x86/vc1dsp_mmx: Remove purely vertical MMX mc
 functions

They have been superseded by SSSE3. Notice that the functions removed
occupied 3424B with GCC and 6176B with Clang here, whereas
the SSSE3 functions replacing them occupy only 944B.

Signed-off-by: Andreas Rheinhardt <[email protected]>
---
 libavcodec/x86/vc1dsp_mmx.c | 120 ------------------------------------
 1 file changed, 120 deletions(-)

diff --git a/libavcodec/x86/vc1dsp_mmx.c b/libavcodec/x86/vc1dsp_mmx.c
index f89becff64..cb53e67de9 100644
--- a/libavcodec/x86/vc1dsp_mmx.c
+++ b/libavcodec/x86/vc1dsp_mmx.c
@@ -74,64 +74,6 @@ void ff_vc1_avg_hor_16b_shift2_mmxext(uint8_t *dst, x86_reg 
stride,
      "punpcklwd %%mm7, %%mm7           \n\t"    \
      "punpckldq %%mm7, %%mm7           \n\t"
 
-/**
- * Purely vertical or horizontal 1/2 shift interpolation.
- * Sacrifice mm6 for *9 factor.
- */
-#define VC1_SHIFT2(OP, OPNAME)\
-static void OPNAME ## vc1_shift2_mmx(uint8_t *dst, const uint8_t *src,\
-                                     x86_reg stride, int rnd, x86_reg offset)\
-{\
-    rnd = 8-rnd;\
-    __asm__ volatile(\
-        "mov       $8, %%"FF_REG_c"        \n\t"\
-        LOAD_ROUNDER_MMX("%5")\
-        "movq      "MANGLE(ff_pw_9)", %%mm6\n\t"\
-        "1:                                \n\t"\
-        "movd      0(%0   ), %%mm3         \n\t"\
-        "movd      4(%0   ), %%mm4         \n\t"\
-        "movd      0(%0,%2), %%mm1         \n\t"\
-        "movd      4(%0,%2), %%mm2         \n\t"\
-        "add       %2, %0                  \n\t"\
-        "punpcklbw %%mm0, %%mm3            \n\t"\
-        "punpcklbw %%mm0, %%mm4            \n\t"\
-        "punpcklbw %%mm0, %%mm1            \n\t"\
-        "punpcklbw %%mm0, %%mm2            \n\t"\
-        "paddw     %%mm1, %%mm3            \n\t"\
-        "paddw     %%mm2, %%mm4            \n\t"\
-        "movd      0(%0,%3), %%mm1         \n\t"\
-        "movd      4(%0,%3), %%mm2         \n\t"\
-        "pmullw    %%mm6, %%mm3            \n\t" /* 0,9,9,0*/\
-        "pmullw    %%mm6, %%mm4            \n\t" /* 0,9,9,0*/\
-        "punpcklbw %%mm0, %%mm1            \n\t"\
-        "punpcklbw %%mm0, %%mm2            \n\t"\
-        "psubw     %%mm1, %%mm3            \n\t" /*-1,9,9,0*/\
-        "psubw     %%mm2, %%mm4            \n\t" /*-1,9,9,0*/\
-        "movd      0(%0,%2), %%mm1         \n\t"\
-        "movd      4(%0,%2), %%mm2         \n\t"\
-        "punpcklbw %%mm0, %%mm1            \n\t"\
-        "punpcklbw %%mm0, %%mm2            \n\t"\
-        "psubw     %%mm1, %%mm3            \n\t" /*-1,9,9,-1*/\
-        "psubw     %%mm2, %%mm4            \n\t" /*-1,9,9,-1*/\
-        NORMALIZE_MMX("$4")\
-        "packuswb  %%mm4, %%mm3            \n\t"\
-        OP((%1), %%mm3)\
-        "movq      %%mm3, (%1)             \n\t"\
-        "add       %6, %0                  \n\t"\
-        "add       %4, %1                  \n\t"\
-        "dec       %%"FF_REG_c"            \n\t"\
-        "jnz 1b                            \n\t"\
-        : "+r"(src),  "+r"(dst)\
-        : "r"(offset), "r"(-2*offset), "g"(stride), "m"(rnd),\
-          "g"(stride-offset)\
-          NAMED_CONSTRAINTS_ADD(ff_pw_9)\
-        : "%"FF_REG_c, "memory"\
-    );\
-}
-
-VC1_SHIFT2(OP_PUT, put_)
-VC1_SHIFT2(OP_AVG, avg_)
-
 /**
  * Core of the 1/4 and 3/4 shift bicubic interpolation.
  *
@@ -270,59 +212,18 @@ OPNAME ## vc1_hor_16b_ ## NAME ## _mmx(uint8_t *dst, 
x86_reg stride,    \
     );                                                                  \
 }
 
-/**
- * Macro to build the 8 bits, any direction, version of vc1_put_shift[13].
- * Here, offset=src_stride. Parameters passed A1 to A4 must use
- * %3 (offset) and %4 (3*offset).
- *
- * @param  NAME   Either 1 or 3
- * @see MSPEL_FILTER13_CORE for information on A1->A4
- */
-#define MSPEL_FILTER13_8B(NAME, A1, A2, A3, A4, OP, OPNAME)             \
-static void                                                             \
-OPNAME ## vc1_## NAME ## _mmx(uint8_t *dst, const uint8_t *src,         \
-                        x86_reg stride, int rnd, x86_reg offset)      \
-{                                                                       \
-    int h = 8;                                                          \
-    src -= offset;                                                      \
-    rnd = 32-rnd;                                                       \
-    __asm__ volatile (                                                      \
-        LOAD_ROUNDER_MMX("%6")                                          \
-        "movq      "MANGLE(ff_pw_53)", %%mm5       \n\t"                \
-        "movq      "MANGLE(ff_pw_18)", %%mm6       \n\t"                \
-        ".p2align 3                \n\t"                                \
-        "1:                        \n\t"                                \
-        MSPEL_FILTER13_CORE(DO_UNPACK, "movd   1", A1, A2, A3, A4)      \
-        NORMALIZE_MMX("$6")                                             \
-        TRANSFER_DO_PACK(OP)                                            \
-        "add       %5, %1          \n\t"                                \
-        "add       %5, %2          \n\t"                                \
-        "decl      %0              \n\t"                                \
-        "jnz 1b                    \n\t"                                \
-        : "+r"(h), "+r" (src),  "+r" (dst)                              \
-        : "r"(offset), "r"(3*offset), "g"(stride), "m"(rnd)             \
-          NAMED_CONSTRAINTS_ADD(ff_pw_53,ff_pw_18,ff_pw_3)              \
-        : "memory"                                                      \
-    );                                                                  \
-}
-
 /** 1/4 shift bicubic interpolation */
-MSPEL_FILTER13_8B     (shift1, "0(%1,%4  )", "0(%1,%3,2)", "0(%1,%3  )", "0(%1 
    )", OP_PUT, put_)
-MSPEL_FILTER13_8B     (shift1, "0(%1,%4  )", "0(%1,%3,2)", "0(%1,%3  )", "0(%1 
    )", OP_AVG, avg_)
 MSPEL_FILTER13_VER_16B(shift1, "0(%1,%4  )", "0(%1,%3,2)", "0(%1,%3  )", "0(%1 
    )")
 MSPEL_FILTER13_HOR_16B(shift1, "2*3(%1)", "2*2(%1)", "2*1(%1)", "2*0(%1)", 
OP_PUT, put_)
 MSPEL_FILTER13_HOR_16B(shift1, "2*3(%1)", "2*2(%1)", "2*1(%1)", "2*0(%1)", 
OP_AVG, avg_)
 
 /** 3/4 shift bicubic interpolation */
-MSPEL_FILTER13_8B     (shift3, "0(%1     )", "0(%1,%3  )", "0(%1,%3,2)", 
"0(%1,%4  )", OP_PUT, put_)
-MSPEL_FILTER13_8B     (shift3, "0(%1     )", "0(%1,%3  )", "0(%1,%3,2)", 
"0(%1,%4  )", OP_AVG, avg_)
 MSPEL_FILTER13_VER_16B(shift3, "0(%1     )", "0(%1,%3  )", "0(%1,%3,2)", 
"0(%1,%4  )")
 MSPEL_FILTER13_HOR_16B(shift3, "2*0(%1)", "2*1(%1)", "2*2(%1)", "2*3(%1)", 
OP_PUT, put_)
 MSPEL_FILTER13_HOR_16B(shift3, "2*0(%1)", "2*1(%1)", "2*2(%1)", "2*3(%1)", 
OP_AVG, avg_)
 
 typedef void (*vc1_mspel_mc_filter_ver_16bits)(int16_t *dst, const uint8_t 
*src, x86_reg src_stride, int rnd, int64_t shift);
 typedef void (*vc1_mspel_mc_filter_hor_16bits)(uint8_t *dst, x86_reg 
dst_stride, const int16_t *src, int rnd);
-typedef void (*vc1_mspel_mc_filter_8bits)(uint8_t *dst, const uint8_t *src, 
x86_reg stride, int rnd, x86_reg offset);
 
 /**
  * Interpolate fractional pel values by applying proper vertical then
@@ -343,15 +244,12 @@ static void OP ## vc1_mspel_mc(uint8_t *dst, const 
uint8_t *src, int stride,\
          { NULL, vc1_put_ver_16b_shift1_mmx, ff_vc1_put_ver_16b_shift2_mmx, 
vc1_put_ver_16b_shift3_mmx };\
     static const vc1_mspel_mc_filter_hor_16bits vc1_put_shift_hor_16bits[] =\
          { NULL, OP ## vc1_hor_16b_shift1_mmx, ff_vc1_ ## OP ## 
hor_16b_shift2_ ## INSTR, OP ## vc1_hor_16b_shift3_mmx };\
-    static const vc1_mspel_mc_filter_8bits vc1_put_shift_8bits[] =\
-         { NULL, OP ## vc1_shift1_mmx, OP ## vc1_shift2_mmx, OP ## 
vc1_shift3_mmx };\
 \
     __asm__ volatile(\
         "pxor %%mm0, %%mm0         \n\t"\
         ::: "memory"\
     );\
 \
-        if (hmode) { /* Horizontal filter to apply, output to tmp */\
             static const int shift_value[] = { 0, 5, 1, 5 };\
             int              shift = 
(shift_value[hmode]+shift_value[vmode])>>1;\
             int              r;\
@@ -361,12 +259,6 @@ static void OP ## vc1_mspel_mc(uint8_t *dst, const uint8_t 
*src, int stride,\
             vc1_put_shift_ver_16bits[vmode](tmp, src-1, stride, r, shift);\
 \
             vc1_put_shift_hor_16bits[hmode](dst, stride, tmp+1, 64-rnd);\
-            return;\
-        }\
-        else { /* No horizontal filter, output 8 lines to dst */\
-            vc1_put_shift_8bits[vmode](dst, src, stride, 1-rnd, stride);\
-            return;\
-        }\
 } \
 static void OP ## vc1_mspel_mc_16(uint8_t *dst, const uint8_t *src, \
                                   int stride, int hmode, int vmode, int rnd)\
@@ -412,10 +304,6 @@ static void avg_vc1_mspel_mc ## a ## b ## 
_16_mmxext(uint8_t *dst,      \
      avg_vc1_mspel_mc_16(dst, src, stride, a, b, rnd);                  \
 }
 
-DECLARE_FUNCTION(0, 1)
-DECLARE_FUNCTION(0, 2)
-DECLARE_FUNCTION(0, 3)
-
 DECLARE_FUNCTION(1, 1)
 DECLARE_FUNCTION(1, 2)
 DECLARE_FUNCTION(1, 3)
@@ -434,10 +322,6 @@ DECLARE_FUNCTION(3, 3)
 
 av_cold void ff_vc1dsp_init_mmx(VC1DSPContext *dsp)
 {
-    FN_ASSIGN(put_, 0, 1, _mmx);
-    FN_ASSIGN(put_, 0, 2, _mmx);
-    FN_ASSIGN(put_, 0, 3, _mmx);
-
     FN_ASSIGN(put_, 1, 1, _mmx);
     FN_ASSIGN(put_, 1, 2, _mmx);
     FN_ASSIGN(put_, 1, 3, _mmx);
@@ -453,10 +337,6 @@ av_cold void ff_vc1dsp_init_mmx(VC1DSPContext *dsp)
 
 av_cold void ff_vc1dsp_init_mmxext(VC1DSPContext *dsp)
 {
-    FN_ASSIGN(avg_, 0, 1, _mmxext);
-    FN_ASSIGN(avg_, 0, 2, _mmxext);
-    FN_ASSIGN(avg_, 0, 3, _mmxext);
-
     FN_ASSIGN(avg_, 1, 1, _mmxext);
     FN_ASSIGN(avg_, 1, 2, _mmxext);
     FN_ASSIGN(avg_, 1, 3, _mmxext);
-- 
2.52.0

_______________________________________________
ffmpeg-devel mailing list -- [email protected]
To unsubscribe send an email to [email protected]

[FFmpeg-devel] [PR] avcodec/x86/vc1dsp_mc: Add SSSE3 version, remove superseded MMX version (PR #23020)

Reply via email to