PR #23020 opened by mkver URL: https://code.ffmpeg.org/FFmpeg/FFmpeg/pulls/23020 Patch URL: https://code.ffmpeg.org/FFmpeg/FFmpeg/pulls/23020.patch
For now, this covers only the purely horizontal and vertical functions. The mixed ones will be added soon. >From 9e5aa77d72b642b70968a4e049472a803c23ae69 Mon Sep 17 00:00:00 2001 From: Andreas Rheinhardt <[email protected]> Date: Wed, 15 Oct 2025 16:29:08 +0200 Subject: [PATCH 01/11] tests/checkasm/vc1dsp: Improve mspel test Up until now, only the fullpel functions (i.e. the ones without pixel interpolation) have been tested at all. Signed-off-by: Andreas Rheinhardt <[email protected]> --- tests/checkasm/vc1dsp.c | 78 +++++++++++++++++++++++++++++++---------- 1 file changed, 59 insertions(+), 19 deletions(-) diff --git a/tests/checkasm/vc1dsp.c b/tests/checkasm/vc1dsp.c index dda6d36257..63112256f0 100644 --- a/tests/checkasm/vc1dsp.c +++ b/tests/checkasm/vc1dsp.c @@ -441,34 +441,74 @@ static void check_unescape(void) static void check_mspel_pixels(void) { - LOCAL_ALIGNED_16(uint8_t, src0, [32 * 32]); - LOCAL_ALIGNED_16(uint8_t, src1, [32 * 32]); - LOCAL_ALIGNED_16(uint8_t, dst0, [32 * 32]); - LOCAL_ALIGNED_16(uint8_t, dst1, [32 * 32]); + enum { + MAX_BLOCK_SIZE = 16, + MAX_STRIDE = 64, + /// BUF_SIZE is bigger than necessary in order to test strides > block width. + BUF_SIZE = (MAX_BLOCK_SIZE - 1) * MAX_STRIDE + MAX_BLOCK_SIZE, + /** + * Due to qpel interpolation the input needs one extra line at the top + * and two at the bottom; horizontal interpolation also needs one pixel + * to the left and two to the right. At least the x86 implementation + * actually accesses three pixels to the right. + * The input is not subject to alignment requirements; making the input buffer + * bigger (by MAX_BLOCK_SIZE - 1) allows us to use a random misalignment. 
+ */ + INPUT_BUF_SIZE = (MAX_BLOCK_SIZE - 1) + 1 + + (MAX_BLOCK_SIZE + 1 + 2 - 1) * MAX_STRIDE + MAX_BLOCK_SIZE + 2 + 1, + }; + DECLARE_ALIGNED(16, uint8_t, dstbuf0)[BUF_SIZE]; + DECLARE_ALIGNED(16, uint8_t, dstbuf1)[BUF_SIZE]; + uint8_t srcbuf0[INPUT_BUF_SIZE]; + uint8_t srcbuf1[INPUT_BUF_SIZE]; VC1DSPContext h; - const test tests[] = { - VC1DSP_SIZED_TEST(put_vc1_mspel_pixels_tab[0][0], 16, 16) - VC1DSP_SIZED_TEST(put_vc1_mspel_pixels_tab[1][0], 8, 8) - VC1DSP_SIZED_TEST(avg_vc1_mspel_pixels_tab[0][0], 16, 16) - VC1DSP_SIZED_TEST(avg_vc1_mspel_pixels_tab[1][0], 8, 8) + const struct MSPelTest { + const char *name; + size_t offset; + } tests[] = { +#define MSPEL_TEST(elem) { .name = #elem, offsetof(VC1DSPContext, elem) } + MSPEL_TEST(put_vc1_mspel_pixels_tab), + MSPEL_TEST(avg_vc1_mspel_pixels_tab), }; ff_vc1dsp_init(&h); for (size_t t = 0; t < FF_ARRAY_ELEMS(tests); ++t) { - void (*func)(uint8_t *, const uint8_t*, ptrdiff_t, int) = *(void **)((intptr_t) &h + tests[t].offset); - if (check_func(func, "vc1dsp.%s", tests[t].name)) { - declare_func_emms(AV_CPU_FLAG_MMX, void, uint8_t *, const uint8_t*, ptrdiff_t, int); - RANDOMIZE_BUFFER8(dst, 32 * 32); - RANDOMIZE_BUFFER8(src, 32 * 32); - call_ref(dst0, src0, 32, 0); - call_new(dst1, src1, 32, 0); - if (memcmp(dst0, dst1, 32 * 32)) { - fail(); + const vc1op_pixels_func (*func)[16] = (vc1op_pixels_func(*)[16])((char*)&h + tests[t].offset); + for (unsigned j = 0; j < 2; ++j) { + const unsigned blocksize = 16 >> j; + + for (unsigned dxy = 0; dxy < 16; ++dxy) { + if (check_func(func[j][dxy], "vc1dsp.%s_mc%u%u_%u", tests[t].name, dxy & 3, dxy >> 2, blocksize)) { + declare_func_emms(AV_CPU_FLAG_MMX, void, uint8_t *, const uint8_t*, ptrdiff_t, int); + size_t dst_offset = (rnd() % (MAX_BLOCK_SIZE / blocksize)) * blocksize; + ptrdiff_t stride = (rnd() % (MAX_STRIDE / blocksize) + 1) * blocksize; + size_t src_offset = 1 + stride + rnd() % MAX_BLOCK_SIZE; + const uint8_t *src0 = srcbuf0 + src_offset, *src1 = srcbuf1 + 
src_offset; + uint8_t *dst0 = dstbuf0 + dst_offset, *dst1 = dstbuf1 + dst_offset; + + if (rnd() & 1) { + // Flip stride. + dst1 += (blocksize - 1) * stride; + dst0 += (blocksize - 1) * stride; + // We need one line above src and two lines below the block, + // hence blocksize * stride. + src0 += blocksize * stride; + src1 += blocksize * stride; + stride = -stride; + } + RANDOMIZE_BUFFER8(dstbuf, sizeof(dstbuf0)); + RANDOMIZE_BUFFER8(srcbuf, sizeof(srcbuf0)); + call_ref(dst0, src0, stride, 0); + call_new(dst1, src1, stride, 0); + if (memcmp(dstbuf0, dstbuf1, sizeof(dstbuf0))) { + fail(); + } + bench_new(dst1, src1, stride, 0); + } } - bench_new(dst1, src0, 32, 0); } } } -- 2.52.0 >From 457d06272bbd4d7557ff17b6c7fbe22450e8361d Mon Sep 17 00:00:00 2001 From: Andreas Rheinhardt <[email protected]> Date: Sun, 12 Oct 2025 15:14:37 +0200 Subject: [PATCH 02/11] avcodec/x86/vc1dsp_mc: Remove unnecessary movsxdifnidn Forgotten in 5a49097b42cbc3eab888d15a91eeaf5520b5c381 merging Libav commit 2ec9fa5ec60dcd10e1cb10d8b4e4437e634ea428 (which changed the stride to ptrdiff_t). 
Signed-off-by: Andreas Rheinhardt <[email protected]> --- libavcodec/x86/vc1dsp_mc.asm | 1 - 1 file changed, 1 deletion(-) diff --git a/libavcodec/x86/vc1dsp_mc.asm b/libavcodec/x86/vc1dsp_mc.asm index c1b3ed1bc3..b5f6bcec9b 100644 --- a/libavcodec/x86/vc1dsp_mc.asm +++ b/libavcodec/x86/vc1dsp_mc.asm @@ -194,7 +194,6 @@ HOR_16B_SHIFT2 OP_AVG, avg %endif ; HAVE_MMX_INLINE %macro INV_TRANS_INIT 0 - movsxdifnidn linesizeq, linesized movd m0, blockd SPLATW m0, m0 pxor m1, m1 -- 2.52.0 >From c6be9ff77d60bf7cf013367b4215b2668ec1e906 Mon Sep 17 00:00:00 2001 From: Andreas Rheinhardt <[email protected]> Date: Mon, 4 May 2026 17:45:13 +0200 Subject: [PATCH 03/11] tests/checkasm/vc1dsp: Fix shadowing Signed-off-by: Andreas Rheinhardt <[email protected]> --- tests/checkasm/vc1dsp.c | 30 +++++++++++++++--------------- 1 file changed, 15 insertions(+), 15 deletions(-) diff --git a/tests/checkasm/vc1dsp.c b/tests/checkasm/vc1dsp.c index 63112256f0..ece810aab1 100644 --- a/tests/checkasm/vc1dsp.c +++ b/tests/checkasm/vc1dsp.c @@ -306,18 +306,18 @@ static void check_inv_trans_adding(void) ff_vc1dsp_init(&h); - for (size_t t = 0; t < FF_ARRAY_ELEMS(tests); ++t) { - void (*func)(uint8_t *, ptrdiff_t, int16_t *) = *(void **)((intptr_t) &h + tests[t].offset); - if (check_func(func, "vc1dsp.%s", tests[t].name)) { + for (size_t k = 0; k < FF_ARRAY_ELEMS(tests); ++k) { + void (*func)(uint8_t *, ptrdiff_t, int16_t *) = *(void **)((intptr_t) &h + tests[k].offset); + if (check_func(func, "vc1dsp.%s", tests[k].name)) { matrix *coeffs; declare_func_emms(AV_CPU_FLAG_MMX, void, uint8_t *, ptrdiff_t, int16_t *); RANDOMIZE_BUFFER16(inv_trans_in, 8 * 8); RANDOMIZE_BUFFER8(inv_trans_out, 10 * 24); - coeffs = generate_inverse_quantized_transform_coefficients(tests[t].width, tests[t].height); - for (int j = 0; j < tests[t].height; ++j) - for (int i = 0; i < tests[t].width; ++i) { + coeffs = generate_inverse_quantized_transform_coefficients(tests[k].width, tests[k].height); + for (int j = 0; j < 
tests[k].height; ++j) + for (int i = 0; i < tests[k].width; ++i) { int idx = j * 8 + i; - inv_trans_in1[idx] = inv_trans_in0[idx] = coeffs->d[j * tests[t].width + i]; + inv_trans_in1[idx] = inv_trans_in0[idx] = coeffs->d[j * tests[k].width + i]; } call_ref(inv_trans_out0 + 24 + 8, 24, inv_trans_in0); call_new(inv_trans_out1 + 24 + 8, 24, inv_trans_in1); @@ -352,10 +352,10 @@ static void check_loop_filter(void) ff_vc1dsp_init(&h); - for (size_t t = 0; t < FF_ARRAY_ELEMS(tests); ++t) { - void (*func)(uint8_t *, ptrdiff_t, int) = *(void **)((intptr_t) &h + tests[t].offset); + for (size_t k = 0; k < FF_ARRAY_ELEMS(tests); ++k) { + void (*func)(uint8_t *, ptrdiff_t, int) = *(void **)((intptr_t) &h + tests[k].offset); declare_func_emms(AV_CPU_FLAG_MMX, void, uint8_t *, ptrdiff_t, int); - if (check_func(func, "vc1dsp.%s", tests[t].name)) { + if (check_func(func, "vc1dsp.%s", tests[k].name)) { for (int count = 1000; count > 0; --count) { int pq = rnd() % 31 + 1; RANDOMIZE_BUFFER8_MID_WEIGHTED(filter_buf, 24 * 48); @@ -368,9 +368,9 @@ static void check_loop_filter(void) for (int j = 0; j < 24; ++j) for (int i = 0; i < 48; ++i) filter_buf1[j * 48 + i] = 0x60 + 0x40 * (i >= 16 && j >= 4); - if (check_func(func, "vc1dsp.%s_bestcase", tests[t].name)) + if (check_func(func, "vc1dsp.%s_bestcase", tests[k].name)) bench_new(filter_buf1 + 4 * 48 + 16, 48, 1); - if (check_func(func, "vc1dsp.%s_worstcase", tests[t].name)) + if (check_func(func, "vc1dsp.%s_worstcase", tests[k].name)) bench_new(filter_buf1 + 4 * 48 + 16, 48, 31); } } @@ -475,13 +475,13 @@ static void check_mspel_pixels(void) ff_vc1dsp_init(&h); - for (size_t t = 0; t < FF_ARRAY_ELEMS(tests); ++t) { - const vc1op_pixels_func (*func)[16] = (vc1op_pixels_func(*)[16])((char*)&h + tests[t].offset); + for (size_t k = 0; k < FF_ARRAY_ELEMS(tests); ++k) { + const vc1op_pixels_func (*func)[16] = (vc1op_pixels_func(*)[16])((char*)&h + tests[k].offset); for (unsigned j = 0; j < 2; ++j) { const unsigned blocksize = 16 >> j; for 
(unsigned dxy = 0; dxy < 16; ++dxy) { - if (check_func(func[j][dxy], "vc1dsp.%s_mc%u%u_%u", tests[t].name, dxy & 3, dxy >> 2, blocksize)) { + if (check_func(func[j][dxy], "vc1dsp.%s_mc%u%u_%u", tests[k].name, dxy & 3, dxy >> 2, blocksize)) { declare_func_emms(AV_CPU_FLAG_MMX, void, uint8_t *, const uint8_t*, ptrdiff_t, int); size_t dst_offset = (rnd() % (MAX_BLOCK_SIZE / blocksize)) * blocksize; ptrdiff_t stride = (rnd() % (MAX_STRIDE / blocksize) + 1) * blocksize; -- 2.52.0 >From 25decfcd2e4c3edede90329c01106e72ec4bccfe Mon Sep 17 00:00:00 2001 From: Andreas Rheinhardt <[email protected]> Date: Mon, 4 May 2026 18:31:19 +0200 Subject: [PATCH 04/11] avcodec/x86/vc1dsp_mc: Move inverse transform into a file of its own Signed-off-by: Andreas Rheinhardt <[email protected]> --- libavcodec/x86/Makefile | 3 +- libavcodec/x86/vc1dsp_inv_trans.asm | 121 ++++++++++++++++++++++++++++ libavcodec/x86/vc1dsp_mc.asm | 97 ---------------------- 3 files changed, 123 insertions(+), 98 deletions(-) create mode 100644 libavcodec/x86/vc1dsp_inv_trans.asm diff --git a/libavcodec/x86/Makefile b/libavcodec/x86/Makefile index e87cb750f4..5d746d24c8 100644 --- a/libavcodec/x86/Makefile +++ b/libavcodec/x86/Makefile @@ -126,7 +126,8 @@ X86ASM-OBJS-$(CONFIG_QPELDSP) += x86/qpeldsp.o \ x86/fpel.o \ x86/qpel.o X86ASM-OBJS-$(CONFIG_RV34DSP) += x86/rv34dsp.o -X86ASM-OBJS-$(CONFIG_VC1DSP) += x86/vc1dsp_loopfilter.o \ +X86ASM-OBJS-$(CONFIG_VC1DSP) += x86/vc1dsp_inv_trans.o \ + x86/vc1dsp_loopfilter.o \ x86/vc1dsp_mc.o x86/fpel.o ifdef ARCH_X86_64 X86ASM-OBJS-$(CONFIG_IDCTDSP) += x86/simple_idct10.o diff --git a/libavcodec/x86/vc1dsp_inv_trans.asm b/libavcodec/x86/vc1dsp_inv_trans.asm new file mode 100644 index 0000000000..e1b74de6c4 --- /dev/null +++ b/libavcodec/x86/vc1dsp_inv_trans.asm @@ -0,0 +1,121 @@ +;****************************************************************************** +;* VC1 inverse transform +;* Copyright (c) 2009 Fiona Glaser +;* +;* This file is part of FFmpeg. 
+;* +;* FFmpeg is free software; you can redistribute it and/or +;* modify it under the terms of the GNU Lesser General Public +;* License as published by the Free Software Foundation; either +;* version 2.1 of the License, or (at your option) any later version. +;* +;* FFmpeg is distributed in the hope that it will be useful, +;* but WITHOUT ANY WARRANTY; without even the implied warranty of +;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +;* Lesser General Public License for more details. +;* +;* You should have received a copy of the GNU Lesser General Public +;* License along with FFmpeg; if not, write to the Free Software +;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +;****************************************************************************** + +%include "libavutil/x86/x86util.asm" + +SECTION .text + +%macro INV_TRANS_INIT 0 + movd m0, blockd + SPLATW m0, m0 + pxor m1, m1 + psubw m1, m0 + packuswb m0, m0 + packuswb m1, m1 + + DEFINE_ARGS dest, linesize, linesize3 + lea linesize3q, [linesizeq*3] +%endmacro + +%macro INV_TRANS_PROCESS 1 + mov%1 m2, [destq+linesizeq*0] + mov%1 m3, [destq+linesizeq*1] + mov%1 m4, [destq+linesizeq*2] + mov%1 m5, [destq+linesize3q] + paddusb m2, m0 + paddusb m3, m0 + paddusb m4, m0 + paddusb m5, m0 + psubusb m2, m1 + psubusb m3, m1 + psubusb m4, m1 + psubusb m5, m1 + mov%1 [linesizeq*0+destq], m2 + mov%1 [linesizeq*1+destq], m3 + mov%1 [linesizeq*2+destq], m4 + mov%1 [linesize3q +destq], m5 +%endmacro + +; ff_vc1_inv_trans_?x?_dc_mmxext(uint8_t *dest, ptrdiff_t linesize, int16_t *block) +INIT_MMX mmxext +cglobal vc1_inv_trans_4x4_dc, 3,4,0, dest, linesize, block + movsx r3d, WORD [blockq] + mov blockd, r3d ; dc + shl blockd, 4 ; 16 * dc + lea blockd, [blockq+r3+4] ; 17 * dc + 4 + sar blockd, 3 ; >> 3 + mov r3d, blockd ; dc + shl blockd, 4 ; 16 * dc + lea blockd, [blockq+r3+64] ; 17 * dc + 64 + sar blockd, 7 ; >> 7 + + INV_TRANS_INIT + + INV_TRANS_PROCESS h + RET + +INIT_MMX 
mmxext +cglobal vc1_inv_trans_4x8_dc, 3,4,0, dest, linesize, block + movsx r3d, WORD [blockq] + mov blockd, r3d ; dc + shl blockd, 4 ; 16 * dc + lea blockd, [blockq+r3+4] ; 17 * dc + 4 + sar blockd, 3 ; >> 3 + shl blockd, 2 ; 4 * dc + lea blockd, [blockq*3+64] ; 12 * dc + 64 + sar blockd, 7 ; >> 7 + + INV_TRANS_INIT + + INV_TRANS_PROCESS h + lea destq, [destq+linesizeq*4] + INV_TRANS_PROCESS h + RET + +INIT_MMX mmxext +cglobal vc1_inv_trans_8x4_dc, 3,4,0, dest, linesize, block + movsx blockd, WORD [blockq] ; dc + lea blockd, [blockq*3+1] ; 3 * dc + 1 + sar blockd, 1 ; >> 1 + mov r3d, blockd ; dc + shl blockd, 4 ; 16 * dc + lea blockd, [blockq+r3+64] ; 17 * dc + 64 + sar blockd, 7 ; >> 7 + + INV_TRANS_INIT + + INV_TRANS_PROCESS a + RET + +INIT_MMX mmxext +cglobal vc1_inv_trans_8x8_dc, 3,3,0, dest, linesize, block + movsx blockd, WORD [blockq] ; dc + lea blockd, [blockq*3+1] ; 3 * dc + 1 + sar blockd, 1 ; >> 1 + lea blockd, [blockq*3+16] ; 3 * dc + 16 + sar blockd, 5 ; >> 5 + + INV_TRANS_INIT + + INV_TRANS_PROCESS a + lea destq, [destq+linesizeq*4] + INV_TRANS_PROCESS a + RET diff --git a/libavcodec/x86/vc1dsp_mc.asm b/libavcodec/x86/vc1dsp_mc.asm index b5f6bcec9b..1cb62ac409 100644 --- a/libavcodec/x86/vc1dsp_mc.asm +++ b/libavcodec/x86/vc1dsp_mc.asm @@ -192,100 +192,3 @@ HOR_16B_SHIFT2 OP_PUT, put INIT_MMX mmxext HOR_16B_SHIFT2 OP_AVG, avg %endif ; HAVE_MMX_INLINE - -%macro INV_TRANS_INIT 0 - movd m0, blockd - SPLATW m0, m0 - pxor m1, m1 - psubw m1, m0 - packuswb m0, m0 - packuswb m1, m1 - - DEFINE_ARGS dest, linesize, linesize3 - lea linesize3q, [linesizeq*3] -%endmacro - -%macro INV_TRANS_PROCESS 1 - mov%1 m2, [destq+linesizeq*0] - mov%1 m3, [destq+linesizeq*1] - mov%1 m4, [destq+linesizeq*2] - mov%1 m5, [destq+linesize3q] - paddusb m2, m0 - paddusb m3, m0 - paddusb m4, m0 - paddusb m5, m0 - psubusb m2, m1 - psubusb m3, m1 - psubusb m4, m1 - psubusb m5, m1 - mov%1 [linesizeq*0+destq], m2 - mov%1 [linesizeq*1+destq], m3 - mov%1 [linesizeq*2+destq], m4 - mov%1 
[linesize3q +destq], m5 -%endmacro - -; ff_vc1_inv_trans_?x?_dc_mmxext(uint8_t *dest, ptrdiff_t linesize, int16_t *block) -INIT_MMX mmxext -cglobal vc1_inv_trans_4x4_dc, 3,4,0, dest, linesize, block - movsx r3d, WORD [blockq] - mov blockd, r3d ; dc - shl blockd, 4 ; 16 * dc - lea blockd, [blockq+r3+4] ; 17 * dc + 4 - sar blockd, 3 ; >> 3 - mov r3d, blockd ; dc - shl blockd, 4 ; 16 * dc - lea blockd, [blockq+r3+64] ; 17 * dc + 64 - sar blockd, 7 ; >> 7 - - INV_TRANS_INIT - - INV_TRANS_PROCESS h - RET - -INIT_MMX mmxext -cglobal vc1_inv_trans_4x8_dc, 3,4,0, dest, linesize, block - movsx r3d, WORD [blockq] - mov blockd, r3d ; dc - shl blockd, 4 ; 16 * dc - lea blockd, [blockq+r3+4] ; 17 * dc + 4 - sar blockd, 3 ; >> 3 - shl blockd, 2 ; 4 * dc - lea blockd, [blockq*3+64] ; 12 * dc + 64 - sar blockd, 7 ; >> 7 - - INV_TRANS_INIT - - INV_TRANS_PROCESS h - lea destq, [destq+linesizeq*4] - INV_TRANS_PROCESS h - RET - -INIT_MMX mmxext -cglobal vc1_inv_trans_8x4_dc, 3,4,0, dest, linesize, block - movsx blockd, WORD [blockq] ; dc - lea blockd, [blockq*3+1] ; 3 * dc + 1 - sar blockd, 1 ; >> 1 - mov r3d, blockd ; dc - shl blockd, 4 ; 16 * dc - lea blockd, [blockq+r3+64] ; 17 * dc + 64 - sar blockd, 7 ; >> 7 - - INV_TRANS_INIT - - INV_TRANS_PROCESS a - RET - -INIT_MMX mmxext -cglobal vc1_inv_trans_8x8_dc, 3,3,0, dest, linesize, block - movsx blockd, WORD [blockq] ; dc - lea blockd, [blockq*3+1] ; 3 * dc + 1 - sar blockd, 1 ; >> 1 - lea blockd, [blockq*3+16] ; 3 * dc + 16 - sar blockd, 5 ; >> 5 - - INV_TRANS_INIT - - INV_TRANS_PROCESS a - lea destq, [destq+linesizeq*4] - INV_TRANS_PROCESS a - RET -- 2.52.0 >From f37bd0241ee904f282d9e580213b315fa6abb246 Mon Sep 17 00:00:00 2001 From: Andreas Rheinhardt <[email protected]> Date: Mon, 4 May 2026 19:51:29 +0200 Subject: [PATCH 05/11] avcodec/x86/vc1dsp_mc: Add size 8 horizontal SSSE3 mc functions pmaddubsw strikes again. 
vc1dsp.avg_vc1_mspel_pixels_tab_mc10_8_c: 150.2 ( 1.00x) vc1dsp.avg_vc1_mspel_pixels_tab_mc10_8_mmxext: 44.5 ( 3.38x) vc1dsp.avg_vc1_mspel_pixels_tab_mc10_8_ssse3: 18.5 ( 8.12x) vc1dsp.avg_vc1_mspel_pixels_tab_mc20_8_c: 288.2 ( 1.00x) vc1dsp.avg_vc1_mspel_pixels_tab_mc20_8_mmxext: 37.7 ( 7.64x) vc1dsp.avg_vc1_mspel_pixels_tab_mc20_8_ssse3: 18.1 (15.97x) vc1dsp.avg_vc1_mspel_pixels_tab_mc30_8_c: 155.4 ( 1.00x) vc1dsp.avg_vc1_mspel_pixels_tab_mc30_8_mmxext: 46.5 ( 3.34x) vc1dsp.avg_vc1_mspel_pixels_tab_mc30_8_ssse3: 18.1 ( 8.60x) vc1dsp.put_vc1_mspel_pixels_tab_mc10_8_c: 282.2 ( 1.00x) vc1dsp.put_vc1_mspel_pixels_tab_mc10_8_mmx: 42.7 ( 6.61x) vc1dsp.put_vc1_mspel_pixels_tab_mc10_8_ssse3: 16.4 (17.16x) vc1dsp.put_vc1_mspel_pixels_tab_mc20_8_c: 223.4 ( 1.00x) vc1dsp.put_vc1_mspel_pixels_tab_mc20_8_mmx: 36.3 ( 6.15x) vc1dsp.put_vc1_mspel_pixels_tab_mc20_8_ssse3: 16.4 (13.59x) vc1dsp.put_vc1_mspel_pixels_tab_mc30_8_c: 255.2 ( 1.00x) vc1dsp.put_vc1_mspel_pixels_tab_mc30_8_mmx: 43.6 ( 5.85x) vc1dsp.put_vc1_mspel_pixels_tab_mc30_8_ssse3: 16.4 (15.52x) Signed-off-by: Andreas Rheinhardt <[email protected]> --- libavcodec/x86/vc1dsp_init.c | 12 +++++++ libavcodec/x86/vc1dsp_mc.asm | 65 ++++++++++++++++++++++++++++++++++++ 2 files changed, 77 insertions(+) diff --git a/libavcodec/x86/vc1dsp_init.c b/libavcodec/x86/vc1dsp_init.c index 9f80048791..0345d0e8c8 100644 --- a/libavcodec/x86/vc1dsp_init.c +++ b/libavcodec/x86/vc1dsp_init.c @@ -89,6 +89,14 @@ void ff_vc1_inv_trans_8x4_dc_mmxext(uint8_t *dest, ptrdiff_t linesize, void ff_vc1_inv_trans_8x8_dc_mmxext(uint8_t *dest, ptrdiff_t linesize, int16_t *block); +#define MSPEL_FUNC(OP, X, Y, SIZE, XMM) \ + void ff_vc1_ ## OP ## _mspel_mc ## X ## Y ## _ ## SIZE ##_ ## XMM \ + (uint8_t *dst, const uint8_t *src, ptrdiff_t stride, int rnd); \ + dsp->OP ## _vc1_mspel_pixels_tab[SIZE == 8][X + 4 * Y] = \ + ff_vc1_ ## OP ## _mspel_mc ## X ## Y## _ ## SIZE ##_ ## XMM +#define MSPEL_FUNCS_SIZE(X, Y, SIZE, XMM) \ + MSPEL_FUNC(put, X, Y, SIZE, 
XMM); \ + MSPEL_FUNC(avg, X, Y, SIZE, XMM) av_cold void ff_vc1dsp_init_x86(VC1DSPContext *dsp) { @@ -133,6 +141,10 @@ av_cold void ff_vc1dsp_init_x86(VC1DSPContext *dsp) ASSIGN_LF816(ssse3); dsp->put_no_rnd_vc1_chroma_pixels_tab[0] = ff_put_vc1_chroma_mc8_nornd_ssse3; dsp->avg_no_rnd_vc1_chroma_pixels_tab[0] = ff_avg_vc1_chroma_mc8_nornd_ssse3; + + MSPEL_FUNCS_SIZE(1, 0, 8, ssse3); + MSPEL_FUNCS_SIZE(2, 0, 8, ssse3); + MSPEL_FUNCS_SIZE(3, 0, 8, ssse3); } if (EXTERNAL_SSE4(cpu_flags)) { dsp->vc1_h_loop_filter8 = ff_vc1_h_loop_filter8_sse4; diff --git a/libavcodec/x86/vc1dsp_mc.asm b/libavcodec/x86/vc1dsp_mc.asm index 1cb62ac409..b5bcdcaf0d 100644 --- a/libavcodec/x86/vc1dsp_mc.asm +++ b/libavcodec/x86/vc1dsp_mc.asm @@ -21,6 +21,12 @@ %include "libavutil/x86/x86util.asm" +SECTION_RODATA + +pb_m4_36: times 8 db -4, 36 +pb_m4_53: times 8 db -4, 53 +pb_m3_18: times 8 db -3, 18 + cextern pw_9 cextern pw_128 @@ -192,3 +198,62 @@ HOR_16B_SHIFT2 OP_PUT, put INIT_MMX mmxext HOR_16B_SHIFT2 OP_AVG, avg %endif ; HAVE_MMX_INLINE + +INIT_XMM ssse3 +%macro HOR_8B 2 +%define MOVU movq +%define MOVA movq + +cglobal vc1_%1_mspel_mc10_%2, 4, 4, 6, dst, src, stride, rnd + mova m1, [pb_m4_53] + mova m2, [pb_m3_18] + sub rndd, 32 + jmp vc1_%1_mspel_mc30_%2_after_prologue + +cglobal vc1_%1_mspel_mc20_%2, 4, 4, 6, dst, src, stride, rnd + mova m1, [pb_m4_36] + lea rndd, [4*rndd-32] + mova m2, m1 + jmp vc1_%1_mspel_mc30_%2_after_prologue + +cglobal vc1_%1_mspel_mc30_%2, 4, 4, 6, dst, src, stride, rnd + mova m2, [pb_m4_53] + mova m1, [pb_m3_18] + sub rndd, 32 + +vc1_%1_mspel_mc30_%2_after_prologue: + movd m0, rndd + WIN64_SPILL_XMM 7 +%define hd rndd + mov hd, %2 + SPLATW m0, m0 +.loop: + MOVU m3, [srcq-1] + MOVU m4, [srcq] + MOVU m5, [srcq+1] + MOVU m6, [srcq+2] + + punpcklbw m3, m4 + pmaddubsw m3, m1 +%ifidn %1,avg + movq m4, [dstq] +%endif + punpcklbw m6, m5 + pmaddubsw m6, m2 + add srcq, strideq + psubw m3, m0 + paddw m3, m6 + psraw m3, 6 + packuswb m3, m3 +%ifidn %1,avg + pavgb m3, m4 
+%endif + MOVA [dstq], m3 + add dstq, strideq + dec hd + jnz .loop + RET +%endmacro + +HOR_8B put, 8 +HOR_8B avg, 8 -- 2.52.0 >From 08d22243ce115e821c9689b033dbca33330a308e Mon Sep 17 00:00:00 2001 From: Andreas Rheinhardt <[email protected]> Date: Mon, 4 May 2026 20:28:37 +0200 Subject: [PATCH 06/11] avcodec/x86/vc1dsp_mc: Add size 16 horizontal SSSE3 mc functions vc1dsp.avg_vc1_mspel_pixels_tab_mc10_16_c: 309.1 ( 1.00x) vc1dsp.avg_vc1_mspel_pixels_tab_mc10_16_mmxext: 177.3 ( 1.74x) vc1dsp.avg_vc1_mspel_pixels_tab_mc10_16_ssse3: 52.3 ( 5.91x) vc1dsp.avg_vc1_mspel_pixels_tab_mc20_16_c: 279.6 ( 1.00x) vc1dsp.avg_vc1_mspel_pixels_tab_mc20_16_mmxext: 148.8 ( 1.88x) vc1dsp.avg_vc1_mspel_pixels_tab_mc20_16_ssse3: 52.1 ( 5.37x) vc1dsp.avg_vc1_mspel_pixels_tab_mc30_16_c: 332.6 ( 1.00x) vc1dsp.avg_vc1_mspel_pixels_tab_mc30_16_mmxext: 177.3 ( 1.88x) vc1dsp.avg_vc1_mspel_pixels_tab_mc30_16_ssse3: 52.5 ( 6.33x) vc1dsp.put_vc1_mspel_pixels_tab_mc10_16_c: 288.8 ( 1.00x) vc1dsp.put_vc1_mspel_pixels_tab_mc10_16_mmx: 170.3 ( 1.70x) vc1dsp.put_vc1_mspel_pixels_tab_mc10_16_ssse3: 51.3 ( 5.63x) vc1dsp.put_vc1_mspel_pixels_tab_mc20_16_c: 236.2 ( 1.00x) vc1dsp.put_vc1_mspel_pixels_tab_mc20_16_mmx: 144.1 ( 1.64x) vc1dsp.put_vc1_mspel_pixels_tab_mc20_16_ssse3: 51.3 ( 4.61x) vc1dsp.put_vc1_mspel_pixels_tab_mc30_16_c: 286.6 ( 1.00x) vc1dsp.put_vc1_mspel_pixels_tab_mc30_16_mmx: 170.1 ( 1.69x) vc1dsp.put_vc1_mspel_pixels_tab_mc30_16_ssse3: 51.2 ( 5.60x) Signed-off-by: Andreas Rheinhardt <[email protected]> --- libavcodec/x86/vc1dsp_init.c | 9 ++++++--- libavcodec/x86/vc1dsp_mc.asm | 30 +++++++++++++++++++++++++++++- 2 files changed, 35 insertions(+), 4 deletions(-) diff --git a/libavcodec/x86/vc1dsp_init.c b/libavcodec/x86/vc1dsp_init.c index 0345d0e8c8..b6fc7d3bfc 100644 --- a/libavcodec/x86/vc1dsp_init.c +++ b/libavcodec/x86/vc1dsp_init.c @@ -97,6 +97,9 @@ void ff_vc1_inv_trans_8x8_dc_mmxext(uint8_t *dest, ptrdiff_t linesize, #define MSPEL_FUNCS_SIZE(X, Y, SIZE, XMM) \ MSPEL_FUNC(put, X, 
Y, SIZE, XMM); \ MSPEL_FUNC(avg, X, Y, SIZE, XMM) +#define MSPEL_FUNCS(X, Y, XMM) \ + MSPEL_FUNCS_SIZE(X, Y, 8, XMM); \ + MSPEL_FUNCS_SIZE(X, Y, 16, XMM) av_cold void ff_vc1dsp_init_x86(VC1DSPContext *dsp) { @@ -142,9 +145,9 @@ av_cold void ff_vc1dsp_init_x86(VC1DSPContext *dsp) dsp->put_no_rnd_vc1_chroma_pixels_tab[0] = ff_put_vc1_chroma_mc8_nornd_ssse3; dsp->avg_no_rnd_vc1_chroma_pixels_tab[0] = ff_avg_vc1_chroma_mc8_nornd_ssse3; - MSPEL_FUNCS_SIZE(1, 0, 8, ssse3); - MSPEL_FUNCS_SIZE(2, 0, 8, ssse3); - MSPEL_FUNCS_SIZE(3, 0, 8, ssse3); + MSPEL_FUNCS(1, 0, ssse3); + MSPEL_FUNCS(2, 0, ssse3); + MSPEL_FUNCS(3, 0, ssse3); } if (EXTERNAL_SSE4(cpu_flags)) { dsp->vc1_h_loop_filter8 = ff_vc1_h_loop_filter8_sse4; diff --git a/libavcodec/x86/vc1dsp_mc.asm b/libavcodec/x86/vc1dsp_mc.asm index b5bcdcaf0d..8bdc86627a 100644 --- a/libavcodec/x86/vc1dsp_mc.asm +++ b/libavcodec/x86/vc1dsp_mc.asm @@ -201,8 +201,13 @@ HOR_16B_SHIFT2 OP_AVG, avg INIT_XMM ssse3 %macro HOR_8B 2 +%if %2 == 8 %define MOVU movq %define MOVA movq +%else +%define MOVU movu +%define MOVA mova +%endif cglobal vc1_%1_mspel_mc10_%2, 4, 4, 6, dst, src, stride, rnd mova m1, [pb_m4_53] @@ -223,7 +228,7 @@ cglobal vc1_%1_mspel_mc30_%2, 4, 4, 6, dst, src, stride, rnd vc1_%1_mspel_mc30_%2_after_prologue: movd m0, rndd - WIN64_SPILL_XMM 7 + WIN64_SPILL_XMM 7+(%2>>4) %define hd rndd mov hd, %2 SPLATW m0, m0 @@ -233,6 +238,7 @@ vc1_%1_mspel_mc30_%2_after_prologue: MOVU m5, [srcq+1] MOVU m6, [srcq+2] +%if %2 == 8 punpcklbw m3, m4 pmaddubsw m3, m1 %ifidn %1,avg @@ -247,6 +253,25 @@ vc1_%1_mspel_mc30_%2_after_prologue: packuswb m3, m3 %ifidn %1,avg pavgb m3, m4 +%endif +%else + SBUTTERFLY bw, 3, 4, 7 + pmaddubsw m3, m1 + pmaddubsw m4, m1 + SBUTTERFLY bw, 6, 5, 7 + pmaddubsw m6, m2 + pmaddubsw m5, m2 + add srcq, strideq + psubw m3, m0 + psubw m4, m0 + paddw m3, m6 + paddw m4, m5 + psraw m3, 6 + psraw m4, 6 + packuswb m3, m4 +%ifidn %1, avg + pavgb m3, [dstq] +%endif %endif MOVA [dstq], m3 add dstq, strideq @@ -257,3 
+282,6 @@ vc1_%1_mspel_mc30_%2_after_prologue: HOR_8B put, 8 HOR_8B avg, 8 + +HOR_8B put, 16 +HOR_8B avg, 16 -- 2.52.0 >From bc1986cbcf509a76874cf8748980a9c2aa8e846b Mon Sep 17 00:00:00 2001 From: Andreas Rheinhardt <[email protected]> Date: Mon, 4 May 2026 20:55:31 +0200 Subject: [PATCH 07/11] avcodec/x86/vc1dsp_mmx: Remove purely horizontal MMX mc functions Superseded by SSSE3. By the way, the SSSE3 functions occupy 816B, but the functions removed now occupied 2304B with GCC and 6512B with Clang (which inlined everything). Signed-off-by: Andreas Rheinhardt <[email protected]> --- libavcodec/x86/vc1dsp_mmx.c | 14 -------------- 1 file changed, 14 deletions(-) diff --git a/libavcodec/x86/vc1dsp_mmx.c b/libavcodec/x86/vc1dsp_mmx.c index 3c771feb4b..f89becff64 100644 --- a/libavcodec/x86/vc1dsp_mmx.c +++ b/libavcodec/x86/vc1dsp_mmx.c @@ -351,7 +351,6 @@ static void OP ## vc1_mspel_mc(uint8_t *dst, const uint8_t *src, int stride,\ ::: "memory"\ );\ \ - if (vmode) { /* Vertical filter to apply */\ if (hmode) { /* Horizontal filter to apply, output to tmp */\ static const int shift_value[] = { 0, 5, 1, 5 };\ int shift = (shift_value[hmode]+shift_value[vmode])>>1;\ @@ -368,10 +367,6 @@ static void OP ## vc1_mspel_mc(uint8_t *dst, const uint8_t *src, int stride,\ vc1_put_shift_8bits[vmode](dst, src, stride, 1-rnd, stride);\ return;\ }\ - }\ -\ - /* Horizontal mode with no vertical mode */\ - vc1_put_shift_8bits[hmode](dst, src, stride, rnd, 1);\ } \ static void OP ## vc1_mspel_mc_16(uint8_t *dst, const uint8_t *src, \ int stride, int hmode, int vmode, int rnd)\ @@ -421,17 +416,14 @@ DECLARE_FUNCTION(0, 1) DECLARE_FUNCTION(0, 2) DECLARE_FUNCTION(0, 3) -DECLARE_FUNCTION(1, 0) DECLARE_FUNCTION(1, 1) DECLARE_FUNCTION(1, 2) DECLARE_FUNCTION(1, 3) -DECLARE_FUNCTION(2, 0) DECLARE_FUNCTION(2, 1) DECLARE_FUNCTION(2, 2) DECLARE_FUNCTION(2, 3) -DECLARE_FUNCTION(3, 0) DECLARE_FUNCTION(3, 1) DECLARE_FUNCTION(3, 2) DECLARE_FUNCTION(3, 3) @@ -446,17 +438,14 @@ av_cold void 
ff_vc1dsp_init_mmx(VC1DSPContext *dsp) FN_ASSIGN(put_, 0, 2, _mmx); FN_ASSIGN(put_, 0, 3, _mmx); - FN_ASSIGN(put_, 1, 0, _mmx); FN_ASSIGN(put_, 1, 1, _mmx); FN_ASSIGN(put_, 1, 2, _mmx); FN_ASSIGN(put_, 1, 3, _mmx); - FN_ASSIGN(put_, 2, 0, _mmx); FN_ASSIGN(put_, 2, 1, _mmx); FN_ASSIGN(put_, 2, 2, _mmx); FN_ASSIGN(put_, 2, 3, _mmx); - FN_ASSIGN(put_, 3, 0, _mmx); FN_ASSIGN(put_, 3, 1, _mmx); FN_ASSIGN(put_, 3, 2, _mmx); FN_ASSIGN(put_, 3, 3, _mmx); @@ -468,17 +457,14 @@ av_cold void ff_vc1dsp_init_mmxext(VC1DSPContext *dsp) FN_ASSIGN(avg_, 0, 2, _mmxext); FN_ASSIGN(avg_, 0, 3, _mmxext); - FN_ASSIGN(avg_, 1, 0, _mmxext); FN_ASSIGN(avg_, 1, 1, _mmxext); FN_ASSIGN(avg_, 1, 2, _mmxext); FN_ASSIGN(avg_, 1, 3, _mmxext); - FN_ASSIGN(avg_, 2, 0, _mmxext); FN_ASSIGN(avg_, 2, 1, _mmxext); FN_ASSIGN(avg_, 2, 2, _mmxext); FN_ASSIGN(avg_, 2, 3, _mmxext); - FN_ASSIGN(avg_, 3, 0, _mmxext); FN_ASSIGN(avg_, 3, 1, _mmxext); FN_ASSIGN(avg_, 3, 2, _mmxext); FN_ASSIGN(avg_, 3, 3, _mmxext); -- 2.52.0 >From 9b9e68911c9d2e59ca48cff7278fcd76a0ceb7f5 Mon Sep 17 00:00:00 2001 From: Andreas Rheinhardt <[email protected]> Date: Mon, 4 May 2026 22:11:38 +0200 Subject: [PATCH 08/11] avcodec/x86/vc1dsp_mc: Add size 8 vertical SSSE3 mc functions vc1dsp.avg_vc1_mspel_pixels_tab_mc01_8_c: 165.6 ( 1.00x) vc1dsp.avg_vc1_mspel_pixels_tab_mc01_8_mmxext: 44.4 ( 3.73x) vc1dsp.avg_vc1_mspel_pixels_tab_mc01_8_ssse3: 18.5 ( 8.97x) vc1dsp.avg_vc1_mspel_pixels_tab_mc02_8_c: 152.5 ( 1.00x) vc1dsp.avg_vc1_mspel_pixels_tab_mc02_8_mmxext: 37.3 ( 4.09x) vc1dsp.avg_vc1_mspel_pixels_tab_mc02_8_ssse3: 18.5 ( 8.25x) vc1dsp.avg_vc1_mspel_pixels_tab_mc03_8_c: 162.9 ( 1.00x) vc1dsp.avg_vc1_mspel_pixels_tab_mc03_8_mmxext: 44.1 ( 3.69x) vc1dsp.avg_vc1_mspel_pixels_tab_mc03_8_ssse3: 18.3 ( 8.88x) vc1dsp.put_vc1_mspel_pixels_tab_mc01_8_c: 150.5 ( 1.00x) vc1dsp.put_vc1_mspel_pixels_tab_mc01_8_mmx: 42.4 ( 3.55x) vc1dsp.put_vc1_mspel_pixels_tab_mc01_8_ssse3: 16.5 ( 9.11x) vc1dsp.put_vc1_mspel_pixels_tab_mc02_8_c: 78.4 ( 1.00x) 
vc1dsp.put_vc1_mspel_pixels_tab_mc02_8_mmx: 36.1 ( 2.17x) vc1dsp.put_vc1_mspel_pixels_tab_mc02_8_ssse3: 16.5 ( 4.76x) vc1dsp.put_vc1_mspel_pixels_tab_mc03_8_c: 144.7 ( 1.00x) vc1dsp.put_vc1_mspel_pixels_tab_mc03_8_mmx: 42.6 ( 3.40x) vc1dsp.put_vc1_mspel_pixels_tab_mc03_8_ssse3: 16.3 ( 8.89x) Signed-off-by: Andreas Rheinhardt <[email protected]> --- libavcodec/x86/vc1dsp_init.c | 3 ++ libavcodec/x86/vc1dsp_mc.asm | 72 ++++++++++++++++++++++++++++++++++++ 2 files changed, 75 insertions(+) diff --git a/libavcodec/x86/vc1dsp_init.c b/libavcodec/x86/vc1dsp_init.c index b6fc7d3bfc..883edbe117 100644 --- a/libavcodec/x86/vc1dsp_init.c +++ b/libavcodec/x86/vc1dsp_init.c @@ -145,6 +145,9 @@ av_cold void ff_vc1dsp_init_x86(VC1DSPContext *dsp) dsp->put_no_rnd_vc1_chroma_pixels_tab[0] = ff_put_vc1_chroma_mc8_nornd_ssse3; dsp->avg_no_rnd_vc1_chroma_pixels_tab[0] = ff_avg_vc1_chroma_mc8_nornd_ssse3; + MSPEL_FUNCS_SIZE(0, 1, 8, ssse3); + MSPEL_FUNCS_SIZE(0, 2, 8, ssse3); + MSPEL_FUNCS_SIZE(0, 3, 8, ssse3); MSPEL_FUNCS(1, 0, ssse3); MSPEL_FUNCS(2, 0, ssse3); MSPEL_FUNCS(3, 0, ssse3); diff --git a/libavcodec/x86/vc1dsp_mc.asm b/libavcodec/x86/vc1dsp_mc.asm index 8bdc86627a..924aaee54d 100644 --- a/libavcodec/x86/vc1dsp_mc.asm +++ b/libavcodec/x86/vc1dsp_mc.asm @@ -23,7 +23,12 @@ SECTION_RODATA +pb_m4_18: times 8 db -4, 18 +pb_53_m3: times 8 db 53, -3 +pb_m3_53: times 8 db -3, 53 +pb_18_m4: times 8 db 18, -4 pb_m4_36: times 8 db -4, 36 +pb_36_m4: times 8 db 36, -4 pb_m4_53: times 8 db -4, 53 pb_m3_18: times 8 db -3, 18 @@ -285,3 +290,70 @@ HOR_8B avg, 8 HOR_8B put, 16 HOR_8B avg, 16 + +%macro VER_8B 2 +%define MOVU movq +%define MOVA movq + +cglobal vc1_%1_mspel_mc01_%2, 4, 4, 6, dst, src, stride, rnd + mova m1, [pb_m4_18] + mova m2, [pb_53_m3] + add rndd, 31 + jmp vc1_%1_mspel_mc03_%2_after_prologue + +cglobal vc1_%1_mspel_mc02_%2, 4, 4, 6, dst, src, stride, rnd + mova m1, [pb_m4_36] + mova m2, [pb_36_m4] + lea rndd, [4*rndd+28] + jmp vc1_%1_mspel_mc03_%2_after_prologue + +cglobal 
vc1_%1_mspel_mc03_%2, 4, 4, 6, dst, src, stride, rnd + mova m1, [pb_m3_53] + mova m2, [pb_18_m4] + add rndd, 31 + +vc1_%1_mspel_mc03_%2_after_prologue: + neg strideq + movd m0, rndd + MOVU m3, [srcq+strideq] + neg strideq + MOVU m4, [srcq] + MOVU m5, [srcq+strideq] + SPLATW m0, m0 + WIN64_SPILL_XMM 8 + lea srcq, [srcq+2*strideq] +%define hd rndd + punpcklbw m3, m5 + mov hd, %2 + +.loop: + MOVU m6, [srcq] + pmaddubsw m3, m1 + punpcklbw m4, m6 + paddw m3, m0 + pmaddubsw m7, m4, m2 + add srcq, strideq + paddw m7, m3 + mova m3, m4 +%ifidn %1, avg + movq m4, [dstq] +%endif + psraw m7, 6 +%ifnidn %1, avg + mova m4, m5 +%endif + packuswb m7, m7 +%ifidn %1, avg + pavgb m7, m4 + mova m4, m5 +%endif + MOVA [dstq], m7 + add dstq, strideq + mova m5, m6 + dec hd + jnz .loop + RET +%endmacro + +VER_8B put, 8 +VER_8B avg, 8 -- 2.52.0 >From 634815b0709ba2888f7021d166f8ab2c609efd0c Mon Sep 17 00:00:00 2001 From: Andreas Rheinhardt <[email protected]> Date: Thu, 16 Oct 2025 20:18:41 +0200 Subject: [PATCH 09/11] avcodec/x86/vc1dsp_mc: Don't xor unnecessarily m0 is not used at all in this function. 
Signed-off-by: Andreas Rheinhardt <[email protected]> --- libavcodec/x86/vc1dsp_mc.asm | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/libavcodec/x86/vc1dsp_mc.asm b/libavcodec/x86/vc1dsp_mc.asm index 924aaee54d..1481a6e3e6 100644 --- a/libavcodec/x86/vc1dsp_mc.asm +++ b/libavcodec/x86/vc1dsp_mc.asm @@ -169,7 +169,6 @@ cglobal vc1_%2_hor_16b_shift2, 4, 5, 0, dst, stride, src, rnd, h LOAD_ROUNDER_MMX rndd mova m5, [pw_9] mova m6, [pw_128] - pxor m0, m0 .loop: mova m1, [srcq + 2 * 0] @@ -330,8 +329,8 @@ vc1_%1_mspel_mc03_%2_after_prologue: MOVU m6, [srcq] pmaddubsw m3, m1 punpcklbw m4, m6 - paddw m3, m0 pmaddubsw m7, m4, m2 + paddw m3, m0 add srcq, strideq paddw m7, m3 mova m3, m4 -- 2.52.0 >From 4c8bf3044b01953c92040313f7fcee085ddb8316 Mon Sep 17 00:00:00 2001 From: Andreas Rheinhardt <[email protected]> Date: Tue, 5 May 2026 12:17:30 +0200 Subject: [PATCH 10/11] avcodec/x86/vc1dsp_mc: Add size 16 vertical SSSE3 mc functions vc1dsp.avg_vc1_mspel_pixels_tab_mc01_16_c: 334.4 ( 1.00x) vc1dsp.avg_vc1_mspel_pixels_tab_mc01_16_mmxext: 177.5 ( 1.88x) vc1dsp.avg_vc1_mspel_pixels_tab_mc01_16_ssse3: 52.3 ( 6.40x) vc1dsp.avg_vc1_mspel_pixels_tab_mc02_16_c: 306.9 ( 1.00x) vc1dsp.avg_vc1_mspel_pixels_tab_mc02_16_mmxext: 149.2 ( 2.06x) vc1dsp.avg_vc1_mspel_pixels_tab_mc02_16_ssse3: 52.2 ( 5.88x) vc1dsp.avg_vc1_mspel_pixels_tab_mc03_16_c: 334.2 ( 1.00x) vc1dsp.avg_vc1_mspel_pixels_tab_mc03_16_mmxext: 176.5 ( 1.89x) vc1dsp.avg_vc1_mspel_pixels_tab_mc03_16_ssse3: 51.9 ( 6.44x) vc1dsp.put_vc1_mspel_pixels_tab_mc01_16_c: 311.9 ( 1.00x) vc1dsp.put_vc1_mspel_pixels_tab_mc01_16_mmx: 169.8 ( 1.84x) vc1dsp.put_vc1_mspel_pixels_tab_mc01_16_ssse3: 51.6 ( 6.04x) vc1dsp.put_vc1_mspel_pixels_tab_mc02_16_c: 279.3 ( 1.00x) vc1dsp.put_vc1_mspel_pixels_tab_mc02_16_mmx: 144.1 ( 1.94x) vc1dsp.put_vc1_mspel_pixels_tab_mc02_16_ssse3: 51.7 ( 5.40x) vc1dsp.put_vc1_mspel_pixels_tab_mc03_16_c: 310.6 ( 1.00x) vc1dsp.put_vc1_mspel_pixels_tab_mc03_16_mmx: 171.0 ( 1.82x) 
vc1dsp.put_vc1_mspel_pixels_tab_mc03_16_ssse3: 51.6 ( 6.02x) Signed-off-by: Andreas Rheinhardt <[email protected]> --- libavcodec/x86/vc1dsp_init.c | 6 +-- libavcodec/x86/vc1dsp_mc.asm | 91 ++++++++++++++++++++++++++++++------ 2 files changed, 80 insertions(+), 17 deletions(-) diff --git a/libavcodec/x86/vc1dsp_init.c b/libavcodec/x86/vc1dsp_init.c index 883edbe117..7108c9052b 100644 --- a/libavcodec/x86/vc1dsp_init.c +++ b/libavcodec/x86/vc1dsp_init.c @@ -145,9 +145,9 @@ av_cold void ff_vc1dsp_init_x86(VC1DSPContext *dsp) dsp->put_no_rnd_vc1_chroma_pixels_tab[0] = ff_put_vc1_chroma_mc8_nornd_ssse3; dsp->avg_no_rnd_vc1_chroma_pixels_tab[0] = ff_avg_vc1_chroma_mc8_nornd_ssse3; - MSPEL_FUNCS_SIZE(0, 1, 8, ssse3); - MSPEL_FUNCS_SIZE(0, 2, 8, ssse3); - MSPEL_FUNCS_SIZE(0, 3, 8, ssse3); + MSPEL_FUNCS(0, 1, ssse3); + MSPEL_FUNCS(0, 2, ssse3); + MSPEL_FUNCS(0, 3, ssse3); MSPEL_FUNCS(1, 0, ssse3); MSPEL_FUNCS(2, 0, ssse3); MSPEL_FUNCS(3, 0, ssse3); diff --git a/libavcodec/x86/vc1dsp_mc.asm b/libavcodec/x86/vc1dsp_mc.asm index 1481a6e3e6..a3d1a9ba5c 100644 --- a/libavcodec/x86/vc1dsp_mc.asm +++ b/libavcodec/x86/vc1dsp_mc.asm @@ -290,25 +290,42 @@ HOR_8B avg, 8 HOR_8B put, 16 HOR_8B avg, 16 -%macro VER_8B 2 -%define MOVU movq -%define MOVA movq +%macro SETUP_COEFFS 3 ; width, coeff1, coeff2 +%if ARCH_X86_64 || (%1 == 8) + mova m1, [%2] + mova m2, [%3] +%define COEFF0 m1 +%define COEFF1 m2 +%define M8 m8 +%define M9 m9 +%else + lea r4, [%2] +%define COEFF0 [r4] +%define COEFF1 [r4+(%3-%2)] +%define M8 m1 +%define M9 m2 +%endif +%endmacro -cglobal vc1_%1_mspel_mc01_%2, 4, 4, 6, dst, src, stride, rnd - mova m1, [pb_m4_18] - mova m2, [pb_53_m3] +%macro VER_8B 2 +%if %2 == 8 +%define MOVU movq +%else +%define MOVU movu +%endif + +cglobal vc1_%1_mspel_mc01_%2, 4, 4+ARCH_X86_32*(%2>>4), 6, dst, src, stride, rnd + SETUP_COEFFS %2, pb_m4_18, pb_53_m3 add rndd, 31 jmp vc1_%1_mspel_mc03_%2_after_prologue -cglobal vc1_%1_mspel_mc02_%2, 4, 4, 6, dst, src, stride, rnd - mova m1, 
[pb_m4_36] - mova m2, [pb_36_m4] +cglobal vc1_%1_mspel_mc02_%2, 4, 4+ARCH_X86_32*(%2>>4), 6, dst, src, stride, rnd + SETUP_COEFFS %2, pb_m4_36, pb_36_m4 lea rndd, [4*rndd+28] jmp vc1_%1_mspel_mc03_%2_after_prologue -cglobal vc1_%1_mspel_mc03_%2, 4, 4, 6, dst, src, stride, rnd - mova m1, [pb_m3_53] - mova m2, [pb_18_m4] +cglobal vc1_%1_mspel_mc03_%2, 4, 4+ARCH_X86_32*(%2>>4), 6, dst, src, stride, rnd + SETUP_COEFFS %2, pb_m3_53, pb_18_m4 add rndd, 31 vc1_%1_mspel_mc03_%2_after_prologue: @@ -319,14 +336,20 @@ vc1_%1_mspel_mc03_%2_after_prologue: MOVU m4, [srcq] MOVU m5, [srcq+strideq] SPLATW m0, m0 - WIN64_SPILL_XMM 8 + WIN64_SPILL_XMM 8+3*(%2>>4) lea srcq, [srcq+2*strideq] %define hd rndd +%if %2 == 8 punpcklbw m3, m5 +%else + punpcklbw m7, m3, m5 + punpckhbw m3, m5 +%endif mov hd, %2 .loop: MOVU m6, [srcq] +%if %2 == 8 pmaddubsw m3, m1 punpcklbw m4, m6 pmaddubsw m7, m4, m2 @@ -346,7 +369,44 @@ vc1_%1_mspel_mc03_%2_after_prologue: pavgb m7, m4 mova m4, m5 %endif - MOVA [dstq], m7 + movq [dstq], m7 +%else + pmaddubsw m7, COEFF0 + pmaddubsw m3, COEFF0 + punpcklbw M8, m4, m6 + punpckhbw m4, m6 + pmaddubsw M9, M8, COEFF1 + paddw m7, m0 +%if ARCH_X86_64 + pmaddubsw m10, m4, m2 + paddw m3, m0 + paddw m9, m7 + mova m7, m8 + psraw m9, 6 + paddw m10, m3 +%else + paddw m3, m0 + paddw M9, m7 + mova m7, M8 + pmaddubsw M8, m4, COEFF1 + psraw M9, 6 + paddw M8, m3 +%endif + add srcq, strideq + mova m3, m4 +%if ARCH_X86_64 + psraw m10, 6 + packuswb m9, m10 +%else + psraw M8, 6 + packuswb M9, M8 +%endif +%ifidn %1, avg + pavgb M9, [dstq] +%endif + mova m4, m5 + mova [dstq], M9 +%endif add dstq, strideq mova m5, m6 dec hd @@ -356,3 +416,6 @@ vc1_%1_mspel_mc03_%2_after_prologue: VER_8B put, 8 VER_8B avg, 8 + +VER_8B put, 16 +VER_8B avg, 16 -- 2.52.0 >From 4717c4a89ada55f9ca0bfb2f470171bfdfae8a6a Mon Sep 17 00:00:00 2001 From: Andreas Rheinhardt <[email protected]> Date: Tue, 5 May 2026 12:31:28 +0200 Subject: [PATCH 11/11] avcodec/x86/vc1dsp_mmx: Remove purely vertical MMX mc 
functions They have been superseded by SSSE3. Notice that the functions removed occupied 3424B with GCC and 6176B with Clang here, whereas the SSSE3 functions replacing them occupy only 944B. Signed-off-by: Andreas Rheinhardt <[email protected]> --- libavcodec/x86/vc1dsp_mmx.c | 120 ------------------------------------ 1 file changed, 120 deletions(-) diff --git a/libavcodec/x86/vc1dsp_mmx.c b/libavcodec/x86/vc1dsp_mmx.c index f89becff64..cb53e67de9 100644 --- a/libavcodec/x86/vc1dsp_mmx.c +++ b/libavcodec/x86/vc1dsp_mmx.c @@ -74,64 +74,6 @@ void ff_vc1_avg_hor_16b_shift2_mmxext(uint8_t *dst, x86_reg stride, "punpcklwd %%mm7, %%mm7 \n\t" \ "punpckldq %%mm7, %%mm7 \n\t" -/** - * Purely vertical or horizontal 1/2 shift interpolation. - * Sacrifice mm6 for *9 factor. - */ -#define VC1_SHIFT2(OP, OPNAME)\ -static void OPNAME ## vc1_shift2_mmx(uint8_t *dst, const uint8_t *src,\ - x86_reg stride, int rnd, x86_reg offset)\ -{\ - rnd = 8-rnd;\ - __asm__ volatile(\ - "mov $8, %%"FF_REG_c" \n\t"\ - LOAD_ROUNDER_MMX("%5")\ - "movq "MANGLE(ff_pw_9)", %%mm6\n\t"\ - "1: \n\t"\ - "movd 0(%0 ), %%mm3 \n\t"\ - "movd 4(%0 ), %%mm4 \n\t"\ - "movd 0(%0,%2), %%mm1 \n\t"\ - "movd 4(%0,%2), %%mm2 \n\t"\ - "add %2, %0 \n\t"\ - "punpcklbw %%mm0, %%mm3 \n\t"\ - "punpcklbw %%mm0, %%mm4 \n\t"\ - "punpcklbw %%mm0, %%mm1 \n\t"\ - "punpcklbw %%mm0, %%mm2 \n\t"\ - "paddw %%mm1, %%mm3 \n\t"\ - "paddw %%mm2, %%mm4 \n\t"\ - "movd 0(%0,%3), %%mm1 \n\t"\ - "movd 4(%0,%3), %%mm2 \n\t"\ - "pmullw %%mm6, %%mm3 \n\t" /* 0,9,9,0*/\ - "pmullw %%mm6, %%mm4 \n\t" /* 0,9,9,0*/\ - "punpcklbw %%mm0, %%mm1 \n\t"\ - "punpcklbw %%mm0, %%mm2 \n\t"\ - "psubw %%mm1, %%mm3 \n\t" /*-1,9,9,0*/\ - "psubw %%mm2, %%mm4 \n\t" /*-1,9,9,0*/\ - "movd 0(%0,%2), %%mm1 \n\t"\ - "movd 4(%0,%2), %%mm2 \n\t"\ - "punpcklbw %%mm0, %%mm1 \n\t"\ - "punpcklbw %%mm0, %%mm2 \n\t"\ - "psubw %%mm1, %%mm3 \n\t" /*-1,9,9,-1*/\ - "psubw %%mm2, %%mm4 \n\t" /*-1,9,9,-1*/\ - NORMALIZE_MMX("$4")\ - "packuswb %%mm4, %%mm3 \n\t"\ - OP((%1), %%mm3)\ - 
"movq %%mm3, (%1) \n\t"\ - "add %6, %0 \n\t"\ - "add %4, %1 \n\t"\ - "dec %%"FF_REG_c" \n\t"\ - "jnz 1b \n\t"\ - : "+r"(src), "+r"(dst)\ - : "r"(offset), "r"(-2*offset), "g"(stride), "m"(rnd),\ - "g"(stride-offset)\ - NAMED_CONSTRAINTS_ADD(ff_pw_9)\ - : "%"FF_REG_c, "memory"\ - );\ -} - -VC1_SHIFT2(OP_PUT, put_) -VC1_SHIFT2(OP_AVG, avg_) - /** * Core of the 1/4 and 3/4 shift bicubic interpolation. * @@ -270,59 +212,18 @@ OPNAME ## vc1_hor_16b_ ## NAME ## _mmx(uint8_t *dst, x86_reg stride, \ ); \ } -/** - * Macro to build the 8 bits, any direction, version of vc1_put_shift[13]. - * Here, offset=src_stride. Parameters passed A1 to A4 must use - * %3 (offset) and %4 (3*offset). - * - * @param NAME Either 1 or 3 - * @see MSPEL_FILTER13_CORE for information on A1->A4 - */ -#define MSPEL_FILTER13_8B(NAME, A1, A2, A3, A4, OP, OPNAME) \ -static void \ -OPNAME ## vc1_## NAME ## _mmx(uint8_t *dst, const uint8_t *src, \ - x86_reg stride, int rnd, x86_reg offset) \ -{ \ - int h = 8; \ - src -= offset; \ - rnd = 32-rnd; \ - __asm__ volatile ( \ - LOAD_ROUNDER_MMX("%6") \ - "movq "MANGLE(ff_pw_53)", %%mm5 \n\t" \ - "movq "MANGLE(ff_pw_18)", %%mm6 \n\t" \ - ".p2align 3 \n\t" \ - "1: \n\t" \ - MSPEL_FILTER13_CORE(DO_UNPACK, "movd 1", A1, A2, A3, A4) \ - NORMALIZE_MMX("$6") \ - TRANSFER_DO_PACK(OP) \ - "add %5, %1 \n\t" \ - "add %5, %2 \n\t" \ - "decl %0 \n\t" \ - "jnz 1b \n\t" \ - : "+r"(h), "+r" (src), "+r" (dst) \ - : "r"(offset), "r"(3*offset), "g"(stride), "m"(rnd) \ - NAMED_CONSTRAINTS_ADD(ff_pw_53,ff_pw_18,ff_pw_3) \ - : "memory" \ - ); \ -} - /** 1/4 shift bicubic interpolation */ -MSPEL_FILTER13_8B (shift1, "0(%1,%4 )", "0(%1,%3,2)", "0(%1,%3 )", "0(%1 )", OP_PUT, put_) -MSPEL_FILTER13_8B (shift1, "0(%1,%4 )", "0(%1,%3,2)", "0(%1,%3 )", "0(%1 )", OP_AVG, avg_) MSPEL_FILTER13_VER_16B(shift1, "0(%1,%4 )", "0(%1,%3,2)", "0(%1,%3 )", "0(%1 )") MSPEL_FILTER13_HOR_16B(shift1, "2*3(%1)", "2*2(%1)", "2*1(%1)", "2*0(%1)", OP_PUT, put_) MSPEL_FILTER13_HOR_16B(shift1, "2*3(%1)", 
"2*2(%1)", "2*1(%1)", "2*0(%1)", OP_AVG, avg_) /** 3/4 shift bicubic interpolation */ -MSPEL_FILTER13_8B (shift3, "0(%1 )", "0(%1,%3 )", "0(%1,%3,2)", "0(%1,%4 )", OP_PUT, put_) -MSPEL_FILTER13_8B (shift3, "0(%1 )", "0(%1,%3 )", "0(%1,%3,2)", "0(%1,%4 )", OP_AVG, avg_) MSPEL_FILTER13_VER_16B(shift3, "0(%1 )", "0(%1,%3 )", "0(%1,%3,2)", "0(%1,%4 )") MSPEL_FILTER13_HOR_16B(shift3, "2*0(%1)", "2*1(%1)", "2*2(%1)", "2*3(%1)", OP_PUT, put_) MSPEL_FILTER13_HOR_16B(shift3, "2*0(%1)", "2*1(%1)", "2*2(%1)", "2*3(%1)", OP_AVG, avg_) typedef void (*vc1_mspel_mc_filter_ver_16bits)(int16_t *dst, const uint8_t *src, x86_reg src_stride, int rnd, int64_t shift); typedef void (*vc1_mspel_mc_filter_hor_16bits)(uint8_t *dst, x86_reg dst_stride, const int16_t *src, int rnd); -typedef void (*vc1_mspel_mc_filter_8bits)(uint8_t *dst, const uint8_t *src, x86_reg stride, int rnd, x86_reg offset); /** * Interpolate fractional pel values by applying proper vertical then @@ -343,15 +244,12 @@ static void OP ## vc1_mspel_mc(uint8_t *dst, const uint8_t *src, int stride,\ { NULL, vc1_put_ver_16b_shift1_mmx, ff_vc1_put_ver_16b_shift2_mmx, vc1_put_ver_16b_shift3_mmx };\ static const vc1_mspel_mc_filter_hor_16bits vc1_put_shift_hor_16bits[] =\ { NULL, OP ## vc1_hor_16b_shift1_mmx, ff_vc1_ ## OP ## hor_16b_shift2_ ## INSTR, OP ## vc1_hor_16b_shift3_mmx };\ - static const vc1_mspel_mc_filter_8bits vc1_put_shift_8bits[] =\ - { NULL, OP ## vc1_shift1_mmx, OP ## vc1_shift2_mmx, OP ## vc1_shift3_mmx };\ \ __asm__ volatile(\ "pxor %%mm0, %%mm0 \n\t"\ ::: "memory"\ );\ \ - if (hmode) { /* Horizontal filter to apply, output to tmp */\ static const int shift_value[] = { 0, 5, 1, 5 };\ int shift = (shift_value[hmode]+shift_value[vmode])>>1;\ int r;\ @@ -361,12 +259,6 @@ static void OP ## vc1_mspel_mc(uint8_t *dst, const uint8_t *src, int stride,\ vc1_put_shift_ver_16bits[vmode](tmp, src-1, stride, r, shift);\ \ vc1_put_shift_hor_16bits[hmode](dst, stride, tmp+1, 64-rnd);\ - return;\ - }\ - else { /* No 
horizontal filter, output 8 lines to dst */\ - vc1_put_shift_8bits[vmode](dst, src, stride, 1-rnd, stride);\ - return;\ - }\ } \ static void OP ## vc1_mspel_mc_16(uint8_t *dst, const uint8_t *src, \ int stride, int hmode, int vmode, int rnd)\ @@ -412,10 +304,6 @@ static void avg_vc1_mspel_mc ## a ## b ## _16_mmxext(uint8_t *dst, \ avg_vc1_mspel_mc_16(dst, src, stride, a, b, rnd); \ } -DECLARE_FUNCTION(0, 1) -DECLARE_FUNCTION(0, 2) -DECLARE_FUNCTION(0, 3) - DECLARE_FUNCTION(1, 1) DECLARE_FUNCTION(1, 2) DECLARE_FUNCTION(1, 3) @@ -434,10 +322,6 @@ DECLARE_FUNCTION(3, 3) av_cold void ff_vc1dsp_init_mmx(VC1DSPContext *dsp) { - FN_ASSIGN(put_, 0, 1, _mmx); - FN_ASSIGN(put_, 0, 2, _mmx); - FN_ASSIGN(put_, 0, 3, _mmx); - FN_ASSIGN(put_, 1, 1, _mmx); FN_ASSIGN(put_, 1, 2, _mmx); FN_ASSIGN(put_, 1, 3, _mmx); @@ -453,10 +337,6 @@ av_cold void ff_vc1dsp_init_mmx(VC1DSPContext *dsp) av_cold void ff_vc1dsp_init_mmxext(VC1DSPContext *dsp) { - FN_ASSIGN(avg_, 0, 1, _mmxext); - FN_ASSIGN(avg_, 0, 2, _mmxext); - FN_ASSIGN(avg_, 0, 3, _mmxext); - FN_ASSIGN(avg_, 1, 1, _mmxext); FN_ASSIGN(avg_, 1, 2, _mmxext); FN_ASSIGN(avg_, 1, 3, _mmxext); -- 2.52.0 _______________________________________________ ffmpeg-devel mailing list -- [email protected] To unsubscribe send an email to [email protected]
