PR #20932 opened by mkver
URL: https://code.ffmpeg.org/FFmpeg/FFmpeg/pulls/20932
Patch URL: https://code.ffmpeg.org/FFmpeg/FFmpeg/pulls/20932.patch


>From a7102ce7ed9e6c0a8c61a92eb8e66b4260057adb Mon Sep 17 00:00:00 2001
From: Andreas Rheinhardt <[email protected]>
Date: Sat, 15 Nov 2025 16:18:16 +0100
Subject: [PATCH 1/9] avcodec/x86/mpegvideoenc: Remove check for MMX

Signed-off-by: Andreas Rheinhardt <[email protected]>
---
 libavcodec/x86/mpegvideoenc.c | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/libavcodec/x86/mpegvideoenc.c b/libavcodec/x86/mpegvideoenc.c
index eac9947590..bb1d2cc319 100644
--- a/libavcodec/x86/mpegvideoenc.c
+++ b/libavcodec/x86/mpegvideoenc.c
@@ -123,16 +123,14 @@ av_cold void ff_dct_encode_init_x86(MPVEncContext *const s)
     const int dct_algo = s->c.avctx->dct_algo;
 
     if (dct_algo == FF_DCT_AUTO || dct_algo == FF_DCT_MMX) {
-#if HAVE_MMX_INLINE
-        int cpu_flags = av_get_cpu_flags();
 #if HAVE_SSE2_INLINE
+        int cpu_flags = av_get_cpu_flags();
         if (INLINE_SSE2(cpu_flags)) {
 #if HAVE_6REGS
             s->dct_quantize = dct_quantize_sse2;
 #endif
             s->denoise_dct  = denoise_dct_sse2;
         }
-#endif
 #if HAVE_6REGS && HAVE_SSSE3_INLINE
         if (INLINE_SSSE3(cpu_flags))
             s->dct_quantize = dct_quantize_ssse3;
-- 
2.49.1


>From feecc0585a8b83eb0d0897c8a842e82f080d6f26 Mon Sep 17 00:00:00 2001
From: Andreas Rheinhardt <[email protected]>
Date: Sat, 15 Nov 2025 16:46:18 +0100
Subject: [PATCH 2/9] avcodec/x86/mpegvideoenc: Reduce number of registers used

Avoids a push+pop on x64 Windows.

Signed-off-by: Andreas Rheinhardt <[email protected]>
---
 libavcodec/x86/mpegvideoenc.c | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

diff --git a/libavcodec/x86/mpegvideoenc.c b/libavcodec/x86/mpegvideoenc.c
index bb1d2cc319..2ca05f69ea 100644
--- a/libavcodec/x86/mpegvideoenc.c
+++ b/libavcodec/x86/mpegvideoenc.c
@@ -68,7 +68,7 @@ static void denoise_dct_sse2(MPVEncContext *const s, int16_t block[])
     s->dct_count[intra]++;
 
     __asm__ volatile(
-        "pxor %%xmm7, %%xmm7                    \n\t"
+        "pxor %%xmm6, %%xmm6                    \n\t"
         "1:                                     \n\t"
         "pxor %%xmm0, %%xmm0                    \n\t"
         "pxor %%xmm1, %%xmm1                    \n\t"
@@ -90,18 +90,18 @@ static void denoise_dct_sse2(MPVEncContext *const s, int16_t block[])
         "psubw %%xmm1, %%xmm3                   \n\t"
         "movdqa %%xmm2, (%0)                    \n\t"
         "movdqa %%xmm3, 16(%0)                  \n\t"
-        "movdqa %%xmm4, %%xmm6                  \n\t"
+        "movdqa %%xmm4, %%xmm2                  \n\t"
         "movdqa %%xmm5, %%xmm0                  \n\t"
-        "punpcklwd %%xmm7, %%xmm4               \n\t"
-        "punpckhwd %%xmm7, %%xmm6               \n\t"
-        "punpcklwd %%xmm7, %%xmm5               \n\t"
-        "punpckhwd %%xmm7, %%xmm0               \n\t"
+        "punpcklwd %%xmm6, %%xmm4               \n\t"
+        "punpckhwd %%xmm6, %%xmm2               \n\t"
+        "punpcklwd %%xmm6, %%xmm5               \n\t"
+        "punpckhwd %%xmm6, %%xmm0               \n\t"
         "paddd (%1), %%xmm4                     \n\t"
-        "paddd 16(%1), %%xmm6                   \n\t"
+        "paddd 16(%1), %%xmm2                   \n\t"
         "paddd 32(%1), %%xmm5                   \n\t"
         "paddd 48(%1), %%xmm0                   \n\t"
         "movdqa %%xmm4, (%1)                    \n\t"
-        "movdqa %%xmm6, 16(%1)                  \n\t"
+        "movdqa %%xmm2, 16(%1)                  \n\t"
         "movdqa %%xmm5, 32(%1)                  \n\t"
         "movdqa %%xmm0, 48(%1)                  \n\t"
         "add $32, %0                            \n\t"
@@ -112,7 +112,7 @@ static void denoise_dct_sse2(MPVEncContext *const s, int16_t block[])
         : "+r" (block), "+r" (sum), "+r" (offset)
         : "r"(block+64)
           XMM_CLOBBERS_ONLY("%xmm0", "%xmm1", "%xmm2", "%xmm3",
-                            "%xmm4", "%xmm5", "%xmm6", "%xmm7")
+                            "%xmm4", "%xmm5", "%xmm6")
     );
 }
 #endif /* HAVE_SSE2_INLINE */
-- 
2.49.1


>From 89a1bacded6e635f4773d2ae8b72cbd4f9a12338 Mon Sep 17 00:00:00 2001
From: Andreas Rheinhardt <[email protected]>
Date: Sat, 15 Nov 2025 17:32:29 +0100
Subject: [PATCH 3/9] avcodec/x86/mpegvideoenc: Port denoise_dct_sse2 to
 external assembly

Signed-off-by: Andreas Rheinhardt <[email protected]>
---
 libavcodec/x86/mpegvideoenc.c      | 59 ++++--------------------------
 libavcodec/x86/mpegvideoencdsp.asm | 46 +++++++++++++++++++++++
 2 files changed, 54 insertions(+), 51 deletions(-)

diff --git a/libavcodec/x86/mpegvideoenc.c b/libavcodec/x86/mpegvideoenc.c
index 2ca05f69ea..e5665ac781 100644
--- a/libavcodec/x86/mpegvideoenc.c
+++ b/libavcodec/x86/mpegvideoenc.c
@@ -57,8 +57,10 @@ DECLARE_ALIGNED(16, static const uint16_t, inv_zigzag_direct16)[64] = {
 
 #endif /* HAVE_6REGS */
 
-#if HAVE_INLINE_ASM
-#if HAVE_SSE2_INLINE
+#if HAVE_SSE2_EXTERNAL
+void ff_mpv_denoise_dct_sse2(int16_t block[64], int dct_error_sum[64],
+                             const uint16_t dct_offset[64]);
+
 static void denoise_dct_sse2(MPVEncContext *const s, int16_t block[])
 {
     const int intra = s->c.mb_intra;
@@ -67,56 +69,9 @@ static void denoise_dct_sse2(MPVEncContext *const s, int16_t block[])
 
     s->dct_count[intra]++;
 
-    __asm__ volatile(
-        "pxor %%xmm6, %%xmm6                    \n\t"
-        "1:                                     \n\t"
-        "pxor %%xmm0, %%xmm0                    \n\t"
-        "pxor %%xmm1, %%xmm1                    \n\t"
-        "movdqa (%0), %%xmm2                    \n\t"
-        "movdqa 16(%0), %%xmm3                  \n\t"
-        "pcmpgtw %%xmm2, %%xmm0                 \n\t"
-        "pcmpgtw %%xmm3, %%xmm1                 \n\t"
-        "pxor %%xmm0, %%xmm2                    \n\t"
-        "pxor %%xmm1, %%xmm3                    \n\t"
-        "psubw %%xmm0, %%xmm2                   \n\t"
-        "psubw %%xmm1, %%xmm3                   \n\t"
-        "movdqa %%xmm2, %%xmm4                  \n\t"
-        "movdqa %%xmm3, %%xmm5                  \n\t"
-        "psubusw (%2), %%xmm2                   \n\t"
-        "psubusw 16(%2), %%xmm3                 \n\t"
-        "pxor %%xmm0, %%xmm2                    \n\t"
-        "pxor %%xmm1, %%xmm3                    \n\t"
-        "psubw %%xmm0, %%xmm2                   \n\t"
-        "psubw %%xmm1, %%xmm3                   \n\t"
-        "movdqa %%xmm2, (%0)                    \n\t"
-        "movdqa %%xmm3, 16(%0)                  \n\t"
-        "movdqa %%xmm4, %%xmm2                  \n\t"
-        "movdqa %%xmm5, %%xmm0                  \n\t"
-        "punpcklwd %%xmm6, %%xmm4               \n\t"
-        "punpckhwd %%xmm6, %%xmm2               \n\t"
-        "punpcklwd %%xmm6, %%xmm5               \n\t"
-        "punpckhwd %%xmm6, %%xmm0               \n\t"
-        "paddd (%1), %%xmm4                     \n\t"
-        "paddd 16(%1), %%xmm2                   \n\t"
-        "paddd 32(%1), %%xmm5                   \n\t"
-        "paddd 48(%1), %%xmm0                   \n\t"
-        "movdqa %%xmm4, (%1)                    \n\t"
-        "movdqa %%xmm2, 16(%1)                  \n\t"
-        "movdqa %%xmm5, 32(%1)                  \n\t"
-        "movdqa %%xmm0, 48(%1)                  \n\t"
-        "add $32, %0                            \n\t"
-        "add $64, %1                            \n\t"
-        "add $32, %2                            \n\t"
-        "cmp %3, %0                             \n\t"
-            " jb 1b                             \n\t"
-        : "+r" (block), "+r" (sum), "+r" (offset)
-        : "r"(block+64)
-          XMM_CLOBBERS_ONLY("%xmm0", "%xmm1", "%xmm2", "%xmm3",
-                            "%xmm4", "%xmm5", "%xmm6")
-    );
+    ff_mpv_denoise_dct_sse2(block, sum, offset);
 }
-#endif /* HAVE_SSE2_INLINE */
-#endif /* HAVE_INLINE_ASM */
+#endif /* HAVE_SSE2_EXTERNAL */
 
 av_cold void ff_dct_encode_init_x86(MPVEncContext *const s)
 {
@@ -129,7 +84,9 @@ av_cold void ff_dct_encode_init_x86(MPVEncContext *const s)
 #if HAVE_6REGS
             s->dct_quantize = dct_quantize_sse2;
 #endif
+#if HAVE_SSE2_EXTERNAL
             s->denoise_dct  = denoise_dct_sse2;
+#endif
         }
 #if HAVE_6REGS && HAVE_SSSE3_INLINE
         if (INLINE_SSSE3(cpu_flags))
diff --git a/libavcodec/x86/mpegvideoencdsp.asm b/libavcodec/x86/mpegvideoencdsp.asm
index d12646ae54..0e86a5304c 100644
--- a/libavcodec/x86/mpegvideoencdsp.asm
+++ b/libavcodec/x86/mpegvideoencdsp.asm
@@ -24,6 +24,52 @@
 %include "libavutil/x86/x86util.asm"
 
 SECTION .text
+
+INIT_XMM sse2
+cglobal mpv_denoise_dct, 3, 4, 7, block, sum, offset
+    pxor            m6, m6
+    lea             r3, [sumq+256]
+.loop:
+    mova            m2, [blockq]
+    mova            m3, [blockq+16]
+    mova            m0, m6
+    mova            m1, m6
+    pcmpgtw         m0, m2
+    pcmpgtw         m1, m3
+    pxor            m2, m0
+    pxor            m3, m1
+    psubw           m2, m0
+    psubw           m3, m1
+    psubusw         m4, m2, [offsetq]
+    psubusw         m5, m3, [offsetq+16]
+    pxor            m4, m0
+    pxor            m5, m1
+    add        offsetq, 32
+    psubw           m4, m0
+    psubw           m5, m1
+    mova      [blockq], m4
+    mova   [blockq+16], m5
+    mova            m0, m2
+    mova            m1, m3
+    add         blockq, 32
+    punpcklwd       m0, m6
+    punpckhwd       m2, m6
+    punpcklwd       m1, m6
+    punpckhwd       m3, m6
+    paddd           m0, [sumq]
+    paddd           m2, [sumq+16]
+    paddd           m1, [sumq+32]
+    paddd           m3, [sumq+48]
+    mova        [sumq], m0
+    mova     [sumq+16], m2
+    mova     [sumq+32], m1
+    mova     [sumq+48], m3
+    add           sumq, 64
+    cmp           sumq, r3
+    jb           .loop
+    RET
+
+
 ; int ff_pix_sum16(const uint8_t *pix, ptrdiff_t line_size)
 ; %1 = number of loops
 ; %2 = number of GPRs used
-- 
2.49.1


>From bdc7fcbd439cca0cb1d85f51ca06fce91ac7c150 Mon Sep 17 00:00:00 2001
From: Andreas Rheinhardt <[email protected]>
Date: Sat, 15 Nov 2025 18:24:18 +0100
Subject: [PATCH 4/9] avcodec/mpegvideo_enc: Port denoise_dct to
 MpegvideoEncDSPContext

It is very simple to remove the MPVEncContext from it.
Notice that this also fixes a bug in x86/mpegvideoenc.c: It only
used the SSE2 version of denoise_dct when dct_algo was auto or mmx
(and it was therefore unused during FATE).

Signed-off-by: Andreas Rheinhardt <[email protected]>
---
 libavcodec/mips/Makefile                      |  3 +-
 libavcodec/mips/mpegvideo_mips.h              |  3 +-
 libavcodec/mips/mpegvideoenc_init_mips.c      | 33 ----------------
 libavcodec/mips/mpegvideoencdsp_init_mips.c   |  5 +++
 ...egvideoenc_mmi.c => mpegvideoencdsp_mmi.c} |  7 +---
 libavcodec/mpegvideo_enc.c                    | 38 +++++--------------
 libavcodec/mpegvideoenc.h                     |  2 -
 libavcodec/mpegvideoencdsp.c                  | 25 ++++++++++++
 libavcodec/mpegvideoencdsp.h                  |  3 ++
 libavcodec/x86/mpegvideoenc.c                 | 19 ----------
 libavcodec/x86/mpegvideoenc_template.c        |  7 +++-
 libavcodec/x86/mpegvideoencdsp_init.c         |  3 ++
 12 files changed, 53 insertions(+), 95 deletions(-)
 delete mode 100644 libavcodec/mips/mpegvideoenc_init_mips.c
 rename libavcodec/mips/{mpegvideoenc_mmi.c => mpegvideoencdsp_mmi.c} (95%)

diff --git a/libavcodec/mips/Makefile b/libavcodec/mips/Makefile
index 4bbc2f00ea..1d777293d0 100644
--- a/libavcodec/mips/Makefile
+++ b/libavcodec/mips/Makefile
@@ -54,7 +54,6 @@ OBJS-$(CONFIG_BLOCKDSP)                   += mips/blockdsp_init_mips.o
 OBJS-$(CONFIG_PIXBLOCKDSP)                += mips/pixblockdsp_init_mips.o
 OBJS-$(CONFIG_IDCTDSP)                    += mips/idctdsp_init_mips.o
 OBJS-$(CONFIG_MPEGVIDEO)                  += mips/mpegvideo_init_mips.o
-OBJS-$(CONFIG_MPEGVIDEOENC)               += mips/mpegvideoenc_init_mips.o
 OBJS-$(CONFIG_MPEGVIDEOENCDSP)            += mips/mpegvideoencdsp_init_mips.o
 OBJS-$(CONFIG_ME_CMP)                     += mips/me_cmp_init_mips.o
 OBJS-$(CONFIG_MPEG4_DECODER)              += mips/xvididct_init_mips.o
@@ -100,7 +99,7 @@ MMI-OBJS-$(CONFIG_H264DSP)                += mips/h264dsp_mmi.o
 MMI-OBJS-$(CONFIG_H264CHROMA)             += mips/h264chroma_mmi.o
 MMI-OBJS-$(CONFIG_H264PRED)               += mips/h264pred_mmi.o
 MMI-OBJS-$(CONFIG_MPEGVIDEO)              += mips/mpegvideo_mmi.o
-MMI-OBJS-$(CONFIG_MPEGVIDEOENC)           += mips/mpegvideoenc_mmi.o
+MMI-OBJS-$(CONFIG_MPEGVIDEOENCDSP)        += mips/mpegvideoencdsp_mmi.o
 MMI-OBJS-$(CONFIG_IDCTDSP)                += mips/idctdsp_mmi.o           \
                                              mips/simple_idct_mmi.o
 MMI-OBJS-$(CONFIG_MPEG4_DECODER)          += mips/xvid_idct_mmi.o
diff --git a/libavcodec/mips/mpegvideo_mips.h b/libavcodec/mips/mpegvideo_mips.h
index 72ffed6985..2a9ea4006e 100644
--- a/libavcodec/mips/mpegvideo_mips.h
+++ b/libavcodec/mips/mpegvideo_mips.h
@@ -22,7 +22,6 @@
 #define AVCODEC_MIPS_MPEGVIDEO_MIPS_H
 
 #include "libavcodec/mpegvideo.h"
-#include "libavcodec/mpegvideoenc.h"
 
 void ff_dct_unquantize_h263_intra_mmi(MpegEncContext *s, int16_t *block,
         int n, int qscale);
@@ -34,6 +33,6 @@ void ff_dct_unquantize_mpeg1_inter_mmi(MpegEncContext *s, int16_t *block,
         int n, int qscale);
 void ff_dct_unquantize_mpeg2_intra_mmi(MpegEncContext *s, int16_t *block,
         int n, int qscale);
-void ff_denoise_dct_mmi(MPVEncContext *s, int16_t *block);
+void ff_denoise_dct_mmi(int16_t block[64], int sum[64], const uint16_t offset[64]);
 
 #endif /* AVCODEC_MIPS_MPEGVIDEO_MIPS_H */
diff --git a/libavcodec/mips/mpegvideoenc_init_mips.c b/libavcodec/mips/mpegvideoenc_init_mips.c
deleted file mode 100644
index 7831973eb8..0000000000
--- a/libavcodec/mips/mpegvideoenc_init_mips.c
+++ /dev/null
@@ -1,33 +0,0 @@
-/*
- * Copyright (c) 2015 Manojkumar Bhosale ([email protected])
- *
- * This file is part of FFmpeg.
- *
- * FFmpeg is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * FFmpeg is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with FFmpeg; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-#include "libavutil/attributes.h"
-#include "libavutil/mips/cpu.h"
-#include "libavcodec/mpegvideoenc.h"
-#include "mpegvideo_mips.h"
-
-av_cold void ff_mpvenc_dct_init_mips(MPVEncContext *s)
-{
-    int cpu_flags = av_get_cpu_flags();
-
-    if (have_mmi(cpu_flags)) {
-        s->denoise_dct = ff_denoise_dct_mmi;
-    }
-}
diff --git a/libavcodec/mips/mpegvideoencdsp_init_mips.c b/libavcodec/mips/mpegvideoencdsp_init_mips.c
index 24a17b91db..df916282a2 100644
--- a/libavcodec/mips/mpegvideoencdsp_init_mips.c
+++ b/libavcodec/mips/mpegvideoencdsp_init_mips.c
@@ -23,12 +23,17 @@
 #include "libavcodec/bit_depth_template.c"
 #include "libavcodec/mpegvideoencdsp.h"
 #include "h263dsp_mips.h"
+#include "mpegvideo_mips.h"
 
 av_cold void ff_mpegvideoencdsp_init_mips(MpegvideoEncDSPContext *c,
                                           AVCodecContext *avctx)
 {
     int cpu_flags = av_get_cpu_flags();
 
+    if (have_mmi(cpu_flags)) {
+        c->denoise_dct = ff_denoise_dct_mmi;
+    }
+
     if (have_msa(cpu_flags)) {
 #if BIT_DEPTH == 8
         c->pix_sum = ff_pix_sum_msa;
diff --git a/libavcodec/mips/mpegvideoenc_mmi.c b/libavcodec/mips/mpegvideoencdsp_mmi.c
similarity index 95%
rename from libavcodec/mips/mpegvideoenc_mmi.c
rename to libavcodec/mips/mpegvideoencdsp_mmi.c
index 085be3b0ec..2239a05978 100644
--- a/libavcodec/mips/mpegvideoenc_mmi.c
+++ b/libavcodec/mips/mpegvideoencdsp_mmi.c
@@ -25,17 +25,12 @@
 #include "mpegvideo_mips.h"
 #include "libavutil/mips/mmiutils.h"
 
-void ff_denoise_dct_mmi(MPVEncContext *s, int16_t *block)
+void ff_denoise_dct_mmi(int16_t block[64], int sum[64], const uint16_t offset[64])
 {
-    const int intra = s->c.mb_intra;
-    int *sum = s->dct_error_sum[intra];
-    uint16_t *offset = s->dct_offset[intra];
     double ftmp[8];
     mips_reg addr[1];
     DECLARE_VAR_ALL64;
 
-    s->dct_count[intra]++;
-
     __asm__ volatile(
         "pxor       %[ftmp0],   %[ftmp0],       %[ftmp0]                \n\t"
         "1:                                                             \n\t"
diff --git a/libavcodec/mpegvideo_enc.c b/libavcodec/mpegvideo_enc.c
index ce0ee4bb68..9e83026b51 100644
--- a/libavcodec/mpegvideo_enc.c
+++ b/libavcodec/mpegvideo_enc.c
@@ -86,7 +86,6 @@
 static int encode_picture(MPVMainEncContext *const s, const AVPacket *pkt);
 static int dct_quantize_refine(MPVEncContext *const s, int16_t *block, int16_t *weight, int16_t *orig, int n, int qscale);
 static int sse_mb(MPVEncContext *const s);
-static void denoise_dct_c(MPVEncContext *const s, int16_t *block);
 static int dct_quantize_c(MPVEncContext *const s,
                           int16_t *block, int n,
                           int qscale, int *overflow);
@@ -300,11 +299,8 @@ static av_cold void mpv_encode_defaults(MPVMainEncContext *const m)
 av_cold void ff_dct_encode_init(MPVEncContext *const s)
 {
     s->dct_quantize = dct_quantize_c;
-    s->denoise_dct  = denoise_dct_c;
 
-#if ARCH_MIPS
-    ff_mpvenc_dct_init_mips(s);
-#elif ARCH_X86
+#if ARCH_X86
     ff_dct_encode_init_x86(s);
 #endif
 
@@ -3955,29 +3951,14 @@ static int encode_picture(MPVMainEncContext *const m, const AVPacket *pkt)
     return 0;
 }
 
-static void denoise_dct_c(MPVEncContext *const s, int16_t *block)
+static inline void denoise_dct(MPVEncContext *const s, int16_t block[])
 {
+    if (!s->dct_error_sum)
+        return;
+
     const int intra = s->c.mb_intra;
-    int i;
-
     s->dct_count[intra]++;
-
-    for(i=0; i<64; i++){
-        int level= block[i];
-
-        if(level){
-            if(level>0){
-                s->dct_error_sum[intra][i] += level;
-                level -= s->dct_offset[intra][i];
-                if(level<0) level=0;
-            }else{
-                s->dct_error_sum[intra][i] -= level;
-                level += s->dct_offset[intra][i];
-                if(level>0) level=0;
-            }
-            block[i]= level;
-        }
-    }
+    s->mpvencdsp.denoise_dct(block, s->dct_error_sum[intra], s->dct_offset[intra]);
 }
 
 static int dct_quantize_trellis_c(MPVEncContext *const s,
@@ -4009,8 +3990,8 @@ static int dct_quantize_trellis_c(MPVEncContext *const s,
 
     s->fdsp.fdct(block);
 
-    if(s->dct_error_sum)
-        s->denoise_dct(s, block);
+    denoise_dct(s, block);
+
     qmul= qscale*16;
     qadd= ((qscale-1)|1)*8;
 
@@ -4678,8 +4659,7 @@ static int dct_quantize_c(MPVEncContext *const s,
 
     s->fdsp.fdct(block);
 
-    if(s->dct_error_sum)
-        s->denoise_dct(s, block);
+    denoise_dct(s, block);
 
     if (s->c.mb_intra) {
         scantable = s->c.intra_scantable.scantable;
diff --git a/libavcodec/mpegvideoenc.h b/libavcodec/mpegvideoenc.h
index ee115c3611..131908c10a 100644
--- a/libavcodec/mpegvideoenc.h
+++ b/libavcodec/mpegvideoenc.h
@@ -123,7 +123,6 @@ typedef struct MPVEncContext {
     uint16_t (*q_inter_matrix16)[2][64];
 
     /* noise reduction */
-    void (*denoise_dct)(struct MPVEncContext *s, int16_t *block);
     int (*dct_error_sum)[64];
     int dct_count[2];
     uint16_t (*dct_offset)[64];
@@ -397,7 +396,6 @@ int ff_mpv_reallocate_putbitbuffer(MPVEncContext *s, size_t threshold, size_t si
 void ff_write_quant_matrix(PutBitContext *pb, uint16_t *matrix);
 
 void ff_dct_encode_init(MPVEncContext *s);
-void ff_mpvenc_dct_init_mips(MPVEncContext *s);
 void ff_dct_encode_init_x86(MPVEncContext *s);
 
 void ff_convert_matrix(MPVEncContext *s, int (*qmat)[64], uint16_t (*qmat16)[2][64],
diff --git a/libavcodec/mpegvideoencdsp.c b/libavcodec/mpegvideoencdsp.c
index b4fd2af915..3b4a57d58a 100644
--- a/libavcodec/mpegvideoencdsp.c
+++ b/libavcodec/mpegvideoencdsp.c
@@ -28,6 +28,29 @@
 #include "mathops.h"
 #include "mpegvideoencdsp.h"
 
+static void denoise_dct_c(int16_t block[64], int dct_error_sum[64],
+                          const uint16_t dct_offset[64])
+{
+    for (int i = 0; i < 64; ++i) {
+        int level = block[i];
+
+        if (level) {
+            if (level > 0) {
+                dct_error_sum[i] += level;
+                level -= dct_offset[i];
+                if (level < 0)
+                    level = 0;
+            } else {
+                dct_error_sum[i] -= level;
+                level += dct_offset[i];
+                if (level > 0)
+                    level = 0;
+            }
+            block[i] = level;
+        }
+    }
+}
+
 static int try_8x8basis_c(const int16_t rem[64], const int16_t weight[64],
                           const int16_t basis[64], int scale)
 {
@@ -253,6 +276,8 @@ static void shrink88(uint8_t *dst, ptrdiff_t dst_wrap,
 av_cold void ff_mpegvideoencdsp_init(MpegvideoEncDSPContext *c,
                                      AVCodecContext *avctx)
 {
+    c->denoise_dct  = denoise_dct_c;
+
     c->try_8x8basis = try_8x8basis_c;
     c->add_8x8basis = add_8x8basis_c;
 
diff --git a/libavcodec/mpegvideoencdsp.h b/libavcodec/mpegvideoencdsp.h
index 6ec665677b..989503f25f 100644
--- a/libavcodec/mpegvideoencdsp.h
+++ b/libavcodec/mpegvideoencdsp.h
@@ -30,6 +30,9 @@
 #define EDGE_BOTTOM 2
 
 typedef struct MpegvideoEncDSPContext {
+    void (*denoise_dct)(int16_t block[64], int dct_error_sum[64],
+                        const uint16_t dct_offset[64]);
+
     int (*try_8x8basis)(const int16_t rem[64], const int16_t weight[64],
                         const int16_t basis[64], int scale);
     void (*add_8x8basis)(int16_t rem[64], const int16_t basis[64], int scale);
diff --git a/libavcodec/x86/mpegvideoenc.c b/libavcodec/x86/mpegvideoenc.c
index e5665ac781..c667dcd2a2 100644
--- a/libavcodec/x86/mpegvideoenc.c
+++ b/libavcodec/x86/mpegvideoenc.c
@@ -57,22 +57,6 @@ DECLARE_ALIGNED(16, static const uint16_t, inv_zigzag_direct16)[64] = {
 
 #endif /* HAVE_6REGS */
 
-#if HAVE_SSE2_EXTERNAL
-void ff_mpv_denoise_dct_sse2(int16_t block[64], int dct_error_sum[64],
-                             const uint16_t dct_offset[64]);
-
-static void denoise_dct_sse2(MPVEncContext *const s, int16_t block[])
-{
-    const int intra = s->c.mb_intra;
-    int *sum= s->dct_error_sum[intra];
-    uint16_t *offset= s->dct_offset[intra];
-
-    s->dct_count[intra]++;
-
-    ff_mpv_denoise_dct_sse2(block, sum, offset);
-}
-#endif /* HAVE_SSE2_EXTERNAL */
-
 av_cold void ff_dct_encode_init_x86(MPVEncContext *const s)
 {
     const int dct_algo = s->c.avctx->dct_algo;
@@ -83,9 +67,6 @@ av_cold void ff_dct_encode_init_x86(MPVEncContext *const s)
         if (INLINE_SSE2(cpu_flags)) {
 #if HAVE_6REGS
             s->dct_quantize = dct_quantize_sse2;
-#endif
-#if HAVE_SSE2_EXTERNAL
-            s->denoise_dct  = denoise_dct_sse2;
 #endif
         }
 #if HAVE_6REGS && HAVE_SSSE3_INLINE
diff --git a/libavcodec/x86/mpegvideoenc_template.c b/libavcodec/x86/mpegvideoenc_template.c
index f0b95c1621..14e993de2b 100644
--- a/libavcodec/x86/mpegvideoenc_template.c
+++ b/libavcodec/x86/mpegvideoenc_template.c
@@ -76,8 +76,11 @@ static int RENAME(dct_quantize)(MPVEncContext *const s,
     //s->fdct (block);
     ff_fdct_sse2(block); // cannot be anything else ...
 
-    if(s->dct_error_sum)
-        s->denoise_dct(s, block);
+    if (s->dct_error_sum) {
+        const int intra = s->c.mb_intra;
+        s->dct_count[intra]++;
+        s->mpvencdsp.denoise_dct(block, s->dct_error_sum[intra], s->dct_offset[intra]);
+    }
 
     if (s->c.mb_intra) {
         int dummy;
diff --git a/libavcodec/x86/mpegvideoencdsp_init.c b/libavcodec/x86/mpegvideoencdsp_init.c
index bf5b722016..f6169b5399 100644
--- a/libavcodec/x86/mpegvideoencdsp_init.c
+++ b/libavcodec/x86/mpegvideoencdsp_init.c
@@ -27,6 +27,8 @@
 #include "libavcodec/avcodec.h"
 #include "libavcodec/mpegvideoencdsp.h"
 
+void ff_mpv_denoise_dct_sse2(int16_t block[64], int dct_error_sum[64],
+                             const uint16_t dct_offset[64]);
 int ff_pix_sum16_sse2(const uint8_t *pix, ptrdiff_t line_size);
 int ff_pix_sum16_xop(const uint8_t *pix, ptrdiff_t line_size);
 int ff_pix_norm1_sse2(const uint8_t *pix, ptrdiff_t line_size);
@@ -209,6 +211,7 @@ av_cold void ff_mpegvideoencdsp_init_x86(MpegvideoEncDSPContext *c,
     int cpu_flags = av_get_cpu_flags();
 
     if (EXTERNAL_SSE2(cpu_flags)) {
+        c->denoise_dct = ff_mpv_denoise_dct_sse2;
         c->pix_sum     = ff_pix_sum16_sse2;
         c->pix_norm1   = ff_pix_norm1_sse2;
     }
-- 
2.49.1


>From 06076cb368f1cf6baaed5f6de8ed2894236c5910 Mon Sep 17 00:00:00 2001
From: Andreas Rheinhardt <[email protected]>
Date: Sat, 15 Nov 2025 19:06:14 +0100
Subject: [PATCH 5/9] tests/checkasm/mpegvideoencdsp: Test denoise_dct

Signed-off-by: Andreas Rheinhardt <[email protected]>
---
 tests/checkasm/mpegvideoencdsp.c | 33 ++++++++++++++++++++++++++++++++
 1 file changed, 33 insertions(+)

diff --git a/tests/checkasm/mpegvideoencdsp.c b/tests/checkasm/mpegvideoencdsp.c
index a4a4fa6f5c..955cd9f5b7 100644
--- a/tests/checkasm/mpegvideoencdsp.c
+++ b/tests/checkasm/mpegvideoencdsp.c
@@ -37,6 +37,37 @@
             buf[j] = rnd() % (max - min + 1) + min;      \
     } while (0)
 
+static void check_denoise_dct(MpegvideoEncDSPContext *c)
+{
+    declare_func(void, int16_t block[64], int dct_error_sum[64],
+                       const uint16_t dct_offset[64]);
+
+    if (check_func(c->denoise_dct, "denoise_dct")) {
+        DECLARE_ALIGNED(16, int16_t, block_ref)[64];
+        DECLARE_ALIGNED(16, int16_t, block_new)[64];
+        DECLARE_ALIGNED(16, int, dct_error_sum_ref)[64];
+        DECLARE_ALIGNED(16, int, dct_error_sum_new)[64];
+        DECLARE_ALIGNED(16, uint16_t, dct_offset)[64];
+
+        for (size_t i = 0; i < FF_ARRAY_ELEMS(block_ref); ++i) {
+            unsigned random = rnd();
+            block_ref[i] = random & (1 << 16) ? random : 0;
+        }
+        randomize_buffers(dct_offset, sizeof(dct_offset));
+        randomize_buffer_clipped(dct_error_sum_ref, 0, (1 << 24) - 1);
+        memcpy(block_new, block_ref, sizeof(block_new));
+        memcpy(dct_error_sum_new, dct_error_sum_ref, sizeof(dct_error_sum_ref));
+
+        call_ref(block_ref, dct_error_sum_ref, dct_offset);
+        call_new(block_new, dct_error_sum_new, dct_offset);
+        if (memcmp(block_ref, block_new, sizeof(block_ref)) ||
+            memcmp(dct_error_sum_new, dct_error_sum_ref, sizeof(dct_error_sum_new)))
+            fail();
+
+        bench_new(block_new, dct_error_sum_new, dct_offset);
+    }
+}
+
 static void check_add_8x8basis(MpegvideoEncDSPContext *c)
 {
     declare_func(void, int16_t rem[64], const int16_t basis[64], int scale);
@@ -166,6 +197,8 @@ void checkasm_check_mpegvideoencdsp(void)
 
     ff_mpegvideoencdsp_init(&c, &avctx);
 
+    check_denoise_dct(&c);
+    report("denoise_dct");
     check_pix_sum(&c);
     report("pix_sum");
     check_pix_norm1(&c);
-- 
2.49.1


>From 1b9714e7bc08908a0f03c4a0a5757485489cec4c Mon Sep 17 00:00:00 2001
From: Andreas Rheinhardt <[email protected]>
Date: Sat, 15 Nov 2025 19:44:02 +0100
Subject: [PATCH 6/9] avcodec/x86/mpegvideoencdsp: Port add_8x8basis_ssse3() to
 ASM

Both GCC and Clang completely unroll the unlikely loop at -O3,
leading to codesize bloat; their code is also suboptimal, as they
don't make use of pmulhrsw (even with -mssse3). This commit
therefore ports the whole function to external assembly. The new
function occupies 176B here vs 1406B for GCC.

Benchmarks for a testcase with huge qscale (notice that the C version
is unrolled just like the unlikely loop in the SSSE3 version):
add_8x8basis_c:                                         43.4 ( 1.00x)
add_8x8basis_ssse3 (old):                               43.6 ( 1.00x)
add_8x8basis_ssse3 (new):                               12.6 ( 3.46x)

Signed-off-by: Andreas Rheinhardt <[email protected]>
---
 libavcodec/x86/mpegvideoencdsp.asm    | 46 +++++++++++++++++++++++++++
 libavcodec/x86/mpegvideoencdsp_init.c | 46 ++++-----------------------
 2 files changed, 53 insertions(+), 39 deletions(-)

diff --git a/libavcodec/x86/mpegvideoencdsp.asm b/libavcodec/x86/mpegvideoencdsp.asm
index 0e86a5304c..a85de32449 100644
--- a/libavcodec/x86/mpegvideoencdsp.asm
+++ b/libavcodec/x86/mpegvideoencdsp.asm
@@ -25,6 +25,52 @@
 
 SECTION .text
 
+; void ff_add_8x8basis_ssse3(int16_t rem[64], const int16_t basis[64], int scale)
+INIT_XMM ssse3
+cglobal add_8x8basis, 3, 3, 4, rem, basis, scale
+    movd            m0, scaled
+    add         scaled, 1024
+    add         basisq, 128
+    add           remq, 128
+    cmp         scaled, 2047
+    mov            r2q, -128
+    ja     .huge_scale
+
+    punpcklwd       m0, m0
+    pshufd          m0, m0, 0x0
+    psllw           m0, 5
+.loop1:
+    mova            m1, [basisq+r2q]
+    mova            m2, [basisq+r2q+16]
+    pmulhrsw        m1, m0
+    pmulhrsw        m2, m0
+    paddw           m1, [remq+r2q]
+    paddw           m2, [remq+r2q+16]
+    mova    [remq+r2q], m1
+    mova [remq+r2q+16], m2
+    add            r2q, 32
+    js          .loop1
+    RET
+
+.huge_scale:
+    pslld           m0, 6
+    pshuflw         m1, m0, 0x55
+    psrlw           m0, 1
+    punpcklwd       m0, m0
+    punpcklwd       m1, m1
+    pshufd          m0, m0, 0x0
+.loop2:
+    mova            m2, [basisq+r2q]
+    pmulhrsw        m3, m2, m0
+    pmullw          m2, m1
+    paddw           m2, m3
+    paddw           m2, [remq+r2q]
+    mova    [remq+r2q], m2
+    add            r2q, 16
+    js          .loop2
+    RET
+
+
 INIT_XMM sse2
 cglobal mpv_denoise_dct, 3, 4, 7, block, sum, offset
     pxor            m6, m6
diff --git a/libavcodec/x86/mpegvideoencdsp_init.c b/libavcodec/x86/mpegvideoencdsp_init.c
index f6169b5399..220c75785a 100644
--- a/libavcodec/x86/mpegvideoencdsp_init.c
+++ b/libavcodec/x86/mpegvideoencdsp_init.c
@@ -32,6 +32,7 @@ void ff_mpv_denoise_dct_sse2(int16_t block[64], int dct_error_sum[64],
 int ff_pix_sum16_sse2(const uint8_t *pix, ptrdiff_t line_size);
 int ff_pix_sum16_xop(const uint8_t *pix, ptrdiff_t line_size);
 int ff_pix_norm1_sse2(const uint8_t *pix, ptrdiff_t line_size);
+void ff_add_8x8basis_ssse3(int16_t rem[64], const int16_t basis[64], int scale);
 
 #if HAVE_INLINE_ASM
 #if HAVE_SSSE3_INLINE
@@ -83,41 +84,6 @@ static int try_8x8basis_ssse3(const int16_t rem[64], const int16_t weight[64], c
     );
     return i;
 }
-
-static void add_8x8basis_ssse3(int16_t rem[64], const int16_t basis[64], int scale)
-{
-    x86_reg i=0;
-
-    if (FFABS(scale) < 1024) {
-        scale *= 1 << (16 + SCALE_OFFSET - BASIS_SHIFT + RECON_SHIFT);
-        __asm__ volatile(
-                "movd                %3, %%xmm2     \n\t"
-                "punpcklwd       %%xmm2, %%xmm2     \n\t"
-                "pshufd      $0, %%xmm2, %%xmm2     \n\t"
-                ".p2align 4                         \n\t"
-                "1:                                 \n\t"
-                "movdqa        (%1, %0), %%xmm0     \n\t"
-                "movdqa      16(%1, %0), %%xmm1     \n\t"
-                "pmulhrsw        %%xmm2, %%xmm0     \n\t"
-                "pmulhrsw        %%xmm2, %%xmm1     \n\t"
-                "paddw         (%2, %0), %%xmm0     \n\t"
-                "paddw       16(%2, %0), %%xmm1     \n\t"
-                "movdqa          %%xmm0, (%2, %0)   \n\t"
-                "movdqa          %%xmm1, 16(%2, %0) \n\t"
-                "add                $32, %0         \n\t"
-                "cmp               $128, %0         \n\t" // FIXME optimize & bench
-                " jb                 1b             \n\t"
-                : "+r" (i)
-                : "r"(basis), "r"(rem), "g"(scale)
-                XMM_CLOBBERS_ONLY("%xmm0", "%xmm1", "%xmm2")
-        );
-    } else {
-        for (i=0; i<8*8; i++) {
-            rem[i] += (basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT);
-        }
-    }
-}
-
 #endif /* HAVE_SSSE3_INLINE */
 
 /* Draw the edges of width 'w' of an image of size width, height */
@@ -227,15 +193,17 @@ av_cold void ff_mpegvideoencdsp_init_x86(MpegvideoEncDSPContext *c,
             c->draw_edges = draw_edges_mmx;
         }
     }
+#endif /* HAVE_INLINE_ASM */
 
+    if (X86_SSSE3(cpu_flags)) {
 #if HAVE_SSSE3_INLINE
-    if (INLINE_SSSE3(cpu_flags)) {
         if (!(avctx->flags & AV_CODEC_FLAG_BITEXACT)) {
             c->try_8x8basis = try_8x8basis_ssse3;
         }
-        c->add_8x8basis = add_8x8basis_ssse3;
-    }
 #endif /* HAVE_SSSE3_INLINE */
+#if HAVE_SSSE3_EXTERNAL
+        c->add_8x8basis = ff_add_8x8basis_ssse3;
+#endif
+    }
 
-#endif /* HAVE_INLINE_ASM */
 }
-- 
2.49.1


>From 0dfe66422eedfd67028b89c46ff7db2f8fef80eb Mon Sep 17 00:00:00 2001
From: Andreas Rheinhardt <[email protected]>
Date: Sat, 15 Nov 2025 19:56:23 +0100
Subject: [PATCH 7/9] avcodec/x86/mpegvideoenc_template: Avoid touching
 nonvolatile register

xmm7 is nonvolatile on x64 Windows.

Signed-off-by: Andreas Rheinhardt <[email protected]>
---
 libavcodec/x86/mpegvideoenc_template.c | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/libavcodec/x86/mpegvideoenc_template.c b/libavcodec/x86/mpegvideoenc_template.c
index 14e993de2b..b5417f6d32 100644
--- a/libavcodec/x86/mpegvideoenc_template.c
+++ b/libavcodec/x86/mpegvideoenc_template.c
@@ -117,7 +117,7 @@ static int RENAME(dct_quantize)(MPVEncContext *const s,
         __asm__ volatile(
             "movd %%"FF_REG_a", %%xmm3          \n\t" // last_non_zero_p1
             SPREADW("%%xmm3")
-            "pxor  %%xmm7, %%xmm7               \n\t" // 0
+            "pxor  %%xmm2, %%xmm2               \n\t" // 0
             "pxor  %%xmm4, %%xmm4               \n\t" // 0
             "movdqa  (%2), %%xmm5               \n\t" // qmat[0]
             "pxor  %%xmm6, %%xmm6               \n\t"
@@ -132,9 +132,9 @@ static int RENAME(dct_quantize)(MPVEncContext *const s,
             "por     %%xmm0, %%xmm4             \n\t"
             RESTORE_SIGN("%%xmm1", "%%xmm0")          // 
out=((ABS(block[i])*qmat[0] - bias[0]*qmat[0])>>16)*sign(block[i])
             "movdqa  %%xmm0, (%5, %%"FF_REG_a") \n\t"
-            "pcmpeqw %%xmm7, %%xmm0             \n\t" // out==0 ? 0xFF : 0x00
+            "pcmpeqw %%xmm2, %%xmm0             \n\t" // out==0 ? 0xFF : 0x00
             "movdqa  (%4, %%"FF_REG_a"), %%xmm1 \n\t"
-            "movdqa  %%xmm7, (%1, %%"FF_REG_a") \n\t" // 0
+            "movdqa  %%xmm2, (%1, %%"FF_REG_a") \n\t" // 0
             "pandn   %%xmm1, %%xmm0             \n\t"
             "pmaxsw  %%xmm0, %%xmm3             \n\t"
             "add        $16, %%"FF_REG_a"       \n\t"
@@ -146,13 +146,13 @@ static int RENAME(dct_quantize)(MPVEncContext *const s,
             : "r" (block+64), "r" (qmat), "r" (bias),
               "r" (inv_zigzag_direct16 + 64), "r" (temp_block + 64)
               XMM_CLOBBERS_ONLY("%xmm0", "%xmm1", "%xmm2", "%xmm3",
-                                "%xmm4", "%xmm5", "%xmm6", "%xmm7")
+                                "%xmm4", "%xmm5", "%xmm6")
         );
     }else{ // FMT_H263
         __asm__ volatile(
             "movd %%"FF_REG_a", %%xmm3          \n\t" // last_non_zero_p1
             SPREADW("%%xmm3")
-            "pxor %%xmm7, %%xmm7                \n\t" // 0
+            "pxor %%xmm2, %%xmm2                \n\t" // 0
             "pxor %%xmm4, %%xmm4                \n\t" // 0
             "mov $-128, %%"FF_REG_a"            \n\t"
             ".p2align 4                         \n\t"
@@ -166,9 +166,9 @@ static int RENAME(dct_quantize)(MPVEncContext *const s,
             "por     %%xmm0, %%xmm4             \n\t"
             RESTORE_SIGN("%%xmm1", "%%xmm0")          // 
out=((ABS(block[i])*qmat[0] - bias[0]*qmat[0])>>16)*sign(block[i])
             "movdqa  %%xmm0, (%5, %%"FF_REG_a") \n\t"
-            "pcmpeqw %%xmm7, %%xmm0             \n\t" // out==0 ? 0xFF : 0x00
+            "pcmpeqw %%xmm2, %%xmm0             \n\t" // out==0 ? 0xFF : 0x00
             "movdqa  (%4, %%"FF_REG_a"), %%xmm1 \n\t"
-            "movdqa  %%xmm7, (%1, %%"FF_REG_a") \n\t" // 0
+            "movdqa  %%xmm2, (%1, %%"FF_REG_a") \n\t" // 0
             "pandn   %%xmm1, %%xmm0             \n\t"
             "pmaxsw  %%xmm0, %%xmm3             \n\t"
             "add        $16, %%"FF_REG_a"       \n\t"
@@ -180,7 +180,7 @@ static int RENAME(dct_quantize)(MPVEncContext *const s,
             : "r" (block+64), "r" (qmat+64), "r" (bias+64),
               "r" (inv_zigzag_direct16 + 64), "r" (temp_block + 64)
               XMM_CLOBBERS_ONLY("%xmm0", "%xmm1", "%xmm2", "%xmm3",
-                                "%xmm4", "%xmm5", "%xmm6", "%xmm7")
+                                "%xmm4", "%xmm5", "%xmm6")
         );
     }
     __asm__ volatile(
-- 
2.49.1


>From 1c1109ba320528f01d610da9b25aae8591458526 Mon Sep 17 00:00:00 2001
From: Andreas Rheinhardt <[email protected]>
Date: Sun, 16 Nov 2025 11:10:07 +0100
Subject: [PATCH 8/9] avcodec/x86/mpegvideoenc_template: Reduce number of
 registers used

bias is always located at a constant offset from qmat, so one register
suffices to address both of them. This allows removing the check for
HAVE_6REGS (untested on a system where HAVE_6REGS is false).
Also avoid FF_REG_a while at it.

Signed-off-by: Andreas Rheinhardt <[email protected]>
---
 libavcodec/x86/mpegvideoenc.c          |  8 +-------
 libavcodec/x86/mpegvideoenc_template.c | 21 +++++++++------------
 2 files changed, 10 insertions(+), 19 deletions(-)

diff --git a/libavcodec/x86/mpegvideoenc.c b/libavcodec/x86/mpegvideoenc.c
index c667dcd2a2..24dd049200 100644
--- a/libavcodec/x86/mpegvideoenc.c
+++ b/libavcodec/x86/mpegvideoenc.c
@@ -39,8 +39,6 @@ DECLARE_ALIGNED(16, static const uint16_t, inv_zigzag_direct16)[64] = {
     36, 37, 49, 50, 58, 59, 63, 64,
 };
 
-#if HAVE_6REGS
-
 #if HAVE_SSE2_INLINE
 #define COMPILE_TEMPLATE_SSSE3  0
 #define RENAME(a)      a ## _sse2
@@ -55,8 +53,6 @@ DECLARE_ALIGNED(16, static const uint16_t, inv_zigzag_direct16)[64] = {
 #include "mpegvideoenc_template.c"
 #endif /* HAVE_SSSE3_INLINE */
 
-#endif /* HAVE_6REGS */
-
 av_cold void ff_dct_encode_init_x86(MPVEncContext *const s)
 {
     const int dct_algo = s->c.avctx->dct_algo;
@@ -65,11 +61,9 @@ av_cold void ff_dct_encode_init_x86(MPVEncContext *const s)
 #if HAVE_SSE2_INLINE
         int cpu_flags = av_get_cpu_flags();
         if (INLINE_SSE2(cpu_flags)) {
-#if HAVE_6REGS
             s->dct_quantize = dct_quantize_sse2;
-#endif
         }
-#if HAVE_6REGS && HAVE_SSSE3_INLINE
+#if HAVE_SSSE3_INLINE
         if (INLINE_SSSE3(cpu_flags))
             s->dct_quantize = dct_quantize_ssse3;
 #endif
diff --git a/libavcodec/x86/mpegvideoenc_template.c b/libavcodec/x86/mpegvideoenc_template.c
index b5417f6d32..e6ce791347 100644
--- a/libavcodec/x86/mpegvideoenc_template.c
+++ b/libavcodec/x86/mpegvideoenc_template.c
@@ -70,7 +70,7 @@ static int RENAME(dct_quantize)(MPVEncContext *const s,
 {
     x86_reg last_non_zero_p1;
     int level=0, q; //=0 is because gcc says uninitialized ...
-    const uint16_t *qmat, *bias;
+    const uint16_t *qmat;
     LOCAL_ALIGNED_16(int16_t, temp_block, [64]);
 
     //s->fdct (block);
@@ -86,11 +86,9 @@ static int RENAME(dct_quantize)(MPVEncContext *const s,
         int dummy;
         if (n < 4){
             q = s->c.y_dc_scale;
-            bias = s->q_intra_matrix16[qscale][1];
             qmat = s->q_intra_matrix16[qscale][0];
         }else{
             q = s->c.c_dc_scale;
-            bias = s->q_chroma_intra_matrix16[qscale][1];
             qmat = s->q_chroma_intra_matrix16[qscale][0];
         }
         /* note: block[0] is assumed to be positive */
@@ -109,7 +107,6 @@ static int RENAME(dct_quantize)(MPVEncContext *const s,
         last_non_zero_p1 = 1;
     } else {
         last_non_zero_p1 = 0;
-        bias = s->q_inter_matrix16[qscale][1];
         qmat = s->q_inter_matrix16[qscale][0];
     }
 
@@ -121,7 +118,7 @@ static int RENAME(dct_quantize)(MPVEncContext *const s,
             "pxor  %%xmm4, %%xmm4               \n\t" // 0
             "movdqa  (%2), %%xmm5               \n\t" // qmat[0]
             "pxor  %%xmm6, %%xmm6               \n\t"
-            "psubw   (%3), %%xmm6               \n\t" // -bias[0]
+            "psubw 128(%2), %%xmm6              \n\t" // -bias[0]
             "mov $-128, %%"FF_REG_a"            \n\t"
             ".p2align 4                         \n\t"
             "1:                                 \n\t"
@@ -131,9 +128,9 @@ static int RENAME(dct_quantize)(MPVEncContext *const s,
             "pmulhw  %%xmm5, %%xmm0             \n\t" // 
(ABS(block[i])*qmat[0] - bias[0]*qmat[0])>>16
             "por     %%xmm0, %%xmm4             \n\t"
             RESTORE_SIGN("%%xmm1", "%%xmm0")          // 
out=((ABS(block[i])*qmat[0] - bias[0]*qmat[0])>>16)*sign(block[i])
-            "movdqa  %%xmm0, (%5, %%"FF_REG_a") \n\t"
+            "movdqa  %%xmm0, (%4, %0)           \n\t"
             "pcmpeqw %%xmm2, %%xmm0             \n\t" // out==0 ? 0xFF : 0x00
-            "movdqa  (%4, %%"FF_REG_a"), %%xmm1 \n\t"
+            "movdqa  (%3, %0), %%xmm1           \n\t"
             "movdqa  %%xmm2, (%1, %%"FF_REG_a") \n\t" // 0
             "pandn   %%xmm1, %%xmm0             \n\t"
             "pmaxsw  %%xmm0, %%xmm3             \n\t"
@@ -143,7 +140,7 @@ static int RENAME(dct_quantize)(MPVEncContext *const s,
             "movd %%xmm3, %%"FF_REG_a"          \n\t"
             "movzbl %%al, %%eax                 \n\t" // last_non_zero_p1
             : "+a" (last_non_zero_p1)
-            : "r" (block+64), "r" (qmat), "r" (bias),
+            : "r" (block+64), "r" (qmat),
               "r" (inv_zigzag_direct16 + 64), "r" (temp_block + 64)
               XMM_CLOBBERS_ONLY("%xmm0", "%xmm1", "%xmm2", "%xmm3",
                                 "%xmm4", "%xmm5", "%xmm6")
@@ -159,15 +156,15 @@ static int RENAME(dct_quantize)(MPVEncContext *const s,
             "1:                                 \n\t"
             "movdqa  (%1, %%"FF_REG_a"), %%xmm0 \n\t" // block[i]
             SAVE_SIGN("%%xmm1", "%%xmm0")             // ABS(block[i])
-            "movdqa  (%3, %%"FF_REG_a"), %%xmm6 \n\t" // bias[0]
+            "movdqa  128(%2, %0), %%xmm6        \n\t" // bias[i]
             "paddusw %%xmm6, %%xmm0             \n\t" // ABS(block[i]) + 
bias[0]
             "movdqa  (%2, %%"FF_REG_a"), %%xmm5 \n\t" // qmat[i]
             "pmulhw  %%xmm5, %%xmm0             \n\t" // 
(ABS(block[i])*qmat[0] + bias[0]*qmat[0])>>16
             "por     %%xmm0, %%xmm4             \n\t"
             RESTORE_SIGN("%%xmm1", "%%xmm0")          // 
out=((ABS(block[i])*qmat[0] - bias[0]*qmat[0])>>16)*sign(block[i])
-            "movdqa  %%xmm0, (%5, %%"FF_REG_a") \n\t"
+            "movdqa  %%xmm0, (%4, %0)           \n\t"
             "pcmpeqw %%xmm2, %%xmm0             \n\t" // out==0 ? 0xFF : 0x00
-            "movdqa  (%4, %%"FF_REG_a"), %%xmm1 \n\t"
+            "movdqa  (%3, %0), %%xmm1           \n\t"
             "movdqa  %%xmm2, (%1, %%"FF_REG_a") \n\t" // 0
             "pandn   %%xmm1, %%xmm0             \n\t"
             "pmaxsw  %%xmm0, %%xmm3             \n\t"
@@ -177,7 +174,7 @@ static int RENAME(dct_quantize)(MPVEncContext *const s,
             "movd %%xmm3, %%"FF_REG_a"          \n\t"
             "movzbl %%al, %%eax                 \n\t" // last_non_zero_p1
             : "+a" (last_non_zero_p1)
-            : "r" (block+64), "r" (qmat+64), "r" (bias+64),
+            : "r" (block+64), "r" (qmat+64),
               "r" (inv_zigzag_direct16 + 64), "r" (temp_block + 64)
               XMM_CLOBBERS_ONLY("%xmm0", "%xmm1", "%xmm2", "%xmm3",
                                 "%xmm4", "%xmm5", "%xmm6")
-- 
2.49.1


>From 8ae2428ebedca7f191846e5fde2442069d15e8b1 Mon Sep 17 00:00:00 2001
From: Andreas Rheinhardt <[email protected]>
Date: Sun, 16 Nov 2025 12:10:22 +0100
Subject: [PATCH 9/9] avutil/x86/asm: Remove wrong comment, rename FF_REG_sp

Before FFmpeg commit 531b0a316b24f00965cd8a88efdbea2c6d63147f,
FFmpeg used REG_SP as macro for the stack pointer, yet this
clashed with a REG_SP define in Solaris system headers, so it
was changed to REG_sp and a comment was added for this.

Libav fixed it by adding an FF_ prefix to the macros in
1e9c5bf4c136fe9e010cc8a7e7270bba0d1bf45e. FFmpeg switched
to using these prefixes in 9eb3da2f9942cf1b1148d242bccfc383f666feb6,
using FF_REG_sp instead of Libav's FF_REG_SP. In said commit
the comment was changed to claim that Solaris system headers
define FF_REG_SP, but this is (most likely) wrong.

This commit removes the wrong comment and renames the (actually unused)
macro to FF_REG_SP to make it consistent with FF_REG_BP.

Signed-off-by: Andreas Rheinhardt <[email protected]>
---
 libavutil/x86/asm.h | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/libavutil/x86/asm.h b/libavutil/x86/asm.h
index 9bff42d628..f06ea25035 100644
--- a/libavutil/x86/asm.h
+++ b/libavutil/x86/asm.h
@@ -38,8 +38,7 @@ typedef struct ymm_reg { uint64_t a, b, c, d; } ymm_reg;
 #    define FF_PTR_SIZE "8"
 typedef int64_t x86_reg;
 
-/* FF_REG_SP is defined in Solaris sys headers, so use FF_REG_sp */
-#    define FF_REG_sp "rsp"
+#    define FF_REG_SP "rsp"
 #    define FF_REG_BP "rbp"
 #    define FF_REGBP   rbp
 #    define FF_REGa    rax
@@ -60,7 +59,7 @@ typedef int64_t x86_reg;
 #    define FF_PTR_SIZE "4"
 typedef int32_t x86_reg;
 
-#    define FF_REG_sp "esp"
+#    define FF_REG_SP "esp"
 #    define FF_REG_BP "ebp"
 #    define FF_REGBP   ebp
 #    define FF_REGa    eax
-- 
2.49.1

_______________________________________________
ffmpeg-devel mailing list -- [email protected]
To unsubscribe send an email to [email protected]

Reply via email to