PR #23067 opened by mkver
URL: https://code.ffmpeg.org/FFmpeg/FFmpeg/pulls/23067
Patch URL: https://code.ffmpeg.org/FFmpeg/FFmpeg/pulls/23067.patch


>From dcf5d041ad50f9b23b234bd9aa4da6ff862fb296 Mon Sep 17 00:00:00 2001
From: Andreas Rheinhardt <[email protected]>
Date: Sun, 10 May 2026 19:53:57 +0200
Subject: [PATCH 1/6] avfilter/vf_pp7: Constify

Signed-off-by: Andreas Rheinhardt <[email protected]>
---
 libavfilter/vf_pp7.c          | 18 +++++++++---------
 libavfilter/vf_pp7.h          |  5 ++---
 libavfilter/x86/vf_pp7.asm    |  2 +-
 libavfilter/x86/vf_pp7_init.c |  2 +-
 4 files changed, 13 insertions(+), 14 deletions(-)

diff --git a/libavfilter/vf_pp7.c b/libavfilter/vf_pp7.c
index 7b653b977f..ea27e10060 100644
--- a/libavfilter/vf_pp7.c
+++ b/libavfilter/vf_pp7.c
@@ -96,7 +96,7 @@ static void init_thres2(PP7Context *p)
     }
 }
 
-static inline void dctA_c(int16_t *dst, uint8_t *src, int stride)
+static inline void dctA_c(int16_t *dst, const uint8_t *src, int stride)
 {
     int i;
 
@@ -119,7 +119,7 @@ static inline void dctA_c(int16_t *dst, uint8_t *src, int stride)
     }
 }
 
-static void dctB_c(int16_t *dst, int16_t *src)
+static void dctB_c(int16_t *dst, const int16_t *src)
 {
     int i;
 
@@ -142,7 +142,7 @@ static void dctB_c(int16_t *dst, int16_t *src)
     }
 }
 
-static int hardthresh_c(PP7Context *p, int16_t *src, int qp)
+static int hardthresh_c(const PP7Context *p, const int16_t *src, int qp)
 {
     int i;
     int a;
@@ -158,7 +158,7 @@ static int hardthresh_c(PP7Context *p, int16_t *src, int qp)
     return (a + (1 << 11)) >> 12;
 }
 
-static int mediumthresh_c(PP7Context *p, int16_t *src, int qp)
+static int mediumthresh_c(const PP7Context *p, const int16_t *src, int qp)
 {
     int i;
     int a;
@@ -182,7 +182,7 @@ static int mediumthresh_c(PP7Context *p, int16_t *src, int qp)
     return (a + (1 << 11)) >> 12;
 }
 
-static int softthresh_c(PP7Context *p, int16_t *src, int qp)
+static int softthresh_c(const PP7Context *p, const int16_t *src, int qp)
 {
     int i;
     int a;
@@ -202,10 +202,10 @@ static int softthresh_c(PP7Context *p, int16_t *src, int qp)
     return (a + (1 << 11)) >> 12;
 }
 
-static void filter(PP7Context *p, uint8_t *dst, uint8_t *src,
+static void filter(PP7Context *p, uint8_t *dst, const uint8_t *src,
                    int dst_stride, int src_stride,
                    int width, int height,
-                   uint8_t *qp_store, int qp_stride, int is_luma)
+                   const uint8_t *qp_store, int qp_stride, int is_luma)
 {
     int x, y;
     const int stride = is_luma ? p->temp_stride : ((width + 16 + 15) & (~15));
@@ -231,7 +231,7 @@ static void filter(PP7Context *p, uint8_t *dst, uint8_t *src,
     for (y = 0; y < height; y++) {
         for (x = -8; x < 0; x += 4) {
             const int index = x + y * stride + (8 - 3) * (1 + stride) + 8; //FIXME silly offset
-            uint8_t *src  = p_src + index;
+            const uint8_t *src = p_src + index;
             int16_t *tp   = temp + 4 * x;
 
             dctA_c(tp + 4 * 8, src, stride);
@@ -249,7 +249,7 @@ static void filter(PP7Context *p, uint8_t *dst, uint8_t *src,
             }
             for (; x < end; x++) {
                 const int index = x + y * stride + (8 - 3) * (1 + stride) + 8; //FIXME silly offset
-                uint8_t *src = p_src + index;
+                const uint8_t *src = p_src + index;
                 int16_t *tp  = temp + 4 * x;
                 int v;
 
diff --git a/libavfilter/vf_pp7.h b/libavfilter/vf_pp7.h
index b7cbb020bb..c733079291 100644
--- a/libavfilter/vf_pp7.h
+++ b/libavfilter/vf_pp7.h
@@ -37,9 +37,8 @@ typedef struct PP7Context {
     int temp_stride;
     uint8_t *src;
 
-    int (*requantize)(struct PP7Context *p, int16_t *src, int qp);
-    void (*dctB)(int16_t *dst, int16_t *src);
-
+    int (*requantize)(const struct PP7Context *p, const int16_t *src, int qp);
+    void (*dctB)(int16_t *dst, const int16_t *src);
 } PP7Context;
 
 void ff_pp7_init_x86(PP7Context *pp7);
diff --git a/libavfilter/x86/vf_pp7.asm b/libavfilter/x86/vf_pp7.asm
index 7b3e5cf5e3..9dfabdcc8d 100644
--- a/libavfilter/x86/vf_pp7.asm
+++ b/libavfilter/x86/vf_pp7.asm
@@ -26,7 +26,7 @@ SECTION .text
 
 INIT_MMX mmx
 
-;void ff_pp7_dctB_mmx(int16_t *dst, int16_t *src)
+;void ff_pp7_dctB_mmx(int16_t *dst, const int16_t *src)
 cglobal pp7_dctB, 2, 2, 0, dst, src
     movq   m0, [srcq]
     movq   m1, [srcq+mmsize*1]
diff --git a/libavfilter/x86/vf_pp7_init.c b/libavfilter/x86/vf_pp7_init.c
index 165b0dd5d0..a87882359d 100644
--- a/libavfilter/x86/vf_pp7_init.c
+++ b/libavfilter/x86/vf_pp7_init.c
@@ -23,7 +23,7 @@
 #include "libavutil/x86/cpu.h"
 #include "libavfilter/vf_pp7.h"
 
-void ff_pp7_dctB_mmx(int16_t *dst, int16_t *src);
+void ff_pp7_dctB_mmx(int16_t *dst, const int16_t *src);
 
 av_cold void ff_pp7_init_x86(PP7Context *p)
 {
-- 
2.52.0


>From d92a1bf0bfc9051a1c301c145b67f3fd5df858d7 Mon Sep 17 00:00:00 2001
From: Andreas Rheinhardt <[email protected]>
Date: Sun, 10 May 2026 20:18:59 +0200
Subject: [PATCH 2/6] avfilter/vf_pp7: Add proper PP7DSPContext

This is in preparation for checkasm tests for dctB.

Signed-off-by: Andreas Rheinhardt <[email protected]>
---
 libavfilter/vf_pp7.c                  | 52 +++++++++++-------------
 libavfilter/{vf_pp7.h => vf_pp7dsp.h} | 57 ++++++++++++++++++---------
 libavfilter/x86/vf_pp7_init.c         |  4 +-
 3 files changed, 62 insertions(+), 51 deletions(-)
 rename libavfilter/{vf_pp7.h => vf_pp7dsp.h} (50%)

diff --git a/libavfilter/vf_pp7.c b/libavfilter/vf_pp7.c
index ea27e10060..d8a5501b47 100644
--- a/libavfilter/vf_pp7.c
+++ b/libavfilter/vf_pp7.c
@@ -33,10 +33,12 @@
 #include "libavutil/mem_internal.h"
 #include "libavutil/opt.h"
 #include "libavutil/pixdesc.h"
+#include "libavutil/video_enc_params.h"
 
+#include "avfilter.h"
 #include "filters.h"
 #include "qp_table.h"
-#include "vf_pp7.h"
+#include "vf_pp7dsp.h"
 #include "video.h"
 
 enum mode {
@@ -45,6 +47,23 @@ enum mode {
     MODE_MEDIUM
 };
 
+typedef struct PP7Context {
+    const AVClass *class;
+    int thres2[99][16];
+
+    int qp;
+    int mode;
+    enum AVVideoEncParamsType qscale_type;
+    int hsub;
+    int vsub;
+    int temp_stride;
+    uint8_t *src;
+
+    int (*requantize)(const struct PP7Context *p, const int16_t *src, int qp);
+
+    PP7DSPContext pp7dsp;
+} PP7Context;
+
 #define OFFSET(x) offsetof(PP7Context, x)
 #define FLAGS AV_OPT_FLAG_FILTERING_PARAM|AV_OPT_FLAG_VIDEO_PARAM
 static const AVOption pp7_options[] = {
@@ -119,29 +138,6 @@ static inline void dctA_c(int16_t *dst, const uint8_t *src, int stride)
     }
 }
 
-static void dctB_c(int16_t *dst, const int16_t *src)
-{
-    int i;
-
-    for (i = 0; i < 4; i++) {
-        int s0 = src[0 * 4] + src[6 * 4];
-        int s1 = src[1 * 4] + src[5 * 4];
-        int s2 = src[2 * 4] + src[4 * 4];
-        int s3 = src[3 * 4];
-        int s = s3 + s3;
-        s3 = s  - s0;
-        s0 = s  + s0;
-        s  = s2 + s1;
-        s2 = s2 - s1;
-        dst[0 * 4] = s0 + s;
-        dst[2 * 4] = s0 - s;
-        dst[1 * 4] = 2 * s3 +     s2;
-        dst[3 * 4] =     s3 - 2 * s2;
-        src++;
-        dst++;
-    }
-}
-
 static int hardthresh_c(const PP7Context *p, const int16_t *src, int qp)
 {
     int i;
@@ -256,7 +252,7 @@ static void filter(PP7Context *p, uint8_t *dst, const uint8_t *src,
                 if ((x & 3) == 0)
                     dctA_c(tp + 4 * 8, src, stride);
 
-                p->dctB(block, tp);
+                p->pp7dsp.dctB(block, tp);
 
                 v = p->requantize(p, block, qp);
                 v = (v + dither[y & 7][x & 7]) >> 6;
@@ -303,11 +299,7 @@ static int config_input(AVFilterLink *inlink)
         case 2: pp7->requantize = mediumthresh_c; break;
     }
 
-    pp7->dctB = dctB_c;
-
-#if ARCH_X86 && HAVE_X86ASM
-    ff_pp7_init_x86(pp7);
-#endif
+    ff_pp7dsp_init(&pp7->pp7dsp);
 
     return 0;
 }
diff --git a/libavfilter/vf_pp7.h b/libavfilter/vf_pp7dsp.h
similarity index 50%
rename from libavfilter/vf_pp7.h
rename to libavfilter/vf_pp7dsp.h
index c733079291..e75917bdb4 100644
--- a/libavfilter/vf_pp7.h
+++ b/libavfilter/vf_pp7dsp.h
@@ -19,28 +19,47 @@
  * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
  */
 
-#ifndef AVFILTER_PP7_H
-#define AVFILTER_PP7_H
+#ifndef AVFILTER_PP7DSP_H
+#define AVFILTER_PP7DSP_H
 
-#include "libavutil/video_enc_params.h"
-#include "avfilter.h"
+#include <stdint.h>
 
-typedef struct PP7Context {
-    AVClass *class;
-    int thres2[99][16];
+#include "config.h"
 
-    int qp;
-    int mode;
-    enum AVVideoEncParamsType qscale_type;
-    int hsub;
-    int vsub;
-    int temp_stride;
-    uint8_t *src;
-
-    int (*requantize)(const struct PP7Context *p, const int16_t *src, int qp);
+typedef struct PP7DSPContext {
     void (*dctB)(int16_t *dst, const int16_t *src);
-} PP7Context;
+} PP7DSPContext;
 
-void ff_pp7_init_x86(PP7Context *pp7);
+void ff_pp7dsp_init_x86(PP7DSPContext *pp7dsp);
 
-#endif /* AVFILTER_PP7_H */
+static void dctB_c(int16_t *dst, const int16_t *src)
+{
+    for (int i = 0; i < 4; i++) {
+        int s0 = src[0 * 4] + src[6 * 4];
+        int s1 = src[1 * 4] + src[5 * 4];
+        int s2 = src[2 * 4] + src[4 * 4];
+        int s3 = src[3 * 4];
+        int s = s3 + s3;
+        s3 = s  - s0;
+        s0 = s  + s0;
+        s  = s2 + s1;
+        s2 = s2 - s1;
+        dst[0 * 4] = s0 + s;
+        dst[2 * 4] = s0 - s;
+        dst[1 * 4] = 2 * s3 +     s2;
+        dst[3 * 4] =     s3 - 2 * s2;
+        src++;
+        dst++;
+    }
+}
+
+static inline void ff_pp7dsp_init(PP7DSPContext *pp7dsp)
+{
+    pp7dsp->dctB = dctB_c;
+
+#if ARCH_X86 && HAVE_X86ASM
+    ff_pp7dsp_init_x86(pp7dsp);
+#endif
+}
+
+#endif /* AVFILTER_PP7DSP_H */
diff --git a/libavfilter/x86/vf_pp7_init.c b/libavfilter/x86/vf_pp7_init.c
index a87882359d..53ac907f27 100644
--- a/libavfilter/x86/vf_pp7_init.c
+++ b/libavfilter/x86/vf_pp7_init.c
@@ -21,11 +21,11 @@
 #include "libavutil/attributes.h"
 #include "libavutil/cpu.h"
 #include "libavutil/x86/cpu.h"
-#include "libavfilter/vf_pp7.h"
+#include "libavfilter/vf_pp7dsp.h"
 
 void ff_pp7_dctB_mmx(int16_t *dst, const int16_t *src);
 
-av_cold void ff_pp7_init_x86(PP7Context *p)
+av_cold void ff_pp7dsp_init_x86(PP7DSPContext *p)
 {
     int cpu_flags = av_get_cpu_flags();
 
-- 
2.52.0


>From c7a827ed04442dea20fa99966efd542b388ee68a Mon Sep 17 00:00:00 2001
From: Andreas Rheinhardt <[email protected]>
Date: Sun, 10 May 2026 20:43:49 +0200
Subject: [PATCH 3/6] tests/checkasm: Add vf_pp7 checkasm test

Signed-off-by: Andreas Rheinhardt <[email protected]>
---
 tests/checkasm/Makefile   |  1 +
 tests/checkasm/checkasm.c |  3 ++
 tests/checkasm/checkasm.h |  1 +
 tests/checkasm/vf_pp7.c   | 66 +++++++++++++++++++++++++++++++++++++++
 tests/fate/checkasm.mak   |  1 +
 5 files changed, 72 insertions(+)
 create mode 100644 tests/checkasm/vf_pp7.c

diff --git a/tests/checkasm/Makefile b/tests/checkasm/Makefile
index 55d2527047..53d8f3ec66 100644
--- a/tests/checkasm/Makefile
+++ b/tests/checkasm/Makefile
@@ -76,6 +76,7 @@ AVFILTEROBJS-$(CONFIG_FSPP_FILTER)       += vf_fspp.o
 AVFILTEROBJS-$(CONFIG_GBLUR_FILTER)      += vf_gblur.o
 AVFILTEROBJS-$(CONFIG_HFLIP_FILTER)      += vf_hflip.o
 AVFILTEROBJS-$(CONFIG_IDET_FILTER)       += vf_idet.o
+AVFILTEROBJS-$(CONFIG_PP7_FILTER)        += vf_pp7.o
 AVFILTEROBJS-$(CONFIG_THRESHOLD_FILTER)  += vf_threshold.o
 AVFILTEROBJS-$(CONFIG_NLMEANS_FILTER)    += vf_nlmeans.o
 AVFILTEROBJS-$(CONFIG_SOBEL_FILTER)      += vf_convolution.o
diff --git a/tests/checkasm/checkasm.c b/tests/checkasm/checkasm.c
index e863ff6eed..93298c46a7 100644
--- a/tests/checkasm/checkasm.c
+++ b/tests/checkasm/checkasm.c
@@ -342,6 +342,9 @@ static const struct {
     #if CONFIG_NLMEANS_FILTER
         { "vf_nlmeans", checkasm_check_nlmeans },
     #endif
+    #if CONFIG_PP7_FILTER
+        { "vf_pp7", checkasm_check_vf_pp7 },
+    #endif
     #if CONFIG_THRESHOLD_FILTER
         { "vf_threshold", checkasm_check_vf_threshold },
     #endif
diff --git a/tests/checkasm/checkasm.h b/tests/checkasm/checkasm.h
index 72a1404163..552de20169 100644
--- a/tests/checkasm/checkasm.h
+++ b/tests/checkasm/checkasm.h
@@ -162,6 +162,7 @@ void checkasm_check_vf_eq(void);
 void checkasm_check_vf_fspp(void);
 void checkasm_check_vf_gblur(void);
 void checkasm_check_vf_hflip(void);
+void checkasm_check_vf_pp7(void);
 void checkasm_check_vf_threshold(void);
 void checkasm_check_vf_sobel(void);
 void checkasm_check_vp3dsp(void);
diff --git a/tests/checkasm/vf_pp7.c b/tests/checkasm/vf_pp7.c
new file mode 100644
index 0000000000..07664f7472
--- /dev/null
+++ b/tests/checkasm/vf_pp7.c
@@ -0,0 +1,66 @@
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with FFmpeg; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ */
+
+#include <assert.h>
+#include <stddef.h>
+#include <stdint.h>
+
+#include "checkasm.h"
+#include "libavfilter/vf_pp7dsp.h"
+#include "libavutil/intreadwrite.h"
+#include "libavutil/mem_internal.h"
+
+#define randomize_buffer(buf)                                      \
+    do {                                                           \
+        static_assert(!(sizeof(buf) % 4), "Tail handling needed"); \
+        for (size_t k = 0; k < sizeof(buf); k += 4) {              \
+            AV_WN32A((char*)buf + k, rnd());                       \
+        }                                                          \
+    } while (0)
+
+static void check_dctB(const PP7DSPContext *const pp7dsp)
+{
+    declare_func_emms(AV_CPU_FLAG_MMX, void, int16_t *dst, const int16_t *src);
+
+    if (!check_func(pp7dsp->dctB, "dctB"))
+        return;
+
+    DECLARE_ALIGNED(8, int16_t, src)[7 * 4];
+    DECLARE_ALIGNED(8, int16_t, dst_ref)[6 * 4];
+    DECLARE_ALIGNED(8, int16_t, dst_new)[6 * 4];
+
+    randomize_buffer(src);
+    randomize_buffer(dst_ref);
+    memcpy(dst_new, dst_ref, sizeof(dst_new));
+    call_ref(dst_ref, src);
+    call_new(dst_new, src);
+    if (memcmp(dst_new, dst_ref, sizeof(dst_new)))
+        fail();
+
+    bench_new(dst_new, src);
+}
+
+void checkasm_check_vf_pp7(void)
+{
+    PP7DSPContext pp7dsp;
+
+    ff_pp7dsp_init(&pp7dsp);
+
+    check_dctB(&pp7dsp);
+    report("dctB");
+}
diff --git a/tests/fate/checkasm.mak b/tests/fate/checkasm.mak
index b7392fa745..5fc1ec1e5f 100644
--- a/tests/fate/checkasm.mak
+++ b/tests/fate/checkasm.mak
@@ -83,6 +83,7 @@ FATE_CHECKASM = fate-checkasm-aacencdsp                                 \
                 fate-checkasm-vf_hflip                                  \
                 fate-checkasm-vf_idet                                   \
                 fate-checkasm-vf_nlmeans                                \
+                fate-checkasm-vf_pp7                                    \
                 fate-checkasm-vf_threshold                              \
                 fate-checkasm-vf_sobel                                  \
                 fate-checkasm-videodsp                                  \
-- 
2.52.0


>From 238738b91fa137c576f4048e5e65d907872763dc Mon Sep 17 00:00:00 2001
From: Andreas Rheinhardt <[email protected]>
Date: Sun, 10 May 2026 20:47:50 +0200
Subject: [PATCH 4/6] avfilter/vf_pp7dsp: Add restrict

Makes GCC optimize the scalar codepath away.

Signed-off-by: Andreas Rheinhardt <[email protected]>
---
 libavfilter/vf_pp7dsp.h       | 4 ++--
 libavfilter/x86/vf_pp7_init.c | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/libavfilter/vf_pp7dsp.h b/libavfilter/vf_pp7dsp.h
index e75917bdb4..cb405f4f58 100644
--- a/libavfilter/vf_pp7dsp.h
+++ b/libavfilter/vf_pp7dsp.h
@@ -27,12 +27,12 @@
 #include "config.h"
 
 typedef struct PP7DSPContext {
-    void (*dctB)(int16_t *dst, const int16_t *src);
+    void (*dctB)(int16_t *restrict dst, const int16_t *restrict src);
 } PP7DSPContext;
 
 void ff_pp7dsp_init_x86(PP7DSPContext *pp7dsp);
 
-static void dctB_c(int16_t *dst, const int16_t *src)
+static void dctB_c(int16_t *restrict dst, const int16_t *restrict src)
 {
     for (int i = 0; i < 4; i++) {
         int s0 = src[0 * 4] + src[6 * 4];
diff --git a/libavfilter/x86/vf_pp7_init.c b/libavfilter/x86/vf_pp7_init.c
index 53ac907f27..f294ca7764 100644
--- a/libavfilter/x86/vf_pp7_init.c
+++ b/libavfilter/x86/vf_pp7_init.c
@@ -23,7 +23,7 @@
 #include "libavutil/x86/cpu.h"
 #include "libavfilter/vf_pp7dsp.h"
 
-void ff_pp7_dctB_mmx(int16_t *dst, const int16_t *src);
+void ff_pp7_dctB_mmx(int16_t *restrict dst, const int16_t *restrict src);
 
 av_cold void ff_pp7dsp_init_x86(PP7DSPContext *p)
 {
-- 
2.52.0


>From a8bca4ba6a27f6fccfecc1b0ab7d8ad8ee993dc7 Mon Sep 17 00:00:00 2001
From: Andreas Rheinhardt <[email protected]>
Date: Sun, 10 May 2026 22:01:15 +0200
Subject: [PATCH 5/6] avfilter/x86/vf_pp7: Port ff_pp7_dctB_mmx to SSE2

Unfortunately, this is a bit slower than the MMX version because
it is impossible to use memory operands in paddw.
The situation would reverse if ff_pp7_dctB_mmx() had
to issue emms.

dctB_c:                                                  3.7 ( 1.00x)
dctB_mmx:                                                3.3 ( 1.13x)
dctB_sse2:                                               3.6 ( 1.03x)

Signed-off-by: Andreas Rheinhardt <[email protected]>
---
 libavfilter/vf_pp7.c          |  2 --
 libavfilter/x86/vf_pp7.asm    | 55 +++++++++++++++++------------------
 libavfilter/x86/vf_pp7_init.c |  6 ++--
 tests/checkasm/vf_pp7.c       |  2 +-
 4 files changed, 30 insertions(+), 35 deletions(-)

diff --git a/libavfilter/vf_pp7.c b/libavfilter/vf_pp7.c
index d8a5501b47..10f56c804f 100644
--- a/libavfilter/vf_pp7.c
+++ b/libavfilter/vf_pp7.c
@@ -27,7 +27,6 @@
  * project, and ported by Arwa Arif for FFmpeg.
  */
 
-#include "libavutil/emms.h"
 #include "libavutil/imgutils.h"
 #include "libavutil/mem.h"
 #include "libavutil/mem_internal.h"
@@ -351,7 +350,6 @@ static int filter_frame(AVFilterLink *inlink, AVFrame *in)
                    cw,        ch,        qp_table, qp_stride, 0);
             filter(pp7, out->data[2], in->data[2], out->linesize[2], in->linesize[2],
                    cw,        ch,        qp_table, qp_stride, 0);
-            emms_c();
         }
     }
 
diff --git a/libavfilter/x86/vf_pp7.asm b/libavfilter/x86/vf_pp7.asm
index 9dfabdcc8d..1a0921ed50 100644
--- a/libavfilter/x86/vf_pp7.asm
+++ b/libavfilter/x86/vf_pp7.asm
@@ -24,34 +24,31 @@
 
 SECTION .text
 
-INIT_MMX mmx
+INIT_XMM sse2
+;void ff_pp7_dctB_sse2(int16_t *dst, const int16_t *src)
+cglobal pp7_dctB, 2, 2, 6, dst, src
+    movq         m0, [srcq+8*0]
+    movq         m5, [srcq+8*6]
+    movq         m3, [srcq+8*3]
+    movq         m1, [srcq+8*1]
+    movq         m4, [srcq+8*5]
+    movq         m2, [srcq+8*2]
+    paddw        m0, m5
+    movq         m5, [srcq+8*4]
+    paddw        m3, m3
+    paddw        m1, m4
+    paddw        m2, m5
 
-;void ff_pp7_dctB_mmx(int16_t *dst, const int16_t *src)
-cglobal pp7_dctB, 2, 2, 0, dst, src
-    movq   m0, [srcq]
-    movq   m1, [srcq+mmsize*1]
-    paddw  m0, [srcq+mmsize*6]
-    paddw  m1, [srcq+mmsize*5]
-    movq   m2, [srcq+mmsize*2]
-    movq   m3, [srcq+mmsize*3]
-    paddw  m2, [srcq+mmsize*4]
-    paddw  m3, m3
-    movq   m4, m3
-    psubw  m3, m0
-    paddw  m4, m0
-    movq   m0, m2
-    psubw  m2, m1
-    paddw  m0, m1
-    movq   m1, m4
-    psubw  m4, m0
-    paddw  m1, m0
-    movq   m0, m3
-    psubw  m3, m2
-    psubw  m3, m2
-    paddw  m2, m0
-    paddw  m2, m0
-    movq   [dstq], m1
-    movq   [dstq+mmsize*2], m4
-    movq   [dstq+mmsize*1], m2
-    movq   [dstq+mmsize*3], m3
+    SUMSUB_BA     w, 0, 3, 4
+    SUMSUB_BA     w, 1, 2, 5
+
+    SUMSUB_BA     w, 1, 0, 4
+    movq     [dstq], m1
+    paddw        m4, m2, m3
+    paddw        m2, m2
+    movq [dstq+8*2], m0
+    paddw        m4, m3
+    psubw        m3, m2
+    movq [dstq+8*1], m4
+    movq [dstq+8*3], m3
     RET
diff --git a/libavfilter/x86/vf_pp7_init.c b/libavfilter/x86/vf_pp7_init.c
index f294ca7764..725326382b 100644
--- a/libavfilter/x86/vf_pp7_init.c
+++ b/libavfilter/x86/vf_pp7_init.c
@@ -23,12 +23,12 @@
 #include "libavutil/x86/cpu.h"
 #include "libavfilter/vf_pp7dsp.h"
 
-void ff_pp7_dctB_mmx(int16_t *restrict dst, const int16_t *restrict src);
+void ff_pp7_dctB_sse2(int16_t *restrict dst, const int16_t *restrict src);
 
 av_cold void ff_pp7dsp_init_x86(PP7DSPContext *p)
 {
     int cpu_flags = av_get_cpu_flags();
 
-    if (EXTERNAL_MMX(cpu_flags))
-        p->dctB = ff_pp7_dctB_mmx;
+    if (EXTERNAL_SSE2(cpu_flags))
+        p->dctB = ff_pp7_dctB_sse2;
 }
diff --git a/tests/checkasm/vf_pp7.c b/tests/checkasm/vf_pp7.c
index 07664f7472..e506eeb16c 100644
--- a/tests/checkasm/vf_pp7.c
+++ b/tests/checkasm/vf_pp7.c
@@ -35,7 +35,7 @@
 
 static void check_dctB(const PP7DSPContext *const pp7dsp)
 {
-    declare_func_emms(AV_CPU_FLAG_MMX, void, int16_t *dst, const int16_t *src);
+    declare_func(void, int16_t *dst, const int16_t *src);
 
     if (!check_func(pp7dsp->dctB, "dctB"))
         return;
-- 
2.52.0


>From 5433b8b0d8c36dcf489b6ed347eda47ce9bf51e0 Mon Sep 17 00:00:00 2001
From: Andreas Rheinhardt <[email protected]>
Date: Sun, 10 May 2026 22:41:11 +0200
Subject: [PATCH 6/6] avfilter/vf_pp7: Fix shadowing

Signed-off-by: Andreas Rheinhardt <[email protected]>
---
 libavfilter/vf_pp7.c | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/libavfilter/vf_pp7.c b/libavfilter/vf_pp7.c
index 10f56c804f..a2ad375227 100644
--- a/libavfilter/vf_pp7.c
+++ b/libavfilter/vf_pp7.c
@@ -226,10 +226,9 @@ static void filter(PP7Context *p, uint8_t *dst, const uint8_t *src,
     for (y = 0; y < height; y++) {
         for (x = -8; x < 0; x += 4) {
             const int index = x + y * stride + (8 - 3) * (1 + stride) + 8; //FIXME silly offset
-            const uint8_t *src = p_src + index;
             int16_t *tp   = temp + 4 * x;
 
-            dctA_c(tp + 4 * 8, src, stride);
+            dctA_c(tp + 4 * 8, p_src + index, stride);
         }
         for (x = 0; x < width; ) {
             const int qps = 3 + is_luma;
@@ -244,12 +243,11 @@ static void filter(PP7Context *p, uint8_t *dst, const uint8_t *src,
             }
             for (; x < end; x++) {
                 const int index = x + y * stride + (8 - 3) * (1 + stride) + 8; //FIXME silly offset
-                const uint8_t *src = p_src + index;
                 int16_t *tp  = temp + 4 * x;
                 int v;
 
                 if ((x & 3) == 0)
-                    dctA_c(tp + 4 * 8, src, stride);
+                    dctA_c(tp + 4 * 8, p_src + index, stride);
 
                 p->pp7dsp.dctB(block, tp);
 
-- 
2.52.0

_______________________________________________
ffmpeg-devel mailing list -- [email protected]
To unsubscribe send an email to [email protected]

Reply via email to