[FFmpeg-devel] [PATCH] avfilter/vf_fspp: Add checkasm, port to SSE2, fix big-endian (PR #20909)

mkver via ffmpeg-devel Thu, 13 Nov 2025 03:55:30 -0800

PR #20909 opened by mkver
URL: https://code.ffmpeg.org/FFmpeg/FFmpeg/pulls/20909
Patch URL: https://code.ffmpeg.org/FFmpeg/FFmpeg/pulls/20909.patch



>From 92fe3d96e6f9a3b169a3edcdb48ecdc543ba862e Mon Sep 17 00:00:00 2001
From: Andreas Rheinhardt <[email protected]>
Date: Sun, 9 Nov 2025 17:06:46 +0100
Subject: [PATCH 01/23] avfilter/vf_fspp: Add DSPCtx, move DSP functions to
 file of their own

This is in preparation for adding checkasm tests; without it,
checkasm would pull all of libavfilter in.

Signed-off-by: Andreas Rheinhardt <[email protected]>
---
 libavfilter/Makefile                    |   2 +-
 libavfilter/vf_fspp.c                   | 399 +++---------------------
 libavfilter/vf_fsppdsp.c                | 369 ++++++++++++++++++++++
 libavfilter/{vf_fspp.h => vf_fsppdsp.h} |  85 +++--
 libavfilter/x86/vf_fspp_init.c          |   4 +-
 5 files changed, 455 insertions(+), 404 deletions(-)
 create mode 100644 libavfilter/vf_fsppdsp.c
 rename libavfilter/{vf_fspp.h => vf_fsppdsp.h} (52%)

diff --git a/libavfilter/Makefile b/libavfilter/Makefile
index 69d74183b2..d56a458e45 100644
--- a/libavfilter/Makefile
+++ b/libavfilter/Makefile
@@ -329,7 +329,7 @@ OBJS-$(CONFIG_FRAMESTEP_FILTER)              += vf_framestep.o
 OBJS-$(CONFIG_FREEZEDETECT_FILTER)           += vf_freezedetect.o
 OBJS-$(CONFIG_FREEZEFRAMES_FILTER)           += vf_freezeframes.o
 OBJS-$(CONFIG_FREI0R_FILTER)                 += vf_frei0r.o
-OBJS-$(CONFIG_FSPP_FILTER)                   += vf_fspp.o qp_table.o
+OBJS-$(CONFIG_FSPP_FILTER)                   += vf_fspp.o vf_fsppdsp.o qp_table.o
 OBJS-$(CONFIG_FSYNC_FILTER)                  += vf_fsync.o
 OBJS-$(CONFIG_GBLUR_FILTER)                  += vf_gblur.o
 OBJS-$(CONFIG_GBLUR_VULKAN_FILTER)           += vf_gblur_vulkan.o vulkan.o vulkan_filter.o
diff --git a/libavfilter/vf_fspp.c b/libavfilter/vf_fspp.c
index 6b4a715367..9371c63e77 100644
--- a/libavfilter/vf_fspp.c
+++ b/libavfilter/vf_fspp.c
@@ -41,12 +41,40 @@
 #include "libavutil/mem_internal.h"
 #include "libavutil/opt.h"
 #include "libavutil/pixdesc.h"
+#include "libavutil/video_enc_params.h"
 
+#include "avfilter.h"
 #include "filters.h"
 #include "qp_table.h"
-#include "vf_fspp.h"
+#include "vf_fsppdsp.h"
 #include "video.h"
 
+#define BLOCKSZ  12
+#define MAX_LEVEL 5
+
+typedef struct FSPPContext {
+    const struct AVClass *class;
+    uint64_t threshold_mtx_noq[8 * 2];
+    uint64_t threshold_mtx[8 * 2];        //used in both C & MMX (& later SSE2) versions
+
+    int log2_count;
+    int strength;
+    int hsub;
+    int vsub;
+    int temp_stride;
+    int qp;
+    enum AVVideoEncParamsType qscale_type;
+    int prev_q;
+    uint8_t *src;
+    int16_t *temp;
+    int8_t  *non_b_qp_table;
+    int non_b_qp_stride;
+    int use_bframe_qp;
+
+    FSPPDSPContext dsp;
+} FSPPContext;
+
+
 #define OFFSET(x) offsetof(FSPPContext, x)
 #define FLAGS AV_OPT_FLAG_FILTERING_PARAM|AV_OPT_FLAG_VIDEO_PARAM
 static const AVOption fspp_options[] = {
@@ -59,17 +87,6 @@ static const AVOption fspp_options[] = {
 
 AVFILTER_DEFINE_CLASS(fspp);
 
-DECLARE_ALIGNED(32, static const uint8_t, dither)[8][8] = {
-    {  0,  48,  12,  60,   3,  51,  15,  63, },
-    { 32,  16,  44,  28,  35,  19,  47,  31, },
-    {  8,  56,   4,  52,  11,  59,   7,  55, },
-    { 40,  24,  36,  20,  43,  27,  39,  23, },
-    {  2,  50,  14,  62,   1,  49,  13,  61, },
-    { 34,  18,  46,  30,  33,  17,  45,  29, },
-    { 10,  58,   6,  54,   9,  57,   5,  53, },
-    { 42,  26,  38,  22,  41,  25,  37,  21, },
-};
-
 static const short custom_threshold[64] = {
 // values (296) can't be too high
 // -it causes too big quant dependence
@@ -84,73 +101,6 @@ static const short custom_threshold[64] = {
      20,  27,  26,  23,  20,  15,  11,   5
 };
 
-//This func reads from 1 slice, 1 and clears 0 & 1
-static void store_slice_c(uint8_t *dst, int16_t *src,
-                          ptrdiff_t dst_stride, ptrdiff_t src_stride,
-                          ptrdiff_t width, ptrdiff_t height, ptrdiff_t log2_scale)
-{
-    int y, x;
-#define STORE(pos)                                                             \
-    temp = (src[x + pos] + (d[pos] >> log2_scale)) >> (6 - log2_scale);        \
-    src[x + pos] = src[x + pos - 8 * src_stride] = 0;                          \
-    if (temp & 0x100) temp = ~(temp >> 31);                                    \
-    dst[x + pos] = temp;
-
-    for (y = 0; y < height; y++) {
-        const uint8_t *d = dither[y];
-        for (x = 0; x < width; x += 8) {
-            int temp;
-            STORE(0);
-            STORE(1);
-            STORE(2);
-            STORE(3);
-            STORE(4);
-            STORE(5);
-            STORE(6);
-            STORE(7);
-        }
-        src += src_stride;
-        dst += dst_stride;
-    }
-}
-
-//This func reads from 2 slices, 0 & 2  and clears 2-nd
-static void store_slice2_c(uint8_t *dst, int16_t *src,
-                           ptrdiff_t dst_stride, ptrdiff_t src_stride,
-                           ptrdiff_t width, ptrdiff_t height, ptrdiff_t log2_scale)
-{
-    int y, x;
-#define STORE2(pos)                                                                                       \
-    temp = (src[x + pos] + src[x + pos + 16 * src_stride] + (d[pos] >> log2_scale)) >> (6 - log2_scale);  \
-    src[x + pos + 16 * src_stride] = 0;                                                                   \
-    if (temp & 0x100) temp = ~(temp >> 31);                                                               \
-    dst[x + pos] = temp;
-
-    for (y = 0; y < height; y++) {
-        const uint8_t *d = dither[y];
-        for (x = 0; x < width; x += 8) {
-            int temp;
-            STORE2(0);
-            STORE2(1);
-            STORE2(2);
-            STORE2(3);
-            STORE2(4);
-            STORE2(5);
-            STORE2(6);
-            STORE2(7);
-        }
-        src += src_stride;
-        dst += dst_stride;
-    }
-}
-
-static void mul_thrmat_c(int16_t *thr_adr_noq, int16_t *thr_adr, int q)
-{
-    int a;
-    for (a = 0; a < 64; a++)
-        thr_adr[a] = q * thr_adr_noq[a];
-}
-
 static void filter(FSPPContext *p, uint8_t *dst, uint8_t *src,
                    int dst_stride, int src_stride,
                    int width, int height,
@@ -197,13 +147,13 @@ static void filter(FSPPContext *p, uint8_t *dst, uint8_t *src,
         if (qy < 0) qy = 0;
 
         qy = (qy >> qpsv) * qp_stride;
-        p->row_fdct(block, p->src + y * stride + 2 - (y&1), stride, 2);
+        p->dsp.row_fdct(block, p->src + y * stride + 2 - (y&1), stride, 2);
 
         for (x0 = 0; x0 < width + 8 - 8 * (BLOCKSZ - 1); x0 += 8 * (BLOCKSZ - 1)) {
-            p->row_fdct(block + 8 * 8, p->src + y * stride + 8 + x0 + 2 - (y&1), stride, 2 * (BLOCKSZ - 1));
+            p->dsp.row_fdct(block + 8 * 8, p->src + y * stride + 8 + x0 + 2 - (y&1), stride, 2 * (BLOCKSZ - 1));
 
             if (p->qp)
-                p->column_fidct((int16_t *)(&p->threshold_mtx[0]), block + 0 * 8, block3 + 0 * 8, 8 * (BLOCKSZ - 1)); //yes, this is a HOTSPOT
+                p->dsp.column_fidct((int16_t *)(&p->threshold_mtx[0]), block + 0 * 8, block3 + 0 * 8, 8 * (BLOCKSZ - 1)); //yes, this is a HOTSPOT
             else
                 for (x = 0; x < 8 * (BLOCKSZ - 1); x += 8) {
                     t = x + x0 - 2;                    //correct t=x+x0-2-(y&1), but its the same
@@ -213,288 +163,42 @@ static void filter(FSPPContext *p, uint8_t *dst, uint8_t *src,
                     t = qp_store[qy + (t >> qpsh)];
                     t = ff_norm_qscale(t, p->qscale_type);
 
-                    if (t != p->prev_q) p->prev_q = t, p->mul_thrmat((int16_t *)(&p->threshold_mtx_noq[0]), (int16_t *)(&p->threshold_mtx[0]), t);
-                    p->column_fidct((int16_t *)(&p->threshold_mtx[0]), block + x * 8, block3 + x * 8, 8); //yes, this is a HOTSPOT
+                    if (t != p->prev_q) p->prev_q = t, p->dsp.mul_thrmat((int16_t *)(&p->threshold_mtx_noq[0]), (int16_t *)(&p->threshold_mtx[0]), t);
+                    p->dsp.column_fidct((int16_t *)(&p->threshold_mtx[0]), block + x * 8, block3 + x * 8, 8); //yes, this is a HOTSPOT
                 }
-            p->row_idct(block3 + 0 * 8, p->temp + (y & 15) * stride + x0 + 2 - (y & 1), stride, 2 * (BLOCKSZ - 1));
+            p->dsp.row_idct(block3 + 0 * 8, p->temp + (y & 15) * stride + x0 + 2 - (y & 1), stride, 2 * (BLOCKSZ - 1));
             memmove(block,  block  + (BLOCKSZ - 1) * 64, 8 * 8 * sizeof(int16_t)); //cycling
             memmove(block3, block3 + (BLOCKSZ - 1) * 64, 6 * 8 * sizeof(int16_t));
         }
 
         es = width + 8 - x0; //  8, ...
         if (es > 8)
-            p->row_fdct(block + 8 * 8, p->src + y * stride + 8 + x0 + 2 - (y & 1), stride, (es - 4) >> 2);
+            p->dsp.row_fdct(block + 8 * 8, p->src + y * stride + 8 + x0 + 2 - (y & 1), stride, (es - 4) >> 2);
 
-        p->column_fidct((int16_t *)(&p->threshold_mtx[0]), block, block3, es&(~1));
+        p->dsp.column_fidct((int16_t *)(&p->threshold_mtx[0]), block, block3, es&(~1));
         if (es > 3)
-            p->row_idct(block3 + 0 * 8, p->temp + (y & 15) * stride + x0 + 2 - (y & 1), stride, es >> 2);
+            p->dsp.row_idct(block3 + 0 * 8, p->temp + (y & 15) * stride + x0 + 2 - (y & 1), stride, es >> 2);
 
         if (!(y1 & 7) && y1) {
             if (y1 & 8)
-                p->store_slice(dst + (y1 - 8) * dst_stride, p->temp + 8 + 8 * stride,
-                               dst_stride, stride, width, 8, 5 - p->log2_count);
+                p->dsp.store_slice(dst + (y1 - 8) * dst_stride, p->temp + 8 + 8 * stride,
+                                   dst_stride, stride, width, 8, 5 - p->log2_count);
             else
-                p->store_slice2(dst + (y1 - 8) * dst_stride, p->temp + 8 + 0 * stride,
-                                dst_stride, stride, width, 8, 5 - p->log2_count);
+                p->dsp.store_slice2(dst + (y1 - 8) * dst_stride, p->temp + 8 + 0 * stride,
+                                    dst_stride, stride, width, 8, 5 - p->log2_count);
         }
     }
 
     if (y & 7) {  // height % 8 != 0
         if (y & 8)
-            p->store_slice(dst + ((y - 8) & ~7) * dst_stride, p->temp + 8 + 8 * stride,
-                           dst_stride, stride, width, y&7, 5 - p->log2_count);
+            p->dsp.store_slice(dst + ((y - 8) & ~7) * dst_stride, p->temp + 8 + 8 * stride,
+                               dst_stride, stride, width, y&7, 5 - p->log2_count);
         else
-            p->store_slice2(dst + ((y - 8) & ~7) * dst_stride, p->temp + 8 + 0 * stride,
+            p->dsp.store_slice2(dst + ((y - 8) & ~7) * dst_stride, p->temp + 8 + 0 * stride,
                             dst_stride, stride, width, y&7, 5 - p->log2_count);
     }
 }
 
-static void column_fidct_c(int16_t *thr_adr, int16_t *data, int16_t *output, int cnt)
-{
-    int_simd16_t tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
-    int_simd16_t tmp10, tmp11, tmp12, tmp13;
-    int_simd16_t z1,z2,z3,z4,z5, z10, z11, z12, z13;
-    int_simd16_t d0, d1, d2, d3, d4, d5, d6, d7;
-
-    int16_t *dataptr;
-    int16_t *wsptr;
-    int16_t *threshold;
-    int ctr;
-
-    dataptr = data;
-    wsptr = output;
-
-    for (; cnt > 0; cnt -= 2) { //start positions
-        threshold = (int16_t *)thr_adr;//threshold_mtx
-        for (ctr = DCTSIZE; ctr > 0; ctr--) {
-            // Process columns from input, add to output.
-            tmp0 = dataptr[DCTSIZE * 0] + dataptr[DCTSIZE * 7];
-            tmp7 = dataptr[DCTSIZE * 0] - dataptr[DCTSIZE * 7];
-
-            tmp1 = dataptr[DCTSIZE * 1] + dataptr[DCTSIZE * 6];
-            tmp6 = dataptr[DCTSIZE * 1] - dataptr[DCTSIZE * 6];
-
-            tmp2 = dataptr[DCTSIZE * 2] + dataptr[DCTSIZE * 5];
-            tmp5 = dataptr[DCTSIZE * 2] - dataptr[DCTSIZE * 5];
-
-            tmp3 = dataptr[DCTSIZE * 3] + dataptr[DCTSIZE * 4];
-            tmp4 = dataptr[DCTSIZE * 3] - dataptr[DCTSIZE * 4];
-
-            // Even part of FDCT
-
-            tmp10 = tmp0 + tmp3;
-            tmp13 = tmp0 - tmp3;
-            tmp11 = tmp1 + tmp2;
-            tmp12 = tmp1 - tmp2;
-
-            d0 = tmp10 + tmp11;
-            d4 = tmp10 - tmp11;
-
-            z1 = MULTIPLY16H((tmp12 + tmp13) << 2, FIX_0_707106781);
-            d2 = tmp13 + z1;
-            d6 = tmp13 - z1;
-
-            // Even part of IDCT
-
-            THRESHOLD(tmp0, d0, threshold[0 * 8]);
-            THRESHOLD(tmp1, d2, threshold[2 * 8]);
-            THRESHOLD(tmp2, d4, threshold[4 * 8]);
-            THRESHOLD(tmp3, d6, threshold[6 * 8]);
-            tmp0 += 2;
-            tmp10 = (tmp0 + tmp2) >> 2;
-            tmp11 = (tmp0 - tmp2) >> 2;
-
-            tmp13 = (tmp1 + tmp3) >>2; //+2 !  (psnr decides)
-            tmp12 = MULTIPLY16H((tmp1 - tmp3), FIX_1_414213562_A) - tmp13; //<<2
-
-            tmp0 = tmp10 + tmp13; //->temps
-            tmp3 = tmp10 - tmp13; //->temps
-            tmp1 = tmp11 + tmp12; //->temps
-            tmp2 = tmp11 - tmp12; //->temps
-
-            // Odd part of FDCT
-
-            tmp10 = tmp4 + tmp5;
-            tmp11 = tmp5 + tmp6;
-            tmp12 = tmp6 + tmp7;
-
-            z5 = MULTIPLY16H((tmp10 - tmp12) << 2, FIX_0_382683433);
-            z2 = MULTIPLY16H(tmp10 << 2, FIX_0_541196100) + z5;
-            z4 = MULTIPLY16H(tmp12 << 2, FIX_1_306562965) + z5;
-            z3 = MULTIPLY16H(tmp11 << 2, FIX_0_707106781);
-
-            z11 = tmp7 + z3;
-            z13 = tmp7 - z3;
-
-            d5 = z13 + z2;
-            d3 = z13 - z2;
-            d1 = z11 + z4;
-            d7 = z11 - z4;
-
-            // Odd part of IDCT
-
-            THRESHOLD(tmp4, d1, threshold[1 * 8]);
-            THRESHOLD(tmp5, d3, threshold[3 * 8]);
-            THRESHOLD(tmp6, d5, threshold[5 * 8]);
-            THRESHOLD(tmp7, d7, threshold[7 * 8]);
-
-            //Simd version uses here a shortcut for the tmp5,tmp6,tmp7 == 0
-            z13 = tmp6 + tmp5;
-            z10 = (tmp6 - tmp5) << 1;
-            z11 = tmp4 + tmp7;
-            z12 = (tmp4 - tmp7) << 1;
-
-            tmp7  = (z11 + z13) >> 2; //+2 !
-            tmp11 = MULTIPLY16H((z11 - z13) << 1, FIX_1_414213562);
-            z5    = MULTIPLY16H(z10 + z12,        FIX_1_847759065);
-            tmp10 = MULTIPLY16H(z12,              FIX_1_082392200) - z5;
-            tmp12 = MULTIPLY16H(z10,              FIX_2_613125930) + z5; // - !!
-
-            tmp6 = tmp12 - tmp7;
-            tmp5 = tmp11 - tmp6;
-            tmp4 = tmp10 + tmp5;
-
-            wsptr[DCTSIZE * 0] +=  (tmp0 + tmp7);
-            wsptr[DCTSIZE * 1] +=  (tmp1 + tmp6);
-            wsptr[DCTSIZE * 2] +=  (tmp2 + tmp5);
-            wsptr[DCTSIZE * 3] +=  (tmp3 - tmp4);
-            wsptr[DCTSIZE * 4] +=  (tmp3 + tmp4);
-            wsptr[DCTSIZE * 5] +=  (tmp2 - tmp5);
-            wsptr[DCTSIZE * 6]  =  (tmp1 - tmp6);
-            wsptr[DCTSIZE * 7]  =  (tmp0 - tmp7);
-            //
-            dataptr++; //next column
-            wsptr++;
-            threshold++;
-        }
-        dataptr += 8; //skip each second start pos
-        wsptr   += 8;
-    }
-}
-
-static void row_idct_c(int16_t *workspace, int16_t *output_adr, ptrdiff_t output_stride, int cnt)
-{
-    int_simd16_t tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
-    int_simd16_t tmp10, tmp11, tmp12, tmp13;
-    int_simd16_t z5, z10, z11, z12, z13;
-    int16_t *outptr;
-    int16_t *wsptr;
-
-    cnt *= 4;
-    wsptr = workspace;
-    outptr = output_adr;
-    for (; cnt > 0; cnt--) {
-        // Even part
-        //Simd version reads 4x4 block and transposes it
-        tmp10 = wsptr[2] +  wsptr[3];
-        tmp11 = wsptr[2] -  wsptr[3];
-
-        tmp13 = wsptr[0] +  wsptr[1];
-        tmp12 = (MULTIPLY16H(wsptr[0] - wsptr[1], FIX_1_414213562_A) << 2) - tmp13;//this shift order to avoid overflow
-
-        tmp0 = tmp10 + tmp13; //->temps
-        tmp3 = tmp10 - tmp13; //->temps
-        tmp1 = tmp11 + tmp12;
-        tmp2 = tmp11 - tmp12;
-
-        // Odd part
-        //Also transpose, with previous:
-        // ---- ----      ||||
-        // ---- ---- idct ||||
-        // ---- ---- ---> ||||
-        // ---- ----      ||||
-        z13 = wsptr[4] + wsptr[5];
-        z10 = wsptr[4] - wsptr[5];
-        z11 = wsptr[6] + wsptr[7];
-        z12 = wsptr[6] - wsptr[7];
-
-        tmp7 = z11 + z13;
-        tmp11 = MULTIPLY16H(z11 - z13, FIX_1_414213562);
-
-        z5 =    MULTIPLY16H(z10 + z12, FIX_1_847759065);
-        tmp10 = MULTIPLY16H(z12,       FIX_1_082392200) - z5;
-        tmp12 = MULTIPLY16H(z10,       FIX_2_613125930) + z5; // - FIX_
-
-        tmp6 = (tmp12 << 3) - tmp7;
-        tmp5 = (tmp11 << 3) - tmp6;
-        tmp4 = (tmp10 << 3) + tmp5;
-
-        // Final output stage: descale and write column
-        outptr[0 * output_stride] += DESCALE(tmp0 + tmp7, 3);
-        outptr[1 * output_stride] += DESCALE(tmp1 + tmp6, 3);
-        outptr[2 * output_stride] += DESCALE(tmp2 + tmp5, 3);
-        outptr[3 * output_stride] += DESCALE(tmp3 - tmp4, 3);
-        outptr[4 * output_stride] += DESCALE(tmp3 + tmp4, 3);
-        outptr[5 * output_stride] += DESCALE(tmp2 - tmp5, 3);
-        outptr[6 * output_stride] += DESCALE(tmp1 - tmp6, 3); //no += ?
-        outptr[7 * output_stride] += DESCALE(tmp0 - tmp7, 3); //no += ?
-        outptr++;
-
-        wsptr += DCTSIZE;       // advance pointer to next row
-    }
-}
-
-static void row_fdct_c(int16_t *data, const uint8_t *pixels, ptrdiff_t line_size, int cnt)
-{
-    int_simd16_t tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
-    int_simd16_t tmp10, tmp11, tmp12, tmp13;
-    int_simd16_t z1, z2, z3, z4, z5, z11, z13;
-    int16_t *dataptr;
-
-    cnt *= 4;
-    // Pass 1: process rows.
-
-    dataptr = data;
-    for (; cnt > 0; cnt--) {
-        tmp0 = pixels[line_size * 0] + pixels[line_size * 7];
-        tmp7 = pixels[line_size * 0] - pixels[line_size * 7];
-        tmp1 = pixels[line_size * 1] + pixels[line_size * 6];
-        tmp6 = pixels[line_size * 1] - pixels[line_size * 6];
-        tmp2 = pixels[line_size * 2] + pixels[line_size * 5];
-        tmp5 = pixels[line_size * 2] - pixels[line_size * 5];
-        tmp3 = pixels[line_size * 3] + pixels[line_size * 4];
-        tmp4 = pixels[line_size * 3] - pixels[line_size * 4];
-
-        // Even part
-
-        tmp10 = tmp0 + tmp3;
-        tmp13 = tmp0 - tmp3;
-        tmp11 = tmp1 + tmp2;
-        tmp12 = tmp1 - tmp2;
-        //Even columns are written first, this leads to different order of columns
-        //in column_fidct(), but they are processed independently, so all ok.
-        //Later in the row_idct() columns are read in the same order.
-        dataptr[2] = tmp10 + tmp11;
-        dataptr[3] = tmp10 - tmp11;
-
-        z1 = MULTIPLY16H((tmp12 + tmp13) << 2, FIX_0_707106781);
-        dataptr[0] = tmp13 + z1;
-        dataptr[1] = tmp13 - z1;
-
-        // Odd part
-
-        tmp10 = (tmp4 + tmp5) << 2;
-        tmp11 = (tmp5 + tmp6) << 2;
-        tmp12 = (tmp6 + tmp7) << 2;
-
-        z5 = MULTIPLY16H(tmp10 - tmp12, FIX_0_382683433);
-        z2 = MULTIPLY16H(tmp10,         FIX_0_541196100) + z5;
-        z4 = MULTIPLY16H(tmp12,         FIX_1_306562965) + z5;
-        z3 = MULTIPLY16H(tmp11,         FIX_0_707106781);
-
-        z11 = tmp7 + z3;
-        z13 = tmp7 - z3;
-
-        dataptr[4] = z13 + z2;
-        dataptr[5] = z13 - z2;
-        dataptr[6] = z11 + z4;
-        dataptr[7] = z11 - z4;
-
-        pixels++;               // advance pointer to next column
-        dataptr += DCTSIZE;
-    }
-}
-
 static const enum AVPixelFormat pix_fmts[] = {
     AV_PIX_FMT_YUV444P,  AV_PIX_FMT_YUV422P,
     AV_PIX_FMT_YUV420P,  AV_PIX_FMT_YUV411P,
@@ -522,16 +226,7 @@ static int config_input(AVFilterLink *inlink)
     if (!fspp->temp || !fspp->src)
         return AVERROR(ENOMEM);
 
-    fspp->store_slice  = store_slice_c;
-    fspp->store_slice2 = store_slice2_c;
-    fspp->mul_thrmat   = mul_thrmat_c;
-    fspp->column_fidct = column_fidct_c;
-    fspp->row_idct     = row_idct_c;
-    fspp->row_fdct     = row_fdct_c;
-
-#if ARCH_X86
-    ff_fspp_init_x86(fspp);
-#endif
+    ff_fsppdsp_init(&fspp->dsp);
 
     return 0;
 }
@@ -567,7 +262,7 @@ static int filter_frame(AVFilterLink *inlink, AVFrame *in)
     }
 
     if (fspp->qp)
-        fspp->prev_q = fspp->qp, fspp->mul_thrmat((int16_t *)(&fspp->threshold_mtx_noq[0]), (int16_t *)(&fspp->threshold_mtx[0]), fspp->qp);
+        fspp->prev_q = fspp->qp, fspp->dsp.mul_thrmat((int16_t *)(&fspp->threshold_mtx_noq[0]), (int16_t *)(&fspp->threshold_mtx[0]), fspp->qp);
 
     /* if we are not in a constant user quantizer mode and we don't want to use
      * the quantizers from the B-frames (B-frames often have a higher QP), we
diff --git a/libavfilter/vf_fsppdsp.c b/libavfilter/vf_fsppdsp.c
new file mode 100644
index 0000000000..ab31c77203
--- /dev/null
+++ b/libavfilter/vf_fsppdsp.c
@@ -0,0 +1,369 @@
+/*
+ * Copyright (c) 2003 Michael Niedermayer <[email protected]>
+ * Copyright (C) 2005 Nikolaj Poroshin <[email protected]>
+ * Copyright (c) 2014 Arwa Arif <[email protected]>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with FFmpeg; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ */
+
+#include <stdint.h>
+
+#include "vf_fsppdsp.h"
+
+#include "libavutil/mathematics.h"
+#include "libavutil/mem_internal.h"
+
+#define DCTSIZE 8
+
+#define FIX(x,s)  ((x) * (1 << s) + 0.5)
+
+#define MULTIPLY16H(x,k)   (((x) * (k)) >> 16)
+#define THRESHOLD(r,x,t)                         \
+    if(((unsigned)((x) + t)) > t * 2) r = (x);   \
+    else r = 0;
+#define DESCALE(x,n)  (((x) + (1 << ((n) - 1))) >> n)
+
+typedef int32_t int_simd16_t;
+static const int16_t FIX_0_382683433   = FIX(0.382683433, 14);
+static const int16_t FIX_0_541196100   = FIX(0.541196100, 14);
+static const int16_t FIX_0_707106781   = FIX(M_SQRT1_2  , 14);
+static const int16_t FIX_1_306562965   = FIX(1.306562965, 14);
+static const int16_t FIX_1_414213562_A = FIX(M_SQRT2    , 14);
+static const int16_t FIX_1_847759065   = FIX(1.847759065, 13);
+static const int16_t FIX_2_613125930   = FIX(-2.613125930, 13);
+static const int16_t FIX_1_414213562   = FIX(M_SQRT2    , 13);
+static const int16_t FIX_1_082392200   = FIX(1.082392200, 13);
+
+DECLARE_ALIGNED(32, static const uint8_t, dither)[8][8] = {
+    {  0,  48,  12,  60,   3,  51,  15,  63, },
+    { 32,  16,  44,  28,  35,  19,  47,  31, },
+    {  8,  56,   4,  52,  11,  59,   7,  55, },
+    { 40,  24,  36,  20,  43,  27,  39,  23, },
+    {  2,  50,  14,  62,   1,  49,  13,  61, },
+    { 34,  18,  46,  30,  33,  17,  45,  29, },
+    { 10,  58,   6,  54,   9,  57,   5,  53, },
+    { 42,  26,  38,  22,  41,  25,  37,  21, },
+};
+
+//This func reads from 1 slice, 1 and clears 0 & 1
+void ff_store_slice_c(uint8_t *dst, int16_t *src,
+                      ptrdiff_t dst_stride, ptrdiff_t src_stride,
+                      ptrdiff_t width, ptrdiff_t height, ptrdiff_t log2_scale)
+{
+#define STORE(pos)                                                             \
+    temp = (src[x + pos] + (d[pos] >> log2_scale)) >> (6 - log2_scale);        \
+    src[x + pos] = src[x + pos - 8 * src_stride] = 0;                          \
+    if (temp & 0x100) temp = ~(temp >> 31);                                    \
+    dst[x + pos] = temp;
+
+    for (int y = 0; y < height; y++) {
+        const uint8_t *d = dither[y];
+        for (int x = 0; x < width; x += 8) {
+            int temp;
+            STORE(0);
+            STORE(1);
+            STORE(2);
+            STORE(3);
+            STORE(4);
+            STORE(5);
+            STORE(6);
+            STORE(7);
+        }
+        src += src_stride;
+        dst += dst_stride;
+    }
+}
+
+//This func reads from 2 slices, 0 & 2  and clears 2-nd
+void ff_store_slice2_c(uint8_t *dst, int16_t *src,
+                       ptrdiff_t dst_stride, ptrdiff_t src_stride,
+                       ptrdiff_t width, ptrdiff_t height, ptrdiff_t log2_scale)
+{
+#define STORE2(pos)                                                                                       \
+    temp = (src[x + pos] + src[x + pos + 16 * src_stride] + (d[pos] >> log2_scale)) >> (6 - log2_scale);  \
+    src[x + pos + 16 * src_stride] = 0;                                                                   \
+    if (temp & 0x100) temp = ~(temp >> 31);                                                               \
+    dst[x + pos] = temp;
+
+    for (int y = 0; y < height; y++) {
+        const uint8_t *d = dither[y];
+        for (int x = 0; x < width; x += 8) {
+            int temp;
+            STORE2(0);
+            STORE2(1);
+            STORE2(2);
+            STORE2(3);
+            STORE2(4);
+            STORE2(5);
+            STORE2(6);
+            STORE2(7);
+        }
+        src += src_stride;
+        dst += dst_stride;
+    }
+}
+
+void ff_mul_thrmat_c(int16_t *thr_adr_noq, int16_t *thr_adr, int q)
+{
+    for (int a = 0; a < 64; a++)
+        thr_adr[a] = q * thr_adr_noq[a];
+}
+
+void ff_column_fidct_c(int16_t *thr_adr, int16_t *data, int16_t *output, int cnt)
+{
+    int_simd16_t tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
+    int_simd16_t tmp10, tmp11, tmp12, tmp13;
+    int_simd16_t z1,z2,z3,z4,z5, z10, z11, z12, z13;
+    int_simd16_t d0, d1, d2, d3, d4, d5, d6, d7;
+
+    int16_t *dataptr;
+    int16_t *wsptr;
+    int16_t *threshold;
+
+    dataptr = data;
+    wsptr = output;
+
+    for (; cnt > 0; cnt -= 2) { //start positions
+        threshold = (int16_t *)thr_adr;//threshold_mtx
+        for (int ctr = DCTSIZE; ctr > 0; ctr--) {
+            // Process columns from input, add to output.
+            tmp0 = dataptr[DCTSIZE * 0] + dataptr[DCTSIZE * 7];
+            tmp7 = dataptr[DCTSIZE * 0] - dataptr[DCTSIZE * 7];
+
+            tmp1 = dataptr[DCTSIZE * 1] + dataptr[DCTSIZE * 6];
+            tmp6 = dataptr[DCTSIZE * 1] - dataptr[DCTSIZE * 6];
+
+            tmp2 = dataptr[DCTSIZE * 2] + dataptr[DCTSIZE * 5];
+            tmp5 = dataptr[DCTSIZE * 2] - dataptr[DCTSIZE * 5];
+
+            tmp3 = dataptr[DCTSIZE * 3] + dataptr[DCTSIZE * 4];
+            tmp4 = dataptr[DCTSIZE * 3] - dataptr[DCTSIZE * 4];
+
+            // Even part of FDCT
+
+            tmp10 = tmp0 + tmp3;
+            tmp13 = tmp0 - tmp3;
+            tmp11 = tmp1 + tmp2;
+            tmp12 = tmp1 - tmp2;
+
+            d0 = tmp10 + tmp11;
+            d4 = tmp10 - tmp11;
+
+            z1 = MULTIPLY16H((tmp12 + tmp13) << 2, FIX_0_707106781);
+            d2 = tmp13 + z1;
+            d6 = tmp13 - z1;
+
+            // Even part of IDCT
+
+            THRESHOLD(tmp0, d0, threshold[0 * 8]);
+            THRESHOLD(tmp1, d2, threshold[2 * 8]);
+            THRESHOLD(tmp2, d4, threshold[4 * 8]);
+            THRESHOLD(tmp3, d6, threshold[6 * 8]);
+            tmp0 += 2;
+            tmp10 = (tmp0 + tmp2) >> 2;
+            tmp11 = (tmp0 - tmp2) >> 2;
+
+            tmp13 = (tmp1 + tmp3) >>2; //+2 !  (psnr decides)
+            tmp12 = MULTIPLY16H((tmp1 - tmp3), FIX_1_414213562_A) - tmp13; //<<2
+
+            tmp0 = tmp10 + tmp13; //->temps
+            tmp3 = tmp10 - tmp13; //->temps
+            tmp1 = tmp11 + tmp12; //->temps
+            tmp2 = tmp11 - tmp12; //->temps
+
+            // Odd part of FDCT
+
+            tmp10 = tmp4 + tmp5;
+            tmp11 = tmp5 + tmp6;
+            tmp12 = tmp6 + tmp7;
+
+            z5 = MULTIPLY16H((tmp10 - tmp12) << 2, FIX_0_382683433);
+            z2 = MULTIPLY16H(tmp10 << 2, FIX_0_541196100) + z5;
+            z4 = MULTIPLY16H(tmp12 << 2, FIX_1_306562965) + z5;
+            z3 = MULTIPLY16H(tmp11 << 2, FIX_0_707106781);
+
+            z11 = tmp7 + z3;
+            z13 = tmp7 - z3;
+
+            d5 = z13 + z2;
+            d3 = z13 - z2;
+            d1 = z11 + z4;
+            d7 = z11 - z4;
+
+            // Odd part of IDCT
+
+            THRESHOLD(tmp4, d1, threshold[1 * 8]);
+            THRESHOLD(tmp5, d3, threshold[3 * 8]);
+            THRESHOLD(tmp6, d5, threshold[5 * 8]);
+            THRESHOLD(tmp7, d7, threshold[7 * 8]);
+
+            //Simd version uses here a shortcut for the tmp5,tmp6,tmp7 == 0
+            z13 = tmp6 + tmp5;
+            z10 = (tmp6 - tmp5) << 1;
+            z11 = tmp4 + tmp7;
+            z12 = (tmp4 - tmp7) << 1;
+
+            tmp7  = (z11 + z13) >> 2; //+2 !
+            tmp11 = MULTIPLY16H((z11 - z13) << 1, FIX_1_414213562);
+            z5    = MULTIPLY16H(z10 + z12,        FIX_1_847759065);
+            tmp10 = MULTIPLY16H(z12,              FIX_1_082392200) - z5;
+            tmp12 = MULTIPLY16H(z10,              FIX_2_613125930) + z5; // - !!
+
+            tmp6 = tmp12 - tmp7;
+            tmp5 = tmp11 - tmp6;
+            tmp4 = tmp10 + tmp5;
+
+            wsptr[DCTSIZE * 0] +=  (tmp0 + tmp7);
+            wsptr[DCTSIZE * 1] +=  (tmp1 + tmp6);
+            wsptr[DCTSIZE * 2] +=  (tmp2 + tmp5);
+            wsptr[DCTSIZE * 3] +=  (tmp3 - tmp4);
+            wsptr[DCTSIZE * 4] +=  (tmp3 + tmp4);
+            wsptr[DCTSIZE * 5] +=  (tmp2 - tmp5);
+            wsptr[DCTSIZE * 6]  =  (tmp1 - tmp6);
+            wsptr[DCTSIZE * 7]  =  (tmp0 - tmp7);
+            //
+            dataptr++; //next column
+            wsptr++;
+            threshold++;
+        }
+        dataptr += 8; //skip each second start pos
+        wsptr   += 8;
+    }
+}
+
+void ff_row_idct_c(int16_t *workspace, int16_t *output_adr, ptrdiff_t output_stride, int cnt)
+{
+    int_simd16_t tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
+    int_simd16_t tmp10, tmp11, tmp12, tmp13;
+    int_simd16_t z5, z10, z11, z12, z13;
+    int16_t *outptr;
+    int16_t *wsptr;
+
+    cnt *= 4;
+    wsptr = workspace;
+    outptr = output_adr;
+    for (; cnt > 0; cnt--) {
+        // Even part
+        //Simd version reads 4x4 block and transposes it
+        tmp10 = wsptr[2] +  wsptr[3];
+        tmp11 = wsptr[2] -  wsptr[3];
+
+        tmp13 = wsptr[0] +  wsptr[1];
+        tmp12 = (MULTIPLY16H(wsptr[0] - wsptr[1], FIX_1_414213562_A) << 2) - tmp13;//this shift order to avoid overflow
+
+        tmp0 = tmp10 + tmp13; //->temps
+        tmp3 = tmp10 - tmp13; //->temps
+        tmp1 = tmp11 + tmp12;
+        tmp2 = tmp11 - tmp12;
+
+        // Odd part
+        //Also transpose, with previous:
+        // ---- ----      ||||
+        // ---- ---- idct ||||
+        // ---- ---- ---> ||||
+        // ---- ----      ||||
+        z13 = wsptr[4] + wsptr[5];
+        z10 = wsptr[4] - wsptr[5];
+        z11 = wsptr[6] + wsptr[7];
+        z12 = wsptr[6] - wsptr[7];
+
+        tmp7 = z11 + z13;
+        tmp11 = MULTIPLY16H(z11 - z13, FIX_1_414213562);
+
+        z5 =    MULTIPLY16H(z10 + z12, FIX_1_847759065);
+        tmp10 = MULTIPLY16H(z12,       FIX_1_082392200) - z5;
+        tmp12 = MULTIPLY16H(z10,       FIX_2_613125930) + z5; // - FIX_
+
+        tmp6 = (tmp12 << 3) - tmp7;
+        tmp5 = (tmp11 << 3) - tmp6;
+        tmp4 = (tmp10 << 3) + tmp5;
+
+        // Final output stage: descale and write column
+        outptr[0 * output_stride] += DESCALE(tmp0 + tmp7, 3);
+        outptr[1 * output_stride] += DESCALE(tmp1 + tmp6, 3);
+        outptr[2 * output_stride] += DESCALE(tmp2 + tmp5, 3);
+        outptr[3 * output_stride] += DESCALE(tmp3 - tmp4, 3);
+        outptr[4 * output_stride] += DESCALE(tmp3 + tmp4, 3);
+        outptr[5 * output_stride] += DESCALE(tmp2 - tmp5, 3);
+        outptr[6 * output_stride] += DESCALE(tmp1 - tmp6, 3); //no += ?
+        outptr[7 * output_stride] += DESCALE(tmp0 - tmp7, 3); //no += ?
+        outptr++;
+
+        wsptr += DCTSIZE;       // advance pointer to next row
+    }
+}
+
+void ff_row_fdct_c(int16_t *data, const uint8_t *pixels, ptrdiff_t line_size, int cnt)
+{
+    int_simd16_t tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
+    int_simd16_t tmp10, tmp11, tmp12, tmp13;
+    int_simd16_t z1, z2, z3, z4, z5, z11, z13;
+    int16_t *dataptr;
+
+    cnt *= 4;
+    // Pass 1: process rows.
+
+    dataptr = data;
+    for (; cnt > 0; cnt--) {
+        tmp0 = pixels[line_size * 0] + pixels[line_size * 7];
+        tmp7 = pixels[line_size * 0] - pixels[line_size * 7];
+        tmp1 = pixels[line_size * 1] + pixels[line_size * 6];
+        tmp6 = pixels[line_size * 1] - pixels[line_size * 6];
+        tmp2 = pixels[line_size * 2] + pixels[line_size * 5];
+        tmp5 = pixels[line_size * 2] - pixels[line_size * 5];
+        tmp3 = pixels[line_size * 3] + pixels[line_size * 4];
+        tmp4 = pixels[line_size * 3] - pixels[line_size * 4];
+
+        // Even part
+
+        tmp10 = tmp0 + tmp3;
+        tmp13 = tmp0 - tmp3;
+        tmp11 = tmp1 + tmp2;
+        tmp12 = tmp1 - tmp2;
+        //Even columns are written first, this leads to different order of columns
+        //in column_fidct(), but they are processed independently, so all ok.
+        //Later in the row_idct() columns are read in the same order.
+        dataptr[2] = tmp10 + tmp11;
+        dataptr[3] = tmp10 - tmp11;
+
+        z1 = MULTIPLY16H((tmp12 + tmp13) << 2, FIX_0_707106781);
+        dataptr[0] = tmp13 + z1;
+        dataptr[1] = tmp13 - z1;
+
+        // Odd part
+
+        tmp10 = (tmp4 + tmp5) << 2;
+        tmp11 = (tmp5 + tmp6) << 2;
+        tmp12 = (tmp6 + tmp7) << 2;
+
+        z5 = MULTIPLY16H(tmp10 - tmp12, FIX_0_382683433);
+        z2 = MULTIPLY16H(tmp10,         FIX_0_541196100) + z5;
+        z4 = MULTIPLY16H(tmp12,         FIX_1_306562965) + z5;
+        z3 = MULTIPLY16H(tmp11,         FIX_0_707106781);
+
+        z11 = tmp7 + z3;
+        z13 = tmp7 - z3;
+
+        dataptr[4] = z13 + z2;
+        dataptr[5] = z13 - z2;
+        dataptr[6] = z11 + z4;
+        dataptr[7] = z11 - z4;
+
+        pixels++;               // advance pointer to next column
+        dataptr += DCTSIZE;
+    }
+}
diff --git a/libavfilter/vf_fspp.h b/libavfilter/vf_fsppdsp.h
similarity index 52%
rename from libavfilter/vf_fspp.h
rename to libavfilter/vf_fsppdsp.h
index ee7de3ffef..c441b75094 100644
--- a/libavfilter/vf_fspp.h
+++ b/libavfilter/vf_fsppdsp.h
@@ -20,56 +20,17 @@
  * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
  */
 
-#ifndef AVFILTER_FSPP_H
-#define AVFILTER_FSPP_H
+#ifndef AVFILTER_FSPPDSP_H
+#define AVFILTER_FSPPDSP_H
 
-#include "libavutil/video_enc_params.h"
-#include "avfilter.h"
+#include <stddef.h>
+#include <stdint.h>
 
-#define BLOCKSZ 12
-#define MAX_LEVEL 5
+#include "config.h"
 
-#define DCTSIZE 8
-#define DCTSIZE_S "8"
-
-#define FIX(x,s)  ((x) * (1 << s) + 0.5)
-
-#define MULTIPLY16H(x,k)   (((x) * (k)) >> 16)
-#define THRESHOLD(r,x,t)                         \
-    if(((unsigned)((x) + t)) > t * 2) r = (x);   \
-    else r = 0;
-#define DESCALE(x,n)  (((x) + (1 << ((n) - 1))) >> n)
-
-typedef int32_t int_simd16_t;
-static const int16_t FIX_0_382683433   = FIX(0.382683433, 14);
-static const int16_t FIX_0_541196100   = FIX(0.541196100, 14);
-static const int16_t FIX_0_707106781   = FIX(M_SQRT1_2  , 14);
-static const int16_t FIX_1_306562965   = FIX(1.306562965, 14);
-static const int16_t FIX_1_414213562_A = FIX(M_SQRT2    , 14);
-static const int16_t FIX_1_847759065   = FIX(1.847759065, 13);
-static const int16_t FIX_2_613125930   = FIX(-2.613125930, 13);
-static const int16_t FIX_1_414213562   = FIX(M_SQRT2    , 13);
-static const int16_t FIX_1_082392200   = FIX(1.082392200, 13);
-
-typedef struct FSPPContext {
-    AVClass *class;
-    uint64_t threshold_mtx_noq[8 * 2];
-    uint64_t threshold_mtx[8 * 2];        //used in both C & MMX (& later SSE2) versions
-
-    int log2_count;
-    int strength;
-    int hsub;
-    int vsub;
-    int temp_stride;
-    int qp;
-    enum AVVideoEncParamsType qscale_type;
-    int prev_q;
-    uint8_t *src;
-    int16_t *temp;
-    int8_t  *non_b_qp_table;
-    int non_b_qp_stride;
-    int use_bframe_qp;
+#include "libavutil/attributes_internal.h"
 
+typedef struct FSPPDSPContext {
     void (*store_slice)(uint8_t *dst, int16_t *src,
                         ptrdiff_t dst_stride, ptrdiff_t src_stride,
                         ptrdiff_t width, ptrdiff_t height, ptrdiff_t log2_scale);
@@ -88,9 +49,35 @@ typedef struct FSPPContext {
 
     void (*row_fdct)(int16_t *data, const uint8_t *pixels,
                      ptrdiff_t line_size, int cnt);
+} FSPPDSPContext;
 
-} FSPPContext;
+FF_VISIBILITY_PUSH_HIDDEN
+void ff_store_slice_c(uint8_t *dst, int16_t *src,
+                      ptrdiff_t dst_stride, ptrdiff_t src_stride,
+                      ptrdiff_t width, ptrdiff_t height, ptrdiff_t log2_scale);
+void ff_store_slice2_c(uint8_t *dst, int16_t *src,
+                       ptrdiff_t dst_stride, ptrdiff_t src_stride,
+                       ptrdiff_t width, ptrdiff_t height, ptrdiff_t log2_scale);
+void ff_mul_thrmat_c(int16_t *thr_adr_noq, int16_t *thr_adr, int q);
+void ff_column_fidct_c(int16_t *thr_adr, int16_t *data, int16_t *output, int cnt);
+void ff_row_idct_c(int16_t *workspace, int16_t *output_adr, ptrdiff_t output_stride, int cnt);
+void ff_row_fdct_c(int16_t *data, const uint8_t *pixels, ptrdiff_t line_size, int cnt);
 
-void ff_fspp_init_x86(FSPPContext *fspp);
+void ff_fsppdsp_init_x86(FSPPDSPContext *fspp);
+FF_VISIBILITY_POP_HIDDEN
 
-#endif /* AVFILTER_FSPP_H */
+static inline void ff_fsppdsp_init(FSPPDSPContext *fspp)
+{
+    fspp->store_slice  = ff_store_slice_c;
+    fspp->store_slice2 = ff_store_slice2_c;
+    fspp->mul_thrmat   = ff_mul_thrmat_c;
+    fspp->column_fidct = ff_column_fidct_c;
+    fspp->row_idct     = ff_row_idct_c;
+    fspp->row_fdct     = ff_row_fdct_c;
+
+#if ARCH_X86
+    ff_fsppdsp_init_x86(fspp);
+#endif
+}
+
+#endif /* AVFILTER_FSPPDSP_H */
diff --git a/libavfilter/x86/vf_fspp_init.c b/libavfilter/x86/vf_fspp_init.c
index 8e00317cb7..2aadb50967 100644
--- a/libavfilter/x86/vf_fspp_init.c
+++ b/libavfilter/x86/vf_fspp_init.c
@@ -21,7 +21,7 @@
 
 #include "libavutil/attributes.h"
 #include "libavutil/x86/cpu.h"
-#include "libavfilter/vf_fspp.h"
+#include "libavfilter/vf_fsppdsp.h"
 
 void ff_store_slice_mmx(uint8_t *dst, int16_t *src,
                         ptrdiff_t dst_stride, ptrdiff_t src_stride,
@@ -34,7 +34,7 @@ void ff_column_fidct_mmx(int16_t *thr_adr, int16_t *data, int16_t *output, int c
 void ff_row_idct_mmx(int16_t *workspace, int16_t *output_adr, ptrdiff_t output_stride, int cnt);
 void ff_row_fdct_mmx(int16_t *data, const uint8_t *pixels, ptrdiff_t line_size, int cnt);
 
-av_cold void ff_fspp_init_x86(FSPPContext *s)
+av_cold void ff_fsppdsp_init_x86(FSPPDSPContext *s)
 {
     int cpu_flags = av_get_cpu_flags();
 
-- 
2.49.1


>From 4f3d8ea9d11842357998cca26f502831d5d5c9c0 Mon Sep 17 00:00:00 2001
From: Andreas Rheinhardt <[email protected]>
Date: Sun, 9 Nov 2025 17:22:21 +0100
Subject: [PATCH 02/23] avfilter/vf_fsppdsp: Use enum for constants

It means that the compiler does not have to optimize the static const
object away.

Signed-off-by: Andreas Rheinhardt <[email protected]>
---
 libavfilter/vf_fsppdsp.c | 23 +++++++++++++----------
 1 file changed, 13 insertions(+), 10 deletions(-)

diff --git a/libavfilter/vf_fsppdsp.c b/libavfilter/vf_fsppdsp.c
index ab31c77203..d2d04463b4 100644
--- a/libavfilter/vf_fsppdsp.c
+++ b/libavfilter/vf_fsppdsp.c
@@ -29,7 +29,7 @@
 
 #define DCTSIZE 8
 
-#define FIX(x,s)  ((x) * (1 << s) + 0.5)
+#define FIX(x,s)  (int)((x) * (1 << s) + 0.5)
 
 #define MULTIPLY16H(x,k)   (((x) * (k)) >> 16)
 #define THRESHOLD(r,x,t)                         \
@@ -38,15 +38,18 @@
 #define DESCALE(x,n)  (((x) + (1 << ((n) - 1))) >> n)
 
 typedef int32_t int_simd16_t;
-static const int16_t FIX_0_382683433   = FIX(0.382683433, 14);
-static const int16_t FIX_0_541196100   = FIX(0.541196100, 14);
-static const int16_t FIX_0_707106781   = FIX(M_SQRT1_2  , 14);
-static const int16_t FIX_1_306562965   = FIX(1.306562965, 14);
-static const int16_t FIX_1_414213562_A = FIX(M_SQRT2    , 14);
-static const int16_t FIX_1_847759065   = FIX(1.847759065, 13);
-static const int16_t FIX_2_613125930   = FIX(-2.613125930, 13);
-static const int16_t FIX_1_414213562   = FIX(M_SQRT2    , 13);
-static const int16_t FIX_1_082392200   = FIX(1.082392200, 13);
+
+enum {
+    FIX_0_382683433   = FIX(0.382683433, 14),
+    FIX_0_541196100   = FIX(0.541196100, 14),
+    FIX_0_707106781   = FIX(M_SQRT1_2  , 14),
+    FIX_1_306562965   = FIX(1.306562965, 14),
+    FIX_1_414213562_A = FIX(M_SQRT2    , 14),
+    FIX_1_847759065   = FIX(1.847759065, 13),
+    FIX_2_613125930   = FIX(-2.613125930, 13),
+    FIX_1_414213562   = FIX(M_SQRT2    , 13),
+    FIX_1_082392200   = FIX(1.082392200, 13),
+};
 
 DECLARE_ALIGNED(32, static const uint8_t, dither)[8][8] = {
     {  0,  48,  12,  60,   3,  51,  15,  63, },
-- 
2.49.1


>From 787c89a3ac68fa1d023e2f06c653b55ba26f0917 Mon Sep 17 00:00:00 2001
From: Andreas Rheinhardt <[email protected]>
Date: Sun, 9 Nov 2025 17:27:16 +0100
Subject: [PATCH 03/23] avfilter/x86/vf_fspp: Don't duplicate dither table

Reuse the one from vf_fsppdsp.c; also don't overalign said table too
much.

Signed-off-by: Andreas Rheinhardt <[email protected]>
---
 libavfilter/vf_fsppdsp.c    | 6 +++---
 libavfilter/vf_fsppdsp.h    | 2 ++
 libavfilter/x86/vf_fspp.asm | 9 +++------
 3 files changed, 8 insertions(+), 9 deletions(-)

diff --git a/libavfilter/vf_fsppdsp.c b/libavfilter/vf_fsppdsp.c
index d2d04463b4..b84d7b57bb 100644
--- a/libavfilter/vf_fsppdsp.c
+++ b/libavfilter/vf_fsppdsp.c
@@ -51,7 +51,7 @@ enum {
     FIX_1_082392200   = FIX(1.082392200, 13),
 };
 
-DECLARE_ALIGNED(32, static const uint8_t, dither)[8][8] = {
+DECLARE_ALIGNED(8, const uint8_t, ff_fspp_dither)[8][8] = {
     {  0,  48,  12,  60,   3,  51,  15,  63, },
     { 32,  16,  44,  28,  35,  19,  47,  31, },
     {  8,  56,   4,  52,  11,  59,   7,  55, },
@@ -74,7 +74,7 @@ void ff_store_slice_c(uint8_t *dst, int16_t *src,
     dst[x + pos] = temp;
 
     for (int y = 0; y < height; y++) {
-        const uint8_t *d = dither[y];
+        const uint8_t *d = ff_fspp_dither[y];
         for (int x = 0; x < width; x += 8) {
             int temp;
             STORE(0);
@@ -103,7 +103,7 @@ void ff_store_slice2_c(uint8_t *dst, int16_t *src,
     dst[x + pos] = temp;
 
     for (int y = 0; y < height; y++) {
-        const uint8_t *d = dither[y];
+        const uint8_t *d = ff_fspp_dither[y];
         for (int x = 0; x < width; x += 8) {
             int temp;
             STORE2(0);
diff --git a/libavfilter/vf_fsppdsp.h b/libavfilter/vf_fsppdsp.h
index c441b75094..0dbd628abf 100644
--- a/libavfilter/vf_fsppdsp.h
+++ b/libavfilter/vf_fsppdsp.h
@@ -52,6 +52,8 @@ typedef struct FSPPDSPContext {
 } FSPPDSPContext;
 
 FF_VISIBILITY_PUSH_HIDDEN
+extern const uint8_t ff_fspp_dither[8][8];
+
 void ff_store_slice_c(uint8_t *dst, int16_t *src,
                       ptrdiff_t dst_stride, ptrdiff_t src_stride,
                       ptrdiff_t width, ptrdiff_t height, ptrdiff_t log2_scale);
diff --git a/libavfilter/x86/vf_fspp.asm b/libavfilter/x86/vf_fspp.asm
index c7f8f64f1b..0ea6216193 100644
--- a/libavfilter/x86/vf_fspp.asm
+++ b/libavfilter/x86/vf_fspp.asm
@@ -25,10 +25,7 @@
 
 SECTION_RODATA
 
-pb_dither: db 0,  48,  12,  60,   3,  51,  15,  63, 32,  16,  44,  28,  35,  19,  47,  31, \
-              8,  56,   4,  52,  11,  59,   7,  55, 40,  24,  36,  20,  43,  27,  39,  23, \
-              2,  50,  14,  62,   1,  49,  13,  61, 34,  18,  46,  30,  33,  17,  45,  29, \
-             10,  58,   6,  54,   9,  57,   5,  53, 42,  26,  38,  22,  41,  25,  37,  21
+cextern fspp_dither
 pw_187E: times 4 dw 0x187E ; FIX64(0.382683433, 14)
 pw_22A3: times 4 dw 0x22A3 ; FIX64(1.082392200, 13)
 pw_2D41: times 4 dw 0x2D41 ; FIX64(1.414213562, 13)
@@ -73,7 +70,7 @@ cglobal store_slice, 2, 7, 0, dst, src, width, dither_height, dither, tmp, tmp2
     sub       tmp2q, widthq
     movd      m2, ditherd ; log2_scale
     add       tmp2q, tmp2q
-    lea       ditherq, [pb_dither]
+    lea       ditherq, [fspp_dither]
     mov       src_strideq, tmp2q
     shl       tmpq, 4
     lea       dither_heightq, [ditherq+dither_heightq*8]
@@ -139,7 +136,7 @@ cglobal store_slice2, 0, 7, 0, dst, src, width, dither_height, dither, tmp, tmp2
     sub       tmp2q, widthq
     movd      m2, ditherd ; log2_scale
     add       tmp2q, tmp2q
-    lea       ditherq, [pb_dither]
+    lea       ditherq, [fspp_dither]
     mov       src_strideq, tmp2q
     shl       tmpq, 5
     lea       dither_heightq, [ditherq+dither_heightq*8]
-- 
2.49.1


>From 659b75505b3b0e03a20701f7f8ebf77dd954205b Mon Sep 17 00:00:00 2001
From: Andreas Rheinhardt <[email protected]>
Date: Sun, 9 Nov 2025 18:50:48 +0100
Subject: [PATCH 04/23] tests/checkasm: Add vf_fspp mul_thrmat test

Signed-off-by: Andreas Rheinhardt <[email protected]>
---
 tests/checkasm/Makefile   |  1 +
 tests/checkasm/checkasm.c |  3 +++
 tests/checkasm/checkasm.h |  1 +
 tests/checkasm/vf_fspp.c  | 52 +++++++++++++++++++++++++++++++++++++++
 tests/fate/checkasm.mak   |  1 +
 5 files changed, 58 insertions(+)
 create mode 100644 tests/checkasm/vf_fspp.c

diff --git a/tests/checkasm/Makefile b/tests/checkasm/Makefile
index e47070d90f..6636bc7774 100644
--- a/tests/checkasm/Makefile
+++ b/tests/checkasm/Makefile
@@ -64,6 +64,7 @@ AVFILTEROBJS-$(CONFIG_BWDIF_FILTER)      += vf_bwdif.o
 AVFILTEROBJS-$(CONFIG_COLORDETECT_FILTER)+= vf_colordetect.o
 AVFILTEROBJS-$(CONFIG_COLORSPACE_FILTER) += vf_colorspace.o
 AVFILTEROBJS-$(CONFIG_EQ_FILTER)         += vf_eq.o
+AVFILTEROBJS-$(CONFIG_FSPP_FILTER)       += vf_fspp.o
 AVFILTEROBJS-$(CONFIG_GBLUR_FILTER)      += vf_gblur.o
 AVFILTEROBJS-$(CONFIG_HFLIP_FILTER)      += vf_hflip.o
 AVFILTEROBJS-$(CONFIG_IDET_FILTER)       += vf_idet.o
diff --git a/tests/checkasm/checkasm.c b/tests/checkasm/checkasm.c
index 4469e043f5..20d8f19757 100644
--- a/tests/checkasm/checkasm.c
+++ b/tests/checkasm/checkasm.c
@@ -297,6 +297,9 @@ static const struct {
     #if CONFIG_EQ_FILTER
         { "vf_eq", checkasm_check_vf_eq },
     #endif
+    #if CONFIG_FSPP_FILTER
+        { "vf_fspp", checkasm_check_vf_fspp },
+    #endif
     #if CONFIG_GBLUR_FILTER
         { "vf_gblur", checkasm_check_vf_gblur },
     #endif
diff --git a/tests/checkasm/checkasm.h b/tests/checkasm/checkasm.h
index e1ccd4011b..45cd23cac4 100644
--- a/tests/checkasm/checkasm.h
+++ b/tests/checkasm/checkasm.h
@@ -148,6 +148,7 @@ void checkasm_check_v210enc(void);
 void checkasm_check_vc1dsp(void);
 void checkasm_check_vf_bwdif(void);
 void checkasm_check_vf_eq(void);
+void checkasm_check_vf_fspp(void);
 void checkasm_check_vf_gblur(void);
 void checkasm_check_vf_hflip(void);
 void checkasm_check_vf_threshold(void);
diff --git a/tests/checkasm/vf_fspp.c b/tests/checkasm/vf_fspp.c
new file mode 100644
index 0000000000..a84ae8d5af
--- /dev/null
+++ b/tests/checkasm/vf_fspp.c
@@ -0,0 +1,52 @@
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with FFmpeg; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ */
+
+#include "checkasm.h"
+#include "libavfilter/vf_fsppdsp.h"
+
+#define randomize_buffers(buf)                           \
+    do {                                                 \
+        for (size_t j = 0; j < FF_ARRAY_ELEMS(buf); ++j) \
+            buf[j] = rnd();                              \
+    } while (0)
+
+
+static void check_mul_thrmat(void)
+{
+    FSPPDSPContext fspp;
+    int16_t src[64];
+    int16_t dst_ref[64], dst_new[64];
+    const int q = (uint8_t)rnd();
+    declare_func_emms(AV_CPU_FLAG_MMX, void, int16_t *thr_adr_noq, int16_t *thr_adr, int q);
+
+    ff_fsppdsp_init(&fspp);
+
+    if (check_func(fspp.mul_thrmat, "mul_thrmat")) {
+        randomize_buffers(src);
+        call_ref(src, dst_ref, q);
+        call_new(src, dst_new, q);
+        if (memcmp(dst_ref, dst_new, sizeof(dst_ref)))
+            fail();
+        bench_new(src, dst_new, q);
+    }
+}
+
+void checkasm_check_vf_fspp(void)
+{
+    check_mul_thrmat();
+}
diff --git a/tests/fate/checkasm.mak b/tests/fate/checkasm.mak
index ca1cd0dea3..2be880c8db 100644
--- a/tests/fate/checkasm.mak
+++ b/tests/fate/checkasm.mak
@@ -67,6 +67,7 @@ FATE_CHECKASM = fate-checkasm-aacencdsp                                 \
                 fate-checkasm-vf_colordetect                            \
                 fate-checkasm-vf_colorspace                             \
                 fate-checkasm-vf_eq                                     \
+                fate-checkasm-vf_fspp                                   \
                 fate-checkasm-vf_gblur                                  \
                 fate-checkasm-vf_hflip                                  \
                 fate-checkasm-vf_nlmeans                                \
-- 
2.49.1


>From bd0b98cc10caea569331eae8fd1af13d4d546ddb Mon Sep 17 00:00:00 2001
From: Andreas Rheinhardt <[email protected]>
Date: Sun, 9 Nov 2025 19:10:30 +0100
Subject: [PATCH 05/23] avfilter/x86/vf_fspp: Port mul_thrmat to SSE2

This fixes an ABI violation, as mul_thrmat did not issue emms.
It seems that this ABI violation could reach the user, namely
if ff_get_video_buffer() fails. Notice that ff_get_video_buffer()
itself could fail because of this, namely if the allocator uses
floating point registers.

On x64 (where GCC already used SSE2 in the C version)
mul_thrmat_c:                                            4.4 ( 1.00x)
mul_thrmat_mmx:                                          8.6 ( 0.52x)
mul_thrmat_sse2:                                         4.4 ( 1.00x)

On 32bit (where SSE2 is not known to be available):
mul_thrmat_c:                                           56.0 ( 1.00x)
mul_thrmat_sse2:                                         6.0 ( 9.40x)

Signed-off-by: Andreas Rheinhardt <[email protected]>
---
 libavfilter/vf_fspp.c          |  5 +-
 libavfilter/vf_fsppdsp.h       |  3 +-
 libavfilter/x86/vf_fspp.asm    | 84 +++++++++++++---------------------
 libavfilter/x86/vf_fspp_init.c |  6 ++-
 tests/checkasm/vf_fspp.c       |  8 ++--
 5 files changed, 45 insertions(+), 61 deletions(-)

diff --git a/libavfilter/vf_fspp.c b/libavfilter/vf_fspp.c
index 9371c63e77..fa562cbd45 100644
--- a/libavfilter/vf_fspp.c
+++ b/libavfilter/vf_fspp.c
@@ -54,8 +54,6 @@
 
 typedef struct FSPPContext {
     const struct AVClass *class;
-    uint64_t threshold_mtx_noq[8 * 2];
-    uint64_t threshold_mtx[8 * 2];        //used in both C & MMX (& later SSE2) versions
 
     int log2_count;
     int strength;
@@ -72,6 +70,9 @@ typedef struct FSPPContext {
     int use_bframe_qp;
 
     FSPPDSPContext dsp;
+
+    DECLARE_ALIGNED(16, uint64_t, threshold_mtx_noq)[8 * 2];
+    DECLARE_ALIGNED(16, uint64_t, threshold_mtx)[8 * 2];
 } FSPPContext;
 
 
diff --git a/libavfilter/vf_fsppdsp.h b/libavfilter/vf_fsppdsp.h
index 0dbd628abf..e87fa6861c 100644
--- a/libavfilter/vf_fsppdsp.h
+++ b/libavfilter/vf_fsppdsp.h
@@ -39,7 +39,8 @@ typedef struct FSPPDSPContext {
                          ptrdiff_t dst_stride, ptrdiff_t src_stride,
                          ptrdiff_t width, ptrdiff_t height, ptrdiff_t log2_scale);
 
-    void (*mul_thrmat)(int16_t *thr_adr_noq, int16_t *thr_adr, int q);
+    void (*mul_thrmat)(int16_t *thr_adr_noq /* align 16 */,
+                       int16_t *thr_adr /* align 16 */, int q);
 
     void (*column_fidct)(int16_t *thr_adr, int16_t *data,
                          int16_t *output, int cnt);
diff --git a/libavfilter/x86/vf_fspp.asm b/libavfilter/x86/vf_fspp.asm
index 0ea6216193..c9408978d8 100644
--- a/libavfilter/x86/vf_fspp.asm
+++ b/libavfilter/x86/vf_fspp.asm
@@ -177,59 +177,36 @@ cglobal store_slice2, 0, 7, 0, dst, src, width, dither_height, dither, tmp, tmp2
     jl .loop_height
     RET
 
-;void ff_mul_thrmat_mmx(int16_t *thr_adr_noq, int16_t *thr_adr, int q);
-cglobal mul_thrmat, 3, 3, 0, thrn, thr, q
-    movd      m7, qd
-    movq      m0, [thrnq]
-    punpcklwd m7, m7
-    movq      m1, [thrnq+8]
-    punpckldq m7, m7
-    pmullw    m0, m7
-    movq      m2, [thrnq+8*2]
-    pmullw    m1, m7
-    movq      m3, [thrnq+8*3]
-    pmullw    m2, m7
-    movq      [thrq], m0
-    movq      m4, [thrnq+8*4]
-    pmullw    m3, m7
-    movq      [thrq+8], m1
-    movq      m5, [thrnq+8*5]
-    pmullw    m4, m7
-    movq      [thrq+8*2], m2
-    movq      m6, [thrnq+8*6]
-    pmullw    m5, m7
-    movq      [thrq+8*3], m3
-    movq      m0, [thrnq+8*7]
-    pmullw    m6, m7
-    movq      [thrq+8*4], m4
-    movq      m1, [thrnq+8*7+8]
-    pmullw    m0, m7
-    movq      [thrq+8*5], m5
-    movq      m2, [thrnq+8*7+8*2]
-    pmullw    m1, m7
-    movq      [thrq+8*6], m6
-    movq      m3, [thrnq+8*7+8*3]
-    pmullw    m2, m7
-    movq      [thrq+8*7], m0
-    movq      m4, [thrnq+8*7+8*4]
-    pmullw    m3, m7
-    movq      [thrq+8*7+8], m1
-    movq      m5, [thrnq+8*7+8*5]
-    pmullw    m4, m7
-    movq      [thrq+8*7+8*2], m2
-    movq      m6, [thrnq+8*7+8*6]
-    pmullw    m5, m7
-    movq      [thrq+8*7+8*3], m3
-    movq      m0, [thrnq+14*8]
-    pmullw    m6, m7
-    movq      [thrq+8*7+8*4], m4
-    movq      m1, [thrnq+14*8+8]
-    pmullw    m0, m7
-    movq      [thrq+8*7+8*5], m5
-    pmullw    m1, m7
-    movq      [thrq+8*7+8*6], m6
-    movq      [thrq+14*8], m0
-    movq      [thrq+14*8+8], m1
+;void ff_mul_thrmat_sse2(int16_t *thr_adr_noq, int16_t *thr_adr, int q);
+INIT_XMM sse2
+cglobal mul_thrmat, 3, 3, 5, thrn, thr, q
+    movd      m4, qd
+    mova      m0, [thrnq]
+    punpcklwd m4, m4
+    mova      m1, [thrnq+16]
+    pshufd    m4, m4, 0
+    pmullw    m0, m4
+    mova      m2, [thrnq+16*2]
+    pmullw    m1, m4
+    mova      m3, [thrnq+16*3]
+    pmullw    m2, m4
+    mova      [thrq], m0
+    mova      m0, [thrnq+16*4]
+    pmullw    m3, m4
+    mova      [thrq+16], m1
+    mova      m1, [thrnq+16*5]
+    pmullw    m0, m4
+    mova      [thrq+16*2], m2
+    mova      m2, [thrnq+16*6]
+    pmullw    m1, m4
+    mova      [thrq+16*3], m3
+    mova      m3, [thrnq+16*7]
+    pmullw    m2, m4
+    mova      [thrq+16*4], m0
+    pmullw    m3, m4
+    mova      [thrq+16*5], m1
+    mova      [thrq+16*6], m2
+    mova      [thrq+16*7], m3
     RET
 
 %macro COLUMN_FDCT 1-3 0, 0
@@ -457,6 +434,7 @@ cglobal mul_thrmat, 3, 3, 0, thrn, thr, q
     add       outq, 8+%1
 %endmacro
 
+INIT_MMX mmx
 ;void ff_column_fidct_mmx(int16_t *thr_adr, int16_t *data, int16_t *output, int cnt);
 cglobal column_fidct, 4, 5, 0, 32, thr, src, out, cnt, tmp
 .fdct1:
diff --git a/libavfilter/x86/vf_fspp_init.c b/libavfilter/x86/vf_fspp_init.c
index 2aadb50967..9f6095ce24 100644
--- a/libavfilter/x86/vf_fspp_init.c
+++ b/libavfilter/x86/vf_fspp_init.c
@@ -29,7 +29,7 @@ void ff_store_slice_mmx(uint8_t *dst, int16_t *src,
 void ff_store_slice2_mmx(uint8_t *dst, int16_t *src,
                          ptrdiff_t dst_stride, ptrdiff_t src_stride,
                          ptrdiff_t width, ptrdiff_t height, ptrdiff_t log2_scale);
-void ff_mul_thrmat_mmx(int16_t *thr_adr_noq, int16_t *thr_adr, int q);
+void ff_mul_thrmat_sse2(int16_t *thr_adr_noq, int16_t *thr_adr, int q);
 void ff_column_fidct_mmx(int16_t *thr_adr, int16_t *data, int16_t *output, int cnt);
 void ff_row_idct_mmx(int16_t *workspace, int16_t *output_adr, ptrdiff_t output_stride, int cnt);
 void ff_row_fdct_mmx(int16_t *data, const uint8_t *pixels, ptrdiff_t line_size, int cnt);
@@ -41,9 +41,11 @@ av_cold void ff_fsppdsp_init_x86(FSPPDSPContext *s)
     if (EXTERNAL_MMX(cpu_flags)) {
         s->store_slice  = ff_store_slice_mmx;
         s->store_slice2 = ff_store_slice2_mmx;
-        s->mul_thrmat   = ff_mul_thrmat_mmx;
         s->column_fidct = ff_column_fidct_mmx;
         s->row_idct     = ff_row_idct_mmx;
         s->row_fdct     = ff_row_fdct_mmx;
     }
+    if (EXTERNAL_SSE2(cpu_flags)) {
+        s->mul_thrmat   = ff_mul_thrmat_sse2;
+    }
 }
diff --git a/tests/checkasm/vf_fspp.c b/tests/checkasm/vf_fspp.c
index a84ae8d5af..117e1c670e 100644
--- a/tests/checkasm/vf_fspp.c
+++ b/tests/checkasm/vf_fspp.c
@@ -18,6 +18,7 @@
 
 #include "checkasm.h"
 #include "libavfilter/vf_fsppdsp.h"
+#include "libavutil/mem_internal.h"
 
 #define randomize_buffers(buf)                           \
     do {                                                 \
@@ -29,10 +30,11 @@
 static void check_mul_thrmat(void)
 {
     FSPPDSPContext fspp;
-    int16_t src[64];
-    int16_t dst_ref[64], dst_new[64];
+    DECLARE_ALIGNED(16, int16_t, src)[64];
+    DECLARE_ALIGNED(16, int16_t, dst_ref)[64];
+    DECLARE_ALIGNED(16, int16_t, dst_new)[64];
     const int q = (uint8_t)rnd();
-    declare_func_emms(AV_CPU_FLAG_MMX, void, int16_t *thr_adr_noq, int16_t *thr_adr, int q);
+    declare_func(void, int16_t *thr_adr_noq, int16_t *thr_adr, int q);
 
     ff_fsppdsp_init(&fspp);
 
-- 
2.49.1


>From cd9e9ca3c1126d820bf6108c939b9911f2e72bd9 Mon Sep 17 00:00:00 2001
From: Andreas Rheinhardt <[email protected]>
Date: Mon, 10 Nov 2025 12:54:31 +0100
Subject: [PATCH 06/23] avfilter/vf_fsppdsp: Use standard clamping

This is obviously what is intended and what the MMX code does;
yet I cannot rule out that it changes the output for some inputs:
I have observed individual src values which would lead to temp
values just above 512 if they came in pairs (i.e. if both inputs
were simultaneously huge).

Signed-off-by: Andreas Rheinhardt <[email protected]>
---
 libavfilter/vf_fsppdsp.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/libavfilter/vf_fsppdsp.c b/libavfilter/vf_fsppdsp.c
index b84d7b57bb..f3f7c87174 100644
--- a/libavfilter/vf_fsppdsp.c
+++ b/libavfilter/vf_fsppdsp.c
@@ -24,6 +24,7 @@
 
 #include "vf_fsppdsp.h"
 
+#include "libavutil/common.h"
 #include "libavutil/mathematics.h"
 #include "libavutil/mem_internal.h"
 
@@ -70,7 +71,7 @@ void ff_store_slice_c(uint8_t *dst, int16_t *src,
 #define STORE(pos)                                                             \
     temp = (src[x + pos] + (d[pos] >> log2_scale)) >> (6 - log2_scale);        \
     src[x + pos] = src[x + pos - 8 * src_stride] = 0;                          \
-    if (temp & 0x100) temp = ~(temp >> 31);                                    \
+    temp = av_clip_uint8(temp);                                                \
     dst[x + pos] = temp;
 
     for (int y = 0; y < height; y++) {
@@ -99,7 +100,7 @@ void ff_store_slice2_c(uint8_t *dst, int16_t *src,
 #define STORE2(pos)                                                                                       \
     temp = (src[x + pos] + src[x + pos + 16 * src_stride] + (d[pos] >> log2_scale)) >> (6 - log2_scale);  \
     src[x + pos + 16 * src_stride] = 0;                                                                   \
-    if (temp & 0x100) temp = ~(temp >> 31);                                                               \
+    temp = av_clip_uint8(temp);                                                                           \
     dst[x + pos] = temp;
 
     for (int y = 0; y < height; y++) {
-- 
2.49.1


>From c90066ba04c4f8ff8471f99d80c8cda68a491b63 Mon Sep 17 00:00:00 2001
From: Andreas Rheinhardt <[email protected]>
Date: Mon, 10 Nov 2025 21:57:45 +0100
Subject: [PATCH 07/23] tests/checkasm/vf_fspp: Test store_slice

Signed-off-by: Andreas Rheinhardt <[email protected]>
---
 tests/checkasm/vf_fspp.c | 77 ++++++++++++++++++++++++++++++++++++++++
 1 file changed, 77 insertions(+)

diff --git a/tests/checkasm/vf_fspp.c b/tests/checkasm/vf_fspp.c
index 117e1c670e..eab62c9450 100644
--- a/tests/checkasm/vf_fspp.c
+++ b/tests/checkasm/vf_fspp.c
@@ -16,8 +16,12 @@
  * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
  */
 
+#include <stddef.h>
+#include <stdint.h>
+
 #include "checkasm.h"
 #include "libavfilter/vf_fsppdsp.h"
+#include "libavcodec/mathops.h"
 #include "libavutil/mem_internal.h"
 
 #define randomize_buffers(buf)                           \
@@ -26,6 +30,78 @@
             buf[j] = rnd();                              \
     } while (0)
 
+#define randomize_mask_buffers(buf, buf2, nb_elems, nb_bits)\
+    do {                                                    \
+        for (size_t j = 0; j < nb_elems; ++j)               \
+            buf[j] = buf2[j] = sign_extend(rnd(), nb_bits); \
+    } while (0)
+
+static void check_store_slice(void)
+{
+    enum {
+        MAX_WIDTH  = 256,
+        /// in elements, not in bytes; 32 is arbitrary
+        MAX_STRIDE = MAX_WIDTH + 32,
+        MAX_HEIGHT = 8,
+    };
+    FSPPDSPContext fspp;
+    ff_fsppdsp_init(&fspp);
+    declare_func_emms(AV_CPU_FLAG_MMX, void, uint8_t *dst, int16_t *src,
+                      ptrdiff_t dst_stride, ptrdiff_t src_stride,
+                      ptrdiff_t width, ptrdiff_t height, ptrdiff_t log2_scale);
+
+    for (int i = 0; i < 2; ++i) {
+        if (check_func(i ? fspp.store_slice2 : fspp.store_slice, "store_slice%s", i ? "2" : "")) {
+            // store slice resets the row eight lines above the current one
+            DECLARE_ALIGNED(16, int16_t, src_ref1)[MAX_STRIDE * ( 8 + MAX_HEIGHT - 1) + MAX_WIDTH];
+            DECLARE_ALIGNED(16, int16_t, src_new1)[MAX_STRIDE * ( 8 + MAX_HEIGHT - 1) + MAX_WIDTH];
+            // store_slice2 resets the row 16 lines below the current one
+            DECLARE_ALIGNED(16, int16_t, src_ref2)[MAX_STRIDE * (16 + MAX_HEIGHT - 1) + MAX_WIDTH];
+            DECLARE_ALIGNED(16, int16_t, src_new2)[MAX_STRIDE * (16 + MAX_HEIGHT - 1) + MAX_WIDTH];
+            uint8_t dstbuf_new[MAX_STRIDE * (MAX_HEIGHT - 1) + MAX_WIDTH], dstbuf_ref[MAX_STRIDE * (MAX_HEIGHT - 1) + MAX_WIDTH];
+            uint8_t *dst_new = dstbuf_new, *dst_ref = dstbuf_ref;
+            int16_t *src_ref, *src_new, *or_src_ref, *or_src_new;
+            ptrdiff_t      width = 1 + rnd() % MAX_WIDTH;
+            ptrdiff_t src_stride = FFALIGN(width + 1 + rnd() % (MAX_STRIDE - MAX_WIDTH), 8);
+            ptrdiff_t dst_stride = FFALIGN(width + 1 + rnd() % (MAX_STRIDE - MAX_WIDTH), 8);
+            ptrdiff_t height = 1 + rnd() % 8;
+            size_t nb_elems;
+
+            if (i) {
+                src_ref      = src_ref2;
+                src_new      = src_new2;
+                or_src_ref   = src_ref2;
+                or_src_new   = src_new2;
+                nb_elems     = FF_ARRAY_ELEMS(src_ref2);
+            } else {
+                src_ref      = src_ref1 + 8 * src_stride;
+                src_new      = src_new1 + 8 * src_stride;
+                or_src_ref   = src_ref1;
+                or_src_new   = src_new1;
+                nb_elems     = FF_ARRAY_ELEMS(src_ref1);
+            }
+            if (rnd() & 1) {
+                dst_ref    += dst_stride * (height - 1);
+                dst_new    += dst_stride * (height - 1);
+                dst_stride *= -1;
+            }
+            randomize_buffers(dstbuf_new);
+            memcpy(dstbuf_ref, dstbuf_new, sizeof(dstbuf_ref));
+            randomize_mask_buffers(or_src_ref, or_src_new, nb_elems, 14);
+
+            ptrdiff_t log2_scale = rnd() & 1;
+            call_ref(dst_ref, src_ref, dst_stride, src_stride, width, height, log2_scale);
+            call_new(dst_new, src_new, dst_stride, src_stride, width, height, log2_scale);
+            if (memcmp(dstbuf_new, dstbuf_ref, sizeof(dstbuf_ref)) ||
+                memcmp(or_src_ref, or_src_new, sizeof(*or_src_new) * nb_elems))
+                fail();
+            // don't use random parameters for benchmarks
+            src_ref = or_src_ref + !i * 8 * MAX_STRIDE;
+            bench_new(dstbuf_new, src_ref,
+                      MAX_STRIDE, MAX_STRIDE, MAX_WIDTH, 8, 1);
+        }
+    }
+}
 
 static void check_mul_thrmat(void)
 {
@@ -50,5 +126,6 @@ static void check_mul_thrmat(void)
 
 void checkasm_check_vf_fspp(void)
 {
+    check_store_slice();
     check_mul_thrmat();
 }
-- 
2.49.1


>From e67ee1a479f984274016ffacc71aae7ac636417c Mon Sep 17 00:00:00 2001
From: Andreas Rheinhardt <[email protected]>
Date: Mon, 10 Nov 2025 22:06:34 +0100
Subject: [PATCH 08/23] avfilter/x86/vf_fspp: Port store_slice to SSE2

Old benchmarks:
store_slice_c:                                        2798.3 ( 1.00x)
store_slice_mmx:                                       950.2 ( 2.94x)
store_slice2_c:                                       3811.7 ( 1.00x)
store_slice2_mmx:                                      682.3 ( 5.59x)

New benchmarks:
store_slice_c:                                        2797.2 ( 1.00x)
store_slice_sse2:                                      543.5 ( 5.15x)
store_slice2_c:                                       3817.0 ( 1.00x)
store_slice2_sse2:                                     408.2 ( 9.35x)

Signed-off-by: Andreas Rheinhardt <[email protected]>
---
 libavfilter/vf_fsppdsp.h       |  4 +-
 libavfilter/x86/vf_fspp.asm    | 70 +++++++++++++---------------------
 libavfilter/x86/vf_fspp_init.c | 12 +++---
 3 files changed, 34 insertions(+), 52 deletions(-)

diff --git a/libavfilter/vf_fsppdsp.h b/libavfilter/vf_fsppdsp.h
index e87fa6861c..b440809f02 100644
--- a/libavfilter/vf_fsppdsp.h
+++ b/libavfilter/vf_fsppdsp.h
@@ -31,11 +31,11 @@
 #include "libavutil/attributes_internal.h"
 
 typedef struct FSPPDSPContext {
-    void (*store_slice)(uint8_t *dst, int16_t *src,
+    void (*store_slice)(uint8_t *dst, int16_t *src /* align 16 */,
                         ptrdiff_t dst_stride, ptrdiff_t src_stride,
                         ptrdiff_t width, ptrdiff_t height, ptrdiff_t 
log2_scale);
 
-    void (*store_slice2)(uint8_t *dst, int16_t *src,
+    void (*store_slice2)(uint8_t *dst, int16_t *src /* align 16 */,
                          ptrdiff_t dst_stride, ptrdiff_t src_stride,
                          ptrdiff_t width, ptrdiff_t height, ptrdiff_t 
log2_scale);
 
diff --git a/libavfilter/x86/vf_fspp.asm b/libavfilter/x86/vf_fspp.asm
index c9408978d8..489e69f8ce 100644
--- a/libavfilter/x86/vf_fspp.asm
+++ b/libavfilter/x86/vf_fspp.asm
@@ -43,15 +43,15 @@ SECTION .text
 
 %define DCTSIZE 8
 
-INIT_MMX mmx
+INIT_XMM sse2
 
-;void ff_store_slice_mmx(uint8_t *dst, int16_t *src,
-;                        ptrdiff_t dst_stride, ptrdiff_t src_stride,
-;                        ptrdiff_t width, ptrdiff_t height, ptrdiff_t 
log2_scale)
+;void ff_store_slice_sse2(uint8_t *dst, int16_t *src,
+;                         ptrdiff_t dst_stride, ptrdiff_t src_stride,
+;                         ptrdiff_t width, ptrdiff_t height, ptrdiff_t 
log2_scale)
 %if ARCH_X86_64
-cglobal store_slice, 7, 9, 0, dst, src, dst_stride, src_stride, width, 
dither_height, dither, tmp, tmp2
+cglobal store_slice, 7, 9, 5, dst, src, dst_stride, src_stride, width, 
dither_height, dither, tmp, tmp2
 %else
-cglobal store_slice, 2, 7, 0, dst, src, width, dither_height, dither, tmp, tmp2
+cglobal store_slice, 2, 7, 5, dst, src, width, dither_height, dither, tmp, tmp2
 %define dst_strideq r2m
 %define src_strideq r3m
     mov       widthq, r4m
@@ -62,7 +62,7 @@ cglobal store_slice, 2, 7, 0, dst, src, width, dither_height, 
dither, tmp, tmp2
     mov       tmpq, src_strideq
     and       widthq, ~7
     sub       dst_strideq, widthq
-    movd      m5, ditherd ; log2_scale
+    movd      m4, ditherd ; log2_scale
     xor       ditherq, -1 ; log2_scale
     mov       tmp2q, tmpq
     add       ditherq, 7 ; log2_scale
@@ -74,29 +74,21 @@ cglobal store_slice, 2, 7, 0, dst, src, width, 
dither_height, dither, tmp, tmp2
     mov       src_strideq, tmp2q
     shl       tmpq, 4
     lea       dither_heightq, [ditherq+dither_heightq*8]
-    pxor      m7, m7
+    pxor      m1, m1
 
 .loop_height:
     movq      m3, [ditherq]
-    movq      m4, m3
-    punpcklbw m3, m7
-    punpckhbw m4, m7
+    punpcklbw m3, m1
     mov       tmp2q, widthq
-    psraw     m3, m5
-    psraw     m4, m5
+    psraw     m3, m4
 
 .loop_width:
-    movq      [srcq+tmpq], m7
-    movq      m0, [srcq]
-    movq      m1, [srcq+8]
-    movq      [srcq+tmpq+8], m7
+    mova      m0, [srcq]
+    mova      [srcq+tmpq], m1
     paddw     m0, m3
-    paddw     m1, m4
-    movq      [srcq], m7
+    mova      [srcq], m1
     psraw     m0, m2
-    psraw     m1, m2
-    movq      [srcq+8], m7
-    packuswb  m0, m1
+    packuswb  m0, m0
     add       srcq, 16
     movq      [dstq], m0
     add       dstq, 8
@@ -110,13 +102,13 @@ cglobal store_slice, 2, 7, 0, dst, src, width, 
dither_height, dither, tmp, tmp2
     jl .loop_height
     RET
 
-;void ff_store_slice2_mmx(uint8_t *dst, int16_t *src,
-;                         ptrdiff_t dst_stride, ptrdiff_t src_stride,
-;                         ptrdiff_t width, ptrdiff_t height, ptrdiff_t 
log2_scale)
+;void ff_store_slice2_sse2(uint8_t *dst, int16_t *src,
+;                          ptrdiff_t dst_stride, ptrdiff_t src_stride,
+;                          ptrdiff_t width, ptrdiff_t height, ptrdiff_t 
log2_scale)
 %if ARCH_X86_64
-cglobal store_slice2, 7, 9, 0, dst, src, dst_stride, src_stride, width, 
dither_height, dither, tmp, tmp2
+cglobal store_slice2, 7, 9, 5, dst, src, dst_stride, src_stride, width, 
dither_height, dither, tmp, tmp2
 %else
-cglobal store_slice2, 0, 7, 0, dst, src, width, dither_height, dither, tmp, 
tmp2
+cglobal store_slice2, 0, 7, 5, dst, src, width, dither_height, dither, tmp, 
tmp2
 %define dst_strideq r2m
 %define src_strideq r3m
     mov       dstq, dstm
@@ -129,7 +121,7 @@ cglobal store_slice2, 0, 7, 0, dst, src, width, 
dither_height, dither, tmp, tmp2
     mov       tmpq, src_strideq
     and       widthq, ~7
     sub       dst_strideq, widthq
-    movd      m5, ditherd ; log2_scale
+    movd      m4, ditherd ; log2_scale
     xor       ditherq, -1 ; log2_scale
     mov       tmp2q, tmpq
     add       ditherq, 7 ; log2_scale
@@ -140,30 +132,21 @@ cglobal store_slice2, 0, 7, 0, dst, src, width, 
dither_height, dither, tmp, tmp2
     mov       src_strideq, tmp2q
     shl       tmpq, 5
     lea       dither_heightq, [ditherq+dither_heightq*8]
-    pxor      m7, m7
+    pxor      m1, m1
 
 .loop_height:
     movq      m3, [ditherq]
-    movq      m4, m3
-    punpcklbw m3, m7
-    punpckhbw m4, m7
+    punpcklbw m3, m1
     mov       tmp2q,widthq
-    psraw     m3, m5
-    psraw     m4, m5
+    psraw     m3, m4
 
 .loop_width:
-    movq      m0, [srcq]
-    movq      m1, [srcq+8]
+    mova      m0, [srcq]
     paddw     m0, m3
     paddw     m0, [srcq+tmpq]
-    paddw     m1, m4
-    movq      m6, [srcq+tmpq+8]
-    movq      [srcq+tmpq], m7
+    mova      [srcq+tmpq], m1
     psraw     m0, m2
-    paddw     m1, m6
-    movq      [srcq+tmpq+8], m7
-    psraw     m1, m2
-    packuswb  m0, m1
+    packuswb  m0, m0
     movq      [dstq], m0
     add       srcq, 16
     add       dstq, 8
@@ -178,7 +161,6 @@ cglobal store_slice2, 0, 7, 0, dst, src, width, 
dither_height, dither, tmp, tmp2
     RET
 
 ;void ff_mul_thrmat_sse2(int16_t *thr_adr_noq, int16_t *thr_adr, int q);
-INIT_XMM sse2
 cglobal mul_thrmat, 3, 3, 5, thrn, thr, q
     movd      m4, qd
     mova      m0, [thrnq]
diff --git a/libavfilter/x86/vf_fspp_init.c b/libavfilter/x86/vf_fspp_init.c
index 9f6095ce24..ee875547d2 100644
--- a/libavfilter/x86/vf_fspp_init.c
+++ b/libavfilter/x86/vf_fspp_init.c
@@ -23,12 +23,12 @@
 #include "libavutil/x86/cpu.h"
 #include "libavfilter/vf_fsppdsp.h"
 
-void ff_store_slice_mmx(uint8_t *dst, int16_t *src,
-                        ptrdiff_t dst_stride, ptrdiff_t src_stride,
-                        ptrdiff_t width, ptrdiff_t height, ptrdiff_t 
log2_scale);
-void ff_store_slice2_mmx(uint8_t *dst, int16_t *src,
+void ff_store_slice_sse2(uint8_t *dst, int16_t *src,
                          ptrdiff_t dst_stride, ptrdiff_t src_stride,
                          ptrdiff_t width, ptrdiff_t height, ptrdiff_t 
log2_scale);
+void ff_store_slice2_sse2(uint8_t *dst, int16_t *src,
+                          ptrdiff_t dst_stride, ptrdiff_t src_stride,
+                          ptrdiff_t width, ptrdiff_t height, ptrdiff_t 
log2_scale);
 void ff_mul_thrmat_sse2(int16_t *thr_adr_noq, int16_t *thr_adr, int q);
 void ff_column_fidct_mmx(int16_t *thr_adr, int16_t *data, int16_t *output, int 
cnt);
 void ff_row_idct_mmx(int16_t *workspace, int16_t *output_adr, ptrdiff_t 
output_stride, int cnt);
@@ -39,13 +39,13 @@ av_cold void ff_fsppdsp_init_x86(FSPPDSPContext *s)
     int cpu_flags = av_get_cpu_flags();
 
     if (EXTERNAL_MMX(cpu_flags)) {
-        s->store_slice  = ff_store_slice_mmx;
-        s->store_slice2 = ff_store_slice2_mmx;
         s->column_fidct = ff_column_fidct_mmx;
         s->row_idct     = ff_row_idct_mmx;
         s->row_fdct     = ff_row_fdct_mmx;
     }
     if (EXTERNAL_SSE2(cpu_flags)) {
+        s->store_slice  = ff_store_slice_sse2;
+        s->store_slice2 = ff_store_slice2_sse2;
         s->mul_thrmat   = ff_mul_thrmat_sse2;
     }
 }
-- 
2.49.1


>From d1b45c85cef16657579aafe61eebaf23c816c75d Mon Sep 17 00:00:00 2001
From: Andreas Rheinhardt <[email protected]>
Date: Mon, 10 Nov 2025 23:03:23 +0100
Subject: [PATCH 09/23] avfilter/vf_fsppdsp: Use restrict

This is possible because the requirements of restrict are fulfilled;
it is also beneficial both performance- and code-size-wise.
For GCC 14 (with -O3), this reduced code size by 26750B
here; for Clang 20, it was 432B.

Old benchmarks:
mul_thrmat_c:                                            4.3 ( 1.00x)
mul_thrmat_sse2:                                         4.3 ( 1.00x)
store_slice_c:                                        2810.8 ( 1.00x)
store_slice_sse2:                                      542.5 ( 5.18x)
store_slice2_c:                                       3817.0 ( 1.00x)
store_slice2_sse2:                                     410.4 ( 9.30x)

New benchmarks:
mul_thrmat_c:                                            4.3 ( 1.00x)
mul_thrmat_sse2:                                         4.3 ( 1.00x)
store_slice_c:                                        1510.1 ( 1.00x)
store_slice_sse2:                                      545.2 ( 2.77x)
store_slice2_c:                                       1763.5 ( 1.00x)
store_slice2_sse2:                                     408.3 ( 4.32x)

Signed-off-by: Andreas Rheinhardt <[email protected]>
---
 libavfilter/vf_fsppdsp.c | 15 +++++++++------
 libavfilter/vf_fsppdsp.h | 31 +++++++++++++++++--------------
 2 files changed, 26 insertions(+), 20 deletions(-)

diff --git a/libavfilter/vf_fsppdsp.c b/libavfilter/vf_fsppdsp.c
index f3f7c87174..583571bf94 100644
--- a/libavfilter/vf_fsppdsp.c
+++ b/libavfilter/vf_fsppdsp.c
@@ -64,7 +64,7 @@ DECLARE_ALIGNED(8, const uint8_t, ff_fspp_dither)[8][8] = {
 };
 
 //This func reads from 1 slice, 1 and clears 0 & 1
-void ff_store_slice_c(uint8_t *dst, int16_t *src,
+void ff_store_slice_c(uint8_t *restrict dst, int16_t *restrict src,
                       ptrdiff_t dst_stride, ptrdiff_t src_stride,
                       ptrdiff_t width, ptrdiff_t height, ptrdiff_t log2_scale)
 {
@@ -93,7 +93,7 @@ void ff_store_slice_c(uint8_t *dst, int16_t *src,
 }
 
 //This func reads from 2 slices, 0 & 2  and clears 2-nd
-void ff_store_slice2_c(uint8_t *dst, int16_t *src,
+void ff_store_slice2_c(uint8_t *restrict dst, int16_t *restrict src,
                        ptrdiff_t dst_stride, ptrdiff_t src_stride,
                        ptrdiff_t width, ptrdiff_t height, ptrdiff_t log2_scale)
 {
@@ -121,13 +121,14 @@ void ff_store_slice2_c(uint8_t *dst, int16_t *src,
     }
 }
 
-void ff_mul_thrmat_c(int16_t *thr_adr_noq, int16_t *thr_adr, int q)
+void ff_mul_thrmat_c(int16_t *restrict thr_adr_noq, int16_t *restrict thr_adr, 
int q)
 {
     for (int a = 0; a < 64; a++)
         thr_adr[a] = q * thr_adr_noq[a];
 }
 
-void ff_column_fidct_c(int16_t *thr_adr, int16_t *data, int16_t *output, int 
cnt)
+void ff_column_fidct_c(int16_t *restrict thr_adr, int16_t *restrict data,
+                       int16_t *restrict output, int cnt)
 {
     int_simd16_t tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
     int_simd16_t tmp10, tmp11, tmp12, tmp13;
@@ -249,7 +250,8 @@ void ff_column_fidct_c(int16_t *thr_adr, int16_t *data, 
int16_t *output, int cnt
     }
 }
 
-void ff_row_idct_c(int16_t *workspace, int16_t *output_adr, ptrdiff_t 
output_stride, int cnt)
+void ff_row_idct_c(int16_t *restrict workspace, int16_t *restrict output_adr,
+                   ptrdiff_t output_stride, int cnt)
 {
     int_simd16_t tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
     int_simd16_t tmp10, tmp11, tmp12, tmp13;
@@ -311,7 +313,8 @@ void ff_row_idct_c(int16_t *workspace, int16_t *output_adr, 
ptrdiff_t output_str
     }
 }
 
-void ff_row_fdct_c(int16_t *data, const uint8_t *pixels, ptrdiff_t line_size, 
int cnt)
+void ff_row_fdct_c(int16_t *restrict data, const uint8_t *restrict pixels,
+                   ptrdiff_t line_size, int cnt)
 {
     int_simd16_t tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
     int_simd16_t tmp10, tmp11, tmp12, tmp13;
diff --git a/libavfilter/vf_fsppdsp.h b/libavfilter/vf_fsppdsp.h
index b440809f02..66030da4b1 100644
--- a/libavfilter/vf_fsppdsp.h
+++ b/libavfilter/vf_fsppdsp.h
@@ -31,40 +31,43 @@
 #include "libavutil/attributes_internal.h"
 
 typedef struct FSPPDSPContext {
-    void (*store_slice)(uint8_t *dst, int16_t *src /* align 16 */,
+    void (*store_slice)(uint8_t *restrict dst, int16_t *restrict src /* align 
16 */,
                         ptrdiff_t dst_stride, ptrdiff_t src_stride,
                         ptrdiff_t width, ptrdiff_t height, ptrdiff_t 
log2_scale);
 
-    void (*store_slice2)(uint8_t *dst, int16_t *src /* align 16 */,
+    void (*store_slice2)(uint8_t *restrict dst, int16_t *restrict src /* align 
16 */,
                          ptrdiff_t dst_stride, ptrdiff_t src_stride,
                          ptrdiff_t width, ptrdiff_t height, ptrdiff_t 
log2_scale);
 
-    void (*mul_thrmat)(int16_t *thr_adr_noq /* align 16 */,
-                       int16_t *thr_adr /* align 16 */, int q);
+    void (*mul_thrmat)(int16_t *restrict thr_adr_noq /* align 16 */,
+                       int16_t *restrict thr_adr /* align 16 */, int q);
 
-    void (*column_fidct)(int16_t *thr_adr, int16_t *data,
-                         int16_t *output, int cnt);
+    void (*column_fidct)(int16_t *restrict thr_adr, int16_t *data,
+                         int16_t *restrict output, int cnt);
 
-    void (*row_idct)(int16_t *workspace, int16_t *output_adr,
+    void (*row_idct)(int16_t *restrict workspace, int16_t *restrict output_adr,
                      ptrdiff_t output_stride, int cnt);
 
-    void (*row_fdct)(int16_t *data, const uint8_t *pixels,
+    void (*row_fdct)(int16_t *restrict data, const uint8_t *restrict pixels,
                      ptrdiff_t line_size, int cnt);
 } FSPPDSPContext;
 
 FF_VISIBILITY_PUSH_HIDDEN
 extern const uint8_t ff_fspp_dither[8][8];
 
-void ff_store_slice_c(uint8_t *dst, int16_t *src,
+void ff_store_slice_c(uint8_t *restrict dst, int16_t *restrict src,
                       ptrdiff_t dst_stride, ptrdiff_t src_stride,
                       ptrdiff_t width, ptrdiff_t height, ptrdiff_t log2_scale);
-void ff_store_slice2_c(uint8_t *dst, int16_t *src,
+void ff_store_slice2_c(uint8_t *restrict dst, int16_t *restrict src,
                        ptrdiff_t dst_stride, ptrdiff_t src_stride,
                        ptrdiff_t width, ptrdiff_t height, ptrdiff_t 
log2_scale);
-void ff_mul_thrmat_c(int16_t *thr_adr_noq, int16_t *thr_adr, int q);
-void ff_column_fidct_c(int16_t *thr_adr, int16_t *data, int16_t *output, int 
cnt);
-void ff_row_idct_c(int16_t *workspace, int16_t *output_adr, ptrdiff_t 
output_stride, int cnt);
-void ff_row_fdct_c(int16_t *data, const uint8_t *pixels, ptrdiff_t line_size, 
int cnt);
+void ff_mul_thrmat_c(int16_t *restrict thr_adr_noq, int16_t *restrict thr_adr, 
int q);
+void ff_column_fidct_c(int16_t *restrict thr_adr, int16_t *restrict data,
+                       int16_t *restrict output, int cnt);
+void ff_row_idct_c(int16_t *restrict workspace, int16_t *restrict output_adr,
+                   ptrdiff_t output_stride, int cnt);
+void ff_row_fdct_c(int16_t *restrict data, const uint8_t *restrict pixels,
+                   ptrdiff_t line_size, int cnt);
 
 void ff_fsppdsp_init_x86(FSPPDSPContext *fspp);
 FF_VISIBILITY_POP_HIDDEN
-- 
2.49.1


>From 50665134560dd4f7bc70dcde7b6c0c64af53a14b Mon Sep 17 00:00:00 2001
From: Andreas Rheinhardt <[email protected]>
Date: Wed, 12 Nov 2025 14:21:09 +0100
Subject: [PATCH 10/23] avfilter/vf_fsppdsp: Reduce discrepancies between C
 code and x86 asm

The x86 assembly uses the following pattern to zero all
the values x with -threshold <= x < threshold:
    x -= threshold;
    x satu+= threshold (unsigned saturated addition)
    x += threshold
    x satu-= threshold (unsigned saturated subtraction)
The reference C code meanwhile zeroed everything
with abs <= threshold. This commit makes the C code behave
like the x86 assembly to reduce discrepancies between the two.

An alternative would be to require SSSE3, so that
one can use pabsw, pcmpgtw for abs>threshold, followed by
a pand with the original data. Or one could modify the thresholds
to make both equal.

Signed-off-by: Andreas Rheinhardt <[email protected]>
---
 libavfilter/vf_fsppdsp.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/libavfilter/vf_fsppdsp.c b/libavfilter/vf_fsppdsp.c
index 583571bf94..e530bcd06b 100644
--- a/libavfilter/vf_fsppdsp.c
+++ b/libavfilter/vf_fsppdsp.c
@@ -34,7 +34,7 @@
 
 #define MULTIPLY16H(x,k)   (((x) * (k)) >> 16)
 #define THRESHOLD(r,x,t)                         \
-    if(((unsigned)((x) + t)) > t * 2) r = (x);   \
+    if (((unsigned)((x) + t)) >= t * 2) r = (x); \
     else r = 0;
 #define DESCALE(x,n)  (((x) + (1 << ((n) - 1))) >> n)
 
-- 
2.49.1


>From ab13abb61eae8bec9f90f26020c30824e4ef175e Mon Sep 17 00:00:00 2001
From: Andreas Rheinhardt <[email protected]>
Date: Wed, 12 Nov 2025 18:44:49 +0100
Subject: [PATCH 11/23] avfilter/x86/vf_fspp: Make ff_column_fidct_mmx()
 bitexact

It currently is not, because the shortcut mode rounds differently
from both the C code and the non-shortcut code path.

Signed-off-by: Andreas Rheinhardt <[email protected]>
---
 libavfilter/x86/vf_fspp.asm | 36 ++++++++++++++++++------------------
 1 file changed, 18 insertions(+), 18 deletions(-)

diff --git a/libavfilter/x86/vf_fspp.asm b/libavfilter/x86/vf_fspp.asm
index 489e69f8ce..2f49945c13 100644
--- a/libavfilter/x86/vf_fspp.asm
+++ b/libavfilter/x86/vf_fspp.asm
@@ -33,9 +33,6 @@ pw_539F: times 4 dw 0x539F ; FIX64(1.306562965, 14)
 pw_5A82: times 4 dw 0x5A82 ; FIX64(1.414213562, 14)
 pw_3B21: times 4 dw 0x3B21 ; FIX64(1.847759065, 13)
 pw_AC62: times 4 dw 0xAC62 ; FIX64(-2.613125930, 13)
-pw_3642: times 4 dw 0x3642 ; FIX64(0.847759065, 14)
-pw_2441: times 4 dw 0x2441 ; FIX64(0.566454497, 14)
-pw_0CBB: times 4 dw 0x0CBB ; FIX64(0.198912367, 14)
 pw_4:    times 4 dw 4
 pw_2:    times 4 dw 2
 
@@ -315,31 +312,34 @@ cglobal mul_thrmat, 3, 3, 5, thrn, thr, q
     or        tmpd, tmpd
     jnz %1
     movq      m4, [rsp]
-    movq      m1, m0
-    pmulhw    m0, [pw_3642]
-    movq      m2, m1
-    movq      m5, [outq+DCTSIZE*0*2]
-    movq      m3, m2
-    pmulhw    m1, [pw_2441]
+    psraw     m3, m0, 2
+    psllw     m0, 1
+    mova      m5, [outq+DCTSIZE*0*2]
+    pmulhw    m1, m0, [pw_3B21]
+    pmulhw    m2, m0, [pw_22A3]
+    pmulhw    m0, [pw_2D41]
     paddw     m5, m4
     movq      m6, [rsp+8]
-    psraw     m3, 2
-    pmulhw    m2, [pw_0CBB]
+    psubw     m2, m1
     psubw     m4, m3
     movq      m7, [outq+DCTSIZE*1*2]
     paddw     m5, m3
-    movq      [outq+DCTSIZE*7*2], m4
+    psubw     m1, m3
+    mova      [outq+DCTSIZE*7*2], m4
+    psubw     m0, m1
+    paddw     m2, m0
+    mova      [outq+DCTSIZE*0*2], m5
     paddw     m7, m6
     movq      m3, [rsp+8*2]
-    psubw     m6, m0
+    psubw     m6, m1
     movq      m4, [outq+DCTSIZE*2*2]
-    paddw     m7, m0
+    paddw     m7, m1
     movq      [outq], m5
     paddw     m4, m3
     movq      [outq+DCTSIZE*6*2], m6
-    psubw     m3, m1
+    psubw     m3, m0
     movq      m5, [outq+DCTSIZE*5*2]
-    paddw     m4, m1
+    paddw     m4, m0
     movq      m6, [outq+DCTSIZE*3*2]
     paddw     m5, m3
     movq      m0, [rsp+8*3]
@@ -347,9 +347,9 @@ cglobal mul_thrmat, 3, 3, 5, thrn, thr, q
     movq      [outq+DCTSIZE*1*2], m7
     paddw     m6, m0
     movq      [outq+DCTSIZE*2*2], m4
-    psubw     m0, m2
+    paddw     m0, m2
     movq      m7, [outq+DCTSIZE*4*2]
-    paddw     m6, m2
+    psubw     m6, m2
     movq      [outq+DCTSIZE*5*2], m5
     paddw     m7, m0
     movq      [outq+DCTSIZE*3*2], m6
-- 
2.49.1


>From 7b5a0acf7916e700e59a0e54401e0c4eb5f5e672 Mon Sep 17 00:00:00 2001
From: Andreas Rheinhardt <[email protected]>
Date: Wed, 12 Nov 2025 19:39:35 +0100
Subject: [PATCH 12/23] avfilter/x86/vf_fspp: Put shifts into constants

Fold left shifts that precede a pmulhw into pre-scaled constants.
This avoids some shift instructions and also gives us more headroom
in the registers. In fact, I have proven to myself that everything
that is supposed to fit into 16 bits now actually does so.

Signed-off-by: Andreas Rheinhardt <[email protected]>
---
 libavfilter/x86/vf_fspp.asm | 36 +++++++++++++++++-------------------
 1 file changed, 17 insertions(+), 19 deletions(-)

diff --git a/libavfilter/x86/vf_fspp.asm b/libavfilter/x86/vf_fspp.asm
index 2f49945c13..f61efc99f8 100644
--- a/libavfilter/x86/vf_fspp.asm
+++ b/libavfilter/x86/vf_fspp.asm
@@ -27,10 +27,13 @@ SECTION_RODATA
 
 cextern fspp_dither
 pw_187E: times 4 dw 0x187E ; FIX64(0.382683433, 14)
+pw_61F8: times 4 dw 0x61F8 ; 4*FIX(0.382683433, 14)
 pw_22A3: times 4 dw 0x22A3 ; FIX64(1.082392200, 13)
+pw_4546: times 4 dw 0x4546 ; 2*FIX(1.082392200, 13)
 pw_2D41: times 4 dw 0x2D41 ; FIX64(1.414213562, 13)
 pw_539F: times 4 dw 0x539F ; FIX64(1.306562965, 14)
 pw_5A82: times 4 dw 0x5A82 ; FIX64(1.414213562, 14)
+pw_7642: times 4 dw 0x7642 ; 2*FIX(1.847759065, 13)
 pw_3B21: times 4 dw 0x3B21 ; FIX64(1.847759065, 13)
 pw_AC62: times 4 dw 0xAC62 ; FIX64(-2.613125930, 13)
 pw_4:    times 4 dw 4
@@ -211,12 +214,12 @@ cglobal mul_thrmat, 3, 3, 5, thrn, thr, q
     psubw     m2, m6
     paddw     m7, m1
     movq      m6, [thrq+4*16+%2]
-    psllw     m7, 2
+    psllw     m7, 1
     psubw     m5, [thrq+%2]
     psubw     m2, m6
     paddusw   m5, [thrq+%2]
     paddusw   m2, m6
-    pmulhw    m7, [pw_2D41]
+    pmulhw    m7, [pw_5A82]
     paddw     m5, [thrq+%2]
     paddw     m2, m6
     psubusw   m5, [thrq+%2]
@@ -261,15 +264,14 @@ cglobal mul_thrmat, 3, 3, 5, thrn, thr, q
     paddw     m4, m0
     movq      m7, m3
     psubw     m3, m4
-    psllw     m3, 2
-    psllw     m7, 2
-    pmulhw    m3, [pw_187E]
+    psllw     m7, 1
+    pmulhw    m3, [pw_61F8]
     psllw     m4, 2
-    pmulhw    m7, [pw_22A3]
-    psllw     m2, 2
+    pmulhw    m7, [pw_4546]
+    psllw     m2, 1
     pmulhw    m4, [pw_539F]
     paddw     m5, m1
-    pmulhw    m2, [pw_2D41]
+    pmulhw    m2, [pw_5A82]
     psubw     m6, m1
     paddw     m7, m3
     movq      [rsp+8], m5
@@ -313,11 +315,10 @@ cglobal mul_thrmat, 3, 3, 5, thrn, thr, q
     jnz %1
     movq      m4, [rsp]
     psraw     m3, m0, 2
-    psllw     m0, 1
     mova      m5, [outq+DCTSIZE*0*2]
-    pmulhw    m1, m0, [pw_3B21]
-    pmulhw    m2, m0, [pw_22A3]
-    pmulhw    m0, [pw_2D41]
+    pmulhw    m1, m0, [pw_7642]
+    pmulhw    m2, m0, [pw_4546]
+    pmulhw    m0, [pw_5A82]
     paddw     m5, m4
     movq      m6, [rsp+8]
     psubw     m2, m1
@@ -360,23 +361,20 @@ cglobal mul_thrmat, 3, 3, 5, thrn, thr, q
 %macro COLUMN_IDCT 0-1 0
     movq      m3, m5
     psubw     m5, m1
-    psllw     m5, 1
     paddw     m3, m1
     movq      m2, m0
     psubw     m0, m6
-    movq      m1, m5
-    psllw     m0, 1
+    psllw     m1, m5, 1
     pmulhw    m1, [pw_AC62]
     paddw     m5, m0
-    pmulhw    m5, [pw_3B21]
+    pmulhw    m5, [pw_7642]
     paddw     m2, m6
-    pmulhw    m0, [pw_22A3]
+    pmulhw    m0, [pw_4546]
     movq      m7, m2
     movq      m4, [rsp]
     psubw     m2, m3
-    psllw     m2, 1
     paddw     m7, m3
-    pmulhw    m2, [pw_2D41]
+    pmulhw    m2, [pw_5A82]
     movq      m6, m4
     psraw     m7, 2
     paddw     m4, [outq]
-- 
2.49.1


>From 55342723889a54740920fb24d8ef2f83a7ec5b80 Mon Sep 17 00:00:00 2001
From: Andreas Rheinhardt <[email protected]>
Date: Wed, 12 Nov 2025 21:03:06 +0100
Subject: [PATCH 13/23] tests/checkasm/vf_fspp: Add test for column_fidct

Signed-off-by: Andreas Rheinhardt <[email protected]>
---
 tests/checkasm/vf_fspp.c | 39 +++++++++++++++++++++++++++++++++++++++
 1 file changed, 39 insertions(+)

diff --git a/tests/checkasm/vf_fspp.c b/tests/checkasm/vf_fspp.c
index eab62c9450..f9e7b35e88 100644
--- a/tests/checkasm/vf_fspp.c
+++ b/tests/checkasm/vf_fspp.c
@@ -36,6 +36,12 @@
             buf[j] = buf2[j] = sign_extend(rnd(), nb_bits); \
     } while (0)
 
+#define randomize_buffer_range(buf, min, max)               \
+    do {                                                    \
+        for (size_t j = 0; j < FF_ARRAY_ELEMS(buf); ++j)    \
+            buf[j] = min + rnd() % (max - min + 1);         \
+    } while (0)
+
 static void check_store_slice(void)
 {
     enum {
@@ -124,8 +130,41 @@ static void check_mul_thrmat(void)
     }
 }
 
+static void check_column_fidct(void)
+{
+    enum {
+        NB_BLOCKS = 8, ///< arbitrary
+    };
+    FSPPDSPContext fspp;
+    declare_func_emms(AV_CPU_FLAG_MMX, void, int16_t *thr_adr, int16_t *data,
+                      int16_t *output, int cnt);
+
+    ff_fsppdsp_init(&fspp);
+
+    if (check_func(fspp.column_fidct, "column_fidct")) {
+        DECLARE_ALIGNED(16, int16_t, threshold)[64];
+        DECLARE_ALIGNED(16, int16_t, src)[8*(8*NB_BLOCKS + 6)];
+        DECLARE_ALIGNED(16, int16_t, dst_new)[8*(8*NB_BLOCKS + 6)];
+        DECLARE_ALIGNED(16, int16_t, dst_ref)[8*(8*NB_BLOCKS + 6)];
+
+        randomize_buffer_range(threshold, 0, INT16_MAX);
+        randomize_buffer_range(src, -1284, 1284);
+        randomize_buffers(dst_new);
+        memcpy(dst_ref, dst_new, sizeof(dst_ref));
+
+        call_ref(threshold, src, dst_ref, NB_BLOCKS * 8);
+        call_new(threshold, src, dst_new, NB_BLOCKS * 8);
+
+        if (memcmp(dst_new, dst_ref, sizeof(dst_new)))
+            fail();
+
+        bench_new(threshold, src, dst_new, NB_BLOCKS * 8);
+    }
+}
+
 void checkasm_check_vf_fspp(void)
 {
     check_store_slice();
     check_mul_thrmat();
+    check_column_fidct();
 }
-- 
2.49.1


>From 1c3f7376e10a466c018a51e589cbd9d46a1d3792 Mon Sep 17 00:00:00 2001
From: Andreas Rheinhardt <[email protected]>
Date: Wed, 12 Nov 2025 21:42:32 +0100
Subject: [PATCH 14/23] avfilter/x86/vf_fspp: Port ff_column_fidct_mmx() to
 SSE2

It gains a lot because it has to operate on eight words at a time,
which fit into a single XMM register; it also saves 608B of .text here.

Old benchmarks:
column_fidct_c:                                       3365.7 ( 1.00x)
column_fidct_mmx:                                     1784.6 ( 1.89x)

New benchmarks:
column_fidct_c:                                       3361.5 ( 1.00x)
column_fidct_sse2:                                     801.1 ( 4.20x)

Signed-off-by: Andreas Rheinhardt <[email protected]>
---
 libavfilter/x86/vf_fspp.asm    | 209 ++++++++++++++++-----------------
 libavfilter/x86/vf_fspp_init.c |   4 +-
 tests/checkasm/vf_fspp.c       |   4 +-
 3 files changed, 107 insertions(+), 110 deletions(-)

diff --git a/libavfilter/x86/vf_fspp.asm b/libavfilter/x86/vf_fspp.asm
index f61efc99f8..3f37911722 100644
--- a/libavfilter/x86/vf_fspp.asm
+++ b/libavfilter/x86/vf_fspp.asm
@@ -26,18 +26,18 @@
 SECTION_RODATA
 
 cextern fspp_dither
+pw_4546: times 8 dw 0x4546 ; FIX(1.082392200, 13)*2
+pw_61F8: times 8 dw 0x61F8 ; FIX(0.382683433, 14)*4
+pw_539F: times 8 dw 0x539F ; FIX(1.306562965, 14)
+pw_5A82: times 8 dw 0x5A82 ; FIX(1.414213562, 14)
+pw_7642: times 8 dw 0x7642 ; FIX(1.847759065, 13)*2
+pw_AC62: times 8 dw 0xAC62 ; FIX(-2.613125930, 13)
+pw_2:    times 8 dw 2
 pw_187E: times 4 dw 0x187E ; FIX64(0.382683433, 14)
-pw_61F8: times 4 dw 0x61F8 ; 4*FIX(0.382683433, 14)
 pw_22A3: times 4 dw 0x22A3 ; FIX64(1.082392200, 13)
-pw_4546: times 4 dw 0x4546 ; 2*FIX(1.082392200, 13)
 pw_2D41: times 4 dw 0x2D41 ; FIX64(1.414213562, 13)
-pw_539F: times 4 dw 0x539F ; FIX64(1.306562965, 14)
-pw_5A82: times 4 dw 0x5A82 ; FIX64(1.414213562, 14)
-pw_7642: times 4 dw 0x7642 ; 2*FIX(1.847759065, 13)
 pw_3B21: times 4 dw 0x3B21 ; FIX64(1.847759065, 13)
-pw_AC62: times 4 dw 0xAC62 ; FIX64(-2.613125930, 13)
 pw_4:    times 4 dw 4
-pw_2:    times 4 dw 2
 
 SECTION .text
 
@@ -191,82 +191,83 @@ cglobal mul_thrmat, 3, 3, 5, thrn, thr, q
     mova      [thrq+16*7], m3
     RET
 
-%macro COLUMN_FDCT 1-3 0, 0
-    movq      m1, [srcq+DCTSIZE*0*2]
-    movq      m7, [srcq+DCTSIZE*3*2]
-    movq      m0, m1
+%macro COLUMN_FDCT 1
+    mova      m1, [srcq+DCTSIZE*0*2]
+    mova      m7, [srcq+DCTSIZE*3*2]
+    mova      m0, m1
     paddw     m1, [srcq+DCTSIZE*7*2]
-    movq      m3, m7
+    mova      m3, m7
     paddw     m7, [srcq+DCTSIZE*4*2]
-    movq      m5, m1
-    movq      m6, [srcq+DCTSIZE*1*2]
+    mova      m5, m1
+    mova      m6, [srcq+DCTSIZE*1*2]
     psubw     m1, m7
-    movq      m2, [srcq+DCTSIZE*2*2]
-    movq      m4, m6
+    mova      m2, [srcq+DCTSIZE*2*2]
+    mova      m4, m6
     paddw     m6, [srcq+DCTSIZE*6*2]
     paddw     m5, m7
     paddw     m2, [srcq+DCTSIZE*5*2]
-    movq      m7, m6
+    mova      m7, m6
     paddw     m6, m2
     psubw     m7, m2
-    movq      m2, m5
+    mova      m2, m5
     paddw     m5, m6
     psubw     m2, m6
     paddw     m7, m1
-    movq      m6, [thrq+4*16+%2]
+    mova      m6, [thrq+4*16]
     psllw     m7, 1
-    psubw     m5, [thrq+%2]
+    psubw     m5, [thrq]
     psubw     m2, m6
-    paddusw   m5, [thrq+%2]
+    paddusw   m5, [thrq]
     paddusw   m2, m6
     pmulhw    m7, [pw_5A82]
-    paddw     m5, [thrq+%2]
+    paddw     m5, [thrq]
     paddw     m2, m6
-    psubusw   m5, [thrq+%2]
+    psubusw   m5, [thrq]
     psubusw   m2, m6
     paddw     m5, [pw_2]
-    movq      m6, m2
+    mova      m6, m2
     paddw     m2, m5
     psubw     m5, m6
-    movq      m6, m1
+    mova      m6, m1
     paddw     m1, m7
-    psubw     m1, [thrq+2*16+%2]
+    psubw     m1, [thrq+2*16]
     psubw     m6, m7
-    movq      m7, [thrq+6*16+%2]
+    mova      m7, [thrq+6*16]
     psraw     m5, 2
-    paddusw   m1, [thrq+2*16+%2]
+    paddusw   m1, [thrq+2*16]
     psubw     m6, m7
-    paddw     m1, [thrq+2*16+%2]
+    paddw     m1, [thrq+2*16]
     paddusw   m6, m7
-    psubusw   m1, [thrq+2*16+%2]
+    psubusw   m1, [thrq+2*16]
     paddw     m6, m7
     psubw     m3, [srcq+DCTSIZE*4*2]
     psubusw   m6, m7
-    movq      m7, m1
+    mova      m7, m1
     psraw     m2, 2
     psubw     m4, [srcq+DCTSIZE*6*2]
     psubw     m1, m6
     psubw     m0, [srcq+DCTSIZE*7*2]
     paddw     m6, m7
     psraw     m6, 2
-    movq      m7, m2
+    mova      m7, m2
     pmulhw    m1, [pw_5A82]
     paddw     m2, m6
-    movq      [rsp], m2
+    mova   [rsp], m2
     psubw     m7, m6
-    movq      m2, [srcq+DCTSIZE*2*2]
+    mova      m2, [srcq+DCTSIZE*2*2]
     psubw     m1, m6
     psubw     m2, [srcq+DCTSIZE*5*2]
-    movq      m6, m5
-    movq      [rsp+8*3], m7
+    mova      m6, m5
+    mova      [rsp+16*3], m7
     paddw     m3, m2
     paddw     m2, m4
     paddw     m4, m0
-    movq      m7, m3
+    mova      m7, m3
     psubw     m3, m4
     psllw     m7, 1
     pmulhw    m3, [pw_61F8]
     psllw     m4, 2
+    add     srcq, 32
     pmulhw    m7, [pw_4546]
     psllw     m2, 1
     pmulhw    m4, [pw_539F]
@@ -274,25 +275,25 @@ cglobal mul_thrmat, 3, 3, 5, thrn, thr, q
     pmulhw    m2, [pw_5A82]
     psubw     m6, m1
     paddw     m7, m3
-    movq      [rsp+8], m5
+    mova      [rsp+16], m5
     paddw     m4, m3
-    movq      m3, [thrq+3*16+%2]
-    movq      m1, m0
-    movq      [rsp+8*2], m6
+    mova      m3, [thrq+3*16]
+    mova      m1, m0
+    mova      [rsp+16*2], m6
     psubw     m1, m2
     paddw     m0, m2
-    movq      m5, m1
-    movq      m2, [thrq+5*16+%2]
+    mova      m5, m1
+    mova      m2, [thrq+5*16]
     psubw     m1, m7
     paddw     m5, m7
     psubw     m1, m3
-    movq      m7, [thrq+16+%2]
+    mova      m7, [thrq+16]
     psubw     m5, m2
-    movq      m6, m0
+    mova      m6, m0
     paddw     m0, m4
     paddusw   m1, m3
     psubw     m6, m4
-    movq      m4, [thrq+7*16+%2]
+    mova      m4, [thrq+7*16]
     psubw     m0, m7
     psubw     m6, m4
     paddusw   m5, m2
@@ -303,27 +304,32 @@ cglobal mul_thrmat, 3, 3, 5, thrn, thr, q
     psubusw   m1, m3
     psubusw   m5, m2
     psubusw   m6, m4
-    movq      m4, m1
+    mova      m4, m1
     por       m4, m5
     paddusw   m0, m7
     por       m4, m6
     paddw     m0, m7
     packssdw  m4, m4
     psubusw   m0, m7
-    movd      tmpd, m4
-    or        tmpd, tmpd
+%if ARCH_X86_64
+    movq    tmpq, m4
+%else
+    packssdw  m4, m4
+    movd    tmpd, m4
+%endif
+    or      tmpq, tmpq
     jnz %1
-    movq      m4, [rsp]
+    mova      m4, [rsp]
     psraw     m3, m0, 2
     mova      m5, [outq+DCTSIZE*0*2]
     pmulhw    m1, m0, [pw_7642]
     pmulhw    m2, m0, [pw_4546]
     pmulhw    m0, [pw_5A82]
     paddw     m5, m4
-    movq      m6, [rsp+8]
+    mova      m6, [rsp+16]
     psubw     m2, m1
     psubw     m4, m3
-    movq      m7, [outq+DCTSIZE*1*2]
+    mova      m7, [outq+DCTSIZE*1*2]
     paddw     m5, m3
     psubw     m1, m3
     mova      [outq+DCTSIZE*7*2], m4
@@ -331,38 +337,37 @@ cglobal mul_thrmat, 3, 3, 5, thrn, thr, q
     paddw     m2, m0
     mova      [outq+DCTSIZE*0*2], m5
     paddw     m7, m6
-    movq      m3, [rsp+8*2]
+    mova      m3, [rsp+16*2]
     psubw     m6, m1
-    movq      m4, [outq+DCTSIZE*2*2]
+    mova      m4, [outq+DCTSIZE*2*2]
     paddw     m7, m1
-    movq      [outq], m5
+    mova  [outq], m5
     paddw     m4, m3
-    movq      [outq+DCTSIZE*6*2], m6
+    mova      [outq+DCTSIZE*6*2], m6
     psubw     m3, m0
-    movq      m5, [outq+DCTSIZE*5*2]
+    mova      m5, [outq+DCTSIZE*5*2]
     paddw     m4, m0
-    movq      m6, [outq+DCTSIZE*3*2]
+    mova      m6, [outq+DCTSIZE*3*2]
     paddw     m5, m3
-    movq      m0, [rsp+8*3]
-    add       srcq, 8+%3
-    movq      [outq+DCTSIZE*1*2], m7
+    mova      m0, [rsp+16*3]
+    mova      [outq+DCTSIZE*1*2], m7
     paddw     m6, m0
-    movq      [outq+DCTSIZE*2*2], m4
+    mova      [outq+DCTSIZE*2*2], m4
     paddw     m0, m2
-    movq      m7, [outq+DCTSIZE*4*2]
+    mova      m7, [outq+DCTSIZE*4*2]
     psubw     m6, m2
-    movq      [outq+DCTSIZE*5*2], m5
+    mova      [outq+DCTSIZE*5*2], m5
     paddw     m7, m0
-    movq      [outq+DCTSIZE*3*2], m6
-    movq      [outq+DCTSIZE*4*2], m7
-    add       outq, 8+%3
+    mova      [outq+DCTSIZE*3*2], m6
+    mova      [outq+DCTSIZE*4*2], m7
+    add     outq, 32
 %endmacro
 
-%macro COLUMN_IDCT 0-1 0
-    movq      m3, m5
+%macro COLUMN_IDCT 0
+    mova      m3, m5
     psubw     m5, m1
     paddw     m3, m1
-    movq      m2, m0
+    mova      m2, m0
     psubw     m0, m6
     psllw     m1, m5, 1
     pmulhw    m1, [pw_AC62]
@@ -370,72 +375,64 @@ cglobal mul_thrmat, 3, 3, 5, thrn, thr, q
     pmulhw    m5, [pw_7642]
     paddw     m2, m6
     pmulhw    m0, [pw_4546]
-    movq      m7, m2
-    movq      m4, [rsp]
+    mova      m7, m2
+    mova      m4, [rsp]
     psubw     m2, m3
     paddw     m7, m3
     pmulhw    m2, [pw_5A82]
-    movq      m6, m4
+    mova      m6, m4
     psraw     m7, 2
     paddw     m4, [outq]
     psubw     m6, m7
-    movq      m3, [rsp+8]
+    mova      m3, [rsp+16]
     paddw     m4, m7
-    movq      [outq+DCTSIZE*7*2], m6
+    mova      [outq+DCTSIZE*7*2], m6
     paddw     m1, m5
-    movq      [outq], m4
+    mova  [outq], m4
     psubw     m1, m7
-    movq      m7, [rsp+8*2]
+    mova      m7, [rsp+16*2]
     psubw     m0, m5
-    movq      m6, [rsp+8*3]
-    movq      m5, m3
+    mova      m6, [rsp+16*3]
+    mova      m5, m3
     paddw     m3, [outq+DCTSIZE*1*2]
     psubw     m5, m1
     psubw     m2, m1
     paddw     m3, m1
-    movq      [outq+DCTSIZE*6*2], m5
-    movq      m4, m7
+    mova      [outq+DCTSIZE*6*2], m5
+    mova      m4, m7
     paddw     m7, [outq+DCTSIZE*2*2]
     psubw     m4, m2
     paddw     m4, [outq+DCTSIZE*5*2]
     paddw     m7, m2
-    movq      [outq+DCTSIZE*1*2], m3
+    mova      [outq+DCTSIZE*1*2], m3
     paddw     m0, m2
-    movq      [outq+DCTSIZE*2*2], m7
-    movq      m1, m6
+    mova      [outq+DCTSIZE*2*2], m7
+    mova      m1, m6
     paddw     m6, [outq+DCTSIZE*4*2]
     psubw     m1, m0
     paddw     m1, [outq+DCTSIZE*3*2]
     paddw     m6, m0
-    movq      [outq+DCTSIZE*5*2], m4
-    add       srcq, 8+%1
-    movq      [outq+DCTSIZE*4*2], m6
-    movq      [outq+DCTSIZE*3*2], m1
-    add       outq, 8+%1
+    mova      [outq+DCTSIZE*5*2], m4
+    mova      [outq+DCTSIZE*4*2], m6
+    mova      [outq+DCTSIZE*3*2], m1
+    add     outq, 32
 %endmacro
 
-INIT_MMX mmx
-;void ff_column_fidct_mmx(int16_t *thr_adr, int16_t *data, int16_t *output, 
int cnt);
-cglobal column_fidct, 4, 5, 0, 32, thr, src, out, cnt, tmp
-.fdct1:
-    COLUMN_FDCT .idct1
-    jmp .fdct2
+;void ff_column_fidct_sse2(int16_t *thr_adr, int16_t *data, int16_t *output, 
int cnt);
+cglobal column_fidct, 4, 5, 8, 64, thr, src, out, cnt, tmp
+.fdct:
+    COLUMN_FDCT .idct
+    sub    cntd, 2
+    jg .fdct
+    RET
 
-.idct1:
+.idct:
     COLUMN_IDCT
-
-.fdct2:
-    COLUMN_FDCT .idct2, 8, 16
     sub    cntd, 2
-    jg .fdct1
-    RET
-
-.idct2:
-    COLUMN_IDCT 16
-    sub    cntd, 2
-    jg .fdct1
+    jg .fdct
     RET
 
+INIT_MMX mmx
 ;void ff_row_idct_mmx(int16_t *workspace, int16_t *output_adr, ptrdiff_t 
output_stride, int cnt);
 cglobal row_idct, 4, 5, 0, 16, src, dst, stride, cnt, stride3
     add       strideq, strideq
diff --git a/libavfilter/x86/vf_fspp_init.c b/libavfilter/x86/vf_fspp_init.c
index ee875547d2..c7a9b1799e 100644
--- a/libavfilter/x86/vf_fspp_init.c
+++ b/libavfilter/x86/vf_fspp_init.c
@@ -30,7 +30,7 @@ void ff_store_slice2_sse2(uint8_t *dst, int16_t *src,
                           ptrdiff_t dst_stride, ptrdiff_t src_stride,
                           ptrdiff_t width, ptrdiff_t height, ptrdiff_t 
log2_scale);
 void ff_mul_thrmat_sse2(int16_t *thr_adr_noq, int16_t *thr_adr, int q);
-void ff_column_fidct_mmx(int16_t *thr_adr, int16_t *data, int16_t *output, int 
cnt);
+void ff_column_fidct_sse2(int16_t *thr_adr, int16_t *data, int16_t *output, 
int cnt);
 void ff_row_idct_mmx(int16_t *workspace, int16_t *output_adr, ptrdiff_t 
output_stride, int cnt);
 void ff_row_fdct_mmx(int16_t *data, const uint8_t *pixels, ptrdiff_t 
line_size, int cnt);
 
@@ -39,7 +39,6 @@ av_cold void ff_fsppdsp_init_x86(FSPPDSPContext *s)
     int cpu_flags = av_get_cpu_flags();
 
     if (EXTERNAL_MMX(cpu_flags)) {
-        s->column_fidct = ff_column_fidct_mmx;
         s->row_idct     = ff_row_idct_mmx;
         s->row_fdct     = ff_row_fdct_mmx;
     }
@@ -47,5 +46,6 @@ av_cold void ff_fsppdsp_init_x86(FSPPDSPContext *s)
         s->store_slice  = ff_store_slice_sse2;
         s->store_slice2 = ff_store_slice2_sse2;
         s->mul_thrmat   = ff_mul_thrmat_sse2;
+        s->column_fidct = ff_column_fidct_sse2;
     }
 }
diff --git a/tests/checkasm/vf_fspp.c b/tests/checkasm/vf_fspp.c
index f9e7b35e88..b65a46247d 100644
--- a/tests/checkasm/vf_fspp.c
+++ b/tests/checkasm/vf_fspp.c
@@ -136,8 +136,8 @@ static void check_column_fidct(void)
         NB_BLOCKS = 8, ///< arbitrary
     };
     FSPPDSPContext fspp;
-    declare_func_emms(AV_CPU_FLAG_MMX, void, int16_t *thr_adr, int16_t *data,
-                      int16_t *output, int cnt);
+    declare_func(void, int16_t *thr_adr, int16_t *data,
+                       int16_t *output, int cnt);
 
     ff_fsppdsp_init(&fspp);
 
-- 
2.49.1


>From ce16476ecb7cbf7496a4fe8ece6c8d77f5bc3f31 Mon Sep 17 00:00:00 2001
From: Andreas Rheinhardt <[email protected]>
Date: Wed, 12 Nov 2025 22:44:28 +0100
Subject: [PATCH 15/23] avfilter/x86/vf_fspp: Avoid stack on x64

Possible due to the number of vector registers available on x64.

Signed-off-by: Andreas Rheinhardt <[email protected]>
---
 libavfilter/x86/vf_fspp.asm | 78 ++++++++++++++++++++++++-------------
 1 file changed, 52 insertions(+), 26 deletions(-)

diff --git a/libavfilter/x86/vf_fspp.asm b/libavfilter/x86/vf_fspp.asm
index 3f37911722..cad44ed0bf 100644
--- a/libavfilter/x86/vf_fspp.asm
+++ b/libavfilter/x86/vf_fspp.asm
@@ -210,35 +210,47 @@ cglobal mul_thrmat, 3, 3, 5, thrn, thr, q
     paddw     m6, m2
     psubw     m7, m2
     mova      m2, m5
+%if ARCH_X86_64
+    mova      m8, [thrq]
+%define THRQ m8
+%else
+%define THRQ [thrq]
+%endif
     paddw     m5, m6
     psubw     m2, m6
     paddw     m7, m1
     mova      m6, [thrq+4*16]
     psllw     m7, 1
-    psubw     m5, [thrq]
+    psubw     m5, THRQ
     psubw     m2, m6
-    paddusw   m5, [thrq]
+    paddusw   m5, THRQ
     paddusw   m2, m6
-    pmulhw    m7, [pw_5A82]
-    paddw     m5, [thrq]
+    pmulhw    m7, SQRT2
+    paddw     m5, THRQ
     paddw     m2, m6
-    psubusw   m5, [thrq]
+    psubusw   m5, THRQ
     psubusw   m2, m6
     paddw     m5, [pw_2]
     mova      m6, m2
     paddw     m2, m5
+%if ARCH_X86_64
+    mova      m8, [thrq+2*16]
+%define THRQ m8
+%else
+%define THRQ [thrq+2*16]
+%endif
     psubw     m5, m6
     mova      m6, m1
     paddw     m1, m7
-    psubw     m1, [thrq+2*16]
+    psubw     m1, THRQ
     psubw     m6, m7
     mova      m7, [thrq+6*16]
     psraw     m5, 2
-    paddusw   m1, [thrq+2*16]
+    paddusw   m1, THRQ
     psubw     m6, m7
-    paddw     m1, [thrq+2*16]
+    paddw     m1, THRQ
     paddusw   m6, m7
-    psubusw   m1, [thrq+2*16]
+    psubusw   m1, THRQ
     paddw     m6, m7
     psubw     m3, [srcq+DCTSIZE*4*2]
     psubusw   m6, m7
@@ -250,15 +262,15 @@ cglobal mul_thrmat, 3, 3, 5, thrn, thr, q
     paddw     m6, m7
     psraw     m6, 2
     mova      m7, m2
-    pmulhw    m1, [pw_5A82]
+    pmulhw    m1, SQRT2
     paddw     m2, m6
-    mova   [rsp], m2
+    mova    tmp0, m2
     psubw     m7, m6
     mova      m2, [srcq+DCTSIZE*2*2]
     psubw     m1, m6
     psubw     m2, [srcq+DCTSIZE*5*2]
     mova      m6, m5
-    mova      [rsp+16*3], m7
+    mova    tmp3, m7
     paddw     m3, m2
     paddw     m2, m4
     paddw     m4, m0
@@ -272,14 +284,14 @@ cglobal mul_thrmat, 3, 3, 5, thrn, thr, q
     psllw     m2, 1
     pmulhw    m4, [pw_539F]
     paddw     m5, m1
-    pmulhw    m2, [pw_5A82]
+    pmulhw    m2, SQRT2
     psubw     m6, m1
     paddw     m7, m3
-    mova      [rsp+16], m5
+    mova    tmp1, m5
     paddw     m4, m3
     mova      m3, [thrq+3*16]
     mova      m1, m0
-    mova      [rsp+16*2], m6
+    mova    tmp2, m6
     psubw     m1, m2
     paddw     m0, m2
     mova      m5, m1
@@ -319,14 +331,14 @@ cglobal mul_thrmat, 3, 3, 5, thrn, thr, q
 %endif
     or      tmpq, tmpq
     jnz %1
-    mova      m4, [rsp]
+    mova      m4, tmp0
     psraw     m3, m0, 2
     mova      m5, [outq+DCTSIZE*0*2]
     pmulhw    m1, m0, [pw_7642]
     pmulhw    m2, m0, [pw_4546]
-    pmulhw    m0, [pw_5A82]
+    pmulhw    m0, SQRT2
     paddw     m5, m4
-    mova      m6, [rsp+16]
+    mova      m6, tmp1
     psubw     m2, m1
     psubw     m4, m3
     mova      m7, [outq+DCTSIZE*1*2]
@@ -337,7 +349,7 @@ cglobal mul_thrmat, 3, 3, 5, thrn, thr, q
     paddw     m2, m0
     mova      [outq+DCTSIZE*0*2], m5
     paddw     m7, m6
-    mova      m3, [rsp+16*2]
+    mova      m3, tmp2
     psubw     m6, m1
     mova      m4, [outq+DCTSIZE*2*2]
     paddw     m7, m1
@@ -349,7 +361,7 @@ cglobal mul_thrmat, 3, 3, 5, thrn, thr, q
     paddw     m4, m0
     mova      m6, [outq+DCTSIZE*3*2]
     paddw     m5, m3
-    mova      m0, [rsp+16*3]
+    mova      m0, tmp3
     mova      [outq+DCTSIZE*1*2], m7
     paddw     m6, m0
     mova      [outq+DCTSIZE*2*2], m4
@@ -376,23 +388,23 @@ cglobal mul_thrmat, 3, 3, 5, thrn, thr, q
     paddw     m2, m6
     pmulhw    m0, [pw_4546]
     mova      m7, m2
-    mova      m4, [rsp]
+    mova      m4, tmp0
     psubw     m2, m3
     paddw     m7, m3
-    pmulhw    m2, [pw_5A82]
+    pmulhw    m2, SQRT2
     mova      m6, m4
     psraw     m7, 2
     paddw     m4, [outq]
     psubw     m6, m7
-    mova      m3, [rsp+16]
+    mova      m3, tmp1
     paddw     m4, m7
     mova      [outq+DCTSIZE*7*2], m6
     paddw     m1, m5
     mova  [outq], m4
     psubw     m1, m7
-    mova      m7, [rsp+16*2]
+    mova      m7, tmp2
     psubw     m0, m5
-    mova      m6, [rsp+16*3]
+    mova      m6, tmp3
     mova      m5, m3
     paddw     m3, [outq+DCTSIZE*1*2]
     psubw     m5, m1
@@ -419,7 +431,21 @@ cglobal mul_thrmat, 3, 3, 5, thrn, thr, q
 %endmacro
 
 ;void ff_column_fidct_sse2(int16_t *thr_adr, int16_t *data, int16_t *output, 
int cnt);
-cglobal column_fidct, 4, 5, 8, 64, thr, src, out, cnt, tmp
+cglobal column_fidct, 4, 5, 8+5*ARCH_X86_64, 64*!ARCH_X86_64, thr, src, out, 
cnt, tmp
+%if ARCH_X86_64
+    %define tmp0 m8
+    %define tmp1 m9
+    %define tmp2 m10
+    %define tmp3 m11
+    %define SQRT2 m12
+    mova     m12, [pw_5A82]
+%else
+    %define tmp0 [rsp]
+    %define tmp1 [rsp+16]
+    %define tmp2 [rsp+2*16]
+    %define tmp3 [rsp+3*16]
+    %define SQRT2 [pw_5A82]
+%endif
 .fdct:
     COLUMN_FDCT .idct
     sub    cntd, 2
-- 
2.49.1


>From cfe9edb8bd267e1bcadad15a8fba244c866cc6bc Mon Sep 17 00:00:00 2001
From: Andreas Rheinhardt <[email protected]>
Date: Wed, 12 Nov 2025 23:05:30 +0100
Subject: [PATCH 16/23] avfilter/vf_fspp: Fix effective type violation

Also don't use an unnecessarily large alignment; this avoids having to
align the stack.

Signed-off-by: Andreas Rheinhardt <[email protected]>
---
 libavfilter/vf_fspp.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/libavfilter/vf_fspp.c b/libavfilter/vf_fspp.c
index fa562cbd45..3db7fe114e 100644
--- a/libavfilter/vf_fspp.c
+++ b/libavfilter/vf_fspp.c
@@ -114,9 +114,9 @@ static void filter(FSPPContext *p, uint8_t *dst, uint8_t 
*src,
     const int qpsh = 4 - p->hsub * !is_luma;
     const int qpsv = 4 - p->vsub * !is_luma;
 
-    DECLARE_ALIGNED(32, int32_t, block_align)[4 * 8 * BLOCKSZ + 4 * 8 * 
BLOCKSZ];
-    int16_t *block  = (int16_t *)block_align;
-    int16_t *block3 = (int16_t *)(block_align + 4 * 8 * BLOCKSZ);
+    DECLARE_ALIGNED(16, int16_t, block_align)[8 * 8 * BLOCKSZ + 8 * 8 * 
BLOCKSZ];
+    int16_t *block  = block_align;
+    int16_t *block3 = block_align + 8 * 8 * BLOCKSZ;
 
     memset(block3, 0, 4 * 8 * BLOCKSZ);
 
-- 
2.49.1


>From 24019cd51376f55e5477b7f038dfffb779b9a21c Mon Sep 17 00:00:00 2001
From: Andreas Rheinhardt <[email protected]>
Date: Wed, 12 Nov 2025 23:15:24 +0100
Subject: [PATCH 17/23] avfilter/vf_fsppdsp: Constify

Signed-off-by: Andreas Rheinhardt <[email protected]>
---
 libavfilter/vf_fsppdsp.c       | 30 +++++++++++++-----------------
 libavfilter/vf_fsppdsp.h       | 12 ++++++------
 libavfilter/x86/vf_fspp_init.c |  6 +++---
 tests/checkasm/vf_fspp.c       |  4 ++--
 4 files changed, 24 insertions(+), 28 deletions(-)

diff --git a/libavfilter/vf_fsppdsp.c b/libavfilter/vf_fsppdsp.c
index e530bcd06b..7fdc5ece25 100644
--- a/libavfilter/vf_fsppdsp.c
+++ b/libavfilter/vf_fsppdsp.c
@@ -121,13 +121,13 @@ void ff_store_slice2_c(uint8_t *restrict dst, int16_t 
*restrict src,
     }
 }
 
-void ff_mul_thrmat_c(int16_t *restrict thr_adr_noq, int16_t *restrict thr_adr, 
int q)
+void ff_mul_thrmat_c(const int16_t *restrict thr_adr_noq, int16_t *restrict 
thr_adr, int q)
 {
     for (int a = 0; a < 64; a++)
         thr_adr[a] = q * thr_adr_noq[a];
 }
 
-void ff_column_fidct_c(int16_t *restrict thr_adr, int16_t *restrict data,
+void ff_column_fidct_c(const int16_t *restrict thr_adr, const int16_t 
*restrict data,
                        int16_t *restrict output, int cnt)
 {
     int_simd16_t tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
@@ -135,28 +135,26 @@ void ff_column_fidct_c(int16_t *restrict thr_adr, int16_t 
*restrict data,
     int_simd16_t z1,z2,z3,z4,z5, z10, z11, z12, z13;
     int_simd16_t d0, d1, d2, d3, d4, d5, d6, d7;
 
-    int16_t *dataptr;
     int16_t *wsptr;
     int16_t *threshold;
 
-    dataptr = data;
     wsptr = output;
 
     for (; cnt > 0; cnt -= 2) { //start positions
         threshold = (int16_t *)thr_adr;//threshold_mtx
         for (int ctr = DCTSIZE; ctr > 0; ctr--) {
             // Process columns from input, add to output.
-            tmp0 = dataptr[DCTSIZE * 0] + dataptr[DCTSIZE * 7];
-            tmp7 = dataptr[DCTSIZE * 0] - dataptr[DCTSIZE * 7];
+            tmp0 = data[DCTSIZE * 0] + data[DCTSIZE * 7];
+            tmp7 = data[DCTSIZE * 0] - data[DCTSIZE * 7];
 
-            tmp1 = dataptr[DCTSIZE * 1] + dataptr[DCTSIZE * 6];
-            tmp6 = dataptr[DCTSIZE * 1] - dataptr[DCTSIZE * 6];
+            tmp1 = data[DCTSIZE * 1] + data[DCTSIZE * 6];
+            tmp6 = data[DCTSIZE * 1] - data[DCTSIZE * 6];
 
-            tmp2 = dataptr[DCTSIZE * 2] + dataptr[DCTSIZE * 5];
-            tmp5 = dataptr[DCTSIZE * 2] - dataptr[DCTSIZE * 5];
+            tmp2 = data[DCTSIZE * 2] + data[DCTSIZE * 5];
+            tmp5 = data[DCTSIZE * 2] - data[DCTSIZE * 5];
 
-            tmp3 = dataptr[DCTSIZE * 3] + dataptr[DCTSIZE * 4];
-            tmp4 = dataptr[DCTSIZE * 3] - dataptr[DCTSIZE * 4];
+            tmp3 = data[DCTSIZE * 3] + data[DCTSIZE * 4];
+            tmp4 = data[DCTSIZE * 3] - data[DCTSIZE * 4];
 
             // Even part of FDCT
 
@@ -241,26 +239,24 @@ void ff_column_fidct_c(int16_t *restrict thr_adr, int16_t 
*restrict data,
             wsptr[DCTSIZE * 6]  =  (tmp1 - tmp6);
             wsptr[DCTSIZE * 7]  =  (tmp0 - tmp7);
             //
-            dataptr++; //next column
+            data++; //next column
             wsptr++;
             threshold++;
         }
-        dataptr += 8; //skip each second start pos
+        data  += 8; //skip each second start pos
         wsptr   += 8;
     }
 }
 
-void ff_row_idct_c(int16_t *restrict workspace, int16_t *restrict output_adr,
+void ff_row_idct_c(const int16_t *restrict wsptr, int16_t *restrict output_adr,
                    ptrdiff_t output_stride, int cnt)
 {
     int_simd16_t tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
     int_simd16_t tmp10, tmp11, tmp12, tmp13;
     int_simd16_t z5, z10, z11, z12, z13;
     int16_t *outptr;
-    int16_t *wsptr;
 
     cnt *= 4;
-    wsptr = workspace;
     outptr = output_adr;
     for (; cnt > 0; cnt--) {
         // Even part
diff --git a/libavfilter/vf_fsppdsp.h b/libavfilter/vf_fsppdsp.h
index 66030da4b1..5a2f1af030 100644
--- a/libavfilter/vf_fsppdsp.h
+++ b/libavfilter/vf_fsppdsp.h
@@ -39,13 +39,13 @@ typedef struct FSPPDSPContext {
                          ptrdiff_t dst_stride, ptrdiff_t src_stride,
                          ptrdiff_t width, ptrdiff_t height, ptrdiff_t 
log2_scale);
 
-    void (*mul_thrmat)(int16_t *restrict thr_adr_noq /* align 16 */,
+    void (*mul_thrmat)(const int16_t *restrict thr_adr_noq /* align 16 */,
                        int16_t *restrict thr_adr /* align 16 */, int q);
 
-    void (*column_fidct)(int16_t *restrict thr_adr, int16_t *data,
+    void (*column_fidct)(const int16_t *restrict thr_adr, const int16_t 
*restrict data,
                          int16_t *restrict output, int cnt);
 
-    void (*row_idct)(int16_t *restrict workspace, int16_t *restrict output_adr,
+    void (*row_idct)(const int16_t *restrict workspace, int16_t *restrict 
output_adr,
                      ptrdiff_t output_stride, int cnt);
 
     void (*row_fdct)(int16_t *restrict data, const uint8_t *restrict pixels,
@@ -61,10 +61,10 @@ void ff_store_slice_c(uint8_t *restrict dst, int16_t 
*restrict src,
 void ff_store_slice2_c(uint8_t *restrict dst, int16_t *restrict src,
                        ptrdiff_t dst_stride, ptrdiff_t src_stride,
                        ptrdiff_t width, ptrdiff_t height, ptrdiff_t 
log2_scale);
-void ff_mul_thrmat_c(int16_t *restrict thr_adr_noq, int16_t *restrict thr_adr, 
int q);
-void ff_column_fidct_c(int16_t *restrict thr_adr, int16_t *restrict data,
+void ff_mul_thrmat_c(const int16_t *restrict thr_adr_noq, int16_t *restrict 
thr_adr, int q);
+void ff_column_fidct_c(const int16_t *restrict thr_adr, const int16_t 
*restrict data,
                        int16_t *restrict output, int cnt);
-void ff_row_idct_c(int16_t *restrict workspace, int16_t *restrict output_adr,
+void ff_row_idct_c(const int16_t *restrict workspace, int16_t *restrict 
output_adr,
                    ptrdiff_t output_stride, int cnt);
 void ff_row_fdct_c(int16_t *restrict data, const uint8_t *restrict pixels,
                    ptrdiff_t line_size, int cnt);
diff --git a/libavfilter/x86/vf_fspp_init.c b/libavfilter/x86/vf_fspp_init.c
index c7a9b1799e..caf94b30d6 100644
--- a/libavfilter/x86/vf_fspp_init.c
+++ b/libavfilter/x86/vf_fspp_init.c
@@ -29,9 +29,9 @@ void ff_store_slice_sse2(uint8_t *dst, int16_t *src,
 void ff_store_slice2_sse2(uint8_t *dst, int16_t *src,
                           ptrdiff_t dst_stride, ptrdiff_t src_stride,
                           ptrdiff_t width, ptrdiff_t height, ptrdiff_t 
log2_scale);
-void ff_mul_thrmat_sse2(int16_t *thr_adr_noq, int16_t *thr_adr, int q);
-void ff_column_fidct_sse2(int16_t *thr_adr, int16_t *data, int16_t *output, 
int cnt);
-void ff_row_idct_mmx(int16_t *workspace, int16_t *output_adr, ptrdiff_t 
output_stride, int cnt);
+void ff_mul_thrmat_sse2(const int16_t *thr_adr_noq, int16_t *thr_adr, int q);
+void ff_column_fidct_sse2(const int16_t *thr_adr, const int16_t *data, int16_t 
*output, int cnt);
+void ff_row_idct_mmx(const int16_t *workspace, int16_t *output_adr, ptrdiff_t 
output_stride, int cnt);
 void ff_row_fdct_mmx(int16_t *data, const uint8_t *pixels, ptrdiff_t 
line_size, int cnt);
 
 av_cold void ff_fsppdsp_init_x86(FSPPDSPContext *s)
diff --git a/tests/checkasm/vf_fspp.c b/tests/checkasm/vf_fspp.c
index b65a46247d..341ce0fd37 100644
--- a/tests/checkasm/vf_fspp.c
+++ b/tests/checkasm/vf_fspp.c
@@ -116,7 +116,7 @@ static void check_mul_thrmat(void)
     DECLARE_ALIGNED(16, int16_t, dst_ref)[64];
     DECLARE_ALIGNED(16, int16_t, dst_new)[64];
     const int q = (uint8_t)rnd();
-    declare_func(void, int16_t *thr_adr_noq, int16_t *thr_adr, int q);
+    declare_func(void, const int16_t *thr_adr_noq, int16_t *thr_adr, int q);
 
     ff_fsppdsp_init(&fspp);
 
@@ -136,7 +136,7 @@ static void check_column_fidct(void)
         NB_BLOCKS = 8, ///< arbitrary
     };
     FSPPDSPContext fspp;
-    declare_func(void, int16_t *thr_adr, int16_t *data,
+    declare_func(void, const int16_t *thr_adr, const int16_t *data,
                        int16_t *output, int cnt);
 
     ff_fsppdsp_init(&fspp);
-- 
2.49.1


>From c794b6db8dae32f228de7a123e5c79cc880868ca Mon Sep 17 00:00:00 2001
From: Andreas Rheinhardt <[email protected]>
Date: Wed, 12 Nov 2025 23:26:04 +0100
Subject: [PATCH 18/23] avfilter/x86/vf_spp: Fix comment

Forgotten in dcb28ed860166c9715afb7c71c70889e6b9b8c8d.

Signed-off-by: Andreas Rheinhardt <[email protected]>
---
 libavfilter/x86/vf_spp.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/libavfilter/x86/vf_spp.c b/libavfilter/x86/vf_spp.c
index 48c3d25d7c..7dcf18ec7d 100644
--- a/libavfilter/x86/vf_spp.c
+++ b/libavfilter/x86/vf_spp.c
@@ -64,7 +64,7 @@ static void store_slice_sse2(uint8_t *dst, const int16_t *src,
     }
 }
 
-#endif /* HAVE_MMX_INLINE */
+#endif /* HAVE_SSE2_INLINE */
 
 av_cold void ff_spp_init_x86(SPPContext *s)
 {
-- 
2.49.1


>From 4b047d8788cee8ff6ca8190c88d24937b5e7783c Mon Sep 17 00:00:00 2001
From: Andreas Rheinhardt <[email protected]>
Date: Thu, 13 Nov 2025 10:48:23 +0100
Subject: [PATCH 19/23] avfilter/vf_fspp: Avoid casts, effective-type
 violations

Maybe uint64_t has been used as a poor man's alignment specifier?
Anyway, reading a uint64_t via an lvalue of type int16_t (as happens
in the C versions of the dsp functions) is undefined behavior.

Signed-off-by: Andreas Rheinhardt <[email protected]>
---
 libavfilter/vf_fspp.c | 30 ++++++++++++++++++------------
 1 file changed, 18 insertions(+), 12 deletions(-)

diff --git a/libavfilter/vf_fspp.c b/libavfilter/vf_fspp.c
index 3db7fe114e..670e9288d9 100644
--- a/libavfilter/vf_fspp.c
+++ b/libavfilter/vf_fspp.c
@@ -37,6 +37,7 @@
 
 #include "libavutil/emms.h"
 #include "libavutil/imgutils.h"
+#include "libavutil/intreadwrite.h"
 #include "libavutil/mem.h"
 #include "libavutil/mem_internal.h"
 #include "libavutil/opt.h"
@@ -71,8 +72,8 @@ typedef struct FSPPContext {
 
     FSPPDSPContext dsp;
 
-    DECLARE_ALIGNED(16, uint64_t, threshold_mtx_noq)[8 * 2];
-    DECLARE_ALIGNED(16, uint64_t, threshold_mtx)[8 * 2];
+    DECLARE_ALIGNED(16, int16_t, threshold_mtx_noq)[8 * 8];
+    DECLARE_ALIGNED(16, int16_t, threshold_mtx)[8 * 8];
 } FSPPContext;
 
 
@@ -154,7 +155,7 @@ static void filter(FSPPContext *p, uint8_t *dst, uint8_t 
*src,
             p->dsp.row_fdct(block + 8 * 8, p->src + y * stride + 8 + x0 + 2 - 
(y&1), stride, 2 * (BLOCKSZ - 1));
 
             if (p->qp)
-                p->dsp.column_fidct((int16_t *)(&p->threshold_mtx[0]), block + 
0 * 8, block3 + 0 * 8, 8 * (BLOCKSZ - 1)); //yes, this is a HOTSPOT
+                p->dsp.column_fidct(p->threshold_mtx, block + 0 * 8, block3 + 
0 * 8, 8 * (BLOCKSZ - 1)); //yes, this is a HOTSPOT
             else
                 for (x = 0; x < 8 * (BLOCKSZ - 1); x += 8) {
                     t = x + x0 - 2;                    //correct 
t=x+x0-2-(y&1), but its the same
@@ -164,8 +165,11 @@ static void filter(FSPPContext *p, uint8_t *dst, uint8_t 
*src,
                     t = qp_store[qy + (t >> qpsh)];
                     t = ff_norm_qscale(t, p->qscale_type);
 
-                    if (t != p->prev_q) p->prev_q = t, 
p->dsp.mul_thrmat((int16_t *)(&p->threshold_mtx_noq[0]), (int16_t 
*)(&p->threshold_mtx[0]), t);
-                    p->dsp.column_fidct((int16_t *)(&p->threshold_mtx[0]), 
block + x * 8, block3 + x * 8, 8); //yes, this is a HOTSPOT
+                    if (t != p->prev_q) {
+                        p->prev_q = t;
+                        p->dsp.mul_thrmat(p->threshold_mtx_noq, 
p->threshold_mtx, t);
+                    }
+                    p->dsp.column_fidct(p->threshold_mtx, block + x * 8, 
block3 + x * 8, 8); //yes, this is a HOTSPOT
                 }
             p->dsp.row_idct(block3 + 0 * 8, p->temp + (y & 15) * stride + x0 + 
2 - (y & 1), stride, 2 * (BLOCKSZ - 1));
             memmove(block,  block  + (BLOCKSZ - 1) * 64, 8 * 8 * 
sizeof(int16_t)); //cycling
@@ -176,7 +180,7 @@ static void filter(FSPPContext *p, uint8_t *dst, uint8_t 
*src,
         if (es > 8)
             p->dsp.row_fdct(block + 8 * 8, p->src + y * stride + 8 + x0 + 2 - 
(y & 1), stride, (es - 4) >> 2);
 
-        p->dsp.column_fidct((int16_t *)(&p->threshold_mtx[0]), block, block3, 
es&(~1));
+        p->dsp.column_fidct(p->threshold_mtx, block, block3, es&(~1));
         if (es > 3)
             p->dsp.row_idct(block3 + 0 * 8, p->temp + (y & 15) * stride + x0 + 
2 - (y & 1), stride, es >> 2);
 
@@ -251,19 +255,21 @@ static int filter_frame(AVFilterLink *inlink, AVFrame *in)
         custom_threshold_m[i] = (int)(custom_threshold[i] * (bias / 71.0) + 
0.5);
 
     for (i = 0; i < 8; i++) {
-        fspp->threshold_mtx_noq[2 * i] = (uint64_t)custom_threshold_m[i * 8 + 
2]
+        AV_WN64A(&fspp->threshold_mtx_noq[8 * i], 
(uint64_t)custom_threshold_m[i * 8 + 2]
                                       |(((uint64_t)custom_threshold_m[i * 8 + 
6]) << 16)
                                       |(((uint64_t)custom_threshold_m[i * 8 + 
0]) << 32)
-                                      |(((uint64_t)custom_threshold_m[i * 8 + 
4]) << 48);
+                                      |(((uint64_t)custom_threshold_m[i * 8 + 
4]) << 48));
 
-        fspp->threshold_mtx_noq[2 * i + 1] = (uint64_t)custom_threshold_m[i * 
8 + 5]
+        AV_WN64A(&fspp->threshold_mtx_noq[8 * i + 4], 
(uint64_t)custom_threshold_m[i * 8 + 5]
                                           |(((uint64_t)custom_threshold_m[i * 
8 + 3]) << 16)
                                           |(((uint64_t)custom_threshold_m[i * 
8 + 1]) << 32)
-                                          |(((uint64_t)custom_threshold_m[i * 
8 + 7]) << 48);
+                                          |(((uint64_t)custom_threshold_m[i * 
8 + 7]) << 48));
     }
 
-    if (fspp->qp)
-        fspp->prev_q = fspp->qp, fspp->dsp.mul_thrmat((int16_t 
*)(&fspp->threshold_mtx_noq[0]), (int16_t *)(&fspp->threshold_mtx[0]), 
fspp->qp);
+    if (fspp->qp) {
+        fspp->prev_q = fspp->qp;
+        fspp->dsp.mul_thrmat(fspp->threshold_mtx_noq, fspp->threshold_mtx, 
fspp->qp);
+    }
 
     /* if we are not in a constant user quantizer mode and we don't want to use
      * the quantizers from the B-frames (B-frames often have a higher QP), we
-- 
2.49.1


>From 6d4b85dc3c10f0e41410928db16af912e6945dc0 Mon Sep 17 00:00:00 2001
From: Andreas Rheinhardt <[email protected]>
Date: Thu, 13 Nov 2025 11:02:56 +0100
Subject: [PATCH 20/23] avfilter/vf_fspp: Make output endian-independent

Signed-off-by: Andreas Rheinhardt <[email protected]>
---
 libavfilter/vf_fspp.c | 20 +++++++++-----------
 1 file changed, 9 insertions(+), 11 deletions(-)

diff --git a/libavfilter/vf_fspp.c b/libavfilter/vf_fspp.c
index 670e9288d9..9e5c688fb2 100644
--- a/libavfilter/vf_fspp.c
+++ b/libavfilter/vf_fspp.c
@@ -37,7 +37,6 @@
 
 #include "libavutil/emms.h"
 #include "libavutil/imgutils.h"
-#include "libavutil/intreadwrite.h"
 #include "libavutil/mem.h"
 #include "libavutil/mem_internal.h"
 #include "libavutil/opt.h"
@@ -254,16 +253,15 @@ static int filter_frame(AVFilterLink *inlink, AVFrame *in)
     for (i = 0; i < 64; i++) //FIXME: tune custom_threshold[] and remove this !
         custom_threshold_m[i] = (int)(custom_threshold[i] * (bias / 71.0) + 
0.5);
 
-    for (i = 0; i < 8; i++) {
-        AV_WN64A(&fspp->threshold_mtx_noq[8 * i], 
(uint64_t)custom_threshold_m[i * 8 + 2]
-                                      |(((uint64_t)custom_threshold_m[i * 8 + 
6]) << 16)
-                                      |(((uint64_t)custom_threshold_m[i * 8 + 
0]) << 32)
-                                      |(((uint64_t)custom_threshold_m[i * 8 + 
4]) << 48));
-
-        AV_WN64A(&fspp->threshold_mtx_noq[8 * i + 4], 
(uint64_t)custom_threshold_m[i * 8 + 5]
-                                          |(((uint64_t)custom_threshold_m[i * 
8 + 3]) << 16)
-                                          |(((uint64_t)custom_threshold_m[i * 
8 + 1]) << 32)
-                                          |(((uint64_t)custom_threshold_m[i * 
8 + 7]) << 48));
+    for (int i = 0; i < 64; i += 8) {
+        fspp->threshold_mtx_noq[i + 0] = custom_threshold_m[i + 2];
+        fspp->threshold_mtx_noq[i + 1] = custom_threshold_m[i + 6];
+        fspp->threshold_mtx_noq[i + 2] = custom_threshold_m[i + 0];
+        fspp->threshold_mtx_noq[i + 3] = custom_threshold_m[i + 4];
+        fspp->threshold_mtx_noq[i + 4] = custom_threshold_m[i + 5];
+        fspp->threshold_mtx_noq[i + 5] = custom_threshold_m[i + 3];
+        fspp->threshold_mtx_noq[i + 6] = custom_threshold_m[i + 1];
+        fspp->threshold_mtx_noq[i + 7] = custom_threshold_m[i + 7];
     }
 
     if (fspp->qp) {
-- 
2.49.1


>From c362c3e167b7a26c95e4cb6fe24f7eaf486bcb40 Mon Sep 17 00:00:00 2001
From: Andreas Rheinhardt <[email protected]>
Date: Thu, 13 Nov 2025 11:18:28 +0100
Subject: [PATCH 21/23] avfilter/vf_fspp: Pre-reorder threshold table

Avoids reordering at runtime.

Signed-off-by: Andreas Rheinhardt <[email protected]>
---
 libavfilter/vf_fspp.c | 38 +++++++++++++-------------------------
 1 file changed, 13 insertions(+), 25 deletions(-)

diff --git a/libavfilter/vf_fspp.c b/libavfilter/vf_fspp.c
index 9e5c688fb2..cbf2e06d67 100644
--- a/libavfilter/vf_fspp.c
+++ b/libavfilter/vf_fspp.c
@@ -92,14 +92,16 @@ static const short custom_threshold[64] = {
 // values (296) can't be too high
 // -it causes too big quant dependence
 // or maybe overflow(check), which results in some flashing
-     71, 296, 295, 237,  71,  40,  38,  19,
-    245, 193, 185, 121, 102,  73,  53,  27,
-    158, 129, 141, 107,  97,  73,  50,  26,
-    102, 116, 109,  98,  82,  66,  45,  23,
-     71,  94,  95,  81,  70,  56,  38,  20,
-     56,  77,  74,  66,  56,  44,  30,  15,
-     38,  53,  50,  45,  38,  30,  21,  11,
-     20,  27,  26,  23,  20,  15,  11,   5
+// reorder coefficients to the order in which columns are processed
+#define REORDER(a,b,c,d,e,f,g,h) c, g, a, e, f, d, b, h
+    REORDER( 71, 296, 295, 237,  71,  40,  38,  19),
+    REORDER(245, 193, 185, 121, 102,  73,  53,  27),
+    REORDER(158, 129, 141, 107,  97,  73,  50,  26),
+    REORDER(102, 116, 109,  98,  82,  66,  45,  23),
+    REORDER( 71,  94,  95,  81,  70,  56,  38,  20),
+    REORDER( 56,  77,  74,  66,  56,  44,  30,  15),
+    REORDER( 38,  53,  50,  45,  38,  30,  21,  11),
+    REORDER( 20,  27,  26,  23,  20,  15,  11,   5)
 };
 
 static void filter(FSPPContext *p, uint8_t *dst, uint8_t *src,
@@ -244,25 +246,11 @@ static int filter_frame(AVFilterLink *inlink, AVFrame *in)
 
     int qp_stride = 0;
     int8_t *qp_table = NULL;
-    int i, bias;
     int ret = 0;
-    int custom_threshold_m[64];
 
-    bias = (1 << 4) + fspp->strength;
-
-    for (i = 0; i < 64; i++) //FIXME: tune custom_threshold[] and remove this !
-        custom_threshold_m[i] = (int)(custom_threshold[i] * (bias / 71.0) + 0.5);
-
-    for (int i = 0; i < 64; i += 8) {
-        fspp->threshold_mtx_noq[i + 0] = custom_threshold_m[i + 2];
-        fspp->threshold_mtx_noq[i + 1] = custom_threshold_m[i + 6];
-        fspp->threshold_mtx_noq[i + 2] = custom_threshold_m[i + 0];
-        fspp->threshold_mtx_noq[i + 3] = custom_threshold_m[i + 4];
-        fspp->threshold_mtx_noq[i + 4] = custom_threshold_m[i + 5];
-        fspp->threshold_mtx_noq[i + 5] = custom_threshold_m[i + 3];
-        fspp->threshold_mtx_noq[i + 6] = custom_threshold_m[i + 1];
-        fspp->threshold_mtx_noq[i + 7] = custom_threshold_m[i + 7];
-    }
+    //FIXME: tune custom_threshold[] and remove this !
+    for (int i = 0, bias = (1 << 4) + fspp->strength; i < 64; ++i)
+        fspp->threshold_mtx_noq[i] = (int)(custom_threshold[i] * (bias / 71.0) + 0.5);
 
     if (fspp->qp) {
         fspp->prev_q = fspp->qp;
-- 
2.49.1


>From a86a9361989d99ad8db46eabf39f664ed3f89072 Mon Sep 17 00:00:00 2001
From: Andreas Rheinhardt <[email protected]>
Date: Thu, 13 Nov 2025 12:04:15 +0100
Subject: [PATCH 22/23] avfilter/vf_fsppdsp: Remove pointless cast

Also don't cast const away and use a smaller scope.

Signed-off-by: Andreas Rheinhardt <[email protected]>
---
 libavfilter/vf_fsppdsp.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/libavfilter/vf_fsppdsp.c b/libavfilter/vf_fsppdsp.c
index 7fdc5ece25..3230376a19 100644
--- a/libavfilter/vf_fsppdsp.c
+++ b/libavfilter/vf_fsppdsp.c
@@ -136,12 +136,11 @@ void ff_column_fidct_c(const int16_t *restrict thr_adr, const int16_t *restrict
     int_simd16_t d0, d1, d2, d3, d4, d5, d6, d7;
 
     int16_t *wsptr;
-    int16_t *threshold;
 
     wsptr = output;
 
     for (; cnt > 0; cnt -= 2) { //start positions
-        threshold = (int16_t *)thr_adr;//threshold_mtx
+        const int16_t *threshold = thr_adr;//threshold_mtx
         for (int ctr = DCTSIZE; ctr > 0; ctr--) {
             // Process columns from input, add to output.
             tmp0 = data[DCTSIZE * 0] + data[DCTSIZE * 7];
-- 
2.49.1


>From 57ca0480e6dcb64f1b4f948b6a79bcb8aaa97723 Mon Sep 17 00:00:00 2001
From: Andreas Rheinhardt <[email protected]>
Date: Thu, 13 Nov 2025 11:57:02 +0100
Subject: [PATCH 23/23] avfilter/vf_fsppdsp: Fix left shifts of negative
 numbers

They are undefined behavior and UBSan warns about them
(in the checkasm test). Put the shifts in the constants
instead, or use multiplications where no constant is involved.
This even gives a tiny speedup here:

Old benchmarks:
column_fidct_c:                                       3369.9 ( 1.00x)
column_fidct_sse2:                                     829.1 ( 4.06x)
New benchmarks:
column_fidct_c:                                       3304.2 ( 1.00x)
column_fidct_sse2:                                     827.9 ( 3.99x)

Signed-off-by: Andreas Rheinhardt <[email protected]>
---
 libavfilter/vf_fsppdsp.c | 46 ++++++++++++++++++++--------------------
 1 file changed, 23 insertions(+), 23 deletions(-)

diff --git a/libavfilter/vf_fsppdsp.c b/libavfilter/vf_fsppdsp.c
index 3230376a19..8025e87366 100644
--- a/libavfilter/vf_fsppdsp.c
+++ b/libavfilter/vf_fsppdsp.c
@@ -165,7 +165,7 @@ void ff_column_fidct_c(const int16_t *restrict thr_adr, const int16_t *restrict
             d0 = tmp10 + tmp11;
             d4 = tmp10 - tmp11;
 
-            z1 = MULTIPLY16H((tmp12 + tmp13) << 2, FIX_0_707106781);
+            z1 = MULTIPLY16H(tmp12 + tmp13, FIX_0_707106781 << 2);
             d2 = tmp13 + z1;
             d6 = tmp13 - z1;
 
@@ -193,10 +193,10 @@ void ff_column_fidct_c(const int16_t *restrict thr_adr, const int16_t *restrict
             tmp11 = tmp5 + tmp6;
             tmp12 = tmp6 + tmp7;
 
-            z5 = MULTIPLY16H((tmp10 - tmp12) << 2, FIX_0_382683433);
-            z2 = MULTIPLY16H(tmp10 << 2, FIX_0_541196100) + z5;
-            z4 = MULTIPLY16H(tmp12 << 2, FIX_1_306562965) + z5;
-            z3 = MULTIPLY16H(tmp11 << 2, FIX_0_707106781);
+            z5 = MULTIPLY16H(tmp10 - tmp12, FIX_0_382683433 << 2);
+            z2 = MULTIPLY16H(tmp10, FIX_0_541196100 << 2) + z5;
+            z4 = MULTIPLY16H(tmp12, FIX_1_306562965 << 2) + z5;
+            z3 = MULTIPLY16H(tmp11, FIX_0_707106781 << 2);
 
             z11 = tmp7 + z3;
             z13 = tmp7 - z3;
@@ -215,15 +215,15 @@ void ff_column_fidct_c(const int16_t *restrict thr_adr, const int16_t *restrict
 
             //Simd version uses here a shortcut for the tmp5,tmp6,tmp7 == 0
             z13 = tmp6 + tmp5;
-            z10 = (tmp6 - tmp5) << 1;
+            z10 = (tmp6 - tmp5) * 2;
             z11 = tmp4 + tmp7;
-            z12 = (tmp4 - tmp7) << 1;
+            z12 = (tmp4 - tmp7) * 2;
 
             tmp7  = (z11 + z13) >> 2; //+2 !
-            tmp11 = MULTIPLY16H((z11 - z13) << 1, FIX_1_414213562);
-            z5    = MULTIPLY16H(z10 + z12,        FIX_1_847759065);
-            tmp10 = MULTIPLY16H(z12,              FIX_1_082392200) - z5;
-            tmp12 = MULTIPLY16H(z10,              FIX_2_613125930) + z5; // - !!
+            tmp11 = MULTIPLY16H(z11 - z13, FIX_1_414213562 << 1);
+            z5    = MULTIPLY16H(z10 + z12, FIX_1_847759065);
+            tmp10 = MULTIPLY16H(z12,       FIX_1_082392200) - z5;
+            tmp12 = MULTIPLY16H(z10,       FIX_2_613125930) + z5; // - !!
 
             tmp6 = tmp12 - tmp7;
             tmp5 = tmp11 - tmp6;
@@ -264,7 +264,7 @@ void ff_row_idct_c(const int16_t *restrict wsptr, int16_t *restrict output_adr,
         tmp11 = wsptr[2] -  wsptr[3];
 
         tmp13 = wsptr[0] +  wsptr[1];
-        tmp12 = (MULTIPLY16H(wsptr[0] - wsptr[1], FIX_1_414213562_A) << 2) - tmp13;//this shift order to avoid overflow
+        tmp12 = (MULTIPLY16H(wsptr[0] - wsptr[1], FIX_1_414213562_A) * 4) - tmp13;//this shift order to avoid overflow
 
         tmp0 = tmp10 + tmp13; //->temps
         tmp3 = tmp10 - tmp13; //->temps
@@ -289,9 +289,9 @@ void ff_row_idct_c(const int16_t *restrict wsptr, int16_t *restrict output_adr,
         tmp10 = MULTIPLY16H(z12,       FIX_1_082392200) - z5;
         tmp12 = MULTIPLY16H(z10,       FIX_2_613125930) + z5; // - FIX_
 
-        tmp6 = (tmp12 << 3) - tmp7;
-        tmp5 = (tmp11 << 3) - tmp6;
-        tmp4 = (tmp10 << 3) + tmp5;
+        tmp6 = tmp12 * 8 - tmp7;
+        tmp5 = tmp11 * 8 - tmp6;
+        tmp4 = tmp10 * 8 + tmp5;
 
         // Final output stage: descale and write column
         outptr[0 * output_stride] += DESCALE(tmp0 + tmp7, 3);
@@ -342,20 +342,20 @@ void ff_row_fdct_c(int16_t *restrict data, const uint8_t *restrict pixels,
         dataptr[2] = tmp10 + tmp11;
         dataptr[3] = tmp10 - tmp11;
 
-        z1 = MULTIPLY16H((tmp12 + tmp13) << 2, FIX_0_707106781);
+        z1 = MULTIPLY16H(tmp12 + tmp13, FIX_0_707106781 << 2);
         dataptr[0] = tmp13 + z1;
         dataptr[1] = tmp13 - z1;
 
         // Odd part
 
-        tmp10 = (tmp4 + tmp5) << 2;
-        tmp11 = (tmp5 + tmp6) << 2;
-        tmp12 = (tmp6 + tmp7) << 2;
+        tmp10 = tmp4 + tmp5;
+        tmp11 = tmp5 + tmp6;
+        tmp12 = tmp6 + tmp7;
 
-        z5 = MULTIPLY16H(tmp10 - tmp12, FIX_0_382683433);
-        z2 = MULTIPLY16H(tmp10,         FIX_0_541196100) + z5;
-        z4 = MULTIPLY16H(tmp12,         FIX_1_306562965) + z5;
-        z3 = MULTIPLY16H(tmp11,         FIX_0_707106781);
+        z5 = MULTIPLY16H(tmp10 - tmp12, FIX_0_382683433 << 2);
+        z2 = MULTIPLY16H(tmp10,         FIX_0_541196100 << 2) + z5;
+        z4 = MULTIPLY16H(tmp12,         FIX_1_306562965 << 2) + z5;
+        z3 = MULTIPLY16H(tmp11,         FIX_0_707106781 << 2);
 
         z11 = tmp7 + z3;
         z13 = tmp7 - z3;
-- 
2.49.1

_______________________________________________
ffmpeg-devel mailing list -- [email protected]
To unsubscribe send an email to [email protected]

[FFmpeg-devel] [PATCH] avfilter/vf_fspp: Add checkasm, port to SSE2, fix big-endian (PR #20909)

Reply via email to