PR #20909 opened by mkver URL: https://code.ffmpeg.org/FFmpeg/FFmpeg/pulls/20909 Patch URL: https://code.ffmpeg.org/FFmpeg/FFmpeg/pulls/20909.patch
>From 92fe3d96e6f9a3b169a3edcdb48ecdc543ba862e Mon Sep 17 00:00:00 2001 From: Andreas Rheinhardt <[email protected]> Date: Sun, 9 Nov 2025 17:06:46 +0100 Subject: [PATCH 01/23] avfilter/vf_fspp: Add DSPCtx, move DSP functions to file of their own This is in preparation for adding checkasm tests; without it, checkasm would pull all of libavfilter in. Signed-off-by: Andreas Rheinhardt <[email protected]> --- libavfilter/Makefile | 2 +- libavfilter/vf_fspp.c | 399 +++--------------------- libavfilter/vf_fsppdsp.c | 369 ++++++++++++++++++++++ libavfilter/{vf_fspp.h => vf_fsppdsp.h} | 85 +++-- libavfilter/x86/vf_fspp_init.c | 4 +- 5 files changed, 455 insertions(+), 404 deletions(-) create mode 100644 libavfilter/vf_fsppdsp.c rename libavfilter/{vf_fspp.h => vf_fsppdsp.h} (52%) diff --git a/libavfilter/Makefile b/libavfilter/Makefile index 69d74183b2..d56a458e45 100644 --- a/libavfilter/Makefile +++ b/libavfilter/Makefile @@ -329,7 +329,7 @@ OBJS-$(CONFIG_FRAMESTEP_FILTER) += vf_framestep.o OBJS-$(CONFIG_FREEZEDETECT_FILTER) += vf_freezedetect.o OBJS-$(CONFIG_FREEZEFRAMES_FILTER) += vf_freezeframes.o OBJS-$(CONFIG_FREI0R_FILTER) += vf_frei0r.o -OBJS-$(CONFIG_FSPP_FILTER) += vf_fspp.o qp_table.o +OBJS-$(CONFIG_FSPP_FILTER) += vf_fspp.o vf_fsppdsp.o qp_table.o OBJS-$(CONFIG_FSYNC_FILTER) += vf_fsync.o OBJS-$(CONFIG_GBLUR_FILTER) += vf_gblur.o OBJS-$(CONFIG_GBLUR_VULKAN_FILTER) += vf_gblur_vulkan.o vulkan.o vulkan_filter.o diff --git a/libavfilter/vf_fspp.c b/libavfilter/vf_fspp.c index 6b4a715367..9371c63e77 100644 --- a/libavfilter/vf_fspp.c +++ b/libavfilter/vf_fspp.c @@ -41,12 +41,40 @@ #include "libavutil/mem_internal.h" #include "libavutil/opt.h" #include "libavutil/pixdesc.h" +#include "libavutil/video_enc_params.h" +#include "avfilter.h" #include "filters.h" #include "qp_table.h" -#include "vf_fspp.h" +#include "vf_fsppdsp.h" #include "video.h" +#define BLOCKSZ 12 +#define MAX_LEVEL 5 + +typedef struct FSPPContext { + const struct AVClass *class; + uint64_t 
threshold_mtx_noq[8 * 2]; + uint64_t threshold_mtx[8 * 2]; //used in both C & MMX (& later SSE2) versions + + int log2_count; + int strength; + int hsub; + int vsub; + int temp_stride; + int qp; + enum AVVideoEncParamsType qscale_type; + int prev_q; + uint8_t *src; + int16_t *temp; + int8_t *non_b_qp_table; + int non_b_qp_stride; + int use_bframe_qp; + + FSPPDSPContext dsp; +} FSPPContext; + + #define OFFSET(x) offsetof(FSPPContext, x) #define FLAGS AV_OPT_FLAG_FILTERING_PARAM|AV_OPT_FLAG_VIDEO_PARAM static const AVOption fspp_options[] = { @@ -59,17 +87,6 @@ static const AVOption fspp_options[] = { AVFILTER_DEFINE_CLASS(fspp); -DECLARE_ALIGNED(32, static const uint8_t, dither)[8][8] = { - { 0, 48, 12, 60, 3, 51, 15, 63, }, - { 32, 16, 44, 28, 35, 19, 47, 31, }, - { 8, 56, 4, 52, 11, 59, 7, 55, }, - { 40, 24, 36, 20, 43, 27, 39, 23, }, - { 2, 50, 14, 62, 1, 49, 13, 61, }, - { 34, 18, 46, 30, 33, 17, 45, 29, }, - { 10, 58, 6, 54, 9, 57, 5, 53, }, - { 42, 26, 38, 22, 41, 25, 37, 21, }, -}; - static const short custom_threshold[64] = { // values (296) can't be too high // -it causes too big quant dependence @@ -84,73 +101,6 @@ static const short custom_threshold[64] = { 20, 27, 26, 23, 20, 15, 11, 5 }; -//This func reads from 1 slice, 1 and clears 0 & 1 -static void store_slice_c(uint8_t *dst, int16_t *src, - ptrdiff_t dst_stride, ptrdiff_t src_stride, - ptrdiff_t width, ptrdiff_t height, ptrdiff_t log2_scale) -{ - int y, x; -#define STORE(pos) \ - temp = (src[x + pos] + (d[pos] >> log2_scale)) >> (6 - log2_scale); \ - src[x + pos] = src[x + pos - 8 * src_stride] = 0; \ - if (temp & 0x100) temp = ~(temp >> 31); \ - dst[x + pos] = temp; - - for (y = 0; y < height; y++) { - const uint8_t *d = dither[y]; - for (x = 0; x < width; x += 8) { - int temp; - STORE(0); - STORE(1); - STORE(2); - STORE(3); - STORE(4); - STORE(5); - STORE(6); - STORE(7); - } - src += src_stride; - dst += dst_stride; - } -} - -//This func reads from 2 slices, 0 & 2 and clears 2-nd -static void 
store_slice2_c(uint8_t *dst, int16_t *src, - ptrdiff_t dst_stride, ptrdiff_t src_stride, - ptrdiff_t width, ptrdiff_t height, ptrdiff_t log2_scale) -{ - int y, x; -#define STORE2(pos) \ - temp = (src[x + pos] + src[x + pos + 16 * src_stride] + (d[pos] >> log2_scale)) >> (6 - log2_scale); \ - src[x + pos + 16 * src_stride] = 0; \ - if (temp & 0x100) temp = ~(temp >> 31); \ - dst[x + pos] = temp; - - for (y = 0; y < height; y++) { - const uint8_t *d = dither[y]; - for (x = 0; x < width; x += 8) { - int temp; - STORE2(0); - STORE2(1); - STORE2(2); - STORE2(3); - STORE2(4); - STORE2(5); - STORE2(6); - STORE2(7); - } - src += src_stride; - dst += dst_stride; - } -} - -static void mul_thrmat_c(int16_t *thr_adr_noq, int16_t *thr_adr, int q) -{ - int a; - for (a = 0; a < 64; a++) - thr_adr[a] = q * thr_adr_noq[a]; -} - static void filter(FSPPContext *p, uint8_t *dst, uint8_t *src, int dst_stride, int src_stride, int width, int height, @@ -197,13 +147,13 @@ static void filter(FSPPContext *p, uint8_t *dst, uint8_t *src, if (qy < 0) qy = 0; qy = (qy >> qpsv) * qp_stride; - p->row_fdct(block, p->src + y * stride + 2 - (y&1), stride, 2); + p->dsp.row_fdct(block, p->src + y * stride + 2 - (y&1), stride, 2); for (x0 = 0; x0 < width + 8 - 8 * (BLOCKSZ - 1); x0 += 8 * (BLOCKSZ - 1)) { - p->row_fdct(block + 8 * 8, p->src + y * stride + 8 + x0 + 2 - (y&1), stride, 2 * (BLOCKSZ - 1)); + p->dsp.row_fdct(block + 8 * 8, p->src + y * stride + 8 + x0 + 2 - (y&1), stride, 2 * (BLOCKSZ - 1)); if (p->qp) - p->column_fidct((int16_t *)(&p->threshold_mtx[0]), block + 0 * 8, block3 + 0 * 8, 8 * (BLOCKSZ - 1)); //yes, this is a HOTSPOT + p->dsp.column_fidct((int16_t *)(&p->threshold_mtx[0]), block + 0 * 8, block3 + 0 * 8, 8 * (BLOCKSZ - 1)); //yes, this is a HOTSPOT else for (x = 0; x < 8 * (BLOCKSZ - 1); x += 8) { t = x + x0 - 2; //correct t=x+x0-2-(y&1), but its the same @@ -213,288 +163,42 @@ static void filter(FSPPContext *p, uint8_t *dst, uint8_t *src, t = qp_store[qy + (t >> qpsh)]; t = 
ff_norm_qscale(t, p->qscale_type); - if (t != p->prev_q) p->prev_q = t, p->mul_thrmat((int16_t *)(&p->threshold_mtx_noq[0]), (int16_t *)(&p->threshold_mtx[0]), t); - p->column_fidct((int16_t *)(&p->threshold_mtx[0]), block + x * 8, block3 + x * 8, 8); //yes, this is a HOTSPOT + if (t != p->prev_q) p->prev_q = t, p->dsp.mul_thrmat((int16_t *)(&p->threshold_mtx_noq[0]), (int16_t *)(&p->threshold_mtx[0]), t); + p->dsp.column_fidct((int16_t *)(&p->threshold_mtx[0]), block + x * 8, block3 + x * 8, 8); //yes, this is a HOTSPOT } - p->row_idct(block3 + 0 * 8, p->temp + (y & 15) * stride + x0 + 2 - (y & 1), stride, 2 * (BLOCKSZ - 1)); + p->dsp.row_idct(block3 + 0 * 8, p->temp + (y & 15) * stride + x0 + 2 - (y & 1), stride, 2 * (BLOCKSZ - 1)); memmove(block, block + (BLOCKSZ - 1) * 64, 8 * 8 * sizeof(int16_t)); //cycling memmove(block3, block3 + (BLOCKSZ - 1) * 64, 6 * 8 * sizeof(int16_t)); } es = width + 8 - x0; // 8, ... if (es > 8) - p->row_fdct(block + 8 * 8, p->src + y * stride + 8 + x0 + 2 - (y & 1), stride, (es - 4) >> 2); + p->dsp.row_fdct(block + 8 * 8, p->src + y * stride + 8 + x0 + 2 - (y & 1), stride, (es - 4) >> 2); - p->column_fidct((int16_t *)(&p->threshold_mtx[0]), block, block3, es&(~1)); + p->dsp.column_fidct((int16_t *)(&p->threshold_mtx[0]), block, block3, es&(~1)); if (es > 3) - p->row_idct(block3 + 0 * 8, p->temp + (y & 15) * stride + x0 + 2 - (y & 1), stride, es >> 2); + p->dsp.row_idct(block3 + 0 * 8, p->temp + (y & 15) * stride + x0 + 2 - (y & 1), stride, es >> 2); if (!(y1 & 7) && y1) { if (y1 & 8) - p->store_slice(dst + (y1 - 8) * dst_stride, p->temp + 8 + 8 * stride, - dst_stride, stride, width, 8, 5 - p->log2_count); + p->dsp.store_slice(dst + (y1 - 8) * dst_stride, p->temp + 8 + 8 * stride, + dst_stride, stride, width, 8, 5 - p->log2_count); else - p->store_slice2(dst + (y1 - 8) * dst_stride, p->temp + 8 + 0 * stride, - dst_stride, stride, width, 8, 5 - p->log2_count); + p->dsp.store_slice2(dst + (y1 - 8) * dst_stride, p->temp + 8 + 0 * stride, 
+ dst_stride, stride, width, 8, 5 - p->log2_count); } } if (y & 7) { // height % 8 != 0 if (y & 8) - p->store_slice(dst + ((y - 8) & ~7) * dst_stride, p->temp + 8 + 8 * stride, - dst_stride, stride, width, y&7, 5 - p->log2_count); + p->dsp.store_slice(dst + ((y - 8) & ~7) * dst_stride, p->temp + 8 + 8 * stride, + dst_stride, stride, width, y&7, 5 - p->log2_count); else - p->store_slice2(dst + ((y - 8) & ~7) * dst_stride, p->temp + 8 + 0 * stride, + p->dsp.store_slice2(dst + ((y - 8) & ~7) * dst_stride, p->temp + 8 + 0 * stride, dst_stride, stride, width, y&7, 5 - p->log2_count); } } -static void column_fidct_c(int16_t *thr_adr, int16_t *data, int16_t *output, int cnt) -{ - int_simd16_t tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; - int_simd16_t tmp10, tmp11, tmp12, tmp13; - int_simd16_t z1,z2,z3,z4,z5, z10, z11, z12, z13; - int_simd16_t d0, d1, d2, d3, d4, d5, d6, d7; - - int16_t *dataptr; - int16_t *wsptr; - int16_t *threshold; - int ctr; - - dataptr = data; - wsptr = output; - - for (; cnt > 0; cnt -= 2) { //start positions - threshold = (int16_t *)thr_adr;//threshold_mtx - for (ctr = DCTSIZE; ctr > 0; ctr--) { - // Process columns from input, add to output. 
- tmp0 = dataptr[DCTSIZE * 0] + dataptr[DCTSIZE * 7]; - tmp7 = dataptr[DCTSIZE * 0] - dataptr[DCTSIZE * 7]; - - tmp1 = dataptr[DCTSIZE * 1] + dataptr[DCTSIZE * 6]; - tmp6 = dataptr[DCTSIZE * 1] - dataptr[DCTSIZE * 6]; - - tmp2 = dataptr[DCTSIZE * 2] + dataptr[DCTSIZE * 5]; - tmp5 = dataptr[DCTSIZE * 2] - dataptr[DCTSIZE * 5]; - - tmp3 = dataptr[DCTSIZE * 3] + dataptr[DCTSIZE * 4]; - tmp4 = dataptr[DCTSIZE * 3] - dataptr[DCTSIZE * 4]; - - // Even part of FDCT - - tmp10 = tmp0 + tmp3; - tmp13 = tmp0 - tmp3; - tmp11 = tmp1 + tmp2; - tmp12 = tmp1 - tmp2; - - d0 = tmp10 + tmp11; - d4 = tmp10 - tmp11; - - z1 = MULTIPLY16H((tmp12 + tmp13) << 2, FIX_0_707106781); - d2 = tmp13 + z1; - d6 = tmp13 - z1; - - // Even part of IDCT - - THRESHOLD(tmp0, d0, threshold[0 * 8]); - THRESHOLD(tmp1, d2, threshold[2 * 8]); - THRESHOLD(tmp2, d4, threshold[4 * 8]); - THRESHOLD(tmp3, d6, threshold[6 * 8]); - tmp0 += 2; - tmp10 = (tmp0 + tmp2) >> 2; - tmp11 = (tmp0 - tmp2) >> 2; - - tmp13 = (tmp1 + tmp3) >>2; //+2 ! 
(psnr decides) - tmp12 = MULTIPLY16H((tmp1 - tmp3), FIX_1_414213562_A) - tmp13; //<<2 - - tmp0 = tmp10 + tmp13; //->temps - tmp3 = tmp10 - tmp13; //->temps - tmp1 = tmp11 + tmp12; //->temps - tmp2 = tmp11 - tmp12; //->temps - - // Odd part of FDCT - - tmp10 = tmp4 + tmp5; - tmp11 = tmp5 + tmp6; - tmp12 = tmp6 + tmp7; - - z5 = MULTIPLY16H((tmp10 - tmp12) << 2, FIX_0_382683433); - z2 = MULTIPLY16H(tmp10 << 2, FIX_0_541196100) + z5; - z4 = MULTIPLY16H(tmp12 << 2, FIX_1_306562965) + z5; - z3 = MULTIPLY16H(tmp11 << 2, FIX_0_707106781); - - z11 = tmp7 + z3; - z13 = tmp7 - z3; - - d5 = z13 + z2; - d3 = z13 - z2; - d1 = z11 + z4; - d7 = z11 - z4; - - // Odd part of IDCT - - THRESHOLD(tmp4, d1, threshold[1 * 8]); - THRESHOLD(tmp5, d3, threshold[3 * 8]); - THRESHOLD(tmp6, d5, threshold[5 * 8]); - THRESHOLD(tmp7, d7, threshold[7 * 8]); - - //Simd version uses here a shortcut for the tmp5,tmp6,tmp7 == 0 - z13 = tmp6 + tmp5; - z10 = (tmp6 - tmp5) << 1; - z11 = tmp4 + tmp7; - z12 = (tmp4 - tmp7) << 1; - - tmp7 = (z11 + z13) >> 2; //+2 ! - tmp11 = MULTIPLY16H((z11 - z13) << 1, FIX_1_414213562); - z5 = MULTIPLY16H(z10 + z12, FIX_1_847759065); - tmp10 = MULTIPLY16H(z12, FIX_1_082392200) - z5; - tmp12 = MULTIPLY16H(z10, FIX_2_613125930) + z5; // - !! 
- - tmp6 = tmp12 - tmp7; - tmp5 = tmp11 - tmp6; - tmp4 = tmp10 + tmp5; - - wsptr[DCTSIZE * 0] += (tmp0 + tmp7); - wsptr[DCTSIZE * 1] += (tmp1 + tmp6); - wsptr[DCTSIZE * 2] += (tmp2 + tmp5); - wsptr[DCTSIZE * 3] += (tmp3 - tmp4); - wsptr[DCTSIZE * 4] += (tmp3 + tmp4); - wsptr[DCTSIZE * 5] += (tmp2 - tmp5); - wsptr[DCTSIZE * 6] = (tmp1 - tmp6); - wsptr[DCTSIZE * 7] = (tmp0 - tmp7); - // - dataptr++; //next column - wsptr++; - threshold++; - } - dataptr += 8; //skip each second start pos - wsptr += 8; - } -} - -static void row_idct_c(int16_t *workspace, int16_t *output_adr, ptrdiff_t output_stride, int cnt) -{ - int_simd16_t tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; - int_simd16_t tmp10, tmp11, tmp12, tmp13; - int_simd16_t z5, z10, z11, z12, z13; - int16_t *outptr; - int16_t *wsptr; - - cnt *= 4; - wsptr = workspace; - outptr = output_adr; - for (; cnt > 0; cnt--) { - // Even part - //Simd version reads 4x4 block and transposes it - tmp10 = wsptr[2] + wsptr[3]; - tmp11 = wsptr[2] - wsptr[3]; - - tmp13 = wsptr[0] + wsptr[1]; - tmp12 = (MULTIPLY16H(wsptr[0] - wsptr[1], FIX_1_414213562_A) << 2) - tmp13;//this shift order to avoid overflow - - tmp0 = tmp10 + tmp13; //->temps - tmp3 = tmp10 - tmp13; //->temps - tmp1 = tmp11 + tmp12; - tmp2 = tmp11 - tmp12; - - // Odd part - //Also transpose, with previous: - // ---- ---- |||| - // ---- ---- idct |||| - // ---- ---- ---> |||| - // ---- ---- |||| - z13 = wsptr[4] + wsptr[5]; - z10 = wsptr[4] - wsptr[5]; - z11 = wsptr[6] + wsptr[7]; - z12 = wsptr[6] - wsptr[7]; - - tmp7 = z11 + z13; - tmp11 = MULTIPLY16H(z11 - z13, FIX_1_414213562); - - z5 = MULTIPLY16H(z10 + z12, FIX_1_847759065); - tmp10 = MULTIPLY16H(z12, FIX_1_082392200) - z5; - tmp12 = MULTIPLY16H(z10, FIX_2_613125930) + z5; // - FIX_ - - tmp6 = (tmp12 << 3) - tmp7; - tmp5 = (tmp11 << 3) - tmp6; - tmp4 = (tmp10 << 3) + tmp5; - - // Final output stage: descale and write column - outptr[0 * output_stride] += DESCALE(tmp0 + tmp7, 3); - outptr[1 * output_stride] += 
DESCALE(tmp1 + tmp6, 3); - outptr[2 * output_stride] += DESCALE(tmp2 + tmp5, 3); - outptr[3 * output_stride] += DESCALE(tmp3 - tmp4, 3); - outptr[4 * output_stride] += DESCALE(tmp3 + tmp4, 3); - outptr[5 * output_stride] += DESCALE(tmp2 - tmp5, 3); - outptr[6 * output_stride] += DESCALE(tmp1 - tmp6, 3); //no += ? - outptr[7 * output_stride] += DESCALE(tmp0 - tmp7, 3); //no += ? - outptr++; - - wsptr += DCTSIZE; // advance pointer to next row - } -} - -static void row_fdct_c(int16_t *data, const uint8_t *pixels, ptrdiff_t line_size, int cnt) -{ - int_simd16_t tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; - int_simd16_t tmp10, tmp11, tmp12, tmp13; - int_simd16_t z1, z2, z3, z4, z5, z11, z13; - int16_t *dataptr; - - cnt *= 4; - // Pass 1: process rows. - - dataptr = data; - for (; cnt > 0; cnt--) { - tmp0 = pixels[line_size * 0] + pixels[line_size * 7]; - tmp7 = pixels[line_size * 0] - pixels[line_size * 7]; - tmp1 = pixels[line_size * 1] + pixels[line_size * 6]; - tmp6 = pixels[line_size * 1] - pixels[line_size * 6]; - tmp2 = pixels[line_size * 2] + pixels[line_size * 5]; - tmp5 = pixels[line_size * 2] - pixels[line_size * 5]; - tmp3 = pixels[line_size * 3] + pixels[line_size * 4]; - tmp4 = pixels[line_size * 3] - pixels[line_size * 4]; - - // Even part - - tmp10 = tmp0 + tmp3; - tmp13 = tmp0 - tmp3; - tmp11 = tmp1 + tmp2; - tmp12 = tmp1 - tmp2; - //Even columns are written first, this leads to different order of columns - //in column_fidct(), but they are processed independently, so all ok. - //Later in the row_idct() columns are read in the same order. 
- dataptr[2] = tmp10 + tmp11; - dataptr[3] = tmp10 - tmp11; - - z1 = MULTIPLY16H((tmp12 + tmp13) << 2, FIX_0_707106781); - dataptr[0] = tmp13 + z1; - dataptr[1] = tmp13 - z1; - - // Odd part - - tmp10 = (tmp4 + tmp5) << 2; - tmp11 = (tmp5 + tmp6) << 2; - tmp12 = (tmp6 + tmp7) << 2; - - z5 = MULTIPLY16H(tmp10 - tmp12, FIX_0_382683433); - z2 = MULTIPLY16H(tmp10, FIX_0_541196100) + z5; - z4 = MULTIPLY16H(tmp12, FIX_1_306562965) + z5; - z3 = MULTIPLY16H(tmp11, FIX_0_707106781); - - z11 = tmp7 + z3; - z13 = tmp7 - z3; - - dataptr[4] = z13 + z2; - dataptr[5] = z13 - z2; - dataptr[6] = z11 + z4; - dataptr[7] = z11 - z4; - - pixels++; // advance pointer to next column - dataptr += DCTSIZE; - } -} - static const enum AVPixelFormat pix_fmts[] = { AV_PIX_FMT_YUV444P, AV_PIX_FMT_YUV422P, AV_PIX_FMT_YUV420P, AV_PIX_FMT_YUV411P, @@ -522,16 +226,7 @@ static int config_input(AVFilterLink *inlink) if (!fspp->temp || !fspp->src) return AVERROR(ENOMEM); - fspp->store_slice = store_slice_c; - fspp->store_slice2 = store_slice2_c; - fspp->mul_thrmat = mul_thrmat_c; - fspp->column_fidct = column_fidct_c; - fspp->row_idct = row_idct_c; - fspp->row_fdct = row_fdct_c; - -#if ARCH_X86 - ff_fspp_init_x86(fspp); -#endif + ff_fsppdsp_init(&fspp->dsp); return 0; } @@ -567,7 +262,7 @@ static int filter_frame(AVFilterLink *inlink, AVFrame *in) } if (fspp->qp) - fspp->prev_q = fspp->qp, fspp->mul_thrmat((int16_t *)(&fspp->threshold_mtx_noq[0]), (int16_t *)(&fspp->threshold_mtx[0]), fspp->qp); + fspp->prev_q = fspp->qp, fspp->dsp.mul_thrmat((int16_t *)(&fspp->threshold_mtx_noq[0]), (int16_t *)(&fspp->threshold_mtx[0]), fspp->qp); /* if we are not in a constant user quantizer mode and we don't want to use * the quantizers from the B-frames (B-frames often have a higher QP), we diff --git a/libavfilter/vf_fsppdsp.c b/libavfilter/vf_fsppdsp.c new file mode 100644 index 0000000000..ab31c77203 --- /dev/null +++ b/libavfilter/vf_fsppdsp.c @@ -0,0 +1,369 @@ +/* + * Copyright (c) 2003 Michael Niedermayer 
<[email protected]> + * Copyright (C) 2005 Nikolaj Poroshin <[email protected]> + * Copyright (c) 2014 Arwa Arif <[email protected]> + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License along + * with FFmpeg; if not, write to the Free Software Foundation, Inc., + * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + */ + +#include <stdint.h> + +#include "vf_fsppdsp.h" + +#include "libavutil/mathematics.h" +#include "libavutil/mem_internal.h" + +#define DCTSIZE 8 + +#define FIX(x,s) ((x) * (1 << s) + 0.5) + +#define MULTIPLY16H(x,k) (((x) * (k)) >> 16) +#define THRESHOLD(r,x,t) \ + if(((unsigned)((x) + t)) > t * 2) r = (x); \ + else r = 0; +#define DESCALE(x,n) (((x) + (1 << ((n) - 1))) >> n) + +typedef int32_t int_simd16_t; +static const int16_t FIX_0_382683433 = FIX(0.382683433, 14); +static const int16_t FIX_0_541196100 = FIX(0.541196100, 14); +static const int16_t FIX_0_707106781 = FIX(M_SQRT1_2 , 14); +static const int16_t FIX_1_306562965 = FIX(1.306562965, 14); +static const int16_t FIX_1_414213562_A = FIX(M_SQRT2 , 14); +static const int16_t FIX_1_847759065 = FIX(1.847759065, 13); +static const int16_t FIX_2_613125930 = FIX(-2.613125930, 13); +static const int16_t FIX_1_414213562 = FIX(M_SQRT2 , 13); +static const int16_t FIX_1_082392200 = FIX(1.082392200, 13); + +DECLARE_ALIGNED(32, static const uint8_t, dither)[8][8] = { + { 0, 48, 12, 60, 3, 51, 15, 63, }, + { 32, 16, 44, 28, 35, 19, 47, 31, }, + { 
8, 56, 4, 52, 11, 59, 7, 55, }, + { 40, 24, 36, 20, 43, 27, 39, 23, }, + { 2, 50, 14, 62, 1, 49, 13, 61, }, + { 34, 18, 46, 30, 33, 17, 45, 29, }, + { 10, 58, 6, 54, 9, 57, 5, 53, }, + { 42, 26, 38, 22, 41, 25, 37, 21, }, +}; + +//This func reads from 1 slice, 1 and clears 0 & 1 +void ff_store_slice_c(uint8_t *dst, int16_t *src, + ptrdiff_t dst_stride, ptrdiff_t src_stride, + ptrdiff_t width, ptrdiff_t height, ptrdiff_t log2_scale) +{ +#define STORE(pos) \ + temp = (src[x + pos] + (d[pos] >> log2_scale)) >> (6 - log2_scale); \ + src[x + pos] = src[x + pos - 8 * src_stride] = 0; \ + if (temp & 0x100) temp = ~(temp >> 31); \ + dst[x + pos] = temp; + + for (int y = 0; y < height; y++) { + const uint8_t *d = dither[y]; + for (int x = 0; x < width; x += 8) { + int temp; + STORE(0); + STORE(1); + STORE(2); + STORE(3); + STORE(4); + STORE(5); + STORE(6); + STORE(7); + } + src += src_stride; + dst += dst_stride; + } +} + +//This func reads from 2 slices, 0 & 2 and clears 2-nd +void ff_store_slice2_c(uint8_t *dst, int16_t *src, + ptrdiff_t dst_stride, ptrdiff_t src_stride, + ptrdiff_t width, ptrdiff_t height, ptrdiff_t log2_scale) +{ +#define STORE2(pos) \ + temp = (src[x + pos] + src[x + pos + 16 * src_stride] + (d[pos] >> log2_scale)) >> (6 - log2_scale); \ + src[x + pos + 16 * src_stride] = 0; \ + if (temp & 0x100) temp = ~(temp >> 31); \ + dst[x + pos] = temp; + + for (int y = 0; y < height; y++) { + const uint8_t *d = dither[y]; + for (int x = 0; x < width; x += 8) { + int temp; + STORE2(0); + STORE2(1); + STORE2(2); + STORE2(3); + STORE2(4); + STORE2(5); + STORE2(6); + STORE2(7); + } + src += src_stride; + dst += dst_stride; + } +} + +void ff_mul_thrmat_c(int16_t *thr_adr_noq, int16_t *thr_adr, int q) +{ + for (int a = 0; a < 64; a++) + thr_adr[a] = q * thr_adr_noq[a]; +} + +void ff_column_fidct_c(int16_t *thr_adr, int16_t *data, int16_t *output, int cnt) +{ + int_simd16_t tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; + int_simd16_t tmp10, tmp11, tmp12, tmp13; + 
int_simd16_t z1,z2,z3,z4,z5, z10, z11, z12, z13; + int_simd16_t d0, d1, d2, d3, d4, d5, d6, d7; + + int16_t *dataptr; + int16_t *wsptr; + int16_t *threshold; + + dataptr = data; + wsptr = output; + + for (; cnt > 0; cnt -= 2) { //start positions + threshold = (int16_t *)thr_adr;//threshold_mtx + for (int ctr = DCTSIZE; ctr > 0; ctr--) { + // Process columns from input, add to output. + tmp0 = dataptr[DCTSIZE * 0] + dataptr[DCTSIZE * 7]; + tmp7 = dataptr[DCTSIZE * 0] - dataptr[DCTSIZE * 7]; + + tmp1 = dataptr[DCTSIZE * 1] + dataptr[DCTSIZE * 6]; + tmp6 = dataptr[DCTSIZE * 1] - dataptr[DCTSIZE * 6]; + + tmp2 = dataptr[DCTSIZE * 2] + dataptr[DCTSIZE * 5]; + tmp5 = dataptr[DCTSIZE * 2] - dataptr[DCTSIZE * 5]; + + tmp3 = dataptr[DCTSIZE * 3] + dataptr[DCTSIZE * 4]; + tmp4 = dataptr[DCTSIZE * 3] - dataptr[DCTSIZE * 4]; + + // Even part of FDCT + + tmp10 = tmp0 + tmp3; + tmp13 = tmp0 - tmp3; + tmp11 = tmp1 + tmp2; + tmp12 = tmp1 - tmp2; + + d0 = tmp10 + tmp11; + d4 = tmp10 - tmp11; + + z1 = MULTIPLY16H((tmp12 + tmp13) << 2, FIX_0_707106781); + d2 = tmp13 + z1; + d6 = tmp13 - z1; + + // Even part of IDCT + + THRESHOLD(tmp0, d0, threshold[0 * 8]); + THRESHOLD(tmp1, d2, threshold[2 * 8]); + THRESHOLD(tmp2, d4, threshold[4 * 8]); + THRESHOLD(tmp3, d6, threshold[6 * 8]); + tmp0 += 2; + tmp10 = (tmp0 + tmp2) >> 2; + tmp11 = (tmp0 - tmp2) >> 2; + + tmp13 = (tmp1 + tmp3) >>2; //+2 ! 
(psnr decides) + tmp12 = MULTIPLY16H((tmp1 - tmp3), FIX_1_414213562_A) - tmp13; //<<2 + + tmp0 = tmp10 + tmp13; //->temps + tmp3 = tmp10 - tmp13; //->temps + tmp1 = tmp11 + tmp12; //->temps + tmp2 = tmp11 - tmp12; //->temps + + // Odd part of FDCT + + tmp10 = tmp4 + tmp5; + tmp11 = tmp5 + tmp6; + tmp12 = tmp6 + tmp7; + + z5 = MULTIPLY16H((tmp10 - tmp12) << 2, FIX_0_382683433); + z2 = MULTIPLY16H(tmp10 << 2, FIX_0_541196100) + z5; + z4 = MULTIPLY16H(tmp12 << 2, FIX_1_306562965) + z5; + z3 = MULTIPLY16H(tmp11 << 2, FIX_0_707106781); + + z11 = tmp7 + z3; + z13 = tmp7 - z3; + + d5 = z13 + z2; + d3 = z13 - z2; + d1 = z11 + z4; + d7 = z11 - z4; + + // Odd part of IDCT + + THRESHOLD(tmp4, d1, threshold[1 * 8]); + THRESHOLD(tmp5, d3, threshold[3 * 8]); + THRESHOLD(tmp6, d5, threshold[5 * 8]); + THRESHOLD(tmp7, d7, threshold[7 * 8]); + + //Simd version uses here a shortcut for the tmp5,tmp6,tmp7 == 0 + z13 = tmp6 + tmp5; + z10 = (tmp6 - tmp5) << 1; + z11 = tmp4 + tmp7; + z12 = (tmp4 - tmp7) << 1; + + tmp7 = (z11 + z13) >> 2; //+2 ! + tmp11 = MULTIPLY16H((z11 - z13) << 1, FIX_1_414213562); + z5 = MULTIPLY16H(z10 + z12, FIX_1_847759065); + tmp10 = MULTIPLY16H(z12, FIX_1_082392200) - z5; + tmp12 = MULTIPLY16H(z10, FIX_2_613125930) + z5; // - !! 
+ + tmp6 = tmp12 - tmp7; + tmp5 = tmp11 - tmp6; + tmp4 = tmp10 + tmp5; + + wsptr[DCTSIZE * 0] += (tmp0 + tmp7); + wsptr[DCTSIZE * 1] += (tmp1 + tmp6); + wsptr[DCTSIZE * 2] += (tmp2 + tmp5); + wsptr[DCTSIZE * 3] += (tmp3 - tmp4); + wsptr[DCTSIZE * 4] += (tmp3 + tmp4); + wsptr[DCTSIZE * 5] += (tmp2 - tmp5); + wsptr[DCTSIZE * 6] = (tmp1 - tmp6); + wsptr[DCTSIZE * 7] = (tmp0 - tmp7); + // + dataptr++; //next column + wsptr++; + threshold++; + } + dataptr += 8; //skip each second start pos + wsptr += 8; + } +} + +void ff_row_idct_c(int16_t *workspace, int16_t *output_adr, ptrdiff_t output_stride, int cnt) +{ + int_simd16_t tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; + int_simd16_t tmp10, tmp11, tmp12, tmp13; + int_simd16_t z5, z10, z11, z12, z13; + int16_t *outptr; + int16_t *wsptr; + + cnt *= 4; + wsptr = workspace; + outptr = output_adr; + for (; cnt > 0; cnt--) { + // Even part + //Simd version reads 4x4 block and transposes it + tmp10 = wsptr[2] + wsptr[3]; + tmp11 = wsptr[2] - wsptr[3]; + + tmp13 = wsptr[0] + wsptr[1]; + tmp12 = (MULTIPLY16H(wsptr[0] - wsptr[1], FIX_1_414213562_A) << 2) - tmp13;//this shift order to avoid overflow + + tmp0 = tmp10 + tmp13; //->temps + tmp3 = tmp10 - tmp13; //->temps + tmp1 = tmp11 + tmp12; + tmp2 = tmp11 - tmp12; + + // Odd part + //Also transpose, with previous: + // ---- ---- |||| + // ---- ---- idct |||| + // ---- ---- ---> |||| + // ---- ---- |||| + z13 = wsptr[4] + wsptr[5]; + z10 = wsptr[4] - wsptr[5]; + z11 = wsptr[6] + wsptr[7]; + z12 = wsptr[6] - wsptr[7]; + + tmp7 = z11 + z13; + tmp11 = MULTIPLY16H(z11 - z13, FIX_1_414213562); + + z5 = MULTIPLY16H(z10 + z12, FIX_1_847759065); + tmp10 = MULTIPLY16H(z12, FIX_1_082392200) - z5; + tmp12 = MULTIPLY16H(z10, FIX_2_613125930) + z5; // - FIX_ + + tmp6 = (tmp12 << 3) - tmp7; + tmp5 = (tmp11 << 3) - tmp6; + tmp4 = (tmp10 << 3) + tmp5; + + // Final output stage: descale and write column + outptr[0 * output_stride] += DESCALE(tmp0 + tmp7, 3); + outptr[1 * output_stride] += 
DESCALE(tmp1 + tmp6, 3); + outptr[2 * output_stride] += DESCALE(tmp2 + tmp5, 3); + outptr[3 * output_stride] += DESCALE(tmp3 - tmp4, 3); + outptr[4 * output_stride] += DESCALE(tmp3 + tmp4, 3); + outptr[5 * output_stride] += DESCALE(tmp2 - tmp5, 3); + outptr[6 * output_stride] += DESCALE(tmp1 - tmp6, 3); //no += ? + outptr[7 * output_stride] += DESCALE(tmp0 - tmp7, 3); //no += ? + outptr++; + + wsptr += DCTSIZE; // advance pointer to next row + } +} + +void ff_row_fdct_c(int16_t *data, const uint8_t *pixels, ptrdiff_t line_size, int cnt) +{ + int_simd16_t tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; + int_simd16_t tmp10, tmp11, tmp12, tmp13; + int_simd16_t z1, z2, z3, z4, z5, z11, z13; + int16_t *dataptr; + + cnt *= 4; + // Pass 1: process rows. + + dataptr = data; + for (; cnt > 0; cnt--) { + tmp0 = pixels[line_size * 0] + pixels[line_size * 7]; + tmp7 = pixels[line_size * 0] - pixels[line_size * 7]; + tmp1 = pixels[line_size * 1] + pixels[line_size * 6]; + tmp6 = pixels[line_size * 1] - pixels[line_size * 6]; + tmp2 = pixels[line_size * 2] + pixels[line_size * 5]; + tmp5 = pixels[line_size * 2] - pixels[line_size * 5]; + tmp3 = pixels[line_size * 3] + pixels[line_size * 4]; + tmp4 = pixels[line_size * 3] - pixels[line_size * 4]; + + // Even part + + tmp10 = tmp0 + tmp3; + tmp13 = tmp0 - tmp3; + tmp11 = tmp1 + tmp2; + tmp12 = tmp1 - tmp2; + //Even columns are written first, this leads to different order of columns + //in column_fidct(), but they are processed independently, so all ok. + //Later in the row_idct() columns are read in the same order. 
+ dataptr[2] = tmp10 + tmp11; + dataptr[3] = tmp10 - tmp11; + + z1 = MULTIPLY16H((tmp12 + tmp13) << 2, FIX_0_707106781); + dataptr[0] = tmp13 + z1; + dataptr[1] = tmp13 - z1; + + // Odd part + + tmp10 = (tmp4 + tmp5) << 2; + tmp11 = (tmp5 + tmp6) << 2; + tmp12 = (tmp6 + tmp7) << 2; + + z5 = MULTIPLY16H(tmp10 - tmp12, FIX_0_382683433); + z2 = MULTIPLY16H(tmp10, FIX_0_541196100) + z5; + z4 = MULTIPLY16H(tmp12, FIX_1_306562965) + z5; + z3 = MULTIPLY16H(tmp11, FIX_0_707106781); + + z11 = tmp7 + z3; + z13 = tmp7 - z3; + + dataptr[4] = z13 + z2; + dataptr[5] = z13 - z2; + dataptr[6] = z11 + z4; + dataptr[7] = z11 - z4; + + pixels++; // advance pointer to next column + dataptr += DCTSIZE; + } +} diff --git a/libavfilter/vf_fspp.h b/libavfilter/vf_fsppdsp.h similarity index 52% rename from libavfilter/vf_fspp.h rename to libavfilter/vf_fsppdsp.h index ee7de3ffef..c441b75094 100644 --- a/libavfilter/vf_fspp.h +++ b/libavfilter/vf_fsppdsp.h @@ -20,56 +20,17 @@ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 
*/ -#ifndef AVFILTER_FSPP_H -#define AVFILTER_FSPP_H +#ifndef AVFILTER_FSPPDSP_H +#define AVFILTER_FSPPDSP_H -#include "libavutil/video_enc_params.h" -#include "avfilter.h" +#include <stddef.h> +#include <stdint.h> -#define BLOCKSZ 12 -#define MAX_LEVEL 5 +#include "config.h" -#define DCTSIZE 8 -#define DCTSIZE_S "8" - -#define FIX(x,s) ((x) * (1 << s) + 0.5) - -#define MULTIPLY16H(x,k) (((x) * (k)) >> 16) -#define THRESHOLD(r,x,t) \ - if(((unsigned)((x) + t)) > t * 2) r = (x); \ - else r = 0; -#define DESCALE(x,n) (((x) + (1 << ((n) - 1))) >> n) - -typedef int32_t int_simd16_t; -static const int16_t FIX_0_382683433 = FIX(0.382683433, 14); -static const int16_t FIX_0_541196100 = FIX(0.541196100, 14); -static const int16_t FIX_0_707106781 = FIX(M_SQRT1_2 , 14); -static const int16_t FIX_1_306562965 = FIX(1.306562965, 14); -static const int16_t FIX_1_414213562_A = FIX(M_SQRT2 , 14); -static const int16_t FIX_1_847759065 = FIX(1.847759065, 13); -static const int16_t FIX_2_613125930 = FIX(-2.613125930, 13); -static const int16_t FIX_1_414213562 = FIX(M_SQRT2 , 13); -static const int16_t FIX_1_082392200 = FIX(1.082392200, 13); - -typedef struct FSPPContext { - AVClass *class; - uint64_t threshold_mtx_noq[8 * 2]; - uint64_t threshold_mtx[8 * 2]; //used in both C & MMX (& later SSE2) versions - - int log2_count; - int strength; - int hsub; - int vsub; - int temp_stride; - int qp; - enum AVVideoEncParamsType qscale_type; - int prev_q; - uint8_t *src; - int16_t *temp; - int8_t *non_b_qp_table; - int non_b_qp_stride; - int use_bframe_qp; +#include "libavutil/attributes_internal.h" +typedef struct FSPPDSPContext { void (*store_slice)(uint8_t *dst, int16_t *src, ptrdiff_t dst_stride, ptrdiff_t src_stride, ptrdiff_t width, ptrdiff_t height, ptrdiff_t log2_scale); @@ -88,9 +49,35 @@ typedef struct FSPPContext { void (*row_fdct)(int16_t *data, const uint8_t *pixels, ptrdiff_t line_size, int cnt); +} FSPPDSPContext; -} FSPPContext; +FF_VISIBILITY_PUSH_HIDDEN +void 
ff_store_slice_c(uint8_t *dst, int16_t *src, + ptrdiff_t dst_stride, ptrdiff_t src_stride, + ptrdiff_t width, ptrdiff_t height, ptrdiff_t log2_scale); +void ff_store_slice2_c(uint8_t *dst, int16_t *src, + ptrdiff_t dst_stride, ptrdiff_t src_stride, + ptrdiff_t width, ptrdiff_t height, ptrdiff_t log2_scale); +void ff_mul_thrmat_c(int16_t *thr_adr_noq, int16_t *thr_adr, int q); +void ff_column_fidct_c(int16_t *thr_adr, int16_t *data, int16_t *output, int cnt); +void ff_row_idct_c(int16_t *workspace, int16_t *output_adr, ptrdiff_t output_stride, int cnt); +void ff_row_fdct_c(int16_t *data, const uint8_t *pixels, ptrdiff_t line_size, int cnt); -void ff_fspp_init_x86(FSPPContext *fspp); +void ff_fsppdsp_init_x86(FSPPDSPContext *fspp); +FF_VISIBILITY_POP_HIDDEN -#endif /* AVFILTER_FSPP_H */ +static inline void ff_fsppdsp_init(FSPPDSPContext *fspp) +{ + fspp->store_slice = ff_store_slice_c; + fspp->store_slice2 = ff_store_slice2_c; + fspp->mul_thrmat = ff_mul_thrmat_c; + fspp->column_fidct = ff_column_fidct_c; + fspp->row_idct = ff_row_idct_c; + fspp->row_fdct = ff_row_fdct_c; + +#if ARCH_X86 + ff_fsppdsp_init_x86(fspp); +#endif +} + +#endif /* AVFILTER_FSPPDSP_H */ diff --git a/libavfilter/x86/vf_fspp_init.c b/libavfilter/x86/vf_fspp_init.c index 8e00317cb7..2aadb50967 100644 --- a/libavfilter/x86/vf_fspp_init.c +++ b/libavfilter/x86/vf_fspp_init.c @@ -21,7 +21,7 @@ #include "libavutil/attributes.h" #include "libavutil/x86/cpu.h" -#include "libavfilter/vf_fspp.h" +#include "libavfilter/vf_fsppdsp.h" void ff_store_slice_mmx(uint8_t *dst, int16_t *src, ptrdiff_t dst_stride, ptrdiff_t src_stride, @@ -34,7 +34,7 @@ void ff_column_fidct_mmx(int16_t *thr_adr, int16_t *data, int16_t *output, int c void ff_row_idct_mmx(int16_t *workspace, int16_t *output_adr, ptrdiff_t output_stride, int cnt); void ff_row_fdct_mmx(int16_t *data, const uint8_t *pixels, ptrdiff_t line_size, int cnt); -av_cold void ff_fspp_init_x86(FSPPContext *s) +av_cold void ff_fsppdsp_init_x86(FSPPDSPContext 
*s) { int cpu_flags = av_get_cpu_flags(); -- 2.49.1 >From 4f3d8ea9d11842357998cca26f502831d5d5c9c0 Mon Sep 17 00:00:00 2001 From: Andreas Rheinhardt <[email protected]> Date: Sun, 9 Nov 2025 17:22:21 +0100 Subject: [PATCH 02/23] avfilter/vf_fsppdsp: Use enum for constants It means that the compiler does not have to optimize the static const object away. Signed-off-by: Andreas Rheinhardt <[email protected]> --- libavfilter/vf_fsppdsp.c | 23 +++++++++++++---------- 1 file changed, 13 insertions(+), 10 deletions(-) diff --git a/libavfilter/vf_fsppdsp.c b/libavfilter/vf_fsppdsp.c index ab31c77203..d2d04463b4 100644 --- a/libavfilter/vf_fsppdsp.c +++ b/libavfilter/vf_fsppdsp.c @@ -29,7 +29,7 @@ #define DCTSIZE 8 -#define FIX(x,s) ((x) * (1 << s) + 0.5) +#define FIX(x,s) (int)((x) * (1 << s) + 0.5) #define MULTIPLY16H(x,k) (((x) * (k)) >> 16) #define THRESHOLD(r,x,t) \ @@ -38,15 +38,18 @@ #define DESCALE(x,n) (((x) + (1 << ((n) - 1))) >> n) typedef int32_t int_simd16_t; -static const int16_t FIX_0_382683433 = FIX(0.382683433, 14); -static const int16_t FIX_0_541196100 = FIX(0.541196100, 14); -static const int16_t FIX_0_707106781 = FIX(M_SQRT1_2 , 14); -static const int16_t FIX_1_306562965 = FIX(1.306562965, 14); -static const int16_t FIX_1_414213562_A = FIX(M_SQRT2 , 14); -static const int16_t FIX_1_847759065 = FIX(1.847759065, 13); -static const int16_t FIX_2_613125930 = FIX(-2.613125930, 13); -static const int16_t FIX_1_414213562 = FIX(M_SQRT2 , 13); -static const int16_t FIX_1_082392200 = FIX(1.082392200, 13); + +enum { + FIX_0_382683433 = FIX(0.382683433, 14), + FIX_0_541196100 = FIX(0.541196100, 14), + FIX_0_707106781 = FIX(M_SQRT1_2 , 14), + FIX_1_306562965 = FIX(1.306562965, 14), + FIX_1_414213562_A = FIX(M_SQRT2 , 14), + FIX_1_847759065 = FIX(1.847759065, 13), + FIX_2_613125930 = FIX(-2.613125930, 13), + FIX_1_414213562 = FIX(M_SQRT2 , 13), + FIX_1_082392200 = FIX(1.082392200, 13), +}; DECLARE_ALIGNED(32, static const uint8_t, dither)[8][8] = { { 0, 48, 12, 60, 
3, 51, 15, 63, }, -- 2.49.1 >From 787c89a3ac68fa1d023e2f06c653b55ba26f0917 Mon Sep 17 00:00:00 2001 From: Andreas Rheinhardt <[email protected]> Date: Sun, 9 Nov 2025 17:27:16 +0100 Subject: [PATCH 03/23] avfilter/x86/vf_fspp: Don't duplicate dither table Reuse the one from vf_fsppdsp.c; also don't overalign said table too much. Signed-off-by: Andreas Rheinhardt <[email protected]> --- libavfilter/vf_fsppdsp.c | 6 +++--- libavfilter/vf_fsppdsp.h | 2 ++ libavfilter/x86/vf_fspp.asm | 9 +++------ 3 files changed, 8 insertions(+), 9 deletions(-) diff --git a/libavfilter/vf_fsppdsp.c b/libavfilter/vf_fsppdsp.c index d2d04463b4..b84d7b57bb 100644 --- a/libavfilter/vf_fsppdsp.c +++ b/libavfilter/vf_fsppdsp.c @@ -51,7 +51,7 @@ enum { FIX_1_082392200 = FIX(1.082392200, 13), }; -DECLARE_ALIGNED(32, static const uint8_t, dither)[8][8] = { +DECLARE_ALIGNED(8, const uint8_t, ff_fspp_dither)[8][8] = { { 0, 48, 12, 60, 3, 51, 15, 63, }, { 32, 16, 44, 28, 35, 19, 47, 31, }, { 8, 56, 4, 52, 11, 59, 7, 55, }, @@ -74,7 +74,7 @@ void ff_store_slice_c(uint8_t *dst, int16_t *src, dst[x + pos] = temp; for (int y = 0; y < height; y++) { - const uint8_t *d = dither[y]; + const uint8_t *d = ff_fspp_dither[y]; for (int x = 0; x < width; x += 8) { int temp; STORE(0); @@ -103,7 +103,7 @@ void ff_store_slice2_c(uint8_t *dst, int16_t *src, dst[x + pos] = temp; for (int y = 0; y < height; y++) { - const uint8_t *d = dither[y]; + const uint8_t *d = ff_fspp_dither[y]; for (int x = 0; x < width; x += 8) { int temp; STORE2(0); diff --git a/libavfilter/vf_fsppdsp.h b/libavfilter/vf_fsppdsp.h index c441b75094..0dbd628abf 100644 --- a/libavfilter/vf_fsppdsp.h +++ b/libavfilter/vf_fsppdsp.h @@ -52,6 +52,8 @@ typedef struct FSPPDSPContext { } FSPPDSPContext; FF_VISIBILITY_PUSH_HIDDEN +extern const uint8_t ff_fspp_dither[8][8]; + void ff_store_slice_c(uint8_t *dst, int16_t *src, ptrdiff_t dst_stride, ptrdiff_t src_stride, ptrdiff_t width, ptrdiff_t height, ptrdiff_t log2_scale); diff --git 
a/libavfilter/x86/vf_fspp.asm b/libavfilter/x86/vf_fspp.asm index c7f8f64f1b..0ea6216193 100644 --- a/libavfilter/x86/vf_fspp.asm +++ b/libavfilter/x86/vf_fspp.asm @@ -25,10 +25,7 @@ SECTION_RODATA -pb_dither: db 0, 48, 12, 60, 3, 51, 15, 63, 32, 16, 44, 28, 35, 19, 47, 31, \ - 8, 56, 4, 52, 11, 59, 7, 55, 40, 24, 36, 20, 43, 27, 39, 23, \ - 2, 50, 14, 62, 1, 49, 13, 61, 34, 18, 46, 30, 33, 17, 45, 29, \ - 10, 58, 6, 54, 9, 57, 5, 53, 42, 26, 38, 22, 41, 25, 37, 21 +cextern fspp_dither pw_187E: times 4 dw 0x187E ; FIX64(0.382683433, 14) pw_22A3: times 4 dw 0x22A3 ; FIX64(1.082392200, 13) pw_2D41: times 4 dw 0x2D41 ; FIX64(1.414213562, 13) @@ -73,7 +70,7 @@ cglobal store_slice, 2, 7, 0, dst, src, width, dither_height, dither, tmp, tmp2 sub tmp2q, widthq movd m2, ditherd ; log2_scale add tmp2q, tmp2q - lea ditherq, [pb_dither] + lea ditherq, [fspp_dither] mov src_strideq, tmp2q shl tmpq, 4 lea dither_heightq, [ditherq+dither_heightq*8] @@ -139,7 +136,7 @@ cglobal store_slice2, 0, 7, 0, dst, src, width, dither_height, dither, tmp, tmp2 sub tmp2q, widthq movd m2, ditherd ; log2_scale add tmp2q, tmp2q - lea ditherq, [pb_dither] + lea ditherq, [fspp_dither] mov src_strideq, tmp2q shl tmpq, 5 lea dither_heightq, [ditherq+dither_heightq*8] -- 2.49.1 >From 659b75505b3b0e03a20701f7f8ebf77dd954205b Mon Sep 17 00:00:00 2001 From: Andreas Rheinhardt <[email protected]> Date: Sun, 9 Nov 2025 18:50:48 +0100 Subject: [PATCH 04/23] tests/checkasm: Add vf_fspp mul_thrmat test Signed-off-by: Andreas Rheinhardt <[email protected]> --- tests/checkasm/Makefile | 1 + tests/checkasm/checkasm.c | 3 +++ tests/checkasm/checkasm.h | 1 + tests/checkasm/vf_fspp.c | 52 +++++++++++++++++++++++++++++++++++++++ tests/fate/checkasm.mak | 1 + 5 files changed, 58 insertions(+) create mode 100644 tests/checkasm/vf_fspp.c diff --git a/tests/checkasm/Makefile b/tests/checkasm/Makefile index e47070d90f..6636bc7774 100644 --- a/tests/checkasm/Makefile +++ b/tests/checkasm/Makefile @@ -64,6 +64,7 @@ 
AVFILTEROBJS-$(CONFIG_BWDIF_FILTER) += vf_bwdif.o AVFILTEROBJS-$(CONFIG_COLORDETECT_FILTER)+= vf_colordetect.o AVFILTEROBJS-$(CONFIG_COLORSPACE_FILTER) += vf_colorspace.o AVFILTEROBJS-$(CONFIG_EQ_FILTER) += vf_eq.o +AVFILTEROBJS-$(CONFIG_FSPP_FILTER) += vf_fspp.o AVFILTEROBJS-$(CONFIG_GBLUR_FILTER) += vf_gblur.o AVFILTEROBJS-$(CONFIG_HFLIP_FILTER) += vf_hflip.o AVFILTEROBJS-$(CONFIG_IDET_FILTER) += vf_idet.o diff --git a/tests/checkasm/checkasm.c b/tests/checkasm/checkasm.c index 4469e043f5..20d8f19757 100644 --- a/tests/checkasm/checkasm.c +++ b/tests/checkasm/checkasm.c @@ -297,6 +297,9 @@ static const struct { #if CONFIG_EQ_FILTER { "vf_eq", checkasm_check_vf_eq }, #endif + #if CONFIG_FSPP_FILTER + { "vf_fspp", checkasm_check_vf_fspp }, + #endif #if CONFIG_GBLUR_FILTER { "vf_gblur", checkasm_check_vf_gblur }, #endif diff --git a/tests/checkasm/checkasm.h b/tests/checkasm/checkasm.h index e1ccd4011b..45cd23cac4 100644 --- a/tests/checkasm/checkasm.h +++ b/tests/checkasm/checkasm.h @@ -148,6 +148,7 @@ void checkasm_check_v210enc(void); void checkasm_check_vc1dsp(void); void checkasm_check_vf_bwdif(void); void checkasm_check_vf_eq(void); +void checkasm_check_vf_fspp(void); void checkasm_check_vf_gblur(void); void checkasm_check_vf_hflip(void); void checkasm_check_vf_threshold(void); diff --git a/tests/checkasm/vf_fspp.c b/tests/checkasm/vf_fspp.c new file mode 100644 index 0000000000..a84ae8d5af --- /dev/null +++ b/tests/checkasm/vf_fspp.c @@ -0,0 +1,52 @@ +/* + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License along + * with FFmpeg; if not, write to the Free Software Foundation, Inc., + * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + */ + +#include "checkasm.h" +#include "libavfilter/vf_fsppdsp.h" + +#define randomize_buffers(buf) \ + do { \ + for (size_t j = 0; j < FF_ARRAY_ELEMS(buf); ++j) \ + buf[j] = rnd(); \ + } while (0) + + +static void check_mul_thrmat(void) +{ + FSPPDSPContext fspp; + int16_t src[64]; + int16_t dst_ref[64], dst_new[64]; + const int q = (uint8_t)rnd(); + declare_func_emms(AV_CPU_FLAG_MMX, void, int16_t *thr_adr_noq, int16_t *thr_adr, int q); + + ff_fsppdsp_init(&fspp); + + if (check_func(fspp.mul_thrmat, "mul_thrmat")) { + randomize_buffers(src); + call_ref(src, dst_ref, q); + call_new(src, dst_new, q); + if (memcmp(dst_ref, dst_new, sizeof(dst_ref))) + fail(); + bench_new(src, dst_new, q); + } +} + +void checkasm_check_vf_fspp(void) +{ + check_mul_thrmat(); +} diff --git a/tests/fate/checkasm.mak b/tests/fate/checkasm.mak index ca1cd0dea3..2be880c8db 100644 --- a/tests/fate/checkasm.mak +++ b/tests/fate/checkasm.mak @@ -67,6 +67,7 @@ FATE_CHECKASM = fate-checkasm-aacencdsp \ fate-checkasm-vf_colordetect \ fate-checkasm-vf_colorspace \ fate-checkasm-vf_eq \ + fate-checkasm-vf_fspp \ fate-checkasm-vf_gblur \ fate-checkasm-vf_hflip \ fate-checkasm-vf_nlmeans \ -- 2.49.1 >From bd0b98cc10caea569331eae8fd1af13d4d546ddb Mon Sep 17 00:00:00 2001 From: Andreas Rheinhardt <[email protected]> Date: Sun, 9 Nov 2025 19:10:30 +0100 Subject: [PATCH 05/23] avfilter/x86/vf_fspp: Port mul_thrmat to SSE2 This fixes an ABI violation, as mul_thrmat did not issue emms. It seems that this ABI violation could reach the user, namely if ff_get_video_buffer() fails. Notice that ff_get_video_buffer() itself could fail because of this, namely if the allocator uses floating point registers. 
On x64 (where GCC already used SSE2 in the C version) mul_thrmat_c: 4.4 ( 1.00x) mul_thrmat_mmx: 8.6 ( 0.52x) mul_thrmat_sse2: 4.4 ( 1.00x) On 32bit (where SSE2 is not known to be available): mul_thrmat_c: 56.0 ( 1.00x) mul_thrmat_sse2: 6.0 ( 9.40x) Signed-off-by: Andreas Rheinhardt <[email protected]> --- libavfilter/vf_fspp.c | 5 +- libavfilter/vf_fsppdsp.h | 3 +- libavfilter/x86/vf_fspp.asm | 84 +++++++++++++--------------------- libavfilter/x86/vf_fspp_init.c | 6 ++- tests/checkasm/vf_fspp.c | 8 ++-- 5 files changed, 45 insertions(+), 61 deletions(-) diff --git a/libavfilter/vf_fspp.c b/libavfilter/vf_fspp.c index 9371c63e77..fa562cbd45 100644 --- a/libavfilter/vf_fspp.c +++ b/libavfilter/vf_fspp.c @@ -54,8 +54,6 @@ typedef struct FSPPContext { const struct AVClass *class; - uint64_t threshold_mtx_noq[8 * 2]; - uint64_t threshold_mtx[8 * 2]; //used in both C & MMX (& later SSE2) versions int log2_count; int strength; @@ -72,6 +70,9 @@ typedef struct FSPPContext { int use_bframe_qp; FSPPDSPContext dsp; + + DECLARE_ALIGNED(16, uint64_t, threshold_mtx_noq)[8 * 2]; + DECLARE_ALIGNED(16, uint64_t, threshold_mtx)[8 * 2]; } FSPPContext; diff --git a/libavfilter/vf_fsppdsp.h b/libavfilter/vf_fsppdsp.h index 0dbd628abf..e87fa6861c 100644 --- a/libavfilter/vf_fsppdsp.h +++ b/libavfilter/vf_fsppdsp.h @@ -39,7 +39,8 @@ typedef struct FSPPDSPContext { ptrdiff_t dst_stride, ptrdiff_t src_stride, ptrdiff_t width, ptrdiff_t height, ptrdiff_t log2_scale); - void (*mul_thrmat)(int16_t *thr_adr_noq, int16_t *thr_adr, int q); + void (*mul_thrmat)(int16_t *thr_adr_noq /* align 16 */, + int16_t *thr_adr /* align 16 */, int q); void (*column_fidct)(int16_t *thr_adr, int16_t *data, int16_t *output, int cnt); diff --git a/libavfilter/x86/vf_fspp.asm b/libavfilter/x86/vf_fspp.asm index 0ea6216193..c9408978d8 100644 --- a/libavfilter/x86/vf_fspp.asm +++ b/libavfilter/x86/vf_fspp.asm @@ -177,59 +177,36 @@ cglobal store_slice2, 0, 7, 0, dst, src, width, dither_height, dither, tmp, tmp2 jl 
.loop_height RET -;void ff_mul_thrmat_mmx(int16_t *thr_adr_noq, int16_t *thr_adr, int q); -cglobal mul_thrmat, 3, 3, 0, thrn, thr, q - movd m7, qd - movq m0, [thrnq] - punpcklwd m7, m7 - movq m1, [thrnq+8] - punpckldq m7, m7 - pmullw m0, m7 - movq m2, [thrnq+8*2] - pmullw m1, m7 - movq m3, [thrnq+8*3] - pmullw m2, m7 - movq [thrq], m0 - movq m4, [thrnq+8*4] - pmullw m3, m7 - movq [thrq+8], m1 - movq m5, [thrnq+8*5] - pmullw m4, m7 - movq [thrq+8*2], m2 - movq m6, [thrnq+8*6] - pmullw m5, m7 - movq [thrq+8*3], m3 - movq m0, [thrnq+8*7] - pmullw m6, m7 - movq [thrq+8*4], m4 - movq m1, [thrnq+8*7+8] - pmullw m0, m7 - movq [thrq+8*5], m5 - movq m2, [thrnq+8*7+8*2] - pmullw m1, m7 - movq [thrq+8*6], m6 - movq m3, [thrnq+8*7+8*3] - pmullw m2, m7 - movq [thrq+8*7], m0 - movq m4, [thrnq+8*7+8*4] - pmullw m3, m7 - movq [thrq+8*7+8], m1 - movq m5, [thrnq+8*7+8*5] - pmullw m4, m7 - movq [thrq+8*7+8*2], m2 - movq m6, [thrnq+8*7+8*6] - pmullw m5, m7 - movq [thrq+8*7+8*3], m3 - movq m0, [thrnq+14*8] - pmullw m6, m7 - movq [thrq+8*7+8*4], m4 - movq m1, [thrnq+14*8+8] - pmullw m0, m7 - movq [thrq+8*7+8*5], m5 - pmullw m1, m7 - movq [thrq+8*7+8*6], m6 - movq [thrq+14*8], m0 - movq [thrq+14*8+8], m1 +;void ff_mul_thrmat_sse2(int16_t *thr_adr_noq, int16_t *thr_adr, int q); +INIT_XMM sse2 +cglobal mul_thrmat, 3, 3, 5, thrn, thr, q + movd m4, qd + mova m0, [thrnq] + punpcklwd m4, m4 + mova m1, [thrnq+16] + pshufd m4, m4, 0 + pmullw m0, m4 + mova m2, [thrnq+16*2] + pmullw m1, m4 + mova m3, [thrnq+16*3] + pmullw m2, m4 + mova [thrq], m0 + mova m0, [thrnq+16*4] + pmullw m3, m4 + mova [thrq+16], m1 + mova m1, [thrnq+16*5] + pmullw m0, m4 + mova [thrq+16*2], m2 + mova m2, [thrnq+16*6] + pmullw m1, m4 + mova [thrq+16*3], m3 + mova m3, [thrnq+16*7] + pmullw m2, m4 + mova [thrq+16*4], m0 + pmullw m3, m4 + mova [thrq+16*5], m1 + mova [thrq+16*6], m2 + mova [thrq+16*7], m3 RET %macro COLUMN_FDCT 1-3 0, 0 @@ -457,6 +434,7 @@ cglobal mul_thrmat, 3, 3, 0, thrn, thr, q add outq, 8+%1 %endmacro 
+INIT_MMX mmx ;void ff_column_fidct_mmx(int16_t *thr_adr, int16_t *data, int16_t *output, int cnt); cglobal column_fidct, 4, 5, 0, 32, thr, src, out, cnt, tmp .fdct1: diff --git a/libavfilter/x86/vf_fspp_init.c b/libavfilter/x86/vf_fspp_init.c index 2aadb50967..9f6095ce24 100644 --- a/libavfilter/x86/vf_fspp_init.c +++ b/libavfilter/x86/vf_fspp_init.c @@ -29,7 +29,7 @@ void ff_store_slice_mmx(uint8_t *dst, int16_t *src, void ff_store_slice2_mmx(uint8_t *dst, int16_t *src, ptrdiff_t dst_stride, ptrdiff_t src_stride, ptrdiff_t width, ptrdiff_t height, ptrdiff_t log2_scale); -void ff_mul_thrmat_mmx(int16_t *thr_adr_noq, int16_t *thr_adr, int q); +void ff_mul_thrmat_sse2(int16_t *thr_adr_noq, int16_t *thr_adr, int q); void ff_column_fidct_mmx(int16_t *thr_adr, int16_t *data, int16_t *output, int cnt); void ff_row_idct_mmx(int16_t *workspace, int16_t *output_adr, ptrdiff_t output_stride, int cnt); void ff_row_fdct_mmx(int16_t *data, const uint8_t *pixels, ptrdiff_t line_size, int cnt); @@ -41,9 +41,11 @@ av_cold void ff_fsppdsp_init_x86(FSPPDSPContext *s) if (EXTERNAL_MMX(cpu_flags)) { s->store_slice = ff_store_slice_mmx; s->store_slice2 = ff_store_slice2_mmx; - s->mul_thrmat = ff_mul_thrmat_mmx; s->column_fidct = ff_column_fidct_mmx; s->row_idct = ff_row_idct_mmx; s->row_fdct = ff_row_fdct_mmx; } + if (EXTERNAL_SSE2(cpu_flags)) { + s->mul_thrmat = ff_mul_thrmat_sse2; + } } diff --git a/tests/checkasm/vf_fspp.c b/tests/checkasm/vf_fspp.c index a84ae8d5af..117e1c670e 100644 --- a/tests/checkasm/vf_fspp.c +++ b/tests/checkasm/vf_fspp.c @@ -18,6 +18,7 @@ #include "checkasm.h" #include "libavfilter/vf_fsppdsp.h" +#include "libavutil/mem_internal.h" #define randomize_buffers(buf) \ do { \ @@ -29,10 +30,11 @@ static void check_mul_thrmat(void) { FSPPDSPContext fspp; - int16_t src[64]; - int16_t dst_ref[64], dst_new[64]; + DECLARE_ALIGNED(16, int16_t, src)[64]; + DECLARE_ALIGNED(16, int16_t, dst_ref)[64]; + DECLARE_ALIGNED(16, int16_t, dst_new)[64]; const int q = 
(uint8_t)rnd(); - declare_func_emms(AV_CPU_FLAG_MMX, void, int16_t *thr_adr_noq, int16_t *thr_adr, int q); + declare_func(void, int16_t *thr_adr_noq, int16_t *thr_adr, int q); ff_fsppdsp_init(&fspp); -- 2.49.1 >From cd9e9ca3c1126d820bf6108c939b9911f2e72bd9 Mon Sep 17 00:00:00 2001 From: Andreas Rheinhardt <[email protected]> Date: Mon, 10 Nov 2025 12:54:31 +0100 Subject: [PATCH 06/23] avfilter/vf_fsppdsp: Use standard clamping This is obviously what is intended and what the MMX code does; yet I cannot rule out that it changes the output for some inputs: I have observed individual src values which would lead to temp values just above 512 if they came in pairs (i.e. if both inputs were simultaneously huge). Signed-off-by: Andreas Rheinhardt <[email protected]> --- libavfilter/vf_fsppdsp.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/libavfilter/vf_fsppdsp.c b/libavfilter/vf_fsppdsp.c index b84d7b57bb..f3f7c87174 100644 --- a/libavfilter/vf_fsppdsp.c +++ b/libavfilter/vf_fsppdsp.c @@ -24,6 +24,7 @@ #include "vf_fsppdsp.h" +#include "libavutil/common.h" #include "libavutil/mathematics.h" #include "libavutil/mem_internal.h" @@ -70,7 +71,7 @@ void ff_store_slice_c(uint8_t *dst, int16_t *src, #define STORE(pos) \ temp = (src[x + pos] + (d[pos] >> log2_scale)) >> (6 - log2_scale); \ src[x + pos] = src[x + pos - 8 * src_stride] = 0; \ - if (temp & 0x100) temp = ~(temp >> 31); \ + temp = av_clip_uint8(temp); \ dst[x + pos] = temp; for (int y = 0; y < height; y++) { @@ -99,7 +100,7 @@ void ff_store_slice2_c(uint8_t *dst, int16_t *src, #define STORE2(pos) \ temp = (src[x + pos] + src[x + pos + 16 * src_stride] + (d[pos] >> log2_scale)) >> (6 - log2_scale); \ src[x + pos + 16 * src_stride] = 0; \ - if (temp & 0x100) temp = ~(temp >> 31); \ + temp = av_clip_uint8(temp); \ dst[x + pos] = temp; for (int y = 0; y < height; y++) { -- 2.49.1 >From c90066ba04c4f8ff8471f99d80c8cda68a491b63 Mon Sep 17 00:00:00 2001 From: Andreas Rheinhardt <[email protected]> 
Date: Mon, 10 Nov 2025 21:57:45 +0100 Subject: [PATCH 07/23] tests/checkasm/vf_fspp: Test store_slice Signed-off-by: Andreas Rheinhardt <[email protected]> --- tests/checkasm/vf_fspp.c | 77 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 77 insertions(+) diff --git a/tests/checkasm/vf_fspp.c b/tests/checkasm/vf_fspp.c index 117e1c670e..eab62c9450 100644 --- a/tests/checkasm/vf_fspp.c +++ b/tests/checkasm/vf_fspp.c @@ -16,8 +16,12 @@ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. */ +#include <stddef.h> +#include <stdint.h> + #include "checkasm.h" #include "libavfilter/vf_fsppdsp.h" +#include "libavcodec/mathops.h" #include "libavutil/mem_internal.h" #define randomize_buffers(buf) \ @@ -26,6 +30,78 @@ buf[j] = rnd(); \ } while (0) +#define randomize_mask_buffers(buf, buf2, nb_elems, nb_bits)\ + do { \ + for (size_t j = 0; j < nb_elems; ++j) \ + buf[j] = buf2[j] = sign_extend(rnd(), nb_bits); \ + } while (0) + +static void check_store_slice(void) +{ + enum { + MAX_WIDTH = 256, + /// in elements, not in bytes; 32 is arbitrary + MAX_STRIDE = MAX_WIDTH + 32, + MAX_HEIGHT = 8, + }; + FSPPDSPContext fspp; + ff_fsppdsp_init(&fspp); + declare_func_emms(AV_CPU_FLAG_MMX, void, uint8_t *dst, int16_t *src, + ptrdiff_t dst_stride, ptrdiff_t src_stride, + ptrdiff_t width, ptrdiff_t height, ptrdiff_t log2_scale); + + for (int i = 0; i < 2; ++i) { + if (check_func(i ? fspp.store_slice2 : fspp.store_slice, "store_slice%s", i ? 
"2" : "")) { + // store slice resets the row eight lines above the current one + DECLARE_ALIGNED(16, int16_t, src_ref1)[MAX_STRIDE * ( 8 + MAX_HEIGHT - 1) + MAX_WIDTH]; + DECLARE_ALIGNED(16, int16_t, src_new1)[MAX_STRIDE * ( 8 + MAX_HEIGHT - 1) + MAX_WIDTH]; + // store_slice2 resets the row 16 lines below the current one + DECLARE_ALIGNED(16, int16_t, src_ref2)[MAX_STRIDE * (16 + MAX_HEIGHT - 1) + MAX_WIDTH]; + DECLARE_ALIGNED(16, int16_t, src_new2)[MAX_STRIDE * (16 + MAX_HEIGHT - 1) + MAX_WIDTH]; + uint8_t dstbuf_new[MAX_STRIDE * (MAX_HEIGHT - 1) + MAX_WIDTH], dstbuf_ref[MAX_STRIDE * (MAX_HEIGHT - 1) + MAX_WIDTH]; + uint8_t *dst_new = dstbuf_new, *dst_ref = dstbuf_ref; + int16_t *src_ref, *src_new, *or_src_ref, *or_src_new; + ptrdiff_t width = 1 + rnd() % MAX_WIDTH; + ptrdiff_t src_stride = FFALIGN(width + 1 + rnd() % (MAX_STRIDE - MAX_WIDTH), 8); + ptrdiff_t dst_stride = FFALIGN(width + 1 + rnd() % (MAX_STRIDE - MAX_WIDTH), 8); + ptrdiff_t height = 1 + rnd() % 8; + size_t nb_elems; + + if (i) { + src_ref = src_ref2; + src_new = src_new2; + or_src_ref = src_ref2; + or_src_new = src_new2; + nb_elems = FF_ARRAY_ELEMS(src_ref2); + } else { + src_ref = src_ref1 + 8 * src_stride; + src_new = src_new1 + 8 * src_stride; + or_src_ref = src_ref1; + or_src_new = src_new1; + nb_elems = FF_ARRAY_ELEMS(src_ref1); + } + if (rnd() & 1) { + dst_ref += dst_stride * (height - 1); + dst_new += dst_stride * (height - 1); + dst_stride *= -1; + } + randomize_buffers(dstbuf_new); + memcpy(dstbuf_ref, dstbuf_new, sizeof(dstbuf_ref)); + randomize_mask_buffers(or_src_ref, or_src_new, nb_elems, 14); + + ptrdiff_t log2_scale = rnd() & 1; + call_ref(dst_ref, src_ref, dst_stride, src_stride, width, height, log2_scale); + call_new(dst_new, src_new, dst_stride, src_stride, width, height, log2_scale); + if (memcmp(dstbuf_new, dstbuf_ref, sizeof(dstbuf_ref)) || + memcmp(or_src_ref, or_src_new, sizeof(*or_src_new) * nb_elems)) + fail(); + // don't use random parameters for benchmarks + src_ref = 
or_src_ref + !i * 8 * MAX_STRIDE; + bench_new(dstbuf_new, src_ref, + MAX_STRIDE, MAX_STRIDE, MAX_WIDTH, 8, 1); + } + } +} static void check_mul_thrmat(void) { @@ -50,5 +126,6 @@ static void check_mul_thrmat(void) void checkasm_check_vf_fspp(void) { + check_store_slice(); check_mul_thrmat(); } -- 2.49.1 >From e67ee1a479f984274016ffacc71aae7ac636417c Mon Sep 17 00:00:00 2001 From: Andreas Rheinhardt <[email protected]> Date: Mon, 10 Nov 2025 22:06:34 +0100 Subject: [PATCH 08/23] avfilter/x86/vf_fspp: Port store_slice to SSE2 Old benchmarks: store_slice_c: 2798.3 ( 1.00x) store_slice_mmx: 950.2 ( 2.94x) store_slice2_c: 3811.7 ( 1.00x) store_slice2_mmx: 682.3 ( 5.59x) New benchmarks: store_slice_c: 2797.2 ( 1.00x) store_slice_sse2: 543.5 ( 5.15x) store_slice2_c: 3817.0 ( 1.00x) store_slice2_sse2: 408.2 ( 9.35x) Signed-off-by: Andreas Rheinhardt <[email protected]> --- libavfilter/vf_fsppdsp.h | 4 +- libavfilter/x86/vf_fspp.asm | 70 +++++++++++++--------------------- libavfilter/x86/vf_fspp_init.c | 12 +++--- 3 files changed, 34 insertions(+), 52 deletions(-) diff --git a/libavfilter/vf_fsppdsp.h b/libavfilter/vf_fsppdsp.h index e87fa6861c..b440809f02 100644 --- a/libavfilter/vf_fsppdsp.h +++ b/libavfilter/vf_fsppdsp.h @@ -31,11 +31,11 @@ #include "libavutil/attributes_internal.h" typedef struct FSPPDSPContext { - void (*store_slice)(uint8_t *dst, int16_t *src, + void (*store_slice)(uint8_t *dst, int16_t *src /* align 16 */, ptrdiff_t dst_stride, ptrdiff_t src_stride, ptrdiff_t width, ptrdiff_t height, ptrdiff_t log2_scale); - void (*store_slice2)(uint8_t *dst, int16_t *src, + void (*store_slice2)(uint8_t *dst, int16_t *src /* align 16 */, ptrdiff_t dst_stride, ptrdiff_t src_stride, ptrdiff_t width, ptrdiff_t height, ptrdiff_t log2_scale); diff --git a/libavfilter/x86/vf_fspp.asm b/libavfilter/x86/vf_fspp.asm index c9408978d8..489e69f8ce 100644 --- a/libavfilter/x86/vf_fspp.asm +++ b/libavfilter/x86/vf_fspp.asm @@ -43,15 +43,15 @@ SECTION .text %define DCTSIZE 8 
-INIT_MMX mmx +INIT_XMM sse2 -;void ff_store_slice_mmx(uint8_t *dst, int16_t *src, -; ptrdiff_t dst_stride, ptrdiff_t src_stride, -; ptrdiff_t width, ptrdiff_t height, ptrdiff_t log2_scale) +;void ff_store_slice_sse2(uint8_t *dst, int16_t *src, +; ptrdiff_t dst_stride, ptrdiff_t src_stride, +; ptrdiff_t width, ptrdiff_t height, ptrdiff_t log2_scale) %if ARCH_X86_64 -cglobal store_slice, 7, 9, 0, dst, src, dst_stride, src_stride, width, dither_height, dither, tmp, tmp2 +cglobal store_slice, 7, 9, 5, dst, src, dst_stride, src_stride, width, dither_height, dither, tmp, tmp2 %else -cglobal store_slice, 2, 7, 0, dst, src, width, dither_height, dither, tmp, tmp2 +cglobal store_slice, 2, 7, 5, dst, src, width, dither_height, dither, tmp, tmp2 %define dst_strideq r2m %define src_strideq r3m mov widthq, r4m @@ -62,7 +62,7 @@ cglobal store_slice, 2, 7, 0, dst, src, width, dither_height, dither, tmp, tmp2 mov tmpq, src_strideq and widthq, ~7 sub dst_strideq, widthq - movd m5, ditherd ; log2_scale + movd m4, ditherd ; log2_scale xor ditherq, -1 ; log2_scale mov tmp2q, tmpq add ditherq, 7 ; log2_scale @@ -74,29 +74,21 @@ cglobal store_slice, 2, 7, 0, dst, src, width, dither_height, dither, tmp, tmp2 mov src_strideq, tmp2q shl tmpq, 4 lea dither_heightq, [ditherq+dither_heightq*8] - pxor m7, m7 + pxor m1, m1 .loop_height: movq m3, [ditherq] - movq m4, m3 - punpcklbw m3, m7 - punpckhbw m4, m7 + punpcklbw m3, m1 mov tmp2q, widthq - psraw m3, m5 - psraw m4, m5 + psraw m3, m4 .loop_width: - movq [srcq+tmpq], m7 - movq m0, [srcq] - movq m1, [srcq+8] - movq [srcq+tmpq+8], m7 + mova m0, [srcq] + mova [srcq+tmpq], m1 paddw m0, m3 - paddw m1, m4 - movq [srcq], m7 + mova [srcq], m1 psraw m0, m2 - psraw m1, m2 - movq [srcq+8], m7 - packuswb m0, m1 + packuswb m0, m0 add srcq, 16 movq [dstq], m0 add dstq, 8 @@ -110,13 +102,13 @@ cglobal store_slice, 2, 7, 0, dst, src, width, dither_height, dither, tmp, tmp2 jl .loop_height RET -;void ff_store_slice2_mmx(uint8_t *dst, int16_t *src, -; 
ptrdiff_t dst_stride, ptrdiff_t src_stride, -; ptrdiff_t width, ptrdiff_t height, ptrdiff_t log2_scale) +;void ff_store_slice2_sse2(uint8_t *dst, int16_t *src, +; ptrdiff_t dst_stride, ptrdiff_t src_stride, +; ptrdiff_t width, ptrdiff_t height, ptrdiff_t log2_scale) %if ARCH_X86_64 -cglobal store_slice2, 7, 9, 0, dst, src, dst_stride, src_stride, width, dither_height, dither, tmp, tmp2 +cglobal store_slice2, 7, 9, 5, dst, src, dst_stride, src_stride, width, dither_height, dither, tmp, tmp2 %else -cglobal store_slice2, 0, 7, 0, dst, src, width, dither_height, dither, tmp, tmp2 +cglobal store_slice2, 0, 7, 5, dst, src, width, dither_height, dither, tmp, tmp2 %define dst_strideq r2m %define src_strideq r3m mov dstq, dstm @@ -129,7 +121,7 @@ cglobal store_slice2, 0, 7, 0, dst, src, width, dither_height, dither, tmp, tmp2 mov tmpq, src_strideq and widthq, ~7 sub dst_strideq, widthq - movd m5, ditherd ; log2_scale + movd m4, ditherd ; log2_scale xor ditherq, -1 ; log2_scale mov tmp2q, tmpq add ditherq, 7 ; log2_scale @@ -140,30 +132,21 @@ cglobal store_slice2, 0, 7, 0, dst, src, width, dither_height, dither, tmp, tmp2 mov src_strideq, tmp2q shl tmpq, 5 lea dither_heightq, [ditherq+dither_heightq*8] - pxor m7, m7 + pxor m1, m1 .loop_height: movq m3, [ditherq] - movq m4, m3 - punpcklbw m3, m7 - punpckhbw m4, m7 + punpcklbw m3, m1 mov tmp2q,widthq - psraw m3, m5 - psraw m4, m5 + psraw m3, m4 .loop_width: - movq m0, [srcq] - movq m1, [srcq+8] + mova m0, [srcq] paddw m0, m3 paddw m0, [srcq+tmpq] - paddw m1, m4 - movq m6, [srcq+tmpq+8] - movq [srcq+tmpq], m7 + mova [srcq+tmpq], m1 psraw m0, m2 - paddw m1, m6 - movq [srcq+tmpq+8], m7 - psraw m1, m2 - packuswb m0, m1 + packuswb m0, m0 movq [dstq], m0 add srcq, 16 add dstq, 8 @@ -178,7 +161,6 @@ cglobal store_slice2, 0, 7, 0, dst, src, width, dither_height, dither, tmp, tmp2 RET ;void ff_mul_thrmat_sse2(int16_t *thr_adr_noq, int16_t *thr_adr, int q); -INIT_XMM sse2 cglobal mul_thrmat, 3, 3, 5, thrn, thr, q movd m4, qd mova m0, 
[thrnq] diff --git a/libavfilter/x86/vf_fspp_init.c b/libavfilter/x86/vf_fspp_init.c index 9f6095ce24..ee875547d2 100644 --- a/libavfilter/x86/vf_fspp_init.c +++ b/libavfilter/x86/vf_fspp_init.c @@ -23,12 +23,12 @@ #include "libavutil/x86/cpu.h" #include "libavfilter/vf_fsppdsp.h" -void ff_store_slice_mmx(uint8_t *dst, int16_t *src, - ptrdiff_t dst_stride, ptrdiff_t src_stride, - ptrdiff_t width, ptrdiff_t height, ptrdiff_t log2_scale); -void ff_store_slice2_mmx(uint8_t *dst, int16_t *src, +void ff_store_slice_sse2(uint8_t *dst, int16_t *src, ptrdiff_t dst_stride, ptrdiff_t src_stride, ptrdiff_t width, ptrdiff_t height, ptrdiff_t log2_scale); +void ff_store_slice2_sse2(uint8_t *dst, int16_t *src, + ptrdiff_t dst_stride, ptrdiff_t src_stride, + ptrdiff_t width, ptrdiff_t height, ptrdiff_t log2_scale); void ff_mul_thrmat_sse2(int16_t *thr_adr_noq, int16_t *thr_adr, int q); void ff_column_fidct_mmx(int16_t *thr_adr, int16_t *data, int16_t *output, int cnt); void ff_row_idct_mmx(int16_t *workspace, int16_t *output_adr, ptrdiff_t output_stride, int cnt); @@ -39,13 +39,13 @@ av_cold void ff_fsppdsp_init_x86(FSPPDSPContext *s) int cpu_flags = av_get_cpu_flags(); if (EXTERNAL_MMX(cpu_flags)) { - s->store_slice = ff_store_slice_mmx; - s->store_slice2 = ff_store_slice2_mmx; s->column_fidct = ff_column_fidct_mmx; s->row_idct = ff_row_idct_mmx; s->row_fdct = ff_row_fdct_mmx; } if (EXTERNAL_SSE2(cpu_flags)) { + s->store_slice = ff_store_slice_sse2; + s->store_slice2 = ff_store_slice2_sse2; s->mul_thrmat = ff_mul_thrmat_sse2; } } -- 2.49.1 >From d1b45c85cef16657579aafe61eebaf23c816c75d Mon Sep 17 00:00:00 2001 From: Andreas Rheinhardt <[email protected]> Date: Mon, 10 Nov 2025 23:03:23 +0100 Subject: [PATCH 09/23] avfilter/vf_fsppdsp: Use restrict It is possible because the requirements are fulfilled; it is also beneficial performance and code-size wise. For GCC 14 (with -O3), this reduced codesize by 26750B here; for Clang 20, it was 432B. 
Old benchmarks: mul_thrmat_c: 4.3 ( 1.00x) mul_thrmat_sse2: 4.3 ( 1.00x) store_slice_c: 2810.8 ( 1.00x) store_slice_sse2: 542.5 ( 5.18x) store_slice2_c: 3817.0 ( 1.00x) store_slice2_sse2: 410.4 ( 9.30x) New benchmarks: mul_thrmat_c: 4.3 ( 1.00x) mul_thrmat_sse2: 4.3 ( 1.00x) store_slice_c: 1510.1 ( 1.00x) store_slice_sse2: 545.2 ( 2.77x) store_slice2_c: 1763.5 ( 1.00x) store_slice2_sse2: 408.3 ( 4.32x) Signed-off-by: Andreas Rheinhardt <[email protected]> --- libavfilter/vf_fsppdsp.c | 15 +++++++++------ libavfilter/vf_fsppdsp.h | 31 +++++++++++++++++-------------- 2 files changed, 26 insertions(+), 20 deletions(-) diff --git a/libavfilter/vf_fsppdsp.c b/libavfilter/vf_fsppdsp.c index f3f7c87174..583571bf94 100644 --- a/libavfilter/vf_fsppdsp.c +++ b/libavfilter/vf_fsppdsp.c @@ -64,7 +64,7 @@ DECLARE_ALIGNED(8, const uint8_t, ff_fspp_dither)[8][8] = { }; //This func reads from 1 slice, 1 and clears 0 & 1 -void ff_store_slice_c(uint8_t *dst, int16_t *src, +void ff_store_slice_c(uint8_t *restrict dst, int16_t *restrict src, ptrdiff_t dst_stride, ptrdiff_t src_stride, ptrdiff_t width, ptrdiff_t height, ptrdiff_t log2_scale) { @@ -93,7 +93,7 @@ void ff_store_slice_c(uint8_t *dst, int16_t *src, } //This func reads from 2 slices, 0 & 2 and clears 2-nd -void ff_store_slice2_c(uint8_t *dst, int16_t *src, +void ff_store_slice2_c(uint8_t *restrict dst, int16_t *restrict src, ptrdiff_t dst_stride, ptrdiff_t src_stride, ptrdiff_t width, ptrdiff_t height, ptrdiff_t log2_scale) { @@ -121,13 +121,14 @@ void ff_store_slice2_c(uint8_t *dst, int16_t *src, } } -void ff_mul_thrmat_c(int16_t *thr_adr_noq, int16_t *thr_adr, int q) +void ff_mul_thrmat_c(int16_t *restrict thr_adr_noq, int16_t *restrict thr_adr, int q) { for (int a = 0; a < 64; a++) thr_adr[a] = q * thr_adr_noq[a]; } -void ff_column_fidct_c(int16_t *thr_adr, int16_t *data, int16_t *output, int cnt) +void ff_column_fidct_c(int16_t *restrict thr_adr, int16_t *restrict data, + int16_t *restrict output, int cnt) { int_simd16_t 
tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; int_simd16_t tmp10, tmp11, tmp12, tmp13; @@ -249,7 +250,8 @@ void ff_column_fidct_c(int16_t *thr_adr, int16_t *data, int16_t *output, int cnt } } -void ff_row_idct_c(int16_t *workspace, int16_t *output_adr, ptrdiff_t output_stride, int cnt) +void ff_row_idct_c(int16_t *restrict workspace, int16_t *restrict output_adr, + ptrdiff_t output_stride, int cnt) { int_simd16_t tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; int_simd16_t tmp10, tmp11, tmp12, tmp13; @@ -311,7 +313,8 @@ void ff_row_idct_c(int16_t *workspace, int16_t *output_adr, ptrdiff_t output_str } } -void ff_row_fdct_c(int16_t *data, const uint8_t *pixels, ptrdiff_t line_size, int cnt) +void ff_row_fdct_c(int16_t *restrict data, const uint8_t *restrict pixels, + ptrdiff_t line_size, int cnt) { int_simd16_t tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; int_simd16_t tmp10, tmp11, tmp12, tmp13; diff --git a/libavfilter/vf_fsppdsp.h b/libavfilter/vf_fsppdsp.h index b440809f02..66030da4b1 100644 --- a/libavfilter/vf_fsppdsp.h +++ b/libavfilter/vf_fsppdsp.h @@ -31,40 +31,43 @@ #include "libavutil/attributes_internal.h" typedef struct FSPPDSPContext { - void (*store_slice)(uint8_t *dst, int16_t *src /* align 16 */, + void (*store_slice)(uint8_t *restrict dst, int16_t *restrict src /* align 16 */, ptrdiff_t dst_stride, ptrdiff_t src_stride, ptrdiff_t width, ptrdiff_t height, ptrdiff_t log2_scale); - void (*store_slice2)(uint8_t *dst, int16_t *src /* align 16 */, + void (*store_slice2)(uint8_t *restrict dst, int16_t *restrict src /* align 16 */, ptrdiff_t dst_stride, ptrdiff_t src_stride, ptrdiff_t width, ptrdiff_t height, ptrdiff_t log2_scale); - void (*mul_thrmat)(int16_t *thr_adr_noq /* align 16 */, - int16_t *thr_adr /* align 16 */, int q); + void (*mul_thrmat)(int16_t *restrict thr_adr_noq /* align 16 */, + int16_t *restrict thr_adr /* align 16 */, int q); - void (*column_fidct)(int16_t *thr_adr, int16_t *data, - int16_t *output, int cnt); + void 
(*column_fidct)(int16_t *restrict thr_adr, int16_t *data, + int16_t *restrict output, int cnt); - void (*row_idct)(int16_t *workspace, int16_t *output_adr, + void (*row_idct)(int16_t *restrict workspace, int16_t *restrict output_adr, ptrdiff_t output_stride, int cnt); - void (*row_fdct)(int16_t *data, const uint8_t *pixels, + void (*row_fdct)(int16_t *restrict data, const uint8_t *restrict pixels, ptrdiff_t line_size, int cnt); } FSPPDSPContext; FF_VISIBILITY_PUSH_HIDDEN extern const uint8_t ff_fspp_dither[8][8]; -void ff_store_slice_c(uint8_t *dst, int16_t *src, +void ff_store_slice_c(uint8_t *restrict dst, int16_t *restrict src, ptrdiff_t dst_stride, ptrdiff_t src_stride, ptrdiff_t width, ptrdiff_t height, ptrdiff_t log2_scale); -void ff_store_slice2_c(uint8_t *dst, int16_t *src, +void ff_store_slice2_c(uint8_t *restrict dst, int16_t *restrict src, ptrdiff_t dst_stride, ptrdiff_t src_stride, ptrdiff_t width, ptrdiff_t height, ptrdiff_t log2_scale); -void ff_mul_thrmat_c(int16_t *thr_adr_noq, int16_t *thr_adr, int q); -void ff_column_fidct_c(int16_t *thr_adr, int16_t *data, int16_t *output, int cnt); -void ff_row_idct_c(int16_t *workspace, int16_t *output_adr, ptrdiff_t output_stride, int cnt); -void ff_row_fdct_c(int16_t *data, const uint8_t *pixels, ptrdiff_t line_size, int cnt); +void ff_mul_thrmat_c(int16_t *restrict thr_adr_noq, int16_t *restrict thr_adr, int q); +void ff_column_fidct_c(int16_t *restrict thr_adr, int16_t *restrict data, + int16_t *restrict output, int cnt); +void ff_row_idct_c(int16_t *restrict workspace, int16_t *restrict output_adr, + ptrdiff_t output_stride, int cnt); +void ff_row_fdct_c(int16_t *restrict data, const uint8_t *restrict pixels, + ptrdiff_t line_size, int cnt); void ff_fsppdsp_init_x86(FSPPDSPContext *fspp); FF_VISIBILITY_POP_HIDDEN -- 2.49.1 >From 50665134560dd4f7bc70dcde7b6c0c64af53a14b Mon Sep 17 00:00:00 2001 From: Andreas Rheinhardt <[email protected]> Date: Wed, 12 Nov 2025 14:21:09 +0100 Subject: [PATCH 10/23] 
avfilter/vf_fsppdsp: Reduce discrepancies between C code and x86 asm The x86 assembly uses the following pattern to zero all the values with abs<threshold: x -= threshold; x satu+= threshold (unsigned saturated addition) x += threshold x satu-= threshold (unsigned saturated subtraction) The reference C code meanwhile zeroed everything with abs <= threshold. This commit makes the C code behave like the x86 assembly to reduce discrepancies between the two. An alternative would be to require SSSE3, so that one can use pabsw, pcmpgtw for abs>threshold, followed by a pand with the original data. Or one could modify the thresholds to make both equal. Signed-off-by: Andreas Rheinhardt <[email protected]> --- libavfilter/vf_fsppdsp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/libavfilter/vf_fsppdsp.c b/libavfilter/vf_fsppdsp.c index 583571bf94..e530bcd06b 100644 --- a/libavfilter/vf_fsppdsp.c +++ b/libavfilter/vf_fsppdsp.c @@ -34,7 +34,7 @@ #define MULTIPLY16H(x,k) (((x) * (k)) >> 16) #define THRESHOLD(r,x,t) \ - if(((unsigned)((x) + t)) > t * 2) r = (x); \ + if (((unsigned)((x) + t)) >= t * 2) r = (x); \ else r = 0; #define DESCALE(x,n) (((x) + (1 << ((n) - 1))) >> n) -- 2.49.1 >From ab13abb61eae8bec9f90f26020c30824e4ef175e Mon Sep 17 00:00:00 2001 From: Andreas Rheinhardt <[email protected]> Date: Wed, 12 Nov 2025 18:44:49 +0100 Subject: [PATCH 11/23] avfilter/x86/vf_fspp: Make ff_column_fidct_mmx() bitexact It currently is not, because the shortcut mode uses different rounding than the C code (as well as the non-shortcut code). 
Signed-off-by: Andreas Rheinhardt <[email protected]> --- libavfilter/x86/vf_fspp.asm | 36 ++++++++++++++++++------------------ 1 file changed, 18 insertions(+), 18 deletions(-) diff --git a/libavfilter/x86/vf_fspp.asm b/libavfilter/x86/vf_fspp.asm index 489e69f8ce..2f49945c13 100644 --- a/libavfilter/x86/vf_fspp.asm +++ b/libavfilter/x86/vf_fspp.asm @@ -33,9 +33,6 @@ pw_539F: times 4 dw 0x539F ; FIX64(1.306562965, 14) pw_5A82: times 4 dw 0x5A82 ; FIX64(1.414213562, 14) pw_3B21: times 4 dw 0x3B21 ; FIX64(1.847759065, 13) pw_AC62: times 4 dw 0xAC62 ; FIX64(-2.613125930, 13) -pw_3642: times 4 dw 0x3642 ; FIX64(0.847759065, 14) -pw_2441: times 4 dw 0x2441 ; FIX64(0.566454497, 14) -pw_0CBB: times 4 dw 0x0CBB ; FIX64(0.198912367, 14) pw_4: times 4 dw 4 pw_2: times 4 dw 2 @@ -315,31 +312,34 @@ cglobal mul_thrmat, 3, 3, 5, thrn, thr, q or tmpd, tmpd jnz %1 movq m4, [rsp] - movq m1, m0 - pmulhw m0, [pw_3642] - movq m2, m1 - movq m5, [outq+DCTSIZE*0*2] - movq m3, m2 - pmulhw m1, [pw_2441] + psraw m3, m0, 2 + psllw m0, 1 + mova m5, [outq+DCTSIZE*0*2] + pmulhw m1, m0, [pw_3B21] + pmulhw m2, m0, [pw_22A3] + pmulhw m0, [pw_2D41] paddw m5, m4 movq m6, [rsp+8] - psraw m3, 2 - pmulhw m2, [pw_0CBB] + psubw m2, m1 psubw m4, m3 movq m7, [outq+DCTSIZE*1*2] paddw m5, m3 - movq [outq+DCTSIZE*7*2], m4 + psubw m1, m3 + mova [outq+DCTSIZE*7*2], m4 + psubw m0, m1 + paddw m2, m0 + mova [outq+DCTSIZE*0*2], m5 paddw m7, m6 movq m3, [rsp+8*2] - psubw m6, m0 + psubw m6, m1 movq m4, [outq+DCTSIZE*2*2] - paddw m7, m0 + paddw m7, m1 movq [outq], m5 paddw m4, m3 movq [outq+DCTSIZE*6*2], m6 - psubw m3, m1 + psubw m3, m0 movq m5, [outq+DCTSIZE*5*2] - paddw m4, m1 + paddw m4, m0 movq m6, [outq+DCTSIZE*3*2] paddw m5, m3 movq m0, [rsp+8*3] @@ -347,9 +347,9 @@ cglobal mul_thrmat, 3, 3, 5, thrn, thr, q movq [outq+DCTSIZE*1*2], m7 paddw m6, m0 movq [outq+DCTSIZE*2*2], m4 - psubw m0, m2 + paddw m0, m2 movq m7, [outq+DCTSIZE*4*2] - paddw m6, m2 + psubw m6, m2 movq [outq+DCTSIZE*5*2], m5 paddw m7, m0 movq 
[outq+DCTSIZE*3*2], m6 -- 2.49.1 >From 7b5a0acf7916e700e59a0e54401e0c4eb5f5e672 Mon Sep 17 00:00:00 2001 From: Andreas Rheinhardt <[email protected]> Date: Wed, 12 Nov 2025 19:39:35 +0100 Subject: [PATCH 12/23] avfilter/x86/vf_fspp: Put shifts into constants This avoids some shift instructions and also gives us more headroom in the registers. In fact, I have proven to myself that everything that is supposed to fit into 16bits now actually does so. Signed-off-by: Andreas Rheinhardt <[email protected]> --- libavfilter/x86/vf_fspp.asm | 36 +++++++++++++++++------------------- 1 file changed, 17 insertions(+), 19 deletions(-) diff --git a/libavfilter/x86/vf_fspp.asm b/libavfilter/x86/vf_fspp.asm index 2f49945c13..f61efc99f8 100644 --- a/libavfilter/x86/vf_fspp.asm +++ b/libavfilter/x86/vf_fspp.asm @@ -27,10 +27,13 @@ SECTION_RODATA cextern fspp_dither pw_187E: times 4 dw 0x187E ; FIX64(0.382683433, 14) +pw_61F8: times 4 dw 0x61F8 ; 4*FIX(0.382683433, 14) pw_22A3: times 4 dw 0x22A3 ; FIX64(1.082392200, 13) +pw_4546: times 4 dw 0x4546 ; 2*FIX(1.082392200, 13) pw_2D41: times 4 dw 0x2D41 ; FIX64(1.414213562, 13) pw_539F: times 4 dw 0x539F ; FIX64(1.306562965, 14) pw_5A82: times 4 dw 0x5A82 ; FIX64(1.414213562, 14) +pw_7642: times 4 dw 0x7642 ; 2*FIX(1.847759065, 13) pw_3B21: times 4 dw 0x3B21 ; FIX64(1.847759065, 13) pw_AC62: times 4 dw 0xAC62 ; FIX64(-2.613125930, 13) pw_4: times 4 dw 4 @@ -211,12 +214,12 @@ cglobal mul_thrmat, 3, 3, 5, thrn, thr, q psubw m2, m6 paddw m7, m1 movq m6, [thrq+4*16+%2] - psllw m7, 2 + psllw m7, 1 psubw m5, [thrq+%2] psubw m2, m6 paddusw m5, [thrq+%2] paddusw m2, m6 - pmulhw m7, [pw_2D41] + pmulhw m7, [pw_5A82] paddw m5, [thrq+%2] paddw m2, m6 psubusw m5, [thrq+%2] @@ -261,15 +264,14 @@ cglobal mul_thrmat, 3, 3, 5, thrn, thr, q paddw m4, m0 movq m7, m3 psubw m3, m4 - psllw m3, 2 - psllw m7, 2 - pmulhw m3, [pw_187E] + psllw m7, 1 + pmulhw m3, [pw_61F8] psllw m4, 2 - pmulhw m7, [pw_22A3] - psllw m2, 2 + pmulhw m7, [pw_4546] + psllw m2, 1 pmulhw 
m4, [pw_539F] paddw m5, m1 - pmulhw m2, [pw_2D41] + pmulhw m2, [pw_5A82] psubw m6, m1 paddw m7, m3 movq [rsp+8], m5 @@ -313,11 +315,10 @@ cglobal mul_thrmat, 3, 3, 5, thrn, thr, q jnz %1 movq m4, [rsp] psraw m3, m0, 2 - psllw m0, 1 mova m5, [outq+DCTSIZE*0*2] - pmulhw m1, m0, [pw_3B21] - pmulhw m2, m0, [pw_22A3] - pmulhw m0, [pw_2D41] + pmulhw m1, m0, [pw_7642] + pmulhw m2, m0, [pw_4546] + pmulhw m0, [pw_5A82] paddw m5, m4 movq m6, [rsp+8] psubw m2, m1 @@ -360,23 +361,20 @@ cglobal mul_thrmat, 3, 3, 5, thrn, thr, q %macro COLUMN_IDCT 0-1 0 movq m3, m5 psubw m5, m1 - psllw m5, 1 paddw m3, m1 movq m2, m0 psubw m0, m6 - movq m1, m5 - psllw m0, 1 + psllw m1, m5, 1 pmulhw m1, [pw_AC62] paddw m5, m0 - pmulhw m5, [pw_3B21] + pmulhw m5, [pw_7642] paddw m2, m6 - pmulhw m0, [pw_22A3] + pmulhw m0, [pw_4546] movq m7, m2 movq m4, [rsp] psubw m2, m3 - psllw m2, 1 paddw m7, m3 - pmulhw m2, [pw_2D41] + pmulhw m2, [pw_5A82] movq m6, m4 psraw m7, 2 paddw m4, [outq] -- 2.49.1 >From 55342723889a54740920fb24d8ef2f83a7ec5b80 Mon Sep 17 00:00:00 2001 From: Andreas Rheinhardt <[email protected]> Date: Wed, 12 Nov 2025 21:03:06 +0100 Subject: [PATCH 13/23] tests/checkasm/vf_fspp: Add test for column_fidct Signed-off-by: Andreas Rheinhardt <[email protected]> --- tests/checkasm/vf_fspp.c | 39 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 39 insertions(+) diff --git a/tests/checkasm/vf_fspp.c b/tests/checkasm/vf_fspp.c index eab62c9450..f9e7b35e88 100644 --- a/tests/checkasm/vf_fspp.c +++ b/tests/checkasm/vf_fspp.c @@ -36,6 +36,12 @@ buf[j] = buf2[j] = sign_extend(rnd(), nb_bits); \ } while (0) +#define randomize_buffer_range(buf, min, max) \ + do { \ + for (size_t j = 0; j < FF_ARRAY_ELEMS(buf); ++j) \ + buf[j] = min + rnd() % (max - min + 1); \ + } while (0) + static void check_store_slice(void) { enum { @@ -124,8 +130,41 @@ static void check_mul_thrmat(void) } } +static void check_column_fidct(void) +{ + enum { + NB_BLOCKS = 8, ///< arbitrary + }; + FSPPDSPContext fspp; + 
declare_func_emms(AV_CPU_FLAG_MMX, void, int16_t *thr_adr, int16_t *data, + int16_t *output, int cnt); + + ff_fsppdsp_init(&fspp); + + if (check_func(fspp.column_fidct, "column_fidct")) { + DECLARE_ALIGNED(16, int16_t, threshold)[64]; + DECLARE_ALIGNED(16, int16_t, src)[8*(8*NB_BLOCKS + 6)]; + DECLARE_ALIGNED(16, int16_t, dst_new)[8*(8*NB_BLOCKS + 6)]; + DECLARE_ALIGNED(16, int16_t, dst_ref)[8*(8*NB_BLOCKS + 6)]; + + randomize_buffer_range(threshold, 0, INT16_MAX); + randomize_buffer_range(src, -1284, 1284); + randomize_buffers(dst_new); + memcpy(dst_ref, dst_new, sizeof(dst_ref)); + + call_ref(threshold, src, dst_ref, NB_BLOCKS * 8); + call_new(threshold, src, dst_new, NB_BLOCKS * 8); + + if (memcmp(dst_new, dst_ref, sizeof(dst_new))) + fail(); + + bench_new(threshold, src, dst_new, NB_BLOCKS * 8); + } +} + void checkasm_check_vf_fspp(void) { check_store_slice(); check_mul_thrmat(); + check_column_fidct(); } -- 2.49.1 >From 1c3f7376e10a466c018a51e589cbd9d46a1d3792 Mon Sep 17 00:00:00 2001 From: Andreas Rheinhardt <[email protected]> Date: Wed, 12 Nov 2025 21:42:32 +0100 Subject: [PATCH 14/23] avfilter/x86/vf_fspp: Port ff_column_fidct_mmx() to SSE2 It gains a lot because it has to operate on eight words; it also saves 608B of .text here. 
Old benchmarks: column_fidct_c: 3365.7 ( 1.00x) column_fidct_mmx: 1784.6 ( 1.89x) New benchmarks: column_fidct_c: 3361.5 ( 1.00x) column_fidct_sse2: 801.1 ( 4.20x) Signed-off-by: Andreas Rheinhardt <[email protected]> --- libavfilter/x86/vf_fspp.asm | 209 ++++++++++++++++----------------- libavfilter/x86/vf_fspp_init.c | 4 +- tests/checkasm/vf_fspp.c | 4 +- 3 files changed, 107 insertions(+), 110 deletions(-) diff --git a/libavfilter/x86/vf_fspp.asm b/libavfilter/x86/vf_fspp.asm index f61efc99f8..3f37911722 100644 --- a/libavfilter/x86/vf_fspp.asm +++ b/libavfilter/x86/vf_fspp.asm @@ -26,18 +26,18 @@ SECTION_RODATA cextern fspp_dither +pw_4546: times 8 dw 0x4546 ; FIX(1.082392200, 13)*2 +pw_61F8: times 8 dw 0x61F8 ; FIX(0.382683433, 14)*4 +pw_539F: times 8 dw 0x539F ; FIX(1.306562965, 14) +pw_5A82: times 8 dw 0x5A82 ; FIX(1.414213562, 14) +pw_7642: times 8 dw 0x7642 ; FIX(1.847759065, 13)*2 +pw_AC62: times 8 dw 0xAC62 ; FIX(-2.613125930, 13) +pw_2: times 8 dw 2 pw_187E: times 4 dw 0x187E ; FIX64(0.382683433, 14) -pw_61F8: times 4 dw 0x61F8 ; 4*FIX(0.382683433, 14) pw_22A3: times 4 dw 0x22A3 ; FIX64(1.082392200, 13) -pw_4546: times 4 dw 0x4546 ; 2*FIX(1.082392200, 13) pw_2D41: times 4 dw 0x2D41 ; FIX64(1.414213562, 13) -pw_539F: times 4 dw 0x539F ; FIX64(1.306562965, 14) -pw_5A82: times 4 dw 0x5A82 ; FIX64(1.414213562, 14) -pw_7642: times 4 dw 0x7642 ; 2*FIX(1.847759065, 13) pw_3B21: times 4 dw 0x3B21 ; FIX64(1.847759065, 13) -pw_AC62: times 4 dw 0xAC62 ; FIX64(-2.613125930, 13) pw_4: times 4 dw 4 -pw_2: times 4 dw 2 SECTION .text @@ -191,82 +191,83 @@ cglobal mul_thrmat, 3, 3, 5, thrn, thr, q mova [thrq+16*7], m3 RET -%macro COLUMN_FDCT 1-3 0, 0 - movq m1, [srcq+DCTSIZE*0*2] - movq m7, [srcq+DCTSIZE*3*2] - movq m0, m1 +%macro COLUMN_FDCT 1 + mova m1, [srcq+DCTSIZE*0*2] + mova m7, [srcq+DCTSIZE*3*2] + mova m0, m1 paddw m1, [srcq+DCTSIZE*7*2] - movq m3, m7 + mova m3, m7 paddw m7, [srcq+DCTSIZE*4*2] - movq m5, m1 - movq m6, [srcq+DCTSIZE*1*2] + mova m5, m1 + mova m6, 
[srcq+DCTSIZE*1*2] psubw m1, m7 - movq m2, [srcq+DCTSIZE*2*2] - movq m4, m6 + mova m2, [srcq+DCTSIZE*2*2] + mova m4, m6 paddw m6, [srcq+DCTSIZE*6*2] paddw m5, m7 paddw m2, [srcq+DCTSIZE*5*2] - movq m7, m6 + mova m7, m6 paddw m6, m2 psubw m7, m2 - movq m2, m5 + mova m2, m5 paddw m5, m6 psubw m2, m6 paddw m7, m1 - movq m6, [thrq+4*16+%2] + mova m6, [thrq+4*16] psllw m7, 1 - psubw m5, [thrq+%2] + psubw m5, [thrq] psubw m2, m6 - paddusw m5, [thrq+%2] + paddusw m5, [thrq] paddusw m2, m6 pmulhw m7, [pw_5A82] - paddw m5, [thrq+%2] + paddw m5, [thrq] paddw m2, m6 - psubusw m5, [thrq+%2] + psubusw m5, [thrq] psubusw m2, m6 paddw m5, [pw_2] - movq m6, m2 + mova m6, m2 paddw m2, m5 psubw m5, m6 - movq m6, m1 + mova m6, m1 paddw m1, m7 - psubw m1, [thrq+2*16+%2] + psubw m1, [thrq+2*16] psubw m6, m7 - movq m7, [thrq+6*16+%2] + mova m7, [thrq+6*16] psraw m5, 2 - paddusw m1, [thrq+2*16+%2] + paddusw m1, [thrq+2*16] psubw m6, m7 - paddw m1, [thrq+2*16+%2] + paddw m1, [thrq+2*16] paddusw m6, m7 - psubusw m1, [thrq+2*16+%2] + psubusw m1, [thrq+2*16] paddw m6, m7 psubw m3, [srcq+DCTSIZE*4*2] psubusw m6, m7 - movq m7, m1 + mova m7, m1 psraw m2, 2 psubw m4, [srcq+DCTSIZE*6*2] psubw m1, m6 psubw m0, [srcq+DCTSIZE*7*2] paddw m6, m7 psraw m6, 2 - movq m7, m2 + mova m7, m2 pmulhw m1, [pw_5A82] paddw m2, m6 - movq [rsp], m2 + mova [rsp], m2 psubw m7, m6 - movq m2, [srcq+DCTSIZE*2*2] + mova m2, [srcq+DCTSIZE*2*2] psubw m1, m6 psubw m2, [srcq+DCTSIZE*5*2] - movq m6, m5 - movq [rsp+8*3], m7 + mova m6, m5 + mova [rsp+16*3], m7 paddw m3, m2 paddw m2, m4 paddw m4, m0 - movq m7, m3 + mova m7, m3 psubw m3, m4 psllw m7, 1 pmulhw m3, [pw_61F8] psllw m4, 2 + add srcq, 32 pmulhw m7, [pw_4546] psllw m2, 1 pmulhw m4, [pw_539F] @@ -274,25 +275,25 @@ cglobal mul_thrmat, 3, 3, 5, thrn, thr, q pmulhw m2, [pw_5A82] psubw m6, m1 paddw m7, m3 - movq [rsp+8], m5 + mova [rsp+16], m5 paddw m4, m3 - movq m3, [thrq+3*16+%2] - movq m1, m0 - movq [rsp+8*2], m6 + mova m3, [thrq+3*16] + mova m1, m0 + mova [rsp+16*2], m6 
psubw m1, m2 paddw m0, m2 - movq m5, m1 - movq m2, [thrq+5*16+%2] + mova m5, m1 + mova m2, [thrq+5*16] psubw m1, m7 paddw m5, m7 psubw m1, m3 - movq m7, [thrq+16+%2] + mova m7, [thrq+16] psubw m5, m2 - movq m6, m0 + mova m6, m0 paddw m0, m4 paddusw m1, m3 psubw m6, m4 - movq m4, [thrq+7*16+%2] + mova m4, [thrq+7*16] psubw m0, m7 psubw m6, m4 paddusw m5, m2 @@ -303,27 +304,32 @@ cglobal mul_thrmat, 3, 3, 5, thrn, thr, q psubusw m1, m3 psubusw m5, m2 psubusw m6, m4 - movq m4, m1 + mova m4, m1 por m4, m5 paddusw m0, m7 por m4, m6 paddw m0, m7 packssdw m4, m4 psubusw m0, m7 - movd tmpd, m4 - or tmpd, tmpd +%if ARCH_X86_64 + movq tmpq, m4 +%else + packssdw m4, m4 + movd tmpd, m4 +%endif + or tmpq, tmpq jnz %1 - movq m4, [rsp] + mova m4, [rsp] psraw m3, m0, 2 mova m5, [outq+DCTSIZE*0*2] pmulhw m1, m0, [pw_7642] pmulhw m2, m0, [pw_4546] pmulhw m0, [pw_5A82] paddw m5, m4 - movq m6, [rsp+8] + mova m6, [rsp+16] psubw m2, m1 psubw m4, m3 - movq m7, [outq+DCTSIZE*1*2] + mova m7, [outq+DCTSIZE*1*2] paddw m5, m3 psubw m1, m3 mova [outq+DCTSIZE*7*2], m4 @@ -331,38 +337,37 @@ cglobal mul_thrmat, 3, 3, 5, thrn, thr, q paddw m2, m0 mova [outq+DCTSIZE*0*2], m5 paddw m7, m6 - movq m3, [rsp+8*2] + mova m3, [rsp+16*2] psubw m6, m1 - movq m4, [outq+DCTSIZE*2*2] + mova m4, [outq+DCTSIZE*2*2] paddw m7, m1 - movq [outq], m5 + mova [outq], m5 paddw m4, m3 - movq [outq+DCTSIZE*6*2], m6 + mova [outq+DCTSIZE*6*2], m6 psubw m3, m0 - movq m5, [outq+DCTSIZE*5*2] + mova m5, [outq+DCTSIZE*5*2] paddw m4, m0 - movq m6, [outq+DCTSIZE*3*2] + mova m6, [outq+DCTSIZE*3*2] paddw m5, m3 - movq m0, [rsp+8*3] - add srcq, 8+%3 - movq [outq+DCTSIZE*1*2], m7 + mova m0, [rsp+16*3] + mova [outq+DCTSIZE*1*2], m7 paddw m6, m0 - movq [outq+DCTSIZE*2*2], m4 + mova [outq+DCTSIZE*2*2], m4 paddw m0, m2 - movq m7, [outq+DCTSIZE*4*2] + mova m7, [outq+DCTSIZE*4*2] psubw m6, m2 - movq [outq+DCTSIZE*5*2], m5 + mova [outq+DCTSIZE*5*2], m5 paddw m7, m0 - movq [outq+DCTSIZE*3*2], m6 - movq [outq+DCTSIZE*4*2], m7 - add outq, 8+%3 
+ mova [outq+DCTSIZE*3*2], m6 + mova [outq+DCTSIZE*4*2], m7 + add outq, 32 %endmacro -%macro COLUMN_IDCT 0-1 0 - movq m3, m5 +%macro COLUMN_IDCT 0 + mova m3, m5 psubw m5, m1 paddw m3, m1 - movq m2, m0 + mova m2, m0 psubw m0, m6 psllw m1, m5, 1 pmulhw m1, [pw_AC62] @@ -370,72 +375,64 @@ cglobal mul_thrmat, 3, 3, 5, thrn, thr, q pmulhw m5, [pw_7642] paddw m2, m6 pmulhw m0, [pw_4546] - movq m7, m2 - movq m4, [rsp] + mova m7, m2 + mova m4, [rsp] psubw m2, m3 paddw m7, m3 pmulhw m2, [pw_5A82] - movq m6, m4 + mova m6, m4 psraw m7, 2 paddw m4, [outq] psubw m6, m7 - movq m3, [rsp+8] + mova m3, [rsp+16] paddw m4, m7 - movq [outq+DCTSIZE*7*2], m6 + mova [outq+DCTSIZE*7*2], m6 paddw m1, m5 - movq [outq], m4 + mova [outq], m4 psubw m1, m7 - movq m7, [rsp+8*2] + mova m7, [rsp+16*2] psubw m0, m5 - movq m6, [rsp+8*3] - movq m5, m3 + mova m6, [rsp+16*3] + mova m5, m3 paddw m3, [outq+DCTSIZE*1*2] psubw m5, m1 psubw m2, m1 paddw m3, m1 - movq [outq+DCTSIZE*6*2], m5 - movq m4, m7 + mova [outq+DCTSIZE*6*2], m5 + mova m4, m7 paddw m7, [outq+DCTSIZE*2*2] psubw m4, m2 paddw m4, [outq+DCTSIZE*5*2] paddw m7, m2 - movq [outq+DCTSIZE*1*2], m3 + mova [outq+DCTSIZE*1*2], m3 paddw m0, m2 - movq [outq+DCTSIZE*2*2], m7 - movq m1, m6 + mova [outq+DCTSIZE*2*2], m7 + mova m1, m6 paddw m6, [outq+DCTSIZE*4*2] psubw m1, m0 paddw m1, [outq+DCTSIZE*3*2] paddw m6, m0 - movq [outq+DCTSIZE*5*2], m4 - add srcq, 8+%1 - movq [outq+DCTSIZE*4*2], m6 - movq [outq+DCTSIZE*3*2], m1 - add outq, 8+%1 + mova [outq+DCTSIZE*5*2], m4 + mova [outq+DCTSIZE*4*2], m6 + mova [outq+DCTSIZE*3*2], m1 + add outq, 32 %endmacro -INIT_MMX mmx -;void ff_column_fidct_mmx(int16_t *thr_adr, int16_t *data, int16_t *output, int cnt); -cglobal column_fidct, 4, 5, 0, 32, thr, src, out, cnt, tmp -.fdct1: - COLUMN_FDCT .idct1 - jmp .fdct2 +;void ff_column_fidct_sse2(int16_t *thr_adr, int16_t *data, int16_t *output, int cnt); +cglobal column_fidct, 4, 5, 8, 64, thr, src, out, cnt, tmp +.fdct: + COLUMN_FDCT .idct + sub cntd, 2 + jg .fdct + RET 
-.idct1: +.idct: COLUMN_IDCT - -.fdct2: - COLUMN_FDCT .idct2, 8, 16 sub cntd, 2 - jg .fdct1 - RET - -.idct2: - COLUMN_IDCT 16 - sub cntd, 2 - jg .fdct1 + jg .fdct RET +INIT_MMX mmx ;void ff_row_idct_mmx(int16_t *workspace, int16_t *output_adr, ptrdiff_t output_stride, int cnt); cglobal row_idct, 4, 5, 0, 16, src, dst, stride, cnt, stride3 add strideq, strideq diff --git a/libavfilter/x86/vf_fspp_init.c b/libavfilter/x86/vf_fspp_init.c index ee875547d2..c7a9b1799e 100644 --- a/libavfilter/x86/vf_fspp_init.c +++ b/libavfilter/x86/vf_fspp_init.c @@ -30,7 +30,7 @@ void ff_store_slice2_sse2(uint8_t *dst, int16_t *src, ptrdiff_t dst_stride, ptrdiff_t src_stride, ptrdiff_t width, ptrdiff_t height, ptrdiff_t log2_scale); void ff_mul_thrmat_sse2(int16_t *thr_adr_noq, int16_t *thr_adr, int q); -void ff_column_fidct_mmx(int16_t *thr_adr, int16_t *data, int16_t *output, int cnt); +void ff_column_fidct_sse2(int16_t *thr_adr, int16_t *data, int16_t *output, int cnt); void ff_row_idct_mmx(int16_t *workspace, int16_t *output_adr, ptrdiff_t output_stride, int cnt); void ff_row_fdct_mmx(int16_t *data, const uint8_t *pixels, ptrdiff_t line_size, int cnt); @@ -39,7 +39,6 @@ av_cold void ff_fsppdsp_init_x86(FSPPDSPContext *s) int cpu_flags = av_get_cpu_flags(); if (EXTERNAL_MMX(cpu_flags)) { - s->column_fidct = ff_column_fidct_mmx; s->row_idct = ff_row_idct_mmx; s->row_fdct = ff_row_fdct_mmx; } @@ -47,5 +46,6 @@ av_cold void ff_fsppdsp_init_x86(FSPPDSPContext *s) s->store_slice = ff_store_slice_sse2; s->store_slice2 = ff_store_slice2_sse2; s->mul_thrmat = ff_mul_thrmat_sse2; + s->column_fidct = ff_column_fidct_sse2; } } diff --git a/tests/checkasm/vf_fspp.c b/tests/checkasm/vf_fspp.c index f9e7b35e88..b65a46247d 100644 --- a/tests/checkasm/vf_fspp.c +++ b/tests/checkasm/vf_fspp.c @@ -136,8 +136,8 @@ static void check_column_fidct(void) NB_BLOCKS = 8, ///< arbitrary }; FSPPDSPContext fspp; - declare_func_emms(AV_CPU_FLAG_MMX, void, int16_t *thr_adr, int16_t *data, - int16_t *output, int 
cnt); + declare_func(void, int16_t *thr_adr, int16_t *data, + int16_t *output, int cnt); ff_fsppdsp_init(&fspp); -- 2.49.1 >From ce16476ecb7cbf7496a4fe8ece6c8d77f5bc3f31 Mon Sep 17 00:00:00 2001 From: Andreas Rheinhardt <[email protected]> Date: Wed, 12 Nov 2025 22:44:28 +0100 Subject: [PATCH 15/23] avfilter/x86/vf_fspp: Avoid stack on x64 Possible due to the amount of registers. Signed-off-by: Andreas Rheinhardt <[email protected]> --- libavfilter/x86/vf_fspp.asm | 78 ++++++++++++++++++++++++------------- 1 file changed, 52 insertions(+), 26 deletions(-) diff --git a/libavfilter/x86/vf_fspp.asm b/libavfilter/x86/vf_fspp.asm index 3f37911722..cad44ed0bf 100644 --- a/libavfilter/x86/vf_fspp.asm +++ b/libavfilter/x86/vf_fspp.asm @@ -210,35 +210,47 @@ cglobal mul_thrmat, 3, 3, 5, thrn, thr, q paddw m6, m2 psubw m7, m2 mova m2, m5 +%if ARCH_X86_64 + mova m8, [thrq] +%define THRQ m8 +%else +%define THRQ [thrq] +%endif paddw m5, m6 psubw m2, m6 paddw m7, m1 mova m6, [thrq+4*16] psllw m7, 1 - psubw m5, [thrq] + psubw m5, THRQ psubw m2, m6 - paddusw m5, [thrq] + paddusw m5, THRQ paddusw m2, m6 - pmulhw m7, [pw_5A82] - paddw m5, [thrq] + pmulhw m7, SQRT2 + paddw m5, THRQ paddw m2, m6 - psubusw m5, [thrq] + psubusw m5, THRQ psubusw m2, m6 paddw m5, [pw_2] mova m6, m2 paddw m2, m5 +%if ARCH_X86_64 + mova m8, [thrq+2*16] +%define THRQ m8 +%else +%define THRQ [thrq+2*16] +%endif psubw m5, m6 mova m6, m1 paddw m1, m7 - psubw m1, [thrq+2*16] + psubw m1, THRQ psubw m6, m7 mova m7, [thrq+6*16] psraw m5, 2 - paddusw m1, [thrq+2*16] + paddusw m1, THRQ psubw m6, m7 - paddw m1, [thrq+2*16] + paddw m1, THRQ paddusw m6, m7 - psubusw m1, [thrq+2*16] + psubusw m1, THRQ paddw m6, m7 psubw m3, [srcq+DCTSIZE*4*2] psubusw m6, m7 @@ -250,15 +262,15 @@ cglobal mul_thrmat, 3, 3, 5, thrn, thr, q paddw m6, m7 psraw m6, 2 mova m7, m2 - pmulhw m1, [pw_5A82] + pmulhw m1, SQRT2 paddw m2, m6 - mova [rsp], m2 + mova tmp0, m2 psubw m7, m6 mova m2, [srcq+DCTSIZE*2*2] psubw m1, m6 psubw m2, 
[srcq+DCTSIZE*5*2] mova m6, m5 - mova [rsp+16*3], m7 + mova tmp3, m7 paddw m3, m2 paddw m2, m4 paddw m4, m0 @@ -272,14 +284,14 @@ cglobal mul_thrmat, 3, 3, 5, thrn, thr, q psllw m2, 1 pmulhw m4, [pw_539F] paddw m5, m1 - pmulhw m2, [pw_5A82] + pmulhw m2, SQRT2 psubw m6, m1 paddw m7, m3 - mova [rsp+16], m5 + mova tmp1, m5 paddw m4, m3 mova m3, [thrq+3*16] mova m1, m0 - mova [rsp+16*2], m6 + mova tmp2, m6 psubw m1, m2 paddw m0, m2 mova m5, m1 @@ -319,14 +331,14 @@ cglobal mul_thrmat, 3, 3, 5, thrn, thr, q %endif or tmpq, tmpq jnz %1 - mova m4, [rsp] + mova m4, tmp0 psraw m3, m0, 2 mova m5, [outq+DCTSIZE*0*2] pmulhw m1, m0, [pw_7642] pmulhw m2, m0, [pw_4546] - pmulhw m0, [pw_5A82] + pmulhw m0, SQRT2 paddw m5, m4 - mova m6, [rsp+16] + mova m6, tmp1 psubw m2, m1 psubw m4, m3 mova m7, [outq+DCTSIZE*1*2] @@ -337,7 +349,7 @@ cglobal mul_thrmat, 3, 3, 5, thrn, thr, q paddw m2, m0 mova [outq+DCTSIZE*0*2], m5 paddw m7, m6 - mova m3, [rsp+16*2] + mova m3, tmp2 psubw m6, m1 mova m4, [outq+DCTSIZE*2*2] paddw m7, m1 @@ -349,7 +361,7 @@ cglobal mul_thrmat, 3, 3, 5, thrn, thr, q paddw m4, m0 mova m6, [outq+DCTSIZE*3*2] paddw m5, m3 - mova m0, [rsp+16*3] + mova m0, tmp3 mova [outq+DCTSIZE*1*2], m7 paddw m6, m0 mova [outq+DCTSIZE*2*2], m4 @@ -376,23 +388,23 @@ cglobal mul_thrmat, 3, 3, 5, thrn, thr, q paddw m2, m6 pmulhw m0, [pw_4546] mova m7, m2 - mova m4, [rsp] + mova m4, tmp0 psubw m2, m3 paddw m7, m3 - pmulhw m2, [pw_5A82] + pmulhw m2, SQRT2 mova m6, m4 psraw m7, 2 paddw m4, [outq] psubw m6, m7 - mova m3, [rsp+16] + mova m3, tmp1 paddw m4, m7 mova [outq+DCTSIZE*7*2], m6 paddw m1, m5 mova [outq], m4 psubw m1, m7 - mova m7, [rsp+16*2] + mova m7, tmp2 psubw m0, m5 - mova m6, [rsp+16*3] + mova m6, tmp3 mova m5, m3 paddw m3, [outq+DCTSIZE*1*2] psubw m5, m1 @@ -419,7 +431,21 @@ cglobal mul_thrmat, 3, 3, 5, thrn, thr, q %endmacro ;void ff_column_fidct_sse2(int16_t *thr_adr, int16_t *data, int16_t *output, int cnt); -cglobal column_fidct, 4, 5, 8, 64, thr, src, out, cnt, tmp +cglobal 
column_fidct, 4, 5, 8+5*ARCH_X86_64, 64*!ARCH_X86_64, thr, src, out, cnt, tmp +%if ARCH_X86_64 + %define tmp0 m8 + %define tmp1 m9 + %define tmp2 m10 + %define tmp3 m11 + %define SQRT2 m12 + mova m12, [pw_5A82] +%else + %define tmp0 [rsp] + %define tmp1 [rsp+16] + %define tmp2 [rsp+2*16] + %define tmp3 [rsp+3*16] + %define SQRT2 [pw_5A82] +%endif .fdct: COLUMN_FDCT .idct sub cntd, 2 -- 2.49.1 >From cfe9edb8bd267e1bcadad15a8fba244c866cc6bc Mon Sep 17 00:00:00 2001 From: Andreas Rheinhardt <[email protected]> Date: Wed, 12 Nov 2025 23:05:30 +0100 Subject: [PATCH 16/23] avfilter/vf_fspp: Fix effective type violation Also don't use unnecessarily large alignment; it avoids having to align the stack. Signed-off-by: Andreas Rheinhardt <[email protected]> --- libavfilter/vf_fspp.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/libavfilter/vf_fspp.c b/libavfilter/vf_fspp.c index fa562cbd45..3db7fe114e 100644 --- a/libavfilter/vf_fspp.c +++ b/libavfilter/vf_fspp.c @@ -114,9 +114,9 @@ static void filter(FSPPContext *p, uint8_t *dst, uint8_t *src, const int qpsh = 4 - p->hsub * !is_luma; const int qpsv = 4 - p->vsub * !is_luma; - DECLARE_ALIGNED(32, int32_t, block_align)[4 * 8 * BLOCKSZ + 4 * 8 * BLOCKSZ]; - int16_t *block = (int16_t *)block_align; - int16_t *block3 = (int16_t *)(block_align + 4 * 8 * BLOCKSZ); + DECLARE_ALIGNED(16, int16_t, block_align)[8 * 8 * BLOCKSZ + 8 * 8 * BLOCKSZ]; + int16_t *block = block_align; + int16_t *block3 = block_align + 8 * 8 * BLOCKSZ; memset(block3, 0, 4 * 8 * BLOCKSZ); -- 2.49.1 >From 24019cd51376f55e5477b7f038dfffb779b9a21c Mon Sep 17 00:00:00 2001 From: Andreas Rheinhardt <[email protected]> Date: Wed, 12 Nov 2025 23:15:24 +0100 Subject: [PATCH 17/23] avfilter/vf_fsppdsp: Constify Signed-off-by: Andreas Rheinhardt <[email protected]> --- libavfilter/vf_fsppdsp.c | 30 +++++++++++++----------------- libavfilter/vf_fsppdsp.h | 12 ++++++------ libavfilter/x86/vf_fspp_init.c | 6 +++--- tests/checkasm/vf_fspp.c | 4 
++-- 4 files changed, 24 insertions(+), 28 deletions(-) diff --git a/libavfilter/vf_fsppdsp.c b/libavfilter/vf_fsppdsp.c index e530bcd06b..7fdc5ece25 100644 --- a/libavfilter/vf_fsppdsp.c +++ b/libavfilter/vf_fsppdsp.c @@ -121,13 +121,13 @@ void ff_store_slice2_c(uint8_t *restrict dst, int16_t *restrict src, } } -void ff_mul_thrmat_c(int16_t *restrict thr_adr_noq, int16_t *restrict thr_adr, int q) +void ff_mul_thrmat_c(const int16_t *restrict thr_adr_noq, int16_t *restrict thr_adr, int q) { for (int a = 0; a < 64; a++) thr_adr[a] = q * thr_adr_noq[a]; } -void ff_column_fidct_c(int16_t *restrict thr_adr, int16_t *restrict data, +void ff_column_fidct_c(const int16_t *restrict thr_adr, const int16_t *restrict data, int16_t *restrict output, int cnt) { int_simd16_t tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; @@ -135,28 +135,26 @@ void ff_column_fidct_c(int16_t *restrict thr_adr, int16_t *restrict data, int_simd16_t z1,z2,z3,z4,z5, z10, z11, z12, z13; int_simd16_t d0, d1, d2, d3, d4, d5, d6, d7; - int16_t *dataptr; int16_t *wsptr; int16_t *threshold; - dataptr = data; wsptr = output; for (; cnt > 0; cnt -= 2) { //start positions threshold = (int16_t *)thr_adr;//threshold_mtx for (int ctr = DCTSIZE; ctr > 0; ctr--) { // Process columns from input, add to output. 
- tmp0 = dataptr[DCTSIZE * 0] + dataptr[DCTSIZE * 7]; - tmp7 = dataptr[DCTSIZE * 0] - dataptr[DCTSIZE * 7]; + tmp0 = data[DCTSIZE * 0] + data[DCTSIZE * 7]; + tmp7 = data[DCTSIZE * 0] - data[DCTSIZE * 7]; - tmp1 = dataptr[DCTSIZE * 1] + dataptr[DCTSIZE * 6]; - tmp6 = dataptr[DCTSIZE * 1] - dataptr[DCTSIZE * 6]; + tmp1 = data[DCTSIZE * 1] + data[DCTSIZE * 6]; + tmp6 = data[DCTSIZE * 1] - data[DCTSIZE * 6]; - tmp2 = dataptr[DCTSIZE * 2] + dataptr[DCTSIZE * 5]; - tmp5 = dataptr[DCTSIZE * 2] - dataptr[DCTSIZE * 5]; + tmp2 = data[DCTSIZE * 2] + data[DCTSIZE * 5]; + tmp5 = data[DCTSIZE * 2] - data[DCTSIZE * 5]; - tmp3 = dataptr[DCTSIZE * 3] + dataptr[DCTSIZE * 4]; - tmp4 = dataptr[DCTSIZE * 3] - dataptr[DCTSIZE * 4]; + tmp3 = data[DCTSIZE * 3] + data[DCTSIZE * 4]; + tmp4 = data[DCTSIZE * 3] - data[DCTSIZE * 4]; // Even part of FDCT @@ -241,26 +239,24 @@ void ff_column_fidct_c(int16_t *restrict thr_adr, int16_t *restrict data, wsptr[DCTSIZE * 6] = (tmp1 - tmp6); wsptr[DCTSIZE * 7] = (tmp0 - tmp7); // - dataptr++; //next column + data++; //next column wsptr++; threshold++; } - dataptr += 8; //skip each second start pos + data += 8; //skip each second start pos wsptr += 8; } } -void ff_row_idct_c(int16_t *restrict workspace, int16_t *restrict output_adr, +void ff_row_idct_c(const int16_t *restrict wsptr, int16_t *restrict output_adr, ptrdiff_t output_stride, int cnt) { int_simd16_t tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; int_simd16_t tmp10, tmp11, tmp12, tmp13; int_simd16_t z5, z10, z11, z12, z13; int16_t *outptr; - int16_t *wsptr; cnt *= 4; - wsptr = workspace; outptr = output_adr; for (; cnt > 0; cnt--) { // Even part diff --git a/libavfilter/vf_fsppdsp.h b/libavfilter/vf_fsppdsp.h index 66030da4b1..5a2f1af030 100644 --- a/libavfilter/vf_fsppdsp.h +++ b/libavfilter/vf_fsppdsp.h @@ -39,13 +39,13 @@ typedef struct FSPPDSPContext { ptrdiff_t dst_stride, ptrdiff_t src_stride, ptrdiff_t width, ptrdiff_t height, ptrdiff_t log2_scale); - void (*mul_thrmat)(int16_t 
*restrict thr_adr_noq /* align 16 */, + void (*mul_thrmat)(const int16_t *restrict thr_adr_noq /* align 16 */, int16_t *restrict thr_adr /* align 16 */, int q); - void (*column_fidct)(int16_t *restrict thr_adr, int16_t *data, + void (*column_fidct)(const int16_t *restrict thr_adr, const int16_t *restrict data, int16_t *restrict output, int cnt); - void (*row_idct)(int16_t *restrict workspace, int16_t *restrict output_adr, + void (*row_idct)(const int16_t *restrict workspace, int16_t *restrict output_adr, ptrdiff_t output_stride, int cnt); void (*row_fdct)(int16_t *restrict data, const uint8_t *restrict pixels, @@ -61,10 +61,10 @@ void ff_store_slice_c(uint8_t *restrict dst, int16_t *restrict src, void ff_store_slice2_c(uint8_t *restrict dst, int16_t *restrict src, ptrdiff_t dst_stride, ptrdiff_t src_stride, ptrdiff_t width, ptrdiff_t height, ptrdiff_t log2_scale); -void ff_mul_thrmat_c(int16_t *restrict thr_adr_noq, int16_t *restrict thr_adr, int q); -void ff_column_fidct_c(int16_t *restrict thr_adr, int16_t *restrict data, +void ff_mul_thrmat_c(const int16_t *restrict thr_adr_noq, int16_t *restrict thr_adr, int q); +void ff_column_fidct_c(const int16_t *restrict thr_adr, const int16_t *restrict data, int16_t *restrict output, int cnt); -void ff_row_idct_c(int16_t *restrict workspace, int16_t *restrict output_adr, +void ff_row_idct_c(const int16_t *restrict workspace, int16_t *restrict output_adr, ptrdiff_t output_stride, int cnt); void ff_row_fdct_c(int16_t *restrict data, const uint8_t *restrict pixels, ptrdiff_t line_size, int cnt); diff --git a/libavfilter/x86/vf_fspp_init.c b/libavfilter/x86/vf_fspp_init.c index c7a9b1799e..caf94b30d6 100644 --- a/libavfilter/x86/vf_fspp_init.c +++ b/libavfilter/x86/vf_fspp_init.c @@ -29,9 +29,9 @@ void ff_store_slice_sse2(uint8_t *dst, int16_t *src, void ff_store_slice2_sse2(uint8_t *dst, int16_t *src, ptrdiff_t dst_stride, ptrdiff_t src_stride, ptrdiff_t width, ptrdiff_t height, ptrdiff_t log2_scale); -void 
ff_mul_thrmat_sse2(int16_t *thr_adr_noq, int16_t *thr_adr, int q); -void ff_column_fidct_sse2(int16_t *thr_adr, int16_t *data, int16_t *output, int cnt); -void ff_row_idct_mmx(int16_t *workspace, int16_t *output_adr, ptrdiff_t output_stride, int cnt); +void ff_mul_thrmat_sse2(const int16_t *thr_adr_noq, int16_t *thr_adr, int q); +void ff_column_fidct_sse2(const int16_t *thr_adr, const int16_t *data, int16_t *output, int cnt); +void ff_row_idct_mmx(const int16_t *workspace, int16_t *output_adr, ptrdiff_t output_stride, int cnt); void ff_row_fdct_mmx(int16_t *data, const uint8_t *pixels, ptrdiff_t line_size, int cnt); av_cold void ff_fsppdsp_init_x86(FSPPDSPContext *s) diff --git a/tests/checkasm/vf_fspp.c b/tests/checkasm/vf_fspp.c index b65a46247d..341ce0fd37 100644 --- a/tests/checkasm/vf_fspp.c +++ b/tests/checkasm/vf_fspp.c @@ -116,7 +116,7 @@ static void check_mul_thrmat(void) DECLARE_ALIGNED(16, int16_t, dst_ref)[64]; DECLARE_ALIGNED(16, int16_t, dst_new)[64]; const int q = (uint8_t)rnd(); - declare_func(void, int16_t *thr_adr_noq, int16_t *thr_adr, int q); + declare_func(void, const int16_t *thr_adr_noq, int16_t *thr_adr, int q); ff_fsppdsp_init(&fspp); @@ -136,7 +136,7 @@ static void check_column_fidct(void) NB_BLOCKS = 8, ///< arbitrary }; FSPPDSPContext fspp; - declare_func(void, int16_t *thr_adr, int16_t *data, + declare_func(void, const int16_t *thr_adr, const int16_t *data, int16_t *output, int cnt); ff_fsppdsp_init(&fspp); -- 2.49.1 >From c794b6db8dae32f228de7a123e5c79cc880868ca Mon Sep 17 00:00:00 2001 From: Andreas Rheinhardt <[email protected]> Date: Wed, 12 Nov 2025 23:26:04 +0100 Subject: [PATCH 18/23] avfilter/x86/vf_spp: Fix comment Forgotten in dcb28ed860166c9715afb7c71c70889e6b9b8c8d. 
Signed-off-by: Andreas Rheinhardt <[email protected]> --- libavfilter/x86/vf_spp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/libavfilter/x86/vf_spp.c b/libavfilter/x86/vf_spp.c index 48c3d25d7c..7dcf18ec7d 100644 --- a/libavfilter/x86/vf_spp.c +++ b/libavfilter/x86/vf_spp.c @@ -64,7 +64,7 @@ static void store_slice_sse2(uint8_t *dst, const int16_t *src, } } -#endif /* HAVE_MMX_INLINE */ +#endif /* HAVE_SSE2_INLINE */ av_cold void ff_spp_init_x86(SPPContext *s) { -- 2.49.1 >From 4b047d8788cee8ff6ca8190c88d24937b5e7783c Mon Sep 17 00:00:00 2001 From: Andreas Rheinhardt <[email protected]> Date: Thu, 13 Nov 2025 10:48:23 +0100 Subject: [PATCH 19/23] avfilter/vf_fspp: Avoid casts, effective-type violations Maybe uint64_t has been used as a poor man's alignment specifier? Anyway, reading an uint64_t via an lvalue of type int16_t (as happens in the C versions of the dsp functions) is undefined behavior. Signed-off-by: Andreas Rheinhardt <[email protected]> --- libavfilter/vf_fspp.c | 30 ++++++++++++++++++------------ 1 file changed, 18 insertions(+), 12 deletions(-) diff --git a/libavfilter/vf_fspp.c b/libavfilter/vf_fspp.c index 3db7fe114e..670e9288d9 100644 --- a/libavfilter/vf_fspp.c +++ b/libavfilter/vf_fspp.c @@ -37,6 +37,7 @@ #include "libavutil/emms.h" #include "libavutil/imgutils.h" +#include "libavutil/intreadwrite.h" #include "libavutil/mem.h" #include "libavutil/mem_internal.h" #include "libavutil/opt.h" @@ -71,8 +72,8 @@ typedef struct FSPPContext { FSPPDSPContext dsp; - DECLARE_ALIGNED(16, uint64_t, threshold_mtx_noq)[8 * 2]; - DECLARE_ALIGNED(16, uint64_t, threshold_mtx)[8 * 2]; + DECLARE_ALIGNED(16, int16_t, threshold_mtx_noq)[8 * 8]; + DECLARE_ALIGNED(16, int16_t, threshold_mtx)[8 * 8]; } FSPPContext; @@ -154,7 +155,7 @@ static void filter(FSPPContext *p, uint8_t *dst, uint8_t *src, p->dsp.row_fdct(block + 8 * 8, p->src + y * stride + 8 + x0 + 2 - (y&1), stride, 2 * (BLOCKSZ - 1)); if (p->qp) - p->dsp.column_fidct((int16_t 
*)(&p->threshold_mtx[0]), block + 0 * 8, block3 + 0 * 8, 8 * (BLOCKSZ - 1)); //yes, this is a HOTSPOT + p->dsp.column_fidct(p->threshold_mtx, block + 0 * 8, block3 + 0 * 8, 8 * (BLOCKSZ - 1)); //yes, this is a HOTSPOT else for (x = 0; x < 8 * (BLOCKSZ - 1); x += 8) { t = x + x0 - 2; //correct t=x+x0-2-(y&1), but its the same @@ -164,8 +165,11 @@ static void filter(FSPPContext *p, uint8_t *dst, uint8_t *src, t = qp_store[qy + (t >> qpsh)]; t = ff_norm_qscale(t, p->qscale_type); - if (t != p->prev_q) p->prev_q = t, p->dsp.mul_thrmat((int16_t *)(&p->threshold_mtx_noq[0]), (int16_t *)(&p->threshold_mtx[0]), t); - p->dsp.column_fidct((int16_t *)(&p->threshold_mtx[0]), block + x * 8, block3 + x * 8, 8); //yes, this is a HOTSPOT + if (t != p->prev_q) { + p->prev_q = t; + p->dsp.mul_thrmat(p->threshold_mtx_noq, p->threshold_mtx, t); + } + p->dsp.column_fidct(p->threshold_mtx, block + x * 8, block3 + x * 8, 8); //yes, this is a HOTSPOT } p->dsp.row_idct(block3 + 0 * 8, p->temp + (y & 15) * stride + x0 + 2 - (y & 1), stride, 2 * (BLOCKSZ - 1)); memmove(block, block + (BLOCKSZ - 1) * 64, 8 * 8 * sizeof(int16_t)); //cycling @@ -176,7 +180,7 @@ static void filter(FSPPContext *p, uint8_t *dst, uint8_t *src, if (es > 8) p->dsp.row_fdct(block + 8 * 8, p->src + y * stride + 8 + x0 + 2 - (y & 1), stride, (es - 4) >> 2); - p->dsp.column_fidct((int16_t *)(&p->threshold_mtx[0]), block, block3, es&(~1)); + p->dsp.column_fidct(p->threshold_mtx, block, block3, es&(~1)); if (es > 3) p->dsp.row_idct(block3 + 0 * 8, p->temp + (y & 15) * stride + x0 + 2 - (y & 1), stride, es >> 2); @@ -251,19 +255,21 @@ static int filter_frame(AVFilterLink *inlink, AVFrame *in) custom_threshold_m[i] = (int)(custom_threshold[i] * (bias / 71.0) + 0.5); for (i = 0; i < 8; i++) { - fspp->threshold_mtx_noq[2 * i] = (uint64_t)custom_threshold_m[i * 8 + 2] + AV_WN64A(&fspp->threshold_mtx_noq[8 * i], (uint64_t)custom_threshold_m[i * 8 + 2] |(((uint64_t)custom_threshold_m[i * 8 + 6]) << 16) 
|(((uint64_t)custom_threshold_m[i * 8 + 0]) << 32) - |(((uint64_t)custom_threshold_m[i * 8 + 4]) << 48); + |(((uint64_t)custom_threshold_m[i * 8 + 4]) << 48)); - fspp->threshold_mtx_noq[2 * i + 1] = (uint64_t)custom_threshold_m[i * 8 + 5] + AV_WN64A(&fspp->threshold_mtx_noq[8 * i + 4], (uint64_t)custom_threshold_m[i * 8 + 5] |(((uint64_t)custom_threshold_m[i * 8 + 3]) << 16) |(((uint64_t)custom_threshold_m[i * 8 + 1]) << 32) - |(((uint64_t)custom_threshold_m[i * 8 + 7]) << 48); + |(((uint64_t)custom_threshold_m[i * 8 + 7]) << 48)); } - if (fspp->qp) - fspp->prev_q = fspp->qp, fspp->dsp.mul_thrmat((int16_t *)(&fspp->threshold_mtx_noq[0]), (int16_t *)(&fspp->threshold_mtx[0]), fspp->qp); + if (fspp->qp) { + fspp->prev_q = fspp->qp; + fspp->dsp.mul_thrmat(fspp->threshold_mtx_noq, fspp->threshold_mtx, fspp->qp); + } /* if we are not in a constant user quantizer mode and we don't want to use * the quantizers from the B-frames (B-frames often have a higher QP), we -- 2.49.1 >From 6d4b85dc3c10f0e41410928db16af912e6945dc0 Mon Sep 17 00:00:00 2001 From: Andreas Rheinhardt <[email protected]> Date: Thu, 13 Nov 2025 11:02:56 +0100 Subject: [PATCH 20/23] avfilter/vf_fspp: Make output endian-independent Signed-off-by: Andreas Rheinhardt <[email protected]> --- libavfilter/vf_fspp.c | 20 +++++++++----------- 1 file changed, 9 insertions(+), 11 deletions(-) diff --git a/libavfilter/vf_fspp.c b/libavfilter/vf_fspp.c index 670e9288d9..9e5c688fb2 100644 --- a/libavfilter/vf_fspp.c +++ b/libavfilter/vf_fspp.c @@ -37,7 +37,6 @@ #include "libavutil/emms.h" #include "libavutil/imgutils.h" -#include "libavutil/intreadwrite.h" #include "libavutil/mem.h" #include "libavutil/mem_internal.h" #include "libavutil/opt.h" @@ -254,16 +253,15 @@ static int filter_frame(AVFilterLink *inlink, AVFrame *in) for (i = 0; i < 64; i++) //FIXME: tune custom_threshold[] and remove this ! 
custom_threshold_m[i] = (int)(custom_threshold[i] * (bias / 71.0) + 0.5); - for (i = 0; i < 8; i++) { - AV_WN64A(&fspp->threshold_mtx_noq[8 * i], (uint64_t)custom_threshold_m[i * 8 + 2] - |(((uint64_t)custom_threshold_m[i * 8 + 6]) << 16) - |(((uint64_t)custom_threshold_m[i * 8 + 0]) << 32) - |(((uint64_t)custom_threshold_m[i * 8 + 4]) << 48)); - - AV_WN64A(&fspp->threshold_mtx_noq[8 * i + 4], (uint64_t)custom_threshold_m[i * 8 + 5] - |(((uint64_t)custom_threshold_m[i * 8 + 3]) << 16) - |(((uint64_t)custom_threshold_m[i * 8 + 1]) << 32) - |(((uint64_t)custom_threshold_m[i * 8 + 7]) << 48)); + for (int i = 0; i < 64; i += 8) { + fspp->threshold_mtx_noq[i + 0] = custom_threshold_m[i + 2]; + fspp->threshold_mtx_noq[i + 1] = custom_threshold_m[i + 6]; + fspp->threshold_mtx_noq[i + 2] = custom_threshold_m[i + 0]; + fspp->threshold_mtx_noq[i + 3] = custom_threshold_m[i + 4]; + fspp->threshold_mtx_noq[i + 4] = custom_threshold_m[i + 5]; + fspp->threshold_mtx_noq[i + 5] = custom_threshold_m[i + 3]; + fspp->threshold_mtx_noq[i + 6] = custom_threshold_m[i + 1]; + fspp->threshold_mtx_noq[i + 7] = custom_threshold_m[i + 7]; } if (fspp->qp) { -- 2.49.1 >From c362c3e167b7a26c95e4cb6fe24f7eaf486bcb40 Mon Sep 17 00:00:00 2001 From: Andreas Rheinhardt <[email protected]> Date: Thu, 13 Nov 2025 11:18:28 +0100 Subject: [PATCH 21/23] avfilter/vf_fspp: Pre-reorder threshold table Avoids reordering at runtime. 
Signed-off-by: Andreas Rheinhardt <[email protected]> --- libavfilter/vf_fspp.c | 38 +++++++++++++------------------------- 1 file changed, 13 insertions(+), 25 deletions(-) diff --git a/libavfilter/vf_fspp.c b/libavfilter/vf_fspp.c index 9e5c688fb2..cbf2e06d67 100644 --- a/libavfilter/vf_fspp.c +++ b/libavfilter/vf_fspp.c @@ -92,14 +92,16 @@ static const short custom_threshold[64] = { // values (296) can't be too high // -it causes too big quant dependence // or maybe overflow(check), which results in some flashing - 71, 296, 295, 237, 71, 40, 38, 19, - 245, 193, 185, 121, 102, 73, 53, 27, - 158, 129, 141, 107, 97, 73, 50, 26, - 102, 116, 109, 98, 82, 66, 45, 23, - 71, 94, 95, 81, 70, 56, 38, 20, - 56, 77, 74, 66, 56, 44, 30, 15, - 38, 53, 50, 45, 38, 30, 21, 11, - 20, 27, 26, 23, 20, 15, 11, 5 +// reorder coefficients to the order in which columns are processed +#define REORDER(a,b,c,d,e,f,g,h) c, g, a, e, f, d, b, h + REORDER( 71, 296, 295, 237, 71, 40, 38, 19), + REORDER(245, 193, 185, 121, 102, 73, 53, 27), + REORDER(158, 129, 141, 107, 97, 73, 50, 26), + REORDER(102, 116, 109, 98, 82, 66, 45, 23), + REORDER( 71, 94, 95, 81, 70, 56, 38, 20), + REORDER( 56, 77, 74, 66, 56, 44, 30, 15), + REORDER( 38, 53, 50, 45, 38, 30, 21, 11), + REORDER( 20, 27, 26, 23, 20, 15, 11, 5) }; static void filter(FSPPContext *p, uint8_t *dst, uint8_t *src, @@ -244,25 +246,11 @@ static int filter_frame(AVFilterLink *inlink, AVFrame *in) int qp_stride = 0; int8_t *qp_table = NULL; - int i, bias; int ret = 0; - int custom_threshold_m[64]; - bias = (1 << 4) + fspp->strength; - - for (i = 0; i < 64; i++) //FIXME: tune custom_threshold[] and remove this ! 
- custom_threshold_m[i] = (int)(custom_threshold[i] * (bias / 71.0) + 0.5); - - for (int i = 0; i < 64; i += 8) { - fspp->threshold_mtx_noq[i + 0] = custom_threshold_m[i + 2]; - fspp->threshold_mtx_noq[i + 1] = custom_threshold_m[i + 6]; - fspp->threshold_mtx_noq[i + 2] = custom_threshold_m[i + 0]; - fspp->threshold_mtx_noq[i + 3] = custom_threshold_m[i + 4]; - fspp->threshold_mtx_noq[i + 4] = custom_threshold_m[i + 5]; - fspp->threshold_mtx_noq[i + 5] = custom_threshold_m[i + 3]; - fspp->threshold_mtx_noq[i + 6] = custom_threshold_m[i + 1]; - fspp->threshold_mtx_noq[i + 7] = custom_threshold_m[i + 7]; - } + //FIXME: tune custom_threshold[] and remove this ! + for (int i = 0, bias = (1 << 4) + fspp->strength; i < 64; ++i) + fspp->threshold_mtx_noq[i] = (int)(custom_threshold[i] * (bias / 71.0) + 0.5); if (fspp->qp) { fspp->prev_q = fspp->qp; -- 2.49.1 >From a86a9361989d99ad8db46eabf39f664ed3f89072 Mon Sep 17 00:00:00 2001 From: Andreas Rheinhardt <[email protected]> Date: Thu, 13 Nov 2025 12:04:15 +0100 Subject: [PATCH 22/23] avfilter/vf_fsppdsp: Remove pointless cast Also don't cast const away and use a smaller scope. Signed-off-by: Andreas Rheinhardt <[email protected]> --- libavfilter/vf_fsppdsp.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/libavfilter/vf_fsppdsp.c b/libavfilter/vf_fsppdsp.c index 7fdc5ece25..3230376a19 100644 --- a/libavfilter/vf_fsppdsp.c +++ b/libavfilter/vf_fsppdsp.c @@ -136,12 +136,11 @@ void ff_column_fidct_c(const int16_t *restrict thr_adr, const int16_t *restrict int_simd16_t d0, d1, d2, d3, d4, d5, d6, d7; int16_t *wsptr; - int16_t *threshold; wsptr = output; for (; cnt > 0; cnt -= 2) { //start positions - threshold = (int16_t *)thr_adr;//threshold_mtx + const int16_t *threshold = thr_adr;//threshold_mtx for (int ctr = DCTSIZE; ctr > 0; ctr--) { // Process columns from input, add to output. 
tmp0 = data[DCTSIZE * 0] + data[DCTSIZE * 7]; -- 2.49.1 >From 57ca0480e6dcb64f1b4f948b6a79bcb8aaa97723 Mon Sep 17 00:00:00 2001 From: Andreas Rheinhardt <[email protected]> Date: Thu, 13 Nov 2025 11:57:02 +0100 Subject: [PATCH 23/23] avfilter/vf_fsppdsp: Fix left shifts of negative numbers They are undefined behavior and UBSan warns about them (in the checkasm test). Put the shifts in the constants instead. This even gives a tiny speedup here: Old benchmarks: column_fidct_c: 3369.9 ( 1.00x) column_fidct_sse2: 829.1 ( 4.06x) New benchmarks: column_fidct_c: 3304.2 ( 1.00x) column_fidct_sse2: 827.9 ( 3.99x) Signed-off-by: Andreas Rheinhardt <[email protected]> --- libavfilter/vf_fsppdsp.c | 46 ++++++++++++++++++++-------------------- 1 file changed, 23 insertions(+), 23 deletions(-) diff --git a/libavfilter/vf_fsppdsp.c b/libavfilter/vf_fsppdsp.c index 3230376a19..8025e87366 100644 --- a/libavfilter/vf_fsppdsp.c +++ b/libavfilter/vf_fsppdsp.c @@ -165,7 +165,7 @@ void ff_column_fidct_c(const int16_t *restrict thr_adr, const int16_t *restrict d0 = tmp10 + tmp11; d4 = tmp10 - tmp11; - z1 = MULTIPLY16H((tmp12 + tmp13) << 2, FIX_0_707106781); + z1 = MULTIPLY16H(tmp12 + tmp13, FIX_0_707106781 << 2); d2 = tmp13 + z1; d6 = tmp13 - z1; @@ -193,10 +193,10 @@ void ff_column_fidct_c(const int16_t *restrict thr_adr, const int16_t *restrict tmp11 = tmp5 + tmp6; tmp12 = tmp6 + tmp7; - z5 = MULTIPLY16H((tmp10 - tmp12) << 2, FIX_0_382683433); - z2 = MULTIPLY16H(tmp10 << 2, FIX_0_541196100) + z5; - z4 = MULTIPLY16H(tmp12 << 2, FIX_1_306562965) + z5; - z3 = MULTIPLY16H(tmp11 << 2, FIX_0_707106781); + z5 = MULTIPLY16H(tmp10 - tmp12, FIX_0_382683433 << 2); + z2 = MULTIPLY16H(tmp10, FIX_0_541196100 << 2) + z5; + z4 = MULTIPLY16H(tmp12, FIX_1_306562965 << 2) + z5; + z3 = MULTIPLY16H(tmp11, FIX_0_707106781 << 2); z11 = tmp7 + z3; z13 = tmp7 - z3; @@ -215,15 +215,15 @@ void ff_column_fidct_c(const int16_t *restrict thr_adr, const int16_t *restrict //Simd version uses here a shortcut for the 
tmp5,tmp6,tmp7 == 0 z13 = tmp6 + tmp5; - z10 = (tmp6 - tmp5) << 1; + z10 = (tmp6 - tmp5) * 2; z11 = tmp4 + tmp7; - z12 = (tmp4 - tmp7) << 1; + z12 = (tmp4 - tmp7) * 2; tmp7 = (z11 + z13) >> 2; //+2 ! - tmp11 = MULTIPLY16H((z11 - z13) << 1, FIX_1_414213562); - z5 = MULTIPLY16H(z10 + z12, FIX_1_847759065); - tmp10 = MULTIPLY16H(z12, FIX_1_082392200) - z5; - tmp12 = MULTIPLY16H(z10, FIX_2_613125930) + z5; // - !! + tmp11 = MULTIPLY16H(z11 - z13, FIX_1_414213562 << 1); + z5 = MULTIPLY16H(z10 + z12, FIX_1_847759065); + tmp10 = MULTIPLY16H(z12, FIX_1_082392200) - z5; + tmp12 = MULTIPLY16H(z10, FIX_2_613125930) + z5; // - !! tmp6 = tmp12 - tmp7; tmp5 = tmp11 - tmp6; @@ -264,7 +264,7 @@ void ff_row_idct_c(const int16_t *restrict wsptr, int16_t *restrict output_adr, tmp11 = wsptr[2] - wsptr[3]; tmp13 = wsptr[0] + wsptr[1]; - tmp12 = (MULTIPLY16H(wsptr[0] - wsptr[1], FIX_1_414213562_A) << 2) - tmp13;//this shift order to avoid overflow + tmp12 = (MULTIPLY16H(wsptr[0] - wsptr[1], FIX_1_414213562_A) * 4) - tmp13;//this shift order to avoid overflow tmp0 = tmp10 + tmp13; //->temps tmp3 = tmp10 - tmp13; //->temps @@ -289,9 +289,9 @@ void ff_row_idct_c(const int16_t *restrict wsptr, int16_t *restrict output_adr, tmp10 = MULTIPLY16H(z12, FIX_1_082392200) - z5; tmp12 = MULTIPLY16H(z10, FIX_2_613125930) + z5; // - FIX_ - tmp6 = (tmp12 << 3) - tmp7; - tmp5 = (tmp11 << 3) - tmp6; - tmp4 = (tmp10 << 3) + tmp5; + tmp6 = tmp12 * 8 - tmp7; + tmp5 = tmp11 * 8 - tmp6; + tmp4 = tmp10 * 8 + tmp5; // Final output stage: descale and write column outptr[0 * output_stride] += DESCALE(tmp0 + tmp7, 3); @@ -342,20 +342,20 @@ void ff_row_fdct_c(int16_t *restrict data, const uint8_t *restrict pixels, dataptr[2] = tmp10 + tmp11; dataptr[3] = tmp10 - tmp11; - z1 = MULTIPLY16H((tmp12 + tmp13) << 2, FIX_0_707106781); + z1 = MULTIPLY16H(tmp12 + tmp13, FIX_0_707106781 << 2); dataptr[0] = tmp13 + z1; dataptr[1] = tmp13 - z1; // Odd part - tmp10 = (tmp4 + tmp5) << 2; - tmp11 = (tmp5 + tmp6) << 2; - tmp12 = 
(tmp6 + tmp7) << 2; + tmp10 = tmp4 + tmp5; + tmp11 = tmp5 + tmp6; + tmp12 = tmp6 + tmp7; - z5 = MULTIPLY16H(tmp10 - tmp12, FIX_0_382683433); - z2 = MULTIPLY16H(tmp10, FIX_0_541196100) + z5; - z4 = MULTIPLY16H(tmp12, FIX_1_306562965) + z5; - z3 = MULTIPLY16H(tmp11, FIX_0_707106781); + z5 = MULTIPLY16H(tmp10 - tmp12, FIX_0_382683433 << 2); + z2 = MULTIPLY16H(tmp10, FIX_0_541196100 << 2) + z5; + z4 = MULTIPLY16H(tmp12, FIX_1_306562965 << 2) + z5; + z3 = MULTIPLY16H(tmp11, FIX_0_707106781 << 2); z11 = tmp7 + z3; z13 = tmp7 - z3; -- 2.49.1 _______________________________________________ ffmpeg-devel mailing list -- [email protected] To unsubscribe send an email to [email protected]
