PR #23315 opened by Marcos Ashton (MarcosAsh)
URL: https://code.ffmpeg.org/FFmpeg/FFmpeg/pulls/23315
Patch URL: https://code.ffmpeg.org/FFmpeg/FFmpeg/pulls/23315.patch
## Summary
Adds SSE2 and SSSE3 implementations of both median_sad comparison functions
(-cmp msad) to libavcodec/x86/me_cmp. These have had aarch64 NEON
implementations since 2022 (0ee535b1db, b2732115dd) but no x86 SIMD, so x86
always used the C fallback.
The approach matches the NEON code: widen V = pix1 - pix2 to int16 words,
compute the median predictor branchlessly with mid_pred(a, b, c) == max(min(a,
b), min(max(a, b), c)), and accumulate absolute differences per column. Shifted
column vectors are produced in-register with psrldq so no out-of-bounds loads
are made. Three-operand instruction forms are used throughout, so an AVX
version only needs another INIT_XMM instantiation.
checkasm --bench on an Intel Core Ultra 7 155H:
median_sad_0_c: 801.3 ( 1.00x)
median_sad_0_sse2: 160.1 ( 5.00x)
median_sad_0_ssse3: 115.9 ( 6.92x)
median_sad_1_c: 349.4 ( 1.00x)
median_sad_1_sse2: 92.0 ( 3.80x)
median_sad_1_ssse3: 68.4 ( 5.11x)
End to end, mpeg4 encoding of 720p with -cmp msad -subcmp msad -mbcmp msad
-precmp msad uses 41% less CPU time (16.30s -> 9.65s user time, 300 frames).
Output is bit-exact with the C implementation, verified with framemd5 against
-cpuflags 0 under -flags +bitexact.
Both functions are x86_64 only because they need more than the eight XMM
registers available on x86_32 (median_sad16 uses 15, median_sad8 uses 10); the
NEON versions are likewise aarch64 only. x86_32 keeps using the C code. Covered
by the existing checkasm motion test, and the full FATE suite passes.
From 544ba24e8b7676496d5375da17d7a557e8163187 Mon Sep 17 00:00:00 2001
From: marcos ashton <[email protected]>
Date: Tue, 2 Jun 2026 19:49:41 +0100
Subject: [PATCH 1/2] avcodec/x86/me_cmp: add SSE2 and SSSE3 median_sad16
The median_sad functions have NEON implementations but no x86 ones,
so x86 always used the C code. x86_64 only due to register pressure.
median_sad_0_c: 801.3 ( 1.00x)
median_sad_0_sse2: 160.1 ( 5.00x)
median_sad_0_ssse3: 115.9 ( 6.92x)
Benchmarks and tests run with checkasm on an Intel Core Ultra 7 155H.
Signed-off-by: marcos ashton <[email protected]>
---
libavcodec/x86/me_cmp.asm | 121 +++++++++++++++++++++++++++++++++++
libavcodec/x86/me_cmp_init.c | 13 ++++
2 files changed, 134 insertions(+)
diff --git a/libavcodec/x86/me_cmp.asm b/libavcodec/x86/me_cmp.asm
index 314b091fc8..3c2cb416ea 100644
--- a/libavcodec/x86/me_cmp.asm
+++ b/libavcodec/x86/me_cmp.asm
@@ -809,3 +809,124 @@ VSAD_APPROX 8, a
INIT_XMM sse2
VSAD_APPROX 16, a
VSAD_APPROX 16, u
+
+;---------------------------------------------------------------------
+;int ff_median_sad<size>_<opt>(MPVEncContext *v, const uint8_t *pix1, const uint8_t *pix2,
+; ptrdiff_t stride, int h);
+;---------------------------------------------------------------------
+%if ARCH_X86_64
+
+; Load one row of 16 pixels from pix1/pix2 and compute V = pix1 - pix2 as
+; int16 words, both unshifted (columns 0-15) and shifted by one column
+; (columns 1-16; the last word is not a pixel difference, the caller drops it).
+; %1: V columns 0-7, %2: V columns 8-15, %3: V columns 1-8, %4: V columns 9-16
+; %5, %6: temporaries (clobbered), %7: zero register (only read)
+%macro LOAD_V16 7
+ movu %1, [pix1q] ; 16 bytes of pix1 (unaligned load); pix1q is not advanced here
+ movu %5, [pix2q] ; 16 bytes of pix2
+ psrldq %3, %1, 1 ; pix1 moved down one byte: columns 1-15, then the zero byte shifted in
+ psrldq %6, %5, 1 ; same for pix2
+ punpckhbw %2, %1, %7 ; pix1 columns 8-15 widened to words (interleaved with %7 = 0)
+ punpcklbw %1, %7 ; pix1 columns 0-7 widened to words
+ punpckhbw %4, %5, %7 ; pix2 columns 8-15; %4 serves as scratch until its final write below
+ punpcklbw %5, %7 ; pix2 columns 0-7
+ psubw %1, %5 ; V columns 0-7
+ psubw %2, %4 ; V columns 8-15
+ punpckhbw %4, %3, %7 ; shifted pix1 columns 9-16 (must precede the in-place widening of %3)
+ punpcklbw %3, %7 ; shifted pix1 columns 1-8
+ punpckhbw %5, %6, %7 ; shifted pix2 columns 9-16; %5 is free again after the first psubw
+ punpcklbw %6, %7 ; shifted pix2 columns 1-8
+ psubw %3, %6 ; V columns 1-8
+ psubw %4, %5 ; V columns 9-16
+%endmacro
+
+; Accumulate abs(%5 - mid_pred(%2, %3, %2 + %3 - %4)) into %1, using
+; mid_pred(a, b, c) == max(min(a, b), min(max(a, b), c)) on signed int16 lanes.
+; %1: accumulator, %2: top, %3: left, %4: topleft, %5: values being predicted
+; %6, %7, %8: temporaries (clobbered); %2-%5 are only read
+%macro MEDIAN_ABS_ACC 8
+ paddw %6, %2, %3 ; top + left
+ psubw %6, %4 ; top + left - topleft
+ pminsw %7, %2, %3 ; min(top, left)
+ pmaxsw %8, %2, %3 ; max(top, left)
+ pminsw %8, %6 ; min(max(top, left), top + left - topleft)
+ pmaxsw %7, %8 ; mid_pred(top, left, top + left - topleft)
+ psubw %6, %5, %7 ; value - prediction; %6 (the gradient) is no longer needed
+ ABS1 %6, %8 ; absolute value per word; ABS1 is defined outside this patch, %8 passed as its scratch
+ paddw %1, %6 ; add to the per-column running sums
+%endmacro
+
+; Register layout of MEDIAN_SAD16 (V = pix1 - pix2, one int16 word per column):
+; m0 accumulator for columns 1-8
+; m1 accumulator for columns 9-16 (the last word is discarded at the end)
+; m2 accumulator for column 0 (only the first word is used)
+; m3 previous row V, columns 0-7 (topleft predictors)
+; m4 previous row V, columns 8-15
+; m5 previous row V, columns 1-8 (top predictors)
+; m6 previous row V, columns 9-16
+; m7 zero
+; m8 current row V, columns 0-7 (left predictors)
+; m9 current row V, columns 8-15
+; m10 current row V, columns 1-8 (values being predicted)
+; m11 current row V, columns 9-16
+; m12-m14 temporaries
+%macro MEDIAN_SAD16 0
+cglobal median_sad16, 5, 5, 15, v, pix1, pix2, stride, h ; x86inc: 5 args, 5 GPRs, 15 XMM registers; v is unused
+ pxor m7, m7 ; m7 = 0, used to widen bytes/words
+ LOAD_V16 m3, m4, m5, m6, m12, m13, m7 ; row 0 goes straight into the "previous row" registers
+
+ ; first row: abs(V(0)) + sum of abs(V(j) - V(j-1))
+ mova m2, m3 ; V columns 0-7; only word 0 (column 0) survives the mask at .end
+ ABS1 m2, m12 ; abs(V(0)) in word 0
+ psubw m0, m5, m3 ; V(j) - V(j-1) for j = 1..8
+ ABS1 m0, m12
+ psubw m1, m6, m4 ; V(j) - V(j-1) for j = 9..16
+ ABS1 m1, m12
+
+ add pix1q, strideq ; step both pointers to the next row
+ add pix2q, strideq
+ sub hd, 1 ; one row consumed
+ jle .end ; h <= 1: nothing left to predict
+.loop:
+ LOAD_V16 m8, m9, m10, m11, m12, m13, m7 ; current row
+ ; column 0: abs(V(0) - V(-stride))
+ psubw m12, m8, m3 ; current - previous row, columns 0-7
+ ABS1 m12, m13
+ paddw m2, m12 ; words 1-7 are computed too but dropped at .end
+ ; columns 1-8 and 9-16
+ MEDIAN_ABS_ACC m0, m5, m8, m3, m10, m12, m13, m14 ; top m5, left m8, topleft m3, predicted m10
+ MEDIAN_ABS_ACC m1, m6, m9, m4, m11, m12, m13, m14 ; top m6, left m9, topleft m4, predicted m11
+ ; the current row becomes the previous row
+ mova m3, m8
+ mova m4, m9
+ mova m5, m10
+ mova m6, m11
+ add pix1q, strideq ; step both pointers to the next row
+ add pix2q, strideq
+ sub hd, 1
+ jg .loop ; rows remain
+.end:
+ ; column 16 lies outside of the block and column 0 only contributes
+ ; its first word
+ pslldq m1, 2 ; shift the top word (column 16) out of the register ...
+ psrldq m1, 2 ; ... and move the rest back: word 7 is now zero
+ pslldq m2, 14 ; keep only word 0 (column 0) ...
+ psrldq m2, 14 ; ... restored to its original position, other words zero
+ paddw m0, m1
+ paddw m0, m2 ; m0 = all per-word partial sums
+ ; each word now holds up to three column sums of at most h * 510 each, which
+ ; fits 16 bits for h <= 16 (assumed); the total does not, so widen to dwords
+ punpckhwd m12, m0, m7 ; words 4-7 zero-extended to dwords
+ punpcklwd m0, m7 ; words 0-3 zero-extended to dwords
+ paddd m0, m12
+ HADDD m0, m12 ; horizontal dword sum (macro defined outside this patch), m12 as scratch
+ movd eax, m0 ; return the total in eax
+ RET
+%endmacro
+
+INIT_XMM sse2
+MEDIAN_SAD16 ; instantiated as ff_median_sad16_sse2
+INIT_XMM ssse3
+MEDIAN_SAD16 ; instantiated as ff_median_sad16_ssse3
+
+%endif ; ARCH_X86_64
diff --git a/libavcodec/x86/me_cmp_init.c b/libavcodec/x86/me_cmp_init.c
index dbb4ef96bb..d7d30c3235 100644
--- a/libavcodec/x86/me_cmp_init.c
+++ b/libavcodec/x86/me_cmp_init.c
@@ -22,6 +22,7 @@
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
+#include "config.h"
#include "libavutil/attributes.h"
#include "libavutil/cpu.h"
#include "libavutil/x86/cpu.h"
@@ -70,6 +71,10 @@ int ff_vsad16_approx_sse2(MPVEncContext *v, const uint8_t
*pix1, const uint8_t *
ptrdiff_t stride, int h);
int ff_vsad16u_approx_sse2(MPVEncContext *v, const uint8_t *pix1, const
uint8_t *pix2,
ptrdiff_t stride, int h);
+int ff_median_sad16_sse2(MPVEncContext *v, const uint8_t *pix1, const uint8_t
*pix2,
+ ptrdiff_t stride, int h);
+int ff_median_sad16_ssse3(MPVEncContext *v, const uint8_t *pix1, const uint8_t
*pix2,
+ ptrdiff_t stride, int h);
#define hadamard_func(cpu)
\
int ff_hadamard8_diff_ ## cpu(MPVEncContext *s, const uint8_t *src1,
\
@@ -160,6 +165,10 @@ av_cold void ff_me_cmp_init_x86(MECmpContext *c,
AVCodecContext *avctx)
} else {
c->pix_abs[0][3] = ff_sad16_approx_xy2_sse2;
}
+
+#if ARCH_X86_64
+ c->median_sad[0] = ff_median_sad16_sse2;
+#endif
}
if (EXTERNAL_SSSE3(cpu_flags)) {
@@ -169,5 +178,9 @@ av_cold void ff_me_cmp_init_x86(MECmpContext *c,
AVCodecContext *avctx)
c->sum_abs_dctelem = ff_sum_abs_dctelem_ssse3;
c->hadamard8_diff[0] = ff_hadamard8_diff16_ssse3;
c->hadamard8_diff[1] = ff_hadamard8_diff_ssse3;
+
+#if ARCH_X86_64
+ c->median_sad[0] = ff_median_sad16_ssse3;
+#endif
}
}
--
2.52.0
From e361c811a64d6e9d4dbd844d0ca8bb48cb9f1023 Mon Sep 17 00:00:00 2001
From: marcos ashton <[email protected]>
Date: Tue, 2 Jun 2026 19:50:56 +0100
Subject: [PATCH 2/2] avcodec/x86/me_cmp: add SSE2 and SSSE3 median_sad8
Same approach as median_sad16, processing one 8 pixel row per XMM
register.
median_sad_1_c: 349.4 ( 1.00x)
median_sad_1_sse2: 92.0 ( 3.80x)
median_sad_1_ssse3: 68.4 ( 5.11x)
Benchmarks and tests run with checkasm on an Intel Core Ultra 7 155H.
Signed-off-by: marcos ashton <[email protected]>
---
libavcodec/x86/me_cmp.asm | 75 ++++++++++++++++++++++++++++++++++++
libavcodec/x86/me_cmp_init.c | 6 +++
2 files changed, 81 insertions(+)
diff --git a/libavcodec/x86/me_cmp.asm b/libavcodec/x86/me_cmp.asm
index 3c2cb416ea..8954c01654 100644
--- a/libavcodec/x86/me_cmp.asm
+++ b/libavcodec/x86/me_cmp.asm
@@ -840,6 +840,21 @@ VSAD_APPROX 16, u
psubw %4, %5
%endmacro
+; Same as LOAD_V16 for one row of 8 pixels (the last word of %2 is not a pixel difference).
+; %1: V columns 0-7, %2: V columns 1-8, %3, %4: temporaries, %5: zero register
+%macro LOAD_V8 5
+ movq %1, [pix1q] ; 8 bytes of pix1 in the low half; pix1q is not advanced here
+ movq %3, [pix2q] ; 8 bytes of pix2
+ psrldq %2, %1, 1 ; pix1 moved down one byte: columns 1-7 followed by the byte above the row
+ psrldq %4, %3, 1 ; same for pix2
+ punpcklbw %1, %5 ; pix1 columns 0-7 widened to words (interleaved with %5 = 0)
+ punpcklbw %3, %5 ; pix2 columns 0-7
+ psubw %1, %3 ; V columns 0-7
+ punpcklbw %2, %5 ; shifted pix1 columns 1-8
+ punpcklbw %4, %5 ; shifted pix2 columns 1-8
+ psubw %2, %4 ; V columns 1-8
+%endmacro
+
; Accumulate abs(%5 - mid_pred(%2, %3, %2 + %3 - %4)) into %1, using
; mid_pred(a, b, c) == max(min(a, b), min(max(a, b), c)).
; %1: accumulator, %2: top, %3: left, %4: topleft, %5: values being predicted
@@ -929,4 +944,64 @@ MEDIAN_SAD16
INIT_XMM ssse3
MEDIAN_SAD16
+; Register layout of MEDIAN_SAD8 (V = pix1 - pix2, one int16 word per column):
+; m0 accumulator for columns 1-8 (the last word is discarded at the end)
+; m1 accumulator for column 0 (only the first word is used)
+; m2 previous row V, columns 0-7 (topleft predictors)
+; m3 previous row V, columns 1-8 (top predictors)
+; m4 zero
+; m5 current row V, columns 0-7 (left predictors)
+; m6 current row V, columns 1-8 (values being predicted)
+; m7-m9 temporaries
+%macro MEDIAN_SAD8 0
+cglobal median_sad8, 5, 5, 10, v, pix1, pix2, stride, h ; x86inc: 5 args, 5 GPRs, 10 XMM registers; v is unused
+ pxor m4, m4 ; m4 = 0, used to widen bytes/words
+ LOAD_V8 m2, m3, m7, m8, m4 ; row 0 goes straight into the "previous row" registers
+
+ ; first row: abs(V(0)) + sum of abs(V(j) - V(j-1))
+ mova m1, m2 ; V columns 0-7; only word 0 (column 0) survives the mask at .end
+ ABS1 m1, m7 ; abs(V(0)) in word 0
+ psubw m0, m3, m2 ; V(j) - V(j-1) for j = 1..8
+ ABS1 m0, m7
+
+ add pix1q, strideq ; step both pointers to the next row
+ add pix2q, strideq
+ sub hd, 1 ; one row consumed
+ jle .end ; h <= 1: nothing left to predict
+.loop:
+ LOAD_V8 m5, m6, m7, m8, m4 ; current row
+ ; column 0: abs(V(0) - V(-stride))
+ psubw m7, m5, m2 ; current - previous row, columns 0-7
+ ABS1 m7, m8
+ paddw m1, m7 ; words 1-7 are computed too but dropped at .end
+ ; columns 1-8
+ MEDIAN_ABS_ACC m0, m3, m5, m2, m6, m7, m8, m9 ; top m3, left m5, topleft m2, predicted m6
+ ; the current row becomes the previous row
+ mova m2, m5
+ mova m3, m6
+ add pix1q, strideq ; step both pointers to the next row
+ add pix2q, strideq
+ sub hd, 1
+ jg .loop ; rows remain
+.end:
+ ; column 8 lies outside of the block and column 0 only contributes
+ ; its first word
+ pslldq m0, 2 ; shift the top word (column 8) out of the register ...
+ psrldq m0, 2 ; ... and move the rest back: word 7 is now zero
+ pslldq m1, 14 ; keep only word 0 (column 0) ...
+ psrldq m1, 14 ; ... restored to its original position, other words zero
+ paddw m0, m1 ; m0 = all per-word partial sums
+ punpckhwd m7, m0, m4 ; words 4-7 zero-extended to dwords
+ punpcklwd m0, m4 ; words 0-3 zero-extended to dwords
+ paddd m0, m7
+ HADDD m0, m7 ; horizontal dword sum (macro defined outside this patch), m7 as scratch
+ movd eax, m0 ; return the total in eax
+ RET
+%endmacro
+
+INIT_XMM sse2
+MEDIAN_SAD8 ; instantiated as ff_median_sad8_sse2
+INIT_XMM ssse3
+MEDIAN_SAD8 ; instantiated as ff_median_sad8_ssse3
+
%endif ; ARCH_X86_64
diff --git a/libavcodec/x86/me_cmp_init.c b/libavcodec/x86/me_cmp_init.c
index d7d30c3235..2320e09bad 100644
--- a/libavcodec/x86/me_cmp_init.c
+++ b/libavcodec/x86/me_cmp_init.c
@@ -73,8 +73,12 @@ int ff_vsad16u_approx_sse2(MPVEncContext *v, const uint8_t
*pix1, const uint8_t
ptrdiff_t stride, int h);
int ff_median_sad16_sse2(MPVEncContext *v, const uint8_t *pix1, const uint8_t
*pix2,
ptrdiff_t stride, int h);
+int ff_median_sad8_sse2(MPVEncContext *v, const uint8_t *pix1, const uint8_t
*pix2,
+ ptrdiff_t stride, int h);
int ff_median_sad16_ssse3(MPVEncContext *v, const uint8_t *pix1, const uint8_t
*pix2,
ptrdiff_t stride, int h);
+int ff_median_sad8_ssse3(MPVEncContext *v, const uint8_t *pix1, const uint8_t
*pix2,
+ ptrdiff_t stride, int h);
#define hadamard_func(cpu)
\
int ff_hadamard8_diff_ ## cpu(MPVEncContext *s, const uint8_t *src1,
\
@@ -168,6 +172,7 @@ av_cold void ff_me_cmp_init_x86(MECmpContext *c,
AVCodecContext *avctx)
#if ARCH_X86_64
c->median_sad[0] = ff_median_sad16_sse2;
+ c->median_sad[1] = ff_median_sad8_sse2;
#endif
}
@@ -181,6 +186,7 @@ av_cold void ff_me_cmp_init_x86(MECmpContext *c,
AVCodecContext *avctx)
#if ARCH_X86_64
c->median_sad[0] = ff_median_sad16_ssse3;
+ c->median_sad[1] = ff_median_sad8_ssse3;
#endif
}
}
--
2.52.0
_______________________________________________
ffmpeg-devel mailing list -- [email protected]
To unsubscribe send an email to [email protected]