me_cmp: add SSSE3 median_sad16

marcos ashton via ffmpeg-cvslog Thu, 18 Jun 2026 15:08:46 -0700

This is an automated email from the git hooks/post-receive script.

Git pushed a commit to branch master
in repository ffmpeg.


commit b8a44a7f76eaa1ec68034fb91745b36ea4c6df3f
Author:     marcos ashton <[email protected]>
AuthorDate: Thu Jun 4 22:02:50 2026 +0100
Commit:     michaelni <[email protected]>
CommitDate: Thu Jun 18 22:08:02 2026 +0000

    avcodec/x86/me_cmp: add SSSE3 median_sad16
    
    The median_sad functions have NEON implementations but no x86 ones,
    so x86 always used the C code. x86_64 only due to register pressure.
    
    median_sad_0_c:     314.5 ( 1.00x)
    median_sad_0_ssse3:  51.1 ( 6.16x)
    
    Benchmarks and tests run with checkasm on an Intel Core Ultra 7 155H.
    
    Signed-off-by: marcos ashton <[email protected]>
---
 libavcodec/x86/me_cmp.asm    | 119 +++++++++++++++++++++++++++++++++++++++++++
 libavcodec/x86/me_cmp_init.c |   7 +++
 2 files changed, 126 insertions(+)

diff --git a/libavcodec/x86/me_cmp.asm b/libavcodec/x86/me_cmp.asm
index 314b091fc8..5c403b2715 100644
--- a/libavcodec/x86/me_cmp.asm
+++ b/libavcodec/x86/me_cmp.asm
@@ -809,3 +809,122 @@ VSAD_APPROX 8,  a
 INIT_XMM sse2
 VSAD_APPROX 16, a
 VSAD_APPROX 16, u
+
+;---------------------------------------------------------------------
+;int ff_median_sad_<opt>(MPVEncContext *v, const uint8_t *pix1, const uint8_t *pix2,
+;                        ptrdiff_t stride, int h);
+;---------------------------------------------------------------------
+%if ARCH_X86_64
+
+; Load one row of 16 pixels from pix1/pix2 and compute V = pix1 - pix2 as
+; int16 words.  No zero register is needed: both byte vectors are unpacked
+; against the same scratch register %5, so its garbage high bytes cancel in
+; the subtraction.  The shifted columns are derived from the unshifted word
+; vectors, so no out-of-bounds loads are made.
+; %1: V columns 0-7,  %2: V columns 8-15
+; %3: V columns 1-8,  %4: V columns 9-16 (column 16 is zero)
+; %5: scratch register, its contents are irrelevant
+%macro LOAD_V16 5
+    movu      %1, [pix1q]
+    movu      %3, [pix2q]
+    punpckhbw %2, %1, %5
+    punpcklbw %1, %5
+    punpckhbw %4, %3, %5
+    punpcklbw %3, %5
+    psubw     %1, %3            ; V columns 0-7
+    psubw     %2, %4            ; V columns 8-15
+    palignr   %3, %2, %1, 2     ; V columns 1-8
+    psrldq    %4, %2, 2         ; V columns 9-16
+%endmacro
+
+; Accumulate abs(%5 - mid_pred(%2, %3, %2 + %3 - %4)) into %1, using
+; mid_pred(a, b, c) == max(min(a, b), min(max(a, b), c)).  The top predictor
+; %2 is not needed afterwards and is clobbered.
+; %1: accumulator, %2: top, %3: left, %4: topleft, %5: values being predicted
+; %6, %7: temporaries
+%macro MEDIAN_ABS_ACC 7
+    paddw     %6, %2, %3        ; top + left
+    psubw     %6, %4            ; top + left - topleft
+    pminsw    %7, %2, %3        ; min(top, left)
+    pmaxsw    %2, %3            ; max(top, left)
+    pminsw    %2, %6
+    pmaxsw    %7, %2            ; mid_pred(top, left, top + left - topleft)
+    psubw     %6, %5, %7
+    pabsw     %6, %6
+    paddw     %1, %6
+%endmacro
+
+; Accumulate one row's cost from the previous and current row vectors.
+; %1-%4: previous row V (columns 0-7, 8-15, 1-8, 9-16)
+; %5-%8: current  row V (columns 0-7, 8-15, 1-8, 9-16), loaded here
+; m0-m2 are the accumulators, m11/m12 temporaries, m14 scratch.  The top
+; predictors %3/%4 are consumed by MEDIAN_ABS_ACC, but they belong to the
+; previous row and are reloaded before being needed again.
+%macro PROCESS_ROW16 8
+    LOAD_V16  %5, %6, %7, %8, m14
+    add       pix1q, strideq
+    add       pix2q, strideq
+    ; column 0: abs(V(0) - V(-stride))
+    psubw     m11, %5, %1
+    pabsw     m11, m11
+    paddw     m2, m11
+    ; columns 1-8 and 9-16
+    MEDIAN_ABS_ACC m0, %3, %5, %1, %7, m11, m12
+    MEDIAN_ABS_ACC m1, %4, %6, %2, %8, m11, m12
+%endmacro
+
+; Register layout:
+;   m0  accumulator for columns 1-8
+;   m1  accumulator for columns 9-16 (the last word is discarded at the end)
+;   m2  accumulator for column 0 (only the first word is used)
+;   m3-m6   one row's V (columns 0-7, 8-15, 1-8, 9-16)
+;   m7-m10  the other row's V (columns 0-7, 8-15, 1-8, 9-16)
+;   m11, m12 temporaries
+;   m14 scratch register for LOAD_V16
+; The loop is unrolled by two so the two register sets alternate the roles of
+; previous and current row, which removes the per-row register copies.
+%macro MEDIAN_SAD16 0
+cglobal median_sad16, 5, 5, 15, v, pix1, pix2, stride, h
+    LOAD_V16  m3, m4, m5, m6, m14
+    add       pix1q, strideq
+    add       pix2q, strideq
+
+    ; first row: abs(V(0)) + sum of abs(V(j) - V(j-1))
+    pabsw     m2, m3
+    psubw     m0, m5, m3
+    pabsw     m0, m0
+    psubw     m1, m6, m4
+    pabsw     m1, m1
+
+    sub       hd, 1
+    jle       .end
+.loop:
+    PROCESS_ROW16 m3, m4, m5, m6, m7, m8, m9, m10
+    sub       hd, 1
+    jle       .end
+    PROCESS_ROW16 m7, m8, m9, m10, m3, m4, m5, m6
+    sub       hd, 1
+    jg        .loop
+.end:
+    ; column 16 lies outside of the block and column 0 only contributes its
+    ; first word; the kept columns may end up in any lane since the final sum
+    ; is horizontal anyway
+    pslldq    m1, 2
+    pslldq    m2, 14
+    paddw     m0, m1
+    paddw     m0, m2
+    ; the per-word sums are at most 16 * 510, but their total needs more than
+    ; 16 bits: widen to dwords before the horizontal sum
+    pxor      m1, m1
+    punpckhwd m12, m0, m1
+    punpcklwd m0, m1
+    paddd     m0, m12
+    HADDD     m0, m12
+    movd      eax, m0
+    RET
+%endmacro
+
+INIT_XMM ssse3
+MEDIAN_SAD16
+
+%endif ; ARCH_X86_64
diff --git a/libavcodec/x86/me_cmp_init.c b/libavcodec/x86/me_cmp_init.c
index dbb4ef96bb..3b3ad6aa33 100644
--- a/libavcodec/x86/me_cmp_init.c
+++ b/libavcodec/x86/me_cmp_init.c
@@ -22,6 +22,7 @@
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
+#include "config.h"
 #include "libavutil/attributes.h"
 #include "libavutil/cpu.h"
 #include "libavutil/x86/cpu.h"
@@ -70,6 +71,8 @@ int ff_vsad16_approx_sse2(MPVEncContext *v, const uint8_t *pix1, const uint8_t *
                    ptrdiff_t stride, int h);
 int ff_vsad16u_approx_sse2(MPVEncContext *v, const uint8_t *pix1, const uint8_t *pix2,
                            ptrdiff_t stride, int h);
+int ff_median_sad16_ssse3(MPVEncContext *v, const uint8_t *pix1, const uint8_t *pix2,
+                          ptrdiff_t stride, int h);
 
 #define hadamard_func(cpu)                                                       \
     int ff_hadamard8_diff_ ## cpu(MPVEncContext *s, const uint8_t *src1,         \
@@ -169,5 +172,9 @@ av_cold void ff_me_cmp_init_x86(MECmpContext *c, AVCodecContext *avctx)
         c->sum_abs_dctelem   = ff_sum_abs_dctelem_ssse3;
         c->hadamard8_diff[0] = ff_hadamard8_diff16_ssse3;
         c->hadamard8_diff[1] = ff_hadamard8_diff_ssse3;
+
+#if ARCH_X86_64
+        c->median_sad[0] = ff_median_sad16_ssse3;
+#endif
     }
 }

_______________________________________________
ffmpeg-cvslog mailing list -- [email protected]
To unsubscribe send an email to [email protected]

[FFmpeg-cvslog] [ffmpeg] 01/02: avcodec/x86/me_cmp: add SSSE3 median_sad16

Reply via email to