This is an automated email from the git hooks/post-receive script.

Git pushed a commit to branch master
in repository ffmpeg.

commit c0e1c1d6b3245a5bf46b5cb5c22cd16a9138a21b
Author:     Andreas Rheinhardt <[email protected]>
AuthorDate: Fri Nov 7 01:01:44 2025 +0100
Commit:     Andreas Rheinhardt <[email protected]>
CommitDate: Thu Apr 30 10:39:33 2026 +0200

    avcodec/x86/qpeldsp: Add SSSE3 size 16 horizontal filter
    
    Beats the mmxext version by a lot (in the following,
    [0][1-3] refers to horizontal-only size 16 mc;
    the _sse2 comparators for the other cases use mmxext
    horizontal mc coupled with vertical SSE2 mc):
    
    avg_qpel_pixels_tab[0][1]_c:                           945.5 ( 1.00x)
    avg_qpel_pixels_tab[0][1]_mmxext:                      262.6 ( 3.60x)
    avg_qpel_pixels_tab[0][1]_ssse3:                       110.4 ( 8.57x)
    avg_qpel_pixels_tab[0][2]_c:                          1042.1 ( 1.00x)
    avg_qpel_pixels_tab[0][2]_mmxext:                      245.1 ( 4.25x)
    avg_qpel_pixels_tab[0][2]_ssse3:                        91.7 (11.37x)
    avg_qpel_pixels_tab[0][3]_c:                           941.8 ( 1.00x)
    avg_qpel_pixels_tab[0][3]_mmxext:                      260.1 ( 3.62x)
    avg_qpel_pixels_tab[0][3]_ssse3:                       110.1 ( 8.56x)
    avg_qpel_pixels_tab[0][5]_c:                          1939.5 ( 1.00x)
    avg_qpel_pixels_tab[0][5]_sse2:                        394.3 ( 4.92x)
    avg_qpel_pixels_tab[0][5]_ssse3:                       247.4 ( 7.84x)
    avg_qpel_pixels_tab[0][6]_c:                          1785.8 ( 1.00x)
    avg_qpel_pixels_tab[0][6]_sse2:                        380.6 ( 4.69x)
    avg_qpel_pixels_tab[0][6]_ssse3:                       221.1 ( 8.08x)
    avg_qpel_pixels_tab[0][7]_c:                          1932.5 ( 1.00x)
    avg_qpel_pixels_tab[0][7]_sse2:                        393.4 ( 4.91x)
    avg_qpel_pixels_tab[0][7]_ssse3:                       238.8 ( 8.09x)
    avg_qpel_pixels_tab[0][9]_c:                          1976.9 ( 1.00x)
    avg_qpel_pixels_tab[0][9]_sse2:                        380.8 ( 5.19x)
    avg_qpel_pixels_tab[0][9]_ssse3:                       223.3 ( 8.85x)
    avg_qpel_pixels_tab[0][10]_c:                         1911.9 ( 1.00x)
    avg_qpel_pixels_tab[0][10]_sse2:                       366.9 ( 5.21x)
    avg_qpel_pixels_tab[0][10]_ssse3:                      207.0 ( 9.24x)
    avg_qpel_pixels_tab[0][11]_c:                         2046.9 ( 1.00x)
    avg_qpel_pixels_tab[0][11]_sse2:                       385.5 ( 5.31x)
    avg_qpel_pixels_tab[0][11]_ssse3:                      227.9 ( 8.98x)
    avg_qpel_pixels_tab[0][13]_c:                         1940.8 ( 1.00x)
    avg_qpel_pixels_tab[0][13]_sse2:                       389.7 ( 4.98x)
    avg_qpel_pixels_tab[0][13]_ssse3:                      244.2 ( 7.95x)
    avg_qpel_pixels_tab[0][14]_c:                         1778.4 ( 1.00x)
    avg_qpel_pixels_tab[0][14]_sse2:                       379.2 ( 4.69x)
    avg_qpel_pixels_tab[0][14]_ssse3:                      223.5 ( 7.96x)
    avg_qpel_pixels_tab[0][15]_c:                         1905.9 ( 1.00x)
    avg_qpel_pixels_tab[0][15]_sse2:                       398.9 ( 4.78x)
    avg_qpel_pixels_tab[0][15]_ssse3:                      238.3 ( 8.00x)
    put_no_rnd_qpel_pixels_tab[0][1]_c:                    922.5 ( 1.00x)
    put_no_rnd_qpel_pixels_tab[0][1]_mmxext:               275.0 ( 3.35x)
    put_no_rnd_qpel_pixels_tab[0][1]_ssse3:                108.4 ( 8.51x)
    put_no_rnd_qpel_pixels_tab[0][2]_c:                    889.7 ( 1.00x)
    put_no_rnd_qpel_pixels_tab[0][2]_mmxext:               236.7 ( 3.76x)
    put_no_rnd_qpel_pixels_tab[0][2]_ssse3:                 86.8 (10.25x)
    put_no_rnd_qpel_pixels_tab[0][3]_c:                    915.5 ( 1.00x)
    put_no_rnd_qpel_pixels_tab[0][3]_mmxext:               274.3 ( 3.34x)
    put_no_rnd_qpel_pixels_tab[0][3]_ssse3:                108.2 ( 8.46x)
    put_no_rnd_qpel_pixels_tab[0][5]_sse2:                 400.0 ( 4.63x)
    put_no_rnd_qpel_pixels_tab[0][5]_ssse3:                246.0 ( 7.53x)
    put_no_rnd_qpel_pixels_tab[0][6]_c:                   1753.9 ( 1.00x)
    put_no_rnd_qpel_pixels_tab[0][6]_sse2:                 382.5 ( 4.59x)
    put_no_rnd_qpel_pixels_tab[0][6]_ssse3:                226.4 ( 7.75x)
    put_no_rnd_qpel_pixels_tab[0][7]_c:                   1854.6 ( 1.00x)
    put_no_rnd_qpel_pixels_tab[0][7]_sse2:                 393.5 ( 4.71x)
    put_no_rnd_qpel_pixels_tab[0][7]_ssse3:                248.6 ( 7.46x)
    put_no_rnd_qpel_pixels_tab[0][9]_c:                   1794.3 ( 1.00x)
    put_no_rnd_qpel_pixels_tab[0][9]_sse2:                 382.2 ( 4.70x)
    put_no_rnd_qpel_pixels_tab[0][9]_ssse3:                228.0 ( 7.87x)
    put_no_rnd_qpel_pixels_tab[0][10]_c:                  1724.7 ( 1.00x)
    put_no_rnd_qpel_pixels_tab[0][10]_sse2:                353.8 ( 4.88x)
    put_no_rnd_qpel_pixels_tab[0][10]_ssse3:               206.5 ( 8.35x)
    put_no_rnd_qpel_pixels_tab[0][11]_c:                  1796.3 ( 1.00x)
    put_no_rnd_qpel_pixels_tab[0][11]_sse2:                378.1 ( 4.75x)
    put_no_rnd_qpel_pixels_tab[0][11]_ssse3:               227.1 ( 7.91x)
    put_no_rnd_qpel_pixels_tab[0][13]_c:                  1834.4 ( 1.00x)
    put_no_rnd_qpel_pixels_tab[0][13]_sse2:                400.7 ( 4.58x)
    put_no_rnd_qpel_pixels_tab[0][13]_ssse3:               244.2 ( 7.51x)
    put_no_rnd_qpel_pixels_tab[0][14]_c:                  1755.7 ( 1.00x)
    put_no_rnd_qpel_pixels_tab[0][14]_sse2:                387.2 ( 4.53x)
    put_no_rnd_qpel_pixels_tab[0][14]_ssse3:               226.8 ( 7.74x)
    put_no_rnd_qpel_pixels_tab[0][15]_c:                  1847.3 ( 1.00x)
    put_no_rnd_qpel_pixels_tab[0][15]_sse2:                400.6 ( 4.61x)
    put_no_rnd_qpel_pixels_tab[0][15]_ssse3:               246.1 ( 7.51x)
    put_qpel_pixels_tab[0][1]_c:                           919.6 ( 1.00x)
    put_qpel_pixels_tab[0][1]_mmxext:                      255.5 ( 3.60x)
    put_qpel_pixels_tab[0][1]_ssse3:                       108.3 ( 8.49x)
    put_qpel_pixels_tab[0][2]_c:                           883.9 ( 1.00x)
    put_qpel_pixels_tab[0][2]_mmxext:                      238.1 ( 3.71x)
    put_qpel_pixels_tab[0][2]_ssse3:                        86.7 (10.19x)
    put_qpel_pixels_tab[0][3]_c:                           921.9 ( 1.00x)
    put_qpel_pixels_tab[0][3]_mmxext:                      258.9 ( 3.56x)
    put_qpel_pixels_tab[0][3]_ssse3:                       108.1 ( 8.53x)
    put_qpel_pixels_tab[0][5]_c:                          1907.5 ( 1.00x)
    put_qpel_pixels_tab[0][5]_sse2:                        384.2 ( 4.96x)
    put_qpel_pixels_tab[0][5]_ssse3:                       234.8 ( 8.13x)
    put_qpel_pixels_tab[0][6]_c:                          1757.4 ( 1.00x)
    put_qpel_pixels_tab[0][6]_sse2:                        382.8 ( 4.59x)
    put_qpel_pixels_tab[0][6]_ssse3:                       217.6 ( 8.08x)
    put_qpel_pixels_tab[0][7]_c:                          1927.5 ( 1.00x)
    put_qpel_pixels_tab[0][7]_sse2:                        384.6 ( 5.01x)
    put_qpel_pixels_tab[0][7]_ssse3:                       231.2 ( 8.34x)
    put_qpel_pixels_tab[0][9]_c:                          1832.1 ( 1.00x)
    put_qpel_pixels_tab[0][9]_sse2:                        374.8 ( 4.89x)
    put_qpel_pixels_tab[0][9]_ssse3:                       219.4 ( 8.35x)
    put_qpel_pixels_tab[0][10]_c:                         1710.3 ( 1.00x)
    put_qpel_pixels_tab[0][10]_sse2:                       384.5 ( 4.45x)
    put_qpel_pixels_tab[0][10]_ssse3:                      202.9 ( 8.43x)
    put_qpel_pixels_tab[0][11]_c:                         1825.0 ( 1.00x)
    put_qpel_pixels_tab[0][11]_sse2:                       369.6 ( 4.94x)
    put_qpel_pixels_tab[0][11]_ssse3:                      216.8 ( 8.42x)
    put_qpel_pixels_tab[0][13]_c:                         1898.4 ( 1.00x)
    put_qpel_pixels_tab[0][13]_sse2:                       384.9 ( 4.93x)
    put_qpel_pixels_tab[0][13]_ssse3:                      238.6 ( 7.96x)
    put_qpel_pixels_tab[0][14]_c:                         1779.1 ( 1.00x)
    put_qpel_pixels_tab[0][14]_sse2:                       373.3 ( 4.77x)
    put_qpel_pixels_tab[0][14]_ssse3:                      218.1 ( 8.16x)
    put_qpel_pixels_tab[0][15]_c:                         1918.2 ( 1.00x)
    put_qpel_pixels_tab[0][15]_sse2:                       385.3 ( 4.98x)
    put_qpel_pixels_tab[0][15]_ssse3:                      236.8 ( 8.10x)
    
    Signed-off-by: Andreas Rheinhardt <[email protected]>
---
 libavcodec/x86/qpeldsp.asm    | 62 +++++++++++++++++++++++++++++++++++++++----
 libavcodec/x86/qpeldsp_init.c |  6 +++++
 2 files changed, 63 insertions(+), 5 deletions(-)

diff --git a/libavcodec/x86/qpeldsp.asm b/libavcodec/x86/qpeldsp.asm
index 6c2c96cee9..43e55aeb25 100644
--- a/libavcodec/x86/qpeldsp.asm
+++ b/libavcodec/x86/qpeldsp.asm
@@ -30,6 +30,11 @@ pw_15: times 8 dw 15
 cextern pw_16
 pw_20: times 8 dw 20
 
+shuffle_mask16_0: db 2, 1, 1, 0, 0, 0, 1, 0, 1,  2,  2,  3,  4,  3,  5,  4
+shuffle_mask16_1: db 5, 6, 6, 7, 8, 7, 9, 8, 9, 10, 10, 11, 12, 11, 13, 12
+shuffle_mask16_2: db 0, 1, 1, 2, 3, 2, 3, 3,  3, 2,  2,  1, -1, -1, -1, -1
+coeff16_0: times 2 db -1,  3, -1,  3,  3, -1,  3, -1
+coeff16_1: times 2 db 20, -6, 20, -6, -6, 20, -6, 20
 
 SECTION .text
 
@@ -151,10 +156,50 @@ cglobal put_no_rnd_pixels16x16_l2, 5,6
 
 
 %macro MPEG4_QPEL16_H_LOWPASS 1
-cglobal %1_mpeg4_qpel16_h_lowpass, 5, 5, 0, 16
+cglobal %1_mpeg4_qpel16_h_lowpass, 5, 5, 8, 16*notcpuflag(sse2), dst, src, dstride, srcstride, h
+%if notcpuflag(ssse3)
     pxor         m7, m7
+%else
+    mova         m7, [coeff16_0]
+%endif
 .loop:
-    mova         m0, [r1]
+    movu         m0, [srcq]
+%if cpuflag(ssse3)
+    pshufb       m1, m0, [shuffle_mask16_0]
+    pmaddubsw    m2, m1, m7
+    pshufb       m0, [shuffle_mask16_1]
+    pmaddubsw    m4, m0, m7
+    palignr      m3, m0, m1, 4
+    pmaddubsw    m3, [coeff16_1]
+    palignr      m5, m0, m1, 8
+    movd         m6, [srcq+13]
+    pmaddubsw    m5, [coeff16_1]
+    paddw        m2, m3
+    palignr      m3, m0, m1, 12
+    pshufb       m6, [shuffle_mask16_2]
+    pmaddubsw    m3, m7
+    paddw        m4, [PW_ROUND]
+    palignr      m1, m6, m0, 4
+    pmaddubsw    m1, [coeff16_1]
+    add        srcq, srcstrideq
+    paddw        m2, m5
+    palignr      m5, m6, m0, 8
+    pmaddubsw    m5, [coeff16_1]
+    palignr      m6, m0, 12
+    pmaddubsw    m6, m7
+    paddw        m2, [PW_ROUND]
+    paddw        m4, m1
+    paddw        m2, m3
+    paddw        m4, m5
+    psraw        m2, 5
+    paddw        m4, m6
+    psraw        m4, 5
+    packuswb     m2, m4
+%ifidn %1, avg
+    pavgb        m2, [dstq]
+%endif
+    mova     [dstq], m2
+%else
     mova         m1, m0
     mova         m2, m0
     punpcklbw    m0, m7
@@ -252,8 +297,9 @@ cglobal %1_mpeg4_qpel16_h_lowpass, 5, 5, 0, 16
     packuswb     m0, m4
     OP_MOV   [r0+8], m0, m4
     add          r1, r3
-    add          r0, r2
-    dec r4d
+%endif
+    add        dstq, dstrideq
+    dec          hd
     jne .loop
     RET
 %endmacro
@@ -279,7 +325,13 @@ MPEG4_QPEL16_H_LOWPASS avg
 %define OP_MOV PUT_OP
 MPEG4_QPEL16_H_LOWPASS put_no_rnd
 
-
+INIT_XMM ssse3
+%define PW_ROUND pw_16
+MPEG4_QPEL16_H_LOWPASS put
+%define PW_ROUND pw_16
+MPEG4_QPEL16_H_LOWPASS avg
+%define PW_ROUND pw_15
+MPEG4_QPEL16_H_LOWPASS put_no_rnd
 
 %macro MPEG4_QPEL8_H_LOWPASS 1
 cglobal %1_mpeg4_qpel8_h_lowpass, 5, 5, 0
diff --git a/libavcodec/x86/qpeldsp_init.c b/libavcodec/x86/qpeldsp_init.c
index 877bb9bdce..363df19acf 100644
--- a/libavcodec/x86/qpeldsp_init.c
+++ b/libavcodec/x86/qpeldsp_init.c
@@ -269,11 +269,13 @@ MACRO(put_no_rnd, no_rnd_, SIZE, SIZEP1, HXMM, VXMM, HVXMM, L2)
 QPEL3(QPEL_H,   8,  9, mmxext, mmxext, mmxext, mmxext)
 
 QPEL3(QPEL_H,  16, 17, mmxext, mmxext, mmxext, mmxext)
+QPEL3(QPEL_H,  16, 17, ssse3, sse2, ssse3, sse2)
 
 QPEL3(QPEL_V,   8,  9, ssse3, sse2, ssse3, mmxext)
 QPEL3(QPEL_HV,  8,  9, mmxext, sse2, sse2, mmxext)
 QPEL3(QPEL_V,  16, 17, ssse3, sse2, ssse3, sse2)
 QPEL3(QPEL_HV, 16, 17, mmxext, sse2, sse2, sse2)
+QPEL3(QPEL_HV, 16, 17, ssse3, sse2, ssse3, sse2)
 
 #define SET_QPEL_FUNC(OP, X, Y, SIZE, CPU, PREFIX) \
     c->OP ## _qpel_pixels_tab[SIZE == 8][X+4*Y] = PREFIX ## OP ## _qpel ## SIZE ## _mc ## X ## Y ## _ ## CPU
@@ -329,4 +331,8 @@ av_cold void ff_qpeldsp_init_x86(QpelDSPContext *c)
         SET_HV_QPEL_FUNCS(8,  sse2,);
     }
 #endif
+    if (EXTERNAL_SSSE3(cpu_flags)) {
+        SET_H_QPEL_FUNCS(16, ssse3,);
+        SET_HV_QPEL_FUNCS(16, ssse3,);
+    }
 }

_______________________________________________
ffmpeg-cvslog mailing list -- [email protected]
To unsubscribe send an email to [email protected]

Reply via email to