This is an automated email from the git hooks/post-receive script. Git pushed a commit to branch master in repository ffmpeg.
commit c0e1c1d6b3245a5bf46b5cb5c22cd16a9138a21b Author: Andreas Rheinhardt <[email protected]> AuthorDate: Fri Nov 7 01:01:44 2025 +0100 Commit: Andreas Rheinhardt <[email protected]> CommitDate: Thu Apr 30 10:39:33 2026 +0200 avcodec/x86/qpeldsp: Add SSSE3 size 16 horizontal filter Beats the mmxext version by a lot (in the following, [0][1-3] refers to horizontal-only size 16 mc; the _sse2 comparators for the other cases use mmxext horizontal mc coupled with vertical SSE2 mc): avg_qpel_pixels_tab[0][1]_c: 945.5 ( 1.00x) avg_qpel_pixels_tab[0][1]_mmxext: 262.6 ( 3.60x) avg_qpel_pixels_tab[0][1]_ssse3: 110.4 ( 8.57x) avg_qpel_pixels_tab[0][2]_c: 1042.1 ( 1.00x) avg_qpel_pixels_tab[0][2]_mmxext: 245.1 ( 4.25x) avg_qpel_pixels_tab[0][2]_ssse3: 91.7 (11.37x) avg_qpel_pixels_tab[0][3]_c: 941.8 ( 1.00x) avg_qpel_pixels_tab[0][3]_mmxext: 260.1 ( 3.62x) avg_qpel_pixels_tab[0][3]_ssse3: 110.1 ( 8.56x) avg_qpel_pixels_tab[0][5]_c: 1939.5 ( 1.00x) avg_qpel_pixels_tab[0][5]_sse2: 394.3 ( 4.92x) avg_qpel_pixels_tab[0][5]_ssse3: 247.4 ( 7.84x) avg_qpel_pixels_tab[0][6]_c: 1785.8 ( 1.00x) avg_qpel_pixels_tab[0][6]_sse2: 380.6 ( 4.69x) avg_qpel_pixels_tab[0][6]_ssse3: 221.1 ( 8.08x) avg_qpel_pixels_tab[0][7]_c: 1932.5 ( 1.00x) avg_qpel_pixels_tab[0][7]_sse2: 393.4 ( 4.91x) avg_qpel_pixels_tab[0][7]_ssse3: 238.8 ( 8.09x) avg_qpel_pixels_tab[0][9]_c: 1976.9 ( 1.00x) avg_qpel_pixels_tab[0][9]_sse2: 380.8 ( 5.19x) avg_qpel_pixels_tab[0][9]_ssse3: 223.3 ( 8.85x) avg_qpel_pixels_tab[0][10]_c: 1911.9 ( 1.00x) avg_qpel_pixels_tab[0][10]_sse2: 366.9 ( 5.21x) avg_qpel_pixels_tab[0][10]_ssse3: 207.0 ( 9.24x) avg_qpel_pixels_tab[0][11]_c: 2046.9 ( 1.00x) avg_qpel_pixels_tab[0][11]_sse2: 385.5 ( 5.31x) avg_qpel_pixels_tab[0][11]_ssse3: 227.9 ( 8.98x) avg_qpel_pixels_tab[0][13]_c: 1940.8 ( 1.00x) avg_qpel_pixels_tab[0][13]_sse2: 389.7 ( 4.98x) avg_qpel_pixels_tab[0][13]_ssse3: 244.2 ( 7.95x) avg_qpel_pixels_tab[0][14]_c: 1778.4 ( 1.00x) avg_qpel_pixels_tab[0][14]_sse2: 379.2 ( 4.69x) avg_qpel_pixels_tab[0][14]_ssse3: 223.5 ( 7.96x) avg_qpel_pixels_tab[0][15]_c: 1905.9 ( 1.00x) avg_qpel_pixels_tab[0][15]_sse2: 398.9 ( 4.78x) avg_qpel_pixels_tab[0][15]_ssse3: 238.3 ( 8.00x) put_no_rnd_qpel_pixels_tab[0][1]_c: 922.5 ( 1.00x) put_no_rnd_qpel_pixels_tab[0][1]_mmxext: 275.0 ( 3.35x) put_no_rnd_qpel_pixels_tab[0][1]_ssse3: 108.4 ( 8.51x) put_no_rnd_qpel_pixels_tab[0][2]_c: 889.7 ( 1.00x) put_no_rnd_qpel_pixels_tab[0][2]_mmxext: 236.7 ( 3.76x) put_no_rnd_qpel_pixels_tab[0][2]_ssse3: 86.8 (10.25x) put_no_rnd_qpel_pixels_tab[0][3]_c: 915.5 ( 1.00x) put_no_rnd_qpel_pixels_tab[0][3]_mmxext: 274.3 ( 3.34x) put_no_rnd_qpel_pixels_tab[0][3]_ssse3: 108.2 ( 8.46x) put_no_rnd_qpel_pixels_tab[0][5]_sse2: 400.0 ( 4.63x) put_no_rnd_qpel_pixels_tab[0][5]_ssse3: 246.0 ( 7.53x) put_no_rnd_qpel_pixels_tab[0][6]_c: 1753.9 ( 1.00x) put_no_rnd_qpel_pixels_tab[0][6]_sse2: 382.5 ( 4.59x) put_no_rnd_qpel_pixels_tab[0][6]_ssse3: 226.4 ( 7.75x) put_no_rnd_qpel_pixels_tab[0][7]_c: 1854.6 ( 1.00x) put_no_rnd_qpel_pixels_tab[0][7]_sse2: 393.5 ( 4.71x) put_no_rnd_qpel_pixels_tab[0][7]_ssse3: 248.6 ( 7.46x) put_no_rnd_qpel_pixels_tab[0][9]_c: 1794.3 ( 1.00x) put_no_rnd_qpel_pixels_tab[0][9]_sse2: 382.2 ( 4.70x) put_no_rnd_qpel_pixels_tab[0][9]_ssse3: 228.0 ( 7.87x) put_no_rnd_qpel_pixels_tab[0][10]_c: 1724.7 ( 1.00x) put_no_rnd_qpel_pixels_tab[0][10]_sse2: 353.8 ( 4.88x) put_no_rnd_qpel_pixels_tab[0][10]_ssse3: 206.5 ( 8.35x) put_no_rnd_qpel_pixels_tab[0][11]_c: 1796.3 ( 1.00x) put_no_rnd_qpel_pixels_tab[0][11]_sse2: 378.1 ( 4.75x) put_no_rnd_qpel_pixels_tab[0][11]_ssse3: 227.1 ( 7.91x) put_no_rnd_qpel_pixels_tab[0][13]_c: 1834.4 ( 1.00x) put_no_rnd_qpel_pixels_tab[0][13]_sse2: 400.7 ( 4.58x) put_no_rnd_qpel_pixels_tab[0][13]_ssse3: 244.2 ( 7.51x) put_no_rnd_qpel_pixels_tab[0][14]_c: 1755.7 ( 1.00x) put_no_rnd_qpel_pixels_tab[0][14]_sse2: 387.2 ( 4.53x) put_no_rnd_qpel_pixels_tab[0][14]_ssse3: 226.8 ( 7.74x) put_no_rnd_qpel_pixels_tab[0][15]_c: 1847.3 ( 1.00x) put_no_rnd_qpel_pixels_tab[0][15]_sse2: 400.6 ( 4.61x) put_no_rnd_qpel_pixels_tab[0][15]_ssse3: 246.1 ( 7.51x) put_qpel_pixels_tab[0][1]_c: 919.6 ( 1.00x) put_qpel_pixels_tab[0][1]_mmxext: 255.5 ( 3.60x) put_qpel_pixels_tab[0][1]_ssse3: 108.3 ( 8.49x) put_qpel_pixels_tab[0][2]_c: 883.9 ( 1.00x) put_qpel_pixels_tab[0][2]_mmxext: 238.1 ( 3.71x) put_qpel_pixels_tab[0][2]_ssse3: 86.7 (10.19x) put_qpel_pixels_tab[0][3]_c: 921.9 ( 1.00x) put_qpel_pixels_tab[0][3]_mmxext: 258.9 ( 3.56x) put_qpel_pixels_tab[0][3]_ssse3: 108.1 ( 8.53x) put_qpel_pixels_tab[0][5]_c: 1907.5 ( 1.00x) put_qpel_pixels_tab[0][5]_sse2: 384.2 ( 4.96x) put_qpel_pixels_tab[0][5]_ssse3: 234.8 ( 8.13x) put_qpel_pixels_tab[0][6]_c: 1757.4 ( 1.00x) put_qpel_pixels_tab[0][6]_sse2: 382.8 ( 4.59x) put_qpel_pixels_tab[0][6]_ssse3: 217.6 ( 8.08x) put_qpel_pixels_tab[0][7]_c: 1927.5 ( 1.00x) put_qpel_pixels_tab[0][7]_sse2: 384.6 ( 5.01x) put_qpel_pixels_tab[0][7]_ssse3: 231.2 ( 8.34x) put_qpel_pixels_tab[0][9]_c: 1832.1 ( 1.00x) put_qpel_pixels_tab[0][9]_sse2: 374.8 ( 4.89x) put_qpel_pixels_tab[0][9]_ssse3: 219.4 ( 8.35x) put_qpel_pixels_tab[0][10]_c: 1710.3 ( 1.00x) put_qpel_pixels_tab[0][10]_sse2: 384.5 ( 4.45x) put_qpel_pixels_tab[0][10]_ssse3: 202.9 ( 8.43x) put_qpel_pixels_tab[0][11]_c: 1825.0 ( 1.00x) put_qpel_pixels_tab[0][11]_sse2: 369.6 ( 4.94x) put_qpel_pixels_tab[0][11]_ssse3: 216.8 ( 8.42x) put_qpel_pixels_tab[0][13]_c: 1898.4 ( 1.00x) put_qpel_pixels_tab[0][13]_sse2: 384.9 ( 4.93x) put_qpel_pixels_tab[0][13]_ssse3: 238.6 ( 7.96x) put_qpel_pixels_tab[0][14]_c: 1779.1 ( 1.00x) put_qpel_pixels_tab[0][14]_sse2: 373.3 ( 4.77x) put_qpel_pixels_tab[0][14]_ssse3: 218.1 ( 8.16x) put_qpel_pixels_tab[0][15]_c: 1918.2 ( 1.00x) put_qpel_pixels_tab[0][15]_sse2: 385.3 ( 4.98x) put_qpel_pixels_tab[0][15]_ssse3: 236.8 ( 8.10x) Signed-off-by: Andreas Rheinhardt <[email protected]> --- libavcodec/x86/qpeldsp.asm | 62 +++++++++++++++++++++++++++++++++++++++---- libavcodec/x86/qpeldsp_init.c | 6 +++++ 2 files changed, 63 insertions(+), 5 deletions(-) diff --git a/libavcodec/x86/qpeldsp.asm b/libavcodec/x86/qpeldsp.asm index 6c2c96cee9..43e55aeb25 100644 --- a/libavcodec/x86/qpeldsp.asm +++ b/libavcodec/x86/qpeldsp.asm @@ -30,6 +30,11 @@ pw_15: times 8 dw 15 cextern pw_16 pw_20: times 8 dw 20 +shuffle_mask16_0: db 2, 1, 1, 0, 0, 0, 1, 0, 1, 2, 2, 3, 4, 3, 5, 4 +shuffle_mask16_1: db 5, 6, 6, 7, 8, 7, 9, 8, 9, 10, 10, 11, 12, 11, 13, 12 +shuffle_mask16_2: db 0, 1, 1, 2, 3, 2, 3, 3, 3, 2, 2, 1, -1, -1, -1, -1 +coeff16_0: times 2 db -1, 3, -1, 3, 3, -1, 3, -1 +coeff16_1: times 2 db 20, -6, 20, -6, -6, 20, -6, 20 SECTION .text @@ -151,10 +156,50 @@ cglobal put_no_rnd_pixels16x16_l2, 5,6 %macro MPEG4_QPEL16_H_LOWPASS 1 -cglobal %1_mpeg4_qpel16_h_lowpass, 5, 5, 0, 16 +cglobal %1_mpeg4_qpel16_h_lowpass, 5, 5, 8, 16*notcpuflag(sse2), dst, src, dstride, srcstride, h +%if notcpuflag(ssse3) pxor m7, m7 +%else + mova m7, [coeff16_0] +%endif .loop: - mova m0, [r1] + movu m0, [srcq] +%if cpuflag(ssse3) + pshufb m1, m0, [shuffle_mask16_0] + pmaddubsw m2, m1, m7 + pshufb m0, [shuffle_mask16_1] + pmaddubsw m4, m0, m7 + palignr m3, m0, m1, 4 + pmaddubsw m3, [coeff16_1] + palignr m5, m0, m1, 8 + movd m6, [srcq+13] + pmaddubsw m5, [coeff16_1] + paddw m2, m3 + palignr m3, m0, m1, 12 + pshufb m6, [shuffle_mask16_2] + pmaddubsw m3, m7 + paddw m4, [PW_ROUND] + palignr m1, m6, m0, 4 + pmaddubsw m1, [coeff16_1] + add srcq, srcstrideq + paddw m2, m5 + palignr m5, m6, m0, 8 + pmaddubsw m5, [coeff16_1] + palignr m6, m0, 12 + pmaddubsw m6, m7 + paddw m2, [PW_ROUND] + paddw m4, m1 + paddw m2, m3 + paddw m4, m5 + psraw m2, 5 + paddw m4, m6 + psraw m4, 5 + packuswb m2, m4 +%ifidn %1, avg + pavgb m2, [dstq] +%endif + mova [dstq], m2 +%else mova m1, m0 mova m2, m0 punpcklbw m0, m7 @@ -252,8 +297,9 @@ cglobal %1_mpeg4_qpel16_h_lowpass, 5, 5, 0, 16 packuswb m0, m4 OP_MOV [r0+8], m0, m4 add r1, r3 - add r0, r2 - dec r4d +%endif + add dstq, dstrideq + dec hd jne .loop RET %endmacro @@ -279,7 +325,13 @@ MPEG4_QPEL16_H_LOWPASS avg %define OP_MOV PUT_OP MPEG4_QPEL16_H_LOWPASS put_no_rnd - +INIT_XMM ssse3 +%define PW_ROUND pw_16 +MPEG4_QPEL16_H_LOWPASS put +%define PW_ROUND pw_16 +MPEG4_QPEL16_H_LOWPASS avg +%define PW_ROUND pw_15 +MPEG4_QPEL16_H_LOWPASS put_no_rnd %macro MPEG4_QPEL8_H_LOWPASS 1 cglobal %1_mpeg4_qpel8_h_lowpass, 5, 5, 0 diff --git a/libavcodec/x86/qpeldsp_init.c b/libavcodec/x86/qpeldsp_init.c index 877bb9bdce..363df19acf 100644 --- a/libavcodec/x86/qpeldsp_init.c +++ b/libavcodec/x86/qpeldsp_init.c @@ -269,11 +269,13 @@ MACRO(put_no_rnd, no_rnd_, SIZE, SIZEP1, HXMM, VXMM, HVXMM, L2) QPEL3(QPEL_H, 8, 9, mmxext, mmxext, mmxext, mmxext) QPEL3(QPEL_H, 16, 17, mmxext, mmxext, mmxext, mmxext) +QPEL3(QPEL_H, 16, 17, ssse3, sse2, ssse3, sse2) QPEL3(QPEL_V, 8, 9, ssse3, sse2, ssse3, mmxext) QPEL3(QPEL_HV, 8, 9, mmxext, sse2, sse2, mmxext) QPEL3(QPEL_V, 16, 17, ssse3, sse2, ssse3, sse2) QPEL3(QPEL_HV, 16, 17, mmxext, sse2, sse2, sse2) +QPEL3(QPEL_HV, 16, 17, ssse3, sse2, ssse3, sse2) #define SET_QPEL_FUNC(OP, X, Y, SIZE, CPU, PREFIX) \ c->OP ## _qpel_pixels_tab[SIZE == 8][X+4*Y] = PREFIX ## OP ## _qpel ## SIZE ## _mc ## X ## Y ## _ ## CPU @@ -329,4 +331,8 @@ av_cold void ff_qpeldsp_init_x86(QpelDSPContext *c) SET_HV_QPEL_FUNCS(8, sse2,); } #endif + if (EXTERNAL_SSSE3(cpu_flags)) { + SET_H_QPEL_FUNCS(16, ssse3,); + SET_HV_QPEL_FUNCS(16, ssse3,); + } } _______________________________________________ ffmpeg-cvslog mailing list -- [email protected] To unsubscribe send an email to [email protected]
