This is an automated email from the git hooks/post-receive script. Git pushed a commit to branch master in repository ffmpeg.
commit a3d747f3446e828c5e03880d0459d51994f6ec15 Author: Andreas Rheinhardt <[email protected]> AuthorDate: Thu Nov 6 18:10:57 2025 +0100 Commit: Andreas Rheinhardt <[email protected]> CommitDate: Thu Apr 30 10:39:33 2026 +0200 avcodec/x86/qpeldsp{,_init}: Use SSE2 pixels16x16_l2 functions put and avg versions have been added and used in H264 in b91081274f5a5b5f0f1ce820331f702378a425e8. This commit adds the size 16 version of put_no_rnd and uses all three of them in the SSE2 size 16 qpel functions (i.e. it uses them in the ones that have a vertical component); it also removes the 16x17 MMXEXT versions (which are no longer used). This is particularly beneficial for put_no_rnd: avg_qpel_pixels_tab[0][5]_c: 1910.9 ( 1.00x) avg_qpel_pixels_tab[0][5]_sse2 (old): 405.1 ( 4.72x) avg_qpel_pixels_tab[0][5]_sse2: 392.9 ( 4.86x) avg_qpel_pixels_tab[0][6]_c: 1778.9 ( 1.00x) avg_qpel_pixels_tab[0][6]_sse2 (old): 385.5 ( 4.61x) avg_qpel_pixels_tab[0][6]_sse2: 374.9 ( 4.75x) avg_qpel_pixels_tab[0][7]_c: 1935.3 ( 1.00x) avg_qpel_pixels_tab[0][7]_sse2 (old): 403.1 ( 4.80x) avg_qpel_pixels_tab[0][7]_sse2: 391.6 ( 4.94x) avg_qpel_pixels_tab[0][9]_c: 1969.0 ( 1.00x) avg_qpel_pixels_tab[0][9]_sse2 (old): 384.1 ( 5.13x) avg_qpel_pixels_tab[0][9]_sse2: 380.3 ( 5.18x) avg_qpel_pixels_tab[0][11]_c: 2014.9 ( 1.00x) avg_qpel_pixels_tab[0][11]_sse2 (old): 385.6 ( 5.23x) avg_qpel_pixels_tab[0][11]_sse2: 380.2 ( 5.30x) avg_qpel_pixels_tab[0][13]_c: 1925.7 ( 1.00x) avg_qpel_pixels_tab[0][13]_sse2 (old): 406.1 ( 4.74x) avg_qpel_pixels_tab[0][13]_sse2: 390.4 ( 4.93x) avg_qpel_pixels_tab[0][14]_c: 1793.0 ( 1.00x) avg_qpel_pixels_tab[0][14]_sse2 (old): 389.6 ( 4.60x) avg_qpel_pixels_tab[0][14]_sse2: 377.1 ( 4.75x) avg_qpel_pixels_tab[0][15]_c: 1913.0 ( 1.00x) avg_qpel_pixels_tab[0][15]_sse2 (old): 404.2 ( 4.73x) avg_qpel_pixels_tab[0][15]_sse2: 390.8 ( 4.89x) put_no_rnd_qpel_pixels_tab[0][5]_c: 1864.1 ( 1.00x) put_no_rnd_qpel_pixels_tab[0][5]_sse2 (old): 425.6 ( 4.38x) put_no_rnd_qpel_pixels_tab[0][5]_sse2: 396.2 ( 4.71x) put_no_rnd_qpel_pixels_tab[0][6]_c: 1767.1 ( 1.00x) put_no_rnd_qpel_pixels_tab[0][6]_sse2 (old): 388.4 ( 4.55x) put_no_rnd_qpel_pixels_tab[0][6]_sse2: 377.7 ( 4.68x) put_no_rnd_qpel_pixels_tab[0][7]_c: 1874.9 ( 1.00x) put_no_rnd_qpel_pixels_tab[0][7]_sse2 (old): 427.6 ( 4.38x) put_no_rnd_qpel_pixels_tab[0][7]_sse2: 400.0 ( 4.69x) put_no_rnd_qpel_pixels_tab[0][9]_c: 1759.7 ( 1.00x) put_no_rnd_qpel_pixels_tab[0][9]_sse2 (old): 393.0 ( 4.48x) put_no_rnd_qpel_pixels_tab[0][9]_sse2: 379.7 ( 4.63x) put_no_rnd_qpel_pixels_tab[0][11]_c: 1820.9 ( 1.00x) put_no_rnd_qpel_pixels_tab[0][11]_sse2 (old): 392.7 ( 4.64x) put_no_rnd_qpel_pixels_tab[0][11]_sse2: 377.4 ( 4.82x) put_no_rnd_qpel_pixels_tab[0][13]_c: 1841.2 ( 1.00x) put_no_rnd_qpel_pixels_tab[0][13]_sse2 (old): 427.1 ( 4.31x) put_no_rnd_qpel_pixels_tab[0][13]_sse2: 395.9 ( 4.65x) put_no_rnd_qpel_pixels_tab[0][14]_c: 1761.3 ( 1.00x) put_no_rnd_qpel_pixels_tab[0][14]_sse2 (old): 392.3 ( 4.49x) put_no_rnd_qpel_pixels_tab[0][14]_sse2: 375.9 ( 4.69x) put_no_rnd_qpel_pixels_tab[0][15]_c: 1869.1 ( 1.00x) put_no_rnd_qpel_pixels_tab[0][15]_sse2 (old): 425.6 ( 4.39x) put_no_rnd_qpel_pixels_tab[0][15]_sse2: 397.3 ( 4.70x) put_qpel_pixels_tab[0][5]_c: 1888.2 ( 1.00x) put_qpel_pixels_tab[0][5]_sse2 (old): 396.5 ( 4.76x) put_qpel_pixels_tab[0][5]_sse2: 382.5 ( 4.94x) put_qpel_pixels_tab[0][6]_c: 1760.4 ( 1.00x) put_qpel_pixels_tab[0][6]_sse2 (old): 377.0 ( 4.67x) put_qpel_pixels_tab[0][6]_sse2: 372.1 ( 4.73x) put_qpel_pixels_tab[0][7]_c: 1927.6 ( 1.00x) put_qpel_pixels_tab[0][7]_sse2 (old): 396.5 ( 4.86x) put_qpel_pixels_tab[0][7]_sse2: 383.4 ( 5.03x) put_qpel_pixels_tab[0][9]_c: 1775.9 ( 1.00x) put_qpel_pixels_tab[0][9]_sse2 (old): 377.9 ( 4.70x) put_qpel_pixels_tab[0][9]_sse2: 372.3 ( 4.77x) put_qpel_pixels_tab[0][11]_c: 1809.0 ( 1.00x) put_qpel_pixels_tab[0][11]_sse2 (old): 374.6 ( 4.83x) put_qpel_pixels_tab[0][11]_sse2: 380.3 ( 4.76x) put_qpel_pixels_tab[0][13]_c: 1893.2 ( 1.00x) put_qpel_pixels_tab[0][13]_sse2 (old): 399.2 ( 4.74x) put_qpel_pixels_tab[0][13]_sse2: 384.7 ( 4.92x) put_qpel_pixels_tab[0][14]_c: 1756.2 ( 1.00x) put_qpel_pixels_tab[0][14]_sse2 (old): 377.9 ( 4.65x) put_qpel_pixels_tab[0][14]_sse2: 374.4 ( 4.69x) put_qpel_pixels_tab[0][15]_c: 1922.8 ( 1.00x) put_qpel_pixels_tab[0][15]_sse2 (old): 399.0 ( 4.82x) put_qpel_pixels_tab[0][15]_sse2: 387.8 ( 4.96x) The purely vertical size 16 mc functions now no longer use any MMX. Signed-off-by: Andreas Rheinhardt <[email protected]> --- libavcodec/x86/qpel.asm | 18 ------- libavcodec/x86/qpeldsp.asm | 109 ++++++++++++++++-------------------------- libavcodec/x86/qpeldsp_init.c | 19 ++++---- 3 files changed, 53 insertions(+), 93 deletions(-) diff --git a/libavcodec/x86/qpel.asm b/libavcodec/x86/qpel.asm index 8f18cf93db..ef4daeb018 100644 --- a/libavcodec/x86/qpel.asm +++ b/libavcodec/x86/qpel.asm @@ -37,7 +37,6 @@ SECTION .text %macro PIXELS_L2 2-3 ; avg vs put, size, size+1 %define OP op_%1 %ifidn %1, put -%if notcpuflag(sse2) ; SSE2 currently only uses 16x16 ; void ff_put_pixels8x9_l2_mmxext(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, ; ptrdiff_t dstStride, ptrdiff_t src1Stride) cglobal put_pixels%2x%3_l2, 5,6,2 @@ -50,7 +49,6 @@ cglobal put_pixels%2x%3_l2, 5,6,2 ; FIXME: avoid jump if prologue is empty jmp %1_pixels%2x%2_after_prologue_ %+ cpuname %endif -%endif ; void ff_avg/put_pixels8x8_l2_mmxext(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, ; ptrdiff_t dstStride, ptrdiff_t src1Stride) cglobal %1_pixels%2x%2_l2, 5,6,2 @@ -89,22 +87,6 @@ PIXELS_L2 avg, 16 %macro PIXELS16_L2 1 %define OP op_%1 -%ifidn %1, put -; void ff_put_pixels16x17_l2_mmxext(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, -; ptrdiff_t dstStride, ptrdiff_t src1Stride) -cglobal put_pixels16x17_l2, 5,6 - mova m0, [r1] - mova m1, [r1+8] - pavgb m0, [r2] - pavgb m1, [r2+8] - add r1, r4 - add r2, 16 - OP m0, [r0] - OP m1, [r0+8] - add r0, r3 - ; FIXME: avoid jump if prologue is empty - jmp %1_pixels16x16_after_prologue_ %+ cpuname -%endif ; void ff_avg/put_pixels16x16_l2_mmxext(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, ; ptrdiff_t dstStride, ptrdiff_t src1Stride) cglobal %1_pixels16x16_l2, 5,6 diff --git a/libavcodec/x86/qpeldsp.asm b/libavcodec/x86/qpeldsp.asm index a2e04f247f..6c2c96cee9 100644 --- a/libavcodec/x86/qpeldsp.asm +++ b/libavcodec/x86/qpeldsp.asm @@ -33,105 +33,83 @@ pw_20: times 8 dw 20 SECTION .text -%macro PUT_NO_RND_PIXELS8_L2 0 +%macro PUT_NO_RND_PIXELS_L2 2 ; void ff_put_no_rnd_pixels8x9_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, ; ptrdiff_t dstStride, ptrdiff_t src1Stride) -cglobal put_no_rnd_pixels8x9_l2, 5,6 - pcmpeqb m6, m6 - mova m0, [r1] +cglobal put_no_rnd_pixels%1x%2_l2, 5,6,5 + movu m0, [r1] mova m1, [r2] + pcmpeqb m4, m4 add r1, r4 - add r2, 8 - pxor m0, m6 - pxor m1, m6 - PAVGB m0, m1 - pxor m0, m6 + add r2, %1 + pxor m0, m4 + pxor m1, m4 + pavgb m0, m1 + pxor m0, m4 mova [r0], m0 add r0, r3 - jmp put_no_rnd_pixels8x8_after_prologue_ %+ cpuname + jmp put_no_rnd_pixels%1x%1_after_prologue_ %+ cpuname ; void ff_put_no_rnd_pixels8x8_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, ; ptrdiff_t dstStride, ptrdiff_t src1Stride) -cglobal put_no_rnd_pixels8x8_l2, 5,6 - pcmpeqb m6, m6 -put_no_rnd_pixels8x8_after_prologue_ %+ cpuname: - mov r5d, 8 +cglobal put_no_rnd_pixels%1x%1_l2, 5,6,5 + pcmpeqb m4, m4 +put_no_rnd_pixels%1x%1_after_prologue_ %+ cpuname: + mov r5d, %1 .loop: - mova m0, [r1] + movu m0, [r1] add r1, r4 - mova m1, [r1] + movu m1, [r1] add r1, r4 mova m2, [r2] - mova m3, [r2+8] - pxor m0, m6 - pxor m1, m6 - pxor m2, m6 - pxor m3, m6 - PAVGB m0, m2 - PAVGB m1, m3 - pxor m0, m6 - pxor m1, m6 + mova m3, [r2+%1] + pxor m0, m4 + pxor m1, m4 + pxor m2, m4 + pxor m3, m4 + pavgb m0, m2 + pavgb m1, m3 + pxor m0, m4 + pxor m1, m4 mova [r0], m0 add r0, r3 mova [r0], m1 add r0, r3 - mova m0, [r1] + movu m0, [r1] add r1, r4 - mova m1, [r1] + movu m1, [r1] add r1, r4 - mova m2, [r2+16] - mova m3, [r2+24] - pxor m0, m6 - pxor m1, m6 - pxor m2, m6 - pxor m3, m6 - PAVGB m0, m2 - PAVGB m1, m3 - pxor m0, m6 - pxor m1, m6 + mova m2, [r2+2*%1] + mova m3, [r2+3*%1] + add r2, 4*%1 + pxor m0, m4 + pxor m1, m4 + pxor m2, m4 + pxor m3, m4 + pavgb m0, m2 + pavgb m1, m3 + pxor m0, m4 + pxor m1, m4 mova [r0], m0 add r0, r3 mova [r0], m1 add r0, r3 - add r2, 32 sub r5d, 4 jne .loop RET %endmacro INIT_MMX mmxext -PUT_NO_RND_PIXELS8_L2 +PUT_NO_RND_PIXELS_L2 8, 9 +INIT_XMM sse2 +PUT_NO_RND_PIXELS_L2 16, 17 -; void ff_put_no_rnd_pixels16x17_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, -; ptrdiff_t dstStride, ptrdiff_t src1Stride) -%macro PUT_NO_RND_PIXELS16_l2 0 -cglobal put_no_rnd_pixels16x17_l2, 5,6 - pcmpeqb m6, m6 - mova m0, [r1] - mova m1, [r1+8] - mova m2, [r2] - mova m3, [r2+8] - pxor m0, m6 - pxor m1, m6 - pxor m2, m6 - pxor m3, m6 - PAVGB m0, m2 - PAVGB m1, m3 - pxor m0, m6 - pxor m1, m6 - add r1, r4 - add r2, 16 - mova [r0], m0 - mova [r0+8], m1 - add r0, r3 - jmp put_no_rnd_pixels16x16_after_prologue_ %+ cpuname - ; void ff_put_no_rnd_pixels16x16_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, ; ptrdiff_t dstStride, ptrdiff_t src1Stride) +INIT_MMX mmxext cglobal put_no_rnd_pixels16x16_l2, 5,6 pcmpeqb m6, m6 -put_no_rnd_pixels16x16_after_prologue_ %+ cpuname: mov r5d, 16 .loop: mova m0, [r1] @@ -170,10 +148,7 @@ put_no_rnd_pixels16x16_after_prologue_ %+ cpuname: sub r5d, 2 jne .loop RET -%endmacro -INIT_MMX mmxext -PUT_NO_RND_PIXELS16_l2 %macro MPEG4_QPEL16_H_LOWPASS 1 cglobal %1_mpeg4_qpel16_h_lowpass, 5, 5, 0, 16 diff --git a/libavcodec/x86/qpeldsp_init.c b/libavcodec/x86/qpeldsp_init.c index 64e3348d2e..877bb9bdce 100644 --- a/libavcodec/x86/qpeldsp_init.c +++ b/libavcodec/x86/qpeldsp_init.c @@ -35,9 +35,9 @@ void ff_put_pixels8x9_l2_mmxext(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, ptrdiff_t dstStride, ptrdiff_t src1Stride); -void ff_put_pixels16x17_l2_mmxext(uint8_t *dst, - const uint8_t *src1, const uint8_t *src2, - ptrdiff_t dstStride, ptrdiff_t src1Stride); +void ff_put_pixels16x17_l2_sse2(uint8_t *dst, + const uint8_t *src1, const uint8_t *src2, + ptrdiff_t dstStride, ptrdiff_t src1Stride); void ff_put_no_rnd_pixels8x8_l2_mmxext(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, ptrdiff_t dstStride, ptrdiff_t src1Stride); @@ -47,9 +47,12 @@ void ff_put_no_rnd_pixels8x9_l2_mmxext(uint8_t *dst, void ff_put_no_rnd_pixels16x16_l2_mmxext(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, ptrdiff_t dstStride, ptrdiff_t src1Stride); -void ff_put_no_rnd_pixels16x17_l2_mmxext(uint8_t *dst, - const uint8_t *src1, const uint8_t *src2, - ptrdiff_t dstStride, ptrdiff_t src1Stride); +void ff_put_no_rnd_pixels16x16_l2_sse2(uint8_t *dst, + const uint8_t *src1, const uint8_t *src2, + ptrdiff_t dstStride, ptrdiff_t src1Stride); +void ff_put_no_rnd_pixels16x17_l2_sse2(uint8_t *dst, + const uint8_t *src1, const uint8_t *src2, + ptrdiff_t dstStride, ptrdiff_t src1Stride); #define QPEL_H(OPNAME, RND, SIZE, UNUSED1, XMM, UNUSED2, UNUSED3, L2) \ void ff_ ## OPNAME ## _mpeg4_qpel ## SIZE ## _h_lowpass_ ## XMM (uint8_t *dst, \ @@ -269,8 +272,8 @@ QPEL3(QPEL_H, 16, 17, mmxext, mmxext, mmxext, mmxext) QPEL3(QPEL_V, 8, 9, ssse3, sse2, ssse3, mmxext) QPEL3(QPEL_HV, 8, 9, mmxext, sse2, sse2, mmxext) -QPEL3(QPEL_V, 16, 17, ssse3, sse2, ssse3, mmxext) -QPEL3(QPEL_HV, 16, 17, mmxext, sse2, sse2, mmxext) +QPEL3(QPEL_V, 16, 17, ssse3, sse2, ssse3, sse2) +QPEL3(QPEL_HV, 16, 17, mmxext, sse2, sse2, sse2) #define SET_QPEL_FUNC(OP, X, Y, SIZE, CPU, PREFIX) \ c->OP ## _qpel_pixels_tab[SIZE == 8][X+4*Y] = PREFIX ## OP ## _qpel ## SIZE ## _mc ## X ## Y ## _ ## CPU _______________________________________________ ffmpeg-cvslog mailing list -- [email protected] To unsubscribe send an email to [email protected]
