This is an automated email from the git hooks/post-receive script. Git pushed a commit to branch master in repository ffmpeg.
commit 9beecb26704e8d9a4a27c07fd8da05eb94cf45ed Author: Andreas Rheinhardt <[email protected]> AuthorDate: Thu Nov 6 16:51:58 2025 +0100 Commit: Andreas Rheinhardt <[email protected]> CommitDate: Thu Apr 30 10:39:33 2026 +0200 avcodec/x86/qpeldsp: Add SSE2 vertical lowpass functions Benchmarks ([4], [8] and [12] are pure vertical functions and therefore show the biggest improvements): avg_qpel_pixels_tab[0][4]_c: 844.5 ( 1.00x) avg_qpel_pixels_tab[0][4]_mmxext: 225.5 ( 3.74x) avg_qpel_pixels_tab[0][4]_sse2: 146.6 ( 5.76x) avg_qpel_pixels_tab[0][5]_c: 1915.9 ( 1.00x) avg_qpel_pixels_tab[0][5]_mmxext: 499.6 ( 3.83x) avg_qpel_pixels_tab[0][5]_sse2: 405.5 ( 4.72x) avg_qpel_pixels_tab[0][6]_c: 1775.9 ( 1.00x) avg_qpel_pixels_tab[0][6]_mmxext: 484.9 ( 3.66x) avg_qpel_pixels_tab[0][6]_sse2: 385.4 ( 4.61x) avg_qpel_pixels_tab[0][7]_c: 1937.0 ( 1.00x) avg_qpel_pixels_tab[0][7]_mmxext: 501.3 ( 3.86x) avg_qpel_pixels_tab[0][7]_sse2: 403.6 ( 4.80x) avg_qpel_pixels_tab[0][8]_c: 976.7 ( 1.00x) avg_qpel_pixels_tab[0][8]_mmxext: 216.9 ( 4.50x) avg_qpel_pixels_tab[0][8]_sse2: 113.1 ( 8.64x) avg_qpel_pixels_tab[0][9]_c: 1971.8 ( 1.00x) avg_qpel_pixels_tab[0][9]_mmxext: 494.9 ( 3.98x) avg_qpel_pixels_tab[0][9]_sse2: 388.3 ( 5.08x) avg_qpel_pixels_tab[0][10]_c: 1900.8 ( 1.00x) avg_qpel_pixels_tab[0][10]_mmxext: 476.4 ( 3.99x) avg_qpel_pixels_tab[0][10]_sse2: 362.4 ( 5.24x) avg_qpel_pixels_tab[0][11]_c: 2003.3 ( 1.00x) avg_qpel_pixels_tab[0][11]_mmxext: 496.5 ( 4.04x) avg_qpel_pixels_tab[0][11]_sse2: 385.9 ( 5.19x) avg_qpel_pixels_tab[0][12]_c: 841.8 ( 1.00x) avg_qpel_pixels_tab[0][12]_mmxext: 226.7 ( 3.71x) avg_qpel_pixels_tab[0][12]_sse2: 143.3 ( 5.87x) avg_qpel_pixels_tab[0][13]_c: 1929.0 ( 1.00x) avg_qpel_pixels_tab[0][13]_mmxext: 499.6 ( 3.86x) avg_qpel_pixels_tab[0][13]_sse2: 412.1 ( 4.68x) avg_qpel_pixels_tab[0][14]_c: 1777.9 ( 1.00x) avg_qpel_pixels_tab[0][14]_mmxext: 484.8 ( 3.67x) avg_qpel_pixels_tab[0][14]_sse2: 385.9 ( 4.61x) avg_qpel_pixels_tab[0][15]_c: 1914.8 ( 1.00x) avg_qpel_pixels_tab[0][15]_mmxext: 501.8 ( 3.82x) avg_qpel_pixels_tab[0][15]_sse2: 405.0 ( 4.73x) avg_qpel_pixels_tab[1][4]_c: 203.4 ( 1.00x) avg_qpel_pixels_tab[1][4]_mmxext: 64.7 ( 3.14x) avg_qpel_pixels_tab[1][4]_sse2: 40.3 ( 5.05x) avg_qpel_pixels_tab[1][5]_c: 488.8 ( 1.00x) avg_qpel_pixels_tab[1][5]_mmxext: 134.6 ( 3.63x) avg_qpel_pixels_tab[1][5]_sse2: 108.5 ( 4.50x) avg_qpel_pixels_tab[1][6]_c: 448.2 ( 1.00x) avg_qpel_pixels_tab[1][6]_mmxext: 128.8 ( 3.48x) avg_qpel_pixels_tab[1][6]_sse2: 102.5 ( 4.37x) avg_qpel_pixels_tab[1][7]_c: 489.6 ( 1.00x) avg_qpel_pixels_tab[1][7]_mmxext: 134.5 ( 3.64x) avg_qpel_pixels_tab[1][7]_sse2: 108.8 ( 4.50x) avg_qpel_pixels_tab[1][8]_c: 223.8 ( 1.00x) avg_qpel_pixels_tab[1][8]_mmxext: 57.5 ( 3.89x) avg_qpel_pixels_tab[1][8]_sse2: 36.3 ( 6.16x) avg_qpel_pixels_tab[1][9]_c: 496.6 ( 1.00x) avg_qpel_pixels_tab[1][9]_mmxext: 129.8 ( 3.82x) avg_qpel_pixels_tab[1][9]_sse2: 105.1 ( 4.72x) avg_qpel_pixels_tab[1][10]_c: 466.1 ( 1.00x) avg_qpel_pixels_tab[1][10]_mmxext: 123.2 ( 3.78x) avg_qpel_pixels_tab[1][10]_sse2: 99.1 ( 4.70x) avg_qpel_pixels_tab[1][11]_c: 497.9 ( 1.00x) avg_qpel_pixels_tab[1][11]_mmxext: 129.9 ( 3.83x) avg_qpel_pixels_tab[1][11]_sse2: 105.4 ( 4.72x) avg_qpel_pixels_tab[1][12]_c: 203.5 ( 1.00x) avg_qpel_pixels_tab[1][12]_mmxext: 63.8 ( 3.19x) avg_qpel_pixels_tab[1][12]_sse2: 38.8 ( 5.25x) avg_qpel_pixels_tab[1][13]_c: 487.9 ( 1.00x) avg_qpel_pixels_tab[1][13]_mmxext: 134.7 ( 3.62x) avg_qpel_pixels_tab[1][13]_sse2: 108.4 ( 4.50x) avg_qpel_pixels_tab[1][14]_c: 447.4 ( 1.00x) avg_qpel_pixels_tab[1][14]_mmxext: 128.2 ( 3.49x) avg_qpel_pixels_tab[1][14]_sse2: 102.4 ( 4.37x) avg_qpel_pixels_tab[1][15]_c: 487.5 ( 1.00x) avg_qpel_pixels_tab[1][15]_mmxext: 134.0 ( 3.64x) avg_qpel_pixels_tab[1][15]_sse2: 109.9 ( 4.44x) put_no_rnd_qpel_pixels_tab[0][4]_c: 825.5 ( 1.00x) put_no_rnd_qpel_pixels_tab[0][4]_mmxext: 242.5 ( 3.40x) put_no_rnd_qpel_pixels_tab[0][4]_sse2: 136.0 ( 6.07x) put_no_rnd_qpel_pixels_tab[0][5]_c: 1837.4 ( 1.00x) put_no_rnd_qpel_pixels_tab[0][5]_mmxext: 542.5 ( 3.39x) put_no_rnd_qpel_pixels_tab[0][5]_sse2: 446.5 ( 4.11x) put_no_rnd_qpel_pixels_tab[0][6]_c: 1766.3 ( 1.00x) put_no_rnd_qpel_pixels_tab[0][6]_mmxext: 493.6 ( 3.58x) put_no_rnd_qpel_pixels_tab[0][6]_sse2: 394.6 ( 4.48x) put_no_rnd_qpel_pixels_tab[0][7]_c: 1877.4 ( 1.00x) put_no_rnd_qpel_pixels_tab[0][7]_mmxext: 541.9 ( 3.46x) put_no_rnd_qpel_pixels_tab[0][7]_sse2: 447.6 ( 4.19x) put_no_rnd_qpel_pixels_tab[0][8]_c: 785.1 ( 1.00x) put_no_rnd_qpel_pixels_tab[0][8]_mmxext: 206.2 ( 3.81x) put_no_rnd_qpel_pixels_tab[0][8]_sse2: 101.6 ( 7.73x) put_no_rnd_qpel_pixels_tab[0][9]_c: 1772.2 ( 1.00x) put_no_rnd_qpel_pixels_tab[0][9]_mmxext: 489.5 ( 3.62x) put_no_rnd_qpel_pixels_tab[0][9]_sse2: 394.8 ( 4.49x) put_no_rnd_qpel_pixels_tab[0][10]_c: 1711.5 ( 1.00x) put_no_rnd_qpel_pixels_tab[0][10]_mmxext: 461.2 ( 3.71x) put_no_rnd_qpel_pixels_tab[0][10]_sse2: 357.9 ( 4.78x) put_no_rnd_qpel_pixels_tab[0][11]_c: 1815.9 ( 1.00x) put_no_rnd_qpel_pixels_tab[0][11]_mmxext: 490.8 ( 3.70x) put_no_rnd_qpel_pixels_tab[0][11]_sse2: 394.0 ( 4.61x) put_no_rnd_qpel_pixels_tab[0][12]_c: 824.8 ( 1.00x) put_no_rnd_qpel_pixels_tab[0][12]_mmxext: 242.9 ( 3.40x) put_no_rnd_qpel_pixels_tab[0][12]_sse2: 135.3 ( 6.10x) put_no_rnd_qpel_pixels_tab[0][13]_c: 1843.5 ( 1.00x) put_no_rnd_qpel_pixels_tab[0][13]_mmxext: 545.4 ( 3.38x) put_no_rnd_qpel_pixels_tab[0][13]_sse2: 444.9 ( 4.14x) put_no_rnd_qpel_pixels_tab[0][14]_c: 1758.1 ( 1.00x) put_no_rnd_qpel_pixels_tab[0][14]_mmxext: 497.7 ( 3.53x) put_no_rnd_qpel_pixels_tab[0][14]_sse2: 393.5 ( 4.47x) put_no_rnd_qpel_pixels_tab[0][15]_c: 1861.3 ( 1.00x) put_no_rnd_qpel_pixels_tab[0][15]_mmxext: 545.0 ( 3.42x) put_no_rnd_qpel_pixels_tab[0][15]_sse2: 445.7 ( 4.18x) put_no_rnd_qpel_pixels_tab[1][4]_c: 198.3 ( 1.00x) put_no_rnd_qpel_pixels_tab[1][4]_mmxext: 64.3 ( 3.08x) put_no_rnd_qpel_pixels_tab[1][4]_sse2: 39.8 ( 4.98x) put_no_rnd_qpel_pixels_tab[1][5]_c: 460.7 ( 1.00x) put_no_rnd_qpel_pixels_tab[1][5]_mmxext: 137.2 ( 3.36x) put_no_rnd_qpel_pixels_tab[1][5]_sse2: 113.5 ( 4.06x) put_no_rnd_qpel_pixels_tab[1][6]_c: 441.4 ( 1.00x) put_no_rnd_qpel_pixels_tab[1][6]_mmxext: 126.7 ( 3.49x) put_no_rnd_qpel_pixels_tab[1][6]_sse2: 103.7 ( 4.26x) put_no_rnd_qpel_pixels_tab[1][7]_c: 465.9 ( 1.00x) put_no_rnd_qpel_pixels_tab[1][7]_mmxext: 137.7 ( 3.38x) put_no_rnd_qpel_pixels_tab[1][7]_sse2: 114.0 ( 4.09x) put_no_rnd_qpel_pixels_tab[1][8]_c: 193.8 ( 1.00x) put_no_rnd_qpel_pixels_tab[1][8]_mmxext: 52.1 ( 3.72x) put_no_rnd_qpel_pixels_tab[1][8]_sse2: 27.8 ( 6.97x) put_no_rnd_qpel_pixels_tab[1][9]_c: 450.9 ( 1.00x) put_no_rnd_qpel_pixels_tab[1][9]_mmxext: 126.2 ( 3.57x) put_no_rnd_qpel_pixels_tab[1][9]_sse2: 104.3 ( 4.32x) put_no_rnd_qpel_pixels_tab[1][10]_c: 436.5 ( 1.00x) put_no_rnd_qpel_pixels_tab[1][10]_mmxext: 118.1 ( 3.69x) put_no_rnd_qpel_pixels_tab[1][10]_sse2: 92.4 ( 4.73x) put_no_rnd_qpel_pixels_tab[1][11]_c: 453.6 ( 1.00x) put_no_rnd_qpel_pixels_tab[1][11]_mmxext: 128.7 ( 3.52x) put_no_rnd_qpel_pixels_tab[1][11]_sse2: 103.6 ( 4.38x) put_no_rnd_qpel_pixels_tab[1][12]_c: 201.2 ( 1.00x) put_no_rnd_qpel_pixels_tab[1][12]_mmxext: 64.2 ( 3.13x) put_no_rnd_qpel_pixels_tab[1][12]_sse2: 39.6 ( 5.08x) put_no_rnd_qpel_pixels_tab[1][13]_c: 461.9 ( 1.00x) put_no_rnd_qpel_pixels_tab[1][13]_mmxext: 137.6 ( 3.36x) put_no_rnd_qpel_pixels_tab[1][13]_sse2: 113.4 ( 4.07x) put_no_rnd_qpel_pixels_tab[1][14]_c: 442.6 ( 1.00x) put_no_rnd_qpel_pixels_tab[1][14]_mmxext: 127.0 ( 3.49x) put_no_rnd_qpel_pixels_tab[1][14]_sse2: 102.2 ( 4.33x) put_no_rnd_qpel_pixels_tab[1][15]_c: 462.9 ( 1.00x) put_no_rnd_qpel_pixels_tab[1][15]_mmxext: 139.5 ( 3.32x) put_no_rnd_qpel_pixels_tab[1][15]_sse2: 113.3 ( 4.09x) put_qpel_pixels_tab[0][4]_c: 824.6 ( 1.00x) put_qpel_pixels_tab[0][4]_mmxext: 220.1 ( 3.75x) put_qpel_pixels_tab[0][4]_sse2: 137.8 ( 5.98x) put_qpel_pixels_tab[0][5]_c: 1892.0 ( 1.00x) put_qpel_pixels_tab[0][5]_mmxext: 508.0 ( 3.72x) put_qpel_pixels_tab[0][5]_sse2: 408.6 ( 4.63x) put_qpel_pixels_tab[0][6]_c: 1758.0 ( 1.00x) put_qpel_pixels_tab[0][6]_mmxext: 476.7 ( 3.69x) put_qpel_pixels_tab[0][6]_sse2: 381.4 ( 4.61x) put_qpel_pixels_tab[0][7]_c: 1924.3 ( 1.00x) put_qpel_pixels_tab[0][7]_mmxext: 495.1 ( 3.89x) put_qpel_pixels_tab[0][7]_sse2: 417.2 ( 4.61x) put_qpel_pixels_tab[0][8]_c: 772.1 ( 1.00x) put_qpel_pixels_tab[0][8]_mmxext: 197.5 ( 3.91x) put_qpel_pixels_tab[0][8]_sse2: 118.4 ( 6.52x) put_qpel_pixels_tab[0][9]_c: 1778.2 ( 1.00x) put_qpel_pixels_tab[0][9]_mmxext: 476.7 ( 3.73x) put_qpel_pixels_tab[0][9]_sse2: 379.6 ( 4.68x) put_qpel_pixels_tab[0][10]_c: 1714.6 ( 1.00x) put_qpel_pixels_tab[0][10]_mmxext: 460.7 ( 3.72x) put_qpel_pixels_tab[0][10]_sse2: 386.8 ( 4.43x) put_qpel_pixels_tab[0][11]_c: 1819.1 ( 1.00x) put_qpel_pixels_tab[0][11]_mmxext: 474.9 ( 3.83x) put_qpel_pixels_tab[0][11]_sse2: 404.5 ( 4.50x) put_qpel_pixels_tab[0][12]_c: 829.7 ( 1.00x) put_qpel_pixels_tab[0][12]_mmxext: 221.5 ( 3.75x) put_qpel_pixels_tab[0][12]_sse2: 138.7 ( 5.98x) put_qpel_pixels_tab[0][13]_c: 1892.8 ( 1.00x) put_qpel_pixels_tab[0][13]_mmxext: 494.4 ( 3.83x) put_qpel_pixels_tab[0][13]_sse2: 413.9 ( 4.57x) put_qpel_pixels_tab[0][14]_c: 1763.1 ( 1.00x) put_qpel_pixels_tab[0][14]_mmxext: 473.4 ( 3.72x) put_qpel_pixels_tab[0][14]_sse2: 377.8 ( 4.67x) put_qpel_pixels_tab[0][15]_c: 1896.4 ( 1.00x) put_qpel_pixels_tab[0][15]_mmxext: 492.5 ( 3.85x) put_qpel_pixels_tab[0][15]_sse2: 399.0 ( 4.75x) put_qpel_pixels_tab[1][4]_c: 198.6 ( 1.00x) put_qpel_pixels_tab[1][4]_mmxext: 60.9 ( 3.26x) put_qpel_pixels_tab[1][4]_sse2: 40.1 ( 4.95x) put_qpel_pixels_tab[1][5]_c: 471.4 ( 1.00x) put_qpel_pixels_tab[1][5]_mmxext: 131.8 ( 3.58x) put_qpel_pixels_tab[1][5]_sse2: 107.2 ( 4.40x) put_qpel_pixels_tab[1][6]_c: 440.3 ( 1.00x) put_qpel_pixels_tab[1][6]_mmxext: 126.3 ( 3.49x) put_qpel_pixels_tab[1][6]_sse2: 100.6 ( 4.38x) put_qpel_pixels_tab[1][7]_c: 469.2 ( 1.00x) put_qpel_pixels_tab[1][7]_mmxext: 131.7 ( 3.56x) put_qpel_pixels_tab[1][7]_sse2: 106.9 ( 4.39x) put_qpel_pixels_tab[1][8]_c: 194.2 ( 1.00x) put_qpel_pixels_tab[1][8]_mmxext: 52.9 ( 3.67x) put_qpel_pixels_tab[1][8]_sse2: 28.0 ( 6.95x) put_qpel_pixels_tab[1][9]_c: 464.6 ( 1.00x) put_qpel_pixels_tab[1][9]_mmxext: 125.1 ( 3.71x) put_qpel_pixels_tab[1][9]_sse2: 100.9 ( 4.60x) put_qpel_pixels_tab[1][10]_c: 433.8 ( 1.00x) put_qpel_pixels_tab[1][10]_mmxext: 118.2 ( 3.67x) put_qpel_pixels_tab[1][10]_sse2: 94.5 ( 4.59x) put_qpel_pixels_tab[1][11]_c: 463.9 ( 1.00x) put_qpel_pixels_tab[1][11]_mmxext: 125.5 ( 3.70x) put_qpel_pixels_tab[1][11]_sse2: 102.6 ( 4.52x) put_qpel_pixels_tab[1][12]_c: 199.2 ( 1.00x) put_qpel_pixels_tab[1][12]_mmxext: 63.7 ( 3.12x) put_qpel_pixels_tab[1][12]_sse2: 36.2 ( 5.50x) put_qpel_pixels_tab[1][13]_c: 475.6 ( 1.00x) put_qpel_pixels_tab[1][13]_mmxext: 139.5 ( 3.41x) put_qpel_pixels_tab[1][13]_sse2: 107.3 ( 4.43x) put_qpel_pixels_tab[1][14]_c: 441.9 ( 1.00x) put_qpel_pixels_tab[1][14]_mmxext: 126.9 ( 3.48x) put_qpel_pixels_tab[1][14]_sse2: 101.3 ( 4.36x) put_qpel_pixels_tab[1][15]_c: 475.9 ( 1.00x) put_qpel_pixels_tab[1][15]_mmxext: 131.9 ( 3.61x) put_qpel_pixels_tab[1][15]_sse2: 107.0 ( 4.45x) The new functions (in qpeldsp.asm) occupy 8244B (the MMXEXT functions which they will replace occupy only 6720B). Signed-off-by: Andreas Rheinhardt <[email protected]> --- libavcodec/x86/qpeldsp.asm | 144 ++++++++++++++++++++++++++---------------- libavcodec/x86/qpeldsp_init.c | 34 ++++++++++ 2 files changed, 123 insertions(+), 55 deletions(-) diff --git a/libavcodec/x86/qpeldsp.asm b/libavcodec/x86/qpeldsp.asm index fd97b71134..d6c8778151 100644 --- a/libavcodec/x86/qpeldsp.asm +++ b/libavcodec/x86/qpeldsp.asm @@ -26,9 +26,9 @@ SECTION_RODATA cextern pw_3 -pw_15: times 4 dw 15 +pw_15: times 8 dw 15 cextern pw_16 -pw_20: times 4 dw 20 +pw_20: times 8 dw 20 SECTION .text @@ -396,68 +396,75 @@ MPEG4_QPEL8_H_LOWPASS put_no_rnd paddw m5, m4 psraw m5, 5 packuswb m5, m5 - OP_MOV %5, m5, m7 + OP_MOV %5, m5, m4 SWAP 0,1,2,3 %endmacro %macro MPEG4_QPEL16_V_LOWPASS 1 -cglobal %1_mpeg4_qpel16_v_lowpass, 4, 6, 0, 544 +cglobal %1_mpeg4_qpel16_v_lowpass, 4, 6, 7, 544 mov r4d, 17 mov r5, rsp - pxor m7, m7 + pxor m4, m4 .looph: - mova m0, [r1] - mova m1, [r1] + movu m0, [r1] + mova m1, m0 +%if mmsize == 8 mova m2, [r1+8] mova m3, [r1+8] - punpcklbw m0, m7 - punpckhbw m1, m7 - punpcklbw m2, m7 - punpckhbw m3, m7 + punpcklbw m0, m4 + punpckhbw m1, m4 + punpcklbw m2, m4 + punpckhbw m3, m4 mova [r5], m0 mova [r5+0x88], m1 mova [r5+0x110], m2 mova [r5+0x198], m3 - add r5, 8 +%else + punpcklbw m0, m4 + punpckhbw m1, m4 + mova [r5], m0 + mova [r5+0x110], m1 +%endif add r1, r3 + add r5, mmsize dec r4d jne .looph - mov r4d, 4 + mov r4d, 16/(mmsize/2) mov r1, r0 mov r5, rsp .loopv: - mova m0, [r5+ 0x0] - mova m1, [r5+ 0x8] - mova m2, [r5+0x10] - mova m3, [r5+0x18] - add r1, 4 - QPEL_V_LOW [r5+0x10], [r5+ 0x8], [r5+ 0x0], [r5+0x20], [r0] - QPEL_V_LOW [r5+ 0x8], [r5+ 0x0], [r5+ 0x0], [r5+0x28], [r0+r2] + mova m0, [r5+0 * mmsize] + mova m1, [r5+1 * mmsize] + mova m2, [r5+2 * mmsize] + mova m3, [r5+3 * mmsize] + add r1, mmsize/2 + QPEL_V_LOW [r5+2*mmsize], [r5+1*mmsize], [r5+0*mmsize], [r5+4*mmsize], [r0] + QPEL_V_LOW [r5+1*mmsize], [r5+0*mmsize], [r5+0*mmsize], [r5+5*mmsize], [r0+r2] lea r0, [r0+r2*2] - QPEL_V_LOW [r5+ 0x0], [r5+ 0x0], [r5+ 0x8], [r5+0x30], [r0] - QPEL_V_LOW [r5+ 0x0], [r5+ 0x8], [r5+0x10], [r5+0x38], [r0+r2] + QPEL_V_LOW [r5+0*mmsize], [r5+0*mmsize], [r5+1*mmsize], [r5+6*mmsize], [r0] + QPEL_V_LOW [r5+0*mmsize], [r5+1*mmsize], [r5+2*mmsize], [r5+7*mmsize], [r0+r2] lea r0, [r0+r2*2] - QPEL_V_LOW [r5+ 0x8], [r5+0x10], [r5+0x18], [r5+0x40], [r0] - QPEL_V_LOW [r5+0x10], [r5+0x18], [r5+0x20], [r5+0x48], [r0+r2] + QPEL_V_LOW [r5+1*mmsize], [r5+2*mmsize], [r5+3*mmsize], [r5+8*mmsize], [r0] + QPEL_V_LOW [r5+2*mmsize], [r5+3*mmsize], [r5+4*mmsize], [r5+9*mmsize], [r0+r2] lea r0, [r0+r2*2] - QPEL_V_LOW [r5+0x18], [r5+0x20], [r5+0x28], [r5+0x50], [r0] - QPEL_V_LOW [r5+0x20], [r5+0x28], [r5+0x30], [r5+0x58], [r0+r2] + QPEL_V_LOW [r5+3*mmsize], [r5+4*mmsize], [r5+5*mmsize], [r5+10*mmsize], [r0] + QPEL_V_LOW [r5+4*mmsize], [r5+5*mmsize], [r5+6*mmsize], [r5+11*mmsize], [r0+r2] lea r0, [r0+r2*2] - QPEL_V_LOW [r5+0x28], [r5+0x30], [r5+0x38], [r5+0x60], [r0] - QPEL_V_LOW [r5+0x30], [r5+0x38], [r5+0x40], [r5+0x68], [r0+r2] + QPEL_V_LOW [r5+5*mmsize], [r5+6*mmsize], [r5+7*mmsize], [r5+12*mmsize], [r0] + QPEL_V_LOW [r5+6*mmsize], [r5+7*mmsize], [r5+8*mmsize], [r5+13*mmsize], [r0+r2] lea r0, [r0+r2*2] - QPEL_V_LOW [r5+0x38], [r5+0x40], [r5+0x48], [r5+0x70], [r0] - QPEL_V_LOW [r5+0x40], [r5+0x48], [r5+0x50], [r5+0x78], [r0+r2] + QPEL_V_LOW [r5+7*mmsize], [r5+8*mmsize], [r5+ 9*mmsize], [r5+14*mmsize], [r0] + QPEL_V_LOW [r5+8*mmsize], [r5+9*mmsize], [r5+10*mmsize], [r5+15*mmsize], [r0+r2] lea r0, [r0+r2*2] - QPEL_V_LOW [r5+0x48], [r5+0x50], [r5+0x58], [r5+0x80], [r0] - QPEL_V_LOW [r5+0x50], [r5+0x58], [r5+0x60], [r5+0x80], [r0+r2] + QPEL_V_LOW [r5+ 9*mmsize], [r5+10*mmsize], [r5+11*mmsize], [r5+16*mmsize], [r0] + QPEL_V_LOW [r5+10*mmsize], [r5+11*mmsize], [r5+12*mmsize], [r5+16*mmsize], [r0+r2] lea r0, [r0+r2*2] - QPEL_V_LOW [r5+0x58], [r5+0x60], [r5+0x68], [r5+0x78], [r0] - QPEL_V_LOW [r5+0x60], [r5+0x68], [r5+0x70], [r5+0x70], [r0+r2] + QPEL_V_LOW [r5+11*mmsize], [r5+12*mmsize], [r5+13*mmsize], [r5+15*mmsize], [r0] + QPEL_V_LOW [r5+12*mmsize], [r5+13*mmsize], [r5+14*mmsize], [r5+14*mmsize], [r0+r2] - add r5, 0x88 + add r5, 17*mmsize mov r0, r1 dec r4d jne .loopv @@ -488,47 +495,60 @@ MPEG4_QPEL16_V_LOWPASS put_no_rnd %macro MPEG4_QPEL8_V_LOWPASS 1 -cglobal %1_mpeg4_qpel8_v_lowpass, 4, 6, 0, 144 +cglobal %1_mpeg4_qpel8_v_lowpass, 4, 6, 7, 144 mov r4d, 9 mov r5, rsp - pxor m7, m7 + pxor m2, m2 .looph: - mova m0, [r1] - mova m1, [r1] - punpcklbw m0, m7 - punpckhbw m1, m7 + movq m0, [r1] + add r1, r3 +%if mmsize == 8 + mova m1, m0 + punpcklbw m0, m2 + punpckhbw m1, m2 mova [r5], m0 mova [r5+0x48], m1 - add r5, 8 - add r1, r3 +%else + punpcklbw m0, m2 + mova [r5], m0 +%endif + add r5, mmsize dec r4d jne .looph +%if mmsize == 8 mov r4d, 2 mov r1, r0 mov r5, rsp .loopv: - mova m0, [r5+ 0x0] - mova m1, [r5+ 0x8] - mova m2, [r5+0x10] - mova m3, [r5+0x18] - QPEL_V_LOW [r5+0x10], [r5+ 0x8], [r5+ 0x0], [r5+0x20], [r0] - QPEL_V_LOW [r5+ 0x8], [r5+ 0x0], [r5+ 0x0], [r5+0x28], [r0+r2] +%define R5 r5 +%else +%define R5 rsp +%endif + + mova m0, [R5+0 * mmsize] + mova m1, [R5+1 * mmsize] + mova m2, [R5+2 * mmsize] + mova m3, [R5+3 * mmsize] + QPEL_V_LOW [R5+2*mmsize], [R5+1*mmsize], [R5+0*mmsize], [R5+4*mmsize], [r0] + QPEL_V_LOW [R5+1*mmsize], [R5+0*mmsize], [R5+0*mmsize], [R5+5*mmsize], [r0+r2] lea r0, [r0+r2*2] - QPEL_V_LOW [r5+ 0x0], [r5+ 0x0], [r5+ 0x8], [r5+0x30], [r0] - QPEL_V_LOW [r5+ 0x0], [r5+ 0x8], [r5+0x10], [r5+0x38], [r0+r2] + QPEL_V_LOW [R5+0*mmsize], [R5+0*mmsize], [R5+1*mmsize], [R5+6*mmsize], [r0] + QPEL_V_LOW [R5+0*mmsize], [R5+1*mmsize], [R5+2*mmsize], [R5+7*mmsize], [r0+r2] lea r0, [r0+r2*2] - QPEL_V_LOW [r5+ 0x8], [r5+0x10], [r5+0x18], [r5+0x40], [r0] - QPEL_V_LOW [r5+0x10], [r5+0x18], [r5+0x20], [r5+0x40], [r0+r2] + QPEL_V_LOW [R5+1*mmsize], [R5+2*mmsize], [R5+3*mmsize], [R5+8*mmsize], [r0] + QPEL_V_LOW [R5+2*mmsize], [R5+3*mmsize], [R5+4*mmsize], [R5+8*mmsize], [r0+r2] lea r0, [r0+r2*2] - QPEL_V_LOW [r5+0x18], [r5+0x20], [r5+0x28], [r5+0x38], [r0] - QPEL_V_LOW [r5+0x20], [r5+0x28], [r5+0x30], [r5+0x30], [r0+r2] + QPEL_V_LOW [R5+3*mmsize], [R5+4*mmsize], [R5+5*mmsize], [R5+7*mmsize], [r0] + QPEL_V_LOW [R5+4*mmsize], [R5+5*mmsize], [R5+6*mmsize], [R5+6*mmsize], [r0+r2] +%if mmsize == 8 add r5, 0x48 lea r0, [r1+4] dec r4d jne .loopv +%endif RET %endmacro @@ -542,3 +562,17 @@ MPEG4_QPEL8_V_LOWPASS avg %define PW_ROUND pw_15 %define OP_MOV PUT_OPH MPEG4_QPEL8_V_LOWPASS put_no_rnd + +INIT_XMM sse2 +%define PW_ROUND pw_16 +%define OP_MOV PUT_OPH +MPEG4_QPEL16_V_LOWPASS put +MPEG4_QPEL8_V_LOWPASS put +%define PW_ROUND pw_16 +%define OP_MOV AVG_OPH +MPEG4_QPEL16_V_LOWPASS avg +MPEG4_QPEL8_V_LOWPASS avg +%define PW_ROUND pw_15 +%define OP_MOV PUT_OPH +MPEG4_QPEL16_V_LOWPASS put_no_rnd +MPEG4_QPEL8_V_LOWPASS put_no_rnd diff --git a/libavcodec/x86/qpeldsp_init.c b/libavcodec/x86/qpeldsp_init.c index 7bcd465d2f..025753ce17 100644 --- a/libavcodec/x86/qpeldsp_init.c +++ b/libavcodec/x86/qpeldsp_init.c @@ -271,6 +271,35 @@ QPEL3(QPEL_H, 16, 17, mmxext, mmxext, mmxext, mmxext) QPEL3(QPEL_V, 16, 17, mmxext, mmxext, mmxext, mmxext) QPEL3(QPEL_HV, 16, 17, mmxext, mmxext, mmxext, mmxext) +QPEL3(QPEL_V, 8, 9, ssse3, sse2, ssse3, mmxext) +QPEL3(QPEL_HV, 8, 9, mmxext, sse2, sse2, mmxext) +QPEL3(QPEL_V, 16, 17, ssse3, sse2, ssse3, mmxext) +QPEL3(QPEL_HV, 16, 17, mmxext, sse2, sse2, mmxext) + +#define SET_QPEL_FUNC(OP, X, Y, SIZE, CPU, PREFIX) \ + c->OP ## _qpel_pixels_tab[SIZE == 8][X+4*Y] = PREFIX ## OP ## _qpel ## SIZE ## _mc ## X ## Y ## _ ## CPU + +#define SET_QPEL_FUNCS3(X, Y, SIZE, CPU, PREFIX) \ + SET_QPEL_FUNC(avg, X, Y, SIZE, CPU, PREFIX); \ + SET_QPEL_FUNC(put, X, Y, SIZE, CPU, PREFIX); \ + SET_QPEL_FUNC(put_no_rnd, X, Y, SIZE, CPU, PREFIX) + +#define SET_V_QPEL_FUNCS(SIZE, CPU, PREFIX) \ + SET_QPEL_FUNCS3(0, 1, SIZE, CPU, PREFIX); \ + SET_QPEL_FUNCS3(0, 2, SIZE, CPU, PREFIX); \ + SET_QPEL_FUNCS3(0, 3, SIZE, CPU, PREFIX) + +#define SET_HV_QPEL_FUNCS(SIZE, CPU, PREFIX) \ + SET_QPEL_FUNCS3(1, 1, SIZE, CPU, PREFIX); \ + SET_QPEL_FUNCS3(1, 2, SIZE, CPU, PREFIX); \ + SET_QPEL_FUNCS3(1, 3, SIZE, CPU, PREFIX); \ + SET_QPEL_FUNCS3(2, 1, SIZE, CPU, PREFIX); \ + SET_QPEL_FUNCS3(2, 2, SIZE, CPU, PREFIX); \ + SET_QPEL_FUNCS3(2, 3, SIZE, CPU, PREFIX); \ + SET_QPEL_FUNCS3(3, 1, SIZE, CPU, PREFIX); \ + SET_QPEL_FUNCS3(3, 2, SIZE, CPU, PREFIX); \ + SET_QPEL_FUNCS3(3, 3, SIZE, CPU, PREFIX) + #define SET_QPEL_FUNCS(PFX, IDX, SIZE, CPU, PREFIX) \ do { \ c->PFX ## _pixels_tab[IDX][ 1] = PREFIX ## PFX ## SIZE ## _mc10_ ## CPU; \ @@ -313,6 +342,11 @@ av_cold void ff_qpeldsp_init_x86(QpelDSPContext *c) c->put_no_rnd_qpel_pixels_tab[1][0] = c->put_qpel_pixels_tab[1][0] = ff_put_pixels8x8_sse2; c->avg_qpel_pixels_tab[0][0] = ff_avg_pixels16x16_sse2; + + SET_V_QPEL_FUNCS (16, sse2,); + SET_HV_QPEL_FUNCS(16, sse2,); + SET_V_QPEL_FUNCS (8, sse2,); + SET_HV_QPEL_FUNCS(8, sse2,); } #endif } _______________________________________________ ffmpeg-cvslog mailing list -- [email protected] To unsubscribe send an email to [email protected]
