This is an automated email from the git hooks/post-receive script.

Git pushed a commit to branch master
in repository ffmpeg.

commit 9beecb26704e8d9a4a27c07fd8da05eb94cf45ed
Author:     Andreas Rheinhardt <[email protected]>
AuthorDate: Thu Nov 6 16:51:58 2025 +0100
Commit:     Andreas Rheinhardt <[email protected]>
CommitDate: Thu Apr 30 10:39:33 2026 +0200

    avcodec/x86/qpeldsp: Add SSE2 vertical lowpass functions
    
    Benchmarks ([4], [8] and [12] are pure vertical functions
    and therefore show the biggest improvements):
    
    avg_qpel_pixels_tab[0][4]_c:                           844.5 ( 1.00x)
    avg_qpel_pixels_tab[0][4]_mmxext:                      225.5 ( 3.74x)
    avg_qpel_pixels_tab[0][4]_sse2:                        146.6 ( 5.76x)
    avg_qpel_pixels_tab[0][5]_c:                          1915.9 ( 1.00x)
    avg_qpel_pixels_tab[0][5]_mmxext:                      499.6 ( 3.83x)
    avg_qpel_pixels_tab[0][5]_sse2:                        405.5 ( 4.72x)
    avg_qpel_pixels_tab[0][6]_c:                          1775.9 ( 1.00x)
    avg_qpel_pixels_tab[0][6]_mmxext:                      484.9 ( 3.66x)
    avg_qpel_pixels_tab[0][6]_sse2:                        385.4 ( 4.61x)
    avg_qpel_pixels_tab[0][7]_c:                          1937.0 ( 1.00x)
    avg_qpel_pixels_tab[0][7]_mmxext:                      501.3 ( 3.86x)
    avg_qpel_pixels_tab[0][7]_sse2:                        403.6 ( 4.80x)
    avg_qpel_pixels_tab[0][8]_c:                           976.7 ( 1.00x)
    avg_qpel_pixels_tab[0][8]_mmxext:                      216.9 ( 4.50x)
    avg_qpel_pixels_tab[0][8]_sse2:                        113.1 ( 8.64x)
    avg_qpel_pixels_tab[0][9]_c:                          1971.8 ( 1.00x)
    avg_qpel_pixels_tab[0][9]_mmxext:                      494.9 ( 3.98x)
    avg_qpel_pixels_tab[0][9]_sse2:                        388.3 ( 5.08x)
    avg_qpel_pixels_tab[0][10]_c:                         1900.8 ( 1.00x)
    avg_qpel_pixels_tab[0][10]_mmxext:                     476.4 ( 3.99x)
    avg_qpel_pixels_tab[0][10]_sse2:                       362.4 ( 5.24x)
    avg_qpel_pixels_tab[0][11]_c:                         2003.3 ( 1.00x)
    avg_qpel_pixels_tab[0][11]_mmxext:                     496.5 ( 4.04x)
    avg_qpel_pixels_tab[0][11]_sse2:                       385.9 ( 5.19x)
    avg_qpel_pixels_tab[0][12]_c:                          841.8 ( 1.00x)
    avg_qpel_pixels_tab[0][12]_mmxext:                     226.7 ( 3.71x)
    avg_qpel_pixels_tab[0][12]_sse2:                       143.3 ( 5.87x)
    avg_qpel_pixels_tab[0][13]_c:                         1929.0 ( 1.00x)
    avg_qpel_pixels_tab[0][13]_mmxext:                     499.6 ( 3.86x)
    avg_qpel_pixels_tab[0][13]_sse2:                       412.1 ( 4.68x)
    avg_qpel_pixels_tab[0][14]_c:                         1777.9 ( 1.00x)
    avg_qpel_pixels_tab[0][14]_mmxext:                     484.8 ( 3.67x)
    avg_qpel_pixels_tab[0][14]_sse2:                       385.9 ( 4.61x)
    avg_qpel_pixels_tab[0][15]_c:                         1914.8 ( 1.00x)
    avg_qpel_pixels_tab[0][15]_mmxext:                     501.8 ( 3.82x)
    avg_qpel_pixels_tab[0][15]_sse2:                       405.0 ( 4.73x)
    avg_qpel_pixels_tab[1][4]_c:                           203.4 ( 1.00x)
    avg_qpel_pixels_tab[1][4]_mmxext:                       64.7 ( 3.14x)
    avg_qpel_pixels_tab[1][4]_sse2:                         40.3 ( 5.05x)
    avg_qpel_pixels_tab[1][5]_c:                           488.8 ( 1.00x)
    avg_qpel_pixels_tab[1][5]_mmxext:                      134.6 ( 3.63x)
    avg_qpel_pixels_tab[1][5]_sse2:                        108.5 ( 4.50x)
    avg_qpel_pixels_tab[1][6]_c:                           448.2 ( 1.00x)
    avg_qpel_pixels_tab[1][6]_mmxext:                      128.8 ( 3.48x)
    avg_qpel_pixels_tab[1][6]_sse2:                        102.5 ( 4.37x)
    avg_qpel_pixels_tab[1][7]_c:                           489.6 ( 1.00x)
    avg_qpel_pixels_tab[1][7]_mmxext:                      134.5 ( 3.64x)
    avg_qpel_pixels_tab[1][7]_sse2:                        108.8 ( 4.50x)
    avg_qpel_pixels_tab[1][8]_c:                           223.8 ( 1.00x)
    avg_qpel_pixels_tab[1][8]_mmxext:                       57.5 ( 3.89x)
    avg_qpel_pixels_tab[1][8]_sse2:                         36.3 ( 6.16x)
    avg_qpel_pixels_tab[1][9]_c:                           496.6 ( 1.00x)
    avg_qpel_pixels_tab[1][9]_mmxext:                      129.8 ( 3.82x)
    avg_qpel_pixels_tab[1][9]_sse2:                        105.1 ( 4.72x)
    avg_qpel_pixels_tab[1][10]_c:                          466.1 ( 1.00x)
    avg_qpel_pixels_tab[1][10]_mmxext:                     123.2 ( 3.78x)
    avg_qpel_pixels_tab[1][10]_sse2:                        99.1 ( 4.70x)
    avg_qpel_pixels_tab[1][11]_c:                          497.9 ( 1.00x)
    avg_qpel_pixels_tab[1][11]_mmxext:                     129.9 ( 3.83x)
    avg_qpel_pixels_tab[1][11]_sse2:                       105.4 ( 4.72x)
    avg_qpel_pixels_tab[1][12]_c:                          203.5 ( 1.00x)
    avg_qpel_pixels_tab[1][12]_mmxext:                      63.8 ( 3.19x)
    avg_qpel_pixels_tab[1][12]_sse2:                        38.8 ( 5.25x)
    avg_qpel_pixels_tab[1][13]_c:                          487.9 ( 1.00x)
    avg_qpel_pixels_tab[1][13]_mmxext:                     134.7 ( 3.62x)
    avg_qpel_pixels_tab[1][13]_sse2:                       108.4 ( 4.50x)
    avg_qpel_pixels_tab[1][14]_c:                          447.4 ( 1.00x)
    avg_qpel_pixels_tab[1][14]_mmxext:                     128.2 ( 3.49x)
    avg_qpel_pixels_tab[1][14]_sse2:                       102.4 ( 4.37x)
    avg_qpel_pixels_tab[1][15]_c:                          487.5 ( 1.00x)
    avg_qpel_pixels_tab[1][15]_mmxext:                     134.0 ( 3.64x)
    avg_qpel_pixels_tab[1][15]_sse2:                       109.9 ( 4.44x)
    
    put_no_rnd_qpel_pixels_tab[0][4]_c:                    825.5 ( 1.00x)
    put_no_rnd_qpel_pixels_tab[0][4]_mmxext:               242.5 ( 3.40x)
    put_no_rnd_qpel_pixels_tab[0][4]_sse2:                 136.0 ( 6.07x)
    put_no_rnd_qpel_pixels_tab[0][5]_c:                   1837.4 ( 1.00x)
    put_no_rnd_qpel_pixels_tab[0][5]_mmxext:               542.5 ( 3.39x)
    put_no_rnd_qpel_pixels_tab[0][5]_sse2:                 446.5 ( 4.11x)
    put_no_rnd_qpel_pixels_tab[0][6]_c:                   1766.3 ( 1.00x)
    put_no_rnd_qpel_pixels_tab[0][6]_mmxext:               493.6 ( 3.58x)
    put_no_rnd_qpel_pixels_tab[0][6]_sse2:                 394.6 ( 4.48x)
    put_no_rnd_qpel_pixels_tab[0][7]_c:                   1877.4 ( 1.00x)
    put_no_rnd_qpel_pixels_tab[0][7]_mmxext:               541.9 ( 3.46x)
    put_no_rnd_qpel_pixels_tab[0][7]_sse2:                 447.6 ( 4.19x)
    put_no_rnd_qpel_pixels_tab[0][8]_c:                    785.1 ( 1.00x)
    put_no_rnd_qpel_pixels_tab[0][8]_mmxext:               206.2 ( 3.81x)
    put_no_rnd_qpel_pixels_tab[0][8]_sse2:                 101.6 ( 7.73x)
    put_no_rnd_qpel_pixels_tab[0][9]_c:                   1772.2 ( 1.00x)
    put_no_rnd_qpel_pixels_tab[0][9]_mmxext:               489.5 ( 3.62x)
    put_no_rnd_qpel_pixels_tab[0][9]_sse2:                 394.8 ( 4.49x)
    put_no_rnd_qpel_pixels_tab[0][10]_c:                  1711.5 ( 1.00x)
    put_no_rnd_qpel_pixels_tab[0][10]_mmxext:              461.2 ( 3.71x)
    put_no_rnd_qpel_pixels_tab[0][10]_sse2:                357.9 ( 4.78x)
    put_no_rnd_qpel_pixels_tab[0][11]_c:                  1815.9 ( 1.00x)
    put_no_rnd_qpel_pixels_tab[0][11]_mmxext:              490.8 ( 3.70x)
    put_no_rnd_qpel_pixels_tab[0][11]_sse2:                394.0 ( 4.61x)
    put_no_rnd_qpel_pixels_tab[0][12]_c:                   824.8 ( 1.00x)
    put_no_rnd_qpel_pixels_tab[0][12]_mmxext:              242.9 ( 3.40x)
    put_no_rnd_qpel_pixels_tab[0][12]_sse2:                135.3 ( 6.10x)
    put_no_rnd_qpel_pixels_tab[0][13]_c:                  1843.5 ( 1.00x)
    put_no_rnd_qpel_pixels_tab[0][13]_mmxext:              545.4 ( 3.38x)
    put_no_rnd_qpel_pixels_tab[0][13]_sse2:                444.9 ( 4.14x)
    put_no_rnd_qpel_pixels_tab[0][14]_c:                  1758.1 ( 1.00x)
    put_no_rnd_qpel_pixels_tab[0][14]_mmxext:              497.7 ( 3.53x)
    put_no_rnd_qpel_pixels_tab[0][14]_sse2:                393.5 ( 4.47x)
    put_no_rnd_qpel_pixels_tab[0][15]_c:                  1861.3 ( 1.00x)
    put_no_rnd_qpel_pixels_tab[0][15]_mmxext:              545.0 ( 3.42x)
    put_no_rnd_qpel_pixels_tab[0][15]_sse2:                445.7 ( 4.18x)
    put_no_rnd_qpel_pixels_tab[1][4]_c:                    198.3 ( 1.00x)
    put_no_rnd_qpel_pixels_tab[1][4]_mmxext:                64.3 ( 3.08x)
    put_no_rnd_qpel_pixels_tab[1][4]_sse2:                  39.8 ( 4.98x)
    put_no_rnd_qpel_pixels_tab[1][5]_c:                    460.7 ( 1.00x)
    put_no_rnd_qpel_pixels_tab[1][5]_mmxext:               137.2 ( 3.36x)
    put_no_rnd_qpel_pixels_tab[1][5]_sse2:                 113.5 ( 4.06x)
    put_no_rnd_qpel_pixels_tab[1][6]_c:                    441.4 ( 1.00x)
    put_no_rnd_qpel_pixels_tab[1][6]_mmxext:               126.7 ( 3.49x)
    put_no_rnd_qpel_pixels_tab[1][6]_sse2:                 103.7 ( 4.26x)
    put_no_rnd_qpel_pixels_tab[1][7]_c:                    465.9 ( 1.00x)
    put_no_rnd_qpel_pixels_tab[1][7]_mmxext:               137.7 ( 3.38x)
    put_no_rnd_qpel_pixels_tab[1][7]_sse2:                 114.0 ( 4.09x)
    put_no_rnd_qpel_pixels_tab[1][8]_c:                    193.8 ( 1.00x)
    put_no_rnd_qpel_pixels_tab[1][8]_mmxext:                52.1 ( 3.72x)
    put_no_rnd_qpel_pixels_tab[1][8]_sse2:                  27.8 ( 6.97x)
    put_no_rnd_qpel_pixels_tab[1][9]_c:                    450.9 ( 1.00x)
    put_no_rnd_qpel_pixels_tab[1][9]_mmxext:               126.2 ( 3.57x)
    put_no_rnd_qpel_pixels_tab[1][9]_sse2:                 104.3 ( 4.32x)
    put_no_rnd_qpel_pixels_tab[1][10]_c:                   436.5 ( 1.00x)
    put_no_rnd_qpel_pixels_tab[1][10]_mmxext:              118.1 ( 3.69x)
    put_no_rnd_qpel_pixels_tab[1][10]_sse2:                 92.4 ( 4.73x)
    put_no_rnd_qpel_pixels_tab[1][11]_c:                   453.6 ( 1.00x)
    put_no_rnd_qpel_pixels_tab[1][11]_mmxext:              128.7 ( 3.52x)
    put_no_rnd_qpel_pixels_tab[1][11]_sse2:                103.6 ( 4.38x)
    put_no_rnd_qpel_pixels_tab[1][12]_c:                   201.2 ( 1.00x)
    put_no_rnd_qpel_pixels_tab[1][12]_mmxext:               64.2 ( 3.13x)
    put_no_rnd_qpel_pixels_tab[1][12]_sse2:                 39.6 ( 5.08x)
    put_no_rnd_qpel_pixels_tab[1][13]_c:                   461.9 ( 1.00x)
    put_no_rnd_qpel_pixels_tab[1][13]_mmxext:              137.6 ( 3.36x)
    put_no_rnd_qpel_pixels_tab[1][13]_sse2:                113.4 ( 4.07x)
    put_no_rnd_qpel_pixels_tab[1][14]_c:                   442.6 ( 1.00x)
    put_no_rnd_qpel_pixels_tab[1][14]_mmxext:              127.0 ( 3.49x)
    put_no_rnd_qpel_pixels_tab[1][14]_sse2:                102.2 ( 4.33x)
    put_no_rnd_qpel_pixels_tab[1][15]_c:                   462.9 ( 1.00x)
    put_no_rnd_qpel_pixels_tab[1][15]_mmxext:              139.5 ( 3.32x)
    put_no_rnd_qpel_pixels_tab[1][15]_sse2:                113.3 ( 4.09x)
    
    put_qpel_pixels_tab[0][4]_c:                           824.6 ( 1.00x)
    put_qpel_pixels_tab[0][4]_mmxext:                      220.1 ( 3.75x)
    put_qpel_pixels_tab[0][4]_sse2:                        137.8 ( 5.98x)
    put_qpel_pixels_tab[0][5]_c:                          1892.0 ( 1.00x)
    put_qpel_pixels_tab[0][5]_mmxext:                      508.0 ( 3.72x)
    put_qpel_pixels_tab[0][5]_sse2:                        408.6 ( 4.63x)
    put_qpel_pixels_tab[0][6]_c:                          1758.0 ( 1.00x)
    put_qpel_pixels_tab[0][6]_mmxext:                      476.7 ( 3.69x)
    put_qpel_pixels_tab[0][6]_sse2:                        381.4 ( 4.61x)
    put_qpel_pixels_tab[0][7]_c:                          1924.3 ( 1.00x)
    put_qpel_pixels_tab[0][7]_mmxext:                      495.1 ( 3.89x)
    put_qpel_pixels_tab[0][7]_sse2:                        417.2 ( 4.61x)
    put_qpel_pixels_tab[0][8]_c:                           772.1 ( 1.00x)
    put_qpel_pixels_tab[0][8]_mmxext:                      197.5 ( 3.91x)
    put_qpel_pixels_tab[0][8]_sse2:                        118.4 ( 6.52x)
    put_qpel_pixels_tab[0][9]_c:                          1778.2 ( 1.00x)
    put_qpel_pixels_tab[0][9]_mmxext:                      476.7 ( 3.73x)
    put_qpel_pixels_tab[0][9]_sse2:                        379.6 ( 4.68x)
    put_qpel_pixels_tab[0][10]_c:                         1714.6 ( 1.00x)
    put_qpel_pixels_tab[0][10]_mmxext:                     460.7 ( 3.72x)
    put_qpel_pixels_tab[0][10]_sse2:                       386.8 ( 4.43x)
    put_qpel_pixels_tab[0][11]_c:                         1819.1 ( 1.00x)
    put_qpel_pixels_tab[0][11]_mmxext:                     474.9 ( 3.83x)
    put_qpel_pixels_tab[0][11]_sse2:                       404.5 ( 4.50x)
    put_qpel_pixels_tab[0][12]_c:                          829.7 ( 1.00x)
    put_qpel_pixels_tab[0][12]_mmxext:                     221.5 ( 3.75x)
    put_qpel_pixels_tab[0][12]_sse2:                       138.7 ( 5.98x)
    put_qpel_pixels_tab[0][13]_c:                         1892.8 ( 1.00x)
    put_qpel_pixels_tab[0][13]_mmxext:                     494.4 ( 3.83x)
    put_qpel_pixels_tab[0][13]_sse2:                       413.9 ( 4.57x)
    put_qpel_pixels_tab[0][14]_c:                         1763.1 ( 1.00x)
    put_qpel_pixels_tab[0][14]_mmxext:                     473.4 ( 3.72x)
    put_qpel_pixels_tab[0][14]_sse2:                       377.8 ( 4.67x)
    put_qpel_pixels_tab[0][15]_c:                         1896.4 ( 1.00x)
    put_qpel_pixels_tab[0][15]_mmxext:                     492.5 ( 3.85x)
    put_qpel_pixels_tab[0][15]_sse2:                       399.0 ( 4.75x)
    put_qpel_pixels_tab[1][4]_c:                           198.6 ( 1.00x)
    put_qpel_pixels_tab[1][4]_mmxext:                       60.9 ( 3.26x)
    put_qpel_pixels_tab[1][4]_sse2:                         40.1 ( 4.95x)
    put_qpel_pixels_tab[1][5]_c:                           471.4 ( 1.00x)
    put_qpel_pixels_tab[1][5]_mmxext:                      131.8 ( 3.58x)
    put_qpel_pixels_tab[1][5]_sse2:                        107.2 ( 4.40x)
    put_qpel_pixels_tab[1][6]_c:                           440.3 ( 1.00x)
    put_qpel_pixels_tab[1][6]_mmxext:                      126.3 ( 3.49x)
    put_qpel_pixels_tab[1][6]_sse2:                        100.6 ( 4.38x)
    put_qpel_pixels_tab[1][7]_c:                           469.2 ( 1.00x)
    put_qpel_pixels_tab[1][7]_mmxext:                      131.7 ( 3.56x)
    put_qpel_pixels_tab[1][7]_sse2:                        106.9 ( 4.39x)
    put_qpel_pixels_tab[1][8]_c:                           194.2 ( 1.00x)
    put_qpel_pixels_tab[1][8]_mmxext:                       52.9 ( 3.67x)
    put_qpel_pixels_tab[1][8]_sse2:                         28.0 ( 6.95x)
    put_qpel_pixels_tab[1][9]_c:                           464.6 ( 1.00x)
    put_qpel_pixels_tab[1][9]_mmxext:                      125.1 ( 3.71x)
    put_qpel_pixels_tab[1][9]_sse2:                        100.9 ( 4.60x)
    put_qpel_pixels_tab[1][10]_c:                          433.8 ( 1.00x)
    put_qpel_pixels_tab[1][10]_mmxext:                     118.2 ( 3.67x)
    put_qpel_pixels_tab[1][10]_sse2:                        94.5 ( 4.59x)
    put_qpel_pixels_tab[1][11]_c:                          463.9 ( 1.00x)
    put_qpel_pixels_tab[1][11]_mmxext:                     125.5 ( 3.70x)
    put_qpel_pixels_tab[1][11]_sse2:                       102.6 ( 4.52x)
    put_qpel_pixels_tab[1][12]_c:                          199.2 ( 1.00x)
    put_qpel_pixels_tab[1][12]_mmxext:                      63.7 ( 3.12x)
    put_qpel_pixels_tab[1][12]_sse2:                        36.2 ( 5.50x)
    put_qpel_pixels_tab[1][13]_c:                          475.6 ( 1.00x)
    put_qpel_pixels_tab[1][13]_mmxext:                     139.5 ( 3.41x)
    put_qpel_pixels_tab[1][13]_sse2:                       107.3 ( 4.43x)
    put_qpel_pixels_tab[1][14]_c:                          441.9 ( 1.00x)
    put_qpel_pixels_tab[1][14]_mmxext:                     126.9 ( 3.48x)
    put_qpel_pixels_tab[1][14]_sse2:                       101.3 ( 4.36x)
    put_qpel_pixels_tab[1][15]_c:                          475.9 ( 1.00x)
    put_qpel_pixels_tab[1][15]_mmxext:                     131.9 ( 3.61x)
    put_qpel_pixels_tab[1][15]_sse2:                       107.0 ( 4.45x)
    
    The new functions (in qpeldsp.asm) occupy 8244B (the MMXEXT functions
    which they will replace occupy only 6720B).
    
    Signed-off-by: Andreas Rheinhardt <[email protected]>
---
 libavcodec/x86/qpeldsp.asm    | 144 ++++++++++++++++++++++++++----------------
 libavcodec/x86/qpeldsp_init.c |  34 ++++++++++
 2 files changed, 123 insertions(+), 55 deletions(-)

diff --git a/libavcodec/x86/qpeldsp.asm b/libavcodec/x86/qpeldsp.asm
index fd97b71134..d6c8778151 100644
--- a/libavcodec/x86/qpeldsp.asm
+++ b/libavcodec/x86/qpeldsp.asm
@@ -26,9 +26,9 @@
 SECTION_RODATA
 
 cextern pw_3
-pw_15: times 4 dw 15
+pw_15: times 8 dw 15
 cextern pw_16
-pw_20: times 4 dw 20
+pw_20: times 8 dw 20
 
 
 SECTION .text
@@ -396,68 +396,75 @@ MPEG4_QPEL8_H_LOWPASS put_no_rnd
     paddw      m5, m4
     psraw      m5, 5
     packuswb   m5, m5
-    OP_MOV     %5, m5, m7
+    OP_MOV     %5, m5, m4
     SWAP 0,1,2,3
 %endmacro
 
 %macro MPEG4_QPEL16_V_LOWPASS 1
-cglobal %1_mpeg4_qpel16_v_lowpass, 4, 6, 0, 544
+cglobal %1_mpeg4_qpel16_v_lowpass, 4, 6, 7, 544
     mov         r4d, 17
     mov          r5, rsp
-    pxor         m7, m7
+    pxor         m4, m4
 .looph:
-    mova         m0, [r1]
-    mova         m1, [r1]
+    movu         m0, [r1]
+    mova         m1, m0
+%if mmsize == 8
     mova         m2, [r1+8]
     mova         m3, [r1+8]
-    punpcklbw    m0, m7
-    punpckhbw    m1, m7
-    punpcklbw    m2, m7
-    punpckhbw    m3, m7
+    punpcklbw    m0, m4
+    punpckhbw    m1, m4
+    punpcklbw    m2, m4
+    punpckhbw    m3, m4
     mova       [r5], m0
     mova  [r5+0x88], m1
     mova [r5+0x110], m2
     mova [r5+0x198], m3
-    add          r5, 8
+%else
+    punpcklbw    m0, m4
+    punpckhbw    m1, m4
+    mova       [r5], m0
+    mova [r5+0x110], m1
+%endif
     add          r1, r3
+    add          r5, mmsize
     dec r4d
     jne .looph
 
 
-    mov         r4d, 4
+    mov         r4d, 16/(mmsize/2)
     mov          r1, r0
     mov          r5, rsp
 .loopv:
-    mova         m0, [r5+ 0x0]
-    mova         m1, [r5+ 0x8]
-    mova         m2, [r5+0x10]
-    mova         m3, [r5+0x18]
-    add          r1, 4
-    QPEL_V_LOW [r5+0x10], [r5+ 0x8], [r5+ 0x0], [r5+0x20], [r0]
-    QPEL_V_LOW [r5+ 0x8], [r5+ 0x0], [r5+ 0x0], [r5+0x28], [r0+r2]
+    mova         m0, [r5+0 * mmsize]
+    mova         m1, [r5+1 * mmsize]
+    mova         m2, [r5+2 * mmsize]
+    mova         m3, [r5+3 * mmsize]
+    add          r1, mmsize/2
+    QPEL_V_LOW [r5+2*mmsize],  [r5+1*mmsize],  [r5+0*mmsize],  [r5+4*mmsize],  [r0]
+    QPEL_V_LOW [r5+1*mmsize],  [r5+0*mmsize],  [r5+0*mmsize],  [r5+5*mmsize],  [r0+r2]
     lea    r0, [r0+r2*2]
-    QPEL_V_LOW [r5+ 0x0], [r5+ 0x0], [r5+ 0x8], [r5+0x30], [r0]
-    QPEL_V_LOW [r5+ 0x0], [r5+ 0x8], [r5+0x10], [r5+0x38], [r0+r2]
+    QPEL_V_LOW [r5+0*mmsize],  [r5+0*mmsize],  [r5+1*mmsize],  [r5+6*mmsize],  [r0]
+    QPEL_V_LOW [r5+0*mmsize],  [r5+1*mmsize],  [r5+2*mmsize],  [r5+7*mmsize],  [r0+r2]
     lea    r0, [r0+r2*2]
-    QPEL_V_LOW [r5+ 0x8], [r5+0x10], [r5+0x18], [r5+0x40], [r0]
-    QPEL_V_LOW [r5+0x10], [r5+0x18], [r5+0x20], [r5+0x48], [r0+r2]
+    QPEL_V_LOW [r5+1*mmsize],  [r5+2*mmsize],  [r5+3*mmsize],  [r5+8*mmsize],  [r0]
+    QPEL_V_LOW [r5+2*mmsize],  [r5+3*mmsize],  [r5+4*mmsize],  [r5+9*mmsize],  [r0+r2]
     lea    r0, [r0+r2*2]
-    QPEL_V_LOW [r5+0x18], [r5+0x20], [r5+0x28], [r5+0x50], [r0]
-    QPEL_V_LOW [r5+0x20], [r5+0x28], [r5+0x30], [r5+0x58], [r0+r2]
+    QPEL_V_LOW [r5+3*mmsize],  [r5+4*mmsize],  [r5+5*mmsize],  [r5+10*mmsize], [r0]
+    QPEL_V_LOW [r5+4*mmsize],  [r5+5*mmsize],  [r5+6*mmsize],  [r5+11*mmsize], [r0+r2]
     lea    r0, [r0+r2*2]
-    QPEL_V_LOW [r5+0x28], [r5+0x30], [r5+0x38], [r5+0x60], [r0]
-    QPEL_V_LOW [r5+0x30], [r5+0x38], [r5+0x40], [r5+0x68], [r0+r2]
+    QPEL_V_LOW [r5+5*mmsize],  [r5+6*mmsize],  [r5+7*mmsize],  [r5+12*mmsize], [r0]
+    QPEL_V_LOW [r5+6*mmsize],  [r5+7*mmsize],  [r5+8*mmsize],  [r5+13*mmsize], [r0+r2]
     lea    r0, [r0+r2*2]
-    QPEL_V_LOW [r5+0x38], [r5+0x40], [r5+0x48], [r5+0x70], [r0]
-    QPEL_V_LOW [r5+0x40], [r5+0x48], [r5+0x50], [r5+0x78], [r0+r2]
+    QPEL_V_LOW [r5+7*mmsize],  [r5+8*mmsize],  [r5+ 9*mmsize], [r5+14*mmsize], [r0]
+    QPEL_V_LOW [r5+8*mmsize],  [r5+9*mmsize],  [r5+10*mmsize], [r5+15*mmsize], [r0+r2]
     lea    r0, [r0+r2*2]
-    QPEL_V_LOW [r5+0x48], [r5+0x50], [r5+0x58], [r5+0x80], [r0]
-    QPEL_V_LOW [r5+0x50], [r5+0x58], [r5+0x60], [r5+0x80], [r0+r2]
+    QPEL_V_LOW [r5+ 9*mmsize], [r5+10*mmsize], [r5+11*mmsize], [r5+16*mmsize], [r0]
+    QPEL_V_LOW [r5+10*mmsize], [r5+11*mmsize], [r5+12*mmsize], [r5+16*mmsize], [r0+r2]
     lea    r0, [r0+r2*2]
-    QPEL_V_LOW [r5+0x58], [r5+0x60], [r5+0x68], [r5+0x78], [r0]
-    QPEL_V_LOW [r5+0x60], [r5+0x68], [r5+0x70], [r5+0x70], [r0+r2]
+    QPEL_V_LOW [r5+11*mmsize], [r5+12*mmsize], [r5+13*mmsize], [r5+15*mmsize], [r0]
+    QPEL_V_LOW [r5+12*mmsize], [r5+13*mmsize], [r5+14*mmsize], [r5+14*mmsize], [r0+r2]
 
-    add    r5, 0x88
+    add    r5, 17*mmsize
     mov    r0, r1
     dec r4d
     jne .loopv
@@ -488,47 +495,60 @@ MPEG4_QPEL16_V_LOWPASS put_no_rnd
 
 
 %macro MPEG4_QPEL8_V_LOWPASS 1
-cglobal %1_mpeg4_qpel8_v_lowpass, 4, 6, 0, 144
+cglobal %1_mpeg4_qpel8_v_lowpass, 4, 6, 7, 144
     mov         r4d, 9
     mov          r5, rsp
-    pxor         m7, m7
+    pxor         m2, m2
 .looph:
-    mova         m0, [r1]
-    mova         m1, [r1]
-    punpcklbw    m0, m7
-    punpckhbw    m1, m7
+    movq         m0, [r1]
+    add          r1, r3
+%if mmsize == 8
+    mova         m1, m0
+    punpcklbw    m0, m2
+    punpckhbw    m1, m2
     mova       [r5], m0
     mova  [r5+0x48], m1
-    add          r5, 8
-    add          r1, r3
+%else
+    punpcklbw    m0, m2
+    mova       [r5], m0
+%endif
+    add          r5, mmsize
     dec r4d
     jne .looph
 
 
+%if mmsize == 8
     mov         r4d, 2
     mov          r1, r0
     mov          r5, rsp
 .loopv:
-    mova         m0, [r5+ 0x0]
-    mova         m1, [r5+ 0x8]
-    mova         m2, [r5+0x10]
-    mova         m3, [r5+0x18]
-    QPEL_V_LOW [r5+0x10], [r5+ 0x8], [r5+ 0x0], [r5+0x20], [r0]
-    QPEL_V_LOW [r5+ 0x8], [r5+ 0x0], [r5+ 0x0], [r5+0x28], [r0+r2]
+%define R5 r5
+%else
+%define R5 rsp
+%endif
+
+    mova         m0, [R5+0 * mmsize]
+    mova         m1, [R5+1 * mmsize]
+    mova         m2, [R5+2 * mmsize]
+    mova         m3, [R5+3 * mmsize]
+    QPEL_V_LOW [R5+2*mmsize], [R5+1*mmsize], [R5+0*mmsize], [R5+4*mmsize], [r0]
+    QPEL_V_LOW [R5+1*mmsize], [R5+0*mmsize], [R5+0*mmsize], [R5+5*mmsize], [r0+r2]
     lea    r0, [r0+r2*2]
-    QPEL_V_LOW [r5+ 0x0], [r5+ 0x0], [r5+ 0x8], [r5+0x30], [r0]
-    QPEL_V_LOW [r5+ 0x0], [r5+ 0x8], [r5+0x10], [r5+0x38], [r0+r2]
+    QPEL_V_LOW [R5+0*mmsize], [R5+0*mmsize], [R5+1*mmsize], [R5+6*mmsize], [r0]
+    QPEL_V_LOW [R5+0*mmsize], [R5+1*mmsize], [R5+2*mmsize], [R5+7*mmsize], [r0+r2]
     lea    r0, [r0+r2*2]
-    QPEL_V_LOW [r5+ 0x8], [r5+0x10], [r5+0x18], [r5+0x40], [r0]
-    QPEL_V_LOW [r5+0x10], [r5+0x18], [r5+0x20], [r5+0x40], [r0+r2]
+    QPEL_V_LOW [R5+1*mmsize], [R5+2*mmsize], [R5+3*mmsize], [R5+8*mmsize], [r0]
+    QPEL_V_LOW [R5+2*mmsize], [R5+3*mmsize], [R5+4*mmsize], [R5+8*mmsize], [r0+r2]
     lea    r0, [r0+r2*2]
-    QPEL_V_LOW [r5+0x18], [r5+0x20], [r5+0x28], [r5+0x38], [r0]
-    QPEL_V_LOW [r5+0x20], [r5+0x28], [r5+0x30], [r5+0x30], [r0+r2]
+    QPEL_V_LOW [R5+3*mmsize], [R5+4*mmsize], [R5+5*mmsize], [R5+7*mmsize], [r0]
+    QPEL_V_LOW [R5+4*mmsize], [R5+5*mmsize], [R5+6*mmsize], [R5+6*mmsize], [r0+r2]
 
+%if mmsize == 8
     add    r5, 0x48
     lea    r0, [r1+4]
     dec r4d
     jne .loopv
+%endif
     RET
 %endmacro
 
@@ -542,3 +562,17 @@ MPEG4_QPEL8_V_LOWPASS avg
 %define PW_ROUND pw_15
 %define OP_MOV PUT_OPH
 MPEG4_QPEL8_V_LOWPASS put_no_rnd
+
+INIT_XMM sse2
+%define PW_ROUND pw_16
+%define OP_MOV PUT_OPH
+MPEG4_QPEL16_V_LOWPASS put
+MPEG4_QPEL8_V_LOWPASS put
+%define PW_ROUND pw_16
+%define OP_MOV AVG_OPH
+MPEG4_QPEL16_V_LOWPASS avg
+MPEG4_QPEL8_V_LOWPASS avg
+%define PW_ROUND pw_15
+%define OP_MOV PUT_OPH
+MPEG4_QPEL16_V_LOWPASS put_no_rnd
+MPEG4_QPEL8_V_LOWPASS put_no_rnd
diff --git a/libavcodec/x86/qpeldsp_init.c b/libavcodec/x86/qpeldsp_init.c
index 7bcd465d2f..025753ce17 100644
--- a/libavcodec/x86/qpeldsp_init.c
+++ b/libavcodec/x86/qpeldsp_init.c
@@ -271,6 +271,35 @@ QPEL3(QPEL_H,  16, 17, mmxext, mmxext, mmxext, mmxext)
 QPEL3(QPEL_V,  16, 17, mmxext, mmxext, mmxext, mmxext)
 QPEL3(QPEL_HV, 16, 17, mmxext, mmxext, mmxext, mmxext)
 
+QPEL3(QPEL_V,   8,  9, ssse3, sse2, ssse3, mmxext)
+QPEL3(QPEL_HV,  8,  9, mmxext, sse2, sse2, mmxext)
+QPEL3(QPEL_V,  16, 17, ssse3, sse2, ssse3, mmxext)
+QPEL3(QPEL_HV, 16, 17, mmxext, sse2, sse2, mmxext)
+
+#define SET_QPEL_FUNC(OP, X, Y, SIZE, CPU, PREFIX) \
+    c->OP ## _qpel_pixels_tab[SIZE == 8][X+4*Y] = PREFIX ## OP ## _qpel ## SIZE ## _mc ## X ## Y ## _ ## CPU
+
+#define SET_QPEL_FUNCS3(X, Y, SIZE, CPU, PREFIX)        \
+    SET_QPEL_FUNC(avg,        X, Y, SIZE, CPU, PREFIX); \
+    SET_QPEL_FUNC(put,        X, Y, SIZE, CPU, PREFIX); \
+    SET_QPEL_FUNC(put_no_rnd, X, Y, SIZE, CPU, PREFIX)
+
+#define SET_V_QPEL_FUNCS(SIZE, CPU, PREFIX)   \
+    SET_QPEL_FUNCS3(0, 1, SIZE, CPU, PREFIX); \
+    SET_QPEL_FUNCS3(0, 2, SIZE, CPU, PREFIX); \
+    SET_QPEL_FUNCS3(0, 3, SIZE, CPU, PREFIX)
+
+#define SET_HV_QPEL_FUNCS(SIZE, CPU, PREFIX)  \
+    SET_QPEL_FUNCS3(1, 1, SIZE, CPU, PREFIX); \
+    SET_QPEL_FUNCS3(1, 2, SIZE, CPU, PREFIX); \
+    SET_QPEL_FUNCS3(1, 3, SIZE, CPU, PREFIX); \
+    SET_QPEL_FUNCS3(2, 1, SIZE, CPU, PREFIX); \
+    SET_QPEL_FUNCS3(2, 2, SIZE, CPU, PREFIX); \
+    SET_QPEL_FUNCS3(2, 3, SIZE, CPU, PREFIX); \
+    SET_QPEL_FUNCS3(3, 1, SIZE, CPU, PREFIX); \
+    SET_QPEL_FUNCS3(3, 2, SIZE, CPU, PREFIX); \
+    SET_QPEL_FUNCS3(3, 3, SIZE, CPU, PREFIX)
+
 #define SET_QPEL_FUNCS(PFX, IDX, SIZE, CPU, PREFIX)                          \
 do {                                                                         \
     c->PFX ## _pixels_tab[IDX][ 1] = PREFIX ## PFX ## SIZE ## _mc10_ ## CPU; \
@@ -313,6 +342,11 @@ av_cold void ff_qpeldsp_init_x86(QpelDSPContext *c)
         c->put_no_rnd_qpel_pixels_tab[1][0] =
         c->put_qpel_pixels_tab[1][0] = ff_put_pixels8x8_sse2;
         c->avg_qpel_pixels_tab[0][0] = ff_avg_pixels16x16_sse2;
+
+        SET_V_QPEL_FUNCS (16, sse2,);
+        SET_HV_QPEL_FUNCS(16, sse2,);
+        SET_V_QPEL_FUNCS (8,  sse2,);
+        SET_HV_QPEL_FUNCS(8,  sse2,);
     }
 #endif
 }

_______________________________________________
ffmpeg-cvslog mailing list -- [email protected]
To unsubscribe send an email to [email protected]

Reply via email to