This is an automated email from the git hooks/post-receive script. Git pushed a commit to branch master in repository ffmpeg.
commit 23d3116af93027866689e52e50533cd3121679ab
Author:     Andreas Rheinhardt <[email protected]>
AuthorDate: Mon Apr 27 17:33:29 2026 +0200
Commit:     Andreas Rheinhardt <[email protected]>
CommitDate: Thu Apr 30 10:39:33 2026 +0200

avcodec/x86/qpeldsp: Add combination of h_lowpass + l2

If the subpel part of the horizontal component of the motion vector is
1/4 or 3/4, the MPEG-4 qpel motion compensation first computes the mc
for the corresponding motion vector with 1/2 horizontal subpel part and
then averages this with the left (for 1/4) or the right (for 3/4)
source pixel. These two stages are currently performed in two different
functions, involving a stack buffer as intermediate. This means that
horizontal prediction for every function with a 1/4 or 3/4 horizontal
subpel mv is more expensive code-size wise (and also performance-wise)
as it involves two calls.

Given that the horizontal lowpass functions are not that long, adding
combinations of h_lowpass+l2 actually reduces binary size: An increase
of 1136B in the asm files is more than offset by size reductions in the
wrappers: 1968B here when not using stack protection, 2256B when using
stack protection. Of course it also improves performance.

Old benchmarks: avg_qpel_pixels_tab[0][1]_ssse3: 106.9 ( 8.69x) avg_qpel_pixels_tab[0][3]_ssse3: 105.5 ( 8.84x) avg_qpel_pixels_tab[0][5]_ssse3: 226.9 ( 8.57x) avg_qpel_pixels_tab[0][7]_ssse3: 231.1 ( 8.38x) avg_qpel_pixels_tab[0][9]_ssse3: 217.8 ( 9.04x) avg_qpel_pixels_tab[0][11]_ssse3: 214.9 ( 9.32x) avg_qpel_pixels_tab[0][13]_ssse3: 227.1 ( 8.48x) avg_qpel_pixels_tab[0][15]_ssse3: 236.1 ( 8.02x) New benchmarks: avg_qpel_pixels_tab[0][1]_ssse3: 96.7 ( 9.65x) avg_qpel_pixels_tab[0][3]_ssse3: 96.6 ( 9.73x) avg_qpel_pixels_tab[0][5]_ssse3: 225.8 ( 8.61x) avg_qpel_pixels_tab[0][7]_ssse3: 228.4 ( 8.51x) avg_qpel_pixels_tab[0][9]_ssse3: 217.1 ( 9.05x) avg_qpel_pixels_tab[0][11]_ssse3: 217.8 ( 9.32x) avg_qpel_pixels_tab[0][13]_ssse3: 227.2 ( 8.54x) avg_qpel_pixels_tab[0][15]_ssse3: 220.5 ( 8.72x) Note: The l2 functions are also used for vertical lowpass functions, yet given that they are much bigger, duplicating them would lead to massive code size increase. Signed-off-by: Andreas Rheinhardt <[email protected]> --- libavcodec/x86/qpel.asm | 20 ++-------- libavcodec/x86/qpeldsp.asm | 86 +++++++++++++++++++++++++++++++------------ libavcodec/x86/qpeldsp_init.c | 68 +++++++++++----------------------- 3 files changed, 87 insertions(+), 87 deletions(-) diff --git a/libavcodec/x86/qpel.asm b/libavcodec/x86/qpel.asm index 8382acaa01..cffab9dc91 100644 --- a/libavcodec/x86/qpel.asm +++ b/libavcodec/x86/qpel.asm @@ -34,25 +34,11 @@ SECTION .text mova %2, %1 %endmacro -%macro PIXELS_L2 2-3 ; avg vs put, size, size+1 +%macro PIXELS_L2 2 ; avg vs put, size %define OP op_%1 -%ifidn %1, put -; void ff_put_pixels8x9_l2_mmxext(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, -; ptrdiff_t dstStride, ptrdiff_t src1Stride) -cglobal put_pixels%2x%3_l2, 5,6,2 - movu m0, [r1] - pavgb m0, [r2] - add r1, r4 - add r2, mmsize - OP m0, [r0] - add r0, r3 - ; FIXME: avoid jump if prologue is empty - jmp %1_pixels%2x%2_after_prologue_ %+ cpuname -%endif ; void 
ff_avg/put_pixels8x8_l2_mmxext(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, ; ptrdiff_t dstStride, ptrdiff_t src1Stride) cglobal %1_pixels%2x%2_l2, 5,6,2 -%1_pixels%2x%2_after_prologue_ %+ cpuname: mov r5d, %2 .loop: movu m0, [r1] @@ -78,9 +64,9 @@ cglobal %1_pixels%2x%2_l2, 5,6,2 %endmacro INIT_MMX mmxext -PIXELS_L2 put, 8, 9 +PIXELS_L2 put, 8 PIXELS_L2 avg, 8 INIT_XMM sse2 -PIXELS_L2 put, 16, 17 +PIXELS_L2 put, 16 PIXELS_L2 avg, 16 diff --git a/libavcodec/x86/qpeldsp.asm b/libavcodec/x86/qpeldsp.asm index d6e8b54537..d398e44436 100644 --- a/libavcodec/x86/qpeldsp.asm +++ b/libavcodec/x86/qpeldsp.asm @@ -44,28 +44,11 @@ coeff16_1: times 2 db 20, -6, 20, -6, -6, 20, -6, 20 SECTION .text -%macro PUT_NO_RND_PIXELS_L2 2 -; void ff_put_no_rnd_pixels8x9_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, -; ptrdiff_t dstStride, ptrdiff_t src1Stride) -cglobal put_no_rnd_pixels%1x%2_l2, 5,6,5 - movu m0, [r1] - mova m1, [r2] - pcmpeqb m4, m4 - add r1, r4 - add r2, %1 - pxor m0, m4 - pxor m1, m4 - pavgb m0, m1 - pxor m0, m4 - mova [r0], m0 - add r0, r3 - jmp put_no_rnd_pixels%1x%1_after_prologue_ %+ cpuname - +%macro PUT_NO_RND_PIXELS_L2 1 ; void ff_put_no_rnd_pixels8x8_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, ; ptrdiff_t dstStride, ptrdiff_t src1Stride) cglobal put_no_rnd_pixels%1x%1_l2, 5,6,5 pcmpeqb m4, m4 -put_no_rnd_pixels%1x%1_after_prologue_ %+ cpuname: mov r5d, %1 .loop: movu m0, [r1] @@ -111,14 +94,42 @@ put_no_rnd_pixels%1x%1_after_prologue_ %+ cpuname: %endmacro INIT_MMX mmxext -PUT_NO_RND_PIXELS_L2 8, 9 +PUT_NO_RND_PIXELS_L2 8 INIT_XMM sse2 -PUT_NO_RND_PIXELS_L2 16, 17 +PUT_NO_RND_PIXELS_L2 16 +%macro L2 5 +%ifidn %2, l2 +%ifidn %1, put_no_rnd +%ifn UNIX64 + pcmpeqb %5, %5 +%endif + pxor %4, PW_FF + pxor %3, PW_FF + pavgb %3, %4 + pxor %3, PW_FF +%else ; avg or put + pavgb %3, %4 +%endif +%endif +%endmacro -%macro MPEG4_QPEL16_H_LOWPASS 1 +%macro MPEG4_QPEL16_H_LOWPASS 1-2 "" +%ifidn %2, l2 +cglobal 
%1_mpeg4_qpel16_h_lowpass_l2, 6, 6, 8+UNIX64, dst, src, dstride, srcstride, h, offset +%else cglobal %1_mpeg4_qpel16_h_lowpass, 5, 5, 8, dst, src, dstride, srcstride, h +%endif mova m7, [coeff16_0] +%define PW_FF m1 +%ifidn %1, put_no_rnd +%ifidn %2, l2 +%if UNIX64 + pcmpeqb m8, m8 +%define PW_FF m8 +%endif +%endif +%endif .loop: movu m0, [srcq] pshufb m1, m0, [shuffle_mask16_0] @@ -137,12 +148,15 @@ cglobal %1_mpeg4_qpel16_h_lowpass, 5, 5, 8, dst, src, dstride, srcstride, h paddw m4, [PW_ROUND] palignr m1, m6, m0, 4 pmaddubsw m1, [coeff16_1] - add srcq, srcstrideq paddw m2, m5 palignr m5, m6, m0, 8 pmaddubsw m5, [coeff16_1] palignr m6, m0, 12 +%ifidn %2, l2 + movu m0, [srcq+offsetq] +%endif pmaddubsw m6, m7 + add srcq, srcstrideq paddw m2, [PW_ROUND] paddw m4, m1 paddw m2, m3 @@ -151,6 +165,7 @@ cglobal %1_mpeg4_qpel16_h_lowpass, 5, 5, 8, dst, src, dstride, srcstride, h paddw m4, m6 psraw m4, 5 packuswb m2, m4 + L2 %1, %2, m2, m0, m1 %ifidn %1, avg pavgb m2, [dstq] %endif @@ -164,18 +179,34 @@ cglobal %1_mpeg4_qpel16_h_lowpass, 5, 5, 8, dst, src, dstride, srcstride, h INIT_XMM ssse3 %define PW_ROUND pw_16 MPEG4_QPEL16_H_LOWPASS put +MPEG4_QPEL16_H_LOWPASS put, l2 %define PW_ROUND pw_16 MPEG4_QPEL16_H_LOWPASS avg +MPEG4_QPEL16_H_LOWPASS avg, l2 %define PW_ROUND pw_15 MPEG4_QPEL16_H_LOWPASS put_no_rnd +MPEG4_QPEL16_H_LOWPASS put_no_rnd, l2 -%macro MPEG4_QPEL8_H_LOWPASS 1 +%macro MPEG4_QPEL8_H_LOWPASS 1-2 "" +%ifidn %2, l2 +cglobal %1_mpeg4_qpel8_h_lowpass_l2, 6, 6, 8+2*ARCH_X86_64+UNIX64, dst, src, dstride, srcstride, h, offset +%else cglobal %1_mpeg4_qpel8_h_lowpass, 5, 5, 8+2*ARCH_X86_64, dst, src, dstride, srcstride, h +%endif mova m4, [PW_ROUND] mova m5, [coeff8_0] %if ARCH_X86_64 mova m8, [coeff8_1] mova m9, [coeff8_2] +%endif +%define PW_FF m0 +%ifidn %1, put_no_rnd +%ifidn %2, l2 +%if UNIX64 + pcmpeqb m10, m10 +%define PW_FF m10 +%endif +%endif %endif mova m6, [coeff8_3] mova m7, [shuffle_mask8] @@ -200,12 +231,16 @@ cglobal %1_mpeg4_qpel8_h_lowpass, 5, 5, 
8+2*ARCH_X86_64, dst, src, dstride, srcs %endif pshufb m0, m7 pmaddubsw m0, m6 - add srcq, srcstrideq paddw m1, m4 paddw m1, m3 +%ifidn %2, l2 + movq m3, [srcq+offsetq] +%endif + add srcq, srcstrideq paddw m1, m0 psraw m1, 5 packuswb m1, m1 + L2 %1, %2, m1, m3, m0 %ifidn %1, avg pavgb m1, m2 %endif @@ -219,10 +254,13 @@ cglobal %1_mpeg4_qpel8_h_lowpass, 5, 5, 8+2*ARCH_X86_64, dst, src, dstride, srcs INIT_XMM ssse3 %define PW_ROUND pw_16 MPEG4_QPEL8_H_LOWPASS put +MPEG4_QPEL8_H_LOWPASS put, l2 %define PW_ROUND pw_16 MPEG4_QPEL8_H_LOWPASS avg +MPEG4_QPEL8_H_LOWPASS avg, l2 %define PW_ROUND pw_15 MPEG4_QPEL8_H_LOWPASS put_no_rnd +MPEG4_QPEL8_H_LOWPASS put_no_rnd, l2 %macro QPEL_V_LOW 5 diff --git a/libavcodec/x86/qpeldsp_init.c b/libavcodec/x86/qpeldsp_init.c index ba1fa228ab..d9c4deb94d 100644 --- a/libavcodec/x86/qpeldsp_init.c +++ b/libavcodec/x86/qpeldsp_init.c @@ -32,24 +32,12 @@ #include "fpel.h" #include "qpel.h" -void ff_put_pixels8x9_l2_mmxext(uint8_t *dst, - const uint8_t *src1, const uint8_t *src2, - ptrdiff_t dstStride, ptrdiff_t src1Stride); -void ff_put_pixels16x17_l2_sse2(uint8_t *dst, - const uint8_t *src1, const uint8_t *src2, - ptrdiff_t dstStride, ptrdiff_t src1Stride); void ff_put_no_rnd_pixels8x8_l2_mmxext(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, ptrdiff_t dstStride, ptrdiff_t src1Stride); -void ff_put_no_rnd_pixels8x9_l2_mmxext(uint8_t *dst, - const uint8_t *src1, const uint8_t *src2, - ptrdiff_t dstStride, ptrdiff_t src1Stride); void ff_put_no_rnd_pixels16x16_l2_sse2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, ptrdiff_t dstStride, ptrdiff_t src1Stride); -void ff_put_no_rnd_pixels16x17_l2_sse2(uint8_t *dst, - const uint8_t *src1, const uint8_t *src2, - ptrdiff_t dstStride, ptrdiff_t src1Stride); #define QPEL_H(OPNAME, RND, SIZE, UNUSED1, XMM, UNUSED2, UNUSED3, L2) \ void ff_ ## OPNAME ## _mpeg4_qpel ## SIZE ## _h_lowpass_ ## XMM (uint8_t *dst, \ @@ -57,15 +45,18 @@ void ff_ ## OPNAME ## _mpeg4_qpel ## SIZE ## 
_h_lowpass_ ## XMM (uint8_t *dst, ptrdiff_t dstStride, \ ptrdiff_t srcStride, \ int h); \ +void ff_ ## OPNAME ## _mpeg4_qpel ## SIZE ## _h_lowpass_l2_ ## XMM(uint8_t *dst, \ + const uint8_t *src, \ + ptrdiff_t dstStride, \ + ptrdiff_t srcStride, \ + int h, \ + ptrdiff_t l2_offset);\ static void OPNAME ## _qpel ## SIZE ## _mc10_ ## XMM(uint8_t *dst, \ const uint8_t *src, \ ptrdiff_t stride) \ { \ - DECLARE_ALIGNED(SIZE, uint8_t, half)[SIZE*SIZE]; \ - ff_put_ ## RND ## mpeg4_qpel ## SIZE ## _h_lowpass_ ## XMM(half, src, SIZE, \ - stride, SIZE); \ - ff_ ## OPNAME ## _pixels ## SIZE ## x ## SIZE ## _l2_ ## L2(dst, src, half, \ - stride, stride); \ + ff_ ## OPNAME ## _mpeg4_qpel ## SIZE ## _h_lowpass_l2_ ## XMM(dst, src, stride, \ + stride, SIZE, 0); \ } \ \ static void OPNAME ## _qpel ## SIZE ## _mc20_ ## XMM(uint8_t *dst, \ @@ -80,11 +71,8 @@ static void OPNAME ## _qpel ## SIZE ## _mc30_ ## XMM(uint8_t *dst, const uint8_t *src, \ ptrdiff_t stride) \ { \ - DECLARE_ALIGNED(SIZE, uint8_t, half)[SIZE*SIZE]; \ - ff_put_ ## RND ## mpeg4_qpel ## SIZE ## _h_lowpass_ ## XMM(half, src, SIZE, \ - stride, SIZE); \ - ff_ ## OPNAME ## _pixels ## SIZE ## x ## SIZE ## _l2_ ## L2(dst, src + 1, half, \ - stride, stride); \ + ff_ ## OPNAME ## _mpeg4_qpel ## SIZE ## _h_lowpass_l2_ ## XMM(dst, src, stride, \ + stride, SIZE, 1); \ } #define QPEL_V(OPNAME, RND, SIZE, UNUSED1, UNUSED2, XMM, UNUSED3, L2) \ @@ -130,10 +118,8 @@ static void OPNAME ## _qpel ## SIZE ## _mc11_ ## HVXMM(uint8_t *dst, DECLARE_ALIGNED(SIZE, uint8_t, half)[(SIZE + SIZEP1)*SIZE]; \ uint8_t *const halfH = half + SIZE*SIZE; \ uint8_t *const halfHV = half; \ - ff_put_ ## RND ## mpeg4_qpel ## SIZE ## _h_lowpass_ ## HXMM(halfH, src, SIZE, \ - stride, SIZEP1); \ - ff_put_ ## RND ## pixels ## SIZE ## x ## SIZEP1 ## _l2_ ## L2(halfH, src, halfH, \ - SIZE, stride); \ + ff_put_ ## RND ## mpeg4_qpel ## SIZE ## _h_lowpass_l2_ ## HXMM(halfH, src, SIZE, \ + stride, SIZEP1, 0); \ ff_put_ ## RND ## mpeg4_qpel ## SIZE ## _v_lowpass_ ## 
VXMM(halfHV, halfH, \ SIZE, SIZE); \ ff_ ## OPNAME ## _pixels ## SIZE ## x ## SIZE ## _l2_ ## L2(dst, halfH, halfHV, \ @@ -147,10 +133,8 @@ static void OPNAME ## _qpel ## SIZE ## _mc31_ ## HVXMM(uint8_t *dst, DECLARE_ALIGNED(SIZE, uint8_t, half)[(SIZE + SIZEP1)*SIZE]; \ uint8_t *const halfH = half + SIZE*SIZE; \ uint8_t *const halfHV = half; \ - ff_put_ ## RND ## mpeg4_qpel ## SIZE ## _h_lowpass_ ## HXMM(halfH, src, SIZE, \ - stride, SIZEP1); \ - ff_put_ ## RND ## pixels ## SIZE ## x ## SIZEP1 ## _l2_ ## L2(halfH, src + 1, \ - halfH, SIZE, stride); \ + ff_put_ ## RND ## mpeg4_qpel ## SIZE ## _h_lowpass_l2_ ## HXMM(halfH, src, SIZE, \ + stride, SIZEP1, 1); \ ff_put_ ## RND ## mpeg4_qpel ## SIZE ## _v_lowpass_ ## VXMM(halfHV, halfH, \ SIZE, SIZE); \ ff_ ## OPNAME ## _pixels ## SIZE ## x ## SIZE ## _l2_ ## L2(dst, halfH, halfHV, \ @@ -164,10 +148,8 @@ static void OPNAME ## _qpel ## SIZE ## _mc13_ ## HVXMM(uint8_t *dst, DECLARE_ALIGNED(SIZE, uint8_t, half)[(SIZE + SIZEP1)*SIZE]; \ uint8_t *const halfH = half + SIZE*SIZE; \ uint8_t *const halfHV = half; \ - ff_put_ ## RND ## mpeg4_qpel ## SIZE ## _h_lowpass_ ## HXMM(halfH, src, SIZE, \ - stride, SIZEP1); \ - ff_put_ ## RND ## pixels ## SIZE ## x ## SIZEP1 ## _l2_ ## L2(halfH, src, halfH, \ - SIZE, stride); \ + ff_put_ ## RND ## mpeg4_qpel ## SIZE ## _h_lowpass_l2_ ## HXMM(halfH, src, SIZE, \ + stride, SIZEP1, 0); \ ff_put_ ## RND ## mpeg4_qpel ## SIZE ## _v_lowpass_ ## VXMM(halfHV, halfH, \ SIZE, SIZE); \ ff_ ## OPNAME ## _pixels ## SIZE ## x ## SIZE ## _l2_ ## L2(dst, halfH + SIZE, \ @@ -181,10 +163,8 @@ static void OPNAME ## _qpel ## SIZE ## _mc33_ ## HVXMM(uint8_t *dst, DECLARE_ALIGNED(SIZE, uint8_t, half)[(SIZE + SIZEP1)*SIZE]; \ uint8_t *const halfH = half + SIZE*SIZE; \ uint8_t *const halfHV = half; \ - ff_put_ ## RND ## mpeg4_qpel ## SIZE ## _h_lowpass_ ## HXMM(halfH, src, SIZE, \ - stride, SIZEP1); \ - ff_put_ ## RND ## pixels ## SIZE ## x ## SIZEP1 ## _l2_ ## L2(halfH, src + 1, halfH,\ - SIZE, stride); \ + 
ff_put_ ## RND ## mpeg4_qpel ## SIZE ## _h_lowpass_l2_ ## HXMM(halfH, src, SIZE, \ + stride, SIZEP1, 1); \ ff_put_ ## RND ## mpeg4_qpel ## SIZE ## _v_lowpass_ ## VXMM(halfHV, halfH, \ SIZE, SIZE); \ ff_ ## OPNAME ## _pixels ## SIZE ## x ## SIZE ## _l2_ ## L2(dst, halfH + SIZE, \ @@ -226,10 +206,8 @@ static void OPNAME ## _qpel ## SIZE ## _mc12_ ## HVXMM(uint8_t *dst, ptrdiff_t stride) \ { \ DECLARE_ALIGNED(SIZE, uint8_t, halfH)[SIZEP1*SIZE]; \ - ff_put_ ## RND ## mpeg4_qpel ## SIZE ## _h_lowpass_ ## HXMM(halfH, src, SIZE, \ - stride, SIZEP1); \ - ff_put_ ## RND ## pixels ## SIZE ## x ## SIZEP1 ## _l2_ ## L2(halfH, src, halfH, \ - SIZE, stride); \ + ff_put_ ## RND ## mpeg4_qpel ## SIZE ## _h_lowpass_l2_ ## HXMM(halfH, src, SIZE, \ + stride, SIZEP1, 0); \ ff_ ## OPNAME ## _mpeg4_qpel ## SIZE ## _v_lowpass_ ## VXMM(dst, halfH, \ stride, SIZE); \ } \ @@ -239,10 +217,8 @@ static void OPNAME ## _qpel ## SIZE ## _mc32_ ## HVXMM(uint8_t *dst, ptrdiff_t stride) \ { \ DECLARE_ALIGNED(SIZE, uint8_t, halfH)[SIZEP1*SIZE]; \ - ff_put_ ## RND ## mpeg4_qpel ## SIZE ## _h_lowpass_ ## HXMM(halfH, src, SIZE, \ - stride, SIZEP1); \ - ff_put_ ## RND ## pixels ## SIZE ## x ## SIZEP1 ## _l2_ ## L2(halfH, src + 1, halfH,\ - SIZE, stride); \ + ff_put_ ## RND ## mpeg4_qpel ## SIZE ## _h_lowpass_l2_ ## HXMM(halfH, src, SIZE, \ + stride, SIZEP1, 1); \ ff_ ## OPNAME ## _mpeg4_qpel ## SIZE ## _v_lowpass_ ## VXMM(dst, halfH, \ stride, SIZE); \ } \ _______________________________________________ ffmpeg-cvslog mailing list -- [email protected] To unsubscribe send an email to [email protected]
