PR #22623 opened by mkver URL: https://code.ffmpeg.org/FFmpeg/FFmpeg/pulls/22623 Patch URL: https://code.ffmpeg.org/FFmpeg/FFmpeg/pulls/22623.patch
Also improve splatting coefficients. >From 5a29081f63c7f01fb77b45d970a60cec2d9d6c8a Mon Sep 17 00:00:00 2001 From: Andreas Rheinhardt <[email protected]> Date: Wed, 25 Mar 2026 19:43:32 +0100 Subject: [PATCH 1/5] avcodec/x86/hevc/deblock: Avoid vmovdqa Signed-off-by: Andreas Rheinhardt <[email protected]> --- libavcodec/x86/hevc/deblock.asm | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/libavcodec/x86/hevc/deblock.asm b/libavcodec/x86/hevc/deblock.asm index d43d95142a..5ed27fc38f 100644 --- a/libavcodec/x86/hevc/deblock.asm +++ b/libavcodec/x86/hevc/deblock.asm @@ -374,19 +374,18 @@ ALIGN 16 %if %1 > 8 shl r9d, %1 - 8 %endif - movd m8, r9d; tc0 + movd m9, r9d; tc0 mov r3d, [tcq+4]; %if %1 > 8 shl r3d, %1 - 8 %endif add r9d, r3d; tc0 + tc1 jz .bypassluma - movd m9, r3d; tc1 - punpcklwd m8, m8 + movd m8, r3d; tc1 punpcklwd m9, m9 - shufps m8, m9, 0; tc0, tc1 - mova m9, m8 - psllw m8, 2; tc << 2 + punpcklwd m8, m8 + shufps m9, m8, 0; tc0, tc1 + psllw m8, m9, 2; tc << 2 pavgw m8, m9; tc25 = ((tc * 5 + 1) >> 1) ;end tc25 calculations -- 2.52.0 >From 16a023dc94f47304c07e8e90ed418e274850b8aa Mon Sep 17 00:00:00 2001 From: Andreas Rheinhardt <[email protected]> Date: Wed, 25 Mar 2026 21:58:17 +0100 Subject: [PATCH 2/5] avfilter/x86/vf_idetdsp: Avoid (v)movdqa Signed-off-by: Andreas Rheinhardt <[email protected]> --- libavfilter/x86/vf_idetdsp.asm | 19 ++++++++----------- 1 file changed, 8 insertions(+), 11 deletions(-) diff --git a/libavfilter/x86/vf_idetdsp.asm b/libavfilter/x86/vf_idetdsp.asm index 12d65000ab..720a2a4f8e 100644 --- a/libavfilter/x86/vf_idetdsp.asm +++ b/libavfilter/x86/vf_idetdsp.asm @@ -50,7 +50,6 @@ cglobal idet_filter_line_16bit, 4, 5, 8, a, b, c, width, index .loop_16bit: movu m2, [bq + indexq * 2] ; B movu m3, [aq + indexq * 2] ; A - mova m6, m2 psubusw m5, m2, m3 ; ba movu m4, [cq + indexq * 2] ; C @@ -58,7 +57,7 @@ cglobal idet_filter_line_16bit, 4, 5, 8, a, b, c, width, index psubusw m3, m2 ; ab CMP indexd, widthd - 
psubusw m6, m4 ; bc + psubusw m6, m2, m4 ; bc psubusw m4, m2 ; cb PABS_DIFF_WD m3, m6, m7 ; |ab - bc| @@ -97,21 +96,19 @@ cglobal idet_filter_line, 4, 6, 7, a, b, c, width, index, total .sse2_loop: movu m2, [bq + indexq*1] ; B movu m3, [aq + indexq*1] ; A - mova m6, m2 - mova m4, m3 psubusb m5, m2, m3 ; ba - movu m3, [cq + indexq*1] ; C + movu m4, [cq + indexq*1] ; C add indexq, mmsize - psubusb m4, m2 ; ab + psubusb m3, m2 ; ab CMP indexd, widthd - psubusb m6, m3 ; bc - psubusb m3, m2 ; cb + psubusb m6, m2, m4 ; bc + psubusb m4, m2 ; cb - psadbw m4, m6 ; |ab - bc| - paddq m0, m4 - psadbw m5, m3 ; |ba - cb| + psadbw m3, m6 ; |ab - bc| + paddq m0, m3 + psadbw m5, m4 ; |ba - cb| paddq m1, m5 jl .sse2_loop -- 2.52.0 >From e4be6913e5210bfec8375a23c4189021fdecfb9c Mon Sep 17 00:00:00 2001 From: Andreas Rheinhardt <[email protected]> Date: Thu, 26 Mar 2026 00:12:40 +0100 Subject: [PATCH 3/5] avcodec/x86/h26x/h2656_inter: Remove always-true checks It has already been checked before that we are only dealing with high bitdepth here. 
Signed-off-by: Andreas Rheinhardt <[email protected]> --- libavcodec/x86/h26x/h2656_inter.asm | 12 ------------ 1 file changed, 12 deletions(-) diff --git a/libavcodec/x86/h26x/h2656_inter.asm b/libavcodec/x86/h26x/h2656_inter.asm index 49a95d58fb..429f9b4667 100644 --- a/libavcodec/x86/h26x/h2656_inter.asm +++ b/libavcodec/x86/h26x/h2656_inter.asm @@ -408,13 +408,9 @@ SECTION .text pmaddwd %%reg1, %3 pmaddwd %%reg3, %4 paddd %%reg1, %%reg3 -%if %1 != 8 psrad %%reg1, %1-8 %endif -%endif -%if %1 != 8 psrad %%reg0, %1-8 -%endif packssdw %%reg0, %%reg1 %endif %endmacro @@ -437,9 +433,7 @@ SECTION .text paddd m0, m2 paddd m4, m6 paddd m0, m4 -%if %2 != 8 psrad m0, %2-8 -%endif %if %1 > 4 pmaddwd m1, [%3q+4*mmsize] pmaddwd m3, [%3q+5*mmsize] @@ -448,9 +442,7 @@ SECTION .text paddd m1, m3 paddd m5, m7 paddd m1, m5 -%if %2 != 8 psrad m1, %2-8 -%endif %endif p%4 m0, m1 %endif @@ -503,9 +495,7 @@ SECTION .text paddd m0, m2 paddd m4, m6 paddd m0, m4 -%if %2 != 8 psrad m0, %2-8 -%endif %if %1 > 4 pmaddwd m1, m12 pmaddwd m3, m13 @@ -514,11 +504,9 @@ SECTION .text paddd m1, m3 paddd m5, m7 paddd m1, m5 -%if %2 != 8 psrad m1, %2-8 %endif %endif -%endif %endmacro %macro UNI_COMPUTE 5 pmulhrsw %3, %5 -- 2.52.0 >From d8d1109244264810314440bd4af8604a301373ab Mon Sep 17 00:00:00 2001 From: Andreas Rheinhardt <[email protected]> Date: Thu, 26 Mar 2026 00:41:34 +0100 Subject: [PATCH 4/5] avcodec/x86/h26x/h2656_inter: Don't prepare unused coeffs for hv funcs 8 tap motion compensation functions with both vertical and horizontal components are under severe register pressure, so that the filter coefficients have to be put on the stack. Before this commit, this meant that coefficients for use with pmaddubsw and pmaddwd were always created. Yet this is completely unnecessary, as every such register is only used for exactly one purpose and it is known at compile time which one it is (only 8bit horizontal filters are used with pmaddubsw), so only prepare that one. 
This also allows halving the amount of stack used. This saves 2432B of .text here. Signed-off-by: Andreas Rheinhardt <[email protected]> --- libavcodec/x86/h26x/h2656_inter.asm | 44 +++++++++++------------------ 1 file changed, 17 insertions(+), 27 deletions(-) diff --git a/libavcodec/x86/h26x/h2656_inter.asm b/libavcodec/x86/h26x/h2656_inter.asm index 429f9b4667..9dffa40f3a 100644 --- a/libavcodec/x86/h26x/h2656_inter.asm +++ b/libavcodec/x86/h26x/h2656_inter.asm @@ -99,26 +99,16 @@ SECTION .text VPBROADCASTW m13, [%2q + 1 * 2] ; coeff 2, 3 VPBROADCASTW m14, [%2q + 2 * 2] ; coeff 4, 5 VPBROADCASTW m15, [%2q + 3 * 2] ; coeff 6, 7 -%if %0 == 3 - MC_8TAP_SAVE_FILTER %3, m12, m13, m14, m15 -%endif %if %1 != 8 pmovsxbw m12, xm12 pmovsxbw m13, xm13 pmovsxbw m14, xm14 pmovsxbw m15, xm15 - %if %0 == 3 - MC_8TAP_SAVE_FILTER %3 + 4*mmsize, m12, m13, m14, m15 - %endif -%elif %0 == 3 - pmovsxbw m8, xm12 - pmovsxbw m9, xm13 - pmovsxbw m10, xm14 - pmovsxbw m11, xm15 - MC_8TAP_SAVE_FILTER %3 + 4*mmsize, m8, m9, m10, m11 %endif - +%if %0 == 3 + MC_8TAP_SAVE_FILTER %3, m12, m13, m14, m15 +%endif %endmacro %macro MC_4TAP_LOAD 4 @@ -426,19 +416,19 @@ SECTION .text paddw m4, m6 paddw m0, m4 %else - pmaddwd m0, [%3q+4*mmsize] - pmaddwd m2, [%3q+5*mmsize] - pmaddwd m4, [%3q+6*mmsize] - pmaddwd m6, [%3q+7*mmsize] + pmaddwd m0, [%3q+0*mmsize] + pmaddwd m2, [%3q+1*mmsize] + pmaddwd m4, [%3q+2*mmsize] + pmaddwd m6, [%3q+3*mmsize] paddd m0, m2 paddd m4, m6 paddd m0, m4 psrad m0, %2-8 %if %1 > 4 - pmaddwd m1, [%3q+4*mmsize] - pmaddwd m3, [%3q+5*mmsize] - pmaddwd m5, [%3q+6*mmsize] - pmaddwd m7, [%3q+7*mmsize] + pmaddwd m1, [%3q+0*mmsize] + pmaddwd m3, [%3q+1*mmsize] + pmaddwd m5, [%3q+2*mmsize] + pmaddwd m7, [%3q+3*mmsize] paddd m1, m3 paddd m5, m7 paddd m1, m5 @@ -856,11 +846,11 @@ cglobal %1_put_uni_8tap_v%2_%3, 7, 9, 16, dst, dststride, src, srcstride, height ; int height, const int8_t *hf, const int8_t *vf, int width) ; ****************************** %macro PUT_8TAP_HV 3 -cglobal 
%1_put_8tap_hv%2_%3, 7, 8, 16, 0 - mmsize*16, dst, dststride, src, srcstride, height, hf, vf, r3src +cglobal %1_put_8tap_hv%2_%3, 7, 8, 16, 0 - mmsize*8, dst, dststride, src, srcstride, height, hf, vf, r3src MC_8TAP_FILTER %3, hf, 0 lea hfq, [rsp] - MC_8TAP_FILTER %3, vf, 8*mmsize - lea vfq, [rsp + 8*mmsize] + MC_8TAP_FILTER 14, vf, 4*mmsize + lea vfq, [rsp + 4*mmsize] lea r3srcq, [srcstrideq*3] sub srcq, r3srcq @@ -931,11 +921,11 @@ cglobal %1_put_8tap_hv%2_%3, 7, 8, 16, 0 - mmsize*16, dst, dststride, src, srcst RET -cglobal %1_put_uni_8tap_hv%2_%3, 7, 9, 16, 0 - 16*mmsize, dst, dststride, src, srcstride, height, hf, vf, r3src +cglobal %1_put_uni_8tap_hv%2_%3, 7, 9, 16, 0 - 8*mmsize, dst, dststride, src, srcstride, height, hf, vf, r3src MC_8TAP_FILTER %3, hf, 0 lea hfq, [rsp] - MC_8TAP_FILTER %3, vf, 8*mmsize - lea vfq, [rsp + 8*mmsize] + MC_8TAP_FILTER 14, vf, 4*mmsize + lea vfq, [rsp + 4*mmsize] lea r3srcq, [srcstrideq*3] sub srcq, r3srcq MC_8TAP_H_LOAD %3, srcq, %2, 15 -- 2.52.0 >From b8239d840267131b6fcfbcf151b8c75e93d8dbdc Mon Sep 17 00:00:00 2001 From: Andreas Rheinhardt <[email protected]> Date: Thu, 26 Mar 2026 01:19:21 +0100 Subject: [PATCH 5/5] avcodec/x86/h26x/h2656_inter: Simplify splatting coefficients For pre-AVX2, vpbroadcastw is emulated via a load, followed by two shuffles. Yet given that one always wants to splat multiple pairs of coefficients which are adjacent in memory, one can do better than that: Load all of them at once, perform a punpcklwd with itself and use one pshufd per register. In case one has to sign-extend the coefficients, too, one can replace the punpcklwd with one pmovsxbw (instead of one per register) and use pshufd directly afterwards. This saved 4816B of .text here. 
Signed-off-by: Andreas Rheinhardt <[email protected]> --- libavcodec/x86/h26x/h2656_inter.asm | 40 +++++++++++++++++++++++++++++ 1 file changed, 40 insertions(+) diff --git a/libavcodec/x86/h26x/h2656_inter.asm b/libavcodec/x86/h26x/h2656_inter.asm index 9dffa40f3a..ce4bb53cb4 100644 --- a/libavcodec/x86/h26x/h2656_inter.asm +++ b/libavcodec/x86/h26x/h2656_inter.asm @@ -64,15 +64,27 @@ SECTION .text %endmacro %macro MC_4TAP_FILTER 4 ; bitdepth, filter, a, b, +%if cpuflag(avx2) VPBROADCASTW %3, [%2q + 0 * 2] ; coeff 0, 1 VPBROADCASTW %4, [%2q + 1 * 2] ; coeff 2, 3 %if %1 != 8 pmovsxbw %3, xmm%3 pmovsxbw %4, xmm%4 %endif +%else + movd %3, [%2q] ; coeff 0, 1, 2, 3 +%if %1 != 8 + pmovsxbw %3, %3 ; coeff 0, 1, 2, 3 (words) +%else + punpcklwd %3, %3 ; coeff 0,1,0,1,2,3,2,3 +%endif + pshufd %4, %3, q1111 + pshufd %3, %3, q0000 +%endif %endmacro %macro MC_4TAP_HV_FILTER 1 +%if cpuflag(avx2) VPBROADCASTW m12, [vfq + 0 * 2] ; vf 0, 1 VPBROADCASTW m13, [vfq + 1 * 2] ; vf 2, 3 VPBROADCASTW m14, [hfq + 0 * 2] ; hf 0, 1 @@ -83,6 +95,21 @@ SECTION .text %if %1 != 8 pmovsxbw m14, xm14 pmovsxbw m15, xm15 +%endif +%else + movd m12, [vfq] ; vf 0,1,2,3 + movd m14, [hfq] ; hf 0,1,2,3 + + pmovsxbw m12, m12 ; vf 0,1,2,3 (words) +%if %1 != 8 + pmovsxbw m14, m14 ; hf 0,1,2,3 (words) +%else + punpcklwd m14, m14 ; hf 0,1,0,1,2,3,2,3 +%endif + pshufd m13, m12, q1111 + pshufd m12, m12, q0000 + pshufd m15, m14, q1111 + pshufd m14, m14, q0000 %endif lea r3srcq, [srcstrideq*3] %endmacro @@ -95,6 +122,7 @@ SECTION .text %endmacro %macro MC_8TAP_FILTER 2-3 ;bitdepth, filter, offset +%if cpuflag(avx2) VPBROADCASTW m12, [%2q + 0 * 2] ; coeff 0, 1 VPBROADCASTW m13, [%2q + 1 * 2] ; coeff 2, 3 VPBROADCASTW m14, [%2q + 2 * 2] ; coeff 4, 5 @@ -106,6 +134,18 @@ SECTION .text pmovsxbw m14, xm14 pmovsxbw m15, xm15 %endif +%else +%if %1 != 8 + pmovsxbw m15, [%2q] ; coeffs 0-7 (words) +%else + movq m15, [%2q] ; coeffs 0-7 + punpcklwd m15, m15 +%endif + pshufd m12, m15, q0000 + pshufd m13, m15, q1111 + pshufd 
m14, m15, q2222 + pshufd m15, m15, q3333 +%endif %if %0 == 3 MC_8TAP_SAVE_FILTER %3, m12, m13, m14, m15 %endif -- 2.52.0 _______________________________________________ ffmpeg-devel mailing list -- [email protected] To unsubscribe send an email to [email protected]
