PR #22623 opened by mkver
URL: https://code.ffmpeg.org/FFmpeg/FFmpeg/pulls/22623
Patch URL: https://code.ffmpeg.org/FFmpeg/FFmpeg/pulls/22623.patch

Also improve the splatting of filter coefficients.


From 5a29081f63c7f01fb77b45d970a60cec2d9d6c8a Mon Sep 17 00:00:00 2001
From: Andreas Rheinhardt <[email protected]>
Date: Wed, 25 Mar 2026 19:43:32 +0100
Subject: [PATCH 1/5] avcodec/x86/hevc/deblock: Avoid vmovdqa

Signed-off-by: Andreas Rheinhardt <[email protected]>
---
 libavcodec/x86/hevc/deblock.asm | 11 +++++------
 1 file changed, 5 insertions(+), 6 deletions(-)

diff --git a/libavcodec/x86/hevc/deblock.asm b/libavcodec/x86/hevc/deblock.asm
index d43d95142a..5ed27fc38f 100644
--- a/libavcodec/x86/hevc/deblock.asm
+++ b/libavcodec/x86/hevc/deblock.asm
@@ -374,19 +374,18 @@ ALIGN 16
 %if %1 > 8
     shl             r9d, %1 - 8
 %endif
-    movd             m8, r9d; tc0
+    movd             m9, r9d; tc0
     mov             r3d, [tcq+4];
 %if %1 > 8
     shl             r3d, %1 - 8
 %endif
     add             r9d, r3d; tc0 + tc1
     jz             .bypassluma
-    movd             m9, r3d; tc1
-    punpcklwd        m8, m8
+    movd             m8, r3d; tc1
     punpcklwd        m9, m9
-    shufps           m8, m9, 0; tc0, tc1
-    mova             m9, m8
-    psllw            m8, 2; tc << 2
+    punpcklwd        m8, m8
+    shufps           m9, m8, 0; tc0, tc1
+    psllw            m8, m9, 2; tc << 2
     pavgw            m8, m9; tc25 = ((tc * 5 + 1) >> 1)
     ;end tc25 calculations
 
-- 
2.52.0


From 16a023dc94f47304c07e8e90ed418e274850b8aa Mon Sep 17 00:00:00 2001
From: Andreas Rheinhardt <[email protected]>
Date: Wed, 25 Mar 2026 21:58:17 +0100
Subject: [PATCH 2/5] avfilter/x86/vf_idetdsp: Avoid (v)movdqa

Signed-off-by: Andreas Rheinhardt <[email protected]>
---
 libavfilter/x86/vf_idetdsp.asm | 19 ++++++++-----------
 1 file changed, 8 insertions(+), 11 deletions(-)

diff --git a/libavfilter/x86/vf_idetdsp.asm b/libavfilter/x86/vf_idetdsp.asm
index 12d65000ab..720a2a4f8e 100644
--- a/libavfilter/x86/vf_idetdsp.asm
+++ b/libavfilter/x86/vf_idetdsp.asm
@@ -50,7 +50,6 @@ cglobal idet_filter_line_16bit, 4, 5, 8, a, b, c, width, index
 .loop_16bit:
     movu      m2, [bq + indexq * 2]  ; B
     movu      m3, [aq + indexq * 2]  ; A
-    mova      m6, m2
     psubusw   m5, m2, m3             ; ba
 
     movu      m4, [cq + indexq * 2]  ; C
@@ -58,7 +57,7 @@ cglobal idet_filter_line_16bit, 4, 5, 8, a, b, c, width, index
     psubusw   m3, m2                 ; ab
     CMP       indexd, widthd
 
-    psubusw   m6, m4                 ; bc
+    psubusw   m6, m2, m4             ; bc
     psubusw   m4, m2                 ; cb
 
     PABS_DIFF_WD   m3, m6, m7        ; |ab - bc|
@@ -97,21 +96,19 @@ cglobal idet_filter_line, 4, 6, 7, a, b, c, width, index, total
 .sse2_loop:
     movu      m2, [bq + indexq*1]  ; B
     movu      m3, [aq + indexq*1]  ; A
-    mova      m6, m2
-    mova      m4, m3
     psubusb   m5, m2, m3           ; ba
 
-    movu      m3, [cq + indexq*1]  ; C
+    movu      m4, [cq + indexq*1]  ; C
     add       indexq, mmsize
-    psubusb   m4, m2               ; ab
+    psubusb   m3, m2               ; ab
     CMP       indexd, widthd
 
-    psubusb   m6, m3               ; bc
-    psubusb   m3, m2               ; cb
+    psubusb   m6, m2, m4           ; bc
+    psubusb   m4, m2               ; cb
 
-    psadbw    m4, m6               ; |ab - bc|
-    paddq     m0, m4
-    psadbw    m5, m3               ; |ba - cb|
+    psadbw    m3, m6               ; |ab - bc|
+    paddq     m0, m3
+    psadbw    m5, m4               ; |ba - cb|
     paddq     m1, m5
     jl       .sse2_loop
 
-- 
2.52.0


From e4be6913e5210bfec8375a23c4189021fdecfb9c Mon Sep 17 00:00:00 2001
From: Andreas Rheinhardt <[email protected]>
Date: Thu, 26 Mar 2026 00:12:40 +0100
Subject: [PATCH 3/5] avcodec/x86/h26x/h2656_inter: Remove always-true checks

It has already been checked before that we are only dealing
with high bitdepth here.

Signed-off-by: Andreas Rheinhardt <[email protected]>
---
 libavcodec/x86/h26x/h2656_inter.asm | 12 ------------
 1 file changed, 12 deletions(-)

diff --git a/libavcodec/x86/h26x/h2656_inter.asm b/libavcodec/x86/h26x/h2656_inter.asm
index 49a95d58fb..429f9b4667 100644
--- a/libavcodec/x86/h26x/h2656_inter.asm
+++ b/libavcodec/x86/h26x/h2656_inter.asm
@@ -408,13 +408,9 @@ SECTION .text
     pmaddwd        %%reg1, %3
     pmaddwd        %%reg3, %4
     paddd          %%reg1, %%reg3
-%if %1 != 8
     psrad          %%reg1, %1-8
 %endif
-%endif
-%if %1 != 8
     psrad          %%reg0, %1-8
-%endif
     packssdw       %%reg0, %%reg1
 %endif
 %endmacro
@@ -437,9 +433,7 @@ SECTION .text
     paddd             m0, m2
     paddd             m4, m6
     paddd             m0, m4
-%if %2 != 8
     psrad             m0, %2-8
-%endif
 %if %1 > 4
     pmaddwd           m1, [%3q+4*mmsize]
     pmaddwd           m3, [%3q+5*mmsize]
@@ -448,9 +442,7 @@ SECTION .text
     paddd             m1, m3
     paddd             m5, m7
     paddd             m1, m5
-%if %2 != 8
     psrad             m1, %2-8
-%endif
 %endif
     p%4               m0, m1
 %endif
@@ -503,9 +495,7 @@ SECTION .text
     paddd             m0, m2
     paddd             m4, m6
     paddd             m0, m4
-%if %2 != 8
     psrad             m0, %2-8
-%endif
 %if %1 > 4
     pmaddwd           m1, m12
     pmaddwd           m3, m13
@@ -514,11 +504,9 @@ SECTION .text
     paddd             m1, m3
     paddd             m5, m7
     paddd             m1, m5
-%if %2 != 8
     psrad             m1, %2-8
 %endif
 %endif
-%endif
 %endmacro
 %macro UNI_COMPUTE 5
     pmulhrsw          %3, %5
-- 
2.52.0


From d8d1109244264810314440bd4af8604a301373ab Mon Sep 17 00:00:00 2001
From: Andreas Rheinhardt <[email protected]>
Date: Thu, 26 Mar 2026 00:41:34 +0100
Subject: [PATCH 4/5] avcodec/x86/h26x/h2656_inter: Don't prepare unused coeffs
 for hv funcs

8 tap motion compensation functions with both vertical and horizontal
components are under severe register pressure, so that the filter
coefficients have to be put on the stack. Before this commit,
this meant that coefficients for use with pmaddubsw and pmaddwd
were always created. Yet this is completely unnecessary, as
every such register is only used for exactly one purpose and
it is known at compile time which one it is (only 8bit horizontal
filters are used with pmaddubsw), so only prepare that one.
This also makes it possible to halve the amount of stack used.

This saves 2432B of .text here.

Signed-off-by: Andreas Rheinhardt <[email protected]>
---
 libavcodec/x86/h26x/h2656_inter.asm | 44 +++++++++++------------------
 1 file changed, 17 insertions(+), 27 deletions(-)

diff --git a/libavcodec/x86/h26x/h2656_inter.asm b/libavcodec/x86/h26x/h2656_inter.asm
index 429f9b4667..9dffa40f3a 100644
--- a/libavcodec/x86/h26x/h2656_inter.asm
+++ b/libavcodec/x86/h26x/h2656_inter.asm
@@ -99,26 +99,16 @@ SECTION .text
     VPBROADCASTW                      m13, [%2q + 1 * 2]  ; coeff 2, 3
     VPBROADCASTW                      m14, [%2q + 2 * 2]  ; coeff 4, 5
     VPBROADCASTW                      m15, [%2q + 3 * 2]  ; coeff 6, 7
-%if %0 == 3
-    MC_8TAP_SAVE_FILTER                %3, m12, m13, m14, m15
-%endif
 
 %if %1 != 8
     pmovsxbw                          m12, xm12
     pmovsxbw                          m13, xm13
     pmovsxbw                          m14, xm14
     pmovsxbw                          m15, xm15
-    %if %0 == 3
-    MC_8TAP_SAVE_FILTER     %3 + 4*mmsize, m12, m13, m14, m15
-    %endif
-%elif %0 == 3
-    pmovsxbw                          m8, xm12
-    pmovsxbw                          m9, xm13
-    pmovsxbw                         m10, xm14
-    pmovsxbw                         m11, xm15
-    MC_8TAP_SAVE_FILTER     %3 + 4*mmsize, m8, m9, m10, m11
 %endif
-
+%if %0 == 3
+    MC_8TAP_SAVE_FILTER     %3, m12, m13, m14, m15
+%endif
 %endmacro
 
 %macro MC_4TAP_LOAD 4
@@ -426,19 +416,19 @@ SECTION .text
     paddw             m4, m6
     paddw             m0, m4
 %else
-    pmaddwd           m0, [%3q+4*mmsize]
-    pmaddwd           m2, [%3q+5*mmsize]
-    pmaddwd           m4, [%3q+6*mmsize]
-    pmaddwd           m6, [%3q+7*mmsize]
+    pmaddwd           m0, [%3q+0*mmsize]
+    pmaddwd           m2, [%3q+1*mmsize]
+    pmaddwd           m4, [%3q+2*mmsize]
+    pmaddwd           m6, [%3q+3*mmsize]
     paddd             m0, m2
     paddd             m4, m6
     paddd             m0, m4
     psrad             m0, %2-8
 %if %1 > 4
-    pmaddwd           m1, [%3q+4*mmsize]
-    pmaddwd           m3, [%3q+5*mmsize]
-    pmaddwd           m5, [%3q+6*mmsize]
-    pmaddwd           m7, [%3q+7*mmsize]
+    pmaddwd           m1, [%3q+0*mmsize]
+    pmaddwd           m3, [%3q+1*mmsize]
+    pmaddwd           m5, [%3q+2*mmsize]
+    pmaddwd           m7, [%3q+3*mmsize]
     paddd             m1, m3
     paddd             m5, m7
     paddd             m1, m5
@@ -856,11 +846,11 @@ cglobal %1_put_uni_8tap_v%2_%3, 7, 9, 16, dst, dststride, src, srcstride, height
 ;                     int height, const int8_t *hf, const int8_t *vf, int 
width)
 ; ******************************
 %macro PUT_8TAP_HV 3
-cglobal %1_put_8tap_hv%2_%3, 7, 8, 16, 0 - mmsize*16, dst, dststride, src, 
srcstride, height, hf, vf, r3src
+cglobal %1_put_8tap_hv%2_%3, 7, 8, 16, 0 - mmsize*8, dst, dststride, src, 
srcstride, height, hf, vf, r3src
     MC_8TAP_FILTER           %3, hf, 0
     lea                     hfq, [rsp]
-    MC_8TAP_FILTER           %3, vf, 8*mmsize
-    lea                     vfq, [rsp + 8*mmsize]
+    MC_8TAP_FILTER           14, vf, 4*mmsize
+    lea                     vfq, [rsp + 4*mmsize]
 
     lea                  r3srcq, [srcstrideq*3]
     sub                    srcq, r3srcq
@@ -931,11 +921,11 @@ cglobal %1_put_8tap_hv%2_%3, 7, 8, 16, 0 - mmsize*16, dst, dststride, src, srcst
     RET
 
 
-cglobal %1_put_uni_8tap_hv%2_%3, 7, 9, 16, 0 - 16*mmsize, dst, dststride, src, 
srcstride, height, hf, vf, r3src
+cglobal %1_put_uni_8tap_hv%2_%3, 7, 9, 16, 0 - 8*mmsize, dst, dststride, src, 
srcstride, height, hf, vf, r3src
     MC_8TAP_FILTER           %3, hf, 0
     lea                     hfq, [rsp]
-    MC_8TAP_FILTER           %3, vf, 8*mmsize
-    lea                     vfq, [rsp + 8*mmsize]
+    MC_8TAP_FILTER           14, vf, 4*mmsize
+    lea                     vfq, [rsp + 4*mmsize]
     lea           r3srcq, [srcstrideq*3]
     sub             srcq, r3srcq
     MC_8TAP_H_LOAD       %3, srcq, %2, 15
-- 
2.52.0


From b8239d840267131b6fcfbcf151b8c75e93d8dbdc Mon Sep 17 00:00:00 2001
From: Andreas Rheinhardt <[email protected]>
Date: Thu, 26 Mar 2026 01:19:21 +0100
Subject: [PATCH 5/5] avcodec/x86/h26x/h2656_inter: Simplify splatting
 coefficients

For pre-AVX2, vpbroadcastw is emulated via a load, followed
by two shuffles. Yet given that one always wants to splat
multiple pairs of coefficients which are adjacent in memory,
one can do better than that: Load all of them at once, perform
a punpcklwd with itself and use one pshufd per register.
In case one has to sign-extend the coefficients, too,
one can replace the punpcklwd with one pmovsxbw (instead of one
per register) and use pshufd directly afterwards.

This saves 4816B of .text here.

Signed-off-by: Andreas Rheinhardt <[email protected]>
---
 libavcodec/x86/h26x/h2656_inter.asm | 40 +++++++++++++++++++++++++++++
 1 file changed, 40 insertions(+)

diff --git a/libavcodec/x86/h26x/h2656_inter.asm b/libavcodec/x86/h26x/h2656_inter.asm
index 9dffa40f3a..ce4bb53cb4 100644
--- a/libavcodec/x86/h26x/h2656_inter.asm
+++ b/libavcodec/x86/h26x/h2656_inter.asm
@@ -64,15 +64,27 @@ SECTION .text
 %endmacro
 
 %macro MC_4TAP_FILTER 4 ; bitdepth, filter, a, b,
+%if cpuflag(avx2)
     VPBROADCASTW   %3, [%2q + 0 * 2]  ; coeff 0, 1
     VPBROADCASTW   %4, [%2q + 1 * 2]  ; coeff 2, 3
 %if %1 != 8
     pmovsxbw       %3, xmm%3
     pmovsxbw       %4, xmm%4
 %endif
+%else
+    movd           %3, [%2q]          ; coeff 0, 1, 2, 3
+%if %1 != 8
+    pmovsxbw       %3, %3             ; coeff 0, 1, 2, 3 (words)
+%else
+    punpcklwd      %3, %3             ; coeff 0,1,0,1,2,3,2,3
+%endif
+    pshufd         %4, %3, q1111
+    pshufd         %3, %3, q0000
+%endif
 %endmacro
 
 %macro MC_4TAP_HV_FILTER 1
+%if cpuflag(avx2)
     VPBROADCASTW  m12, [vfq + 0 * 2]  ; vf 0, 1
     VPBROADCASTW  m13, [vfq + 1 * 2]  ; vf 2, 3
     VPBROADCASTW  m14, [hfq + 0 * 2]  ; hf 0, 1
@@ -83,6 +95,21 @@ SECTION .text
 %if %1 != 8
     pmovsxbw      m14, xm14
     pmovsxbw      m15, xm15
+%endif
+%else
+    movd          m12, [vfq]          ; vf 0,1,2,3
+    movd          m14, [hfq]          ; hf 0,1,2,3
+
+    pmovsxbw      m12, m12            ; vf 0,1,2,3 (words)
+%if %1 != 8
+    pmovsxbw      m14, m14            ; hf 0,1,2,3 (words)
+%else
+    punpcklwd     m14, m14            ; hf 0,1,0,1,2,3,2,3
+%endif
+    pshufd        m13, m12, q1111
+    pshufd        m12, m12, q0000
+    pshufd        m15, m14, q1111
+    pshufd        m14, m14, q0000
 %endif
     lea           r3srcq, [srcstrideq*3]
 %endmacro
@@ -95,6 +122,7 @@ SECTION .text
 %endmacro
 
 %macro MC_8TAP_FILTER 2-3 ;bitdepth, filter, offset
+%if cpuflag(avx2)
     VPBROADCASTW                      m12, [%2q + 0 * 2]  ; coeff 0, 1
     VPBROADCASTW                      m13, [%2q + 1 * 2]  ; coeff 2, 3
     VPBROADCASTW                      m14, [%2q + 2 * 2]  ; coeff 4, 5
@@ -106,6 +134,18 @@ SECTION .text
     pmovsxbw                          m14, xm14
     pmovsxbw                          m15, xm15
 %endif
+%else
+%if %1 != 8
+    pmovsxbw                          m15, [%2q]          ; coeffs 0-7 (words)
+%else
+    movq                              m15, [%2q]          ; coeffs 0-7
+    punpcklwd                         m15, m15
+%endif
+    pshufd                            m12, m15, q0000
+    pshufd                            m13, m15, q1111
+    pshufd                            m14, m15, q2222
+    pshufd                            m15, m15, q3333
+%endif
 %if %0 == 3
     MC_8TAP_SAVE_FILTER     %3, m12, m13, m14, m15
 %endif
-- 
2.52.0

_______________________________________________
ffmpeg-devel mailing list -- [email protected]
To unsubscribe send an email to [email protected]

Reply via email to