swscale/x86/output: Make xmm functions usable even without aligned stack

Andreas Rheinhardt via ffmpeg-cvslog Mon, 13 Apr 2026 00:22:25 -0700

This is an automated email from the git hooks/post-receive script.

Git pushed a commit to branch master
in repository ffmpeg.


commit 8a7c1f7fb8d70337e736e72cdc1e8edf4981e307
Author:     Andreas Rheinhardt <[email protected]>
AuthorDate: Fri Apr 10 03:07:13 2026 +0200
Commit:     Andreas Rheinhardt <[email protected]>
CommitDate: Mon Apr 13 08:46:44 2026 +0200

    swscale/x86/output: Make xmm functions usable even without aligned stack
    
    x86-32 has one GPR too few for these functions, so one argument
    (fltsize) has to be read from the stack. If the stack needs to be
    realigned, the original location of that argument can no longer
    be accessed, so just request a bit more stack size and copy said
    argument to a fixed offset from the new stack pointer.
    
    Signed-off-by: Andreas Rheinhardt <[email protected]>
---
 libswscale/x86/output.asm | 31 +++++++++++++++++++------------
 libswscale/x86/swscale.c  | 13 +++++--------
 2 files changed, 24 insertions(+), 20 deletions(-)

diff --git a/libswscale/x86/output.asm b/libswscale/x86/output.asm
index 7d87dd919c..bbe15510f8 100644
--- a/libswscale/x86/output.asm
+++ b/libswscale/x86/output.asm
@@ -137,7 +137,11 @@ SECTION .text
     mova            m1, [yuv2yuvX_%1_start]
     mova            m2,  m1
 %endif ; %1 == 8/9/10/16
+%if ARCH_X86_32 && !HAVE_ALIGNED_STACK && (%1 == 8)
+    mov       cntr_reg, [rsp+32]
+%else
     movsx     cntr_reg,  fltsizem
+%endif
 .filterloop_%2_ %+ %%i:
     ; input pixels
     mov             r6, [srcq+gprsize*cntr_reg-2*gprsize]
@@ -233,15 +237,27 @@ SECTION .text
 %define movsx movsxd
 %endif
 
-cglobal yuv2planeX_%1, %3, 8, %2, filter, fltsize, src, dst, w, dither, offset
+%if %1 == 8
+%assign STACK_SIZE ARCH_X86_32*(32+mmsize*!HAVE_ALIGNED_STACK)
+%else
+%assign STACK_SIZE 0
+%endif
+
+cglobal yuv2planeX_%1, %3, 8, %2, -STACK_SIZE, filter, fltsize, src, dst, w, dither, offset
 %if %1 == 8 || %1 == 9 || %1 == 10
     pxor            m6,  m6
 %endif ; %1 == 8/9/10
 
 %if %1 == 8
 %if ARCH_X86_32
-%assign pad 0x2c - (stack_offset & 15)
-    SUB             rsp, pad
+%if !HAVE_ALIGNED_STACK
+    ; For 8-bit content on x86-32 we need the stack for both vector and GP regs.
+    ; If the stack is not suitably aligned, then x86inc aligns it for us, but
+    ; we can then no longer access the original location of fltsize, so copy
+    ; it here at a known offset of rsp.
+    mov       [rsp+32], fltsized
+%endif
+
 %define m_dith m7
 %else ; x86-64
 %define m_dith m9
@@ -304,16 +320,7 @@ cglobal yuv2planeX_%1, %3, 8, %2, filter, fltsize, src, dst, w, dither, offset
     yuv2planeX_mainloop %1, u
 %endif ; mmsize == 8/16
 
-%if %1 == 8
-%if ARCH_X86_32
-    ADD             rsp, pad
-    RET
-%else ; x86-64
     RET
-%endif ; x86-32/64
-%else ; %1 == 9/10/16
-    RET
-%endif ; %1 == 8/9/10/16
 %endmacro
 
 %if ARCH_X86_32 && HAVE_ALIGNED_STACK == 0
diff --git a/libswscale/x86/swscale.c b/libswscale/x86/swscale.c
index 85faf92c56..f3aaa704f6 100644
--- a/libswscale/x86/swscale.c
+++ b/libswscale/x86/swscale.c
@@ -540,12 +540,12 @@ av_cold void ff_sws_init_swscale_x86(SwsInternal *c)
                                     ff_hscale16to19_ ## filtersize ## _ ## opt1; \
     } \
 } while (0)
-#define ASSIGN_VSCALEX_FUNC(vscalefn, opt, do_16_case, condition_8bit) \
+#define ASSIGN_VSCALEX_FUNC(vscalefn, opt, do_16_case) \
 switch(c->dstBpc){ \
     case 16:                          do_16_case;                          break; \
     case 10: if (!isBE(c->opts.dst_format) && !isSemiPlanarYUV(c->opts.dst_format) && !isDataInHighBits(c->opts.dst_format)) vscalefn = ff_yuv2planeX_10_ ## opt; break; \
     case 9:  if (!isBE(c->opts.dst_format)) vscalefn = ff_yuv2planeX_9_  ## opt; break; \
-    case 8: if ((condition_8bit) && !c->use_mmx_vfilter) vscalefn = ff_yuv2planeX_8_  ## opt; break; \
+    case 8: if (!c->use_mmx_vfilter) vscalefn = ff_yuv2planeX_8_  ## opt; break; \
     }
 #define ASSIGN_VSCALE_FUNC(vscalefn, opt) \
     switch(c->dstBpc){ \
@@ -572,8 +572,7 @@ switch(c->dstBpc){ \
     if (EXTERNAL_SSE2(cpu_flags)) {
         ASSIGN_SSE_SCALE_FUNC(c->hyScale, c->hLumFilterSize, sse2, sse2);
         ASSIGN_SSE_SCALE_FUNC(c->hcScale, c->hChrFilterSize, sse2, sse2);
-        ASSIGN_VSCALEX_FUNC(c->yuv2planeX, sse2, ,
-                            HAVE_ALIGNED_STACK || ARCH_X86_64);
+        ASSIGN_VSCALEX_FUNC(c->yuv2planeX, sse2, );
         if (!(c->opts.flags & SWS_ACCURATE_RND))
             ASSIGN_VSCALE_FUNC(c->yuv2plane1, sse2);
 
@@ -622,15 +621,13 @@ switch(c->dstBpc){ \
         ASSIGN_SSE_SCALE_FUNC(c->hyScale, c->hLumFilterSize, sse4, ssse3);
         ASSIGN_SSE_SCALE_FUNC(c->hcScale, c->hChrFilterSize, sse4, ssse3);
         ASSIGN_VSCALEX_FUNC(c->yuv2planeX, sse4,
-                            if (!isBE(c->opts.dst_format)) c->yuv2planeX = ff_yuv2planeX_16_sse4,
-                            HAVE_ALIGNED_STACK || ARCH_X86_64);
+                            if (!isBE(c->opts.dst_format)) c->yuv2planeX = ff_yuv2planeX_16_sse4);
         if (c->dstBpc == 16 && !isBE(c->opts.dst_format) && !(c->opts.flags & SWS_ACCURATE_RND))
             c->yuv2plane1 = ff_yuv2plane1_16_sse4;
     }
 
     if (EXTERNAL_AVX(cpu_flags)) {
-        ASSIGN_VSCALEX_FUNC(c->yuv2planeX, avx, ,
-                            HAVE_ALIGNED_STACK || ARCH_X86_64);
+        ASSIGN_VSCALEX_FUNC(c->yuv2planeX, avx, );
         if (!(c->opts.flags & SWS_ACCURATE_RND))
             ASSIGN_VSCALE_FUNC(c->yuv2plane1, avx);
 

_______________________________________________
ffmpeg-cvslog mailing list -- [email protected]
To unsubscribe send an email to [email protected]

[FFmpeg-cvslog] [ffmpeg] 03/04: swscale/x86/output: Make xmm functions usable even without aligned stack

Reply via email to