ops_int: use sized mov for packed_shuffle output

Niklas Haas via ffmpeg-cvslog Thu, 16 Apr 2026 14:03:08 -0700

This is an automated email from the git hooks/post-receive script.

Git pushed a commit to branch master
in repository ffmpeg.


commit ba516a34cd1cdc46ab35a9440ef2f0891c774bd0
Author:     Niklas Haas <[email protected]>
AuthorDate: Sun Apr 12 18:16:39 2026 +0200
Commit:     Niklas Haas <[email protected]>
CommitDate: Thu Apr 16 20:59:39 2026 +0000

    swscale/x86/ops_int: use sized mov for packed_shuffle output
    
    This code made the input read conditional on the byte count, but not the
    output, leading to a lot of over-write for cases like 15, 5.
    
    Signed-off-by: Niklas Haas <[email protected]>
---
 libswscale/x86/ops.c       | 14 +++++++++-----
 libswscale/x86/ops_int.asm | 20 ++++++++++++--------
 2 files changed, 21 insertions(+), 13 deletions(-)

diff --git a/libswscale/x86/ops.c b/libswscale/x86/ops.c
index 115970c226..a87fa56f53 100644
--- a/libswscale/x86/ops.c
+++ b/libswscale/x86/ops.c
@@ -869,6 +869,13 @@ static bool op_is_type_invariant(const SwsOp *op)
     return false;
 }
 
+static int movsize(const int bytes, const int mmsize)
+{
+    return bytes <= 4 ? 4 : /* movd: up to 4 bytes are moved as 4 */
+           bytes <= 8 ? 8 : /* movq: up to 8 bytes are moved as 8 */
+           mmsize;          /* movu: anything larger moves mmsize bytes */
+}
+
 static int solve_shuffle(const SwsOpList *ops, int mmsize, SwsCompiledOp *out)
 {
     uint8_t shuffle[16];
@@ -888,17 +895,14 @@ static int solve_shuffle(const SwsOpList *ops, int mmsize, SwsCompiledOp *out)
     const int num_lanes = mmsize / 16;
     const int in_total  = num_lanes * read_bytes;
     const int out_total = num_lanes * write_bytes;
-    const int read_size = in_total <= 4 ? 4 : /* movd */
-                          in_total <= 8 ? 8 : /* movq */
-                          mmsize;             /* movu */
 
     *out = (SwsCompiledOp) {
         .priv        = av_memdup(shuffle, sizeof(shuffle)),
         .free        = av_free,
         .slice_align = 1,
         .block_size  = pixels * num_lanes,
-        .over_read   = read_size - in_total,
-        .over_write  = mmsize - out_total,
+        .over_read   = movsize(in_total,  mmsize) - in_total,
+        .over_write  = movsize(out_total, mmsize) - out_total,
         .cpu_flags   = mmsize > 32 ? AV_CPU_FLAG_AVX512 :
                        mmsize > 16 ? AV_CPU_FLAG_AVX2 :
                                      AV_CPU_FLAG_SSE4,
diff --git a/libswscale/x86/ops_int.asm b/libswscale/x86/ops_int.asm
index f28c8c640d..15e0918083 100644
--- a/libswscale/x86/ops_int.asm
+++ b/libswscale/x86/ops_int.asm
@@ -163,6 +163,16 @@ process_fn 4
 ; For the clean multiples (e.g. rgba -> argb), we also define AVX2 and AVX512
 ; versions that can handle a larger number of bytes at once.
 
+%macro MOVSIZE 3 ; size, dst, src -- emits movd if size <= 4, movq if size <= 8, else movu
+    %if %1 <= 4
+        movd %2, %3
+    %elif %1 <= 8
+        movq %2, %3
+    %else
+        movu %2, %3
+    %endif
+%endmacro
+
 %macro packed_shuffle 2 ; size_in, size_out
 cglobal packed_shuffle%1_%2, 6, 10, 2, \
     exec, shuffle, bx, y, bxend, yend, src, dst, src_stride, dst_stride
@@ -185,15 +195,9 @@ cglobal packed_shuffle%1_%2, 6, 10, 2, \
             sub srcq, srcidxq
             sub dstq, dstidxq
 .loop:
-    %if %1 <= 4
-            movd m0, [srcq + srcidxq]
-    %elif %1 <= 8
-            movq m0, [srcq + srcidxq]
-    %else
-            movu m0, [srcq + srcidxq]
-    %endif
+            MOVSIZE %1, m0, [srcq + srcidxq]
             pshufb m0, m1
-            movu [dstq + dstidxq], m0
+            MOVSIZE %2, [dstq + dstidxq], m0
             add srcidxq, %1
 IF %1 != %2,add dstidxq, %2
             jnz .loop

_______________________________________________
ffmpeg-cvslog mailing list -- [email protected]
To unsubscribe send an email to [email protected]

[FFmpeg-cvslog] [ffmpeg] 02/16: swscale/x86/ops_int: use sized mov for packed_shuffle output

Reply via email to