[FFmpeg-cvslog] [ffmpeg] 03/05: swscale/aarch64: fix uyvy/yuyv to yuv420p/yuv422p on odd width

Michael Niedermayer via ffmpeg-cvslog Tue, 16 Jun 2026 19:17:43 -0700

This is an automated email from the git hooks/post-receive script.

Git pushed a commit to branch release/8.1
in repository ffmpeg.


commit 86590257a5201061fa1250e2c8cad2b1e593f1d8
Author:     Michael Niedermayer <[email protected]>
AuthorDate: Mon Jun 15 22:12:41 2026 +0200
Commit:     Michael Niedermayer <[email protected]>
CommitDate: Wed Jun 17 04:08:30 2026 +0200

    swscale/aarch64: fix uyvy/yuyv to yuv420p/yuv422p on odd width
    
    interleaved_yuv_to_planar, shared by uyvytoyuv422, uyvytoyuv420,
    yuyvtoyuv422 and yuyvtoyuv420, only handled even widths. The packed
    UYVY/YUYV macroblocks are pixel pairs and the trailing half macroblock of
    an odd width was mishandled:
    
    - the slow path (width <= 31) decrements its pixel counter by two from an
      odd value, so it never reaches zero and the loop runs far past the line,
      overwriting the destination (observed as a crash in checkasm);
    - the fast path (width >= 32) shifts the tail pointers back by width-32 and
      reprocesses an overlapping, misaligned tuple, producing wrong samples and
      dropping the last chroma column.
    
    Process only whole pixel pairs and emit the trailing odd column from a
    per-line epilogue that matches the C reference: for yuv422 one Y, U and V
    sample; for yuv420 the Y of both lines of the pair with the chroma averaged
    across them, and luma only for the final line when the height is odd. The
    empty even part (width 0 or 1) is guarded so the slow path no longer enters
    its run-past loop.
    
    All four variants are now bit-exact with the C reference for even and odd
    widths. Verified with checkasm under qemu-aarch64.
    
    Signed-off-by: Michael Niedermayer <[email protected]>
    (cherry picked from commit a554d0aa8a64673364baf04affb62ce0df219db7)
    Signed-off-by: Michael Niedermayer <[email protected]>
---
 libswscale/aarch64/rgb2rgb_neon.S | 79 +++++++++++++++++++++++++++++++++++++++
 1 file changed, 79 insertions(+)

diff --git a/libswscale/aarch64/rgb2rgb_neon.S b/libswscale/aarch64/rgb2rgb_neon.S
index 665aa4496b..ba2f904879 100644
--- a/libswscale/aarch64/rgb2rgb_neon.S
+++ b/libswscale/aarch64/rgb2rgb_neon.S
@@ -795,11 +795,78 @@ w17 - set to 1 if last line has to be handled separately (odd height)
         add             x2, x2, x7
 .endm
 
+// width is processed in whole pixel pairs; if the original width was odd this
+// emits the trailing column for one line, matching the C reference. w15 holds
+// the odd-width flag, the pointers are at the end of the even part of the line.
+.macro write_last_odd_column src_fmt, dst_fmt, is_final_odd_line=0, y_off=0
+.ifc \dst_fmt, yuv422
+        cbz             w15, 0f
+.ifc \src_fmt, uyvy
+        ldrb            w12, [x3, #1]                     // Y = src[2w-1]
+        strb            w12, [x0]
+        ldrb            w12, [x3]                         // U = src[2w-2]
+        strb            w12, [x1]
+        ldrb            w12, [x3, #2]                     // V = src[2w]
+        strb            w12, [x2]
+.else
+        ldrb            w12, [x3]                         // Y = src[2w-2]
+        strb            w12, [x0]
+        ldrb            w12, [x3, #1]                     // U = src[2w-1]
+        strb            w12, [x1]
+        ldrb            w12, [x3, #3]                     // V = src[2w+1]
+        strb            w12, [x2]
+.endif
+0:
+.endif
+.ifc \dst_fmt, yuv420
+        cbz             w15, 0f
+.if \is_final_odd_line
+        ldrb            w12, [x3, #\y_off]                // luma only; chroma is skipped on the odd last line
+        strb            w12, [x0]
+.else
+.ifc \src_fmt, uyvy
+        ldrb            w12, [x3, #1]                     // Y, top line
+        strb            w12, [x0]
+        ldrb            w12, [x13, #1]                    // Y, bottom line
+        strb            w12, [x10]
+        ldrb            w12, [x3]                         // U = (top + bottom) >> 1
+        ldrb            w14, [x13]
+        add             w12, w12, w14
+        lsr             w12, w12, #1
+        strb            w12, [x1]
+        ldrb            w12, [x3, #2]                     // V = (top + bottom) >> 1
+        ldrb            w14, [x13, #2]
+        add             w12, w12, w14
+        lsr             w12, w12, #1
+        strb            w12, [x2]
+.else
+        ldrb            w12, [x3]                         // Y, top line
+        strb            w12, [x0]
+        ldrb            w12, [x13]                        // Y, bottom line
+        strb            w12, [x10]
+        ldrb            w12, [x3, #1]                     // U = (top + bottom) >> 1
+        ldrb            w14, [x13, #1]
+        add             w12, w12, w14
+        lsr             w12, w12, #1
+        strb            w12, [x1]
+        ldrb            w12, [x3, #3]                     // V = (top + bottom) >> 1
+        ldrb            w14, [x13, #3]
+        add             w12, w12, w14
+        lsr             w12, w12, #1
+        strb            w12, [x2]
+.endif
+.endif
+0:
+.endif
+.endm
+
 .macro interleaved_yuv_to_planar src_fmt, dst_fmt
 function ff_\src_fmt\()to\dst_fmt\()_neon, export=1
         sxtw            x6, w6
         sxtw            x7, w7
         ldrsw           x8, [sp]
+        and             w15, w4, #1                       // odd width: trailing column via epilogue
+        bic             w4, w4, #1                        // process whole pixel pairs
         ands            w11, w4, #~31                     // choose between fast and slow path
 
 .ifc \dst_fmt, yuv420
@@ -825,6 +892,7 @@ function ff_\src_fmt\()to\dst_fmt\()_neon, export=1
         b.ne            2b
         fastpath_shift_back_pointers \src_fmt, \dst_fmt, 0
         fastpath_iteration \src_fmt, \dst_fmt, 0, 0
+        write_last_odd_column \src_fmt, \dst_fmt
         subs            w5, w5, #1
         move_pointers_to_next_line \src_fmt, \dst_fmt
         b.ne            1b
@@ -837,16 +905,24 @@ function ff_\src_fmt\()to\dst_fmt\()_neon, export=1
         b.ne            4b
         fastpath_shift_back_pointers \src_fmt, \dst_fmt, 1
         fastpath_iteration \src_fmt, \dst_fmt, 1, 1
+.ifc \src_fmt, uyvy
+        write_last_odd_column \src_fmt, \dst_fmt, 1, 1
+.else
+        write_last_odd_column \src_fmt, \dst_fmt, 1, 0
+.endif
 3:
 .endif
         ret
 
 6:                                                        // slow path - width is at most 31
         and             w9, w4, #31
+        cbz             w9, 9f                            // even part empty (orig width 0 or 1)
 7:
         subs            w9, w9, #2
         slowpath_iteration \src_fmt, \dst_fmt, 0
         b.ne            7b
+9:
+        write_last_odd_column \src_fmt, \dst_fmt
         subs            w5, w5, #1
         move_pointers_to_next_line \src_fmt, \dst_fmt
         b.ne            6b
@@ -857,10 +933,13 @@ function ff_\src_fmt\()to\dst_fmt\()_neon, export=1
 .ifc \src_fmt, uyvy
         add             x3, x3, #1
 .endif
+        cbz             w9, 10f                           // even part empty (orig width 0 or 1)
 5:
         subs            w9, w9, #2
         slowpath_iteration \src_fmt, \dst_fmt, 1
         b.ne            5b
+10:
+        write_last_odd_column \src_fmt, \dst_fmt, 1, 0
 8:
 .endif
         ret

_______________________________________________
ffmpeg-cvslog mailing list -- [email protected]
To unsubscribe send an email to [email protected]

[FFmpeg-cvslog] [ffmpeg] 03/05: swscale/aarch64: fix uyvy/yuyv to yuv420p/yuv422p on odd width

Reply via email to