This is an automated email from the git hooks/post-receive script. Git pushed a commit to branch release/8.1 in repository ffmpeg.
commit 86590257a5201061fa1250e2c8cad2b1e593f1d8
Author: Michael Niedermayer <[email protected]>
AuthorDate: Mon Jun 15 22:12:41 2026 +0200
Commit: Michael Niedermayer <[email protected]>
CommitDate: Wed Jun 17 04:08:30 2026 +0200

    swscale/aarch64: fix uyvy/yuyv to yuv420p/yuv422p on odd width

    interleaved_yuv_to_planar, shared by uyvytoyuv422, uyvytoyuv420,
    yuyvtoyuv422 and yuyvtoyuv420, only handled even widths. The packed
    UYVY/YUYV macroblocks are pixel pairs and the trailing half macroblock
    of an odd width was mishandled:

    - the slow path (width <= 31) decrements its pixel counter by two from
      an odd value, so it never reaches zero and the loop runs far past the
      line, overwriting the destination (observed as a crash in checkasm);
    - the fast path (width >= 32) shifts the tail pointers back by width-32
      and reprocesses an overlapping, misaligned tuple, producing wrong
      samples and dropping the last chroma column.

    Process only whole pixel pairs and emit the trailing odd column from a
    per-line epilogue that matches the C reference: for yuv422 one Y, U and
    V sample; for yuv420 the Y of both lines of the pair with the chroma
    averaged across them, and luma only for the final line when the height
    is odd. The empty even part (width 0 or 1) is guarded so the slow path
    no longer enters its run-past loop.

    All four variants are now bit-exact with the C reference for even and
    odd widths. Verified with checkasm under qemu-aarch64.

Signed-off-by: Michael Niedermayer <[email protected]> (cherry picked from commit a554d0aa8a64673364baf04affb62ce0df219db7) Signed-off-by: Michael Niedermayer <[email protected]> --- libswscale/aarch64/rgb2rgb_neon.S | 79 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 79 insertions(+) diff --git a/libswscale/aarch64/rgb2rgb_neon.S b/libswscale/aarch64/rgb2rgb_neon.S index 665aa4496b..ba2f904879 100644 --- a/libswscale/aarch64/rgb2rgb_neon.S +++ b/libswscale/aarch64/rgb2rgb_neon.S @@ -795,11 +795,78 @@ w17 - set to 1 if last line has to be handled separately (odd height) add x2, x2, x7 .endm +// width is processed in whole pixel pairs; if the original width was odd this +// emits the trailing column for one line, matching the C reference. w15 holds +// the odd-width flag, the pointers are at the end of the even part of the line. +.macro write_last_odd_column src_fmt, dst_fmt, is_final_odd_line=0, y_off=0 +.ifc \dst_fmt, yuv422 + cbz w15, 0f +.ifc \src_fmt, uyvy + ldrb w12, [x3, #1] // Y = src[2w-1] + strb w12, [x0] + ldrb w12, [x3] // U = src[2w-2] + strb w12, [x1] + ldrb w12, [x3, #2] // V = src[2w] + strb w12, [x2] +.else + ldrb w12, [x3] // Y = src[2w-2] + strb w12, [x0] + ldrb w12, [x3, #1] // U = src[2w-1] + strb w12, [x1] + ldrb w12, [x3, #3] // V = src[2w+1] + strb w12, [x2] +.endif +0: +.endif +.ifc \dst_fmt, yuv420 + cbz w15, 0f +.if \is_final_odd_line + ldrb w12, [x3, #\y_off] // luma only; chroma is skipped on the odd last line + strb w12, [x0] +.else +.ifc \src_fmt, uyvy + ldrb w12, [x3, #1] // Y, top line + strb w12, [x0] + ldrb w12, [x13, #1] // Y, bottom line + strb w12, [x10] + ldrb w12, [x3] // U = (top + bottom) >> 1 + ldrb w14, [x13] + add w12, w12, w14 + lsr w12, w12, #1 + strb w12, [x1] + ldrb w12, [x3, #2] // V = (top + bottom) >> 1 + ldrb w14, [x13, #2] + add w12, w12, w14 + lsr w12, w12, #1 + strb w12, [x2] +.else + ldrb w12, [x3] // Y, top line + strb w12, [x0] + ldrb w12, [x13] // Y, bottom line + strb w12, [x10] + ldrb w12, [x3, 
#1] // U = (top + bottom) >> 1 + ldrb w14, [x13, #1] + add w12, w12, w14 + lsr w12, w12, #1 + strb w12, [x1] + ldrb w12, [x3, #3] // V = (top + bottom) >> 1 + ldrb w14, [x13, #3] + add w12, w12, w14 + lsr w12, w12, #1 + strb w12, [x2] +.endif +.endif +0: +.endif +.endm + .macro interleaved_yuv_to_planar src_fmt, dst_fmt function ff_\src_fmt\()to\dst_fmt\()_neon, export=1 sxtw x6, w6 sxtw x7, w7 ldrsw x8, [sp] + and w15, w4, #1 // odd width: trailing column via epilogue + bic w4, w4, #1 // process whole pixel pairs ands w11, w4, #~31 // choose between fast and slow path .ifc \dst_fmt, yuv420 @@ -825,6 +892,7 @@ function ff_\src_fmt\()to\dst_fmt\()_neon, export=1 b.ne 2b fastpath_shift_back_pointers \src_fmt, \dst_fmt, 0 fastpath_iteration \src_fmt, \dst_fmt, 0, 0 + write_last_odd_column \src_fmt, \dst_fmt subs w5, w5, #1 move_pointers_to_next_line \src_fmt, \dst_fmt b.ne 1b @@ -837,16 +905,24 @@ function ff_\src_fmt\()to\dst_fmt\()_neon, export=1 b.ne 4b fastpath_shift_back_pointers \src_fmt, \dst_fmt, 1 fastpath_iteration \src_fmt, \dst_fmt, 1, 1 +.ifc \src_fmt, uyvy + write_last_odd_column \src_fmt, \dst_fmt, 1, 1 +.else + write_last_odd_column \src_fmt, \dst_fmt, 1, 0 +.endif 3: .endif ret 6: // slow path - width is at most 31 and w9, w4, #31 + cbz w9, 9f // even part empty (orig width 0 or 1) 7: subs w9, w9, #2 slowpath_iteration \src_fmt, \dst_fmt, 0 b.ne 7b +9: + write_last_odd_column \src_fmt, \dst_fmt subs w5, w5, #1 move_pointers_to_next_line \src_fmt, \dst_fmt b.ne 6b @@ -857,10 +933,13 @@ function ff_\src_fmt\()to\dst_fmt\()_neon, export=1 .ifc \src_fmt, uyvy add x3, x3, #1 .endif + cbz w9, 10f // even part empty (orig width 0 or 1) 5: subs w9, w9, #2 slowpath_iteration \src_fmt, \dst_fmt, 1 b.ne 5b +10: + write_last_odd_column \src_fmt, \dst_fmt, 1, 0 8: .endif ret _______________________________________________ ffmpeg-cvslog mailing list -- [email protected] To unsubscribe send an email to [email protected]
