yuv2rgb_neon: aggregate 16bpp predicates (PR #23214)

DROO AMOR via ffmpeg-devel Sat, 23 May 2026 15:04:57 -0700

PR #23214 opened by DROO AMOR (DROOdotFOO)
URL: https://code.ffmpeg.org/FFmpeg/FFmpeg/pulls/23214
Patch URL: https://code.ffmpeg.org/FFmpeg/FFmpeg/pulls/23214.patch


Follow-up to 
[`2e142e52ae`](https://code.ffmpeg.org/FFmpeg/FFmpeg/commit/2e142e52ae) 
addressing mstorsjo's inline review comment on #23152 ([comment 
#42007](https://code.ffmpeg.org/FFmpeg/FFmpeg/pulls/23152#issuecomment-42007)):

> This is probably ok for now, but for the future, I wonder if we should 
> aggregate these conditions somewhere, and set something so we could just do a 
> `.if rgb16` or so; this becomes kinda unmaintainable when there's no 
> `.elseifc` and we need to do such a long nested chain of `.else`.

Collapses the six `.ifc` cascades in `yuv2rgb_neon.S` that gate 16bpp behavior 
into a single `set_rgb16_predicates` macro emitting four GAS `.set` symbols:

| symbol    | value                                                             
                     |
| --------- | ----------------------------------------------------- |
| `rgb16`   | 1 for `*565le` and `*555le` outputs; 0 otherwise        |
| `r_first`  | 1 for `rgb*le` (R high); 0 for `bgr*le` (B high)               |
| `gshift`  | `pack_rgb16`'s `g_shr` arg: 2 for 565, 3 for 555         |
| `hshift`  | `pack_rgb16`'s `high_shl` arg: 11 for 565, 10 for 555  |

Call sites become a flat `.if rgb16` gate (five places) plus a 2-way `.if 
r_first` inside `.if rgb16` for the pack dispatch (one place). `.if`/`.endif` 
count drops from 46 to 33. Full detail in the commit message.

## Verification

Pure source-level refactor. Full object disassembly is byte-for-byte identical 
to the pre-refactor build on Apple M1 (clang):

```
Pre-refactor object disassembly MD5:   2a6ac497cabc81849e0c80ec0fde0550
Post-refactor object disassembly MD5:  2a6ac497cabc81849e0c80ec0fde0550
```

Per-function MD5 across all 48 exported functions (16 with `rgb16=1`, 32 with 
`rgb16=0`): every one matches the pre-refactor object.

| Check | Result |
|--------------------------------- | -------- |
| `checkasm --test=sw_yuv2rgb` | 110/110 |
| Full `checkasm` | 7657/7657 |
| `tools/patcheck` | silent (only the usual asm `.else` false positive that 
fires on any `.S` diff, and the "minor change, ignore" changelog note) |

CPU: Apple M1 (`sysctl -n machdep.cpu.brand_string`)


>From 497bb085c6ad55e74c9ff4a008fe22d680390c5b Mon Sep 17 00:00:00 2001
From: DROOdotFOO <[email protected]>
Date: Sat, 23 May 2026 17:53:20 +0200
Subject: [PATCH] swscale/aarch64/yuv2rgb_neon: aggregate 16bpp predicates

The six .ifc cascades that gate 16bpp behavior in yuv2rgb_neon.S
(linesize padding in three load_args macros, d8/d9 save/restore,
main-loop pack dispatch) all branch on the same four output formats.
Aggregate the predicate into four GAS .set symbols emitted once per
declare_func via a new set_rgb16_predicates macro:

  rgb16   - 1 for *565le and *555le outputs; 0 otherwise
  r_first - 1 for rgb*le (R high); 0 for bgr*le (B high)
  gshift  - 2 for 565, 3 for 555 (passed as pack_rgb16's g_shr)
  hshift  - 11 for 565, 10 for 555 (passed as pack_rgb16's high_shl)

Call sites become a flat ".if rgb16" gate (five places) plus a 2-way
".if r_first" inside ".if rgb16" for the pack dispatch (one place).
.if/.endif count drops from 46 to 33; -88/+49 lines net.

Pure source-level refactor in response to review feedback on
2e142e52ae (mstorsjo: "this becomes kinda unmaintainable when there's
no .elseifc"). Verified: full object disassembly is byte-for-byte
identical to the pre-refactor build (MD5 2a6ac497cabc81849e0c80ec0fde0550
both before and after on Apple M1, clang). checkasm --test=sw_yuv2rgb
110/110, full checkasm 7657/7657.

Signed-off-by: DROOdotFOO <[email protected]>
---
 libswscale/aarch64/yuv2rgb_neon.S | 137 +++++++++++-------------------
 1 file changed, 49 insertions(+), 88 deletions(-)

diff --git a/libswscale/aarch64/yuv2rgb_neon.S 
b/libswscale/aarch64/yuv2rgb_neon.S
index cf4b08351a..484d630998 100644
--- a/libswscale/aarch64/yuv2rgb_neon.S
+++ b/libswscale/aarch64/yuv2rgb_neon.S
@@ -63,22 +63,10 @@
         add             w17, w0, w0, lsl #1
         sub             w3, w3, w17                                     // w3 
= linesize  - width * 3 (padding)
   .else
-   .ifc \ofmt,rgb565le
+   .if rgb16
         sub             w3, w3, w0, lsl #1                              // w3 
= linesize  - width * 2 (padding)
    .else
-    .ifc \ofmt,bgr565le
-        sub             w3, w3, w0, lsl #1                              // w3 
= linesize  - width * 2 (padding)
-    .else
-     .ifc \ofmt,rgb555le
-        sub             w3, w3, w0, lsl #1                              // w3 
= linesize  - width * 2 (padding)
-     .else
-      .ifc \ofmt,bgr555le
-        sub             w3, w3, w0, lsl #1                              // w3 
= linesize  - width * 2 (padding)
-      .else
         sub             w3, w3, w0, lsl #2                              // w3 
= linesize  - width * 4 (padding)
-      .endif
-     .endif
-    .endif
    .endif
   .endif
  .endif
@@ -112,22 +100,10 @@
         add             w17, w0, w0, lsl #1
         sub             w3, w3, w17                                     // w3 
= linesize  - width * 3 (padding)
   .else
-   .ifc \ofmt,rgb565le
+   .if rgb16
         sub             w3, w3, w0, lsl #1                              // w3 
= linesize  - width * 2 (padding)
    .else
-    .ifc \ofmt,bgr565le
-        sub             w3, w3, w0, lsl #1                              // w3 
= linesize  - width * 2 (padding)
-    .else
-     .ifc \ofmt,rgb555le
-        sub             w3, w3, w0, lsl #1                              // w3 
= linesize  - width * 2 (padding)
-     .else
-      .ifc \ofmt,bgr555le
-        sub             w3, w3, w0, lsl #1                              // w3 
= linesize  - width * 2 (padding)
-      .else
         sub             w3, w3, w0, lsl #2                              // w3 
= linesize  - width * 4 (padding)
-      .endif
-     .endif
-    .endif
    .endif
   .endif
  .endif
@@ -171,22 +147,10 @@
         add             w17, w0, w0, lsl #1
         sub             w3, w3, w17                                     // w3 
= linesize  - width * 3 (padding)
   .else
-   .ifc \ofmt,rgb565le
+   .if rgb16
         sub             w3, w3, w0, lsl #1                              // w3 
= linesize  - width * 2 (padding)
    .else
-    .ifc \ofmt,bgr565le
-        sub             w3, w3, w0, lsl #1                              // w3 
= linesize  - width * 2 (padding)
-    .else
-     .ifc \ofmt,rgb555le
-        sub             w3, w3, w0, lsl #1                              // w3 
= linesize  - width * 2 (padding)
-     .else
-      .ifc \ofmt,bgr555le
-        sub             w3, w3, w0, lsl #1                              // w3 
= linesize  - width * 2 (padding)
-      .else
         sub             w3, w3, w0, lsl #2                              // w3 
= linesize  - width * 4 (padding)
-      .endif
-     .endif
-    .endif
    .endif
   .endif
  .endif
@@ -278,36 +242,50 @@
         mov             \a2, v29.8b                                     // 
real alpha (next 8 pixels)
 .endm
 
-// The 16bpp output paths use v8/v9 to assemble packed pixels before the
-// final st1. v8/v9 are AAPCS callee-saved (low 64 bits must be preserved),
-// so each function spills d8/d9 to the stack on entry and reloads on exit.
-// Other output formats don't touch v8-v15, so the save/restore is gated.
-.macro save_d8_d9_if_16bpp ofmt
+// Map ofmt to .set predicates: rgb16=1 for the four 16bpp LE ofmts
+// (r_first=1 for rgb*, 0 for bgr*; gshift/hshift = 2/11 for 565,
+// 3/10 for 555), letting sibling macros branch on .if rgb16 instead of
+// repeating a four-way .ifc cascade.
+.macro set_rgb16_predicates ofmt
+        .set rgb16,     0
+        .set r_first,   0
+        .set gshift,    0
+        .set hshift,    0
 .ifc \ofmt,rgb565le
-        stp             d8, d9, [sp, #-0x10]!
+        .set rgb16,     1
+        .set r_first,   1
+        .set gshift,    2
+        .set hshift,   11
 .endif
 .ifc \ofmt,bgr565le
-        stp             d8, d9, [sp, #-0x10]!
+        .set rgb16,     1
+        .set gshift,    2
+        .set hshift,   11
 .endif
 .ifc \ofmt,rgb555le
-        stp             d8, d9, [sp, #-0x10]!
+        .set rgb16,     1
+        .set r_first,   1
+        .set gshift,    3
+        .set hshift,   10
 .endif
 .ifc \ofmt,bgr555le
+        .set rgb16,     1
+        .set gshift,    3
+        .set hshift,   10
+.endif
+.endm
+
+// 16bpp packing uses v8/v9 as the accumulator. AAPCS-64 requires d8/d9
+// callee-saved (low 64 bits of v8/v9); other ofmts don't touch v8-v15,
+// so the spill is gated on rgb16.
+.macro save_d8_d9_if_16bpp
+.if rgb16
         stp             d8, d9, [sp, #-0x10]!
 .endif
 .endm
 
-.macro restore_d8_d9_if_16bpp ofmt
-.ifc \ofmt,rgb565le
-        ldp             d8, d9, [sp], #0x10
-.endif
-.ifc \ofmt,bgr565le
-        ldp             d8, d9, [sp], #0x10
-.endif
-.ifc \ofmt,rgb555le
-        ldp             d8, d9, [sp], #0x10
-.endif
-.ifc \ofmt,bgr555le
+.macro restore_d8_d9_if_16bpp
+.if rgb16
         ldp             d8, d9, [sp], #0x10
 .endif
 .endm
@@ -333,8 +311,9 @@
 
 .macro declare_func ifmt ofmt
 function ff_\ifmt\()_to_\ofmt\()_neon, export=1
+        set_rgb16_predicates \ofmt
         load_args_\ifmt \ofmt
-        save_d8_d9_if_16bpp \ofmt
+        save_d8_d9_if_16bpp
 
         movi            v31.8h, #4, lsl #8                              // 128 
* (1<<3) (loop-invariant)
         movi            v30.8b, #255                                    // 
alpha = 255  (loop-invariant)
@@ -415,39 +394,21 @@ function ff_\ifmt\()_to_\ofmt\()_neon, export=1
         st1             {  v6.8b,  v7.8b }, [x10], #16
         st1             { v18.8b, v19.8b }, [x15], #16
   .else
-   .ifc \ofmt,rgb565le
+   .if rgb16
         compute_rgb     v4.8b,v5.8b,v6.8b, v16.8b,v17.8b,v18.8b
-        // RGB565 LE: (R[7:3] << 11) | (G[7:2] << 5) | B[7:3]
-        pack_rgb16      v8,  v6,  v5,  v4,  2, 11
-        pack_rgb16      v9,  v18, v17, v16, 2, 11
+    .if r_first
+        // rgb*le: (R << hshift) | (G << 5) | B
+        pack_rgb16      v8,  v6,  v5,  v4,  gshift, hshift
+        pack_rgb16      v9,  v18, v17, v16, gshift, hshift
+    .else
+        // bgr*le: (B << hshift) | (G << 5) | R
+        pack_rgb16      v8,  v4,  v5,  v6,  gshift, hshift
+        pack_rgb16      v9,  v16, v17, v18, gshift, hshift
+    .endif
         st1             { v8.8h, v9.8h}, [x2], #32
    .else
-    .ifc \ofmt,bgr565le
-        compute_rgb     v4.8b,v5.8b,v6.8b, v16.8b,v17.8b,v18.8b
-        // BGR565 LE: (B[7:3] << 11) | (G[7:2] << 5) | R[7:3]
-        pack_rgb16      v8,  v4,  v5,  v6,  2, 11
-        pack_rgb16      v9,  v16, v17, v18, 2, 11
-        st1             { v8.8h, v9.8h}, [x2], #32
-    .else
-     .ifc \ofmt,rgb555le
-        compute_rgb     v4.8b,v5.8b,v6.8b, v16.8b,v17.8b,v18.8b
-        // RGB555 LE: (R[7:3] << 10) | (G[7:3] << 5) | B[7:3]
-        pack_rgb16      v8,  v6,  v5,  v4,  3, 10
-        pack_rgb16      v9,  v18, v17, v16, 3, 10
-        st1             { v8.8h, v9.8h}, [x2], #32
-     .else
-      .ifc \ofmt,bgr555le
-        compute_rgb     v4.8b,v5.8b,v6.8b, v16.8b,v17.8b,v18.8b
-        // BGR555 LE: (B[7:3] << 10) | (G[7:3] << 5) | R[7:3]
-        pack_rgb16      v8,  v4,  v5,  v6,  3, 10
-        pack_rgb16      v9,  v16, v17, v18, 3, 10
-        st1             { v8.8h, v9.8h}, [x2], #32
-      .else
         st4             { v4.8b, v5.8b, v6.8b, v7.8b}, [x2], #32
         st4             {v16.8b,v17.8b,v18.8b,v19.8b}, [x2], #32
-      .endif
-     .endif
-    .endif
    .endif
   .endif
  .endif
@@ -464,7 +425,7 @@ function ff_\ifmt\()_to_\ofmt\()_neon, export=1
         subs            w1, w1, #1                                      // 
height -= 1
         b.gt            1b
         mov             w0, w9
-        restore_d8_d9_if_16bpp \ofmt
+        restore_d8_d9_if_16bpp
         ret
 endfunc
 .endm
-- 
2.52.0

_______________________________________________
ffmpeg-devel mailing list -- [email protected]
To unsubscribe send an email to [email protected]

[FFmpeg-devel] [PR] swscale/aarch64/yuv2rgb_neon: aggregate 16bpp predicates (PR #23214)

Reply via email to