yuv2rgb_neon: 2 lines at a time (PR #23272)

DROO AMOR via ffmpeg-devel Fri, 29 May 2026 10:09:38 -0700

PR #23272 opened by DROO AMOR (DROOdotFOO)
URL: https://code.ffmpeg.org/FFmpeg/FFmpeg/pulls/23272
Patch URL: https://code.ffmpeg.org/FFmpeg/FFmpeg/pulls/23272.patch


# swscale/aarch64/yuv2rgb_neon: 2 lines at a time

This is the 2-lines-at-a-time follow-up discussed in #23152.
For vertically-subsampled inputs (nv12, nv21, yuv420p, yuva420p) two output 
rows share a chroma row, so the chroma->RGB offsets are computed once and 
applied to both rows instead of being recomputed per row. 
yuv422p is unchanged (full-height chroma, nothing to reuse).

The mechanism is Ramiro Polla's, from his unsent follow-up to 2e142e52ae.

**Result:** ~14-24% fewer NEON cycles at width=1920 on Apple M1 across the 
converted formats (yuv422p within +-2%).

### Testing

- `checkasm --test=sw_yuv2rgb`: 110/110
- full `checkasm`: 7657/7657
- `make fate`: clean
- `tools/patcheck`: no new warnings

CPU: Apple M1 (`sysctl -n machdep.cpu.brand_string`).

Co-authored-by: Ramiro Polla <[email protected]>


From cd986c2e37b87dfdfcee60c22db24620a9261674 Mon Sep 17 00:00:00 2001
From: DROOdotFOO <[email protected]>
Date: Fri, 29 May 2026 18:43:17 +0200
Subject: [PATCH] swscale/aarch64/yuv2rgb_neon: 2 lines at a time

For the vertically-subsampled inputs (nv12, nv21, yuv420p, yuva420p)
two output rows share one chroma row. Process both rows in the inner
loop and derive the chroma->RGB offsets once, rather than using the
single-row path, which rewinds the chroma pointer and recomputes them
for every row. yuv422p keeps the single-row path; its chroma is full
height, so there is nothing to reuse. yuva420p has full-resolution
alpha, so each of the two rows loads its own alpha.

The mechanism is from Ramiro Polla's unsent follow-up to 2e142e52ae.

NEON cycles, Apple M1, width=1920 (checkasm --bench), single-row ->
two-row:

  | input   | argb       | rgb24      | gbrp       | rgb565le   |
  |---------|------------|------------|------------|------------|
  | yuv420p | 43.6->35.2 | 38.2->30.2 | 36.0->28.3 | 49.3->41.8 |
  | nv12    | 45.3->36.7 | 38.7->30.6 | 38.0->29.8 | 52.8->42.8 |
  | nv21    | 45.4->36.3 | 40.1->30.6 | 38.4->30.1 | 50.7->43.2 |

yuva420p packed output falls in the same range (argb 44.0->35.7); its
rgb24/gbrp/16bpp outputs reuse the yuv420p path. ~14-24% fewer cycles
across the converted formats; yuv422p (unchanged) stays within +-2%.

Verified with checkasm --test=sw_yuv2rgb (110/110) and the full
checkasm regression (7657/7657) on Apple M1.

Co-authored-by: Ramiro Polla <[email protected]>
Signed-off-by: DROOdotFOO <[email protected]>
---
 libswscale/aarch64/yuv2rgb_neon.S | 580 ++++++++++++++++++++++--------
 1 file changed, 434 insertions(+), 146 deletions(-)

diff --git a/libswscale/aarch64/yuv2rgb_neon.S 
b/libswscale/aarch64/yuv2rgb_neon.S
index 484d630998..7602cf428d 100644
--- a/libswscale/aarch64/yuv2rgb_neon.S
+++ b/libswscale/aarch64/yuv2rgb_neon.S
@@ -45,88 +45,6 @@
         sub             w16, w16, w0                                    // w16 
= linesize2 - width     (padding2)
 .endm
 
-.macro load_args_nv12 ofmt
-        ldr             x8,  [sp]                                       // 
table
-        load_yoff_ycoeff 8, 16                                           // 
y_offset, y_coeff
-        ld1             {v1.1d}, [x8]
-        dup             v0.8h, w10
-        dup             v3.8h, w9
-.ifc \ofmt,gbrp
-        load_dst1_dst2  24, 32, 40, 48
-        sub             w3, w3, w0                                      // w3 
= linesize  - width     (padding)
-.else
- .ifc \ofmt,rgb24
-        add             w17, w0, w0, lsl #1
-        sub             w3, w3, w17                                     // w3 
= linesize  - width * 3 (padding)
- .else
-  .ifc \ofmt,bgr24
-        add             w17, w0, w0, lsl #1
-        sub             w3, w3, w17                                     // w3 
= linesize  - width * 3 (padding)
-  .else
-   .if rgb16
-        sub             w3, w3, w0, lsl #1                              // w3 
= linesize  - width * 2 (padding)
-   .else
-        sub             w3, w3, w0, lsl #2                              // w3 
= linesize  - width * 4 (padding)
-   .endif
-  .endif
- .endif
-.endif
-        sub             w5, w5, w0                                      // w5 
= linesizeY - width     (paddingY)
-        sub             w7, w7, w0                                      // w7 
= linesizeC - width     (paddingC)
-        neg             w11, w0
-.endm
-
-.macro load_args_nv21 ofmt
-        load_args_nv12  \ofmt
-.endm
-
-.macro load_args_yuv420p ofmt
-        ldr             x13, [sp]                                       // srcV
-        ldr             w14, [sp, #8]                                   // 
linesizeV
-        ldr             x8,  [sp, #16]                                  // 
table
-        load_yoff_ycoeff 24, 32                                          // 
y_offset, y_coeff
-        ld1             {v1.1d}, [x8]
-        dup             v0.8h, w10
-        dup             v3.8h, w9
-.ifc \ofmt,gbrp
-        load_dst1_dst2  40, 48, 56, 64
-        sub             w3, w3, w0                                      // w3 
= linesize  - width     (padding)
-.else
- .ifc \ofmt,rgb24
-        add             w17, w0, w0, lsl #1
-        sub             w3, w3, w17                                     // w3 
= linesize  - width * 3 (padding)
- .else
-  .ifc \ofmt,bgr24
-        add             w17, w0, w0, lsl #1
-        sub             w3, w3, w17                                     // w3 
= linesize  - width * 3 (padding)
-  .else
-   .if rgb16
-        sub             w3, w3, w0, lsl #1                              // w3 
= linesize  - width * 2 (padding)
-   .else
-        sub             w3, w3, w0, lsl #2                              // w3 
= linesize  - width * 4 (padding)
-   .endif
-  .endif
- .endif
-.endif
-        sub             w5, w5, w0                                      // w5 
= linesizeY - width     (paddingY)
-        sub             w7,  w7,  w0, lsr #1                            // w7  
= linesizeU - width / 2 (paddingU)
-        sub             w14, w14, w0, lsr #1                            // w14 
= linesizeV - width / 2 (paddingV)
-        lsr             w11, w0, #1
-        neg             w11, w11
-.endm
-
-.macro load_args_yuva420p ofmt
-        load_args_yuv420p \ofmt
-#if defined(__APPLE__)
-        ldr             x15, [sp, #32]                                  // srcA
-        ldr             w16, [sp, #40]                                  // 
linesizeA
-#else
-        ldr             x15, [sp, #40]                                  // srcA
-        ldr             w16, [sp, #48]                                  // 
linesizeA
-#endif
-        sub             w16, w16, w0                                    // w16 
= linesizeA - width    (paddingA)
-.endm
-
 .macro load_args_yuv422p ofmt
         ldr             x13, [sp]                                       // srcV
         ldr             w14, [sp, #8]                                   // 
linesizeV
@@ -179,37 +97,10 @@
         ushll           v19.8h, v17.8b, #3
 .endm
 
-.macro load_chroma_yuva420p
-        load_chroma_yuv420p
-.endm
-
 .macro load_chroma_yuv422p
         load_chroma_yuv420p
 .endm
 
-.macro increment_nv12
-        ands            w17, w1, #1
-        csel            w17, w7, w11, ne                                // 
incC = (h & 1) ? paddincC : -width
-        add             x6,  x6, w17, sxtw                              // 
srcC += incC
-.endm
-
-.macro increment_nv21
-        increment_nv12
-.endm
-
-.macro increment_yuv420p
-        ands            w17, w1, #1
-        csel            w17,  w7, w11, ne                               // 
incU = (h & 1) ? paddincU : -width/2
-        add             x6,  x6,  w17, sxtw                             // 
srcU += incU
-        csel            w17, w14, w11, ne                               // 
incV = (h & 1) ? paddincV : -width/2
-        add             x13, x13, w17, sxtw                             // 
srcV += incV
-.endm
-
-.macro increment_yuva420p
-        increment_yuv420p
-        add             x15, x15, w16, sxtw                             // 
srcA += paddingA (every row)
-.endm
-
 .macro increment_yuv422p
         add             x6,  x6,  w7, sxtw                              // 
srcU += incU
         add             x13, x13, w14, sxtw                             // 
srcV += incV
@@ -236,10 +127,144 @@
         mov             \a2, v30.8b
 .endm
 
-.macro compute_rgba_alpha r1 g1 b1 a1 r2 g2 b2 a2
-        compute_rgb     \r1, \g1, \b1, \r2, \g2, \b2
-        mov             \a1, v28.8b                                     // 
real alpha (first 8 pixels)
-        mov             \a2, v29.8b                                     // 
real alpha (next 8 pixels)
+// Chroma-preserving variant of compute_rgb for the 2-lines-at-a-time
+// path: the per-luma sums are written into the destination registers
+// instead of v20-v25, so the chroma contributions in v20-v25 survive to
+// be reused for the second luma row. Args are bare reg names (e.g. v5).
+.macro compute_rgb_2l r1 g1 b1 r2 g2 b2
+        add             \r1\().8h, v26.8h, v20.8h                       // Y1 
+ R1
+        add             \r2\().8h, v27.8h, v21.8h                       // Y2 
+ R2
+        add             \g1\().8h, v26.8h, v22.8h                       // Y1 
+ G1
+        add             \g2\().8h, v27.8h, v23.8h                       // Y2 
+ G2
+        add             \b1\().8h, v26.8h, v24.8h                       // Y1 
+ B1
+        add             \b2\().8h, v27.8h, v25.8h                       // Y2 
+ B2
+        sqrshrun        \r1\().8b, \r1\().8h, #1                        // 
clip_u8((Y1 + R1) >> 1)
+        sqrshrun        \r2\().8b, \r2\().8h, #1                        // 
clip_u8((Y2 + R2) >> 1)
+        sqrshrun        \g1\().8b, \g1\().8h, #1                        // 
clip_u8((Y1 + G1) >> 1)
+        sqrshrun        \g2\().8b, \g2\().8h, #1                        // 
clip_u8((Y2 + G2) >> 1)
+        sqrshrun        \b1\().8b, \b1\().8h, #1                        // 
clip_u8((Y1 + B1) >> 1)
+        sqrshrun        \b2\().8b, \b2\().8h, #1                        // 
clip_u8((Y2 + B2) >> 1)
+.endm
+
+// Shared chroma -> RGB offsets for the 2-lines path. Consumes the widened
+// chroma in v18/v19 (set by load_chroma_<ifmt>) and produces the per-channel
+// chroma contributions in v20-v25 (R1,R2,G1,G2,B1,B2). Computed once per
+// pixel column and reused by both luma rows via compute_rgb_2l.
+.macro chroma_to_rgb_offsets
+        sub             v18.8h, v18.8h, v31.8h                          // 
U*(1<<3) - 128*(1<<3)
+        sub             v19.8h, v19.8h, v31.8h                          // 
V*(1<<3) - 128*(1<<3)
+        sqdmulh         v20.8h, v19.8h, v1.h[0]                         // V * 
v2r            (R)
+        sqdmulh         v22.8h, v18.8h, v1.h[1]                         // U * 
u2g
+        sqdmulh         v19.8h, v19.8h, v1.h[2]                         //     
      V * v2g
+        sqdmulh         v24.8h, v18.8h, v1.h[3]                         // U * 
u2b            (B)
+        add             v22.8h, v22.8h, v19.8h                          // U * 
u2g + V * v2g  (G)
+        zip2            v21.8h, v20.8h, v20.8h                          // R2
+        zip1            v20.8h, v20.8h, v20.8h                          // R1
+        zip2            v23.8h, v22.8h, v22.8h                          // G2
+        zip1            v22.8h, v22.8h, v22.8h                          // G1
+        zip2            v25.8h, v24.8h, v24.8h                          // B2
+        zip1            v24.8h, v24.8h, v24.8h                          // B1
+.endm
+
+// Load and scale 16 luma samples from \rsrcY into v26 (Y1) / v27 (Y2),
+// ready for compute_rgb_2l. v0 = y_coeff, v3 = y_offset (loop-invariant).
+.macro load_luma rsrcY
+        ld1             {v2.16b}, [\rsrcY], #16                         // 
load luma
+        ushll           v26.8h, v2.8b,  #3                              // 
Y1*(1<<3)
+        ushll2          v27.8h, v2.16b, #3                              // 
Y2*(1<<3)
+        sub             v26.8h, v26.8h, v3.8h                           // 
Y1*(1<<3) - y_offset
+        sub             v27.8h, v27.8h, v3.8h                           // 
Y2*(1<<3) - y_offset
+        sqdmulh         v26.8h, v26.8h, v0.8h                           // (Y1 
* y_coeff) >> 15
+        sqdmulh         v27.8h, v27.8h, v0.8h                           // (Y2 
* y_coeff) >> 15
+.endm
+
+// Process one output row for the 2-lines path: load 16 luma px from \rsrcY,
+// combine with the shared chroma offsets (v20-v25), and store 16 px in
+// format \ofmt. Packed/16bpp use \rdst0 only; gbrp uses \rdst0/1/2; yuva420p
+// reads per-row alpha from \rsrcA. v20-v25 are preserved for the next row.
+// The .if rgb16 / r_first / gshift / hshift branch below depends on the
+// rgb16 predicates -- the caller MUST run set_rgb16_predicates \ofmt before
+// invoking this macro (every declare_2l_* path does so on entry).
+.macro process_row ifmt, ofmt, rsrcY, rsrcA, rdst0, rdst1, rdst2
+        load_luma       \rsrcY
+.ifc \ifmt,yuva420p
+        ld1             {v28.8b, v29.8b}, [\rsrcA], #16                 // 16 
alpha bytes
+.endif
+.ifc \ofmt,argb // a r g b
+        compute_rgb_2l  v5, v6, v7, v17, v18, v19
+ .ifc \ifmt,yuva420p
+        mov             v4.8b,  v28.8b
+        mov             v16.8b, v29.8b
+ .else
+        mov             v4.8b,  v30.8b
+        mov             v16.8b, v30.8b
+ .endif
+        st4             { v4.8b, v5.8b, v6.8b, v7.8b}, [\rdst0], #32
+        st4             {v16.8b,v17.8b,v18.8b,v19.8b}, [\rdst0], #32
+.endif
+.ifc \ofmt,rgba // r g b a
+        compute_rgb_2l  v4, v5, v6, v16, v17, v18
+ .ifc \ifmt,yuva420p
+        mov             v7.8b,  v28.8b
+        mov             v19.8b, v29.8b
+ .else
+        mov             v7.8b,  v30.8b
+        mov             v19.8b, v30.8b
+ .endif
+        st4             { v4.8b, v5.8b, v6.8b, v7.8b}, [\rdst0], #32
+        st4             {v16.8b,v17.8b,v18.8b,v19.8b}, [\rdst0], #32
+.endif
+.ifc \ofmt,abgr // a b g r
+        compute_rgb_2l  v7, v6, v5, v19, v18, v17
+ .ifc \ifmt,yuva420p
+        mov             v4.8b,  v28.8b
+        mov             v16.8b, v29.8b
+ .else
+        mov             v4.8b,  v30.8b
+        mov             v16.8b, v30.8b
+ .endif
+        st4             { v4.8b, v5.8b, v6.8b, v7.8b}, [\rdst0], #32
+        st4             {v16.8b,v17.8b,v18.8b,v19.8b}, [\rdst0], #32
+.endif
+.ifc \ofmt,bgra // b g r a
+        compute_rgb_2l  v6, v5, v4, v18, v17, v16
+ .ifc \ifmt,yuva420p
+        mov             v7.8b,  v28.8b
+        mov             v19.8b, v29.8b
+ .else
+        mov             v7.8b,  v30.8b
+        mov             v19.8b, v30.8b
+ .endif
+        st4             { v4.8b, v5.8b, v6.8b, v7.8b}, [\rdst0], #32
+        st4             {v16.8b,v17.8b,v18.8b,v19.8b}, [\rdst0], #32
+.endif
+.ifc \ofmt,rgb24
+        compute_rgb_2l  v4, v5, v6, v16, v17, v18
+        st3             { v4.8b, v5.8b, v6.8b}, [\rdst0], #24
+        st3             {v16.8b,v17.8b,v18.8b}, [\rdst0], #24
+.endif
+.ifc \ofmt,bgr24
+        compute_rgb_2l  v6, v5, v4, v18, v17, v16
+        st3             { v4.8b, v5.8b, v6.8b}, [\rdst0], #24
+        st3             {v16.8b,v17.8b,v18.8b}, [\rdst0], #24
+.endif
+.ifc \ofmt,gbrp
+        compute_rgb_2l  v18, v4, v6, v19, v5, v7
+        st1             {  v4.8b,  v5.8b }, [\rdst0], #16
+        st1             {  v6.8b,  v7.8b }, [\rdst1], #16
+        st1             { v18.8b, v19.8b }, [\rdst2], #16
+.endif
+.if rgb16
+        compute_rgb_2l  v4, v5, v6, v16, v17, v18
+ .if r_first
+        pack_rgb16_2l   v8,  v6,  v5,  v4,  gshift, hshift
+        pack_rgb16_2l   v9,  v18, v17, v16, gshift, hshift
+ .else
+        pack_rgb16_2l   v8,  v4,  v5,  v6,  gshift, hshift
+        pack_rgb16_2l   v9,  v16, v17, v18, gshift, hshift
+ .endif
+        st1             { v8.8h, v9.8h}, [\rdst0], #32
+.endif
 .endm
 
 // Map ofmt to .set predicates: rgb16=1 for the four 16bpp LE ofmts
@@ -309,6 +334,23 @@
         sli             \dst\().8h, v23.8h, #\high_shl
 .endm
 
+// As pack_rgb16 but uses v26-v29 as scratch (luma temps, dead after
+// compute_rgb_2l) instead of v20-v23, so the shared chroma contributions
+// in v20-v25 survive for the second luma row. Clobbers v26-v29.
+// NB: v28/v29 also hold the per-row alpha in the yuva420p path, so this is
+// only safe because yuva420p never reaches a 16bpp output (its 16bpp targets
+// are dispatched through the yuv420p path); do not call this for yuva inputs.
+.macro pack_rgb16_2l dst, low_ch, mid_ch, high_ch, g_shr, high_shl
+        ushr            v26.8b, \high_ch\().8b, #3
+        ushr            v27.8b, \mid_ch\().8b,  #\g_shr
+        ushr            v28.8b, \low_ch\().8b,  #3
+        uxtl            \dst\().8h, v28.8b
+        uxtl            v29.8h, v27.8b
+        sli             \dst\().8h, v29.8h, #5
+        uxtl            v29.8h, v26.8b
+        sli             \dst\().8h, v29.8h, #\high_shl
+.endm
+
 .macro declare_func ifmt ofmt
 function ff_\ifmt\()_to_\ofmt\()_neon, export=1
         set_rgb16_predicates \ofmt
@@ -327,9 +369,6 @@ function ff_\ifmt\()_to_\ofmt\()_neon, export=1
         sqdmulh         v20.8h, v19.8h, v1.h[0]                         // V * 
v2r            (R)
         sqdmulh         v22.8h, v18.8h, v1.h[1]                         // U * 
u2g
         ld1             {v2.16b}, [x4], #16                             // 
load luma (interleaved)
-.ifc \ifmt,yuva420p
-        ld1             {v28.8b, v29.8b}, [x15], #16                    // 
load 16 alpha bytes
-.endif
         sqdmulh         v19.8h, v19.8h, v1.h[2]                         //     
      V * v2g
         sqdmulh         v24.8h, v18.8h, v1.h[3]                         // U * 
u2b            (B)
         ushll           v26.8h, v2.8b,  #3                              // 
Y1*(1<<3)
@@ -347,35 +386,19 @@ function ff_\ifmt\()_to_\ofmt\()_neon, export=1
         zip1            v24.8h, v24.8h, v24.8h                          // B1
 
 .ifc \ofmt,argb // 1 2 3 0
- .ifc \ifmt,yuva420p
-        compute_rgba_alpha v5.8b,v6.8b,v7.8b,v4.8b, v17.8b,v18.8b,v19.8b,v16.8b
- .else
         compute_rgba    v5.8b,v6.8b,v7.8b,v4.8b, v17.8b,v18.8b,v19.8b,v16.8b
- .endif
 .endif
 
 .ifc \ofmt,rgba // 0 1 2 3
- .ifc \ifmt,yuva420p
-        compute_rgba_alpha v4.8b,v5.8b,v6.8b,v7.8b, v16.8b,v17.8b,v18.8b,v19.8b
- .else
         compute_rgba    v4.8b,v5.8b,v6.8b,v7.8b, v16.8b,v17.8b,v18.8b,v19.8b
- .endif
 .endif
 
 .ifc \ofmt,abgr // 3 2 1 0
- .ifc \ifmt,yuva420p
-        compute_rgba_alpha v7.8b,v6.8b,v5.8b,v4.8b, v19.8b,v18.8b,v17.8b,v16.8b
- .else
         compute_rgba    v7.8b,v6.8b,v5.8b,v4.8b, v19.8b,v18.8b,v17.8b,v16.8b
- .endif
 .endif
 
 .ifc \ofmt,bgra // 2 1 0 3
- .ifc \ifmt,yuva420p
-        compute_rgba_alpha v6.8b,v5.8b,v4.8b,v7.8b, v18.8b,v17.8b,v16.8b,v19.8b
- .else
         compute_rgba    v6.8b,v5.8b,v4.8b,v7.8b, v18.8b,v17.8b,v16.8b,v19.8b
- .endif
 .endif
 
 .ifc \ofmt,rgb24
@@ -440,9 +463,274 @@ endfunc
         declare_func    \ifmt, bgr24
 .endm
 
-declare_rgb_funcs nv12
-declare_rgb_funcs nv21
-declare_rgb_funcs yuv420p
+// 2-lines-at-a-time path for the single-dst-pointer output formats
+// (argb/rgba/abgr/bgra, rgb24/bgr24, and the four 16bpp LE forms) with
+// vertically-subsampled inputs (nv12/nv21/yuv420p). Two consecutive output
+// rows share one chroma row, so the chroma -> RGB offsets (v20-v25) are
+// computed once and applied to both luma rows, halving the chroma work
+// relative to the single-row declare_func (which rewinds the chroma pointer
+// to re-derive it per row). ABI is identical to the function it replaces.
+// Co-authored with Ramiro Polla's unsent series (PR #23152, item #5).
+// Uses caller-saved scratch only (x9=srcY2, x12=dst2); 16bpp additionally
+// spills d8/d9 (callee-saved) as the legacy path does.
+// Precondition (shared by all declare_2l_* paths): the slice height is even.
+// These converters only run for vertically-subsampled (4:2:0) sources, where
+// an even slice height is a libswscale contract -- a chroma row is shared by
+// two luma rows, so a slice cannot end mid-pair. The dispatch also gates on
+// !(src_h & 1). The single-row declare_func relies on the same invariant for
+// its chroma pairing; the 2-line loop simply consumes it two rows at a time
+// (height -= 2) with no odd-row tail.
+.macro declare_2l_packed ifmt ofmt
+function ff_\ifmt\()_to_\ofmt\()_neon, export=1
+        set_rgb16_predicates \ofmt
+.ifc \ifmt,yuv420p
+        ldr             x13, [sp]                                       // srcV
+        ldr             w14, [sp, #8]                                   // 
linesizeV
+        ldr             x8,  [sp, #16]                                  // 
table
+        load_yoff_ycoeff 24, 32                                          // 
y_offset, y_coeff
+.else
+        ldr             x8,  [sp]                                       // 
table
+        load_yoff_ycoeff 8, 16                                           // 
y_offset, y_coeff
+.endif
+        ld1             {v1.1d}, [x8]
+        dup             v0.8h, w10                                      // 
y_coeff
+        dup             v3.8h, w9                                       // 
y_offset
+        save_d8_d9_if_16bpp
+        add             x9,  x4, w5, sxtw                               // 
srcY2 = srcY + linesizeY
+        add             x12, x2, w3, sxtw                               // 
dst2  = dst  + linesize
+        lsl             w17, w5, #1
+        sub             w5,  w17, w0                                    // 
srcY pair stride = 2*linesizeY - width
+        lsl             w17, w3, #1
+.if rgb16
+        sub             w3,  w17, w0, lsl #1                            // dst 
pair stride = 2*linesize - width*2
+.else
+ .ifc \ofmt,rgb24
+        sub             w3,  w17, w0
+        sub             w3,  w3,  w0, lsl #1                            // dst 
pair stride = 2*linesize - width*3
+ .else
+  .ifc \ofmt,bgr24
+        sub             w3,  w17, w0
+        sub             w3,  w3,  w0, lsl #1                            // dst 
pair stride = 2*linesize - width*3
+  .else
+        sub             w3,  w17, w0, lsl #2                            // dst 
pair stride = 2*linesize - width*4
+  .endif
+ .endif
+.endif
+.ifc \ifmt,yuv420p
+        sub             w7,  w7,  w0, lsr #1                            // 
paddingU = linesizeU - width/2
+        sub             w14, w14, w0, lsr #1                            // 
paddingV = linesizeV - width/2
+.else
+        sub             w7,  w7,  w0                                    // 
paddingC = linesizeC - width
+.endif
+        movi            v31.8h, #4, lsl #8                              // 128 
* (1<<3) (loop-invariant)
+        movi            v30.8b, #255                                    // 
alpha = 255  (loop-invariant)
+        mov             w15, w1                                         // 
save original height (return value)
+1:
+        mov             w8, w0                                          // w8 
= width
+2:
+        load_chroma_\ifmt
+        chroma_to_rgb_offsets
+        process_row     \ifmt, \ofmt, x4, x4, x2,  x2,  x2             // line 
1
+        process_row     \ifmt, \ofmt, x9, x9, x12, x12, x12            // line 
2
+        subs            w8, w8, #16                                     // 
width -= 16
+        b.gt            2b
+        add             x2,  x2,  w3, sxtw                              // dst 
  += pair stride
+        add             x12, x12, w3, sxtw                              // 
dst2  += pair stride
+        add             x4,  x4,  w5, sxtw                              // 
srcY  += pair stride
+        add             x9,  x9,  w5, sxtw                              // 
srcY2 += pair stride
+.ifc \ifmt,yuv420p
+        add             x6,  x6,  w7,  sxtw                             // 
srcU += paddingU
+        add             x13, x13, w14, sxtw                             // 
srcV += paddingV
+.else
+        add             x6,  x6,  w7,  sxtw                             // 
srcC += paddingC
+.endif
+        subs            w1, w1, #2                                      // 
height -= 2
+        b.gt            1b
+        mov             w0, w15
+        restore_d8_d9_if_16bpp
+        ret
+endfunc
+.endm
+
+.macro declare_rgb_funcs_2l_packed ifmt
+        declare_2l_packed \ifmt, argb
+        declare_2l_packed \ifmt, rgba
+        declare_2l_packed \ifmt, abgr
+        declare_2l_packed \ifmt, bgra
+        declare_2l_packed \ifmt, rgb24
+        declare_2l_packed \ifmt, bgr24
+.endm
+
+// 2-lines-at-a-time path for yuva420p -> {argb,rgba,abgr,bgra}. Chroma is
+// vertically subsampled and shared between the two output rows, but the
+// alpha plane is full resolution, so each row loads its own alpha (x15 line
+// 1, x11 line 2). Caller-saved scratch only (x9=srcY2, x11=srcA2, x12=dst2).
+.macro declare_2l_yuva ofmt
+function ff_yuva420p_to_\ofmt\()_neon, export=1
+        set_rgb16_predicates \ofmt                                       // 
rgb16=0 for yuva packed ofmts
+        ldr             x13, [sp]                                       // srcV
+        ldr             w14, [sp, #8]                                   // 
linesizeV
+        ldr             x8,  [sp, #16]                                  // 
table
+        load_yoff_ycoeff 24, 32                                          // 
y_offset, y_coeff
+#if defined(__APPLE__)
+        ldr             x15, [sp, #32]                                  // srcA
+        ldr             w16, [sp, #40]                                  // 
linesizeA
+#else
+        ldr             x15, [sp, #40]                                  // srcA
+        ldr             w16, [sp, #48]                                  // 
linesizeA
+#endif
+        ld1             {v1.1d}, [x8]
+        dup             v0.8h, w10                                      // 
y_coeff
+        dup             v3.8h, w9                                       // 
y_offset
+        mov             w10, w1                                         // 
save original height (return value)
+        add             x9,  x4,  w5,  sxtw                             // 
srcY2 = srcY + linesizeY
+        add             x12, x2,  w3,  sxtw                             // 
dst2  = dst  + linesize
+        add             x11, x15, w16, sxtw                             // 
srcA2 = srcA + linesizeA
+        lsl             w17, w5, #1
+        sub             w5,  w17, w0                                    // 
srcY pair stride = 2*linesizeY - width
+        lsl             w17, w3, #1
+        sub             w3,  w17, w0, lsl #2                            // dst 
pair stride = 2*linesize - width*4
+        lsl             w16, w16, #1
+        sub             w16, w16, w0                                    // 
srcA pair stride = 2*linesizeA - width
+        sub             w7,  w7,  w0, lsr #1                            // 
paddingU = linesizeU - width/2
+        sub             w14, w14, w0, lsr #1                            // 
paddingV = linesizeV - width/2
+        movi            v31.8h, #4, lsl #8                              // 128 
* (1<<3) (loop-invariant)
+        movi            v30.8b, #255                                    // 
alpha = 255 (unused for yuva)
+1:
+        mov             w8, w0                                          // w8 
= width
+2:
+        load_chroma_yuv420p
+        chroma_to_rgb_offsets
+        process_row     yuva420p, \ofmt, x4, x15, x2,  x2,  x2         // line 
1
+        process_row     yuva420p, \ofmt, x9, x11, x12, x12, x12        // line 
2
+        subs            w8, w8, #16                                     // 
width -= 16
+        b.gt            2b
+        add             x2,  x2,  w3,  sxtw                             // dst 
  += pair stride
+        add             x12, x12, w3,  sxtw                             // 
dst2  += pair stride
+        add             x4,  x4,  w5,  sxtw                             // 
srcY  += pair stride
+        add             x9,  x9,  w5,  sxtw                             // 
srcY2 += pair stride
+        add             x15, x15, w16, sxtw                             // 
srcA  += pair stride
+        add             x11, x11, w16, sxtw                             // 
srcA2 += pair stride
+        add             x6,  x6,  w7,  sxtw                             // 
srcU += paddingU
+        add             x13, x13, w14, sxtw                             // 
srcV += paddingV
+        subs            w1, w1, #2                                      // 
height -= 2
+        b.gt            1b
+        mov             w0, w10
+        ret
+endfunc
+.endm
+
+.macro declare_yuva_funcs_2l
+        declare_2l_yuva argb
+        declare_2l_yuva rgba
+        declare_2l_yuva abgr
+        declare_2l_yuva bgra
+.endm
+
+// 2-lines-at-a-time path for {nv12,nv21,yuv420p} -> gbrp (three output
+// planes). Like declare_2l_packed but with three dst pointers per line, so
+// the second-line plane pointers exhaust the caller-saved registers; x19/x20
+// are spilled (AAPCS callee-saved). All stack args are read before the spill
+// so the standard arg offsets apply. Plane line-2 ptrs: x11/x17/x19, srcY2 
x20.
+.macro declare_2l_gbrp ifmt
+function ff_\ifmt\()_to_gbrp_neon, export=1
+        set_rgb16_predicates gbrp                                       // 
rgb16=0 for gbrp
+// y_coeff/y_offset must be consumed into v0/v3 before x10 is reloaded with
+// dst1 (load_yoff_ycoeff leaves y_coeff in w10). All stack args are read
+// before x19/x20 are spilled so the standard arg offsets apply.
+.ifc \ifmt,yuv420p
+        ldr             x13, [sp]                                       // srcV
+        ldr             w14, [sp, #8]                                   // 
linesizeV
+        ldr             x8,  [sp, #16]                                  // 
table
+        load_yoff_ycoeff 24, 32                                          // 
y_offset, y_coeff
+        ld1             {v1.1d}, [x8]
+        dup             v0.8h, w10                                      // 
y_coeff
+        dup             v3.8h, w9                                       // 
y_offset
+#if defined(__APPLE__)
+        ldr             x10, [sp, #32]                                  // dst1
+        ldr             w12, [sp, #40]                                  // 
linesize1
+        ldr             x15, [sp, #48]                                  // dst2
+        ldr             w16, [sp, #56]                                  // 
linesize2
+#else
+        ldr             x10, [sp, #40]
+        ldr             w12, [sp, #48]
+        ldr             x15, [sp, #56]
+        ldr             w16, [sp, #64]
+#endif
+.else
+        ldr             x8,  [sp]                                       // 
table
+        load_yoff_ycoeff 8, 16                                           // 
y_offset, y_coeff
+        ld1             {v1.1d}, [x8]
+        dup             v0.8h, w10                                      // 
y_coeff
+        dup             v3.8h, w9                                       // 
y_offset
+#if defined(__APPLE__)
+        ldr             x10, [sp, #16]                                  // dst1
+        ldr             w12, [sp, #24]                                  // 
linesize1
+        ldr             x15, [sp, #32]                                  // dst2
+        ldr             w16, [sp, #40]                                  // 
linesize2
+#else
+        ldr             x10, [sp, #24]
+        ldr             w12, [sp, #32]
+        ldr             x15, [sp, #40]
+        ldr             w16, [sp, #48]
+#endif
+.endif
+        stp             x19, x20, [sp, #-0x10]!                         // 
callee-saved (line2 planar ptrs)
+        mov             w9,  w1                                         // 
save original height (return value)
+        add             x20, x4,  w5,  sxtw                            // 
srcY2  = srcY  + linesizeY
+        lsl             w8,  w5,  #1
+        sub             w5,  w8,  w0                                   // srcY 
pair stride = 2*linesizeY - width
+        add             x11, x2,  w3,  sxtw                            // 
dst0_2 = dst0  + linesize0
+        lsl             w8,  w3,  #1
+        sub             w3,  w8,  w0                                   // dst0 
pair stride = 2*linesize0 - width
+        add             x17, x10, w12, sxtw                            // 
dst1_2 = dst1  + linesize1
+        lsl             w8,  w12, #1
+        sub             w12, w8,  w0                                   // dst1 
pair stride = 2*linesize1 - width
+        add             x19, x15, w16, sxtw                            // 
dst2_2 = dst2  + linesize2
+        lsl             w8,  w16, #1
+        sub             w16, w8,  w0                                   // dst2 
pair stride = 2*linesize2 - width
+.ifc \ifmt,yuv420p
+        sub             w7,  w7,  w0, lsr #1                           // 
paddingU = linesizeU - width/2
+        sub             w14, w14, w0, lsr #1                           // 
paddingV = linesizeV - width/2
+.else
+        sub             w7,  w7,  w0                                  // 
paddingC = linesizeC - width
+.endif
+        movi            v31.8h, #4, lsl #8                             // 128 
* (1<<3) (loop-invariant)
+1:
+        mov             w8, w0                                         // w8 = 
width
+2:
+        load_chroma_\ifmt
+        chroma_to_rgb_offsets
+        process_row     \ifmt, gbrp, x4,  x4,  x2,  x10, x15          // line 1
+        process_row     \ifmt, gbrp, x20, x20, x11, x17, x19          // line 2
+        subs            w8, w8, #16                                    // 
width -= 16
+        b.gt            2b
+        add             x2,  x2,  w3,  sxtw                            // dst0 
 += pair stride
+        add             x10, x10, w12, sxtw                            // dst1 
 += pair stride
+        add             x15, x15, w16, sxtw                            // dst2 
 += pair stride
+        add             x11, x11, w3,  sxtw                            // 
dst0_2 += pair stride
+        add             x17, x17, w12, sxtw                            // 
dst1_2 += pair stride
+        add             x19, x19, w16, sxtw                            // 
dst2_2 += pair stride
+        add             x4,  x4,  w5,  sxtw                            // srcY 
 += pair stride
+        add             x20, x20, w5,  sxtw                            // 
srcY2 += pair stride
+        add             x6,  x6,  w7,  sxtw                            // 
srcU/srcC += padding
+.ifc \ifmt,yuv420p
+        add             x13, x13, w14, sxtw                            // srcV 
+= paddingV
+.endif
+        subs            w1, w1, #2                                     // 
height -= 2
+        b.gt            1b
+        mov             w0, w9
+        ldp             x19, x20, [sp], #0x10                          // 
restore callee-saved
+        ret
+endfunc
+.endm
+
+declare_rgb_funcs_2l_packed nv12
+        declare_2l_gbrp nv12
+declare_rgb_funcs_2l_packed nv21
+        declare_2l_gbrp nv21
+declare_rgb_funcs_2l_packed yuv420p
+        declare_2l_gbrp yuv420p
 declare_rgb_funcs yuv422p
 
 .macro declare_rgb16_funcs ifmt
@@ -452,16 +740,16 @@ declare_rgb_funcs yuv422p
         declare_func    \ifmt, bgr555le
 .endm
 
-declare_rgb16_funcs nv12
-declare_rgb16_funcs nv21
-declare_rgb16_funcs yuv420p
-declare_rgb16_funcs yuv422p
-
-.macro declare_yuva_funcs ifmt
-        declare_func    \ifmt, argb
-        declare_func    \ifmt, rgba
-        declare_func    \ifmt, abgr
-        declare_func    \ifmt, bgra
+.macro declare_rgb16_funcs_2l ifmt
+        declare_2l_packed \ifmt, rgb565le
+        declare_2l_packed \ifmt, bgr565le
+        declare_2l_packed \ifmt, rgb555le
+        declare_2l_packed \ifmt, bgr555le
 .endm
 
-declare_yuva_funcs yuva420p
+declare_rgb16_funcs_2l nv12
+declare_rgb16_funcs_2l nv21
+declare_rgb16_funcs_2l yuv420p
+declare_rgb16_funcs yuv422p
+
+declare_yuva_funcs_2l
-- 
2.52.0

_______________________________________________
ffmpeg-devel mailing list -- [email protected]
To unsubscribe send an email to [email protected]

[FFmpeg-devel] [PR] swscale/aarch64/yuv2rgb_neon: 2 lines at a time (PR #23272)

Reply via email to