[FFmpeg-devel] [PR] swscale/ops: revive series for splitting planes for disjoint passes (PR #23467)

Niklas Haas via ffmpeg-devel Fri, 12 Jun 2026 09:11:05 -0700

PR #23467 opened by Niklas Haas (haasn)
URL: https://code.ffmpeg.org/FFmpeg/FFmpeg/pulls/23467
Patch URL: https://code.ffmpeg.org/FFmpeg/FFmpeg/pulls/23467.patch


Partially replaces #22293, though I decided to omit the actual allocation 
changes and refcopy commits for now to keep the change manageable.

TODO:
- [ ] I accidentally based this work off an older version of the plane_split 
branch (v2); I need to see if something changed in the meantime in the newer 
version (sws_plane_split_v3)


>From deb2db6542958304b1206074fa4bb1ed10b15946 Mon Sep 17 00:00:00 2001
From: Niklas Haas <[email protected]>
Date: Thu, 8 Jan 2026 13:08:23 +0100
Subject: [PATCH 1/9] swscale/ops: solve for plane dependencies

When output planes are independent of each other, we can try and split them
into separate output paths.

Generates benign diffs that just reflect the new addition, e.g.:

 yuva444p 16x16 -> rgb24 16x16:
   [ u8 +++X] SWS_OP_READ         : 3 elem(s) planar >> 0
-    min: {0 0 0 _}, max: {255 255 255 _}
+    min: {0 0 0 _}, max: {255 255 255 _}, deps: {0_ 1_ 2_ _}
   [ u8 +++X] SWS_OP_CONVERT      : u8 -> f32
-    min: {0 0 0 _}, max: {255 255 255 _}
+    min: {0 0 0 _}, max: {255 255 255 _}, deps: {0_ 1_ 2_ _}
   [f32 ...X] SWS_OP_LINEAR       : matrix3+off3 [...]
-    min: {...}, max: {...}
+    min: {...}, max: {...}, deps: {02_ 012_ 01_ _}
   [f32 ...X] SWS_OP_DITHER       : 16x16 matrix + {0 3 2 -1}
-    min: {...}, max: {...}
+    min: {...}, max: {...}, deps: {02_ 012_ 01_ _}
   [f32 ...X] SWS_OP_MAX          : {0 0 0 _} <= x
-    min: {0 0 0 _}, max: {...}
+    min: {0 0 0 _}, max: {...}, deps: {02_ 012_ 01_ _}
   [f32 ...X] SWS_OP_MIN          : x <= {255 255 255 _}
-    min: {0 0 0 _}, max: {255 255 255 _}
+    min: {0 0 0 _}, max: {255 255 255 _}, deps: {02_ 012_ 01_ _}
   [f32 +++X] SWS_OP_CONVERT      : f32 -> u8
-    min: {0 0 0 _}, max: {255 255 255 _}
+    min: {0 0 0 _}, max: {255 255 255 _}, deps: {02_ 012_ 01_ _}
   [ u8 XXXX] SWS_OP_WRITE        : 3 elem(s) packed >> 0
     (X = unused, z = byteswapped, + = exact, 0 = zero)
  translated micro-ops:
     u8_read_planar_xyz
     u8_to_f32_xyz
     f32_linear_xyz_x0x0x_xxx0x_xx00x
     f32_dither_xyz_0_3_2_16x16
     f32_max_xyz
     f32_min_xyz
     f32_to_u8_xyz
     u8_write_packed_xyz

Sponsored-by: Sovereign Tech Fund
Signed-off-by: Niklas Haas <[email protected]>
---
 libswscale/ops.c            | 49 ++++++++++++++++++++++++++++++++++---
 libswscale/ops.h            |  4 +++
 tests/ref/fate/sws-ops-list |  2 +-
 3 files changed, 51 insertions(+), 4 deletions(-)

diff --git a/libswscale/ops.c b/libswscale/ops.c
index 4afeaabf54..d1691543e6 100644
--- a/libswscale/ops.c
+++ b/libswscale/ops.c
@@ -310,8 +310,10 @@ enum {
 
 static SwsCompFlags merge_comp_flags(SwsCompFlags a, SwsCompFlags b)
 {
-    const SwsCompFlags flags_or  = SWS_COMP_GARBAGE;
     const SwsCompFlags flags_and = SWS_COMP_IDENTITY;
+    const SwsCompFlags flags_or  = SWS_COMP_PLANE0 | SWS_COMP_PLANE1 |
+                                   SWS_COMP_PLANE2 | SWS_COMP_PLANE3 |
+                                   SWS_COMP_GARBAGE;
     return ((a & b) & flags_and) | ((a | b) & flags_or);
 }
 
@@ -392,6 +394,12 @@ void ff_sws_op_list_update_comps(SwsOpList *ops)
                 op->comps.flags[i] = ops->comps_src.flags[idx];
                 op->comps.min[i]   = ops->comps_src.min[idx];
                 op->comps.max[i]   = ops->comps_src.max[idx];
+
+                /* Mark plane dependencies */
+                switch (op->rw.mode) {
+                case SWS_RW_PACKED: op->comps.flags[i] |= SWS_COMP_PLANE0; 
break;
+                case SWS_RW_PLANAR: op->comps.flags[i] |= SWS_COMP_PLANE0 << 
i; break;
+                }
             }
             for (int i = op->rw.elems; i < 4; i++) {
                 op->comps.flags[i] = prev.flags[i];
@@ -840,6 +848,22 @@ static char describe_comp_flags(SwsCompFlags flags)
         return '.';
 }
 
+static void print_comp_planes(AVBPrint *bp, SwsCompFlags flags)
+{
+    if (flags & SWS_COMP_GARBAGE) {
+        av_bprintf(bp, "_");
+        return;
+    }
+
+    int num = 0;
+    for (int i = 0; i < 4; i++) {
+        if (flags & (SWS_COMP_PLANE0 << i))
+            av_bprintf(bp, "%c", '0' + i);
+    }
+    if (!num)
+        av_bprintf(bp, "_");
+}
+
 static void print_q(AVBPrint *bp, const AVRational q)
 {
     if (!q.den) {
@@ -1013,15 +1037,34 @@ void ff_sws_op_list_print(void *log, int lev, int 
lev_extra,
         av_assert0(av_bprint_is_complete(&bp));
         av_log(log, lev, "%s\n", bp.str);
 
-        /* Only print value ranges if any are relevant */
+        /* Only print value ranges / dependencies if any are relevant */
         SwsCompMask range_mask = ff_sws_comp_mask_q4(op->comps.min) |
                                  ff_sws_comp_mask_q4(op->comps.max);
-        if (range_mask & mask) {
+
+        const SwsCompFlags dep_flags = SWS_COMP_PLANE0 | SWS_COMP_PLANE1 |
+                                       SWS_COMP_PLANE2 | SWS_COMP_PLANE3;
+
+        bool has_deps = false;
+        for (int j = 0; j < 4; j++) {
+            if (SWS_OP_NEEDED(op, j) && (op->comps.flags[j] & dep_flags)) {
+                has_deps = true;
+                break;
+            }
+        }
+
+        if ((range_mask & mask) || has_deps) {
             av_bprint_clear(&bp);
             av_bprintf(&bp, "    min: ");
             print_q4(&bp, op->comps.min, mask);
             av_bprintf(&bp, ", max: ");
             print_q4(&bp, op->comps.max, mask);
+            av_bprintf(&bp, ", deps: {");
+            for (int j = 0; j < 4; j++) {
+                if (j)
+                    av_bprintf(&bp, " ");
+                print_comp_planes(&bp, op->comps.flags[j]);
+            }
+            av_bprintf(&bp, "}");
             av_assert0(av_bprint_is_complete(&bp));
             av_log(log, lev_extra, "%s\n", bp.str);
         }
diff --git a/libswscale/ops.h b/libswscale/ops.h
index 65d9d49e60..04ad32782e 100644
--- a/libswscale/ops.h
+++ b/libswscale/ops.h
@@ -74,6 +74,10 @@ typedef enum SwsCompFlags {
     SWS_COMP_EXACT   = 1 << 1, /* value is an exact integer */
     SWS_COMP_ZERO    = 1 << 2, /* known to be a constant zero */
     SWS_COMP_SWAPPED = 1 << 3, /* byte order is swapped */
+    SWS_COMP_PLANE0  = 1 << 4, /* depends on values from plane 0 */
+    SWS_COMP_PLANE1  = 1 << 5, /* depends on values from plane 1 */
+    SWS_COMP_PLANE2  = 1 << 6, /* depends on values from plane 2 */
+    SWS_COMP_PLANE3  = 1 << 7, /* depends on values from plane 3 */
 } SwsCompFlags;
 
 typedef struct SwsComps {
diff --git a/tests/ref/fate/sws-ops-list b/tests/ref/fate/sws-ops-list
index 68a1fc1105..76dd6eded1 100644
--- a/tests/ref/fate/sws-ops-list
+++ b/tests/ref/fate/sws-ops-list
@@ -1 +1 @@
-e2f26cb6df5c11015e613016bb1a004a
+8757b9a2f7eb6b8a56e31a6009622512
-- 
2.52.0


>From 909b8dab7eb691ffbb40afb46d6086f2e02b6f45 Mon Sep 17 00:00:00 2001
From: Niklas Haas <[email protected]>
Date: Thu, 12 Feb 2026 10:50:09 +0100
Subject: [PATCH 2/9] swscale/ops: keep track of copied/cleared components

These represent components which have not (yet) been modified from their
input values (i.e. after a read, or clear). Such components can basically be
passed through via a refcopy (where applicable); the flags also help to
distinguish dissimilar types of planes for plane splitting.

Generates benign diffs like:

 gray 16x16 -> yuv444p 16x16:
-  [ u8 +XXX] SWS_OP_READ         : 1 elem(s) planar >> 0
+  [ u8 =XXX] SWS_OP_READ         : 1 elem(s) planar >> 0
     min: {0 _ _ _}, max: {255 _ _ _}, deps: {0_ _ _ _}
-  [ u8 +XXX] SWS_OP_CONVERT      : u8 -> f32
+  [ u8 =XXX] SWS_OP_CONVERT      : u8 -> f32
     min: {0 _ _ _}, max: {255 _ _ _}, deps: {0_ _ _ _}
   [f32 .XXX] SWS_OP_LINEAR       : luma [[73/85 0 0 0 16] [0 1 0 0 0] [0 0 1 0 
0] [0 0 0 1 0]]
     min: {16 _ _ _}, max: {235 _ _ _}, deps: {0_ _ _ _}
   [f32 .XXX] SWS_OP_DITHER       : 16x16 matrix + {0 -1 -1 -1}
     min: {16.001953 _ _ _}, max: {235.998047 _ _ _}, deps: {0_ _ _ _}
   [f32 +XXX] SWS_OP_CONVERT      : f32 -> u8
     min: {16 _ _ _}, max: {235 _ _ _}, deps: {0_ _ _ _}
-  [ u8 +++X] SWS_OP_CLEAR        : {_ 128 128 _}
+  [ u8 +$$X] SWS_OP_CLEAR        : {_ 128 128 _}
     min: {16 128 128 _}, max: {235 128 128 _}, deps: {0_ _ _ _}
   [ u8 XXXX] SWS_OP_WRITE        : 3 elem(s) planar >> 0
-    (X = unused, z = byteswapped, + = exact, 0 = zero)
+    ('X' unused, 'z' byteswapped, '=' copied, '$' const, '+' integer, '0' zero)
  translated micro-ops:
     u8_read_planar_x
     u8_to_f32_x
     f32_linear_x_x000x
     f32_dither_x_0_16x16
     f32_to_u8_x
     u8_clear_yz_xx
     u8_write_planar_xyz

Sponsored-by: Sovereign Tech Fund
Signed-off-by: Niklas Haas <[email protected]>
---
 libswscale/ops.c            | 90 ++++++++++++++++++++++---------------
 libswscale/ops.h            |  2 +
 tests/ref/fate/sws-ops-list |  2 +-
 3 files changed, 56 insertions(+), 38 deletions(-)

diff --git a/libswscale/ops.c b/libswscale/ops.c
index d1691543e6..65f621e64b 100644
--- a/libswscale/ops.c
+++ b/libswscale/ops.c
@@ -303,11 +303,14 @@ void ff_sws_apply_op_q(const SwsOp *op, AVRational x[4])
     av_unreachable("Invalid operation type!");
 }
 
-/* merge_comp_flags() forms a monoid with SWS_COMP_IDENTITY as the null 
element */
 enum {
-    SWS_COMP_IDENTITY = SWS_COMP_ZERO | SWS_COMP_EXACT,
+    SWS_COMP_IDENTITY = SWS_COMP_ZERO | SWS_COMP_EXACT |
+                        SWS_COMP_COPY | SWS_COMP_CONST,
+
+    SWS_COMP_DIRTY = ~(SWS_COMP_COPY | SWS_COMP_CONST),
 };
 
+/* merge_comp_flags() forms a monoid with SWS_COMP_IDENTITY as the null 
element */
 static SwsCompFlags merge_comp_flags(SwsCompFlags a, SwsCompFlags b)
 {
     const SwsCompFlags flags_and = SWS_COMP_IDENTITY;
@@ -317,29 +320,13 @@ static SwsCompFlags merge_comp_flags(SwsCompFlags a, 
SwsCompFlags b)
     return ((a & b) & flags_and) | ((a | b) & flags_or);
 }
 
-/* Linearly propagate flags per component */
-static void propagate_flags(SwsOp *op, const SwsComps *prev)
-{
-    for (int i = 0; i < 4; i++)
-        op->comps.flags[i] = prev->flags[i];
-}
-
-/* Clear undefined values in dst with src */
-static void clear_undefined_values(AVRational dst[4], const AVRational src[4])
-{
-    for (int i = 0; i < 4; i++) {
-        if (dst[i].den == 0)
-            dst[i] = src[i];
-    }
-}
-
 static void apply_filter_weights(SwsComps *comps, const SwsComps *prev,
                                  const SwsFilterWeights *weights)
 {
     const AVRational posw = { weights->sum_positive, SWS_FILTER_SCALE };
     const AVRational negw = { weights->sum_negative, SWS_FILTER_SCALE };
     for (int i = 0; i < 4; i++) {
-        comps->flags[i] = prev->flags[i];
+        comps->flags[i] = prev->flags[i] & SWS_COMP_DIRTY;
         /* Only point sampling preserves exactness */
         if (weights->filter_size != 1)
             comps->flags[i] &= ~SWS_COMP_EXACT;
@@ -400,7 +387,18 @@ void ff_sws_op_list_update_comps(SwsOpList *ops)
                 case SWS_RW_PACKED: op->comps.flags[i] |= SWS_COMP_PLANE0; 
break;
                 case SWS_RW_PLANAR: op->comps.flags[i] |= SWS_COMP_PLANE0 << 
i; break;
                 }
+
+                /**
+                 * Don't mark packed or fractional reads as a copy, because the
+                 * read operation implicitly unpacks the data into separate
+                 * components. The only case in which op lists involving such
+                 * reads can be refcopies is in the case of a true noop, which
+                 * is already covered by the no-op check.
+                 */
+                if (op->rw.mode == SWS_RW_PLANAR && !op->rw.frac)
+                    op->comps.flags[i] |= SWS_COMP_COPY;
             }
+
             for (int i = op->rw.elems; i < 4; i++) {
                 op->comps.flags[i] = prev.flags[i];
                 op->comps.min[i]   = prev.min[i];
@@ -414,7 +412,7 @@ void ff_sws_op_list_update_comps(SwsOpList *ops)
             break;
         case SWS_OP_SWAP_BYTES:
             for (int i = 0; i < 4; i++) {
-                op->comps.flags[i] = prev.flags[i] ^ SWS_COMP_SWAPPED;
+                op->comps.flags[i] = (prev.flags[i] ^ SWS_COMP_SWAPPED) & 
SWS_COMP_DIRTY;
                 op->comps.min[i]   = prev.min[i];
                 op->comps.max[i]   = prev.max[i];
             }
@@ -422,27 +420,35 @@ void ff_sws_op_list_update_comps(SwsOpList *ops)
         case SWS_OP_WRITE:
             for (int i = 0; i < op->rw.elems; i++)
                 av_assert1(!(prev.flags[i] & SWS_COMP_GARBAGE));
-            av_fallthrough;
+            for (int i = 0; i < 4; i++)
+                op->comps.flags[i] = prev.flags[i];
+            break;
         case SWS_OP_LSHIFT:
         case SWS_OP_RSHIFT:
-            propagate_flags(op, &prev);
+            for (int i = 0; i < 4; i++)
+                op->comps.flags[i] = prev.flags[i] & SWS_COMP_DIRTY;
             break;
         case SWS_OP_MIN:
-            propagate_flags(op, &prev);
-            clear_undefined_values(op->comps.max, op->clamp.limit);
-            break;
-        case SWS_OP_MAX:
-            propagate_flags(op, &prev);
-            clear_undefined_values(op->comps.min, op->clamp.limit);
+        case SWS_OP_MAX: {
+            AVRational *bound = op->op == SWS_OP_MIN ? op->comps.max : 
op->comps.min;
+            for (int i = 0; i < 4; i++) {
+                op->comps.flags[i] = prev.flags[i];
+                if (op->clamp.limit[i].den)
+                    op->comps.flags[i] &= SWS_COMP_DIRTY;
+                if (!bound[i].den) /* reset undefined bounds to known range */
+                    bound[i] = op->clamp.limit[i];
+            }
             break;
+        }
         case SWS_OP_DITHER:
             for (int i = 0; i < 4; i++) {
-                op->comps.min[i] = prev.min[i];
-                op->comps.max[i] = prev.max[i];
+                op->comps.flags[i] = prev.flags[i];
+                op->comps.min[i]   = prev.min[i];
+                op->comps.max[i]   = prev.max[i];
                 if (op->dither.y_offset[i] < 0)
                     continue;
                 /* Strip zero flag because of the nonzero dithering offset */
-                op->comps.flags[i] = prev.flags[i] & ~SWS_COMP_ZERO;
+                op->comps.flags[i] &= ~SWS_COMP_ZERO & SWS_COMP_DIRTY;
                 op->comps.min[i] = av_add_q(op->comps.min[i], op->dither.min);
                 op->comps.max[i] = av_add_q(op->comps.max[i], op->dither.max);
             }
@@ -452,7 +458,7 @@ void ff_sws_op_list_update_comps(SwsOpList *ops)
                 const int pattern = op->pack.pattern[i];
                 if (pattern) {
                     av_assert1(pattern < 32);
-                    op->comps.flags[i] = prev.flags[0];
+                    op->comps.flags[i] = prev.flags[0] & SWS_COMP_DIRTY;
                     op->comps.min[i]   = Q(0);
                     op->comps.max[i]   = Q((1ULL << pattern) - 1);
                 } else
@@ -467,13 +473,13 @@ void ff_sws_op_list_update_comps(SwsOpList *ops)
                 if (i > 0) /* clear remaining comps for sanity */
                     op->comps.flags[i] = SWS_COMP_GARBAGE;
             }
-            op->comps.flags[0] = flags;
+            op->comps.flags[0] = flags & SWS_COMP_DIRTY;
             break;
         }
         case SWS_OP_CLEAR:
             for (int i = 0; i < 4; i++) {
                 if (SWS_COMP_TEST(op->clear.mask, i)) {
-                    op->comps.flags[i] = 0;
+                    op->comps.flags[i] = SWS_COMP_CONST;
                     if (op->clear.value[i].num == 0)
                         op->comps.flags[i] |= SWS_COMP_ZERO;
                     if (op->clear.value[i].den == 1)
@@ -490,6 +496,8 @@ void ff_sws_op_list_update_comps(SwsOpList *ops)
         case SWS_OP_CONVERT:
             for (int i = 0; i < 4; i++) {
                 op->comps.flags[i] = prev.flags[i];
+                if (!(prev.flags[i] & SWS_COMP_EXACT) || op->convert.expand)
+                    op->comps.flags[i] &= SWS_COMP_DIRTY;
                 if (ff_sws_pixel_type_is_int(op->convert.to))
                     op->comps.flags[i] |= SWS_COMP_EXACT;
             }
@@ -498,6 +506,7 @@ void ff_sws_op_list_update_comps(SwsOpList *ops)
             for (int i = 0; i < 4; i++) {
                 SwsCompFlags flags = SWS_COMP_IDENTITY;
                 AVRational min = Q(0), max = Q(0);
+                AVRational sum = Q(0);
                 for (int j = 0; j < 4; j++) {
                     const AVRational k = op->lin.m[i][j];
                     AVRational mink = av_mul_q(prev.min[j], k);
@@ -510,10 +519,13 @@ void ff_sws_op_list_update_comps(SwsOpList *ops)
                             FFSWAP(AVRational, mink, maxk);
                         min = av_add_q(min, mink);
                         max = av_add_q(max, maxk);
+                        if (sum.num || av_cmp_q(k, Q(1)))
+                            flags &= SWS_COMP_DIRTY;
+                        sum = av_add_q(sum, k);
                     }
                 }
                 if (op->lin.m[i][4].num) { /* nonzero offset */
-                    flags &= ~SWS_COMP_ZERO;
+                    flags &= ~SWS_COMP_ZERO & SWS_COMP_DIRTY;
                     if (op->lin.m[i][4].den != 1) /* fractional offset */
                         flags &= ~SWS_COMP_EXACT;
                     min = av_add_q(min, op->lin.m[i][4]);
@@ -526,7 +538,7 @@ void ff_sws_op_list_update_comps(SwsOpList *ops)
             break;
         case SWS_OP_SCALE:
             for (int i = 0; i < 4; i++) {
-                op->comps.flags[i] = prev.flags[i];
+                op->comps.flags[i] = prev.flags[i] & SWS_COMP_DIRTY;
                 if (op->scale.factor.den != 1) /* fractional scale */
                     op->comps.flags[i] &= ~SWS_COMP_EXACT;
                 if (op->scale.factor.num < 0)
@@ -842,6 +854,10 @@ static char describe_comp_flags(SwsCompFlags flags)
         return '0';
     else if (flags & SWS_COMP_SWAPPED)
         return 'z';
+    else if (flags & SWS_COMP_CONST)
+        return '$';
+    else if (flags & SWS_COMP_COPY)
+        return '=';
     else if (flags & SWS_COMP_EXACT)
         return '+';
     else
@@ -1071,7 +1087,7 @@ void ff_sws_op_list_print(void *log, int lev, int 
lev_extra,
 
     }
 
-    av_log(log, lev, "    (X = unused, z = byteswapped, + = exact, 0 = 
zero)\n");
+    av_log(log, lev, "    ('X' unused, 'z' byteswapped, '=' copied, '$' const, 
'+' integer, '0' zero)\n");
 }
 
 #define DUMMY_SIZE 16
diff --git a/libswscale/ops.h b/libswscale/ops.h
index 04ad32782e..c76d9aa19c 100644
--- a/libswscale/ops.h
+++ b/libswscale/ops.h
@@ -78,6 +78,8 @@ typedef enum SwsCompFlags {
     SWS_COMP_PLANE1  = 1 << 5, /* depends on values from plane 1 */
     SWS_COMP_PLANE2  = 1 << 6, /* depends on values from plane 2 */
     SWS_COMP_PLANE3  = 1 << 7, /* depends on values from plane 3 */
+    SWS_COMP_COPY    = 1 << 8, /* value is unmodified from the source plane */
+    SWS_COMP_CONST   = 1 << 9, /* value is a fixed constant */
 } SwsCompFlags;
 
 typedef struct SwsComps {
diff --git a/tests/ref/fate/sws-ops-list b/tests/ref/fate/sws-ops-list
index 76dd6eded1..4d49726526 100644
--- a/tests/ref/fate/sws-ops-list
+++ b/tests/ref/fate/sws-ops-list
@@ -1 +1 @@
-8757b9a2f7eb6b8a56e31a6009622512
+b4842784ed2fe1fb68963f9de4f9f9eb
-- 
2.52.0


>From 15d196f90fe771ec5d58d9f94000c3dffe8906d4 Mon Sep 17 00:00:00 2001
From: Niklas Haas <[email protected]>
Date: Tue, 10 Feb 2026 22:32:09 +0100
Subject: [PATCH 3/9] swscale/graph: add a function to allow reusing output
 buffers

Used for plane splitting, among other things. (e.g. plane passthrough)

Sponsored-by: Sovereign Tech Fund
Signed-off-by: Niklas Haas <[email protected]>
---
 libswscale/graph.c | 18 ++++++++++++++++++
 libswscale/graph.h |  8 ++++++++
 2 files changed, 26 insertions(+)

diff --git a/libswscale/graph.c b/libswscale/graph.c
index f12c493c49..572db622fb 100644
--- a/libswscale/graph.c
+++ b/libswscale/graph.c
@@ -227,6 +227,24 @@ fail:
     return ret;
 }
 
+void ff_sws_pass_link_output(SwsPass *pass1, SwsPass *pass2)
+{
+    if (!pass1 || !pass2)
+        return;
+
+    av_assert0(pass1->format == pass2->format);
+    av_assert0(pass1->width  == pass2->width);
+    av_assert0(pass1->height == pass2->height);
+    SwsPassBuffer *dst = pass1->output, *src = pass2->output;
+
+    dst->width_align = FFMAX(dst->width_align, src->width_align);
+    dst->width_pad   = FFMAX(dst->width_pad,   src->width_pad);
+    av_assert1(dst->width  == src->width);
+    av_assert1(dst->height == src->height);
+
+    av_refstruct_replace(&pass2->output, pass1->output);
+}
+
 static void frame_shift(const SwsFrame *f, const int y, uint8_t *data[4])
 {
     for (int i = 0; i < 4; i++) {
diff --git a/libswscale/graph.h b/libswscale/graph.h
index adf4b19675..0d82da054d 100644
--- a/libswscale/graph.h
+++ b/libswscale/graph.h
@@ -198,6 +198,14 @@ int ff_sws_graph_add_pass(SwsGraph *graph, enum 
AVPixelFormat fmt,
                           void *priv, void (*free)(void *priv),
                           SwsPass **out_pass);
 
+/**
+ * Link the output buffers to a different pass, rather than allocating
+ * new image buffers. This allows reusing the same buffer for multiple passes,
+ * e.g. in the case of in-place passes or partial passes that modify different
+ * planes.
+ **/
+void ff_sws_pass_link_output(SwsPass *pass1, SwsPass *pass2);
+
 /**
  * Remove all passes added since the given index.
  */
-- 
2.52.0


>From 03ba4110a72c1e7152a88ae20f361a367f882538 Mon Sep 17 00:00:00 2001
From: Niklas Haas <[email protected]>
Date: Sat, 10 Jan 2026 12:40:35 +0100
Subject: [PATCH 4/9] swscale/optimizer: add ff_sws_op_list_split()

Can be used to extract a reduced subset of operations affecting only certain
output planes, or e.g. to split an op list into a "memcpy" and a "non-memcpy"
part.

Sponsored-by: Sovereign Tech Fund
Signed-off-by: Niklas Haas <[email protected]>
---
 libswscale/ops.h           | 14 ++++++++
 libswscale/ops_optimizer.c | 72 ++++++++++++++++++++++++++++++++++++++
 2 files changed, 86 insertions(+)

diff --git a/libswscale/ops.h b/libswscale/ops.h
index c76d9aa19c..543965e9d0 100644
--- a/libswscale/ops.h
+++ b/libswscale/ops.h
@@ -386,4 +386,18 @@ int ff_sws_enum_op_lists(SwsContext *ctx, void *opaque,
                          enum AVPixelFormat src_fmt, enum AVPixelFormat 
dst_fmt,
                          int (*cb)(SwsContext *ctx, void *opaque, SwsOpList 
*ops));
 
+/**
+ * Partition an operation list into two sub-lists according to whether or not
+ * a given output write overlaps with `flags`. This can be used to extract e.g.
+ * copied or constant planes, or just reduce to a specific subset of planes.
+ *
+ * @param ops Will be modified in-place to contain all operations that match.
+ * @param out_rest Will be set to a newly allocated list containing the 
residual,
+ *                 or NULL if there is no remainder.
+ * @param flags Flags to match against (any partial match included).
+ *
+ * @return 0 or a negative error code on failure.
+ */
+int ff_sws_op_list_split(SwsOpList *ops, SwsOpList **out_rest, SwsCompFlags 
flags);
+
 #endif
diff --git a/libswscale/ops_optimizer.c b/libswscale/ops_optimizer.c
index 08fef00c63..ffe8b0d1bc 100644
--- a/libswscale/ops_optimizer.c
+++ b/libswscale/ops_optimizer.c
@@ -808,6 +808,78 @@ retry:
     return 0;
 }
 
+static int filter_output_planes(SwsOpList *ops, SwsCompMask planes)
+{
+    SwsSwizzleOp swiz = SWS_SWIZZLE(0, 1, 2, 3);
+    SwsOp *write = &ops->ops[ops->num_ops - 1];
+
+    write->rw.elems = 0;
+    for (int src = 0; src < 4; src++) {
+        if (!SWS_COMP_TEST(planes, src))
+            continue; /* plane not selected */
+        const int dst = write->rw.elems++;
+        av_assert2(src >= dst);
+        swiz.in[dst] = src;
+        FFSWAP(int, ops->plane_dst[dst], ops->plane_dst[src]);
+    }
+
+    /* Insert swizzle to select desired planes */
+    int ret = ff_sws_op_list_insert_at(ops, ops->num_ops - 1, &(SwsOp) {
+        .op      = SWS_OP_SWIZZLE,
+        .type    = write->type,
+        .swizzle = swiz,
+    });
+    if (ret < 0)
+        return ret;
+
+    /* The optimizer will take care of the rest */
+    return ff_sws_op_list_optimize(ops);
+}
+
+int ff_sws_op_list_split(SwsOpList *ops, SwsOpList **out_rest, SwsCompFlags 
flags)
+{
+    if (ops->num_ops < 2) {
+        *out_rest = NULL;
+        return 0;
+    }
+
+    const SwsOp *prev  = &ops->ops[ops->num_ops - 2];
+    const SwsOp *write = &ops->ops[ops->num_ops - 1];
+    if (write->op != SWS_OP_WRITE || write->rw.mode != SWS_RW_PLANAR) {
+        *out_rest = NULL; /* can't split */
+        return 0;
+    }
+
+    SwsCompMask accept = 0, reject = 0;
+    for (int i = 0; i < write->rw.elems; i++) {
+        if (prev->comps.flags[i] & flags) {
+            accept |= SWS_COMP(i);
+        } else {
+            reject |= SWS_COMP(i);
+        }
+    }
+
+    if (!reject || !accept) {
+        *out_rest = NULL;
+        return 0; /* nothing to split */
+    }
+
+    SwsOpList *rest = ff_sws_op_list_duplicate(ops);
+    if (!rest)
+        return AVERROR(ENOMEM);
+
+    int ret;
+    if ((ret = filter_output_planes(ops,  accept)) < 0 ||
+        (ret = filter_output_planes(rest, reject)) < 0)
+    {
+        ff_sws_op_list_free(&rest);
+        return ret;
+    }
+
+    *out_rest = rest;
+    return 0;
+}
+
 int ff_sws_solve_shuffle(const SwsOpList *const ops, uint8_t shuffle[],
                          int size, uint8_t clear_val,
                          int *read_bytes, int *write_bytes)
-- 
2.52.0


>From a90066de702f9a59f577cbcab4e3e21095657375 Mon Sep 17 00:00:00 2001
From: Niklas Haas <[email protected]>
Date: Mon, 2 Mar 2026 00:49:05 +0100
Subject: [PATCH 5/9] swscale/ops_dispatch: don't assume first operation is a
 read

Makes ff_sws_compile_pass() more robust; will be needed for plane splitting.
Besides, it's perfectly valid to have an operation list that starts with
e.g. SWS_OP_CLEAR.

Sponsored-by: Sovereign Tech Fund
Signed-off-by: Niklas Haas <[email protected]>
---
 libswscale/ops_dispatch.c | 16 +++++++---------
 1 file changed, 7 insertions(+), 9 deletions(-)

diff --git a/libswscale/ops_dispatch.c b/libswscale/ops_dispatch.c
index 44248195d7..8e0e306368 100644
--- a/libswscale/ops_dispatch.c
+++ b/libswscale/ops_dispatch.c
@@ -526,9 +526,9 @@ static int compile(SwsGraph *graph, const SwsOpBackend 
*backend,
     const AVPixFmtDescriptor *outdesc = av_pix_fmt_desc_get(dst->format);
     const SwsOp *read  = ff_sws_op_list_input(ops);
     const SwsOp *write = ff_sws_op_list_output(ops);
-    p->planes_in  = ff_sws_rw_op_planes(read);
+    p->planes_in  = read ? ff_sws_rw_op_planes(read) : 0;
     p->planes_out = ff_sws_rw_op_planes(write);
-    p->pixel_bits_in  = rw_pixel_bits(read);
+    p->pixel_bits_in  = read ? rw_pixel_bits(read) : 0;
     p->pixel_bits_out = rw_pixel_bits(write);
     p->exec_base = (SwsOpExec) {
         .width  = dst->width,
@@ -568,8 +568,8 @@ static int compile(SwsGraph *graph, const SwsOpBackend 
*backend,
         p->idx_out[i] = idx;
     }
 
-    const SwsFilterWeights *filter = read->rw.filter.kernel;
-    if (read->rw.filter.op == SWS_OP_FILTER_V) {
+    const SwsFilterWeights *filter = read ? read->rw.filter.kernel : NULL;
+    if (read && read->rw.filter.op == SWS_OP_FILTER_V) {
         p->offsets_y = av_refstruct_ref(filter->offsets);
 
         /* Compute relative pointer bumps for each output line */
@@ -587,7 +587,7 @@ static int compile(SwsGraph *graph, const SwsOpBackend 
*backend,
         }
         bump[filter->dst_size - 1] = 0;
         p->exec_base.in_bump_y = bump;
-    } else if (read->rw.filter.op == SWS_OP_FILTER_H) {
+    } else if (read && read->rw.filter.op == SWS_OP_FILTER_H) {
         /* Compute pixel offset map for each output line */
         const int pixels = FFALIGN(filter->dst_size, p->comp.block_size);
         int32_t *offset = av_malloc_array(pixels, sizeof(*offset));
@@ -647,11 +647,9 @@ int ff_sws_compile_pass(SwsGraph *graph, const 
SwsOpBackend *backend,
         goto out;
     }
 
-    const SwsOp *read  = ff_sws_op_list_input(ops);
     const SwsOp *write = ff_sws_op_list_output(ops);
-    if (!read || !write) {
-        av_log(ctx, AV_LOG_ERROR, "First and last operations must be a read "
-               "and write, respectively.\n");
+    if (!write) {
+        av_log(ctx, AV_LOG_ERROR, "Last operation must be SWS_OP_WRITE.\n");
         ret = AVERROR(EINVAL);
         goto out;
     }
-- 
2.52.0


>From 38aa3a3502fa3d8784df69e3857a7a9f4f49537b Mon Sep 17 00:00:00 2001
From: Niklas Haas <[email protected]>
Date: Fri, 12 Jun 2026 16:10:25 +0200
Subject: [PATCH 6/9] swscale/ops_dispatch: split up ff_sws_compile_pass()

Introduce a new function compile_subpass() to handle the actual splitting
and compilation of a single subpass. This allows us to add another wrapping
loop for higher-level logic to also split up the input ops list into separate
parallel op lists, without the two loops getting needlessly confusing.

Sponsored-by: Sovereign Tech Fund
Signed-off-by: Niklas Haas <[email protected]>
---
 libswscale/ops_dispatch.c | 73 +++++++++++++++++++++++----------------
 1 file changed, 43 insertions(+), 30 deletions(-)

diff --git a/libswscale/ops_dispatch.c b/libswscale/ops_dispatch.c
index 8e0e306368..d3122bb189 100644
--- a/libswscale/ops_dispatch.c
+++ b/libswscale/ops_dispatch.c
@@ -631,6 +631,47 @@ fail:
     return ret;
 }
 
+static int compile_subpass(SwsGraph *graph, const SwsOpBackend *backend,
+                           SwsOpList **ops, SwsPass *input, SwsPass **output)
+{
+    SwsContext *ctx = graph->ctx;
+
+    int ret = compile(graph, backend, *ops, input, output);
+    if (ret != AVERROR(ENOTSUP))
+        return ret;
+
+    av_log(ctx, AV_LOG_DEBUG, "Retrying with separated filter passes.\n");
+    SwsPass *prev = input;
+    bool first = true;
+    while (*ops) {
+        SwsOpList *rest;
+        ret = ff_sws_op_list_subpass(*ops, &rest);
+        if (ret < 0)
+            return ret;
+
+        if (first && !rest) {
+            /* No point in compiling an unsplit pass again */
+            return AVERROR(ENOTSUP);
+        }
+
+        ret = compile(graph, backend, *ops, prev, output ? &prev : NULL);
+        if (ret < 0) {
+            ff_sws_op_list_free(&rest);
+            return ret;
+        }
+
+        ff_sws_op_list_free(ops);
+        first = false;
+        *ops = rest;
+    }
+
+    /* Return last subpass successfully compiled */
+    if (output)
+        *output = prev;
+
+    return 0;
+}
+
 int ff_sws_compile_pass(SwsGraph *graph, const SwsOpBackend *backend,
                         SwsOpList **pops, int flags, SwsPass *input,
                         SwsPass **output)
@@ -662,41 +703,13 @@ int ff_sws_compile_pass(SwsGraph *graph, const 
SwsOpBackend *backend,
         ff_sws_op_list_print(ctx, AV_LOG_DEBUG, AV_LOG_TRACE, ops);
     }
 
-    ret = compile(graph, backend, ops, input, output);
-    if (ret != AVERROR(ENOTSUP))
+    ret = compile_subpass(graph, backend, &ops, input, output);
+    if (ret < 0)
         goto out;
 
-    av_log(ctx, AV_LOG_DEBUG, "Retrying with separated filter passes.\n");
-    SwsPass *prev = input;
-    bool first = true;
-    while (ops) {
-        SwsOpList *rest;
-        ret = ff_sws_op_list_subpass(ops, &rest);
-        if (ret < 0)
-            goto out;
-
-        if (first && !rest) {
-            /* No point in compiling an unsplit pass again */
-            ret = AVERROR(ENOTSUP);
-            goto out;
-        }
-
-        ret = compile(graph, backend, ops, prev, output ? &prev : NULL);
-        if (ret < 0) {
-            ff_sws_op_list_free(&rest);
-            goto out;
-        }
-
-        ff_sws_op_list_free(&ops);
-        first = false;
-        ops = rest;
-    }
-
     if (output) {
-        /* Return last subpass successfully compiled */
         av_log(ctx, AV_LOG_VERBOSE, "Using %d separate passes.\n",
                graph->num_passes - passes_orig);
-        *output = prev;
     }
 
 out:
-- 
2.52.0


>From b30f8e30ae5741518581d473b0a35c9a0cbef4e9 Mon Sep 17 00:00:00 2001
From: Niklas Haas <[email protected]>
Date: Fri, 12 Jun 2026 17:37:39 +0200
Subject: [PATCH 7/9] swscale/ops_dispatch: add option to split const/copied
 subpasses

This already helps performance as-is, but will help performance massively
once we add the ability for the memcpy backend to do a refcopy instead of
an actual copy.

Sponsored-by: Sovereign Tech Fund
Signed-off-by: Niklas Haas <[email protected]>
---
 libswscale/ops.h          |  3 +++
 libswscale/ops_dispatch.c | 27 +++++++++++++++++++++++----
 2 files changed, 26 insertions(+), 4 deletions(-)

diff --git a/libswscale/ops.h b/libswscale/ops.h
index 543965e9d0..0216bdbe51 100644
--- a/libswscale/ops.h
+++ b/libswscale/ops.h
@@ -369,6 +369,9 @@ int ff_sws_op_list_optimize(SwsOpList *ops);
 enum SwsOpCompileFlags {
     /* Automatically optimize the operations when compiling */
     SWS_OP_FLAG_OPTIMIZE = 1 << 0,
+
+    /* Split off copied/cleared planes into separate subpasses */
+    SWS_OP_FLAG_SPLIT_PLANES = 1 << 1,
 };
 
 /**
diff --git a/libswscale/ops_dispatch.c b/libswscale/ops_dispatch.c
index d3122bb189..10969b55ce 100644
--- a/libswscale/ops_dispatch.c
+++ b/libswscale/ops_dispatch.c
@@ -666,8 +666,11 @@ static int compile_subpass(SwsGraph *graph, const 
SwsOpBackend *backend,
     }
 
     /* Return last subpass successfully compiled */
-    if (output)
+    if (output) {
+        /* Re-use any existing allocation from previous sub-passes */
+        ff_sws_pass_link_output(prev, *output);
         *output = prev;
+    }
 
     return 0;
 }
@@ -703,9 +706,25 @@ int ff_sws_compile_pass(SwsGraph *graph, const 
SwsOpBackend *backend,
         ff_sws_op_list_print(ctx, AV_LOG_DEBUG, AV_LOG_TRACE, ops);
     }
 
-    ret = compile_subpass(graph, backend, &ops, input, output);
-    if (ret < 0)
-        goto out;
+    if (output)
+        *output = NULL;
+
+    while (ops) {
+        SwsOpList *rest = NULL;
+
+        if (flags & SWS_OP_FLAG_SPLIT_PLANES) {
+            ret = ff_sws_op_list_split(ops, &rest, SWS_COMP_COPY | 
SWS_COMP_CONST);
+            if (ret < 0)
+                goto out;
+        }
+
+        ret = compile_subpass(graph, backend, &ops, input, output);
+        if (ret < 0)
+            goto out;
+
+        ff_sws_op_list_free(&ops);
+        ops = rest;
+    }
 
     if (output) {
         av_log(ctx, AV_LOG_VERBOSE, "Using %d separate passes.\n",
-- 
2.52.0


>From baa2cc7f4a7c8c1b8ee59e37cbdea52a5c56ee89 Mon Sep 17 00:00:00 2001
From: Niklas Haas <[email protected]>
Date: Fri, 12 Jun 2026 18:00:19 +0200
Subject: [PATCH 8/9] swscale/tests/sws_ops: split passes when printing ops
 lists

This affects a large number of conversions across the board, either:

1. Lifting a constant alpha/chroma clear out from the conversion pass:

 rgb24 16x16 -> yuva444p 16x16:
+  [ u8 $XXX] SWS_OP_CLEAR        : {255 _ _ _}
+    min: {255 _ _ _}, max: {255 _ _ _}, deps: {_ _ _ _}
+  [ u8 XXXX] SWS_OP_WRITE        : 1 elem(s) planar >> 0, via {3}
+    ('X' unused, 'z' byteswapped, '=' copied, '$' const, '+' integer, '0' zero)
+ translated micro-ops:
+    u8_clear_x_1
+    u8_write_planar_x
+ Sub-pass #1:
   [ u8 +++X] SWS_OP_READ         : 3 elem(s) packed >> 0
     min: {0 0 0 _}, max: {255 255 255 _}, deps: {0_ 0_ 0_ _}
   [ u8 +++X] SWS_OP_CONVERT      : u8 -> f32
     min: {0 0 0 _}, max: {255 255 255 _}, deps: {0_ 0_ 0_ _}
   [f32 ...X] SWS_OP_LINEAR       : matrix3+off3 [[0.256788 0.504129 0.097906 0 
16] [-0.148223 -0.290993 112/255 0 128] [112/255 -0.367788 -0.071427 0 128] [0 
0 0 1 0]]
     min: {16 16 16 _}, max: {235 240 240 _}, deps: {0_ 0_ 0_ _}
   [f32 ...X] SWS_OP_DITHER       : 16x16 matrix + {0 3 2 -1}
     min: {16.001953 16.001953 16.001953 _}, max: {235.998047 240.998047 
240.998047 _}, deps: {0_ 0_ 0_ _}
   [f32 +++X] SWS_OP_CONVERT      : f32 -> u8
     min: {16 16 16 _}, max: {235 240 240 _}, deps: {0_ 0_ 0_ _}
-  [ u8 +++$] SWS_OP_CLEAR        : {_ _ _ 255}
-    min: {16 16 16 255}, max: {235 240 240 255}, deps: {0_ 0_ 0_ _}
-  [ u8 XXXX] SWS_OP_WRITE        : 4 elem(s) planar >> 0
+  [ u8 XXXX] SWS_OP_WRITE        : 3 elem(s) planar >> 0
     ('X' unused, 'z' byteswapped, '=' copied, '$' const, '+' integer, '0' zero)

 gray 16x16 -> yuv444p 16x16:
+  [ u8 $$XX] SWS_OP_CLEAR        : {128 128 _ _}
+    min: {128 128 _ _}, max: {128 128 _ _}, deps: {_ _ _ _}
+  [ u8 XXXX] SWS_OP_WRITE        : 2 elem(s) planar >> 0, via {2, 1}
+    ('X' unused, 'z' byteswapped, '=' copied, '$' const, '+' integer, '0' zero)
+ translated micro-ops:
+    u8_clear_xy_xx
+    u8_write_planar_xy
+ Sub-pass #1:
   [ u8 =XXX] SWS_OP_READ         : 1 elem(s) planar >> 0
     min: {0 _ _ _}, max: {255 _ _ _}, deps: {0_ _ _ _}
   [ u8 =XXX] SWS_OP_CONVERT      : u8 -> f32
     min: {0 _ _ _}, max: {255 _ _ _}, deps: {0_ _ _ _}
   [f32 .XXX] SWS_OP_LINEAR       : luma [[73/85 0 0 0 16] [0 1 0 0 0] [0 0 1 0 
0] [0 0 0 1 0]]
     min: {16 _ _ _}, max: {235 _ _ _}, deps: {0_ _ _ _}
   [f32 .XXX] SWS_OP_DITHER       : 16x16 matrix + {0 -1 -1 -1}
     min: {16.001953 _ _ _}, max: {235.998047 _ _ _}, deps: {0_ _ _ _}
   [f32 +XXX] SWS_OP_CONVERT      : f32 -> u8
     min: {16 _ _ _}, max: {235 _ _ _}, deps: {0_ _ _ _}
-  [ u8 +$$X] SWS_OP_CLEAR        : {_ 128 128 _}
-    min: {16 128 128 _}, max: {235 128 128 _}, deps: {0_ _ _ _}
-  [ u8 XXXX] SWS_OP_WRITE        : 3 elem(s) planar >> 0
+  [ u8 XXXX] SWS_OP_WRITE        : 1 elem(s) planar >> 0
     ('X' unused, 'z' byteswapped, '=' copied, '$' const, '+' integer, '0' zero)
  translated micro-ops:
     u8_read_planar_x
     u8_to_f32_x
     f32_linear_x_x000x
     f32_dither_x_0_16x16
     f32_to_u8_x
-    u8_clear_yz_xx
-    u8_write_planar_xyz
+    u8_write_planar_x

or

2. Passing through a plane that was previously unmodified by an ops chain:

 gbrap 16x16 -> yuva444p 16x16:
-  [ u8 ====] SWS_OP_READ         : 4 elem(s) planar >> 0, via {2, 0, 1, 3}
-    min: {0 0 0 0}, max: {255 255 255 255}, deps: {0_ 1_ 2_ 3_}
-  [ u8 ====] SWS_OP_CONVERT      : u8 -> f32
-    min: {0 0 0 0}, max: {255 255 255 255}, deps: {0_ 1_ 2_ 3_}
-  [f32 ...=] SWS_OP_LINEAR       : matrix3+off3 [[0.256788 0.504129 0.097906 0 
16] [-0.148223 -0.290993 112/255 0 128] [112/255 -0.367788 -0.071427 0 128] [0 
0 0 1 0]]
-    min: {16 16 16 0}, max: {235 240 240 255}, deps: {012_ 012_ 012_ 3_}
-  [f32 ...=] SWS_OP_DITHER       : 16x16 matrix + {0 3 2 -1}
-    min: {16.001953 16.001953 16.001953 0}, max: {235.998047 240.998047 
240.998047 255}, deps: {012_ 012_ 012_ 3_}
-  [f32 +++=] SWS_OP_CONVERT      : f32 -> u8
-    min: {16 16 16 0}, max: {235 240 240 255}, deps: {012_ 012_ 012_ 3_}
-  [ u8 XXXX] SWS_OP_WRITE        : 4 elem(s) planar >> 0
+  [ u8 =XXX] SWS_OP_READ         : 1 elem(s) planar >> 0, via {3}
+    min: {0 _ _ _}, max: {255 _ _ _}, deps: {0_ _ _ _}
+  [ u8 XXXX] SWS_OP_WRITE        : 1 elem(s) planar >> 0, via {3}
     ('X' unused, 'z' byteswapped, '=' copied, '$' const, '+' integer, '0' zero)
  translated micro-ops:
-    u8_read_planar_xyzw
-    u8_to_f32_xyzw
+    u8_read_planar_x
+    u8_write_planar_x
+ Sub-pass #1:
+  [ u8 ===X] SWS_OP_READ         : 3 elem(s) planar >> 0, via {2, 0, 1}
+    min: {0 0 0 _}, max: {255 255 255 _}, deps: {0_ 1_ 2_ _}
+  [ u8 ===X] SWS_OP_CONVERT      : u8 -> f32
+    min: {0 0 0 _}, max: {255 255 255 _}, deps: {0_ 1_ 2_ _}
+  [f32 ...X] SWS_OP_LINEAR       : matrix3+off3 [[0.256788 0.504129 0.097906 0 
16] [-0.148223 -0.290993 112/255 0 128] [112/255 -0.367788 -0.071427 0 128] [0 
0 0 1 0]]
+    min: {16 16 16 _}, max: {235 240 240 _}, deps: {012_ 012_ 012_ _}
+  [f32 ...X] SWS_OP_DITHER       : 16x16 matrix + {0 3 2 -1}
+    min: {16.001953 16.001953 16.001953 _}, max: {235.998047 240.998047 
240.998047 _}, deps: {012_ 012_ 012_ _}
+  [f32 +++X] SWS_OP_CONVERT      : f32 -> u8
+    min: {16 16 16 _}, max: {235 240 240 _}, deps: {012_ 012_ 012_ _}
+  [ u8 XXXX] SWS_OP_WRITE        : 3 elem(s) planar >> 0
+    ('X' unused, 'z' byteswapped, '=' copied, '$' const, '+' integer, '0' zero)
+ translated micro-ops:
+    u8_read_planar_xyz
+    u8_to_f32_xyz
     f32_linear_xyz_xxx0x_xxx0x_xxx0x
     f32_dither_xyz_0_3_2_16x16
-    f32_to_u8_xyzw
-    u8_write_planar_xyzw
+    f32_to_u8_xyz
+    u8_write_planar_xyz

Sponsored-by: Sovereign Tech Fund
Signed-off-by: Niklas Haas <[email protected]>
---
 libswscale/tests/sws_ops.c  | 3 ++-
 tests/ref/fate/sws-ops-list | 2 +-
 2 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/libswscale/tests/sws_ops.c b/libswscale/tests/sws_ops.c
index 5264cd55d1..10d8b14c3c 100644
--- a/libswscale/tests/sws_ops.c
+++ b/libswscale/tests/sws_ops.c
@@ -93,7 +93,8 @@ static int print_passes(SwsContext *ctx, void *graph, 
SwsOpList *ops)
         return AVERROR(ENOMEM);
 
     pass_idx = 0;
-    return ff_sws_compile_pass(graph, &backend_print, &copy, 0, NULL, NULL);
+    return ff_sws_compile_pass(graph, &backend_print, &copy,
+                               SWS_OP_FLAG_SPLIT_PLANES, NULL, NULL);
 }
 static void log_stdout(void *avcl, int level, const char *fmt, va_list vl)
 {
diff --git a/tests/ref/fate/sws-ops-list b/tests/ref/fate/sws-ops-list
index 4d49726526..f34ad8676c 100644
--- a/tests/ref/fate/sws-ops-list
+++ b/tests/ref/fate/sws-ops-list
@@ -1 +1 @@
-b4842784ed2fe1fb68963f9de4f9f9eb
+33489ee970b9fc91bde60a26580ca0f0
-- 
2.52.0


>From 378494cdd7d4a336c558c49f0b9fab6afe57b524 Mon Sep 17 00:00:00 2001
From: Niklas Haas <[email protected]>
Date: Fri, 12 Jun 2026 17:55:13 +0200
Subject: [PATCH 9/9] swscale/uops: split planes when generating ops lists

This updates uops_macros.h and the graph.c implementation in lockstep,
otherwise we'd have an intermediate commit with a bunch of broken formats.

Overall speedup=1.008x faster, min=0.144x max=5.550x

The min/max numbers are mostly measurement noise, but the real speedup for
affected formats is anywhere from 0.9x to around 2x-3x.

It's worth noting that the formats which currently regress do so because
we don't yet refcopy the planes, but I have another series in the works
which will take care of this soon.

Sponsored-by: Sovereign Tech Fund
Signed-off-by: Niklas Haas <[email protected]>
---
 libswscale/graph.c       |  3 ++-
 libswscale/uops.c        |  3 ++-
 libswscale/uops_macros.h | 34 ++++++++++++++--------------------
 3 files changed, 18 insertions(+), 22 deletions(-)

diff --git a/libswscale/graph.c b/libswscale/graph.c
index 572db622fb..01836fe6b9 100644
--- a/libswscale/graph.c
+++ b/libswscale/graph.c
@@ -671,7 +671,8 @@ static int add_ops_convert_pass(SwsGraph *graph, const 
SwsFormat *src,
     av_log(ctx, AV_LOG_DEBUG, "Unoptimized operation list:\n");
     ff_sws_op_list_print(ctx, AV_LOG_DEBUG, AV_LOG_TRACE, ops);
 
-    return ff_sws_compile_pass(graph, NULL, &ops, SWS_OP_FLAG_OPTIMIZE, input, 
output);
+    const int flags = SWS_OP_FLAG_OPTIMIZE | SWS_OP_FLAG_SPLIT_PLANES;
+    return ff_sws_compile_pass(graph, NULL, &ops, flags, input, output);
 #else
     return AVERROR(ENOTSUP);
 #endif
diff --git a/libswscale/uops.c b/libswscale/uops.c
index b73aedb6e1..1f72db79c0 100644
--- a/libswscale/uops.c
+++ b/libswscale/uops.c
@@ -936,7 +936,8 @@ static int register_all_uops(SwsContext *ctx, void *graph, 
SwsOpList *ops)
     if (!copy)
         return AVERROR(ENOMEM);
 
-    return ff_sws_compile_pass(graph, &backend_uops, &copy, 0, NULL, NULL);
+    return ff_sws_compile_pass(graph, &backend_uops, &copy,
+                               SWS_OP_FLAG_SPLIT_PLANES, NULL, NULL);
 }
 
 static const SwsFlags flags[] = {
diff --git a/libswscale/uops_macros.h b/libswscale/uops_macros.h
index f63d046aa3..410bae747c 100644
--- a/libswscale/uops_macros.h
+++ b/libswscale/uops_macros.h
@@ -117,7 +117,6 @@
     MACRO(__VA_ARGS__, u8_permute_xyz_yzx                      , SWS_PIXEL_U8 
, SWS_UOP_PERMUTE         , 0x7, 1, 2, 0, 3) \
     MACRO(__VA_ARGS__, u8_permute_xyz_yzw                      , SWS_PIXEL_U8 
, SWS_UOP_PERMUTE         , 0x7, 1, 2, 3, 0) \
     MACRO(__VA_ARGS__, u8_permute_xyz_zxy                      , SWS_PIXEL_U8 
, SWS_UOP_PERMUTE         , 0x7, 2, 0, 1, 3) \
-    MACRO(__VA_ARGS__, u8_permute_xyz_zwy                      , SWS_PIXEL_U8 
, SWS_UOP_PERMUTE         , 0x7, 2, 3, 1, 0) \
     MACRO(__VA_ARGS__, u8_permute_xyz_wzy                      , SWS_PIXEL_U8 
, SWS_UOP_PERMUTE         , 0x7, 3, 2, 1, 0) \
     MACRO(__VA_ARGS__, u8_permute_w_x                          , SWS_PIXEL_U8 
, SWS_UOP_PERMUTE         , 0x8, 3, 1, 2, 0) \
     MACRO(__VA_ARGS__, u8_permute_yw_wy                        , SWS_PIXEL_U8 
, SWS_UOP_PERMUTE         , 0xa, 0, 3, 2, 1) \
@@ -148,7 +147,6 @@
     MACRO(__VA_ARGS__, u8_permute_xyz_yzx                      , .type = 
SWS_PIXEL_U8 , .uop = SWS_UOP_PERMUTE         , .mask = 0x7, .par.swizzle.in = 
{1, 2, 0, 3}) \
     MACRO(__VA_ARGS__, u8_permute_xyz_yzw                      , .type = 
SWS_PIXEL_U8 , .uop = SWS_UOP_PERMUTE         , .mask = 0x7, .par.swizzle.in = 
{1, 2, 3, 0}) \
     MACRO(__VA_ARGS__, u8_permute_xyz_zxy                      , .type = 
SWS_PIXEL_U8 , .uop = SWS_UOP_PERMUTE         , .mask = 0x7, .par.swizzle.in = 
{2, 0, 1, 3}) \
-    MACRO(__VA_ARGS__, u8_permute_xyz_zwy                      , .type = 
SWS_PIXEL_U8 , .uop = SWS_UOP_PERMUTE         , .mask = 0x7, .par.swizzle.in = 
{2, 3, 1, 0}) \
     MACRO(__VA_ARGS__, u8_permute_xyz_wzy                      , .type = 
SWS_PIXEL_U8 , .uop = SWS_UOP_PERMUTE         , .mask = 0x7, .par.swizzle.in = 
{3, 2, 1, 0}) \
     MACRO(__VA_ARGS__, u8_permute_w_x                          , .type = 
SWS_PIXEL_U8 , .uop = SWS_UOP_PERMUTE         , .mask = 0x8, .par.swizzle.in = 
{3, 1, 2, 0}) \
     MACRO(__VA_ARGS__, u8_permute_yw_wy                        , .type = 
SWS_PIXEL_U8 , .uop = SWS_UOP_PERMUTE         , .mask = 0xa, .par.swizzle.in = 
{0, 3, 2, 1}) \
@@ -187,7 +185,6 @@
     MACRO(__VA_ARGS__, u8_move_yz_xx                           , SWS_PIXEL_U8 
, SWS_UOP_MOVE            , 0x0, 2, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) \
     MACRO(__VA_ARGS__, u8_move_wz_zx                           , SWS_PIXEL_U8 
, SWS_UOP_MOVE            , 0x0, 2, 3, 2, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0) \
     MACRO(__VA_ARGS__, u8_move_xyz_yzw                         , SWS_PIXEL_U8 
, SWS_UOP_MOVE            , 0x0, 3, 0, 1, 2, 0, 0, 0, 1, 2, 3, 0, 0, 0) \
-    MACRO(__VA_ARGS__, u8_move_xzy_zyw                         , SWS_PIXEL_U8 
, SWS_UOP_MOVE            , 0x0, 3, 0, 2, 1, 0, 0, 0, 2, 1, 3, 0, 0, 0) \
     MACRO(__VA_ARGS__, u8_move_yzw_xxx                         , SWS_PIXEL_U8 
, SWS_UOP_MOVE            , 0x0, 3, 1, 2, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0) \
     MACRO(__VA_ARGS__, u8_move_zwy_xyx                         , SWS_PIXEL_U8 
, SWS_UOP_MOVE            , 0x0, 3, 2, 3, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0) \
     MACRO(__VA_ARGS__, u8_move_wyz_yzx                         , SWS_PIXEL_U8 
, SWS_UOP_MOVE            , 0x0, 3, 3, 1, 2, 0, 0, 0, 1, 2, 0, 0, 0, 0) \
@@ -222,7 +219,6 @@
     MACRO(__VA_ARGS__, u8_move_yz_xx                           , .type = 
SWS_PIXEL_U8 , .uop = SWS_UOP_MOVE            , .mask = 0x0, 
.par.move.num_moves = 2, .par.move.dst = {1, 2, 0, 0, 0, 0}, .par.move.src = 
{0, 0, 0, 0, 0, 0}) \
     MACRO(__VA_ARGS__, u8_move_wz_zx                           , .type = 
SWS_PIXEL_U8 , .uop = SWS_UOP_MOVE            , .mask = 0x0, 
.par.move.num_moves = 2, .par.move.dst = {3, 2, 0, 0, 0, 0}, .par.move.src = 
{2, 0, 0, 0, 0, 0}) \
     MACRO(__VA_ARGS__, u8_move_xyz_yzw                         , .type = 
SWS_PIXEL_U8 , .uop = SWS_UOP_MOVE            , .mask = 0x0, 
.par.move.num_moves = 3, .par.move.dst = {0, 1, 2, 0, 0, 0}, .par.move.src = 
{1, 2, 3, 0, 0, 0}) \
-    MACRO(__VA_ARGS__, u8_move_xzy_zyw                         , .type = 
SWS_PIXEL_U8 , .uop = SWS_UOP_MOVE            , .mask = 0x0, 
.par.move.num_moves = 3, .par.move.dst = {0, 2, 1, 0, 0, 0}, .par.move.src = 
{2, 1, 3, 0, 0, 0}) \
     MACRO(__VA_ARGS__, u8_move_yzw_xxx                         , .type = 
SWS_PIXEL_U8 , .uop = SWS_UOP_MOVE            , .mask = 0x0, 
.par.move.num_moves = 3, .par.move.dst = {1, 2, 3, 0, 0, 0}, .par.move.src = 
{0, 0, 0, 0, 0, 0}) \
     MACRO(__VA_ARGS__, u8_move_zwy_xyx                         , .type = 
SWS_PIXEL_U8 , .uop = SWS_UOP_MOVE            , .mask = 0x0, 
.par.move.num_moves = 3, .par.move.dst = {2, 3, 1, 0, 0, 0}, .par.move.src = 
{0, 1, 0, 0, 0, 0}) \
     MACRO(__VA_ARGS__, u8_move_wyz_yzx                         , .type = 
SWS_PIXEL_U8 , .uop = SWS_UOP_MOVE            , .mask = 0x0, 
.par.move.num_moves = 3, .par.move.dst = {3, 1, 2, 0, 0, 0}, .par.move.src = 
{1, 2, 0, 0, 0, 0}) \
@@ -336,14 +332,14 @@
     MACRO(__VA_ARGS__, u8_clear_xy_xx                          , SWS_PIXEL_U8 
, SWS_UOP_CLEAR           , 0x3, 0x00000, 0x00000) \
     MACRO(__VA_ARGS__, u8_clear_xz_xx                          , SWS_PIXEL_U8 
, SWS_UOP_CLEAR           , 0x5, 0x00000, 0x00000) \
     MACRO(__VA_ARGS__, u8_clear_yz_xx                          , SWS_PIXEL_U8 
, SWS_UOP_CLEAR           , 0x6, 0x00000, 0x00000) \
+    MACRO(__VA_ARGS__, u8_clear_xyz_1xx                        , SWS_PIXEL_U8 
, SWS_UOP_CLEAR           , 0x7, 0x00001, 0x00000) \
     MACRO(__VA_ARGS__, u8_clear_w_0                            , SWS_PIXEL_U8 
, SWS_UOP_CLEAR           , 0x8, 0x00000, 0x00008) \
     MACRO(__VA_ARGS__, u8_clear_w_1                            , SWS_PIXEL_U8 
, SWS_UOP_CLEAR           , 0x8, 0x00008, 0x00000) \
     MACRO(__VA_ARGS__, u8_clear_xyw_xx0                        , SWS_PIXEL_U8 
, SWS_UOP_CLEAR           , 0xb, 0x00000, 0x00008) \
     MACRO(__VA_ARGS__, u8_clear_xyw_xx1                        , SWS_PIXEL_U8 
, SWS_UOP_CLEAR           , 0xb, 0x00008, 0x00000) \
     MACRO(__VA_ARGS__, u8_clear_zw_xx                          , SWS_PIXEL_U8 
, SWS_UOP_CLEAR           , 0xc, 0x00000, 0x00000) \
     MACRO(__VA_ARGS__, u8_clear_xzw_1xx                        , SWS_PIXEL_U8 
, SWS_UOP_CLEAR           , 0xd, 0x00001, 0x00000) \
-    MACRO(__VA_ARGS__, u8_clear_xzw_xx1                        , SWS_PIXEL_U8 
, SWS_UOP_CLEAR           , 0xd, 0x00008, 0x00000) \
-    MACRO(__VA_ARGS__, u8_clear_yzw_xx1                        , SWS_PIXEL_U8 
, SWS_UOP_CLEAR           , 0xe, 0x00008, 0x00000)
+    MACRO(__VA_ARGS__, u8_clear_xzw_xx1                        , SWS_PIXEL_U8 
, SWS_UOP_CLEAR           , 0xd, 0x00008, 0x00000)
 #define SWS_FOR_STRUCT_U8_CLEAR(MACRO, ...) \
     MACRO(__VA_ARGS__, u8_clear_x_0                            , .type = 
SWS_PIXEL_U8 , .uop = SWS_UOP_CLEAR           , .mask = 0x1, .par.clear.one = 
0x0, .par.clear.zero = 0x1) \
     MACRO(__VA_ARGS__, u8_clear_x_1                            , .type = 
SWS_PIXEL_U8 , .uop = SWS_UOP_CLEAR           , .mask = 0x1, .par.clear.one = 
0x1, .par.clear.zero = 0x0) \
@@ -351,14 +347,14 @@
     MACRO(__VA_ARGS__, u8_clear_xy_xx                          , .type = 
SWS_PIXEL_U8 , .uop = SWS_UOP_CLEAR           , .mask = 0x3, .par.clear.one = 
0x0, .par.clear.zero = 0x0) \
     MACRO(__VA_ARGS__, u8_clear_xz_xx                          , .type = 
SWS_PIXEL_U8 , .uop = SWS_UOP_CLEAR           , .mask = 0x5, .par.clear.one = 
0x0, .par.clear.zero = 0x0) \
     MACRO(__VA_ARGS__, u8_clear_yz_xx                          , .type = 
SWS_PIXEL_U8 , .uop = SWS_UOP_CLEAR           , .mask = 0x6, .par.clear.one = 
0x0, .par.clear.zero = 0x0) \
+    MACRO(__VA_ARGS__, u8_clear_xyz_1xx                        , .type = 
SWS_PIXEL_U8 , .uop = SWS_UOP_CLEAR           , .mask = 0x7, .par.clear.one = 
0x1, .par.clear.zero = 0x0) \
     MACRO(__VA_ARGS__, u8_clear_w_0                            , .type = 
SWS_PIXEL_U8 , .uop = SWS_UOP_CLEAR           , .mask = 0x8, .par.clear.one = 
0x0, .par.clear.zero = 0x8) \
     MACRO(__VA_ARGS__, u8_clear_w_1                            , .type = 
SWS_PIXEL_U8 , .uop = SWS_UOP_CLEAR           , .mask = 0x8, .par.clear.one = 
0x8, .par.clear.zero = 0x0) \
     MACRO(__VA_ARGS__, u8_clear_xyw_xx0                        , .type = 
SWS_PIXEL_U8 , .uop = SWS_UOP_CLEAR           , .mask = 0xb, .par.clear.one = 
0x0, .par.clear.zero = 0x8) \
     MACRO(__VA_ARGS__, u8_clear_xyw_xx1                        , .type = 
SWS_PIXEL_U8 , .uop = SWS_UOP_CLEAR           , .mask = 0xb, .par.clear.one = 
0x8, .par.clear.zero = 0x0) \
     MACRO(__VA_ARGS__, u8_clear_zw_xx                          , .type = 
SWS_PIXEL_U8 , .uop = SWS_UOP_CLEAR           , .mask = 0xc, .par.clear.one = 
0x0, .par.clear.zero = 0x0) \
     MACRO(__VA_ARGS__, u8_clear_xzw_1xx                        , .type = 
SWS_PIXEL_U8 , .uop = SWS_UOP_CLEAR           , .mask = 0xd, .par.clear.one = 
0x1, .par.clear.zero = 0x0) \
-    MACRO(__VA_ARGS__, u8_clear_xzw_xx1                        , .type = 
SWS_PIXEL_U8 , .uop = SWS_UOP_CLEAR           , .mask = 0xd, .par.clear.one = 
0x8, .par.clear.zero = 0x0) \
-    MACRO(__VA_ARGS__, u8_clear_yzw_xx1                        , .type = 
SWS_PIXEL_U8 , .uop = SWS_UOP_CLEAR           , .mask = 0xe, .par.clear.one = 
0x8, .par.clear.zero = 0x0)
+    MACRO(__VA_ARGS__, u8_clear_xzw_xx1                        , .type = 
SWS_PIXEL_U8 , .uop = SWS_UOP_CLEAR           , .mask = 0xd, .par.clear.one = 
0x8, .par.clear.zero = 0x0)
 #define SWS_FOR_U8_LINEAR(MACRO, ...)
 #define SWS_FOR_STRUCT_U8_LINEAR(MACRO, ...)
 #define SWS_FOR_U8_LINEAR_FMA(MACRO, ...)
@@ -452,7 +448,6 @@
     MACRO(__VA_ARGS__, u16_permute_xyz_yzx                     , 
SWS_PIXEL_U16, SWS_UOP_PERMUTE         , 0x7, 1, 2, 0, 3) \
     MACRO(__VA_ARGS__, u16_permute_xyz_yzw                     , 
SWS_PIXEL_U16, SWS_UOP_PERMUTE         , 0x7, 1, 2, 3, 0) \
     MACRO(__VA_ARGS__, u16_permute_xyz_zxy                     , 
SWS_PIXEL_U16, SWS_UOP_PERMUTE         , 0x7, 2, 0, 1, 3) \
-    MACRO(__VA_ARGS__, u16_permute_xyz_zwy                     , 
SWS_PIXEL_U16, SWS_UOP_PERMUTE         , 0x7, 2, 3, 1, 0) \
     MACRO(__VA_ARGS__, u16_permute_xyz_wzy                     , 
SWS_PIXEL_U16, SWS_UOP_PERMUTE         , 0x7, 3, 2, 1, 0) \
     MACRO(__VA_ARGS__, u16_permute_w_x                         , 
SWS_PIXEL_U16, SWS_UOP_PERMUTE         , 0x8, 3, 1, 2, 0) \
     MACRO(__VA_ARGS__, u16_permute_zw_xz                       , 
SWS_PIXEL_U16, SWS_UOP_PERMUTE         , 0xc, 3, 1, 0, 2) \
@@ -475,7 +470,6 @@
     MACRO(__VA_ARGS__, u16_permute_xyz_yzx                     , .type = 
SWS_PIXEL_U16, .uop = SWS_UOP_PERMUTE         , .mask = 0x7, .par.swizzle.in = 
{1, 2, 0, 3}) \
     MACRO(__VA_ARGS__, u16_permute_xyz_yzw                     , .type = 
SWS_PIXEL_U16, .uop = SWS_UOP_PERMUTE         , .mask = 0x7, .par.swizzle.in = 
{1, 2, 3, 0}) \
     MACRO(__VA_ARGS__, u16_permute_xyz_zxy                     , .type = 
SWS_PIXEL_U16, .uop = SWS_UOP_PERMUTE         , .mask = 0x7, .par.swizzle.in = 
{2, 0, 1, 3}) \
-    MACRO(__VA_ARGS__, u16_permute_xyz_zwy                     , .type = 
SWS_PIXEL_U16, .uop = SWS_UOP_PERMUTE         , .mask = 0x7, .par.swizzle.in = 
{2, 3, 1, 0}) \
     MACRO(__VA_ARGS__, u16_permute_xyz_wzy                     , .type = 
SWS_PIXEL_U16, .uop = SWS_UOP_PERMUTE         , .mask = 0x7, .par.swizzle.in = 
{3, 2, 1, 0}) \
     MACRO(__VA_ARGS__, u16_permute_w_x                         , .type = 
SWS_PIXEL_U16, .uop = SWS_UOP_PERMUTE         , .mask = 0x8, .par.swizzle.in = 
{3, 1, 2, 0}) \
     MACRO(__VA_ARGS__, u16_permute_zw_xz                       , .type = 
SWS_PIXEL_U16, .uop = SWS_UOP_PERMUTE         , .mask = 0xc, .par.swizzle.in = 
{3, 1, 0, 2}) \
@@ -503,7 +497,6 @@
     MACRO(__VA_ARGS__, u16_move_yz_xx                          , 
SWS_PIXEL_U16, SWS_UOP_MOVE            , 0x0, 2, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 
0, 0) \
     MACRO(__VA_ARGS__, u16_move_wz_zx                          , 
SWS_PIXEL_U16, SWS_UOP_MOVE            , 0x0, 2, 3, 2, 0, 0, 0, 0, 2, 0, 0, 0, 
0, 0) \
     MACRO(__VA_ARGS__, u16_move_xyz_yzw                        , 
SWS_PIXEL_U16, SWS_UOP_MOVE            , 0x0, 3, 0, 1, 2, 0, 0, 0, 1, 2, 3, 0, 
0, 0) \
-    MACRO(__VA_ARGS__, u16_move_xzy_zyw                        , 
SWS_PIXEL_U16, SWS_UOP_MOVE            , 0x0, 3, 0, 2, 1, 0, 0, 0, 2, 1, 3, 0, 
0, 0) \
     MACRO(__VA_ARGS__, u16_move_zwy_xyx                        , 
SWS_PIXEL_U16, SWS_UOP_MOVE            , 0x0, 3, 2, 3, 1, 0, 0, 0, 0, 1, 0, 0, 
0, 0) \
     MACRO(__VA_ARGS__, u16_move_wzy_zyx                        , 
SWS_PIXEL_U16, SWS_UOP_MOVE            , 0x0, 3, 3, 2, 1, 0, 0, 0, 2, 1, 0, 0, 
0, 0) \
     MACRO(__VA_ARGS__, u16_move_txy_xyt                        , 
SWS_PIXEL_U16, SWS_UOP_MOVE            , 0x0, 3, -1, 0, 1, 0, 0, 0, 0, 1, -1, 
0, 0, 0) \
@@ -528,7 +521,6 @@
     MACRO(__VA_ARGS__, u16_move_yz_xx                          , .type = 
SWS_PIXEL_U16, .uop = SWS_UOP_MOVE            , .mask = 0x0, 
.par.move.num_moves = 2, .par.move.dst = {1, 2, 0, 0, 0, 0}, .par.move.src = 
{0, 0, 0, 0, 0, 0}) \
     MACRO(__VA_ARGS__, u16_move_wz_zx                          , .type = 
SWS_PIXEL_U16, .uop = SWS_UOP_MOVE            , .mask = 0x0, 
.par.move.num_moves = 2, .par.move.dst = {3, 2, 0, 0, 0, 0}, .par.move.src = 
{2, 0, 0, 0, 0, 0}) \
     MACRO(__VA_ARGS__, u16_move_xyz_yzw                        , .type = 
SWS_PIXEL_U16, .uop = SWS_UOP_MOVE            , .mask = 0x0, 
.par.move.num_moves = 3, .par.move.dst = {0, 1, 2, 0, 0, 0}, .par.move.src = 
{1, 2, 3, 0, 0, 0}) \
-    MACRO(__VA_ARGS__, u16_move_xzy_zyw                        , .type = 
SWS_PIXEL_U16, .uop = SWS_UOP_MOVE            , .mask = 0x0, 
.par.move.num_moves = 3, .par.move.dst = {0, 2, 1, 0, 0, 0}, .par.move.src = 
{2, 1, 3, 0, 0, 0}) \
     MACRO(__VA_ARGS__, u16_move_zwy_xyx                        , .type = 
SWS_PIXEL_U16, .uop = SWS_UOP_MOVE            , .mask = 0x0, 
.par.move.num_moves = 3, .par.move.dst = {2, 3, 1, 0, 0, 0}, .par.move.src = 
{0, 1, 0, 0, 0, 0}) \
     MACRO(__VA_ARGS__, u16_move_wzy_zyx                        , .type = 
SWS_PIXEL_U16, .uop = SWS_UOP_MOVE            , .mask = 0x0, 
.par.move.num_moves = 3, .par.move.dst = {3, 2, 1, 0, 0, 0}, .par.move.src = 
{2, 1, 0, 0, 0, 0}) \
     MACRO(__VA_ARGS__, u16_move_txy_xyt                        , .type = 
SWS_PIXEL_U16, .uop = SWS_UOP_MOVE            , .mask = 0x0, 
.par.move.num_moves = 3, .par.move.dst = {-1, 0, 1, 0, 0, 0}, .par.move.src = 
{0, 1, -1, 0, 0, 0}) \
@@ -664,29 +656,31 @@
     MACRO(__VA_ARGS__, u16_rshift_xyz_4                        , .type = 
SWS_PIXEL_U16, .uop = SWS_UOP_RSHIFT          , .mask = 0x7, .par.shift.amount 
= 4) \
     MACRO(__VA_ARGS__, u16_rshift_xyz_6                        , .type = 
SWS_PIXEL_U16, .uop = SWS_UOP_RSHIFT          , .mask = 0x7, .par.shift.amount 
= 6)
 #define SWS_FOR_U16_CLEAR(MACRO, ...) \
+    MACRO(__VA_ARGS__, u16_clear_x_x                           , 
SWS_PIXEL_U16, SWS_UOP_CLEAR           , 0x1, 0x00000, 0x00000) \
     MACRO(__VA_ARGS__, u16_clear_x_1                           , 
SWS_PIXEL_U16, SWS_UOP_CLEAR           , 0x1, 0x00001, 0x00000) \
     MACRO(__VA_ARGS__, u16_clear_y_1                           , 
SWS_PIXEL_U16, SWS_UOP_CLEAR           , 0x2, 0x00002, 0x00000) \
-    MACRO(__VA_ARGS__, u16_clear_yz_xx                         , 
SWS_PIXEL_U16, SWS_UOP_CLEAR           , 0x6, 0x00000, 0x00000) \
+    MACRO(__VA_ARGS__, u16_clear_xy_xx                         , 
SWS_PIXEL_U16, SWS_UOP_CLEAR           , 0x3, 0x00000, 0x00000) \
+    MACRO(__VA_ARGS__, u16_clear_xyz_xxx                       , 
SWS_PIXEL_U16, SWS_UOP_CLEAR           , 0x7, 0x00000, 0x00000) \
+    MACRO(__VA_ARGS__, u16_clear_xyz_1xx                       , 
SWS_PIXEL_U16, SWS_UOP_CLEAR           , 0x7, 0x00001, 0x00000) \
     MACRO(__VA_ARGS__, u16_clear_w_x                           , 
SWS_PIXEL_U16, SWS_UOP_CLEAR           , 0x8, 0x00000, 0x00000) \
     MACRO(__VA_ARGS__, u16_clear_w_0                           , 
SWS_PIXEL_U16, SWS_UOP_CLEAR           , 0x8, 0x00000, 0x00008) \
     MACRO(__VA_ARGS__, u16_clear_w_1                           , 
SWS_PIXEL_U16, SWS_UOP_CLEAR           , 0x8, 0x00008, 0x00000) \
     MACRO(__VA_ARGS__, u16_clear_zw_xx                         , 
SWS_PIXEL_U16, SWS_UOP_CLEAR           , 0xc, 0x00000, 0x00000) \
     MACRO(__VA_ARGS__, u16_clear_xzw_xx0                       , 
SWS_PIXEL_U16, SWS_UOP_CLEAR           , 0xd, 0x00000, 0x00008) \
-    MACRO(__VA_ARGS__, u16_clear_xzw_1xx                       , 
SWS_PIXEL_U16, SWS_UOP_CLEAR           , 0xd, 0x00001, 0x00000) \
-    MACRO(__VA_ARGS__, u16_clear_yzw_xxx                       , 
SWS_PIXEL_U16, SWS_UOP_CLEAR           , 0xe, 0x00000, 0x00000) \
-    MACRO(__VA_ARGS__, u16_clear_yzw_xx1                       , 
SWS_PIXEL_U16, SWS_UOP_CLEAR           , 0xe, 0x00008, 0x00000)
+    MACRO(__VA_ARGS__, u16_clear_xzw_1xx                       , 
SWS_PIXEL_U16, SWS_UOP_CLEAR           , 0xd, 0x00001, 0x00000)
 #define SWS_FOR_STRUCT_U16_CLEAR(MACRO, ...) \
+    MACRO(__VA_ARGS__, u16_clear_x_x                           , .type = 
SWS_PIXEL_U16, .uop = SWS_UOP_CLEAR           , .mask = 0x1, .par.clear.one = 
0x0, .par.clear.zero = 0x0) \
     MACRO(__VA_ARGS__, u16_clear_x_1                           , .type = 
SWS_PIXEL_U16, .uop = SWS_UOP_CLEAR           , .mask = 0x1, .par.clear.one = 
0x1, .par.clear.zero = 0x0) \
     MACRO(__VA_ARGS__, u16_clear_y_1                           , .type = 
SWS_PIXEL_U16, .uop = SWS_UOP_CLEAR           , .mask = 0x2, .par.clear.one = 
0x2, .par.clear.zero = 0x0) \
-    MACRO(__VA_ARGS__, u16_clear_yz_xx                         , .type = 
SWS_PIXEL_U16, .uop = SWS_UOP_CLEAR           , .mask = 0x6, .par.clear.one = 
0x0, .par.clear.zero = 0x0) \
+    MACRO(__VA_ARGS__, u16_clear_xy_xx                         , .type = 
SWS_PIXEL_U16, .uop = SWS_UOP_CLEAR           , .mask = 0x3, .par.clear.one = 
0x0, .par.clear.zero = 0x0) \
+    MACRO(__VA_ARGS__, u16_clear_xyz_xxx                       , .type = 
SWS_PIXEL_U16, .uop = SWS_UOP_CLEAR           , .mask = 0x7, .par.clear.one = 
0x0, .par.clear.zero = 0x0) \
+    MACRO(__VA_ARGS__, u16_clear_xyz_1xx                       , .type = 
SWS_PIXEL_U16, .uop = SWS_UOP_CLEAR           , .mask = 0x7, .par.clear.one = 
0x1, .par.clear.zero = 0x0) \
     MACRO(__VA_ARGS__, u16_clear_w_x                           , .type = 
SWS_PIXEL_U16, .uop = SWS_UOP_CLEAR           , .mask = 0x8, .par.clear.one = 
0x0, .par.clear.zero = 0x0) \
     MACRO(__VA_ARGS__, u16_clear_w_0                           , .type = 
SWS_PIXEL_U16, .uop = SWS_UOP_CLEAR           , .mask = 0x8, .par.clear.one = 
0x0, .par.clear.zero = 0x8) \
     MACRO(__VA_ARGS__, u16_clear_w_1                           , .type = 
SWS_PIXEL_U16, .uop = SWS_UOP_CLEAR           , .mask = 0x8, .par.clear.one = 
0x8, .par.clear.zero = 0x0) \
     MACRO(__VA_ARGS__, u16_clear_zw_xx                         , .type = 
SWS_PIXEL_U16, .uop = SWS_UOP_CLEAR           , .mask = 0xc, .par.clear.one = 
0x0, .par.clear.zero = 0x0) \
     MACRO(__VA_ARGS__, u16_clear_xzw_xx0                       , .type = 
SWS_PIXEL_U16, .uop = SWS_UOP_CLEAR           , .mask = 0xd, .par.clear.one = 
0x0, .par.clear.zero = 0x8) \
-    MACRO(__VA_ARGS__, u16_clear_xzw_1xx                       , .type = 
SWS_PIXEL_U16, .uop = SWS_UOP_CLEAR           , .mask = 0xd, .par.clear.one = 
0x1, .par.clear.zero = 0x0) \
-    MACRO(__VA_ARGS__, u16_clear_yzw_xxx                       , .type = 
SWS_PIXEL_U16, .uop = SWS_UOP_CLEAR           , .mask = 0xe, .par.clear.one = 
0x0, .par.clear.zero = 0x0) \
-    MACRO(__VA_ARGS__, u16_clear_yzw_xx1                       , .type = 
SWS_PIXEL_U16, .uop = SWS_UOP_CLEAR           , .mask = 0xe, .par.clear.one = 
0x8, .par.clear.zero = 0x0)
+    MACRO(__VA_ARGS__, u16_clear_xzw_1xx                       , .type = 
SWS_PIXEL_U16, .uop = SWS_UOP_CLEAR           , .mask = 0xd, .par.clear.one = 
0x1, .par.clear.zero = 0x0)
 #define SWS_FOR_U16_LINEAR(MACRO, ...)
 #define SWS_FOR_STRUCT_U16_LINEAR(MACRO, ...)
 #define SWS_FOR_U16_LINEAR_FMA(MACRO, ...)
-- 
2.52.0

_______________________________________________
ffmpeg-devel mailing list -- [email protected]
To unsubscribe send an email to [email protected]

[FFmpeg-devel] [PR] swscale/ops: revive series for splitting planes for disjoint passes (PR #23467)

Reply via email to