[FFmpeg-devel] [PR] swscale: backport isolated or minor commits/fixes from my subsampling dev branch (PR #23538)

Niklas Haas via ffmpeg-devel Fri, 19 Jun 2026 18:13:02 -0700

PR #23538 opened by Niklas Haas (haasn)
URL: https://code.ffmpeg.org/FFmpeg/FFmpeg/pulls/23538
Patch URL: https://code.ffmpeg.org/FFmpeg/FFmpeg/pulls/23538.patch


Cherry-picked a bit of low-hanging fruit to get an LLM review and CI workflow.


>From 127efb70727bd9a82659238f415506c913bac66a Mon Sep 17 00:00:00 2001
From: Niklas Haas <[email protected]>
Date: Tue, 16 Jun 2026 15:03:56 +0200
Subject: [PATCH 01/16] swscale/format: pass SwsFormat to
 ff_sws_decode_pixfmt()

Needed to set initial plane size metadata. I decided to update
ff_sws_encode_pixfmt() as well for symmetry.

Sponsored-by: Sovereign Tech Fund
Signed-off-by: Niklas Haas <[email protected]>
---
 libswscale/format.c | 23 +++++++++++------------
 libswscale/format.h |  4 ++--
 2 files changed, 13 insertions(+), 14 deletions(-)

diff --git a/libswscale/format.c b/libswscale/format.c
index 545ba1909e..c34d335500 100644
--- a/libswscale/format.c
+++ b/libswscale/format.c
@@ -958,11 +958,10 @@ static void swizzle_inv(SwsSwizzleOp *swiz)
  * it will end up getting pushed towards the output or optimized away entirely
  * by the optimization pass.
  */
-static SwsClearOp fmt_clear(enum AVPixelFormat fmt)
+static SwsClearOp fmt_clear(const SwsFormat *fmt)
 {
-    const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(fmt);
-    const bool has_chroma = desc->nb_components >= 3;
-    const bool has_alpha  = desc->flags & AV_PIX_FMT_FLAG_ALPHA;
+    const bool has_chroma = fmt->desc->nb_components >= 3;
+    const bool has_alpha  = fmt->desc->flags & AV_PIX_FMT_FLAG_ALPHA;
 
     SwsClearOp c = {0};
     if (!has_chroma) {
@@ -984,9 +983,9 @@ static SwsClearOp fmt_clear(enum AVPixelFormat fmt)
 #  define NATIVE_ENDIAN_FLAG 0
 #endif
 
-int ff_sws_decode_pixfmt(SwsOpList *ops, enum AVPixelFormat fmt)
+int ff_sws_decode_pixfmt(SwsOpList *ops, const SwsFormat *fmt)
 {
-    const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(fmt);
+    const AVPixFmtDescriptor *desc = fmt->desc;
     SwsPixelType pixel_type, raw_type;
     SwsReadWriteOp rw_op;
     SwsSwizzleOp swizzle;
@@ -994,7 +993,7 @@ int ff_sws_decode_pixfmt(SwsOpList *ops, enum AVPixelFormat 
fmt)
     SwsComps *comps = &ops->comps_src;
     SwsShiftOp shift;
 
-    RET(fmt_analyze(fmt, &rw_op, &unpack, &swizzle, &shift,
+    RET(fmt_analyze(fmt->format, &rw_op, &unpack, &swizzle, &shift,
                     &pixel_type, &raw_type));
 
     swizzle_inv(&swizzle);
@@ -1073,16 +1072,16 @@ int ff_sws_decode_pixfmt(SwsOpList *ops, enum 
AVPixelFormat fmt)
     return 0;
 }
 
-int ff_sws_encode_pixfmt(SwsOpList *ops, enum AVPixelFormat fmt)
+int ff_sws_encode_pixfmt(SwsOpList *ops, const SwsFormat *fmt)
 {
-    const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(fmt);
+    const AVPixFmtDescriptor *desc = fmt->desc;
     SwsPixelType pixel_type, raw_type;
     SwsReadWriteOp rw_op;
     SwsSwizzleOp swizzle;
     SwsPackOp pack;
     SwsShiftOp shift;
 
-    RET(fmt_analyze(fmt, &rw_op, &pack, &swizzle, &shift,
+    RET(fmt_analyze(fmt->format, &rw_op, &pack, &swizzle, &shift,
                     &pixel_type, &raw_type));
 
     if (shift.amount) {
@@ -1680,7 +1679,7 @@ int ff_sws_op_list_generate(SwsContext *ctx, const 
SwsFormat *src,
     ops->dst = *dst;
 
     const SwsPixelType type = SWS_PIXEL_F32;
-    int ret = ff_sws_decode_pixfmt(ops, src->format);
+    int ret = ff_sws_decode_pixfmt(ops, src);
     if (ret < 0)
         goto fail;
     ret = ff_sws_decode_colors(ctx, type, ops, src, incomplete);
@@ -1692,7 +1691,7 @@ int ff_sws_op_list_generate(SwsContext *ctx, const 
SwsFormat *src,
     ret = ff_sws_encode_colors(ctx, type, ops, src, dst, incomplete);
     if (ret < 0)
         goto fail;
-    ret = ff_sws_encode_pixfmt(ops, dst->format);
+    ret = ff_sws_encode_pixfmt(ops, dst);
     if (ret < 0)
         goto fail;
 
diff --git a/libswscale/format.h b/libswscale/format.h
index 67f25d7006..36158da55a 100644
--- a/libswscale/format.h
+++ b/libswscale/format.h
@@ -174,8 +174,8 @@ typedef enum SwsPixelType SwsPixelType;
  *
  * Returns 0 on success, or a negative error code on failure.
  */
-int ff_sws_decode_pixfmt(SwsOpList *ops, enum AVPixelFormat fmt);
-int ff_sws_encode_pixfmt(SwsOpList *ops, enum AVPixelFormat fmt);
+int ff_sws_decode_pixfmt(SwsOpList *ops, const SwsFormat *fmt);
+int ff_sws_encode_pixfmt(SwsOpList *ops, const SwsFormat *fmt);
 
 /**
  * Append a set of operations for transforming decoded pixel values to/from
-- 
2.52.0


>From e52459195c247b51cd813fd5aa8ab68e88d3affe Mon Sep 17 00:00:00 2001
From: Niklas Haas <[email protected]>
Date: Sat, 20 Jun 2026 02:50:26 +0200
Subject: [PATCH 02/16] swscale/ops: simplify SWS_OP_READ default comps
 handling

We can still pre-fill the prev array here; ff_sws_apply_op_q() is a no-op.

Sponsored-by: Sovereign Tech Fund
Signed-off-by: Niklas Haas <[email protected]>
---
 libswscale/ops.c | 6 ------
 1 file changed, 6 deletions(-)

diff --git a/libswscale/ops.c b/libswscale/ops.c
index 31bb2bdec4..1ea5261c3d 100644
--- a/libswscale/ops.c
+++ b/libswscale/ops.c
@@ -361,7 +361,6 @@ void ff_sws_op_list_update_comps(SwsOpList *ops)
         SwsOp *op = &ops->ops[n];
 
         switch (op->op) {
-        case SWS_OP_READ:
         case SWS_OP_LINEAR:
         case SWS_OP_DITHER:
         case SWS_OP_SWAP_BYTES:
@@ -393,11 +392,6 @@ void ff_sws_op_list_update_comps(SwsOpList *ops)
                 op->comps.min[i]   = ops->comps_src.min[idx];
                 op->comps.max[i]   = ops->comps_src.max[idx];
             }
-            for (int i = op->rw.elems; i < 4; i++) {
-                op->comps.flags[i] = prev.flags[i];
-                op->comps.min[i]   = prev.min[i];
-                op->comps.max[i]   = prev.max[i];
-            }
 
             if (op->rw.filter.op) {
                 const SwsComps prev = op->comps;
-- 
2.52.0


>From faac9fa705e58ea60597da091e35e58a7640128a Mon Sep 17 00:00:00 2001
From: Niklas Haas <[email protected]>
Date: Sat, 20 Jun 2026 02:52:02 +0200
Subject: [PATCH 03/16] swscale/ops_optimizer: set correct range metadata after
 split pass

Replaces a few "nan" value ranges by real values, and drops a bunch of
redundant non-FMA variants that resulted from this bug.

Sponsored-by: Sovereign Tech Fund
Signed-off-by: Niklas Haas <[email protected]>
---
 libswscale/ops_optimizer.c  |  5 ++++-
 libswscale/uops_macros.h    | 16 ----------------
 tests/ref/fate/sws-ops-list |  2 +-
 3 files changed, 5 insertions(+), 18 deletions(-)

diff --git a/libswscale/ops_optimizer.c b/libswscale/ops_optimizer.c
index 69973b63ce..e6ebd21515 100644
--- a/libswscale/ops_optimizer.c
+++ b/libswscale/ops_optimizer.c
@@ -1005,8 +1005,11 @@ int ff_sws_op_list_subpass(SwsOpList *ops1, SwsOpList 
**out_rest)
     ops1->dst = ops2->src;
 
     for (int i = 0; i < nb_planes; i++) {
+        const int idx = swiz_wr.in[i];
         ops1->plane_dst[i] = ops2->plane_src[i] = i;
-        ops2->comps_src.flags[i] = prev->comps.flags[swiz_wr.in[i]];
+        ops2->comps_src.flags[i]  = prev->comps.flags[idx];
+        ops2->comps_src.min[i]    = prev->comps.min[idx];
+        ops2->comps_src.max[i]    = prev->comps.max[idx];
     }
 
     ff_sws_op_list_remove_at(ops1, idx, ops1->num_ops - idx);
diff --git a/libswscale/uops_macros.h b/libswscale/uops_macros.h
index 3c4d6b6a3e..5d9e1a8026 100644
--- a/libswscale/uops_macros.h
+++ b/libswscale/uops_macros.h
@@ -1177,8 +1177,6 @@
     MACRO(__VA_ARGS__, f32_linear_xyzw_x0000_0x000_00x00_000x0 , .type = 
SWS_PIXEL_F32, .uop = SWS_UOP_LINEAR          , .mask = 0xf, .par.lin.one = 
0x0, .par.lin.zero = 0xbefbe)
 #define SWS_FOR_F32_LINEAR_FMA(MACRO, ...) \
     MACRO(__VA_ARGS__, f32_linear_fma_x_xxx0x                  , 
SWS_PIXEL_F32, SWS_UOP_LINEAR_FMA      , 0x1, 0x41040, 0xbefa8, 0xfffe8) \
-    MACRO(__VA_ARGS__, f32_linear_fma_x_XXx0x                  , 
SWS_PIXEL_F32, SWS_UOP_LINEAR_FMA      , 0x1, 0x41040, 0xbefa8, 0xfffeb) \
-    MACRO(__VA_ARGS__, f32_linear_fma_x_xXX0x                  , 
SWS_PIXEL_F32, SWS_UOP_LINEAR_FMA      , 0x1, 0x41040, 0xbefa8, 0xfffee) \
     MACRO(__VA_ARGS__, f32_linear_fma_x_XXX0x                  , 
SWS_PIXEL_F32, SWS_UOP_LINEAR_FMA      , 0x1, 0x41040, 0xbefa8, 0xfffef) \
     MACRO(__VA_ARGS__, f32_linear_fma_x_x000x                  , 
SWS_PIXEL_F32, SWS_UOP_LINEAR_FMA      , 0x1, 0x41040, 0xbefae, 0xfffee) \
     MACRO(__VA_ARGS__, f32_linear_fma_x_X000x                  , 
SWS_PIXEL_F32, SWS_UOP_LINEAR_FMA      , 0x1, 0x41040, 0xbefae, 0xfffef) \
@@ -1188,18 +1186,12 @@
     MACRO(__VA_ARGS__, f32_linear_fma_x_xXX00                  , 
SWS_PIXEL_F32, SWS_UOP_LINEAR_FMA      , 0x1, 0x41040, 0xbefb8, 0xffffe) \
     MACRO(__VA_ARGS__, f32_linear_fma_y_0x000                  , 
SWS_PIXEL_F32, SWS_UOP_LINEAR_FMA      , 0x2, 0x41001, 0xbefbe, 0xfffbf) \
     MACRO(__VA_ARGS__, f32_linear_fma_xyz_xxx0x_xxx0x_xxx0x    , 
SWS_PIXEL_F32, SWS_UOP_LINEAR_FMA      , 0x7, 0x40000, 0xba108, 0xfa108) \
-    MACRO(__VA_ARGS__, f32_linear_fma_xyz_XXx0x_XXx0x_XXx0x    , 
SWS_PIXEL_F32, SWS_UOP_LINEAR_FMA      , 0x7, 0x40000, 0xba108, 0xfad6b) \
     MACRO(__VA_ARGS__, f32_linear_fma_xyz_XXX0x_XxX0x_XXX0x    , 
SWS_PIXEL_F32, SWS_UOP_LINEAR_FMA      , 0x7, 0x40000, 0xba108, 0xfbdaf) \
-    MACRO(__VA_ARGS__, f32_linear_fma_xyz_xXX0x_xXX0x_xXX0x    , 
SWS_PIXEL_F32, SWS_UOP_LINEAR_FMA      , 0x7, 0x40000, 0xba108, 0xfb9ce) \
     MACRO(__VA_ARGS__, f32_linear_fma_xyz_XXX0x_XXX0x_XXX0x    , 
SWS_PIXEL_F32, SWS_UOP_LINEAR_FMA      , 0x7, 0x40000, 0xba108, 0xfbdef) \
-    MACRO(__VA_ARGS__, f32_linear_fma_xyz_x0x0x_xxx0x_xx00x    , 
SWS_PIXEL_F32, SWS_UOP_LINEAR_FMA      , 0x7, 0x40000, 0xbb10a, 0xfb10a) \
     MACRO(__VA_ARGS__, f32_linear_fma_xyz_X0X0x_XXX0x_XX00x    , 
SWS_PIXEL_F32, SWS_UOP_LINEAR_FMA      , 0x7, 0x40000, 0xbb10a, 0xfbdef) \
     MACRO(__VA_ARGS__, f32_linear_fma_xyz_xxx00_xxx0x_xxx0x    , 
SWS_PIXEL_F32, SWS_UOP_LINEAR_FMA      , 0x7, 0x40000, 0xba118, 0xfa118) \
-    MACRO(__VA_ARGS__, f32_linear_fma_xyz_xXx00_XXx0x_XXx0x    , 
SWS_PIXEL_F32, SWS_UOP_LINEAR_FMA      , 0x7, 0x40000, 0xba118, 0xfad7a) \
     MACRO(__VA_ARGS__, f32_linear_fma_xyz_xXX00_XxX0x_XXX0x    , 
SWS_PIXEL_F32, SWS_UOP_LINEAR_FMA      , 0x7, 0x40000, 0xba118, 0xfbdbe) \
-    MACRO(__VA_ARGS__, f32_linear_fma_xyz_xXX00_xXX0x_xXX0x    , 
SWS_PIXEL_F32, SWS_UOP_LINEAR_FMA      , 0x7, 0x40000, 0xba118, 0xfb9de) \
     MACRO(__VA_ARGS__, f32_linear_fma_xyz_xXX00_XXX0x_XXX0x    , 
SWS_PIXEL_F32, SWS_UOP_LINEAR_FMA      , 0x7, 0x40000, 0xba118, 0xfbdfe) \
-    MACRO(__VA_ARGS__, f32_linear_fma_xyz_x000x_0x00x_00x0x    , 
SWS_PIXEL_F32, SWS_UOP_LINEAR_FMA      , 0x7, 0x40000, 0xbadae, 0xfadae) \
     MACRO(__VA_ARGS__, f32_linear_fma_xyz_X000x_0X00x_00X0x    , 
SWS_PIXEL_F32, SWS_UOP_LINEAR_FMA      , 0x7, 0x40000, 0xbadae, 0xfbdef) \
     MACRO(__VA_ARGS__, f32_linear_fma_xyz_x0000_0x000_00x00    , 
SWS_PIXEL_F32, SWS_UOP_LINEAR_FMA      , 0x7, 0x40000, 0xbefbe, 0xfefbe) \
     MACRO(__VA_ARGS__, f32_linear_fma_xyz_10X0x_1XX0x_1X00x    , 
SWS_PIXEL_F32, SWS_UOP_LINEAR_FMA      , 0x7, 0x40421, 0xbb10a, 0xfbdef) \
@@ -1214,8 +1206,6 @@
     MACRO(__VA_ARGS__, f32_linear_fma_xyzw_x0000_0x000_00x00_000x0, 
SWS_PIXEL_F32, SWS_UOP_LINEAR_FMA      , 0xf, 0x00000, 0xbefbe, 0xbefbe)
 #define SWS_FOR_STRUCT_F32_LINEAR_FMA(MACRO, ...) \
     MACRO(__VA_ARGS__, f32_linear_fma_x_xxx0x                  , .type = 
SWS_PIXEL_F32, .uop = SWS_UOP_LINEAR_FMA      , .mask = 0x1, .par.lin.one = 
0x41040, .par.lin.zero = 0xbefa8, .par.lin.exact = 0xfffe8) \
-    MACRO(__VA_ARGS__, f32_linear_fma_x_XXx0x                  , .type = 
SWS_PIXEL_F32, .uop = SWS_UOP_LINEAR_FMA      , .mask = 0x1, .par.lin.one = 
0x41040, .par.lin.zero = 0xbefa8, .par.lin.exact = 0xfffeb) \
-    MACRO(__VA_ARGS__, f32_linear_fma_x_xXX0x                  , .type = 
SWS_PIXEL_F32, .uop = SWS_UOP_LINEAR_FMA      , .mask = 0x1, .par.lin.one = 
0x41040, .par.lin.zero = 0xbefa8, .par.lin.exact = 0xfffee) \
     MACRO(__VA_ARGS__, f32_linear_fma_x_XXX0x                  , .type = 
SWS_PIXEL_F32, .uop = SWS_UOP_LINEAR_FMA      , .mask = 0x1, .par.lin.one = 
0x41040, .par.lin.zero = 0xbefa8, .par.lin.exact = 0xfffef) \
     MACRO(__VA_ARGS__, f32_linear_fma_x_x000x                  , .type = 
SWS_PIXEL_F32, .uop = SWS_UOP_LINEAR_FMA      , .mask = 0x1, .par.lin.one = 
0x41040, .par.lin.zero = 0xbefae, .par.lin.exact = 0xfffee) \
     MACRO(__VA_ARGS__, f32_linear_fma_x_X000x                  , .type = 
SWS_PIXEL_F32, .uop = SWS_UOP_LINEAR_FMA      , .mask = 0x1, .par.lin.one = 
0x41040, .par.lin.zero = 0xbefae, .par.lin.exact = 0xfffef) \
@@ -1225,18 +1215,12 @@
     MACRO(__VA_ARGS__, f32_linear_fma_x_xXX00                  , .type = 
SWS_PIXEL_F32, .uop = SWS_UOP_LINEAR_FMA      , .mask = 0x1, .par.lin.one = 
0x41040, .par.lin.zero = 0xbefb8, .par.lin.exact = 0xffffe) \
     MACRO(__VA_ARGS__, f32_linear_fma_y_0x000                  , .type = 
SWS_PIXEL_F32, .uop = SWS_UOP_LINEAR_FMA      , .mask = 0x2, .par.lin.one = 
0x41001, .par.lin.zero = 0xbefbe, .par.lin.exact = 0xfffbf) \
     MACRO(__VA_ARGS__, f32_linear_fma_xyz_xxx0x_xxx0x_xxx0x    , .type = 
SWS_PIXEL_F32, .uop = SWS_UOP_LINEAR_FMA      , .mask = 0x7, .par.lin.one = 
0x40000, .par.lin.zero = 0xba108, .par.lin.exact = 0xfa108) \
-    MACRO(__VA_ARGS__, f32_linear_fma_xyz_XXx0x_XXx0x_XXx0x    , .type = 
SWS_PIXEL_F32, .uop = SWS_UOP_LINEAR_FMA      , .mask = 0x7, .par.lin.one = 
0x40000, .par.lin.zero = 0xba108, .par.lin.exact = 0xfad6b) \
     MACRO(__VA_ARGS__, f32_linear_fma_xyz_XXX0x_XxX0x_XXX0x    , .type = 
SWS_PIXEL_F32, .uop = SWS_UOP_LINEAR_FMA      , .mask = 0x7, .par.lin.one = 
0x40000, .par.lin.zero = 0xba108, .par.lin.exact = 0xfbdaf) \
-    MACRO(__VA_ARGS__, f32_linear_fma_xyz_xXX0x_xXX0x_xXX0x    , .type = 
SWS_PIXEL_F32, .uop = SWS_UOP_LINEAR_FMA      , .mask = 0x7, .par.lin.one = 
0x40000, .par.lin.zero = 0xba108, .par.lin.exact = 0xfb9ce) \
     MACRO(__VA_ARGS__, f32_linear_fma_xyz_XXX0x_XXX0x_XXX0x    , .type = 
SWS_PIXEL_F32, .uop = SWS_UOP_LINEAR_FMA      , .mask = 0x7, .par.lin.one = 
0x40000, .par.lin.zero = 0xba108, .par.lin.exact = 0xfbdef) \
-    MACRO(__VA_ARGS__, f32_linear_fma_xyz_x0x0x_xxx0x_xx00x    , .type = 
SWS_PIXEL_F32, .uop = SWS_UOP_LINEAR_FMA      , .mask = 0x7, .par.lin.one = 
0x40000, .par.lin.zero = 0xbb10a, .par.lin.exact = 0xfb10a) \
     MACRO(__VA_ARGS__, f32_linear_fma_xyz_X0X0x_XXX0x_XX00x    , .type = 
SWS_PIXEL_F32, .uop = SWS_UOP_LINEAR_FMA      , .mask = 0x7, .par.lin.one = 
0x40000, .par.lin.zero = 0xbb10a, .par.lin.exact = 0xfbdef) \
     MACRO(__VA_ARGS__, f32_linear_fma_xyz_xxx00_xxx0x_xxx0x    , .type = 
SWS_PIXEL_F32, .uop = SWS_UOP_LINEAR_FMA      , .mask = 0x7, .par.lin.one = 
0x40000, .par.lin.zero = 0xba118, .par.lin.exact = 0xfa118) \
-    MACRO(__VA_ARGS__, f32_linear_fma_xyz_xXx00_XXx0x_XXx0x    , .type = 
SWS_PIXEL_F32, .uop = SWS_UOP_LINEAR_FMA      , .mask = 0x7, .par.lin.one = 
0x40000, .par.lin.zero = 0xba118, .par.lin.exact = 0xfad7a) \
     MACRO(__VA_ARGS__, f32_linear_fma_xyz_xXX00_XxX0x_XXX0x    , .type = 
SWS_PIXEL_F32, .uop = SWS_UOP_LINEAR_FMA      , .mask = 0x7, .par.lin.one = 
0x40000, .par.lin.zero = 0xba118, .par.lin.exact = 0xfbdbe) \
-    MACRO(__VA_ARGS__, f32_linear_fma_xyz_xXX00_xXX0x_xXX0x    , .type = 
SWS_PIXEL_F32, .uop = SWS_UOP_LINEAR_FMA      , .mask = 0x7, .par.lin.one = 
0x40000, .par.lin.zero = 0xba118, .par.lin.exact = 0xfb9de) \
     MACRO(__VA_ARGS__, f32_linear_fma_xyz_xXX00_XXX0x_XXX0x    , .type = 
SWS_PIXEL_F32, .uop = SWS_UOP_LINEAR_FMA      , .mask = 0x7, .par.lin.one = 
0x40000, .par.lin.zero = 0xba118, .par.lin.exact = 0xfbdfe) \
-    MACRO(__VA_ARGS__, f32_linear_fma_xyz_x000x_0x00x_00x0x    , .type = 
SWS_PIXEL_F32, .uop = SWS_UOP_LINEAR_FMA      , .mask = 0x7, .par.lin.one = 
0x40000, .par.lin.zero = 0xbadae, .par.lin.exact = 0xfadae) \
     MACRO(__VA_ARGS__, f32_linear_fma_xyz_X000x_0X00x_00X0x    , .type = 
SWS_PIXEL_F32, .uop = SWS_UOP_LINEAR_FMA      , .mask = 0x7, .par.lin.one = 
0x40000, .par.lin.zero = 0xbadae, .par.lin.exact = 0xfbdef) \
     MACRO(__VA_ARGS__, f32_linear_fma_xyz_x0000_0x000_00x00    , .type = 
SWS_PIXEL_F32, .uop = SWS_UOP_LINEAR_FMA      , .mask = 0x7, .par.lin.one = 
0x40000, .par.lin.zero = 0xbefbe, .par.lin.exact = 0xfefbe) \
     MACRO(__VA_ARGS__, f32_linear_fma_xyz_10X0x_1XX0x_1X00x    , .type = 
SWS_PIXEL_F32, .uop = SWS_UOP_LINEAR_FMA      , .mask = 0x7, .par.lin.one = 
0x40421, .par.lin.zero = 0xbb10a, .par.lin.exact = 0xfbdef) \
diff --git a/tests/ref/fate/sws-ops-list b/tests/ref/fate/sws-ops-list
index 6b4003121a..dcda011ccc 100644
--- a/tests/ref/fate/sws-ops-list
+++ b/tests/ref/fate/sws-ops-list
@@ -1 +1 @@
-bbe27c8c324f08d933f6397f5fb96650
+e490d908612d059c644e64b43247fb08
-- 
2.52.0


>From b120505ce2d5e934451019250af4e846d0ced9c4 Mon Sep 17 00:00:00 2001
From: Niklas Haas <[email protected]>
Date: Sat, 20 Jun 2026 02:55:04 +0200
Subject: [PATCH 04/16] swscale/ops: apply ff_sws_comp_mask_swizzle() in-place

More convenient at every use site.

Sponsored-by: Sovereign Tech Fund
Signed-off-by: Niklas Haas <[email protected]>
---
 libswscale/ops.c           | 7 ++++---
 libswscale/ops.h           | 2 +-
 libswscale/ops_optimizer.c | 2 +-
 3 files changed, 6 insertions(+), 5 deletions(-)

diff --git a/libswscale/ops.c b/libswscale/ops.c
index 1ea5261c3d..71f450fc6b 100644
--- a/libswscale/ops.c
+++ b/libswscale/ops.c
@@ -144,16 +144,17 @@ SwsCompMask ff_sws_comp_mask_q4(const AVRational q[4])
     return mask;
 }
 
-SwsCompMask ff_sws_comp_mask_swizzle(const SwsCompMask mask, const 
SwsSwizzleOp *swiz)
+void ff_sws_comp_mask_swizzle(SwsCompMask *mask, const SwsSwizzleOp *swiz)
 {
+    const SwsCompMask orig = *mask;
     SwsCompMask res = 0;
     for (int i = 0; i < 4; i++) {
         const int src = swiz->in[i];
-        if (SWS_COMP_TEST(mask, src))
+        if (SWS_COMP_TEST(orig, src))
             res |= SWS_COMP(i);
     }
 
-    return res;
+    *mask = res;
 }
 
 SwsCompMask ff_sws_comp_mask_needed(const SwsOp *op)
diff --git a/libswscale/ops.h b/libswscale/ops.h
index 41755f35f4..b5de7546d5 100644
--- a/libswscale/ops.h
+++ b/libswscale/ops.h
@@ -146,7 +146,7 @@ typedef struct SwsSwizzleOp {
 } SwsSwizzleOp;
 
 #define SWS_SWIZZLE(X,Y,Z,W) ((SwsSwizzleOp) { .in = {X, Y, Z, W} })
-SwsCompMask ff_sws_comp_mask_swizzle(SwsCompMask mask, const SwsSwizzleOp 
*swiz);
+void ff_sws_comp_mask_swizzle(SwsCompMask *mask, const SwsSwizzleOp *swiz);
 
 typedef struct SwsShiftOp {
     uint8_t amount; /* number of bits to shift */
diff --git a/libswscale/ops_optimizer.c b/libswscale/ops_optimizer.c
index e6ebd21515..8c473c1a52 100644
--- a/libswscale/ops_optimizer.c
+++ b/libswscale/ops_optimizer.c
@@ -61,7 +61,7 @@ static bool op_commute_clear(SwsOp *op, SwsOp *next)
         op->type = next->filter.type;
         return true;
     case SWS_OP_SWIZZLE:
-        op->clear.mask = ff_sws_comp_mask_swizzle(op->clear.mask, 
&next->swizzle);
+        ff_sws_comp_mask_swizzle(&op->clear.mask, &next->swizzle);
         ff_sws_apply_op_q(next, op->clear.value);
         return true;
     case SWS_OP_SWAP_BYTES:
-- 
2.52.0


>From d474b408f2966646a824be2500e82a39038a5dc5 Mon Sep 17 00:00:00 2001
From: Niklas Haas <[email protected]>
Date: Fri, 19 Jun 2026 16:14:49 +0200
Subject: [PATCH 05/16] swscale/ops_optimizer: simplify unused op check
 (cosmetic)

Signed-off-by: Niklas Haas <[email protected]>
---
 libswscale/ops_optimizer.c | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/libswscale/ops_optimizer.c b/libswscale/ops_optimizer.c
index 8c473c1a52..e41af380e9 100644
--- a/libswscale/ops_optimizer.c
+++ b/libswscale/ops_optimizer.c
@@ -393,12 +393,10 @@ retry:
         SwsOp *next = n + 1 < ops->num_ops ? &ops->ops[n + 1] : &dummy;
 
         /* common helper variable */
+        const SwsCompMask needed = ff_sws_comp_mask_needed(op);
         bool noop = true;
 
-        if (!SWS_OP_NEEDED(op, 0) && !SWS_OP_NEEDED(op, 1) &&
-            !SWS_OP_NEEDED(op, 2) && !SWS_OP_NEEDED(op, 3) &&
-            op->op != SWS_OP_WRITE)
-        {
+        if (!needed && op->op != SWS_OP_WRITE) {
             /* Remove any operation whose output is not needed */
             ff_sws_op_list_remove_at(ops, n, 1);
             goto retry;
-- 
2.52.0


>From cb8a006f8a2777ad2766ed7018458e9024586ad8 Mon Sep 17 00:00:00 2001
From: Niklas Haas <[email protected]>
Date: Fri, 19 Jun 2026 14:09:42 +0200
Subject: [PATCH 06/16] swscale/graph: don't over-allocate pass buffer lines

This is not only wasteful but also serves no real purpose. Looping over
the correct number of lines is trivial; there is far less point in vertical
padding than horizontal padding.

Furthermore, this might actually introduce issues when linking output buffers,
since the extra padding depends on the pass's alignment and threading
requirements, which may differ from pass to pass.

Sponsored-by: Sovereign Tech Fund
Signed-off-by: Niklas Haas <[email protected]>
---
 libswscale/graph.c | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

diff --git a/libswscale/graph.c b/libswscale/graph.c
index 3db3b98c7b..06e5ebefc8 100644
--- a/libswscale/graph.c
+++ b/libswscale/graph.c
@@ -201,6 +201,10 @@ int ff_sws_graph_add_pass(SwsGraph *graph, enum 
AVPixelFormat fmt,
         goto fail;
     }
 
+    pass->output->height = pass->height;
+    pass->output->width  = pass->width;
+    pass->output->width_align = 1;
+
     if (!align) {
         pass->slice_h = pass->height;
         pass->num_slices = 1;
@@ -210,11 +214,6 @@ int ff_sws_graph_add_pass(SwsGraph *graph, enum 
AVPixelFormat fmt,
         pass->num_slices = (pass->height + pass->slice_h - 1) / pass->slice_h;
     }
 
-    /* Align output buffer to include extra slice padding */
-    pass->output->height = pass->slice_h * pass->num_slices;
-    pass->output->width  = pass->width;
-    pass->output->width_align = 1;
-
     ret = av_dynarray_add_nofree(&graph->passes, &graph->num_passes, pass);
     if (ret < 0)
         goto fail;
-- 
2.52.0


>From ba1c1d9eee75a2bd5f2677907af1753112c5c69f Mon Sep 17 00:00:00 2001
From: Niklas Haas <[email protected]>
Date: Sat, 20 Jun 2026 02:55:31 +0200
Subject: [PATCH 07/16] swscale/graph: separate pass dispatch size from buffer
 size

This allows adding passes which will be dispatched over a reduced number of
lines, without affecting the allocated buffer dimensions - e.g. for passes
which purely write to subsampled chroma planes.

A few hard-coded references to pass->width/height need to be replaced by
the corresponding output frame references, but it's not a huge deal.

Sponsored-by: Sovereign Tech Fund
Signed-off-by: Niklas Haas <[email protected]>
---
 libswscale/graph.c        | 43 +++++++++++++++++++++------------------
 libswscale/graph.h        |  6 ++++--
 libswscale/ops_dispatch.c | 15 +++++++-------
 3 files changed, 35 insertions(+), 29 deletions(-)

diff --git a/libswscale/graph.c b/libswscale/graph.c
index 06e5ebefc8..a765b4cd5c 100644
--- a/libswscale/graph.c
+++ b/libswscale/graph.c
@@ -174,7 +174,8 @@ static void pass_free(SwsPass *pass)
 
 int ff_sws_graph_add_pass(SwsGraph *graph, enum AVPixelFormat fmt,
                           int width, int height, SwsPass *input,
-                          int align, SwsPassFunc run, SwsPassSetup setup,
+                          int lines, int align,
+                          SwsPassFunc run, SwsPassSetup setup,
                           void *priv, void (*free_cb)(void *priv),
                           SwsPass **out_pass)
 {
@@ -186,14 +187,16 @@ int ff_sws_graph_add_pass(SwsGraph *graph, enum 
AVPixelFormat fmt,
         return AVERROR(ENOMEM);
     }
 
+    if (!lines)
+        lines = height;
+
     pass->graph  = graph;
     pass->run    = run;
     pass->setup  = setup;
     pass->priv   = priv;
     pass->free   = free_cb;
     pass->format = fmt;
-    pass->width  = width;
-    pass->height = height;
+    pass->lines  = lines;
     pass->input  = input;
     pass->output = av_refstruct_alloc_ext(sizeof(*pass->output), 0, NULL, 
free_buffer);
     if (!pass->output) {
@@ -201,17 +204,17 @@ int ff_sws_graph_add_pass(SwsGraph *graph, enum 
AVPixelFormat fmt,
         goto fail;
     }
 
-    pass->output->height = pass->height;
-    pass->output->width  = pass->width;
+    pass->output->height = height;
+    pass->output->width  = width;
     pass->output->width_align = 1;
 
     if (!align) {
-        pass->slice_h = pass->height;
+        pass->slice_h = pass->lines;
         pass->num_slices = 1;
     } else {
-        pass->slice_h = (pass->height + graph->num_threads - 1) / 
graph->num_threads;
+        pass->slice_h = (pass->lines + graph->num_threads - 1) / 
graph->num_threads;
         pass->slice_h = FFALIGN(pass->slice_h, align);
-        pass->num_slices = (pass->height + pass->slice_h - 1) / pass->slice_h;
+        pass->num_slices = (pass->lines + pass->slice_h - 1) / pass->slice_h;
     }
 
     ret = av_dynarray_add_nofree(&graph->passes, &graph->num_passes, pass);
@@ -267,7 +270,7 @@ static void run_rgb0(const SwsFrame *out, const SwsFrame 
*in, int y, int h,
 {
     SwsInternal *c = pass->priv;
     const int x0 = c->src0Alpha - 1;
-    const int w4 = 4 * pass->width;
+    const int w4 = 4 * out->width;
     const int src_stride = in->linesize[0];
     const int dst_stride = out->linesize[0];
     const uint8_t *src = in->data[0] + y * src_stride;
@@ -289,7 +292,7 @@ static void run_xyz2rgb(const SwsFrame *out, const SwsFrame 
*in, int y, int h,
     const SwsInternal *c = pass->priv;
     c->xyz12Torgb48(c, out->data[0] + y * out->linesize[0], out->linesize[0],
                     in->data[0] + y * in->linesize[0], in->linesize[0],
-                    pass->width, h);
+                    out->width, h);
 }
 
 static void run_rgb2xyz(const SwsFrame *out, const SwsFrame *in, int y, int h,
@@ -298,7 +301,7 @@ static void run_rgb2xyz(const SwsFrame *out, const SwsFrame 
*in, int y, int h,
     const SwsInternal *c = pass->priv;
     c->rgb48Toxyz12(c, out->data[0] + y * out->linesize[0], out->linesize[0],
                     in->data[0] + y * in->linesize[0], in->linesize[0],
-                    pass->width, h);
+                    out->width, h);
 }
 
 /***********************************************************************
@@ -465,7 +468,7 @@ static int init_legacy_subpass(SwsGraph *graph, SwsContext 
*sws,
 
     if (c->src0Alpha && !c->dst0Alpha && isALPHA(sws->dst_format)) {
         ret = ff_sws_graph_add_pass(graph, AV_PIX_FMT_RGBA, src_w, src_h, 
input,
-                                    1, run_rgb0, NULL, c, NULL, &input);
+                                    0, 1, run_rgb0, NULL, c, NULL, &input);
         if (ret < 0) {
             sws_free_context(&sws);
             return ret;
@@ -474,14 +477,14 @@ static int init_legacy_subpass(SwsGraph *graph, 
SwsContext *sws,
 
     if (c->srcXYZ && !(c->dstXYZ && unscaled)) {
         ret = ff_sws_graph_add_pass(graph, AV_PIX_FMT_RGB48, src_w, src_h, 
input,
-                                    1, run_xyz2rgb, NULL, c, NULL, &input);
+                                    0, 1, run_xyz2rgb, NULL, c, NULL, &input);
         if (ret < 0) {
             sws_free_context(&sws);
             return ret;
         }
     }
 
-    ret = ff_sws_graph_add_pass(graph, sws->dst_format, dst_w, dst_h, input, 
align,
+    ret = ff_sws_graph_add_pass(graph, sws->dst_format, dst_w, dst_h, input, 
0, align,
                                 c->convert_unscaled ? run_legacy_unscaled : 
run_legacy_swscale,
                                 setup_legacy_swscale, sws, 
free_legacy_swscale, &pass);
     if (ret < 0)
@@ -533,7 +536,7 @@ static int init_legacy_subpass(SwsGraph *graph, SwsContext 
*sws,
 
     if (c->dstXYZ && !(c->srcXYZ && unscaled)) {
         ret = ff_sws_graph_add_pass(graph, AV_PIX_FMT_RGB48, dst_w, dst_h, 
pass,
-                                    1, run_rgb2xyz, NULL, c, NULL, &pass);
+                                    0, 1, run_rgb2xyz, NULL, c, NULL, &pass);
         if (ret < 0)
             return ret;
     }
@@ -715,7 +718,7 @@ static void run_lut3d(const SwsFrame *out, const SwsFrame 
*in, int y, int h,
     frame_shift(out, y, out_data);
 
     ff_sws_lut3d_apply(lut, in_data[0], in->linesize[0], out_data[0],
-                       out->linesize[0], pass->width, h);
+                       out->linesize[0], out->width, h);
 }
 
 static int adapt_colors(SwsGraph *graph, const SwsFormat *src_fmt,
@@ -777,7 +780,7 @@ static int adapt_colors(SwsGraph *graph, const SwsFormat 
*src_fmt,
     }
 
     return ff_sws_graph_add_pass(graph, fmt_out, src.width, src.height,
-                                 input, 1, run_lut3d, setup_lut3d, lut,
+                                 input, 0, 1, run_lut3d, setup_lut3d, lut,
                                  free_lut3d, output);
 }
 
@@ -812,7 +815,7 @@ static int init_passes(SwsGraph *graph)
 
     /* Add threaded memcpy pass */
     return ff_sws_graph_add_pass(graph, dst.format, dst.width, dst.height,
-                                 pass, 1, run_copy, NULL, NULL, NULL, &pass);
+                                 pass, 0, 1, run_copy, NULL, NULL, NULL, 
&pass);
 }
 
 static void sws_graph_worker(void *priv, int jobnr, int threadnr, int nb_jobs,
@@ -821,7 +824,7 @@ static void sws_graph_worker(void *priv, int jobnr, int 
threadnr, int nb_jobs,
     SwsGraph *graph = priv;
     const SwsPass *pass = graph->exec.pass;
     const int slice_y = jobnr * pass->slice_h;
-    const int slice_h = FFMIN(pass->slice_h, pass->height - slice_y);
+    const int slice_h = FFMIN(pass->slice_h, pass->lines - slice_y);
 
     pass->run(graph->exec.output, graph->exec.input, slice_y, slice_h, pass);
 }
@@ -1016,7 +1019,7 @@ int ff_sws_graph_run(SwsGraph *graph, const AVFrame *dst, 
const AVFrame *src)
         }
 
         if (pass->num_slices == 1) {
-            pass->run(graph->exec.output, graph->exec.input, 0, pass->height, 
pass);
+            pass->run(graph->exec.output, graph->exec.input, 0, pass->lines, 
pass);
         } else {
             avpriv_slicethread_execute(graph->slicethread, pass->num_slices, 
0);
         }
diff --git a/libswscale/graph.h b/libswscale/graph.h
index adf4b19675..eff2dcc47f 100644
--- a/libswscale/graph.h
+++ b/libswscale/graph.h
@@ -83,7 +83,7 @@ struct SwsPass {
     SwsPassFunc run;
     SwsBackend backend; /* backend this pass is using, or 0 */
     enum AVPixelFormat format; /* new pixel format */
-    int width, height; /* new output size */
+    int lines;         /* pass dispatch size */
     int slice_h;       /* filter granularity */
     int num_slices;
 
@@ -184,6 +184,7 @@ int ff_sws_graph_create(SwsContext *ctx, const SwsFormat 
*dst, const SwsFormat *
  * @param w      Width of the output image.
  * @param h      Height of the output image.
  * @param input  Previous pass to read from, or NULL for the input image.
+ * @param lines  Override the number of lines processed for this pass. 
(Optional)
  * @param align  Minimum slice alignment for this pass, or 0 for no threading.
  * @param run    Filter function to run.
  * @param setup  Optional setup function to run from the main thread.
@@ -194,7 +195,8 @@ int ff_sws_graph_create(SwsContext *ctx, const SwsFormat 
*dst, const SwsFormat *
  */
 int ff_sws_graph_add_pass(SwsGraph *graph, enum AVPixelFormat fmt,
                           int width, int height, SwsPass *input,
-                          int align, SwsPassFunc run, SwsPassSetup setup,
+                          int lines, int align,
+                          SwsPassFunc run, SwsPassSetup setup,
                           void *priv, void (*free)(void *priv),
                           SwsPass **out_pass);
 
diff --git a/libswscale/ops_dispatch.c b/libswscale/ops_dispatch.c
index 44248195d7..abf67b0d55 100644
--- a/libswscale/ops_dispatch.c
+++ b/libswscale/ops_dispatch.c
@@ -202,6 +202,7 @@ static int op_pass_setup(const SwsFrame *out, const 
SwsFrame *in,
 {
     const AVPixFmtDescriptor *indesc  = av_pix_fmt_desc_get(in->format);
     const bool float_in = indesc->flags & AV_PIX_FMT_FLAG_FLOAT;
+    const int width = out->width;
 
     SwsOpPass *p = pass->priv;
     SwsOpExec *exec = &p->exec_base;
@@ -209,9 +210,9 @@ static int op_pass_setup(const SwsFrame *out, const 
SwsFrame *in,
 
     /* Set up main loop parameters */
     const unsigned block_size = comp->block_size;
-    const size_t num_blocks   = (pass->width + block_size - 1) / block_size;
+    const size_t num_blocks   = (width + block_size - 1) / block_size;
     const size_t aligned_w    = num_blocks * block_size;
-    if (aligned_w < pass->width) /* overflow */
+    if (aligned_w < width) /* overflow */
         return AVERROR(EINVAL);
     p->num_blocks   = num_blocks;
     p->memcpy_first = false;
@@ -280,14 +281,14 @@ static int op_pass_setup(const SwsFrame *out, const 
SwsFrame *in,
     *tail = *exec;
 
     const size_t safe_width = safe_blocks * block_size;
-    const size_t tail_size  = pass->width - safe_width;
+    const size_t tail_size  = width - safe_width;
     p->tail_off_out  = pixel_bytes(safe_width, p->pixel_bits_out, 
AV_ROUND_DOWN);
     p->tail_size_out = pixel_bytes(tail_size,  p->pixel_bits_out, AV_ROUND_UP);
     p->tail_blocks   = num_blocks - safe_blocks;
 
     if (exec->in_offset_x) {
         p->tail_off_in  = exec->in_offset_x[safe_width];
-        p->tail_size_in = exec->in_offset_x[pass->width - 1] - p->tail_off_in;
+        p->tail_size_in = exec->in_offset_x[width - 1] - p->tail_off_in;
         p->tail_size_in += pixel_bytes(p->filter_size_h, p->pixel_bits_in, 
AV_ROUND_UP);
     } else {
         p->tail_off_in  = pixel_bytes(safe_width, p->pixel_bits_in, 
AV_ROUND_DOWN);
@@ -387,7 +388,7 @@ static void op_pass_run(const SwsFrame *out, const SwsFrame 
*in, const int y,
      *    memcpy the last column on the output side if unpadded.
      */
 
-    const bool memcpy_in  = p->memcpy_last && y + h == pass->height ||
+    const bool memcpy_in  = p->memcpy_last && y + h == pass->lines ||
                             p->memcpy_first && y == 0;
     const bool memcpy_out = p->memcpy_out;
     const size_t num_blocks  = p->num_blocks;
@@ -515,7 +516,7 @@ static int compile(SwsGraph *graph, const SwsOpBackend 
*backend,
         SwsCompiledOp c = *comp;
         av_free(p);
         ret = ff_sws_graph_add_pass(graph, dst->format, dst->width, 
dst->height,
-                                    input, c.slice_align, c.func_opaque,
+                                    input, 0, c.slice_align, c.func_opaque,
                                     NULL, c.priv, c.free, output);
         if (ret >= 0)
             (*output)->backend = comp->backend->flags;
@@ -616,7 +617,7 @@ static int compile(SwsGraph *graph, const SwsOpBackend 
*backend,
     }
 
     ret = ff_sws_graph_add_pass(graph, dst->format, dst->width, dst->height,
-                                input, comp->slice_align, op_pass_run,
+                                input, 0, comp->slice_align, op_pass_run,
                                 op_pass_setup, p, op_pass_free, output);
     if (ret < 0)
         return ret;
-- 
2.52.0


>From 3a2c5050c6eb5dd2ffdf74b0af735c4d76ee9013 Mon Sep 17 00:00:00 2001
From: Niklas Haas <[email protected]>
Date: Fri, 19 Jun 2026 13:30:08 +0200
Subject: [PATCH 08/16] swscale: fix format equality check

I can't say I remember why this logic was written this way, but I can't
think of any good reason why we should exclude comparing the image
dimensions here - the intent is obviously to allow passthrough / noop.

Signed-off-by: Niklas Haas <[email protected]>
---
 libswscale/swscale.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/libswscale/swscale.c b/libswscale/swscale.c
index c0cdd17b78..508967a13c 100644
--- a/libswscale/swscale.c
+++ b/libswscale/swscale.c
@@ -1512,7 +1512,7 @@ int sws_frame_setup(SwsContext *ctx, const AVFrame *dst, 
const AVFrame *src)
 
         src_ok = ff_test_fmt(backends, &src_fmt, 0);
         dst_ok = ff_test_fmt(backends, &dst_fmt, 1);
-        if ((!src_ok || !dst_ok) && !ff_props_equal(&src_fmt, &dst_fmt)) {
+        if ((!src_ok || !dst_ok) && !ff_fmt_equal(&src_fmt, &dst_fmt)) {
             err_msg = src_ok ? "Unsupported output" : "Unsupported input";
             ret = AVERROR(ENOTSUP);
             goto fail;
-- 
2.52.0


>From 7e7c1c0d94634543a7cc215d7c3b32235e487e25 Mon Sep 17 00:00:00 2001
From: Niklas Haas <[email protected]>
Date: Fri, 19 Jun 2026 13:31:30 +0200
Subject: [PATCH 09/16] swscale/format: nuke ff_props_equal()

And merge it into the clearer ff_fmt_equal().

Signed-off-by: Niklas Haas <[email protected]>
---
 libswscale/format.h | 18 ++++++------------
 1 file changed, 6 insertions(+), 12 deletions(-)

diff --git a/libswscale/format.h b/libswscale/format.h
index 36158da55a..24d53b8e80 100644
--- a/libswscale/format.h
+++ b/libswscale/format.h
@@ -120,23 +120,17 @@ static inline int ff_color_equal(const SwsColor *c1, 
const SwsColor *c2)
             ff_prim_equal(&c1->gamut, &c2->gamut);
 }
 
-/* Tests only the static components of a colorspace, ignoring dimensions and 
per-frame data */
-static inline int ff_props_equal(const SwsFormat *fmt1, const SwsFormat *fmt2)
-{
-    return fmt1->interlaced == fmt2->interlaced &&
-           fmt1->format     == fmt2->format     &&
-           fmt1->range      == fmt2->range      &&
-           fmt1->csp        == fmt2->csp        &&
-           fmt1->loc        == fmt2->loc        &&
-           ff_color_equal(&fmt1->color, &fmt2->color);
-}
-
 /* Tests only the static components of a colorspace, ignoring per-frame data */
 static inline int ff_fmt_equal(const SwsFormat *fmt1, const SwsFormat *fmt2)
 {
     return fmt1->width      == fmt2->width      &&
            fmt1->height     == fmt2->height     &&
-           ff_props_equal(fmt1, fmt2);
+           fmt1->interlaced == fmt2->interlaced &&
+           fmt1->format     == fmt2->format     &&
+           fmt1->range      == fmt2->range      &&
+           fmt1->csp        == fmt2->csp        &&
+           fmt1->loc        == fmt2->loc        &&
+           ff_color_equal(&fmt1->color, &fmt2->color);
 }
 
 static inline int ff_fmt_align(enum AVPixelFormat fmt)
-- 
2.52.0


>From aedede0cee249e32a7cd9712255d4d33f89e5858 Mon Sep 17 00:00:00 2001
From: Niklas Haas <[email protected]>
Date: Fri, 19 Jun 2026 14:25:02 +0200
Subject: [PATCH 10/16] swscale/format: add SwsFormat.field

This metadata is needed to compute the correct chroma sampling offsets.
We previously stored this in graph->field, but that's a bad place for it,
because it doesn't survive the translation to the ops abstraction layer.

Sponsored-by: Sovereign Tech Fund
Signed-off-by: Niklas Haas <[email protected]>
---
 libswscale/format.c | 1 +
 libswscale/format.h | 2 ++
 2 files changed, 3 insertions(+)

diff --git a/libswscale/format.c b/libswscale/format.c
index c34d335500..a68565e8e5 100644
--- a/libswscale/format.c
+++ b/libswscale/format.c
@@ -388,6 +388,7 @@ SwsFormat ff_fmt_from_frame(const AVFrame *frame, int field)
     if (frame->flags & AV_FRAME_FLAG_INTERLACED) {
         fmt.height = (fmt.height + (field == FIELD_TOP)) >> 1;
         fmt.interlaced = 1;
+        fmt.field = field;
     }
 
     /* Set luminance and gamut information */
diff --git a/libswscale/format.h b/libswscale/format.h
index 24d53b8e80..9b852efd39 100644
--- a/libswscale/format.h
+++ b/libswscale/format.h
@@ -77,6 +77,7 @@ static inline void ff_color_update_dynamic(SwsColor *dst, 
const SwsColor *src)
 typedef struct SwsFormat {
     int width, height;
     int interlaced;
+    int field;
     enum AVPixelFormat format;
     enum AVPixelFormat hw_format;
     enum AVColorRange range;
@@ -126,6 +127,7 @@ static inline int ff_fmt_equal(const SwsFormat *fmt1, const 
SwsFormat *fmt2)
     return fmt1->width      == fmt2->width      &&
            fmt1->height     == fmt2->height     &&
            fmt1->interlaced == fmt2->interlaced &&
+           fmt1->field      == fmt2->field      &&
            fmt1->format     == fmt2->format     &&
            fmt1->range      == fmt2->range      &&
            fmt1->csp        == fmt2->csp        &&
-- 
2.52.0


>From 4653e68aaba6754cd63a8da4327113564a9a9d0b Mon Sep 17 00:00:00 2001
From: Niklas Haas <[email protected]>
Date: Fri, 19 Jun 2026 14:30:25 +0200
Subject: [PATCH 11/16] swscale/graph: nuke SwsGraph.field

No longer needed after the previous commit.

Sponsored-by: Sovereign Tech Fund
Signed-off-by: Niklas Haas <[email protected]>
---
 libswscale/graph.c      | 28 +++++++++++++++-------------
 libswscale/graph.h      |  7 +++----
 libswscale/swscale.c    |  2 +-
 libswscale/vulkan/ops.c |  2 +-
 4 files changed, 20 insertions(+), 19 deletions(-)

diff --git a/libswscale/graph.c b/libswscale/graph.c
index a765b4cd5c..73df6b8907 100644
--- a/libswscale/graph.c
+++ b/libswscale/graph.c
@@ -405,7 +405,7 @@ static void get_chroma_pos(SwsGraph *graph, int *h_chr_pos, 
int *v_chr_pos,
          * For 4x vertical subsampling (v_sub == 2), they are only placed
          * next to every *other* even row, so we need to shift by three luma
          * rows to get to the chroma sample. */
-        if (graph->field == FIELD_BOTTOM)
+        if (fmt->field == FIELD_BOTTOM)
             y_pos += (256 << sub_y) - 256;
 
         /* Luma row distance is doubled for fields, so halve offsets */
@@ -846,7 +846,7 @@ static void graph_uninit(SwsGraph *graph)
 }
 
 int ff_sws_graph_init(SwsGraph *graph, SwsContext *ctx, const SwsFormat *dst,
-                      const SwsFormat *src, int field)
+                      const SwsFormat *src)
 {
     int ret;
     if (graph->ctx) {
@@ -857,8 +857,9 @@ int ff_sws_graph_init(SwsGraph *graph, SwsContext *ctx, 
const SwsFormat *dst,
     graph->ctx = ctx;
     graph->src = *src;
     graph->dst = *dst;
-    graph->field = field;
     graph->opts_copy = *ctx;
+    av_assert0(src->interlaced == dst->interlaced);
+    av_assert0(src->field      == dst->field);
 
     if (ctx->threads == 1) {
         graph->num_threads = 1;
@@ -895,13 +896,13 @@ error:
 }
 
 int ff_sws_graph_create(SwsContext *ctx, const SwsFormat *dst, const SwsFormat 
*src,
-                        int field, SwsGraph **out_graph)
+                        SwsGraph **out_graph)
 {
     SwsGraph *graph = ff_sws_graph_alloc();
     if (!graph)
         return AVERROR(ENOMEM);
 
-    int ret = ff_sws_graph_init(graph, ctx, dst, src, field);
+    int ret = ff_sws_graph_init(graph, ctx, dst, src);
     if (ret < 0) {
         ff_sws_graph_free(&graph);
         return ret;
@@ -950,7 +951,7 @@ static int opts_equal(const SwsContext *c1, const 
SwsContext *c2)
 }
 
 int ff_sws_graph_reinit(SwsGraph *graph, SwsContext *ctx, const SwsFormat *dst,
-                        const SwsFormat *src, int field)
+                        const SwsFormat *src)
 {
     if (ff_fmt_equal(&graph->src, src) && ff_fmt_equal(&graph->dst, dst) &&
         opts_equal(ctx, &graph->opts_copy))
@@ -960,7 +961,7 @@ int ff_sws_graph_reinit(SwsGraph *graph, SwsContext *ctx, 
const SwsFormat *dst,
     }
 
     graph_uninit(graph);
-    return ff_sws_graph_init(graph, ctx, dst, src, field);
+    return ff_sws_graph_init(graph, ctx, dst, src);
 }
 
 void ff_sws_graph_update_metadata(SwsGraph *graph, const SwsColor *color)
@@ -971,16 +972,17 @@ void ff_sws_graph_update_metadata(SwsGraph *graph, const 
SwsColor *color)
     ff_color_update_dynamic(&graph->src.color, color);
 }
 
-static void get_field(SwsGraph *graph, const AVFrame *avframe, SwsFrame *frame)
+static void get_field(SwsGraph *graph, const SwsFormat *fmt,
+                      const AVFrame *avframe, SwsFrame *frame)
 {
     ff_sws_frame_from_avframe(frame, avframe);
 
     if (!(avframe->flags & AV_FRAME_FLAG_INTERLACED)) {
-        av_assert1(!graph->field);
+        av_assert1(!fmt->field);
         return;
     }
 
-    if (graph->field == FIELD_BOTTOM) {
+    if (fmt->field == FIELD_BOTTOM) {
         /* Odd rows, offset by one line */
         const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(frame->format);
         for (int i = 0; i < 4; i++) {
@@ -995,7 +997,7 @@ static void get_field(SwsGraph *graph, const AVFrame 
*avframe, SwsFrame *frame)
     for (int i = 0; i < 4; i++)
         frame->linesize[i] <<= 1;
 
-    frame->height = (frame->height + (graph->field == FIELD_TOP)) >> 1;
+    frame->height = (frame->height + (fmt->field == FIELD_TOP)) >> 1;
 }
 
 int ff_sws_graph_run(SwsGraph *graph, const AVFrame *dst, const AVFrame *src)
@@ -1004,8 +1006,8 @@ int ff_sws_graph_run(SwsGraph *graph, const AVFrame *dst, 
const AVFrame *src)
     av_assert0(src->format == graph->src.hw_format || src->format == 
graph->src.format);
 
     SwsFrame src_field, dst_field;
-    get_field(graph, dst, &dst_field);
-    get_field(graph, src, &src_field);
+    get_field(graph, &graph->dst, dst, &dst_field);
+    get_field(graph, &graph->src, src, &src_field);
 
     for (int i = 0; i < graph->num_passes; i++) {
         const SwsPass *pass = graph->passes[i];
diff --git a/libswscale/graph.h b/libswscale/graph.h
index eff2dcc47f..cb06f480cc 100644
--- a/libswscale/graph.h
+++ b/libswscale/graph.h
@@ -143,7 +143,6 @@ typedef struct SwsGraph {
      * Currently active format and processing parameters.
      */
     SwsFormat src, dst;
-    int field;
 
     /**
      * Temporary execution state inside ff_sws_graph_run(); used to pass
@@ -166,13 +165,13 @@ SwsGraph *ff_sws_graph_alloc(void);
  * negative error.
  */
 int ff_sws_graph_init(SwsGraph *graph, SwsContext *ctx, const SwsFormat *dst,
-                      const SwsFormat *src, int field);
+                      const SwsFormat *src);
 
 /**
  * Allocate and initialize the filter graph. Returns 0 or a negative error.
  */
 int ff_sws_graph_create(SwsContext *ctx, const SwsFormat *dst, const SwsFormat 
*src,
-                        int field, SwsGraph **out_graph);
+                        SwsGraph **out_graph);
 
 
 /**
@@ -223,7 +222,7 @@ void ff_sws_graph_update_metadata(SwsGraph *graph, const 
SwsColor *color);
  * will have no effect.
  */
 int ff_sws_graph_reinit(SwsGraph *graph, SwsContext *ctx, const SwsFormat *dst,
-                        const SwsFormat *src, int field);
+                        const SwsFormat *src);
 
 /**
  * Dispatch the filter graph on a single field of the given frames. Internally
diff --git a/libswscale/swscale.c b/libswscale/swscale.c
index 508967a13c..969456efcc 100644
--- a/libswscale/swscale.c
+++ b/libswscale/swscale.c
@@ -1527,7 +1527,7 @@ int sws_frame_setup(SwsContext *ctx, const AVFrame *dst, 
const AVFrame *src)
             }
         }
 
-        ret = ff_sws_graph_reinit(s->graph[field], ctx, &dst_fmt, &src_fmt, 
field);
+        ret = ff_sws_graph_reinit(s->graph[field], ctx, &dst_fmt, &src_fmt);
         if (ret < 0) {
             err_msg = "Failed initializing scaling graph";
             goto fail;
diff --git a/libswscale/vulkan/ops.c b/libswscale/vulkan/ops.c
index 1218fab2c7..9d5f201e69 100644
--- a/libswscale/vulkan/ops.c
+++ b/libswscale/vulkan/ops.c
@@ -153,7 +153,7 @@ static void process(const SwsFrame *dst, const SwsFrame 
*src, int y, int h,
     });
 
     if (p->interlaced) {
-        uint32_t field = pass->graph ? pass->graph->field : 0;
+        uint32_t field = pass->graph ? pass->graph->dst.field : 0;
         ff_vk_shader_update_push_const(&p->s->vkctx, ec, &p->shd,
                                        VK_SHADER_STAGE_COMPUTE_BIT,
                                        0, sizeof(field), &field);
-- 
2.52.0


>From 1f6dc79c807ab6044e8b4ef990386b2e9d9e918e Mon Sep 17 00:00:00 2001
From: Niklas Haas <[email protected]>
Date: Fri, 19 Jun 2026 15:07:24 +0200
Subject: [PATCH 12/16] swscale/format: factor out ff_sws_chroma_pos() helper

Moved here from graph.c, as it's needed for the new chroma scaling code.

Sponsored-by: Sovereign Tech Fund
Signed-off-by: Niklas Haas <[email protected]>
---
 libswscale/format.c | 41 ++++++++++++++++++++++++++++++++
 libswscale/format.h |  7 ++++++
 libswscale/graph.c  | 58 ++++++++++-----------------------------------
 3 files changed, 61 insertions(+), 45 deletions(-)

diff --git a/libswscale/format.c b/libswscale/format.c
index a68565e8e5..0538ab9db4 100644
--- a/libswscale/format.c
+++ b/libswscale/format.c
@@ -1393,6 +1393,47 @@ linear_mat3(const AVRational m00, const AVRational m01, 
const AVRational m02,
     return c;
 }
 
+void ff_sws_chroma_pos(const SwsFormat *fmt, bool *incomplete,
+                       int *out_x_pos, int *out_y_pos)
+{
+    enum AVChromaLocation chroma_loc = fmt->loc;
+    const int sub_x = fmt->desc->log2_chroma_w;
+    const int sub_y = fmt->desc->log2_chroma_h;
+    int x_pos, y_pos;
+
+    /* Explicitly default to center siting for compatibility with swscale */
+    if (chroma_loc == AVCHROMA_LOC_UNSPECIFIED) {
+        chroma_loc = AVCHROMA_LOC_CENTER;
+        *incomplete |= sub_x || sub_y;
+    }
+
+    /* av_chroma_location_enum_to_pos() always gives us values in the range 
from
+     * 0 to 256, but we need to adjust this to the true value range of the
+     * subsampling grid, which may be larger for h/v_sub > 1 */
+    av_chroma_location_enum_to_pos(&x_pos, &y_pos, chroma_loc);
+    x_pos *= (1 << sub_x) - 1;
+    y_pos *= (1 << sub_y) - 1;
+
+    /* Fix vertical chroma position for interlaced frames */
+    if (sub_y && fmt->interlaced) {
+        /* When vertically subsampling, chroma samples are effectively only
+         * placed next to even rows. To access them from the odd field, we need
+         * to account for this shift by offsetting the distance of one luma 
row.
+         *
+         * For 4x vertical subsampling (v_sub == 2), they are only placed
+         * next to every *other* even row, so we need to shift by three luma
+         * rows to get to the chroma sample. */
+        if (fmt->field == FIELD_BOTTOM)
+            y_pos += (256 << sub_y) - 256;
+
+        /* Luma row distance is doubled for fields, so halve offsets */
+        y_pos >>= 1;
+    }
+
+    *out_x_pos = x_pos;
+    *out_y_pos = y_pos;
+}
+
 int ff_sws_decode_colors(SwsContext *ctx, SwsPixelType type,
                          SwsOpList *ops, const SwsFormat *fmt, bool 
*incomplete)
 {
diff --git a/libswscale/format.h b/libswscale/format.h
index 9b852efd39..ea2ab7dc41 100644
--- a/libswscale/format.h
+++ b/libswscale/format.h
@@ -161,6 +161,13 @@ int ff_test_fmt(SwsBackend backends, const SwsFormat *fmt, 
int output);
 /* Returns true if the formats are incomplete, false otherwise */
 bool ff_infer_colors(SwsColor *src, SwsColor *dst);
 
+/**
+ * Wrapper around av_chroma_location_enum_to_pos() that accounts for
+ * the per-field offset introduced by interlacing.
+ */
+void ff_sws_chroma_pos(const SwsFormat *fmt, bool *incomplete,
+                       int *out_xpos, int *out_ypos);
+
 typedef struct SwsOpList SwsOpList;
 typedef enum SwsPixelType SwsPixelType;
 
diff --git a/libswscale/graph.c b/libswscale/graph.c
index 73df6b8907..a99cd7cadf 100644
--- a/libswscale/graph.c
+++ b/libswscale/graph.c
@@ -375,49 +375,6 @@ static void run_legacy_swscale(const SwsFrame *out, const 
SwsFrame *in,
                sws->src_h, out_data, out->linesize, y, h);
 }
 
-static void get_chroma_pos(SwsGraph *graph, int *h_chr_pos, int *v_chr_pos,
-                           const SwsFormat *fmt)
-{
-    enum AVChromaLocation chroma_loc = fmt->loc;
-    const int sub_x = fmt->desc->log2_chroma_w;
-    const int sub_y = fmt->desc->log2_chroma_h;
-    int x_pos, y_pos;
-
-    /* Explicitly default to center siting for compatibility with swscale */
-    if (chroma_loc == AVCHROMA_LOC_UNSPECIFIED) {
-        chroma_loc = AVCHROMA_LOC_CENTER;
-        graph->incomplete |= sub_x || sub_y;
-    }
-
-    /* av_chroma_location_enum_to_pos() always gives us values in the range 
from
-     * 0 to 256, but we need to adjust this to the true value range of the
-     * subsampling grid, which may be larger for h/v_sub > 1 */
-    av_chroma_location_enum_to_pos(&x_pos, &y_pos, chroma_loc);
-    x_pos *= (1 << sub_x) - 1;
-    y_pos *= (1 << sub_y) - 1;
-
-    /* Fix vertical chroma position for interlaced frames */
-    if (sub_y && fmt->interlaced) {
-        /* When vertically subsampling, chroma samples are effectively only
-         * placed next to even rows. To access them from the odd field, we need
-         * to account for this shift by offsetting the distance of one luma 
row.
-         *
-         * For 4x vertical subsampling (v_sub == 2), they are only placed
-         * next to every *other* even row, so we need to shift by three luma
-         * rows to get to the chroma sample. */
-        if (fmt->field == FIELD_BOTTOM)
-            y_pos += (256 << sub_y) - 256;
-
-        /* Luma row distance is doubled for fields, so halve offsets */
-        y_pos >>= 1;
-    }
-
-    /* Explicitly strip chroma offsets when not subsampling, because it
-     * interferes with the operation of flags like SWS_FULL_CHR_H_INP */
-    *h_chr_pos = sub_x ? x_pos : -513;
-    *v_chr_pos = sub_y ? y_pos : -513;
-}
-
 static void legacy_chr_pos(SwsGraph *graph, int *chr_pos, int override, int 
*warned)
 {
     if (override == -513 || override == *chr_pos)
@@ -582,8 +539,8 @@ static int add_legacy_sws_pass(SwsGraph *graph, const 
SwsFormat *src,
     sws->dst_h      = dst->height;
     sws->dst_format = dst->format;
     sws->dst_range  = dst->range == AVCOL_RANGE_JPEG;
-    get_chroma_pos(graph, &sws->src_h_chr_pos, &sws->src_v_chr_pos, src);
-    get_chroma_pos(graph, &sws->dst_h_chr_pos, &sws->dst_v_chr_pos, dst);
+    ff_sws_chroma_pos(src, &graph->incomplete, &sws->src_h_chr_pos, 
&sws->src_v_chr_pos);
+    ff_sws_chroma_pos(dst, &graph->incomplete, &sws->dst_h_chr_pos, 
&sws->dst_v_chr_pos);
 
     graph->incomplete |= src->range == AVCOL_RANGE_UNSPECIFIED;
     graph->incomplete |= dst->range == AVCOL_RANGE_UNSPECIFIED;
@@ -594,6 +551,17 @@ static int add_legacy_sws_pass(SwsGraph *graph, const 
SwsFormat *src,
     legacy_chr_pos(graph, &sws->dst_h_chr_pos, ctx->dst_h_chr_pos, &warned);
     legacy_chr_pos(graph, &sws->dst_v_chr_pos, ctx->dst_v_chr_pos, &warned);
 
+    /* Explicitly strip chroma offsets when not subsampling, because it
+     * interferes with the operation of flags like SWS_FULL_CHR_H_INP */
+    if (!src->desc->log2_chroma_w)
+        sws->src_h_chr_pos = -513;
+    if (!src->desc->log2_chroma_h)
+        sws->src_v_chr_pos = -513;
+    if (!dst->desc->log2_chroma_w)
+        sws->dst_h_chr_pos = -513;
+    if (!dst->desc->log2_chroma_h)
+        sws->dst_v_chr_pos = -513;
+
     for (int i = 0; i < SWS_NUM_SCALER_PARAMS; i++)
         sws->scaler_params[i] = ctx->scaler_params[i];
 
-- 
2.52.0


>From 8215e9bbea07173b59e842a860df58aca18ff01a Mon Sep 17 00:00:00 2001
From: Niklas Haas <[email protected]>
Date: Mon, 8 Jun 2026 14:44:01 +0200
Subject: [PATCH 13/16] swscale/filters: add option for adding an input pixel
 offset

This is needed for chroma subsampling, which requires a different filter
offset for chroma subsamples (according to the frame's chroma location).

Sponsored-by: Sovereign Tech Fund
Signed-off-by: Niklas Haas <[email protected]>
---
 libswscale/filters.c | 3 ++-
 libswscale/filters.h | 8 ++++++++
 2 files changed, 10 insertions(+), 1 deletion(-)

diff --git a/libswscale/filters.c b/libswscale/filters.c
index 7e8865659e..81a388e692 100644
--- a/libswscale/filters.c
+++ b/libswscale/filters.c
@@ -75,7 +75,7 @@ static void compute_row(SwsFilterWeights *f, const 
SwsFilterFunction *fun,
      * the entire square from (0,0) to (1,1). When normalizing between 
different
      * image sizes, we therefore need to add/subtract off these 0.5 offsets.
      */
-    const double src_pos = (dst_pos + 0.5) * ratio_inv - 0.5;
+    const double src_pos = (dst_pos + 0.5) * ratio_inv - 0.5 + f->offset;
     if (f->filter_size == 1) {
         *pos = fmin(fmax(round(src_pos), 0.0), f->src_size - 1);
         *out = SWS_FILTER_SCALE;
@@ -257,6 +257,7 @@ int ff_sws_filter_generate(void *log, const SwsFilterParams 
*params,
     memcpy(filter->name, fun.name, sizeof(filter->name));
     filter->src_size = params->src_size;
     filter->dst_size = params->dst_size;
+    filter->offset = params->offset;
     filter->filter_size = filter_size;
     if (filter->filter_size == 1)
         filter->sum_positive = SWS_FILTER_SCALE;
diff --git a/libswscale/filters.h b/libswscale/filters.h
index 1bfdb196b8..143fc2c5e3 100644
--- a/libswscale/filters.h
+++ b/libswscale/filters.h
@@ -56,6 +56,13 @@ typedef struct SwsFilterParams {
      */
     int src_size;
     int dst_size;
+
+    /**
+     * The sample offset, in units of input pixels. This is added onto all
+     * sampled coordinates directly, i.e. a value of offset = 1.0 would shift
+     * the output to the top/left by one whole source pixel.
+     */
+    double offset;
 } SwsFilterParams;
 
 /**
@@ -88,6 +95,7 @@ typedef struct SwsFilterWeights {
      */
     int src_size;
     int dst_size;
+    double offset;
 
     /**
      * Extra metadata about the filter, used to inform the optimizer / range
-- 
2.52.0


>From fa1ca69a8bde7a74e7c01777c75bb7c7e0754ee3 Mon Sep 17 00:00:00 2001
From: Niklas Haas <[email protected]>
Date: Fri, 19 Jun 2026 15:36:08 +0200
Subject: [PATCH 14/16] swscale/filters: add ability to set a virtual output
 size

Odd-size luma planes are not exact multiples of the chroma plane, but the
sample grid is still matched as though they were. We need to account for this
when translating a luma sample to the corresponding chroma sample coordinates.

Sponsored-by: Sovereign Tech Fund
Signed-off-by: Niklas Haas <[email protected]>
---
 libswscale/filters.c |  7 ++++++-
 libswscale/filters.h | 15 +++++++++++++++
 2 files changed, 21 insertions(+), 1 deletion(-)

diff --git a/libswscale/filters.c b/libswscale/filters.c
index 81a388e692..fb54d2bbd8 100644
--- a/libswscale/filters.c
+++ b/libswscale/filters.c
@@ -194,7 +194,11 @@ int ff_sws_filter_generate(void *log, const 
SwsFilterParams *params,
     if (scaler == SWS_SCALE_AUTO)
         scaler = SWS_SCALE_BICUBIC;
 
-    const double ratio = (double) params->dst_size / params->src_size;
+    double virtual_size = params->virtual_size;
+    if (!virtual_size)
+        virtual_size = params->dst_size;
+
+    const double ratio = virtual_size / params->src_size;
     double stretch = 1.0;
     if (ratio < 1.0 && scaler != SWS_SCALE_POINT) {
         /* Widen filter for downscaling (anti-aliasing) */
@@ -257,6 +261,7 @@ int ff_sws_filter_generate(void *log, const SwsFilterParams 
*params,
     memcpy(filter->name, fun.name, sizeof(filter->name));
     filter->src_size = params->src_size;
     filter->dst_size = params->dst_size;
+    filter->virtual_size = virtual_size;
     filter->offset = params->offset;
     filter->filter_size = filter_size;
     if (filter->filter_size == 1)
diff --git a/libswscale/filters.h b/libswscale/filters.h
index 143fc2c5e3..a10e6f3964 100644
--- a/libswscale/filters.h
+++ b/libswscale/filters.h
@@ -57,6 +57,20 @@ typedef struct SwsFilterParams {
     int src_size;
     int dst_size;
 
+    /**
+     * The virtual output size. If zero, this is assumed to be the same as
+     * `dst_size`. Matters for e.g. chroma subsampling, where the luma
+     * plane may be smaller than the dst_size. For example, a 99x99 input
+     * image has a chroma size of 50x50, which would be 100x100 after
+     * chroma upscaling; but is sampled only at 99x99 resolution. In this
+     * instance, dst_size is 99x99 and virtual_size is 100x100.
+     *
+     * The upscaling offset from this shift is implicit and does not need
+     * to be accounted for in `offset`. In other words, `offset` is taken
+     * relative to the virtual size, not the sampled size.
+     */
+    double virtual_size;
+
     /**
      * The sample offset, in units of input pixels. This is added onto all
      * sampled coordinates directly, i.e. a value of offset = 1.0 would shift
@@ -95,6 +109,7 @@ typedef struct SwsFilterWeights {
      */
     int src_size;
     int dst_size;
+    double virtual_size;
     double offset;
 
     /**
-- 
2.52.0


>From 8cc6b2ddafe184cf544a0a10a954570740e94912 Mon Sep 17 00:00:00 2001
From: Niklas Haas <[email protected]>
Date: Fri, 19 Jun 2026 16:30:28 +0200
Subject: [PATCH 15/16] swscale/tests/swscale: fix unscaled subsampled chroma
 format check

This should be matching against the *chroma* scaler, not the main scaler.
Of course, under normal circumstances, scaler_sub matches scaler, but this
allows users to explicitly override this default by setting e.g.

-scaler none -scaler_sub bicubic

Sponsored-by: Sovereign Tech Fund
Signed-off-by: Niklas Haas <[email protected]>
---
 libswscale/tests/swscale.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/libswscale/tests/swscale.c b/libswscale/tests/swscale.c
index b5b8faeeff..6de9327ba9 100644
--- a/libswscale/tests/swscale.c
+++ b/libswscale/tests/swscale.c
@@ -691,8 +691,9 @@ static inline int fmt_is_supported_by_hw(enum AVPixelFormat 
fmt)
 
 static inline int fmt_disabled(const struct options *opts, enum AVPixelFormat 
fmt)
 {
+    const int scaler_sub = opts->scaler_sub ? opts->scaler_sub : opts->scaler;
     return (hw_device_constr && !fmt_is_supported_by_hw(fmt)) ||
-           (opts->scaler < 0 && fmt_is_subsampled(fmt));
+           (scaler_sub < 0 && fmt_is_subsampled(fmt));
 }
 
 static inline int test_formats(const struct options *opts,
-- 
2.52.0


>From b3689e792fdbaaac690ffb98e03dc75ca56483a6 Mon Sep 17 00:00:00 2001
From: Niklas Haas <[email protected]>
Date: Sat, 20 Jun 2026 02:56:18 +0200
Subject: [PATCH 16/16] swscale/uops: simplify permute naming scheme

We also drop the useless/unused mask from the permute ops.

Avoids a bunch of otherwise duplicate permute ops. Now that this is
handled by SWS_UOP_MOVE for x86, there is no downside to this.

The FATE change is a pure rename of the uops dumps.

Sponsored-by: Sovereign Tech Fund
Signed-off-by: Niklas Haas <[email protected]>
---
 libswscale/uops.c           |  22 ++-
 libswscale/uops_macros.h    | 258 ++++++++++++++----------------------
 tests/ref/fate/sws-ops-list |   2 +-
 3 files changed, 112 insertions(+), 170 deletions(-)

diff --git a/libswscale/uops.c b/libswscale/uops.c
index 1bd3e2f763..a0bbf5ddbc 100644
--- a/libswscale/uops.c
+++ b/libswscale/uops.c
@@ -155,6 +155,10 @@ void ff_sws_uop_name(const SwsUOp *op, char 
buf[SWS_UOP_NAME_MAX])
         av_bprintf(&bp, "_%u", par->shift.amount);
         break;
     case SWS_UOP_PERMUTE:
+        av_bprint_chars(&bp, '_', 1);
+        for (int i = 0; i < 4; i++)
+            av_bprint_chars(&bp, "xyzw"[par->swizzle.in[i]], 1);
+        break;
     case SWS_UOP_COPY:
         av_bprint_chars(&bp, '_', 1);
         for (int i = 0; i < 4; i++) {
@@ -597,13 +601,13 @@ static int translate_swizzle(SwsUOpList *ops, SwsUOpFlags 
flags, const SwsOp *op
     SwsUOp uop = {
         .type = pixel_type_to_int(op->type),
         .uop  = SWS_UOP_PERMUTE,
-        .mask = ff_sws_comp_mask_needed(op),
         .par.swizzle.in = {0, 1, 2, 3},
     };
 
+    SwsCompMask needed = ff_sws_comp_mask_needed(op);
     SwsCompMask seen = 0;
     for (int i = 0; i < 4; i++) {
-        if (!SWS_COMP_TEST(uop.mask, i))
+        if (!SWS_COMP_TEST(needed, i))
             continue;
         const int src = op->swizzle.in[i];
         if (SWS_COMP_TEST(seen, src))
@@ -615,7 +619,7 @@ static int translate_swizzle(SwsUOpList *ops, SwsUOpFlags 
flags, const SwsOp *op
     if (uop.uop == SWS_UOP_PERMUTE) {
         /* Prevent overlap by moving unused components to unseen indices */
         for (int i = 0; i < 4; i++) {
-            if (SWS_COMP_TEST(uop.mask, i))
+            if (SWS_COMP_TEST(needed, i))
                 continue;
 
             /* Prefer identity mapping if possible */
@@ -634,10 +638,14 @@ static int translate_swizzle(SwsUOpList *ops, SwsUOpFlags flags, const SwsOp *op
         }
     }
 
-    /* Remove remaining trivial / identity components from the mask */
-    for (int i = 0; i < 4; i++) {
-        if (uop.par.swizzle.in[i] == i)
-            uop.mask &= ~SWS_COMP(i);
+    if (uop.uop == SWS_UOP_COPY) {
+        /* Remove remaining trivial / identity components from the mask */
+        for (int i = 0; i < 4; i++) {
+            if (uop.par.swizzle.in[i] == i)
+                needed &= ~SWS_COMP(i);
+        }
+
+        uop.mask = needed;
     }
 
     return ff_sws_uop_list_append(ops, &uop);
diff --git a/libswscale/uops_macros.h b/libswscale/uops_macros.h
index 5d9e1a8026..77a4900248 100644
--- a/libswscale/uops_macros.h
+++ b/libswscale/uops_macros.h
@@ -102,67 +102,43 @@
 #define SWS_FOR_STRUCT_U8_WRITE_BIT(MACRO, ...) \
     MACRO(__VA_ARGS__, u8_write_bit_x                          , .type = 
SWS_PIXEL_U8 , .uop = SWS_UOP_WRITE_BIT       , .mask = 0x1)
 #define SWS_FOR_U8_PERMUTE(MACRO, ...) \
-    MACRO(__VA_ARGS__, u8_permute_x_y                          , SWS_PIXEL_U8 
, SWS_UOP_PERMUTE         , 0x1, 1, 0, 2, 3) \
-    MACRO(__VA_ARGS__, u8_permute_x_z                          , SWS_PIXEL_U8 
, SWS_UOP_PERMUTE         , 0x1, 2, 1, 0, 3) \
-    MACRO(__VA_ARGS__, u8_permute_x_w                          , SWS_PIXEL_U8 
, SWS_UOP_PERMUTE         , 0x1, 3, 1, 2, 0) \
-    MACRO(__VA_ARGS__, u8_permute_y_w                          , SWS_PIXEL_U8 
, SWS_UOP_PERMUTE         , 0x2, 0, 3, 2, 1) \
-    MACRO(__VA_ARGS__, u8_permute_y_x                          , SWS_PIXEL_U8 
, SWS_UOP_PERMUTE         , 0x2, 1, 0, 2, 3) \
-    MACRO(__VA_ARGS__, u8_permute_xy_yx                        , SWS_PIXEL_U8 
, SWS_UOP_PERMUTE         , 0x3, 1, 0, 2, 3) \
-    MACRO(__VA_ARGS__, u8_permute_xy_yw                        , SWS_PIXEL_U8 
, SWS_UOP_PERMUTE         , 0x3, 1, 3, 2, 0) \
-    MACRO(__VA_ARGS__, u8_permute_xy_zw                        , SWS_PIXEL_U8 
, SWS_UOP_PERMUTE         , 0x3, 2, 3, 0, 1) \
-    MACRO(__VA_ARGS__, u8_permute_xy_wx                        , SWS_PIXEL_U8 
, SWS_UOP_PERMUTE         , 0x3, 3, 0, 2, 1) \
-    MACRO(__VA_ARGS__, u8_permute_z_x                          , SWS_PIXEL_U8 
, SWS_UOP_PERMUTE         , 0x4, 1, 2, 0, 3) \
-    MACRO(__VA_ARGS__, u8_permute_xz_zx                        , SWS_PIXEL_U8 
, SWS_UOP_PERMUTE         , 0x5, 2, 1, 0, 3) \
-    MACRO(__VA_ARGS__, u8_permute_yz_zy                        , SWS_PIXEL_U8 
, SWS_UOP_PERMUTE         , 0x6, 0, 2, 1, 3) \
-    MACRO(__VA_ARGS__, u8_permute_xyz_yzx                      , SWS_PIXEL_U8 
, SWS_UOP_PERMUTE         , 0x7, 1, 2, 0, 3) \
-    MACRO(__VA_ARGS__, u8_permute_xyz_yzw                      , SWS_PIXEL_U8 
, SWS_UOP_PERMUTE         , 0x7, 1, 2, 3, 0) \
-    MACRO(__VA_ARGS__, u8_permute_xyz_zxy                      , SWS_PIXEL_U8 
, SWS_UOP_PERMUTE         , 0x7, 2, 0, 1, 3) \
-    MACRO(__VA_ARGS__, u8_permute_xyz_zwy                      , SWS_PIXEL_U8 
, SWS_UOP_PERMUTE         , 0x7, 2, 3, 1, 0) \
-    MACRO(__VA_ARGS__, u8_permute_xyz_wzy                      , SWS_PIXEL_U8 
, SWS_UOP_PERMUTE         , 0x7, 3, 2, 1, 0) \
-    MACRO(__VA_ARGS__, u8_permute_w_x                          , SWS_PIXEL_U8 
, SWS_UOP_PERMUTE         , 0x8, 3, 1, 2, 0) \
-    MACRO(__VA_ARGS__, u8_permute_yw_wy                        , SWS_PIXEL_U8 
, SWS_UOP_PERMUTE         , 0xa, 0, 3, 2, 1) \
-    MACRO(__VA_ARGS__, u8_permute_zw_xz                        , SWS_PIXEL_U8 
, SWS_UOP_PERMUTE         , 0xc, 3, 1, 0, 2) \
-    MACRO(__VA_ARGS__, u8_permute_xzw_zwx                      , SWS_PIXEL_U8 
, SWS_UOP_PERMUTE         , 0xd, 2, 1, 3, 0) \
-    MACRO(__VA_ARGS__, u8_permute_xzw_wxz                      , SWS_PIXEL_U8 
, SWS_UOP_PERMUTE         , 0xd, 3, 1, 0, 2) \
-    MACRO(__VA_ARGS__, u8_permute_yzw_zwy                      , SWS_PIXEL_U8 
, SWS_UOP_PERMUTE         , 0xe, 0, 2, 3, 1) \
-    MACRO(__VA_ARGS__, u8_permute_yzw_xyz                      , SWS_PIXEL_U8 
, SWS_UOP_PERMUTE         , 0xe, 3, 0, 1, 2) \
-    MACRO(__VA_ARGS__, u8_permute_yzw_zxy                      , SWS_PIXEL_U8 
, SWS_UOP_PERMUTE         , 0xe, 3, 2, 0, 1) \
-    MACRO(__VA_ARGS__, u8_permute_yzw_zyx                      , SWS_PIXEL_U8 
, SWS_UOP_PERMUTE         , 0xe, 3, 2, 1, 0) \
-    MACRO(__VA_ARGS__, u8_permute_xyzw_yzwx                    , SWS_PIXEL_U8 
, SWS_UOP_PERMUTE         , 0xf, 1, 2, 3, 0) \
-    MACRO(__VA_ARGS__, u8_permute_xyzw_wxyz                    , SWS_PIXEL_U8 
, SWS_UOP_PERMUTE         , 0xf, 3, 0, 1, 2) \
-    MACRO(__VA_ARGS__, u8_permute_xyzw_wzxy                    , SWS_PIXEL_U8 
, SWS_UOP_PERMUTE         , 0xf, 3, 2, 0, 1) \
-    MACRO(__VA_ARGS__, u8_permute_xyzw_wzyx                    , SWS_PIXEL_U8 
, SWS_UOP_PERMUTE         , 0xf, 3, 2, 1, 0)
+    MACRO(__VA_ARGS__, u8_permute_xzyw                         , SWS_PIXEL_U8 
, SWS_UOP_PERMUTE         , 0x0, 0, 2, 1, 3) \
+    MACRO(__VA_ARGS__, u8_permute_xzwy                         , SWS_PIXEL_U8 
, SWS_UOP_PERMUTE         , 0x0, 0, 2, 3, 1) \
+    MACRO(__VA_ARGS__, u8_permute_xwzy                         , SWS_PIXEL_U8 
, SWS_UOP_PERMUTE         , 0x0, 0, 3, 2, 1) \
+    MACRO(__VA_ARGS__, u8_permute_yxzw                         , SWS_PIXEL_U8 
, SWS_UOP_PERMUTE         , 0x0, 1, 0, 2, 3) \
+    MACRO(__VA_ARGS__, u8_permute_yzxw                         , SWS_PIXEL_U8 
, SWS_UOP_PERMUTE         , 0x0, 1, 2, 0, 3) \
+    MACRO(__VA_ARGS__, u8_permute_yzwx                         , SWS_PIXEL_U8 
, SWS_UOP_PERMUTE         , 0x0, 1, 2, 3, 0) \
+    MACRO(__VA_ARGS__, u8_permute_ywzx                         , SWS_PIXEL_U8 
, SWS_UOP_PERMUTE         , 0x0, 1, 3, 2, 0) \
+    MACRO(__VA_ARGS__, u8_permute_zxyw                         , SWS_PIXEL_U8 
, SWS_UOP_PERMUTE         , 0x0, 2, 0, 1, 3) \
+    MACRO(__VA_ARGS__, u8_permute_zyxw                         , SWS_PIXEL_U8 
, SWS_UOP_PERMUTE         , 0x0, 2, 1, 0, 3) \
+    MACRO(__VA_ARGS__, u8_permute_zywx                         , SWS_PIXEL_U8 
, SWS_UOP_PERMUTE         , 0x0, 2, 1, 3, 0) \
+    MACRO(__VA_ARGS__, u8_permute_zwxy                         , SWS_PIXEL_U8 
, SWS_UOP_PERMUTE         , 0x0, 2, 3, 0, 1) \
+    MACRO(__VA_ARGS__, u8_permute_zwyx                         , SWS_PIXEL_U8 
, SWS_UOP_PERMUTE         , 0x0, 2, 3, 1, 0) \
+    MACRO(__VA_ARGS__, u8_permute_wxyz                         , SWS_PIXEL_U8 
, SWS_UOP_PERMUTE         , 0x0, 3, 0, 1, 2) \
+    MACRO(__VA_ARGS__, u8_permute_wxzy                         , SWS_PIXEL_U8 
, SWS_UOP_PERMUTE         , 0x0, 3, 0, 2, 1) \
+    MACRO(__VA_ARGS__, u8_permute_wyxz                         , SWS_PIXEL_U8 
, SWS_UOP_PERMUTE         , 0x0, 3, 1, 0, 2) \
+    MACRO(__VA_ARGS__, u8_permute_wyzx                         , SWS_PIXEL_U8 
, SWS_UOP_PERMUTE         , 0x0, 3, 1, 2, 0) \
+    MACRO(__VA_ARGS__, u8_permute_wzxy                         , SWS_PIXEL_U8 
, SWS_UOP_PERMUTE         , 0x0, 3, 2, 0, 1) \
+    MACRO(__VA_ARGS__, u8_permute_wzyx                         , SWS_PIXEL_U8 
, SWS_UOP_PERMUTE         , 0x0, 3, 2, 1, 0)
 #define SWS_FOR_STRUCT_U8_PERMUTE(MACRO, ...) \
-    MACRO(__VA_ARGS__, u8_permute_x_y                          , .type = 
SWS_PIXEL_U8 , .uop = SWS_UOP_PERMUTE         , .mask = 0x1, .par.swizzle.in = 
{1, 0, 2, 3}) \
-    MACRO(__VA_ARGS__, u8_permute_x_z                          , .type = 
SWS_PIXEL_U8 , .uop = SWS_UOP_PERMUTE         , .mask = 0x1, .par.swizzle.in = 
{2, 1, 0, 3}) \
-    MACRO(__VA_ARGS__, u8_permute_x_w                          , .type = 
SWS_PIXEL_U8 , .uop = SWS_UOP_PERMUTE         , .mask = 0x1, .par.swizzle.in = 
{3, 1, 2, 0}) \
-    MACRO(__VA_ARGS__, u8_permute_y_w                          , .type = 
SWS_PIXEL_U8 , .uop = SWS_UOP_PERMUTE         , .mask = 0x2, .par.swizzle.in = 
{0, 3, 2, 1}) \
-    MACRO(__VA_ARGS__, u8_permute_y_x                          , .type = 
SWS_PIXEL_U8 , .uop = SWS_UOP_PERMUTE         , .mask = 0x2, .par.swizzle.in = 
{1, 0, 2, 3}) \
-    MACRO(__VA_ARGS__, u8_permute_xy_yx                        , .type = 
SWS_PIXEL_U8 , .uop = SWS_UOP_PERMUTE         , .mask = 0x3, .par.swizzle.in = 
{1, 0, 2, 3}) \
-    MACRO(__VA_ARGS__, u8_permute_xy_yw                        , .type = 
SWS_PIXEL_U8 , .uop = SWS_UOP_PERMUTE         , .mask = 0x3, .par.swizzle.in = 
{1, 3, 2, 0}) \
-    MACRO(__VA_ARGS__, u8_permute_xy_zw                        , .type = 
SWS_PIXEL_U8 , .uop = SWS_UOP_PERMUTE         , .mask = 0x3, .par.swizzle.in = 
{2, 3, 0, 1}) \
-    MACRO(__VA_ARGS__, u8_permute_xy_wx                        , .type = 
SWS_PIXEL_U8 , .uop = SWS_UOP_PERMUTE         , .mask = 0x3, .par.swizzle.in = 
{3, 0, 2, 1}) \
-    MACRO(__VA_ARGS__, u8_permute_z_x                          , .type = 
SWS_PIXEL_U8 , .uop = SWS_UOP_PERMUTE         , .mask = 0x4, .par.swizzle.in = 
{1, 2, 0, 3}) \
-    MACRO(__VA_ARGS__, u8_permute_xz_zx                        , .type = 
SWS_PIXEL_U8 , .uop = SWS_UOP_PERMUTE         , .mask = 0x5, .par.swizzle.in = 
{2, 1, 0, 3}) \
-    MACRO(__VA_ARGS__, u8_permute_yz_zy                        , .type = 
SWS_PIXEL_U8 , .uop = SWS_UOP_PERMUTE         , .mask = 0x6, .par.swizzle.in = 
{0, 2, 1, 3}) \
-    MACRO(__VA_ARGS__, u8_permute_xyz_yzx                      , .type = 
SWS_PIXEL_U8 , .uop = SWS_UOP_PERMUTE         , .mask = 0x7, .par.swizzle.in = 
{1, 2, 0, 3}) \
-    MACRO(__VA_ARGS__, u8_permute_xyz_yzw                      , .type = 
SWS_PIXEL_U8 , .uop = SWS_UOP_PERMUTE         , .mask = 0x7, .par.swizzle.in = 
{1, 2, 3, 0}) \
-    MACRO(__VA_ARGS__, u8_permute_xyz_zxy                      , .type = 
SWS_PIXEL_U8 , .uop = SWS_UOP_PERMUTE         , .mask = 0x7, .par.swizzle.in = 
{2, 0, 1, 3}) \
-    MACRO(__VA_ARGS__, u8_permute_xyz_zwy                      , .type = 
SWS_PIXEL_U8 , .uop = SWS_UOP_PERMUTE         , .mask = 0x7, .par.swizzle.in = 
{2, 3, 1, 0}) \
-    MACRO(__VA_ARGS__, u8_permute_xyz_wzy                      , .type = 
SWS_PIXEL_U8 , .uop = SWS_UOP_PERMUTE         , .mask = 0x7, .par.swizzle.in = 
{3, 2, 1, 0}) \
-    MACRO(__VA_ARGS__, u8_permute_w_x                          , .type = 
SWS_PIXEL_U8 , .uop = SWS_UOP_PERMUTE         , .mask = 0x8, .par.swizzle.in = 
{3, 1, 2, 0}) \
-    MACRO(__VA_ARGS__, u8_permute_yw_wy                        , .type = 
SWS_PIXEL_U8 , .uop = SWS_UOP_PERMUTE         , .mask = 0xa, .par.swizzle.in = 
{0, 3, 2, 1}) \
-    MACRO(__VA_ARGS__, u8_permute_zw_xz                        , .type = 
SWS_PIXEL_U8 , .uop = SWS_UOP_PERMUTE         , .mask = 0xc, .par.swizzle.in = 
{3, 1, 0, 2}) \
-    MACRO(__VA_ARGS__, u8_permute_xzw_zwx                      , .type = 
SWS_PIXEL_U8 , .uop = SWS_UOP_PERMUTE         , .mask = 0xd, .par.swizzle.in = 
{2, 1, 3, 0}) \
-    MACRO(__VA_ARGS__, u8_permute_xzw_wxz                      , .type = 
SWS_PIXEL_U8 , .uop = SWS_UOP_PERMUTE         , .mask = 0xd, .par.swizzle.in = 
{3, 1, 0, 2}) \
-    MACRO(__VA_ARGS__, u8_permute_yzw_zwy                      , .type = 
SWS_PIXEL_U8 , .uop = SWS_UOP_PERMUTE         , .mask = 0xe, .par.swizzle.in = 
{0, 2, 3, 1}) \
-    MACRO(__VA_ARGS__, u8_permute_yzw_xyz                      , .type = 
SWS_PIXEL_U8 , .uop = SWS_UOP_PERMUTE         , .mask = 0xe, .par.swizzle.in = 
{3, 0, 1, 2}) \
-    MACRO(__VA_ARGS__, u8_permute_yzw_zxy                      , .type = 
SWS_PIXEL_U8 , .uop = SWS_UOP_PERMUTE         , .mask = 0xe, .par.swizzle.in = 
{3, 2, 0, 1}) \
-    MACRO(__VA_ARGS__, u8_permute_yzw_zyx                      , .type = 
SWS_PIXEL_U8 , .uop = SWS_UOP_PERMUTE         , .mask = 0xe, .par.swizzle.in = 
{3, 2, 1, 0}) \
-    MACRO(__VA_ARGS__, u8_permute_xyzw_yzwx                    , .type = 
SWS_PIXEL_U8 , .uop = SWS_UOP_PERMUTE         , .mask = 0xf, .par.swizzle.in = 
{1, 2, 3, 0}) \
-    MACRO(__VA_ARGS__, u8_permute_xyzw_wxyz                    , .type = 
SWS_PIXEL_U8 , .uop = SWS_UOP_PERMUTE         , .mask = 0xf, .par.swizzle.in = 
{3, 0, 1, 2}) \
-    MACRO(__VA_ARGS__, u8_permute_xyzw_wzxy                    , .type = 
SWS_PIXEL_U8 , .uop = SWS_UOP_PERMUTE         , .mask = 0xf, .par.swizzle.in = 
{3, 2, 0, 1}) \
-    MACRO(__VA_ARGS__, u8_permute_xyzw_wzyx                    , .type = 
SWS_PIXEL_U8 , .uop = SWS_UOP_PERMUTE         , .mask = 0xf, .par.swizzle.in = 
{3, 2, 1, 0})
+    MACRO(__VA_ARGS__, u8_permute_xzyw                         , .type = 
SWS_PIXEL_U8 , .uop = SWS_UOP_PERMUTE         , .mask = 0x0, .par.swizzle.in = 
{0, 2, 1, 3}) \
+    MACRO(__VA_ARGS__, u8_permute_xzwy                         , .type = 
SWS_PIXEL_U8 , .uop = SWS_UOP_PERMUTE         , .mask = 0x0, .par.swizzle.in = 
{0, 2, 3, 1}) \
+    MACRO(__VA_ARGS__, u8_permute_xwzy                         , .type = 
SWS_PIXEL_U8 , .uop = SWS_UOP_PERMUTE         , .mask = 0x0, .par.swizzle.in = 
{0, 3, 2, 1}) \
+    MACRO(__VA_ARGS__, u8_permute_yxzw                         , .type = 
SWS_PIXEL_U8 , .uop = SWS_UOP_PERMUTE         , .mask = 0x0, .par.swizzle.in = 
{1, 0, 2, 3}) \
+    MACRO(__VA_ARGS__, u8_permute_yzxw                         , .type = 
SWS_PIXEL_U8 , .uop = SWS_UOP_PERMUTE         , .mask = 0x0, .par.swizzle.in = 
{1, 2, 0, 3}) \
+    MACRO(__VA_ARGS__, u8_permute_yzwx                         , .type = 
SWS_PIXEL_U8 , .uop = SWS_UOP_PERMUTE         , .mask = 0x0, .par.swizzle.in = 
{1, 2, 3, 0}) \
+    MACRO(__VA_ARGS__, u8_permute_ywzx                         , .type = 
SWS_PIXEL_U8 , .uop = SWS_UOP_PERMUTE         , .mask = 0x0, .par.swizzle.in = 
{1, 3, 2, 0}) \
+    MACRO(__VA_ARGS__, u8_permute_zxyw                         , .type = 
SWS_PIXEL_U8 , .uop = SWS_UOP_PERMUTE         , .mask = 0x0, .par.swizzle.in = 
{2, 0, 1, 3}) \
+    MACRO(__VA_ARGS__, u8_permute_zyxw                         , .type = 
SWS_PIXEL_U8 , .uop = SWS_UOP_PERMUTE         , .mask = 0x0, .par.swizzle.in = 
{2, 1, 0, 3}) \
+    MACRO(__VA_ARGS__, u8_permute_zywx                         , .type = 
SWS_PIXEL_U8 , .uop = SWS_UOP_PERMUTE         , .mask = 0x0, .par.swizzle.in = 
{2, 1, 3, 0}) \
+    MACRO(__VA_ARGS__, u8_permute_zwxy                         , .type = 
SWS_PIXEL_U8 , .uop = SWS_UOP_PERMUTE         , .mask = 0x0, .par.swizzle.in = 
{2, 3, 0, 1}) \
+    MACRO(__VA_ARGS__, u8_permute_zwyx                         , .type = 
SWS_PIXEL_U8 , .uop = SWS_UOP_PERMUTE         , .mask = 0x0, .par.swizzle.in = 
{2, 3, 1, 0}) \
+    MACRO(__VA_ARGS__, u8_permute_wxyz                         , .type = 
SWS_PIXEL_U8 , .uop = SWS_UOP_PERMUTE         , .mask = 0x0, .par.swizzle.in = 
{3, 0, 1, 2}) \
+    MACRO(__VA_ARGS__, u8_permute_wxzy                         , .type = 
SWS_PIXEL_U8 , .uop = SWS_UOP_PERMUTE         , .mask = 0x0, .par.swizzle.in = 
{3, 0, 2, 1}) \
+    MACRO(__VA_ARGS__, u8_permute_wyxz                         , .type = 
SWS_PIXEL_U8 , .uop = SWS_UOP_PERMUTE         , .mask = 0x0, .par.swizzle.in = 
{3, 1, 0, 2}) \
+    MACRO(__VA_ARGS__, u8_permute_wyzx                         , .type = 
SWS_PIXEL_U8 , .uop = SWS_UOP_PERMUTE         , .mask = 0x0, .par.swizzle.in = 
{3, 1, 2, 0}) \
+    MACRO(__VA_ARGS__, u8_permute_wzxy                         , .type = 
SWS_PIXEL_U8 , .uop = SWS_UOP_PERMUTE         , .mask = 0x0, .par.swizzle.in = 
{3, 2, 0, 1}) \
+    MACRO(__VA_ARGS__, u8_permute_wzyx                         , .type = 
SWS_PIXEL_U8 , .uop = SWS_UOP_PERMUTE         , .mask = 0x0, .par.swizzle.in = 
{3, 2, 1, 0})
 #define SWS_FOR_U8_COPY(MACRO, ...) \
     MACRO(__VA_ARGS__, u8_copy_yz_xx                           , SWS_PIXEL_U8 
, SWS_UOP_COPY            , 0x6, 0, 0, 0, 3) \
     MACRO(__VA_ARGS__, u8_copy_yzw_xxx                         , SWS_PIXEL_U8 
, SWS_UOP_COPY            , 0xe, 0, 0, 0, 0) \
@@ -440,51 +416,35 @@
 #define SWS_FOR_U16_WRITE_BIT(MACRO, ...)
 #define SWS_FOR_STRUCT_U16_WRITE_BIT(MACRO, ...)
 #define SWS_FOR_U16_PERMUTE(MACRO, ...) \
-    MACRO(__VA_ARGS__, u16_permute_x_y                         , 
SWS_PIXEL_U16, SWS_UOP_PERMUTE         , 0x1, 1, 0, 2, 3) \
-    MACRO(__VA_ARGS__, u16_permute_x_z                         , 
SWS_PIXEL_U16, SWS_UOP_PERMUTE         , 0x1, 2, 1, 0, 3) \
-    MACRO(__VA_ARGS__, u16_permute_x_w                         , 
SWS_PIXEL_U16, SWS_UOP_PERMUTE         , 0x1, 3, 1, 2, 0) \
-    MACRO(__VA_ARGS__, u16_permute_y_w                         , 
SWS_PIXEL_U16, SWS_UOP_PERMUTE         , 0x2, 0, 3, 2, 1) \
-    MACRO(__VA_ARGS__, u16_permute_y_x                         , 
SWS_PIXEL_U16, SWS_UOP_PERMUTE         , 0x2, 1, 0, 2, 3) \
-    MACRO(__VA_ARGS__, u16_permute_xy_yx                       , 
SWS_PIXEL_U16, SWS_UOP_PERMUTE         , 0x3, 1, 0, 2, 3) \
-    MACRO(__VA_ARGS__, u16_permute_xy_wx                       , 
SWS_PIXEL_U16, SWS_UOP_PERMUTE         , 0x3, 3, 0, 2, 1) \
-    MACRO(__VA_ARGS__, u16_permute_xz_zx                       , 
SWS_PIXEL_U16, SWS_UOP_PERMUTE         , 0x5, 2, 1, 0, 3) \
-    MACRO(__VA_ARGS__, u16_permute_xz_zw                       , 
SWS_PIXEL_U16, SWS_UOP_PERMUTE         , 0x5, 2, 1, 3, 0) \
-    MACRO(__VA_ARGS__, u16_permute_xyz_yzx                     , 
SWS_PIXEL_U16, SWS_UOP_PERMUTE         , 0x7, 1, 2, 0, 3) \
-    MACRO(__VA_ARGS__, u16_permute_xyz_yzw                     , 
SWS_PIXEL_U16, SWS_UOP_PERMUTE         , 0x7, 1, 2, 3, 0) \
-    MACRO(__VA_ARGS__, u16_permute_xyz_zxy                     , 
SWS_PIXEL_U16, SWS_UOP_PERMUTE         , 0x7, 2, 0, 1, 3) \
-    MACRO(__VA_ARGS__, u16_permute_xyz_zwy                     , 
SWS_PIXEL_U16, SWS_UOP_PERMUTE         , 0x7, 2, 3, 1, 0) \
-    MACRO(__VA_ARGS__, u16_permute_xyz_wzy                     , 
SWS_PIXEL_U16, SWS_UOP_PERMUTE         , 0x7, 3, 2, 1, 0) \
-    MACRO(__VA_ARGS__, u16_permute_w_x                         , 
SWS_PIXEL_U16, SWS_UOP_PERMUTE         , 0x8, 3, 1, 2, 0) \
-    MACRO(__VA_ARGS__, u16_permute_zw_xz                       , 
SWS_PIXEL_U16, SWS_UOP_PERMUTE         , 0xc, 3, 1, 0, 2) \
-    MACRO(__VA_ARGS__, u16_permute_yzw_zwy                     , 
SWS_PIXEL_U16, SWS_UOP_PERMUTE         , 0xe, 0, 2, 3, 1) \
-    MACRO(__VA_ARGS__, u16_permute_yzw_xyz                     , 
SWS_PIXEL_U16, SWS_UOP_PERMUTE         , 0xe, 3, 0, 1, 2) \
-    MACRO(__VA_ARGS__, u16_permute_yzw_zyx                     , 
SWS_PIXEL_U16, SWS_UOP_PERMUTE         , 0xe, 3, 2, 1, 0) \
-    MACRO(__VA_ARGS__, u16_permute_xyzw_yzwx                   , 
SWS_PIXEL_U16, SWS_UOP_PERMUTE         , 0xf, 1, 2, 3, 0) \
-    MACRO(__VA_ARGS__, u16_permute_xyzw_wxyz                   , 
SWS_PIXEL_U16, SWS_UOP_PERMUTE         , 0xf, 3, 0, 1, 2) \
-    MACRO(__VA_ARGS__, u16_permute_xyzw_wzyx                   , 
SWS_PIXEL_U16, SWS_UOP_PERMUTE         , 0xf, 3, 2, 1, 0)
+    MACRO(__VA_ARGS__, u16_permute_xzwy                        , 
SWS_PIXEL_U16, SWS_UOP_PERMUTE         , 0x0, 0, 2, 3, 1) \
+    MACRO(__VA_ARGS__, u16_permute_xwzy                        , 
SWS_PIXEL_U16, SWS_UOP_PERMUTE         , 0x0, 0, 3, 2, 1) \
+    MACRO(__VA_ARGS__, u16_permute_yxzw                        , 
SWS_PIXEL_U16, SWS_UOP_PERMUTE         , 0x0, 1, 0, 2, 3) \
+    MACRO(__VA_ARGS__, u16_permute_yzxw                        , 
SWS_PIXEL_U16, SWS_UOP_PERMUTE         , 0x0, 1, 2, 0, 3) \
+    MACRO(__VA_ARGS__, u16_permute_yzwx                        , 
SWS_PIXEL_U16, SWS_UOP_PERMUTE         , 0x0, 1, 2, 3, 0) \
+    MACRO(__VA_ARGS__, u16_permute_zxyw                        , 
SWS_PIXEL_U16, SWS_UOP_PERMUTE         , 0x0, 2, 0, 1, 3) \
+    MACRO(__VA_ARGS__, u16_permute_zyxw                        , 
SWS_PIXEL_U16, SWS_UOP_PERMUTE         , 0x0, 2, 1, 0, 3) \
+    MACRO(__VA_ARGS__, u16_permute_zywx                        , 
SWS_PIXEL_U16, SWS_UOP_PERMUTE         , 0x0, 2, 1, 3, 0) \
+    MACRO(__VA_ARGS__, u16_permute_zwyx                        , 
SWS_PIXEL_U16, SWS_UOP_PERMUTE         , 0x0, 2, 3, 1, 0) \
+    MACRO(__VA_ARGS__, u16_permute_wxyz                        , 
SWS_PIXEL_U16, SWS_UOP_PERMUTE         , 0x0, 3, 0, 1, 2) \
+    MACRO(__VA_ARGS__, u16_permute_wxzy                        , 
SWS_PIXEL_U16, SWS_UOP_PERMUTE         , 0x0, 3, 0, 2, 1) \
+    MACRO(__VA_ARGS__, u16_permute_wyxz                        , 
SWS_PIXEL_U16, SWS_UOP_PERMUTE         , 0x0, 3, 1, 0, 2) \
+    MACRO(__VA_ARGS__, u16_permute_wyzx                        , 
SWS_PIXEL_U16, SWS_UOP_PERMUTE         , 0x0, 3, 1, 2, 0) \
+    MACRO(__VA_ARGS__, u16_permute_wzyx                        , 
SWS_PIXEL_U16, SWS_UOP_PERMUTE         , 0x0, 3, 2, 1, 0)
 #define SWS_FOR_STRUCT_U16_PERMUTE(MACRO, ...) \
-    MACRO(__VA_ARGS__, u16_permute_x_y                         , .type = 
SWS_PIXEL_U16, .uop = SWS_UOP_PERMUTE         , .mask = 0x1, .par.swizzle.in = 
{1, 0, 2, 3}) \
-    MACRO(__VA_ARGS__, u16_permute_x_z                         , .type = 
SWS_PIXEL_U16, .uop = SWS_UOP_PERMUTE         , .mask = 0x1, .par.swizzle.in = 
{2, 1, 0, 3}) \
-    MACRO(__VA_ARGS__, u16_permute_x_w                         , .type = 
SWS_PIXEL_U16, .uop = SWS_UOP_PERMUTE         , .mask = 0x1, .par.swizzle.in = 
{3, 1, 2, 0}) \
-    MACRO(__VA_ARGS__, u16_permute_y_w                         , .type = 
SWS_PIXEL_U16, .uop = SWS_UOP_PERMUTE         , .mask = 0x2, .par.swizzle.in = 
{0, 3, 2, 1}) \
-    MACRO(__VA_ARGS__, u16_permute_y_x                         , .type = 
SWS_PIXEL_U16, .uop = SWS_UOP_PERMUTE         , .mask = 0x2, .par.swizzle.in = 
{1, 0, 2, 3}) \
-    MACRO(__VA_ARGS__, u16_permute_xy_yx                       , .type = 
SWS_PIXEL_U16, .uop = SWS_UOP_PERMUTE         , .mask = 0x3, .par.swizzle.in = 
{1, 0, 2, 3}) \
-    MACRO(__VA_ARGS__, u16_permute_xy_wx                       , .type = 
SWS_PIXEL_U16, .uop = SWS_UOP_PERMUTE         , .mask = 0x3, .par.swizzle.in = 
{3, 0, 2, 1}) \
-    MACRO(__VA_ARGS__, u16_permute_xz_zx                       , .type = 
SWS_PIXEL_U16, .uop = SWS_UOP_PERMUTE         , .mask = 0x5, .par.swizzle.in = 
{2, 1, 0, 3}) \
-    MACRO(__VA_ARGS__, u16_permute_xz_zw                       , .type = 
SWS_PIXEL_U16, .uop = SWS_UOP_PERMUTE         , .mask = 0x5, .par.swizzle.in = 
{2, 1, 3, 0}) \
-    MACRO(__VA_ARGS__, u16_permute_xyz_yzx                     , .type = 
SWS_PIXEL_U16, .uop = SWS_UOP_PERMUTE         , .mask = 0x7, .par.swizzle.in = 
{1, 2, 0, 3}) \
-    MACRO(__VA_ARGS__, u16_permute_xyz_yzw                     , .type = 
SWS_PIXEL_U16, .uop = SWS_UOP_PERMUTE         , .mask = 0x7, .par.swizzle.in = 
{1, 2, 3, 0}) \
-    MACRO(__VA_ARGS__, u16_permute_xyz_zxy                     , .type = 
SWS_PIXEL_U16, .uop = SWS_UOP_PERMUTE         , .mask = 0x7, .par.swizzle.in = 
{2, 0, 1, 3}) \
-    MACRO(__VA_ARGS__, u16_permute_xyz_zwy                     , .type = 
SWS_PIXEL_U16, .uop = SWS_UOP_PERMUTE         , .mask = 0x7, .par.swizzle.in = 
{2, 3, 1, 0}) \
-    MACRO(__VA_ARGS__, u16_permute_xyz_wzy                     , .type = 
SWS_PIXEL_U16, .uop = SWS_UOP_PERMUTE         , .mask = 0x7, .par.swizzle.in = 
{3, 2, 1, 0}) \
-    MACRO(__VA_ARGS__, u16_permute_w_x                         , .type = 
SWS_PIXEL_U16, .uop = SWS_UOP_PERMUTE         , .mask = 0x8, .par.swizzle.in = 
{3, 1, 2, 0}) \
-    MACRO(__VA_ARGS__, u16_permute_zw_xz                       , .type = 
SWS_PIXEL_U16, .uop = SWS_UOP_PERMUTE         , .mask = 0xc, .par.swizzle.in = 
{3, 1, 0, 2}) \
-    MACRO(__VA_ARGS__, u16_permute_yzw_zwy                     , .type = 
SWS_PIXEL_U16, .uop = SWS_UOP_PERMUTE         , .mask = 0xe, .par.swizzle.in = 
{0, 2, 3, 1}) \
-    MACRO(__VA_ARGS__, u16_permute_yzw_xyz                     , .type = 
SWS_PIXEL_U16, .uop = SWS_UOP_PERMUTE         , .mask = 0xe, .par.swizzle.in = 
{3, 0, 1, 2}) \
-    MACRO(__VA_ARGS__, u16_permute_yzw_zyx                     , .type = 
SWS_PIXEL_U16, .uop = SWS_UOP_PERMUTE         , .mask = 0xe, .par.swizzle.in = 
{3, 2, 1, 0}) \
-    MACRO(__VA_ARGS__, u16_permute_xyzw_yzwx                   , .type = 
SWS_PIXEL_U16, .uop = SWS_UOP_PERMUTE         , .mask = 0xf, .par.swizzle.in = 
{1, 2, 3, 0}) \
-    MACRO(__VA_ARGS__, u16_permute_xyzw_wxyz                   , .type = 
SWS_PIXEL_U16, .uop = SWS_UOP_PERMUTE         , .mask = 0xf, .par.swizzle.in = 
{3, 0, 1, 2}) \
-    MACRO(__VA_ARGS__, u16_permute_xyzw_wzyx                   , .type = 
SWS_PIXEL_U16, .uop = SWS_UOP_PERMUTE         , .mask = 0xf, .par.swizzle.in = 
{3, 2, 1, 0})
+    MACRO(__VA_ARGS__, u16_permute_xzwy                        , .type = 
SWS_PIXEL_U16, .uop = SWS_UOP_PERMUTE         , .mask = 0x0, .par.swizzle.in = 
{0, 2, 3, 1}) \
+    MACRO(__VA_ARGS__, u16_permute_xwzy                        , .type = 
SWS_PIXEL_U16, .uop = SWS_UOP_PERMUTE         , .mask = 0x0, .par.swizzle.in = 
{0, 3, 2, 1}) \
+    MACRO(__VA_ARGS__, u16_permute_yxzw                        , .type = 
SWS_PIXEL_U16, .uop = SWS_UOP_PERMUTE         , .mask = 0x0, .par.swizzle.in = 
{1, 0, 2, 3}) \
+    MACRO(__VA_ARGS__, u16_permute_yzxw                        , .type = 
SWS_PIXEL_U16, .uop = SWS_UOP_PERMUTE         , .mask = 0x0, .par.swizzle.in = 
{1, 2, 0, 3}) \
+    MACRO(__VA_ARGS__, u16_permute_yzwx                        , .type = 
SWS_PIXEL_U16, .uop = SWS_UOP_PERMUTE         , .mask = 0x0, .par.swizzle.in = 
{1, 2, 3, 0}) \
+    MACRO(__VA_ARGS__, u16_permute_zxyw                        , .type = 
SWS_PIXEL_U16, .uop = SWS_UOP_PERMUTE         , .mask = 0x0, .par.swizzle.in = 
{2, 0, 1, 3}) \
+    MACRO(__VA_ARGS__, u16_permute_zyxw                        , .type = 
SWS_PIXEL_U16, .uop = SWS_UOP_PERMUTE         , .mask = 0x0, .par.swizzle.in = 
{2, 1, 0, 3}) \
+    MACRO(__VA_ARGS__, u16_permute_zywx                        , .type = 
SWS_PIXEL_U16, .uop = SWS_UOP_PERMUTE         , .mask = 0x0, .par.swizzle.in = 
{2, 1, 3, 0}) \
+    MACRO(__VA_ARGS__, u16_permute_zwyx                        , .type = 
SWS_PIXEL_U16, .uop = SWS_UOP_PERMUTE         , .mask = 0x0, .par.swizzle.in = 
{2, 3, 1, 0}) \
+    MACRO(__VA_ARGS__, u16_permute_wxyz                        , .type = 
SWS_PIXEL_U16, .uop = SWS_UOP_PERMUTE         , .mask = 0x0, .par.swizzle.in = 
{3, 0, 1, 2}) \
+    MACRO(__VA_ARGS__, u16_permute_wxzy                        , .type = 
SWS_PIXEL_U16, .uop = SWS_UOP_PERMUTE         , .mask = 0x0, .par.swizzle.in = 
{3, 0, 2, 1}) \
+    MACRO(__VA_ARGS__, u16_permute_wyxz                        , .type = 
SWS_PIXEL_U16, .uop = SWS_UOP_PERMUTE         , .mask = 0x0, .par.swizzle.in = 
{3, 1, 0, 2}) \
+    MACRO(__VA_ARGS__, u16_permute_wyzx                        , .type = 
SWS_PIXEL_U16, .uop = SWS_UOP_PERMUTE         , .mask = 0x0, .par.swizzle.in = 
{3, 1, 2, 0}) \
+    MACRO(__VA_ARGS__, u16_permute_wzyx                        , .type = 
SWS_PIXEL_U16, .uop = SWS_UOP_PERMUTE         , .mask = 0x0, .par.swizzle.in = 
{3, 2, 1, 0})
 #define SWS_FOR_U16_COPY(MACRO, ...) \
     MACRO(__VA_ARGS__, u16_copy_yz_xx                          , 
SWS_PIXEL_U16, SWS_UOP_COPY            , 0x6, 0, 0, 0, 3) \
     MACRO(__VA_ARGS__, u16_copy_yzw_xxy                        , 
SWS_PIXEL_U16, SWS_UOP_COPY            , 0xe, 0, 0, 0, 1)
@@ -744,65 +704,39 @@
 #define SWS_FOR_U32_WRITE_BIT(MACRO, ...)
 #define SWS_FOR_STRUCT_U32_WRITE_BIT(MACRO, ...)
 #define SWS_FOR_U32_PERMUTE(MACRO, ...) \
-    MACRO(__VA_ARGS__, u32_permute_x_y                         , 
SWS_PIXEL_U32, SWS_UOP_PERMUTE         , 0x1, 1, 0, 2, 3) \
-    MACRO(__VA_ARGS__, u32_permute_x_z                         , 
SWS_PIXEL_U32, SWS_UOP_PERMUTE         , 0x1, 2, 1, 0, 3) \
-    MACRO(__VA_ARGS__, u32_permute_x_w                         , 
SWS_PIXEL_U32, SWS_UOP_PERMUTE         , 0x1, 3, 1, 2, 0) \
-    MACRO(__VA_ARGS__, u32_permute_y_w                         , 
SWS_PIXEL_U32, SWS_UOP_PERMUTE         , 0x2, 0, 3, 2, 1) \
-    MACRO(__VA_ARGS__, u32_permute_y_x                         , 
SWS_PIXEL_U32, SWS_UOP_PERMUTE         , 0x2, 1, 0, 2, 3) \
-    MACRO(__VA_ARGS__, u32_permute_xy_yx                       , 
SWS_PIXEL_U32, SWS_UOP_PERMUTE         , 0x3, 1, 0, 2, 3) \
-    MACRO(__VA_ARGS__, u32_permute_z_x                         , 
SWS_PIXEL_U32, SWS_UOP_PERMUTE         , 0x4, 1, 2, 0, 3) \
-    MACRO(__VA_ARGS__, u32_permute_xz_zx                       , 
SWS_PIXEL_U32, SWS_UOP_PERMUTE         , 0x5, 2, 1, 0, 3) \
-    MACRO(__VA_ARGS__, u32_permute_xz_zw                       , 
SWS_PIXEL_U32, SWS_UOP_PERMUTE         , 0x5, 2, 1, 3, 0) \
-    MACRO(__VA_ARGS__, u32_permute_xz_wx                       , 
SWS_PIXEL_U32, SWS_UOP_PERMUTE         , 0x5, 3, 1, 0, 2) \
-    MACRO(__VA_ARGS__, u32_permute_yz_zy                       , 
SWS_PIXEL_U32, SWS_UOP_PERMUTE         , 0x6, 0, 2, 1, 3) \
-    MACRO(__VA_ARGS__, u32_permute_xyz_yzx                     , 
SWS_PIXEL_U32, SWS_UOP_PERMUTE         , 0x7, 1, 2, 0, 3) \
-    MACRO(__VA_ARGS__, u32_permute_xyz_yzw                     , 
SWS_PIXEL_U32, SWS_UOP_PERMUTE         , 0x7, 1, 2, 3, 0) \
-    MACRO(__VA_ARGS__, u32_permute_xyz_zxy                     , 
SWS_PIXEL_U32, SWS_UOP_PERMUTE         , 0x7, 2, 0, 1, 3) \
-    MACRO(__VA_ARGS__, u32_permute_xyz_zwy                     , 
SWS_PIXEL_U32, SWS_UOP_PERMUTE         , 0x7, 2, 3, 1, 0) \
-    MACRO(__VA_ARGS__, u32_permute_xyz_wzy                     , 
SWS_PIXEL_U32, SWS_UOP_PERMUTE         , 0x7, 3, 2, 1, 0) \
-    MACRO(__VA_ARGS__, u32_permute_w_y                         , 
SWS_PIXEL_U32, SWS_UOP_PERMUTE         , 0x8, 0, 2, 3, 1) \
-    MACRO(__VA_ARGS__, u32_permute_w_x                         , 
SWS_PIXEL_U32, SWS_UOP_PERMUTE         , 0x8, 3, 1, 2, 0) \
-    MACRO(__VA_ARGS__, u32_permute_xw_yx                       , 
SWS_PIXEL_U32, SWS_UOP_PERMUTE         , 0x9, 1, 2, 3, 0) \
-    MACRO(__VA_ARGS__, u32_permute_yw_wy                       , 
SWS_PIXEL_U32, SWS_UOP_PERMUTE         , 0xa, 0, 3, 2, 1) \
-    MACRO(__VA_ARGS__, u32_permute_yw_xy                       , 
SWS_PIXEL_U32, SWS_UOP_PERMUTE         , 0xa, 3, 0, 2, 1) \
-    MACRO(__VA_ARGS__, u32_permute_zw_xz                       , 
SWS_PIXEL_U32, SWS_UOP_PERMUTE         , 0xc, 3, 1, 0, 2) \
-    MACRO(__VA_ARGS__, u32_permute_yzw_xyz                     , 
SWS_PIXEL_U32, SWS_UOP_PERMUTE         , 0xe, 3, 0, 1, 2) \
-    MACRO(__VA_ARGS__, u32_permute_yzw_zxy                     , 
SWS_PIXEL_U32, SWS_UOP_PERMUTE         , 0xe, 3, 2, 0, 1) \
-    MACRO(__VA_ARGS__, u32_permute_yzw_zyx                     , 
SWS_PIXEL_U32, SWS_UOP_PERMUTE         , 0xe, 3, 2, 1, 0) \
-    MACRO(__VA_ARGS__, u32_permute_xyzw_yzwx                   , 
SWS_PIXEL_U32, SWS_UOP_PERMUTE         , 0xf, 1, 2, 3, 0) \
-    MACRO(__VA_ARGS__, u32_permute_xyzw_wxyz                   , 
SWS_PIXEL_U32, SWS_UOP_PERMUTE         , 0xf, 3, 0, 1, 2) \
-    MACRO(__VA_ARGS__, u32_permute_xyzw_wzxy                   , 
SWS_PIXEL_U32, SWS_UOP_PERMUTE         , 0xf, 3, 2, 0, 1) \
-    MACRO(__VA_ARGS__, u32_permute_xyzw_wzyx                   , 
SWS_PIXEL_U32, SWS_UOP_PERMUTE         , 0xf, 3, 2, 1, 0)
+    MACRO(__VA_ARGS__, u32_permute_xzyw                        , 
SWS_PIXEL_U32, SWS_UOP_PERMUTE         , 0x0, 0, 2, 1, 3) \
+    MACRO(__VA_ARGS__, u32_permute_xzwy                        , 
SWS_PIXEL_U32, SWS_UOP_PERMUTE         , 0x0, 0, 2, 3, 1) \
+    MACRO(__VA_ARGS__, u32_permute_xwzy                        , 
SWS_PIXEL_U32, SWS_UOP_PERMUTE         , 0x0, 0, 3, 2, 1) \
+    MACRO(__VA_ARGS__, u32_permute_yxzw                        , 
SWS_PIXEL_U32, SWS_UOP_PERMUTE         , 0x0, 1, 0, 2, 3) \
+    MACRO(__VA_ARGS__, u32_permute_yzxw                        , 
SWS_PIXEL_U32, SWS_UOP_PERMUTE         , 0x0, 1, 2, 0, 3) \
+    MACRO(__VA_ARGS__, u32_permute_yzwx                        , 
SWS_PIXEL_U32, SWS_UOP_PERMUTE         , 0x0, 1, 2, 3, 0) \
+    MACRO(__VA_ARGS__, u32_permute_zxyw                        , 
SWS_PIXEL_U32, SWS_UOP_PERMUTE         , 0x0, 2, 0, 1, 3) \
+    MACRO(__VA_ARGS__, u32_permute_zyxw                        , 
SWS_PIXEL_U32, SWS_UOP_PERMUTE         , 0x0, 2, 1, 0, 3) \
+    MACRO(__VA_ARGS__, u32_permute_zywx                        , 
SWS_PIXEL_U32, SWS_UOP_PERMUTE         , 0x0, 2, 1, 3, 0) \
+    MACRO(__VA_ARGS__, u32_permute_zwyx                        , 
SWS_PIXEL_U32, SWS_UOP_PERMUTE         , 0x0, 2, 3, 1, 0) \
+    MACRO(__VA_ARGS__, u32_permute_wxyz                        , 
SWS_PIXEL_U32, SWS_UOP_PERMUTE         , 0x0, 3, 0, 1, 2) \
+    MACRO(__VA_ARGS__, u32_permute_wxzy                        , 
SWS_PIXEL_U32, SWS_UOP_PERMUTE         , 0x0, 3, 0, 2, 1) \
+    MACRO(__VA_ARGS__, u32_permute_wyxz                        , 
SWS_PIXEL_U32, SWS_UOP_PERMUTE         , 0x0, 3, 1, 0, 2) \
+    MACRO(__VA_ARGS__, u32_permute_wyzx                        , 
SWS_PIXEL_U32, SWS_UOP_PERMUTE         , 0x0, 3, 1, 2, 0) \
+    MACRO(__VA_ARGS__, u32_permute_wzxy                        , 
SWS_PIXEL_U32, SWS_UOP_PERMUTE         , 0x0, 3, 2, 0, 1) \
+    MACRO(__VA_ARGS__, u32_permute_wzyx                        , 
SWS_PIXEL_U32, SWS_UOP_PERMUTE         , 0x0, 3, 2, 1, 0)
 #define SWS_FOR_STRUCT_U32_PERMUTE(MACRO, ...) \
-    MACRO(__VA_ARGS__, u32_permute_x_y                         , .type = 
SWS_PIXEL_U32, .uop = SWS_UOP_PERMUTE         , .mask = 0x1, .par.swizzle.in = 
{1, 0, 2, 3}) \
-    MACRO(__VA_ARGS__, u32_permute_x_z                         , .type = 
SWS_PIXEL_U32, .uop = SWS_UOP_PERMUTE         , .mask = 0x1, .par.swizzle.in = 
{2, 1, 0, 3}) \
-    MACRO(__VA_ARGS__, u32_permute_x_w                         , .type = 
SWS_PIXEL_U32, .uop = SWS_UOP_PERMUTE         , .mask = 0x1, .par.swizzle.in = 
{3, 1, 2, 0}) \
-    MACRO(__VA_ARGS__, u32_permute_y_w                         , .type = 
SWS_PIXEL_U32, .uop = SWS_UOP_PERMUTE         , .mask = 0x2, .par.swizzle.in = 
{0, 3, 2, 1}) \
-    MACRO(__VA_ARGS__, u32_permute_y_x                         , .type = 
SWS_PIXEL_U32, .uop = SWS_UOP_PERMUTE         , .mask = 0x2, .par.swizzle.in = 
{1, 0, 2, 3}) \
-    MACRO(__VA_ARGS__, u32_permute_xy_yx                       , .type = 
SWS_PIXEL_U32, .uop = SWS_UOP_PERMUTE         , .mask = 0x3, .par.swizzle.in = 
{1, 0, 2, 3}) \
-    MACRO(__VA_ARGS__, u32_permute_z_x                         , .type = 
SWS_PIXEL_U32, .uop = SWS_UOP_PERMUTE         , .mask = 0x4, .par.swizzle.in = 
{1, 2, 0, 3}) \
-    MACRO(__VA_ARGS__, u32_permute_xz_zx                       , .type = 
SWS_PIXEL_U32, .uop = SWS_UOP_PERMUTE         , .mask = 0x5, .par.swizzle.in = 
{2, 1, 0, 3}) \
-    MACRO(__VA_ARGS__, u32_permute_xz_zw                       , .type = 
SWS_PIXEL_U32, .uop = SWS_UOP_PERMUTE         , .mask = 0x5, .par.swizzle.in = 
{2, 1, 3, 0}) \
-    MACRO(__VA_ARGS__, u32_permute_xz_wx                       , .type = 
SWS_PIXEL_U32, .uop = SWS_UOP_PERMUTE         , .mask = 0x5, .par.swizzle.in = 
{3, 1, 0, 2}) \
-    MACRO(__VA_ARGS__, u32_permute_yz_zy                       , .type = 
SWS_PIXEL_U32, .uop = SWS_UOP_PERMUTE         , .mask = 0x6, .par.swizzle.in = 
{0, 2, 1, 3}) \
-    MACRO(__VA_ARGS__, u32_permute_xyz_yzx                     , .type = 
SWS_PIXEL_U32, .uop = SWS_UOP_PERMUTE         , .mask = 0x7, .par.swizzle.in = 
{1, 2, 0, 3}) \
-    MACRO(__VA_ARGS__, u32_permute_xyz_yzw                     , .type = 
SWS_PIXEL_U32, .uop = SWS_UOP_PERMUTE         , .mask = 0x7, .par.swizzle.in = 
{1, 2, 3, 0}) \
-    MACRO(__VA_ARGS__, u32_permute_xyz_zxy                     , .type = 
SWS_PIXEL_U32, .uop = SWS_UOP_PERMUTE         , .mask = 0x7, .par.swizzle.in = 
{2, 0, 1, 3}) \
-    MACRO(__VA_ARGS__, u32_permute_xyz_zwy                     , .type = 
SWS_PIXEL_U32, .uop = SWS_UOP_PERMUTE         , .mask = 0x7, .par.swizzle.in = 
{2, 3, 1, 0}) \
-    MACRO(__VA_ARGS__, u32_permute_xyz_wzy                     , .type = 
SWS_PIXEL_U32, .uop = SWS_UOP_PERMUTE         , .mask = 0x7, .par.swizzle.in = 
{3, 2, 1, 0}) \
-    MACRO(__VA_ARGS__, u32_permute_w_y                         , .type = 
SWS_PIXEL_U32, .uop = SWS_UOP_PERMUTE         , .mask = 0x8, .par.swizzle.in = 
{0, 2, 3, 1}) \
-    MACRO(__VA_ARGS__, u32_permute_w_x                         , .type = 
SWS_PIXEL_U32, .uop = SWS_UOP_PERMUTE         , .mask = 0x8, .par.swizzle.in = 
{3, 1, 2, 0}) \
-    MACRO(__VA_ARGS__, u32_permute_xw_yx                       , .type = 
SWS_PIXEL_U32, .uop = SWS_UOP_PERMUTE         , .mask = 0x9, .par.swizzle.in = 
{1, 2, 3, 0}) \
-    MACRO(__VA_ARGS__, u32_permute_yw_wy                       , .type = 
SWS_PIXEL_U32, .uop = SWS_UOP_PERMUTE         , .mask = 0xa, .par.swizzle.in = 
{0, 3, 2, 1}) \
-    MACRO(__VA_ARGS__, u32_permute_yw_xy                       , .type = 
SWS_PIXEL_U32, .uop = SWS_UOP_PERMUTE         , .mask = 0xa, .par.swizzle.in = 
{3, 0, 2, 1}) \
-    MACRO(__VA_ARGS__, u32_permute_zw_xz                       , .type = 
SWS_PIXEL_U32, .uop = SWS_UOP_PERMUTE         , .mask = 0xc, .par.swizzle.in = 
{3, 1, 0, 2}) \
-    MACRO(__VA_ARGS__, u32_permute_yzw_xyz                     , .type = 
SWS_PIXEL_U32, .uop = SWS_UOP_PERMUTE         , .mask = 0xe, .par.swizzle.in = 
{3, 0, 1, 2}) \
-    MACRO(__VA_ARGS__, u32_permute_yzw_zxy                     , .type = 
SWS_PIXEL_U32, .uop = SWS_UOP_PERMUTE         , .mask = 0xe, .par.swizzle.in = 
{3, 2, 0, 1}) \
-    MACRO(__VA_ARGS__, u32_permute_yzw_zyx                     , .type = 
SWS_PIXEL_U32, .uop = SWS_UOP_PERMUTE         , .mask = 0xe, .par.swizzle.in = 
{3, 2, 1, 0}) \
-    MACRO(__VA_ARGS__, u32_permute_xyzw_yzwx                   , .type = 
SWS_PIXEL_U32, .uop = SWS_UOP_PERMUTE         , .mask = 0xf, .par.swizzle.in = 
{1, 2, 3, 0}) \
-    MACRO(__VA_ARGS__, u32_permute_xyzw_wxyz                   , .type = 
SWS_PIXEL_U32, .uop = SWS_UOP_PERMUTE         , .mask = 0xf, .par.swizzle.in = 
{3, 0, 1, 2}) \
-    MACRO(__VA_ARGS__, u32_permute_xyzw_wzxy                   , .type = 
SWS_PIXEL_U32, .uop = SWS_UOP_PERMUTE         , .mask = 0xf, .par.swizzle.in = 
{3, 2, 0, 1}) \
-    MACRO(__VA_ARGS__, u32_permute_xyzw_wzyx                   , .type = 
SWS_PIXEL_U32, .uop = SWS_UOP_PERMUTE         , .mask = 0xf, .par.swizzle.in = 
{3, 2, 1, 0})
+    MACRO(__VA_ARGS__, u32_permute_xzyw                        , .type = 
SWS_PIXEL_U32, .uop = SWS_UOP_PERMUTE         , .mask = 0x0, .par.swizzle.in = 
{0, 2, 1, 3}) \
+    MACRO(__VA_ARGS__, u32_permute_xzwy                        , .type = 
SWS_PIXEL_U32, .uop = SWS_UOP_PERMUTE         , .mask = 0x0, .par.swizzle.in = 
{0, 2, 3, 1}) \
+    MACRO(__VA_ARGS__, u32_permute_xwzy                        , .type = 
SWS_PIXEL_U32, .uop = SWS_UOP_PERMUTE         , .mask = 0x0, .par.swizzle.in = 
{0, 3, 2, 1}) \
+    MACRO(__VA_ARGS__, u32_permute_yxzw                        , .type = 
SWS_PIXEL_U32, .uop = SWS_UOP_PERMUTE         , .mask = 0x0, .par.swizzle.in = 
{1, 0, 2, 3}) \
+    MACRO(__VA_ARGS__, u32_permute_yzxw                        , .type = 
SWS_PIXEL_U32, .uop = SWS_UOP_PERMUTE         , .mask = 0x0, .par.swizzle.in = 
{1, 2, 0, 3}) \
+    MACRO(__VA_ARGS__, u32_permute_yzwx                        , .type = 
SWS_PIXEL_U32, .uop = SWS_UOP_PERMUTE         , .mask = 0x0, .par.swizzle.in = 
{1, 2, 3, 0}) \
+    MACRO(__VA_ARGS__, u32_permute_zxyw                        , .type = 
SWS_PIXEL_U32, .uop = SWS_UOP_PERMUTE         , .mask = 0x0, .par.swizzle.in = 
{2, 0, 1, 3}) \
+    MACRO(__VA_ARGS__, u32_permute_zyxw                        , .type = 
SWS_PIXEL_U32, .uop = SWS_UOP_PERMUTE         , .mask = 0x0, .par.swizzle.in = 
{2, 1, 0, 3}) \
+    MACRO(__VA_ARGS__, u32_permute_zywx                        , .type = 
SWS_PIXEL_U32, .uop = SWS_UOP_PERMUTE         , .mask = 0x0, .par.swizzle.in = 
{2, 1, 3, 0}) \
+    MACRO(__VA_ARGS__, u32_permute_zwyx                        , .type = 
SWS_PIXEL_U32, .uop = SWS_UOP_PERMUTE         , .mask = 0x0, .par.swizzle.in = 
{2, 3, 1, 0}) \
+    MACRO(__VA_ARGS__, u32_permute_wxyz                        , .type = 
SWS_PIXEL_U32, .uop = SWS_UOP_PERMUTE         , .mask = 0x0, .par.swizzle.in = 
{3, 0, 1, 2}) \
+    MACRO(__VA_ARGS__, u32_permute_wxzy                        , .type = 
SWS_PIXEL_U32, .uop = SWS_UOP_PERMUTE         , .mask = 0x0, .par.swizzle.in = 
{3, 0, 2, 1}) \
+    MACRO(__VA_ARGS__, u32_permute_wyxz                        , .type = 
SWS_PIXEL_U32, .uop = SWS_UOP_PERMUTE         , .mask = 0x0, .par.swizzle.in = 
{3, 1, 0, 2}) \
+    MACRO(__VA_ARGS__, u32_permute_wyzx                        , .type = 
SWS_PIXEL_U32, .uop = SWS_UOP_PERMUTE         , .mask = 0x0, .par.swizzle.in = 
{3, 1, 2, 0}) \
+    MACRO(__VA_ARGS__, u32_permute_wzxy                        , .type = 
SWS_PIXEL_U32, .uop = SWS_UOP_PERMUTE         , .mask = 0x0, .par.swizzle.in = 
{3, 2, 0, 1}) \
+    MACRO(__VA_ARGS__, u32_permute_wzyx                        , .type = 
SWS_PIXEL_U32, .uop = SWS_UOP_PERMUTE         , .mask = 0x0, .par.swizzle.in = 
{3, 2, 1, 0})
 #define SWS_FOR_U32_COPY(MACRO, ...) \
     MACRO(__VA_ARGS__, u32_copy_yz_xx                          , 
SWS_PIXEL_U32, SWS_UOP_COPY            , 0x6, 0, 0, 0, 3) \
     MACRO(__VA_ARGS__, u32_copy_yzw_xxx                        , 
SWS_PIXEL_U32, SWS_UOP_COPY            , 0xe, 0, 0, 0, 0) \
diff --git a/tests/ref/fate/sws-ops-list b/tests/ref/fate/sws-ops-list
index dcda011ccc..d47e674bec 100644
--- a/tests/ref/fate/sws-ops-list
+++ b/tests/ref/fate/sws-ops-list
@@ -1 +1 @@
-e490d908612d059c644e64b43247fb08
+a5779f7e6e5f6a56d8150261343369ac
-- 
2.52.0

_______________________________________________
ffmpeg-devel mailing list -- ffmpeg-devel@ffmpeg.org
To unsubscribe send an email to ffmpeg-devel-leave@ffmpeg.org

[FFmpeg-devel] [PR] swscale: backport isolated or minor commits/fixes from my subsampling dev branch (PR #23538)

Reply via email to