[FFmpeg-devel] [PR] swscale: add support for integer linear ops (PR #23461)

Niklas Haas via ffmpeg-devel Fri, 12 Jun 2026 03:53:39 -0700

PR #23461 opened by Niklas Haas (haasn)
URL: https://code.ffmpeg.org/FFmpeg/FFmpeg/pulls/23461
Patch URL: https://code.ffmpeg.org/FFmpeg/FFmpeg/pulls/23461.patch


Integer linear ops are more efficient than converting to f32 when the result is 
expected to be exact. This series also lays some of the groundwork for an 
upcoming change I have planned: deleting SWS_OP_SCALE entirely and handling it 
as a linear op on the ops level.

As a side note, I'm also planning to remove some of the more costly 
optimization steps from ops_optimizer.c in general, such as the splitting of 
linear ops, and to handle these on the uops level during linear->uop 
translation instead.


>From 25ce6977b0fd1d432929017894c2c4994554595c Mon Sep 17 00:00:00 2001
From: Niklas Haas <[email protected]>
Date: Thu, 21 May 2026 18:38:49 +0200
Subject: [PATCH 01/14] swscale/uops: add ff_sws_uop_list_remove_at()

Signed-off-by: Niklas Haas <[email protected]>
---
 libswscale/uops.c | 11 +++++++++++
 libswscale/uops.h |  1 +
 2 files changed, 12 insertions(+)

diff --git a/libswscale/uops.c b/libswscale/uops.c
index b73aedb6e1..5a1ba8c794 100644
--- a/libswscale/uops.c
+++ b/libswscale/uops.c
@@ -397,6 +397,17 @@ int ff_sws_uop_list_append(SwsUOpList *uops, SwsUOp *uop)
     return 0;
 }
 
+void ff_sws_uop_list_remove_at(SwsUOpList *uops, int index, int count)
+{
+    const int end = uops->num_ops - count;
+    av_assert2(index >= 0 && count >= 0 && index + count <= uops->num_ops);
+    for (int i = 0; i < count; i++)
+        uop_uninit(&uops->ops[index + i]);
+    for (int i = index; i < end; i++)
+        uops->ops[i] = uops->ops[i + count];
+    uops->num_ops = end;
+}
+
 int ff_sws_dither_height(const SwsDitherUOp *dither)
 {
     int max_offset = 0;
diff --git a/libswscale/uops.h b/libswscale/uops.h
index b2e9af30a4..4bf56e57d2 100644
--- a/libswscale/uops.h
+++ b/libswscale/uops.h
@@ -239,6 +239,7 @@ typedef struct SwsUOpList {
 
 SwsUOpList *ff_sws_uop_list_alloc(void);
 void ff_sws_uop_list_free(SwsUOpList **ops);
+void ff_sws_uop_list_remove_at(SwsUOpList *uops, int index, int count);
 
 /* Takes over ownership of `uop` and sets it to {0}, even on failure. */
 int ff_sws_uop_list_append(SwsUOpList *uops, SwsUOp *uop);
-- 
2.52.0


>From 8af9e46568867932c1f0e4a2d73fb42317f1ebfd Mon Sep 17 00:00:00 2001
From: Niklas Haas <[email protected]>
Date: Thu, 21 May 2026 18:39:39 +0200
Subject: [PATCH 02/14] swscale/ops_optimizer: add ff_sws_uop_list_optimize()
 placeholder

This will be populated with logic in the near future. I decided to
also include the skeleton of the loop here, guarded by #if 0 to
avoid triggering unused variable warnings until we actually use it.

Signed-off-by: Niklas Haas <[email protected]>
---
 libswscale/ops_optimizer.c | 19 +++++++++++++++++++
 libswscale/uops.c          |  3 ++-
 libswscale/uops.h          |  5 +++++
 3 files changed, 26 insertions(+), 1 deletion(-)

diff --git a/libswscale/ops_optimizer.c b/libswscale/ops_optimizer.c
index 08fef00c63..1165dab480 100644
--- a/libswscale/ops_optimizer.c
+++ b/libswscale/ops_optimizer.c
@@ -902,6 +902,25 @@ int ff_sws_solve_shuffle(const SwsOpList *const ops, 
uint8_t shuffle[],
     return AVERROR(EINVAL);
 }
 
+int ff_sws_uop_list_optimize(SwsContext *ctx, SwsUOpFlags flags, SwsUOpList 
*uops)
+{
+#if 0
+    static const SwsUOp dummy = {0};
+
+retry:
+    for (int i = 0; i < uops->num_ops; i++) {
+        const SwsUOp *next = i < uops->num_ops - 1 ? &uops->ops[i + 1] : 
&dummy;
+        SwsUOp *op = &uops->ops[i];
+
+        switch (op->uop) {
+            /* placeholder */
+        }
+    }
+#endif
+
+    return 0;
+}
+
 /**
  * Determine a suitable intermediate buffer format for a given combination
  * of pixel types and number of planes. The exact interpretation of these
diff --git a/libswscale/uops.c b/libswscale/uops.c
index 5a1ba8c794..fdd07c9a0a 100644
--- a/libswscale/uops.c
+++ b/libswscale/uops.c
@@ -870,7 +870,8 @@ int ff_sws_ops_translate(SwsContext *ctx, const SwsOpList 
*ops,
             return ret;
         input = ops->ops[i].comps;
     }
-    return 0;
+
+    return ff_sws_uop_list_optimize(ctx, flags, uops);
 }
 
 static int register_uop(struct AVTreeNode **root, const SwsUOp *uop)
diff --git a/libswscale/uops.h b/libswscale/uops.h
index 4bf56e57d2..eac8ee191e 100644
--- a/libswscale/uops.h
+++ b/libswscale/uops.h
@@ -244,6 +244,11 @@ void ff_sws_uop_list_remove_at(SwsUOpList *uops, int 
index, int count);
 /* Takes over ownership of `uop` and sets it to {0}, even on failure. */
 int ff_sws_uop_list_append(SwsUOpList *uops, SwsUOp *uop);
 
+/**
+ * Called internally by ff_sws_ops_translate().
+ */
+int ff_sws_uop_list_optimize(SwsContext *ctx, SwsUOpFlags flags, SwsUOpList 
*uops);
+
 /**
  * Translate a list of operations down to micro-ops, which can be further
  * optimized and then directly executed by backends.
-- 
2.52.0


>From a976ee2c09b368570649eb064e1ed03356380e58 Mon Sep 17 00:00:00 2001
From: Niklas Haas <[email protected]>
Date: Wed, 10 Jun 2026 19:23:05 +0200
Subject: [PATCH 03/14] swscale/ops_optimizer: move convert->expand promotion
 to uops layer

This changes a lot of the op lists, but without affecting the translated
micro-ops. It slightly regresses AArch64 until that ops backend is also
updated to use the uops architecture, which should hopefully be soon.

This also affects the packed shuffle solver, which now covers fewer cases;
that is taken care of in a different commit.

 rgb24 16x16 -> rgb48le 16x16:
   [ u8 +++X] SWS_OP_READ         : 3 elem(s) packed >> 0
     min: {0 0 0 _}, max: {255 255 255 _}
-  [ u8 +++X] SWS_OP_CONVERT      : u8 -> u16 (expand)
+  [ u8 +++X] SWS_OP_CONVERT      : u8 -> u16
+    min: {0 0 0 _}, max: {255 255 255 _}
+  [u16 +++X] SWS_OP_SCALE        : * 257
     min: {0 0 0 _}, max: {65535 65535 65535 _}
   [u16 XXXX] SWS_OP_WRITE        : 3 elem(s) packed >> 0
     (X = unused, z = byteswapped, + = exact, 0 = zero)
  translated micro-ops:
     u8_read_packed_xyz
     u8_expand_pair_xyz
     u16_write_packed_xyz

Sponsored-by: Sovereign Tech Fund
Signed-off-by: Niklas Haas <[email protected]>
---
 libswscale/ops_optimizer.c  | 43 ++++++++++++++++++++++++-------------
 tests/ref/fate/sws-ops-list |  2 +-
 2 files changed, 29 insertions(+), 16 deletions(-)

diff --git a/libswscale/ops_optimizer.c b/libswscale/ops_optimizer.c
index 1165dab480..68cb9214d4 100644
--- a/libswscale/ops_optimizer.c
+++ b/libswscale/ops_optimizer.c
@@ -573,18 +573,6 @@ retry:
                 ff_sws_op_list_remove_at(ops, n + 1, 1);
                 goto retry;
             }
-
-            /* Conversion followed by integer expansion */
-            if (next->op == SWS_OP_SCALE && !op->convert.expand &&
-                ff_sws_pixel_type_is_int(op->type) &&
-                ff_sws_pixel_type_is_int(op->convert.to) &&
-                !av_cmp_q(next->scale.factor,
-                          ff_sws_pixel_expand(op->type, op->convert.to)))
-            {
-                op->convert.expand = true;
-                ff_sws_op_list_remove_at(ops, n + 1, 1);
-                goto retry;
-            }
             break;
 
         case SWS_OP_MIN:
@@ -902,9 +890,21 @@ int ff_sws_solve_shuffle(const SwsOpList *const ops, 
uint8_t shuffle[],
     return AVERROR(EINVAL);
 }
 
+static int is_integer_scale(const SwsUOp *op, int64_t val)
+{
+    if (op->uop != SWS_UOP_SCALE)
+        return false;
+
+    switch (op->type) {
+    case SWS_PIXEL_U8:  return op->data.scalar.u8  == val;
+    case SWS_PIXEL_U16: return op->data.scalar.u16 == val;
+    case SWS_PIXEL_U32: return op->data.scalar.u32 == val;
+    default: return false;
+    }
+}
+
 int ff_sws_uop_list_optimize(SwsContext *ctx, SwsUOpFlags flags, SwsUOpList 
*uops)
 {
-#if 0
     static const SwsUOp dummy = {0};
 
 retry:
@@ -913,10 +913,23 @@ retry:
         SwsUOp *op = &uops->ops[i];
 
         switch (op->uop) {
-            /* placeholder */
+        case SWS_UOP_TO_U16:
+            if (is_integer_scale(next, 0x101)) {
+                op->uop = SWS_UOP_EXPAND_PAIR;
+                ff_sws_uop_list_remove_at(uops, i + 1, 1);
+                goto retry;
+            }
+            break;
+
+        case SWS_UOP_TO_U32:
+            if (is_integer_scale(next, 0x1010101)) {
+                op->uop = SWS_UOP_EXPAND_QUAD;
+                ff_sws_uop_list_remove_at(uops, i + 1, 1);
+                goto retry;
+            }
+            break;
         }
     }
-#endif
 
     return 0;
 }
diff --git a/tests/ref/fate/sws-ops-list b/tests/ref/fate/sws-ops-list
index 68a1fc1105..a9c4844487 100644
--- a/tests/ref/fate/sws-ops-list
+++ b/tests/ref/fate/sws-ops-list
@@ -1 +1 @@
-e2f26cb6df5c11015e613016bb1a004a
+9283f06b12ed88d36b9de29e3be003aa
-- 
2.52.0


>From bcdcd49956e7bdbce4689d0fc864926813aa7fce Mon Sep 17 00:00:00 2001
From: Niklas Haas <[email protected]>
Date: Thu, 11 Jun 2026 20:19:04 +0200
Subject: [PATCH 04/14] swscale/ops: remove SwsConvertOp.expand

The distinction between plain and expanding conversions no longer exists on
the ops level.

Sponsored-by: Sovereign Tech Fund
Signed-off-by: Niklas Haas <[email protected]>
---
 libswscale/aarch64/ops_impl_conv.c |  4 +---
 libswscale/ops.c                   | 11 +++--------
 libswscale/ops.h                   |  1 -
 libswscale/ops_internal.h          | 12 ------------
 libswscale/ops_optimizer.c         | 18 ++----------------
 libswscale/uops.c                  | 18 +++++-------------
 libswscale/vulkan/ops.c            | 24 +++---------------------
 tests/checkasm/sw_ops.c            | 10 ++++++----
 8 files changed, 20 insertions(+), 78 deletions(-)

diff --git a/libswscale/aarch64/ops_impl_conv.c b/libswscale/aarch64/ops_impl_conv.c
index 98cb89edbc..5b990a8cfd 100644
--- a/libswscale/aarch64/ops_impl_conv.c
+++ b/libswscale/aarch64/ops_impl_conv.c
@@ -120,9 +120,7 @@ static int convert_to_aarch64_impl(SwsContext *ctx, const 
SwsOpList *ops, int n,
     case SWS_OP_LSHIFT:     out->op = AARCH64_SWS_OP_LSHIFT;     break;
     case SWS_OP_RSHIFT:     out->op = AARCH64_SWS_OP_RSHIFT;     break;
     case SWS_OP_CLEAR:      out->op = AARCH64_SWS_OP_CLEAR;      break;
-    case SWS_OP_CONVERT:
-        out->op = op->convert.expand ? AARCH64_SWS_OP_EXPAND : 
AARCH64_SWS_OP_CONVERT;
-        break;
+    case SWS_OP_CONVERT:    out->op = AARCH64_SWS_OP_CONVERT;    break;
     case SWS_OP_MIN:        out->op = AARCH64_SWS_OP_MIN;        break;
     case SWS_OP_MAX:        out->op = AARCH64_SWS_OP_MAX;        break;
     case SWS_OP_SCALE:      out->op = AARCH64_SWS_OP_SCALE;      break;
diff --git a/libswscale/ops.c b/libswscale/ops.c
index 4afeaabf54..89ff0f50d8 100644
--- a/libswscale/ops.c
+++ b/libswscale/ops.c
@@ -255,12 +255,8 @@ void ff_sws_apply_op_q(const SwsOp *op, AVRational x[4])
     }
     case SWS_OP_CONVERT:
         if (ff_sws_pixel_type_is_int(op->convert.to)) {
-            const AVRational scale = ff_sws_pixel_expand(op->type, 
op->convert.to);
-            for (int i = 0; i < 4; i++) {
+            for (int i = 0; i < 4; i++)
                 x[i] = x[i].den ? Q(x[i].num / x[i].den) : x[i];
-                if (op->convert.expand)
-                    x[i] = av_mul_q(x[i], scale);
-            }
         }
         return;
     case SWS_OP_DITHER:
@@ -916,10 +912,9 @@ void ff_sws_op_desc(AVBPrint *bp, const SwsOp *op)
                    op->swizzle.x, op->swizzle.y, op->swizzle.z, op->swizzle.w);
         break;
     case SWS_OP_CONVERT:
-        av_bprintf(bp, "%-20s: %s -> %s%s", name,
+        av_bprintf(bp, "%-20s: %s -> %s", name,
                    ff_sws_pixel_type_name(op->type),
-                   ff_sws_pixel_type_name(op->convert.to),
-                   op->convert.expand ? " (expand)" : "");
+                   ff_sws_pixel_type_name(op->convert.to));
         break;
     case SWS_OP_DITHER:
         av_bprintf(bp, "%-20s: %dx%d matrix + {%d %d %d %d}", name,
diff --git a/libswscale/ops.h b/libswscale/ops.h
index 65d9d49e60..98b2dd72ca 100644
--- a/libswscale/ops.h
+++ b/libswscale/ops.h
@@ -159,7 +159,6 @@ typedef struct SwsClearOp {
 
 typedef struct SwsConvertOp {
     SwsPixelType to; /* type of pixel to convert to */
-    bool expand; /* if true, integers are expanded to the full range */
 } SwsConvertOp;
 
 typedef struct SwsClampOp {
diff --git a/libswscale/ops_internal.h b/libswscale/ops_internal.h
index 9d8da6bbb5..b6224dd486 100644
--- a/libswscale/ops_internal.h
+++ b/libswscale/ops_internal.h
@@ -28,18 +28,6 @@
 
 #define Q(N) ((AVRational) { N, 1 })
 
-static inline AVRational ff_sws_pixel_expand(SwsPixelType from, SwsPixelType 
to)
-{
-    const int src = ff_sws_pixel_type_size(from);
-    const int dst = ff_sws_pixel_type_size(to);
-    if (src > dst)
-        return Q(0);
-    int scale = 1;
-    for (int i = 1; i < dst / src; i++)
-        scale = (scale << (src * 8)) | 1;
-    return Q(scale);
-}
-
 static inline void ff_sws_pack_op_decode(const SwsOp *op, uint64_t mask[4], 
int shift[4])
 {
     int size = 0;
diff --git a/libswscale/ops_optimizer.c b/libswscale/ops_optimizer.c
index 68cb9214d4..513bda51f5 100644
--- a/libswscale/ops_optimizer.c
+++ b/libswscale/ops_optimizer.c
@@ -369,7 +369,7 @@ retry:
             }
 
             /* Merge filter with prior conversion */
-            if (prev->op == SWS_OP_CONVERT && !prev->convert.expand) {
+            if (prev->op == SWS_OP_CONVERT) {
                 int size_from = ff_sws_pixel_type_size(prev->type);
                 int size_to   = ff_sws_pixel_type_size(op->type);
                 av_assert1(prev->convert.to == op->type);
@@ -565,9 +565,7 @@ retry:
             }
 
             /* Transitive conversion */
-            if (next->op == SWS_OP_CONVERT &&
-                op->convert.expand == next->convert.expand)
-            {
+            if (next->op == SWS_OP_CONVERT) {
                 av_assert1(op->convert.to == next->type);
                 op->convert.to = next->convert.to;
                 ff_sws_op_list_remove_at(ops, n + 1, 1);
@@ -841,18 +839,6 @@ int ff_sws_solve_shuffle(const SwsOpList *const ops, 
uint8_t shuffle[],
             }
             break;
 
-        case SWS_OP_CONVERT: {
-            if (!op->convert.expand)
-                return AVERROR(ENOTSUP);
-            for (int i = 0; i < 4; i++) {
-                switch (ff_sws_pixel_type_size(op->type)) {
-                case 1: mask[i] = 0x01010101 * (mask[i] & 0xFF);   break;
-                case 2: mask[i] = 0x00010001 * (mask[i] & 0xFFFF); break;
-                }
-            }
-            break;
-        }
-
         case SWS_OP_WRITE: {
             if (op->rw.frac || op->rw.filter.op || ff_sws_rw_op_planes(op) > 1)
                 return AVERROR(ENOTSUP);
diff --git a/libswscale/uops.c b/libswscale/uops.c
index fdd07c9a0a..5dc854ffd9 100644
--- a/libswscale/uops.c
+++ b/libswscale/uops.c
@@ -786,19 +786,11 @@ static int translate_op(SwsContext *ctx, SwsUOpList 
*uops, SwsUOpFlags flags,
 
     switch (op->op) {
     case SWS_OP_CONVERT:
-        if (op->convert.expand) {
-            av_assert0(op->type == SWS_PIXEL_U8);
-            switch (op->convert.to) {
-            case SWS_PIXEL_U16: uop.uop = SWS_UOP_EXPAND_PAIR; break;
-            case SWS_PIXEL_U32: uop.uop = SWS_UOP_EXPAND_QUAD; break;
-            }
-        } else {
-            switch (op->convert.to) {
-            case SWS_PIXEL_U8:  uop.uop = SWS_UOP_TO_U8;  break;
-            case SWS_PIXEL_U16: uop.uop = SWS_UOP_TO_U16; break;
-            case SWS_PIXEL_U32: uop.uop = SWS_UOP_TO_U32; break;
-            case SWS_PIXEL_F32: uop.uop = SWS_UOP_TO_F32; break;
-            }
+        switch (op->convert.to) {
+        case SWS_PIXEL_U8:  uop.uop = SWS_UOP_TO_U8;  break;
+        case SWS_PIXEL_U16: uop.uop = SWS_UOP_TO_U16; break;
+        case SWS_PIXEL_U32: uop.uop = SWS_UOP_TO_U32; break;
+        case SWS_PIXEL_F32: uop.uop = SWS_UOP_TO_F32; break;
         }
         break;
     case SWS_OP_UNPACK:
diff --git a/libswscale/vulkan/ops.c b/libswscale/vulkan/ops.c
index 1218fab2c7..5df72d8ba6 100644
--- a/libswscale/vulkan/ops.c
+++ b/libswscale/vulkan/ops.c
@@ -531,15 +531,6 @@ static void define_shader_consts(SwsContext *sws, const 
SwsOpList *ops,
         av_assert0((id->nb_const_ids + 31) <= FF_ARRAY_ELEMS(id->const_ids));
         const SwsOp *op = &ops->ops[n];
         switch (op->op) {
-        case SWS_OP_CONVERT:
-            if (ff_sws_pixel_type_is_int(op->convert.to) && 
op->convert.expand) {
-                AVRational m = ff_sws_pixel_expand(op->type, op->convert.to);
-                int tmp = spi_OpConstantUInt(spi, id->u32_type, m.num);
-                tmp = spi_OpConstantComposite(spi, id->u32vec4_type,
-                                              tmp, tmp, tmp, tmp);
-                id->const_ids[id->nb_const_ids++] = tmp;
-            }
-            break;
         case SWS_OP_CLEAR:
             for (int i = 0; i < 4; i++) {
                 if (!SWS_COMP_TEST(op->clear.mask, i))
@@ -1183,9 +1174,7 @@ static int add_ops_spirv(SwsContext *sws, VulkanPriv *p, 
FFVulkanOpsCtx *s,
                                        op->swizzle.in[3]);
             break;
         case SWS_OP_CONVERT:
-            if (ff_sws_pixel_type_is_int(cur_type) && op->convert.expand)
-                data = spi_OpIMul(spi, type_v, data, 
id->const_ids[nb_const_ids++]);
-            else if (op->type == SWS_PIXEL_F32 && type_s == id->u32_type)
+            if (op->type == SWS_PIXEL_F32 && type_s == id->u32_type)
                 data = spi_OpConvertFToU(spi, type_v, data);
             else if (op->type != SWS_PIXEL_F32 && type_s == id->f32_type)
                 data = spi_OpConvertUToF(spi, type_v, data);
@@ -1535,15 +1524,8 @@ static int add_ops_glsl(SwsContext *sws, VulkanPriv *p, 
FFVulkanOpsCtx *s,
                        op->op == SWS_OP_LSHIFT ? "<<" : ">>", 
op->shift.amount);
             break;
         case SWS_OP_CONVERT:
-            if (ff_sws_pixel_type_is_int(cur_type) && op->convert.expand) {
-                const AVRational sc = ff_sws_pixel_expand(op->type, 
op->convert.to);
-                av_bprintf(&shd->src, "    %s = %s((%s*%i)/%i);\n",
-                           type_name, type_v, ff_sws_pixel_type_name(op->type),
-                           sc.num, sc.den);
-            } else {
-                av_bprintf(&shd->src, "    %s = %s(%s);\n",
-                           type_name, type_v, 
ff_sws_pixel_type_name(op->type));
-            }
+            av_bprintf(&shd->src, "    %s = %s(%s);\n",
+                        type_name, type_v, ff_sws_pixel_type_name(op->type));
             break;
         case SWS_OP_DITHER: {
             int size = (1 << op->dither.size_log2);
diff --git a/tests/checkasm/sw_ops.c b/tests/checkasm/sw_ops.c
index 99140ced52..bff56ebd2f 100644
--- a/tests/checkasm/sw_ops.c
+++ b/tests/checkasm/sw_ops.c
@@ -25,6 +25,7 @@
 #include "libavutil/refstruct.h"
 
 #include "libswscale/ops.h"
+#include "libswscale/ops_internal.h"
 #include "libswscale/ops_dispatch.h"
 #include "libswscale/uops.h"
 #include "libswscale/uops_macros.h"
@@ -559,10 +560,11 @@ static void check_expand(const char *name, const SwsUOp 
*uop)
     CHECK_MASK(name, uop->mask, NULL, uop->type, dst, {
         .op   = SWS_OP_CONVERT,
         .type = uop->type,
-        .convert = {
-            .to = dst,
-            .expand = true,
-        },
+        .convert.to = dst,
+    }, {
+        .op   = SWS_OP_SCALE,
+        .type = dst,
+        .scale.factor = uop->uop == SWS_UOP_EXPAND_PAIR ? Q(0x101) : 
Q(0x1010101),
     });
 }
 
-- 
2.52.0


>From 3274926872a8cde52a04c625dfde079b27bd4b33 Mon Sep 17 00:00:00 2001
From: Niklas Haas <[email protected]>
Date: Thu, 21 May 2026 19:29:29 +0200
Subject: [PATCH 05/14] swscale/ops: remove SwsLinearOp.mask

This was originally introduced to make matching linear ops against
implementations faster. However, since this is now handled on the uops
level, there is no more reason to carry this metadata on the ops level.

This simplifies a lot of places in the code, and will simplify even more once
the linear optimizations are moved to the uops level.

Sponsored-by: Sovereign Tech Fund
Signed-off-by: Niklas Haas <[email protected]>
---
 libswscale/aarch64/ops_impl_conv.c |  3 ++-
 libswscale/format.c                |  7 +------
 libswscale/ops.c                   | 12 +++++++-----
 libswscale/ops.h                   | 10 ++--------
 libswscale/ops_optimizer.c         | 17 +++++++----------
 libswscale/uops.c                  |  3 ++-
 tests/checkasm/sw_ops.c            |  1 -
 7 files changed, 21 insertions(+), 32 deletions(-)

diff --git a/libswscale/aarch64/ops_impl_conv.c b/libswscale/aarch64/ops_impl_conv.c
index 5b990a8cfd..3a136a154e 100644
--- a/libswscale/aarch64/ops_impl_conv.c
+++ b/libswscale/aarch64/ops_impl_conv.c
@@ -205,9 +205,10 @@ static int convert_to_aarch64_impl(SwsContext *ctx, const 
SwsOpList *ops, int n,
          *   11: m[i][j] is any other coefficient
          */
         out->mask = 0;
+        const uint32_t lin_mask = ff_sws_linear_mask(&op->lin);
         for (int i = 0; i < 4; i++) {
             /* Skip unused or identity rows */
-            if (!SWS_OP_NEEDED(op, i) || !(op->lin.mask & SWS_MASK_ROW(i)))
+            if (!SWS_OP_NEEDED(op, i) || !(lin_mask & SWS_MASK_ROW(i)))
                 continue;
             MASK_SET(out->mask, i, 1);
             for (int j = 0; j < 5; j++) {
diff --git a/libswscale/format.c b/libswscale/format.c
index d34e9d7be4..81c9ed5574 100644
--- a/libswscale/format.c
+++ b/libswscale/format.c
@@ -1203,7 +1203,6 @@ static SwsLinearOp fmt_encode_range(const SwsFormat *fmt, 
bool *incomplete)
         c.m[0][0] = av_neg_q(c.m[0][0]);
     }
 
-    c.mask = ff_sws_linear_mask(c);
     return c;
 }
 
@@ -1222,7 +1221,6 @@ static SwsLinearOp fmt_decode_range(const SwsFormat *fmt, 
bool *incomplete)
     if (!(fmt->desc->flags & AV_PIX_FMT_FLAG_ALPHA))
         c.m[3][4] = Q1;
 
-    c.mask = ff_sws_linear_mask(c);
     return c;
 }
 
@@ -1381,15 +1379,12 @@ linear_mat3(const AVRational m00, const AVRational m01, 
const AVRational m02,
             const AVRational m10, const AVRational m11, const AVRational m12,
             const AVRational m20, const AVRational m21, const AVRational m22)
 {
-    SwsLinearOp c = {{
+    return (SwsLinearOp) {{
         { m00, m01, m02, Q0, Q0 },
         { m10, m11, m12, Q0, Q0 },
         { m20, m21, m22, Q0, Q0 },
         {  Q0,  Q0,  Q0, Q1, Q0 },
     }};
-
-    c.mask = ff_sws_linear_mask(c);
-    return c;
 }
 
 int ff_sws_decode_colors(SwsContext *ctx, SwsPixelType type,
diff --git a/libswscale/ops.c b/libswscale/ops.c
index 89ff0f50d8..5c455fdd75 100644
--- a/libswscale/ops.c
+++ b/libswscale/ops.c
@@ -769,20 +769,22 @@ int ff_sws_op_list_max_size(const SwsOpList *ops)
     return max_size;
 }
 
-uint32_t ff_sws_linear_mask(const SwsLinearOp c)
+uint32_t ff_sws_linear_mask(const SwsLinearOp *c)
 {
     uint32_t mask = 0;
     for (int i = 0; i < 4; i++) {
         for (int j = 0; j < 5; j++) {
-            if (av_cmp_q(c.m[i][j], Q(i == j)))
+            if (av_cmp_q(c->m[i][j], Q(i == j)))
                 mask |= SWS_MASK(i, j);
         }
     }
     return mask;
 }
 
-static const char *describe_lin_mask(uint32_t mask)
+static const char *describe_lin_op(const SwsLinearOp *c)
 {
+    const uint32_t mask = ff_sws_linear_mask(c);
+
     /* Try to be fairly descriptive without assuming too much */
     static const struct {
         char name[24];
@@ -818,7 +820,7 @@ static const char *describe_lin_mask(uint32_t mask)
             return patterns[i].name;
     }
 
-    av_unreachable("Invalid linear mask!");
+    av_unreachable("Invalid linear op!");
     return "ERR";
 }
 
@@ -932,7 +934,7 @@ void ff_sws_op_desc(AVBPrint *bp, const SwsOp *op)
         av_bprintf(bp, " <= x");
         break;
     case SWS_OP_LINEAR:
-        av_bprintf(bp, "%-20s: %s [", name, describe_lin_mask(op->lin.mask));
+        av_bprintf(bp, "%-20s: %s [", name, describe_lin_op(&op->lin));
         for (int i = 0; i < 4; i++) {
             av_bprintf(bp, "%s[", i ? " " : "");
             for (int j = 0; j < 5; j++) {
diff --git a/libswscale/ops.h b/libswscale/ops.h
index 98b2dd72ca..2d0e57aa42 100644
--- a/libswscale/ops.h
+++ b/libswscale/ops.h
@@ -183,14 +183,8 @@ typedef struct SwsLinearOp {
      *   [ Out.y ] = [ F G H I J ] * [ x y z w 1 ]
      *   [ Out.z ] = [ K L M N O ]
      *   [ Out.w ] = [ P Q R S T ]
-     *
-     * The mask keeps track of which components differ from an identity matrix.
-     * There may be more efficient implementations of particular subsets, for
-     * example the common subset of {A, E, G, J, M, O} can be implemented with
-     * just three fused multiply-add operations.
      */
     AVRational m[4][5];
-    uint32_t mask; /* m[i][j] <-> 1 << (5 * i + j) */
 } SwsLinearOp;
 
 #define SWS_MASK(I, J)  (1 << (5 * (I) + (J)))
@@ -214,8 +208,8 @@ enum {
     SWS_MASK_MAT4  = SWS_MASK_ALL & ~SWS_MASK_OFF4,
 };
 
-/* Helper function to compute the correct mask */
-uint32_t ff_sws_linear_mask(SwsLinearOp);
+/* m[i][j] <-> 1 << (5 * i + j) */
+uint32_t ff_sws_linear_mask(const SwsLinearOp *);
 
 typedef struct SwsFilterOp {
     SwsFilterWeights *kernel; /* filter kernel (refstruct) */
diff --git a/libswscale/ops_optimizer.c b/libswscale/ops_optimizer.c
index 513bda51f5..e0a6165631 100644
--- a/libswscale/ops_optimizer.c
+++ b/libswscale/ops_optimizer.c
@@ -249,7 +249,7 @@ static bool extract_scalar(const SwsLinearOp *c, SwsComps 
comps, SwsComps prev,
     SwsScaleOp scale = {0};
 
     /* There are components not on the main diagonal */
-    if (c->mask & ~SWS_MASK_DIAG4)
+    if (ff_sws_linear_mask(c) & ~SWS_MASK_DIAG4)
         return false;
 
     for (int i = 0; i < 4; i++) {
@@ -271,6 +271,7 @@ static bool extract_scalar(const SwsLinearOp *c, SwsComps 
comps, SwsComps prev,
 static bool extract_constant_rows(SwsLinearOp *c, SwsComps prev,
                                   SwsClearOp *out_clear)
 {
+    const uint32_t mask = ff_sws_linear_mask(c);
     SwsClearOp clear = {0};
     bool ret = false;
 
@@ -280,12 +281,11 @@ static bool extract_constant_rows(SwsLinearOp *c, 
SwsComps prev,
             const_row &= c->m[i][j].num == 0 || /* scalar is zero */
                          (prev.flags[j] & SWS_COMP_ZERO); /* input is zero */
         }
-        if (const_row && (c->mask & SWS_MASK_ROW(i))) {
+        if (const_row && (mask & SWS_MASK_ROW(i))) {
             clear.mask |= SWS_COMP(i);
             clear.value[i] = c->m[i][4];
             for (int j = 0; j < 5; j++)
                 c->m[i][j] = Q(i == j);
-            c->mask &= ~SWS_MASK_ROW(i);
             ret = true;
         }
     }
@@ -331,7 +331,6 @@ static bool extract_swizzle(SwsLinearOp *op, SwsComps prev, 
SwsSwizzleOp *out_sw
     if (swiz.mask == SWS_SWIZZLE(0, 1, 2, 3).mask)
         return false; /* no swizzle was identified */
 
-    c.mask = ff_sws_linear_mask(c);
     *out_swiz = swiz;
     *op = c;
     return true;
@@ -620,12 +619,13 @@ retry:
             break;
 
         case SWS_OP_LINEAR: {
+            const uint32_t mask = ff_sws_linear_mask(&op->lin);
             SwsSwizzleOp swizzle;
             SwsClearOp clear;
             SwsScaleOp scale;
 
             /* No-op (identity) linear operation */
-            if (!op->lin.mask) {
+            if (!mask) {
                 ff_sws_op_list_remove_at(ops, n, 1);
                 goto retry;
             }
@@ -644,7 +644,6 @@ retry:
                         op->lin.m[i][j] = sum;
                     }
                 }
-                op->lin.mask = ff_sws_linear_mask(op->lin);
                 ff_sws_op_list_remove_at(ops, n + 1, 1);
                 goto retry;
             }
@@ -652,22 +651,20 @@ retry:
             /* Optimize away zero columns */
             for (int j = 0; j < 4; j++) {
                 const uint32_t col = SWS_MASK_COL(j);
-                if (!(prev->comps.flags[j] & SWS_COMP_ZERO) || !(op->lin.mask 
& col))
+                if (!(prev->comps.flags[j] & SWS_COMP_ZERO) || !(mask & col))
                     continue;
                 for (int i = 0; i < 4; i++)
                     op->lin.m[i][j] = Q(i == j);
-                op->lin.mask &= ~col;
                 goto retry;
             }
 
             /* Optimize away unused rows */
             for (int i = 0; i < 4; i++) {
                 const uint32_t row = SWS_MASK_ROW(i);
-                if (SWS_OP_NEEDED(op, i) || !(op->lin.mask & row))
+                if (SWS_OP_NEEDED(op, i) || !(mask & row))
                     continue;
                 for (int j = 0; j < 5; j++)
                     op->lin.m[i][j] = Q(i == j);
-                op->lin.mask &= ~row;
                 goto retry;
             }
 
diff --git a/libswscale/uops.c b/libswscale/uops.c
index 5dc854ffd9..f07da9fc65 100644
--- a/libswscale/uops.c
+++ b/libswscale/uops.c
@@ -711,11 +711,12 @@ static int translate_linear_op(SwsContext *ctx, 
SwsUOpList *ops,
         .uop  = SWS_UOP_LINEAR,
     };
 
+    const uint32_t mask = ff_sws_linear_mask(&op->lin);
     const bool bitexact = ctx->flags & SWS_BITEXACT;
     uint32_t exact = 0;
 
     for (int i = 0; i < 4; i++) {
-        if (SWS_OP_NEEDED(op, i) && (op->lin.mask & SWS_MASK_ROW(i)))
+        if (SWS_OP_NEEDED(op, i) && (mask & SWS_MASK_ROW(i)))
             uop.mask |= SWS_COMP(i);
         for (int j = 0; j < 5; j++) {
             const AVRational k = op->lin.m[i][j];
diff --git a/tests/checkasm/sw_ops.c b/tests/checkasm/sw_ops.c
index bff56ebd2f..706b49fe2e 100644
--- a/tests/checkasm/sw_ops.c
+++ b/tests/checkasm/sw_ops.c
@@ -699,7 +699,6 @@ static void check_linear(const char *name, const SwsUOp 
*uop)
         }
     }
 
-    lin.mask = ff_sws_linear_mask(lin);
     CHECK(name, 4, 4, type, type, {
         .op   = SWS_OP_LINEAR,
         .type = type,
-- 
2.52.0


>From b561c9ea9bc54eb0cf354afe9a25971f38bf3bed Mon Sep 17 00:00:00 2001
From: Niklas Haas <[email protected]>
Date: Wed, 10 Jun 2026 19:37:13 +0200
Subject: [PATCH 06/14] swscale/uops: move expand_bit from translation stage to
 optimizer

This is done in anticipation of SWS_OP_SCALE being removed.

Sponsored-by: Sovereign Tech Fund
Signed-off-by: Niklas Haas <[email protected]>
---
 libswscale/ops_optimizer.c | 10 ++++++++++
 libswscale/uops.c          | 26 ++------------------------
 2 files changed, 12 insertions(+), 24 deletions(-)

diff --git a/libswscale/ops_optimizer.c b/libswscale/ops_optimizer.c
index e0a6165631..f68778805f 100644
--- a/libswscale/ops_optimizer.c
+++ b/libswscale/ops_optimizer.c
@@ -911,6 +911,16 @@ retry:
                 goto retry;
             }
             break;
+
+        case SWS_UOP_SCALE: {
+            const int bits = 8 * ff_sws_pixel_type_size(op->type);
+            if (is_integer_scale(op, UINT64_MAX >> (64 - bits))) {
+                op->uop = SWS_UOP_EXPAND_BIT;
+                memset(&op->par, 0, sizeof(op->par));
+                goto retry;
+            }
+            break;
+        }
         }
     }
 
diff --git a/libswscale/uops.c b/libswscale/uops.c
index f07da9fc65..1eb26fdb1d 100644
--- a/libswscale/uops.c
+++ b/libswscale/uops.c
@@ -740,24 +740,6 @@ static int translate_linear_op(SwsContext *ctx, SwsUOpList 
*ops,
     return ff_sws_uop_list_append(ops, &uop);
 }
 
-static bool is_expand_bit(SwsPixelType type, AVRational factor)
-{
-    if (factor.den != 1)
-        return false;
-
-    switch (type) {
-    case SWS_PIXEL_U8:  return factor.num == UINT8_MAX;
-    case SWS_PIXEL_U16: return factor.num == UINT16_MAX;
-    case SWS_PIXEL_U32: return factor.num == UINT32_MAX;
-    case SWS_PIXEL_F32: return false;
-    case SWS_PIXEL_NONE:
-    case SWS_PIXEL_TYPE_NB: break;
-    }
-
-    av_unreachable("Invalid pixel type!");
-    return false;
-}
-
 static int translate_op(SwsContext *ctx, SwsUOpList *uops, SwsUOpFlags flags,
                         const SwsOp *op, const SwsComps *input)
 {
@@ -825,12 +807,8 @@ static int translate_op(SwsContext *ctx, SwsUOpList *uops, 
SwsUOpFlags flags,
         }
         break;
     case SWS_OP_SCALE:
-        if (is_expand_bit(op->type, op->scale.factor)) {
-            uop.uop = SWS_UOP_EXPAND_BIT;
-        } else {
-            uop.uop = SWS_UOP_SCALE;
-            uop.data.scalar = Q2PIXEL(op->scale.factor);
-        }
+        uop.uop = SWS_UOP_SCALE;
+        uop.data.scalar = Q2PIXEL(op->scale.factor);
         break;
     case SWS_OP_MIN:
     case SWS_OP_MAX:
-- 
2.52.0


>From 2acd2aa39072ca3239c916adcedf4c739288fcff Mon Sep 17 00:00:00 2001
From: Niklas Haas <[email protected]>
Date: Thu, 11 Jun 2026 16:25:20 +0200
Subject: [PATCH 07/14] swscale/uops_tmpl: use intermediate variable for linear
 results

Accumulate each row into a local temporary instead of writing directly to
the output buffer. This might result in slightly better codegen in
principle, though GCC does not seem to care.

Sponsored-by: Sovereign Tech Fund
Signed-off-by: Niklas Haas <[email protected]>
---
 libswscale/uops_tmpl.c | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/libswscale/uops_tmpl.c b/libswscale/uops_tmpl.c
index 44e8551083..18a3f9ec55 100644
--- a/libswscale/uops_tmpl.c
+++ b/libswscale/uops_tmpl.c
@@ -782,11 +782,12 @@ DECL_FUNC(linear, const SwsCompMask mask, const uint32_t 
one, const uint32_t zer
     ((one & SWS_MASK(I, J)) ? (val) : c.m[I][J] * (val))
 
 #define LIN_ROW(I, var) do {                                    \
-    var[i] = (zero & SWS_MASK(I, 4)) ? 0 : c.k[I];              \
-    if (!(zero & SWS_MASK(I, 0))) var[i] += LIN_VAL(I, 0, xx);  \
-    if (!(zero & SWS_MASK(I, 1))) var[i] += LIN_VAL(I, 1, yy);  \
-    if (!(zero & SWS_MASK(I, 2))) var[i] += LIN_VAL(I, 2, zz);  \
-    if (!(zero & SWS_MASK(I, 3))) var[i] += LIN_VAL(I, 3, ww);  \
+    pixel_t tmp = (zero & SWS_MASK(I, 4)) ? 0 : c.k[I];         \
+    if (!(zero & SWS_MASK(I, 0))) tmp += LIN_VAL(I, 0, xx);     \
+    if (!(zero & SWS_MASK(I, 1))) tmp += LIN_VAL(I, 1, yy);     \
+    if (!(zero & SWS_MASK(I, 2))) tmp += LIN_VAL(I, 2, zz);     \
+    if (!(zero & SWS_MASK(I, 3))) tmp += LIN_VAL(I, 3, ww);     \
+    var[i] = tmp;                                               \
 } while (0)
 
         if (X) LIN_ROW(0, x);
-- 
2.52.0


>From 6c6671f8acf1349d0ea15bd0e9cf0ce8a2461f7e Mon Sep 17 00:00:00 2001
From: Niklas Haas <[email protected]>
Date: Thu, 11 Jun 2026 16:26:29 +0200
Subject: [PATCH 08/14] swscale/uops_tmpl: avoid overflow of intermediate
 linear products

Multiplying e.g. uint16_t * uint16_t implicitly promotes both operands to
signed int, so the product might overflow (undefined behavior). Avoid this
by explicitly casting the coefficient to (inter_t). This shouldn't affect
codegen because the intermediate result is cast right back to `pixel_t`.

Sponsored-by: Sovereign Tech Fund
Signed-off-by: Niklas Haas <[email protected]>
---
 libswscale/uops_tmpl.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/libswscale/uops_tmpl.c b/libswscale/uops_tmpl.c
index 18a3f9ec55..2fb241f1fe 100644
--- a/libswscale/uops_tmpl.c
+++ b/libswscale/uops_tmpl.c
@@ -779,7 +779,7 @@ DECL_FUNC(linear, const SwsCompMask mask, const uint32_t 
one, const uint32_t zer
         const pixel_t ww = w[i];
 
 #define LIN_VAL(I, J, val) \
-    ((one & SWS_MASK(I, J)) ? (val) : c.m[I][J] * (val))
+    ((one & SWS_MASK(I, J)) ? (val) : (inter_t) c.m[I][J] * (val))
 
 #define LIN_ROW(I, var) do {                                    \
     pixel_t tmp = (zero & SWS_MASK(I, 4)) ? 0 : c.k[I];         \
-- 
2.52.0


>From 49d90c5b0ef8ed093e5ce1b9589949962cd8f90b Mon Sep 17 00:00:00 2001
From: Niklas Haas <[email protected]>
Date: Thu, 11 Jun 2026 17:18:46 +0200
Subject: [PATCH 09/14] swscale/ops: lift the SWS_OP_LINEAR float-only
 restriction

The reference backend does not need any change, it's already written to
handle integer linear ops as a result of the previous commits.

The x86 implementation will be updated in a future commit.

Sponsored-by: Sovereign Tech Fund
Signed-off-by: Niklas Haas <[email protected]>
---
 libswscale/ops.c        | 1 -
 libswscale/ops.h        | 7 ++++++-
 libswscale/uops.c       | 2 +-
 tests/checkasm/sw_ops.c | 6 ++++--
 4 files changed, 11 insertions(+), 5 deletions(-)

diff --git a/libswscale/ops.c b/libswscale/ops.c
index 5c455fdd75..ff710da4de 100644
--- a/libswscale/ops.c
+++ b/libswscale/ops.c
@@ -275,7 +275,6 @@ void ff_sws_apply_op_q(const SwsOp *op, AVRational x[4])
             x[i] = av_max_q(x[i], op->clamp.limit[i]);
         return;
     case SWS_OP_LINEAR: {
-        av_assert1(!ff_sws_pixel_type_is_int(op->type));
         const AVRational orig[4] = { x[0], x[1], x[2], x[3] };
         for (int i = 0; i < 4; i++) {
             AVRational sum = op->lin.m[i][4];
diff --git a/libswscale/ops.h b/libswscale/ops.h
index 2d0e57aa42..6285784709 100644
--- a/libswscale/ops.h
+++ b/libswscale/ops.h
@@ -52,9 +52,9 @@ typedef enum SwsOpType {
     SWS_OP_MIN,             /* numeric minimum */
     SWS_OP_MAX,             /* numeric maximum */
     SWS_OP_SCALE,           /* multiplication by scalar */
+    SWS_OP_LINEAR,          /* generalized linear affine transform */
 
     /* Floating-point only arithmetic operations. */
-    SWS_OP_LINEAR,          /* generalized linear affine transform */
     SWS_OP_DITHER,          /* add dithering noise */
 
     /* Filtering operations. Always output floating point. */
@@ -183,6 +183,11 @@ typedef struct SwsLinearOp {
      *   [ Out.y ] = [ F G H I J ] * [ x y z w 1 ]
      *   [ Out.z ] = [ K L M N O ]
      *   [ Out.w ] = [ P Q R S T ]
+     *
+     * Note: For integer types, these entries may exceed the value range of
+     * the pixel type itself. It's implied that implementations will perform
+     * all intermediate calculations modulo 2^N where N is the bit depth of
+     * the pixel type, including the final result.
      */
     AVRational m[4][5];
 } SwsLinearOp;
diff --git a/libswscale/uops.c b/libswscale/uops.c
index 1eb26fdb1d..f78bec0f94 100644
--- a/libswscale/uops.c
+++ b/libswscale/uops.c
@@ -731,7 +731,7 @@ static int translate_linear_op(SwsContext *ctx, SwsUOpList 
*ops,
         }
     }
 
-    if (flags & SWS_UOP_FLAG_FMA) {
+    if ((flags & SWS_UOP_FLAG_FMA) && !ff_sws_pixel_type_is_int(op->type)) {
         /* multiplication by 1 and 0 are always exact by definition */
         uop.uop = SWS_UOP_LINEAR_FMA;
         uop.par.lin.exact = exact | uop.par.lin.zero | uop.par.lin.one;
diff --git a/tests/checkasm/sw_ops.c b/tests/checkasm/sw_ops.c
index 706b49fe2e..eb47f5dddf 100644
--- a/tests/checkasm/sw_ops.c
+++ b/tests/checkasm/sw_ops.c
@@ -685,7 +685,6 @@ static void check_clear(const char *name, const SwsUOp *uop)
 static void check_linear(const char *name, const SwsUOp *uop)
 {
     const SwsPixelType type = uop->type;
-    av_assert0(!ff_sws_pixel_type_is_int(type));
 
     SwsLinearOp lin;
     for (int i = 0; i < 4; i++) {
@@ -694,8 +693,11 @@ static void check_linear(const char *name, const SwsUOp 
*uop)
                 lin.m[i][j] = (AVRational) { 1, 1 };
             else if (uop->par.lin.zero & SWS_MASK(i, j))
                 lin.m[i][j] = (AVRational) { 0, 1 };
-            else
+            else {
+                /* This may overflow individual multiplications, but the
+                 * overflow behavior is defined (see SwsLinearOp) */
                 lin.m[i][j] = rndq(type);
+            }
         }
     }
 
-- 
2.52.0


>From 341a1ff17139ac94176fe07bd8423b1ddc2c509a Mon Sep 17 00:00:00 2001
From: Niklas Haas <[email protected]>
Date: Thu, 11 Jun 2026 19:11:56 +0200
Subject: [PATCH 10/14] tests/checkasm/sw_ops: avoid zero/one random linear
 coefficients

A random coefficient of exactly 0 or 1 would conflict with the zero/one
mask and lead to the wrong uop being selected. This bug was always latent,
but previously almost impossible to hit, as the odds were about 1 in 2^63.

However, the generalization of SWS_UOP_LINEAR to smaller types (e.g. 8-bit)
makes this significantly more probable to hit.

Sponsored-by: Sovereign Tech Fund
Signed-off-by: Niklas Haas <[email protected]>
---
 tests/checkasm/sw_ops.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/tests/checkasm/sw_ops.c b/tests/checkasm/sw_ops.c
index eb47f5dddf..492f69ed61 100644
--- a/tests/checkasm/sw_ops.c
+++ b/tests/checkasm/sw_ops.c
@@ -696,7 +696,9 @@ static void check_linear(const char *name, const SwsUOp 
*uop)
             else {
                 /* This may overflow individual multiplications, but the
                  * overflow behavior is defined (see SwsLinearOp) */
-                lin.m[i][j] = rndq(type);
+                do {
+                    lin.m[i][j] = rndq(type);
+                } while (lin.m[i][j].den == 1 && !(lin.m[i][j].num & ~1));
             }
         }
     }
-- 
2.52.0


>From 6f5596b3e31d5350245c3ad2c857c6b5f0f4fba3 Mon Sep 17 00:00:00 2001
From: Niklas Haas <[email protected]>
Date: Wed, 10 Jun 2026 19:53:59 +0200
Subject: [PATCH 11/14] swscale/ops_optimizer: fold clears into linear ops

This does not affect any current op lists, but slightly modifies the series
of internal steps (clears are first folded into linear ops and then
extracted again, rather than being commuted past reduced scaling ops).

Sponsored-by: Sovereign Tech Fund
Signed-off-by: Niklas Haas <[email protected]>
---
 libswscale/ops_optimizer.c | 18 ++++++++++++++++++
 1 file changed, 18 insertions(+)

diff --git a/libswscale/ops_optimizer.c b/libswscale/ops_optimizer.c
index f68778805f..f76d1fe5c5 100644
--- a/libswscale/ops_optimizer.c
+++ b/libswscale/ops_optimizer.c
@@ -502,6 +502,24 @@ retry:
                 ff_sws_op_list_remove_at(ops, n + 1, 1);
                 goto retry;
             }
+
+            /* Fold into linear op */
+            if (next->op == SWS_OP_LINEAR) {
+                for (int j = 0; j < 4; j++) {
+                    if (!SWS_COMP_TEST(op->clear.mask, j))
+                        continue;
+
+                    const AVRational x = op->clear.value[j];
+                    for (int i = 0; i < 4; i++) {
+                        const AVRational kx = av_mul_q(next->lin.m[i][j], x);
+                        next->lin.m[i][4] = av_add_q(next->lin.m[i][4], kx);
+                        next->lin.m[i][j] = Q(0);
+                    }
+                }
+
+                ff_sws_op_list_remove_at(ops, n, 1);
+                goto retry;
+            }
             break;
 
         case SWS_OP_SWIZZLE:
-- 
2.52.0


>From 32fdc6c924fb02388131a3e9995592c69a7f45e7 Mon Sep 17 00:00:00 2001
From: Niklas Haas <[email protected]>
Date: Thu, 11 Jun 2026 15:21:44 +0200
Subject: [PATCH 12/14] swscale/ops_optimizer: unpromote integer-only linear
 ops

This avoids a whole class of unnecessary round trips through F32 for exact
integer-only linear ops.

This is a huge win especially for the x86 backend which no longer needs to
worry about cross-lane shuffles when internally expanding and repacking the
32-bit intermediate results.

Affects a large number of ops lists, including but not limited to:

 monow 16x16 -> rgb24 16x16:
   [ u8 +XXX] SWS_OP_READ         : 1 elem(s) planar >> 3
     min: {0 _ _ _}, max: {1 _ _ _}
-  [ u8 +XXX] SWS_OP_CONVERT      : u8 -> f32
-    min: {0 _ _ _}, max: {1 _ _ _}
-  [f32 +++X] SWS_OP_SWIZZLE      : 0003
+  [ u8 +++X] SWS_OP_SWIZZLE      : 0003
     min: {0 0 0 _}, max: {1 1 1 _}
-  [f32 +++X] SWS_OP_LINEAR       : diag3+off3 [[-255 0 0 0 255] [0 -255 0 0 
255] [0 0 -255 0 255] [0 0 0 1 0]]
-    min: {0 0 0 _}, max: {255 255 255 _}
-  [f32 +++X] SWS_OP_CONVERT      : f32 -> u8
+  [ u8 +++X] SWS_OP_LINEAR       : diag3+off3 [[-255 0 0 0 255] [0 -255 0 0 
255] [0 0 -255 0 255] [0 0 0 1 0]]
     min: {0 0 0 _}, max: {255 255 255 _}
   [ u8 XXXX] SWS_OP_WRITE        : 3 elem(s) packed >> 0
     (X = unused, z = byteswapped, + = exact, 0 = zero)
  translated micro-ops:
     u8_read_bit_x
-    u8_to_f32_x
-    u32_copy_yz_xx
-    f32_linear_xyz_x000x_0x00x_00x0x
-    f32_to_u8_xyz
+    u8_copy_yz_xx
+    u8_linear_xyz_x000x_0x00x_00x0x
     u8_write_packed_xyz

 bgr4 16x16 -> rgb48le 16x16:
   [ u8 +XXX] SWS_OP_READ         : 1 elem(s) packed >> 1
   [ u8 +++X] SWS_OP_UNPACK       : {1 2 1 0}
     min: {0 0 0 _}, max: {1 3 1 _}
-  [ u8 +++X] SWS_OP_CONVERT      : u8 -> f32
+  [ u8 +++X] SWS_OP_CONVERT      : u8 -> u16
     min: {0 0 0 _}, max: {1 3 1 _}
-  [f32 +++X] SWS_OP_SWIZZLE      : 2103
+  [u16 +++X] SWS_OP_SWIZZLE      : 2103
     min: {0 0 0 _}, max: {1 3 1 _}
-  [f32 +++X] SWS_OP_LINEAR       : diag3 [[65535 0 0 0 0] [0 21845 0 0 0] [0 0 
65535 0 0] [0 0 0 1 0]]
-    min: {0 0 0 _}, max: {65535 65535 65535 _}
-  [f32 +++X] SWS_OP_CONVERT      : f32 -> u16
+  [u16 +++X] SWS_OP_LINEAR       : diag3 [[65535 0 0 0 0] [0 21845 0 0 0] [0 0 
65535 0 0] [0 0 0 1 0]]
     min: {0 0 0 _}, max: {65535 65535 65535 _}
   [u16 XXXX] SWS_OP_WRITE        : 3 elem(s) packed >> 0
     (X = unused, z = byteswapped, + = exact, 0 = zero)
  translated micro-ops:
     u8_read_nibble_x
     u8_unpack_xyz_121
-    u8_to_f32_xyz
-    u32_permute_xz_zx
-    f32_linear_xyz_x0000_0x000_00x00
-    f32_to_u16_xyz
+    u8_to_u16_xyz
+    u16_permute_xz_zx
+    u16_linear_xyz_x0000_0x000_00x00
     u16_write_packed_xyz

 uyva 16x16 -> yuva444p16le 16x16:
   [ u8 ++++] SWS_OP_READ         : 4 elem(s) packed >> 0
     min: {0 0 0 0}, max: {255 255 255 255}
-  [ u8 ++++] SWS_OP_CONVERT      : u8 -> f32
+  [ u8 ++++] SWS_OP_CONVERT      : u8 -> u16
     min: {0 0 0 0}, max: {255 255 255 255}
-  [f32 ++++] SWS_OP_SWIZZLE      : 1023
+  [u16 ++++] SWS_OP_SWIZZLE      : 1023
     min: {0 0 0 0}, max: {255 255 255 255}
-  [f32 ++++] SWS_OP_LINEAR       : diag4 [[256 0 0 0 0] [0 256 0 0 0] [0 0 256 
0 0] [0 0 0 257 0]]
-    min: {0 0 0 0}, max: {65280 65280 65280 65535}
-  [f32 ++++] SWS_OP_CONVERT      : f32 -> u16
+  [u16 ++++] SWS_OP_LINEAR       : diag4 [[256 0 0 0 0] [0 256 0 0 0] [0 0 256 
0 0] [0 0 0 257 0]]
     min: {0 0 0 0}, max: {65280 65280 65280 65535}
   [u16 XXXX] SWS_OP_WRITE        : 4 elem(s) planar >> 0
     (X = unused, z = byteswapped, + = exact, 0 = zero)
  translated micro-ops:
     u8_read_packed_xyzw
-    u8_to_f32_xyzw
-    u32_permute_xy_yx
-    f32_linear_xyzw_x0000_0x000_00x00_000x0
-    f32_to_u16_xyzw
+    u8_to_u16_xyzw
+    u16_permute_xy_yx
+    u16_linear_xyzw_x0000_0x000_00x00_000x0
     u16_write_planar_xyzw

Sponsored-by: Sovereign Tech Fund
Signed-off-by: Niklas Haas <[email protected]>
---
 libswscale/ops_optimizer.c  | 12 ++++++++++
 libswscale/uops_macros.h    | 44 ++++++++++++++++++++++++++++++-------
 tests/ref/fate/sws-ops-list |  2 +-
 3 files changed, 49 insertions(+), 9 deletions(-)

diff --git a/libswscale/ops_optimizer.c b/libswscale/ops_optimizer.c
index f76d1fe5c5..7c7e871480 100644
--- a/libswscale/ops_optimizer.c
+++ b/libswscale/ops_optimizer.c
@@ -803,6 +803,18 @@ retry:
                 goto retry;
             }
             break;
+
+        case SWS_OP_LINEAR:
+            /* Exact integer linear transformation */
+            if (next->op == SWS_OP_CONVERT &&
+                ff_sws_pixel_type_is_int(next->convert.to) &&
+                op_result_is_exact(op))
+            {
+                op->type = next->convert.to;
+                FFSWAP(SwsOp, *op, *next);
+                goto retry;
+            }
+            break;
         }
     }
 
diff --git a/libswscale/uops_macros.h b/libswscale/uops_macros.h
index f63d046aa3..083660f480 100644
--- a/libswscale/uops_macros.h
+++ b/libswscale/uops_macros.h
@@ -268,11 +268,13 @@
 #define SWS_FOR_U8_TO_U16(MACRO, ...) \
     MACRO(__VA_ARGS__, u8_to_u16_x                             , SWS_PIXEL_U8 
, SWS_UOP_TO_U16          , 0x1) \
     MACRO(__VA_ARGS__, u8_to_u16_xyz                           , SWS_PIXEL_U8 
, SWS_UOP_TO_U16          , 0x7) \
-    MACRO(__VA_ARGS__, u8_to_u16_yzw                           , SWS_PIXEL_U8 
, SWS_UOP_TO_U16          , 0xe)
+    MACRO(__VA_ARGS__, u8_to_u16_yzw                           , SWS_PIXEL_U8 
, SWS_UOP_TO_U16          , 0xe) \
+    MACRO(__VA_ARGS__, u8_to_u16_xyzw                          , SWS_PIXEL_U8 
, SWS_UOP_TO_U16          , 0xf)
 #define SWS_FOR_STRUCT_U8_TO_U16(MACRO, ...) \
     MACRO(__VA_ARGS__, u8_to_u16_x                             , .type = 
SWS_PIXEL_U8 , .uop = SWS_UOP_TO_U16          , .mask = 0x1) \
     MACRO(__VA_ARGS__, u8_to_u16_xyz                           , .type = 
SWS_PIXEL_U8 , .uop = SWS_UOP_TO_U16          , .mask = 0x7) \
-    MACRO(__VA_ARGS__, u8_to_u16_yzw                           , .type = 
SWS_PIXEL_U8 , .uop = SWS_UOP_TO_U16          , .mask = 0xe)
+    MACRO(__VA_ARGS__, u8_to_u16_yzw                           , .type = 
SWS_PIXEL_U8 , .uop = SWS_UOP_TO_U16          , .mask = 0xe) \
+    MACRO(__VA_ARGS__, u8_to_u16_xyzw                          , .type = 
SWS_PIXEL_U8 , .uop = SWS_UOP_TO_U16          , .mask = 0xf)
 #define SWS_FOR_U8_TO_U32(MACRO, ...) \
     MACRO(__VA_ARGS__, u8_to_u32_x                             , SWS_PIXEL_U8 
, SWS_UOP_TO_U32          , 0x1) \
     MACRO(__VA_ARGS__, u8_to_u32_xyz                           , SWS_PIXEL_U8 
, SWS_UOP_TO_U32          , 0x7)
@@ -359,8 +361,20 @@
     MACRO(__VA_ARGS__, u8_clear_xzw_1xx                        , .type = 
SWS_PIXEL_U8 , .uop = SWS_UOP_CLEAR           , .mask = 0xd, .par.clear.one = 
0x1, .par.clear.zero = 0x0) \
     MACRO(__VA_ARGS__, u8_clear_xzw_xx1                        , .type = 
SWS_PIXEL_U8 , .uop = SWS_UOP_CLEAR           , .mask = 0xd, .par.clear.one = 
0x8, .par.clear.zero = 0x0) \
     MACRO(__VA_ARGS__, u8_clear_yzw_xx1                        , .type = 
SWS_PIXEL_U8 , .uop = SWS_UOP_CLEAR           , .mask = 0xe, .par.clear.one = 
0x8, .par.clear.zero = 0x0)
-#define SWS_FOR_U8_LINEAR(MACRO, ...)
-#define SWS_FOR_STRUCT_U8_LINEAR(MACRO, ...)
+#define SWS_FOR_U8_LINEAR(MACRO, ...) \
+    MACRO(__VA_ARGS__, u8_linear_x_x000x                       , SWS_PIXEL_U8 
, SWS_UOP_LINEAR          , 0x1, 0x41040, 0xbefae) \
+    MACRO(__VA_ARGS__, u8_linear_x_x0001                       , SWS_PIXEL_U8 
, SWS_UOP_LINEAR          , 0x1, 0x41050, 0xbefae) \
+    MACRO(__VA_ARGS__, u8_linear_y_0x000                       , SWS_PIXEL_U8 
, SWS_UOP_LINEAR          , 0x2, 0x41001, 0xbefbe) \
+    MACRO(__VA_ARGS__, u8_linear_xyz_x000x_0x00x_00x0x         , SWS_PIXEL_U8 
, SWS_UOP_LINEAR          , 0x7, 0x40000, 0xbadae) \
+    MACRO(__VA_ARGS__, u8_linear_xyz_x0000_0x000_00x00         , SWS_PIXEL_U8 
, SWS_UOP_LINEAR          , 0x7, 0x40000, 0xbefbe) \
+    MACRO(__VA_ARGS__, u8_linear_xyz_x0001_0x00x_00x01         , SWS_PIXEL_U8 
, SWS_UOP_LINEAR          , 0x7, 0x44010, 0xbadae)
+#define SWS_FOR_STRUCT_U8_LINEAR(MACRO, ...) \
+    MACRO(__VA_ARGS__, u8_linear_x_x000x                       , .type = 
SWS_PIXEL_U8 , .uop = SWS_UOP_LINEAR          , .mask = 0x1, .par.lin.one = 
0x41040, .par.lin.zero = 0xbefae) \
+    MACRO(__VA_ARGS__, u8_linear_x_x0001                       , .type = 
SWS_PIXEL_U8 , .uop = SWS_UOP_LINEAR          , .mask = 0x1, .par.lin.one = 
0x41050, .par.lin.zero = 0xbefae) \
+    MACRO(__VA_ARGS__, u8_linear_y_0x000                       , .type = 
SWS_PIXEL_U8 , .uop = SWS_UOP_LINEAR          , .mask = 0x2, .par.lin.one = 
0x41001, .par.lin.zero = 0xbefbe) \
+    MACRO(__VA_ARGS__, u8_linear_xyz_x000x_0x00x_00x0x         , .type = 
SWS_PIXEL_U8 , .uop = SWS_UOP_LINEAR          , .mask = 0x7, .par.lin.one = 
0x40000, .par.lin.zero = 0xbadae) \
+    MACRO(__VA_ARGS__, u8_linear_xyz_x0000_0x000_00x00         , .type = 
SWS_PIXEL_U8 , .uop = SWS_UOP_LINEAR          , .mask = 0x7, .par.lin.one = 
0x40000, .par.lin.zero = 0xbefbe) \
+    MACRO(__VA_ARGS__, u8_linear_xyz_x0001_0x00x_00x01         , .type = 
SWS_PIXEL_U8 , .uop = SWS_UOP_LINEAR          , .mask = 0x7, .par.lin.one = 
0x44010, .par.lin.zero = 0xbadae)
 #define SWS_FOR_U8_LINEAR_FMA(MACRO, ...)
 #define SWS_FOR_STRUCT_U8_LINEAR_FMA(MACRO, ...)
 #define SWS_FOR_U8_DITHER(MACRO, ...)
@@ -687,8 +701,16 @@
     MACRO(__VA_ARGS__, u16_clear_xzw_1xx                       , .type = 
SWS_PIXEL_U16, .uop = SWS_UOP_CLEAR           , .mask = 0xd, .par.clear.one = 
0x1, .par.clear.zero = 0x0) \
     MACRO(__VA_ARGS__, u16_clear_yzw_xxx                       , .type = 
SWS_PIXEL_U16, .uop = SWS_UOP_CLEAR           , .mask = 0xe, .par.clear.one = 
0x0, .par.clear.zero = 0x0) \
     MACRO(__VA_ARGS__, u16_clear_yzw_xx1                       , .type = 
SWS_PIXEL_U16, .uop = SWS_UOP_CLEAR           , .mask = 0xe, .par.clear.one = 
0x8, .par.clear.zero = 0x0)
-#define SWS_FOR_U16_LINEAR(MACRO, ...)
-#define SWS_FOR_STRUCT_U16_LINEAR(MACRO, ...)
+#define SWS_FOR_U16_LINEAR(MACRO, ...) \
+    MACRO(__VA_ARGS__, u16_linear_x_x000x                      , 
SWS_PIXEL_U16, SWS_UOP_LINEAR          , 0x1, 0x41040, 0xbefae) \
+    MACRO(__VA_ARGS__, u16_linear_xyz_x000x_0x00x_00x0x        , 
SWS_PIXEL_U16, SWS_UOP_LINEAR          , 0x7, 0x40000, 0xbadae) \
+    MACRO(__VA_ARGS__, u16_linear_xyz_x0000_0x000_00x00        , 
SWS_PIXEL_U16, SWS_UOP_LINEAR          , 0x7, 0x40000, 0xbefbe) \
+    MACRO(__VA_ARGS__, u16_linear_xyzw_x0000_0x000_00x00_000x0 , 
SWS_PIXEL_U16, SWS_UOP_LINEAR          , 0xf, 0x00000, 0xbefbe)
+#define SWS_FOR_STRUCT_U16_LINEAR(MACRO, ...) \
+    MACRO(__VA_ARGS__, u16_linear_x_x000x                      , .type = 
SWS_PIXEL_U16, .uop = SWS_UOP_LINEAR          , .mask = 0x1, .par.lin.one = 
0x41040, .par.lin.zero = 0xbefae) \
+    MACRO(__VA_ARGS__, u16_linear_xyz_x000x_0x00x_00x0x        , .type = 
SWS_PIXEL_U16, .uop = SWS_UOP_LINEAR          , .mask = 0x7, .par.lin.one = 
0x40000, .par.lin.zero = 0xbadae) \
+    MACRO(__VA_ARGS__, u16_linear_xyz_x0000_0x000_00x00        , .type = 
SWS_PIXEL_U16, .uop = SWS_UOP_LINEAR          , .mask = 0x7, .par.lin.one = 
0x40000, .par.lin.zero = 0xbefbe) \
+    MACRO(__VA_ARGS__, u16_linear_xyzw_x0000_0x000_00x00_000x0 , .type = 
SWS_PIXEL_U16, .uop = SWS_UOP_LINEAR          , .mask = 0xf, .par.lin.one = 
0x0, .par.lin.zero = 0xbefbe)
 #define SWS_FOR_U16_LINEAR_FMA(MACRO, ...)
 #define SWS_FOR_STRUCT_U16_LINEAR_FMA(MACRO, ...)
 #define SWS_FOR_U16_DITHER(MACRO, ...)
@@ -967,8 +989,14 @@
     MACRO(__VA_ARGS__, u32_clear_yw_xx                         , .type = 
SWS_PIXEL_U32, .uop = SWS_UOP_CLEAR           , .mask = 0xa, .par.clear.one = 
0x0, .par.clear.zero = 0x0) \
     MACRO(__VA_ARGS__, u32_clear_xyw_xxx                       , .type = 
SWS_PIXEL_U32, .uop = SWS_UOP_CLEAR           , .mask = 0xb, .par.clear.one = 
0x0, .par.clear.zero = 0x0) \
     MACRO(__VA_ARGS__, u32_clear_xzw_xxx                       , .type = 
SWS_PIXEL_U32, .uop = SWS_UOP_CLEAR           , .mask = 0xd, .par.clear.one = 
0x0, .par.clear.zero = 0x0)
-#define SWS_FOR_U32_LINEAR(MACRO, ...)
-#define SWS_FOR_STRUCT_U32_LINEAR(MACRO, ...)
+#define SWS_FOR_U32_LINEAR(MACRO, ...) \
+    MACRO(__VA_ARGS__, u32_linear_x_x000x                      , 
SWS_PIXEL_U32, SWS_UOP_LINEAR          , 0x1, 0x41040, 0xbefae) \
+    MACRO(__VA_ARGS__, u32_linear_xyz_x000x_0x00x_00x0x        , 
SWS_PIXEL_U32, SWS_UOP_LINEAR          , 0x7, 0x40000, 0xbadae) \
+    MACRO(__VA_ARGS__, u32_linear_xyz_x0000_0x000_00x00        , 
SWS_PIXEL_U32, SWS_UOP_LINEAR          , 0x7, 0x40000, 0xbefbe)
+#define SWS_FOR_STRUCT_U32_LINEAR(MACRO, ...) \
+    MACRO(__VA_ARGS__, u32_linear_x_x000x                      , .type = 
SWS_PIXEL_U32, .uop = SWS_UOP_LINEAR          , .mask = 0x1, .par.lin.one = 
0x41040, .par.lin.zero = 0xbefae) \
+    MACRO(__VA_ARGS__, u32_linear_xyz_x000x_0x00x_00x0x        , .type = 
SWS_PIXEL_U32, .uop = SWS_UOP_LINEAR          , .mask = 0x7, .par.lin.one = 
0x40000, .par.lin.zero = 0xbadae) \
+    MACRO(__VA_ARGS__, u32_linear_xyz_x0000_0x000_00x00        , .type = 
SWS_PIXEL_U32, .uop = SWS_UOP_LINEAR          , .mask = 0x7, .par.lin.one = 
0x40000, .par.lin.zero = 0xbefbe)
 #define SWS_FOR_U32_LINEAR_FMA(MACRO, ...)
 #define SWS_FOR_STRUCT_U32_LINEAR_FMA(MACRO, ...)
 #define SWS_FOR_U32_DITHER(MACRO, ...)
diff --git a/tests/ref/fate/sws-ops-list b/tests/ref/fate/sws-ops-list
index a9c4844487..28206463a7 100644
--- a/tests/ref/fate/sws-ops-list
+++ b/tests/ref/fate/sws-ops-list
@@ -1 +1 @@
-9283f06b12ed88d36b9de29e3be003aa
+0d88869d128750c6e311ef137ead0678
-- 
2.52.0


>From 012413ea26a21f53de89096ab048095dffbeeeff Mon Sep 17 00:00:00 2001
From: Niklas Haas <[email protected]>
Date: Thu, 11 Jun 2026 18:59:38 +0200
Subject: [PATCH 13/14] swscale/uops: rename misleading mat4 to mat4x5

This used to be a 4x4 matrix at some point during development.

Sponsored-by: Sovereign Tech Fund
Signed-off-by: Niklas Haas <[email protected]>
---
 libswscale/uops.c      | 2 +-
 libswscale/uops.h      | 2 +-
 libswscale/uops_tmpl.c | 4 ++--
 libswscale/x86/ops.c   | 2 +-
 4 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/libswscale/uops.c b/libswscale/uops.c
index f78bec0f94..89b39cc846 100644
--- a/libswscale/uops.c
+++ b/libswscale/uops.c
@@ -721,7 +721,7 @@ static int translate_linear_op(SwsContext *ctx, SwsUOpList 
*ops,
         for (int j = 0; j < 5; j++) {
             const AVRational k = op->lin.m[i][j];
             const SwsPixel px = Q2PIXEL(k);
-            uop.data.mat4[i][j] = px;
+            uop.data.mat4x5[i][j] = px;
             if (k.num == 0)
                 uop.par.lin.zero |= SWS_MASK(i, j);
             else if (k.num == k.den)
diff --git a/libswscale/uops.h b/libswscale/uops.h
index eac8ee191e..5860206b3d 100644
--- a/libswscale/uops.h
+++ b/libswscale/uops.h
@@ -211,7 +211,7 @@ typedef struct SwsUOp {
         SwsPixel *ptr;              /* refstruct */
         SwsPixel scalar;
         SwsPixel vec4[4];
-        SwsPixel mat4[4][5];        /* row major */
+        SwsPixel mat4x5[4][5];      /* row major */
         void *opaque;               /* reserved for internal use */
     } data;
 } SwsUOp;
diff --git a/libswscale/uops_tmpl.c b/libswscale/uops_tmpl.c
index 2fb241f1fe..7e5fd4633e 100644
--- a/libswscale/uops_tmpl.c
+++ b/libswscale/uops_tmpl.c
@@ -752,8 +752,8 @@ DECL_SETUP(setup_linear, params, out)
 
     for (int i = 0; i < 4; i++) {
         for (int j = 0; j < 4; j++)
-            c.m[i][j] = uop->data.mat4[i][j].px;
-        c.k[i] = uop->data.mat4[i][4].px;
+            c.m[i][j] = uop->data.mat4x5[i][j].px;
+        c.k[i] = uop->data.mat4x5[i][4].px;
     }
 
     out->priv.ptr = av_memdup(&c, sizeof(c));
diff --git a/libswscale/x86/ops.c b/libswscale/x86/ops.c
index 1adb73e21c..761a31172f 100644
--- a/libswscale/x86/ops.c
+++ b/libswscale/x86/ops.c
@@ -280,7 +280,7 @@ static int setup_dither(const SwsImplParams *params, 
SwsImplResult *out)
 static int setup_linear(const SwsImplParams *params, SwsImplResult *out)
 {
     const SwsUOp *uop = params->uop;
-    out->priv.ptr = av_memdup(uop->data.mat4, sizeof(uop->data.mat4));
+    out->priv.ptr = av_memdup(uop->data.mat4x5, sizeof(uop->data.mat4x5));
     out->free = ff_op_priv_free;
     return out->priv.ptr ? 0 : AVERROR(ENOMEM);
 }
-- 
2.52.0


>From c38e78fa7277ca3ae17be6086285b9a8703666aa Mon Sep 17 00:00:00 2001
From: Niklas Haas <[email protected]>
Date: Thu, 11 Jun 2026 20:00:32 +0200
Subject: [PATCH 14/14] swscale/x86/ops_int: implement integer linear
 transformations

This is a bit inefficient for the hyper-special case of e.g. a single
integer multiplication only. In particular, the AVX2 path actually ends up
slower than the SSE4 path, but we still need to implement it because the
block size needs to match the rest of the chain.

I plan on maybe adding a special case to cover the isolated / single
component case down the line.

  name                                                cycles (vs ref)
  u8_linear_x_x000x_c:                                1043.7
  u8_linear_x_x000x_x86_sse4:                          425.1 ( 2.45x)
  u8_linear_x_x000x_x86_avx2:                          946.7 ( 1.10x)
  u8_linear_x_x0001_c:                                1048.4
  u8_linear_x_x0001_x86_sse4:                          432.8 ( 2.40x)
  u8_linear_x_x0001_x86_avx2:                          921.9 ( 1.11x)
  u8_linear_xyz_x000x_0x00x_00x0x_c:                  1188.1
  u8_linear_xyz_x000x_0x00x_00x0x_x86_sse4:            773.8 ( 1.53x)
  u8_linear_xyz_x000x_0x00x_00x0x_x86_avx2:            587.2 ( 1.74x)
  u8_linear_xyz_x0000_0x000_00x00_c:                  1148.1
  u8_linear_xyz_x0000_0x000_00x00_x86_sse4:            737.0 ( 1.55x)
  u8_linear_xyz_x0000_0x000_00x00_x86_avx2:            913.9 ( 1.25x)
  u8_linear_xyz_x0001_0x00x_00x01_c:                  1185.8
  u8_linear_xyz_x0001_0x00x_00x01_x86_sse4:            767.5 ( 1.54x)
  u8_linear_xyz_x0001_0x00x_00x01_x86_avx2:            723.5 ( 1.49x)
  u8_linear_y_0x000_c:                                1039.1
  u8_linear_y_0x000_x86_sse4:                          406.2 ( 2.55x)
  u8_linear_y_0x000_x86_avx2:                          933.0 ( 1.11x)
  u16_linear_x_x000x_c:                               1205.8
  u16_linear_x_x000x_x86_avx2:                         592.9 ( 2.02x)
  u16_linear_xyz_x000x_0x00x_00x0x_c:                 1421.9
  u16_linear_xyz_x000x_0x00x_00x0x_x86_avx2:           493.3 ( 2.88x)
  u16_linear_xyz_x0000_0x000_00x00_c:                 1319.6
  u16_linear_xyz_x0000_0x000_00x00_x86_avx2:           506.7 ( 2.60x)
  u16_linear_xyzw_x0000_0x000_00x00_000x0_c:          1413.4
  u16_linear_xyzw_x0000_0x000_00x00_000x0_x86_avx2:    467.1 ( 3.02x)

Sponsored-by: Sovereign Tech Fund
Signed-off-by: Niklas Haas <[email protected]>
---
 libswscale/x86/ops.c             |  47 +++++++++++++-
 libswscale/x86/ops_float.asm     |  12 ++--
 libswscale/x86/ops_include.asm   |  11 +++-
 libswscale/x86/ops_int.asm       | 101 +++++++++++++++++++++++++++++++
 libswscale/x86/uops_macros.asm.h |   1 +
 5 files changed, 163 insertions(+), 9 deletions(-)

diff --git a/libswscale/x86/ops.c b/libswscale/x86/ops.c
index 761a31172f..eb96c97929 100644
--- a/libswscale/x86/ops.c
+++ b/libswscale/x86/ops.c
@@ -277,7 +277,49 @@ static int setup_dither(const SwsImplParams *params, 
SwsImplResult *out)
     return 0;
 }
 
+static void splat_lane(void *dst, SwsPixelType type, SwsPixel px)
+{
+    switch (ff_sws_pixel_type_size(type)) {
+    case 1:
+        memset(dst, px.u8, 16);
+        break;
+    case 2:
+        for (int i = 0; i < 8; i++)
+            ((uint16_t *) dst)[i] = px.u16;
+        break;
+    case 4:
+        for (int i = 0; i < 4; i++)
+            ((uint32_t *) dst)[i] = px.u32;
+        break;
+    }
+}
+
 static int setup_linear(const SwsImplParams *params, SwsImplResult *out)
+{
+    uint8_t *mat = av_malloc(4 * 5 * 16); /* one lane per component */
+    if (!mat)
+        return AVERROR(ENOMEM);
+    out->priv.ptr = mat;
+    out->free = ff_op_priv_free;
+
+    const SwsUOp *uop = params->uop;
+    for (int i = 0; i < 4; i++) {
+        for (int j = 0; j < 5; j++) {
+            SwsPixel px = uop->data.mat4x5[i][j];
+            SwsPixelType type = uop->type;
+            if (type == SWS_PIXEL_U8) {
+                type = SWS_PIXEL_U16; /* for pmullw */
+                px.u16 = (px.u8 << 8) | px.u8;
+            }
+
+            splat_lane(mat, type, px);
+            mat += 16;
+        }
+    }
+    return 0;
+}
+
+static int setup_linear_fma(const SwsImplParams *params, SwsImplResult *out)
 {
     const SwsUOp *uop = params->uop;
     out->priv.ptr = av_memdup(uop->data.mat4x5, sizeof(uop->data.mat4x5));
@@ -326,7 +368,8 @@ SWS_FOR_STRUCT(TYPE, UNPACK,          DECL_ENTRY, EXT, 
NULL, NULL)
 SWS_FOR_STRUCT(TYPE, PACK,            DECL_ENTRY, EXT, NULL, NULL)             
 \
 SWS_FOR_STRUCT(TYPE, LSHIFT,          DECL_ENTRY, EXT, NULL, NULL)             
 \
 SWS_FOR_STRUCT(TYPE, RSHIFT,          DECL_ENTRY, EXT, NULL, NULL)             
 \
-SWS_FOR_STRUCT(TYPE, LINEAR_FMA,      DECL_ENTRY, EXT, NULL, setup_linear)     
 \
+SWS_FOR_STRUCT(TYPE, LINEAR,          DECL_ENTRY, EXT, NULL, setup_linear)     
 \
+SWS_FOR_STRUCT(TYPE, LINEAR_FMA,      DECL_ENTRY, EXT, NULL, setup_linear_fma) 
 \
 SWS_FOR_STRUCT(TYPE, DITHER,          DECL_ENTRY, EXT, NULL, setup_dither)     
 \
 /* end of macro */
 
@@ -366,6 +409,7 @@ static const SwsOpTable ops_u8##EXT = {
         SWS_FOR(U8, READ_PLANAR,    REF_ENTRY, EXT)                            
 \
         SWS_FOR(U8, WRITE_PLANAR,   REF_ENTRY, EXT)                            
 \
         SWS_FOR(U8, CLEAR,          REF_ENTRY, EXT)                            
 \
+        SWS_FOR(U8, LINEAR,         REF_ENTRY, EXT)                            
 \
         NULL                                                                   
 \
     },                                                                         
 \
 };
@@ -381,6 +425,7 @@ static const SwsOpTable ops_u16##EXT = {
     .block_size = SIZE,                                                        
 \
     .entries = {                                                               
 \
         REF_OPS_COMMON(EXT, U16)                                               
 \
+        SWS_FOR(U16, LINEAR, REF_ENTRY, EXT)                                   
 \
         SWS_FOR(U8,  TO_U16, REF_ENTRY, EXT)                                   
 \
         SWS_FOR(U16, TO_U8,  REF_ENTRY, EXT)                                   
 \
         SWS_FOR(U8,  EXPAND_PAIR, REF_ENTRY, EXT)                              
 \
diff --git a/libswscale/x86/ops_float.asm b/libswscale/x86/ops_float.asm
index 13f110b20b..adad237be2 100644
--- a/libswscale/x86/ops_float.asm
+++ b/libswscale/x86/ops_float.asm
@@ -534,9 +534,7 @@ IF W,   maxps mw2, m11
 ;---------------------------------------------------------
 ; Linear operations
 
-%define LIN_MASK(I, J) (1 << (5 * (I) + (J)))
-
-%macro linear_muladd 5 ; dst, src, use_coef, coef, use_fma
+%macro linear_muladdps 5 ; dst, src, use_coef, coef, use_fma
     %if INIT ; dst is already initialized
         %if %3 && %5
             fmaddps %1, %4, %2, %1
@@ -570,10 +568,10 @@ IF LOAD(0), vbroadcastss m12, [%2 + 0 * BYTES]
 IF LOAD(1), vbroadcastss m13, [%2 + 1 * BYTES]
 IF LOAD(2), vbroadcastss m14, [%2 + 2 * BYTES]
 IF LOAD(3), vbroadcastss m15, [%2 + 3 * BYTES]
-IF NEED(0), linear_muladd %1, mx%4, LOAD(0), m12, FMA(0)
-IF NEED(1), linear_muladd %1, my%4, LOAD(1), m13, FMA(1)
-IF NEED(2), linear_muladd %1, mz%4, LOAD(2), m14, FMA(2)
-IF NEED(3), linear_muladd %1, mw%4, LOAD(3), m15, FMA(3)
+IF NEED(0), linear_muladdps %1, mx%4, LOAD(0), m12, FMA(0)
+IF NEED(1), linear_muladdps %1, my%4, LOAD(1), m13, FMA(1)
+IF NEED(2), linear_muladdps %1, mz%4, LOAD(2), m14, FMA(2)
+IF NEED(3), linear_muladdps %1, mw%4, LOAD(3), m15, FMA(3)
             assert INIT, SWS_UOP_LINEAR should not contain empty rows
 %endmacro
 
diff --git a/libswscale/x86/ops_include.asm b/libswscale/x86/ops_include.asm
index 073ed31e57..85777d7529 100644
--- a/libswscale/x86/ops_include.asm
+++ b/libswscale/x86/ops_include.asm
@@ -146,6 +146,9 @@ endstruc
 %define SWS_COMP_INV(mask)      ((mask) ^ SWS_COMP_ALL)
 %define SWS_COMP_ELEMS(N)       ((1 << (N)) - 1)
 
+%define LIN_MASK(I, J) (1 << (5 * (I) + (J)))
+%define LIN_COL(J) (LIN_MASK(0, J) | LIN_MASK(1, J) | LIN_MASK(2, J) | LIN_MASK(3, J))
+
 ;---------------------------------------------------------
 ; Common macros for declaring operations
 
@@ -326,13 +329,19 @@ endstruc
     %endif
 %endmacro
 
-; Alternate name; for nested usage (to work around NASM limitations)
+; Alternate names; for nested usage (to work around NASM limitations)
 %macro IF1 2+
     %if %1
         %2
     %endif
 %endmacro
 
+%macro IF2 2+ ; cond, code: emits `code` only when `cond` is nonzero (same body as IF1, second name for nesting)
+    %if %1
+        %2
+    %endif
+%endmacro
+
 %macro shl_log2 2 ; dst, amount
     %if %2 == 64
         shl %1, 6
diff --git a/libswscale/x86/ops_int.asm b/libswscale/x86/ops_int.asm
index 111e6d0796..4e71917c01 100644
--- a/libswscale/x86/ops_int.asm
+++ b/libswscale/x86/ops_int.asm
@@ -709,6 +709,106 @@ assert 0, SWS_UOP_LINEAR_FMA is not implemented for integer types
 assert 0, SWS_UOP_DITHER is not implemented for integer types
 %endmacro
 
+;---------------------------------------------------------
+; Linear operations
+
+%macro linear_muladdw 4 ; dst, src, use_coef, coef
+    %if INIT ; dst is already initialized
+        %if %3
+            pmullw %4, %2 ; coef *= src (the coefficient register is overwritten)
+            paddw %1, %4 ; dst += coef * src
+        %else
+            paddw %1, %2 ; no multiply requested: dst += src
+        %endif
+    %else
+        %assign INIT 1 ; the first term written initializes dst
+        %if %3
+            pmullw %1, %2, %4 ; dst = src * coef
+        %else
+            mova %1, %2 ; dst = src
+        %endif
+    %endif
+%endmacro
+
+%macro linear_row 3 ; dst, src (address of the row's first coefficient lane), row
+%xdefine NEED(J) (!(ZERO_MASK & LIN_MASK(%3, J)))
+%xdefine LOAD(J) (NEED(J) && !(ONE_MASK & LIN_MASK(%3, J)))
+%assign INIT 0 ; track whether `dst` already contains data
+
+    %if !(ZERO_MASK & LIN_MASK(%3, 4)) ; nonzero output offset (column 4 of the row)
+            %assign INIT 1
+            VBROADCASTI128 %1, [%2 + 4 * 16] ; start the sum from the offset lane
+    %endif
+IF LOAD(0), VBROADCASTI128 m12, [%2 + 0 * 16]
+IF LOAD(1), VBROADCASTI128 m13, [%2 + 1 * 16]
+IF LOAD(2), VBROADCASTI128 m14, [%2 + 2 * 16]
+IF LOAD(3), VBROADCASTI128 m15, [%2 + 3 * 16]
+IF NEED(0), linear_muladdw %1, IN0, LOAD(0), m12
+IF NEED(1), linear_muladdw %1, IN1, LOAD(1), m13
+IF NEED(2), linear_muladdw %1, IN2, LOAD(2), m14
+IF NEED(3), linear_muladdw %1, IN3, LOAD(3), m15
+            assert INIT, SWS_UOP_LINEAR should not contain empty rows
+%endmacro
+
+; Per 16-bit word: dst = (dst >> 8) | (out << 8); when have_out is 0, `out` is first loaded with dst << 8 (plain byte swap)
+%macro linear_rot 4 ; have_out, need_in, dst, out
+    %if %1 || %2 ; we also need to rotate pure input registers
+        %if %1
+            psllw %4, 8 ; move the low byte of `out` into the high byte
+        %else
+            psllw %4, %3, 8 ; no output for this component: use `out` as scratch for dst << 8
+        %endif
+            psrlw %3, 8 ; move the high byte of `dst` into the low byte
+            por %3, %4 ; merge both halves back into `dst`
+    %endif
+%endmacro
+
+%macro linear_pass 0-1 ; suffix appended to the mx/my/mz/mw register names
+%xdefine USED(J) ((X && !(ZERO_MASK & LIN_MASK(0, J))) || (Y && !(ZERO_MASK & LIN_MASK(1, J))) || (Z && !(ZERO_MASK & LIN_MASK(2, J))) || (W && !(ZERO_MASK & LIN_MASK(3, J))))
+%xdefine IN0 mx%1
+%xdefine IN1 my%1
+%xdefine IN2 mz%1
+%xdefine IN3 mw%1
+
+IF1 X,  linear_row m8,  tmp0q +  0 * 16, 0
+IF1 Y,  linear_row m9,  tmp0q +  5 * 16, 1
+IF1 Z,  linear_row m10, tmp0q + 10 * 16, 2
+IF1 W,  linear_row m11, tmp0q + 15 * 16, 3
+
+    %if BITS == 8
+        ; swap high/low bytes and redo the rows on the other half (dropping each
+        ; sub-pass's garbage high byte); inputs read by any computed row rotate too
+        linear_rot X, USED(0), IN0, m8
+        linear_rot Y, USED(1), IN1, m9
+        linear_rot Z, USED(2), IN2, m10
+        linear_rot W, USED(3), IN3, m11
+IF1 X,  linear_row m8,  tmp0q +  0 * 16, 0
+IF1 Y,  linear_row m9,  tmp0q +  5 * 16, 1
+IF1 Z,  linear_row m10, tmp0q + 10 * 16, 2
+IF1 W,  linear_row m11, tmp0q + 15 * 16, 3
+        linear_rot X, USED(0), IN0, m8
+        linear_rot Y, USED(1), IN1, m9
+        linear_rot Z, USED(2), IN2, m10
+        linear_rot W, USED(3), IN3, m11
+    %else
+IF X,   mova IN0, m8
+IF Y,   mova IN1, m9
+IF Z,   mova IN2, m10
+IF W,   mova IN3, m11
+    %endif
+%endmacro
+
+%macro LINEAR 2 ; one_mask, zero_mask (bit LIN_MASK(i, j) describes matrix entry [i][j])
+%assign ONE_MASK   %1 ; set bit: the term is added without loading or multiplying by a coefficient
+%assign ZERO_MASK  %2 ; set bit: the term is skipped entirely
+
+        mov tmp0q, [implq + SwsOpImpl.priv] ; address of matrix
+        LOAD_CONT tmp1q
+        linear_pass
+IF2 V2, linear_pass 2
+        CONTINUE tmp1q
+%endmacro
+
 ;---------------------------------------------------------
 ; Instantiate above macros to generate all uop kernels
 
@@ -732,6 +832,7 @@ assert 0, SWS_UOP_DITHER is not implemented for integer types
     DECL_%1_RSHIFT          (RSHIFT)
     DECL_%1_LINEAR_FMA      (LINEAR_FMA)
     DECL_%1_DITHER          (DITHER)
+    DECL_%1_LINEAR          (LINEAR)
 %endmacro
 
 %macro decl_type_invariant 0
diff --git a/libswscale/x86/uops_macros.asm.h b/libswscale/x86/uops_macros.asm.h
index d9565d12f2..e8b9a2c4a3 100644
--- a/libswscale/x86/uops_macros.asm.h
+++ b/libswscale/x86/uops_macros.asm.h
@@ -71,6 +71,7 @@
     {DEF_MACRO(TO_U32,              TYPE)}, \
     {DEF_MACRO(TO_F32,              TYPE)}, \
     {DEF_MACRO(SCALE,               TYPE)}, \
+    {DEF_MACRO(LINEAR,              TYPE)}, \
     {DEF_MACRO(LINEAR_FMA,          TYPE)}, \
     {DEF_MACRO(ADD,                 TYPE)}, \
     {DEF_MACRO(MIN,                 TYPE)}, \
-- 
2.52.0

_______________________________________________
ffmpeg-devel mailing list -- [email protected]
To unsubscribe send an email to [email protected]

[FFmpeg-devel] [PR] swscale: add support for integer linear ops (PR #23461)

Reply via email to