PR #23235 opened by Lynne
URL: https://code.ffmpeg.org/FFmpeg/FFmpeg/pulls/23235
Patch URL: https://code.ffmpeg.org/FFmpeg/FFmpeg/pulls/23235.patch


>From c6bf1adb78870477c359f9c48ebdaacbfd3ea70b Mon Sep 17 00:00:00 2001
From: Lynne <[email protected]>
Date: Tue, 26 May 2026 02:26:20 +0900
Subject: [PATCH 1/9] vulkan/rangecoder: fix encoding issue when -1 != 0xFF

This was an oversight while microoptimizing. 0xFF doubled as the "not yet
initialized" marker for outstanding_byte, but outstanding_byte can also
legitimately reach 0xFF in some situations, which was causing errors when
encoding, particularly with 32-bit floats.

Sponsored-by: Sovereign Tech Fund
---
 libavcodec/vulkan/rangecoder.glsl | 24 +++++++++++++-----------
 1 file changed, 13 insertions(+), 11 deletions(-)

diff --git a/libavcodec/vulkan/rangecoder.glsl 
b/libavcodec/vulkan/rangecoder.glsl
index 17741793f0..cbb346da8d 100644
--- a/libavcodec/vulkan/rangecoder.glsl
+++ b/libavcodec/vulkan/rangecoder.glsl
@@ -46,7 +46,9 @@ struct RangeCoder {
     uint     low;
     uint     range;
     uint16_t outstanding_count;
-    uint8_t  outstanding_byte;
+    /* note: -1 is the value at init, and it has a different meaning from 0xFF,
+     * so we have to handle both, meaning we have to keep this signed */
+    int16_t  outstanding_byte;
 };
 
 shared RangeCoder rc;
@@ -61,29 +63,29 @@ void rac_init(uint bs_start, uint bs_len)
     rc.low = 0;
     rc.range = 0xFF00;
     rc.outstanding_count = uint16_t(0);
-    rc.outstanding_byte = uint8_t(0xFF);
+    rc.outstanding_byte = int16_t(-1);
 }
 
 #ifdef FULL_RENORM
-/* Full renorm version that can handle outstanding_byte == 0xFF */
+/* Full renorm version that can handle the initial outstanding_byte < 0 */
 void renorm_encoder(void)
 {
-    if (rc.outstanding_byte == 0xFF) {
-        rc.outstanding_byte = uint8_t(rc.low >> 8);
+    if (rc.outstanding_byte < int16_t(0)) {
+        rc.outstanding_byte = int16_t(rc.low >> 8);
     } else if (rc.low <= 0xFF00) {
-        slice_data[rc.bs_off++].v = rc.outstanding_byte;
+        slice_data[rc.bs_off++].v = uint8_t(rc.outstanding_byte);
         uint16_t cnt = rc.outstanding_count;
         for (; cnt > 0; cnt--)
             slice_data[rc.bs_off++].v = uint8_t(0xFF);
         rc.outstanding_count = uint16_t(0);
-        rc.outstanding_byte = uint8_t(rc.low >> 8);
+        rc.outstanding_byte = int16_t(rc.low >> 8);
     } else if (rc.low >= 0x10000) {
-        slice_data[rc.bs_off++].v = rc.outstanding_byte + uint8_t(1);
+        slice_data[rc.bs_off++].v = uint8_t(rc.outstanding_byte) + uint8_t(1);
         uint16_t cnt = rc.outstanding_count;
         for (; cnt > 0; cnt--)
             slice_data[rc.bs_off++].v = uint8_t(0x00);
         rc.outstanding_count = uint16_t(0);
-        rc.outstanding_byte = uint8_t(bitfieldExtract(rc.low, 8, 8));
+        rc.outstanding_byte = int16_t(bitfieldExtract(rc.low, 8, 8));
     } else {
         rc.outstanding_count++;
     }
@@ -108,10 +110,10 @@ void renorm_encoder(void)
         return;
     }
 
-    uint8_t outstanding_byte = rc.outstanding_byte;
+    uint8_t outstanding_byte = uint8_t(rc.outstanding_byte);
 
     rc.outstanding_count = uint16_t(0);
-    rc.outstanding_byte  = uint8_t(low >> 8);
+    rc.outstanding_byte  = int16_t(low >> 8);
 
     uint8_t obs = uint8_t(low > 0xFF00);
     uint8_t fill = obs - uint8_t(1); /* unsigned underflow */
-- 
2.52.0


>From e7e7cfffbba4e2df5f692fc57edba364c071117e Mon Sep 17 00:00:00 2001
From: Lynne <[email protected]>
Date: Tue, 26 May 2026 11:29:40 +0900
Subject: [PATCH 2/9] vulkan/ffv1_enc_remap: clear the full 65536-entry fltmap

Float pixfmts are meant to be normalized to [0, 1], but if they were not and
negative numbers were present, the upper half of the map (the entries whose
index has the sign bit set) was never cleared and would contain garbage.

Sponsored-by: Sovereign Tech Fund
---
 libavcodec/vulkan/ffv1_enc_remap.comp.glsl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/libavcodec/vulkan/ffv1_enc_remap.comp.glsl 
b/libavcodec/vulkan/ffv1_enc_remap.comp.glsl
index 5bef2eac13..bc905a6695 100644
--- a/libavcodec/vulkan/ffv1_enc_remap.comp.glsl
+++ b/libavcodec/vulkan/ffv1_enc_remap.comp.glsl
@@ -45,7 +45,7 @@ void load_fltmap(uint slice_idx, uint p)
     uint sye = slice_coord(img_size.y, gl_WorkGroupID.y + 1,
                            gl_NumWorkGroups.y, 0);
 
-    for (uint i = gl_LocalInvocationIndex; i < 32768;
+    for (uint i = gl_LocalInvocationIndex; i < 65536u;
          i += (gl_WorkGroupSize.x * gl_WorkGroupSize.y))
         fltmap[slice_idx][p][i] = 0;
 
-- 
2.52.0


>From b9ae64695135b614b3ea5ee70b7d077901fcc592 Mon Sep 17 00:00:00 2001
From: Lynne <[email protected]>
Date: Tue, 26 May 2026 11:30:31 +0900
Subject: [PATCH 3/9] vulkan/ffv1_enc: pass the correct base and offset to
 OFFBUF in init_golomb

My previous fix for this was only correct in some cases; this is a general
fix.

Sponsored-by: Sovereign Tech Fund
---
 libavcodec/vulkan/ffv1_enc.comp.glsl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/libavcodec/vulkan/ffv1_enc.comp.glsl 
b/libavcodec/vulkan/ffv1_enc.comp.glsl
index 85c31d7482..38119780f7 100644
--- a/libavcodec/vulkan/ffv1_enc.comp.glsl
+++ b/libavcodec/vulkan/ffv1_enc.comp.glsl
@@ -149,7 +149,7 @@ PutBitContext pb;
 void init_golomb(void)
 {
     hdr_len = rac_terminate();
-    init_put_bits(pb, OFFBUF(u8buf, rc.bs_start, hdr_len),
+    init_put_bits(pb, OFFBUF(u8buf, slice_data, rc.bs_start + hdr_len),
                   slice_size_max - hdr_len);
 }
 
-- 
2.52.0


>From 662e816fba854009e54d6b5a952457bfd254aa6a Mon Sep 17 00:00:00 2001
From: Lynne <[email protected]>
Date: Tue, 26 May 2026 11:30:58 +0900
Subject: [PATCH 4/9] vulkan/ffv1_enc: skip GOLOMB encode_line when !bits for
 FLOAT formats

Same as the arithmetic coded path. The check had been left out of the Golomb
path.

Sponsored-by: Sovereign Tech Fund
---
 libavcodec/vulkan/ffv1_enc.comp.glsl | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/libavcodec/vulkan/ffv1_enc.comp.glsl 
b/libavcodec/vulkan/ffv1_enc.comp.glsl
index 38119780f7..1381c041db 100644
--- a/libavcodec/vulkan/ffv1_enc.comp.glsl
+++ b/libavcodec/vulkan/ffv1_enc.comp.glsl
@@ -164,6 +164,9 @@ void encode_line(in SliceContext sc, readonly uimage2D img, 
uint state_off,
         w = ceil_rshift(w, chroma_shift.x);
         sp >>= chroma_shift;
     }
+#elif defined(FLOAT)
+    if (bits == 0)
+        return;
 #endif
 
     linecache_load(img, sp, y, comp);
-- 
2.52.0


>From c3489a1df9c3cad8b57ee3714f25200200c1fd6c Mon Sep 17 00:00:00 2001
From: Lynne <[email protected]>
Date: Tue, 26 May 2026 11:32:24 +0900
Subject: [PATCH 5/9] vulkan/ffv1: read raw 16-bit float images via R16_UINT
 view to preserve denormals

GPUs may flush denormals to zero when reading floats via imageLoad. Denormals
shouldn't be present in general, but if they are, we have to preserve them,
since this is a lossless codec. Reading through a UINT view returns the exact
bit patterns.

Sponsored-by: Sovereign Tech Fund
---
 libavcodec/ffv1enc_vulkan.c                |  6 +++++-
 libavcodec/vulkan/ffv1_enc.comp.glsl       | 13 ++++++++-----
 libavcodec/vulkan/ffv1_enc_remap.comp.glsl |  7 ++++---
 libavutil/vulkan.c                         |  4 ++--
 4 files changed, 19 insertions(+), 11 deletions(-)

diff --git a/libavcodec/ffv1enc_vulkan.c b/libavcodec/ffv1enc_vulkan.c
index 036f126134..92d46f7ddf 100644
--- a/libavcodec/ffv1enc_vulkan.c
+++ b/libavcodec/ffv1enc_vulkan.c
@@ -368,8 +368,12 @@ static int vulkan_encode_ffv1_submit_frame(AVCodecContext 
*avctx,
     ff_vk_exec_start(&fv->s, exec);
     fd->idx = exec->idx;
 
+    /* For float pixel formats we want the raw bit pattern, not a value
+     * already passed through fp16/fp32 conversion (which can flush
+     * denormals). Use a UINT view in that case. */
     RET(ff_vk_create_imageviews(&fv->s, exec, src_views, src,
-                                FF_VK_REP_NATIVE));
+                                f->remap_mode ? FF_VK_REP_UINT
+                                              : FF_VK_REP_NATIVE));
 
     ff_vk_exec_add_dep_buf(&fv->s, exec, &slice_data_ref, 1, has_inter);
     ff_vk_exec_add_dep_buf(&fv->s, exec, &fd->out_data_ref, 1, 1);
diff --git a/libavcodec/vulkan/ffv1_enc.comp.glsl 
b/libavcodec/vulkan/ffv1_enc.comp.glsl
index 1381c041db..90ce8293b9 100644
--- a/libavcodec/vulkan/ffv1_enc.comp.glsl
+++ b/libavcodec/vulkan/ffv1_enc.comp.glsl
@@ -34,13 +34,15 @@ layout (set = 0, binding = 2, scalar) uniform crc_ieee_buf {
 layout (set = 1, binding = 1, scalar) writeonly buffer slice_results_buf {
     uint32_t slice_results[];
 };
+/* Source images are bound as UINT (raw bits) regardless of the underlying
+ * pixel format. Integer formats are passed through unchanged; for float
+ * formats this avoids the fp16/fp32 conversion that would otherwise flush
+ * denormals before we get to look at them. */
+layout (set = 1, binding = 3) uniform uimage2D src[];
 #ifdef FLOAT
-layout (set = 1, binding = 3) uniform image2D src[];
 layout (set = 1, binding = 5) readonly buffer fltmap_buf {
     uint fltmap[][4][65536];
 };
-#else
-layout (set = 1, binding = 3) uniform uimage2D src[];
 #endif
 
 #ifndef GOLOMB
@@ -237,9 +239,10 @@ ivec4 load_components(uint slice_idx, in SliceContext sc, 
ivec2 pos)
 {
     ivec4 pix;
 #ifdef FLOAT
+    /* Source view is r16_uint so imageLoad returns the raw fp16 bit pattern
+     * in .x; no conversion is performed and denormals survive. */
     for (int i = 0; i < color_planes; i++) {
-        float16_t v = float16_t(imageLoad(src[i], pos));
-        uint16_t iv = float16BitsToUint16(v);
+        uint iv = imageLoad(src[i], pos)[0] & 0xFFFFu;
         pix[i] = int(fltmap[slice_idx][i][iv]);
     }
 #else
diff --git a/libavcodec/vulkan/ffv1_enc_remap.comp.glsl 
b/libavcodec/vulkan/ffv1_enc_remap.comp.glsl
index bc905a6695..685e4d6219 100644
--- a/libavcodec/vulkan/ffv1_enc_remap.comp.glsl
+++ b/libavcodec/vulkan/ffv1_enc_remap.comp.glsl
@@ -27,7 +27,7 @@
 #include "common.glsl"
 #include "ffv1_common.glsl"
 
-layout (set = 1, binding = 1) uniform image2D src[];
+layout (set = 1, binding = 1) uniform uimage2D src[];
 
 layout (set = 1, binding = 2) buffer fltmap_buf {
     uint fltmap[][4][65536];
@@ -53,8 +53,9 @@ void load_fltmap(uint slice_idx, uint p)
 
     for (uint y = sys + gl_LocalInvocationID.y; y < sye; y += 
gl_WorkGroupSize.y) {
         for (uint x = sxs + gl_LocalInvocationID.x; x < sxe; x += 
gl_WorkGroupSize.x) {
-            vec4 pix = imageLoad(src[p], ivec2(x, y));
-            uint16_t pix_idx = float16BitsToUint16(float16_t(pix[0]));
+            /* Source view is r16_uint so the .x lane is the raw fp16 bit
+             * pattern; no conversion is performed and denormals survive. */
+            uint pix_idx = imageLoad(src[p], ivec2(x, y))[0] & 0xFFFFu;
             atomicOr(fltmap[slice_idx][p][pix_idx], 1);
         }
     }
diff --git a/libavutil/vulkan.c b/libavutil/vulkan.c
index 460c704507..db6d2f6775 100644
--- a/libavutil/vulkan.c
+++ b/libavutil/vulkan.c
@@ -1927,8 +1927,8 @@ static VkFormat map_fmt_to_rep(VkFormat fmt, enum 
FFVkShaderRepFormat rep_fmt)
         {
             VK_FORMAT_R16_SFLOAT,
             VK_FORMAT_R16_SFLOAT,
-            VK_FORMAT_UNDEFINED,
-            VK_FORMAT_UNDEFINED,
+            VK_FORMAT_R16_SINT,
+            VK_FORMAT_R16_UINT,
         },
     };
 #undef REPS_FMT_PACK
-- 
2.52.0


>From 4dc6317d5f4e3a370980a2ec1ea8a2fec6af0e8b Mon Sep 17 00:00:00 2001
From: Lynne <[email protected]>
Date: Tue, 26 May 2026 11:33:24 +0900
Subject: [PATCH 6/9] vulkan/ffv1_enc_rct_search: barrier before reading
 score_mode

There was a race condition where the main invocation would race ahead and use
values not yet written by other invocs.

Sponsored-by: Sovereign Tech Fund
---
 libavcodec/vulkan/ffv1_enc_rct_search.comp.glsl | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/libavcodec/vulkan/ffv1_enc_rct_search.comp.glsl 
b/libavcodec/vulkan/ffv1_enc_rct_search.comp.glsl
index d72c667b10..52171ab221 100644
--- a/libavcodec/vulkan/ffv1_enc_rct_search.comp.glsl
+++ b/libavcodec/vulkan/ffv1_enc_rct_search.comp.glsl
@@ -126,6 +126,10 @@ void coeff_search(inout SliceContext sc)
         }
     }
 
+    /* All lanes must have finished accumulating into score_mode before
+     * lane (0,0) inspects it for the argmin. */
+    barrier();
+
     if (gl_LocalInvocationID.x == 0 && gl_LocalInvocationID.y == 0) {
         uint min_score = 0xFFFFFFFF;
         uint min_idx = 3;
-- 
2.52.0


>From 55290a63754ab850b693894d44cdbc16a8889481 Mon Sep 17 00:00:00 2001
From: Lynne <[email protected]>
Date: Tue, 26 May 2026 11:33:53 +0900
Subject: [PATCH 7/9] vulkan/ffv1_enc_rct_search: write slice_rct_coef directly
 by main invoc

The issue is that SliceContext was passed as an inout, which caused all
invocs to locally copy and modify it.
When the main invoc wrote it, only the very last written value was used,
choosing the wrong coeffs.

Sponsored-by: Sovereign Tech Fund
---
 libavcodec/vulkan/ffv1_enc_rct_search.comp.glsl | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/libavcodec/vulkan/ffv1_enc_rct_search.comp.glsl 
b/libavcodec/vulkan/ffv1_enc_rct_search.comp.glsl
index 52171ab221..dc25f50831 100644
--- a/libavcodec/vulkan/ffv1_enc_rct_search.comp.glsl
+++ b/libavcodec/vulkan/ffv1_enc_rct_search.comp.glsl
@@ -108,7 +108,7 @@ void process(ivec2 pos)
     }
 }
 
-void coeff_search(inout SliceContext sc)
+void coeff_search(uint slice_idx)
 {
     uvec2 img_size = imageSize(src[0]);
     uint sxs = slice_coord(img_size.x, gl_WorkGroupID.x + 0,
@@ -139,7 +139,7 @@ void coeff_search(inout SliceContext sc)
                 min_idx = i;
             }
         }
-        sc.slice_rct_coef = rct_y_coeff[min_idx];
+        slice_ctx[slice_idx].slice_rct_coef = rct_y_coeff[min_idx];
     }
 }
 
@@ -149,5 +149,5 @@ void main(void)
         return;
 
     const uint slice_idx = gl_WorkGroupID.y*gl_NumWorkGroups.x + 
gl_WorkGroupID.x;
-    coeff_search(slice_ctx[slice_idx]);
+    coeff_search(slice_idx);
 }
-- 
2.52.0


>From 6023695f64cc1ffdb80f13b76911a42bbd975256 Mon Sep 17 00:00:00 2001
From: Lynne <[email protected]>
Date: Tue, 26 May 2026 11:34:40 +0900
Subject: [PATCH 8/9] vulkan/ffv1_enc_rct_search: fix slice dimension
 iterations

This was a mess: we were treating pixels outside of the image boundaries as
valid, and the iteration had undefined behaviour since it was non-uniform
across the workgroup.

Calculate the per-invoc iterations from the slice dimensions instead, making
them identical for all invocations, add a valid flag to decide whether each
invocation's pixel is used or not, and fix the synchronization.

Sponsored-by: Sovereign Tech Fund
---
 .../vulkan/ffv1_enc_rct_search.comp.glsl      | 38 ++++++++++++++-----
 1 file changed, 28 insertions(+), 10 deletions(-)

diff --git a/libavcodec/vulkan/ffv1_enc_rct_search.comp.glsl 
b/libavcodec/vulkan/ffv1_enc_rct_search.comp.glsl
index dc25f50831..191b188f22 100644
--- a/libavcodec/vulkan/ffv1_enc_rct_search.comp.glsl
+++ b/libavcodec/vulkan/ffv1_enc_rct_search.comp.glsl
@@ -94,15 +94,19 @@ uint get_dist(ivec3 cur)
 shared uint score_cols[gl_WorkGroupSize.y] = { };
 shared uint score_mode[16] = { };
 
-void process(ivec2 pos)
+/* One scoring step: publish this lane's tx_pix to shared, then read the
+ * neighbours' to compute the prediction error. `valid` selects whether
+ * this lane has a real pixel; invalid lanes write zero into pix_buf so
+ * the cache stays well-defined while still participating in the barrier. */
+void process(ivec2 pos, bool valid, int i)
 {
-    ivec3 pix = load_components(pos);
-
-    for (int i = 0; i < NUM_CHECKS; i++) {
-        ivec3 tx_pix = transform_sample(pix, rct_y_coeff[i]);
-        pix_buf[gl_LocalInvocationID.x + 1][gl_LocalInvocationID.y + 1] = 
tx_pix;
-        memoryBarrierShared();
+    ivec3 pix = valid ? load_components(pos) : ivec3(0);
+    ivec3 tx_pix = transform_sample(pix, rct_y_coeff[i]);
+    pix_buf[gl_LocalInvocationID.x + 1][gl_LocalInvocationID.y + 1] =
+        valid ? tx_pix : ivec3(0);
+    barrier();
 
+    if (valid) {
         uint dist = get_dist(tx_pix);
         atomicAdd(score_mode[i], dist);
     }
@@ -120,9 +124,23 @@ void coeff_search(uint slice_idx)
     uint sye = slice_coord(img_size.y, gl_WorkGroupID.y + 1,
                            gl_NumWorkGroups.y, 0);
 
-    for (uint y = sys + gl_LocalInvocationID.y; y < sye; y += 
gl_WorkGroupSize.y) {
-        for (uint x = sxs + gl_LocalInvocationID.x; x < sxe; x += 
gl_WorkGroupSize.x) {
-            process(ivec2(x, y));
+    /* Uniform iteration: every lane in the workgroup runs the same number
+     * of tile iterations so that the in-process barrier is always reached
+     * by all lanes. Lanes outside the slice extents pass valid=false. */
+    uint sw = sxe - sxs;
+    uint sh = sye - sys;
+    uint n_xi = (sw + gl_WorkGroupSize.x - 1u) / gl_WorkGroupSize.x;
+    uint n_yi = (sh + gl_WorkGroupSize.y - 1u) / gl_WorkGroupSize.y;
+
+    for (uint yi = 0u; yi < n_yi; yi++) {
+        uint y = sys + yi*gl_WorkGroupSize.y + gl_LocalInvocationID.y;
+        for (uint xi = 0u; xi < n_xi; xi++) {
+            uint x = sxs + xi*gl_WorkGroupSize.x + gl_LocalInvocationID.x;
+            bool valid = (x < sxe) && (y < sye);
+            for (int i = 0; i < NUM_CHECKS; i++) {
+                process(ivec2(x, y), valid, i);
+                barrier();
+            }
         }
     }
 
-- 
2.52.0


>From c7082683cd0f877c23614b1b68308f83410fc7d0 Mon Sep 17 00:00:00 2001
From: Lynne <[email protected]>
Date: Tue, 26 May 2026 11:38:29 +0900
Subject: [PATCH 9/9] vulkan/ffv1: add 32-bit float RGB encoding and a rice +
 remap path

This implements 32-bit float RGB encoding and brings the Vulkan implementation
on par with the C implementation.

Sponsored-by: Sovereign Tech Fund
---
 libavcodec/ffv1_vulkan.h                      |   1 +
 libavcodec/ffv1enc_vulkan.c                   | 153 ++++++++++++++++-
 libavcodec/vulkan/Makefile                    |   4 +-
 libavcodec/vulkan/ffv1_common.glsl            |   1 +
 libavcodec/vulkan/ffv1_enc.comp.glsl          |  27 ++-
 .../ffv1_enc_rgb_float_golomb.comp.glsl       |  33 ++++
 libavcodec/vulkan/ffv1_enc_setup.comp.glsl    | 127 +++++++++++++-
 libavcodec/vulkan/ffv1_enc_sort32.comp.glsl   | 155 ++++++++++++++++++
 8 files changed, 477 insertions(+), 24 deletions(-)
 create mode 100644 libavcodec/vulkan/ffv1_enc_rgb_float_golomb.comp.glsl
 create mode 100644 libavcodec/vulkan/ffv1_enc_sort32.comp.glsl

diff --git a/libavcodec/ffv1_vulkan.h b/libavcodec/ffv1_vulkan.h
index 9a206afaca..d6ae0f3fee 100644
--- a/libavcodec/ffv1_vulkan.h
+++ b/libavcodec/ffv1_vulkan.h
@@ -48,6 +48,7 @@ typedef struct FFv1ShaderParams {
     int sar[2];
     int pic_mode;
     uint32_t slice_size_max;
+    uint32_t max_pixels_per_slice;
 } FFv1ShaderParams;
 
 #endif /* AVCODEC_FFV1_VULKAN_H */
diff --git a/libavcodec/ffv1enc_vulkan.c b/libavcodec/ffv1enc_vulkan.c
index 92d46f7ddf..7c22ced785 100644
--- a/libavcodec/ffv1enc_vulkan.c
+++ b/libavcodec/ffv1enc_vulkan.c
@@ -72,6 +72,7 @@ typedef struct VulkanEncodeFFv1Context {
 
     FFVulkanShader rct_search;
     FFVulkanShader remap;
+    FFVulkanShader sort32;
     FFVulkanShader setup;
     FFVulkanShader reset;
     FFVulkanShader enc;
@@ -101,6 +102,8 @@ typedef struct VulkanEncodeFFv1Context {
     int optimize_rct;
 
     int is_rgb;
+    int is_float32;
+    uint32_t max_pixels_per_slice;
     int ppi;
     int chunks;
 } VulkanEncodeFFv1Context;
@@ -141,6 +144,12 @@ extern const unsigned int ff_ffv1_enc_remap_comp_spv_len;
 extern const unsigned char ff_ffv1_enc_rgb_float_comp_spv_data[];
 extern const unsigned int ff_ffv1_enc_rgb_float_comp_spv_len;
 
+extern const unsigned char ff_ffv1_enc_rgb_float_golomb_comp_spv_data[];
+extern const unsigned int ff_ffv1_enc_rgb_float_golomb_comp_spv_len;
+
+extern const unsigned char ff_ffv1_enc_sort32_comp_spv_data[];
+extern const unsigned int ff_ffv1_enc_sort32_comp_spv_len;
+
 static int run_rct_search(AVCodecContext *avctx, FFVkExecContext *exec,
                           AVFrame *enc_in, VkImageView *enc_in_views,
                           FFVkBuffer *slice_data_buf, uint32_t slice_data_size,
@@ -203,6 +212,37 @@ static int run_remap(AVCodecContext *avctx, 
FFVkExecContext *exec,
     return 0;
 }
 
+static int run_sort32(AVCodecContext *avctx, FFVkExecContext *exec,
+                      AVFrame *enc_in, VkImageView *enc_in_views,
+                      FFVkBuffer *units_buf, uint32_t units_size,
+                      FFv1ShaderParams *pd)
+{
+    VulkanEncodeFFv1Context *fv = avctx->priv_data;
+    FFV1Context *f = &fv->ctx;
+    FFVulkanFunctions *vk = &fv->s.vkfn;
+
+    /* Update descriptors */
+    ff_vk_shader_update_img_array(&fv->s, exec, &fv->sort32,
+                                  enc_in, enc_in_views,
+                                  1, 1,
+                                  VK_IMAGE_LAYOUT_GENERAL,
+                                  VK_NULL_HANDLE);
+    ff_vk_shader_update_desc_buffer(&fv->s, exec, &fv->sort32,
+                                    1, 2, 0,
+                                    units_buf,
+                                    0, units_size*f->slice_count,
+                                    VK_FORMAT_UNDEFINED);
+
+    ff_vk_exec_bind_shader(&fv->s, exec, &fv->sort32);
+    ff_vk_shader_update_push_const(&fv->s, exec, &fv->sort32,
+                                   VK_SHADER_STAGE_COMPUTE_BIT,
+                                   0, sizeof(FFv1ShaderParams), pd);
+
+    vk->CmdDispatch(exec->buf, fv->ctx.num_h_slices, fv->ctx.num_v_slices, 1);
+
+    return 0;
+}
+
 static int vulkan_encode_ffv1_submit_frame(AVCodecContext *avctx,
                                            FFVkExecContext *exec,
                                            const AVFrame *pict)
@@ -279,15 +319,19 @@ static int vulkan_encode_ffv1_submit_frame(AVCodecContext 
*avctx,
     slice_data_buf = (FFVkBuffer *)slice_data_ref->data;
 
     if (f->remap_mode) {
-        const AVPixFmtDescriptor *desc = 
av_pix_fmt_desc_get(fv->s.frames->sw_format);
-        remap_data_size = 4*(1 << desc->comp[0].depth)*sizeof(uint32_t);
+        if (fv->is_float32) {
+            /* Per (slice, plane): [units : max_pixels*2 uints] + [bitmap : 
max_pixels uints]. */
+            remap_data_size = 4*fv->max_pixels_per_slice*3*sizeof(uint32_t);
+        } else {
+            const AVPixFmtDescriptor *desc = 
av_pix_fmt_desc_get(fv->s.frames->sw_format);
+            remap_data_size = 4*(1 << desc->comp[0].depth)*sizeof(uint32_t);
+        }
 
         RET(ff_vk_get_pooled_buffer(&fv->s, &fv->remap_data_pool,
                                     &remap_data_ref,
                                     VK_BUFFER_USAGE_STORAGE_BUFFER_BIT,
                                     NULL, remap_data_size*f->slice_count,
                                     VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT));
-
         remap_data_buf = (FFVkBuffer *)remap_data_ref->data;
     }
 
@@ -348,6 +392,7 @@ static int vulkan_encode_ffv1_submit_frame(AVCodecContext 
*avctx,
         .pic_mode = !(pict->flags & AV_FRAME_FLAG_INTERLACED) ? 3 :
                     !(pict->flags & AV_FRAME_FLAG_TOP_FIELD_FIRST) ? 2 : 1,
         .slice_size_max = out_data_buf->size / f->slice_count,
+        .max_pixels_per_slice = fv->max_pixels_per_slice,
     };
 
     for (int i = 0; i < f->quant_table_count; i++) {
@@ -420,8 +465,13 @@ static int vulkan_encode_ffv1_submit_frame(AVCodecContext 
*avctx,
     }
 
     if (f->remap_mode) {
-        RET(run_remap(avctx, exec, src, src_views,
-                      remap_data_buf, remap_data_size, &pd));
+        if (fv->is_float32) {
+            RET(run_sort32(avctx, exec, src, src_views,
+                           remap_data_buf, remap_data_size, &pd));
+        } else {
+            RET(run_remap(avctx, exec, src, src_views,
+                          remap_data_buf, remap_data_size, &pd));
+        }
 
         /* Make sure the writes are visible to the setup shader */
         ff_vk_buf_barrier(buf_bar[nb_buf_bar++], remap_data_buf,
@@ -519,6 +569,14 @@ static int vulkan_encode_ffv1_submit_frame(AVCodecContext 
*avctx,
                       COMPUTE_SHADER_BIT, SHADER_WRITE_BIT, NONE_KHR,
                       COMPUTE_SHADER_BIT, SHADER_READ_BIT, SHADER_WRITE_BIT,
                       0, slice_data_size*f->slice_count);
+
+    /* Setup writes the per-pixel compact_idx (or compact_idx-of-value)
+     * back into the remap buffer; the encode shader reads it. */
+    if (f->remap_mode)
+        ff_vk_buf_barrier(buf_bar[nb_buf_bar++], remap_data_buf,
+                          COMPUTE_SHADER_BIT, SHADER_READ_BIT, 
SHADER_WRITE_BIT,
+                          COMPUTE_SHADER_BIT, SHADER_READ_BIT, NONE_KHR,
+                          0, remap_data_size*f->slice_count);
     if (f->key_frame || fv->force_pcm)
         ff_vk_buf_barrier(buf_bar[nb_buf_bar++], slice_data_buf,
                           COMPUTE_SHADER_BIT, SHADER_WRITE_BIT, NONE_KHR,
@@ -906,6 +964,54 @@ fail:
     return err;
 }
 
+static int init_sort32_shader(AVCodecContext *avctx, VkSpecializationInfo *sl)
+{
+    int err;
+    VulkanEncodeFFv1Context *fv = avctx->priv_data;
+    FFVulkanShader *shd = &fv->sort32;
+
+    uint32_t wg_x = FFMIN(fv->max_pixels_per_slice, 256);
+    ff_vk_shader_load(shd, VK_SHADER_STAGE_COMPUTE_BIT, sl,
+                      (uint32_t []) { wg_x, 1, 1 }, 0);
+
+    ff_vk_shader_add_push_const(shd, 0, sizeof(FFv1ShaderParams),
+                                VK_SHADER_STAGE_COMPUTE_BIT);
+
+    const FFVulkanDescriptorSetBinding desc_set_const[] = {
+        { /* rangecoder_buf */
+            .type   = VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER,
+            .stages = VK_SHADER_STAGE_COMPUTE_BIT,
+        },
+    };
+    ff_vk_shader_add_descriptor_set(&fv->s, shd, desc_set_const, 1, 1, 0);
+
+    const FFVulkanDescriptorSetBinding desc_set[] = {
+        { /* slice_data_buf */
+            .type   = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
+            .stages = VK_SHADER_STAGE_COMPUTE_BIT,
+        },
+        { /* src */
+            .type   = VK_DESCRIPTOR_TYPE_STORAGE_IMAGE,
+            .stages = VK_SHADER_STAGE_COMPUTE_BIT,
+            .elems  = av_pix_fmt_count_planes(fv->s.frames->sw_format),
+        },
+        { /* units */
+            .type   = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
+            .stages = VK_SHADER_STAGE_COMPUTE_BIT,
+        },
+    };
+    ff_vk_shader_add_descriptor_set(&fv->s, shd, desc_set, 3, 0, 0);
+
+    RET(ff_vk_shader_link(&fv->s, shd,
+                          ff_ffv1_enc_sort32_comp_spv_data,
+                          ff_ffv1_enc_sort32_comp_spv_len, "main"));
+
+    RET(ff_vk_shader_register_exec(&fv->s, &fv->exec_pool, shd));
+
+fail:
+    return err;
+}
+
 static int init_remap_shader(AVCodecContext *avctx, VkSpecializationInfo *sl)
 {
     int err;
@@ -1105,9 +1211,14 @@ static int init_encode_shader(AVCodecContext *avctx, 
VkSpecializationInfo *sl)
                                     4 + fv->is_rgb + !!f->remap_mode, 0, 0);
 
     if (f->remap_mode) {
-        ff_vk_shader_link(&fv->s, shd,
-                          ff_ffv1_enc_rgb_float_comp_spv_data,
-                          ff_ffv1_enc_rgb_float_comp_spv_len, "main");
+        if (fv->ctx.ac == AC_GOLOMB_RICE)
+            ff_vk_shader_link(&fv->s, shd,
+                              ff_ffv1_enc_rgb_float_golomb_comp_spv_data,
+                              ff_ffv1_enc_rgb_float_golomb_comp_spv_len, 
"main");
+        else
+            ff_vk_shader_link(&fv->s, shd,
+                              ff_ffv1_enc_rgb_float_comp_spv_data,
+                              ff_ffv1_enc_rgb_float_comp_spv_len, "main");
     } else if (fv->ctx.ac == AC_GOLOMB_RICE) {
         if (fv->is_rgb)
             ff_vk_shader_link(&fv->s, shd,
@@ -1304,6 +1415,26 @@ static av_cold int 
vulkan_encode_ffv1_init(AVCodecContext *avctx)
     fv->is_rgb = !(f->colorspace == 0 && avctx->sw_pix_fmt != AV_PIX_FMT_YA8) 
&&
                  !(avctx->sw_pix_fmt == AV_PIX_FMT_YA8);
 
+    fv->is_float32 = (avctx->sw_pix_fmt == AV_PIX_FMT_GBRPF32 ||
+                      avctx->sw_pix_fmt == AV_PIX_FMT_GBRAPF32);
+
+    if (fv->is_float32) {
+        /* Compute the worst-case slice geometry. With version >= 4 the slice
+         * boundaries are computed via slice_coord() which rounds up, so any
+         * single slice has at most ceil(width/num_h_slices) * 
ceil(height/num_v_slices)
+         * pixels. */
+        uint32_t mw = (avctx->width  + f->num_h_slices - 1) / f->num_h_slices;
+        uint32_t mh = (avctx->height + f->num_v_slices - 1) / f->num_v_slices;
+        /* Round up to next pow2 for bitonic sort */
+        uint32_t n = 1;
+        uint32_t pn = mw*mh;
+        while (n < pn)
+            n <<= 1;
+        if (n < 2)
+            n = 2;
+        fv->max_pixels_per_slice = n;
+    }
+
     /* Init rct search shader */
     fv->optimize_rct = fv->is_rgb && f->version >= 4 &&
                        !fv->force_pcm && fv->optimize_rct;
@@ -1325,7 +1456,10 @@ static av_cold int 
vulkan_encode_ffv1_init(AVCodecContext *avctx)
     }
 
     if (f->remap_mode) {
-        err = init_remap_shader(avctx, sl);
+        if (fv->is_float32)
+            err = init_sort32_shader(avctx, sl);
+        else
+            err = init_remap_shader(avctx, sl);
         if (err < 0)
             return err;
     }
@@ -1420,6 +1554,7 @@ static av_cold int 
vulkan_encode_ffv1_close(AVCodecContext *avctx)
     ff_vk_shader_free(&fv->s, &fv->reset);
     ff_vk_shader_free(&fv->s, &fv->setup);
     ff_vk_shader_free(&fv->s, &fv->remap);
+    ff_vk_shader_free(&fv->s, &fv->sort32);
     ff_vk_shader_free(&fv->s, &fv->rct_search);
 
     if (fv->exec_ctx_info) {
diff --git a/libavcodec/vulkan/Makefile b/libavcodec/vulkan/Makefile
index c6817967c7..f86931727d 100644
--- a/libavcodec/vulkan/Makefile
+++ b/libavcodec/vulkan/Makefile
@@ -13,7 +13,9 @@ OBJS-$(CONFIG_FFV1_VULKAN_ENCODER) += 
vulkan/ffv1_enc_setup.comp.spv.o \
                                       vulkan/ffv1_enc_rgb_golomb.comp.spv.o \
                                       vulkan/ffv1_enc_rct_search.comp.spv.o \
                                       vulkan/ffv1_enc_remap.comp.spv.o \
-                                      vulkan/ffv1_enc_rgb_float.comp.spv.o
+                                      vulkan/ffv1_enc_rgb_float.comp.spv.o \
+                                      
vulkan/ffv1_enc_rgb_float_golomb.comp.spv.o \
+                                      vulkan/ffv1_enc_sort32.comp.spv.o
 
 OBJS-$(CONFIG_FFV1_VULKAN_HWACCEL) += vulkan/ffv1_dec_setup.comp.spv.o \
                                       vulkan/ffv1_dec_reset.comp.spv.o \
diff --git a/libavcodec/vulkan/ffv1_common.glsl 
b/libavcodec/vulkan/ffv1_common.glsl
index 8580a0777f..3d3b6753c6 100644
--- a/libavcodec/vulkan/ffv1_common.glsl
+++ b/libavcodec/vulkan/ffv1_common.glsl
@@ -75,6 +75,7 @@ layout (push_constant, scalar) uniform pushConstants {
     ivec2 sar;
     int pic_mode;
     uint slice_size_max;
+    uint max_pixels_per_slice;
 };
 
 #include "rangecoder.glsl"
diff --git a/libavcodec/vulkan/ffv1_enc.comp.glsl 
b/libavcodec/vulkan/ffv1_enc.comp.glsl
index 90ce8293b9..1c30e91828 100644
--- a/libavcodec/vulkan/ffv1_enc.comp.glsl
+++ b/libavcodec/vulkan/ffv1_enc.comp.glsl
@@ -40,8 +40,8 @@ layout (set = 1, binding = 1, scalar) writeonly buffer 
slice_results_buf {
  * denormals before we get to look at them. */
 layout (set = 1, binding = 3) uniform uimage2D src[];
 #ifdef FLOAT
-layout (set = 1, binding = 5) readonly buffer fltmap_buf {
-    uint fltmap[][4][65536];
+layout (set = 1, binding = 5, scalar) readonly buffer fltmap_buf {
+    uint fltmap[];
 };
 #endif
 
@@ -239,11 +239,24 @@ ivec4 load_components(uint slice_idx, in SliceContext sc, ivec2 pos)
 {
     ivec4 pix;
 #ifdef FLOAT
-    /* Source view is r16_uint so imageLoad returns the raw fp16 bit pattern
-     * in .x; no conversion is performed and denormals survive. */
-    for (int i = 0; i < color_planes; i++) {
-        uint iv = imageLoad(src[i], pos)[0] & 0xFFFFu;
-        pix[i] = int(fltmap[slice_idx][i][iv]);
+    if (c_bits >= 32) {
+        /* 32-bit float: per-pixel-position bitmap lookup. The bitmap region
+         * follows the units region in the same buffer. */
+        ivec2 rel = pos - sc.slice_pos;
+        uint pixel_idx = uint(rel.x + sc.slice_dim.x*rel.y);
+        uint plane_stride = max_pixels_per_slice*3u;
+        for (int i = 0; i < color_planes; i++) {
+            uint base = (slice_idx*4u + uint(i))*plane_stride
+                        + max_pixels_per_slice*2u;
+            pix[i] = int(fltmap[base + pixel_idx]);
+        }
+    } else {
+        /* 16-bit float: value-indexed lookup. Source view is r16_uint so
+         * imageLoad returns the raw fp16 bit pattern in .x. */
+        for (int i = 0; i < color_planes; i++) {
+            uint iv = imageLoad(src[i], pos)[0] & 0xFFFFu;
+            pix[i] = int(fltmap[(slice_idx*4u + uint(i))*65536u + iv]);
+        }
     }
 #else
     pix = ivec4(imageLoad(src[0], pos));
diff --git a/libavcodec/vulkan/ffv1_enc_rgb_float_golomb.comp.glsl b/libavcodec/vulkan/ffv1_enc_rgb_float_golomb.comp.glsl
new file mode 100644
index 0000000000..e4535eb08f
--- /dev/null
+++ b/libavcodec/vulkan/ffv1_enc_rgb_float_golomb.comp.glsl
@@ -0,0 +1,33 @@
+/*
+ * FFv1 codec
+ *
+ * Copyright (c) 2026 Lynne <[email protected]>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#pragma shader_stage(compute)
+#extension GL_GOOGLE_include_directive : require
+#extension GL_EXT_shader_image_load_formatted : require
+
+layout (set = 1, binding = 4) uniform uimage2D tmp;
+/* Switches read by the encoder body included below (e.g. #ifdef FLOAT). */
+#define PB_UNALIGNED
+#define GOLOMB
+#define FLOAT
+#define RGB
+#include "ffv1_enc.comp.glsl"
diff --git a/libavcodec/vulkan/ffv1_enc_setup.comp.glsl b/libavcodec/vulkan/ffv1_enc_setup.comp.glsl
index 53a8d7f13f..e931019a43 100644
--- a/libavcodec/vulkan/ffv1_enc_setup.comp.glsl
+++ b/libavcodec/vulkan/ffv1_enc_setup.comp.glsl
@@ -23,13 +23,13 @@
 #pragma shader_stage(compute)
 #extension GL_GOOGLE_include_directive : require
 
-#define NB_CONTEXTS 2
+#define NB_CONTEXTS 6
 #define FULL_RENORM
 #include "common.glsl"
 #include "ffv1_common.glsl"
 
-layout (set = 1, binding = 1) buffer fltmap_buf {
-    uint fltmap[][4][65536];
+layout (set = 1, binding = 1, scalar) buffer fltmap_buf {
+    uint fltmap[];
 };
 
 void init_slice(inout SliceContext sc, uint slice_idx)
@@ -81,6 +81,7 @@ void encode_histogram_remap(uint slice_idx, inout SliceContext sc)
     const int flip = (remap_mode == 2) ? 0x7FFF : 0;
 
     for (int p = 0; p < color_planes; p++) {
+        const uint base = (slice_idx*4u + uint(p))*65536u;
         uint j = 0;
         uint lu = 0;
         int run = 0;
@@ -90,15 +91,15 @@ void encode_histogram_remap(uint slice_idx, inout SliceContext sc)
 
         put_usymbol(0, 0);
 
-        for (int i = 0; i < NB_CONTEXTS; i++)
+        for (int i = 0; i < NB_CONTEXTS*CONTEXT_SIZE; i++)
             rc_state[i] = uint8_t(128);
 
         int cnt = 0;
         for (int i = 0; i < rct_offset; i++) {
             int ri = i ^ (((i & 0x8000) != 0) ? 0 : flip);
-            uint u = uint(fltmap[slice_idx][p][ri] != 0);
+            uint u = uint(fltmap[base + uint(ri)] != 0u);
 
-            fltmap[slice_idx][p][ri] = uint16_t(j);
+            fltmap[base + uint(ri)] = j;
             j += u;
 
             if (lu == u) {
@@ -117,6 +118,115 @@ void encode_histogram_remap(uint slice_idx, inout SliceContext sc)
     }
 }
 
+/* The 32-bit float remap uses 6 contexts: state[lu][category][bit] with
+ * lu = 0,1 and category = 0 (run/step-1), 1 (delta -- unused here), 2 (mul). */
+#define CTX_F32(lu, cat) ((uint(lu)*3u + uint(cat))*CONTEXT_SIZE)
+
+/* Code the 32-bit float remap table of each plane of a slice and store every
+ * pixel's compact index in the bitmap region. The (val, ndx) units are read
+ * in stored order and are assumed to be sorted by val, as left by the sort
+ * shader -- TODO confirm the host orders the two dispatches. */
+void encode_float32_remap(uint slice_idx, inout SliceContext sc)
+{
+    const uint pixel_num = uint(sc.slice_dim.x)*uint(sc.slice_dim.y);
+    const uint plane_stride = max_pixels_per_slice*3u;
+
+    for (int p = 0; p < color_planes; p++) {
+        /* Layout: per (slice, plane) we have [units : max_pixels*8 bytes]
+         * followed by [bitmap : max_pixels*4 bytes]. The units region is
+         * read-only here, the bitmap region is written. */
+        const uint plane_base = (slice_idx*4u + uint(p))*plane_stride;
+        const uint bitmap_base = plane_base + max_pixels_per_slice*2u;
+
+        for (int i = 0; i < NB_CONTEXTS*CONTEXT_SIZE; i++)
+            rc_state[i] = uint8_t(128);
+
+        put_usymbol(1, CTX_F32(0, 0));
+
+        for (int i = 0; i < NB_CONTEXTS*CONTEXT_SIZE; i++)
+            rc_state[i] = uint8_t(128);
+
+        /* last_val is the last coded position (or 0xFFFFFFFF as the "before
+         * any value" sentinel; this lets step = val - last_val give val+1
+         * for the first emission via unsigned wraparound). */
+        uint last_val = 0xFFFFFFFFu;
+        uint lu = 0;
+        uint run = 0;
+        int ci = -1;
+        bool emit_first_mul = true;
+
+        for (uint i = 0; i < pixel_num; i++) {
+            uint u_val = fltmap[plane_base + 2u*i + 0u];
+            uint u_ndx = fltmap[plane_base + 2u*i + 1u];
+
+            /* Duplicate of the previous unique value? Reuse ci. */
+            if (i > 0u && last_val == u_val) {
+                fltmap[bitmap_base + u_ndx] = uint(ci);
+                continue;
+            }
+
+            if (lu == 1u && (u_val - last_val) == 1u) {
+                /* Consecutive value: it just extends the pending run. */
+                run++;
+            } else {
+                if (lu == 1u && run > 0u) {
+                    /* Close the run. The slot after a run is an implied
+                     * gap, and a non-zero run keeps lu = 1, so a value
+                     * right behind that gap has to open the next run
+                     * rather than be coded as a zero-length step. */
+                    put_usymbol(run, CTX_F32(1, 0));
+                    last_val += 1u;
+                    run = uint((u_val - last_val) == 1u);
+                }
+                if (lu == 1u && run == 0u) {
+                    /* Empty run: one more implied gap, then back to lu = 0. */
+                    put_usymbol(0, CTX_F32(1, 0));
+                    last_val += 1u;
+                    lu = 0;
+                }
+                if (lu == 0u) {
+                    uint step = u_val - last_val;
+                    put_usymbol(step - 1u, CTX_F32(0, 0));
+
+                    if (emit_first_mul) {
+                        put_usymbol(1, CTX_F32(0, 2));
+                        emit_first_mul = false;
+                    }
+
+                    /* A step of exactly 1 switches to run coding. */
+                    if (step == 1u)
+                        lu = 1;
+                }
+            }
+
+            last_val = u_val;
+            ci++;
+            fltmap[bitmap_base + u_ndx] = uint(ci);
+        }
+
+        /* Terminate the table. last_val == 0xFFFFFFFF now means the coded
+         * position already covers the whole value range, so every step
+         * below stops there instead of wrapping around and emitting
+         * symbols for positions past the end. */
+        if (lu == 1u) {
+            if (run > 0u) {
+                put_usymbol(run, CTX_F32(1, 0));
+                if (last_val != 0xFFFFFFFFu)
+                    last_val += 1u;
+            }
+            if (last_val != 0xFFFFFFFFu) {
+                put_usymbol(0, CTX_F32(1, 0));
+                last_val += 1u;
+            }
+        }
+
+        if (last_val != 0xFFFFFFFFu)
+            put_usymbol(0xFFFFFFFFu - last_val, CTX_F32(0, 0));
+
+        sc.remap_count[p] = ci + 1;
+    }
+}
+
 void write_slice_header(uint slice_idx, inout SliceContext sc)
 {
     [[unroll]]
@@ -149,7 +259,10 @@ void write_slice_header(uint slice_idx, inout SliceContext sc)
 
         if (remap_mode != 0) {
             put_usymbol(remap_mode, 0);
-            encode_histogram_remap(slice_idx, sc);
+            if (c_bits >= 32)
+                encode_float32_remap(slice_idx, sc);
+            else
+                encode_histogram_remap(slice_idx, sc);
         }
     }
 }
diff --git a/libavcodec/vulkan/ffv1_enc_sort32.comp.glsl b/libavcodec/vulkan/ffv1_enc_sort32.comp.glsl
new file mode 100644
index 0000000000..872c7daa2b
--- /dev/null
+++ b/libavcodec/vulkan/ffv1_enc_sort32.comp.glsl
@@ -0,0 +1,155 @@
+/*
+ * FFv1 codec
+ *
+ * Copyright (c) 2026 Lynne <[email protected]>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#pragma shader_stage(compute)
+#extension GL_GOOGLE_include_directive : require
+
+#define SB_QUALI readonly
+#include "common.glsl"
+#include "ffv1_common.glsl"
+
+layout (set = 1, binding = 1) uniform uimage2D src[];
+
+layout (set = 1, binding = 2, scalar) buffer fltmap_buf {
+    uint fltmap[];
+};
+
+/* The shared fltmap_buf is laid out per (slice, plane) as a
+ * [max_pixels_per_slice*3] uint block, where the first
+ * [max_pixels_per_slice*2] entries hold interleaved (val, ndx) pairs and
+ * the trailing [max_pixels_per_slice] entries are the bitmap region used
+ * by the setup/encode shaders. Padding past pixel_num is the sentinel
+ * (UINT32_MAX, UINT32_MAX) so it sorts to the end. */
+
+/* Per-workgroup bitonic-sort buffer. Limits a slice's pow2 size; large
+ * slices fall back to working in global memory. */
+shared u32vec2 smem[8192];
+
+/* Sort one slice per workgroup, plane by plane: every pixel is loaded as a
+ * (val, ndx) pair, the pairs are bitonic-sorted ascending by (val, ndx) and
+ * left in the units region of fltmap.
+ * NOTE(review): the network needs N to be a power of two >= pixel_num, so
+ * max_pixels_per_slice is presumably a power of two -- confirm on the host. */
+void main(void)
+{
+    const uint slice_idx = gl_WorkGroupID.y*gl_NumWorkGroups.x + gl_WorkGroupID.x;
+    const uint wg_size = gl_WorkGroupSize.x * gl_WorkGroupSize.y;
+    uvec2 img_size = imageSize(src[0]);
+
+    uint sxs = slice_coord(img_size.x, gl_WorkGroupID.x + 0,
+                           gl_NumWorkGroups.x, 0);
+    uint sxe = slice_coord(img_size.x, gl_WorkGroupID.x + 1,
+                           gl_NumWorkGroups.x, 0);
+    uint sys = slice_coord(img_size.y, gl_WorkGroupID.y + 0,
+                           gl_NumWorkGroups.y, 0);
+    uint sye = slice_coord(img_size.y, gl_WorkGroupID.y + 1,
+                           gl_NumWorkGroups.y, 0);
+
+    uint slice_w = sxe - sxs;
+    uint slice_h = sye - sys;
+    uint pixel_num = slice_w * slice_h;
+
+    /* Round up to next pow2 for bitonic sort */
+    uint N = 1;
+    while (N < pixel_num)
+        N <<= 1;
+    N = min(max(N, 2u), max_pixels_per_slice);
+
+    const uint plane_stride = max_pixels_per_slice*3u;
+    const bool use_smem = N <= 8192u;
+
+    for (int p = 0; p < color_planes; p++) {
+        uint base = (slice_idx*4u + uint(p))*plane_stride;
+
+        /* Load pixels; slots past pixel_num keep the all-ones padding. */
+        for (uint i = gl_LocalInvocationIndex; i < N; i += wg_size) {
+            uint v = 0xFFFFFFFFu, ndx = 0xFFFFFFFFu;
+            if (i < pixel_num) {
+                uint y = i / slice_w;
+                uint x = i - y*slice_w;
+                /* Source is bound as r32ui (FF_VK_REP_NATIVE for r32_sfloat)
+                 * so imageLoad returns the raw bit pattern of the float. */
+                v = imageLoad(src[p], ivec2(sxs + x, sys + y))[0];
+                if (remap_mode == 2)
+                    v = ((v & 0x80000000u) != 0u) ? v : (v ^ 0x7FFFFFFFu);
+                ndx = i;
+            }
+            if (use_smem) {
+                smem[i] = u32vec2(v, ndx);
+            } else {
+                fltmap[base + 2u*i + 0u] = v;
+                fltmap[base + 2u*i + 1u] = ndx;
+            }
+        }
+        /* Buffer writes need their memory barrier issued before barrier(),
+         * otherwise other invocations may read them too early. */
+        if (!use_smem) memoryBarrierBuffer();
+        barrier();
+
+        /* Bitonic sort of the (val, ndx) pairs. */
+        for (uint k = 2; k <= N; k <<= 1) {
+            for (uint j = k >> 1; j > 0; j >>= 1) {
+                for (uint i = gl_LocalInvocationIndex; i < N; i += wg_size) {
+                    uint partner = i ^ j;
+                    if (partner > i) {
+                        bool ascending = (i & k) == 0;
+                        u32vec2 a, b;
+                        if (use_smem) {
+                            a = smem[i];
+                            b = smem[partner];
+                        } else {
+                            a = u32vec2(fltmap[base + 2u*i + 0u],
+                                        fltmap[base + 2u*i + 1u]);
+                            b = u32vec2(fltmap[base + 2u*partner + 0u],
+                                        fltmap[base + 2u*partner + 1u]);
+                        }
+                        bool a_gt_b = (a.x > b.x) ||
+                                      (a.x == b.x && a.y > b.y);
+                        if (a_gt_b == ascending) {
+                            if (use_smem) {
+                                smem[i] = b;
+                                smem[partner] = a;
+                            } else {
+                                fltmap[base + 2u*i + 0u] = b.x;
+                                fltmap[base + 2u*i + 1u] = b.y;
+                                fltmap[base + 2u*partner + 0u] = a.x;
+                                fltmap[base + 2u*partner + 1u] = a.y;
+                            }
+                        }
+                    }
+                }
+                if (!use_smem) memoryBarrierBuffer();
+                barrier();
+            }
+        }
+
+        /* Write sorted pairs back to global */
+        if (use_smem) {
+            for (uint i = gl_LocalInvocationIndex; i < N; i += wg_size) {
+                u32vec2 u = smem[i];
+                fltmap[base + 2u*i + 0u] = u.x;
+                fltmap[base + 2u*i + 1u] = u.y;
+            }
+            barrier();
+        }
+    }
+}
-- 
2.52.0

_______________________________________________
ffmpeg-devel mailing list -- [email protected]
To unsubscribe send an email to [email protected]

Reply via email to