This is an automated email from the git hooks/post-receive script. Git pushed a commit to branch master in repository ffmpeg.
commit 9a6b5ca197bc61581239752933c6e3319f4775d1 Author: Lynne <[email protected]> AuthorDate: Tue May 26 11:34:40 2026 +0900 Commit: Lynne <[email protected]> CommitDate: Tue May 26 17:47:04 2026 +0900 vulkan/ffv1_enc_rct_search: fix slice dimension iterations This was a mess: we were treating pixels outside of the image boundaries as valid, and the iteration had undefined behaviour since it was non-uniform across the workgroup. Calculate the per-invocation iteration counts from the slice dimensions instead, making them identical for every invocation. Also add a valid flag to decide whether each invocation's pixel is used, and fix the synchronization. Sponsored-by: Sovereign Tech Fund --- libavcodec/vulkan/ffv1_enc_rct_search.comp.glsl | 46 ++++++++++++++++++------- 1 file changed, 34 insertions(+), 12 deletions(-) diff --git a/libavcodec/vulkan/ffv1_enc_rct_search.comp.glsl b/libavcodec/vulkan/ffv1_enc_rct_search.comp.glsl index dc25f50831..4c4330f802 100644 --- a/libavcodec/vulkan/ffv1_enc_rct_search.comp.glsl +++ b/libavcodec/vulkan/ffv1_enc_rct_search.comp.glsl @@ -22,6 +22,7 @@ #pragma shader_stage(compute) #extension GL_GOOGLE_include_directive : require +#extension GL_KHR_shader_subgroup_arithmetic : require #define ENCODE #define SB_QUALI @@ -94,18 +95,25 @@ uint get_dist(ivec3 cur) shared uint score_cols[gl_WorkGroupSize.y] = { }; shared uint score_mode[16] = { }; -void process(ivec2 pos) +/* One scoring step: publish this lane's tx_pix to shared, then read the + * neighbours' to compute the prediction error. `valid` selects whether + * this lane has a real pixel; invalid lanes write zero into pix_buf so + * the cache stays well-defined while still participating in the barrier. */ +void process(ivec2 pos, bool valid, int i) { - ivec3 pix = load_components(pos); + ivec3 pix = valid ? load_components(pos) : ivec3(0); + ivec3 tx_pix = transform_sample(pix, rct_y_coeff[i]); + pix_buf[gl_LocalInvocationID.x + 1][gl_LocalInvocationID.y + 1] = + valid ? 
tx_pix : ivec3(0); + barrier(); - for (int i = 0; i < NUM_CHECKS; i++) { - ivec3 tx_pix = transform_sample(pix, rct_y_coeff[i]); - pix_buf[gl_LocalInvocationID.x + 1][gl_LocalInvocationID.y + 1] = tx_pix; - memoryBarrierShared(); + uint dist = 0u; + if (valid) + dist = get_dist(tx_pix); - uint dist = get_dist(tx_pix); - atomicAdd(score_mode[i], dist); - } + uint sum = subgroupAdd(dist); + if (subgroupElect()) + atomicAdd(score_mode[i], sum); } void coeff_search(uint slice_idx) @@ -120,9 +128,23 @@ void coeff_search(uint slice_idx) uint sye = slice_coord(img_size.y, gl_WorkGroupID.y + 1, gl_NumWorkGroups.y, 0); - for (uint y = sys + gl_LocalInvocationID.y; y < sye; y += gl_WorkGroupSize.y) { - for (uint x = sxs + gl_LocalInvocationID.x; x < sxe; x += gl_WorkGroupSize.x) { - process(ivec2(x, y)); + /* Uniform iteration: every lane in the workgroup runs the same number + * of tile iterations so that the in-process barrier is always reached + * by all lanes. Lanes outside the slice extents pass valid=false. */ + uint sw = sxe - sxs; + uint sh = sye - sys; + uint n_xi = (sw + gl_WorkGroupSize.x - 1u) / gl_WorkGroupSize.x; + uint n_yi = (sh + gl_WorkGroupSize.y - 1u) / gl_WorkGroupSize.y; + + for (uint yi = 0u; yi < n_yi; yi++) { + uint y = sys + yi*gl_WorkGroupSize.y + gl_LocalInvocationID.y; + for (uint xi = 0u; xi < n_xi; xi++) { + uint x = sxs + xi*gl_WorkGroupSize.x + gl_LocalInvocationID.x; + bool valid = (x < sxe) && (y < sye); + for (int i = 0; i < NUM_CHECKS; i++) { + process(ivec2(x, y), valid, i); + barrier(); + } } } _______________________________________________ ffmpeg-cvslog mailing list -- [email protected] To unsubscribe send an email to [email protected]
