PR #23104 opened by Lynne
URL: https://code.ffmpeg.org/FFmpeg/FFmpeg/pulls/23104
Patch URL: https://code.ffmpeg.org/FFmpeg/FFmpeg/pulls/23104.patch

This completes the port of every single filter to compile-time SPIR-V generation.
scale_vulkan's GLSL will be fully replaced by swscale, and swscale will soon
drop its glslang dependency.


>From fe16edf5aafd6fe59e56d812d4cd690aa9918cd7 Mon Sep 17 00:00:00 2001
From: Lynne <[email protected]>
Date: Thu, 14 May 2026 17:09:50 +0900
Subject: [PATCH 1/2] vf_nlmeans_vulkan: port to compile-time SPIR-V generation

---
 configure                                     |   2 +-
 libavfilter/vf_nlmeans_vulkan.c               | 534 +++---------------
 libavfilter/vulkan/Makefile                   |   4 +
 libavfilter/vulkan/nlmeans_denoise.comp.glsl  |  86 +++
 .../vulkan/nlmeans_horizontal.comp.glsl       | 104 ++++
 libavfilter/vulkan/nlmeans_vertical.comp.glsl | 122 ++++
 libavfilter/vulkan/nlmeans_weights.comp.glsl  | 144 +++++
 7 files changed, 550 insertions(+), 446 deletions(-)
 create mode 100644 libavfilter/vulkan/nlmeans_denoise.comp.glsl
 create mode 100644 libavfilter/vulkan/nlmeans_horizontal.comp.glsl
 create mode 100644 libavfilter/vulkan/nlmeans_vertical.comp.glsl
 create mode 100644 libavfilter/vulkan/nlmeans_weights.comp.glsl

diff --git a/configure b/configure
index 39a522e7e8..d953074c89 100755
--- a/configure
+++ b/configure
@@ -4222,7 +4222,7 @@ mptestsrc_filter_deps="gpl"
 msad_filter_select="scene_sad"
 negate_filter_deps="lut_filter"
 nlmeans_opencl_filter_deps="opencl"
-nlmeans_vulkan_filter_deps="vulkan spirv_library"
+nlmeans_vulkan_filter_deps="vulkan spirv_compiler"
 nnedi_filter_deps="gpl"
 ocr_filter_deps="libtesseract"
 ocv_filter_deps="libopencv"
diff --git a/libavfilter/vf_nlmeans_vulkan.c b/libavfilter/vf_nlmeans_vulkan.c
index c1430707b7..902c072669 100644
--- a/libavfilter/vf_nlmeans_vulkan.c
+++ b/libavfilter/vf_nlmeans_vulkan.c
@@ -19,19 +19,24 @@
  */
 
 #include "libavutil/mem.h"
-#include "libavutil/random_seed.h"
-#include "libavutil/vulkan_spirv.h"
 #include "libavutil/opt.h"
 #include "vulkan_filter.h"
 
 #include "filters.h"
 #include "video.h"
 
-#define TYPE_NAME  "vec4"
+extern const unsigned char ff_nlmeans_horizontal_comp_spv_data[];
+extern const unsigned int  ff_nlmeans_horizontal_comp_spv_len;
+extern const unsigned char ff_nlmeans_vertical_comp_spv_data[];
+extern const unsigned int  ff_nlmeans_vertical_comp_spv_len;
+extern const unsigned char ff_nlmeans_weights_comp_spv_data[];
+extern const unsigned int  ff_nlmeans_weights_comp_spv_len;
+extern const unsigned char ff_nlmeans_denoise_comp_spv_data[];
+extern const unsigned int  ff_nlmeans_denoise_comp_spv_len;
+
+/* Must be kept in sync with the definitions in the nlmeans_* shaders */
 #define TYPE_ELEMS 4
 #define TYPE_SIZE  (TYPE_ELEMS*4)
-#define TYPE_BLOCK_ELEMS 16
-#define TYPE_BLOCK_SIZE (TYPE_SIZE * TYPE_BLOCK_ELEMS)
 #define WG_SIZE 32
 
 typedef struct NLMeansVulkanContext {
@@ -80,210 +85,60 @@ typedef struct IntegralPushData {
     uint32_t nb_components;
 } IntegralPushData;
 
-static void shared_shd_def(FFVulkanShader *shd) {
-    GLSLC(0, #extension GL_ARB_gpu_shader_int64 : require                     
);
-    GLSLC(0,                                                                  
);
-    GLSLF(0, #define DTYPE %s                                                 
,TYPE_NAME);
-    GLSLF(0, #define T_ALIGN %i                                               
,TYPE_SIZE);
-    GLSLF(0, #define T_BLOCK_ELEMS %i                                         
,TYPE_BLOCK_ELEMS);
-    GLSLF(0, #define T_BLOCK_ALIGN %i                                         
,TYPE_BLOCK_SIZE);
-    GLSLC(0,                                                                  
);
-    GLSLC(0, layout(buffer_reference, buffer_reference_align = T_ALIGN) buffer 
DataBuffer {  );
-    GLSLC(1,     DTYPE v[];                                                   
);
-    GLSLC(0, };                                                               
);
-    GLSLC(0, struct Block {                                                   
);
-    GLSLC(1,     DTYPE data[T_BLOCK_ELEMS];                                   
);
-    GLSLC(0, };                                                               
);
-    GLSLC(0, layout(buffer_reference, buffer_reference_align = T_BLOCK_ALIGN) 
buffer BlockBuffer {  );
-    GLSLC(1,     Block v[];                                                   
);
-    GLSLC(0, };                                                               
);
-    GLSLC(0, layout(push_constant, std430) uniform pushConstants {            
);
-    GLSLC(1,     uvec4 width;                                                 
);
-    GLSLC(1,     uvec4 height;                                                
);
-    GLSLC(1,     vec4 strength;                                               
);
-    GLSLC(1,     uvec4 comp_off;                                              
);
-    GLSLC(1,     uvec4 comp_plane;                                            
);
-    GLSLC(1,     DataBuffer integral_base;                                    
);
-    GLSLC(1,     uint64_t integral_size;                                      
);
-    GLSLC(1,     uint64_t int_stride;                                         
);
-    GLSLC(1,     uint xyoffs_start;                                           
);
-    GLSLC(1,     uint nb_components;                                          
);
-    GLSLC(0, };                                                               
);
-    GLSLC(0,                                                                  
);
-
-    ff_vk_shader_add_push_const(shd, 0, sizeof(IntegralPushData),
-                                VK_SHADER_STAGE_COMPUTE_BIT);
-}
-
 static av_cold int init_integral_pipeline(FFVulkanContext *vkctx, FFVkExecPool 
*exec,
                                           FFVulkanShader *shd_horizontal,
                                           FFVulkanShader *shd_vertical,
-                                          FFVkSPIRVCompiler *spv,
-                                          const AVPixFmtDescriptor *desc, int 
planes)
+                                          int planes)
 {
     int err;
-    uint8_t *spv_data;
-    size_t spv_len;
-    void *spv_opaque = NULL;
     FFVulkanShader *shd;
-    FFVulkanDescriptorSetBinding *desc_set;
 
+    /* Horizontal pass */
     shd = shd_horizontal;
-    RET(ff_vk_shader_init(vkctx, shd, "nlmeans_horizontal",
-                          VK_SHADER_STAGE_COMPUTE_BIT,
-                          (const char *[]) { "GL_EXT_buffer_reference",
-                                             "GL_EXT_buffer_reference2" }, 2,
-                          WG_SIZE, 1, 1,
-                          0));
-    shared_shd_def(shd);
+    ff_vk_shader_load(shd, VK_SHADER_STAGE_COMPUTE_BIT, NULL,
+                      (uint32_t []) { WG_SIZE, 1, 1 }, 0);
 
-    GLSLC(0,                                                                   
  );
-    GLSLC(0, void main()                                                       
  );
-    GLSLC(0, {                                                                 
  );
-    GLSLC(1,     uint64_t offset;                                              
  );
-    GLSLC(1,     DataBuffer dst;                                               
  );
-    GLSLC(1,     BlockBuffer b_dst;                                            
  );
-    GLSLC(1,     Block block;                                                  
  );
-    GLSLC(1,     DTYPE s2;                                                     
  );
-    GLSLC(1,     DTYPE prefix_sum;                                             
  );
-    GLSLC(1,     ivec2 pos;                                                    
  );
-    GLSLC(1,     int k;                                                        
  );
-    GLSLC(1,     int o;                                                        
  );
-    GLSLC(0,                                                                   
  );
-    GLSLC(1,     DataBuffer integral_data;                                     
  );
-    GLSLC(0,                                                                   
  );
-    GLSLC(1,     uint c_plane;                                                 
  );
-    GLSLC(0,                                                                   
  );
-    GLSLC(1,     uint comp_idx = uint(gl_WorkGroupID.y);                       
  );
-    GLSLC(1,     uint invoc_idx = uint(gl_WorkGroupID.z);                      
  );
-    GLSLC(0,                                                                   
  );
-    GLSLC(1,     if (strength[comp_idx] == 0.0)                                
  );
-    GLSLC(2,         return;                                                   
  );
-    GLSLC(0,                                                                   
  );
-    GLSLC(1,     offset = integral_size * (invoc_idx * nb_components + 
comp_idx); );
-    GLSLC(1,     integral_data = DataBuffer(uint64_t(integral_base) + offset); 
  );
-    GLSLC(0,                                                                   
  );
-    GLSLC(1,     c_plane = comp_plane[comp_idx];                               
  );
-    GLSLC(0,                                                                   
  );
-    GLSLC(1,     pos.y = int(gl_GlobalInvocationID.x);                         
  );
-    GLSLC(1,     if (pos.y < height[c_plane]) {                                
  );
-    GLSLC(2,         prefix_sum = DTYPE(0);                                    
  );
-    GLSLC(2,         offset = int_stride * uint64_t(pos.y);                    
  );
-    GLSLC(2,         b_dst = BlockBuffer(uint64_t(integral_data) + offset);    
  );
-    GLSLC(0,                                                                   
  );
-    GLSLC(2,         for (k = 0; k * T_BLOCK_ELEMS < width[c_plane]; k++) {    
  );
-    GLSLC(3,             block = b_dst.v[k];                                   
  );
-    GLSLC(3,             for (o = 0; o < T_BLOCK_ELEMS; o++) {                 
  );
-    GLSLC(4,                 s2 = block.data[o];                               
  );
-    GLSLC(4,                 block.data[o] = s2 + prefix_sum;                  
  );
-    GLSLC(4,                 prefix_sum += s2;                                 
  );
-    GLSLC(3,             }                                                     
  );
-    GLSLC(3,             b_dst.v[k] = block;                                   
  );
-    GLSLC(2,         }                                                         
  );
-    GLSLC(1,     }                                                             
  );
-    GLSLC(0, }                                                                 
  );
+    ff_vk_shader_add_push_const(shd, 0, sizeof(IntegralPushData),
+                                VK_SHADER_STAGE_COMPUTE_BIT);
 
-    RET(spv->compile_shader(vkctx, spv, shd, &spv_data, &spv_len, "main", 
&spv_opaque));
-    RET(ff_vk_shader_link(vkctx, shd, spv_data, spv_len, "main"));
+    RET(ff_vk_shader_link(vkctx, shd,
+                          ff_nlmeans_horizontal_comp_spv_data,
+                          ff_nlmeans_horizontal_comp_spv_len, "main"));
 
     RET(ff_vk_shader_register_exec(vkctx, exec, shd));
 
+    /* Vertical pass */
     shd = shd_vertical;
-    RET(ff_vk_shader_init(vkctx, shd, "nlmeans_vertical",
-                          VK_SHADER_STAGE_COMPUTE_BIT,
-                          (const char *[]) { "GL_EXT_buffer_reference",
-                                             "GL_EXT_buffer_reference2" }, 2,
-                          WG_SIZE, 1, 1,
-                          0));
-    shared_shd_def(shd);
+    ff_vk_shader_load(shd, VK_SHADER_STAGE_COMPUTE_BIT, NULL,
+                      (uint32_t []) { WG_SIZE, 1, 1 }, 0);
 
-    desc_set = (FFVulkanDescriptorSetBinding []) {
-        {
-            .name       = "input_img",
-            .type       = VK_DESCRIPTOR_TYPE_STORAGE_IMAGE,
-            .mem_layout = ff_vk_shader_rep_fmt(vkctx->input_format, 
FF_VK_REP_FLOAT),
-            .mem_quali  = "readonly",
-            .dimensions = 2,
-            .elems      = planes,
-            .stages     = VK_SHADER_STAGE_COMPUTE_BIT,
+    ff_vk_shader_add_push_const(shd, 0, sizeof(IntegralPushData),
+                                VK_SHADER_STAGE_COMPUTE_BIT);
+
+    const FFVulkanDescriptorSetBinding desc_set_img[] = {
+        { /* input_img */
+            .type   = VK_DESCRIPTOR_TYPE_STORAGE_IMAGE,
+            .stages = VK_SHADER_STAGE_COMPUTE_BIT,
+            .elems  = planes,
         },
     };
-    RET(ff_vk_shader_add_descriptor_set(vkctx, shd, desc_set, 1, 0, 0));
+    ff_vk_shader_add_descriptor_set(vkctx, shd, desc_set_img, 1, 0, 0);
 
-    desc_set = (FFVulkanDescriptorSetBinding []) {
-        {
-            .name        = "xyoffsets_buffer",
-            .type        = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
-            .mem_quali   = "readonly",
-            .stages      = VK_SHADER_STAGE_COMPUTE_BIT,
-            .buf_content = "ivec2 xyoffsets[];",
+    const FFVulkanDescriptorSetBinding desc_set_xyoffsets[] = {
+        { /* xyoffsets_buffer */
+            .type   = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
+            .stages = VK_SHADER_STAGE_COMPUTE_BIT,
         },
     };
-    RET(ff_vk_shader_add_descriptor_set(vkctx, shd, desc_set, 1, 1, 0));
+    ff_vk_shader_add_descriptor_set(vkctx, shd, desc_set_xyoffsets, 1, 1, 0);
 
-    GLSLC(0,                                                                   
  );
-    GLSLC(0, void main()                                                       
  );
-    GLSLC(0, {                                                                 
  );
-    GLSLC(1,     uint64_t offset;                                              
  );
-    GLSLC(1,     DataBuffer dst;                                               
  );
-    GLSLC(1,     float s1;                                                     
  );
-    GLSLC(1,     DTYPE s2;                                                     
  );
-    GLSLC(1,     DTYPE prefix_sum;                                             
  );
-    GLSLC(1,     uvec2 size;                                                   
  );
-    GLSLC(1,     ivec2 pos;                                                    
  );
-    GLSLC(1,     ivec2 pos_off;                                                
  );
-    GLSLC(0,                                                                   
  );
-    GLSLC(1,     DataBuffer integral_data;                                     
  );
-    GLSLF(1,     ivec2 offs[%i];                                               
  ,TYPE_ELEMS);
-    GLSLC(0,                                                                   
  );
-    GLSLC(1,     uint c_off;                                                   
  );
-    GLSLC(1,     uint c_plane;                                                 
  );
-    GLSLC(0,                                                                   
  );
-    GLSLC(1,     uint comp_idx = uint(gl_WorkGroupID.y);                       
  );
-    GLSLC(1,     uint invoc_idx = uint(gl_WorkGroupID.z);                      
  );
-    GLSLC(0,                                                                   
  );
-    GLSLC(1,     if (strength[comp_idx] == 0.0)                                
  );
-    GLSLC(2,         return;                                                   
  );
-    GLSLC(0,                                                                   
  );
-    GLSLC(1,     offset = integral_size * (invoc_idx * nb_components + 
comp_idx); );
-    GLSLC(1,     integral_data = DataBuffer(uint64_t(integral_base) + offset); 
  );
-    for (int i = 0; i < TYPE_ELEMS; i++)
-        GLSLF(1, offs[%i] = xyoffsets[xyoffs_start + %i*invoc_idx + %i];       
  ,i,TYPE_ELEMS,i);
-    GLSLC(0,                                                                   
  );
-    GLSLC(1,     c_off = comp_off[comp_idx];                                   
  );
-    GLSLC(1,     c_plane = comp_plane[comp_idx];                               
  );
-    GLSLC(1,     size = imageSize(input_img[c_plane]);                         
  );
-    GLSLC(0,                                                                   
  );
-    GLSLC(1,     pos.x = int(gl_GlobalInvocationID.x);                         
  );
-    GLSLC(1,     if (pos.x < width[c_plane]) {                                 
  );
-    GLSLC(2,         prefix_sum = DTYPE(0);                                    
  );
-    GLSLC(2,         for (pos.y = 0; pos.y < height[c_plane]; pos.y++) {       
  );
-    GLSLC(3,             offset = int_stride * uint64_t(pos.y);                
  );
-    GLSLC(3,             dst = DataBuffer(uint64_t(integral_data) + offset);   
  );
-    GLSLC(4,             s1 = imageLoad(input_img[c_plane], pos)[c_off];       
  );
-    for (int i = 0; i < TYPE_ELEMS; i++) {
-        GLSLF(4,         pos_off = pos + offs[%i];                             
  ,i);
-        GLSLC(4,         if (!IS_WITHIN(uvec2(pos_off), size))                 
  );
-        GLSLF(5,             s2[%i] = s1;                                      
  ,i);
-        GLSLC(4,         else                                                  
  );
-        GLSLF(5,             s2[%i] = imageLoad(input_img[c_plane], 
pos_off)[c_off]; ,i);
-    }
-    GLSLC(4,             s2 = (s1 - s2) * (s1 - s2);                           
  );
-    GLSLC(3,             dst.v[pos.x] = s2 + prefix_sum;                       
  );
-    GLSLC(3,             prefix_sum += s2;                                     
  );
-    GLSLC(2,         }                                                         
  );
-    GLSLC(1,     }                                                             
  );
-    GLSLC(0, }                                                                 
  );
-
-    RET(spv->compile_shader(vkctx, spv, shd, &spv_data, &spv_len, "main", 
&spv_opaque));
-    RET(ff_vk_shader_link(vkctx, shd, spv_data, spv_len, "main"));
+    RET(ff_vk_shader_link(vkctx, shd,
+                          ff_nlmeans_vertical_comp_spv_data,
+                          ff_nlmeans_vertical_comp_spv_len, "main"));
 
     RET(ff_vk_shader_register_exec(vkctx, exec, shd));
 
 fail:
-    if (spv_opaque)
-        spv->free_shader(spv, &spv_opaque);
-
     return err;
 }
 
@@ -305,172 +160,48 @@ typedef struct WeightsPushData {
 } WeightsPushData;
 
 static av_cold int init_weights_pipeline(FFVulkanContext *vkctx, FFVkExecPool 
*exec,
-                                         FFVulkanShader *shd,
-                                         FFVkSPIRVCompiler *spv,
-                                         const AVPixFmtDescriptor *desc,
-                                         int planes)
+                                         FFVulkanShader *shd, int planes)
 {
     int err;
-    uint8_t *spv_data;
-    size_t spv_len;
-    void *spv_opaque = NULL;
-    FFVulkanDescriptorSetBinding *desc_set;
 
-    RET(ff_vk_shader_init(vkctx, shd, "nlmeans_weights",
-                          VK_SHADER_STAGE_COMPUTE_BIT,
-                          (const char *[]) { "GL_EXT_buffer_reference",
-                                             "GL_EXT_buffer_reference2" }, 2,
-                          WG_SIZE, WG_SIZE, 1,
-                          0));
-
-    GLSLC(0, #extension GL_ARB_gpu_shader_int64 : require                     
);
-    GLSLC(0,                                                                  
);
-    GLSLF(0, #define DTYPE %s                                                 
,TYPE_NAME);
-    GLSLF(0, #define T_ALIGN %i                                               
,TYPE_SIZE);
-    GLSLC(0,                                                                  
);
-    GLSLC(0, layout(buffer_reference, buffer_reference_align = T_ALIGN) buffer 
DataBuffer {  );
-    GLSLC(1,     DTYPE v[];                                                   
);
-    GLSLC(0, };                                                               
);
-    GLSLC(0, layout(push_constant, std430) uniform pushConstants {            
);
-    GLSLC(1,     uvec4 width;                                                 
);
-    GLSLC(1,     uvec4 height;                                                
);
-    GLSLC(1,     uvec4 ws_offset;                                             
);
-    GLSLC(1,     uvec4 ws_stride;                                             
);
-    GLSLC(1,     ivec4 patch_size;                                            
);
-    GLSLC(1,     vec4 strength;                                               
);
-    GLSLC(1,     uvec4 comp_off;                                              
);
-    GLSLC(1,     uvec4 comp_plane;                                            
);
-    GLSLC(1,     DataBuffer integral_base;                                    
);
-    GLSLC(1,     uint64_t integral_size;                                      
);
-    GLSLC(1,     uint64_t int_stride;                                         
);
-    GLSLC(1,     uint xyoffs_start;                                           
);
-    GLSLC(1,     uint ws_count;                                               
);
-    GLSLC(1,     uint nb_components;                                          
);
-    GLSLC(0, };                                                               
);
-    GLSLC(0,                                                                  
);
+    ff_vk_shader_load(shd, VK_SHADER_STAGE_COMPUTE_BIT, NULL,
+                      (uint32_t []) { WG_SIZE, WG_SIZE, 1 }, 0);
 
     ff_vk_shader_add_push_const(shd, 0, sizeof(WeightsPushData),
                                 VK_SHADER_STAGE_COMPUTE_BIT);
 
-    desc_set = (FFVulkanDescriptorSetBinding []) {
-        {
-            .name       = "input_img",
-            .type       = VK_DESCRIPTOR_TYPE_STORAGE_IMAGE,
-            .mem_layout = ff_vk_shader_rep_fmt(vkctx->input_format, 
FF_VK_REP_FLOAT),
-            .mem_quali  = "readonly",
-            .dimensions = 2,
-            .elems      = planes,
-            .stages     = VK_SHADER_STAGE_COMPUTE_BIT,
+    const FFVulkanDescriptorSetBinding desc_set[] = {
+        { /* input_img */
+            .type   = VK_DESCRIPTOR_TYPE_STORAGE_IMAGE,
+            .stages = VK_SHADER_STAGE_COMPUTE_BIT,
+            .elems  = planes,
         },
-        {
-            .name        = "weights_buffer",
-            .type        = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
-            .stages      = VK_SHADER_STAGE_COMPUTE_BIT,
-            .buf_content = "float weights[];",
+        { /* weights_buffer */
+            .type   = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
+            .stages = VK_SHADER_STAGE_COMPUTE_BIT,
         },
-        {
-            .name        = "sums_buffer",
-            .type        = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
-            .stages      = VK_SHADER_STAGE_COMPUTE_BIT,
-            .buf_content = "float sums[];",
+        { /* sums_buffer */
+            .type   = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
+            .stages = VK_SHADER_STAGE_COMPUTE_BIT,
         },
     };
-    RET(ff_vk_shader_add_descriptor_set(vkctx, shd, desc_set, 3, 0, 0));
+    ff_vk_shader_add_descriptor_set(vkctx, shd, desc_set, 3, 0, 0);
 
-    desc_set = (FFVulkanDescriptorSetBinding []) {
-        {
-            .name        = "xyoffsets_buffer",
-            .type        = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
-            .mem_quali   = "readonly",
-            .stages      = VK_SHADER_STAGE_COMPUTE_BIT,
-            .buf_content = "ivec2 xyoffsets[];",
+    const FFVulkanDescriptorSetBinding desc_set_xyoffsets[] = {
+        { /* xyoffsets_buffer */
+            .type   = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
+            .stages = VK_SHADER_STAGE_COMPUTE_BIT,
         },
     };
-    RET(ff_vk_shader_add_descriptor_set(vkctx, shd, desc_set, 1, 1, 0));
+    ff_vk_shader_add_descriptor_set(vkctx, shd, desc_set_xyoffsets, 1, 1, 0);
 
-    GLSLC(0,                                                                   
  );
-    GLSLC(0, void main()                                                       
  );
-    GLSLC(0, {                                                                 
  );
-    GLSLC(1,     uint64_t offset;                                              
  );
-    GLSLC(1,     DataBuffer dst;                                               
  );
-    GLSLC(1,     uvec2 size;                                                   
  );
-    GLSLC(1,     ivec2 pos;                                                    
  );
-    GLSLC(1,     ivec2 pos_off;                                                
  );
-    GLSLC(1,     int p;                                                        
  );
-    GLSLC(1,     float s;                                                      
  );
-    GLSLC(0,                                                                   
  );
-    GLSLC(1,     DataBuffer integral_data;                                     
  );
-    GLSLF(1,     ivec2 offs[%i];                                               
  ,TYPE_ELEMS);
-    GLSLC(0,                                                                   
  );
-    GLSLC(1,     uint c_off;                                                   
  );
-    GLSLC(1,     uint c_plane;                                                 
  );
-    GLSLC(1,     uint ws_off;                                                  
  );
-    GLSLC(0,                                                                   
  );
-    GLSLC(1,     pos = ivec2(gl_GlobalInvocationID.xy);                        
  );
-    GLSLC(1,     uint comp_idx = uint(gl_WorkGroupID.z) %% nb_components;      
  );
-    GLSLC(1,     uint invoc_idx = uint(gl_WorkGroupID.z) / nb_components;      
  );
-    GLSLC(0,                                                                   
  );
-    GLSLC(1,     c_off = comp_off[comp_idx];                                   
  );
-    GLSLC(1,     c_plane = comp_plane[comp_idx];                               
  );
-    GLSLC(1,     p = patch_size[comp_idx];                                     
  );
-    GLSLC(1,     s = strength[comp_idx];                                       
  );
-    GLSLC(1,     if (s == 0.0 || pos.x < p || pos.y < p || pos.x >= 
width[c_plane] - p || pos.y >= height[c_plane] - p) );
-    GLSLC(2,         return;                                                   
  );
-    GLSLC(0,                                                                   
  );
-    GLSLC(1,     offset = integral_size * (invoc_idx * nb_components + 
comp_idx); );
-    GLSLC(1,     integral_data = DataBuffer(uint64_t(integral_base) + offset); 
  );
-    for (int i = 0; i < TYPE_ELEMS; i++)
-        GLSLF(1, offs[%i] = xyoffsets[xyoffs_start + %i*invoc_idx + %i];       
  ,i,TYPE_ELEMS,i);
-    GLSLC(0,                                                                   
  );
-    GLSLC(1,     ws_off = ws_count * invoc_idx + ws_offset[comp_idx] + pos.y * 
ws_stride[comp_idx] + pos.x; );
-    GLSLC(1,     size = imageSize(input_img[c_plane]);                         
  );
-    GLSLC(0,                                                                   
  );
-    GLSLC(1,     DTYPE a;                                                      
  );
-    GLSLC(1,     DTYPE b;                                                      
  );
-    GLSLC(1,     DTYPE c;                                                      
  );
-    GLSLC(1,     DTYPE d;                                                      
  );
-    GLSLC(0,                                                                   
  );
-    GLSLC(1,     DTYPE patch_diff;                                             
  );
-    GLSLC(1,     vec4 src;                                                     
  );
-    GLSLC(1,     vec4 w;                                                       
  );
-    GLSLC(1,     float w_sum;                                                  
  );
-    GLSLC(1,     float sum;                                                    
  );
-    GLSLC(0,                                                                   
  );
-    for (int i = 0; i < 4; i++) {
-        GLSLF(1,     pos_off = pos + offs[%i];                                 
  ,i);
-        GLSLC(1,     if (!IS_WITHIN(uvec2(pos_off), size))                     
  );
-        GLSLF(2,         src[%i] = imageLoad(input_img[c_plane], pos)[c_off];  
  ,i);
-        GLSLC(1,     else                                                      
  );
-        GLSLF(2,         src[%i] = imageLoad(input_img[c_plane], 
pos_off)[c_off]; ,i);
-    }
-    GLSLC(0,                                                                   
  );
-    GLSLC(1,         offset = int_stride * uint64_t(pos.y - p);                
  );
-    GLSLC(1,         dst = DataBuffer(uint64_t(integral_data) + offset);       
  );
-    GLSLC(1,         a = dst.v[pos.x - p];                                     
  );
-    GLSLC(1,         c = dst.v[pos.x + p];                                     
  );
-    GLSLC(1,         offset = int_stride * uint64_t(pos.y + p);                
  );
-    GLSLC(1,         dst = DataBuffer(uint64_t(integral_data) + offset);       
  );
-    GLSLC(1,         b = dst.v[pos.x - p];                                     
  );
-    GLSLC(1,         d = dst.v[pos.x + p];                                     
  );
-    GLSLC(0,                                                                   
  );
-    GLSLC(1,         patch_diff = d + a - b - c;                               
  );
-    GLSLC(1,         w = exp(patch_diff * s);                                  
  );
-    GLSLC(1,         w_sum = w[0] + w[1] + w[2] + w[3];                        
  );
-    GLSLC(1,         sum = dot(w, src * 255);                                  
  );
-    GLSLC(0,                                                                   
  );
-    GLSLC(1,         weights[ws_off] += w_sum;                                 
  );
-    GLSLC(1,         sums[ws_off] += sum;                                      
  );
-    GLSLC(0, }                                                                 
  );
-
-    RET(spv->compile_shader(vkctx, spv, shd, &spv_data, &spv_len, "main", 
&spv_opaque));
-    RET(ff_vk_shader_link(vkctx, shd, spv_data, spv_len, "main"));
+    RET(ff_vk_shader_link(vkctx, shd,
+                          ff_nlmeans_weights_comp_spv_data,
+                          ff_nlmeans_weights_comp_spv_len, "main"));
 
     RET(ff_vk_shader_register_exec(vkctx, exec, shd));
 
 fail:
-    if (spv_opaque)
-        spv->free_shader(spv, &spv_opaque);
-
     return err;
 }
 
@@ -485,121 +216,49 @@ typedef struct DenoisePushData {
 } DenoisePushData;
 
 static av_cold int init_denoise_pipeline(FFVulkanContext *vkctx, FFVkExecPool 
*exec,
-                                         FFVulkanShader *shd, 
FFVkSPIRVCompiler *spv,
-                                         const AVPixFmtDescriptor *desc, int 
planes)
+                                         FFVulkanShader *shd, int planes)
 {
     int err;
-    uint8_t *spv_data;
-    size_t spv_len;
-    void *spv_opaque = NULL;
-    FFVulkanDescriptorSetBinding *desc_set;
-    RET(ff_vk_shader_init(vkctx, shd, "nlmeans_denoise",
-                          VK_SHADER_STAGE_COMPUTE_BIT,
-                          (const char *[]) { "GL_EXT_buffer_reference",
-                                             "GL_EXT_buffer_reference2" }, 2,
-                          WG_SIZE, WG_SIZE, 1,
-                          0));
 
-    GLSLC(0, layout(push_constant, std430) uniform pushConstants {        );
-    GLSLC(1,    uvec4 comp_off;                                           );
-    GLSLC(1,    uvec4 comp_plane;                                         );
-    GLSLC(1,    uvec4 ws_offset;                                          );
-    GLSLC(1,    uvec4 ws_stride;                                          );
-    GLSLC(1,    uint32_t ws_count;                                        );
-    GLSLC(1,    uint32_t t;                                               );
-    GLSLC(1,    uint32_t nb_components;                                   );
-    GLSLC(0, };                                                           );
+    ff_vk_shader_load(shd, VK_SHADER_STAGE_COMPUTE_BIT, NULL,
+                      (uint32_t []) { WG_SIZE, WG_SIZE, 1 }, 0);
 
     ff_vk_shader_add_push_const(shd, 0, sizeof(DenoisePushData),
                                 VK_SHADER_STAGE_COMPUTE_BIT);
 
-    desc_set = (FFVulkanDescriptorSetBinding []) {
-        {
-            .name        = "input_img",
-            .type        = VK_DESCRIPTOR_TYPE_STORAGE_IMAGE,
-            .mem_layout  = ff_vk_shader_rep_fmt(vkctx->input_format, 
FF_VK_REP_FLOAT),
-            .mem_quali   = "readonly",
-            .dimensions  = 2,
-            .elems       = planes,
-            .stages      = VK_SHADER_STAGE_COMPUTE_BIT,
+    const FFVulkanDescriptorSetBinding desc_set_img[] = {
+        { /* input_img */
+            .type   = VK_DESCRIPTOR_TYPE_STORAGE_IMAGE,
+            .stages = VK_SHADER_STAGE_COMPUTE_BIT,
+            .elems  = planes,
         },
-        {
-            .name        = "output_img",
-            .type        = VK_DESCRIPTOR_TYPE_STORAGE_IMAGE,
-            .mem_layout  = ff_vk_shader_rep_fmt(vkctx->output_format, 
FF_VK_REP_FLOAT),
-            .mem_quali   = "writeonly",
-            .dimensions  = 2,
-            .elems       = planes,
-            .stages      = VK_SHADER_STAGE_COMPUTE_BIT,
+        { /* output_img */
+            .type   = VK_DESCRIPTOR_TYPE_STORAGE_IMAGE,
+            .stages = VK_SHADER_STAGE_COMPUTE_BIT,
+            .elems  = planes,
         },
     };
-    RET(ff_vk_shader_add_descriptor_set(vkctx, shd, desc_set, 2, 0, 0));
+    ff_vk_shader_add_descriptor_set(vkctx, shd, desc_set_img, 2, 0, 0); /* NOTE(review): return value ignored (the call this replaces was RET()-checked) - confirm */
 
-    desc_set = (FFVulkanDescriptorSetBinding []) {
-        {
-            .name        = "weights_buffer",
-            .type        = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
-            .mem_quali   = "readonly",
-            .stages      = VK_SHADER_STAGE_COMPUTE_BIT,
-            .buf_content = "float weights[];",
+    const FFVulkanDescriptorSetBinding desc_set_ws[] = {
+        { /* weights_buffer */
+            .type   = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
+            .stages = VK_SHADER_STAGE_COMPUTE_BIT,
         },
-        {
-            .name        = "sums_buffer",
-            .type        = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
-            .mem_quali   = "readonly",
-            .stages      = VK_SHADER_STAGE_COMPUTE_BIT,
-            .buf_content = "float sums[];",
+        { /* sums_buffer */
+            .type   = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
+            .stages = VK_SHADER_STAGE_COMPUTE_BIT,
         },
     };
+    ff_vk_shader_add_descriptor_set(vkctx, shd, desc_set_ws, 2, 0, 0); /* NOTE(review): return value ignored (the call this replaces was RET()-checked) - confirm */
 
-    RET(ff_vk_shader_add_descriptor_set(vkctx, shd, desc_set, 2, 0, 0));
-
-    GLSLC(0, void main()                                                      
);
-    GLSLC(0, {                                                                
);
-    GLSLC(1,     const ivec2 pos = ivec2(gl_GlobalInvocationID.xy);           
);
-    GLSLC(1,     const uint plane = uint(gl_WorkGroupID.z);                   
);
-    GLSLC(1,     const uvec2 size = imageSize(output_img[plane]);             
);
-    GLSLC(0,                                                                  
);
-    GLSLC(1,     uint c_off;                                                  
);
-    GLSLC(1,     uint c_plane;                                                
);
-    GLSLC(1,     uint ws_off;                                                 
);
-    GLSLC(0,                                                                  
);
-    GLSLC(1,     float w_sum;                                                 
);
-    GLSLC(1,     float sum;                                                   
);
-    GLSLC(1,     vec4 src;                                                    
);
-    GLSLC(1,     vec4 r;                                                      
);
-    GLSLC(1,     uint invoc_idx;                                              
);
-    GLSLC(1,     uint comp_idx;                                               
);
-    GLSLC(0,                                                                  
);
-    GLSLC(1,     if (!IS_WITHIN(pos, size))                                   
);
-    GLSLC(2,         return;                                                  
);
-    GLSLC(0,                                                                  
);
-    GLSLC(1,     src = imageLoad(input_img[plane], pos);                      
);
-    GLSLC(1,     for (comp_idx = 0; comp_idx < nb_components; comp_idx++) {   
);
-    GLSLC(2,         if (plane == comp_plane[comp_idx]) {                     
);
-    GLSLC(3,             w_sum = 0.0;                                         
);
-    GLSLC(3,             sum = 0.0;                                           
);
-    GLSLC(3,             for (invoc_idx = 0; invoc_idx < t; invoc_idx++) {    
);
-    GLSLC(4,                 ws_off = ws_count * invoc_idx + 
ws_offset[comp_idx] + pos.y * ws_stride[comp_idx] + pos.x; );
-    GLSLC(4,                 w_sum += weights[ws_off];                        
);
-    GLSLC(4,                 sum += sums[ws_off];                             
);
-    GLSLC(3,             }                                                    
);
-    GLSLC(3,             c_off = comp_off[comp_idx];                          
);
-    GLSLC(3,             r[c_off] = (sum + src[c_off] * 255) / (1.0 + w_sum) / 
255; );
-    GLSLC(2,         }                                                        
);
-    GLSLC(1,     }                                                            
);
-    GLSLC(1,     imageStore(output_img[plane], pos, r);                       
);
-    GLSLC(0, }                                                                
);
-
-    RET(spv->compile_shader(vkctx, spv, shd, &spv_data, &spv_len, "main", 
&spv_opaque));
-    RET(ff_vk_shader_link(vkctx, shd, spv_data, spv_len, "main"));
+    RET(ff_vk_shader_link(vkctx, shd,
+                          ff_nlmeans_denoise_comp_spv_data,
+                          ff_nlmeans_denoise_comp_spv_len, "main"));
 
     RET(ff_vk_shader_register_exec(vkctx, exec, shd));
 
 fail:
-    if (spv_opaque)
-        spv->free_shader(spv, &spv_opaque);
-
     return err;
 }
 
@@ -610,15 +269,9 @@ static av_cold int init_filter(AVFilterContext *ctx)
     NLMeansVulkanContext *s = ctx->priv;
     FFVulkanContext *vkctx = &s->vkctx;
     const int planes = av_pix_fmt_count_planes(s->vkctx.output_format);
-    FFVkSPIRVCompiler *spv = NULL;
     int *offsets_buf;
     int offsets_dispatched = 0, nb_dispatches = 0;
 
-    const AVPixFmtDescriptor *desc;
-    desc = av_pix_fmt_desc_get(vkctx->output_format);
-    if (!desc)
-        return AVERROR(EINVAL);
-
     if (!(s->opts.r & 1)) {
         s->opts.r |= 1;
         av_log(ctx, AV_LOG_WARNING, "Research size should be odd, setting to 
%i",
@@ -682,12 +335,6 @@ static av_cold int init_filter(AVFilterContext *ctx)
 
     s->opts.t = FFMIN(s->opts.t, (FFALIGN(s->nb_offsets, TYPE_ELEMS) / 
TYPE_ELEMS));
 
-    spv = ff_vk_spirv_init();
-    if (!spv) {
-        av_log(ctx, AV_LOG_ERROR, "Unable to initialize SPIR-V compiler!\n");
-        return AVERROR_EXTERNAL;
-    }
-
     s->qf = ff_vk_qf_find(vkctx, VK_QUEUE_COMPUTE_BIT, 0);
     if (!s->qf) {
         av_log(ctx, AV_LOG_ERROR, "Device has no compute queues\n");
@@ -698,11 +345,11 @@ static av_cold int init_filter(AVFilterContext *ctx)
     RET(ff_vk_exec_pool_init(vkctx, s->qf, &s->e, 1, 0, 0, 0, NULL));
 
     RET(init_integral_pipeline(vkctx, &s->e, &s->shd_horizontal, 
&s->shd_vertical,
-                               spv, desc, planes));
+                               planes));
 
-    RET(init_weights_pipeline(vkctx, &s->e, &s->shd_weights, spv, desc, 
planes));
+    RET(init_weights_pipeline(vkctx, &s->e, &s->shd_weights, planes));
 
-    RET(init_denoise_pipeline(vkctx, &s->e, &s->shd_denoise, spv, desc, 
planes));
+    RET(init_denoise_pipeline(vkctx, &s->e, &s->shd_denoise, planes));
 
     RET(ff_vk_shader_update_desc_buffer(vkctx, &s->e.contexts[0], 
&s->shd_vertical,
                                         1, 0, 0,
@@ -726,9 +373,6 @@ static av_cold int init_filter(AVFilterContext *ctx)
     s->initialized = 1;
 
 fail:
-    if (spv)
-        spv->uninit(&spv);
-
     return err;
 }
 
diff --git a/libavfilter/vulkan/Makefile b/libavfilter/vulkan/Makefile
index 6d25cf8a50..cd303e535e 100644
--- a/libavfilter/vulkan/Makefile
+++ b/libavfilter/vulkan/Makefile
@@ -15,3 +15,7 @@ OBJS-$(CONFIG_TRANSPOSE_VULKAN_FILTER) += 
vulkan/transpose.comp.spv.o
 OBJS-$(CONFIG_V360_VULKAN_FILTER) += vulkan/v360.comp.spv.o
 OBJS-$(CONFIG_INTERLACE_VULKAN_FILTER) += vulkan/interlace.comp.spv.o
 OBJS-$(CONFIG_XFADE_VULKAN_FILTER) += vulkan/xfade.comp.spv.o
+OBJS-$(CONFIG_NLMEANS_VULKAN_FILTER) += vulkan/nlmeans_horizontal.comp.spv.o \
+                                        vulkan/nlmeans_vertical.comp.spv.o \
+                                        vulkan/nlmeans_weights.comp.spv.o \
+                                        vulkan/nlmeans_denoise.comp.spv.o
diff --git a/libavfilter/vulkan/nlmeans_denoise.comp.glsl 
b/libavfilter/vulkan/nlmeans_denoise.comp.glsl
new file mode 100644
index 0000000000..974c09318f
--- /dev/null
+++ b/libavfilter/vulkan/nlmeans_denoise.comp.glsl
@@ -0,0 +1,86 @@
+/*
+ * Copyright (c) Lynne
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#pragma shader_stage(compute)
+
+#extension GL_EXT_shader_image_load_formatted : require
+#extension GL_EXT_scalar_block_layout : require
+#extension GL_EXT_nonuniform_qualifier : require
+#extension GL_EXT_shader_explicit_arithmetic_types : require
+
+layout (local_size_x_id = 253, local_size_y_id = 254, local_size_z_id = 255) 
in;
+
+layout (push_constant, scalar) uniform pushConstants {
+    uvec4 comp_off;
+    uvec4 comp_plane;
+    uvec4 ws_offset;
+    uvec4 ws_stride;
+    uint32_t ws_count;
+    uint32_t t;
+    uint32_t nb_components;
+};
+
+layout (set = 0, binding = 0) uniform readonly  image2D input_img[];
+layout (set = 0, binding = 1) uniform writeonly image2D output_img[];
+
+layout (set = 1, binding = 0, scalar) readonly buffer weights_buffer {
+    float weights[];
+};
+
+layout (set = 1, binding = 1, scalar) readonly buffer sums_buffer {
+    float sums[];
+};
+
+void main() /* Normalizes the accumulated weighted sums and stores the result in output_img */
+{
+    const ivec2 pos = ivec2(gl_GlobalInvocationID.xy);
+    const uint plane = uint(gl_WorkGroupID.z); /* Z workgroup index selects the image plane */
+    const ivec2 size = imageSize(output_img[plane]);
+
+    uint c_off;
+    uint c_plane; /* declared but not used in this shader */
+    uint ws_off;
+
+    float w_sum;
+    float sum;
+    vec4 src;
+    vec4 r; /* only the channels matched in the loop below are assigned */
+    uint invoc_idx;
+    uint comp_idx;
+
+    if (any(greaterThanEqual(pos, size))) /* invocation lies outside the output plane */
+        return;
+
+    src = imageLoad(input_img[plane], pos);
+    for (comp_idx = 0; comp_idx < nb_components; comp_idx++) {
+        if (plane == comp_plane[comp_idx]) { /* only components stored in this plane */
+            w_sum = 0.0;
+            sum = 0.0;
+            for (invoc_idx = 0; invoc_idx < t; invoc_idx++) { /* add up t regions placed ws_count elements apart */
+                ws_off = ws_count * invoc_idx + ws_offset[comp_idx] + pos.y * 
ws_stride[comp_idx] + pos.x;
+                w_sum += weights[ws_off];
+                sum += sums[ws_off];
+            }
+            c_off = comp_off[comp_idx];
+            r[c_off] = (sum + src[c_off] * 255) / (1.0 + w_sum) / 255; /* the source pixel itself enters with weight 1 */
+        }
+    }
+    imageStore(output_img[plane], pos, r);
+}
diff --git a/libavfilter/vulkan/nlmeans_horizontal.comp.glsl 
b/libavfilter/vulkan/nlmeans_horizontal.comp.glsl
new file mode 100644
index 0000000000..d1bd62ccb1
--- /dev/null
+++ b/libavfilter/vulkan/nlmeans_horizontal.comp.glsl
@@ -0,0 +1,104 @@
+/*
+ * Copyright (c) Lynne
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#pragma shader_stage(compute)
+
+#extension GL_EXT_scalar_block_layout : require
+#extension GL_EXT_buffer_reference : require
+#extension GL_EXT_buffer_reference2 : require
+#extension GL_EXT_shader_explicit_arithmetic_types : require
+#extension GL_ARB_gpu_shader_int64 : require
+
+/* Must be kept in sync with the definitions in vf_nlmeans_vulkan.c */
+#define DTYPE vec4
+#define T_ALIGN 16
+#define T_BLOCK_ELEMS 16
+#define T_BLOCK_ALIGN 256
+
+layout (local_size_x_id = 253, local_size_y_id = 254, local_size_z_id = 255) 
in;
+
+layout (buffer_reference, buffer_reference_align = T_ALIGN, scalar) buffer 
DataBuffer {
+    DTYPE v[];
+};
+
+struct Block {
+    DTYPE data[T_BLOCK_ELEMS];
+};
+
+layout (buffer_reference, buffer_reference_align = T_BLOCK_ALIGN, scalar) 
buffer BlockBuffer {
+    Block v[];
+};
+
+layout (push_constant, scalar) uniform pushConstants {
+    uvec4 width;
+    uvec4 height;
+    vec4 strength;
+    uvec4 comp_off;
+    uvec4 comp_plane;
+    DataBuffer integral_base;
+    uint64_t integral_size;
+    uint64_t int_stride;
+    uint xyoffs_start;
+    uint nb_components;
+};
+
+void main() /* In-place running sum along the rows of one integral region; one row per invocation */
+{
+    uint64_t offset;
+    BlockBuffer b_dst;
+    Block block;
+    DTYPE s2;
+    DTYPE prefix_sum;
+    ivec2 pos; /* only pos.y is used in this shader */
+    int k;
+    int o;
+
+    DataBuffer integral_data;
+
+    uint c_plane;
+
+    uint comp_idx = uint(gl_WorkGroupID.y); /* Y workgroup index: component */
+    uint invoc_idx = uint(gl_WorkGroupID.z); /* Z workgroup index: integral region */
+
+    if (strength[comp_idx] == 0.0) /* components with zero strength are skipped */
+        return;
+
+    offset = integral_size * (invoc_idx * nb_components + comp_idx);
+    integral_data = DataBuffer(uint64_t(integral_base) + offset); /* region start = base address + offset */
+
+    c_plane = comp_plane[comp_idx];
+
+    pos.y = int(gl_GlobalInvocationID.x); /* the X invocation index is the row number */
+    if (pos.y < height[c_plane]) {
+        prefix_sum = DTYPE(0);
+        offset = int_stride * uint64_t(pos.y);
+        b_dst = BlockBuffer(uint64_t(integral_data) + offset);
+
+        for (k = 0; k * T_BLOCK_ELEMS < width[c_plane]; k++) { /* whole blocks only: the last one may reach past width (presumably covered by row padding; confirm) */
+            block = b_dst.v[k];
+            for (o = 0; o < T_BLOCK_ELEMS; o++) {
+                s2 = block.data[o];
+                block.data[o] = s2 + prefix_sum; /* inclusive sum: element plus everything before it in the row */
+                prefix_sum += s2;
+            }
+            b_dst.v[k] = block;
+        }
+    }
+}
diff --git a/libavfilter/vulkan/nlmeans_vertical.comp.glsl 
b/libavfilter/vulkan/nlmeans_vertical.comp.glsl
new file mode 100644
index 0000000000..d5842f4a16
--- /dev/null
+++ b/libavfilter/vulkan/nlmeans_vertical.comp.glsl
@@ -0,0 +1,122 @@
+/*
+ * Copyright (c) Lynne
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#pragma shader_stage(compute)
+
+#extension GL_EXT_shader_image_load_formatted : require
+#extension GL_EXT_scalar_block_layout : require
+#extension GL_EXT_nonuniform_qualifier : require
+#extension GL_EXT_buffer_reference : require
+#extension GL_EXT_buffer_reference2 : require
+#extension GL_EXT_shader_explicit_arithmetic_types : require
+#extension GL_ARB_gpu_shader_int64 : require
+
+/* Must be kept in sync with the definitions in vf_nlmeans_vulkan.c */
+#define DTYPE vec4
+#define T_ALIGN 16
+#define T_BLOCK_ELEMS 16
+#define T_BLOCK_ALIGN 256
+#define TYPE_ELEMS 4
+
+layout (local_size_x_id = 253, local_size_y_id = 254, local_size_z_id = 255) 
in;
+
+layout (buffer_reference, buffer_reference_align = T_ALIGN, scalar) buffer 
DataBuffer {
+    DTYPE v[];
+};
+
+struct Block {
+    DTYPE data[T_BLOCK_ELEMS];
+};
+
+layout (buffer_reference, buffer_reference_align = T_BLOCK_ALIGN, scalar) 
buffer BlockBuffer {
+    Block v[];
+};
+
+layout (push_constant, scalar) uniform pushConstants {
+    uvec4 width;
+    uvec4 height;
+    vec4 strength;
+    uvec4 comp_off;
+    uvec4 comp_plane;
+    DataBuffer integral_base;
+    uint64_t integral_size;
+    uint64_t int_stride;
+    uint xyoffs_start;
+    uint nb_components;
+};
+
+layout (set = 0, binding = 0) uniform readonly image2D input_img[];
+
+layout (set = 1, binding = 0, scalar) readonly buffer xyoffsets_buffer {
+    ivec2 xyoffsets[];
+};
+
+void main() /* Writes squared pixel differences, summed down each column, into one integral region; one column per invocation */
+{
+    uint64_t offset;
+    DataBuffer dst;
+    float s1;
+    DTYPE s2;
+    DTYPE prefix_sum;
+    uvec2 size; /* unsigned, so negative coordinates also fail the bounds test below */
+    ivec2 pos;
+    ivec2 pos_off;
+
+    DataBuffer integral_data;
+    ivec2 offs[TYPE_ELEMS];
+
+    uint c_off;
+    uint c_plane;
+
+    uint comp_idx = uint(gl_WorkGroupID.y); /* Y workgroup index: component */
+    uint invoc_idx = uint(gl_WorkGroupID.z); /* Z workgroup index: integral region / offset group */
+
+    if (strength[comp_idx] == 0.0) /* components with zero strength are skipped */
+        return;
+
+    offset = integral_size * (invoc_idx * nb_components + comp_idx);
+    integral_data = DataBuffer(uint64_t(integral_base) + offset); /* region start = base address + offset */
+    for (uint i = 0; i < TYPE_ELEMS; i++) /* fetch the TYPE_ELEMS offsets handled by this invocation group */
+        offs[i] = xyoffsets[xyoffs_start + TYPE_ELEMS*invoc_idx + i];
+
+    c_off = comp_off[comp_idx];
+    c_plane = comp_plane[comp_idx];
+    size = imageSize(input_img[c_plane]);
+
+    pos.x = int(gl_GlobalInvocationID.x); /* the X invocation index is the column number */
+    if (pos.x < width[c_plane]) {
+        prefix_sum = DTYPE(0);
+        for (pos.y = 0; pos.y < height[c_plane]; pos.y++) {
+            offset = int_stride * uint64_t(pos.y);
+            dst = DataBuffer(uint64_t(integral_data) + offset);
+            s1 = imageLoad(input_img[c_plane], pos)[c_off];
+            for (int i = 0; i < TYPE_ELEMS; i++) {
+                pos_off = pos + offs[i];
+                if (any(greaterThanEqual(uvec2(pos_off), size))) /* out of bounds: reuse s1, giving a zero difference */
+                    s2[i] = s1;
+                else
+                    s2[i] = imageLoad(input_img[c_plane], pos_off)[c_off];
+            }
+            s2 = (s1 - s2) * (s1 - s2); /* squared difference, one lane per offset */
+            dst.v[pos.x] = s2 + prefix_sum; /* inclusive sum: this row plus all rows above */
+            prefix_sum += s2;
+        }
+    }
+}
diff --git a/libavfilter/vulkan/nlmeans_weights.comp.glsl 
b/libavfilter/vulkan/nlmeans_weights.comp.glsl
new file mode 100644
index 0000000000..24c918bd0a
--- /dev/null
+++ b/libavfilter/vulkan/nlmeans_weights.comp.glsl
@@ -0,0 +1,144 @@
+/*
+ * Copyright (c) Lynne
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#pragma shader_stage(compute)
+
+#extension GL_EXT_shader_image_load_formatted : require
+#extension GL_EXT_scalar_block_layout : require
+#extension GL_EXT_nonuniform_qualifier : require
+#extension GL_EXT_buffer_reference : require
+#extension GL_EXT_buffer_reference2 : require
+#extension GL_EXT_shader_explicit_arithmetic_types : require
+#extension GL_ARB_gpu_shader_int64 : require
+
+/* Must be kept in sync with the definitions in vf_nlmeans_vulkan.c */
+#define DTYPE vec4
+#define T_ALIGN 16
+#define TYPE_ELEMS 4
+
+layout (local_size_x_id = 253, local_size_y_id = 254, local_size_z_id = 255) 
in;
+
+layout (buffer_reference, buffer_reference_align = T_ALIGN, scalar) buffer 
DataBuffer {
+    DTYPE v[];
+};
+
+layout (push_constant, scalar) uniform pushConstants {
+    uvec4 width;
+    uvec4 height;
+    uvec4 ws_offset;
+    uvec4 ws_stride;
+    ivec4 patch_size;
+    vec4 strength;
+    uvec4 comp_off;
+    uvec4 comp_plane;
+    DataBuffer integral_base;
+    uint64_t integral_size;
+    uint64_t int_stride;
+    uint xyoffs_start;
+    uint ws_count;
+    uint nb_components;
+};
+
+layout (set = 0, binding = 0) uniform readonly image2D input_img[];
+
+layout (set = 0, binding = 1, scalar) buffer weights_buffer {
+    float weights[];
+};
+
+layout (set = 0, binding = 2, scalar) buffer sums_buffer {
+    float sums[];
+};
+
+layout (set = 1, binding = 0, scalar) readonly buffer xyoffsets_buffer {
+    ivec2 xyoffsets[];
+};
+
+void main() /* Accumulates weights and weighted sample sums for one pixel, one component and one group of TYPE_ELEMS offsets */
+{
+    uint64_t offset;
+    DataBuffer dst;
+    uvec2 size; /* unsigned, so negative coordinates also fail the bounds test below */
+    ivec2 pos;
+    ivec2 pos_off;
+    int p;
+    float s;
+
+    DataBuffer integral_data;
+    ivec2 offs[TYPE_ELEMS];
+
+    uint c_off;
+    uint c_plane;
+    uint ws_off;
+
+    pos = ivec2(gl_GlobalInvocationID.xy);
+    uint comp_idx = uint(gl_WorkGroupID.z) % nb_components; /* Z workgroup index packs the component ... */
+    uint invoc_idx = uint(gl_WorkGroupID.z) / nb_components; /* ... and the offset group */
+
+    c_off = comp_off[comp_idx];
+    c_plane = comp_plane[comp_idx];
+    p = patch_size[comp_idx];
+    s = strength[comp_idx];
+    if (s == 0.0 || pos.x < p || pos.y < p || pos.x + p >= width[c_plane] || 
pos.y + p >= height[c_plane])
+        return; /* zero strength, or the +-p window would leave the plane; pos + p is used because size - p wraps around when the unsigned size is smaller than p */
+
+    offset = integral_size * (invoc_idx * nb_components + comp_idx);
+    integral_data = DataBuffer(uint64_t(integral_base) + offset); /* region start = base address + offset */
+    for (uint i = 0; i < TYPE_ELEMS; i++) /* fetch the TYPE_ELEMS offsets handled by this invocation group */
+        offs[i] = xyoffsets[xyoffs_start + TYPE_ELEMS*invoc_idx + i];
+
+    ws_off = ws_count * invoc_idx + ws_offset[comp_idx] + pos.y * 
ws_stride[comp_idx] + pos.x;
+    size = imageSize(input_img[c_plane]);
+
+    DTYPE a;
+    DTYPE b;
+    DTYPE c;
+    DTYPE d;
+
+    DTYPE patch_diff;
+    vec4 src;
+    vec4 w;
+    float w_sum;
+    float sum;
+
+    for (int i = 0; i < TYPE_ELEMS; i++) { /* one sample per offset; same bound as offs[] instead of a literal 4 */
+        pos_off = pos + offs[i];
+        if (any(greaterThanEqual(uvec2(pos_off), size))) /* out of bounds: fall back to the pixel at pos */
+            src[i] = imageLoad(input_img[c_plane], pos)[c_off];
+        else
+            src[i] = imageLoad(input_img[c_plane], pos_off)[c_off];
+    }
+
+    offset = int_stride * uint64_t(pos.y - p); /* row y - p */
+    dst = DataBuffer(uint64_t(integral_data) + offset);
+    a = dst.v[pos.x - p];
+    c = dst.v[pos.x + p];
+    offset = int_stride * uint64_t(pos.y + p); /* row y + p */
+    dst = DataBuffer(uint64_t(integral_data) + offset);
+    b = dst.v[pos.x - p];
+    d = dst.v[pos.x + p];
+
+    patch_diff = d + a - b - c; /* four-corner difference of the summed buffer, one lane per offset */
+    w = exp(patch_diff * s);
+    w_sum = w[0] + w[1] + w[2] + w[3];
+    sum = dot(w, src * 255);
+
+    weights[ws_off] += w_sum; /* added to what earlier dispatches left at this slot */
+    sums[ws_off] += sum;
+}
-- 
2.52.0


>From f48c81e5fea86531322ec95cfaaedd610ba57805 Mon Sep 17 00:00:00 2001
From: Lynne <[email protected]>
Date: Tue, 21 Apr 2026 10:00:32 +0200
Subject: [PATCH 2/2] vf_blackdetect_vulkan: port to compile-time SPIR-V
 generation

---
 configure                                |   2 +-
 libavfilter/vf_blackdetect_vulkan.c      | 119 ++++++++---------------
 libavfilter/vulkan/Makefile              |   1 +
 libavfilter/vulkan/blackdetect.comp.glsl |  64 ++++++++++++
 4 files changed, 109 insertions(+), 77 deletions(-)
 create mode 100644 libavfilter/vulkan/blackdetect.comp.glsl

diff --git a/configure b/configure
index d953074c89..32c9aacc62 100755
--- a/configure
+++ b/configure
@@ -4149,7 +4149,7 @@ ass_filter_deps="libass"
 avgblur_opencl_filter_deps="opencl"
 avgblur_vulkan_filter_deps="vulkan spirv_compiler"
 azmq_filter_deps="libzmq"
-blackdetect_vulkan_filter_deps="vulkan spirv_library"
+blackdetect_vulkan_filter_deps="vulkan spirv_compiler"
 blackframe_filter_deps="gpl"
 blend_vulkan_filter_deps="vulkan spirv_compiler"
 boxblur_filter_deps="gpl"
diff --git a/libavfilter/vf_blackdetect_vulkan.c 
b/libavfilter/vf_blackdetect_vulkan.c
index 279b057148..3abe2f9fb3 100644
--- a/libavfilter/vf_blackdetect_vulkan.c
+++ b/libavfilter/vf_blackdetect_vulkan.c
@@ -19,13 +19,14 @@
  */
 
 #include <float.h>
-#include "libavutil/vulkan_spirv.h"
 #include "libavutil/opt.h"
 #include "libavutil/timestamp.h"
 #include "vulkan_filter.h"
 
 #include "filters.h"
-#include "video.h"
+
+extern const unsigned char ff_blackdetect_comp_spv_data[];
+extern const unsigned int ff_blackdetect_comp_spv_len;
 
 typedef struct BlackDetectVulkanContext {
     FFVulkanContext vkctx;
@@ -36,12 +37,16 @@ typedef struct BlackDetectVulkanContext {
     FFVulkanShader shd;
     AVBufferPool *sum_buf_pool;
 
-    double black_min_duration_time;
-    double picture_black_ratio_th;
-    double pixel_black_th;
-    int    alpha;
+    double  picture_black_ratio_th;
+    double  pixel_black_th;
+    int     alpha;
 
-    int64_t black_start;
+    int     black_started;
+    int64_t black_start;             ///< pts start time of the first black 
picture
+    int64_t black_end;               ///< pts end time of the last black 
picture
+    double  black_min_duration_time; ///< minimum duration of detected black, 
in seconds
+    int64_t black_min_duration;      ///< minimum duration of detected black, 
expressed in timebase units
+    AVRational time_base;
 } BlackDetectVulkanContext;
 
 typedef struct BlackDetectPushData {
@@ -56,14 +61,9 @@ typedef struct BlackDetectBuf {
 static av_cold int init_filter(AVFilterContext *ctx)
 {
     int err;
-    uint8_t *spv_data;
-    size_t spv_len;
-    void *spv_opaque = NULL;
     BlackDetectVulkanContext *s = ctx->priv;
     FFVulkanContext *vkctx = &s->vkctx;
-    FFVulkanShader *shd;
-    FFVkSPIRVCompiler *spv;
-    FFVulkanDescriptorSetBinding *desc;
+    const AVFilterLink *inlink = ctx->inputs[0];
     const int plane = s->alpha ? 3 : 0;
 
     const AVPixFmtDescriptor *pixdesc = 
av_pix_fmt_desc_get(s->vkctx.input_format);
@@ -72,12 +72,6 @@ static av_cold int init_filter(AVFilterContext *ctx)
         return AVERROR(ENOTSUP);
     }
 
-    spv = ff_vk_spirv_init();
-    if (!spv) {
-        av_log(ctx, AV_LOG_ERROR, "Unable to initialize SPIR-V compiler!\n");
-        return AVERROR_EXTERNAL;
-    }
-
     s->qf = ff_vk_qf_find(vkctx, VK_QUEUE_COMPUTE_BIT, 0);
     if (!s->qf) {
         av_log(ctx, AV_LOG_ERROR, "Device has no compute queues\n");
@@ -86,89 +80,58 @@ static av_cold int init_filter(AVFilterContext *ctx)
     }
 
     RET(ff_vk_exec_pool_init(vkctx, s->qf, &s->e, s->qf->num*4, 0, 0, 0, 
NULL));
-    RET(ff_vk_shader_init(vkctx, &s->shd, "blackdetect",
-                          VK_SHADER_STAGE_COMPUTE_BIT,
-                          (const char *[]) { "GL_KHR_shader_subgroup_ballot" 
}, 1,
-                          32, 32, 1,
-                          0));
-    shd = &s->shd;
 
-    GLSLC(0, layout(push_constant, std430) uniform pushConstants {            
);
-    GLSLC(1,     float threshold;                                             
);
-    GLSLC(0, };                                                               
);
+    SPEC_LIST_CREATE(sl, 2, 2*sizeof(uint32_t))
+    SPEC_LIST_ADD(sl, 0, 32, plane);
+    SPEC_LIST_ADD(sl, 1, 32, SLICES);
 
-    ff_vk_shader_add_push_const(shd, 0, sizeof(BlackDetectPushData),
+    ff_vk_shader_load(&s->shd, VK_SHADER_STAGE_COMPUTE_BIT, sl,
+                      (uint32_t []) { 32, 32, 1 }, 0); /* 32x32x1 workgroup; element type matches the nlmeans_vulkan call */
+
+    ff_vk_shader_add_push_const(&s->shd, 0, sizeof(BlackDetectPushData),
                                 VK_SHADER_STAGE_COMPUTE_BIT);
 
-    desc = (FFVulkanDescriptorSetBinding []) {
-        {
-            .name       = "input_img",
+    const FFVulkanDescriptorSetBinding desc[] = {
+        { /* input_img */
             .type       = VK_DESCRIPTOR_TYPE_STORAGE_IMAGE,
-            .mem_layout = ff_vk_shader_rep_fmt(s->vkctx.input_format, 
FF_VK_REP_FLOAT),
-            .mem_quali  = "readonly",
-            .dimensions = 2,
-            .elems      = av_pix_fmt_count_planes(s->vkctx.input_format),
             .stages     = VK_SHADER_STAGE_COMPUTE_BIT,
-        }, {
-            .name        = "sum_buffer",
+            .elems      = av_pix_fmt_count_planes(s->vkctx.input_format),
+        },
+        { /* sum_buffer */
             .type        = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
             .stages      = VK_SHADER_STAGE_COMPUTE_BIT,
-            .buf_content = "uint slice_sum[];",
         }
     };
+    ff_vk_shader_add_descriptor_set(vkctx, &s->shd, desc, 2, 0, 0);
 
-    RET(ff_vk_shader_add_descriptor_set(vkctx, &s->shd, desc, 2, 0, 0));
-
-    GLSLC(0, shared uint wg_sum;                                              
);
-    GLSLC(0,                                                                  
);
-    GLSLC(0, void main()                                                      
);
-    GLSLC(0, {                                                                
);
-    GLSLC(1,     wg_sum = 0u;                                                 
);
-    GLSLC(1,     barrier();                                                   
);
-    GLSLC(0,                                                                  
);
-    GLSLC(1,     const ivec2 pos = ivec2(gl_GlobalInvocationID.xy);           
);
-    GLSLF(1,     if (!IS_WITHIN(pos, imageSize(input_img[%d])))               
,plane);
-    GLSLC(2,         return;                                                  
);
-    GLSLF(1,     float value = imageLoad(input_img[%d], pos).x;               
,plane);
-    GLSLC(1,     uvec4 isblack = subgroupBallot(value <= threshold);          
);
-    GLSLC(1,     if (subgroupElect())                                         
);
-    GLSLC(2,         atomicAdd(wg_sum, subgroupBallotBitCount(isblack));      
);
-    GLSLC(1,     barrier();                                                   
);
-    GLSLC(1,     if (gl_LocalInvocationIndex == 0u)                           
);
-    GLSLF(2,         atomicAdd(slice_sum[gl_WorkGroupID.x %% %du], wg_sum);   
,SLICES);
-    GLSLC(0, }                                                                
);
-
-    RET(spv->compile_shader(vkctx, spv, &s->shd, &spv_data, &spv_len, "main",
-                            &spv_opaque));
-    RET(ff_vk_shader_link(vkctx, &s->shd, spv_data, spv_len, "main"));
+    RET(ff_vk_shader_link(vkctx, &s->shd,
+                          ff_blackdetect_comp_spv_data,
+                          ff_blackdetect_comp_spv_len, "main"));
 
     RET(ff_vk_shader_register_exec(vkctx, &s->e, &s->shd));
 
+    s->time_base = inlink->time_base;
+    s->black_min_duration = s->black_min_duration_time / av_q2d(s->time_base);
     s->black_start = AV_NOPTS_VALUE;
     s->initialized = 1;
 
 fail:
-    if (spv_opaque)
-        spv->free_shader(spv, &spv_opaque);
-    if (spv)
-        spv->uninit(&spv);
-
     return err;
 }
 
 static void report_black_region(AVFilterContext *ctx, int64_t black_end)
 {
     BlackDetectVulkanContext *s = ctx->priv;
-    const AVFilterLink *inlink = ctx->inputs[0];
+    /* Logs the interval from s->black_start to black_end if it is long enough; does nothing while black_start is AV_NOPTS_VALUE */
     if (s->black_start == AV_NOPTS_VALUE)
         return;
 
-    if ((black_end - s->black_start) >= s->black_min_duration_time / 
av_q2d(inlink->time_base)) {
+    if ((black_end - s->black_start) >= s->black_min_duration) { /* threshold cached in time base units; NOTE(review): stored as int64_t, so a fractional threshold is truncated compared to the removed floating-point test */
         av_log(ctx, AV_LOG_INFO,
                "black_start:%s black_end:%s black_duration:%s\n",
-               av_ts2timestr(s->black_start, &inlink->time_base),
-               av_ts2timestr(black_end, &inlink->time_base),
-               av_ts2timestr(black_end - s->black_start, &inlink->time_base));
+               av_ts2timestr(s->black_start, &s->time_base), /* formatted with the time base cached in the context */
+               av_ts2timestr(black_end, &s->time_base),
+               av_ts2timestr(black_end - s->black_start, &s->time_base));
     }
 }
 
@@ -359,11 +322,15 @@ fail:
 static void blackdetect_vulkan_uninit(AVFilterContext *avctx)
 {
     BlackDetectVulkanContext *s = avctx->priv;
-    AVFilterLink *inlink = avctx->inputs[0];
-    FilterLink *inl = ff_filter_link(inlink);
     FFVulkanContext *vkctx = &s->vkctx;
 
-    report_black_region(avctx, inl->current_pts);
+    /* avctx->inputs[0] is NULL if the filter is freed before its input was
+     * ever linked (e.g. invalid options abort filter creation). s->initialized
+     * guarantees a frame was processed, so the input link is valid. */
+    if (s->initialized) {
+        FilterLink *inl = ff_filter_link(avctx->inputs[0]);
+        report_black_region(avctx, inl->current_pts);
+    }
 
     ff_vk_exec_pool_free(vkctx, &s->e);
     ff_vk_shader_free(vkctx, &s->shd);
diff --git a/libavfilter/vulkan/Makefile b/libavfilter/vulkan/Makefile
index cd303e535e..2cfe9cfa93 100644
--- a/libavfilter/vulkan/Makefile
+++ b/libavfilter/vulkan/Makefile
@@ -2,6 +2,7 @@ clean::
        $(RM) $(CLEANSUFFIXES:%=libavfilter/vulkan/%)
 
 OBJS-$(CONFIG_AVGBLUR_VULKAN_FILTER) += vulkan/avgblur.comp.spv.o
+OBJS-$(CONFIG_BLACKDETECT_VULKAN_FILTER) += vulkan/blackdetect.comp.spv.o
 OBJS-$(CONFIG_BLEND_VULKAN_FILTER) += vulkan/blend.comp.spv.o
 OBJS-$(CONFIG_BWDIF_VULKAN_FILTER) += vulkan/bwdif.comp.spv.o
 OBJS-$(CONFIG_CHROMABER_VULKAN_FILTER) += vulkan/chromaber.comp.spv.o
diff --git a/libavfilter/vulkan/blackdetect.comp.glsl b/libavfilter/vulkan/blackdetect.comp.glsl
new file mode 100644
index 0000000000..21e7601060
--- /dev/null
+++ b/libavfilter/vulkan/blackdetect.comp.glsl
@@ -0,0 +1,64 @@
+/*
+ * Copyright 2025 (c) Niklas Haas
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#pragma shader_stage(compute)
+
+#extension GL_EXT_shader_image_load_formatted : require
+#extension GL_EXT_scalar_block_layout : require
+#extension GL_EXT_nonuniform_qualifier : require
+#extension GL_KHR_shader_subgroup_ballot : require
+#extension GL_EXT_null_initializer : require
+
+layout (constant_id = 0) const uint plane = 0;
+layout (constant_id = 1) const uint slices = 0;
+
+layout (local_size_x_id = 253, local_size_y_id = 254, local_size_z_id = 255) in;
+
+layout (set = 0, binding = 0) uniform readonly image2D input_img[];
+layout (set = 0, binding = 1, scalar) buffer sum_buffer {
+    uint slice_sum[];
+};
+
+layout (push_constant, scalar) uniform pushConstants {
+    float threshold;
+};
+
+shared uint wg_sum = { };
+
+void main()
+{
+    ivec2 pos = ivec2(gl_GlobalInvocationID.xy);
+
+    /* Out-of-bounds invocations must still reach the barrier, but mustn't
+     * be counted: the threshold is positive, so the placeholder value of 0.0
+     * would otherwise be counted as black. */
+    bool in_bounds = all(lessThan(pos, imageSize(input_img[plane])));
+    float value = 0.0f;
+    if (in_bounds)
+        value = imageLoad(input_img[plane], pos).x;
+
+    uvec4 isblack = subgroupBallot(in_bounds && value <= threshold);
+    if (subgroupElect())
+        atomicAdd(wg_sum, subgroupBallotBitCount(isblack));
+
+    barrier();
+    if (gl_LocalInvocationIndex == 0)
+        atomicAdd(slice_sum[gl_WorkGroupID.x % slices], wg_sum);
+}
-- 
2.52.0

_______________________________________________
ffmpeg-devel mailing list -- [email protected]
To unsubscribe send an email to [email protected]

Reply via email to