PR #23431 opened by Lynne URL: https://code.ffmpeg.org/FFmpeg/FFmpeg/pulls/23431 Patch URL: https://code.ffmpeg.org/FFmpeg/FFmpeg/pulls/23431.patch
This adds an APV Vulkan encoder, to complement the decoder. >From 93f23f8c3664b12d513251012481d950221f3c1c Mon Sep 17 00:00:00 2001 From: Lynne <[email protected]> Date: Wed, 10 Jun 2026 15:33:32 +0900 Subject: [PATCH 1/2] cbs_apv: relax minimum tile-sizes --- libavcodec/cbs_apv_syntax_template.c | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/libavcodec/cbs_apv_syntax_template.c b/libavcodec/cbs_apv_syntax_template.c index cd26a4556c..1a97878bb9 100644 --- a/libavcodec/cbs_apv_syntax_template.c +++ b/libavcodec/cbs_apv_syntax_template.c @@ -115,12 +115,14 @@ static int FUNC(tile_info)(CodedBitstreamContext *ctx, RWContext *rw, CodedBitstreamAPVContext *priv = ctx->priv_data; int frame_width_in_mbs = (fh->frame_info.frame_width + 15) / 16; int frame_height_in_mbs = (fh->frame_info.frame_height + 15) / 16; - uint32_t min_tile_width = FFMAX(APV_MIN_TILE_WIDTH_IN_MBS, - (frame_width_in_mbs + APV_MAX_TILE_COLS - 1) / - APV_MAX_TILE_COLS); - uint32_t min_tile_height = FFMAX(APV_MIN_TILE_HEIGHT_IN_MBS, - (frame_height_in_mbs + APV_MAX_TILE_ROWS - 1) / - APV_MAX_TILE_ROWS); + /* The spec also demands tile_width >= APV_MIN_TILE_WIDTH_IN_MBS (16) + * and tile_height >= APV_MIN_TILE_HEIGHT_IN_MBS (8); we deliberately + * accept smaller tiles (down to the 20x20 grid cap, which the fixed + * arrays rely on) so sub-minimum experimental streams keep working. */ + uint32_t min_tile_width = (frame_width_in_mbs + APV_MAX_TILE_COLS - 1) / + APV_MAX_TILE_COLS; + uint32_t min_tile_height = (frame_height_in_mbs + APV_MAX_TILE_ROWS - 1) / + APV_MAX_TILE_ROWS; int err; u(20, tile_width_in_mbs, min_tile_width, MAX_UINT_BITS(20)); -- 2.52.0 >From 7a24385dc502d29620a6b938d3cfc9055982a415 Mon Sep 17 00:00:00 2001 From: Lynne <[email protected]> Date: Wed, 10 Jun 2026 15:35:56 +0900 Subject: [PATCH 2/2] lavc/apv_encode_vulkan: add a Vulkan APV encoder --- configure | 1 + libavcodec/Makefile | 1 + libavcodec/allcodecs.c | 1 + libavcodec/apv_encode_vulkan.c | 1104 ++++++++++++++++++ libavcodec/vulkan/Makefile | 7 +- libavcodec/vulkan/apv_encode_dct.comp.glsl | 204 ++++ libavcodec/vulkan/apv_encode_tiles.comp.glsl | 422 +++++++ 7 files changed, 1739 insertions(+), 1 deletion(-) create mode 100644 libavcodec/apv_encode_vulkan.c create mode 100644 libavcodec/vulkan/apv_encode_dct.comp.glsl create mode 100644 libavcodec/vulkan/apv_encode_tiles.comp.glsl diff --git a/configure b/configure index 1abf53b678..17b285b392 100755 --- a/configure +++ b/configure @@ -3147,6 +3147,7 @@ apng_encoder_select="deflate_wrapper llvidencdsp" aptx_encoder_select="audio_frame_queue" aptx_hd_encoder_select="audio_frame_queue" apv_decoder_select="cbs_apv" +apv_vulkan_encoder_select="vulkan spirv_compiler cbs_apv" asv1_decoder_select="blockdsp bswapdsp idctdsp" asv1_encoder_select="aandcttables bswapdsp fdctdsp pixblockdsp" asv2_decoder_select="blockdsp bswapdsp idctdsp" diff --git a/libavcodec/Makefile b/libavcodec/Makefile index c6b878207b..efa1b5aa75 100644 --- a/libavcodec/Makefile +++ b/libavcodec/Makefile @@ -252,6 +252,7 @@ OBJS-$(CONFIG_APTX_HD_ENCODER) += aptxenc.o aptx.o OBJS-$(CONFIG_APNG_DECODER) += png.o pngdec.o pngdsp.o OBJS-$(CONFIG_APNG_ENCODER) += png.o pngenc.o OBJS-$(CONFIG_APV_DECODER) += apv_decode.o apv_entropy.o apv_dsp.o +OBJS-$(CONFIG_APV_VULKAN_ENCODER) += apv_encode_vulkan.o OBJS-$(CONFIG_ARBC_DECODER) += arbc.o OBJS-$(CONFIG_ARGO_DECODER) += argo.o OBJS-$(CONFIG_SSA_DECODER) += assdec.o ass.o diff --git a/libavcodec/allcodecs.c b/libavcodec/allcodecs.c index 0815d46f79..53e5c256aa 100644 --- a/libavcodec/allcodecs.c +++ b/libavcodec/allcodecs.c @@ -50,6 +50,7 @@ extern const FFCodec ff_ansi_decoder; extern const FFCodec ff_apng_encoder; extern const FFCodec ff_apng_decoder; extern const FFCodec ff_apv_decoder; +extern const FFCodec ff_apv_vulkan_encoder; extern const FFCodec ff_arbc_decoder; extern const FFCodec ff_argo_decoder; extern const FFCodec ff_asv1_encoder; diff --git a/libavcodec/apv_encode_vulkan.c b/libavcodec/apv_encode_vulkan.c new file mode 100644 index 0000000000..058f692636 --- /dev/null +++ b/libavcodec/apv_encode_vulkan.c @@ -0,0 +1,1104 @@ +/* + * Copyright (c) 2026 Lynne <[email protected]> + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include <math.h> +#include <stdlib.h> + +#include "libavutil/mem.h" +#include "libavutil/opt.h" +#include "libavutil/pixdesc.h" +#include "libavutil/vulkan.h" +#include "libavutil/vulkan_spirv.h" + +#include "avcodec.h" +#include "codec_internal.h" +#include "encode.h" +#include "hwconfig.h" +#include "internal.h" + +#include "apv.h" +#include "cbs.h" +#include "cbs_apv.h" + +extern const unsigned char ff_apv_encode_dct_comp_spv_data[]; +extern const unsigned int ff_apv_encode_dct_comp_spv_len; + +extern const unsigned char ff_apv_encode_tiles_comp_spv_data[]; +extern const unsigned int ff_apv_encode_tiles_comp_spv_len; + +extern const unsigned char ff_seg_gather_comp_spv_data[]; +extern const unsigned int ff_seg_gather_comp_spv_len; + +#define APV_DEFAULT_QMAT 16 +#define APV_MAX_NUM_COMP 4 + +typedef struct DCTPushData { + int frame_dim[2]; + int tile_count[2]; + int tile_mb_dim[2]; + int log2_chroma_sub[2]; + int num_comp; + int bit_depth; + float qf[APV_MAX_NUM_COMP]; /* per-component fact/(level_scale*2^qp_shift) */ + uint8_t qmat[64]; /* quantisation matrix, raster order */ +} DCTPushData; + +typedef struct EntropyPushData { + VkDeviceAddress bytestream; + int tile_count[2]; + int num_comp; + uint32_t slot_size; + uint32_t comp_base; /* component index this dispatch's z=0 maps to */ + uint32_t blocks_per_tile; /* uniform coeff stride, in blocks */ + int frame_mb[2]; /* frame size in MBs (luma basis) */ + int tile_mb_dim[2]; /* full-tile size in MBs */ + uint32_t blocks_per_mb; /* blocks per MB of this dispatch's components */ +} EntropyPushData; + +typedef struct CompactPushData { + VkDeviceAddress sparse; + VkDeviceAddress compacted; + uint32_t slot_size; +} CompactPushData; + +typedef struct VulkanEncodeAPVFrameData { + AVBufferRef *coeffs_ref; + AVBufferRef *bytestream_ref; + AVBufferRef *compacted_ref; + AVBufferRef *sizes_ref; + + int64_t pts; + int64_t duration; + void *frame_opaque; + AVBufferRef *frame_opaque_ref; + int flags; +} VulkanEncodeAPVFrameData; + +typedef struct VulkanEncodeAPVContext { + const AVClass *class; + + FFVulkanContext s; + AVVulkanDeviceQueueFamily *qf; + FFVkExecPool exec_pool; + + FFVulkanShader shd_dct; + FFVulkanShader shd_entropy[2]; /* [0] luma-sized, [1] chroma-sized */ + FFVulkanShader shd_compact; + + /* Per-frame buffer pools */ + AVBufferPool *coeffs_pool; + AVBufferPool *bytestream_pool; + AVBufferPool *compacted_pool; + AVBufferPool *sizes_pool; + + /* DCT/quantize push constants -- encoder-constant, built once at init. */ + DCTPushData dct_push; + + /* CBS used to assemble the output packet */ + CodedBitstreamContext *cbc; + CodedBitstreamFragment au; + + AVFrame *frame; + + /* Async machinery */ + int async_depth; + int in_flight; + VulkanEncodeAPVFrameData *exec_ctx_info; + + /* Derived per-encoder state */ + int frame_mb_x, frame_mb_y; /* MBs in the frame (luma basis) */ + int tile_cols, tile_rows; + int tile_mb_w, tile_mb_h; /* MBs per tile (luma basis) */ + int tile_count; + int blocks_per_mb; /* luma; always 4 */ + int chroma_blocks_per_mb; /* 4 for 4:4:4, 2 for 4:2:2 */ + int num_comp; + int bit_depth; + enum AVPixelFormat sw_format; + + int profile_idc; + int level_idc; + int band_idc; + int chroma_format_idc; + + size_t coeffs_size; /* total size of coeffs buffer */ + size_t bytestream_size; /* total size of bytestream buffer */ + size_t slot_size; /* per-tile-component bytestream slot size */ + size_t sizes_size; /* total size of sizes buffer */ + + /* User options */ + int tile_w_mbs_opt; + int tile_h_mbs_opt; + int qp_y; + int qp_c; + int qmatrix; /* APV_QMATRIX_*: quantisation matrix select */ + + /* Benchmark knob (env APV_VULKAN_HEADERS_ONLY): the GPU still encodes, + * but the tiles are never downloaded and packets carry headers only. */ + int headers_only; + + /* Benchmark knob (env APV_VULKAN_SKIP_ENTROPY): skip the entropy + * dispatch to isolate the DCT pass. Implies headers_only. */ + int skip_entropy; +} VulkanEncodeAPVContext; + +/* + * HEVC default 8x8 intra scaling list (ITU-T H.265, Table 7-6): flat through + * the low-frequency core, a gentle ramp toward the high-frequency corner. + * Raster order; the matrix is symmetric, so APV's [y][x]/[x][y] indexing is + * immaterial. APV and HEVC share the "16 = neutral" convention, so the list + * transfers without rescaling. + */ +static const uint8_t apv_qmat_hevc_intra[64] = { + 16, 16, 16, 16, 17, 18, 21, 24, + 16, 16, 16, 16, 17, 19, 22, 25, + 16, 16, 17, 18, 20, 22, 25, 29, + 16, 16, 18, 21, 24, 27, 31, 36, + 17, 17, 20, 24, 30, 35, 41, 47, + 18, 19, 22, 27, 35, 44, 54, 65, + 21, 22, 25, 31, 41, 54, 70, 88, + 24, 25, 29, 36, 47, 65, 88, 115, +}; + +enum { + APV_QMATRIX_FLAT = 0, /* uniform 16 (the spec default) */ + APV_QMATRIX_HEVC = 1, /* HEVC default intra scaling list */ +}; + +/* + * The active quantisation-matrix value at raster index i. Both the q_matrix + * signalled in the frame header and the encoder's pf table are derived from + * this single accessor, so they cannot disagree -- a mismatch would quantise + * against a different matrix than the decoder dequantises with. + */ +static int apv_qmatrix_value(int qmatrix, int i) +{ + return qmatrix == APV_QMATRIX_HEVC ? apv_qmat_hevc_intra[i] + : APV_DEFAULT_QMAT; +} + +static const uint8_t apv_level_scale[6] = { 40, 45, 51, 57, 64, 71 }; + +static int chroma_format_from_pix_fmt(enum AVPixelFormat sw_fmt) +{ + switch (sw_fmt) { + case AV_PIX_FMT_YUV422P10: + case AV_PIX_FMT_YUV422P12: + return APV_CHROMA_FORMAT_422; + case AV_PIX_FMT_YUV444P10: + case AV_PIX_FMT_YUV444P12: + return APV_CHROMA_FORMAT_444; + case AV_PIX_FMT_GRAY10: + case AV_PIX_FMT_GRAY12: + return APV_CHROMA_FORMAT_400; + case AV_PIX_FMT_YUVA444P10: + case AV_PIX_FMT_YUVA444P12: + return APV_CHROMA_FORMAT_4444; + default: + return -1; + } +} + +static int profile_idc_from_pix_fmt(enum AVPixelFormat sw_fmt) +{ + switch (sw_fmt) { + case AV_PIX_FMT_GRAY10: return APV_PROFILE_400_10; + case AV_PIX_FMT_YUV422P10: return APV_PROFILE_422_10; + case AV_PIX_FMT_YUV422P12: return APV_PROFILE_422_12; + case AV_PIX_FMT_YUV444P10: return APV_PROFILE_444_10; + case AV_PIX_FMT_YUV444P12: return APV_PROFILE_444_12; + case AV_PIX_FMT_YUVA444P10: return APV_PROFILE_4444_10; + case AV_PIX_FMT_YUVA444P12: return APV_PROFILE_4444_12; + default: return -1; + } +} + +static int init_dct_shader(AVCodecContext *avctx) +{ + int err; + VulkanEncodeAPVContext *ev = avctx->priv_data; + FFVulkanShader *shd = &ev->shd_dct; + + SPEC_LIST_CREATE(sl, 1, sizeof(uint32_t)) + SPEC_LIST_ADD(sl, 16, 32, 4); /* nb_blocks: blocks_per_mb per workgroup */ + + ff_vk_shader_load(shd, VK_SHADER_STAGE_COMPUTE_BIT, sl, + (uint32_t []) { 8, 4, 1 }, 0); + + ff_vk_shader_add_push_const(shd, 0, sizeof(DCTPushData), + VK_SHADER_STAGE_COMPUTE_BIT); + + const FFVulkanDescriptorSetBinding desc_set[] = { + { + .name = "coeffs_buf", + .type = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, + .stages = VK_SHADER_STAGE_COMPUTE_BIT, + }, + { + .name = "src", + .type = VK_DESCRIPTOR_TYPE_STORAGE_IMAGE, + .stages = VK_SHADER_STAGE_COMPUTE_BIT, + .elems = av_pix_fmt_count_planes(ev->sw_format), + }, + }; + RET(ff_vk_shader_add_descriptor_set(&ev->s, shd, desc_set, 2, 0, 0)); + + RET(ff_vk_shader_link(&ev->s, shd, + ff_apv_encode_dct_comp_spv_data, + ff_apv_encode_dct_comp_spv_len, "main")); + + RET(ff_vk_shader_register_exec(&ev->s, &ev->exec_pool, shd)); + +fail: + return err; +} + +static int init_entropy_shader(AVCodecContext *avctx, int blocks_per_mb, + FFVulkanShader *shd) +{ + int err; + VulkanEncodeAPVContext *ev = avctx->priv_data; + + /* One workgroup per tile-component, one invocation per transform block. + * Luma and chroma tile-components hold different block counts under + * chroma sub-sampling, so each gets a pipeline with its own size. */ + uint32_t wg = ev->tile_mb_w * ev->tile_mb_h * blocks_per_mb; + + ff_vk_shader_load(shd, VK_SHADER_STAGE_COMPUTE_BIT, NULL, + (uint32_t []) { wg, 1, 1 }, 0); + + ff_vk_shader_add_push_const(shd, 0, sizeof(EntropyPushData), + VK_SHADER_STAGE_COMPUTE_BIT); + + const FFVulkanDescriptorSetBinding desc_set[] = { + { + .name = "coeffs_buf", + .type = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, + .stages = VK_SHADER_STAGE_COMPUTE_BIT, + }, + { + .name = "sizes_buf", + .type = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, + .stages = VK_SHADER_STAGE_COMPUTE_BIT, + }, + }; + RET(ff_vk_shader_add_descriptor_set(&ev->s, shd, desc_set, 2, 0, 0)); + + RET(ff_vk_shader_link(&ev->s, shd, + ff_apv_encode_tiles_comp_spv_data, + ff_apv_encode_tiles_comp_spv_len, "main")); + + RET(ff_vk_shader_register_exec(&ev->s, &ev->exec_pool, shd)); + +fail: + return err; +} + +static int init_compact_shader(AVCodecContext *avctx) +{ + int err; + VulkanEncodeAPVContext *ev = avctx->priv_data; + FFVulkanShader *shd = &ev->shd_compact; + + ff_vk_shader_load(shd, VK_SHADER_STAGE_COMPUTE_BIT, NULL, + (uint32_t []) { 256, 1, 1 }, 0); + + ff_vk_shader_add_push_const(shd, 0, sizeof(CompactPushData), + VK_SHADER_STAGE_COMPUTE_BIT); + + const FFVulkanDescriptorSetBinding desc_set[] = { + { + .name = "sizes_buf", + .type = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, + .stages = VK_SHADER_STAGE_COMPUTE_BIT, + }, + }; + RET(ff_vk_shader_add_descriptor_set(&ev->s, shd, desc_set, 1, 0, 0)); + + RET(ff_vk_shader_link(&ev->s, shd, + ff_seg_gather_comp_spv_data, + ff_seg_gather_comp_spv_len, "main")); + + RET(ff_vk_shader_register_exec(&ev->s, &ev->exec_pool, shd)); + +fail: + return err; +} + +/* + * The DCT/quantize shader's push constants are entirely encoder-constant: + * frame geometry, the per-component quant scale qf, and the quantisation + * matrix. Build them once -- nothing here changes between frames. + */ +static void build_dct_push_const(AVCodecContext *avctx) +{ + VulkanEncodeAPVContext *ev = avctx->priv_data; + const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(ev->sw_format); + DCTPushData *pd = &ev->dct_push; + const double fact = (double)(1 << (ev->bit_depth - 1)); + + pd->frame_dim[0] = avctx->width; + pd->frame_dim[1] = avctx->height; + pd->tile_count[0] = ev->tile_cols; + pd->tile_count[1] = ev->tile_rows; + pd->tile_mb_dim[0] = ev->tile_mb_w; + pd->tile_mb_dim[1] = ev->tile_mb_h; + pd->log2_chroma_sub[0] = desc->log2_chroma_w; + pd->log2_chroma_sub[1] = desc->log2_chroma_h; + pd->num_comp = ev->num_comp; + pd->bit_depth = ev->bit_depth; + + /* + * qf[c] = fact / (level_scale * 2^qp_shift). The encoder uses one QP per + * component, so this never varies by tile. Component 3 is alpha + * (4:4:4:4): full-resolution, so it takes the luma QP. + */ + for (int c = 0; c < APV_MAX_NUM_COMP; c++) { + int qp = (c == 0 || c == 3) ? ev->qp_y : ev->qp_c; + int level_scale = apv_level_scale[qp % 6]; + int qp_shift = qp / 6; + pd->qf[c] = + (float)(fact / ((double)level_scale * (double)(1 << qp_shift))); + } + + /* + * The 8-bit quantisation matrix. The shader stages it to shared memory + * and quantises with 1024 / qmat[i], the reciprocal partner of the + * decoder's per-coefficient dequant -- the same matrix that gets + * signalled in the frame header. + */ + for (int i = 0; i < 64; i++) + pd->qmat[i] = apv_qmatrix_value(ev->qmatrix, i); +} + +static int submit_frame(AVCodecContext *avctx, FFVkExecContext *exec, + AVFrame *frame) +{ + int err = 0; + VulkanEncodeAPVContext *ev = avctx->priv_data; + FFVulkanFunctions *vk = &ev->s.vkfn; + VulkanEncodeAPVFrameData *fd = exec->opaque; + VkImageView views[AV_NUM_DATA_POINTERS]; + + VkImageMemoryBarrier2 img_bar[AV_NUM_DATA_POINTERS]; + int nb_img_bar = 0; + VkBufferMemoryBarrier2 buf_bar[4]; + int nb_buf_bar = 0; + + FFVkBuffer *coeffs_buf; + FFVkBuffer *bytestream_buf; + FFVkBuffer *compacted_buf; + FFVkBuffer *sizes_buf; + + /* Allocate per-frame buffers */ + RET(ff_vk_get_pooled_buffer(&ev->s, &ev->coeffs_pool, &fd->coeffs_ref, + VK_BUFFER_USAGE_STORAGE_BUFFER_BIT, + NULL, ev->coeffs_size, + VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT)); + coeffs_buf = (FFVkBuffer *)fd->coeffs_ref->data; + + /* The entropy shader writes the bitstream here, sparsely -- one + * worst-case-sized slot per tile-component. Device-local, so those GPU + * writes stay in VRAM and never cross PCIe. */ + RET(ff_vk_get_pooled_buffer(&ev->s, &ev->bytestream_pool, + &fd->bytestream_ref, + VK_BUFFER_USAGE_STORAGE_BUFFER_BIT | + VK_BUFFER_USAGE_SHADER_DEVICE_ADDRESS_BIT, + NULL, ev->bytestream_size, + VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT)); + bytestream_buf = (FFVkBuffer *)fd->bytestream_ref->data; + + /* The compaction shader gathers the sparse slots into here, contiguous. + * Host-visible + host-cached so the CPU readback is a fast cached copy, + * and the GPU writes it as one coalesced sequential stream. */ + RET(ff_vk_get_pooled_buffer(&ev->s, &ev->compacted_pool, + &fd->compacted_ref, + VK_BUFFER_USAGE_STORAGE_BUFFER_BIT | + VK_BUFFER_USAGE_SHADER_DEVICE_ADDRESS_BIT, + NULL, ev->bytestream_size, + VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | + VK_MEMORY_PROPERTY_HOST_CACHED_BIT)); + compacted_buf = (FFVkBuffer *)fd->compacted_ref->data; + + RET(ff_vk_get_pooled_buffer(&ev->s, &ev->sizes_pool, &fd->sizes_ref, + VK_BUFFER_USAGE_STORAGE_BUFFER_BIT, + NULL, ev->sizes_size, + VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT | + VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | + VK_MEMORY_PROPERTY_HOST_COHERENT_BIT)); + sizes_buf = (FFVkBuffer *)fd->sizes_ref->data; + + ff_vk_exec_start(&ev->s, exec); + + ff_vk_exec_add_dep_buf(&ev->s, exec, &fd->coeffs_ref, 1, 1); + ff_vk_exec_add_dep_buf(&ev->s, exec, &fd->bytestream_ref, 1, 1); + ff_vk_exec_add_dep_buf(&ev->s, exec, &fd->compacted_ref, 1, 1); + ff_vk_exec_add_dep_buf(&ev->s, exec, &fd->sizes_ref, 1, 1); + + RET(ff_vk_exec_add_dep_frame(&ev->s, exec, frame, + VK_PIPELINE_STAGE_2_ALL_COMMANDS_BIT, + VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT)); + + RET(ff_vk_create_imageviews(&ev->s, exec, views, frame, FF_VK_REP_INT)); + + ff_vk_frame_barrier(&ev->s, exec, frame, + img_bar, &nb_img_bar, + VK_PIPELINE_STAGE_2_ALL_COMMANDS_BIT, + VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT, + VK_ACCESS_SHADER_READ_BIT, + VK_IMAGE_LAYOUT_GENERAL, + VK_QUEUE_FAMILY_IGNORED); + + vk->CmdPipelineBarrier2(exec->buf, &(VkDependencyInfo) { + .sType = VK_STRUCTURE_TYPE_DEPENDENCY_INFO, + .pImageMemoryBarriers = img_bar, + .imageMemoryBarrierCount = nb_img_bar, + }); + nb_img_bar = 0; + + /* DCT + Quantize pass */ + { + ff_vk_shader_update_desc_buffer(&ev->s, exec, &ev->shd_dct, + 0, 0, 0, + coeffs_buf, 0, coeffs_buf->size, + VK_FORMAT_UNDEFINED); + ff_vk_shader_update_img_array(&ev->s, exec, &ev->shd_dct, + frame, views, + 0, 1, + VK_IMAGE_LAYOUT_GENERAL, + VK_NULL_HANDLE); + + ff_vk_exec_bind_shader(&ev->s, exec, &ev->shd_dct); + ff_vk_shader_update_push_const(&ev->s, exec, &ev->shd_dct, + VK_SHADER_STAGE_COMPUTE_BIT, + 0, sizeof(ev->dct_push), &ev->dct_push); + + vk->CmdDispatch(exec->buf, + ev->frame_mb_x, ev->frame_mb_y, ev->num_comp); + } + + /* Barrier: wait for coeff writes before entropy */ + ff_vk_buf_barrier(buf_bar[nb_buf_bar++], coeffs_buf, + COMPUTE_SHADER_BIT, SHADER_WRITE_BIT, NONE, + COMPUTE_SHADER_BIT, SHADER_READ_BIT, NONE, + 0, coeffs_buf->size); + + vk->CmdPipelineBarrier2(exec->buf, &(VkDependencyInfo) { + .sType = VK_STRUCTURE_TYPE_DEPENDENCY_INFO, + .pBufferMemoryBarriers = buf_bar, + .bufferMemoryBarrierCount = nb_buf_bar, + }); + nb_buf_bar = 0; + + /* + * Entropy encoding pass. Luma (component 0) and chroma (components + * 1..num_comp-1) run as two dispatches: under chroma sub-sampling their + * tile-components hold different block counts, hence different workgroup + * sizes -- one pipeline each. The two write disjoint memory and need no + * barrier between them, so the GPU is free to overlap them. + */ + for (int p = 0; !ev->skip_entropy && p < 2; p++) { + FFVulkanShader *shd = &ev->shd_entropy[p]; + uint32_t z_comps = (p == 0) ? 1 : ev->num_comp - 1; + + if (z_comps == 0) + continue; /* 4:0:0 (monochrome) has no chroma components */ + + EntropyPushData pd = { + .bytestream = bytestream_buf->address, + .tile_count = { ev->tile_cols, ev->tile_rows }, + .num_comp = ev->num_comp, + .slot_size = (uint32_t)ev->slot_size, + .comp_base = (uint32_t)p, + .blocks_per_tile = (uint32_t)ev->tile_mb_w * ev->tile_mb_h * + ev->blocks_per_mb, + .frame_mb = { ev->frame_mb_x, ev->frame_mb_y }, + .tile_mb_dim = { ev->tile_mb_w, ev->tile_mb_h }, + .blocks_per_mb = (uint32_t)(p == 0 ? ev->blocks_per_mb + : ev->chroma_blocks_per_mb), + }; + + ff_vk_shader_update_desc_buffer(&ev->s, exec, shd, 0, 0, 0, + coeffs_buf, 0, coeffs_buf->size, + VK_FORMAT_UNDEFINED); + ff_vk_shader_update_desc_buffer(&ev->s, exec, shd, 0, 1, 0, + sizes_buf, 0, sizes_buf->size, + VK_FORMAT_UNDEFINED); + + ff_vk_exec_bind_shader(&ev->s, exec, shd); + ff_vk_shader_update_push_const(&ev->s, exec, shd, + VK_SHADER_STAGE_COMPUTE_BIT, + 0, sizeof(pd), &pd); + + vk->CmdDispatch(exec->buf, ev->tile_cols, ev->tile_rows, z_comps); + } + + /* Compaction pass: gather the sparse per-tile-component slots into one + * contiguous, host-visible buffer. Reads VRAM, writes the host buffer as + * a coalesced stream -- the device->host transfer the CPU then reads. */ + if (!ev->headers_only) { + ff_vk_buf_barrier(buf_bar[nb_buf_bar++], bytestream_buf, + COMPUTE_SHADER_BIT, SHADER_WRITE_BIT, NONE, + COMPUTE_SHADER_BIT, SHADER_READ_BIT, NONE, + 0, bytestream_buf->size); + ff_vk_buf_barrier(buf_bar[nb_buf_bar++], sizes_buf, + COMPUTE_SHADER_BIT, SHADER_WRITE_BIT, NONE, + COMPUTE_SHADER_BIT, SHADER_READ_BIT, NONE, + 0, sizes_buf->size); + vk->CmdPipelineBarrier2(exec->buf, &(VkDependencyInfo) { + .sType = VK_STRUCTURE_TYPE_DEPENDENCY_INFO, + .pBufferMemoryBarriers = buf_bar, + .bufferMemoryBarrierCount = nb_buf_bar, + }); + nb_buf_bar = 0; + + CompactPushData pd = { + .sparse = bytestream_buf->address, + .compacted = compacted_buf->address, + .slot_size = (uint32_t)ev->slot_size, + }; + + ff_vk_shader_update_desc_buffer(&ev->s, exec, &ev->shd_compact, + 0, 0, 0, + sizes_buf, 0, sizes_buf->size, + VK_FORMAT_UNDEFINED); + ff_vk_exec_bind_shader(&ev->s, exec, &ev->shd_compact); + ff_vk_shader_update_push_const(&ev->s, exec, &ev->shd_compact, + VK_SHADER_STAGE_COMPUTE_BIT, + 0, sizeof(pd), &pd); + + vk->CmdDispatch(exec->buf, ev->tile_count * ev->num_comp, 1, 1); + } + + err = ff_vk_exec_submit(&ev->s, exec); + if (err < 0) + return err; + + return 0; + +fail: + ff_vk_exec_discard_deps(&ev->s, exec); + return err; +} + +static int build_packet(AVCodecContext *avctx, FFVkExecContext *exec, + AVPacket *pkt) +{ + int err = 0; + VulkanEncodeAPVContext *ev = avctx->priv_data; + FFVulkanFunctions *vk = &ev->s.vkfn; + VulkanEncodeAPVFrameData *fd = exec->opaque; + FFVkBuffer *compacted_buf = (FFVkBuffer *)fd->compacted_ref->data; + FFVkBuffer *sizes_buf = (FFVkBuffer *)fd->sizes_ref->data; + APVRawFrame *raw_frame = NULL; + + /* Wait for the GPU encode to finish */ + ff_vk_exec_wait(&ev->s, exec); + + const uint32_t *sizes = NULL; + static uint8_t headers_only_tile; /* 1-byte token tile data */ + + /* Headers-only benchmark mode never touches the GPU output. */ + if (!ev->headers_only) { + /* Invalidate mapped memory if needed */ + if (!(compacted_buf->flags & VK_MEMORY_PROPERTY_HOST_COHERENT_BIT)) { + VkMappedMemoryRange r = { + .sType = VK_STRUCTURE_TYPE_MAPPED_MEMORY_RANGE, + .memory = compacted_buf->mem, + .offset = 0, + .size = VK_WHOLE_SIZE, + }; + vk->InvalidateMappedMemoryRanges(ev->s.hwctx->act_dev, 1, &r); + } + if (!(sizes_buf->flags & VK_MEMORY_PROPERTY_HOST_COHERENT_BIT)) { + VkMappedMemoryRange r = { + .sType = VK_STRUCTURE_TYPE_MAPPED_MEMORY_RANGE, + .memory = sizes_buf->mem, + .offset = 0, + .size = VK_WHOLE_SIZE, + }; + vk->InvalidateMappedMemoryRanges(ev->s.hwctx->act_dev, 1, &r); + } + sizes = (const uint32_t *)sizes_buf->mapped_mem; + } + + /* Allocate the cbs frame structure */ + raw_frame = av_mallocz(sizeof(*raw_frame)); + if (!raw_frame) + return AVERROR(ENOMEM); + + raw_frame->pbu_header.pbu_type = APV_PBU_PRIMARY_FRAME; + raw_frame->pbu_header.group_id = 1; + + APVRawFrameHeader *fh = &raw_frame->frame_header; + fh->frame_info.profile_idc = ev->profile_idc; + fh->frame_info.level_idc = ev->level_idc; + fh->frame_info.band_idc = ev->band_idc; + fh->frame_info.frame_width = avctx->width; + fh->frame_info.frame_height = avctx->height; + fh->frame_info.chroma_format_idc = ev->chroma_format_idc; + fh->frame_info.bit_depth_minus8 = ev->bit_depth - 8; + fh->frame_info.capture_time_distance = 0; + + fh->color_description_present_flag = 0; + /* Inferred values when the flag is 0, per the spec. */ + fh->color_primaries = 2; + fh->transfer_characteristics = 2; + fh->matrix_coefficients = 2; + fh->full_range_flag = 0; + + /* compute_pf_table() builds the encoder's pf scale from the same matrix; + * the two must stay in sync. use_q_matrix is only signalled when the + * matrix is non-uniform (a flat 16 matrix is the inferred default). */ + fh->use_q_matrix = ev->qmatrix != APV_QMATRIX_FLAT; + for (int c = 0; c < ev->num_comp; c++) + for (int y = 0; y < 8; y++) + for (int x = 0; x < 8; x++) + fh->quantization_matrix.q_matrix[c][y][x] = + apv_qmatrix_value(ev->qmatrix, y * 8 + x); + + fh->tile_info.tile_width_in_mbs = ev->tile_mb_w; + fh->tile_info.tile_height_in_mbs = ev->tile_mb_h; + fh->tile_info.tile_size_present_in_fh_flag = 0; + + /* Populate each tile. The compacted buffer holds each tile-component's + * data back to back, in (tile, component) order -- the same layout the + * gather shader produced. */ + uint32_t comp_off = 0; + for (int t = 0; t < ev->tile_count; t++) { + APVRawTile *tile = &raw_frame->tile[t]; + uint32_t total_tile_data = 0; + + tile->tile_header.tile_header_size = + 4 + ev->num_comp * (4 + 1) + 1; + tile->tile_header.tile_index = t; + + for (int c = 0; c < ev->num_comp; c++) { + uint32_t sz; + if (ev->headers_only) { + /* No readback: one token byte (CBS requires size >= 1). */ + sz = 1; + tile->tile_data[c] = &headers_only_tile; + } else { + sz = sizes[t * ev->num_comp + c]; + tile->tile_data[c] = compacted_buf->mapped_mem + comp_off; + comp_off += sz; + } + tile->tile_header.tile_data_size[c] = sz; + tile->tile_header.tile_qp[c] = + (c == 0 || c == 3) ? ev->qp_y : ev->qp_c; + total_tile_data += sz; + } + tile->tile_header.reserved_zero_8bits = 0; + tile->tile_dummy_byte_size = 0; + tile->tile_dummy_byte = NULL; + + raw_frame->tile_size[t] = + tile->tile_header.tile_header_size + total_tile_data; + } + + /* Assemble fragment using cbs_apv */ + ff_cbs_fragment_reset(&ev->au); + + err = ff_cbs_insert_unit_content(&ev->au, -1, APV_PBU_PRIMARY_FRAME, + raw_frame, NULL); + if (err < 0) { + av_freep(&raw_frame); + return err; + } + /* raw_frame is now owned by the fragment unit */ + raw_frame = NULL; + + /* Assemble straight into the packet: ff_cbs_write_packet() hands pkt a + * reference to CBS's own assembled buffer -- no copy. */ + err = ff_cbs_write_packet(ev->cbc, pkt, &ev->au); + if (err < 0) + return err; + + pkt->pts = fd->pts; + pkt->dts = fd->pts; + pkt->duration = fd->duration; + pkt->flags |= AV_PKT_FLAG_KEY; /* APV is all intra */ + + if (avctx->flags & AV_CODEC_FLAG_COPY_OPAQUE) { + pkt->opaque = fd->frame_opaque; + pkt->opaque_ref = fd->frame_opaque_ref; + fd->frame_opaque_ref = NULL; + } + + av_log(avctx, AV_LOG_VERBOSE, "Encoded APV frame: %i bytes (%.2f MiB)\n", + pkt->size, pkt->size / (1024.0 * 1024.0)); + + av_buffer_unref(&fd->coeffs_ref); + av_buffer_unref(&fd->bytestream_ref); + av_buffer_unref(&fd->compacted_ref); + av_buffer_unref(&fd->sizes_ref); + + return 0; +} + +static int vulkan_encode_apv_receive_packet(AVCodecContext *avctx, + AVPacket *pkt) +{ + int err; + VulkanEncodeAPVContext *ev = avctx->priv_data; + VulkanEncodeAPVFrameData *fd; + FFVkExecContext *exec; + AVFrame *frame; + + while (1) { + exec = ff_vk_exec_get(&ev->s, &ev->exec_pool); + + if (exec->had_submission) { + exec->had_submission = 0; + ev->in_flight--; + return build_packet(avctx, exec, pkt); + } + + frame = ev->frame; + err = ff_encode_get_frame(avctx, frame); + if (err < 0 && err != AVERROR_EOF) + return err; + else if (err == AVERROR_EOF) { + if (!ev->in_flight) + return err; + continue; + } + + fd = exec->opaque; + fd->pts = frame->pts; + fd->duration = frame->duration; + fd->flags = frame->flags; + if (avctx->flags & AV_CODEC_FLAG_COPY_OPAQUE) { + fd->frame_opaque = frame->opaque; + fd->frame_opaque_ref = frame->opaque_ref; + frame->opaque_ref = NULL; + } + + err = submit_frame(avctx, exec, frame); + av_frame_unref(frame); + if (err < 0) + return err; + + ev->in_flight++; + if (ev->in_flight < ev->async_depth) + return AVERROR(EAGAIN); + } + return 0; +} + +static av_cold int vulkan_encode_apv_close(AVCodecContext *avctx) +{ + VulkanEncodeAPVContext *ev = avctx->priv_data; + + ff_vk_exec_pool_free(&ev->s, &ev->exec_pool); + + ff_vk_shader_free(&ev->s, &ev->shd_dct); + ff_vk_shader_free(&ev->s, &ev->shd_entropy[0]); + ff_vk_shader_free(&ev->s, &ev->shd_entropy[1]); + ff_vk_shader_free(&ev->s, &ev->shd_compact); + + if (ev->exec_ctx_info) { + for (int i = 0; i < ev->async_depth; i++) { + VulkanEncodeAPVFrameData *fd = &ev->exec_ctx_info[i]; + av_buffer_unref(&fd->coeffs_ref); + av_buffer_unref(&fd->bytestream_ref); + av_buffer_unref(&fd->compacted_ref); + av_buffer_unref(&fd->sizes_ref); + av_buffer_unref(&fd->frame_opaque_ref); + } + av_freep(&ev->exec_ctx_info); + } + + av_buffer_pool_uninit(&ev->coeffs_pool); + av_buffer_pool_uninit(&ev->bytestream_pool); + av_buffer_pool_uninit(&ev->compacted_pool); + av_buffer_pool_uninit(&ev->sizes_pool); + + ff_cbs_fragment_free(&ev->au); + ff_cbs_close(&ev->cbc); + + av_frame_free(&ev->frame); + ff_vk_uninit(&ev->s); + + return 0; +} + +static av_cold int vulkan_encode_apv_init(AVCodecContext *avctx) +{ + int err; + VulkanEncodeAPVContext *ev = avctx->priv_data; + AVHWFramesContext *hwfc; + + if (!avctx->hw_frames_ctx) { + av_log(avctx, AV_LOG_ERROR, "An AVHWFramesContext is required.\n"); + return AVERROR(EINVAL); + } + hwfc = (AVHWFramesContext *)avctx->hw_frames_ctx->data; + ev->sw_format = hwfc->sw_format; + + ev->profile_idc = profile_idc_from_pix_fmt(ev->sw_format); + ev->chroma_format_idc = chroma_format_from_pix_fmt(ev->sw_format); + if (ev->profile_idc < 0 || ev->chroma_format_idc < 0) { + av_log(avctx, AV_LOG_ERROR, "Unsupported sw_format %s for APV.\n", + av_get_pix_fmt_name(ev->sw_format)); + return AVERROR(EINVAL); + } + + /* All four APV chroma formats are supported -- 4:0:0, 4:2:2, 4:4:4 and + * 4:4:4:4. The profile_idc / chroma_format_idc checks above already + * reject any pixel format that is not one of them. */ + const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(ev->sw_format); + ev->bit_depth = desc->comp[0].depth; + ev->num_comp = desc->nb_components; + ev->blocks_per_mb = 4; /* luma: 16x16 MB -> 4 8x8 blocks */ + ev->chroma_blocks_per_mb = 4 >> (desc->log2_chroma_w + desc->log2_chroma_h); + ev->level_idc = 33; /* placeholder, real value depends on resolution and bitrate */ + ev->band_idc = 0; + + /* Frame dimensions in macroblocks */ + ev->frame_mb_x = (avctx->width + APV_MB_WIDTH - 1) / APV_MB_WIDTH; + ev->frame_mb_y = (avctx->height + APV_MB_HEIGHT - 1) / APV_MB_HEIGHT; + + /* The 20x20 tile grid cap is structural (fixed-size arrays everywhere); + * the spec additionally demands tiles of at least 16x8 MBs. Each + * tile-component maps to one entropy workgroup, one invocation per + * transform block. */ + int grid_tw = (ev->frame_mb_x + APV_MAX_TILE_COLS - 1) / APV_MAX_TILE_COLS; + int grid_th = (ev->frame_mb_y + APV_MAX_TILE_ROWS - 1) / APV_MAX_TILE_ROWS; + int min_tw = FFMAX(APV_MIN_TILE_WIDTH_IN_MBS, grid_tw); + int min_th = FFMAX(APV_MIN_TILE_HEIGHT_IN_MBS, grid_th); + + /* tile_w/tile_h pick the tile size in MBs; 0 selects the spec minimum. + * An explicit request below the spec minimum is honoured down to the + * grid cap -- non-conformant, but more tiles mean shorter (serial) + * entropy streams, which is the decode speed lever. */ + ev->tile_mb_w = ev->tile_w_mbs_opt > 0 ? ev->tile_w_mbs_opt : min_tw; + ev->tile_mb_h = ev->tile_h_mbs_opt > 0 ? ev->tile_h_mbs_opt : min_th; + ev->tile_mb_w = FFMIN(FFMAX(ev->tile_mb_w, grid_tw), ev->frame_mb_x); + ev->tile_mb_h = FFMIN(FFMAX(ev->tile_mb_h, grid_th), ev->frame_mb_y); + if (ev->tile_mb_w < APV_MIN_TILE_WIDTH_IN_MBS || + ev->tile_mb_h < APV_MIN_TILE_HEIGHT_IN_MBS) + av_log(avctx, AV_LOG_WARNING, + "Tile size %dx%d MBs is below the spec minimum of %dx%d: " + "NON-CONFORMANT bitstream, most decoders will reject it.\n", + ev->tile_mb_w, ev->tile_mb_h, + APV_MIN_TILE_WIDTH_IN_MBS, APV_MIN_TILE_HEIGHT_IN_MBS); + + /* Left to default, grow the tile toward 1024 transform blocks (the + * entropy workgroup ceiling) while it still divides the frame. Bigger + * tiles mean fewer tile-components, which the compaction pass strongly + * prefers -- it is the dominant win for throughput. */ + if (!ev->tile_w_mbs_opt && !ev->tile_h_mbs_opt) { + while (ev->tile_mb_w * 2 <= ev->frame_mb_x && + ev->frame_mb_x % (ev->tile_mb_w * 2) == 0 && + (ev->tile_mb_w * 2) * ev->tile_mb_h * ev->blocks_per_mb <= 1024) + ev->tile_mb_w *= 2; + while (ev->tile_mb_h * 2 <= ev->frame_mb_y && + ev->frame_mb_y % (ev->tile_mb_h * 2) == 0 && + ev->tile_mb_w * (ev->tile_mb_h * 2) * ev->blocks_per_mb <= 1024) + ev->tile_mb_h *= 2; + } + + /* Ceil division: the rightmost column / bottom row of tiles take the + * remainder MBs (spec-legal; the tile grid is closed at the frame edge, + * so those tiles may be smaller than the signalled tile size). */ + ev->tile_cols = (ev->frame_mb_x + ev->tile_mb_w - 1) / ev->tile_mb_w; + ev->tile_rows = (ev->frame_mb_y + ev->tile_mb_h - 1) / ev->tile_mb_h; + ev->tile_count = ev->tile_cols * ev->tile_rows; + + if (ev->tile_count > APV_MAX_TILE_COUNT) { + av_log(avctx, AV_LOG_ERROR, "Too many tiles (%d).\n", ev->tile_count); + return AVERROR(EINVAL); + } + + /* The entropy shader runs one invocation per block in a tile-component + * and its shared buffers are sized for 1024. */ + if (ev->tile_mb_w * ev->tile_mb_h * ev->blocks_per_mb > 1024) { + av_log(avctx, AV_LOG_ERROR, + "Tile-component has too many transform blocks (%d > 1024).\n", + ev->tile_mb_w * ev->tile_mb_h * ev->blocks_per_mb); + return AVERROR_PATCHWELCOME; + } + + /* qp_chroma left at 0 means "use the luma QP". */ + if (ev->qp_c == 0) + ev->qp_c = ev->qp_y; + + /* Validate QP range */ + int max_qp = 3 + ev->bit_depth * 6; + if (ev->qp_y < 0 || ev->qp_y > max_qp || ev->qp_c < 0 || ev->qp_c > max_qp) { + av_log(avctx, AV_LOG_ERROR, + "QP out of range [0, %d]: qp_y=%d, qp_c=%d.\n", + max_qp, ev->qp_y, ev->qp_c); + return AVERROR(EINVAL); + } + + /* Buffer sizing */ + size_t blocks_per_tile = (size_t)ev->tile_mb_w * ev->tile_mb_h * ev->blocks_per_mb; + ev->coeffs_size = (size_t)ev->tile_count * ev->num_comp * + blocks_per_tile * APV_BLK_COEFFS * sizeof(int16_t); + + /* Worst-case per-tile-component bytestream: each coefficient at most ~32 bits. + * Round up generously. */ + ev->slot_size = blocks_per_tile * APV_BLK_COEFFS * 8; + ev->slot_size = FFALIGN(ev->slot_size, 64); + ev->bytestream_size = (size_t)ev->tile_count * ev->num_comp * ev->slot_size; + ev->sizes_size = (size_t)ev->tile_count * ev->num_comp * sizeof(uint32_t); + + av_log(avctx, AV_LOG_VERBOSE, + "APV Vulkan encoder: %dx%d, %d tiles (%dx%d MBs each), " + "qp_y=%d qp_c=%d, coeffs=%zu KiB, bytestream=%zu KiB\n", + avctx->width, avctx->height, ev->tile_count, + ev->tile_mb_w, ev->tile_mb_h, ev->qp_y, ev->qp_c, + ev->coeffs_size / 1024, ev->bytestream_size / 1024); + + ev->headers_only = !!getenv("APV_VULKAN_HEADERS_ONLY"); + ev->skip_entropy = !!getenv("APV_VULKAN_SKIP_ENTROPY"); + if (ev->skip_entropy) + ev->headers_only = 1; /* the bitstream is never produced */ + if (ev->headers_only) + av_log(avctx, AV_LOG_WARNING, + "APV_VULKAN_HEADERS_ONLY set: tiles will not be downloaded " + "or assembled; output packets contain headers only.\n"); + if (ev->skip_entropy) + av_log(avctx, AV_LOG_WARNING, + "APV_VULKAN_SKIP_ENTROPY set: entropy dispatch skipped " + "(DCT-only benchmark mode).\n"); + + /* Init Vulkan */ + err = ff_vk_init(&ev->s, avctx, NULL, avctx->hw_frames_ctx); + if (err < 0) + return err; + + ev->qf = ff_vk_qf_find(&ev->s, VK_QUEUE_COMPUTE_BIT, 0); + if (!ev->qf) { + av_log(avctx, AV_LOG_ERROR, "Device has no compute queues!\n"); + return AVERROR(ENOTSUP); + } + + err = ff_vk_exec_pool_init(&ev->s, ev->qf, &ev->exec_pool, + ev->async_depth, 0, 0, 0, NULL); + if (err < 0) + return err; + + /* Init CBS for assembling output */ + err = ff_cbs_init(&ev->cbc, AV_CODEC_ID_APV, avctx); + if (err < 0) + return err; + + /* Shaders */ + err = init_dct_shader(avctx); + if (err < 0) + return err; + err = init_entropy_shader(avctx, ev->blocks_per_mb, &ev->shd_entropy[0]); + if (err < 0) + return err; + err = init_entropy_shader(avctx, ev->chroma_blocks_per_mb, + &ev->shd_entropy[1]); + if (err < 0) + return err; + err = init_compact_shader(avctx); + if (err < 0) + return err; + + /* The DCT/quantize shader's push constants never change frame to frame; + * build them once. */ + build_dct_push_const(avctx); + + ev->frame = av_frame_alloc(); + if (!ev->frame) + return AVERROR(ENOMEM); + + /* Async data pool */ + ev->async_depth = ev->exec_pool.pool_size; + ev->exec_ctx_info = av_calloc(ev->async_depth, sizeof(*ev->exec_ctx_info)); + if (!ev->exec_ctx_info) + return AVERROR(ENOMEM); + for (int i = 0; i < ev->async_depth; i++) + ev->exec_pool.contexts[i].opaque = &ev->exec_ctx_info[i]; + + return 0; +} + +#define OFFSET(x) offsetof(VulkanEncodeAPVContext, x) +#define VE AV_OPT_FLAG_VIDEO_PARAM | AV_OPT_FLAG_ENCODING_PARAM +static const AVOption vulkan_encode_apv_options[] = { + { "qp", "Quantization parameter (luma)", OFFSET(qp_y), + AV_OPT_TYPE_INT, { .i64 = 22 }, 0, 255, VE }, + { "qp_chroma", "Chroma quantization parameter (0 = same as luma qp)", OFFSET(qp_c), + AV_OPT_TYPE_INT, { .i64 = 0 }, 0, 255, VE }, + { "qmatrix", "Quantization matrix", OFFSET(qmatrix), + AV_OPT_TYPE_INT, { .i64 = APV_QMATRIX_HEVC }, 0, 1, VE, "qmatrix" }, + { "flat", "Uniform matrix, all 16 (APV spec default)", 0, + AV_OPT_TYPE_CONST, { .i64 = APV_QMATRIX_FLAT }, 0, 0, VE, "qmatrix" }, + { "hevc", "HEVC default intra scaling list (mild perceptual shaping)", 0, + AV_OPT_TYPE_CONST, { .i64 = APV_QMATRIX_HEVC }, 0, 0, VE, "qmatrix" }, + /* The minimum legal tile is 16x8 MBs; the maxima are this encoder's + * ceiling of 1024 transform blocks per tile-component (256 MBs): with + * the other dimension at its minimum, width <= 32 and height <= 16. A + * value of 0 is the sentinel for the adaptive per-frame default. */ + { "tile_width", "Tile width in macroblocks (0 = adaptive, auto-sized per frame)", OFFSET(tile_w_mbs_opt), + AV_OPT_TYPE_INT, { .i64 = 0 }, 0, 32, VE }, + { "tile_height", "Tile height in macroblocks (0 = adaptive, auto-sized per frame)", OFFSET(tile_h_mbs_opt), + AV_OPT_TYPE_INT, { .i64 = 0 }, 0, 16, VE }, + { "async_depth", "Internal parallelization depth", OFFSET(async_depth), + AV_OPT_TYPE_INT, { .i64 = 1 }, 1, INT_MAX, VE }, + { NULL } +}; + +static const FFCodecDefault vulkan_encode_apv_defaults[] = { + { "g", "1" }, + { NULL }, +}; + +static const AVClass vulkan_encode_apv_class = { + .class_name = "apv_vulkan", + .item_name = av_default_item_name, + .option = vulkan_encode_apv_options, + .version = LIBAVUTIL_VERSION_INT, +}; + +static const AVCodecHWConfigInternal *const vulkan_encode_apv_hw_configs[] = { + HW_CONFIG_ENCODER_FRAMES(VULKAN, VULKAN), + NULL, +}; + +const FFCodec ff_apv_vulkan_encoder = { + .p.name = "apv_vulkan", + CODEC_LONG_NAME("Advanced Professional Video (Vulkan)"), + .p.type = AVMEDIA_TYPE_VIDEO, + .p.id = AV_CODEC_ID_APV, + .priv_data_size = sizeof(VulkanEncodeAPVContext), + .init = &vulkan_encode_apv_init, + FF_CODEC_RECEIVE_PACKET_CB(&vulkan_encode_apv_receive_packet), + .close = &vulkan_encode_apv_close, + .p.priv_class = &vulkan_encode_apv_class, + .p.capabilities = AV_CODEC_CAP_DELAY | + AV_CODEC_CAP_HARDWARE | + AV_CODEC_CAP_DR1 | + AV_CODEC_CAP_ENCODER_FLUSH | + AV_CODEC_CAP_ENCODER_REORDERED_OPAQUE, + .caps_internal = FF_CODEC_CAP_INIT_CLEANUP | FF_CODEC_CAP_EOF_FLUSH, + .defaults = vulkan_encode_apv_defaults, + CODEC_PIXFMTS(AV_PIX_FMT_VULKAN), + .hw_configs = vulkan_encode_apv_hw_configs, + .p.wrapper_name = "vulkan", +}; diff --git a/libavcodec/vulkan/Makefile b/libavcodec/vulkan/Makefile index c6817967c7..51ad98e000 100644 --- a/libavcodec/vulkan/Makefile +++ b/libavcodec/vulkan/Makefile @@ -4,6 +4,10 @@ clean:: OBJS-$(CONFIG_APV_VULKAN_HWACCEL) += vulkan/apv_decode.comp.spv.o \ vulkan/apv_idct.comp.spv.o +OBJS-$(CONFIG_APV_VULKAN_ENCODER) += vulkan/apv_encode_dct.comp.spv.o \ + vulkan/apv_encode_tiles.comp.spv.o \ + vulkan/seg_gather.comp.spv.o + OBJS-$(CONFIG_FFV1_VULKAN_ENCODER) += vulkan/ffv1_enc_setup.comp.spv.o \ vulkan/ffv1_enc_reset.comp.spv.o \ vulkan/ffv1_enc_reset_golomb.comp.spv.o \ @@ -13,7 +17,8 @@ OBJS-$(CONFIG_FFV1_VULKAN_ENCODER) += vulkan/ffv1_enc_setup.comp.spv.o \ vulkan/ffv1_enc_rgb_golomb.comp.spv.o \ vulkan/ffv1_enc_rct_search.comp.spv.o \ vulkan/ffv1_enc_remap.comp.spv.o \ - vulkan/ffv1_enc_rgb_float.comp.spv.o + vulkan/ffv1_enc_rgb_float.comp.spv.o \ + vulkan/seg_gather.comp.spv.o OBJS-$(CONFIG_FFV1_VULKAN_HWACCEL) += vulkan/ffv1_dec_setup.comp.spv.o \ vulkan/ffv1_dec_reset.comp.spv.o \ diff --git a/libavcodec/vulkan/apv_encode_dct.comp.glsl b/libavcodec/vulkan/apv_encode_dct.comp.glsl new file mode 100644 index 0000000000..0f6693a4f3 --- /dev/null +++ b/libavcodec/vulkan/apv_encode_dct.comp.glsl @@ -0,0 +1,204 @@ +/* + * Copyright (c) 2026 Lynne <[email protected]> + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#version 460 +#pragma shader_stage(compute) +#extension GL_GOOGLE_include_directive : require + +#include "common.glsl" +#include "dct.glsl" + +#define APV_MAX_NUM_COMP 4 +#define APV_MAX_TILE_COLS 20 +#define APV_MAX_TILE_ROWS 20 +#define APV_MAX_TILE_COUNT (APV_MAX_TILE_COLS * APV_MAX_TILE_ROWS) +#define APV_MIN_TRANS_COEFF -32768 +#define APV_MAX_TRANS_COEFF 32767 +#define APV_TR_SIZE 8 +#define APV_BLK_COEFFS (APV_TR_SIZE * APV_TR_SIZE) +#define APV_MB_SIZE 16 + +/* + * Buffer holding per-tile, per-component coefficient blocks. + * Layout (linear): + * tile_y * tile_cols * num_comp * blocks_per_tile * 64 + * + tile_x * num_comp * blocks_per_tile * 64 + * + comp * blocks_per_tile * 64 + * + block_in_tile * 64 + * + coeff_in_block + * + * blocks_per_tile is computed by the host as: + * mbs_per_tile_x * mbs_per_tile_y * blocks_per_mb[comp] + * where blocks_per_mb is 4 (luma) or 4 (chroma in 444), etc. + */ +layout (set = 0, binding = 0, scalar) writeonly buffer coeffs_buf { + int16_t coeffs[]; +}; + +layout (set = 0, binding = 1) uniform readonly iimage2D src[]; + +layout (push_constant, scalar) uniform pushConstants { + ivec2 frame_dim; /* in pixels */ + ivec2 tile_count; /* number of tile columns/rows */ + ivec2 tile_mb_dim; /* MBs per tile (cols, rows) */ + ivec2 log2_chroma_sub; /* 0/0 for 444, 1/0 for 422, etc. */ + int num_comp; + int bit_depth; + /* Per-component quant scale fact/(level_scale*2^qp_shift). The encoder + * uses one QP per component, so it never varies by tile. */ + float qf[APV_MAX_NUM_COMP]; + /* The quantisation matrix (raster order), the same one signalled in the + * frame header. Staged into shared memory at the top of main(). */ + uint8_t qmat[64]; +}; + +/* Workgroup-local copy of qmat, filled once per workgroup (prores_raw style). */ +shared uint8_t qmat_buf[64]; + +void main(void) +{ + /* Workgroup grid: + * x: total MB columns over the frame (frame_mb_x) + * y: total MB rows over the frame (frame_mb_y) + * z: component index [0..num_comp) + * + * Local size (8, 4, 1): + * gl_LocalInvocationID.x in [0..7] = row index inside an 8x8 block + * gl_LocalInvocationID.y in [0..3] = which 8x8 block within the MB + * + * Luma and 4:4:4 chroma use all 4 blocks; 4:2:2 chroma uses 2 (a vertical + * pair) and the surplus two invocations early-out before the store. + */ + + /* Stage the quantisation matrix into shared memory once, the same way + * prores_raw does -- one workgroup-wide copy instead of a push-constant + * read for every coefficient. */ + if (gl_LocalInvocationIndex == 0u) { + [[unroll]] + for (uint i = 0u; i < 64u; i++) + qmat_buf[i] = qmat[i]; + } + barrier(); + + const uint comp = gl_WorkGroupID.z; + const uint mb_x_lin = gl_WorkGroupID.x; + const uint mb_y_lin = gl_WorkGroupID.y; + + /* Map workgroup to its tile (luma coords) */ + const ivec2 sub_shift = (comp == 0u) ? ivec2(0) : log2_chroma_sub; + + /* Compute which tile this MB belongs to in MB units */ + const int tx = int(mb_x_lin) / tile_mb_dim.x; + const int ty = int(mb_y_lin) / tile_mb_dim.y; + const int tile_idx = ty * tile_count.x + tx; + const int mb_x_in_tile = int(mb_x_lin) - tx * tile_mb_dim.x; + const int mb_y_in_tile = int(mb_y_lin) - ty * tile_mb_dim.y; + + /* Remainder tiles: the rightmost tile column may be narrower than + * tile_mb_dim.x. Pack block indices by the tile's ACTUAL width so the + * entropy pass sees them contiguously. */ + const int frame_mb_x = (frame_dim.x + APV_MB_SIZE - 1) / APV_MB_SIZE; + const int actual_tw = min(tile_mb_dim.x, frame_mb_x - tx * tile_mb_dim.x); + const int mb_in_tile = mb_y_in_tile * actual_tw + mb_x_in_tile; + + /* + * Per-MB block grid for this component: luma and 4:4:4 chroma are 2x2 + * (4 blocks); 4:2:2 chroma is 1 wide x 2 tall (2 blocks). Derived from + * the chroma sub-sampling shift. + */ + const uint bw = 2u >> uint(sub_shift.x); + const uint bh = 2u >> uint(sub_shift.y); + const uint nb_blk = bw * bh; + + /* + * Uniform coefficient stride: every tile-component is allocated the luma + * block count (4 per MB). A sub-sampled chroma component leaves the rest + * of its region unused, which keeps the buffer layout flat. + */ + const uint blocks_per_tile = uint(tile_mb_dim.x * tile_mb_dim.y) * 4u; + + /* Block index within the workgroup -> position inside the MB */ + const uint blk = gl_LocalInvocationID.y; + const uint row = gl_LocalInvocationID.x; + + /* Block coordinate inside the (possibly sub-sampled) macroblock */ + const ivec2 block_offset = ivec2(int(blk % bw), int(blk / bw)); + + /* Compute pixel coordinate for this thread's row of samples */ + ivec2 mb_origin_luma = ivec2(int(mb_x_lin), int(mb_y_lin)) * APV_MB_SIZE; + ivec2 mb_origin = mb_origin_luma >> sub_shift; + ivec2 block_origin = mb_origin + block_offset * APV_TR_SIZE; + ivec2 coord = block_origin + ivec2(0, int(row)); + + /* Clamp to image bounds, in case frame dimensions are not aligned */ + ivec2 img_dim = imageSize(src[comp]); + coord = min(coord, img_dim - ivec2(1)); + + const float bias = float(1 << (bit_depth - 1)); + const float fact = bias; + + /* Load 8 horizontal samples, subtract bias, normalize to ~[-1,1] */ + [[unroll]] + for (int i = 0; i < 8; i++) { + int s = imageLoad(src[comp], coord + ivec2(i, 0)).x; + blocks[blk][row * 9u + uint(i)] = (float(s) - bias) / fact; + } + + barrier(); + + /* Column DCT (offset varies on x-axis, traverses rows via stride 9) */ + fdct8(blk, row, 9); + barrier(); + + /* Row DCT (offset varies on y-axis, traverses cols via stride 1) */ + fdct8(blk, row * 9u, 1); + barrier(); + + /* + * Quantize and store. Each thread writes its row. + * level = round( fdct2d((sample - bias)/fact) * qf * 1024/qmat[i] ) + * fdct8() is the exact orthonormal forward DCT, the reciprocal of the + * spec iDCT (apv_decode_transquant_c), which reconstructs + * sample - bias = (qmat[i]*level_scale*2^qp_shift / 1024) + * * iDCT_ortho(level). + * qf[comp] carries fact/(level_scale*2^qp_shift); the per-coefficient + * factor 1024/qmat[i] inverts the decoder's per-coefficient dequant. + */ + const float scale_const = qf[comp]; + + /* Compute coefficient base offset in the buffer */ + const uint tile_lin = uint(tile_idx); + const uint coeff_base = + (tile_lin * uint(num_comp) + comp) * blocks_per_tile * 64u + + (uint(mb_in_tile) * nb_blk + blk) * 64u; + + /* Surplus invocations of a sub-sampled component (blk >= nb_blk) took + * part in the barriers above but must not write any coefficients. */ + if (blk < nb_blk) { + [[unroll]] + for (int i = 0; i < 8; i++) { + float v = blocks[blk][row * 9u + uint(i)]; + float pf = 1024.0f / float(qmat_buf[row * 8u + uint(i)]); + int lvl = int(round(v * scale_const * pf)); + lvl = clamp(lvl, APV_MIN_TRANS_COEFF, APV_MAX_TRANS_COEFF); + coeffs[coeff_base + row * 8u + uint(i)] = int16_t(lvl); + } + } +} diff --git a/libavcodec/vulkan/apv_encode_tiles.comp.glsl b/libavcodec/vulkan/apv_encode_tiles.comp.glsl new file mode 100644 index 0000000000..dd4ba878a0 --- /dev/null +++ b/libavcodec/vulkan/apv_encode_tiles.comp.glsl @@ -0,0 +1,422 @@ +/* + * Copyright (c) 2026 Lynne <[email protected]> + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#version 460 +#pragma shader_stage(compute) +#extension GL_GOOGLE_include_directive : require +#extension GL_KHR_shader_subgroup_basic : require +#extension GL_KHR_shader_subgroup_arithmetic : require +#extension GL_KHR_shader_subgroup_rotate : require + +#define PB_UNALIGNED +#include "common.glsl" + +#define APV_BLK_COEFFS 64 + +/* + * One workgroup encodes one tile-component. The workgroup size (set by the + * host via spec constants 253/254/255) equals the number of transform blocks + * in the tile-component, so there is exactly one invocation per block. + * + * The minimum APV tile is 16x8 MBs -> 16*8*4 = 512 blocks. The buffers below + * are sized for 1024 (a 2x tile). With a guaranteed subgroup size of >= 32, + * a 1024-invocation workgroup has at most 1024/32 = 32 subgroups. + */ +#define MAX_BLOCKS 1024 +#define MAX_SUBGROUPS 32 + +/* ff_zigzag_direct, packed: each byte is the raster index (y*8 + x). */ +const uint8_t zigzag[64] = { + uint8_t( 0), uint8_t( 1), uint8_t( 8), uint8_t(16), + uint8_t( 9), uint8_t( 2), uint8_t( 3), uint8_t(10), + uint8_t(17), uint8_t(24), uint8_t(32), uint8_t(25), + uint8_t(18), uint8_t(11), uint8_t( 4), uint8_t( 5), + uint8_t(12), uint8_t(19), uint8_t(26), uint8_t(33), + uint8_t(40), uint8_t(48), uint8_t(41), uint8_t(34), + uint8_t(27), uint8_t(20), uint8_t(13), uint8_t( 6), + uint8_t( 7), uint8_t(14), uint8_t(21), uint8_t(28), + uint8_t(35), uint8_t(42), uint8_t(49), uint8_t(56), + uint8_t(57), uint8_t(50), uint8_t(43), uint8_t(36), + uint8_t(29), uint8_t(22), uint8_t(15), uint8_t(23), + uint8_t(30), uint8_t(37), uint8_t(44), uint8_t(51), + uint8_t(58), uint8_t(59), uint8_t(52), uint8_t(45), + uint8_t(38), uint8_t(31), uint8_t(39), uint8_t(46), + uint8_t(53), uint8_t(60), uint8_t(61), uint8_t(54), + uint8_t(47), uint8_t(55), uint8_t(62), uint8_t(63), +}; + +/* Coefficients are int16, accessed through a packed u32 view: the mask + * build streams whole words, and the few nonzero values are extracted + * on demand. */ +layout (set = 0, binding = 0, scalar) readonly buffer coeffs_buf { + uint32_t coeffs32[]; +}; + +int coeff_at(uint base16, uint i) +{ + uint idx = base16 + i; + uint w = coeffs32[idx >> 1]; + return (int(w << ((1u - (idx & 1u)) << 4))) >> 16; +} + +/* + * Inverse zig-zag: zz_inv[raster] = scan position. Indexed only by + * unrolled-constant indices, so it folds into immediates. + */ +const uint8_t zz_inv[64] = { + uint8_t( 0), uint8_t( 1), uint8_t( 5), uint8_t( 6), + uint8_t(14), uint8_t(15), uint8_t(27), uint8_t(28), + uint8_t( 2), uint8_t( 4), uint8_t( 7), uint8_t(13), + uint8_t(16), uint8_t(26), uint8_t(29), uint8_t(42), + uint8_t( 3), uint8_t( 8), uint8_t(12), uint8_t(17), + uint8_t(25), uint8_t(30), uint8_t(41), uint8_t(43), + uint8_t( 9), uint8_t(11), uint8_t(18), uint8_t(24), + uint8_t(31), uint8_t(40), uint8_t(44), uint8_t(53), + uint8_t(10), uint8_t(19), uint8_t(23), uint8_t(32), + uint8_t(39), uint8_t(45), uint8_t(52), uint8_t(54), + uint8_t(20), uint8_t(22), uint8_t(33), uint8_t(38), + uint8_t(46), uint8_t(51), uint8_t(55), uint8_t(60), + uint8_t(21), uint8_t(34), uint8_t(37), uint8_t(47), + uint8_t(50), uint8_t(56), uint8_t(59), uint8_t(61), + uint8_t(35), uint8_t(36), uint8_t(48), uint8_t(49), + uint8_t(57), uint8_t(58), uint8_t(62), uint8_t(63), +}; + +/* Zig-zag-domain nonzero map of one block: bit s = coefficient at scan + * position s is nonzero. 32 sequential word loads, constant bit targets. */ +uint64_t nz_mask(uint base16) +{ + uint64_t mask = uint64_t(0); + uint b32 = base16 >> 1; + [[unroll]] + for (uint w = 0u; w < 32u; w++) { + uint v = coeffs32[b32 + w]; + if ((v & 0xFFFFu) != 0u) + mask |= uint64_t(1) << zz_inv[2u * w]; + if ((v >> 16) != 0u) + mask |= uint64_t(1) << zz_inv[2u * w + 1u]; + } + return mask; +} + +/* Index of the lowest set bit; mask must be nonzero. */ +int findLSB64(uint64_t m) +{ + u32vec2 h = unpack32(m); + return (h.x != 0u) ? findLSB(h.x) : 32 + findLSB(h.y); +} + +layout (set = 0, binding = 1, scalar) writeonly buffer sizes_buf { + uint32_t tile_comp_sizes[]; +}; + +layout (push_constant, scalar) uniform pushConstants { + u8buf bytestream; /* device address of the bytestream buffer */ + ivec2 tile_count; /* number of tile columns/rows */ + int num_comp; + uint slot_size; /* per-tile-component bytestream slot size */ + uint comp_base; /* component index of this dispatch's z = 0 */ + uint blocks_per_tile; /* uniform coeff stride, in blocks */ + ivec2 frame_mb; /* frame size in MBs (luma basis) */ + ivec2 tile_mb_dim; /* full-tile size in MBs */ + uint blocks_per_mb; /* blocks per MB of this dispatch's components */ +}; + +shared uint32_t sg_lasttail[MAX_SUBGROUPS]; /* last block's tail, per subgroup */ +shared uint32_t sg_scan [MAX_SUBGROUPS]; /* per-subgroup scan totals */ + +/* + * Flush only the complete bytes held by the writer. The trailing partial + * byte is intentionally left unwritten: it is the block's tail, and the next + * block writes it (prepended to its own first byte). flush_put_bits() rounds + * the byte count up; this rounds it down. + */ +void flush_whole_bytes(inout PutBitContext pb) +{ + if (pb.bit_left < BUF_BITS) + pb.bit_buf <<= pb.bit_left; + + uint to_write = uint(BUF_BITS - pb.bit_left) >> 3; + + u8buf bs = u8buf(pb.buf); + for (int i = 0; i < to_write; i++) + bs[i].v = BYTE_EXTRACT(pb.bit_buf, BUF_BYTES - uint8_t(1) - i); + pb.buf = uint64_t(bs) + to_write; +} + +/* + * A block coder bundles the output writer, a running bit count, a rolling + * copy of the last emitted bits, and a flag selecting whether to actually + * write. The count and encode passes share the exact same code path, so the + * counted length always matches the written one. + */ +struct BlockCoder { + PutBitContext pb; + uint nbits; + uint roll; + bool wr; +}; + +void emit(inout BlockCoder bc, uint32_t n, uint32_t value) +{ + bc.nbits += n; + bc.roll = (bc.roll << n) | value; /* keeps the last >= 8 emitted bits */ + if (bc.wr) + put_bits(bc.pb, n, value); +} + +/* Variable-length code from the APV spec, section 7.2.4. */ +void write_vlc(inout BlockCoder bc, int kParam, int symbolVal) +{ + int threshold1 = 1 << kParam; + int threshold2 = threshold1 << 1; + + if (symbolVal < threshold1) { + emit(bc, 1u, 1u); + if (kParam > 0) + emit(bc, uint(kParam), uint(symbolVal)); + } else if (symbolVal < threshold2) { + emit(bc, 2u, 0u); + if (kParam > 0) + emit(bc, uint(kParam), uint(symbolVal - threshold1)); + } else { + emit(bc, 2u, 1u); + int adjusted = symbolVal - threshold1; + int n_plus_k = findMSB(adjusted); + int n = n_plus_k - kParam; + if (n > 0) + emit(bc, uint(n), 0u); + emit(bc, 1u, 1u); + int residual = adjusted - (1 << n_plus_k); + if (n_plus_k > 0) + emit(bc, uint(n_plus_k), uint(residual)); + } +} + +/* abs() of the first non-zero AC coefficient in zig-zag order, 0 if none. */ +int first_ac_level(uint base16, uint64_t mask) +{ + uint64_t mac = mask >> 1; + if (mac == uint64_t(0)) + return 0; + int sp = 1 + findLSB64(mac); + return abs(coeff_at(base16, uint(zigzag[sp]))); +} + +/* Entropy-code one transform block given its predicted context. The nonzero + * map drives the zero-runs, so only the nonzero values are ever loaded. */ +void process_block(inout BlockCoder bc, uint base16, uint64_t mask, + int prev_dc, int prev_k_dc, int prev_1st_ac) +{ + /* DC */ + int dc = coeff_at(base16, 0u); + int dc_diff = dc - prev_dc; + int abs_dc = abs(dc_diff); + write_vlc(bc, prev_k_dc, abs_dc); + if (abs_dc != 0) + emit(bc, 1u, dc_diff < 0 ? 1u : 0u); + + /* AC, zig-zag from position 1. m's bit 0 = scan position scan_pos. */ + uint64_t m = mask >> 1; + int scan_pos = 1; + int prev_level = prev_1st_ac; + int prev_run = 0; + + while (scan_pos < APV_BLK_COEFFS) { + int run = (m == uint64_t(0)) ? APV_BLK_COEFFS - scan_pos + : findLSB64(m); + + int k_run = clamp(prev_run >> 2, 0, 2); + write_vlc(bc, k_run, run); + + scan_pos += run; + prev_run = run; + + if (scan_pos < APV_BLK_COEFFS) { + int level = coeff_at(base16, uint(zigzag[scan_pos])); + int abs_level_m1 = abs(level) - 1; + int k_level = clamp(prev_level >> 2, 0, 4); + write_vlc(bc, k_level, abs_level_m1); + emit(bc, 1u, level < 0 ? 1u : 0u); + + prev_level = abs_level_m1 + 1; + scan_pos++; + m >>= run + 1; + } + } +} + +/* DC predictor context for block 'idx' -- pure function of preceding DCs. */ +void dc_context(uint coeff_base, uint idx, out int prev_dc, out int prev_k_dc) +{ + if (idx == 0u) { + prev_dc = 0; + prev_k_dc = 5; + return; + } + uint base = coeff_base + idx * APV_BLK_COEFFS; + int dc_m1 = coeff_at(base - APV_BLK_COEFFS, 0u); + int dc_m2 = (idx == 1u) ? 0 : coeff_at(base - 2u * APV_BLK_COEFFS, 0u); + prev_dc = dc_m1; + prev_k_dc = min(abs(dc_m1 - dc_m2) >> 1, 5); +} + +void main(void) +{ + const uint b = gl_LocalInvocationID.x; + const uint comp = comp_base + gl_WorkGroupID.z; + const uint tile_idx = gl_WorkGroupID.y * uint(tile_count.x) + gl_WorkGroupID.x; + const uint tile_comp = tile_idx * uint(num_comp) + comp; + + /* + * Remainder tiles: the rightmost column / bottom row of tiles may be + * smaller than the full tile the workgroup was sized for. Invocations at + * b >= nb stay for the barriers and scans (contributing zeros) but never + * read coefficients or write bitstream. Active blocks are always the + * contiguous prefix [0, nb), so the tail-handoff chain below never + * crosses an inactive->active boundary. + */ + const int actual_tw = min(tile_mb_dim.x, + frame_mb.x - int(gl_WorkGroupID.x) * tile_mb_dim.x); + const int actual_th = min(tile_mb_dim.y, + frame_mb.y - int(gl_WorkGroupID.y) * tile_mb_dim.y); + const uint nb = uint(actual_tw * actual_th) * blocks_per_mb; + + /* + * This tile-component's coefficient region. The coeff buffer uses a + * uniform per-tile-component stride (the luma block count); a chroma + * tile-component simply reads fewer blocks from its region. + */ + const uint coeff_base = tile_comp * blocks_per_tile * APV_BLK_COEFFS; + const uint blk = coeff_base + b * APV_BLK_COEFFS; + + int prev_dc = 0, prev_k_dc = 5; + uint64_t mask = uint64_t(0); + if (b < nb) { + dc_context(coeff_base, b, prev_dc, prev_k_dc); + mask = nz_mask(blk); + } + + /* + * Scan #1: prev_1st_ac_level is the first-AC level of the most recent + * *non-empty* block before this one. Pack (blockIndex+1, level) into a + * key (empty blocks -> 0) so a plain Max picks the highest-index prior + * non-empty block. Two levels: subgroup scan, then across subgroups. + */ + int my_first_ac = (b < nb) ? first_ac_level(blk, mask) : 0; + uint key = (my_first_ac == 0) ? 0u + : (((b + 1u) << 16) | uint(my_first_ac)); + + uint key_excl = subgroupExclusiveMax(key); + uint key_total = subgroupMax(key); + if (subgroupElect()) + sg_scan[gl_SubgroupID] = key_total; + barrier(); + + uint carry = key_excl; + for (uint i = 0u; i < gl_NumSubgroups; i++) { + if (i >= gl_SubgroupID) + break; + carry = max(carry, sg_scan[i]); + } + int prev_1st_ac = (carry == 0u) ? 0 : int(carry & 0xFFFFu); + barrier(); /* sg_scan is reused by scan #2 */ + + /* + * Count pass: measure this block's coded length, and -- for free, since + * we walk every emitted bit anyway -- keep a rolling copy of the last + * bits in cnt.roll. + */ + BlockCoder cnt; + cnt.nbits = 0u; + cnt.roll = 0u; + cnt.wr = false; + if (b < nb) + process_block(cnt, blk, mask, prev_dc, prev_k_dc, prev_1st_ac); + + /* + * Scan #2: exclusive prefix sum of the bit counts gives each block's + * start offset (in bits) within the tile-component bitstream. + */ + uint bits_excl = subgroupExclusiveAdd(cnt.nbits); + uint bits_total = subgroupAdd(cnt.nbits); + if (subgroupElect()) + sg_scan[gl_SubgroupID] = bits_total; + barrier(); + + uint off = bits_excl; + for (uint i = 0u; i < gl_NumSubgroups; i++) { + if (i >= gl_SubgroupID) + break; + off += sg_scan[i]; + } + + /* + * This block's tail = its last (end_bit & 7) bits. The next block + * prepends these so its own writer starts on a byte boundary; that makes + * every block write a disjoint run of whole bytes -- no atomics, no + * shared bytes. end_bit is known only now (after scan #2). + * + * The predecessor's tail is just the left-neighbour lane, so a subgroup + * rotate fetches it from a register; only at a subgroup boundary (lane 0) + * does it fall back to shared memory -- the previous subgroup's last block. + */ + uint end_bit = off + cnt.nbits; + uint tail_n = end_bit & 7u; + uint my_tail = cnt.roll & ((1u << tail_n) - 1u); + + uint prev_tail = subgroupRotate(my_tail, gl_SubgroupSize - 1u); + if (gl_SubgroupInvocationID == gl_SubgroupSize - 1u) + sg_lasttail[gl_SubgroupID] = my_tail; + barrier(); + if (gl_SubgroupInvocationID == 0u && gl_SubgroupID > 0u) + prev_tail = sg_lasttail[gl_SubgroupID - 1u]; + + /* Write pass: each block writes bytes [off>>3, end_bit>>3). Inactive + * remainder-tile invocations write nothing. */ + if (b >= nb) + return; + + BlockCoder enc; + init_put_bits(enc.pb, + OFFBUF(u8buf, bytestream, tile_comp * slot_size + (off >> 3u)), + uint64_t(slot_size)); + enc.nbits = 0u; + enc.roll = 0u; + enc.wr = true; + + /* Prepend the predecessor's tail so this block's first byte comes out + * complete. (off & 7 == 0 for block 0, which has no predecessor.) */ + if (b > 0u) + put_bits(enc.pb, off & 7u, prev_tail); + + process_block(enc, blk, mask, prev_dc, prev_k_dc, prev_1st_ac); + + if (b == nb - 1u) { + /* Last block: no successor, so flush the trailing partial byte too + * (the tile-component is byte-aligned, zero-padded), and record the + * total size. */ + flush_put_bits(enc.pb); + tile_comp_sizes[tile_comp] = (end_bit + 7u) >> 3u; + } else { + /* Leave the trailing partial byte for the next block to prepend. */ + flush_whole_bytes(enc.pb); + } +} -- 2.52.0 _______________________________________________ ffmpeg-devel mailing list -- [email protected] To unsubscribe send an email to [email protected]
