This is an automated email from the git hooks/post-receive script. Git pushed a commit to branch master in repository ffmpeg.
commit 5ad8c67e6cce8809dde67af87e23989f940389c1 Author: Lynne <[email protected]> AuthorDate: Sun Nov 2 14:38:33 2025 +0000 Commit: Lynne <[email protected]> CommitDate: Tue May 19 17:43:53 2026 +0900 apv_decode: add a Vulkan hwaccel --- Changelog | 1 + configure | 2 + libavcodec/Makefile | 1 + libavcodec/apv_decode.c | 6 + libavcodec/hwaccels.h | 1 + libavcodec/version.h | 2 +- libavcodec/vulkan/Makefile | 3 + libavcodec/vulkan/apv_decode.comp.glsl | 216 +++++++++++++++ libavcodec/vulkan/apv_idct.comp.glsl | 117 ++++++++ libavcodec/vulkan_apv.c | 490 +++++++++++++++++++++++++++++++++ libavcodec/vulkan_decode.c | 7 + 11 files changed, 845 insertions(+), 1 deletion(-) diff --git a/Changelog b/Changelog index b0f87694e8..f4171592db 100644 --- a/Changelog +++ b/Changelog @@ -11,6 +11,7 @@ version <next>: - Add AMF Frame Rate Converter (vf_frc_amf) filter - SMPTE 2094-50 metadata support and passthrough - ProRes RAW VideoToolbox hwaccel +- APV Vulkan hwaccel version 8.1: diff --git a/configure b/configure index 730b4ac46d..514f1723d6 100755 --- a/configure +++ b/configure @@ -3403,6 +3403,8 @@ videotoolbox_hwaccel_extralibs="-framework QuartzCore" vulkan_deps="threads" vulkan_deps_any="libdl LoadLibrary" +apv_vulkan_hwaccel_deps="vulkan spirv_compiler" +apv_vulkan_hwaccel_select="apv_decoder" av1_d3d11va_hwaccel_deps="d3d11va DXVA_PicParams_AV1" av1_d3d11va_hwaccel_select="av1_decoder" av1_d3d11va2_hwaccel_deps="d3d11va DXVA_PicParams_AV1" diff --git a/libavcodec/Makefile b/libavcodec/Makefile index 85d35913f3..4935cfc3b3 100644 --- a/libavcodec/Makefile +++ b/libavcodec/Makefile @@ -1047,6 +1047,7 @@ OBJS-$(CONFIG_VIDEOTOOLBOX) += videotoolbox.o OBJS-$(CONFIG_VDPAU) += vdpau.o OBJS-$(CONFIG_VULKAN) += vulkan.o vulkan_video.o +OBJS-$(CONFIG_APV_VULKAN_HWACCEL) += vulkan_decode.o vulkan_apv.o OBJS-$(CONFIG_AV1_D3D11VA_HWACCEL) += dxva2_av1.o OBJS-$(CONFIG_AV1_DXVA2_HWACCEL) += dxva2_av1.o OBJS-$(CONFIG_AV1_D3D12VA_HWACCEL) += dxva2_av1.o d3d12va_av1.o diff --git a/libavcodec/apv_decode.c b/libavcodec/apv_decode.c index 0ba3495501..c916b2328e 100644 --- a/libavcodec/apv_decode.c +++ b/libavcodec/apv_decode.c @@ -51,6 +51,9 @@ static enum AVPixelFormat get_pixel_format(AVCodecContext *avctx, enum AVPixelFormat pix_fmt) { enum AVPixelFormat pix_fmts[] = { +#if CONFIG_APV_VULKAN_HWACCEL + AV_PIX_FMT_VULKAN, +#endif pix_fmt, AV_PIX_FMT_NONE, }; @@ -603,6 +606,9 @@ const FFCodec ff_apv_decoder = { AV_CODEC_CAP_FRAME_THREADS, .caps_internal = FF_CODEC_CAP_SKIP_FRAME_FILL_PARAM, .hw_configs = (const AVCodecHWConfigInternal *const []) { +#if CONFIG_APV_VULKAN_HWACCEL + HWACCEL_VULKAN(apv), +#endif NULL }, }; diff --git a/libavcodec/hwaccels.h b/libavcodec/hwaccels.h index 35c5d57377..4f8da46c13 100644 --- a/libavcodec/hwaccels.h +++ b/libavcodec/hwaccels.h @@ -19,6 +19,7 @@ #ifndef AVCODEC_HWACCELS_H #define AVCODEC_HWACCELS_H +extern const struct FFHWAccel ff_apv_vulkan_hwaccel; extern const struct FFHWAccel ff_av1_d3d11va_hwaccel; extern const struct FFHWAccel ff_av1_d3d11va2_hwaccel; extern const struct FFHWAccel ff_av1_d3d12va_hwaccel; diff --git a/libavcodec/version.h b/libavcodec/version.h index 2a08e42d7e..0ef6c991f3 100644 --- a/libavcodec/version.h +++ b/libavcodec/version.h @@ -29,7 +29,7 @@ #include "version_major.h" -#define LIBAVCODEC_VERSION_MINOR 33 +#define LIBAVCODEC_VERSION_MINOR 34 #define LIBAVCODEC_VERSION_MICRO 100 #define LIBAVCODEC_VERSION_INT AV_VERSION_INT(LIBAVCODEC_VERSION_MAJOR, \ diff --git a/libavcodec/vulkan/Makefile b/libavcodec/vulkan/Makefile index c779b49fad..c6817967c7 100644 --- a/libavcodec/vulkan/Makefile +++ b/libavcodec/vulkan/Makefile @@ -1,6 +1,9 @@ clean:: $(RM) $(CLEANSUFFIXES:%=libavcodec/vulkan/%) +OBJS-$(CONFIG_APV_VULKAN_HWACCEL) += vulkan/apv_decode.comp.spv.o \ + vulkan/apv_idct.comp.spv.o + OBJS-$(CONFIG_FFV1_VULKAN_ENCODER) += vulkan/ffv1_enc_setup.comp.spv.o \ vulkan/ffv1_enc_reset.comp.spv.o \ vulkan/ffv1_enc_reset_golomb.comp.spv.o \ diff --git a/libavcodec/vulkan/apv_decode.comp.glsl b/libavcodec/vulkan/apv_decode.comp.glsl new file mode 100644 index 0000000000..6db24b5372 --- /dev/null +++ b/libavcodec/vulkan/apv_decode.comp.glsl @@ -0,0 +1,216 @@ +/* + * Copyright (c) 2025 Lynne <[email protected]> + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#version 460 +#pragma shader_stage(compute) +#extension GL_GOOGLE_include_directive : require + +#include "common.glsl" + +#define APV_MAX_NUM_COMP 4 +#define APV_MAX_TILE_COLS 20 +#define APV_MAX_TILE_ROWS 20 +#define APV_MAX_TILE_COUNT (APV_MAX_TILE_COLS * APV_MAX_TILE_ROWS) +#define APV_MIN_TRANS_COEFF -32768 +#define APV_MAX_TRANS_COEFF 32767 +#define APV_TR_SIZE 8 +#define APV_BLK_COEFFS (APV_TR_SIZE * APV_TR_SIZE) +#define APV_MB_SIZE (ivec2(16, 16)) + +layout (set = 0, binding = 0) uniform writeonly uimage2D dst[]; +layout (set = 0, binding = 1, scalar) readonly buffer frame_data_buf { + uvec2 tile_offset[APV_MAX_NUM_COMP * APV_MAX_TILE_COUNT]; + uint8_t q_matrix[APV_MAX_NUM_COMP][8][8]; + uint8_t tile_qp[APV_MAX_NUM_COMP * APV_MAX_TILE_COUNT]; + uint16_t tile_col[APV_MAX_TILE_COLS + 1]; + uint16_t tile_row[APV_MAX_TILE_ROWS + 1]; +}; + +layout (push_constant, scalar) uniform pushConstants { + u8buf tile_data; + ivec2 tile_count; + ivec2 log2_chroma_sub; + int components; + int bit_depth; +}; + +GetBitContext gb; + +int apv_read_vlc(int k) +{ + /* Top 32 bits, longest valid APV code is 1 + 2*5 + 5 = 16 bits */ + uint bits = show_bits(gb, 32); + uint mask = (1u << k) - 1u; + + /* 1xxx: short, length 1+k, value = next k bits */ + if (bits >= 0x80000000u) { + skip_bits(gb, 1 + k); + return int((bits >> (31 - k)) & mask); + } + + /* 00xxx: short, length 2+k, value = (1<<k) + next k bits */ + if (bits < 0x40000000u) { + skip_bits(gb, 2 + k); + return int((bits >> (30 - k)) & mask) + (1 << k); + } + + /* 01 prefix + (n leading zeros) + 1 + (n+k value bits), + * after shifting out the 01 prefix, findMSB tells us n */ + uint suffix = bits << 2; + if (suffix == 0u) + return APV_MAX_TRANS_COEFF + 1; + + int n = 31 - findMSB(suffix); + skip_bits(gb, 3 + n); + /* (2<<k) + ((1<<n)-1) * (1<<k) is equal to ((1<<n) + 1) << k */ + return (((1 << n) + 1) << k) + int(get_bits(gb, n + k)); +} + +/* ff_zigzag_direct, packed: each byte is the raster index (y*8 + x). */ +const uint8_t zigzag[64] = { + uint8_t( 0), uint8_t( 1), uint8_t( 8), uint8_t(16), + uint8_t( 9), uint8_t( 2), uint8_t( 3), uint8_t(10), + uint8_t(17), uint8_t(24), uint8_t(32), uint8_t(25), + uint8_t(18), uint8_t(11), uint8_t( 4), uint8_t( 5), + uint8_t(12), uint8_t(19), uint8_t(26), uint8_t(33), + uint8_t(40), uint8_t(48), uint8_t(41), uint8_t(34), + uint8_t(27), uint8_t(20), uint8_t(13), uint8_t( 6), + uint8_t( 7), uint8_t(14), uint8_t(21), uint8_t(28), + uint8_t(35), uint8_t(42), uint8_t(49), uint8_t(56), + uint8_t(57), uint8_t(50), uint8_t(43), uint8_t(36), + uint8_t(29), uint8_t(22), uint8_t(15), uint8_t(23), + uint8_t(30), uint8_t(37), uint8_t(44), uint8_t(51), + uint8_t(58), uint8_t(59), uint8_t(52), uint8_t(45), + uint8_t(38), uint8_t(31), uint8_t(39), uint8_t(46), + uint8_t(53), uint8_t(60), uint8_t(61), uint8_t(54), + uint8_t(47), uint8_t(55), uint8_t(62), uint8_t(63), +}; + +int prev_dc; +int prev_k_dc; +int prev_1st_ac_level; + +void decode_block(ivec2 pos, uint comp) +{ + int dc_coeff; + int abs_diff = apv_read_vlc(prev_k_dc); + + if (abs_diff != 0) { + if (get_bit(gb)) + dc_coeff = prev_dc - abs_diff; + else + dc_coeff = prev_dc + abs_diff; + } else { + dc_coeff = prev_dc; + } + + if (dc_coeff < APV_MIN_TRANS_COEFF || + dc_coeff > APV_MAX_TRANS_COEFF) + return; + + imageStore(dst[comp], pos, uvec4(uint(dc_coeff) & 0xFFFFu)); + prev_dc = dc_coeff; + prev_k_dc = min(abs_diff >> 1, 5); + + /* ACs */ + int scan_pos = 1; + int first_ac = 1; + int prev_level = prev_1st_ac_level; + int prev_run = 0; + + do { + int coeff_zero_run; + + int k_param = clamp(prev_run >> 2, 0, 2); + coeff_zero_run = apv_read_vlc(k_param); + + if (coeff_zero_run > APV_BLK_COEFFS - scan_pos) + return; + + /* image was already pre-cleared to all zeroes */ + scan_pos += coeff_zero_run; + prev_run = coeff_zero_run; + + if (scan_pos < APV_BLK_COEFFS) { + int abs_ac_coeff_minus1; + int level; + + k_param = clamp(prev_level >> 2, 0, 4); + abs_ac_coeff_minus1 = apv_read_vlc(k_param); + bool sign_ac_coeff = get_bit(gb); + + if (sign_ac_coeff) + level = -abs_ac_coeff_minus1 - 1; + else + level = abs_ac_coeff_minus1 + 1; + + if (level < APV_MIN_TRANS_COEFF || level > APV_MAX_TRANS_COEFF) + return; + + int zz = int(zigzag[scan_pos]); + imageStore(dst[comp], pos + ivec2(zz & 7, zz >> 3), uvec4(uint(level) & 0xFFFFu)); + + prev_level = abs_ac_coeff_minus1 + 1; + if (first_ac != 0) { + prev_1st_ac_level = prev_level; + first_ac = 0; + } + + scan_pos++; + } + } while (scan_pos < APV_BLK_COEFFS); +} + +void main(void) +{ + const ivec2 tile_pos = ivec2(gl_WorkGroupID.xy); + const uint comp_idx = uint(gl_WorkGroupID.z); + + /* EC state */ + prev_dc = 0; + prev_k_dc = 5; + prev_1st_ac_level = 0; + + const int num_tiles = tile_count.x * tile_count.y; + const int tile_idx = tile_pos.y * tile_count.x + tile_pos.x; + const uvec2 tile_bs = tile_offset[int(comp_idx) * num_tiles + tile_idx]; + init_get_bits(gb, u8buf(tile_data + tile_bs.x), int(tile_bs.y)); + + ivec2 sub_shift = comp_idx == 0 ? ivec2(0) : log2_chroma_sub; + ivec2 tile_start = ivec2(tile_col[tile_pos.x], tile_row[tile_pos.y]); + ivec2 tile_dim = ivec2(tile_col[tile_pos.x + 1], + tile_row[tile_pos.y + 1]) - tile_start; + ivec2 tile_mb_dim = tile_dim / APV_MB_SIZE; + ivec2 blk_mb_dim = ivec2(2, 2) >> sub_shift; + + ivec2 mb, blk; + for (mb.y = 0; mb.y < tile_mb_dim.y; mb.y++) { + for (mb.x = 0; mb.x < tile_mb_dim.x; mb.x++) { + for (blk.y = 0; blk.y < blk_mb_dim.y; blk.y++) { + for (blk.x = 0; blk.x < blk_mb_dim.x; blk.x++) { + ivec2 pos = (APV_MB_SIZE*mb + + APV_TR_SIZE*blk + tile_start) >> sub_shift; + + decode_block(pos, comp_idx); + } + } + } + } +} diff --git a/libavcodec/vulkan/apv_idct.comp.glsl b/libavcodec/vulkan/apv_idct.comp.glsl new file mode 100644 index 0000000000..2b56bc02fc --- /dev/null +++ b/libavcodec/vulkan/apv_idct.comp.glsl @@ -0,0 +1,117 @@ +/* + * Copyright (c) 2025 Lynne <[email protected]> + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#version 460 +#pragma shader_stage(compute) +#extension GL_GOOGLE_include_directive : require + +#include "common.glsl" +#include "dct.glsl" + +#define APV_MAX_NUM_COMP 4 +#define APV_MAX_TILE_COLS 20 +#define APV_MAX_TILE_ROWS 20 +#define APV_MAX_TILE_COUNT (APV_MAX_TILE_COLS * APV_MAX_TILE_ROWS) +#define APV_TR_SIZE 8 +#define APV_BLOCKS_PER_WG 8 + +layout (set = 0, binding = 0) uniform uimage2D dst[]; +layout (set = 0, binding = 1, scalar) readonly buffer frame_data_buf { + uvec2 tile_offset[APV_MAX_NUM_COMP * APV_MAX_TILE_COUNT]; + uint8_t q_matrix[APV_MAX_NUM_COMP][8][8]; + uint8_t tile_qp[APV_MAX_NUM_COMP * APV_MAX_TILE_COUNT]; + uint16_t tile_col[APV_MAX_TILE_COLS + 1]; + uint16_t tile_row[APV_MAX_TILE_ROWS + 1]; +}; + +layout (push_constant, scalar) uniform pushConstants { + u8buf tile_data; + ivec2 tile_count; + ivec2 log2_chroma_sub; + int components; + int bit_depth; +}; + +const int apv_level_scale[6] = { 40, 45, 51, 57, 64, 71 }; + +void main(void) +{ + const uvec3 wgid = gl_WorkGroupID; + const uint comp = wgid.z; + + const uvec3 lid = gl_LocalInvocationID; + const uint block = (lid.y << 2) | (lid.x >> 3); /* 0..7 block in chunk */ + const uint col = lid.x & 0x7u; /* 0..7 column in block */ + + /* one workgroup handles eight horizontally neighbouring blocks */ + const int blk_x = int(wgid.x) * APV_BLOCKS_PER_WG + int(block); + const int blk_y = int(wgid.y); + const ivec2 pos = ivec2(blk_x, blk_y) * APV_TR_SIZE; + + /* note: some oddness happens on tile-boundaries */ + const ivec2 sub_shift = (comp == 0u) ? ivec2(0) : log2_chroma_sub; + const ivec2 luma_pos = pos << sub_shift; + + /* figure out the tile position */ + int tx = 0; + while (tx + 1 < tile_count.x && int(tile_col[tx + 1]) <= luma_pos.x) + tx++; + int ty = 0; + while (ty + 1 < tile_count.y && int(tile_row[ty + 1]) <= luma_pos.y) + ty++; + + const int tile_idx = ty * tile_count.x + tx; + const int qp = int(tile_qp[int(comp) * APV_MAX_TILE_COUNT + tile_idx]); + const int level_scale = apv_level_scale[qp % 6]; + const int qp_shift = qp / 6; + + const int half_range = 1 << (bit_depth - 1); + const int max_val = (1 << bit_depth) - 1; + const float fact = float(half_range); + const float norm = 1.0f / (1024.0f * fact); /* DCT normalization const */ + + [[unroll]] + for (uint y = 0u; y < 8u; y++) { + /* load */ + int raw = int(imageLoad(dst[comp], pos + ivec2(col, y)).x); + int coeff = sign_extend(raw, 16); + /* dequant + norm */ + int qs = level_scale * int(q_matrix[comp][col][y]) * (1 << qp_shift); + float v = float(coeff * qs) * norm; + /* scale */ + blocks[block][y * 9u + col] = v * idct_scale[y * 8u + col]; + } + barrier(); + + idct8(block, col, 9); + barrier(); + + blocks[block][col * 9u] += 1.0f; + + idct8(block, col * 9u, 1); + barrier(); + + [[unroll]] + for (int y = 0; y < 8; y++) { + float v = round(blocks[block][y * 9u + col] * fact); + imageStore(dst[comp], pos + ivec2(col, y), + uvec4(uint(clamp(int(v), 0, max_val)))); + } +} diff --git a/libavcodec/vulkan_apv.c b/libavcodec/vulkan_apv.c new file mode 100644 index 0000000000..891b7b74c5 --- /dev/null +++ b/libavcodec/vulkan_apv.c @@ -0,0 +1,490 @@ +/* + * Copyright (c) 2025 Lynne <[email protected]> + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "vulkan_decode.h" +#include "hwaccel_internal.h" + +#include "apv_decode.h" +#include "libavutil/mem.h" + +extern const unsigned char ff_apv_decode_comp_spv_data[]; +extern const unsigned int ff_apv_decode_comp_spv_len; + +extern const unsigned char ff_apv_idct_comp_spv_data[]; +extern const unsigned int ff_apv_idct_comp_spv_len; + +const FFVulkanDecodeDescriptor ff_vk_dec_apv_desc = { + .codec_id = AV_CODEC_ID_APV, + .queue_flags = VK_QUEUE_COMPUTE_BIT, +}; + +typedef struct APVVulkanDecodePicture { + FFVulkanDecodePicture vp; + + AVBufferRef *frame_data_buf; + uint32_t *frame_data; + int tile_num; +} APVVulkanDecodePicture; + +typedef struct APVVulkanDecodeContext { + FFVulkanShader decode; + FFVulkanShader idct; + + AVBufferPool *frame_data_pool; +} APVVulkanDecodeContext; + +typedef struct DecodePushData { + VkDeviceAddress tile_data; + int tile_count[2]; + int log2_chroma_sub[2]; + int components; + int bit_depth; +} DecodePushData; + +static int vk_apv_start_frame(AVCodecContext *avctx, + const AVBufferRef *buffer_ref, + av_unused const uint8_t *buffer, + av_unused uint32_t size) +{ + int err; + APVDecodeContext *apv = avctx->priv_data; + FFVulkanDecodeContext *dec = avctx->internal->hwaccel_priv_data; + FFVulkanDecodeShared *ctx = dec->shared_ctx; + APVVulkanDecodeContext *apvvk = ctx->sd_ctx; + + APVVulkanDecodePicture *apvvp = apv->hwaccel_picture_private; + FFVulkanDecodePicture *vp = &apvvp->vp; + + /* Host map the input tile data if supported */ + if (ctx->s.extensions & FF_VK_EXT_EXTERNAL_HOST_MEMORY) + ff_vk_host_map_buffer(&ctx->s, &vp->slices_buf, buffer_ref->data, + buffer_ref, + VK_BUFFER_USAGE_STORAGE_BUFFER_BIT | + VK_BUFFER_USAGE_SHADER_DEVICE_ADDRESS_BIT); + + /* Allocate frame data buffer */ + int fd_size = (2*4*APV_MAX_TILE_COUNT)*APV_MAX_NUM_COMP + + (64 + APV_MAX_TILE_COUNT)*APV_MAX_NUM_COMP + + (APV_MAX_TILE_COLS + 1 + APV_MAX_TILE_ROWS + 1)*2; + + err = ff_vk_get_pooled_buffer(&ctx->s, &apvvk->frame_data_pool, + &apvvp->frame_data_buf, + VK_BUFFER_USAGE_STORAGE_BUFFER_BIT | + VK_BUFFER_USAGE_SHADER_DEVICE_ADDRESS_BIT, + NULL, fd_size, + VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT | + VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT); + if (err < 0) + return err; + + /* Frame data */ + FFVkBuffer *frame_data = (FFVkBuffer *)apvvp->frame_data_buf->data; + uint8_t *fd = frame_data->mapped_mem; + + fd += 2*4*APV_MAX_TILE_COUNT*APV_MAX_NUM_COMP; /* Tile offsets go first */ + + /* per-component qmatrix and QPs */ + for (int i = 0; i < APV_MAX_NUM_COMP; i++) + memcpy(fd + 64*i, + apv->cur_raw_frame->frame_header.quantization_matrix.q_matrix[i], + 64); + fd += 64*APV_MAX_NUM_COMP; + + for (int i = 0; i < APV_MAX_NUM_COMP; i++) { + for (int j = 0; j < APV_MAX_TILE_COUNT; j++) + fd[j] = apv->cur_raw_frame->tile[j].tile_header.tile_qp[i]; + fd += APV_MAX_TILE_COUNT; + } + + /* tile col/row offset */ + memcpy(fd, apv->tile_info.col_starts, (APV_MAX_TILE_COLS+1)*2); + fd += (APV_MAX_TILE_COLS+1)*2; + memcpy(fd, apv->tile_info.row_starts, (APV_MAX_TILE_ROWS+1)*2); + + /* Prepare frame to be used */ + err = ff_vk_decode_prepare_frame_sdr(dec, apv->output_frame, vp, 1, + FF_VK_REP_NATIVE, 0); + if (err < 0) + return err; + + return 0; +} + +static int vk_apv_decode_slice(AVCodecContext *avctx, + const uint8_t *data, + uint32_t size) +{ + APVDecodeContext *apv = avctx->priv_data; + + APVVulkanDecodePicture *apvvp = apv->hwaccel_picture_private; + FFVulkanDecodePicture *vp = &apvvp->vp; + + FFVkBuffer *frame_data = (FFVkBuffer *)apvvp->frame_data_buf->data; + FFVkBuffer *slices_buf = vp->slices_buf ? (FFVkBuffer *)vp->slices_buf->data : NULL; + + if (slices_buf && slices_buf->host_ref) { + AV_WN32(frame_data->mapped_mem + (2*apvvp->tile_num + 0)*sizeof(uint32_t), + data - slices_buf->mapped_mem); + AV_WN32(frame_data->mapped_mem + (2*apvvp->tile_num + 1)*sizeof(uint32_t), + size); + + apvvp->tile_num++; + } else { + int err = ff_vk_decode_add_slice(avctx, vp, data, size, 0, + &apvvp->tile_num, + (const uint32_t **)&apvvp->frame_data); + if (err < 0) + return err; + + AV_WN32(frame_data->mapped_mem + (2*(apvvp->tile_num - 1) + 0)*sizeof(uint32_t), + apvvp->frame_data[apvvp->tile_num - 1]); + AV_WN32(frame_data->mapped_mem + (2*(apvvp->tile_num - 1) + 1)*sizeof(uint32_t), + size); + } + + return 0; +} + +static int vk_apv_end_frame(AVCodecContext *avctx) +{ + int err; + APVDecodeContext *apv = avctx->priv_data; + const CodedBitstreamAPVContext *apv_cbc = apv->cbc->priv_data; + FFVulkanDecodeContext *dec = avctx->internal->hwaccel_priv_data; + FFVulkanDecodeShared *ctx = dec->shared_ctx; + APVVulkanDecodeContext *apvvk = ctx->sd_ctx; + FFVulkanFunctions *vk = &ctx->s.vkfn; + + APVVulkanDecodePicture *apvvp = apv->hwaccel_picture_private; + FFVulkanDecodePicture *vp = &apvvp->vp; + + FFVkBuffer *slices_buf = (FFVkBuffer *)vp->slices_buf->data; + FFVkBuffer *frame_data_buf = (FFVkBuffer *)apvvp->frame_data_buf->data; + + AVHWFramesContext *hwfc = (AVHWFramesContext *)avctx->hw_frames_ctx->data; + enum AVPixelFormat sw_format = hwfc->sw_format; + const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(sw_format); + + VkImageMemoryBarrier2 img_bar[8]; + int nb_img_bar = 0; + + FFVkExecContext *exec = ff_vk_exec_get(&ctx->s, &ctx->exec_pool); + ff_vk_exec_start(&ctx->s, exec); + + /* Make sure the buffer is flushed */ + RET(ff_vk_flush_buffer(&ctx->s, frame_data_buf, 0, frame_data_buf->size, 1)); + + /* Prepare deps */ + RET(ff_vk_exec_add_dep_frame(&ctx->s, exec, apv->output_frame, + VK_PIPELINE_STAGE_2_ALL_COMMANDS_BIT, + VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT)); + + err = ff_vk_exec_mirror_sem_value(&ctx->s, exec, &vp->sem, &vp->sem_value, + apv->output_frame); + if (err < 0) + return err; + + RET(ff_vk_exec_add_dep_buf(&ctx->s, exec, &vp->slices_buf, 1, 0)); + vp->slices_buf = NULL; + RET(ff_vk_exec_add_dep_buf(&ctx->s, exec, &apvvp->frame_data_buf, 1, 0)); + apvvp->frame_data_buf = NULL; + + AVVkFrame *vkf = (AVVkFrame *)apv->output_frame->data[0]; + vkf->layout[0] = VK_IMAGE_LAYOUT_UNDEFINED; + vkf->access[0] = VK_ACCESS_2_NONE; + + ff_vk_frame_barrier(&ctx->s, exec, apv->output_frame, + img_bar, &nb_img_bar, + VK_PIPELINE_STAGE_2_ALL_COMMANDS_BIT, + VK_PIPELINE_STAGE_2_CLEAR_BIT, + VK_ACCESS_2_TRANSFER_WRITE_BIT, + VK_IMAGE_LAYOUT_GENERAL, + VK_QUEUE_FAMILY_IGNORED); + vk->CmdPipelineBarrier2(exec->buf, &(VkDependencyInfo) { + .sType = VK_STRUCTURE_TYPE_DEPENDENCY_INFO, + .pImageMemoryBarriers = img_bar, + .imageMemoryBarrierCount = nb_img_bar, + }); + nb_img_bar = 0; + + /* Zero frame */ + for (int i = 0; i < ff_vk_count_images(vkf); i++) + vk->CmdClearColorImage(exec->buf, vkf->img[i], + VK_IMAGE_LAYOUT_GENERAL, + &((VkClearColorValue) { 0 }), + 1, &((VkImageSubresourceRange) { + .aspectMask = VK_IMAGE_ASPECT_COLOR_BIT, + .levelCount = 1, + .layerCount = 1, + })); + + /* Wait for the frame to get zeroed out before continuing */ + ff_vk_frame_barrier(&ctx->s, exec, apv->output_frame, img_bar, &nb_img_bar, + VK_PIPELINE_STAGE_2_CLEAR_BIT, + VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT, + VK_ACCESS_2_SHADER_STORAGE_WRITE_BIT, + VK_IMAGE_LAYOUT_GENERAL, + VK_QUEUE_FAMILY_IGNORED); + vk->CmdPipelineBarrier2(exec->buf, &(VkDependencyInfo) { + .sType = VK_STRUCTURE_TYPE_DEPENDENCY_INFO, + .pImageMemoryBarriers = img_bar, + .imageMemoryBarrierCount = nb_img_bar, + }); + nb_img_bar = 0; + + /* Setup push data */ + DecodePushData pd = (DecodePushData) { + .tile_data = slices_buf->address, + .tile_count = { apv->tile_info.tile_cols, apv->tile_info.tile_rows }, + .log2_chroma_sub = { desc->log2_chroma_w, desc->log2_chroma_h }, + .components = desc->nb_components, + .bit_depth = apv_cbc->bit_depth, + }; + + /* Decoding */ + ff_vk_shader_update_img_array(&ctx->s, exec, &apvvk->decode, + apv->output_frame, vp->view.out, + 0, 0, + VK_IMAGE_LAYOUT_GENERAL, + VK_NULL_HANDLE); + ff_vk_shader_update_desc_buffer(&ctx->s, exec, &apvvk->decode, + 0, 1, 0, + frame_data_buf, + 0, frame_data_buf->size, + VK_FORMAT_UNDEFINED); + + ff_vk_exec_bind_shader(&ctx->s, exec, &apvvk->decode); + ff_vk_shader_update_push_const(&ctx->s, exec, &apvvk->decode, + VK_SHADER_STAGE_COMPUTE_BIT, + 0, sizeof(pd), &pd); + + vk->CmdDispatch(exec->buf, + apv->tile_info.tile_cols, apv->tile_info.tile_rows, + desc->nb_components); + + /* Wait for all decoding to finish */ + ff_vk_frame_barrier(&ctx->s, exec, apv->output_frame, img_bar, &nb_img_bar, + VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT, + VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT, + VK_ACCESS_2_SHADER_STORAGE_READ_BIT | + VK_ACCESS_2_SHADER_STORAGE_WRITE_BIT, + VK_IMAGE_LAYOUT_GENERAL, + VK_QUEUE_FAMILY_IGNORED); + vk->CmdPipelineBarrier2(exec->buf, &(VkDependencyInfo) { + .sType = VK_STRUCTURE_TYPE_DEPENDENCY_INFO, + .pImageMemoryBarriers = img_bar, + .imageMemoryBarrierCount = nb_img_bar, + }); + nb_img_bar = 0; + + /* iDCT */ + ff_vk_shader_update_img_array(&ctx->s, exec, &apvvk->idct, + apv->output_frame, vp->view.out, + 0, 0, + VK_IMAGE_LAYOUT_GENERAL, + VK_NULL_HANDLE); + ff_vk_shader_update_desc_buffer(&ctx->s, exec, &apvvk->idct, + 0, 1, 0, + frame_data_buf, + 0, frame_data_buf->size, + VK_FORMAT_UNDEFINED); + + ff_vk_exec_bind_shader(&ctx->s, exec, &apvvk->idct); + ff_vk_shader_update_push_const(&ctx->s, exec, &apvvk->idct, + VK_SHADER_STAGE_COMPUTE_BIT, + 0, sizeof(pd), &pd); + + /* one workgroup per group of 8 horizontally adjacent transform blocks, + * in the luma basis coords, in case a block is OOB writes/reads are ignored */ + int idct_cx = 0, idct_by = 0; + for (int comp = 0; comp < desc->nb_components; comp++) { + int sw = (comp == 0) ? 0 : desc->log2_chroma_w; + int sh = (comp == 0) ? 0 : desc->log2_chroma_h; + int bx = (avctx->coded_width + (1 << (3 + sw)) - 1) >> (3 + sw); + int by = (avctx->coded_height + (1 << (3 + sh)) - 1) >> (3 + sh); + idct_cx = FFMAX(idct_cx, (bx + 7) >> 3); + idct_by = FFMAX(idct_by, by); + } + vk->CmdDispatch(exec->buf, idct_cx, idct_by, desc->nb_components); + + err = ff_vk_exec_submit(&ctx->s, exec); + if (err < 0) + return err; + +fail: + return 0; +} + +static int init_decode_shader(AVCodecContext *avctx, FFVulkanContext *s, + FFVkExecPool *pool, FFVulkanShader *shd) +{ + int err; + AVHWFramesContext *dec_frames_ctx; + dec_frames_ctx = (AVHWFramesContext *)avctx->hw_frames_ctx->data; + + ff_vk_shader_load(shd, VK_SHADER_STAGE_COMPUTE_BIT, NULL, + (uint32_t []) { 1, 1, 1 }, 0); + ff_vk_shader_add_push_const(shd, 0, sizeof(DecodePushData), + VK_SHADER_STAGE_COMPUTE_BIT); + + const FFVulkanDescriptorSetBinding desc_set[] = { + { + .name = "dst", + .type = VK_DESCRIPTOR_TYPE_STORAGE_IMAGE, + .elems = av_pix_fmt_count_planes(dec_frames_ctx->sw_format), + .stages = VK_SHADER_STAGE_COMPUTE_BIT, + }, + { + .name = "frame_data_buf", + .type = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, + .stages = VK_SHADER_STAGE_COMPUTE_BIT, + } + }; + ff_vk_shader_add_descriptor_set(s, shd, desc_set, 2, 0, 0); + + RET(ff_vk_shader_link(s, shd, + ff_apv_decode_comp_spv_data, + ff_apv_decode_comp_spv_len, "main")); + + RET(ff_vk_shader_register_exec(s, pool, shd)); + +fail: + return err; +} + +static int init_idct_shader(AVCodecContext *avctx, FFVulkanContext *s, + FFVkExecPool *pool, FFVulkanShader *shd) +{ + int err; + AVHWFramesContext *dec_frames_ctx; + dec_frames_ctx = (AVHWFramesContext *)avctx->hw_frames_ctx->data; + + SPEC_LIST_CREATE(sl, 1 + 64, (1 + 64)*sizeof(uint32_t)) + SPEC_LIST_ADD(sl, 16, 32, 8); /* nb_blocks per workgroup */ + + const double idct_8_scales[8] = { + cos(4.0*M_PI/16.0) / 2.0, cos(1.0*M_PI/16.0) / 2.0, + cos(2.0*M_PI/16.0) / 2.0, cos(3.0*M_PI/16.0) / 2.0, + cos(4.0*M_PI/16.0) / 2.0, cos(5.0*M_PI/16.0) / 2.0, + cos(6.0*M_PI/16.0) / 2.0, cos(7.0*M_PI/16.0) / 2.0, + }; + for (int i = 0; i < 64; i++) + SPEC_LIST_ADD(sl, 18 + i, 32, + av_float2int(idct_8_scales[i >> 3]*idct_8_scales[i & 7])); + + ff_vk_shader_load(shd, VK_SHADER_STAGE_COMPUTE_BIT, sl, + (uint32_t []) { 32, 2, 1 }, 0); + ff_vk_shader_add_push_const(shd, 0, sizeof(DecodePushData), + VK_SHADER_STAGE_COMPUTE_BIT); + + FFVulkanDescriptorSetBinding desc_set[] = { + { + .name = "dst", + .type = VK_DESCRIPTOR_TYPE_STORAGE_IMAGE, + .stages = VK_SHADER_STAGE_COMPUTE_BIT, + .elems = av_pix_fmt_count_planes(dec_frames_ctx->sw_format), + }, + { + .name = "frame_data_buf", + .type = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, + .stages = VK_SHADER_STAGE_COMPUTE_BIT, + }, + }; + ff_vk_shader_add_descriptor_set(s, shd, desc_set, 2, 0, 0); + + RET(ff_vk_shader_link(s, shd, + ff_apv_idct_comp_spv_data, + ff_apv_idct_comp_spv_len, "main")); + + RET(ff_vk_shader_register_exec(s, pool, shd)); + +fail: + return err; +} + +static void vk_decode_apv_uninit(FFVulkanDecodeShared *ctx) +{ + APVVulkanDecodeContext *apvvk = ctx->sd_ctx; + + ff_vk_shader_free(&ctx->s, &apvvk->decode); + ff_vk_shader_free(&ctx->s, &apvvk->idct); + + av_buffer_pool_uninit(&apvvk->frame_data_pool); + + av_freep(&apvvk); +} + +static int vk_decode_apv_init(AVCodecContext *avctx) +{ + int err; + FFVulkanDecodeContext *dec = avctx->internal->hwaccel_priv_data; + + err = ff_vk_decode_init(avctx); + if (err < 0) + return err; + + FFVulkanDecodeShared *ctx = dec->shared_ctx; + APVVulkanDecodeContext *apvvk = ctx->sd_ctx = av_mallocz(sizeof(*apvvk)); + if (!apvvk) { + err = AVERROR(ENOMEM); + goto fail; + } + + ctx->sd_ctx_free = &vk_decode_apv_uninit; + + RET(init_decode_shader(avctx, &ctx->s, &ctx->exec_pool, + &apvvk->decode)); + + RET(init_idct_shader(avctx, &ctx->s, &ctx->exec_pool, + &apvvk->idct)); + +fail: + return err; +} + +static void vk_apv_free_frame_priv(AVRefStructOpaque _hwctx, void *data) +{ + AVHWDeviceContext *dev_ctx = _hwctx.nc; + + APVVulkanDecodePicture *apvvp = data; + FFVulkanDecodePicture *vp = &apvvp->vp; + + ff_vk_decode_free_frame(dev_ctx, vp); + + av_buffer_unref(&apvvp->frame_data_buf); +} + +const FFHWAccel ff_apv_vulkan_hwaccel = { + .p.name = "apv_vulkan", + .p.type = AVMEDIA_TYPE_VIDEO, + .p.id = AV_CODEC_ID_APV, + .p.pix_fmt = AV_PIX_FMT_VULKAN, + .start_frame = &vk_apv_start_frame, + .decode_slice = &vk_apv_decode_slice, + .end_frame = &vk_apv_end_frame, + .free_frame_priv = &vk_apv_free_frame_priv, + .frame_priv_data_size = sizeof(APVVulkanDecodePicture), + .init = &vk_decode_apv_init, + .update_thread_context = &ff_vk_update_thread_context, + .uninit = &ff_vk_decode_uninit, + .frame_params = &ff_vk_frame_params, + .priv_data_size = sizeof(FFVulkanDecodeContext), + .caps_internal = HWACCEL_CAP_ASYNC_SAFE | HWACCEL_CAP_THREAD_SAFE, +}; diff --git a/libavcodec/vulkan_decode.c b/libavcodec/vulkan_decode.c index e87436b881..8c312221a3 100644 --- a/libavcodec/vulkan_decode.c +++ b/libavcodec/vulkan_decode.c @@ -28,6 +28,7 @@ #define DECODER_IS_SDR(codec_id) \ (((codec_id) == AV_CODEC_ID_FFV1) || \ ((codec_id) == AV_CODEC_ID_DPX) || \ + ((codec_id) == AV_CODEC_ID_APV) || \ ((codec_id) == AV_CODEC_ID_PRORES_RAW) || \ ((codec_id) == AV_CODEC_ID_PRORES)) @@ -55,6 +56,9 @@ extern const FFVulkanDecodeDescriptor ff_vk_dec_prores_desc; #if CONFIG_DPX_VULKAN_HWACCEL extern const FFVulkanDecodeDescriptor ff_vk_dec_dpx_desc; #endif +#if CONFIG_APV_VULKAN_HWACCEL +extern const FFVulkanDecodeDescriptor ff_vk_dec_apv_desc; +#endif static const FFVulkanDecodeDescriptor *dec_descs[] = { #if CONFIG_H264_VULKAN_HWACCEL @@ -81,6 +85,9 @@ static const FFVulkanDecodeDescriptor *dec_descs[] = { #if CONFIG_DPX_VULKAN_HWACCEL &ff_vk_dec_dpx_desc, #endif +#if CONFIG_APV_VULKAN_HWACCEL + &ff_vk_dec_apv_desc, +#endif }; typedef struct FFVulkanDecodeProfileData { _______________________________________________ ffmpeg-cvslog mailing list -- [email protected] To unsubscribe send an email to [email protected]
