PR #23089 opened by Lynne URL: https://code.ffmpeg.org/FFmpeg/FFmpeg/pulls/23089 Patch URL: https://code.ffmpeg.org/FFmpeg/FFmpeg/pulls/23089.patch
This commit adds an APV decoder written in shaders. Benchmarks will follow later. It follows the style of the ProRes decoder, with some differences. The iDCT processes 8 horizontally adjacent blocks as its minimum unit; at 8 columns per block that gives 64 invocations, which fills an AMD wavefront cleanly. The VLC parser could use some more optimizations. >From 641de5ab83db7afeb4c11bdb15158b5f1721feac Mon Sep 17 00:00:00 2001 From: Lynne <[email protected]> Date: Sun, 2 Nov 2025 14:38:10 +0000 Subject: [PATCH 1/5] apv_decode: add hardware decoding hooks --- libavcodec/apv_decode.c | 149 ++++++++++++++++++++++++---------------- libavcodec/apv_decode.h | 35 +++++++++- 2 files changed, 124 insertions(+), 60 deletions(-) diff --git a/libavcodec/apv_decode.c b/libavcodec/apv_decode.c index 4c0fd78cb0..4d9380b332 100644 --- a/libavcodec/apv_decode.c +++ b/libavcodec/apv_decode.c @@ -27,42 +27,15 @@ #include "apv.h" #include "apv_decode.h" -#include "apv_dsp.h" #include "avcodec.h" -#include "cbs.h" #include "cbs_apv.h" #include "codec_internal.h" #include "decode.h" #include "internal.h" #include "thread.h" - - -typedef struct APVDerivedTileInfo { - uint8_t tile_cols; - uint8_t tile_rows; - uint16_t num_tiles; - // The spec uses an extra element on the end of these arrays - // not corresponding to any tile. 
- uint16_t col_starts[APV_MAX_TILE_COLS + 1]; - uint16_t row_starts[APV_MAX_TILE_ROWS + 1]; -} APVDerivedTileInfo; - -typedef struct APVDecodeContext { - CodedBitstreamContext *cbc; - APVDSPContext dsp; - - CodedBitstreamFragment au; - APVDerivedTileInfo tile_info; - - AVPacket *pkt; - AVFrame *output_frame; - atomic_int tile_errors; - - int nb_unit; - - uint8_t warned_additional_frames; - uint8_t warned_unknown_pbu_types; -} APVDecodeContext; +#include "hwconfig.h" +#include "hwaccel_internal.h" +#include "config_components.h" static const enum AVPixelFormat apv_format_table[5][4] = { { AV_PIX_FMT_GRAY10, AV_PIX_FMT_GRAY12, AV_PIX_FMT_GRAY14, AV_PIX_FMT_GRAY16 }, @@ -74,10 +47,23 @@ static const enum AVPixelFormat apv_format_table[5][4] = { static APVVLCLUT decode_lut; +static enum AVPixelFormat get_pixel_format(AVCodecContext *avctx, + enum AVPixelFormat pix_fmt) +{ + enum AVPixelFormat pix_fmts[] = { + pix_fmt, + AV_PIX_FMT_NONE, + }; + + return ff_get_format(avctx, pix_fmts); +} + static int apv_decode_check_format(AVCodecContext *avctx, const APVRawFrameHeader *header) { - int err, bit_depth; + int err, bit_depth, dimensions_changed = 0; + enum AVPixelFormat pix_fmt; + APVDecodeContext *apv = avctx->priv_data; avctx->profile = header->frame_info.profile_idc; avctx->level = header->frame_info.level_idc; @@ -88,23 +74,36 @@ static int apv_decode_check_format(AVCodecContext *avctx, avpriv_request_sample(avctx, "Bit depth %d", bit_depth); return AVERROR_PATCHWELCOME; } - avctx->pix_fmt = - apv_format_table[header->frame_info.chroma_format_idc][(bit_depth - 10) >> 1]; - if (avctx->pix_fmt == AV_PIX_FMT_NONE) { + pix_fmt = apv_format_table[header->frame_info.chroma_format_idc][(bit_depth - 10) >> 1]; + if (pix_fmt == AV_PIX_FMT_NONE) { avpriv_request_sample(avctx, "YUVA444P14"); return AVERROR_PATCHWELCOME; } - err = ff_set_dimensions(avctx, - FFALIGN(header->frame_info.frame_width, 16), - FFALIGN(header->frame_info.frame_height, 16)); - if (err < 0) { - // 
Unsupported frame size. - return err; + if (avctx->width != header->frame_info.frame_width || + avctx->width != header->frame_info.frame_height) { + err = ff_set_dimensions(avctx, + FFALIGN(header->frame_info.frame_width, 16), + FFALIGN(header->frame_info.frame_height, 16)); + if (err < 0) { + // Unsupported frame size. + return err; + } + avctx->width = header->frame_info.frame_width; + avctx->height = header->frame_info.frame_height; + dimensions_changed = 1; + } + + if (pix_fmt != apv->pix_fmt || dimensions_changed) { + apv->pix_fmt = pix_fmt; + + err = get_pixel_format(avctx, pix_fmt); + if (err < 0) + return err; + + avctx->pix_fmt = err; } - avctx->width = header->frame_info.frame_width; - avctx->height = header->frame_info.frame_height; avctx->sample_aspect_ratio = (AVRational){ 1, 1 }; @@ -138,6 +137,8 @@ static av_cold int apv_decode_init(AVCodecContext *avctx) APVDecodeContext *apv = avctx->priv_data; int err; + apv->pix_fmt = AV_PIX_FMT_NONE; + ff_thread_once(&apv_entropy_once, apv_entropy_build_decode_lut); err = ff_cbs_init(&apv->cbc, AV_CODEC_ID_APV, avctx); @@ -217,7 +218,7 @@ static int apv_decode_tile_component(AVCodecContext *avctx, void *data, int comp_index = job % apv_cbc->num_comp; const AVPixFmtDescriptor *pix_fmt_desc = - av_pix_fmt_desc_get(avctx->pix_fmt); + av_pix_fmt_desc_get(apv->pix_fmt); int sub_w_shift = comp_index == 0 ? 0 : pix_fmt_desc->log2_chroma_w; int sub_h_shift = comp_index == 0 ? 
0 : pix_fmt_desc->log2_chroma_h; @@ -367,7 +368,7 @@ static int apv_decode(AVCodecContext *avctx, AVFrame *output, if (avctx->skip_frame == AVDISCARD_ALL) return 0; - desc = av_pix_fmt_desc_get(avctx->pix_fmt); + desc = av_pix_fmt_desc_get(apv->pix_fmt); av_assert0(desc); err = ff_thread_get_buffer(avctx, output, 0); @@ -379,22 +380,51 @@ static int apv_decode(AVCodecContext *avctx, AVFrame *output, apv_derive_tile_info(tile_info, &input->frame_header); - // Each component within a tile is independent of every other, - // so we can decode all in parallel. - job_count = tile_info->num_tiles * desc->nb_components; + if (avctx->hwaccel) { + const FFHWAccel *hwaccel = ffhwaccel(avctx->hwaccel); - avctx->execute2(avctx, apv_decode_tile_component, - input, NULL, job_count); + err = ff_hwaccel_frame_priv_alloc(avctx, &apv->hwaccel_picture_private); + if (err < 0) + return err; - err = atomic_load_explicit(&apv->tile_errors, memory_order_relaxed); - if (err > 0) { - av_log(avctx, AV_LOG_ERROR, - "Decode errors in %d tile components.\n", err); - if (avctx->flags & AV_CODEC_FLAG_OUTPUT_CORRUPT) { - // Output the frame anyway. - output->flags |= AV_FRAME_FLAG_CORRUPT; - } else { - return AVERROR_INVALIDDATA; + err = hwaccel->start_frame(avctx, apv->pkt->buf, + apv->pkt->data, apv->pkt->size); + if (err < 0) + return err; + + for (int j = 0; j < desc->nb_components; j++) { + for (int i = 0; i < tile_info->num_tiles; i++) { + APVRawTile *tile = &input->tile[i]; + err = hwaccel->decode_slice(avctx, tile->tile_data[j], + tile->tile_header.tile_data_size[j]); + if (err < 0) + return err; + } + } + + err = hwaccel->end_frame(avctx); + if (err < 0) + return err; + + av_refstruct_unref(&apv->hwaccel_picture_private); + } else { + // Each component within a tile is independent of every other, + // so we can decode all in parallel. 
+ job_count = tile_info->num_tiles * desc->nb_components; + + avctx->execute2(avctx, apv_decode_tile_component, + input, NULL, job_count); + + err = atomic_load_explicit(&apv->tile_errors, memory_order_relaxed); + if (err > 0) { + av_log(avctx, AV_LOG_ERROR, + "Decode errors in %d tile components.\n", err); + if (avctx->flags & AV_CODEC_FLAG_OUTPUT_CORRUPT) { + // Output the frame anyway. + output->flags |= AV_FRAME_FLAG_CORRUPT; + } else { + return AVERROR_INVALIDDATA; + } } } @@ -571,4 +601,7 @@ const FFCodec ff_apv_decoder = { AV_CODEC_CAP_SLICE_THREADS | AV_CODEC_CAP_FRAME_THREADS, .caps_internal = FF_CODEC_CAP_SKIP_FRAME_FILL_PARAM, + .hw_configs = (const AVCodecHWConfigInternal *const []) { + NULL + }, }; diff --git a/libavcodec/apv_decode.h b/libavcodec/apv_decode.h index 5671d89552..60259e1b5e 100644 --- a/libavcodec/apv_decode.h +++ b/libavcodec/apv_decode.h @@ -20,11 +20,14 @@ #define AVCODEC_APV_DECODE_H #include <stdint.h> +#include <stdatomic.h> #include "apv.h" -#include "avcodec.h" -#include "get_bits.h" +#include "apv_dsp.h" +#include "cbs.h" +#include "get_bits.h" +#include "libavutil/frame.h" // Number of bits in the entropy look-up tables. // It may be desirable to tune this per-architecture, as a larger LUT @@ -81,6 +84,34 @@ typedef struct APVEntropyState { uint8_t prev_k_level; } APVEntropyState; +typedef struct APVDerivedTileInfo { + uint8_t tile_cols; + uint8_t tile_rows; + uint16_t num_tiles; + // The spec uses an extra element on the end of these arrays + // not corresponding to any tile. 
+ uint16_t col_starts[APV_MAX_TILE_COLS + 1]; + uint16_t row_starts[APV_MAX_TILE_ROWS + 1]; +} APVDerivedTileInfo; + +typedef struct APVDecodeContext { + CodedBitstreamContext *cbc; + APVDSPContext dsp; + + CodedBitstreamFragment au; + APVDerivedTileInfo tile_info; + + AVPacket *pkt; + AVFrame *output_frame; + void *hwaccel_picture_private; + atomic_int tile_errors; + + enum AVPixelFormat pix_fmt; + int nb_unit; + + uint8_t warned_additional_frames; + uint8_t warned_unknown_pbu_types; +} APVDecodeContext; /** * Build the decoder VLC look-up tables. -- 2.52.0 >From b0dc052732f9d237e0eebe3fb331dd99d9e65049 Mon Sep 17 00:00:00 2001 From: Lynne <[email protected]> Date: Tue, 4 Nov 2025 19:02:02 +0100 Subject: [PATCH 2/5] apv_decode: expose decode_lut This makes it usable outside of apv_decode.c --- libavcodec/apv_decode.c | 6 +++--- libavcodec/apv_decode.h | 2 ++ 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/libavcodec/apv_decode.c b/libavcodec/apv_decode.c index 4d9380b332..c48dc0b588 100644 --- a/libavcodec/apv_decode.c +++ b/libavcodec/apv_decode.c @@ -45,7 +45,7 @@ static const enum AVPixelFormat apv_format_table[5][4] = { { AV_PIX_FMT_YUVA444P10, AV_PIX_FMT_YUVA444P12, AV_PIX_FMT_NONE, AV_PIX_FMT_YUVA444P16 }, }; -static APVVLCLUT decode_lut; +APVVLCLUT ff_apv_decode_lut; static enum AVPixelFormat get_pixel_format(AVCodecContext *avctx, enum AVPixelFormat pix_fmt) @@ -129,7 +129,7 @@ static AVOnce apv_entropy_once = AV_ONCE_INIT; static av_cold void apv_entropy_build_decode_lut(void) { - ff_apv_entropy_build_decode_lut(&decode_lut); + ff_apv_entropy_build_decode_lut(&ff_apv_decode_lut); } static av_cold int apv_decode_init(AVCodecContext *avctx) @@ -248,7 +248,7 @@ static int apv_decode_tile_component(AVCodecContext *avctx, void *data, APVEntropyState entropy_state = { .log_ctx = avctx, - .decode_lut = &decode_lut, + .decode_lut = &ff_apv_decode_lut, .prev_dc = 0, .prev_k_dc = 5, .prev_k_level = 0, diff --git a/libavcodec/apv_decode.h 
b/libavcodec/apv_decode.h index 60259e1b5e..dee908eebb 100644 --- a/libavcodec/apv_decode.h +++ b/libavcodec/apv_decode.h @@ -113,6 +113,8 @@ typedef struct APVDecodeContext { uint8_t warned_unknown_pbu_types; } APVDecodeContext; +extern APVVLCLUT ff_apv_decode_lut; + /** * Build the decoder VLC look-up tables. */ -- 2.52.0 >From b68860343bda3e73b60f3f8e2b978d9cb49929fd Mon Sep 17 00:00:00 2001 From: Lynne <[email protected]> Date: Tue, 4 Nov 2025 21:43:22 +0100 Subject: [PATCH 3/5] apv_decode: expose the current frame header We need to know the quantization matrix. --- libavcodec/apv_decode.c | 3 ++- libavcodec/apv_decode.h | 2 ++ 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/libavcodec/apv_decode.c b/libavcodec/apv_decode.c index c48dc0b588..1fd96e41de 100644 --- a/libavcodec/apv_decode.c +++ b/libavcodec/apv_decode.c @@ -82,7 +82,7 @@ static int apv_decode_check_format(AVCodecContext *avctx, } if (avctx->width != header->frame_info.frame_width || - avctx->width != header->frame_info.frame_height) { + avctx->height != header->frame_info.frame_height) { err = ff_set_dimensions(avctx, FFALIGN(header->frame_info.frame_width, 16), FFALIGN(header->frame_info.frame_height, 16)); @@ -375,6 +375,7 @@ static int apv_decode(AVCodecContext *avctx, AVFrame *output, if (err < 0) return err; + apv->cur_raw_frame = input; apv->output_frame = output; atomic_store_explicit(&apv->tile_errors, 0, memory_order_relaxed); diff --git a/libavcodec/apv_decode.h b/libavcodec/apv_decode.h index dee908eebb..dd65f1b948 100644 --- a/libavcodec/apv_decode.h +++ b/libavcodec/apv_decode.h @@ -24,6 +24,7 @@ #include "apv.h" #include "apv_dsp.h" +#include "cbs_apv.h" #include "cbs.h" #include "get_bits.h" @@ -100,6 +101,7 @@ typedef struct APVDecodeContext { CodedBitstreamFragment au; APVDerivedTileInfo tile_info; + APVRawFrame *cur_raw_frame; AVPacket *pkt; AVFrame *output_frame; -- 2.52.0 >From 95dec40ca75a8ce4271e420fd9be8ce0f87765de Mon Sep 17 00:00:00 2001 From: Lynne <[email 
protected]> Date: Sun, 14 Dec 2025 06:28:16 +0100 Subject: [PATCH 4/5] apv_decode: switch slice decode printout to TRACE It's much too noisy with multiple thousands of slices. --- libavcodec/apv_decode.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/libavcodec/apv_decode.c b/libavcodec/apv_decode.c index 1fd96e41de..d03463e5e5 100644 --- a/libavcodec/apv_decode.c +++ b/libavcodec/apv_decode.c @@ -310,7 +310,7 @@ static int apv_decode_tile_component(AVCodecContext *avctx, void *data, } } - av_log(avctx, AV_LOG_DEBUG, + av_log(avctx, AV_LOG_TRACE, "Decoded tile %d component %d: %dx%d MBs starting at (%d,%d)\n", tile_index, comp_index, tile_mb_width, tile_mb_height, tile_start_x, tile_start_y); -- 2.52.0 >From 5a079b956b32139bfac26908d1e1a17ca07ee50d Mon Sep 17 00:00:00 2001 From: Lynne <[email protected]> Date: Sun, 2 Nov 2025 14:38:33 +0000 Subject: [PATCH 5/5] apv_decode: add a Vulkan hwaccel --- configure | 2 + libavcodec/Makefile | 1 + libavcodec/apv_decode.c | 6 + libavcodec/hwaccels.h | 1 + libavcodec/vulkan/Makefile | 3 + libavcodec/vulkan/apv_decode.comp.glsl | 221 +++++++++++ libavcodec/vulkan/apv_idct.comp.glsl | 117 ++++++ libavcodec/vulkan_apv.c | 524 +++++++++++++++++++++++++ libavcodec/vulkan_decode.c | 7 + libavcodec/vulkan_prores_raw.c | 1 - 10 files changed, 882 insertions(+), 1 deletion(-) create mode 100644 libavcodec/vulkan/apv_decode.comp.glsl create mode 100644 libavcodec/vulkan/apv_idct.comp.glsl create mode 100644 libavcodec/vulkan_apv.c diff --git a/configure b/configure index 39a522e7e8..5980ef2074 100755 --- a/configure +++ b/configure @@ -3403,6 +3403,8 @@ videotoolbox_hwaccel_extralibs="-framework QuartzCore" vulkan_deps="threads" vulkan_deps_any="libdl LoadLibrary" +apv_vulkan_hwaccel_deps="vulkan spirv_compiler" +apv_vulkan_hwaccel_select="apv_decoder" av1_d3d11va_hwaccel_deps="d3d11va DXVA_PicParams_AV1" av1_d3d11va_hwaccel_select="av1_decoder" av1_d3d11va2_hwaccel_deps="d3d11va DXVA_PicParams_AV1" diff --git 
a/libavcodec/Makefile b/libavcodec/Makefile index 85d35913f3..4935cfc3b3 100644 --- a/libavcodec/Makefile +++ b/libavcodec/Makefile @@ -1047,6 +1047,7 @@ OBJS-$(CONFIG_VIDEOTOOLBOX) += videotoolbox.o OBJS-$(CONFIG_VDPAU) += vdpau.o OBJS-$(CONFIG_VULKAN) += vulkan.o vulkan_video.o +OBJS-$(CONFIG_APV_VULKAN_HWACCEL) += vulkan_decode.o vulkan_apv.o OBJS-$(CONFIG_AV1_D3D11VA_HWACCEL) += dxva2_av1.o OBJS-$(CONFIG_AV1_DXVA2_HWACCEL) += dxva2_av1.o OBJS-$(CONFIG_AV1_D3D12VA_HWACCEL) += dxva2_av1.o d3d12va_av1.o diff --git a/libavcodec/apv_decode.c b/libavcodec/apv_decode.c index d03463e5e5..cdd6aad1a8 100644 --- a/libavcodec/apv_decode.c +++ b/libavcodec/apv_decode.c @@ -51,6 +51,9 @@ static enum AVPixelFormat get_pixel_format(AVCodecContext *avctx, enum AVPixelFormat pix_fmt) { enum AVPixelFormat pix_fmts[] = { +#if CONFIG_APV_VULKAN_HWACCEL + AV_PIX_FMT_VULKAN, +#endif pix_fmt, AV_PIX_FMT_NONE, }; @@ -603,6 +606,9 @@ const FFCodec ff_apv_decoder = { AV_CODEC_CAP_FRAME_THREADS, .caps_internal = FF_CODEC_CAP_SKIP_FRAME_FILL_PARAM, .hw_configs = (const AVCodecHWConfigInternal *const []) { +#if CONFIG_APV_VULKAN_HWACCEL + HWACCEL_VULKAN(apv), +#endif NULL }, }; diff --git a/libavcodec/hwaccels.h b/libavcodec/hwaccels.h index 3de191288a..9af49eedb8 100644 --- a/libavcodec/hwaccels.h +++ b/libavcodec/hwaccels.h @@ -19,6 +19,7 @@ #ifndef AVCODEC_HWACCELS_H #define AVCODEC_HWACCELS_H +extern const struct FFHWAccel ff_apv_vulkan_hwaccel; extern const struct FFHWAccel ff_av1_d3d11va_hwaccel; extern const struct FFHWAccel ff_av1_d3d11va2_hwaccel; extern const struct FFHWAccel ff_av1_d3d12va_hwaccel; diff --git a/libavcodec/vulkan/Makefile b/libavcodec/vulkan/Makefile index c779b49fad..c6817967c7 100644 --- a/libavcodec/vulkan/Makefile +++ b/libavcodec/vulkan/Makefile @@ -1,6 +1,9 @@ clean:: $(RM) $(CLEANSUFFIXES:%=libavcodec/vulkan/%) +OBJS-$(CONFIG_APV_VULKAN_HWACCEL) += vulkan/apv_decode.comp.spv.o \ + vulkan/apv_idct.comp.spv.o + OBJS-$(CONFIG_FFV1_VULKAN_ENCODER) += 
vulkan/ffv1_enc_setup.comp.spv.o \ vulkan/ffv1_enc_reset.comp.spv.o \ vulkan/ffv1_enc_reset_golomb.comp.spv.o \ diff --git a/libavcodec/vulkan/apv_decode.comp.glsl b/libavcodec/vulkan/apv_decode.comp.glsl new file mode 100644 index 0000000000..7567e8e120 --- /dev/null +++ b/libavcodec/vulkan/apv_decode.comp.glsl @@ -0,0 +1,221 @@ +/* + * Copyright (c) 2025 Lynne <[email protected]> + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#version 460 +#pragma shader_stage(compute) +#extension GL_GOOGLE_include_directive : require + +#include "common.glsl" + +#define APV_MAX_NUM_COMP 4 +#define APV_MAX_TILE_COLS 20 +#define APV_MAX_TILE_ROWS 20 +#define APV_MAX_TILE_COUNT (APV_MAX_TILE_COLS * APV_MAX_TILE_ROWS) +#define APV_MIN_TRANS_COEFF -32768 +#define APV_MAX_TRANS_COEFF 32767 +#define APV_VLC_LUT_BITS 9 +#define APV_VLC_LUT_SIZE (1 << APV_VLC_LUT_BITS) +#define APV_TR_SIZE 8 +#define APV_BLK_COEFFS (APV_TR_SIZE * APV_TR_SIZE) +#define APV_MB_SIZE (ivec2(16, 16)) + +layout (set = 0, binding = 0) uniform writeonly uimage2D dst[]; +layout (set = 0, binding = 1, scalar) readonly buffer frame_data_buf { + uvec2 tile_offset[APV_MAX_NUM_COMP * APV_MAX_TILE_COUNT]; + uint8_t q_matrix[APV_MAX_NUM_COMP][8][8]; + uint8_t tile_qp[APV_MAX_NUM_COMP * 
APV_MAX_TILE_COUNT]; + uint16_t tile_col[APV_MAX_TILE_COLS + 1]; + uint16_t tile_row[APV_MAX_TILE_ROWS + 1]; +}; + +struct SingleCLUTEntry { + uint16_t result; + uint8_t consume; + uint8_t more; +}; + +layout (set = 1, binding = 0, scalar) readonly uniform vlc_tab_buf { + SingleCLUTEntry single_lut[6][APV_VLC_LUT_SIZE]; +}; + +layout (push_constant, scalar) uniform pushConstants { + u8buf tile_data; + ivec2 tile_count; + ivec2 log2_chroma_sub; + int components; + int bit_depth; +}; + +GetBitContext gb; + +int apv_read_vlc(int k_param) +{ + uint next_bits = show_bits(gb, APV_VLC_LUT_BITS); + SingleCLUTEntry ent = single_lut[k_param][next_bits]; + + if (ent.more != 0) { + skip_bits(gb, ent.consume); + + next_bits = show_bits(gb, 16); + int leading_zeroes = 15 - findMSB(next_bits); + + if (leading_zeroes == 0) + return APV_MAX_TRANS_COEFF + 1; + + skip_bits(gb, leading_zeroes + 1); + + return (2 << k_param) + + ((1 << leading_zeroes) - 1) * (1 << k_param) + + int(get_bits(gb, leading_zeroes + k_param)); + } + + skip_bits(gb, ent.consume); + return int(ent.result); +} + +int prev_dc; +int prev_k_dc; +int prev_1st_ac_level; + +const ivec2 zigzag[64] = { + ivec2(0, 0), ivec2(1, 0), ivec2(0, 1), ivec2(0, 2), + ivec2(1, 1), ivec2(2, 0), ivec2(3, 0), ivec2(2, 1), + ivec2(1, 2), ivec2(0, 3), ivec2(0, 4), ivec2(1, 3), + ivec2(2, 2), ivec2(3, 1), ivec2(4, 0), ivec2(5, 0), + ivec2(4, 1), ivec2(3, 2), ivec2(2, 3), ivec2(1, 4), + ivec2(0, 5), ivec2(0, 6), ivec2(1, 5), ivec2(2, 4), + ivec2(3, 3), ivec2(4, 2), ivec2(5, 1), ivec2(6, 0), + ivec2(7, 0), ivec2(6, 1), ivec2(5, 2), ivec2(4, 3), + ivec2(3, 4), ivec2(2, 5), ivec2(1, 6), ivec2(0, 7), + ivec2(1, 7), ivec2(2, 6), ivec2(3, 5), ivec2(4, 4), + ivec2(5, 3), ivec2(6, 2), ivec2(7, 1), ivec2(7, 2), + ivec2(6, 3), ivec2(5, 4), ivec2(4, 5), ivec2(3, 6), + ivec2(2, 7), ivec2(3, 7), ivec2(4, 6), ivec2(5, 5), + ivec2(6, 4), ivec2(7, 3), ivec2(7, 4), ivec2(6, 5), + ivec2(5, 6), ivec2(4, 7), ivec2(5, 7), ivec2(6, 6), + ivec2(7, 5), 
ivec2(7, 6), ivec2(6, 7), ivec2(7, 7), +}; + +void decode_block(ivec2 pos, uint comp) +{ + int dc_coeff; + int abs_diff = apv_read_vlc(prev_k_dc); + + if (abs_diff != 0) { + if (get_bit(gb)) + dc_coeff = prev_dc - abs_diff; + else + dc_coeff = prev_dc + abs_diff; + } else { + dc_coeff = prev_dc; + } + + if (dc_coeff < APV_MIN_TRANS_COEFF || + dc_coeff > APV_MAX_TRANS_COEFF) + return; + + imageStore(dst[comp], pos, uvec4(uint(dc_coeff) & 0xFFFFu)); + prev_dc = dc_coeff; + prev_k_dc = min(abs_diff >> 1, 5); + + /* ACs */ + int scan_pos = 1; + int first_ac = 1; + int prev_level = prev_1st_ac_level; + int prev_run = 0; + + do { + int coeff_zero_run; + + int k_param = clamp(prev_run >> 2, 0, 2); + coeff_zero_run = apv_read_vlc(k_param); + + if (coeff_zero_run > APV_BLK_COEFFS - scan_pos) + return; + + /* image was already pre-cleared to all zeroes */ + scan_pos += coeff_zero_run; + prev_run = coeff_zero_run; + + if (scan_pos < APV_BLK_COEFFS) { + int abs_ac_coeff_minus1; + int level; + + k_param = clamp(prev_level >> 2, 0, 4); + abs_ac_coeff_minus1 = apv_read_vlc(k_param); + bool sign_ac_coeff = get_bit(gb); + + if (sign_ac_coeff) + level = -abs_ac_coeff_minus1 - 1; + else + level = abs_ac_coeff_minus1 + 1; + + if (level < APV_MIN_TRANS_COEFF || level > APV_MAX_TRANS_COEFF) + return; + + imageStore(dst[comp], pos + zigzag[scan_pos], uvec4(uint(level) & 0xFFFFu)); + + prev_level = abs_ac_coeff_minus1 + 1; + if (first_ac != 0) { + prev_1st_ac_level = prev_level; + first_ac = 0; + } + + ++scan_pos; + } + } while (scan_pos < APV_BLK_COEFFS); +} + +void main(void) +{ + const ivec2 tile_pos = ivec2(gl_WorkGroupID.xy); + const uint comp_idx = uint(gl_WorkGroupID.z); + + /* EC state */ + prev_dc = 0; + prev_k_dc = 5; + prev_1st_ac_level = 0; + + const int num_tiles = tile_count.x * tile_count.y; + const int tile_idx = tile_pos.y * tile_count.x + tile_pos.x; + const uvec2 tile_bs = tile_offset[int(comp_idx) * num_tiles + tile_idx]; + init_get_bits(gb, u8buf(tile_data + 
tile_bs.x), int(tile_bs.y)); + + ivec2 sub_shift = comp_idx == 0 ? ivec2(0) : log2_chroma_sub; + ivec2 tile_start = ivec2(tile_col[tile_pos.x], tile_row[tile_pos.y]); + ivec2 tile_dim = ivec2(tile_col[tile_pos.x + 1], + tile_row[tile_pos.y + 1]) - tile_start; + ivec2 tile_mb_dim = tile_dim / APV_MB_SIZE; + ivec2 blk_mb_dim = ivec2(2, 2) >> sub_shift; + + ivec2 mb, blk; + for (mb.y = 0; mb.y < tile_mb_dim.y; mb.y++) { + for (mb.x = 0; mb.x < tile_mb_dim.x; mb.x++) { + for (blk.y = 0; blk.y < blk_mb_dim.y; blk.y++) { + for (blk.x = 0; blk.x < blk_mb_dim.x; blk.x++) { + ivec2 pos = (APV_MB_SIZE*mb + + APV_TR_SIZE*blk + tile_start) >> sub_shift; + + decode_block(pos, comp_idx); + } + } + } + } +} diff --git a/libavcodec/vulkan/apv_idct.comp.glsl b/libavcodec/vulkan/apv_idct.comp.glsl new file mode 100644 index 0000000000..2b56bc02fc --- /dev/null +++ b/libavcodec/vulkan/apv_idct.comp.glsl @@ -0,0 +1,117 @@ +/* + * Copyright (c) 2025 Lynne <[email protected]> + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#version 460 +#pragma shader_stage(compute) +#extension GL_GOOGLE_include_directive : require + +#include "common.glsl" +#include "dct.glsl" + +#define APV_MAX_NUM_COMP 4 +#define APV_MAX_TILE_COLS 20 +#define APV_MAX_TILE_ROWS 20 +#define APV_MAX_TILE_COUNT (APV_MAX_TILE_COLS * APV_MAX_TILE_ROWS) +#define APV_TR_SIZE 8 +#define APV_BLOCKS_PER_WG 8 + +layout (set = 0, binding = 0) uniform uimage2D dst[]; +layout (set = 0, binding = 1, scalar) readonly buffer frame_data_buf { + uvec2 tile_offset[APV_MAX_NUM_COMP * APV_MAX_TILE_COUNT]; + uint8_t q_matrix[APV_MAX_NUM_COMP][8][8]; + uint8_t tile_qp[APV_MAX_NUM_COMP * APV_MAX_TILE_COUNT]; + uint16_t tile_col[APV_MAX_TILE_COLS + 1]; + uint16_t tile_row[APV_MAX_TILE_ROWS + 1]; +}; + +layout (push_constant, scalar) uniform pushConstants { + u8buf tile_data; + ivec2 tile_count; + ivec2 log2_chroma_sub; + int components; + int bit_depth; +}; + +const int apv_level_scale[6] = { 40, 45, 51, 57, 64, 71 }; + +void main(void) +{ + const uvec3 wgid = gl_WorkGroupID; + const uint comp = wgid.z; + + const uvec3 lid = gl_LocalInvocationID; + const uint block = (lid.y << 2) | (lid.x >> 3); /* 0..7 block in chunk */ + const uint col = lid.x & 0x7u; /* 0..7 column in block */ + + /* one workgroup handles eight horizontally neighbouring blocks */ + const int blk_x = int(wgid.x) * APV_BLOCKS_PER_WG + int(block); + const int blk_y = int(wgid.y); + const ivec2 pos = ivec2(blk_x, blk_y) * APV_TR_SIZE; + + /* note: some oddness happens on tile-boundaries */ + const ivec2 sub_shift = (comp == 0u) ? 
ivec2(0) : log2_chroma_sub; + const ivec2 luma_pos = pos << sub_shift; + + /* figure out the tile position */ + int tx = 0; + while (tx + 1 < tile_count.x && int(tile_col[tx + 1]) <= luma_pos.x) + tx++; + int ty = 0; + while (ty + 1 < tile_count.y && int(tile_row[ty + 1]) <= luma_pos.y) + ty++; + + const int tile_idx = ty * tile_count.x + tx; + const int qp = int(tile_qp[int(comp) * APV_MAX_TILE_COUNT + tile_idx]); + const int level_scale = apv_level_scale[qp % 6]; + const int qp_shift = qp / 6; + + const int half_range = 1 << (bit_depth - 1); + const int max_val = (1 << bit_depth) - 1; + const float fact = float(half_range); + const float norm = 1.0f / (1024.0f * fact); /* DCT normalization const */ + + [[unroll]] + for (uint y = 0u; y < 8u; y++) { + /* load */ + int raw = int(imageLoad(dst[comp], pos + ivec2(col, y)).x); + int coeff = sign_extend(raw, 16); + /* dequant + norm */ + int qs = level_scale * int(q_matrix[comp][col][y]) * (1 << qp_shift); + float v = float(coeff * qs) * norm; + /* scale */ + blocks[block][y * 9u + col] = v * idct_scale[y * 8u + col]; + } + barrier(); + + idct8(block, col, 9); + barrier(); + + blocks[block][col * 9u] += 1.0f; + + idct8(block, col * 9u, 1); + barrier(); + + [[unroll]] + for (int y = 0; y < 8; y++) { + float v = round(blocks[block][y * 9u + col] * fact); + imageStore(dst[comp], pos + ivec2(col, y), + uvec4(uint(clamp(int(v), 0, max_val)))); + } +} diff --git a/libavcodec/vulkan_apv.c b/libavcodec/vulkan_apv.c new file mode 100644 index 0000000000..dec46e2850 --- /dev/null +++ b/libavcodec/vulkan_apv.c @@ -0,0 +1,524 @@ +/* + * Copyright (c) 2025 Lynne <[email protected]> + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. 
+ * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "vulkan_decode.h" +#include "hwaccel_internal.h" + +#include "apv_decode.h" +#include "libavutil/mem.h" + +extern const unsigned char ff_apv_decode_comp_spv_data[]; +extern const unsigned int ff_apv_decode_comp_spv_len; + +extern const unsigned char ff_apv_idct_comp_spv_data[]; +extern const unsigned int ff_apv_idct_comp_spv_len; + +const FFVulkanDecodeDescriptor ff_vk_dec_apv_desc = { + .codec_id = AV_CODEC_ID_APV, + .queue_flags = VK_QUEUE_COMPUTE_BIT, +}; + +typedef struct APVVulkanDecodePicture { + FFVulkanDecodePicture vp; + + AVBufferRef *frame_data_buf; + uint32_t *frame_data; + int tile_num; +} APVVulkanDecodePicture; + +typedef struct APVVulkanDecodeContext { + FFVulkanShader decode; + FFVulkanShader idct; + + FFVkBuffer clut_buf; + AVBufferPool *frame_data_pool; +} APVVulkanDecodeContext; + +typedef struct DecodePushData { + VkDeviceAddress tile_data; + int tile_count[2]; + int log2_chroma_sub[2]; + int components; + int bit_depth; +} DecodePushData; + +static int vk_apv_start_frame(AVCodecContext *avctx, + const AVBufferRef *buffer_ref, + av_unused const uint8_t *buffer, + av_unused uint32_t size) +{ + int err; + APVDecodeContext *apv = avctx->priv_data; + FFVulkanDecodeContext *dec = avctx->internal->hwaccel_priv_data; + FFVulkanDecodeShared *ctx = dec->shared_ctx; + APVVulkanDecodeContext *apvvk = ctx->sd_ctx; + + APVVulkanDecodePicture *apvvp = apv->hwaccel_picture_private; + FFVulkanDecodePicture *vp = &apvvp->vp; + + /* Host map the input tile data if supported */ + 
if (0 && ctx->s.extensions & FF_VK_EXT_EXTERNAL_HOST_MEMORY)
+        ff_vk_host_map_buffer(&ctx->s, &vp->slices_buf, buffer_ref->data,
+                              buffer_ref,
+                              VK_BUFFER_USAGE_STORAGE_BUFFER_BIT |
+                              VK_BUFFER_USAGE_SHADER_DEVICE_ADDRESS_BIT);
+
+    /* Allocate frame data buffer */
+    int fd_size = (2*4*APV_MAX_TILE_COUNT)*APV_MAX_NUM_COMP +
+                  (64 + APV_MAX_TILE_COUNT)*APV_MAX_NUM_COMP +
+                  (APV_MAX_TILE_COLS + 1 + APV_MAX_TILE_ROWS + 1)*2;
+
+    err = ff_vk_get_pooled_buffer(&ctx->s, &apvvk->frame_data_pool,
+                                  &apvvp->frame_data_buf,
+                                  VK_BUFFER_USAGE_STORAGE_BUFFER_BIT |
+                                  VK_BUFFER_USAGE_SHADER_DEVICE_ADDRESS_BIT,
+                                  NULL, fd_size,
+                                  VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT |
+                                  VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT);
+    if (err < 0)
+        return err;
+
+    /* Frame data */
+    FFVkBuffer *frame_data = (FFVkBuffer *)apvvp->frame_data_buf->data;
+    uint8_t *fd = frame_data->mapped_mem;
+
+    fd += 2*4*APV_MAX_TILE_COUNT*APV_MAX_NUM_COMP; /* Tile offsets go first */
+
+    /* per-component qmatrix and QPs */
+    for (int i = 0; i < APV_MAX_NUM_COMP; i++)
+        memcpy(fd + 64*i,
+               apv->cur_raw_frame->frame_header.quantization_matrix.q_matrix[i],
+               64);
+    fd += 64*APV_MAX_NUM_COMP;
+
+    for (int i = 0; i < APV_MAX_NUM_COMP; i++) {
+        for (int j = 0; j < APV_MAX_TILE_COUNT; j++)
+            fd[j] = apv->cur_raw_frame->tile[j].tile_header.tile_qp[i];
+        fd += APV_MAX_TILE_COUNT;
+    }
+
+    /* tile col/row offset */
+    memcpy(fd, apv->tile_info.col_starts, (APV_MAX_TILE_COLS+1)*2);
+    fd += (APV_MAX_TILE_COLS+1)*2;
+    memcpy(fd, apv->tile_info.row_starts, (APV_MAX_TILE_ROWS+1)*2);
+
+    /* Prepare frame to be used */
+    err = ff_vk_decode_prepare_frame_sdr(dec, apv->output_frame, vp, 1,
+                                         FF_VK_REP_NATIVE, 0);
+    if (err < 0)
+        return err;
+
+    return 0;
+}
+
+/*
+ * Register one tile's bitstream with the current picture.
+ *
+ * For every tile, two 32-bit values are written at the start of the mapped
+ * frame data buffer: the tile's offset, followed by its size in bytes.
+ *
+ * If a slices buffer already exists and has host_ref set, the offset is
+ * the distance of data from the buffer's mapped base and the tile counter is
+ * advanced here. Otherwise the data is handed to ff_vk_decode_add_slice(),
+ * which is given the tile counter and the offset array to update; the entry
+ * for the tile that was just added is then read back at tile_num - 1.
+ *
+ * Returns 0 on success, or the negative error code returned by
+ * ff_vk_decode_add_slice().
+ */
+static int vk_apv_decode_slice(AVCodecContext *avctx,
+                               const uint8_t  *data,
+                               uint32_t        size)
+{
+    APVDecodeContext *apv = avctx->priv_data;
+
+    APVVulkanDecodePicture *apvvp = apv->hwaccel_picture_private;
+    FFVulkanDecodePicture *vp = &apvvp->vp;
+
+    FFVkBuffer *frame_data = (FFVkBuffer *)apvvp->frame_data_buf->data;
+    /* The slices buffer may not exist yet when the first tile arrives */
+    FFVkBuffer *slices_buf = vp->slices_buf ? (FFVkBuffer *)vp->slices_buf->data : NULL;
+
+    if (slices_buf && slices_buf->host_ref) {
+        AV_WN32(frame_data->mapped_mem + (2*apvvp->tile_num + 0)*sizeof(uint32_t),
+                data - slices_buf->mapped_mem);
+        AV_WN32(frame_data->mapped_mem + (2*apvvp->tile_num + 1)*sizeof(uint32_t),
+                size);
+
+        apvvp->tile_num++;
+    } else {
+        int err = ff_vk_decode_add_slice(avctx, vp, data, size, 0,
+                                         &apvvp->tile_num,
+                                         (const uint32_t **)&apvvp->frame_data);
+        if (err < 0)
+            return err;
+
+        AV_WN32(frame_data->mapped_mem + (2*(apvvp->tile_num - 1) + 0)*sizeof(uint32_t),
+                apvvp->frame_data[apvvp->tile_num - 1]);
+        AV_WN32(frame_data->mapped_mem + (2*(apvvp->tile_num - 1) + 1)*sizeof(uint32_t),
+                size);
+    }
+
+    return 0;
+}
+
+/*
+ * Record and submit the GPU work for the current frame.
+ *
+ * The recorded sequence is:
+ *   1. flush the frame data buffer and register the output frame, the
+ *      slices buffer and the frame data buffer as execution dependencies
+ *      (the picture's references to both buffers are cleared afterwards),
+ *   2. clear every image of the output frame to zero,
+ *   3. dispatch the decode shader with tile_cols x tile_rows x nb_components
+ *      workgroups,
+ *   4. dispatch the iDCT shader,
+ * with an image barrier between consecutive steps.
+ *
+ * Returns 0 on success or a negative error code on failure.
+ */
+static int vk_apv_end_frame(AVCodecContext *avctx)
+{
+    int err;
+    APVDecodeContext *apv = avctx->priv_data;
+    const CodedBitstreamAPVContext *apv_cbc = apv->cbc->priv_data;
+    FFVulkanDecodeContext *dec = avctx->internal->hwaccel_priv_data;
+    FFVulkanDecodeShared *ctx = dec->shared_ctx;
+    APVVulkanDecodeContext *apvvk = ctx->sd_ctx;
+    FFVulkanFunctions *vk = &ctx->s.vkfn;
+
+    APVVulkanDecodePicture *apvvp = apv->hwaccel_picture_private;
+    FFVulkanDecodePicture *vp = &apvvp->vp;
+
+    FFVkBuffer *slices_buf = (FFVkBuffer *)vp->slices_buf->data;
+    FFVkBuffer *frame_data_buf = (FFVkBuffer *)apvvp->frame_data_buf->data;
+
+    AVHWFramesContext *hwfc = (AVHWFramesContext *)avctx->hw_frames_ctx->data;
+    enum AVPixelFormat sw_format = hwfc->sw_format;
+    const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(sw_format);
+
+    VkImageMemoryBarrier2 img_bar[8];
+    int nb_img_bar = 0;
+
+    FFVkExecContext *exec = ff_vk_exec_get(&ctx->s, &ctx->exec_pool);
+    ff_vk_exec_start(&ctx->s, exec);
+
+    /* Make sure the buffer is flushed */
+    RET(ff_vk_flush_buffer(&ctx->s, frame_data_buf, 0, frame_data_buf->size, 1));
+
+    /* Prepare deps */
+    RET(ff_vk_exec_add_dep_frame(&ctx->s, exec, apv->output_frame,
+                                 VK_PIPELINE_STAGE_2_ALL_COMMANDS_BIT,
+                                 VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT));
+
+    err = ff_vk_exec_mirror_sem_value(&ctx->s, exec, &vp->sem, &vp->sem_value,
+                                      apv->output_frame);
+    if (err < 0)
+        return err;
+
+    /* The local slices_buf/frame_data_buf pointers stay in use below; only
+     * the picture's references are cleared once they are registered as deps */
+    RET(ff_vk_exec_add_dep_buf(&ctx->s, exec, &vp->slices_buf, 1, 0));
+    vp->slices_buf = NULL;
+    RET(ff_vk_exec_add_dep_buf(&ctx->s, exec, &apvvp->frame_data_buf, 1, 0));
+    apvvp->frame_data_buf = NULL;
+
+    AVVkFrame *vkf = (AVVkFrame *)apv->output_frame->data[0];
+    vkf->layout[0] = VK_IMAGE_LAYOUT_UNDEFINED;
+    vkf->access[0] = VK_ACCESS_2_NONE;
+
+    ff_vk_frame_barrier(&ctx->s, exec, apv->output_frame,
+                        img_bar, &nb_img_bar,
+                        VK_PIPELINE_STAGE_2_ALL_COMMANDS_BIT,
+                        VK_PIPELINE_STAGE_2_CLEAR_BIT,
+                        VK_ACCESS_2_TRANSFER_WRITE_BIT,
+                        VK_IMAGE_LAYOUT_GENERAL,
+                        VK_QUEUE_FAMILY_IGNORED);
+    vk->CmdPipelineBarrier2(exec->buf, &(VkDependencyInfo) {
+        .sType = VK_STRUCTURE_TYPE_DEPENDENCY_INFO,
+        .pImageMemoryBarriers = img_bar,
+        .imageMemoryBarrierCount = nb_img_bar,
+    });
+    nb_img_bar = 0;
+
+    /* Zero frame */
+    for (int i = 0; i < ff_vk_count_images(vkf); i++)
+        vk->CmdClearColorImage(exec->buf, vkf->img[i],
+                               VK_IMAGE_LAYOUT_GENERAL,
+                               &((VkClearColorValue) { 0 }),
+                               1, &((VkImageSubresourceRange) {
+                                   .aspectMask = VK_IMAGE_ASPECT_COLOR_BIT,
+                                   .levelCount = 1,
+                                   .layerCount = 1,
+                               }));
+
+    /* Wait for the frame to get zeroed out before continuing */
+    ff_vk_frame_barrier(&ctx->s, exec, apv->output_frame, img_bar, &nb_img_bar,
+                        VK_PIPELINE_STAGE_2_CLEAR_BIT,
+                        VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT,
+                        VK_ACCESS_2_SHADER_STORAGE_WRITE_BIT,
+                        VK_IMAGE_LAYOUT_GENERAL,
+                        VK_QUEUE_FAMILY_IGNORED);
+    vk->CmdPipelineBarrier2(exec->buf, &(VkDependencyInfo) {
+        .sType = VK_STRUCTURE_TYPE_DEPENDENCY_INFO,
+        .pImageMemoryBarriers = img_bar,
+        .imageMemoryBarrierCount = nb_img_bar,
+    });
+    nb_img_bar = 0;
+
+    /* Setup push data */
+    DecodePushData pd = (DecodePushData) {
+        .tile_data = slices_buf->address,
+        .tile_count = { apv->tile_info.tile_cols, apv->tile_info.tile_rows },
+        .log2_chroma_sub = { desc->log2_chroma_w, desc->log2_chroma_h },
+        .components = desc->nb_components,
+        .bit_depth = apv_cbc->bit_depth,
+    };
+
+    /* Decoding */
+    ff_vk_shader_update_img_array(&ctx->s, exec, &apvvk->decode,
+                                  apv->output_frame, vp->view.out,
+                                  0, 0,
+                                  VK_IMAGE_LAYOUT_GENERAL,
+                                  VK_NULL_HANDLE);
+    ff_vk_shader_update_desc_buffer(&ctx->s, exec, &apvvk->decode,
+                                    0, 1, 0,
+                                    frame_data_buf,
+                                    0, frame_data_buf->size,
+                                    VK_FORMAT_UNDEFINED);
+
+    ff_vk_exec_bind_shader(&ctx->s, exec, &apvvk->decode);
+    ff_vk_shader_update_push_const(&ctx->s, exec, &apvvk->decode,
+                                   VK_SHADER_STAGE_COMPUTE_BIT,
+                                   0, sizeof(pd), &pd);
+
+    vk->CmdDispatch(exec->buf,
+                    apv->tile_info.tile_cols, apv->tile_info.tile_rows,
+                    desc->nb_components);
+
+    /* Wait for all decoding to finish */
+    ff_vk_frame_barrier(&ctx->s, exec, apv->output_frame, img_bar, &nb_img_bar,
+                        VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT,
+                        VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT,
+                        VK_ACCESS_2_SHADER_STORAGE_READ_BIT |
+                        VK_ACCESS_2_SHADER_STORAGE_WRITE_BIT,
+                        VK_IMAGE_LAYOUT_GENERAL,
+                        VK_QUEUE_FAMILY_IGNORED);
+    vk->CmdPipelineBarrier2(exec->buf, &(VkDependencyInfo) {
+        .sType = VK_STRUCTURE_TYPE_DEPENDENCY_INFO,
+        .pImageMemoryBarriers = img_bar,
+        .imageMemoryBarrierCount = nb_img_bar,
+    });
+    nb_img_bar = 0;
+
+    /* iDCT */
+    ff_vk_shader_update_img_array(&ctx->s, exec, &apvvk->idct,
+                                  apv->output_frame, vp->view.out,
+                                  0, 0,
+                                  VK_IMAGE_LAYOUT_GENERAL,
+                                  VK_NULL_HANDLE);
+    ff_vk_shader_update_desc_buffer(&ctx->s, exec, &apvvk->idct,
+                                    0, 1, 0,
+                                    frame_data_buf,
+                                    0, frame_data_buf->size,
+                                    VK_FORMAT_UNDEFINED);
+
+    ff_vk_exec_bind_shader(&ctx->s, exec, &apvvk->idct);
+    ff_vk_shader_update_push_const(&ctx->s, exec, &apvvk->idct,
+                                   VK_SHADER_STAGE_COMPUTE_BIT,
+                                   0, sizeof(pd), &pd);
+
+    /* one workgroup per group of 8 horizontally adjacent transform blocks,
+     * in the luma basis coords, in case a block is OOB writes/reads are ignored */
+    int idct_cx = 0, idct_by = 0;
+    for (int comp = 0; comp < desc->nb_components; comp++) {
+        int sw = (comp == 0) ? 0 : desc->log2_chroma_w;
+        int sh = (comp == 0) ? 0 : desc->log2_chroma_h;
+        int bx = (avctx->coded_width  + (1 << (3 + sw)) - 1) >> (3 + sw);
+        int by = (avctx->coded_height + (1 << (3 + sh)) - 1) >> (3 + sh);
+        idct_cx = FFMAX(idct_cx, (bx + 7) >> 3);
+        idct_by = FFMAX(idct_by, by);
+    }
+    vk->CmdDispatch(exec->buf, idct_cx, idct_by, desc->nb_components);
+
+    err = ff_vk_exec_submit(&ctx->s, exec);
+    if (err < 0)
+        return err;
+
+    return 0;
+
+fail:
+    /* Reached only through RET(), which has stored the failing call's
+     * error code in err; hand it to the caller rather than reporting
+     * success for a frame that was never submitted */
+    return err;
+}
+
+/*
+ * Set up the compute shader that performs the entropy decoding.
+ *
+ * Descriptor set 0 holds the destination storage images (one element per
+ * plane of the frames context's sw_format) and the frame data storage
+ * buffer. Descriptor set 1 holds the VLC table as a uniform buffer. The
+ * shader is loaded with a 1x1x1 workgroup size, linked from the embedded
+ * ff_apv_decode_comp_spv_data SPIR-V blob and registered with the execution
+ * pool.
+ *
+ * Returns a negative error code if linking or registering fails.
+ */
+static int init_decode_shader(AVCodecContext *avctx, FFVulkanContext *s,
+                              FFVkExecPool *pool, FFVulkanShader *shd)
+{
+    int err;
+    AVHWFramesContext *dec_frames_ctx;
+    dec_frames_ctx = (AVHWFramesContext *)avctx->hw_frames_ctx->data;
+
+    ff_vk_shader_load(shd, VK_SHADER_STAGE_COMPUTE_BIT, NULL,
+                      (uint32_t []) { 1, 1, 1 }, 0);
+    ff_vk_shader_add_push_const(shd, 0, sizeof(DecodePushData),
+                                VK_SHADER_STAGE_COMPUTE_BIT);
+
+    const FFVulkanDescriptorSetBinding desc_set[] = {
+        {
+            .name = "dst",
+            .type = VK_DESCRIPTOR_TYPE_STORAGE_IMAGE,
+            .elems = av_pix_fmt_count_planes(dec_frames_ctx->sw_format),
+            .stages = VK_SHADER_STAGE_COMPUTE_BIT,
+        },
+        {
+            .name = "frame_data_buf",
+            .type = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
+            .stages = VK_SHADER_STAGE_COMPUTE_BIT,
+        }
+    };
+    ff_vk_shader_add_descriptor_set(s, shd, desc_set, 2, 0, 0);
+
+    const FFVulkanDescriptorSetBinding vlc_desc = {
+        .name = "vlc_tab_buf",
+        .type = VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER,
+        .stages = VK_SHADER_STAGE_COMPUTE_BIT,
+    };
+    ff_vk_shader_add_descriptor_set(s, shd, &vlc_desc, 1, 1, 0);
+
+    RET(ff_vk_shader_link(s, shd,
+                          ff_apv_decode_comp_spv_data,
+                          ff_apv_decode_comp_spv_len, "main"));
+
+    RET(ff_vk_shader_register_exec(s, pool, shd));
+
+fail:
+    return err;
+}
+
+/*
+ * Set up the compute shader that performs the inverse DCT.
+ *
+ * Specialization constants carry the number of blocks per workgroup (8,
+ * constant ID 16) and 64 float scale factors (constant IDs 18..81), each
+ * being the product of the row and column entries of idct_8_scales. The
+ * shader is loaded with a 32x2x1 workgroup size and shares the layout of
+ * descriptor set 0 (destination images plus frame data buffer) with the
+ * decode shader.
+ *
+ * Returns a negative error code if linking or registering fails.
+ */
+static int init_idct_shader(AVCodecContext *avctx, FFVulkanContext *s,
+                            FFVkExecPool *pool, FFVulkanShader *shd)
+{
+    int err;
+    AVHWFramesContext *dec_frames_ctx;
+    dec_frames_ctx = (AVHWFramesContext *)avctx->hw_frames_ctx->data;
+
+    SPEC_LIST_CREATE(sl, 1 + 64, (1 + 64)*sizeof(uint32_t))
+    SPEC_LIST_ADD(sl, 16, 32, 8); /* nb_blocks per workgroup */
+
+    const double idct_8_scales[8] = {
+        cos(4.0*M_PI/16.0) / 2.0, cos(1.0*M_PI/16.0) / 2.0,
+        cos(2.0*M_PI/16.0) / 2.0, cos(3.0*M_PI/16.0) / 2.0,
+        cos(4.0*M_PI/16.0) / 2.0, cos(5.0*M_PI/16.0) / 2.0,
+        cos(6.0*M_PI/16.0) / 2.0, cos(7.0*M_PI/16.0) / 2.0,
+    };
+    /* i >> 3 selects the row factor, i & 7 the column factor */
+    for (int i = 0; i < 64; i++)
+        SPEC_LIST_ADD(sl, 18 + i, 32,
+                      av_float2int(idct_8_scales[i >> 3]*idct_8_scales[i & 7]));
+
+    ff_vk_shader_load(shd, VK_SHADER_STAGE_COMPUTE_BIT, sl,
+                      (uint32_t []) { 32, 2, 1 }, 0);
+    ff_vk_shader_add_push_const(shd, 0, sizeof(DecodePushData),
+                                VK_SHADER_STAGE_COMPUTE_BIT);
+
+    FFVulkanDescriptorSetBinding desc_set[] = {
+        {
+            .name = "dst",
+            .type = VK_DESCRIPTOR_TYPE_STORAGE_IMAGE,
+            .stages = VK_SHADER_STAGE_COMPUTE_BIT,
+            .elems = av_pix_fmt_count_planes(dec_frames_ctx->sw_format),
+        },
+        {
+            .name = "frame_data_buf",
+            .type = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
+            .stages = VK_SHADER_STAGE_COMPUTE_BIT,
+        },
+    };
+    ff_vk_shader_add_descriptor_set(s, shd, desc_set, 2, 0, 0);
+
+    RET(ff_vk_shader_link(s, shd,
+                          ff_apv_idct_comp_spv_data,
+                          ff_apv_idct_comp_spv_len, "main"));
+
+    RET(ff_vk_shader_register_exec(s, pool, shd));
+
+fail:
+    return err;
+}
+
+/*
+ * Release everything owned by the APV-specific decoder state: both
+ * shaders, the frame data buffer pool, the VLC table buffer and finally the
+ * state structure itself. Installed as ctx->sd_ctx_free by
+ * vk_decode_apv_init().
+ */
+static void vk_decode_apv_uninit(FFVulkanDecodeShared *ctx)
+{
+    APVVulkanDecodeContext *apvvk = ctx->sd_ctx;
+
+    ff_vk_shader_free(&ctx->s, &apvvk->decode);
+    ff_vk_shader_free(&ctx->s, &apvvk->idct);
+
+    av_buffer_pool_uninit(&apvvk->frame_data_pool);
+
+    ff_vk_free_buf(&ctx->s, &apvvk->clut_buf);
+
+    /* Free through the owning field so that it is reset to NULL as well,
+     * instead of only clearing the local copy of the pointer */
+    av_freep(&ctx->sd_ctx);
+}
+
+/*
+ * Hwaccel init callback: initialise the common Vulkan decode state,
+ * allocate the APV-specific state, build both shaders, and upload the
+ * single-symbol VLC lookup table into a host-visible uniform buffer that is
+ * then bound to descriptor set 1, binding 0 of the decode shader.
+ *
+ * Returns a negative error code on failure. Once sd_ctx_free has been
+ * installed, partially initialised state is released through
+ * vk_decode_apv_uninit().
+ */
+static int vk_decode_apv_init(AVCodecContext *avctx)
+{
+    int err;
+    FFVulkanDecodeContext *dec = avctx->internal->hwaccel_priv_data;
+
+    err = ff_vk_decode_init(avctx);
+    if (err < 0)
+        return err;
+
+    FFVulkanDecodeShared *ctx = dec->shared_ctx;
+    APVVulkanDecodeContext *apvvk = ctx->sd_ctx = av_mallocz(sizeof(*apvvk));
+    if (!apvvk) {
+        err = AVERROR(ENOMEM);
+        goto fail;
+    }
+
+    ctx->sd_ctx_free = &vk_decode_apv_uninit;
+
+    RET(init_decode_shader(avctx, &ctx->s, &ctx->exec_pool,
+                           &apvvk->decode));
+
+    RET(init_idct_shader(avctx, &ctx->s, &ctx->exec_pool,
+                         &apvvk->idct));
+
+    /* CLUT for decoding */
+    size_t buf_size = 6*APV_VLC_LUT_SIZE*4;
+    APVSingleVLCLUTEntry *buf_mapped;
+    RET(ff_vk_create_buf(&ctx->s, &apvvk->clut_buf,
+                         buf_size,
+                         NULL, NULL,
+                         VK_BUFFER_USAGE_SHADER_DEVICE_ADDRESS_BIT |
+                         VK_BUFFER_USAGE_UNIFORM_BUFFER_BIT,
+                         VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT |
+                         VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT));
+    RET(ff_vk_map_buffer(&ctx->s, &apvvk->clut_buf, (void *)&buf_mapped, 0));
+
+    /* NOTE(review): buf_size is assumed to equal
+     * sizeof(ff_apv_decode_lut.single_lut) - confirm against apv_decode.h */
+    memcpy(buf_mapped, &ff_apv_decode_lut.single_lut,
+           sizeof(ff_apv_decode_lut.single_lut));
+
+    RET(ff_vk_unmap_buffer(&ctx->s, &apvvk->clut_buf, 1));
+
+    ff_vk_shader_update_desc_buffer(&ctx->s, &ctx->exec_pool.contexts[0],
+                                    &apvvk->decode,
+                                    1, 0, 0,
+                                    &apvvk->clut_buf,
+                                    0, buf_size,
+                                    VK_FORMAT_UNDEFINED);
+
+fail:
+    return err;
+}
+
+/*
+ * Per-frame private data destructor: releases the common Vulkan decode
+ * picture state and drops the reference to the frame data buffer.
+ */
+static void vk_apv_free_frame_priv(AVRefStructOpaque _hwctx, void *data)
+{
+    AVHWDeviceContext *dev_ctx = _hwctx.nc;
+
+    APVVulkanDecodePicture *apvvp = data;
+    FFVulkanDecodePicture *vp = &apvvp->vp;
+
+    ff_vk_decode_free_frame(dev_ctx, vp);
+
+    av_buffer_unref(&apvvp->frame_data_buf);
+}
+
+/* Hwaccel descriptor wiring the callbacks above into the APV decoder */
+const FFHWAccel ff_apv_vulkan_hwaccel = {
+    .p.name                = "apv_vulkan",
+    .p.type                = AVMEDIA_TYPE_VIDEO,
+    .p.id                  = AV_CODEC_ID_APV,
+    .p.pix_fmt             = AV_PIX_FMT_VULKAN,
+    .start_frame           = &vk_apv_start_frame,
+    .decode_slice          = &vk_apv_decode_slice,
+    .end_frame             = &vk_apv_end_frame,
+    .free_frame_priv       = &vk_apv_free_frame_priv,
+    .frame_priv_data_size  = sizeof(APVVulkanDecodePicture),
+    .init                  = &vk_decode_apv_init,
+    .update_thread_context = &ff_vk_update_thread_context,
+    .uninit                = &ff_vk_decode_uninit,
+    .frame_params          = &ff_vk_frame_params,
+    .priv_data_size        = sizeof(FFVulkanDecodeContext),
+    .caps_internal         = HWACCEL_CAP_ASYNC_SAFE | HWACCEL_CAP_THREAD_SAFE,
+};
diff --git
a/libavcodec/vulkan_decode.c b/libavcodec/vulkan_decode.c index e87436b881..8c312221a3 100644 --- a/libavcodec/vulkan_decode.c +++ b/libavcodec/vulkan_decode.c @@ -28,6 +28,7 @@ #define DECODER_IS_SDR(codec_id) \ (((codec_id) == AV_CODEC_ID_FFV1) || \ ((codec_id) == AV_CODEC_ID_DPX) || \ + ((codec_id) == AV_CODEC_ID_APV) || \ ((codec_id) == AV_CODEC_ID_PRORES_RAW) || \ ((codec_id) == AV_CODEC_ID_PRORES)) @@ -55,6 +56,9 @@ extern const FFVulkanDecodeDescriptor ff_vk_dec_prores_desc; #if CONFIG_DPX_VULKAN_HWACCEL extern const FFVulkanDecodeDescriptor ff_vk_dec_dpx_desc; #endif +#if CONFIG_APV_VULKAN_HWACCEL +extern const FFVulkanDecodeDescriptor ff_vk_dec_apv_desc; +#endif static const FFVulkanDecodeDescriptor *dec_descs[] = { #if CONFIG_H264_VULKAN_HWACCEL @@ -81,6 +85,9 @@ static const FFVulkanDecodeDescriptor *dec_descs[] = { #if CONFIG_DPX_VULKAN_HWACCEL &ff_vk_dec_dpx_desc, #endif +#if CONFIG_APV_VULKAN_HWACCEL + &ff_vk_dec_apv_desc, +#endif }; typedef struct FFVulkanDecodeProfileData { diff --git a/libavcodec/vulkan_prores_raw.c b/libavcodec/vulkan_prores_raw.c index 392b74a863..aae455dc42 100644 --- a/libavcodec/vulkan_prores_raw.c +++ b/libavcodec/vulkan_prores_raw.c @@ -343,7 +343,6 @@ static int init_idct_shader(AVCodecContext *avctx, FFVulkanContext *s, ff_vk_shader_load(shd, VK_SHADER_STAGE_COMPUTE_BIT, sl, (uint32_t []) { 8, nb_blocks, 4 }, 0); - ff_vk_shader_add_push_const(shd, 0, sizeof(DecodePushData), VK_SHADER_STAGE_COMPUTE_BIT); add_desc(avctx, s, shd); -- 2.52.0 _______________________________________________ ffmpeg-devel mailing list -- [email protected] To unsubscribe send an email to [email protected]
