PR #23559 opened by Niklas Haas (haasn) URL: https://code.ffmpeg.org/FFmpeg/FFmpeg/pulls/23559 Patch URL: https://code.ffmpeg.org/FFmpeg/FFmpeg/pulls/23559.patch
This fixes downscaling case by implementing proper anti-aliasing. >From ae542f0c67b1f9fe9ae687d5927a72b3eeb6ed1e Mon Sep 17 00:00:00 2001 From: Niklas Haas <[email protected]> Date: Mon, 22 Jun 2026 16:20:14 +0200 Subject: [PATCH 1/4] avfilter/scale_filters: add internal copy of libswscale/filters.c Useful for GPU-based filters, which may also need to compute filter weights. Since we cannot cross-link to internal functions, we need to recompile this helper inside libavfilter.c. Signed-off-by: Niklas Haas <[email protected]> --- libavfilter/scale_filters.c | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) create mode 100644 libavfilter/scale_filters.c diff --git a/libavfilter/scale_filters.c b/libavfilter/scale_filters.c new file mode 100644 index 0000000000..347d073210 --- /dev/null +++ b/libavfilter/scale_filters.c @@ -0,0 +1,19 @@ +/* + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "libswscale/filters.c" -- 2.52.0 >From 12fc0251ba6fe1728fd3f7ca817d5c9b05d510d3 Mon Sep 17 00:00:00 2001 From: Niklas Haas <[email protected]> Date: Mon, 22 Jun 2026 18:43:59 +0200 Subject: [PATCH 2/4] avfilter/vf_scale_cuda: generalize kernel signature to accept weights Ignored for now by the existing fixed function kernels. Signed-off-by: Niklas Haas <[email protected]> --- libavfilter/vf_scale_cuda.cu | 27 ++++++++++++++++++++------- libavfilter/vf_scale_cuda.h | 5 +++++ 2 files changed, 25 insertions(+), 7 deletions(-) diff --git a/libavfilter/vf_scale_cuda.cu b/libavfilter/vf_scale_cuda.cu index ca7955344b..032b0e9c48 100644 --- a/libavfilter/vf_scale_cuda.cu +++ b/libavfilter/vf_scale_cuda.cu @@ -28,7 +28,9 @@ using subsample_function_t = T (*)(cudaTextureObject_t tex, int xo, int yo, int dst_width, int dst_height, int src_left, int src_top, int src_width, int src_height, - int bit_depth, float param); + int bit_depth, float param, + const float *weights, const int *offsets, + int filter_size); // --- CONVERSION LOGIC --- @@ -90,14 +92,16 @@ static inline __device__ ushort conv_16to10pl(ushort in) __device__ static inline void N(cudaTextureObject_t src_tex[4], T *dst[4], int xo, int yo, \ int dst_width, int dst_height, int dst_pitch, \ int src_left, int src_top, int src_width, int src_height, \ - float param, int mpeg_range) + float param, int mpeg_range, \ + const float *weights, const int *offsets, int filter_size) #define SUB_F(m, plane) \ subsample_func_##m(src_tex[plane], xo, yo, \ dst_width, dst_height, \ src_left, src_top, \ src_width, src_height, \ - in_bit_depth, param) + in_bit_depth, param, \ + weights, offsets, filter_size) // FFmpeg passes pitch in bytes, CUDA uses potentially larger types #define FIXED_PITCH \ @@ -1095,7 +1099,9 @@ __device__ static inline T Subsample_Nearest(cudaTextureObject_t tex, int dst_width, int dst_height, int src_left, int src_top, int src_width, int src_height, - int bit_depth, float param) + int bit_depth, float param, + const float *weights, const int *offsets, + int filter_size) { float hscale = (float)src_width / (float)dst_width; float vscale = (float)src_height / (float)dst_height; @@ -1111,7 +1117,9 @@ __device__ static inline T Subsample_Bilinear(cudaTextureObject_t tex, int dst_width, int dst_height, int src_left, int src_top, int src_width, int src_height, - int bit_depth, float param) + int bit_depth, float param, + const float *weights, const int *offsets, + int filter_size) { float hscale = (float)src_width / (float)dst_width; float vscale = (float)src_height / (float)dst_height; @@ -1143,7 +1151,9 @@ __device__ static inline T Subsample_Bicubic(cudaTextureObject_t tex, int dst_width, int dst_height, int src_left, int src_top, int src_width, int src_height, - int bit_depth, float param) + int bit_depth, float param, + const float *weights, const int *offsets, + int filter_size) { float hscale = (float)src_width / (float)dst_width; float vscale = (float)src_height / (float)dst_height; @@ -1197,7 +1207,10 @@ __device__ static inline T Subsample_Bicubic(cudaTextureObject_t tex, params.dst_width, params.dst_height, params.dst_pitch, \ params.src_left, params.src_top, \ params.src_width, params.src_height, \ - params.param, params.mpeg_range); + params.param, params.mpeg_range, \ + (const float*) params.weights, \ + (const int*) params.offsets, \ + params.filter_size); extern "C" { diff --git a/libavfilter/vf_scale_cuda.h b/libavfilter/vf_scale_cuda.h index d685f73072..391631699e 100644 --- a/libavfilter/vf_scale_cuda.h +++ b/libavfilter/vf_scale_cuda.h @@ -44,6 +44,11 @@ typedef struct { int src_height; float param; int mpeg_range; + + /* Weights for the generic filter kernel */ + CUdeviceptr weights; + CUdeviceptr offsets; + int filter_size; } CUDAScaleKernelParams; #endif -- 2.52.0 >From 19491071cf22fc09bd36dddb581f8e5c7df44f73 Mon Sep 17 00:00:00 2001 From: Niklas Haas <[email protected]> Date: Mon, 22 Jun 2026 18:51:01 +0200 Subject: [PATCH 3/4] avfilter/vf_scale_cuda: add generic 1D filter kernel This can be useful for any sort of separable filtering with arbitrary weights. Signed-off-by: Niklas Haas <[email protected]> --- libavfilter/vf_scale_cuda.cu | 79 ++++++++++++++++++++++++++++++++++++ 1 file changed, 79 insertions(+) diff --git a/libavfilter/vf_scale_cuda.cu b/libavfilter/vf_scale_cuda.cu index 032b0e9c48..01398c3db8 100644 --- a/libavfilter/vf_scale_cuda.cu +++ b/libavfilter/vf_scale_cuda.cu @@ -1186,6 +1186,43 @@ __device__ static inline T Subsample_Bicubic(cudaTextureObject_t tex, #undef PIX } +enum ScaleDir { + SCALE_DIR_X, + SCALE_DIR_Y, +}; + +template<typename T, int dir> +__device__ static inline T Subsample_Generic(cudaTextureObject_t tex, + int xo, int yo, + int dst_width, int dst_height, + int src_left, int src_top, + int src_width, int src_height, + int bit_depth, float param, + const float *weights, const int *offsets, + int filter_size) +{ + const float factor = bit_depth > 8 ? 0xFFFF : 0xFF; + + floatT sum; + vec_set_scalar(sum, 0.0f); + + if (dir == SCALE_DIR_X) { + const float *row = &weights[xo * filter_size]; + const float x = 0.5f + src_left + offsets[xo]; + const float y = 0.5f + src_top + yo; + for (int i = 0; i < filter_size; i++) + sum += tex2D<floatT>(tex, x + i, y) * row[i]; + } else { + const float *col = &weights[yo * filter_size]; + const float x = 0.5f + src_left + xo; + const float y = 0.5f + src_top + offsets[yo]; + for (int i = 0; i < filter_size; i++) + sum += tex2D<floatT>(tex, x, y + i) * col[i]; + } + + return from_floatN<T, floatT>(sum * factor); +} + /// --- FUNCTION EXPORTS --- #define KERNEL_ARGS(T) CUDAScaleKernelParams params @@ -1373,4 +1410,46 @@ LANCZOS_KERNELS_RGB(rgb0) LANCZOS_KERNELS_RGB(bgr0) LANCZOS_KERNELS_RGB(rgba) LANCZOS_KERNELS_RGB(bgra) + +#define GENERIC_KERNEL(D, DIR, C, S) \ + __global__ void Subsample_Generic_##D##_##C##S( \ + KERNEL_ARGS(Convert_##C::out_T##S)) \ + { \ + SUBSAMPLE((Convert_##C::Convert##S< \ + Subsample_Generic<Convert_##C::in_T, DIR>, \ + Subsample_Generic<Convert_##C::in_T_uv, DIR> >), \ + Convert_##C::out_T##S) \ + } + +#define GENERIC_KERNEL_RAW(C) \ + GENERIC_KERNEL(h, SCALE_DIR_X, C,) \ + GENERIC_KERNEL(h, SCALE_DIR_X, C,_uv) \ + GENERIC_KERNEL(v, SCALE_DIR_Y, C,) \ + GENERIC_KERNEL(v, SCALE_DIR_Y, C,_uv) + +#define GENERIC_KERNELS(C) \ + GENERIC_KERNEL_RAW(planar8_ ## C) \ + GENERIC_KERNEL_RAW(planar10_ ## C) \ + GENERIC_KERNEL_RAW(planar16_ ## C) \ + GENERIC_KERNEL_RAW(semiplanar8_ ## C) \ + GENERIC_KERNEL_RAW(semiplanar10_ ## C) \ + GENERIC_KERNEL_RAW(semiplanar16_ ## C) + +#define GENERIC_KERNELS_RGB(C) \ + GENERIC_KERNEL_RAW(rgb0_ ## C) \ + GENERIC_KERNEL_RAW(bgr0_ ## C) \ + GENERIC_KERNEL_RAW(rgba_ ## C) \ + GENERIC_KERNEL_RAW(bgra_ ## C) + +GENERIC_KERNELS(planar8) +GENERIC_KERNELS(planar10) +GENERIC_KERNELS(planar16) +GENERIC_KERNELS(semiplanar8) +GENERIC_KERNELS(semiplanar10) +GENERIC_KERNELS(semiplanar16) + +GENERIC_KERNELS_RGB(rgb0) +GENERIC_KERNELS_RGB(bgr0) +GENERIC_KERNELS_RGB(rgba) +GENERIC_KERNELS_RGB(bgra) } -- 2.52.0 >From dbe67fd95ad95f9378d5e1702cb1add577fadd88 Mon Sep 17 00:00:00 2001 From: Niklas Haas <[email protected]> Date: Mon, 22 Jun 2026 19:16:31 +0200 Subject: [PATCH 4/4] avfilter/vf_scale_cuda: add generic filter kernel implementation This may be faster or slower than the existing specialized kernels, so I opted not to prefer it by default. I also deliberately didn't expose additional filter function capabilites yet. The main motivating reason here is to get correct anti-aliasing behavior when downscaling, which is currently completely broken. Signed-off-by: Niklas Haas <[email protected]> --- doc/filters.texi | 6 + libavfilter/Makefile | 2 +- libavfilter/vf_scale_cuda.c | 333 ++++++++++++++++++++++++++++++++++-- 3 files changed, 322 insertions(+), 19 deletions(-) diff --git a/doc/filters.texi b/doc/filters.texi index 1a649cf794..d4062353a2 100644 --- a/doc/filters.texi +++ b/doc/filters.texi @@ -27463,6 +27463,12 @@ frame-consumer that exhausts the limited decoder frame pool. If set to 1, frames are passed through as-is if they match the desired output parameters. This is the default behaviour. +@item use_filters +If set to 1, filter with a generic weight LUt instead of using fixed-function +shader kernels. May be faster or slower depending on the hardware. A value +of @code{auto} (the default) enables this automatically when required for +correct anti-aliasing when downscaling. + @item param Algorithm-Specific parameter. diff --git a/libavfilter/Makefile b/libavfilter/Makefile index 5f0760a2ff..a8c14230ce 100644 --- a/libavfilter/Makefile +++ b/libavfilter/Makefile @@ -473,7 +473,7 @@ OBJS-$(CONFIG_SAB_FILTER) += vf_sab.o OBJS-$(CONFIG_SCALE_FILTER) += vf_scale.o scale_eval.o framesync.o OBJS-$(CONFIG_SCALE_D3D11_FILTER) += vf_scale_d3d11.o scale_eval.o OBJS-$(CONFIG_SCALE_D3D12_FILTER) += vf_scale_d3d12.o scale_eval.o -OBJS-$(CONFIG_SCALE_CUDA_FILTER) += vf_scale_cuda.o scale_eval.o \ +OBJS-$(CONFIG_SCALE_CUDA_FILTER) += vf_scale_cuda.o scale_eval.o scale_filters.o \ vf_scale_cuda.ptx.o cuda/load_helper.o OBJS-$(CONFIG_SCALE_NPP_FILTER) += vf_scale_npp.o scale_eval.o OBJS-$(CONFIG_SCALE_QSV_FILTER) += vf_vpp_qsv.o diff --git a/libavfilter/vf_scale_cuda.c b/libavfilter/vf_scale_cuda.c index 01cac53348..403e4a2cca 100644 --- a/libavfilter/vf_scale_cuda.c +++ b/libavfilter/vf_scale_cuda.c @@ -22,14 +22,20 @@ #include <float.h> #include <stdio.h> +#include <string.h> +#include "libavutil/avassert.h" #include "libavutil/common.h" #include "libavutil/hwcontext.h" #include "libavutil/hwcontext_cuda_internal.h" #include "libavutil/cuda_check.h" #include "libavutil/internal.h" +#include "libavutil/mem.h" #include "libavutil/opt.h" #include "libavutil/pixdesc.h" +#include "libavutil/refstruct.h" + +#include "libswscale/filters.h" #include "avfilter.h" #include "filters.h" @@ -81,6 +87,19 @@ enum { INTERP_ALGO_COUNT }; +enum { + FILTER_OUT, + FILTER_TMP, + FILTER_NB, +}; + +typedef struct CUDAScaleFilter { + CUdeviceptr weights; ///< float[dst_size][filter_size] + CUdeviceptr offsets; ///< int[dst_size] + int filter_size; + int dst_size; +} CUDAScaleFilter; + typedef struct CUDAScaleContext { const AVClass *class; @@ -112,14 +131,19 @@ typedef struct CUDAScaleContext { CUcontext cu_ctx; CUmodule cu_module; - CUfunction cu_func; - CUfunction cu_func_uv; + CUfunction cu_func[FILTER_NB]; + CUfunction cu_func_uv[FILTER_NB]; CUstream cu_stream; int interp_algo; int interp_use_linear; int interp_as_integer; + CUDAScaleFilter filters[FILTER_NB]; + CUDAScaleFilter filters_uv[FILTER_NB]; + AVFrame *inter_buf; /* intermediate buffer for separated scaling */ + int use_filters; /* -1 for auto */ + float param; } CUDAScaleContext; @@ -138,23 +162,42 @@ static av_cold int cudascale_init(AVFilterContext *ctx) return 0; } +static void filter_uninit(CudaFunctions *cu, CUDAScaleFilter *filter) +{ + if (filter->weights) + cu->cuMemFree(filter->weights); + if (filter->offsets) + cu->cuMemFree(filter->offsets); + memset(filter, 0, sizeof(*filter)); +} + static av_cold void cudascale_uninit(AVFilterContext *ctx) { CUDAScaleContext *s = ctx->priv; - if (s->hwctx && s->cu_module) { + if (s->hwctx) { CudaFunctions *cu = s->hwctx->internal->cuda_dl; CUcontext dummy; CHECK_CU(cu->cuCtxPushCurrent(s->hwctx->cuda_ctx)); - CHECK_CU(cu->cuModuleUnload(s->cu_module)); - s->cu_module = NULL; + + for (int i = 0; i < FF_ARRAY_ELEMS(s->filters); i++) { + filter_uninit(cu, &s->filters[i]); + filter_uninit(cu, &s->filters_uv[i]); + } + + if (s->cu_module) { + CHECK_CU(cu->cuModuleUnload(s->cu_module)); + s->cu_module = NULL; + } + CHECK_CU(cu->cuCtxPopCurrent(&dummy)); } av_frame_free(&s->frame); av_buffer_unref(&s->frames_ctx); av_frame_free(&s->tmp_frame); + av_frame_free(&s->inter_buf); } static av_cold int init_hwframe_ctx(CUDAScaleContext *s, AVBufferRef *device_ctx, int width, int height) @@ -194,6 +237,50 @@ fail: return ret; } +static av_cold int inter_buf_init(CUDAScaleContext *s, AVBufferRef *device_ctx, + enum AVPixelFormat format, int width, int height) +{ + AVBufferRef *ref = NULL; + AVHWFramesContext *fctx; + int ret; + + ref = av_hwframe_ctx_alloc(device_ctx); + if (!ref) + return AVERROR(ENOMEM); + fctx = (AVHWFramesContext*)ref->data; + + fctx->format = AV_PIX_FMT_CUDA; + fctx->sw_format = format; + fctx->width = FFALIGN(width, 32); + fctx->height = FFALIGN(height, 32); + + ret = av_hwframe_ctx_init(ref); + if (ret < 0) + goto fail; + + av_assert0(!s->inter_buf); + s->inter_buf = av_frame_alloc(); + if (!s->inter_buf) { + ret = AVERROR(ENOMEM); + goto fail; + } + + ret = av_hwframe_get_buffer(ref, s->inter_buf, 0); + if (ret < 0) + goto fail; + + s->inter_buf->width = width; + s->inter_buf->height = height; + + av_buffer_unref(&ref); + return 0; + +fail: + av_frame_free(&s->inter_buf); + av_buffer_unref(&ref); + return ret; +} + static int format_is_supported(enum AVPixelFormat fmt) { for (int i = 0; i < FF_ARRAY_ELEMS(supported_formats); i++) @@ -285,6 +372,18 @@ static av_cold int init_processing_chain(AVFilterContext *ctx, int in_width, int if (in_width == out_width && in_height == out_height && in_format == out_format && s->interp_algo == INTERP_ALGO_DEFAULT) s->interp_algo = INTERP_ALGO_NEAREST; + + if (s->interp_algo == INTERP_ALGO_NEAREST) { + s->use_filters = 0; + } else if (s->use_filters < 0 && (in_width < out_width || in_height < out_height)) + s->use_filters = 1; /* downscaling; needed for anti-aliasing */ + + if (s->use_filters) { + ret = inter_buf_init(s, in_frames_ctx->device_ref, in_format, + out_width, in_height); + if (ret < 0) + return ret; + } } outl->hw_frames_ctx = av_buffer_ref(s->frames_ctx); @@ -310,6 +409,14 @@ static av_cold int cudascale_load_functions(AVFilterContext *ctx) extern const unsigned char ff_vf_scale_cuda_ptx_data[]; extern const unsigned int ff_vf_scale_cuda_ptx_len; + if (s->use_filters) { + /* Final pass is always vertical unless not vertically scaling */ + AVFilterLink *inlink = ctx->inputs[0]; + AVFilterLink *outlink = ctx->outputs[0]; + function_infix = inlink->h == outlink->h ? "Generic_h" : "Generic_v"; + s->interp_use_linear = 0; + s->interp_as_integer = 0; + } else { switch(s->interp_algo) { case INTERP_ALGO_NEAREST: function_infix = "Nearest"; @@ -336,6 +443,7 @@ static av_cold int cudascale_load_functions(AVFilterContext *ctx) av_log(ctx, AV_LOG_ERROR, "Unknown interpolation algorithm\n"); return AVERROR_BUG; } + } ret = CHECK_CU(cu->cuCtxPushCurrent(cuda_ctx)); if (ret < 0) @@ -347,7 +455,7 @@ static av_cold int cudascale_load_functions(AVFilterContext *ctx) goto fail; snprintf(buf, sizeof(buf), "Subsample_%s_%s_%s", function_infix, in_fmt_name, out_fmt_name); - ret = CHECK_CU(cu->cuModuleGetFunction(&s->cu_func, s->cu_module, buf)); + ret = CHECK_CU(cu->cuModuleGetFunction(&s->cu_func[FILTER_OUT], s->cu_module, buf)); if (ret < 0) { av_log(ctx, AV_LOG_FATAL, "Unsupported conversion: %s -> %s\n", in_fmt_name, out_fmt_name); ret = AVERROR(ENOSYS); @@ -356,17 +464,173 @@ static av_cold int cudascale_load_functions(AVFilterContext *ctx) av_log(ctx, AV_LOG_DEBUG, "Luma filter: %s (%s -> %s)\n", buf, av_get_pix_fmt_name(s->in_fmt), av_get_pix_fmt_name(s->out_fmt)); snprintf(buf, sizeof(buf), "Subsample_%s_%s_%s_uv", function_infix, in_fmt_name, out_fmt_name); - ret = CHECK_CU(cu->cuModuleGetFunction(&s->cu_func_uv, s->cu_module, buf)); + ret = CHECK_CU(cu->cuModuleGetFunction(&s->cu_func_uv[FILTER_OUT], s->cu_module, buf)); if (ret < 0) goto fail; av_log(ctx, AV_LOG_DEBUG, "Chroma filter: %s (%s -> %s)\n", buf, av_get_pix_fmt_name(s->in_fmt), av_get_pix_fmt_name(s->out_fmt)); + if (s->inter_buf) { + /* Intermediate pass is always horizontal */ + snprintf(buf, sizeof(buf), "Subsample_Generic_h_%s_%s", in_fmt_name, in_fmt_name); + ret = CHECK_CU(cu->cuModuleGetFunction(&s->cu_func[FILTER_TMP], s->cu_module, buf)); + if (ret < 0) + goto fail; + + snprintf(buf, sizeof(buf), "Subsample_Generic_h_%s_%s_uv", in_fmt_name, in_fmt_name); + ret = CHECK_CU(cu->cuModuleGetFunction(&s->cu_func_uv[FILTER_TMP], s->cu_module, buf)); + if (ret < 0) + goto fail; + } + fail: CHECK_CU(cu->cuCtxPopCurrent(&dummy)); return ret; } +static av_cold int cudascale_filter_init(AVFilterContext *ctx, + CUDAScaleFilter *f, + int src_size, int dst_size, + double virtual_size) +{ + CUDAScaleContext *s = ctx->priv; + CudaFunctions *cu = s->hwctx->internal->cuda_dl; + + SwsFilterParams params = { + .scaler_params = { SWS_PARAM_DEFAULT, SWS_PARAM_DEFAULT }, + .src_size = src_size, + .dst_size = dst_size, + .virtual_size = virtual_size, + }; + + switch (s->interp_algo) { + case INTERP_ALGO_NEAREST: return 0; /* no weights needed */ + case INTERP_ALGO_BILINEAR: params.scaler = SWS_SCALE_BILINEAR; break; + case INTERP_ALGO_LANCZOS: params.scaler = SWS_SCALE_LANCZOS; break; + case INTERP_ALGO_DEFAULT: + case INTERP_ALGO_BICUBIC: + params.scaler = SWS_SCALE_BICUBIC; + params.scaler_params[0] = params.scaler_params[1] = 0.0; + if (s->param != SCALE_CUDA_PARAM_DEFAULT) + params.scaler_params[1] = s->param; + break; + } + + SwsFilterWeights *weights = NULL; + int ret = ff_sws_filter_generate(ctx, ¶ms, &weights); + if (ret < 0) { + if (ret == AVERROR(ENOTSUP)) { + av_log(ctx, AV_LOG_ERROR, "Filter size exceeds the maximum " + "currently supported by the CUDA scaler (%d).\n", + SWS_FILTER_SIZE_MAX); + } + return ret; + } + + float *tmp = av_malloc_array(weights->num_weights, sizeof(*tmp)); + if (!tmp) { + ret = AVERROR(ENOMEM); + goto fail; + } + for (size_t i = 0; i < weights->num_weights; i++) + tmp[i] = weights->weights[i] / (float) SWS_FILTER_SCALE; + + f->filter_size = weights->filter_size; + f->dst_size = dst_size; + + const size_t weights_size = weights->num_weights * sizeof(*tmp); + ret = CHECK_CU(cu->cuMemAlloc(&f->weights, weights_size)); + if (ret < 0) + goto fail; + ret = CHECK_CU(cu->cuMemcpyHtoD(f->weights, tmp, weights_size)); + if (ret < 0) + goto fail; + + const size_t offsets_size = dst_size * sizeof(*weights->offsets); + ret = CHECK_CU(cu->cuMemAlloc(&f->offsets, offsets_size)); + if (ret < 0) + goto fail; + ret = CHECK_CU(cu->cuMemcpyHtoD(f->offsets, weights->offsets, offsets_size)); + if (ret < 0) + goto fail; + + av_log(ctx, AV_LOG_VERBOSE, " using %d tap '%s' filter: %d -> %d\n", + f->filter_size, weights->name, src_size, dst_size); + + ret = 0; + +fail: + av_free(tmp); + av_refstruct_unref(&weights); + return ret; +} + +static av_cold int cudascale_setup_filters(AVFilterContext *ctx) +{ + CUDAScaleContext *s = ctx->priv; + AVFilterLink *inlink = ctx->inputs[0]; + AVFilterLink *outlink = ctx->outputs[0]; + CudaFunctions *cu = s->hwctx->internal->cuda_dl; + CUcontext dummy; + int ret; + + const int sub_x = s->in_desc->log2_chroma_w; + const int sub_y = s->in_desc->log2_chroma_h; + + ret = CHECK_CU(cu->cuCtxPushCurrent(s->hwctx->cuda_ctx)); + if (ret < 0) + return ret; + + int pass_x = -1, pass_y = -1; + if (inlink->w != outlink->w && inlink->h != outlink->h) { + /* Always perform the horizontal scaling pass first */ + pass_x = FILTER_TMP; + pass_y = FILTER_OUT; + } else if (inlink->w != outlink->w) { + pass_x = FILTER_OUT; + } else if (inlink->h != outlink->h) { + pass_y = FILTER_OUT; + } + + if (pass_x >= 0) { + ret = cudascale_filter_init(ctx, &s->filters[pass_x], + inlink->w, outlink->w, 0.0); + if (ret < 0) + goto fail; + if (s->in_planes > 1) { + const int src_size = AV_CEIL_RSHIFT(inlink->w, sub_x); + const int dst_size = AV_CEIL_RSHIFT(outlink->w, sub_x); + const double ratio = (double) outlink->w / inlink->w; + ret = cudascale_filter_init(ctx, &s->filters_uv[pass_x], + src_size, dst_size, src_size * ratio); + if (ret < 0) + goto fail; + } + } + + if (pass_y >= 0) { + ret = cudascale_filter_init(ctx, &s->filters[pass_y], + inlink->h, outlink->h, 0.0); + if (ret < 0) + goto fail; + if (s->in_planes > 1) { + const int src_size = AV_CEIL_RSHIFT(inlink->h, sub_y); + const int dst_size = AV_CEIL_RSHIFT(outlink->h, sub_y); + const double ratio = (double) outlink->h / inlink->h; + ret = cudascale_filter_init(ctx, &s->filters_uv[pass_y], + src_size, dst_size, src_size * ratio); + if (ret < 0) + goto fail; + } + } + + ret = 0; + +fail: + CHECK_CU(cu->cuCtxPopCurrent(&dummy)); + return ret; +} + static av_cold int cudascale_config_props(AVFilterLink *outlink) { AVFilterContext *ctx = outlink->src; @@ -427,6 +691,12 @@ static av_cold int cudascale_config_props(AVFilterLink *outlink) outlink->w, outlink->h, av_get_pix_fmt_name(s->out_fmt), s->passthrough ? " (passthrough)" : ""); + if (s->use_filters) { + ret = cudascale_setup_filters(ctx); + if (ret < 0) + return ret; + } + ret = cudascale_load_functions(ctx); if (ret < 0) return ret; @@ -439,7 +709,8 @@ fail: static int call_resize_kernel(AVFilterContext *ctx, CUfunction func, CUtexObject src_tex[4], int src_left, int src_top, int src_width, int src_height, - AVFrame *out_frame, int dst_width, int dst_height, int dst_pitch, int mpeg_range) + AVFrame *out_frame, int dst_width, int dst_height, int dst_pitch, int mpeg_range, + const CUDAScaleFilter *filter) { CUDAScaleContext *s = ctx->priv; CudaFunctions *cu = s->hwctx->internal->cuda_dl; @@ -460,9 +731,15 @@ static int call_resize_kernel(AVFilterContext *ctx, CUfunction func, .src_width = src_width, .src_height = src_height, .param = s->param, - .mpeg_range = mpeg_range + .mpeg_range = mpeg_range, }; + if (filter) { + params.weights = filter->weights; + params.offsets = filter->offsets; + params.filter_size = filter->filter_size; + } + void *args[] = { ¶ms }; return CHECK_CU(cu->cuLaunchKernel(func, @@ -470,7 +747,7 @@ static int call_resize_kernel(AVFilterContext *ctx, CUfunction func, BLOCKX, BLOCKY, 1, 0, s->cu_stream, args, NULL)); } -static int scalecuda_resize(AVFilterContext *ctx, +static int scalecuda_resize(AVFilterContext *ctx, int pass, AVFrame *out, AVFrame *in) { CUDAScaleContext *s = ctx->priv; @@ -479,6 +756,13 @@ static int scalecuda_resize(AVFilterContext *ctx, int i, ret; int mpeg_range = in->color_range != AVCOL_RANGE_JPEG; + const AVPixFmtDescriptor *out_desc = s->out_desc; + int out_planes = s->out_planes; + if (pass == FILTER_TMP) { + out_desc = s->in_desc; + out_planes = s->in_planes; + } + CUtexObject tex[4] = { 0, 0, 0, 0 }; int crop_width = (in->width - in->crop_right) - in->crop_left; @@ -520,23 +804,25 @@ static int scalecuda_resize(AVFilterContext *ctx, } // scale primary plane(s). Usually Y (and A), or single plane of RGB frames. - ret = call_resize_kernel(ctx, s->cu_func, + ret = call_resize_kernel(ctx, s->cu_func[pass], tex, in->crop_left, in->crop_top, crop_width, crop_height, - out, out->width, out->height, out->linesize[0], mpeg_range); + out, out->width, out->height, out->linesize[0], mpeg_range, + &s->filters[pass]); if (ret < 0) goto exit; - if (s->out_planes > 1) { + if (out_planes > 1) { // scale UV plane. Scale function sets both U and V plane, or singular interleaved plane. - ret = call_resize_kernel(ctx, s->cu_func_uv, tex, + ret = call_resize_kernel(ctx, s->cu_func_uv[pass], tex, AV_CEIL_RSHIFT(in->crop_left, s->in_desc->log2_chroma_w), AV_CEIL_RSHIFT(in->crop_top, s->in_desc->log2_chroma_h), AV_CEIL_RSHIFT(crop_width, s->in_desc->log2_chroma_w), AV_CEIL_RSHIFT(crop_height, s->in_desc->log2_chroma_h), out, - AV_CEIL_RSHIFT(out->width, s->out_desc->log2_chroma_w), - AV_CEIL_RSHIFT(out->height, s->out_desc->log2_chroma_h), - out->linesize[1], mpeg_range); + AV_CEIL_RSHIFT(out->width, out_desc->log2_chroma_w), + AV_CEIL_RSHIFT(out->height, out_desc->log2_chroma_h), + out->linesize[1], mpeg_range, + &s->filters_uv[pass]); if (ret < 0) goto exit; } @@ -558,7 +844,16 @@ static int cudascale_scale(AVFilterContext *ctx, AVFrame *out, AVFrame *in) AVFrame *src = in; int ret; - ret = scalecuda_resize(ctx, s->frame, src); + if (s->inter_buf) { + /* Handle first pass separately */ + s->inter_buf->color_range = in->color_range; + ret = scalecuda_resize(ctx, FILTER_TMP, s->inter_buf, in); + if (ret < 0) + return ret; + src = s->inter_buf; + } + + ret = scalecuda_resize(ctx, FILTER_OUT, s->frame, src); if (ret < 0) return ret; @@ -653,6 +948,8 @@ static const AVOption options[] = { { "lanczos", "lanczos", 0, AV_OPT_TYPE_CONST, { .i64 = INTERP_ALGO_LANCZOS }, 0, 0, FLAGS, .unit = "interp_algo" }, { "format", "Output video pixel format", OFFSET(format), AV_OPT_TYPE_PIXEL_FMT, { .i64 = AV_PIX_FMT_NONE }, INT_MIN, INT_MAX, .flags=FLAGS }, { "passthrough", "Do not process frames at all if parameters match", OFFSET(passthrough), AV_OPT_TYPE_BOOL, { .i64 = 1 }, 0, 1, FLAGS }, + { "use_filters", "Use generic filters instead of fixed function kernels", OFFSET(use_filters), AV_OPT_TYPE_INT, { .i64 = -1 }, -1, 1, FLAGS, .unit = "use_filters" }, + { "auto", NULL, 0, AV_OPT_TYPE_CONST, {.i64 = -1}, 0, 0, FLAGS, .unit = "use_filters" }, { "param", "Algorithm-Specific parameter", OFFSET(param), AV_OPT_TYPE_FLOAT, { .dbl = SCALE_CUDA_PARAM_DEFAULT }, -FLT_MAX, FLT_MAX, FLAGS }, { "force_original_aspect_ratio", "decrease or increase w/h if necessary to keep the original AR", OFFSET(force_original_aspect_ratio), AV_OPT_TYPE_INT, { .i64 = 0 }, 0, SCALE_FORCE_OAR_NB-1, FLAGS, .unit = "force_oar" }, { "disable", NULL, 0, AV_OPT_TYPE_CONST, {.i64 = SCALE_FORCE_OAR_DISABLE }, 0, 0, FLAGS, .unit = "force_oar" }, -- 2.52.0 _______________________________________________ ffmpeg-devel mailing list -- [email protected] To unsubscribe send an email to [email protected]
