This patch adds 'vf_overlay_cuda' filter. 
It draws one picture on top of another on CUDA GPU. 
For the end-user, it's similar to 'vf_overlay_opencl' and other overlay 
filters. 

This filter would be especially useful for building video processing pipelines 
that execute fully on the CUDA GPU. For example, the following pipeline would 
be possible: decode -> scale -> overlay -> encode, without copying frames 
between CPU and GPU in between.

Technical details.

Supported sw input formats are NV12 and YUV420P for main input, and NV12, 
YUV420P and YUVA420P for overlay input. 
Main and overlay sw formats should match (i.e, overlaying YUVA420P on NV12 is 
not implemented). 
All pixel format conversions are needed to be done with 'format' or 'scale_npp' 
filters before 'overlay_cuda'.

It was needed to slightly modify 'hwcontext_cuda.c' to allow overlays with 
alpha channel:
 - Allow AV_PIX_FMT_YUVA420P to enable hwuploading frames with alpha channel to 
GPU.
 - Do not shift Height of 4rd plane (alpha) when uploading to GPU.

Examples.

- Overlay picture on top of video (main: YUVJ420P->NV12, overlay: NV12)
$ ffmpeg -y -init_hw_device cuda=cuda -filter_hw_device cuda -hwaccel cuvid \
  -c:v h264_cuvid -i main.mp4 \
  -i ~/overlay.jpg \
  -filter_complex "[1:v]format=nv12, hwupload[overlay], 
[0:v][overlay]overlay_cuda=x=0:y=0:shortest=false" \
  -an -c:v h264_nvenc -b:v 5M output.mp4

- Overlay one video on top of another (main: NV12, overlay: NV12)
$ ffmpeg -y \
  -hwaccel cuvid -c:v h264_cuvid -i main.mp4 \
  -hwaccel cuvid -c:v h264_cuvid -i overlay.mp4 \
  -filter_complex "[1:v]scale_npp=512:-1[o], 
[v:0][o]overlay_cuda=x=100:y=100:shortest=true" \
  -an -c:v h264_nvenc -b:v 5M output.mp4

- Overlay picture with alpha channel on top of video (main: NV12->YUV420P, 
overlay: RGBA->YUVA420P)
$ ffmpeg -y \
  -init_hw_device cuda=cuda -filter_hw_device cuda -hwaccel cuvid \
  -c:v h264_cuvid -i ~/main.mp4 \
  -i ~/overlay.png \
  -filter_complex "[1:v]format=yuva420p, hwupload[o], 
[v:0]scale_npp=format=yuv420p[m], [m][o]overlay_cuda=x=0:y=0:shortest=false" \
  -an -c:v h264_nvenc -b:v 5M output.mp4

Patch attached.

P.S. This is my first patch, I would be grateful for any feedback to know if 
I'm doing things correctly or not.
Thanks!


Signed-off-by: Yaroslav Pogrebnyak <yyyaros...@gmail.com>
---
 configure                      |   2 +
 libavfilter/Makefile           |   1 +
 libavfilter/allfilters.c       |   1 +
 libavfilter/vf_overlay_cuda.c  | 451 +++++++++++++++++++++++++++++++++
 libavfilter/vf_overlay_cuda.cu |  54 ++++
 libavutil/hwcontext_cuda.c     |   3 +-
 6 files changed, 511 insertions(+), 1 deletion(-)
 create mode 100644 libavfilter/vf_overlay_cuda.c
 create mode 100644 libavfilter/vf_overlay_cuda.cu

diff --git a/configure b/configure
index 18f2841765..b08dc7bd62 100755
--- a/configure
+++ b/configure
@@ -3026,6 +3026,8 @@ scale_cuda_filter_deps_any="cuda_nvcc cuda_llvm"
 thumbnail_cuda_filter_deps="ffnvcodec"
 thumbnail_cuda_filter_deps_any="cuda_nvcc cuda_llvm"
 transpose_npp_filter_deps="ffnvcodec libnpp"
+overlay_cuda_filter_deps="ffnvcodec"
+overlay_cuda_filter_deps_any="cuda_nvcc cuda_llvm"
 
 amf_deps_any="libdl LoadLibrary"
 nvenc_deps="ffnvcodec"
diff --git a/libavfilter/Makefile b/libavfilter/Makefile
index 750412da6b..1ecaeae372 100644
--- a/libavfilter/Makefile
+++ b/libavfilter/Makefile
@@ -328,6 +328,7 @@ OBJS-$(CONFIG_OVERLAY_OPENCL_FILTER)         += vf_overlay_opencl.o opencl.o \
                                                 opencl/overlay.o framesync.o
 OBJS-$(CONFIG_OVERLAY_QSV_FILTER)            += vf_overlay_qsv.o framesync.o
 OBJS-$(CONFIG_OVERLAY_VULKAN_FILTER)         += vf_overlay_vulkan.o vulkan.o
+OBJS-$(CONFIG_OVERLAY_CUDA_FILTER)           += vf_overlay_cuda.o framesync.o vf_overlay_cuda.ptx.o
 OBJS-$(CONFIG_OWDENOISE_FILTER)              += vf_owdenoise.o
 OBJS-$(CONFIG_PAD_FILTER)                    += vf_pad.o
 OBJS-$(CONFIG_PAD_OPENCL_FILTER)             += vf_pad_opencl.o opencl.o opencl/pad.o
diff --git a/libavfilter/allfilters.c b/libavfilter/allfilters.c
index 501e5d041b..fb32bef788 100644
--- a/libavfilter/allfilters.c
+++ b/libavfilter/allfilters.c
@@ -312,6 +312,7 @@ extern AVFilter ff_vf_overlay;
 extern AVFilter ff_vf_overlay_opencl;
 extern AVFilter ff_vf_overlay_qsv;
 extern AVFilter ff_vf_overlay_vulkan;
+extern AVFilter ff_vf_overlay_cuda;
 extern AVFilter ff_vf_owdenoise;
 extern AVFilter ff_vf_pad;
 extern AVFilter ff_vf_pad_opencl;
diff --git a/libavfilter/vf_overlay_cuda.c b/libavfilter/vf_overlay_cuda.c
new file mode 100644
index 0000000000..2fa4ea4443
--- /dev/null
+++ b/libavfilter/vf_overlay_cuda.c
@@ -0,0 +1,451 @@
+/*
+ * Copyright (c) 2020 Yaroslav Pogrebnyak <yyyaros...@gmail.com>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * @file
+ * Overlay one video on top of another using cuda hardware acceleration
+ */
+
+#include "libavutil/log.h"
+#include "libavutil/mem.h"
+#include "libavutil/opt.h"
+#include "libavutil/pixdesc.h"
+#include "libavutil/hwcontext.h"
+#include "libavutil/hwcontext_cuda_internal.h"
+#include "libavutil/cuda_check.h"
+
+#include "avfilter.h"
+#include "framesync.h"
+#include "internal.h"
+
+#define CHECK_CU(x) FF_CUDA_CHECK_DL(ctx, ctx->hwctx->internal->cuda_dl, x)
+#define DIV_UP(a, b) ( ((a) + (b) - 1) / (b) )
+
+#define BLOCK_X 32
+#define BLOCK_Y 16
+
+static const enum AVPixelFormat supported_main_formats[] = {
+    AV_PIX_FMT_NV12,
+    AV_PIX_FMT_YUV420P,
+    AV_PIX_FMT_NONE,
+};
+
+static const enum AVPixelFormat supported_overlay_formats[] = {
+    AV_PIX_FMT_NV12,
+    AV_PIX_FMT_YUV420P,
+    AV_PIX_FMT_YUVA420P,
+    AV_PIX_FMT_NONE,
+};
+
+/**
+ * OverlayCUDAContext
+ */
+typedef struct OverlayCUDAContext {
+    const AVClass      *class;
+
+    enum AVPixelFormat in_format_overlay;
+    enum AVPixelFormat in_format_main;
+
+    AVBufferRef *device_ref;
+    AVCUDADeviceContext *hwctx;
+
+    CUcontext cu_ctx;
+    CUmodule cu_module;
+    CUfunction cu_func;
+    CUstream cu_stream;
+
+    FFFrameSync fs;
+
+    int x_position;
+    int y_position;
+
+} OverlayCUDAContext;
+
+/**
+ * Helper to find out if provided format is supported by filter
+ */
+static int format_is_supported(const enum AVPixelFormat formats[], enum AVPixelFormat fmt)
+{
+    for (int i = 0; formats[i] != AV_PIX_FMT_NONE; i++)
+        if (formats[i] == fmt)
+            return 1;
+    return 0;
+}
+
+/**
+ * Helper checks if we can process main and overlay pixel formats
+ */
+static int formats_match(const enum AVPixelFormat format_main, const enum AVPixelFormat format_overlay) {
+    switch(format_main) {
+        case AV_PIX_FMT_NV12:
+            return format_overlay == AV_PIX_FMT_NV12;
+        case AV_PIX_FMT_YUV420P:
+            return format_overlay == AV_PIX_FMT_YUV420P ||
+                   format_overlay == AV_PIX_FMT_YUVA420P;
+        default:
+            return 0;
+    }
+}
+
+/**
+ * Call overlay kernell for a plane
+ */
+static int overlay_cuda_call_kernel(
+    OverlayCUDAContext *ctx,
+    int x_position, int y_position,
+    uint8_t* main_data, int main_linesize,
+    int main_width, int main_height,
+    uint8_t* overlay_data, int overlay_linesize,
+    int overlay_width, int overlay_height,
+    uint8_t* alpha_data, int alpha_linesize,
+    int alpha_adj_x, int alpha_adj_y) {
+
+    CudaFunctions *cu = ctx->hwctx->internal->cuda_dl;
+
+    void* kernel_args[] = {
+        &x_position, &y_position,
+        &main_data, &main_linesize,
+        &overlay_data, &overlay_linesize,
+        &overlay_width, &overlay_height,
+        &alpha_data, &alpha_linesize,
+        &alpha_adj_x, &alpha_adj_y,
+    };
+
+    return CHECK_CU(cu->cuLaunchKernel(
+        ctx->cu_func,
+        DIV_UP(main_width, BLOCK_X), DIV_UP(main_height, BLOCK_Y), 1,
+        BLOCK_X, BLOCK_Y, 1,
+        0, ctx->cu_stream, kernel_args, NULL));
+}
+
+/**
+ * Perform blend overlay picture over main picture
+ */
+static int overlay_cuda_blend(FFFrameSync *fs)
+{
+    int ret;
+
+    AVFilterContext *avctx = fs->parent;
+    OverlayCUDAContext *ctx = avctx->priv;
+    AVFilterLink *outlink = avctx->outputs[0];
+
+    CudaFunctions *cu = ctx->hwctx->internal->cuda_dl;
+    CUcontext dummy, cuda_ctx = ctx->hwctx->cuda_ctx;
+
+    AVFrame *input_main, *input_overlay, *out;
+
+    ctx->cu_ctx = cuda_ctx;
+
+    // read main and overlay frames from inputs
+
+    ret = ff_framesync_get_frame(fs, 0, &input_main, 0);
+    if (ret < 0) {
+        return ret;
+    }
+
+    ret = ff_framesync_get_frame(fs, 1, &input_overlay, 0);
+    if (ret < 0) {
+        return ret;
+    }
+
+    if (!input_main || !input_overlay) {
+        return AVERROR_BUG;
+    }
+
+    ret = av_frame_make_writable(input_main);
+    if (ret < 0) {
+        return ret;
+    }
+
+    // push cuda context
+
+    ret = CHECK_CU(cu->cuCtxPushCurrent(cuda_ctx));
+    if (ret < 0) {
+        return ret;
+    }
+
+    // overlay first plane
+
+    overlay_cuda_call_kernel(ctx,
+        ctx->x_position, ctx->y_position,
+        input_main->data[0], input_main->linesize[0],
+        input_main->width, input_main->height,
+        input_overlay->data[0], input_overlay->linesize[0],
+        input_overlay->width, input_overlay->height,
+        input_overlay->data[3], input_overlay->linesize[3], 1, 1);
+
+    // overlay rest planes depending on pixel format
+
+    switch(ctx->in_format_overlay) {
+
+    case AV_PIX_FMT_NV12:
+        overlay_cuda_call_kernel(ctx,
+            ctx->x_position, ctx->y_position / 2,
+            input_main->data[1], input_main->linesize[1],
+            input_main->width, input_main->height / 2,
+            input_overlay->data[1], input_overlay->linesize[1],
+            input_overlay->width, input_overlay->height / 2,
+            0, 0, 0, 0);
+
+        break;
+
+    case AV_PIX_FMT_YUV420P:
+    case AV_PIX_FMT_YUVA420P:
+        overlay_cuda_call_kernel(ctx,
+            ctx->x_position / 2 , ctx->y_position / 2,
+            input_main->data[1], input_main->linesize[1],
+            input_main->width / 2, input_main->height / 2,
+            input_overlay->data[1], input_overlay->linesize[1],
+            input_overlay->width / 2, input_overlay->height / 2,
+            input_overlay->data[3], input_overlay->linesize[3], 2, 2);
+
+        overlay_cuda_call_kernel(ctx,
+            ctx->x_position / 2 , ctx->y_position / 2,
+            input_main->data[2], input_main->linesize[2],
+            input_main->width / 2, input_main->height / 2,
+            input_overlay->data[2], input_overlay->linesize[2],
+            input_overlay->width / 2, input_overlay->height / 2,
+            input_overlay->data[3], input_overlay->linesize[3], 2, 2);
+
+        break;
+
+    default:
+        av_log(ctx, AV_LOG_ERROR, "Passed unsupported overlay pixel format\n");
+        return AVERROR_BUG;
+    }
+
+    CHECK_CU(cu->cuCtxPopCurrent(&dummy));
+
+    out = av_frame_alloc();
+    av_frame_ref(out, input_main);
+    av_frame_copy_props(out, input_main);
+
+    return ff_filter_frame(outlink, out);
+}
+
+/**
+ * Initialize overlay_cuda
+ */
+static av_cold int overlay_cuda_init(AVFilterContext *avctx)
+{
+    OverlayCUDAContext* ctx = avctx->priv;
+    ctx->fs.on_event = &overlay_cuda_blend;
+
+    return 0;
+}
+
+/**
+ * Uninitialize overlay_cuda
+ */
+static av_cold void overlay_cuda_uninit(AVFilterContext *avctx)
+{
+    OverlayCUDAContext* ctx = avctx->priv;
+
+    ff_framesync_uninit(&ctx->fs);
+
+    if (ctx->hwctx && ctx->cu_module) {
+        CUcontext dummy;
+        CudaFunctions *cu = ctx->hwctx->internal->cuda_dl;
+        CHECK_CU(cu->cuCtxPushCurrent(ctx->cu_ctx));
+        CHECK_CU(cu->cuModuleUnload(ctx->cu_module));
+        CHECK_CU(cu->cuCtxPopCurrent(&dummy));
+    }
+}
+
+/**
+ * Activate overlay_cuda
+ */
+static int overlay_cuda_activate(AVFilterContext *avctx)
+{
+    OverlayCUDAContext *ctx = avctx->priv;
+
+    return ff_framesync_activate(&ctx->fs);
+}
+
+/**
+ * Query formats
+ */
+static int overlay_cuda_query_formats(AVFilterContext *avctx)
+{
+    static const enum AVPixelFormat pixel_formats[] = {
+        AV_PIX_FMT_CUDA, AV_PIX_FMT_NONE,
+    };
+
+    AVFilterFormats *pix_fmts = ff_make_format_list(pixel_formats);
+
+    return ff_set_common_formats(avctx, pix_fmts);
+}
+
+/**
+ * Configure output
+ */
+static int overlay_cuda_config_output(AVFilterLink *outlink)
+{
+
+    extern char vf_overlay_cuda_ptx[];
+
+    int err;
+    AVFilterContext* avctx = outlink->src;
+    OverlayCUDAContext* ctx = avctx->priv;
+
+    AVFilterLink *inlink = avctx->inputs[0];
+    AVHWFramesContext  *frames_ctx = (AVHWFramesContext*)inlink->hw_frames_ctx->data;
+
+    AVFilterLink *inlink_overlay = avctx->inputs[1];
+    AVHWFramesContext  *frames_ctx_overlay = (AVHWFramesContext*)inlink_overlay->hw_frames_ctx->data;
+
+    CUcontext dummy, cuda_ctx;
+    CudaFunctions *cu;
+
+    // check main input formats
+
+    if (!frames_ctx) {
+        av_log(ctx, AV_LOG_ERROR, "No hw context provided on main input\n");
+        return AVERROR(EINVAL);
+    }
+
+    ctx->in_format_main = frames_ctx->sw_format;
+    if (!format_is_supported(supported_main_formats, ctx->in_format_main)) {
+        av_log(ctx, AV_LOG_ERROR, "Unsupported main input format: %s\n",
+               av_get_pix_fmt_name(ctx->in_format_main));
+        return AVERROR(ENOSYS);
+    }
+
+    // check overlay input formats
+
+    if (!frames_ctx_overlay) {
+        av_log(ctx, AV_LOG_ERROR, "No hw context provided on overlay input\n");
+        return AVERROR(EINVAL);
+    }
+
+    ctx->in_format_overlay = frames_ctx_overlay->sw_format;
+    if (!format_is_supported(supported_overlay_formats, ctx->in_format_overlay)) {
+        av_log(ctx, AV_LOG_ERROR, "Unsupported overlay input format: %s\n",
+            av_get_pix_fmt_name(ctx->in_format_overlay));
+        return AVERROR(ENOSYS);
+    }
+
+    // check we can overlay pictures with those pixel formats
+
+    if (!formats_match(ctx->in_format_main, ctx->in_format_overlay)) {
+        av_log(ctx, AV_LOG_ERROR, "Can't overlay %s on %s \n",
+            av_get_pix_fmt_name(ctx->in_format_overlay), av_get_pix_fmt_name(ctx->in_format_main));
+        return AVERROR(EINVAL);
+    }
+
+    // initialize
+
+    ctx->hwctx = frames_ctx->device_ctx->hwctx;
+    cuda_ctx = ctx->hwctx->cuda_ctx;
+    ctx->fs.time_base = inlink->time_base;
+
+    ctx->cu_stream = ctx->hwctx->stream;
+    ctx->device_ref = ((AVHWFramesContext*)inlink->hw_frames_ctx->data)->device_ref;
+
+    outlink->hw_frames_ctx = av_buffer_ref(inlink->hw_frames_ctx);
+
+    // load functions
+
+    cu = ctx->hwctx->internal->cuda_dl;
+
+    err = CHECK_CU(cu->cuCtxPushCurrent(cuda_ctx));
+    if (err < 0) {
+        return err;
+    }
+
+    err = CHECK_CU(cu-> cuModuleLoadData(&ctx->cu_module, vf_overlay_cuda_ptx));
+    if (err < 0) {
+        return err;
+    }
+
+    err = CHECK_CU(cu->cuModuleGetFunction(&ctx->cu_func, ctx->cu_module, "Overlay_Cuda"));
+    if (err < 0) {
+        return err;
+    }
+
+    CHECK_CU(cu->cuCtxPopCurrent(&dummy));
+
+    // init dual input
+
+    err = ff_framesync_init_dualinput(&ctx->fs, avctx);
+    if (err < 0) {
+        return err;
+    }
+
+    return ff_framesync_configure(&ctx->fs);
+}
+
+
+#define OFFSET(x) offsetof(OverlayCUDAContext, x)
+#define FLAGS (AV_OPT_FLAG_FILTERING_PARAM | AV_OPT_FLAG_VIDEO_PARAM)
+
+static const AVOption overlay_cuda_options[] = {
+    { "x", "Overlay x position",
+      OFFSET(x_position), AV_OPT_TYPE_INT, { .i64 = 0 }, INT_MIN, INT_MAX, .flags = FLAGS },
+    { "y", "Overlay y position",
+      OFFSET(y_position), AV_OPT_TYPE_INT, { .i64 = 0 }, INT_MIN, INT_MAX, .flags = FLAGS },
+    { "eof_action", "Action to take when encountering EOF from secondary input ",
+        OFFSET(fs.opt_eof_action), AV_OPT_TYPE_INT, { .i64 = EOF_ACTION_REPEAT },
+        EOF_ACTION_REPEAT, EOF_ACTION_PASS, .flags = FLAGS, "eof_action" },
+        { "repeat", "Repeat the previous frame.",   0, AV_OPT_TYPE_CONST, { .i64 = EOF_ACTION_REPEAT }, .flags = FLAGS, "eof_action" },
+        { "endall", "End both streams.",            0, AV_OPT_TYPE_CONST, { .i64 = EOF_ACTION_ENDALL }, .flags = FLAGS, "eof_action" },
+        { "pass",   "Pass through the main input.", 0, AV_OPT_TYPE_CONST, { .i64 = EOF_ACTION_PASS },   .flags = FLAGS, "eof_action" },
+    { "shortest", "force termination when the shortest input terminates", OFFSET(fs.opt_shortest), AV_OPT_TYPE_BOOL, { .i64 = 0 }, 0, 1, FLAGS },
+    { "repeatlast", "repeat overlay of the last overlay frame", OFFSET(fs.opt_repeatlast), AV_OPT_TYPE_BOOL, {.i64=1}, 0, 1, FLAGS },
+    { NULL },
+};
+
+FRAMESYNC_DEFINE_CLASS(overlay_cuda, OverlayCUDAContext, fs);
+
+static const AVFilterPad overlay_cuda_inputs[] = {
+    {
+        .name         = "main",
+        .type         = AVMEDIA_TYPE_VIDEO,
+    },
+    {
+        .name         = "overlay",
+        .type         = AVMEDIA_TYPE_VIDEO,
+    },
+    { NULL }
+};
+
+static const AVFilterPad overlay_cuda_outputs[] = {
+    {
+        .name          = "default",
+        .type          = AVMEDIA_TYPE_VIDEO,
+        .config_props  = &overlay_cuda_config_output,
+    },
+    { NULL }
+};
+
+AVFilter ff_vf_overlay_cuda = {
+    .name            = "overlay_cuda",
+    .description     = NULL_IF_CONFIG_SMALL("Overlay one video on top of another using CUDA"),
+    .priv_size       = sizeof(OverlayCUDAContext),
+    .priv_class      = &overlay_cuda_class,
+    .init            = &overlay_cuda_init,
+    .uninit          = &overlay_cuda_uninit,
+    .activate        = &overlay_cuda_activate,
+    .query_formats   = &overlay_cuda_query_formats,
+    .inputs          = overlay_cuda_inputs,
+    .outputs         = overlay_cuda_outputs,
+    .preinit         = overlay_cuda_framesync_preinit,
+    .flags_internal  = FF_FILTER_FLAG_HWFRAME_AWARE,
+};
diff --git a/libavfilter/vf_overlay_cuda.cu b/libavfilter/vf_overlay_cuda.cu
new file mode 100644
index 0000000000..43ec36c2ed
--- /dev/null
+++ b/libavfilter/vf_overlay_cuda.cu
@@ -0,0 +1,54 @@
+/*
+ * Copyright (c) 2020 Yaroslav Pogrebnyak <yyyaros...@gmail.com>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+extern "C" {
+
+__global__ void Overlay_Cuda(
+    int x_position, int y_position,
+    unsigned char* main, int main_linesize,
+    unsigned char* overlay, int overlay_linesize,
+    int overlay_w, int overlay_h,
+    unsigned char* overlay_alpha, int alpha_linesize,
+    int alpha_adj_x, int alpha_adj_y)
+{
+    int x = blockIdx.x * blockDim.x + threadIdx.x;
+    int y = blockIdx.y * blockDim.y + threadIdx.y;
+
+    if (x >= overlay_w + x_position ||
+        y >= overlay_h + y_position ||
+        x < x_position ||
+        y < y_position ) {
+
+        return;
+    }
+
+    int overlay_x = x - x_position;
+    int overlay_y = y - y_position;
+
+    float alpha = 1.0;
+    if (alpha_linesize) {
+        alpha = overlay_alpha[alpha_adj_x * overlay_x  + alpha_adj_y * overlay_y * alpha_linesize] / 255.0f;
+    }
+
+    main[x + y*main_linesize] = alpha * overlay[overlay_x + overlay_y * overlay_linesize] + (1.0f - alpha) * main[x + y*main_linesize];
+}
+
+}
+
diff --git a/libavutil/hwcontext_cuda.c b/libavutil/hwcontext_cuda.c
index a87c280cf7..3c4e36dde7 100644
--- a/libavutil/hwcontext_cuda.c
+++ b/libavutil/hwcontext_cuda.c
@@ -39,6 +39,7 @@ typedef struct CUDAFramesContext {
 static const enum AVPixelFormat supported_formats[] = {
     AV_PIX_FMT_NV12,
     AV_PIX_FMT_YUV420P,
+    AV_PIX_FMT_YUVA420P,
     AV_PIX_FMT_YUV444P,
     AV_PIX_FMT_P010,
     AV_PIX_FMT_P016,
@@ -274,7 +275,7 @@ static int cuda_transfer_data_to(AVHWFramesContext *ctx, AVFrame *dst,
             .srcPitch      = src->linesize[i],
             .dstPitch      = dst->linesize[i],
             .WidthInBytes  = FFMIN(src->linesize[i], dst->linesize[i]),
-            .Height        = src->height >> (i ? priv->shift_height : 0),
+            .Height        = src->height >> ( (i == 0 || i == 3) ? 0 : priv->shift_height),
         };
 
         ret = CHECK_CU(cu->cuMemcpy2DAsync(&cpy, hwctx->stream));
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".

Reply via email to