On 24/01/18 23:44, Ben Chang wrote:
> Hi,
> 
> Please help review this patch to reduce the stack frame size per GPU thread. The 
> default allocation size per thread (1024 bytes) is excessive and can be 
> reduced to 128 bytes based on NVIDIA CUDA kernel compilation statistics. This 
> should help with reducing video memory usage per CUDA context.
> 
> From b0b76b28b1af7dec0b5419ba9625085f7516e1a6 Mon Sep 17 00:00:00 2001
> From: Ben Chang <b...@nvidia.com>
> Date: Tue, 23 Jan 2018 19:45:59 -0800
> Subject: [PATCH] Reduce cuda context's stack frame size limit through
>  cuCtxSetLimit. The default stack limit is 1024 bytes per GPU thread. This
>  reduces the limit to 128 bytes as verified against current cuda kernel
>  compilation statistics. This will reduce local memory allocated per cuda context.
> 
> ---
>  compat/cuda/dynlink_cuda.h   | 10 ++++++++++
>  compat/cuda/dynlink_loader.h |  2 ++
>  libavcodec/nvenc.c           |  6 ++++++
>  libavutil/hwcontext_cuda.c   |  6 ++++++
>  4 files changed, 24 insertions(+)
> 
> diff --git a/compat/cuda/dynlink_cuda.h b/compat/cuda/dynlink_cuda.h
> index 3a13611..b08a777 100644
> --- a/compat/cuda/dynlink_cuda.h
> +++ b/compat/cuda/dynlink_cuda.h
> @@ -59,6 +59,15 @@ typedef enum CUmemorytype_enum {
>      CU_MEMORYTYPE_DEVICE = 2
>  } CUmemorytype;
>  
> +typedef enum CUlimit_enum {
> +    CU_LIMIT_STACK_SIZE                       = 0x00, /**< GPU thread stack 
> size */
> +    CU_LIMIT_PRINTF_FIFO_SIZE                 = 0x01, /**< GPU printf FIFO 
> size */
> +    CU_LIMIT_MALLOC_HEAP_SIZE                 = 0x02, /**< GPU malloc heap 
> size */
> +    CU_LIMIT_DEV_RUNTIME_SYNC_DEPTH           = 0x03, /**< GPU device 
> runtime launch synchronize depth */
> +    CU_LIMIT_DEV_RUNTIME_PENDING_LAUNCH_COUNT = 0x04, /**< GPU device 
> runtime pending launch count */
> +    CU_LIMIT_MAX
> +} CUlimit;
> +
>  typedef struct CUDA_MEMCPY2D_st {
>      size_t srcXInBytes;
>      size_t srcY;
> @@ -86,6 +95,7 @@ typedef CUresult CUDAAPI tcuDeviceGet(CUdevice *device, int 
> ordinal);
>  typedef CUresult CUDAAPI tcuDeviceGetName(char *name, int len, CUdevice dev);
>  typedef CUresult CUDAAPI tcuDeviceComputeCapability(int *major, int *minor, 
> CUdevice dev);
>  typedef CUresult CUDAAPI tcuCtxCreate_v2(CUcontext *pctx, unsigned int 
> flags, CUdevice dev);
> +typedef CUresult CUDAAPI tcuCtxSetLimit(CUlimit limit, size_t value);
>  typedef CUresult CUDAAPI tcuCtxPushCurrent_v2(CUcontext *pctx);
>  typedef CUresult CUDAAPI tcuCtxPopCurrent_v2(CUcontext *pctx);
>  typedef CUresult CUDAAPI tcuCtxDestroy_v2(CUcontext ctx);
> diff --git a/compat/cuda/dynlink_loader.h b/compat/cuda/dynlink_loader.h
> index fa43782..55030ef 100644
> --- a/compat/cuda/dynlink_loader.h
> +++ b/compat/cuda/dynlink_loader.h
> @@ -118,6 +118,7 @@ typedef struct CudaFunctions {
>      tcuDeviceGetName *cuDeviceGetName;
>      tcuDeviceComputeCapability *cuDeviceComputeCapability;
>      tcuCtxCreate_v2 *cuCtxCreate;
> +    tcuCtxSetLimit *cuCtxSetLimit;
>      tcuCtxPushCurrent_v2 *cuCtxPushCurrent;
>      tcuCtxPopCurrent_v2 *cuCtxPopCurrent;
>      tcuCtxDestroy_v2 *cuCtxDestroy;
> @@ -197,6 +198,7 @@ static inline int cuda_load_functions(CudaFunctions 
> **functions, void *logctx)
>      LOAD_SYMBOL(cuDeviceGetName, tcuDeviceGetName, "cuDeviceGetName");
>      LOAD_SYMBOL(cuDeviceComputeCapability, tcuDeviceComputeCapability, 
> "cuDeviceComputeCapability");
>      LOAD_SYMBOL(cuCtxCreate, tcuCtxCreate_v2, "cuCtxCreate_v2");
> +    LOAD_SYMBOL(cuCtxSetLimit, tcuCtxSetLimit, "cuCtxSetLimit");
>      LOAD_SYMBOL(cuCtxPushCurrent, tcuCtxPushCurrent_v2, 
> "cuCtxPushCurrent_v2");
>      LOAD_SYMBOL(cuCtxPopCurrent, tcuCtxPopCurrent_v2, "cuCtxPopCurrent_v2");
>      LOAD_SYMBOL(cuCtxDestroy, tcuCtxDestroy_v2, "cuCtxDestroy_v2");
> diff --git a/libavcodec/nvenc.c b/libavcodec/nvenc.c
> index 4a91d99..2da251b 100644
> --- a/libavcodec/nvenc.c
> +++ b/libavcodec/nvenc.c
> @@ -420,6 +420,12 @@ static av_cold int nvenc_check_device(AVCodecContext 
> *avctx, int idx)
>          goto fail;
>      }
>  
> +    cu_res = dl_fn->cuda_dl->cuCtxSetLimit(CU_LIMIT_STACK_SIZE, 128);
> +    if (cu_res != CUDA_SUCCESS) {
> +        av_log(avctx, AV_LOG_FATAL, "Failed reducing CUDA context stack 
> limit for NVENC: 0x%x\n", (int)cu_res);
> +        goto fail;
> +    }
> +
>      ctx->cu_context = ctx->cu_context_internal;
>  
>      if ((ret = nvenc_pop_context(avctx)) < 0)

Does this actually have any effect?  I was under the impression that the CUDA 
context created inside the NVENC encoder wouldn't actually be used for any CUDA 
operations at all (really just a GPU device handle).

> diff --git a/libavutil/hwcontext_cuda.c b/libavutil/hwcontext_cuda.c
> index 37827a7..1f022fa 100644
> --- a/libavutil/hwcontext_cuda.c
> +++ b/libavutil/hwcontext_cuda.c
> @@ -386,6 +386,12 @@ static int cuda_device_create(AVHWDeviceContext *ctx, 
> const char *device,
>          goto error;
>      }
>  
> +    err = cu->cuCtxSetLimit(CU_LIMIT_STACK_SIZE, 128);
> +    if (err != CUDA_SUCCESS) {
> +        av_log(ctx, AV_LOG_ERROR, "Error reducing CUDA context stack 
> limit\n");
> +        goto error;
> +    }
> +
>      cu->cuCtxPopCurrent(&dummy);
>  
>      hwctx->internal->is_allocated = 1;
> -- 
> 2.9.1
> 

This is technically a user-visible change, since it will apply to all user 
programs run on the CUDA context created here as well as those inside ffmpeg.  
I'm not sure how many people actually use that, though, so maybe it won't 
affect anyone.

If the stack limit is violated, what happens?  Will that be undefined behaviour 
with random effects (crash / incorrect results), or is it likely to be caught 
at program compile/load-time?

Thanks,

- Mark
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
http://ffmpeg.org/mailman/listinfo/ffmpeg-devel

Reply via email to