Commit: 0d750d7c064bbb1e1fb5fe2ae14a8496863a890b Author: Patrick Mours Date: Thu Feb 13 15:15:38 2020 +0100 Branches: master https://developer.blender.org/rB0d750d7c064bbb1e1fb5fe2ae14a8496863a890b
Fix OptiX denoising when multiple CUDA streams are active =================================================================== M intern/cycles/device/device_optix.cpp =================================================================== diff --git a/intern/cycles/device/device_optix.cpp b/intern/cycles/device/device_optix.cpp index fc32679e794..39110cc0959 100644 --- a/intern/cycles/device/device_optix.cpp +++ b/intern/cycles/device/device_optix.cpp @@ -119,17 +119,8 @@ struct KernelParams { threads = (int)sqrt((float)threads); \ int xblocks = ((w) + threads - 1) / threads; \ int yblocks = ((h) + threads - 1) / threads; \ - check_result_cuda_ret(cuLaunchKernel(func, \ - xblocks, \ - yblocks, \ - 1, \ - threads, \ - threads, \ - 1, \ - 0, \ - cuda_stream[thread_index], \ - args, \ - 0)); \ + check_result_cuda_ret( \ + cuLaunchKernel(func, xblocks, yblocks, 1, threads, threads, 1, 0, 0, args, 0)); \ } \ (void)0 @@ -195,7 +186,7 @@ class OptiXDevice : public CUDADevice { OptixTraversableHandle tlas_handle = 0; OptixDenoiser denoiser = NULL; - vector<pair<int2, CUdeviceptr>> denoiser_state; + pair<int2, CUdeviceptr> denoiser_state = {}; int denoiser_input_passes = 0; public: @@ -250,9 +241,6 @@ class OptiXDevice : public CUDADevice { launch_params.data_elements = sizeof(KernelParams); // Allocate launch parameter buffer memory on device launch_params.alloc_to_device(info.cpu_threads); - - // Create denoiser state entries for all threads (but do not allocate yet) - denoiser_state.resize(info.cpu_threads); } ~OptiXDevice() { @@ -267,9 +255,8 @@ class OptiXDevice : public CUDADevice { cuMemFree(mem); } - // Free denoiser state for all threads - for (const pair<int2, CUdeviceptr> &state : denoiser_state) { - cuMemFree(state.second); + if (denoiser_state.second) { + cuMemFree(denoiser_state.second); } sbt_data.free(); @@ -571,7 +558,7 @@ class OptiXDevice : public CUDADevice { if (tile.task == RenderTile::PATH_TRACE) launch_render(task, tile, thread_index); else if (tile.task 
== RenderTile::DENOISE) - launch_denoise(task, tile, thread_index); + launch_denoise(task, tile); task.release_tile(tile); if (task.get_cancel() && !task.need_finish_queue) break; // User requested cancellation @@ -596,7 +583,7 @@ class OptiXDevice : public CUDADevice { tile.stride = task.stride; tile.buffers = task.buffers; - launch_denoise(task, tile, thread_index); + launch_denoise(task, tile); } } @@ -670,7 +657,7 @@ class OptiXDevice : public CUDADevice { } } - bool launch_denoise(DeviceTask &task, RenderTile &rtile, int thread_index) + bool launch_denoise(DeviceTask &task, RenderTile &rtile) { // Update current sample (for display and NLM denoising task) rtile.sample = rtile.start_sample + rtile.num_samples; @@ -807,8 +794,8 @@ class OptiXDevice : public CUDADevice { check_result_optix_ret( optixDenoiserComputeMemoryResources(denoiser, rect_size.x, rect_size.y, &sizes)); - auto &state = denoiser_state[thread_index].second; - auto &state_size = denoiser_state[thread_index].first; + auto &state = denoiser_state.second; + auto &state_size = denoiser_state.first; const size_t scratch_size = sizes.recommendedScratchSizeInBytes; const size_t scratch_offset = sizes.stateSizeInBytes; @@ -824,7 +811,7 @@ class OptiXDevice : public CUDADevice { // Initialize denoiser state for the current tile size check_result_optix_ret(optixDenoiserSetup(denoiser, - cuda_stream[thread_index], + 0, rect_size.x, rect_size.y, state, @@ -872,7 +859,7 @@ class OptiXDevice : public CUDADevice { // Finally run denonising OptixDenoiserParams params = {}; // All parameters are disabled/zero check_result_optix_ret(optixDenoiserInvoke(denoiser, - cuda_stream[thread_index], + 0, &params, state, scratch_offset, @@ -902,12 +889,11 @@ class OptiXDevice : public CUDADevice { "kernel_cuda_filter_convert_from_rgb", rtiles[9].w, rtiles[9].h, output_args); # endif - check_result_cuda_ret(cuStreamSynchronize(cuda_stream[thread_index])); + check_result_cuda_ret(cuStreamSynchronize(0)); 
task.unmap_neighbor_tiles(rtiles, this); } else { - assert(thread_index == 0); // Run CUDA denoising kernels DenoisingTask denoising(this, task); CUDADevice::denoise(rtile, denoising); @@ -1436,6 +1422,15 @@ class OptiXDevice : public CUDADevice { void task_add(DeviceTask &task) override { + struct OptiXDeviceTask : public DeviceTask { + OptiXDeviceTask(OptiXDevice *device, DeviceTask &task, int task_index) : DeviceTask(task) + { + // Using task index parameter instead of thread index, since number of CUDA streams may + // differ from number of threads + run = function_bind(&OptiXDevice::thread_run, device, *this, task_index); + } + }; + // Upload texture information to device if it has changed since last launch load_texture_info(); @@ -1445,20 +1440,17 @@ class OptiXDevice : public CUDADevice { return; } + if (task.type == DeviceTask::DENOISE || task.type == DeviceTask::DENOISE_BUFFER) { + // Execute denoising in a single thread (e.g. to avoid race conditions during creation) + task_pool.push(new OptiXDeviceTask(this, task, 0)); + return; + } + // Split task into smaller ones list<DeviceTask> tasks; task.split(tasks, info.cpu_threads); // Queue tasks in internal task pool - struct OptiXDeviceTask : public DeviceTask { - OptiXDeviceTask(OptiXDevice *device, DeviceTask &task, int task_index) : DeviceTask(task) - { - // Using task index parameter instead of thread index, since number of CUDA streams may - // differ from number of threads - run = function_bind(&OptiXDevice::thread_run, device, *this, task_index); - } - }; - int task_index = 0; for (DeviceTask &task : tasks) task_pool.push(new OptiXDeviceTask(this, task, task_index++)); _______________________________________________ Bf-blender-cvs mailing list Bf-blender-cvs@blender.org https://lists.blender.org/mailman/listinfo/bf-blender-cvs