Commit: f56562043521a5c160585aea3f28167b4d3bc77d
Author: Patrick Mours
Date:   Wed Nov 10 14:37:15 2021 +0100
Branches: master
https://developer.blender.org/rBf56562043521a5c160585aea3f28167b4d3bc77d

Fix T92985: CUDA errors with Cycles film convert kernels

rB3a4c8f406a3a3bf0627477c6183a594fa707a6e2 changed the macros that create the
film convert kernel entry points, but in the process accidentally changed the
parameter definition of one of those (which caused CUDA launch and misaligned
address errors) and changed the implementation as well. This restores the
correct implementation from before.
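
For context, a film convert entry point is declared by composing the
`ccl_gpu_kernel` and `ccl_gpu_kernel_signature` macros from the device headers;
the kernel name and parameter list in this sketch are illustrative only (the
signature that was actually broken is not visible in the truncated diff below):

    ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
        ccl_gpu_kernel_signature(film_convert_example, /* hypothetical name */
                                 ccl_global float *render_buffer,
                                 int width)
    {
      /* The host launches the kernel with a fixed argument layout, so any drift
       * in this parameter list surfaces at runtime as a CUDA launch failure or
       * a misaligned address error rather than at compile time. */
      const int pixel_index = ccl_gpu_global_id_x();
      if (pixel_index < width) {
        render_buffer[pixel_index] = 0.0f; /* placeholder body */
      }
    }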

In addition, the `ccl_gpu_kernel_threads` macro did not work as intended and
caused the generated launch bounds to end up with an incorrect input for the
second parameter (it was set to "thread_num_registers", rather than the result
of the block number calculation). I'm not entirely sure why, as the macro
definition looked sound to me. Decided to simply go with two separate macros
instead, to simplify and solve this.
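
For reference, the two macros as they now stand in the CUDA config header; the
register count and example kernel below are illustrative only (Cycles defines
GPU_MULTIPRESSOR_MAX_REGISTERS per device in the real headers):

    #define GPU_MULTIPRESSOR_MAX_REGISTERS 65536 /* illustrative value */

    /* Register-budgeted variant: the second __launch_bounds__ argument is the
     * minimum number of resident blocks per multiprocessor, derived from the
     * per-thread register budget. */
    #define ccl_gpu_kernel(block_num_threads, thread_num_registers) \
      extern "C" __global__ void __launch_bounds__(block_num_threads, \
                                                   GPU_MULTIPRESSOR_MAX_REGISTERS / \
                                                       (block_num_threads * thread_num_registers))

    /* Thread-count-only variant, used for the parallel index/sort kernels. */
    #define ccl_gpu_kernel_threads(block_num_threads) \
      extern "C" __global__ void __launch_bounds__(block_num_threads)

    /* Hypothetical kernel: 64 threads per block, budgeted at 48 registers per
     * thread, giving __launch_bounds__(64, 65536 / (64 * 48)). */
    ccl_gpu_kernel(64, 48) kernel_gpu_example(float *out)
    {
      out[threadIdx.x + blockIdx.x * blockDim.x] = 0.0f;
    }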

Also changed how state is captured with the `ccl_gpu_kernel_lambda` macro
slightly, to avoid a compiler warning (expression has no effect) that otherwise
occurred.
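
Concretely, the macro now stops at the instance declaration and call sites set
the captured state explicitly on `ccl_gpu_kernel_lambda_pass` (this is the
pattern visible in kernel.h below):

    ccl_gpu_kernel_lambda(INTEGRATOR_STATE(state, path, queued_kernel) == kernel_index,
                          int kernel_index);
    ccl_gpu_kernel_lambda_pass.kernel_index = kernel_index;

Previously the macro expansion itself ended in a bare `ccl_gpu_kernel_lambda_pass`
expression, so uses without a chained member assignment left a statement with no
effect, which is what triggered the warning.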

Maniphest Tasks: T92985

Differential Revision: https://developer.blender.org/D13175

===================================================================

M       intern/cycles/kernel/CMakeLists.txt
M       intern/cycles/kernel/device/cuda/config.h
M       intern/cycles/kernel/device/gpu/kernel.h
M       intern/cycles/kernel/device/hip/config.h
M       intern/cycles/kernel/device/metal/compat.h

===================================================================

diff --git a/intern/cycles/kernel/CMakeLists.txt b/intern/cycles/kernel/CMakeLists.txt
index f311b0e74bb..39cb886b16e 100644
--- a/intern/cycles/kernel/CMakeLists.txt
+++ b/intern/cycles/kernel/CMakeLists.txt
@@ -379,7 +379,6 @@ if(WITH_CYCLES_CUDA_BINARIES)
     ${SRC_KERNEL_HEADERS}
     ${SRC_KERNEL_DEVICE_GPU_HEADERS}
     ${SRC_KERNEL_DEVICE_CUDA_HEADERS}
-    ${SRC_KERNEL_DEVICE_METAL_HEADERS}
     ${SRC_UTIL_HEADERS}
   )
   set(cuda_cubins)
diff --git a/intern/cycles/kernel/device/cuda/config.h b/intern/cycles/kernel/device/cuda/config.h
index e333fe90332..003881d7912 100644
--- a/intern/cycles/kernel/device/cuda/config.h
+++ b/intern/cycles/kernel/device/cuda/config.h
@@ -92,25 +92,19 @@
 
 /* Compute number of threads per block and minimum blocks per multiprocessor
  * given the maximum number of registers per thread. */
-
-#define ccl_gpu_kernel_threads(block_num_threads) \
-  extern "C" __global__ void __launch_bounds__(block_num_threads)
-
-#define ccl_gpu_kernel_threads_registers(block_num_threads, thread_num_registers) \
+#define ccl_gpu_kernel(block_num_threads, thread_num_registers) \
   extern "C" __global__ void __launch_bounds__(block_num_threads, \
                                                GPU_MULTIPRESSOR_MAX_REGISTERS / \
                                                    (block_num_threads * thread_num_registers))
 
-/* allow ccl_gpu_kernel to accept 1 or 2 parameters */
-#define SELECT_MACRO(_1, _2, NAME, ...) NAME
-#define ccl_gpu_kernel(...) \
-  SELECT_MACRO(__VA_ARGS__, ccl_gpu_kernel_threads_registers, ccl_gpu_kernel_threads)(__VA_ARGS__)
+#define ccl_gpu_kernel_threads(block_num_threads) \
+  extern "C" __global__ void __launch_bounds__(block_num_threads)
 
 #define ccl_gpu_kernel_signature(name, ...) kernel_gpu_##name(__VA_ARGS__)
 
 #define ccl_gpu_kernel_call(x) x
 
-/* define a function object where "func" is the lambda body, and additional parameters are used to
+/* Define a function object where "func" is the lambda body, and additional parameters are used to
  * specify captured state  */
 #define ccl_gpu_kernel_lambda(func, ...) \
   struct KernelLambda { \
@@ -119,8 +113,7 @@
     { \
       return (func); \
     } \
-  } ccl_gpu_kernel_lambda_pass; \
-  ccl_gpu_kernel_lambda_pass
+  } ccl_gpu_kernel_lambda_pass
 
 /* sanity checks */
 
diff --git a/intern/cycles/kernel/device/gpu/kernel.h b/intern/cycles/kernel/device/gpu/kernel.h
index 2ec6a49ec7b..e954178ec63 100644
--- a/intern/cycles/kernel/device/gpu/kernel.h
+++ b/intern/cycles/kernel/device/gpu/kernel.h
@@ -56,8 +56,7 @@
  */
 
 ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
-ccl_gpu_kernel_signature(integrator_reset,
-                          int num_states)
+    ccl_gpu_kernel_signature(integrator_reset, int num_states)
 {
   const int state = ccl_gpu_global_id_x();
 
@@ -265,7 +264,7 @@ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
   }
 }
 
-ccl_gpu_kernel(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE)
+ccl_gpu_kernel_threads(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE)
     ccl_gpu_kernel_signature(integrator_queued_paths_array,
                              int num_states,
                              ccl_global int *indices,
@@ -273,14 +272,14 @@ ccl_gpu_kernel(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE)
                              int kernel_index)
 {
   ccl_gpu_kernel_lambda(INTEGRATOR_STATE(state, path, queued_kernel) == kernel_index,
-                        int kernel_index)
-      .kernel_index = kernel_index;
+                        int kernel_index);
+  ccl_gpu_kernel_lambda_pass.kernel_index = kernel_index;
 
   gpu_parallel_active_index_array<GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE>(
       num_states, indices, num_indices, ccl_gpu_kernel_lambda_pass);
 }
 
-ccl_gpu_kernel(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE)
+ccl_gpu_kernel_threads(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE)
     ccl_gpu_kernel_signature(integrator_queued_shadow_paths_array,
                              int num_states,
                              ccl_global int *indices,
@@ -288,25 +287,26 @@ ccl_gpu_kernel(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE)
                              int kernel_index)
 {
   ccl_gpu_kernel_lambda(INTEGRATOR_STATE(state, shadow_path, queued_kernel) == kernel_index,
-                        int kernel_index)
-      .kernel_index = kernel_index;
+                        int kernel_index);
+  ccl_gpu_kernel_lambda_pass.kernel_index = kernel_index;
 
   gpu_parallel_active_index_array<GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE>(
       num_states, indices, num_indices, ccl_gpu_kernel_lambda_pass);
 }
 
-ccl_gpu_kernel(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE)
+ccl_gpu_kernel_threads(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE)
     ccl_gpu_kernel_signature(integrator_active_paths_array,
                              int num_states,
                              ccl_global int *indices,
                              ccl_global int *num_indices)
 {
   ccl_gpu_kernel_lambda(INTEGRATOR_STATE(state, path, queued_kernel) != 0);
+
   gpu_parallel_active_index_array<GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE>(
       num_states, indices, num_indices, ccl_gpu_kernel_lambda_pass);
 }
 
-ccl_gpu_kernel(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE)
+ccl_gpu_kernel_threads(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE)
     ccl_gpu_kernel_signature(integrator_terminated_paths_array,
                              int num_states,
                              ccl_global int *indices,
@@ -314,11 +314,12 @@ ccl_gpu_kernel(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE)
                              int indices_offset)
 {
   ccl_gpu_kernel_lambda(INTEGRATOR_STATE(state, path, queued_kernel) == 0);
+
   gpu_parallel_active_index_array<GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE>(
       num_states, indices + indices_offset, num_indices, ccl_gpu_kernel_lambda_pass);
 }
 
-ccl_gpu_kernel(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE)
+ccl_gpu_kernel_threads(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE)
     ccl_gpu_kernel_signature(integrator_terminated_shadow_paths_array,
                              int num_states,
                              ccl_global int *indices,
@@ -326,11 +327,12 @@ ccl_gpu_kernel(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE)
                              int indices_offset)
 {
   ccl_gpu_kernel_lambda(INTEGRATOR_STATE(state, shadow_path, queued_kernel) == 0);
+
   gpu_parallel_active_index_array<GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE>(
       num_states, indices + indices_offset, num_indices, ccl_gpu_kernel_lambda_pass);
 }
 
-ccl_gpu_kernel(GPU_PARALLEL_SORTED_INDEX_DEFAULT_BLOCK_SIZE)
+ccl_gpu_kernel_threads(GPU_PARALLEL_SORTED_INDEX_DEFAULT_BLOCK_SIZE)
     ccl_gpu_kernel_signature(integrator_sorted_paths_array,
                              int num_states,
                              int num_states_limit,
@@ -343,37 +345,37 @@ ccl_gpu_kernel(GPU_PARALLEL_SORTED_INDEX_DEFAULT_BLOCK_SIZE)
   ccl_gpu_kernel_lambda((INTEGRATOR_STATE(state, path, queued_kernel) == kernel_index) ?
                             INTEGRATOR_STATE(state, path, shader_sort_key) :
                             GPU_PARALLEL_SORTED_INDEX_INACTIVE_KEY,
-                        int kernel_index)
-    .kernel_index = kernel_index;
-  
+                        int kernel_index);
+  ccl_gpu_kernel_lambda_pass.kernel_index = kernel_index;
+
   const uint state_index = ccl_gpu_global_id_x();
-  gpu_parallel_sorted_index_array(
-      state_index,
-      num_states,
-      num_states_limit,
-      indices,
+  gpu_parallel_sorted_index_array(state_index,
+                                  num_states,
+                                  num_states_limit,
+                                  indices,
                                   num_indices,
                                   key_counter,
                                   key_prefix_sum,
                                   ccl_gpu_kernel_lambda_pass);
 }
 
-ccl_gpu_kernel(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE)
+ccl_gpu_kernel_threads(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE)
     ccl_gpu_kernel_signature(integrator_compact_paths_array,
                              int num_states,
                              ccl_global int *indices,
-                         ccl_global int *num_indices,
-                         int num_active_paths)
+                             ccl_global int *num_indices,
+                             int num_active_paths)
 {
-  ccl_gpu_kernel_lambda((state >= num_active_paths) && (INTEGRATOR_STATE(state, path, queued_kernel) != 0),
-                        int num_active_paths)
-    .num_active_paths = num_active_paths;
-  
+  ccl_gpu_kernel_lambda((state >= num_active_paths) &&
+                            (INTEGRATOR_STATE(state, path, queued_kernel) != 0),
+                        int num_active_paths);
+  ccl_gpu_kernel_lambda_pass.num_active_paths = num_active_paths;
+
   gpu_parallel_active_index_array<GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE>(
       num_states, indices, num_indices, ccl_gpu_kernel_lambda_pass);
 }
 
-ccl_gpu_kernel(GPU_PARALLEL_SORTED_INDEX_DEFAULT_BLOCK_SIZE)
+ccl_gpu_kernel_threads(GPU_PARALLEL_SORTED_INDEX_DEFAULT_BLOCK_SIZE)
     ccl_gpu_kernel_signature(integrator_compact_states,
                              ccl_global const int *active_terminated_states,
                              const int active_states_offset,
@@ -390,22 +392,23 @@ ccl_gpu_kernel(GPU_PARALLEL_SORTED_INDEX_DEFAULT_BLOCK_SIZE)
   }
 }
 
-ccl_gpu_kernel(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE)
+ccl_gpu_kernel_threads(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE)
     ccl_gpu_kernel_signature(integrator_compact_shadow_paths_array,
                              int num_states,
                              ccl_global int *indices,
-                         ccl_global int *num_indices,
-                         int num_active_paths)
+                             ccl_global int *num_indices,
+                             int num_active_paths)
 {
-  ccl_gpu_kernel_lambda((state >= num_active_paths) && (INTEGRATOR_STATE(state, shadow_path, queued_kernel) != 0),
-                       

@@ Diff output truncated at 10240 characters. @@
