[Bf-blender-cvs] [28b8751] cycles_kernel_split: Support rendering of big tile sizes

varunsundar08 Wed, 15 Apr 2015 08:38:10 -0700

Commit: 28b87518a1ad20894b4e29fc50e04f0c424a125d
Author: varunsundar08
Date:   Wed Apr 8 19:40:47 2015 +0530
Branches: cycles_kernel_split
https://developer.blender.org/rB28b87518a1ad20894b4e29fc50e04f0c424a125d


Support rendering of big tile sizes

===================================================================

M       intern/cycles/device/device_opencl.cpp
M       intern/cycles/kernel/kernel_Background_BufferUpdate.cl
M       intern/cycles/kernel/kernel_DataInit.cl
M       intern/cycles/kernel/kernel_SumAllRadiance.cl
M       intern/cycles/render/buffers.h

===================================================================

diff --git a/intern/cycles/device/device_opencl.cpp 
b/intern/cycles/device/device_opencl.cpp
index 103f710..58de549 100644
--- a/intern/cycles/device/device_opencl.cpp
+++ b/intern/cycles/device/device_opencl.cpp
@@ -544,15 +544,6 @@ public:
        size_t PathState_size;
        size_t Intersection_size;
 
-       /* Volume of ShaderData; ShaderData (in split_kernel) is a
-        * Structure-Of-Arrays implementation; We need to calculate memory
-        * required for a single thread
-        */
-       size_t ShaderData_volume;
-
-       /* This is total ShaderClosure size required for one thread */
-       size_t ShaderClosure_size;
-
        /* Sizes of memory required for shadow blocked function */
        size_t AOAlpha_size;
        size_t AOBSDF_size;
@@ -562,7 +553,7 @@ public:
        size_t Intersection_coop_AO_size;
        size_t Intersection_coop_DL_size;
 
-       /* This is sizeof_output_buffer / tile_size */
+       /* Amount of memory in output buffer associated with one pixel */
        size_t per_thread_output_buffer_size;
 
        /* Total allocatable available device memory */
@@ -595,9 +586,6 @@ public:
        unsigned int max_work_groups;
 #endif
 
-       /* Flag denoting if rendering the scene with current tile size is 
possible */
-       bool cannot_render_scene;
-
        /* Marked True in constructor and marked false at the end of 
path_trace() */
        bool first_tile;
 
@@ -857,6 +845,7 @@ public:
                Intersection_coop_DL_size = sizeof(Intersection);
 
                per_thread_output_buffer_size = 0;
+
                per_thread_memory = 0;
                render_scene_input_data_size = 0;
                hostRayStateArray = NULL;
@@ -865,7 +854,6 @@ public:
                work_pool_wgs = NULL;
                max_work_groups = 0;
 #endif
-               cannot_render_scene = false;
                first_tile = true;
 
 #else
@@ -2396,6 +2384,37 @@ public:
        }
 #endif
 
+#ifdef __SPLIT_KERNEL__
+       /* Returns size of KernelGlobals structure associated with OpenCL */
+       size_t get_KernelGlobals_size() {
+               /* Copy dummy KernelGlobals related to OpenCL from 
kernel_globals.h to fetch its size */
+               typedef struct KernelGlobals {
+                       ccl_constant KernelData *data;
+#define KERNEL_TEX(type, ttype, name) \
+                       ccl_global type *name;
+#include "kernel_textures.h"
+               } KernelGlobals;
+
+               return sizeof(KernelGlobals);
+       }
+
+       /* Returns size of the Structure-of-Arrays implementation of ShaderData */
+       size_t get_shaderdata_soa_size() {
+               size_t num_shader_soa_ptr = SD_NUM_FLOAT3 + SD_NUM_INT + 
SD_NUM_FLOAT
+#ifdef __DPDU__
+                       + SD_NUM_DPDU_FLOAT3
+#endif
+#ifdef __RAY_DIFFERENTIAL__
+                       + SD_NUM_RAY_DIFFERENTIALS_DIFFERENTIAL3
+                       + SD_NUM_DIFFERENTIAL
+#endif
+                       + SD_NUM_RAY_DP_DIFFERENTIAL3;
+
+               return (num_shader_soa_ptr * sizeof(void *));
+       }
+
+#endif
+
        void path_trace(RenderTile& rtile, int sample)
        {
                /* cast arguments to cl types */
@@ -2409,11 +2428,6 @@ public:
                cl_int d_offset = rtile.offset;
                cl_int d_stride = rtile.stride;
 #ifdef __SPLIT_KERNEL__
-               (void)sample;
-
-               if(cannot_render_scene) {
-                       return;
-               }
 
                /* ray_state and hostRayStateArray should be of same size */
                assert(hostRayState_size == rayState_size);
@@ -2422,25 +2436,39 @@ public:
                size_t global_size[2];
                size_t local_size[2] = { SPLIT_KERNEL_LOCAL_SIZE_X, 
SPLIT_KERNEL_LOCAL_SIZE_Y };
 
+               /* Set the range of samples to be processed for every ray in 
path-regeneration logic */
+               cl_int start_sample = rtile.start_sample;
+               cl_int end_sample = rtile.start_sample + rtile.num_samples;
+               cl_int num_samples = rtile.num_samples;
+
+#ifdef __WORK_STEALING__
+               global_size[0] = (((d_w - 1) / local_size[0]) + 1) * 
local_size[0];
+               global_size[1] = (((d_h - 1) / local_size[1]) + 1) * 
local_size[1];
+               unsigned int num_parallel_samples = 1;
+#else
+               /* We may not need all global_size[0] threads; we only need as 
many as num_parallel_samples * d_w */
+               global_size[0] = num_parallel_samples * d_w;
+               global_size[0] = (((global_size[0] - 1) / local_size[0]) + 1) * 
local_size[0];
+
+               assert(global_size[0] * global_size[1] <= num_parallel_threads);
+               assert(global_size[0] * global_size[1] >= d_w * d_h);
+#endif // __WORK_STEALING__
+
+               /* Allocate all required global memory once */
                if(first_tile) {
+                       size_t num_global_elements = 
rtile.max_render_feasible_tile_size.x * rtile.max_render_feasible_tile_size.y;
 
 #ifdef __MULTI_CLOSURE__
-                       ShaderClosure_size = get_shader_closure_size(clos_max);
+                       size_t ShaderClosure_size = 
get_shader_closure_size(clos_max);
 #else
-                       ShaderClosure_size = 
get_shader_closure_size(MAX_CLOSURE);
+                       size_t ShaderClosure_size = 
get_shader_closure_size(MAX_CLOSURE);
 #endif
-                       ShaderData_volume = 
get_shader_data_size(ShaderClosure_size);
-
-                       /* Determine texture memories once */
-#define KERNEL_TEX(type, ttype, name) \
-                       render_scene_input_data_size += get_tex_size(#name);
-#include "kernel_textures.h"
 
 #ifdef __WORK_STEALING__
                        /* Calculate max groups */
                        size_t max_global_size[2];
-                       size_t tile_x = rtile.tile_size.x;
-                       size_t tile_y = rtile.tile_size.y;
+                       size_t tile_x = rtile.max_render_feasible_tile_size.x;
+                       size_t tile_y = rtile.max_render_feasible_tile_size.y;
                        max_global_size[0] = (((tile_x - 1) / local_size[0]) + 
1) * local_size[0];
                        max_global_size[1] = (((tile_y - 1) / local_size[1]) + 
1) * local_size[1];
                        max_work_groups = (max_global_size[0] * 
max_global_size[1]) / (local_size[0] * local_size[1]);
@@ -2457,457 +2485,314 @@ public:
                        use_queues_flag = clCreateBuffer(cxContext, 
CL_MEM_READ_WRITE, sizeof(char), NULL, &ciErr);
                        assert(ciErr == CL_SUCCESS && "Can't create 
use_queues_flag memory");
 
-                       /* Calculate per thread memory */
-                       size_t output_buffer_size = 0;
-                       ciErr = clGetMemObjectInfo(d_buffer, CL_MEM_SIZE, 
sizeof(output_buffer_size), &output_buffer_size, NULL);
-                       assert(ciErr == CL_SUCCESS && "Can't get d_buffer mem 
object info");
-
-                       /* This value is different when running on AMD and NV */
-                       per_thread_output_buffer_size = output_buffer_size / 
(d_w * d_h);
-
-                       per_thread_memory = rng_size + throughput_size + 
L_transparent_size + rayState_size + work_element_size
-                                + ISLamp_size + PathRadiance_size + Ray_size + 
PathState_size
-                                + Intersection_size                  /* 
Overall isect */
-                                + Intersection_coop_AO_size          /* 
Instersection_coop_AO */
-                                + Intersection_coop_DL_size          /* 
Intersection coop DL */
-                                + ShaderData_volume       /* Overall 
ShaderData */
-                                + ShaderData_volume       /* 
ShaderData_coop_DL */
-                                + (ShaderData_volume * 2) /* ShaderData coop 
shadow */
-                                + LightRay_size + BSDFEval_size + AOAlpha_size 
+ AOBSDF_size + AOLightRay_size
-                                + (sizeof(int) * NUM_QUEUES)
-                                + per_thread_output_buffer_size;
-
-                       int user_set_tile_w = rtile.tile_size.x;
-                       int user_set_tile_h = rtile.tile_size.y;
-
-                       total_allocatable_parallel_sample_processing_memory = 
total_allocatable_memory
-                       - sizeof(int)* NUM_QUEUES                               
                 /* Queue index size */
-                       - sizeof(char)                                          
                 /* use_queues */
-                       -render_scene_input_data_size                           
                 /* size for textures, bvh etc */
-                       - (user_set_tile_w * user_set_tile_h) * 
per_thread_output_buffer_size    /* max d_buffer size possible */
-                       - (user_set_tile_w * user_set_tile_h) * sizeof(RNG)     
                 /* max d_rng_state size possible */
-#ifdef __WORK_STEALING__
-                       - max_work_groups * sizeof(unsigned int)
-#endif
-                       - DATA_ALLOCATION_MEM_FACTOR;
-               }
-
-               /* Set the range of samples to be processed for every ray in 
path-regeneration logic */
-               cl_int start_sample = rtile.start_sample;
-               cl_int end_sample = rtile.start_sample + rtile.num_samples;
-               cl_int num_samples = rtile.num_samples;
-
-#ifdef __WORK_STEALING__
-               /* TODO : support dynamic num_parallel_samples in work_stealing
-                * Do not change the values of 
num_parallel_samples/num_parallel_threads
-                */
-               unsigned int num_parallel_samples = 0;
-               global_size[0] = (((rtile.tile_size.x - 1) / local_size[0]) + 
1) * local_size[0];
-               global_size[1] = (((rtile.tile_size.y - 1) / local_size[1]) + 
1) * local_size[1];
-               unsigned int num_parallel_threads = global_size[0] * 
global_size[1];
-
-               /* Check if we can process atleast one sample */
-               num_parallel_samples = 
(total_allocatable_parallel_sample_processing_memory / (per_thread_memory * 
num_parallel_threads));
-               num_parallel_samples = (num_parallel_samples > 0) ? 1 : 0;
-#else
-               unsigned int num_parallel_threads = 
total_allocatable_parallel_sample_processing_memory / per_thread_memory;
-
-               /* Estimate maximum global work size that can be launched */
-               global_size[1] = (((d_h - 1) / local_size[1]) + 1) * 
local_size[1];
-               global_size[0] = num_parallel_threads / global_size[1];
-               global_size[0] = (global_size[0] / local_size[0]) * 
local_size[0];
-
-               /* Estimate number of parallel samples that can be processed in 
parallel */
-               unsigned int num_parallel_samples = (global_size[0] / d_w) <= 
rtile.num_samples ? (global_size[0] / d_w) : rtile.num_samples;
-               /* Wavefront size in AMD is 64 */
-               num_parallel_samples = ((num_parallel_samples / 64) == 0) ?
-                       num_parallel_samples :
-                       (num_parallel_samples / 64) * 64;
-#endif
-
-               if(num_parallel_samples == 0) {
-                       /* Rough estimate maximum rectangular tile size for 
this scene, to report to the user */
-                       size_t scene_alloc_memory = total_allocatable_memory
-                               - sizeof(int)* NUM_QUEUES
-                               - sizeof(char)
-                               -render_scene_input_data_size
-                               - DATA_ALLOCATION_MEM_FACTOR;
-                       unsigned int tile_max_x = 8, tile_max_y = 8;
-                       bool max_rect_tile_reached = false;
-                       while(!max_rect_tile_reached) {
-                               unsigned int num_parallel_samples_possible = 0;
-#ifdef __WORK_STEALING__
-                               unsigned int current_max_global_size[2];
-                               current_max_global_size[0] = (((tile_max_x - 1) 
/ local_size[0]) + 1) * local_size[0];
-                               current_max_global_size[1] = (((tile_max_y - 1) 
/ local_size[1]) + 1) * local_size[1];
-                               unsigned int current_max_work_groups = 
(current_max_global_size[0] * current_max_global_size[1]) / (local_size[0] * 
local_size[1]);
-#endif
-                               size_t memory_for_parallel_sample_processing = 
scene_alloc_memory
-#ifdef __WORK_STEALING__
-                                       - current_max_work_groups * 
sizeof(unsigned int)
-#endif
-                                       - (tile_max_x * tile_max_y) * 
per_thread_output_buffer_size
-                                       - (tile_max_x * tile_max_y) * size

@@ Diff output truncated at 10240 characters. @@

_______________________________________________
Bf-blender-cvs mailing list
Bf-blender-cvs@blender.org
http://lists.blender.org/mailman/listinfo/bf-blender-cvs

[Bf-blender-cvs] [28b8751] cycles_kernel_split: Support rendering of big tile sizes

Reply via email to