Commit: 28b87518a1ad20894b4e29fc50e04f0c424a125d Author: varunsundar08 Date: Wed Apr 8 19:40:47 2015 +0530 Branches: cycles_kernel_split https://developer.blender.org/rB28b87518a1ad20894b4e29fc50e04f0c424a125d
Support rendering of big tile sizes =================================================================== M intern/cycles/device/device_opencl.cpp M intern/cycles/kernel/kernel_Background_BufferUpdate.cl M intern/cycles/kernel/kernel_DataInit.cl M intern/cycles/kernel/kernel_SumAllRadiance.cl M intern/cycles/render/buffers.h =================================================================== diff --git a/intern/cycles/device/device_opencl.cpp b/intern/cycles/device/device_opencl.cpp index 103f710..58de549 100644 --- a/intern/cycles/device/device_opencl.cpp +++ b/intern/cycles/device/device_opencl.cpp @@ -544,15 +544,6 @@ public: size_t PathState_size; size_t Intersection_size; - /* Volume of ShaderData; ShaderData (in split_kernel) is a - * Structure-Of-Arrays implementation; We need to calculate memory - * required for a single thread - */ - size_t ShaderData_volume; - - /* This is total ShaderClosure size required for one thread */ - size_t ShaderClosure_size; - /* Sizes of memory required for shadow blocked function */ size_t AOAlpha_size; size_t AOBSDF_size; @@ -562,7 +553,7 @@ public: size_t Intersection_coop_AO_size; size_t Intersection_coop_DL_size; - /* This is sizeof_output_buffer / tile_size */ + /* Amount of memory in output buffer associated with one pixel */ size_t per_thread_output_buffer_size; /* Total allocatable available device memory */ @@ -595,9 +586,6 @@ public: unsigned int max_work_groups; #endif - /* Flag denoting if rendering the scene with current tile size is possible */ - bool cannot_render_scene; - /* Marked True in constructor and marked false at the end of path_trace() */ bool first_tile; @@ -857,6 +845,7 @@ public: Intersection_coop_DL_size = sizeof(Intersection); per_thread_output_buffer_size = 0; + per_thread_memory = 0; render_scene_input_data_size = 0; hostRayStateArray = NULL; @@ -865,7 +854,6 @@ public: work_pool_wgs = NULL; max_work_groups = 0; #endif - cannot_render_scene = false; first_tile = true; #else @@ -2396,6 +2384,37 @@ public: } #endif +#ifdef __SPLIT_KERNEL__ + /* Returns size of KernelGlobals structure associated with OpenCL */ + size_t get_KernelGlobals_size() { + /* Copy dummy KernelGlobals related to OpenCL from kernel_globals.h to fetch its size */ + typedef struct KernelGlobals { + ccl_constant KernelData *data; +#define KERNEL_TEX(type, ttype, name) \ + ccl_global type *name; +#include "kernel_textures.h" + } KernelGlobals; + + return sizeof(KernelGlobals); + } + + /* Returns size of Structure of arrays implementation of */ + size_t get_shaderdata_soa_size() { + size_t num_shader_soa_ptr = SD_NUM_FLOAT3 + SD_NUM_INT + SD_NUM_FLOAT +#ifdef __DPDU__ + + SD_NUM_DPDU_FLOAT3 +#endif +#ifdef __RAY_DIFFERENTIAL__ + + SD_NUM_RAY_DIFFERENTIALS_DIFFERENTIAL3 + + SD_NUM_DIFFERENTIAL +#endif + + SD_NUM_RAY_DP_DIFFERENTIAL3; + + return (num_shader_soa_ptr * sizeof(void *)); + } + +#endif + void path_trace(RenderTile& rtile, int sample) { /* cast arguments to cl types */ @@ -2409,11 +2428,6 @@ public: cl_int d_offset = rtile.offset; cl_int d_stride = rtile.stride; #ifdef __SPLIT_KERNEL__ - (void)sample; - - if(cannot_render_scene) { - return; - } /* ray_state and hostRayStateArray should be of same size */ assert(hostRayState_size == rayState_size); @@ -2422,25 +2436,39 @@ public: size_t global_size[2]; size_t local_size[2] = { SPLIT_KERNEL_LOCAL_SIZE_X, SPLIT_KERNEL_LOCAL_SIZE_Y }; + /* Set the range of samples to be processed for every ray in path-regeneration logic */ + cl_int start_sample = rtile.start_sample; + cl_int end_sample = rtile.start_sample + rtile.num_samples; + cl_int num_samples = rtile.num_samples; + +#ifdef __WORK_STEALING__ + global_size[0] = (((d_w - 1) / local_size[0]) + 1) * local_size[0]; + global_size[1] = (((d_h - 1) / local_size[1]) + 1) * local_size[1]; + unsigned int num_parallel_samples = 1; +#else + /* We may not need all global_size[0] threads; We only need as much as num_parallel_samples * d_w */ + global_size[0] = num_parallel_samples * d_w; + global_size[0] = (((global_size[0] - 1) / local_size[0]) + 1) * local_size[0]; + + assert(global_size[0] * global_size[1] <= num_parallel_threads); + assert(global_size[0] * global_size[1] >= d_w * d_h); +#endif // __WORK_STEALING__ + + /* Allocate all required global memory once */ if(first_tile) { + size_t num_global_elements = rtile.max_render_feasible_tile_size.x * rtile.max_render_feasible_tile_size.y; #ifdef __MULTI_CLOSURE__ - ShaderClosure_size = get_shader_closure_size(clos_max); + size_t ShaderClosure_size = get_shader_closure_size(clos_max); #else - ShaderClosure_size = get_shader_closure_size(MAX_CLOSURE); + size_t ShaderClosure_size = get_shader_closure_size(MAX_CLOSURE); #endif - ShaderData_volume = get_shader_data_size(ShaderClosure_size); - - /* Determine texture memories once */ -#define KERNEL_TEX(type, ttype, name) \ - render_scene_input_data_size += get_tex_size(#name); -#include "kernel_textures.h" #ifdef __WORK_STEALING__ /* Calculate max groups */ size_t max_global_size[2]; - size_t tile_x = rtile.tile_size.x; - size_t tile_y = rtile.tile_size.y; + size_t tile_x = rtile.max_render_feasible_tile_size.x; + size_t tile_y = rtile.max_render_feasible_tile_size.y; max_global_size[0] = (((tile_x - 1) / local_size[0]) + 1) * local_size[0]; max_global_size[1] = (((tile_y - 1) / local_size[1]) + 1) * local_size[1]; max_work_groups = (max_global_size[0] * max_global_size[1]) / (local_size[0] * local_size[1]); @@ -2457,457 +2485,314 @@ public: use_queues_flag = clCreateBuffer(cxContext, CL_MEM_READ_WRITE, sizeof(char), NULL, &ciErr); assert(ciErr == CL_SUCCESS && "Can't create use_queues_flag memory"); - /* Calculate per thread memory */ - size_t output_buffer_size = 0; - ciErr = clGetMemObjectInfo(d_buffer, CL_MEM_SIZE, sizeof(output_buffer_size), &output_buffer_size, NULL); - assert(ciErr == CL_SUCCESS && "Can't get d_buffer mem object info"); - - /* This value is different when running on AMD and NV */ - per_thread_output_buffer_size = output_buffer_size / (d_w * d_h); - - per_thread_memory = rng_size + throughput_size + L_transparent_size + rayState_size + work_element_size - + ISLamp_size + PathRadiance_size + Ray_size + PathState_size - + Intersection_size /* Overall isect */ - + Intersection_coop_AO_size /* Instersection_coop_AO */ - + Intersection_coop_DL_size /* Intersection coop DL */ - + ShaderData_volume /* Overall ShaderData */ - + ShaderData_volume /* ShaderData_coop_DL */ - + (ShaderData_volume * 2) /* ShaderData coop shadow */ - + LightRay_size + BSDFEval_size + AOAlpha_size + AOBSDF_size + AOLightRay_size - + (sizeof(int) * NUM_QUEUES) - + per_thread_output_buffer_size; - - int user_set_tile_w = rtile.tile_size.x; - int user_set_tile_h = rtile.tile_size.y; - - total_allocatable_parallel_sample_processing_memory = total_allocatable_memory - - sizeof(int)* NUM_QUEUES /* Queue index size */ - - sizeof(char) /* use_queues */ - -render_scene_input_data_size /* size for textures, bvh etc */ - - (user_set_tile_w * user_set_tile_h) * per_thread_output_buffer_size /* max d_buffer size possible */ - - (user_set_tile_w * user_set_tile_h) * sizeof(RNG) /* max d_rng_state size possible */ -#ifdef __WORK_STEALING__ - - max_work_groups * sizeof(unsigned int) -#endif - - DATA_ALLOCATION_MEM_FACTOR; - } - - /* Set the range of samples to be processed for every ray in path-regeneration logic */ - cl_int start_sample = rtile.start_sample; - cl_int end_sample = rtile.start_sample + rtile.num_samples; - cl_int num_samples = rtile.num_samples; - -#ifdef __WORK_STEALING__ - /* TODO : support dynamic num_parallel_samples in work_stealing - * Do not change the values of num_parallel_samples/num_parallel_threads - */ - unsigned int num_parallel_samples = 0; - global_size[0] = (((rtile.tile_size.x - 1) / local_size[0]) + 1) * local_size[0]; - global_size[1] = (((rtile.tile_size.y - 1) / local_size[1]) + 1) * local_size[1]; - unsigned int num_parallel_threads = global_size[0] * global_size[1]; - - /* Check if we can process atleast one sample */ - num_parallel_samples = (total_allocatable_parallel_sample_processing_memory / (per_thread_memory * num_parallel_threads)); - num_parallel_samples = (num_parallel_samples > 0) ? 1 : 0; -#else - unsigned int num_parallel_threads = total_allocatable_parallel_sample_processing_memory / per_thread_memory; - - /* Estimate maximum global work size that can be launched */ - global_size[1] = (((d_h - 1) / local_size[1]) + 1) * local_size[1]; - global_size[0] = num_parallel_threads / global_size[1]; - global_size[0] = (global_size[0] / local_size[0]) * local_size[0]; - - /* Estimate number of parallel samples that can be processed in parallel */ - unsigned int num_parallel_samples = (global_size[0] / d_w) <= rtile.num_samples ? (global_size[0] / d_w) : rtile.num_samples; - /* Wavefront size in AMD is 64 */ - num_parallel_samples = ((num_parallel_samples / 64) == 0) ? - num_parallel_samples : - (num_parallel_samples / 64) * 64; -#endif - - if(num_parallel_samples == 0) { - /* Rough estimate maximum rectangular tile size for this scene, to report to the user */ - size_t scene_alloc_memory = total_allocatable_memory - - sizeof(int)* NUM_QUEUES - - sizeof(char) - -render_scene_input_data_size - - DATA_ALLOCATION_MEM_FACTOR; - unsigned int tile_max_x = 8, tile_max_y = 8; - bool max_rect_tile_reached = false; - while(!max_rect_tile_reached) { - unsigned int num_parallel_samples_possible = 0; -#ifdef __WORK_STEALING__ - unsigned int current_max_global_size[2]; - current_max_global_size[0] = (((tile_max_x - 1) / local_size[0]) + 1) * local_size[0]; - current_max_global_size[1] = (((tile_max_y - 1) / local_size[1]) + 1) * local_size[1]; - unsigned int current_max_work_groups = (current_max_global_size[0] * current_max_global_size[1]) / (local_size[0] * local_size[1]); -#endif - size_t memory_for_parallel_sample_processing = scene_alloc_memory -#ifdef __WORK_STEALING__ - - current_max_work_groups * sizeof(unsigned int) -#endif - - (tile_max_x * tile_max_y) * per_thread_output_buffer_size - - (tile_max_x * tile_max_y) * size @@ Diff output truncated at 10240 characters. @@ _______________________________________________ Bf-blender-cvs mailing list Bf-blender-cvs@blender.org http://lists.blender.org/mailman/listinfo/bf-blender-cvs