Thanks for testing. I've also been doing some experimenting with compile flags and other things here. So far it seems I can make my 650M render a few percent faster compared to CUDA 4.2, but the GTX 460 is still considerably slower with the BMW scene (2m30s with 5.5 compared to 2m01s with 4.2), and the 580 GTX had a similar difference. It seems you are testing with a 6xx card so that makes sense.
Patch attached for those who want to test this with 5.0/5.5. On Mon, Jun 3, 2013 at 8:46 PM, Jürgen Herrmann <shadow...@me.com> wrote: > Hi there, > > > > I did some tests with cuda 5.0 and 5.5 today and changed the nvcc > optimization flags for cycles_kernel_cuda. > > > > I found out the following: > > > > - "--opencc-options" is deprecated for sm_20 and up and should be > removed from compiler options > > - Stating "-O3" and "--use_fast_math" as nvcc options brings massive > speedup on my system (more below) > > - We shouldn't complain about new cuda toolsets that are slow, we > should find a solution as we can't use old software forever... > > > > To the speedups: > > > > Example 1: > > system: i7-3820 @ 3.60GHz, GeForce GTX 660 > > > > Blender (cycles_cuda_kernel) compiled with standard settings: > > Mike_pan file took 02:06.60 to render > > > > Blender (cycles_cuda_kernel) compiled with -O3 --use_fast_math: > > Mike_pan took 01:39.93 > > > > There is no visible difference in the render results: > > > > Image1: http://www.pasteall.org/pic/52757 > > Image2: http://www.pasteall.org/pic/52758 > > > > I bet there's more potential in there. > > > > /Jürgen > > _______________________________________________ > Bf-committers mailing list > Bf-committers@blender.org > http://lists.blender.org/mailman/listinfo/bf-committers
diff --git a/intern/cycles/device/device_cuda.cpp b/intern/cycles/device/device_cuda.cpp index f32c6dd..27978b9 100644 --- a/intern/cycles/device/device_cuda.cpp +++ b/intern/cycles/device/device_cuda.cpp @@ -46,6 +46,7 @@ public: map<device_ptr, bool> tex_interp_map; int cuDevId; bool first_error; + vector<CUstream> cuStreams; struct PixelMem { GLuint cuPBO; @@ -205,6 +206,12 @@ public: if(cuda_error_(result, "cuCtxCreate")) return; + const int num_streams = 8; + cuStreams.resize(num_streams); + + for(int i = 0; i < num_streams; i++) + cuStreamCreate(&cuStreams[i], 0); + cuda_pop_context(); } @@ -212,6 +219,9 @@ public: { task_pool.stop(); + for(int i = 0; i < cuStreams.size(); i++) + cuStreamDestroy(cuStreams[i]); + cuda_push_context(); cuda_assert(cuCtxDetach(cuContext)) } @@ -514,7 +524,7 @@ public: } } - void path_trace(RenderTile& rtile, int sample) + void path_trace(RenderTile& rtile, int sample, CUstream stream) { if(have_error()) return; @@ -575,9 +585,9 @@ public: cuda_assert(cuFuncSetCacheConfig(cuPathTrace, CU_FUNC_CACHE_PREFER_L1)) cuda_assert(cuFuncSetBlockShape(cuPathTrace, xthreads, ythreads, 1)) - cuda_assert(cuLaunchGrid(cuPathTrace, xblocks, yblocks)) + cuda_assert(cuLaunchGridAsync(cuPathTrace, xblocks, yblocks, stream)) - cuda_assert(cuCtxSynchronize()) + //cuda_assert(cuCtxSynchronize()) cuda_pop_context(); } @@ -882,12 +892,35 @@ public: void thread_run(DeviceTask *task) { if(task->type == DeviceTask::PATH_TRACE) { - RenderTile tile; + vector<RenderTile> concurrent_tiles(cuStreams.size()); + vector<bool> have_tile(cuStreams.size()); /* keep rendering tiles until done */ - while(task->acquire_tile(this, tile)) { - int start_sample = tile.start_sample; - int end_sample = tile.start_sample + tile.num_samples; + while(1) { + int start_sample = -1; + int end_sample = -1; + + for(int i = 0; i < concurrent_tiles.size(); i++) { + RenderTile& tile = concurrent_tiles[i]; + + if(task->acquire_tile(this, tile)) { + have_tile[i] = true; + + 
if(start_sample == -1) { + start_sample = tile.start_sample; + end_sample = tile.start_sample + tile.num_samples; + } + else { + start_sample = min(start_sample, tile.start_sample); + end_sample = max(end_sample, tile.start_sample + tile.num_samples); + } + } + else + have_tile[i] = false; + } + + if(start_sample == -1) + break; for(int sample = start_sample; sample < end_sample; sample++) { if (task->get_cancel()) { @@ -895,21 +928,35 @@ public: break; } - path_trace(tile, sample); + for(int i = 0; i < concurrent_tiles.size(); i++) { + if(have_tile[i]) { + RenderTile& tile = concurrent_tiles[i]; + int tile_end_sample = tile.start_sample + tile.num_samples; - tile.sample = sample + 1; + if(sample > tile.start_sample && sample < tile_end_sample) { + path_trace(tile, sample, cuStreams[i]); + tile.sample = sample + 1; - task->update_progress(tile); + if(i == 0) + task->update_progress(tile); + } + } + } } - task->release_tile(tile); + for(int i = 0; i < concurrent_tiles.size(); i++) { + if(have_tile[i]) { + RenderTile& tile = concurrent_tiles[i]; + task->release_tile(tile); + } + } } } else if(task->type == DeviceTask::SHADER) { shader(*task); cuda_push_context(); - cuda_assert(cuCtxSynchronize()) + //cuda_assert(cuCtxSynchronize()) cuda_pop_context(); } } @@ -930,7 +977,7 @@ public: tonemap(task, task.buffer, task.rgba); cuda_push_context(); - cuda_assert(cuCtxSynchronize()) + //cuda_assert(cuCtxSynchronize()) cuda_pop_context(); } else { diff --git a/intern/cycles/kernel/CMakeLists.txt b/intern/cycles/kernel/CMakeLists.txt index 41048c7..45dfce7 100644 --- a/intern/cycles/kernel/CMakeLists.txt +++ b/intern/cycles/kernel/CMakeLists.txt @@ -129,7 +129,7 @@ if(WITH_CYCLES_CUDA_BINARIES) add_custom_command( OUTPUT ${cuda_cubin} - COMMAND ${CUDA_NVCC_EXECUTABLE} -arch=${arch} -m${CUDA_BITS} --cubin ${CMAKE_CURRENT_SOURCE_DIR}/kernel.cu -o ${CMAKE_CURRENT_BINARY_DIR}/${cuda_cubin} --ptxas-options="-v" --maxrregcount=24 --opencc-options -OPT:Olimit=0 
-I${CMAKE_CURRENT_SOURCE_DIR}/../util -I${CMAKE_CURRENT_SOURCE_DIR}/svm -DCCL_NAMESPACE_BEGIN= -DCCL_NAMESPACE_END= -DNVCC + COMMAND ${CUDA_NVCC_EXECUTABLE} -arch=${arch} -m${CUDA_BITS} --cubin ${CMAKE_CURRENT_SOURCE_DIR}/kernel.cu -o ${CMAKE_CURRENT_BINARY_DIR}/${cuda_cubin} --ptxas-options="-v" -O3 --use_fast_math -I${CMAKE_CURRENT_SOURCE_DIR}/../util -I${CMAKE_CURRENT_SOURCE_DIR}/svm -DCCL_NAMESPACE_BEGIN= -DCCL_NAMESPACE_END= -DNVCC DEPENDS ${cuda_sources}) delayed_install("${CMAKE_CURRENT_BINARY_DIR}" "${cuda_cubin}" ${CYCLES_INSTALL_PATH}/lib) diff --git a/intern/cycles/kernel/kernel_bvh.h b/intern/cycles/kernel/kernel_bvh.h index a85a4ec..f66f87e 100644 --- a/intern/cycles/kernel/kernel_bvh.h +++ b/intern/cycles/kernel/kernel_bvh.h @@ -134,22 +134,21 @@ __device_inline void bvh_node_intersect(KernelGlobals *kg, float4 cnodes = kernel_tex_fetch(__bvh_nodes, nodeAddr*BVH_NODE_SIZE+3); /* intersect ray against child nodes */ - float3 ood = P * idir; - NO_EXTENDED_PRECISION float c0lox = n0xy.x * idir.x - ood.x; - NO_EXTENDED_PRECISION float c0hix = n0xy.y * idir.x - ood.x; - NO_EXTENDED_PRECISION float c0loy = n0xy.z * idir.y - ood.y; - NO_EXTENDED_PRECISION float c0hiy = n0xy.w * idir.y - ood.y; - NO_EXTENDED_PRECISION float c0loz = nz.x * idir.z - ood.z; - NO_EXTENDED_PRECISION float c0hiz = nz.y * idir.z - ood.z; + NO_EXTENDED_PRECISION float c0lox = (n0xy.x - P.x) * idir.x; + NO_EXTENDED_PRECISION float c0hix = (n0xy.y - P.x) * idir.x; + NO_EXTENDED_PRECISION float c0loy = (n0xy.z - P.y) * idir.y; + NO_EXTENDED_PRECISION float c0hiy = (n0xy.w - P.y)* idir.y; + NO_EXTENDED_PRECISION float c0loz = (nz.x - P.z) * idir.z; + NO_EXTENDED_PRECISION float c0hiz = (nz.y - P.z) * idir.z; NO_EXTENDED_PRECISION float c0min = max4(min(c0lox, c0hix), min(c0loy, c0hiy), min(c0loz, c0hiz), 0.0f); NO_EXTENDED_PRECISION float c0max = min4(max(c0lox, c0hix), max(c0loy, c0hiy), max(c0loz, c0hiz), t); - NO_EXTENDED_PRECISION float c1loz = nz.z * idir.z - ood.z; - 
NO_EXTENDED_PRECISION float c1hiz = nz.w * idir.z - ood.z; - NO_EXTENDED_PRECISION float c1lox = n1xy.x * idir.x - ood.x; - NO_EXTENDED_PRECISION float c1hix = n1xy.y * idir.x - ood.x; - NO_EXTENDED_PRECISION float c1loy = n1xy.z * idir.y - ood.y; - NO_EXTENDED_PRECISION float c1hiy = n1xy.w * idir.y - ood.y; + NO_EXTENDED_PRECISION float c1loz = (nz.z - P.z) * idir.z; + NO_EXTENDED_PRECISION float c1hiz = (nz.w - P.z) * idir.z; + NO_EXTENDED_PRECISION float c1lox = (n1xy.x - P.x) * idir.x; + NO_EXTENDED_PRECISION float c1hix = (n1xy.y - P.x) * idir.x; + NO_EXTENDED_PRECISION float c1loy = (n1xy.z - P.y) * idir.y; + NO_EXTENDED_PRECISION float c1hiy = (n1xy.w - P.y) * idir.y; NO_EXTENDED_PRECISION float c1min = max4(min(c1lox, c1hix), min(c1loy, c1hiy), min(c1loz, c1hiz), 0.0f); NO_EXTENDED_PRECISION float c1max = min4(max(c1lox, c1hix), max(c1loy, c1hiy), max(c1loz, c1hiz), t); @@ -157,6 +156,7 @@ __device_inline void bvh_node_intersect(KernelGlobals *kg, if(difl != 0.0f) { float hdiff = 1.0f + difl; float ldiff = 1.0f - difl; + if(__float_as_int(cnodes.z) & PATH_RAY_CURVE) { c0min = max(ldiff * c0min, c0min - extmax); c0max = min(hdiff * c0max, c0max + extmax); diff --git a/intern/cycles/kernel/kernel_compat_cuda.h b/intern/cycles/kernel/kernel_compat_cuda.h index a11f8f4..d3c6071 100644 --- a/intern/cycles/kernel/kernel_compat_cuda.h +++ b/intern/cycles/kernel/kernel_compat_cuda.h @@ -33,7 +33,7 @@ #define __device __device__ __inline__ #define __device_inline __device__ __inline__ -#define __device_noinline __device__ __noinline__ +#define __device_noinline __device__ __inline__ #define __global #define __shared __shared__ #define __constant
_______________________________________________ Bf-committers mailing list Bf-committers@blender.org http://lists.blender.org/mailman/listinfo/bf-committers