Thanks for testing. I've also been doing some experimenting with compile flags and other things here. So far it seems I can make my 650M render a few percent faster compared to CUDA 4.2, but the GTX 460 is still considerably slower with the BMW scene (2m30s with 5.5 compared to 2m01s with 4.2), and the 580 GTX had a similar difference. It seems you are testing with a 6xx card so that makes sense.
Patch attached for those who want to test this with 5.0/5.5. On Mon, Jun 3, 2013 at 8:46 PM, Jürgen Herrmann <shadow...@me.com> wrote: > Hi there, > > > > I did some tests with cuda 5.0 and 5.5 today and changed the nvcc > optimization flags for cycles_kernel_cuda. > > > > I found out the following: > > > > - "--opencc-options" is deprecated for sm_20 and up and should be > removed from compiler options > > - Stating "-O3" and "--use_fast_math" as nvcc options brings massive > speedup on my system (more below) > > - We shouldn't complain about new cuda toolsets that are slow, we > should find a solution as we can't use old software forever... > > > > To the speedups: > > > > Example 1: > > system: i7-3820 @ 3.60GHz, GeForce GTX 660 > > > > Blender (cycles_cuda_kernel) compiled with standard settings: > > Mike_pan file took 02:06.60 to render > > > > Blender (cycles_cuda_kernel) compiled with -O3 --use_fast_math: > > Mike_pan took 01:39.93 > > > > There is no visible difference in the render results: > > > > Image1: http://www.pasteall.org/pic/52757 > > Image2: http://www.pasteall.org/pic/52758 > > > > I bet there's more potential in there. > > > > /Jürgen > > _______________________________________________ > Bf-committers mailing list > Bf-committers@blender.org > http://lists.blender.org/mailman/listinfo/bf-committers
diff --git a/intern/cycles/device/device_cuda.cpp b/intern/cycles/device/device_cuda.cpp index f32c6dd..27978b9 100644 --- a/intern/cycles/device/device_cuda.cpp +++ b/intern/cycles/device/device_cuda.cpp @@ -46,6 +46,7 @@ public: map<device_ptr, bool> tex_interp_map; int cuDevId; bool first_error; + vector<CUstream> cuStreams; struct PixelMem { GLuint cuPBO; @@ -205,6 +206,12 @@ public: if(cuda_error_(result, "cuCtxCreate")) return; + const int num_streams = 8; + cuStreams.resize(num_streams); + + for(int i = 0; i < num_streams; i++) + cuStreamCreate(&cuStreams[i], 0); + cuda_pop_context(); } @@ -212,6 +219,9 @@ public: { task_pool.stop(); + for(int i = 0; i < cuStreams.size(); i++) + cuStreamDestroy(cuStreams[i]); + cuda_push_context(); cuda_assert(cuCtxDetach(cuContext)) } @@ -514,7 +524,7 @@ public: } } - void path_trace(RenderTile& rtile, int sample) + void path_trace(RenderTile& rtile, int sample, CUstream stream) { if(have_error()) return; @@ -575,9 +585,9 @@ public: cuda_assert(cuFuncSetCacheConfig(cuPathTrace, CU_FUNC_CACHE_PREFER_L1)) cuda_assert(cuFuncSetBlockShape(cuPathTrace, xthreads, ythreads, 1)) - cuda_assert(cuLaunchGrid(cuPathTrace, xblocks, yblocks)) + cuda_assert(cuLaunchGridAsync(cuPathTrace, xblocks, yblocks, stream)) - cuda_assert(cuCtxSynchronize()) + //cuda_assert(cuCtxSynchronize()) cuda_pop_context(); } @@ -882,12 +892,35 @@ public: void thread_run(DeviceTask *task) { if(task->type == DeviceTask::PATH_TRACE) { - RenderTile tile; + vector<RenderTile> concurrent_tiles(cuStreams.size()); + vector<bool> have_tile(cuStreams.size()); /* keep rendering tiles until done */ - while(task->acquire_tile(this, tile)) { - int start_sample = tile.start_sample; - int end_sample = tile.start_sample + tile.num_samples; + while(1) { + int start_sample = -1; + int end_sample = -1; + + for(int i = 0; i < concurrent_tiles.size(); i++) { + RenderTile& tile = concurrent_tiles[i]; + + if(task->acquire_tile(this, tile)) { + have_tile[i] = true; + + 
if(start_sample == -1) { + start_sample = tile.start_sample; + end_sample = tile.start_sample + tile.num_samples; + } + else { + start_sample = min(start_sample, tile.start_sample); + end_sample = max(end_sample, tile.start_sample + tile.num_samples); + } + } + else + have_tile[i] = false; + } + + if(start_sample == -1) + break; for(int sample = start_sample; sample < end_sample; sample++) { if (task->get_cancel()) { @@ -895,21 +928,35 @@ public: break; } - path_trace(tile, sample); + for(int i = 0; i < concurrent_tiles.size(); i++) { + if(have_tile[i]) { + RenderTile& tile = concurrent_tiles[i]; + int tile_end_sample = tile.start_sample + tile.num_samples; - tile.sample = sample + 1; + if(sample > tile.start_sample && sample < tile_end_sample) { + path_trace(tile, sample, cuStreams[i]); + tile.sample = sample + 1; - task->update_progress(tile); + if(i == 0) + task->update_progress(tile); + } + } + } } - task->release_tile(tile); + for(int i = 0; i < concurrent_tiles.size(); i++) { + if(have_tile[i]) { + RenderTile& tile = concurrent_tiles[i]; + task->release_tile(tile); + } + } } } else if(task->type == DeviceTask::SHADER) { shader(*task); cuda_push_context(); - cuda_assert(cuCtxSynchronize()) + //cuda_assert(cuCtxSynchronize()) cuda_pop_context(); } } @@ -930,7 +977,7 @@ public: tonemap(task, task.buffer, task.rgba); cuda_push_context(); - cuda_assert(cuCtxSynchronize()) + //cuda_assert(cuCtxSynchronize()) cuda_pop_context(); } else { diff --git a/intern/cycles/kernel/CMakeLists.txt b/intern/cycles/kernel/CMakeLists.txt index 41048c7..45dfce7 100644 --- a/intern/cycles/kernel/CMakeLists.txt +++ b/intern/cycles/kernel/CMakeLists.txt @@ -129,7 +129,7 @@ if(WITH_CYCLES_CUDA_BINARIES) add_custom_command( OUTPUT ${cuda_cubin} - COMMAND ${CUDA_NVCC_EXECUTABLE} -arch=${arch} -m${CUDA_BITS} --cubin ${CMAKE_CURRENT_SOURCE_DIR}/kernel.cu -o ${CMAKE_CURRENT_BINARY_DIR}/${cuda_cubin} --ptxas-options="-v" --maxrregcount=24 --opencc-options -OPT:Olimit=0 
-I${CMAKE_CURRENT_SOURCE_DIR}/../util -I${CMAKE_CURRENT_SOURCE_DIR}/svm -DCCL_NAMESPACE_BEGIN= -DCCL_NAMESPACE_END= -DNVCC + COMMAND ${CUDA_NVCC_EXECUTABLE} -arch=${arch} -m${CUDA_BITS} --cubin ${CMAKE_CURRENT_SOURCE_DIR}/kernel.cu -o ${CMAKE_CURRENT_BINARY_DIR}/${cuda_cubin} --ptxas-options="-v" -O3 --use_fast_math -I${CMAKE_CURRENT_SOURCE_DIR}/../util -I${CMAKE_CURRENT_SOURCE_DIR}/svm -DCCL_NAMESPACE_BEGIN= -DCCL_NAMESPACE_END= -DNVCC DEPENDS ${cuda_sources}) delayed_install("${CMAKE_CURRENT_BINARY_DIR}" "${cuda_cubin}" ${CYCLES_INSTALL_PATH}/lib) diff --git a/intern/cycles/kernel/kernel_bvh.h b/intern/cycles/kernel/kernel_bvh.h index a85a4ec..f66f87e 100644 --- a/intern/cycles/kernel/kernel_bvh.h +++ b/intern/cycles/kernel/kernel_bvh.h @@ -134,22 +134,21 @@ __device_inline void bvh_node_intersect(KernelGlobals *kg, float4 cnodes = kernel_tex_fetch(__bvh_nodes, nodeAddr*BVH_NODE_SIZE+3); /* intersect ray against child nodes */ - float3 ood = P * idir; - NO_EXTENDED_PRECISION float c0lox = n0xy.x * idir.x - ood.x; - NO_EXTENDED_PRECISION float c0hix = n0xy.y * idir.x - ood.x; - NO_EXTENDED_PRECISION float c0loy = n0xy.z * idir.y - ood.y; - NO_EXTENDED_PRECISION float c0hiy = n0xy.w * idir.y - ood.y; - NO_EXTENDED_PRECISION float c0loz = nz.x * idir.z - ood.z; - NO_EXTENDED_PRECISION float c0hiz = nz.y * idir.z - ood.z; + NO_EXTENDED_PRECISION float c0lox = (n0xy.x - P.x) * idir.x; + NO_EXTENDED_PRECISION float c0hix = (n0xy.y - P.x) * idir.x; + NO_EXTENDED_PRECISION float c0loy = (n0xy.z - P.y) * idir.y; + NO_EXTENDED_PRECISION float c0hiy = (n0xy.w - P.y)* idir.y; + NO_EXTENDED_PRECISION float c0loz = (nz.x - P.z) * idir.z; + NO_EXTENDED_PRECISION float c0hiz = (nz.y - P.z) * idir.z; NO_EXTENDED_PRECISION float c0min = max4(min(c0lox, c0hix), min(c0loy, c0hiy), min(c0loz, c0hiz), 0.0f); NO_EXTENDED_PRECISION float c0max = min4(max(c0lox, c0hix), max(c0loy, c0hiy), max(c0loz, c0hiz), t); - NO_EXTENDED_PRECISION float c1loz = nz.z * idir.z - ood.z; - 
NO_EXTENDED_PRECISION float c1hiz = nz.w * idir.z - ood.z; - NO_EXTENDED_PRECISION float c1lox = n1xy.x * idir.x - ood.x; - NO_EXTENDED_PRECISION float c1hix = n1xy.y * idir.x - ood.x; - NO_EXTENDED_PRECISION float c1loy = n1xy.z * idir.y - ood.y; - NO_EXTENDED_PRECISION float c1hiy = n1xy.w * idir.y - ood.y; + NO_EXTENDED_PRECISION float c1loz = (nz.z - P.z) * idir.z; + NO_EXTENDED_PRECISION float c1hiz = (nz.w - P.z) * idir.z; + NO_EXTENDED_PRECISION float c1lox = (n1xy.x - P.x) * idir.x; + NO_EXTENDED_PRECISION float c1hix = (n1xy.y - P.x) * idir.x; + NO_EXTENDED_PRECISION float c1loy = (n1xy.z - P.y) * idir.y; + NO_EXTENDED_PRECISION float c1hiy = (n1xy.w - P.y) * idir.y; NO_EXTENDED_PRECISION float c1min = max4(min(c1lox, c1hix), min(c1loy, c1hiy), min(c1loz, c1hiz), 0.0f); NO_EXTENDED_PRECISION float c1max = min4(max(c1lox, c1hix), max(c1loy, c1hiy), max(c1loz, c1hiz), t); @@ -157,6 +156,7 @@ __device_inline void bvh_node_intersect(KernelGlobals *kg, if(difl != 0.0f) { float hdiff = 1.0f + difl; float ldiff = 1.0f - difl; + if(__float_as_int(cnodes.z) & PATH_RAY_CURVE) { c0min = max(ldiff * c0min, c0min - extmax); c0max = min(hdiff * c0max, c0max + extmax); diff --git a/intern/cycles/kernel/kernel_compat_cuda.h b/intern/cycles/kernel/kernel_compat_cuda.h index a11f8f4..d3c6071 100644 --- a/intern/cycles/kernel/kernel_compat_cuda.h +++ b/intern/cycles/kernel/kernel_compat_cuda.h @@ -33,7 +33,7 @@ #define __device __device__ __inline__ #define __device_inline __device__ __inline__ -#define __device_noinline __device__ __noinline__ +#define __device_noinline __device__ __inline__ #define __global #define __shared __shared__ #define __constant
_______________________________________________ Bf-committers mailing list Bf-committers@blender.org http://lists.blender.org/mailman/listinfo/bf-committers