Hi Brecht, 

nice work on Cycles performance over the last few days, thank you very much ;-)
I noticed that you haven't introduced the -O3 flag for nvcc yet. Are there
any problems with using it?
It could boost performance a little bit more on all platforms.

/Jürgen

-----Ursprüngliche Nachricht-----
Von: bf-blender-cvs-boun...@blender.org
[mailto:bf-blender-cvs-boun...@blender.org] Im Auftrag von Brecht Van Lommel
Gesendet: Mittwoch, 19. Juni 2013 19:54
An: bf-blender-...@blender.org
Betreff: [Bf-blender-cvs] SVN commit: /data/svn/bf-blender [57580]
trunk/blender/intern/cycles: Cycles: prepare to make CUDA 5.0 the official
version we use

Revision: 57580
 
http://projects.blender.org/scm/viewvc.php?view=rev&root=bf-blender&revision
=57580
Author:   blendix
Date:     2013-06-19 17:54:23 +0000 (Wed, 19 Jun 2013)
Log Message:
-----------
Cycles: prepare to make CUDA 5.0 the official version we use

* Add CUDA compiler version detection to cmake/scons/runtime
* Remove noinline in kernel_shader.h and reenable --use_fast_math if CUDA
5.x
  is used, these were workarounds for CUDA 4.2 bugs
* Change max number of registers to 32 for sm 2.x (based on performance
tests
  from Martijn Berger and confirmed here), and also for NVidia OpenCL.

Overall it seems that with these changes and the latest CUDA 5.0 download,
that performance is as good as or better than the 2.67b release with the
scenes and graphics cards I tested.

Modified Paths:
--------------
    trunk/blender/intern/cycles/device/device_cuda.cpp
    trunk/blender/intern/cycles/device/device_opencl.cpp
    trunk/blender/intern/cycles/kernel/CMakeLists.txt
    trunk/blender/intern/cycles/kernel/SConscript
    trunk/blender/intern/cycles/kernel/kernel_jitter.h
    trunk/blender/intern/cycles/kernel/kernel_shader.h
    trunk/blender/intern/cycles/util/util_cuda.cpp
    trunk/blender/intern/cycles/util/util_cuda.h

Modified: trunk/blender/intern/cycles/device/device_cuda.cpp
===================================================================
--- trunk/blender/intern/cycles/device/device_cuda.cpp  2013-06-19 17:17:51
UTC (rev 57579)
+++ trunk/blender/intern/cycles/device/device_cuda.cpp  2013-06-19 17:54:23
UTC (rev 57580)
@@ -271,21 +271,65 @@
                        return "";
                }
 
+               int cuda_version = cuCompilerVersion();
+
+               if(cuda_version == 0) {
+                       cuda_error_message("CUDA nvcc compiler version could
not be parsed.");
+                       return "";
+               }
+
+               if(cuda_version != 50)
+                       printf("CUDA version %d.%d detected, build may
succeed but only CUDA 
+5.0 is officially supported.\n", cuda_version/10, cuda_version%10);
+
                /* compile */
                string kernel = path_join(kernel_path, "kernel.cu");
                string include = kernel_path;
                const int machine = system_cpu_bits();
-               const int maxreg = 24;
+               string arch_flags;
 
+               /* build flags depending on CUDA version and arch */
+               if(cuda_version < 50) {
+                       /* CUDA 4.x */
+                       if(major == 1) {
+                               /* sm_1x */
+                               arch_flags = "--maxrregcount=24
--opencc-options -OPT:Olimit=0";
+                       }
+                       else if(major == 2) {
+                               /* sm_2x */
+                               arch_flags = "--maxrregcount=24";
+                       }
+                       else {
+                               /* sm_3x */
+                               arch_flags = "--maxrregcount=32";
+                       }
+               }
+               else {
+                       /* CUDA 5.x */
+                       if(major == 1) {
+                               /* sm_1x */
+                               arch_flags = "--maxrregcount=24
--opencc-options -OPT:Olimit=0 --use_fast_math";
+                       }
+                       else if(major == 2) {
+                               /* sm_2x */
+                               arch_flags = "--maxrregcount=32
--use_fast_math";
+                       }
+                       else {
+                               /* sm_3x */
+                               arch_flags = "--maxrregcount=32
--use_fast_math";
+                       }
+               }
+
                double starttime = time_dt();
                printf("Compiling CUDA kernel ...\n");
 
                path_create_directories(cubin);
 
                string command = string_printf("\"%s\" -arch=sm_%d%d -m%d
--cubin \"%s\" "
-                       "-o \"%s\" --ptxas-options=\"-v\" --maxrregcount=%d
--opencc-options -OPT:Olimit=0 -I\"%s\" -DNVCC",
-                       nvcc.c_str(), major, minor, machine, kernel.c_str(),
cubin.c_str(), maxreg, include.c_str());
+                       "-o \"%s\" --ptxas-options=\"-v\" %s -I\"%s\" -DNVCC
-D__KERNEL_CUDA_VERSION__=%d",
+                       nvcc.c_str(), major, minor, machine, kernel.c_str(),
cubin.c_str(), 
+arch_flags.c_str(), include.c_str(), cuda_version);
 
+               printf("%s\n", command.c_str());
+
                if(system(command.c_str()) == -1) {
                        cuda_error_message("Failed to execute compilation
command, see console for details.");
                        return "";

Modified: trunk/blender/intern/cycles/device/device_opencl.cpp
===================================================================
--- trunk/blender/intern/cycles/device/device_opencl.cpp        2013-06-19
17:17:51 UTC (rev 57579)
+++ trunk/blender/intern/cycles/device/device_opencl.cpp        2013-06-19
17:54:23 UTC (rev 57580)
@@ -85,7 +85,7 @@
        string build_options = " -cl-fast-relaxed-math ";
 
        if(platform == "NVIDIA CUDA")
-               build_options += "-D__KERNEL_OPENCL_NVIDIA__
-cl-nv-maxrregcount=24 -cl-nv-verbose ";
+               build_options += "-D__KERNEL_OPENCL_NVIDIA__
-cl-nv-maxrregcount=32 
+-cl-nv-verbose ";
 
        else if(platform == "Apple")
                build_options += "-D__KERNEL_OPENCL_APPLE__
-Wno-missing-prototypes ";

Modified: trunk/blender/intern/cycles/kernel/CMakeLists.txt
===================================================================
--- trunk/blender/intern/cycles/kernel/CMakeLists.txt   2013-06-19 17:17:51
UTC (rev 57579)
+++ trunk/blender/intern/cycles/kernel/CMakeLists.txt   2013-06-19 17:54:23
UTC (rev 57580)
@@ -117,32 +117,68 @@
 # CUDA module
 
 if(WITH_CYCLES_CUDA_BINARIES)
+       # 32 bit or 64 bit
        if("${CMAKE_SIZEOF_VOID_P}" EQUAL "8")
                set(CUDA_BITS 64)
        else()
                set(CUDA_BITS 32)
        endif()
 
+       # CUDA version
+       execute_process (COMMAND ${CUDA_NVCC_EXECUTABLE} "--version"
OUTPUT_VARIABLE NVCC_OUT)
+       string(REGEX REPLACE ".*release ([0-9]+)\\.([0-9]+).*" "\\1"
CUDA_VERSION_MAJOR ${NVCC_OUT})
+       string(REGEX REPLACE ".*release ([0-9]+)\\.([0-9]+).*" "\\2"
CUDA_VERSION_MINOR ${NVCC_OUT})
+       set(CUDA_VERSION "${CUDA_VERSION_MAJOR}${CUDA_VERSION_MINOR}")
+
+       # build for each arch
        set(cuda_sources kernel.cu ${SRC_HEADERS} ${SRC_SVM_HEADERS}
${SRC_CLOSURE_HEADERS} ${SRC_UTIL_HEADERS})
        set(cuda_cubins)
 
        foreach(arch ${CYCLES_CUDA_BINARIES_ARCH})
                set(cuda_cubin kernel_${arch}.cubin)
 
-               if(${arch} MATCHES "sm_1[0-9]")
-                       # sm_1x
-                       set(cuda_arch_flags "--maxrregcount=24
--opencc-options -OPT:Olimit=0")
-               elseif(${arch} MATCHES "sm_2[0-9]")
-                       # sm_2x
-                       set(cuda_arch_flags "--maxrregcount=24")
+               set(cuda_version_flags
"-D__KERNEL_CUDA_VERSION__=${CUDA_VERSION}")
+
+               # warn for other versions
+               if(CUDA_VERSION MATCHES "50")
                else()
-                       # sm_3x
-                       set(cuda_arch_flags "--maxrregcount=32")
+                       message(STATUS "CUDA version 
+${CUDA_VERSION_MAJOR}.${CUDA_VERSION_MINOR} detected, build may succeed 
+but only CUDA 5.0 is officially supported")
                endif()
+
+               # build flags depending on CUDA version and arch
+               if(CUDA_VERSION LESS 50)
+                       # CUDA 4.x
+                       if(${arch} MATCHES "sm_1[0-9]")
+                               # sm_1x
+                               set(cuda_arch_flags "--maxrregcount=24
--opencc-options -OPT:Olimit=0")
+                       elseif(${arch} MATCHES "sm_2[0-9]")
+                               # sm_2x
+                               set(cuda_arch_flags "--maxrregcount=24")
+                       else()
+                               # sm_3x
+                               set(cuda_arch_flags "--maxrregcount=32")
+                       endif()
+
+                       set(cuda_math_flags "")
+               else()
+                       # CUDA 5.x
+                       if(${arch} MATCHES "sm_1[0-9]")
+                               # sm_1x
+                               set(cuda_arch_flags "--maxrregcount=24
--opencc-options -OPT:Olimit=0")
+                       elseif(${arch} MATCHES "sm_2[0-9]")
+                               # sm_2x
+                               set(cuda_arch_flags "--maxrregcount=32")
+                       else()
+                               # sm_3x
+                               set(cuda_arch_flags "--maxrregcount=32")
+                       endif()
+
+                       set(cuda_math_flags "--use_fast_math")
+               endif()
                
                add_custom_command(
                        OUTPUT ${cuda_cubin}
-                       COMMAND ${CUDA_NVCC_EXECUTABLE} -arch=${arch}
-m${CUDA_BITS} --cubin ${CMAKE_CURRENT_SOURCE_DIR}/kernel.cu -o
${CMAKE_CURRENT_BINARY_DIR}/${cuda_cubin} --ptxas-options="-v"
${cuda_arch_flags} -I${CMAKE_CURRENT_SOURCE_DIR}/../util
-I${CMAKE_CURRENT_SOURCE_DIR}/svm -DCCL_NAMESPACE_BEGIN=
-DCCL_NAMESPACE_END= -DNVCC
+                       COMMAND ${CUDA_NVCC_EXECUTABLE} -arch=${arch}
-m${CUDA_BITS} --cubin 
+${CMAKE_CURRENT_SOURCE_DIR}/kernel.cu -o 
+${CMAKE_CURRENT_BINARY_DIR}/${cuda_cubin} --ptxas-options="-v" 
+${cuda_arch_flags} ${cuda_version_flags} ${cuda_math_flags} 
+-I${CMAKE_CURRENT_SOURCE_DIR}/../util -I${CMAKE_CURRENT_SOURCE_DIR}/svm 
+-DCCL_NAMESPACE_BEGIN= -DCCL_NAMESPACE_END= -DNVCC
                        DEPENDS ${cuda_sources})
 
                delayed_install("${CMAKE_CURRENT_BINARY_DIR}"
"${cuda_cubin}" ${CYCLES_INSTALL_PATH}/lib)

Modified: trunk/blender/intern/cycles/kernel/SConscript
===================================================================
--- trunk/blender/intern/cycles/kernel/SConscript       2013-06-19 17:17:51
UTC (rev 57579)
+++ trunk/blender/intern/cycles/kernel/SConscript       2013-06-19 17:54:23
UTC (rev 57580)
@@ -25,6 +25,8 @@
 #
 # ***** END GPL LICENSE BLOCK *****
 
+import re
+import subprocess
 import sys
 import os
 import Blender as B
@@ -60,10 +62,19 @@
     svm_dir = os.path.join(source_dir, "../svm")
     closure_dir = os.path.join(source_dir, "../closure")
 
+    # get CUDA version
+    nvcc_pipe = subprocess.Popen([nvcc,
"--version"],stdout=subprocess.PIPE,stderr=subprocess.PIPE)
+    output, erroroutput = nvcc_pipe.communicate()
+    cuda_major_minor = re.findall(r'release (\d+).(\d+)', output)[0]
+    cuda_version = int(cuda_major_minor[0])*10 + 
+ int(cuda_major_minor[1])
+
+    if cuda_version != 50:
+        print("CUDA version %d.%d detected, build may succeed but only 
+ CUDA 5.0 is officially supported." % (cuda_version/10, 
+ cuda_version%10))
+
     # nvcc flags
     nvcc_flags = "-m%s" % (bits)
-    nvcc_flags += " --cubin --ptxas-options=\"-v\" --maxrregcount=24"
-    nvcc_flags += " --opencc-options -OPT:Olimit=0"
+    nvcc_flags += " --cubin --ptxas-options=\"-v\""
+    nvcc_flags += " -D__KERNEL_CUDA_VERSION__=%d" % (cuda_version)
     nvcc_flags += " -DCCL_NAMESPACE_BEGIN= -DCCL_NAMESPACE_END= -DNVCC"
     nvcc_flags += " -I \"%s\" -I \"%s\" -I \"%s\"" % (util_dir, svm_dir,
closure_dir)
 
@@ -75,8 +86,32 @@
     for arch in cuda_archs:
         cubin_file = os.path.join(build_dir, "kernel_%s.cubin" % arch)
 
-        command = "\"%s\" -arch=%s %s \"%s\" -o \"%s\"" % (nvcc, arch,
nvcc_flags, kernel_file, cubin_file)
+               # build flags depending on CUDA version and arch
+        if cuda_version < 50:
+            # CUDA 4.x
+            if arch.startswith("sm_1"):
+                # sm_1x
+                cuda_arch_flags = "--maxrregcount=24 --opencc-options
-OPT:Olimit=0"
+            elif arch.startswith("sm_2"):
+                # sm_2x
+                cuda_arch_flags = "--maxrregcount=24"
+            else:
+                # sm_3x
+                cuda_arch_flags = "--maxrregcount=32"
+        else:
+            # CUDA 5.x
+            if arch.startswith("sm_1"):
+                # sm_1x
+                cuda_arch_flags = "--maxrregcount=24 --opencc-options
-OPT:Olimit=0 --use_fast_math"
+            elif arch.startswith("sm_2"):
+                # sm_2x
+                cuda_arch_flags = "--maxrregcount=32 --use_fast_math"
+            else:
+                # sm_3x
+                cuda_arch_flags = "--maxrregcount=32 --use_fast_math"
 
+        command = "\"%s\" -arch=%s %s %s \"%s\" -o \"%s\"" % (nvcc, 
+ arch, nvcc_flags, cuda_arch_flags, kernel_file, cubin_file)
+
         kernel.Command(cubin_file, 'kernel.cu', command)
         kernel.Depends(cubin_file, dependencies)
 

Modified: trunk/blender/intern/cycles/kernel/kernel_jitter.h
===================================================================
--- trunk/blender/intern/cycles/kernel/kernel_jitter.h  2013-06-19 17:17:51
UTC (rev 57579)
+++ trunk/blender/intern/cycles/kernel/kernel_jitter.h  2013-06-19 17:54:23
UTC (rev 57580)
@@ -137,7 +137,7 @@
 }
 
 #ifdef __CMJ__
-__device_noinline float cmj_sample_1D(int s, int N, int p)
+__device float cmj_sample_1D(int s, int N, int p)
 {
        uint x = cmj_permute(s, N, p * 0x68bc21eb);
        float jx = cmj_randfloat(s, p * 0x967a889b); @@ -146,7 +146,7 @@
        return (x + jx)*invN;
 }
 
-__device_noinline void cmj_sample_2D(int s, int N, int p, float *fx, float
*fy)
+__device void cmj_sample_2D(int s, int N, int p, float *fx, float *fy)
 {
        int m = float_to_int(sqrtf(N));
        int n = (N + m - 1)/m;

Modified: trunk/blender/intern/cycles/kernel/kernel_shader.h
===================================================================
--- trunk/blender/intern/cycles/kernel/kernel_shader.h  2013-06-19 17:17:51
UTC (rev 57579)

@@ Diff output truncated at 10240 characters. @@
_______________________________________________
Bf-blender-cvs mailing list
bf-blender-...@blender.org
http://lists.blender.org/mailman/listinfo/bf-blender-cvs

_______________________________________________
Bf-committers mailing list
Bf-committers@blender.org
http://lists.blender.org/mailman/listinfo/bf-committers

Reply via email to