Commit: da4ef05e4dfb700a61910e6d8e02183d7c272963 Author: Michael Jones Date: Tue Jul 12 15:32:46 2022 +0200 Branches: master https://developer.blender.org/rBda4ef05e4dfb700a61910e6d8e02183d7c272963
Cycles: Apple Silicon optimization to specialize intersection kernels The Metal backend now compiles and caches a second set of kernels which are optimized for scene contents, enabled for Apple Silicon. The implementation supports doing this both for intersection and shading kernels. However this is currently only enabled for intersection kernels that are quick to compile, and already give a good speedup. Enabling this for shading kernels would be faster still, however this also causes a long wait times and would need a good user interface to control this. M1 Max samples per minute (macOS 13.0): PSO_GENERIC PSO_SPECIALIZED_INTERSECT PSO_SPECIALIZED_SHADE barbershop_interior 83.4 89.5 93.7 bmw27 1486.1 1671.0 1825.8 classroom 175.2 196.8 206.3 fishy_cat 674.2 704.3 719.3 junkshop 205.4 212.0 257.7 koro 310.1 336.1 342.8 monster 376.7 418.6 424.1 pabellon 273.5 325.4 339.8 sponza 830.6 929.6 1142.4 victor 86.7 96.4 96.3 wdas_cloud 111.8 112.7 183.1 Code contributed by Jason Fielder, Morteza Mostajabodaveh and Michael Jones Differential Revision: https://developer.blender.org/D14645 =================================================================== M intern/cycles/device/device.h M intern/cycles/device/metal/device_impl.h M intern/cycles/device/metal/device_impl.mm M intern/cycles/device/metal/kernel.h M intern/cycles/device/metal/kernel.mm M intern/cycles/kernel/CMakeLists.txt A intern/cycles/kernel/device/metal/function_constants.h M intern/cycles/kernel/device/metal/kernel.metal M intern/cycles/kernel/svm/svm.h M intern/cycles/kernel/types.h M intern/cycles/scene/scene.cpp M intern/cycles/util/string.cpp M intern/cycles/util/string.h =================================================================== diff --git a/intern/cycles/device/device.h b/intern/cycles/device/device.h index 340be85e853..e7916ec3a52 100644 --- a/intern/cycles/device/device.h +++ b/intern/cycles/device/device.h @@ -29,6 +29,7 @@ class DeviceQueue; class Progress; class CPUKernels; class CPUKernelThreadGlobals; +class Scene; /* Device Types */ @@ -186,6 +187,11 @@ class Device { return 0; } + /* Called after kernel texture setup, and prior to integrator state setup. */ + virtual void optimize_for_scene(Scene *scene) + { + } + virtual bool is_resident(device_ptr /*key*/, Device *sub_device) { /* Memory is always resident if this is not a multi device, regardless of whether the pointer diff --git a/intern/cycles/device/metal/device_impl.h b/intern/cycles/device/metal/device_impl.h index 4aea8d697a5..99e60d3a788 100644 --- a/intern/cycles/device/metal/device_impl.h +++ b/intern/cycles/device/metal/device_impl.h @@ -75,7 +75,8 @@ class MetalDevice : public Device { std::vector<id<MTLTexture>> texture_slot_map; bool use_metalrt = false; - bool use_function_specialisation = false; + MetalPipelineType kernel_specialization_level = PSO_GENERIC; + std::atomic_bool async_compile_and_load = false; virtual BVHLayoutMask get_bvh_layout_mask() const override; @@ -91,9 +92,7 @@ class MetalDevice : public Device { bool use_adaptive_compilation(); - string get_source(const uint kernel_features); - - string compile_kernel(const uint kernel_features, const char *name); + void make_source(MetalPipelineType pso_type, const uint kernel_features); virtual bool load_kernels(const uint kernel_features) override; @@ -111,7 +110,9 @@ class MetalDevice : public Device { virtual void build_bvh(BVH *bvh, Progress &progress, bool refit) override; - id<MTLLibrary> compile(string const &source); + virtual void optimize_for_scene(Scene *scene) override; + + bool compile_and_load(MetalPipelineType pso_type); /* ------------------------------------------------------------------ */ /* low-level memory management */ diff --git a/intern/cycles/device/metal/device_impl.mm b/intern/cycles/device/metal/device_impl.mm index ba9317e3204..d8bb3b867cd 100644 --- a/intern/cycles/device/metal/device_impl.mm +++ b/intern/cycles/device/metal/device_impl.mm @@ -6,6 +6,8 @@ # include "device/metal/device_impl.h" # include "device/metal/device.h" +# include "scene/scene.h" + # include "util/debug.h" # include "util/md5.h" # include "util/path.h" @@ -78,6 +80,10 @@ MetalDevice::MetalDevice(const DeviceInfo &info, Stats &stats, Profiler &profile case METAL_GPU_APPLE: { max_threads_per_threadgroup = 512; use_metalrt = info.use_metalrt; + + /* Specialize the intersection kernels on Apple GPUs by default as these can be built very + * quickly. */ + kernel_specialization_level = PSO_SPECIALIZED_INTERSECT; break; } } @@ -90,6 +96,13 @@ MetalDevice::MetalDevice(const DeviceInfo &info, Stats &stats, Profiler &profile capture_enabled = true; } + if (auto envstr = getenv("CYCLES_METAL_SPECIALIZATION_LEVEL")) { + kernel_specialization_level = (MetalPipelineType)atoi(envstr); + } + metal_printf("kernel_specialization_level = %s\n", + kernel_type_as_string( + (MetalPipelineType)min((int)kernel_specialization_level, (int)PSO_NUM - 1))); + MTLArgumentDescriptor *arg_desc_params = [[MTLArgumentDescriptor alloc] init]; arg_desc_params.dataType = MTLDataTypePointer; arg_desc_params.access = MTLArgumentAccessReadOnly; @@ -209,12 +222,11 @@ bool MetalDevice::use_adaptive_compilation() return DebugFlags().metal.adaptive_compile; } -string MetalDevice::get_source(const uint kernel_features) +void MetalDevice::make_source(MetalPipelineType pso_type, const uint kernel_features) { - string build_options; - + string global_defines; if (use_adaptive_compilation()) { - build_options += " -D__KERNEL_FEATURES__=" + to_string(kernel_features); + global_defines += "#define __KERNEL_FEATURES__ " + to_string(kernel_features) + "\n"; } if (MetalInfo::optimal_sort_partition_elements(mtlDevice) > 0) { @@ -222,52 +234,78 @@ string MetalDevice::get_source(const uint kernel_features) } if (use_metalrt) { - build_options += "-D__METALRT__ "; + global_defines += "#define __METALRT__\n"; if (motion_blur) { - build_options += "-D__METALRT_MOTION__ "; + global_defines += "#define __METALRT_MOTION__\n"; } } # ifdef WITH_CYCLES_DEBUG - build_options += "-D__KERNEL_DEBUG__ "; + global_defines += "#define __KERNEL_DEBUG__\n"; # endif switch (device_vendor) { default: break; case METAL_GPU_INTEL: - build_options += "-D__KERNEL_METAL_INTEL__ "; + global_defines += "#define __KERNEL_METAL_INTEL__\n"; break; case METAL_GPU_AMD: - build_options += "-D__KERNEL_METAL_AMD__ "; + global_defines += "#define __KERNEL_METAL_AMD__\n"; break; case METAL_GPU_APPLE: - build_options += "-D__KERNEL_METAL_APPLE__ "; + global_defines += "#define __KERNEL_METAL_APPLE__\n"; break; } - /* reformat -D defines list into compilable form */ - vector<string> components; - string_replace(build_options, "-D", ""); - string_split(components, build_options, " "); + string &source = this->source[pso_type]; + source = "\n#include \"kernel/device/metal/kernel.metal\"\n"; + source = path_source_replace_includes(source, path_get("source")); - string globalDefines; - for (const string &component : components) { - vector<string> assignments; - string_split(assignments, component, "="); - if (assignments.size() == 2) - globalDefines += string_printf( - "#define %s %s\n", assignments[0].c_str(), assignments[1].c_str()); - else - globalDefines += string_printf("#define %s\n", assignments[0].c_str()); + /* Perform any required specialization on the source. + * With Metal function constants we can generate a single variant of the kernel source which can + * be repeatedly respecialized. + */ + string baked_constants; + + /* Replace specific KernelData "dot" dereferences with a Metal function_constant identifier of + * the same character length. Build a string of all active constant values which is then hashed + * in order to identify the PSO. + */ + if (pso_type != PSO_GENERIC) { + const double starttime = time_dt(); + +# define KERNEL_STRUCT_BEGIN(name, parent) \ + string_replace_same_length(source, "kernel_data." #parent ".", "kernel_data_" #parent "_"); + + /* Add constants to md5 so that 'get_best_pipeline' is able to return a suitable match. */ +# define KERNEL_STRUCT_MEMBER(parent, _type, name) \ + baked_constants += string(#parent "." #name "=") + \ + to_string(_type(launch_params.data.parent.name)) + "\n"; + +# include "kernel/data_template.h" + + /* Opt in to all of available specializations. This can be made more granular for the + * PSO_SPECIALIZED_INTERSECT case in order to minimize the number of specialization requests, + * but the overhead should be negligible as these are very quick to (re)build and aren't + * serialized to disk via MTLBinaryArchives. + */ + global_defines += "#define __KERNEL_USE_DATA_CONSTANTS__\n"; + + metal_printf("KernelData patching took %.1f ms\n", (time_dt() - starttime) * 1000.0); } - string source = globalDefines + "\n#include \"kernel/device/metal/kernel.metal\"\n"; - source = path_source_replace_includes(source, path_get("source")); - - metal_printf("Global defines:\n%s\n", globalDefines.c_str()); + source = global_defines + source; + metal_printf("================\n%s================\n\%s================\n", + global_defines.c_str(), + baked_constants.c_str()); - return source; + /* Generate an MD5 from the source and include any baked constants. This is used when caching + * PSOs. */ + MD5Hash md5; + md5.append(baked_constants); + md5.append(source); + source_md5[pso_type] = md5.get_hex(); } bool MetalDevice::load_kernels(const uint _kernel_features) @@ -283,28 +321,22 @@ bool MetalDevice::load_kernels(const uint _kernel_features) * active, but may still need to be rendered without motion blur if that isn't active as well. */ motion_blur = kernel_features & KERNEL_FEATURE_OBJECT_MOTION; - source[PSO_GENERIC] = get_source(kernel_features); - - const double starttime = time_dt(); - - mtlLibrary[PSO_GENERIC] = compile(source[PSO_GENERIC]); - - metal_printf("Front-end compilation finished in %.1f seconds (generic)\n", - time_dt() - starttime); - - MD5Hash md5; - md5.append(source[PSO_GENERIC]); - source_md5[PSO_GENERIC] = md5.get_hex(); - - bool result = MetalDeviceKernels::load(this, false); + bool result = compile_and_load(PSO_GENERIC); reserve_local_memory(kernel_features); - return result; } -id<MTLLibrary> MetalDevice::compile(string const &source) +bool MetalDevice::compile_and_load(MetalPipelineType pso_type) { + make_source(pso_type, kernel_features); + + if (!MetalDeviceKernels::should_load_kernels(this, pso_type)) { + /* We already have a full set of matching pipelines which are cached or queued. */ + metal_printf("%s kernels already requested\n", kernel_type_as_string(pso_type)); + return true; + } + MTLCompileOptions *options = [[MTLCompileOptions alloc] init]; options.fastMathEnabled = YES; @@ -312,19 +344,30 @@ id<MTLLibrary> MetalDevice::compile(string const &source) options.languageVersion = MTLLanguageVersion2_4; } + if (getenv("CYCLES_METAL_PROFILING") || getenv("CYCLES_METAL_DEBUG")) { + path_write_text(path_cache_get(string_printf("%s.metal", kernel_type_as_string(pso_type))), + source[pso_type]); + } + + const double starttime = time_dt(); + NSError *error = NULL; - id<MTLLibrary> mtlLibrary = [mtlDevice newLibraryWithSource:@(source.c_str()) - options:options - error:&error]; + mtlLibrary[pso_type] = [mtlDevice newLibr @@ Diff output truncated at 10240 characters. @@ _______________________________________________ Bf-blender-cvs mailing list Bf-blender-cvs@blender.org List details, subscription details or unsubscribe: https://lists.blender.org/mailman/listinfo/bf-blender-cvs