[Bf-blender-cvs] [2d994de77c3] master: Cycles: MetalRT optimisation for subsurface intersection queries
Commit: 2d994de77c35a6e8a8a9c78935a3f8ed7d147f7d Author: Michael Jones Date: Mon Feb 6 19:09:51 2023 + Branches: master https://developer.blender.org/rB2d994de77c35a6e8a8a9c78935a3f8ed7d147f7d Cycles: MetalRT optimisation for subsurface intersection queries This patch optimises subsurface intersection queries on MetalRT. Currently intersect_local traverses from the scene root, retrospectively discarding all non-local hits. Using a lookup of bottom level acceleration structures, we can explicitly query only the relevant instance. On M1 Max, with MetalRT selected, this can give a render speedup of 15-20% for scenes like Monster which make heavy use of subsurface scattering. Patch authored by Marco Giordano. Reviewed By: brecht Differential Revision: https://developer.blender.org/D17153 === M intern/cycles/device/metal/bvh.h M intern/cycles/device/metal/bvh.mm M intern/cycles/device/metal/device_impl.h M intern/cycles/device/metal/device_impl.mm M intern/cycles/device/metal/kernel.h M intern/cycles/device/metal/kernel.mm M intern/cycles/device/metal/queue.mm M intern/cycles/kernel/device/metal/bvh.h M intern/cycles/kernel/device/metal/compat.h M intern/cycles/kernel/device/metal/kernel.metal === diff --git a/intern/cycles/device/metal/bvh.h b/intern/cycles/device/metal/bvh.h index 519cbf00294..5448a3ae41d 100644 --- a/intern/cycles/device/metal/bvh.h +++ b/intern/cycles/device/metal/bvh.h @@ -21,6 +21,7 @@ class BVHMetal : public BVH { API_AVAILABLE(macos(11.0)) vector> blas_array; + vector blas_lookup; bool motion_blur = false; diff --git a/intern/cycles/device/metal/bvh.mm b/intern/cycles/device/metal/bvh.mm index a7fd64d3c98..c692b762d86 100644 --- a/intern/cycles/device/metal/bvh.mm +++ b/intern/cycles/device/metal/bvh.mm @@ -816,6 +816,11 @@ bool BVHMetal::build_TLAS(Progress &progress, uint32_t instance_index = 0; uint32_t motion_transform_index = 0; + +// allocate look up buffer for wost case scenario +uint64_t count = objects.size(); 
+blas_lookup.resize(count); + for (Object *ob : objects) { /* Skip non-traceable objects */ if (!ob->is_traceable()) @@ -843,12 +848,15 @@ bool BVHMetal::build_TLAS(Progress &progress, /* Set user instance ID to object index */ int object_index = ob->get_device_index(); uint32_t user_id = uint32_t(object_index); + int currIndex = instance_index++; + assert(user_id < blas_lookup.size()); + blas_lookup[user_id] = accel_struct_index; /* Bake into the appropriate descriptor */ if (motion_blur) { MTLAccelerationStructureMotionInstanceDescriptor *instances = (MTLAccelerationStructureMotionInstanceDescriptor *)[instanceBuf contents]; -MTLAccelerationStructureMotionInstanceDescriptor &desc = instances[instance_index++]; +MTLAccelerationStructureMotionInstanceDescriptor &desc = instances[currIndex]; desc.accelerationStructureIndex = accel_struct_index; desc.userID = user_id; @@ -894,7 +902,7 @@ bool BVHMetal::build_TLAS(Progress &progress, else { MTLAccelerationStructureUserIDInstanceDescriptor *instances = (MTLAccelerationStructureUserIDInstanceDescriptor *)[instanceBuf contents]; -MTLAccelerationStructureUserIDInstanceDescriptor &desc = instances[instance_index++]; +MTLAccelerationStructureUserIDInstanceDescriptor &desc = instances[currIndex]; desc.accelerationStructureIndex = accel_struct_index; desc.userID = user_id; diff --git a/intern/cycles/device/metal/device_impl.h b/intern/cycles/device/metal/device_impl.h index a10962b4e45..2b89ebf19c9 100644 --- a/intern/cycles/device/metal/device_impl.h +++ b/intern/cycles/device/metal/device_impl.h @@ -74,6 +74,11 @@ class MetalDevice : public Device { id texture_bindings_3d = nil; std::vector> texture_slot_map; + /* BLAS encoding & lookup */ + id mtlBlasArgEncoder = nil; + id blas_buffer = nil; + id blas_lookup_buffer = nil; + bool use_metalrt = false; MetalPipelineType kernel_specialization_level = PSO_GENERIC; diff --git a/intern/cycles/device/metal/device_impl.mm b/intern/cycles/device/metal/device_impl.mm index 
35298822e41..aadf5e02934 100644 --- a/intern/cycles/device/metal/device_impl.mm +++ b/intern/cycles/device/metal/device_impl.mm @@ -192,6 +192,10 @@ MetalDevice::MetalDevice(const DeviceInfo &info, Stats &stats, Profiler &profile arg_desc_as.dataType = MTLDataTypeInstanceAccelerationStructure; arg_desc_as.access = MTLArgumentAccessReadOnly; +MTLArgumentDescriptor *arg_desc_ptrs = [[MTLArgumentDescriptor alloc] init]; +arg_desc_ptrs.dataType = MTLDataTypePointer
[Bf-blender-cvs] [654e1e901b6] master: Cycles: Use local atomics for faster shader sorting (enabled on Metal)
Commit: 654e1e901b6ae003d8ec7a0ce1bc5926d68a971f Author: Michael Jones Date: Mon Feb 6 11:16:02 2023 + Branches: master https://developer.blender.org/rB654e1e901b6ae003d8ec7a0ce1bc5926d68a971f Cycles: Use local atomics for faster shader sorting (enabled on Metal) This patch adds two new kernels: SORT_BUCKET_PASS and SORT_WRITE_PASS. These replace PREFIX_SUM and SORTED_PATHS_ARRAY on supported devices (currently implemented on Metal, but will be trivial to enable on the other backends). The new kernels exploit sort partitioning (see D15331) by sorting each partition separately using local atomics. This can give an overall render speedup of 2-3% depending on architecture. As before, we fall back to the original non-partitioned sorting when the shade [...] Reviewed By: brecht Differential Revision: https://developer.blender.org/D16909 === M intern/cycles/device/kernel.cpp M intern/cycles/device/metal/device_impl.h M intern/cycles/device/metal/device_impl.mm M intern/cycles/device/metal/kernel.mm M intern/cycles/device/metal/queue.h M intern/cycles/device/metal/queue.mm M intern/cycles/device/queue.h M intern/cycles/integrator/path_trace_work_gpu.cpp M intern/cycles/integrator/path_trace_work_gpu.h M intern/cycles/kernel/device/gpu/kernel.h M intern/cycles/kernel/device/gpu/parallel_active_index.h M intern/cycles/kernel/device/gpu/parallel_sorted_index.h M intern/cycles/kernel/device/metal/compat.h M intern/cycles/kernel/integrator/state.h M intern/cycles/kernel/integrator/state_flow.h M intern/cycles/kernel/types.h M intern/cycles/util/atomic.h M intern/cycles/util/debug.cpp M intern/cycles/util/debug.h === diff --git a/intern/cycles/device/kernel.cpp b/intern/cycles/device/kernel.cpp index 27ca0d81817..903ad096586 100644 --- a/intern/cycles/device/kernel.cpp +++ b/intern/cycles/device/kernel.cpp @@ -73,6 +73,10 @@ const char *device_kernel_as_string(DeviceKernel kernel) return "integrator_terminated_paths_array"; case DEVICE_KERNEL_INTEGRATOR_SORTED_PATHS_ARRAY: 
return "integrator_sorted_paths_array"; +case DEVICE_KERNEL_INTEGRATOR_SORT_BUCKET_PASS: + return "integrator_sort_bucket_pass"; +case DEVICE_KERNEL_INTEGRATOR_SORT_WRITE_PASS: + return "integrator_sort_write_pass"; case DEVICE_KERNEL_INTEGRATOR_COMPACT_PATHS_ARRAY: return "integrator_compact_paths_array"; case DEVICE_KERNEL_INTEGRATOR_COMPACT_STATES: diff --git a/intern/cycles/device/metal/device_impl.h b/intern/cycles/device/metal/device_impl.h index 526535ff132..a10962b4e45 100644 --- a/intern/cycles/device/metal/device_impl.h +++ b/intern/cycles/device/metal/device_impl.h @@ -105,6 +105,8 @@ class MetalDevice : public Device { bool use_adaptive_compilation(); + bool use_local_atomic_sort() const; + bool make_source_and_check_if_compile_needed(MetalPipelineType pso_type); void make_source(MetalPipelineType pso_type, const uint kernel_features); diff --git a/intern/cycles/device/metal/device_impl.mm b/intern/cycles/device/metal/device_impl.mm index 453418386a4..35298822e41 100644 --- a/intern/cycles/device/metal/device_impl.mm +++ b/intern/cycles/device/metal/device_impl.mm @@ -271,6 +271,11 @@ bool MetalDevice::use_adaptive_compilation() return DebugFlags().metal.adaptive_compile; } +bool MetalDevice::use_local_atomic_sort() const +{ + return DebugFlags().metal.use_local_atomic_sort; +} + void MetalDevice::make_source(MetalPipelineType pso_type, const uint kernel_features) { string global_defines; @@ -278,6 +283,10 @@ void MetalDevice::make_source(MetalPipelineType pso_type, const uint kernel_feat global_defines += "#define __KERNEL_FEATURES__ " + to_string(kernel_features) + "\n"; } + if (use_local_atomic_sort()) { +global_defines += "#define __KERNEL_LOCAL_ATOMIC_SORT__\n"; + } + if (use_metalrt) { global_defines += "#define __METALRT__\n"; if (motion_blur) { diff --git a/intern/cycles/device/metal/kernel.mm b/intern/cycles/device/metal/kernel.mm index 6312c5f88ee..2ed230ee657 100644 --- a/intern/cycles/device/metal/kernel.mm +++ 
b/intern/cycles/device/metal/kernel.mm @@ -87,6 +87,9 @@ struct ShaderCache { break; } } + +occupancy_tuning[DEVICE_KERNEL_INTEGRATOR_SORT_BUCKET_PASS] = {1024, 1024}; +occupancy_tuning[DEVICE_KERNEL_INTEGRATOR_SORT_WRITE_PASS] = {1024, 1024}; } ~ShaderCache(); diff --git a/intern/cycles/device/metal/queue.h b/intern/cycles/device/metal/queue.h index 2a6c12e2a60..cf3a6175916 100644 --- a/intern/cycles/device/metal/queue.h +++ b/intern/cycles/device/metal/queue.h @@ -25,6 +25,7 @@ class MetalDeviceQueue : public Devic
[Bf-blender-cvs] [46c9f7702af] master: Cycles: Enable MetalRT opt-in for AMD/Navi2 GPUs
Commit: 46c9f7702afa9688987de6fe0deea845b826b300 Author: Michael Jones Date: Mon Feb 6 11:14:01 2023 + Branches: master https://developer.blender.org/rB46c9f7702afa9688987de6fe0deea845b826b300 Cycles: Enable MetalRT opt-in for AMD/Navi2 GPUs Reviewed By: brecht Differential Revision: https://developer.blender.org/D17043 === M intern/cycles/blender/addon/properties.py === diff --git a/intern/cycles/blender/addon/properties.py b/intern/cycles/blender/addon/properties.py index eed51eed95f..0c5d8a7cd55 100644 --- a/intern/cycles/blender/addon/properties.py +++ b/intern/cycles/blender/addon/properties.py @@ -1722,13 +1722,20 @@ class CyclesPreferences(bpy.types.AddonPreferences): row.prop(self, "peer_memory") if compute_device_type == 'METAL': -import platform -# MetalRT only works on Apple Silicon at present, pending argument encoding fixes on AMD -# Kernel specialization is only viable on Apple Silicon at present due to relative compilation speed -if platform.machine() == 'arm64': +import platform, re +isNavi2 = False +for device in devices: +obj = re.search("((RX)|(Pro)|(PRO))\s+W?6\d00X",device.name) +if obj: +isNavi2 = True + +# MetalRT only works on Apple Silicon and Navi2 +if platform.machine() == 'arm64' or isNavi2: col = layout.column() col.use_property_split = True -col.prop(self, "kernel_optimization_level") +# Kernel specialization is only supported on Apple Silicon +if platform.machine() == 'arm64': +col.prop(self, "kernel_optimization_level") col.prop(self, "use_metalrt") def draw(self, context): ___ Bf-blender-cvs mailing list Bf-blender-cvs@blender.org List details, subscription details or unsubscribe: https://lists.blender.org/mailman/listinfo/bf-blender-cvs
[Bf-blender-cvs] [be0912a4022] master: Cycles: Prevent use of both AMD and Intel Metal devices at same time
Commit: be0912a40224b7742b07f30f8c03ec6a3273c540 Author: Michael Jones Date: Mon Feb 6 11:13:21 2023 + Branches: master https://developer.blender.org/rBbe0912a40224b7742b07f30f8c03ec6a3273c540 Cycles: Prevent use of both AMD and Intel Metal devices at same time This patch removes the option to select both AMD and Intel GPUs on systems that have both. Currently both devices will be selected by default which results in crashes and other poorly understood behaviour. This patch adds precedence for using any discrete AMD GPU over an integrated Intel one. This can be overridden with CYCLES_METAL_FORCE_INTEL. Reviewed By: brecht Differential Revision: https://developer.blender.org/D17166 === M intern/cycles/device/metal/util.mm === diff --git a/intern/cycles/device/metal/util.mm b/intern/cycles/device/metal/util.mm index 984e7a70c76..03afe67628e 100644 --- a/intern/cycles/device/metal/util.mm +++ b/intern/cycles/device/metal/util.mm @@ -64,6 +64,12 @@ MetalGPUVendor MetalInfo::get_device_vendor(id device) return METAL_GPU_INTEL; } else if (strstr(device_name, "AMD")) { +/* Setting this env var hides AMD devices thus exposing any integrated Intel devices. */ +if (auto str = getenv("CYCLES_METAL_FORCE_INTEL")) { + if (atoi(str)) { +return METAL_GPU_UNKNOWN; + } +} return METAL_GPU_AMD; } else if (strstr(device_name, "Apple")) { @@ -96,6 +102,15 @@ vector> const &MetalInfo::get_usable_devices() return usable_devices; } + /* If the system has both an AMD GPU (discrete) and an Intel one (integrated), prefer the AMD + * one. This can be overriden with CYCLES_METAL_FORCE_INTEL. 
*/ + bool has_usable_amd_gpu = false; + if (@available(macos 12.3, *)) { +for (id device in MTLCopyAllDevices()) { + has_usable_amd_gpu |= (get_device_vendor(device) == METAL_GPU_AMD); +} + } + metal_printf("Usable Metal devices:\n"); for (id device in MTLCopyAllDevices()) { string device_name = get_device_name(device); @@ -111,8 +126,10 @@ vector> const &MetalInfo::get_usable_devices() } # if defined(MAC_OS_VERSION_13_0) -if (@available(macos 13.0, *)) { - usable |= (vendor == METAL_GPU_INTEL); +if (!has_usable_amd_gpu) { + if (@available(macos 13.0, *)) { +usable |= (vendor == METAL_GPU_INTEL); + } } # endif ___ Bf-blender-cvs mailing list Bf-blender-cvs@blender.org List details, subscription details or unsubscribe: https://lists.blender.org/mailman/listinfo/bf-blender-cvs
[Bf-blender-cvs] [0a3df611e7e] master: Fix T103393: Cycles: Undefine __LIGHT_TREE__ on Metal/AMD to fix perf
Commit: 0a3df611e7e2a276a46b421c579cf3efb7a3000a Author: Michael Jones Date: Mon Feb 6 11:12:22 2023 + Branches: master https://developer.blender.org/rB0a3df611e7e2a276a46b421c579cf3efb7a3000a Fix T103393: Cycles: Undefine __LIGHT_TREE__ on Metal/AMD to fix perf This patch fixes T103393 by undefining `__LIGHT_TREE__` on Metal/AMD as it has an unexpected & major impact on performance even when light trees are not in use. Patch authored by Prakash Kamliya. Reviewed By: brecht Maniphest Tasks: T103393 Differential Revision: https://developer.blender.org/D17167 === M intern/cycles/device/metal/device.mm M intern/cycles/kernel/types.h === diff --git a/intern/cycles/device/metal/device.mm b/intern/cycles/device/metal/device.mm index 51e3323370a..5ffd3a09d56 100644 --- a/intern/cycles/device/metal/device.mm +++ b/intern/cycles/device/metal/device.mm @@ -55,6 +55,10 @@ void device_metal_info(vector &devices) info.denoisers = DENOISER_NONE; info.id = id; +if (MetalInfo::get_device_vendor(device) == METAL_GPU_AMD) { + info.has_light_tree = false; +} + devices.push_back(info); device_index++; } diff --git a/intern/cycles/kernel/types.h b/intern/cycles/kernel/types.h index cfbaba20ec1..8637c717ddc 100644 --- a/intern/cycles/kernel/types.h +++ b/intern/cycles/kernel/types.h @@ -74,7 +74,8 @@ CCL_NAMESPACE_BEGIN #define __VOLUME__ /* TODO: solve internal compiler errors and enable light tree on HIP. */ -#ifdef __KERNEL_HIP__ +/* TODO: solve internal compiler perf issue and enable light tree on Metal/AMD. */ +#if defined(__KERNEL_HIP__) || defined(__KERNEL_METAL_AMD__) # undef __LIGHT_TREE__ #endif ___ Bf-blender-cvs mailing list Bf-blender-cvs@blender.org List details, subscription details or unsubscribe: https://lists.blender.org/mailman/listinfo/bf-blender-cvs
[Bf-blender-cvs] [e270a198a54] master: Cycles: Markup to disable specialisation of kernel data fields (Metal)
Commit: e270a198a548fcacc9dfecbda29c55fe7a05b5c9 Author: Michael Jones Date: Thu Jan 19 17:57:26 2023 + Branches: master https://developer.blender.org/rBe270a198a548fcacc9dfecbda29c55fe7a05b5c9 Cycles: Markup to disable specialisation of kernel data fields (Metal) This patch adds markup to specify that certain kernel data constants should not be specialised. Currently it is used for `tabulated_sobol_sequence_size` and `sobol_index_mask` which change frequently based on the aa sample count, trash the shader cache, and have little bearing on performance. Reviewed By: brecht Differential Revision: https://developer.blender.org/D16968 === M intern/cycles/device/metal/device_impl.mm M intern/cycles/device/metal/kernel.mm M intern/cycles/kernel/data_template.h === diff --git a/intern/cycles/device/metal/device_impl.mm b/intern/cycles/device/metal/device_impl.mm index 87614f656c3..917945fbdb6 100644 --- a/intern/cycles/device/metal/device_impl.mm +++ b/intern/cycles/device/metal/device_impl.mm @@ -327,10 +327,19 @@ void MetalDevice::make_source(MetalPipelineType pso_type, const uint kernel_feat # define KERNEL_STRUCT_BEGIN(name, parent) \ string_replace_same_length(source, "kernel_data." #parent ".", "kernel_data_" #parent "_"); +bool next_member_is_specialized = true; + +# define KERNEL_STRUCT_MEMBER_DONT_SPECIALIZE next_member_is_specialized = false; + /* Add constants to md5 so that 'get_best_pipeline' is able to return a suitable match. */ # define KERNEL_STRUCT_MEMBER(parent, _type, name) \ -baked_constants += string(#parent "." #name "=") + \ - to_string(_type(launch_params.data.parent.name)) + "\n"; +if (next_member_is_specialized) { \ + baked_constants += string(#parent "." #name "=") + \ +to_string(_type(launch_params.data.parent.name)) + "\n"; \ +} else { \ + string_replace(source, "kernel_data_" #parent "_" #name, "kernel_data." 
#parent ".__unused_" #name); \ + next_member_is_specialized = true; \ +} # include "kernel/data_template.h" diff --git a/intern/cycles/device/metal/kernel.mm b/intern/cycles/device/metal/kernel.mm index 48bdf2f0ef1..febce2840ea 100644 --- a/intern/cycles/device/metal/kernel.mm +++ b/intern/cycles/device/metal/kernel.mm @@ -460,13 +460,17 @@ static MTLFunctionConstantValues *GetConstantValues(KernelData const *data = nul if (!data) { data = &zero_data; } - int zero_int = 0; - [constant_values setConstantValue:&zero_int type:MTLDataType_int atIndex:Kernel_DummyConstant]; + [constant_values setConstantValue:&zero_data type:MTLDataType_int atIndex:Kernel_DummyConstant]; + + bool next_member_is_specialized = true; + +# define KERNEL_STRUCT_MEMBER_DONT_SPECIALIZE next_member_is_specialized = false; # define KERNEL_STRUCT_MEMBER(parent, _type, name) \ -[constant_values setConstantValue:&data->parent.name \ +[constant_values setConstantValue:next_member_is_specialized ? (void*)&data->parent.name : (void*)&zero_data \ type:MTLDataType_##_type \ - atIndex:KernelData_##parent##_##name]; + atIndex:KernelData_##parent##_##name]; \ +next_member_is_specialized = true; # include "kernel/data_template.h" diff --git a/intern/cycles/kernel/data_template.h b/intern/cycles/kernel/data_template.h index ddc462e02f6..dceae4b77c1 100644 --- a/intern/cycles/kernel/data_template.h +++ b/intern/cycles/kernel/data_template.h @@ -10,6 +10,10 @@ #ifndef KERNEL_STRUCT_MEMBER # define KERNEL_STRUCT_MEMBER(parent, type, name) #endif +#ifndef KERNEL_STRUCT_MEMBER_DONT_SPECIALIZE +# define KERNEL_STRUCT_MEMBER_DONT_SPECIALIZE +#endif + /* Background. */ @@ -179,8 +183,8 @@ KERNEL_STRUCT_MEMBER(integrator, float, sample_clamp_indirect) KERNEL_STRUCT_MEMBER(integrator, int, use_caustics) /* Sampling pattern. 
*/ KERNEL_STRUCT_MEMBER(integrator, int, sampling_pattern) -KERNEL_STRUCT_MEMBER(integrator, int, tabulated_sobol_sequence_size) -KERNEL_STRUCT_MEMBER(integrator, int, sobol_index_mask) +KERNEL_STRUCT_MEMBER_DONT_SPECIALIZE KERNEL_STRUCT_MEMBER(integrator, int, tabulated_sobol_sequence_size) +KERNEL_STRUCT_MEMBER_DONT_SPECIALIZE KERNEL_STRUCT_MEMBER(integrator, int, sobol_index_mask) KERNEL_STRUCT_MEMBER(integrator, float, scrambling_distance) /* Volume render. */ KERNEL_STRUCT_MEMBER(integrator, int, use_volumes) ___ Bf-blender-cvs mailing list Bf-blender-cvs@blender.org List details, subscription details or unsubscribe: https://lists.blender.org/mailman/listinfo/bf-blender-cvs
[Bf-blender-cvs] [08b3426df9e] master: Cycles: Occupancy tuning for new higher end M2 machines
Commit: 08b3426df9e5b5dd3c7cc042197bea3ea2398e75 Author: Michael Jones Date: Thu Jan 19 17:55:53 2023 + Branches: master https://developer.blender.org/rB08b3426df9e5b5dd3c7cc042197bea3ea2398e75 Cycles: Occupancy tuning for new higher end M2 machines This patch adds occupancy tuning for the newly announced high-end M2 machines, giving 10-15% render speedup over a pre-tuned build. Reviewed By: brecht Differential Revision: https://developer.blender.org/D17037 === M intern/cycles/device/metal/kernel.mm M intern/cycles/device/metal/queue.mm M intern/cycles/device/metal/util.h M intern/cycles/device/metal/util.mm === diff --git a/intern/cycles/device/metal/kernel.mm b/intern/cycles/device/metal/kernel.mm index e4ce5e19f63..48bdf2f0ef1 100644 --- a/intern/cycles/device/metal/kernel.mm +++ b/intern/cycles/device/metal/kernel.mm @@ -49,6 +49,18 @@ struct ShaderCache { if (MetalInfo::get_device_vendor(mtlDevice) == METAL_GPU_APPLE) { switch (MetalInfo::get_apple_gpu_architecture(mtlDevice)) { default: +case APPLE_M2_BIG: + occupancy_tuning[DEVICE_KERNEL_INTEGRATOR_COMPACT_SHADOW_STATES] = {384, 128}; + occupancy_tuning[DEVICE_KERNEL_INTEGRATOR_INIT_FROM_CAMERA] = {640, 128}; + occupancy_tuning[DEVICE_KERNEL_INTEGRATOR_INTERSECT_CLOSEST] = {1024, 64}; + occupancy_tuning[DEVICE_KERNEL_INTEGRATOR_INTERSECT_SHADOW] = {704, 704}; + occupancy_tuning[DEVICE_KERNEL_INTEGRATOR_INTERSECT_SUBSURFACE] = {640, 32}; + occupancy_tuning[DEVICE_KERNEL_INTEGRATOR_QUEUED_PATHS_ARRAY] = {896, 768}; + occupancy_tuning[DEVICE_KERNEL_INTEGRATOR_SHADE_BACKGROUND] = {512, 128}; + occupancy_tuning[DEVICE_KERNEL_INTEGRATOR_SHADE_SHADOW] = {32, 32}; + occupancy_tuning[DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE] = {768, 576}; + occupancy_tuning[DEVICE_KERNEL_INTEGRATOR_SORTED_PATHS_ARRAY] = {896, 768}; + break; case APPLE_M2: occupancy_tuning[DEVICE_KERNEL_INTEGRATOR_COMPACT_SHADOW_STATES] = {32, 32}; occupancy_tuning[DEVICE_KERNEL_INTEGRATOR_INIT_FROM_CAMERA] = {832, 32}; diff --git 
a/intern/cycles/device/metal/queue.mm b/intern/cycles/device/metal/queue.mm index 837be0b0c23..f335844c3f9 100644 --- a/intern/cycles/device/metal/queue.mm +++ b/intern/cycles/device/metal/queue.mm @@ -278,7 +278,8 @@ int MetalDeviceQueue::num_concurrent_states(const size_t state_size) const if (metal_device_->device_vendor == METAL_GPU_APPLE) { result *= 4; -if (MetalInfo::get_apple_gpu_architecture(metal_device_->mtlDevice) == APPLE_M2) { +/* Increasing the state count doesn't notably benefit M1-family systems. */ +if (MetalInfo::get_apple_gpu_architecture(metal_device_->mtlDevice) != APPLE_M1) { size_t system_ram = system_physical_ram(); size_t allocated_so_far = [metal_device_->mtlDevice currentAllocatedSize]; size_t max_recommended_working_set = [metal_device_->mtlDevice recommendedMaxWorkingSetSize]; diff --git a/intern/cycles/device/metal/util.h b/intern/cycles/device/metal/util.h index a988d01d361..c30c4ccd9bc 100644 --- a/intern/cycles/device/metal/util.h +++ b/intern/cycles/device/metal/util.h @@ -29,6 +29,7 @@ enum AppleGPUArchitecture { APPLE_UNKNOWN, APPLE_M1, APPLE_M2, + APPLE_M2_BIG, }; /* Contains static Metal helper functions. */ diff --git a/intern/cycles/device/metal/util.mm b/intern/cycles/device/metal/util.mm index f47638fac15..984e7a70c76 100644 --- a/intern/cycles/device/metal/util.mm +++ b/intern/cycles/device/metal/util.mm @@ -52,7 +52,7 @@ AppleGPUArchitecture MetalInfo::get_apple_gpu_architecture(id device) return APPLE_M1; } else if (strstr(device_name, "M2")) { -return APPLE_M2; +return get_apple_gpu_core_count(device) <= 10 ? APPLE_M2 : APPLE_M2_BIG; } return APPLE_UNKNOWN; } ___ Bf-blender-cvs mailing list Bf-blender-cvs@blender.org List details, subscription details or unsubscribe: https://lists.blender.org/mailman/listinfo/bf-blender-cvs
[Bf-blender-cvs] [a7cc6e015cf] master: Cycles: Additional Metal kernel specialisation exposed through UI
Commit: a7cc6e015cf98feca22cfc08c3356807b989a9fe Author: Michael Jones Date: Wed Jan 4 16:01:24 2023 + Branches: master https://developer.blender.org/rBa7cc6e015cf98feca22cfc08c3356807b989a9fe Cycles: Additional Metal kernel specialisation exposed through UI This patch adds a new "Kernel Optimization Level" dropdown menu to control Metal kernel specialisation. Currently this defaults to "full" optimisation, on the assumption that the changes proposed in D16371 will address usability concerns around app responsiveness and shader cache housekeeping. Reviewed By: brecht Differential Revision: https://developer.blender.org/D16514 === M intern/cycles/blender/addon/properties.py M intern/cycles/blender/device.cpp M intern/cycles/blender/device.h M intern/cycles/blender/python.cpp M intern/cycles/blender/sync.cpp M intern/cycles/device/device.h M intern/cycles/device/metal/device_impl.mm M intern/cycles/device/metal/kernel.h M intern/cycles/device/metal/kernel.mm M intern/cycles/device/metal/queue.mm === diff --git a/intern/cycles/blender/addon/properties.py b/intern/cycles/blender/addon/properties.py index eff6384c85e..a27a75e48fa 100644 --- a/intern/cycles/blender/addon/properties.py +++ b/intern/cycles/blender/addon/properties.py @@ -1543,6 +1543,17 @@ class CyclesPreferences(bpy.types.AddonPreferences): default=False, ) +kernel_optimization_level: EnumProperty( +name="Kernel Optimization", +description="Kernels can be optimized based on scene content. Optimized kernels are requested at the start of a render. If optimized kernels are not available, rendering will proceed using generic kernels until the optimized set is available in the cache. This can result in additional CPU usage for a brief time (tens of seconds).", +default='FULL', +items=( +('OFF', "Off", "Disable kernel optimization. Slowest rendering, no extra background CPU usage"), +('INTERSECT', "Intersection only", "Optimize only intersection kernels. 
Faster rendering, negligible extra background CPU usage"), +('FULL', "Full", "Optimize all kernels. Fastest rendering, may result in extra background CPU usage"), +), +) + def find_existing_device_entry(self, device): for device_entry in self.devices: if device_entry.id == device[2] and device_entry.type == device[1]: @@ -1711,10 +1722,12 @@ class CyclesPreferences(bpy.types.AddonPreferences): if compute_device_type == 'METAL': import platform # MetalRT only works on Apple Silicon at present, pending argument encoding fixes on AMD +# Kernel specialization is only viable on Apple Silicon at present due to relative compilation speed if platform.machine() == 'arm64': -row = layout.row() -row.use_property_split = True -row.prop(self, "use_metalrt") +col = layout.column() +col.use_property_split = True +col.prop(self, "kernel_optimization_level") +col.prop(self, "use_metalrt") def draw(self, context): self.draw_impl(self.layout, context) diff --git a/intern/cycles/blender/device.cpp b/intern/cycles/blender/device.cpp index 22beca898f1..96e7bdd03aa 100644 --- a/intern/cycles/blender/device.cpp +++ b/intern/cycles/blender/device.cpp @@ -30,7 +30,7 @@ int blender_device_threads(BL::Scene &b_scene) return 0; } -DeviceInfo blender_device_info(BL::Preferences &b_preferences, BL::Scene &b_scene, bool background) +DeviceInfo blender_device_info(BL::Preferences &b_preferences, BL::Scene &b_scene, bool background, bool preview) { PointerRNA cscene = RNA_pointer_get(&b_scene.ptr, "cycles"); @@ -113,6 +113,18 @@ DeviceInfo blender_device_info(BL::Preferences &b_preferences, BL::Scene &b_scen device.use_metalrt = true; } + if (preview) { +/* Disable specialization for preview renders. 
*/ +device.kernel_optimization_level = KERNEL_OPTIMIZATION_LEVEL_OFF; + } + else { +device.kernel_optimization_level = (KernelOptimizationLevel)get_enum( +cpreferences, +"kernel_optimization_level", +KERNEL_OPTIMIZATION_NUM_LEVELS, +KERNEL_OPTIMIZATION_LEVEL_FULL); + } + return device; } diff --git a/intern/cycles/blender/device.h b/intern/cycles/blender/device.h index 7a762261829..08655743eeb 100644 --- a/intern/cycles/blender/device.h +++ b/intern/cycles/blender/devi
[Bf-blender-cvs] [77c3e67d3d7] master: Cycles: Improved render start/stop responsiveness on Metal
Commit: 77c3e67d3d7d8055619491bf09f0e7626afe33f9 Author: Michael Jones Date: Wed Jan 4 14:23:33 2023 + Branches: master https://developer.blender.org/rB77c3e67d3d7d8055619491bf09f0e7626afe33f9 Cycles: Improved render start/stop responsiveness on Metal All kernel specialisation is now performed in the background regardless of kernel type, meaning that the first render will be visible a few seconds sooner. The only exception is during benchmark warm up, in which case we wait for all kernels to be cached. When stopping a render, we call a new `cancel()` method on the device which causes any outstanding compilation work to be cancelled, and we destroy the device in a detached thread so that any stale queued compilations can be safely purge [...] Reviewed By: brecht Differential Revision: https://developer.blender.org/D16371 === M intern/cycles/device/device.h M intern/cycles/device/metal/device_impl.h M intern/cycles/device/metal/device_impl.mm M intern/cycles/device/metal/kernel.h M intern/cycles/device/metal/kernel.mm M intern/cycles/device/metal/queue.mm M intern/cycles/integrator/path_trace.cpp M intern/cycles/session/session.cpp === diff --git a/intern/cycles/device/device.h b/intern/cycles/device/device.h index b9308dc8949..959939ddbb7 100644 --- a/intern/cycles/device/device.h +++ b/intern/cycles/device/device.h @@ -167,6 +167,17 @@ class Device { return true; } + /* Request cancellation of any long-running work. */ + virtual void cancel() + { + } + + /* Return true if device is ready for rendering, or report status if not. */ + virtual bool is_ready(string &status) const + { +return true; + } + /* GPU device only functions. * These may not be used on CPU or multi-devices. 
*/ diff --git a/intern/cycles/device/metal/device_impl.h b/intern/cycles/device/metal/device_impl.h index e57b8628023..526535ff132 100644 --- a/intern/cycles/device/metal/device_impl.h +++ b/intern/cycles/device/metal/device_impl.h @@ -76,7 +76,20 @@ class MetalDevice : public Device { bool use_metalrt = false; MetalPipelineType kernel_specialization_level = PSO_GENERIC; - std::atomic_bool async_compile_and_load = false; + + int device_id = 0; + + static thread_mutex existing_devices_mutex; + static std::map active_device_ids; + + static bool is_device_cancelled(int device_id); + + static MetalDevice *get_device_by_ID(int device_idID, + thread_scoped_lock &existing_devices_mutex_lock); + + virtual bool is_ready(string &status) const override; + + virtual void cancel() override; virtual BVHLayoutMask get_bvh_layout_mask() const override; @@ -92,14 +105,12 @@ class MetalDevice : public Device { bool use_adaptive_compilation(); + bool make_source_and_check_if_compile_needed(MetalPipelineType pso_type); + void make_source(MetalPipelineType pso_type, const uint kernel_features); virtual bool load_kernels(const uint kernel_features) override; - void reserve_local_memory(const uint kernel_features); - - void init_host_memory(); - void load_texture_info(); void erase_allocation(device_memory &mem); @@ -112,7 +123,7 @@ class MetalDevice : public Device { virtual void optimize_for_scene(Scene *scene) override; - bool compile_and_load(MetalPipelineType pso_type); + static void compile_and_load(int device_id, MetalPipelineType pso_type); /* -- */ /* low-level memory management */ diff --git a/intern/cycles/device/metal/device_impl.mm b/intern/cycles/device/metal/device_impl.mm index 95935ce2a3a..a6966bf167d 100644 --- a/intern/cycles/device/metal/device_impl.mm +++ b/intern/cycles/device/metal/device_impl.mm @@ -13,10 +13,32 @@ # include "util/path.h" # include "util/time.h" +# include + CCL_NAMESPACE_BEGIN class MetalDevice; +thread_mutex MetalDevice::existing_devices_mutex; 
+std::map MetalDevice::active_device_ids; + +/* Thread-safe device access for async work. Calling code must pass an appropriatelty scoped lock + * to existing_devices_mutex to safeguard against destruction of the returned instance. */ +MetalDevice *MetalDevice::get_device_by_ID(int ID, thread_scoped_lock &existing_devices_mutex_lock) +{ + auto it = active_device_ids.find(ID); + if (it != active_device_ids.end()) { +return it->second; + } + return nullptr; +} + +bool MetalDevice::is_device_cancelled(int ID) +{ + thread_scoped_lock lock(existing_devices_mutex); + return get_device_by_ID(ID, lock) == nullptr; +} + BVHLayoutMask MetalDevice::get_bvh_layout_mask() const { return use_metalrt ? BVH_LAYOUT_METAL : BVH_LAYOUT_BVH2; @@ -40,6 +62,15 @@ void MetalDevice::set_error(const string &error) MetalDe
[Bf-blender-cvs] [2dc51fccb83] master: Fix T101787, T102786. Cycles: Improved out-of-memory messaging on Metal
Commit: 2dc51fccb8387467e8a012b07ab148078e7c9e50 Author: Michael Jones Date: Wed Dec 7 13:28:59 2022 + Branches: master https://developer.blender.org/rB2dc51fccb8387467e8a012b07ab148078e7c9e50 Fix T101787, T102786. Cycles: Improved out-of-memory messaging on Metal This patch adds a new `max_working_set_exceeded()` check on Metal so that we can display a "System is out of GPU memory" message to the user. Without this, we get obtuse "CommandBuffer failed" errors at render time due to exceeding the size limit of resident resources. Likely fix for T101787 & T102786. Reviewed By: brecht Differential Revision: https://developer.blender.org/D16713 === M intern/cycles/device/metal/device_impl.h M intern/cycles/device/metal/device_impl.mm === diff --git a/intern/cycles/device/metal/device_impl.h b/intern/cycles/device/metal/device_impl.h index 99e60d3a788..e57b8628023 100644 --- a/intern/cycles/device/metal/device_impl.h +++ b/intern/cycles/device/metal/device_impl.h @@ -117,6 +117,8 @@ class MetalDevice : public Device { /* -- */ /* low-level memory management */ + bool max_working_set_exceeded(size_t safety_margin = 8 * 1024 * 1024) const; + MetalMem *generic_alloc(device_memory &mem); void generic_copy_to(device_memory &mem); diff --git a/intern/cycles/device/metal/device_impl.mm b/intern/cycles/device/metal/device_impl.mm index 604abc2be1a..24836e88755 100644 --- a/intern/cycles/device/metal/device_impl.mm +++ b/intern/cycles/device/metal/device_impl.mm @@ -446,6 +446,14 @@ void MetalDevice::erase_allocation(device_memory &mem) } } +bool MetalDevice::max_working_set_exceeded(size_t safety_margin) const +{ + /* We're allowed to allocate beyond the safe working set size, but then if all resources are made + * resident we will get command buffer failures at render time. 
*/ + size_t available = [mtlDevice recommendedMaxWorkingSetSize] - safety_margin; + return (stats.mem_used > available); +} + MetalDevice::MetalMem *MetalDevice::generic_alloc(device_memory &mem) { size_t size = mem.memory_size(); @@ -523,6 +531,11 @@ MetalDevice::MetalMem *MetalDevice::generic_alloc(device_memory &mem) mmem->use_UMA = false; } + if (max_working_set_exceeded()) { +set_error("System is out of GPU memory"); +return nullptr; + } + return mmem; } @@ -921,9 +934,8 @@ void MetalDevice::tex_alloc(device_texture &mem) << string_human_readable_size(mem.memory_size()) << ")"; mtlTexture = [mtlDevice newTextureWithDescriptor:desc]; -assert(mtlTexture); - if (!mtlTexture) { + set_error("System is out of GPU memory"); return; } @@ -955,7 +967,10 @@ void MetalDevice::tex_alloc(device_texture &mem) << string_human_readable_size(mem.memory_size()) << ")"; mtlTexture = [mtlDevice newTextureWithDescriptor:desc]; -assert(mtlTexture); +if (!mtlTexture) { + set_error("System is out of GPU memory"); + return; +} [mtlTexture replaceRegion:MTLRegionMake2D(0, 0, mem.data_width, mem.data_height) mipmapLevel:0 @@ -1017,6 +1032,10 @@ void MetalDevice::tex_alloc(device_texture &mem) need_texture_info = true; texture_info[slot].data = uint64_t(slot) | (sampler_index << 32); + + if (max_working_set_exceeded()) { +set_error("System is out of GPU memory"); + } } void MetalDevice::tex_free(device_texture &mem) @@ -1077,6 +1096,10 @@ void MetalDevice::build_bvh(BVH *bvh, Progress &progress, bool refit) } } } + + if (max_working_set_exceeded()) { +set_error("System is out of GPU memory"); + } } CCL_NAMESPACE_END ___ Bf-blender-cvs mailing list Bf-blender-cvs@blender.org List details, subscription details or unsubscribe: https://lists.blender.org/mailman/listinfo/bf-blender-cvs
[Bf-blender-cvs] [3e247f0f76e] blender-v3.3-release: Cycles: Enable MetalRT pointclouds & other fixes
Commit: 3e247f0f76ec98a09ce0f206a7e6878cb1521807 Author: Michael Jones Date: Mon Nov 14 16:51:48 2022 + Branches: blender-v3.3-release https://developer.blender.org/rB3e247f0f76ec98a09ce0f206a7e6878cb1521807 Cycles: Enable MetalRT pointclouds & other fixes Differential Revision: https://developer.blender.org/D16499 === M intern/cycles/device/metal/bvh.mm M intern/cycles/device/metal/device_impl.mm M intern/cycles/device/metal/kernel.h M intern/cycles/device/metal/kernel.mm M intern/cycles/kernel/data_template.h M intern/cycles/kernel/device/metal/bvh.h M intern/cycles/kernel/device/metal/kernel.metal M intern/cycles/scene/object.cpp === diff --git a/intern/cycles/device/metal/bvh.mm b/intern/cycles/device/metal/bvh.mm index 09c4ace081e..a7fd64d3c98 100644 --- a/intern/cycles/device/metal/bvh.mm +++ b/intern/cycles/device/metal/bvh.mm @@ -496,7 +496,7 @@ bool BVHMetal::build_BLAS_pointcloud(Progress &progress, num_motion_steps = pointcloud->get_motion_steps(); } -const size_t num_aabbs = num_motion_steps; +const size_t num_aabbs = num_motion_steps * num_points; MTLResourceOptions storage_mode; if (device.hasUnifiedMemory) { @@ -757,6 +757,10 @@ bool BVHMetal::build_TLAS(Progress &progress, } } +if (num_instances == 0) { + return false; +} + /**/ BVH_status("Building TLAS | %7d instances", (int)num_instances); /**/ diff --git a/intern/cycles/device/metal/device_impl.mm b/intern/cycles/device/metal/device_impl.mm index d1250b83d22..6feeaa0707c 100644 --- a/intern/cycles/device/metal/device_impl.mm +++ b/intern/cycles/device/metal/device_impl.mm @@ -301,6 +301,9 @@ void MetalDevice::make_source(MetalPipelineType pso_type, const uint kernel_feat MD5Hash md5; md5.append(baked_constants); md5.append(source); + if (use_metalrt) { +md5.append(std::to_string(kernel_features & METALRT_FEATURE_MASK)); + } source_md5[pso_type] = md5.get_hex(); } diff --git a/intern/cycles/device/metal/kernel.h b/intern/cycles/device/metal/kernel.h index 11393f8b7e1..3e88d2daea7 100644 --- 
a/intern/cycles/device/metal/kernel.h +++ b/intern/cycles/device/metal/kernel.h @@ -54,6 +54,10 @@ enum MetalPipelineType { PSO_NUM }; +# define METALRT_FEATURE_MASK \ +(KERNEL_FEATURE_HAIR | KERNEL_FEATURE_HAIR_THICK | KERNEL_FEATURE_POINTCLOUD | \ + KERNEL_FEATURE_OBJECT_MOTION) + const char *kernel_type_as_string(MetalPipelineType pso_type); struct MetalKernelPipeline { @@ -67,9 +71,7 @@ struct MetalKernelPipeline { KernelData kernel_data_; bool use_metalrt; - bool metalrt_hair; - bool metalrt_hair_thick; - bool metalrt_pointcloud; + uint32_t metalrt_features = 0; int threads_per_threadgroup; diff --git a/intern/cycles/device/metal/kernel.mm b/intern/cycles/device/metal/kernel.mm index f3a2fc9ec6c..e22b0159108 100644 --- a/intern/cycles/device/metal/kernel.mm +++ b/intern/cycles/device/metal/kernel.mm @@ -225,12 +225,9 @@ void ShaderCache::load_kernel(DeviceKernel device_kernel, /* metalrt options */ request.pipeline->use_metalrt = device->use_metalrt; - request.pipeline->metalrt_hair = device->use_metalrt && - (device->kernel_features & KERNEL_FEATURE_HAIR); - request.pipeline->metalrt_hair_thick = device->use_metalrt && - (device->kernel_features & KERNEL_FEATURE_HAIR_THICK); - request.pipeline->metalrt_pointcloud = device->use_metalrt && - (device->kernel_features & KERNEL_FEATURE_POINTCLOUD); + request.pipeline->metalrt_features = device->use_metalrt ? 
+ (device->kernel_features & METALRT_FEATURE_MASK) : + 0; { thread_scoped_lock lock(cache_mutex); @@ -267,9 +264,13 @@ MetalKernelPipeline *ShaderCache::get_best_pipeline(DeviceKernel kernel, const M /* metalrt options */ bool use_metalrt = device->use_metalrt; - bool metalrt_hair = use_metalrt && (device->kernel_features & KERNEL_FEATURE_HAIR); - bool metalrt_hair_thick = use_metalrt && (device->kernel_features & KERNEL_FEATURE_HAIR_THICK); - bool metalrt_pointcloud = use_metalrt && (device->kernel_features & KERNEL_FEATURE_POINTCLOUD); + bool device_metalrt_hair = use_metalrt && device->kernel_features & KERNEL_FEATURE_HAIR; + bool device_metalrt_hair_thick = use_metalrt && + device->kernel_features & KERNEL_F
[Bf-blender-cvs] [021c8c7cd0c] blender-v3.3-release: Cycles: Tweak inlining policy on Metal
Commit: 021c8c7cd0c7472eb182d72c11d7201faa13c1f2 Author: Michael Jones Date: Tue Sep 27 17:01:17 2022 +0100 Branches: blender-v3.3-release https://developer.blender.org/rB021c8c7cd0c7472eb182d72c11d7201faa13c1f2 Cycles: Tweak inlining policy on Metal This patch optimises the Metal inlining policy. It gives a small speedup (2-3% on M1 Max) with no notable compilation slowdown vs what is already in master. Previously noted compilation slowdowns (as reported in T100102) were caused by forcing inlining for `ccl_device`, but we get better rendering perf by relying on compiler heuristics in these cases. Backported to 3.3 because this also fixes a test failure. Differential Revision: https://developer.blender.org/D16081 === M intern/cycles/kernel/device/metal/compat.h === diff --git a/intern/cycles/kernel/device/metal/compat.h b/intern/cycles/kernel/device/metal/compat.h index b86d1f64307..c321f4451f6 100644 --- a/intern/cycles/kernel/device/metal/compat.h +++ b/intern/cycles/kernel/device/metal/compat.h @@ -29,28 +29,13 @@ using namespace metal::raytracing; /* Qualifiers */ -/* Inline everything for Apple GPUs. This gives ~1.1x speedup and 10% spill - * reduction for integator_shade_surface. However it comes at the cost of - * longer compile times (~4.5 minutes on M1 Max) and is disabled for that - * reason, until there is a user option to manually enable it. 
*/ - -#if 0 // defined(__KERNEL_METAL_APPLE__) - -# define ccl_device __attribute__((always_inline)) -# define ccl_device_inline __attribute__((always_inline)) -# define ccl_device_forceinline __attribute__((always_inline)) -# define ccl_device_noinline __attribute__((always_inline)) - +#define ccl_device +#define ccl_device_inline ccl_device __attribute__((always_inline)) +#define ccl_device_forceinline ccl_device __attribute__((always_inline)) +#if defined(__KERNEL_METAL_APPLE__) +# define ccl_device_noinline ccl_device #else - -# define ccl_device -# define ccl_device_inline ccl_device -# define ccl_device_forceinline ccl_device -# if defined(__KERNEL_METAL_APPLE__) -#define ccl_device_noinline ccl_device -# else -#define ccl_device_noinline ccl_device __attribute__((noinline)) -# endif +# define ccl_device_noinline ccl_device __attribute__((noinline)) #endif #define ccl_device_noinline_cpu ccl_device ___ Bf-blender-cvs mailing list Bf-blender-cvs@blender.org List details, subscription details or unsubscribe: https://lists.blender.org/mailman/listinfo/bf-blender-cvs
[Bf-blender-cvs] [edae67c0367] blender-v3.4-release: Cycles: Enable MetalRT pointclouds & other fixes
Commit: edae67c036795bc3d3bd8fdb93875e68a99c7a18 Author: Michael Jones Date: Mon Nov 14 16:51:48 2022 + Branches: blender-v3.4-release https://developer.blender.org/rBedae67c036795bc3d3bd8fdb93875e68a99c7a18 Cycles: Enable MetalRT pointclouds & other fixes Cherrypicking D16499 into blender-v3.4-release === M intern/cycles/device/metal/bvh.mm M intern/cycles/device/metal/device_impl.mm M intern/cycles/device/metal/kernel.h M intern/cycles/device/metal/kernel.mm M intern/cycles/kernel/data_template.h M intern/cycles/kernel/device/metal/bvh.h M intern/cycles/kernel/device/metal/kernel.metal M intern/cycles/scene/object.cpp === diff --git a/intern/cycles/device/metal/bvh.mm b/intern/cycles/device/metal/bvh.mm index 09c4ace081e..a7fd64d3c98 100644 --- a/intern/cycles/device/metal/bvh.mm +++ b/intern/cycles/device/metal/bvh.mm @@ -496,7 +496,7 @@ bool BVHMetal::build_BLAS_pointcloud(Progress &progress, num_motion_steps = pointcloud->get_motion_steps(); } -const size_t num_aabbs = num_motion_steps; +const size_t num_aabbs = num_motion_steps * num_points; MTLResourceOptions storage_mode; if (device.hasUnifiedMemory) { @@ -757,6 +757,10 @@ bool BVHMetal::build_TLAS(Progress &progress, } } +if (num_instances == 0) { + return false; +} + /**/ BVH_status("Building TLAS | %7d instances", (int)num_instances); /**/ diff --git a/intern/cycles/device/metal/device_impl.mm b/intern/cycles/device/metal/device_impl.mm index 6f1042b1e55..604abc2be1a 100644 --- a/intern/cycles/device/metal/device_impl.mm +++ b/intern/cycles/device/metal/device_impl.mm @@ -307,6 +307,9 @@ void MetalDevice::make_source(MetalPipelineType pso_type, const uint kernel_feat MD5Hash md5; md5.append(baked_constants); md5.append(source); + if (use_metalrt) { +md5.append(std::to_string(kernel_features & METALRT_FEATURE_MASK)); + } source_md5[pso_type] = md5.get_hex(); } diff --git a/intern/cycles/device/metal/kernel.h b/intern/cycles/device/metal/kernel.h index 11393f8b7e1..3e88d2daea7 100644 --- 
a/intern/cycles/device/metal/kernel.h +++ b/intern/cycles/device/metal/kernel.h @@ -54,6 +54,10 @@ enum MetalPipelineType { PSO_NUM }; +# define METALRT_FEATURE_MASK \ +(KERNEL_FEATURE_HAIR | KERNEL_FEATURE_HAIR_THICK | KERNEL_FEATURE_POINTCLOUD | \ + KERNEL_FEATURE_OBJECT_MOTION) + const char *kernel_type_as_string(MetalPipelineType pso_type); struct MetalKernelPipeline { @@ -67,9 +71,7 @@ struct MetalKernelPipeline { KernelData kernel_data_; bool use_metalrt; - bool metalrt_hair; - bool metalrt_hair_thick; - bool metalrt_pointcloud; + uint32_t metalrt_features = 0; int threads_per_threadgroup; diff --git a/intern/cycles/device/metal/kernel.mm b/intern/cycles/device/metal/kernel.mm index 55938d1a03a..2136cb06ed2 100644 --- a/intern/cycles/device/metal/kernel.mm +++ b/intern/cycles/device/metal/kernel.mm @@ -232,12 +232,9 @@ void ShaderCache::load_kernel(DeviceKernel device_kernel, /* metalrt options */ request.pipeline->use_metalrt = device->use_metalrt; - request.pipeline->metalrt_hair = device->use_metalrt && - (device->kernel_features & KERNEL_FEATURE_HAIR); - request.pipeline->metalrt_hair_thick = device->use_metalrt && - (device->kernel_features & KERNEL_FEATURE_HAIR_THICK); - request.pipeline->metalrt_pointcloud = device->use_metalrt && - (device->kernel_features & KERNEL_FEATURE_POINTCLOUD); + request.pipeline->metalrt_features = device->use_metalrt ? 
+ (device->kernel_features & METALRT_FEATURE_MASK) : + 0; { thread_scoped_lock lock(cache_mutex); @@ -274,9 +271,13 @@ MetalKernelPipeline *ShaderCache::get_best_pipeline(DeviceKernel kernel, const M /* metalrt options */ bool use_metalrt = device->use_metalrt; - bool metalrt_hair = use_metalrt && (device->kernel_features & KERNEL_FEATURE_HAIR); - bool metalrt_hair_thick = use_metalrt && (device->kernel_features & KERNEL_FEATURE_HAIR_THICK); - bool metalrt_pointcloud = use_metalrt && (device->kernel_features & KERNEL_FEATURE_POINTCLOUD); + bool device_metalrt_hair = use_metalrt && device->kernel_features & KERNEL_FEATURE_HAIR; + bool device_metalrt_hair_thick = use_metalrt && + device->kernel_features & KERNEL_FEATURE_HAIR_TH
[Bf-blender-cvs] [b0e2e454967] master: Cycles: Enable MetalRT pointclouds & other fixes
Commit: b0e2e4549676817f23a6122aeeefc0d07bc62a42 Author: Michael Jones Date: Mon Nov 14 15:35:47 2022 + Branches: master https://developer.blender.org/rBb0e2e4549676817f23a6122aeeefc0d07bc62a42 Cycles: Enable MetalRT pointclouds & other fixes Code authored by Marco Giordano. This fixes pointcloud rendering on MetalRT and some other subtle MetalRT bugs: - Incorrect kernel hashing - Missing specialisation constants - Incorrect visibility filtering - Missing null pointer check Reviewed By: brecht Differential Revision: https://developer.blender.org/D16499 === M intern/cycles/device/metal/bvh.mm M intern/cycles/device/metal/device_impl.mm M intern/cycles/device/metal/kernel.h M intern/cycles/device/metal/kernel.mm M intern/cycles/kernel/data_template.h M intern/cycles/kernel/device/metal/bvh.h M intern/cycles/kernel/device/metal/kernel.metal M intern/cycles/scene/object.cpp === diff --git a/intern/cycles/device/metal/bvh.mm b/intern/cycles/device/metal/bvh.mm index 09c4ace081e..a7fd64d3c98 100644 --- a/intern/cycles/device/metal/bvh.mm +++ b/intern/cycles/device/metal/bvh.mm @@ -496,7 +496,7 @@ bool BVHMetal::build_BLAS_pointcloud(Progress &progress, num_motion_steps = pointcloud->get_motion_steps(); } -const size_t num_aabbs = num_motion_steps; +const size_t num_aabbs = num_motion_steps * num_points; MTLResourceOptions storage_mode; if (device.hasUnifiedMemory) { @@ -757,6 +757,10 @@ bool BVHMetal::build_TLAS(Progress &progress, } } +if (num_instances == 0) { + return false; +} + /**/ BVH_status("Building TLAS | %7d instances", (int)num_instances); /**/ diff --git a/intern/cycles/device/metal/device_impl.mm b/intern/cycles/device/metal/device_impl.mm index 6f1042b1e55..604abc2be1a 100644 --- a/intern/cycles/device/metal/device_impl.mm +++ b/intern/cycles/device/metal/device_impl.mm @@ -307,6 +307,9 @@ void MetalDevice::make_source(MetalPipelineType pso_type, const uint kernel_feat MD5Hash md5; md5.append(baked_constants); md5.append(source); + if (use_metalrt) { 
+md5.append(std::to_string(kernel_features & METALRT_FEATURE_MASK)); + } source_md5[pso_type] = md5.get_hex(); } diff --git a/intern/cycles/device/metal/kernel.h b/intern/cycles/device/metal/kernel.h index 11393f8b7e1..3e88d2daea7 100644 --- a/intern/cycles/device/metal/kernel.h +++ b/intern/cycles/device/metal/kernel.h @@ -54,6 +54,10 @@ enum MetalPipelineType { PSO_NUM }; +# define METALRT_FEATURE_MASK \ +(KERNEL_FEATURE_HAIR | KERNEL_FEATURE_HAIR_THICK | KERNEL_FEATURE_POINTCLOUD | \ + KERNEL_FEATURE_OBJECT_MOTION) + const char *kernel_type_as_string(MetalPipelineType pso_type); struct MetalKernelPipeline { @@ -67,9 +71,7 @@ struct MetalKernelPipeline { KernelData kernel_data_; bool use_metalrt; - bool metalrt_hair; - bool metalrt_hair_thick; - bool metalrt_pointcloud; + uint32_t metalrt_features = 0; int threads_per_threadgroup; diff --git a/intern/cycles/device/metal/kernel.mm b/intern/cycles/device/metal/kernel.mm index 35cf832c537..86e5a78692e 100644 --- a/intern/cycles/device/metal/kernel.mm +++ b/intern/cycles/device/metal/kernel.mm @@ -274,12 +274,9 @@ void ShaderCache::load_kernel(DeviceKernel device_kernel, /* metalrt options */ request.pipeline->use_metalrt = device->use_metalrt; - request.pipeline->metalrt_hair = device->use_metalrt && - (device->kernel_features & KERNEL_FEATURE_HAIR); - request.pipeline->metalrt_hair_thick = device->use_metalrt && - (device->kernel_features & KERNEL_FEATURE_HAIR_THICK); - request.pipeline->metalrt_pointcloud = device->use_metalrt && - (device->kernel_features & KERNEL_FEATURE_POINTCLOUD); + request.pipeline->metalrt_features = device->use_metalrt ? 
+ (device->kernel_features & METALRT_FEATURE_MASK) : + 0; { thread_scoped_lock lock(cache_mutex); @@ -316,9 +313,13 @@ MetalKernelPipeline *ShaderCache::get_best_pipeline(DeviceKernel kernel, const M /* metalrt options */ bool use_metalrt = device->use_metalrt; - bool metalrt_hair = use_metalrt && (device->kernel_features & KERNEL_FEATURE_HAIR); - bool metalrt_hair_thick = use_metalrt && (device->kernel_features & KERNEL_FEATURE_HAIR_THICK); - bool metalrt_pointcloud = use_metalrt && (device->kernel_features & KERNEL_FE
[Bf-blender-cvs] [2c596319a48] master: Cycles: Cache only up to 5 kernels of each type on Metal
Commit: 2c596319a4888aa40bfdf41f9ea5d446179141d0 Author: Michael Jones Date: Fri Nov 11 18:10:16 2022 + Branches: master https://developer.blender.org/rB2c596319a4888aa40bfdf41f9ea5d446179141d0 Cycles: Cache only up to 5 kernels of each type on Metal This patch adapts D14754 for the Metal backend. Kernels of the same type are already organised into subdirectories which simplifies type matching. Reviewed By: brecht Differential Revision: https://developer.blender.org/D16469 === M intern/cycles/device/metal/kernel.mm M intern/cycles/util/path.cpp M intern/cycles/util/path.h === diff --git a/intern/cycles/device/metal/kernel.mm b/intern/cycles/device/metal/kernel.mm index dc8af9a5358..35cf832c537 100644 --- a/intern/cycles/device/metal/kernel.mm +++ b/intern/cycles/device/metal/kernel.mm @@ -618,7 +618,9 @@ void MetalKernelPipeline::compile() metalbin_path = path_cache_get(path_join("kernels", metalbin_name)); path_create_directories(metalbin_path); -if (path_exists(metalbin_path) && use_binary_archive) { +/* Retrieve shader binary from disk, and update the file timestamp for LRU purging to work as + * intended. 
*/ +if (use_binary_archive && path_cache_kernel_exists_and_mark_used(metalbin_path)) { if (@available(macOS 11.0, *)) { MTLBinaryArchiveDescriptor *archiveDesc = [[MTLBinaryArchiveDescriptor alloc] init]; archiveDesc.url = [NSURL fileURLWithPath:@(metalbin_path.c_str())]; @@ -695,6 +697,9 @@ void MetalKernelPipeline::compile() metal_printf("Failed to save binary archive, error:\n%s\n", [[error localizedDescription] UTF8String]); } +else { + path_cache_kernel_mark_added_and_clear_old(metalbin_path); +} } } }; diff --git a/intern/cycles/util/path.cpp b/intern/cycles/util/path.cpp index 17cff2f2977..cb6b8d7a740 100644 --- a/intern/cycles/util/path.cpp +++ b/intern/cycles/util/path.cpp @@ -2,8 +2,11 @@ * Copyright 2011-2022 Blender Foundation */ #include "util/path.h" +#include "util/algorithm.h" +#include "util/map.h" #include "util/md5.h" #include "util/string.h" +#include "util/vector.h" #include #include @@ -898,19 +901,54 @@ FILE *path_fopen(const string &path, const string &mode) #endif } -void path_cache_clear_except(const string &name, const set &except) +/* LRU Cache for Kernels */ + +static void path_cache_kernel_mark_used(const string &path) { - string dir = path_user_get("cache"); + std::time_t current_time = std::time(nullptr); + OIIO::Filesystem::last_write_time(path, current_time); +} - if (path_exists(dir)) { -directory_iterator it(dir), it_end; +bool path_cache_kernel_exists_and_mark_used(const string &path) +{ + if (path_exists(path)) { +path_cache_kernel_mark_used(path); +return true; + } + else { +return false; + } +} -for (; it != it_end; ++it) { - string filename = path_filename(it->path()); +void path_cache_kernel_mark_added_and_clear_old(const string &new_path, +const size_t max_old_kernel_of_same_type) +{ + path_cache_kernel_mark_used(new_path); + + string dir = path_dirname(new_path); + if (!path_exists(dir)) { +return; + } + + /* Remove older kernels within the same directory. 
*/ + directory_iterator it(dir), it_end; + vector> same_kernel_types; + + for (; it != it_end; ++it) { +const string &path = it->path(); +if (path == new_path) { + continue; +} + +std::time_t last_time = OIIO::Filesystem::last_write_time(path); +same_kernel_types.emplace_back(last_time, path); + } + + if (same_kernel_types.size() > max_old_kernel_of_same_type) { +sort(same_kernel_types.begin(), same_kernel_types.end()); - if (string_startswith(filename, name.c_str())) -if (except.find(filename) == except.end()) - path_remove(it->path()); +for (int i = 0; i < same_kernel_types.size() - max_old_kernel_of_same_type; i++) { + path_remove(same_kernel_types[i].second); } } } diff --git a/intern/cycles/util/path.h b/intern/cycles/util/path.h index 48b1fb65919..6d02267e182 100644 --- a/intern/cycles/util/path.h +++ b/intern/cycles/util/path.h @@ -55,8 +55,15 @@ bool path_remove(const string &path); /* source code utility */ string path_source_replace_includes(const string &source, const string &path); -/* cache utility */ -void path_cache_clear_except(const string &name, const set &except); +/* Simple least-recently-used cache for kernels. + * + * Kernels of same type are cached in the same directory. + * Whenever a kernel is used, its last modified tim
[Bf-blender-cvs] [74140d41b1d] master: Cycles: Apple GPU threadgroup tuning
Commit: 74140d41b1dc8e447658ca77a061fc7d9a47052c Author: Michael Jones Date: Fri Nov 4 15:59:55 2022 + Branches: master https://developer.blender.org/rB74140d41b1dc8e447658ca77a061fc7d9a47052c Cycles: Apple GPU threadgroup tuning This patch tunes maximum threads-per-threadgroup and threads-per-block for faster renders on Apple GPUs. Appropriate tuning is selected based on the GPU architecture (M1 or M2). We see a benchmark uplift of around 5-10% on M1 family chips. Similar uplift is expected on M2 with upcoming OS changes. (Ref T101931) Reviewed By: brecht Maniphest Tasks: T101931 Differential Revision: https://developer.blender.org/D16299 === M intern/cycles/device/metal/kernel.mm === diff --git a/intern/cycles/device/metal/kernel.mm b/intern/cycles/device/metal/kernel.mm index 55938d1a03a..dc8af9a5358 100644 --- a/intern/cycles/device/metal/kernel.mm +++ b/intern/cycles/device/metal/kernel.mm @@ -45,6 +45,36 @@ bool kernel_has_intersection(DeviceKernel device_kernel) struct ShaderCache { ShaderCache(id _mtlDevice) : mtlDevice(_mtlDevice) { +/* Initialize occupancy tuning LUT. 
*/ +if (MetalInfo::get_device_vendor(mtlDevice) == METAL_GPU_APPLE) { + switch (MetalInfo::get_apple_gpu_architecture(mtlDevice)) { +default: +case APPLE_M2: + occupancy_tuning[DEVICE_KERNEL_INTEGRATOR_COMPACT_SHADOW_STATES] = {32, 32}; + occupancy_tuning[DEVICE_KERNEL_INTEGRATOR_INIT_FROM_CAMERA] = {832, 32}; + occupancy_tuning[DEVICE_KERNEL_INTEGRATOR_INTERSECT_CLOSEST] = {64, 64}; + occupancy_tuning[DEVICE_KERNEL_INTEGRATOR_INTERSECT_SHADOW] = {64, 64}; + occupancy_tuning[DEVICE_KERNEL_INTEGRATOR_INTERSECT_SUBSURFACE] = {704, 32}; + occupancy_tuning[DEVICE_KERNEL_INTEGRATOR_QUEUED_PATHS_ARRAY] = {1024, 256}; + occupancy_tuning[DEVICE_KERNEL_INTEGRATOR_SHADE_BACKGROUND] = {64, 32}; + occupancy_tuning[DEVICE_KERNEL_INTEGRATOR_SHADE_SHADOW] = {256, 256}; + occupancy_tuning[DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE] = {448, 384}; + occupancy_tuning[DEVICE_KERNEL_INTEGRATOR_SORTED_PATHS_ARRAY] = {1024, 1024}; + break; +case APPLE_M1: + occupancy_tuning[DEVICE_KERNEL_INTEGRATOR_COMPACT_SHADOW_STATES] = {256, 128}; + occupancy_tuning[DEVICE_KERNEL_INTEGRATOR_INIT_FROM_CAMERA] = {768, 32}; + occupancy_tuning[DEVICE_KERNEL_INTEGRATOR_INTERSECT_CLOSEST] = {512, 128}; + occupancy_tuning[DEVICE_KERNEL_INTEGRATOR_INTERSECT_SHADOW] = {384, 128}; + occupancy_tuning[DEVICE_KERNEL_INTEGRATOR_INTERSECT_SUBSURFACE] = {512, 64}; + occupancy_tuning[DEVICE_KERNEL_INTEGRATOR_QUEUED_PATHS_ARRAY] = {512, 256}; + occupancy_tuning[DEVICE_KERNEL_INTEGRATOR_SHADE_BACKGROUND] = {512, 128}; + occupancy_tuning[DEVICE_KERNEL_INTEGRATOR_SHADE_SHADOW] = {384, 32}; + occupancy_tuning[DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE] = {576, 384}; + occupancy_tuning[DEVICE_KERNEL_INTEGRATOR_SORTED_PATHS_ARRAY] = {832, 832}; + break; + } +} } ~ShaderCache(); @@ -73,6 +103,11 @@ struct ShaderCache { std::function completionHandler; }; + struct OccupancyTuningParameters { +int threads_per_threadgroup = 0; +int num_threads_per_block = 0; + } occupancy_tuning[DEVICE_KERNEL_NUM]; + std::mutex cache_mutex; 
PipelineCollection pipelines[DEVICE_KERNEL_NUM]; @@ -230,6 +265,13 @@ void ShaderCache::load_kernel(DeviceKernel device_kernel, request.pipeline->device_kernel = device_kernel; request.pipeline->threads_per_threadgroup = device->max_threads_per_threadgroup; + if (occupancy_tuning[device_kernel].threads_per_threadgroup) { +request.pipeline->threads_per_threadgroup = +occupancy_tuning[device_kernel].threads_per_threadgroup; +request.pipeline->num_threads_per_block = +occupancy_tuning[device_kernel].num_threads_per_block; + } + /* metalrt options */ request.pipeline->use_metalrt = device->use_metalrt; request.pipeline->metalrt_hair = device->use_metalrt && @@ -374,13 +416,6 @@ void MetalKernelPipeline::compile() const std::string function_name = std::string("cycles_metal_") + device_kernel_as_string(device_kernel); - int threads_per_threadgroup = this->threads_per_threadgroup; - if (device_kernel > DEVICE_KERNEL_INTEGRATOR_MEGAKERNEL && - device_kernel < DEVICE_KERNEL_INTEGRATOR_RESET) { -/* Always use 512 for the sorting kernels */ -threads_per_threadgroup = 512; - } - NSString *entryPoint = [@(function_name.c_str()) copy]; NSError *error = NULL; @@ -644,12 +679,14 @@ void MetalKernelPipeline::compile() return; } -int num_threads_per_block =
[Bf-blender-cvs] [633d314b75a] master: Fix T101790: MNEE caustic settings are not visible in the UI when using Metal
Commit: 633d314b75a1e84c9ed93e09047f87f34ddab802 Author: Michael Jones Date: Tue Oct 25 19:36:13 2022 +0100 Branches: master https://developer.blender.org/rB633d314b75a1e84c9ed93e09047f87f34ddab802 Fix T101790: MNEE caustic settings are not visible in the UI when using Metal This patch fixes T101790 by adding a macOS version check for deciding whether to show the caustics settings in the UI (MNEE kernels don't compile on macOS < 13.0) Reviewed By: brecht Maniphest Tasks: T101790 Differential Revision: https://developer.blender.org/D16339 === M intern/cycles/blender/addon/ui.py === diff --git a/intern/cycles/blender/addon/ui.py b/intern/cycles/blender/addon/ui.py index f763fe0eb0b..581533db0b6 100644 --- a/intern/cycles/blender/addon/ui.py +++ b/intern/cycles/blender/addon/ui.py @@ -149,6 +149,14 @@ def get_effective_preview_denoiser(context): return 'OIDN' +def use_mnee(context): +# The MNEE kernel doesn't compile on macOS < 13. +if use_metal(context): +import platform +v, _, _ = platform.mac_ver() +if float(v) < 13.0: +return False +return True class CYCLES_RENDER_PT_sampling(CyclesButtonsPanel, Panel): bl_label = "Sampling" @@ -1235,7 +1243,7 @@ class CYCLES_OBJECT_PT_shading_caustics(CyclesButtonsPanel, Panel): @classmethod def poll(cls, context): -return CyclesButtonsPanel.poll(context) and not use_metal(context) and context.object.type != 'LIGHT' +return CyclesButtonsPanel.poll(context) and use_mnee(context) and context.object.type != 'LIGHT' def draw(self, context): layout = self.layout @@ -1449,7 +1457,7 @@ class CYCLES_LIGHT_PT_light(CyclesButtonsPanel, Panel): sub.active = not (light.type == 'AREA' and clamp.is_portal) sub.prop(clamp, "cast_shadow") sub.prop(clamp, "use_multiple_importance_sampling", text="Multiple Importance") -if not use_metal(context): +if use_mnee(context): sub.prop(clamp, "is_caustics_light", text="Shadow Caustics") if light.type == 'AREA': ___ Bf-blender-cvs mailing list Bf-blender-cvs@blender.org List details, subscription details 
or unsubscribe: https://lists.blender.org/mailman/listinfo/bf-blender-cvs
[Bf-blender-cvs] [8dd7b5b26b3] master: Cycles: Metal integrator state size tuning
Commit: 8dd7b5b26b394207b5941d49750f7e3abadaf82a Author: Michael Jones Date: Mon Oct 24 10:23:56 2022 +0100 Branches: master https://developer.blender.org/rB8dd7b5b26b394207b5941d49750f7e3abadaf82a Cycles: Metal integrator state size tuning This patch tunes the integrator state sizing for Metal (`num_concurrent_states` and `num_concurrent_busy_states`). On all GPU architectures, we adjust the busy:total states ratio to be 1:4 which gives better rendering performance than the previous 1:16 ratio (independent of total state count). This gives a small performance uplift (e.g. 2-3% on M1 Ultra). Additionally for M2 architectures, we double the overall state size if there is available headroom. Inclusive of the first change, we can expect uplift of close to 10% in future, as this results in larger dispatch sizes and minimises work submission overheads. In order to make an accurate determination of available headroom, we defer the calculation of `num_concurrent_states` and `num_concurrent_busy_states` until the time of integrator state allocation (i.e. after all of the scene data h [...] 
Reviewed By: brecht Differential Revision: https://developer.blender.org/D16313 === M intern/cycles/device/cuda/queue.cpp M intern/cycles/device/cuda/queue.h M intern/cycles/device/hip/queue.cpp M intern/cycles/device/hip/queue.h M intern/cycles/device/metal/device_impl.mm M intern/cycles/device/metal/kernel.mm M intern/cycles/device/metal/queue.h M intern/cycles/device/metal/queue.mm M intern/cycles/device/oneapi/queue.cpp M intern/cycles/device/oneapi/queue.h M intern/cycles/device/queue.h M intern/cycles/integrator/path_trace_work_gpu.cpp === diff --git a/intern/cycles/device/cuda/queue.cpp b/intern/cycles/device/cuda/queue.cpp index 84b0a1e0dd6..69fae03e32c 100644 --- a/intern/cycles/device/cuda/queue.cpp +++ b/intern/cycles/device/cuda/queue.cpp @@ -49,7 +49,7 @@ int CUDADeviceQueue::num_concurrent_states(const size_t state_size) const return num_states; } -int CUDADeviceQueue::num_concurrent_busy_states() const +int CUDADeviceQueue::num_concurrent_busy_states(const size_t /*state_size*/) const { const int max_num_threads = cuda_device_->get_num_multiprocessors() * cuda_device_->get_max_num_threads_per_multiprocessor(); diff --git a/intern/cycles/device/cuda/queue.h b/intern/cycles/device/cuda/queue.h index b450f5b3592..7107afe70c9 100644 --- a/intern/cycles/device/cuda/queue.h +++ b/intern/cycles/device/cuda/queue.h @@ -23,7 +23,7 @@ class CUDADeviceQueue : public DeviceQueue { ~CUDADeviceQueue(); virtual int num_concurrent_states(const size_t state_size) const override; - virtual int num_concurrent_busy_states() const override; + virtual int num_concurrent_busy_states(const size_t state_size) const override; virtual void init_execution() override; diff --git a/intern/cycles/device/hip/queue.cpp b/intern/cycles/device/hip/queue.cpp index 3f8b6267100..e93a9b4df3a 100644 --- a/intern/cycles/device/hip/queue.cpp +++ b/intern/cycles/device/hip/queue.cpp @@ -49,7 +49,7 @@ int HIPDeviceQueue::num_concurrent_states(const size_t state_size) const return num_states; } 
-int HIPDeviceQueue::num_concurrent_busy_states() const +int HIPDeviceQueue::num_concurrent_busy_states(const size_t /*state_size*/) const { const int max_num_threads = hip_device_->get_num_multiprocessors() * hip_device_->get_max_num_threads_per_multiprocessor(); diff --git a/intern/cycles/device/hip/queue.h b/intern/cycles/device/hip/queue.h index 729d8a19acb..df0678108af 100644 --- a/intern/cycles/device/hip/queue.h +++ b/intern/cycles/device/hip/queue.h @@ -23,7 +23,7 @@ class HIPDeviceQueue : public DeviceQueue { ~HIPDeviceQueue(); virtual int num_concurrent_states(const size_t state_size) const override; - virtual int num_concurrent_busy_states() const override; + virtual int num_concurrent_busy_states(const size_t state_size) const override; virtual void init_execution() override; diff --git a/intern/cycles/device/metal/device_impl.mm b/intern/cycles/device/metal/device_impl.mm index 4b929b6bc0a..6f1042b1e55 100644 --- a/intern/cycles/device/metal/device_impl.mm +++ b/intern/cycles/device/metal/device_impl.mm @@ -296,9 +296,11 @@ void MetalDevice::make_source(MetalPipelineType pso_type, const uint kernel_feat } source = global_defines + source; +# if 0 metal_printf("\n%s\n\%s\n", global_defines.c_str(), baked_constants.c_str()); +# endif /* Generate an MD5 from the source and include any baked constants. This is used when caching * PSOs. */ diff --git a/intern/cycles/device/metal/kernel.mm
[Bf-blender-cvs] [ba67a383fa3] master: Cycles: Enable MNEE on Metal (macOS >= 13)
Commit: ba67a383fa3931b95bebd9ce92c9fc71928fb409 Author: Michael Jones Date: Wed Oct 12 17:06:06 2022 +0100 Branches: master https://developer.blender.org/rBba67a383fa3931b95bebd9ce92c9fc71928fb409 Cycles: Enable MNEE on Metal (macOS >= 13) This patch enables MNEE on macOS >= 13. There was an inefficiency in the calculation of spill requirements, fixed as of macOS 13. This patch also adds a temporary inlining workaround for a Metal compiler bug which causes `mnee_compute_constraint_derivatives` to behave incorrectly. Reviewed By: brecht Differential Revision: https://developer.blender.org/D16235 === M intern/cycles/device/metal/device_impl.mm M intern/cycles/kernel/integrator/mnee.h M intern/cycles/kernel/types.h M tests/python/cycles_render_tests.py === diff --git a/intern/cycles/device/metal/device_impl.mm b/intern/cycles/device/metal/device_impl.mm index d1250b83d22..6a16d4bb3b4 100644 --- a/intern/cycles/device/metal/device_impl.mm +++ b/intern/cycles/device/metal/device_impl.mm @@ -254,6 +254,10 @@ void MetalDevice::make_source(MetalPipelineType pso_type, const uint kernel_feat break; } + NSProcessInfo *processInfo = [NSProcessInfo processInfo]; + NSOperatingSystemVersion macos_ver = [processInfo operatingSystemVersion]; + global_defines += "#define __KERNEL_METAL_MACOS__ " + to_string(macos_ver.majorVersion) + "\n"; + string &source = this->source[pso_type]; source = "\n#include \"kernel/device/metal/kernel.metal\"\n"; source = path_source_replace_includes(source, path_get("source")); diff --git a/intern/cycles/kernel/integrator/mnee.h b/intern/cycles/kernel/integrator/mnee.h index 038f0379bbc..23885306885 100644 --- a/intern/cycles/kernel/integrator/mnee.h +++ b/intern/cycles/kernel/integrator/mnee.h @@ -279,7 +279,15 @@ ccl_device_forceinline void mnee_setup_manifold_vertex(KernelGlobals kg, } /* Compute constraint derivatives. 
*/ -ccl_device_forceinline bool mnee_compute_constraint_derivatives( + +# if defined(__KERNEL_METAL__) +/* Temporary workaround for front-end compilation bug (incorrect MNEE rendering when this is + * inlined). */ +__attribute__((noinline)) +# else +ccl_device_forceinline +# endif +bool mnee_compute_constraint_derivatives( int vertex_count, ccl_private ManifoldVertex *vertices, ccl_private const float3 &surface_sample_pos, diff --git a/intern/cycles/kernel/types.h b/intern/cycles/kernel/types.h index 1469d915d15..8f7cfd19169 100644 --- a/intern/cycles/kernel/types.h +++ b/intern/cycles/kernel/types.h @@ -85,9 +85,9 @@ CCL_NAMESPACE_BEGIN # define __VOLUME_RECORD_ALL__ #endif /* !__KERNEL_GPU__ */ -/* MNEE currently causes "Compute function exceeds available temporary registers" - * on Metal, disabled for now. */ -#ifndef __KERNEL_METAL__ +/* MNEE caused "Compute function exceeds available temporary registers" in macOS < 13 due to a bug + * in spill buffer allocation sizing. */ +#if !defined(__KERNEL_METAL__) || (__KERNEL_METAL_MACOS__ >= 13) # define __MNEE__ #endif diff --git a/tests/python/cycles_render_tests.py b/tests/python/cycles_render_tests.py index 4f823f854bf..c7e12dd5b7c 100644 --- a/tests/python/cycles_render_tests.py +++ b/tests/python/cycles_render_tests.py @@ -33,7 +33,7 @@ BLACKLIST_OPTIX = [ ] BLACKLIST_METAL = [ -# No MNEE for Metal currently +# MNEE only works on Metal with macOS >= 13 "underwater_caustics.blend", ] ___ Bf-blender-cvs mailing list Bf-blender-cvs@blender.org List details, subscription details or unsubscribe: https://lists.blender.org/mailman/listinfo/bf-blender-cvs
[Bf-blender-cvs] [2653775c666] blender-v3.3-release: Cycles: Disable binary archives on macOS < 13.0
Commit: 2653775c666053d4867b24f3dec88c3fbe654afd Author: Michael Jones Date: Tue Oct 4 18:52:40 2022 +0100 Branches: blender-v3.3-release https://developer.blender.org/rB2653775c666053d4867b24f3dec88c3fbe654afd Cycles: Disable binary archives on macOS < 13.0 (Cherry pick D16082) A bug with binary archives was fixed in macOS 13.0 which stops some spurious kernel recompilations. In older macOS versions, falling back on the system shader cache will prevent recompilations in most instances (this is the same behaviour as in Blender 3.1.x and 3.2.x). Reviewed By: brecht Differential Revision: https://developer.blender.org/D16141 === M intern/cycles/device/metal/kernel.mm === diff --git a/intern/cycles/device/metal/kernel.mm b/intern/cycles/device/metal/kernel.mm index 385cb412b06..f3a2fc9ec6c 100644 --- a/intern/cycles/device/metal/kernel.mm +++ b/intern/cycles/device/metal/kernel.mm @@ -308,22 +308,25 @@ MetalKernelPipeline *ShaderCache::get_best_pipeline(DeviceKernel kernel, const M bool MetalKernelPipeline::should_use_binary_archive() const { - if (auto str = getenv("CYCLES_METAL_DISABLE_BINARY_ARCHIVES")) { -if (atoi(str) != 0) { - /* Don't archive if we have opted out by env var. */ - return false; + /* Issues with binary archives in older macOS versions. */ + if (@available(macOS 13.0, *)) { +if (auto str = getenv("CYCLES_METAL_DISABLE_BINARY_ARCHIVES")) { + if (atoi(str) != 0) { +/* Don't archive if we have opted out by env var. */ +return false; + } } - } - if (pso_type == PSO_GENERIC) { -/* Archive the generic kernels. */ -return true; - } +if (pso_type == PSO_GENERIC) { + /* Archive the generic kernels. */ + return true; +} - if (device_kernel >= DEVICE_KERNEL_INTEGRATOR_SHADE_BACKGROUND && - device_kernel <= DEVICE_KERNEL_INTEGRATOR_SHADE_SHADOW) { -/* Archive all shade kernels - they take a long time to compile. 
*/ -return true; +if (device_kernel >= DEVICE_KERNEL_INTEGRATOR_SHADE_BACKGROUND && +device_kernel <= DEVICE_KERNEL_INTEGRATOR_SHADE_SHADOW) { + /* Archive all shade kernels - they take a long time to compile. */ + return true; +} } /* The remaining kernels are all fast to compile. They may get cached by the system shader cache, ___ Bf-blender-cvs mailing list Bf-blender-cvs@blender.org List details, subscription details or unsubscribe: https://lists.blender.org/mailman/listinfo/bf-blender-cvs
[Bf-blender-cvs] [2b88ee50fb7] master: Cycles: Tweak inlining policy on Metal
Commit: 2b88ee50fb7b3ed7e6c0704eee8b39b404219430 Author: Michael Jones Date: Tue Sep 27 17:01:17 2022 +0100 Branches: master https://developer.blender.org/rB2b88ee50fb7b3ed7e6c0704eee8b39b404219430 Cycles: Tweak inlining policy on Metal This patch optimises the Metal inlining policy. It gives a small speedup (2-3% on M1 Max) with no notable compilation slowdown vs what is already in master. Previously noted compilation slowdowns (as reported in T100102) were caused by forcing inlining for `ccl_device`, but we get better rendering perf by relying on compiler heuristics in these cases. Reviewed By: brecht Differential Revision: https://developer.blender.org/D16081 === M intern/cycles/kernel/device/metal/compat.h === diff --git a/intern/cycles/kernel/device/metal/compat.h b/intern/cycles/kernel/device/metal/compat.h index 130a9ebafae..f689e93e5a2 100644 --- a/intern/cycles/kernel/device/metal/compat.h +++ b/intern/cycles/kernel/device/metal/compat.h @@ -29,28 +29,13 @@ using namespace metal::raytracing; /* Qualifiers */ -/* Inline everything for Apple GPUs. This gives ~1.1x speedup and 10% spill - * reduction for integator_shade_surface. However it comes at the cost of - * longer compile times (~4.5 minutes on M1 Max) and is disabled for that - * reason, until there is a user option to manually enable it. 
*/ - -#if 0 // defined(__KERNEL_METAL_APPLE__) - -# define ccl_device __attribute__((always_inline)) -# define ccl_device_inline __attribute__((always_inline)) -# define ccl_device_forceinline __attribute__((always_inline)) -# define ccl_device_noinline __attribute__((always_inline)) - +#define ccl_device +#define ccl_device_inline ccl_device __attribute__((always_inline)) +#define ccl_device_forceinline ccl_device __attribute__((always_inline)) +#if defined(__KERNEL_METAL_APPLE__) +# define ccl_device_noinline ccl_device #else - -# define ccl_device -# define ccl_device_inline ccl_device -# define ccl_device_forceinline ccl_device -# if defined(__KERNEL_METAL_APPLE__) -#define ccl_device_noinline ccl_device -# else -#define ccl_device_noinline ccl_device __attribute__((noinline)) -# endif +# define ccl_device_noinline ccl_device __attribute__((noinline)) #endif #define ccl_device_noinline_cpu ccl_device ___ Bf-blender-cvs mailing list Bf-blender-cvs@blender.org List details, subscription details or unsubscribe: https://lists.blender.org/mailman/listinfo/bf-blender-cvs
[Bf-blender-cvs] [fc604a0be3a] master: Cycles: Disable binary archives on macOS < 13.0
Commit: fc604a0be3a9ad1bf7f646dd70d4f106c7df2a75 Author: Michael Jones Date: Tue Sep 27 14:34:37 2022 +0100 Branches: master https://developer.blender.org/rBfc604a0be3a9ad1bf7f646dd70d4f106c7df2a75 Cycles: Disable binary archives on macOS < 13.0 A bug with binary archives was fixed in macOS 13.0 which stops some spurious kernel recompilations. In older macOS versions, falling back on the system shader cache will prevent recompilations in most instances (this is the same behaviour as in Blender 3.1.x and 3.2.x). Reviewed By: brecht Differential Revision: https://developer.blender.org/D16082 === M intern/cycles/device/metal/kernel.mm === diff --git a/intern/cycles/device/metal/kernel.mm b/intern/cycles/device/metal/kernel.mm index 385cb412b06..172c456f0c0 100644 --- a/intern/cycles/device/metal/kernel.mm +++ b/intern/cycles/device/metal/kernel.mm @@ -308,26 +308,29 @@ MetalKernelPipeline *ShaderCache::get_best_pipeline(DeviceKernel kernel, const M bool MetalKernelPipeline::should_use_binary_archive() const { - if (auto str = getenv("CYCLES_METAL_DISABLE_BINARY_ARCHIVES")) { -if (atoi(str) != 0) { - /* Don't archive if we have opted out by env var. */ - return false; + /* Issues with binary archives in older macOS versions. */ + if (@available(macOS 13.0, *)) { +if (auto str = getenv("CYCLES_METAL_DISABLE_BINARY_ARCHIVES")) { + if (atoi(str) != 0) { +/* Don't archive if we have opted out by env var. */ +return false; + } } - } - if (pso_type == PSO_GENERIC) { -/* Archive the generic kernels. */ -return true; - } +if (pso_type == PSO_GENERIC) { + /* Archive the generic kernels. */ + return true; +} - if (device_kernel >= DEVICE_KERNEL_INTEGRATOR_SHADE_BACKGROUND && - device_kernel <= DEVICE_KERNEL_INTEGRATOR_SHADE_SHADOW) { -/* Archive all shade kernels - they take a long time to compile. 
*/ -return true; - } +if (device_kernel >= DEVICE_KERNEL_INTEGRATOR_SHADE_BACKGROUND && +device_kernel <= DEVICE_KERNEL_INTEGRATOR_SHADE_SHADOW) { + /* Archive all shade kernels - they take a long time to compile. */ + return true; +} - /* The remaining kernels are all fast to compile. They may get cached by the system shader cache, - * but will be quick to regenerate if not. */ +/* The remaining kernels are all fast to compile. They may get cached by the system shader cache, + * but will be quick to regenerate if not. */ + } return false; } ___ Bf-blender-cvs mailing list Bf-blender-cvs@blender.org List details, subscription details or unsubscribe: https://lists.blender.org/mailman/listinfo/bf-blender-cvs
[Bf-blender-cvs] [da4ef05e4df] master: Cycles: Apple Silicon optimization to specialize intersection kernels
Commit: da4ef05e4dfb700a61910e6d8e02183d7c272963 Author: Michael Jones Date: Tue Jul 12 15:32:46 2022 +0200 Branches: master https://developer.blender.org/rBda4ef05e4dfb700a61910e6d8e02183d7c272963 Cycles: Apple Silicon optimization to specialize intersection kernels The Metal backend now compiles and caches a second set of kernels which are optimized for scene contents, enabled for Apple Silicon. The implementation supports doing this both for intersection and shading kernels. However this is currently only enabled for intersection kernels that are quick to compile, and already give a good speedup. Enabling this for shading kernels would be faster still, however this also causes long wait times and would need a good user interface to control this. M1 Max samples per minute (macOS 13.0): PSO_GENERIC PSO_SPECIALIZED_INTERSECT PSO_SPECIALIZED_SHADE barbershop_interior 83.4 89.5 93.7 bmw27 1486.1 1671.0 1825.8 classroom 175.2 196.8 206.3 fishy_cat 674.2 704.3 719.3 junkshop 205.4 212.0 257.7 koro 310.1 336.1 342.8 monster 376.7 418.6 424.1 pabellon 273.5 325.4 339.8 sponza 830.6 929.6 1142.4 victor 86.7 96.4 96.3 wdas_cloud 111.8 112.7 183.1 Code contributed by Jason Fielder, Morteza Mostajabodaveh and Michael Jones Differential Revision: https://developer.blender.org/D14645 === M intern/cycles/device/device.h M intern/cycles/device/metal/device_impl.h M intern/cycles/device/metal/device_impl.mm M intern/cycles/device/metal/kernel.h M intern/cycles/device/metal/kernel.mm M intern/cycles/kernel/CMakeLists.txt A intern/cycles/kernel/device/metal/function_constants.h M intern/cycles/kernel/device/metal/kernel.metal M intern/cycles/kernel/svm/svm.h M intern/cycles/kernel/types.h M intern/cycles/scene/scene.cpp M intern/cycles/util/string.cpp M intern/cycles/util/string.h === diff --git a/intern/cycles/device/device.h b/intern/cycles/device/device.h index 340be85e853..e7916ec3a52 100644 --- a/intern/cycles/device/device.h +++ b/intern/cycles/device/device.h @@ -29,6 +29,7 @@ 
class DeviceQueue; class Progress; class CPUKernels; class CPUKernelThreadGlobals; +class Scene; /* Device Types */ @@ -186,6 +187,11 @@ class Device { return 0; } + /* Called after kernel texture setup, and prior to integrator state setup. */ + virtual void optimize_for_scene(Scene *scene) + { + } + virtual bool is_resident(device_ptr /*key*/, Device *sub_device) { /* Memory is always resident if this is not a multi device, regardless of whether the pointer diff --git a/intern/cycles/device/metal/device_impl.h b/intern/cycles/device/metal/device_impl.h index 4aea8d697a5..99e60d3a788 100644 --- a/intern/cycles/device/metal/device_impl.h +++ b/intern/cycles/device/metal/device_impl.h @@ -75,7 +75,8 @@ class MetalDevice : public Device { std::vector> texture_slot_map; bool use_metalrt = false; - bool use_function_specialisation = false; + MetalPipelineType kernel_specialization_level = PSO_GENERIC; + std::atomic_bool async_compile_and_load = false; virtual BVHLayoutMask get_bvh_layout_mask() const override; @@ -91,9 +92,7 @@ class MetalDevice : public Device { bool use_adaptive_compilation(); - string get_source(const uint kernel_features); - - string compile_kernel(const uint kernel_features, const char *name); + void make_source(MetalPipelineType pso_type, const uint kernel_features); virtual bool load_kernels(const uint kernel_features) override; @@ -111,7 +110,9 @@ class MetalDevice : public Device { virtual void build_bvh(BVH *bvh, Progress &progress, bool refit) override; - id compile(string const &source); + virtual void optimize_for_scene(Scene *scene) override; + + bool compile_and_load(MetalPipelineType pso_type); /* -- */ /* low-level memory management */ diff --git a/intern/cycles/device/metal/device_impl.mm b/intern/cycles/device/metal/device_impl.mm index ba9317e3204..d8bb3b867cd 100644 --- a/intern/cycles/device/metal/device_impl.mm +++ b/intern/cycles/device/metal/device_impl.mm @@ -6,6 +6,8 @@ # include "device/metal/device_impl.h" # include 
"device/metal/device.h" +# include "scene/scene.h" + # inclu
[Bf-blender-cvs] [5653c5fcdd9] master: Cycles: keep track of SVM nodes used in kernels
Commit: 5653c5fcdd9f424dc05ddf73b18ba8294daf4788 Author: Michael Jones Date: Tue Jul 12 17:22:36 2022 +0200 Branches: master https://developer.blender.org/rB5653c5fcdd9f424dc05ddf73b18ba8294daf4788 Cycles: keep track of SVM nodes used in kernels To be used for specialization in Metal, to automatically leave out unused nodes from the kernel. Ref D14645 === M intern/cycles/kernel/CMakeLists.txt M intern/cycles/kernel/data_template.h A intern/cycles/kernel/svm/node_types_template.h M intern/cycles/kernel/svm/svm.h M intern/cycles/kernel/svm/types.h M intern/cycles/scene/shader_nodes.cpp M intern/cycles/scene/svm.cpp M intern/cycles/scene/svm.h === diff --git a/intern/cycles/kernel/CMakeLists.txt b/intern/cycles/kernel/CMakeLists.txt index 4ff947e7136..527cc4ec111 100644 --- a/intern/cycles/kernel/CMakeLists.txt +++ b/intern/cycles/kernel/CMakeLists.txt @@ -154,6 +154,7 @@ set(SRC_KERNEL_SVM_HEADERS svm/math_util.h svm/mix.h svm/musgrave.h + svm/node_types_template.h svm/noise.h svm/noisetex.h svm/normal.h diff --git a/intern/cycles/kernel/data_template.h b/intern/cycles/kernel/data_template.h index 22f945f1335..b06ac62a5d8 100644 --- a/intern/cycles/kernel/data_template.h +++ b/intern/cycles/kernel/data_template.h @@ -194,6 +194,13 @@ KERNEL_STRUCT_MEMBER(integrator, int, direct_light_sampling_type) KERNEL_STRUCT_MEMBER(integrator, int, pad1) KERNEL_STRUCT_END(KernelIntegrator) +/* SVM. For shader specialization. 
*/ + +KERNEL_STRUCT_BEGIN(KernelSVMUsage, svm_usage) +#define SHADER_NODE_TYPE(type) KERNEL_STRUCT_MEMBER(svm_usage, int, type) +#include "kernel/svm/node_types_template.h" +KERNEL_STRUCT_END(KernelSVMUsage) + #undef KERNEL_STRUCT_BEGIN #undef KERNEL_STRUCT_MEMBER #undef KERNEL_STRUCT_END diff --git a/intern/cycles/kernel/svm/node_types_template.h b/intern/cycles/kernel/svm/node_types_template.h new file mode 100644 index 000..39d279be4cb --- /dev/null +++ b/intern/cycles/kernel/svm/node_types_template.h @@ -0,0 +1,110 @@ +/* SPDX-License-Identifier: Apache-2.0 + * Copyright 2011-2022 Blender Foundation */ + +#ifndef SHADER_NODE_TYPE +# define SHADER_NODE_TYPE(name) +#endif + +/* NOTE: for best OpenCL performance, item definition in the enum must + * match the switch case order in `svm.h`. */ + +SHADER_NODE_TYPE(NODE_END) +SHADER_NODE_TYPE(NODE_SHADER_JUMP) +SHADER_NODE_TYPE(NODE_CLOSURE_BSDF) +SHADER_NODE_TYPE(NODE_CLOSURE_EMISSION) +SHADER_NODE_TYPE(NODE_CLOSURE_BACKGROUND) +SHADER_NODE_TYPE(NODE_CLOSURE_SET_WEIGHT) +SHADER_NODE_TYPE(NODE_CLOSURE_WEIGHT) +SHADER_NODE_TYPE(NODE_EMISSION_WEIGHT) +SHADER_NODE_TYPE(NODE_MIX_CLOSURE) +SHADER_NODE_TYPE(NODE_JUMP_IF_ZERO) +SHADER_NODE_TYPE(NODE_JUMP_IF_ONE) +SHADER_NODE_TYPE(NODE_GEOMETRY) +SHADER_NODE_TYPE(NODE_CONVERT) +SHADER_NODE_TYPE(NODE_TEX_COORD) +SHADER_NODE_TYPE(NODE_VALUE_F) +SHADER_NODE_TYPE(NODE_VALUE_V) +SHADER_NODE_TYPE(NODE_ATTR) +SHADER_NODE_TYPE(NODE_VERTEX_COLOR) +SHADER_NODE_TYPE(NODE_GEOMETRY_BUMP_DX) +SHADER_NODE_TYPE(NODE_GEOMETRY_BUMP_DY) +SHADER_NODE_TYPE(NODE_SET_DISPLACEMENT) +SHADER_NODE_TYPE(NODE_DISPLACEMENT) +SHADER_NODE_TYPE(NODE_VECTOR_DISPLACEMENT) +SHADER_NODE_TYPE(NODE_TEX_IMAGE) +SHADER_NODE_TYPE(NODE_TEX_IMAGE_BOX) +SHADER_NODE_TYPE(NODE_TEX_NOISE) +SHADER_NODE_TYPE(NODE_SET_BUMP) +SHADER_NODE_TYPE(NODE_ATTR_BUMP_DX) +SHADER_NODE_TYPE(NODE_ATTR_BUMP_DY) +SHADER_NODE_TYPE(NODE_VERTEX_COLOR_BUMP_DX) +SHADER_NODE_TYPE(NODE_VERTEX_COLOR_BUMP_DY) +SHADER_NODE_TYPE(NODE_TEX_COORD_BUMP_DX) 
+SHADER_NODE_TYPE(NODE_TEX_COORD_BUMP_DY) +SHADER_NODE_TYPE(NODE_CLOSURE_SET_NORMAL) +SHADER_NODE_TYPE(NODE_ENTER_BUMP_EVAL) +SHADER_NODE_TYPE(NODE_LEAVE_BUMP_EVAL) +SHADER_NODE_TYPE(NODE_HSV) +SHADER_NODE_TYPE(NODE_CLOSURE_HOLDOUT) +SHADER_NODE_TYPE(NODE_FRESNEL) +SHADER_NODE_TYPE(NODE_LAYER_WEIGHT) +SHADER_NODE_TYPE(NODE_CLOSURE_VOLUME) +SHADER_NODE_TYPE(NODE_PRINCIPLED_VOLUME) +SHADER_NODE_TYPE(NODE_MATH) +SHADER_NODE_TYPE(NODE_VECTOR_MATH) +SHADER_NODE_TYPE(NODE_RGB_RAMP) +SHADER_NODE_TYPE(NODE_GAMMA) +SHADER_NODE_TYPE(NODE_BRIGHTCONTRAST) +SHADER_NODE_TYPE(NODE_LIGHT_PATH) +SHADER_NODE_TYPE(NODE_OBJECT_INFO) +SHADER_NODE_TYPE(NODE_PARTICLE_INFO) +SHADER_NODE_TYPE(NODE_HAIR_INFO) +SHADER_NODE_TYPE(NODE_POINT_INFO) +SHADER_NODE_TYPE(NODE_TEXTURE_MAPPING) +SHADER_NODE_TYPE(NODE_MAPPING) +SHADER_NODE_TYPE(NODE_MIN_MAX) +SHADER_NODE_TYPE(NODE_CAMERA) +SHADER_NODE_TYPE(NODE_TEX_ENVIRONMENT) +SHADER_NODE_TYPE(NODE_TEX_SKY) +SHADER_NODE_TYPE(NODE_TEX_GRADIENT) +SHADER_NODE_TYPE(NODE_TEX_VORONOI) +SHADER_NODE_TYPE(NODE_TEX_MUSGRAVE) +SHADER_NODE_TYPE(NODE_TEX_WAVE) +SHADER_NODE_TYPE(NODE_TEX_MAGIC) +SHADER_NODE_TYPE(NODE_TEX_CHECKER) +SHADER_NODE_TYPE(NODE_TEX_BRICK) +SHADER_NODE_TYPE(NODE_TEX_WHITE_NOISE) +SHADER_NODE_TYPE(NODE_NORMAL) +SHADER_NODE_TYPE(NODE_LIGHT_FALLOFF) +SHADER_NODE_TYPE(NODE_IES) +SHADER
[Bf-blender-cvs] [fd19555be3d] arcpatch-D14645: Cycles: refactor to move part of KernelData definition to template header
Commit: fd19555be3d78575375aa990de60f1ad375e1f06 Author: Michael Jones Date: Thu Jul 14 17:40:21 2022 +0100 Branches: arcpatch-D14645 https://developer.blender.org/rBfd19555be3d78575375aa990de60f1ad375e1f06 Cycles: refactor to move part of KernelData definition to template header To be used for specialization on Metal in a following commit, turning these members into compile time constants. Cycles: keep track of SVM nodes used in kernels To be used for specialization in Metal, to automatically leave out unused nodes from the kernel. Cycles: Apple Silicon optimizations (~20% uplift on M1 Max) M1 Max samples/min over 30 seconds (macOS 13.0): ``` PSO_GENERIC PSO_SPECIALIZED_INTERSECT PSO_SPECIALIZED_SHADE barbershop_interior 83.4 89.5 93.7 bmw27 1486.1 1671.0 1825.8 classroom 175.2 196.8 206.3 fishy_cat 674.2 704.3 719.3 junkshop 205.4 212.0 257.7 koro 310.1 336.1 342.8 monster 376.7 418.6 424.1 pabellon 273.5 325.4 339.8 sponza 830.6 929.6 1142.4 victor 86.7 96.4 96.3 wdas_cloud 111.8 112.7 183.1 ``` Next steps: [ ] ~~Include SHADER_EVAL kernels in the "must cache" list~~ //(limited benefit to specializing one off shade steps)// [ ] Adapt / merge with dynamic kernel compilation caching patch (D14754) [x] Separate specialization of intersection (fast building) and shading (slow building) kernels [x] Rate-limiting and invalidation of kernel compilation requests [ ] UI for enabling / disabling background compilation --- With this patch, the Metal backend compiles & caches a second set of kernels which are optimized for scene content, enabled for Apple Silicon. The optimized kernels result in faster render times, but are slower to compile. They are compiled in the background and swapped in when ready. The optimizations are: - ~~Aggressive inlining. This is not scene-specific, but hasn't been enabled for the generic kernels because it inflates compile time quite a lot. It results in better register usage, reducing the spill that we're seeing in some kernels. 
Possible adjustments: 1) take the compile hit for generic kernels since they're only compiled once (and it helps in general), or 2) add a _second_ set of generic_kernels with aggressive inlining enabled.~~ //(enabled by D14923)// - ~8% uplift in isolation for 3 benchmarking scenes - Substitution of KernelData constants. Select members of KernelData struct are replaced with macros that are #defined at the top of source. Only constants pertaining to the rendering algorithm are specialized, rather than constants which might affect artistic look. - ~13% uplift in isolation for 3 benchmarking scenes - Removal of unused SVM nodes in `svm_eval_nodes`. In combination with the other optimizations, this results in a further drop in register usage by eliminating dead code that can't be identified by static analysis. Code contributed by Jason Fielder, Morteza Mostajabodaveh and Michael Jones Differential Revision: https://developer.blender.org/D15456 === M intern/cycles/device/device.h M intern/cycles/device/metal/device_impl.h M intern/cycles/device/metal/device_impl.mm M intern/cycles/device/metal/kernel.h M intern/cycles/device/metal/kernel.mm M intern/cycles/kernel/CMakeLists.txt A intern/cycles/kernel/device/metal/function_constants.h M intern/cycles/kernel/device/metal/kernel.metal M intern/cycles/kernel/svm/svm.h M intern/cycles/kernel/types.h M intern/cycles/scene/scene.cpp M intern/cycles/util/string.cpp M intern/cycles/util/string.h === diff --git a/intern/cycles/device/device.h b/intern/cycles/device/device.h index 927caae600c..1681acb9836 100644 --- a/intern/cycles/device/device.h +++ b/intern/cycles/device/device.h @@ -29,6 +29,7 @@ class DeviceQueue; class Progress; class CPUKernels; class CPUKernelThreadGlobals; +class Scene; /* Device Types */ @@ -184,6 +185,11 @@ class Device { return 0; } + /* Called after kernel texture setup, and prior to integrator state setup. 
*/ + virtual void optimize_for_scene(Scene *scene) + { + } + virtual bool is_resident(device_ptr /*key*/, Device *sub_device) { /* Memory is always resident if this is not a multi device, regardless of whether the pointer diff --git a/intern/cycles/device/metal/device_im
[Bf-blender-cvs] [4b1d315017e] master: Cycles: Improve cache usage on Apple GPUs by chunking active indices
Commit: 4b1d315017ef103f3034160d349b3c3c21a4cd6a Author: Michael Jones Date: Wed Jul 13 20:56:57 2022 +0100 Branches: master https://developer.blender.org/rB4b1d315017ef103f3034160d349b3c3c21a4cd6a Cycles: Improve cache usage on Apple GPUs by chunking active indices This patch partitions the active indices into chunks prior to sorting by material in order to tradeoff some material coherence for better locality. On Apple Silicon GPUs (particularly higher end M1-family GPUs), we observe overall render time speedups of up to 15%. The partitioning is implemented by repeating the range of `shader_sort_key` for each partition, and encoding a "locator" key which distributes the indices into sorted chunks. Reviewed By: brecht Differential Revision: https://developer.blender.org/D15331 === M intern/cycles/device/metal/device_impl.mm M intern/cycles/device/metal/queue.h M intern/cycles/device/metal/queue.mm M intern/cycles/device/metal/util.h M intern/cycles/device/metal/util.mm M intern/cycles/device/queue.h M intern/cycles/integrator/path_trace.cpp M intern/cycles/integrator/path_trace_work_gpu.cpp M intern/cycles/integrator/path_trace_work_gpu.h M intern/cycles/kernel/integrator/state.h M intern/cycles/kernel/integrator/state_flow.h === diff --git a/intern/cycles/device/metal/device_impl.mm b/intern/cycles/device/metal/device_impl.mm index 87c83242240..ba9317e3204 100644 --- a/intern/cycles/device/metal/device_impl.mm +++ b/intern/cycles/device/metal/device_impl.mm @@ -217,6 +217,10 @@ string MetalDevice::get_source(const uint kernel_features) build_options += " -D__KERNEL_FEATURES__=" + to_string(kernel_features); } + if (MetalInfo::optimal_sort_partition_elements(mtlDevice) > 0) { +build_options += " -D__KERNEL_SORT_PARTITIONING__ "; + } + if (use_metalrt) { build_options += "-D__METALRT__ "; if (motion_blur) { @@ -652,7 +656,7 @@ void MetalDevice::const_copy_to(const char *name, void *host, size_t size) /* Update data storage pointers in launch parameters. 
*/ if (strcmp(name, "integrator_state") == 0) { /* IntegratorStateGPU is contiguous pointers */ -const size_t pointer_block_size = sizeof(IntegratorStateGPU); +const size_t pointer_block_size = offsetof(IntegratorStateGPU, sort_partition_divisor); update_launch_pointers( offsetof(KernelParamsMetal, integrator_state), host, size, pointer_block_size); } diff --git a/intern/cycles/device/metal/queue.h b/intern/cycles/device/metal/queue.h index b0bd487c86d..836289172f7 100644 --- a/intern/cycles/device/metal/queue.h +++ b/intern/cycles/device/metal/queue.h @@ -24,6 +24,7 @@ class MetalDeviceQueue : public DeviceQueue { virtual int num_concurrent_states(const size_t) const override; virtual int num_concurrent_busy_states() const override; + virtual int num_sort_partitions(const size_t) const override; virtual void init_execution() override; diff --git a/intern/cycles/device/metal/queue.mm b/intern/cycles/device/metal/queue.mm index 03e60b6bb6e..6a9cc552098 100644 --- a/intern/cycles/device/metal/queue.mm +++ b/intern/cycles/device/metal/queue.mm @@ -293,6 +293,23 @@ int MetalDeviceQueue::num_concurrent_busy_states() const return result; } +int MetalDeviceQueue::num_sort_partitions(const size_t state_size) const +{ + /* Sort partitioning becomes less effective when more shaders are in the wavefront. In lieu of a + * more sophisticated heuristic we simply disable sort partitioning if the shader count is high. + */ + if (metal_device_->launch_params.data.max_shaders >= 300) { +return 1; + } + + const int optimal_partition_elements = MetalInfo::optimal_sort_partition_elements( + metal_device_->mtlDevice); + if (optimal_partition_elements) { +return num_concurrent_states(state_size) / optimal_partition_elements; + } + return 1; +} + void MetalDeviceQueue::init_execution() { /* Synchronize all textures and memory copies before executing task. */ @@ -359,7 +376,7 @@ bool MetalDeviceQueue::enqueue(DeviceKernel kernel, /* Prepare any non-pointer (i.e. 
plain-old-data) KernelParamsMetal data */ /* The plain-old-data is contiguous, continuing to the end of KernelParamsMetal */ size_t plain_old_launch_data_offset = offsetof(KernelParamsMetal, integrator_state) + -sizeof(IntegratorStateGPU); +offsetof(IntegratorStateGPU, sort_partition_divisor); size_t plain_old_launch_data_size = sizeof(KernelParamsMetal) - plain_old_launch_data_offset; memcpy(init_arg_buffer + globals_offsets + plain_old_launch_data_offset, (uint8_t *)&metal_device_->launch_params + plain_old_l
[Bf-blender-cvs] [d8e9647ae26] master: Cycles: Add diagnostic tracing of MTLLibrary compilation time
Commit: d8e9647ae26b1681f1a2345975e52c512ff15e20 Author: Michael Jones Date: Thu Jun 23 10:05:45 2022 +0100 Branches: master https://developer.blender.org/rBd8e9647ae26b1681f1a2345975e52c512ff15e20 Cycles: Add diagnostic tracing of MTLLibrary compilation time Reviewed By: sergey Differential Revision: https://developer.blender.org/D15268 === M intern/cycles/device/metal/device_impl.mm === diff --git a/intern/cycles/device/metal/device_impl.mm b/intern/cycles/device/metal/device_impl.mm index 8edcd8d118d..a0abb3fca37 100644 --- a/intern/cycles/device/metal/device_impl.mm +++ b/intern/cycles/device/metal/device_impl.mm @@ -280,14 +280,17 @@ bool MetalDevice::load_kernels(const uint _kernel_features) motion_blur = kernel_features & KERNEL_FEATURE_OBJECT_MOTION; source[PSO_GENERIC] = get_source(kernel_features); + + const double starttime = time_dt(); + mtlLibrary[PSO_GENERIC] = compile(source[PSO_GENERIC]); + metal_printf("Front-end compilation finished in %.1f seconds (generic)\n", time_dt() - starttime); + MD5Hash md5; md5.append(source[PSO_GENERIC]); source_md5[PSO_GENERIC] = md5.get_hex(); - metal_printf("Front-end compilation finished (generic)\n"); - bool result = MetalDeviceKernels::load(this, false); reserve_local_memory(kernel_features); ___ Bf-blender-cvs mailing list Bf-blender-cvs@blender.org List details, subscription details or unsubscribe: https://lists.blender.org/mailman/listinfo/bf-blender-cvs
[Bf-blender-cvs] [532b33973bb] master: Cycles: Tidy of KernelData patchup code
Commit: 532b33973bb71f91a5962c9f9c63ff26bf51bd67 Author: Michael Jones Date: Wed Jun 22 22:36:33 2022 +0100 Branches: master https://developer.blender.org/rB532b33973bb71f91a5962c9f9c63ff26bf51bd67 Cycles: Tidy of KernelData patchup code Reviewed By: sergey Differential Revision: https://developer.blender.org/D15267 === M intern/cycles/device/metal/device_impl.mm === diff --git a/intern/cycles/device/metal/device_impl.mm b/intern/cycles/device/metal/device_impl.mm index 0954f586d40..8edcd8d118d 100644 --- a/intern/cycles/device/metal/device_impl.mm +++ b/intern/cycles/device/metal/device_impl.mm @@ -627,7 +627,7 @@ void MetalDevice::const_copy_to(const char *name, void *host, size_t size) { if (strcmp(name, "data") == 0) { assert(size == sizeof(KernelData)); -memcpy((uint8_t *)&launch_params + offsetof(KernelParamsMetal, data), host, size); +memcpy((uint8_t *)&launch_params.data, host, sizeof(KernelData)); return; } ___ Bf-blender-cvs mailing list Bf-blender-cvs@blender.org List details, subscription details or unsubscribe: https://lists.blender.org/mailman/listinfo/bf-blender-cvs
[Bf-blender-cvs] [328a911379d] master: Cycles: Distinguish Apple GPUs by core count
Commit: 328a911379d445c9acef1b67f429e8c3454dda6c Author: Michael Jones Date: Wed Jun 22 22:32:34 2022 +0100 Branches: master https://developer.blender.org/rB328a911379d445c9acef1b67f429e8c3454dda6c Cycles: Distinguish Apple GPUs by core count This patch suffixes Apple GPU device names with `(GPU - # cores)` so that variant GPUs with the same chipset can be distinguished. Currently benchmark scores for these M1 family GPUs are being incorrectly merged: - M1: 7 or 8 cores - M1 Pro: 14 or 16 cores - M1 Max: 24 or 32 cores - M1 Ultra: 48 or 64 cores Reviewed By: brecht, sergey Differential Revision: https://developer.blender.org/D15257 === M intern/cycles/device/metal/device.mm M intern/cycles/device/metal/device_impl.h M intern/cycles/device/metal/device_impl.mm M intern/cycles/device/metal/util.h M intern/cycles/device/metal/util.mm M release/scripts/addons M source/tools === diff --git a/intern/cycles/device/metal/device.mm b/intern/cycles/device/metal/device.mm index d7f190fc01e..51e3323370a 100644 --- a/intern/cycles/device/metal/device.mm +++ b/intern/cycles/device/metal/device.mm @@ -34,7 +34,8 @@ void device_metal_info(vector &devices) int device_index = 0; for (id &device : usable_devices) { /* Compute unique ID for persistent user preferences. */ -string device_name = [device.name UTF8String]; +string device_name = MetalInfo::get_device_name(device); + string id = string("METAL_") + device_name; /* Hardware ID might not be unique, add device number in that case. */ @@ -48,12 +49,6 @@ void device_metal_info(vector &devices) info.type = DEVICE_METAL; info.description = string_remove_trademark(string(device_name)); -/* Ensure unique naming on Apple Silicon / SoC devices which return the same string for CPU and - * GPU */ -if (info.description == system_cpu_brand_string()) { - info.description += " (GPU)"; -} - info.num = device_index; /* We don't know if it's used for display, but assume it is. 
*/ info.display_device = true; @@ -69,14 +64,15 @@ string device_metal_capabilities() { string result = ""; auto allDevices = MTLCopyAllDevices(); - uint32_t num_devices = allDevices.count; + uint32_t num_devices = (uint32_t)allDevices.count; if (num_devices == 0) { return "No Metal devices found\n"; } result += string_printf("Number of devices: %u\n", num_devices); for (id device in allDevices) { -result += string_printf("\t\tDevice: %s\n", [device.name UTF8String]); +string device_name = MetalInfo::get_device_name(device); +result += string_printf("\t\tDevice: %s\n", device_name.c_str()); } return result; diff --git a/intern/cycles/device/metal/device_impl.h b/intern/cycles/device/metal/device_impl.h index 0e6817d94f8..4aea8d697a5 100644 --- a/intern/cycles/device/metal/device_impl.h +++ b/intern/cycles/device/metal/device_impl.h @@ -42,7 +42,6 @@ class MetalDevice : public Device { nil; /* encoder used for fetching device pointers from MTLAccelerationStructure */ /*---*/ - string device_name; MetalGPUVendor device_vendor; uint kernel_features; diff --git a/intern/cycles/device/metal/device_impl.mm b/intern/cycles/device/metal/device_impl.mm index 0a89055af34..0954f586d40 100644 --- a/intern/cycles/device/metal/device_impl.mm +++ b/intern/cycles/device/metal/device_impl.mm @@ -9,6 +9,7 @@ # include "util/debug.h" # include "util/md5.h" # include "util/path.h" +# include "util/time.h" CCL_NAMESPACE_BEGIN @@ -43,10 +44,9 @@ MetalDevice::MetalDevice(const DeviceInfo &info, Stats &stats, Profiler &profile auto usable_devices = MetalInfo::get_usable_devices(); assert(mtlDevId < usable_devices.size()); mtlDevice = usable_devices[mtlDevId]; - device_name = [mtlDevice.name UTF8String]; - device_vendor = MetalInfo::get_vendor_from_device_name(device_name); + device_vendor = MetalInfo::get_device_vendor(mtlDevice); assert(device_vendor != METAL_GPU_UNKNOWN); - metal_printf("Creating new Cycles device for Metal: %s\n", device_name.c_str()); + metal_printf("Creating new 
Cycles device for Metal: %s\n", info.description.c_str()); /* determine default storage mode based on whether UMA is supported */ diff --git a/intern/cycles/device/metal/util.h b/intern/cycles/device/metal/util.h index f728967835d..fd32d8a260f 100644 --- a/intern/cycles/device/metal/util.h +++ b/intern/cycles/device/metal/util.h @@ -25,10 +25,19 @@ enum MetalGPUVendor { METAL_GPU_INTEL = 3, }; +enum AppleGPUArchitecture { + APPLE_UNKNOWN, + APPLE_M1, + APPLE_M2, +}; +
[Bf-blender-cvs] [19e0b60f3e1] master: Cycles: MetalDeviceQueue - capture of multiple dispatches, and some tidying
Commit: 19e0b60f3e1270a34b52d7829169ab8af6c816cb Author: Michael Jones Date: Mon Jun 13 12:33:43 2022 +0100 Branches: master https://developer.blender.org/rB19e0b60f3e1270a34b52d7829169ab8af6c816cb Cycles: MetalDeviceQueue - capture of multiple dispatches, and some tidying This patch adds a new mode of gpu capture (env var `CYCLES_DEBUG_METAL_CAPTURE_SAMPLES`) to capture a block of dispatches between "reset" calls. It also fixes member data naming inconsistencies and adds some missing OS version checks. Screenshot showing .gputrace capture in Xcode 14.0 beta (using `CYCLES_DEBUG_METAL_CAPTURE_SAMPLES="1"` and `CYCLES_DEBUG_METAL_CAPTURE_LIMIT="10"`): {F13155703} Reviewed By: sergey, brecht Differential Revision: https://developer.blender.org/D15179 === M intern/cycles/device/metal/queue.h M intern/cycles/device/metal/queue.mm === diff --git a/intern/cycles/device/metal/queue.h b/intern/cycles/device/metal/queue.h index de20514de0b..b0bd487c86d 100644 --- a/intern/cycles/device/metal/queue.h +++ b/intern/cycles/device/metal/queue.h @@ -38,45 +38,50 @@ class MetalDeviceQueue : public DeviceQueue { virtual void copy_from_device(device_memory &mem) override; protected: + void setup_capture(); + void update_capture(DeviceKernel kernel); + void begin_capture(); + void end_capture(); void prepare_resources(DeviceKernel kernel); id get_compute_encoder(DeviceKernel kernel); id get_blit_encoder(); - MetalDevice *metal_device; - MetalBufferPool temp_buffer_pool; + MetalDevice *metal_device_; + MetalBufferPool temp_buffer_pool_; API_AVAILABLE(macos(11.0), ios(14.0)) - MTLCommandBufferDescriptor *command_buffer_desc = nullptr; - id mtlDevice = nil; - id mtlCommandQueue = nil; - id mtlCommandBuffer = nil; - id mtlComputeEncoder = nil; - id mtlBlitEncoder = nil; + MTLCommandBufferDescriptor *command_buffer_desc_ = nullptr; + id mtlDevice_ = nil; + id mtlCommandQueue_ = nil; + id mtlCommandBuffer_ = nil; + id mtlComputeEncoder_ = nil; + id mtlBlitEncoder_ = nil; 
API_AVAILABLE(macos(10.14), ios(14.0)) - id shared_event = nil; + id shared_event_ = nil; API_AVAILABLE(macos(10.14), ios(14.0)) - MTLSharedEventListener *shared_event_listener = nil; + MTLSharedEventListener *shared_event_listener_ = nil; - dispatch_queue_t event_queue; - dispatch_semaphore_t wait_semaphore; + dispatch_queue_t event_queue_; + dispatch_semaphore_t wait_semaphore_; struct CopyBack { void *host_pointer; void *gpu_mem; uint64_t size; }; - std::vector copy_back_mem; + std::vector copy_back_mem_; - uint64_t shared_event_id; - uint64_t command_buffers_submitted = 0; - uint64_t command_buffers_completed = 0; - Stats &stats; + uint64_t shared_event_id_; + uint64_t command_buffers_submitted_ = 0; + uint64_t command_buffers_completed_ = 0; + Stats &stats_; void close_compute_encoder(); void close_blit_encoder(); - bool verbose_tracing = false; + bool verbose_tracing_ = false; + bool label_command_encoders_ = false; /* Per-kernel profiling (see CYCLES_METAL_PROFILING). */ @@ -85,28 +90,30 @@ class MetalDeviceQueue : public DeviceQueue { int work_size; uint64_t timing_id; }; - std::vector command_encoder_labels; - id timing_shared_event = nil; - uint64_t timing_shared_event_id; - uint64_t command_buffer_start_timing_id; + std::vector command_encoder_labels_; + API_AVAILABLE(macos(10.14), ios(14.0)) + id timing_shared_event_ = nil; + uint64_t timing_shared_event_id_; + uint64_t command_buffer_start_timing_id_; struct TimingStats { double total_time = 0.0; uint64_t total_work_size = 0; uint64_t num_dispatches = 0; }; - TimingStats timing_stats[DEVICE_KERNEL_NUM]; - double last_completion_time = 0.0; + TimingStats timing_stats_[DEVICE_KERNEL_NUM]; + double last_completion_time_ = 0.0; /* .gputrace capture (see CYCLES_DEBUG_METAL_CAPTURE_...). 
*/ - id mtlCaptureScope = nil; - DeviceKernel capture_kernel; - int capture_dispatch = 0; - int capture_dispatch_counter = 0; - bool is_capturing = false; - bool is_capturing_to_disk = false; - bool has_captured_to_disk = false; + id mtlCaptureScope_ = nil; + DeviceKernel capture_kernel_; + int capture_dispatch_counter_ = 0; + bool capture_samples_ = false; + int capture_reset_counter_ = 0; + bool is_capturing_ = false; + bool is_capturing_to_disk_ = false; + bool has_captured_to_disk_ = false; }; CCL_NAMESPACE_END diff --git a/intern/cycles/device/metal/queue.mm b/intern/cycles/device/metal/queue.mm index 9d8625e1455..0e260886abb 100644 --- a/intern/cycles/device/metal/queue.mm +++ b/intern/cycles/device/metal/queue.mm @@ -17,79 +17,180 @@ CCL_NAMESPACE_BEGIN /* MetalDeviceQueue */ MetalDeviceQueue::MetalDeviceQueue(Meta
[Bf-blender-cvs] [4412e14708c] master: Cycles: Useful Metal backend debug & profiling functionality
Commit: 4412e14708c5625c3fe84bc75fce2ca6de6f58c9 Author: Michael Jones Date: Tue Jun 7 11:08:21 2022 +0100 Branches: master https://developer.blender.org/rB4412e14708c5625c3fe84bc75fce2ca6de6f58c9 Cycles: Useful Metal backend debug & profiling functionality This patch adds some useful debugging & profiling env vars to the Metal backend: - `CYCLES_METAL_PROFILING`: output a per-kernel timing report at the end of the render - `CYCLES_METAL_DEBUG`: enable per-dispatch tracing (very verbose) - `CYCLES_DEBUG_METAL_CAPTURE_KERNEL`: enable programmatic .gputrace capture for a specified kernel index Here's an example of the timing report with `CYCLES_METAL_PROFILING` enabled: ``` --- Kernel name Total threads Dispatches Avg. T/D Time Time% --- integrator_init_from_camera 657,407,232 161 4,083,274 0.24s 0.51% integrator_intersect_closest 1,629,288,440 681 2,392,494 15.18s 32.12% integrator_intersect_shadow 751,652,291 470 1,599,260 5.80s 12.28% integrator_shade_background 304,612,074 263 1,158,220 1.16s 2.45% integrator_shade_surface 1,159,764,041 676 1,715,627 20.57s 43.52% integrator_shade_shadow 598,885,847 418 1,432,741 1.27s 2.69% integrator_queued_paths_array 2,969,650,130 805 3,689,006 0.35s 0.74% integrator_queued_shadow_paths_array 593,936,619 379 1,567,115 0.14s 0.29% integrator_terminated_paths_array 22,205,417 155 143,260 0.05s 0.10% integrator_sorted_paths_array 2,517,140,043 676 3,723,579 1.65s 3.50% integrator_compact_paths_array 648,912,748 155 4,186,533 0.03s 0.07% integrator_compact_states 20,872,687 155 134,662 0.14s 0.29% integrator_terminated_shadow_paths_array 374,100,675 438 854,111 0.16s 0.33% integrator_compact_shadow_paths_array 503,768,657 438 1,150,156 0.05s 0.10% integrator_compact_shadow_states 37,664,941 202 186,460 0.23s 0.50% integrator_reset 25,165,824 6 4,194,304 0.06s 0.12% film_convert_combined_half_rgba 3,110,400 6 518,400 0.00s 0.01% prefix_sum 676 676 1 0.19s 0.40% --- 6,760 47.27s 100.00% --- ``` Reviewed By: brecht Differential Revision: 
https://developer.blender.org/D15044 === M intern/cycles/device/metal/bvh.mm M intern/cycles/device/metal/device_impl.h M intern/cycles/device/metal/device_impl.mm M intern/cycles/device/metal/queue.h M intern/cycles/device/metal/queue.mm M intern/cycles/device/metal/util.h M intern/cycles/kernel/device/gpu/kernel.h === diff --git a/intern/cycles/device/metal/bvh.mm b/intern/cycles/device/metal/bvh.mm index 086fbb093ba..09c4ace081e 100644 --- a/intern/cycles/device/metal/bvh.mm +++ b/intern/cycles/device/metal/bvh.mm @@ -11,6 +11,7 @@ # include "util/progress.h" # include "device/metal/bvh.h" +# include "device/metal/util.h" CCL_NAMESPACE_BEGIN @@ -18,6 +19,7 @@ CCL_NAMESPACE_BEGIN { \ string str = string_printf(__VA_ARGS__); \ progress.set_substatus(str); \ + metal_printf("%s\n", str.c_str()); \ } BVHMetal::BVHMetal(const BVHParams &params_, diff --git a/intern/cycles/device/metal/device_impl.h b/intern/cycles/device/metal/device_impl.h index 7506b9b069f..0e6817d94f8 100644 --- a/intern/cycles/device/metal/device_impl.h +++ b/intern/cycles/device/metal/device_impl.h @@ -31,6 +31,8 @@ class MetalDevice : public Device { string source[PSO_NUM]; string source_md5[PSO_NUM]; + bool capture_enabled = false; + KernelParamsMetal launch_params = {0}; /* MetalRT members --*/ diff --git a/intern/cycles/device/metal/device_impl.mm b/intern/cycles/device/metal/device_impl.mm index 16aabacb4cf..086bf0af979 100644 --- a/intern/cycles/device/metal/device_impl.mm +++ b/intern/cycles/device/metal/device_impl
[Bf-blender-cvs] [f2d39b810b4] temp-pbvh-split: Enable inlining on Apple Silicon. Use new process-wide ShaderCache in order to safely re-enable binary archives
Commit: f2d39b810b4902bb5accbac7c5b2e8ec1e60c679 Author: Michael Jones Date: Wed May 11 14:52:49 2022 +0100 Branches: temp-pbvh-split https://developer.blender.org/rBf2d39b810b4902bb5accbac7c5b2e8ec1e60c679 Enable inlining on Apple Silicon. Use new process-wide ShaderCache in order to safely re-enable binary archives This patch is the same as D14763, but with a fix for unit test failures caused by ShaderCache fetch logic not working in the non-MetalRT case: ``` diff --git a/intern/cycles/device/metal/kernel.mm b/intern/cycles/device/metal/kernel.mm index ad268ae7057..6aa1a56056e 100644 --- a/intern/cycles/device/metal/kernel.mm +++ b/intern/cycles/device/metal/kernel.mm @@ -203,9 +203,12 @@ bool kernel_has_intersection(DeviceKernel device_kernel) /* metalrt options */ request.pipeline->use_metalrt = device->use_metalrt; - request.pipeline->metalrt_hair = device->kernel_features & KERNEL_FEATURE_HAIR; - request.pipeline->metalrt_hair_thick = device->kernel_features & KERNEL_FEATURE_HAIR_THICK; - request.pipeline->metalrt_pointcloud = device->kernel_features & KERNEL_FEATURE_POINTCLOUD; + request.pipeline->metalrt_hair = device->use_metalrt && + (device->kernel_features & KERNEL_FEATURE_HAIR); + request.pipeline->metalrt_hair_thick = device->use_metalrt && + (device->kernel_features & KERNEL_FEATURE_HAIR_THICK); + request.pipeline->metalrt_pointcloud = device->use_metalrt && + (device->kernel_features & KERNEL_FEATURE_POINTCLOUD); { thread_scoped_lock lock(cache_mutex); @@ -225,9 +228,9 @@ bool kernel_has_intersection(DeviceKernel device_kernel) /* metalrt options */ bool use_metalrt = device->use_metalrt; - bool metalrt_hair = device->kernel_features & KERNEL_FEATURE_HAIR; - bool metalrt_hair_thick = device->kernel_features & KERNEL_FEATURE_HAIR_THICK; - bool metalrt_pointcloud = device->kernel_features & KERNEL_FEATURE_POINTCLOUD; + bool metalrt_hair = use_metalrt && (device->kernel_features & KERNEL_FEATURE_HAIR); + bool metalrt_hair_thick = use_metalrt && 
(device->kernel_features & KERNEL_FEATURE_HAIR_THICK); + bool metalrt_pointcloud = use_metalrt && (device->kernel_features & KERNEL_FEATURE_POINTCLOUD); MetalKernelPipeline *best_pipeline = nullptr; for (auto &pipeline : collection) { ``` Reviewed By: brecht Differential Revision: https://developer.blender.org/D14923 === M intern/cycles/device/metal/device_impl.h M intern/cycles/device/metal/device_impl.mm M intern/cycles/device/metal/kernel.h M intern/cycles/device/metal/kernel.mm M intern/cycles/device/metal/queue.mm M intern/cycles/kernel/device/metal/compat.h === diff --git a/intern/cycles/device/metal/device_impl.h b/intern/cycles/device/metal/device_impl.h index 27c58ce6d2f..7506b9b069f 100644 --- a/intern/cycles/device/metal/device_impl.h +++ b/intern/cycles/device/metal/device_impl.h @@ -28,7 +28,8 @@ class MetalDevice : public Device { id mtlGeneralCommandQueue = nil; id mtlAncillaryArgEncoder = nil; /* encoder used for fetching device pointers from MTLBuffers */ - string source_used_for_compile[PSO_NUM]; + string source[PSO_NUM]; + string source_md5[PSO_NUM]; KernelParamsMetal launch_params = {0}; @@ -72,7 +73,6 @@ class MetalDevice : public Device { id texture_bindings_3d = nil; std::vector> texture_slot_map; - MetalDeviceKernels kernels; bool use_metalrt = false; bool use_function_specialisation = false; @@ -110,6 +110,8 @@ class MetalDevice : public Device { virtual void build_bvh(BVH *bvh, Progress &progress, bool refit) override; + id compile(string const &source); + /* -- */ /* low-level memory management */ diff --git a/intern/cycles/device/metal/device_impl.mm b/intern/cycles/device/metal/device_impl.mm index c01f51fb506..e1438a9d6e2 100644 --- a/intern/cycles/device/metal/device_impl.mm +++ b/intern/cycles/device/metal/device_impl.mm @@ -275,96 +275,44 @@ bool MetalDevice::load_kernels(const uint _kernel_features) * active, but may still need to be rendered without motion blur if that isn't active as well. 
*/ motion_blur = kernel_features & KERNEL_FEATURE_OBJECT_MOTION; - NSError *error = NULL; + source[PSO_GENERIC] = get_source(kernel_features); + mtlLibrary[PSO_GENERIC] = compile(source[PSO_GENERIC]); - for (int i = 0; i < PSO_NUM; i++) { -if (mtlLibrary[i]) { - [mtlLibrary[i] release]; - mtlLibrary[i] = nil; -} - } + MD5Hash md5; + md
[Bf-blender-cvs] [007184bcf21] master: Enable inlining on Apple Silicon. Use new process-wide ShaderCache in order to safely re-enable binary archives
Commit: 007184bcf2121296fa244871382670b0f06210c0 Author: Michael Jones Date: Wed May 11 14:52:49 2022 +0100 Branches: master https://developer.blender.org/rB007184bcf2121296fa244871382670b0f06210c0 Enable inlining on Apple Silicon. Use new process-wide ShaderCache in order to safely re-enable binary archives This patch is the same as D14763, but with a fix for unit test failures caused by ShaderCache fetch logic not working in the non-MetalRT case: ``` diff --git a/intern/cycles/device/metal/kernel.mm b/intern/cycles/device/metal/kernel.mm index ad268ae7057..6aa1a56056e 100644 --- a/intern/cycles/device/metal/kernel.mm +++ b/intern/cycles/device/metal/kernel.mm @@ -203,9 +203,12 @@ bool kernel_has_intersection(DeviceKernel device_kernel) /* metalrt options */ request.pipeline->use_metalrt = device->use_metalrt; - request.pipeline->metalrt_hair = device->kernel_features & KERNEL_FEATURE_HAIR; - request.pipeline->metalrt_hair_thick = device->kernel_features & KERNEL_FEATURE_HAIR_THICK; - request.pipeline->metalrt_pointcloud = device->kernel_features & KERNEL_FEATURE_POINTCLOUD; + request.pipeline->metalrt_hair = device->use_metalrt && + (device->kernel_features & KERNEL_FEATURE_HAIR); + request.pipeline->metalrt_hair_thick = device->use_metalrt && + (device->kernel_features & KERNEL_FEATURE_HAIR_THICK); + request.pipeline->metalrt_pointcloud = device->use_metalrt && + (device->kernel_features & KERNEL_FEATURE_POINTCLOUD); { thread_scoped_lock lock(cache_mutex); @@ -225,9 +228,9 @@ bool kernel_has_intersection(DeviceKernel device_kernel) /* metalrt options */ bool use_metalrt = device->use_metalrt; - bool metalrt_hair = device->kernel_features & KERNEL_FEATURE_HAIR; - bool metalrt_hair_thick = device->kernel_features & KERNEL_FEATURE_HAIR_THICK; - bool metalrt_pointcloud = device->kernel_features & KERNEL_FEATURE_POINTCLOUD; + bool metalrt_hair = use_metalrt && (device->kernel_features & KERNEL_FEATURE_HAIR); + bool metalrt_hair_thick = use_metalrt && 
(device->kernel_features & KERNEL_FEATURE_HAIR_THICK); + bool metalrt_pointcloud = use_metalrt && (device->kernel_features & KERNEL_FEATURE_POINTCLOUD); MetalKernelPipeline *best_pipeline = nullptr; for (auto &pipeline : collection) { ``` Reviewed By: brecht Differential Revision: https://developer.blender.org/D14923 === M intern/cycles/device/metal/device_impl.h M intern/cycles/device/metal/device_impl.mm M intern/cycles/device/metal/kernel.h M intern/cycles/device/metal/kernel.mm M intern/cycles/device/metal/queue.mm M intern/cycles/kernel/device/metal/compat.h === diff --git a/intern/cycles/device/metal/device_impl.h b/intern/cycles/device/metal/device_impl.h index 27c58ce6d2f..7506b9b069f 100644 --- a/intern/cycles/device/metal/device_impl.h +++ b/intern/cycles/device/metal/device_impl.h @@ -28,7 +28,8 @@ class MetalDevice : public Device { id mtlGeneralCommandQueue = nil; id mtlAncillaryArgEncoder = nil; /* encoder used for fetching device pointers from MTLBuffers */ - string source_used_for_compile[PSO_NUM]; + string source[PSO_NUM]; + string source_md5[PSO_NUM]; KernelParamsMetal launch_params = {0}; @@ -72,7 +73,6 @@ class MetalDevice : public Device { id texture_bindings_3d = nil; std::vector> texture_slot_map; - MetalDeviceKernels kernels; bool use_metalrt = false; bool use_function_specialisation = false; @@ -110,6 +110,8 @@ class MetalDevice : public Device { virtual void build_bvh(BVH *bvh, Progress &progress, bool refit) override; + id compile(string const &source); + /* -- */ /* low-level memory management */ diff --git a/intern/cycles/device/metal/device_impl.mm b/intern/cycles/device/metal/device_impl.mm index c01f51fb506..e1438a9d6e2 100644 --- a/intern/cycles/device/metal/device_impl.mm +++ b/intern/cycles/device/metal/device_impl.mm @@ -275,96 +275,44 @@ bool MetalDevice::load_kernels(const uint _kernel_features) * active, but may still need to be rendered without motion blur if that isn't active as well. 
*/ motion_blur = kernel_features & KERNEL_FEATURE_OBJECT_MOTION; - NSError *error = NULL; + source[PSO_GENERIC] = get_source(kernel_features); + mtlLibrary[PSO_GENERIC] = compile(source[PSO_GENERIC]); - for (int i = 0; i < PSO_NUM; i++) { -if (mtlLibrary[i]) { - [mtlLibrary[i] release]; - mtlLibrary[i] = nil; -} - } + MD5Hash md5; + md
[Bf-blender-cvs] [b82de02e7ce] master: Cycles: Enable inlining on Apple Silicon for 1.1x speedup
Commit: b82de02e7ce857e20b842a074c0068b146a9fd79 Author: Michael Jones Date: Tue Apr 26 19:00:35 2022 +0100 Branches: master https://developer.blender.org/rBb82de02e7ce857e20b842a074c0068b146a9fd79 Cycles: Enable inlining on Apple Silicon for 1.1x speedup This is a stripped down version of D14645 without the scene specialisation optimisations. The two major changes in this patch are: - Enables more aggressive inlining on Apple Silicon resulting in a 1.1x speedup and 10% reduction in spill, at the cost of longer pipeline build times - Revival of shader binary archives through a new ShaderCache which is shared between MetalDevice instances using the same physical MTLDevice. This mitigates the extra compile times via explicit caching (rather than, as before, relying on the implicit system shader cache which can be purged without notice) Reviewed By: brecht Differential Revision: https://developer.blender.org/D14763 === M intern/cycles/device/metal/device_impl.h M intern/cycles/device/metal/device_impl.mm M intern/cycles/device/metal/kernel.h M intern/cycles/device/metal/kernel.mm M intern/cycles/device/metal/queue.mm M intern/cycles/kernel/device/metal/compat.h === diff --git a/intern/cycles/device/metal/device_impl.h b/intern/cycles/device/metal/device_impl.h index 27c58ce6d2f..d7311ee985f 100644 --- a/intern/cycles/device/metal/device_impl.h +++ b/intern/cycles/device/metal/device_impl.h @@ -28,7 +28,8 @@ class MetalDevice : public Device { id mtlGeneralCommandQueue = nil; id mtlAncillaryArgEncoder = nil; /* encoder used for fetching device pointers from MTLBuffers */ - string source_used_for_compile[PSO_NUM]; + string source[PSO_NUM]; + string source_md5[PSO_NUM]; KernelParamsMetal launch_params = {0}; @@ -110,6 +111,12 @@ class MetalDevice : public Device { virtual void build_bvh(BVH *bvh, Progress &progress, bool refit) override; + id compile(string const &source); + + const MetalKernelPipeline &get_best_pipeline(DeviceKernel kernel) const; + + bool 
kernel_available(DeviceKernel kernel) const; + /* -- */ /* low-level memory management */ diff --git a/intern/cycles/device/metal/device_impl.mm b/intern/cycles/device/metal/device_impl.mm index c01f51fb506..7d1212cb37c 100644 --- a/intern/cycles/device/metal/device_impl.mm +++ b/intern/cycles/device/metal/device_impl.mm @@ -275,96 +275,44 @@ bool MetalDevice::load_kernels(const uint _kernel_features) * active, but may still need to be rendered without motion blur if that isn't active as well. */ motion_blur = kernel_features & KERNEL_FEATURE_OBJECT_MOTION; - NSError *error = NULL; + source[PSO_GENERIC] = get_source(kernel_features); + mtlLibrary[PSO_GENERIC] = compile(source[PSO_GENERIC]); - for (int i = 0; i < PSO_NUM; i++) { -if (mtlLibrary[i]) { - [mtlLibrary[i] release]; - mtlLibrary[i] = nil; -} - } + MD5Hash md5; + md5.append(source[PSO_GENERIC]); + source_md5[PSO_GENERIC] = md5.get_hex(); + + metal_printf("Front-end compilation finished (generic)\n"); + + bool result = kernels.load(this, false); + reserve_local_memory(kernel_features); + + return result; +} + +id MetalDevice::compile(string const &source) +{ MTLCompileOptions *options = [[MTLCompileOptions alloc] init]; options.fastMathEnabled = YES; if (@available(macOS 12.0, *)) { options.languageVersion = MTLLanguageVersion2_4; } - else { -return false; - } - - string metalsrc; - - /* local helper: dump source to disk and return filepath */ - auto dump_source = [&](int kernel_type) -> string { -string &source = source_used_for_compile[kernel_type]; -string metalsrc = path_cache_get(path_join("kernels", - string_printf("%s.%s.metal", - kernel_type_as_string(kernel_type), - util_md5_string(source).c_str(; -path_write_text(metalsrc, source); -return metalsrc; - }; - - /* local helper: fetch the kernel source code, adjust it for specific PSO_.. 
kernel_type flavor, - * then compile it into a MTLLibrary */ - auto fetch_and_compile_source = [&](int kernel_type) { -/* Record the source used to compile this library, for hash building later. */ -string &source = source_used_for_compile[kernel_type]; - -switch (kernel_type) { - case PSO_GENERIC: { -source = get_source(kernel_features); -break; - } - case PSO_SPECIALISED: { -/* PSO_SPECIALISED derives from PSO_GENERIC */ -string &generic_source = source_used_for_compi
[Bf-blender-cvs] [869a46df298] master: Cycles fp consistency for Apple Silicon CPUs
Commit: 869a46df2980818644db4823fb1d29e9d525b645 Author: Michael Jones Date: Tue Apr 12 19:36:55 2022 +0100 Branches: master https://developer.blender.org/rB869a46df2980818644db4823fb1d29e9d525b645 Cycles fp consistency for Apple Silicon CPUs Propagate the fp settings from the main thread to all the worker threads (the fp settings include the FZ settings among other things) - this guarantees consistency in execution of floating point math regardless of whether it's executed in a tbb thread arena or on the main thread. Add FZ mode to arm64/aarch64 in parallel to the way it's been done on Intel processors; currently, compiling for an arm target does not set this mode at all, and hence potentially runs slower and with possible results mismatch with Intel x86. Reviewed By: brecht Differential Revision: https://developer.blender.org/D14454 === M intern/cycles/integrator/path_trace.cpp M intern/cycles/util/simd.h === diff --git a/intern/cycles/integrator/path_trace.cpp b/intern/cycles/integrator/path_trace.cpp index ab134179602..f1e70b7f28f 100644 --- a/intern/cycles/integrator/path_trace.cpp +++ b/intern/cycles/integrator/path_trace.cpp @@ -355,6 +355,9 @@ void PathTrace::path_trace(RenderWork &render_work) const int num_works = path_trace_works_.size(); + tbb::task_group_context *tbb_ctx = tbb::task::self().group(); + tbb_ctx->capture_fp_settings(); + tbb::parallel_for(0, num_works, [&](int i) { const double work_start_time = time_dt(); const int num_samples = render_work.path_trace.num_samples; diff --git a/intern/cycles/util/simd.h b/intern/cycles/util/simd.h index 15dda4e76a8..6772025d1de 100644 --- a/intern/cycles/util/simd.h +++ b/intern/cycles/util/simd.h @@ -32,6 +32,12 @@ # define SIMD_SET_FLUSH_TO_ZERO \ _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON); \ _MM_SET_DENORMALS_ZERO_MODE(_MM_DENORMALS_ZERO_ON); +#elif defined(__aarch64__) || defined(_M_ARM64) +#define _MM_FLUSH_ZERO_ON 24 +#define __get_fpcr(__fpcr) __asm__ __volatile__("mrs %0,fpcr" : "=r" (__fpcr)) +#define __set_fpcr(__fpcr) 
__asm__ __volatile__("msr fpcr,%0" : :"ri" (__fpcr)) +# define SIMD_SET_FLUSH_TO_ZERO set_fz(_MM_FLUSH_ZERO_ON); +# define SIMD_GET_FLUSH_TO_ZERO get_fz(_MM_FLUSH_ZERO_ON) #else # define SIMD_SET_FLUSH_TO_ZERO #endif @@ -104,6 +110,21 @@ static struct PosInfTy { static struct StepTy { } step ccl_attr_maybe_unused; +#endif +#if defined(__aarch64__) || defined(_M_ARM64) +__forceinline int set_fz(uint32_t flag) { +uint64_t old_fpcr, new_fpcr; +__get_fpcr(old_fpcr); +new_fpcr = old_fpcr | (1ULL << flag); +__set_fpcr(new_fpcr); +__get_fpcr(old_fpcr); +return old_fpcr == new_fpcr; +} +__forceinline int get_fz(uint32_t flag) { +uint64_t cur_fpcr; +__get_fpcr(cur_fpcr); +return (cur_fpcr & (1ULL<< flag)) > 0 ? 1 : 0 ; +} #endif /* Utilities used by Neon */ ___ Bf-blender-cvs mailing list Bf-blender-cvs@blender.org List details, subscription details or unsubscribe: https://lists.blender.org/mailman/listinfo/bf-blender-cvs
[Bf-blender-cvs] [952a613d384] blender-v3.1-release: Cycles: Hide MetalRT checkbox for AMD GPUs
Commit: 952a613d3843a7ab47bd8063da71c277ee0a013f Author: Michael Jones Date: Tue Feb 22 17:09:28 2022 + Branches: blender-v3.1-release https://developer.blender.org/rB952a613d3843a7ab47bd8063da71c277ee0a013f Cycles: Hide MetalRT checkbox for AMD GPUs This patch hides the MetalRT checkbox for AMD GPUs, pending fixes for MetalRT argument encoding on AMD. Reviewed By: brecht Differential Revision: https://developer.blender.org/D14175 === M intern/cycles/blender/addon/properties.py M intern/cycles/device/metal/device_impl.mm === diff --git a/intern/cycles/blender/addon/properties.py b/intern/cycles/blender/addon/properties.py index ef686fc0c70..84d0e95acd8 100644 --- a/intern/cycles/blender/addon/properties.py +++ b/intern/cycles/blender/addon/properties.py @@ -1527,9 +1527,12 @@ class CyclesPreferences(bpy.types.AddonPreferences): row.prop(self, "peer_memory") if compute_device_type == 'METAL': -row = layout.row() -row.use_property_split = True -row.prop(self, "use_metalrt") +import platform +# MetalRT only works on Apple Silicon at present, pending argument encoding fixes on AMD +if platform.machine() == 'arm64': +row = layout.row() +row.use_property_split = True +row.prop(self, "use_metalrt") def draw(self, context): diff --git a/intern/cycles/device/metal/device_impl.mm b/intern/cycles/device/metal/device_impl.mm index 8ced0210e30..7291dd880ca 100644 --- a/intern/cycles/device/metal/device_impl.mm +++ b/intern/cycles/device/metal/device_impl.mm @@ -90,11 +90,11 @@ MetalDevice::MetalDevice(const DeviceInfo &info, Stats &stats, Profiler &profile } case METAL_GPU_APPLE: { max_threads_per_threadgroup = 512; + use_metalrt = info.use_metalrt; break; } } - use_metalrt = info.use_metalrt; if (auto metalrt = getenv("CYCLES_METALRT")) { use_metalrt = (atoi(metalrt) != 0); } ___ Bf-blender-cvs mailing list Bf-blender-cvs@blender.org List details, subscription details or unsubscribe: https://lists.blender.org/mailman/listinfo/bf-blender-cvs
[Bf-blender-cvs] [27d3140b136] blender-v3.1-release: Cycles: Fix Metal kernel compilation for AMD GPUs
Commit: 27d3140b1363b852f449c81f941974fbd644464a Author: Michael Jones Date: Thu Feb 10 18:03:52 2022 + Branches: blender-v3.1-release https://developer.blender.org/rB27d3140b1363b852f449c81f941974fbd644464a Cycles: Fix Metal kernel compilation for AMD GPUs Workaround for a compilation issue preventing kernels compiling for AMD GPUs: Avoid problematic use of templates on Metal by making `gpu_parallel_active_index_array` a wrapper macro, and moving `blocksize` to be a macro parameter. Reviewed By: brecht Differential Revision: https://developer.blender.org/D14081 === M intern/cycles/kernel/device/gpu/kernel.h M intern/cycles/kernel/device/gpu/parallel_active_index.h === diff --git a/intern/cycles/kernel/device/gpu/kernel.h b/intern/cycles/kernel/device/gpu/kernel.h index eed005803e2..7ebf8777b91 100644 --- a/intern/cycles/kernel/device/gpu/kernel.h +++ b/intern/cycles/kernel/device/gpu/kernel.h @@ -295,7 +295,7 @@ ccl_gpu_kernel_threads(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE) int kernel_index); ccl_gpu_kernel_lambda_pass.kernel_index = kernel_index; - gpu_parallel_active_index_array( + gpu_parallel_active_index_array(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE, num_states, indices, num_indices, ccl_gpu_kernel_lambda_pass); } @@ -310,7 +310,7 @@ ccl_gpu_kernel_threads(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE) int kernel_index); ccl_gpu_kernel_lambda_pass.kernel_index = kernel_index; - gpu_parallel_active_index_array( + gpu_parallel_active_index_array(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE, num_states, indices, num_indices, ccl_gpu_kernel_lambda_pass); } @@ -322,7 +322,7 @@ ccl_gpu_kernel_threads(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE) { ccl_gpu_kernel_lambda(INTEGRATOR_STATE(state, path, queued_kernel) != 0); - gpu_parallel_active_index_array( + gpu_parallel_active_index_array(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE, num_states, indices, num_indices, ccl_gpu_kernel_lambda_pass); } @@ -335,7 +335,7 @@ 
ccl_gpu_kernel_threads(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE) { ccl_gpu_kernel_lambda(INTEGRATOR_STATE(state, path, queued_kernel) == 0); - gpu_parallel_active_index_array( + gpu_parallel_active_index_array(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE, num_states, indices + indices_offset, num_indices, ccl_gpu_kernel_lambda_pass); } @@ -348,7 +348,7 @@ ccl_gpu_kernel_threads(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE) { ccl_gpu_kernel_lambda(INTEGRATOR_STATE(state, shadow_path, queued_kernel) == 0); - gpu_parallel_active_index_array( + gpu_parallel_active_index_array(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE, num_states, indices + indices_offset, num_indices, ccl_gpu_kernel_lambda_pass); } @@ -391,7 +391,7 @@ ccl_gpu_kernel_threads(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE) int num_active_paths); ccl_gpu_kernel_lambda_pass.num_active_paths = num_active_paths; - gpu_parallel_active_index_array( + gpu_parallel_active_index_array(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE, num_states, indices, num_indices, ccl_gpu_kernel_lambda_pass); } @@ -424,7 +424,7 @@ ccl_gpu_kernel_threads(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE) int num_active_paths); ccl_gpu_kernel_lambda_pass.num_active_paths = num_active_paths; - gpu_parallel_active_index_array( + gpu_parallel_active_index_array(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE, num_states, indices, num_indices, ccl_gpu_kernel_lambda_pass); } diff --git a/intern/cycles/kernel/device/gpu/parallel_active_index.h b/intern/cycles/kernel/device/gpu/parallel_active_index.h index a5320edcb3c..12b93cd77a9 100644 --- a/intern/cycles/kernel/device/gpu/parallel_active_index.h +++ b/intern/cycles/kernel/device/gpu/parallel_active_index.h @@ -31,44 +31,26 @@ CCL_NAMESPACE_BEGIN # define GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE 512 #endif +#ifndef __KERNEL_METAL__ +template +__device__ +#endif +void gpu_parallel_active_index_array_impl(const uint num_states, + ccl_global int *indices, + ccl_global int *num_indices, 
#ifdef __KERNEL_METAL__ -struct ActiveIndexContext { - ActiveIndexContext(int _thread_index, - int _global_index, - int _threadgroup_size, - int _simdgroup_size, - int _simd_lane_index, - int _simd_group_index, - int _num_simd_groups, - threadgroup int *_simdgroup_offset) - : thread_index(_thread_index), -global_index(_global_index), -blocksize(_threadgroup_size), -ccl_gpu_warp_size
[Bf-blender-cvs] [40fce61a6ab] blender-v3.1-release: Cycles: enable Metal on AMD GPUs, set macOS minimum versions
Commit: 40fce61a6abe79508022d3e0cd3a29e187f18e74 Author: Michael Jones Date: Fri Feb 11 19:19:51 2022 +0100 Branches: blender-v3.1-release https://developer.blender.org/rB40fce61a6abe79508022d3e0cd3a29e187f18e74 Cycles: enable Metal on AMD GPUs, set macOS minimum versions * Apple Silicon support enabled on macOS 12.2+ * AMD support enabled on macOS 12.3+ This patch also fixes a device enumeration crash on certain AMD configs which was caused by over-release of MTLDevice objects. Differential Revision: https://developer.blender.org/D14090 === M intern/cycles/device/metal/device.mm M intern/cycles/device/metal/device_impl.mm M intern/cycles/device/metal/util.h M intern/cycles/device/metal/util.mm === diff --git a/intern/cycles/device/metal/device.mm b/intern/cycles/device/metal/device.mm index bc893adea17..ef592438980 100644 --- a/intern/cycles/device/metal/device.mm +++ b/intern/cycles/device/metal/device.mm @@ -39,33 +39,20 @@ bool device_metal_init() return true; } -static int device_metal_get_num_devices_safe(uint32_t *num_devices) -{ - *num_devices = MTLCopyAllDevices().count; - return 0; -} - void device_metal_info(vector &devices) { - uint32_t num_devices = 0; - device_metal_get_num_devices_safe(&num_devices); - if (num_devices == 0) { -return; - } - - vector usable_devices; - MetalInfo::get_usable_devices(&usable_devices); + auto usable_devices = MetalInfo::get_usable_devices(); /* Devices are numbered consecutively across platforms. */ set unique_ids; int device_index = 0; - for (MetalPlatformDevice &device : usable_devices) { + for (id &device : usable_devices) { /* Compute unique ID for persistent user preferences. */ -const string &device_name = device.device_name; +string device_name = [device.name UTF8String]; string id = string("METAL_") + device_name; /* Hardware ID might not be unique, add device number in that case. 
*/ if (unique_ids.find(id) != unique_ids.end()) { - id += string_printf("_ID_%d", num_devices); + id += string_printf("_ID_%d", device_index); } unique_ids.insert(id); @@ -94,15 +81,13 @@ void device_metal_info(vector &devices) string device_metal_capabilities() { string result = ""; - string error_msg = ""; - uint32_t num_devices = 0; - assert(device_metal_get_num_devices_safe(&num_devices)); + auto allDevices = MTLCopyAllDevices(); + uint32_t num_devices = allDevices.count; if (num_devices == 0) { return "No Metal devices found\n"; } result += string_printf("Number of devices: %u\n", num_devices); - NSArray> *allDevices = MTLCopyAllDevices(); for (id device in allDevices) { result += string_printf("\t\tDevice: %s\n", [device.name UTF8String]); } diff --git a/intern/cycles/device/metal/device_impl.mm b/intern/cycles/device/metal/device_impl.mm index cdaafc60ab0..8ced0210e30 100644 --- a/intern/cycles/device/metal/device_impl.mm +++ b/intern/cycles/device/metal/device_impl.mm @@ -53,16 +53,10 @@ MetalDevice::MetalDevice(const DeviceInfo &info, Stats &stats, Profiler &profile mtlDevId = info.num; /* select chosen device */ - vector usable_devices; - MetalInfo::get_usable_devices(&usable_devices); - if (usable_devices.size() == 0) { -set_error("Metal: no devices found."); -return; - } + auto usable_devices = MetalInfo::get_usable_devices(); assert(mtlDevId < usable_devices.size()); - MetalPlatformDevice &platform_device = usable_devices[mtlDevId]; - mtlDevice = platform_device.device_id; - device_name = platform_device.device_name; + mtlDevice = usable_devices[mtlDevId]; + device_name = [mtlDevice.name UTF8String]; device_vendor = MetalInfo::get_vendor_from_device_name(device_name); assert(device_vendor != METAL_GPU_UNKNOWN); metal_printf("Creating new Cycles device for Metal: %s\n", device_name.c_str()); @@ -458,7 +452,8 @@ MetalDevice::MetalMem *MetalDevice::generic_alloc(device_memory &mem) id metal_buffer = nil; MTLResourceOptions options = default_storage_mode; 
- /* Workaround for "bake" unit tests which fail if RenderBuffers is allocated with MTLResourceStorageModeShared. */ + /* Workaround for "bake" unit tests which fail if RenderBuffers is allocated with + * MTLResourceStorageModeShared. */ if (strstr(mem.name, "RenderBuffers")) { options = MTLResourceStorageModeManaged; } @@ -769,9 +764,11 @@ void MetalDevice::tex_alloc(device_texture &mem) /* Check that dimensions fit within maximum allowable size. See https://developer.apple.com/metal/Metal-Feature-Set-Tables.pdf */ - if (mem.data_width > 16384 || - me
[Bf-blender-cvs] [a44366a642b] blender-v3.1-release: Cycles: Expose "Use MetalRT" checkbox
Commit: a44366a642bc22bc725f2a700abd14f891cfde60 Author: Michael Jones Date: Thu Feb 10 15:46:49 2022 + Branches: blender-v3.1-release https://developer.blender.org/rBa44366a642bc22bc725f2a700abd14f891cfde60 Cycles: Expose "Use MetalRT" checkbox For curve-heavy scenes, memory consumption regressed when we switched from MetalRT to bvh2. Allow users to opt in to MetalRT to workaround this. Reviewed By: brecht Differential Revision: https://developer.blender.org/D14071 === M intern/cycles/blender/addon/properties.py M intern/cycles/blender/device.cpp M intern/cycles/device/device.cpp M intern/cycles/device/device.h M intern/cycles/device/metal/device_impl.mm === diff --git a/intern/cycles/blender/addon/properties.py b/intern/cycles/blender/addon/properties.py index 1afb321da3d..01e73d7ed03 100644 --- a/intern/cycles/blender/addon/properties.py +++ b/intern/cycles/blender/addon/properties.py @@ -1374,6 +1374,12 @@ class CyclesPreferences(bpy.types.AddonPreferences): default=False, ) +use_metalrt: BoolProperty( +name="MetalRT (Experimental)", +description="MetalRT for ray tracing uses less memory for scenes which use curves extensively, and can give better performance in specific cases. 
However this support is experimental and some scenes may render incorrectly", +default=False, +) + def find_existing_device_entry(self, device): for device_entry in self.devices: if device_entry.id == device[2] and device_entry.type == device[1]: @@ -1519,6 +1525,12 @@ class CyclesPreferences(bpy.types.AddonPreferences): row.use_property_split = True row.prop(self, "peer_memory") +if compute_device_type == 'METAL': +row = layout.row() +row.use_property_split = True +row.prop(self, "use_metalrt") + + def draw(self, context): self.draw_impl(self.layout, context) diff --git a/intern/cycles/blender/device.cpp b/intern/cycles/blender/device.cpp index d39381ac6f1..d7feb7d66b2 100644 --- a/intern/cycles/blender/device.cpp +++ b/intern/cycles/blender/device.cpp @@ -118,6 +118,10 @@ DeviceInfo blender_device_info(BL::Preferences &b_preferences, BL::Scene &b_scen device.has_peer_memory = false; } + if (get_boolean(cpreferences, "use_metalrt")) { +device.use_metalrt = true; + } + return device; } diff --git a/intern/cycles/device/device.cpp b/intern/cycles/device/device.cpp index 4d981e45ff1..bd7dd60e58a 100644 --- a/intern/cycles/device/device.cpp +++ b/intern/cycles/device/device.cpp @@ -328,6 +328,7 @@ DeviceInfo Device::get_multi_device(const vector &subdevices, info.has_osl = true; info.has_profiling = true; info.has_peer_memory = false; + info.use_metalrt = false; info.denoisers = DENOISER_ALL; foreach (const DeviceInfo &device, subdevices) { @@ -374,6 +375,7 @@ DeviceInfo Device::get_multi_device(const vector &subdevices, info.has_osl &= device.has_osl; info.has_profiling &= device.has_profiling; info.has_peer_memory |= device.has_peer_memory; +info.use_metalrt |= device.use_metalrt; info.denoisers &= device.denoisers; } diff --git a/intern/cycles/device/device.h b/intern/cycles/device/device.h index c032773ddd0..544fe5b4a35 100644 --- a/intern/cycles/device/device.h +++ b/intern/cycles/device/device.h @@ -79,6 +79,7 @@ class DeviceInfo { bool has_profiling; /* 
Supports runtime collection of profiling info. */ bool has_peer_memory; /* GPU has P2P access to memory of another GPU. */ bool has_gpu_queue; /* Device supports GPU queue. */ + bool use_metalrt; /* Use MetalRT to accelerate ray queries (Metal only). */ DenoiserTypeMask denoisers; /* Supported denoiser types. */ int cpu_threads; vector multi_devices; @@ -96,6 +97,7 @@ class DeviceInfo { has_profiling = false; has_peer_memory = false; has_gpu_queue = false; +use_metalrt = false; denoisers = DENOISER_NONE; } diff --git a/intern/cycles/device/metal/device_impl.mm b/intern/cycles/device/metal/device_impl.mm index 251ba54e477..cdaafc60ab0 100644 --- a/intern/cycles/device/metal/device_impl.mm +++ b/intern/cycles/device/metal/device_impl.mm @@ -100,6 +100,7 @@ MetalDevice::MetalDevice(const DeviceInfo &info, Stats &stats, Profiler &profile } } + use_metalrt = info.use_metalrt; if (auto metalrt = getenv("CYCLES_METALRT")) { use_metalrt = (atoi(metalrt) != 0); } ___ Bf-blender-cvs mailing list Bf-blender-cvs@blender.org List details, subscription details or unsubscribe: https://lists.blender.org/mailman/listinfo/bf-blender-cvs
[Bf-blender-cvs] [35dedc11d56] blender-v3.1-release: Fix T95477: Report error instead of crashing when Metal texture size limits exceeded.
Commit: 35dedc11d5649352326af3701aef444b39bb6aa3 Author: Michael Jones Date: Thu Feb 10 10:54:18 2022 + Branches: blender-v3.1-release https://developer.blender.org/rB35dedc11d5649352326af3701aef444b39bb6aa3 Fix T95477: Report error instead of crashing when Metal texture size limits exceeded. Reviewed By: brecht Differential Revision: https://developer.blender.org/D14074 === M intern/cycles/device/metal/device_impl.mm === diff --git a/intern/cycles/device/metal/device_impl.mm b/intern/cycles/device/metal/device_impl.mm index 564c3c98759..251ba54e477 100644 --- a/intern/cycles/device/metal/device_impl.mm +++ b/intern/cycles/device/metal/device_impl.mm @@ -765,6 +765,15 @@ void MetalDevice::tex_alloc_as_buffer(device_texture &mem) void MetalDevice::tex_alloc(device_texture &mem) { + /* Check that dimensions fit within maximum allowable size. + See https://developer.apple.com/metal/Metal-Feature-Set-Tables.pdf + */ + if (mem.data_width > 16384 || + mem.data_height > 16384) { +set_error(string_printf("Texture exceeds maximum allowed size of 16384 x 16384 (requested: %zu x %zu)", mem.data_width, mem.data_height)); +return; + } + MTLStorageMode storage_mode = MTLStorageModeManaged; if (@available(macos 10.15, *)) { if ([mtlDevice hasUnifiedMemory] && ___ Bf-blender-cvs mailing list Bf-blender-cvs@blender.org List details, subscription details or unsubscribe: https://lists.blender.org/mailman/listinfo/bf-blender-cvs
[Bf-blender-cvs] [3d12dd59ce1] blender-v3.1-release: Cycles: Workaround for failing "bake" unit tests in Metal
Commit: 3d12dd59ce1714e4e3e34d8d8f73de218050b2fb Author: Michael Jones Date: Thu Feb 10 10:57:28 2022 + Branches: blender-v3.1-release https://developer.blender.org/rB3d12dd59ce1714e4e3e34d8d8f73de218050b2fb Cycles: Workaround for failing "bake" unit tests in Metal Allocate "RenderBuffers" with MTLResourceStorageModeManaged rather than MTLResourceStorageModeShared. Reviewed By: brecht Differential Revision: https://developer.blender.org/D14073 === M intern/cycles/device/metal/device_impl.mm === diff --git a/intern/cycles/device/metal/device_impl.mm b/intern/cycles/device/metal/device_impl.mm index 17acb2c94e4..564c3c98759 100644 --- a/intern/cycles/device/metal/device_impl.mm +++ b/intern/cycles/device/metal/device_impl.mm @@ -455,8 +455,14 @@ MetalDevice::MetalMem *MetalDevice::generic_alloc(device_memory &mem) mem.device_pointer = 0; id metal_buffer = nil; + MTLResourceOptions options = default_storage_mode; + + /* Workaround for "bake" unit tests which fail if RenderBuffers is allocated with MTLResourceStorageModeShared. */ + if (strstr(mem.name, "RenderBuffers")) { +options = MTLResourceStorageModeManaged; + } + if (size > 0) { -MTLResourceOptions options = default_storage_mode; if (mem.type == MEM_DEVICE_ONLY) { options = MTLResourceStorageModePrivate; } @@ -490,7 +496,7 @@ MetalDevice::MetalMem *MetalDevice::generic_alloc(device_memory &mem) mmem->mtlBuffer = metal_buffer; mmem->offset = 0; mmem->size = size; - if (mem.type != MEM_DEVICE_ONLY) { + if (options != MTLResourceStorageModePrivate) { mmem->hostPtr = [metal_buffer contents]; } else { ___ Bf-blender-cvs mailing list Bf-blender-cvs@blender.org List details, subscription details or unsubscribe: https://lists.blender.org/mailman/listinfo/bf-blender-cvs
[Bf-blender-cvs] [410e4e7ce18] blender-v3.1-release: Workaround for T94142: Cycles Metal crash with simultaneous viewport and final render
Commit: 410e4e7ce1823aa15d51ee231eedc63cdf72c8e3 Author: Michael Jones Date: Thu Feb 10 10:51:11 2022 + Branches: blender-v3.1-release https://developer.blender.org/rB410e4e7ce1823aa15d51ee231eedc63cdf72c8e3 Workaround for T94142: Cycles Metal crash with simultaneous viewport and final render Disable binary archives on Apple Silicon (issue stems from instancing multiple PSOs from the same binary archive). Pipeline creation still filters through the OS shader cache, mitigating any impact on setup times after the initial render. Reviewed By: brecht Differential Revision: https://developer.blender.org/D14072 === M intern/cycles/device/metal/kernel.mm === diff --git a/intern/cycles/device/metal/kernel.mm b/intern/cycles/device/metal/kernel.mm index e9bd1cea5df..91aac8831ca 100644 --- a/intern/cycles/device/metal/kernel.mm +++ b/intern/cycles/device/metal/kernel.mm @@ -59,10 +59,15 @@ bool MetalDeviceKernel::load(MetalDevice *device, } bool use_binary_archive = true; - if (getenv("CYCLES_METAL_DISABLE_BINARY_ARCHIVES")) { + if (device->device_vendor == METAL_GPU_APPLE) { +/* Workaround for T94142: Cycles Metal crash with simultaneous viewport and final render */ use_binary_archive = false; } + if (auto str = getenv("CYCLES_METAL_DISABLE_BINARY_ARCHIVES")) { +use_binary_archive = (atoi(str) == 0); + } + id archive = nil; string metalbin_path; if (use_binary_archive) { ___ Bf-blender-cvs mailing list Bf-blender-cvs@blender.org List details, subscription details or unsubscribe: https://lists.blender.org/mailman/listinfo/bf-blender-cvs
[Bf-blender-cvs] [f6c8a78ac68] master: Cycles: Fix bvh2 gen on Apple Silicon and use it to speed up renders
Commit: f6c8a78ac684242ba067499511a0db2fa64657fe Author: Michael Jones Date: Thu Jan 20 10:11:58 2022 + Branches: master https://developer.blender.org/rBf6c8a78ac684242ba067499511a0db2fa64657fe Cycles: Fix bvh2 gen on Apple Silicon and use it to speed up renders This patch fixes a correctness issue discovered in the `int4 select(...)` function on Apple Silicon machines, which causes bad bvh2 builds. Although the generated bvh2s give correct renders, the resulting runtime performance is terrible. This fix allows us to switch over to bvh2 on Apple Silicon giving a significant performance uplift for many of the standard benchmarking assets. It also fixes some unit test failures stemming from the use of MetalRT, and trivially enables the new pointclo [...] Ref T92212 Reviewed By: brecht Maniphest Tasks: T92212 Differential Revision: https://developer.blender.org/D13877 === M intern/cycles/device/metal/device_impl.mm M intern/cycles/util/math_int4.h === diff --git a/intern/cycles/device/metal/device_impl.mm b/intern/cycles/device/metal/device_impl.mm index 5906da3680b..17acb2c94e4 100644 --- a/intern/cycles/device/metal/device_impl.mm +++ b/intern/cycles/device/metal/device_impl.mm @@ -87,17 +87,14 @@ MetalDevice::MetalDevice(const DeviceInfo &info, Stats &stats, Profiler &profile default: break; case METAL_GPU_INTEL: { - use_metalrt = false; max_threads_per_threadgroup = 64; break; } case METAL_GPU_AMD: { - use_metalrt = false; max_threads_per_threadgroup = 128; break; } case METAL_GPU_APPLE: { - use_metalrt = true; max_threads_per_threadgroup = 512; break; } diff --git a/intern/cycles/util/math_int4.h b/intern/cycles/util/math_int4.h index 9e3f001efc2..eaa9be73b63 100644 --- a/intern/cycles/util/math_int4.h +++ b/intern/cycles/util/math_int4.h @@ -131,10 +131,7 @@ ccl_device_inline int4 clamp(const int4 &a, const int4 &mn, const int4 &mx) ccl_device_inline int4 select(const int4 &mask, const int4 &a, const int4 &b) { # ifdef __KERNEL_SSE__ - const __m128 m = 
_mm_cvtepi32_ps(mask); - /* TODO(sergey): avoid cvt. */ - return int4(_mm_castps_si128( - _mm_or_ps(_mm_and_ps(m, _mm_castsi128_ps(a)), _mm_andnot_ps(m, _mm_castsi128_ps(b))))); + return int4(_mm_or_si128(_mm_and_si128(mask, a), _mm_andnot_si128(mask, b))); # else return make_int4( (mask.x) ? a.x : b.x, (mask.y) ? a.y : b.y, (mask.z) ? a.z : b.z, (mask.w) ? a.w : b.w); ___ Bf-blender-cvs mailing list Bf-blender-cvs@blender.org List details, subscription details or unsubscribe: https://lists.blender.org/mailman/listinfo/bf-blender-cvs
[Bf-blender-cvs] [17cab47ed10] master: Cycles: Fix T94736: Crash when modifying strength of world environment texture
Commit: 17cab47ed10a99818f9cdd15657c1231e312da25 Author: Michael Jones Date: Wed Jan 19 17:57:24 2022 + Branches: master https://developer.blender.org/rB17cab47ed10a99818f9cdd15657c1231e312da25 Cycles: Fix T94736: Crash when modifying strength of world environment texture This patch fixes crash T94736 on Metal in which the launch_params were not being updated to reflect destruction of MetalMem objects. Reviewed By: brecht Differential Revision: https://developer.blender.org/D13875 === M intern/cycles/device/metal/device_impl.h M intern/cycles/device/metal/device_impl.mm === diff --git a/intern/cycles/device/metal/device_impl.h b/intern/cycles/device/metal/device_impl.h index a420a3ba704..8d289beda13 100644 --- a/intern/cycles/device/metal/device_impl.h +++ b/intern/cycles/device/metal/device_impl.h @@ -115,6 +115,8 @@ class MetalDevice : public Device { void load_texture_info(); + void erase_allocation(device_memory &mem); + virtual bool should_use_graphics_interop() override; virtual unique_ptr gpu_queue_create() override; diff --git a/intern/cycles/device/metal/device_impl.mm b/intern/cycles/device/metal/device_impl.mm index 4ad5a3caebc..1105fb20360 100644 --- a/intern/cycles/device/metal/device_impl.mm +++ b/intern/cycles/device/metal/device_impl.mm @@ -432,6 +432,25 @@ void MetalDevice::load_texture_info() } } +void MetalDevice::erase_allocation(device_memory &mem) +{ + stats.mem_free(mem.device_size); + mem.device_pointer = 0; + mem.device_size = 0; + + auto it = metal_mem_map.find(&mem); + if (it != metal_mem_map.end()) { +MetalMem *mmem = it->second.get(); + +/* blank out reference to MetalMem* in the launch params (fixes crash T94736) */ +if (mmem->pointer_index >= 0) { + device_ptr *pointers = (device_ptr*)&launch_params; + pointers[mmem->pointer_index] = 0; +} +metal_mem_map.erase(it); + } +} + MetalDevice::MetalMem *MetalDevice::generic_alloc(device_memory &mem) { size_t size = mem.memory_size(); @@ -561,11 +580,7 @@ void 
MetalDevice::generic_free(device_memory &mem) mmem.mtlBuffer = nil; } -stats.mem_free(mem.device_size); -mem.device_pointer = 0; -mem.device_size = 0; - -metal_mem_map.erase(&mem); +erase_allocation(mem); } } @@ -954,10 +969,7 @@ void MetalDevice::tex_free(device_texture &mem) delayed_free_list.push_back(mmem.mtlTexture); mmem.mtlTexture = nil; } -stats.mem_free(mem.device_size); -mem.device_pointer = 0; -mem.device_size = 0; -metal_mem_map.erase(&mem); +erase_allocation(mem); } } ___ Bf-blender-cvs mailing list Bf-blender-cvs@blender.org List details, subscription details or unsubscribe: https://lists.blender.org/mailman/listinfo/bf-blender-cvs
[Bf-blender-cvs] [efe3d60a2c8] master: Cycles: Fix Metal build
Commit: efe3d60a2c8306aefd41bc304548da35b67c252c Author: Michael Jones Date: Fri Jan 7 15:28:43 2022 + Branches: master https://developer.blender.org/rBefe3d60a2c8306aefd41bc304548da35b67c252c Cycles: Fix Metal build This patch fixes a couple of new Metal kernel compilation errors: 1) a kernel parameter count overflow, and 2) missing address space qualifiers. Reviewed By: brecht Differential Revision: https://developer.blender.org/D13763 === M intern/cycles/kernel/device/gpu/kernel.h M intern/cycles/kernel/device/metal/compat.h === diff --git a/intern/cycles/kernel/device/gpu/kernel.h b/intern/cycles/kernel/device/gpu/kernel.h index 027b2a7a8c7..00c727b48cb 100644 --- a/intern/cycles/kernel/device/gpu/kernel.h +++ b/intern/cycles/kernel/device/gpu/kernel.h @@ -821,8 +821,8 @@ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS) if (guiding_pass_flow != PASS_UNUSED) { kernel_assert(render_pass_motion != PASS_UNUSED); -const float *motion_in = buffer + render_pass_motion; -float *flow_out = guiding_pixel + guiding_pass_flow; +ccl_global const float *motion_in = buffer + render_pass_motion; +ccl_global float *flow_out = guiding_pixel + guiding_pass_flow; flow_out[0] = -motion_in[0] * pixel_scale; flow_out[1] = -motion_in[1] * pixel_scale; diff --git a/intern/cycles/kernel/device/metal/compat.h b/intern/cycles/kernel/device/metal/compat.h index a51afc37fc0..1222b68f0ee 100644 --- a/intern/cycles/kernel/device/metal/compat.h +++ b/intern/cycles/kernel/device/metal/compat.h @@ -98,8 +98,12 @@ using namespace metal::raytracing; #define FN14(p1, p2, p3, p4, p5, p6, p7, p8, p9, p10, p11, p12, p13, p14) p1; p2; p3; p4; p5; p6; p7; p8; p9; p10; p11; p12; p13; p14; #define FN15(p1, p2, p3, p4, p5, p6, p7, p8, p9, p10, p11, p12, p13, p14, p15) p1; p2; p3; p4; p5; p6; p7; p8; p9; p10; p11; p12; p13; p14; p15; #define FN16(p1, p2, p3, p4, p5, p6, p7, p8, p9, p10, p11, p12, p13, p14, p15, p16) p1; p2; p3; p4; p5; p6; p7; p8; p9; p10; p11; p12; p13; p14; p15; p16; 
-#define GET_LAST_ARG(p0, p1, p2, p3, p4, p5, p6, p7, p8, p9, p10, p11, p12, p13, p14, p15, p16, ...) p16 -#define PARAMS_MAKER(...) GET_LAST_ARG(__VA_ARGS__, FN16, FN15, FN14, FN13, FN12, FN11, FN10, FN9, FN8, FN7, FN6, FN5, FN4, FN3, FN2, FN1, FN0) +#define FN17(p1, p2, p3, p4, p5, p6, p7, p8, p9, p10, p11, p12, p13, p14, p15, p16, p17) p1; p2; p3; p4; p5; p6; p7; p8; p9; p10; p11; p12; p13; p14; p15; p16; p17; +#define FN18(p1, p2, p3, p4, p5, p6, p7, p8, p9, p10, p11, p12, p13, p14, p15, p16, p17, p18) p1; p2; p3; p4; p5; p6; p7; p8; p9; p10; p11; p12; p13; p14; p15; p16; p17; p18; +#define FN19(p1, p2, p3, p4, p5, p6, p7, p8, p9, p10, p11, p12, p13, p14, p15, p16, p17, p18, p19) p1; p2; p3; p4; p5; p6; p7; p8; p9; p10; p11; p12; p13; p14; p15; p16; p17; p18; p19; +#define FN20(p1, p2, p3, p4, p5, p6, p7, p8, p9, p10, p11, p12, p13, p14, p15, p16, p17, p18, p19, p20) p1; p2; p3; p4; p5; p6; p7; p8; p9; p10; p11; p12; p13; p14; p15; p16; p17; p18; p19; p20; +#define GET_LAST_ARG(p0, p1, p2, p3, p4, p5, p6, p7, p8, p9, p10, p11, p12, p13, p14, p15, p16, p17, p18, p19, p20, ...) p20 +#define PARAMS_MAKER(...) GET_LAST_ARG(__VA_ARGS__, FN20, FN19, FN18, FN17, FN16, FN15, FN14, FN13, FN12, FN11, FN10, FN9, FN8, FN7, FN6, FN5, FN4, FN3, FN2, FN1, FN0) /* Generate a struct containing the entry-point parameters and a "run" * method which can access them implicitly via this-> */ ___ Bf-blender-cvs mailing list Bf-blender-cvs@blender.org List details, subscription details or unsubscribe: https://lists.blender.org/mailman/listinfo/bf-blender-cvs
[Bf-blender-cvs] [e688c927eb3] master: Fix T94022: Both options GPU/CPU checked under preferences cause viewport render crash. (ARM/Metal)
Commit: e688c927eb3a82b21ba744ec57540ea2cd4f44c8 Author: Michael Jones Date: Mon Dec 13 22:20:16 2021 + Branches: master https://developer.blender.org/rBe688c927eb3a82b21ba744ec57540ea2cd4f44c8 Fix T94022: Both options GPU/CPU checked under preferences cause viewport render crash. (ARM/Metal) This fixes crash T94022 when selecting live viewport render with both GPU & CPU devices selected. It is caused by incorrect `KernelBVHLayout` assignment. Similar to `BVH_LAYOUT_MULTI_OPTIX` for Optix, this patch adds a `BVH_LAYOUT_MULTI_METAL` to correctly redirect to the correct Metal BVH layout type. Reviewed By: brecht Differential Revision: https://developer.blender.org/D13561 === M intern/cycles/bvh/bvh.cpp M intern/cycles/device/multi/device.cpp M intern/cycles/kernel/types.h M intern/cycles/scene/geometry.cpp M intern/cycles/scene/object.cpp === diff --git a/intern/cycles/bvh/bvh.cpp b/intern/cycles/bvh/bvh.cpp index 540bf52f7ac..703639e29f3 100644 --- a/intern/cycles/bvh/bvh.cpp +++ b/intern/cycles/bvh/bvh.cpp @@ -44,6 +44,7 @@ const char *bvh_layout_name(BVHLayout layout) case BVH_LAYOUT_METAL: return "METAL"; case BVH_LAYOUT_MULTI_OPTIX: +case BVH_LAYOUT_MULTI_METAL: case BVH_LAYOUT_MULTI_OPTIX_EMBREE: case BVH_LAYOUT_MULTI_METAL_EMBREE: return "MULTI"; @@ -115,6 +116,7 @@ BVH *BVH::create(const BVHParams &params, break; #endif case BVH_LAYOUT_MULTI_OPTIX: +case BVH_LAYOUT_MULTI_METAL: case BVH_LAYOUT_MULTI_OPTIX_EMBREE: case BVH_LAYOUT_MULTI_METAL_EMBREE: return new BVHMulti(params, geometry, objects); diff --git a/intern/cycles/device/multi/device.cpp b/intern/cycles/device/multi/device.cpp index baab84f4035..5ec3ef1b785 100644 --- a/intern/cycles/device/multi/device.cpp +++ b/intern/cycles/device/multi/device.cpp @@ -124,6 +124,11 @@ class MultiDevice : public Device { return BVH_LAYOUT_MULTI_OPTIX; } +/* With multiple Metal devices, every device needs its own acceleration structure */ +if (bvh_layout_mask == BVH_LAYOUT_METAL) { + return BVH_LAYOUT_MULTI_METAL; +} + 
/* When devices do not share a common BVH layout, fall back to creating one for each */ const BVHLayoutMask BVH_LAYOUT_OPTIX_EMBREE = (BVH_LAYOUT_OPTIX | BVH_LAYOUT_EMBREE); if ((bvh_layout_mask_all & BVH_LAYOUT_OPTIX_EMBREE) == BVH_LAYOUT_OPTIX_EMBREE) { @@ -155,6 +160,7 @@ class MultiDevice : public Device { } assert(bvh->params.bvh_layout == BVH_LAYOUT_MULTI_OPTIX || + bvh->params.bvh_layout == BVH_LAYOUT_MULTI_METAL || bvh->params.bvh_layout == BVH_LAYOUT_MULTI_OPTIX_EMBREE || bvh->params.bvh_layout == BVH_LAYOUT_MULTI_METAL_EMBREE); @@ -179,6 +185,8 @@ class MultiDevice : public Device { BVHParams params = bvh->params; if (bvh->params.bvh_layout == BVH_LAYOUT_MULTI_OPTIX) params.bvh_layout = BVH_LAYOUT_OPTIX; +else if (bvh->params.bvh_layout == BVH_LAYOUT_MULTI_METAL) + params.bvh_layout = BVH_LAYOUT_METAL; else if (bvh->params.bvh_layout == BVH_LAYOUT_MULTI_OPTIX_EMBREE) params.bvh_layout = sub.device->info.type == DEVICE_OPTIX ? BVH_LAYOUT_OPTIX : BVH_LAYOUT_EMBREE; diff --git a/intern/cycles/kernel/types.h b/intern/cycles/kernel/types.h index c39289224ad..34f909a06d9 100644 --- a/intern/cycles/kernel/types.h +++ b/intern/cycles/kernel/types.h @@ -1224,7 +1224,8 @@ typedef enum KernelBVHLayout { BVH_LAYOUT_MULTI_OPTIX = (1 << 3), BVH_LAYOUT_MULTI_OPTIX_EMBREE = (1 << 4), BVH_LAYOUT_METAL = (1 << 5), - BVH_LAYOUT_MULTI_METAL_EMBREE = (1 << 6), + BVH_LAYOUT_MULTI_METAL = (1 << 6), + BVH_LAYOUT_MULTI_METAL_EMBREE = (1 << 7), /* Default BVH layout to use for CPU. 
*/ BVH_LAYOUT_AUTO = BVH_LAYOUT_EMBREE, diff --git a/intern/cycles/scene/geometry.cpp b/intern/cycles/scene/geometry.cpp index 346b030817f..ca330c5544d 100644 --- a/intern/cycles/scene/geometry.cpp +++ b/intern/cycles/scene/geometry.cpp @@ -166,7 +166,7 @@ bool Geometry::need_build_bvh(BVHLayout layout) const { return is_instanced() || layout == BVH_LAYOUT_OPTIX || layout == BVH_LAYOUT_MULTI_OPTIX || layout == BVH_LAYOUT_METAL || layout == BVH_LAYOUT_MULTI_OPTIX_EMBREE || - layout == BVH_LAYOUT_MULTI_METAL_EMBREE; + layout == BVH_LAYOUT_MULTI_METAL || layout == BVH_LAYOUT_MULTI_METAL_EMBREE; } bool Geometry::is_instanced() const diff --git a/intern/cycles/scene/object.cpp b/intern/cycles/scene/object.cpp index bf224a81af5..77c6c6614e3 100644 --- a/intern/cycles/scene/object.cpp +++ b/intern/cycles/scene/object.cpp @@ -533,7 +533,7 @@ void Obje
[Bf-blender-cvs] [e23b54a59f0] master: Cycles: Fix OS version warnings
Commit: e23b54a59f0428399bbf4de5ba007ad764fa80be Author: Michael Jones Date: Wed Dec 8 15:07:11 2021 + Branches: master https://developer.blender.org/rBe23b54a59f0428399bbf4de5ba007ad764fa80be Cycles: Fix OS version warnings This patch suppresses OS version warnings and hides currently unsupported Metal GPUs when enumerating devices. Reviewed By: brecht Differential Revision: https://developer.blender.org/D13506 === M intern/cycles/device/metal/device_impl.mm M intern/cycles/device/metal/kernel.mm M intern/cycles/device/metal/queue.h M intern/cycles/device/metal/queue.mm M intern/cycles/device/metal/util.mm === diff --git a/intern/cycles/device/metal/device_impl.mm b/intern/cycles/device/metal/device_impl.mm index fd249204646..4ad5a3caebc 100644 --- a/intern/cycles/device/metal/device_impl.mm +++ b/intern/cycles/device/metal/device_impl.mm @@ -308,14 +308,12 @@ bool MetalDevice::load_kernels(const uint _kernel_features) MTLCompileOptions *options = [[MTLCompileOptions alloc] init]; options.fastMathEnabled = YES; - options.languageVersion = MTLLanguageVersion2_1; - - if (@available(macOS 11.0, *)) { -options.languageVersion = MTLLanguageVersion2_3; - } if (@available(macOS 12.0, *)) { options.languageVersion = MTLLanguageVersion2_4; } + else { +return false; + } string metalsrc; @@ -925,12 +923,14 @@ void MetalDevice::tex_alloc(device_texture &mem) } } - /* Optimize the texture for GPU access. */ - id commandBuffer = [mtlGeneralCommandQueue commandBuffer]; - id blitCommandEncoder = [commandBuffer blitCommandEncoder]; - [blitCommandEncoder optimizeContentsForGPUAccess:mtlTexture]; - [blitCommandEncoder endEncoding]; - [commandBuffer commit]; + if (@available(macos 10.14, *)) { +/* Optimize the texture for GPU access. 
*/ +id commandBuffer = [mtlGeneralCommandQueue commandBuffer]; +id blitCommandEncoder = [commandBuffer blitCommandEncoder]; +[blitCommandEncoder optimizeContentsForGPUAccess:mtlTexture]; +[blitCommandEncoder endEncoding]; +[commandBuffer commit]; + } /* Set Mapping and tag that we need to (re-)upload to device */ texture_slot_map[slot] = mtlTexture; diff --git a/intern/cycles/device/metal/kernel.mm b/intern/cycles/device/metal/kernel.mm index f5c3adbce25..f948a8a0a0f 100644 --- a/intern/cycles/device/metal/kernel.mm +++ b/intern/cycles/device/metal/kernel.mm @@ -118,7 +118,9 @@ bool MetalDeviceKernel::load(MetalDevice *device, computePipelineStateDescriptor.buffers[1].mutability = MTLMutabilityImmutable; computePipelineStateDescriptor.buffers[2].mutability = MTLMutabilityImmutable; - computePipelineStateDescriptor.maxTotalThreadsPerThreadgroup = desc.threads_per_threadgroup; + if (@available(macos 10.14, *)) { +computePipelineStateDescriptor.maxTotalThreadsPerThreadgroup = desc.threads_per_threadgroup; + } computePipelineStateDescriptor.threadGroupSizeIsMultipleOfThreadExecutionWidth = true; computePipelineStateDescriptor.computeFunction = pso[desc.pso_index].function; diff --git a/intern/cycles/device/metal/queue.h b/intern/cycles/device/metal/queue.h index 7aafcb2efe4..64c8bb79c49 100644 --- a/intern/cycles/device/metal/queue.h +++ b/intern/cycles/device/metal/queue.h @@ -70,7 +70,9 @@ class MetalDeviceQueue : public DeviceQueue { id mtlCommandBuffer = nil; id mtlComputeEncoder = nil; id mtlBlitEncoder = nil; + API_AVAILABLE(macos(10.14), ios(14.0)) id shared_event = nil; + API_AVAILABLE(macos(10.14), ios(14.0)) MTLSharedEventListener *shared_event_listener = nil; dispatch_queue_t event_queue; diff --git a/intern/cycles/device/metal/queue.mm b/intern/cycles/device/metal/queue.mm index ced01e7b9b6..d04df09f49a 100644 --- a/intern/cycles/device/metal/queue.mm +++ b/intern/cycles/device/metal/queue.mm @@ -40,12 +40,14 @@ 
MetalDeviceQueue::MetalDeviceQueue(MetalDevice *device) mtlDevice = device->mtlDevice; mtlCommandQueue = [mtlDevice newCommandQueue]; - shared_event = [mtlDevice newSharedEvent]; - shared_event_id = 1; + if (@available(macos 10.14, *)) { +shared_event = [mtlDevice newSharedEvent]; +shared_event_id = 1; - /* Shareable event listener */ - event_queue = dispatch_queue_create("com.cycles.metal.event_queue", NULL); - shared_event_listener = [[MTLSharedEventListener alloc] initWithDispatchQueue:event_queue]; +/* Shareable event listener */ +event_queue = dispatch_queue_create("com.cycles.metal.event_queue", NULL); +shared_event_listener = [[MTLSharedEventListener alloc] initWithDispatchQueue:event_queue]; + } wait_semaphore = dispatch_semaphore_create(0); } @@ -57,8 +59,10 @@ MetalDeviceQueue::~MetalDeviceQueue() assert(mtlCommandBuffer == nil); assert(command_buffers_submitted =
[Bf-blender-cvs] [1552c92fb1e] master: Cycles: Fix Metal BVH crash caused by missing `WITH_METAL` define
Commit: 1552c92fb1e77f55c44627f46692a627923d9027 Author: Michael Jones Date: Tue Dec 7 21:05:58 2021 + Branches: master https://developer.blender.org/rB1552c92fb1e77f55c44627f46692a627923d9027 Cycles: Fix Metal BVH crash caused by missing `WITH_METAL` define Reviewed By: brecht Differential Revision: https://developer.blender.org/D13505 === M intern/cycles/bvh/CMakeLists.txt === diff --git a/intern/cycles/bvh/CMakeLists.txt b/intern/cycles/bvh/CMakeLists.txt index f275419d13c..b5c80f78f09 100644 --- a/intern/cycles/bvh/CMakeLists.txt +++ b/intern/cycles/bvh/CMakeLists.txt @@ -41,6 +41,7 @@ if(WITH_CYCLES_DEVICE_METAL) list(APPEND SRC ${SRC_METAL} ) + add_definitions(-DWITH_METAL) endif() set(SRC_HEADERS ___ Bf-blender-cvs mailing list Bf-blender-cvs@blender.org List details, subscription details or unsubscribe: https://lists.blender.org/mailman/listinfo/bf-blender-cvs
[Bf-blender-cvs] [9558fa51960] master: Cycles: Metal host-side code
Commit: 9558fa5196033390111a2348caa66ab18b8a4f89 Author: Michael Jones Date: Tue Dec 7 15:11:35 2021 + Branches: master https://developer.blender.org/rB9558fa5196033390111a2348caa66ab18b8a4f89 Cycles: Metal host-side code This patch adds the Metal host-side code: - Add all core host-side Metal backend files (device_impl, queue, etc) - Add MetalRT BVH setup files - Integrate with Cycles device enumeration code - Revive `path_source_replace_includes` in util/path (required for MSL compilation) This patch also includes a couple of small kernel-side fixes: - Add an implementation of `lgammaf` for Metal [Nemes, Gergő (2010), "New asymptotic expansion for the Gamma function", Archiv der Mathematik](https://users.renyi.hu/~gergonemes/) - include "work_stealing.h" inside the Metal context class because it accesses state now Ref T92212 Reviewed By: brecht Maniphest Tasks: T92212 Differential Revision: https://developer.blender.org/D13423 === M intern/cycles/blender/CMakeLists.txt M intern/cycles/blender/addon/engine.py M intern/cycles/blender/addon/properties.py M intern/cycles/blender/addon/ui.py M intern/cycles/blender/device.cpp M intern/cycles/blender/python.cpp M intern/cycles/bvh/CMakeLists.txt M intern/cycles/bvh/bvh.cpp A intern/cycles/bvh/metal.h A intern/cycles/bvh/metal.mm M intern/cycles/cmake/external_libs.cmake M intern/cycles/device/CMakeLists.txt M intern/cycles/device/device.cpp M intern/cycles/device/device.h M intern/cycles/device/memory.h A intern/cycles/device/metal/bvh.h A intern/cycles/device/metal/bvh.mm A intern/cycles/device/metal/device.h A intern/cycles/device/metal/device.mm A intern/cycles/device/metal/device_impl.h A intern/cycles/device/metal/device_impl.mm A intern/cycles/device/metal/kernel.h A intern/cycles/device/metal/kernel.mm A intern/cycles/device/metal/queue.h A intern/cycles/device/metal/queue.mm A intern/cycles/device/metal/util.h A intern/cycles/device/metal/util.mm M intern/cycles/device/multi/device.cpp M 
intern/cycles/kernel/device/gpu/kernel.h M intern/cycles/kernel/device/metal/compat.h M intern/cycles/kernel/device/metal/kernel.metal M intern/cycles/util/math.h M intern/cycles/util/path.cpp M intern/cycles/util/path.h === diff --git a/intern/cycles/blender/CMakeLists.txt b/intern/cycles/blender/CMakeLists.txt index f0540486656..b4a4d487355 100644 --- a/intern/cycles/blender/CMakeLists.txt +++ b/intern/cycles/blender/CMakeLists.txt @@ -101,6 +101,11 @@ add_definitions(${GL_DEFINITIONS}) if(WITH_CYCLES_DEVICE_HIP) add_definitions(-DWITH_HIP) endif() + +if(WITH_CYCLES_DEVICE_METAL) + add_definitions(-DWITH_METAL) +endif() + if(WITH_MOD_FLUID) add_definitions(-DWITH_FLUID) endif() diff --git a/intern/cycles/blender/addon/engine.py b/intern/cycles/blender/addon/engine.py index e5bb77a834a..910ac4a373e 100644 --- a/intern/cycles/blender/addon/engine.py +++ b/intern/cycles/blender/addon/engine.py @@ -28,7 +28,7 @@ def _configure_argument_parser(): action='store_true') parser.add_argument("--cycles-device", help="Set the device to use for Cycles, overriding user preferences and the scene setting." - "Valid options are 'CPU', 'CUDA', 'OPTIX', or 'HIP'" + "Valid options are 'CPU', 'CUDA', 'OPTIX', 'HIP' or 'METAL'." 
"Additionally, you can append '+CPU' to any GPU type for hybrid rendering.", default=None) return parser diff --git a/intern/cycles/blender/addon/properties.py b/intern/cycles/blender/addon/properties.py index 0de936ddb11..8569cb7d946 100644 --- a/intern/cycles/blender/addon/properties.py +++ b/intern/cycles/blender/addon/properties.py @@ -111,7 +111,8 @@ enum_device_type = ( ('CPU', "CPU", "CPU", 0), ('CUDA', "CUDA", "CUDA", 1), ('OPTIX', "OptiX", "OptiX", 3), -("HIP", "HIP", "HIP", 4) +('HIP', "HIP", "HIP", 4), +('METAL', "Metal", "Metal", 5) ) enum_texture_limit = ( @@ -1312,8 +1313,7 @@ class CyclesPreferences(bpy.types.AddonPreferences): def get_device_types(self, context): import _cycles -has_cuda, has_optix, has_hip = _cycles.get_device_types() - +has_cuda, has_optix, has_hip, has_metal = _cycles.get
[Bf-blender-cvs] [f613c4c0953] master: Cycles: MetalRT support (kernel side)
Commit: f613c4c0953ebaf993ecd55b12bab9cf2196dac4 Author: Michael Jones Date: Mon Nov 29 15:06:22 2021 + Branches: master https://developer.blender.org/rBf613c4c0953ebaf993ecd55b12bab9cf2196dac4 Cycles: MetalRT support (kernel side) This patch adds MetalRT support to Cycles kernel code. It is mostly additive in nature or confined to Metal-specific code, however there are a few areas where this interacts with other code: - MetalRT closely follows the Optix implementation, and in some cases (notably handling of transforms) it makes sense to extend Optix special-casing to MetalRT. For these generalisations we now have `__KERNEL_GPU_RAYTRACING__` instead of `__KERNEL_OPTIX__`. - MetalRT doesn't support primitive offsetting (as with `primitiveIndexOffset` in Optix), so we define and populate a new kernel texture, `__object_prim_offset`, containing per-object primitive / curve-segment offsets. This is referenced and applied in MetalRT intersection handlers. - Two new BVH layout enum values have been added: `BVH_LAYOUT_METAL` and `BVH_LAYOUT_MULTI_METAL_EMBREE` (for XPU mode). Some host-side enum case handling has been updated where it is trivial to do so. 
Ref T92212 Reviewed By: brecht Maniphest Tasks: T92212 Differential Revision: https://developer.blender.org/D13353 === M intern/cycles/bvh/bvh.cpp M intern/cycles/device/cpu/device_impl.cpp M intern/cycles/device/multi/device.cpp M intern/cycles/kernel/CMakeLists.txt M intern/cycles/kernel/bvh/bvh.h A intern/cycles/kernel/bvh/metal.h M intern/cycles/kernel/device/gpu/kernel.h M intern/cycles/kernel/device/metal/compat.h M intern/cycles/kernel/device/metal/context_begin.h M intern/cycles/kernel/device/metal/kernel.metal M intern/cycles/kernel/device/optix/compat.h M intern/cycles/kernel/geom/motion_triangle_intersect.h M intern/cycles/kernel/geom/triangle_intersect.h M intern/cycles/kernel/integrator/subsurface_disk.h M intern/cycles/kernel/integrator/subsurface_random_walk.h M intern/cycles/kernel/textures.h M intern/cycles/kernel/types.h M intern/cycles/scene/geometry.cpp M intern/cycles/scene/object.cpp M intern/cycles/scene/object.h M intern/cycles/scene/scene.cpp M intern/cycles/scene/scene.h M intern/cycles/util/math_float3.h M intern/cycles/util/transform.h === diff --git a/intern/cycles/bvh/bvh.cpp b/intern/cycles/bvh/bvh.cpp index ae6655eb27b..d3c8e4db6d0 100644 --- a/intern/cycles/bvh/bvh.cpp +++ b/intern/cycles/bvh/bvh.cpp @@ -40,8 +40,11 @@ const char *bvh_layout_name(BVHLayout layout) return "EMBREE"; case BVH_LAYOUT_OPTIX: return "OPTIX"; +case BVH_LAYOUT_METAL: + return "METAL"; case BVH_LAYOUT_MULTI_OPTIX: case BVH_LAYOUT_MULTI_OPTIX_EMBREE: +case BVH_LAYOUT_MULTI_METAL_EMBREE: return "MULTI"; case BVH_LAYOUT_ALL: return "ALL"; @@ -105,7 +108,10 @@ BVH *BVH::create(const BVHParams &params, #endif case BVH_LAYOUT_MULTI_OPTIX: case BVH_LAYOUT_MULTI_OPTIX_EMBREE: +case BVH_LAYOUT_MULTI_METAL_EMBREE: return new BVHMulti(params, geometry, objects); +case BVH_LAYOUT_METAL: + /* host-side changes for BVH_LAYOUT_METAL are imminent */ case BVH_LAYOUT_NONE: case BVH_LAYOUT_ALL: break; diff --git a/intern/cycles/device/cpu/device_impl.cpp 
b/intern/cycles/device/cpu/device_impl.cpp index 2ad76de70ca..62b9cc93dae 100644 --- a/intern/cycles/device/cpu/device_impl.cpp +++ b/intern/cycles/device/cpu/device_impl.cpp @@ -274,7 +274,8 @@ void CPUDevice::build_bvh(BVH *bvh, Progress &progress, bool refit) { #ifdef WITH_EMBREE if (bvh->params.bvh_layout == BVH_LAYOUT_EMBREE || - bvh->params.bvh_layout == BVH_LAYOUT_MULTI_OPTIX_EMBREE) { + bvh->params.bvh_layout == BVH_LAYOUT_MULTI_OPTIX_EMBREE || + bvh->params.bvh_layout == BVH_LAYOUT_MULTI_METAL_EMBREE) { BVHEmbree *const bvh_embree = static_cast(bvh); if (refit) { bvh_embree->refit(progress); diff --git a/intern/cycles/device/multi/device.cpp b/intern/cycles/device/multi/device.cpp index e319246d4f4..2513df63489 100644 --- a/intern/cycles/device/multi/device.cpp +++ b/intern/cycles/device/multi/device.cpp @@ -129,6 +129,10 @@ class MultiDevice : public Device { if ((bvh_layout_mask_all & BVH_LAYOUT_OPTIX_EMBREE) == BVH_LAYOUT_OPTIX_EMBREE) { return BVH_LAYOUT_MULTI_OPTIX_EMBREE; } +const BVHLayoutMask BVH_LAYOUT_METAL_EMBREE = (BVH_LAYOUT_METAL | BVH_LAYOUT_EMBREE); +if ((bvh_layout_mask_all & BVH_LAYOUT_METAL_EMBREE) == BVH_LAYOUT_METAL_EMBREE) { + return BVH_LAYOUT_MULTI_METAL_EMBREE; +} return bvh_layout_mask; } @@ -151,7 +155,8
[Bf-blender-cvs] [98a5c924fca] master: Cycles: Metal readiness: Specify DeviceQueue::enqueue arg types
Commit: 98a5c924fca00b4b39e75a4fc16585cfa040398c Author: Michael Jones Date: Mon Nov 29 14:49:53 2021 + Branches: master https://developer.blender.org/rB98a5c924fca00b4b39e75a4fc16585cfa040398c Cycles: Metal readiness: Specify DeviceQueue::enqueue arg types This patch adds new arg-type parameters to `DeviceQueue::enqueue` and its overrides. This is in preparation for the Metal backend which needs this information for correct argument encoding. Ref T92212 Reviewed By: brecht Maniphest Tasks: T92212 Differential Revision: https://developer.blender.org/D13357 === M intern/cycles/device/cuda/device_impl.cpp M intern/cycles/device/cuda/queue.cpp M intern/cycles/device/cuda/queue.h M intern/cycles/device/hip/device_impl.cpp M intern/cycles/device/hip/queue.cpp M intern/cycles/device/hip/queue.h M intern/cycles/device/optix/device_impl.cpp M intern/cycles/device/optix/queue.cpp M intern/cycles/device/optix/queue.h M intern/cycles/device/queue.h M intern/cycles/integrator/pass_accessor_gpu.cpp M intern/cycles/integrator/path_trace_work_gpu.cpp M intern/cycles/integrator/shader_eval.cpp === diff --git a/intern/cycles/device/cuda/device_impl.cpp b/intern/cycles/device/cuda/device_impl.cpp index e05fef3897c..ee55e6dc632 100644 --- a/intern/cycles/device/cuda/device_impl.cpp +++ b/intern/cycles/device/cuda/device_impl.cpp @@ -477,10 +477,10 @@ void CUDADevice::reserve_local_memory(const uint kernel_features) * still to make it faster. 
*/ CUDADeviceQueue queue(this); -void *d_path_index = nullptr; -void *d_render_buffer = nullptr; +device_ptr d_path_index = 0; +device_ptr d_render_buffer = 0; int d_work_size = 0; -void *args[] = {&d_path_index, &d_render_buffer, &d_work_size}; +DeviceKernelArguments args(&d_path_index, &d_render_buffer, &d_work_size); queue.init_execution(); queue.enqueue(test_kernel, 1, args); diff --git a/intern/cycles/device/cuda/queue.cpp b/intern/cycles/device/cuda/queue.cpp index 09352a84181..880d7ca4cf2 100644 --- a/intern/cycles/device/cuda/queue.cpp +++ b/intern/cycles/device/cuda/queue.cpp @@ -89,7 +89,9 @@ bool CUDADeviceQueue::kernel_available(DeviceKernel kernel) const return cuda_device_->kernels.available(kernel); } -bool CUDADeviceQueue::enqueue(DeviceKernel kernel, const int work_size, void *args[]) +bool CUDADeviceQueue::enqueue(DeviceKernel kernel, + const int work_size, + DeviceKernelArguments const &args) { if (cuda_device_->have_error()) { return false; @@ -133,7 +135,7 @@ bool CUDADeviceQueue::enqueue(DeviceKernel kernel, const int work_size, void *ar 1, shared_mem_bytes, cuda_stream_, -args, +const_cast(args.values), 0), "enqueue"); diff --git a/intern/cycles/device/cuda/queue.h b/intern/cycles/device/cuda/queue.h index 28613cda071..0836af12098 100644 --- a/intern/cycles/device/cuda/queue.h +++ b/intern/cycles/device/cuda/queue.h @@ -42,7 +42,9 @@ class CUDADeviceQueue : public DeviceQueue { virtual bool kernel_available(DeviceKernel kernel) const override; - virtual bool enqueue(DeviceKernel kernel, const int work_size, void *args[]) override; + virtual bool enqueue(DeviceKernel kernel, + const int work_size, + DeviceKernelArguments const &args) override; virtual bool synchronize() override; diff --git a/intern/cycles/device/hip/device_impl.cpp b/intern/cycles/device/hip/device_impl.cpp index 53c4f3f0b3f..4f1cbabc89b 100644 --- a/intern/cycles/device/hip/device_impl.cpp +++ b/intern/cycles/device/hip/device_impl.cpp @@ -440,10 +440,10 @@ void 
HIPDevice::reserve_local_memory(const uint kernel_features) * still to make it faster. */ HIPDeviceQueue queue(this); -void *d_path_index = nullptr; -void *d_render_buffer = nullptr; +device_ptr d_path_index = 0; +device_ptr d_render_buffer = 0; int d_work_size = 0; -void *args[] = {&d_path_index, &d_render_buffer, &d_work_size}; +DeviceKernelArguments args(&d_path_index, &d_render_buffer, &d_work_size); queue.init_execution(); queue.enqueue(test_kernel, 1, args); diff --git a/intern/cycles/device/hip/queue.cpp b/intern/cycles/device/hip/queue.cpp index 0f053ccbeb5..42841324ed6 100644 --- a/intern/cycles/device/hip/queue.cpp +++ b/intern/cycles/device/hip/queue.cpp @@ -89,7 +89,9 @@ bool HIPDeviceQueue::kernel_available(DeviceKernel kernel) const return hip_device_->kernels.avail
[Bf-blender-cvs] [eb7827e7970] master: Cycles: Fix film convert address space mismatch on Metal
Commit: eb7827e7970cca8e3fb0e0bf39e8742e69f0b2b6 Author: Michael Jones Date: Wed Nov 24 20:34:27 2021 + Branches: master https://developer.blender.org/rBeb7827e7970cca8e3fb0e0bf39e8742e69f0b2b6 Cycles: Fix film convert address space mismatch on Metal This patch fixes an address space mismatch in the film convert kernels on Metal. The `film_get_pass_pixel_...` functions take a `ccl_private` result pointer, but the film convert kernels pass a `ccl_global` memory pointer. Specialising the pass-fetch functions with templates results in compilation errors on Visual Studio, so instead this patch just adds an intermediate local on Metal. Reviewed By: brecht Differential Revision: https://developer.blender.org/D13350 === M intern/cycles/kernel/device/gpu/kernel.h === diff --git a/intern/cycles/kernel/device/gpu/kernel.h b/intern/cycles/kernel/device/gpu/kernel.h index 22e2a61a06d..24702de496c 100644 --- a/intern/cycles/kernel/device/gpu/kernel.h +++ b/intern/cycles/kernel/device/gpu/kernel.h @@ -547,6 +547,33 @@ ccl_device_inline void kernel_gpu_film_convert_half_write(ccl_global uchar4 *rgb #endif } +#ifdef __KERNEL_METAL__ + +/* Fetch into a local variable on Metal - there is minimal overhead. Templating the + * film_get_pass_pixel_... functions works on MSL, but not on other compilers. 
*/ +# define FILM_GET_PASS_PIXEL_F32(variant, input_channel_count) \ +float local_pixel[4]; \ +film_get_pass_pixel_##variant(&kfilm_convert, buffer, local_pixel); \ +if (input_channel_count >= 1) { \ + pixel[0] = local_pixel[0]; \ +} \ +if (input_channel_count >= 2) { \ + pixel[1] = local_pixel[1]; \ +} \ +if (input_channel_count >= 3) { \ + pixel[2] = local_pixel[2]; \ +} \ +if (input_channel_count >= 4) { \ + pixel[3] = local_pixel[3]; \ +} + +#else + +# define FILM_GET_PASS_PIXEL_F32(variant, input_channel_count) \ +film_get_pass_pixel_##variant(&kfilm_convert, buffer, pixel); + +#endif + #define KERNEL_FILM_CONVERT_VARIANT(variant, input_channel_count) \ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS) \ ccl_gpu_kernel_signature(film_convert_##variant, \ @@ -574,7 +601,7 @@ ccl_device_inline void kernel_gpu_film_convert_half_write(ccl_global uchar4 *rgb ccl_global float *pixel = pixels + \ (render_pixel_index + rgba_offset) * kfilm_convert.pixel_stride; \ \ -film_get_pass_pixel_##variant(&kfilm_convert, buffer, pixel); \ +FILM_GET_PASS_PIXEL_F32(variant, input_channel_count); \ } \ \ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS) \ ___ Bf-blender-cvs mailing list Bf-blender-cvs@blender.org List details, subscription details or unsubscribe: https://lists.blender.org/mailman/listinfo/bf-blender-cvs
[Bf-blender-cvs] [d19e35873f6] master: Cycles: several small fixes and additions for MSL
Commit: d19e35873f67c90b251ca38e007a83aa1eada211 Author: Michael Jones Date: Thu Nov 18 14:25:05 2021 +0100 Branches: master https://developer.blender.org/rBd19e35873f67c90b251ca38e007a83aa1eada211 Cycles: several small fixes and additions for MSL This patch contains many small leftover fixes and additions that are required for Metal-enablement: - Address space fixes and a few other small compile fixes - Addition of missing functionality to the Metal adapter headers - Addition of various scattered `__KERNEL_METAL__` blocks (e.g. for atomic support & maths functions) Ref T92212 Differential Revision: https://developer.blender.org/D13263 === M intern/cycles/kernel/bvh/util.h M intern/cycles/kernel/device/cuda/compat.h M intern/cycles/kernel/device/gpu/kernel.h M intern/cycles/kernel/device/gpu/parallel_active_index.h M intern/cycles/kernel/device/hip/compat.h M intern/cycles/kernel/device/metal/compat.h M intern/cycles/kernel/device/metal/globals.h M intern/cycles/kernel/device/optix/compat.h M intern/cycles/kernel/film/accumulate.h M intern/cycles/kernel/geom/attribute.h M intern/cycles/kernel/geom/subd_triangle.h M intern/cycles/kernel/sample/lcg.h M intern/cycles/kernel/sample/pattern.h M intern/cycles/kernel/svm/svm.h M intern/cycles/util/atomic.h M intern/cycles/util/debug.cpp M intern/cycles/util/debug.h M intern/cycles/util/half.h M intern/cycles/util/math.h M intern/cycles/util/math_float2.h M intern/cycles/util/math_float3.h M intern/cycles/util/math_float4.h M intern/cycles/util/math_int2.h M intern/cycles/util/math_int3.h M intern/cycles/util/math_matrix.h M intern/cycles/util/path.cpp M intern/cycles/util/transform.h M intern/cycles/util/types.h === diff --git a/intern/cycles/kernel/bvh/util.h b/intern/cycles/kernel/bvh/util.h index 8686f887021..26ba136dd79 100644 --- a/intern/cycles/kernel/bvh/util.h +++ b/intern/cycles/kernel/bvh/util.h @@ -97,7 +97,7 @@ ccl_device_inline void sort_intersections_and_normals(ccl_private Intersection * swapped = false; 
for (int j = 0; j < num_hits - 1; ++j) { if (hits[j].t > hits[j + 1].t) { -struct Intersection tmp_hit = hits[j]; +Intersection tmp_hit = hits[j]; float3 tmp_Ng = Ng[j]; hits[j] = hits[j + 1]; Ng[j] = Ng[j + 1]; diff --git a/intern/cycles/kernel/device/cuda/compat.h b/intern/cycles/kernel/device/cuda/compat.h index ba3aefa43bf..7f901510329 100644 --- a/intern/cycles/kernel/device/cuda/compat.h +++ b/intern/cycles/kernel/device/cuda/compat.h @@ -86,7 +86,6 @@ typedef unsigned long long uint64_t; #define ccl_gpu_syncthreads() __syncthreads() #define ccl_gpu_ballot(predicate) __ballot_sync(0x, predicate) #define ccl_gpu_shfl_down_sync(mask, var, detla) __shfl_down_sync(mask, var, detla) -#define ccl_gpu_popc(x) __popc(x) /* GPU texture objects */ diff --git a/intern/cycles/kernel/device/gpu/kernel.h b/intern/cycles/kernel/device/gpu/kernel.h index dd0c6dd6893..60332af752c 100644 --- a/intern/cycles/kernel/device/gpu/kernel.h +++ b/intern/cycles/kernel/device/gpu/kernel.h @@ -464,7 +464,7 @@ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS) const auto num_active_pixels_mask = ccl_gpu_ballot(!converged); const int lane_id = ccl_gpu_thread_idx_x % ccl_gpu_warp_size; if (lane_id == 0) { -atomic_fetch_and_add_uint32(num_active_pixels, ccl_gpu_popc(num_active_pixels_mask)); +atomic_fetch_and_add_uint32(num_active_pixels, popcount(num_active_pixels_mask)); } } @@ -892,6 +892,6 @@ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS) const auto can_split_mask = ccl_gpu_ballot(can_split); const int lane_id = ccl_gpu_thread_idx_x % ccl_gpu_warp_size; if (lane_id == 0) { -atomic_fetch_and_add_uint32(num_possible_splits, ccl_gpu_popc(can_split_mask)); +atomic_fetch_and_add_uint32(num_possible_splits, popcount(can_split_mask)); } } diff --git a/intern/cycles/kernel/device/gpu/parallel_active_index.h b/intern/cycles/kernel/device/gpu/parallel_active_index.h index f667ede2712..a5320edcb3c 100644 --- 
a/intern/cycles/kernel/device/gpu/parallel_active_index.h +++ b/intern/cycles/kernel/device/gpu/parallel_active_index.h @@ -85,8 +85,8 @@ __device__ void gpu_parallel_active_index_array(const uint num_states, const uint is_active = (state_index < num_states) ? is_active_op(state_index) : 0; /* For each thread within a warp compute how many other active states precede it. */ -const uint thread_offset = ccl_gpu_popc(ccl_gpu_ballot(is_active) & -ccl_gpu_thread_mask(thread_warp)); +const uint thread_offset = popcou
[Bf-blender-cvs] [d1f944c1863] master: Cycles: declare constants at program scope on Metal
Commit: d1f944c18634f215c3da0484ac3b80e994118680 Author: Michael Jones Date: Thu Nov 18 14:25:30 2021 +0100 Branches: master https://developer.blender.org/rBd1f944c18634f215c3da0484ac3b80e994118680 Cycles: declare constants at program scope on Metal MSL requires that constant address space literals be declared at program scope. This patch moves the `blackbody_table_r/g/b` and `cie_colour_match` constants into separate files so they can be declared at the appropriate scope. Ref T92212 Differential Revision: https://developer.blender.org/D13241 === M intern/cycles/kernel/CMakeLists.txt M intern/cycles/kernel/device/cpu/globals.h M intern/cycles/kernel/device/cuda/compat.h M intern/cycles/kernel/device/gpu/kernel.h M intern/cycles/kernel/device/hip/compat.h M intern/cycles/kernel/device/metal/compat.h M intern/cycles/kernel/device/optix/compat.h M intern/cycles/kernel/device/optix/kernel.cu M intern/cycles/kernel/svm/math_util.h M intern/cycles/kernel/svm/wavelength.h A intern/cycles/kernel/tables.h M intern/cycles/scene/shader_nodes.cpp M intern/cycles/util/defines.h === diff --git a/intern/cycles/kernel/CMakeLists.txt b/intern/cycles/kernel/CMakeLists.txt index 36335d4c377..0b650b70961 100644 --- a/intern/cycles/kernel/CMakeLists.txt +++ b/intern/cycles/kernel/CMakeLists.txt @@ -273,6 +273,7 @@ set(SRC_KERNEL_UTIL_HEADERS ) set(SRC_KERNEL_TYPES_HEADERS + tables.h textures.h types.h ) diff --git a/intern/cycles/kernel/device/cpu/globals.h b/intern/cycles/kernel/device/cpu/globals.h index dd0327b3f94..746e48b9880 100644 --- a/intern/cycles/kernel/device/cpu/globals.h +++ b/intern/cycles/kernel/device/cpu/globals.h @@ -18,6 +18,7 @@ #pragma once +#include "kernel/tables.h" #include "kernel/types.h" #include "kernel/util/profiling.h" diff --git a/intern/cycles/kernel/device/cuda/compat.h b/intern/cycles/kernel/device/cuda/compat.h index 7f901510329..658dec102b1 100644 --- a/intern/cycles/kernel/device/cuda/compat.h +++ b/intern/cycles/kernel/device/cuda/compat.h @@ 
-54,7 +54,7 @@ typedef unsigned long long uint64_t; #define ccl_device_noinline_cpu ccl_device #define ccl_device_inline_method ccl_device #define ccl_global -#define ccl_static_constant __constant__ +#define ccl_inline_constant __constant__ #define ccl_device_constant __constant__ __device__ #define ccl_constant const #define ccl_gpu_shared __shared__ diff --git a/intern/cycles/kernel/device/gpu/kernel.h b/intern/cycles/kernel/device/gpu/kernel.h index 60332af752c..22e2a61a06d 100644 --- a/intern/cycles/kernel/device/gpu/kernel.h +++ b/intern/cycles/kernel/device/gpu/kernel.h @@ -21,6 +21,9 @@ #include "kernel/device/gpu/parallel_sorted_index.h" #include "kernel/device/gpu/work_stealing.h" +/* Include constant tables before entering Metal's context class scope (context_begin.h) */ +#include "kernel/tables.h" + #ifdef __KERNEL_METAL__ # include "kernel/device/metal/context_begin.h" #endif diff --git a/intern/cycles/kernel/device/hip/compat.h b/intern/cycles/kernel/device/hip/compat.h index 39bf2131c22..fff7a09e884 100644 --- a/intern/cycles/kernel/device/hip/compat.h +++ b/intern/cycles/kernel/device/hip/compat.h @@ -47,7 +47,7 @@ typedef unsigned long long uint64_t; #define ccl_device_noinline_cpu ccl_device #define ccl_device_inline_method ccl_device #define ccl_global -#define ccl_static_constant __constant__ +#define ccl_inline_constant __constant__ #define ccl_device_constant __constant__ __device__ #define ccl_constant const #define ccl_gpu_shared __shared__ diff --git a/intern/cycles/kernel/device/metal/compat.h b/intern/cycles/kernel/device/metal/compat.h index 080109e3b83..61597a4acfc 100644 --- a/intern/cycles/kernel/device/metal/compat.h +++ b/intern/cycles/kernel/device/metal/compat.h @@ -45,7 +45,7 @@ using namespace metal; #define ccl_device_noinline_cpu ccl_device #define ccl_device_inline_method ccl_device #define ccl_global device -#define ccl_static_constant static constant constexpr +#define ccl_inline_constant static constant constexpr #define 
ccl_device_constant constant #define ccl_constant const device #define ccl_gpu_shared threadgroup diff --git a/intern/cycles/kernel/device/optix/compat.h b/intern/cycles/kernel/device/optix/compat.h index bebb1e458eb..0619c135c39 100644 --- a/intern/cycles/kernel/device/optix/compat.h +++ b/intern/cycles/kernel/device/optix/compat.h @@ -53,7 +53,7 @@ typedef unsigned long long uint64_t; #define ccl_device_noinline __device__ __noinline__ #define ccl_device_noinline_cpu ccl_device #define ccl_global -#define ccl_static_constant __constant__ +#define ccl_inline_constant __constant__ #define ccl_device_constant __constant__ __device__ #define ccl_c
[Bf-blender-cvs] [64003fa4b0b] master: Cycles: Adapt volumetric lambda functions to work on MSL
Commit: 64003fa4b0b168a5b048d980eb775d547d8d Author: Michael Jones Date: Tue Nov 16 13:41:29 2021 + Branches: master https://developer.blender.org/rB64003fa4b0b168a5b048d980eb775d547d8d Cycles: Adapt volumetric lambda functions to work on MSL This patch adapts the existing volumetric read/write lambda functions for Metal. Lambda expressions are not supported on MSL, so two new macros `VOLUME_READ_LAMBDA` and `VOLUME_WRITE_LAMBDA` have been defined with a default implementation which, on Metal, is overridden to use inline function objects. This patch also removes the last remaining mention of the now-unused `ccl_addr_space`. Ref T92212 Reviewed By: leesonw Maniphest Tasks: T92212 Differential Revision: https://developer.blender.org/D13234 === M intern/cycles/kernel/device/metal/compat.h M intern/cycles/kernel/integrator/shade_shadow.h M intern/cycles/kernel/integrator/shade_volume.h M intern/cycles/kernel/integrator/volume_stack.h M intern/cycles/kernel/types.h === diff --git a/intern/cycles/kernel/device/metal/compat.h b/intern/cycles/kernel/device/metal/compat.h index a839917a907..4a2c39d90fd 100644 --- a/intern/cycles/kernel/device/metal/compat.h +++ b/intern/cycles/kernel/device/metal/compat.h @@ -150,6 +150,31 @@ void kernel_gpu_##name::run(thread MetalKernelContext& context, \ // clang-format on +/* volumetric lambda functions - use function objects for lambda-like functionality */ +#define VOLUME_READ_LAMBDA(function_call) \ + struct FnObjectRead { \ +KernelGlobals kg; \ +ccl_private MetalKernelContext *context; \ +int state; \ +\ +VolumeStack operator()(const int i) const \ +{ \ + return context->function_call; \ +} \ + } volume_read_lambda_pass{kg, this, state}; + +#define VOLUME_WRITE_LAMBDA(function_call) \ + struct FnObjectWrite { \ +KernelGlobals kg; \ +ccl_private MetalKernelContext *context; \ +int state; \ +\ +void operator()(const int i, VolumeStack entry) const \ +{ \ + context->function_call; \ +} \ + } volume_write_lambda_pass{kg, this, state}; + 
/* make_type definitions with Metal style element initializers */ #ifdef make_float2 # undef make_float2 diff --git a/intern/cycles/kernel/integrator/shade_shadow.h b/intern/cycles/kernel/integrator/shade_shadow.h index 1de890aae29..a68fcaa7a64 100644 --- a/intern/cycles/kernel/integrator/shade_shadow.h +++ b/intern/cycles/kernel/integrator/shade_shadow.h @@ -95,8 +95,8 @@ ccl_device_inline void integrate_transparent_volume_shadow(KernelGlobals kg, shader_setup_from_volume(kg, shadow_sd, &ray); - const float step_size = volume_stack_step_size( - kg, [=](const int i) { return integrator_state_read_shadow_volume_stack(state, i); }); + VOLUME_READ_LAMBDA(integrator_state_read_shadow_volume_stack(state, i)); + const float step_size = volume_stack_step_size(kg, volume_read_lambda_pass); volume_shadow_heterogeneous(kg, state, &ray, shadow_sd, throughput, step_size); } diff --git a/intern/cycles/kernel/integrator/shade_volume.h b/intern/cycles/kernel/integrator/shade_volume.h index f42614cc87f..c5a80eb336f 100644 --- a/intern/cycles/kernel/integrator/shade_volume.h +++ b/intern/cycles/kernel/integrator/shade_volume.h @@ -78,9 +78,8 @@ ccl_device_inline bool shadow_volume_shader_sample(KernelGlobals kg, ccl_private ShaderData *ccl_restrict sd, ccl_private float3 *ccl_restrict extinction) { - shader_eval_volume(kg, state, sd, PATH_RAY_SHADOW, [=](const int i) { -return integrator_state_read_shadow_volume_stack(state, i); - }); + VOLUME_READ_LAMBDA(integrator_state_read_shadow_volume_stack(state, i)) + shader_eval_volume(kg, state, sd, PATH_RAY_SHADOW, volume_read_lambda_pass); if (!(sd->flag & SD_EXTINCTION)) { return false; @@ -98,9 +97,8 @@ ccl_device_inline bool volume_shader_sample(KernelGlobals kg, ccl_private VolumeShaderCoefficients *coeff) { const uint32_t path_flag = INTEGRATOR_STATE(state, path, flag); - shader_eval_volume(kg, state, sd, path_flag, [=](const int i) { -return integrator_state_read_volume_stack(state, i); - }); + 
VOLUME_READ_LAMBDA(integrator_state_read_volume_stack(state, i)) + shader_eval_volume(kg, state, sd, path_flag, volume_read_lambda_pass); if (!(sd->flag & (SD_EXTINCTION | SD_SCATTER | SD_EMISSION))) { return false; @@ -921,8 +919,8 @@ ccl_device VolumeIntegrateEvent volume_integrate(KernelGlobals kg, VOLUME_SAMPLE_DISTANCE; /* Step through volume. */ - const float step_size = volume_stack_step_size( - kg, [=](const int i) { return integrator_s
[Bf-blender-cvs] [3a4c8f406a3] master: Cycles: Adapt shared kernel/device/gpu layer for MSL
Commit: 3a4c8f406a3a3bf0627477c6183a594fa707a6e2 Author: Michael Jones Date: Tue Nov 9 21:30:46 2021 + Branches: master https://developer.blender.org/rB3a4c8f406a3a3bf0627477c6183a594fa707a6e2 Cycles: Adapt shared kernel/device/gpu layer for MSL This patch adapts the shared kernel entrypoints so that they can be compiled as MSL (Metal Shading Language). Where possible, the adaptations avoid changes in common code. In MSL, kernel function inputs are explicitly bound to resources. In the case of argument buffers, we declare a struct containing the kernel arguments, accessible via device pointer. This differs from CUDA and HIP where kernel function arguments are declared as traditional C-style function parameters. This patch adapts the entrypoints declared in kernel.h so that they can be translated via a new `ccl_gpu_kernel_signature` macro into the required parameter struct + kernel entrypoint pairin [...] MSL buffer attribution must be applied to function parameters or non-static class data members. To allow universal access to the integrator state, kernel data, and texture fetch adapters, we wrap all of the shared kernel code in a `MetalKernelContext` class. This is achieved by bracketing the appropriate kernel headers with "context_begin.h" and "context_end.h" on Metal. When calling deeper into the kernel code, we must reference the context class (e.g. `context.integrator_init_from_camer [...] Lambda expressions are not supported on MSL, so a new `ccl_gpu_kernel_lambda` macro generates an inline function object and optionally capturing any required state. This yields the same behaviour. This approach is applied to all parallel_... implementations which are templated by operation. The lambda expressions in the film_convert... kernels don't adapt cleanly to use function objects. However, these entrypoints can be macro-generated more concisely to avoid lambda expressions entirely, [...] 
A separate implementation of `gpu_parallel_active_index_array` is provided for Metal to work around some subtle differences in SIMD width, and also to encapsulate some required thread parameters which must be declared as explicit entrypoint function parameters. Ref T92212 Reviewed By: brecht Maniphest Tasks: T92212 Differential Revision: https://developer.blender.org/D13109 === M intern/cycles/kernel/CMakeLists.txt M intern/cycles/kernel/device/cuda/compat.h M intern/cycles/kernel/device/cuda/config.h M intern/cycles/kernel/device/gpu/image.h M intern/cycles/kernel/device/gpu/kernel.h M intern/cycles/kernel/device/gpu/parallel_active_index.h M intern/cycles/kernel/device/gpu/parallel_prefix_sum.h M intern/cycles/kernel/device/gpu/parallel_sorted_index.h M intern/cycles/kernel/device/hip/compat.h M intern/cycles/kernel/device/hip/config.h M intern/cycles/kernel/device/metal/compat.h A intern/cycles/kernel/device/metal/context_begin.h A intern/cycles/kernel/device/metal/context_end.h A intern/cycles/kernel/device/metal/globals.h A intern/cycles/kernel/device/metal/kernel.metal M intern/cycles/kernel/device/optix/compat.h === diff --git a/intern/cycles/kernel/CMakeLists.txt b/intern/cycles/kernel/CMakeLists.txt index 29ff69df864..f311b0e74bb 100644 --- a/intern/cycles/kernel/CMakeLists.txt +++ b/intern/cycles/kernel/CMakeLists.txt @@ -39,6 +39,10 @@ set(SRC_KERNEL_DEVICE_HIP device/hip/kernel.cpp ) +set(SRC_KERNEL_DEVICE_METAL + device/metal/kernel.metal +) + set(SRC_KERNEL_DEVICE_OPTIX device/optix/kernel.cu device/optix/kernel_shader_raytrace.cu @@ -79,6 +83,13 @@ set(SRC_KERNEL_DEVICE_OPTIX_HEADERS device/optix/globals.h ) +set(SRC_KERNEL_DEVICE_METAL_HEADERS + device/metal/compat.h + device/metal/context_begin.h + device/metal/context_end.h + device/metal/globals.h +) + set(SRC_KERNEL_CLOSURE_HEADERS closure/alloc.h closure/bsdf.h @@ -368,6 +379,7 @@ if(WITH_CYCLES_CUDA_BINARIES) ${SRC_KERNEL_HEADERS} ${SRC_KERNEL_DEVICE_GPU_HEADERS} 
${SRC_KERNEL_DEVICE_CUDA_HEADERS} +${SRC_KERNEL_DEVICE_METAL_HEADERS} ${SRC_UTIL_HEADERS} ) set(cuda_cubins) @@ -723,12 +735,14 @@ cycles_add_library(cycles_kernel "${LIB}" ${SRC_KERNEL_DEVICE_CUDA} ${SRC_KERNEL_DEVICE_HIP} ${SRC_KERNEL_DEVICE_OPTIX} + ${SRC_KERNEL_DEVICE_METAL} ${SRC_KERNEL_HEADERS} ${SRC_KERNEL_DEVICE_CPU_HEADERS} ${SRC_KERNEL_DEVICE_GPU_HEADERS} ${SRC_KERNEL_DEVICE_CUDA_HEADERS} ${SRC_KERNEL_DEVICE_HIP_HEADERS} ${SRC_KERNEL_DEVICE_OPTIX_HEADERS} + ${SRC_KERNEL_DEVICE_METAL_HEADERS} ) source_group("bake" FILES ${SRC_KERNEL_BAKE_HEADERS}) @@ -740,6 +754,7 @@ source_group("device\\cuda" FILES ${SRC_KERNEL_DEVICE_CUDA} ${SRC_KERNEL_DEVICE_ source_group("device
[Bf-blender-cvs] [a0f269f682d] master: Cycles: Kernel address space changes for MSL
Commit: a0f269f682dab848afc80cd322d04a0c4a815cae Author: Michael Jones Date: Thu Oct 14 13:53:40 2021 +0100 Branches: master https://developer.blender.org/rBa0f269f682dab848afc80cd322d04a0c4a815cae Cycles: Kernel address space changes for MSL This is the first of a sequence of changes to support compiling Cycles kernels as MSL (Metal Shading Language) in preparation for a Metal GPU device implementation. MSL requires that all pointer types be declared with explicit address space attributes (device, thread, etc...). There is already precedent for this with Cycles' address space macros (ccl_global, ccl_private, etc...), therefore the first step of MSL-enablement is to apply these consistently. Line-for-line this represents the largest change required to enable MSL. Applying this change first will simplify future patches as well as offering the emergent benefit of enhanced descriptiveness. The vast majority of deltas in this patch fall into one of two cases: - Ensuring ccl_private is specified for thread-local pointer types - Ensuring ccl_global is specified for device-wide pointer types Additionally, the ccl_addr_space qualifier can be removed. Prior to Cycles X, ccl_addr_space was used as a context-dependent address space qualifier, but now it is either redundant (e.g. in struct typedefs), or can be replaced by ccl_global in the case of pointer types. Associated function variants (e.g. lcg_step_float_addrspace) are also redundant. In cases where address space qualifiers are chained with "const", this patch places the address space qualifier first. The rationale for this is that the choice of address space is likely to have the greater impact on runtime performance and overall architecture. The final part of this patch is the addition of a metal/compat.h header. This is partially complete and will be extended in future patches, paving the way for the full Metal implementation. 
Ref T92212 Reviewed By: brecht Maniphest Tasks: T92212 Differential Revision: https://developer.blender.org/D12864 === M intern/cycles/kernel/bvh/bvh.h M intern/cycles/kernel/bvh/bvh_local.h M intern/cycles/kernel/bvh/bvh_nodes.h M intern/cycles/kernel/bvh/bvh_shadow_all.h M intern/cycles/kernel/bvh/bvh_traversal.h M intern/cycles/kernel/bvh/bvh_util.h M intern/cycles/kernel/bvh/bvh_volume.h M intern/cycles/kernel/bvh/bvh_volume_all.h M intern/cycles/kernel/closure/alloc.h M intern/cycles/kernel/closure/bsdf.h M intern/cycles/kernel/closure/bsdf_ashikhmin_shirley.h M intern/cycles/kernel/closure/bsdf_ashikhmin_velvet.h M intern/cycles/kernel/closure/bsdf_diffuse.h M intern/cycles/kernel/closure/bsdf_diffuse_ramp.h M intern/cycles/kernel/closure/bsdf_hair.h M intern/cycles/kernel/closure/bsdf_hair_principled.h M intern/cycles/kernel/closure/bsdf_microfacet.h M intern/cycles/kernel/closure/bsdf_microfacet_multi.h M intern/cycles/kernel/closure/bsdf_microfacet_multi_impl.h M intern/cycles/kernel/closure/bsdf_oren_nayar.h M intern/cycles/kernel/closure/bsdf_phong_ramp.h M intern/cycles/kernel/closure/bsdf_principled_diffuse.h M intern/cycles/kernel/closure/bsdf_principled_sheen.h M intern/cycles/kernel/closure/bsdf_reflection.h M intern/cycles/kernel/closure/bsdf_refraction.h M intern/cycles/kernel/closure/bsdf_toon.h M intern/cycles/kernel/closure/bsdf_transparent.h M intern/cycles/kernel/closure/bsdf_util.h M intern/cycles/kernel/closure/bssrdf.h M intern/cycles/kernel/closure/emissive.h M intern/cycles/kernel/closure/volume.h M intern/cycles/kernel/device/cpu/compat.h M intern/cycles/kernel/device/cuda/compat.h M intern/cycles/kernel/device/hip/compat.h A intern/cycles/kernel/device/metal/compat.h M intern/cycles/kernel/device/optix/compat.h M intern/cycles/kernel/geom/geom_attribute.h M intern/cycles/kernel/geom/geom_curve.h M intern/cycles/kernel/geom/geom_curve_intersect.h M intern/cycles/kernel/geom/geom_motion_curve.h M 
intern/cycles/kernel/geom/geom_motion_triangle.h M intern/cycles/kernel/geom/geom_motion_triangle_intersect.h M intern/cycles/kernel/geom/geom_motion_triangle_shader.h M intern/cycles/kernel/geom/geom_object.h M intern/cycles/kernel/geom/geom_patch.h M intern/cycles/kernel/geom/geom_primitive.h M intern/cycles/kernel/geom/geom_shader_data.h M intern/cycles/kernel/geom/geom_subd_triangle.h M intern/cycles/kernel/geom/geom_triangle.h M intern/cycles/kernel/geom/geom_triangle_intersect.h M intern/cycles/kernel/geom/geom_volume.h M intern/cycles/kernel/integrator/integrator_init_from_bake.h M intern/cycles/kernel/integrator/integrator_init_from_camera.h M intern/cycles/ke