Commit: 4412e14708c5625c3fe84bc75fce2ca6de6f58c9 Author: Michael Jones Date: Tue Jun 7 11:08:21 2022 +0100 Branches: master https://developer.blender.org/rB4412e14708c5625c3fe84bc75fce2ca6de6f58c9
Cycles: Useful Metal backend debug & profiling functionality This patch adds some useful debugging & profiling env vars to the Metal backend: - `CYCLES_METAL_PROFILING`: output a per-kernel timing report at the end of the render - `CYCLES_METAL_DEBUG`: enable per-dispatch tracing (very verbose) - `CYCLES_DEBUG_METAL_CAPTURE_KERNEL`: enable programatic .gputrace capture for a specified kernel index Here's an example of the timing report with `CYCLES_METAL_PROFILING` enabled: ``` --------------------------------------------------------------------------------------------------- Kernel name Total threads Dispatches Avg. T/D Time Time% --------------------------------------------------------------------------------------------------- integrator_init_from_camera 657,407,232 161 4,083,274 0.24s 0.51% integrator_intersect_closest 1,629,288,440 681 2,392,494 15.18s 32.12% integrator_intersect_shadow 751,652,291 470 1,599,260 5.80s 12.28% integrator_shade_background 304,612,074 263 1,158,220 1.16s 2.45% integrator_shade_surface 1,159,764,041 676 1,715,627 20.57s 43.52% integrator_shade_shadow 598,885,847 418 1,432,741 1.27s 2.69% integrator_queued_paths_array 2,969,650,130 805 3,689,006 0.35s 0.74% integrator_queued_shadow_paths_array 593,936,619 379 1,567,115 0.14s 0.29% integrator_terminated_paths_array 22,205,417 155 143,260 0.05s 0.10% integrator_sorted_paths_array 2,517,140,043 676 3,723,579 1.65s 3.50% integrator_compact_paths_array 648,912,748 155 4,186,533 0.03s 0.07% integrator_compact_states 20,872,687 155 134,662 0.14s 0.29% integrator_terminated_shadow_paths_array 374,100,675 438 854,111 0.16s 0.33% integrator_compact_shadow_paths_array 503,768,657 438 1,150,156 0.05s 0.10% integrator_compact_shadow_states 37,664,941 202 186,460 0.23s 0.50% integrator_reset 25,165,824 6 4,194,304 0.06s 0.12% film_convert_combined_half_rgba 3,110,400 6 518,400 0.00s 0.01% prefix_sum 676 676 1 0.19s 0.40% --------------------------------------------------------------------------------------------------- 6,760 47.27s 100.00% --------------------------------------------------------------------------------------------------- ``` Reviewed By: brecht Differential Revision: https://developer.blender.org/D15044 =================================================================== M intern/cycles/device/metal/bvh.mm M intern/cycles/device/metal/device_impl.h M intern/cycles/device/metal/device_impl.mm M intern/cycles/device/metal/queue.h M intern/cycles/device/metal/queue.mm M intern/cycles/device/metal/util.h M intern/cycles/kernel/device/gpu/kernel.h =================================================================== diff --git a/intern/cycles/device/metal/bvh.mm b/intern/cycles/device/metal/bvh.mm index 086fbb093ba..09c4ace081e 100644 --- a/intern/cycles/device/metal/bvh.mm +++ b/intern/cycles/device/metal/bvh.mm @@ -11,6 +11,7 @@ # include "util/progress.h" # include "device/metal/bvh.h" +# include "device/metal/util.h" CCL_NAMESPACE_BEGIN @@ -18,6 +19,7 @@ CCL_NAMESPACE_BEGIN { \ string str = string_printf(__VA_ARGS__); \ progress.set_substatus(str); \ + metal_printf("%s\n", str.c_str()); \ } BVHMetal::BVHMetal(const BVHParams ¶ms_, diff --git a/intern/cycles/device/metal/device_impl.h b/intern/cycles/device/metal/device_impl.h index 7506b9b069f..0e6817d94f8 100644 --- a/intern/cycles/device/metal/device_impl.h +++ b/intern/cycles/device/metal/device_impl.h @@ -31,6 +31,8 @@ class MetalDevice : public Device { string source[PSO_NUM]; string source_md5[PSO_NUM]; + bool capture_enabled = false; + KernelParamsMetal launch_params = {0}; /* MetalRT members ----------------------------------*/ diff --git a/intern/cycles/device/metal/device_impl.mm b/intern/cycles/device/metal/device_impl.mm index 16aabacb4cf..086bf0af979 100644 --- a/intern/cycles/device/metal/device_impl.mm +++ b/intern/cycles/device/metal/device_impl.mm @@ -86,6 +86,10 @@ MetalDevice::MetalDevice(const DeviceInfo &info, Stats &stats, Profiler &profile use_metalrt = (atoi(metalrt) != 0); } + if (getenv("CYCLES_DEBUG_METAL_CAPTURE_KERNEL")) { + capture_enabled = true; + } + MTLArgumentDescriptor *arg_desc_params = [[MTLArgumentDescriptor alloc] init]; arg_desc_params.dataType = MTLDataTypePointer; arg_desc_params.access = MTLArgumentAccessReadOnly; @@ -394,7 +398,7 @@ MetalDevice::MetalMem *MetalDevice::generic_alloc(device_memory &mem) } if (size > 0) { - if (mem.type == MEM_DEVICE_ONLY) { + if (mem.type == MEM_DEVICE_ONLY && !capture_enabled) { options = MTLResourceStorageModePrivate; } diff --git a/intern/cycles/device/metal/queue.h b/intern/cycles/device/metal/queue.h index 6cc84a20787..de20514de0b 100644 --- a/intern/cycles/device/metal/queue.h +++ b/intern/cycles/device/metal/queue.h @@ -12,8 +12,6 @@ # include "device/metal/util.h" # include "kernel/device/metal/globals.h" -# define metal_printf VLOG(4) << string_printf - CCL_NAMESPACE_BEGIN class MetalDevice; @@ -77,6 +75,38 @@ class MetalDeviceQueue : public DeviceQueue { void close_compute_encoder(); void close_blit_encoder(); + + bool verbose_tracing = false; + + /* Per-kernel profiling (see CYCLES_METAL_PROFILING). */ + + struct TimingData { + DeviceKernel kernel; + int work_size; + uint64_t timing_id; + }; + std::vector<TimingData> command_encoder_labels; + id<MTLSharedEvent> timing_shared_event = nil; + uint64_t timing_shared_event_id; + uint64_t command_buffer_start_timing_id; + + struct TimingStats { + double total_time = 0.0; + uint64_t total_work_size = 0; + uint64_t num_dispatches = 0; + }; + TimingStats timing_stats[DEVICE_KERNEL_NUM]; + double last_completion_time = 0.0; + + /* .gputrace capture (see CYCLES_DEBUG_METAL_CAPTURE_...). */ + + id<MTLCaptureScope> mtlCaptureScope = nil; + DeviceKernel capture_kernel; + int capture_dispatch = 0; + int capture_dispatch_counter = 0; + bool is_capturing = false; + bool is_capturing_to_disk = false; + bool has_captured_to_disk = false; }; CCL_NAMESPACE_END diff --git a/intern/cycles/device/metal/queue.mm b/intern/cycles/device/metal/queue.mm index c1dab5b0d8f..8b2d5d81859 100644 --- a/intern/cycles/device/metal/queue.mm +++ b/intern/cycles/device/metal/queue.mm @@ -37,6 +37,61 @@ MetalDeviceQueue::MetalDeviceQueue(MetalDevice *device) } wait_semaphore = dispatch_semaphore_create(0); + + if (@available(macos 10.14, *)) { + if (getenv("CYCLES_METAL_PROFILING")) { + /* Enable per-kernel timing breakdown (shown at end of render). */ + timing_shared_event = [mtlDevice newSharedEvent]; + } + if (getenv("CYCLES_METAL_DEBUG")) { + /* Enable very verbose tracing (shows every dispatch). */ + verbose_tracing = true; + } + timing_shared_event_id = 1; + } + + capture_kernel = DeviceKernel(-1); + if (auto capture_kernel_str = getenv("CYCLES_DEBUG_METAL_CAPTURE_KERNEL")) { + /* Enable .gputrace capture for the specified DeviceKernel. */ + MTLCaptureManager *captureManager = [MTLCaptureManager sharedCaptureManager]; + mtlCaptureScope = [captureManager newCaptureScopeWithDevice:mtlDevice]; + mtlCaptureScope.label = [NSString stringWithFormat:@"Cycles kernel dispatch"]; + [captureManager setDefaultCaptureScope:mtlCaptureScope]; + + capture_dispatch = -1; + if (auto capture_dispatch_str = getenv("CYCLES_DEBUG_METAL_CAPTURE_DISPATCH")) { + capture_dispatch = atoi(capture_dispatch_str); + capture_dispatch_counter = 0; + } + + capture_kernel = DeviceKernel(atoi(capture_kernel_str)); + printf("Capture kernel: %d = %s\n", capture_kernel, device_kernel_as_string(capture_kernel)); + + if (auto capture_url = getenv("CYCLES_DEBUG_METAL_CAPTURE_URL")) { + if (@available(macos 10.15, *)) { + if ([captureManager supportsDestination:MTLCaptureDestinationGPUTraceDocument]) { + + MTLCaptureDescriptor *captureDescriptor = [[MTLCaptureDescriptor alloc] init]; + captureDescriptor.captureObject = mtlCaptureScope; + captureDescriptor.destination = MTLCaptureDestinationGPUTraceDocument; + captureDescriptor.outputURL = [NSURL fileURLWithPath:@(capture_url)]; + + NSError *error; + if (![captureManager startCaptureWithDescriptor:captureDescriptor error:&error]) { + NSString *err = [error localizedDescription]; + printf("Start capture failed: %s\n", [err UTF8String]); + } + else { + printf("Capture started (URL: %s)\n", capture_url); + is_capturing_to_disk = true; + } + } + else { + printf("Capture to file is not supported\n"); + } + } + } + } } MetalDeviceQueue::~MetalDeviceQueue() @@ -58,6 +113,56 @@ MetalDeviceQueue::~MetalDeviceQueue() [mtlCommandQueue release]; mtlCommandQueue = nil; } + + if (mtlCaptureScope) { + [mtlCaptureScope release]; + } + + double total_time = 0.0; + + /* Show per-kernel timings, if gathered (see CYCLES_METAL_PROFILING). */ + int64_t total_work_size = 0; + int64_t num_dispatches = 0; + for (auto &stat : timing_stats) { + total_time += stat.total_time; + total_work_size += stat.total_work_size; + num_dispatches += stat.num_dispatches; + } + + if (num_dispatches) { + printf("\nMetal dispatch stats:\n\n"); + auto header = string_printf("%-40s %16s %12s %12s %7s %7s", + "Kernel name", + "Total threads", + "Dispatches", + "Avg. T/D", + "Time", + "Time%"); + auto divider = string(header.length(), '-'); + printf("%s\n%s\n%s\n", divider.c_str(), header.c_str(), divider.c_str()); + + for (size_t i = 0; i < DEVICE_KERNEL_NUM; i++) { + auto &stat = timing_stats[i]; + if (stat.num_dispatches > 0) { + printf("%-40s %16s %12s %12s %6.2fs %6.2f%%\n", + device_kernel_as_string(DeviceKernel(i)), + string_human_readable_number(stat.total_work_size).c_str(), + string_human_readable_number(stat.num_dispatches).c_str(), + string_human_readable_number(stat.total_work_size / stat.num_dispatches).c_str(), + stat.total_time, + stat.total_time * 100.0 / total_time); + } + } + printf("%s\n", divider.c_str()); + printf("%-40s %16s %12s %12s %6.2fs %6.2f%%\n", + "", + "", + string_human_readable_number(num_dispatches).c_str(), + "", + total_time, + 100.0); + printf("%s\n\n", divider.c_str()); + } } int MetalDeviceQueue::num_concurrent_states(const size_t /*state_size*/) const @@ -101,6 +206,19 @@ bool MetalDeviceQueue::enqueue(DeviceKernel kernel, const int work_size, DeviceKernelArguments const &args) { + if (kernel == capture_kernel) { + if (capture_dispatch < 0 || capture_dispatch == capture_dispatch_counter) { + /* Start gputrace capture. */ + if (mtlCommandBuffer) { + synchronize(); + } + [mtlCaptureScope beginScope]; + printf("[mtlCaptureScope beginScope]\n"); + is_capturing = true; + } + capture_dispatch_counter += 1; + } + if (metal_device->have_error()) { return false; } @@ -110,6 +228,10 @@ bool MetalDeviceQueue::enqueue(DeviceKernel kernel, id<MTLComputeCommandEncoder> mtlComputeCommandEncoder = get_compute_encoder(kernel); + if (timing_shared_event) { + command_encoder_labels.push_back({kernel, work_size, timing_shared_event_id}); + } + /* Determine size requirement for argument buffer. */ size_t arg_buffer_length = 0; for (size_t i = 0; i < args.count; i++) { @@ -189,6 +311,14 @@ bool MetalDeviceQueue::enqueue(DeviceKernel kernel, /* Encode KernelParamsMetal buffers */ [metal_device->mtlBufferKernelParamsEncoder setArgumentBuffer:arg_buffer offset:globals_offsets]; + if (verbose_tracing || timing_shared_event || is_capturing) { + /* Add human-readable labels if we're doing any form of debugging / profiling. */ + mtlComputeCommandEncoder.label = [[NSString alloc] + initWithFormat:@"Metal queue launch %s, work_size %d", + device_kernel_as_string(kernel), + work_size]; + } + /* this relies on IntegratorStateGPU layout being contiguous device_ptrs */ const size_t pointer_block_end = offsetof(KernelParamsMetal, __integrator_state) + sizeof(IntegratorStateGPU); @@ -196,7 +326,7 @@ bool MetalDeviceQueue::enqueue(DeviceKernel kernel, int pointer_index = offset / sizeof(device_ptr); MetalDevice::MetalMem *mmem = @@ Diff output truncated at 10240 characters. @@ _______________________________________________ Bf-blender-cvs mailing list Bf-blender-cvs@blender.org List details, subscription details or unsubscribe: https://lists.blender.org/mailman/listinfo/bf-blender-cvs