Commit: 2d994de77c35a6e8a8a9c78935a3f8ed7d147f7d Author: Michael Jones Date: Mon Feb 6 19:09:51 2023 +0000 Branches: master https://developer.blender.org/rB2d994de77c35a6e8a8a9c78935a3f8ed7d147f7d
Cycles: MetalRT optimisation for subsurface intersection queries This patch optimises subsurface intersection queries on MetalRT. Currently intersect_local traverses from the scene root, retrospectively discarding all non-local hits. Using a lookup of bottom level acceleration structures, we can explicitly query only the relevant instance. On M1 Max, with MetalRT selected, this can give a render speedup of 15-20% for scenes like Monster which make heavy use of subsurface scattering. Patch authored by Marco Giordano. Reviewed By: brecht Differential Revision: https://developer.blender.org/D17153 =================================================================== M intern/cycles/device/metal/bvh.h M intern/cycles/device/metal/bvh.mm M intern/cycles/device/metal/device_impl.h M intern/cycles/device/metal/device_impl.mm M intern/cycles/device/metal/kernel.h M intern/cycles/device/metal/kernel.mm M intern/cycles/device/metal/queue.mm M intern/cycles/kernel/device/metal/bvh.h M intern/cycles/kernel/device/metal/compat.h M intern/cycles/kernel/device/metal/kernel.metal =================================================================== diff --git a/intern/cycles/device/metal/bvh.h b/intern/cycles/device/metal/bvh.h index 519cbf00294..5448a3ae41d 100644 --- a/intern/cycles/device/metal/bvh.h +++ b/intern/cycles/device/metal/bvh.h @@ -21,6 +21,7 @@ class BVHMetal : public BVH { API_AVAILABLE(macos(11.0)) vector<id<MTLAccelerationStructure>> blas_array; + vector<uint32_t> blas_lookup; bool motion_blur = false; diff --git a/intern/cycles/device/metal/bvh.mm b/intern/cycles/device/metal/bvh.mm index a7fd64d3c98..c692b762d86 100644 --- a/intern/cycles/device/metal/bvh.mm +++ b/intern/cycles/device/metal/bvh.mm @@ -816,6 +816,11 @@ bool BVHMetal::build_TLAS(Progress &progress, uint32_t instance_index = 0; uint32_t motion_transform_index = 0; + + // allocate look up buffer for worst case scenario + uint64_t count = objects.size(); + blas_lookup.resize(count); + for (Object *ob : 
objects) { /* Skip non-traceable objects */ if (!ob->is_traceable()) @@ -843,12 +848,15 @@ bool BVHMetal::build_TLAS(Progress &progress, /* Set user instance ID to object index */ int object_index = ob->get_device_index(); uint32_t user_id = uint32_t(object_index); + int currIndex = instance_index++; + assert(user_id < blas_lookup.size()); + blas_lookup[user_id] = accel_struct_index; /* Bake into the appropriate descriptor */ if (motion_blur) { MTLAccelerationStructureMotionInstanceDescriptor *instances = (MTLAccelerationStructureMotionInstanceDescriptor *)[instanceBuf contents]; - MTLAccelerationStructureMotionInstanceDescriptor &desc = instances[instance_index++]; + MTLAccelerationStructureMotionInstanceDescriptor &desc = instances[currIndex]; desc.accelerationStructureIndex = accel_struct_index; desc.userID = user_id; @@ -894,7 +902,7 @@ bool BVHMetal::build_TLAS(Progress &progress, else { MTLAccelerationStructureUserIDInstanceDescriptor *instances = (MTLAccelerationStructureUserIDInstanceDescriptor *)[instanceBuf contents]; - MTLAccelerationStructureUserIDInstanceDescriptor &desc = instances[instance_index++]; + MTLAccelerationStructureUserIDInstanceDescriptor &desc = instances[currIndex]; desc.accelerationStructureIndex = accel_struct_index; desc.userID = user_id; diff --git a/intern/cycles/device/metal/device_impl.h b/intern/cycles/device/metal/device_impl.h index a10962b4e45..2b89ebf19c9 100644 --- a/intern/cycles/device/metal/device_impl.h +++ b/intern/cycles/device/metal/device_impl.h @@ -74,6 +74,11 @@ class MetalDevice : public Device { id<MTLBuffer> texture_bindings_3d = nil; std::vector<id<MTLTexture>> texture_slot_map; + /* BLAS encoding & lookup */ + id<MTLArgumentEncoder> mtlBlasArgEncoder = nil; + id<MTLBuffer> blas_buffer = nil; + id<MTLBuffer> blas_lookup_buffer = nil; + bool use_metalrt = false; MetalPipelineType kernel_specialization_level = PSO_GENERIC; diff --git a/intern/cycles/device/metal/device_impl.mm 
b/intern/cycles/device/metal/device_impl.mm index 35298822e41..aadf5e02934 100644 --- a/intern/cycles/device/metal/device_impl.mm +++ b/intern/cycles/device/metal/device_impl.mm @@ -192,6 +192,10 @@ MetalDevice::MetalDevice(const DeviceInfo &info, Stats &stats, Profiler &profile arg_desc_as.dataType = MTLDataTypeInstanceAccelerationStructure; arg_desc_as.access = MTLArgumentAccessReadOnly; + MTLArgumentDescriptor *arg_desc_ptrs = [[MTLArgumentDescriptor alloc] init]; + arg_desc_ptrs.dataType = MTLDataTypePointer; + arg_desc_ptrs.access = MTLArgumentAccessReadOnly; + MTLArgumentDescriptor *arg_desc_ift = [[MTLArgumentDescriptor alloc] init]; arg_desc_ift.dataType = MTLDataTypeIntersectionFunctionTable; arg_desc_ift.access = MTLArgumentAccessReadOnly; @@ -204,14 +208,28 @@ MetalDevice::MetalDevice(const DeviceInfo &info, Stats &stats, Profiler &profile [ancillary_desc addObject:[arg_desc_ift copy]]; /* ift_shadow */ arg_desc_ift.index = index++; [ancillary_desc addObject:[arg_desc_ift copy]]; /* ift_local */ + arg_desc_ift.index = index++; + [ancillary_desc addObject:[arg_desc_ift copy]]; /* ift_local_prim */ + arg_desc_ptrs.index = index++; + [ancillary_desc addObject:[arg_desc_ptrs copy]]; /* blas array */ + arg_desc_ptrs.index = index++; + [ancillary_desc addObject:[arg_desc_ptrs copy]]; /* look up table for blas */ [arg_desc_ift release]; [arg_desc_as release]; + [arg_desc_ptrs release]; } } mtlAncillaryArgEncoder = [mtlDevice newArgumentEncoderWithArguments:ancillary_desc]; + // preparing the blas arg encoder + MTLArgumentDescriptor *arg_desc_blas = [[MTLArgumentDescriptor alloc] init]; + arg_desc_blas.dataType = MTLDataTypeInstanceAccelerationStructure; + arg_desc_blas.access = MTLArgumentAccessReadOnly; + mtlBlasArgEncoder = [mtlDevice newArgumentEncoderWithArguments:@[ arg_desc_blas ]]; + [arg_desc_blas release]; + for (int i = 0; i < ancillary_desc.count; i++) { [ancillary_desc[i] release]; } @@ -1240,6 +1258,33 @@ void MetalDevice::build_bvh(BVH *bvh, 
Progress &progress, bool refit) if (@available(macos 11.0, *)) { if (bvh->params.top_level) { bvhMetalRT = bvh_metal; + + // allocate required buffers for BLAS array + uint64_t count = bvhMetalRT->blas_array.size(); + uint64_t bufferSize = mtlBlasArgEncoder.encodedLength * count; + blas_buffer = [mtlDevice newBufferWithLength:bufferSize options:default_storage_mode]; + stats.mem_alloc(blas_buffer.allocatedSize); + + for (uint64_t i = 0; i < count; ++i) { + [mtlBlasArgEncoder setArgumentBuffer:blas_buffer + offset:i * mtlBlasArgEncoder.encodedLength]; + [mtlBlasArgEncoder setAccelerationStructure:bvhMetalRT->blas_array[i] atIndex:0]; + } + + count = bvhMetalRT->blas_lookup.size(); + bufferSize = sizeof(uint32_t) * count; + blas_lookup_buffer = [mtlDevice newBufferWithLength:bufferSize + options:default_storage_mode]; + stats.mem_alloc(blas_lookup_buffer.allocatedSize); + + memcpy([blas_lookup_buffer contents], + bvhMetalRT -> blas_lookup.data(), + blas_lookup_buffer.allocatedSize); + + if (default_storage_mode == MTLResourceStorageModeManaged) { + [blas_buffer didModifyRange:NSMakeRange(0, blas_buffer.length)]; + [blas_lookup_buffer didModifyRange:NSMakeRange(0, blas_lookup_buffer.length)]; + } } } } diff --git a/intern/cycles/device/metal/kernel.h b/intern/cycles/device/metal/kernel.h index 212671f52a0..0225c5c4947 100644 --- a/intern/cycles/device/metal/kernel.h +++ b/intern/cycles/device/metal/kernel.h @@ -19,6 +19,8 @@ enum { METALRT_FUNC_SHADOW_BOX, METALRT_FUNC_LOCAL_TRI, METALRT_FUNC_LOCAL_BOX, + METALRT_FUNC_LOCAL_TRI_PRIM, + METALRT_FUNC_LOCAL_BOX_PRIM, METALRT_FUNC_CURVE_RIBBON, METALRT_FUNC_CURVE_RIBBON_SHADOW, METALRT_FUNC_CURVE_ALL, @@ -28,7 +30,13 @@ enum { METALRT_FUNC_NUM }; -enum { METALRT_TABLE_DEFAULT, METALRT_TABLE_SHADOW, METALRT_TABLE_LOCAL, METALRT_TABLE_NUM }; +enum { + METALRT_TABLE_DEFAULT, + METALRT_TABLE_SHADOW, + METALRT_TABLE_LOCAL, + METALRT_TABLE_LOCAL_PRIM, + METALRT_TABLE_NUM +}; /* Pipeline State Object types */ enum 
MetalPipelineType { diff --git a/intern/cycles/device/metal/kernel.mm b/intern/cycles/device/metal/kernel.mm index 2ed230ee657..d9e977f1ed6 100644 --- a/intern/cycles/device/metal/kernel.mm +++ b/intern/cycles/device/metal/kernel.mm @@ -524,6 +524,8 @@ void MetalKernelPipeline::compile() "__anyhit__cycles_metalrt_shadow_all_hit_box", "__anyhit__cycles_metalrt_local_hit_tri", "__anyhit__cycles_metalrt_local_hit_box", + "__anyhit__cycles_metalrt_local_hit_tri_prim", + "__anyhit__cycles_metalrt_local_hit_box_prim", "__intersection__curve_ribbon", "__intersection__curve_ribbon_shadow", "__intersection__curve_all", @@ -614,11 +616,17 @@ void MetalKernelPipeline::compile() rt_intersection_function[METALRT_FUNC_LOCAL_BOX], rt_intersection_function[METALRT_FUNC_LOCAL_BOX], nil]; + table_functions[METALRT_TABLE_LOCAL_PRIM] = [NSArray + arrayWithObjects:rt_intersection_function[METALRT_FUNC_LOCAL_TRI_PRIM], + rt_intersection_function[METALRT_FUNC_LOCAL_BOX_PRIM], + rt_intersection_function[METALRT_FUNC_LOCAL_BOX_PRIM], + nil]; NSMutableSet *unique_functions = [NSMutableSet setWithArray:table_functions[METALRT_TABLE_DEFAULT]]; [unique_functions addObjectsFromArray:table_functions[METALRT_TABLE_SHADOW]]; [unique_functions addObjectsFromArray:table_functions[METALRT_TABLE_LOCAL]]; + [unique_functions addObjectsFromArray:table_functions[METALRT_TABLE_LOCAL_PRIM]]; if (kernel_has_intersection(device_kernel)) { linked_functions = [[NSArray arrayWithArray:[unique_functions allObjects]] diff --git a/intern/cycles/device/metal/queue.mm b/intern/cycles/device/metal/queue.mm index 9137e9b1fb0..b824b75ccf4 100644 --- a/intern/cycles/device/metal/queue.mm +++ b/intern/cycles/device/metal/queue.mm @@ -482,6 +482,12 @@ bool MetalDeviceQueue::enqueue(DeviceKernel kernel, if (metal_device_->bvhMetalRT) { id<MTLAccelerationStructure> accel_struct = metal_device_->bvhMetalRT->accel_struct; [metal_device_->mtlAncillaryArgEncoder setAccelerationStructure:accel_struct atIndex:2]; + 
[metal_device_->mtlAncillaryArgEncoder setBuffer:metal_device_->blas_buffer + offset:0 + atIndex:7]; + [metal_device_->mtlAncillaryArgEncoder setBuffer:metal_device_->blas_lookup_buffer + offset:0 + @@ Diff output truncated at 10240 characters. @@ _______________________________________________ Bf-blender-cvs mailing list Bf-blender-cvs@blender.org List details, subscription details or unsubscribe: https://lists.blender.org/mailman/listinfo/bf-blender-cvs