Commit: 6dcfb6df9ce671996fcb39df1a1abadefd4f1d47 Author: Nikita Sirgienko Date: Wed Feb 1 17:22:53 2023 +0100 Branches: master https://developer.blender.org/rB6dcfb6df9ce671996fcb39df1a1abadefd4f1d47
Cycles: Abstract host memory fallback for GPU devices Host memory fallback in CUDA and HIP devices is almost identical. We remove duplicated code and create a shared generic version that other devices (oneAPI) will be able to use. Reviewed By: brecht Differential Revision: https://developer.blender.org/D17173 =================================================================== M intern/cycles/device/cuda/device_impl.cpp M intern/cycles/device/cuda/device_impl.h M intern/cycles/device/device.cpp M intern/cycles/device/device.h M intern/cycles/device/hip/device_impl.cpp M intern/cycles/device/hip/device_impl.h M intern/cycles/device/memory.h =================================================================== diff --git a/intern/cycles/device/cuda/device_impl.cpp b/intern/cycles/device/cuda/device_impl.cpp index f354ba6aee1..c19a0ade332 100644 --- a/intern/cycles/device/cuda/device_impl.cpp +++ b/intern/cycles/device/cuda/device_impl.cpp @@ -53,8 +53,12 @@ void CUDADevice::set_error(const string &error) } CUDADevice::CUDADevice(const DeviceInfo &info, Stats &stats, Profiler &profiler) - : Device(info, stats, profiler), texture_info(this, "texture_info", MEM_GLOBAL) + : GPUDevice(info, stats, profiler) { + /* Verify that base class types can be used with specific backend types */ + static_assert(sizeof(texMemObject) == sizeof(CUtexObject)); + static_assert(sizeof(arrayMemObject) == sizeof(CUarray)); + first_error = true; cuDevId = info.num; @@ -65,12 +69,6 @@ CUDADevice::CUDADevice(const DeviceInfo &info, Stats &stats, Profiler &profiler) need_texture_info = false; - device_texture_headroom = 0; - device_working_headroom = 0; - move_texture_to_host = false; - map_host_limit = 0; - map_host_used = 0; - can_map_host = 0; pitch_alignment = 0; /* Initialize CUDA. */ @@ -91,8 +89,9 @@ CUDADevice::CUDADevice(const DeviceInfo &info, Stats &stats, Profiler &profiler) /* CU_CTX_MAP_HOST for mapping host memory when out of device memory. * CU_CTX_LMEM_RESIZE_TO_MAX for reserving local memory ahead of render, * so we can predict which memory to map to host. */ - cuda_assert( - cuDeviceGetAttribute(&can_map_host, CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY, cuDevice)); + int value; + cuda_assert(cuDeviceGetAttribute(&value, CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY, cuDevice)); + can_map_host = value != 0; cuda_assert(cuDeviceGetAttribute( &pitch_alignment, CU_DEVICE_ATTRIBUTE_TEXTURE_PITCH_ALIGNMENT, cuDevice)); @@ -499,311 +498,57 @@ void CUDADevice::reserve_local_memory(const uint kernel_features) # endif } -void CUDADevice::init_host_memory() +void CUDADevice::get_device_memory_info(size_t &total, size_t &free) { - /* Limit amount of host mapped memory, because allocating too much can - * cause system instability. Leave at least half or 4 GB of system - * memory free, whichever is smaller. */ - size_t default_limit = 4 * 1024 * 1024 * 1024LL; - size_t system_ram = system_physical_ram(); - - if (system_ram > 0) { - if (system_ram / 2 > default_limit) { - map_host_limit = system_ram - default_limit; - } - else { - map_host_limit = system_ram / 2; - } - } - else { - VLOG_WARNING << "Mapped host memory disabled, failed to get system RAM"; - map_host_limit = 0; - } - - /* Amount of device memory to keep is free after texture memory - * and working memory allocations respectively. We set the working - * memory limit headroom lower so that some space is left after all - * texture memory allocations. */ - device_working_headroom = 32 * 1024 * 1024LL; // 32MB - device_texture_headroom = 128 * 1024 * 1024LL; // 128MB + CUDAContextScope scope(this); - VLOG_INFO << "Mapped host memory limit set to " << string_human_readable_number(map_host_limit) - << " bytes. (" << string_human_readable_size(map_host_limit) << ")"; + cuMemGetInfo(&free, &total); } -void CUDADevice::load_texture_info() +bool CUDADevice::alloc_device(void *&device_pointer, size_t size) { - if (need_texture_info) { - /* Unset flag before copying, so this does not loop indefinitely if the copy below calls - * into 'move_textures_to_host' (which calls 'load_texture_info' again). */ - need_texture_info = false; - texture_info.copy_to_device(); - } + CUDAContextScope scope(this); + + CUresult mem_alloc_result = cuMemAlloc((CUdeviceptr *)&device_pointer, size); + return mem_alloc_result == CUDA_SUCCESS; } -void CUDADevice::move_textures_to_host(size_t size, bool for_texture) +void CUDADevice::free_device(void *device_pointer) { - /* Break out of recursive call, which can happen when moving memory on a multi device. */ - static bool any_device_moving_textures_to_host = false; - if (any_device_moving_textures_to_host) { - return; - } - - /* Signal to reallocate textures in host memory only. */ - move_texture_to_host = true; - - while (size > 0) { - /* Find suitable memory allocation to move. */ - device_memory *max_mem = NULL; - size_t max_size = 0; - bool max_is_image = false; - - thread_scoped_lock lock(cuda_mem_map_mutex); - foreach (CUDAMemMap::value_type &pair, cuda_mem_map) { - device_memory &mem = *pair.first; - CUDAMem *cmem = &pair.second; - - /* Can only move textures allocated on this device (and not those from peer devices). - * And need to ignore memory that is already on the host. */ - if (!mem.is_resident(this) || cmem->use_mapped_host) { - continue; - } - - bool is_texture = (mem.type == MEM_TEXTURE || mem.type == MEM_GLOBAL) && - (&mem != &texture_info); - bool is_image = is_texture && (mem.data_height > 1); - - /* Can't move this type of memory. */ - if (!is_texture || cmem->array) { - continue; - } - - /* For other textures, only move image textures. */ - if (for_texture && !is_image) { - continue; - } - - /* Try to move largest allocation, prefer moving images. */ - if (is_image > max_is_image || (is_image == max_is_image && mem.device_size > max_size)) { - max_is_image = is_image; - max_size = mem.device_size; - max_mem = &mem; - } - } - lock.unlock(); - - /* Move to host memory. This part is mutex protected since - * multiple CUDA devices could be moving the memory. The - * first one will do it, and the rest will adopt the pointer. */ - if (max_mem) { - VLOG_WORK << "Move memory from device to host: " << max_mem->name; - - static thread_mutex move_mutex; - thread_scoped_lock lock(move_mutex); - - any_device_moving_textures_to_host = true; - - /* Potentially need to call back into multi device, so pointer mapping - * and peer devices are updated. This is also necessary since the device - * pointer may just be a key here, so cannot be accessed and freed directly. - * Unfortunately it does mean that memory is reallocated on all other - * devices as well, which is potentially dangerous when still in use (since - * a thread rendering on another devices would only be caught in this mutex - * if it so happens to do an allocation at the same time as well. */ - max_mem->device_copy_to(); - size = (max_size >= size) ? 0 : size - max_size; - - any_device_moving_textures_to_host = false; - } - else { - break; - } - } - - /* Unset flag before texture info is reloaded, since it should stay in device memory. */ - move_texture_to_host = false; + CUDAContextScope scope(this); - /* Update texture info array with new pointers. */ - load_texture_info(); + cuda_assert(cuMemFree((CUdeviceptr)device_pointer)); } -CUDADevice::CUDAMem *CUDADevice::generic_alloc(device_memory &mem, size_t pitch_padding) +bool CUDADevice::alloc_host(void *&shared_pointer, size_t size) { CUDAContextScope scope(this); - CUdeviceptr device_pointer = 0; - size_t size = mem.memory_size() + pitch_padding; - - CUresult mem_alloc_result = CUDA_ERROR_OUT_OF_MEMORY; - const char *status = ""; - - /* First try allocating in device memory, respecting headroom. We make - * an exception for texture info. It is small and frequently accessed, - * so treat it as working memory. - * - * If there is not enough room for working memory, we will try to move - * textures to host memory, assuming the performance impact would have - * been worse for working memory. */ - bool is_texture = (mem.type == MEM_TEXTURE || mem.type == MEM_GLOBAL) && (&mem != &texture_info); - bool is_image = is_texture && (mem.data_height > 1); - - size_t headroom = (is_texture) ? device_texture_headroom : device_working_headroom; - - size_t total = 0, free = 0; - cuMemGetInfo(&free, &total); - - /* Move textures to host memory if needed. */ - if (!move_texture_to_host && !is_image && (size + headroom) >= free && can_map_host) { - move_textures_to_host(size + headroom - free, is_texture); - cuMemGetInfo(&free, &total); - } - - /* Allocate in device memory. */ - if (!move_texture_to_host && (size + headroom) < free) { - mem_alloc_result = cuMemAlloc(&device_pointer, size); - if (mem_alloc_result == CUDA_SUCCESS) { - status = " in device memory"; - } - } - - /* Fall back to mapped host memory if needed and possible. */ - - void *shared_pointer = 0; - - if (mem_alloc_result != CUDA_SUCCESS && can_map_host && mem.type != MEM_DEVICE_ONLY) { - if (mem.shared_pointer) { - /* Another device already allocated host memory. */ - mem_alloc_result = CUDA_SUCCESS; - shared_pointer = mem.shared_pointer; - } - else if (map_host_used + size < map_host_limit) { - /* Allocate host memory ourselves. */ - mem_alloc_result = cuMemHostAlloc( - &shared_pointer, size, CU_MEMHOSTALLOC_DEVICEMAP | CU_MEMHOSTALLOC_WRITECOMBINED); - - assert((mem_alloc_result == CUDA_SUCCESS && shared_pointer != 0) || - (mem_alloc_result != CUDA_SUCCESS && shared_pointer == 0)); - } - - if (mem_alloc_result == CUDA_SUCCESS) { - cuda_assert(cuMemHostGetDevicePointer_v2(&device_pointer, shared_pointer, 0)); - map_host_used += size; - status = " in host memory"; - } - } - - if (mem_alloc_result != CUDA_SUCCESS) { - if (mem.type == MEM_DEVICE_ONLY) { - status = " failed, out of device memory"; - set_error("System is out of GPU memory"); - } - else { - status = " failed, out of device and host memory"; - set_error("System is out of GPU and shared host memory"); - } - } - - if (mem.name) { - VLOG_WORK << "Buffer allocate: " << mem.name << ", " - << string_human_readable_number(mem.memory_size()) << " bytes. (" - << string_human_readable_size(mem.memory_size()) << ")" << status; - } - - mem.device_pointer = (device_ptr)device_pointer; - mem.device_size = size; - stats. @@ Diff output truncated at 10240 characters. @@ _______________________________________________ Bf-blender-cvs mailing list Bf-blender-cvs@blender.org List details, subscription details or unsubscribe: https://lists.blender.org/mailman/listinfo/bf-blender-cvs