This is an automated email from the ASF dual-hosted git repository. guanmingchiu pushed a commit to branch main in repository https://gitbox.apache.org/repos/asf/mahout.git
commit a7a1fd47724eb9fed76466248e85a387619855bb Author: Ping <[email protected]> AuthorDate: Sat Dec 6 23:45:03 2025 +0800 [QDP] Gracefully handles OOM (#688) * gracefully handles OOM Signed-off-by: 400Ping <[email protected]> * fix Signed-off-by: 400Ping <[email protected]> --------- Signed-off-by: 400Ping <[email protected]> --- qdp/qdp-core/src/gpu/encodings/amplitude.rs | 94 ++++++++++++-------- qdp/qdp-core/src/gpu/memory.rs | 130 ++++++++++++++++++++++++++-- qdp/qdp-core/src/gpu/pipeline.rs | 12 ++- 3 files changed, 194 insertions(+), 42 deletions(-) diff --git a/qdp/qdp-core/src/gpu/encodings/amplitude.rs b/qdp/qdp-core/src/gpu/encodings/amplitude.rs index 38551d15c..9868a17bc 100644 --- a/qdp/qdp-core/src/gpu/encodings/amplitude.rs +++ b/qdp/qdp-core/src/gpu/encodings/amplitude.rs @@ -29,6 +29,8 @@ use std::ffi::c_void; use cudarc::driver::DevicePtr; #[cfg(target_os = "linux")] use qdp_kernels::launch_amplitude_encode; +#[cfg(target_os = "linux")] +use crate::gpu::memory::{ensure_device_memory_available, map_allocation_error}; use crate::preprocessing::Preprocessor; @@ -65,40 +67,56 @@ impl QuantumEncoder for AmplitudeEncoder { if host_data.len() < ASYNC_THRESHOLD { // Synchronous path for small data (avoids stream overhead) - let input_slice = { - crate::profile_scope!("GPU::H2DCopy"); - _device.htod_sync_copy(host_data) - .map_err(|e| MahoutError::MemoryAllocation(format!("Failed to allocate input buffer: {:?}", e)))? - }; + let input_bytes = host_data.len() * std::mem::size_of::<f64>(); + ensure_device_memory_available(input_bytes, "input staging buffer", Some(num_qubits))?; - let ret = { - crate::profile_scope!("GPU::KernelLaunch"); - unsafe { - launch_amplitude_encode( - *input_slice.device_ptr() as *const f64, - state_vector.ptr() as *mut c_void, + let input_slice = { + crate::profile_scope!("GPU::H2DCopy"); + _device.htod_sync_copy(host_data) + .map_err(|e| map_allocation_error( + input_bytes, + "input staging buffer", + Some(num_qubits), + e, + ))? + }; + + let ret = { + crate::profile_scope!("GPU::KernelLaunch"); + unsafe { + launch_amplitude_encode( + *input_slice.device_ptr() as *const f64, + state_vector.ptr() as *mut c_void, host_data.len(), state_len, - norm, - std::ptr::null_mut(), // default stream - ) - } - }; + norm, + std::ptr::null_mut(), // default stream + ) + } + }; - if ret != 0 { - let error_msg = format!( - "Kernel launch failed with CUDA error code: {} ({})", - ret, - cuda_error_to_string(ret) - ); - return Err(MahoutError::KernelLaunch(error_msg)); - } + if ret != 0 { + let error_msg = if ret == 2 { + format!( + "Kernel launch reported cudaErrorMemoryAllocation (likely OOM) while encoding {} elements into 2^{} state.", + host_data.len(), + num_qubits, + ) + } else { + format!( + "Kernel launch failed with CUDA error code: {} ({})", + ret, + cuda_error_to_string(ret) + ) + }; + return Err(MahoutError::KernelLaunch(error_msg)); + } - { - crate::profile_scope!("GPU::Synchronize"); - _device - .synchronize() - .map_err(|e| MahoutError::Cuda(format!("CUDA device synchronize failed: {:?}", e)))?; + { + crate::profile_scope!("GPU::Synchronize"); + _device + .synchronize() + .map_err(|e| MahoutError::Cuda(format!("CUDA device synchronize failed: {:?}", e)))?; } } else { // Async Pipeline path for large data @@ -163,11 +181,19 @@ impl AmplitudeEncoder { }; if ret != 0 { - let error_msg = format!( - "Kernel launch failed with CUDA error code: {} ({})", - ret, - cuda_error_to_string(ret) - ); + let error_msg = if ret == 2 { + format!( + "Kernel launch reported cudaErrorMemoryAllocation (likely OOM) while encoding chunk starting at offset {} (len={}).", + chunk_offset, + chunk_len + ) + } else { + format!( + "Kernel launch failed with CUDA error code: {} ({})", + ret, + cuda_error_to_string(ret) + ) + }; return Err(MahoutError::KernelLaunch(error_msg)); } diff --git a/qdp/qdp-core/src/gpu/memory.rs b/qdp/qdp-core/src/gpu/memory.rs index 513c326c0..1ac8eabb5 100644 --- a/qdp/qdp-core/src/gpu/memory.rs +++ b/qdp/qdp-core/src/gpu/memory.rs @@ -19,6 +19,115 @@ use cudarc::driver::{CudaDevice, CudaSlice, DevicePtr}; use qdp_kernels::CuDoubleComplex; use crate::error::{MahoutError, Result}; +#[cfg(target_os = "linux")] +fn bytes_to_mib(bytes: usize) -> f64 { + bytes as f64 / (1024.0 * 1024.0) +} + +#[cfg(target_os = "linux")] +fn cuda_error_to_string(code: i32) -> &'static str { + match code { + 0 => "cudaSuccess", + 2 => "cudaErrorMemoryAllocation", + 3 => "cudaErrorInitializationError", + 30 => "cudaErrorUnknown", + _ => "Unknown CUDA error", + } +} + +#[cfg(target_os = "linux")] +fn query_cuda_mem_info() -> Result<(usize, usize)> { + unsafe { + unsafe extern "C" { + fn cudaMemGetInfo(free: *mut usize, total: *mut usize) -> i32; + } + + let mut free_bytes: usize = 0; + let mut total_bytes: usize = 0; + let result = cudaMemGetInfo(&mut free_bytes as *mut usize, &mut total_bytes as *mut usize); + + if result != 0 { + return Err(MahoutError::Cuda(format!( + "cudaMemGetInfo failed: {} ({})", + result, + cuda_error_to_string(result) + ))); + } + + Ok((free_bytes, total_bytes)) + } +} + +#[cfg(target_os = "linux")] +fn build_oom_message(context: &str, requested_bytes: usize, qubits: Option<usize>, free: usize, total: usize) -> String { + let qubit_hint = qubits + .map(|q| format!(" (qubits={})", q)) + .unwrap_or_default(); + + format!( + "GPU out of memory during {context}{qubit_hint}: requested {:.2} MiB, free {:.2} MiB / total {:.2} MiB. Reduce qubits or batch size and retry.", + bytes_to_mib(requested_bytes), + bytes_to_mib(free), + bytes_to_mib(total), + ) +} + +/// Guard that checks available GPU memory before attempting a large allocation. +/// +/// Returns a MemoryAllocation error with a helpful message when the request +/// exceeds the currently reported free memory. +#[cfg(target_os = "linux")] +pub(crate) fn ensure_device_memory_available(requested_bytes: usize, context: &str, qubits: Option<usize>) -> Result<()> { + let (free, total) = query_cuda_mem_info()?; + + if (requested_bytes as u64) > (free as u64) { + return Err(MahoutError::MemoryAllocation(build_oom_message( + context, + requested_bytes, + qubits, + free, + total, + ))); + } + + Ok(()) +} + +/// Wraps CUDA allocation errors with an OOM-aware MahoutError. +#[cfg(target_os = "linux")] +pub(crate) fn map_allocation_error( + requested_bytes: usize, + context: &str, + qubits: Option<usize>, + source: impl std::fmt::Debug, +) -> MahoutError { + match query_cuda_mem_info() { + Ok((free, total)) => { + if (requested_bytes as u64) > (free as u64) { + MahoutError::MemoryAllocation(build_oom_message( + context, + requested_bytes, + qubits, + free, + total, + )) + } else { + MahoutError::MemoryAllocation(format!( + "GPU allocation failed during {context}: requested {:.2} MiB. CUDA error: {:?}", + bytes_to_mib(requested_bytes), + source, + )) + } + } + Err(e) => MahoutError::MemoryAllocation(format!( + "GPU allocation failed during {context}: requested {:.2} MiB. Unable to fetch memory info: {:?}; CUDA error: {:?}", + bytes_to_mib(requested_bytes), + e, + source, + )), + } +} + /// RAII wrapper for GPU memory buffer /// Automatically frees GPU memory when dropped pub struct GpuBufferRaw { @@ -55,20 +164,29 @@ impl GpuStateVector { /// Create GPU state vector for n qubits /// Allocates 2^n complex numbers on GPU (freed on drop) pub fn new(_device: &Arc<CudaDevice>, qubits: usize) -> Result<Self> { - let _size_elements = 1 << qubits; + let _size_elements: usize = 1usize << qubits; #[cfg(target_os = "linux")] { + let requested_bytes = _size_elements + .checked_mul(std::mem::size_of::<CuDoubleComplex>()) + .ok_or_else(|| MahoutError::MemoryAllocation( + format!("Requested GPU allocation size overflow (elements={})", _size_elements) + ))?; + + // Pre-flight check to gracefully fail before cudaMalloc when OOM is obvious + ensure_device_memory_available(requested_bytes, "state vector allocation", Some(qubits))?; + // Use uninitialized allocation to avoid memory bandwidth waste. // TODO: Consider using a memory pool for input buffers to avoid repeated // cudaMalloc overhead in high-frequency encode() calls. let slice = unsafe { _device.alloc::<CuDoubleComplex>(_size_elements) - }.map_err(|e| MahoutError::MemoryAllocation( - format!("Failed to allocate {} bytes of GPU memory (qubits={}): {:?}", - _size_elements * std::mem::size_of::<CuDoubleComplex>(), - qubits, - e) + }.map_err(|e| map_allocation_error( + requested_bytes, + "state vector allocation", + Some(qubits), + e, ))?; Ok(Self { diff --git a/qdp/qdp-core/src/gpu/pipeline.rs b/qdp/qdp-core/src/gpu/pipeline.rs index fd9a5989d..3c5921c38 100644 --- a/qdp/qdp-core/src/gpu/pipeline.rs +++ b/qdp/qdp-core/src/gpu/pipeline.rs @@ -23,6 +23,8 @@ use std::sync::Arc; use std::ffi::c_void; use cudarc::driver::{CudaDevice, CudaSlice, DevicePtr, safe::CudaStream}; use crate::error::{MahoutError, Result}; +#[cfg(target_os = "linux")] +use crate::gpu::memory::{ensure_device_memory_available, map_allocation_error}; /// Chunk processing callback for async pipeline /// @@ -92,11 +94,17 @@ where crate::profile_scope!("GPU::ChunkProcess"); + let chunk_bytes = chunk.len() * std::mem::size_of::<f64>(); + ensure_device_memory_available(chunk_bytes, "pipeline chunk buffer allocation", None)?; + // Allocate temporary device buffer for this chunk let input_chunk_dev = unsafe { device.alloc::<f64>(chunk.len()) - }.map_err(|e| MahoutError::MemoryAllocation( - format!("Failed to allocate chunk buffer: {:?}", e) + }.map_err(|e| map_allocation_error( + chunk_bytes, + "pipeline chunk buffer allocation", + None, + e, ))?; // Async copy: host to device (non-blocking, on specified stream)
