This is an automated email from the ASF dual-hosted git repository.
guanmingchiu pushed a commit to branch dev-qdp
in repository https://gitbox.apache.org/repos/asf/mahout.git
The following commit(s) were added to refs/heads/dev-qdp by this push:
new 5738b8dfb [QDP] Gracefully handles OOM (#688)
5738b8dfb is described below
commit 5738b8dfb3dbf5c50e9b1826a20c6135c1797c07
Author: Ping <[email protected]>
AuthorDate: Sat Dec 6 23:45:03 2025 +0800
[QDP] Gracefully handles OOM (#688)
* gracefully handles OOM
Signed-off-by: 400Ping <[email protected]>
* fix
Signed-off-by: 400Ping <[email protected]>
---------
Signed-off-by: 400Ping <[email protected]>
---
qdp/qdp-core/src/gpu/encodings/amplitude.rs | 94 ++++++++++++--------
qdp/qdp-core/src/gpu/memory.rs | 130 ++++++++++++++++++++++++++--
qdp/qdp-core/src/gpu/pipeline.rs | 12 ++-
3 files changed, 194 insertions(+), 42 deletions(-)
diff --git a/qdp/qdp-core/src/gpu/encodings/amplitude.rs b/qdp/qdp-core/src/gpu/encodings/amplitude.rs
index 38551d15c..9868a17bc 100644
--- a/qdp/qdp-core/src/gpu/encodings/amplitude.rs
+++ b/qdp/qdp-core/src/gpu/encodings/amplitude.rs
@@ -29,6 +29,8 @@ use std::ffi::c_void;
use cudarc::driver::DevicePtr;
#[cfg(target_os = "linux")]
use qdp_kernels::launch_amplitude_encode;
+#[cfg(target_os = "linux")]
+use crate::gpu::memory::{ensure_device_memory_available, map_allocation_error};
use crate::preprocessing::Preprocessor;
@@ -65,40 +67,56 @@ impl QuantumEncoder for AmplitudeEncoder {
if host_data.len() < ASYNC_THRESHOLD {
// Synchronous path for small data (avoids stream overhead)
- let input_slice = {
- crate::profile_scope!("GPU::H2DCopy");
- _device.htod_sync_copy(host_data)
- .map_err(|e| MahoutError::MemoryAllocation(format!("Failed
to allocate input buffer: {:?}", e)))?
- };
+ let input_bytes = host_data.len() * std::mem::size_of::<f64>();
+ ensure_device_memory_available(input_bytes, "input staging
buffer", Some(num_qubits))?;
- let ret = {
- crate::profile_scope!("GPU::KernelLaunch");
- unsafe {
- launch_amplitude_encode(
- *input_slice.device_ptr() as *const f64,
- state_vector.ptr() as *mut c_void,
+ let input_slice = {
+ crate::profile_scope!("GPU::H2DCopy");
+ _device.htod_sync_copy(host_data)
+ .map_err(|e| map_allocation_error(
+ input_bytes,
+ "input staging buffer",
+ Some(num_qubits),
+ e,
+ ))?
+ };
+
+ let ret = {
+ crate::profile_scope!("GPU::KernelLaunch");
+ unsafe {
+ launch_amplitude_encode(
+ *input_slice.device_ptr() as *const f64,
+ state_vector.ptr() as *mut c_void,
host_data.len(),
state_len,
- norm,
- std::ptr::null_mut(), // default stream
- )
- }
- };
+ norm,
+ std::ptr::null_mut(), // default stream
+ )
+ }
+ };
- if ret != 0 {
- let error_msg = format!(
- "Kernel launch failed with CUDA error code: {} ({})",
- ret,
- cuda_error_to_string(ret)
- );
- return Err(MahoutError::KernelLaunch(error_msg));
- }
+ if ret != 0 {
+ let error_msg = if ret == 2 {
+ format!(
+ "Kernel launch reported cudaErrorMemoryAllocation
(likely OOM) while encoding {} elements into 2^{} state.",
+ host_data.len(),
+ num_qubits,
+ )
+ } else {
+ format!(
+ "Kernel launch failed with CUDA error code: {}
({})",
+ ret,
+ cuda_error_to_string(ret)
+ )
+ };
+ return Err(MahoutError::KernelLaunch(error_msg));
+ }
- {
- crate::profile_scope!("GPU::Synchronize");
- _device
- .synchronize()
- .map_err(|e| MahoutError::Cuda(format!("CUDA device
synchronize failed: {:?}", e)))?;
+ {
+ crate::profile_scope!("GPU::Synchronize");
+ _device
+ .synchronize()
+ .map_err(|e| MahoutError::Cuda(format!("CUDA device
synchronize failed: {:?}", e)))?;
}
} else {
// Async Pipeline path for large data
@@ -163,11 +181,19 @@ impl AmplitudeEncoder {
};
if ret != 0 {
- let error_msg = format!(
- "Kernel launch failed with CUDA error code: {} ({})",
- ret,
- cuda_error_to_string(ret)
- );
+ let error_msg = if ret == 2 {
+ format!(
+ "Kernel launch reported cudaErrorMemoryAllocation
(likely OOM) while encoding chunk starting at offset {} (len={}).",
+ chunk_offset,
+ chunk_len
+ )
+ } else {
+ format!(
+ "Kernel launch failed with CUDA error code: {} ({})",
+ ret,
+ cuda_error_to_string(ret)
+ )
+ };
return Err(MahoutError::KernelLaunch(error_msg));
}
diff --git a/qdp/qdp-core/src/gpu/memory.rs b/qdp/qdp-core/src/gpu/memory.rs
index 513c326c0..1ac8eabb5 100644
--- a/qdp/qdp-core/src/gpu/memory.rs
+++ b/qdp/qdp-core/src/gpu/memory.rs
@@ -19,6 +19,115 @@ use cudarc::driver::{CudaDevice, CudaSlice, DevicePtr};
use qdp_kernels::CuDoubleComplex;
use crate::error::{MahoutError, Result};
+#[cfg(target_os = "linux")]
+fn bytes_to_mib(bytes: usize) -> f64 {
+ bytes as f64 / (1024.0 * 1024.0)
+}
+
+#[cfg(target_os = "linux")]
+fn cuda_error_to_string(code: i32) -> &'static str {
+ match code {
+ 0 => "cudaSuccess",
+ 2 => "cudaErrorMemoryAllocation",
+ 3 => "cudaErrorInitializationError",
+ 30 => "cudaErrorUnknown",
+ _ => "Unknown CUDA error",
+ }
+}
+
+#[cfg(target_os = "linux")]
+fn query_cuda_mem_info() -> Result<(usize, usize)> {
+ unsafe {
+ unsafe extern "C" {
+ fn cudaMemGetInfo(free: *mut usize, total: *mut usize) -> i32;
+ }
+
+ let mut free_bytes: usize = 0;
+ let mut total_bytes: usize = 0;
+ let result = cudaMemGetInfo(&mut free_bytes as *mut usize, &mut
total_bytes as *mut usize);
+
+ if result != 0 {
+ return Err(MahoutError::Cuda(format!(
+ "cudaMemGetInfo failed: {} ({})",
+ result,
+ cuda_error_to_string(result)
+ )));
+ }
+
+ Ok((free_bytes, total_bytes))
+ }
+}
+
+#[cfg(target_os = "linux")]
+fn build_oom_message(context: &str, requested_bytes: usize, qubits:
Option<usize>, free: usize, total: usize) -> String {
+ let qubit_hint = qubits
+ .map(|q| format!(" (qubits={})", q))
+ .unwrap_or_default();
+
+ format!(
+ "GPU out of memory during {context}{qubit_hint}: requested {:.2} MiB,
free {:.2} MiB / total {:.2} MiB. Reduce qubits or batch size and retry.",
+ bytes_to_mib(requested_bytes),
+ bytes_to_mib(free),
+ bytes_to_mib(total),
+ )
+}
+
+/// Guard that checks available GPU memory before attempting a large allocation.
+///
+/// Returns a MemoryAllocation error with a helpful message when the request
+/// exceeds the currently reported free memory.
+#[cfg(target_os = "linux")]
+pub(crate) fn ensure_device_memory_available(requested_bytes: usize, context:
&str, qubits: Option<usize>) -> Result<()> {
+ let (free, total) = query_cuda_mem_info()?;
+
+ if (requested_bytes as u64) > (free as u64) {
+ return Err(MahoutError::MemoryAllocation(build_oom_message(
+ context,
+ requested_bytes,
+ qubits,
+ free,
+ total,
+ )));
+ }
+
+ Ok(())
+}
+
+/// Wraps CUDA allocation errors with an OOM-aware MahoutError.
+#[cfg(target_os = "linux")]
+pub(crate) fn map_allocation_error(
+ requested_bytes: usize,
+ context: &str,
+ qubits: Option<usize>,
+ source: impl std::fmt::Debug,
+) -> MahoutError {
+ match query_cuda_mem_info() {
+ Ok((free, total)) => {
+ if (requested_bytes as u64) > (free as u64) {
+ MahoutError::MemoryAllocation(build_oom_message(
+ context,
+ requested_bytes,
+ qubits,
+ free,
+ total,
+ ))
+ } else {
+ MahoutError::MemoryAllocation(format!(
+ "GPU allocation failed during {context}: requested {:.2}
MiB. CUDA error: {:?}",
+ bytes_to_mib(requested_bytes),
+ source,
+ ))
+ }
+ }
+ Err(e) => MahoutError::MemoryAllocation(format!(
+ "GPU allocation failed during {context}: requested {:.2} MiB.
Unable to fetch memory info: {:?}; CUDA error: {:?}",
+ bytes_to_mib(requested_bytes),
+ e,
+ source,
+ )),
+ }
+}
+
/// RAII wrapper for GPU memory buffer
/// Automatically frees GPU memory when dropped
pub struct GpuBufferRaw {
@@ -55,20 +164,29 @@ impl GpuStateVector {
/// Create GPU state vector for n qubits
/// Allocates 2^n complex numbers on GPU (freed on drop)
pub fn new(_device: &Arc<CudaDevice>, qubits: usize) -> Result<Self> {
- let _size_elements = 1 << qubits;
+ let _size_elements: usize = 1usize << qubits;
#[cfg(target_os = "linux")]
{
+ let requested_bytes = _size_elements
+ .checked_mul(std::mem::size_of::<CuDoubleComplex>())
+ .ok_or_else(|| MahoutError::MemoryAllocation(
+ format!("Requested GPU allocation size overflow
(elements={})", _size_elements)
+ ))?;
+
+ // Pre-flight check to gracefully fail before cudaMalloc when OOM is obvious
+ ensure_device_memory_available(requested_bytes, "state vector
allocation", Some(qubits))?;
+
// Use uninitialized allocation to avoid memory bandwidth waste.
// TODO: Consider using a memory pool for input buffers to avoid repeated
// cudaMalloc overhead in high-frequency encode() calls.
let slice = unsafe {
_device.alloc::<CuDoubleComplex>(_size_elements)
- }.map_err(|e| MahoutError::MemoryAllocation(
- format!("Failed to allocate {} bytes of GPU memory
(qubits={}): {:?}",
- _size_elements *
std::mem::size_of::<CuDoubleComplex>(),
- qubits,
- e)
+ }.map_err(|e| map_allocation_error(
+ requested_bytes,
+ "state vector allocation",
+ Some(qubits),
+ e,
))?;
Ok(Self {
diff --git a/qdp/qdp-core/src/gpu/pipeline.rs b/qdp/qdp-core/src/gpu/pipeline.rs
index fd9a5989d..3c5921c38 100644
--- a/qdp/qdp-core/src/gpu/pipeline.rs
+++ b/qdp/qdp-core/src/gpu/pipeline.rs
@@ -23,6 +23,8 @@ use std::sync::Arc;
use std::ffi::c_void;
use cudarc::driver::{CudaDevice, CudaSlice, DevicePtr, safe::CudaStream};
use crate::error::{MahoutError, Result};
+#[cfg(target_os = "linux")]
+use crate::gpu::memory::{ensure_device_memory_available, map_allocation_error};
/// Chunk processing callback for async pipeline
///
@@ -92,11 +94,17 @@ where
crate::profile_scope!("GPU::ChunkProcess");
+ let chunk_bytes = chunk.len() * std::mem::size_of::<f64>();
+ ensure_device_memory_available(chunk_bytes, "pipeline chunk buffer
allocation", None)?;
+
// Allocate temporary device buffer for this chunk
let input_chunk_dev = unsafe {
device.alloc::<f64>(chunk.len())
- }.map_err(|e| MahoutError::MemoryAllocation(
- format!("Failed to allocate chunk buffer: {:?}", e)
+ }.map_err(|e| map_allocation_error(
+ chunk_bytes,
+ "pipeline chunk buffer allocation",
+ None,
+ e,
))?;
// Async copy: host to device (non-blocking, on specified stream)