(mahout) branch dev-qdp updated: [QDP] Gracefully handles OOM (#688)

guanmingchiu Sat, 06 Dec 2025 07:45:19 -0800

This is an automated email from the ASF dual-hosted git repository.

guanmingchiu pushed a commit to branch dev-qdp
in repository https://gitbox.apache.org/repos/asf/mahout.git



The following commit(s) were added to refs/heads/dev-qdp by this push:
     new 5738b8dfb [QDP] Gracefully handles OOM (#688)
5738b8dfb is described below

commit 5738b8dfb3dbf5c50e9b1826a20c6135c1797c07
Author: Ping <[email protected]>
AuthorDate: Sat Dec 6 23:45:03 2025 +0800

    [QDP] Gracefully handles OOM (#688)
    
    * gracefully handles OOM
    
    Signed-off-by: 400Ping <[email protected]>
    
    * fix
    
    Signed-off-by: 400Ping <[email protected]>
    
    ---------
    
    Signed-off-by: 400Ping <[email protected]>
---
 qdp/qdp-core/src/gpu/encodings/amplitude.rs |  94 ++++++++++++--------
 qdp/qdp-core/src/gpu/memory.rs              | 130 ++++++++++++++++++++++++++--
 qdp/qdp-core/src/gpu/pipeline.rs            |  12 ++-
 3 files changed, 194 insertions(+), 42 deletions(-)

diff --git a/qdp/qdp-core/src/gpu/encodings/amplitude.rs b/qdp/qdp-core/src/gpu/encodings/amplitude.rs
index 38551d15c..9868a17bc 100644
--- a/qdp/qdp-core/src/gpu/encodings/amplitude.rs
+++ b/qdp/qdp-core/src/gpu/encodings/amplitude.rs
@@ -29,6 +29,8 @@ use std::ffi::c_void;
 use cudarc::driver::DevicePtr;
 #[cfg(target_os = "linux")]
 use qdp_kernels::launch_amplitude_encode;
+#[cfg(target_os = "linux")]
+use crate::gpu::memory::{ensure_device_memory_available, map_allocation_error};
 
 use crate::preprocessing::Preprocessor;
 
@@ -65,40 +67,56 @@ impl QuantumEncoder for AmplitudeEncoder {
 
             if host_data.len() < ASYNC_THRESHOLD {
                 // Synchronous path for small data (avoids stream overhead)
-            let input_slice = {
-                crate::profile_scope!("GPU::H2DCopy");
-                _device.htod_sync_copy(host_data)
-                    .map_err(|e| MahoutError::MemoryAllocation(format!("Failed to allocate input buffer: {:?}", e)))?
-            };
+                let input_bytes = host_data.len() * std::mem::size_of::<f64>();
+                ensure_device_memory_available(input_bytes, "input staging buffer", Some(num_qubits))?;
 
-            let ret = {
-                crate::profile_scope!("GPU::KernelLaunch");
-                unsafe {
-                    launch_amplitude_encode(
-                        *input_slice.device_ptr() as *const f64,
-                        state_vector.ptr() as *mut c_void,
+                let input_slice = {
+                    crate::profile_scope!("GPU::H2DCopy");
+                    _device.htod_sync_copy(host_data)
+                        .map_err(|e| map_allocation_error(
+                            input_bytes,
+                            "input staging buffer",
+                            Some(num_qubits),
+                            e,
+                        ))?
+                };
+
+                let ret = {
+                    crate::profile_scope!("GPU::KernelLaunch");
+                    unsafe {
+                        launch_amplitude_encode(
+                            *input_slice.device_ptr() as *const f64,
+                            state_vector.ptr() as *mut c_void,
                             host_data.len(),
                             state_len,
-                        norm,
-                        std::ptr::null_mut(), // default stream
-                    )
-                }
-            };
+                            norm,
+                            std::ptr::null_mut(), // default stream
+                        )
+                    }
+                };
 
-            if ret != 0 {
-                let error_msg = format!(
-                    "Kernel launch failed with CUDA error code: {} ({})",
-                    ret,
-                    cuda_error_to_string(ret)
-                );
-                return Err(MahoutError::KernelLaunch(error_msg));
-            }
+                if ret != 0 {
+                    let error_msg = if ret == 2 {
+                        format!(
+                            "Kernel launch reported cudaErrorMemoryAllocation (likely OOM) while encoding {} elements into 2^{} state.",
+                            host_data.len(),
+                            num_qubits,
+                        )
+                    } else {
+                        format!(
+                            "Kernel launch failed with CUDA error code: {} ({})",
+                            ret,
+                            cuda_error_to_string(ret)
+                        )
+                    };
+                    return Err(MahoutError::KernelLaunch(error_msg));
+                }
 
-            {
-                crate::profile_scope!("GPU::Synchronize");
-                _device
-                    .synchronize()
-                    .map_err(|e| MahoutError::Cuda(format!("CUDA device synchronize failed: {:?}", e)))?;
+                {
+                    crate::profile_scope!("GPU::Synchronize");
+                    _device
+                        .synchronize()
+                        .map_err(|e| MahoutError::Cuda(format!("CUDA device synchronize failed: {:?}", e)))?;
                 }
             } else {
                 // Async Pipeline path for large data
@@ -163,11 +181,19 @@ impl AmplitudeEncoder {
             };
 
             if ret != 0 {
-                let error_msg = format!(
-                    "Kernel launch failed with CUDA error code: {} ({})",
-                    ret,
-                    cuda_error_to_string(ret)
-                );
+                let error_msg = if ret == 2 {
+                    format!(
+                        "Kernel launch reported cudaErrorMemoryAllocation (likely OOM) while encoding chunk starting at offset {} (len={}).",
+                        chunk_offset,
+                        chunk_len
+                    )
+                } else {
+                    format!(
+                        "Kernel launch failed with CUDA error code: {} ({})",
+                        ret,
+                        cuda_error_to_string(ret)
+                    )
+                };
                 return Err(MahoutError::KernelLaunch(error_msg));
             }
 
diff --git a/qdp/qdp-core/src/gpu/memory.rs b/qdp/qdp-core/src/gpu/memory.rs
index 513c326c0..1ac8eabb5 100644
--- a/qdp/qdp-core/src/gpu/memory.rs
+++ b/qdp/qdp-core/src/gpu/memory.rs
@@ -19,6 +19,115 @@ use cudarc::driver::{CudaDevice, CudaSlice, DevicePtr};
 use qdp_kernels::CuDoubleComplex;
 use crate::error::{MahoutError, Result};
 
+#[cfg(target_os = "linux")]
+fn bytes_to_mib(bytes: usize) -> f64 {
+    bytes as f64 / (1024.0 * 1024.0)
+}
+
+#[cfg(target_os = "linux")]
+fn cuda_error_to_string(code: i32) -> &'static str {
+    match code {
+        0 => "cudaSuccess",
+        2 => "cudaErrorMemoryAllocation",
+        3 => "cudaErrorInitializationError",
+        30 => "cudaErrorUnknown",
+        _ => "Unknown CUDA error",
+    }
+}
+
+#[cfg(target_os = "linux")]
+fn query_cuda_mem_info() -> Result<(usize, usize)> {
+    unsafe {
+        unsafe extern "C" {
+            fn cudaMemGetInfo(free: *mut usize, total: *mut usize) -> i32;
+        }
+
+        let mut free_bytes: usize = 0;
+        let mut total_bytes: usize = 0;
+        let result = cudaMemGetInfo(&mut free_bytes as *mut usize, &mut total_bytes as *mut usize);
+
+        if result != 0 {
+            return Err(MahoutError::Cuda(format!(
+                "cudaMemGetInfo failed: {} ({})",
+                result,
+                cuda_error_to_string(result)
+            )));
+        }
+
+        Ok((free_bytes, total_bytes))
+    }
+}
+
+#[cfg(target_os = "linux")]
+fn build_oom_message(context: &str, requested_bytes: usize, qubits: Option<usize>, free: usize, total: usize) -> String {
+    let qubit_hint = qubits
+        .map(|q| format!(" (qubits={})", q))
+        .unwrap_or_default();
+
+    format!(
+        "GPU out of memory during {context}{qubit_hint}: requested {:.2} MiB, free {:.2} MiB / total {:.2} MiB. Reduce qubits or batch size and retry.",
+        bytes_to_mib(requested_bytes),
+        bytes_to_mib(free),
+        bytes_to_mib(total),
+    )
+}
+
+/// Guard that checks available GPU memory before attempting a large allocation.
+///
+/// Returns a MemoryAllocation error with a helpful message when the request
+/// exceeds the currently reported free memory.
+#[cfg(target_os = "linux")]
+pub(crate) fn ensure_device_memory_available(requested_bytes: usize, context: &str, qubits: Option<usize>) -> Result<()> {
+    let (free, total) = query_cuda_mem_info()?;
+
+    if (requested_bytes as u64) > (free as u64) {
+        return Err(MahoutError::MemoryAllocation(build_oom_message(
+            context,
+            requested_bytes,
+            qubits,
+            free,
+            total,
+        )));
+    }
+
+    Ok(())
+}
+
+/// Wraps CUDA allocation errors with an OOM-aware MahoutError.
+#[cfg(target_os = "linux")]
+pub(crate) fn map_allocation_error(
+    requested_bytes: usize,
+    context: &str,
+    qubits: Option<usize>,
+    source: impl std::fmt::Debug,
+) -> MahoutError {
+    match query_cuda_mem_info() {
+        Ok((free, total)) => {
+            if (requested_bytes as u64) > (free as u64) {
+                MahoutError::MemoryAllocation(build_oom_message(
+                    context,
+                    requested_bytes,
+                    qubits,
+                    free,
+                    total,
+                ))
+            } else {
+                MahoutError::MemoryAllocation(format!(
+                    "GPU allocation failed during {context}: requested {:.2} MiB. CUDA error: {:?}",
+                    bytes_to_mib(requested_bytes),
+                    source,
+                ))
+            }
+        }
+        Err(e) => MahoutError::MemoryAllocation(format!(
+            "GPU allocation failed during {context}: requested {:.2} MiB. Unable to fetch memory info: {:?}; CUDA error: {:?}",
+            bytes_to_mib(requested_bytes),
+            e,
+            source,
+        )),
+    }
+}
+
 /// RAII wrapper for GPU memory buffer
 /// Automatically frees GPU memory when dropped
 pub struct GpuBufferRaw {
@@ -55,20 +164,29 @@ impl GpuStateVector {
     /// Create GPU state vector for n qubits
     /// Allocates 2^n complex numbers on GPU (freed on drop)
     pub fn new(_device: &Arc<CudaDevice>, qubits: usize) -> Result<Self> {
-        let _size_elements = 1 << qubits;
+        let _size_elements: usize = 1usize << qubits;
 
         #[cfg(target_os = "linux")]
         {
+            let requested_bytes = _size_elements
+                .checked_mul(std::mem::size_of::<CuDoubleComplex>())
+                .ok_or_else(|| MahoutError::MemoryAllocation(
+                    format!("Requested GPU allocation size overflow (elements={})", _size_elements)
+                ))?;
+
+            // Pre-flight check to gracefully fail before cudaMalloc when OOM is obvious
+            ensure_device_memory_available(requested_bytes, "state vector allocation", Some(qubits))?;
+
             // Use uninitialized allocation to avoid memory bandwidth waste.
             // TODO: Consider using a memory pool for input buffers to avoid repeated
             // cudaMalloc overhead in high-frequency encode() calls.
             let slice = unsafe {
                 _device.alloc::<CuDoubleComplex>(_size_elements)
-            }.map_err(|e| MahoutError::MemoryAllocation(
-                format!("Failed to allocate {} bytes of GPU memory (qubits={}): {:?}",
-                        _size_elements * std::mem::size_of::<CuDoubleComplex>(),
-                        qubits,
-                        e)
+            }.map_err(|e| map_allocation_error(
+                requested_bytes,
+                "state vector allocation",
+                Some(qubits),
+                e,
             ))?;
 
             Ok(Self {
diff --git a/qdp/qdp-core/src/gpu/pipeline.rs b/qdp/qdp-core/src/gpu/pipeline.rs
index fd9a5989d..3c5921c38 100644
--- a/qdp/qdp-core/src/gpu/pipeline.rs
+++ b/qdp/qdp-core/src/gpu/pipeline.rs
@@ -23,6 +23,8 @@ use std::sync::Arc;
 use std::ffi::c_void;
 use cudarc::driver::{CudaDevice, CudaSlice, DevicePtr, safe::CudaStream};
 use crate::error::{MahoutError, Result};
+#[cfg(target_os = "linux")]
+use crate::gpu::memory::{ensure_device_memory_available, map_allocation_error};
 
 /// Chunk processing callback for async pipeline
 ///
@@ -92,11 +94,17 @@ where
 
         crate::profile_scope!("GPU::ChunkProcess");
 
+        let chunk_bytes = chunk.len() * std::mem::size_of::<f64>();
+        ensure_device_memory_available(chunk_bytes, "pipeline chunk buffer allocation", None)?;
+
         // Allocate temporary device buffer for this chunk
         let input_chunk_dev = unsafe {
             device.alloc::<f64>(chunk.len())
-        }.map_err(|e| MahoutError::MemoryAllocation(
-            format!("Failed to allocate chunk buffer: {:?}", e)
+        }.map_err(|e| map_allocation_error(
+            chunk_bytes,
+            "pipeline chunk buffer allocation",
+            None,
+            e,
         ))?;
 
         // Async copy: host to device (non-blocking, on specified stream)

(mahout) branch dev-qdp updated: [QDP] Gracefully handles OOM (#688)

Reply via email to