(mahout) branch main updated: [QDP] Add zero-copy amplitude encoding from float32 GPU tensors (#999)

guanmingchiu Tue, 03 Feb 2026 23:34:48 -0800

This is an automated email from the ASF dual-hosted git repository.

guanmingchiu pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/mahout.git



The following commit(s) were added to refs/heads/main by this push:
     new 42da30d42 [QDP] Add zero-copy amplitude encoding from float32 GPU tensors (#999)
42da30d42 is described below

commit 42da30d4274e08ead40f33d4650a99aab17e7b64
Author: Vic Wen <[email protected]>
AuthorDate: Wed Feb 4 15:34:35 2026 +0800

    [QDP] Add zero-copy amplitude encoding from float32 GPU tensors (#999)
    
    * feat: add float32 GPU pointer encoding and inverse norm calculation with stream support
    
    * refactor: streamline GPU state vector encoding to support precision conversion for both Float32 and Float64
    
    * test: add test file for GPU pointer encoding with Float32 precision
    
    * refactor: improve GPU pointer validation and update documentation for encoding methods
    
    * test: update unsupported encoding test to reflect changes in CUDA tensor encoding methods
    
    * test: add unit test for handling null pointer in GPU pointer encoding for Float32
---
 qdp/qdp-core/src/gpu/encodings/amplitude.rs |  26 ++-
 qdp/qdp-core/src/lib.rs                     | 191 ++++++++++++++++++
 qdp/qdp-core/tests/common/mod.rs            |   8 +-
 qdp/qdp-core/tests/gpu_ptr_encoding.rs      | 300 ++++++++++++++++++++++++++--
 testing/qdp/test_bindings.py                |  53 +----
 5 files changed, 512 insertions(+), 66 deletions(-)

diff --git a/qdp/qdp-core/src/gpu/encodings/amplitude.rs b/qdp/qdp-core/src/gpu/encodings/amplitude.rs
index 037b3bd31..85259e18a 100644
--- a/qdp/qdp-core/src/gpu/encodings/amplitude.rs
+++ b/qdp/qdp-core/src/gpu/encodings/amplitude.rs
@@ -510,6 +510,28 @@ impl AmplitudeEncoder {
         device: &Arc<CudaDevice>,
         input_ptr: *const f32,
         len: usize,
+    ) -> Result<f32> {
+        unsafe {
+            Self::calculate_inv_norm_gpu_f32_with_stream(
+                device,
+                input_ptr,
+                len,
+                std::ptr::null_mut(),
+            )
+        }
+    }
+
+    /// Compute inverse L2 norm on GPU for float32 input on a given stream.
+    ///
+    /// # Safety
+    /// The caller must ensure `input_ptr` points to valid GPU memory containing
+    /// at least `len` f32 elements on the same device as `device`.
+    #[cfg(target_os = "linux")]
+    pub unsafe fn calculate_inv_norm_gpu_f32_with_stream(
+        device: &Arc<CudaDevice>,
+        input_ptr: *const f32,
+        len: usize,
+        stream: *mut c_void,
     ) -> Result<f32> {
         crate::profile_scope!("GPU::NormSingleF32");
 
@@ -522,7 +544,7 @@ impl AmplitudeEncoder {
                 input_ptr,
                 len,
                 *norm_buffer.device_ptr_mut() as *mut f32,
-                std::ptr::null_mut(), // default stream
+                stream,
             )
         };
 
@@ -534,6 +556,8 @@ impl AmplitudeEncoder {
             )));
         }
 
+        sync_cuda_stream(stream, "Norm stream synchronize failed (f32)")?;
+
         let inv_norm_host = device
             .dtoh_sync_copy(&norm_buffer)
             .map_err(|e| MahoutError::Cuda(format!("Failed to copy f32 norm to host: {:?}", e)))?;
diff --git a/qdp/qdp-core/src/lib.rs b/qdp/qdp-core/src/lib.rs
index 1fe172d1b..bf813c470 100644
--- a/qdp/qdp-core/src/lib.rs
+++ b/qdp/qdp-core/src/lib.rs
@@ -45,12 +45,63 @@ pub use pipeline_runner::{
     run_throughput_pipeline,
 };
 
+use std::ffi::c_void;
 use std::sync::Arc;
 
 use crate::dlpack::DLManagedTensor;
 use crate::gpu::get_encoder;
 use cudarc::driver::CudaDevice;
 
+#[cfg(target_os = "linux")]
+fn validate_cuda_input_ptr(device: &CudaDevice, ptr: *const c_void) -> 
Result<()> {
+    use crate::gpu::cuda_ffi::{
+        CUDA_MEMORY_TYPE_DEVICE, CUDA_MEMORY_TYPE_MANAGED, 
CudaPointerAttributes,
+        cudaPointerGetAttributes,
+    };
+
+    if ptr.is_null() {
+        return Err(MahoutError::InvalidInput(
+            "Input GPU pointer is null".to_string(),
+        ));
+    }
+
+    let mut attrs = CudaPointerAttributes {
+        memory_type: 0,
+        device: 0,
+        device_pointer: std::ptr::null_mut(),
+        host_pointer: std::ptr::null_mut(),
+        is_managed: 0,
+        allocation_flags: 0,
+    };
+
+    let ret = unsafe { cudaPointerGetAttributes(&mut attrs as *mut _, ptr) };
+    if ret != 0 {
+        return Err(MahoutError::InvalidInput(format!(
+            "cudaPointerGetAttributes failed for input pointer: {} ({})",
+            ret,
+            cuda_error_to_string(ret)
+        )));
+    }
+
+    if attrs.memory_type != CUDA_MEMORY_TYPE_DEVICE && attrs.memory_type != 
CUDA_MEMORY_TYPE_MANAGED
+    {
+        return Err(MahoutError::InvalidInput(format!(
+            "Input pointer is not CUDA device memory (memory_type={})",
+            attrs.memory_type
+        )));
+    }
+
+    let device_ordinal = device.ordinal() as i32;
+    if attrs.device >= 0 && attrs.device != device_ordinal {
+        return Err(MahoutError::InvalidInput(format!(
+            "Input pointer device mismatch: pointer on cuda:{}, engine on cuda:{}",
+            attrs.device, device_ordinal
+        )));
+    }
+
+    Ok(())
+}
+
 /// Main entry point for Mahout QDP
 ///
 /// Manages GPU context and dispatches encoding tasks.
@@ -418,6 +469,14 @@ impl QdpEngine {
     ) -> Result<*mut DLManagedTensor> {
         crate::profile_scope!("Mahout::EncodeFromGpuPtr");
 
+        if input_len == 0 {
+            return Err(MahoutError::InvalidInput(
+                "Input data cannot be empty".into(),
+            ));
+        }
+
+        validate_cuda_input_ptr(&self.device, input_d)?;
+
         let state_len = 1usize << num_qubits;
         let method = encoding_method.to_ascii_lowercase();
 
@@ -600,6 +659,130 @@ impl QdpEngine {
         }
     }
 
+    /// Encode from existing GPU pointer (float32 input, amplitude encoding 
only)
+    ///
+    /// Zero-copy encoding from PyTorch CUDA float32 tensors. Uses the default 
CUDA stream.
+    /// For stream interop use `encode_from_gpu_ptr_f32_with_stream`.
+    ///
+    /// # Arguments
+    /// * `input_d` - Device pointer to input data (f32 array on GPU)
+    /// * `input_len` - Number of f32 elements in the input
+    /// * `num_qubits` - Number of qubits for encoding
+    ///
+    /// # Returns
+    /// DLPack pointer (state vector in engine precision) for zero-copy 
PyTorch integration.
+    /// Internal computation is f32; output is converted to [`Precision`] of 
the engine.
+    ///
+    /// # Safety
+    /// The input pointer must:
+    /// - Point to valid GPU memory on the same device as the engine
+    /// - Contain at least `input_len` f32 elements
+    /// - Remain valid for the duration of this call
+    #[cfg(target_os = "linux")]
+    pub unsafe fn encode_from_gpu_ptr_f32(
+        &self,
+        input_d: *const f32,
+        input_len: usize,
+        num_qubits: usize,
+    ) -> Result<*mut DLManagedTensor> {
+        unsafe {
+            self.encode_from_gpu_ptr_f32_with_stream(
+                input_d,
+                input_len,
+                num_qubits,
+                std::ptr::null_mut(),
+            )
+        }
+    }
+
+    /// Encode from existing GPU pointer (float32) on a specified CUDA stream.
+    ///
+    /// # Returns
+    /// DLPack pointer (state vector in engine precision). Pass null for 
`stream` to use the default stream.
+    ///
+    /// # Safety
+    /// In addition to the `encode_from_gpu_ptr_f32` requirements, the stream 
pointer
+    /// must remain valid for the duration of this call.
+    #[cfg(target_os = "linux")]
+    pub unsafe fn encode_from_gpu_ptr_f32_with_stream(
+        &self,
+        input_d: *const f32,
+        input_len: usize,
+        num_qubits: usize,
+        stream: *mut c_void,
+    ) -> Result<*mut DLManagedTensor> {
+        crate::profile_scope!("Mahout::EncodeFromGpuPtrF32");
+
+        if input_len == 0 {
+            return Err(MahoutError::InvalidInput(
+                "Input data cannot be empty".into(),
+            ));
+        }
+
+        validate_cuda_input_ptr(&self.device, input_d as *const c_void)?;
+
+        let state_len = 1usize << num_qubits;
+        if input_len > state_len {
+            return Err(MahoutError::InvalidInput(format!(
+                "Input size {} exceeds state vector size {} (2^{} qubits)",
+                input_len, state_len, num_qubits
+            )));
+        }
+
+        let state_vector = {
+            crate::profile_scope!("GPU::Alloc");
+            gpu::GpuStateVector::new(&self.device, num_qubits, 
Precision::Float32)?
+        };
+
+        let inv_norm = {
+            crate::profile_scope!("GPU::NormFromPtr");
+            unsafe {
+                gpu::AmplitudeEncoder::calculate_inv_norm_gpu_f32_with_stream(
+                    &self.device,
+                    input_d,
+                    input_len,
+                    stream,
+                )?
+            }
+        };
+
+        let state_ptr = state_vector.ptr_f32().ok_or_else(|| {
+            MahoutError::InvalidInput(
+                "State vector precision mismatch (expected float32 buffer)".to_string(),
+            )
+        })?;
+
+        {
+            crate::profile_scope!("GPU::KernelLaunch");
+            let ret = unsafe {
+                qdp_kernels::launch_amplitude_encode_f32(
+                    input_d,
+                    state_ptr as *mut std::ffi::c_void,
+                    input_len,
+                    state_len,
+                    inv_norm,
+                    stream,
+                )
+            };
+
+            if ret != 0 {
+                return Err(MahoutError::KernelLaunch(format!(
+                    "Amplitude encode (f32) kernel failed with CUDA error code: {} ({})",
+                    ret,
+                    cuda_error_to_string(ret)
+                )));
+            }
+        }
+
+        {
+            crate::profile_scope!("GPU::Synchronize");
+            gpu::cuda_sync::sync_cuda_stream(stream, "CUDA stream synchronize failed")?;
+        }
+
+        let state_vector = state_vector.to_precision(&self.device, 
self.precision)?;
+        Ok(state_vector.to_dlpack())
+    }
+
     /// Encode batch from existing GPU pointer (zero-copy for CUDA tensors)
     ///
     /// This method enables zero-copy batch encoding from PyTorch CUDA tensors.
@@ -671,6 +854,14 @@ impl QdpEngine {
             ));
         }
 
+        if sample_size == 0 {
+            return Err(MahoutError::InvalidInput(
+                "Sample size cannot be zero".into(),
+            ));
+        }
+
+        validate_cuda_input_ptr(&self.device, input_batch_d)?;
+
         match method.as_str() {
             "amplitude" => {
                 if sample_size == 0 {
diff --git a/qdp/qdp-core/tests/common/mod.rs b/qdp/qdp-core/tests/common/mod.rs
index 9afb31e40..25e43c262 100644
--- a/qdp/qdp-core/tests/common/mod.rs
+++ b/qdp/qdp-core/tests/common/mod.rs
@@ -14,8 +14,14 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-/// Creates normalized test data
+/// Creates normalized test data (f64)
 #[allow(dead_code)] // Used by multiple test modules
 pub fn create_test_data(size: usize) -> Vec<f64> {
     (0..size).map(|i| (i as f64) / (size as f64)).collect()
 }
+
+/// Creates normalized test data (f32)
+#[allow(dead_code)]
+pub fn create_test_data_f32(size: usize) -> Vec<f32> {
+    (0..size).map(|i| (i as f32) / (size as f32)).collect()
+}
diff --git a/qdp/qdp-core/tests/gpu_ptr_encoding.rs b/qdp/qdp-core/tests/gpu_ptr_encoding.rs
index 7851eb1e1..e9f23da34 100644
--- a/qdp/qdp-core/tests/gpu_ptr_encoding.rs
+++ b/qdp/qdp-core/tests/gpu_ptr_encoding.rs
@@ -19,12 +19,39 @@
 #![cfg(target_os = "linux")]
 
 use std::ffi::c_void;
+use std::sync::Arc;
 
-use cudarc::driver::{CudaDevice, DevicePtr};
-use qdp_core::{MahoutError, QdpEngine};
+use cudarc::driver::{CudaDevice, CudaSlice, DevicePtr, DeviceSlice};
+use qdp_core::{MahoutError, Precision, QdpEngine};
 
 mod common;
 
+// ---- Helpers for f32 encode_from_gpu_ptr_f32 tests ----
+
+fn engine_f32() -> Option<QdpEngine> {
+    QdpEngine::new_with_precision(0, Precision::Float32).ok()
+}
+
+fn device_and_f32_slice(data: &[f32]) -> Option<(Arc<CudaDevice>, 
CudaSlice<f32>)> {
+    let device = CudaDevice::new(0).ok()?;
+    let slice = device.htod_sync_copy(data).ok()?;
+    Some((device, slice))
+}
+
+fn assert_dlpack_shape_2_4_and_delete(dlpack_ptr: *mut 
qdp_core::dlpack::DLManagedTensor) {
+    assert!(!dlpack_ptr.is_null());
+    unsafe {
+        let tensor = &(*dlpack_ptr).dl_tensor;
+        assert_eq!(tensor.ndim, 2);
+        let shape = std::slice::from_raw_parts(tensor.shape, 2);
+        assert_eq!(shape[0], 1);
+        assert_eq!(shape[1], 4);
+        if let Some(deleter) = (*dlpack_ptr).deleter {
+            deleter(dlpack_ptr);
+        }
+    }
+}
+
 // ---- Validation / error-path tests (return before using pointer) ----
 
 #[test]
@@ -34,7 +61,19 @@ fn test_encode_from_gpu_ptr_unknown_method() {
         Err(_) => return,
     };
 
-    let result = unsafe { engine.encode_from_gpu_ptr(std::ptr::null(), 4, 2, 
"unknown_encoding") };
+    // Need valid GPU pointer so we reach method dispatch (validation runs 
first)
+    let device = match CudaDevice::new(0) {
+        Ok(d) => d,
+        Err(_) => return,
+    };
+    let data = common::create_test_data(4);
+    let data_d = match device.htod_sync_copy(data.as_slice()) {
+        Ok(b) => b,
+        Err(_) => return,
+    };
+    let ptr = *data_d.device_ptr() as *const f64 as *const c_void;
+
+    let result = unsafe { engine.encode_from_gpu_ptr(ptr, 4, 2, 
"unknown_encoding") };
 
     assert!(result.is_err());
     match result {
@@ -70,8 +109,20 @@ fn test_encode_from_gpu_ptr_amplitude_input_exceeds_state() 
{
         Err(_) => return,
     };
 
+    // Need valid GPU pointer so we reach input_len > state_len check 
(validation runs first)
+    let device = match CudaDevice::new(0) {
+        Ok(d) => d,
+        Err(_) => return,
+    };
+    let data = common::create_test_data(10);
+    let data_d = match device.htod_sync_copy(data.as_slice()) {
+        Ok(b) => b,
+        Err(_) => return,
+    };
+    let ptr = *data_d.device_ptr() as *const f64 as *const c_void;
+
     // 2 qubits -> state_len = 4; request input_len = 10
-    let result = unsafe { engine.encode_from_gpu_ptr(std::ptr::null(), 10, 2, 
"amplitude") };
+    let result = unsafe { engine.encode_from_gpu_ptr(ptr, 10, 2, "amplitude") 
};
 
     assert!(result.is_err());
     match result {
@@ -89,8 +140,19 @@ fn test_encode_batch_from_gpu_ptr_unknown_method() {
         Err(_) => return,
     };
 
-    let result =
-        unsafe { engine.encode_batch_from_gpu_ptr(std::ptr::null(), 2, 4, 2, 
"unknown_method") };
+    // Need valid GPU pointer so we reach method dispatch (validation runs 
first)
+    let device = match CudaDevice::new(0) {
+        Ok(d) => d,
+        Err(_) => return,
+    };
+    let data = common::create_test_data(8);
+    let data_d = match device.htod_sync_copy(data.as_slice()) {
+        Ok(b) => b,
+        Err(_) => return,
+    };
+    let ptr = *data_d.device_ptr() as *const f64 as *const c_void;
+
+    let result = unsafe { engine.encode_batch_from_gpu_ptr(ptr, 2, 4, 2, 
"unknown_method") };
 
     assert!(result.is_err());
     match result {
@@ -127,17 +189,34 @@ fn test_encode_from_gpu_ptr_basis_input_len_not_one() {
         Err(_) => return,
     };
 
-    // Basis single encoding expects exactly 1 value; input_len != 1 must 
return error.
-    let result = unsafe { engine.encode_from_gpu_ptr(std::ptr::null(), 0, 2, 
"basis") };
+    // Need valid GPU pointer so we reach basis input_len checks (validation 
runs first)
+    let device = match CudaDevice::new(0) {
+        Ok(d) => d,
+        Err(_) => return,
+    };
+    let indices: Vec<usize> = vec![0, 1, 2];
+    let indices_d = match device.htod_sync_copy(indices.as_slice()) {
+        Ok(b) => b,
+        Err(_) => return,
+    };
+    let ptr = *indices_d.device_ptr() as *const usize as *const c_void;
+
+    // Basis single encoding expects exactly 1 value; input_len == 0 returns 
empty error.
+    let result = unsafe { engine.encode_from_gpu_ptr(ptr, 0, 2, "basis") };
     assert!(result.is_err());
     match result {
         Err(MahoutError::InvalidInput(msg)) => {
-            assert!(msg.contains("exactly 1") || msg.contains("basis"));
+            assert!(
+                msg.contains("exactly 1") || msg.contains("basis") || 
msg.contains("empty"),
+                "expected exactly 1 / basis / empty, got: {}",
+                msg
+            );
         }
         _ => panic!("expected InvalidInput for input_len != 1, got {:?}", 
result),
     }
 
-    let result = unsafe { engine.encode_from_gpu_ptr(std::ptr::null(), 3, 2, 
"basis") };
+    // input_len == 3 (basis expects 1)
+    let result = unsafe { engine.encode_from_gpu_ptr(ptr, 3, 2, "basis") };
     assert!(result.is_err());
     match result {
         Err(MahoutError::InvalidInput(msg)) => {
@@ -154,8 +233,20 @@ fn 
test_encode_batch_from_gpu_ptr_basis_sample_size_not_one() {
         Err(_) => return,
     };
 
-    // Basis batch expects sample_size == 1 (one index per sample).
-    let result = unsafe { engine.encode_batch_from_gpu_ptr(std::ptr::null(), 
2, 4, 2, "basis") };
+    // Need valid GPU pointer so we reach basis sample_size check (validation 
runs first)
+    let device = match CudaDevice::new(0) {
+        Ok(d) => d,
+        Err(_) => return,
+    };
+    let indices: Vec<usize> = vec![0, 1];
+    let indices_d = match device.htod_sync_copy(indices.as_slice()) {
+        Ok(b) => b,
+        Err(_) => return,
+    };
+    let ptr = *indices_d.device_ptr() as *const usize as *const c_void;
+
+    // Basis batch expects sample_size == 1 (one index per sample); 
sample_size=4.
+    let result = unsafe { engine.encode_batch_from_gpu_ptr(ptr, 2, 4, 2, 
"basis") };
     assert!(result.is_err());
     match result {
         Err(MahoutError::InvalidInput(msg)) => {
@@ -326,7 +417,8 @@ fn test_encode_batch_from_gpu_ptr_amplitude_success() {
 
 #[test]
 fn test_encode_from_gpu_ptr_basis_success() {
-    let engine = match QdpEngine::new(0) {
+    // Basis path uses ptr_f64(); engine must be Float64
+    let engine = match QdpEngine::new_with_precision(0, Precision::Float64) {
         Ok(e) => e,
         Err(_) => {
             println!("SKIP: No GPU available");
@@ -377,7 +469,8 @@ fn test_encode_from_gpu_ptr_basis_success() {
 
 #[test]
 fn test_encode_batch_from_gpu_ptr_basis_success() {
-    let engine = match QdpEngine::new(0) {
+    // Basis path uses ptr_f64(); engine must be Float64
+    let engine = match QdpEngine::new_with_precision(0, Precision::Float64) {
         Ok(e) => e,
         Err(_) => {
             println!("SKIP: No GPU available");
@@ -423,3 +516,182 @@ fn test_encode_batch_from_gpu_ptr_basis_success() {
         deleter(dlpack_ptr);
     }
 }
+
+// ---- encode_from_gpu_ptr_f32 (float32 amplitude) ----
+
+#[test]
+fn test_encode_from_gpu_ptr_f32_success() {
+    let engine = match engine_f32() {
+        Some(e) => e,
+        None => {
+            println!("SKIP: No GPU");
+            return;
+        }
+    };
+    let (_device, input_d) = match device_and_f32_slice(&[1.0, 0.0, 0.0, 0.0]) 
{
+        Some(t) => t,
+        None => {
+            println!("SKIP: No CUDA device");
+            return;
+        }
+    };
+    let ptr = *input_d.device_ptr() as *const f32;
+    let dlpack_ptr = unsafe {
+        engine
+            .encode_from_gpu_ptr_f32(ptr, input_d.len(), 2)
+            .expect("encode_from_gpu_ptr_f32")
+    };
+    assert_dlpack_shape_2_4_and_delete(dlpack_ptr);
+}
+
+#[test]
+fn test_encode_from_gpu_ptr_f32_with_stream_success() {
+    let engine = match engine_f32() {
+        Some(e) => e,
+        None => {
+            println!("SKIP: No GPU");
+            return;
+        }
+    };
+    let (_device, input_d) = match device_and_f32_slice(&[1.0, 0.0, 0.0, 0.0]) 
{
+        Some(t) => t,
+        None => {
+            println!("SKIP: No CUDA device");
+            return;
+        }
+    };
+    let ptr = *input_d.device_ptr() as *const f32;
+    let dlpack_ptr = unsafe {
+        engine.encode_from_gpu_ptr_f32_with_stream(ptr, input_d.len(), 2, 
std::ptr::null_mut())
+    }
+    .expect("encode_from_gpu_ptr_f32_with_stream");
+    assert_dlpack_shape_2_4_and_delete(dlpack_ptr);
+}
+
+#[test]
+fn test_encode_from_gpu_ptr_f32_with_stream_non_default_success() {
+    let engine = match engine_f32() {
+        Some(e) => e,
+        None => {
+            println!("SKIP: No GPU");
+            return;
+        }
+    };
+    let (device, input_d) = match device_and_f32_slice(&[1.0, 0.0, 0.0, 0.0]) {
+        Some(t) => t,
+        None => {
+            println!("SKIP: No CUDA device");
+            return;
+        }
+    };
+    let stream = device.fork_default_stream().expect("fork_default_stream");
+    let dlpack_ptr = unsafe {
+        engine
+            .encode_from_gpu_ptr_f32_with_stream(
+                *input_d.device_ptr() as *const f32,
+                input_d.len(),
+                2,
+                stream.stream as *mut c_void,
+            )
+            .expect("encode_from_gpu_ptr_f32_with_stream (non-default stream)")
+    };
+    assert_dlpack_shape_2_4_and_delete(dlpack_ptr);
+}
+
+#[test]
+fn test_encode_from_gpu_ptr_f32_success_f64_engine() {
+    let engine = match QdpEngine::new_with_precision(0, 
Precision::Float64).ok() {
+        Some(e) => e,
+        None => {
+            println!("SKIP: No GPU");
+            return;
+        }
+    };
+    let (_device, input_d) = match device_and_f32_slice(&[1.0, 0.0, 0.0, 0.0]) 
{
+        Some(t) => t,
+        None => {
+            println!("SKIP: No CUDA device");
+            return;
+        }
+    };
+    let ptr = *input_d.device_ptr() as *const f32;
+    let dlpack_ptr = unsafe {
+        engine
+            .encode_from_gpu_ptr_f32(ptr, input_d.len(), 2)
+            .expect("encode_from_gpu_ptr_f32 (Float64 engine)")
+    };
+    assert_dlpack_shape_2_4_and_delete(dlpack_ptr);
+}
+
+#[test]
+fn test_encode_from_gpu_ptr_f32_empty_input() {
+    let engine = match engine_f32() {
+        Some(e) => e,
+        None => {
+            println!("SKIP: No GPU");
+            return;
+        }
+    };
+    let (_device, input_d) = match device_and_f32_slice(&[1.0]) {
+        Some(t) => t,
+        None => {
+            println!("SKIP: No CUDA device");
+            return;
+        }
+    };
+    let ptr = *input_d.device_ptr() as *const f32;
+    let result = unsafe { engine.encode_from_gpu_ptr_f32(ptr, 0, 2) };
+    assert!(result.is_err());
+    match &result.unwrap_err() {
+        MahoutError::InvalidInput(msg) => assert!(msg.contains("empty")),
+        e => panic!("Expected InvalidInput, got {:?}", e),
+    }
+}
+
+#[test]
+fn test_encode_from_gpu_ptr_f32_null_pointer() {
+    let engine = match engine_f32() {
+        Some(e) => e,
+        None => {
+            println!("SKIP: No GPU");
+            return;
+        }
+    };
+    let result = unsafe { engine.encode_from_gpu_ptr_f32(std::ptr::null(), 4, 
2) };
+    assert!(result.is_err());
+    match &result.unwrap_err() {
+        MahoutError::InvalidInput(msg) => assert!(msg.contains("null")),
+        e => panic!("Expected InvalidInput, got {:?}", e),
+    }
+}
+
+#[test]
+fn test_encode_from_gpu_ptr_f32_input_exceeds_state_len() {
+    let engine = match engine_f32() {
+        Some(e) => e,
+        None => {
+            println!("SKIP: No GPU");
+            return;
+        }
+    };
+    let (_device, input_d) = match device_and_f32_slice(&[1.0, 0.0, 0.0, 0.0, 
0.0]) {
+        Some(t) => t,
+        None => {
+            println!("SKIP: No CUDA device");
+            return;
+        }
+    };
+    let ptr = *input_d.device_ptr() as *const f32;
+    let result = unsafe { engine.encode_from_gpu_ptr_f32(ptr, input_d.len(), 
2) };
+    assert!(result.is_err());
+    match &result.unwrap_err() {
+        MahoutError::InvalidInput(msg) => {
+            assert!(
+                msg.contains("exceeds") || msg.contains("state vector"),
+                "expected 'exceeds' or 'state vector', got: {}",
+                msg
+            );
+        }
+        e => panic!("Expected InvalidInput, got {:?}", e),
+    }
+}
diff --git a/testing/qdp/test_bindings.py b/testing/qdp/test_bindings.py
index a0f043456..fe6b07368 100644
--- a/testing/qdp/test_bindings.py
+++ b/testing/qdp/test_bindings.py
@@ -25,8 +25,6 @@ from .qdp_test_utils import requires_qdp
 def _has_multi_gpu():
     """Check if multiple GPUs are available via PyTorch."""
     try:
-        import torch
-
         return torch.cuda.is_available() and torch.cuda.device_count() >= 2
     except ImportError:
         return False
@@ -81,7 +79,6 @@ def test_dlpack_device():
 def test_dlpack_device_id_non_zero():
     """Test device_id propagation for non-zero devices (requires multi-GPU)."""
     pytest.importorskip("torch")
-    import torch
     from _qdp import QdpEngine
 
     # Test with device_id=1 (second GPU)
@@ -108,7 +105,6 @@ def test_dlpack_device_id_non_zero():
 @pytest.mark.gpu
 def test_dlpack_single_use():
     """Test that __dlpack__ can only be called once (requires GPU)."""
-    import torch
     from _qdp import QdpEngine
 
     engine = QdpEngine(0)
@@ -130,7 +126,6 @@ def test_dlpack_single_use():
 @pytest.mark.parametrize("stream", [1, 2], ids=["stream_legacy", 
"stream_per_thread"])
 def test_dlpack_with_stream(stream):
     """Test __dlpack__(stream=...) syncs CUDA stream before returning capsule 
(DLPack 0.8+)."""
-    import torch
     from _qdp import QdpEngine
 
     engine = QdpEngine(0)
@@ -149,7 +144,6 @@ def test_dlpack_with_stream(stream):
 def test_pytorch_integration():
     """Test PyTorch integration via DLPack (requires GPU and PyTorch)."""
     pytest.importorskip("torch")
-    import torch
     from _qdp import QdpEngine
 
     engine = QdpEngine(0)
@@ -178,7 +172,6 @@ def test_pytorch_integration():
 def test_precision(precision, expected_dtype):
     """Test different precision settings produce correct output dtypes."""
     pytest.importorskip("torch")
-    import torch
     from _qdp import QdpEngine
 
     engine = QdpEngine(0, precision=precision)
@@ -207,7 +200,6 @@ def test_precision(precision, expected_dtype):
 def test_encode_tensor_cpu(data_shape, expected_shape):
     """Test encoding from CPU PyTorch tensor (1D or 2D, zero-copy)."""
     pytest.importorskip("torch")
-    import torch
     from _qdp import QdpEngine
 
     if not torch.cuda.is_available():
@@ -233,7 +225,6 @@ def test_encode_from_tensorflow_binding():
     pytest.importorskip("torch")
     tf = pytest.importorskip("tensorflow")
     import numpy as np
-    import torch
     from _qdp import QdpEngine
     import os
     import tempfile
@@ -267,7 +258,6 @@ def test_encode_from_tensorflow_binding():
 def test_encode_errors():
     """Test error handling for unified encode method."""
     pytest.importorskip("torch")
-    import torch
     from _qdp import QdpEngine
 
     if not torch.cuda.is_available():
@@ -300,7 +290,6 @@ def test_encode_errors():
 def test_encode_cuda_tensor(data_shape, expected_shape, expected_batch_size):
     """Test encoding from CUDA tensor (1D or 2D, zero-copy)."""
     pytest.importorskip("torch")
-    import torch
     from _qdp import QdpEngine
 
     if not torch.cuda.is_available():
@@ -328,7 +317,6 @@ def test_encode_cuda_tensor(data_shape, expected_shape, 
expected_batch_size):
 def test_encode_cuda_tensor_wrong_dtype():
     """Test error when CUDA tensor has wrong dtype (non-float64)."""
     pytest.importorskip("torch")
-    import torch
     from _qdp import QdpEngine
 
     if not torch.cuda.is_available():
@@ -347,7 +335,6 @@ def test_encode_cuda_tensor_wrong_dtype():
 def test_encode_cuda_tensor_non_contiguous():
     """Test error when CUDA tensor is non-contiguous."""
     pytest.importorskip("torch")
-    import torch
     from _qdp import QdpEngine
 
     if not torch.cuda.is_available():
@@ -373,7 +360,6 @@ def test_encode_cuda_tensor_non_contiguous():
 def test_encode_cuda_tensor_device_mismatch():
     """Test error when CUDA tensor is on wrong device (multi-GPU only)."""
     pytest.importorskip("torch")
-    import torch
     from _qdp import QdpEngine
 
     # Engine on device 0
@@ -390,7 +376,6 @@ def test_encode_cuda_tensor_device_mismatch():
 def test_encode_cuda_tensor_empty():
     """Test error when CUDA tensor is empty."""
     pytest.importorskip("torch")
-    import torch
     from _qdp import QdpEngine
 
     if not torch.cuda.is_available():
@@ -416,7 +401,6 @@ def test_encode_cuda_tensor_empty():
 def test_encode_cuda_tensor_preserves_input(data_shape, is_batch):
     """Test that input CUDA tensor (1D or 2D) is not modified after 
encoding."""
     pytest.importorskip("torch")
-    import torch
     from _qdp import QdpEngine
 
     if not torch.cuda.is_available():
@@ -438,11 +422,10 @@ def test_encode_cuda_tensor_preserves_input(data_shape, 
is_batch):
 
 @requires_qdp
 @pytest.mark.gpu
-@pytest.mark.parametrize("encoding_method", ["basis"])
+@pytest.mark.parametrize("encoding_method", ["iqp"])
 def test_encode_cuda_tensor_unsupported_encoding(encoding_method):
-    """Test error when using CUDA tensor with unsupported encoding (CUDA 
supports amplitude and angle only)."""
+    """Test error when using CUDA tensor with unsupported encoding (CUDA 
supports amplitude, angle, and basis only)."""
     pytest.importorskip("torch")
-    import torch
     from _qdp import QdpEngine
 
     if not torch.cuda.is_available():
@@ -450,13 +433,11 @@ def 
test_encode_cuda_tensor_unsupported_encoding(encoding_method):
 
     engine = QdpEngine(0)
 
-    # CUDA tensors currently only support amplitude encoding
-    # Use non-zero data to avoid normalization issues
     data = torch.tensor([1.0, 0.0, 0.0, 0.0], dtype=torch.float64, 
device="cuda:0")
 
     with pytest.raises(
         RuntimeError,
-        match="only supports 'amplitude' and 'angle' methods.*Use 
tensor.cpu\\(\\)",
+        match="only supports 'amplitude', 'angle', or 'basis' methods.*Use 
tensor.cpu\\(\\)",
     ):
         engine.encode(data, 2, encoding_method)
 
@@ -474,7 +455,6 @@ def 
test_encode_cuda_tensor_unsupported_encoding(encoding_method):
 def test_encode_3d_rejected(input_type, error_match):
     """Test error when input has 3+ dimensions (CUDA tensor, CPU tensor, or 
NumPy array)."""
     pytest.importorskip("torch")
-    import torch
     import numpy as np
     from _qdp import QdpEngine
 
@@ -520,7 +500,6 @@ def test_encode_3d_rejected(input_type, error_match):
 def test_encode_cuda_tensor_non_finite_values(tensor_factory, description):
     """Test error when CUDA tensor contains non-finite values (zeros, NaN, 
Inf)."""
     pytest.importorskip("torch")
-    import torch
     from _qdp import QdpEngine
 
     if not torch.cuda.is_available():
@@ -545,7 +524,6 @@ def 
test_encode_cuda_tensor_non_finite_values(tensor_factory, description):
 def test_encode_cuda_tensor_output_dtype(precision, expected_dtype):
     """Test that CUDA tensor encoding produces correct output dtype."""
     pytest.importorskip("torch")
-    import torch
     from _qdp import QdpEngine
 
     if not torch.cuda.is_available():
@@ -564,7 +542,6 @@ def test_encode_cuda_tensor_output_dtype(precision, 
expected_dtype):
 def test_basis_encode_basic():
     """Test basic basis encoding (requires GPU)."""
     pytest.importorskip("torch")
-    import torch
     from _qdp import QdpEngine
 
     if not torch.cuda.is_available():
@@ -589,7 +566,6 @@ def test_basis_encode_basic():
 def test_basis_encode_nonzero_index():
     """Test basis encoding with non-zero index (requires GPU)."""
     pytest.importorskip("torch")
-    import torch
     from _qdp import QdpEngine
 
     if not torch.cuda.is_available():
@@ -611,7 +587,6 @@ def test_basis_encode_nonzero_index():
 def test_basis_encode_3_qubits():
     """Test basis encoding with 3 qubits (requires GPU)."""
     pytest.importorskip("torch")
-    import torch
     from _qdp import QdpEngine
 
     if not torch.cuda.is_available():
@@ -641,7 +616,6 @@ def test_basis_encode_3_qubits():
 def test_basis_encode_errors():
     """Test error handling for basis encoding (requires GPU)."""
     pytest.importorskip("torch")
-    import torch
     from _qdp import QdpEngine
 
     if not torch.cuda.is_available():
@@ -675,7 +649,6 @@ def test_basis_encode_errors():
 def test_angle_encode_basic():
     """Test basic angle encoding (requires GPU)."""
     pytest.importorskip("torch")
-    import torch
     from _qdp import QdpEngine
 
     if not torch.cuda.is_available():
@@ -699,7 +672,6 @@ def test_angle_encode_basic():
 def test_angle_encode_nonzero_angles():
     """Test angle encoding with non-zero angles (requires GPU)."""
     pytest.importorskip("torch")
-    import torch
     from _qdp import QdpEngine
 
     if not torch.cuda.is_available():
@@ -722,7 +694,6 @@ def test_angle_encode_nonzero_angles():
 def test_angle_encode_batch():
     """Test batch angle encoding (requires GPU)."""
     pytest.importorskip("torch")
-    import torch
     from _qdp import QdpEngine
 
     if not torch.cuda.is_available():
@@ -753,7 +724,6 @@ def test_angle_encode_batch():
 def test_angle_encode_errors():
     """Test error handling for angle encoding (requires GPU)."""
     pytest.importorskip("torch")
-    import torch
     from _qdp import QdpEngine
 
     if not torch.cuda.is_available():
@@ -786,7 +756,6 @@ def test_encode_numpy_array(data_shape, expected_shape):
     """Test encoding from NumPy array (1D or 2D)."""
     pytest.importorskip("torch")
     import numpy as np
-    import torch
     from _qdp import QdpEngine
 
     if not torch.cuda.is_available():
@@ -808,7 +777,6 @@ def test_encode_pathlib_path():
     """Test encoding from pathlib.Path object."""
     pytest.importorskip("torch")
     import numpy as np
-    import torch
     from pathlib import Path
     import tempfile
     import os
@@ -849,7 +817,6 @@ def test_iqp_z_encode_basic():
     - H^n transforms back to |0⟩^n
     """
     pytest.importorskip("torch")
-    import torch
     from _qdp import QdpEngine
 
     if not torch.cuda.is_available():
@@ -874,7 +841,6 @@ def test_iqp_z_encode_basic():
 def test_iqp_z_encode_nonzero():
     """Test IQP-Z encoding with non-zero angles (requires GPU)."""
     pytest.importorskip("torch")
-    import torch
     from _qdp import QdpEngine
 
     if not torch.cuda.is_available():
@@ -900,7 +866,6 @@ def test_iqp_z_encode_nonzero():
 def test_iqp_encode_basic():
     """Test basic IQP encoding with ZZ interactions (requires GPU)."""
     pytest.importorskip("torch")
-    import torch
     from _qdp import QdpEngine
 
     if not torch.cuda.is_available():
@@ -926,7 +891,6 @@ def test_iqp_encode_basic():
 def test_iqp_encode_zz_effect():
     """Test that ZZ interaction produces different result than Z-only 
(requires GPU)."""
     pytest.importorskip("torch")
-    import torch
     from _qdp import QdpEngine
 
     if not torch.cuda.is_available():
@@ -959,7 +923,6 @@ def test_iqp_encode_zz_effect():
 def test_iqp_encode_3_qubits():
     """Test IQP encoding with 3 qubits (requires GPU)."""
     pytest.importorskip("torch")
-    import torch
     from _qdp import QdpEngine
 
     if not torch.cuda.is_available():
@@ -985,7 +948,6 @@ def test_iqp_encode_3_qubits():
 def test_iqp_z_encode_batch():
     """Test batch IQP-Z encoding (requires GPU)."""
     pytest.importorskip("torch")
-    import torch
     from _qdp import QdpEngine
 
     if not torch.cuda.is_available():
@@ -1014,7 +976,6 @@ def test_iqp_z_encode_batch():
 def test_iqp_encode_batch():
     """Test batch IQP encoding with ZZ interactions (requires GPU)."""
     pytest.importorskip("torch")
-    import torch
     from _qdp import QdpEngine
 
     if not torch.cuda.is_available():
@@ -1046,7 +1007,6 @@ def test_iqp_encode_batch():
 def test_iqp_encode_single_qubit():
     """Test IQP encoding with single qubit edge case (requires GPU)."""
     pytest.importorskip("torch")
-    import torch
     from _qdp import QdpEngine
 
     if not torch.cuda.is_available():
@@ -1075,7 +1035,6 @@ def test_iqp_encode_single_qubit():
 def test_iqp_encode_errors():
     """Test error handling for IQP encoding (requires GPU)."""
     pytest.importorskip("torch")
-    import torch
     from _qdp import QdpEngine
 
     if not torch.cuda.is_available():
@@ -1111,7 +1070,6 @@ def test_iqp_encode_errors():
 def test_iqp_fwt_normalization():
     """Test that FWT-optimized IQP produces normalized states (requires 
GPU)."""
     pytest.importorskip("torch")
-    import torch
     from _qdp import QdpEngine
 
     if not torch.cuda.is_available():
@@ -1139,7 +1097,6 @@ def test_iqp_fwt_normalization():
 def test_iqp_z_fwt_normalization():
     """Test that FWT-optimized IQP-Z produces normalized states (requires 
GPU)."""
     pytest.importorskip("torch")
-    import torch
     from _qdp import QdpEngine
 
     if not torch.cuda.is_available():
@@ -1168,7 +1125,6 @@ def test_iqp_fwt_zero_params_gives_zero_state():
     so |0⟩^n maps to |0⟩^n with amplitude 1 at index 0.
     """
     pytest.importorskip("torch")
-    import torch
     from _qdp import QdpEngine
 
     if not torch.cuda.is_available():
@@ -1200,7 +1156,6 @@ def test_iqp_fwt_zero_params_gives_zero_state():
 def test_iqp_fwt_batch_normalization():
     """Test that FWT-optimized batch IQP produces normalized states (requires 
GPU)."""
     pytest.importorskip("torch")
-    import torch
     from _qdp import QdpEngine
 
     if not torch.cuda.is_available():
@@ -1238,7 +1193,6 @@ def test_iqp_fwt_batch_normalization():
 def test_iqp_fwt_deterministic():
     """Test that FWT-optimized IQP is deterministic (requires GPU)."""
     pytest.importorskip("torch")
-    import torch
     from _qdp import QdpEngine
 
     if not torch.cuda.is_available():
@@ -1272,7 +1226,6 @@ def test_iqp_fwt_shared_vs_global_memory_threshold():
     - n > 10: uses global memory FWT
     """
     pytest.importorskip("torch")
-    import torch
     from _qdp import QdpEngine
 
     if not torch.cuda.is_available():

(mahout) branch main updated: [QDP] Add zero-copy amplitude encoding from float32 GPU tensors (#999)

Reply via email to