This is an automated email from the ASF dual-hosted git repository.

hcr pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/mahout.git


The following commit(s) were added to refs/heads/main by this push:
     new d052d4e02 MAHOUT-878 Add CUDA Torch Tensor Support for QDP Python Binding (#881)
d052d4e02 is described below

commit d052d4e02160c28fce4a82e2a53a3deebb66bfd5
Author: Ryan Huang <[email protected]>
AuthorDate: Thu Jan 22 14:52:33 2026 +0800

    MAHOUT-878 Add CUDA Torch Tensor Support for QDP Python Binding (#881)
    
    * Add CUDA Tensor Support for QDP Python Binding
    
    * Refactor CUDA tensor handling to extract information directly from PyTorch tensors
    
    * Enhance error reporting for CUDA kernel launches by including error descriptions
    
    * linter
---
 qdp/qdp-core/src/gpu/encodings/amplitude.rs |  31 ++-
 qdp/qdp-core/src/lib.rs                     | 268 +++++++++++++++++++++++-
 qdp/qdp-python/src/lib.rs                   | 174 +++++++++++++++-
 testing/qdp/test_bindings.py                | 312 +++++++++++++++++++++++++++-
 4 files changed, 772 insertions(+), 13 deletions(-)

diff --git a/qdp/qdp-core/src/gpu/encodings/amplitude.rs b/qdp/qdp-core/src/gpu/encodings/amplitude.rs
index f4b8abd75..0720cd619 100644
--- a/qdp/qdp-core/src/gpu/encodings/amplitude.rs
+++ b/qdp/qdp-core/src/gpu/encodings/amplitude.rs
@@ -95,11 +95,15 @@ impl QuantumEncoder for AmplitudeEncoder {
 
                 // GPU-accelerated norm for medium+ inputs, CPU fallback for 
tiny payloads
                 let inv_norm = if host_data.len() >= GPU_NORM_THRESHOLD {
-                    Self::calculate_inv_norm_gpu(
-                        _device,
-                        *input_slice.device_ptr() as *const f64,
-                        host_data.len(),
-                    )?
+                    // SAFETY: input_slice was just allocated and copied from 
host_data,
+                    // so the pointer is valid and contains host_data.len() 
elements
+                    unsafe {
+                        Self::calculate_inv_norm_gpu(
+                            _device,
+                            *input_slice.device_ptr() as *const f64,
+                            host_data.len(),
+                        )?
+                    }
                 } else {
                     let norm = Preprocessor::calculate_l2_norm(host_data)?;
                     1.0 / norm
@@ -411,8 +415,20 @@ impl AmplitudeEncoder {
 
 impl AmplitudeEncoder {
     /// Compute inverse L2 norm on GPU using the reduction kernel.
+    ///
+    /// # Arguments
+    /// * `device` - CUDA device reference
+    /// * `input_ptr` - Device pointer to input data (f64 array on GPU)
+    /// * `len` - Number of f64 elements
+    ///
+    /// # Returns
+    /// The inverse L2 norm (1/||x||_2) of the input data
+    ///
+    /// # Safety
+    /// The caller must ensure `input_ptr` points to valid GPU memory 
containing
+    /// at least `len` f64 elements on the same device as `device`.
     #[cfg(target_os = "linux")]
-    fn calculate_inv_norm_gpu(
+    pub(crate) unsafe fn calculate_inv_norm_gpu(
         device: &Arc<CudaDevice>,
         input_ptr: *const f64,
         len: usize,
@@ -447,7 +463,8 @@ impl AmplitudeEncoder {
         let inv_norm = inv_norm_host.first().copied().unwrap_or(0.0);
         if inv_norm == 0.0 || !inv_norm.is_finite() {
             return Err(MahoutError::InvalidInput(
-                "Input data has zero norm".to_string(),
+                "Input data has zero or non-finite norm (contains NaN, Inf, or 
all zeros)"
+                    .to_string(),
             ));
         }
 
diff --git a/qdp/qdp-core/src/lib.rs b/qdp/qdp-core/src/lib.rs
index c5bbcf19e..f0bedb73a 100644
--- a/qdp/qdp-core/src/lib.rs
+++ b/qdp/qdp-core/src/lib.rs
@@ -28,7 +28,7 @@ pub mod tf_proto;
 #[macro_use]
 mod profiling;
 
-pub use error::{MahoutError, Result};
+pub use error::{MahoutError, Result, cuda_error_to_string};
 pub use gpu::memory::Precision;
 
 use std::sync::Arc;
@@ -300,6 +300,272 @@ impl QdpEngine {
             encoding_method,
         )
     }
+
+    /// Encode from existing GPU pointer (zero-copy for CUDA tensors)
+    ///
+    /// This method enables zero-copy encoding from PyTorch CUDA tensors by 
accepting
+    /// a raw GPU pointer directly, avoiding the GPU→CPU→GPU copy that would 
otherwise
+    /// be required.
+    ///
+    /// TODO: Refactor to use QuantumEncoder trait (add `encode_from_gpu_ptr` 
to trait)
+    /// to reduce duplication with AmplitudeEncoder::encode(). This would also 
make it
+    /// easier to add GPU pointer support for other encoders (angle, basis) in 
the future.
+    ///
+    /// # Arguments
+    /// * `input_d` - Device pointer to input data (f64 array on GPU)
+    /// * `input_len` - Number of f64 elements in the input
+    /// * `num_qubits` - Number of qubits for encoding
+    /// * `encoding_method` - Strategy (currently only "amplitude" supported)
+    ///
+    /// # Returns
+    /// DLPack pointer for zero-copy PyTorch integration
+    ///
+    /// # Safety
+    /// The input pointer must:
+    /// - Point to valid GPU memory on the same device as the engine
+    /// - Contain at least `input_len` f64 elements
+    /// - Remain valid for the duration of this call
+    #[cfg(target_os = "linux")]
+    pub unsafe fn encode_from_gpu_ptr(
+        &self,
+        input_d: *const f64,
+        input_len: usize,
+        num_qubits: usize,
+        encoding_method: &str,
+    ) -> Result<*mut DLManagedTensor> {
+        crate::profile_scope!("Mahout::EncodeFromGpuPtr");
+
+        if encoding_method != "amplitude" {
+            return Err(MahoutError::NotImplemented(format!(
+                "GPU pointer encoding currently only supports 'amplitude' 
method, got '{}'",
+                encoding_method
+            )));
+        }
+
+        if input_len == 0 {
+            return Err(MahoutError::InvalidInput(
+                "Input data cannot be empty".into(),
+            ));
+        }
+
+        let state_len = 1usize << num_qubits;
+        if input_len > state_len {
+            return Err(MahoutError::InvalidInput(format!(
+                "Input size {} exceeds state vector size {} (2^{} qubits)",
+                input_len, state_len, num_qubits
+            )));
+        }
+
+        // Allocate output state vector
+        let state_vector = {
+            crate::profile_scope!("GPU::Alloc");
+            gpu::GpuStateVector::new(&self.device, num_qubits)?
+        };
+
+        // Compute inverse L2 norm on GPU
+        let inv_norm = {
+            crate::profile_scope!("GPU::NormFromPtr");
+            // SAFETY: input_d validity is guaranteed by the caller's safety 
contract
+            unsafe {
+                gpu::AmplitudeEncoder::calculate_inv_norm_gpu(&self.device, 
input_d, input_len)?
+            }
+        };
+
+        // Get output pointer
+        let state_ptr = state_vector.ptr_f64().ok_or_else(|| {
+            MahoutError::InvalidInput(
+                "State vector precision mismatch (expected float64 
buffer)".to_string(),
+            )
+        })?;
+
+        // Launch encoding kernel
+        {
+            crate::profile_scope!("GPU::KernelLaunch");
+            let ret = unsafe {
+                qdp_kernels::launch_amplitude_encode(
+                    input_d,
+                    state_ptr as *mut std::ffi::c_void,
+                    input_len,
+                    state_len,
+                    inv_norm,
+                    std::ptr::null_mut(), // default stream
+                )
+            };
+
+            if ret != 0 {
+                return Err(MahoutError::KernelLaunch(format!(
+                    "Amplitude encode kernel failed with CUDA error code: {} 
({})",
+                    ret,
+                    cuda_error_to_string(ret)
+                )));
+            }
+        }
+
+        // Synchronize
+        {
+            crate::profile_scope!("GPU::Synchronize");
+            self.device.synchronize().map_err(|e| {
+                MahoutError::Cuda(format!("CUDA device synchronize failed: 
{:?}", e))
+            })?;
+        }
+
+        let state_vector = state_vector.to_precision(&self.device, 
self.precision)?;
+        Ok(state_vector.to_dlpack())
+    }
+
+    /// Encode batch from existing GPU pointer (zero-copy for CUDA tensors)
+    ///
+    /// This method enables zero-copy batch encoding from PyTorch CUDA tensors.
+    ///
+    /// TODO: Refactor to use QuantumEncoder trait (see `encode_from_gpu_ptr` 
TODO).
+    ///
+    /// # Arguments
+    /// * `input_batch_d` - Device pointer to batch input data (flattened f64 
array on GPU)
+    /// * `num_samples` - Number of samples in the batch
+    /// * `sample_size` - Size of each sample in f64 elements
+    /// * `num_qubits` - Number of qubits for encoding
+    /// * `encoding_method` - Strategy (currently only "amplitude" supported)
+    ///
+    /// # Returns
+    /// Single DLPack pointer containing all encoded states (shape: 
[num_samples, 2^num_qubits])
+    ///
+    /// # Safety
+    /// The input pointer must:
+    /// - Point to valid GPU memory on the same device as the engine
+    /// - Contain at least `num_samples * sample_size` f64 elements
+    /// - Remain valid for the duration of this call
+    #[cfg(target_os = "linux")]
+    pub unsafe fn encode_batch_from_gpu_ptr(
+        &self,
+        input_batch_d: *const f64,
+        num_samples: usize,
+        sample_size: usize,
+        num_qubits: usize,
+        encoding_method: &str,
+    ) -> Result<*mut DLManagedTensor> {
+        crate::profile_scope!("Mahout::EncodeBatchFromGpuPtr");
+
+        if encoding_method != "amplitude" {
+            return Err(MahoutError::NotImplemented(format!(
+                "GPU pointer batch encoding currently only supports 
'amplitude' method, got '{}'",
+                encoding_method
+            )));
+        }
+
+        if num_samples == 0 {
+            return Err(MahoutError::InvalidInput(
+                "Number of samples cannot be zero".into(),
+            ));
+        }
+
+        if sample_size == 0 {
+            return Err(MahoutError::InvalidInput(
+                "Sample size cannot be zero".into(),
+            ));
+        }
+
+        let state_len = 1usize << num_qubits;
+        if sample_size > state_len {
+            return Err(MahoutError::InvalidInput(format!(
+                "Sample size {} exceeds state vector size {} (2^{} qubits)",
+                sample_size, state_len, num_qubits
+            )));
+        }
+
+        // Allocate output state vector
+        let batch_state_vector = {
+            crate::profile_scope!("GPU::AllocBatch");
+            gpu::GpuStateVector::new_batch(&self.device, num_samples, 
num_qubits)?
+        };
+
+        // Compute inverse norms on GPU using warp-reduced kernel
+        let inv_norms_gpu = {
+            crate::profile_scope!("GPU::BatchNormKernel");
+            use cudarc::driver::DevicePtrMut;
+
+            let mut buffer = 
self.device.alloc_zeros::<f64>(num_samples).map_err(|e| {
+                MahoutError::MemoryAllocation(format!("Failed to allocate norm 
buffer: {:?}", e))
+            })?;
+
+            let ret = unsafe {
+                qdp_kernels::launch_l2_norm_batch(
+                    input_batch_d,
+                    num_samples,
+                    sample_size,
+                    *buffer.device_ptr_mut() as *mut f64,
+                    std::ptr::null_mut(), // default stream
+                )
+            };
+
+            if ret != 0 {
+                return Err(MahoutError::KernelLaunch(format!(
+                    "Norm reduction kernel failed with CUDA error code: {} 
({})",
+                    ret,
+                    cuda_error_to_string(ret)
+                )));
+            }
+
+            buffer
+        };
+
+        // Validate norms on host to catch zero or NaN samples early
+        {
+            crate::profile_scope!("GPU::NormValidation");
+            let host_inv_norms = self
+                .device
+                .dtoh_sync_copy(&inv_norms_gpu)
+                .map_err(|e| MahoutError::Cuda(format!("Failed to copy norms 
to host: {:?}", e)))?;
+
+            if host_inv_norms.iter().any(|v| !v.is_finite() || *v == 0.0) {
+                return Err(MahoutError::InvalidInput(
+                    "One or more samples have zero or invalid 
norm".to_string(),
+                ));
+            }
+        }
+
+        // Launch batch kernel
+        {
+            crate::profile_scope!("GPU::BatchKernelLaunch");
+            use cudarc::driver::DevicePtr;
+
+            let state_ptr = batch_state_vector.ptr_f64().ok_or_else(|| {
+                MahoutError::InvalidInput(
+                    "Batch state vector precision mismatch (expected float64 
buffer)".to_string(),
+                )
+            })?;
+
+            let ret = unsafe {
+                qdp_kernels::launch_amplitude_encode_batch(
+                    input_batch_d,
+                    state_ptr as *mut std::ffi::c_void,
+                    *inv_norms_gpu.device_ptr() as *const f64,
+                    num_samples,
+                    sample_size,
+                    state_len,
+                    std::ptr::null_mut(), // default stream
+                )
+            };
+
+            if ret != 0 {
+                return Err(MahoutError::KernelLaunch(format!(
+                    "Batch kernel launch failed with CUDA error code: {} ({})",
+                    ret,
+                    cuda_error_to_string(ret)
+                )));
+            }
+        }
+
+        // Synchronize
+        {
+            crate::profile_scope!("GPU::Synchronize");
+            self.device
+                .synchronize()
+                .map_err(|e| MahoutError::Cuda(format!("Sync failed: {:?}", 
e)))?;
+        }
+
+        let batch_state_vector = batch_state_vector.to_precision(&self.device, 
self.precision)?;
+        Ok(batch_state_vector.to_dlpack())
+    }
 }
 
 // Re-export key types for convenience
diff --git a/qdp/qdp-python/src/lib.rs b/qdp/qdp-python/src/lib.rs
index fbd5c91cb..016ee1259 100644
--- a/qdp/qdp-python/src/lib.rs
+++ b/qdp/qdp-python/src/lib.rs
@@ -152,7 +152,7 @@ fn is_pytorch_tensor(obj: &Bound<'_, PyAny>) -> PyResult<bool> {
     Ok(module_name == "torch")
 }
 
-/// Helper to validate tensor
+/// Helper to validate CPU tensor
 fn validate_tensor(tensor: &Bound<'_, PyAny>) -> PyResult<()> {
     if !is_pytorch_tensor(tensor)? {
         return Err(PyRuntimeError::new_err("Object is not a PyTorch Tensor"));
@@ -171,6 +171,106 @@ fn validate_tensor(tensor: &Bound<'_, PyAny>) -> PyResult<()> {
     Ok(())
 }
 
+/// Check if a PyTorch tensor is on a CUDA device
+fn is_cuda_tensor(tensor: &Bound<'_, PyAny>) -> PyResult<bool> {
+    let device = tensor.getattr("device")?;
+    let device_type: String = device.getattr("type")?.extract()?;
+    Ok(device_type == "cuda")
+}
+
+/// Get the CUDA device index from a PyTorch tensor
+fn get_tensor_device_id(tensor: &Bound<'_, PyAny>) -> PyResult<i32> {
+    let device = tensor.getattr("device")?;
+    let device_index: i32 = device.getattr("index")?.extract()?;
+    Ok(device_index)
+}
+
+/// Validate a CUDA tensor for direct GPU encoding
+/// Checks: dtype=float64, contiguous, non-empty, device_id matches engine
+fn validate_cuda_tensor_for_encoding(
+    tensor: &Bound<'_, PyAny>,
+    expected_device_id: usize,
+    encoding_method: &str,
+) -> PyResult<()> {
+    // Check encoding method support (currently only amplitude is supported 
for CUDA tensors)
+    if encoding_method != "amplitude" {
+        return Err(PyRuntimeError::new_err(format!(
+            "CUDA tensor encoding currently only supports 'amplitude' method, 
got '{}'. \
+             Use tensor.cpu() to convert to CPU tensor for other encoding 
methods.",
+            encoding_method
+        )));
+    }
+
+    // Check dtype is float64
+    let dtype = tensor.getattr("dtype")?;
+    let dtype_str: String = dtype.str()?.extract()?;
+    if !dtype_str.contains("float64") {
+        return Err(PyRuntimeError::new_err(format!(
+            "CUDA tensor must have dtype float64, got {}. Use 
tensor.to(torch.float64)",
+            dtype_str
+        )));
+    }
+
+    // Check contiguous
+    let is_contiguous: bool = tensor.call_method0("is_contiguous")?.extract()?;
+    if !is_contiguous {
+        return Err(PyRuntimeError::new_err(
+            "CUDA tensor must be contiguous. Use tensor.contiguous()",
+        ));
+    }
+
+    // Check non-empty
+    let numel: usize = tensor.call_method0("numel")?.extract()?;
+    if numel == 0 {
+        return Err(PyRuntimeError::new_err("CUDA tensor cannot be empty"));
+    }
+
+    // Check device matches engine
+    let tensor_device_id = get_tensor_device_id(tensor)?;
+    if tensor_device_id as usize != expected_device_id {
+        return Err(PyRuntimeError::new_err(format!(
+            "Device mismatch: tensor is on cuda:{}, but engine is on cuda:{}. \
+             Move tensor with tensor.to('cuda:{}')",
+            tensor_device_id, expected_device_id, expected_device_id
+        )));
+    }
+
+    Ok(())
+}
+
+/// CUDA tensor information extracted directly from PyTorch tensor
+struct CudaTensorInfo {
+    data_ptr: *const f64,
+    shape: Vec<i64>,
+}
+
+/// Extract GPU pointer directly from PyTorch CUDA tensor
+///
+/// Uses PyTorch's `data_ptr()` and `shape` APIs directly instead of DLPack 
protocol.
+/// This avoids the DLPack capsule lifecycle complexity and potential memory 
leaks
+/// from the capsule renaming pattern.
+///
+/// # Safety
+/// The returned `data_ptr` points to GPU memory owned by the source tensor.
+/// The caller must ensure the source tensor remains alive and unmodified
+/// for the entire duration that `data_ptr` is in use. Python's GIL ensures
+/// the tensor won't be garbage collected during `encode()`, but the caller
+/// must not deallocate or resize the tensor while encoding is in progress.
+fn extract_cuda_tensor_info(tensor: &Bound<'_, PyAny>) -> 
PyResult<CudaTensorInfo> {
+    // Get GPU pointer directly via tensor.data_ptr()
+    let data_ptr_int: isize = tensor.call_method0("data_ptr")?.extract()?;
+    if data_ptr_int == 0 {
+        return Err(PyRuntimeError::new_err("CUDA tensor has null data 
pointer"));
+    }
+    let data_ptr = data_ptr_int as *const f64;
+
+    // Get shape directly via tensor.shape
+    let shape_obj = tensor.getattr("shape")?;
+    let shape: Vec<i64> = shape_obj.extract()?;
+
+    Ok(CudaTensorInfo { data_ptr, shape })
+}
+
 /// PyO3 wrapper for QdpEngine
 ///
 /// Provides Python bindings for GPU-accelerated quantum state encoding.
@@ -321,6 +421,78 @@ impl QdpEngine {
 
         // Check if it's a PyTorch tensor
         if is_pytorch_tensor(data)? {
+            // Check if it's a CUDA tensor - use zero-copy GPU encoding
+            if is_cuda_tensor(data)? {
+                // Validate CUDA tensor for direct GPU encoding
+                validate_cuda_tensor_for_encoding(
+                    data,
+                    self.engine.device().ordinal(),
+                    encoding_method,
+                )?;
+
+                // Extract GPU pointer directly from PyTorch tensor
+                let tensor_info = extract_cuda_tensor_info(data)?;
+
+                let ndim: usize = data.call_method0("dim")?.extract()?;
+
+                match ndim {
+                    1 => {
+                        // 1D CUDA tensor: single sample encoding
+                        let input_len = tensor_info.shape[0] as usize;
+                        // SAFETY: tensor_info.data_ptr was obtained via 
PyTorch's data_ptr() from a
+                        // valid CUDA tensor. The tensor remains alive during 
this call
+                        // (held by Python's GIL), and we validated 
dtype/contiguity/device above.
+                        let ptr = unsafe {
+                            self.engine
+                                .encode_from_gpu_ptr(
+                                    tensor_info.data_ptr,
+                                    input_len,
+                                    num_qubits,
+                                    encoding_method,
+                                )
+                                .map_err(|e| {
+                                    PyRuntimeError::new_err(format!("Encoding 
failed: {}", e))
+                                })?
+                        };
+                        return Ok(QuantumTensor {
+                            ptr,
+                            consumed: false,
+                        });
+                    }
+                    2 => {
+                        // 2D CUDA tensor: batch encoding
+                        let num_samples = tensor_info.shape[0] as usize;
+                        let sample_size = tensor_info.shape[1] as usize;
+                        // SAFETY: Same as above - pointer from validated 
PyTorch CUDA tensor
+                        let ptr = unsafe {
+                            self.engine
+                                .encode_batch_from_gpu_ptr(
+                                    tensor_info.data_ptr,
+                                    num_samples,
+                                    sample_size,
+                                    num_qubits,
+                                    encoding_method,
+                                )
+                                .map_err(|e| {
+                                    PyRuntimeError::new_err(format!("Encoding 
failed: {}", e))
+                                })?
+                        };
+                        return Ok(QuantumTensor {
+                            ptr,
+                            consumed: false,
+                        });
+                    }
+                    _ => {
+                        return Err(PyRuntimeError::new_err(format!(
+                            "Unsupported CUDA tensor shape: {}D. Expected 1D 
tensor for single \
+                             sample encoding or 2D tensor (batch_size, 
features) for batch encoding.",
+                            ndim
+                        )));
+                    }
+                }
+            }
+
+            // CPU tensor path (existing code)
             validate_tensor(data)?;
             // PERF: Avoid Tensor -> Python list -> Vec deep copies.
             //
diff --git a/testing/qdp/test_bindings.py b/testing/qdp/test_bindings.py
index 64bf09727..590e3ec63 100644
--- a/testing/qdp/test_bindings.py
+++ b/testing/qdp/test_bindings.py
@@ -253,10 +253,314 @@ def test_encode_errors():
     with pytest.raises(RuntimeError, match="Unsupported data type"):
         engine.encode({"key": "value"}, 2, "amplitude")
 
-    # Test GPU tensor input (should fail as only CPU is supported)
-    gpu_tensor = torch.tensor([1.0, 2.0], device="cuda:0")
-    with pytest.raises(RuntimeError, match="Only CPU tensors are currently 
supported"):
-        engine.encode(gpu_tensor, 1, "amplitude")
+
+@pytest.mark.gpu
+def test_encode_cuda_tensor_1d():
+    """Test encoding from 1D CUDA tensor (single sample, zero-copy)."""
+    pytest.importorskip("torch")
+    import torch
+    from _qdp import QdpEngine
+
+    if not torch.cuda.is_available():
+        pytest.skip("GPU required for QdpEngine")
+
+    engine = QdpEngine(0)
+
+    # Create 1D CUDA tensor
+    data = torch.tensor([1.0, 2.0, 3.0, 4.0], dtype=torch.float64, 
device="cuda:0")
+    qtensor = engine.encode(data, 2, "amplitude")
+
+    # Verify result
+    result = torch.from_dlpack(qtensor)
+    assert result.is_cuda
+    assert result.shape == (1, 4)  # 2^2 = 4 amplitudes
+
+    # Verify normalization (amplitudes should have unit norm)
+    norm = torch.sqrt(torch.sum(torch.abs(result) ** 2))
+    assert torch.isclose(norm, torch.tensor(1.0, device="cuda:0"), atol=1e-6)
+
+
+@pytest.mark.gpu
+def test_encode_cuda_tensor_2d_batch():
+    """Test encoding from 2D CUDA tensor (batch, zero-copy)."""
+    pytest.importorskip("torch")
+    import torch
+    from _qdp import QdpEngine
+
+    if not torch.cuda.is_available():
+        pytest.skip("GPU required for QdpEngine")
+
+    engine = QdpEngine(0)
+
+    # Create 2D CUDA tensor (batch_size=3, features=4)
+    data = torch.tensor(
+        [[1.0, 2.0, 3.0, 4.0], [5.0, 6.0, 7.0, 8.0], [9.0, 10.0, 11.0, 12.0]],
+        dtype=torch.float64,
+        device="cuda:0",
+    )
+    qtensor = engine.encode(data, 2, "amplitude")
+
+    # Verify result
+    result = torch.from_dlpack(qtensor)
+    assert result.is_cuda
+    assert result.shape == (3, 4)  # batch_size=3, 2^2=4
+
+    # Verify each sample is normalized
+    for i in range(3):
+        norm = torch.sqrt(torch.sum(torch.abs(result[i]) ** 2))
+        assert torch.isclose(norm, torch.tensor(1.0, device="cuda:0"), 
atol=1e-6)
+
+
+@pytest.mark.gpu
+def test_encode_cuda_tensor_wrong_dtype():
+    """Test error when CUDA tensor has wrong dtype (non-float64)."""
+    pytest.importorskip("torch")
+    import torch
+    from _qdp import QdpEngine
+
+    if not torch.cuda.is_available():
+        pytest.skip("GPU required for QdpEngine")
+
+    engine = QdpEngine(0)
+
+    # Create CUDA tensor with float32 dtype (wrong)
+    data = torch.tensor([1.0, 2.0, 3.0, 4.0], dtype=torch.float32, 
device="cuda:0")
+    with pytest.raises(RuntimeError, match="CUDA tensor must have dtype 
float64"):
+        engine.encode(data, 2, "amplitude")
+
+
+@pytest.mark.gpu
+def test_encode_cuda_tensor_non_contiguous():
+    """Test error when CUDA tensor is non-contiguous."""
+    pytest.importorskip("torch")
+    import torch
+    from _qdp import QdpEngine
+
+    if not torch.cuda.is_available():
+        pytest.skip("GPU required for QdpEngine")
+
+    engine = QdpEngine(0)
+
+    # Create non-contiguous CUDA tensor (via transpose)
+    data = torch.tensor(
+        [[1.0, 2.0], [3.0, 4.0]], dtype=torch.float64, device="cuda:0"
+    ).t()
+    assert not data.is_contiguous()
+
+    with pytest.raises(RuntimeError, match="CUDA tensor must be contiguous"):
+        engine.encode(data, 2, "amplitude")
+
+
+@pytest.mark.gpu
+@pytest.mark.skipif(
+    not _has_multi_gpu(), reason="Multi-GPU setup required for this test"
+)
+def test_encode_cuda_tensor_device_mismatch():
+    """Test error when CUDA tensor is on wrong device (multi-GPU only)."""
+    pytest.importorskip("torch")
+    import torch
+    from _qdp import QdpEngine
+
+    # Engine on device 0
+    engine = QdpEngine(0)
+
+    # Tensor on device 1 (wrong device)
+    data = torch.tensor([1.0, 2.0, 3.0, 4.0], dtype=torch.float64, 
device="cuda:1")
+    with pytest.raises(RuntimeError, match="Device mismatch"):
+        engine.encode(data, 2, "amplitude")
+
+
+@pytest.mark.gpu
+def test_encode_cuda_tensor_empty():
+    """Test error when CUDA tensor is empty."""
+    pytest.importorskip("torch")
+    import torch
+    from _qdp import QdpEngine
+
+    if not torch.cuda.is_available():
+        pytest.skip("GPU required for QdpEngine")
+
+    engine = QdpEngine(0)
+
+    # Create empty CUDA tensor
+    data = torch.tensor([], dtype=torch.float64, device="cuda:0")
+    with pytest.raises(RuntimeError, match="CUDA tensor cannot be empty"):
+        engine.encode(data, 2, "amplitude")
+
+
+@pytest.mark.gpu
+def test_encode_cuda_tensor_preserves_input():
+    """Test that input CUDA tensor is not modified after encoding."""
+    pytest.importorskip("torch")
+    import torch
+    from _qdp import QdpEngine
+
+    if not torch.cuda.is_available():
+        pytest.skip("GPU required for QdpEngine")
+
+    engine = QdpEngine(0)
+
+    # Create CUDA tensor and save a copy
+    original_data = [1.0, 2.0, 3.0, 4.0]
+    data = torch.tensor(original_data, dtype=torch.float64, device="cuda:0")
+    data_clone = data.clone()
+
+    # Encode
+    qtensor = engine.encode(data, 2, "amplitude")
+    _ = torch.from_dlpack(qtensor)
+
+    # Verify original tensor is unchanged
+    assert torch.equal(data, data_clone)
+
+
+@pytest.mark.gpu
+def test_encode_cuda_tensor_unsupported_encoding():
+    """Test error when using CUDA tensor with unsupported encoding method."""
+    pytest.importorskip("torch")
+    import torch
+    from _qdp import QdpEngine
+
+    if not torch.cuda.is_available():
+        pytest.skip("GPU required for QdpEngine")
+
+    engine = QdpEngine(0)
+
+    # CUDA tensors currently only support amplitude encoding
+    # Use non-zero data to avoid normalization issues
+    data = torch.tensor([1.0, 0.0, 0.0, 0.0], dtype=torch.float64, 
device="cuda:0")
+
+    with pytest.raises(RuntimeError, match="only supports 'amplitude' method"):
+        engine.encode(data, 2, "basis")
+
+    with pytest.raises(RuntimeError, match="only supports 'amplitude' method"):
+        engine.encode(data, 2, "angle")
+
+
+@pytest.mark.gpu
+def test_encode_cuda_tensor_3d_rejected():
+    """Test error when CUDA tensor has 3+ dimensions."""
+    pytest.importorskip("torch")
+    import torch
+    from _qdp import QdpEngine
+
+    if not torch.cuda.is_available():
+        pytest.skip("GPU required for QdpEngine")
+
+    engine = QdpEngine(0)
+
+    # Create 3D CUDA tensor (should be rejected)
+    data = torch.randn(2, 3, 4, dtype=torch.float64, device="cuda:0")
+    with pytest.raises(RuntimeError, match="Unsupported CUDA tensor shape: 
3D"):
+        engine.encode(data, 2, "amplitude")
+
+
+@pytest.mark.gpu
+def test_encode_cuda_tensor_zero_values():
+    """Test error when CUDA tensor contains all zeros (zero norm)."""
+    pytest.importorskip("torch")
+    import torch
+    from _qdp import QdpEngine
+
+    if not torch.cuda.is_available():
+        pytest.skip("GPU required for QdpEngine")
+
+    engine = QdpEngine(0)
+
+    # Create CUDA tensor with all zeros (cannot be normalized)
+    data = torch.zeros(4, dtype=torch.float64, device="cuda:0")
+    with pytest.raises(RuntimeError, match="zero or non-finite norm"):
+        engine.encode(data, 2, "amplitude")
+
+
+@pytest.mark.gpu
+def test_encode_cuda_tensor_nan_values():
+    """Test error when CUDA tensor contains NaN values."""
+    pytest.importorskip("torch")
+    import torch
+    from _qdp import QdpEngine
+
+    if not torch.cuda.is_available():
+        pytest.skip("GPU required for QdpEngine")
+
+    engine = QdpEngine(0)
+
+    # Create CUDA tensor with NaN
+    data = torch.tensor(
+        [1.0, float("nan"), 3.0, 4.0], dtype=torch.float64, device="cuda:0"
+    )
+    with pytest.raises(RuntimeError, match="zero or non-finite norm"):
+        engine.encode(data, 2, "amplitude")
+
+
+@pytest.mark.gpu
+def test_encode_cuda_tensor_inf_values():
+    """Test error when CUDA tensor contains Inf values."""
+    pytest.importorskip("torch")
+    import torch
+    from _qdp import QdpEngine
+
+    if not torch.cuda.is_available():
+        pytest.skip("GPU required for QdpEngine")
+
+    engine = QdpEngine(0)
+
+    # Create CUDA tensor with Inf
+    data = torch.tensor(
+        [1.0, float("inf"), 3.0, 4.0], dtype=torch.float64, device="cuda:0"
+    )
+    with pytest.raises(RuntimeError, match="zero or non-finite norm"):
+        engine.encode(data, 2, "amplitude")
+
+
+@pytest.mark.gpu
+def test_encode_cuda_tensor_output_dtype():
+    """Test that CUDA tensor encoding produces correct output dtype."""
+    pytest.importorskip("torch")
+    import torch
+    from _qdp import QdpEngine
+
+    if not torch.cuda.is_available():
+        pytest.skip("GPU required for QdpEngine")
+
+    # Test default precision (float32 -> complex64)
+    engine_f32 = QdpEngine(0, precision="float32")
+    data = torch.tensor([1.0, 2.0, 3.0, 4.0], dtype=torch.float64, 
device="cuda:0")
+    result = torch.from_dlpack(engine_f32.encode(data, 2, "amplitude"))
+    assert result.dtype == torch.complex64, f"Expected complex64, got 
{result.dtype}"
+
+    # Test float64 precision (float64 -> complex128)
+    engine_f64 = QdpEngine(0, precision="float64")
+    data = torch.tensor([1.0, 2.0, 3.0, 4.0], dtype=torch.float64, 
device="cuda:0")
+    result = torch.from_dlpack(engine_f64.encode(data, 2, "amplitude"))
+    assert result.dtype == torch.complex128, f"Expected complex128, got 
{result.dtype}"
+
+
+@pytest.mark.gpu
+def test_encode_cuda_tensor_preserves_input_batch():
+    """Test that input 2D CUDA tensor (batch) is not modified after 
encoding."""
+    pytest.importorskip("torch")
+    import torch
+    from _qdp import QdpEngine
+
+    if not torch.cuda.is_available():
+        pytest.skip("GPU required for QdpEngine")
+
+    engine = QdpEngine(0)
+
+    # Create 2D CUDA tensor and save a copy
+    data = torch.tensor(
+        [[1.0, 2.0, 3.0, 4.0], [5.0, 6.0, 7.0, 8.0]],
+        dtype=torch.float64,
+        device="cuda:0",
+    )
+    data_clone = data.clone()
+
+    # Encode
+    qtensor = engine.encode(data, 2, "amplitude")
+    _ = torch.from_dlpack(qtensor)
+
+    # Verify original tensor is unchanged
+    assert torch.equal(data, data_clone)
 
 
 @pytest.mark.gpu

Reply via email to