This is an automated email from the ASF dual-hosted git repository.

guanmingchiu pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/mahout.git


The following commit(s) were added to refs/heads/main by this push:
     new 8cf771add [QDP] basis GPU‑pointer support (#934)
8cf771add is described below

commit 8cf771addf3e522fa540b55aaa472df90b212584
Author: Jie-Kai Chang <[email protected]>
AuthorDate: Mon Feb 2 01:46:29 2026 +0800

    [QDP] basis GPU‑pointer support (#934)
    
    * basis GPU‑pointer support
    
    Signed-off-by: 400Ping <[email protected]>
    
    * fix pre-commit
    
    Signed-off-by: 400Ping <[email protected]>
    
    * update
    
    Signed-off-by: 400Ping <[email protected]>
    
    * fix conflicts
    
    Signed-off-by: 400Ping <[email protected]>
    
    * fix conflicts
    
    Signed-off-by: 400Ping <[email protected]>
    
    * Revert "fix conflicts"
    
    This reverts commit fd353c3c81d282718d566ff8eac8312e0144aaaf.
    
    Signed-off-by: 400Ping <[email protected]>
    
    * Revert "fix conflicts"
    
    This reverts commit 2dd586838337f1e41e1bf42854cd18fa51d8d113.
    
    Signed-off-by: 400Ping <[email protected]>
    
    * fix ci
    
    Signed-off-by: 400Ping <[email protected]>
    
    * fix
    
    Signed-off-by: 400Ping <[email protected]>
    
    * fix pre-commit
    
    Signed-off-by: 400Ping <[email protected]>
    
    * add unit test
    
    Signed-off-by: 400Ping <[email protected]>
    
    * fix ci error
    
    Signed-off-by: 400Ping <[email protected]>
    
    * update
    
    Signed-off-by: 400Ping <[email protected]>
    
    * fix build
    
    Signed-off-by: 400Ping <[email protected]>
    
    * fix pre-commit
    
    Signed-off-by: 400Ping <[email protected]>
    
    ---------
    
    Signed-off-by: 400Ping <[email protected]>
    Signed-off-by: 400Ping <[email protected]>
---
 qdp/qdp-core/src/gpu/cuda_ffi.rs            |   4 +
 qdp/qdp-core/src/gpu/encodings/amplitude.rs |   4 +-
 qdp/qdp-core/src/gpu/encodings/angle.rs     |   4 +-
 qdp/qdp-core/src/gpu/encodings/basis.rs     |   4 +-
 qdp/qdp-core/src/gpu/encodings/iqp.rs       |   4 +-
 qdp/qdp-core/src/lib.rs                     | 283 ++++++++++--------
 qdp/qdp-core/tests/gpu_ptr_encoding.rs      | 425 ++++++++++++++++++++++++++++
 qdp/qdp-python/src/lib.rs                   |  52 ++--
 8 files changed, 640 insertions(+), 140 deletions(-)

diff --git a/qdp/qdp-core/src/gpu/cuda_ffi.rs b/qdp/qdp-core/src/gpu/cuda_ffi.rs
index 491e1382b..2ed60c311 100644
--- a/qdp/qdp-core/src/gpu/cuda_ffi.rs
+++ b/qdp/qdp-core/src/gpu/cuda_ffi.rs
@@ -21,9 +21,12 @@ use std::ffi::c_void;
 pub(crate) const CUDA_MEMCPY_HOST_TO_DEVICE: u32 = 1;
 pub(crate) const CUDA_EVENT_DISABLE_TIMING: u32 = 0x02;
 pub(crate) const CUDA_EVENT_DEFAULT: u32 = 0x00;
+#[allow(dead_code)]
 pub(crate) const CUDA_MEMORY_TYPE_DEVICE: i32 = 2;
+#[allow(dead_code)]
 pub(crate) const CUDA_MEMORY_TYPE_MANAGED: i32 = 3;
 
+#[allow(dead_code)]
 #[repr(C)]
 pub(crate) struct CudaPointerAttributes {
     pub memory_type: i32,
@@ -45,6 +48,7 @@ unsafe extern "C" {
     pub(crate) fn cudaHostAlloc(pHost: *mut *mut c_void, size: usize, flags: 
u32) -> i32;
     pub(crate) fn cudaFreeHost(ptr: *mut c_void) -> i32;
 
+    #[allow(dead_code)]
     pub(crate) fn cudaPointerGetAttributes(
         attributes: *mut CudaPointerAttributes,
         ptr: *const c_void,
diff --git a/qdp/qdp-core/src/gpu/encodings/amplitude.rs 
b/qdp/qdp-core/src/gpu/encodings/amplitude.rs
index 62313550d..032418c46 100644
--- a/qdp/qdp-core/src/gpu/encodings/amplitude.rs
+++ b/qdp/qdp-core/src/gpu/encodings/amplitude.rs
@@ -26,7 +26,7 @@ use super::QuantumEncoder;
 #[cfg(target_os = "linux")]
 use crate::error::cuda_error_to_string;
 use crate::error::{MahoutError, Result};
-use crate::gpu::memory::GpuStateVector;
+use crate::gpu::memory::{GpuStateVector, Precision};
 use crate::gpu::pipeline::run_dual_stream_pipeline;
 use cudarc::driver::CudaDevice;
 
@@ -70,7 +70,7 @@ impl QuantumEncoder for AmplitudeEncoder {
             // Allocate GPU state vector
             let state_vector = {
                 crate::profile_scope!("GPU::Alloc");
-                GpuStateVector::new(_device, num_qubits, 
crate::gpu::memory::Precision::Float64)?
+                GpuStateVector::new(_device, num_qubits, Precision::Float64)?
             };
 
             // Async Pipeline for large data
diff --git a/qdp/qdp-core/src/gpu/encodings/angle.rs 
b/qdp/qdp-core/src/gpu/encodings/angle.rs
index 2a91cb017..d1a1091d5 100644
--- a/qdp/qdp-core/src/gpu/encodings/angle.rs
+++ b/qdp/qdp-core/src/gpu/encodings/angle.rs
@@ -24,7 +24,7 @@ use super::{QuantumEncoder, validate_qubit_count};
 #[cfg(target_os = "linux")]
 use crate::error::cuda_error_to_string;
 use crate::error::{MahoutError, Result};
-use crate::gpu::memory::GpuStateVector;
+use crate::gpu::memory::{GpuStateVector, Precision};
 #[cfg(target_os = "linux")]
 use crate::gpu::pipeline::run_dual_stream_pipeline_aligned;
 use cudarc::driver::CudaDevice;
@@ -63,7 +63,7 @@ impl QuantumEncoder for AngleEncoder {
 
             let state_vector = {
                 crate::profile_scope!("GPU::Alloc");
-                GpuStateVector::new(device, num_qubits, 
crate::gpu::memory::Precision::Float64)?
+                GpuStateVector::new(device, num_qubits, Precision::Float64)?
             };
 
             let state_ptr = state_vector.ptr_f64().ok_or_else(|| {
diff --git a/qdp/qdp-core/src/gpu/encodings/basis.rs 
b/qdp/qdp-core/src/gpu/encodings/basis.rs
index e883372f5..33e8f14bf 100644
--- a/qdp/qdp-core/src/gpu/encodings/basis.rs
+++ b/qdp/qdp-core/src/gpu/encodings/basis.rs
@@ -24,7 +24,7 @@ use super::{QuantumEncoder, validate_qubit_count};
 #[cfg(target_os = "linux")]
 use crate::error::cuda_error_to_string;
 use crate::error::{MahoutError, Result};
-use crate::gpu::memory::GpuStateVector;
+use crate::gpu::memory::{GpuStateVector, Precision};
 use cudarc::driver::CudaDevice;
 use std::sync::Arc;
 
@@ -76,7 +76,7 @@ impl QuantumEncoder for BasisEncoder {
             // Allocate GPU state vector
             let state_vector = {
                 crate::profile_scope!("GPU::Alloc");
-                GpuStateVector::new(device, num_qubits, 
crate::gpu::memory::Precision::Float64)?
+                GpuStateVector::new(device, num_qubits, Precision::Float64)?
             };
 
             let state_ptr = state_vector.ptr_f64().ok_or_else(|| {
diff --git a/qdp/qdp-core/src/gpu/encodings/iqp.rs 
b/qdp/qdp-core/src/gpu/encodings/iqp.rs
index 245229a40..7a177a208 100644
--- a/qdp/qdp-core/src/gpu/encodings/iqp.rs
+++ b/qdp/qdp-core/src/gpu/encodings/iqp.rs
@@ -20,7 +20,7 @@ use super::QuantumEncoder;
 #[cfg(target_os = "linux")]
 use crate::error::cuda_error_to_string;
 use crate::error::{MahoutError, Result};
-use crate::gpu::memory::GpuStateVector;
+use crate::gpu::memory::{GpuStateVector, Precision};
 use cudarc::driver::CudaDevice;
 use std::sync::Arc;
 
@@ -87,7 +87,7 @@ impl QuantumEncoder for IqpEncoder {
 
             let state_vector = {
                 crate::profile_scope!("GPU::Alloc");
-                GpuStateVector::new(device, num_qubits, 
crate::gpu::memory::Precision::Float64)?
+                GpuStateVector::new(device, num_qubits, Precision::Float64)?
             };
 
             let state_ptr = state_vector.ptr_f64().ok_or_else(|| {
diff --git a/qdp/qdp-core/src/lib.rs b/qdp/qdp-core/src/lib.rs
index a8028c62e..6e50414a9 100644
--- a/qdp/qdp-core/src/lib.rs
+++ b/qdp/qdp-core/src/lib.rs
@@ -35,67 +35,12 @@ mod profiling;
 pub use error::{MahoutError, Result, cuda_error_to_string};
 pub use gpu::memory::Precision;
 
-#[cfg(target_os = "linux")]
-use std::ffi::c_void;
 use std::sync::Arc;
 
 use crate::dlpack::DLManagedTensor;
-#[cfg(target_os = "linux")]
-use crate::gpu::cuda_sync::sync_cuda_stream;
 use crate::gpu::get_encoder;
 use cudarc::driver::CudaDevice;
 
-#[cfg(target_os = "linux")]
-fn validate_cuda_input_ptr(device: &CudaDevice, ptr: *const f64) -> Result<()> 
{
-    use crate::gpu::cuda_ffi::{
-        CUDA_MEMORY_TYPE_DEVICE, CUDA_MEMORY_TYPE_MANAGED, 
CudaPointerAttributes,
-        cudaPointerGetAttributes,
-    };
-    use std::ffi::c_void;
-
-    if ptr.is_null() {
-        return Err(MahoutError::InvalidInput(
-            "Input GPU pointer is null".to_string(),
-        ));
-    }
-
-    let mut attrs = CudaPointerAttributes {
-        memory_type: 0,
-        device: 0,
-        device_pointer: std::ptr::null_mut(),
-        host_pointer: std::ptr::null_mut(),
-        is_managed: 0,
-        allocation_flags: 0,
-    };
-
-    let ret = unsafe { cudaPointerGetAttributes(&mut attrs as *mut _, ptr as 
*const c_void) };
-    if ret != 0 {
-        return Err(MahoutError::InvalidInput(format!(
-            "cudaPointerGetAttributes failed for input pointer: {} ({})",
-            ret,
-            cuda_error_to_string(ret)
-        )));
-    }
-
-    if attrs.memory_type != CUDA_MEMORY_TYPE_DEVICE && attrs.memory_type != 
CUDA_MEMORY_TYPE_MANAGED
-    {
-        return Err(MahoutError::InvalidInput(format!(
-            "Input pointer is not CUDA device memory (memory_type={})",
-            attrs.memory_type
-        )));
-    }
-
-    let device_ordinal = device.ordinal() as i32;
-    if attrs.device >= 0 && attrs.device != device_ordinal {
-        return Err(MahoutError::InvalidInput(format!(
-            "Input pointer device mismatch: pointer on cuda:{}, engine on 
cuda:{}",
-            attrs.device, device_ordinal
-        )));
-    }
-
-    Ok(())
-}
-
 /// Main entry point for Mahout QDP
 ///
 /// Manages GPU context and dispatches encoding tasks.
@@ -366,18 +311,15 @@ impl QdpEngine {
     /// a raw GPU pointer directly, avoiding the GPU→CPU→GPU copy that would 
otherwise
     /// be required.
     ///
-    /// Uses the default CUDA stream. For PyTorch stream interop, use
-    /// `encode_from_gpu_ptr_with_stream`.
-    ///
     /// TODO: Refactor to use QuantumEncoder trait (add `encode_from_gpu_ptr` 
to trait)
     /// to reduce duplication with AmplitudeEncoder::encode(). This would also 
make it
     /// easier to add GPU pointer support for other encoders (angle, basis) in 
the future.
     ///
     /// # Arguments
-    /// * `input_d` - Device pointer to input data (f64 array on GPU)
-    /// * `input_len` - Number of f64 elements in the input
+    /// * `input_d` - Device pointer to input data (f64 for amplitude/angle, 
usize/int64 for basis)
+    /// * `input_len` - Number of elements in the input
     /// * `num_qubits` - Number of qubits for encoding
-    /// * `encoding_method` - Strategy (currently "amplitude" and "angle" 
supported)
+    /// * `encoding_method` - Strategy ("amplitude", "angle", or "basis")
     ///
     /// # Returns
     /// DLPack pointer for zero-copy PyTorch integration
@@ -385,12 +327,12 @@ impl QdpEngine {
     /// # Safety
     /// The input pointer must:
     /// - Point to valid GPU memory on the same device as the engine
-    /// - Contain at least `input_len` f64 elements
+    /// - Contain at least `input_len` elements of the expected dtype
     /// - Remain valid for the duration of this call
     #[cfg(target_os = "linux")]
     pub unsafe fn encode_from_gpu_ptr(
         &self,
-        input_d: *const f64,
+        input_d: *const std::ffi::c_void,
         input_len: usize,
         num_qubits: usize,
         encoding_method: &str,
@@ -406,38 +348,36 @@ impl QdpEngine {
         }
     }
 
-    /// Encode from existing GPU pointer on a specified CUDA stream.
+    /// Encode from existing GPU pointer with a specific CUDA stream.
     ///
-    /// The caller must ensure the stream is valid for the device, and that any
-    /// producer work on that stream has been enqueued before this call.
+    /// Same as [`encode_from_gpu_ptr`](Self::encode_from_gpu_ptr) but uses 
the given `stream`
+    /// for kernel launches. Pass null for default stream.
     ///
     /// # Safety
-    /// In addition to the `encode_from_gpu_ptr` requirements, the stream 
pointer
-    /// must remain valid for the duration of this call.
+    /// Same as [`encode_from_gpu_ptr`](Self::encode_from_gpu_ptr). 
Additionally, `stream` must
+    /// be a valid CUDA stream on the same device as the engine, or null.
     #[cfg(target_os = "linux")]
     pub unsafe fn encode_from_gpu_ptr_with_stream(
         &self,
-        input_d: *const f64,
+        input_d: *const std::ffi::c_void,
         input_len: usize,
         num_qubits: usize,
         encoding_method: &str,
-        stream: *mut c_void,
+        stream: *mut std::ffi::c_void,
     ) -> Result<*mut DLManagedTensor> {
         crate::profile_scope!("Mahout::EncodeFromGpuPtr");
 
-        if input_len == 0 {
-            return Err(MahoutError::InvalidInput(
-                "Input data cannot be empty".into(),
-            ));
-        }
-
-        validate_cuda_input_ptr(&self.device, input_d)?;
-
         let state_len = 1usize << num_qubits;
-        let method = encoding_method.to_lowercase();
+        let method = encoding_method.to_ascii_lowercase();
 
         match method.as_str() {
             "amplitude" => {
+                if input_len == 0 {
+                    return Err(MahoutError::InvalidInput(
+                        "Input data cannot be empty".into(),
+                    ));
+                }
+
                 if input_len > state_len {
                     return Err(MahoutError::InvalidInput(format!(
                         "Input size {} exceeds state vector size {} (2^{} 
qubits)",
@@ -445,6 +385,8 @@ impl QdpEngine {
                     )));
                 }
 
+                let input_d = input_d as *const f64;
+
                 let state_vector = {
                     crate::profile_scope!("GPU::Alloc");
                     gpu::GpuStateVector::new(&self.device, num_qubits, 
Precision::Float64)?
@@ -452,12 +394,12 @@ impl QdpEngine {
 
                 let inv_norm = {
                     crate::profile_scope!("GPU::NormFromPtr");
+                    // SAFETY: input_d validity is guaranteed by the caller's 
safety contract
                     unsafe {
-                        
gpu::AmplitudeEncoder::calculate_inv_norm_gpu_with_stream(
+                        gpu::AmplitudeEncoder::calculate_inv_norm_gpu(
                             &self.device,
                             input_d,
                             input_len,
-                            stream,
                         )?
                     }
                 };
@@ -492,7 +434,7 @@ impl QdpEngine {
 
                 {
                     crate::profile_scope!("GPU::Synchronize");
-                    sync_cuda_stream(stream, "CUDA stream synchronize 
failed")?;
+                    gpu::cuda_sync::sync_cuda_stream(stream, "CUDA stream 
synchronize failed")?;
                 }
 
                 let state_vector = state_vector.to_precision(&self.device, 
self.precision)?;
@@ -506,6 +448,8 @@ impl QdpEngine {
                     )));
                 }
 
+                let angles_d = input_d as *const f64;
+
                 let state_vector = {
                     crate::profile_scope!("GPU::Alloc");
                     gpu::GpuStateVector::new(&self.device, num_qubits, 
Precision::Float64)?
@@ -521,7 +465,7 @@ impl QdpEngine {
                     crate::profile_scope!("GPU::KernelLaunch");
                     let ret = unsafe {
                         qdp_kernels::launch_angle_encode(
-                            input_d,
+                            angles_d,
                             state_ptr as *mut std::ffi::c_void,
                             state_len,
                             num_qubits as u32,
@@ -540,14 +484,66 @@ impl QdpEngine {
 
                 {
                     crate::profile_scope!("GPU::Synchronize");
-                    sync_cuda_stream(stream, "CUDA stream synchronize 
failed")?;
+                    gpu::cuda_sync::sync_cuda_stream(stream, "CUDA stream 
synchronize failed")?;
+                }
+
+                let state_vector = state_vector.to_precision(&self.device, 
self.precision)?;
+                Ok(state_vector.to_dlpack())
+            }
+            "basis" => {
+                if input_len != 1 {
+                    return Err(MahoutError::InvalidInput(format!(
+                        "Basis encoding expects exactly 1 value (the basis 
index), got {}",
+                        input_len
+                    )));
+                }
+
+                let basis_indices_d = input_d as *const usize;
+
+                let state_vector = {
+                    crate::profile_scope!("GPU::Alloc");
+                    gpu::GpuStateVector::new(&self.device, num_qubits, 
self.precision)?
+                };
+
+                let state_ptr = state_vector.ptr_f64().ok_or_else(|| {
+                    MahoutError::InvalidInput(
+                        "State vector precision mismatch (expected float64 
buffer)".to_string(),
+                    )
+                })?;
+
+                // Use batch API with num_samples=1 to avoid D2H copy; 
launch_basis_encode takes host usize.
+                {
+                    crate::profile_scope!("GPU::KernelLaunch");
+                    let ret = unsafe {
+                        qdp_kernels::launch_basis_encode_batch(
+                            basis_indices_d,
+                            state_ptr as *mut std::ffi::c_void,
+                            1,
+                            state_len,
+                            num_qubits as u32,
+                            stream,
+                        )
+                    };
+
+                    if ret != 0 {
+                        return Err(MahoutError::KernelLaunch(format!(
+                            "Basis encoding kernel failed with CUDA error 
code: {} ({})",
+                            ret,
+                            cuda_error_to_string(ret)
+                        )));
+                    }
+                }
+
+                {
+                    crate::profile_scope!("GPU::Synchronize");
+                    gpu::cuda_sync::sync_cuda_stream(stream, "CUDA stream 
synchronize failed")?;
                 }
 
                 let state_vector = state_vector.to_precision(&self.device, 
self.precision)?;
                 Ok(state_vector.to_dlpack())
             }
             _ => Err(MahoutError::NotImplemented(format!(
-                "GPU pointer encoding currently only supports 'amplitude' and 
'angle' methods, got '{}'",
+                "GPU pointer encoding currently only supports 'amplitude', 
'angle', or 'basis' methods, got '{}'",
                 encoding_method
             ))),
         }
@@ -556,17 +552,15 @@ impl QdpEngine {
     /// Encode batch from existing GPU pointer (zero-copy for CUDA tensors)
     ///
     /// This method enables zero-copy batch encoding from PyTorch CUDA tensors.
-    /// Uses the default CUDA stream. For PyTorch stream interop, use
-    /// `encode_batch_from_gpu_ptr_with_stream`.
     ///
     /// TODO: Refactor to use QuantumEncoder trait (see `encode_from_gpu_ptr` 
TODO).
     ///
     /// # Arguments
-    /// * `input_batch_d` - Device pointer to batch input data (flattened f64 
array on GPU)
+    /// * `input_batch_d` - Device pointer to batch input data (f64 for 
amplitude/angle, usize/int64 for basis)
     /// * `num_samples` - Number of samples in the batch
-    /// * `sample_size` - Size of each sample in f64 elements
+    /// * `sample_size` - Size of each sample in elements
     /// * `num_qubits` - Number of qubits for encoding
-    /// * `encoding_method` - Strategy (currently "amplitude" and "angle" 
supported)
+    /// * `encoding_method` - Strategy ("amplitude", "angle", or "basis")
     ///
     /// # Returns
     /// Single DLPack pointer containing all encoded states (shape: 
[num_samples, 2^num_qubits])
@@ -574,12 +568,12 @@ impl QdpEngine {
     /// # Safety
     /// The input pointer must:
     /// - Point to valid GPU memory on the same device as the engine
-    /// - Contain at least `num_samples * sample_size` f64 elements
+    /// - Contain at least `num_samples * sample_size` elements of the 
expected dtype
     /// - Remain valid for the duration of this call
     #[cfg(target_os = "linux")]
     pub unsafe fn encode_batch_from_gpu_ptr(
         &self,
-        input_batch_d: *const f64,
+        input_batch_d: *const std::ffi::c_void,
         num_samples: usize,
         sample_size: usize,
         num_qubits: usize,
@@ -597,42 +591,43 @@ impl QdpEngine {
         }
     }
 
-    /// Encode batch from existing GPU pointer on a specified CUDA stream.
+    /// Encode batch from existing GPU pointer with a specific CUDA stream.
+    ///
+    /// Same as [`encode_batch_from_gpu_ptr`](Self::encode_batch_from_gpu_ptr) 
but uses the given
+    /// `stream` for kernel launches. Pass null for default stream.
     ///
     /// # Safety
-    /// In addition to the `encode_batch_from_gpu_ptr` requirements, the 
stream pointer
-    /// must remain valid for the duration of this call.
+    /// Same as 
[`encode_batch_from_gpu_ptr`](Self::encode_batch_from_gpu_ptr). Additionally,
+    /// `stream` must be a valid CUDA stream on the same device as the engine, 
or null.
     #[cfg(target_os = "linux")]
     pub unsafe fn encode_batch_from_gpu_ptr_with_stream(
         &self,
-        input_batch_d: *const f64,
+        input_batch_d: *const std::ffi::c_void,
         num_samples: usize,
         sample_size: usize,
         num_qubits: usize,
         encoding_method: &str,
-        stream: *mut c_void,
+        stream: *mut std::ffi::c_void,
     ) -> Result<*mut DLManagedTensor> {
         crate::profile_scope!("Mahout::EncodeBatchFromGpuPtr");
 
+        let state_len = 1usize << num_qubits;
+        let method = encoding_method.to_ascii_lowercase();
+
         if num_samples == 0 {
             return Err(MahoutError::InvalidInput(
                 "Number of samples cannot be zero".into(),
             ));
         }
 
-        if sample_size == 0 {
-            return Err(MahoutError::InvalidInput(
-                "Sample size cannot be zero".into(),
-            ));
-        }
-
-        validate_cuda_input_ptr(&self.device, input_batch_d)?;
-
-        let state_len = 1usize << num_qubits;
-        let method = encoding_method.to_ascii_lowercase();
-
         match method.as_str() {
             "amplitude" => {
+                if sample_size == 0 {
+                    return Err(MahoutError::InvalidInput(
+                        "Sample size cannot be zero".into(),
+                    ));
+                }
+
                 if sample_size > state_len {
                     return Err(MahoutError::InvalidInput(format!(
                         "Sample size {} exceeds state vector size {} (2^{} 
qubits)",
@@ -640,6 +635,8 @@ impl QdpEngine {
                     )));
                 }
 
+                let input_batch_d = input_batch_d as *const f64;
+
                 let batch_state_vector = {
                     crate::profile_scope!("GPU::AllocBatch");
                     gpu::GpuStateVector::new_batch(&self.device, num_samples, 
num_qubits)?
@@ -677,9 +674,9 @@ impl QdpEngine {
                     buffer
                 };
 
+                // Validate norms on host to catch zero or NaN samples early
                 {
                     crate::profile_scope!("GPU::NormValidation");
-                    sync_cuda_stream(stream, "Norm stream synchronize 
failed")?;
                     let host_inv_norms =
                         self.device.dtoh_sync_copy(&inv_norms_gpu).map_err(|e| 
{
                             MahoutError::Cuda(format!("Failed to copy norms to 
host: {:?}", e))
@@ -726,7 +723,7 @@ impl QdpEngine {
 
                 {
                     crate::profile_scope!("GPU::Synchronize");
-                    sync_cuda_stream(stream, "CUDA stream synchronize 
failed")?;
+                    gpu::cuda_sync::sync_cuda_stream(stream, "CUDA stream 
synchronize failed")?;
                 }
 
                 let batch_state_vector =
@@ -736,6 +733,12 @@ impl QdpEngine {
             "angle" => {
                 use cudarc::driver::DevicePtrMut;
 
+                if sample_size == 0 {
+                    return Err(MahoutError::InvalidInput(
+                        "Sample size cannot be zero".into(),
+                    ));
+                }
+
                 if sample_size != num_qubits {
                     return Err(MahoutError::InvalidInput(format!(
                         "Angle encoding expects sample_size={} (one angle per 
qubit), got {}",
@@ -743,6 +746,8 @@ impl QdpEngine {
                     )));
                 }
 
+                let input_batch_d = input_batch_d as *const f64;
+
                 // Validate that all input angles are finite (no NaN/Inf), 
consistent with
                 // CPU and host-side batch angle encoding paths.
                 let angle_validation_buffer = {
@@ -778,7 +783,6 @@ impl QdpEngine {
 
                 {
                     
crate::profile_scope!("GPU::AngleFiniteValidationHostCopy");
-                    sync_cuda_stream(stream, "Angle norm stream synchronize 
failed")?;
                     let host_norms = self
                         .device
                         .dtoh_sync_copy(&angle_validation_buffer)
@@ -833,7 +837,60 @@ impl QdpEngine {
 
                 {
                     crate::profile_scope!("GPU::Synchronize");
-                    sync_cuda_stream(stream, "CUDA stream synchronize 
failed")?;
+                    gpu::cuda_sync::sync_cuda_stream(stream, "CUDA stream 
synchronize failed")?;
+                }
+
+                let batch_state_vector =
+                    batch_state_vector.to_precision(&self.device, 
self.precision)?;
+                Ok(batch_state_vector.to_dlpack())
+            }
+            "basis" => {
+                if sample_size != 1 {
+                    return Err(MahoutError::InvalidInput(format!(
+                        "Basis encoding expects sample_size=1 (one index per 
sample), got {}",
+                        sample_size
+                    )));
+                }
+
+                let basis_indices_d = input_batch_d as *const usize;
+
+                let batch_state_vector = {
+                    crate::profile_scope!("GPU::AllocBatch");
+                    gpu::GpuStateVector::new_batch(&self.device, num_samples, 
num_qubits)?
+                };
+
+                let state_ptr = batch_state_vector.ptr_f64().ok_or_else(|| {
+                    MahoutError::InvalidInput(
+                        "Batch state vector precision mismatch (expected 
float64 buffer)"
+                            .to_string(),
+                    )
+                })?;
+
+                {
+                    crate::profile_scope!("GPU::BatchKernelLaunch");
+                    let ret = unsafe {
+                        qdp_kernels::launch_basis_encode_batch(
+                            basis_indices_d,
+                            state_ptr as *mut std::ffi::c_void,
+                            num_samples,
+                            state_len,
+                            num_qubits as u32,
+                            stream,
+                        )
+                    };
+
+                    if ret != 0 {
+                        return Err(MahoutError::KernelLaunch(format!(
+                            "Batch basis encoding kernel failed with CUDA 
error code: {} ({})",
+                            ret,
+                            cuda_error_to_string(ret)
+                        )));
+                    }
+                }
+
+                {
+                    crate::profile_scope!("GPU::Synchronize");
+                    gpu::cuda_sync::sync_cuda_stream(stream, "CUDA stream 
synchronize failed")?;
                 }
 
                 let batch_state_vector =
@@ -841,7 +898,7 @@ impl QdpEngine {
                 Ok(batch_state_vector.to_dlpack())
             }
             _ => Err(MahoutError::NotImplemented(format!(
-                "GPU pointer batch encoding currently only supports 
'amplitude' and 'angle' methods, got '{}'",
+                "GPU pointer batch encoding currently only supports 
'amplitude', 'angle', or 'basis' methods, got '{}'",
                 encoding_method
             ))),
         }
diff --git a/qdp/qdp-core/tests/gpu_ptr_encoding.rs 
b/qdp/qdp-core/tests/gpu_ptr_encoding.rs
new file mode 100644
index 000000000..7851eb1e1
--- /dev/null
+++ b/qdp/qdp-core/tests/gpu_ptr_encoding.rs
@@ -0,0 +1,425 @@
+//
+// Licensed to the Apache Software Foundation (ASF) under one or more
+// contributor license agreements.  See the NOTICE file distributed with
+// this work for additional information regarding copyright ownership.
+// The ASF licenses this file to You under the Apache License, Version 2.0
+// (the "License"); you may not use this file except in compliance with
+// the License.  You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Unit and integration tests for encode_from_gpu_ptr and 
encode_batch_from_gpu_ptr.
+
+#![cfg(target_os = "linux")]
+
+use std::ffi::c_void;
+
+use cudarc::driver::{CudaDevice, DevicePtr};
+use qdp_core::{MahoutError, QdpEngine};
+
+mod common;
+
+// ---- Validation / error-path tests (return before using pointer) ----
+
#[test]
fn test_encode_from_gpu_ptr_unknown_method() {
    // Skip silently when no GPU / CUDA driver is available.
    let Ok(engine) = QdpEngine::new(0) else {
        return;
    };

    // Method validation happens before the pointer is used, so null is safe here.
    let outcome =
        unsafe { engine.encode_from_gpu_ptr(std::ptr::null(), 4, 2, "unknown_encoding") };

    match outcome {
        Err(MahoutError::NotImplemented(msg)) => {
            assert!(msg.contains("unknown_encoding") || msg.contains("only supports"));
        }
        other => panic!("expected NotImplemented, got {:?}", other),
    }
}
+
#[test]
fn test_encode_from_gpu_ptr_amplitude_empty_input() {
    // Skip silently when no GPU / CUDA driver is available.
    let Ok(engine) = QdpEngine::new(0) else {
        return;
    };

    // input_len == 0 must be rejected before the device pointer is dereferenced.
    let outcome = unsafe { engine.encode_from_gpu_ptr(std::ptr::null(), 0, 2, "amplitude") };

    match outcome {
        Err(MahoutError::InvalidInput(msg)) => {
            assert!(msg.contains("empty") || msg.contains("cannot be empty"));
        }
        other => panic!("expected InvalidInput, got {:?}", other),
    }
}
+
#[test]
fn test_encode_from_gpu_ptr_amplitude_input_exceeds_state() {
    // Skip silently when no GPU / CUDA driver is available.
    let Ok(engine) = QdpEngine::new(0) else {
        return;
    };

    // 2 qubits give a state vector of length 4; input_len = 10 cannot fit.
    let outcome = unsafe { engine.encode_from_gpu_ptr(std::ptr::null(), 10, 2, "amplitude") };

    match outcome {
        Err(MahoutError::InvalidInput(msg)) => {
            assert!(msg.contains("exceeds") && msg.contains("state"));
        }
        other => panic!("expected InvalidInput, got {:?}", other),
    }
}
+
#[test]
fn test_encode_batch_from_gpu_ptr_unknown_method() {
    // Skip silently when no GPU / CUDA driver is available.
    let Ok(engine) = QdpEngine::new(0) else {
        return;
    };

    // Method validation precedes any pointer use, so a null pointer is safe.
    let outcome =
        unsafe { engine.encode_batch_from_gpu_ptr(std::ptr::null(), 2, 4, 2, "unknown_method") };

    match outcome {
        Err(MahoutError::NotImplemented(msg)) => {
            assert!(msg.contains("unknown_method") || msg.contains("only supports"));
        }
        other => panic!("expected NotImplemented, got {:?}", other),
    }
}
+
#[test]
fn test_encode_batch_from_gpu_ptr_amplitude_num_samples_zero() {
    // Skip silently when no GPU / CUDA driver is available.
    let Ok(engine) = QdpEngine::new(0) else {
        return;
    };

    // A batch of zero samples is invalid and must fail during validation.
    let outcome =
        unsafe { engine.encode_batch_from_gpu_ptr(std::ptr::null(), 0, 4, 2, "amplitude") };

    match outcome {
        Err(MahoutError::InvalidInput(msg)) => {
            assert!(msg.contains("zero") || msg.contains("samples"));
        }
        other => panic!("expected InvalidInput, got {:?}", other),
    }
}
+
#[test]
fn test_encode_from_gpu_ptr_basis_input_len_not_one() {
    // Skip silently when no GPU / CUDA driver is available.
    let Ok(engine) = QdpEngine::new(0) else {
        return;
    };

    // Basis single encoding expects exactly 1 value; any other input_len must
    // be rejected before the device pointer is used. The original duplicated
    // the identical check for lengths 0 and 3; loop over several invalid
    // lengths instead (2 added for extra coverage).
    for input_len in [0usize, 2, 3] {
        let result =
            unsafe { engine.encode_from_gpu_ptr(std::ptr::null(), input_len, 2, "basis") };
        match result {
            Err(MahoutError::InvalidInput(msg)) => {
                assert!(
                    msg.contains("exactly 1") || msg.contains("basis"),
                    "unexpected message for input_len={}: {}",
                    input_len,
                    msg
                );
            }
            other => panic!(
                "expected InvalidInput for input_len={}, got {:?}",
                input_len, other
            ),
        }
    }
}
+
#[test]
fn test_encode_batch_from_gpu_ptr_basis_sample_size_not_one() {
    // Skip silently when no GPU / CUDA driver is available.
    let Ok(engine) = QdpEngine::new(0) else {
        return;
    };

    // Basis batch encoding takes exactly one index per sample;
    // sample_size = 4 must be rejected during validation.
    let outcome =
        unsafe { engine.encode_batch_from_gpu_ptr(std::ptr::null(), 2, 4, 2, "basis") };

    match outcome {
        Err(MahoutError::InvalidInput(msg)) => {
            assert!(msg.contains("sample_size=1") || msg.contains("one index"));
        }
        other => panic!("expected InvalidInput for sample_size != 1, got {:?}", other),
    }
}
+
+// ---- Happy-path tests (real GPU memory) ----
+
#[test]
fn test_encode_from_gpu_ptr_amplitude_success() {
    // Happy path: requires a working CUDA device; skip (with a note) otherwise.
    let Ok(engine) = QdpEngine::new(0) else {
        println!("SKIP: No GPU available");
        return;
    };
    let Ok(device) = CudaDevice::new(0) else {
        println!("SKIP: No CUDA device");
        return;
    };

    let num_qubits = 4;
    let state_len = 1 << num_qubits;
    let host_data = common::create_test_data(state_len);

    // Stage the input on the device; the engine reads it via a raw pointer.
    let device_buf = match device.htod_sync_copy(host_data.as_slice()) {
        Ok(b) => b,
        Err(_) => {
            println!("SKIP: Failed to copy to device");
            return;
        }
    };
    let raw = *device_buf.device_ptr() as *const f64 as *const c_void;

    let dlpack_ptr = unsafe {
        engine
            .encode_from_gpu_ptr(raw, host_data.len(), num_qubits, "amplitude")
            .expect("encode_from_gpu_ptr should succeed")
    };
    assert!(!dlpack_ptr.is_null(), "DLPack pointer should not be null");

    // Release the result through its own DLPack deleter so the GPU state is freed.
    unsafe {
        let managed = &mut *dlpack_ptr;
        assert!(managed.deleter.is_some(), "Deleter must be present");
        let deleter = managed
            .deleter
            .take()
            .expect("Deleter function pointer is missing");
        deleter(dlpack_ptr);
    }
}
+
#[test]
fn test_encode_from_gpu_ptr_with_stream_amplitude_success() {
    // Happy path for the explicit-stream variant; skip when no GPU is present.
    let Ok(engine) = QdpEngine::new(0) else {
        println!("SKIP: No GPU available");
        return;
    };
    let Ok(device) = CudaDevice::new(0) else {
        println!("SKIP: No CUDA device");
        return;
    };

    let num_qubits = 3;
    let host_data = common::create_test_data(1 << num_qubits);

    let device_buf = match device.htod_sync_copy(host_data.as_slice()) {
        Ok(b) => b,
        Err(_) => {
            println!("SKIP: Failed to copy to device");
            return;
        }
    };
    let raw = *device_buf.device_ptr() as *const f64 as *const c_void;

    // Passing a null stream pointer (presumably selects the default CUDA
    // stream — matches the original test's intent).
    let dlpack_ptr = unsafe {
        engine
            .encode_from_gpu_ptr_with_stream(
                raw,
                host_data.len(),
                num_qubits,
                "amplitude",
                std::ptr::null_mut(),
            )
            .expect("encode_from_gpu_ptr_with_stream should succeed")
    };
    assert!(!dlpack_ptr.is_null());

    // Free the capsule via its DLPack deleter.
    unsafe {
        let managed = &mut *dlpack_ptr;
        let deleter = managed.deleter.take().expect("Deleter missing");
        deleter(dlpack_ptr);
    }
}
+
#[test]
fn test_encode_batch_from_gpu_ptr_amplitude_success() {
    // Happy path for batch amplitude encoding; skip when no GPU is present.
    let Ok(engine) = QdpEngine::new(0) else {
        println!("SKIP: No GPU available");
        return;
    };
    let Ok(device) = CudaDevice::new(0) else {
        println!("SKIP: No CUDA device");
        return;
    };

    let num_qubits = 3;
    let state_len = 1 << num_qubits;
    let num_samples = 4;
    let sample_size = state_len;
    // Contiguous batch buffer: num_samples * sample_size values.
    let host_data = common::create_test_data(num_samples * sample_size);

    let device_buf = match device.htod_sync_copy(host_data.as_slice()) {
        Ok(b) => b,
        Err(_) => {
            println!("SKIP: Failed to copy to device");
            return;
        }
    };
    let raw = *device_buf.device_ptr() as *const f64 as *const c_void;

    let dlpack_ptr = unsafe {
        engine
            .encode_batch_from_gpu_ptr(raw, num_samples, sample_size, num_qubits, "amplitude")
            .expect("encode_batch_from_gpu_ptr should succeed")
    };
    assert!(!dlpack_ptr.is_null());

    // Free the capsule via its DLPack deleter.
    unsafe {
        let managed = &mut *dlpack_ptr;
        let deleter = managed.deleter.take().expect("Deleter missing");
        deleter(dlpack_ptr);
    }
}
+
#[test]
fn test_encode_from_gpu_ptr_basis_success() {
    // Happy path for single basis encoding; skip when no GPU is present.
    let Ok(engine) = QdpEngine::new(0) else {
        println!("SKIP: No GPU available");
        return;
    };
    let Ok(device) = CudaDevice::new(0) else {
        println!("SKIP: No CUDA device");
        return;
    };

    let num_qubits = 3;
    // Use a nonzero index: index 0 encodes |000>, which a default/zero-written
    // state could also produce, so it cannot distinguish a no-op kernel from a
    // correct one. Any index < 2^num_qubits (here 8) is valid.
    let basis_index: usize = 5;

    // Upload the single index; the engine reads it through a raw device pointer.
    let indices: Vec<usize> = vec![basis_index];
    let indices_d = match device.htod_sync_copy(indices.as_slice()) {
        Ok(b) => b,
        Err(_) => {
            println!("SKIP: Failed to copy to device");
            return;
        }
    };
    let ptr = *indices_d.device_ptr() as *const usize as *const c_void;

    let dlpack_ptr = unsafe {
        engine
            .encode_from_gpu_ptr(ptr, 1, num_qubits, "basis")
            .expect("encode_from_gpu_ptr basis should succeed")
    };
    assert!(!dlpack_ptr.is_null());

    // Release the result through its DLPack deleter so the GPU state is freed.
    unsafe {
        let managed = &mut *dlpack_ptr;
        assert!(managed.deleter.is_some(), "Deleter must be present");
        let deleter = managed
            .deleter
            .take()
            .expect("Deleter function pointer is missing");
        deleter(dlpack_ptr);
    }
}
+
#[test]
fn test_encode_batch_from_gpu_ptr_basis_success() {
    // Happy path for batch basis encoding; skip when no GPU is present.
    let Ok(engine) = QdpEngine::new(0) else {
        println!("SKIP: No GPU available");
        return;
    };
    let Ok(device) = CudaDevice::new(0) else {
        println!("SKIP: No CUDA device");
        return;
    };

    let num_qubits = 3;
    let num_samples = 4;
    // Basis batches carry exactly one index per sample.
    let sample_size = 1;
    let state_len = 1 << num_qubits;

    // One basis index per sample, wrapped into the valid state range.
    let basis_indices: Vec<usize> = (0..num_samples).map(|i| i % state_len).collect();
    let indices_d = match device.htod_sync_copy(basis_indices.as_slice()) {
        Ok(b) => b,
        Err(_) => {
            println!("SKIP: Failed to copy to device");
            return;
        }
    };
    let ptr = *indices_d.device_ptr() as *const usize as *const c_void;

    let dlpack_ptr = unsafe {
        engine
            .encode_batch_from_gpu_ptr(ptr, num_samples, sample_size, num_qubits, "basis")
            .expect("encode_batch_from_gpu_ptr basis should succeed")
    };
    assert!(!dlpack_ptr.is_null());

    // Free the capsule via its DLPack deleter.
    unsafe {
        let managed = &mut *dlpack_ptr;
        let deleter = managed.deleter.take().expect("Deleter missing");
        deleter(dlpack_ptr);
    }
}
diff --git a/qdp/qdp-python/src/lib.rs b/qdp/qdp-python/src/lib.rs
index fd655b2da..69743a862 100644
--- a/qdp/qdp-python/src/lib.rs
+++ b/qdp/qdp-python/src/lib.rs
@@ -276,30 +276,44 @@ fn get_torch_cuda_stream_ptr(tensor: &Bound<'_, PyAny>) 
-> PyResult<*mut c_void>
 }
 
 /// Validate a CUDA tensor for direct GPU encoding
-/// Checks: dtype=float64, contiguous, non-empty, device_id matches engine
+/// Checks: dtype matches encoding method, contiguous, non-empty, device_id 
matches engine
 fn validate_cuda_tensor_for_encoding(
     tensor: &Bound<'_, PyAny>,
     expected_device_id: usize,
     encoding_method: &str,
 ) -> PyResult<()> {
     let method = encoding_method.to_ascii_lowercase();
-    // Check encoding method support (currently amplitude and angle are 
supported for CUDA tensors)
-    if method != "amplitude" && method != "angle" {
-        return Err(PyRuntimeError::new_err(format!(
-            "CUDA tensor encoding currently only supports 'amplitude' and 
'angle' methods, got '{}'. \
-             Use tensor.cpu() to convert to CPU tensor for other encoding 
methods.",
-            encoding_method
-        )));
-    }
 
-    // Check dtype is float64
+    // Check encoding method support and dtype (ASCII lowercase for 
case-insensitive match).
     let dtype = tensor.getattr("dtype")?;
     let dtype_str: String = dtype.str()?.extract()?;
-    if !dtype_str.contains("float64") {
-        return Err(PyRuntimeError::new_err(format!(
-            "CUDA tensor must have dtype float64, got {}. Use 
tensor.to(torch.float64)",
-            dtype_str
-        )));
+    let dtype_str_lower = dtype_str.to_ascii_lowercase();
+    match method.as_str() {
+        "amplitude" | "angle" => {
+            if !dtype_str_lower.contains("float64") {
+                return Err(PyRuntimeError::new_err(format!(
+                    "CUDA tensor must have dtype float64 for {} encoding, got 
{}. \
+                     Use tensor.to(torch.float64)",
+                    method, dtype_str
+                )));
+            }
+        }
+        "basis" => {
+            if !dtype_str_lower.contains("int64") {
+                return Err(PyRuntimeError::new_err(format!(
+                    "CUDA tensor must have dtype int64 for basis encoding, got 
{}. \
+                     Use tensor.to(torch.int64)",
+                    dtype_str
+                )));
+            }
+        }
+        _ => {
+            return Err(PyRuntimeError::new_err(format!(
+                "CUDA tensor encoding currently only supports 'amplitude', 
'angle', or 'basis' methods, got '{}'. \
+                 Use tensor.cpu() to convert to CPU tensor for other encoding 
methods.",
+                encoding_method
+            )));
+        }
     }
 
     // Check contiguous
@@ -370,7 +384,7 @@ struct DLPackTensorInfo {
     /// This is owned by this struct and will be freed via deleter on drop
     managed_ptr: *mut DLManagedTensor,
     /// Data pointer inside dl_tensor (GPU memory, owned by managed_ptr)
-    data_ptr: *const f64,
+    data_ptr: *const c_void,
     shape: Vec<i64>,
     /// CUDA device ID from DLPack metadata.
     /// Used for defensive validation against PyTorch API device ID.
@@ -530,7 +544,7 @@ fn extract_dlpack_tensor(_py: Python<'_>, tensor: 
&Bound<'_, PyAny>) -> PyResult
 
         Ok(DLPackTensorInfo {
             managed_ptr,
-            data_ptr,
+            data_ptr: data_ptr as *const std::ffi::c_void,
             shape,
             device_id,
         })
@@ -654,7 +668,7 @@ impl QdpEngine {
                         let ptr = unsafe {
                             self.engine
                                 .encode_from_gpu_ptr_with_stream(
-                                    tensor_info.data_ptr,
+                                    tensor_info.data_ptr as *const 
std::ffi::c_void,
                                     input_len,
                                     num_qubits,
                                     encoding_method,
@@ -677,7 +691,7 @@ impl QdpEngine {
                         let ptr = unsafe {
                             self.engine
                                 .encode_batch_from_gpu_ptr_with_stream(
-                                    tensor_info.data_ptr,
+                                    tensor_info.data_ptr as *const 
std::ffi::c_void,
                                     num_samples,
                                     sample_size,
                                     num_qubits,


Reply via email to