This is an automated email from the ASF dual-hosted git repository.
guanmingchiu pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/mahout.git
The following commit(s) were added to refs/heads/main by this push:
new 8cf771add [QDP] basis GPU-pointer support (#934)
8cf771add is described below
commit 8cf771addf3e522fa540b55aaa472df90b212584
Author: Jie-Kai Chang <[email protected]>
AuthorDate: Mon Feb 2 01:46:29 2026 +0800
[QDP] basis GPU-pointer support (#934)
* basis GPU-pointer support
Signed-off-by: 400Ping <[email protected]>
* fix pre-commit
Signed-off-by: 400Ping <[email protected]>
* update
Signed-off-by: 400Ping <[email protected]>
* fix conflicts
Signed-off-by: 400Ping <[email protected]>
* fix conflicts
Signed-off-by: 400Ping <[email protected]>
* Revert "fix conflicts"
This reverts commit fd353c3c81d282718d566ff8eac8312e0144aaaf.
Signed-off-by: 400Ping <[email protected]>
* Revert "fix conflicts"
This reverts commit 2dd586838337f1e41e1bf42854cd18fa51d8d113.
Signed-off-by: 400Ping <[email protected]>
* fix ci
Signed-off-by: 400Ping <[email protected]>
* fix
Signed-off-by: 400Ping <[email protected]>
* fix pre-commit
Signed-off-by: 400Ping <[email protected]>
* add unit test
Signed-off-by: 400Ping <[email protected]>
* fix ci error
Signed-off-by: 400Ping <[email protected]>
* update
Signed-off-by: 400Ping <[email protected]>
* fix build
Signed-off-by: 400Ping <[email protected]>
* fix pre-commit
Signed-off-by: 400Ping <[email protected]>
---------
Signed-off-by: 400Ping <[email protected]>
Signed-off-by: 400Ping <[email protected]>
---
qdp/qdp-core/src/gpu/cuda_ffi.rs | 4 +
qdp/qdp-core/src/gpu/encodings/amplitude.rs | 4 +-
qdp/qdp-core/src/gpu/encodings/angle.rs | 4 +-
qdp/qdp-core/src/gpu/encodings/basis.rs | 4 +-
qdp/qdp-core/src/gpu/encodings/iqp.rs | 4 +-
qdp/qdp-core/src/lib.rs | 283 ++++++++++--------
qdp/qdp-core/tests/gpu_ptr_encoding.rs | 425 ++++++++++++++++++++++++++++
qdp/qdp-python/src/lib.rs | 52 ++--
8 files changed, 640 insertions(+), 140 deletions(-)
diff --git a/qdp/qdp-core/src/gpu/cuda_ffi.rs b/qdp/qdp-core/src/gpu/cuda_ffi.rs
index 491e1382b..2ed60c311 100644
--- a/qdp/qdp-core/src/gpu/cuda_ffi.rs
+++ b/qdp/qdp-core/src/gpu/cuda_ffi.rs
@@ -21,9 +21,12 @@ use std::ffi::c_void;
pub(crate) const CUDA_MEMCPY_HOST_TO_DEVICE: u32 = 1;
pub(crate) const CUDA_EVENT_DISABLE_TIMING: u32 = 0x02;
pub(crate) const CUDA_EVENT_DEFAULT: u32 = 0x00;
+#[allow(dead_code)]
pub(crate) const CUDA_MEMORY_TYPE_DEVICE: i32 = 2;
+#[allow(dead_code)]
pub(crate) const CUDA_MEMORY_TYPE_MANAGED: i32 = 3;
+#[allow(dead_code)]
#[repr(C)]
pub(crate) struct CudaPointerAttributes {
pub memory_type: i32,
@@ -45,6 +48,7 @@ unsafe extern "C" {
pub(crate) fn cudaHostAlloc(pHost: *mut *mut c_void, size: usize, flags:
u32) -> i32;
pub(crate) fn cudaFreeHost(ptr: *mut c_void) -> i32;
+ #[allow(dead_code)]
pub(crate) fn cudaPointerGetAttributes(
attributes: *mut CudaPointerAttributes,
ptr: *const c_void,
diff --git a/qdp/qdp-core/src/gpu/encodings/amplitude.rs
b/qdp/qdp-core/src/gpu/encodings/amplitude.rs
index 62313550d..032418c46 100644
--- a/qdp/qdp-core/src/gpu/encodings/amplitude.rs
+++ b/qdp/qdp-core/src/gpu/encodings/amplitude.rs
@@ -26,7 +26,7 @@ use super::QuantumEncoder;
#[cfg(target_os = "linux")]
use crate::error::cuda_error_to_string;
use crate::error::{MahoutError, Result};
-use crate::gpu::memory::GpuStateVector;
+use crate::gpu::memory::{GpuStateVector, Precision};
use crate::gpu::pipeline::run_dual_stream_pipeline;
use cudarc::driver::CudaDevice;
@@ -70,7 +70,7 @@ impl QuantumEncoder for AmplitudeEncoder {
// Allocate GPU state vector
let state_vector = {
crate::profile_scope!("GPU::Alloc");
- GpuStateVector::new(_device, num_qubits,
crate::gpu::memory::Precision::Float64)?
+ GpuStateVector::new(_device, num_qubits, Precision::Float64)?
};
// Async Pipeline for large data
diff --git a/qdp/qdp-core/src/gpu/encodings/angle.rs
b/qdp/qdp-core/src/gpu/encodings/angle.rs
index 2a91cb017..d1a1091d5 100644
--- a/qdp/qdp-core/src/gpu/encodings/angle.rs
+++ b/qdp/qdp-core/src/gpu/encodings/angle.rs
@@ -24,7 +24,7 @@ use super::{QuantumEncoder, validate_qubit_count};
#[cfg(target_os = "linux")]
use crate::error::cuda_error_to_string;
use crate::error::{MahoutError, Result};
-use crate::gpu::memory::GpuStateVector;
+use crate::gpu::memory::{GpuStateVector, Precision};
#[cfg(target_os = "linux")]
use crate::gpu::pipeline::run_dual_stream_pipeline_aligned;
use cudarc::driver::CudaDevice;
@@ -63,7 +63,7 @@ impl QuantumEncoder for AngleEncoder {
let state_vector = {
crate::profile_scope!("GPU::Alloc");
- GpuStateVector::new(device, num_qubits,
crate::gpu::memory::Precision::Float64)?
+ GpuStateVector::new(device, num_qubits, Precision::Float64)?
};
let state_ptr = state_vector.ptr_f64().ok_or_else(|| {
diff --git a/qdp/qdp-core/src/gpu/encodings/basis.rs
b/qdp/qdp-core/src/gpu/encodings/basis.rs
index e883372f5..33e8f14bf 100644
--- a/qdp/qdp-core/src/gpu/encodings/basis.rs
+++ b/qdp/qdp-core/src/gpu/encodings/basis.rs
@@ -24,7 +24,7 @@ use super::{QuantumEncoder, validate_qubit_count};
#[cfg(target_os = "linux")]
use crate::error::cuda_error_to_string;
use crate::error::{MahoutError, Result};
-use crate::gpu::memory::GpuStateVector;
+use crate::gpu::memory::{GpuStateVector, Precision};
use cudarc::driver::CudaDevice;
use std::sync::Arc;
@@ -76,7 +76,7 @@ impl QuantumEncoder for BasisEncoder {
// Allocate GPU state vector
let state_vector = {
crate::profile_scope!("GPU::Alloc");
- GpuStateVector::new(device, num_qubits,
crate::gpu::memory::Precision::Float64)?
+ GpuStateVector::new(device, num_qubits, Precision::Float64)?
};
let state_ptr = state_vector.ptr_f64().ok_or_else(|| {
diff --git a/qdp/qdp-core/src/gpu/encodings/iqp.rs
b/qdp/qdp-core/src/gpu/encodings/iqp.rs
index 245229a40..7a177a208 100644
--- a/qdp/qdp-core/src/gpu/encodings/iqp.rs
+++ b/qdp/qdp-core/src/gpu/encodings/iqp.rs
@@ -20,7 +20,7 @@ use super::QuantumEncoder;
#[cfg(target_os = "linux")]
use crate::error::cuda_error_to_string;
use crate::error::{MahoutError, Result};
-use crate::gpu::memory::GpuStateVector;
+use crate::gpu::memory::{GpuStateVector, Precision};
use cudarc::driver::CudaDevice;
use std::sync::Arc;
@@ -87,7 +87,7 @@ impl QuantumEncoder for IqpEncoder {
let state_vector = {
crate::profile_scope!("GPU::Alloc");
- GpuStateVector::new(device, num_qubits,
crate::gpu::memory::Precision::Float64)?
+ GpuStateVector::new(device, num_qubits, Precision::Float64)?
};
let state_ptr = state_vector.ptr_f64().ok_or_else(|| {
diff --git a/qdp/qdp-core/src/lib.rs b/qdp/qdp-core/src/lib.rs
index a8028c62e..6e50414a9 100644
--- a/qdp/qdp-core/src/lib.rs
+++ b/qdp/qdp-core/src/lib.rs
@@ -35,67 +35,12 @@ mod profiling;
pub use error::{MahoutError, Result, cuda_error_to_string};
pub use gpu::memory::Precision;
-#[cfg(target_os = "linux")]
-use std::ffi::c_void;
use std::sync::Arc;
use crate::dlpack::DLManagedTensor;
-#[cfg(target_os = "linux")]
-use crate::gpu::cuda_sync::sync_cuda_stream;
use crate::gpu::get_encoder;
use cudarc::driver::CudaDevice;
-#[cfg(target_os = "linux")]
-fn validate_cuda_input_ptr(device: &CudaDevice, ptr: *const f64) -> Result<()>
{
- use crate::gpu::cuda_ffi::{
- CUDA_MEMORY_TYPE_DEVICE, CUDA_MEMORY_TYPE_MANAGED,
CudaPointerAttributes,
- cudaPointerGetAttributes,
- };
- use std::ffi::c_void;
-
- if ptr.is_null() {
- return Err(MahoutError::InvalidInput(
- "Input GPU pointer is null".to_string(),
- ));
- }
-
- let mut attrs = CudaPointerAttributes {
- memory_type: 0,
- device: 0,
- device_pointer: std::ptr::null_mut(),
- host_pointer: std::ptr::null_mut(),
- is_managed: 0,
- allocation_flags: 0,
- };
-
- let ret = unsafe { cudaPointerGetAttributes(&mut attrs as *mut _, ptr as
*const c_void) };
- if ret != 0 {
- return Err(MahoutError::InvalidInput(format!(
- "cudaPointerGetAttributes failed for input pointer: {} ({})",
- ret,
- cuda_error_to_string(ret)
- )));
- }
-
- if attrs.memory_type != CUDA_MEMORY_TYPE_DEVICE && attrs.memory_type !=
CUDA_MEMORY_TYPE_MANAGED
- {
- return Err(MahoutError::InvalidInput(format!(
- "Input pointer is not CUDA device memory (memory_type={})",
- attrs.memory_type
- )));
- }
-
- let device_ordinal = device.ordinal() as i32;
- if attrs.device >= 0 && attrs.device != device_ordinal {
- return Err(MahoutError::InvalidInput(format!(
- "Input pointer device mismatch: pointer on cuda:{}, engine on
cuda:{}",
- attrs.device, device_ordinal
- )));
- }
-
- Ok(())
-}
-
/// Main entry point for Mahout QDP
///
/// Manages GPU context and dispatches encoding tasks.
@@ -366,18 +311,15 @@ impl QdpEngine {
/// a raw GPU pointer directly, avoiding the GPU→CPU→GPU copy that would
otherwise
/// be required.
///
- /// Uses the default CUDA stream. For PyTorch stream interop, use
- /// `encode_from_gpu_ptr_with_stream`.
- ///
/// TODO: Refactor to use QuantumEncoder trait (add `encode_from_gpu_ptr`
to trait)
/// to reduce duplication with AmplitudeEncoder::encode(). This would also
make it
/// easier to add GPU pointer support for other encoders (angle, basis) in
the future.
///
/// # Arguments
- /// * `input_d` - Device pointer to input data (f64 array on GPU)
- /// * `input_len` - Number of f64 elements in the input
+ /// * `input_d` - Device pointer to input data (f64 for amplitude/angle,
usize/int64 for basis)
+ /// * `input_len` - Number of elements in the input
/// * `num_qubits` - Number of qubits for encoding
- /// * `encoding_method` - Strategy (currently "amplitude" and "angle"
supported)
+ /// * `encoding_method` - Strategy ("amplitude", "angle", or "basis")
///
/// # Returns
/// DLPack pointer for zero-copy PyTorch integration
@@ -385,12 +327,12 @@ impl QdpEngine {
/// # Safety
/// The input pointer must:
/// - Point to valid GPU memory on the same device as the engine
- /// - Contain at least `input_len` f64 elements
+ /// - Contain at least `input_len` elements of the expected dtype
/// - Remain valid for the duration of this call
#[cfg(target_os = "linux")]
pub unsafe fn encode_from_gpu_ptr(
&self,
- input_d: *const f64,
+ input_d: *const std::ffi::c_void,
input_len: usize,
num_qubits: usize,
encoding_method: &str,
@@ -406,38 +348,36 @@ impl QdpEngine {
}
}
- /// Encode from existing GPU pointer on a specified CUDA stream.
+ /// Encode from existing GPU pointer with a specific CUDA stream.
///
- /// The caller must ensure the stream is valid for the device, and that any
- /// producer work on that stream has been enqueued before this call.
+ /// Same as [`encode_from_gpu_ptr`](Self::encode_from_gpu_ptr) but uses
the given `stream`
+ /// for kernel launches. Pass null for default stream.
///
/// # Safety
- /// In addition to the `encode_from_gpu_ptr` requirements, the stream
pointer
- /// must remain valid for the duration of this call.
+ /// Same as [`encode_from_gpu_ptr`](Self::encode_from_gpu_ptr).
Additionally, `stream` must
+ /// be a valid CUDA stream on the same device as the engine, or null.
#[cfg(target_os = "linux")]
pub unsafe fn encode_from_gpu_ptr_with_stream(
&self,
- input_d: *const f64,
+ input_d: *const std::ffi::c_void,
input_len: usize,
num_qubits: usize,
encoding_method: &str,
- stream: *mut c_void,
+ stream: *mut std::ffi::c_void,
) -> Result<*mut DLManagedTensor> {
crate::profile_scope!("Mahout::EncodeFromGpuPtr");
- if input_len == 0 {
- return Err(MahoutError::InvalidInput(
- "Input data cannot be empty".into(),
- ));
- }
-
- validate_cuda_input_ptr(&self.device, input_d)?;
-
let state_len = 1usize << num_qubits;
- let method = encoding_method.to_lowercase();
+ let method = encoding_method.to_ascii_lowercase();
match method.as_str() {
"amplitude" => {
+ if input_len == 0 {
+ return Err(MahoutError::InvalidInput(
+ "Input data cannot be empty".into(),
+ ));
+ }
+
if input_len > state_len {
return Err(MahoutError::InvalidInput(format!(
"Input size {} exceeds state vector size {} (2^{}
qubits)",
@@ -445,6 +385,8 @@ impl QdpEngine {
)));
}
+ let input_d = input_d as *const f64;
+
let state_vector = {
crate::profile_scope!("GPU::Alloc");
gpu::GpuStateVector::new(&self.device, num_qubits,
Precision::Float64)?
@@ -452,12 +394,12 @@ impl QdpEngine {
let inv_norm = {
crate::profile_scope!("GPU::NormFromPtr");
+ // SAFETY: input_d validity is guaranteed by the caller's
safety contract
unsafe {
-
gpu::AmplitudeEncoder::calculate_inv_norm_gpu_with_stream(
+ gpu::AmplitudeEncoder::calculate_inv_norm_gpu(
&self.device,
input_d,
input_len,
- stream,
)?
}
};
@@ -492,7 +434,7 @@ impl QdpEngine {
{
crate::profile_scope!("GPU::Synchronize");
- sync_cuda_stream(stream, "CUDA stream synchronize
failed")?;
+ gpu::cuda_sync::sync_cuda_stream(stream, "CUDA stream
synchronize failed")?;
}
let state_vector = state_vector.to_precision(&self.device,
self.precision)?;
@@ -506,6 +448,8 @@ impl QdpEngine {
)));
}
+ let angles_d = input_d as *const f64;
+
let state_vector = {
crate::profile_scope!("GPU::Alloc");
gpu::GpuStateVector::new(&self.device, num_qubits,
Precision::Float64)?
@@ -521,7 +465,7 @@ impl QdpEngine {
crate::profile_scope!("GPU::KernelLaunch");
let ret = unsafe {
qdp_kernels::launch_angle_encode(
- input_d,
+ angles_d,
state_ptr as *mut std::ffi::c_void,
state_len,
num_qubits as u32,
@@ -540,14 +484,66 @@ impl QdpEngine {
{
crate::profile_scope!("GPU::Synchronize");
- sync_cuda_stream(stream, "CUDA stream synchronize
failed")?;
+ gpu::cuda_sync::sync_cuda_stream(stream, "CUDA stream
synchronize failed")?;
+ }
+
+ let state_vector = state_vector.to_precision(&self.device,
self.precision)?;
+ Ok(state_vector.to_dlpack())
+ }
+ "basis" => {
+ if input_len != 1 {
+ return Err(MahoutError::InvalidInput(format!(
+ "Basis encoding expects exactly 1 value (the basis
index), got {}",
+ input_len
+ )));
+ }
+
+ let basis_indices_d = input_d as *const usize;
+
+ let state_vector = {
+ crate::profile_scope!("GPU::Alloc");
+ gpu::GpuStateVector::new(&self.device, num_qubits,
self.precision)?
+ };
+
+ let state_ptr = state_vector.ptr_f64().ok_or_else(|| {
+ MahoutError::InvalidInput(
+ "State vector precision mismatch (expected float64
buffer)".to_string(),
+ )
+ })?;
+
+ // Use batch API with num_samples=1 to avoid D2H copy;
launch_basis_encode takes host usize.
+ {
+ crate::profile_scope!("GPU::KernelLaunch");
+ let ret = unsafe {
+ qdp_kernels::launch_basis_encode_batch(
+ basis_indices_d,
+ state_ptr as *mut std::ffi::c_void,
+ 1,
+ state_len,
+ num_qubits as u32,
+ stream,
+ )
+ };
+
+ if ret != 0 {
+ return Err(MahoutError::KernelLaunch(format!(
+ "Basis encoding kernel failed with CUDA error
code: {} ({})",
+ ret,
+ cuda_error_to_string(ret)
+ )));
+ }
+ }
+
+ {
+ crate::profile_scope!("GPU::Synchronize");
+ gpu::cuda_sync::sync_cuda_stream(stream, "CUDA stream
synchronize failed")?;
}
let state_vector = state_vector.to_precision(&self.device,
self.precision)?;
Ok(state_vector.to_dlpack())
}
_ => Err(MahoutError::NotImplemented(format!(
- "GPU pointer encoding currently only supports 'amplitude' and
'angle' methods, got '{}'",
+ "GPU pointer encoding currently only supports 'amplitude',
'angle', or 'basis' methods, got '{}'",
encoding_method
))),
}
@@ -556,17 +552,15 @@ impl QdpEngine {
/// Encode batch from existing GPU pointer (zero-copy for CUDA tensors)
///
/// This method enables zero-copy batch encoding from PyTorch CUDA tensors.
- /// Uses the default CUDA stream. For PyTorch stream interop, use
- /// `encode_batch_from_gpu_ptr_with_stream`.
///
/// TODO: Refactor to use QuantumEncoder trait (see `encode_from_gpu_ptr`
TODO).
///
/// # Arguments
- /// * `input_batch_d` - Device pointer to batch input data (flattened f64
array on GPU)
+ /// * `input_batch_d` - Device pointer to batch input data (f64 for
amplitude/angle, usize/int64 for basis)
/// * `num_samples` - Number of samples in the batch
- /// * `sample_size` - Size of each sample in f64 elements
+ /// * `sample_size` - Size of each sample in elements
/// * `num_qubits` - Number of qubits for encoding
- /// * `encoding_method` - Strategy (currently "amplitude" and "angle"
supported)
+ /// * `encoding_method` - Strategy ("amplitude", "angle", or "basis")
///
/// # Returns
/// Single DLPack pointer containing all encoded states (shape:
[num_samples, 2^num_qubits])
@@ -574,12 +568,12 @@ impl QdpEngine {
/// # Safety
/// The input pointer must:
/// - Point to valid GPU memory on the same device as the engine
- /// - Contain at least `num_samples * sample_size` f64 elements
+ /// - Contain at least `num_samples * sample_size` elements of the
expected dtype
/// - Remain valid for the duration of this call
#[cfg(target_os = "linux")]
pub unsafe fn encode_batch_from_gpu_ptr(
&self,
- input_batch_d: *const f64,
+ input_batch_d: *const std::ffi::c_void,
num_samples: usize,
sample_size: usize,
num_qubits: usize,
@@ -597,42 +591,43 @@ impl QdpEngine {
}
}
- /// Encode batch from existing GPU pointer on a specified CUDA stream.
+ /// Encode batch from existing GPU pointer with a specific CUDA stream.
+ ///
+ /// Same as [`encode_batch_from_gpu_ptr`](Self::encode_batch_from_gpu_ptr)
but uses the given
+ /// `stream` for kernel launches. Pass null for default stream.
///
/// # Safety
- /// In addition to the `encode_batch_from_gpu_ptr` requirements, the
stream pointer
- /// must remain valid for the duration of this call.
+ /// Same as
[`encode_batch_from_gpu_ptr`](Self::encode_batch_from_gpu_ptr). Additionally,
+ /// `stream` must be a valid CUDA stream on the same device as the engine,
or null.
#[cfg(target_os = "linux")]
pub unsafe fn encode_batch_from_gpu_ptr_with_stream(
&self,
- input_batch_d: *const f64,
+ input_batch_d: *const std::ffi::c_void,
num_samples: usize,
sample_size: usize,
num_qubits: usize,
encoding_method: &str,
- stream: *mut c_void,
+ stream: *mut std::ffi::c_void,
) -> Result<*mut DLManagedTensor> {
crate::profile_scope!("Mahout::EncodeBatchFromGpuPtr");
+ let state_len = 1usize << num_qubits;
+ let method = encoding_method.to_ascii_lowercase();
+
if num_samples == 0 {
return Err(MahoutError::InvalidInput(
"Number of samples cannot be zero".into(),
));
}
- if sample_size == 0 {
- return Err(MahoutError::InvalidInput(
- "Sample size cannot be zero".into(),
- ));
- }
-
- validate_cuda_input_ptr(&self.device, input_batch_d)?;
-
- let state_len = 1usize << num_qubits;
- let method = encoding_method.to_ascii_lowercase();
-
match method.as_str() {
"amplitude" => {
+ if sample_size == 0 {
+ return Err(MahoutError::InvalidInput(
+ "Sample size cannot be zero".into(),
+ ));
+ }
+
if sample_size > state_len {
return Err(MahoutError::InvalidInput(format!(
"Sample size {} exceeds state vector size {} (2^{}
qubits)",
@@ -640,6 +635,8 @@ impl QdpEngine {
)));
}
+ let input_batch_d = input_batch_d as *const f64;
+
let batch_state_vector = {
crate::profile_scope!("GPU::AllocBatch");
gpu::GpuStateVector::new_batch(&self.device, num_samples,
num_qubits)?
@@ -677,9 +674,9 @@ impl QdpEngine {
buffer
};
+ // Validate norms on host to catch zero or NaN samples early
{
crate::profile_scope!("GPU::NormValidation");
- sync_cuda_stream(stream, "Norm stream synchronize
failed")?;
let host_inv_norms =
self.device.dtoh_sync_copy(&inv_norms_gpu).map_err(|e|
{
MahoutError::Cuda(format!("Failed to copy norms to
host: {:?}", e))
@@ -726,7 +723,7 @@ impl QdpEngine {
{
crate::profile_scope!("GPU::Synchronize");
- sync_cuda_stream(stream, "CUDA stream synchronize
failed")?;
+ gpu::cuda_sync::sync_cuda_stream(stream, "CUDA stream
synchronize failed")?;
}
let batch_state_vector =
@@ -736,6 +733,12 @@ impl QdpEngine {
"angle" => {
use cudarc::driver::DevicePtrMut;
+ if sample_size == 0 {
+ return Err(MahoutError::InvalidInput(
+ "Sample size cannot be zero".into(),
+ ));
+ }
+
if sample_size != num_qubits {
return Err(MahoutError::InvalidInput(format!(
"Angle encoding expects sample_size={} (one angle per
qubit), got {}",
@@ -743,6 +746,8 @@ impl QdpEngine {
)));
}
+ let input_batch_d = input_batch_d as *const f64;
+
// Validate that all input angles are finite (no NaN/Inf),
consistent with
// CPU and host-side batch angle encoding paths.
let angle_validation_buffer = {
@@ -778,7 +783,6 @@ impl QdpEngine {
{
crate::profile_scope!("GPU::AngleFiniteValidationHostCopy");
- sync_cuda_stream(stream, "Angle norm stream synchronize
failed")?;
let host_norms = self
.device
.dtoh_sync_copy(&angle_validation_buffer)
@@ -833,7 +837,60 @@ impl QdpEngine {
{
crate::profile_scope!("GPU::Synchronize");
- sync_cuda_stream(stream, "CUDA stream synchronize
failed")?;
+ gpu::cuda_sync::sync_cuda_stream(stream, "CUDA stream
synchronize failed")?;
+ }
+
+ let batch_state_vector =
+ batch_state_vector.to_precision(&self.device,
self.precision)?;
+ Ok(batch_state_vector.to_dlpack())
+ }
+ "basis" => {
+ if sample_size != 1 {
+ return Err(MahoutError::InvalidInput(format!(
+ "Basis encoding expects sample_size=1 (one index per
sample), got {}",
+ sample_size
+ )));
+ }
+
+ let basis_indices_d = input_batch_d as *const usize;
+
+ let batch_state_vector = {
+ crate::profile_scope!("GPU::AllocBatch");
+ gpu::GpuStateVector::new_batch(&self.device, num_samples,
num_qubits)?
+ };
+
+ let state_ptr = batch_state_vector.ptr_f64().ok_or_else(|| {
+ MahoutError::InvalidInput(
+ "Batch state vector precision mismatch (expected
float64 buffer)"
+ .to_string(),
+ )
+ })?;
+
+ {
+ crate::profile_scope!("GPU::BatchKernelLaunch");
+ let ret = unsafe {
+ qdp_kernels::launch_basis_encode_batch(
+ basis_indices_d,
+ state_ptr as *mut std::ffi::c_void,
+ num_samples,
+ state_len,
+ num_qubits as u32,
+ stream,
+ )
+ };
+
+ if ret != 0 {
+ return Err(MahoutError::KernelLaunch(format!(
+ "Batch basis encoding kernel failed with CUDA
error code: {} ({})",
+ ret,
+ cuda_error_to_string(ret)
+ )));
+ }
+ }
+
+ {
+ crate::profile_scope!("GPU::Synchronize");
+ gpu::cuda_sync::sync_cuda_stream(stream, "CUDA stream
synchronize failed")?;
}
let batch_state_vector =
@@ -841,7 +898,7 @@ impl QdpEngine {
Ok(batch_state_vector.to_dlpack())
}
_ => Err(MahoutError::NotImplemented(format!(
- "GPU pointer batch encoding currently only supports
'amplitude' and 'angle' methods, got '{}'",
+ "GPU pointer batch encoding currently only supports
'amplitude', 'angle', or 'basis' methods, got '{}'",
encoding_method
))),
}
diff --git a/qdp/qdp-core/tests/gpu_ptr_encoding.rs
b/qdp/qdp-core/tests/gpu_ptr_encoding.rs
new file mode 100644
index 000000000..7851eb1e1
--- /dev/null
+++ b/qdp/qdp-core/tests/gpu_ptr_encoding.rs
@@ -0,0 +1,425 @@
+//
+// Licensed to the Apache Software Foundation (ASF) under one or more
+// contributor license agreements. See the NOTICE file distributed with
+// this work for additional information regarding copyright ownership.
+// The ASF licenses this file to You under the Apache License, Version 2.0
+// (the "License"); you may not use this file except in compliance with
+// the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Unit and integration tests for encode_from_gpu_ptr and
encode_batch_from_gpu_ptr.
+
+#![cfg(target_os = "linux")]
+
+use std::ffi::c_void;
+
+use cudarc::driver::{CudaDevice, DevicePtr};
+use qdp_core::{MahoutError, QdpEngine};
+
+mod common;
+
+// ---- Validation / error-path tests (return before using pointer) ----
+
+#[test]
+fn test_encode_from_gpu_ptr_unknown_method() {
+ let engine = match QdpEngine::new(0) {
+ Ok(e) => e,
+ Err(_) => return,
+ };
+
+ let result = unsafe { engine.encode_from_gpu_ptr(std::ptr::null(), 4, 2,
"unknown_encoding") };
+
+ assert!(result.is_err());
+ match result {
+ Err(MahoutError::NotImplemented(msg)) => {
+ assert!(msg.contains("unknown_encoding") || msg.contains("only
supports"));
+ }
+ _ => panic!("expected NotImplemented, got {:?}", result),
+ }
+}
+
+#[test]
+fn test_encode_from_gpu_ptr_amplitude_empty_input() {
+ let engine = match QdpEngine::new(0) {
+ Ok(e) => e,
+ Err(_) => return,
+ };
+
+ let result = unsafe { engine.encode_from_gpu_ptr(std::ptr::null(), 0, 2,
"amplitude") };
+
+ assert!(result.is_err());
+ match result {
+ Err(MahoutError::InvalidInput(msg)) => {
+ assert!(msg.contains("empty") || msg.contains("cannot be empty"));
+ }
+ _ => panic!("expected InvalidInput, got {:?}", result),
+ }
+}
+
+#[test]
+fn test_encode_from_gpu_ptr_amplitude_input_exceeds_state() {
+ let engine = match QdpEngine::new(0) {
+ Ok(e) => e,
+ Err(_) => return,
+ };
+
+ // 2 qubits -> state_len = 4; request input_len = 10
+ let result = unsafe { engine.encode_from_gpu_ptr(std::ptr::null(), 10, 2,
"amplitude") };
+
+ assert!(result.is_err());
+ match result {
+ Err(MahoutError::InvalidInput(msg)) => {
+ assert!(msg.contains("exceeds") && msg.contains("state"));
+ }
+ _ => panic!("expected InvalidInput, got {:?}", result),
+ }
+}
+
+#[test]
+fn test_encode_batch_from_gpu_ptr_unknown_method() {
+ let engine = match QdpEngine::new(0) {
+ Ok(e) => e,
+ Err(_) => return,
+ };
+
+ let result =
+ unsafe { engine.encode_batch_from_gpu_ptr(std::ptr::null(), 2, 4, 2,
"unknown_method") };
+
+ assert!(result.is_err());
+ match result {
+ Err(MahoutError::NotImplemented(msg)) => {
+ assert!(msg.contains("unknown_method") || msg.contains("only
supports"));
+ }
+ _ => panic!("expected NotImplemented, got {:?}", result),
+ }
+}
+
+#[test]
+fn test_encode_batch_from_gpu_ptr_amplitude_num_samples_zero() {
+ let engine = match QdpEngine::new(0) {
+ Ok(e) => e,
+ Err(_) => return,
+ };
+
+ let result =
+ unsafe { engine.encode_batch_from_gpu_ptr(std::ptr::null(), 0, 4, 2,
"amplitude") };
+
+ assert!(result.is_err());
+ match result {
+ Err(MahoutError::InvalidInput(msg)) => {
+ assert!(msg.contains("zero") || msg.contains("samples"));
+ }
+ _ => panic!("expected InvalidInput, got {:?}", result),
+ }
+}
+
+#[test]
+fn test_encode_from_gpu_ptr_basis_input_len_not_one() {
+ let engine = match QdpEngine::new(0) {
+ Ok(e) => e,
+ Err(_) => return,
+ };
+
+ // Basis single encoding expects exactly 1 value; input_len != 1 must
return error.
+ let result = unsafe { engine.encode_from_gpu_ptr(std::ptr::null(), 0, 2,
"basis") };
+ assert!(result.is_err());
+ match result {
+ Err(MahoutError::InvalidInput(msg)) => {
+ assert!(msg.contains("exactly 1") || msg.contains("basis"));
+ }
+ _ => panic!("expected InvalidInput for input_len != 1, got {:?}",
result),
+ }
+
+ let result = unsafe { engine.encode_from_gpu_ptr(std::ptr::null(), 3, 2,
"basis") };
+ assert!(result.is_err());
+ match result {
+ Err(MahoutError::InvalidInput(msg)) => {
+ assert!(msg.contains("exactly 1") || msg.contains("basis"));
+ }
+ _ => panic!("expected InvalidInput for input_len != 1, got {:?}",
result),
+ }
+}
+
+#[test]
+fn test_encode_batch_from_gpu_ptr_basis_sample_size_not_one() {
+ let engine = match QdpEngine::new(0) {
+ Ok(e) => e,
+ Err(_) => return,
+ };
+
+ // Basis batch expects sample_size == 1 (one index per sample).
+ let result = unsafe { engine.encode_batch_from_gpu_ptr(std::ptr::null(),
2, 4, 2, "basis") };
+ assert!(result.is_err());
+ match result {
+ Err(MahoutError::InvalidInput(msg)) => {
+ assert!(msg.contains("sample_size=1") || msg.contains("one
index"));
+ }
+ _ => panic!(
+ "expected InvalidInput for sample_size != 1, got {:?}",
+ result
+ ),
+ }
+}
+
+// ---- Happy-path tests (real GPU memory) ----
+
+#[test]
+fn test_encode_from_gpu_ptr_amplitude_success() {
+ let engine = match QdpEngine::new(0) {
+ Ok(e) => e,
+ Err(_) => {
+ println!("SKIP: No GPU available");
+ return;
+ }
+ };
+
+ let num_qubits = 4;
+ let state_len = 1 << num_qubits;
+ let data = common::create_test_data(state_len);
+
+ let device = match CudaDevice::new(0) {
+ Ok(d) => d,
+ Err(_) => {
+ println!("SKIP: No CUDA device");
+ return;
+ }
+ };
+
+ let data_d = match device.htod_sync_copy(data.as_slice()) {
+ Ok(b) => b,
+ Err(_) => {
+ println!("SKIP: Failed to copy to device");
+ return;
+ }
+ };
+
+ let ptr = *data_d.device_ptr() as *const f64 as *const c_void;
+
+ let dlpack_ptr = unsafe {
+ engine
+ .encode_from_gpu_ptr(ptr, data.len(), num_qubits, "amplitude")
+ .expect("encode_from_gpu_ptr should succeed")
+ };
+
+ assert!(!dlpack_ptr.is_null(), "DLPack pointer should not be null");
+
+ unsafe {
+ let managed = &mut *dlpack_ptr;
+ assert!(managed.deleter.is_some(), "Deleter must be present");
+ let deleter = managed
+ .deleter
+ .take()
+ .expect("Deleter function pointer is missing");
+ deleter(dlpack_ptr);
+ }
+}
+
+#[test]
+fn test_encode_from_gpu_ptr_with_stream_amplitude_success() {
+ let engine = match QdpEngine::new(0) {
+ Ok(e) => e,
+ Err(_) => {
+ println!("SKIP: No GPU available");
+ return;
+ }
+ };
+
+ let num_qubits = 3;
+ let state_len = 1 << num_qubits;
+ let data = common::create_test_data(state_len);
+
+ let device = match CudaDevice::new(0) {
+ Ok(d) => d,
+ Err(_) => {
+ println!("SKIP: No CUDA device");
+ return;
+ }
+ };
+
+ let data_d = match device.htod_sync_copy(data.as_slice()) {
+ Ok(b) => b,
+ Err(_) => {
+ println!("SKIP: Failed to copy to device");
+ return;
+ }
+ };
+
+ let ptr = *data_d.device_ptr() as *const f64 as *const c_void;
+
+ let dlpack_ptr = unsafe {
+ engine
+ .encode_from_gpu_ptr_with_stream(
+ ptr,
+ data.len(),
+ num_qubits,
+ "amplitude",
+ std::ptr::null_mut(),
+ )
+ .expect("encode_from_gpu_ptr_with_stream should succeed")
+ };
+
+ assert!(!dlpack_ptr.is_null());
+
+ unsafe {
+ let managed = &mut *dlpack_ptr;
+ let deleter = managed.deleter.take().expect("Deleter missing");
+ deleter(dlpack_ptr);
+ }
+}
+
+#[test]
+fn test_encode_batch_from_gpu_ptr_amplitude_success() {
+ let engine = match QdpEngine::new(0) {
+ Ok(e) => e,
+ Err(_) => {
+ println!("SKIP: No GPU available");
+ return;
+ }
+ };
+
+ let num_qubits = 3;
+ let state_len = 1 << num_qubits;
+ let num_samples = 4;
+ let sample_size = state_len;
+ let total = num_samples * sample_size;
+ let data = common::create_test_data(total);
+
+ let device = match CudaDevice::new(0) {
+ Ok(d) => d,
+ Err(_) => {
+ println!("SKIP: No CUDA device");
+ return;
+ }
+ };
+
+ let data_d = match device.htod_sync_copy(data.as_slice()) {
+ Ok(b) => b,
+ Err(_) => {
+ println!("SKIP: Failed to copy to device");
+ return;
+ }
+ };
+
+ let ptr = *data_d.device_ptr() as *const f64 as *const c_void;
+
+ let dlpack_ptr = unsafe {
+ engine
+ .encode_batch_from_gpu_ptr(ptr, num_samples, sample_size,
num_qubits, "amplitude")
+ .expect("encode_batch_from_gpu_ptr should succeed")
+ };
+
+ assert!(!dlpack_ptr.is_null());
+
+ unsafe {
+ let managed = &mut *dlpack_ptr;
+ let deleter = managed.deleter.take().expect("Deleter missing");
+ deleter(dlpack_ptr);
+ }
+}
+
+#[test]
+fn test_encode_from_gpu_ptr_basis_success() {
+ let engine = match QdpEngine::new(0) {
+ Ok(e) => e,
+ Err(_) => {
+ println!("SKIP: No GPU available");
+ return;
+ }
+ };
+
+ let num_qubits = 3;
+ let basis_index: usize = 0;
+
+ let device = match CudaDevice::new(0) {
+ Ok(d) => d,
+ Err(_) => {
+ println!("SKIP: No CUDA device");
+ return;
+ }
+ };
+
+ let indices: Vec<usize> = vec![basis_index];
+ let indices_d = match device.htod_sync_copy(indices.as_slice()) {
+ Ok(b) => b,
+ Err(_) => {
+ println!("SKIP: Failed to copy to device");
+ return;
+ }
+ };
+
+ let ptr = *indices_d.device_ptr() as *const usize as *const c_void;
+
+ let dlpack_ptr = unsafe {
+ engine
+ .encode_from_gpu_ptr(ptr, 1, num_qubits, "basis")
+ .expect("encode_from_gpu_ptr basis should succeed")
+ };
+
+ assert!(!dlpack_ptr.is_null());
+
+ unsafe {
+ let managed = &mut *dlpack_ptr;
+ assert!(managed.deleter.is_some(), "Deleter must be present");
+ let deleter = managed
+ .deleter
+ .take()
+ .expect("Deleter function pointer is missing");
+ deleter(dlpack_ptr);
+ }
+}
+
+#[test]
+fn test_encode_batch_from_gpu_ptr_basis_success() {
+ let engine = match QdpEngine::new(0) {
+ Ok(e) => e,
+ Err(_) => {
+ println!("SKIP: No GPU available");
+ return;
+ }
+ };
+
+ let num_qubits = 3;
+ let num_samples = 4;
+ let sample_size = 1;
+ let state_len = 1 << num_qubits;
+ let basis_indices: Vec<usize> = (0..num_samples).map(|i| i %
state_len).collect();
+
+ let device = match CudaDevice::new(0) {
+ Ok(d) => d,
+ Err(_) => {
+ println!("SKIP: No CUDA device");
+ return;
+ }
+ };
+
+ let indices_d = match device.htod_sync_copy(basis_indices.as_slice()) {
+ Ok(b) => b,
+ Err(_) => {
+ println!("SKIP: Failed to copy to device");
+ return;
+ }
+ };
+
+ let ptr = *indices_d.device_ptr() as *const usize as *const c_void;
+
+ let dlpack_ptr = unsafe {
+ engine
+ .encode_batch_from_gpu_ptr(ptr, num_samples, sample_size,
num_qubits, "basis")
+ .expect("encode_batch_from_gpu_ptr basis should succeed")
+ };
+
+ assert!(!dlpack_ptr.is_null());
+
+ unsafe {
+ let managed = &mut *dlpack_ptr;
+ let deleter = managed.deleter.take().expect("Deleter missing");
+ deleter(dlpack_ptr);
+ }
+}
diff --git a/qdp/qdp-python/src/lib.rs b/qdp/qdp-python/src/lib.rs
index fd655b2da..69743a862 100644
--- a/qdp/qdp-python/src/lib.rs
+++ b/qdp/qdp-python/src/lib.rs
@@ -276,30 +276,44 @@ fn get_torch_cuda_stream_ptr(tensor: &Bound<'_, PyAny>) -> PyResult<*mut c_void>
}
/// Validate a CUDA tensor for direct GPU encoding
-/// Checks: dtype=float64, contiguous, non-empty, device_id matches engine
+/// Checks: dtype matches encoding method, contiguous, non-empty, device_id matches engine
fn validate_cuda_tensor_for_encoding(
tensor: &Bound<'_, PyAny>,
expected_device_id: usize,
encoding_method: &str,
) -> PyResult<()> {
let method = encoding_method.to_ascii_lowercase();
-    // Check encoding method support (currently amplitude and angle are supported for CUDA tensors)
- if method != "amplitude" && method != "angle" {
- return Err(PyRuntimeError::new_err(format!(
-            "CUDA tensor encoding currently only supports 'amplitude' and 'angle' methods, got '{}'. \
-            Use tensor.cpu() to convert to CPU tensor for other encoding methods.",
- encoding_method
- )));
- }
- // Check dtype is float64
+    // Check encoding method support and dtype (ASCII lowercase for case-insensitive match).
let dtype = tensor.getattr("dtype")?;
let dtype_str: String = dtype.str()?.extract()?;
- if !dtype_str.contains("float64") {
- return Err(PyRuntimeError::new_err(format!(
-            "CUDA tensor must have dtype float64, got {}. Use tensor.to(torch.float64)",
- dtype_str
- )));
+ let dtype_str_lower = dtype_str.to_ascii_lowercase();
+ match method.as_str() {
+ "amplitude" | "angle" => {
+ if !dtype_str_lower.contains("float64") {
+ return Err(PyRuntimeError::new_err(format!(
+                    "CUDA tensor must have dtype float64 for {} encoding, got {}. \
+ Use tensor.to(torch.float64)",
+ method, dtype_str
+ )));
+ }
+ }
+ "basis" => {
+ if !dtype_str_lower.contains("int64") {
+ return Err(PyRuntimeError::new_err(format!(
+                    "CUDA tensor must have dtype int64 for basis encoding, got {}. \
+ Use tensor.to(torch.int64)",
+ dtype_str
+ )));
+ }
+ }
+ _ => {
+ return Err(PyRuntimeError::new_err(format!(
+                "CUDA tensor encoding currently only supports 'amplitude', 'angle', or 'basis' methods, got '{}'. \
+                Use tensor.cpu() to convert to CPU tensor for other encoding methods.",
+ encoding_method
+ )));
+ }
}
// Check contiguous
@@ -370,7 +384,7 @@ struct DLPackTensorInfo {
/// This is owned by this struct and will be freed via deleter on drop
managed_ptr: *mut DLManagedTensor,
/// Data pointer inside dl_tensor (GPU memory, owned by managed_ptr)
- data_ptr: *const f64,
+ data_ptr: *const c_void,
shape: Vec<i64>,
/// CUDA device ID from DLPack metadata.
/// Used for defensive validation against PyTorch API device ID.
@@ -530,7 +544,7 @@ fn extract_dlpack_tensor(_py: Python<'_>, tensor: &Bound<'_, PyAny>) -> PyResult
Ok(DLPackTensorInfo {
managed_ptr,
- data_ptr,
+ data_ptr: data_ptr as *const std::ffi::c_void,
shape,
device_id,
})
@@ -654,7 +668,7 @@ impl QdpEngine {
let ptr = unsafe {
self.engine
.encode_from_gpu_ptr_with_stream(
- tensor_info.data_ptr,
+                        tensor_info.data_ptr as *const std::ffi::c_void,
input_len,
num_qubits,
encoding_method,
@@ -677,7 +691,7 @@ impl QdpEngine {
let ptr = unsafe {
self.engine
.encode_batch_from_gpu_ptr_with_stream(
- tensor_info.data_ptr,
+                        tensor_info.data_ptr as *const std::ffi::c_void,
num_samples,
sample_size,
num_qubits,