This is an automated email from the ASF dual-hosted git repository.
guanmingchiu pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/mahout.git
The following commit(s) were added to refs/heads/main by this push:
new 5371667d0 [QDP] Delegate GPU pointer encoding to QuantumEncoder trait (#1006)
5371667d0 is described below
commit 5371667d05aea52f3ce146d379f6c7cceb9fc230
Author: Vic Wen <[email protected]>
AuthorDate: Sat Feb 7 01:21:59 2026 +0800
[QDP] Delegate GPU pointer encoding to QuantumEncoder trait (#1006)
* refactor: delegate GPU pointer encoding to QuantumEncoder trait
* test: error handling for unsupported GPU encoding methods
---
qdp/qdp-core/src/gpu/encodings/amplitude.rs | 158 +++++++++
qdp/qdp-core/src/gpu/encodings/angle.rs | 154 +++++++++
qdp/qdp-core/src/gpu/encodings/basis.rs | 107 ++++++
qdp/qdp-core/src/gpu/encodings/mod.rs | 44 +++
qdp/qdp-core/src/lib.rs | 492 ++--------------------------
qdp/qdp-core/tests/gpu_ptr_encoding.rs | 10 +-
testing/qdp/test_bindings.py | 5 +-
7 files changed, 493 insertions(+), 477 deletions(-)
diff --git a/qdp/qdp-core/src/gpu/encodings/amplitude.rs
b/qdp/qdp-core/src/gpu/encodings/amplitude.rs
index 85259e18a..fc57d189e 100644
--- a/qdp/qdp-core/src/gpu/encodings/amplitude.rs
+++ b/qdp/qdp-core/src/gpu/encodings/amplitude.rs
@@ -299,6 +299,164 @@ impl QuantumEncoder for AmplitudeEncoder {
Ok(batch_state_vector)
}
+ #[cfg(target_os = "linux")]
+ unsafe fn encode_from_gpu_ptr(
+ &self,
+ device: &Arc<CudaDevice>,
+ input_d: *const c_void,
+ input_len: usize,
+ num_qubits: usize,
+ stream: *mut c_void,
+ ) -> Result<GpuStateVector> {
+ let state_len = 1 << num_qubits;
+ if input_len == 0 {
+ return Err(MahoutError::InvalidInput(
+ "Input data cannot be empty".into(),
+ ));
+ }
+ if input_len > state_len {
+ return Err(MahoutError::InvalidInput(format!(
+ "Input size {} exceeds state vector size {} (2^{} qubits)",
+ input_len, state_len, num_qubits
+ )));
+ }
+ let input_d = input_d as *const f64;
+ let state_vector = {
+ crate::profile_scope!("GPU::Alloc");
+ GpuStateVector::new(device, num_qubits, Precision::Float64)?
+ };
+ let inv_norm = {
+ crate::profile_scope!("GPU::NormFromPtr");
+ unsafe { Self::calculate_inv_norm_gpu_with_stream(device, input_d,
input_len, stream)? }
+ };
+ let state_ptr = state_vector.ptr_f64().ok_or_else(|| {
+ MahoutError::InvalidInput(
+ "State vector precision mismatch (expected float64
buffer)".to_string(),
+ )
+ })?;
+ {
+ crate::profile_scope!("GPU::KernelLaunch");
+ let ret = unsafe {
+ launch_amplitude_encode(
+ input_d,
+ state_ptr as *mut c_void,
+ input_len,
+ state_len,
+ inv_norm,
+ stream,
+ )
+ };
+ if ret != 0 {
+ return Err(MahoutError::KernelLaunch(format!(
+ "Amplitude encode kernel failed with CUDA error code: {}
({})",
+ ret,
+ cuda_error_to_string(ret)
+ )));
+ }
+ }
+ {
+ crate::profile_scope!("GPU::Synchronize");
+ sync_cuda_stream(stream, "CUDA stream synchronize failed")?;
+ }
+ Ok(state_vector)
+ }
+
+ #[cfg(target_os = "linux")]
+ unsafe fn encode_batch_from_gpu_ptr(
+ &self,
+ device: &Arc<CudaDevice>,
+ input_batch_d: *const c_void,
+ num_samples: usize,
+ sample_size: usize,
+ num_qubits: usize,
+ stream: *mut c_void,
+ ) -> Result<GpuStateVector> {
+ let state_len = 1 << num_qubits;
+ if sample_size == 0 {
+ return Err(MahoutError::InvalidInput(
+ "Sample size cannot be zero".into(),
+ ));
+ }
+ if sample_size > state_len {
+ return Err(MahoutError::InvalidInput(format!(
+ "Sample size {} exceeds state vector size {} (2^{} qubits)",
+ sample_size, state_len, num_qubits
+ )));
+ }
+ let input_batch_d = input_batch_d as *const f64;
+ let batch_state_vector = {
+ crate::profile_scope!("GPU::AllocBatch");
+ GpuStateVector::new_batch(device, num_samples, num_qubits)?
+ };
+ let inv_norms_gpu = {
+ crate::profile_scope!("GPU::BatchNormKernel");
+ use cudarc::driver::DevicePtrMut;
+ let mut buffer =
device.alloc_zeros::<f64>(num_samples).map_err(|e| {
+ MahoutError::MemoryAllocation(format!("Failed to allocate norm
buffer: {:?}", e))
+ })?;
+ let ret = unsafe {
+ launch_l2_norm_batch(
+ input_batch_d,
+ num_samples,
+ sample_size,
+ *buffer.device_ptr_mut() as *mut f64,
+ stream,
+ )
+ };
+ if ret != 0 {
+ return Err(MahoutError::KernelLaunch(format!(
+ "Norm reduction kernel failed with CUDA error code: {}
({})",
+ ret,
+ cuda_error_to_string(ret)
+ )));
+ }
+ buffer
+ };
+ {
+ crate::profile_scope!("GPU::NormValidation");
+ let host_inv_norms = device
+ .dtoh_sync_copy(&inv_norms_gpu)
+ .map_err(|e| MahoutError::Cuda(format!("Failed to copy norms
to host: {:?}", e)))?;
+ if host_inv_norms.iter().any(|v| !v.is_finite() || *v == 0.0) {
+ return Err(MahoutError::InvalidInput(
+ "One or more samples have zero or invalid
norm".to_string(),
+ ));
+ }
+ }
+ {
+ crate::profile_scope!("GPU::BatchKernelLaunch");
+ use cudarc::driver::DevicePtr;
+ let state_ptr = batch_state_vector.ptr_f64().ok_or_else(|| {
+ MahoutError::InvalidInput(
+ "Batch state vector precision mismatch (expected float64
buffer)".to_string(),
+ )
+ })?;
+ let ret = unsafe {
+ launch_amplitude_encode_batch(
+ input_batch_d,
+ state_ptr as *mut c_void,
+ *inv_norms_gpu.device_ptr() as *const f64,
+ num_samples,
+ sample_size,
+ state_len,
+ stream,
+ )
+ };
+ if ret != 0 {
+ return Err(MahoutError::KernelLaunch(format!(
+ "Batch kernel launch failed with CUDA error code: {} ({})",
+ ret,
+ cuda_error_to_string(ret)
+ )));
+ }
+ }
+ {
+ crate::profile_scope!("GPU::Synchronize");
+ sync_cuda_stream(stream, "CUDA stream synchronize failed")?;
+ }
+ Ok(batch_state_vector)
+ }
+
fn name(&self) -> &'static str {
"amplitude"
}
diff --git a/qdp/qdp-core/src/gpu/encodings/angle.rs
b/qdp/qdp-core/src/gpu/encodings/angle.rs
index d1a1091d5..1c3e5b8f5 100644
--- a/qdp/qdp-core/src/gpu/encodings/angle.rs
+++ b/qdp/qdp-core/src/gpu/encodings/angle.rs
@@ -217,6 +217,160 @@ impl QuantumEncoder for AngleEncoder {
Ok(batch_state_vector)
}
+ #[cfg(target_os = "linux")]
+ unsafe fn encode_from_gpu_ptr(
+ &self,
+ device: &Arc<CudaDevice>,
+ input_d: *const c_void,
+ input_len: usize,
+ num_qubits: usize,
+ stream: *mut c_void,
+ ) -> Result<GpuStateVector> {
+ if input_len != num_qubits {
+ return Err(MahoutError::InvalidInput(format!(
+ "Angle encoding expects {} values (one per qubit), got {}",
+ num_qubits, input_len
+ )));
+ }
+ let state_len = 1 << num_qubits;
+ let angles_d = input_d as *const f64;
+ let state_vector = {
+ crate::profile_scope!("GPU::Alloc");
+ GpuStateVector::new(device, num_qubits, Precision::Float64)?
+ };
+ let state_ptr = state_vector.ptr_f64().ok_or_else(|| {
+ MahoutError::InvalidInput(
+ "State vector precision mismatch (expected float64
buffer)".to_string(),
+ )
+ })?;
+ {
+ crate::profile_scope!("GPU::KernelLaunch");
+ let ret = unsafe {
+ qdp_kernels::launch_angle_encode(
+ angles_d,
+ state_ptr as *mut c_void,
+ state_len,
+ num_qubits as u32,
+ stream,
+ )
+ };
+ if ret != 0 {
+ return Err(MahoutError::KernelLaunch(format!(
+ "Angle encoding kernel failed with CUDA error code: {}
({})",
+ ret,
+ cuda_error_to_string(ret)
+ )));
+ }
+ }
+ {
+ crate::profile_scope!("GPU::Synchronize");
+ crate::gpu::cuda_sync::sync_cuda_stream(stream, "CUDA stream
synchronize failed")?;
+ }
+ Ok(state_vector)
+ }
+
+ #[cfg(target_os = "linux")]
+ unsafe fn encode_batch_from_gpu_ptr(
+ &self,
+ device: &Arc<CudaDevice>,
+ input_batch_d: *const c_void,
+ num_samples: usize,
+ sample_size: usize,
+ num_qubits: usize,
+ stream: *mut c_void,
+ ) -> Result<GpuStateVector> {
+ if sample_size == 0 {
+ return Err(MahoutError::InvalidInput(
+ "Sample size cannot be zero".into(),
+ ));
+ }
+ if sample_size != num_qubits {
+ return Err(MahoutError::InvalidInput(format!(
+ "Angle encoding expects sample_size={} (one angle per qubit),
got {}",
+ num_qubits, sample_size
+ )));
+ }
+ let state_len = 1 << num_qubits;
+ let input_batch_d = input_batch_d as *const f64;
+ let angle_validation_buffer = {
+ crate::profile_scope!("GPU::AngleFiniteCheckBatch");
+ use cudarc::driver::DevicePtrMut;
+ let mut buffer =
device.alloc_zeros::<f64>(num_samples).map_err(|e| {
+ MahoutError::MemoryAllocation(format!(
+ "Failed to allocate angle validation buffer: {:?}",
+ e
+ ))
+ })?;
+ let ret = unsafe {
+ qdp_kernels::launch_l2_norm_batch(
+ input_batch_d,
+ num_samples,
+ sample_size,
+ *buffer.device_ptr_mut() as *mut f64,
+ stream,
+ )
+ };
+ if ret != 0 {
+ return Err(MahoutError::KernelLaunch(format!(
+ "Angle validation norm kernel failed with CUDA error code:
{} ({})",
+ ret,
+ cuda_error_to_string(ret)
+ )));
+ }
+ buffer
+ };
+ {
+ crate::profile_scope!("GPU::AngleFiniteValidationHostCopy");
+ let host_norms = device
+ .dtoh_sync_copy(&angle_validation_buffer)
+ .map_err(|e| {
+ MahoutError::Cuda(format!(
+ "Failed to copy angle validation norms to host: {:?}",
+ e
+ ))
+ })?;
+ if host_norms.iter().any(|v| !v.is_finite()) {
+ return Err(MahoutError::InvalidInput(
+ "Angle encoding batch contains non-finite values (NaN or
Inf)".to_string(),
+ ));
+ }
+ }
+ let batch_state_vector = {
+ crate::profile_scope!("GPU::AllocBatch");
+ GpuStateVector::new_batch(device, num_samples, num_qubits)?
+ };
+ let state_ptr = batch_state_vector.ptr_f64().ok_or_else(|| {
+ MahoutError::InvalidInput(
+ "Batch state vector precision mismatch (expected float64
buffer)".to_string(),
+ )
+ })?;
+ {
+ crate::profile_scope!("GPU::BatchKernelLaunch");
+ let ret = unsafe {
+ qdp_kernels::launch_angle_encode_batch(
+ input_batch_d,
+ state_ptr as *mut c_void,
+ num_samples,
+ state_len,
+ num_qubits as u32,
+ stream,
+ )
+ };
+ if ret != 0 {
+ return Err(MahoutError::KernelLaunch(format!(
+ "Batch angle encoding kernel failed: {} ({})",
+ ret,
+ cuda_error_to_string(ret)
+ )));
+ }
+ }
+ {
+ crate::profile_scope!("GPU::Synchronize");
+ crate::gpu::cuda_sync::sync_cuda_stream(stream, "CUDA stream
synchronize failed")?;
+ }
+ Ok(batch_state_vector)
+ }
+
fn validate_input(&self, data: &[f64], num_qubits: usize) -> Result<()> {
validate_qubit_count(num_qubits)?;
if data.len() != num_qubits {
diff --git a/qdp/qdp-core/src/gpu/encodings/basis.rs
b/qdp/qdp-core/src/gpu/encodings/basis.rs
index 33e8f14bf..569e1455e 100644
--- a/qdp/qdp-core/src/gpu/encodings/basis.rs
+++ b/qdp/qdp-core/src/gpu/encodings/basis.rs
@@ -225,6 +225,113 @@ impl QuantumEncoder for BasisEncoder {
Ok(batch_state_vector)
}
+ #[cfg(target_os = "linux")]
+ unsafe fn encode_from_gpu_ptr(
+ &self,
+ device: &Arc<CudaDevice>,
+ input_d: *const c_void,
+ input_len: usize,
+ num_qubits: usize,
+ stream: *mut c_void,
+ ) -> Result<GpuStateVector> {
+ if input_len != 1 {
+ return Err(MahoutError::InvalidInput(format!(
+ "Basis encoding expects exactly 1 value (the basis index), got
{}",
+ input_len
+ )));
+ }
+ let state_len = 1 << num_qubits;
+ let basis_indices_d = input_d as *const usize;
+ let state_vector = {
+ crate::profile_scope!("GPU::Alloc");
+ GpuStateVector::new(device, num_qubits, Precision::Float64)?
+ };
+ let state_ptr = state_vector.ptr_f64().ok_or_else(|| {
+ MahoutError::InvalidInput(
+ "State vector precision mismatch (expected float64
buffer)".to_string(),
+ )
+ })?;
+ {
+ crate::profile_scope!("GPU::KernelLaunch");
+ let ret = unsafe {
+ qdp_kernels::launch_basis_encode_batch(
+ basis_indices_d,
+ state_ptr as *mut c_void,
+ 1,
+ state_len,
+ num_qubits as u32,
+ stream,
+ )
+ };
+ if ret != 0 {
+ return Err(MahoutError::KernelLaunch(format!(
+ "Basis encoding kernel failed with CUDA error code: {}
({})",
+ ret,
+ cuda_error_to_string(ret)
+ )));
+ }
+ }
+ {
+ crate::profile_scope!("GPU::Synchronize");
+ crate::gpu::cuda_sync::sync_cuda_stream(stream, "CUDA stream
synchronize failed")?;
+ }
+ Ok(state_vector)
+ }
+
+ #[cfg(target_os = "linux")]
+ unsafe fn encode_batch_from_gpu_ptr(
+ &self,
+ device: &Arc<CudaDevice>,
+ input_batch_d: *const c_void,
+ num_samples: usize,
+ sample_size: usize,
+ num_qubits: usize,
+ stream: *mut c_void,
+ ) -> Result<GpuStateVector> {
+ if sample_size != 1 {
+ return Err(MahoutError::InvalidInput(format!(
+ "Basis encoding expects sample_size=1 (one index per sample),
got {}",
+ sample_size
+ )));
+ }
+ let state_len = 1 << num_qubits;
+ let basis_indices_d = input_batch_d as *const usize;
+ let batch_state_vector = {
+ crate::profile_scope!("GPU::AllocBatch");
+ GpuStateVector::new_batch(device, num_samples, num_qubits)?
+ };
+ let state_ptr = batch_state_vector.ptr_f64().ok_or_else(|| {
+ MahoutError::InvalidInput(
+ "Batch state vector precision mismatch (expected float64
buffer)".to_string(),
+ )
+ })?;
+ {
+ crate::profile_scope!("GPU::BatchKernelLaunch");
+ let ret = unsafe {
+ qdp_kernels::launch_basis_encode_batch(
+ basis_indices_d,
+ state_ptr as *mut c_void,
+ num_samples,
+ state_len,
+ num_qubits as u32,
+ stream,
+ )
+ };
+ if ret != 0 {
+ return Err(MahoutError::KernelLaunch(format!(
+ "Batch basis encoding kernel failed with CUDA error code:
{} ({})",
+ ret,
+ cuda_error_to_string(ret)
+ )));
+ }
+ }
+ {
+ crate::profile_scope!("GPU::Synchronize");
+ crate::gpu::cuda_sync::sync_cuda_stream(stream, "CUDA stream
synchronize failed")?;
+ }
+ Ok(batch_state_vector)
+ }
+
fn validate_input(&self, data: &[f64], num_qubits: usize) -> Result<()> {
// Basic validation: qubits and data availability
validate_qubit_count(num_qubits)?;
diff --git a/qdp/qdp-core/src/gpu/encodings/mod.rs
b/qdp/qdp-core/src/gpu/encodings/mod.rs
index ad1a9577a..319c8068a 100644
--- a/qdp/qdp-core/src/gpu/encodings/mod.rs
+++ b/qdp/qdp-core/src/gpu/encodings/mod.rs
@@ -22,6 +22,8 @@ use crate::error::{MahoutError, Result};
use crate::gpu::memory::GpuStateVector;
use crate::preprocessing::Preprocessor;
use cudarc::driver::CudaDevice;
+#[cfg(target_os = "linux")]
+use std::ffi::c_void;
/// Maximum number of qubits supported (16GB GPU memory limit)
/// This constant must match MAX_QUBITS in qdp-kernels/src/kernel_config.h
@@ -90,6 +92,48 @@ pub trait QuantumEncoder: Send + Sync {
/// Strategy description
fn description(&self) -> &'static str;
+
+ /// Encode from existing GPU pointer (zero-copy). Default: not supported.
+ ///
+ /// # Safety
+ /// Caller must ensure `input_d` points to valid GPU memory with at least `input_len` elements
+ /// of the expected dtype on the same device as `device`, and `stream` is a valid CUDA stream or null.
+ #[cfg(target_os = "linux")]
+ unsafe fn encode_from_gpu_ptr(
+ &self,
+ _device: &Arc<CudaDevice>,
+ _input_d: *const c_void,
+ _input_len: usize,
+ _num_qubits: usize,
+ _stream: *mut c_void,
+ ) -> Result<GpuStateVector> {
+ Err(MahoutError::NotImplemented(format!(
+ "encode_from_gpu_ptr not supported for {}",
+ self.name()
+ )))
+ }
+
+ /// Encode batch from existing GPU pointer (zero-copy). Default: not supported.
+ ///
+ /// # Safety
+ /// Caller must ensure `input_batch_d` points to valid GPU memory with at least
+ /// `num_samples * sample_size` elements of the expected dtype on the same device as `device`,
+ /// and `stream` is a valid CUDA stream or null.
+ #[cfg(target_os = "linux")]
+ unsafe fn encode_batch_from_gpu_ptr(
+ &self,
+ _device: &Arc<CudaDevice>,
+ _input_batch_d: *const c_void,
+ _num_samples: usize,
+ _sample_size: usize,
+ _num_qubits: usize,
+ _stream: *mut c_void,
+ ) -> Result<GpuStateVector> {
+ Err(MahoutError::NotImplemented(format!(
+ "encode_batch_from_gpu_ptr not supported for {}",
+ self.name()
+ )))
+ }
}
// Encoding implementations
diff --git a/qdp/qdp-core/src/lib.rs b/qdp/qdp-core/src/lib.rs
index bf813c470..65cea496e 100644
--- a/qdp/qdp-core/src/lib.rs
+++ b/qdp/qdp-core/src/lib.rs
@@ -413,10 +413,6 @@ impl QdpEngine {
/// a raw GPU pointer directly, avoiding the GPU→CPU→GPU copy that would
otherwise
/// be required.
///
- /// TODO: Refactor to use QuantumEncoder trait (add `encode_from_gpu_ptr`
to trait)
- /// to reduce duplication with AmplitudeEncoder::encode(). This would also
make it
- /// easier to add GPU pointer support for other encoders (angle, basis) in
the future.
- ///
/// # Arguments
/// * `input_d` - Device pointer to input data (f64 for amplitude/angle,
usize/int64 for basis)
/// * `input_len` - Number of elements in the input
@@ -468,7 +464,6 @@ impl QdpEngine {
stream: *mut std::ffi::c_void,
) -> Result<*mut DLManagedTensor> {
crate::profile_scope!("Mahout::EncodeFromGpuPtr");
-
if input_len == 0 {
return Err(MahoutError::InvalidInput(
"Input data cannot be empty".into(),
@@ -477,186 +472,12 @@ impl QdpEngine {
validate_cuda_input_ptr(&self.device, input_d)?;
- let state_len = 1usize << num_qubits;
- let method = encoding_method.to_ascii_lowercase();
-
- match method.as_str() {
- "amplitude" => {
- if input_len == 0 {
- return Err(MahoutError::InvalidInput(
- "Input data cannot be empty".into(),
- ));
- }
-
- if input_len > state_len {
- return Err(MahoutError::InvalidInput(format!(
- "Input size {} exceeds state vector size {} (2^{}
qubits)",
- input_len, state_len, num_qubits
- )));
- }
-
- let input_d = input_d as *const f64;
-
- let state_vector = {
- crate::profile_scope!("GPU::Alloc");
- gpu::GpuStateVector::new(&self.device, num_qubits,
Precision::Float64)?
- };
-
- let inv_norm = {
- crate::profile_scope!("GPU::NormFromPtr");
- // SAFETY: input_d validity is guaranteed by the caller's
safety contract
- unsafe {
- gpu::AmplitudeEncoder::calculate_inv_norm_gpu(
- &self.device,
- input_d,
- input_len,
- )?
- }
- };
-
- let state_ptr = state_vector.ptr_f64().ok_or_else(|| {
- MahoutError::InvalidInput(
- "State vector precision mismatch (expected float64
buffer)".to_string(),
- )
- })?;
-
- {
- crate::profile_scope!("GPU::KernelLaunch");
- let ret = unsafe {
- qdp_kernels::launch_amplitude_encode(
- input_d,
- state_ptr as *mut std::ffi::c_void,
- input_len,
- state_len,
- inv_norm,
- stream,
- )
- };
-
- if ret != 0 {
- return Err(MahoutError::KernelLaunch(format!(
- "Amplitude encode kernel failed with CUDA error
code: {} ({})",
- ret,
- cuda_error_to_string(ret)
- )));
- }
- }
-
- {
- crate::profile_scope!("GPU::Synchronize");
- gpu::cuda_sync::sync_cuda_stream(stream, "CUDA stream
synchronize failed")?;
- }
-
- let state_vector = state_vector.to_precision(&self.device,
self.precision)?;
- Ok(state_vector.to_dlpack())
- }
- "angle" => {
- if input_len != num_qubits {
- return Err(MahoutError::InvalidInput(format!(
- "Angle encoding expects {} values (one per qubit), got
{}",
- num_qubits, input_len
- )));
- }
-
- let angles_d = input_d as *const f64;
-
- let state_vector = {
- crate::profile_scope!("GPU::Alloc");
- gpu::GpuStateVector::new(&self.device, num_qubits,
Precision::Float64)?
- };
-
- let state_ptr = state_vector.ptr_f64().ok_or_else(|| {
- MahoutError::InvalidInput(
- "State vector precision mismatch (expected float64
buffer)".to_string(),
- )
- })?;
-
- {
- crate::profile_scope!("GPU::KernelLaunch");
- let ret = unsafe {
- qdp_kernels::launch_angle_encode(
- angles_d,
- state_ptr as *mut std::ffi::c_void,
- state_len,
- num_qubits as u32,
- stream,
- )
- };
-
- if ret != 0 {
- return Err(MahoutError::KernelLaunch(format!(
- "Angle encoding kernel failed with CUDA error
code: {} ({})",
- ret,
- cuda_error_to_string(ret)
- )));
- }
- }
-
- {
- crate::profile_scope!("GPU::Synchronize");
- gpu::cuda_sync::sync_cuda_stream(stream, "CUDA stream
synchronize failed")?;
- }
-
- let state_vector = state_vector.to_precision(&self.device,
self.precision)?;
- Ok(state_vector.to_dlpack())
- }
- "basis" => {
- if input_len != 1 {
- return Err(MahoutError::InvalidInput(format!(
- "Basis encoding expects exactly 1 value (the basis
index), got {}",
- input_len
- )));
- }
-
- let basis_indices_d = input_d as *const usize;
-
- let state_vector = {
- crate::profile_scope!("GPU::Alloc");
- gpu::GpuStateVector::new(&self.device, num_qubits,
self.precision)?
- };
-
- let state_ptr = state_vector.ptr_f64().ok_or_else(|| {
- MahoutError::InvalidInput(
- "State vector precision mismatch (expected float64
buffer)".to_string(),
- )
- })?;
-
- // Use batch API with num_samples=1 to avoid D2H copy;
launch_basis_encode takes host usize.
- {
- crate::profile_scope!("GPU::KernelLaunch");
- let ret = unsafe {
- qdp_kernels::launch_basis_encode_batch(
- basis_indices_d,
- state_ptr as *mut std::ffi::c_void,
- 1,
- state_len,
- num_qubits as u32,
- stream,
- )
- };
-
- if ret != 0 {
- return Err(MahoutError::KernelLaunch(format!(
- "Basis encoding kernel failed with CUDA error
code: {} ({})",
- ret,
- cuda_error_to_string(ret)
- )));
- }
- }
-
- {
- crate::profile_scope!("GPU::Synchronize");
- gpu::cuda_sync::sync_cuda_stream(stream, "CUDA stream
synchronize failed")?;
- }
-
- let state_vector = state_vector.to_precision(&self.device,
self.precision)?;
- Ok(state_vector.to_dlpack())
- }
- _ => Err(MahoutError::NotImplemented(format!(
- "GPU pointer encoding currently only supports 'amplitude',
'angle', or 'basis' methods, got '{}'",
- encoding_method
- ))),
- }
+ let encoder = get_encoder(encoding_method)?;
+ let state_vector = unsafe {
+ encoder.encode_from_gpu_ptr(&self.device, input_d, input_len,
num_qubits, stream)
+ }?;
+ let state_vector = state_vector.to_precision(&self.device,
self.precision)?;
+ Ok(state_vector.to_dlpack())
}
/// Encode from existing GPU pointer (float32 input, amplitude encoding
only)
@@ -787,8 +608,6 @@ impl QdpEngine {
///
/// This method enables zero-copy batch encoding from PyTorch CUDA tensors.
///
- /// TODO: Refactor to use QuantumEncoder trait (see `encode_from_gpu_ptr`
TODO).
- ///
/// # Arguments
/// * `input_batch_d` - Device pointer to batch input data (f64 for
amplitude/angle, usize/int64 for basis)
/// * `num_samples` - Number of samples in the batch
@@ -844,10 +663,6 @@ impl QdpEngine {
stream: *mut std::ffi::c_void,
) -> Result<*mut DLManagedTensor> {
crate::profile_scope!("Mahout::EncodeBatchFromGpuPtr");
-
- let state_len = 1usize << num_qubits;
- let method = encoding_method.to_ascii_lowercase();
-
if num_samples == 0 {
return Err(MahoutError::InvalidInput(
"Number of samples cannot be zero".into(),
@@ -862,288 +677,19 @@ impl QdpEngine {
validate_cuda_input_ptr(&self.device, input_batch_d)?;
- match method.as_str() {
- "amplitude" => {
- if sample_size == 0 {
- return Err(MahoutError::InvalidInput(
- "Sample size cannot be zero".into(),
- ));
- }
-
- if sample_size > state_len {
- return Err(MahoutError::InvalidInput(format!(
- "Sample size {} exceeds state vector size {} (2^{}
qubits)",
- sample_size, state_len, num_qubits
- )));
- }
-
- let input_batch_d = input_batch_d as *const f64;
-
- let batch_state_vector = {
- crate::profile_scope!("GPU::AllocBatch");
- gpu::GpuStateVector::new_batch(&self.device, num_samples,
num_qubits)?
- };
-
- let inv_norms_gpu = {
- crate::profile_scope!("GPU::BatchNormKernel");
- use cudarc::driver::DevicePtrMut;
-
- let mut buffer =
self.device.alloc_zeros::<f64>(num_samples).map_err(|e| {
- MahoutError::MemoryAllocation(format!(
- "Failed to allocate norm buffer: {:?}",
- e
- ))
- })?;
-
- let ret = unsafe {
- qdp_kernels::launch_l2_norm_batch(
- input_batch_d,
- num_samples,
- sample_size,
- *buffer.device_ptr_mut() as *mut f64,
- stream,
- )
- };
-
- if ret != 0 {
- return Err(MahoutError::KernelLaunch(format!(
- "Norm reduction kernel failed with CUDA error
code: {} ({})",
- ret,
- cuda_error_to_string(ret)
- )));
- }
-
- buffer
- };
-
- // Validate norms on host to catch zero or NaN samples early
- {
- crate::profile_scope!("GPU::NormValidation");
- let host_inv_norms =
- self.device.dtoh_sync_copy(&inv_norms_gpu).map_err(|e|
{
- MahoutError::Cuda(format!("Failed to copy norms to
host: {:?}", e))
- })?;
-
- if host_inv_norms.iter().any(|v| !v.is_finite() || *v ==
0.0) {
- return Err(MahoutError::InvalidInput(
- "One or more samples have zero or invalid
norm".to_string(),
- ));
- }
- }
-
- {
- crate::profile_scope!("GPU::BatchKernelLaunch");
- use cudarc::driver::DevicePtr;
-
- let state_ptr = batch_state_vector.ptr_f64().ok_or_else(||
{
- MahoutError::InvalidInput(
- "Batch state vector precision mismatch (expected
float64 buffer)"
- .to_string(),
- )
- })?;
-
- let ret = unsafe {
- qdp_kernels::launch_amplitude_encode_batch(
- input_batch_d,
- state_ptr as *mut std::ffi::c_void,
- *inv_norms_gpu.device_ptr() as *const f64,
- num_samples,
- sample_size,
- state_len,
- stream,
- )
- };
-
- if ret != 0 {
- return Err(MahoutError::KernelLaunch(format!(
- "Batch kernel launch failed with CUDA error code:
{} ({})",
- ret,
- cuda_error_to_string(ret)
- )));
- }
- }
-
- {
- crate::profile_scope!("GPU::Synchronize");
- gpu::cuda_sync::sync_cuda_stream(stream, "CUDA stream
synchronize failed")?;
- }
-
- let batch_state_vector =
- batch_state_vector.to_precision(&self.device,
self.precision)?;
- Ok(batch_state_vector.to_dlpack())
- }
- "angle" => {
- use cudarc::driver::DevicePtrMut;
-
- if sample_size == 0 {
- return Err(MahoutError::InvalidInput(
- "Sample size cannot be zero".into(),
- ));
- }
-
- if sample_size != num_qubits {
- return Err(MahoutError::InvalidInput(format!(
- "Angle encoding expects sample_size={} (one angle per
qubit), got {}",
- num_qubits, sample_size
- )));
- }
-
- let input_batch_d = input_batch_d as *const f64;
-
- // Validate that all input angles are finite (no NaN/Inf),
consistent with
- // CPU and host-side batch angle encoding paths.
- let angle_validation_buffer = {
- crate::profile_scope!("GPU::AngleFiniteCheckBatch");
-
- let mut buffer =
self.device.alloc_zeros::<f64>(num_samples).map_err(|e| {
- MahoutError::MemoryAllocation(format!(
- "Failed to allocate angle validation buffer: {:?}",
- e
- ))
- })?;
-
- let ret = unsafe {
- qdp_kernels::launch_l2_norm_batch(
- input_batch_d,
- num_samples,
- sample_size,
- *buffer.device_ptr_mut() as *mut f64,
- stream,
- )
- };
-
- if ret != 0 {
- return Err(MahoutError::KernelLaunch(format!(
- "Angle validation norm kernel failed with CUDA
error code: {} ({})",
- ret,
- cuda_error_to_string(ret)
- )));
- }
-
- buffer
- };
-
- {
-
crate::profile_scope!("GPU::AngleFiniteValidationHostCopy");
- let host_norms = self
- .device
- .dtoh_sync_copy(&angle_validation_buffer)
- .map_err(|e| {
- MahoutError::Cuda(format!(
- "Failed to copy angle validation norms to
host: {:?}",
- e
- ))
- })?;
-
- if host_norms.iter().any(|v| !v.is_finite()) {
- return Err(MahoutError::InvalidInput(
- "Angle encoding batch contains non-finite values
(NaN or Inf)"
- .to_string(),
- ));
- }
- }
-
- let batch_state_vector = {
- crate::profile_scope!("GPU::AllocBatch");
- gpu::GpuStateVector::new_batch(&self.device, num_samples,
num_qubits)?
- };
-
- let state_ptr = batch_state_vector.ptr_f64().ok_or_else(|| {
- MahoutError::InvalidInput(
- "Batch state vector precision mismatch (expected
float64 buffer)"
- .to_string(),
- )
- })?;
-
- {
- crate::profile_scope!("GPU::BatchKernelLaunch");
- let ret = unsafe {
- qdp_kernels::launch_angle_encode_batch(
- input_batch_d,
- state_ptr as *mut std::ffi::c_void,
- num_samples,
- state_len,
- num_qubits as u32,
- stream,
- )
- };
-
- if ret != 0 {
- return Err(MahoutError::KernelLaunch(format!(
- "Batch angle encoding kernel failed: {} ({})",
- ret,
- cuda_error_to_string(ret)
- )));
- }
- }
-
- {
- crate::profile_scope!("GPU::Synchronize");
- gpu::cuda_sync::sync_cuda_stream(stream, "CUDA stream
synchronize failed")?;
- }
-
- let batch_state_vector =
- batch_state_vector.to_precision(&self.device,
self.precision)?;
- Ok(batch_state_vector.to_dlpack())
- }
- "basis" => {
- if sample_size != 1 {
- return Err(MahoutError::InvalidInput(format!(
- "Basis encoding expects sample_size=1 (one index per
sample), got {}",
- sample_size
- )));
- }
-
- let basis_indices_d = input_batch_d as *const usize;
-
- let batch_state_vector = {
- crate::profile_scope!("GPU::AllocBatch");
- gpu::GpuStateVector::new_batch(&self.device, num_samples,
num_qubits)?
- };
-
- let state_ptr = batch_state_vector.ptr_f64().ok_or_else(|| {
- MahoutError::InvalidInput(
- "Batch state vector precision mismatch (expected
float64 buffer)"
- .to_string(),
- )
- })?;
-
- {
- crate::profile_scope!("GPU::BatchKernelLaunch");
- let ret = unsafe {
- qdp_kernels::launch_basis_encode_batch(
- basis_indices_d,
- state_ptr as *mut std::ffi::c_void,
- num_samples,
- state_len,
- num_qubits as u32,
- stream,
- )
- };
-
- if ret != 0 {
- return Err(MahoutError::KernelLaunch(format!(
- "Batch basis encoding kernel failed with CUDA
error code: {} ({})",
- ret,
- cuda_error_to_string(ret)
- )));
- }
- }
-
- {
- crate::profile_scope!("GPU::Synchronize");
- gpu::cuda_sync::sync_cuda_stream(stream, "CUDA stream
synchronize failed")?;
- }
-
- let batch_state_vector =
- batch_state_vector.to_precision(&self.device,
self.precision)?;
- Ok(batch_state_vector.to_dlpack())
- }
- _ => Err(MahoutError::NotImplemented(format!(
- "GPU pointer batch encoding currently only supports
'amplitude', 'angle', or 'basis' methods, got '{}'",
- encoding_method
- ))),
- }
+ let encoder = get_encoder(encoding_method)?;
+ let batch_state_vector = unsafe {
+ encoder.encode_batch_from_gpu_ptr(
+ &self.device,
+ input_batch_d,
+ num_samples,
+ sample_size,
+ num_qubits,
+ stream,
+ )
+ }?;
+ let batch_state_vector = batch_state_vector.to_precision(&self.device,
self.precision)?;
+ Ok(batch_state_vector.to_dlpack())
}
}
diff --git a/qdp/qdp-core/tests/gpu_ptr_encoding.rs
b/qdp/qdp-core/tests/gpu_ptr_encoding.rs
index e9f23da34..c672d6956 100644
--- a/qdp/qdp-core/tests/gpu_ptr_encoding.rs
+++ b/qdp/qdp-core/tests/gpu_ptr_encoding.rs
@@ -80,7 +80,10 @@ fn test_encode_from_gpu_ptr_unknown_method() {
Err(MahoutError::NotImplemented(msg)) => {
assert!(msg.contains("unknown_encoding") || msg.contains("only
supports"));
}
- _ => panic!("expected NotImplemented, got {:?}", result),
+ Err(MahoutError::InvalidInput(msg)) => {
+ assert!(msg.contains("Unknown encoder") ||
msg.contains("unknown_encoding"));
+ }
+ _ => panic!("expected NotImplemented or InvalidInput, got {:?}",
result),
}
}
@@ -159,7 +162,10 @@ fn test_encode_batch_from_gpu_ptr_unknown_method() {
Err(MahoutError::NotImplemented(msg)) => {
assert!(msg.contains("unknown_method") || msg.contains("only
supports"));
}
- _ => panic!("expected NotImplemented, got {:?}", result),
+ Err(MahoutError::InvalidInput(msg)) => {
+ assert!(msg.contains("Unknown encoder") ||
msg.contains("unknown_method"));
+ }
+ _ => panic!("expected NotImplemented or InvalidInput, got {:?}",
result),
}
}
diff --git a/testing/qdp/test_bindings.py b/testing/qdp/test_bindings.py
index fe6b07368..007def16e 100644
--- a/testing/qdp/test_bindings.py
+++ b/testing/qdp/test_bindings.py
@@ -424,7 +424,7 @@ def test_encode_cuda_tensor_preserves_input(data_shape,
is_batch):
@pytest.mark.gpu
@pytest.mark.parametrize("encoding_method", ["iqp"])
def test_encode_cuda_tensor_unsupported_encoding(encoding_method):
- """Test error when using CUDA tensor with unsupported encoding (CUDA
supports amplitude, angle, and basis only)."""
+ """Test error when using CUDA tensor with an encoding not supported on GPU (only amplitude, angle, basis)."""
pytest.importorskip("torch")
from _qdp import QdpEngine
@@ -433,11 +433,12 @@ def
test_encode_cuda_tensor_unsupported_encoding(encoding_method):
engine = QdpEngine(0)
+ # CUDA path only supports amplitude, angle, basis; iqp/iqp-z should raise unsupported error
data = torch.tensor([1.0, 0.0, 0.0, 0.0], dtype=torch.float64,
device="cuda:0")
with pytest.raises(
RuntimeError,
- match="only supports 'amplitude', 'angle', or 'basis' methods.*Use
tensor.cpu\\(\\)",
+ match="only supports .*amplitude.*angle.*basis.*Use tensor.cpu",
):
engine.encode(data, 2, encoding_method)