This is an automated email from the ASF dual-hosted git repository.
guanmingchiu pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/mahout.git
The following commit(s) were added to refs/heads/main by this push:
new 42da30d42 [QDP] Add zero-copy amplitude encoding from float32 GPU
tensors (#999)
42da30d42 is described below
commit 42da30d4274e08ead40f33d4650a99aab17e7b64
Author: Vic Wen <[email protected]>
AuthorDate: Wed Feb 4 15:34:35 2026 +0800
[QDP] Add zero-copy amplitude encoding from float32 GPU tensors (#999)
* feat: add float32 GPU pointer encoding and inverse norm calculation with
stream support
* refactor: streamline GPU state vector encoding to support precision
conversion for both Float32 and Float64
* test: add test file for GPU pointer encoding with Float32 precision
* refactor: improve GPU pointer validation and update documentation for
encoding methods
* test: update unsupported encoding test to reflect changes in CUDA tensor
encoding methods
* test: add unit test for handling null pointer in GPU pointer encoding for
Float32
---
qdp/qdp-core/src/gpu/encodings/amplitude.rs | 26 ++-
qdp/qdp-core/src/lib.rs | 191 ++++++++++++++++++
qdp/qdp-core/tests/common/mod.rs | 8 +-
qdp/qdp-core/tests/gpu_ptr_encoding.rs | 300 ++++++++++++++++++++++++++--
testing/qdp/test_bindings.py | 53 +----
5 files changed, 512 insertions(+), 66 deletions(-)
diff --git a/qdp/qdp-core/src/gpu/encodings/amplitude.rs
b/qdp/qdp-core/src/gpu/encodings/amplitude.rs
index 037b3bd31..85259e18a 100644
--- a/qdp/qdp-core/src/gpu/encodings/amplitude.rs
+++ b/qdp/qdp-core/src/gpu/encodings/amplitude.rs
@@ -510,6 +510,28 @@ impl AmplitudeEncoder {
device: &Arc<CudaDevice>,
input_ptr: *const f32,
len: usize,
+ ) -> Result<f32> {
+ unsafe {
+ Self::calculate_inv_norm_gpu_f32_with_stream(
+ device,
+ input_ptr,
+ len,
+ std::ptr::null_mut(),
+ )
+ }
+ }
+
+ /// Compute inverse L2 norm on GPU for float32 input on a given stream.
+ ///
+ /// # Safety
+ /// The caller must ensure `input_ptr` points to valid GPU memory containing
+ /// at least `len` f32 elements on the same device as `device`.
+ #[cfg(target_os = "linux")]
+ pub unsafe fn calculate_inv_norm_gpu_f32_with_stream(
+ device: &Arc<CudaDevice>,
+ input_ptr: *const f32,
+ len: usize,
+ stream: *mut c_void,
) -> Result<f32> {
crate::profile_scope!("GPU::NormSingleF32");
@@ -522,7 +544,7 @@ impl AmplitudeEncoder {
input_ptr,
len,
*norm_buffer.device_ptr_mut() as *mut f32,
- std::ptr::null_mut(), // default stream
+ stream,
)
};
@@ -534,6 +556,8 @@ impl AmplitudeEncoder {
)));
}
+ sync_cuda_stream(stream, "Norm stream synchronize failed (f32)")?;
+
let inv_norm_host = device
.dtoh_sync_copy(&norm_buffer)
.map_err(|e| MahoutError::Cuda(format!("Failed to copy f32 norm to
host: {:?}", e)))?;
diff --git a/qdp/qdp-core/src/lib.rs b/qdp/qdp-core/src/lib.rs
index 1fe172d1b..bf813c470 100644
--- a/qdp/qdp-core/src/lib.rs
+++ b/qdp/qdp-core/src/lib.rs
@@ -45,12 +45,63 @@ pub use pipeline_runner::{
run_throughput_pipeline,
};
+use std::ffi::c_void;
use std::sync::Arc;
use crate::dlpack::DLManagedTensor;
use crate::gpu::get_encoder;
use cudarc::driver::CudaDevice;
+#[cfg(target_os = "linux")]
+fn validate_cuda_input_ptr(device: &CudaDevice, ptr: *const c_void) ->
Result<()> {
+ use crate::gpu::cuda_ffi::{
+ CUDA_MEMORY_TYPE_DEVICE, CUDA_MEMORY_TYPE_MANAGED,
CudaPointerAttributes,
+ cudaPointerGetAttributes,
+ };
+
+ if ptr.is_null() {
+ return Err(MahoutError::InvalidInput(
+ "Input GPU pointer is null".to_string(),
+ ));
+ }
+
+ let mut attrs = CudaPointerAttributes {
+ memory_type: 0,
+ device: 0,
+ device_pointer: std::ptr::null_mut(),
+ host_pointer: std::ptr::null_mut(),
+ is_managed: 0,
+ allocation_flags: 0,
+ };
+
+ let ret = unsafe { cudaPointerGetAttributes(&mut attrs as *mut _, ptr) };
+ if ret != 0 {
+ return Err(MahoutError::InvalidInput(format!(
+ "cudaPointerGetAttributes failed for input pointer: {} ({})",
+ ret,
+ cuda_error_to_string(ret)
+ )));
+ }
+
+ if attrs.memory_type != CUDA_MEMORY_TYPE_DEVICE && attrs.memory_type !=
CUDA_MEMORY_TYPE_MANAGED
+ {
+ return Err(MahoutError::InvalidInput(format!(
+ "Input pointer is not CUDA device memory (memory_type={})",
+ attrs.memory_type
+ )));
+ }
+
+ let device_ordinal = device.ordinal() as i32;
+ if attrs.device >= 0 && attrs.device != device_ordinal {
+ return Err(MahoutError::InvalidInput(format!(
+ "Input pointer device mismatch: pointer on cuda:{}, engine on cuda:{}",
+ attrs.device, device_ordinal
+ )));
+ }
+
+ Ok(())
+}
+
/// Main entry point for Mahout QDP
///
/// Manages GPU context and dispatches encoding tasks.
@@ -418,6 +469,14 @@ impl QdpEngine {
) -> Result<*mut DLManagedTensor> {
crate::profile_scope!("Mahout::EncodeFromGpuPtr");
+ if input_len == 0 {
+ return Err(MahoutError::InvalidInput(
+ "Input data cannot be empty".into(),
+ ));
+ }
+
+ validate_cuda_input_ptr(&self.device, input_d)?;
+
let state_len = 1usize << num_qubits;
let method = encoding_method.to_ascii_lowercase();
@@ -600,6 +659,130 @@ impl QdpEngine {
}
}
+ /// Encode from existing GPU pointer (float32 input, amplitude encoding
only)
+ ///
+ /// Zero-copy encoding from PyTorch CUDA float32 tensors. Uses the default
CUDA stream.
+ /// For stream interop use `encode_from_gpu_ptr_f32_with_stream`.
+ ///
+ /// # Arguments
+ /// * `input_d` - Device pointer to input data (f32 array on GPU)
+ /// * `input_len` - Number of f32 elements in the input
+ /// * `num_qubits` - Number of qubits for encoding
+ ///
+ /// # Returns
+ /// DLPack pointer (state vector in engine precision) for zero-copy
PyTorch integration.
+ /// Internal computation is f32; output is converted to [`Precision`] of
the engine.
+ ///
+ /// # Safety
+ /// The input pointer must:
+ /// - Point to valid GPU memory on the same device as the engine
+ /// - Contain at least `input_len` f32 elements
+ /// - Remain valid for the duration of this call
+ #[cfg(target_os = "linux")]
+ pub unsafe fn encode_from_gpu_ptr_f32(
+ &self,
+ input_d: *const f32,
+ input_len: usize,
+ num_qubits: usize,
+ ) -> Result<*mut DLManagedTensor> {
+ unsafe {
+ self.encode_from_gpu_ptr_f32_with_stream(
+ input_d,
+ input_len,
+ num_qubits,
+ std::ptr::null_mut(),
+ )
+ }
+ }
+
+ /// Encode from existing GPU pointer (float32) on a specified CUDA stream.
+ ///
+ /// # Returns
+ /// DLPack pointer (state vector in engine precision). Pass null for
`stream` to use the default stream.
+ ///
+ /// # Safety
+ /// In addition to the `encode_from_gpu_ptr_f32` requirements, the stream
pointer
+ /// must remain valid for the duration of this call.
+ #[cfg(target_os = "linux")]
+ pub unsafe fn encode_from_gpu_ptr_f32_with_stream(
+ &self,
+ input_d: *const f32,
+ input_len: usize,
+ num_qubits: usize,
+ stream: *mut c_void,
+ ) -> Result<*mut DLManagedTensor> {
+ crate::profile_scope!("Mahout::EncodeFromGpuPtrF32");
+
+ if input_len == 0 {
+ return Err(MahoutError::InvalidInput(
+ "Input data cannot be empty".into(),
+ ));
+ }
+
+ validate_cuda_input_ptr(&self.device, input_d as *const c_void)?;
+
+ let state_len = 1usize << num_qubits;
+ if input_len > state_len {
+ return Err(MahoutError::InvalidInput(format!(
+ "Input size {} exceeds state vector size {} (2^{} qubits)",
+ input_len, state_len, num_qubits
+ )));
+ }
+
+ let state_vector = {
+ crate::profile_scope!("GPU::Alloc");
+ gpu::GpuStateVector::new(&self.device, num_qubits,
Precision::Float32)?
+ };
+
+ let inv_norm = {
+ crate::profile_scope!("GPU::NormFromPtr");
+ unsafe {
+ gpu::AmplitudeEncoder::calculate_inv_norm_gpu_f32_with_stream(
+ &self.device,
+ input_d,
+ input_len,
+ stream,
+ )?
+ }
+ };
+
+ let state_ptr = state_vector.ptr_f32().ok_or_else(|| {
+ MahoutError::InvalidInput(
+ "State vector precision mismatch (expected float32 buffer)".to_string(),
+ )
+ })?;
+
+ {
+ crate::profile_scope!("GPU::KernelLaunch");
+ let ret = unsafe {
+ qdp_kernels::launch_amplitude_encode_f32(
+ input_d,
+ state_ptr as *mut std::ffi::c_void,
+ input_len,
+ state_len,
+ inv_norm,
+ stream,
+ )
+ };
+
+ if ret != 0 {
+ return Err(MahoutError::KernelLaunch(format!(
+ "Amplitude encode (f32) kernel failed with CUDA error code: {} ({})",
+ ret,
+ cuda_error_to_string(ret)
+ )));
+ }
+ }
+
+ {
+ crate::profile_scope!("GPU::Synchronize");
+ gpu::cuda_sync::sync_cuda_stream(stream, "CUDA stream synchronize failed")?;
+ }
+
+ let state_vector = state_vector.to_precision(&self.device,
self.precision)?;
+ Ok(state_vector.to_dlpack())
+ }
+
/// Encode batch from existing GPU pointer (zero-copy for CUDA tensors)
///
/// This method enables zero-copy batch encoding from PyTorch CUDA tensors.
@@ -671,6 +854,14 @@ impl QdpEngine {
));
}
+ if sample_size == 0 {
+ return Err(MahoutError::InvalidInput(
+ "Sample size cannot be zero".into(),
+ ));
+ }
+
+ validate_cuda_input_ptr(&self.device, input_batch_d)?;
+
match method.as_str() {
"amplitude" => {
if sample_size == 0 {
diff --git a/qdp/qdp-core/tests/common/mod.rs b/qdp/qdp-core/tests/common/mod.rs
index 9afb31e40..25e43c262 100644
--- a/qdp/qdp-core/tests/common/mod.rs
+++ b/qdp/qdp-core/tests/common/mod.rs
@@ -14,8 +14,14 @@
// See the License for the specific language governing permissions and
// limitations under the License.
-/// Creates normalized test data
+/// Creates normalized test data (f64)
#[allow(dead_code)] // Used by multiple test modules
pub fn create_test_data(size: usize) -> Vec<f64> {
(0..size).map(|i| (i as f64) / (size as f64)).collect()
}
+
+/// Creates normalized test data (f32)
+#[allow(dead_code)]
+pub fn create_test_data_f32(size: usize) -> Vec<f32> {
+ (0..size).map(|i| (i as f32) / (size as f32)).collect()
+}
diff --git a/qdp/qdp-core/tests/gpu_ptr_encoding.rs
b/qdp/qdp-core/tests/gpu_ptr_encoding.rs
index 7851eb1e1..e9f23da34 100644
--- a/qdp/qdp-core/tests/gpu_ptr_encoding.rs
+++ b/qdp/qdp-core/tests/gpu_ptr_encoding.rs
@@ -19,12 +19,39 @@
#![cfg(target_os = "linux")]
use std::ffi::c_void;
+use std::sync::Arc;
-use cudarc::driver::{CudaDevice, DevicePtr};
-use qdp_core::{MahoutError, QdpEngine};
+use cudarc::driver::{CudaDevice, CudaSlice, DevicePtr, DeviceSlice};
+use qdp_core::{MahoutError, Precision, QdpEngine};
mod common;
+// ---- Helpers for f32 encode_from_gpu_ptr_f32 tests ----
+
+fn engine_f32() -> Option<QdpEngine> {
+ QdpEngine::new_with_precision(0, Precision::Float32).ok()
+}
+
+fn device_and_f32_slice(data: &[f32]) -> Option<(Arc<CudaDevice>,
CudaSlice<f32>)> {
+ let device = CudaDevice::new(0).ok()?;
+ let slice = device.htod_sync_copy(data).ok()?;
+ Some((device, slice))
+}
+
+fn assert_dlpack_shape_2_4_and_delete(dlpack_ptr: *mut
qdp_core::dlpack::DLManagedTensor) {
+ assert!(!dlpack_ptr.is_null());
+ unsafe {
+ let tensor = &(*dlpack_ptr).dl_tensor;
+ assert_eq!(tensor.ndim, 2);
+ let shape = std::slice::from_raw_parts(tensor.shape, 2);
+ assert_eq!(shape[0], 1);
+ assert_eq!(shape[1], 4);
+ if let Some(deleter) = (*dlpack_ptr).deleter {
+ deleter(dlpack_ptr);
+ }
+ }
+}
+
// ---- Validation / error-path tests (return before using pointer) ----
#[test]
@@ -34,7 +61,19 @@ fn test_encode_from_gpu_ptr_unknown_method() {
Err(_) => return,
};
- let result = unsafe { engine.encode_from_gpu_ptr(std::ptr::null(), 4, 2,
"unknown_encoding") };
+ // Need valid GPU pointer so we reach method dispatch (validation runs
first)
+ let device = match CudaDevice::new(0) {
+ Ok(d) => d,
+ Err(_) => return,
+ };
+ let data = common::create_test_data(4);
+ let data_d = match device.htod_sync_copy(data.as_slice()) {
+ Ok(b) => b,
+ Err(_) => return,
+ };
+ let ptr = *data_d.device_ptr() as *const f64 as *const c_void;
+
+ let result = unsafe { engine.encode_from_gpu_ptr(ptr, 4, 2,
"unknown_encoding") };
assert!(result.is_err());
match result {
@@ -70,8 +109,20 @@ fn test_encode_from_gpu_ptr_amplitude_input_exceeds_state()
{
Err(_) => return,
};
+ // Need valid GPU pointer so we reach input_len > state_len check
(validation runs first)
+ let device = match CudaDevice::new(0) {
+ Ok(d) => d,
+ Err(_) => return,
+ };
+ let data = common::create_test_data(10);
+ let data_d = match device.htod_sync_copy(data.as_slice()) {
+ Ok(b) => b,
+ Err(_) => return,
+ };
+ let ptr = *data_d.device_ptr() as *const f64 as *const c_void;
+
// 2 qubits -> state_len = 4; request input_len = 10
- let result = unsafe { engine.encode_from_gpu_ptr(std::ptr::null(), 10, 2,
"amplitude") };
+ let result = unsafe { engine.encode_from_gpu_ptr(ptr, 10, 2, "amplitude")
};
assert!(result.is_err());
match result {
@@ -89,8 +140,19 @@ fn test_encode_batch_from_gpu_ptr_unknown_method() {
Err(_) => return,
};
- let result =
- unsafe { engine.encode_batch_from_gpu_ptr(std::ptr::null(), 2, 4, 2,
"unknown_method") };
+ // Need valid GPU pointer so we reach method dispatch (validation runs
first)
+ let device = match CudaDevice::new(0) {
+ Ok(d) => d,
+ Err(_) => return,
+ };
+ let data = common::create_test_data(8);
+ let data_d = match device.htod_sync_copy(data.as_slice()) {
+ Ok(b) => b,
+ Err(_) => return,
+ };
+ let ptr = *data_d.device_ptr() as *const f64 as *const c_void;
+
+ let result = unsafe { engine.encode_batch_from_gpu_ptr(ptr, 2, 4, 2,
"unknown_method") };
assert!(result.is_err());
match result {
@@ -127,17 +189,34 @@ fn test_encode_from_gpu_ptr_basis_input_len_not_one() {
Err(_) => return,
};
- // Basis single encoding expects exactly 1 value; input_len != 1 must
return error.
- let result = unsafe { engine.encode_from_gpu_ptr(std::ptr::null(), 0, 2,
"basis") };
+ // Need valid GPU pointer so we reach basis input_len checks (validation
runs first)
+ let device = match CudaDevice::new(0) {
+ Ok(d) => d,
+ Err(_) => return,
+ };
+ let indices: Vec<usize> = vec![0, 1, 2];
+ let indices_d = match device.htod_sync_copy(indices.as_slice()) {
+ Ok(b) => b,
+ Err(_) => return,
+ };
+ let ptr = *indices_d.device_ptr() as *const usize as *const c_void;
+
+ // Basis single encoding expects exactly 1 value; input_len == 0 returns
empty error.
+ let result = unsafe { engine.encode_from_gpu_ptr(ptr, 0, 2, "basis") };
assert!(result.is_err());
match result {
Err(MahoutError::InvalidInput(msg)) => {
- assert!(msg.contains("exactly 1") || msg.contains("basis"));
+ assert!(
+ msg.contains("exactly 1") || msg.contains("basis") ||
msg.contains("empty"),
+ "expected exactly 1 / basis / empty, got: {}",
+ msg
+ );
}
_ => panic!("expected InvalidInput for input_len != 1, got {:?}",
result),
}
- let result = unsafe { engine.encode_from_gpu_ptr(std::ptr::null(), 3, 2,
"basis") };
+ // input_len == 3 (basis expects 1)
+ let result = unsafe { engine.encode_from_gpu_ptr(ptr, 3, 2, "basis") };
assert!(result.is_err());
match result {
Err(MahoutError::InvalidInput(msg)) => {
@@ -154,8 +233,20 @@ fn
test_encode_batch_from_gpu_ptr_basis_sample_size_not_one() {
Err(_) => return,
};
- // Basis batch expects sample_size == 1 (one index per sample).
- let result = unsafe { engine.encode_batch_from_gpu_ptr(std::ptr::null(),
2, 4, 2, "basis") };
+ // Need valid GPU pointer so we reach basis sample_size check (validation
runs first)
+ let device = match CudaDevice::new(0) {
+ Ok(d) => d,
+ Err(_) => return,
+ };
+ let indices: Vec<usize> = vec![0, 1];
+ let indices_d = match device.htod_sync_copy(indices.as_slice()) {
+ Ok(b) => b,
+ Err(_) => return,
+ };
+ let ptr = *indices_d.device_ptr() as *const usize as *const c_void;
+
+ // Basis batch expects sample_size == 1 (one index per sample);
sample_size=4.
+ let result = unsafe { engine.encode_batch_from_gpu_ptr(ptr, 2, 4, 2,
"basis") };
assert!(result.is_err());
match result {
Err(MahoutError::InvalidInput(msg)) => {
@@ -326,7 +417,8 @@ fn test_encode_batch_from_gpu_ptr_amplitude_success() {
#[test]
fn test_encode_from_gpu_ptr_basis_success() {
- let engine = match QdpEngine::new(0) {
+ // Basis path uses ptr_f64(); engine must be Float64
+ let engine = match QdpEngine::new_with_precision(0, Precision::Float64) {
Ok(e) => e,
Err(_) => {
println!("SKIP: No GPU available");
@@ -377,7 +469,8 @@ fn test_encode_from_gpu_ptr_basis_success() {
#[test]
fn test_encode_batch_from_gpu_ptr_basis_success() {
- let engine = match QdpEngine::new(0) {
+ // Basis path uses ptr_f64(); engine must be Float64
+ let engine = match QdpEngine::new_with_precision(0, Precision::Float64) {
Ok(e) => e,
Err(_) => {
println!("SKIP: No GPU available");
@@ -423,3 +516,182 @@ fn test_encode_batch_from_gpu_ptr_basis_success() {
deleter(dlpack_ptr);
}
}
+
+// ---- encode_from_gpu_ptr_f32 (float32 amplitude) ----
+
+#[test]
+fn test_encode_from_gpu_ptr_f32_success() {
+ let engine = match engine_f32() {
+ Some(e) => e,
+ None => {
+ println!("SKIP: No GPU");
+ return;
+ }
+ };
+ let (_device, input_d) = match device_and_f32_slice(&[1.0, 0.0, 0.0, 0.0])
{
+ Some(t) => t,
+ None => {
+ println!("SKIP: No CUDA device");
+ return;
+ }
+ };
+ let ptr = *input_d.device_ptr() as *const f32;
+ let dlpack_ptr = unsafe {
+ engine
+ .encode_from_gpu_ptr_f32(ptr, input_d.len(), 2)
+ .expect("encode_from_gpu_ptr_f32")
+ };
+ assert_dlpack_shape_2_4_and_delete(dlpack_ptr);
+}
+
+#[test]
+fn test_encode_from_gpu_ptr_f32_with_stream_success() {
+ let engine = match engine_f32() {
+ Some(e) => e,
+ None => {
+ println!("SKIP: No GPU");
+ return;
+ }
+ };
+ let (_device, input_d) = match device_and_f32_slice(&[1.0, 0.0, 0.0, 0.0])
{
+ Some(t) => t,
+ None => {
+ println!("SKIP: No CUDA device");
+ return;
+ }
+ };
+ let ptr = *input_d.device_ptr() as *const f32;
+ let dlpack_ptr = unsafe {
+ engine.encode_from_gpu_ptr_f32_with_stream(ptr, input_d.len(), 2,
std::ptr::null_mut())
+ }
+ .expect("encode_from_gpu_ptr_f32_with_stream");
+ assert_dlpack_shape_2_4_and_delete(dlpack_ptr);
+}
+
+#[test]
+fn test_encode_from_gpu_ptr_f32_with_stream_non_default_success() {
+ let engine = match engine_f32() {
+ Some(e) => e,
+ None => {
+ println!("SKIP: No GPU");
+ return;
+ }
+ };
+ let (device, input_d) = match device_and_f32_slice(&[1.0, 0.0, 0.0, 0.0]) {
+ Some(t) => t,
+ None => {
+ println!("SKIP: No CUDA device");
+ return;
+ }
+ };
+ let stream = device.fork_default_stream().expect("fork_default_stream");
+ let dlpack_ptr = unsafe {
+ engine
+ .encode_from_gpu_ptr_f32_with_stream(
+ *input_d.device_ptr() as *const f32,
+ input_d.len(),
+ 2,
+ stream.stream as *mut c_void,
+ )
+ .expect("encode_from_gpu_ptr_f32_with_stream (non-default stream)")
+ };
+ assert_dlpack_shape_2_4_and_delete(dlpack_ptr);
+}
+
+#[test]
+fn test_encode_from_gpu_ptr_f32_success_f64_engine() {
+ let engine = match QdpEngine::new_with_precision(0,
Precision::Float64).ok() {
+ Some(e) => e,
+ None => {
+ println!("SKIP: No GPU");
+ return;
+ }
+ };
+ let (_device, input_d) = match device_and_f32_slice(&[1.0, 0.0, 0.0, 0.0])
{
+ Some(t) => t,
+ None => {
+ println!("SKIP: No CUDA device");
+ return;
+ }
+ };
+ let ptr = *input_d.device_ptr() as *const f32;
+ let dlpack_ptr = unsafe {
+ engine
+ .encode_from_gpu_ptr_f32(ptr, input_d.len(), 2)
+ .expect("encode_from_gpu_ptr_f32 (Float64 engine)")
+ };
+ assert_dlpack_shape_2_4_and_delete(dlpack_ptr);
+}
+
+#[test]
+fn test_encode_from_gpu_ptr_f32_empty_input() {
+ let engine = match engine_f32() {
+ Some(e) => e,
+ None => {
+ println!("SKIP: No GPU");
+ return;
+ }
+ };
+ let (_device, input_d) = match device_and_f32_slice(&[1.0]) {
+ Some(t) => t,
+ None => {
+ println!("SKIP: No CUDA device");
+ return;
+ }
+ };
+ let ptr = *input_d.device_ptr() as *const f32;
+ let result = unsafe { engine.encode_from_gpu_ptr_f32(ptr, 0, 2) };
+ assert!(result.is_err());
+ match &result.unwrap_err() {
+ MahoutError::InvalidInput(msg) => assert!(msg.contains("empty")),
+ e => panic!("Expected InvalidInput, got {:?}", e),
+ }
+}
+
+#[test]
+fn test_encode_from_gpu_ptr_f32_null_pointer() {
+ let engine = match engine_f32() {
+ Some(e) => e,
+ None => {
+ println!("SKIP: No GPU");
+ return;
+ }
+ };
+ let result = unsafe { engine.encode_from_gpu_ptr_f32(std::ptr::null(), 4,
2) };
+ assert!(result.is_err());
+ match &result.unwrap_err() {
+ MahoutError::InvalidInput(msg) => assert!(msg.contains("null")),
+ e => panic!("Expected InvalidInput, got {:?}", e),
+ }
+}
+
+#[test]
+fn test_encode_from_gpu_ptr_f32_input_exceeds_state_len() {
+ let engine = match engine_f32() {
+ Some(e) => e,
+ None => {
+ println!("SKIP: No GPU");
+ return;
+ }
+ };
+ let (_device, input_d) = match device_and_f32_slice(&[1.0, 0.0, 0.0, 0.0,
0.0]) {
+ Some(t) => t,
+ None => {
+ println!("SKIP: No CUDA device");
+ return;
+ }
+ };
+ let ptr = *input_d.device_ptr() as *const f32;
+ let result = unsafe { engine.encode_from_gpu_ptr_f32(ptr, input_d.len(),
2) };
+ assert!(result.is_err());
+ match &result.unwrap_err() {
+ MahoutError::InvalidInput(msg) => {
+ assert!(
+ msg.contains("exceeds") || msg.contains("state vector"),
+ "expected 'exceeds' or 'state vector', got: {}",
+ msg
+ );
+ }
+ e => panic!("Expected InvalidInput, got {:?}", e),
+ }
+}
diff --git a/testing/qdp/test_bindings.py b/testing/qdp/test_bindings.py
index a0f043456..fe6b07368 100644
--- a/testing/qdp/test_bindings.py
+++ b/testing/qdp/test_bindings.py
@@ -25,8 +25,6 @@ from .qdp_test_utils import requires_qdp
def _has_multi_gpu():
"""Check if multiple GPUs are available via PyTorch."""
try:
- import torch
-
return torch.cuda.is_available() and torch.cuda.device_count() >= 2
except ImportError:
return False
@@ -81,7 +79,6 @@ def test_dlpack_device():
def test_dlpack_device_id_non_zero():
"""Test device_id propagation for non-zero devices (requires multi-GPU)."""
pytest.importorskip("torch")
- import torch
from _qdp import QdpEngine
# Test with device_id=1 (second GPU)
@@ -108,7 +105,6 @@ def test_dlpack_device_id_non_zero():
@pytest.mark.gpu
def test_dlpack_single_use():
"""Test that __dlpack__ can only be called once (requires GPU)."""
- import torch
from _qdp import QdpEngine
engine = QdpEngine(0)
@@ -130,7 +126,6 @@ def test_dlpack_single_use():
@pytest.mark.parametrize("stream", [1, 2], ids=["stream_legacy",
"stream_per_thread"])
def test_dlpack_with_stream(stream):
"""Test __dlpack__(stream=...) syncs CUDA stream before returning capsule
(DLPack 0.8+)."""
- import torch
from _qdp import QdpEngine
engine = QdpEngine(0)
@@ -149,7 +144,6 @@ def test_dlpack_with_stream(stream):
def test_pytorch_integration():
"""Test PyTorch integration via DLPack (requires GPU and PyTorch)."""
pytest.importorskip("torch")
- import torch
from _qdp import QdpEngine
engine = QdpEngine(0)
@@ -178,7 +172,6 @@ def test_pytorch_integration():
def test_precision(precision, expected_dtype):
"""Test different precision settings produce correct output dtypes."""
pytest.importorskip("torch")
- import torch
from _qdp import QdpEngine
engine = QdpEngine(0, precision=precision)
@@ -207,7 +200,6 @@ def test_precision(precision, expected_dtype):
def test_encode_tensor_cpu(data_shape, expected_shape):
"""Test encoding from CPU PyTorch tensor (1D or 2D, zero-copy)."""
pytest.importorskip("torch")
- import torch
from _qdp import QdpEngine
if not torch.cuda.is_available():
@@ -233,7 +225,6 @@ def test_encode_from_tensorflow_binding():
pytest.importorskip("torch")
tf = pytest.importorskip("tensorflow")
import numpy as np
- import torch
from _qdp import QdpEngine
import os
import tempfile
@@ -267,7 +258,6 @@ def test_encode_from_tensorflow_binding():
def test_encode_errors():
"""Test error handling for unified encode method."""
pytest.importorskip("torch")
- import torch
from _qdp import QdpEngine
if not torch.cuda.is_available():
@@ -300,7 +290,6 @@ def test_encode_errors():
def test_encode_cuda_tensor(data_shape, expected_shape, expected_batch_size):
"""Test encoding from CUDA tensor (1D or 2D, zero-copy)."""
pytest.importorskip("torch")
- import torch
from _qdp import QdpEngine
if not torch.cuda.is_available():
@@ -328,7 +317,6 @@ def test_encode_cuda_tensor(data_shape, expected_shape,
expected_batch_size):
def test_encode_cuda_tensor_wrong_dtype():
"""Test error when CUDA tensor has wrong dtype (non-float64)."""
pytest.importorskip("torch")
- import torch
from _qdp import QdpEngine
if not torch.cuda.is_available():
@@ -347,7 +335,6 @@ def test_encode_cuda_tensor_wrong_dtype():
def test_encode_cuda_tensor_non_contiguous():
"""Test error when CUDA tensor is non-contiguous."""
pytest.importorskip("torch")
- import torch
from _qdp import QdpEngine
if not torch.cuda.is_available():
@@ -373,7 +360,6 @@ def test_encode_cuda_tensor_non_contiguous():
def test_encode_cuda_tensor_device_mismatch():
"""Test error when CUDA tensor is on wrong device (multi-GPU only)."""
pytest.importorskip("torch")
- import torch
from _qdp import QdpEngine
# Engine on device 0
@@ -390,7 +376,6 @@ def test_encode_cuda_tensor_device_mismatch():
def test_encode_cuda_tensor_empty():
"""Test error when CUDA tensor is empty."""
pytest.importorskip("torch")
- import torch
from _qdp import QdpEngine
if not torch.cuda.is_available():
@@ -416,7 +401,6 @@ def test_encode_cuda_tensor_empty():
def test_encode_cuda_tensor_preserves_input(data_shape, is_batch):
"""Test that input CUDA tensor (1D or 2D) is not modified after
encoding."""
pytest.importorskip("torch")
- import torch
from _qdp import QdpEngine
if not torch.cuda.is_available():
@@ -438,11 +422,10 @@ def test_encode_cuda_tensor_preserves_input(data_shape,
is_batch):
@requires_qdp
@pytest.mark.gpu
-@pytest.mark.parametrize("encoding_method", ["basis"])
+@pytest.mark.parametrize("encoding_method", ["iqp"])
def test_encode_cuda_tensor_unsupported_encoding(encoding_method):
- """Test error when using CUDA tensor with unsupported encoding (CUDA
supports amplitude and angle only)."""
+ """Test error when using CUDA tensor with unsupported encoding (CUDA
supports amplitude, angle, and basis only)."""
pytest.importorskip("torch")
- import torch
from _qdp import QdpEngine
if not torch.cuda.is_available():
@@ -450,13 +433,11 @@ def
test_encode_cuda_tensor_unsupported_encoding(encoding_method):
engine = QdpEngine(0)
- # CUDA tensors currently only support amplitude encoding
- # Use non-zero data to avoid normalization issues
data = torch.tensor([1.0, 0.0, 0.0, 0.0], dtype=torch.float64,
device="cuda:0")
with pytest.raises(
RuntimeError,
- match="only supports 'amplitude' and 'angle' methods.*Use tensor.cpu\\(\\)",
+ match="only supports 'amplitude', 'angle', or 'basis' methods.*Use tensor.cpu\\(\\)",
):
engine.encode(data, 2, encoding_method)
@@ -474,7 +455,6 @@ def
test_encode_cuda_tensor_unsupported_encoding(encoding_method):
def test_encode_3d_rejected(input_type, error_match):
"""Test error when input has 3+ dimensions (CUDA tensor, CPU tensor, or
NumPy array)."""
pytest.importorskip("torch")
- import torch
import numpy as np
from _qdp import QdpEngine
@@ -520,7 +500,6 @@ def test_encode_3d_rejected(input_type, error_match):
def test_encode_cuda_tensor_non_finite_values(tensor_factory, description):
"""Test error when CUDA tensor contains non-finite values (zeros, NaN,
Inf)."""
pytest.importorskip("torch")
- import torch
from _qdp import QdpEngine
if not torch.cuda.is_available():
@@ -545,7 +524,6 @@ def
test_encode_cuda_tensor_non_finite_values(tensor_factory, description):
def test_encode_cuda_tensor_output_dtype(precision, expected_dtype):
"""Test that CUDA tensor encoding produces correct output dtype."""
pytest.importorskip("torch")
- import torch
from _qdp import QdpEngine
if not torch.cuda.is_available():
@@ -564,7 +542,6 @@ def test_encode_cuda_tensor_output_dtype(precision,
expected_dtype):
def test_basis_encode_basic():
"""Test basic basis encoding (requires GPU)."""
pytest.importorskip("torch")
- import torch
from _qdp import QdpEngine
if not torch.cuda.is_available():
@@ -589,7 +566,6 @@ def test_basis_encode_basic():
def test_basis_encode_nonzero_index():
"""Test basis encoding with non-zero index (requires GPU)."""
pytest.importorskip("torch")
- import torch
from _qdp import QdpEngine
if not torch.cuda.is_available():
@@ -611,7 +587,6 @@ def test_basis_encode_nonzero_index():
def test_basis_encode_3_qubits():
"""Test basis encoding with 3 qubits (requires GPU)."""
pytest.importorskip("torch")
- import torch
from _qdp import QdpEngine
if not torch.cuda.is_available():
@@ -641,7 +616,6 @@ def test_basis_encode_3_qubits():
def test_basis_encode_errors():
"""Test error handling for basis encoding (requires GPU)."""
pytest.importorskip("torch")
- import torch
from _qdp import QdpEngine
if not torch.cuda.is_available():
@@ -675,7 +649,6 @@ def test_basis_encode_errors():
def test_angle_encode_basic():
"""Test basic angle encoding (requires GPU)."""
pytest.importorskip("torch")
- import torch
from _qdp import QdpEngine
if not torch.cuda.is_available():
@@ -699,7 +672,6 @@ def test_angle_encode_basic():
def test_angle_encode_nonzero_angles():
"""Test angle encoding with non-zero angles (requires GPU)."""
pytest.importorskip("torch")
- import torch
from _qdp import QdpEngine
if not torch.cuda.is_available():
@@ -722,7 +694,6 @@ def test_angle_encode_nonzero_angles():
def test_angle_encode_batch():
"""Test batch angle encoding (requires GPU)."""
pytest.importorskip("torch")
- import torch
from _qdp import QdpEngine
if not torch.cuda.is_available():
@@ -753,7 +724,6 @@ def test_angle_encode_batch():
def test_angle_encode_errors():
"""Test error handling for angle encoding (requires GPU)."""
pytest.importorskip("torch")
- import torch
from _qdp import QdpEngine
if not torch.cuda.is_available():
@@ -786,7 +756,6 @@ def test_encode_numpy_array(data_shape, expected_shape):
"""Test encoding from NumPy array (1D or 2D)."""
pytest.importorskip("torch")
import numpy as np
- import torch
from _qdp import QdpEngine
if not torch.cuda.is_available():
@@ -808,7 +777,6 @@ def test_encode_pathlib_path():
"""Test encoding from pathlib.Path object."""
pytest.importorskip("torch")
import numpy as np
- import torch
from pathlib import Path
import tempfile
import os
@@ -849,7 +817,6 @@ def test_iqp_z_encode_basic():
- H^n transforms back to |0⟩^n
"""
pytest.importorskip("torch")
- import torch
from _qdp import QdpEngine
if not torch.cuda.is_available():
@@ -874,7 +841,6 @@ def test_iqp_z_encode_basic():
def test_iqp_z_encode_nonzero():
"""Test IQP-Z encoding with non-zero angles (requires GPU)."""
pytest.importorskip("torch")
- import torch
from _qdp import QdpEngine
if not torch.cuda.is_available():
@@ -900,7 +866,6 @@ def test_iqp_z_encode_nonzero():
def test_iqp_encode_basic():
"""Test basic IQP encoding with ZZ interactions (requires GPU)."""
pytest.importorskip("torch")
- import torch
from _qdp import QdpEngine
if not torch.cuda.is_available():
@@ -926,7 +891,6 @@ def test_iqp_encode_basic():
def test_iqp_encode_zz_effect():
"""Test that ZZ interaction produces different result than Z-only
(requires GPU)."""
pytest.importorskip("torch")
- import torch
from _qdp import QdpEngine
if not torch.cuda.is_available():
@@ -959,7 +923,6 @@ def test_iqp_encode_zz_effect():
def test_iqp_encode_3_qubits():
"""Test IQP encoding with 3 qubits (requires GPU)."""
pytest.importorskip("torch")
- import torch
from _qdp import QdpEngine
if not torch.cuda.is_available():
@@ -985,7 +948,6 @@ def test_iqp_encode_3_qubits():
def test_iqp_z_encode_batch():
"""Test batch IQP-Z encoding (requires GPU)."""
pytest.importorskip("torch")
- import torch
from _qdp import QdpEngine
if not torch.cuda.is_available():
@@ -1014,7 +976,6 @@ def test_iqp_z_encode_batch():
def test_iqp_encode_batch():
"""Test batch IQP encoding with ZZ interactions (requires GPU)."""
pytest.importorskip("torch")
- import torch
from _qdp import QdpEngine
if not torch.cuda.is_available():
@@ -1046,7 +1007,6 @@ def test_iqp_encode_batch():
def test_iqp_encode_single_qubit():
"""Test IQP encoding with single qubit edge case (requires GPU)."""
pytest.importorskip("torch")
- import torch
from _qdp import QdpEngine
if not torch.cuda.is_available():
@@ -1075,7 +1035,6 @@ def test_iqp_encode_single_qubit():
def test_iqp_encode_errors():
"""Test error handling for IQP encoding (requires GPU)."""
pytest.importorskip("torch")
- import torch
from _qdp import QdpEngine
if not torch.cuda.is_available():
@@ -1111,7 +1070,6 @@ def test_iqp_encode_errors():
def test_iqp_fwt_normalization():
"""Test that FWT-optimized IQP produces normalized states (requires
GPU)."""
pytest.importorskip("torch")
- import torch
from _qdp import QdpEngine
if not torch.cuda.is_available():
@@ -1139,7 +1097,6 @@ def test_iqp_fwt_normalization():
def test_iqp_z_fwt_normalization():
"""Test that FWT-optimized IQP-Z produces normalized states (requires
GPU)."""
pytest.importorskip("torch")
- import torch
from _qdp import QdpEngine
if not torch.cuda.is_available():
@@ -1168,7 +1125,6 @@ def test_iqp_fwt_zero_params_gives_zero_state():
so |0⟩^n maps to |0⟩^n with amplitude 1 at index 0.
"""
pytest.importorskip("torch")
- import torch
from _qdp import QdpEngine
if not torch.cuda.is_available():
@@ -1200,7 +1156,6 @@ def test_iqp_fwt_zero_params_gives_zero_state():
def test_iqp_fwt_batch_normalization():
"""Test that FWT-optimized batch IQP produces normalized states (requires
GPU)."""
pytest.importorskip("torch")
- import torch
from _qdp import QdpEngine
if not torch.cuda.is_available():
@@ -1238,7 +1193,6 @@ def test_iqp_fwt_batch_normalization():
def test_iqp_fwt_deterministic():
"""Test that FWT-optimized IQP is deterministic (requires GPU)."""
pytest.importorskip("torch")
- import torch
from _qdp import QdpEngine
if not torch.cuda.is_available():
@@ -1272,7 +1226,6 @@ def test_iqp_fwt_shared_vs_global_memory_threshold():
- n > 10: uses global memory FWT
"""
pytest.importorskip("torch")
- import torch
from _qdp import QdpEngine
if not torch.cuda.is_available():