This is an automated email from the ASF dual-hosted git repository.
richhuang pushed a commit to branch dev-qdp
in repository https://gitbox.apache.org/repos/asf/mahout.git
The following commit(s) were added to refs/heads/dev-qdp by this push:
new f47b93920 [QDP] Set float32 to Default and leave float64 as Optional
(#712)
f47b93920 is described below
commit f47b93920ae15a1051c9ee79368d10434f6bd244
Author: Ping <[email protected]>
AuthorDate: Tue Dec 16 11:31:01 2025 +0800
[QDP] Set float32 to Default and leave float64 as Optional (#712)
* Set float32 to default
Signed-off-by: 400Ping <[email protected]>
* fix errors
Signed-off-by: 400Ping <[email protected]>
* fix python bindings
Signed-off-by: 400Ping <[email protected]>
* [chore] update log
Signed-off-by: 400Ping <[email protected]>
---------
Signed-off-by: 400Ping <[email protected]>
---
qdp/qdp-core/src/dlpack.rs | 15 ++-
qdp/qdp-core/src/gpu/encodings/amplitude.rs | 23 ++++-
qdp/qdp-core/src/gpu/memory.rs | 142 +++++++++++++++++++++++++---
qdp/qdp-core/src/lib.rs | 13 ++-
qdp/qdp-kernels/src/amplitude.cu | 36 +++++++
qdp/qdp-kernels/src/lib.rs | 39 ++++++++
qdp/qdp-python/README.md | 5 +-
qdp/qdp-python/src/lib.rs | 20 +++-
qdp/qdp-python/tests/test_bindings.py | 17 +++-
qdp/qdp-python/tests/test_high_fidelity.py | 20 +++-
10 files changed, 296 insertions(+), 34 deletions(-)
diff --git a/qdp/qdp-core/src/dlpack.rs b/qdp/qdp-core/src/dlpack.rs
index 42d7eaf62..883d19b37 100644
--- a/qdp/qdp-core/src/dlpack.rs
+++ b/qdp/qdp-core/src/dlpack.rs
@@ -18,7 +18,7 @@
use std::os::raw::{c_int, c_void};
use std::sync::Arc;
-use crate::gpu::memory::GpuStateVector;
+use crate::gpu::memory::{BufferStorage, GpuStateVector, Precision};
// DLPack C structures (matching dlpack/dlpack.h)
@@ -104,7 +104,7 @@ pub unsafe extern "C" fn dlpack_deleter(managed: *mut
DLManagedTensor) {
// 3. Free GPU buffer (Arc reference count)
let ctx = (*managed).manager_ctx;
if !ctx.is_null() {
- let _ = Arc::from_raw(ctx as *const crate::gpu::memory::GpuBufferRaw);
+ let _ = Arc::from_raw(ctx as *const BufferStorage);
}
// 4. Free DLManagedTensor
@@ -131,16 +131,21 @@ impl GpuStateVector {
// Increment Arc ref count (decremented in deleter)
let ctx = Arc::into_raw(self.buffer.clone()) as *mut c_void;
+ let dtype_bits = match self.precision() {
+ Precision::Float32 => 64, // complex64 (2x float32)
+ Precision::Float64 => 128, // complex128 (2x float64)
+ };
+
let tensor = DLTensor {
- data: self.ptr() as *mut c_void,
+ data: self.ptr_void(),
device: DLDevice {
device_type: DLDeviceType::kDLCUDA,
device_id: 0,
},
ndim: 1,
dtype: DLDataType {
- code: DL_COMPLEX, // Complex128
- bits: 128, // 2 * 64-bit floats
+ code: DL_COMPLEX,
+ bits: dtype_bits,
lanes: 1,
},
shape: shape_ptr,
diff --git a/qdp/qdp-core/src/gpu/encodings/amplitude.rs
b/qdp/qdp-core/src/gpu/encodings/amplitude.rs
index 844f6d1e8..715f0984e 100644
--- a/qdp/qdp-core/src/gpu/encodings/amplitude.rs
+++ b/qdp/qdp-core/src/gpu/encodings/amplitude.rs
@@ -99,12 +99,20 @@ impl QuantumEncoder for AmplitudeEncoder {
1.0 / norm
};
+ let state_ptr = state_vector.ptr_f64().ok_or_else(|| {
+ let actual = state_vector.precision();
+ MahoutError::InvalidInput(format!(
+ "State vector precision mismatch (expected float64
buffer, got {:?})",
+ actual
+ ))
+ })?;
+
let ret = {
crate::profile_scope!("GPU::KernelLaunch");
unsafe {
launch_amplitude_encode(
*input_slice.device_ptr() as *const f64,
- state_vector.ptr() as *mut c_void,
+ state_ptr as *mut c_void,
host_data.len(),
state_len,
inv_norm,
@@ -227,10 +235,13 @@ impl QuantumEncoder for AmplitudeEncoder {
// Launch batch kernel
{
crate::profile_scope!("GPU::BatchKernelLaunch");
+ let state_ptr = batch_state_vector.ptr_f64().ok_or_else(||
MahoutError::InvalidInput(
+ "Batch state vector precision mismatch (expected float64
buffer)".to_string()
+ ))?;
let ret = unsafe {
launch_amplitude_encode_batch(
*input_batch_gpu.device_ptr() as *const f64,
- batch_state_vector.ptr() as *mut c_void,
+ state_ptr as *mut c_void,
*inv_norms_gpu.device_ptr() as *const f64,
num_samples,
sample_size,
@@ -283,13 +294,17 @@ impl AmplitudeEncoder {
inv_norm: f64,
state_vector: &GpuStateVector,
) -> Result<()> {
+ let base_state_ptr = state_vector.ptr_f64().ok_or_else(||
MahoutError::InvalidInput(
+ "State vector precision mismatch (expected float64
buffer)".to_string()
+ ))?;
+
// Use generic pipeline infrastructure
// The closure handles amplitude-specific kernel launch logic
run_dual_stream_pipeline(device, host_data, |stream, input_ptr,
chunk_offset, chunk_len| {
// Calculate offset pointer for state vector (type-safe pointer
arithmetic)
// Offset is in complex numbers (CuDoubleComplex), not f64 elements
let state_ptr_offset = unsafe {
- state_vector.ptr().cast::<u8>()
+ base_state_ptr.cast::<u8>()
.add(chunk_offset *
std::mem::size_of::<qdp_kernels::CuDoubleComplex>())
.cast::<std::ffi::c_void>()
};
@@ -338,7 +353,7 @@ impl AmplitudeEncoder {
// Calculate tail pointer (in complex numbers)
let tail_ptr = unsafe {
- state_vector.ptr().add(padding_start) as *mut c_void
+ base_state_ptr.add(padding_start) as *mut c_void
};
// Zero-fill padding region using CUDA Runtime API
diff --git a/qdp/qdp-core/src/gpu/memory.rs b/qdp/qdp-core/src/gpu/memory.rs
index a333e103d..72bf5bc7f 100644
--- a/qdp/qdp-core/src/gpu/memory.rs
+++ b/qdp/qdp-core/src/gpu/memory.rs
@@ -13,12 +13,19 @@
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
-
+use std::ffi::c_void;
use std::sync::Arc;
use cudarc::driver::{CudaDevice, CudaSlice, DevicePtr};
-use qdp_kernels::CuDoubleComplex;
+use qdp_kernels::{CuComplex, CuDoubleComplex};
use crate::error::{MahoutError, Result};
+/// Precision of the GPU state vector.
+#[derive(Clone, Copy, Debug, PartialEq, Eq)]
+pub enum Precision {
+ Float32,
+ Float64,
+}
+
#[cfg(target_os = "linux")]
fn bytes_to_mib(bytes: usize) -> f64 {
bytes as f64 / (1024.0 * 1024.0)
@@ -130,28 +137,58 @@ pub(crate) fn map_allocation_error(
/// RAII wrapper for GPU memory buffer
/// Automatically frees GPU memory when dropped
-pub struct GpuBufferRaw {
- pub(crate) slice: CudaSlice<CuDoubleComplex>,
+pub struct GpuBufferRaw<T> {
+ pub(crate) slice: CudaSlice<T>,
}
-impl GpuBufferRaw {
+impl<T> GpuBufferRaw<T> {
/// Get raw pointer to GPU memory
///
/// # Safety
/// Valid only while GpuBufferRaw is alive
- pub fn ptr(&self) -> *mut CuDoubleComplex {
- *self.slice.device_ptr() as *mut CuDoubleComplex
+ pub fn ptr(&self) -> *mut T {
+ *self.slice.device_ptr() as *mut T
+ }
+}
+
+/// Storage wrapper for precision-specific GPU buffers
+pub enum BufferStorage {
+ F32(GpuBufferRaw<CuComplex>),
+ F64(GpuBufferRaw<CuDoubleComplex>),
+}
+
+impl BufferStorage {
+ fn precision(&self) -> Precision {
+ match self {
+ BufferStorage::F32(_) => Precision::Float32,
+ BufferStorage::F64(_) => Precision::Float64,
+ }
+ }
+
+ fn ptr_void(&self) -> *mut c_void {
+ match self {
+ BufferStorage::F32(buf) => buf.ptr() as *mut c_void,
+ BufferStorage::F64(buf) => buf.ptr() as *mut c_void,
+ }
+ }
+
+ fn ptr_f64(&self) -> Option<*mut CuDoubleComplex> {
+ match self {
+ BufferStorage::F64(buf) => Some(buf.ptr()),
+ _ => None,
+ }
}
}
/// Quantum state vector on GPU
///
-/// Manages complex128 array of size 2^n (n = qubits) in GPU memory.
+/// Manages complex array of size 2^n (n = qubits) in GPU memory.
/// Uses Arc for shared ownership (needed for DLPack/PyTorch integration).
/// Thread-safe: Send + Sync
+#[derive(Clone)]
pub struct GpuStateVector {
// Use Arc to allow DLPack to share ownership
- pub(crate) buffer: Arc<GpuBufferRaw>,
+ pub(crate) buffer: Arc<BufferStorage>,
pub num_qubits: usize,
pub size_elements: usize,
}
@@ -190,7 +227,7 @@ impl GpuStateVector {
))?;
Ok(Self {
- buffer: Arc::new(GpuBufferRaw { slice }),
+ buffer: Arc::new(BufferStorage::F64(GpuBufferRaw { slice })),
num_qubits: qubits,
size_elements: _size_elements,
})
@@ -203,12 +240,22 @@ impl GpuStateVector {
}
}
+ /// Get current precision of the underlying buffer.
+ pub fn precision(&self) -> Precision {
+ self.buffer.precision()
+ }
+
/// Get raw GPU pointer for DLPack/FFI
///
/// # Safety
/// Valid while GpuStateVector or any Arc clone is alive
- pub fn ptr(&self) -> *mut CuDoubleComplex {
- self.buffer.ptr()
+ pub fn ptr_void(&self) -> *mut c_void {
+ self.buffer.ptr_void()
+ }
+
+ /// Returns a double-precision pointer if the buffer stores complex128
data.
+ pub fn ptr_f64(&self) -> Option<*mut CuDoubleComplex> {
+ self.buffer.ptr_f64()
}
/// Get the number of qubits
@@ -251,7 +298,7 @@ impl GpuStateVector {
))?;
Ok(Self {
- buffer: Arc::new(GpuBufferRaw { slice }),
+ buffer: Arc::new(BufferStorage::F64(GpuBufferRaw { slice })),
num_qubits: qubits,
size_elements: total_elements,
})
@@ -262,4 +309,73 @@ impl GpuStateVector {
Err(MahoutError::Cuda("CUDA is only available on Linux. This build
does not support GPU operations.".to_string()))
}
}
+
+ /// Convert the state vector to the requested precision (GPU-side).
+ ///
+ /// For now only down-conversion from Float64 -> Float32 is supported.
+ pub fn to_precision(&self, device: &Arc<CudaDevice>, target: Precision) ->
Result<Self> {
+ if self.precision() == target {
+ return Ok(self.clone());
+ }
+
+ match (self.precision(), target) {
+ (Precision::Float64, Precision::Float32) => {
+ #[cfg(target_os = "linux")]
+ {
+ let requested_bytes = self.size_elements
+ .checked_mul(std::mem::size_of::<CuComplex>())
+ .ok_or_else(|| MahoutError::MemoryAllocation(
+ format!("Requested GPU allocation size overflow
(elements={})", self.size_elements)
+ ))?;
+
+ ensure_device_memory_available(requested_bytes, "state
vector precision conversion", Some(self.num_qubits))?;
+
+ let slice = unsafe {
+ device.alloc::<CuComplex>(self.size_elements)
+ }.map_err(|e| map_allocation_error(
+ requested_bytes,
+ "state vector precision conversion",
+ Some(self.num_qubits),
+ e,
+ ))?;
+
+ let src_ptr = self.ptr_f64().ok_or_else(||
MahoutError::InvalidInput(
+ "Source state vector is not Float64; cannot convert to
Float32".to_string()
+ ))?;
+
+ let ret = unsafe {
+ qdp_kernels::convert_state_to_float(
+ src_ptr as *const CuDoubleComplex,
+ *slice.device_ptr() as *mut CuComplex,
+ self.size_elements,
+ std::ptr::null_mut(),
+ )
+ };
+
+ if ret != 0 {
+ return Err(MahoutError::KernelLaunch(
+ format!("Precision conversion kernel failed: {}",
ret)
+ ));
+ }
+
+ device.synchronize()
+ .map_err(|e| MahoutError::Cuda(format!("Failed to sync
after precision conversion: {:?}", e)))?;
+
+ Ok(Self {
+ buffer: Arc::new(BufferStorage::F32(GpuBufferRaw {
slice })),
+ num_qubits: self.num_qubits,
+ size_elements: self.size_elements,
+ })
+ }
+
+ #[cfg(not(target_os = "linux"))]
+ {
+ Err(MahoutError::Cuda("Precision conversion requires CUDA
(Linux)".to_string()))
+ }
+ }
+ _ => Err(MahoutError::NotImplemented(
+ "Requested precision conversion is not supported".to_string()
+ )),
+ }
+ }
}
diff --git a/qdp/qdp-core/src/lib.rs b/qdp/qdp-core/src/lib.rs
index 2f8f42092..e14e2bab0 100644
--- a/qdp/qdp-core/src/lib.rs
+++ b/qdp/qdp-core/src/lib.rs
@@ -19,11 +19,11 @@ pub mod gpu;
pub mod error;
pub mod preprocessing;
pub mod io;
-
#[macro_use]
mod profiling;
pub use error::{MahoutError, Result};
+pub use gpu::memory::Precision;
use std::sync::Arc;
@@ -37,6 +37,7 @@ use crate::gpu::get_encoder;
/// Provides unified interface for device management, memory allocation, and
DLPack.
pub struct QdpEngine {
device: Arc<CudaDevice>,
+ precision: Precision,
}
impl QdpEngine {
@@ -45,10 +46,16 @@ impl QdpEngine {
/// # Arguments
/// * `device_id` - CUDA device ID (typically 0)
pub fn new(device_id: usize) -> Result<Self> {
+ Self::new_with_precision(device_id, Precision::Float32)
+ }
+
+ /// Initialize engine with explicit precision.
+ pub fn new_with_precision(device_id: usize, precision: Precision) ->
Result<Self> {
let device = CudaDevice::new(device_id)
.map_err(|e| MahoutError::Cuda(format!("Failed to initialize CUDA
device {}: {:?}", device_id, e)))?;
Ok(Self {
- device // CudaDevice::new already returns Arc<CudaDevice> in
cudarc 0.11
+ device, // CudaDevice::new already returns Arc<CudaDevice> in
cudarc 0.11
+ precision,
})
}
@@ -76,6 +83,7 @@ impl QdpEngine {
let encoder = get_encoder(encoding_method)?;
let state_vector = encoder.encode(&self.device, data, num_qubits)?;
+ let state_vector = state_vector.to_precision(&self.device,
self.precision)?;
let dlpack_ptr = {
crate::profile_scope!("DLPack::Wrap");
state_vector.to_dlpack()
@@ -121,6 +129,7 @@ impl QdpEngine {
num_qubits,
)?;
+ let state_vector = state_vector.to_precision(&self.device,
self.precision)?;
let dlpack_ptr = state_vector.to_dlpack();
Ok(dlpack_ptr)
}
diff --git a/qdp/qdp-kernels/src/amplitude.cu b/qdp/qdp-kernels/src/amplitude.cu
index 0e679e2b9..ea5fc27f7 100644
--- a/qdp/qdp-kernels/src/amplitude.cu
+++ b/qdp/qdp-kernels/src/amplitude.cu
@@ -454,6 +454,42 @@ int launch_l2_norm_batch(
return (int)cudaGetLastError();
}
+/// Kernel: convert complex128 state vector to complex64.
+__global__ void convert_state_to_complex64_kernel(
+ const cuDoubleComplex* __restrict__ input_state,
+ cuComplex* __restrict__ output_state,
+ size_t len
+) {
+ const size_t idx = blockIdx.x * blockDim.x + threadIdx.x;
+ if (idx >= len) return;
+
+ const cuDoubleComplex v = input_state[idx];
+ output_state[idx] = make_cuComplex((float)v.x, (float)v.y);
+}
+
+/// Launch conversion kernel from complex128 to complex64.
+int convert_state_to_float(
+ const cuDoubleComplex* input_state_d,
+ cuComplex* output_state_d,
+ size_t len,
+ cudaStream_t stream
+) {
+ if (len == 0) {
+ return cudaErrorInvalidValue;
+ }
+
+ const int blockSize = 256;
+ const int gridSize = (int)((len + blockSize - 1) / blockSize);
+
+ convert_state_to_complex64_kernel<<<gridSize, blockSize, 0, stream>>>(
+ input_state_d,
+ output_state_d,
+ len
+ );
+
+ return (int)cudaGetLastError();
+}
+
// TODO: Future encoding methods:
// - launch_angle_encode (angle encoding)
// - launch_basis_encode (basis encoding)
diff --git a/qdp/qdp-kernels/src/lib.rs b/qdp/qdp-kernels/src/lib.rs
index f2ce2ad29..bae8782ef 100644
--- a/qdp/qdp-kernels/src/lib.rs
+++ b/qdp/qdp-kernels/src/lib.rs
@@ -36,6 +36,22 @@ unsafe impl cudarc::driver::DeviceRepr for CuDoubleComplex {}
#[cfg(target_os = "linux")]
unsafe impl cudarc::driver::ValidAsZeroBits for CuDoubleComplex {}
+// Complex number (matches CUDA's cuComplex / cuFloatComplex)
+#[repr(C)]
+#[derive(Debug, Clone, Copy)]
+pub struct CuComplex {
+ pub x: f32, // Real part
+ pub y: f32, // Imaginary part
+}
+
+// Implement DeviceRepr for cudarc compatibility
+#[cfg(target_os = "linux")]
+unsafe impl cudarc::driver::DeviceRepr for CuComplex {}
+
+// Also implement ValidAsZeroBits for alloc_zeros support
+#[cfg(target_os = "linux")]
+unsafe impl cudarc::driver::ValidAsZeroBits for CuComplex {}
+
// CUDA kernel FFI (Linux only, dummy on other platforms)
#[cfg(target_os = "linux")]
unsafe extern "C" {
@@ -93,6 +109,18 @@ unsafe extern "C" {
stream: *mut c_void,
) -> i32;
+ /// Convert a complex128 state vector to complex64 on GPU.
+ /// Returns CUDA error code (0 = success).
+ ///
+ /// # Safety
+ /// Pointers must reference valid device memory on the provided stream.
+ pub fn convert_state_to_float(
+ input_state_d: *const CuDoubleComplex,
+ output_state_d: *mut CuComplex,
+ len: usize,
+ stream: *mut c_void,
+ ) -> i32;
+
// TODO: launch_angle_encode, launch_basis_encode
}
@@ -132,3 +160,14 @@ pub extern "C" fn launch_l2_norm_batch(
) -> i32 {
999
}
+
+#[cfg(not(target_os = "linux"))]
+#[unsafe(no_mangle)]
+pub extern "C" fn convert_state_to_float(
+ _input_state_d: *const CuDoubleComplex,
+ _output_state_d: *mut CuComplex,
+ _len: usize,
+ _stream: *mut c_void,
+) -> i32 {
+ 999
+}
diff --git a/qdp/qdp-python/README.md b/qdp/qdp-python/README.md
index aed316952..98d2b0106 100644
--- a/qdp/qdp-python/README.md
+++ b/qdp/qdp-python/README.md
@@ -7,9 +7,12 @@ PyO3 Python bindings for Apache Mahout QDP.
```python
from mahout_qdp import QdpEngine
-# Initialize on GPU 0
+# Initialize on GPU 0 (defaults to float32 output)
engine = QdpEngine(0)
+# Optional: request float64 output if you need higher precision
+# engine = QdpEngine(0, precision="float64")
+
# Encode data
data = [0.5, 0.5, 0.5, 0.5]
dlpack_ptr = engine.encode(data, num_qubits=2, encoding_method="amplitude")
diff --git a/qdp/qdp-python/src/lib.rs b/qdp/qdp-python/src/lib.rs
index 340aae814..04f3c5367 100644
--- a/qdp/qdp-python/src/lib.rs
+++ b/qdp/qdp-python/src/lib.rs
@@ -17,7 +17,7 @@
use pyo3::prelude::*;
use pyo3::exceptions::PyRuntimeError;
use pyo3::ffi;
-use qdp_core::QdpEngine as CoreEngine;
+use qdp_core::{Precision, QdpEngine as CoreEngine};
use qdp_core::dlpack::DLManagedTensor;
/// Quantum tensor wrapper implementing DLPack protocol
@@ -139,6 +139,7 @@ impl QdpEngine {
///
/// Args:
/// device_id: CUDA device ID (typically 0)
+ /// precision: Output precision ("float32" default, or "float64")
///
/// Returns:
/// QdpEngine instance
@@ -146,9 +147,20 @@ impl QdpEngine {
/// Raises:
/// RuntimeError: If CUDA device initialization fails
#[new]
- #[pyo3(signature = (device_id=0))]
- fn new(device_id: usize) -> PyResult<Self> {
- let engine = CoreEngine::new(device_id)
+ #[pyo3(signature = (device_id=0, precision="float32"))]
+ fn new(device_id: usize, precision: &str) -> PyResult<Self> {
+ let precision = match precision.to_ascii_lowercase().as_str() {
+ "float32" | "f32" | "float" => Precision::Float32,
+ "float64" | "f64" | "double" => Precision::Float64,
+ other => {
+ return Err(PyRuntimeError::new_err(format!(
+ "Unsupported precision '{}'. Use 'float32' (default) or
'float64'.",
+ other
+ )))
+ }
+ };
+
+ let engine = CoreEngine::new_with_precision(device_id, precision)
.map_err(|e| PyRuntimeError::new_err(format!("Failed to
initialize: {}", e)))?;
Ok(Self { engine })
}
diff --git a/qdp/qdp-python/tests/test_bindings.py
b/qdp/qdp-python/tests/test_bindings.py
index 0f7866299..1fc586f78 100644
--- a/qdp/qdp-python/tests/test_bindings.py
+++ b/qdp/qdp-python/tests/test_bindings.py
@@ -85,7 +85,22 @@ def test_pytorch_integration():
torch_tensor = torch.from_dlpack(qtensor)
assert torch_tensor.is_cuda
assert torch_tensor.device.index == 0
- assert torch_tensor.dtype == torch.complex128
+ assert torch_tensor.dtype == torch.complex64
# Verify shape (2 qubits = 2^2 = 4 elements)
assert torch_tensor.shape == (4,)
+
+
[email protected]
+def test_pytorch_precision_float64():
+ """Verify optional float64 precision produces complex128 tensors."""
+ pytest.importorskip("torch")
+ import torch
+ from mahout_qdp import QdpEngine
+
+ engine = QdpEngine(0, precision="float64")
+ data = [1.0, 2.0, 3.0, 4.0]
+ qtensor = engine.encode(data, 2, "amplitude")
+
+ torch_tensor = torch.from_dlpack(qtensor)
+ assert torch_tensor.dtype == torch.complex128
diff --git a/qdp/qdp-python/tests/test_high_fidelity.py
b/qdp/qdp-python/tests/test_high_fidelity.py
index 05bc41987..24f11c513 100644
--- a/qdp/qdp-python/tests/test_high_fidelity.py
+++ b/qdp/qdp-python/tests/test_high_fidelity.py
@@ -58,6 +58,15 @@ def engine():
pytest.skip(f"CUDA initialization failed: {e}")
[email protected](scope="module")
+def engine_float64():
+ """High-precision engine for fidelity-sensitive tests."""
+ try:
+ return QdpEngine(0, precision="float64")
+ except RuntimeError as e:
+ pytest.skip(f"CUDA initialization failed: {e}")
+
+
# 1. Core Logic and Boundary Tests
@@ -73,7 +82,9 @@ def engine():
(20, 1_000_000, "Large - Async Pipeline"),
],
)
-def test_amplitude_encoding_fidelity_comprehensive(engine, num_qubits,
data_size, desc):
+def test_amplitude_encoding_fidelity_comprehensive(
+ engine_float64, num_qubits, data_size, desc
+):
"""Test fidelity across sync path, async pipeline, and chunk boundaries."""
print(f"\n[Test Case] {desc} (Size: {data_size})")
@@ -87,7 +98,7 @@ def test_amplitude_encoding_fidelity_comprehensive(engine,
num_qubits, data_size
expected_state = np.concatenate([expected_state, padding])
expected_state_complex = expected_state.astype(np.complex128)
- qtensor = engine.encode(raw_data.tolist(), num_qubits, "amplitude")
+ qtensor = engine_float64.encode(raw_data.tolist(), num_qubits, "amplitude")
torch_state = torch.from_dlpack(qtensor)
assert torch_state.is_cuda, "Tensor must be on GPU"
@@ -110,6 +121,7 @@ def test_complex_integrity(engine):
qtensor = engine.encode(raw_data.tolist(), num_qubits, "amplitude")
torch_state = torch.from_dlpack(qtensor)
+ assert torch_state.dtype == torch.complex64
imag_error = torch.sum(torch.abs(torch_state.imag)).item()
print(f"\nSum of imaginary parts (should be near 0): {imag_error}")
@@ -123,12 +135,12 @@ def test_complex_integrity(engine):
@pytest.mark.gpu
-def test_numerical_stability_underflow(engine):
+def test_numerical_stability_underflow(engine_float64):
"""Test precision with extremely small values (1e-150)."""
num_qubits = 4
data = [1e-150] * 16
- qtensor = engine.encode(data, num_qubits, "amplitude")
+ qtensor = engine_float64.encode(data, num_qubits, "amplitude")
torch_state = torch.from_dlpack(qtensor)
assert not torch.isnan(torch_state).any(), "Result contains NaN for small
inputs"