This is an automated email from the ASF dual-hosted git repository.
guanmingchiu pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/mahout.git
The following commit(s) were added to refs/heads/main by this push:
new 52bc028e7 [QDP] Add zero-copy amplitude batch encoding from float32
GPU tensors (#1029)
52bc028e7 is described below
commit 52bc028e778c08cccc148c63508c59cc46ace19f
Author: Vic Wen <[email protected]>
AuthorDate: Wed Mar 4 15:40:10 2026 +0800
[QDP] Add zero-copy amplitude batch encoding from float32 GPU tensors
(#1029)
* Add batch float32 amplitude encoding
* Add batch float32 amplitude encoding tests
---------
Co-authored-by: Ryan Huang <[email protected]>
---
qdp/qdp-core/src/encoding/mod.rs | 7 +-
qdp/qdp-core/src/gpu/encodings/amplitude.rs | 121 ++++++++++++++++-
qdp/qdp-core/src/gpu/encodings/angle.rs | 6 +-
qdp/qdp-core/src/gpu/encodings/basis.rs | 4 +-
qdp/qdp-core/src/gpu/encodings/iqp.rs | 2 +-
qdp/qdp-core/src/gpu/memory.rs | 76 +++++++----
qdp/qdp-core/src/lib.rs | 72 ++++++++++
qdp/qdp-core/tests/dlpack.rs | 40 +++++-
qdp/qdp-core/tests/gpu_ptr_encoding.rs | 198 ++++++++++++++++++++++++++++
qdp/qdp-kernels/src/amplitude.cu | 98 ++++++++++++++
qdp/qdp-kernels/src/lib.rs | 29 ++++
11 files changed, 613 insertions(+), 40 deletions(-)
diff --git a/qdp/qdp-core/src/encoding/mod.rs b/qdp/qdp-core/src/encoding/mod.rs
index a06548a9a..851ed6502 100644
--- a/qdp/qdp-core/src/encoding/mod.rs
+++ b/qdp/qdp-core/src/encoding/mod.rs
@@ -146,7 +146,12 @@ pub(crate) fn stream_encode<E: ChunkEncoder>(
let num_samples = reader_core.total_rows;
// Allocate output state vector
- let total_state_vector = GpuStateVector::new_batch(&engine.device,
num_samples, num_qubits)?;
+ let total_state_vector = GpuStateVector::new_batch(
+ &engine.device,
+ num_samples,
+ num_qubits,
+ crate::Precision::Float64,
+ )?;
const PIPELINE_EVENT_SLOTS: usize = 2;
let ctx = PipelineContext::new(&engine.device, PIPELINE_EVENT_SLOTS)?;
diff --git a/qdp/qdp-core/src/gpu/encodings/amplitude.rs
b/qdp/qdp-core/src/gpu/encodings/amplitude.rs
index fc57d189e..1be318cfe 100644
--- a/qdp/qdp-core/src/gpu/encodings/amplitude.rs
+++ b/qdp/qdp-core/src/gpu/encodings/amplitude.rs
@@ -40,8 +40,8 @@ use crate::gpu::memory::{ensure_device_memory_available,
map_allocation_error};
use cudarc::driver::{DevicePtr, DevicePtrMut};
#[cfg(target_os = "linux")]
use qdp_kernels::{
- launch_amplitude_encode, launch_amplitude_encode_batch, launch_l2_norm,
launch_l2_norm_batch,
- launch_l2_norm_f32,
+ launch_amplitude_encode, launch_amplitude_encode_batch,
launch_amplitude_encode_batch_f32,
+ launch_l2_norm, launch_l2_norm_batch, launch_l2_norm_batch_f32,
launch_l2_norm_f32,
};
#[cfg(target_os = "linux")]
use std::ffi::c_void;
@@ -206,7 +206,7 @@ impl QuantumEncoder for AmplitudeEncoder {
// Allocate single large GPU buffer for all states
let batch_state_vector = {
crate::profile_scope!("GPU::AllocBatch");
- GpuStateVector::new_batch(device, num_samples, num_qubits)?
+ GpuStateVector::new_batch(device, num_samples, num_qubits,
Precision::Float64)?
};
// Upload input data to GPU
@@ -386,7 +386,7 @@ impl QuantumEncoder for AmplitudeEncoder {
let input_batch_d = input_batch_d as *const f64;
let batch_state_vector = {
crate::profile_scope!("GPU::AllocBatch");
- GpuStateVector::new_batch(device, num_samples, num_qubits)?
+ GpuStateVector::new_batch(device, num_samples, num_qubits,
Precision::Float64)?
};
let inv_norms_gpu = {
crate::profile_scope!("GPU::BatchNormKernel");
@@ -579,6 +579,119 @@ impl AmplitudeEncoder {
}
impl AmplitudeEncoder {
+ /// Encode a batch directly from a GPU float32 pointer.
+ ///
+ /// # Safety
+ /// The caller must ensure `input_batch_d` points to valid GPU memory
containing
+ /// at least `num_samples * sample_size` f32 elements on the same device
as `device`.
+ #[cfg(target_os = "linux")]
+ pub unsafe fn encode_batch_from_gpu_ptr_f32_with_stream(
+ device: &Arc<CudaDevice>,
+ input_batch_d: *const f32,
+ num_samples: usize,
+ sample_size: usize,
+ num_qubits: usize,
+ stream: *mut c_void,
+ ) -> Result<GpuStateVector> {
+ let state_len = 1 << num_qubits;
+ if num_samples == 0 {
+ return Err(MahoutError::InvalidInput(
+ "Number of samples cannot be zero".into(),
+ ));
+ }
+ if sample_size == 0 {
+ return Err(MahoutError::InvalidInput(
+ "Sample size cannot be zero".into(),
+ ));
+ }
+ if sample_size > state_len {
+ return Err(MahoutError::InvalidInput(format!(
+ "Sample size {} exceeds state vector size {} (2^{} qubits)",
+ sample_size, state_len, num_qubits
+ )));
+ }
+
+ let batch_state_vector =
+ GpuStateVector::new_batch(device, num_samples, num_qubits,
Precision::Float32)?;
+
+ let inv_norms_gpu = {
+ crate::profile_scope!("GPU::BatchNormKernelF32");
+ use cudarc::driver::DevicePtrMut;
+
+ let mut buffer =
device.alloc_zeros::<f32>(num_samples).map_err(|e| {
+ MahoutError::MemoryAllocation(format!(
+ "Failed to allocate f32 norm buffer: {:?}",
+ e
+ ))
+ })?;
+ let ret = unsafe {
+ launch_l2_norm_batch_f32(
+ input_batch_d,
+ num_samples,
+ sample_size,
+ *buffer.device_ptr_mut() as *mut f32,
+ stream,
+ )
+ };
+ if ret != 0 {
+ return Err(MahoutError::KernelLaunch(format!(
+ "Norm reduction kernel f32 failed with CUDA error code: {}
({})",
+ ret,
+ cuda_error_to_string(ret)
+ )));
+ }
+ buffer
+ };
+
+ {
+ crate::profile_scope!("GPU::NormValidationF32");
+ let host_inv_norms =
device.dtoh_sync_copy(&inv_norms_gpu).map_err(|e| {
+ MahoutError::Cuda(format!("Failed to copy f32 norms to host:
{:?}", e))
+ })?;
+ if host_inv_norms.iter().any(|v| !v.is_finite() || *v == 0.0) {
+ return Err(MahoutError::InvalidInput(
+ "One or more float32 samples have zero or invalid
norm".to_string(),
+ ));
+ }
+ }
+
+ {
+ crate::profile_scope!("GPU::BatchKernelLaunchF32");
+ use cudarc::driver::DevicePtr;
+
+ let state_ptr = batch_state_vector.ptr_f32().ok_or_else(|| {
+ MahoutError::InvalidInput(
+ "Batch state vector precision mismatch (expected float32
buffer)".to_string(),
+ )
+ })?;
+ let ret = unsafe {
+ launch_amplitude_encode_batch_f32(
+ input_batch_d,
+ state_ptr as *mut c_void,
+ *inv_norms_gpu.device_ptr() as *const f32,
+ num_samples,
+ sample_size,
+ state_len,
+ stream,
+ )
+ };
+ if ret != 0 {
+ return Err(MahoutError::KernelLaunch(format!(
+ "Batch kernel f32 launch failed with CUDA error code: {}
({})",
+ ret,
+ cuda_error_to_string(ret)
+ )));
+ }
+ }
+
+ {
+ crate::profile_scope!("GPU::Synchronize");
+ sync_cuda_stream(stream, "CUDA stream synchronize failed")?;
+ }
+
+ Ok(batch_state_vector)
+ }
+
/// Compute inverse L2 norm on GPU using the reduction kernel.
///
/// # Arguments
diff --git a/qdp/qdp-core/src/gpu/encodings/angle.rs
b/qdp/qdp-core/src/gpu/encodings/angle.rs
index 1c3e5b8f5..24ed50a6d 100644
--- a/qdp/qdp-core/src/gpu/encodings/angle.rs
+++ b/qdp/qdp-core/src/gpu/encodings/angle.rs
@@ -168,7 +168,7 @@ impl QuantumEncoder for AngleEncoder {
let batch_state_vector = {
crate::profile_scope!("GPU::AllocBatch");
- GpuStateVector::new_batch(device, num_samples, num_qubits)?
+ GpuStateVector::new_batch(device, num_samples, num_qubits,
Precision::Float64)?
};
let input_bytes = std::mem::size_of_val(batch_data);
@@ -337,7 +337,7 @@ impl QuantumEncoder for AngleEncoder {
}
let batch_state_vector = {
crate::profile_scope!("GPU::AllocBatch");
- GpuStateVector::new_batch(device, num_samples, num_qubits)?
+ GpuStateVector::new_batch(device, num_samples, num_qubits,
Precision::Float64)?
};
let state_ptr = batch_state_vector.ptr_f64().ok_or_else(|| {
MahoutError::InvalidInput(
@@ -412,7 +412,7 @@ impl AngleEncoder {
) -> Result<GpuStateVector> {
let batch_state_vector = {
crate::profile_scope!("GPU::AllocBatch");
- GpuStateVector::new_batch(device, num_samples, num_qubits)?
+ GpuStateVector::new_batch(device, num_samples, num_qubits,
Precision::Float64)?
};
let state_ptr = batch_state_vector.ptr_f64().ok_or_else(|| {
diff --git a/qdp/qdp-core/src/gpu/encodings/basis.rs
b/qdp/qdp-core/src/gpu/encodings/basis.rs
index 569e1455e..494b385af 100644
--- a/qdp/qdp-core/src/gpu/encodings/basis.rs
+++ b/qdp/qdp-core/src/gpu/encodings/basis.rs
@@ -169,7 +169,7 @@ impl QuantumEncoder for BasisEncoder {
// Allocate batch state vector
let batch_state_vector = {
crate::profile_scope!("GPU::AllocBatch");
- GpuStateVector::new_batch(device, num_samples, num_qubits)?
+ GpuStateVector::new_batch(device, num_samples, num_qubits,
Precision::Float64)?
};
// Upload basis indices to GPU
@@ -298,7 +298,7 @@ impl QuantumEncoder for BasisEncoder {
let basis_indices_d = input_batch_d as *const usize;
let batch_state_vector = {
crate::profile_scope!("GPU::AllocBatch");
- GpuStateVector::new_batch(device, num_samples, num_qubits)?
+ GpuStateVector::new_batch(device, num_samples, num_qubits,
Precision::Float64)?
};
let state_ptr = batch_state_vector.ptr_f64().ok_or_else(|| {
MahoutError::InvalidInput(
diff --git a/qdp/qdp-core/src/gpu/encodings/iqp.rs
b/qdp/qdp-core/src/gpu/encodings/iqp.rs
index 7a177a208..bcdc15018 100644
--- a/qdp/qdp-core/src/gpu/encodings/iqp.rs
+++ b/qdp/qdp-core/src/gpu/encodings/iqp.rs
@@ -190,7 +190,7 @@ impl QuantumEncoder for IqpEncoder {
let batch_state_vector = {
crate::profile_scope!("GPU::AllocBatch");
- GpuStateVector::new_batch(device, num_samples, num_qubits)?
+ GpuStateVector::new_batch(device, num_samples, num_qubits,
Precision::Float64)?
};
let input_bytes = std::mem::size_of_val(batch_data);
diff --git a/qdp/qdp-core/src/gpu/memory.rs b/qdp/qdp-core/src/gpu/memory.rs
index 2bb922c16..f8d0a571a 100644
--- a/qdp/qdp-core/src/gpu/memory.rs
+++ b/qdp/qdp-core/src/gpu/memory.rs
@@ -342,9 +342,14 @@ impl GpuStateVector {
self.size_elements
}
- /// Create GPU state vector for a batch of samples
- /// Allocates num_samples * 2^qubits complex numbers on GPU
- pub fn new_batch(_device: &Arc<CudaDevice>, num_samples: usize, qubits:
usize) -> Result<Self> {
+ /// Create GPU state vector for a batch of samples with the given
precision.
+ /// Allocates `num_samples * 2^qubits` complex numbers on GPU.
+ pub fn new_batch(
+ _device: &Arc<CudaDevice>,
+ num_samples: usize,
+ qubits: usize,
+ precision: Precision,
+ ) -> Result<Self> {
let single_state_size: usize = 1usize << qubits;
let total_elements =
num_samples.checked_mul(single_state_size).ok_or_else(|| {
MahoutError::MemoryAllocation(format!(
@@ -355,34 +360,51 @@ impl GpuStateVector {
#[cfg(target_os = "linux")]
{
- let requested_bytes = total_elements
- .checked_mul(std::mem::size_of::<CuDoubleComplex>())
- .ok_or_else(|| {
- MahoutError::MemoryAllocation(format!(
- "Requested GPU allocation size overflow (elements={})",
- total_elements
- ))
- })?;
+ let buffer = match precision {
+ Precision::Float32 => {
+ let requested_bytes = total_elements
+ .checked_mul(std::mem::size_of::<CuComplex>())
+ .ok_or_else(|| {
+ MahoutError::MemoryAllocation(format!(
+ "Requested GPU allocation size overflow
(elements={})",
+ total_elements
+ ))
+ })?;
- // Pre-flight check
- ensure_device_memory_available(
- requested_bytes,
- "batch state vector allocation",
- Some(qubits),
- )?;
+ let context = "batch state vector allocation (f32)";
+ ensure_device_memory_available(requested_bytes, context,
Some(qubits))?;
- let slice =
- unsafe { _device.alloc::<CuDoubleComplex>(total_elements)
}.map_err(|e| {
- map_allocation_error(
- requested_bytes,
- "batch state vector allocation",
- Some(qubits),
- e,
- )
- })?;
+ let slice =
+ unsafe { _device.alloc::<CuComplex>(total_elements)
}.map_err(|e| {
+ map_allocation_error(requested_bytes, context,
Some(qubits), e)
+ })?;
+
+ BufferStorage::F32(GpuBufferRaw { slice })
+ }
+ Precision::Float64 => {
+ let requested_bytes = total_elements
+ .checked_mul(std::mem::size_of::<CuDoubleComplex>())
+ .ok_or_else(|| {
+ MahoutError::MemoryAllocation(format!(
+ "Requested GPU allocation size overflow
(elements={})",
+ total_elements
+ ))
+ })?;
+
+ let context = "batch state vector allocation";
+ ensure_device_memory_available(requested_bytes, context,
Some(qubits))?;
+
+ let slice = unsafe {
_device.alloc::<CuDoubleComplex>(total_elements) }
+ .map_err(|e| {
+ map_allocation_error(requested_bytes, context,
Some(qubits), e)
+ })?;
+
+ BufferStorage::F64(GpuBufferRaw { slice })
+ }
+ };
Ok(Self {
- buffer: Arc::new(BufferStorage::F64(GpuBufferRaw { slice })),
+ buffer: Arc::new(buffer),
num_qubits: qubits,
size_elements: total_elements,
num_samples: Some(num_samples),
diff --git a/qdp/qdp-core/src/lib.rs b/qdp/qdp-core/src/lib.rs
index ed9cba0b8..c8146003b 100644
--- a/qdp/qdp-core/src/lib.rs
+++ b/qdp/qdp-core/src/lib.rs
@@ -605,6 +605,78 @@ impl QdpEngine {
Ok(state_vector.to_dlpack())
}
+ /// Encode a batch from an existing GPU pointer (float32 input, amplitude
encoding only).
+ ///
+ /// Zero-copy batch encoding from PyTorch CUDA float32 tensors. Uses the
default CUDA stream.
+ /// For stream interop use `encode_batch_from_gpu_ptr_f32_with_stream`.
+ ///
+ /// # Safety
+ /// The input pointer must:
+ /// - Point to valid GPU memory on the same device as the engine
+ /// - Contain at least `num_samples * sample_size` f32 elements
+ /// - Remain valid for the duration of this call
+ #[cfg(target_os = "linux")]
+ pub unsafe fn encode_batch_from_gpu_ptr_f32(
+ &self,
+ input_batch_d: *const f32,
+ num_samples: usize,
+ sample_size: usize,
+ num_qubits: usize,
+ ) -> Result<*mut DLManagedTensor> {
+ unsafe {
+ self.encode_batch_from_gpu_ptr_f32_with_stream(
+ input_batch_d,
+ num_samples,
+ sample_size,
+ num_qubits,
+ std::ptr::null_mut(),
+ )
+ }
+ }
+
+ /// Encode a float32 amplitude batch from an existing GPU pointer on a
specified CUDA stream.
+ ///
+ /// # Safety
+ /// In addition to the `encode_batch_from_gpu_ptr_f32` requirements, the
stream pointer
+ /// must remain valid for the duration of this call.
+ #[cfg(target_os = "linux")]
+ pub unsafe fn encode_batch_from_gpu_ptr_f32_with_stream(
+ &self,
+ input_batch_d: *const f32,
+ num_samples: usize,
+ sample_size: usize,
+ num_qubits: usize,
+ stream: *mut c_void,
+ ) -> Result<*mut DLManagedTensor> {
+ crate::profile_scope!("Mahout::EncodeBatchFromGpuPtrF32");
+
+ if num_samples == 0 {
+ return Err(MahoutError::InvalidInput(
+ "Number of samples cannot be zero".into(),
+ ));
+ }
+ if sample_size == 0 {
+ return Err(MahoutError::InvalidInput(
+ "Sample size cannot be zero".into(),
+ ));
+ }
+
+ validate_cuda_input_ptr(&self.device, input_batch_d as *const c_void)?;
+
+ let batch_state_vector = unsafe {
+ gpu::AmplitudeEncoder::encode_batch_from_gpu_ptr_f32_with_stream(
+ &self.device,
+ input_batch_d,
+ num_samples,
+ sample_size,
+ num_qubits,
+ stream,
+ )
+ }?;
+ let batch_state_vector = batch_state_vector.to_precision(&self.device,
self.precision)?;
+ Ok(batch_state_vector.to_dlpack())
+ }
+
/// Encode batch from existing GPU pointer (zero-copy for CUDA tensors)
///
/// This method enables zero-copy batch encoding from PyTorch CUDA tensors.
diff --git a/qdp/qdp-core/tests/dlpack.rs b/qdp/qdp-core/tests/dlpack.rs
index 3c039b371..c22dda384 100644
--- a/qdp/qdp-core/tests/dlpack.rs
+++ b/qdp/qdp-core/tests/dlpack.rs
@@ -31,8 +31,9 @@ mod dlpack_tests {
let num_samples = 4;
let num_qubits = 2; // 2^2 = 4 elements per sample
- let state_vector = GpuStateVector::new_batch(&device, num_samples,
num_qubits)
- .expect("Failed to create batch state vector");
+ let state_vector =
+ GpuStateVector::new_batch(&device, num_samples, num_qubits,
Precision::Float64)
+ .expect("Failed to create batch state vector");
let dlpack_ptr = state_vector.to_dlpack();
assert!(!dlpack_ptr.is_null());
@@ -120,6 +121,41 @@ mod dlpack_tests {
}
}
+ #[test]
+ #[cfg(target_os = "linux")]
+ fn test_dlpack_batch_shape_f32() {
+ let device = CudaDevice::new(0).unwrap();
+
+ let num_samples = 3;
+ let num_qubits = 2;
+ let state_vector =
+ GpuStateVector::new_batch(&device, num_samples, num_qubits,
Precision::Float32)
+ .expect("Failed to create Float32 batch state vector");
+
+ assert!(
+ state_vector.ptr_f32().is_some(),
+ "Float32 batch state vector should have ptr_f32()"
+ );
+ assert!(
+ state_vector.ptr_f64().is_none(),
+ "Float32 batch state vector should not have ptr_f64()"
+ );
+
+ let dlpack_ptr = state_vector.to_dlpack();
+ assert!(!dlpack_ptr.is_null());
+
+ unsafe {
+ let tensor = &(*dlpack_ptr).dl_tensor;
+ assert_eq!(tensor.ndim, 2, "DLPack tensor should be 2D");
+ let shape = std::slice::from_raw_parts(tensor.shape, 2);
+ assert_eq!(shape[0], num_samples as i64);
+ assert_eq!(shape[1], (1 << num_qubits) as i64);
+ if let Some(deleter) = (*dlpack_ptr).deleter {
+ deleter(dlpack_ptr);
+ }
+ }
+ }
+
/// synchronize_stream(null) is a no-op and returns Ok(()) on all
platforms.
#[test]
fn test_synchronize_stream_null() {
diff --git a/qdp/qdp-core/tests/gpu_ptr_encoding.rs
b/qdp/qdp-core/tests/gpu_ptr_encoding.rs
index c672d6956..97d648fdc 100644
--- a/qdp/qdp-core/tests/gpu_ptr_encoding.rs
+++ b/qdp/qdp-core/tests/gpu_ptr_encoding.rs
@@ -52,6 +52,24 @@ fn assert_dlpack_shape_2_4_and_delete(dlpack_ptr: *mut
qdp_core::dlpack::DLManag
}
}
+fn assert_dlpack_batch_shape_and_delete(
+ dlpack_ptr: *mut qdp_core::dlpack::DLManagedTensor,
+ num_samples: i64,
+ state_len: i64,
+) {
+ assert!(!dlpack_ptr.is_null());
+ unsafe {
+ let tensor = &(*dlpack_ptr).dl_tensor;
+ assert_eq!(tensor.ndim, 2);
+ let shape = std::slice::from_raw_parts(tensor.shape, 2);
+ assert_eq!(shape[0], num_samples);
+ assert_eq!(shape[1], state_len);
+ if let Some(deleter) = (*dlpack_ptr).deleter {
+ deleter(dlpack_ptr);
+ }
+ }
+}
+
// ---- Validation / error-path tests (return before using pointer) ----
#[test]
@@ -701,3 +719,183 @@ fn test_encode_from_gpu_ptr_f32_input_exceeds_state_len()
{
e => panic!("Expected InvalidInput, got {:?}", e),
}
}
+
+#[test]
+fn test_encode_batch_from_gpu_ptr_f32_success() {
+ let engine = match engine_f32() {
+ Some(e) => e,
+ None => {
+ println!("SKIP: No GPU");
+ return;
+ }
+ };
+ let num_samples = 2;
+ let sample_size = 4;
+ let (_device, input_d) = match device_and_f32_slice(&[1.0, 0.0, 0.0, 0.0,
0.5, 0.5, 0.5, 0.5]) {
+ Some(t) => t,
+ None => {
+ println!("SKIP: No CUDA device");
+ return;
+ }
+ };
+ let dlpack_ptr = unsafe {
+ engine
+ .encode_batch_from_gpu_ptr_f32(
+ *input_d.device_ptr() as *const f32,
+ num_samples,
+ sample_size,
+ 2,
+ )
+ .expect("encode_batch_from_gpu_ptr_f32")
+ };
+ assert_dlpack_batch_shape_and_delete(dlpack_ptr, num_samples as i64,
sample_size as i64);
+}
+
+#[test]
+fn test_encode_batch_from_gpu_ptr_f32_with_stream_success() {
+ let engine = match engine_f32() {
+ Some(e) => e,
+ None => {
+ println!("SKIP: No GPU");
+ return;
+ }
+ };
+ let (device, input_d) = match device_and_f32_slice(&[1.0, 0.0, 0.0, 0.0,
0.5, 0.5, 0.5, 0.5]) {
+ Some(t) => t,
+ None => {
+ println!("SKIP: No CUDA device");
+ return;
+ }
+ };
+ let stream = device.fork_default_stream().expect("fork_default_stream");
+ let dlpack_ptr = unsafe {
+ engine
+ .encode_batch_from_gpu_ptr_f32_with_stream(
+ *input_d.device_ptr() as *const f32,
+ 2,
+ 4,
+ 2,
+ stream.stream as *mut c_void,
+ )
+ .expect("encode_batch_from_gpu_ptr_f32_with_stream")
+ };
+ assert_dlpack_batch_shape_and_delete(dlpack_ptr, 2, 4);
+}
+
+#[test]
+fn test_encode_batch_from_gpu_ptr_f32_success_f64_engine() {
+ let engine = match QdpEngine::new_with_precision(0,
Precision::Float64).ok() {
+ Some(e) => e,
+ None => {
+ println!("SKIP: No GPU");
+ return;
+ }
+ };
+ let (_device, input_d) = match device_and_f32_slice(&[1.0, 0.0, 0.0, 0.0,
0.5, 0.5, 0.5, 0.5]) {
+ Some(t) => t,
+ None => {
+ println!("SKIP: No CUDA device");
+ return;
+ }
+ };
+ let dlpack_ptr = unsafe {
+ engine
+ .encode_batch_from_gpu_ptr_f32(*input_d.device_ptr() as *const
f32, 2, 4, 2)
+ .expect("encode_batch_from_gpu_ptr_f32 (Float64 engine)")
+ };
+ assert_dlpack_batch_shape_and_delete(dlpack_ptr, 2, 4);
+}
+
+#[test]
+fn test_encode_batch_from_gpu_ptr_f32_zero_samples() {
+ let engine = match engine_f32() {
+ Some(e) => e,
+ None => {
+ println!("SKIP: No GPU");
+ return;
+ }
+ };
+ let result = unsafe {
engine.encode_batch_from_gpu_ptr_f32(std::ptr::null(), 0, 4, 2) };
+ assert!(result.is_err());
+ match &result.unwrap_err() {
+ MahoutError::InvalidInput(msg) => assert!(msg.contains("zero") ||
msg.contains("samples")),
+ e => panic!("Expected InvalidInput, got {:?}", e),
+ }
+}
+
+#[test]
+fn test_encode_batch_from_gpu_ptr_f32_null_pointer() {
+ let engine = match engine_f32() {
+ Some(e) => e,
+ None => {
+ println!("SKIP: No GPU");
+ return;
+ }
+ };
+ let result = unsafe {
engine.encode_batch_from_gpu_ptr_f32(std::ptr::null(), 2, 4, 2) };
+ assert!(result.is_err());
+ match &result.unwrap_err() {
+ MahoutError::InvalidInput(msg) => assert!(msg.contains("null")),
+ e => panic!("Expected InvalidInput, got {:?}", e),
+ }
+}
+
+#[test]
+fn test_encode_batch_from_gpu_ptr_f32_sample_size_exceeds_state_len() {
+ let engine = match engine_f32() {
+ Some(e) => e,
+ None => {
+ println!("SKIP: No GPU");
+ return;
+ }
+ };
+ let (_device, input_d) = match device_and_f32_slice(&[1.0; 10]) {
+ Some(t) => t,
+ None => {
+ println!("SKIP: No CUDA device");
+ return;
+ }
+ };
+ let result = unsafe {
+ engine.encode_batch_from_gpu_ptr_f32(*input_d.device_ptr() as *const
f32, 2, 5, 2)
+ };
+ assert!(result.is_err());
+ match &result.unwrap_err() {
+ MahoutError::InvalidInput(msg) => {
+ assert!(msg.contains("exceeds") || msg.contains("state vector"));
+ }
+ e => panic!("Expected InvalidInput, got {:?}", e),
+ }
+}
+
+#[test]
+fn test_encode_batch_from_gpu_ptr_f32_odd_sample_size_success() {
+ let engine = match engine_f32() {
+ Some(e) => e,
+ None => {
+ println!("SKIP: No GPU");
+ return;
+ }
+ };
+ let num_samples = 2;
+ let sample_size = 3;
+ let num_qubits = 2;
+ let (_device, input_d) = match device_and_f32_slice(&[1.0, 2.0, 2.0, 2.0,
1.0, 2.0]) {
+ Some(t) => t,
+ None => {
+ println!("SKIP: No CUDA device");
+ return;
+ }
+ };
+ let dlpack_ptr = unsafe {
+ engine
+ .encode_batch_from_gpu_ptr_f32(
+ *input_d.device_ptr() as *const f32,
+ num_samples,
+ sample_size,
+ num_qubits,
+ )
+ .expect("encode_batch_from_gpu_ptr_f32 odd sample size")
+ };
+ assert_dlpack_batch_shape_and_delete(dlpack_ptr, num_samples as i64, (1 <<
num_qubits) as i64);
+}
diff --git a/qdp/qdp-kernels/src/amplitude.cu b/qdp/qdp-kernels/src/amplitude.cu
index 224e6bcd6..e9091829d 100644
--- a/qdp/qdp-kernels/src/amplitude.cu
+++ b/qdp/qdp-kernels/src/amplitude.cu
@@ -298,6 +298,70 @@ __global__ void amplitude_encode_batch_kernel(
}
}
+/// Optimized batch amplitude encoding kernel (float32)
+///
+/// Memory Layout (row-major):
+/// - input_batch: [sample0_data | sample1_data | ... | sampleN_data]
+/// - state_batch: [sample0_state | sample1_state | ... | sampleN_state]
+///
+/// Optimizations:
+/// 1. Vectorized float2 loads for 64-bit memory transactions
+/// 2. Grid-stride loop for arbitrary batch sizes
+/// 3. Coalesced memory access within warps
+/// 4. Minimized register pressure
+__global__ void amplitude_encode_batch_kernel_f32(
+ const float* __restrict__ input_batch,
+ cuComplex* __restrict__ state_batch,
+ const float* __restrict__ inv_norms,
+ size_t num_samples,
+ size_t input_len,
+ size_t state_len
+) {
+ // Grid-stride loop pattern for flexibility
+ const size_t elements_per_sample = state_len / 2;
+ const size_t total_work = num_samples * elements_per_sample;
+ const size_t stride = gridDim.x * blockDim.x;
+
+ size_t global_idx = blockIdx.x * blockDim.x + threadIdx.x;
+
+ // Process elements in grid-stride fashion
+ for (size_t idx = global_idx; idx < total_work; idx += stride) {
+ // Decompose linear index into (sample, element_pair)
+ const size_t sample_idx = idx / elements_per_sample;
+ const size_t elem_pair = idx % elements_per_sample;
+
+ // Calculate base addresses (strength-reduced)
+ const size_t input_base = sample_idx * input_len;
+ const size_t state_base = sample_idx * state_len;
+ const size_t elem_offset = elem_pair * 2;
+
+ // Load inverse norm (cached by L1)
+ const float inv_norm = inv_norms[sample_idx];
+
+ float v1, v2;
+ if (elem_offset + 1 < input_len) {
+ const float2 vec_data = __ldg(reinterpret_cast<const
float2*>(input_batch + input_base) + elem_pair);
+ v1 = vec_data.x;
+ v2 = vec_data.y;
+ } else if (elem_offset < input_len) {
+ v1 = __ldg(input_batch + input_base + elem_offset);
+ v2 = 0.0f;
+ } else {
+ v1 = v2 = 0.0f;
+ }
+
+ // Normalize and write as complex numbers
+ const cuComplex c1 = make_cuComplex(v1 * inv_norm, 0.0f);
+ const cuComplex c2 = make_cuComplex(v2 * inv_norm, 0.0f);
+
+ // Write to global memory (coalesced within warp)
+ state_batch[state_base + elem_offset] = c1;
+ if (elem_offset + 1 < state_len) {
+ state_batch[state_base + elem_offset + 1] = c2;
+ }
+ }
+}
+
/// Launch optimized batch amplitude encoding kernel
///
/// # Arguments
@@ -350,6 +414,40 @@ int launch_amplitude_encode_batch(
return (int)cudaGetLastError();
}
+/// Launch optimized batch amplitude encoding kernel for float32 input/output.
+int launch_amplitude_encode_batch_f32(
+ const float* input_batch_d,
+ void* state_batch_d,
+ const float* inv_norms_d,
+ size_t num_samples,
+ size_t input_len,
+ size_t state_len,
+ cudaStream_t stream
+) {
+ if (num_samples == 0 || state_len == 0) {
+ return cudaErrorInvalidValue;
+ }
+
+ cuComplex* state_complex_d = static_cast<cuComplex*>(state_batch_d);
+
+ const int blockSize = DEFAULT_BLOCK_SIZE;
+ const size_t total_work = num_samples * (state_len / 2);
+ const size_t blocks_needed = (total_work + blockSize - 1) / blockSize;
+ const size_t max_blocks = MAX_GRID_BLOCKS;
+ const size_t gridSize = (blocks_needed < max_blocks) ? blocks_needed :
max_blocks;
+
+ amplitude_encode_batch_kernel_f32<<<gridSize, blockSize, 0, stream>>>(
+ input_batch_d,
+ state_complex_d,
+ inv_norms_d,
+ num_samples,
+ input_len,
+ state_len
+ );
+
+ return (int)cudaGetLastError();
+}
+
/// Kernel: accumulate L2 norm using coalesced vectorized loads.
/// Each block atomically adds its partial sum to the output accumulator.
__global__ void l2_norm_kernel(
diff --git a/qdp/qdp-kernels/src/lib.rs b/qdp/qdp-kernels/src/lib.rs
index 2bbd21609..45d94593f 100644
--- a/qdp/qdp-kernels/src/lib.rs
+++ b/qdp/qdp-kernels/src/lib.rs
@@ -98,6 +98,21 @@ unsafe extern "C" {
stream: *mut c_void,
) -> i32;
+ /// Launch batch amplitude encoding kernel (float32 input/output)
+ /// Returns CUDA error code (0 = success)
+ ///
+ /// # Safety
+ /// Requires valid GPU pointers, must sync before freeing
+ pub fn launch_amplitude_encode_batch_f32(
+ input_batch_d: *const f32,
+ state_batch_d: *mut c_void,
+ inv_norms_d: *const f32,
+ num_samples: usize,
+ input_len: usize,
+ state_len: usize,
+ stream: *mut c_void,
+ ) -> i32;
+
/// Launch L2 norm reduction (returns inverse norm)
/// Returns CUDA error code (0 = success)
///
@@ -298,6 +313,20 @@ pub extern "C" fn launch_amplitude_encode_batch(
999
}
+#[cfg(any(not(target_os = "linux"), qdp_no_cuda))]
+#[unsafe(no_mangle)]
+pub extern "C" fn launch_amplitude_encode_batch_f32(
+ _input_batch_d: *const f32,
+ _state_batch_d: *mut c_void,
+ _inv_norms_d: *const f32,
+ _num_samples: usize,
+ _input_len: usize,
+ _state_len: usize,
+ _stream: *mut c_void,
+) -> i32 {
+ 999
+}
+
#[cfg(any(not(target_os = "linux"), qdp_no_cuda))]
#[unsafe(no_mangle)]
pub extern "C" fn launch_l2_norm(