This is an automated email from the ASF dual-hosted git repository.

guanmingchiu pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/mahout.git


The following commit(s) were added to refs/heads/main by this push:
     new 52bc028e7 [QDP] Add zero-copy amplitude batch encoding from float32 
GPU tensors (#1029)
52bc028e7 is described below

commit 52bc028e778c08cccc148c63508c59cc46ace19f
Author: Vic Wen <[email protected]>
AuthorDate: Wed Mar 4 15:40:10 2026 +0800

    [QDP] Add zero-copy amplitude batch encoding from float32 GPU tensors 
(#1029)
    
    * Add batch float32 amplitude encoding
    
    * Add batch float32 amplitude encoding tests
    
    ---------
    
    Co-authored-by: Ryan Huang <[email protected]>
---
 qdp/qdp-core/src/encoding/mod.rs            |   7 +-
 qdp/qdp-core/src/gpu/encodings/amplitude.rs | 121 ++++++++++++++++-
 qdp/qdp-core/src/gpu/encodings/angle.rs     |   6 +-
 qdp/qdp-core/src/gpu/encodings/basis.rs     |   4 +-
 qdp/qdp-core/src/gpu/encodings/iqp.rs       |   2 +-
 qdp/qdp-core/src/gpu/memory.rs              |  76 +++++++----
 qdp/qdp-core/src/lib.rs                     |  72 ++++++++++
 qdp/qdp-core/tests/dlpack.rs                |  40 +++++-
 qdp/qdp-core/tests/gpu_ptr_encoding.rs      | 198 ++++++++++++++++++++++++++++
 qdp/qdp-kernels/src/amplitude.cu            |  98 ++++++++++++++
 qdp/qdp-kernels/src/lib.rs                  |  29 ++++
 11 files changed, 613 insertions(+), 40 deletions(-)

diff --git a/qdp/qdp-core/src/encoding/mod.rs b/qdp/qdp-core/src/encoding/mod.rs
index a06548a9a..851ed6502 100644
--- a/qdp/qdp-core/src/encoding/mod.rs
+++ b/qdp/qdp-core/src/encoding/mod.rs
@@ -146,7 +146,12 @@ pub(crate) fn stream_encode<E: ChunkEncoder>(
     let num_samples = reader_core.total_rows;
 
     // Allocate output state vector
-    let total_state_vector = GpuStateVector::new_batch(&engine.device, 
num_samples, num_qubits)?;
+    let total_state_vector = GpuStateVector::new_batch(
+        &engine.device,
+        num_samples,
+        num_qubits,
+        crate::Precision::Float64,
+    )?;
     const PIPELINE_EVENT_SLOTS: usize = 2;
     let ctx = PipelineContext::new(&engine.device, PIPELINE_EVENT_SLOTS)?;
 
diff --git a/qdp/qdp-core/src/gpu/encodings/amplitude.rs 
b/qdp/qdp-core/src/gpu/encodings/amplitude.rs
index fc57d189e..1be318cfe 100644
--- a/qdp/qdp-core/src/gpu/encodings/amplitude.rs
+++ b/qdp/qdp-core/src/gpu/encodings/amplitude.rs
@@ -40,8 +40,8 @@ use crate::gpu::memory::{ensure_device_memory_available, 
map_allocation_error};
 use cudarc::driver::{DevicePtr, DevicePtrMut};
 #[cfg(target_os = "linux")]
 use qdp_kernels::{
-    launch_amplitude_encode, launch_amplitude_encode_batch, launch_l2_norm, 
launch_l2_norm_batch,
-    launch_l2_norm_f32,
+    launch_amplitude_encode, launch_amplitude_encode_batch, 
launch_amplitude_encode_batch_f32,
+    launch_l2_norm, launch_l2_norm_batch, launch_l2_norm_batch_f32, 
launch_l2_norm_f32,
 };
 #[cfg(target_os = "linux")]
 use std::ffi::c_void;
@@ -206,7 +206,7 @@ impl QuantumEncoder for AmplitudeEncoder {
         // Allocate single large GPU buffer for all states
         let batch_state_vector = {
             crate::profile_scope!("GPU::AllocBatch");
-            GpuStateVector::new_batch(device, num_samples, num_qubits)?
+            GpuStateVector::new_batch(device, num_samples, num_qubits, 
Precision::Float64)?
         };
 
         // Upload input data to GPU
@@ -386,7 +386,7 @@ impl QuantumEncoder for AmplitudeEncoder {
         let input_batch_d = input_batch_d as *const f64;
         let batch_state_vector = {
             crate::profile_scope!("GPU::AllocBatch");
-            GpuStateVector::new_batch(device, num_samples, num_qubits)?
+            GpuStateVector::new_batch(device, num_samples, num_qubits, 
Precision::Float64)?
         };
         let inv_norms_gpu = {
             crate::profile_scope!("GPU::BatchNormKernel");
@@ -579,6 +579,119 @@ impl AmplitudeEncoder {
 }
 
 impl AmplitudeEncoder {
+    /// Encode a batch directly from a GPU float32 pointer.
+    ///
+    /// # Safety
+    /// The caller must ensure `input_batch_d` points to valid GPU memory 
containing
+    /// at least `num_samples * sample_size` f32 elements on the same device 
as `device`.
+    #[cfg(target_os = "linux")]
+    pub unsafe fn encode_batch_from_gpu_ptr_f32_with_stream(
+        device: &Arc<CudaDevice>,
+        input_batch_d: *const f32,
+        num_samples: usize,
+        sample_size: usize,
+        num_qubits: usize,
+        stream: *mut c_void,
+    ) -> Result<GpuStateVector> {
+        let state_len = 1 << num_qubits;
+        if num_samples == 0 {
+            return Err(MahoutError::InvalidInput(
+                "Number of samples cannot be zero".into(),
+            ));
+        }
+        if sample_size == 0 {
+            return Err(MahoutError::InvalidInput(
+                "Sample size cannot be zero".into(),
+            ));
+        }
+        if sample_size > state_len {
+            return Err(MahoutError::InvalidInput(format!(
+                "Sample size {} exceeds state vector size {} (2^{} qubits)",
+                sample_size, state_len, num_qubits
+            )));
+        }
+
+        let batch_state_vector =
+            GpuStateVector::new_batch(device, num_samples, num_qubits, 
Precision::Float32)?;
+
+        let inv_norms_gpu = {
+            crate::profile_scope!("GPU::BatchNormKernelF32");
+            use cudarc::driver::DevicePtrMut;
+
+            let mut buffer = 
device.alloc_zeros::<f32>(num_samples).map_err(|e| {
+                MahoutError::MemoryAllocation(format!(
+                    "Failed to allocate f32 norm buffer: {:?}",
+                    e
+                ))
+            })?;
+            let ret = unsafe {
+                launch_l2_norm_batch_f32(
+                    input_batch_d,
+                    num_samples,
+                    sample_size,
+                    *buffer.device_ptr_mut() as *mut f32,
+                    stream,
+                )
+            };
+            if ret != 0 {
+                return Err(MahoutError::KernelLaunch(format!(
+                    "Norm reduction kernel f32 failed with CUDA error code: {} 
({})",
+                    ret,
+                    cuda_error_to_string(ret)
+                )));
+            }
+            buffer
+        };
+
+        {
+            crate::profile_scope!("GPU::NormValidationF32");
+            let host_inv_norms = 
device.dtoh_sync_copy(&inv_norms_gpu).map_err(|e| {
+                MahoutError::Cuda(format!("Failed to copy f32 norms to host: 
{:?}", e))
+            })?;
+            if host_inv_norms.iter().any(|v| !v.is_finite() || *v == 0.0) {
+                return Err(MahoutError::InvalidInput(
+                    "One or more float32 samples have zero or invalid 
norm".to_string(),
+                ));
+            }
+        }
+
+        {
+            crate::profile_scope!("GPU::BatchKernelLaunchF32");
+            use cudarc::driver::DevicePtr;
+
+            let state_ptr = batch_state_vector.ptr_f32().ok_or_else(|| {
+                MahoutError::InvalidInput(
+                    "Batch state vector precision mismatch (expected float32 
buffer)".to_string(),
+                )
+            })?;
+            let ret = unsafe {
+                launch_amplitude_encode_batch_f32(
+                    input_batch_d,
+                    state_ptr as *mut c_void,
+                    *inv_norms_gpu.device_ptr() as *const f32,
+                    num_samples,
+                    sample_size,
+                    state_len,
+                    stream,
+                )
+            };
+            if ret != 0 {
+                return Err(MahoutError::KernelLaunch(format!(
+                    "Batch kernel f32 launch failed with CUDA error code: {} 
({})",
+                    ret,
+                    cuda_error_to_string(ret)
+                )));
+            }
+        }
+
+        {
+            crate::profile_scope!("GPU::Synchronize");
+            sync_cuda_stream(stream, "CUDA stream synchronize failed")?;
+        }
+
+        Ok(batch_state_vector)
+    }
+
     /// Compute inverse L2 norm on GPU using the reduction kernel.
     ///
     /// # Arguments
diff --git a/qdp/qdp-core/src/gpu/encodings/angle.rs 
b/qdp/qdp-core/src/gpu/encodings/angle.rs
index 1c3e5b8f5..24ed50a6d 100644
--- a/qdp/qdp-core/src/gpu/encodings/angle.rs
+++ b/qdp/qdp-core/src/gpu/encodings/angle.rs
@@ -168,7 +168,7 @@ impl QuantumEncoder for AngleEncoder {
 
         let batch_state_vector = {
             crate::profile_scope!("GPU::AllocBatch");
-            GpuStateVector::new_batch(device, num_samples, num_qubits)?
+            GpuStateVector::new_batch(device, num_samples, num_qubits, 
Precision::Float64)?
         };
 
         let input_bytes = std::mem::size_of_val(batch_data);
@@ -337,7 +337,7 @@ impl QuantumEncoder for AngleEncoder {
         }
         let batch_state_vector = {
             crate::profile_scope!("GPU::AllocBatch");
-            GpuStateVector::new_batch(device, num_samples, num_qubits)?
+            GpuStateVector::new_batch(device, num_samples, num_qubits, 
Precision::Float64)?
         };
         let state_ptr = batch_state_vector.ptr_f64().ok_or_else(|| {
             MahoutError::InvalidInput(
@@ -412,7 +412,7 @@ impl AngleEncoder {
     ) -> Result<GpuStateVector> {
         let batch_state_vector = {
             crate::profile_scope!("GPU::AllocBatch");
-            GpuStateVector::new_batch(device, num_samples, num_qubits)?
+            GpuStateVector::new_batch(device, num_samples, num_qubits, 
Precision::Float64)?
         };
 
         let state_ptr = batch_state_vector.ptr_f64().ok_or_else(|| {
diff --git a/qdp/qdp-core/src/gpu/encodings/basis.rs 
b/qdp/qdp-core/src/gpu/encodings/basis.rs
index 569e1455e..494b385af 100644
--- a/qdp/qdp-core/src/gpu/encodings/basis.rs
+++ b/qdp/qdp-core/src/gpu/encodings/basis.rs
@@ -169,7 +169,7 @@ impl QuantumEncoder for BasisEncoder {
         // Allocate batch state vector
         let batch_state_vector = {
             crate::profile_scope!("GPU::AllocBatch");
-            GpuStateVector::new_batch(device, num_samples, num_qubits)?
+            GpuStateVector::new_batch(device, num_samples, num_qubits, 
Precision::Float64)?
         };
 
         // Upload basis indices to GPU
@@ -298,7 +298,7 @@ impl QuantumEncoder for BasisEncoder {
         let basis_indices_d = input_batch_d as *const usize;
         let batch_state_vector = {
             crate::profile_scope!("GPU::AllocBatch");
-            GpuStateVector::new_batch(device, num_samples, num_qubits)?
+            GpuStateVector::new_batch(device, num_samples, num_qubits, 
Precision::Float64)?
         };
         let state_ptr = batch_state_vector.ptr_f64().ok_or_else(|| {
             MahoutError::InvalidInput(
diff --git a/qdp/qdp-core/src/gpu/encodings/iqp.rs 
b/qdp/qdp-core/src/gpu/encodings/iqp.rs
index 7a177a208..bcdc15018 100644
--- a/qdp/qdp-core/src/gpu/encodings/iqp.rs
+++ b/qdp/qdp-core/src/gpu/encodings/iqp.rs
@@ -190,7 +190,7 @@ impl QuantumEncoder for IqpEncoder {
 
         let batch_state_vector = {
             crate::profile_scope!("GPU::AllocBatch");
-            GpuStateVector::new_batch(device, num_samples, num_qubits)?
+            GpuStateVector::new_batch(device, num_samples, num_qubits, 
Precision::Float64)?
         };
 
         let input_bytes = std::mem::size_of_val(batch_data);
diff --git a/qdp/qdp-core/src/gpu/memory.rs b/qdp/qdp-core/src/gpu/memory.rs
index 2bb922c16..f8d0a571a 100644
--- a/qdp/qdp-core/src/gpu/memory.rs
+++ b/qdp/qdp-core/src/gpu/memory.rs
@@ -342,9 +342,14 @@ impl GpuStateVector {
         self.size_elements
     }
 
-    /// Create GPU state vector for a batch of samples
-    /// Allocates num_samples * 2^qubits complex numbers on GPU
-    pub fn new_batch(_device: &Arc<CudaDevice>, num_samples: usize, qubits: 
usize) -> Result<Self> {
+    /// Create GPU state vector for a batch of samples with the given 
precision.
+    /// Allocates `num_samples * 2^qubits` complex numbers on GPU.
+    pub fn new_batch(
+        _device: &Arc<CudaDevice>,
+        num_samples: usize,
+        qubits: usize,
+        precision: Precision,
+    ) -> Result<Self> {
         let single_state_size: usize = 1usize << qubits;
         let total_elements = 
num_samples.checked_mul(single_state_size).ok_or_else(|| {
             MahoutError::MemoryAllocation(format!(
@@ -355,34 +360,51 @@ impl GpuStateVector {
 
         #[cfg(target_os = "linux")]
         {
-            let requested_bytes = total_elements
-                .checked_mul(std::mem::size_of::<CuDoubleComplex>())
-                .ok_or_else(|| {
-                    MahoutError::MemoryAllocation(format!(
-                        "Requested GPU allocation size overflow (elements={})",
-                        total_elements
-                    ))
-                })?;
+            let buffer = match precision {
+                Precision::Float32 => {
+                    let requested_bytes = total_elements
+                        .checked_mul(std::mem::size_of::<CuComplex>())
+                        .ok_or_else(|| {
+                            MahoutError::MemoryAllocation(format!(
+                                "Requested GPU allocation size overflow 
(elements={})",
+                                total_elements
+                            ))
+                        })?;
 
-            // Pre-flight check
-            ensure_device_memory_available(
-                requested_bytes,
-                "batch state vector allocation",
-                Some(qubits),
-            )?;
+                    let context = "batch state vector allocation (f32)";
+                    ensure_device_memory_available(requested_bytes, context, 
Some(qubits))?;
 
-            let slice =
-                unsafe { _device.alloc::<CuDoubleComplex>(total_elements) 
}.map_err(|e| {
-                    map_allocation_error(
-                        requested_bytes,
-                        "batch state vector allocation",
-                        Some(qubits),
-                        e,
-                    )
-                })?;
+                    let slice =
+                        unsafe { _device.alloc::<CuComplex>(total_elements) 
}.map_err(|e| {
+                            map_allocation_error(requested_bytes, context, 
Some(qubits), e)
+                        })?;
+
+                    BufferStorage::F32(GpuBufferRaw { slice })
+                }
+                Precision::Float64 => {
+                    let requested_bytes = total_elements
+                        .checked_mul(std::mem::size_of::<CuDoubleComplex>())
+                        .ok_or_else(|| {
+                            MahoutError::MemoryAllocation(format!(
+                                "Requested GPU allocation size overflow 
(elements={})",
+                                total_elements
+                            ))
+                        })?;
+
+                    let context = "batch state vector allocation";
+                    ensure_device_memory_available(requested_bytes, context, 
Some(qubits))?;
+
+                    let slice = unsafe { 
_device.alloc::<CuDoubleComplex>(total_elements) }
+                        .map_err(|e| {
+                            map_allocation_error(requested_bytes, context, 
Some(qubits), e)
+                        })?;
+
+                    BufferStorage::F64(GpuBufferRaw { slice })
+                }
+            };
 
             Ok(Self {
-                buffer: Arc::new(BufferStorage::F64(GpuBufferRaw { slice })),
+                buffer: Arc::new(buffer),
                 num_qubits: qubits,
                 size_elements: total_elements,
                 num_samples: Some(num_samples),
diff --git a/qdp/qdp-core/src/lib.rs b/qdp/qdp-core/src/lib.rs
index ed9cba0b8..c8146003b 100644
--- a/qdp/qdp-core/src/lib.rs
+++ b/qdp/qdp-core/src/lib.rs
@@ -605,6 +605,78 @@ impl QdpEngine {
         Ok(state_vector.to_dlpack())
     }
 
+    /// Encode a batch from an existing GPU pointer (float32 input, amplitude 
encoding only).
+    ///
+    /// Zero-copy batch encoding from PyTorch CUDA float32 tensors. Uses the 
default CUDA stream.
+    /// For stream interop use `encode_batch_from_gpu_ptr_f32_with_stream`.
+    ///
+    /// # Safety
+    /// The input pointer must:
+    /// - Point to valid GPU memory on the same device as the engine
+    /// - Contain at least `num_samples * sample_size` f32 elements
+    /// - Remain valid for the duration of this call
+    #[cfg(target_os = "linux")]
+    pub unsafe fn encode_batch_from_gpu_ptr_f32(
+        &self,
+        input_batch_d: *const f32,
+        num_samples: usize,
+        sample_size: usize,
+        num_qubits: usize,
+    ) -> Result<*mut DLManagedTensor> {
+        unsafe {
+            self.encode_batch_from_gpu_ptr_f32_with_stream(
+                input_batch_d,
+                num_samples,
+                sample_size,
+                num_qubits,
+                std::ptr::null_mut(),
+            )
+        }
+    }
+
+    /// Encode a float32 amplitude batch from an existing GPU pointer on a 
specified CUDA stream.
+    ///
+    /// # Safety
+    /// In addition to the `encode_batch_from_gpu_ptr_f32` requirements, the 
stream pointer
+    /// must remain valid for the duration of this call.
+    #[cfg(target_os = "linux")]
+    pub unsafe fn encode_batch_from_gpu_ptr_f32_with_stream(
+        &self,
+        input_batch_d: *const f32,
+        num_samples: usize,
+        sample_size: usize,
+        num_qubits: usize,
+        stream: *mut c_void,
+    ) -> Result<*mut DLManagedTensor> {
+        crate::profile_scope!("Mahout::EncodeBatchFromGpuPtrF32");
+
+        if num_samples == 0 {
+            return Err(MahoutError::InvalidInput(
+                "Number of samples cannot be zero".into(),
+            ));
+        }
+        if sample_size == 0 {
+            return Err(MahoutError::InvalidInput(
+                "Sample size cannot be zero".into(),
+            ));
+        }
+
+        validate_cuda_input_ptr(&self.device, input_batch_d as *const c_void)?;
+
+        let batch_state_vector = unsafe {
+            gpu::AmplitudeEncoder::encode_batch_from_gpu_ptr_f32_with_stream(
+                &self.device,
+                input_batch_d,
+                num_samples,
+                sample_size,
+                num_qubits,
+                stream,
+            )
+        }?;
+        let batch_state_vector = batch_state_vector.to_precision(&self.device, 
self.precision)?;
+        Ok(batch_state_vector.to_dlpack())
+    }
+
     /// Encode batch from existing GPU pointer (zero-copy for CUDA tensors)
     ///
     /// This method enables zero-copy batch encoding from PyTorch CUDA tensors.
diff --git a/qdp/qdp-core/tests/dlpack.rs b/qdp/qdp-core/tests/dlpack.rs
index 3c039b371..c22dda384 100644
--- a/qdp/qdp-core/tests/dlpack.rs
+++ b/qdp/qdp-core/tests/dlpack.rs
@@ -31,8 +31,9 @@ mod dlpack_tests {
 
         let num_samples = 4;
         let num_qubits = 2; // 2^2 = 4 elements per sample
-        let state_vector = GpuStateVector::new_batch(&device, num_samples, 
num_qubits)
-            .expect("Failed to create batch state vector");
+        let state_vector =
+            GpuStateVector::new_batch(&device, num_samples, num_qubits, 
Precision::Float64)
+                .expect("Failed to create batch state vector");
 
         let dlpack_ptr = state_vector.to_dlpack();
         assert!(!dlpack_ptr.is_null());
@@ -120,6 +121,41 @@ mod dlpack_tests {
         }
     }
 
+    #[test]
+    #[cfg(target_os = "linux")]
+    fn test_dlpack_batch_shape_f32() {
+        let device = CudaDevice::new(0).unwrap();
+
+        let num_samples = 3;
+        let num_qubits = 2;
+        let state_vector =
+            GpuStateVector::new_batch(&device, num_samples, num_qubits, 
Precision::Float32)
+                .expect("Failed to create Float32 batch state vector");
+
+        assert!(
+            state_vector.ptr_f32().is_some(),
+            "Float32 batch state vector should have ptr_f32()"
+        );
+        assert!(
+            state_vector.ptr_f64().is_none(),
+            "Float32 batch state vector should not have ptr_f64()"
+        );
+
+        let dlpack_ptr = state_vector.to_dlpack();
+        assert!(!dlpack_ptr.is_null());
+
+        unsafe {
+            let tensor = &(*dlpack_ptr).dl_tensor;
+            assert_eq!(tensor.ndim, 2, "DLPack tensor should be 2D");
+            let shape = std::slice::from_raw_parts(tensor.shape, 2);
+            assert_eq!(shape[0], num_samples as i64);
+            assert_eq!(shape[1], (1 << num_qubits) as i64);
+            if let Some(deleter) = (*dlpack_ptr).deleter {
+                deleter(dlpack_ptr);
+            }
+        }
+    }
+
     /// synchronize_stream(null) is a no-op and returns Ok(()) on all 
platforms.
     #[test]
     fn test_synchronize_stream_null() {
diff --git a/qdp/qdp-core/tests/gpu_ptr_encoding.rs 
b/qdp/qdp-core/tests/gpu_ptr_encoding.rs
index c672d6956..97d648fdc 100644
--- a/qdp/qdp-core/tests/gpu_ptr_encoding.rs
+++ b/qdp/qdp-core/tests/gpu_ptr_encoding.rs
@@ -52,6 +52,24 @@ fn assert_dlpack_shape_2_4_and_delete(dlpack_ptr: *mut 
qdp_core::dlpack::DLManag
     }
 }
 
+fn assert_dlpack_batch_shape_and_delete(
+    dlpack_ptr: *mut qdp_core::dlpack::DLManagedTensor,
+    num_samples: i64,
+    state_len: i64,
+) {
+    assert!(!dlpack_ptr.is_null());
+    unsafe {
+        let tensor = &(*dlpack_ptr).dl_tensor;
+        assert_eq!(tensor.ndim, 2);
+        let shape = std::slice::from_raw_parts(tensor.shape, 2);
+        assert_eq!(shape[0], num_samples);
+        assert_eq!(shape[1], state_len);
+        if let Some(deleter) = (*dlpack_ptr).deleter {
+            deleter(dlpack_ptr);
+        }
+    }
+}
+
 // ---- Validation / error-path tests (return before using pointer) ----
 
 #[test]
@@ -701,3 +719,183 @@ fn test_encode_from_gpu_ptr_f32_input_exceeds_state_len() 
{
         e => panic!("Expected InvalidInput, got {:?}", e),
     }
 }
+
+#[test]
+fn test_encode_batch_from_gpu_ptr_f32_success() {
+    let engine = match engine_f32() {
+        Some(e) => e,
+        None => {
+            println!("SKIP: No GPU");
+            return;
+        }
+    };
+    let num_samples = 2;
+    let sample_size = 4;
+    let (_device, input_d) = match device_and_f32_slice(&[1.0, 0.0, 0.0, 0.0, 
0.5, 0.5, 0.5, 0.5]) {
+        Some(t) => t,
+        None => {
+            println!("SKIP: No CUDA device");
+            return;
+        }
+    };
+    let dlpack_ptr = unsafe {
+        engine
+            .encode_batch_from_gpu_ptr_f32(
+                *input_d.device_ptr() as *const f32,
+                num_samples,
+                sample_size,
+                2,
+            )
+            .expect("encode_batch_from_gpu_ptr_f32")
+    };
+    assert_dlpack_batch_shape_and_delete(dlpack_ptr, num_samples as i64, 
sample_size as i64);
+}
+
+#[test]
+fn test_encode_batch_from_gpu_ptr_f32_with_stream_success() {
+    let engine = match engine_f32() {
+        Some(e) => e,
+        None => {
+            println!("SKIP: No GPU");
+            return;
+        }
+    };
+    let (device, input_d) = match device_and_f32_slice(&[1.0, 0.0, 0.0, 0.0, 
0.5, 0.5, 0.5, 0.5]) {
+        Some(t) => t,
+        None => {
+            println!("SKIP: No CUDA device");
+            return;
+        }
+    };
+    let stream = device.fork_default_stream().expect("fork_default_stream");
+    let dlpack_ptr = unsafe {
+        engine
+            .encode_batch_from_gpu_ptr_f32_with_stream(
+                *input_d.device_ptr() as *const f32,
+                2,
+                4,
+                2,
+                stream.stream as *mut c_void,
+            )
+            .expect("encode_batch_from_gpu_ptr_f32_with_stream")
+    };
+    assert_dlpack_batch_shape_and_delete(dlpack_ptr, 2, 4);
+}
+
+#[test]
+fn test_encode_batch_from_gpu_ptr_f32_success_f64_engine() {
+    let engine = match QdpEngine::new_with_precision(0, 
Precision::Float64).ok() {
+        Some(e) => e,
+        None => {
+            println!("SKIP: No GPU");
+            return;
+        }
+    };
+    let (_device, input_d) = match device_and_f32_slice(&[1.0, 0.0, 0.0, 0.0, 
0.5, 0.5, 0.5, 0.5]) {
+        Some(t) => t,
+        None => {
+            println!("SKIP: No CUDA device");
+            return;
+        }
+    };
+    let dlpack_ptr = unsafe {
+        engine
+            .encode_batch_from_gpu_ptr_f32(*input_d.device_ptr() as *const 
f32, 2, 4, 2)
+            .expect("encode_batch_from_gpu_ptr_f32 (Float64 engine)")
+    };
+    assert_dlpack_batch_shape_and_delete(dlpack_ptr, 2, 4);
+}
+
+#[test]
+fn test_encode_batch_from_gpu_ptr_f32_zero_samples() {
+    let engine = match engine_f32() {
+        Some(e) => e,
+        None => {
+            println!("SKIP: No GPU");
+            return;
+        }
+    };
+    let result = unsafe { 
engine.encode_batch_from_gpu_ptr_f32(std::ptr::null(), 0, 4, 2) };
+    assert!(result.is_err());
+    match &result.unwrap_err() {
+        MahoutError::InvalidInput(msg) => assert!(msg.contains("zero") || 
msg.contains("samples")),
+        e => panic!("Expected InvalidInput, got {:?}", e),
+    }
+}
+
+#[test]
+fn test_encode_batch_from_gpu_ptr_f32_null_pointer() {
+    let engine = match engine_f32() {
+        Some(e) => e,
+        None => {
+            println!("SKIP: No GPU");
+            return;
+        }
+    };
+    let result = unsafe { 
engine.encode_batch_from_gpu_ptr_f32(std::ptr::null(), 2, 4, 2) };
+    assert!(result.is_err());
+    match &result.unwrap_err() {
+        MahoutError::InvalidInput(msg) => assert!(msg.contains("null")),
+        e => panic!("Expected InvalidInput, got {:?}", e),
+    }
+}
+
+#[test]
+fn test_encode_batch_from_gpu_ptr_f32_sample_size_exceeds_state_len() {
+    let engine = match engine_f32() {
+        Some(e) => e,
+        None => {
+            println!("SKIP: No GPU");
+            return;
+        }
+    };
+    let (_device, input_d) = match device_and_f32_slice(&[1.0; 10]) {
+        Some(t) => t,
+        None => {
+            println!("SKIP: No CUDA device");
+            return;
+        }
+    };
+    let result = unsafe {
+        engine.encode_batch_from_gpu_ptr_f32(*input_d.device_ptr() as *const 
f32, 2, 5, 2)
+    };
+    assert!(result.is_err());
+    match &result.unwrap_err() {
+        MahoutError::InvalidInput(msg) => {
+            assert!(msg.contains("exceeds") || msg.contains("state vector"));
+        }
+        e => panic!("Expected InvalidInput, got {:?}", e),
+    }
+}
+
+#[test]
+fn test_encode_batch_from_gpu_ptr_f32_odd_sample_size_success() {
+    let engine = match engine_f32() {
+        Some(e) => e,
+        None => {
+            println!("SKIP: No GPU");
+            return;
+        }
+    };
+    let num_samples = 2;
+    let sample_size = 3;
+    let num_qubits = 2;
+    let (_device, input_d) = match device_and_f32_slice(&[1.0, 2.0, 2.0, 2.0, 
1.0, 2.0]) {
+        Some(t) => t,
+        None => {
+            println!("SKIP: No CUDA device");
+            return;
+        }
+    };
+    let dlpack_ptr = unsafe {
+        engine
+            .encode_batch_from_gpu_ptr_f32(
+                *input_d.device_ptr() as *const f32,
+                num_samples,
+                sample_size,
+                num_qubits,
+            )
+            .expect("encode_batch_from_gpu_ptr_f32 odd sample size")
+    };
+    assert_dlpack_batch_shape_and_delete(dlpack_ptr, num_samples as i64, (1 << 
num_qubits) as i64);
+}
diff --git a/qdp/qdp-kernels/src/amplitude.cu b/qdp/qdp-kernels/src/amplitude.cu
index 224e6bcd6..e9091829d 100644
--- a/qdp/qdp-kernels/src/amplitude.cu
+++ b/qdp/qdp-kernels/src/amplitude.cu
@@ -298,6 +298,70 @@ __global__ void amplitude_encode_batch_kernel(
     }
 }
 
+/// Optimized batch amplitude encoding kernel (float32)
+///
+/// Memory Layout (row-major):
+/// - input_batch: [sample0_data | sample1_data | ... | sampleN_data]
+/// - state_batch: [sample0_state | sample1_state | ... | sampleN_state]
+///
+/// Optimizations:
+/// 1. Vectorized float2 loads for 64-bit memory transactions
+/// 2. Grid-stride loop for arbitrary batch sizes
+/// 3. Coalesced memory access within warps
+/// 4. Minimized register pressure
+__global__ void amplitude_encode_batch_kernel_f32(
+    const float* __restrict__ input_batch,
+    cuComplex* __restrict__ state_batch,
+    const float* __restrict__ inv_norms,
+    size_t num_samples,
+    size_t input_len,
+    size_t state_len
+) {
+    // Grid-stride loop pattern for flexibility
+    const size_t elements_per_sample = state_len / 2;
+    const size_t total_work = num_samples * elements_per_sample;
+    const size_t stride = gridDim.x * blockDim.x;
+
+    size_t global_idx = blockIdx.x * blockDim.x + threadIdx.x;
+
+    // Process elements in grid-stride fashion
+    for (size_t idx = global_idx; idx < total_work; idx += stride) {
+        // Decompose linear index into (sample, element_pair)
+        const size_t sample_idx = idx / elements_per_sample;
+        const size_t elem_pair = idx % elements_per_sample;
+
+        // Calculate base addresses (strength-reduced)
+        const size_t input_base = sample_idx * input_len;
+        const size_t state_base = sample_idx * state_len;
+        const size_t elem_offset = elem_pair * 2;
+
+        // Load inverse norm (cached by L1)
+        const float inv_norm = inv_norms[sample_idx];
+
+        float v1, v2;
+        if (elem_offset + 1 < input_len) {
+            const float2 vec_data = __ldg(reinterpret_cast<const 
float2*>(input_batch + input_base) + elem_pair);
+            v1 = vec_data.x;
+            v2 = vec_data.y;
+        } else if (elem_offset < input_len) {
+            v1 = __ldg(input_batch + input_base + elem_offset);
+            v2 = 0.0f;
+        } else {
+            v1 = v2 = 0.0f;
+        }
+
+        // Normalize and write as complex numbers
+        const cuComplex c1 = make_cuComplex(v1 * inv_norm, 0.0f);
+        const cuComplex c2 = make_cuComplex(v2 * inv_norm, 0.0f);
+
+        // Write to global memory (coalesced within warp)
+        state_batch[state_base + elem_offset] = c1;
+        if (elem_offset + 1 < state_len) {
+            state_batch[state_base + elem_offset + 1] = c2;
+        }
+    }
+}
+
 /// Launch optimized batch amplitude encoding kernel
 ///
 /// # Arguments
@@ -350,6 +414,40 @@ int launch_amplitude_encode_batch(
     return (int)cudaGetLastError();
 }
 
+/// Launch optimized batch amplitude encoding kernel for float32 input/output.
+int launch_amplitude_encode_batch_f32(
+    const float* input_batch_d,
+    void* state_batch_d,
+    const float* inv_norms_d,
+    size_t num_samples,
+    size_t input_len,
+    size_t state_len,
+    cudaStream_t stream
+) {
+    if (num_samples == 0 || state_len == 0) {
+        return cudaErrorInvalidValue;
+    }
+
+    cuComplex* state_complex_d = static_cast<cuComplex*>(state_batch_d);
+
+    const int blockSize = DEFAULT_BLOCK_SIZE;
+    const size_t total_work = num_samples * (state_len / 2);
+    const size_t blocks_needed = (total_work + blockSize - 1) / blockSize;
+    const size_t max_blocks = MAX_GRID_BLOCKS;
+    const size_t gridSize = (blocks_needed < max_blocks) ? blocks_needed : 
max_blocks;
+
+    amplitude_encode_batch_kernel_f32<<<gridSize, blockSize, 0, stream>>>(
+        input_batch_d,
+        state_complex_d,
+        inv_norms_d,
+        num_samples,
+        input_len,
+        state_len
+    );
+
+    return (int)cudaGetLastError();
+}
+
 /// Kernel: accumulate L2 norm using coalesced vectorized loads.
 /// Each block atomically adds its partial sum to the output accumulator.
 __global__ void l2_norm_kernel(
diff --git a/qdp/qdp-kernels/src/lib.rs b/qdp/qdp-kernels/src/lib.rs
index 2bbd21609..45d94593f 100644
--- a/qdp/qdp-kernels/src/lib.rs
+++ b/qdp/qdp-kernels/src/lib.rs
@@ -98,6 +98,21 @@ unsafe extern "C" {
         stream: *mut c_void,
     ) -> i32;
 
+    /// Launch batch amplitude encoding kernel (float32 input/output)
+    /// Returns CUDA error code (0 = success)
+    ///
+    /// # Safety
+    /// Requires valid GPU pointers, must sync before freeing
+    pub fn launch_amplitude_encode_batch_f32(
+        input_batch_d: *const f32,
+        state_batch_d: *mut c_void,
+        inv_norms_d: *const f32,
+        num_samples: usize,
+        input_len: usize,
+        state_len: usize,
+        stream: *mut c_void,
+    ) -> i32;
+
     /// Launch L2 norm reduction (returns inverse norm)
     /// Returns CUDA error code (0 = success)
     ///
@@ -298,6 +313,20 @@ pub extern "C" fn launch_amplitude_encode_batch(
     999
 }
 
+#[cfg(any(not(target_os = "linux"), qdp_no_cuda))]
+#[unsafe(no_mangle)]
+pub extern "C" fn launch_amplitude_encode_batch_f32(
+    _input_batch_d: *const f32,
+    _state_batch_d: *mut c_void,
+    _inv_norms_d: *const f32,
+    _num_samples: usize,
+    _input_len: usize,
+    _state_len: usize,
+    _stream: *mut c_void,
+) -> i32 {
+    999
+}
+
 #[cfg(any(not(target_os = "linux"), qdp_no_cuda))]
 #[unsafe(no_mangle)]
 pub extern "C" fn launch_l2_norm(

Reply via email to