This is an automated email from the ASF dual-hosted git repository.

guanmingchiu pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/mahout.git


The following commit(s) were added to refs/heads/main by this push:
     new b13c6f36d feat: add CUDA float32 batch amplitude encoding support in Python bindings (#1116)
b13c6f36d is described below

commit b13c6f36dd74abf415ecefc268a246dceb4d7eda
Author: Vic Wen <[email protected]>
AuthorDate: Thu Mar 5 16:04:25 2026 +0800

    feat: add CUDA float32 batch amplitude encoding support in Python bindings (#1116)
    
    * Add CUDA float32 batch amplitude encoding in Python bindings
    
    * Add Python tests for CUDA float32 batch amplitude encoding
    
    * fix: handle misaligned float32 batch amplitude loads
    
    * refactor: streamline tensor size extraction
    
    ---------
    
    Co-authored-by: Ryan Huang <[email protected]>
    Co-authored-by: Guan-Ming (Wesley) Chiu <[email protected]>
---
 qdp/qdp-kernels/src/amplitude.cu               | 15 ++++--
 qdp/qdp-python/src/engine.rs                   | 39 +++++++++++---
 qdp/qdp-python/tests/test_dlpack_validation.py | 71 +++++++++++++++++++++++---
 3 files changed, 107 insertions(+), 18 deletions(-)

diff --git a/qdp/qdp-kernels/src/amplitude.cu b/qdp/qdp-kernels/src/amplitude.cu
index e9091829d..57fa4320c 100644
--- a/qdp/qdp-kernels/src/amplitude.cu
+++ b/qdp/qdp-kernels/src/amplitude.cu
@@ -339,13 +339,20 @@ __global__ void amplitude_encode_batch_kernel_f32(
         const float inv_norm = inv_norms[sample_idx];
 
         float v1, v2;
-        if (elem_offset + 1 < input_len) {
-            const float2 vec_data = __ldg(reinterpret_cast<const float2*>(input_batch + input_base) + elem_pair);
+        const float* sample_input = input_batch + input_base;
+        const bool sample_input_aligned =
+            (reinterpret_cast<uintptr_t>(sample_input) & (alignof(float2) - 1)) == 0;
+
+        if (sample_input_aligned && elem_offset + 1 < input_len) {
+            const float2 vec_data =
+                __ldg(reinterpret_cast<const float2*>(sample_input) + elem_pair);
             v1 = vec_data.x;
             v2 = vec_data.y;
         } else if (elem_offset < input_len) {
-            v1 = __ldg(input_batch + input_base + elem_offset);
-            v2 = 0.0f;
+            v1 = __ldg(sample_input + elem_offset);
+            v2 = (elem_offset + 1 < input_len)
+                ? __ldg(sample_input + elem_offset + 1)
+                : 0.0f;
         } else {
             v1 = v2 = 0.0f;
         }
diff --git a/qdp/qdp-python/src/engine.rs b/qdp/qdp-python/src/engine.rs
index 2a768bdb2..b2eecc719 100644
--- a/qdp/qdp-python/src/engine.rs
+++ b/qdp/qdp-python/src/engine.rs
@@ -463,8 +463,8 @@ impl QdpEngine {
 
     /// Encode directly from a PyTorch CUDA tensor. Internal helper.
     ///
-    /// Dispatches to the core f32 GPU pointer API for 1D float32 amplitude encoding,
-    /// or to the float64/basis GPU pointer APIs for other dtypes and batch encoding.
+    /// Dispatches to the core f32 GPU pointer API for float32 amplitude encoding,
+    /// or to the float64/basis GPU pointer APIs for other dtypes and methods.
     fn _encode_from_cuda_tensor(
         &self,
         data: &Bound<'_, PyAny>,
@@ -479,6 +479,7 @@ impl QdpEngine {
         let is_f32 = dtype_str_lower.contains("float32");
         let method = encoding_method.to_ascii_lowercase();
         let ndim: usize = data.call_method0("dim")?.extract()?;
+        let tensor_info = extract_cuda_tensor_info(data)?;
 
         if method.as_str() == "amplitude" && is_f32 {
             match ndim {
@@ -506,10 +507,35 @@ impl QdpEngine {
                         consumed: false,
                     })
                 }
-                2 => Err(PyRuntimeError::new_err(
-                    "CUDA float32 batch amplitude encoding is not yet supported. \
-                     Use float64 (tensor.to(torch.float64)) or encode samples individually.",
-                )),
+                2 => {
+                    let num_samples = tensor_info.shape[0] as usize;
+                    let sample_size = tensor_info.shape[1] as usize;
+                    let stream_ptr = get_torch_cuda_stream_ptr(data)?;
+                    let data_ptr_u64: u64 = data.call_method0("data_ptr")?.extract()?;
+                    let data_ptr = data_ptr_u64 as *const f32;
+
+                    let ptr = unsafe {
+                        self.engine
+                            .encode_batch_from_gpu_ptr_f32_with_stream(
+                                data_ptr,
+                                num_samples,
+                                sample_size,
+                                num_qubits,
+                                stream_ptr,
+                            )
+                            .map_err(|e| {
+                                PyRuntimeError::new_err(format!(
+                                    "Encoding failed (float32 amplitude batch): {}",
+                                    e
+                                ))
+                            })?
+                    };
+
+                    Ok(QuantumTensor {
+                        ptr,
+                        consumed: false,
+                    })
+                }
                 _ => Err(PyRuntimeError::new_err(format!(
                     "Unsupported CUDA tensor shape: {}D. Expected 1D tensor for single \
                      sample encoding or 2D tensor (batch_size, features) for batch encoding.",
@@ -517,7 +543,6 @@ impl QdpEngine {
                 ))),
             }
         } else {
-            let tensor_info = extract_cuda_tensor_info(data)?;
             let stream_ptr = get_torch_cuda_stream_ptr(data)?;
 
             match ndim {
diff --git a/qdp/qdp-python/tests/test_dlpack_validation.py b/qdp/qdp-python/tests/test_dlpack_validation.py
index b2e605018..a612efbca 100644
--- a/qdp/qdp-python/tests/test_dlpack_validation.py
+++ b/qdp/qdp-python/tests/test_dlpack_validation.py
@@ -48,14 +48,71 @@ def test_cuda_float32_amplitude_supported():
 
 
 @pytest.mark.skipif(not _cuda_available(), reason="CUDA not available")
-def test_cuda_float32_amplitude_2d_unsupported():
-    """2D float32 CUDA tensor with amplitude encoding should raise a clear error."""
+def test_cuda_float32_amplitude_2d_supported():
+    """2D float32 CUDA tensor should use the batch GPU-pointer float32 amplitude path."""
     engine = _engine()
-    t = torch.randn(2, 4, dtype=torch.float32, device="cuda")
-    with pytest.raises(
-        RuntimeError, match="float32 batch amplitude encoding is not yet supported"
-    ):
-        engine.encode(t, num_qubits=2, encoding_method="amplitude")
+    t = torch.tensor(
+        [[3.0, 4.0, 0.0, 0.0], [1.0, 2.0, 2.0, 1.0]],
+        dtype=torch.float32,
+        device="cuda",
+    )
+
+    result = engine.encode(t, num_qubits=2, encoding_method="amplitude")
+    assert result is not None
+
+    qt = torch.from_dlpack(result)
+    assert qt.is_cuda
+    assert qt.shape == (2, 4)
+    assert qt.dtype == torch.complex64
+
+    expected = torch.tensor(
+        [
+            [0.6, 0.8, 0.0, 0.0],
+            [
+                1.0 / (10.0**0.5),
+                2.0 / (10.0**0.5),
+                2.0 / (10.0**0.5),
+                1.0 / (10.0**0.5),
+            ],
+        ],
+        dtype=torch.complex64,
+        device="cuda",
+    )
+    assert torch.allclose(qt, expected)
+
+
[email protected](not _cuda_available(), reason="CUDA not available")
+def test_cuda_float32_amplitude_2d_respects_engine_precision():
+    """2D float32 CUDA amplitude batch should still honor float64 engine output precision."""
+    engine = QdpEngine(0, precision="float64")
+    t = torch.tensor(
+        [[3.0, 4.0, 0.0, 0.0], [1.0, 2.0, 2.0, 1.0]],
+        dtype=torch.float32,
+        device="cuda",
+    )
+
+    result = engine.encode(t, num_qubits=2, encoding_method="amplitude")
+    assert result is not None
+
+    qt = torch.from_dlpack(result)
+    assert qt.is_cuda
+    assert qt.shape == (2, 4)
+    assert qt.dtype == torch.complex128
+
+    expected = torch.tensor(
+        [
+            [0.6, 0.8, 0.0, 0.0],
+            [
+                1.0 / (10.0**0.5),
+                2.0 / (10.0**0.5),
+                2.0 / (10.0**0.5),
+                1.0 / (10.0**0.5),
+            ],
+        ],
+        dtype=torch.complex128,
+        device="cuda",
+    )
+    assert torch.allclose(qt, expected)
 
 
 @pytest.mark.skipif(not _cuda_available(), reason="CUDA not available")

Reply via email to