This is an automated email from the ASF dual-hosted git repository.
guanmingchiu pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/mahout.git
The following commit(s) were added to refs/heads/main by this push:
new b13c6f36d feat: add CUDA float32 batch amplitude encoding support in Python bindings (#1116)
b13c6f36d is described below
commit b13c6f36dd74abf415ecefc268a246dceb4d7eda
Author: Vic Wen <[email protected]>
AuthorDate: Thu Mar 5 16:04:25 2026 +0800
feat: add CUDA float32 batch amplitude encoding support in Python bindings (#1116)
* Add CUDA float32 batch amplitude encoding in Python bindings
* Add Python tests for CUDA float32 batch amplitude encoding
* fix: handle misaligned float32 batch amplitude loads
* refactor: streamline tensor size extraction
---------
Co-authored-by: Ryan Huang <[email protected]>
Co-authored-by: Guan-Ming (Wesley) Chiu <[email protected]>
---
qdp/qdp-kernels/src/amplitude.cu | 15 ++++--
qdp/qdp-python/src/engine.rs | 39 +++++++++++---
qdp/qdp-python/tests/test_dlpack_validation.py | 71 +++++++++++++++++++++++---
3 files changed, 107 insertions(+), 18 deletions(-)
diff --git a/qdp/qdp-kernels/src/amplitude.cu b/qdp/qdp-kernels/src/amplitude.cu
index e9091829d..57fa4320c 100644
--- a/qdp/qdp-kernels/src/amplitude.cu
+++ b/qdp/qdp-kernels/src/amplitude.cu
@@ -339,13 +339,20 @@ __global__ void amplitude_encode_batch_kernel_f32(
const float inv_norm = inv_norms[sample_idx];
float v1, v2;
- if (elem_offset + 1 < input_len) {
- const float2 vec_data = __ldg(reinterpret_cast<const
float2*>(input_batch + input_base) + elem_pair);
+ const float* sample_input = input_batch + input_base;
+ const bool sample_input_aligned =
+ (reinterpret_cast<uintptr_t>(sample_input) & (alignof(float2) -
1)) == 0;
+
+ if (sample_input_aligned && elem_offset + 1 < input_len) {
+ const float2 vec_data =
+ __ldg(reinterpret_cast<const float2*>(sample_input) +
elem_pair);
v1 = vec_data.x;
v2 = vec_data.y;
} else if (elem_offset < input_len) {
- v1 = __ldg(input_batch + input_base + elem_offset);
- v2 = 0.0f;
+ v1 = __ldg(sample_input + elem_offset);
+ v2 = (elem_offset + 1 < input_len)
+ ? __ldg(sample_input + elem_offset + 1)
+ : 0.0f;
} else {
v1 = v2 = 0.0f;
}
diff --git a/qdp/qdp-python/src/engine.rs b/qdp/qdp-python/src/engine.rs
index 2a768bdb2..b2eecc719 100644
--- a/qdp/qdp-python/src/engine.rs
+++ b/qdp/qdp-python/src/engine.rs
@@ -463,8 +463,8 @@ impl QdpEngine {
/// Encode directly from a PyTorch CUDA tensor. Internal helper.
///
- /// Dispatches to the core f32 GPU pointer API for 1D float32 amplitude
encoding,
- /// or to the float64/basis GPU pointer APIs for other dtypes and batch
encoding.
+ /// Dispatches to the core f32 GPU pointer API for float32 amplitude
encoding,
+ /// or to the float64/basis GPU pointer APIs for other dtypes and methods.
fn _encode_from_cuda_tensor(
&self,
data: &Bound<'_, PyAny>,
@@ -479,6 +479,7 @@ impl QdpEngine {
let is_f32 = dtype_str_lower.contains("float32");
let method = encoding_method.to_ascii_lowercase();
let ndim: usize = data.call_method0("dim")?.extract()?;
+ let tensor_info = extract_cuda_tensor_info(data)?;
if method.as_str() == "amplitude" && is_f32 {
match ndim {
@@ -506,10 +507,35 @@ impl QdpEngine {
consumed: false,
})
}
- 2 => Err(PyRuntimeError::new_err(
- "CUDA float32 batch amplitude encoding is not yet
supported. \
- Use float64 (tensor.to(torch.float64)) or encode samples
individually.",
- )),
+ 2 => {
+ let num_samples = tensor_info.shape[0] as usize;
+ let sample_size = tensor_info.shape[1] as usize;
+ let stream_ptr = get_torch_cuda_stream_ptr(data)?;
+ let data_ptr_u64: u64 =
data.call_method0("data_ptr")?.extract()?;
+ let data_ptr = data_ptr_u64 as *const f32;
+
+ let ptr = unsafe {
+ self.engine
+ .encode_batch_from_gpu_ptr_f32_with_stream(
+ data_ptr,
+ num_samples,
+ sample_size,
+ num_qubits,
+ stream_ptr,
+ )
+ .map_err(|e| {
+ PyRuntimeError::new_err(format!(
+ "Encoding failed (float32 amplitude
batch): {}",
+ e
+ ))
+ })?
+ };
+
+ Ok(QuantumTensor {
+ ptr,
+ consumed: false,
+ })
+ }
_ => Err(PyRuntimeError::new_err(format!(
"Unsupported CUDA tensor shape: {}D. Expected 1D tensor
for single \
sample encoding or 2D tensor (batch_size, features) for
batch encoding.",
@@ -517,7 +543,6 @@ impl QdpEngine {
))),
}
} else {
- let tensor_info = extract_cuda_tensor_info(data)?;
let stream_ptr = get_torch_cuda_stream_ptr(data)?;
match ndim {
diff --git a/qdp/qdp-python/tests/test_dlpack_validation.py
b/qdp/qdp-python/tests/test_dlpack_validation.py
index b2e605018..a612efbca 100644
--- a/qdp/qdp-python/tests/test_dlpack_validation.py
+++ b/qdp/qdp-python/tests/test_dlpack_validation.py
@@ -48,14 +48,71 @@ def test_cuda_float32_amplitude_supported():
@pytest.mark.skipif(not _cuda_available(), reason="CUDA not available")
-def test_cuda_float32_amplitude_2d_unsupported():
- """2D float32 CUDA tensor with amplitude encoding should raise a clear
error."""
+def test_cuda_float32_amplitude_2d_supported():
+ """2D float32 CUDA tensor should use the batch GPU-pointer float32
amplitude path."""
engine = _engine()
- t = torch.randn(2, 4, dtype=torch.float32, device="cuda")
- with pytest.raises(
- RuntimeError, match="float32 batch amplitude encoding is not yet
supported"
- ):
- engine.encode(t, num_qubits=2, encoding_method="amplitude")
+ t = torch.tensor(
+ [[3.0, 4.0, 0.0, 0.0], [1.0, 2.0, 2.0, 1.0]],
+ dtype=torch.float32,
+ device="cuda",
+ )
+
+ result = engine.encode(t, num_qubits=2, encoding_method="amplitude")
+ assert result is not None
+
+ qt = torch.from_dlpack(result)
+ assert qt.is_cuda
+ assert qt.shape == (2, 4)
+ assert qt.dtype == torch.complex64
+
+ expected = torch.tensor(
+ [
+ [0.6, 0.8, 0.0, 0.0],
+ [
+ 1.0 / (10.0**0.5),
+ 2.0 / (10.0**0.5),
+ 2.0 / (10.0**0.5),
+ 1.0 / (10.0**0.5),
+ ],
+ ],
+ dtype=torch.complex64,
+ device="cuda",
+ )
+ assert torch.allclose(qt, expected)
+
+
[email protected](not _cuda_available(), reason="CUDA not available")
+def test_cuda_float32_amplitude_2d_respects_engine_precision():
+ """2D float32 CUDA amplitude batch should still honor float64 engine
output precision."""
+ engine = QdpEngine(0, precision="float64")
+ t = torch.tensor(
+ [[3.0, 4.0, 0.0, 0.0], [1.0, 2.0, 2.0, 1.0]],
+ dtype=torch.float32,
+ device="cuda",
+ )
+
+ result = engine.encode(t, num_qubits=2, encoding_method="amplitude")
+ assert result is not None
+
+ qt = torch.from_dlpack(result)
+ assert qt.is_cuda
+ assert qt.shape == (2, 4)
+ assert qt.dtype == torch.complex128
+
+ expected = torch.tensor(
+ [
+ [0.6, 0.8, 0.0, 0.0],
+ [
+ 1.0 / (10.0**0.5),
+ 2.0 / (10.0**0.5),
+ 2.0 / (10.0**0.5),
+ 1.0 / (10.0**0.5),
+ ],
+ ],
+ dtype=torch.complex128,
+ device="cuda",
+ )
+ assert torch.allclose(qt, expected)
@pytest.mark.skipif(not _cuda_available(), reason="CUDA not available")