This is an automated email from the ASF dual-hosted git repository.
ryankert01 pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/mahout.git
The following commit(s) were added to refs/heads/main by this push:
new 51104cbce feat(qdp): hoist encode_from_gpu_ptr_f32 onto QuantumEncoder trait (#1310)
51104cbce is described below
commit 51104cbcee2d7ae22e8d287adc1deda3dc41c263
Author: KUAN-HAO HUANG <[email protected]>
AuthorDate: Mon Jun 1 16:18:29 2026 +0800
feat(qdp): hoist encode_from_gpu_ptr_f32 onto QuantumEncoder trait (#1310)
---
qdp/qdp-core/src/gpu/encodings/amplitude.rs | 103 ++++++++++++++++++++
qdp/qdp-core/src/gpu/encodings/angle.rs | 22 +++++
qdp/qdp-core/src/gpu/encodings/basis.rs | 23 +++++
qdp/qdp-core/src/gpu/encodings/mod.rs | 28 ++++++
qdp/qdp-core/src/lib.rs | 74 +++-----------
qdp/qdp-core/tests/gpu_ptr_encoding.rs | 143 ++++++++++++++++++++++++++++
6 files changed, 331 insertions(+), 62 deletions(-)
diff --git a/qdp/qdp-core/src/gpu/encodings/amplitude.rs b/qdp/qdp-core/src/gpu/encodings/amplitude.rs
index 67d9f06ac..7cf70d9ec 100644
--- a/qdp/qdp-core/src/gpu/encodings/amplitude.rs
+++ b/qdp/qdp-core/src/gpu/encodings/amplitude.rs
@@ -689,6 +689,27 @@ impl QuantumEncoder for AmplitudeEncoder {
Ok(batch_state_vector)
}
+ #[cfg(target_os = "linux")]
+ unsafe fn encode_from_gpu_ptr_f32(
+ &self,
+ device: &Arc<CudaDevice>,
+ input_d: *const c_void,
+ input_len: usize,
+ num_qubits: usize,
+ stream: *mut c_void,
+ ) -> Result<GpuStateVector> {
+ // Delegate to the workhorse `_with_stream` fn (see angle.rs for rationale).
+ unsafe {
+ Self::encode_from_gpu_ptr_f32_with_stream(
+ device,
+ input_d as *const f32,
+ input_len,
+ num_qubits,
+ stream,
+ )
+ }
+ }
+
fn name(&self) -> &'static str {
"amplitude"
}
@@ -811,6 +832,88 @@ impl AmplitudeEncoder {
}
impl AmplitudeEncoder {
+ /// Encode a single sample directly from a GPU float32 pointer, returning a
+ /// `GpuStateVector` (the engine wraps it as DLPack at the public boundary).
+ ///
+ /// Symmetric with `AngleEncoder::encode_from_gpu_ptr_f32_with_stream` and
+ /// `BasisEncoder::encode_from_gpu_ptr_f32_with_stream`. The previous arrangement
+ /// (`QdpEngine::encode_from_gpu_ptr_f32_with_stream` did this inline in `lib.rs`)
+ /// made the trait surface asymmetric — only the batch variant had a real
+ /// `QuantumEncoder` override on amplitude.
+ ///
+ /// # Safety
+ /// Caller must ensure `input_d` points to at least `input_len` `f32` values in
+ /// GPU-accessible memory on the same device as `device`, and `stream` is either
+ /// null or a valid CUDA stream associated with `device`.
+ #[cfg(target_os = "linux")]
+ pub unsafe fn encode_from_gpu_ptr_f32_with_stream(
+ device: &Arc<CudaDevice>,
+ input_d: *const f32,
+ input_len: usize,
+ num_qubits: usize,
+ stream: *mut c_void,
+ ) -> Result<GpuStateVector> {
+ if input_len == 0 {
+ return Err(MahoutError::InvalidInput(
+ "Input data cannot be empty".into(),
+ ));
+ }
+
+ let state_len = 1usize << num_qubits;
+ if input_len > state_len {
+ return Err(MahoutError::InvalidInput(format!(
+ "Input size {} exceeds state vector size {} (2^{} qubits)",
+ input_len, state_len, num_qubits
+ )));
+ }
+
+ let state_vector = {
+ crate::profile_scope!("GPU::Alloc");
+ GpuStateVector::new(device, num_qubits, Precision::Float32)?
+ };
+
+ let inv_norm = {
+ crate::profile_scope!("GPU::NormFromPtr");
+ unsafe {
+ Self::calculate_inv_norm_gpu_f32_with_stream(device, input_d,
input_len, stream)?
+ }
+ };
+
+ let state_ptr = state_vector.ptr_f32().ok_or_else(|| {
+ MahoutError::InvalidInput(
+ "State vector precision mismatch (expected float32
buffer)".to_string(),
+ )
+ })?;
+
+ {
+ crate::profile_scope!("GPU::KernelLaunch");
+ let ret = unsafe {
+ qdp_kernels::launch_amplitude_encode_f32(
+ input_d,
+ state_ptr as *mut c_void,
+ input_len,
+ state_len,
+ inv_norm,
+ stream,
+ )
+ };
+ if ret != 0 {
+ return Err(MahoutError::KernelLaunch(format!(
+ "Amplitude encode (f32) kernel failed with CUDA error
code: {} ({})",
+ ret,
+ cuda_error_to_string(ret)
+ )));
+ }
+ }
+
+ {
+ crate::profile_scope!("GPU::Synchronize");
+ sync_cuda_stream(stream, "CUDA stream synchronize failed")?;
+ }
+
+ Ok(state_vector)
+ }
+
/// Encode a batch directly from a GPU float32 pointer.
///
/// # Safety
diff --git a/qdp/qdp-core/src/gpu/encodings/angle.rs b/qdp/qdp-core/src/gpu/encodings/angle.rs
index 99a9d2998..36e784e03 100644
--- a/qdp/qdp-core/src/gpu/encodings/angle.rs
+++ b/qdp/qdp-core/src/gpu/encodings/angle.rs
@@ -595,6 +595,28 @@ impl QuantumEncoder for AngleEncoder {
Ok(())
}
+ #[cfg(target_os = "linux")]
+ unsafe fn encode_from_gpu_ptr_f32(
+ &self,
+ device: &Arc<CudaDevice>,
+ input_d: *const c_void,
+ input_len: usize,
+ num_qubits: usize,
+ stream: *mut c_void,
+ ) -> Result<GpuStateVector> {
+ // Delegate to the workhorse `_with_stream` fn (kept as the inherent impl so
+ // it can be called without a vtable on hot paths like `engine.rs`).
+ unsafe {
+ Self::encode_from_gpu_ptr_f32_with_stream(
+ device,
+ input_d as *const f32,
+ input_len,
+ num_qubits,
+ stream,
+ )
+ }
+ }
+
fn name(&self) -> &'static str {
"angle"
}
diff --git a/qdp/qdp-core/src/gpu/encodings/basis.rs b/qdp/qdp-core/src/gpu/encodings/basis.rs
index 1db78cdc1..4f2cec15e 100644
--- a/qdp/qdp-core/src/gpu/encodings/basis.rs
+++ b/qdp/qdp-core/src/gpu/encodings/basis.rs
@@ -541,6 +541,29 @@ impl QuantumEncoder for BasisEncoder {
Ok(batch_state_vector)
}
+ #[cfg(target_os = "linux")]
+ unsafe fn encode_from_gpu_ptr_f32(
+ &self,
+ device: &Arc<CudaDevice>,
+ input_d: *const c_void,
+ input_len: usize,
+ num_qubits: usize,
+ stream: *mut c_void,
+ ) -> Result<GpuStateVector> {
+ // Delegate to the workhorse `_with_stream` fn (see angle.rs for rationale).
+ // `input_len` is unused — basis is always one index per sample — but kept on the
+ // signature to match the trait shape used by amplitude / angle.
+ let _ = input_len;
+ unsafe {
+ Self::encode_from_gpu_ptr_f32_with_stream(
+ device,
+ input_d as *const f32,
+ num_qubits,
+ stream,
+ )
+ }
+ }
+
fn name(&self) -> &'static str {
"basis"
}
diff --git a/qdp/qdp-core/src/gpu/encodings/mod.rs b/qdp/qdp-core/src/gpu/encodings/mod.rs
index 3f256e68a..8d0fd5b4c 100644
--- a/qdp/qdp-core/src/gpu/encodings/mod.rs
+++ b/qdp/qdp-core/src/gpu/encodings/mod.rs
@@ -135,6 +135,34 @@ pub trait QuantumEncoder: Send + Sync + 'static {
)))
}
+ /// Encode a single sample from an existing GPU pointer (zero-copy) using an f32 input.
+ /// Default: not supported.
+ ///
+ /// This is the f32 counterpart of [`encode_from_gpu_ptr`](Self::encode_from_gpu_ptr). The
+ /// sibling batch variant is [`encode_batch_from_gpu_ptr_f32`](Self::encode_batch_from_gpu_ptr_f32).
+ /// Adding a new encoder with f32 zero-copy support should override **both** this method and
+ /// the batch variant; the previous arrangement (single-sample as a standalone `pub unsafe
+ /// fn` on each encoder type, batch on the trait) made the pattern accidentally divergent.
+ ///
+ /// # Safety
+ /// Caller must ensure `input_d` points to valid GPU memory with at least `input_len`
+ /// `f32` elements on the same device as `device`, and `stream` is either null or a valid
+ /// CUDA stream associated with `device`.
+ #[cfg(target_os = "linux")]
+ unsafe fn encode_from_gpu_ptr_f32(
+ &self,
+ _device: &Arc<CudaDevice>,
+ _input_d: *const c_void,
+ _input_len: usize,
+ _num_qubits: usize,
+ _stream: *mut c_void,
+ ) -> Result<GpuStateVector> {
+ Err(MahoutError::NotImplemented(format!(
+ "encode_from_gpu_ptr_f32 not supported for {}",
+ self.name()
+ )))
+ }
+
/// Encode multiple samples in a single GPU allocation and kernel launch using f32 inputs.
fn encode_batch_f32(
&self,
diff --git a/qdp/qdp-core/src/lib.rs b/qdp/qdp-core/src/lib.rs
index 5af3dd199..72d00898e 100644
--- a/qdp/qdp-core/src/lib.rs
+++ b/qdp/qdp-core/src/lib.rs
@@ -610,72 +610,22 @@ impl QdpEngine {
) -> Result<*mut DLManagedTensor> {
crate::profile_scope!("Mahout::EncodeFromGpuPtrF32");
- if input_len == 0 {
- return Err(MahoutError::InvalidInput(
- "Input data cannot be empty".into(),
- ));
- }
-
validate_cuda_input_ptr(&self.device, input_d as *const c_void)?;
- let state_len = 1usize << num_qubits;
- if input_len > state_len {
- return Err(MahoutError::InvalidInput(format!(
- "Input size {} exceeds state vector size {} (2^{} qubits)",
- input_len, state_len, num_qubits
- )));
- }
-
- let state_vector = {
- crate::profile_scope!("GPU::Alloc");
- gpu::GpuStateVector::new(&self.device, num_qubits,
Precision::Float32)?
- };
-
- let inv_norm = {
- crate::profile_scope!("GPU::NormFromPtr");
- unsafe {
- gpu::AmplitudeEncoder::calculate_inv_norm_gpu_f32_with_stream(
- &self.device,
- input_d,
- input_len,
- stream,
- )?
- }
+ // Delegate to `AmplitudeEncoder::encode_from_gpu_ptr_f32_with_stream` — the
+ // encoder-side workhorse. Keeping the kernel-launch + L2-norm sequence inside
+ // the encoder makes the trait surface symmetric (`QuantumEncoder::encode_from_gpu_ptr_f32`
+ // can override against it) and matches the angle / basis layout.
+ let state_vector = unsafe {
+ gpu::AmplitudeEncoder::encode_from_gpu_ptr_f32_with_stream(
+ &self.device,
+ input_d,
+ input_len,
+ num_qubits,
+ stream,
+ )?
};
- let state_ptr = state_vector.ptr_f32().ok_or_else(|| {
- MahoutError::InvalidInput(
- "State vector precision mismatch (expected float32
buffer)".to_string(),
- )
- })?;
-
- {
- crate::profile_scope!("GPU::KernelLaunch");
- let ret = unsafe {
- qdp_kernels::launch_amplitude_encode_f32(
- input_d,
- state_ptr as *mut std::ffi::c_void,
- input_len,
- state_len,
- inv_norm,
- stream,
- )
- };
-
- if ret != 0 {
- return Err(MahoutError::KernelLaunch(format!(
- "Amplitude encode (f32) kernel failed with CUDA error
code: {} ({})",
- ret,
- cuda_error_to_string(ret)
- )));
- }
- }
-
- {
- crate::profile_scope!("GPU::Synchronize");
- gpu::cuda_sync::sync_cuda_stream(stream, "CUDA stream synchronize
failed")?;
- }
-
let state_vector = state_vector.to_precision(&self.device,
self.precision)?;
Ok(state_vector.to_dlpack())
}
diff --git a/qdp/qdp-core/tests/gpu_ptr_encoding.rs b/qdp/qdp-core/tests/gpu_ptr_encoding.rs
index 68ee26bed..48ba65f84 100644
--- a/qdp/qdp-core/tests/gpu_ptr_encoding.rs
+++ b/qdp/qdp-core/tests/gpu_ptr_encoding.rs
@@ -1751,3 +1751,146 @@ fn test_encode_basis_from_gpu_ptr_f32_single_sample_success() {
};
unsafe { common::assert_dlpack_shape_2d_and_delete(dlpack_ptr, 1, 8) };
}
+
+// ---- Trait-method tests for `QuantumEncoder::encode_from_gpu_ptr_f32` (PR 1.5) ----
+//
+// The single-sample f32 method moved onto the `QuantumEncoder` trait in PR 1.5 so
+// future encoders only need a single override point instead of a standalone inherent
+// fn that the dispatcher must remember to call. These tests dispatch through
+// `Encoding::encoder()` to exercise the trait method, not the inherent helper.
+
+#[test]
+fn test_trait_encode_from_gpu_ptr_f32_amplitude() {
+ let Some(_engine) = engine_f32() else {
+ println!("SKIP: No GPU");
+ return;
+ };
+ let num_qubits = 3usize;
+ let state_len = 1usize << num_qubits;
+ let data = common::create_test_data_f32(state_len);
+ let Some((device, data_d)) = common::copy_f32_to_device(data.as_slice())
else {
+ println!("SKIP: No CUDA device");
+ return;
+ };
+ let encoder = qdp_core::Encoding::Amplitude.encoder();
+ let state_vector = unsafe {
+ encoder
+ .encode_from_gpu_ptr_f32(
+ &device,
+ *data_d.device_ptr() as *const std::ffi::c_void,
+ state_len,
+ num_qubits,
+ std::ptr::null_mut(),
+ )
+ .expect("trait method should succeed for amplitude")
+ };
+ // Use the engine's own precision conversion so we get a valid dlpack to free.
+ let state_vector = state_vector
+ .to_precision(&device, qdp_core::Precision::Float32)
+ .expect("to_precision");
+ unsafe {
+ let dlpack = state_vector.to_dlpack();
+ common::take_deleter_and_delete(dlpack);
+ }
+}
+
+#[test]
+fn test_trait_encode_from_gpu_ptr_f32_angle() {
+ let Some(_engine) = engine_f32() else {
+ println!("SKIP: No GPU");
+ return;
+ };
+ let num_qubits = 3usize;
+ let data = common::create_test_data_f32(num_qubits);
+ let Some((device, data_d)) = common::copy_f32_to_device(data.as_slice())
else {
+ println!("SKIP: No CUDA device");
+ return;
+ };
+ let encoder = qdp_core::Encoding::Angle.encoder();
+ let state_vector = unsafe {
+ encoder
+ .encode_from_gpu_ptr_f32(
+ &device,
+ *data_d.device_ptr() as *const std::ffi::c_void,
+ num_qubits,
+ num_qubits,
+ std::ptr::null_mut(),
+ )
+ .expect("trait method should succeed for angle")
+ };
+ let state_vector = state_vector
+ .to_precision(&device, qdp_core::Precision::Float32)
+ .expect("to_precision");
+ unsafe {
+ let dlpack = state_vector.to_dlpack();
+ common::take_deleter_and_delete(dlpack);
+ }
+}
+
+#[test]
+fn test_trait_encode_from_gpu_ptr_f32_basis() {
+ let Some(_engine) = engine_f32() else {
+ println!("SKIP: No GPU");
+ return;
+ };
+ let num_qubits = 3usize;
+ let Some((device, data_d)) = common::copy_f32_to_device(&[5.0_f32]) else {
+ println!("SKIP: No CUDA device");
+ return;
+ };
+ let encoder = qdp_core::Encoding::Basis.encoder();
+ let state_vector = unsafe {
+ encoder
+ .encode_from_gpu_ptr_f32(
+ &device,
+ *data_d.device_ptr() as *const std::ffi::c_void,
+ 1,
+ num_qubits,
+ std::ptr::null_mut(),
+ )
+ .expect("trait method should succeed for basis")
+ };
+ let state_vector = state_vector
+ .to_precision(&device, qdp_core::Precision::Float32)
+ .expect("to_precision");
+ unsafe {
+ let dlpack = state_vector.to_dlpack();
+ common::take_deleter_and_delete(dlpack);
+ }
+}
+
+#[test]
+fn test_trait_encode_from_gpu_ptr_f32_default_not_implemented_for_phase() {
+ // The default body returns NotImplemented for encoders that don't override.
+ // Phase / IQP / IQP-Z don't currently have an f32 zero-copy path, so the
+ // trait method must fall through to the default rather than mis-dispatch.
+ let Some(_engine) = engine_f32() else {
+ println!("SKIP: No GPU");
+ return;
+ };
+ let num_qubits = 3usize;
+ let Some((device, data_d)) = common::copy_f32_to_device(&[0.1_f32, 0.2,
0.3]) else {
+ println!("SKIP: No CUDA device");
+ return;
+ };
+ let encoder = qdp_core::Encoding::Phase.encoder();
+ let result = unsafe {
+ encoder.encode_from_gpu_ptr_f32(
+ &device,
+ *data_d.device_ptr() as *const std::ffi::c_void,
+ num_qubits,
+ num_qubits,
+ std::ptr::null_mut(),
+ )
+ };
+ match result {
+ Err(qdp_core::MahoutError::NotImplemented(msg)) => {
+ assert!(
+ msg.contains("encode_from_gpu_ptr_f32") &&
msg.contains("phase"),
+ "unexpected NotImplemented message: {msg}"
+ );
+ }
+ Ok(_) => panic!("phase should not support encode_from_gpu_ptr_f32"),
+ Err(e) => panic!("expected NotImplemented, got {:?}", e),
+ }
+}