This is an automated email from the ASF dual-hosted git repository.
richhuang pushed a commit to branch dev-qdp
in repository https://gitbox.apache.org/repos/asf/mahout.git
The following commit(s) were added to refs/heads/dev-qdp by this push:
new 7a2b8fed4 [QDP] Add launch_amplitude_encode_f32 function (#739)
7a2b8fed4 is described below
commit 7a2b8fed4f6b53649977329b5394fecddb3602a2
Author: Ping <[email protected]>
AuthorDate: Thu Dec 25 19:51:10 2025 +0800
[QDP] Add launch_amplitude_encode_f32 function (#739)
* float32 follow up
Signed-off-by: 400Ping <[email protected]>
* update
Signed-off-by: 400Ping <[email protected]>
---------
Signed-off-by: 400Ping <[email protected]>
---
qdp/qdp-kernels/src/amplitude.cu | 67 ++++++++++++++++++++++++++++---
qdp/qdp-kernels/src/lib.rs | 27 +++++++++++++
qdp/qdp-kernels/tests/amplitude_encode.rs | 59 ++++++++++++++++++++++++++-
3 files changed, 147 insertions(+), 6 deletions(-)
diff --git a/qdp/qdp-kernels/src/amplitude.cu b/qdp/qdp-kernels/src/amplitude.cu
index ea5fc27f7..7cf94ce92 100644
--- a/qdp/qdp-kernels/src/amplitude.cu
+++ b/qdp/qdp-kernels/src/amplitude.cu
@@ -41,18 +41,17 @@ __global__ void amplitude_encode_kernel(
// Vectorized Load Optimization:
// If we are well within bounds, treat input as double2 to issue a single
128-bit load instruction.
- // This reduces memory transactions and improves throughput on RTX cards.
+ // Use __ldg() to pull through the read-only cache; cudaMalloc aligns to
256 bytes so the
+ // reinterpret_cast<double2*> load is naturally aligned.
if (state_idx_base + 1 < input_len) {
// Reinterpret cast to load two doubles at once
- // Note: Assumes input is reasonably aligned (standard cudaMalloc
provides 256-byte alignment)
- const double2* input_vec = reinterpret_cast<const double2*>(input);
- double2 loaded = input_vec[idx];
+ const double2 loaded = __ldg(reinterpret_cast<const double2*>(input) +
idx);
v1 = loaded.x;
v2 = loaded.y;
}
// Handle edge case: Odd input length
else if (state_idx_base < input_len) {
- v1 = input[state_idx_base];
+ v1 = __ldg(input + state_idx_base);
// v2 remains 0.0
}
@@ -66,6 +65,35 @@ __global__ void amplitude_encode_kernel(
}
}
+__global__ void amplitude_encode_kernel_f32(
+ const float* __restrict__ input,
+ cuComplex* __restrict__ state,
+ size_t input_len,
+ size_t state_len,
+ float inv_norm
+) {
+ size_t idx = blockIdx.x * blockDim.x + threadIdx.x;
+ size_t state_idx_base = idx * 2;
+ if (state_idx_base >= state_len) return;
+
+ float v1 = 0.0f;
+ float v2 = 0.0f;
+
+ if (state_idx_base + 1 < input_len) {
+ // Mirror the double kernel: cached vectorized load for two floats
+ const float2 loaded = __ldg(reinterpret_cast<const float2*>(input) +
idx);
+ v1 = loaded.x;
+ v2 = loaded.y;
+ } else if (state_idx_base < input_len) {
+ v1 = __ldg(input + state_idx_base);
+ }
+
+ state[state_idx_base] = make_cuComplex(v1 * inv_norm, 0.0f);
+ if (state_idx_base + 1 < state_len) {
+ state[state_idx_base + 1] = make_cuComplex(v2 * inv_norm, 0.0f);
+ }
+}
+
// Warp-level reduction for sum using shuffle instructions
__device__ __forceinline__ double warp_reduce_sum(double val) {
for (int offset = warpSize / 2; offset > 0; offset >>= 1) {
@@ -137,6 +165,35 @@ int launch_amplitude_encode(
return (int)cudaGetLastError();
}
+/// Launch amplitude encoding kernel for float32
+int launch_amplitude_encode_f32(
+ const float* input_d,
+ void* state_d,
+ size_t input_len,
+ size_t state_len,
+ float inv_norm,
+ cudaStream_t stream
+) {
+ if (inv_norm <= 0.0f || !isfinite(inv_norm)) {
+ return cudaErrorInvalidValue;
+ }
+
+ cuComplex* state_complex_d = static_cast<cuComplex*>(state_d);
+
+ const int blockSize = 256;
+ const int gridSize = (state_len / 2 + blockSize - 1) / blockSize;
+
+ amplitude_encode_kernel_f32<<<gridSize, blockSize, 0, stream>>>(
+ input_d,
+ state_complex_d,
+ input_len,
+ state_len,
+ inv_norm
+ );
+
+ return (int)cudaGetLastError();
+}
+
/// Optimized batch amplitude encoding kernel
///
/// Memory Layout (row-major):
diff --git a/qdp/qdp-kernels/src/lib.rs b/qdp/qdp-kernels/src/lib.rs
index bae8782ef..d9fc0a163 100644
--- a/qdp/qdp-kernels/src/lib.rs
+++ b/qdp/qdp-kernels/src/lib.rs
@@ -69,6 +69,20 @@ unsafe extern "C" {
stream: *mut c_void,
) -> i32;
+ /// Launch amplitude encoding kernel (float32 input/output)
+ /// Returns CUDA error code (0 = success)
+ ///
+ /// # Safety
+ /// Requires valid GPU pointers, must sync before freeing
+ pub fn launch_amplitude_encode_f32(
+ input_d: *const f32,
+ state_d: *mut c_void,
+ input_len: usize,
+ state_len: usize,
+ inv_norm: f32,
+ stream: *mut c_void,
+ ) -> i32;
+
/// Launch batch amplitude encoding kernel
/// Returns CUDA error code (0 = success)
///
@@ -138,6 +152,19 @@ pub extern "C" fn launch_amplitude_encode(
999 // Error: CUDA unavailable
}
+#[cfg(not(target_os = "linux"))]
+#[unsafe(no_mangle)]
+pub extern "C" fn launch_amplitude_encode_f32(
+ _input_d: *const f32,
+ _state_d: *mut c_void,
+ _input_len: usize,
+ _state_len: usize,
+ _inv_norm: f32,
+ _stream: *mut c_void,
+) -> i32 {
+ 999
+}
+
#[cfg(not(target_os = "linux"))]
#[unsafe(no_mangle)]
pub extern "C" fn launch_l2_norm(
diff --git a/qdp/qdp-kernels/tests/amplitude_encode.rs
b/qdp/qdp-kernels/tests/amplitude_encode.rs
index e290d550c..4223dd0bb 100644
--- a/qdp/qdp-kernels/tests/amplitude_encode.rs
+++ b/qdp/qdp-kernels/tests/amplitude_encode.rs
@@ -19,9 +19,17 @@
#[cfg(target_os = "linux")]
use cudarc::driver::{CudaDevice, DevicePtr, DevicePtrMut};
#[cfg(target_os = "linux")]
-use qdp_kernels::{CuDoubleComplex, launch_amplitude_encode, launch_l2_norm,
launch_l2_norm_batch};
+use qdp_kernels::{
+ CuComplex,
+ CuDoubleComplex,
+ launch_amplitude_encode,
+ launch_amplitude_encode_f32,
+ launch_l2_norm,
+ launch_l2_norm_batch,
+};
const EPSILON: f64 = 1e-10;
+const EPSILON_F32: f32 = 1e-5;
#[test]
#[cfg(target_os = "linux")]
@@ -94,6 +102,55 @@ fn test_amplitude_encode_basic() {
println!("PASS: Basic amplitude encoding works correctly");
}
+#[test]
+#[cfg(target_os = "linux")]
+fn test_amplitude_encode_basic_f32() {
+ println!("Testing basic amplitude encoding (float32)...");
+
+ let device = match CudaDevice::new(0) {
+ Ok(d) => d,
+ Err(_) => {
+ println!("SKIP: No CUDA device available");
+ return;
+ }
+ };
+
+ let input: Vec<f32> = vec![3.0, 4.0];
+ let norm = (input[0] * input[0] + input[1] * input[1]).sqrt();
+ let inv_norm = 1.0f32 / norm;
+ let state_len = 4usize;
+
+ let input_d = device.htod_copy(input.clone()).unwrap();
+ let mut state_d = device.alloc_zeros::<CuComplex>(state_len).unwrap();
+
+ let result = unsafe {
+ launch_amplitude_encode_f32(
+ *input_d.device_ptr() as *const f32,
+ *state_d.device_ptr_mut() as *mut std::ffi::c_void,
+ input.len(),
+ state_len,
+ inv_norm,
+ std::ptr::null_mut(),
+ )
+ };
+
+ assert_eq!(result, 0, "Kernel launch should succeed");
+
+ let state_h = device.dtoh_sync_copy(&state_d).unwrap();
+
+ assert!((state_h[0].x - 0.6).abs() < EPSILON_F32, "First element should be
0.6");
+ assert!(state_h[0].y.abs() < EPSILON_F32, "First element imaginary should
be 0");
+ assert!((state_h[1].x - 0.8).abs() < EPSILON_F32, "Second element should
be 0.8");
+ assert!(state_h[1].y.abs() < EPSILON_F32, "Second element imaginary should
be 0");
+ assert!(state_h[2].x.abs() < EPSILON_F32, "Third element should be 0");
+ assert!(state_h[3].x.abs() < EPSILON_F32, "Fourth element should be 0");
+
+ let total_prob: f32 = state_h.iter().map(|c| c.x * c.x + c.y * c.y).sum();
+ assert!((total_prob - 1.0).abs() < EPSILON_F32, "Total probability should
be 1.0");
+
+ println!("PASS: Basic float32 amplitude encoding works correctly");
+}
+
#[test]
#[cfg(target_os = "linux")]
fn test_amplitude_encode_power_of_two() {