This is an automated email from the ASF dual-hosted git repository.
guanmingchiu pushed a commit to branch dev-qdp
in repository https://gitbox.apache.org/repos/asf/mahout.git
The following commit(s) were added to refs/heads/dev-qdp by this push:
new ad691173b [QDP] add vanilla gpu kernel (#677)
ad691173b is described below
commit ad691173b01e67de691c844080609a724d32c976
Author: Ryan Huang <[email protected]>
AuthorDate: Fri Dec 5 19:33:00 2025 -0800
[QDP] add vanilla gpu kernel (#677)
* add vanilla gpu kernel
Signed-off-by: ryankert01 <[email protected]>
* Optimize amplitude encoding kernel with safety checks
Refactored kernel to use multiplication for normalization and optimized
memory access with double2. Added safety checks for input parameters.
* Remove comments on norm validation and optimization
Removed comments regarding norm checks and optimization.
* add docker dev container and test
Signed-off-by: ryankert01 <[email protected]>
* add docker dev container
Signed-off-by: ryankert01 <[email protected]>
* delete unrelated strings
Signed-off-by: ryankert01 <[email protected]>
* delete unrelated strings
Signed-off-by: ryankert01 <[email protected]>
---------
Signed-off-by: ryankert01 <[email protected]>
---
.devcontainer/devcontainer.json | 29 ++
.devcontainer/setup.sh | 32 +++
qdp/qdp-kernels/src/amplitude.cu | 108 ++++---
qdp/qdp-kernels/tests/amplitude_encode.rs | 460 ++++++++++++++++++++++++++++++
4 files changed, 589 insertions(+), 40 deletions(-)
diff --git a/.devcontainer/devcontainer.json b/.devcontainer/devcontainer.json
new file mode 100644
index 000000000..f297220f1
--- /dev/null
+++ b/.devcontainer/devcontainer.json
@@ -0,0 +1,29 @@
+{
+ "name": "CUDA Python + Rust Dev",
+ "image": "nvidia/cuda:12.4.1-devel-ubuntu22.04",
+ // Give container full GPU access
+ "runArgs": [
+ "--gpus",
+ "all"
+ ],
+ // Auto mount your GitHub repo as workspace
+ "workspaceFolder": "/workspace",
+ "workspaceMount": "source=${localWorkspaceFolder},target=/workspace,type=bind",
+ // Install Python via devcontainers-features
+ "features": {
+ "ghcr.io/devcontainers/features/python:1": {
+ "version": "3.10"
+ }
+ },
+ // Additional setup (Rust, CUDA tools, etc.)
+ "postCreateCommand": "bash /workspace/.devcontainer/setup.sh",
+ "customizations": {
+ "vscode": {
+ "extensions": [
+ "ms-python.python",
+ "tamasfe.even-better-toml",
+ "rust-lang.rust-analyzer"
+ ]
+ }
+ }
+}
diff --git a/.devcontainer/setup.sh b/.devcontainer/setup.sh
new file mode 100644
index 000000000..e74548e98
--- /dev/null
+++ b/.devcontainer/setup.sh
@@ -0,0 +1,32 @@
+#!/usr/bin/env bash
+set -eux
+
+# Install Rust + Cargo
+if ! command -v cargo >/dev/null 2>&1; then
+ curl https://sh.rustup.rs -sSf | sh -s -- -y
+ echo 'source $HOME/.cargo/env' >> ~/.bashrc
+fi
+
+# Common dev tools
+apt-get update
+apt-get install -y \
+ build-essential \
+ pkg-config \
+ git \
+ vim
+
+
+# uv
+curl -LsSf https://astral.sh/uv/install.sh | sh
+source $HOME/.local/bin/env
+
+# poetry
+apt update
+apt install apt-utils -y
+apt install pipx -y
+pipx ensurepath
+pipx install poetry
+
+# setup pre-commit hook
+poetry install --extras dev
+poetry run pre-commit install
diff --git a/qdp/qdp-kernels/src/amplitude.cu b/qdp/qdp-kernels/src/amplitude.cu
index da2465c46..b1cd06fea 100644
--- a/qdp/qdp-kernels/src/amplitude.cu
+++ b/qdp/qdp-kernels/src/amplitude.cu
@@ -15,32 +15,59 @@
// limitations under the License.
// Amplitude Encoding CUDA Kernel
-//
-// This is a minimal skeleton implementation for the Core Architecture.
-// TODO: Implement full optimized kernel with parallel normalization.
-//
-// Purpose of this skeleton:
-// - Provides the function signature required by mahout-core
-// - Ensures the project compiles and links correctly
-// - Allows CI/CD to pass for the Core PR
-//
-// The actual parallel normalization and state encoding logic will be
-// implemented in the next PR, focusing on CUDA optimization strategies.
#include <cuda_runtime.h>
#include <cuComplex.h>
+#include <vector_types.h>
+
+__global__ void amplitude_encode_kernel(
+ const double* __restrict__ input,
+ cuDoubleComplex* __restrict__ state,
+ size_t input_len,
+ size_t state_len,
+ double inv_norm
+) {
+ // We process 2 elements per thread to maximize memory bandwidth via double2
+ size_t idx = blockIdx.x * blockDim.x + threadIdx.x;
+
+ // Each thread handles two state amplitudes (indices 2*idx and 2*idx + 1)
+ size_t state_idx_base = idx * 2;
+
+ if (state_idx_base >= state_len) return;
+
+ double v1 = 0.0;
+ double v2 = 0.0;
+
+ // Vectorized Load Optimization:
+ // If we are well within bounds, treat input as double2 to issue a single 128-bit load instruction.
+ // This reduces memory transactions and improves throughput on RTX cards.
+ if (state_idx_base + 1 < input_len) {
+ // Reinterpret cast to load two doubles at once
+ // Note: Assumes input is reasonably aligned (standard cudaMalloc provides 256-byte alignment)
+ const double2* input_vec = reinterpret_cast<const double2*>(input);
+ double2 loaded = input_vec[idx];
+ v1 = loaded.x;
+ v2 = loaded.y;
+ }
+ // Handle edge case: Odd input length
+ else if (state_idx_base < input_len) {
+ v1 = input[state_idx_base];
+ // v2 remains 0.0
+ }
+
+ // Write output:
+ // Apply pre-calculated reciprocal (multiplication is faster than division)
+ state[state_idx_base] = make_cuDoubleComplex(v1 * inv_norm, 0.0);
+
+ // Check boundary for the second element (state_len is usually power of 2, but good to be safe)
+ if (state_idx_base + 1 < state_len) {
+ state[state_idx_base + 1] = make_cuDoubleComplex(v2 * inv_norm, 0.0);
+ }
+}
extern "C" {
-/// Launch amplitude encoding kernel (skeleton implementation)
-///
-/// TODO: Full implementation with:
-/// - Parallel normalization kernel
-/// - Coalesced memory access patterns
-/// - Warp-level optimizations
-/// - Stream support for async execution
-///
-/// For now, this returns success to allow Core compilation.
+/// Launch amplitude encoding kernel
///
/// # Arguments
/// * input_d - Device pointer to input data (already normalized by host)
@@ -60,26 +87,27 @@ int launch_amplitude_encode(
double norm,
cudaStream_t stream
) {
- // Skeleton implementation - ensures FFI linkage is correct
- // This allows the project to compile and pass CI/CD checks.
- //
- // TODO: Implement full CUDA kernel:
- // 1. Kernel launch with optimal grid/block dimensions
- // 2. Parallel normalization and complex number construction
- // 3. Zero-padding for unused state vector elements
- // 4. Error checking and stream synchronization
-
- // Suppress unused parameter warnings (parameters will be used in full implementation)
- (void)input_d;
- (void)state_d;
- (void)input_len;
- (void)state_len;
- (void)norm;
- (void)stream;
-
- // For now, just return success
- // TODO: Launch actual kernel here
- return cudaSuccess;
+ if (norm <= 0.0) {
+ return cudaErrorInvalidValue;
+ }
+
+ double inv_norm = 1.0 / norm;
+
+ cuDoubleComplex* state_complex_d = static_cast<cuDoubleComplex*>(state_d);
+
+ const int blockSize = 256;
+ // Halve the grid size because each thread now processes 2 elements
+ const int gridSize = (state_len / 2 + blockSize - 1) / blockSize;
+
+ amplitude_encode_kernel<<<gridSize, blockSize, 0, stream>>>(
+ input_d,
+ state_complex_d,
+ input_len,
+ state_len,
+ inv_norm // Pass reciprocal
+ );
+
+ return (int)cudaGetLastError();
}
// TODO: Future encoding methods:
diff --git a/qdp/qdp-kernels/tests/amplitude_encode.rs b/qdp/qdp-kernels/tests/amplitude_encode.rs
new file mode 100644
index 000000000..2ac125c3e
--- /dev/null
+++ b/qdp/qdp-kernels/tests/amplitude_encode.rs
@@ -0,0 +1,460 @@
+//
+// Licensed to the Apache Software Foundation (ASF) under one or more
+// contributor license agreements. See the NOTICE file distributed with
+// this work for additional information regarding copyright ownership.
+// The ASF licenses this file to You under the Apache License, Version 2.0
+// (the "License"); you may not use this file except in compliance with
+// the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Tests for amplitude encoding CUDA kernel
+
+#[cfg(target_os = "linux")]
+use cudarc::driver::{CudaDevice, DevicePtr, DevicePtrMut};
+#[cfg(target_os = "linux")]
+use qdp_kernels::{CuDoubleComplex, launch_amplitude_encode};
+
+const EPSILON: f64 = 1e-10;
+
+#[test]
+#[cfg(target_os = "linux")]
+fn test_amplitude_encode_basic() {
+ println!("Testing basic amplitude encoding...");
+
+ // Initialize CUDA device
+ let device = match CudaDevice::new(0) {
+ Ok(d) => d,
+ Err(_) => {
+ println!("SKIP: No CUDA device available");
+ return;
+ }
+ };
+
+ // Test input: [3.0, 4.0] -> normalized to [0.6, 0.8]
+ let input = vec![3.0, 4.0];
+ let norm = (3.0_f64.powi(2) + 4.0_f64.powi(2)).sqrt(); // 5.0
+ let state_len = 4; // 2 qubits
+
+ // Allocate device memory
+ let input_d = device.htod_copy(input.clone()).unwrap();
+ let mut state_d = device.alloc_zeros::<CuDoubleComplex>(state_len).unwrap();
+
+ // Launch kernel
+ let result = unsafe {
+ launch_amplitude_encode(
+ *input_d.device_ptr() as *const f64,
+ *state_d.device_ptr_mut() as *mut std::ffi::c_void,
+ input.len(),
+ state_len,
+ norm,
+ std::ptr::null_mut(),
+ )
+ };
+
+ assert_eq!(result, 0, "Kernel launch should succeed");
+
+ // Copy result back
+ let state_h = device.dtoh_sync_copy(&state_d).unwrap();
+
+ // Verify normalization: [0.6, 0.8, 0.0, 0.0]
+ assert!(
+ (state_h[0].x - 0.6).abs() < EPSILON,
+ "First element should be 0.6"
+ );
+ assert!(
+ (state_h[0].y).abs() < EPSILON,
+ "First element imaginary should be 0"
+ );
+ assert!(
+ (state_h[1].x - 0.8).abs() < EPSILON,
+ "Second element should be 0.8"
+ );
+ assert!(
+ (state_h[1].y).abs() < EPSILON,
+ "Second element imaginary should be 0"
+ );
+ assert!((state_h[2].x).abs() < EPSILON, "Third element should be 0");
+ assert!((state_h[3].x).abs() < EPSILON, "Fourth element should be 0");
+
+ // Verify state is normalized
+ let total_prob: f64 = state_h.iter().map(|c| c.x * c.x + c.y * c.y).sum();
+ assert!(
+ (total_prob - 1.0).abs() < EPSILON,
+ "Total probability should be 1.0"
+ );
+
+ println!("PASS: Basic amplitude encoding works correctly");
+}
+
+#[test]
+#[cfg(target_os = "linux")]
+fn test_amplitude_encode_power_of_two() {
+ println!("Testing amplitude encoding with power-of-two input...");
+
+ let device = match CudaDevice::new(0) {
+ Ok(d) => d,
+ Err(_) => {
+ println!("SKIP: No CUDA device available");
+ return;
+ }
+ };
+
+ // Test with 8 input values (fills 3-qubit state)
+ let input: Vec<f64> = (1..=8).map(|x| x as f64).collect();
+ let norm: f64 = input.iter().map(|x| x * x).sum::<f64>().sqrt();
+ let state_len = 8;
+
+ let input_d = device.htod_copy(input.clone()).unwrap();
+ let mut state_d = device.alloc_zeros::<CuDoubleComplex>(state_len).unwrap();
+
+ let result = unsafe {
+ launch_amplitude_encode(
+ *input_d.device_ptr() as *const f64,
+ *state_d.device_ptr_mut() as *mut std::ffi::c_void,
+ input.len(),
+ state_len,
+ norm,
+ std::ptr::null_mut(),
+ )
+ };
+
+ assert_eq!(result, 0, "Kernel launch should succeed");
+
+ let state_h = device.dtoh_sync_copy(&state_d).unwrap();
+
+ // Verify all elements are correctly normalized
+ for i in 0..state_len {
+ let expected = input[i] / norm;
+ assert!(
+ (state_h[i].x - expected).abs() < EPSILON,
+ "Element {} should be {}, got {}",
+ i,
+ expected,
+ state_h[i].x
+ );
+ assert!((state_h[i].y).abs() < EPSILON, "Imaginary part should be 0");
+ }
+
+ // Verify normalization
+ let total_prob: f64 = state_h.iter().map(|c| c.x * c.x + c.y * c.y).sum();
+ assert!(
+ (total_prob - 1.0).abs() < EPSILON,
+ "Total probability should be 1.0"
+ );
+
+ println!("PASS: Power-of-two input encoding works correctly");
+}
+
+#[test]
+#[cfg(target_os = "linux")]
+fn test_amplitude_encode_odd_input_length() {
+ println!("Testing amplitude encoding with odd input length...");
+
+ let device = match CudaDevice::new(0) {
+ Ok(d) => d,
+ Err(_) => {
+ println!("SKIP: No CUDA device available");
+ return;
+ }
+ };
+
+ // Test with 3 input values, state size 4
+ let input = vec![1.0, 2.0, 2.0];
+ let norm = (1.0_f64 + 4.0 + 4.0).sqrt(); // 3.0
+ let state_len = 4;
+
+ let input_d = device.htod_copy(input.clone()).unwrap();
+ let mut state_d = device.alloc_zeros::<CuDoubleComplex>(state_len).unwrap();
+
+ let result = unsafe {
+ launch_amplitude_encode(
+ *input_d.device_ptr() as *const f64,
+ *state_d.device_ptr_mut() as *mut std::ffi::c_void,
+ input.len(),
+ state_len,
+ norm,
+ std::ptr::null_mut(),
+ )
+ };
+
+ assert_eq!(result, 0, "Kernel launch should succeed");
+
+ let state_h = device.dtoh_sync_copy(&state_d).unwrap();
+
+ // Verify: [1/3, 2/3, 2/3, 0]
+ assert!((state_h[0].x - 1.0 / 3.0).abs() < EPSILON);
+ assert!((state_h[1].x - 2.0 / 3.0).abs() < EPSILON);
+ assert!((state_h[2].x - 2.0 / 3.0).abs() < EPSILON);
+ assert!(
+ (state_h[3].x).abs() < EPSILON,
+ "Fourth element should be padded with 0"
+ );
+
+ println!("PASS: Odd input length handled correctly");
+}
+
+#[test]
+#[cfg(target_os = "linux")]
+fn test_amplitude_encode_large_state() {
+ println!("Testing amplitude encoding with large state vector...");
+
+ let device = match CudaDevice::new(0) {
+ Ok(d) => d,
+ Err(_) => {
+ println!("SKIP: No CUDA device available");
+ return;
+ }
+ };
+
+ // Test with 1024 elements (10 qubits)
+ let input_len = 1024;
+ let input: Vec<f64> = (0..input_len).map(|i| (i + 1) as f64).collect();
+ let norm: f64 = input.iter().map(|x| x * x).sum::<f64>().sqrt();
+ let state_len = 1024;
+
+ let input_d = device.htod_copy(input.clone()).unwrap();
+ let mut state_d = device.alloc_zeros::<CuDoubleComplex>(state_len).unwrap();
+
+ let result = unsafe {
+ launch_amplitude_encode(
+ *input_d.device_ptr() as *const f64,
+ *state_d.device_ptr_mut() as *mut std::ffi::c_void,
+ input.len(),
+ state_len,
+ norm,
+ std::ptr::null_mut(),
+ )
+ };
+
+ assert_eq!(result, 0, "Kernel launch should succeed");
+
+ let state_h = device.dtoh_sync_copy(&state_d).unwrap();
+
+ // Spot check a few values
+ for i in [0, 100, 500, 1023] {
+ let expected = input[i] / norm;
+ assert!(
+ (state_h[i].x - expected).abs() < EPSILON,
+ "Element {} mismatch",
+ i
+ );
+ }
+
+ // Verify normalization
+ let total_prob: f64 = state_h.iter().map(|c| c.x * c.x + c.y * c.y).sum();
+ assert!(
+ (total_prob - 1.0).abs() < EPSILON,
+ "Total probability should be 1.0"
+ );
+
+ println!("PASS: Large state vector encoding works correctly");
+}
+
+#[test]
+#[cfg(target_os = "linux")]
+fn test_amplitude_encode_zero_norm_error() {
+ println!("Testing amplitude encoding with zero norm (error case)...");
+
+ let device = match CudaDevice::new(0) {
+ Ok(d) => d,
+ Err(_) => {
+ println!("SKIP: No CUDA device available");
+ return;
+ }
+ };
+
+ let input = vec![0.0, 0.0, 0.0];
+ let norm = 0.0; // Invalid!
+ let state_len = 4;
+
+ let input_d = device.htod_copy(input).unwrap();
+ let mut state_d = device.alloc_zeros::<CuDoubleComplex>(state_len).unwrap();
+
+ let result = unsafe {
+ launch_amplitude_encode(
+ *input_d.device_ptr() as *const f64,
+ *state_d.device_ptr_mut() as *mut std::ffi::c_void,
+ 3,
+ state_len,
+ norm,
+ std::ptr::null_mut(),
+ )
+ };
+
+ // Should return CUDA error code for invalid value
+ assert_ne!(result, 0, "Should reject zero norm");
+ println!(
+ "PASS: Zero norm correctly rejected with error code {}",
+ result
+ );
+}
+
+#[test]
+#[cfg(target_os = "linux")]
+fn test_amplitude_encode_negative_norm_error() {
+ println!("Testing amplitude encoding with negative norm (error case)...");
+
+ let device = match CudaDevice::new(0) {
+ Ok(d) => d,
+ Err(_) => {
+ println!("SKIP: No CUDA device available");
+ return;
+ }
+ };
+
+ let input = vec![1.0, 2.0, 3.0];
+ let norm = -5.0; // Invalid!
+ let state_len = 4;
+
+ let input_d = device.htod_copy(input).unwrap();
+ let mut state_d = device.alloc_zeros::<CuDoubleComplex>(state_len).unwrap();
+
+ let result = unsafe {
+ launch_amplitude_encode(
+ *input_d.device_ptr() as *const f64,
+ *state_d.device_ptr_mut() as *mut std::ffi::c_void,
+ 3,
+ state_len,
+ norm,
+ std::ptr::null_mut(),
+ )
+ };
+
+ // Should return CUDA error code for invalid value
+ assert_ne!(result, 0, "Should reject negative norm");
+ println!(
+ "PASS: Negative norm correctly rejected with error code {}",
+ result
+ );
+}
+
+#[test]
+#[cfg(target_os = "linux")]
+fn test_amplitude_encode_vectorized_load() {
+ println!("Testing vectorized double2 memory access optimization...");
+
+ let device = match CudaDevice::new(0) {
+ Ok(d) => d,
+ Err(_) => {
+ println!("SKIP: No CUDA device available");
+ return;
+ }
+ };
+
+ // Use exactly 16 elements to test vectorized loads (8 threads * 2 elements each)
+ let input: Vec<f64> = (1..=16).map(|x| x as f64).collect();
+ let norm: f64 = input.iter().map(|x| x * x).sum::<f64>().sqrt();
+ let state_len = 16;
+
+ let input_d = device.htod_copy(input.clone()).unwrap();
+ let mut state_d = device.alloc_zeros::<CuDoubleComplex>(state_len).unwrap();
+
+ let result = unsafe {
+ launch_amplitude_encode(
+ *input_d.device_ptr() as *const f64,
+ *state_d.device_ptr_mut() as *mut std::ffi::c_void,
+ input.len(),
+ state_len,
+ norm,
+ std::ptr::null_mut(),
+ )
+ };
+
+ assert_eq!(result, 0, "Kernel launch should succeed");
+
+ let state_h = device.dtoh_sync_copy(&state_d).unwrap();
+
+ // Verify all elements processed correctly through vectorized loads
+ for i in 0..state_len {
+ let expected = input[i] / norm;
+ assert!(
+ (state_h[i].x - expected).abs() < EPSILON,
+ "Vectorized load: element {} should be {}, got {}",
+ i,
+ expected,
+ state_h[i].x
+ );
+ }
+
+ println!("PASS: Vectorized memory access works correctly");
+}
+
+#[test]
+#[cfg(target_os = "linux")]
+fn test_amplitude_encode_small_input_large_state() {
+ println!("Testing small input with large state vector...");
+
+ let device = match CudaDevice::new(0) {
+ Ok(d) => d,
+ Err(_) => {
+ println!("SKIP: No CUDA device available");
+ return;
+ }
+ };
+
+ // Only 2 input values, but 16-element state (padding with zeros)
+ let input = vec![3.0, 4.0];
+ let norm = 5.0;
+ let state_len = 16;
+
+ let input_d = device.htod_copy(input.clone()).unwrap();
+ let mut state_d = device.alloc_zeros::<CuDoubleComplex>(state_len).unwrap();
+
+ let result = unsafe {
+ launch_amplitude_encode(
+ *input_d.device_ptr() as *const f64,
+ *state_d.device_ptr_mut() as *mut std::ffi::c_void,
+ input.len(),
+ state_len,
+ norm,
+ std::ptr::null_mut(),
+ )
+ };
+
+ assert_eq!(result, 0, "Kernel launch should succeed");
+
+ let state_h = device.dtoh_sync_copy(&state_d).unwrap();
+
+ // First two elements should be normalized values
+ assert!((state_h[0].x - 0.6).abs() < EPSILON);
+ assert!((state_h[1].x - 0.8).abs() < EPSILON);
+
+ // Rest should be zero
+ for i in 2..state_len {
+ assert!(
+ state_h[i].x.abs() < EPSILON && state_h[i].y.abs() < EPSILON,
+ "Element {} should be zero-padded",
+ i
+ );
+ }
+
+ println!("PASS: Small input with large state padding works correctly");
+}
+
+#[test]
+#[cfg(not(target_os = "linux"))]
+fn test_amplitude_encode_dummy_non_linux() {
+ println!("Testing dummy implementation on non-Linux platform...");
+
+ // The dummy implementation should return error code 999
+ let result = unsafe {
+ qdp_kernels::launch_amplitude_encode(
+ std::ptr::null(),
+ std::ptr::null_mut(),
+ 0,
+ 0,
+ 1.0,
+ std::ptr::null_mut(),
+ )
+ };
+
+ assert_eq!(result, 999, "Dummy implementation should return 999");
+ println!("PASS: Non-Linux dummy implementation returns expected error code");
+}