This is an automated email from the ASF dual-hosted git repository. guanmingchiu pushed a commit to branch main in repository https://gitbox.apache.org/repos/asf/mahout.git
commit 5416fce5667e9121da94291ba3e2ae6407e3797a Author: Ryan Huang <[email protected]> AuthorDate: Fri Dec 5 19:33:00 2025 -0800 [QDP] add vanilla gpu kernel (#677) * add vanilla gpu kernel Signed-off-by: ryankert01 <[email protected]> * Optimize amplitude encoding kernel with safety checks Refactored kernel to use multiplication for normalization and optimized memory access with double2. Added safety checks for input parameters. * Remove comments on norm validation and optimization Removed comments regarding norm checks and optimization. * add docker dev container and test Signed-off-by: ryankert01 <[email protected]> * add docker dev containe Signed-off-by: ryankert01 <[email protected]> * delete unrelated strings Signed-off-by: ryankert01 <[email protected]> * delete unrelated strings Signed-off-by: ryankert01 <[email protected]> --------- Signed-off-by: ryankert01 <[email protected]> --- .devcontainer/devcontainer.json | 29 ++ .devcontainer/setup.sh | 32 +++ qdp/qdp-kernels/src/amplitude.cu | 108 ++++--- qdp/qdp-kernels/tests/amplitude_encode.rs | 460 ++++++++++++++++++++++++++++++ 4 files changed, 589 insertions(+), 40 deletions(-) diff --git a/.devcontainer/devcontainer.json b/.devcontainer/devcontainer.json new file mode 100644 index 000000000..f297220f1 --- /dev/null +++ b/.devcontainer/devcontainer.json @@ -0,0 +1,29 @@ +{ + "name": "CUDA Python + Rust Dev", + "image": "nvidia/cuda:12.4.1-devel-ubuntu22.04", + // Give container full GPU access + "runArgs": [ + "--gpus", + "all" + ], + // Auto mount your GitHub repo as workspace + "workspaceFolder": "/workspace", + "workspaceMount": "source=${localWorkspaceFolder},target=/workspace,type=bind", + // Install Python via devcontainers-features + "features": { + "ghcr.io/devcontainers/features/python:1": { + "version": "3.10" + } + }, + // Additional setup (Rust, CUDA tools, etc.) + "postCreateCommand": "bash /workspace/.devcontainer/setup.sh", + "customizations": { + "vscode": { + "extensions": [ + "ms-python.python", + "tamasfe.even-better-toml", + "rust-lang.rust-analyzer" + ] + } + } +} diff --git a/.devcontainer/setup.sh b/.devcontainer/setup.sh new file mode 100644 index 000000000..e74548e98 --- /dev/null +++ b/.devcontainer/setup.sh @@ -0,0 +1,32 @@ +#!/usr/bin/env bash +set -eux + +# Install Rust + Cargo +if ! command -v cargo >/dev/null 2>&1; then + curl https://sh.rustup.rs -sSf | sh -s -- -y + echo 'source $HOME/.cargo/env' >> ~/.bashrc +fi + +# Common dev tools +apt-get update +apt-get install -y \ + build-essential \ + pkg-config \ + git \ + vim + + +# uv +curl -LsSf https://astral.sh/uv/install.sh | sh +source $HOME/.local/bin/env + +# peotry +apt update +apt install apt-utils -y +apt install pipx -y +pipx ensurepath +pipx install poetry + +# setup pre-install hook +poetry install --extras dev +poetry run pre-commit install diff --git a/qdp/qdp-kernels/src/amplitude.cu b/qdp/qdp-kernels/src/amplitude.cu index da2465c46..b1cd06fea 100644 --- a/qdp/qdp-kernels/src/amplitude.cu +++ b/qdp/qdp-kernels/src/amplitude.cu @@ -15,32 +15,59 @@ // limitations under the License. // Amplitude Encoding CUDA Kernel -// -// This is a minimal skeleton implementation for the Core Architecture. -// TODO: Implement full optimized kernel with parallel normalization. -// -// Purpose of this skeleton: -// - Provides the function signature required by mahout-core -// - Ensures the project compiles and links correctly -// - Allows CI/CD to pass for the Core PR -// -// The actual parallel normalization and state encoding logic will be -// implemented in the next PR, focusing on CUDA optimization strategies. #include <cuda_runtime.h> #include <cuComplex.h> +#include <vector_types.h> + +__global__ void amplitude_encode_kernel( + const double* __restrict__ input, + cuDoubleComplex* __restrict__ state, + size_t input_len, + size_t state_len, + double inv_norm +) { + // We process 2 elements per thread to maximize memory bandwidth via double2 + size_t idx = blockIdx.x * blockDim.x + threadIdx.x; + + // Each thread handles two state amplitudes (indices 2*idx and 2*idx + 1) + size_t state_idx_base = idx * 2; + + if (state_idx_base >= state_len) return; + + double v1 = 0.0; + double v2 = 0.0; + + // Vectorized Load Optimization: + // If we are well within bounds, treat input as double2 to issue a single 128-bit load instruction. + // This reduces memory transactions and improves throughput on RTX cards. + if (state_idx_base + 1 < input_len) { + // Reinterpret cast to load two doubles at once + // Note: Assumes input is reasonably aligned (standard cudaMalloc provides 256-byte alignment) + const double2* input_vec = reinterpret_cast<const double2*>(input); + double2 loaded = input_vec[idx]; + v1 = loaded.x; + v2 = loaded.y; + } + // Handle edge case: Odd input length + else if (state_idx_base < input_len) { + v1 = input[state_idx_base]; + // v2 remains 0.0 + } + + // Write output: + // Apply pre-calculated reciprocal (multiplication is faster than division) + state[state_idx_base] = make_cuDoubleComplex(v1 * inv_norm, 0.0); + + // Check boundary for the second element (state_len is usually power of 2, but good to be safe) + if (state_idx_base + 1 < state_len) { + state[state_idx_base + 1] = make_cuDoubleComplex(v2 * inv_norm, 0.0); + } +} extern "C" { -/// Launch amplitude encoding kernel (skeleton implementation) -/// -/// TODO: Full implementation with: -/// - Parallel normalization kernel -/// - Coalesced memory access patterns -/// - Warp-level optimizations -/// - Stream support for async execution -/// -/// For now, this returns success to allow Core compilation. +/// Launch amplitude encoding kernel /// /// # Arguments /// * input_d - Device pointer to input data (already normalized by host) @@ -60,26 +87,27 @@ int launch_amplitude_encode( double norm, cudaStream_t stream ) { - // Skeleton implementation - ensures FFI linkage is correct - // This allows the project to compile and pass CI/CD checks. - // - // TODO: Implement full CUDA kernel: - // 1. Kernel launch with optimal grid/block dimensions - // 2. Parallel normalization and complex number construction - // 3. Zero-padding for unused state vector elements - // 4. Error checking and stream synchronization - - // Suppress unused parameter warnings (parameters will be used in full implementation) - (void)input_d; - (void)state_d; - (void)input_len; - (void)state_len; - (void)norm; - (void)stream; - - // For now, just return success - // TODO: Launch actual kernel here - return cudaSuccess; + if (norm <= 0.0) { + return cudaErrorInvalidValue; + } + + double inv_norm = 1.0 / norm; + + cuDoubleComplex* state_complex_d = static_cast<cuDoubleComplex*>(state_d); + + const int blockSize = 256; + // Halve the grid size because each thread now processes 2 elements + const int gridSize = (state_len / 2 + blockSize - 1) / blockSize; + + amplitude_encode_kernel<<<gridSize, blockSize, 0, stream>>>( + input_d, + state_complex_d, + input_len, + state_len, + inv_norm // Pass reciprocal + ); + + return (int)cudaGetLastError(); } // TODO: Future encoding methods: diff --git a/qdp/qdp-kernels/tests/amplitude_encode.rs b/qdp/qdp-kernels/tests/amplitude_encode.rs new file mode 100644 index 000000000..2ac125c3e --- /dev/null +++ b/qdp/qdp-kernels/tests/amplitude_encode.rs @@ -0,0 +1,460 @@ +// +// Licensed to the Apache Software Foundation (ASF) under one or more +// contributor license agreements. See the NOTICE file distributed with +// this work for additional information regarding copyright ownership. +// The ASF licenses this file to You under the Apache License, Version 2.0 +// (the "License"); you may not use this file except in compliance with +// the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Tests for amplitude encoding CUDA kernel + +#[cfg(target_os = "linux")] +use cudarc::driver::{CudaDevice, DevicePtr, DevicePtrMut}; +#[cfg(target_os = "linux")] +use qdp_kernels::{CuDoubleComplex, launch_amplitude_encode}; + +const EPSILON: f64 = 1e-10; + +#[test] +#[cfg(target_os = "linux")] +fn test_amplitude_encode_basic() { + println!("Testing basic amplitude encoding..."); + + // Initialize CUDA device + let device = match CudaDevice::new(0) { + Ok(d) => d, + Err(_) => { + println!("SKIP: No CUDA device available"); + return; + } + }; + + // Test input: [3.0, 4.0] -> normalized to [0.6, 0.8] + let input = vec![3.0, 4.0]; + let norm = (3.0_f64.powi(2) + 4.0_f64.powi(2)).sqrt(); // 5.0 + let state_len = 4; // 2 qubits + + // Allocate device memory + let input_d = device.htod_copy(input.clone()).unwrap(); + let mut state_d = device.alloc_zeros::<CuDoubleComplex>(state_len).unwrap(); + + // Launch kernel + let result = unsafe { + launch_amplitude_encode( + *input_d.device_ptr() as *const f64, + *state_d.device_ptr_mut() as *mut std::ffi::c_void, + input.len(), + state_len, + norm, + std::ptr::null_mut(), + ) + }; + + assert_eq!(result, 0, "Kernel launch should succeed"); + + // Copy result back + let state_h = device.dtoh_sync_copy(&state_d).unwrap(); + + // Verify normalization: [0.6, 0.8, 0.0, 0.0] + assert!( + (state_h[0].x - 0.6).abs() < EPSILON, + "First element should be 0.6" + ); + assert!( + (state_h[0].y).abs() < EPSILON, + "First element imaginary should be 0" + ); + assert!( + (state_h[1].x - 0.8).abs() < EPSILON, + "Second element should be 0.8" + ); + assert!( + (state_h[1].y).abs() < EPSILON, + "Second element imaginary should be 0" + ); + assert!((state_h[2].x).abs() < EPSILON, "Third element should be 0"); + assert!((state_h[3].x).abs() < EPSILON, "Fourth element should be 0"); + + // Verify state is normalized + let total_prob: f64 = state_h.iter().map(|c| c.x * c.x + c.y * c.y).sum(); + assert!( + (total_prob - 1.0).abs() < EPSILON, + "Total probability should be 1.0" + ); + + println!("PASS: Basic amplitude encoding works correctly"); +} + +#[test] +#[cfg(target_os = "linux")] +fn test_amplitude_encode_power_of_two() { + println!("Testing amplitude encoding with power-of-two input..."); + + let device = match CudaDevice::new(0) { + Ok(d) => d, + Err(_) => { + println!("SKIP: No CUDA device available"); + return; + } + }; + + // Test with 8 input values (fills 3-qubit state) + let input: Vec<f64> = (1..=8).map(|x| x as f64).collect(); + let norm: f64 = input.iter().map(|x| x * x).sum::<f64>().sqrt(); + let state_len = 8; + + let input_d = device.htod_copy(input.clone()).unwrap(); + let mut state_d = device.alloc_zeros::<CuDoubleComplex>(state_len).unwrap(); + + let result = unsafe { + launch_amplitude_encode( + *input_d.device_ptr() as *const f64, + *state_d.device_ptr_mut() as *mut std::ffi::c_void, + input.len(), + state_len, + norm, + std::ptr::null_mut(), + ) + }; + + assert_eq!(result, 0, "Kernel launch should succeed"); + + let state_h = device.dtoh_sync_copy(&state_d).unwrap(); + + // Verify all elements are correctly normalized + for i in 0..state_len { + let expected = input[i] / norm; + assert!( + (state_h[i].x - expected).abs() < EPSILON, + "Element {} should be {}, got {}", + i, + expected, + state_h[i].x + ); + assert!((state_h[i].y).abs() < EPSILON, "Imaginary part should be 0"); + } + + // Verify normalization + let total_prob: f64 = state_h.iter().map(|c| c.x * c.x + c.y * c.y).sum(); + assert!( + (total_prob - 1.0).abs() < EPSILON, + "Total probability should be 1.0" + ); + + println!("PASS: Power-of-two input encoding works correctly"); +} + +#[test] +#[cfg(target_os = "linux")] +fn test_amplitude_encode_odd_input_length() { + println!("Testing amplitude encoding with odd input length..."); + + let device = match CudaDevice::new(0) { + Ok(d) => d, + Err(_) => { + println!("SKIP: No CUDA device available"); + return; + } + }; + + // Test with 3 input values, state size 4 + let input = vec![1.0, 2.0, 2.0]; + let norm = (1.0_f64 + 4.0 + 4.0).sqrt(); // 3.0 + let state_len = 4; + + let input_d = device.htod_copy(input.clone()).unwrap(); + let mut state_d = device.alloc_zeros::<CuDoubleComplex>(state_len).unwrap(); + + let result = unsafe { + launch_amplitude_encode( + *input_d.device_ptr() as *const f64, + *state_d.device_ptr_mut() as *mut std::ffi::c_void, + input.len(), + state_len, + norm, + std::ptr::null_mut(), + ) + }; + + assert_eq!(result, 0, "Kernel launch should succeed"); + + let state_h = device.dtoh_sync_copy(&state_d).unwrap(); + + // Verify: [1/3, 2/3, 2/3, 0] + assert!((state_h[0].x - 1.0 / 3.0).abs() < EPSILON); + assert!((state_h[1].x - 2.0 / 3.0).abs() < EPSILON); + assert!((state_h[2].x - 2.0 / 3.0).abs() < EPSILON); + assert!( + (state_h[3].x).abs() < EPSILON, + "Fourth element should be padded with 0" + ); + + println!("PASS: Odd input length handled correctly"); +} + +#[test] +#[cfg(target_os = "linux")] +fn test_amplitude_encode_large_state() { + println!("Testing amplitude encoding with large state vector..."); + + let device = match CudaDevice::new(0) { + Ok(d) => d, + Err(_) => { + println!("SKIP: No CUDA device available"); + return; + } + }; + + // Test with 1024 elements (10 qubits) + let input_len = 1024; + let input: Vec<f64> = (0..input_len).map(|i| (i + 1) as f64).collect(); + let norm: f64 = input.iter().map(|x| x * x).sum::<f64>().sqrt(); + let state_len = 1024; + + let input_d = device.htod_copy(input.clone()).unwrap(); + let mut state_d = device.alloc_zeros::<CuDoubleComplex>(state_len).unwrap(); + + let result = unsafe { + launch_amplitude_encode( + *input_d.device_ptr() as *const f64, + *state_d.device_ptr_mut() as *mut std::ffi::c_void, + input.len(), + state_len, + norm, + std::ptr::null_mut(), + ) + }; + + assert_eq!(result, 0, "Kernel launch should succeed"); + + let state_h = device.dtoh_sync_copy(&state_d).unwrap(); + + // Spot check a few values + for i in [0, 100, 500, 1023] { + let expected = input[i] / norm; + assert!( + (state_h[i].x - expected).abs() < EPSILON, + "Element {} mismatch", + i + ); + } + + // Verify normalization + let total_prob: f64 = state_h.iter().map(|c| c.x * c.x + c.y * c.y).sum(); + assert!( + (total_prob - 1.0).abs() < EPSILON, + "Total probability should be 1.0" + ); + + println!("PASS: Large state vector encoding works correctly"); +} + +#[test] +#[cfg(target_os = "linux")] +fn test_amplitude_encode_zero_norm_error() { + println!("Testing amplitude encoding with zero norm (error case)..."); + + let device = match CudaDevice::new(0) { + Ok(d) => d, + Err(_) => { + println!("SKIP: No CUDA device available"); + return; + } + }; + + let input = vec![0.0, 0.0, 0.0]; + let norm = 0.0; // Invalid! + let state_len = 4; + + let input_d = device.htod_copy(input).unwrap(); + let mut state_d = device.alloc_zeros::<CuDoubleComplex>(state_len).unwrap(); + + let result = unsafe { + launch_amplitude_encode( + *input_d.device_ptr() as *const f64, + *state_d.device_ptr_mut() as *mut std::ffi::c_void, + 3, + state_len, + norm, + std::ptr::null_mut(), + ) + }; + + // Should return CUDA error code for invalid value + assert_ne!(result, 0, "Should reject zero norm"); + println!( + "PASS: Zero norm correctly rejected with error code {}", + result + ); +} + +#[test] +#[cfg(target_os = "linux")] +fn test_amplitude_encode_negative_norm_error() { + println!("Testing amplitude encoding with negative norm (error case)..."); + + let device = match CudaDevice::new(0) { + Ok(d) => d, + Err(_) => { + println!("SKIP: No CUDA device available"); + return; + } + }; + + let input = vec![1.0, 2.0, 3.0]; + let norm = -5.0; // Invalid! + let state_len = 4; + + let input_d = device.htod_copy(input).unwrap(); + let mut state_d = device.alloc_zeros::<CuDoubleComplex>(state_len).unwrap(); + + let result = unsafe { + launch_amplitude_encode( + *input_d.device_ptr() as *const f64, + *state_d.device_ptr_mut() as *mut std::ffi::c_void, + 3, + state_len, + norm, + std::ptr::null_mut(), + ) + }; + + // Should return CUDA error code for invalid value + assert_ne!(result, 0, "Should reject negative norm"); + println!( + "PASS: Negative norm correctly rejected with error code {}", + result + ); +} + +#[test] +#[cfg(target_os = "linux")] +fn test_amplitude_encode_vectorized_load() { + println!("Testing vectorized double2 memory access optimization..."); + + let device = match CudaDevice::new(0) { + Ok(d) => d, + Err(_) => { + println!("SKIP: No CUDA device available"); + return; + } + }; + + // Use exactly 16 elements to test vectorized loads (8 threads * 2 elements each) + let input: Vec<f64> = (1..=16).map(|x| x as f64).collect(); + let norm: f64 = input.iter().map(|x| x * x).sum::<f64>().sqrt(); + let state_len = 16; + + let input_d = device.htod_copy(input.clone()).unwrap(); + let mut state_d = device.alloc_zeros::<CuDoubleComplex>(state_len).unwrap(); + + let result = unsafe { + launch_amplitude_encode( + *input_d.device_ptr() as *const f64, + *state_d.device_ptr_mut() as *mut std::ffi::c_void, + input.len(), + state_len, + norm, + std::ptr::null_mut(), + ) + }; + + assert_eq!(result, 0, "Kernel launch should succeed"); + + let state_h = device.dtoh_sync_copy(&state_d).unwrap(); + + // Verify all elements processed correctly through vectorized loads + for i in 0..state_len { + let expected = input[i] / norm; + assert!( + (state_h[i].x - expected).abs() < EPSILON, + "Vectorized load: element {} should be {}, got {}", + i, + expected, + state_h[i].x + ); + } + + println!("PASS: Vectorized memory access works correctly"); +} + +#[test] +#[cfg(target_os = "linux")] +fn test_amplitude_encode_small_input_large_state() { + println!("Testing small input with large state vector..."); + + let device = match CudaDevice::new(0) { + Ok(d) => d, + Err(_) => { + println!("SKIP: No CUDA device available"); + return; + } + }; + + // Only 2 input values, but 16-element state (padding with zeros) + let input = vec![3.0, 4.0]; + let norm = 5.0; + let state_len = 16; + + let input_d = device.htod_copy(input.clone()).unwrap(); + let mut state_d = device.alloc_zeros::<CuDoubleComplex>(state_len).unwrap(); + + let result = unsafe { + launch_amplitude_encode( + *input_d.device_ptr() as *const f64, + *state_d.device_ptr_mut() as *mut std::ffi::c_void, + input.len(), + state_len, + norm, + std::ptr::null_mut(), + ) + }; + + assert_eq!(result, 0, "Kernel launch should succeed"); + + let state_h = device.dtoh_sync_copy(&state_d).unwrap(); + + // First two elements should be normalized values + assert!((state_h[0].x - 0.6).abs() < EPSILON); + assert!((state_h[1].x - 0.8).abs() < EPSILON); + + // Rest should be zero + for i in 2..state_len { + assert!( + state_h[i].x.abs() < EPSILON && state_h[i].y.abs() < EPSILON, + "Element {} should be zero-padded", + i + ); + } + + println!("PASS: Small input with large state padding works correctly"); +} + +#[test] +#[cfg(not(target_os = "linux"))] +fn test_amplitude_encode_dummy_non_linux() { + println!("Testing dummy implementation on non-Linux platform..."); + + // The dummy implementation should return error code 999 + let result = unsafe { + qdp_kernels::launch_amplitude_encode( + std::ptr::null(), + std::ptr::null_mut(), + 0, + 0, + 1.0, + std::ptr::null_mut(), + ) + }; + + assert_eq!(result, 999, "Dummy implementation should return 999"); + println!("PASS: Non-Linux dummy implementation returns expected error code"); +}
