This is an automated email from the ASF dual-hosted git repository.

guanmingchiu pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/mahout.git

commit 5416fce5667e9121da94291ba3e2ae6407e3797a
Author: Ryan Huang <[email protected]>
AuthorDate: Fri Dec 5 19:33:00 2025 -0800

    [QDP] add vanilla gpu kernel (#677)
    
    * add vanilla gpu kernel
    
    Signed-off-by: ryankert01 <[email protected]>
    
    * Optimize amplitude encoding kernel with safety checks
    
    Refactored kernel to use multiplication for normalization and optimized memory access with double2. Added safety checks for input parameters.
    
    * Remove comments on norm validation and optimization
    
    Removed comments regarding norm checks and optimization.
    
    * add docker dev container and test
    
    Signed-off-by: ryankert01 <[email protected]>
    
    * add docker dev container
    
    Signed-off-by: ryankert01 <[email protected]>
    
    * delete unrelated strings
    
    Signed-off-by: ryankert01 <[email protected]>
    
    * delete unrelated strings
    
    Signed-off-by: ryankert01 <[email protected]>
    
    ---------
    
    Signed-off-by: ryankert01 <[email protected]>
---
 .devcontainer/devcontainer.json           |  29 ++
 .devcontainer/setup.sh                    |  32 +++
 qdp/qdp-kernels/src/amplitude.cu          | 108 ++++---
 qdp/qdp-kernels/tests/amplitude_encode.rs | 460 ++++++++++++++++++++++++++++++
 4 files changed, 589 insertions(+), 40 deletions(-)
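
For reference, amplitude encoding loads a classical vector x into the amplitudes of a quantum state; precomputing the reciprocal of the l2 norm on the host turns each per-element division into a multiplication:

    \alpha_i = x_i \cdot \frac{1}{\lVert x \rVert_2}, \qquad \sum_i |\alpha_i|^2 = 1

The total_prob assertions in the new tests check exactly this unit-norm invariant.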

diff --git a/.devcontainer/devcontainer.json b/.devcontainer/devcontainer.json
new file mode 100644
index 000000000..f297220f1
--- /dev/null
+++ b/.devcontainer/devcontainer.json
@@ -0,0 +1,29 @@
+{
+    "name": "CUDA Python + Rust Dev",
+    "image": "nvidia/cuda:12.4.1-devel-ubuntu22.04",
+    // Give container full GPU access
+    "runArgs": [
+        "--gpus",
+        "all"
+    ],
+    // Auto mount your GitHub repo as workspace
+    "workspaceFolder": "/workspace",
+    "workspaceMount": 
"source=${localWorkspaceFolder},target=/workspace,type=bind",
+    // Install Python via devcontainers-features
+    "features": {
+        "ghcr.io/devcontainers/features/python:1": {
+            "version": "3.10"
+        }
+    },
+    // Additional setup (Rust, CUDA tools, etc.)
+    "postCreateCommand": "bash /workspace/.devcontainer/setup.sh",
+    "customizations": {
+        "vscode": {
+            "extensions": [
+                "ms-python.python",
+                "tamasfe.even-better-toml",
+                "rust-lang.rust-analyzer"
+            ]
+        }
+    }
+}
diff --git a/.devcontainer/setup.sh b/.devcontainer/setup.sh
new file mode 100644
index 000000000..e74548e98
--- /dev/null
+++ b/.devcontainer/setup.sh
@@ -0,0 +1,32 @@
+#!/usr/bin/env bash
+set -eux
+
+# Install Rust + Cargo
+if ! command -v cargo >/dev/null 2>&1; then
+    curl https://sh.rustup.rs -sSf | sh -s -- -y
+    echo 'source $HOME/.cargo/env' >> ~/.bashrc
+fi
+
+# Common dev tools
+apt-get update
+apt-get install -y \
+    build-essential \
+    pkg-config \
+    git \
+    vim
+
+
+# uv
+curl -LsSf https://astral.sh/uv/install.sh | sh
+source $HOME/.local/bin/env
+
+# poetry
+apt update
+apt install apt-utils -y
+apt install pipx -y
+pipx ensurepath
+pipx install poetry
+
+# set up pre-commit hooks
+poetry install --extras dev
+poetry run pre-commit install
diff --git a/qdp/qdp-kernels/src/amplitude.cu b/qdp/qdp-kernels/src/amplitude.cu
index da2465c46..b1cd06fea 100644
--- a/qdp/qdp-kernels/src/amplitude.cu
+++ b/qdp/qdp-kernels/src/amplitude.cu
@@ -15,32 +15,59 @@
 // limitations under the License.
 
 // Amplitude Encoding CUDA Kernel
-//
-// This is a minimal skeleton implementation for the Core Architecture.
-// TODO: Implement full optimized kernel with parallel normalization.
-//
-// Purpose of this skeleton:
-// - Provides the function signature required by mahout-core
-// - Ensures the project compiles and links correctly
-// - Allows CI/CD to pass for the Core PR
-//
-// The actual parallel normalization and state encoding logic will be
-// implemented in the next PR, focusing on CUDA optimization strategies.
 
 #include <cuda_runtime.h>
 #include <cuComplex.h>
+#include <vector_types.h>
+
+__global__ void amplitude_encode_kernel(
+    const double* __restrict__ input,
+    cuDoubleComplex* __restrict__ state,
+    size_t input_len,
+    size_t state_len,
+    double inv_norm
+) {
+    // We process 2 elements per thread to maximize memory bandwidth via double2
+    size_t idx = blockIdx.x * blockDim.x + threadIdx.x;
+
+    // Each thread handles two state amplitudes (indices 2*idx and 2*idx + 1)
+    size_t state_idx_base = idx * 2;
+
+    if (state_idx_base >= state_len) return;
+
+    double v1 = 0.0;
+    double v2 = 0.0;
+
+    // Vectorized Load Optimization:
+    // If we are well within bounds, treat input as double2 to issue a single 128-bit load instruction.
+    // This reduces memory transactions and improves throughput on RTX cards.
+    if (state_idx_base + 1 < input_len) {
+        // Reinterpret cast to load two doubles at once
+    // Note: Assumes input is reasonably aligned (standard cudaMalloc provides 256-byte alignment)
+        const double2* input_vec = reinterpret_cast<const double2*>(input);
+        double2 loaded = input_vec[idx];
+        v1 = loaded.x;
+        v2 = loaded.y;
+    }
+    // Handle edge case: Odd input length
+    else if (state_idx_base < input_len) {
+        v1 = input[state_idx_base];
+        // v2 remains 0.0
+    }
+
+    // Write output:
+    // Apply pre-calculated reciprocal (multiplication is faster than division)
+    state[state_idx_base]     = make_cuDoubleComplex(v1 * inv_norm, 0.0);
+
+    // Check boundary for the second element (state_len is usually a power of 2, but good to be safe)
+    if (state_idx_base + 1 < state_len) {
+        state[state_idx_base + 1] = make_cuDoubleComplex(v2 * inv_norm, 0.0);
+    }
+}
 
 extern "C" {
 
-/// Launch amplitude encoding kernel (skeleton implementation)
-///
-/// TODO: Full implementation with:
-/// - Parallel normalization kernel
-/// - Coalesced memory access patterns
-/// - Warp-level optimizations
-/// - Stream support for async execution
-///
-/// For now, this returns success to allow Core compilation.
+/// Launch amplitude encoding kernel
 ///
 /// # Arguments
 /// * input_d - Device pointer to input data (raw; normalized in-kernel using norm)
@@ -60,26 +87,27 @@ int launch_amplitude_encode(
     double norm,
     cudaStream_t stream
 ) {
-    // Skeleton implementation - ensures FFI linkage is correct
-    // This allows the project to compile and pass CI/CD checks.
-    //
-    // TODO: Implement full CUDA kernel:
-    // 1. Kernel launch with optimal grid/block dimensions
-    // 2. Parallel normalization and complex number construction
-    // 3. Zero-padding for unused state vector elements
-    // 4. Error checking and stream synchronization
-
-    // Suppress unused parameter warnings (parameters will be used in full implementation)
-    (void)input_d;
-    (void)state_d;
-    (void)input_len;
-    (void)state_len;
-    (void)norm;
-    (void)stream;
-
-    // For now, just return success
-    // TODO: Launch actual kernel here
-    return cudaSuccess;
+    if (norm <= 0.0) {
+        return cudaErrorInvalidValue;
+    }
+
+    double inv_norm = 1.0 / norm;
+
+    cuDoubleComplex* state_complex_d = static_cast<cuDoubleComplex*>(state_d);
+
+    const int blockSize = 256;
+    // Halve the grid size because each thread now processes 2 elements
+    const int gridSize = (state_len / 2 + blockSize - 1) / blockSize;
+
+    amplitude_encode_kernel<<<gridSize, blockSize, 0, stream>>>(
+        input_d,
+        state_complex_d,
+        input_len,
+        state_len,
+        inv_norm // Pass reciprocal
+    );
+
+    return (int)cudaGetLastError();
 }
 
 // TODO: Future encoding methods:
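
With blockSize = 256 and two amplitudes per thread, a 1024-element state needs (1024/2 + 255)/256 = 2 blocks of 256 threads. Below is a minimal host-side sketch of driving the exported entry point from C++; the signature is taken from the diff above, but the buffer setup, norm computation, and error handling are illustrative assumptions, not code from this commit:

#include <cuda_runtime.h>
#include <cuComplex.h>
#include <cmath>
#include <cstdio>
#include <vector>

// Exported by qdp-kernels (see launch_amplitude_encode above).
extern "C" int launch_amplitude_encode(
    const double* input_d, void* state_d,
    size_t input_len, size_t state_len,
    double norm, cudaStream_t stream);

int main() {
    // Same fixture as the basic test: [3, 4] -> [0.6, 0.8, 0, 0]
    std::vector<double> input = {3.0, 4.0};
    const size_t state_len = 4;  // 2 qubits

    // Host computes the l2 norm; the kernel applies its reciprocal.
    double norm = 0.0;
    for (double v : input) norm += v * v;
    norm = std::sqrt(norm);  // 5.0

    double* input_d = nullptr;
    cuDoubleComplex* state_d = nullptr;
    cudaMalloc(&input_d, input.size() * sizeof(double));
    cudaMalloc(&state_d, state_len * sizeof(cuDoubleComplex));
    cudaMemcpy(input_d, input.data(), input.size() * sizeof(double),
               cudaMemcpyHostToDevice);
    cudaMemset(state_d, 0, state_len * sizeof(cuDoubleComplex));

    // Default stream; the launcher returns a cudaError_t as int.
    int rc = launch_amplitude_encode(input_d, state_d, input.size(),
                                     state_len, norm, nullptr);
    if (rc != cudaSuccess) {
        std::printf("launch failed: %d\n", rc);
        return 1;
    }

    std::vector<cuDoubleComplex> state(state_len);
    cudaMemcpy(state.data(), state_d, state_len * sizeof(cuDoubleComplex),
               cudaMemcpyDeviceToHost);  // synchronizes the default stream

    for (size_t i = 0; i < state_len; ++i)
        std::printf("state[%zu] = %.3f\n", i, cuCreal(state[i]));

    cudaFree(input_d);
    cudaFree(state_d);
    return 0;
}
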
diff --git a/qdp/qdp-kernels/tests/amplitude_encode.rs b/qdp/qdp-kernels/tests/amplitude_encode.rs
new file mode 100644
index 000000000..2ac125c3e
--- /dev/null
+++ b/qdp/qdp-kernels/tests/amplitude_encode.rs
@@ -0,0 +1,460 @@
+//
+// Licensed to the Apache Software Foundation (ASF) under one or more
+// contributor license agreements.  See the NOTICE file distributed with
+// this work for additional information regarding copyright ownership.
+// The ASF licenses this file to You under the Apache License, Version 2.0
+// (the "License"); you may not use this file except in compliance with
+// the License.  You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Tests for amplitude encoding CUDA kernel
+
+#[cfg(target_os = "linux")]
+use cudarc::driver::{CudaDevice, DevicePtr, DevicePtrMut};
+#[cfg(target_os = "linux")]
+use qdp_kernels::{CuDoubleComplex, launch_amplitude_encode};
+
+const EPSILON: f64 = 1e-10;
+
+#[test]
+#[cfg(target_os = "linux")]
+fn test_amplitude_encode_basic() {
+    println!("Testing basic amplitude encoding...");
+
+    // Initialize CUDA device
+    let device = match CudaDevice::new(0) {
+        Ok(d) => d,
+        Err(_) => {
+            println!("SKIP: No CUDA device available");
+            return;
+        }
+    };
+
+    // Test input: [3.0, 4.0] -> normalized to [0.6, 0.8]
+    let input = vec![3.0, 4.0];
+    let norm = (3.0_f64.powi(2) + 4.0_f64.powi(2)).sqrt(); // 5.0
+    let state_len = 4; // 2 qubits
+
+    // Allocate device memory
+    let input_d = device.htod_copy(input.clone()).unwrap();
+    let mut state_d = device.alloc_zeros::<CuDoubleComplex>(state_len).unwrap();
+
+    // Launch kernel
+    let result = unsafe {
+        launch_amplitude_encode(
+            *input_d.device_ptr() as *const f64,
+            *state_d.device_ptr_mut() as *mut std::ffi::c_void,
+            input.len(),
+            state_len,
+            norm,
+            std::ptr::null_mut(),
+        )
+    };
+
+    assert_eq!(result, 0, "Kernel launch should succeed");
+
+    // Copy result back
+    let state_h = device.dtoh_sync_copy(&state_d).unwrap();
+
+    // Verify normalization: [0.6, 0.8, 0.0, 0.0]
+    assert!(
+        (state_h[0].x - 0.6).abs() < EPSILON,
+        "First element should be 0.6"
+    );
+    assert!(
+        (state_h[0].y).abs() < EPSILON,
+        "First element imaginary should be 0"
+    );
+    assert!(
+        (state_h[1].x - 0.8).abs() < EPSILON,
+        "Second element should be 0.8"
+    );
+    assert!(
+        (state_h[1].y).abs() < EPSILON,
+        "Second element imaginary should be 0"
+    );
+    assert!((state_h[2].x).abs() < EPSILON, "Third element should be 0");
+    assert!((state_h[3].x).abs() < EPSILON, "Fourth element should be 0");
+
+    // Verify state is normalized
+    let total_prob: f64 = state_h.iter().map(|c| c.x * c.x + c.y * c.y).sum();
+    assert!(
+        (total_prob - 1.0).abs() < EPSILON,
+        "Total probability should be 1.0"
+    );
+
+    println!("PASS: Basic amplitude encoding works correctly");
+}
+
+#[test]
+#[cfg(target_os = "linux")]
+fn test_amplitude_encode_power_of_two() {
+    println!("Testing amplitude encoding with power-of-two input...");
+
+    let device = match CudaDevice::new(0) {
+        Ok(d) => d,
+        Err(_) => {
+            println!("SKIP: No CUDA device available");
+            return;
+        }
+    };
+
+    // Test with 8 input values (fills 3-qubit state)
+    let input: Vec<f64> = (1..=8).map(|x| x as f64).collect();
+    let norm: f64 = input.iter().map(|x| x * x).sum::<f64>().sqrt();
+    let state_len = 8;
+
+    let input_d = device.htod_copy(input.clone()).unwrap();
+    let mut state_d = device.alloc_zeros::<CuDoubleComplex>(state_len).unwrap();
+
+    let result = unsafe {
+        launch_amplitude_encode(
+            *input_d.device_ptr() as *const f64,
+            *state_d.device_ptr_mut() as *mut std::ffi::c_void,
+            input.len(),
+            state_len,
+            norm,
+            std::ptr::null_mut(),
+        )
+    };
+
+    assert_eq!(result, 0, "Kernel launch should succeed");
+
+    let state_h = device.dtoh_sync_copy(&state_d).unwrap();
+
+    // Verify all elements are correctly normalized
+    for i in 0..state_len {
+        let expected = input[i] / norm;
+        assert!(
+            (state_h[i].x - expected).abs() < EPSILON,
+            "Element {} should be {}, got {}",
+            i,
+            expected,
+            state_h[i].x
+        );
+        assert!((state_h[i].y).abs() < EPSILON, "Imaginary part should be 0");
+    }
+
+    // Verify normalization
+    let total_prob: f64 = state_h.iter().map(|c| c.x * c.x + c.y * c.y).sum();
+    assert!(
+        (total_prob - 1.0).abs() < EPSILON,
+        "Total probability should be 1.0"
+    );
+
+    println!("PASS: Power-of-two input encoding works correctly");
+}
+
+#[test]
+#[cfg(target_os = "linux")]
+fn test_amplitude_encode_odd_input_length() {
+    println!("Testing amplitude encoding with odd input length...");
+
+    let device = match CudaDevice::new(0) {
+        Ok(d) => d,
+        Err(_) => {
+            println!("SKIP: No CUDA device available");
+            return;
+        }
+    };
+
+    // Test with 3 input values, state size 4
+    let input = vec![1.0, 2.0, 2.0];
+    let norm = (1.0_f64 + 4.0 + 4.0).sqrt(); // 3.0
+    let state_len = 4;
+
+    let input_d = device.htod_copy(input.clone()).unwrap();
+    let mut state_d = device.alloc_zeros::<CuDoubleComplex>(state_len).unwrap();
+
+    let result = unsafe {
+        launch_amplitude_encode(
+            *input_d.device_ptr() as *const f64,
+            *state_d.device_ptr_mut() as *mut std::ffi::c_void,
+            input.len(),
+            state_len,
+            norm,
+            std::ptr::null_mut(),
+        )
+    };
+
+    assert_eq!(result, 0, "Kernel launch should succeed");
+
+    let state_h = device.dtoh_sync_copy(&state_d).unwrap();
+
+    // Verify: [1/3, 2/3, 2/3, 0]
+    assert!((state_h[0].x - 1.0 / 3.0).abs() < EPSILON);
+    assert!((state_h[1].x - 2.0 / 3.0).abs() < EPSILON);
+    assert!((state_h[2].x - 2.0 / 3.0).abs() < EPSILON);
+    assert!(
+        (state_h[3].x).abs() < EPSILON,
+        "Fourth element should be padded with 0"
+    );
+
+    println!("PASS: Odd input length handled correctly");
+}
+
+#[test]
+#[cfg(target_os = "linux")]
+fn test_amplitude_encode_large_state() {
+    println!("Testing amplitude encoding with large state vector...");
+
+    let device = match CudaDevice::new(0) {
+        Ok(d) => d,
+        Err(_) => {
+            println!("SKIP: No CUDA device available");
+            return;
+        }
+    };
+
+    // Test with 1024 elements (10 qubits)
+    let input_len = 1024;
+    let input: Vec<f64> = (0..input_len).map(|i| (i + 1) as f64).collect();
+    let norm: f64 = input.iter().map(|x| x * x).sum::<f64>().sqrt();
+    let state_len = 1024;
+
+    let input_d = device.htod_copy(input.clone()).unwrap();
+    let mut state_d = device.alloc_zeros::<CuDoubleComplex>(state_len).unwrap();
+
+    let result = unsafe {
+        launch_amplitude_encode(
+            *input_d.device_ptr() as *const f64,
+            *state_d.device_ptr_mut() as *mut std::ffi::c_void,
+            input.len(),
+            state_len,
+            norm,
+            std::ptr::null_mut(),
+        )
+    };
+
+    assert_eq!(result, 0, "Kernel launch should succeed");
+
+    let state_h = device.dtoh_sync_copy(&state_d).unwrap();
+
+    // Spot check a few values
+    for i in [0, 100, 500, 1023] {
+        let expected = input[i] / norm;
+        assert!(
+            (state_h[i].x - expected).abs() < EPSILON,
+            "Element {} mismatch",
+            i
+        );
+    }
+
+    // Verify normalization
+    let total_prob: f64 = state_h.iter().map(|c| c.x * c.x + c.y * c.y).sum();
+    assert!(
+        (total_prob - 1.0).abs() < EPSILON,
+        "Total probability should be 1.0"
+    );
+
+    println!("PASS: Large state vector encoding works correctly");
+}
+
+#[test]
+#[cfg(target_os = "linux")]
+fn test_amplitude_encode_zero_norm_error() {
+    println!("Testing amplitude encoding with zero norm (error case)...");
+
+    let device = match CudaDevice::new(0) {
+        Ok(d) => d,
+        Err(_) => {
+            println!("SKIP: No CUDA device available");
+            return;
+        }
+    };
+
+    let input = vec![0.0, 0.0, 0.0];
+    let norm = 0.0; // Invalid!
+    let state_len = 4;
+
+    let input_d = device.htod_copy(input).unwrap();
+    let mut state_d = device.alloc_zeros::<CuDoubleComplex>(state_len).unwrap();
+
+    let result = unsafe {
+        launch_amplitude_encode(
+            *input_d.device_ptr() as *const f64,
+            *state_d.device_ptr_mut() as *mut std::ffi::c_void,
+            3,
+            state_len,
+            norm,
+            std::ptr::null_mut(),
+        )
+    };
+
+    // Should return CUDA error code for invalid value
+    assert_ne!(result, 0, "Should reject zero norm");
+    println!(
+        "PASS: Zero norm correctly rejected with error code {}",
+        result
+    );
+}
+
+#[test]
+#[cfg(target_os = "linux")]
+fn test_amplitude_encode_negative_norm_error() {
+    println!("Testing amplitude encoding with negative norm (error case)...");
+
+    let device = match CudaDevice::new(0) {
+        Ok(d) => d,
+        Err(_) => {
+            println!("SKIP: No CUDA device available");
+            return;
+        }
+    };
+
+    let input = vec![1.0, 2.0, 3.0];
+    let norm = -5.0; // Invalid!
+    let state_len = 4;
+
+    let input_d = device.htod_copy(input).unwrap();
+    let mut state_d = device.alloc_zeros::<CuDoubleComplex>(state_len).unwrap();
+
+    let result = unsafe {
+        launch_amplitude_encode(
+            *input_d.device_ptr() as *const f64,
+            *state_d.device_ptr_mut() as *mut std::ffi::c_void,
+            3,
+            state_len,
+            norm,
+            std::ptr::null_mut(),
+        )
+    };
+
+    // Should return CUDA error code for invalid value
+    assert_ne!(result, 0, "Should reject negative norm");
+    println!(
+        "PASS: Negative norm correctly rejected with error code {}",
+        result
+    );
+}
+
+#[test]
+#[cfg(target_os = "linux")]
+fn test_amplitude_encode_vectorized_load() {
+    println!("Testing vectorized double2 memory access optimization...");
+
+    let device = match CudaDevice::new(0) {
+        Ok(d) => d,
+        Err(_) => {
+            println!("SKIP: No CUDA device available");
+            return;
+        }
+    };
+
+    // Use exactly 16 elements to test vectorized loads (8 threads * 2 elements each)
+    let input: Vec<f64> = (1..=16).map(|x| x as f64).collect();
+    let norm: f64 = input.iter().map(|x| x * x).sum::<f64>().sqrt();
+    let state_len = 16;
+
+    let input_d = device.htod_copy(input.clone()).unwrap();
+    let mut state_d = device.alloc_zeros::<CuDoubleComplex>(state_len).unwrap();
+
+    let result = unsafe {
+        launch_amplitude_encode(
+            *input_d.device_ptr() as *const f64,
+            *state_d.device_ptr_mut() as *mut std::ffi::c_void,
+            input.len(),
+            state_len,
+            norm,
+            std::ptr::null_mut(),
+        )
+    };
+
+    assert_eq!(result, 0, "Kernel launch should succeed");
+
+    let state_h = device.dtoh_sync_copy(&state_d).unwrap();
+
+    // Verify all elements processed correctly through vectorized loads
+    for i in 0..state_len {
+        let expected = input[i] / norm;
+        assert!(
+            (state_h[i].x - expected).abs() < EPSILON,
+            "Vectorized load: element {} should be {}, got {}",
+            i,
+            expected,
+            state_h[i].x
+        );
+    }
+
+    println!("PASS: Vectorized memory access works correctly");
+}
+
+#[test]
+#[cfg(target_os = "linux")]
+fn test_amplitude_encode_small_input_large_state() {
+    println!("Testing small input with large state vector...");
+
+    let device = match CudaDevice::new(0) {
+        Ok(d) => d,
+        Err(_) => {
+            println!("SKIP: No CUDA device available");
+            return;
+        }
+    };
+
+    // Only 2 input values, but 16-element state (padding with zeros)
+    let input = vec![3.0, 4.0];
+    let norm = 5.0;
+    let state_len = 16;
+
+    let input_d = device.htod_copy(input.clone()).unwrap();
+    let mut state_d = device.alloc_zeros::<CuDoubleComplex>(state_len).unwrap();
+
+    let result = unsafe {
+        launch_amplitude_encode(
+            *input_d.device_ptr() as *const f64,
+            *state_d.device_ptr_mut() as *mut std::ffi::c_void,
+            input.len(),
+            state_len,
+            norm,
+            std::ptr::null_mut(),
+        )
+    };
+
+    assert_eq!(result, 0, "Kernel launch should succeed");
+
+    let state_h = device.dtoh_sync_copy(&state_d).unwrap();
+
+    // First two elements should be normalized values
+    assert!((state_h[0].x - 0.6).abs() < EPSILON);
+    assert!((state_h[1].x - 0.8).abs() < EPSILON);
+
+    // Rest should be zero
+    for i in 2..state_len {
+        assert!(
+            state_h[i].x.abs() < EPSILON && state_h[i].y.abs() < EPSILON,
+            "Element {} should be zero-padded",
+            i
+        );
+    }
+
+    println!("PASS: Small input with large state padding works correctly");
+}
+
+#[test]
+#[cfg(not(target_os = "linux"))]
+fn test_amplitude_encode_dummy_non_linux() {
+    println!("Testing dummy implementation on non-Linux platform...");
+
+    // The dummy implementation should return error code 999
+    let result = unsafe {
+        qdp_kernels::launch_amplitude_encode(
+            std::ptr::null(),
+            std::ptr::null_mut(),
+            0,
+            0,
+            1.0,
+            std::ptr::null_mut(),
+        )
+    };
+
+    assert_eq!(result, 999, "Dummy implementation should return 999");
+    println!("PASS: Non-Linux dummy implementation returns expected error 
code");
+}
