(mahout) branch main updated: fix: misaligned vector loads for odd-length samples in batch amplitude and norm kernels (#1108)

hcr Tue, 03 Mar 2026 22:42:17 -0800

This is an automated email from the ASF dual-hosted git repository.

hcr pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/mahout.git



The following commit(s) were added to refs/heads/main by this push:
     new cedcc7622 fix: misaligned vector loads for odd-length samples in batch 
amplitude and norm kernels (#1108)
cedcc7622 is described below

commit cedcc762286f8bc7c3ba7569a5deb4e5e15b8bba
Author: Vic Wen <[email protected]>
AuthorDate: Wed Mar 4 14:41:58 2026 +0800

    fix: misaligned vector loads for odd-length samples in batch amplitude and 
norm kernels (#1108)
---
 qdp/qdp-kernels/src/amplitude.cu          |  73 ++++++++----
 qdp/qdp-kernels/tests/amplitude_encode.rs | 185 +++++++++++++++++++++++++++++-
 2 files changed, 235 insertions(+), 23 deletions(-)

diff --git a/qdp/qdp-kernels/src/amplitude.cu b/qdp/qdp-kernels/src/amplitude.cu
index 67a863eaa..224e6bcd6 100644
--- a/qdp/qdp-kernels/src/amplitude.cu
+++ b/qdp/qdp-kernels/src/amplitude.cu
@@ -20,6 +20,7 @@
 #include <cuComplex.h>
 #include <vector_types.h>
 #include <math.h>
+#include <stdint.h>
 #include "kernel_config.h"
 
 __global__ void amplitude_encode_kernel(
@@ -231,10 +232,10 @@ int launch_amplitude_encode_f32(
 /// - state_batch: [sample0_state | sample1_state | ... | sampleN_state]
 ///
 /// Optimizations:
-/// 1. Vectorized double2 loads for 128-bit memory transactions
+/// 1. Vectorized double2 loads for 128-bit memory transactions when aligned
 /// 2. Grid-stride loop for arbitrary batch sizes
 /// 3. Coalesced memory access within warps
-/// 4. Minimized register pressure
+/// 4. Scalar fallback for misaligned sample bases and odd tails
 __global__ void amplitude_encode_batch_kernel(
     const double* __restrict__ input_batch,
     cuDoubleComplex* __restrict__ state_batch,
@@ -264,17 +265,21 @@ __global__ void amplitude_encode_batch_kernel(
         // Load inverse norm (cached by L1)
         const double inv_norm = inv_norms[sample_idx];
 
-        // Vectorized load: read 2 doubles as double2 for 128-bit transaction
         double v1, v2;
-        if (elem_offset + 1 < input_len) {
-            // Aligned vectorized load
-            const double2 vec_data = __ldg(reinterpret_cast<const 
double2*>(input_batch + input_base) + elem_pair);
+        const double* sample_input = input_batch + input_base;
+        const bool sample_input_aligned =
+            (reinterpret_cast<uintptr_t>(sample_input) & (alignof(double2) - 
1)) == 0;
+
+        if (sample_input_aligned && elem_offset + 1 < input_len) {
+            const double2 vec_data =
+                __ldg(reinterpret_cast<const double2*>(sample_input) + 
elem_pair);
             v1 = vec_data.x;
             v2 = vec_data.y;
         } else if (elem_offset < input_len) {
-            // Edge case: single element load
-            v1 = __ldg(input_batch + input_base + elem_offset);
-            v2 = 0.0;
+            v1 = __ldg(sample_input + elem_offset);
+            v2 = (elem_offset + 1 < input_len)
+                ? __ldg(sample_input + elem_offset + 1)
+                : 0.0;
         } else {
             // Padding region
             v1 = v2 = 0.0;
@@ -432,20 +437,33 @@ __global__ void l2_norm_batch_kernel(
 
     const size_t vec_idx = block_in_sample * blockDim.x + threadIdx.x;
     const size_t stride = blockDim.x * blocks_per_sample;
+    const double* sample_input = input_batch + base;
+    const bool sample_input_aligned =
+        (reinterpret_cast<uintptr_t>(sample_input) & (alignof(double2) - 1)) 
== 0;
 
     double local_sum = 0.0;
 
     size_t vec_offset = vec_idx;
     size_t offset = vec_offset * 2;
-    while (offset + 1 < sample_len) {
-        const double2 v = __ldg(reinterpret_cast<const double2*>(input_batch + 
base) + vec_offset);
-        local_sum += v.x * v.x + v.y * v.y;
-        vec_offset += stride;
-        offset = vec_offset * 2;
+    if (sample_input_aligned) {
+        while (offset + 1 < sample_len) {
+            const double2 v = __ldg(reinterpret_cast<const 
double2*>(sample_input) + vec_offset);
+            local_sum += v.x * v.x + v.y * v.y;
+            vec_offset += stride;
+            offset = vec_offset * 2;
+        }
+    } else {
+        while (offset + 1 < sample_len) {
+            const double v1 = __ldg(sample_input + offset);
+            const double v2 = __ldg(sample_input + offset + 1);
+            local_sum += v1 * v1 + v2 * v2;
+            vec_offset += stride;
+            offset = vec_offset * 2;
+        }
     }
 
     if (offset < sample_len) {
-        const double v = __ldg(input_batch + base + offset);
+        const double v = __ldg(sample_input + offset);
         local_sum += v * v;
     }
 
@@ -472,20 +490,33 @@ __global__ void l2_norm_batch_kernel_f32(
 
     const size_t vec_idx = block_in_sample * blockDim.x + threadIdx.x;
     const size_t stride = blockDim.x * blocks_per_sample;
+    const float* sample_input = input_batch + base;
+    const bool sample_input_aligned =
+        (reinterpret_cast<uintptr_t>(sample_input) & (alignof(float2) - 1)) == 
0;
 
     float local_sum = 0.0f;
 
     size_t vec_offset = vec_idx;
     size_t offset = vec_offset * 2;
-    while (offset + 1 < sample_len) {
-        const float2 v = __ldg(reinterpret_cast<const float2*>(input_batch + 
base) + vec_offset);
-        local_sum += v.x * v.x + v.y * v.y;
-        vec_offset += stride;
-        offset = vec_offset * 2;
+    if (sample_input_aligned) {
+        while (offset + 1 < sample_len) {
+            const float2 v = __ldg(reinterpret_cast<const 
float2*>(sample_input) + vec_offset);
+            local_sum += v.x * v.x + v.y * v.y;
+            vec_offset += stride;
+            offset = vec_offset * 2;
+        }
+    } else {
+        while (offset + 1 < sample_len) {
+            const float v1 = __ldg(sample_input + offset);
+            const float v2 = __ldg(sample_input + offset + 1);
+            local_sum += v1 * v1 + v2 * v2;
+            vec_offset += stride;
+            offset = vec_offset * 2;
+        }
     }
 
     if (offset < sample_len) {
-        const float v = __ldg(input_batch + base + offset);
+        const float v = __ldg(sample_input + offset);
         local_sum += v * v;
     }
 
diff --git a/qdp/qdp-kernels/tests/amplitude_encode.rs 
b/qdp/qdp-kernels/tests/amplitude_encode.rs
index 579a4e7ec..53f91505f 100644
--- a/qdp/qdp-kernels/tests/amplitude_encode.rs
+++ b/qdp/qdp-kernels/tests/amplitude_encode.rs
@@ -25,13 +25,50 @@
 use cudarc::driver::{CudaDevice, DevicePtr, DevicePtrMut};
 #[cfg(target_os = "linux")]
 use qdp_kernels::{
-    CuComplex, CuDoubleComplex, launch_amplitude_encode, 
launch_amplitude_encode_f32,
-    launch_l2_norm, launch_l2_norm_batch, launch_l2_norm_batch_f32, 
launch_l2_norm_f32,
+    CuComplex, CuDoubleComplex, launch_amplitude_encode, 
launch_amplitude_encode_batch,
+    launch_amplitude_encode_f32, launch_l2_norm, launch_l2_norm_batch, 
launch_l2_norm_batch_f32,
+    launch_l2_norm_f32,
 };
 
 const EPSILON: f64 = 1e-10;
 const EPSILON_F32: f32 = 1e-5;
 
+#[cfg(target_os = "linux")]
+fn assert_batch_state_matches_f64(
+    state_h: &[CuDoubleComplex],
+    input: &[f64],
+    num_samples: usize,
+    sample_len: usize,
+    state_len: usize,
+) {
+    for sample_idx in 0..num_samples {
+        let sample = &input[sample_idx * sample_len..(sample_idx + 1) * 
sample_len];
+        let norm = sample.iter().map(|v| v * v).sum::<f64>().sqrt();
+        for elem_idx in 0..state_len {
+            let expected = if elem_idx < sample_len {
+                sample[elem_idx] / norm
+            } else {
+                0.0
+            };
+            let actual = state_h[sample_idx * state_len + elem_idx];
+            assert!(
+                (actual.x - expected).abs() < EPSILON,
+                "sample {} element {} expected {}, got {}",
+                sample_idx,
+                elem_idx,
+                expected,
+                actual.x
+            );
+            assert!(
+                actual.y.abs() < EPSILON,
+                "sample {} element {} imaginary should be 0",
+                sample_idx,
+                elem_idx
+            );
+        }
+    }
+}
+
 #[test]
 #[cfg(target_os = "linux")]
 fn test_amplitude_encode_basic() {
@@ -574,6 +611,52 @@ fn test_amplitude_encode_small_input_large_state() {
     println!("PASS: Small input with large state padding works correctly");
 }
 
+#[test]
+#[cfg(target_os = "linux")]
+fn test_amplitude_encode_batch_odd_sample_length_handles_misaligned_samples() {
+    println!("Testing batch amplitude encoding with odd sample length 
(float64)...");
+
+    let device = match CudaDevice::new(0) {
+        Ok(d) => d,
+        Err(_) => {
+            println!("SKIP: No CUDA device available");
+            return;
+        }
+    };
+
+    let num_samples = 2usize;
+    let sample_len = 3usize;
+    let state_len = 4usize;
+    let input: Vec<f64> = vec![1.0, 2.0, 2.0, 2.0, 1.0, 2.0];
+    let inv_norms: Vec<f64> = input
+        .chunks(sample_len)
+        .map(|sample| 1.0 / sample.iter().map(|v| v * v).sum::<f64>().sqrt())
+        .collect();
+
+    let input_d = device.htod_sync_copy(input.as_slice()).unwrap();
+    let inv_norms_d = device.htod_sync_copy(inv_norms.as_slice()).unwrap();
+    let mut state_d = device
+        .alloc_zeros::<CuDoubleComplex>(num_samples * state_len)
+        .unwrap();
+
+    let result = unsafe {
+        launch_amplitude_encode_batch(
+            *input_d.device_ptr() as *const f64,
+            *state_d.device_ptr_mut() as *mut std::ffi::c_void,
+            *inv_norms_d.device_ptr() as *const f64,
+            num_samples,
+            sample_len,
+            state_len,
+            std::ptr::null_mut(),
+        )
+    };
+
+    assert_eq!(result, 0, "Batch kernel launch should succeed");
+
+    let state_h = device.dtoh_sync_copy(&state_d).unwrap();
+    assert_batch_state_matches_f64(&state_h, &input, num_samples, sample_len, 
state_len);
+}
+
 #[test]
 #[cfg(target_os = "linux")]
 fn test_l2_norm_single_kernel() {
@@ -671,6 +754,55 @@ fn test_l2_norm_batch_kernel_stream() {
     println!("PASS: Batched norm reduction on stream matches CPU");
 }
 
+#[test]
+#[cfg(target_os = "linux")]
+fn test_l2_norm_batch_kernel_odd_sample_len() {
+    println!("Testing batched L2 norm reduction with odd sample length 
(float64)...");
+
+    let device = match CudaDevice::new(0) {
+        Ok(d) => d,
+        Err(_) => {
+            println!("SKIP: No CUDA device available");
+            return;
+        }
+    };
+
+    let sample_len = 3usize;
+    let num_samples = 2usize;
+    let input: Vec<f64> = vec![1.0, 2.0, 2.0, 2.0, 1.0, 2.0];
+    let expected: Vec<f64> = input
+        .chunks(sample_len)
+        .map(|chunk| 1.0 / chunk.iter().map(|v| v * v).sum::<f64>().sqrt())
+        .collect();
+
+    let input_d = device.htod_sync_copy(input.as_slice()).unwrap();
+    let mut norms_d = device.alloc_zeros::<f64>(num_samples).unwrap();
+
+    let status = unsafe {
+        launch_l2_norm_batch(
+            *input_d.device_ptr() as *const f64,
+            num_samples,
+            sample_len,
+            *norms_d.device_ptr_mut() as *mut f64,
+            std::ptr::null_mut(),
+        )
+    };
+
+    assert_eq!(status, 0, "Batch norm kernel should succeed");
+    device.synchronize().unwrap();
+
+    let norms_h = device.dtoh_sync_copy(&norms_d).unwrap();
+    for (i, (got, expect)) in norms_h.iter().zip(expected.iter()).enumerate() {
+        assert!(
+            (got - expect).abs() < EPSILON,
+            "Sample {} inv norm mismatch: expected {}, got {}",
+            i,
+            expect,
+            got
+        );
+    }
+}
+
 #[test]
 #[cfg(target_os = "linux")]
 fn test_l2_norm_batch_kernel_zero_num_samples() {
@@ -840,6 +972,55 @@ fn test_l2_norm_batch_kernel_f32() {
     println!("PASS: Batched norm reduction (float32) matches CPU");
 }
 
+#[test]
+#[cfg(target_os = "linux")]
+fn test_l2_norm_batch_kernel_f32_odd_sample_len() {
+    println!("Testing batched L2 norm reduction with odd sample length 
(float32)...");
+
+    let device = match CudaDevice::new(0) {
+        Ok(d) => d,
+        Err(_) => {
+            println!("SKIP: No CUDA device available");
+            return;
+        }
+    };
+
+    let sample_len = 3usize;
+    let num_samples = 2usize;
+    let input: Vec<f32> = vec![1.0, 2.0, 2.0, 2.0, 1.0, 2.0];
+    let expected: Vec<f32> = input
+        .chunks(sample_len)
+        .map(|chunk| 1.0 / chunk.iter().map(|v| v * v).sum::<f32>().sqrt())
+        .collect();
+
+    let input_d = device.htod_sync_copy(input.as_slice()).unwrap();
+    let mut norms_d = device.alloc_zeros::<f32>(num_samples).unwrap();
+
+    let status = unsafe {
+        launch_l2_norm_batch_f32(
+            *input_d.device_ptr() as *const f32,
+            num_samples,
+            sample_len,
+            *norms_d.device_ptr_mut() as *mut f32,
+            std::ptr::null_mut(),
+        )
+    };
+
+    assert_eq!(status, 0, "Batch norm f32 kernel should succeed");
+    device.synchronize().unwrap();
+
+    let norms_h = device.dtoh_sync_copy(&norms_d).unwrap();
+    for (i, (got, expect)) in norms_h.iter().zip(expected.iter()).enumerate() {
+        assert!(
+            (got - expect).abs() < EPSILON_F32,
+            "Sample {} inv norm mismatch: expected {}, got {}",
+            i,
+            expect,
+            got
+        );
+    }
+}
+
 #[test]
 #[cfg(target_os = "linux")]
 fn test_l2_norm_batch_kernel_zero_num_samples_f32() {

(mahout) branch main updated: fix: misaligned vector loads for odd-length samples in batch amplitude and norm kernels (#1108)

Reply via email to