Copilot commented on code in PR #1108:
URL: https://github.com/apache/mahout/pull/1108#discussion_r2875399684


##########
qdp/qdp-kernels/src/amplitude.cu:
##########
@@ -432,20 +435,29 @@ __global__ void l2_norm_batch_kernel(
 
     const size_t vec_idx = block_in_sample * blockDim.x + threadIdx.x;
     const size_t stride = blockDim.x * blocks_per_sample;
+    const double* sample_input = input_batch + base;
+    const bool sample_input_aligned =
+        (reinterpret_cast<uintptr_t>(sample_input) % alignof(double2)) == 0;
 
     double local_sum = 0.0;
 
     size_t vec_offset = vec_idx;
     size_t offset = vec_offset * 2;
     while (offset + 1 < sample_len) {
-        const double2 v = __ldg(reinterpret_cast<const double2*>(input_batch + base) + vec_offset);
-        local_sum += v.x * v.x + v.y * v.y;
+        if (sample_input_aligned) {
+            const double2 v = __ldg(reinterpret_cast<const double2*>(sample_input) + vec_offset);
+            local_sum += v.x * v.x + v.y * v.y;
+        } else {
+            const double v1 = __ldg(sample_input + offset);
+            const double v2 = __ldg(sample_input + offset + 1);
+            local_sum += v1 * v1 + v2 * v2;
+        }

Review Comment:
   `sample_input_aligned` is constant per-sample, but the `if 
(sample_input_aligned)` branch is inside the hot `while` loop. Consider 
splitting into two separate loops/paths (aligned vs misaligned) so the aligned 
fast path doesn’t pay the per-iteration branch overhead.



##########
qdp/qdp-kernels/src/amplitude.cu:
##########
@@ -472,20 +484,29 @@ __global__ void l2_norm_batch_kernel_f32(
 
     const size_t vec_idx = block_in_sample * blockDim.x + threadIdx.x;
     const size_t stride = blockDim.x * blocks_per_sample;
+    const float* sample_input = input_batch + base;
+    const bool sample_input_aligned =
+        (reinterpret_cast<uintptr_t>(sample_input) % alignof(float2)) == 0;
 
     float local_sum = 0.0f;
 
     size_t vec_offset = vec_idx;
     size_t offset = vec_offset * 2;
     while (offset + 1 < sample_len) {
-        const float2 v = __ldg(reinterpret_cast<const float2*>(input_batch + base) + vec_offset);
-        local_sum += v.x * v.x + v.y * v.y;
+        if (sample_input_aligned) {
+            const float2 v = __ldg(reinterpret_cast<const float2*>(sample_input) + vec_offset);
+            local_sum += v.x * v.x + v.y * v.y;
+        } else {
+            const float v1 = __ldg(sample_input + offset);
+            const float v2 = __ldg(sample_input + offset + 1);
+            local_sum += v1 * v1 + v2 * v2;
+        }

Review Comment:
   As with the f64 kernel, the `if (sample_input_aligned)` branch is inside the 
inner `while` loop even though it’s constant per sample. Consider separating 
aligned vs misaligned code paths to keep the aligned case branch-free in the 
loop.



##########
qdp/qdp-kernels/src/amplitude.cu:
##########
@@ -472,20 +484,29 @@ __global__ void l2_norm_batch_kernel_f32(
 
     const size_t vec_idx = block_in_sample * blockDim.x + threadIdx.x;
     const size_t stride = blockDim.x * blocks_per_sample;
+    const float* sample_input = input_batch + base;
+    const bool sample_input_aligned =
+        (reinterpret_cast<uintptr_t>(sample_input) % alignof(float2)) == 0;

Review Comment:
   Alignment is checked with a modulo. Since `alignof(float2)` is a 
power-of-two, prefer a bitmask alignment test to avoid generating a remainder 
operation in device code.
   ```suggestion
           (reinterpret_cast<uintptr_t>(sample_input) & (alignof(float2) - 1)) == 0;
   ```



##########
qdp/qdp-kernels/src/amplitude.cu:
##########
@@ -264,17 +265,19 @@ __global__ void amplitude_encode_batch_kernel(
         // Load inverse norm (cached by L1)
         const double inv_norm = inv_norms[sample_idx];
 
-        // Vectorized load: read 2 doubles as double2 for 128-bit transaction
+        const double* sample_input = input_batch + input_base;
+        const bool sample_input_aligned =
+            (reinterpret_cast<uintptr_t>(sample_input) % alignof(double2)) == 0;

Review Comment:
   Alignment is checked with a modulo on every iteration. Since 
`alignof(double2)` is a power-of-two, using a bitmask-style alignment check 
avoids an expensive remainder operation in device code and makes the intent 
clearer.
   ```suggestion
           const uintptr_t sample_addr = reinterpret_cast<uintptr_t>(sample_input);
           const bool sample_input_aligned =
               (sample_addr & (alignof(double2) - 1)) == 0;
   ```



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Reply via email to