Copilot commented on code in PR #1108:
URL: https://github.com/apache/mahout/pull/1108#discussion_r2875399684
##########
qdp/qdp-kernels/src/amplitude.cu:
##########
@@ -432,20 +435,29 @@ __global__ void l2_norm_batch_kernel(
const size_t vec_idx = block_in_sample * blockDim.x + threadIdx.x;
const size_t stride = blockDim.x * blocks_per_sample;
+ const double* sample_input = input_batch + base;
+ const bool sample_input_aligned =
+ (reinterpret_cast<uintptr_t>(sample_input) % alignof(double2)) == 0;
double local_sum = 0.0;
size_t vec_offset = vec_idx;
size_t offset = vec_offset * 2;
while (offset + 1 < sample_len) {
- const double2 v = __ldg(reinterpret_cast<const double2*>(input_batch +
base) + vec_offset);
- local_sum += v.x * v.x + v.y * v.y;
+ if (sample_input_aligned) {
+ const double2 v = __ldg(reinterpret_cast<const
double2*>(sample_input) + vec_offset);
+ local_sum += v.x * v.x + v.y * v.y;
+ } else {
+ const double v1 = __ldg(sample_input + offset);
+ const double v2 = __ldg(sample_input + offset + 1);
+ local_sum += v1 * v1 + v2 * v2;
+ }
Review Comment:
`sample_input_aligned` is constant per-sample, but the `if
(sample_input_aligned)` branch is inside the hot `while` loop. Consider
splitting into two separate loops/paths (aligned vs misaligned) so the aligned
fast path doesn’t pay the per-iteration branch overhead.
##########
qdp/qdp-kernels/src/amplitude.cu:
##########
@@ -472,20 +484,29 @@ __global__ void l2_norm_batch_kernel_f32(
const size_t vec_idx = block_in_sample * blockDim.x + threadIdx.x;
const size_t stride = blockDim.x * blocks_per_sample;
+ const float* sample_input = input_batch + base;
+ const bool sample_input_aligned =
+ (reinterpret_cast<uintptr_t>(sample_input) % alignof(float2)) == 0;
float local_sum = 0.0f;
size_t vec_offset = vec_idx;
size_t offset = vec_offset * 2;
while (offset + 1 < sample_len) {
- const float2 v = __ldg(reinterpret_cast<const float2*>(input_batch +
base) + vec_offset);
- local_sum += v.x * v.x + v.y * v.y;
+ if (sample_input_aligned) {
+ const float2 v = __ldg(reinterpret_cast<const
float2*>(sample_input) + vec_offset);
+ local_sum += v.x * v.x + v.y * v.y;
+ } else {
+ const float v1 = __ldg(sample_input + offset);
+ const float v2 = __ldg(sample_input + offset + 1);
+ local_sum += v1 * v1 + v2 * v2;
+ }
Review Comment:
As with the f64 kernel, the `if (sample_input_aligned)` branch is inside the
inner `while` loop even though it’s constant per sample. Consider separating
aligned vs misaligned code paths to keep the aligned case branch-free in the
loop.
##########
qdp/qdp-kernels/src/amplitude.cu:
##########
@@ -472,20 +484,29 @@ __global__ void l2_norm_batch_kernel_f32(
const size_t vec_idx = block_in_sample * blockDim.x + threadIdx.x;
const size_t stride = blockDim.x * blocks_per_sample;
+ const float* sample_input = input_batch + base;
+ const bool sample_input_aligned =
+ (reinterpret_cast<uintptr_t>(sample_input) % alignof(float2)) == 0;
Review Comment:
Alignment is checked with a modulo. Since `alignof(float2)` is a
power-of-two, prefer a bitmask alignment test to avoid generating a remainder
operation in device code.
```suggestion
(reinterpret_cast<uintptr_t>(sample_input) & (alignof(float2) - 1))
== 0;
```
##########
qdp/qdp-kernels/src/amplitude.cu:
##########
@@ -264,17 +265,19 @@ __global__ void amplitude_encode_batch_kernel(
// Load inverse norm (cached by L1)
const double inv_norm = inv_norms[sample_idx];
- // Vectorized load: read 2 doubles as double2 for 128-bit transaction
+ const double* sample_input = input_batch + input_base;
+ const bool sample_input_aligned =
+ (reinterpret_cast<uintptr_t>(sample_input) % alignof(double2)) ==
0;
Review Comment:
Alignment is checked with a modulo. Since `alignof(double2)` is a
power-of-two, using a bitmask-style alignment check avoids an expensive
remainder operation in device code and makes the intent clearer.
```suggestion
const uintptr_t sample_addr =
reinterpret_cast<uintptr_t>(sample_input);
const bool sample_input_aligned =
(sample_addr & (alignof(double2) - 1)) == 0;
```
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]