rich7420 commented on code in PR #918:
URL: https://github.com/apache/mahout/pull/918#discussion_r2724245973
##########
qdp/qdp-kernels/src/amplitude.cu:
##########
@@ -391,6 +455,46 @@ __global__ void l2_norm_batch_kernel(
}
}
+/// Kernel: accumulate L2 norms for a batch (float32).
+/// Grid is organized as (blocks_per_sample * num_samples) blocks.
+__global__ void l2_norm_batch_kernel_f32(
+    const float* __restrict__ input_batch,
+    size_t num_samples,
+    size_t sample_len,
+    size_t blocks_per_sample,
+    float* __restrict__ out_norms
+) {
+    const size_t sample_idx = blockIdx.x / blocks_per_sample;
+    if (sample_idx >= num_samples) return;
+
+    const size_t block_in_sample = blockIdx.x % blocks_per_sample;
+    const size_t base = sample_idx * sample_len;
+
+    const size_t vec_idx = block_in_sample * blockDim.x + threadIdx.x;
+    const size_t stride = blockDim.x * blocks_per_sample;
+
+    float local_sum = 0.0f;
+
+    size_t vec_offset = vec_idx;
+    size_t offset = vec_offset * 2;
+    while (offset + 1 < sample_len) {
+        const float2 v = __ldg(reinterpret_cast<const float2*>(input_batch + base) + vec_offset);
+        local_sum += v.x * v.x + v.y * v.y;
Review Comment:
input_batch + base is a float*, and float2 loads require 8-byte alignment.
If sample_len is odd, then base = sample_idx * sample_len is odd for every odd
sample_idx, so the address is offset by 4 bytes and the
reinterpret_cast<const float2*> load is misaligned. That may trigger a
misaligned memory access or force less efficient load paths on some GPUs.
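One possible shape for the fix, as a rough sketch rather than a drop-in patch: keep the float2 fast path only when the per-sample base offset is even (assuming input_batch itself is 8-byte aligned, which holds for cudaMalloc allocations), and fall back to scalar loads otherwise. The vec_idx/stride names below are the kernel's own; the branch structure and tail handling are my assumption, not this PR's code.

```cuda
// Sketch only: dispatch on alignment of the per-sample base pointer.
const float* sample = input_batch + base;
float local_sum = 0.0f;

if ((base & 1) == 0) {
    // Even float offset into an 8-byte-aligned buffer => aligned float2*.
    const float2* sample_vec = reinterpret_cast<const float2*>(sample);
    for (size_t i = vec_idx; 2 * i + 1 < sample_len; i += stride) {
        const float2 v = __ldg(sample_vec + i);
        local_sum += v.x * v.x + v.y * v.y;
    }
    // One thread picks up the trailing element when sample_len is odd.
    if (vec_idx == 0 && (sample_len & 1) != 0) {
        const float t = __ldg(sample + sample_len - 1);
        local_sum += t * t;
    }
} else {
    // Misaligned base: plain scalar grid-stride loop, reusing vec_idx and
    // stride as a per-sample thread index and per-sample thread count.
    for (size_t i = vec_idx; i < sample_len; i += stride) {
        const float t = __ldg(sample + i);
        local_sum += t * t;
    }
}
```

The scalar branch only fires when both sample_len and sample_idx are odd, so the common case keeps the vectorized loads.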
##########
qdp/qdp-kernels/src/amplitude.cu:
##########
@@ -512,6 +677,66 @@ int launch_l2_norm_batch(
return (int)cudaGetLastError();
}
+/// Launch L2 norm reduction for a batch of vectors (float32).
+/// Writes inverse norms for each sample into `inv_norms_out_d`.
+int launch_l2_norm_batch_f32(
+    const float* input_batch_d,
+    size_t num_samples,
+    size_t sample_len,
+    float* inv_norms_out_d,
+    cudaStream_t stream
+) {
+    if (num_samples == 0 || sample_len == 0) {
+        return cudaErrorInvalidValue;
+    }
+
+    cudaError_t memset_status = cudaMemsetAsync(
+        inv_norms_out_d,
+        0,
+        num_samples * sizeof(float),
+        stream
+    );
+    if (memset_status != cudaSuccess) {
+        return memset_status;
+    }
+
+    const int blockSize = DEFAULT_BLOCK_SIZE;
+    const size_t elements_per_block = blockSize * 2; // float2 per thread
+    size_t blocks_per_sample = (sample_len + elements_per_block - 1) / elements_per_block;
+    const size_t max_blocks_per_sample = MAX_BLOCKS_PER_SAMPLE;
+    if (blocks_per_sample == 0) blocks_per_sample = 1;
+    if (blocks_per_sample > max_blocks_per_sample) {
+        blocks_per_sample = max_blocks_per_sample;
+    }
+
+    size_t gridSize = num_samples * blocks_per_sample;
+    const size_t max_grid = CUDA_MAX_GRID_DIM_1D; // CUDA grid dimension limit for 1D launch
+    if (gridSize > max_grid) {
+        blocks_per_sample = max_grid / num_samples;
+        if (blocks_per_sample == 0) {
+            blocks_per_sample = 1;
+        }
+        gridSize = num_samples * blocks_per_sample;
Review Comment:
If num_samples itself exceeds the 1D grid limit, max_grid / num_samples is 0,
so blocks_per_sample is clamped back to 1 and gridSize = num_samples still
exceeds max_grid, leading to an invalid launch. I think we could add an
explicit guard (return an error) or switch to a 2D grid / a per-block loop
over samples. WDYT?