This is an automated email from the ASF dual-hosted git repository.
hcr pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/mahout.git
The following commit(s) were added to refs/heads/main by this push:
new 4438a9eeb [QDP] Fix invalid CUDA kernel launch when num_samples exceeds grid dimension limit (#968)
4438a9eeb is described below
commit 4438a9eeb621d865a19646d4dad3ddbeb3930cc8
Author: Vic Wen <[email protected]>
AuthorDate: Fri Feb 6 16:19:49 2026 +0800
[QDP] Fix invalid CUDA kernel launch when num_samples exceeds grid dimension limit (#968)
---
qdp/qdp-kernels/src/amplitude.cu | 40 +++++++++-
qdp/qdp-kernels/src/kernel_config.h | 4 +-
qdp/qdp-kernels/tests/amplitude_encode.rs | 125 ++++++++++++++++++++++++++----
3 files changed, 149 insertions(+), 20 deletions(-)
diff --git a/qdp/qdp-kernels/src/amplitude.cu b/qdp/qdp-kernels/src/amplitude.cu
index 06676e081..67a863eaa 100644
--- a/qdp/qdp-kernels/src/amplitude.cu
+++ b/qdp/qdp-kernels/src/amplitude.cu
@@ -617,6 +617,37 @@ int launch_l2_norm_f32(
return (int)cudaGetLastError();
}
+/// Returns the current device's max grid dimension (X), falling back to CUDA_MAX_GRID_DIM_1D on error.
+static size_t get_max_grid_dim_1d(void) {
+ static size_t cached_max_grid_dim_1d = 0;
+ if (cached_max_grid_dim_1d != 0) {
+ return cached_max_grid_dim_1d;
+ }
+
+ int device = -1;
+ // Fallback to CUDA_MAX_GRID_DIM_1D if error
+ if (cudaGetDevice(&device) != cudaSuccess || device < 0) {
+ cached_max_grid_dim_1d = CUDA_MAX_GRID_DIM_1D;
+ return cached_max_grid_dim_1d;
+ }
+
+ int max_x = 0;
+ cudaError_t err = cudaDeviceGetAttribute(
+ &max_x,
+ cudaDevAttrMaxGridDimX,
+ device
+ );
+
+ // Fallback to CUDA_MAX_GRID_DIM_1D if error
+ if (err != cudaSuccess || max_x <= 0) {
+ cached_max_grid_dim_1d = CUDA_MAX_GRID_DIM_1D;
+ return cached_max_grid_dim_1d;
+ }
+
+ cached_max_grid_dim_1d = (size_t)max_x;
+ return cached_max_grid_dim_1d;
+}
+
/// Launch L2 norm reduction for a batch of vectors.
/// Writes inverse norms for each sample into `inv_norms_out_d`.
int launch_l2_norm_batch(
@@ -642,6 +673,12 @@ int launch_l2_norm_batch(
const int blockSize = DEFAULT_BLOCK_SIZE;
const size_t elements_per_block = blockSize * 2; // double2 per thread
+ const size_t max_grid = get_max_grid_dim_1d();
+
+ if (num_samples > max_grid) {
+ return cudaErrorInvalidValue;
+ }
+
size_t blocks_per_sample = (sample_len + elements_per_block - 1) / elements_per_block;
const size_t max_blocks_per_sample = MAX_BLOCKS_PER_SAMPLE;
if (blocks_per_sample == 0) blocks_per_sample = 1;
@@ -650,7 +687,6 @@ int launch_l2_norm_batch(
}
size_t gridSize = num_samples * blocks_per_sample;
- const size_t max_grid = CUDA_MAX_GRID_DIM_1D; // CUDA grid dimension limit for 1D launch
if (gridSize > max_grid) {
blocks_per_sample = max_grid / num_samples;
if (blocks_per_sample == 0) {
@@ -702,7 +738,7 @@ int launch_l2_norm_batch_f32(
const int blockSize = DEFAULT_BLOCK_SIZE;
const size_t elements_per_block = blockSize * 2; // float2 per thread
- const size_t max_grid = CUDA_MAX_GRID_DIM_1D; // CUDA grid dimension limit for 1D launch
+ const size_t max_grid = get_max_grid_dim_1d();
if (num_samples > max_grid) {
return cudaErrorInvalidValue;
}
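Taken together, the amplitude.cu hunks give both batch launchers one sizing policy: reject shapes that cannot fit even at one block per sample, then clamp per-sample blocks so the flattened 1D grid stays within the device limit. A minimal standalone sketch of that policy follows (names mirror the diff; DEFAULT_BLOCK_SIZE's value is an assumed stand-in, the device query is replaced by the header constant, and the kernel launch itself is elided):

#include <cstddef>
#include <cuda_runtime.h>

#define CUDA_MAX_GRID_DIM_1D 2147483647  // kernel_config.h after this commit
#define MAX_BLOCKS_PER_SAMPLE 32         // kernel_config.h
#define DEFAULT_BLOCK_SIZE 256           // assumed value, for illustration only

// Computes the launch geometry the batch launchers use; returns a
// cudaError_t-compatible int, as the real entry points do.
static int plan_batch_grid(size_t num_samples, size_t sample_len,
                           size_t *blocks_per_sample_out, size_t *grid_size_out) {
    const size_t elements_per_block = DEFAULT_BLOCK_SIZE * 2; // double2/float2 per thread
    const size_t max_grid = CUDA_MAX_GRID_DIM_1D; // real code calls get_max_grid_dim_1d()

    // Degenerate shapes are rejected (the new tests below assert this), and
    // num_samples must fit even at one block per sample, or gridDim.x overflows.
    if (num_samples == 0 || sample_len == 0 || num_samples > max_grid) {
        return (int)cudaErrorInvalidValue;
    }

    size_t blocks_per_sample =
        (sample_len + elements_per_block - 1) / elements_per_block;
    if (blocks_per_sample == 0) blocks_per_sample = 1;
    if (blocks_per_sample > MAX_BLOCKS_PER_SAMPLE) {
        blocks_per_sample = MAX_BLOCKS_PER_SAMPLE;
    }

    size_t gridSize = num_samples * blocks_per_sample;
    if (gridSize > max_grid) {
        // Trade per-sample parallelism for a grid that fits; the up-front
        // guard guarantees max_grid / num_samples >= 1 here.
        blocks_per_sample = max_grid / num_samples;
        gridSize = num_samples * blocks_per_sample;
    }

    *blocks_per_sample_out = blocks_per_sample;
    *grid_size_out = gridSize;
    return (int)cudaSuccess;
}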
diff --git a/qdp/qdp-kernels/src/kernel_config.h b/qdp/qdp-kernels/src/kernel_config.h
index 4ce4526cb..9a5708ac3 100644
--- a/qdp/qdp-kernels/src/kernel_config.h
+++ b/qdp/qdp-kernels/src/kernel_config.h
@@ -46,9 +46,9 @@
// Limits per-sample parallelism to maintain good load balancing
#define MAX_BLOCKS_PER_SAMPLE 32
-// CUDA grid dimension limit for 1D launches
+// CUDA grid dimension limit for 1D launches (2^31 - 1, signed 32-bit int max)
// This is a hardware limitation, not a tunable parameter
-#define CUDA_MAX_GRID_DIM_1D 65535
+#define CUDA_MAX_GRID_DIM_1D 2147483647
// ============================================================================
// Qubit Limits
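The constant change above is the crux of the bug: 65535 is the hardware limit for gridDim.y and gridDim.z, but gridDim.x allows 2^31 - 1 on every device of compute capability 3.0 or later, so the old value rejected valid batches and clamped blocks_per_sample far below the real limit. A quick probe (not part of the commit) makes the asymmetry visible:

#include <cstdio>
#include <cuda_runtime.h>

int main(void) {
    // Query the per-axis grid limits for device 0; on any modern GPU this
    // prints x=2147483647 y=65535 z=65535.
    int x = 0, y = 0, z = 0;
    cudaDeviceGetAttribute(&x, cudaDevAttrMaxGridDimX, 0);
    cudaDeviceGetAttribute(&y, cudaDevAttrMaxGridDimY, 0);
    cudaDeviceGetAttribute(&z, cudaDevAttrMaxGridDimZ, 0);
    printf("max grid dims: x=%d y=%d z=%d\n", x, y, z);
    return 0;
}

get_max_grid_dim_1d() in amplitude.cu caches exactly this attribute, keeping the header constant only as a fallback for when the query fails.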
diff --git a/qdp/qdp-kernels/tests/amplitude_encode.rs b/qdp/qdp-kernels/tests/amplitude_encode.rs
index db68dd012..579a4e7ec 100644
--- a/qdp/qdp-kernels/tests/amplitude_encode.rs
+++ b/qdp/qdp-kernels/tests/amplitude_encode.rs
@@ -671,6 +671,74 @@ fn test_l2_norm_batch_kernel_stream() {
println!("PASS: Batched norm reduction on stream matches CPU");
}
+#[test]
+#[cfg(target_os = "linux")]
+fn test_l2_norm_batch_kernel_zero_num_samples() {
+ println!("Testing batched L2 norm rejection when num_samples==0 (float64)...");
+
+ let device = match CudaDevice::new(0) {
+ Ok(d) => d,
+ Err(_) => {
+ println!("SKIP: No CUDA device available");
+ return;
+ }
+ };
+
+ let input_d = device.alloc_zeros::<f64>(2).unwrap();
+ let mut norms_d = device.alloc_zeros::<f64>(1).unwrap();
+
+ let status = unsafe {
+ launch_l2_norm_batch(
+ *input_d.device_ptr() as *const f64,
+ 0, // num_samples == 0
+ 2, // sample_len
+ *norms_d.device_ptr_mut() as *mut f64,
+ std::ptr::null_mut(),
+ )
+ };
+
+ assert_eq!(
+ status, 1,
+ "Should reject num_samples==0 (cudaErrorInvalidValue), got {}",
+ status
+ );
+ println!("PASS: Correctly rejected num_samples==0 (f64)");
+}
+
+#[test]
+#[cfg(target_os = "linux")]
+fn test_l2_norm_batch_kernel_zero_sample_len() {
+ println!("Testing batched L2 norm rejection when sample_len==0 (float64)...");
+
+ let device = match CudaDevice::new(0) {
+ Ok(d) => d,
+ Err(_) => {
+ println!("SKIP: No CUDA device available");
+ return;
+ }
+ };
+
+ let input_d = device.alloc_zeros::<f64>(1).unwrap();
+ let mut norms_d = device.alloc_zeros::<f64>(1).unwrap();
+
+ let status = unsafe {
+ launch_l2_norm_batch(
+ *input_d.device_ptr() as *const f64,
+ 1, // num_samples
+ 0, // sample_len == 0
+ *norms_d.device_ptr_mut() as *mut f64,
+ std::ptr::null_mut(),
+ )
+ };
+
+ assert_eq!(
+ status, 1,
+ "Should reject sample_len==0 (cudaErrorInvalidValue), got {}",
+ status
+ );
+ println!("PASS: Correctly rejected sample_len==0 (f64)");
+}
+
#[test]
#[cfg(target_os = "linux")]
fn test_l2_norm_single_kernel_f32() {
@@ -774,8 +842,8 @@ fn test_l2_norm_batch_kernel_f32() {
#[test]
#[cfg(target_os = "linux")]
-fn test_l2_norm_batch_kernel_grid_limit_f32() {
- println!("Testing batched L2 norm reduction with grid limit boundary (float32)...");
+fn test_l2_norm_batch_kernel_zero_num_samples_f32() {
+ println!("Testing batched L2 norm rejection when num_samples==0 (float32)...");
let device = match CudaDevice::new(0) {
Ok(d) => d,
@@ -785,34 +853,59 @@ fn test_l2_norm_batch_kernel_grid_limit_f32() {
}
};
- // Test that num_samples exceeding CUDA_MAX_GRID_DIM_1D (65535) returns error
- const MAX_GRID_DIM: usize = 65535;
- let num_samples = MAX_GRID_DIM + 1; // Exceeds limit
- let sample_len = 2;
-
- let input: Vec<f32> = vec![1.0; num_samples * sample_len];
- let input_d = device.htod_sync_copy(input.as_slice()).unwrap();
- let mut norms_d = device.alloc_zeros::<f32>(num_samples).unwrap();
+ let input_d = device.alloc_zeros::<f32>(2).unwrap();
+ let mut norms_d = device.alloc_zeros::<f32>(1).unwrap();
let status = unsafe {
launch_l2_norm_batch_f32(
*input_d.device_ptr() as *const f32,
- num_samples,
- sample_len,
+ 0, // num_samples == 0
+ 2, // sample_len
*norms_d.device_ptr_mut() as *mut f32,
std::ptr::null_mut(),
)
};
- // Should return error because num_samples exceeds grid limit
- // cudaErrorInvalidValue = 1 (from cuda_error_to_string)
assert_eq!(
status, 1,
- "Should reject num_samples exceeding CUDA_MAX_GRID_DIM_1D (f32), got error code {}",
+ "Should reject num_samples==0 (cudaErrorInvalidValue), got {}",
status
);
+ println!("PASS: Correctly rejected num_samples==0 (f32)");
+}
- println!("PASS: Correctly rejected num_samples exceeding grid limit (f32)");
+#[test]
+#[cfg(target_os = "linux")]
+fn test_l2_norm_batch_kernel_zero_sample_len_f32() {
+ println!("Testing batched L2 norm rejection when sample_len==0 (float32)...");
+
+ let device = match CudaDevice::new(0) {
+ Ok(d) => d,
+ Err(_) => {
+ println!("SKIP: No CUDA device available");
+ return;
+ }
+ };
+
+ let input_d = device.alloc_zeros::<f32>(1).unwrap();
+ let mut norms_d = device.alloc_zeros::<f32>(1).unwrap();
+
+ let status = unsafe {
+ launch_l2_norm_batch_f32(
+ *input_d.device_ptr() as *const f32,
+ 1, // num_samples
+ 0, // sample_len == 0
+ *norms_d.device_ptr_mut() as *mut f32,
+ std::ptr::null_mut(),
+ )
+ };
+
+ assert_eq!(
+ status, 1,
+ "Should reject sample_len==0 (cudaErrorInvalidValue), got {}",
+ status
+ );
+ println!("PASS: Correctly rejected sample_len==0 (f32)");
}
#[test]
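The bare 1 the new tests assert against is the integer value of cudaErrorInvalidValue in the CUDA runtime's error enum. A host-side C++ analogue of the zero-argument checks (not part of the commit; the extern signature is inferred from the Rust FFI calls above and may differ from the actual header):

#include <cassert>
#include <cstddef>
#include <cuda_runtime.h>

// Signature inferred from the Rust tests: input, num_samples, sample_len,
// output inverse norms, stream (null for the default stream).
extern "C" int launch_l2_norm_batch_f32(const float *input_d,
                                        size_t num_samples, size_t sample_len,
                                        float *inv_norms_out_d,
                                        cudaStream_t stream);

int main(void) {
    float *input_d = nullptr, *norms_d = nullptr;
    cudaMalloc(reinterpret_cast<void **>(&input_d), 2 * sizeof(float));
    cudaMalloc(reinterpret_cast<void **>(&norms_d), sizeof(float));

    // Both degenerate shapes are rejected before any kernel launch.
    assert(launch_l2_norm_batch_f32(input_d, 0, 2, norms_d, nullptr)
           == (int)cudaErrorInvalidValue);
    assert(launch_l2_norm_batch_f32(input_d, 1, 0, norms_d, nullptr)
           == (int)cudaErrorInvalidValue);

    cudaFree(input_d);
    cudaFree(norms_d);
    return 0;
}

A plausible reason the old grid-limit boundary test was replaced rather than updated: with the limit now at 2^31 - 1, exercising the over-limit path would require allocating more than 2^31 samples of input, so the guard is covered by these cheap degenerate-shape checks instead.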