This is an automated email from the ASF dual-hosted git repository.
hcr pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/mahout.git
The following commit(s) were added to refs/heads/main by this push:
new 4438a9eeb [QDP] Fix invalid CUDA kernel launch when num_samples exceeds grid dimension limit (#968)
4438a9eeb is described below
commit 4438a9eeb621d865a19646d4dad3ddbeb3930cc8
Author: Vic Wen <[email protected]>
AuthorDate: Fri Feb 6 16:19:49 2026 +0800
[QDP] Fix invalid CUDA kernel launch when num_samples exceeds grid dimension limit (#968)
---
qdp/qdp-kernels/src/amplitude.cu | 40 +++++++++-
qdp/qdp-kernels/src/kernel_config.h | 4 +-
qdp/qdp-kernels/tests/amplitude_encode.rs | 125 ++++++++++++++++++++++++++----
3 files changed, 149 insertions(+), 20 deletions(-)
diff --git a/qdp/qdp-kernels/src/amplitude.cu b/qdp/qdp-kernels/src/amplitude.cu
index 06676e081..67a863eaa 100644
--- a/qdp/qdp-kernels/src/amplitude.cu
+++ b/qdp/qdp-kernels/src/amplitude.cu
@@ -617,6 +617,37 @@ int launch_l2_norm_f32(
return (int)cudaGetLastError();
}
+/// Returns the current device's max grid dimension (X), falling back to CUDA_MAX_GRID_DIM_1D on error.
+static size_t get_max_grid_dim_1d(void) {
+ static size_t cached_max_grid_dim_1d = 0;
+ if (cached_max_grid_dim_1d != 0) {
+ return cached_max_grid_dim_1d;
+ }
+
+ int device = -1;
+ // Fallback to CUDA_MAX_GRID_DIM_1D if error
+ if (cudaGetDevice(&device) != cudaSuccess || device < 0) {
+ cached_max_grid_dim_1d = CUDA_MAX_GRID_DIM_1D;
+ return cached_max_grid_dim_1d;
+ }
+
+ int max_x = 0;
+ cudaError_t err = cudaDeviceGetAttribute(
+ &max_x,
+ cudaDevAttrMaxGridDimX,
+ device
+ );
+
+ // Fallback to CUDA_MAX_GRID_DIM_1D if error
+ if (err != cudaSuccess || max_x <= 0) {
+ cached_max_grid_dim_1d = CUDA_MAX_GRID_DIM_1D;
+ return cached_max_grid_dim_1d;
+ }
+
+ cached_max_grid_dim_1d = (size_t)max_x;
+ return cached_max_grid_dim_1d;
+}
+
/// Launch L2 norm reduction for a batch of vectors.
/// Writes inverse norms for each sample into `inv_norms_out_d`.
int launch_l2_norm_batch(
@@ -642,6 +673,12 @@ int launch_l2_norm_batch(
const int blockSize = DEFAULT_BLOCK_SIZE;
const size_t elements_per_block = blockSize * 2; // double2 per thread
+ const size_t max_grid = get_max_grid_dim_1d();
+
+ if (num_samples > max_grid) {
+ return cudaErrorInvalidValue;
+ }
+
size_t blocks_per_sample = (sample_len + elements_per_block - 1) / elements_per_block;
const size_t max_blocks_per_sample = MAX_BLOCKS_PER_SAMPLE;
if (blocks_per_sample == 0) blocks_per_sample = 1;
@@ -650,7 +687,6 @@ int launch_l2_norm_batch(
}
size_t gridSize = num_samples * blocks_per_sample;
- const size_t max_grid = CUDA_MAX_GRID_DIM_1D; // CUDA grid dimension limit for 1D launch
if (gridSize > max_grid) {
blocks_per_sample = max_grid / num_samples;
if (blocks_per_sample == 0) {
@@ -702,7 +738,7 @@ int launch_l2_norm_batch_f32(
const int blockSize = DEFAULT_BLOCK_SIZE;
const size_t elements_per_block = blockSize * 2; // float2 per thread
- const size_t max_grid = CUDA_MAX_GRID_DIM_1D; // CUDA grid dimension limit for 1D launch
+ const size_t max_grid = get_max_grid_dim_1d();
if (num_samples > max_grid) {
return cudaErrorInvalidValue;
}
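Taken together, the amplitude.cu hunks give both batch launchers one sizing policy: reject shapes that cannot fit even at one block per sample, then clamp per-sample blocks so the flattened 1D grid stays within the device limit. A minimal standalone sketch of that policy follows (names mirror the diff; DEFAULT_BLOCK_SIZE's value is an assumed stand-in, the device query is replaced by the header constant, and the kernel launch itself is elided):

#include <cstddef>
#include <cuda_runtime.h>

#define CUDA_MAX_GRID_DIM_1D 2147483647  // kernel_config.h after this commit
#define MAX_BLOCKS_PER_SAMPLE 32         // kernel_config.h
#define DEFAULT_BLOCK_SIZE 256           // assumed value, for illustration only

// Computes the launch geometry the batch launchers use; returns a
// cudaError_t-compatible int, as the real entry points do.
static int plan_batch_grid(size_t num_samples, size_t sample_len,
                           size_t *blocks_per_sample_out, size_t *grid_size_out) {
    const size_t elements_per_block = DEFAULT_BLOCK_SIZE * 2; // double2/float2 per thread
    const size_t max_grid = CUDA_MAX_GRID_DIM_1D; // real code calls get_max_grid_dim_1d()

    // Degenerate shapes are rejected (the new tests below assert this), and
    // num_samples must fit even at one block per sample, or gridDim.x overflows.
    if (num_samples == 0 || sample_len == 0 || num_samples > max_grid) {
        return (int)cudaErrorInvalidValue;
    }

    size_t blocks_per_sample =
        (sample_len + elements_per_block - 1) / elements_per_block;
    if (blocks_per_sample == 0) blocks_per_sample = 1;
    if (blocks_per_sample > MAX_BLOCKS_PER_SAMPLE) {
        blocks_per_sample = MAX_BLOCKS_PER_SAMPLE;
    }

    size_t gridSize = num_samples * blocks_per_sample;
    if (gridSize > max_grid) {
        // Trade per-sample parallelism for a grid that fits; the up-front
        // guard guarantees max_grid / num_samples >= 1 here.
        blocks_per_sample = max_grid / num_samples;
        gridSize = num_samples * blocks_per_sample;
    }

    *blocks_per_sample_out = blocks_per_sample;
    *grid_size_out = gridSize;
    return (int)cudaSuccess;
}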
diff --git a/qdp/qdp-kernels/src/kernel_config.h b/qdp/qdp-kernels/src/kernel_config.h
index 4ce4526cb..9a5708ac3 100644
--- a/qdp/qdp-kernels/src/kernel_config.h
+++ b/qdp/qdp-kernels/src/kernel_config.h
@@ -46,9 +46,9 @@
// Limits per-sample parallelism to maintain good load balancing
#define MAX_BLOCKS_PER_SAMPLE 32
-// CUDA grid dimension limit for 1D launches
+// CUDA grid dimension limit for 1D launches (2^31 - 1, signed 32-bit int max)
// This is a hardware limitation, not a tunable parameter
-#define CUDA_MAX_GRID_DIM_1D 65535
+#define CUDA_MAX_GRID_DIM_1D 2147483647
// ============================================================================
// Qubit Limits
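The constant change above is the crux of the bug: 65535 is the hardware limit for gridDim.y and gridDim.z, but gridDim.x allows 2^31 - 1 on every device of compute capability 3.0 or later, so the old value rejected valid batches and clamped blocks_per_sample far below the real limit. A quick probe (not part of the commit) makes the asymmetry visible:

#include <cstdio>
#include <cuda_runtime.h>

int main(void) {
    // Query the per-axis grid limits for device 0; on any modern GPU this
    // prints x=2147483647 y=65535 z=65535.
    int x = 0, y = 0, z = 0;
    cudaDeviceGetAttribute(&x, cudaDevAttrMaxGridDimX, 0);
    cudaDeviceGetAttribute(&y, cudaDevAttrMaxGridDimY, 0);
    cudaDeviceGetAttribute(&z, cudaDevAttrMaxGridDimZ, 0);
    printf("max grid dims: x=%d y=%d z=%d\n", x, y, z);
    return 0;
}

get_max_grid_dim_1d() in amplitude.cu caches exactly this attribute, keeping the header constant only as a fallback for when the query fails.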
diff --git a/qdp/qdp-kernels/tests/amplitude_encode.rs b/qdp/qdp-kernels/tests/amplitude_encode.rs
index db68dd012..579a4e7ec 100644
--- a/qdp/qdp-kernels/tests/amplitude_encode.rs
+++ b/qdp/qdp-kernels/tests/amplitude_encode.rs
@@ -671,6 +671,74 @@ fn test_l2_norm_batch_kernel_stream() {
println!("PASS: Batched norm reduction on stream matches CPU");
}
+#[test]
+#[cfg(target_os = "linux")]
+fn test_l2_norm_batch_kernel_zero_num_samples() {
+ println!("Testing batched L2 norm rejection when num_samples==0 (float64)...");
+
+ let device = match CudaDevice::new(0) {
+ Ok(d) => d,
+ Err(_) => {
+ println!("SKIP: No CUDA device available");
+ return;
+ }
+ };
+
+ let input_d = device.alloc_zeros::<f64>(2).unwrap();
+ let mut norms_d = device.alloc_zeros::<f64>(1).unwrap();
+
+ let status = unsafe {
+ launch_l2_norm_batch(
+ *input_d.device_ptr() as *const f64,
+ 0, // num_samples == 0
+ 2, // sample_len
+ *norms_d.device_ptr_mut() as *mut f64,
+ std::ptr::null_mut(),
+ )
+ };
+
+ assert_eq!(
+ status, 1,
+ "Should reject num_samples==0 (cudaErrorInvalidValue), got {}",
+ status
+ );
+ println!("PASS: Correctly rejected num_samples==0 (f64)");
+}
+
+#[test]
+#[cfg(target_os = "linux")]
+fn test_l2_norm_batch_kernel_zero_sample_len() {
+ println!("Testing batched L2 norm rejection when sample_len==0 (float64)...");
+
+ let device = match CudaDevice::new(0) {
+ Ok(d) => d,
+ Err(_) => {
+ println!("SKIP: No CUDA device available");
+ return;
+ }
+ };
+
+ let input_d = device.alloc_zeros::<f64>(1).unwrap();
+ let mut norms_d = device.alloc_zeros::<f64>(1).unwrap();
+
+ let status = unsafe {
+ launch_l2_norm_batch(
+ *input_d.device_ptr() as *const f64,
+ 1, // num_samples
+ 0, // sample_len == 0
+ *norms_d.device_ptr_mut() as *mut f64,
+ std::ptr::null_mut(),
+ )
+ };
+
+ assert_eq!(
+ status, 1,
+ "Should reject sample_len==0 (cudaErrorInvalidValue), got {}",
+ status
+ );
+ println!("PASS: Correctly rejected sample_len==0 (f64)");
+}
+
#[test]
#[cfg(target_os = "linux")]
fn test_l2_norm_single_kernel_f32() {
@@ -774,8 +842,8 @@ fn test_l2_norm_batch_kernel_f32() {
#[test]
#[cfg(target_os = "linux")]
-fn test_l2_norm_batch_kernel_grid_limit_f32() {
- println!("Testing batched L2 norm reduction with grid limit boundary (float32)...");
+fn test_l2_norm_batch_kernel_zero_num_samples_f32() {
+ println!("Testing batched L2 norm rejection when num_samples==0 (float32)...");
let device = match CudaDevice::new(0) {
Ok(d) => d,
@@ -785,34 +853,59 @@ fn test_l2_norm_batch_kernel_grid_limit_f32() {
}
};
- // Test that num_samples exceeding CUDA_MAX_GRID_DIM_1D (65535) returns error
- const MAX_GRID_DIM: usize = 65535;
- let num_samples = MAX_GRID_DIM + 1; // Exceeds limit
- let sample_len = 2;
-
- let input: Vec<f32> = vec![1.0; num_samples * sample_len];
- let input_d = device.htod_sync_copy(input.as_slice()).unwrap();
- let mut norms_d = device.alloc_zeros::<f32>(num_samples).unwrap();
+ let input_d = device.alloc_zeros::<f32>(2).unwrap();
+ let mut norms_d = device.alloc_zeros::<f32>(1).unwrap();
let status = unsafe {
launch_l2_norm_batch_f32(
*input_d.device_ptr() as *const f32,
- num_samples,
- sample_len,
+ 0, // num_samples == 0
+ 2, // sample_len
*norms_d.device_ptr_mut() as *mut f32,
std::ptr::null_mut(),
)
};
- // Should return error because num_samples exceeds grid limit
- // cudaErrorInvalidValue = 1 (from cuda_error_to_string)
assert_eq!(
status, 1,
- "Should reject num_samples exceeding CUDA_MAX_GRID_DIM_1D (f32), got error code {}",
+ "Should reject num_samples==0 (cudaErrorInvalidValue), got {}",
status
);
+ println!("PASS: Correctly rejected num_samples==0 (f32)");
+}
- println!("PASS: Correctly rejected num_samples exceeding grid limit (f32)");
+#[test]
+#[cfg(target_os = "linux")]
+fn test_l2_norm_batch_kernel_zero_sample_len_f32() {
+ println!("Testing batched L2 norm rejection when sample_len==0 (float32)...");
+
+ let device = match CudaDevice::new(0) {
+ Ok(d) => d,
+ Err(_) => {
+ println!("SKIP: No CUDA device available");
+ return;
+ }
+ };
+
+ let input_d = device.alloc_zeros::<f32>(1).unwrap();
+ let mut norms_d = device.alloc_zeros::<f32>(1).unwrap();
+
+ let status = unsafe {
+ launch_l2_norm_batch_f32(
+ *input_d.device_ptr() as *const f32,
+ 1, // num_samples
+ 0, // sample_len == 0
+ *norms_d.device_ptr_mut() as *mut f32,
+ std::ptr::null_mut(),
+ )
+ };
+
+ assert_eq!(
+ status, 1,
+ "Should reject sample_len==0 (cudaErrorInvalidValue), got {}",
+ status
+ );
+ println!("PASS: Correctly rejected sample_len==0 (f32)");
}
#[test]
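The bare 1 the new tests assert against is the integer value of cudaErrorInvalidValue in the CUDA runtime's error enum. A host-side C++ analogue of the zero-argument checks (not part of the commit; the extern signature is inferred from the Rust FFI calls above and may differ from the actual header):

#include <cassert>
#include <cstddef>
#include <cuda_runtime.h>

// Signature inferred from the Rust tests: input, num_samples, sample_len,
// output inverse norms, stream (null for the default stream).
extern "C" int launch_l2_norm_batch_f32(const float *input_d,
                                        size_t num_samples, size_t sample_len,
                                        float *inv_norms_out_d,
                                        cudaStream_t stream);

int main(void) {
    float *input_d = nullptr, *norms_d = nullptr;
    cudaMalloc(reinterpret_cast<void **>(&input_d), 2 * sizeof(float));
    cudaMalloc(reinterpret_cast<void **>(&norms_d), sizeof(float));

    // Both degenerate shapes are rejected before any kernel launch.
    assert(launch_l2_norm_batch_f32(input_d, 0, 2, norms_d, nullptr)
           == (int)cudaErrorInvalidValue);
    assert(launch_l2_norm_batch_f32(input_d, 1, 0, norms_d, nullptr)
           == (int)cudaErrorInvalidValue);

    cudaFree(input_d);
    cudaFree(norms_d);
    return 0;
}

A plausible reason the old grid-limit boundary test was replaced rather than updated: with the limit now at 2^31 - 1, exercising the over-limit path would require allocating more than 2^31 samples of input, so the guard is covered by these cheap degenerate-shape checks instead.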