rich7420 commented on code in PR #675:
URL: https://github.com/apache/mahout/pull/675#discussion_r2585267111


##########
qdp/qdp-core/src/gpu/encodings/amplitude.rs:
##########
@@ -57,23 +58,27 @@ impl QuantumEncoder for AmplitudeEncoder {
                 GpuStateVector::new(_device, num_qubits)?
             };
 
-            // Copy input data to GPU (synchronous, zero-copy from slice)
-            // TODO : Use async CUDA streams for pipeline overlap
+            // SSS-Tier Optimization: Async Pipeline for large data
+            // For small data (< 1MB), use synchronous path to avoid stream overhead
+            // For large data, use dual-stream async pipeline for maximum throughput
+            const ASYNC_THRESHOLD: usize = 1024 * 1024 / std::mem::size_of::<f64>(); // 1MB threshold
+
+            if host_data.len() < ASYNC_THRESHOLD {
+                // Synchronous path for small data (avoids stream overhead)
             let input_slice = {
                 crate::profile_scope!("GPU::H2DCopy");
                 _device.htod_sync_copy(host_data)
                     .map_err(|e| MahoutError::MemoryAllocation(format!("Failed to allocate input buffer: {:?}", e)))?
             };
 
-            // Launch CUDA kernel (CPU-side launch only; execution is asynchronous)
             let ret = {
                 crate::profile_scope!("GPU::KernelLaunch");
                 unsafe {
                     launch_amplitude_encode(
                         *input_slice.device_ptr() as *const f64,
                         state_vector.ptr() as *mut c_void,
-                        host_data.len() as i32,
-                        state_len as i32,
+                            host_data.len(),
+                            state_len,

Review Comment:
   not important



##########
qdp/qdp-core/src/gpu/encodings/amplitude.rs:
##########
@@ -57,23 +58,27 @@ impl QuantumEncoder for AmplitudeEncoder {
                 GpuStateVector::new(_device, num_qubits)?
             };
 
-            // Copy input data to GPU (synchronous, zero-copy from slice)
-            // TODO : Use async CUDA streams for pipeline overlap
+            // SSS-Tier Optimization: Async Pipeline for large data
+            // For small data (< 1MB), use synchronous path to avoid stream overhead
+            // For large data, use dual-stream async pipeline for maximum throughput
+            const ASYNC_THRESHOLD: usize = 1024 * 1024 / std::mem::size_of::<f64>(); // 1MB threshold
+
+            if host_data.len() < ASYNC_THRESHOLD {
+                // Synchronous path for small data (avoids stream overhead)
             let input_slice = {
                 crate::profile_scope!("GPU::H2DCopy");
                 _device.htod_sync_copy(host_data)
                     .map_err(|e| MahoutError::MemoryAllocation(format!("Failed to allocate input buffer: {:?}", e)))?
             };
 
-            // Launch CUDA kernel (CPU-side launch only; execution is asynchronous)
             let ret = {
                 crate::profile_scope!("GPU::KernelLaunch");
                 unsafe {
                     launch_amplitude_encode(
                         *input_slice.device_ptr() as *const f64,
                         state_vector.ptr() as *mut c_void,
-                        host_data.len() as i32,
-                        state_len as i32,
+                            host_data.len(),
+                            state_len,

Review Comment:
   not important



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Reply via email to