Re: [PR] [QDP] improve amplitudeEncoders for less copy memory allocations [mahout]

via GitHub Fri, 05 Dec 2025 20:53:48 -0800


ryankert01 commented on code in PR #690:
URL: https://github.com/apache/mahout/pull/690#discussion_r2594514757



##########
qdp/qdp-core/src/gpu/encodings/amplitude.rs:
##########
@@ -121,6 +122,129 @@ impl QuantumEncoder for AmplitudeEncoder {
     fn description(&self) -> &'static str {
         "Amplitude encoding with L2 normalization"
     }
+
+    /// Override to avoid intermediate Vec allocation. Processes chunks 
directly to GPU offsets.
+    fn encode_chunked(
+        &self,
+        device: &Arc<CudaDevice>,
+        chunks: &[Float64Array],
+        num_qubits: usize,
+    ) -> Result<GpuStateVector> {
+        #[cfg(target_os = "linux")]
+        {
+            let total_len: usize = chunks.iter().map(|c| c.len()).sum();
+            let state_len = 1 << num_qubits;
+
+            if total_len == 0 {
+                return Err(MahoutError::InvalidInput("Input chunks cannot be 
empty".to_string()));
+            }
+            if total_len > state_len {
+                return Err(MahoutError::InvalidInput(
+                    format!("Total data length {} exceeds state vector size 
{}", total_len, state_len)
+                ));
+            }
+            if num_qubits == 0 || num_qubits > 30 {
+                return Err(MahoutError::InvalidInput(
+                    format!("Number of qubits {} must be between 1 and 30", 
num_qubits)
+                ));
+            }
+
+            let state_vector = {
+                crate::profile_scope!("GPU::Alloc");
+                GpuStateVector::new(device, num_qubits)?
+            };
+
+            let norm = {
+                crate::profile_scope!("CPU::L2Norm");
+                let mut norm_sq = 0.0;
+                for chunk in chunks {
+                    if chunk.null_count() == 0 {
+                        norm_sq += chunk.values().iter().map(|&x| x * 
x).sum::<f64>();
+                    } else {
+                        norm_sq += chunk.iter().map(|opt| 
opt.unwrap_or(0.0).powi(2)).sum::<f64>();
+                    }
+                }
+                let norm = norm_sq.sqrt();
+                if norm == 0.0 {
+                    return Err(MahoutError::InvalidInput("Input data has zero 
norm".to_string()));
+                }
+                norm
+            };
+
+            let mut current_offset = 0;
+            for chunk in chunks {
+                let chunk_len = chunk.len();
+                if chunk_len == 0 {
+                    continue;
+                }
+
+                let state_ptr_offset = unsafe {
+                    state_vector.ptr().cast::<u8>()
+                        .add(current_offset * 
std::mem::size_of::<qdp_kernels::CuDoubleComplex>())
+                        .cast::<std::ffi::c_void>()
+                };
+
+                let chunk_slice = {
+                    crate::profile_scope!("GPU::ChunkH2DCopy");
+                    if chunk.null_count() == 0 {
+                        device.htod_sync_copy(chunk.values())
+                    } else {
+                        let temp_vec: Vec<f64> = chunk.iter().map(|opt| 
opt.unwrap_or(0.0)).collect();
+                        device.htod_sync_copy(&temp_vec)
+                    }
+                    .map_err(|e| MahoutError::MemoryAllocation(format!("Failed 
to copy chunk: {:?}", e)))?
+                };
+
+                {
+                    crate::profile_scope!("GPU::KernelLaunch");
+                    let ret = unsafe {
+                        launch_amplitude_encode(
+                            *chunk_slice.device_ptr() as *const f64,
+                            state_ptr_offset,
+                            chunk_len,
+                            state_len,
+                            norm,
+                            std::ptr::null_mut(),
+                        )
+                    };
+                    if ret != 0 {
+                        return Err(MahoutError::KernelLaunch(
+                            format!("Kernel launch failed: {} ({})", ret, 
cuda_error_to_string(ret))
+                        ));
+                    }
+                }
+
+                current_offset += chunk_len;
+            }
+
+            if total_len < state_len {
+                let padding_bytes = (state_len - total_len) * 
std::mem::size_of::<qdp_kernels::CuDoubleComplex>();
+                let tail_ptr = unsafe { state_vector.ptr().add(total_len) as 
*mut c_void };
+
+                unsafe {
+                    unsafe extern "C" {
+                        fn cudaMemsetAsync(devPtr: *mut c_void, value: i32, 
count: usize, stream: *mut c_void) -> i32;
+                    }
+                    let result = cudaMemsetAsync(tail_ptr, 0, padding_bytes, 
std::ptr::null_mut());
+                    if result != 0 {
+                        return Err(MahoutError::Cuda(
+                            format!("Failed to zero-fill padding: {} ({})", 
result, cuda_error_to_string(result))
+                        ));
+                    }
+                }
+            }
+
+            device.synchronize()
+                .map_err(|e| MahoutError::Cuda(format!("Sync failed: {:?}", 
e)))?;
+
+            Ok(state_vector)
+        }
+
+        #[cfg(not(target_os = "linux"))]
+        {
+            Err(MahoutError::Cuda("CUDA unavailable (non-Linux)".to_string()))
+        }
+    }

Review Comment:
   Same concern. It's a hard topic, and we need proper tests to mitigate the 
hazard.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Re: [PR] [QDP] improve amplitudeEncoders for less copy memory allocations [mahout]

Reply via email to