This is an automated email from the ASF dual-hosted git repository. guanmingchiu pushed a commit to branch main in repository https://gitbox.apache.org/repos/asf/mahout.git
commit 2c119274966cc228c343bbbe6240759a9f68a8de Author: KUAN-HAO HUANG <[email protected]> AuthorDate: Sat Nov 29 23:43:41 2025 +0800 [QDP] add NVTX to trace validity (#650) * add nvtx and some comments * Update qdp/docs/observability/NVTX_USAGE.md Co-authored-by: Ryan Huang <[email protected]> * catch cuda include path * fix prefix errors * fix precommit error and rebuild Cargo.lock * fix precommit error * fix precommit errors --------- Co-authored-by: Ryan Huang <[email protected]> --- qdp/Cargo.lock | 10 ++ qdp/Cargo.toml | 8 ++ qdp/docs/observability/NVTX_USAGE.md | 148 ++++++++++++++++++++++++++++ qdp/qdp-core/Cargo.toml | 5 + qdp/qdp-core/examples/nvtx_profile.rs | 79 +++++++++++++++ qdp/qdp-core/src/gpu/encodings/amplitude.rs | 56 +++++++---- qdp/qdp-core/src/gpu/memory.rs | 4 +- qdp/qdp-core/src/lib.rs | 11 ++- qdp/qdp-core/src/profiling.rs | 78 +++++++++++++++ qdp/qdp-core/tests/common/mod.rs | 16 +++ qdp/qdp-core/tests/validation.rs | 16 +++ qdp/qdp-kernels/build.rs | 2 + 12 files changed, 412 insertions(+), 21 deletions(-) diff --git a/qdp/Cargo.lock b/qdp/Cargo.lock index f19b8ac3a..5d8cedd4c 100644 --- a/qdp/Cargo.lock +++ b/qdp/Cargo.lock @@ -110,6 +110,15 @@ dependencies = [ "autocfg", ] +[[package]] +name = "nvtx" +version = "1.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ad2e855e8019f99e4b94ac33670eb4e4f570a2e044f3749a0b2c7f83b841e52c" +dependencies = [ + "cc", +] + [[package]] name = "once_cell" version = "1.21.3" @@ -197,6 +206,7 @@ name = "qdp-core" version = "0.1.0" dependencies = [ "cudarc", + "nvtx", "qdp-kernels", "rayon", "thiserror", diff --git a/qdp/Cargo.toml b/qdp/Cargo.toml index 982b15c1d..0411c8eb3 100644 --- a/qdp/Cargo.toml +++ b/qdp/Cargo.toml @@ -24,3 +24,11 @@ cc = "1.2" thiserror = "2.0" # Parallel computing (for CPU preprocessing) rayon = "1.10" + +# Release profile optimizations +[profile.release] +opt-level = 3 # Maximum optimization +lto = "fat" # Link Time Optimization: cross-crate inlining +codegen-units = 1 # Single codegen unit for better optimization +panic = "abort" # Smaller binary, faster (unwind logic removed) +strip = true # Strip symbols for smaller binary size diff --git a/qdp/docs/observability/NVTX_USAGE.md b/qdp/docs/observability/NVTX_USAGE.md new file mode 100644 index 000000000..a4fe92ee1 --- /dev/null +++ b/qdp/docs/observability/NVTX_USAGE.md @@ -0,0 +1,148 @@ +# NVTX Profiling Guide + +## Overview + +NVTX (NVIDIA Tools Extension) provides performance markers visible in Nsight Systems. This project uses zero-cost macros that compile to no-ops when the `observability` feature is disabled. + +## Build with NVTX + +Default builds exclude NVTX for zero overhead. Enable profiling with: + +```bash +cd mahout/qdp +cargo build -p qdp-core --example nvtx_profile --features observability --release +``` + +## Run Example + +```bash +./target/release/examples/nvtx_profile +``` + +**Expected output:** +``` +=== NVTX Profiling Example === + +✓ Engine initialized +✓ Created test data: 1024 elements + +Starting encoding (NVTX markers will appear in Nsight Systems)... +Expected NVTX markers: + - Mahout::Encode + - CPU::L2Norm + - GPU::Alloc + - GPU::H2DCopy + - GPU::KernelLaunch + - GPU::Synchronize + - DLPack::Wrap + +✓ Encoding succeeded +✓ DLPack pointer: 0x558114be6250 +✓ Memory freed + +=== Test Complete === +``` + +## Profile with Nsight Systems + +```bash +nsys profile --trace=cuda,nvtx -o report ./target/release/examples/nvtx_profile +``` + +This generates `report.nsys-rep` and `report.sqlite`. + +## Viewing Results + +### GUI View (Nsight Systems) + +Open the report in Nsight Systems GUI: + +```bash +nsys-ui report.nsys-rep +``` + +In the GUI timeline view, you will see: +- Colored blocks for each NVTX marker +- CPU timeline showing `CPU::L2Norm` +- GPU timeline showing `GPU::Alloc`, `GPU::H2DCopy`, `GPU::Kernel` +- Overall workflow covered by `Mahout::Encode` + +### Command Line Statistics + +View summary statistics: + +```bash +nsys stats report.nsys-rep +``` + +**Example NVTX Range Summary output:** +``` +Time (%) Total Time (ns) Instances Avg (ns) Med (ns) Min (ns) Max (ns) StdDev (ns) Style Range +-------- --------------- --------- ------------ ------------ ---------- ---------- ----------- -------- -------------- + 50.0 11,207,505 1 11,207,505.0 11,207,505.0 11,207,505 11,207,505 0.0 StartEnd Mahout::Encode + 48.0 10,759,758 1 10,759,758.0 10,759,758.0 10,759,758 10,759,758 0.0 StartEnd GPU::Alloc + 1.8 413,753 1 413,753.0 413,753.0 413,753 413,753 0.0 StartEnd CPU::L2Norm + 0.1 15,873 1 15,873.0 15,873.0 15,873 15,873 0.0 StartEnd GPU::H2DCopy + 0.0 317 1 317.0 317.0 317 317 0.0 StartEnd GPU::KernelLaunch +``` + +The output shows: +- Time percentage for each operation +- Total time in nanoseconds +- Number of instances +- Average, median, min, max execution times + +**CUDA API Summary** shows detailed CUDA call statistics: + + Time (%) Total Time (ns) Num Calls Avg (ns) Med (ns) Min (ns) Max (ns) StdDev (ns) Name + -------- --------------- --------- ----------- ----------- -------- ---------- ----------- -------------------- + 99.2 11,760,277 2 5,880,138.5 5,880,138.5 2,913 11,757,364 8,311,652.0 cuMemAllocAsync + 0.4 45,979 2 22,989.5 22,989.5 7,938 38,041 21,286.0 cuMemcpyHtoDAsync_v2 + 0.1 14,722 1 14,722.0 14,722.0 14,722 14,722 0.0 cuEventCreate + 0.1 13,100 3 4,366.7 3,512.0 861 8,727 4,002.0 cuStreamSynchronize + 0.1 9,468 11 860.7 250.0 114 4,671 1,453.3 cuCtxSetCurrent + 0.1 6,479 1 6,479.0 6,479.0 6,479 6,479 0.0 cuEventDestroy_v2 + 0.0 4,599 2 2,299.5 2,299.5 1,773 2,826 744.6 cuMemFreeAsync +- Memory allocation (`cuMemAllocAsync`) +- Memory copies (`cuMemcpyHtoDAsync_v2`) +- Stream synchronization (`cuStreamSynchronize`) + +## NVTX Markers + +The following markers are tracked: + +- `Mahout::Encode` - Complete encoding workflow +- `CPU::L2Norm` - L2 normalization on CPU +- `GPU::Alloc` - GPU memory allocation +- `GPU::H2DCopy` - Host-to-device memory copy +- `GPU::KernelLaunch` - CPU-side kernel launch +- `GPU::Synchronize` - CPU waiting for GPU completion +- `DLPack::Wrap` - Conversion to DLPack pointer + +## Using Profiling Macros + +The project provides zero-cost macros in `qdp-core/src/profiling.rs`: + +```rust +// Profile a scope (automatically pops on exit) +crate::profile_scope!("MyOperation"); + +// Mark a point in time +crate::profile_mark!("Checkpoint"); +``` + +When `observability` feature is disabled, these macros compile to no-ops with zero runtime cost. + +## Example Location + +Source code: `qdp-core/examples/nvtx_profile.rs` + +## Troubleshooting + +**NVTX markers not appearing:** +- Ensure `--features observability` is used during build +- Verify CUDA device is available +- Check that encoding actually executes + +**nsys warnings:** +Warnings about CPU sampling are normal and can be ignored. They do not affect NVTX marker recording. diff --git a/qdp/qdp-core/Cargo.toml b/qdp/qdp-core/Cargo.toml index 1afe5f219..721ed5e54 100644 --- a/qdp/qdp-core/Cargo.toml +++ b/qdp/qdp-core/Cargo.toml @@ -8,6 +8,11 @@ cudarc = { workspace = true } qdp-kernels = { path = "../qdp-kernels" } thiserror = { workspace = true } rayon = { workspace = true } +nvtx = { version = "1.3", optional = true } [lib] name = "qdp_core" + +[features] +default = [] +observability = ["nvtx"] diff --git a/qdp/qdp-core/examples/nvtx_profile.rs b/qdp/qdp-core/examples/nvtx_profile.rs new file mode 100644 index 000000000..c463f3f08 --- /dev/null +++ b/qdp/qdp-core/examples/nvtx_profile.rs @@ -0,0 +1,79 @@ +// +// Licensed to the Apache Software Foundation (ASF) under one or more +// contributor license agreements. See the NOTICE file distributed with +// this work for additional information regarding copyright ownership. +// The ASF licenses this file to You under the Apache License, Version 2.0 +// (the "License"); you may not use this file except in compliance with +// the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + + +// NVTX profiling example +// Run: cargo run -p qdp-core --example nvtx_profile --features observability --release + +use qdp_core::QdpEngine; + +fn main() { + println!("=== NVTX Profiling Example ==="); + println!(); + + // Initialize engine + let engine = match QdpEngine::new(0) { + Ok(e) => { + println!("✓ Engine initialized"); + e + } + Err(e) => { + eprintln!("✗ Failed to initialize engine: {:?}", e); + return; + } + }; + + // Create test data + let data: Vec<f64> = (0..1024).map(|i| (i as f64) / 1024.0).collect(); + println!("✓ Created test data: {} elements", data.len()); + println!(); + + println!("Starting encoding (NVTX markers will appear in Nsight Systems)..."); + println!("Expected NVTX markers:"); + println!(" - Mahout::Encode"); + println!(" - CPU::L2Norm"); + println!(" - GPU::Alloc"); + println!(" - GPU::H2DCopy"); + println!(" - GPU::Kernel"); + println!(); + + // Perform encoding (this will trigger NVTX markers) + match engine.encode(&data, 10, "amplitude") { + Ok(ptr) => { + println!("✓ Encoding succeeded"); + println!("✓ DLPack pointer: {:p}", ptr); + + // Clean up + unsafe { + let managed = &mut *ptr; + if let Some(deleter) = managed.deleter.take() { + deleter(ptr); + println!("✓ Memory freed"); + } + } + } + Err(e) => { + eprintln!("✗ Encoding failed: {:?}", e); + } + } + + println!(); + println!("=== Test Complete ==="); + println!(); + println!("To view NVTX markers, use Nsight Systems:"); + println!(" nsys profile --trace=cuda,nvtx cargo run -p qdp-core --example nvtx_profile --features observability --release"); + println!("Then open the generated .nsys-rep file in Nsight Systems"); +} diff --git a/qdp/qdp-core/src/gpu/encodings/amplitude.rs b/qdp/qdp-core/src/gpu/encodings/amplitude.rs index fe0fbf7fd..fb61b1d3c 100644 --- a/qdp/qdp-core/src/gpu/encodings/amplitude.rs +++ b/qdp/qdp-core/src/gpu/encodings/amplitude.rs @@ -26,7 +26,7 @@ use super::QuantumEncoder; #[cfg(target_os = "linux")] use std::ffi::c_void; #[cfg(target_os = "linux")] -use cudarc::driver::{CudaSlice, DevicePtr}; +use cudarc::driver::DevicePtr; #[cfg(target_os = "linux")] use qdp_kernels::launch_amplitude_encode; @@ -70,8 +70,11 @@ impl QuantumEncoder for AmplitudeEncoder { } // Calculate L2 norm (parallel on CPU for speed) - let norm_sq: f64 = host_data.par_iter().map(|x| x * x).sum(); - let norm = norm_sq.sqrt(); + let norm = { + crate::profile_scope!("CPU::L2Norm"); + let norm_sq: f64 = host_data.par_iter().map(|x| x * x).sum(); + norm_sq.sqrt() + }; if norm == 0.0 { return Err(MahoutError::InvalidInput("Input data has zero norm".to_string())); @@ -80,23 +83,32 @@ impl QuantumEncoder for AmplitudeEncoder { #[cfg(target_os = "linux")] { // Allocate GPU state vector - let state_vector = GpuStateVector::new(_device, num_qubits)?; + let state_vector = { + crate::profile_scope!("GPU::Alloc"); + GpuStateVector::new(_device, num_qubits)? + }; // Copy input data to GPU (synchronous, zero-copy from slice) - let input_slice: CudaSlice<f64> = _device.htod_sync_copy(host_data) - .map_err(|e| MahoutError::MemoryAllocation(format!("Failed to allocate input buffer: {:?}", e)))?; - - // Launch CUDA kernel - // Safety: pointers valid until kernel completes (htod_sync_copy waits) - let ret = unsafe { - launch_amplitude_encode( - *input_slice.device_ptr() as *const f64, - state_vector.ptr() as *mut c_void, - host_data.len() as i32, - state_len as i32, - norm, - std::ptr::null_mut(), // default stream - ) + // TODO : Use async CUDA streams for pipeline overlap + let input_slice = { + crate::profile_scope!("GPU::H2DCopy"); + _device.htod_sync_copy(host_data) + .map_err(|e| MahoutError::MemoryAllocation(format!("Failed to allocate input buffer: {:?}", e)))? + }; + + // Launch CUDA kernel (CPU-side launch only; execution is asynchronous) + let ret = { + crate::profile_scope!("GPU::KernelLaunch"); + unsafe { + launch_amplitude_encode( + *input_slice.device_ptr() as *const f64, + state_vector.ptr() as *mut c_void, + host_data.len() as i32, + state_len as i32, + norm, + std::ptr::null_mut(), // default stream + ) + } }; if ret != 0 { @@ -108,6 +120,14 @@ impl QuantumEncoder for AmplitudeEncoder { return Err(MahoutError::KernelLaunch(error_msg)); } + // Block until all work on the device is complete + { + crate::profile_scope!("GPU::Synchronize"); + _device + .synchronize() + .map_err(|e| MahoutError::Cuda(format!("CUDA device synchronize failed: {:?}", e)))?; + } + Ok(state_vector) } diff --git a/qdp/qdp-core/src/gpu/memory.rs b/qdp/qdp-core/src/gpu/memory.rs index ed786c79b..38bb90e27 100644 --- a/qdp/qdp-core/src/gpu/memory.rs +++ b/qdp/qdp-core/src/gpu/memory.rs @@ -57,10 +57,10 @@ impl GpuStateVector { pub fn new(_device: &Arc<CudaDevice>, qubits: usize) -> Result<Self> { let _size_elements = 1 << qubits; - // Use alloc_zeros for device-side allocation (critical for performance): + // Use device-side allocation (critical for performance): // - No CPU RAM usage (avoids OOM for large states) - // - No PCIe transfer (GPU hardware zero-fill) // - Fast: microseconds vs seconds for 30 qubits (16GB) + // TODO: Use uninitialized alloc() when kernel fully implements padding #[cfg(target_os = "linux")] { // Allocate GPU memory (zero-initialized) diff --git a/qdp/qdp-core/src/lib.rs b/qdp/qdp-core/src/lib.rs index a6439ca55..6634b15c4 100644 --- a/qdp/qdp-core/src/lib.rs +++ b/qdp/qdp-core/src/lib.rs @@ -18,6 +18,9 @@ pub mod dlpack; pub mod gpu; pub mod error; +#[macro_use] +mod profiling; + pub use error::{MahoutError, Result}; use std::sync::Arc; @@ -66,9 +69,15 @@ impl QdpEngine { num_qubits: usize, encoding_method: &str, ) -> Result<*mut DLManagedTensor> { + crate::profile_scope!("Mahout::Encode"); + let encoder = get_encoder(encoding_method)?; let state_vector = encoder.encode(&self.device, data, num_qubits)?; - Ok(state_vector.to_dlpack()) + let dlpack_ptr = { + crate::profile_scope!("DLPack::Wrap"); + state_vector.to_dlpack() + }; + Ok(dlpack_ptr) } /// Get CUDA device reference for advanced operations diff --git a/qdp/qdp-core/src/profiling.rs b/qdp/qdp-core/src/profiling.rs new file mode 100644 index 000000000..0b60e6c82 --- /dev/null +++ b/qdp/qdp-core/src/profiling.rs @@ -0,0 +1,78 @@ +// +// Licensed to the Apache Software Foundation (ASF) under one or more +// contributor license agreements. See the NOTICE file distributed with +// this work for additional information regarding copyright ownership. +// The ASF licenses this file to You under the Apache License, Version 2.0 +// (the "License"); you may not use this file except in compliance with +// the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + + +// Zero-cost profiling macros for NVTX integration +// +// Provides clean abstraction over NVTX markers without cluttering business logic. +// When observability feature is disabled, these macros compile to no-ops. + +/// Profile a scope using RAII guard pattern +/// +/// Automatically pushes NVTX range on entry and pops on scope exit. +/// Uses Rust's Drop mechanism to ensure proper cleanup even on early returns. +/// +/// # Example +/// ```rust +/// fn my_function() { +/// crate::profile_scope!("MyFunction"); +/// // ... code ... +/// // Guard automatically pops when function returns +/// } +/// ``` +#[cfg(feature = "observability")] +#[macro_export] +macro_rules! profile_scope { + ($name:expr) => { + let _scope_guard = nvtx::range!($name); + }; +} + +/// No-op version when observability is disabled +/// +/// Compiler eliminates this completely, zero runtime cost. +#[cfg(not(feature = "observability"))] +#[macro_export] +macro_rules! profile_scope { + ($name:expr) => { + // Zero-cost: compiler removes this entirely + }; +} + +/// Mark a point in time with NVTX marker +/// +/// Useful for marking specific events without creating a range. +/// +/// # Example +/// ```rust +/// crate::profile_mark!("CheckpointReached"); +/// ``` +#[cfg(feature = "observability")] +#[macro_export] +macro_rules! profile_mark { + ($name:expr) => { + nvtx::mark!($name); + }; +} + +/// No-op version when observability is disabled +#[cfg(not(feature = "observability"))] +#[macro_export] +macro_rules! profile_mark { + ($name:expr) => { + // Zero-cost: compiler removes this entirely + }; +} diff --git a/qdp/qdp-core/tests/common/mod.rs b/qdp/qdp-core/tests/common/mod.rs index d13e4623d..f105a5436 100644 --- a/qdp/qdp-core/tests/common/mod.rs +++ b/qdp/qdp-core/tests/common/mod.rs @@ -1,3 +1,19 @@ +// +// Licensed to the Apache Software Foundation (ASF) under one or more +// contributor license agreements. See the NOTICE file distributed with +// this work for additional information regarding copyright ownership. +// The ASF licenses this file to You under the Apache License, Version 2.0 +// (the "License"); you may not use this file except in compliance with +// the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + /// Create test data with normalized values pub fn create_test_data(size: usize) -> Vec<f64> { (0..size).map(|i| (i as f64) / (size as f64)).collect() diff --git a/qdp/qdp-core/tests/validation.rs b/qdp/qdp-core/tests/validation.rs index e8a1a3443..cc12a995a 100644 --- a/qdp/qdp-core/tests/validation.rs +++ b/qdp/qdp-core/tests/validation.rs @@ -1,3 +1,19 @@ +// +// Licensed to the Apache Software Foundation (ASF) under one or more +// contributor license agreements. See the NOTICE file distributed with +// this work for additional information regarding copyright ownership. +// The ASF licenses this file to You under the Apache License, Version 2.0 +// (the "License"); you may not use this file except in compliance with +// the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + // Input validation and error handling tests use qdp_core::{QdpEngine, MahoutError}; diff --git a/qdp/qdp-kernels/build.rs b/qdp/qdp-kernels/build.rs index 0956708ac..2e3b01b27 100644 --- a/qdp/qdp-kernels/build.rs +++ b/qdp/qdp-kernels/build.rs @@ -60,6 +60,8 @@ fn main() { // This uses cc crate's CUDA support to invoke nvcc let mut build = cc::Build::new(); + build.include(format!("{}/include", cuda_path)); + build .cuda(true) .flag("-cudart=shared") // Use shared CUDA runtime
