(mahout) 05/50: [QDP] add NVTX to trace validity (#650)

guanmingchiu Tue, 06 Jan 2026 08:45:20 -0800

This is an automated email from the ASF dual-hosted git repository.

guanmingchiu pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/mahout.git


commit 2c119274966cc228c343bbbe6240759a9f68a8de
Author: KUAN-HAO HUANG <[email protected]>
AuthorDate: Sat Nov 29 23:43:41 2025 +0800

    [QDP] add NVTX to trace validity (#650)
    
    * add nvtx and some comments
    
    * Update qdp/docs/observability/NVTX_USAGE.md
    
    Co-authored-by: Ryan Huang <[email protected]>
    
    * catch cuda include path
    
    * fix prefix errors
    
    * fix precommit error and rebuild Cargo.lock
    
    * fix precommit error
    
    * fix precommit errors
    
    ---------
    
    Co-authored-by: Ryan Huang <[email protected]>
---
 qdp/Cargo.lock                              |  10 ++
 qdp/Cargo.toml                              |   8 ++
 qdp/docs/observability/NVTX_USAGE.md        | 148 ++++++++++++++++++++++++++++
 qdp/qdp-core/Cargo.toml                     |   5 +
 qdp/qdp-core/examples/nvtx_profile.rs       |  79 +++++++++++++++
 qdp/qdp-core/src/gpu/encodings/amplitude.rs |  56 +++++++----
 qdp/qdp-core/src/gpu/memory.rs              |   4 +-
 qdp/qdp-core/src/lib.rs                     |  11 ++-
 qdp/qdp-core/src/profiling.rs               |  78 +++++++++++++++
 qdp/qdp-core/tests/common/mod.rs            |  16 +++
 qdp/qdp-core/tests/validation.rs            |  16 +++
 qdp/qdp-kernels/build.rs                    |   2 +
 12 files changed, 412 insertions(+), 21 deletions(-)

diff --git a/qdp/Cargo.lock b/qdp/Cargo.lock
index f19b8ac3a..5d8cedd4c 100644
--- a/qdp/Cargo.lock
+++ b/qdp/Cargo.lock
@@ -110,6 +110,15 @@ dependencies = [
  "autocfg",
 ]
 
+[[package]]
+name = "nvtx"
+version = "1.3.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "ad2e855e8019f99e4b94ac33670eb4e4f570a2e044f3749a0b2c7f83b841e52c"
+dependencies = [
+ "cc",
+]
+
 [[package]]
 name = "once_cell"
 version = "1.21.3"
@@ -197,6 +206,7 @@ name = "qdp-core"
 version = "0.1.0"
 dependencies = [
  "cudarc",
+ "nvtx",
  "qdp-kernels",
  "rayon",
  "thiserror",
diff --git a/qdp/Cargo.toml b/qdp/Cargo.toml
index 982b15c1d..0411c8eb3 100644
--- a/qdp/Cargo.toml
+++ b/qdp/Cargo.toml
@@ -24,3 +24,11 @@ cc = "1.2"
 thiserror = "2.0"
 # Parallel computing (for CPU preprocessing)
 rayon = "1.10"
+
+# Release profile optimizations
+[profile.release]
+opt-level = 3          # Maximum optimization
+lto = "fat"            # Link Time Optimization: cross-crate inlining
+codegen-units = 1      # Single codegen unit for better optimization
+panic = "abort"        # Smaller binary, faster (unwind logic removed)
+strip = true           # Strip symbols for smaller binary size
diff --git a/qdp/docs/observability/NVTX_USAGE.md 
b/qdp/docs/observability/NVTX_USAGE.md
new file mode 100644
index 000000000..a4fe92ee1
--- /dev/null
+++ b/qdp/docs/observability/NVTX_USAGE.md
@@ -0,0 +1,148 @@
+# NVTX Profiling Guide
+
+## Overview
+
+NVTX (NVIDIA Tools Extension) provides performance markers visible in Nsight 
Systems. This project uses zero-cost macros that compile to no-ops when the 
`observability` feature is disabled.
+
+## Build with NVTX
+
+Default builds exclude NVTX for zero overhead. Enable profiling with:
+
+```bash
+cd mahout/qdp
+cargo build -p qdp-core --example nvtx_profile --features observability 
--release
+```
+
+## Run Example
+
+```bash
+./target/release/examples/nvtx_profile
+```
+
+**Expected output:**
+```
+=== NVTX Profiling Example ===
+
+✓ Engine initialized
+✓ Created test data: 1024 elements
+
+Starting encoding (NVTX markers will appear in Nsight Systems)...
+Expected NVTX markers:
+  - Mahout::Encode
+  - CPU::L2Norm
+  - GPU::Alloc
+  - GPU::H2DCopy
+  - GPU::KernelLaunch
+  - GPU::Synchronize
+  - DLPack::Wrap
+
+✓ Encoding succeeded
+✓ DLPack pointer: 0x558114be6250
+✓ Memory freed
+
+=== Test Complete ===
+```
+
+## Profile with Nsight Systems
+
+```bash
+nsys profile --trace=cuda,nvtx -o report ./target/release/examples/nvtx_profile
+```
+
+This generates `report.nsys-rep`. The `report.sqlite` file is created later, the first time `nsys stats` is run on the report.
+
+## Viewing Results
+
+### GUI View (Nsight Systems)
+
+Open the report in Nsight Systems GUI:
+
+```bash
+nsys-ui report.nsys-rep
+```
+
+In the GUI timeline view, you will see:
+- Colored blocks for each NVTX marker
+- CPU timeline showing `CPU::L2Norm`
+- GPU timeline showing `GPU::Alloc`, `GPU::H2DCopy`, `GPU::KernelLaunch`
+- Overall workflow covered by `Mahout::Encode`
+
+### Command Line Statistics
+
+View summary statistics:
+
+```bash
+nsys stats report.nsys-rep
+```
+
+**Example NVTX Range Summary output:**
+```
+Time (%)  Total Time (ns)  Instances    Avg (ns)      Med (ns)     Min (ns)    
Max (ns)   StdDev (ns)   Style        Range
+--------  ---------------  ---------  ------------  ------------  ----------  
----------  -----------  --------  --------------
+   50.0       11,207,505          1  11,207,505.0  11,207,505.0  11,207,505  
11,207,505          0.0  StartEnd  Mahout::Encode
+   48.0       10,759,758          1  10,759,758.0  10,759,758.0  10,759,758  
10,759,758          0.0  StartEnd  GPU::Alloc
+    1.8          413,753          1     413,753.0     413,753.0     413,753    
 413,753          0.0  StartEnd  CPU::L2Norm
+    0.1           15,873          1      15,873.0      15,873.0      15,873    
  15,873          0.0  StartEnd  GPU::H2DCopy
+    0.0              317          1         317.0         317.0         317    
     317          0.0  StartEnd  GPU::KernelLaunch
+```
+
+The output shows:
+- Time percentage for each operation
+- Total time in nanoseconds
+- Number of instances
+- Average, median, min, max execution times
+
+**CUDA API Summary** shows detailed CUDA call statistics:
+
+ Time (%)  Total Time (ns)  Num Calls   Avg (ns)     Med (ns)    Min (ns)   
Max (ns)   StdDev (ns)          Name
+ --------  ---------------  ---------  -----------  -----------  --------  
----------  -----------  --------------------
+     99.2       11,760,277          2  5,880,138.5  5,880,138.5     2,913  
11,757,364  8,311,652.0  cuMemAllocAsync
+      0.4           45,979          2     22,989.5     22,989.5     7,938      
38,041     21,286.0  cuMemcpyHtoDAsync_v2
+      0.1           14,722          1     14,722.0     14,722.0    14,722      
14,722          0.0  cuEventCreate
+      0.1           13,100          3      4,366.7      3,512.0       861      
 8,727      4,002.0  cuStreamSynchronize
+      0.1            9,468         11        860.7        250.0       114      
 4,671      1,453.3  cuCtxSetCurrent
+      0.1            6,479          1      6,479.0      6,479.0     6,479      
 6,479          0.0  cuEventDestroy_v2
+      0.0            4,599          2      2,299.5      2,299.5     1,773      
 2,826        744.6  cuMemFreeAsync
+- Memory allocation (`cuMemAllocAsync`)
+- Memory copies (`cuMemcpyHtoDAsync_v2`)
+- Stream synchronization (`cuStreamSynchronize`)
+
+## NVTX Markers
+
+The following markers are tracked:
+
+- `Mahout::Encode` - Complete encoding workflow
+- `CPU::L2Norm` - L2 normalization on CPU
+- `GPU::Alloc` - GPU memory allocation
+- `GPU::H2DCopy` - Host-to-device memory copy
+- `GPU::KernelLaunch` - CPU-side kernel launch
+- `GPU::Synchronize` - CPU waiting for GPU completion
+- `DLPack::Wrap` - Conversion to DLPack pointer
+
+## Using Profiling Macros
+
+The project provides zero-cost macros in `qdp-core/src/profiling.rs`:
+
+```rust
+// Profile a scope (automatically pops on exit)
+crate::profile_scope!("MyOperation");
+
+// Mark a point in time
+crate::profile_mark!("Checkpoint");
+```
+
+When `observability` feature is disabled, these macros compile to no-ops with 
zero runtime cost.
+
+## Example Location
+
+Source code: `qdp-core/examples/nvtx_profile.rs`
+
+## Troubleshooting
+
+**NVTX markers not appearing:**
+- Ensure `--features observability` is used during build
+- Verify CUDA device is available
+- Check that encoding actually executes
+
+**nsys warnings:**
+Warnings about CPU sampling are normal and can be ignored. They do not affect 
NVTX marker recording.
diff --git a/qdp/qdp-core/Cargo.toml b/qdp/qdp-core/Cargo.toml
index 1afe5f219..721ed5e54 100644
--- a/qdp/qdp-core/Cargo.toml
+++ b/qdp/qdp-core/Cargo.toml
@@ -8,6 +8,11 @@ cudarc = { workspace = true }
 qdp-kernels = { path = "../qdp-kernels" }
 thiserror = { workspace = true }
 rayon = { workspace = true }
+nvtx = { version = "1.3", optional = true }
 
 [lib]
 name = "qdp_core"
+
+[features]
+default = []
+observability = ["nvtx"]
diff --git a/qdp/qdp-core/examples/nvtx_profile.rs 
b/qdp/qdp-core/examples/nvtx_profile.rs
new file mode 100644
index 000000000..c463f3f08
--- /dev/null
+++ b/qdp/qdp-core/examples/nvtx_profile.rs
@@ -0,0 +1,79 @@
+//
+// Licensed to the Apache Software Foundation (ASF) under one or more
+// contributor license agreements.  See the NOTICE file distributed with
+// this work for additional information regarding copyright ownership.
+// The ASF licenses this file to You under the Apache License, Version 2.0
+// (the "License"); you may not use this file except in compliance with
+// the License.  You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+
+// NVTX profiling example
+// Run: cargo run -p qdp-core --example nvtx_profile --features observability 
--release
+
+use qdp_core::QdpEngine;
+
+fn main() {
+    println!("=== NVTX Profiling Example ===");
+    println!();
+
+    // Initialize the engine; report the error and exit early if that fails
+    let engine = match QdpEngine::new(0) {
+        Ok(e) => {
+            println!("✓ Engine initialized");
+            e
+        }
+        Err(e) => {
+            eprintln!("✗ Failed to initialize engine: {:?}", e);
+            return;
+        }
+    };
+
+    // Create test data: 1024 evenly spaced values in [0, 1)
+    let data: Vec<f64> = (0..1024).map(|i| (i as f64) / 1024.0).collect();
+    println!("✓ Created test data: {} elements", data.len());
+    println!();
+
+    println!("Starting encoding (NVTX markers will appear in Nsight 
Systems)...");
+    println!("Expected NVTX markers:");
+    println!("  - Mahout::Encode");
+    println!("  - CPU::L2Norm");
+    println!("  - GPU::Alloc");
+    println!("  - GPU::H2DCopy");
+    println!("  - GPU::KernelLaunch");
+    println!();
+
+    // Perform encoding (emits NVTX ranges when built with --features observability)
+    match engine.encode(&data, 10, "amplitude") {
+        Ok(ptr) => {
+            println!("✓ Encoding succeeded");
+            println!("✓ DLPack pointer: {:p}", ptr);
+
+            // Clean up via the tensor's own deleter. SAFETY (assumed): ptr is valid and non-null — TODO confirm
+            unsafe {
+                let managed = &mut *ptr;
+                if let Some(deleter) = managed.deleter.take() {
+                    deleter(ptr);
+                    println!("✓ Memory freed");
+                }
+            }
+        }
+        Err(e) => {
+            eprintln!("✗ Encoding failed: {:?}", e);
+        }
+    }
+
+    println!();
+    println!("=== Test Complete ===");
+    println!();
+    println!("To view NVTX markers, use Nsight Systems:");
+    println!("  nsys profile --trace=cuda,nvtx cargo run -p qdp-core --example 
nvtx_profile --features observability --release");
+    println!("Then open the generated .nsys-rep file in Nsight Systems");
+}
diff --git a/qdp/qdp-core/src/gpu/encodings/amplitude.rs 
b/qdp/qdp-core/src/gpu/encodings/amplitude.rs
index fe0fbf7fd..fb61b1d3c 100644
--- a/qdp/qdp-core/src/gpu/encodings/amplitude.rs
+++ b/qdp/qdp-core/src/gpu/encodings/amplitude.rs
@@ -26,7 +26,7 @@ use super::QuantumEncoder;
 #[cfg(target_os = "linux")]
 use std::ffi::c_void;
 #[cfg(target_os = "linux")]
-use cudarc::driver::{CudaSlice, DevicePtr};
+use cudarc::driver::DevicePtr;
 #[cfg(target_os = "linux")]
 use qdp_kernels::launch_amplitude_encode;
 
@@ -70,8 +70,11 @@ impl QuantumEncoder for AmplitudeEncoder {
         }
 
         // Calculate L2 norm (parallel on CPU for speed)
-        let norm_sq: f64 = host_data.par_iter().map(|x| x * x).sum();
-        let norm = norm_sq.sqrt();
+        let norm = {
+            crate::profile_scope!("CPU::L2Norm");
+            let norm_sq: f64 = host_data.par_iter().map(|x| x * x).sum();
+            norm_sq.sqrt()
+        };
 
         if norm == 0.0 {
             return Err(MahoutError::InvalidInput("Input data has zero 
norm".to_string()));
@@ -80,23 +83,32 @@ impl QuantumEncoder for AmplitudeEncoder {
         #[cfg(target_os = "linux")]
         {
             // Allocate GPU state vector
-            let state_vector = GpuStateVector::new(_device, num_qubits)?;
+            let state_vector = {
+                crate::profile_scope!("GPU::Alloc");
+                GpuStateVector::new(_device, num_qubits)?
+            };
 
             // Copy input data to GPU (synchronous, zero-copy from slice)
-            let input_slice: CudaSlice<f64> = _device.htod_sync_copy(host_data)
-                .map_err(|e| MahoutError::MemoryAllocation(format!("Failed to 
allocate input buffer: {:?}", e)))?;
-
-            // Launch CUDA kernel
-            // Safety: pointers valid until kernel completes (htod_sync_copy 
waits)
-            let ret = unsafe {
-                launch_amplitude_encode(
-                    *input_slice.device_ptr() as *const f64,
-                    state_vector.ptr() as *mut c_void,
-                    host_data.len() as i32,
-                    state_len as i32,
-                    norm,
-                    std::ptr::null_mut(), // default stream
-                )
+            // TODO : Use async CUDA streams for pipeline overlap
+            let input_slice = {
+                crate::profile_scope!("GPU::H2DCopy");
+                _device.htod_sync_copy(host_data)
+                    .map_err(|e| MahoutError::MemoryAllocation(format!("Failed 
to allocate input buffer: {:?}", e)))?
+            };
+
+            // Launch CUDA kernel (CPU-side launch only; execution is 
asynchronous)
+            let ret = {
+                crate::profile_scope!("GPU::KernelLaunch");
+                unsafe {
+                    launch_amplitude_encode(
+                        *input_slice.device_ptr() as *const f64,
+                        state_vector.ptr() as *mut c_void,
+                        host_data.len() as i32,
+                        state_len as i32,
+                        norm,
+                        std::ptr::null_mut(), // default stream
+                    )
+                }
             };
 
             if ret != 0 {
@@ -108,6 +120,14 @@ impl QuantumEncoder for AmplitudeEncoder {
                 return Err(MahoutError::KernelLaunch(error_msg));
             }
 
+            // Block until all work on the device is complete
+            {
+                crate::profile_scope!("GPU::Synchronize");
+                _device
+                    .synchronize()
+                    .map_err(|e| MahoutError::Cuda(format!("CUDA device 
synchronize failed: {:?}", e)))?;
+            }
+
             Ok(state_vector)
         }
 
diff --git a/qdp/qdp-core/src/gpu/memory.rs b/qdp/qdp-core/src/gpu/memory.rs
index ed786c79b..38bb90e27 100644
--- a/qdp/qdp-core/src/gpu/memory.rs
+++ b/qdp/qdp-core/src/gpu/memory.rs
@@ -57,10 +57,10 @@ impl GpuStateVector {
     pub fn new(_device: &Arc<CudaDevice>, qubits: usize) -> Result<Self> {
         let _size_elements = 1 << qubits;
 
-        // Use alloc_zeros for device-side allocation (critical for 
performance):
+        // Use device-side allocation (critical for performance):
         // - No CPU RAM usage (avoids OOM for large states)
-        // - No PCIe transfer (GPU hardware zero-fill)
         // - Fast: microseconds vs seconds for 30 qubits (16GB)
+        // TODO: Use uninitialized alloc() when kernel fully implements padding
         #[cfg(target_os = "linux")]
         {
             // Allocate GPU memory (zero-initialized)
diff --git a/qdp/qdp-core/src/lib.rs b/qdp/qdp-core/src/lib.rs
index a6439ca55..6634b15c4 100644
--- a/qdp/qdp-core/src/lib.rs
+++ b/qdp/qdp-core/src/lib.rs
@@ -18,6 +18,9 @@ pub mod dlpack;
 pub mod gpu;
 pub mod error;
 
+#[macro_use]
+mod profiling;
+
 pub use error::{MahoutError, Result};
 
 use std::sync::Arc;
@@ -66,9 +69,15 @@ impl QdpEngine {
         num_qubits: usize,
         encoding_method: &str,
     ) -> Result<*mut DLManagedTensor> {
+        crate::profile_scope!("Mahout::Encode");
+
         let encoder = get_encoder(encoding_method)?;
         let state_vector = encoder.encode(&self.device, data, num_qubits)?;
-        Ok(state_vector.to_dlpack())
+        let dlpack_ptr = {
+            crate::profile_scope!("DLPack::Wrap");
+            state_vector.to_dlpack()
+        };
+        Ok(dlpack_ptr)
     }
 
     /// Get CUDA device reference for advanced operations
diff --git a/qdp/qdp-core/src/profiling.rs b/qdp/qdp-core/src/profiling.rs
new file mode 100644
index 000000000..0b60e6c82
--- /dev/null
+++ b/qdp/qdp-core/src/profiling.rs
@@ -0,0 +1,78 @@
+//
+// Licensed to the Apache Software Foundation (ASF) under one or more
+// contributor license agreements.  See the NOTICE file distributed with
+// this work for additional information regarding copyright ownership.
+// The ASF licenses this file to You under the Apache License, Version 2.0
+// (the "License"); you may not use this file except in compliance with
+// the License.  You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+
+// Zero-cost profiling macros for NVTX integration
+//
+// Provides clean abstraction over NVTX markers without cluttering business 
logic.
+// When observability feature is disabled, these macros compile to no-ops.
+
+/// Profile the enclosing scope with an NVTX range held by a guard value
+///
+/// Expands to a `let` binding of `nvtx::range!($name)`, so the guard lives
+/// until the end of the block in which the macro is invoked.
+///
+/// # Example (in-crate usage; the fence is `ignore`d, so it is not a doctest)
+/// ```rust,ignore
+/// fn my_function() {
+///     crate::profile_scope!("MyFunction");
+///     // ... code ...
+///     // NOTE(review): range presumably ends when the guard drops — confirm in nvtx docs
+/// }
+/// ```
+#[cfg(feature = "observability")]
+#[macro_export]
+macro_rules! profile_scope {
+    ($name:expr) => {
+        let _scope_guard = nvtx::range!($name);
+    };
+}
+
+/// No-op version when observability is disabled
+///
+/// Expands to nothing, so the `$name` expression is never evaluated.
+#[cfg(not(feature = "observability"))]
+#[macro_export]
+macro_rules! profile_scope {
+    ($name:expr) => {
+        // Intentionally empty: no tokens are emitted at the call site
+    };
+}
+
+/// Mark a point in time with an NVTX marker
+///
+/// Forwards `$name` to `nvtx::mark!`; no range guard is created or held.
+///
+/// # Example (in-crate usage; the fence is `ignore`d, so it is not a doctest)
+/// ```rust,ignore
+/// crate::profile_mark!("CheckpointReached");
+/// ```
+#[cfg(feature = "observability")]
+#[macro_export]
+macro_rules! profile_mark {
+    ($name:expr) => {
+        nvtx::mark!($name);
+    };
+}
+
+/// No-op version when observability is disabled; `$name` is never evaluated
+#[cfg(not(feature = "observability"))]
+#[macro_export]
+macro_rules! profile_mark {
+    ($name:expr) => {
+        // Intentionally empty: no tokens are emitted at the call site
+    };
+}
diff --git a/qdp/qdp-core/tests/common/mod.rs b/qdp/qdp-core/tests/common/mod.rs
index d13e4623d..f105a5436 100644
--- a/qdp/qdp-core/tests/common/mod.rs
+++ b/qdp/qdp-core/tests/common/mod.rs
@@ -1,3 +1,19 @@
+//
+// Licensed to the Apache Software Foundation (ASF) under one or more
+// contributor license agreements.  See the NOTICE file distributed with
+// this work for additional information regarding copyright ownership.
+// The ASF licenses this file to You under the Apache License, Version 2.0
+// (the "License"); you may not use this file except in compliance with
+// the License.  You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
 /// Create test data with normalized values
 pub fn create_test_data(size: usize) -> Vec<f64> {
     (0..size).map(|i| (i as f64) / (size as f64)).collect()
diff --git a/qdp/qdp-core/tests/validation.rs b/qdp/qdp-core/tests/validation.rs
index e8a1a3443..cc12a995a 100644
--- a/qdp/qdp-core/tests/validation.rs
+++ b/qdp/qdp-core/tests/validation.rs
@@ -1,3 +1,19 @@
+//
+// Licensed to the Apache Software Foundation (ASF) under one or more
+// contributor license agreements.  See the NOTICE file distributed with
+// this work for additional information regarding copyright ownership.
+// The ASF licenses this file to You under the Apache License, Version 2.0
+// (the "License"); you may not use this file except in compliance with
+// the License.  You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
 // Input validation and error handling tests
 
 use qdp_core::{QdpEngine, MahoutError};
diff --git a/qdp/qdp-kernels/build.rs b/qdp/qdp-kernels/build.rs
index 0956708ac..2e3b01b27 100644
--- a/qdp/qdp-kernels/build.rs
+++ b/qdp/qdp-kernels/build.rs
@@ -60,6 +60,8 @@ fn main() {
     // This uses cc crate's CUDA support to invoke nvcc
     let mut build = cc::Build::new();
 
+    build.include(format!("{}/include", cuda_path));
+
     build
         .cuda(true)
         .flag("-cudart=shared")  // Use shared CUDA runtime

(mahout) 05/50: [QDP] add NVTX to trace validity (#650)

Reply via email to