(mahout) 24/50: [QDP] improve e2e benchmark (#713)

guanmingchiu Tue, 06 Jan 2026 08:45:46 -0800

This is an automated email from the ASF dual-hosted git repository.

guanmingchiu pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/mahout.git


commit 213b702be8f7dba01a651a4efc8e047d6caa39af
Author: KUAN-HAO HUANG <[email protected]>
AuthorDate: Fri Dec 12 16:11:56 2025 +0800

    [QDP] improve e2e benchmark (#713)
    
    * improve e2e benchmark
    
    * gc.collect first
---
 .../{benchmark_e2e_final.py => benchmark_e2e.py}   | 54 +++++++++++++++++++++-
 1 file changed, 52 insertions(+), 2 deletions(-)

diff --git a/qdp/benchmark/benchmark_e2e_final.py b/qdp/benchmark/benchmark_e2e.py
similarity index 92%
rename from qdp/benchmark/benchmark_e2e_final.py
rename to qdp/benchmark/benchmark_e2e.py
index c00476b2d..b72c81d1f 100644
--- a/qdp/benchmark/benchmark_e2e_final.py
+++ b/qdp/benchmark/benchmark_e2e.py
@@ -35,6 +35,7 @@ import torch.nn as nn
 import numpy as np
 import os
 import itertools
+import gc
 import pyarrow as pa
 import pyarrow.parquet as pq
 import pyarrow.ipc as ipc
@@ -63,6 +64,14 @@ HIDDEN_DIM = 16
 BATCH_SIZE = 64  # Small batch to stress loop overhead
 
 
+def clean_cache():
+    """Clear GPU cache and Python garbage collection."""
+    gc.collect()
+    if torch.cuda.is_available():
+        torch.cuda.empty_cache()
+        torch.cuda.synchronize()
+
+
 class DummyQNN(nn.Module):
     def __init__(self, n_qubits):
         super().__init__()
@@ -102,6 +111,9 @@ def generate_data(n_qubits, n_samples):
     print(f"  Generated {n_samples} samples")
     print(f"  Parquet: {parquet_size:.2f} MB, Arrow IPC: {arrow_size:.2f} MB")
 
+    # Clean cache after data generation
+    clean_cache()
+
 
 # -----------------------------------------------------------
 # 1. Qiskit Full Pipeline
@@ -111,6 +123,9 @@ def run_qiskit(n_qubits, n_samples):
         print("\n[Qiskit] Not installed, skipping.")
         return 0.0, None
 
+    # Clean cache before starting benchmark
+    clean_cache()
+
     print("\n[Qiskit] Full Pipeline (Disk -> GPU)...")
     model = DummyQNN(n_qubits).cuda()
     backend = AerSimulator(method="statevector")
@@ -162,6 +177,10 @@ def run_qiskit(n_qubits, n_samples):
     print(f"\n  Total Time: {total_time:.4f} s")
 
     all_qiskit_tensor = torch.cat(all_qiskit_states, dim=0)
+
+    # Clean cache after benchmark completion
+    clean_cache()
+
     return total_time, all_qiskit_tensor
 
 
@@ -173,6 +192,9 @@ def run_pennylane(n_qubits, n_samples):
         print("\n[PennyLane] Not installed, skipping.")
         return 0.0, None
 
+    # Clean cache before starting benchmark
+    clean_cache()
+
     print("\n[PennyLane] Full Pipeline (Disk -> GPU)...")
 
     dev = qml.device("default.qubit", wires=n_qubits)
@@ -224,6 +246,9 @@ def run_pennylane(n_qubits, n_samples):
         all_pl_states, dim=0
     )  # Should handle cases where last batch is smaller
 
+    # Clean cache after benchmark completion
+    clean_cache()
+
     return total_time, all_pl_states_tensor
 
 
@@ -231,6 +256,9 @@ def run_pennylane(n_qubits, n_samples):
 # 3. Mahout Parquet Pipeline
 # -----------------------------------------------------------
 def run_mahout_parquet(engine, n_qubits, n_samples):
+    # Clean cache before starting benchmark
+    clean_cache()
+
     print("\n[Mahout-Parquet] Full Pipeline (Parquet -> GPU)...")
     model = DummyQNN(n_qubits).cuda()
 
@@ -251,10 +279,10 @@ def run_mahout_parquet(engine, n_qubits, n_samples):
 
     # Reshape to [n_samples, state_len] (still complex)
     state_len = 1 << n_qubits
-    gpu_reshaped = gpu_batched.view(n_samples, state_len)
 
     # Convert to float for model (batch already on GPU)
     reshape_start = time.perf_counter()
+    gpu_reshaped = gpu_batched.view(n_samples, state_len)
     gpu_all_data = gpu_reshaped.abs().to(torch.float32)
     reshape_time = time.perf_counter() - reshape_start
     print(f"  Reshape & convert: {reshape_time:.4f} s")
@@ -267,6 +295,10 @@ def run_mahout_parquet(engine, n_qubits, n_samples):
     torch.cuda.synchronize()
     total_time = time.perf_counter() - start_time
     print(f"  Total Time: {total_time:.4f} s")
+
+    # Clean cache after benchmark completion
+    clean_cache()
+
     return total_time, gpu_reshaped
 
 
@@ -274,6 +306,9 @@ def run_mahout_parquet(engine, n_qubits, n_samples):
 # 4. Mahout Arrow IPC Pipeline
 # -----------------------------------------------------------
 def run_mahout_arrow(engine, n_qubits, n_samples):
+    # Clean cache before starting benchmark
+    clean_cache()
+
     print("\n[Mahout-Arrow] Full Pipeline (Arrow IPC -> GPU)...")
     model = DummyQNN(n_qubits).cuda()
 
@@ -291,9 +326,9 @@ def run_mahout_arrow(engine, n_qubits, n_samples):
     print(f"  DLPack conversion: {dlpack_time:.4f} s")
 
     state_len = 1 << n_qubits
-    gpu_reshaped = gpu_batched.view(n_samples, state_len)
 
     reshape_start = time.perf_counter()
+    gpu_reshaped = gpu_batched.view(n_samples, state_len)
     gpu_all_data = gpu_reshaped.abs().to(torch.float32)
     reshape_time = time.perf_counter() - reshape_start
     print(f"  Reshape & convert: {reshape_time:.4f} s")
@@ -305,6 +340,10 @@ def run_mahout_arrow(engine, n_qubits, n_samples):
     torch.cuda.synchronize()
     total_time = time.perf_counter() - start_time
     print(f"  Total Time: {total_time:.4f} s")
+
+    # Clean cache after benchmark completion
+    clean_cache()
+
     return total_time, gpu_reshaped
 
 
@@ -378,6 +417,9 @@ if __name__ == "__main__":
         print(f"Mahout Init Error: {e}")
         exit(1)
 
+    # Clean cache before starting benchmarks
+    clean_cache()
+
     print("\n" + "=" * 70)
     print(f"E2E BENCHMARK: {args.qubits} Qubits, {args.samples} Samples")
     print("=" * 70)
@@ -391,19 +433,27 @@ if __name__ == "__main__":
     # Run benchmarks
     if "pennylane" in args.frameworks:
         t_pl, pl_all_states = run_pennylane(args.qubits, args.samples)
+        # Clean cache between framework benchmarks
+        clean_cache()
 
     if "qiskit" in args.frameworks:
         t_qiskit, qiskit_all_states = run_qiskit(args.qubits, args.samples)
+        # Clean cache between framework benchmarks
+        clean_cache()
 
     if "mahout-parquet" in args.frameworks:
         t_mahout_parquet, mahout_parquet_all_states = run_mahout_parquet(
             engine, args.qubits, args.samples
         )
+        # Clean cache between framework benchmarks
+        clean_cache()
 
     if "mahout-arrow" in args.frameworks:
         t_mahout_arrow, mahout_arrow_all_states = run_mahout_arrow(
             engine, args.qubits, args.samples
         )
+        # Clean cache between framework benchmarks
+        clean_cache()
 
     print("\n" + "=" * 70)
     print("E2E LATENCY (Lower is Better)")

(mahout) 24/50: [QDP] improve e2e benchmark (#713)

Reply via email to