Re: [PR] [QDP] The Scaling Test (Latency vs. Qubits) [mahout]

via GitHub Sun, 04 Jan 2026 02:17:18 -0800


rich7420 commented on code in PR #778:
URL: https://github.com/apache/mahout/pull/778#discussion_r2659556182



##########
qdp/qdp-python/benchmark/benchmark_latency.py:
##########
@@ -0,0 +1,366 @@
+#!/usr/bin/env python3
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""
+Data-to-State latency benchmark: CPU RAM -> GPU VRAM.
+
+Run:
+    python qdp/qdp-python/benchmark/benchmark_latency.py --qubits 16 \
+        --batches 200 --batch-size 64 --prefetch 16
+"""
+
+from __future__ import annotations
+
+import argparse
+import queue
+import threading
+import time
+
+import numpy as np
+import torch
+
+from mahout_qdp import QdpEngine
+
+BAR = "=" * 70
+SEP = "-" * 70
+FRAMEWORK_CHOICES = ("pennylane", "qiskit-init", "qiskit-statevector", 
"mahout")
+FRAMEWORK_LABELS = {
+    "mahout": "Mahout",
+    "pennylane": "PennyLane",
+    "qiskit-init": "Qiskit Initialize",
+    "qiskit-statevector": "Qiskit Statevector",
+}
+
+try:
+    import pennylane as qml
+
+    HAS_PENNYLANE = True
+except ImportError:
+    HAS_PENNYLANE = False
+
+try:
+    from qiskit import QuantumCircuit, transpile
+    from qiskit_aer import AerSimulator
+    from qiskit.quantum_info import Statevector
+
+    HAS_QISKIT = True
+except ImportError:
+    HAS_QISKIT = False
+
+
+def sync_cuda() -> None:
+    if torch.cuda.is_available():
+        torch.cuda.synchronize()
+
+
+def build_sample(seed: int, vector_len: int) -> np.ndarray:
+    mask = np.uint64(vector_len - 1)
+    scale = 1.0 / vector_len
+    idx = np.arange(vector_len, dtype=np.uint64)
+    mixed = (idx + np.uint64(seed)) & mask
+    return mixed.astype(np.float64) * scale
+
+
+def prefetched_batches(
+    total_batches: int, batch_size: int, vector_len: int, prefetch: int
+):
+    q: queue.Queue[np.ndarray | None] = queue.Queue(maxsize=prefetch)
+
+    def producer():
+        for batch_idx in range(total_batches):
+            base = batch_idx * batch_size
+            batch = [build_sample(base + i, vector_len) for i in 
range(batch_size)]
+            q.put(np.stack(batch))
+        q.put(None)
+
+    threading.Thread(target=producer, daemon=True).start()
+
+    while True:
+        batch = q.get()
+        if batch is None:
+            break
+        yield batch
+
+
+def normalize_batch(batch: np.ndarray) -> np.ndarray:
+    norms = np.linalg.norm(batch, axis=1, keepdims=True)
+    norms[norms == 0] = 1.0
+    return batch / norms
+
+
+def parse_frameworks(raw: str) -> list[str]:
+    if raw.lower() == "all":
+        return list(FRAMEWORK_CHOICES)
+
+    selected: list[str] = []
+    for part in raw.split(","):
+        name = part.strip().lower()
+        if not name:
+            continue
+        if name not in FRAMEWORK_CHOICES:
+            raise ValueError(
+                f"Unknown framework '{name}'. Choose from: "
+                f"{', '.join(FRAMEWORK_CHOICES)} or 'all'."
+            )
+        if name not in selected:
+            selected.append(name)
+
+    return selected if selected else list(FRAMEWORK_CHOICES)
+
+
+def run_mahout(num_qubits: int, total_batches: int, batch_size: int, prefetch: 
int):
+    try:
+        engine = QdpEngine(0)
+    except Exception as exc:
+        print(f"[Mahout] Init failed: {exc}")
+        return 0.0, 0.0
+
+    sync_cuda()
+    start = time.perf_counter()
+    processed = 0
+
+    for batch in prefetched_batches(
+        total_batches, batch_size, 1 << num_qubits, prefetch
+    ):
+        normalized = normalize_batch(batch)
+        for sample in normalized:
+            qtensor = engine.encode(sample.tolist(), num_qubits, "amplitude")
+            _ = torch.utils.dlpack.from_dlpack(qtensor)
+            processed += 1
+
+    sync_cuda()
+    duration = time.perf_counter() - start
+    latency_ms = (duration / processed) * 1000 if processed > 0 else 0.0
+    print(f"  Total Time: {duration:.4f} s ({latency_ms:.3f} ms/vector)")
+    return duration, latency_ms
+
+
+def run_pennylane(num_qubits: int, total_batches: int, batch_size: int, 
prefetch: int):
+    if not HAS_PENNYLANE:
+        print("[PennyLane] Not installed, skipping.")
+        return 0.0, 0.0
+
+    dev = qml.device("default.qubit", wires=num_qubits)
+
+    @qml.qnode(dev, interface="torch")
+    def circuit(inputs):
+        qml.AmplitudeEmbedding(
+            features=inputs, wires=range(num_qubits), normalize=True, 
pad_with=0.0
+        )
+        return qml.state()
+
+    sync_cuda()
+    start = time.perf_counter()
+    processed = 0
+
+    for batch in prefetched_batches(
+        total_batches, batch_size, 1 << num_qubits, prefetch
+    ):
+        batch_cpu = torch.tensor(batch, dtype=torch.float64)
+        try:
+            state_cpu = circuit(batch_cpu)
+        except Exception:
+            state_cpu = torch.stack([circuit(x) for x in batch_cpu])
+        _ = state_cpu.to("cuda", dtype=torch.complex64)
+        processed += len(batch_cpu)
+
+    sync_cuda()
+    duration = time.perf_counter() - start
+    latency_ms = (duration / processed) * 1000 if processed > 0 else 0.0
+    print(f"  Total Time: {duration:.4f} s ({latency_ms:.3f} ms/vector)")
+    return duration, latency_ms
+
+
+def run_qiskit_init(
+    num_qubits: int, total_batches: int, batch_size: int, prefetch: int
+):
+    if not HAS_QISKIT:
+        print("[Qiskit] Not installed, skipping.")
+        return 0.0, 0.0
+
+    backend = AerSimulator(method="statevector")
+    sync_cuda()
+    start = time.perf_counter()
+    processed = 0
+
+    for batch in prefetched_batches(
+        total_batches, batch_size, 1 << num_qubits, prefetch
+    ):
+        normalized = normalize_batch(batch)
+        for vec in normalized:
+            qc = QuantumCircuit(num_qubits)
+            qc.initialize(vec, range(num_qubits))
+            qc.save_statevector()
+            t_qc = transpile(qc, backend)
+            state = backend.run(t_qc).result().get_statevector().data
+            _ = torch.tensor(state, device="cuda", dtype=torch.complex64)
+            processed += 1
+
+    sync_cuda()
+    duration = time.perf_counter() - start
+    latency_ms = (duration / processed) * 1000 if processed > 0 else 0.0
+    print(f"  Total Time: {duration:.4f} s ({latency_ms:.3f} ms/vector)")
+    return duration, latency_ms
+
+
+def run_qiskit_statevector(
+    num_qubits: int, total_batches: int, batch_size: int, prefetch: int
+):
+    if not HAS_QISKIT:
+        print("[Qiskit] Not installed, skipping.")
+        return 0.0, 0.0
+
+    sync_cuda()
+    start = time.perf_counter()
+    processed = 0
+
+    for batch in prefetched_batches(
+        total_batches, batch_size, 1 << num_qubits, prefetch
+    ):
+        normalized = normalize_batch(batch)
+        for vec in normalized:
+            state = Statevector(vec)
+            _ = torch.tensor(state.data, device="cuda", dtype=torch.complex64)
+            processed += 1
+
+    sync_cuda()
+    duration = time.perf_counter() - start
+    latency_ms = (duration / processed) * 1000 if processed > 0 else 0.0
+    print(f"  Total Time: {duration:.4f} s ({latency_ms:.3f} ms/vector)")
+    return duration, latency_ms
+
+
+def main():
+    parser = argparse.ArgumentParser(
+        description="Benchmark Data-to-State latency across frameworks."
+    )
+    parser.add_argument(
+        "--qubits",
+        type=int,
+        default=16,
+        help="Number of qubits (power-of-two vector length).",
+    )
+    parser.add_argument("--batches", type=int, default=200, help="Total 
batches.")
+    parser.add_argument("--batch-size", type=int, default=64, help="Vectors 
per batch.")
+    parser.add_argument(
+        "--prefetch", type=int, default=16, help="CPU-side prefetch depth."
+    )
+    parser.add_argument(
+        "--frameworks",
+        type=str,
+        default="all",
+        help=(
+            "Comma-separated list of frameworks to run "
+            "(pennylane,qiskit-init,qiskit-statevector,mahout) or 'all'."
+        ),
+    )
+    args = parser.parse_args()
+
+    if not torch.cuda.is_available():
+        raise SystemExit("CUDA device not available; GPU is required.")
+
+    try:
+        frameworks = parse_frameworks(args.frameworks)
+    except ValueError as exc:
+        parser.error(str(exc))
+
+    total_vectors = args.batches * args.batch_size
+    vector_len = 1 << args.qubits
+
+    print(f"Generating {total_vectors} samples of {args.qubits} qubits...")
+    print(f"  Batch size   : {args.batch_size}")
+    print(f"  Vector length: {vector_len}")
+    print(f"  Batches      : {args.batches}")
+    print(f"  Prefetch     : {args.prefetch}")
+    print(f"  Frameworks   : {', '.join(frameworks)}")
+    bytes_per_vec = vector_len * 8
+    print(f"  Generated {total_vectors} samples")
+    print(
+        f"  PennyLane/Qiskit format: {total_vectors * bytes_per_vec / (1024 * 
1024):.2f} MB"
+    )
+    print(f"  Mahout format: {total_vectors * bytes_per_vec / (1024 * 
1024):.2f} MB")
+    print()
+
+    print(BAR)
+    print(
+        f"DATA-TO-STATE LATENCY BENCHMARK: {args.qubits} Qubits, 
{total_vectors} Samples"
+    )
+    print(BAR)
+
+    t_pl = l_pl = t_q_init = l_q_init = t_q_sv = l_q_sv = t_mahout = l_mahout 
= 0.0

Review Comment:
   That's hard to read. Could you rewrite it in a clearer form?



##########
qdp/qdp-python/benchmark/benchmark_latency.md:
##########
@@ -0,0 +1,78 @@
+# Data-to-State Latency Benchmark
+
+This benchmark isolates the "Data-to-State" pipeline (CPU RAM -> GPU VRAM) and
+compares Mahout (QDP) against PennyLane and Qiskit baselines:
+
+- Qiskit Initialize (`qiskit-init`): circuit-based state preparation.
+- Qiskit Statevector (`qiskit-statevector`): raw data loading baseline.
+
+The primary metric is average time-to-state in milliseconds (lower is better).
+
+## Workload
+
+- Qubits: 16 (vector length `2^16`)
+- Batches: 200
+- Batch size: 64
+- Prefetch depth: 16 (CPU producer queue)
+
+## Running
+
+```bash
+# Latency test (CPU RAM -> GPU VRAM)
+python qdp/qdp-python/benchmark/benchmark_latency.py --qubits 16 \
+  --batches 200 --batch-size 64 --prefetch 16
+
+# Run only selected frameworks
+python qdp/qdp-python/benchmark/benchmark_latency.py --frameworks 
mahout,pennylane
+```
+
+## Example Output
+
+```
+Generating 12800 samples of 16 qubits...
+  Batch size   : 64
+  Vector length: 65536
+  Batches      : 200
+  Prefetch     : 16
+  Frameworks   : pennylane, qiskit-init, qiskit-statevector, mahout
+  Generated 12800 samples
+  PennyLane/Qiskit format: 6400.00 MB
+  Mahout format: 6400.00 MB
+
+======================================================================
+DATA-TO-STATE LATENCY BENCHMARK: 16 Qubits, 12800 Samples
+======================================================================
+
+[PennyLane] Full Pipeline (DataLoader -> GPU)...
+  Total Time: 26.1952 s (2.047 ms/vector)
+
+[Qiskit Initialize] Full Pipeline (DataLoader -> GPU)...
+  Total Time: 975.8720 s (76.243 ms/vector)
+
+[Qiskit Statevector] Full Pipeline (DataLoader -> GPU)...
+  Total Time: 115.5840 s (9.030 ms/vector)
+
+[Mahout] Full Pipeline (DataLoader -> GPU)...
+  Total Time: 11.5384 s (0.901 ms/vector)
+
+======================================================================
+LATENCY (Lower is Better)
+Samples: 12800, Qubits: 16
+======================================================================
+Mahout             0.901 ms/vector
+PennyLane          2.047 ms/vector
+Qiskit Statevector 9.030 ms/vector
+Qiskit Initialize  76.243 ms/vector
+----------------------------------------------------------------------
+Speedup vs PennyLane:       2.27x
+Speedup vs Qiskit Init:    84.61x
+Speedup vs Qiskit Statevec: 10.02x
+```
+
+## Notes

Review Comment:
   Should we add a reminder that the results may vary depending on the device?



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Re: [PR] [QDP] The Scaling Test (Latency vs. Qubits) [mahout]

Reply via email to