This is an automated email from the ASF dual-hosted git repository.

ryankert01 pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/mahout.git


The following commit(s) were added to refs/heads/main by this push:
     new 0076003b0 [QDP] Pr1 phase kernel opt (#1386)
0076003b0 is described below

commit 0076003b0777ade5ae4d2e8ad9b2e8b0c83a3cd3
Author: aloha1357 <[email protected]>
AuthorDate: Tue Jun 9 20:24:47 2026 +0200

    [QDP] Pr1 phase kernel opt (#1386)
    
    * feat(qdp): optimize phase kernel divergence and hoist constant mem 
computation
    
    * test(qdp): add phase encoding tests and benchmark script
    
    ---------
    
    Co-authored-by: Ryan Huang <[email protected]>
---
 qdp/qdp-kernels/src/iqp.cu                  |   8 +--
 qdp/qdp-kernels/src/phase.cu                |  51 +++++++-------
 qdp/qdp-python/benchmark/README.md          |   2 +
 qdp/qdp-python/benchmark/benchmark_phase.py | 105 ++++++++++++++++++++++++++++
 testing/qdp/test_bindings.py                |  51 ++++++++++++++
 5 files changed, 187 insertions(+), 30 deletions(-)

diff --git a/qdp/qdp-kernels/src/iqp.cu b/qdp/qdp-kernels/src/iqp.cu
index a521255f4..51fd022ff 100644
--- a/qdp/qdp-kernels/src/iqp.cu
+++ b/qdp/qdp-kernels/src/iqp.cu
@@ -43,9 +43,7 @@ __device__ double compute_phase(
 
     // Single-qubit Z terms: sum_i x_i * data[i]
     for (unsigned int i = 0; i < num_qubits; ++i) {
-        if ((x >> i) & 1U) {
-            phase += data[i];
-        }
+        phase += data[i] * (double)((x >> i) & 1U);
     }
 
     // Two-qubit ZZ terms (if enabled): sum_{i<j} x_i * x_j * data[n + 
pair_index]
@@ -53,9 +51,7 @@ __device__ double compute_phase(
         unsigned int pair_idx = num_qubits;
         for (unsigned int i = 0; i < num_qubits; ++i) {
             for (unsigned int j = i + 1; j < num_qubits; ++j) {
-                if (((x >> i) & 1U) && ((x >> j) & 1U)) {
-                    phase += data[pair_idx];
-                }
+                phase += data[pair_idx] * (double)(((x >> i) & 1U) & ((x >> j) 
& 1U));
                 pair_idx++;
             }
         }
diff --git a/qdp/qdp-kernels/src/phase.cu b/qdp/qdp-kernels/src/phase.cu
index 1ee757a04..c87c1adac 100644
--- a/qdp/qdp-kernels/src/phase.cu
+++ b/qdp/qdp-kernels/src/phase.cu
@@ -29,24 +29,19 @@
 #include <cuda_runtime.h>
 #include <cuComplex.h>
 #include <math.h>
-#include "kernel_config.h"
 
-// Precompute 1/√2^n as a compile-time-friendly inline.
-// For n qubits the norm factor is pow(M_SQRT1_2, n).
-__device__ __forceinline__ double phase_norm(unsigned int num_qubits) {
-    // M_SQRT1_2 = 1/√2 ≈ 0.7071067811865476
-    double factor = 1.0;
-    for (unsigned int k = 0; k < num_qubits; ++k) {
-        factor *= M_SQRT1_2;
-    }
-    return factor;
-}
+#ifndef M_SQRT1_2
+#define M_SQRT1_2 0.70710678118654752440
+#endif
+
+#include "kernel_config.h"
 
 __global__ void phase_encode_kernel(
     const double* __restrict__ phases,
     cuDoubleComplex* __restrict__ state,
     size_t state_len,
-    unsigned int num_qubits
+    unsigned int num_qubits,
+    double norm_factor
 ) {
     size_t idx = blockIdx.x * blockDim.x + threadIdx.x;
     if (idx >= state_len) return;
@@ -55,16 +50,14 @@ __global__ void phase_encode_kernel(
     double phi = 0.0;
     double norm = 1.0;
     for (unsigned int bit = 0; bit < num_qubits; ++bit) {
-        if ((idx >> bit) & 1U) {
-            phi += phases[bit];
-        }
+        phi += phases[bit] * (double)((idx >> bit) & 1U);
         norm *= M_SQRT1_2;
     }
 
     double re, im;
     sincos(phi, &im, &re);   // re = cos(φ), im = sin(φ)
 
-    state[idx] = make_cuDoubleComplex(norm * re, norm * im);
+    state[idx] = make_cuDoubleComplex(norm_factor * re, norm_factor * im);
 }
 
 __global__ void phase_encode_batch_kernel(
@@ -72,7 +65,8 @@ __global__ void phase_encode_batch_kernel(
     cuDoubleComplex* __restrict__ state_batch,
     size_t num_samples,
     size_t state_len,
-    unsigned int num_qubits
+    unsigned int num_qubits,
+    double norm_factor
 ) {
     const size_t total_elements = num_samples * state_len;
     const size_t stride = gridDim.x * blockDim.x;
@@ -87,16 +81,13 @@ __global__ void phase_encode_batch_kernel(
 
         double phi = 0.0;
         for (unsigned int bit = 0; bit < num_qubits; ++bit) {
-            if ((element_idx >> bit) & 1U) {
-                phi += phases[bit];
-            }
+            phi += phases[bit] * (double)((element_idx >> bit) & 1U);
         }
 
-        double norm = phase_norm(num_qubits);
         double re, im;
         sincos(phi, &im, &re);
 
-        state_batch[global_idx] = make_cuDoubleComplex(norm * re, norm * im);
+        state_batch[global_idx] = make_cuDoubleComplex(norm_factor * re, 
norm_factor * im);
     }
 }
 
@@ -133,11 +124,17 @@ int launch_phase_encode(
     const int blockSize = DEFAULT_BLOCK_SIZE;
     const int gridSize = (state_len + blockSize - 1) / blockSize;
 
+    double norm_factor = 1.0;
+    for (unsigned int k = 0; k < num_qubits; ++k) {
+        norm_factor *= M_SQRT1_2;
+    }
+
     phase_encode_kernel<<<gridSize, blockSize, 0, stream>>>(
         phases_d,
         state_complex_d,
         state_len,
-        num_qubits
+        num_qubits,
+        norm_factor
     );
 
     return (int)cudaGetLastError();
@@ -175,12 +172,18 @@ int launch_phase_encode_batch(
     const size_t max_blocks = MAX_GRID_BLOCKS;
     const size_t gridSize = (blocks_needed < max_blocks) ? blocks_needed : 
max_blocks;
 
+    double norm_factor = 1.0;
+    for (unsigned int k = 0; k < num_qubits; ++k) {
+        norm_factor *= M_SQRT1_2;
+    }
+
     phase_encode_batch_kernel<<<gridSize, blockSize, 0, stream>>>(
         phases_batch_d,
         state_complex_d,
         num_samples,
         state_len,
-        num_qubits
+        num_qubits,
+        norm_factor
     );
 
     return (int)cudaGetLastError();
diff --git a/qdp/qdp-python/benchmark/README.md 
b/qdp/qdp-python/benchmark/README.md
index 0dbea81ee..d30715a1c 100644
--- a/qdp/qdp-python/benchmark/README.md
+++ b/qdp/qdp-python/benchmark/README.md
@@ -8,6 +8,7 @@ scripts:
 - `benchmark_throughput.py`: DataLoader-style throughput benchmark
   that measures vectors/sec across Mahout, PennyLane, and Qiskit.
 - `benchmark_latency.py`: Data-to-State latency benchmark (CPU RAM -> GPU 
VRAM).
+- `benchmark_phase.py`: GPU phase encoding latency benchmark (batch encode 
timing).
 
 ## Quick Start
 
@@ -32,6 +33,7 @@ To run individual benchmarks after setup:
 uv run --project qdp/qdp-python python 
qdp/qdp-python/benchmark/benchmark_e2e.py
 uv run --project qdp/qdp-python python 
qdp/qdp-python/benchmark/benchmark_latency.py
 uv run --project qdp/qdp-python python 
qdp/qdp-python/benchmark/benchmark_throughput.py
+uv run --project qdp/qdp-python python 
qdp/qdp-python/benchmark/benchmark_phase.py
 ```
 
 This keeps all benchmark dependencies in the unified repo root venv 
(`mahout/.venv`).
diff --git a/qdp/qdp-python/benchmark/benchmark_phase.py 
b/qdp/qdp-python/benchmark/benchmark_phase.py
new file mode 100644
index 000000000..c0a5e9039
--- /dev/null
+++ b/qdp/qdp-python/benchmark/benchmark_phase.py
@@ -0,0 +1,105 @@
+#!/usr/bin/env python3
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""GPU phase encoding latency benchmark.
+
+Measures batch phase encoding on the current build. For before/after 
comparisons,
+checkout the baseline commit, rebuild the extension, run with ``--label 
baseline``,
+then repeat on the optimized branch with ``--label optimized``.
+
+Run from repo root::
+
+    uv run --project qdp/qdp-python python 
qdp/qdp-python/benchmark/benchmark_phase.py
+"""
+
+from __future__ import annotations
+
+import argparse
+
+import torch
+from qumat_qdp import QdpEngine
+
+
+def benchmark_phase(
+    num_qubits: int,
+    num_samples: int,
+    iters: int = 5,
+) -> tuple[float, float]:
+    """Return (total_us, per_sample_us) for batch phase encoding on GPU."""
+    phases = torch.tensor(
+        [[0.1 * (k + 1) for k in range(num_qubits)]] * num_samples,
+        dtype=torch.float64,
+        device="cuda",
+    )
+    engine = QdpEngine(0)
+
+    for _ in range(3):
+        qtensor = engine.encode(phases, num_qubits, "phase")
+        _ = torch.from_dlpack(qtensor)
+    torch.cuda.synchronize()
+
+    start = torch.cuda.Event(enable_timing=True)
+    end = torch.cuda.Event(enable_timing=True)
+
+    start.record()
+    for _ in range(iters):
+        qtensor = engine.encode(phases, num_qubits, "phase")
+        _ = torch.from_dlpack(qtensor)
+    end.record()
+    torch.cuda.synchronize()
+
+    total_ms = start.elapsed_time(end) / iters
+    total_us = total_ms * 1000.0
+    per_sample_us = total_us / num_samples
+    return total_us, per_sample_us
+
+
+def main() -> None:
+    parser = argparse.ArgumentParser(description="GPU phase encoding 
benchmark")
+    parser.add_argument("--qubits", type=int, default=14)
+    parser.add_argument("--batch-size", type=int, default=128)
+    parser.add_argument("--iterations", type=int, default=5)
+    parser.add_argument(
+        "--label",
+        choices=("baseline", "optimized"),
+        default="optimized",
+        help="Tag for the current checkout (baseline or optimized)",
+    )
+    args = parser.parse_args()
+
+    if not torch.cuda.is_available():
+        raise SystemExit("CUDA not available. Cannot benchmark.")
+
+    device_name = torch.cuda.get_device_name(0)
+    total_us, per_sample_us = benchmark_phase(
+        args.qubits, args.batch_size, args.iterations
+    )
+
+    print("=" * 70)
+    print("Phase encoding GPU benchmark")
+    print(f"GPU: {device_name}")
+    print(
+        f"Config: qubits={args.qubits}, batch_size={args.batch_size}, "
+        f"iterations={args.iterations}, label={args.label}"
+    )
+    print("=" * 70)
+    print(f"Total batch time:  {total_us:.1f} us")
+    print(f"Per sample:        {per_sample_us:.2f} us ({per_sample_us / 
1000:.3f} ms)")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/testing/qdp/test_bindings.py b/testing/qdp/test_bindings.py
index b6a2b60f5..eb5d92ce9 100644
--- a/testing/qdp/test_bindings.py
+++ b/testing/qdp/test_bindings.py
@@ -1795,3 +1795,54 @@ def test_phase_encode_shape(data_shape, expected_shape):
     torch_tensor = torch.from_dlpack(qtensor)
 
     assert torch_tensor.shape == expected_shape
+
+
+@requires_qdp
[email protected]
+def test_phase_encode_batch_large():
+    """Test larger batch phase encoding (covers batch launch path)."""
+    pytest.importorskip("torch")
+    from _qdp import QdpEngine
+
+    if not torch.cuda.is_available():
+        pytest.skip("GPU required for QdpEngine")
+
+    engine = QdpEngine(0)
+    batch_size = 128
+    n = 8
+    data = torch.rand((batch_size, n), dtype=torch.float64) * 5.9 + 0.1
+    qtensor = engine.encode(data, n, "phase")
+    torch_tensor = torch.from_dlpack(qtensor)
+
+    assert torch_tensor.shape == (batch_size, 1 << n)
+    for i in range(batch_size):
+        sample = torch_tensor[i]
+        norm_sq = torch.sum(torch.abs(sample) ** 2).item()
+        assert abs(norm_sq - 1.0) < 1e-8, f"Sample {i} norm {norm_sq}"
+        assert bool(torch.all(torch.isfinite(sample))), (
+            f"Non-finite values in sample {i}"
+        )
+
+
+@requires_qdp
[email protected]
+def test_phase_encode_large_n_correctness():
+    """Test phase encoding at N=14 for correctness on large state vectors."""
+    pytest.importorskip("torch")
+    from _qdp import QdpEngine
+
+    if not torch.cuda.is_available():
+        pytest.skip("GPU required for QdpEngine")
+
+    engine = QdpEngine(0)
+    n = 14
+    phases = [0.1 * (k + 1) for k in range(n)]
+    qtensor = engine.encode(phases, n, "phase")
+    torch_tensor = torch.from_dlpack(qtensor)
+
+    assert torch_tensor.shape == (1, 1 << n)
+    norm_sq = torch.sum(torch.abs(torch_tensor) ** 2).item()
+    assert abs(norm_sq - 1.0) < 1e-9, f"Expected unit norm at N=14, got 
{norm_sq}"
+    assert bool(torch.all(torch.isfinite(torch_tensor))), (
+        "Non-finite values in large-N phase output"
+    )

Reply via email to