This is an automated email from the ASF dual-hosted git repository.

andrewmusselman pushed a commit to branch pytorch-gpu-capability-check
in repository https://gitbox.apache.org/repos/asf/mahout.git

commit 2f96816e0d36742146aa96ca15bf355f1e8e537f
Author: Andrew Musselman <[email protected]>
AuthorDate: Sun May 17 08:38:48 2026 -0700

    fix(qdp): fall back to CPU when GPU arch not in PyTorch's compiled list
---
 qdp/qdp-python/qumat_qdp/api.py      | 11 ++------
 qdp/qdp-python/qumat_qdp/loader.py   | 53 ++++++++++++++++++++++++++++++------
 testing/qdp_python/test_torch_ref.py | 32 ++++++++++++++++++++--
 3 files changed, 76 insertions(+), 20 deletions(-)

diff --git a/qdp/qdp-python/qumat_qdp/api.py b/qdp/qdp-python/qumat_qdp/api.py
index 6493dd0f3..e1e8b20e3 100644
--- a/qdp/qdp-python/qumat_qdp/api.py
+++ b/qdp/qdp-python/qumat_qdp/api.py
@@ -189,17 +189,10 @@ class QdpBenchmark:
     def _run_throughput_pytorch(self) -> ThroughputResult:
         import torch
 
+        from qumat_qdp.loader import _select_torch_device
         from qumat_qdp.torch_ref import encode
 
-        if torch.cuda.is_available():
-            if self._device_id < 0 or self._device_id >= torch.cuda.device_count():
-                raise ValueError(
-                    f"Invalid CUDA device_id {self._device_id}; "
-                    f"{torch.cuda.device_count()} device(s) available."
-                )
-            device = f"cuda:{self._device_id}"
-        else:
-            device = "cpu"
+        device = _select_torch_device(torch, self._device_id)
         # _validate() guarantees these are not None.
         assert self._num_qubits is not None
         assert self._total_batches is not None
diff --git a/qdp/qdp-python/qumat_qdp/loader.py b/qdp/qdp-python/qumat_qdp/loader.py
index a3443f1ba..34fae6a1a 100644
--- a/qdp/qdp-python/qumat_qdp/loader.py
+++ b/qdp/qdp-python/qumat_qdp/loader.py
@@ -75,6 +75,49 @@ _BACKEND_AUTO = "auto"
 _VALID_BACKENDS = frozenset({_BACKEND_RUST, _BACKEND_PYTORCH, _BACKEND_AUTO})
 
 
+def _select_torch_device(torch, device_id: int) -> str:
+    """Pick a torch device the current PyTorch build can actually use.
+
+    ``torch.cuda.is_available()`` returns True whenever a usable driver and at
+    least one GPU are present, but does not check whether the GPU's compute
+    capability is in the PyTorch wheel's compiled arch list. Running on an
+    unsupported GPU surfaces as ``cudaErrorNoKernelImageForDevice`` the first
+    time a kernel launches -- a particularly opaque failure for users on
+    Pascal-and-earlier hardware where recent PyTorch wheels no longer ship
+    matching kernels.
+
+    Intersect the device's capability with ``torch.cuda.get_arch_list()`` and
+    fall back to CPU (with a warning) when they don't match. Raises
+    ``ValueError`` on an out-of-range ``device_id`` to preserve the prior
+    contract for callers that explicitly request a specific GPU.
+    """
+    if not torch.cuda.is_available():
+        return "cpu"
+
+    if device_id < 0 or device_id >= torch.cuda.device_count():
+        raise ValueError(
+            f"Invalid CUDA device_id {device_id}; "
+            f"{torch.cuda.device_count()} device(s) available."
+        )
+
+    arch_list = torch.cuda.get_arch_list()
+    if arch_list:
+        major, minor = torch.cuda.get_device_capability(device_id)
+        device_arch = f"sm_{major}{minor}"
+        if device_arch not in arch_list:
+            warnings.warn(
+                f"GPU {device_id} ({torch.cuda.get_device_name(device_id)}, "
+                f"{device_arch}) is not in this PyTorch build's supported "
+                f"arch list ({sorted(arch_list)}). Falling back to CPU. "
+                "Install a PyTorch wheel that targets this GPU, or set "
+                "CUDA_VISIBLE_DEVICES= to silence this warning.",
+                stacklevel=2,
+            )
+            return "cpu"
+
+    return f"cuda:{device_id}"
+
+
 def _path_extension(path: str) -> str:
     """Return the lowercase extension of `path` (handling remote URLs/queries)."""
     is_remote = "://" in path
@@ -478,15 +521,7 @@ class QuantumDataLoader:
 
         from qumat_qdp.torch_ref import encode
 
-        if torch.cuda.is_available():
-            if self._device_id < 0 or self._device_id >= torch.cuda.device_count():
-                raise ValueError(
-                    f"Invalid CUDA device_id {self._device_id}; "
-                    f"{torch.cuda.device_count()} device(s) available."
-                )
-            device = f"cuda:{self._device_id}"
-        else:
-            device = "cpu"
+        device = _select_torch_device(torch, self._device_id)
 
         if use_synthetic:
             return self._pytorch_synthetic_iter(torch, encode, device)
diff --git a/testing/qdp_python/test_torch_ref.py b/testing/qdp_python/test_torch_ref.py
index c6c49883b..0fe3fffd1 100644
--- a/testing/qdp_python/test_torch_ref.py
+++ b/testing/qdp_python/test_torch_ref.py
@@ -36,6 +36,28 @@ from qumat_qdp.torch_ref import (
     iqp_encode,
 )
 
+
+def _torch_cuda_usable(device_id: int = 0) -> bool:
+    """True iff the current PyTorch build can launch kernels on ``device_id``.
+
+    ``torch.cuda.is_available()`` alone is not enough: on GPUs whose compute
+    capability isn't in the wheel's compiled arch list (e.g. Pascal sm_61
+    against a recent wheel that ships sm_70+), it returns True but every
+    kernel launch fails with ``cudaErrorNoKernelImageForDevice``. Mirror
+    ``qumat_qdp.loader._select_torch_device``'s capability check so the
+    GPU-only tests skip cleanly instead of erroring.
+    """
+    if not torch.cuda.is_available():
+        return False
+    if device_id < 0 or device_id >= torch.cuda.device_count():
+        return False
+    arch_list = torch.cuda.get_arch_list()
+    if not arch_list:
+        return True
+    major, minor = torch.cuda.get_device_capability(device_id)
+    return f"sm_{major}{minor}" in arch_list
+
+
 # ---------------------------------------------------------------------------
 # Amplitude encoding
 # ---------------------------------------------------------------------------
@@ -349,7 +371,10 @@ class TestDevicePlacement:
         result = amplitude_encode(data, num_qubits=2, device="cpu")
         assert result.device.type == "cpu"
 
-    @pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available")
+    @pytest.mark.skipif(
+        not _torch_cuda_usable(),
+        reason="CUDA not available or GPU compute capability not supported by this PyTorch build",
+    )
     def test_gpu_output(self):
         data = torch.randn(2, 4, dtype=torch.float64)
         result = amplitude_encode(data, num_qubits=2, device="cuda:0")
@@ -369,7 +394,10 @@ class TestCrossValidation:
         pytest.importorskip("_qdp")
 
     @pytest.mark.gpu
-    @pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available")
+    @pytest.mark.skipif(
+        not _torch_cuda_usable(),
+        reason="CUDA not available or GPU compute capability not supported by this PyTorch build",
+    )
     @pytest.mark.parametrize("encoding", ["amplitude", "angle", "basis", "iqp"])
     def test_encoding_matches_rust(self, encoding):
         import _qdp

Reply via email to