Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
21 changes: 19 additions & 2 deletions cuda_bindings/cuda/bindings/_test_helpers/arch_check.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,12 +2,18 @@
# SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE


import os
from contextlib import contextmanager
from functools import cache

import pytest

from cuda.bindings import nvml
from cuda.pathfinder import DynamicLibNotFoundError


def _running_in_ci() -> bool:
return os.environ.get("CI") is not None


@cache
Expand All @@ -16,16 +22,27 @@ def hardware_supports_nvml():
Tries to call the simplest NVML API possible to see if just the basics
works. If not we are probably on one of the platforms where NVML is not
supported at all (e.g. Jetson Orin).

Runtime-load/init failures are treated as "unsupported" on local/dev
machines so NVML test modules skip cleanly. In CI we re-raise those
failures to avoid masking real infrastructure regressions.
"""
nvml.init_v2()
initialized = False
try:
nvml.init_v2()
initialized = True
nvml.system_get_driver_branch()
except (nvml.NotSupportedError, nvml.UnknownError):
return False
except (DynamicLibNotFoundError, nvml.NvmlError):
if _running_in_ci():
raise
return False
else:
return True
finally:
nvml.shutdown()
if initialized:
nvml.shutdown()


@contextmanager
Expand Down
7 changes: 7 additions & 0 deletions cuda_bindings/tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
import pytest

import cuda.bindings.driver as cuda
from cuda.bindings._test_helpers.arch_check import hardware_supports_nvml

# Import shared test helpers for tests across subprojects.
# PLEASE KEEP IN SYNC with copies in other conftest.py in this repo.
Expand Down Expand Up @@ -46,3 +47,9 @@ def ctx(device):
yield ctx
(err,) = cuda.cuCtxDestroy(ctx)
assert err == cuda.CUresult.CUDA_SUCCESS


@pytest.fixture(scope="session")
def require_nvml_runtime_or_skip_local():
    """Session-scoped gate: skip the requesting test when the NVML probe fails."""
    nvml_ok = hardware_supports_nvml()
    if not nvml_ok:
        pytest.skip("NVML runtime is unavailable on this system")
5 changes: 4 additions & 1 deletion cuda_bindings/tests/nvml/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,8 @@
from cuda.bindings import nvml
from cuda.bindings._test_helpers.arch_check import unsupported_before # noqa: F401

pytestmark = pytest.mark.usefixtures("require_nvml_runtime_or_skip_local")


class NVMLInitializer:
def __init__(self):
Expand All @@ -27,7 +29,8 @@ def nvml_init():


@pytest.fixture(scope="session", autouse=True)
def device_info():
def device_info(request):
request.getfixturevalue("require_nvml_runtime_or_skip_local")
dev_count = None
bus_id_to_board_details = {}

Expand Down
11 changes: 9 additions & 2 deletions cuda_bindings/tests/nvml/test_device.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,13 @@ def cuda_version_less_than(target):
return get_cuda_version() < target


@pytest.fixture(scope="module")
def require_cuda_13_1_or_skip(request):
    """Skip unless NVML is usable and the driver reports CUDA 13.1 or newer."""
    # Resolve the NVML gate first: it may itself skip the test.
    request.getfixturevalue("require_nvml_runtime_or_skip_local")
    too_old = cuda_version_less_than(13010)
    if too_old:
        pytest.skip("Introduced in 13.1")


def test_device_capabilities(all_devices):
for device in all_devices:
capabilities = nvml.device_get_capabilities(device)
Expand Down Expand Up @@ -94,7 +101,7 @@ def test_device_get_performance_modes(all_devices):
assert isinstance(modes, str)


@pytest.mark.skipif(cuda_version_less_than(13010), reason="Introduced in 13.1")
@pytest.mark.usefixtures("require_cuda_13_1_or_skip")
def test_device_get_unrepairable_memory_flag(all_devices):
for device in all_devices:
with unsupported_before(device, None):
Expand All @@ -109,7 +116,7 @@ def test_device_vgpu_get_heterogeneous_mode(all_devices):
assert isinstance(mode, int)


@pytest.mark.skipif(cuda_version_less_than(13010), reason="Introduced in 13.1")
@pytest.mark.usefixtures("require_cuda_13_1_or_skip")
def test_read_prm_counters(all_devices):
for device in all_devices:
counters = nvml.PRMCounter_v1(5)
Expand Down
91 changes: 91 additions & 0 deletions cuda_bindings/tests/test_arch_check.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,91 @@
# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE


import pytest

from cuda.bindings import nvml
from cuda.bindings._test_helpers import arch_check
from cuda.pathfinder import DynamicLibNotFoundError


def _raise(exc):
def _inner():
raise exc

return _inner


def _make_not_supported_error():
    """Construct an nvml.NotSupportedError instance, bypassing its __init__."""
    instance = nvml.NotSupportedError.__new__(nvml.NotSupportedError)
    Exception.__init__(instance, "Not supported")
    return instance


def _make_lib_rm_version_mismatch_error():
    """Construct an nvml.LibRmVersionMismatchError instance, bypassing its __init__."""
    instance = nvml.LibRmVersionMismatchError.__new__(nvml.LibRmVersionMismatchError)
    Exception.__init__(instance, "Driver/library version mismatch")
    return instance


def _make_dynamic_lib_not_found_error():
    """Build the loader error raised when the NVML shared library cannot be found."""
    message = "Failure finding libnvml.so"
    return DynamicLibNotFoundError(message)


@pytest.fixture(autouse=True)
def clear_hardware_supports_nvml_cache():
    """Reset the memoized NVML probe so every test in this module re-evaluates it."""
    reset = arch_check.hardware_supports_nvml.cache_clear
    reset()
    yield
    reset()


def test_hardware_supports_nvml_returns_true_when_probe_succeeds(monkeypatch):
    """A clean probe reports True and pairs shutdown with init."""
    events = []

    monkeypatch.setattr(arch_check.nvml, "init_v2", lambda: events.append("init"))
    monkeypatch.setattr(arch_check.nvml, "system_get_driver_branch", lambda: "560")
    monkeypatch.setattr(arch_check.nvml, "shutdown", lambda: events.append("shutdown"))

    result = arch_check.hardware_supports_nvml()

    assert result is True
    assert events == ["init", "shutdown"]


def test_hardware_supports_nvml_returns_false_for_not_supported(monkeypatch):
    """NotSupportedError from the probe call maps to False, with shutdown still run."""
    events = []

    monkeypatch.setattr(arch_check.nvml, "init_v2", lambda: events.append("init"))
    monkeypatch.setattr(arch_check.nvml, "system_get_driver_branch", _raise(_make_not_supported_error()))
    monkeypatch.setattr(arch_check.nvml, "shutdown", lambda: events.append("shutdown"))

    result = arch_check.hardware_supports_nvml()

    assert result is False
    assert events == ["init", "shutdown"]


@pytest.mark.parametrize(
    "error_factory",
    [
        _make_lib_rm_version_mismatch_error,
        _make_dynamic_lib_not_found_error,
    ],
)
def test_hardware_supports_nvml_runtime_errors_skip_locally(monkeypatch, error_factory):
    """Outside CI, runtime-load failures are treated as 'unsupported' (False)."""
    monkeypatch.delenv("CI", raising=False)
    monkeypatch.setattr(arch_check.nvml, "init_v2", _raise(error_factory()))

    result = arch_check.hardware_supports_nvml()

    assert result is False


@pytest.mark.parametrize(
    "error_factory",
    [
        _make_lib_rm_version_mismatch_error,
        _make_dynamic_lib_not_found_error,
    ],
)
def test_hardware_supports_nvml_runtime_errors_fail_in_ci(monkeypatch, error_factory):
    """Inside CI, runtime-load failures propagate instead of being masked as skips."""
    error = error_factory()
    monkeypatch.setenv("CI", "1")
    monkeypatch.setattr(arch_check.nvml, "init_v2", _raise(error))

    with pytest.raises(type(error)):
        arch_check.hardware_supports_nvml()
1 change: 1 addition & 0 deletions cuda_bindings/tests/test_cuda.py
Original file line number Diff line number Diff line change
Expand Up @@ -552,6 +552,7 @@ def test_get_error_name_and_string():
assert s == b"CUDA_ERROR_INVALID_DEVICE"


@pytest.mark.usefixtures("require_nvml_runtime_or_skip_local")
@pytest.mark.skipif(not callableBinary("nvidia-smi"), reason="Binary existence needed")
def test_device_get_name(device):
# TODO: Refactor this test once we have nvml bindings to avoid the use of subprocess
Expand Down
18 changes: 16 additions & 2 deletions cuda_core/tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@
PinnedMemoryResourceOptions,
_device,
)
from cuda.core import system as core_system
from cuda.core._utils.cuda_utils import handle_return

# Import shared test helpers for tests across subprojects.
Expand Down Expand Up @@ -59,6 +60,17 @@ def skip_if_managed_memory_unsupported(device):
pytest.skip("ManagedMemoryResource requires CUDA 13.0 or later")


@pytest.fixture(scope="session")
def require_nvml_runtime_or_skip_local():
    """Skip when cuda.bindings lacks NVML support or the runtime probe fails."""
    compatible = core_system.CUDA_BINDINGS_NVML_IS_COMPATIBLE
    if not compatible:
        pytest.skip("NVML support requires cuda.bindings version 12.9.6+ or 13.1.2+")

    # Deferred import: only valid once the compatibility check above has passed.
    from cuda.bindings._test_helpers.arch_check import hardware_supports_nvml

    if not hardware_supports_nvml():
        pytest.skip("NVML runtime is unavailable on this system")


def create_managed_memory_resource_or_skip(*args, **kwargs):
try:
return ManagedMemoryResource(*args, **kwargs)
Expand Down Expand Up @@ -209,13 +221,15 @@ def _mempool_device_impl(num):


@pytest.fixture
def mempool_device_x2():
def mempool_device_x2(request):
request.getfixturevalue("require_nvml_runtime_or_skip_local")
"""Fixture that provides two devices if available, otherwise skips test."""
return _mempool_device_impl(2)


@pytest.fixture
def mempool_device_x3():
def mempool_device_x3(request):
request.getfixturevalue("require_nvml_runtime_or_skip_local")
"""Fixture that provides three devices if available, otherwise skips test."""
return _mempool_device_impl(3)

Expand Down
16 changes: 1 addition & 15 deletions cuda_core/tests/system/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,21 +5,7 @@

import pytest

from cuda.core import system

SHOULD_SKIP_NVML_TESTS = not system.CUDA_BINDINGS_NVML_IS_COMPATIBLE


if system.CUDA_BINDINGS_NVML_IS_COMPATIBLE:
from cuda.bindings._test_helpers.arch_check import hardware_supports_nvml

SHOULD_SKIP_NVML_TESTS |= not hardware_supports_nvml()


skip_if_nvml_unsupported = pytest.mark.skipif(
SHOULD_SKIP_NVML_TESTS,
reason="NVML support requires cuda.bindings version 12.9.6+ or 13.1.2+, and hardware that supports NVML",
)
skip_if_nvml_unsupported = pytest.mark.usefixtures("require_nvml_runtime_or_skip_local")


def unsupported_before(device, expected_device_arch):
Expand Down
2 changes: 1 addition & 1 deletion cuda_core/tests/system/test_system_device.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@

@pytest.fixture(autouse=True, scope="module")
def check_gpu_available():
if not system.CUDA_BINDINGS_NVML_IS_COMPATIBLE or system.get_num_devices() == 0:
if system.get_num_devices() == 0:
pytest.skip("No GPUs available to run device tests", allow_module_level=True)


Expand Down
2 changes: 2 additions & 0 deletions cuda_core/tests/system/test_system_system.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,8 @@

from .conftest import skip_if_nvml_unsupported

pytestmark = skip_if_nvml_unsupported


def test_driver_version():
driver_version = system.get_driver_version()
Expand Down
15 changes: 2 additions & 13 deletions cuda_core/tests/test_device.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,21 +26,9 @@ def cuda_version():
return _py_major_ver, _driver_ver


@pytest.mark.usefixtures("require_nvml_runtime_or_skip_local")
def test_to_system_device(deinit_cuda):
from cuda.core.system import _system

device = Device()

if not _system.CUDA_BINDINGS_NVML_IS_COMPATIBLE:
with pytest.raises(RuntimeError):
device.to_system_device()
pytest.skip("NVML support requires cuda.bindings version 12.9.6+ or 13.1.2+")

from cuda.bindings._test_helpers.arch_check import hardware_supports_nvml

if not hardware_supports_nvml():
pytest.skip("NVML not supported on this platform")

from cuda.core.system import Device as SystemDevice

system_device = device.to_system_device()
Expand Down Expand Up @@ -87,6 +75,7 @@ def test_device_alloc_zero_bytes(deinit_cuda):
assert buffer.device_id == int(device)


@pytest.mark.usefixtures("require_nvml_runtime_or_skip_local")
def test_device_id(deinit_cuda):
for device in Device.get_all_devices():
device.set_current()
Expand Down
4 changes: 4 additions & 0 deletions cuda_core/tests/test_memory.py
Original file line number Diff line number Diff line change
Expand Up @@ -365,6 +365,7 @@ def test_buffer_external_host():


@pytest.mark.parametrize("change_device", [True, False])
@pytest.mark.usefixtures("require_nvml_runtime_or_skip_local")
def test_buffer_external_device(change_device):
n = ccx_system.get_num_devices()
if n < 1:
Expand All @@ -389,6 +390,7 @@ def test_buffer_external_device(change_device):


@pytest.mark.parametrize("change_device", [True, False])
@pytest.mark.usefixtures("require_nvml_runtime_or_skip_local")
def test_buffer_external_pinned_alloc(change_device):
n = ccx_system.get_num_devices()
if n < 1:
Expand All @@ -414,6 +416,7 @@ def test_buffer_external_pinned_alloc(change_device):


@pytest.mark.parametrize("change_device", [True, False])
@pytest.mark.usefixtures("require_nvml_runtime_or_skip_local")
def test_buffer_external_pinned_registered(change_device):
n = ccx_system.get_num_devices()
if n < 1:
Expand Down Expand Up @@ -447,6 +450,7 @@ def test_buffer_external_pinned_registered(change_device):


@pytest.mark.parametrize("change_device", [True, False])
@pytest.mark.usefixtures("require_nvml_runtime_or_skip_local")
def test_buffer_external_managed(change_device):
n = ccx_system.get_num_devices()
if n < 1:
Expand Down
1 change: 1 addition & 0 deletions cuda_core/tests/test_module.py
Original file line number Diff line number Diff line change
Expand Up @@ -131,6 +131,7 @@ def test_get_kernel(init_cuda):
("cluster_scheduling_policy_preference", int),
],
)
@pytest.mark.usefixtures("require_nvml_runtime_or_skip_local")
def test_read_only_kernel_attributes(get_saxpy_kernel_cubin, attr, expected_type):
kernel, _ = get_saxpy_kernel_cubin
method = getattr(kernel.attributes, attr)
Expand Down
3 changes: 2 additions & 1 deletion cuda_core/tests/test_object_protocols.py
Original file line number Diff line number Diff line change
Expand Up @@ -146,7 +146,8 @@ def sample_program_nvvm(init_cuda):


@pytest.fixture
def sample_device_alt(init_cuda):
def sample_device_alt(init_cuda, request):
request.getfixturevalue("require_nvml_runtime_or_skip_local")
"""An alternate Device object (requires multi-GPU)."""
if system.get_num_devices() < 2:
pytest.skip("requires multi-GPU")
Expand Down
Loading