Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
21 changes: 19 additions & 2 deletions cuda_bindings/cuda/bindings/_test_helpers/arch_check.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,12 +2,18 @@
# SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE


import os
from contextlib import contextmanager
from functools import cache

import pytest

from cuda.bindings import nvml
from cuda.pathfinder import DynamicLibNotFoundError


def _running_in_ci() -> bool:
return os.environ.get("CI") is not None


@cache
Expand All @@ -16,16 +22,27 @@ def hardware_supports_nvml():
Tries to call the simplest NVML API possible to see if just the basics
works. If not we are probably on one of the platforms where NVML is not
supported at all (e.g. Jetson Orin).

Runtime-load/init failures are treated as "unsupported" on local/dev
machines so NVML test modules skip cleanly. In CI we re-raise those
failures to avoid masking real infrastructure regressions.
"""
nvml.init_v2()
initialized = False
try:
nvml.init_v2()
initialized = True
nvml.system_get_driver_branch()
except (nvml.NotSupportedError, nvml.UnknownError):
return False
except (DynamicLibNotFoundError, nvml.NvmlError):
if _running_in_ci():
raise
return False
else:
return True
finally:
nvml.shutdown()
if initialized:
nvml.shutdown()


@contextmanager
Expand Down
7 changes: 7 additions & 0 deletions cuda_bindings/tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
import pytest

import cuda.bindings.driver as cuda
from cuda.bindings._test_helpers.arch_check import hardware_supports_nvml

# Import shared test helpers for tests across subprojects.
# PLEASE KEEP IN SYNC with copies in other conftest.py in this repo.
Expand Down Expand Up @@ -46,3 +47,9 @@ def ctx(device):
yield ctx
(err,) = cuda.cuCtxDestroy(ctx)
assert err == cuda.CUresult.CUDA_SUCCESS


@pytest.fixture(scope="session")
def require_nvml_runtime_or_skip_local():
    """Session-scoped gate: skip the requesting test when the NVML probe fails."""
    nvml_ok = hardware_supports_nvml()
    if not nvml_ok:
        pytest.skip("NVML runtime is unavailable on this system")
5 changes: 4 additions & 1 deletion cuda_bindings/tests/nvml/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,8 @@
from cuda.bindings import nvml
from cuda.bindings._test_helpers.arch_check import unsupported_before # noqa: F401

pytestmark = pytest.mark.usefixtures("require_nvml_runtime_or_skip_local")


class NVMLInitializer:
def __init__(self):
Expand All @@ -27,7 +29,8 @@ def nvml_init():


@pytest.fixture(scope="session", autouse=True)
def device_info():
def device_info(request):
request.getfixturevalue("require_nvml_runtime_or_skip_local")
dev_count = None
bus_id_to_board_details = {}

Expand Down
11 changes: 9 additions & 2 deletions cuda_bindings/tests/nvml/test_device.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,13 @@ def cuda_version_less_than(target):
return get_cuda_version() < target


@pytest.fixture(scope="module")
def require_cuda_13_1_or_skip(request):
    """Skip unless NVML is usable and the driver reports CUDA 13.1 or newer."""
    # Resolve the NVML gate first: it may itself skip the test.
    request.getfixturevalue("require_nvml_runtime_or_skip_local")
    too_old = cuda_version_less_than(13010)
    if too_old:
        pytest.skip("Introduced in 13.1")


def test_device_capabilities(all_devices):
for device in all_devices:
capabilities = nvml.device_get_capabilities(device)
Expand Down Expand Up @@ -94,7 +101,7 @@ def test_device_get_performance_modes(all_devices):
assert isinstance(modes, str)


@pytest.mark.skipif(cuda_version_less_than(13010), reason="Introduced in 13.1")
@pytest.mark.usefixtures("require_cuda_13_1_or_skip")
def test_device_get_unrepairable_memory_flag(all_devices):
for device in all_devices:
with unsupported_before(device, None):
Expand All @@ -109,7 +116,7 @@ def test_device_vgpu_get_heterogeneous_mode(all_devices):
assert isinstance(mode, int)


@pytest.mark.skipif(cuda_version_less_than(13010), reason="Introduced in 13.1")
@pytest.mark.usefixtures("require_cuda_13_1_or_skip")
def test_read_prm_counters(all_devices):
for device in all_devices:
counters = nvml.PRMCounter_v1(5)
Expand Down
91 changes: 91 additions & 0 deletions cuda_bindings/tests/test_arch_check.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,91 @@
# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE


import pytest

from cuda.bindings import nvml
from cuda.bindings._test_helpers import arch_check
from cuda.pathfinder import DynamicLibNotFoundError


def _raise(exc):
def _inner():
raise exc

return _inner


def _make_not_supported_error():
    """Construct an nvml.NotSupportedError instance, bypassing its __init__."""
    instance = nvml.NotSupportedError.__new__(nvml.NotSupportedError)
    Exception.__init__(instance, "Not supported")
    return instance


def _make_lib_rm_version_mismatch_error():
    """Construct an nvml.LibRmVersionMismatchError instance, bypassing its __init__."""
    instance = nvml.LibRmVersionMismatchError.__new__(nvml.LibRmVersionMismatchError)
    Exception.__init__(instance, "Driver/library version mismatch")
    return instance


def _make_dynamic_lib_not_found_error():
    """Build the loader error raised when the NVML shared library cannot be found."""
    message = "Failure finding libnvml.so"
    return DynamicLibNotFoundError(message)


@pytest.fixture(autouse=True)
def clear_hardware_supports_nvml_cache():
    """Reset the memoized NVML probe so every test in this module re-evaluates it."""
    reset = arch_check.hardware_supports_nvml.cache_clear
    reset()
    yield
    reset()


def test_hardware_supports_nvml_returns_true_when_probe_succeeds(monkeypatch):
    """A clean probe reports True and pairs shutdown with init."""
    events = []

    monkeypatch.setattr(arch_check.nvml, "init_v2", lambda: events.append("init"))
    monkeypatch.setattr(arch_check.nvml, "system_get_driver_branch", lambda: "560")
    monkeypatch.setattr(arch_check.nvml, "shutdown", lambda: events.append("shutdown"))

    result = arch_check.hardware_supports_nvml()

    assert result is True
    assert events == ["init", "shutdown"]


def test_hardware_supports_nvml_returns_false_for_not_supported(monkeypatch):
    """NotSupportedError from the probe call maps to False, with shutdown still run."""
    events = []

    monkeypatch.setattr(arch_check.nvml, "init_v2", lambda: events.append("init"))
    monkeypatch.setattr(arch_check.nvml, "system_get_driver_branch", _raise(_make_not_supported_error()))
    monkeypatch.setattr(arch_check.nvml, "shutdown", lambda: events.append("shutdown"))

    result = arch_check.hardware_supports_nvml()

    assert result is False
    assert events == ["init", "shutdown"]


@pytest.mark.parametrize(
    "error_factory",
    [
        _make_lib_rm_version_mismatch_error,
        _make_dynamic_lib_not_found_error,
    ],
)
def test_hardware_supports_nvml_runtime_errors_skip_locally(monkeypatch, error_factory):
    """Outside CI, runtime-load failures are treated as 'unsupported' (False)."""
    monkeypatch.delenv("CI", raising=False)
    monkeypatch.setattr(arch_check.nvml, "init_v2", _raise(error_factory()))

    result = arch_check.hardware_supports_nvml()

    assert result is False


@pytest.mark.parametrize(
    "error_factory",
    [
        _make_lib_rm_version_mismatch_error,
        _make_dynamic_lib_not_found_error,
    ],
)
def test_hardware_supports_nvml_runtime_errors_fail_in_ci(monkeypatch, error_factory):
    """Inside CI, runtime-load failures propagate instead of being masked as skips."""
    error = error_factory()
    monkeypatch.setenv("CI", "1")
    monkeypatch.setattr(arch_check.nvml, "init_v2", _raise(error))

    with pytest.raises(type(error)):
        arch_check.hardware_supports_nvml()
1 change: 1 addition & 0 deletions cuda_bindings/tests/test_cuda.py
Original file line number Diff line number Diff line change
Expand Up @@ -552,6 +552,7 @@ def test_get_error_name_and_string():
assert s == b"CUDA_ERROR_INVALID_DEVICE"


@pytest.mark.usefixtures("require_nvml_runtime_or_skip_local")
@pytest.mark.skipif(not callableBinary("nvidia-smi"), reason="Binary existence needed")
def test_device_get_name(device):
# TODO: Refactor this test once we have nvml bindings to avoid the use of subprocess
Expand Down
18 changes: 16 additions & 2 deletions cuda_core/tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@
PinnedMemoryResourceOptions,
_device,
)
from cuda.core import system as core_system
from cuda.core._utils.cuda_utils import handle_return

# Import shared test helpers for tests across subprojects.
Expand Down Expand Up @@ -59,6 +60,17 @@ def skip_if_managed_memory_unsupported(device):
pytest.skip("ManagedMemoryResource requires CUDA 13.0 or later")


@pytest.fixture(scope="session")
def require_nvml_runtime_or_skip_local():
    """Skip when cuda.bindings lacks NVML support or the runtime probe fails."""
    compatible = core_system.CUDA_BINDINGS_NVML_IS_COMPATIBLE
    if not compatible:
        pytest.skip("NVML support requires cuda.bindings version 12.9.6+ or 13.1.2+")

    # Deferred import: only valid once the compatibility check above has passed.
    from cuda.bindings._test_helpers.arch_check import hardware_supports_nvml

    if not hardware_supports_nvml():
        pytest.skip("NVML runtime is unavailable on this system")


def create_managed_memory_resource_or_skip(*args, **kwargs):
try:
return ManagedMemoryResource(*args, **kwargs)
Expand Down Expand Up @@ -209,13 +221,15 @@ def _mempool_device_impl(num):


@pytest.fixture
def mempool_device_x2():
def mempool_device_x2(request):
request.getfixturevalue("require_nvml_runtime_or_skip_local")
"""Fixture that provides two devices if available, otherwise skips test."""
return _mempool_device_impl(2)


@pytest.fixture
def mempool_device_x3():
def mempool_device_x3(request):
request.getfixturevalue("require_nvml_runtime_or_skip_local")
"""Fixture that provides three devices if available, otherwise skips test."""
return _mempool_device_impl(3)

Expand Down
16 changes: 1 addition & 15 deletions cuda_core/tests/system/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,21 +5,7 @@

import pytest

from cuda.core import system

SHOULD_SKIP_NVML_TESTS = not system.CUDA_BINDINGS_NVML_IS_COMPATIBLE


if system.CUDA_BINDINGS_NVML_IS_COMPATIBLE:
from cuda.bindings._test_helpers.arch_check import hardware_supports_nvml

SHOULD_SKIP_NVML_TESTS |= not hardware_supports_nvml()


skip_if_nvml_unsupported = pytest.mark.skipif(
SHOULD_SKIP_NVML_TESTS,
reason="NVML support requires cuda.bindings version 12.9.6+ or 13.1.2+, and hardware that supports NVML",
)
skip_if_nvml_unsupported = pytest.mark.usefixtures("require_nvml_runtime_or_skip_local")


def unsupported_before(device, expected_device_arch):
Expand Down
2 changes: 1 addition & 1 deletion cuda_core/tests/system/test_system_device.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@

@pytest.fixture(autouse=True, scope="module")
def check_gpu_available():
if not system.CUDA_BINDINGS_NVML_IS_COMPATIBLE or system.get_num_devices() == 0:
if system.get_num_devices() == 0:
pytest.skip("No GPUs available to run device tests", allow_module_level=True)


Expand Down
2 changes: 2 additions & 0 deletions cuda_core/tests/system/test_system_system.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,8 @@

from .conftest import skip_if_nvml_unsupported

pytestmark = skip_if_nvml_unsupported


def test_driver_version():
driver_version = system.get_driver_version()
Expand Down
15 changes: 2 additions & 13 deletions cuda_core/tests/test_device.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,21 +26,9 @@ def cuda_version():
return _py_major_ver, _driver_ver


@pytest.mark.usefixtures("require_nvml_runtime_or_skip_local")
def test_to_system_device(deinit_cuda):
from cuda.core.system import _system

device = Device()

if not _system.CUDA_BINDINGS_NVML_IS_COMPATIBLE:
with pytest.raises(RuntimeError):
device.to_system_device()
pytest.skip("NVML support requires cuda.bindings version 12.9.6+ or 13.1.2+")

from cuda.bindings._test_helpers.arch_check import hardware_supports_nvml

if not hardware_supports_nvml():
pytest.skip("NVML not supported on this platform")

from cuda.core.system import Device as SystemDevice

system_device = device.to_system_device()
Expand Down Expand Up @@ -87,6 +75,7 @@ def test_device_alloc_zero_bytes(deinit_cuda):
assert buffer.device_id == int(device)


@pytest.mark.usefixtures("require_nvml_runtime_or_skip_local")
def test_device_id(deinit_cuda):
for device in Device.get_all_devices():
device.set_current()
Expand Down
4 changes: 4 additions & 0 deletions cuda_core/tests/test_memory.py
Original file line number Diff line number Diff line change
Expand Up @@ -365,6 +365,7 @@ def test_buffer_external_host():


@pytest.mark.parametrize("change_device", [True, False])
@pytest.mark.usefixtures("require_nvml_runtime_or_skip_local")
def test_buffer_external_device(change_device):
n = ccx_system.get_num_devices()
if n < 1:
Expand All @@ -389,6 +390,7 @@ def test_buffer_external_device(change_device):


@pytest.mark.parametrize("change_device", [True, False])
@pytest.mark.usefixtures("require_nvml_runtime_or_skip_local")
def test_buffer_external_pinned_alloc(change_device):
n = ccx_system.get_num_devices()
if n < 1:
Expand All @@ -414,6 +416,7 @@ def test_buffer_external_pinned_alloc(change_device):


@pytest.mark.parametrize("change_device", [True, False])
@pytest.mark.usefixtures("require_nvml_runtime_or_skip_local")
def test_buffer_external_pinned_registered(change_device):
n = ccx_system.get_num_devices()
if n < 1:
Expand Down Expand Up @@ -447,6 +450,7 @@ def test_buffer_external_pinned_registered(change_device):


@pytest.mark.parametrize("change_device", [True, False])
@pytest.mark.usefixtures("require_nvml_runtime_or_skip_local")
def test_buffer_external_managed(change_device):
n = ccx_system.get_num_devices()
if n < 1:
Expand Down
1 change: 1 addition & 0 deletions cuda_core/tests/test_module.py
Original file line number Diff line number Diff line change
Expand Up @@ -131,6 +131,7 @@ def test_get_kernel(init_cuda):
("cluster_scheduling_policy_preference", int),
],
)
@pytest.mark.usefixtures("require_nvml_runtime_or_skip_local")
def test_read_only_kernel_attributes(get_saxpy_kernel_cubin, attr, expected_type):
kernel, _ = get_saxpy_kernel_cubin
method = getattr(kernel.attributes, attr)
Expand Down
3 changes: 2 additions & 1 deletion cuda_core/tests/test_object_protocols.py
Original file line number Diff line number Diff line change
Expand Up @@ -146,7 +146,8 @@ def sample_program_nvvm(init_cuda):


@pytest.fixture
def sample_device_alt(init_cuda):
def sample_device_alt(init_cuda, request):
request.getfixturevalue("require_nvml_runtime_or_skip_local")
"""An alternate Device object (requires multi-GPU)."""
if system.get_num_devices() < 2:
pytest.skip("requires multi-GPU")
Expand Down
Loading