import pytest
import os
pynvml = pytest.importorskip("pynvml")
from distributed.diagnostics import nvml
from distributed.utils_test import gen_cluster
def test_one_time():
    """Sanity-check the one-shot NVML diagnostics payload.

    ``nvml.one_time()`` should return a mapping that includes at least the
    total device memory and a non-empty device name.
    """
    result = nvml.one_time()
    assert "memory-total" in result
    assert "name" in result
    assert len(result["name"]) > 0
def test_1_visible_devices():
    """With one visible device, ``one_time()`` must report that device's memory.

    The total reported by ``nvml.one_time()`` is compared against a direct
    NVML query on the handle resolved by ``nvml._pynvml_handles()``.

    The original version set ``CUDA_VISIBLE_DEVICES`` and never restored it,
    leaking the mutation into every later test in the session; we save and
    restore the variable in a ``finally`` block.
    """
    saved = os.environ.get("CUDA_VISIBLE_DEVICES")
    os.environ["CUDA_VISIBLE_DEVICES"] = "0"
    try:
        output = nvml.one_time()
        h = nvml._pynvml_handles()
        assert output["memory-total"] == pynvml.nvmlDeviceGetMemoryInfo(h).total
    finally:
        # Undo the environment mutation so other tests see the original value.
        if saved is None:
            del os.environ["CUDA_VISIBLE_DEVICES"]
        else:
            os.environ["CUDA_VISIBLE_DEVICES"] = saved
@pytest.mark.parametrize("CVD", ["1,0", "0,1"])
def test_2_visible_devices(CVD):
    """``_pynvml_handles()`` must honor the ordering in ``CUDA_VISIBLE_DEVICES``.

    For each visible-device ordering, the handle returned by the project helper
    must identify the same physical GPU (same serial number) as the first index
    listed in ``CUDA_VISIBLE_DEVICES``.

    Skips on single-GPU machines. The original version mutated
    ``CUDA_VISIBLE_DEVICES`` without restoring it — since this test is
    parametrized, the "1,0" run polluted the environment for the "0,1" run and
    for all later tests; we now save/restore the variable in a ``finally``.
    """
    if pynvml.nvmlDeviceGetCount() <= 1:
        pytest.skip("Machine only has a single GPU")

    saved = os.environ.get("CUDA_VISIBLE_DEVICES")
    os.environ["CUDA_VISIBLE_DEVICES"] = CVD
    try:
        # The first entry in CUDA_VISIBLE_DEVICES is the device the helper
        # should resolve to.
        idx = int(CVD.split(",")[0])
        h = nvml._pynvml_handles()
        h2 = pynvml.nvmlDeviceGetHandleByIndex(idx)
        # Serial numbers uniquely identify physical devices.
        s = pynvml.nvmlDeviceGetSerial(h)
        s2 = pynvml.nvmlDeviceGetSerial(h2)
        assert s == s2
    finally:
        # Undo the environment mutation so other tests see the original value.
        if saved is None:
            del os.environ["CUDA_VISIBLE_DEVICES"]
        else:
            os.environ["CUDA_VISIBLE_DEVICES"] = saved
@gen_cluster()
async def test_gpu_metrics(s, a, b):
    """Workers should publish NVML GPU metrics and info to the scheduler.

    Checks that worker ``a`` exposes a "gpu" metrics entry and "gpu" startup
    information, and that the values the scheduler holds for it match direct
    NVML queries (memory used, device name).
    """
    handle = nvml._pynvml_handles()
    worker_state = s.workers[a.address]

    # Live metrics: memory usage reported to the scheduler matches NVML.
    assert "gpu" in a.metrics
    used = pynvml.nvmlDeviceGetMemoryInfo(handle).used
    assert worker_state.metrics["gpu"]["memory-used"] == used

    # Startup info: the device name recorded at registration matches NVML.
    assert "gpu" in a.startup_information
    name = pynvml.nvmlDeviceGetName(handle).decode()
    assert worker_state.extra["gpu"]["name"] == name