File: nvml.py

Package: dask.distributed 2022.12.1+ds.1-3 (Debian bookworm)
from __future__ import annotations

import os
from enum import IntEnum, auto
from platform import uname
from typing import NamedTuple

from packaging.version import parse as parse_version

import dask

try:
    import pynvml
except ImportError:
    pynvml = None


class NVMLState(IntEnum):
    UNINITIALIZED = auto()
    """No attempt yet made to initialize PyNVML"""
    INITIALIZED = auto()
    """PyNVML was successfully initialized"""
    DISABLED_PYNVML_NOT_AVAILABLE = auto()
    """PyNVML not installed"""
    DISABLED_CONFIG = auto()
    """PyNVML diagnostics disabled by ``distributed.diagnostics.nvml`` config setting"""
    DISABLED_LIBRARY_NOT_FOUND = auto()
    """PyNVML available, but NVML not installed"""
    DISABLED_WSL_INSUFFICIENT_DRIVER = auto()
    """PyNVML and NVML available, but on WSL and the driver version is insufficient"""


class CudaDeviceInfo(NamedTuple):
    uuid: bytes | None = None
    device_index: int | None = None
    mig_index: int | None = None


class CudaContext(NamedTuple):
    has_context: bool
    device_info: CudaDeviceInfo | None = None


# Initialisation must occur per-process, so an initialised state is a
# (state, pid) pair
NVML_STATE = (
    NVMLState.DISABLED_PYNVML_NOT_AVAILABLE
    if pynvml is None
    else NVMLState.UNINITIALIZED
)
"""Current initialization state"""

NVML_OWNER_PID = None
"""PID of process that successfully called pynvml.nvmlInit"""

MINIMUM_WSL_VERSION = "512.15"
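"""Minimum NVIDIA driver version with sufficient NVML support under WSL"""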


def is_initialized():
    """Is pynvml initialized on this process?"""
    return NVML_STATE == NVMLState.INITIALIZED and NVML_OWNER_PID == os.getpid()


def _in_wsl():
    """Check if we are in Windows Subsystem for Linux; some PyNVML queries are not supported there.
    Taken from https://www.scivision.dev/python-detect-wsl/
    """
    return "microsoft-standard" in uname().release


def init_once():
    """Idempotent (per-process) initialization of PyNVML

    Notes
    -----

    Modifies global variables NVML_STATE and NVML_OWNER_PID
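
    Examples
    --------
    Illustrative only; repeated calls within the same process are no-ops:

    >>> init_once()  # doctest: +SKIP
    >>> is_initialized()  # doctest: +SKIP
    True
    """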
    global NVML_STATE, NVML_OWNER_PID

    if NVML_STATE in {
        NVMLState.DISABLED_PYNVML_NOT_AVAILABLE,
        NVMLState.DISABLED_CONFIG,
        NVMLState.DISABLED_LIBRARY_NOT_FOUND,
        NVMLState.DISABLED_WSL_INSUFFICIENT_DRIVER,
    }:
        return
    elif NVML_STATE == NVMLState.INITIALIZED and NVML_OWNER_PID == os.getpid():
        return
    elif NVML_STATE == NVMLState.UNINITIALIZED and not dask.config.get(
        "distributed.diagnostics.nvml"
    ):
        NVML_STATE = NVMLState.DISABLED_CONFIG
        return
    elif (
        NVML_STATE == NVMLState.INITIALIZED and NVML_OWNER_PID != os.getpid()
    ) or NVML_STATE == NVMLState.UNINITIALIZED:
        try:
            pynvml.nvmlInit()
        except (
            pynvml.NVMLError_LibraryNotFound,
            pynvml.NVMLError_DriverNotLoaded,
            pynvml.NVMLError_Unknown,
        ):
            NVML_STATE = NVMLState.DISABLED_LIBRARY_NOT_FOUND
            return

        if _in_wsl() and parse_version(
            pynvml.nvmlSystemGetDriverVersion().decode()
        ) < parse_version(MINIMUM_WSL_VERSION):
            NVML_STATE = NVMLState.DISABLED_WSL_INSUFFICIENT_DRIVER
            return
        else:
            from distributed.worker import add_gpu_metrics

            # initialization was successful
            NVML_STATE = NVMLState.INITIALIZED
            NVML_OWNER_PID = os.getpid()
            add_gpu_metrics()
    else:
        raise RuntimeError(
            f"Unhandled initialisation state ({NVML_STATE=}, {NVML_OWNER_PID=})"
        )


def device_get_count():
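    """Return the number of GPUs in the system, or 0 if NVML monitoring is unavailable."""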
    init_once()
    if not is_initialized():
        return 0
    else:
        return pynvml.nvmlDeviceGetCount()


def _pynvml_handles():
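    """Return an NVML handle for the first CUDA device visible to this process.

    Honors ``CUDA_VISIBLE_DEVICES`` when its first entry is an integer index,
    otherwise falls back to device 0. Raises ``RuntimeError`` if NVML
    monitoring is disabled, unavailable, or no GPUs are present.
    """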
    count = device_get_count()
    if NVML_STATE == NVMLState.DISABLED_PYNVML_NOT_AVAILABLE:
        raise RuntimeError("NVML monitoring requires PyNVML and NVML to be installed")
    elif NVML_STATE == NVMLState.DISABLED_LIBRARY_NOT_FOUND:
        raise RuntimeError("PyNVML is installed, but NVML is not")
    elif NVML_STATE == NVMLState.DISABLED_WSL_INSUFFICIENT_DRIVER:
        raise RuntimeError(
            "Outdated NVIDIA drivers for WSL, please upgrade to "
            f"{MINIMUM_WSL_VERSION} or newer"
        )
    elif NVML_STATE == NVMLState.DISABLED_CONFIG:
        raise RuntimeError(
            "PyNVML monitoring disabled by 'distributed.diagnostics.nvml' "
            "config setting"
        )
    elif count == 0:
        raise RuntimeError("No GPUs available")
    else:
        try:
            gpu_idx = next(
                map(int, os.environ.get("CUDA_VISIBLE_DEVICES", "").split(","))
            )
        except ValueError:
            # CUDA_VISIBLE_DEVICES is unset or does not start with an integer
            # index; fall back to the first device
            gpu_idx = 0
        return pynvml.nvmlDeviceGetHandleByIndex(gpu_idx)


def _running_process_matches(handle):
    """Check whether the current process is same as that of handle

    Parameters
    ----------
    handle : pynvml.nvml.LP_struct_c_nvmlDevice_t
        NVML handle to CUDA device

    Returns
    -------
    out : bool
        Whether the device handle has a CUDA context on the running process.
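
    Examples
    --------
    Illustrative only; the result depends on whether this process currently
    holds a compute context on device 0:

    >>> handle = pynvml.nvmlDeviceGetHandleByIndex(0)  # doctest: +SKIP
    >>> _running_process_matches(handle)  # doctest: +SKIP
    False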
    """
    init_once()
    if hasattr(pynvml, "nvmlDeviceGetComputeRunningProcesses_v2"):
        running_processes = pynvml.nvmlDeviceGetComputeRunningProcesses_v2(handle)
    else:
        running_processes = pynvml.nvmlDeviceGetComputeRunningProcesses(handle)
    return any(os.getpid() == proc.pid for proc in running_processes)


def has_cuda_context():
    """Check whether the current process already has a CUDA context created.

    Returns
    -------
    out : CudaContext
        Object containing information as to whether the current process has a CUDA
        context created, and in the positive case containing also information about
        the device the context belongs to.
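
    Examples
    --------
    Illustrative only; the fields depend on whether this process has already
    created a CUDA context and on which device:

    >>> ctx = has_cuda_context()  # doctest: +SKIP
    >>> ctx.has_context  # doctest: +SKIP
    False
    >>> ctx.device_info is None  # doctest: +SKIP
    True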
    """
    init_once()
    if is_initialized():
        for index in range(device_get_count()):
            handle = pynvml.nvmlDeviceGetHandleByIndex(index)
            try:
                mig_current_mode, mig_pending_mode = pynvml.nvmlDeviceGetMigMode(handle)
            except pynvml.NVMLError_NotSupported:
                mig_current_mode = pynvml.NVML_DEVICE_MIG_DISABLE
            if mig_current_mode == pynvml.NVML_DEVICE_MIG_ENABLE:
                for mig_index in range(pynvml.nvmlDeviceGetMaxMigDeviceCount(handle)):
                    try:
                        mig_handle = pynvml.nvmlDeviceGetMigDeviceHandleByIndex(
                            handle, mig_index
                        )
                    except pynvml.NVMLError_NotFound:
                        # No MIG device with that index
                        continue
                    if _running_process_matches(mig_handle):
                        uuid = pynvml.nvmlDeviceGetUUID(mig_handle)
                        return CudaContext(
                            has_context=True,
                            device_info=CudaDeviceInfo(
                                uuid=uuid, device_index=index, mig_index=mig_index
                            ),
                        )
            else:
                if _running_process_matches(handle):
                    uuid = pynvml.nvmlDeviceGetUUID(handle)
                    return CudaContext(
                        has_context=True,
                        device_info=CudaDeviceInfo(uuid=uuid, device_index=index),
                    )
    return CudaContext(has_context=False)


def get_device_index_and_uuid(device):
    """Get both device index and UUID from device index or UUID

    Parameters
    ----------
    device : int, bytes or str
        An ``int`` with the index of a GPU, or ``bytes`` or ``str`` with the UUID
        of a CUDA (either GPU or MIG) device.

    Returns
    -------
    out : CudaDeviceInfo
        Object containing information about the device.

    Examples
    --------
    >>> get_device_index_and_uuid(0)  # doctest: +SKIP
    CudaDeviceInfo(uuid=b'GPU-e1006a74-5836-264f-5c26-53d19d212dfe', device_index=0, mig_index=None)

    >>> get_device_index_and_uuid('GPU-e1006a74-5836-264f-5c26-53d19d212dfe')  # doctest: +SKIP
    CudaDeviceInfo(uuid=b'GPU-e1006a74-5836-264f-5c26-53d19d212dfe', device_index=0, mig_index=None)

    >>> get_device_index_and_uuid('MIG-7feb6df5-eccf-5faa-ab00-9a441867e237')  # doctest: +SKIP
    CudaDeviceInfo(uuid=b'MIG-7feb6df5-eccf-5faa-ab00-9a441867e237', device_index=0, mig_index=None)
    """
    init_once()
    try:
        device_index = int(device)
        device_handle = pynvml.nvmlDeviceGetHandleByIndex(device_index)
        uuid = pynvml.nvmlDeviceGetUUID(device_handle)
    except ValueError:
        uuid = device if isinstance(device, bytes) else bytes(device, "utf-8")

        # Validate UUID, get index and UUID as seen with `nvidia-smi -L`
        uuid_handle = pynvml.nvmlDeviceGetHandleByUUID(uuid)
        device_index = pynvml.nvmlDeviceGetIndex(uuid_handle)
        uuid = pynvml.nvmlDeviceGetUUID(uuid_handle)

    return CudaDeviceInfo(uuid=uuid, device_index=device_index)


def get_device_mig_mode(device):
    """Get MIG mode for a device index or UUID

    Parameters
    ----------
    device: int, bytes or str
        An ``int`` with the index of a GPU, or ``bytes`` or ``str`` with the UUID
        of a CUDA (either GPU or MIG) device.

    Returns
    -------
    out : list
        A ``list`` with two integers ``[current_mode, pending_mode]``.
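
    Examples
    --------
    Illustrative only; ``[0, 0]`` is reported when MIG is disabled or not
    supported on the device:

    >>> get_device_mig_mode(0)  # doctest: +SKIP
    [0, 0]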
    """
    init_once()
    try:
        device_index = int(device)
        handle = pynvml.nvmlDeviceGetHandleByIndex(device_index)
    except ValueError:
        uuid = device if isinstance(device, bytes) else bytes(device, "utf-8")
        handle = pynvml.nvmlDeviceGetHandleByUUID(uuid)
    try:
        return pynvml.nvmlDeviceGetMigMode(handle)
    except pynvml.NVMLError_NotSupported:
        return [0, 0]


def _get_utilization(h):
    try:
        return pynvml.nvmlDeviceGetUtilizationRates(h).gpu
    except pynvml.NVMLError_NotSupported:
        return None


def _get_memory_used(h):
    try:
        return pynvml.nvmlDeviceGetMemoryInfo(h).used
    except pynvml.NVMLError_NotSupported:
        return None


def _get_memory_total(h):
    try:
        return pynvml.nvmlDeviceGetMemoryInfo(h).total
    except pynvml.NVMLError_NotSupported:
        return None


def _get_name(h):
    try:
        return pynvml.nvmlDeviceGetName(h).decode()
    except pynvml.NVMLError_NotSupported:
        return None


def real_time():
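    """Return real-time GPU diagnostics for the first visible device.

    The result is a dict with ``"utilization"`` (GPU utilization, percent)
    and ``"memory-used"`` (bytes); either value may be ``None`` if the query
    is not supported on this device.
    """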
    h = _pynvml_handles()
    return {
        "utilization": _get_utilization(h),
        "memory-used": _get_memory_used(h),
    }


def one_time():
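    """Return one-time GPU diagnostics for the first visible device.

    The result is a dict with ``"memory-total"`` (bytes) and ``"name"`` (the
    device name); either value may be ``None`` if the query is not supported
    on this device.
    """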
    h = _pynvml_handles()
    return {
        "memory-total": _get_memory_total(h),
        "name": _get_name(h),
    }