File: nvtx.py

package info (click to toggle)
nvidia-cuda-toolkit 12.4.1-3
  • links: PTS, VCS
  • area: non-free
  • in suites: forky, sid
  • size: 18,505,836 kB
  • sloc: ansic: 203,477; cpp: 64,769; python: 34,699; javascript: 22,006; xml: 13,410; makefile: 3,085; sh: 2,343; perl: 352
file content (125 lines) | stat: -rw-r--r-- 4,621 bytes parent folder | download | duplicates (6)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
# SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: LicenseRef-NvidiaProprietary
#
# NVIDIA CORPORATION, its affiliates and licensors retain all intellectual
# property and proprietary rights in and to this material, related
# documentation and any modifications thereto. Any use, reproduction,
# disclosure or distribution of this material and related documentation
# without an express license agreement from NVIDIA CORPORATION or
# its affiliates is strictly prohibited.

import pandas as pd

from collections import defaultdict, OrderedDict


def _compute_gpu_projection_df(nvtx_df, cuda_df, cuda_nvtx_index_map):
    # Each NVTX index will be associated with the minimum start time and the
    # maximum end time of the CUDA operations that the corresponsing NVTX range
    # encloses.
    nvtx_gpu_start_dict = OrderedDict()
    nvtx_gpu_end_dict = OrderedDict()

    for cuda_row in cuda_df.itertuples():
        if cuda_row.Index not in cuda_nvtx_index_map:
            continue

        nvtx_indices = cuda_nvtx_index_map[cuda_row.Index]
        for nvtx_index in nvtx_indices:
            if nvtx_index not in nvtx_gpu_start_dict:
                nvtx_gpu_start_dict[nvtx_index] = cuda_row.gpu_start
                nvtx_gpu_end_dict[nvtx_index] = cuda_row.gpu_end
                continue
            if cuda_row.gpu_start < nvtx_gpu_start_dict[nvtx_index]:
                nvtx_gpu_start_dict[nvtx_index] = cuda_row.gpu_start
            if cuda_row.gpu_end > nvtx_gpu_end_dict[nvtx_index]:
                nvtx_gpu_end_dict[nvtx_index] = cuda_row.gpu_end

    return pd.DataFrame(
        {
            "text": nvtx_df.loc[nvtx_gpu_end_dict.keys(), "text"],
            "start": nvtx_gpu_start_dict,
            "end": nvtx_gpu_end_dict,
        }
    )


def _find_cuda_nvtx_ranges(nvtx_df, cuda_df):
    # Each CUDA index will be associated with a set of indices of NVTX ranges
    # that enclose the corresponding CUDA operation.
    cuda_nvtx_index_map = defaultdict(set)

    cuda_time_df = pd.DataFrame(
        data={"start": cuda_df["start"], "end": cuda_df["end"]}
    ).sort_values("start")
    nvtx_start_df = pd.DataFrame(data={"time": nvtx_df["start"]}).sort_values("time")
    nvtx_end_df = pd.DataFrame(data={"time": nvtx_df["end"]}).sort_values("time")

    cuda_iter = iter(cuda_time_df.itertuples())
    nvtx_start_iter = iter(nvtx_start_df.itertuples())
    nvtx_end_iter = iter(nvtx_end_df.itertuples())

    cuda_row = next(cuda_iter)
    nvtx_start_row = next(nvtx_start_iter)
    nvtx_end_row = next(nvtx_end_iter)

    nvtx_active_indices = set()

    while True:
        if (
            nvtx_start_row is not None
            and nvtx_start_row.time <= nvtx_end_row.time
            and nvtx_start_row.time <= cuda_row.start
        ):
            nvtx_active_indices.add(nvtx_start_row.Index)

            try:
                nvtx_start_row = next(nvtx_start_iter)
            except StopIteration:
                nvtx_start_row = None
        elif nvtx_end_row.time <= cuda_row.start or nvtx_end_row.time <= cuda_row.end:
            nvtx_active_indices.remove(nvtx_end_row.Index)

            try:
                nvtx_end_row = next(nvtx_end_iter)
            except StopIteration:
                break
        else:
            if nvtx_active_indices:
                cuda_nvtx_index_map[cuda_row.Index] = nvtx_active_indices.copy()

            try:
                cuda_row = next(cuda_iter)
            except StopIteration:
                break

    return dict(cuda_nvtx_index_map)


def project_nvtx_onto_gpu(nvtx_df, cuda_df):
    """Project the NVTX ranges from the CPU onto the GPU.

    The projected range will have the start timestamp of the first enclosed GPU
    operation and the end timestamp of the last enclosed GPU operation.
    """
    # Keep only complete ranges that start and end on the same thread.
    complete_mask = (
        nvtx_df["start"].notnull()
        & nvtx_df["end"].notnull()
        & nvtx_df["endGlobalTid"].isnull()
    )
    valid_nvtx_df = nvtx_df[complete_mask]

    cuda_nvtx_index_map = {}
    grouped_cuda = cuda_df.groupby("globalTid")

    # Match NVTX ranges with CUDA operations thread by thread.
    for tid, tid_nvtx_df in valid_nvtx_df.groupby("globalTid"):
        try:
            tid_cuda_df = grouped_cuda.get_group(tid)
        except KeyError:
            # No CUDA activity launched from this thread.
            continue

        cuda_nvtx_index_map.update(_find_cuda_nvtx_ranges(tid_nvtx_df, tid_cuda_df))

    return _compute_gpu_projection_df(valid_nvtx_df, cuda_df, cuda_nvtx_index_map)