File: run_cuda_memcheck.py

#!/usr/bin/env python3

"""This script runs cuda-memcheck on the specified unit test. Each test case
is run in its isolated process with a timeout so that:
1) different test cases won't influence each other, and
2) in case of hang, the script would still finish in a finite amount of time.
The output will be written to a log file result.log
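Each line of the log records one test outcome: "Success: <test>",
"Fail: <test>" (followed by the captured stdout/stderr), "Timeout: <test>",
or "Ignored: <test>" for errors coming only from cublas/cudnn/cufft.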

Example usage:
    python run_cuda_memcheck.py ../test_torch.py 600

Note that running cuda-memcheck can be very slow.
"""

import argparse
import asyncio
import multiprocessing
import os
import subprocess
import sys

import cuda_memcheck_common as cmc
import tqdm

import torch


ALL_TESTS = []
GPUS = torch.cuda.device_count()

# parse arguments
parser = argparse.ArgumentParser(description="Run isolated cuda-memcheck on unit tests")
parser.add_argument(
    "filename", help="the python file for a test, such as test_torch.py"
)
parser.add_argument(
    "timeout",
    type=int,
    help="kill the test if it does not terminate in a certain amount of seconds",
)
parser.add_argument(
    "--strict",
    action="store_true",
    help="Whether to show cublas/cudnn errors. These errors are ignored by default because"
    "cublas/cudnn does not run error-free under cuda-memcheck, and ignoring these errors",
)
parser.add_argument(
    "--nproc",
    type=int,
    default=multiprocessing.cpu_count(),
    help="Number of processes running tests, default to number of cores in the system",
)
parser.add_argument(
    "--gpus",
    default="all",
    help='GPU assignment for each process: either "all", or a colon-separated list like "1,2:3,4:5,6"',
)
parser.add_argument(
    "--ci",
    action="store_true",
    help="Whether this script is executed in CI. When executed inside a CI, this script fails when "
    "an error is detected. Also, it will not show tqdm progress bar, but directly print the error"
    "to stdout instead.",
)
parser.add_argument("--nohang", action="store_true", help="Treat timeout as success")
parser.add_argument("--split", type=int, default=1, help="Split the job into pieces")
parser.add_argument(
    "--rank", type=int, default=0, help="Which piece this process should pick"
)
args = parser.parse_args()


# Filter that ignores cublas/cudnn errors
# TODO (@zasdfgbnm): When can we remove this? Will cublas/cudnn run error-free under cuda-memcheck?
def is_ignored_only(output):
    try:
        report = cmc.parse(output)
    except cmc.ParseError:
        # If the simple parser fails to parse the cuda-memcheck output,
        # never ignore this error.
        return False
    count_ignored_errors = 0
    for e in report.errors:
        if (
            "libcublas" in "".join(e.stack)
            or "libcudnn" in "".join(e.stack)
            or "libcufft" in "".join(e.stack)
        ):
            count_ignored_errors += 1
    return count_ignored_errors == report.num_errors
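

# For example, a report whose errors all have a stack frame mentioning
# libcublas, libcudnn, or libcufft counts as "ignored only"; a single error
# outside those libraries makes the whole run a real failure.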


# Set environment PYTORCH_CUDA_MEMCHECK=1 to allow skipping some tests
os.environ["PYTORCH_CUDA_MEMCHECK"] = "1"

# Discover tests:
# To get a list of tests, run:
# pytest --setup-only test/test_torch.py
# and then parse the output
proc = subprocess.Popen(
    ["pytest", "--setup-only", args.filename],
    stdout=subprocess.PIPE,
    stderr=subprocess.PIPE,
)
stdout, stderr = proc.communicate()
lines = stdout.decode().strip().splitlines()
for line in lines:
    if "(fixtures used:" in line:
        line = line.strip().split()[0]
        line = line[line.find("::") + 2 :]
        line = line.replace("::", ".")
        ALL_TESTS.append(line)
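
# For example, a line such as
#     test/test_torch.py::TestTorch::test_add (fixtures used: ...)
# yields "TestTorch.test_add": take the first whitespace-separated token,
# drop everything up to and including the first "::", and turn the remaining
# "::" into ".".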


# Do a simple filtering:
# if 'cpu' or 'CPU' is in the name and 'cuda' or 'CUDA' is not in the name, then skip it
def is_cpu_only(name):
    name = name.lower()
    return ("cpu" in name) and "cuda" not in name


ALL_TESTS = [x for x in ALL_TESTS if not is_cpu_only(x)]

# Split all tests into chunks, and keep only the selected chunk
ALL_TESTS.sort()
chunk_size = (len(ALL_TESTS) + args.split - 1) // args.split
start = chunk_size * args.rank
end = chunk_size * (args.rank + 1)
ALL_TESTS = ALL_TESTS[start:end]
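# For example, 10 tests with --split 3 gives chunk_size 4: --rank 0 runs
# tests[0:4], --rank 1 runs tests[4:8], and --rank 2 runs tests[8:12]
# (slicing past the end simply yields the remaining 2 tests).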

# Run tests:
# Since running cuda-memcheck on PyTorch unit tests is very slow, these tests must be run in parallel.
# This is done with asyncio coroutines: a number of coroutines are created; each creates subprocesses
# and awaits them to finish. The number of concurrently running subprocesses can be specified by the
# user and defaults to the number of CPUs in the machine.
# These subprocesses are balanced across the GPUs on the system by assigning one device per process,
# or as specified by the user.
progress = 0
if not args.ci:
    logfile = open("result.log", "w")
    progressbar = tqdm.tqdm(total=len(ALL_TESTS))
else:
    logfile = sys.stdout

    # create a fake progress bar that does not display anything
    class ProgressbarStub:
        def update(self, *args):
            return

    progressbar = ProgressbarStub()


async def run1(coroutine_id):
    global progress

    if args.gpus == "all":
        gpuid = coroutine_id % GPUS
    else:
        gpu_assignments = args.gpus.split(":")
        assert args.nproc == len(
            gpu_assignments
        ), "Please specify GPU assignment for each process, separated by :"
        gpuid = gpu_assignments[coroutine_id]

    while progress < len(ALL_TESTS):
        test = ALL_TESTS[progress]
        progress += 1
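        # --error-exitcode 1 makes cuda-memcheck exit nonzero whenever it
        # detects a memory error, so returncode below distinguishes a clean
        # run from a failing test or a memcheck report.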
        cmd = f"CUDA_VISIBLE_DEVICES={gpuid} cuda-memcheck --error-exitcode 1 python {args.filename} {test}"
        proc = await asyncio.create_subprocess_shell(
            cmd, stdout=asyncio.subprocess.PIPE, stderr=asyncio.subprocess.PIPE
        )
        try:
            stdout, stderr = await asyncio.wait_for(proc.communicate(), args.timeout)
        except asyncio.TimeoutError:
            print("Timeout:", test, file=logfile)
            proc.kill()
            if args.ci and not args.nohang:
                sys.exit("Hang detected on cuda-memcheck")
        else:
            if proc.returncode == 0:
                print("Success:", test, file=logfile)
            else:
                stdout = stdout.decode()
                stderr = stderr.decode()
                should_display = args.strict or not is_ignored_only(stdout)
                if should_display:
                    print("Fail:", test, file=logfile)
                    print(stdout, file=logfile)
                    print(stderr, file=logfile)
                    if args.ci:
                        sys.exit("Failure detected on cuda-memcheck")
                else:
                    print("Ignored:", test, file=logfile)
        del proc
        progressbar.update(1)


async def main():
    tasks = [asyncio.ensure_future(run1(i)) for i in range(args.nproc)]
    for t in tasks:
        await t


if __name__ == "__main__":
    asyncio.run(main())