#!/usr/bin/env python3
"""This script runs cuda-memcheck on the specified unit test. Each test case
is run in its isolated process with a timeout so that:
1) different test cases won't influence each other, and
2) in case of hang, the script would still finish in a finite amount of time.
The output will be written to a log file result.log
Example usage:
python run_cuda_memcheck.py ../test_torch.py 600
Note that running cuda-memcheck could be very slow.
"""
import argparse
import asyncio
import multiprocessing
import os
import subprocess
import sys
import cuda_memcheck_common as cmc
import tqdm
import torch
# Collected list of "TestClass.test_name" identifiers discovered below.
ALL_TESTS = []
# Number of visible GPUs, used for round-robin assignment to workers.
GPUS = torch.cuda.device_count()
# parse arguments
parser = argparse.ArgumentParser(description="Run isolated cuda-memcheck on unit tests")
parser.add_argument(
    "filename", help="the python file for a test, such as test_torch.py"
)
parser.add_argument(
    "timeout",
    type=int,
    help="kill the test if it does not terminate in a certain amount of seconds",
)
parser.add_argument(
    "--strict",
    action="store_true",
    # NOTE: adjacent string literals concatenate with no separator, so each
    # fragment must end with an explicit space.
    help="Whether to show cublas/cudnn errors. These errors are ignored by default because "
    "cublas/cudnn does not run error-free under cuda-memcheck, and ignoring these errors "
    "reduces noise in the report",
)
parser.add_argument(
    "--nproc",
    type=int,
    default=multiprocessing.cpu_count(),
    help="Number of processes running tests, default to number of cores in the system",
)
parser.add_argument(
    "--gpus",
    default="all",
    help='GPU assignments for each process, it could be "all", or : separated list like "1,2:3,4:5,6"',
)
parser.add_argument(
    "--ci",
    action="store_true",
    help="Whether this script is executed in CI. When executed inside a CI, this script fails when "
    "an error is detected. Also, it will not show tqdm progress bar, but directly print the error "
    "to stdout instead.",
)
parser.add_argument("--nohang", action="store_true", help="Treat timeout as success")
parser.add_argument("--split", type=int, default=1, help="Split the job into pieces")
parser.add_argument(
    "--rank", type=int, default=0, help="Which piece this process should pick"
)
args = parser.parse_args()
# Filters that ignores cublas/cudnn errors
# TODO (@zasdfgbnm): When can we remove this? Will cublas/cudnn run error-free under cuda-memcheck?
def is_ignored_only(output):
    """Return True if every error in the cuda-memcheck *output* originates
    from cublas/cudnn/cufft frames (these libraries are known not to run
    error-free under cuda-memcheck, so their errors are ignored unless
    --strict is given).

    Returns False when the output cannot be parsed, so an unparseable
    report is never silently ignored.
    """
    try:
        report = cmc.parse(output)
    except cmc.ParseError:
        # in case the simple parser fails parsing the output of cuda memcheck
        # then this error is never ignored.
        return False
    ignored_libs = ("libcublas", "libcudnn", "libcufft")
    count_ignored_errors = 0
    for e in report.errors:
        # Join the stack frames once per error instead of once per library.
        stack = "".join(e.stack)
        if any(lib in stack for lib in ignored_libs):
            count_ignored_errors += 1
    return count_ignored_errors == report.num_errors
# Set environment PYTORCH_CUDA_MEMCHECK=1 to allow skipping some tests
os.environ["PYTORCH_CUDA_MEMCHECK"] = "1"
# Discover tests:
# To get a list of tests, run:
# pytest --setup-only test/test_torch.py
# and then parse the output
proc = subprocess.Popen(
    ["pytest", "--setup-only", args.filename],
    stdout=subprocess.PIPE,
    stderr=subprocess.PIPE,
)
stdout, stderr = proc.communicate()
lines = stdout.decode().strip().splitlines()
for line in lines:
    # Only the lines describing collected test items contain this marker,
    # e.g. "test_torch.py::TestTorch::test_foo (fixtures used: ...)".
    if "(fixtures used:" in line:
        # First whitespace-separated token is the pytest node id.
        line = line.strip().split()[0]
        # Drop the file path before the first "::"...
        line = line[line.find("::") + 2 :]
        # ...and turn "TestTorch::test_foo" into "TestTorch.test_foo",
        # the form accepted on the unittest command line.
        line = line.replace("::", ".")
        ALL_TESTS.append(line)
# Do a simple filtering:
# if 'cpu' or 'CPU' is in the name and 'cuda' or 'CUDA' is not in the name, then skip it
def is_cpu_only(name):
    """Return True when the test name mentions cpu but not cuda (case-insensitive)."""
    lowered = name.lower()
    if "cuda" in lowered:
        return False
    return "cpu" in lowered
# Keep only tests that might exercise CUDA.
ALL_TESTS = [x for x in ALL_TESTS if not is_cpu_only(x)]
# Split all tests into chunks, and only on the selected chunk
# Sort first so every rank computes the same deterministic partition.
ALL_TESTS.sort()
# Ceiling division: the last chunk may be smaller than chunk_size.
chunk_size = (len(ALL_TESTS) + args.split - 1) // args.split
start = chunk_size * args.rank
end = chunk_size * (args.rank + 1)
ALL_TESTS = ALL_TESTS[start:end]
# Run tests:
# Since running cuda-memcheck on PyTorch unit tests is very slow, these tests must be run in parallel.
# This is done by using the coroutine feature in new Python versions. A number of coroutines are created;
# they create subprocesses and awaiting them to finish. The number of running subprocesses could be
# specified by the user and by default is the same as the number of CPUs in the machine.
# These subprocesses are balanced across different GPUs on the system by assigning one devices per process,
# or as specified by the user
# Index of the next test to hand out; shared by all worker coroutines.
progress = 0
if not args.ci:
    logfile = open("result.log", "w")
    progressbar = tqdm.tqdm(total=len(ALL_TESTS))
else:
    # In CI, results go straight to stdout and no progress bar is shown.
    logfile = sys.stdout
    # create a fake progress bar that does not display anything
    class ProgressbarStub:
        # Mimics the only part of the tqdm interface used below (.update).
        def update(self, *args):
            return
    progressbar = ProgressbarStub()
async def run1(coroutine_id):
    """Worker coroutine: repeatedly claim the next test from ALL_TESTS and
    run it in its own subprocess under cuda-memcheck, with a timeout.

    ``coroutine_id`` selects the GPU: round-robin over all visible GPUs
    when --gpus=all, otherwise the corresponding entry of the
    user-provided ":"-separated assignment list.
    """
    global progress
    if args.gpus == "all":
        gpuid = coroutine_id % GPUS
    else:
        gpu_assignments = args.gpus.split(":")
        assert args.nproc == len(
            gpu_assignments
        ), "Please specify GPU assignment for each process, separated by :"
        gpuid = gpu_assignments[coroutine_id]
    while progress < len(ALL_TESTS):
        # Claim the next test. Safe without a lock: asyncio is single
        # threaded and there is no await between the read and the increment.
        test = ALL_TESTS[progress]
        progress += 1
        # --error-exitcode 1 makes cuda-memcheck fail the process on any error.
        cmd = f"CUDA_VISIBLE_DEVICES={gpuid} cuda-memcheck --error-exitcode 1 python {args.filename} {test}"
        proc = await asyncio.create_subprocess_shell(
            cmd, stdout=asyncio.subprocess.PIPE, stderr=asyncio.subprocess.PIPE
        )
        try:
            stdout, stderr = await asyncio.wait_for(proc.communicate(), args.timeout)
        except asyncio.TimeoutError:
            print("Timeout:", test, file=logfile)
            proc.kill()
            # In CI a hang is fatal unless --nohang requested treating it as success.
            if args.ci and not args.nohang:
                sys.exit("Hang detected on cuda-memcheck")
        else:
            if proc.returncode == 0:
                print("Success:", test, file=logfile)
            else:
                stdout = stdout.decode()
                stderr = stderr.decode()
                # Unless --strict, suppress failures consisting solely of
                # known cublas/cudnn/cufft errors.
                should_display = args.strict or not is_ignored_only(stdout)
                if should_display:
                    print("Fail:", test, file=logfile)
                    print(stdout, file=logfile)
                    print(stderr, file=logfile)
                    if args.ci:
                        sys.exit("Failure detected on cuda-memcheck")
                else:
                    print("Ignored:", test, file=logfile)
        del proc
        progressbar.update(1)
async def main():
    """Spawn args.nproc worker coroutines and wait until every one finishes."""
    workers = [asyncio.ensure_future(run1(worker_id)) for worker_id in range(args.nproc)]
    for worker in workers:
        await worker
if __name__ == "__main__":
    # asyncio.run() (Python 3.7+) replaces the legacy
    # get_event_loop()/run_until_complete() pattern, which is deprecated for
    # this use since 3.10: it creates a fresh event loop, runs main() to
    # completion, and closes the loop on exit.
    asyncio.run(main())