import csv
import os
import re
import sys
# This script takes the logs produced by the benchmark scripts (e.g.,
# torchbench.py) and parses them into a CSV file that summarizes what
# is failing and why.  It is kept separate from the benchmark script
# (rather than having the benchmark emit more structured output directly)
# because it is often more convenient to iterate quickly on log files
# offline instead of having to change the benchmark script and then do a
# full sweep to see the updates.
#
# This script is not very well written; feel free to rewrite it as necessary
# Expect exactly one command-line argument: the path to the benchmark log.
# A real check instead of `assert`, which would be stripped under `python -O`.
if len(sys.argv) != 2:
    sys.exit(f"usage: {sys.argv[0]} <log-file>")

# Read the whole log up front; the context manager guarantees the file
# handle is closed even if reading fails (the original leaked it).
with open(sys.argv[1]) as f:
    full_log = f.read()

# If the log contains a gist URL, extract it so we can include it in the CSV
gist_url = ""
m = re.search(r"https://gist.github.com/[a-f0-9]+", full_log)
if m is not None:
    gist_url = m.group(0)

# Split the log into an entry per benchmark.  The split pattern has two
# capture groups (benchmark name from a "cuda train/eval" line, or from a
# "failed to load" warning), so re.split yields flat triples of
# (name, name2, payload); [1:] drops the pre-match preamble chunk.
entries = re.split(
    r"(?:cuda (?:train|eval) +([^ ]+)|WARNING:root:([^ ]+) failed to load)", full_log
)[1:]
# Entries schema example:
# `['hf_Bert', None, '
# PASS\nTIMING: entire_frame_compile:1.80925 backend_compile:6e-05\nDynamo produced 1 graph(s) covering 367 ops\n']`
def chunker(seq, size):
    """Lazily yield consecutive slices of *seq* of length *size*.

    The final slice may be shorter when ``len(seq)`` is not a multiple
    of *size*.
    """
    for start in range(0, len(seq), size):
        yield seq[start : start + size]
c = 0  # how many entries we failed to classify (reported at the end)
i = 0  # how many benchmark rows we have emitted

# Columns of the summary CSV that is streamed to stdout.
fieldnames = [
    "bench",
    "name",
    "result",
    "component",
    "context",
    "explain",
    "frame_time",
    "backend_time",
    "graph_count",
    "op_count",
    "graph_breaks",
    "unique_graph_breaks",
]
out = csv.DictWriter(sys.stdout, fieldnames, dialect="excel")
out.writeheader()
# The first data row carries only the gist URL (if one was found) so it
# appears at the top of the sheet.
out.writerow({"explain": gist_url})
# Sometimes backtraces will be in third party code, which results
# in very long file names.  Delete the absolute path in this case.
def normalize_file(f):
    """Shorten a traceback file path for the CSV "component" column.

    Paths inside an installed package are reported relative to their
    ``site-packages`` directory; anything else is made relative to the
    current working directory.
    """
    if "site-packages/" in f:
        # maxsplit=1 keeps everything after the *first* "site-packages/";
        # the previous maxsplit=2 silently truncated paths that contained
        # the marker more than once (e.g. nested environments).
        return f.split("site-packages/", 1)[1]
    else:
        return os.path.relpath(f)
# Assume we run torchbench, huggingface, timm_models in that order
# (as output doesn't say which suite the benchmark is part of)
# TODO: make this more robust
bench = "torchbench"

# 3 = 1 + number of matches in the entries split regex
for name, name2, log in chunker(entries, 3):
    # Exactly one of the two capture groups matched; name2 is set when the
    # model failed to load.
    if name is None:
        name = name2
    # Suite transitions are detected by the (alphabetically) first model
    # name of each suite appearing in the log.
    if name.startswith("Albert"):
        bench = "huggingface"
    elif name.startswith("adv_inc"):
        bench = "timm_models"

    # Payload that will go into the csv
    r = "UNKNOWN"
    explain = ""
    component = ""
    context = ""

    if "PASS" in log:
        r = "PASS"
    if "TIMEOUT" in log:
        r = "FAIL TIMEOUT"
    if "Accuracy failed" in log:
        r = "FAIL ACCURACY"

    # Attempt to extract out useful information from the traceback
    # Keep only the text before a chained exception: the first (innermost)
    # exception is the root cause.
    log = log.split(
        "The above exception was the direct cause of the following exception"
    )[0]
    split = log.split("Traceback (most recent call last)", maxsplit=1)
    if len(split) == 2:
        log = split[1]
        log = log.split("Original traceback:")[0]
        # Match the last traceback frame plus the exception line, e.g.
        #   File "foo.py", line 3, in bar
        #     some_code()
        #   RuntimeError: message
        m = re.search(
            r'File "([^"]+)", line ([0-9]+), in .+\n +(.+)\n([A-Za-z]+(?:Error|Exception|NotImplementedError): ?.*)',
            log,
        )
        if m is not None:
            r = "FAIL"
            component = f"{normalize_file(m.group(1))}:{m.group(2)}"
            context = m.group(3)
            explain = f"{m.group(4)}"
        else:
            # A bare AssertionError has no ": message" suffix, so the
            # pattern above misses it; retry with a dedicated pattern.
            m = re.search(
                r'File "([^"]+)", line ([0-9]+), in .+\n +(.+)\nAssertionError', log
            )
            if m is not None:
                r = "FAIL"
                component = f"{normalize_file(m.group(1))}:{m.group(2)}"
                context = m.group(3)
                explain = "AssertionError"

    # Sometimes, the benchmark will say FAIL without any useful info
    # See https://github.com/pytorch/torchdynamo/issues/1910
    if "FAIL" in log:
        r = "FAIL"

    if r == "UNKNOWN":
        c += 1

    # Parse compile-time timings, if the log reported them.
    backend_time = None
    frame_time = None
    if "TIMING:" in log:
        result = re.search("TIMING:(.*)\n", log).group(1)
        split_str = result.split("backend_compile:")
        if len(split_str) == 2:
            backend_time = float(split_str[1])
            frame_time = float(split_str[0].split("entire_frame_compile:")[1])

    if "STATS:" in log:
        result = re.search("STATS:(.*)\n", log).group(1)
        # call_* op count: 970 | FakeTensor.__torch_dispatch__:35285 | ProxyTorchDispatchMode.__torch_dispatch__:13339
        split_all = result.split("|")
        # TODO: rewrite this to work with arbitrarily many stats

    # Graph/op/break counts from Dynamo's summary line, when present.
    graph_count = None
    op_count = None
    graph_breaks = None
    unique_graph_breaks = None
    if m := re.search(
        r"Dynamo produced (\d+) graphs covering (\d+) ops with (\d+) graph breaks \((\d+) unique\)",
        log,
    ):
        graph_count = m.group(1)
        op_count = m.group(2)
        graph_breaks = m.group(3)
        unique_graph_breaks = m.group(4)

    # If the context string is too long, don't put it in the CSV.
    # This is a hack to try to make it more likely that Google Sheets will
    # offer to split columns
    if len(context) > 78:
        context = ""

    # Temporary file names are meaningless, report it's generated code in this
    # case
    if "/tmp/" in component:
        component = "generated code"
        context = ""

    out.writerow(
        {
            "bench": bench,
            "name": name,
            "result": r,
            "component": component,
            "context": context,
            "explain": explain,
            "frame_time": frame_time,
            "backend_time": backend_time,
            "graph_count": graph_count,
            "op_count": op_count,
            "graph_breaks": graph_breaks,
            "unique_graph_breaks": unique_graph_breaks,
        }
    )
    i += 1

# Report how many entries could not be classified, on stderr so the CSV on
# stdout stays clean.
if c:
    print(f"failed to classify {c} entries", file=sys.stderr)