#!/usr/bin/env python
"""
Count number of references to tokens in lexer source
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
:program:`count_token_references` counts how many references to all existing
tokens it can find by "grepping" the the source code of the lexers. This can
be used to find typos in token names, as those tokens are only used by one lexer.
:program:`count_token_references` supports the following options:
.. program:: count_token_references
.. option:: -v, --verbose
This gives output while the script is collecting information.
.. option:: --minfiles <COUNT>
Only report about tokens that are referenced in at least this many lexer
source files (default 1).
.. option:: --maxfiles <COUNT>
Only report about tokens that are referenced in at most this many lexer
source files (default 1).
.. option:: --minlines <COUNT>
Only report about tokens that are referenced in at least this many lexer
source lines (default 1).
.. option:: --maxlines <COUNT>
Only report about tokens that are referenced in at most this many lexer
source lines (default 10).
.. option:: -s, --subtokens
When ``--subtoken`` is given each token is also counted for each of its
parent tokens. I.e. if we have 10 occurrences of the token
``Token.Literal.Number.Integer`` and 10 occurrences of the token
``Token.Literal.Number.Hex`` but none for ``Token.Literal.Number``, with
``--subtoken`` ``Token.Literal.Number`` would be counted as having
20 references.
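
For example, to list tokens that are referenced from exactly one lexer
source file (a typical sign of a typo), one might run the script like this
(assuming it lives in a directory next to the ``pygments`` package, which
is what :func:`fetch_lexer_sources` expects)::

    $ python count_token_references.py --maxfiles 1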
"""
import sys
import argparse
import re
import pathlib

from pygments import token, lexers


def lookup_all_lexers():
    """
    Iterate through all lexers and fetch them.
    This should create all tokens that any of the lexers produce.
    """
    count = 0
    for (name, aliases, patterns, mimetypes) in lexers.get_all_lexers():
        # Instantiate each lexer once, preferring lookup by alias; the
        # for/else chains fall through to filename patterns and then to
        # mimetypes when the previous list is empty.
        for a in aliases:
            lexers.get_lexer_by_name(a)
            break
        else:
            for p in patterns:
                lexers.get_lexer_for_filename(p)
                break
            else:
                for m in mimetypes:
                    lexers.get_lexer_for_mimetype(m)
                    break
        count += 1
    return count
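

# Note: Pygments token types are created lazily, on first attribute access;
# merely evaluating e.g. token.Name.Function registers it as a subtype of
# token.Name. Loading every lexer above is therefore what makes the later
# walk over token.Token (see sub_tokens below) effectively exhaustive.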


def fetch_lexer_sources():
    """
    Return the source code of all lexers as a dictionary, mapping filenames
    to a list of lines.
    """
    # The script is expected to live in a directory next to the "pygments"
    # package (e.g. the "scripts" directory of a source checkout).
    lexer_dir = (pathlib.Path(__file__).parent / "../pygments/lexers").resolve()
    lexer_sources = {
        fn: fn.read_text(encoding='utf-8').splitlines(keepends=False)
        for fn in lexer_dir.glob("*.py")
    }
    return lexer_sources
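

# The mapping produced above keys on pathlib.Path objects, one per lexer
# module (e.g., hypothetically, .../pygments/lexers/python.py), with the
# value being that file's source as a list of lines.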


def sub_tokens(token):
    """
    Generator that yields a token and all of its sub-tokens recursively.
    """
    yield token
    for subtoken in token.subtypes:
        yield from sub_tokens(subtoken)
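

# For illustration: once the lexers have been loaded, iterating
# sub_tokens(token.Token.Literal.Number) yields Token.Literal.Number itself
# followed by each registered subtype, such as Token.Literal.Number.Integer
# and Token.Literal.Number.Hex (which subtypes exist depends on the
# installed lexers).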


class FileCount:
    """
    Stores information about line numbers in a file.

    This is used to store from which lines in a file a certain token is
    referenced.
    """
    def __init__(self, filename):
        self.filename = filename
        self.lines = []

    def __str__(self):
        if len(self.lines) > 3:
            lines = ", ".join(f"{line:,}" for line in self.lines[:5])
            lines = f"{lines}, ... ({len(self.lines):,} lines)"
        else:
            lines = ", ".join(f"{line:,}" for line in self.lines)
        return f"{self.filename.name}[{lines}]"

    def add(self, linenumber):
        self.lines.append(linenumber)

    def count_lines(self):
        return len(self.lines)


class TokenCount:
    """
    Stores information about a token and in which files it is referenced.
    """
    def __init__(self, token):
        self.token = token
        self.files = {}

    def add(self, filename, linenumber):
        if filename not in self.files:
            self.files[filename] = FileCount(filename)
        self.files[filename].add(linenumber)

    def __str__(self):
        if len(self.files) > 3:
            files = []
            for (i, filecount) in enumerate(self.files.values()):
                files.append(str(filecount))
                if i >= 5:
                    break
            files = ", ".join(files) + f", ... ({len(self.files):,} files)"
        else:
            files = ", ".join(str(filecount) for filecount in self.files.values())
        return f"{self.count_files():,} files, {self.count_lines():,} locations: {files}"

    def count_files(self):
        return len(self.files)

    def count_lines(self):
        return sum(fc.count_lines() for fc in self.files.values())
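

# A hypothetical illustration of the string forms produced above: a token
# referenced on lines 10 and 42 of python.py and on line 7 of basic.py
# would render roughly as
#     2 files, 3 locations: python.py[10, 42], basic.py[7]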


def find_token_references(lexer_sources, args):
    """
    Find all references to all tokens in the source code of all lexers.

    Note that this can't be 100% reliable, as it searches the source code for
    certain patterns: it searches for the last two components of a token name,
    i.e. to find references to the token ``Token.Literal.Number.Integer.Long``
    it searches for the regular expression ``\\bInteger.Long\\b``. This
    won't work reliably for top-level tokens like ``Token.String``, since these
    are often referred to as ``String``, but searching for ``\\bString\\b``
    would yield too many false positives.
    """

    # Maps token to :class:`TokenCount` objects.
    token_references = {}

    # Search for each token in each lexer source file and record in which
    # files and on which lines they are referenced.
    for t in sub_tokens(token.Token):
        parts = list(t)[-2:]

        if len(parts) == 0:
            name = "Token"
        elif len(parts) == 1:
            name = f"Token.{parts[0]}"
        else:
            name = ".".join(parts)

        token_references[t] = tokencount = TokenCount(t)

        if name != "Token":
            # The "." in the name is left unescaped here, so it matches any
            # character; in practice this only adds a few unlikely false
            # positives.
            pattern = re.compile(f"\\b{name}\\b")
            for (filename, sourcelines) in lexer_sources.items():
                for (i, line) in enumerate(sourcelines, 1):
                    if pattern.search(line) is not None:
                        tokencount.add(filename, i)
                        # With --subtoken, credit the reference to every
                        # ancestor token as well.
                        if args.subtoken:
                            t2 = t
                            while t2 is not token.Token:
                                t2 = t2.parent
                                token_references[t2].add(filename, i)

    return token_references
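

# For example, for Token.Literal.Number.Hex the search pattern built above
# is \bNumber.Hex\b, which matches lexer source lines such as
#     (r'0[xX][0-9a-fA-F]+', Number.Hex),
# (an illustrative lexer rule, not a quote from any particular lexer file).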


def print_result(token_references, args):
    def key(item):
        return (item[1].count_files(), item[1].count_lines())

    for (token, locations) in sorted(token_references.items(), key=key):
        if args.minfiles <= locations.count_files() <= args.maxfiles and \
           args.minlines <= locations.count_lines() <= args.maxlines:
            print(f"{token}: {locations}")


def main(args=None):
    p = argparse.ArgumentParser(description="Count how often each token is used by the lexers")
    p.add_argument(
        "-v", "--verbose",
        dest="verbose", help="Give more output.",
        default=False, action="store_true"
    )
    p.add_argument(
        "--minfiles",
        dest="minfiles", metavar="COUNT", type=int,
        help="Report all tokens referenced by at least COUNT lexer source files (default %(default)s)",
        default=1
    )
    p.add_argument(
        "--maxfiles",
        dest="maxfiles", metavar="COUNT", type=int,
        help="Report all tokens referenced by at most COUNT lexer source files (default %(default)s)",
        default=1
    )
    p.add_argument(
        "--minlines",
        dest="minlines", metavar="COUNT", type=int,
        help="Report all tokens referenced by at least COUNT lexer source lines (default %(default)s)",
        default=1
    )
    p.add_argument(
        "--maxlines",
        dest="maxlines", metavar="COUNT", type=int,
        help="Report all tokens referenced by at most COUNT lexer source lines (default %(default)s)",
        default=10
    )
    p.add_argument(
        "-s", "--subtoken",
        dest="subtoken",
        help="Include count of references to subtokens in the count for each token (default %(default)s)",
        default=False, action="store_true"
    )

    args = p.parse_args(args)

    if args.verbose:
        print("Looking up all lexers ... ", end="", flush=True)
    count = lookup_all_lexers()
    if args.verbose:
        print(f"found {count:,} lexers")

    if args.verbose:
        print("Fetching lexer source code ... ", end="", flush=True)
    lexer_sources = fetch_lexer_sources()
    if args.verbose:
        print(f"found {len(lexer_sources):,} lexer source files")

    if args.verbose:
        print("Finding token references ... ", end="", flush=True)
    token_references = find_token_references(lexer_sources, args)
    if args.verbose:
        print(f"found references to {len(token_references):,} tokens")

    if args.verbose:
        print()
        print("Result:")

    print_result(token_references, args)


if __name__ == "__main__":
    sys.exit(main())