#!/usr/bin/env python3
# Wireshark - Network traffic analyzer
# By Gerald Combs <gerald@wireshark.org>
# Copyright 1998 Gerald Combs
#
# SPDX-License-Identifier: GPL-2.0-or-later
import argparse
import aiohttp
import asyncio
import os
import re
import shutil
import signal
import subprocess
# This utility scans the dissector code for URLs, then attempts to
# fetch the links. The results are shown in stdout, but also, at
# the end of the run, written to files:
# - URLs that couldn't be loaded are written to failures.txt
# - working URLs are written to successes.txt
# - any previous failures.txt is also copied to failures_last_run.txt
#
# N.B. preferred form of RFC link is e.g., https://tools.ietf.org/html/rfc4349
# TODO:
# - option to write back to dissector file when there is a failure?
# - optionally parse previous/recent successes.txt and avoid fetching them again?
# - make sure URLs are really within comments in code?
# - use urllib.parse or similar to better check URLs?
# - improve regex to allow '+' in URL (like confluence uses)
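
# Note: this script needs the third-party aiohttp package (e.g. 'pip install
# aiohttp'); all of the other imports are from the Python standard library.
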
# Try to exit soon after Ctrl-C is pressed.
should_exit = False


def signal_handler(sig, frame):
    global should_exit
    should_exit = True
    print('You pressed Ctrl+C - exiting')
    try:
        tasks = asyncio.all_tasks()
    except RuntimeError:
        # The async link checking hasn't started yet, so we can exit directly.
        exit(1)
    # Ignore further SIGINTs while we're cancelling the running tasks.
    signal.signal(signal.SIGINT, signal.SIG_IGN)
    for t in tasks:
        t.cancel()


signal.signal(signal.SIGINT, signal_handler)


class FailedLookup:

    def __init__(self):
        # Fake values mimicking an aiohttp response, so failed fetches can be
        # reported and counted in the same way as real ones.
        self.status = 0
        self.headers = {}
        self.headers['content-type'] = '<NONE>'

    def __str__(self):
        s = ('FailedLookup: status=' + str(self.status) +
             ' content-type=' + self.headers['content-type'])
        return s


# Dictionary from url -> lookup result (an aiohttp response, or a FailedLookup
# when the fetch failed)
cached_lookups = {}


class Link(object):

    def __init__(self, file, line_number, url):
        self.file = file
        self.line_number = line_number
        self.url = url
        self.tested = False
        self.r = None
        self.success = False

    def __str__(self):
        epan_idx = self.file.find('epan')
        if epan_idx == -1:
            filename = self.file
        else:
            filename = self.file[epan_idx:]
        s = ('SUCCESS ' if self.success else 'FAILED ') + \
            filename + ':' + str(self.line_number) + ' ' + self.url
        # Only show response details if we have a (possibly fake) response.
        if self.r:
            if self.r.status:
                s += " status-code=" + str(self.r.status)
                if 'content-type' in self.r.headers:
                    s += (' content-type="' +
                          self.r.headers['content-type'] + '"')
            else:
                s += ' <No response Received>'
        return s

    def validate(self):
        global cached_lookups
        global should_exit
        if should_exit:
            return
        self.tested = True
        if self.url in cached_lookups:
            self.r = cached_lookups[self.url]
        else:
            self.r = FailedLookup()

        if self.r.status < 200 or self.r.status >= 300:
            self.success = False
        else:
            self.success = True

        if (args.verbose or not self.success) and not should_exit:
            print(self)
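

# Module-level state filled in while scanning:
# - links:    every Link found (file, line number, URL)
# - files:    the files that were examined
# - all_urls: the set of distinct URLs, so each one is only fetched once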
links = []
files = []
all_urls = set()
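

# Scan a single file for URLs, recording a Link for each one found.
# A worked example of the trailing-punctuation handling: a comment line ending
# in "see https://tools.ietf.org/html/rfc4349." is matched including the final
# '.', which rstrip(").',") then removes, leaving
# https://tools.ietf.org/html/rfc4349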
def find_links_in_file(filename):
    if os.path.isdir(filename):
        return

    with open(filename, 'r', encoding="utf8") as f:
        for line_number, line in enumerate(f, start=1):
            # TODO: not matching
            # https://cwiki.apache.org/confluence/display/KAFKA/A+Guide+To+The+Kafka+Protocol
            urls = re.findall(
                r'https?://(?:[a-zA-Z0-9./_?&=-]+|%[0-9a-fA-F]{2})+', line)

            for url in urls:
                # Lop off any trailing chars that are not part of it
                url = url.rstrip(").',")

                # A url must have a period somewhere
                if '.' not in url:
                    continue
                # Don't fetch this internal link..
                if url.find('www.wireshark.org/tools/modelines') != -1:
                    continue
                global links, all_urls
                links.append(Link(filename, line_number, url))
                all_urls.add(url)


# Scan the given folder for links to test. Recurses.
def find_links_in_folder(folder):
    files_to_check = []
    for root, subfolders, files in os.walk(folder):
        for f in files:
            if should_exit:
                return
            file = os.path.join(root, f)
            if file.endswith('.c') or file.endswith('.adoc'):
                files_to_check.append(file)

    # Deal with files in sorted order.
    for file in sorted(files_to_check):
        find_links_in_file(file)
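

# Fetch a single URL and store the outcome in cached_lookups, keyed by URL, so
# a URL that appears in several files is only requested once. The semaphore
# bounds how many fetches are in flight at any one time.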
async def populate_cache(sem, session, url):
    global cached_lookups
    if should_exit:
        return
    async with sem:
        try:
            async with session.get(url) as r:
                cached_lookups[url] = r
                if args.verbose:
                    print('checking ', url, ': success', sep='')
        except (asyncio.CancelledError, ValueError, ConnectionError, Exception):
            cached_lookups[url] = FailedLookup()
            if args.verbose:
                print('checking ', url, ': failed', sep='')
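

# Fetch every distinct URL concurrently (at most 50 in-flight tasks over at
# most 30 TCP connections, each request limited to 25 seconds in total), then
# validate each Link against the cached results.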
async def check_all_links(links):
    sem = asyncio.Semaphore(50)
    timeout = aiohttp.ClientTimeout(total=25)
    connector = aiohttp.TCPConnector(limit=30)
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.76 Safari/537.36',
               'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8'}

    async with aiohttp.ClientSession(connector=connector, headers=headers, timeout=timeout) as session:
        tasks = [populate_cache(sem, session, u) for u in all_urls]
        try:
            await asyncio.gather(*tasks)
        except asyncio.CancelledError:
            await session.close()

    for link in links:
        link.validate()


#################################################################
# Main logic.
# command-line args. Controls which dissector files should be scanned.
# If no args given, will just scan epan/dissectors folder.
parser = argparse.ArgumentParser(description='Check URL links in dissectors')
parser.add_argument('--file', action='append',
                    help='specify individual dissector file to test')
parser.add_argument('--commits', action='store',
                    help='last N commits to check')
parser.add_argument('--open', action='store_true',
                    help='check open files')
parser.add_argument('--verbose', action='store_true',
                    help='when enabled, show more output')
parser.add_argument('--docs', action='store_true',
                    help='when enabled, also check document folders')

args = parser.parse_args()
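
# Example invocations (the script's name and location in the source tree are
# assumptions here; adjust to wherever this file actually lives):
#   python3 tools/check_dissector_urls.py                  # scan all dissectors
#   python3 tools/check_dissector_urls.py --commits 5      # files from last 5 commits
#   python3 tools/check_dissector_urls.py --file packet-foo.c --verbose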


def is_dissector_file(filename):
    p = re.compile(r'.*(packet|file)-.*\.c')
    return p.match(filename)


# Get files from wherever command-line args indicate.
if args.file:
    # Add specified file(s)
    for f in args.file:
        if not os.path.isfile(f) and not f.startswith('epan'):
            f = os.path.join('epan', 'dissectors', f)
        if not os.path.isfile(f):
            print('Chosen file', f, 'does not exist.')
            exit(1)
        else:
            files.append(f)
            find_links_in_file(f)
elif args.commits:
    # Get files affected by specified number of commits.
    command = ['git', 'diff', '--name-only', 'HEAD~' + args.commits]
    files = [f.decode('utf-8')
             for f in subprocess.check_output(command).splitlines()]
    # Fetch links from files (dissectors files only)
    files = list(filter(is_dissector_file, files))
    for f in files:
        find_links_in_file(f)
elif args.open:
    # Unstaged changes.
    command = ['git', 'diff', '--name-only']
    files = [f.decode('utf-8')
             for f in subprocess.check_output(command).splitlines()]
    files = list(filter(is_dissector_file, files))
    # Staged changes.
    command = ['git', 'diff', '--staged', '--name-only']
    files_staged = [f.decode('utf-8')
                    for f in subprocess.check_output(command).splitlines()]
    files_staged = list(filter(is_dissector_file, files_staged))
    for f in files:
        find_links_in_file(f)
    for f in files_staged:
        if f not in files:
            find_links_in_file(f)
            files.append(f)
elif args.docs:
    # Find links from doc folder(s)
    find_links_in_folder(os.path.join(os.path.dirname(__file__), '..', 'doc'))
else:
    # Find links from dissector folder.
    find_links_in_folder(os.path.join(os.path.dirname(__file__), '..', 'epan', 'dissectors'))

# If scanning a subset of files, list them here.
print('Examining:')
if args.file or args.commits or args.open:
    if files:
        print(' '.join(files), '\n')
    else:
        print('No files to check.\n')
else:
    if not args.docs:
        print('All dissector modules\n')
    else:
        print('Document sources')

asyncio.run(check_all_links(links))
# Write failures to a file. Back up any previous first though.
if os.path.exists('failures.txt'):
    shutil.copyfile('failures.txt', 'failures_last_run.txt')
with open('failures.txt', 'w') as f_f:
    for link in links:
        if link.tested and not link.success:
            f_f.write(str(link) + '\n')

# And successes
with open('successes.txt', 'w') as f_s:
    for link in links:
        if link.tested and link.success:
            f_s.write(str(link) + '\n')

# Count and show overall stats.
passed, failed = 0, 0
for link in links:
    if link.tested:
        if link.success:
            passed += 1
        else:
            failed += 1

print('--------------------------------------------------------------------------------------------------')
print(len(links), 'links checked: ', passed, 'passed,', failed, 'failed')