#!/usr/bin/env python3
# Copyright (c) 2009, Giampaolo Rodola', Himanshu Shekhar.
# All rights reserved. Use of this source code is governed by a
# BSD-style license that can be found in the LICENSE file.
"""Checks for broken links in file names specified as command line
parameters.
There are a ton of a solutions available for validating URLs in string
using regex, but less for searching, of which very few are accurate.
This snippet is intended to just do the required work, and avoid
complexities. Django Validator has pretty good regex for validation,
but we have to find urls instead of validating them (REFERENCES [7]).
There's always room for improvement.
Method:
* Match URLs using regex (REFERENCES [1]])
* Some URLs need to be fixed, as they have < (or) > due to inefficient
regex.
* Remove duplicates (because regex is not 100% efficient as of now).
* Check validity of URL, using HEAD request. (HEAD to save bandwidth)
Uses requests module for others are painful to use. REFERENCES[9]
Handles redirects, http, https, ftp as well.
REFERENCES:
Using [1] with some modifications for including ftp
[1] http://stackoverflow.com/a/6883094/5163807
[2] http://stackoverflow.com/a/31952097/5163807
[3] http://daringfireball.net/2010/07/improved_regex_for_matching_urls
[4] https://mathiasbynens.be/demo/url-regex
[5] https://github.com/django/django/blob/master/django/core/validators.py
[6] https://data.iana.org/TLD/tlds-alpha-by-domain.txt
[7] https://codereview.stackexchange.com/questions/19663/http-url-validating
[8] https://developer.mozilla.org/en-US/docs/Web/HTTP/Methods/HEAD
[9] http://docs.python-requests.org/
Author: Himanshu Shekhar <https://github.com/himanshub16> (2017)
"""
import argparse
import concurrent.futures
import functools
import os
import re
import sys
import traceback
import requests
HERE = os.path.abspath(os.path.dirname(__file__))
REGEX = re.compile(
r'(?:http|ftp|https)?://'
r'(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'
)
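
# Rough illustration of what the regex yields (the input string is made up);
# matches can carry trailing punctuation, which sanitize_url() strips later:
#
#   >>> REGEX.findall("see http://example.com, and (https://example.org)")
#   ['http://example.com,', 'https://example.org)']
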
REQUEST_TIMEOUT = 15
# Some websites answer a HEAD request with an error status code,
# e.g. 503 (Microsoft) or 401 (Apple).
# Those URLs get retried with a GET request instead.
RETRY_STATUSES = [503, 401, 403]
def memoize(fun):
"""A memoize decorator."""
@functools.wraps(fun)
def wrapper(*args, **kwargs):
key = (args, frozenset(sorted(kwargs.items())))
try:
return cache[key]
except KeyError:
ret = cache[key] = fun(*args, **kwargs)
return ret
cache = {}
return wrapper
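
# A minimal sketch of how the decorator behaves (the `slow` function below is
# hypothetical and not part of this script):
#
#   >>> @memoize
#   ... def slow(x):
#   ...     print("computing")
#   ...     return x * 2
#   >>> slow(3)
#   computing
#   6
#   >>> slow(3)  # same arguments: served from the cache, nothing printed
#   6
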
def sanitize_url(url):
    """Strip stray punctuation the regex may have captured around a URL."""
url = url.rstrip(',')
url = url.rstrip('.')
url = url.lstrip('(')
url = url.rstrip(')')
url = url.lstrip('[')
url = url.rstrip(']')
url = url.lstrip('<')
url = url.rstrip('>')
return url
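
# Intended effect, shown on made-up inputs:
#
#   >>> sanitize_url('(http://example.com),')
#   'http://example.com'
#   >>> sanitize_url('<https://example.org>')
#   'https://example.org'
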
def find_urls(s):
    """Return the unique, sanitized URLs found in string s."""
matches = REGEX.findall(s) or []
return list({sanitize_url(x) for x in matches})
def parse_rst(fname):
"""Look for links in a .rst file."""
with open(fname) as f:
text = f.read()
urls = find_urls(text)
# HISTORY file has a lot of dead links.
if fname == 'HISTORY.rst' and urls:
urls = [
x
for x in urls
if not x.startswith('https://github.com/giampaolo/psutil/issues')
]
return urls
def parse_py(fname):
    """Look for links in a .py file."""
    with open(fname) as f:
        lines = f.readlines()
    urls = set()
    for i, line in enumerate(lines):
        for url in find_urls(line):
            # If the URL lives in a "# " comment it may continue on the
            # following comment line(s); join those continuations.
            if line.lstrip().startswith('# '):
                subidx = i + 1
                while subidx < len(lines):
                    nextline = lines[subidx].strip()
                    if re.match(r"^# .+", nextline):
                        url += nextline[1:].strip()
                    else:
                        break
                    subidx += 1
            urls.add(url)
    return list(urls)
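
# For example (hypothetical source file), a URL wrapped across comment lines:
#
#   # See http://example.com/some/
#   # very/long/path
#
# is reassembled as "http://example.com/some/very/long/path", assuming the
# comment block ends right after the second line.
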
def parse_c(fname):
    """Look for links in a .c / .h file."""
    with open(fname) as f:
        lines = f.readlines()
    urls = set()
    for i, line in enumerate(lines):
        for url in find_urls(line):
            # URL inside a "//" comment: join continuation comment lines.
            if line.lstrip().startswith('// '):
                subidx = i + 1
                while subidx < len(lines):
                    nextline = lines[subidx].strip()
                    if re.match(r"^// .+", nextline):
                        url += nextline[2:].strip()
                    else:
                        break
                    subidx += 1
            # URL inside a "/* ... */" block: join continuation "*" lines.
            elif line.lstrip().startswith('* '):
                subidx = i + 1
                while subidx < len(lines):
                    nextline = lines[subidx].strip()
                    if re.match(r'^\* .+', nextline):
                        url += nextline[1:].strip()
                    else:
                        break
                    subidx += 1
            urls.add(url)
    return list(urls)
def parse_generic(fname):
with open(fname, errors='ignore') as f:
text = f.read()
return find_urls(text)
def get_urls(fname):
"""Extracts all URLs in fname and return them as a list."""
if fname.endswith('.rst'):
return parse_rst(fname)
elif fname.endswith('.py'):
return parse_py(fname)
elif fname.endswith(('.c', '.h')):
return parse_c(fname)
else:
with open(fname, errors='ignore') as f:
if f.readline().strip().startswith('#!/usr/bin/env python3'):
return parse_py(fname)
return parse_generic(fname)
@memoize
def validate_url(url):
    """Validate a URL by attempting an HTTP connection.

    Makes an HTTP HEAD request; for the status codes listed in
    RETRY_STATUSES it falls back to a GET request.
    """
    try:
        res = requests.head(url, timeout=REQUEST_TIMEOUT)
        # Some websites reject HEAD and answer e.g. 503 (Microsoft) or
        # 401 (Apple); retry those with a GET request.
        if (not res.ok) and (res.status_code in RETRY_STATUSES):
            res = requests.get(url, timeout=REQUEST_TIMEOUT)
        return res.ok
    except requests.exceptions.RequestException:
        return False
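
# Network-dependent sanity check, shown only as an illustration:
#
#   >>> validate_url("https://www.python.org/")
#   True
#   >>> validate_url("https://www.python.org/no-such-page")
#   False  # assuming the server answers with 404
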
def parallel_validator(urls):
    """Validate all URLs in parallel.

    urls: a list of (filename, url) tuples.
    """
fails = [] # list of tuples (filename, url)
current = 0
total = len(urls)
with concurrent.futures.ThreadPoolExecutor() as executor:
fut_to_url = {
executor.submit(validate_url, url[1]): url for url in urls
}
for fut in concurrent.futures.as_completed(fut_to_url):
current += 1
sys.stdout.write(f"\r{current} / {total}")
sys.stdout.flush()
fname, url = fut_to_url[fut]
try:
ok = fut.result()
except Exception: # noqa: BLE001
fails.append((fname, url))
print()
print(f"warn: error while validating {url}", file=sys.stderr)
traceback.print_exc()
else:
if not ok:
fails.append((fname, url))
print()
return fails
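
# Shape of the data involved (values are illustrative): the input is a list of
# (filename, url) pairs and the return value is the subset that failed:
#
#   fails = parallel_validator([("README.rst", "https://example.com")])
#   # -> [] if the link is reachable,
#   #    [("README.rst", "https://example.com")] otherwise
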
def main():
parser = argparse.ArgumentParser(
description=__doc__, formatter_class=argparse.RawTextHelpFormatter
)
parser.add_argument('files', nargs="+")
    args = parser.parse_args()
all_urls = []
for fname in args.files:
urls = get_urls(fname)
if urls:
print(f"{len(urls):4} {fname}")
all_urls.extend((fname, url) for url in urls)
fails = parallel_validator(all_urls)
if not fails:
print("all links are valid; cheers!")
else:
        for fname, url in fails:
            print(f"{fname:<30}: {url}")
print('-' * 20)
print(f"total: {len(fails)} fails!")
sys.exit(1)
if __name__ == '__main__':
try:
main()
    except KeyboardInterrupt:
        os._exit(0)