1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126
|
#!/usr/bin/env python3
# Check links in given asciidoctor files
# e.g. `tools/check_links_in_adoc.py $(find docs -name \*.adoc) *.adoc`
import argparse
import io
import os
import re
import requests # type: ignore
import sys
from typing import List, Tuple
# A regular expression matching links in Asciidoc.
# Exactly one of its three capture groups is set per match:
#   1. the target of a `link:`, `image:`/`image::`, `include::` or `xref:`
#      macro, i.e. everything between the macro prefix and the opening `[`
#   2. a bare http(s)/ftp(s) URL that is directly followed by `[`
#   3. the value of a `link=` attribute (optionally double-quoted) that ends
#      an attribute list, i.e. is directly followed by `]`
# The leading `(?:\W|^)` requires the link to start the line or to follow a
# non-word character.
ADOC_LINKS = re.compile(r'(?:\W|^)(?:(?:link:|image::?|include::|xref:)([^[]+)\[|((?:ht|f)tps?://[^[]*)\[|link="?([^]"]+)"?\])')
# Matches the scheme prefix of a remote link (http, https, ftp or ftps).
# NOTE(review): not referenced anywhere in the visible part of this file.
LINK_REMOTE = re.compile('(ht|f)tps?://')
class Link():
    """A single link found in an asciidoctor file.

    Attributes:
        uri: The link target exactly as written in the document.
        file: Name of the file the link was found in.
        line: 1-based line number of the link inside ``file``.
        type: Classification derived from ``uri``: ``'url'`` for
            http/https, ``'ftp'`` for ftp/ftps, ``'email'`` for
            ``mailto:`` targets and ``'path'`` for everything else.
    """
    uri: str
    file: str
    line: int
    type: str

    def __init__(self, uri: str, file: str, line: int):
        """Store the link location and classify ``uri`` by its prefix."""
        self.uri = uri
        self.file = file
        self.line = line
        if uri.startswith(('http://', 'https://')):
            self.type = 'url'
        elif uri.startswith(('ftp://', 'ftps://')):
            # This branch used to repeat the http/https test, which made it
            # unreachable and let FTP links fall through to 'path'.
            self.type = 'ftp'
        elif uri.startswith('mailto:'):
            self.type = 'email'
        else:
            self.type = 'path'
def parse_args(args: List[str]) -> argparse.Namespace:
    """Parse the command-line arguments of the link checker.

    Args:
        args: Argument strings without the program name.

    Returns:
        A namespace with ``verbose`` (bool, default False), ``timeout``
        (int, default 2) and ``adoc`` (one or more files opened for reading).
    """
    cli = argparse.ArgumentParser(description='Check URLs in asciidoctor files')
    cli.add_argument(
        '--verbose', '-v',
        action='store_true',
        help='Output all found links, not only failed ones',
    )
    cli.add_argument(
        '--timeout', '-t',
        type=int,
        default=2,
        help='Timeout in seconds (default 2)',
    )
    cli.add_argument(
        'adoc',
        type=argparse.FileType('r'),
        nargs='+',
        help='Names of asciidoctor files',
    )
    namespace = cli.parse_args(args)
    return namespace
def extract_links(adoc: io.TextIOWrapper) -> Tuple[int, List[Link]]:
    """Collect every link that ADOC_LINKS finds in an open asciidoctor file.

    Args:
        adoc: Open text file; it is read line by line and its ``name`` is
            recorded on every link found.

    Returns:
        A ``(status, links)`` tuple. ``status`` is always 0; ``links`` holds
        one ``Link`` per non-empty capture group, with 1-based line numbers.
    """
    found = []
    for number, raw in enumerate(adoc, start=1):
        for hit in ADOC_LINKS.finditer(raw.strip()):
            # Only one alternative of the pattern matches, so the other
            # groups are None and get filtered out here.
            found.extend(
                Link(target, str(adoc.name), number)
                for target in hit.groups()
                if target
            )
    return 0, found
def check_link(link: Link) -> int:
    """Check one link with the checker that fits its type.

    Links of type 'url' go to check_link_url and links of type 'path' go to
    check_link_path. Every other type is ignored.

    Returns:
        The result of the selected checker, or 0 for ignored link types.
    """
    kind = link.type
    if kind == 'path':
        return check_link_path(link)
    if kind == 'url':
        return check_link_url(link)
    # ignored
    return 0
def check_link_url(link: Link) -> int:
    """Check a remote link with an HTTP HEAD request, following redirects.

    The request timeout comes from the ``--timeout`` command-line option when
    the module-level ``values`` namespace exists; otherwise it is 2 seconds.

    Args:
        link: The link whose ``uri`` is requested.

    Returns:
        0 if the final response has status 200, 2 if the request itself
        failed (connection error, timeout, invalid URL, ...), 4 for any
        other status code.
    """
    # `values` is only set when the file runs as a script, so look it up
    # defensively instead of hard-coding the timeout.
    timeout = getattr(globals().get('values'), 'timeout', 2)
    try:
        response = requests.head(link.uri, timeout=timeout,
                                 allow_redirects=True)
    except requests.RequestException as exc:
        # RequestException is the base class of all requests errors. Catching
        # it means a malformed URL or a redirect loop is reported as a failed
        # link instead of aborting the whole run.
        fail(link, str(exc))
        return 2
    if response.status_code == 200:
        ok(link)
        return 0
    fail(link, str(response.status_code))
    return 4
def check_link_path(link: Link) -> int:
    """Check that the file a local link points to exists.

    A ``#fragment`` suffix (for example ``xref:other.adoc#section[]``) is
    stripped before the check, because it names an anchor inside the target
    rather than part of the file name. Relative paths are resolved against
    the directory of the file that contains the link.

    Args:
        link: The link whose ``uri`` is a file system path.

    Returns:
        0 if the target exists, 1 otherwise.
    """
    target = link.uri.partition('#')[0]
    if not target:
        # A fragment-only link points into the document that contains it.
        fullname = link.file
    elif os.path.isabs(target):
        fullname = target
    else:
        fullname = os.path.join(os.path.dirname(link.file), target)
    if os.path.exists(fullname):
        ok(link)
        return 0
    fail(link, 'NoFile ' + fullname)
    return 1
def fail(link: Link, reason: str) -> None:
    """Print a failure line for ``link``.

    The line holds the upper-cased link type followed by a colon, the file,
    the line number, the URI and ``reason`` inside a ``[FAIL - ...]`` marker.
    """
    status = '[FAIL - {}]'.format(reason)
    label = link.type.upper() + ':'
    print(label, link.file, link.line, link.uri, status)
def ok(link: Link) -> None:
    """Print a success line for ``link``, but only in verbose mode.

    Reads the module-level ``values`` namespace produced by parse_args and
    prints nothing unless ``values.verbose`` is set.
    """
    if not values.verbose:
        return
    label = link.type.upper() + ':'
    print(label, link.file, link.line, link.uri, '[OK]')
if __name__ == '__main__':
    # Script entry point: extract the links from every given file, then check
    # them. The exit code is the bitwise OR of all individual results.
    # `values` must stay a module-level name because ok() reads
    # values.verbose.
    values = parse_args(sys.argv[1:])
    ret_code = 0
    all_links = []
    for adoc in values.adoc:
        # argparse.FileType opened every file while parsing the arguments.
        # Close each handle as soon as it has been scanned so that a long
        # file list does not keep all of them open.
        with adoc:
            rc, links = extract_links(adoc)
        ret_code |= rc
        all_links += links
    # Do not start checking if the extraction reported an error.
    if ret_code:
        sys.exit(ret_code)
    for link in all_links:
        ret_code |= check_link(link)
    sys.exit(ret_code)
|