import sys
import os
import time
import getopt
import re
import requests
# Globals
_WEB_LINKS = 0
_HELP_MSG = \
"""
Usage: python3 validate_wiki_links.py [OPTIONS]
This script needs to be run from the mpich/doc/ directory. It will crawl
through the mpich/doc/wiki directory and its sub-directories, processing all
markdown files. Any link within each file is validated to ensure it still
works. If the link is a web link, the script checks for a 200 HTTP response
code. If the link is to another file, it validates that the file exists.

Short and Long OPTIONS:
    -h, --help       Display this help message
    --web-links      Enable validation of web links. Disabled by default.
"""
_ERROR_MSG = \
    "Error parsing option. Please use '--help' for a list of valid options."
_ALL_FILES = {}
# Functions
'''
crawl_dir: Crawls through the starting directory and recurses into any
sub-directory. All markdown files found are collected in a list and returned.
Additionally, each file name is added to the global _ALL_FILES dictionary for
later use.
'''
def crawl_dir(dir_start):
    global _ALL_FILES
    files = []
    directory = os.fsencode(dir_start)
    for file in os.listdir(directory):
        filename = os.fsdecode(file)
        full_path = os.path.join(dir_start, filename)
        if os.path.isdir(full_path):
            # Recursive call for a sub-directory
            files += crawl_dir(full_path)
        else:
            # We only want to process markdown files
            if os.path.splitext(filename)[1] == ".md":
                files.append(full_path)
                # We only want to add the actual file name and extension
                _ALL_FILES[os.path.basename(filename)] = 0
    return files
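# Illustrative result (hypothetical wiki layout, not actual file names): a call
# such as crawl_dir("wiki") might return
#   ['wiki/index.md', 'wiki/testing/ci.md']
# and leave _ALL_FILES as {'index.md': 0, 'ci.md': 0}, with every link count
# starting at zero.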
'''
find_links: We look for any links of the pattern [<text>](<link>) within the
document. We return all matches that are found.
'''
def find_links(file_name):
    # Non-greedy matching so multiple links on one line are found separately
    pattern = re.compile(r'\[.*?\]\(.*?\)')
    with open(file_name) as f:
        content = f.read()
    return pattern.findall(content)
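# Illustrative example (hypothetical file content, not taken from the wiki):
# for a file containing the line
#   "See the [FAQ](faq.md) and the [MPI Forum](https://www.mpi-forum.org/)."
# find_links() returns the raw matches
#   ['[FAQ](faq.md)', '[MPI Forum](https://www.mpi-forum.org/)']
# which parse_links() below reduces to just the link targets.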
'''
parse_links: We process the found links and extract the <link> portion of each
[<text>](<link>) match. The collected <link> targets are returned as a list.
'''
def parse_links(links):
    parsed = []
    for link in links:
        # Keep only the <link> portion; the [<text>] portion is not needed
        link = link[link.find("(")+1:link.find(")")]
        # In Markdown you can have a link such as
        #   [my link](<link> "wikilink")
        # We still want to process those links, and just skip the trailing
        # title text
        if len(link.split()) > 1:
            link = link.split()[0]
        parsed.append(link)
    return parsed
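# Illustrative example (hypothetical input): parse_links() keeps only the
# target inside the parentheses and drops any trailing Markdown title text:
#   parse_links(['[FAQ](faq.md "wikilink")', '[Home](index.md)'])
#   -> ['faq.md', 'index.md']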
'''
validate_links: Here we call the previous two functions to get all of our links
properly parsed and collected. Once we have the list we then check whether each
link is a web link, mailto link, or a file link.
- Web links are checked for a 200 HTTP response code to be considered valid.
- Mailto links are skipped.
- Files are checked to see if they exist in their relative locations.
We return a dictionary containing the file name and any broken links found in
it. Additionally, we count each time a file is linked to and store that count
in the global _ALL_FILES dictionary.
'''
def validate_links(file_name):
    global _ALL_FILES
    global _WEB_LINKS
    web_link_pattern = re.compile(r'https?://')
    mailto_pattern = re.compile(r'mailto:.*@.*\.[a-zA-Z0-9]+')
    ret = {"File": file_name, "Broken": []}
    links = parse_links(find_links(file_name))
    for link in links:
        # Web links
        if web_link_pattern.match(link):
            # If we don't have web link checking enabled, skip
            if not _WEB_LINKS:
                continue
            try:
                response = requests.head(link)
                if response.status_code != 200:
                    ret["Broken"].append(link)
            except requests.exceptions.RequestException:
                ret["Broken"].append(link)
        # Mailto links - skip these by default, no easy way to validate
        elif mailto_pattern.match(link):
            continue
        # File links
        else:
            # If this is a markdown relative link (an anchor), skip it
            if link.startswith('#'):
                continue
            # If the file exists in our _ALL_FILES dictionary, increase its
            # link count by 1.
            if os.path.basename(link) in _ALL_FILES:
                _ALL_FILES[os.path.basename(link)] += 1
            directory = os.path.dirname(file_name)
            file_path = os.path.join(directory, link)
            if not os.path.exists(file_path):
                ret["Broken"].append(file_path)
    return ret
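# Illustrative return value (hypothetical file names): a page with one dead
# relative link might produce
#   {"File": "wiki/testing/index.md", "Broken": ["wiki/testing/old_page.md"]}
# while a page whose links all resolve yields an empty "Broken" list.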
'''
parse_arguments: Parse command line arguments
'''
def parse_arguments(argv):
    global _HELP_MSG
    global _ERROR_MSG
    global _WEB_LINKS
    try:
        opts, args = getopt.getopt(argv, "h", ["help", "web-links"])
    except getopt.GetoptError:
        print(_ERROR_MSG)
        sys.exit()
    if args:
        print(_ERROR_MSG)
        sys.exit()
    for opt, arg in opts:
        # Output help message
        if opt in ("-h", "--help"):
            print(_HELP_MSG)
            sys.exit()
        # Used to enable checking web links
        elif opt == "--web-links":
            _WEB_LINKS = 1
# Main
def main(argv):
    global _ALL_FILES
    links = []
    parse_arguments(argv)
    all_files = crawl_dir("wiki")
    for file in all_files:
        links.append(validate_links(file))
    print("==== Broken Links ====")
    for link in links:
        if link["Broken"]:
            print("File: ", link["File"])
            for b in link["Broken"]:
                print("    Link: ", b)
            print()
    print()
    print("==== Unlinked Files ====")
    for f in _ALL_FILES:
        if _ALL_FILES[f] == 0:
            print(f)
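# Sample output shape (hypothetical paths, for reference only):
#   ==== Broken Links ====
#   File:  wiki/index.md
#       Link:  wiki/missing_page.md
#
#   ==== Unlinked Files ====
#   orphaned_notes.md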
if __name__ == "__main__":
    main(sys.argv[1:])