#!/usr/bin/env python3
"""
author: deadc0de6 (https://github.com/deadc0de6)
Copyright (c) 2023, deadc0de6
URL checking script
"""
import sys
import re
from urllib.parse import urlparse
from urllib3 import Retry
import requests
from requests.adapters import HTTPAdapter
# ANSI escape sequences for colored terminal output
RED = '\033[91m'
GREEN = '\033[92m'
YELLOW = '\033[93m'
BLUE = '\033[94m'
MAGENTA = '\033[95m'
RESET = '\033[0m'
# retry budget for the GET fallback session (see get_session)
RETRY_TOTAL = 10
RETRY_CONNECT = 5
# per-request timeout, in seconds
TIMEOUT = 10
# HTTP status codes accepted as "link is alive"
VALID_RET = [
    200,
    302,
]
# hostnames skipped entirely (not checked at all)
IGNORES = [
    'badgen.net',
    'coveralls.io',
    'packages.ubuntu.com',
]
# hostnames where a 403 response still counts as reachable
# (presumably bot protection on these sites — TODO confirm)
OK_WHEN_FORBIDDEN = [
    'linux.die.net',
    'ko-fi.com'
]
# substring patterns: any URL containing one of these is ignored
IGNORE_GENERIC = []
# browser-like User-Agent so servers don't reject the checker outright
USER_AGENT = (
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
    'AppleWebKit/537.36 (KHTML, like Gecko) '
    'Chrome/58.0.3029.110 Safari/537.36'
)
HEADERS = {
    'User-Agent': USER_AGENT,
}
# regex matching http(s) URLs in free text
PATTERN = (
    r"https?://[a-zA-Z0-9][a-zA-Z0-9-]{1,61}"
    r"[a-zA-Z0-9]\.[=a-zA-Z0-9\_\/\?\&\%\+\#\.\-]+"
)
def get_links(path):
    """Return the de-duplicated list of URLs found in the file at *path*.

    Args:
        path: path to a UTF-8 text file to scan for URLs.

    Returns:
        list of unique URL strings, in first-occurrence order.
    """
    with open(path, encoding='utf-8') as file:
        content = file.read()
    entries = re.findall(PATTERN, content)
    # dict.fromkeys dedupes while keeping first-seen order, unlike
    # list(set(...)) which returned the URLs in a non-deterministic
    # order between runs
    return list(dict.fromkeys(entries))
def get_session():
    """Build a requests session that retries transient HTTP failures.

    Returns:
        a requests.Session with a retrying adapter mounted for both
        http:// and https:// URLs.
    """
    # status codes worth retrying (rate limits, server-side errors)
    retry_statuses = [404, 429, 500, 502, 503, 504]
    retry_policy = Retry(
        total=RETRY_TOTAL,
        connect=RETRY_CONNECT,
        status=RETRY_CONNECT,
        backoff_factor=1,
        allowed_methods=False,
        status_forcelist=retry_statuses,
    )
    adapter = HTTPAdapter(max_retries=retry_policy)
    sess = requests.Session()
    for scheme in ('http://', 'https://'):
        sess.mount(scheme, adapter)
    return sess
def check_links(urls):
    """Check that every URL in *urls* is reachable.

    Each URL is first probed with a HEAD request; when that does not
    return an accepted status, a GET with a retrying session is tried.
    URLs matching IGNORE_GENERIC substrings or IGNORES hostnames are
    skipped; a 403 from an OK_WHEN_FORBIDDEN host is accepted.

    Args:
        urls: iterable of URL strings to check.

    Returns:
        True when every URL is reachable or ignored, False on the
        first broken link.
    """
    cnt = 0
    ign = 0
    # one retrying session reused for all GET fallbacks instead of
    # building a fresh session (and connection pool) per failing URL
    sess = get_session()
    for url in urls:
        cnt += 1
        print(f' checking {MAGENTA}{url}{RESET}')
        # substring-based ignore list
        if any(ignore in url for ignore in IGNORE_GENERIC):
            print(f' {YELLOW}[IGN]{RESET} {url}')
            ign += 1
            continue
        # hostname-based ignore list
        hostname = urlparse(url).hostname
        if hostname in IGNORES:
            print(f' {YELLOW}[IGN]{RESET} {url}')
            ign += 1
            continue
        verb = 'head'
        try:
            ret = requests.head(url,
                                timeout=TIMEOUT,
                                allow_redirects=True,
                                headers=HEADERS).status_code
        # pylint: disable=W0703
        except Exception:
            # any network error is treated as a 404 so the GET
            # fallback below gets a chance
            ret = 404
        if ret == 403 and hostname in OK_WHEN_FORBIDDEN:
            msg = f' [{GREEN}OK-although-{ret}{RESET}]'
            msg += f' {MAGENTA}{url}{RESET}'
            print(msg)
            continue
        if ret not in VALID_RET:
            msg = (
                f' {YELLOW}[WARN]{RESET} HEAD {url} returned {ret}'
                f' ... checking with GET'
            )
            print(msg)
            verb = 'get'
            try:
                ret = sess.get(url,
                               timeout=TIMEOUT,
                               allow_redirects=True,
                               headers=HEADERS).status_code
            # pylint: disable=W0703
            except Exception as exc:
                # the original let GET errors escape to main(); report
                # the failing URL here instead for a clearer message
                print(f' {RED}[ERROR]{RESET} {url} returned {exc}')
                return False
            if ret not in VALID_RET:
                print(f' {RED}[ERROR]{RESET} {url} returned {ret}')
                return False
        print(f' [{GREEN}OK{RESET}-{verb}-{ret}] {MAGENTA}{url}{RESET}')
    print(f' {GREEN}OK{RESET} - total {cnt} links checked ({ign} ignored)')
    return True
def main():
    """Script entry point: check all links found in the file given on argv.

    Returns:
        True when all links checked out, False on usage error, a broken
        link, or an unexpected exception.
    """
    if len(sys.argv) < 2:
        print(f'usage: {sys.argv[0]} <path>')
        return False
    path = sys.argv[1]
    print(f'checking {BLUE}{path}{RESET} for links...')
    links = get_links(path)
    print(f' found {len(links)} links')
    try:
        return check_links(links)
    # pylint: disable=W0703
    except Exception as exc:
        print(f'error {exc}')
        return False
if __name__ == '__main__':
    # exit code 0 on success, 1 on any failure
    sys.exit(0 if main() else 1)