File: check_links.py

package: dotdrop 1.15.0-2
  • area: main
  • in suites: forky, sid
  • size: 1,812 kB
  • sloc: sh: 13,401; python: 8,186; makefile: 3
file content: 159 lines, 4,087 bytes, mode -rwxr-xr-x
#!/usr/bin/env python3
"""
author: deadc0de6 (https://github.com/deadc0de6)
Copyright (c) 2023, deadc0de6

URL checking script
"""

import sys
import re
from urllib.parse import urlparse
from urllib3 import Retry
import requests
from requests.adapters import HTTPAdapter


RED = '\033[91m'
GREEN = '\033[92m'
YELLOW = '\033[93m'
BLUE = '\033[94m'
MAGENTA = '\033[95m'
RESET = '\033[0m'

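# retry budget for the fallback GET session (see get_session)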
RETRY_TOTAL = 10
RETRY_CONNECT = 5

TIMEOUT = 10
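# status codes accepted as a successful link check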
VALID_RET = [
    200,
    302,
]
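# hostnames that are never checked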
IGNORES = [
    'badgen.net',
    'coveralls.io',
    'packages.ubuntu.com',
]
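# hostnames for which a 403 response is still treated as OK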
OK_WHEN_FORBIDDEN = [
    'linux.die.net',
    'ko-fi.com'
]
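# URL substrings to skip (checked before the hostname ignore list)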
IGNORE_GENERIC = []
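# browser-like User-Agent sent with every request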
USER_AGENT = (
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
    'AppleWebKit/537.36 (KHTML, like Gecko) '
    'Chrome/58.0.3029.110 Safari/537.36'
)
HEADERS = {
    'User-Agent': USER_AGENT,
}
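# loose URL matcher: http(s) scheme, one hostname label, a dot,
# then the remaining host/path/query characters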
PATTERN = (
    r"https?://[a-zA-Z0-9][a-zA-Z0-9-]{1,61}"
    r"[a-zA-Z0-9]\.[=a-zA-Z0-9\_\/\?\&\%\+\#\.\-]+"
)


def get_links(path):
    """get a list of URLS"""
    with open(path, encoding='utf-8') as file:
        content = file.read()
    entries = re.findall(PATTERN, content)
    urls = list(set(entries))
    return urls


def get_session():
    """get a session with retry"""
    session = requests.Session()
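    # retry the listed status codes with exponential backoff, regardless of HTTP method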
    retry_on = [404, 429, 500, 502, 503, 504]
    retry = Retry(total=RETRY_TOTAL,
                  connect=RETRY_CONNECT,
                  status=RETRY_CONNECT,
                  backoff_factor=1,
                  allowed_methods=False,
                  status_forcelist=retry_on)
    adapter = HTTPAdapter(max_retries=retry)
    session.mount('http://', adapter)
    session.mount('https://', adapter)
    return session


def check_links(urls):
    """check urls"""
    cnt = 0
    ign = 0
    for url in urls:
        cnt += 1
        ignored = False
        print(f'    checking {MAGENTA}{url}{RESET}')
        for ignore in IGNORE_GENERIC:
            if ignore in url:
                print(f'    {YELLOW}[IGN]{RESET} {url}')
                ign += 1
                ignored = True
                break
        if ignored:
            continue
        hostname = urlparse(url).hostname
        if hostname in IGNORES:
            print(f'    {YELLOW}[IGN]{RESET} {url}')
            ign += 1
            continue

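        # try a cheap HEAD request first and only fall back to GET if it fails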
        verb = 'head'
        try:
            ret = requests.head(url,
                                timeout=TIMEOUT,
                                allow_redirects=True,
                                headers=HEADERS).status_code
        # pylint: disable=W0703
        except Exception:
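            # any request error counts as a failure and triggers the GET fallback below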
            ret = 404
        if ret == 403 and hostname in OK_WHEN_FORBIDDEN:
            msg = f'    [{GREEN}OK-although-{ret}{RESET}]'
            msg += f' {MAGENTA}{url}{RESET}'
            print(msg)
            continue
        if ret not in VALID_RET:
            msg = (
                f'    {YELLOW}[WARN]{RESET} HEAD {url} returned {ret}'
                ' ... checking with GET'
            )
            print(msg)
            verb = 'get'
            sess = get_session()
            ret = sess.get(url,
                           timeout=TIMEOUT,
                           allow_redirects=True,
                           headers=HEADERS).status_code
            if ret not in VALID_RET:
                print(f'    {RED}[ERROR]{RESET} {url} returned {ret}')
                return False
        print(f'    [{GREEN}OK{RESET}-{verb}-{ret}] {MAGENTA}{url}{RESET}')
    print(f'    {GREEN}OK{RESET} - total {cnt} links checked ({ign} ignored)')
    return True


def main():
    """entry point"""
    if len(sys.argv) < 2:
        print(f'usage: {sys.argv[0]} <path>')
        return False

    print(f'checking {BLUE}{sys.argv[1]}{RESET} for links...')
    links = get_links(sys.argv[1])
    print(f'    found {len(links)} links')
    try:
        if not check_links(links):
            return False
    # pylint: disable=W0703
    except Exception as exc:
        print(f'error {exc}')
        return False
    return True


if __name__ == '__main__':
    if main():
        sys.exit(0)
    sys.exit(1)