File: urls-check

package info (click to toggle)
cockpit 337-1
  • links: PTS, VCS
  • area: main
  • in suites: trixie
  • size: 36,232 kB
  • sloc: javascript: 47,090; python: 38,766; ansic: 35,470; xml: 6,048; sh: 3,413; makefile: 614
file content (154 lines) | stat: -rwxr-xr-x 5,247 bytes parent folder | download | duplicates (2)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
#!/usr/bin/python3 -cimport os, sys; os.execv(os.path.dirname(sys.argv[1]) + "/../test/common/pywrap", sys.argv)

# This file is part of Cockpit.
#
# Copyright (C) 2020 Red Hat, Inc.
#
# Cockpit is free software; you can redistribute it and/or modify it
# under the terms of the GNU Lesser General Public License as published by
# the Free Software Foundation; either version 2.1 of the License, or
# (at your option) any later version.
#
# Cockpit is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public License
# along with Cockpit; If not, see <https://www.gnu.org/licenses/>.

import argparse
import fnmatch
import os
import subprocess
import sys
import time
import urllib
import urllib.parse
from urllib.request import Request, urlopen

import task

# Canonical path of the directory two levels above this script; used as the
# working directory for the `git grep` that collects the URLs.
BASE_DIR = os.path.realpath(f'{__file__}/../..')

# Length of the quiet period: the check is skipped while an issue titled
# TASK_NAME was closed/updated within the last DAYS days.
DAYS = 7
# Title of the tracking issue; also used as the command-line description.
TASK_NAME = "Validate all URLs"

# Browser-like User-Agent sent with every request, as some websites otherwise
# block the requests.
USER_AGENT = "Mozilla/5.0 (X11; Fedora; Linux x86_64; rv:90.0) Gecko/20100101 Firefox/90.0"

# fnmatch patterns of URLs that are never requested.
IGNORE = [
    "https://www.gnome.org",  # keeps causing 403 from GitHub; DNS or user-agent ban?
    "*example.com*",  # some tests use demo urls
]

# fnmatch patterns of URLs that are expected to redirect (or are never meant
# to be fetched); a redirect on these is not reported.
KNOWN_REDIRECTS = [
    # fnmatch-like
    "https://access.redhat.com/security/updates/classification/#",
    "https://firefox.com/",
    "https://www.microsoft.com/",
    "https://github.com/patternfly/*",
    # SVG/XML processing in *.jsx
    "http://www.w3.org/2000/svg",
    "http://www.w3.org/XML/1998/namespace",
]


def main():
    """Run the URL validation task and record the outcome as an issue.

    With ``--dry-run`` the URLs are checked, the error and success reports
    are printed, and the process exits with status 1 if any URL failed
    (0 otherwise); no issue is touched.

    Otherwise nothing happens while an issue titled TASK_NAME is open, or
    while one was closed/updated within the last DAYS days. When the check
    does run, failures open a new issue; a clean run is recorded as a comment
    on the most recent closed issue of the last 4 * DAYS days or, if there is
    none, as a fresh issue that is closed right away.
    """
    arg_parser = argparse.ArgumentParser(description=TASK_NAME)
    arg_parser.add_argument("-v", "--verbose", action="store_true", help="Verbose output")
    arg_parser.add_argument('-n', '--dry-run', action="store_true", help="Only show urls")
    opts = arg_parser.parse_args()

    task.verbose = opts.verbose

    def task_issues(**query):
        """Return the issues matching *query* that carry this task's title."""
        return [issue for issue in task.api.issues(**query) if issue["title"] == TASK_NAME]

    if opts.dry_run:
        success, err = check_urls(opts.verbose)
        print(err)
        print(success)
        sys.exit(1 if err else 0)

    quiet_period_start = time.time() - (DAYS * 86400)

    # An open issue means somebody still has to deal with the previous report
    if task_issues(state="open"):
        return

    # A recently closed/updated issue means the check ran not long ago
    if task_issues(state="closed", since=quiet_period_start):
        return

    success, err = check_urls(opts.verbose)
    if err:
        # Broken URLs: report them in a new issue
        task.api.post("issues", {
            "title": TASK_NAME,
            "body": err,
            "labels": ["bot"]
        })
        return

    # Everything is fine: prefer commenting on the last issue (searching the
    # last 4*DAYS); if there is no such issue, open a new one and close it
    success = success or "Task hasn't produced any output"
    lookback_start = time.time() - (DAYS * 4 * 86400)
    previous = task_issues(state="closed", since=lookback_start)
    if previous:
        task.comment(previous[0], success)
        return

    created = task.api.post("issues", {
        "title": TASK_NAME,
        "body": success,
        "labels": ["bot"]
    })
    task.api.post(f"issues/{created['number']}", {"state": "closed"})


def check_urls(verbose, timeout=30):
    """Request every http(s) URL mentioned under pkg/ and report broken ones.

    The URLs are collected with ``git grep`` run in BASE_DIR (``*.svg`` files
    excluded) and de-duplicated with ``sort -u``. URLs matching an IGNORE
    pattern are skipped; every other URL is requested with USER_AGENT.

    Args:
        verbose: When true, print each URL as it is ignored or checked.
        timeout: Seconds to wait on each request before the URL is recorded
            as failed.

    Returns:
        A ``(success, err)`` tuple of strings. ``err`` is empty when no URL
        failed; otherwise it is a summary line followed by one indented line
        per failed URL with the reason. ``success`` holds the "all are valid"
        summary when nothing failed; the list of redirected URLs that match
        no KNOWN_REDIRECTS pattern is appended to it in either case.

    Raises:
        subprocess.CalledProcessError: If the ``git grep | sort`` pipeline
            exits with a non-zero status.
    """
    # Imported explicitly instead of relying on urllib.request having loaded
    # urllib.error as a side effect.
    from http.client import HTTPException
    from urllib.error import URLError

    command = r'git grep -IEho "(https?)://[-a-zA-Z0-9@:%_\+.~#?&=/]+" -- pkg ":!*.svg" | sort -u'
    output = subprocess.check_output(command, shell=True, universal_newlines=True, cwd=BASE_DIR)
    # Drop empty entries up front so they can never end up in the statistics
    urls = [url for url in output.splitlines() if url]

    redirects = []
    failed = []
    checked = 0  # URLs actually requested, i.e. excluding the ignored ones
    for url in urls:
        if any(fnmatch.fnmatch(url, pattern) for pattern in IGNORE):
            if verbose:
                print(f"Ignoring: {url}")
            continue

        if verbose:
            print(f"Checking: {url}")
        checked += 1

        try:
            # Specify agent as some websites otherwise block requests
            req = Request(url=url, headers={"User-Agent": USER_AGENT})
            # The timeout keeps a single stalled server from hanging the
            # whole task; the context manager closes the connection.
            with urlopen(req, timeout=timeout) as resp:
                final_url = resp.geturl()
                status = resp.getcode()
                reason = resp.reason
        except URLError as e:
            # Includes HTTPError, i.e. the 4xx/5xx responses
            failed.append(f"{url} : {e.reason}")
            continue
        except (OSError, HTTPException, ValueError) as e:
            # urlopen() does not wrap everything in URLError: dropped
            # connections, read timeouts, invalid ports or host names that
            # cannot be encoded surface as these types. One bad URL must not
            # abort the whole run.
            failed.append(f"{url} : {e}")
            continue

        if final_url != url and not any(fnmatch.fnmatch(url, pattern) for pattern in KNOWN_REDIRECTS):
            redirects.append(url)
        if status >= 400:
            failed.append(f"{url} : {status} {reason}")

    err = ""
    success = ""
    if failed:
        err = f"Checked {checked} URLs out of which {len(failed)} is/are invalid:\n"
        err += ''.join(f'    {url}\n' for url in failed)
    else:
        success = f"Checked {checked} URLs and all are valid."
    if redirects:
        success += "\nFollowing URLs are redirected:\n"
        success += ''.join(f'    {url}\n' for url in redirects)
    return success, err


# Run the task only when executed as a script. main() returns None, so the
# exit status is 0 unless main() exits explicitly (dry-run mode) or raises.
if __name__ == '__main__':
    sys.exit(main())