File: update_iana_uri_schemes.py

package info (click to toggle)
linkchecker 10.5.0-1.1
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid
  • size: 3,112 kB
  • sloc: python: 13,131; makefile: 134; sh: 71; xml: 36; sql: 20; javascript: 19; php: 2
file content (113 lines) | stat: -rw-r--r-- 3,094 bytes parent folder | download | duplicates (2)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
import sys
import re
import csv
import requests

# IANA URI scheme registry page. It is not downloaded by this script; main()
# only passes it to the template as the "# from ..." source line.
iana_uri_schemes = "https://www.iana.org/assignments/uri-schemes/uri-schemes.xhtml"
# Registry export that main() hands to parse_csv_file() for download.
# CSV format: URI Scheme,Template,Description,Status,Well-Known URI Support,Reference,Notes
csv_iana_uri_schemes = (
    "https://www.iana.org/assignments/uri-schemes/uri-schemes-1.csv"
)

# Filled in place by parse_csv_file() as {status: {scheme: description}}.
# main() reads the "Permanent", "Provisional" and "Historical" status keys.
iana_uri_schemes_dict = {}
# Hand-maintained {scheme: description} entries rendered as the "other" group.
# main() raises ValueError if one of these keys also appears in the
# Permanent, Provisional or Historical data downloaded from IANA.
iana_uri_schemes_other = {
    "clsid": "Microsoft specific",
    "find": "Mozilla specific",
    "gemini": "Gemini protocol",
    "isbn": "ISBN (int. book numbers)",
    "javascript": "JavaScript",
    "ms-windows-store": "Microsoft Store",
    "slack": "Slack Technologies client",
    "tg": "Telegram",
    "whatsapp": "WhatsApp",
}

# Schemes that main() deletes from the "Permanent" group before rendering, so
# they never appear in the generated ignored_schemes_permanent pattern.
# NOTE(review): presumably these are the schemes the consumer of the generated
# code handles itself and must not ignore -- confirm against that code.
filter_uri_schemes_permanent = (
    "file",
    "ftp",
    "http",
    "https",
    "mailto",
)

# Skeleton of the Python source that main() prints. The %(uri)s,
# %(permanent)s, %(provisional)s, %(historical)s and %(other)s placeholders
# are filled with %-formatting; everything else is emitted verbatim, so do not
# edit the text inside the literal unless the generated output should change.
# NOTE(review): the emitted code calls re.compile() but the template contains
# no "import re" -- presumably the receiving module provides it; confirm.
template = '''
# from %(uri)s
ignored_schemes_permanent = r"""
%(permanent)s
"""

ignored_schemes_provisional = r"""
%(provisional)s
"""

ignored_schemes_historical = r"""
%(historical)s
"""

ignored_schemes_other = r"""
%(other)s
"""

ignored_schemes = "^({}{}{}{})$".format(
    ignored_schemes_permanent,
    ignored_schemes_provisional,
    ignored_schemes_historical,
    ignored_schemes_other,
)
ignored_schemes_re = re.compile(ignored_schemes, re.VERBOSE)

is_unknown_scheme = ignored_schemes_re.match
'''


def main(args):
    """Print the generated ignored-scheme source code to stdout.

    Downloads the IANA registry into iana_uri_schemes_dict, verifies that no
    hand-maintained scheme from iana_uri_schemes_other is also registered
    there, removes the schemes listed in filter_uri_schemes_permanent from
    the permanent group and renders the module-level template.

    args: command line arguments (currently not used).
    Returns 0.
    Raises ValueError carrying the scheme name if a hand-maintained scheme
    is found in the permanent, provisional or historical IANA data.
    """
    parse_csv_file(csv_iana_uri_schemes, iana_uri_schemes_dict)

    # The status groups are looked up lazily, one scheme at a time, and the
    # search stops at the first group that contains the scheme.
    registry_statuses = ("Permanent", "Provisional", "Historical")
    for extra_scheme in iana_uri_schemes_other:
        already_registered = any(
            extra_scheme in iana_uri_schemes_dict[status]
            for status in registry_statuses
        )
        if already_registered:
            raise ValueError(extra_scheme)

    # Drop the filtered schemes from the permanent group when present.
    for filtered_scheme in filter_uri_schemes_permanent:
        iana_uri_schemes_dict["Permanent"].pop(filtered_scheme, None)

    substitutions = {
        "uri": iana_uri_schemes,
        "permanent": get_regex(iana_uri_schemes_dict["Permanent"]),
        "provisional": get_regex(iana_uri_schemes_dict["Provisional"]),
        "historical": get_regex(iana_uri_schemes_dict["Historical"]),
        "other": get_regex(iana_uri_schemes_other),
    }
    rendered = template % substitutions
    print(rendered.rstrip())
    return 0


def get_regex(schemes):
    """Render a {scheme -> description} mapping as regex alternatives.

    Produces one line per scheme, ordered by scheme name, of the form
    "|<escaped scheme> # <description>". The escaped scheme is left-justified
    to at least 10 characters. Lines are joined with newlines, and an empty
    mapping yields an empty string.
    """
    alternatives = []
    for name in sorted(schemes):
        padded_name = re.escape(name).ljust(10)
        alternatives.append("|{} # {}".format(padded_name, schemes[name]))
    return "\n".join(alternatives)


def parse_csv_file(url, res, timeout=30):
    """Download the CSV at url and fill res with {status -> {scheme -> description}}.

    The first row is treated as the header and skipped. Of each data row only
    the first four columns are used: scheme (column 1), description
    (column 3) and status (column 4). Trailing columns are ignored.

    url: address of the CSV file to download.
    res: dict updated in place; one inner dict is created per status value.
    timeout: seconds to wait for the server before giving up.

    Raises requests.HTTPError if the server answers with an error status and
    ValueError if a non-empty row has fewer than four columns.
    """
    # requests waits forever unless a timeout is given; the context manager
    # releases the connection even when parsing fails part-way through.
    with requests.get(url, stream=True, timeout=timeout) as response:
        # Fail loudly instead of trying to parse an HTML error page as CSV.
        response.raise_for_status()
        if response.encoding is None:
            # iter_lines() only decodes when an encoding is known; without
            # one it yields bytes, which csv.reader rejects.
            response.encoding = "utf-8"
        reader = csv.reader(response.iter_lines(decode_unicode=True))
        next(reader, None)  # skip the header row
        for row in reader:
            if not row:
                # csv.reader yields [] for blank lines
                continue
            if len(row) < 4:
                raise ValueError(f"unexpected CSV row: {row!r}")
            scheme, description, status = row[0], row[2], row[3]
            # remove the HTTP historic experiments flag
            scheme = scheme.replace(" (OBSOLETE)", "")
            res.setdefault(status, {})[scheme] = description


if __name__ == "__main__":
    # Run as a script: forward the command line arguments (without the
    # program name) to main() and use its return value as the exit status.
    exit_status = main(sys.argv[1:])
    sys.exit(exit_status)