File: check_codepoints.py

package info (click to toggle)
python-precis-i18n 1.1.1-3
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid
  • size: 1,836 kB
  • sloc: python: 1,825; sh: 28; makefile: 3
file content (76 lines) | stat: -rw-r--r-- 2,298 bytes parent folder | download | duplicates (2)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
"""
Program to check the code point tables included in the precis_i18n.unicode
module against the Unicode data files published on unicode.org.
"""

import re
from urllib.request import urlopen

import precis_i18n.unicode as ucd
from precis_i18n.codepointset import CodepointSet

# Matches one data line (as bytes) of a downloaded Unicode data file, e.g.
# b"0041..005A    ; Alphabetic # ...".  Group 1 is the code point field
# (hex digits and dots), group 2 is the property value that follows the ";".
PROP_REGEX = re.compile(rb"^([0-9A-Za-z.]+)\s+;\s+(\w+)\s+#")
# Data files to download.  An optional "#Name" fragment is not part of the
# download location: parse_unicode_datafile() strips it and uses it to
# prefix the collected property keys as "Name=<value>".
DATAFILE_URLS = [
    "https://www.unicode.org/Public/UNIDATA/DerivedCoreProperties.txt",
    "https://www.unicode.org/Public/UNIDATA/extracted/DerivedJoiningType.txt#Join_Type",
    "https://www.unicode.org/Public/UNIDATA/Scripts.txt",
    "https://www.unicode.org/Public/UNIDATA/HangulSyllableType.txt#Hangul_Type",
]


def parse_unicode_datafile(url, props):
    """Download a Unicode data file and collect its code points by property.

    Every line that matches PROP_REGEX contributes its code point field
    (kept as text, exactly as it appears in the file) to the list stored in
    `props` under the property value named on that line.

    Args:
        url: Location of the data file.  An optional "#Name" fragment is
            removed before downloading and used to qualify the keys as
            "Name=<value>".
        props: Dict mapping property keys to lists of code point strings.
            It is updated in place; nothing is returned.
    """
    # partition() splits on the first "#" only, so additional "#" characters
    # cannot break the unpacking the way a plain split() would.
    url, sep, fragment = url.partition("#")
    # A present-but-empty fragment still yields the "=" prefix, as before.
    prefix = fragment + "=" if sep else ""

    # Close the response deterministically instead of leaking the connection.
    with urlopen(url) as response:
        for line in response:
            m = PROP_REGEX.match(line)
            if m:
                codepoints, name = m.groups()
                # The response yields bytes; decode at the boundary.
                props.setdefault(prefix + name.decode(), []).append(
                    codepoints.decode()
                )


def compare_codepoints(varname, codepoint_str):
    """Report a table in precis_i18n.unicode that differs from fresh data.

    Builds a CodepointSet from `codepoint_str` and compares it with the
    attribute of the `ucd` module named `varname`.  When they differ, the
    variable name and the code point text are printed; otherwise nothing
    is printed.
    """
    parsed = CodepointSet(codepoint_str)
    differs = parsed != getattr(ucd, varname)
    if not differs:
        return

    for text in (varname, codepoint_str):
        print(text)


def combine(props, *names):
    """Concatenate the code point text of several properties.

    For each name, in the given order, the output holds a header line
    "# <name> (<count>)" followed by the text stored in `props[name]`, where
    the count is len() of the CodepointSet built from that text.  Returns an
    empty string when no names are passed.
    """
    sections = (
        "# %s (%d)\n%s\n" % (key, len(CodepointSet(props[key])), props[key])
        for key in names
    )
    return "".join(sections)


def main():
    """Download the data files and compare them with the bundled tables.

    All files in DATAFILE_URLS are parsed into one property dict, each
    property's code point list is joined into newline-separated text, and
    every table is checked with compare_codepoints(), which prints the
    ones that differ.
    """
    collected = {}
    for datafile in DATAFILE_URLS:
        parse_unicode_datafile(datafile, collected)

    props = {key: "\n".join(ranges) for key, ranges in collected.items()}

    # Tables that correspond to exactly one property; each key is looked up
    # only when its turn comes, in the listed order.
    single_property_tables = (
        ("_DEFAULT_IGNORABLE", "Default_Ignorable_Code_Point"),
        ("_JOINTYPE_DUAL_JOINING", "Join_Type=D"),
        ("_JOINTYPE_RIGHT_JOINING", "Join_Type=R"),
        ("_JOINTYPE_LEFT_JOINING", "Join_Type=L"),
        ("_JOINTYPE_TRANSPARENT", "Join_Type=T"),
        ("_GREEK_SCRIPT", "Greek"),
        ("_HEBREW_SCRIPT", "Hebrew"),
    )
    for table, key in single_property_tables:
        compare_codepoints(table, props[key])

    # Tables assembled from several properties.
    cjk_text = combine(props, "Hiragana", "Katakana", "Han")
    compare_codepoints("_HIRAGANA_KATAKANA_HAN", cjk_text)

    jamo_text = combine(props, "Hangul_Type=L", "Hangul_Type=V", "Hangul_Type=T")
    compare_codepoints("_OLD_HANGUL_JAMO", jamo_text)


# Run the check only when this file is executed as a script.
if __name__ == "__main__":
    main()