File: generate_data.py

package info (click to toggle)
python-guess-language 0.5.2-5
  • links: PTS, VCS
  • area: main
  • in suites: bookworm, bullseye, buster, forky, sid, trixie
  • size: 796 kB
  • sloc: python: 21,412; makefile: 3
file content (158 lines) | stat: -rwxr-xr-x 4,471 bytes parent folder | download | duplicates (2)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
#!/usr/bin/env python3
"""Generate the data subpackage
"""

import os
import io
import re
import shutil
import sys
from operator import itemgetter

# Unicode block names that build_blocks() replaces with a shared name in the
# generated table; the original block name is kept as a trailing comment on
# the generated line.
FOLDED_NAMES = {
    "Latin-1 Supplement": "Extended Latin",
    "Latin Extended-A": "Extended Latin",
    "IPA Extensions": "Extended Latin",
    "Hiragana": "Kana",
    "Katakana": "Kana",
    "Katakana Phonetic Extensions": "Kana",
}
# Code-point limit of the generated table: build_blocks() sizes BLOCKS as
# (MAX_BLOCKS + 1) >> BLOCK_RSHIFT and stops reading once a block's exclusive
# end reaches this value.
MAX_BLOCKS = 0x2fa1f
# build_blocks() shifts block boundaries right by this many bits to obtain
# BLOCKS slice indices, so each slot spans 2 ** BLOCK_RSHIFT code points.
BLOCK_RSHIFT = 4
PACKAGE_NAME = "guess_language"
# Directory containing this script; the remaining paths are derived from it.
SCRIPT_DIR = os.path.dirname(__file__)
# Generated subpackage: <script dir>/guess_language/data
DATA_DIR = os.path.join(SCRIPT_DIR, PACKAGE_NAME, "data")
# build_blocks() writes the block table into the subpackage's __init__.py.
BLOCKS_PATH = os.path.join(DATA_DIR, "__init__.py")
# build_models() writes one generated module per trigram file here.
MODELS_DIR = os.path.join(DATA_DIR, "models")
# Input directory holding the trigram source files read by build_models().
TRIGRAMS_DIR = os.path.join(SCRIPT_DIR, "trigrams")
# Downloaded by build_blocks() only when no local copy exists yet.
BLOCKS_URL = "http://unicode.org/Public/UNIDATA/Blocks.txt"
# File name of the local copy ("Blocks.txt"), taken from the URL.
BLOCKS_FN = os.path.basename(BLOCKS_URL)
# Encoding used to read trigram files and to write the model modules.
ENCODING = "utf-8"
# build_models() asserts that every trigram file yields exactly this many
# entries.
MAX_GRAMS = 300


def make_data_dir():
    """Create the data package directories and their ``__init__.py`` files.

    Ensures ``DATA_DIR`` and ``MODELS_DIR`` exist (creating missing parents)
    and writes an empty ``__init__.py`` into each one. An ``__init__.py``
    that already exists is truncated to zero length.
    """
    for dir_path in (DATA_DIR, MODELS_DIR):
        # exist_ok avoids the check-then-create race of testing
        # os.path.exists() before calling makedirs().
        os.makedirs(dir_path, exist_ok=True)
        init_path = os.path.join(dir_path, "__init__.py")
        # Opening in "w" mode creates the file or empties an existing one.
        with open(init_path, "w"):
            pass


def download_file(remote, local):
    """Download the resource at URL ``remote`` into the file ``local``.

    The response is streamed in chunks into ``local + ".part"`` and moved
    onto ``local`` only after the transfer has finished, so a failed or
    interrupted download never leaves a truncated file at ``local`` (callers
    treat the mere existence of ``local`` as "already downloaded").

    Any exception raised by ``urlopen`` or by the file operations propagates
    to the caller; the temporary file is removed in that case.
    """
    from contextlib import closing
    from urllib.request import urlopen

    partial = local + ".part"
    try:
        with closing(urlopen(remote)) as inf, open(partial, "wb") as ouf:
            # Chunked copy: does not hold the whole response in memory.
            shutil.copyfileobj(inf, ouf)
        os.replace(partial, local)
    finally:
        # After a successful replace the temporary file is gone already;
        # this only fires when the download or the move failed.
        if os.path.exists(partial):
            os.remove(partial)


def build_blocks():
    """Generate the Unicode block table module at ``BLOCKS_PATH``.

    Reads the local copy of ``Blocks.txt`` next to this script, downloading
    it from ``BLOCKS_URL`` first if it is missing, and writes Python source
    that defines ``BLOCK_RSHIFT`` and a ``BLOCKS`` list with one slot per
    ``2 ** BLOCK_RSHIFT`` code points.

    For every ``START..END; Name`` line, the slots covered by the block are
    set to the block name, or to its replacement from ``FOLDED_NAMES`` with
    the original name kept as a trailing comment. Blocks that contain no
    alphabetic character are skipped. Reading stops after the first written
    block whose exclusive end reaches ``MAX_BLOCKS``.

    Raises:
        AssertionError: if a block boundary is not a multiple of
            ``2 ** BLOCK_RSHIFT``.
    """
    blocks_path = os.path.join(os.path.dirname(__file__), BLOCKS_FN)

    if not os.path.exists(blocks_path):
        download_file(BLOCKS_URL, blocks_path)

    splitter = re.compile(r"^([0-9A-F]+)\.\.([0-9A-F]+);\s*(.*)$", re.I)

    # The source is opened first so that an unreadable input fails before
    # the output file is created/truncated; both handles are closed by the
    # ``with`` statement and use an explicit encoding instead of the locale's.
    with open(blocks_path, encoding=ENCODING) as src, \
            open(BLOCKS_PATH, "w", encoding=ENCODING, newline="\n") as f:
        f.write("BLOCK_RSHIFT = {!r}\n".format(BLOCK_RSHIFT))
        f.write("BLOCKS = [None] * {:#x}\n".format(
            (MAX_BLOCKS + 1) >> BLOCK_RSHIFT))

        for line in src:
            if line.startswith("#"):
                continue

            m = splitter.match(line)

            if not m:
                # Blank lines and anything that is not a block definition.
                continue

            start = int(m.group(1), 16)
            end = int(m.group(2), 16) + 1  # exclusive upper bound
            name = m.group(3)

            # Blocks without a single alphabetic character are left as None.
            if not any(chr(n).isalpha() for n in range(start, end)):
                continue

            shifted_start = start >> BLOCK_RSHIFT
            shifted_end = end >> BLOCK_RSHIFT

            # The table can only represent blocks aligned to the slot size.
            assert shifted_start << BLOCK_RSHIFT == start
            assert shifted_end << BLOCK_RSHIFT == end

            if name in FOLDED_NAMES:
                comment = name
                name = FOLDED_NAMES[name]
            else:
                comment = None

            s = "BLOCKS[{:#x}:{:#x}] = [{!r}] * {:#x}{}\n".format(
                shifted_start, shifted_end, name, shifted_end - shifted_start,
                "  # " + comment if comment else ""
            )
            f.write(s)

            if end >= MAX_BLOCKS:
                break


def build_models():
    """Convert every trigram file into a generated model module.

    Each regular file in ``TRIGRAMS_DIR`` (processed in sorted name order,
    sub-directories skipped) is expected to hold lines made of a
    three-character trigram, whitespace, and a rank equal to the line's
    zero-based index. The result is written to
    ``MODELS_DIR/<lower-cased file name>.py`` as a ``model`` dict literal
    mapping trigram to rank, ordered by rank.

    Raises:
        AssertionError: if a trigram contains two or more consecutive
            whitespace characters, a rank differs from its line index, or a
            file does not yield exactly ``MAX_GRAMS`` distinct trigrams.
    """
    trigram_line = re.compile(r"^(.{3})\s+(.*)$")
    space_run = re.compile(r"\s{2,}", re.U)

    for file_name in sorted(os.listdir(TRIGRAMS_DIR)):
        source_path = os.path.join(TRIGRAMS_DIR, file_name)
        if os.path.isdir(source_path):
            continue

        ranks = {}
        with io.open(source_path, encoding=ENCODING) as source:
            for index, text in enumerate(source):
                match = trigram_line.match(text)
                if not match:
                    # Non-matching lines are ignored but still count
                    # towards the line index.
                    continue
                trigram, rank = match.groups()
                assert not space_run.search(trigram)
                assert index == int(rank)
                ranks[trigram] = index
            assert len(ranks) == MAX_GRAMS

        target_path = os.path.join(MODELS_DIR, file_name.lower() + ".py")
        by_rank = sorted(ranks.items(), key=itemgetter(1))

        with io.open(target_path, "w", encoding=ENCODING,
                     newline="\n") as target:
            target.write(
                "# -*- coding: {} -*-\nmodel = {{\n".format(ENCODING))
            target.writelines(
                " {!r}: {!r},\n".format(*entry) for entry in by_rank)
            target.write("}\n")


def generate_data(overwrite=False):
    """Build the data subpackage under ``DATA_DIR``.

    When ``DATA_DIR`` already exists as a directory, nothing is done unless
    ``overwrite`` is true, in which case the directory is deleted and
    rebuilt from scratch. Returns ``None`` in every case.
    """
    already_present = os.path.isdir(DATA_DIR)

    if already_present and not overwrite:
        return

    if already_present:
        # Start from a clean tree so no stale generated files survive.
        shutil.rmtree(DATA_DIR)

    make_data_dir()
    build_blocks()
    build_models()


def setup_hook(config):
    """Generate the data subpackage unless its directory already exists.

    ``config`` is accepted but not used. Calls ``generate_data()`` with its
    default ``overwrite=False``, so an existing data directory is left alone.

    NOTE(review): presumably invoked by the packaging tool as a setup hook
    -- confirm against the project's setup configuration.
    """
    generate_data()


if __name__ == "__main__":
    # Run as a script: always regenerate, deleting any existing data
    # directory first. generate_data() returns None, so sys.exit() ends the
    # process with a success status.
    sys.exit(generate_data(overwrite=True))