File: fix-exemplars-bases.py

package info (click to toggle)
python-gflanguages 0.7.2-3
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid, trixie
  • size: 10,004 kB
  • sloc: python: 818; sh: 8; makefile: 6
file content (56 lines) | stat: -rw-r--r-- 1,741 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
from collections import Counter
import unicodedata
from google.protobuf import text_format
from gflanguages import languages_public_pb2

# Exemplar attribute names; presumably the fields of `exemplar_chars`
# (only `base` and `marks` are read or written in this script).
ATTRIBUTES = ["base", "auxiliary", "marks", "punctuation", "index"]


def _normalize_exemplars(bases, marks):
    """Sort exemplar tokens into base characters and dotted-circle marks.

    Args:
        bases: Tokens from the space-separated ``base`` exemplar string.
        marks: Tokens from the space-separated ``marks`` exemplar string.

    Returns:
        A ``(new_bases, new_marks)`` tuple of lists. Every entry of
        ``new_marks`` starts with U+25CC (dotted circle) and appears only
        once, in first-seen order. ``new_bases`` holds the remaining base
        tokens in their original order, with one leading dotted circle
        removed where it was followed by other characters.
    """
    dotted_circle = "\u25CC"
    new_marks = []
    new_bases = []

    def add_mark(token):
        # Marks are always stored on a dotted-circle carrier, without
        # duplicates.
        if not token.startswith(dotted_circle):
            token = dotted_circle + token
        if token not in new_marks:
            new_marks.append(token)

    for token in marks:
        if token:
            add_mark(token)

    for token in bases:
        if not token:
            continue
        # Drop a leading dotted-circle carrier so the real first character
        # can be classified. A token that is only the dotted circle is kept
        # as-is; stripping it would leave an empty string and make the
        # category lookup below fail with IndexError.
        if token.startswith(dotted_circle) and len(token) > 1:
            token = token[1:]
        # "Mn" (nonspacing mark) and "Mc" (spacing combining mark) belong
        # with the marks rather than the bases.
        if unicodedata.category(token[0]) in ("Mn", "Mc"):
            add_mark(token)
        else:
            new_bases.append(token)

    return new_bases, new_marks


def main(args=None):
    """Move combining marks from ``base`` to ``marks`` in language files.

    Each path is parsed as a text-format ``LanguageProto``. Combining marks
    found among the base exemplars are moved to the marks exemplars (placed
    on a dotted circle, de-duplicated), and the message is written back to
    the same path. Files whose base exemplar string is empty are left
    untouched.

    Args:
        args: Iterable of file paths to process. When ``None``, the
            command-line arguments ``sys.argv[1:]`` are used.
    """
    if args is None:
        import sys

        args = sys.argv[1:]

    for path in args:
        with open(path, encoding="utf-8") as fp:
            language = text_format.Parse(
                fp.read(), languages_public_pb2.LanguageProto()
            )

        # Nothing to fix when the language declares no base exemplars.
        if not language.exemplar_chars.base:
            continue

        new_bases, new_marks = _normalize_exemplars(
            language.exemplar_chars.base.split(" "),
            language.exemplar_chars.marks.split(" "),
        )

        language.exemplar_chars.base = " ".join(new_bases)
        language.exemplar_chars.marks = " ".join(new_marks)

        # The context manager closes the file; no explicit close() needed.
        with open(path, "w", encoding="utf-8") as fp:
            fp.write(text_format.MessageToString(language, as_utf8=True))


if __name__ == "__main__":
    # Script entry point: every command-line argument is a file path to fix.
    import sys

    cli_paths = sys.argv[1:]
    main(args=cli_paths)