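"""Normalise the ``base`` and ``marks`` exemplar characters of language files.

Each path given on the command line is parsed as a text-format
``LanguageProto``. Entries of ``exemplar_chars.base`` that start with a
combining mark are moved into ``exemplar_chars.marks``, every mark is shown on
a U+25CC DOTTED CIRCLE, and the file is rewritten in place.
"""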
from collections import Counter
import unicodedata
from google.protobuf import text_format
from gflanguages import languages_public_pb2
# Exemplar attribute names; presumably the fields of ``exemplar_chars``
# (``base`` and ``marks`` are the two read by ``main``) -- TODO confirm.
ATTRIBUTES = ["base", "auxiliary", "marks", "punctuation", "index"]
_DOTTED_CIRCLE = "\u25CC"
_MARK_CATEGORIES = ("Mn", "Mc")


def _split_bases_and_marks(bases, marks):
    """Separate combining marks from base characters.

    Args:
        bases: Entries of the ``base`` exemplar string (already split on
            single spaces; empty entries are ignored).
        marks: Entries of the ``marks`` exemplar string (same convention).

    Returns:
        A ``(new_bases, new_marks)`` tuple of lists. Every mark starts with
        U+25CC and appears once, in first-seen order. Bases keep their
        original order and are not de-duplicated.
    """
    new_marks = []
    for chars in marks:
        if not chars:
            continue
        if not chars.startswith(_DOTTED_CIRCLE):
            chars = _DOTTED_CIRCLE + chars
        if chars not in new_marks:
            new_marks.append(chars)

    new_bases = []
    for chars in bases:
        if not chars:
            continue
        if chars == _DOTTED_CIRCLE:
            # A lone dotted circle has nothing after the placeholder to
            # classify; keep it as a base rather than indexing into "".
            new_bases.append(chars)
            continue
        if chars.startswith(_DOTTED_CIRCLE):
            chars = chars[1:]
        if unicodedata.category(chars[0]) in _MARK_CATEGORIES:
            # The entry starts with a combining mark, so it belongs in
            # ``marks`` and is displayed on a dotted circle.
            chars = _DOTTED_CIRCLE + chars
            if chars not in new_marks:
                new_marks.append(chars)
        else:
            new_bases.append(chars)
    return new_bases, new_marks


def main(args=None):
    """Rewrite each language file with normalised base/marks exemplars.

    Every path is parsed as a text-format ``LanguageProto``. Entries of
    ``exemplar_chars.base`` whose first character (after an optional leading
    U+25CC) has Unicode category ``Mn`` or ``Mc`` are moved to
    ``exemplar_chars.marks``; all marks are prefixed with U+25CC and
    de-duplicated. The message is then written back to the same path.
    Files whose ``base`` string is empty are left untouched.

    Args:
        args: Iterable of file paths. When ``None`` the paths are taken from
            ``sys.argv[1:]``.
    """
    if args is None:
        # Local import: ``sys`` is only imported under the ``__main__`` guard.
        import sys

        args = sys.argv[1:]

    for path in args:
        with open(path, encoding="utf-8") as fp:
            language = text_format.Parse(
                fp.read(), languages_public_pb2.LanguageProto()
            )

        # "".split(" ") yields [""], so an empty string means "no bases".
        if not language.exemplar_chars.base:
            continue

        new_bases, new_marks = _split_bases_and_marks(
            language.exemplar_chars.base.split(" "),
            language.exemplar_chars.marks.split(" "),
        )
        language.exemplar_chars.base = " ".join(new_bases)
        language.exemplar_chars.marks = " ".join(new_marks)

        # The file is always rewritten, even when nothing moved.
        with open(path, "w", encoding="utf-8") as fp:
            fp.write(text_format.MessageToString(language, as_utf8=True))
if __name__ == "__main__":
    import sys

    # Everything after the script name is a file path to process.
    cli_paths = sys.argv[1:]
    main(args=cli_paths)
|