File: fix-exemplars-duplicates.py

package info (click to toggle)
python-gflanguages 0.7.2-3
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid, trixie
  • size: 10,004 kB
  • sloc: python: 818; sh: 8; makefile: 6
file content (63 lines) | stat: -rw-r--r-- 2,401 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
from collections import Counter
from google.protobuf import text_format
from gflanguages import languages_public_pb2

# Names of the exemplar_chars fields that main() scans for duplicate entries.
ATTRIBUTES = ["base", "auxiliary", "marks", "punctuation", "index"]


def main(args=None):
    """Remove duplicate exemplar entries from language definition files.

    Each path in *args* is parsed as a text-format ``LanguageProto``. For
    every attribute listed in ``ATTRIBUTES`` the space-separated value of
    ``language.exemplar_chars`` is de-duplicated, keeping the first
    occurrence of each entry in its original position. A file that needed at
    least one change is reported on stdout (one report per changed
    attribute) and rewritten in place; other files are left untouched.

    Args:
        args: Iterable of file paths to process. Defaults to
            ``sys.argv[1:]`` when ``None``.

    Raises:
        SystemExit: If the de-duplicated entries do not cover exactly the
            same set of values as the original entries.
    """
    # Imported locally so main() does not depend on the ``import sys`` that
    # only happens when this module is executed as a script.
    import sys

    if args is None:
        args = sys.argv[1:]

    for path in args:
        with open(path, encoding="utf-8") as fp:
            language = text_format.Parse(
                fp.read(), languages_public_pb2.LanguageProto()
            )
        if not hasattr(language, "exemplar_chars"):
            # Nothing to fix here; skip this file but keep processing the
            # remaining paths instead of terminating the whole run.
            continue

        # Maps attribute name -> {"before": [...], "after": [...]} for every
        # attribute whose value actually changed.
        exemplar_values = {}
        for attr in ATTRIBUTES:
            if not hasattr(language.exemplar_chars, attr):
                continue
            values = getattr(language.exemplar_chars, attr).split(" ")
            # dict keys keep insertion order, so this drops later repeats
            # while preserving the position of each first occurrence.
            clean_values = list(dict.fromkeys(values))
            if clean_values == values:
                continue

            # Safety net before touching the file: de-duplication must never
            # lose or invent an entry.
            if set(values) != set(clean_values):
                print("before: " + " ".join(values))
                print("after: " + " ".join(clean_values))
                sys.exit("Failed fixing exemplar.")

            setattr(language.exemplar_chars, attr, " ".join(clean_values))
            exemplar_values[attr] = {
                "before": values,
                "after": clean_values,
            }

        if not exemplar_values:
            continue

        # Report every changed attribute, not just the last one.
        for exemplar, values in exemplar_values.items():
            before = values["before"]
            after = values["after"]
            counter = Counter(before)
            # (entry, number of removed copies) for each repeated entry.
            duplicates = [
                (g, c - 1) for g, c in counter.most_common() if c > 1
            ]
            print(
                f"Changed {path} {exemplar} exemplar:\n"
                f"- from {len(before)} ({len(set(before))} as set) "
                f"to {len(after)} elements\n"
                f"- removing {len(before) - len(after)} duplicate(s):\n"
                f"  {duplicates}\n"
            )

        # The context manager closes the file; no explicit close() needed.
        with open(path, "w", encoding="utf-8") as fp:
            fp.write(text_format.MessageToString(language, as_utf8=True))


if __name__ == "__main__":
    # Script entry point: forward the command-line arguments (program name
    # excluded) to main(). The import is kept here so that `sys` is bound as
    # a module-level global when the file is run as a script.
    import sys

    cli_paths = sys.argv[1:]
    main(args=cli_paths)