File: write_complete_data.py
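"""
Generate the Python date translation data modules used by dateparser.

For every language, the CLDR data (JSON) is combined with the supplementary
language data (YAML) and written out as a ``<language>.py`` module under
``dateparser/data/date_translation_data/``.  Calling
``write_complete_data(in_memory=True)`` instead returns the generated content
as a ``{file name: content}`` dictionary.
"""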

import json
import os
import shutil
from collections import OrderedDict

import regex as re
from ruamel.yaml import RoundTripLoader

from dateparser_scripts.order_languages import avoid_languages
from dateparser_scripts.utils import combine_dicts

cldr_date_directory = "../dateparser_data/cldr_language_data/date_translation_data/"
supplementary_directory = "../dateparser_data/supplementary_language_data/"
supplementary_date_directory = (
    "../dateparser_data/supplementary_language_data/date_translation_data/"
)
translation_data_directory = "../dateparser/data/"
date_translation_directory = "../dateparser/data/date_translation_data/"

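# All the paths above are relative to this script's own directory, so switch
# into it before reading or writing anything.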
os.chdir(os.path.dirname(os.path.abspath(__file__)))

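# Language codes are obtained by stripping the 5-character ".json"/".yaml"
# extensions; languages listed in avoid_languages are dropped from the CLDR
# set.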
cldr_languages = list(
    set(map(lambda x: x[:-5], os.listdir(cldr_date_directory))) - avoid_languages
)
supplementary_languages = [x[:-5] for x in os.listdir(supplementary_date_directory)]
all_languages = set(cldr_languages).union(set(supplementary_languages))

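# Matches the literal "{0}" placeholder used in the CLDR relative date
# strings (e.g. "in {0} days").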
RELATIVE_PATTERN = re.compile(r"\{0\}")


def _modify_relative_data(relative_data):
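    """
    Replace every "{0}" placeholder in the relative date strings with a
    capture group matching a number, e.g. "in {0} days" becomes
    "in (\\d+[.,]?\\d*) days".

    The value lists are modified in place; a new OrderedDict with the same
    keys is returned.
    """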
    modified_relative_data = OrderedDict()
    for key, value in relative_data.items():
        for i, string in enumerate(value):
            string = RELATIVE_PATTERN.sub(r"(\\d+[.,]?\\d*)", string)
            value[i] = string
        modified_relative_data[key] = value
    return modified_relative_data


def _modify_data(language_data):
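    """
    Normalise the "relative-type-regex" patterns of ``language_data``, both at
    the top level and inside every "locale_specific" entry.

    Nothing needs to be reassigned here: _modify_relative_data mutates the
    pattern lists in place, so ``language_data`` is updated as a side effect.
    """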
    relative_data = language_data.get("relative-type-regex", {})
    relative_data = _modify_relative_data(relative_data)
    locale_specific_data = language_data.get("locale_specific", {})
    for _, info in locale_specific_data.items():
        locale_relative_data = info.get("relative-type-regex", {})
        locale_relative_data = _modify_relative_data(locale_relative_data)


def _get_complete_date_translation_data(language):
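    """
    Build the complete translation data for ``language`` by combining its CLDR
    JSON data (if any) with its supplementary YAML data (if any), falling back
    to the language code itself for the "name" field.
    """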
    cldr_data = {}
    supplementary_data = {}
    if language in cldr_languages:
        with open(cldr_date_directory + language + ".json") as f:
            cldr_data = json.load(f, object_pairs_hook=OrderedDict)
    if language in supplementary_languages:
        with open(supplementary_date_directory + language + ".yaml") as g:
            supplementary_data = OrderedDict(RoundTripLoader(g).get_data())
    complete_data = combine_dicts(cldr_data, supplementary_data)
    if "name" not in complete_data:
        complete_data["name"] = language
    return complete_data


def _write_file(filename, text, mode, in_memory, in_memory_result):
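    """
    Write ``text`` to ``filename`` using ``mode``, or store it in
    ``in_memory_result`` keyed by ``filename`` when ``in_memory`` is true.
    """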
    if in_memory:
        in_memory_result[filename] = text
    else:
        with open(filename, mode) as out:
            out.write(text)


def write_complete_data(in_memory=False):
    """
    This function is responsible of generating the needed py files from the
    CLDR files (JSON format) and supplementary language data (YAML format).

    Use it with in_memory=True to avoid writing real files and getting a
    dictionary containing the file names and their content (used when testing).
    """
    in_memory_result = {}

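    # When writing to disk, recreate dateparser/data/date_translation_data/
    # from scratch so no stale modules from a previous run are left behind.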
    if not in_memory:
        if not os.path.isdir(translation_data_directory):
            os.mkdir(translation_data_directory)
        if os.path.isdir(date_translation_directory):
            shutil.rmtree(date_translation_directory)
        os.mkdir(date_translation_directory)

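    # base_data.yaml holds common entries that get merged into every
    # language's data below.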
    with open(supplementary_directory + "base_data.yaml") as f:
        base_data = RoundTripLoader(f).get_data()

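    # Emit one UTF-8 encoded "<language>.py" module per language, exposing the
    # combined translation data as a module-level ``info`` dictionary.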
    for language in all_languages:
        date_translation_data = _get_complete_date_translation_data(language)
        date_translation_data = combine_dicts(date_translation_data, base_data)
        _modify_data(date_translation_data)
        translation_data = json.dumps(
            date_translation_data, indent=4, separators=(",", ": "), ensure_ascii=False
        )
        out_text = ("info = " + translation_data + "\n").encode("utf-8")
        _write_file(
            date_translation_directory + language + ".py",
            out_text,
            "wb",
            in_memory,
            in_memory_result,
        )

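    # Note: both package __init__ files are always written to disk below;
    # in_memory is not forwarded to these two calls.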
    init_text = (
        "from dateparser.data import date_translation_data\n"
        "from .languages_info import language_order, language_locale_dict\n"
    )

    _write_file(
        translation_data_directory + "__init__.py",
        init_text,
        "w",
        False,
        in_memory_result,
    )
    _write_file(
        date_translation_directory + "__init__.py", "", "w", False, in_memory_result
    )

    return in_memory_result


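# Usage sketch: executing the __main__ block below regenerates the modules on
# disk, while
#
#     generated = write_complete_data(in_memory=True)
#
# returns the generated content as a dictionary (language modules as UTF-8
# encoded bytes) without writing them to disk; the two __init__ files are
# still written either way.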
if __name__ == "__main__":
    write_complete_data()