File: translation_script.py

package info (click to toggle)
postbooks 4.10.0-2
  • links: PTS, VCS
  • area: main
  • in suites: stretch
  • size: 112,660 kB
  • ctags: 22,890
  • sloc: cpp: 310,358; sh: 607; xml: 214; python: 140; awk: 104; makefile: 50
file content (233 lines) | stat: -rw-r--r-- 9,312 bytes parent folder | download | duplicates (3)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
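#HOW TO USE THIS FROM THE COMMAND LINE
#Invoke the script as:
#  path_to_python  path_to_this_script  [-o preferred_output_filename]  API_key  untranslated_xml
#The optional -o flag must come before the API key and the input file, because option parsing stops at the first positional argument.
#The stylesheet translate.xslt is read from the current working directory.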


from lxml import etree
import urllib.request
from urllib.parse import quote
import re
import sys
import getopt


#Used to generate the output name of the new xml file
def output_name(input_name):
    """Build the default output filename from the input filename.

    The last three characters of ``input_name`` are dropped (everything is
    dropped when the name has three characters or fewer) and ``'.ts'`` is
    appended to what remains.
    """
    # Clamp at zero so a short name yields an empty stem instead of a
    # negative slice bound.
    stem_length = max(len(input_name) - 3, 0)
    return input_name[:stem_length] + '.ts'


#The helpers below are used by the translate function to make the translated text match the formatting of the source text in the xml file. Google Translate was adding/removing spaces before/after &, >, and <.
#Google Translate was also changing the encoding (e.g. < to \u003c)
def append_to_dict(phrase_to_find, text, dict_name, dict_entry):
    """Record every position at which a pattern occurs in a string.

    ``phrase_to_find`` is compiled as a regular expression. For each
    non-overlapping match in ``text`` the match's start offset is stored in
    ``dict_name`` as a key whose value is ``dict_entry``. ``dict_name`` is
    modified in place (an existing key is overwritten) and nothing is
    returned.
    """
    pattern = re.compile(phrase_to_find)
    for match in pattern.finditer(text):
        dict_name[match.start()] = dict_entry


def assemble_partial_string(should_chop, partial_strings, index_tracker, index_list_tr, index_list_sr, source_text, translated_text, iteration, constant, target_phrase):
    """Cut the next piece out of the translation and fix the spaces around its marker.

    The piece runs from the previous cut point (``index_tracker[iteration]``)
    up to ``index_list_tr[iteration] + constant``. The escapes ``\\u003e`` and
    ``\\u003c`` inside the piece are rewritten to ``>`` and ``<``, the spaces
    directly before/after the marker are compared between ``source_text`` and
    ``translated_text`` and adjusted to match the source, and the piece is
    appended to ``partial_strings``. Nothing is returned; the three list
    arguments ``should_chop``, ``partial_strings`` and ``index_tracker`` are
    mutated in place.

    Parameters:
        should_chop: one-element list used as a mutable flag. When
            ``should_chop[0]`` is true on entry, a leading space of this
            piece is removed and the flag is cleared; the flag is set again
            when the translation has a space after this marker that the
            source does not have.
        partial_strings: list that receives the finished piece.
        index_tracker: list of cut offsets into ``translated_text``; must
            already hold ``iteration + 1`` entries. One entry is appended.
        index_list_tr: marker offsets in ``translated_text``.
        index_list_sr: marker offsets in ``source_text``. The entry at the
            same ``iteration`` position is compared with the translated one
            -- assumes the markers of both texts correspond one-to-one in
            order; TODO confirm.
        source_text: the source phrase.
        translated_text: the translated phrase.
        iteration: position of the current marker in the two offset lists.
        constant: number of characters added to a marker offset to reach
            the character just after the marker.
        target_phrase: marker text searched for inside the piece when a
            space has to be removed or inserted before it.

    Raises:
        IndexError: if ``index_list_sr`` has no entry at ``iteration``, or
            if the piece is empty while ``should_chop[0]`` is set.
        ValueError: if a space must be re-inserted but ``target_phrase``
            does not occur in the piece.

    NOTE(review): when a marker offset is 0, the ``offset - 1`` lookups below
    use index -1, which in Python reads the LAST character of the text --
    confirm whether that is intended.
    NOTE(review): callers in this file pass constant=5/4/6 together with
    one-character target phrases; verify that ``constant`` really equals the
    marker's length in the text being processed.
    """

    # Record where this piece ends; index_tracker[iteration] is where the previous piece ended.
    index_tracker.append(index_list_tr[iteration] + constant)
    temp_string = translated_text[index_tracker[iteration]:index_tracker[iteration+1]]
    # Undo the literal backslash-u escapes for > and < inside the piece.
    temp_string = temp_string.replace(r'\u003e', '>')
    temp_string = temp_string.replace(r'\u003c', '<')

    # If the translation added a space before the target, remove it (first occurrence only).
    if translated_text[index_list_tr[iteration]-1] == ' ' and source_text[index_list_sr[iteration]-1] != ' ':
        temp_string = temp_string.replace(' ' + target_phrase, target_phrase, 1)

    # should_chop carries a request from the previous call: the translation added a space
    # right after the previous target, and that space is the first character of this piece.
    if should_chop[0]:
        if temp_string[0] == ' ':
            temp_string = temp_string.replace(' ', '', 1)
        should_chop[0] = False

    # Look at the character right after the target (only when it exists in both texts).
    # A space added by the translation is removed by the next call via should_chop[0];
    # a space dropped by the translation is appended back to this piece.
    if len(translated_text)-1 >= index_list_tr[iteration]+constant and len(source_text)-1 >= index_list_sr[iteration]+constant:
        if translated_text[index_list_tr[iteration]+constant] == ' ' and source_text[index_list_sr[iteration]+constant] != ' ':
            should_chop[0] = True
        if translated_text[index_list_tr[iteration]+constant] != ' ' and source_text[index_list_sr[iteration]+constant] == ' ':
            temp_string += ' '

    # If the translation dropped a space before the target, re-insert one space
    # immediately before the first occurrence of target_phrase in the piece.
    if translated_text[index_list_tr[iteration]-1] != ' ' and source_text[index_list_sr[iteration]-1] == ' ':
        new_string = ''
        index = temp_string.index(target_phrase)
        for z in range(0,len(temp_string)):
            if z != index:
                new_string += temp_string[z]
            else:
                new_string += ' '
                new_string += temp_string[z]
        temp_string = new_string

    # Add the formatted piece to the partial strings list.
    partial_strings.append(temp_string)


def space_handler(source_text, translated_text):
    """Return ``translated_text`` with spaces around markers matched to ``source_text``.

    Marker positions are collected from both texts with ``append_to_dict``.
    The translation is then cut into consecutive pieces, one per marker found
    in it, each piece being adjusted by ``assemble_partial_string``. The
    remainder after the last marker is added as a final piece and all pieces
    are concatenated.
    """
    # (regex pattern, tag stored in the position dict, also scan the source text?)
    marker_patterns = (
        (r'&', 'a', True),
        (r'>', 'ga', True),
        ('\\\\u003e', 'gu', False),
        (r'<', 'la', True),
        ('\\\\u003c', 'lu', False),
    )
    # tag -> (constant, target_phrase) handed to assemble_partial_string
    # NOTE(review): the widths 5/4/6 are larger than the one-character
    # patterns '&', '>' and '<' above -- confirm they are what is intended.
    marker_handling = {
        'a': (5, '&'),
        'ga': (4, '>'),
        'gu': (6, '>'),
        'la': (4, '<'),
        'lu': (6, '<'),
    }

    # Collect marker offsets; the dicts map offset -> tag.
    source_markers = {}
    translated_markers = {}
    for pattern, tag, scan_source in marker_patterns:
        if scan_source:
            append_to_dict(pattern, source_text, source_markers, tag)
        append_to_dict(pattern, translated_text, translated_markers, tag)

    # Offsets in ascending order; the dicts stay around for tag lookup.
    source_offsets = sorted(source_markers)
    translated_offsets = sorted(translated_markers)

    cut_points = [0]
    pieces = []
    chop_flag = [False]

    # One piece per marker found in the translation.
    for position, offset in enumerate(translated_offsets):
        width, target = marker_handling[translated_markers[offset]]
        assemble_partial_string(chop_flag, pieces, cut_points, translated_offsets, source_offsets, source_text, translated_text, position, width, target)

    # Whatever follows the last marker; drop its leading space if the last
    # marker asked for that.
    tail = translated_text[cut_points[len(translated_offsets)]:]
    if chop_flag[0] and tail[0] == ' ':
        tail = tail.replace(' ', '', 1)
    pieces.append(tail)

    return ''.join(pieces)






#This function is the one used in the translate.xslt file
def translate(context, phrase, lang='fr'):  # will have to get information from locale
    """Translate an English phrase through the Google Translate v2 REST API.

    Registered further down in this file as the ``translate`` XPath extension
    function, so the first argument is the evaluation context supplied by
    lxml; it is not used here.

    Args:
        context: XPath evaluation context passed in by lxml (unused).
        phrase: English source text to translate.
        lang: Target language code placed in the ``target=`` query parameter.

    Returns:
        The translated text with the spacing around ``&``, ``<`` and ``>``
        re-aligned to the source by ``space_handler``. An empty string is
        returned when the HTTP request fails or the response contains no
        ``translatedText`` entry.
    """
    from http.client import HTTPException

    base_url = 'https://www.googleapis.com/language/translate/v2?key='
    API_key = args[0]  # first positional command-line argument (module-level ``args``)
    source_language = 'source=en'
    translation_language = 'target=' + lang

    # Percent-encode everything outside the unreserved set. Encoding only a
    # hand-picked list of reserved characters left non-ASCII and control
    # characters raw in the URL, which made the request fail.
    formatted_phrase = quote(phrase, safe='')

    url = base_url + API_key + '&' + source_language + '&' + translation_language + '&q=' + formatted_phrase

    try:
        with urllib.request.urlopen(url) as stream:
            response_lines = [line.decode('UTF-8') for line in stream]
    except (OSError, ValueError, HTTPException):
        # Network/HTTP failure, malformed URL or undecodable response:
        # best effort, report "no translation".
        return ""

    #The json output of the request to google may contain more than one string that matches the regex. The correct one to use is always first. Other matches give irrelevant information.
    translation_regex = re.compile(r'"translatedText":\s"(.*)"', re.IGNORECASE)
    translated_string = None
    for line in response_lines:
        match = translation_regex.search(line)
        if match is not None:
            translated_string = match.group(1)
            break
    if translated_string is None:
        # e.g. an error document without any translatedText field
        return ""

    #Since values get unencoded when reading the source text in xml files, reencode the ones that are used in the space_handler() function
    # '&' must be handled first so the ampersands introduced by the next two
    # replacements are not encoded a second time.
    source_text = phrase
    source_text = source_text.replace('&', '&amp;')
    source_text = source_text.replace('<', '&lt;')
    source_text = source_text.replace('>', '&gt;')
    print("Out of Google Translate:      " + phrase)
    print("Reformatted source_text:      " + source_text)
    print("Original translation:         " + translated_string)

    #reformat spaces in translated string to match those in the source text
    try:
        translated_string = space_handler(source_text, translated_string)
    except (IndexError, ValueError) as err:
        # The markers of source and translation could not be paired up; keep
        # the unadjusted translation rather than aborting the whole transform.
        print("space_handler failed (" + str(err) + "); keeping the unadjusted translation", file=sys.stderr)
    print("Space handled translation:    " + translated_string)
    print()

    return translated_string

# Command line: [-o output_filename] API_key untranslated_xml
# getopt stops at the first positional argument, so -o has to come first.
try:
    arg_info = getopt.getopt(sys.argv[1:], "o:")
except getopt.GetoptError as err:
    sys.exit(str(err))
args = arg_info[1]  # positional arguments; translate() reads args[0] as the API key

if len(args) < 2:
    sys.exit("usage: " + sys.argv[0] + " [-o output_filename] API_key untranslated_xml")

# Expose translate() to the stylesheet as an XPath extension function.
ns = etree.FunctionNamespace("mynamespace")
ns['translate'] = translate

dom = etree.parse(args[1]) #MANUALLY: place the base.ts as the argument

xslt = etree.parse("translate.xslt") #MANUALLY: place the xslt stylesheet as the argument
transform = etree.XSLT(xslt)

newdom = transform(dom)


#if there is a filename specified in the command line
if arg_info[0]:
    filename = arg_info[0][0][1]
else:
    # NOTE(review): output_name() drops the last three characters and appends
    # '.ts', so for an input whose name already ends in '.ts' the default
    # output name equals the input name -- confirm that overwriting is intended.
    filename = output_name(args[1]) #MANUALLY: name the output file


# Write with an explicit encoding so translated (non-ASCII) text does not
# depend on the platform's default encoding.
# NOTE(review): assumes the stylesheet's output is meant to be UTF-8 -- verify
# against the xsl:output settings in translate.xslt.
with open(filename, 'w', encoding='utf-8') as output_xml:
    output_xml.write(str(newdom))