File: m17n_translit.py

package info (click to toggle)
ibus-typing-booster 2.10.5-1
  • links: PTS, VCS
  • area: main
  • in suites: bullseye
  • size: 87,788 kB
  • sloc: xml: 799,041; python: 22,939; sh: 3,519; makefile: 342
file content (331 lines) | stat: -rw-r--r-- 12,011 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
# -*- coding: utf-8 -*-
# vim:et sts=4 sw=4
#
# ibus-typing-booster - A completion input method for IBus
#
# Copyright (c) 2015-2016 Mike FABIAN <mfabian@redhat.com>
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>

'''A module to do transliteration using m17n-lib.
'''

import sys
import ctypes

# ctypes declarations for the libm17n C types referenced by this module.
# The structures are declared first and the fields of MSymbolStruct are
# attached afterwards, because that structure contains a pointer to itself.
class libm17n__MSymbolStruct(ctypes.Structure):
    pass
# An MSymbol is passed around as a pointer to an MSymbolStruct.
libm17n__MSymbol = ctypes.POINTER(libm17n__MSymbolStruct)
# No fields are declared in this file for the following structures.
class libm17n__MPlist(ctypes.Structure):
    pass
class libm17n__MConverter(ctypes.Structure):
    pass
class libm17n__MInputMethod(ctypes.Structure):
    pass
class libm17n__MInputContext(ctypes.Structure):
    pass
class libm17n__MText(ctypes.Structure):
    pass
# NOTE(review): 'plist' embeds libm17n__MPlist by value although that
# structure has no _fields_ here, so it presumably contributes no size to
# the ctypes layout. Confirm against the C definition before reading
# 'next' through ctypes; nothing in the visible code reads these fields.
libm17n__MSymbolStruct._fields_ = [
    ('managing_key', ctypes.c_uint),
    ('name', ctypes.c_char_p),
    ('length', ctypes.c_int),
    ('plist', libm17n__MPlist),
    ('next', ctypes.POINTER(libm17n__MSymbolStruct))]

# Module level placeholders. They are all None until _init() has run,
# which replaces them with the loaded library handle, the ctypes function
# objects and the 'Mcoding_utf_8' symbol.
libm17n__lib = None
libm17n__msymbol = None
libm17n__mplist = None
libm17n__mconv_buffer_converter = None
libm17n__mconv_reset_converter = None
libm17n__mconv_rebind_buffer = None
libm17n__mconv_encode = None
libm17n__minput_open_im = None
libm17n__minput_create_ic = None
libm17n__minput_reset_ic = None
libm17n__minput_filter = None
libm17n__minput_lookup = None
libm17n__mtext = None
libm17n__mtext_len = None
libm17n__Mcoding_utf_8 = None

# Converter created by _init() and used by mtext_to_string() to encode
# MText objects as UTF-8.
_utf8_converter = None

def mtext_to_string(mtext_pointer):
    '''Return the text contained in an MText object as a Python string

    The MText is encoded as UTF-8 with the module level converter
    ``_utf8_converter`` into a temporary buffer. The bytes in front of
    the first NUL byte of that buffer are then decoded.

    :param mtext_pointer: pointer to the MText object to get the text from
    :type mtext_pointer: pointer to an libm17n MText object
    :rtype: string
    '''
    libm17n__mconv_reset_converter(_utf8_converter)
    # one Unicode character cannot have more than 6 UTF-8 bytes
    # (actually not more than 4 ...)
    bufsize = (libm17n__mtext_len(mtext_pointer) + 1) * 6
    # libm17n writes into this buffer, so it has to be mutable memory
    # owned by ctypes. A Python bytes object is immutable and must not
    # be modified behind the interpreter's back.
    conversion_buffer = ctypes.create_string_buffer(bufsize)
    libm17n__mconv_rebind_buffer(
        _utf8_converter,
        conversion_buffer,
        ctypes.c_int(bufsize))
    libm17n__mconv_encode(_utf8_converter, mtext_pointer)
    # maybe not all of the buffer was really used for the conversion,
    # cut off the unused part: the buffer starts out zero filled and
    # .value yields only the bytes in front of the first NUL byte.
    return conversion_buffer.value.decode('utf-8')

def _init():
    '''Open libm17n and fill global variables for functions and
    variables from libm17n

    Loads 'libm17n.so.0', calls m17n_init() and declares the argument
    and return types of every libm17n function used by this module.
    Finally the UTF-8 converter used by mtext_to_string() is created.

    Loading the library or resolving one of its symbols may raise an
    exception from ctypes (for example OSError if the shared library
    cannot be loaded); such exceptions are not caught here.
    '''
    global libm17n__lib
    libm17n__lib = ctypes.CDLL('libm17n.so.0', mode = ctypes.RTLD_GLOBAL)
    libm17n__lib.m17n_init()
    global libm17n__mplist
    libm17n__mplist = libm17n__lib.mplist
    libm17n__mplist.argtypes = []
    libm17n__mplist.restype = ctypes.POINTER(libm17n__MPlist)
    global libm17n__mconv_buffer_converter
    libm17n__mconv_buffer_converter = libm17n__lib.mconv_buffer_converter
    libm17n__mconv_buffer_converter.argtypes = [
        libm17n__MSymbol, ctypes.c_char_p, ctypes.c_int]
    libm17n__mconv_buffer_converter.restype = ctypes.POINTER(
        libm17n__MConverter)
    global libm17n__mconv_reset_converter
    libm17n__mconv_reset_converter = libm17n__lib.mconv_reset_converter
    libm17n__mconv_reset_converter.argtypes = [
        ctypes.POINTER(libm17n__MConverter)]
    libm17n__mconv_reset_converter.restype = ctypes.c_int
    global libm17n__mconv_rebind_buffer
    libm17n__mconv_rebind_buffer = libm17n__lib.mconv_rebind_buffer
    libm17n__mconv_rebind_buffer.argtypes = [
        ctypes.POINTER(libm17n__MConverter), ctypes.c_char_p, ctypes.c_int]
    libm17n__mconv_rebind_buffer.restype = ctypes.POINTER(libm17n__MConverter)
    global libm17n__mconv_encode
    libm17n__mconv_encode = libm17n__lib.mconv_encode
    libm17n__mconv_encode.argtypes = [
        ctypes.POINTER(libm17n__MConverter), ctypes.POINTER(libm17n__MText)]
    libm17n__mconv_encode.restype = ctypes.c_int
    global libm17n__msymbol
    libm17n__msymbol = libm17n__lib.msymbol
    libm17n__msymbol.argtypes = [ctypes.c_char_p]
    libm17n__msymbol.restype = libm17n__MSymbol
    global libm17n__minput_open_im
    libm17n__minput_open_im = libm17n__lib.minput_open_im
    libm17n__minput_open_im.argtypes = [
        libm17n__MSymbol, libm17n__MSymbol, ctypes.c_void_p]
    libm17n__minput_open_im.restype = ctypes.POINTER(libm17n__MInputMethod)
    global libm17n__minput_create_ic
    libm17n__minput_create_ic = libm17n__lib.minput_create_ic
    libm17n__minput_create_ic.argtypes = [
        ctypes.POINTER(libm17n__MInputMethod), ctypes.c_void_p]
    libm17n__minput_create_ic.restype = ctypes.POINTER(libm17n__MInputContext)
    global libm17n__minput_reset_ic
    libm17n__minput_reset_ic = libm17n__lib.minput_reset_ic
    libm17n__minput_reset_ic.argtypes = [
        ctypes.POINTER(libm17n__MInputContext)]
    # minput_reset_ic() is a void function in the m17n API. Without
    # this, ctypes would interpret whatever is left in the return
    # register as a C int.
    libm17n__minput_reset_ic.restype = None
    global libm17n__minput_filter
    libm17n__minput_filter = libm17n__lib.minput_filter
    libm17n__minput_filter.argtypes = [
        ctypes.POINTER(libm17n__MInputContext),
        libm17n__MSymbol,
        ctypes.c_void_p]
    libm17n__minput_filter.restype = ctypes.c_int
    global libm17n__minput_lookup
    libm17n__minput_lookup = libm17n__lib.minput_lookup
    libm17n__minput_lookup.argtypes = [
        ctypes.POINTER(libm17n__MInputContext),
        libm17n__MSymbol,
        ctypes.c_void_p,
        ctypes.POINTER(libm17n__MText)]
    libm17n__minput_lookup.restype = ctypes.c_int
    global libm17n__mtext
    libm17n__mtext = libm17n__lib.mtext
    libm17n__mtext.argtypes = []
    libm17n__mtext.restype = ctypes.POINTER(libm17n__MText)
    global libm17n__mtext_len
    libm17n__mtext_len = libm17n__lib.mtext_len
    libm17n__mtext_len.argtypes = [ctypes.POINTER(libm17n__MText)]
    libm17n__mtext_len.restype = ctypes.c_int
    global libm17n__Mcoding_utf_8
    # Resolve the exported variable through the libm17n handle itself
    # instead of through ctypes.pythonapi, so the lookup does not depend
    # on the symbol being visible in the global symbol namespace of the
    # Python process.
    libm17n__Mcoding_utf_8 = libm17n__MSymbol.in_dll(
        libm17n__lib, 'Mcoding_utf_8')
    global _utf8_converter
    # The converter starts without a buffer, mtext_to_string() binds a
    # suitably sized buffer before each conversion.
    _utf8_converter = libm17n__mconv_buffer_converter(
        libm17n__Mcoding_utf_8, ctypes.c_char_p(None), ctypes.c_int(0))

def _del():
    '''Cleanup

    Calls m17n_fini() on the library handle stored in libm17n__lib.
    '''
    # NOTE(review): nothing in the visible part of this module calls
    # _del(); __ModuleInitializer.__del__ returns without invoking it.
    libm17n__lib.m17n_fini()

class __ModuleInitializer:
    '''Helper object which runs _init() when it is instantiated'''

    def __init__(self):
        # Load libm17n and fill the module level globals.
        _init()

    def __del__(self):
        # No cleanup is performed here, _del() is not called.
        pass

# Instantiated once at import time so that _init() runs on import.
__module_init = __ModuleInitializer()


class Transliterator:
    r'''A class for transliterators using libm17n

    If initializing the transliterator fails, for example because a
    non-existing input method was given as the argument, a ValueError
    is raised.

    This docstring is a raw string because the examples below contain
    backslash escapes which have to reach doctest unchanged.

    Examples:

    Russian transliteration:

    >>> trans = Transliterator('ru-translit')
    >>> trans.transliterate(list('y'))
    'ы'
    >>> trans.transliterate(list('yo'))
    'ё'
    >>> trans.transliterate(list('yo y'))
    'ё ы'

    Marathi transliteration:

    >>> trans = Transliterator('mr-itrans')
    >>> trans.transliterate(list('praviN'))
    'प्रविण्'
    >>> trans.transliterate(list('namaste'))
    'नमस्ते'

    Hindi transliteration:

    >>> trans = Transliterator('hi-itrans')
    >>> trans.transliterate(list('namaste'))
    'नमस्ते'

    >>> trans.transliterate(list('. '))
    '। '

    Hindi-Inscript2 uses the AltGr key a lot, 'G-4' is
    the MSymbol name for AltGr-4 and it transliterates
    to something different than just '4':

    >>> trans = Transliterator('hi-inscript2')
    >>> trans.transliterate(['4', 'G-4'])
    '४₹'

    >>> trans = Transliterator('hi-inscript2')
    >>> trans.transliterate(['G-p'])
    'ज़'

    AltGr-3 ('G-3') is not used though in Hindi-Inscript2.
    Therefore, 'G-3' transliterates just as 'G-3':

    >>> trans = Transliterator('hi-inscript2')
    >>> trans.transliterate(['3', 'G-3'])
    '३G-3'

    In mr-inscript2, 'G-1' transliterates to U+200D ZERO WIDTH JOINER
    ('\xe2\x80\x8d' in UTF-8 encoding):

    >>> trans = Transliterator('mr-inscript2')
    >>> trans.transliterate(['j', 'd', 'G-1', '/']).encode('utf-8')
    b'\xe0\xa4\xb0\xe0\xa5\x8d\xe2\x80\x8d\xe0\xa4\xaf'

    >>> trans = Transliterator('t-latn-post')
    >>> trans.transliterate(list('gru"n'))
    'grün'

    >>> trans = Transliterator('NoIME')
    >>> trans.transliterate(['a', 'b', 'c', 'C-c', 'G-4', 'C-α', 'G-α'])
    'abcC-cG-4C-αG-α'

    >>> trans = Transliterator('ko-romaja')
    >>> trans.transliterate(list('annyeonghaseyo'))
    '안녕하세요'

    >>> trans = Transliterator('si-wijesekera')
    >>> trans.transliterate(list('vksIal kjSka '))
    'ඩනිෂ්ක නවීන් '

    '''
    def __init__(self, ime):
        '''Initialize the input method to use for the transliteration

        Raises ValueError if something fails.

        :param ime: Full name of the m17n input method, for example
                    “hi-inscript2” or “t-latn-post”. There is one
                    special input method name “NoIME”. The input method
                    “NoIME” is just a dummy which does not do transliteration
                    at all, it only joins the list of Msymbol names to
                    a string.
        :type ime: string
        '''
        self._dummy = False
        if ime == 'NoIME':
            self._dummy = True
            return
        # Everything in front of the first '-' is the language, the
        # rest is the name of the input method.
        language, _separator, name = ime.partition('-')
        self._im = libm17n__minput_open_im(
            libm17n__msymbol(ctypes.c_char_p(language.encode('utf-8'))),
            libm17n__msymbol(ctypes.c_char_p(name.encode('utf-8'))),
            ctypes.c_void_p(None))
        # A NULL pointer returned through ctypes is false in a
        # boolean context.
        if not self._im:
            raise ValueError('minput_open_im() failed')
        self._ic = libm17n__minput_create_ic(self._im, ctypes.c_void_p(None))
        if not self._ic:
            raise ValueError('minput_create_ic() failed')

    def transliterate(self, msymbol_list):
        '''Transliterate a list of Msymbol names

        :param msymbol_list: A list of strings which are interpreted
                             as the names of Msymbols to transliterate.
                             If the input method has the special name “NoIME”,
                             no transliteration is done, the list of
                             Msymbols is just joined to a single string.
        :type msymbol_list: A list of strings
        :return: The transliteration
        :rtype: string
        :raises ValueError: if msymbol_list is not a list
        '''
        if not isinstance(msymbol_list, list):
            raise ValueError('Argument of transliterate() must be a list.')
        if self._dummy:
            return ''.join(msymbol_list)
        libm17n__minput_reset_ic(self._ic)
        output = []
        # The symbol 'nil' is fed to the input context after the real
        # input; it is never copied to the output itself.
        for symbol in msymbol_list + ['nil']:
            _symbol = libm17n__msymbol(symbol.encode('utf-8'))
            retval = libm17n__minput_filter(
                self._ic, _symbol, ctypes.c_void_p(None))
            if retval != 0:
                # Nothing is looked up for this key.
                continue
            _mt = libm17n__mtext()
            try:
                retval = libm17n__minput_lookup(
                    self._ic, _symbol, ctypes.c_void_p(None), _mt)
                if libm17n__mtext_len(_mt) > 0:
                    output.append(mtext_to_string(_mt))
            finally:
                # An M-text allocated by mtext() is only freed when it
                # is explicitly unreferenced, otherwise every looked up
                # key would leak one M-text.
                libm17n__lib.m17n_object_unref(_mt)
            # A non-zero return value of the lookup means that the key
            # was not handled by the input method, it is then passed
            # through as its Msymbol name.
            if retval and symbol != 'nil':
                output.append(symbol)
        return ''.join(output)

if __name__ == "__main__":
    # Run the doctests of this module; the exit status is 1 if any of
    # them failed and 0 otherwise.
    import doctest
    (FAILED, _ATTEMPTED) = doctest.testmod()
    sys.exit(1 if FAILED else 0)