File: convert_kmap.py

package info (click to toggle)
lyx 2.0.3-3
  • links: PTS, VCS
  • area: main
  • in suites: wheezy
  • size: 96,552 kB
  • sloc: cpp: 388,556; python: 19,985; ansic: 9,725; sh: 5,696; makefile: 3,907; pascal: 1,388; objc: 985; perl: 319; yacc: 289; tcl: 163; xml: 23; sed: 16
file content (213 lines) | stat: -rw-r--r-- 7,207 bytes parent folder | download | duplicates (5)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
#! /usr/bin/env python
# -*- coding: utf-8 -*-

# file convert_kmap.py
# This file is part of LyX, the document processor.
# Licence details can be found in the file COPYING.

# author Georg Baum

# Full author contact details are available in file CREDITS

# This script converts a kmap file from LaTeX commands to unicode characters
# The kmap file is read and written in utf8 encoding


import os, re, string, sys, unicodedata

def usage(prog_name):
    """Return the two-line command line synopsis for prog_name.

    The first line describes the form with explicit input and output files,
    the second the form that reads stdin and writes stdout.  The result has
    no trailing newline.
    """
    file_form = "Usage: %s unicodesymbolsfile inputfile outputfile\n" % prog_name
    pipe_form = "or     %s unicodesymbolsfile <inputfile >outputfile" % prog_name
    return file_form + pipe_form


def error(message):
    """Print message plus a newline to stderr and exit with status 1.

    This function never returns: it always raises SystemExit(1).
    """
    text = message + '\n'
    sys.stderr.write(text)
    raise SystemExit(1)


def trim_eol(line):
    """Remove the end of line char(s) from line.

    Strips one trailing '\\r\\n', '\\r' or '\\n'.  A line without any end of
    line marker (e.g. the last line of a file) is returned unchanged.
    """
    # Only treat the last two characters as an EOL if they really are CR LF;
    # a lone '\r' in the second-to-last position is ordinary content.
    if line[-2:] == '\r\n':
        return line[:-2]
    if line[-1:] in ('\r', '\n'):
        return line[:-1]
    # file with no EOL in last line
    return line


def read(input):
    """Read all lines from input, strip line endings and decode them as utf8.

    input must provide readline(); reading stops at the first empty result.
    Returns the list of decoded lines.
    """
    decoded = []
    raw = input.readline()
    while raw:
        decoded.append(trim_eol(raw).decode('utf8'))
        raw = input.readline()
    return decoded


def escape(word):
    """Escape a word for LyXLex.

    Backslash, double quote and hash characters are prefixed with a
    backslash.  If the escaped word contains whitespace or a comma anywhere,
    it is additionally wrapped in double quotes.
    """
    re_quote = re.compile(r'\s|,')
    retval = u''
    for c in word:
        if c in ('\\', '"', '#'):
            retval = retval + u'\\'
        retval = retval + c
    # search(), not match(): a separator at any position requires quoting,
    # not only one at the very beginning of the word.
    if re_quote.search(retval):
        return u'"%s"' % retval
    return retval


def unescape(word):
    """Unescape a LyXLex escaped word.

    A word of at least two characters that starts and ends with a double
    quote has those quotes removed.  In the remaining text each backslash is
    dropped and the character following it is copied verbatim; a backslash
    that is the very last character is kept as is.
    """
    quoted = len(word) > 1 and word[0] == '"' and word[-1] == '"'
    if quoted:
        body = word[1:-1]
    else:
        body = word
    pieces = []
    chars = iter(body)
    for ch in chars:
        if ch == '\\':
            # Take the escaped character instead; if nothing follows, the
            # default keeps the trailing backslash itself.
            ch = next(chars, ch)
        pieces.append(ch)
    return u''.join(pieces)


def readsymbols(input):
    """Build the symbol list from the unicodesymbols file.

    Every non-comment line whose first token starts with "0x" yields one
    entry: the list of whitespace separated tokens, with the first token
    converted from hex to an integer and the second one unescaped.  A fixed
    set of hardcoded entries is appended after the entries from the file.
    """
    table = []
    while True:
        raw = input.readline()
        if not raw:
            break
        fields = trim_eol(raw).split()
        # Skip blank lines and comment lines
        if not fields or fields[0][0] == '#':
            continue
        if len(fields) > 1:
            fields[1] = unescape(fields[1])
        if fields[0][:2] == "0x":
            fields[0] = int(fields[0][2:], 16)
            table.append(fields)
    table.extend([
        # special cases from .cdef files (e.g. duplicates with different
        # commands)
        # NOTE(review): U+00A1 is INVERTED EXCLAMATION MARK, the no-break
        # space is U+00A0 -- confirm that this code point is intended.
        [0x00a1, '\\nobreakspace'],
        [0x00a7, '\\S'],
        [0x00a9, '\\copyright'],
        [0x00b1, '$\\pm$'],
        [0x00b5, '$\\mu$'],
        [0x00b7, '$\\cdot$'],
        [0x00b9, '$\\mathonesuperior$'],
        [0x00d7, '$\\times$'],
        [0x00d7, '\\times'],
        [0x00f7, '\\div'],
        [0x20ac, '\\euro'],
        # special caron, see lib/lyx2lyx/lyx_1_5.py for an explanation
        [0x030c, '\\q', '', 'combining'],
    ])
    return table


def write(output, lines):
    """Write lines to output with native lineendings.

    Each entry of lines is encoded as utf8 and terminated with os.linesep.
    """
    terminator = os.linesep
    for text in lines:
        encoded = text.encode('utf8')
        output.write(encoded + terminator)


def translate_symbol(unicodesymbols, symbol, try_combining = True):
    """Translate a symbol from LaTeX to unicode.

    unicodesymbols is a list of entries [codepoint, command, ...]; an entry
    whose fourth element contains 'combining' describes a combining
    character.  A symbol of length one is returned unchanged.  Otherwise the
    symbol is looked up among the non-combining entries, and if that fails
    and try_combining is true it is interpreted as a combining command
    followed by its base character.  Returns the translation, or '' if the
    symbol could not be translated.
    """
    if len(symbol) == 1:
        return symbol

    # Direct lookup.  Play safe and ignore combining entries here (not sure
    # if this is needed).
    for entry in unicodesymbols:
        if entry[1] != symbol:
            continue
        if len(entry) < 4 or entry[3].find('combining') < 0:
            return unichr(entry[0])

    if not try_combining:
        return ''

    # No direct match, see whether this is a combining sequence
    non_letter_start = re.compile(r'^[^a-zA-Z]')
    for entry in unicodesymbols:
        if len(entry) <= 3 or entry[3].find('combining') < 0:
            continue
        command = entry[1]
        if symbol.find(command) != 0:
            continue
        # The remainder must be non-empty and start with a non-letter,
        # otherwise this is a different symbol like \dh that merely shares
        # its beginning with a combining command like \d.
        # NOTE(review): an unbraced letter argument as in \"o is rejected by
        # this test as well -- confirm that this is intended.
        argument = symbol[len(command):]
        if argument == '' or not non_letter_start.match(argument):
            continue

        if len(argument) > 1 and argument[0] == '{' and argument[-1] == '}':
            # Strip braces from things like \d{o}
            argument = argument[1:-1]
        else:
            # Retry with the argument wrapped in braces (e.g. \'\i becomes
            # \'{\i}), since that form may have an entry of its own in
            # unicodesymbols.
            combined = translate_symbol(unicodesymbols, u'%s{%s}' % (command, argument))
            if combined != '':
                return combined

        if len(argument) > 1:
            # The base character may be a symbol itself, e.g \"{\i}
            argument = translate_symbol(unicodesymbols, argument, False)

        # Play safe and only translate combining sequences with exactly one
        # base character.
        if len(argument) != 1:
            # We found a combining character, but could not convert the
            # argument to a single character
            return ''
        # The special caron \q is only combined with these base characters
        if command == '\\q' and argument not in ['t', 'd', 'l', 'L']:
            return ''
        return unicodedata.normalize("NFKC", argument + unichr(entry[0]))
    return ''


def convert(lines, unicodesymbols):
    """Translate all symbols in lines from LaTeX to unicode.

    lines is modified in place.  In every \\kmap and \\kxmod entry (active or
    commented out with a leading '#') the symbol token is replaced by its
    escaped unicode translation; if no translation is found the token is
    kept.  Lines that are not such entries are left untouched.
    """
    kmap_entry = re.compile(r'^(#?\s*\\kmap\s+\S+\s+)([^\s]+)(.*)$')
    kxmod_entry = re.compile(r'^(#?\s*\\kxmod\s+\S+\s+\S+\s+)([^\s]+)(.*)$')
    for index, text in enumerate(lines):
        found = kmap_entry.match(text) or kxmod_entry.match(text)
        if not found:
            continue
        prefix, token, suffix = found.groups()
        symbol = unescape(token)
        if len(symbol) > 2 and symbol[-2:] == '{}':
            # The unicodesymbols file does not include the trailing
            # delimiter {}
            symbol = symbol[:-2]
        translated = translate_symbol(unicodesymbols, symbol)
        if translated == '':
            replacement = token
        else:
            replacement = escape(translated)
        lines[index] = u'%s%s%s' % (prefix, replacement, suffix)


def main(argv):
    """Convert a kmap file from LaTeX commands to unicode characters.

    argv[1] names the unicodesymbols file.  With two further arguments the
    kmap is read from the file argv[2] and written to the file argv[3]; with
    no further arguments it is read from stdin and written to stdout.  Any
    other number of arguments prints the usage message to stderr and exits
    with status 1.

    Returns 0 on success.
    """
    argc = len(argv)
    if argc != 2 and argc != 4:
        error(usage(argv[0]))

    # Only files opened here are closed again; stdin/stdout are left alone.
    opened = []
    try:
        # Open files
        if argc == 4:
            kmap_in = open(argv[2], 'rb')
            opened.append(kmap_in)
            kmap_out = open(argv[3], 'wb')
            opened.append(kmap_out)
        else:
            kmap_in = sys.stdin
            kmap_out = sys.stdout
        unicodesymbols = open(argv[1], 'rb')
        opened.append(unicodesymbols)

        # Do the real work
        symbols = readsymbols(unicodesymbols)
        lines = read(kmap_in)
        convert(lines, symbols)
        write(kmap_out, lines)
    finally:
        # Close files, also if one of the steps above raised
        for handle in opened:
            handle.close()

    return 0


if __name__ == "__main__":
    # Use the return value of main() as the exit status of the script
    # instead of discarding it.
    sys.exit(main(sys.argv))