File: apertium-editdist

package info (click to toggle)
apertium 3.9.12-1
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid, trixie
  • size: 4,024 kB
  • sloc: cpp: 22,288; ansic: 4,875; xml: 2,566; python: 1,428; sh: 1,117; lex: 1,088; makefile: 591
file content (302 lines) | stat: -rwxr-xr-x 13,538 bytes parent folder | download | duplicates (2)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
#!/usr/bin/python3

# see apertium-editdist --help for usage

import sys
import struct
import argparse

description_string = "Produce an edit distance transducer in ATT format."

epilog_string = """
For the default case, all the desired transitions are generated with weight 1.0.

The specification file should be in the following format:
* First, an (optional) list of tokens separated by newlines
  All transitions involving these tokens that are otherwise unspecified
  are generated with weight 1.0. Symbol weights can be specified by appending
  a tab and a weight to the symbol. Transitions involving such a symbol
  will have the user-specified weight added to it.
* If you want to exclude symbols that may be induced from a transducer,
  add a leading ~ character to that line.
* If you want to specify transitions, insert a line with the content "@@"
  (without the quotes)
* In the following lines, specified transitions with the form
  FROM <TAB> TO <TAB> WEIGHT
  where FROM is the source token, TO is the destination token and WEIGHT is
  a nonnegative floating point number specifying the weight. By default,
  if only one transition involving FROM and TO is specified, the same WEIGHT
  will be used to specify the transition TO -> FROM (assuming that both are
  listed in the list of tokens).
* If the command line option to generate swaps is set, you can also specify swap
  weights with
  FROM,TO <TAB> TO,FROM <TAB> WEIGHT
  Again, unspecified swaps will be generated automatically with weight 1.0.
* Lines starting with ## are comments.

with d for distance and S for size of alphabet plus one
(for epsilon), expected output is a transducer in ATT format with
* Swapless:
** d + 1 states
** d*(S^2 + S - 1) transitions
* Swapful:
** d*(S^2 - 3S + 3) + 1 states
** d*(3S^2 - 5S + 3) transitions
"""

def maketrans(from_st, to_st, from_sy, to_sy, weight):
    return "{0}\t{1}\t{2}\t{3}\t{4}".format(from_st, to_st, from_sy, to_sy, weight)

class Transducer:
    def __init__(self, dist, other, epsilon, swap, elim):
        self.distance = dist
        self.alphabet = {}
        self.exclusions = set()
        self.substitutions = {}
        self.swaps = {}
        self.should_swap = swap
        self.should_elim = elim
        self.other = other
        self.epsilon = epsilon
        self.swapstate = self.distance + 1
        self.skipstate = self.swapstate + 1
        self.transitions = []

    def read_input_file(self, fname):
        def parse_error(err, line):
            nonlocal fname
            sys.stderr.write('Syntax error on line %s of %s: %s.\n' % (fname, line, err))
            sys.exit(1)
        with open(fname) as fin:
            in_alpha = True
            for i, line_ in enumerate(fin, 1):
                line = line_.strip()
                if not line or line.startswith('##'):
                    continue
                if in_alpha:
                    if line == '@@':
                        in_alpha = False
                    elif len(line) > 1 and line[0] == '~':
                        self.exclusions.add(line[1:].strip())
                        continue
                    elif '\t' in line:
                        if line.count('\t') > 1:
                            parse_error('Too many tabs', i)
                        symbol, weight = line.split('\t')
                        try:
                            self.alphabet[symbol] = float(weight)
                        except:
                            parse_error('Unable to parse weight', i)
                    else:
                        self.alphabet[line] = 0.0
                else:
                    if line.count('\t') > 2 or line.count('\t') == 1:
                        parse_error('Wrong number of tabs, expected 3 tab-separated columns', i)
                    elif line.count('\t') == 0:
                        parse_error('Substitutions and swaps must be tab-separated', i)
                    l, r, w = line.split('\t')
                    weight = 0.0
                    try:
                        weight = float(w)
                    except:
                        parse_error('Unable to parse weight', i)
                    if ',' in line:
                        frompair = tuple(l.split(','))
                        topair = tuple(r.split(','))
                        if not (len(frompair) == len(topair) == 2):
                            parse_error('Swap-specification has wrong number of comma separators', i)
                        self.swaps.setdefault((frompair, topair), weight)
                    else:
                        self.substitutions.setdefault((l, r), weight)

    def read_optimized_lookup_alphabet(self, fname):
        with open(fname, "rb") as fin:
            byt = fin.read(5)
            if byt == b"HFST\0":
                # just ignore any hfst3 header
                header_length = struct.unpack_from("<H", fin.read(3), 0)[0]
                fin.read(header_length)
                # hopefully there's nothing surprising in here
                byt = fin.read(56)
            else:
                byt += fin.read(56 - 5)
            symbol_count = struct.unpack_from("<H", byt, 2)[0]
            for n in range(symbol_count):
                s = fin.read(1)
                while s[-1] != 0:
                    s += fin.read(1)
                sym = s[:-1].decode('utf-8')
                if len(sym) != 1:
                    sys.stderr.write("Ignored symbol " + sym + "\n")
                elif not sym.isspace() and sym not in self.exclusions:
                    self.alphabet.setdefault(sym, 0.0)

    def extend_alphabet(self, alpha):
        for c in alpha:
            if c not in self.exclusions:
                self.alphabet.setdefault(c, 0.0)

    def clean_alphabet(self):
        # depending on what order read_input_file(), extend_alphabet()
        # and read_optimzed_lookup_alphabet() are called in, symbols
        # might get included in the alphabet which should be excluded
        # so remove those
        for s in self.exclusions:
            if s in self.alphabet:
                del self.alphabet[s]

    def generate(self):
        # for substitutions and swaps that weren't defined by the user,
        # generate standard subs and swaps
        self.substitutions.setdefault((self.other, self.epsilon), 1.0)
        for symbol in self.alphabet:
            w = 1.0 + self.alphabet[symbol]
            self.substitutions.setdefault((self.other, symbol), w)
            self.substitutions.setdefault((self.epsilon, symbol), w)
            self.substitutions.setdefault((symbol, self.epsilon), w)
            for symbol2 in self.alphabet:
                if symbol == symbol2: continue
                w += self.alphabet[symbol2]
                p12 = (symbol, symbol2)
                p21 = (symbol2, symbol)
                self.swaps.setdefault((p12, p21), self.swaps.get((p21, p12), w))
                self.substitutions.setdefault(p12, self.substitutions.get(p21, w))

    def next_special(self, state):
        if state == "swap":
            self.swapstate += 1
            while self.swapstate == self.skipstate:
                self.swapstate += 1
        elif state == "skip":
            self.skipstate += 1
            while self.skipstate == self.swapstate:
                self.skipstate += 1
        else:
            raise Exception

    def make_identities(self, state, nextstate = None):
        if nextstate is None:
            nextstate = state
        ret = []
        for symbol in self.alphabet:
            if symbol not in (self.epsilon, self.other):
                ret.append(maketrans(state, nextstate, symbol, symbol, 0.0))
        return ret

    def make_swaps(self, state, nextstate = None):
        if nextstate is None:
            nextstate = state + 1
        ret = []
        if self.should_swap:
            for swap in self.swaps:
                ret.append(maketrans(state, self.swapstate, swap[0][0], swap[0][1], self.swaps[swap]))
                ret.append(maketrans(self.swapstate, nextstate, swap[1][0], swap[1][1], 0.0))
                self.next_special("swap")
        return ret

    # for substitutions, we try to eliminate redundancies by refusing to do
    # deletion right after insertion and insertion right after deletion
    def make_substitutions(self, state, nextstate = None):
        if nextstate is None:
            nextstate = state + 1
        ret = []
        for sub in self.substitutions:
            if (nextstate + 1 >= self.distance) or not self.should_elim:
                ret.append(maketrans(state, nextstate, sub[0], sub[1], self.substitutions[sub]))
            elif sub[1] is self.epsilon: # deletion
                ret.append(maketrans(state, self.skipstate, sub[0], sub[1], self.substitutions[sub]))
                ret += self.make_identities(self.skipstate, nextstate)
                ret += self.make_swaps(self.skipstate, nextstate + 1)
                for sub2 in self.substitutions:
                    # after deletion, refuse to do insertion
                    if sub2[0] != self.epsilon:
                        ret.append(maketrans(self.skipstate, nextstate + 1, sub2[0], sub2[1], self.substitutions[sub2]))
                self.next_special("skip")
            elif sub[0] is self.epsilon: # insertion
                ret.append(maketrans(state, self.skipstate, sub[0], sub[1], self.substitutions[sub]))
                ret += self.make_identities(self.skipstate, nextstate)
                ret += self.make_swaps(self.skipstate, nextstate + 1)
                for sub2 in self.substitutions:
                    # after insertion, refuse to do deletion
                    if sub2[1] != self.epsilon:
                        ret.append(maketrans(self.skipstate, nextstate + 1, sub2[0], sub2[1], self.substitutions[sub2]))
                self.next_special("skip")
            else:
                ret.append(maketrans(state, nextstate, sub[0], sub[1], self.substitutions[sub]))
        return ret

    def make_transitions(self):
        for state in range(self.distance):
            self.transitions.append(str(state + 1) + "\t0.0") # final states
            self.transitions += self.make_identities(state)
            self.transitions += self.make_substitutions(state)
            self.transitions += self.make_swaps(state)
        self.transitions += self.make_identities(self.distance)

    def verbose_report(self):
        template = '''
{states} states and {trans} transitions written for
distance {dist} and base alphabet size {alpha_size}

The alphabet was:
{alpha}
The exclusions were:
{excl}
'''
        alpha = ' '.join('%s:%s' % (s, w) for s, w in self.alphabet.items())
        excl = ' '.join(self.exclusions)
        report = template.format(states=max(self.skipstate, self.swapstate),
                                 trans=len(self.transitions),
                                 dist=self.distance,
                                 alpha_size=len(self.alphabet),
                                 alpha=alpha,
                                 excl=excl)
        sys.stderr.write(report)

def main():
    parser = argparse.ArgumentParser(description=description_string,
                                     epilog=epilog_string,
                                     formatter_class=argparse.RawDescriptionHelpFormatter)
    parser.add_argument('-d', '--distance', type=int, metavar='DIST', default=1,
                        help='edit depth, default is 1')
    parser.add_argument('-e', '--epsilon', default='@0@', metavar='EPS',
                        help='epsilon symbol, default is @0@')
    parser.add_argument('--no-elim', action='store_true',
                        help="don't reduce elimination")
    parser.add_argument('-s', '--swap', action='store_true',
                        help='generate swaps in addition to insertions and deletions')
    parser.add_argument('-v', '--verbose', action='store_true',
                        help='print summary to stderr')
    alpha = parser.add_argument_group('Alphabet', 'Specify the alphabet of the transducer (these may be combined and repeated)')
    alpha.add_argument('-i', '--input', dest='inputfile', metavar='INPUT',
                       action='append', default=[],
                       help='specification file in edit-distance syntax')
    alpha.add_argument('-a', '--alphabet', dest='alphabetfile', metavar='TRANS',
                       action='append', default=[],
                       help='optimized-lookup format transducer to read alphabet from (ignoring multi-character symbols)')
    alpha.add_argument('alphabet_string', nargs='*',
                       help='the alphabet as a string on the command line')
    options = parser.parse_args()

    transducer = Transducer(options.distance, '@_UNKNOWN_SYMBOL_@', options.epsilon, options.swap, not options.no_elim)

    if not options.inputfile and not options.alphabet_string and not options.alphabetfile:
        parser.error('Must provide an alphabet')
    for fl in options.inputfile:
        transducer.read_input_file(fl)
    for s in options.alphabet_string:
        transducer.extend_alphabet(s)
    for fl in options.alphabetfile:
        transducer.read_optimized_lookup_alphabet(fl)
    transducer.clean_alphabet()

    transducer.generate()
    transducer.make_transitions()
    for transition in transducer.transitions:
        print(transition)

    if options.verbose:
        transducer.verbose_report()

if __name__ == '__main__':
    main()