1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457
|
#!/usr/bin/python
# see editdist.py --help for usage
import sys
import struct
import codecs
from optparse import OptionParser
debug = False
usage_string = "usage: %prog [options] alphabet"
info_string = """
Produce an edit distance transducer.
Output is either an arc-by-arc -generated ATT listing (deprecated), or a large
regular expression.
There are three ways to give the alphabet and weights:
* giving the alphabet as a command line argument
(weights are implicitly 1.0 per error)
* giving a file with specialized configuration syntax
* giving a transducer in optimized-lookup format to induce an alphabet
(in this case only symbols with length 1 are considered,
and weights are implicitly 1.0 per error)
These ways may be combined freely.
The specification file should be in the following format:
* First, an (optional) list of tokens separated by newlines
All transitions involving these tokens that are otherwise unspecified
are generated with weight 1.0. Symbol weights can be specified by appending
a tab and a weight to the symbol. Transitions involving such a symbol
will have the user-specified weight added to it.
* If you want to exclude symbols that may be induced from a transducer,
add a leading ~ character to that line.
* If you want to specify transitions, insert a line with the content "@@"
(without the quotes)
* In the following lines, specified transitions with the form
FROM <TAB> TO <TAB> WEIGHT
where FROM is the source token, TO is the destination token and WEIGHT is
a nonnegative floating point number specifying the weight. By default,
if only one transition involving FROM and TO is specified, the same WEIGHT
will be used to specify the transition TO -> FROM (assuming that both are
listed in the list of tokens).
* If the command line option to generate swaps is set, you can also specify swap
weights with
FROM,TO <TAB> TO,FROM <TAB> WEIGHT
Again, unspecified swaps will be generated automatically with weight 1.0.
* Lines starting with ## are comments.
with d for distance and S for size of alphabet plus one
(for epsilon), expected output is a transducer in ATT format with
* Swapless:
** d + 1 states
** d*(S^2 + S - 1) transitions
* Swapful:
** d*(S^2 - 3S + 3) + 1 states
** d*(3S^2 - 5S + 3) transitions
"""
OTHER = u'@_UNKNOWN_SYMBOL_@'
class MyOptionParser(OptionParser):
# This is needed to override the formatting of the help string
def format_epilog(self, formatter):
return self.epilog
parser = MyOptionParser(usage=usage_string, epilog=info_string)
parser.add_option("-r", "--regex", action = "store_true", dest = "make_regex",
help = "write a regular expression")
parser.add_option("-e", "--epsilon", dest = "epsilon",
help = "specify symbol to use as epsilon, default is @0@",
metavar = "EPS")
parser.add_option("-d", "--distance", type = "int", dest = "distance",
help = "specify edit depth, default is 1",
metavar = "DIST")
parser.add_option("-w", "--default-weight", type = "float", dest = "default_weight",
help = "weight per correction when nothing else is specified (the default default is 1.0)",
metavar = "DIST")
parser.add_option("-s", "--swap", action = "store_true", dest="swap",
help = "generate swaps (as well as insertions and deletions)")
parser.add_option("", "--no-elim", action = "store_true", dest="no_elim",
help = "don't do redundancy elimination")
parser.add_option("-m", "--minimum-edit", type = "int", dest = "minimum_edit",
help = "minimum accepting edit (default is 1)")
parser.add_option("", "--no-string-initial-correction", action = "store_true",
dest = "no_initial",
help = "don't make corrections at the beginning of the string")
parser.add_option("-i", "--input", dest = "inputfile",
help = "optional file with special edit-distance syntax",
metavar = "INPUT")
parser.add_option("-o", "--output-file", dest = "outputfile", help = "output file (default is stdout)", metavar = "OUTPUT")
parser.add_option("-a", "--alphabet", dest = "alphabetfile",
help = "read the alphabet from an existing optimized-lookup format transducer",
metavar = "ALPHABET")
parser.add_option("-v", "--verbose", action = "store_true", dest="verbose",
help = "print some diagnostics to standard error")
parser.set_defaults(make_regex = False)
parser.set_defaults(epsilon = '@0@')
parser.set_defaults(distance = 1)
parser.set_defaults(default_weight = 1.0)
parser.set_defaults(swap = False)
parser.set_defaults(no_elim = False)
parser.set_defaults(no_initial = False)
parser.set_defaults(minimum_edit = 0)
parser.set_defaults(verbose = False)
(options, args) = parser.parse_args()
# Some utility classes
class Header:
"""Read and provide interface to header"""
def __init__(self, file):
bytes = file.read(5) # "HFST\0"
if str(struct.unpack_from("<5s", bytes, 0)) == "('HFST\\x00',)":
# just ignore any hfst3 header
remaining = struct.unpack_from("<H", file.read(3), 0)[0]
self.handle_hfst3_header(file, remaining)
bytes = file.read(56) # 2 unsigned shorts, 4 unsigned ints and 9 uint-bools
else:
bytes = bytes + file.read(56 - 5)
self.number_of_input_symbols = struct.unpack_from("<H", bytes, 0)[0]
self.number_of_symbols = struct.unpack_from("<H", bytes, 2)[0]
self.size_of_transition_index_table = struct.unpack_from("<I", bytes, 4)[0]
self.size_of_transition_target_table = struct.unpack_from("<I", bytes, 8)[0]
self.number_of_states = struct.unpack_from("<I", bytes, 12)[0]
self.number_of_transitions = struct.unpack_from("<I", bytes, 16)[0]
self.weighted = struct.unpack_from("<I", bytes, 20)[0] != 0
self.deterministic = struct.unpack_from("<I", bytes, 24)[0] != 0
self.input_deterministic = struct.unpack_from("<I", bytes, 28)[0] != 0
self.minimized = struct.unpack_from("<I", bytes, 32)[0] != 0
self.cyclic = struct.unpack_from("<I", bytes, 36)[0] != 0
self.has_epsilon_epsilon_transitions = struct.unpack_from("<I", bytes, 40)[0] != 0
self.has_input_epsilon_transitions = struct.unpack_from("<I", bytes, 44)[0] != 0
self.has_input_epsilon_cycles = struct.unpack_from("<I", bytes, 48)[0] != 0
self.has_unweighted_input_epsilon_cycles = struct.unpack_from("<I", bytes, 52)[0] != 0
def handle_hfst3_header(self, file, remaining):
chars = struct.unpack_from("<" + str(remaining) + "c",
file.read(remaining), 0)
# assume the h3-header doesn't say anything surprising for now
class Alphabet:
"""Read and provide interface to alphabet"""
def __init__(self, file, number_of_symbols):
stderr_u8 = codecs.getwriter('utf-8')(sys.stderr)
self.keyTable = [] # list of unicode objects, use foo.encode("utf-8") to print
for x in range(number_of_symbols):
symbol = ""
while True:
byte = file.read(1)
if byte == '\0': # a symbol has ended
symbol = unicode(symbol, "utf-8")
if len(symbol) != 1:
stderr_u8.write("Ignored symbol " + symbol + "\n")
else:
self.keyTable.append(symbol)
break
symbol += byte
def p(string): # stupid python, or possibly stupid me
return string.encode('utf-8')
def maketrans(from_st, to_st, from_sy, to_sy, weight):
return str(from_st) + "\t" + str(to_st) + "\t" + p(from_sy) + "\t" + p(to_sy) + "\t" + str(weight)
class Transducer:
def __init__(self, alphabet, _other = OTHER, _epsilon = options.epsilon, _distance = options.distance):
self.alphabet = alphabet
self.substitutions = {}
self.swaps = {}
self.other = _other
self.epsilon = _epsilon
self.distance = _distance
self.transitions = []
# the first self.distance states are always used, for others we
# grab state numbers from this counter
self.state_clock = self.distance + 1
self.debug_messages = []
def process_pair_info(self, specification):
for pair, weight in specification["edits"].iteritems():
self.substitutions[pair] = weight
for pairpair, weight in specification["swaps"].iteritems():
self.swaps[pairpair] = weight
def generate(self):
# for substitutions and swaps that weren't defined by the user,
# generate standard subs and swaps
if (self.other, self.epsilon) not in self.substitutions:
self.substitutions[(self.other, self.epsilon)] = options.default_weight
for symbol in self.alphabet.keys():
if (self.other, symbol) not in self.substitutions:
self.substitutions[(self.other, symbol)] = options.default_weight + alphabet[symbol]
if (self.epsilon, symbol) not in self.substitutions:
self.substitutions[(self.epsilon, symbol)] = options.default_weight + alphabet[symbol]
if (symbol, self.epsilon) not in self.substitutions:
self.substitutions[(symbol, self.epsilon)] = options.default_weight + alphabet[symbol]
for symbol2 in self.alphabet.keys():
if symbol == symbol2: continue
if ((symbol, symbol2), (symbol2, symbol)) not in self.swaps:
if ((symbol2, symbol), (symbol, symbol2)) in self.swaps:
self.swaps[((symbol, symbol2), (symbol2, symbol))] = self.swaps[((symbol2, symbol), (symbol, symbol2))]
else:
self.swaps[((symbol, symbol2), (symbol2, symbol))] = options.default_weight + alphabet[symbol] + alphabet[symbol2]
if (symbol, symbol2) not in self.substitutions:
if (symbol2, symbol) in self.substitutions:
self.substitutions[(symbol, symbol2)] = self.substitutions[(symbol2, symbol)]
else:
self.substitutions[(symbol, symbol2)] = options.default_weight + alphabet[symbol] + alphabet[symbol2]
def make_identities(self, state, nextstate = None):
if nextstate is None:
nextstate = state
ret = []
for symbol in self.alphabet.keys():
if symbol not in (self.epsilon, self.other):
ret.append(maketrans(state, nextstate, symbol, symbol, 0.0))
return ret
def make_swaps(self, state, nextstate = None):
if nextstate is None:
nextstate = state + 1
ret = []
if options.swap:
for swap in self.swaps:
swapstate = self.state_clock
self.state_clock += 1
self.debug_messages.append(str(swapstate) + " is a swap state for " + swap[0][0] + " and " + swap[0][1])
ret.append(maketrans(state, swapstate, swap[0][0], swap[0][1], self.swaps[swap]))
ret.append(maketrans(swapstate, nextstate, swap[1][0], swap[1][1], 0.0))
return ret
# for substitutions, we try to eliminate redundancies by refusing to do
# deletion right after insertion and insertion right after deletion
def make_substitutions(self, state, nextstate = None):
if nextstate is None:
nextstate = state + 1
ret = []
eliminate = False
# unless we're about to hit the maximum edit or we're not eliminating
# redundancies, make skip states for delete and insert
if (nextstate < options.distance) and not options.no_elim:
eliminate = True
delete_skip = self.state_clock
self.state_clock += 1
insert_skip = self.state_clock
self.state_clock += 1
ret += self.make_identities(delete_skip, nextstate)
ret += self.make_swaps(delete_skip, nextstate + 1)
ret += self.make_identities(insert_skip, nextstate)
ret += self.make_swaps(insert_skip, nextstate + 1)
for sub in self.substitutions:
if not eliminate:
ret.append(maketrans(state, nextstate, sub[0], sub[1], self.substitutions[sub]))
elif sub[1] is self.epsilon: # (eliminating) deletion
ret.append(maketrans(state, delete_skip, sub[0], sub[1], self.substitutions[sub]))
for sub2 in self.substitutions:
# after deletion, refuse to do insertion
if sub2[0] != self.epsilon:
ret.append(maketrans(delete_skip, nextstate + 1, sub2[0], sub2[1], self.substitutions[sub2]))
elif sub[0] is self.epsilon: # (eliminating) insertion
ret.append(maketrans(state, insert_skip, sub[0], sub[1], self.substitutions[sub]))
for sub2 in self.substitutions:
# after insertion, refuse to do deletion
if sub2[1] != self.epsilon:
ret.append(maketrans(insert_skip, nextstate + 1, sub2[0], sub2[1], self.substitutions[sub2]))
else:
ret.append(maketrans(state, nextstate, sub[0], sub[1], self.substitutions[sub]))
return ret
def make_transitions(self):
# If we're not editing in the initial state, there's an extra state
# where we just want identities
for state in range(options.distance + options.no_initial):
if options.minimum_edit != 0:
options.minimum_edit -= 1
else:
self.transitions.append(str(state) + "\t0.0") # final states
if state == 0 and options.no_initial:
self.transitions += self.make_identities(state, state + 1)
continue # Don't do initial corrections
else:
self.transitions += self.make_identities(state)
self.transitions += self.make_substitutions(state)
self.transitions += self.make_swaps(state)
self.transitions += self.make_identities(options.distance + options.no_initial)
self.transitions.append(str(options.distance + options.no_initial) + "\t0.0")
alphabet = {}
exclusions = set()
pair_info = {"edits": {}, "swaps": {}}
if options.inputfile == None and options.alphabetfile == None \
and len(args) == 0:
print "Specify at least one of INPUT, ALPHABET or alphabet string"
sys.exit()
if len(args) > 1:
print "Too many options!"
sys.exit()
if options.outputfile == None:
outputfile = codecs.getwriter('utf-8')(sys.stdout)
else:
outputfile = codecs.getwriter('utf-8')(open(options.outputfile, 'w'))
if options.inputfile != None:
try:
inputfile = open(options.inputfile)
except IOError:
print "Couldn't open " + options.inputfile
sys.exit()
while True:
# first the single-symbol info
line = unicode(inputfile.readline(), 'utf-8')
if line in ("@@\n", ""):
break
if line.strip() != "":
if line.startswith(u'##'):
continue
if len(line) > 1 and line.startswith(u'~'):
exclusions.add(line[1:].strip())
continue
if '\t' in line:
weight = float(line.split('\t')[1])
symbol = line.split('\t')[0]
else:
weight = 0.0
symbol = line.strip("\n")
alphabet[symbol] = weight
while True:
# then pairs
line = unicode(inputfile.readline(), 'utf-8')
if line.startswith('##'):
continue
if line == "\n":
continue
if line == "":
break
parts = line.split('\t')
if len(parts) != 3:
raise ValueError("Got specification with " + str(len(parts)) +\
" parts, expected 3:\n" + specification)
weight = float(parts[2])
if ',' in parts[0]:
frompair = tuple(parts[0].split(','))
topair = tuple(parts[1].split(','))
if not (len(frompair) == len(topair) == 2):
raise ValueError("Got swap-specification with incorrect number "
"of comma separators:\n" + specification)
if (frompair, topair) not in pair_info["swaps"]:
pair_info["swaps"][(frompair, topair)] = weight
for sym in [frompair[0], frompair[1], topair[0], topair[1]]:
if sym != '' and sym not in alphabet:
alphabet[sym] = weight
else:
if not (parts[0], parts[1]) in pair_info["edits"]:
pair_info["edits"][(parts[0], parts[1])] = weight
for sym in [parts[0], parts[1]]:
if sym != '' and sym not in alphabet:
alphabet[sym] = weight
if len(args) == 1:
for c in unicode(args[0], 'utf-8'):
if c not in alphabet.keys() and c not in exclusions:
alphabet[c] = 0.0
if options.alphabetfile != None:
afile = open(options.alphabetfile, "rb")
ol_header = Header(afile)
ol_alphabet = Alphabet(afile, ol_header.number_of_symbols)
for c in filter(lambda x: x.strip() != '', ol_alphabet.keyTable[:]):
if c not in alphabet.keys() and c not in exclusions:
alphabet[c] = 0.0
epsilon = unicode(options.epsilon, 'utf-8')
def replace_rules(alphabet, pair_info, weight = options.default_weight):
corr = ' "<CORR>" '
unk = OTHER
corrections = "["
# first, the empty string may become the empty string anywhere
corrections += '"" -> \t[ "" |\n'
for a in alphabet:
this_weight = weight
# insertions
if ('', a) in pair_info["edits"]:
this_weight = pair_info["edits"][('', a)] + alphabet[a]
corrections += '\t[ "' + a + '" ' + corr + ' ]::' + str(this_weight) + ' |\n'
# trim the extra left by the last pass
corrections = corrections[:-3]
corrections += ' ] ,,\n'
for a in alphabet:
this_weight = weight
# the left-hand side of the rule
corrections += '"' + a + '" ->\t[ '
# identity
corrections += '"' + a + '" |\n'
# deletion
if (a, '') in pair_info["edits"]:
this_weight = pair_info["edits"][(a, '')]
corrections += '\t[ ""' + corr + ']::' + str(this_weight) + ' |\n'
for b in alphabet:
this_weight = weight + alphabet[b]
#substitutions
if a == b:
# we don't handle identities here
continue
if (a, b) in pair_info["edits"]:
this_weight = pair_info["edits"][(a, b)] + alphabet[b]
corrections += '\t[ "' + b + '"' + corr + ']::' + str(this_weight) + ' |\n'
corrections = corrections[:-3]
corrections += ' ] ,,\n'
# now the unknown symbol
corrections += '"' + unk + '" -> [\n\t[""' + corr + ']::' + str(weight) + ' |\n'
for a in alphabet:
corrections += '\t[ "' + a + '"' + corr + ']::' + str(weight) + ' |\n'
# trim the end again
corrections = corrections[:-3]
corrections += ' ]]'
return corrections
if options.make_regex:
corrections = replace_rules(alphabet, pair_info)
corr_counter = '[[ [? - "<CORR>"]* ( "<CORR>":0 ) [? - "<CORR>"]* ]^' + str(options.distance) + ']'
corr_eater = '[[? - "<CORR>"]*]'
full_regex = corr_eater + '\n.o.\n' + corrections.encode('utf-8') + '\n.o.\n' + corr_counter + ";\n"
outputfile.write(full_regex.decode('utf-8'))
else:
transducer = Transducer(alphabet)
transducer.process_pair_info(pair_info)
transducer.generate()
transducer.make_transitions()
for transition in transducer.transitions:
outputfile.write(transition.decode('utf-8'))
outputfile.write('\n')
stderr_u8 = codecs.getwriter('utf-8')(sys.stderr)
if options.verbose:
stderr_u8.write("\n" + str(transducer.state_clock) + " states and " + str(len(transducer.transitions)) + " transitions written for "+
"distance " + str(options.distance) + " and base alphabet size " + str(len(transducer.alphabet)) +"\n\n")
stderr_u8.write("The alphabet was:\n")
for symbol, weight in alphabet.iteritems():
stderr_u8.write(symbol + "\t" + str(weight) + "\n")
if len(exclusions) != 0:
stderr_u8.write("The exclusions were:\n")
for symbol in exclusions:
stderr_u8.write(symbol + "\n")
print
if debug:
for message in transducer.debug_messages:
print message
|