#!/usr/bin/env python3
#
# Copyright (c) 2009 Carnegie Mellon University
#
# You may copy and modify this freely under the same terms as
# Sphinx-III
"""
Generate a single-pronunciation dictionary from an input dictionary
and the output of force alignment.
"""
__author__ = "David Huggins-Daines <dhdaines@gmail.com>"
__version__ = "$Revision $"
from collections import defaultdict
from cmusphinx import s3dict
import sys
def count_alignments(lines, indict):
    """Count occurrences of dictionary entries in force-alignment output.

    ``lines`` is any iterable of text lines (typically the open alignment
    file).  Each line is split on whitespace and its final token is
    skipped (presumably the utterance ID -- confirm against the aligner's
    output format).  Every remaining token that is present in ``indict``
    is tallied.

    Returns a ``defaultdict(int)`` mapping entry name to its count.
    """
    counts = defaultdict(int)
    for line in lines:
        for token in line.split()[:-1]:
            if token in indict:
                counts[token] += 1
    return counts


def select_pronunciation(word, indict, counts):
    """Return the pronunciation of ``word`` seen most often in ``counts``.

    The alternates of ``word`` are looked up under the names ``word``,
    ``word(2)``, ``word(3)``, ... up to the number of items produced by
    ``indict.alts(word)``.  The alternate with the highest count wins;
    ties go to the lowest-numbered alternate.  When no alternate was
    counted at all, the entry stored under the bare ``word`` is returned.
    """
    n_alts = sum(1 for _ in indict.alts(word))
    best_name = word  # fallback when nothing was observed
    best_count = 0
    for index in range(1, n_alts + 1):
        name = word if index == 1 else "%s(%d)" % (word, index)
        # .get() avoids inserting zero entries into a defaultdict.
        seen = counts.get(name, 0)
        # Strict comparison keeps the earliest alternate on ties.
        if seen > best_count:
            best_count = seen
            best_name = name
    return indict[best_name]


def write_dictionary(indict, counts, outfh):
    """Write one ``WORD<TAB><TAB>PHONES`` line per word, in sorted order.

    Words come from ``indict.words()``; the pronunciation written for each
    is the one chosen by ``select_pronunciation``.
    """
    for word in sorted(indict.words()):
        phones = select_pronunciation(word, indict, counts)
        print("%s\t\t%s" % (word, " ".join(phones)), file=outfh)


def main(argv=None):
    """Run the tool; return the process exit status.

    ``argv`` defaults to ``sys.argv``.  Expected arguments are the input
    dictionary, the force-alignment output and, optionally, the output
    dictionary path (standard output is used when it is omitted).
    Returns 1 after printing a usage message when too few arguments are
    given, 0 otherwise.
    """
    if argv is None:
        argv = sys.argv
    if len(argv) < 3:
        print("Usage: %s INDICT FALIGNOUT [OUTDICT]" % argv[0],
              file=sys.stderr)
        return 1

    indict = s3dict.open(argv[1])
    # Context managers make sure both files are closed (and the output
    # flushed) even if processing fails part-way through.
    with open(argv[2]) as falignout:
        counts = count_alignments(falignout, indict)

    if len(argv) > 3:
        with open(argv[3], "w") as outfh:
            write_dictionary(indict, counts, outfh)
    else:
        write_dictionary(indict, counts, sys.stdout)
    return 0


if __name__ == "__main__":
    sys.exit(main())
|